{ "best_metric": 0.6816287040710449, "best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-7b/checkpoint-225", "epoch": 8.0, "eval_steps": 1.0, "global_step": 256, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03125, "grad_norm": 1.3320099054231718, "learning_rate": 0.0, "loss": 1.3851, "step": 1 }, { "epoch": 0.03125, "eval_loss": 1.3910757303237915, "eval_runtime": 63.0135, "eval_samples_per_second": 3.174, "eval_steps_per_second": 0.397, "step": 1 }, { "epoch": 0.0625, "grad_norm": 1.0473401758450829, "learning_rate": 8.613531161467863e-06, "loss": 1.3255, "step": 2 }, { "epoch": 0.0625, "eval_loss": 1.3910757303237915, "eval_runtime": 56.9747, "eval_samples_per_second": 3.51, "eval_steps_per_second": 0.439, "step": 2 }, { "epoch": 0.09375, "grad_norm": 1.0429876090069883, "learning_rate": 1.3652123889719709e-05, "loss": 1.3737, "step": 3 }, { "epoch": 0.09375, "eval_loss": 1.3638323545455933, "eval_runtime": 56.6988, "eval_samples_per_second": 3.527, "eval_steps_per_second": 0.441, "step": 3 }, { "epoch": 0.125, "grad_norm": 0.9193695742967616, "learning_rate": 1.7227062322935725e-05, "loss": 1.3309, "step": 4 }, { "epoch": 0.125, "eval_loss": 1.3227791786193848, "eval_runtime": 56.6188, "eval_samples_per_second": 3.532, "eval_steps_per_second": 0.442, "step": 4 }, { "epoch": 0.15625, "grad_norm": 1.0043594584185398, "learning_rate": 2e-05, "loss": 1.2984, "step": 5 }, { "epoch": 0.15625, "eval_loss": 1.2728056907653809, "eval_runtime": 58.8213, "eval_samples_per_second": 3.4, "eval_steps_per_second": 0.425, "step": 5 }, { "epoch": 0.1875, "grad_norm": 0.8222566364005439, "learning_rate": 2e-05, "loss": 1.2639, "step": 6 }, { "epoch": 0.1875, "eval_loss": 1.2296103239059448, "eval_runtime": 56.6504, "eval_samples_per_second": 3.53, "eval_steps_per_second": 0.441, "step": 6 }, { "epoch": 0.21875, "grad_norm": 0.6389176248800544, "learning_rate": 2e-05, "loss": 1.2314, "step": 7 }, { "epoch": 0.21875, "eval_loss": 1.1983529329299927, "eval_runtime": 56.5641, "eval_samples_per_second": 3.536, "eval_steps_per_second": 0.442, "step": 7 }, { "epoch": 0.25, "grad_norm": 0.599291017991319, "learning_rate": 2e-05, "loss": 1.2037, "step": 8 }, { "epoch": 0.25, "eval_loss": 1.1734061241149902, "eval_runtime": 56.6005, "eval_samples_per_second": 3.534, "eval_steps_per_second": 0.442, "step": 8 }, { "epoch": 0.28125, "grad_norm": 0.4952974010296138, "learning_rate": 2e-05, "loss": 1.226, "step": 9 }, { "epoch": 0.28125, "eval_loss": 1.1502536535263062, "eval_runtime": 56.7524, "eval_samples_per_second": 3.524, "eval_steps_per_second": 0.441, "step": 9 }, { "epoch": 0.3125, "grad_norm": 0.4967350606769311, "learning_rate": 2e-05, "loss": 1.1613, "step": 10 }, { "epoch": 0.3125, "eval_loss": 1.127350091934204, "eval_runtime": 56.7569, "eval_samples_per_second": 3.524, "eval_steps_per_second": 0.44, "step": 10 }, { "epoch": 0.34375, "grad_norm": 0.43644425188108293, "learning_rate": 2e-05, "loss": 1.2077, "step": 11 }, { "epoch": 0.34375, "eval_loss": 1.104610562324524, "eval_runtime": 56.607, "eval_samples_per_second": 3.533, "eval_steps_per_second": 0.442, "step": 11 }, { "epoch": 0.375, "grad_norm": 0.4763392566533296, "learning_rate": 2e-05, "loss": 1.1593, "step": 12 }, { "epoch": 0.375, "eval_loss": 1.0827140808105469, "eval_runtime": 56.6548, "eval_samples_per_second": 3.53, "eval_steps_per_second": 0.441, "step": 12 }, { "epoch": 0.40625, "grad_norm": 0.49138280391100253, "learning_rate": 2e-05, "loss": 1.1679, "step": 13 }, { "epoch": 0.40625, "eval_loss": 1.0621232986450195, "eval_runtime": 56.8147, "eval_samples_per_second": 3.52, "eval_steps_per_second": 0.44, "step": 13 }, { "epoch": 0.4375, "grad_norm": 0.4305508696222477, "learning_rate": 2e-05, "loss": 1.0008, "step": 14 }, { "epoch": 0.4375, "eval_loss": 1.0437134504318237, "eval_runtime": 56.7306, "eval_samples_per_second": 3.525, "eval_steps_per_second": 0.441, "step": 14 }, { "epoch": 0.46875, "grad_norm": 0.39438622708065774, "learning_rate": 2e-05, "loss": 1.1206, "step": 15 }, { "epoch": 0.46875, "eval_loss": 1.0277280807495117, "eval_runtime": 56.6499, "eval_samples_per_second": 3.53, "eval_steps_per_second": 0.441, "step": 15 }, { "epoch": 0.5, "grad_norm": 0.40300919769454296, "learning_rate": 2e-05, "loss": 1.0501, "step": 16 }, { "epoch": 0.5, "eval_loss": 1.0134528875350952, "eval_runtime": 56.3333, "eval_samples_per_second": 3.55, "eval_steps_per_second": 0.444, "step": 16 }, { "epoch": 0.53125, "grad_norm": 0.35230570754831836, "learning_rate": 2e-05, "loss": 1.0593, "step": 17 }, { "epoch": 0.53125, "eval_loss": 1.0004419088363647, "eval_runtime": 56.6019, "eval_samples_per_second": 3.533, "eval_steps_per_second": 0.442, "step": 17 }, { "epoch": 0.5625, "grad_norm": 0.37606931260721715, "learning_rate": 2e-05, "loss": 1.0482, "step": 18 }, { "epoch": 0.5625, "eval_loss": 0.9879937767982483, "eval_runtime": 56.6945, "eval_samples_per_second": 3.528, "eval_steps_per_second": 0.441, "step": 18 }, { "epoch": 0.59375, "grad_norm": 0.2941404563021841, "learning_rate": 2e-05, "loss": 0.9707, "step": 19 }, { "epoch": 0.59375, "eval_loss": 0.976818859577179, "eval_runtime": 56.6805, "eval_samples_per_second": 3.529, "eval_steps_per_second": 0.441, "step": 19 }, { "epoch": 0.625, "grad_norm": 0.2958263397509482, "learning_rate": 2e-05, "loss": 1.091, "step": 20 }, { "epoch": 0.625, "eval_loss": 0.9669834971427917, "eval_runtime": 57.6231, "eval_samples_per_second": 3.471, "eval_steps_per_second": 0.434, "step": 20 }, { "epoch": 0.65625, "grad_norm": 0.2485896802049987, "learning_rate": 2e-05, "loss": 1.0041, "step": 21 }, { "epoch": 0.65625, "eval_loss": 0.9583450555801392, "eval_runtime": 56.5142, "eval_samples_per_second": 3.539, "eval_steps_per_second": 0.442, "step": 21 }, { "epoch": 0.6875, "grad_norm": 0.296994298254859, "learning_rate": 2e-05, "loss": 1.055, "step": 22 }, { "epoch": 0.6875, "eval_loss": 0.9502925276756287, "eval_runtime": 56.6393, "eval_samples_per_second": 3.531, "eval_steps_per_second": 0.441, "step": 22 }, { "epoch": 0.71875, "grad_norm": 0.2499735192340966, "learning_rate": 2e-05, "loss": 1.04, "step": 23 }, { "epoch": 0.71875, "eval_loss": 0.9427899122238159, "eval_runtime": 56.5467, "eval_samples_per_second": 3.537, "eval_steps_per_second": 0.442, "step": 23 }, { "epoch": 0.75, "grad_norm": 0.23614468035916372, "learning_rate": 2e-05, "loss": 1.0387, "step": 24 }, { "epoch": 0.75, "eval_loss": 0.9359552264213562, "eval_runtime": 56.8371, "eval_samples_per_second": 3.519, "eval_steps_per_second": 0.44, "step": 24 }, { "epoch": 0.78125, "grad_norm": 0.2597610358499704, "learning_rate": 2e-05, "loss": 0.9821, "step": 25 }, { "epoch": 0.78125, "eval_loss": 0.929139256477356, "eval_runtime": 56.659, "eval_samples_per_second": 3.53, "eval_steps_per_second": 0.441, "step": 25 }, { "epoch": 0.8125, "grad_norm": 0.2483654904520099, "learning_rate": 2e-05, "loss": 1.0139, "step": 26 }, { "epoch": 0.8125, "eval_loss": 0.9226005673408508, "eval_runtime": 56.4669, "eval_samples_per_second": 3.542, "eval_steps_per_second": 0.443, "step": 26 }, { "epoch": 0.84375, "grad_norm": 0.2814780741041167, "learning_rate": 2e-05, "loss": 0.9374, "step": 27 }, { "epoch": 0.84375, "eval_loss": 0.9160022735595703, "eval_runtime": 56.6558, "eval_samples_per_second": 3.53, "eval_steps_per_second": 0.441, "step": 27 }, { "epoch": 0.875, "grad_norm": 0.29993540247195477, "learning_rate": 2e-05, "loss": 0.948, "step": 28 }, { "epoch": 0.875, "eval_loss": 0.9092594981193542, "eval_runtime": 56.743, "eval_samples_per_second": 3.525, "eval_steps_per_second": 0.441, "step": 28 }, { "epoch": 0.90625, "grad_norm": 0.24302264777949295, "learning_rate": 2e-05, "loss": 0.9676, "step": 29 }, { "epoch": 0.90625, "eval_loss": 0.9028491377830505, "eval_runtime": 56.802, "eval_samples_per_second": 3.521, "eval_steps_per_second": 0.44, "step": 29 }, { "epoch": 0.9375, "grad_norm": 0.28001197555170687, "learning_rate": 2e-05, "loss": 1.0044, "step": 30 }, { "epoch": 0.9375, "eval_loss": 0.8969234228134155, "eval_runtime": 56.8402, "eval_samples_per_second": 3.519, "eval_steps_per_second": 0.44, "step": 30 }, { "epoch": 0.96875, "grad_norm": 0.26990828196944483, "learning_rate": 2e-05, "loss": 0.8417, "step": 31 }, { "epoch": 0.96875, "eval_loss": 0.890943169593811, "eval_runtime": 56.9987, "eval_samples_per_second": 3.509, "eval_steps_per_second": 0.439, "step": 31 }, { "epoch": 1.0, "grad_norm": 0.25976007498641823, "learning_rate": 2e-05, "loss": 0.95, "step": 32 }, { "epoch": 1.0, "eval_loss": 0.8852173686027527, "eval_runtime": 56.722, "eval_samples_per_second": 3.526, "eval_steps_per_second": 0.441, "step": 32 }, { "epoch": 1.03125, "grad_norm": 0.29530149620990226, "learning_rate": 2e-05, "loss": 0.9931, "step": 33 }, { "epoch": 1.03125, "eval_loss": 0.8795143961906433, "eval_runtime": 56.8541, "eval_samples_per_second": 3.518, "eval_steps_per_second": 0.44, "step": 33 }, { "epoch": 1.0625, "grad_norm": 0.2759239362577793, "learning_rate": 2e-05, "loss": 0.9978, "step": 34 }, { "epoch": 1.0625, "eval_loss": 0.8741766214370728, "eval_runtime": 56.7708, "eval_samples_per_second": 3.523, "eval_steps_per_second": 0.44, "step": 34 }, { "epoch": 1.09375, "grad_norm": 0.246531740102282, "learning_rate": 2e-05, "loss": 1.0163, "step": 35 }, { "epoch": 1.09375, "eval_loss": 0.8691757321357727, "eval_runtime": 56.8382, "eval_samples_per_second": 3.519, "eval_steps_per_second": 0.44, "step": 35 }, { "epoch": 1.125, "grad_norm": 0.2646078522027086, "learning_rate": 2e-05, "loss": 0.971, "step": 36 }, { "epoch": 1.125, "eval_loss": 0.8643682599067688, "eval_runtime": 56.689, "eval_samples_per_second": 3.528, "eval_steps_per_second": 0.441, "step": 36 }, { "epoch": 1.15625, "grad_norm": 0.2395171492146917, "learning_rate": 2e-05, "loss": 0.9227, "step": 37 }, { "epoch": 1.15625, "eval_loss": 0.8600785136222839, "eval_runtime": 56.72, "eval_samples_per_second": 3.526, "eval_steps_per_second": 0.441, "step": 37 }, { "epoch": 1.1875, "grad_norm": 0.28215229152733834, "learning_rate": 2e-05, "loss": 0.9308, "step": 38 }, { "epoch": 1.1875, "eval_loss": 0.8562959432601929, "eval_runtime": 56.8289, "eval_samples_per_second": 3.519, "eval_steps_per_second": 0.44, "step": 38 }, { "epoch": 1.21875, "grad_norm": 0.27116244597267625, "learning_rate": 2e-05, "loss": 0.9563, "step": 39 }, { "epoch": 1.21875, "eval_loss": 0.8526366949081421, "eval_runtime": 56.6829, "eval_samples_per_second": 3.528, "eval_steps_per_second": 0.441, "step": 39 }, { "epoch": 1.25, "grad_norm": 0.2623711894386991, "learning_rate": 2e-05, "loss": 0.9535, "step": 40 }, { "epoch": 1.25, "eval_loss": 0.8490655422210693, "eval_runtime": 56.6874, "eval_samples_per_second": 3.528, "eval_steps_per_second": 0.441, "step": 40 }, { "epoch": 1.28125, "grad_norm": 0.27251908150193377, "learning_rate": 2e-05, "loss": 0.9287, "step": 41 }, { "epoch": 1.28125, "eval_loss": 0.8451938629150391, "eval_runtime": 56.7117, "eval_samples_per_second": 3.527, "eval_steps_per_second": 0.441, "step": 41 }, { "epoch": 1.3125, "grad_norm": 0.2642817191103673, "learning_rate": 2e-05, "loss": 0.9186, "step": 42 }, { "epoch": 1.3125, "eval_loss": 0.8413894772529602, "eval_runtime": 56.9042, "eval_samples_per_second": 3.515, "eval_steps_per_second": 0.439, "step": 42 }, { "epoch": 1.34375, "grad_norm": 0.26857391288606197, "learning_rate": 2e-05, "loss": 0.8792, "step": 43 }, { "epoch": 1.34375, "eval_loss": 0.8373947739601135, "eval_runtime": 56.7211, "eval_samples_per_second": 3.526, "eval_steps_per_second": 0.441, "step": 43 }, { "epoch": 1.375, "grad_norm": 0.2474531366673803, "learning_rate": 2e-05, "loss": 0.8965, "step": 44 }, { "epoch": 1.375, "eval_loss": 0.8339560031890869, "eval_runtime": 56.8277, "eval_samples_per_second": 3.519, "eval_steps_per_second": 0.44, "step": 44 }, { "epoch": 1.40625, "grad_norm": 0.26467660282496797, "learning_rate": 2e-05, "loss": 0.8762, "step": 45 }, { "epoch": 1.40625, "eval_loss": 0.8309465050697327, "eval_runtime": 56.7019, "eval_samples_per_second": 3.527, "eval_steps_per_second": 0.441, "step": 45 }, { "epoch": 1.4375, "grad_norm": 0.2652288034609541, "learning_rate": 2e-05, "loss": 0.9118, "step": 46 }, { "epoch": 1.4375, "eval_loss": 0.8279169201850891, "eval_runtime": 56.6271, "eval_samples_per_second": 3.532, "eval_steps_per_second": 0.441, "step": 46 }, { "epoch": 1.46875, "grad_norm": 0.27355995161173785, "learning_rate": 2e-05, "loss": 0.9249, "step": 47 }, { "epoch": 1.46875, "eval_loss": 0.8252391219139099, "eval_runtime": 56.6323, "eval_samples_per_second": 3.532, "eval_steps_per_second": 0.441, "step": 47 }, { "epoch": 1.5, "grad_norm": 0.2588399009432225, "learning_rate": 2e-05, "loss": 0.8359, "step": 48 }, { "epoch": 1.5, "eval_loss": 0.8225956559181213, "eval_runtime": 58.0142, "eval_samples_per_second": 3.447, "eval_steps_per_second": 0.431, "step": 48 }, { "epoch": 1.53125, "grad_norm": 0.28116473918910634, "learning_rate": 2e-05, "loss": 0.846, "step": 49 }, { "epoch": 1.53125, "eval_loss": 0.8198111057281494, "eval_runtime": 56.6785, "eval_samples_per_second": 3.529, "eval_steps_per_second": 0.441, "step": 49 }, { "epoch": 1.5625, "grad_norm": 0.30791508615928687, "learning_rate": 2e-05, "loss": 0.8364, "step": 50 }, { "epoch": 1.5625, "eval_loss": 0.816419243812561, "eval_runtime": 56.7867, "eval_samples_per_second": 3.522, "eval_steps_per_second": 0.44, "step": 50 }, { "epoch": 1.59375, "grad_norm": 0.2635774938006065, "learning_rate": 2e-05, "loss": 0.8565, "step": 51 }, { "epoch": 1.59375, "eval_loss": 0.8128839731216431, "eval_runtime": 56.5904, "eval_samples_per_second": 3.534, "eval_steps_per_second": 0.442, "step": 51 }, { "epoch": 1.625, "grad_norm": 0.25740594308086223, "learning_rate": 2e-05, "loss": 0.7573, "step": 52 }, { "epoch": 1.625, "eval_loss": 0.8096449971199036, "eval_runtime": 56.7381, "eval_samples_per_second": 3.525, "eval_steps_per_second": 0.441, "step": 52 }, { "epoch": 1.65625, "grad_norm": 0.25917235006885775, "learning_rate": 2e-05, "loss": 0.8982, "step": 53 }, { "epoch": 1.65625, "eval_loss": 0.8064478039741516, "eval_runtime": 57.4343, "eval_samples_per_second": 3.482, "eval_steps_per_second": 0.435, "step": 53 }, { "epoch": 1.6875, "grad_norm": 0.2831937064873763, "learning_rate": 2e-05, "loss": 0.8781, "step": 54 }, { "epoch": 1.6875, "eval_loss": 0.8034397959709167, "eval_runtime": 56.8346, "eval_samples_per_second": 3.519, "eval_steps_per_second": 0.44, "step": 54 }, { "epoch": 1.71875, "grad_norm": 0.2863024186152095, "learning_rate": 2e-05, "loss": 0.8861, "step": 55 }, { "epoch": 1.71875, "eval_loss": 0.800960898399353, "eval_runtime": 56.7424, "eval_samples_per_second": 3.525, "eval_steps_per_second": 0.441, "step": 55 }, { "epoch": 1.75, "grad_norm": 0.28320211213029406, "learning_rate": 2e-05, "loss": 0.9514, "step": 56 }, { "epoch": 1.75, "eval_loss": 0.7988448143005371, "eval_runtime": 57.0405, "eval_samples_per_second": 3.506, "eval_steps_per_second": 0.438, "step": 56 }, { "epoch": 1.78125, "grad_norm": 0.3204132014824286, "learning_rate": 2e-05, "loss": 0.8947, "step": 57 }, { "epoch": 1.78125, "eval_loss": 0.7971951365470886, "eval_runtime": 57.1716, "eval_samples_per_second": 3.498, "eval_steps_per_second": 0.437, "step": 57 }, { "epoch": 1.8125, "grad_norm": 0.29386668880511096, "learning_rate": 2e-05, "loss": 0.9125, "step": 58 }, { "epoch": 1.8125, "eval_loss": 0.7956165075302124, "eval_runtime": 57.3457, "eval_samples_per_second": 3.488, "eval_steps_per_second": 0.436, "step": 58 }, { "epoch": 1.84375, "grad_norm": 0.31091076146467406, "learning_rate": 2e-05, "loss": 0.8638, "step": 59 }, { "epoch": 1.84375, "eval_loss": 0.7935267090797424, "eval_runtime": 57.373, "eval_samples_per_second": 3.486, "eval_steps_per_second": 0.436, "step": 59 }, { "epoch": 1.875, "grad_norm": 0.28779917523565474, "learning_rate": 2e-05, "loss": 0.9113, "step": 60 }, { "epoch": 1.875, "eval_loss": 0.7914787530899048, "eval_runtime": 57.2668, "eval_samples_per_second": 3.492, "eval_steps_per_second": 0.437, "step": 60 }, { "epoch": 1.90625, "grad_norm": 0.31820258275619673, "learning_rate": 2e-05, "loss": 0.8113, "step": 61 }, { "epoch": 1.90625, "eval_loss": 0.788929283618927, "eval_runtime": 57.2581, "eval_samples_per_second": 3.493, "eval_steps_per_second": 0.437, "step": 61 }, { "epoch": 1.9375, "grad_norm": 0.30186200117869055, "learning_rate": 2e-05, "loss": 0.8685, "step": 62 }, { "epoch": 1.9375, "eval_loss": 0.7862411737442017, "eval_runtime": 57.2688, "eval_samples_per_second": 3.492, "eval_steps_per_second": 0.437, "step": 62 }, { "epoch": 1.96875, "grad_norm": 0.27549296702686904, "learning_rate": 2e-05, "loss": 0.911, "step": 63 }, { "epoch": 1.96875, "eval_loss": 0.7838772535324097, "eval_runtime": 57.5102, "eval_samples_per_second": 3.478, "eval_steps_per_second": 0.435, "step": 63 }, { "epoch": 2.0, "grad_norm": 0.29444542350221403, "learning_rate": 2e-05, "loss": 0.8877, "step": 64 }, { "epoch": 2.0, "eval_loss": 0.7814672589302063, "eval_runtime": 57.3342, "eval_samples_per_second": 3.488, "eval_steps_per_second": 0.436, "step": 64 }, { "epoch": 2.03125, "grad_norm": 0.32976362380066954, "learning_rate": 2e-05, "loss": 0.836, "step": 65 }, { "epoch": 2.03125, "eval_loss": 0.7788661122322083, "eval_runtime": 57.6392, "eval_samples_per_second": 3.47, "eval_steps_per_second": 0.434, "step": 65 }, { "epoch": 2.0625, "grad_norm": 0.3091109685624876, "learning_rate": 2e-05, "loss": 0.8565, "step": 66 }, { "epoch": 2.0625, "eval_loss": 0.7769085764884949, "eval_runtime": 57.2017, "eval_samples_per_second": 3.496, "eval_steps_per_second": 0.437, "step": 66 }, { "epoch": 2.09375, "grad_norm": 0.3011651623444141, "learning_rate": 2e-05, "loss": 0.8265, "step": 67 }, { "epoch": 2.09375, "eval_loss": 0.7751161456108093, "eval_runtime": 57.4125, "eval_samples_per_second": 3.484, "eval_steps_per_second": 0.435, "step": 67 }, { "epoch": 2.125, "grad_norm": 0.28278958612422994, "learning_rate": 2e-05, "loss": 0.8893, "step": 68 }, { "epoch": 2.125, "eval_loss": 0.7736042737960815, "eval_runtime": 57.2826, "eval_samples_per_second": 3.491, "eval_steps_per_second": 0.436, "step": 68 }, { "epoch": 2.15625, "grad_norm": 0.30212533045014006, "learning_rate": 2e-05, "loss": 0.8256, "step": 69 }, { "epoch": 2.15625, "eval_loss": 0.7718043327331543, "eval_runtime": 59.4842, "eval_samples_per_second": 3.362, "eval_steps_per_second": 0.42, "step": 69 }, { "epoch": 2.1875, "grad_norm": 0.32231592883907934, "learning_rate": 2e-05, "loss": 0.7754, "step": 70 }, { "epoch": 2.1875, "eval_loss": 0.7697712779045105, "eval_runtime": 57.2127, "eval_samples_per_second": 3.496, "eval_steps_per_second": 0.437, "step": 70 }, { "epoch": 2.21875, "grad_norm": 0.29880148326318595, "learning_rate": 2e-05, "loss": 0.864, "step": 71 }, { "epoch": 2.21875, "eval_loss": 0.7679712176322937, "eval_runtime": 57.1052, "eval_samples_per_second": 3.502, "eval_steps_per_second": 0.438, "step": 71 }, { "epoch": 2.25, "grad_norm": 0.30389759178870646, "learning_rate": 2e-05, "loss": 0.7831, "step": 72 }, { "epoch": 2.25, "eval_loss": 0.7662644386291504, "eval_runtime": 57.37, "eval_samples_per_second": 3.486, "eval_steps_per_second": 0.436, "step": 72 }, { "epoch": 2.28125, "grad_norm": 0.3424258847516451, "learning_rate": 2e-05, "loss": 0.8311, "step": 73 }, { "epoch": 2.28125, "eval_loss": 0.7646127343177795, "eval_runtime": 57.1884, "eval_samples_per_second": 3.497, "eval_steps_per_second": 0.437, "step": 73 }, { "epoch": 2.3125, "grad_norm": 0.2831654885374943, "learning_rate": 2e-05, "loss": 0.8261, "step": 74 }, { "epoch": 2.3125, "eval_loss": 0.7631255388259888, "eval_runtime": 57.4573, "eval_samples_per_second": 3.481, "eval_steps_per_second": 0.435, "step": 74 }, { "epoch": 2.34375, "grad_norm": 0.29894569677081223, "learning_rate": 2e-05, "loss": 0.8801, "step": 75 }, { "epoch": 2.34375, "eval_loss": 0.7617875933647156, "eval_runtime": 57.1641, "eval_samples_per_second": 3.499, "eval_steps_per_second": 0.437, "step": 75 }, { "epoch": 2.375, "grad_norm": 0.3030991848050202, "learning_rate": 2e-05, "loss": 0.7921, "step": 76 }, { "epoch": 2.375, "eval_loss": 0.7605040073394775, "eval_runtime": 57.0991, "eval_samples_per_second": 3.503, "eval_steps_per_second": 0.438, "step": 76 }, { "epoch": 2.40625, "grad_norm": 0.30216971620226146, "learning_rate": 2e-05, "loss": 0.8527, "step": 77 }, { "epoch": 2.40625, "eval_loss": 0.7591890096664429, "eval_runtime": 58.6087, "eval_samples_per_second": 3.412, "eval_steps_per_second": 0.427, "step": 77 }, { "epoch": 2.4375, "grad_norm": 0.34907486616204614, "learning_rate": 2e-05, "loss": 0.841, "step": 78 }, { "epoch": 2.4375, "eval_loss": 0.7577351331710815, "eval_runtime": 59.509, "eval_samples_per_second": 3.361, "eval_steps_per_second": 0.42, "step": 78 }, { "epoch": 2.46875, "grad_norm": 0.3356288667630128, "learning_rate": 2e-05, "loss": 0.8417, "step": 79 }, { "epoch": 2.46875, "eval_loss": 0.7571098208427429, "eval_runtime": 57.4972, "eval_samples_per_second": 3.478, "eval_steps_per_second": 0.435, "step": 79 }, { "epoch": 2.5, "grad_norm": 0.3547770718977253, "learning_rate": 2e-05, "loss": 0.8865, "step": 80 }, { "epoch": 2.5, "eval_loss": 0.7565757632255554, "eval_runtime": 57.4262, "eval_samples_per_second": 3.483, "eval_steps_per_second": 0.435, "step": 80 }, { "epoch": 2.53125, "grad_norm": 0.36400071548952273, "learning_rate": 2e-05, "loss": 0.8201, "step": 81 }, { "epoch": 2.53125, "eval_loss": 0.7553688287734985, "eval_runtime": 59.6772, "eval_samples_per_second": 3.351, "eval_steps_per_second": 0.419, "step": 81 }, { "epoch": 2.5625, "grad_norm": 0.32432854183732784, "learning_rate": 2e-05, "loss": 0.8705, "step": 82 }, { "epoch": 2.5625, "eval_loss": 0.7540337443351746, "eval_runtime": 58.1967, "eval_samples_per_second": 3.437, "eval_steps_per_second": 0.43, "step": 82 }, { "epoch": 2.59375, "grad_norm": 0.3367161155473714, "learning_rate": 2e-05, "loss": 0.8225, "step": 83 }, { "epoch": 2.59375, "eval_loss": 0.752601683139801, "eval_runtime": 59.728, "eval_samples_per_second": 3.349, "eval_steps_per_second": 0.419, "step": 83 }, { "epoch": 2.625, "grad_norm": 0.3542073894911913, "learning_rate": 2e-05, "loss": 0.7887, "step": 84 }, { "epoch": 2.625, "eval_loss": 0.750983715057373, "eval_runtime": 58.2468, "eval_samples_per_second": 3.434, "eval_steps_per_second": 0.429, "step": 84 }, { "epoch": 2.65625, "grad_norm": 0.3387577198880303, "learning_rate": 2e-05, "loss": 0.7594, "step": 85 }, { "epoch": 2.65625, "eval_loss": 0.7495383620262146, "eval_runtime": 58.3457, "eval_samples_per_second": 3.428, "eval_steps_per_second": 0.428, "step": 85 }, { "epoch": 2.6875, "grad_norm": 0.381221735797731, "learning_rate": 2e-05, "loss": 0.7911, "step": 86 }, { "epoch": 2.6875, "eval_loss": 0.7477438449859619, "eval_runtime": 58.0584, "eval_samples_per_second": 3.445, "eval_steps_per_second": 0.431, "step": 86 }, { "epoch": 2.71875, "grad_norm": 0.3782280426863171, "learning_rate": 2e-05, "loss": 0.8115, "step": 87 }, { "epoch": 2.71875, "eval_loss": 0.7464295029640198, "eval_runtime": 57.9835, "eval_samples_per_second": 3.449, "eval_steps_per_second": 0.431, "step": 87 }, { "epoch": 2.75, "grad_norm": 0.3751127153118298, "learning_rate": 2e-05, "loss": 0.8896, "step": 88 }, { "epoch": 2.75, "eval_loss": 0.7451103329658508, "eval_runtime": 58.1947, "eval_samples_per_second": 3.437, "eval_steps_per_second": 0.43, "step": 88 }, { "epoch": 2.78125, "grad_norm": 0.3580034870691801, "learning_rate": 2e-05, "loss": 0.7964, "step": 89 }, { "epoch": 2.78125, "eval_loss": 0.744097113609314, "eval_runtime": 58.1644, "eval_samples_per_second": 3.439, "eval_steps_per_second": 0.43, "step": 89 }, { "epoch": 2.8125, "grad_norm": 0.3630926811819107, "learning_rate": 2e-05, "loss": 0.848, "step": 90 }, { "epoch": 2.8125, "eval_loss": 0.7432359457015991, "eval_runtime": 58.0811, "eval_samples_per_second": 3.443, "eval_steps_per_second": 0.43, "step": 90 }, { "epoch": 2.84375, "grad_norm": 0.3668484035124972, "learning_rate": 2e-05, "loss": 0.7444, "step": 91 }, { "epoch": 2.84375, "eval_loss": 0.7424789667129517, "eval_runtime": 59.6811, "eval_samples_per_second": 3.351, "eval_steps_per_second": 0.419, "step": 91 }, { "epoch": 2.875, "grad_norm": 0.37526030248163283, "learning_rate": 2e-05, "loss": 0.8381, "step": 92 }, { "epoch": 2.875, "eval_loss": 0.7417113780975342, "eval_runtime": 58.1209, "eval_samples_per_second": 3.441, "eval_steps_per_second": 0.43, "step": 92 }, { "epoch": 2.90625, "grad_norm": 0.36285898832422037, "learning_rate": 2e-05, "loss": 0.7797, "step": 93 }, { "epoch": 2.90625, "eval_loss": 0.7411203980445862, "eval_runtime": 58.3212, "eval_samples_per_second": 3.429, "eval_steps_per_second": 0.429, "step": 93 }, { "epoch": 2.9375, "grad_norm": 0.39983168875602654, "learning_rate": 2e-05, "loss": 0.8571, "step": 94 }, { "epoch": 2.9375, "eval_loss": 0.7402496933937073, "eval_runtime": 58.0746, "eval_samples_per_second": 3.444, "eval_steps_per_second": 0.43, "step": 94 }, { "epoch": 2.96875, "grad_norm": 0.3697896026052261, "learning_rate": 2e-05, "loss": 0.7917, "step": 95 }, { "epoch": 2.96875, "eval_loss": 0.7398749589920044, "eval_runtime": 59.8008, "eval_samples_per_second": 3.344, "eval_steps_per_second": 0.418, "step": 95 }, { "epoch": 3.0, "grad_norm": 0.39419135002625816, "learning_rate": 2e-05, "loss": 0.7987, "step": 96 }, { "epoch": 3.0, "eval_loss": 0.7384353876113892, "eval_runtime": 58.3389, "eval_samples_per_second": 3.428, "eval_steps_per_second": 0.429, "step": 96 }, { "epoch": 3.03125, "grad_norm": 0.40732207424611727, "learning_rate": 2e-05, "loss": 0.7205, "step": 97 }, { "epoch": 3.03125, "eval_loss": 0.73604416847229, "eval_runtime": 58.2114, "eval_samples_per_second": 3.436, "eval_steps_per_second": 0.429, "step": 97 }, { "epoch": 3.0625, "grad_norm": 0.3641635271623762, "learning_rate": 2e-05, "loss": 0.8062, "step": 98 }, { "epoch": 3.0625, "eval_loss": 0.7333144545555115, "eval_runtime": 59.7484, "eval_samples_per_second": 3.347, "eval_steps_per_second": 0.418, "step": 98 }, { "epoch": 3.09375, "grad_norm": 0.3556866449584765, "learning_rate": 2e-05, "loss": 0.7681, "step": 99 }, { "epoch": 3.09375, "eval_loss": 0.7306910157203674, "eval_runtime": 58.141, "eval_samples_per_second": 3.44, "eval_steps_per_second": 0.43, "step": 99 }, { "epoch": 3.125, "grad_norm": 0.3826129743685834, "learning_rate": 2e-05, "loss": 0.7961, "step": 100 }, { "epoch": 3.125, "eval_loss": 0.7283279895782471, "eval_runtime": 58.1482, "eval_samples_per_second": 3.439, "eval_steps_per_second": 0.43, "step": 100 }, { "epoch": 3.15625, "grad_norm": 0.35166540759020914, "learning_rate": 2e-05, "loss": 0.7382, "step": 101 }, { "epoch": 3.15625, "eval_loss": 0.7267993688583374, "eval_runtime": 57.8007, "eval_samples_per_second": 3.46, "eval_steps_per_second": 0.433, "step": 101 }, { "epoch": 3.1875, "grad_norm": 0.38414476136018477, "learning_rate": 2e-05, "loss": 0.7999, "step": 102 }, { "epoch": 3.1875, "eval_loss": 0.7261015176773071, "eval_runtime": 57.9723, "eval_samples_per_second": 3.45, "eval_steps_per_second": 0.431, "step": 102 }, { "epoch": 3.21875, "grad_norm": 0.40218377868187477, "learning_rate": 2e-05, "loss": 0.8115, "step": 103 }, { "epoch": 3.21875, "eval_loss": 0.7257917523384094, "eval_runtime": 58.0394, "eval_samples_per_second": 3.446, "eval_steps_per_second": 0.431, "step": 103 }, { "epoch": 3.25, "grad_norm": 0.41934721904445194, "learning_rate": 2e-05, "loss": 0.7228, "step": 104 }, { "epoch": 3.25, "eval_loss": 0.7251278758049011, "eval_runtime": 59.2828, "eval_samples_per_second": 3.374, "eval_steps_per_second": 0.422, "step": 104 }, { "epoch": 3.28125, "grad_norm": 0.3882012129329853, "learning_rate": 2e-05, "loss": 0.7658, "step": 105 }, { "epoch": 3.28125, "eval_loss": 0.724635899066925, "eval_runtime": 59.0543, "eval_samples_per_second": 3.387, "eval_steps_per_second": 0.423, "step": 105 }, { "epoch": 3.3125, "grad_norm": 0.4068559748805906, "learning_rate": 2e-05, "loss": 0.7977, "step": 106 }, { "epoch": 3.3125, "eval_loss": 0.7242235541343689, "eval_runtime": 58.5527, "eval_samples_per_second": 3.416, "eval_steps_per_second": 0.427, "step": 106 }, { "epoch": 3.34375, "grad_norm": 0.4620335365938039, "learning_rate": 2e-05, "loss": 0.7015, "step": 107 }, { "epoch": 3.34375, "eval_loss": 0.7226566076278687, "eval_runtime": 58.8135, "eval_samples_per_second": 3.401, "eval_steps_per_second": 0.425, "step": 107 }, { "epoch": 3.375, "grad_norm": 0.4009314815042761, "learning_rate": 2e-05, "loss": 0.7488, "step": 108 }, { "epoch": 3.375, "eval_loss": 0.7213454246520996, "eval_runtime": 58.735, "eval_samples_per_second": 3.405, "eval_steps_per_second": 0.426, "step": 108 }, { "epoch": 3.40625, "grad_norm": 0.456822567760836, "learning_rate": 2e-05, "loss": 0.7307, "step": 109 }, { "epoch": 3.40625, "eval_loss": 0.719496488571167, "eval_runtime": 58.9211, "eval_samples_per_second": 3.394, "eval_steps_per_second": 0.424, "step": 109 }, { "epoch": 3.4375, "grad_norm": 0.45520197938839, "learning_rate": 2e-05, "loss": 0.7348, "step": 110 }, { "epoch": 3.4375, "eval_loss": 0.7171263098716736, "eval_runtime": 58.9274, "eval_samples_per_second": 3.394, "eval_steps_per_second": 0.424, "step": 110 }, { "epoch": 3.46875, "grad_norm": 0.4421606621837213, "learning_rate": 2e-05, "loss": 0.8011, "step": 111 }, { "epoch": 3.46875, "eval_loss": 0.7155402898788452, "eval_runtime": 58.4009, "eval_samples_per_second": 3.425, "eval_steps_per_second": 0.428, "step": 111 }, { "epoch": 3.5, "grad_norm": 0.4111011701354251, "learning_rate": 2e-05, "loss": 0.7829, "step": 112 }, { "epoch": 3.5, "eval_loss": 0.714958667755127, "eval_runtime": 58.3143, "eval_samples_per_second": 3.43, "eval_steps_per_second": 0.429, "step": 112 }, { "epoch": 3.53125, "grad_norm": 0.40366265866888357, "learning_rate": 2e-05, "loss": 0.8596, "step": 113 }, { "epoch": 3.53125, "eval_loss": 0.7153159976005554, "eval_runtime": 58.5749, "eval_samples_per_second": 3.414, "eval_steps_per_second": 0.427, "step": 113 }, { "epoch": 3.5625, "grad_norm": 0.44914251592864773, "learning_rate": 2e-05, "loss": 0.7268, "step": 114 }, { "epoch": 3.5625, "eval_loss": 0.7159590721130371, "eval_runtime": 58.6872, "eval_samples_per_second": 3.408, "eval_steps_per_second": 0.426, "step": 114 }, { "epoch": 3.59375, "grad_norm": 0.4062399312752312, "learning_rate": 2e-05, "loss": 0.7875, "step": 115 }, { "epoch": 3.59375, "eval_loss": 0.7165355086326599, "eval_runtime": 58.4703, "eval_samples_per_second": 3.421, "eval_steps_per_second": 0.428, "step": 115 }, { "epoch": 3.625, "grad_norm": 0.44817350106485787, "learning_rate": 2e-05, "loss": 0.7623, "step": 116 }, { "epoch": 3.625, "eval_loss": 0.716560423374176, "eval_runtime": 58.5904, "eval_samples_per_second": 3.414, "eval_steps_per_second": 0.427, "step": 116 }, { "epoch": 3.65625, "grad_norm": 0.4309671248224914, "learning_rate": 2e-05, "loss": 0.7604, "step": 117 }, { "epoch": 3.65625, "eval_loss": 0.7165713310241699, "eval_runtime": 58.5214, "eval_samples_per_second": 3.418, "eval_steps_per_second": 0.427, "step": 117 }, { "epoch": 3.6875, "grad_norm": 0.44823929530189277, "learning_rate": 2e-05, "loss": 0.7751, "step": 118 }, { "epoch": 3.6875, "eval_loss": 0.7170334458351135, "eval_runtime": 58.7428, "eval_samples_per_second": 3.405, "eval_steps_per_second": 0.426, "step": 118 }, { "epoch": 3.71875, "grad_norm": 0.4369363559974751, "learning_rate": 2e-05, "loss": 0.8321, "step": 119 }, { "epoch": 3.71875, "eval_loss": 0.7169127464294434, "eval_runtime": 58.6794, "eval_samples_per_second": 3.408, "eval_steps_per_second": 0.426, "step": 119 }, { "epoch": 3.75, "grad_norm": 0.43105130939689645, "learning_rate": 2e-05, "loss": 0.7722, "step": 120 }, { "epoch": 3.75, "eval_loss": 0.7162806987762451, "eval_runtime": 58.7674, "eval_samples_per_second": 3.403, "eval_steps_per_second": 0.425, "step": 120 }, { "epoch": 3.78125, "grad_norm": 0.43789804607163635, "learning_rate": 2e-05, "loss": 0.7548, "step": 121 }, { "epoch": 3.78125, "eval_loss": 0.7144981622695923, "eval_runtime": 58.3815, "eval_samples_per_second": 3.426, "eval_steps_per_second": 0.428, "step": 121 }, { "epoch": 3.8125, "grad_norm": 0.46941128815266536, "learning_rate": 2e-05, "loss": 0.8189, "step": 122 }, { "epoch": 3.8125, "eval_loss": 0.712846040725708, "eval_runtime": 58.5034, "eval_samples_per_second": 3.419, "eval_steps_per_second": 0.427, "step": 122 }, { "epoch": 3.84375, "grad_norm": 0.4415453126320104, "learning_rate": 2e-05, "loss": 0.7484, "step": 123 }, { "epoch": 3.84375, "eval_loss": 0.7111316919326782, "eval_runtime": 58.566, "eval_samples_per_second": 3.415, "eval_steps_per_second": 0.427, "step": 123 }, { "epoch": 3.875, "grad_norm": 0.4237981688992312, "learning_rate": 2e-05, "loss": 0.77, "step": 124 }, { "epoch": 3.875, "eval_loss": 0.7098332047462463, "eval_runtime": 58.5232, "eval_samples_per_second": 3.417, "eval_steps_per_second": 0.427, "step": 124 }, { "epoch": 3.90625, "grad_norm": 0.49069037639672286, "learning_rate": 2e-05, "loss": 0.8059, "step": 125 }, { "epoch": 3.90625, "eval_loss": 0.7086107730865479, "eval_runtime": 59.8651, "eval_samples_per_second": 3.341, "eval_steps_per_second": 0.418, "step": 125 }, { "epoch": 3.9375, "grad_norm": 0.48569295378281013, "learning_rate": 2e-05, "loss": 0.7799, "step": 126 }, { "epoch": 3.9375, "eval_loss": 0.7077484726905823, "eval_runtime": 58.4449, "eval_samples_per_second": 3.422, "eval_steps_per_second": 0.428, "step": 126 }, { "epoch": 3.96875, "grad_norm": 0.47224685972430797, "learning_rate": 2e-05, "loss": 0.7381, "step": 127 }, { "epoch": 3.96875, "eval_loss": 0.7073386907577515, "eval_runtime": 58.5961, "eval_samples_per_second": 3.413, "eval_steps_per_second": 0.427, "step": 127 }, { "epoch": 4.0, "grad_norm": 0.48833051814427636, "learning_rate": 2e-05, "loss": 0.678, "step": 128 }, { "epoch": 4.0, "eval_loss": 0.706765353679657, "eval_runtime": 60.6877, "eval_samples_per_second": 3.296, "eval_steps_per_second": 0.412, "step": 128 }, { "epoch": 4.03125, "grad_norm": 0.4116173650136014, "learning_rate": 2e-05, "loss": 0.7582, "step": 129 }, { "epoch": 4.03125, "eval_loss": 0.7067686319351196, "eval_runtime": 58.4349, "eval_samples_per_second": 3.423, "eval_steps_per_second": 0.428, "step": 129 }, { "epoch": 4.0625, "grad_norm": 0.46176556383782513, "learning_rate": 2e-05, "loss": 0.7749, "step": 130 }, { "epoch": 4.0625, "eval_loss": 0.7066690325737, "eval_runtime": 58.7029, "eval_samples_per_second": 3.407, "eval_steps_per_second": 0.426, "step": 130 }, { "epoch": 4.09375, "grad_norm": 0.4454696779432102, "learning_rate": 2e-05, "loss": 0.773, "step": 131 }, { "epoch": 4.09375, "eval_loss": 0.7064326405525208, "eval_runtime": 61.252, "eval_samples_per_second": 3.265, "eval_steps_per_second": 0.408, "step": 131 }, { "epoch": 4.125, "grad_norm": 0.5015422163334902, "learning_rate": 2e-05, "loss": 0.7369, "step": 132 }, { "epoch": 4.125, "eval_loss": 0.7057382464408875, "eval_runtime": 59.411, "eval_samples_per_second": 3.366, "eval_steps_per_second": 0.421, "step": 132 }, { "epoch": 4.15625, "grad_norm": 0.472373878055723, "learning_rate": 2e-05, "loss": 0.8262, "step": 133 }, { "epoch": 4.15625, "eval_loss": 0.7050846815109253, "eval_runtime": 59.2996, "eval_samples_per_second": 3.373, "eval_steps_per_second": 0.422, "step": 133 }, { "epoch": 4.1875, "grad_norm": 0.5384950553698907, "learning_rate": 2e-05, "loss": 0.74, "step": 134 }, { "epoch": 4.1875, "eval_loss": 0.7045766711235046, "eval_runtime": 59.2928, "eval_samples_per_second": 3.373, "eval_steps_per_second": 0.422, "step": 134 }, { "epoch": 4.21875, "grad_norm": 0.4692662892631433, "learning_rate": 2e-05, "loss": 0.7443, "step": 135 }, { "epoch": 4.21875, "eval_loss": 0.7045109272003174, "eval_runtime": 59.525, "eval_samples_per_second": 3.36, "eval_steps_per_second": 0.42, "step": 135 }, { "epoch": 4.25, "grad_norm": 0.49707639799158876, "learning_rate": 2e-05, "loss": 0.733, "step": 136 }, { "epoch": 4.25, "eval_loss": 0.7047656178474426, "eval_runtime": 60.1718, "eval_samples_per_second": 3.324, "eval_steps_per_second": 0.415, "step": 136 }, { "epoch": 4.28125, "grad_norm": 0.5042999858449994, "learning_rate": 2e-05, "loss": 0.7303, "step": 137 }, { "epoch": 4.28125, "eval_loss": 0.7046284675598145, "eval_runtime": 60.01, "eval_samples_per_second": 3.333, "eval_steps_per_second": 0.417, "step": 137 }, { "epoch": 4.3125, "grad_norm": 0.5236583357740581, "learning_rate": 2e-05, "loss": 0.7254, "step": 138 }, { "epoch": 4.3125, "eval_loss": 0.7038366794586182, "eval_runtime": 60.3496, "eval_samples_per_second": 3.314, "eval_steps_per_second": 0.414, "step": 138 }, { "epoch": 4.34375, "grad_norm": 0.5197559530441114, "learning_rate": 2e-05, "loss": 0.6956, "step": 139 }, { "epoch": 4.34375, "eval_loss": 0.7023048400878906, "eval_runtime": 60.3808, "eval_samples_per_second": 3.312, "eval_steps_per_second": 0.414, "step": 139 }, { "epoch": 4.375, "grad_norm": 0.5214546280852583, "learning_rate": 2e-05, "loss": 0.7243, "step": 140 }, { "epoch": 4.375, "eval_loss": 0.7011681199073792, "eval_runtime": 60.1368, "eval_samples_per_second": 3.326, "eval_steps_per_second": 0.416, "step": 140 }, { "epoch": 4.40625, "grad_norm": 0.47638616269940814, "learning_rate": 2e-05, "loss": 0.7442, "step": 141 }, { "epoch": 4.40625, "eval_loss": 0.7005561590194702, "eval_runtime": 61.003, "eval_samples_per_second": 3.279, "eval_steps_per_second": 0.41, "step": 141 }, { "epoch": 4.4375, "grad_norm": 0.5067672241908349, "learning_rate": 2e-05, "loss": 0.693, "step": 142 }, { "epoch": 4.4375, "eval_loss": 0.7004985809326172, "eval_runtime": 60.1646, "eval_samples_per_second": 3.324, "eval_steps_per_second": 0.416, "step": 142 }, { "epoch": 4.46875, "grad_norm": 0.5323088696033406, "learning_rate": 2e-05, "loss": 0.7019, "step": 143 }, { "epoch": 4.46875, "eval_loss": 0.7001196146011353, "eval_runtime": 59.9527, "eval_samples_per_second": 3.336, "eval_steps_per_second": 0.417, "step": 143 }, { "epoch": 4.5, "grad_norm": 0.4994538125400832, "learning_rate": 2e-05, "loss": 0.684, "step": 144 }, { "epoch": 4.5, "eval_loss": 0.6989223957061768, "eval_runtime": 59.7753, "eval_samples_per_second": 3.346, "eval_steps_per_second": 0.418, "step": 144 }, { "epoch": 4.53125, "grad_norm": 0.5328972466603664, "learning_rate": 2e-05, "loss": 0.7581, "step": 145 }, { "epoch": 4.53125, "eval_loss": 0.697172999382019, "eval_runtime": 59.678, "eval_samples_per_second": 3.351, "eval_steps_per_second": 0.419, "step": 145 }, { "epoch": 4.5625, "grad_norm": 0.557725244530984, "learning_rate": 2e-05, "loss": 0.6562, "step": 146 }, { "epoch": 4.5625, "eval_loss": 0.6954514980316162, "eval_runtime": 59.6753, "eval_samples_per_second": 3.351, "eval_steps_per_second": 0.419, "step": 146 }, { "epoch": 4.59375, "grad_norm": 0.520999668899182, "learning_rate": 2e-05, "loss": 0.7108, "step": 147 }, { "epoch": 4.59375, "eval_loss": 0.6949453949928284, "eval_runtime": 59.7891, "eval_samples_per_second": 3.345, "eval_steps_per_second": 0.418, "step": 147 }, { "epoch": 4.625, "grad_norm": 0.513677589761833, "learning_rate": 2e-05, "loss": 0.6697, "step": 148 }, { "epoch": 4.625, "eval_loss": 0.6953239440917969, "eval_runtime": 59.7415, "eval_samples_per_second": 3.348, "eval_steps_per_second": 0.418, "step": 148 }, { "epoch": 4.65625, "grad_norm": 0.5054488117701784, "learning_rate": 2e-05, "loss": 0.7793, "step": 149 }, { "epoch": 4.65625, "eval_loss": 0.6959659457206726, "eval_runtime": 59.9711, "eval_samples_per_second": 3.335, "eval_steps_per_second": 0.417, "step": 149 }, { "epoch": 4.6875, "grad_norm": 0.5962123257952582, "learning_rate": 2e-05, "loss": 0.7068, "step": 150 }, { "epoch": 4.6875, "eval_loss": 0.6952192783355713, "eval_runtime": 59.6824, "eval_samples_per_second": 3.351, "eval_steps_per_second": 0.419, "step": 150 }, { "epoch": 4.71875, "grad_norm": 0.6009619303481951, "learning_rate": 2e-05, "loss": 0.7261, "step": 151 }, { "epoch": 4.71875, "eval_loss": 0.6935360431671143, "eval_runtime": 59.5352, "eval_samples_per_second": 3.359, "eval_steps_per_second": 0.42, "step": 151 }, { "epoch": 4.75, "grad_norm": 0.5670117266130251, "learning_rate": 2e-05, "loss": 0.744, "step": 152 }, { "epoch": 4.75, "eval_loss": 0.6924968957901001, "eval_runtime": 61.2965, "eval_samples_per_second": 3.263, "eval_steps_per_second": 0.408, "step": 152 }, { "epoch": 4.78125, "grad_norm": 0.5564998515626721, "learning_rate": 2e-05, "loss": 0.6982, "step": 153 }, { "epoch": 4.78125, "eval_loss": 0.6924961805343628, "eval_runtime": 61.2731, "eval_samples_per_second": 3.264, "eval_steps_per_second": 0.408, "step": 153 }, { "epoch": 4.8125, "grad_norm": 0.528752035989291, "learning_rate": 2e-05, "loss": 0.7109, "step": 154 }, { "epoch": 4.8125, "eval_loss": 0.6933311223983765, "eval_runtime": 59.8859, "eval_samples_per_second": 3.34, "eval_steps_per_second": 0.417, "step": 154 }, { "epoch": 4.84375, "grad_norm": 0.5868388300709311, "learning_rate": 2e-05, "loss": 0.6592, "step": 155 }, { "epoch": 4.84375, "eval_loss": 0.6933980584144592, "eval_runtime": 59.9915, "eval_samples_per_second": 3.334, "eval_steps_per_second": 0.417, "step": 155 }, { "epoch": 4.875, "grad_norm": 0.5602090329210427, "learning_rate": 2e-05, "loss": 0.7682, "step": 156 }, { "epoch": 4.875, "eval_loss": 0.6923888921737671, "eval_runtime": 61.499, "eval_samples_per_second": 3.252, "eval_steps_per_second": 0.407, "step": 156 }, { "epoch": 4.90625, "grad_norm": 0.5051330890531748, "learning_rate": 2e-05, "loss": 0.7491, "step": 157 }, { "epoch": 4.90625, "eval_loss": 0.69191575050354, "eval_runtime": 59.6969, "eval_samples_per_second": 3.35, "eval_steps_per_second": 0.419, "step": 157 }, { "epoch": 4.9375, "grad_norm": 0.5377224007409029, "learning_rate": 2e-05, "loss": 0.7501, "step": 158 }, { "epoch": 4.9375, "eval_loss": 0.69122314453125, "eval_runtime": 60.1345, "eval_samples_per_second": 3.326, "eval_steps_per_second": 0.416, "step": 158 }, { "epoch": 4.96875, "grad_norm": 0.544576473903093, "learning_rate": 2e-05, "loss": 0.714, "step": 159 }, { "epoch": 4.96875, "eval_loss": 0.6905286908149719, "eval_runtime": 59.9667, "eval_samples_per_second": 3.335, "eval_steps_per_second": 0.417, "step": 159 }, { "epoch": 5.0, "grad_norm": 0.5027197538560159, "learning_rate": 2e-05, "loss": 0.7181, "step": 160 }, { "epoch": 5.0, "eval_loss": 0.6906802654266357, "eval_runtime": 60.0766, "eval_samples_per_second": 3.329, "eval_steps_per_second": 0.416, "step": 160 }, { "epoch": 5.03125, "grad_norm": 0.5041535532115543, "learning_rate": 2e-05, "loss": 0.6636, "step": 161 }, { "epoch": 5.03125, "eval_loss": 0.6912646293640137, "eval_runtime": 63.5855, "eval_samples_per_second": 3.145, "eval_steps_per_second": 0.393, "step": 161 }, { "epoch": 5.0625, "grad_norm": 0.5286650599348627, "learning_rate": 2e-05, "loss": 0.8107, "step": 162 }, { "epoch": 5.0625, "eval_loss": 0.6922540068626404, "eval_runtime": 56.5364, "eval_samples_per_second": 3.538, "eval_steps_per_second": 0.442, "step": 162 }, { "epoch": 5.09375, "grad_norm": 0.588785168960039, "learning_rate": 2e-05, "loss": 0.6169, "step": 163 }, { "epoch": 5.09375, "eval_loss": 0.692643404006958, "eval_runtime": 56.5005, "eval_samples_per_second": 3.54, "eval_steps_per_second": 0.442, "step": 163 }, { "epoch": 5.125, "grad_norm": 0.5752677936578872, "learning_rate": 2e-05, "loss": 0.7473, "step": 164 }, { "epoch": 5.125, "eval_loss": 0.6927568912506104, "eval_runtime": 58.5386, "eval_samples_per_second": 3.417, "eval_steps_per_second": 0.427, "step": 164 }, { "epoch": 5.15625, "grad_norm": 0.6487162117437294, "learning_rate": 2e-05, "loss": 0.588, "step": 165 }, { "epoch": 5.15625, "eval_loss": 0.692574143409729, "eval_runtime": 56.4611, "eval_samples_per_second": 3.542, "eval_steps_per_second": 0.443, "step": 165 }, { "epoch": 5.1875, "grad_norm": 0.6353608377871973, "learning_rate": 2e-05, "loss": 0.6933, "step": 166 }, { "epoch": 5.1875, "eval_loss": 0.6932590007781982, "eval_runtime": 56.5989, "eval_samples_per_second": 3.534, "eval_steps_per_second": 0.442, "step": 166 }, { "epoch": 5.21875, "grad_norm": 0.5450036592535661, "learning_rate": 2e-05, "loss": 0.7175, "step": 167 }, { "epoch": 5.21875, "eval_loss": 0.6944625973701477, "eval_runtime": 56.5362, "eval_samples_per_second": 3.538, "eval_steps_per_second": 0.442, "step": 167 }, { "epoch": 5.25, "grad_norm": 0.6095734786538398, "learning_rate": 2e-05, "loss": 0.7478, "step": 168 }, { "epoch": 5.25, "eval_loss": 0.695120632648468, "eval_runtime": 56.465, "eval_samples_per_second": 3.542, "eval_steps_per_second": 0.443, "step": 168 }, { "epoch": 5.28125, "grad_norm": 0.5879704367364821, "learning_rate": 2e-05, "loss": 0.674, "step": 169 }, { "epoch": 5.28125, "eval_loss": 0.6956540942192078, "eval_runtime": 56.6007, "eval_samples_per_second": 3.534, "eval_steps_per_second": 0.442, "step": 169 }, { "epoch": 5.3125, "grad_norm": 0.6595426789183463, "learning_rate": 2e-05, "loss": 0.6536, "step": 170 }, { "epoch": 5.3125, "eval_loss": 0.6957553029060364, "eval_runtime": 56.4722, "eval_samples_per_second": 3.542, "eval_steps_per_second": 0.443, "step": 170 }, { "epoch": 5.34375, "grad_norm": 0.7708120772721636, "learning_rate": 2e-05, "loss": 0.666, "step": 171 }, { "epoch": 5.34375, "eval_loss": 0.693030834197998, "eval_runtime": 56.3518, "eval_samples_per_second": 3.549, "eval_steps_per_second": 0.444, "step": 171 }, { "epoch": 5.375, "grad_norm": 0.666091377671071, "learning_rate": 2e-05, "loss": 0.7422, "step": 172 }, { "epoch": 5.375, "eval_loss": 0.6900334358215332, "eval_runtime": 56.5395, "eval_samples_per_second": 3.537, "eval_steps_per_second": 0.442, "step": 172 }, { "epoch": 5.40625, "grad_norm": 0.6203365868953359, "learning_rate": 2e-05, "loss": 0.7069, "step": 173 }, { "epoch": 5.40625, "eval_loss": 0.6880744099617004, "eval_runtime": 56.4675, "eval_samples_per_second": 3.542, "eval_steps_per_second": 0.443, "step": 173 }, { "epoch": 5.4375, "grad_norm": 0.6299525495855296, "learning_rate": 2e-05, "loss": 0.7422, "step": 174 }, { "epoch": 5.4375, "eval_loss": 0.686725378036499, "eval_runtime": 56.671, "eval_samples_per_second": 3.529, "eval_steps_per_second": 0.441, "step": 174 }, { "epoch": 5.46875, "grad_norm": 0.6415660970283229, "learning_rate": 2e-05, "loss": 0.7347, "step": 175 }, { "epoch": 5.46875, "eval_loss": 0.6870352029800415, "eval_runtime": 56.5976, "eval_samples_per_second": 3.534, "eval_steps_per_second": 0.442, "step": 175 }, { "epoch": 5.5, "grad_norm": 0.6569935128967318, "learning_rate": 2e-05, "loss": 0.6773, "step": 176 }, { "epoch": 5.5, "eval_loss": 0.6870338320732117, "eval_runtime": 57.2325, "eval_samples_per_second": 3.495, "eval_steps_per_second": 0.437, "step": 176 }, { "epoch": 5.53125, "grad_norm": 0.6895239904364278, "learning_rate": 2e-05, "loss": 0.7106, "step": 177 }, { "epoch": 5.53125, "eval_loss": 0.6859387755393982, "eval_runtime": 57.3075, "eval_samples_per_second": 3.49, "eval_steps_per_second": 0.436, "step": 177 }, { "epoch": 5.5625, "grad_norm": 0.5855839234707383, "learning_rate": 2e-05, "loss": 0.7361, "step": 178 }, { "epoch": 5.5625, "eval_loss": 0.6856819987297058, "eval_runtime": 57.5973, "eval_samples_per_second": 3.472, "eval_steps_per_second": 0.434, "step": 178 }, { "epoch": 5.59375, "grad_norm": 0.6198072484940144, "learning_rate": 2e-05, "loss": 0.6386, "step": 179 }, { "epoch": 5.59375, "eval_loss": 0.6865841746330261, "eval_runtime": 57.4429, "eval_samples_per_second": 3.482, "eval_steps_per_second": 0.435, "step": 179 }, { "epoch": 5.625, "grad_norm": 0.6169444945747248, "learning_rate": 2e-05, "loss": 0.6455, "step": 180 }, { "epoch": 5.625, "eval_loss": 0.6871997714042664, "eval_runtime": 57.3975, "eval_samples_per_second": 3.484, "eval_steps_per_second": 0.436, "step": 180 }, { "epoch": 5.65625, "grad_norm": 0.6524804251939137, "learning_rate": 2e-05, "loss": 0.6588, "step": 181 }, { "epoch": 5.65625, "eval_loss": 0.6873356103897095, "eval_runtime": 57.4579, "eval_samples_per_second": 3.481, "eval_steps_per_second": 0.435, "step": 181 }, { "epoch": 5.6875, "grad_norm": 0.6578787618504525, "learning_rate": 2e-05, "loss": 0.6274, "step": 182 }, { "epoch": 5.6875, "eval_loss": 0.6880214214324951, "eval_runtime": 57.5735, "eval_samples_per_second": 3.474, "eval_steps_per_second": 0.434, "step": 182 }, { "epoch": 5.71875, "grad_norm": 0.732160801451622, "learning_rate": 2e-05, "loss": 0.6623, "step": 183 }, { "epoch": 5.71875, "eval_loss": 0.6879817247390747, "eval_runtime": 57.5801, "eval_samples_per_second": 3.473, "eval_steps_per_second": 0.434, "step": 183 }, { "epoch": 5.75, "grad_norm": 0.7294753965107613, "learning_rate": 2e-05, "loss": 0.6562, "step": 184 }, { "epoch": 5.75, "eval_loss": 0.6870495676994324, "eval_runtime": 57.6659, "eval_samples_per_second": 3.468, "eval_steps_per_second": 0.434, "step": 184 }, { "epoch": 5.78125, "grad_norm": 0.6947870304881401, "learning_rate": 2e-05, "loss": 0.695, "step": 185 }, { "epoch": 5.78125, "eval_loss": 0.6856162548065186, "eval_runtime": 57.4452, "eval_samples_per_second": 3.482, "eval_steps_per_second": 0.435, "step": 185 }, { "epoch": 5.8125, "grad_norm": 0.7085011414361884, "learning_rate": 2e-05, "loss": 0.6634, "step": 186 }, { "epoch": 5.8125, "eval_loss": 0.6839439272880554, "eval_runtime": 57.3621, "eval_samples_per_second": 3.487, "eval_steps_per_second": 0.436, "step": 186 }, { "epoch": 5.84375, "grad_norm": 0.6548606152047736, "learning_rate": 2e-05, "loss": 0.7117, "step": 187 }, { "epoch": 5.84375, "eval_loss": 0.6837204098701477, "eval_runtime": 57.3849, "eval_samples_per_second": 3.485, "eval_steps_per_second": 0.436, "step": 187 }, { "epoch": 5.875, "grad_norm": 0.6662179186613736, "learning_rate": 2e-05, "loss": 0.6528, "step": 188 }, { "epoch": 5.875, "eval_loss": 0.6844826340675354, "eval_runtime": 57.3173, "eval_samples_per_second": 3.489, "eval_steps_per_second": 0.436, "step": 188 }, { "epoch": 5.90625, "grad_norm": 0.6638311768585444, "learning_rate": 2e-05, "loss": 0.6582, "step": 189 }, { "epoch": 5.90625, "eval_loss": 0.6846724152565002, "eval_runtime": 57.5354, "eval_samples_per_second": 3.476, "eval_steps_per_second": 0.435, "step": 189 }, { "epoch": 5.9375, "grad_norm": 0.7007259768118588, "learning_rate": 2e-05, "loss": 0.6742, "step": 190 }, { "epoch": 5.9375, "eval_loss": 0.6834731101989746, "eval_runtime": 57.4134, "eval_samples_per_second": 3.484, "eval_steps_per_second": 0.435, "step": 190 }, { "epoch": 5.96875, "grad_norm": 0.6563132346432226, "learning_rate": 2e-05, "loss": 0.6752, "step": 191 }, { "epoch": 5.96875, "eval_loss": 0.6817070245742798, "eval_runtime": 56.6649, "eval_samples_per_second": 3.53, "eval_steps_per_second": 0.441, "step": 191 }, { "epoch": 6.0, "grad_norm": 0.6349703649303867, "learning_rate": 2e-05, "loss": 0.6795, "step": 192 }, { "epoch": 6.0, "eval_loss": 0.6804311871528625, "eval_runtime": 56.4378, "eval_samples_per_second": 3.544, "eval_steps_per_second": 0.443, "step": 192 }, { "epoch": 6.03125, "grad_norm": 0.6716039243820887, "learning_rate": 2e-05, "loss": 0.7145, "step": 193 }, { "epoch": 6.03125, "eval_loss": 0.6804825067520142, "eval_runtime": 56.6403, "eval_samples_per_second": 3.531, "eval_steps_per_second": 0.441, "step": 193 }, { "epoch": 6.0625, "grad_norm": 0.5950395984856348, "learning_rate": 2e-05, "loss": 0.6768, "step": 194 }, { "epoch": 6.0625, "eval_loss": 0.6823931932449341, "eval_runtime": 56.5459, "eval_samples_per_second": 3.537, "eval_steps_per_second": 0.442, "step": 194 }, { "epoch": 6.09375, "grad_norm": 0.6787703014730869, "learning_rate": 2e-05, "loss": 0.6158, "step": 195 }, { "epoch": 6.09375, "eval_loss": 0.6854414939880371, "eval_runtime": 56.5293, "eval_samples_per_second": 3.538, "eval_steps_per_second": 0.442, "step": 195 }, { "epoch": 6.125, "grad_norm": 0.6526684210082853, "learning_rate": 2e-05, "loss": 0.6479, "step": 196 }, { "epoch": 6.125, "eval_loss": 0.6892845034599304, "eval_runtime": 56.5099, "eval_samples_per_second": 3.539, "eval_steps_per_second": 0.442, "step": 196 }, { "epoch": 6.15625, "grad_norm": 0.6997704487164051, "learning_rate": 2e-05, "loss": 0.6706, "step": 197 }, { "epoch": 6.15625, "eval_loss": 0.6941932439804077, "eval_runtime": 58.514, "eval_samples_per_second": 3.418, "eval_steps_per_second": 0.427, "step": 197 }, { "epoch": 6.1875, "grad_norm": 0.7511370305129338, "learning_rate": 2e-05, "loss": 0.7418, "step": 198 }, { "epoch": 6.1875, "eval_loss": 0.6964046955108643, "eval_runtime": 58.4428, "eval_samples_per_second": 3.422, "eval_steps_per_second": 0.428, "step": 198 }, { "epoch": 6.21875, "grad_norm": 0.8468482037911412, "learning_rate": 2e-05, "loss": 0.618, "step": 199 }, { "epoch": 6.21875, "eval_loss": 0.6947888731956482, "eval_runtime": 56.6921, "eval_samples_per_second": 3.528, "eval_steps_per_second": 0.441, "step": 199 }, { "epoch": 6.25, "grad_norm": 0.80366391754735, "learning_rate": 2e-05, "loss": 0.6712, "step": 200 }, { "epoch": 6.25, "eval_loss": 0.691255509853363, "eval_runtime": 56.7536, "eval_samples_per_second": 3.524, "eval_steps_per_second": 0.441, "step": 200 }, { "epoch": 6.28125, "grad_norm": 0.7123001788838409, "learning_rate": 2e-05, "loss": 0.6886, "step": 201 }, { "epoch": 6.28125, "eval_loss": 0.6888566613197327, "eval_runtime": 57.4537, "eval_samples_per_second": 3.481, "eval_steps_per_second": 0.435, "step": 201 }, { "epoch": 6.3125, "grad_norm": 0.7785807978964993, "learning_rate": 2e-05, "loss": 0.6096, "step": 202 }, { "epoch": 6.3125, "eval_loss": 0.6869829297065735, "eval_runtime": 57.3967, "eval_samples_per_second": 3.485, "eval_steps_per_second": 0.436, "step": 202 }, { "epoch": 6.34375, "grad_norm": 0.6771659776183533, "learning_rate": 2e-05, "loss": 0.7328, "step": 203 }, { "epoch": 6.34375, "eval_loss": 0.6867367625236511, "eval_runtime": 57.5277, "eval_samples_per_second": 3.477, "eval_steps_per_second": 0.435, "step": 203 }, { "epoch": 6.375, "grad_norm": 0.8106446356590065, "learning_rate": 2e-05, "loss": 0.5931, "step": 204 }, { "epoch": 6.375, "eval_loss": 0.6862130165100098, "eval_runtime": 57.4868, "eval_samples_per_second": 3.479, "eval_steps_per_second": 0.435, "step": 204 }, { "epoch": 6.40625, "grad_norm": 0.6600674902481064, "learning_rate": 2e-05, "loss": 0.5789, "step": 205 }, { "epoch": 6.40625, "eval_loss": 0.6866827607154846, "eval_runtime": 57.4287, "eval_samples_per_second": 3.483, "eval_steps_per_second": 0.435, "step": 205 }, { "epoch": 6.4375, "grad_norm": 0.8177118767015663, "learning_rate": 2e-05, "loss": 0.6395, "step": 206 }, { "epoch": 6.4375, "eval_loss": 0.6866394281387329, "eval_runtime": 57.0918, "eval_samples_per_second": 3.503, "eval_steps_per_second": 0.438, "step": 206 }, { "epoch": 6.46875, "grad_norm": 0.7284237801181533, "learning_rate": 2e-05, "loss": 0.6835, "step": 207 }, { "epoch": 6.46875, "eval_loss": 0.6864017248153687, "eval_runtime": 57.1565, "eval_samples_per_second": 3.499, "eval_steps_per_second": 0.437, "step": 207 }, { "epoch": 6.5, "grad_norm": 0.7603002790103086, "learning_rate": 2e-05, "loss": 0.6347, "step": 208 }, { "epoch": 6.5, "eval_loss": 0.6871703267097473, "eval_runtime": 57.4181, "eval_samples_per_second": 3.483, "eval_steps_per_second": 0.435, "step": 208 }, { "epoch": 6.53125, "grad_norm": 0.8359766442946917, "learning_rate": 2e-05, "loss": 0.6088, "step": 209 }, { "epoch": 6.53125, "eval_loss": 0.6878347992897034, "eval_runtime": 57.4837, "eval_samples_per_second": 3.479, "eval_steps_per_second": 0.435, "step": 209 }, { "epoch": 6.5625, "grad_norm": 0.7778968951616311, "learning_rate": 2e-05, "loss": 0.5912, "step": 210 }, { "epoch": 6.5625, "eval_loss": 0.6893374919891357, "eval_runtime": 57.6159, "eval_samples_per_second": 3.471, "eval_steps_per_second": 0.434, "step": 210 }, { "epoch": 6.59375, "grad_norm": 0.8300437291816744, "learning_rate": 2e-05, "loss": 0.6299, "step": 211 }, { "epoch": 6.59375, "eval_loss": 0.6899804472923279, "eval_runtime": 57.1491, "eval_samples_per_second": 3.5, "eval_steps_per_second": 0.437, "step": 211 }, { "epoch": 6.625, "grad_norm": 0.7994430152763061, "learning_rate": 2e-05, "loss": 0.6073, "step": 212 }, { "epoch": 6.625, "eval_loss": 0.6889459490776062, "eval_runtime": 57.3773, "eval_samples_per_second": 3.486, "eval_steps_per_second": 0.436, "step": 212 }, { "epoch": 6.65625, "grad_norm": 0.7475670453371858, "learning_rate": 2e-05, "loss": 0.6774, "step": 213 }, { "epoch": 6.65625, "eval_loss": 0.6873544454574585, "eval_runtime": 57.4114, "eval_samples_per_second": 3.484, "eval_steps_per_second": 0.435, "step": 213 }, { "epoch": 6.6875, "grad_norm": 0.7281375343651885, "learning_rate": 2e-05, "loss": 0.6404, "step": 214 }, { "epoch": 6.6875, "eval_loss": 0.6867469549179077, "eval_runtime": 57.2899, "eval_samples_per_second": 3.491, "eval_steps_per_second": 0.436, "step": 214 }, { "epoch": 6.71875, "grad_norm": 0.7684115091080507, "learning_rate": 2e-05, "loss": 0.6382, "step": 215 }, { "epoch": 6.71875, "eval_loss": 0.6860084533691406, "eval_runtime": 57.38, "eval_samples_per_second": 3.486, "eval_steps_per_second": 0.436, "step": 215 }, { "epoch": 6.75, "grad_norm": 0.7962356695445627, "learning_rate": 2e-05, "loss": 0.6398, "step": 216 }, { "epoch": 6.75, "eval_loss": 0.6856002807617188, "eval_runtime": 57.2399, "eval_samples_per_second": 3.494, "eval_steps_per_second": 0.437, "step": 216 }, { "epoch": 6.78125, "grad_norm": 0.7893826807634562, "learning_rate": 2e-05, "loss": 0.59, "step": 217 }, { "epoch": 6.78125, "eval_loss": 0.6870043873786926, "eval_runtime": 57.1671, "eval_samples_per_second": 3.499, "eval_steps_per_second": 0.437, "step": 217 }, { "epoch": 6.8125, "grad_norm": 0.8329644141570051, "learning_rate": 2e-05, "loss": 0.5932, "step": 218 }, { "epoch": 6.8125, "eval_loss": 0.6870229840278625, "eval_runtime": 57.3642, "eval_samples_per_second": 3.486, "eval_steps_per_second": 0.436, "step": 218 }, { "epoch": 6.84375, "grad_norm": 0.9075127715796286, "learning_rate": 2e-05, "loss": 0.669, "step": 219 }, { "epoch": 6.84375, "eval_loss": 0.6856889128684998, "eval_runtime": 57.4226, "eval_samples_per_second": 3.483, "eval_steps_per_second": 0.435, "step": 219 }, { "epoch": 6.875, "grad_norm": 0.8464505810718659, "learning_rate": 2e-05, "loss": 0.686, "step": 220 }, { "epoch": 6.875, "eval_loss": 0.6835823059082031, "eval_runtime": 57.2105, "eval_samples_per_second": 3.496, "eval_steps_per_second": 0.437, "step": 220 }, { "epoch": 6.90625, "grad_norm": 0.7799140952562077, "learning_rate": 2e-05, "loss": 0.6503, "step": 221 }, { "epoch": 6.90625, "eval_loss": 0.6825523376464844, "eval_runtime": 57.0985, "eval_samples_per_second": 3.503, "eval_steps_per_second": 0.438, "step": 221 }, { "epoch": 6.9375, "grad_norm": 0.8495343756184095, "learning_rate": 2e-05, "loss": 0.6533, "step": 222 }, { "epoch": 6.9375, "eval_loss": 0.6813305616378784, "eval_runtime": 57.1896, "eval_samples_per_second": 3.497, "eval_steps_per_second": 0.437, "step": 222 }, { "epoch": 6.96875, "grad_norm": 0.8191950862245413, "learning_rate": 2e-05, "loss": 0.6627, "step": 223 }, { "epoch": 6.96875, "eval_loss": 0.6800451874732971, "eval_runtime": 57.3904, "eval_samples_per_second": 3.485, "eval_steps_per_second": 0.436, "step": 223 }, { "epoch": 7.0, "grad_norm": 0.8196747980504347, "learning_rate": 2e-05, "loss": 0.7337, "step": 224 }, { "epoch": 7.0, "eval_loss": 0.6801488399505615, "eval_runtime": 59.0121, "eval_samples_per_second": 3.389, "eval_steps_per_second": 0.424, "step": 224 }, { "epoch": 7.03125, "grad_norm": 0.7095908101379159, "learning_rate": 2e-05, "loss": 0.6203, "step": 225 }, { "epoch": 7.03125, "eval_loss": 0.6816287040710449, "eval_runtime": 57.1754, "eval_samples_per_second": 3.498, "eval_steps_per_second": 0.437, "step": 225 }, { "epoch": 7.0625, "grad_norm": 0.7916901149958031, "learning_rate": 2e-05, "loss": 0.5489, "step": 226 }, { "epoch": 7.0625, "eval_loss": 0.6857742071151733, "eval_runtime": 58.0461, "eval_samples_per_second": 3.446, "eval_steps_per_second": 0.431, "step": 226 }, { "epoch": 7.09375, "grad_norm": 0.8190252103616696, "learning_rate": 2e-05, "loss": 0.613, "step": 227 }, { "epoch": 7.09375, "eval_loss": 0.6924745440483093, "eval_runtime": 58.351, "eval_samples_per_second": 3.428, "eval_steps_per_second": 0.428, "step": 227 }, { "epoch": 7.125, "grad_norm": 0.9385023798254423, "learning_rate": 2e-05, "loss": 0.5647, "step": 228 }, { "epoch": 7.125, "eval_loss": 0.7020445466041565, "eval_runtime": 58.1868, "eval_samples_per_second": 3.437, "eval_steps_per_second": 0.43, "step": 228 }, { "epoch": 7.15625, "grad_norm": 1.178887354836488, "learning_rate": 2e-05, "loss": 0.5957, "step": 229 }, { "epoch": 7.15625, "eval_loss": 0.7064430117607117, "eval_runtime": 58.3297, "eval_samples_per_second": 3.429, "eval_steps_per_second": 0.429, "step": 229 }, { "epoch": 7.1875, "grad_norm": 1.0054198258359948, "learning_rate": 2e-05, "loss": 0.5667, "step": 230 }, { "epoch": 7.1875, "eval_loss": 0.7060463428497314, "eval_runtime": 58.3212, "eval_samples_per_second": 3.429, "eval_steps_per_second": 0.429, "step": 230 }, { "epoch": 7.21875, "grad_norm": 1.005055760217432, "learning_rate": 2e-05, "loss": 0.6546, "step": 231 }, { "epoch": 7.21875, "eval_loss": 0.7029504179954529, "eval_runtime": 58.0188, "eval_samples_per_second": 3.447, "eval_steps_per_second": 0.431, "step": 231 }, { "epoch": 7.25, "grad_norm": 0.9458472260674603, "learning_rate": 2e-05, "loss": 0.6503, "step": 232 }, { "epoch": 7.25, "eval_loss": 0.6988745927810669, "eval_runtime": 58.3149, "eval_samples_per_second": 3.43, "eval_steps_per_second": 0.429, "step": 232 }, { "epoch": 7.28125, "grad_norm": 1.022594832986886, "learning_rate": 2e-05, "loss": 0.611, "step": 233 }, { "epoch": 7.28125, "eval_loss": 0.6943955421447754, "eval_runtime": 58.3693, "eval_samples_per_second": 3.426, "eval_steps_per_second": 0.428, "step": 233 }, { "epoch": 7.3125, "grad_norm": 0.8953283498269817, "learning_rate": 2e-05, "loss": 0.6438, "step": 234 }, { "epoch": 7.3125, "eval_loss": 0.6924715638160706, "eval_runtime": 58.214, "eval_samples_per_second": 3.436, "eval_steps_per_second": 0.429, "step": 234 }, { "epoch": 7.34375, "grad_norm": 0.9094812403228425, "learning_rate": 2e-05, "loss": 0.6123, "step": 235 }, { "epoch": 7.34375, "eval_loss": 0.690609335899353, "eval_runtime": 58.6042, "eval_samples_per_second": 3.413, "eval_steps_per_second": 0.427, "step": 235 }, { "epoch": 7.375, "grad_norm": 0.9433427892139121, "learning_rate": 2e-05, "loss": 0.5772, "step": 236 }, { "epoch": 7.375, "eval_loss": 0.6895288825035095, "eval_runtime": 58.0083, "eval_samples_per_second": 3.448, "eval_steps_per_second": 0.431, "step": 236 }, { "epoch": 7.40625, "grad_norm": 0.9654218046347709, "learning_rate": 2e-05, "loss": 0.62, "step": 237 }, { "epoch": 7.40625, "eval_loss": 0.6887797713279724, "eval_runtime": 58.1374, "eval_samples_per_second": 3.44, "eval_steps_per_second": 0.43, "step": 237 }, { "epoch": 7.4375, "grad_norm": 1.033591761626784, "learning_rate": 2e-05, "loss": 0.6163, "step": 238 }, { "epoch": 7.4375, "eval_loss": 0.6888651847839355, "eval_runtime": 58.2539, "eval_samples_per_second": 3.433, "eval_steps_per_second": 0.429, "step": 238 }, { "epoch": 7.46875, "grad_norm": 0.9059638854254064, "learning_rate": 2e-05, "loss": 0.6364, "step": 239 }, { "epoch": 7.46875, "eval_loss": 0.6905540227890015, "eval_runtime": 58.0992, "eval_samples_per_second": 3.442, "eval_steps_per_second": 0.43, "step": 239 }, { "epoch": 7.5, "grad_norm": 0.9193726862314907, "learning_rate": 2e-05, "loss": 0.5845, "step": 240 }, { "epoch": 7.5, "eval_loss": 0.693742036819458, "eval_runtime": 58.2336, "eval_samples_per_second": 3.434, "eval_steps_per_second": 0.429, "step": 240 }, { "epoch": 7.53125, "grad_norm": 0.8539139714986941, "learning_rate": 2e-05, "loss": 0.6344, "step": 241 }, { "epoch": 7.53125, "eval_loss": 0.696897566318512, "eval_runtime": 59.3124, "eval_samples_per_second": 3.372, "eval_steps_per_second": 0.421, "step": 241 }, { "epoch": 7.5625, "grad_norm": 0.9552275495908527, "learning_rate": 2e-05, "loss": 0.6159, "step": 242 }, { "epoch": 7.5625, "eval_loss": 0.6991227865219116, "eval_runtime": 58.1037, "eval_samples_per_second": 3.442, "eval_steps_per_second": 0.43, "step": 242 }, { "epoch": 7.59375, "grad_norm": 0.8953175982318474, "learning_rate": 2e-05, "loss": 0.5934, "step": 243 }, { "epoch": 7.59375, "eval_loss": 0.7009669542312622, "eval_runtime": 59.9178, "eval_samples_per_second": 3.338, "eval_steps_per_second": 0.417, "step": 243 }, { "epoch": 7.625, "grad_norm": 1.1254017430464345, "learning_rate": 2e-05, "loss": 0.6721, "step": 244 }, { "epoch": 7.625, "eval_loss": 0.7003803253173828, "eval_runtime": 59.9278, "eval_samples_per_second": 3.337, "eval_steps_per_second": 0.417, "step": 244 }, { "epoch": 7.65625, "grad_norm": 0.9666525684896161, "learning_rate": 2e-05, "loss": 0.5793, "step": 245 }, { "epoch": 7.65625, "eval_loss": 0.6997054815292358, "eval_runtime": 58.3355, "eval_samples_per_second": 3.428, "eval_steps_per_second": 0.429, "step": 245 }, { "epoch": 7.6875, "grad_norm": 1.0500213825228455, "learning_rate": 2e-05, "loss": 0.6262, "step": 246 }, { "epoch": 7.6875, "eval_loss": 0.6956760883331299, "eval_runtime": 57.9053, "eval_samples_per_second": 3.454, "eval_steps_per_second": 0.432, "step": 246 }, { "epoch": 7.71875, "grad_norm": 1.0445166827193935, "learning_rate": 2e-05, "loss": 0.6111, "step": 247 }, { "epoch": 7.71875, "eval_loss": 0.6909776329994202, "eval_runtime": 58.1856, "eval_samples_per_second": 3.437, "eval_steps_per_second": 0.43, "step": 247 }, { "epoch": 7.75, "grad_norm": 0.8935484171996528, "learning_rate": 2e-05, "loss": 0.6036, "step": 248 }, { "epoch": 7.75, "eval_loss": 0.6887417435646057, "eval_runtime": 58.1651, "eval_samples_per_second": 3.438, "eval_steps_per_second": 0.43, "step": 248 }, { "epoch": 7.78125, "grad_norm": 0.9329951454150782, "learning_rate": 2e-05, "loss": 0.6434, "step": 249 }, { "epoch": 7.78125, "eval_loss": 0.6893429756164551, "eval_runtime": 58.4106, "eval_samples_per_second": 3.424, "eval_steps_per_second": 0.428, "step": 249 }, { "epoch": 7.8125, "grad_norm": 0.8799352767832798, "learning_rate": 2e-05, "loss": 0.6519, "step": 250 }, { "epoch": 7.8125, "eval_loss": 0.6929408311843872, "eval_runtime": 58.3105, "eval_samples_per_second": 3.43, "eval_steps_per_second": 0.429, "step": 250 }, { "epoch": 7.84375, "grad_norm": 0.9322996227983372, "learning_rate": 2e-05, "loss": 0.5684, "step": 251 }, { "epoch": 7.84375, "eval_loss": 0.6954038739204407, "eval_runtime": 57.8998, "eval_samples_per_second": 3.454, "eval_steps_per_second": 0.432, "step": 251 }, { "epoch": 7.875, "grad_norm": 1.0904651907324217, "learning_rate": 2e-05, "loss": 0.5851, "step": 252 }, { "epoch": 7.875, "eval_loss": 0.6938650012016296, "eval_runtime": 58.4905, "eval_samples_per_second": 3.419, "eval_steps_per_second": 0.427, "step": 252 }, { "epoch": 7.90625, "grad_norm": 1.0103592741616823, "learning_rate": 2e-05, "loss": 0.6655, "step": 253 }, { "epoch": 7.90625, "eval_loss": 0.6909225583076477, "eval_runtime": 58.1801, "eval_samples_per_second": 3.438, "eval_steps_per_second": 0.43, "step": 253 }, { "epoch": 7.9375, "grad_norm": 0.9208541649120607, "learning_rate": 2e-05, "loss": 0.6051, "step": 254 }, { "epoch": 7.9375, "eval_loss": 0.6913868188858032, "eval_runtime": 58.4224, "eval_samples_per_second": 3.423, "eval_steps_per_second": 0.428, "step": 254 }, { "epoch": 7.96875, "grad_norm": 0.9567638724372727, "learning_rate": 2e-05, "loss": 0.5529, "step": 255 }, { "epoch": 7.96875, "eval_loss": 0.6918243169784546, "eval_runtime": 58.1569, "eval_samples_per_second": 3.439, "eval_steps_per_second": 0.43, "step": 255 }, { "epoch": 8.0, "grad_norm": 0.8913592607849594, "learning_rate": 2e-05, "loss": 0.6076, "step": 256 }, { "epoch": 8.0, "eval_loss": 0.6921086311340332, "eval_runtime": 58.8193, "eval_samples_per_second": 3.4, "eval_steps_per_second": 0.425, "step": 256 }, { "epoch": 8.0, "step": 256, "total_flos": 77213396434944.0, "train_loss": 0.24315254529938102, "train_runtime": 7656.702, "train_samples_per_second": 1.045, "train_steps_per_second": 0.033 } ], "logging_steps": 1.0, "max_steps": 256, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 77213396434944.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }