|
{ |
|
"best_metric": 0.6816287040710449, |
|
"best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-7b/checkpoint-225", |
|
"epoch": 8.0, |
|
"eval_steps": 1.0, |
|
"global_step": 256, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03125, |
|
"grad_norm": 1.3320099054231718, |
|
"learning_rate": 0.0, |
|
"loss": 1.3851, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.03125, |
|
"eval_loss": 1.3910757303237915, |
|
"eval_runtime": 63.0135, |
|
"eval_samples_per_second": 3.174, |
|
"eval_steps_per_second": 0.397, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 1.0473401758450829, |
|
"learning_rate": 8.613531161467863e-06, |
|
"loss": 1.3255, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"eval_loss": 1.3910757303237915, |
|
"eval_runtime": 56.9747, |
|
"eval_samples_per_second": 3.51, |
|
"eval_steps_per_second": 0.439, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"grad_norm": 1.0429876090069883, |
|
"learning_rate": 1.3652123889719709e-05, |
|
"loss": 1.3737, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"eval_loss": 1.3638323545455933, |
|
"eval_runtime": 56.6988, |
|
"eval_samples_per_second": 3.527, |
|
"eval_steps_per_second": 0.441, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 0.9193695742967616, |
|
"learning_rate": 1.7227062322935725e-05, |
|
"loss": 1.3309, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"eval_loss": 1.3227791786193848, |
|
"eval_runtime": 56.6188, |
|
"eval_samples_per_second": 3.532, |
|
"eval_steps_per_second": 0.442, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 1.0043594584185398, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2984, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"eval_loss": 1.2728056907653809, |
|
"eval_runtime": 58.8213, |
|
"eval_samples_per_second": 3.4, |
|
"eval_steps_per_second": 0.425, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 0.8222566364005439, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2639, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"eval_loss": 1.2296103239059448, |
|
"eval_runtime": 56.6504, |
|
"eval_samples_per_second": 3.53, |
|
"eval_steps_per_second": 0.441, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"grad_norm": 0.6389176248800544, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2314, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"eval_loss": 1.1983529329299927, |
|
"eval_runtime": 56.5641, |
|
"eval_samples_per_second": 3.536, |
|
"eval_steps_per_second": 0.442, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.599291017991319, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2037, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 1.1734061241149902, |
|
"eval_runtime": 56.6005, |
|
"eval_samples_per_second": 3.534, |
|
"eval_steps_per_second": 0.442, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"grad_norm": 0.4952974010296138, |
|
"learning_rate": 2e-05, |
|
"loss": 1.226, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"eval_loss": 1.1502536535263062, |
|
"eval_runtime": 56.7524, |
|
"eval_samples_per_second": 3.524, |
|
"eval_steps_per_second": 0.441, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 0.4967350606769311, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1613, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"eval_loss": 1.127350091934204, |
|
"eval_runtime": 56.7569, |
|
"eval_samples_per_second": 3.524, |
|
"eval_steps_per_second": 0.44, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"grad_norm": 0.43644425188108293, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2077, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"eval_loss": 1.104610562324524, |
|
"eval_runtime": 56.607, |
|
"eval_samples_per_second": 3.533, |
|
"eval_steps_per_second": 0.442, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.4763392566533296, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1593, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"eval_loss": 1.0827140808105469, |
|
"eval_runtime": 56.6548, |
|
"eval_samples_per_second": 3.53, |
|
"eval_steps_per_second": 0.441, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"grad_norm": 0.49138280391100253, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1679, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"eval_loss": 1.0621232986450195, |
|
"eval_runtime": 56.8147, |
|
"eval_samples_per_second": 3.52, |
|
"eval_steps_per_second": 0.44, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 0.4305508696222477, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0008, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"eval_loss": 1.0437134504318237, |
|
"eval_runtime": 56.7306, |
|
"eval_samples_per_second": 3.525, |
|
"eval_steps_per_second": 0.441, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 0.39438622708065774, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1206, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"eval_loss": 1.0277280807495117, |
|
"eval_runtime": 56.6499, |
|
"eval_samples_per_second": 3.53, |
|
"eval_steps_per_second": 0.441, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.40300919769454296, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0501, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 1.0134528875350952, |
|
"eval_runtime": 56.3333, |
|
"eval_samples_per_second": 3.55, |
|
"eval_steps_per_second": 0.444, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"grad_norm": 0.35230570754831836, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0593, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"eval_loss": 1.0004419088363647, |
|
"eval_runtime": 56.6019, |
|
"eval_samples_per_second": 3.533, |
|
"eval_steps_per_second": 0.442, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 0.37606931260721715, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0482, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"eval_loss": 0.9879937767982483, |
|
"eval_runtime": 56.6945, |
|
"eval_samples_per_second": 3.528, |
|
"eval_steps_per_second": 0.441, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"grad_norm": 0.2941404563021841, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9707, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"eval_loss": 0.976818859577179, |
|
"eval_runtime": 56.6805, |
|
"eval_samples_per_second": 3.529, |
|
"eval_steps_per_second": 0.441, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.2958263397509482, |
|
"learning_rate": 2e-05, |
|
"loss": 1.091, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"eval_loss": 0.9669834971427917, |
|
"eval_runtime": 57.6231, |
|
"eval_samples_per_second": 3.471, |
|
"eval_steps_per_second": 0.434, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"grad_norm": 0.2485896802049987, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0041, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"eval_loss": 0.9583450555801392, |
|
"eval_runtime": 56.5142, |
|
"eval_samples_per_second": 3.539, |
|
"eval_steps_per_second": 0.442, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 0.296994298254859, |
|
"learning_rate": 2e-05, |
|
"loss": 1.055, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"eval_loss": 0.9502925276756287, |
|
"eval_runtime": 56.6393, |
|
"eval_samples_per_second": 3.531, |
|
"eval_steps_per_second": 0.441, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"grad_norm": 0.2499735192340966, |
|
"learning_rate": 2e-05, |
|
"loss": 1.04, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"eval_loss": 0.9427899122238159, |
|
"eval_runtime": 56.5467, |
|
"eval_samples_per_second": 3.537, |
|
"eval_steps_per_second": 0.442, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.23614468035916372, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0387, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 0.9359552264213562, |
|
"eval_runtime": 56.8371, |
|
"eval_samples_per_second": 3.519, |
|
"eval_steps_per_second": 0.44, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 0.2597610358499704, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9821, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"eval_loss": 0.929139256477356, |
|
"eval_runtime": 56.659, |
|
"eval_samples_per_second": 3.53, |
|
"eval_steps_per_second": 0.441, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 0.2483654904520099, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0139, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"eval_loss": 0.9226005673408508, |
|
"eval_runtime": 56.4669, |
|
"eval_samples_per_second": 3.542, |
|
"eval_steps_per_second": 0.443, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"grad_norm": 0.2814780741041167, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9374, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"eval_loss": 0.9160022735595703, |
|
"eval_runtime": 56.6558, |
|
"eval_samples_per_second": 3.53, |
|
"eval_steps_per_second": 0.441, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 0.29993540247195477, |
|
"learning_rate": 2e-05, |
|
"loss": 0.948, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"eval_loss": 0.9092594981193542, |
|
"eval_runtime": 56.743, |
|
"eval_samples_per_second": 3.525, |
|
"eval_steps_per_second": 0.441, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"grad_norm": 0.24302264777949295, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9676, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"eval_loss": 0.9028491377830505, |
|
"eval_runtime": 56.802, |
|
"eval_samples_per_second": 3.521, |
|
"eval_steps_per_second": 0.44, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.28001197555170687, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0044, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"eval_loss": 0.8969234228134155, |
|
"eval_runtime": 56.8402, |
|
"eval_samples_per_second": 3.519, |
|
"eval_steps_per_second": 0.44, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"grad_norm": 0.26990828196944483, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8417, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"eval_loss": 0.890943169593811, |
|
"eval_runtime": 56.9987, |
|
"eval_samples_per_second": 3.509, |
|
"eval_steps_per_second": 0.439, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.25976007498641823, |
|
"learning_rate": 2e-05, |
|
"loss": 0.95, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.8852173686027527, |
|
"eval_runtime": 56.722, |
|
"eval_samples_per_second": 3.526, |
|
"eval_steps_per_second": 0.441, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.03125, |
|
"grad_norm": 0.29530149620990226, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9931, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.03125, |
|
"eval_loss": 0.8795143961906433, |
|
"eval_runtime": 56.8541, |
|
"eval_samples_per_second": 3.518, |
|
"eval_steps_per_second": 0.44, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"grad_norm": 0.2759239362577793, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9978, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"eval_loss": 0.8741766214370728, |
|
"eval_runtime": 56.7708, |
|
"eval_samples_per_second": 3.523, |
|
"eval_steps_per_second": 0.44, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"grad_norm": 0.246531740102282, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0163, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"eval_loss": 0.8691757321357727, |
|
"eval_runtime": 56.8382, |
|
"eval_samples_per_second": 3.519, |
|
"eval_steps_per_second": 0.44, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 0.2646078522027086, |
|
"learning_rate": 2e-05, |
|
"loss": 0.971, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"eval_loss": 0.8643682599067688, |
|
"eval_runtime": 56.689, |
|
"eval_samples_per_second": 3.528, |
|
"eval_steps_per_second": 0.441, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.15625, |
|
"grad_norm": 0.2395171492146917, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9227, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.15625, |
|
"eval_loss": 0.8600785136222839, |
|
"eval_runtime": 56.72, |
|
"eval_samples_per_second": 3.526, |
|
"eval_steps_per_second": 0.441, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"grad_norm": 0.28215229152733834, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9308, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"eval_loss": 0.8562959432601929, |
|
"eval_runtime": 56.8289, |
|
"eval_samples_per_second": 3.519, |
|
"eval_steps_per_second": 0.44, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.21875, |
|
"grad_norm": 0.27116244597267625, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9563, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.21875, |
|
"eval_loss": 0.8526366949081421, |
|
"eval_runtime": 56.6829, |
|
"eval_samples_per_second": 3.528, |
|
"eval_steps_per_second": 0.441, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.2623711894386991, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9535, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.8490655422210693, |
|
"eval_runtime": 56.6874, |
|
"eval_samples_per_second": 3.528, |
|
"eval_steps_per_second": 0.441, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.28125, |
|
"grad_norm": 0.27251908150193377, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9287, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.28125, |
|
"eval_loss": 0.8451938629150391, |
|
"eval_runtime": 56.7117, |
|
"eval_samples_per_second": 3.527, |
|
"eval_steps_per_second": 0.441, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"grad_norm": 0.2642817191103673, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9186, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"eval_loss": 0.8413894772529602, |
|
"eval_runtime": 56.9042, |
|
"eval_samples_per_second": 3.515, |
|
"eval_steps_per_second": 0.439, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.34375, |
|
"grad_norm": 0.26857391288606197, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8792, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.34375, |
|
"eval_loss": 0.8373947739601135, |
|
"eval_runtime": 56.7211, |
|
"eval_samples_per_second": 3.526, |
|
"eval_steps_per_second": 0.441, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 0.2474531366673803, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8965, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"eval_loss": 0.8339560031890869, |
|
"eval_runtime": 56.8277, |
|
"eval_samples_per_second": 3.519, |
|
"eval_steps_per_second": 0.44, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"grad_norm": 0.26467660282496797, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8762, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"eval_loss": 0.8309465050697327, |
|
"eval_runtime": 56.7019, |
|
"eval_samples_per_second": 3.527, |
|
"eval_steps_per_second": 0.441, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"grad_norm": 0.2652288034609541, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9118, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"eval_loss": 0.8279169201850891, |
|
"eval_runtime": 56.6271, |
|
"eval_samples_per_second": 3.532, |
|
"eval_steps_per_second": 0.441, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.46875, |
|
"grad_norm": 0.27355995161173785, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9249, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.46875, |
|
"eval_loss": 0.8252391219139099, |
|
"eval_runtime": 56.6323, |
|
"eval_samples_per_second": 3.532, |
|
"eval_steps_per_second": 0.441, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.2588399009432225, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8359, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 0.8225956559181213, |
|
"eval_runtime": 58.0142, |
|
"eval_samples_per_second": 3.447, |
|
"eval_steps_per_second": 0.431, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.53125, |
|
"grad_norm": 0.28116473918910634, |
|
"learning_rate": 2e-05, |
|
"loss": 0.846, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.53125, |
|
"eval_loss": 0.8198111057281494, |
|
"eval_runtime": 56.6785, |
|
"eval_samples_per_second": 3.529, |
|
"eval_steps_per_second": 0.441, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 0.30791508615928687, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8364, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"eval_loss": 0.816419243812561, |
|
"eval_runtime": 56.7867, |
|
"eval_samples_per_second": 3.522, |
|
"eval_steps_per_second": 0.44, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.59375, |
|
"grad_norm": 0.2635774938006065, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8565, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.59375, |
|
"eval_loss": 0.8128839731216431, |
|
"eval_runtime": 56.5904, |
|
"eval_samples_per_second": 3.534, |
|
"eval_steps_per_second": 0.442, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 0.25740594308086223, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7573, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"eval_loss": 0.8096449971199036, |
|
"eval_runtime": 56.7381, |
|
"eval_samples_per_second": 3.525, |
|
"eval_steps_per_second": 0.441, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.65625, |
|
"grad_norm": 0.25917235006885775, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8982, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.65625, |
|
"eval_loss": 0.8064478039741516, |
|
"eval_runtime": 57.4343, |
|
"eval_samples_per_second": 3.482, |
|
"eval_steps_per_second": 0.435, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"grad_norm": 0.2831937064873763, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8781, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"eval_loss": 0.8034397959709167, |
|
"eval_runtime": 56.8346, |
|
"eval_samples_per_second": 3.519, |
|
"eval_steps_per_second": 0.44, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"grad_norm": 0.2863024186152095, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8861, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"eval_loss": 0.800960898399353, |
|
"eval_runtime": 56.7424, |
|
"eval_samples_per_second": 3.525, |
|
"eval_steps_per_second": 0.441, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.28320211213029406, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9514, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"eval_loss": 0.7988448143005371, |
|
"eval_runtime": 57.0405, |
|
"eval_samples_per_second": 3.506, |
|
"eval_steps_per_second": 0.438, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"grad_norm": 0.3204132014824286, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8947, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"eval_loss": 0.7971951365470886, |
|
"eval_runtime": 57.1716, |
|
"eval_samples_per_second": 3.498, |
|
"eval_steps_per_second": 0.437, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"grad_norm": 0.29386668880511096, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9125, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"eval_loss": 0.7956165075302124, |
|
"eval_runtime": 57.3457, |
|
"eval_samples_per_second": 3.488, |
|
"eval_steps_per_second": 0.436, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.84375, |
|
"grad_norm": 0.31091076146467406, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8638, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.84375, |
|
"eval_loss": 0.7935267090797424, |
|
"eval_runtime": 57.373, |
|
"eval_samples_per_second": 3.486, |
|
"eval_steps_per_second": 0.436, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.28779917523565474, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9113, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"eval_loss": 0.7914787530899048, |
|
"eval_runtime": 57.2668, |
|
"eval_samples_per_second": 3.492, |
|
"eval_steps_per_second": 0.437, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.90625, |
|
"grad_norm": 0.31820258275619673, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8113, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.90625, |
|
"eval_loss": 0.788929283618927, |
|
"eval_runtime": 57.2581, |
|
"eval_samples_per_second": 3.493, |
|
"eval_steps_per_second": 0.437, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"grad_norm": 0.30186200117869055, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8685, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"eval_loss": 0.7862411737442017, |
|
"eval_runtime": 57.2688, |
|
"eval_samples_per_second": 3.492, |
|
"eval_steps_per_second": 0.437, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.96875, |
|
"grad_norm": 0.27549296702686904, |
|
"learning_rate": 2e-05, |
|
"loss": 0.911, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.96875, |
|
"eval_loss": 0.7838772535324097, |
|
"eval_runtime": 57.5102, |
|
"eval_samples_per_second": 3.478, |
|
"eval_steps_per_second": 0.435, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.29444542350221403, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8877, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.7814672589302063, |
|
"eval_runtime": 57.3342, |
|
"eval_samples_per_second": 3.488, |
|
"eval_steps_per_second": 0.436, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"grad_norm": 0.32976362380066954, |
|
"learning_rate": 2e-05, |
|
"loss": 0.836, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"eval_loss": 0.7788661122322083, |
|
"eval_runtime": 57.6392, |
|
"eval_samples_per_second": 3.47, |
|
"eval_steps_per_second": 0.434, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"grad_norm": 0.3091109685624876, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8565, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"eval_loss": 0.7769085764884949, |
|
"eval_runtime": 57.2017, |
|
"eval_samples_per_second": 3.496, |
|
"eval_steps_per_second": 0.437, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.09375, |
|
"grad_norm": 0.3011651623444141, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8265, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.09375, |
|
"eval_loss": 0.7751161456108093, |
|
"eval_runtime": 57.4125, |
|
"eval_samples_per_second": 3.484, |
|
"eval_steps_per_second": 0.435, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"grad_norm": 0.28278958612422994, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8893, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"eval_loss": 0.7736042737960815, |
|
"eval_runtime": 57.2826, |
|
"eval_samples_per_second": 3.491, |
|
"eval_steps_per_second": 0.436, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.15625, |
|
"grad_norm": 0.30212533045014006, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8256, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.15625, |
|
"eval_loss": 0.7718043327331543, |
|
"eval_runtime": 59.4842, |
|
"eval_samples_per_second": 3.362, |
|
"eval_steps_per_second": 0.42, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 0.32231592883907934, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7754, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"eval_loss": 0.7697712779045105, |
|
"eval_runtime": 57.2127, |
|
"eval_samples_per_second": 3.496, |
|
"eval_steps_per_second": 0.437, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.21875, |
|
"grad_norm": 0.29880148326318595, |
|
"learning_rate": 2e-05, |
|
"loss": 0.864, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.21875, |
|
"eval_loss": 0.7679712176322937, |
|
"eval_runtime": 57.1052, |
|
"eval_samples_per_second": 3.502, |
|
"eval_steps_per_second": 0.438, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.30389759178870646, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7831, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"eval_loss": 0.7662644386291504, |
|
"eval_runtime": 57.37, |
|
"eval_samples_per_second": 3.486, |
|
"eval_steps_per_second": 0.436, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.28125, |
|
"grad_norm": 0.3424258847516451, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8311, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.28125, |
|
"eval_loss": 0.7646127343177795, |
|
"eval_runtime": 57.1884, |
|
"eval_samples_per_second": 3.497, |
|
"eval_steps_per_second": 0.437, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"grad_norm": 0.2831654885374943, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8261, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"eval_loss": 0.7631255388259888, |
|
"eval_runtime": 57.4573, |
|
"eval_samples_per_second": 3.481, |
|
"eval_steps_per_second": 0.435, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"grad_norm": 0.29894569677081223, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8801, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"eval_loss": 0.7617875933647156, |
|
"eval_runtime": 57.1641, |
|
"eval_samples_per_second": 3.499, |
|
"eval_steps_per_second": 0.437, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"grad_norm": 0.3030991848050202, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7921, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"eval_loss": 0.7605040073394775, |
|
"eval_runtime": 57.0991, |
|
"eval_samples_per_second": 3.503, |
|
"eval_steps_per_second": 0.438, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"grad_norm": 0.30216971620226146, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8527, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"eval_loss": 0.7591890096664429, |
|
"eval_runtime": 58.6087, |
|
"eval_samples_per_second": 3.412, |
|
"eval_steps_per_second": 0.427, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"grad_norm": 0.34907486616204614, |
|
"learning_rate": 2e-05, |
|
"loss": 0.841, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"eval_loss": 0.7577351331710815, |
|
"eval_runtime": 59.509, |
|
"eval_samples_per_second": 3.361, |
|
"eval_steps_per_second": 0.42, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"grad_norm": 0.3356288667630128, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8417, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"eval_loss": 0.7571098208427429, |
|
"eval_runtime": 57.4972, |
|
"eval_samples_per_second": 3.478, |
|
"eval_steps_per_second": 0.435, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.3547770718977253, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8865, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 0.7565757632255554, |
|
"eval_runtime": 57.4262, |
|
"eval_samples_per_second": 3.483, |
|
"eval_steps_per_second": 0.435, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"grad_norm": 0.36400071548952273, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8201, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"eval_loss": 0.7553688287734985, |
|
"eval_runtime": 59.6772, |
|
"eval_samples_per_second": 3.351, |
|
"eval_steps_per_second": 0.419, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"grad_norm": 0.32432854183732784, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8705, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"eval_loss": 0.7540337443351746, |
|
"eval_runtime": 58.1967, |
|
"eval_samples_per_second": 3.437, |
|
"eval_steps_per_second": 0.43, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"grad_norm": 0.3367161155473714, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8225, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"eval_loss": 0.752601683139801, |
|
"eval_runtime": 59.728, |
|
"eval_samples_per_second": 3.349, |
|
"eval_steps_per_second": 0.419, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 0.3542073894911913, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7887, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"eval_loss": 0.750983715057373, |
|
"eval_runtime": 58.2468, |
|
"eval_samples_per_second": 3.434, |
|
"eval_steps_per_second": 0.429, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"grad_norm": 0.3387577198880303, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7594, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"eval_loss": 0.7495383620262146, |
|
"eval_runtime": 58.3457, |
|
"eval_samples_per_second": 3.428, |
|
"eval_steps_per_second": 0.428, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"grad_norm": 0.381221735797731, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7911, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"eval_loss": 0.7477438449859619, |
|
"eval_runtime": 58.0584, |
|
"eval_samples_per_second": 3.445, |
|
"eval_steps_per_second": 0.431, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"grad_norm": 0.3782280426863171, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8115, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"eval_loss": 0.7464295029640198, |
|
"eval_runtime": 57.9835, |
|
"eval_samples_per_second": 3.449, |
|
"eval_steps_per_second": 0.431, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.3751127153118298, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8896, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"eval_loss": 0.7451103329658508, |
|
"eval_runtime": 58.1947, |
|
"eval_samples_per_second": 3.437, |
|
"eval_steps_per_second": 0.43, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"grad_norm": 0.3580034870691801, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7964, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"eval_loss": 0.744097113609314, |
|
"eval_runtime": 58.1644, |
|
"eval_samples_per_second": 3.439, |
|
"eval_steps_per_second": 0.43, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 0.3630926811819107, |
|
"learning_rate": 2e-05, |
|
"loss": 0.848, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"eval_loss": 0.7432359457015991, |
|
"eval_runtime": 58.0811, |
|
"eval_samples_per_second": 3.443, |
|
"eval_steps_per_second": 0.43, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"grad_norm": 0.3668484035124972, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7444, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"eval_loss": 0.7424789667129517, |
|
"eval_runtime": 59.6811, |
|
"eval_samples_per_second": 3.351, |
|
"eval_steps_per_second": 0.419, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"grad_norm": 0.37526030248163283, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8381, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"eval_loss": 0.7417113780975342, |
|
"eval_runtime": 58.1209, |
|
"eval_samples_per_second": 3.441, |
|
"eval_steps_per_second": 0.43, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"grad_norm": 0.36285898832422037, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7797, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"eval_loss": 0.7411203980445862, |
|
"eval_runtime": 58.3212, |
|
"eval_samples_per_second": 3.429, |
|
"eval_steps_per_second": 0.429, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"grad_norm": 0.39983168875602654, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8571, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"eval_loss": 0.7402496933937073, |
|
"eval_runtime": 58.0746, |
|
"eval_samples_per_second": 3.444, |
|
"eval_steps_per_second": 0.43, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"grad_norm": 0.3697896026052261, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7917, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"eval_loss": 0.7398749589920044, |
|
"eval_runtime": 59.8008, |
|
"eval_samples_per_second": 3.344, |
|
"eval_steps_per_second": 0.418, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.39419135002625816, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7987, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.7384353876113892, |
|
"eval_runtime": 58.3389, |
|
"eval_samples_per_second": 3.428, |
|
"eval_steps_per_second": 0.429, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 3.03125, |
|
"grad_norm": 0.40732207424611727, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7205, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 3.03125, |
|
"eval_loss": 0.73604416847229, |
|
"eval_runtime": 58.2114, |
|
"eval_samples_per_second": 3.436, |
|
"eval_steps_per_second": 0.429, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"grad_norm": 0.3641635271623762, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8062, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"eval_loss": 0.7333144545555115, |
|
"eval_runtime": 59.7484, |
|
"eval_samples_per_second": 3.347, |
|
"eval_steps_per_second": 0.418, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 3.09375, |
|
"grad_norm": 0.3556866449584765, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7681, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 3.09375, |
|
"eval_loss": 0.7306910157203674, |
|
"eval_runtime": 58.141, |
|
"eval_samples_per_second": 3.44, |
|
"eval_steps_per_second": 0.43, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 0.3826129743685834, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7961, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"eval_loss": 0.7283279895782471, |
|
"eval_runtime": 58.1482, |
|
"eval_samples_per_second": 3.439, |
|
"eval_steps_per_second": 0.43, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.15625, |
|
"grad_norm": 0.35166540759020914, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7382, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 3.15625, |
|
"eval_loss": 0.7267993688583374, |
|
"eval_runtime": 57.8007, |
|
"eval_samples_per_second": 3.46, |
|
"eval_steps_per_second": 0.433, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"grad_norm": 0.38414476136018477, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7999, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"eval_loss": 0.7261015176773071, |
|
"eval_runtime": 57.9723, |
|
"eval_samples_per_second": 3.45, |
|
"eval_steps_per_second": 0.431, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.21875, |
|
"grad_norm": 0.40218377868187477, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8115, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 3.21875, |
|
"eval_loss": 0.7257917523384094, |
|
"eval_runtime": 58.0394, |
|
"eval_samples_per_second": 3.446, |
|
"eval_steps_per_second": 0.431, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.41934721904445194, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7228, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"eval_loss": 0.7251278758049011, |
|
"eval_runtime": 59.2828, |
|
"eval_samples_per_second": 3.374, |
|
"eval_steps_per_second": 0.422, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 3.28125, |
|
"grad_norm": 0.3882012129329853, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7658, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.28125, |
|
"eval_loss": 0.724635899066925, |
|
"eval_runtime": 59.0543, |
|
"eval_samples_per_second": 3.387, |
|
"eval_steps_per_second": 0.423, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"grad_norm": 0.4068559748805906, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7977, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"eval_loss": 0.7242235541343689, |
|
"eval_runtime": 58.5527, |
|
"eval_samples_per_second": 3.416, |
|
"eval_steps_per_second": 0.427, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 3.34375, |
|
"grad_norm": 0.4620335365938039, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7015, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 3.34375, |
|
"eval_loss": 0.7226566076278687, |
|
"eval_runtime": 58.8135, |
|
"eval_samples_per_second": 3.401, |
|
"eval_steps_per_second": 0.425, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"grad_norm": 0.4009314815042761, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7488, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"eval_loss": 0.7213454246520996, |
|
"eval_runtime": 58.735, |
|
"eval_samples_per_second": 3.405, |
|
"eval_steps_per_second": 0.426, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 3.40625, |
|
"grad_norm": 0.456822567760836, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7307, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 3.40625, |
|
"eval_loss": 0.719496488571167, |
|
"eval_runtime": 58.9211, |
|
"eval_samples_per_second": 3.394, |
|
"eval_steps_per_second": 0.424, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"grad_norm": 0.45520197938839, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7348, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"eval_loss": 0.7171263098716736, |
|
"eval_runtime": 58.9274, |
|
"eval_samples_per_second": 3.394, |
|
"eval_steps_per_second": 0.424, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.46875, |
|
"grad_norm": 0.4421606621837213, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8011, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 3.46875, |
|
"eval_loss": 0.7155402898788452, |
|
"eval_runtime": 58.4009, |
|
"eval_samples_per_second": 3.425, |
|
"eval_steps_per_second": 0.428, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.4111011701354251, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7829, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"eval_loss": 0.714958667755127, |
|
"eval_runtime": 58.3143, |
|
"eval_samples_per_second": 3.43, |
|
"eval_steps_per_second": 0.429, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.53125, |
|
"grad_norm": 0.40366265866888357, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8596, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 3.53125, |
|
"eval_loss": 0.7153159976005554, |
|
"eval_runtime": 58.5749, |
|
"eval_samples_per_second": 3.414, |
|
"eval_steps_per_second": 0.427, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"grad_norm": 0.44914251592864773, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7268, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"eval_loss": 0.7159590721130371, |
|
"eval_runtime": 58.6872, |
|
"eval_samples_per_second": 3.408, |
|
"eval_steps_per_second": 0.426, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 3.59375, |
|
"grad_norm": 0.4062399312752312, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7875, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 3.59375, |
|
"eval_loss": 0.7165355086326599, |
|
"eval_runtime": 58.4703, |
|
"eval_samples_per_second": 3.421, |
|
"eval_steps_per_second": 0.428, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"grad_norm": 0.44817350106485787, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7623, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"eval_loss": 0.716560423374176, |
|
"eval_runtime": 58.5904, |
|
"eval_samples_per_second": 3.414, |
|
"eval_steps_per_second": 0.427, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 3.65625, |
|
"grad_norm": 0.4309671248224914, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7604, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 3.65625, |
|
"eval_loss": 0.7165713310241699, |
|
"eval_runtime": 58.5214, |
|
"eval_samples_per_second": 3.418, |
|
"eval_steps_per_second": 0.427, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 3.6875, |
|
"grad_norm": 0.44823929530189277, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7751, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 3.6875, |
|
"eval_loss": 0.7170334458351135, |
|
"eval_runtime": 58.7428, |
|
"eval_samples_per_second": 3.405, |
|
"eval_steps_per_second": 0.426, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 3.71875, |
|
"grad_norm": 0.4369363559974751, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8321, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 3.71875, |
|
"eval_loss": 0.7169127464294434, |
|
"eval_runtime": 58.6794, |
|
"eval_samples_per_second": 3.408, |
|
"eval_steps_per_second": 0.426, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.43105130939689645, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7722, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"eval_loss": 0.7162806987762451, |
|
"eval_runtime": 58.7674, |
|
"eval_samples_per_second": 3.403, |
|
"eval_steps_per_second": 0.425, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.78125, |
|
"grad_norm": 0.43789804607163635, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7548, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 3.78125, |
|
"eval_loss": 0.7144981622695923, |
|
"eval_runtime": 58.3815, |
|
"eval_samples_per_second": 3.426, |
|
"eval_steps_per_second": 0.428, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 3.8125, |
|
"grad_norm": 0.46941128815266536, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8189, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 3.8125, |
|
"eval_loss": 0.712846040725708, |
|
"eval_runtime": 58.5034, |
|
"eval_samples_per_second": 3.419, |
|
"eval_steps_per_second": 0.427, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 3.84375, |
|
"grad_norm": 0.4415453126320104, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7484, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 3.84375, |
|
"eval_loss": 0.7111316919326782, |
|
"eval_runtime": 58.566, |
|
"eval_samples_per_second": 3.415, |
|
"eval_steps_per_second": 0.427, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"grad_norm": 0.4237981688992312, |
|
"learning_rate": 2e-05, |
|
"loss": 0.77, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"eval_loss": 0.7098332047462463, |
|
"eval_runtime": 58.5232, |
|
"eval_samples_per_second": 3.417, |
|
"eval_steps_per_second": 0.427, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 3.90625, |
|
"grad_norm": 0.49069037639672286, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8059, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 3.90625, |
|
"eval_loss": 0.7086107730865479, |
|
"eval_runtime": 59.8651, |
|
"eval_samples_per_second": 3.341, |
|
"eval_steps_per_second": 0.418, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 3.9375, |
|
"grad_norm": 0.48569295378281013, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7799, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 3.9375, |
|
"eval_loss": 0.7077484726905823, |
|
"eval_runtime": 58.4449, |
|
"eval_samples_per_second": 3.422, |
|
"eval_steps_per_second": 0.428, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 3.96875, |
|
"grad_norm": 0.47224685972430797, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7381, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 3.96875, |
|
"eval_loss": 0.7073386907577515, |
|
"eval_runtime": 58.5961, |
|
"eval_samples_per_second": 3.413, |
|
"eval_steps_per_second": 0.427, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.48833051814427636, |
|
"learning_rate": 2e-05, |
|
"loss": 0.678, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.706765353679657, |
|
"eval_runtime": 60.6877, |
|
"eval_samples_per_second": 3.296, |
|
"eval_steps_per_second": 0.412, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 4.03125, |
|
"grad_norm": 0.4116173650136014, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7582, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 4.03125, |
|
"eval_loss": 0.7067686319351196, |
|
"eval_runtime": 58.4349, |
|
"eval_samples_per_second": 3.423, |
|
"eval_steps_per_second": 0.428, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 4.0625, |
|
"grad_norm": 0.46176556383782513, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7749, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 4.0625, |
|
"eval_loss": 0.7066690325737, |
|
"eval_runtime": 58.7029, |
|
"eval_samples_per_second": 3.407, |
|
"eval_steps_per_second": 0.426, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 4.09375, |
|
"grad_norm": 0.4454696779432102, |
|
"learning_rate": 2e-05, |
|
"loss": 0.773, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 4.09375, |
|
"eval_loss": 0.7064326405525208, |
|
"eval_runtime": 61.252, |
|
"eval_samples_per_second": 3.265, |
|
"eval_steps_per_second": 0.408, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 4.125, |
|
"grad_norm": 0.5015422163334902, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7369, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 4.125, |
|
"eval_loss": 0.7057382464408875, |
|
"eval_runtime": 59.411, |
|
"eval_samples_per_second": 3.366, |
|
"eval_steps_per_second": 0.421, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 4.15625, |
|
"grad_norm": 0.472373878055723, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8262, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 4.15625, |
|
"eval_loss": 0.7050846815109253, |
|
"eval_runtime": 59.2996, |
|
"eval_samples_per_second": 3.373, |
|
"eval_steps_per_second": 0.422, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 4.1875, |
|
"grad_norm": 0.5384950553698907, |
|
"learning_rate": 2e-05, |
|
"loss": 0.74, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 4.1875, |
|
"eval_loss": 0.7045766711235046, |
|
"eval_runtime": 59.2928, |
|
"eval_samples_per_second": 3.373, |
|
"eval_steps_per_second": 0.422, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 4.21875, |
|
"grad_norm": 0.4692662892631433, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7443, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 4.21875, |
|
"eval_loss": 0.7045109272003174, |
|
"eval_runtime": 59.525, |
|
"eval_samples_per_second": 3.36, |
|
"eval_steps_per_second": 0.42, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.49707639799158876, |
|
"learning_rate": 2e-05, |
|
"loss": 0.733, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"eval_loss": 0.7047656178474426, |
|
"eval_runtime": 60.1718, |
|
"eval_samples_per_second": 3.324, |
|
"eval_steps_per_second": 0.415, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 4.28125, |
|
"grad_norm": 0.5042999858449994, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7303, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 4.28125, |
|
"eval_loss": 0.7046284675598145, |
|
"eval_runtime": 60.01, |
|
"eval_samples_per_second": 3.333, |
|
"eval_steps_per_second": 0.417, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 4.3125, |
|
"grad_norm": 0.5236583357740581, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7254, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 4.3125, |
|
"eval_loss": 0.7038366794586182, |
|
"eval_runtime": 60.3496, |
|
"eval_samples_per_second": 3.314, |
|
"eval_steps_per_second": 0.414, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 4.34375, |
|
"grad_norm": 0.5197559530441114, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6956, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 4.34375, |
|
"eval_loss": 0.7023048400878906, |
|
"eval_runtime": 60.3808, |
|
"eval_samples_per_second": 3.312, |
|
"eval_steps_per_second": 0.414, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"grad_norm": 0.5214546280852583, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7243, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"eval_loss": 0.7011681199073792, |
|
"eval_runtime": 60.1368, |
|
"eval_samples_per_second": 3.326, |
|
"eval_steps_per_second": 0.416, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.40625, |
|
"grad_norm": 0.47638616269940814, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7442, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 4.40625, |
|
"eval_loss": 0.7005561590194702, |
|
"eval_runtime": 61.003, |
|
"eval_samples_per_second": 3.279, |
|
"eval_steps_per_second": 0.41, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 4.4375, |
|
"grad_norm": 0.5067672241908349, |
|
"learning_rate": 2e-05, |
|
"loss": 0.693, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 4.4375, |
|
"eval_loss": 0.7004985809326172, |
|
"eval_runtime": 60.1646, |
|
"eval_samples_per_second": 3.324, |
|
"eval_steps_per_second": 0.416, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 4.46875, |
|
"grad_norm": 0.5323088696033406, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7019, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 4.46875, |
|
"eval_loss": 0.7001196146011353, |
|
"eval_runtime": 59.9527, |
|
"eval_samples_per_second": 3.336, |
|
"eval_steps_per_second": 0.417, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.4994538125400832, |
|
"learning_rate": 2e-05, |
|
"loss": 0.684, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"eval_loss": 0.6989223957061768, |
|
"eval_runtime": 59.7753, |
|
"eval_samples_per_second": 3.346, |
|
"eval_steps_per_second": 0.418, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 4.53125, |
|
"grad_norm": 0.5328972466603664, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7581, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 4.53125, |
|
"eval_loss": 0.697172999382019, |
|
"eval_runtime": 59.678, |
|
"eval_samples_per_second": 3.351, |
|
"eval_steps_per_second": 0.419, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 4.5625, |
|
"grad_norm": 0.557725244530984, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6562, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 4.5625, |
|
"eval_loss": 0.6954514980316162, |
|
"eval_runtime": 59.6753, |
|
"eval_samples_per_second": 3.351, |
|
"eval_steps_per_second": 0.419, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 4.59375, |
|
"grad_norm": 0.520999668899182, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7108, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 4.59375, |
|
"eval_loss": 0.6949453949928284, |
|
"eval_runtime": 59.7891, |
|
"eval_samples_per_second": 3.345, |
|
"eval_steps_per_second": 0.418, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 4.625, |
|
"grad_norm": 0.513677589761833, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6697, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 4.625, |
|
"eval_loss": 0.6953239440917969, |
|
"eval_runtime": 59.7415, |
|
"eval_samples_per_second": 3.348, |
|
"eval_steps_per_second": 0.418, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 4.65625, |
|
"grad_norm": 0.5054488117701784, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7793, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 4.65625, |
|
"eval_loss": 0.6959659457206726, |
|
"eval_runtime": 59.9711, |
|
"eval_samples_per_second": 3.335, |
|
"eval_steps_per_second": 0.417, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 4.6875, |
|
"grad_norm": 0.5962123257952582, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7068, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 4.6875, |
|
"eval_loss": 0.6952192783355713, |
|
"eval_runtime": 59.6824, |
|
"eval_samples_per_second": 3.351, |
|
"eval_steps_per_second": 0.419, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 4.71875, |
|
"grad_norm": 0.6009619303481951, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7261, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 4.71875, |
|
"eval_loss": 0.6935360431671143, |
|
"eval_runtime": 59.5352, |
|
"eval_samples_per_second": 3.359, |
|
"eval_steps_per_second": 0.42, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.5670117266130251, |
|
"learning_rate": 2e-05, |
|
"loss": 0.744, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"eval_loss": 0.6924968957901001, |
|
"eval_runtime": 61.2965, |
|
"eval_samples_per_second": 3.263, |
|
"eval_steps_per_second": 0.408, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 4.78125, |
|
"grad_norm": 0.5564998515626721, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6982, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 4.78125, |
|
"eval_loss": 0.6924961805343628, |
|
"eval_runtime": 61.2731, |
|
"eval_samples_per_second": 3.264, |
|
"eval_steps_per_second": 0.408, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 4.8125, |
|
"grad_norm": 0.528752035989291, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7109, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 4.8125, |
|
"eval_loss": 0.6933311223983765, |
|
"eval_runtime": 59.8859, |
|
"eval_samples_per_second": 3.34, |
|
"eval_steps_per_second": 0.417, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 4.84375, |
|
"grad_norm": 0.5868388300709311, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6592, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 4.84375, |
|
"eval_loss": 0.6933980584144592, |
|
"eval_runtime": 59.9915, |
|
"eval_samples_per_second": 3.334, |
|
"eval_steps_per_second": 0.417, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 4.875, |
|
"grad_norm": 0.5602090329210427, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7682, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 4.875, |
|
"eval_loss": 0.6923888921737671, |
|
"eval_runtime": 61.499, |
|
"eval_samples_per_second": 3.252, |
|
"eval_steps_per_second": 0.407, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 4.90625, |
|
"grad_norm": 0.5051330890531748, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7491, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 4.90625, |
|
"eval_loss": 0.69191575050354, |
|
"eval_runtime": 59.6969, |
|
"eval_samples_per_second": 3.35, |
|
"eval_steps_per_second": 0.419, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 4.9375, |
|
"grad_norm": 0.5377224007409029, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7501, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 4.9375, |
|
"eval_loss": 0.69122314453125, |
|
"eval_runtime": 60.1345, |
|
"eval_samples_per_second": 3.326, |
|
"eval_steps_per_second": 0.416, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 4.96875, |
|
"grad_norm": 0.544576473903093, |
|
"learning_rate": 2e-05, |
|
"loss": 0.714, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 4.96875, |
|
"eval_loss": 0.6905286908149719, |
|
"eval_runtime": 59.9667, |
|
"eval_samples_per_second": 3.335, |
|
"eval_steps_per_second": 0.417, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.5027197538560159, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7181, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.6906802654266357, |
|
"eval_runtime": 60.0766, |
|
"eval_samples_per_second": 3.329, |
|
"eval_steps_per_second": 0.416, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 5.03125, |
|
"grad_norm": 0.5041535532115543, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6636, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 5.03125, |
|
"eval_loss": 0.6912646293640137, |
|
"eval_runtime": 63.5855, |
|
"eval_samples_per_second": 3.145, |
|
"eval_steps_per_second": 0.393, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 5.0625, |
|
"grad_norm": 0.5286650599348627, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8107, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 5.0625, |
|
"eval_loss": 0.6922540068626404, |
|
"eval_runtime": 56.5364, |
|
"eval_samples_per_second": 3.538, |
|
"eval_steps_per_second": 0.442, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 5.09375, |
|
"grad_norm": 0.588785168960039, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6169, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 5.09375, |
|
"eval_loss": 0.692643404006958, |
|
"eval_runtime": 56.5005, |
|
"eval_samples_per_second": 3.54, |
|
"eval_steps_per_second": 0.442, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 5.125, |
|
"grad_norm": 0.5752677936578872, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7473, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 5.125, |
|
"eval_loss": 0.6927568912506104, |
|
"eval_runtime": 58.5386, |
|
"eval_samples_per_second": 3.417, |
|
"eval_steps_per_second": 0.427, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 5.15625, |
|
"grad_norm": 0.6487162117437294, |
|
"learning_rate": 2e-05, |
|
"loss": 0.588, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 5.15625, |
|
"eval_loss": 0.692574143409729, |
|
"eval_runtime": 56.4611, |
|
"eval_samples_per_second": 3.542, |
|
"eval_steps_per_second": 0.443, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 5.1875, |
|
"grad_norm": 0.6353608377871973, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6933, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 5.1875, |
|
"eval_loss": 0.6932590007781982, |
|
"eval_runtime": 56.5989, |
|
"eval_samples_per_second": 3.534, |
|
"eval_steps_per_second": 0.442, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 5.21875, |
|
"grad_norm": 0.5450036592535661, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7175, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 5.21875, |
|
"eval_loss": 0.6944625973701477, |
|
"eval_runtime": 56.5362, |
|
"eval_samples_per_second": 3.538, |
|
"eval_steps_per_second": 0.442, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 0.6095734786538398, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7478, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"eval_loss": 0.695120632648468, |
|
"eval_runtime": 56.465, |
|
"eval_samples_per_second": 3.542, |
|
"eval_steps_per_second": 0.443, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 5.28125, |
|
"grad_norm": 0.5879704367364821, |
|
"learning_rate": 2e-05, |
|
"loss": 0.674, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 5.28125, |
|
"eval_loss": 0.6956540942192078, |
|
"eval_runtime": 56.6007, |
|
"eval_samples_per_second": 3.534, |
|
"eval_steps_per_second": 0.442, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 5.3125, |
|
"grad_norm": 0.6595426789183463, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6536, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 5.3125, |
|
"eval_loss": 0.6957553029060364, |
|
"eval_runtime": 56.4722, |
|
"eval_samples_per_second": 3.542, |
|
"eval_steps_per_second": 0.443, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 5.34375, |
|
"grad_norm": 0.7708120772721636, |
|
"learning_rate": 2e-05, |
|
"loss": 0.666, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 5.34375, |
|
"eval_loss": 0.693030834197998, |
|
"eval_runtime": 56.3518, |
|
"eval_samples_per_second": 3.549, |
|
"eval_steps_per_second": 0.444, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 5.375, |
|
"grad_norm": 0.666091377671071, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7422, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 5.375, |
|
"eval_loss": 0.6900334358215332, |
|
"eval_runtime": 56.5395, |
|
"eval_samples_per_second": 3.537, |
|
"eval_steps_per_second": 0.442, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 5.40625, |
|
"grad_norm": 0.6203365868953359, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7069, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 5.40625, |
|
"eval_loss": 0.6880744099617004, |
|
"eval_runtime": 56.4675, |
|
"eval_samples_per_second": 3.542, |
|
"eval_steps_per_second": 0.443, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 5.4375, |
|
"grad_norm": 0.6299525495855296, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7422, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 5.4375, |
|
"eval_loss": 0.686725378036499, |
|
"eval_runtime": 56.671, |
|
"eval_samples_per_second": 3.529, |
|
"eval_steps_per_second": 0.441, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 5.46875, |
|
"grad_norm": 0.6415660970283229, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7347, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 5.46875, |
|
"eval_loss": 0.6870352029800415, |
|
"eval_runtime": 56.5976, |
|
"eval_samples_per_second": 3.534, |
|
"eval_steps_per_second": 0.442, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 0.6569935128967318, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6773, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"eval_loss": 0.6870338320732117, |
|
"eval_runtime": 57.2325, |
|
"eval_samples_per_second": 3.495, |
|
"eval_steps_per_second": 0.437, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 5.53125, |
|
"grad_norm": 0.6895239904364278, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7106, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 5.53125, |
|
"eval_loss": 0.6859387755393982, |
|
"eval_runtime": 57.3075, |
|
"eval_samples_per_second": 3.49, |
|
"eval_steps_per_second": 0.436, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 5.5625, |
|
"grad_norm": 0.5855839234707383, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7361, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 5.5625, |
|
"eval_loss": 0.6856819987297058, |
|
"eval_runtime": 57.5973, |
|
"eval_samples_per_second": 3.472, |
|
"eval_steps_per_second": 0.434, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 5.59375, |
|
"grad_norm": 0.6198072484940144, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6386, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 5.59375, |
|
"eval_loss": 0.6865841746330261, |
|
"eval_runtime": 57.4429, |
|
"eval_samples_per_second": 3.482, |
|
"eval_steps_per_second": 0.435, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 5.625, |
|
"grad_norm": 0.6169444945747248, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6455, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 5.625, |
|
"eval_loss": 0.6871997714042664, |
|
"eval_runtime": 57.3975, |
|
"eval_samples_per_second": 3.484, |
|
"eval_steps_per_second": 0.436, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 5.65625, |
|
"grad_norm": 0.6524804251939137, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6588, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 5.65625, |
|
"eval_loss": 0.6873356103897095, |
|
"eval_runtime": 57.4579, |
|
"eval_samples_per_second": 3.481, |
|
"eval_steps_per_second": 0.435, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 5.6875, |
|
"grad_norm": 0.6578787618504525, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6274, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 5.6875, |
|
"eval_loss": 0.6880214214324951, |
|
"eval_runtime": 57.5735, |
|
"eval_samples_per_second": 3.474, |
|
"eval_steps_per_second": 0.434, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 5.71875, |
|
"grad_norm": 0.732160801451622, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6623, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 5.71875, |
|
"eval_loss": 0.6879817247390747, |
|
"eval_runtime": 57.5801, |
|
"eval_samples_per_second": 3.473, |
|
"eval_steps_per_second": 0.434, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 0.7294753965107613, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6562, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"eval_loss": 0.6870495676994324, |
|
"eval_runtime": 57.6659, |
|
"eval_samples_per_second": 3.468, |
|
"eval_steps_per_second": 0.434, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 5.78125, |
|
"grad_norm": 0.6947870304881401, |
|
"learning_rate": 2e-05, |
|
"loss": 0.695, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 5.78125, |
|
"eval_loss": 0.6856162548065186, |
|
"eval_runtime": 57.4452, |
|
"eval_samples_per_second": 3.482, |
|
"eval_steps_per_second": 0.435, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 5.8125, |
|
"grad_norm": 0.7085011414361884, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6634, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 5.8125, |
|
"eval_loss": 0.6839439272880554, |
|
"eval_runtime": 57.3621, |
|
"eval_samples_per_second": 3.487, |
|
"eval_steps_per_second": 0.436, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 5.84375, |
|
"grad_norm": 0.6548606152047736, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7117, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 5.84375, |
|
"eval_loss": 0.6837204098701477, |
|
"eval_runtime": 57.3849, |
|
"eval_samples_per_second": 3.485, |
|
"eval_steps_per_second": 0.436, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 5.875, |
|
"grad_norm": 0.6662179186613736, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6528, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 5.875, |
|
"eval_loss": 0.6844826340675354, |
|
"eval_runtime": 57.3173, |
|
"eval_samples_per_second": 3.489, |
|
"eval_steps_per_second": 0.436, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 5.90625, |
|
"grad_norm": 0.6638311768585444, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6582, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 5.90625, |
|
"eval_loss": 0.6846724152565002, |
|
"eval_runtime": 57.5354, |
|
"eval_samples_per_second": 3.476, |
|
"eval_steps_per_second": 0.435, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 5.9375, |
|
"grad_norm": 0.7007259768118588, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6742, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 5.9375, |
|
"eval_loss": 0.6834731101989746, |
|
"eval_runtime": 57.4134, |
|
"eval_samples_per_second": 3.484, |
|
"eval_steps_per_second": 0.435, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 5.96875, |
|
"grad_norm": 0.6563132346432226, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6752, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 5.96875, |
|
"eval_loss": 0.6817070245742798, |
|
"eval_runtime": 56.6649, |
|
"eval_samples_per_second": 3.53, |
|
"eval_steps_per_second": 0.441, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.6349703649303867, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6795, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 0.6804311871528625, |
|
"eval_runtime": 56.4378, |
|
"eval_samples_per_second": 3.544, |
|
"eval_steps_per_second": 0.443, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 6.03125, |
|
"grad_norm": 0.6716039243820887, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7145, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 6.03125, |
|
"eval_loss": 0.6804825067520142, |
|
"eval_runtime": 56.6403, |
|
"eval_samples_per_second": 3.531, |
|
"eval_steps_per_second": 0.441, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 6.0625, |
|
"grad_norm": 0.5950395984856348, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6768, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 6.0625, |
|
"eval_loss": 0.6823931932449341, |
|
"eval_runtime": 56.5459, |
|
"eval_samples_per_second": 3.537, |
|
"eval_steps_per_second": 0.442, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 6.09375, |
|
"grad_norm": 0.6787703014730869, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6158, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 6.09375, |
|
"eval_loss": 0.6854414939880371, |
|
"eval_runtime": 56.5293, |
|
"eval_samples_per_second": 3.538, |
|
"eval_steps_per_second": 0.442, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 6.125, |
|
"grad_norm": 0.6526684210082853, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6479, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 6.125, |
|
"eval_loss": 0.6892845034599304, |
|
"eval_runtime": 56.5099, |
|
"eval_samples_per_second": 3.539, |
|
"eval_steps_per_second": 0.442, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 6.15625, |
|
"grad_norm": 0.6997704487164051, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6706, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 6.15625, |
|
"eval_loss": 0.6941932439804077, |
|
"eval_runtime": 58.514, |
|
"eval_samples_per_second": 3.418, |
|
"eval_steps_per_second": 0.427, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 6.1875, |
|
"grad_norm": 0.7511370305129338, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7418, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 6.1875, |
|
"eval_loss": 0.6964046955108643, |
|
"eval_runtime": 58.4428, |
|
"eval_samples_per_second": 3.422, |
|
"eval_steps_per_second": 0.428, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 6.21875, |
|
"grad_norm": 0.8468482037911412, |
|
"learning_rate": 2e-05, |
|
"loss": 0.618, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 6.21875, |
|
"eval_loss": 0.6947888731956482, |
|
"eval_runtime": 56.6921, |
|
"eval_samples_per_second": 3.528, |
|
"eval_steps_per_second": 0.441, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 0.80366391754735, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6712, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"eval_loss": 0.691255509853363, |
|
"eval_runtime": 56.7536, |
|
"eval_samples_per_second": 3.524, |
|
"eval_steps_per_second": 0.441, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 6.28125, |
|
"grad_norm": 0.7123001788838409, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6886, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 6.28125, |
|
"eval_loss": 0.6888566613197327, |
|
"eval_runtime": 57.4537, |
|
"eval_samples_per_second": 3.481, |
|
"eval_steps_per_second": 0.435, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 6.3125, |
|
"grad_norm": 0.7785807978964993, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6096, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 6.3125, |
|
"eval_loss": 0.6869829297065735, |
|
"eval_runtime": 57.3967, |
|
"eval_samples_per_second": 3.485, |
|
"eval_steps_per_second": 0.436, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 6.34375, |
|
"grad_norm": 0.6771659776183533, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7328, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 6.34375, |
|
"eval_loss": 0.6867367625236511, |
|
"eval_runtime": 57.5277, |
|
"eval_samples_per_second": 3.477, |
|
"eval_steps_per_second": 0.435, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 6.375, |
|
"grad_norm": 0.8106446356590065, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5931, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 6.375, |
|
"eval_loss": 0.6862130165100098, |
|
"eval_runtime": 57.4868, |
|
"eval_samples_per_second": 3.479, |
|
"eval_steps_per_second": 0.435, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 6.40625, |
|
"grad_norm": 0.6600674902481064, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5789, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 6.40625, |
|
"eval_loss": 0.6866827607154846, |
|
"eval_runtime": 57.4287, |
|
"eval_samples_per_second": 3.483, |
|
"eval_steps_per_second": 0.435, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 6.4375, |
|
"grad_norm": 0.8177118767015663, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6395, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 6.4375, |
|
"eval_loss": 0.6866394281387329, |
|
"eval_runtime": 57.0918, |
|
"eval_samples_per_second": 3.503, |
|
"eval_steps_per_second": 0.438, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 6.46875, |
|
"grad_norm": 0.7284237801181533, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6835, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 6.46875, |
|
"eval_loss": 0.6864017248153687, |
|
"eval_runtime": 57.1565, |
|
"eval_samples_per_second": 3.499, |
|
"eval_steps_per_second": 0.437, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.7603002790103086, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6347, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"eval_loss": 0.6871703267097473, |
|
"eval_runtime": 57.4181, |
|
"eval_samples_per_second": 3.483, |
|
"eval_steps_per_second": 0.435, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 6.53125, |
|
"grad_norm": 0.8359766442946917, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6088, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 6.53125, |
|
"eval_loss": 0.6878347992897034, |
|
"eval_runtime": 57.4837, |
|
"eval_samples_per_second": 3.479, |
|
"eval_steps_per_second": 0.435, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 6.5625, |
|
"grad_norm": 0.7778968951616311, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5912, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 6.5625, |
|
"eval_loss": 0.6893374919891357, |
|
"eval_runtime": 57.6159, |
|
"eval_samples_per_second": 3.471, |
|
"eval_steps_per_second": 0.434, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 6.59375, |
|
"grad_norm": 0.8300437291816744, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6299, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 6.59375, |
|
"eval_loss": 0.6899804472923279, |
|
"eval_runtime": 57.1491, |
|
"eval_samples_per_second": 3.5, |
|
"eval_steps_per_second": 0.437, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 6.625, |
|
"grad_norm": 0.7994430152763061, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6073, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 6.625, |
|
"eval_loss": 0.6889459490776062, |
|
"eval_runtime": 57.3773, |
|
"eval_samples_per_second": 3.486, |
|
"eval_steps_per_second": 0.436, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 6.65625, |
|
"grad_norm": 0.7475670453371858, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6774, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 6.65625, |
|
"eval_loss": 0.6873544454574585, |
|
"eval_runtime": 57.4114, |
|
"eval_samples_per_second": 3.484, |
|
"eval_steps_per_second": 0.435, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 6.6875, |
|
"grad_norm": 0.7281375343651885, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6404, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 6.6875, |
|
"eval_loss": 0.6867469549179077, |
|
"eval_runtime": 57.2899, |
|
"eval_samples_per_second": 3.491, |
|
"eval_steps_per_second": 0.436, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 6.71875, |
|
"grad_norm": 0.7684115091080507, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6382, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 6.71875, |
|
"eval_loss": 0.6860084533691406, |
|
"eval_runtime": 57.38, |
|
"eval_samples_per_second": 3.486, |
|
"eval_steps_per_second": 0.436, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 0.7962356695445627, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6398, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"eval_loss": 0.6856002807617188, |
|
"eval_runtime": 57.2399, |
|
"eval_samples_per_second": 3.494, |
|
"eval_steps_per_second": 0.437, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 6.78125, |
|
"grad_norm": 0.7893826807634562, |
|
"learning_rate": 2e-05, |
|
"loss": 0.59, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 6.78125, |
|
"eval_loss": 0.6870043873786926, |
|
"eval_runtime": 57.1671, |
|
"eval_samples_per_second": 3.499, |
|
"eval_steps_per_second": 0.437, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 6.8125, |
|
"grad_norm": 0.8329644141570051, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5932, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 6.8125, |
|
"eval_loss": 0.6870229840278625, |
|
"eval_runtime": 57.3642, |
|
"eval_samples_per_second": 3.486, |
|
"eval_steps_per_second": 0.436, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 6.84375, |
|
"grad_norm": 0.9075127715796286, |
|
"learning_rate": 2e-05, |
|
"loss": 0.669, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 6.84375, |
|
"eval_loss": 0.6856889128684998, |
|
"eval_runtime": 57.4226, |
|
"eval_samples_per_second": 3.483, |
|
"eval_steps_per_second": 0.435, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 6.875, |
|
"grad_norm": 0.8464505810718659, |
|
"learning_rate": 2e-05, |
|
"loss": 0.686, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 6.875, |
|
"eval_loss": 0.6835823059082031, |
|
"eval_runtime": 57.2105, |
|
"eval_samples_per_second": 3.496, |
|
"eval_steps_per_second": 0.437, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 6.90625, |
|
"grad_norm": 0.7799140952562077, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6503, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 6.90625, |
|
"eval_loss": 0.6825523376464844, |
|
"eval_runtime": 57.0985, |
|
"eval_samples_per_second": 3.503, |
|
"eval_steps_per_second": 0.438, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 6.9375, |
|
"grad_norm": 0.8495343756184095, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6533, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 6.9375, |
|
"eval_loss": 0.6813305616378784, |
|
"eval_runtime": 57.1896, |
|
"eval_samples_per_second": 3.497, |
|
"eval_steps_per_second": 0.437, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 6.96875, |
|
"grad_norm": 0.8191950862245413, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6627, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 6.96875, |
|
"eval_loss": 0.6800451874732971, |
|
"eval_runtime": 57.3904, |
|
"eval_samples_per_second": 3.485, |
|
"eval_steps_per_second": 0.436, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.8196747980504347, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7337, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 0.6801488399505615, |
|
"eval_runtime": 59.0121, |
|
"eval_samples_per_second": 3.389, |
|
"eval_steps_per_second": 0.424, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 7.03125, |
|
"grad_norm": 0.7095908101379159, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6203, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 7.03125, |
|
"eval_loss": 0.6816287040710449, |
|
"eval_runtime": 57.1754, |
|
"eval_samples_per_second": 3.498, |
|
"eval_steps_per_second": 0.437, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 7.0625, |
|
"grad_norm": 0.7916901149958031, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5489, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 7.0625, |
|
"eval_loss": 0.6857742071151733, |
|
"eval_runtime": 58.0461, |
|
"eval_samples_per_second": 3.446, |
|
"eval_steps_per_second": 0.431, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 7.09375, |
|
"grad_norm": 0.8190252103616696, |
|
"learning_rate": 2e-05, |
|
"loss": 0.613, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 7.09375, |
|
"eval_loss": 0.6924745440483093, |
|
"eval_runtime": 58.351, |
|
"eval_samples_per_second": 3.428, |
|
"eval_steps_per_second": 0.428, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 7.125, |
|
"grad_norm": 0.9385023798254423, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5647, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 7.125, |
|
"eval_loss": 0.7020445466041565, |
|
"eval_runtime": 58.1868, |
|
"eval_samples_per_second": 3.437, |
|
"eval_steps_per_second": 0.43, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 7.15625, |
|
"grad_norm": 1.178887354836488, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5957, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 7.15625, |
|
"eval_loss": 0.7064430117607117, |
|
"eval_runtime": 58.3297, |
|
"eval_samples_per_second": 3.429, |
|
"eval_steps_per_second": 0.429, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 7.1875, |
|
"grad_norm": 1.0054198258359948, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5667, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 7.1875, |
|
"eval_loss": 0.7060463428497314, |
|
"eval_runtime": 58.3212, |
|
"eval_samples_per_second": 3.429, |
|
"eval_steps_per_second": 0.429, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 7.21875, |
|
"grad_norm": 1.005055760217432, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6546, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 7.21875, |
|
"eval_loss": 0.7029504179954529, |
|
"eval_runtime": 58.0188, |
|
"eval_samples_per_second": 3.447, |
|
"eval_steps_per_second": 0.431, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"grad_norm": 0.9458472260674603, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6503, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"eval_loss": 0.6988745927810669, |
|
"eval_runtime": 58.3149, |
|
"eval_samples_per_second": 3.43, |
|
"eval_steps_per_second": 0.429, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 7.28125, |
|
"grad_norm": 1.022594832986886, |
|
"learning_rate": 2e-05, |
|
"loss": 0.611, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 7.28125, |
|
"eval_loss": 0.6943955421447754, |
|
"eval_runtime": 58.3693, |
|
"eval_samples_per_second": 3.426, |
|
"eval_steps_per_second": 0.428, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 7.3125, |
|
"grad_norm": 0.8953283498269817, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6438, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 7.3125, |
|
"eval_loss": 0.6924715638160706, |
|
"eval_runtime": 58.214, |
|
"eval_samples_per_second": 3.436, |
|
"eval_steps_per_second": 0.429, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 7.34375, |
|
"grad_norm": 0.9094812403228425, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6123, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 7.34375, |
|
"eval_loss": 0.690609335899353, |
|
"eval_runtime": 58.6042, |
|
"eval_samples_per_second": 3.413, |
|
"eval_steps_per_second": 0.427, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 7.375, |
|
"grad_norm": 0.9433427892139121, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5772, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 7.375, |
|
"eval_loss": 0.6895288825035095, |
|
"eval_runtime": 58.0083, |
|
"eval_samples_per_second": 3.448, |
|
"eval_steps_per_second": 0.431, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 7.40625, |
|
"grad_norm": 0.9654218046347709, |
|
"learning_rate": 2e-05, |
|
"loss": 0.62, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 7.40625, |
|
"eval_loss": 0.6887797713279724, |
|
"eval_runtime": 58.1374, |
|
"eval_samples_per_second": 3.44, |
|
"eval_steps_per_second": 0.43, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 7.4375, |
|
"grad_norm": 1.033591761626784, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6163, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 7.4375, |
|
"eval_loss": 0.6888651847839355, |
|
"eval_runtime": 58.2539, |
|
"eval_samples_per_second": 3.433, |
|
"eval_steps_per_second": 0.429, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 7.46875, |
|
"grad_norm": 0.9059638854254064, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6364, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 7.46875, |
|
"eval_loss": 0.6905540227890015, |
|
"eval_runtime": 58.0992, |
|
"eval_samples_per_second": 3.442, |
|
"eval_steps_per_second": 0.43, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.9193726862314907, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5845, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"eval_loss": 0.693742036819458, |
|
"eval_runtime": 58.2336, |
|
"eval_samples_per_second": 3.434, |
|
"eval_steps_per_second": 0.429, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 7.53125, |
|
"grad_norm": 0.8539139714986941, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6344, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 7.53125, |
|
"eval_loss": 0.696897566318512, |
|
"eval_runtime": 59.3124, |
|
"eval_samples_per_second": 3.372, |
|
"eval_steps_per_second": 0.421, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 7.5625, |
|
"grad_norm": 0.9552275495908527, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6159, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 7.5625, |
|
"eval_loss": 0.6991227865219116, |
|
"eval_runtime": 58.1037, |
|
"eval_samples_per_second": 3.442, |
|
"eval_steps_per_second": 0.43, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 7.59375, |
|
"grad_norm": 0.8953175982318474, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5934, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 7.59375, |
|
"eval_loss": 0.7009669542312622, |
|
"eval_runtime": 59.9178, |
|
"eval_samples_per_second": 3.338, |
|
"eval_steps_per_second": 0.417, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 7.625, |
|
"grad_norm": 1.1254017430464345, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6721, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 7.625, |
|
"eval_loss": 0.7003803253173828, |
|
"eval_runtime": 59.9278, |
|
"eval_samples_per_second": 3.337, |
|
"eval_steps_per_second": 0.417, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 7.65625, |
|
"grad_norm": 0.9666525684896161, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5793, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 7.65625, |
|
"eval_loss": 0.6997054815292358, |
|
"eval_runtime": 58.3355, |
|
"eval_samples_per_second": 3.428, |
|
"eval_steps_per_second": 0.429, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 7.6875, |
|
"grad_norm": 1.0500213825228455, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6262, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 7.6875, |
|
"eval_loss": 0.6956760883331299, |
|
"eval_runtime": 57.9053, |
|
"eval_samples_per_second": 3.454, |
|
"eval_steps_per_second": 0.432, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 7.71875, |
|
"grad_norm": 1.0445166827193935, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6111, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 7.71875, |
|
"eval_loss": 0.6909776329994202, |
|
"eval_runtime": 58.1856, |
|
"eval_samples_per_second": 3.437, |
|
"eval_steps_per_second": 0.43, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"grad_norm": 0.8935484171996528, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6036, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"eval_loss": 0.6887417435646057, |
|
"eval_runtime": 58.1651, |
|
"eval_samples_per_second": 3.438, |
|
"eval_steps_per_second": 0.43, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 7.78125, |
|
"grad_norm": 0.9329951454150782, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6434, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 7.78125, |
|
"eval_loss": 0.6893429756164551, |
|
"eval_runtime": 58.4106, |
|
"eval_samples_per_second": 3.424, |
|
"eval_steps_per_second": 0.428, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 7.8125, |
|
"grad_norm": 0.8799352767832798, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6519, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 7.8125, |
|
"eval_loss": 0.6929408311843872, |
|
"eval_runtime": 58.3105, |
|
"eval_samples_per_second": 3.43, |
|
"eval_steps_per_second": 0.429, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 7.84375, |
|
"grad_norm": 0.9322996227983372, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5684, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 7.84375, |
|
"eval_loss": 0.6954038739204407, |
|
"eval_runtime": 57.8998, |
|
"eval_samples_per_second": 3.454, |
|
"eval_steps_per_second": 0.432, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 7.875, |
|
"grad_norm": 1.0904651907324217, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5851, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 7.875, |
|
"eval_loss": 0.6938650012016296, |
|
"eval_runtime": 58.4905, |
|
"eval_samples_per_second": 3.419, |
|
"eval_steps_per_second": 0.427, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 7.90625, |
|
"grad_norm": 1.0103592741616823, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6655, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 7.90625, |
|
"eval_loss": 0.6909225583076477, |
|
"eval_runtime": 58.1801, |
|
"eval_samples_per_second": 3.438, |
|
"eval_steps_per_second": 0.43, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 7.9375, |
|
"grad_norm": 0.9208541649120607, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6051, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 7.9375, |
|
"eval_loss": 0.6913868188858032, |
|
"eval_runtime": 58.4224, |
|
"eval_samples_per_second": 3.423, |
|
"eval_steps_per_second": 0.428, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 7.96875, |
|
"grad_norm": 0.9567638724372727, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5529, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 7.96875, |
|
"eval_loss": 0.6918243169784546, |
|
"eval_runtime": 58.1569, |
|
"eval_samples_per_second": 3.439, |
|
"eval_steps_per_second": 0.43, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.8913592607849594, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6076, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 0.6921086311340332, |
|
"eval_runtime": 58.8193, |
|
"eval_samples_per_second": 3.4, |
|
"eval_steps_per_second": 0.425, |
|
"step": 256 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 256, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 8, |
|
"save_steps": 5, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 77213396434944.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|