|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 8.88888888888889,
  "eval_steps": 256,
  "global_step": 10000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.23,
      "learning_rate": 4.872000000000001e-05,
      "loss": 0.8173,
      "step": 256
    },
    {
      "epoch": 0.23,
      "eval_test_accuracy": 0.09690721649484536,
      "eval_test_loss": 0.4266754984855652,
      "eval_test_runtime": 233.5115,
      "eval_test_samples_per_second": 10.385,
      "eval_test_steps_per_second": 1.302,
      "step": 256
    },
    {
      "epoch": 0.46,
      "learning_rate": 4.744e-05,
      "loss": 0.441,
      "step": 512
    },
    {
      "epoch": 0.46,
      "eval_test_accuracy": 0.16742268041237113,
      "eval_test_loss": 0.36029067635536194,
      "eval_test_runtime": 232.4589,
      "eval_test_samples_per_second": 10.432,
      "eval_test_steps_per_second": 1.308,
      "step": 512
    },
    {
      "epoch": 0.68,
      "learning_rate": 4.6160000000000005e-05,
      "loss": 0.397,
      "step": 768
    },
    {
      "epoch": 0.68,
      "eval_test_accuracy": 0.17649484536082474,
      "eval_test_loss": 0.3348633348941803,
      "eval_test_runtime": 227.1069,
      "eval_test_samples_per_second": 10.678,
      "eval_test_steps_per_second": 1.339,
      "step": 768
    },
    {
      "epoch": 0.91,
      "learning_rate": 4.488e-05,
      "loss": 0.3611,
      "step": 1024
    },
    {
      "epoch": 0.91,
      "eval_test_accuracy": 0.23010309278350516,
      "eval_test_loss": 0.3209882974624634,
      "eval_test_runtime": 226.4132,
      "eval_test_samples_per_second": 10.711,
      "eval_test_steps_per_second": 1.343,
      "step": 1024
    },
    {
      "epoch": 1.14,
      "learning_rate": 4.36e-05,
      "loss": 0.3469,
      "step": 1280
    },
    {
      "epoch": 1.14,
      "eval_test_accuracy": 0.19505154639175257,
      "eval_test_loss": 0.31938230991363525,
      "eval_test_runtime": 227.1405,
      "eval_test_samples_per_second": 10.676,
      "eval_test_steps_per_second": 1.338,
      "step": 1280
    },
    {
      "epoch": 1.37,
      "learning_rate": 4.232e-05,
      "loss": 0.3301,
      "step": 1536
    },
    {
      "epoch": 1.37,
      "eval_test_accuracy": 0.21608247422680413,
      "eval_test_loss": 0.3045227825641632,
      "eval_test_runtime": 229.033,
      "eval_test_samples_per_second": 10.588,
      "eval_test_steps_per_second": 1.327,
      "step": 1536
    },
    {
      "epoch": 1.59,
      "learning_rate": 4.104e-05,
      "loss": 0.3194,
      "step": 1792
    },
    {
      "epoch": 1.59,
      "eval_test_accuracy": 0.2354639175257732,
      "eval_test_loss": 0.2938610017299652,
      "eval_test_runtime": 225.016,
      "eval_test_samples_per_second": 10.777,
      "eval_test_steps_per_second": 1.351,
      "step": 1792
    },
    {
      "epoch": 1.82,
      "learning_rate": 3.9760000000000006e-05,
      "loss": 0.3189,
      "step": 2048
    },
    {
      "epoch": 1.82,
      "eval_test_accuracy": 0.23876288659793815,
      "eval_test_loss": 0.2826090455055237,
      "eval_test_runtime": 235.6637,
      "eval_test_samples_per_second": 10.29,
      "eval_test_steps_per_second": 1.29,
      "step": 2048
    },
    {
      "epoch": 2.05,
      "learning_rate": 3.848e-05,
      "loss": 0.3031,
      "step": 2304
    },
    {
      "epoch": 2.05,
      "eval_test_accuracy": 0.23752577319587628,
      "eval_test_loss": 0.276397705078125,
      "eval_test_runtime": 228.4566,
      "eval_test_samples_per_second": 10.615,
      "eval_test_steps_per_second": 1.331,
      "step": 2304
    },
    {
      "epoch": 2.28,
      "learning_rate": 3.72e-05,
      "loss": 0.3008,
      "step": 2560
    },
    {
      "epoch": 2.28,
      "eval_test_accuracy": 0.231340206185567,
      "eval_test_loss": 0.27680695056915283,
      "eval_test_runtime": 232.6253,
      "eval_test_samples_per_second": 10.424,
      "eval_test_steps_per_second": 1.307,
      "step": 2560
    },
    {
      "epoch": 2.5,
      "learning_rate": 3.592e-05,
      "loss": 0.2941,
      "step": 2816
    },
    {
      "epoch": 2.5,
      "eval_test_accuracy": 0.24082474226804124,
      "eval_test_loss": 0.27100667357444763,
      "eval_test_runtime": 234.9203,
      "eval_test_samples_per_second": 10.323,
      "eval_test_steps_per_second": 1.294,
      "step": 2816
    },
    {
      "epoch": 2.73,
      "learning_rate": 3.464e-05,
      "loss": 0.2855,
      "step": 3072
    },
    {
      "epoch": 2.73,
      "eval_test_accuracy": 0.2552577319587629,
      "eval_test_loss": 0.26808857917785645,
      "eval_test_runtime": 233.699,
      "eval_test_samples_per_second": 10.377,
      "eval_test_steps_per_second": 1.301,
      "step": 3072
    },
    {
      "epoch": 2.96,
      "learning_rate": 3.336e-05,
      "loss": 0.2827,
      "step": 3328
    },
    {
      "epoch": 2.96,
      "eval_test_accuracy": 0.23876288659793815,
      "eval_test_loss": 0.26740676164627075,
      "eval_test_runtime": 232.615,
      "eval_test_samples_per_second": 10.425,
      "eval_test_steps_per_second": 1.307,
      "step": 3328
    },
    {
      "epoch": 3.19,
      "learning_rate": 3.208e-05,
      "loss": 0.2824,
      "step": 3584
    },
    {
      "epoch": 3.19,
      "eval_test_accuracy": 0.23670103092783504,
      "eval_test_loss": 0.26493921875953674,
      "eval_test_runtime": 234.3958,
      "eval_test_samples_per_second": 10.346,
      "eval_test_steps_per_second": 1.297,
      "step": 3584
    },
    {
      "epoch": 3.41,
      "learning_rate": 3.08e-05,
      "loss": 0.2699,
      "step": 3840
    },
    {
      "epoch": 3.41,
      "eval_test_accuracy": 0.24082474226804124,
      "eval_test_loss": 0.2653910517692566,
      "eval_test_runtime": 236.6454,
      "eval_test_samples_per_second": 10.247,
      "eval_test_steps_per_second": 1.285,
      "step": 3840
    },
    {
      "epoch": 3.64,
      "learning_rate": 2.9520000000000002e-05,
      "loss": 0.2744,
      "step": 4096
    },
    {
      "epoch": 3.64,
      "eval_test_accuracy": 0.2503092783505155,
      "eval_test_loss": 0.25940871238708496,
      "eval_test_runtime": 230.5104,
      "eval_test_samples_per_second": 10.52,
      "eval_test_steps_per_second": 1.319,
      "step": 4096
    },
    {
      "epoch": 3.87,
      "learning_rate": 2.824e-05,
      "loss": 0.2679,
      "step": 4352
    },
    {
      "epoch": 3.87,
      "eval_test_accuracy": 0.24824742268041236,
      "eval_test_loss": 0.25839442014694214,
      "eval_test_runtime": 279.8888,
      "eval_test_samples_per_second": 8.664,
      "eval_test_steps_per_second": 1.086,
      "step": 4352
    },
    {
      "epoch": 4.1,
      "learning_rate": 2.6960000000000003e-05,
      "loss": 0.27,
      "step": 4608
    },
    {
      "epoch": 4.1,
      "eval_test_accuracy": 0.2465979381443299,
      "eval_test_loss": 0.25590091943740845,
      "eval_test_runtime": 229.037,
      "eval_test_samples_per_second": 10.588,
      "eval_test_steps_per_second": 1.327,
      "step": 4608
    },
    {
      "epoch": 4.32,
      "learning_rate": 2.5679999999999998e-05,
      "loss": 0.2644,
      "step": 4864
    },
    {
      "epoch": 4.32,
      "eval_test_accuracy": 0.23835051546391753,
      "eval_test_loss": 0.25387293100357056,
      "eval_test_runtime": 231.73,
      "eval_test_samples_per_second": 10.465,
      "eval_test_steps_per_second": 1.312,
      "step": 4864
    },
    {
      "epoch": 4.55,
      "learning_rate": 2.44e-05,
      "loss": 0.2664,
      "step": 5120
    },
    {
      "epoch": 4.55,
      "eval_test_accuracy": 0.24783505154639177,
      "eval_test_loss": 0.2522536516189575,
      "eval_test_runtime": 233.9019,
      "eval_test_samples_per_second": 10.368,
      "eval_test_steps_per_second": 1.3,
      "step": 5120
    },
    {
      "epoch": 4.78,
      "learning_rate": 2.312e-05,
      "loss": 0.2557,
      "step": 5376
    },
    {
      "epoch": 4.78,
      "eval_test_accuracy": 0.23628865979381444,
      "eval_test_loss": 0.2543907165527344,
      "eval_test_runtime": 230.6139,
      "eval_test_samples_per_second": 10.515,
      "eval_test_steps_per_second": 1.318,
      "step": 5376
    },
    {
      "epoch": 5.01,
      "learning_rate": 2.184e-05,
      "loss": 0.2544,
      "step": 5632
    },
    {
      "epoch": 5.01,
      "eval_test_accuracy": 0.2445360824742268,
      "eval_test_loss": 0.24817436933517456,
      "eval_test_runtime": 231.9449,
      "eval_test_samples_per_second": 10.455,
      "eval_test_steps_per_second": 1.311,
      "step": 5632
    },
    {
      "epoch": 5.23,
      "learning_rate": 2.0560000000000003e-05,
      "loss": 0.2549,
      "step": 5888
    },
    {
      "epoch": 5.23,
      "eval_test_accuracy": 0.24618556701030928,
      "eval_test_loss": 0.24915936589241028,
      "eval_test_runtime": 226.5831,
      "eval_test_samples_per_second": 10.702,
      "eval_test_steps_per_second": 1.342,
      "step": 5888
    },
    {
      "epoch": 5.46,
      "learning_rate": 1.9280000000000002e-05,
      "loss": 0.2545,
      "step": 6144
    },
    {
      "epoch": 5.46,
      "eval_test_accuracy": 0.2465979381443299,
      "eval_test_loss": 0.2478923350572586,
      "eval_test_runtime": 233.3756,
      "eval_test_samples_per_second": 10.391,
      "eval_test_steps_per_second": 1.303,
      "step": 6144
    },
    {
      "epoch": 5.69,
      "learning_rate": 1.8e-05,
      "loss": 0.2539,
      "step": 6400
    },
    {
      "epoch": 5.69,
      "eval_test_accuracy": 0.24989690721649485,
      "eval_test_loss": 0.24747271835803986,
      "eval_test_runtime": 231.4653,
      "eval_test_samples_per_second": 10.477,
      "eval_test_steps_per_second": 1.313,
      "step": 6400
    },
    {
      "epoch": 5.92,
      "learning_rate": 1.672e-05,
      "loss": 0.2466,
      "step": 6656
    },
    {
      "epoch": 5.92,
      "eval_test_accuracy": 0.24329896907216494,
      "eval_test_loss": 0.24743035435676575,
      "eval_test_runtime": 233.3396,
      "eval_test_samples_per_second": 10.393,
      "eval_test_steps_per_second": 1.303,
      "step": 6656
    },
    {
      "epoch": 6.14,
      "learning_rate": 1.544e-05,
      "loss": 0.2507,
      "step": 6912
    },
    {
      "epoch": 6.14,
      "eval_test_accuracy": 0.25237113402061856,
      "eval_test_loss": 0.24656306207180023,
      "eval_test_runtime": 228.6153,
      "eval_test_samples_per_second": 10.607,
      "eval_test_steps_per_second": 1.33,
      "step": 6912
    },
    {
      "epoch": 6.37,
      "learning_rate": 1.4160000000000002e-05,
      "loss": 0.2456,
      "step": 7168
    },
    {
      "epoch": 6.37,
      "eval_test_accuracy": 0.24701030927835052,
      "eval_test_loss": 0.24853350222110748,
      "eval_test_runtime": 228.5698,
      "eval_test_samples_per_second": 10.609,
      "eval_test_steps_per_second": 1.33,
      "step": 7168
    },
    {
      "epoch": 6.6,
      "learning_rate": 1.288e-05,
      "loss": 0.241,
      "step": 7424
    },
    {
      "epoch": 6.6,
      "eval_test_accuracy": 0.2490721649484536,
      "eval_test_loss": 0.2469691038131714,
      "eval_test_runtime": 225.5172,
      "eval_test_samples_per_second": 10.753,
      "eval_test_steps_per_second": 1.348,
      "step": 7424
    },
    {
      "epoch": 6.83,
      "learning_rate": 1.16e-05,
      "loss": 0.2443,
      "step": 7680
    },
    {
      "epoch": 6.83,
      "eval_test_accuracy": 0.24824742268041236,
      "eval_test_loss": 0.24246640503406525,
      "eval_test_runtime": 226.9617,
      "eval_test_samples_per_second": 10.685,
      "eval_test_steps_per_second": 1.339,
      "step": 7680
    },
    {
      "epoch": 7.05,
      "learning_rate": 1.0320000000000001e-05,
      "loss": 0.2429,
      "step": 7936
    },
    {
      "epoch": 7.05,
      "eval_test_accuracy": 0.2515463917525773,
      "eval_test_loss": 0.24250201880931854,
      "eval_test_runtime": 228.9785,
      "eval_test_samples_per_second": 10.591,
      "eval_test_steps_per_second": 1.328,
      "step": 7936
    },
    {
      "epoch": 7.28,
      "learning_rate": 9.04e-06,
      "loss": 0.241,
      "step": 8192
    },
    {
      "epoch": 7.28,
      "eval_test_accuracy": 0.25195876288659796,
      "eval_test_loss": 0.24321790039539337,
      "eval_test_runtime": 228.3774,
      "eval_test_samples_per_second": 10.618,
      "eval_test_steps_per_second": 1.331,
      "step": 8192
    },
    {
      "epoch": 7.51,
      "learning_rate": 7.76e-06,
      "loss": 0.2418,
      "step": 8448
    },
    {
      "epoch": 7.51,
      "eval_test_accuracy": 0.2490721649484536,
      "eval_test_loss": 0.24221281707286835,
      "eval_test_runtime": 228.7396,
      "eval_test_samples_per_second": 10.602,
      "eval_test_steps_per_second": 1.329,
      "step": 8448
    },
    {
      "epoch": 7.74,
      "learning_rate": 6.48e-06,
      "loss": 0.2378,
      "step": 8704
    },
    {
      "epoch": 7.74,
      "eval_test_accuracy": 0.2465979381443299,
      "eval_test_loss": 0.24190692603588104,
      "eval_test_runtime": 228.3602,
      "eval_test_samples_per_second": 10.619,
      "eval_test_steps_per_second": 1.331,
      "step": 8704
    },
    {
      "epoch": 7.96,
      "learning_rate": 5.2e-06,
      "loss": 0.2388,
      "step": 8960
    },
    {
      "epoch": 7.96,
      "eval_test_accuracy": 0.25237113402061856,
      "eval_test_loss": 0.24081237614154816,
      "eval_test_runtime": 228.954,
      "eval_test_samples_per_second": 10.592,
      "eval_test_steps_per_second": 1.328,
      "step": 8960
    },
    {
      "epoch": 8.19,
      "learning_rate": 3.92e-06,
      "loss": 0.2304,
      "step": 9216
    },
    {
      "epoch": 8.19,
      "eval_test_accuracy": 0.257319587628866,
      "eval_test_loss": 0.24084854125976562,
      "eval_test_runtime": 227.9624,
      "eval_test_samples_per_second": 10.638,
      "eval_test_steps_per_second": 1.334,
      "step": 9216
    },
    {
      "epoch": 8.42,
      "learning_rate": 2.64e-06,
      "loss": 0.2423,
      "step": 9472
    },
    {
      "epoch": 8.42,
      "eval_test_accuracy": 0.2556701030927835,
      "eval_test_loss": 0.2404756098985672,
      "eval_test_runtime": 228.8532,
      "eval_test_samples_per_second": 10.596,
      "eval_test_steps_per_second": 1.328,
      "step": 9472
    },
    {
      "epoch": 8.65,
      "learning_rate": 1.36e-06,
      "loss": 0.2366,
      "step": 9728
    },
    {
      "epoch": 8.65,
      "eval_test_accuracy": 0.2556701030927835,
      "eval_test_loss": 0.24009202420711517,
      "eval_test_runtime": 229.0313,
      "eval_test_samples_per_second": 10.588,
      "eval_test_steps_per_second": 1.327,
      "step": 9728
    },
    {
      "epoch": 8.87,
      "learning_rate": 8e-08,
      "loss": 0.2321,
      "step": 9984
    },
    {
      "epoch": 8.87,
      "eval_test_accuracy": 0.25278350515463915,
      "eval_test_loss": 0.23984022438526154,
      "eval_test_runtime": 231.7368,
      "eval_test_samples_per_second": 10.464,
      "eval_test_steps_per_second": 1.312,
      "step": 9984
    }
  ],
  "logging_steps": 256,
  "max_steps": 10000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9,
  "save_steps": 256,
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|
|