adapters-opt-gptq-QLORA-super_glue-axg
/
trainer_state-opt-gptq-QLORA-super_glue-axg-sequence_classification.json
{ | |
"best_metric": null, | |
"best_model_checkpoint": null, | |
"epoch": 10.0, | |
"eval_steps": 1, | |
"global_step": 90, | |
"is_hyper_param_search": false, | |
"is_local_process_zero": true, | |
"is_world_process_zero": true, | |
"log_history": [ | |
{ | |
"epoch": 0.1111111111111111, | |
"grad_norm": 34.7608757019043, | |
"learning_rate": 2.5e-05, | |
"loss": 1.2054, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.1111111111111111, | |
"eval_accuracy": 0.5277777777777778, | |
"eval_loss": 1.0094876289367676, | |
"eval_runtime": 0.7089, | |
"eval_samples_per_second": 101.565, | |
"eval_steps_per_second": 7.053, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.2222222222222222, | |
"grad_norm": 30.633609771728516, | |
"learning_rate": 5e-05, | |
"loss": 1.1567, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.2222222222222222, | |
"eval_accuracy": 0.5277777777777778, | |
"eval_loss": 0.9987521767616272, | |
"eval_runtime": 0.603, | |
"eval_samples_per_second": 119.406, | |
"eval_steps_per_second": 8.292, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.3333333333333333, | |
"grad_norm": 25.385608673095703, | |
"learning_rate": 4.943181818181818e-05, | |
"loss": 1.0806, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.3333333333333333, | |
"eval_accuracy": 0.5277777777777778, | |
"eval_loss": 0.9628159999847412, | |
"eval_runtime": 0.7075, | |
"eval_samples_per_second": 101.767, | |
"eval_steps_per_second": 7.067, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.4444444444444444, | |
"grad_norm": 22.26402473449707, | |
"learning_rate": 4.886363636363637e-05, | |
"loss": 0.9883, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.4444444444444444, | |
"eval_accuracy": 0.5277777777777778, | |
"eval_loss": 0.9259067177772522, | |
"eval_runtime": 0.7031, | |
"eval_samples_per_second": 102.402, | |
"eval_steps_per_second": 7.111, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.5555555555555556, | |
"grad_norm": 26.522153854370117, | |
"learning_rate": 4.829545454545455e-05, | |
"loss": 0.9736, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.5555555555555556, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.8975830078125, | |
"eval_runtime": 0.7036, | |
"eval_samples_per_second": 102.336, | |
"eval_steps_per_second": 7.107, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.6666666666666666, | |
"grad_norm": 18.14297866821289, | |
"learning_rate": 4.772727272727273e-05, | |
"loss": 0.8267, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.6666666666666666, | |
"eval_accuracy": 0.4583333333333333, | |
"eval_loss": 0.8745185136795044, | |
"eval_runtime": 0.6043, | |
"eval_samples_per_second": 119.149, | |
"eval_steps_per_second": 8.274, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.7777777777777778, | |
"grad_norm": 10.767016410827637, | |
"learning_rate": 4.715909090909091e-05, | |
"loss": 0.8047, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.7777777777777778, | |
"eval_accuracy": 0.4166666666666667, | |
"eval_loss": 0.8637288212776184, | |
"eval_runtime": 0.7076, | |
"eval_samples_per_second": 101.757, | |
"eval_steps_per_second": 7.066, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.8888888888888888, | |
"grad_norm": 9.212442398071289, | |
"learning_rate": 4.659090909090909e-05, | |
"loss": 0.6807, | |
"step": 8 | |
}, | |
{ | |
"epoch": 0.8888888888888888, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 0.8712022304534912, | |
"eval_runtime": 0.7061, | |
"eval_samples_per_second": 101.97, | |
"eval_steps_per_second": 7.081, | |
"step": 8 | |
}, | |
{ | |
"epoch": 1.0, | |
"grad_norm": 10.467320442199707, | |
"learning_rate": 4.602272727272727e-05, | |
"loss": 0.7425, | |
"step": 9 | |
}, | |
{ | |
"epoch": 1.0, | |
"eval_accuracy": 0.3611111111111111, | |
"eval_loss": 0.8966064453125, | |
"eval_runtime": 0.7054, | |
"eval_samples_per_second": 102.068, | |
"eval_steps_per_second": 7.088, | |
"step": 9 | |
}, | |
{ | |
"epoch": 1.1111111111111112, | |
"grad_norm": 8.758502960205078, | |
"learning_rate": 4.545454545454546e-05, | |
"loss": 0.7941, | |
"step": 10 | |
}, | |
{ | |
"epoch": 1.1111111111111112, | |
"eval_accuracy": 0.3611111111111111, | |
"eval_loss": 0.896728515625, | |
"eval_runtime": 0.7073, | |
"eval_samples_per_second": 101.79, | |
"eval_steps_per_second": 7.069, | |
"step": 10 | |
}, | |
{ | |
"epoch": 1.2222222222222223, | |
"grad_norm": 2.704101800918579, | |
"learning_rate": 4.488636363636364e-05, | |
"loss": 0.6092, | |
"step": 11 | |
}, | |
{ | |
"epoch": 1.2222222222222223, | |
"eval_accuracy": 0.3333333333333333, | |
"eval_loss": 0.8883192539215088, | |
"eval_runtime": 0.603, | |
"eval_samples_per_second": 119.401, | |
"eval_steps_per_second": 8.292, | |
"step": 11 | |
}, | |
{ | |
"epoch": 1.3333333333333333, | |
"grad_norm": 3.9579689502716064, | |
"learning_rate": 4.431818181818182e-05, | |
"loss": 0.7372, | |
"step": 12 | |
}, | |
{ | |
"epoch": 1.3333333333333333, | |
"eval_accuracy": 0.3333333333333333, | |
"eval_loss": 0.8713921308517456, | |
"eval_runtime": 0.6557, | |
"eval_samples_per_second": 109.8, | |
"eval_steps_per_second": 7.625, | |
"step": 12 | |
}, | |
{ | |
"epoch": 1.4444444444444444, | |
"grad_norm": 4.962625980377197, | |
"learning_rate": 4.375e-05, | |
"loss": 0.6553, | |
"step": 13 | |
}, | |
{ | |
"epoch": 1.4444444444444444, | |
"eval_accuracy": 0.3194444444444444, | |
"eval_loss": 0.8686930537223816, | |
"eval_runtime": 0.7054, | |
"eval_samples_per_second": 102.073, | |
"eval_steps_per_second": 7.088, | |
"step": 13 | |
}, | |
{ | |
"epoch": 1.5555555555555556, | |
"grad_norm": 2.9537837505340576, | |
"learning_rate": 4.318181818181819e-05, | |
"loss": 0.7266, | |
"step": 14 | |
}, | |
{ | |
"epoch": 1.5555555555555556, | |
"eval_accuracy": 0.2777777777777778, | |
"eval_loss": 0.8620063066482544, | |
"eval_runtime": 0.7075, | |
"eval_samples_per_second": 101.766, | |
"eval_steps_per_second": 7.067, | |
"step": 14 | |
}, | |
{ | |
"epoch": 1.6666666666666665, | |
"grad_norm": 4.937398910522461, | |
"learning_rate": 4.261363636363637e-05, | |
"loss": 0.7462, | |
"step": 15 | |
}, | |
{ | |
"epoch": 1.6666666666666665, | |
"eval_accuracy": 0.3333333333333333, | |
"eval_loss": 0.8434787392616272, | |
"eval_runtime": 0.7043, | |
"eval_samples_per_second": 102.225, | |
"eval_steps_per_second": 7.099, | |
"step": 15 | |
}, | |
{ | |
"epoch": 1.7777777777777777, | |
"grad_norm": 8.647956848144531, | |
"learning_rate": 4.204545454545455e-05, | |
"loss": 0.66, | |
"step": 16 | |
}, | |
{ | |
"epoch": 1.7777777777777777, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.8198378086090088, | |
"eval_runtime": 0.7049, | |
"eval_samples_per_second": 102.141, | |
"eval_steps_per_second": 7.093, | |
"step": 16 | |
}, | |
{ | |
"epoch": 1.8888888888888888, | |
"grad_norm": 6.325115203857422, | |
"learning_rate": 4.1477272727272734e-05, | |
"loss": 0.6908, | |
"step": 17 | |
}, | |
{ | |
"epoch": 1.8888888888888888, | |
"eval_accuracy": 0.4444444444444444, | |
"eval_loss": 0.8034396767616272, | |
"eval_runtime": 0.6506, | |
"eval_samples_per_second": 110.674, | |
"eval_steps_per_second": 7.686, | |
"step": 17 | |
}, | |
{ | |
"epoch": 2.0, | |
"grad_norm": 2.925394058227539, | |
"learning_rate": 4.0909090909090915e-05, | |
"loss": 0.6993, | |
"step": 18 | |
}, | |
{ | |
"epoch": 2.0, | |
"eval_accuracy": 0.5277777777777778, | |
"eval_loss": 0.7982177734375, | |
"eval_runtime": 0.7049, | |
"eval_samples_per_second": 102.145, | |
"eval_steps_per_second": 7.093, | |
"step": 18 | |
}, | |
{ | |
"epoch": 2.111111111111111, | |
"grad_norm": 4.028661251068115, | |
"learning_rate": 4.034090909090909e-05, | |
"loss": 0.7788, | |
"step": 19 | |
}, | |
{ | |
"epoch": 2.111111111111111, | |
"eval_accuracy": 0.5277777777777778, | |
"eval_loss": 0.787353515625, | |
"eval_runtime": 0.7038, | |
"eval_samples_per_second": 102.296, | |
"eval_steps_per_second": 7.104, | |
"step": 19 | |
}, | |
{ | |
"epoch": 2.2222222222222223, | |
"grad_norm": 3.346914529800415, | |
"learning_rate": 3.9772727272727275e-05, | |
"loss": 0.6594, | |
"step": 20 | |
}, | |
{ | |
"epoch": 2.2222222222222223, | |
"eval_accuracy": 0.5138888888888888, | |
"eval_loss": 0.7767741084098816, | |
"eval_runtime": 0.7031, | |
"eval_samples_per_second": 102.403, | |
"eval_steps_per_second": 7.111, | |
"step": 20 | |
}, | |
{ | |
"epoch": 2.3333333333333335, | |
"grad_norm": 5.400989532470703, | |
"learning_rate": 3.9204545454545456e-05, | |
"loss": 0.7303, | |
"step": 21 | |
}, | |
{ | |
"epoch": 2.3333333333333335, | |
"eval_accuracy": 0.5, | |
"eval_loss": 0.7682834267616272, | |
"eval_runtime": 0.6498, | |
"eval_samples_per_second": 110.797, | |
"eval_steps_per_second": 7.694, | |
"step": 21 | |
}, | |
{ | |
"epoch": 2.4444444444444446, | |
"grad_norm": 2.919682025909424, | |
"learning_rate": 3.8636363636363636e-05, | |
"loss": 0.6688, | |
"step": 22 | |
}, | |
{ | |
"epoch": 2.4444444444444446, | |
"eval_accuracy": 0.4861111111111111, | |
"eval_loss": 0.7653401494026184, | |
"eval_runtime": 0.551, | |
"eval_samples_per_second": 130.66, | |
"eval_steps_per_second": 9.074, | |
"step": 22 | |
}, | |
{ | |
"epoch": 2.5555555555555554, | |
"grad_norm": 9.212122917175293, | |
"learning_rate": 3.8068181818181816e-05, | |
"loss": 0.7337, | |
"step": 23 | |
}, | |
{ | |
"epoch": 2.5555555555555554, | |
"eval_accuracy": 0.5, | |
"eval_loss": 0.7591145634651184, | |
"eval_runtime": 0.7017, | |
"eval_samples_per_second": 102.602, | |
"eval_steps_per_second": 7.125, | |
"step": 23 | |
}, | |
{ | |
"epoch": 2.6666666666666665, | |
"grad_norm": 8.884961128234863, | |
"learning_rate": 3.7500000000000003e-05, | |
"loss": 0.6648, | |
"step": 24 | |
}, | |
{ | |
"epoch": 2.6666666666666665, | |
"eval_accuracy": 0.5, | |
"eval_loss": 0.7524074912071228, | |
"eval_runtime": 0.7032, | |
"eval_samples_per_second": 102.392, | |
"eval_steps_per_second": 7.111, | |
"step": 24 | |
}, | |
{ | |
"epoch": 2.7777777777777777, | |
"grad_norm": 5.935830116271973, | |
"learning_rate": 3.6931818181818184e-05, | |
"loss": 0.6589, | |
"step": 25 | |
}, | |
{ | |
"epoch": 2.7777777777777777, | |
"eval_accuracy": 0.5138888888888888, | |
"eval_loss": 0.7492743730545044, | |
"eval_runtime": 0.6514, | |
"eval_samples_per_second": 110.527, | |
"eval_steps_per_second": 7.675, | |
"step": 25 | |
}, | |
{ | |
"epoch": 2.888888888888889, | |
"grad_norm": 7.1309027671813965, | |
"learning_rate": 3.6363636363636364e-05, | |
"loss": 0.8163, | |
"step": 26 | |
}, | |
{ | |
"epoch": 2.888888888888889, | |
"eval_accuracy": 0.5416666666666666, | |
"eval_loss": 0.7473212480545044, | |
"eval_runtime": 0.706, | |
"eval_samples_per_second": 101.99, | |
"eval_steps_per_second": 7.083, | |
"step": 26 | |
}, | |
{ | |
"epoch": 3.0, | |
"grad_norm": 5.935413837432861, | |
"learning_rate": 3.579545454545455e-05, | |
"loss": 0.6456, | |
"step": 27 | |
}, | |
{ | |
"epoch": 3.0, | |
"eval_accuracy": 0.5694444444444444, | |
"eval_loss": 0.74639892578125, | |
"eval_runtime": 0.6008, | |
"eval_samples_per_second": 119.848, | |
"eval_steps_per_second": 8.323, | |
"step": 27 | |
}, | |
{ | |
"epoch": 3.111111111111111, | |
"grad_norm": 4.931116104125977, | |
"learning_rate": 3.522727272727273e-05, | |
"loss": 0.7576, | |
"step": 28 | |
}, | |
{ | |
"epoch": 3.111111111111111, | |
"eval_accuracy": 0.5555555555555556, | |
"eval_loss": 0.7495456337928772, | |
"eval_runtime": 0.6526, | |
"eval_samples_per_second": 110.329, | |
"eval_steps_per_second": 7.662, | |
"step": 28 | |
}, | |
{ | |
"epoch": 3.2222222222222223, | |
"grad_norm": 8.242729187011719, | |
"learning_rate": 3.465909090909091e-05, | |
"loss": 0.6763, | |
"step": 29 | |
}, | |
{ | |
"epoch": 3.2222222222222223, | |
"eval_accuracy": 0.5555555555555556, | |
"eval_loss": 0.7501423954963684, | |
"eval_runtime": 0.7058, | |
"eval_samples_per_second": 102.009, | |
"eval_steps_per_second": 7.084, | |
"step": 29 | |
}, | |
{ | |
"epoch": 3.3333333333333335, | |
"grad_norm": 4.411537170410156, | |
"learning_rate": 3.409090909090909e-05, | |
"loss": 0.7565, | |
"step": 30 | |
}, | |
{ | |
"epoch": 3.3333333333333335, | |
"eval_accuracy": 0.5555555555555556, | |
"eval_loss": 0.7513563632965088, | |
"eval_runtime": 0.7022, | |
"eval_samples_per_second": 102.536, | |
"eval_steps_per_second": 7.121, | |
"step": 30 | |
}, | |
{ | |
"epoch": 3.4444444444444446, | |
"grad_norm": 5.082301139831543, | |
"learning_rate": 3.352272727272727e-05, | |
"loss": 0.7888, | |
"step": 31 | |
}, | |
{ | |
"epoch": 3.4444444444444446, | |
"eval_accuracy": 0.5277777777777778, | |
"eval_loss": 0.7555338740348816, | |
"eval_runtime": 0.7031, | |
"eval_samples_per_second": 102.41, | |
"eval_steps_per_second": 7.112, | |
"step": 31 | |
}, | |
{ | |
"epoch": 3.5555555555555554, | |
"grad_norm": 3.5483062267303467, | |
"learning_rate": 3.295454545454545e-05, | |
"loss": 0.6769, | |
"step": 32 | |
}, | |
{ | |
"epoch": 3.5555555555555554, | |
"eval_accuracy": 0.5277777777777778, | |
"eval_loss": 0.7550930380821228, | |
"eval_runtime": 0.6042, | |
"eval_samples_per_second": 119.164, | |
"eval_steps_per_second": 8.275, | |
"step": 32 | |
}, | |
{ | |
"epoch": 3.6666666666666665, | |
"grad_norm": 3.9409377574920654, | |
"learning_rate": 3.238636363636364e-05, | |
"loss": 0.718, | |
"step": 33 | |
}, | |
{ | |
"epoch": 3.6666666666666665, | |
"eval_accuracy": 0.5277777777777778, | |
"eval_loss": 0.7593722939491272, | |
"eval_runtime": 0.7024, | |
"eval_samples_per_second": 102.508, | |
"eval_steps_per_second": 7.119, | |
"step": 33 | |
}, | |
{ | |
"epoch": 3.7777777777777777, | |
"grad_norm": 11.279867172241211, | |
"learning_rate": 3.181818181818182e-05, | |
"loss": 0.6424, | |
"step": 34 | |
}, | |
{ | |
"epoch": 3.7777777777777777, | |
"eval_accuracy": 0.5138888888888888, | |
"eval_loss": 0.7629123330116272, | |
"eval_runtime": 0.7053, | |
"eval_samples_per_second": 102.077, | |
"eval_steps_per_second": 7.089, | |
"step": 34 | |
}, | |
{ | |
"epoch": 3.888888888888889, | |
"grad_norm": 8.519107818603516, | |
"learning_rate": 3.125e-05, | |
"loss": 0.6471, | |
"step": 35 | |
}, | |
{ | |
"epoch": 3.888888888888889, | |
"eval_accuracy": 0.4861111111111111, | |
"eval_loss": 0.7659505009651184, | |
"eval_runtime": 0.703, | |
"eval_samples_per_second": 102.418, | |
"eval_steps_per_second": 7.112, | |
"step": 35 | |
}, | |
{ | |
"epoch": 4.0, | |
"grad_norm": 6.580870628356934, | |
"learning_rate": 3.068181818181818e-05, | |
"loss": 0.727, | |
"step": 36 | |
}, | |
{ | |
"epoch": 4.0, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 0.7695041298866272, | |
"eval_runtime": 0.6543, | |
"eval_samples_per_second": 110.049, | |
"eval_steps_per_second": 7.642, | |
"step": 36 | |
}, | |
{ | |
"epoch": 4.111111111111111, | |
"grad_norm": 11.691934585571289, | |
"learning_rate": 3.0113636363636365e-05, | |
"loss": 0.73, | |
"step": 37 | |
}, | |
{ | |
"epoch": 4.111111111111111, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 0.7756076455116272, | |
"eval_runtime": 0.7028, | |
"eval_samples_per_second": 102.442, | |
"eval_steps_per_second": 7.114, | |
"step": 37 | |
}, | |
{ | |
"epoch": 4.222222222222222, | |
"grad_norm": 13.515392303466797, | |
"learning_rate": 2.954545454545455e-05, | |
"loss": 0.687, | |
"step": 38 | |
}, | |
{ | |
"epoch": 4.222222222222222, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.7799343466758728, | |
"eval_runtime": 0.705, | |
"eval_samples_per_second": 102.128, | |
"eval_steps_per_second": 7.092, | |
"step": 38 | |
}, | |
{ | |
"epoch": 4.333333333333333, | |
"grad_norm": 3.4968955516815186, | |
"learning_rate": 2.8977272727272732e-05, | |
"loss": 0.6573, | |
"step": 39 | |
}, | |
{ | |
"epoch": 4.333333333333333, | |
"eval_accuracy": 0.4583333333333333, | |
"eval_loss": 0.7805582880973816, | |
"eval_runtime": 0.7046, | |
"eval_samples_per_second": 102.181, | |
"eval_steps_per_second": 7.096, | |
"step": 39 | |
}, | |
{ | |
"epoch": 4.444444444444445, | |
"grad_norm": 6.2186713218688965, | |
"learning_rate": 2.8409090909090912e-05, | |
"loss": 0.6691, | |
"step": 40 | |
}, | |
{ | |
"epoch": 4.444444444444445, | |
"eval_accuracy": 0.4861111111111111, | |
"eval_loss": 0.7847764492034912, | |
"eval_runtime": 0.7058, | |
"eval_samples_per_second": 102.009, | |
"eval_steps_per_second": 7.084, | |
"step": 40 | |
}, | |
{ | |
"epoch": 4.555555555555555, | |
"grad_norm": 9.075811386108398, | |
"learning_rate": 2.784090909090909e-05, | |
"loss": 0.8023, | |
"step": 41 | |
}, | |
{ | |
"epoch": 4.555555555555555, | |
"eval_accuracy": 0.4583333333333333, | |
"eval_loss": 0.78857421875, | |
"eval_runtime": 0.7051, | |
"eval_samples_per_second": 102.113, | |
"eval_steps_per_second": 7.091, | |
"step": 41 | |
}, | |
{ | |
"epoch": 4.666666666666667, | |
"grad_norm": 5.013810157775879, | |
"learning_rate": 2.7272727272727273e-05, | |
"loss": 0.6703, | |
"step": 42 | |
}, | |
{ | |
"epoch": 4.666666666666667, | |
"eval_accuracy": 0.4861111111111111, | |
"eval_loss": 0.7918837070465088, | |
"eval_runtime": 0.655, | |
"eval_samples_per_second": 109.915, | |
"eval_steps_per_second": 7.633, | |
"step": 42 | |
}, | |
{ | |
"epoch": 4.777777777777778, | |
"grad_norm": 5.304051876068115, | |
"learning_rate": 2.6704545454545453e-05, | |
"loss": 0.7019, | |
"step": 43 | |
}, | |
{ | |
"epoch": 4.777777777777778, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.7928195595741272, | |
"eval_runtime": 0.6557, | |
"eval_samples_per_second": 109.8, | |
"eval_steps_per_second": 7.625, | |
"step": 43 | |
}, | |
{ | |
"epoch": 4.888888888888889, | |
"grad_norm": 7.697951316833496, | |
"learning_rate": 2.6136363636363637e-05, | |
"loss": 0.6973, | |
"step": 44 | |
}, | |
{ | |
"epoch": 4.888888888888889, | |
"eval_accuracy": 0.4861111111111111, | |
"eval_loss": 0.8020155429840088, | |
"eval_runtime": 0.6545, | |
"eval_samples_per_second": 110.008, | |
"eval_steps_per_second": 7.639, | |
"step": 44 | |
}, | |
{ | |
"epoch": 5.0, | |
"grad_norm": 18.014299392700195, | |
"learning_rate": 2.5568181818181817e-05, | |
"loss": 0.7859, | |
"step": 45 | |
}, | |
{ | |
"epoch": 5.0, | |
"eval_accuracy": 0.4166666666666667, | |
"eval_loss": 0.8017985224723816, | |
"eval_runtime": 0.702, | |
"eval_samples_per_second": 102.565, | |
"eval_steps_per_second": 7.123, | |
"step": 45 | |
}, | |
{ | |
"epoch": 5.111111111111111, | |
"grad_norm": 5.591033458709717, | |
"learning_rate": 2.5e-05, | |
"loss": 0.6987, | |
"step": 46 | |
}, | |
{ | |
"epoch": 5.111111111111111, | |
"eval_accuracy": 0.4166666666666667, | |
"eval_loss": 0.8059489130973816, | |
"eval_runtime": 0.7047, | |
"eval_samples_per_second": 102.172, | |
"eval_steps_per_second": 7.095, | |
"step": 46 | |
}, | |
{ | |
"epoch": 5.222222222222222, | |
"grad_norm": 4.745278835296631, | |
"learning_rate": 2.4431818181818185e-05, | |
"loss": 0.6566, | |
"step": 47 | |
}, | |
{ | |
"epoch": 5.222222222222222, | |
"eval_accuracy": 0.4305555555555556, | |
"eval_loss": 0.807861328125, | |
"eval_runtime": 0.6555, | |
"eval_samples_per_second": 109.837, | |
"eval_steps_per_second": 7.628, | |
"step": 47 | |
}, | |
{ | |
"epoch": 5.333333333333333, | |
"grad_norm": 7.4073486328125, | |
"learning_rate": 2.3863636363636365e-05, | |
"loss": 0.7671, | |
"step": 48 | |
}, | |
{ | |
"epoch": 5.333333333333333, | |
"eval_accuracy": 0.4166666666666667, | |
"eval_loss": 0.8123643398284912, | |
"eval_runtime": 0.7034, | |
"eval_samples_per_second": 102.354, | |
"eval_steps_per_second": 7.108, | |
"step": 48 | |
}, | |
{ | |
"epoch": 5.444444444444445, | |
"grad_norm": 7.266972541809082, | |
"learning_rate": 2.3295454545454546e-05, | |
"loss": 0.7223, | |
"step": 49 | |
}, | |
{ | |
"epoch": 5.444444444444445, | |
"eval_accuracy": 0.4166666666666667, | |
"eval_loss": 0.8111029863357544, | |
"eval_runtime": 0.7066, | |
"eval_samples_per_second": 101.892, | |
"eval_steps_per_second": 7.076, | |
"step": 49 | |
}, | |
{ | |
"epoch": 5.555555555555555, | |
"grad_norm": 3.803126573562622, | |
"learning_rate": 2.272727272727273e-05, | |
"loss": 0.637, | |
"step": 50 | |
}, | |
{ | |
"epoch": 5.555555555555555, | |
"eval_accuracy": 0.4166666666666667, | |
"eval_loss": 0.8150363564491272, | |
"eval_runtime": 0.5565, | |
"eval_samples_per_second": 129.373, | |
"eval_steps_per_second": 8.984, | |
"step": 50 | |
}, | |
{ | |
"epoch": 5.666666666666667, | |
"grad_norm": 6.748312950134277, | |
"learning_rate": 2.215909090909091e-05, | |
"loss": 0.6406, | |
"step": 51 | |
}, | |
{ | |
"epoch": 5.666666666666667, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.8139106035232544, | |
"eval_runtime": 0.6526, | |
"eval_samples_per_second": 110.328, | |
"eval_steps_per_second": 7.662, | |
"step": 51 | |
}, | |
{ | |
"epoch": 5.777777777777778, | |
"grad_norm": 14.67806625366211, | |
"learning_rate": 2.1590909090909093e-05, | |
"loss": 0.7293, | |
"step": 52 | |
}, | |
{ | |
"epoch": 5.777777777777778, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.8112928867340088, | |
"eval_runtime": 0.5507, | |
"eval_samples_per_second": 130.733, | |
"eval_steps_per_second": 9.079, | |
"step": 52 | |
}, | |
{ | |
"epoch": 5.888888888888889, | |
"grad_norm": 12.766432762145996, | |
"learning_rate": 2.1022727272727274e-05, | |
"loss": 0.6668, | |
"step": 53 | |
}, | |
{ | |
"epoch": 5.888888888888889, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.8097330927848816, | |
"eval_runtime": 0.6502, | |
"eval_samples_per_second": 110.73, | |
"eval_steps_per_second": 7.69, | |
"step": 53 | |
}, | |
{ | |
"epoch": 6.0, | |
"grad_norm": 6.800503730773926, | |
"learning_rate": 2.0454545454545457e-05, | |
"loss": 0.69, | |
"step": 54 | |
}, | |
{ | |
"epoch": 6.0, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.8104926347732544, | |
"eval_runtime": 0.5514, | |
"eval_samples_per_second": 130.565, | |
"eval_steps_per_second": 9.067, | |
"step": 54 | |
}, | |
{ | |
"epoch": 6.111111111111111, | |
"grad_norm": 3.1313846111297607, | |
"learning_rate": 1.9886363636363638e-05, | |
"loss": 0.6828, | |
"step": 55 | |
}, | |
{ | |
"epoch": 6.111111111111111, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.8099636435508728, | |
"eval_runtime": 0.5992, | |
"eval_samples_per_second": 120.168, | |
"eval_steps_per_second": 8.345, | |
"step": 55 | |
}, | |
{ | |
"epoch": 6.222222222222222, | |
"grad_norm": 4.510275363922119, | |
"learning_rate": 1.9318181818181818e-05, | |
"loss": 0.6665, | |
"step": 56 | |
}, | |
{ | |
"epoch": 6.222222222222222, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.8113335371017456, | |
"eval_runtime": 0.5509, | |
"eval_samples_per_second": 130.706, | |
"eval_steps_per_second": 9.077, | |
"step": 56 | |
}, | |
{ | |
"epoch": 6.333333333333333, | |
"grad_norm": 8.366003036499023, | |
"learning_rate": 1.8750000000000002e-05, | |
"loss": 0.7423, | |
"step": 57 | |
}, | |
{ | |
"epoch": 6.333333333333333, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.8116319179534912, | |
"eval_runtime": 0.5995, | |
"eval_samples_per_second": 120.099, | |
"eval_steps_per_second": 8.34, | |
"step": 57 | |
}, | |
{ | |
"epoch": 6.444444444444445, | |
"grad_norm": 2.654489517211914, | |
"learning_rate": 1.8181818181818182e-05, | |
"loss": 0.7059, | |
"step": 58 | |
}, | |
{ | |
"epoch": 6.444444444444445, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.8082817792892456, | |
"eval_runtime": 0.5516, | |
"eval_samples_per_second": 130.519, | |
"eval_steps_per_second": 9.064, | |
"step": 58 | |
}, | |
{ | |
"epoch": 6.555555555555555, | |
"grad_norm": 9.320557594299316, | |
"learning_rate": 1.7613636363636366e-05, | |
"loss": 0.7396, | |
"step": 59 | |
}, | |
{ | |
"epoch": 6.555555555555555, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.8052436113357544, | |
"eval_runtime": 0.5998, | |
"eval_samples_per_second": 120.045, | |
"eval_steps_per_second": 8.336, | |
"step": 59 | |
}, | |
{ | |
"epoch": 6.666666666666667, | |
"grad_norm": 7.24456787109375, | |
"learning_rate": 1.7045454545454546e-05, | |
"loss": 0.6561, | |
"step": 60 | |
}, | |
{ | |
"epoch": 6.666666666666667, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.8018798828125, | |
"eval_runtime": 0.5995, | |
"eval_samples_per_second": 120.107, | |
"eval_steps_per_second": 8.341, | |
"step": 60 | |
}, | |
{ | |
"epoch": 6.777777777777778, | |
"grad_norm": 7.286713123321533, | |
"learning_rate": 1.6477272727272726e-05, | |
"loss": 0.7355, | |
"step": 61 | |
}, | |
{ | |
"epoch": 6.777777777777778, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.8011067509651184, | |
"eval_runtime": 0.6, | |
"eval_samples_per_second": 119.998, | |
"eval_steps_per_second": 8.333, | |
"step": 61 | |
}, | |
{ | |
"epoch": 6.888888888888889, | |
"grad_norm": 4.26874303817749, | |
"learning_rate": 1.590909090909091e-05, | |
"loss": 0.7353, | |
"step": 62 | |
}, | |
{ | |
"epoch": 6.888888888888889, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.7994384765625, | |
"eval_runtime": 0.5529, | |
"eval_samples_per_second": 130.221, | |
"eval_steps_per_second": 9.043, | |
"step": 62 | |
}, | |
{ | |
"epoch": 7.0, | |
"grad_norm": 3.4262943267822266, | |
"learning_rate": 1.534090909090909e-05, | |
"loss": 0.6377, | |
"step": 63 | |
}, | |
{ | |
"epoch": 7.0, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.7936333417892456, | |
"eval_runtime": 0.5486, | |
"eval_samples_per_second": 131.251, | |
"eval_steps_per_second": 9.115, | |
"step": 63 | |
}, | |
{ | |
"epoch": 7.111111111111111, | |
"grad_norm": 5.528657913208008, | |
"learning_rate": 1.4772727272727274e-05, | |
"loss": 0.6874, | |
"step": 64 | |
}, | |
{ | |
"epoch": 7.111111111111111, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.7888047695159912, | |
"eval_runtime": 0.6014, | |
"eval_samples_per_second": 119.712, | |
"eval_steps_per_second": 8.313, | |
"step": 64 | |
}, | |
{ | |
"epoch": 7.222222222222222, | |
"grad_norm": 5.771848678588867, | |
"learning_rate": 1.4204545454545456e-05, | |
"loss": 0.7112, | |
"step": 65 | |
}, | |
{ | |
"epoch": 7.222222222222222, | |
"eval_accuracy": 0.3611111111111111, | |
"eval_loss": 0.7859971523284912, | |
"eval_runtime": 0.6005, | |
"eval_samples_per_second": 119.903, | |
"eval_steps_per_second": 8.327, | |
"step": 65 | |
}, | |
{ | |
"epoch": 7.333333333333333, | |
"grad_norm": 8.485419273376465, | |
"learning_rate": 1.3636363636363637e-05, | |
"loss": 0.6282, | |
"step": 66 | |
}, | |
{ | |
"epoch": 7.333333333333333, | |
"eval_accuracy": 0.3611111111111111, | |
"eval_loss": 0.7813856601715088, | |
"eval_runtime": 0.6515, | |
"eval_samples_per_second": 110.513, | |
"eval_steps_per_second": 7.675, | |
"step": 66 | |
}, | |
{ | |
"epoch": 7.444444444444445, | |
"grad_norm": 10.057698249816895, | |
"learning_rate": 1.3068181818181819e-05, | |
"loss": 0.6793, | |
"step": 67 | |
}, | |
{ | |
"epoch": 7.444444444444445, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.7774929404258728, | |
"eval_runtime": 0.5496, | |
"eval_samples_per_second": 131.003, | |
"eval_steps_per_second": 9.097, | |
"step": 67 | |
}, | |
{ | |
"epoch": 7.555555555555555, | |
"grad_norm": 9.190048217773438, | |
"learning_rate": 1.25e-05, | |
"loss": 0.6707, | |
"step": 68 | |
}, | |
{ | |
"epoch": 7.555555555555555, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.7748887538909912, | |
"eval_runtime": 0.5517, | |
"eval_samples_per_second": 130.502, | |
"eval_steps_per_second": 9.063, | |
"step": 68 | |
}, | |
{ | |
"epoch": 7.666666666666667, | |
"grad_norm": 6.497270107269287, | |
"learning_rate": 1.1931818181818183e-05, | |
"loss": 0.6571, | |
"step": 69 | |
}, | |
{ | |
"epoch": 7.666666666666667, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.77294921875, | |
"eval_runtime": 0.5507, | |
"eval_samples_per_second": 130.752, | |
"eval_steps_per_second": 9.08, | |
"step": 69 | |
}, | |
{ | |
"epoch": 7.777777777777778, | |
"grad_norm": 5.636641025543213, | |
"learning_rate": 1.1363636363636365e-05, | |
"loss": 0.6539, | |
"step": 70 | |
}, | |
{ | |
"epoch": 7.777777777777778, | |
"eval_accuracy": 0.3611111111111111, | |
"eval_loss": 0.7728678584098816, | |
"eval_runtime": 0.5491, | |
"eval_samples_per_second": 131.117, | |
"eval_steps_per_second": 9.105, | |
"step": 70 | |
}, | |
{ | |
"epoch": 7.888888888888889, | |
"grad_norm": 4.822576522827148, | |
"learning_rate": 1.0795454545454547e-05, | |
"loss": 0.6953, | |
"step": 71 | |
}, | |
{ | |
"epoch": 7.888888888888889, | |
"eval_accuracy": 0.3472222222222222, | |
"eval_loss": 0.7734103798866272, | |
"eval_runtime": 0.5482, | |
"eval_samples_per_second": 131.348, | |
"eval_steps_per_second": 9.121, | |
"step": 71 | |
}, | |
{ | |
"epoch": 8.0, | |
"grad_norm": 7.65333366394043, | |
"learning_rate": 1.0227272727272729e-05, | |
"loss": 0.7214, | |
"step": 72 | |
}, | |
{ | |
"epoch": 8.0, | |
"eval_accuracy": 0.3611111111111111, | |
"eval_loss": 0.7741156816482544, | |
"eval_runtime": 0.6017, | |
"eval_samples_per_second": 119.662, | |
"eval_steps_per_second": 8.31, | |
"step": 72 | |
}, | |
{ | |
"epoch": 8.11111111111111, | |
"grad_norm": 8.882245063781738, | |
"learning_rate": 9.659090909090909e-06, | |
"loss": 0.722, | |
"step": 73 | |
}, | |
{ | |
"epoch": 8.11111111111111, | |
"eval_accuracy": 0.3472222222222222, | |
"eval_loss": 0.7755805253982544, | |
"eval_runtime": 0.5489, | |
"eval_samples_per_second": 131.162, | |
"eval_steps_per_second": 9.108, | |
"step": 73 | |
}, | |
{ | |
"epoch": 8.222222222222221, | |
"grad_norm": 3.2345800399780273, | |
"learning_rate": 9.090909090909091e-06, | |
"loss": 0.6578, | |
"step": 74 | |
}, | |
{ | |
"epoch": 8.222222222222221, | |
"eval_accuracy": 0.3472222222222222, | |
"eval_loss": 0.771728515625, | |
"eval_runtime": 0.6516, | |
"eval_samples_per_second": 110.506, | |
"eval_steps_per_second": 7.674, | |
"step": 74 | |
}, | |
{ | |
"epoch": 8.333333333333334, | |
"grad_norm": 4.6726484298706055, | |
"learning_rate": 8.522727272727273e-06, | |
"loss": 0.7094, | |
"step": 75 | |
}, | |
{ | |
"epoch": 8.333333333333334, | |
"eval_accuracy": 0.3472222222222222, | |
"eval_loss": 0.7747802734375, | |
"eval_runtime": 0.5499, | |
"eval_samples_per_second": 130.927, | |
"eval_steps_per_second": 9.092, | |
"step": 75 | |
}, | |
{ | |
"epoch": 8.444444444444445, | |
"grad_norm": 16.0637264251709, | |
"learning_rate": 7.954545454545455e-06, | |
"loss": 0.7181, | |
"step": 76 | |
}, | |
{ | |
"epoch": 8.444444444444445, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.7764350175857544, | |
"eval_runtime": 0.7006, | |
"eval_samples_per_second": 102.77, | |
"eval_steps_per_second": 7.137, | |
"step": 76 | |
}, | |
{ | |
"epoch": 8.555555555555555, | |
"grad_norm": 4.427946090698242, | |
"learning_rate": 7.386363636363637e-06, | |
"loss": 0.6544, | |
"step": 77 | |
}, | |
{ | |
"epoch": 8.555555555555555, | |
"eval_accuracy": 0.3472222222222222, | |
"eval_loss": 0.7750922441482544, | |
"eval_runtime": 0.5503, | |
"eval_samples_per_second": 130.846, | |
"eval_steps_per_second": 9.086, | |
"step": 77 | |
}, | |
{ | |
"epoch": 8.666666666666666, | |
"grad_norm": 3.1340301036834717, | |
"learning_rate": 6.818181818181818e-06, | |
"loss": 0.6482, | |
"step": 78 | |
}, | |
{ | |
"epoch": 8.666666666666666, | |
"eval_accuracy": 0.3472222222222222, | |
"eval_loss": 0.7764485478401184, | |
"eval_runtime": 0.6071, | |
"eval_samples_per_second": 118.588, | |
"eval_steps_per_second": 8.235, | |
"step": 78 | |
}, | |
{ | |
"epoch": 8.777777777777779, | |
"grad_norm": 14.458281517028809, | |
"learning_rate": 6.25e-06, | |
"loss": 0.7313, | |
"step": 79 | |
}, | |
{ | |
"epoch": 8.777777777777779, | |
"eval_accuracy": 0.3472222222222222, | |
"eval_loss": 0.7769097089767456, | |
"eval_runtime": 0.6033, | |
"eval_samples_per_second": 119.344, | |
"eval_steps_per_second": 8.288, | |
"step": 79 | |
}, | |
{ | |
"epoch": 8.88888888888889, | |
"grad_norm": 10.602180480957031, | |
"learning_rate": 5.681818181818182e-06, | |
"loss": 0.7158, | |
"step": 80 | |
}, | |
{ | |
"epoch": 8.88888888888889, | |
"eval_accuracy": 0.3611111111111111, | |
"eval_loss": 0.7765977382659912, | |
"eval_runtime": 0.7058, | |
"eval_samples_per_second": 102.018, | |
"eval_steps_per_second": 7.085, | |
"step": 80 | |
}, | |
{ | |
"epoch": 9.0, | |
"grad_norm": 8.127225875854492, | |
"learning_rate": 5.113636363636364e-06, | |
"loss": 0.6447, | |
"step": 81 | |
}, | |
{ | |
"epoch": 9.0, | |
"eval_accuracy": 0.3472222222222222, | |
"eval_loss": 0.7755669355392456, | |
"eval_runtime": 0.6543, | |
"eval_samples_per_second": 110.049, | |
"eval_steps_per_second": 7.642, | |
"step": 81 | |
}, | |
{ | |
"epoch": 9.11111111111111, | |
"grad_norm": 6.457716464996338, | |
"learning_rate": 4.5454545454545455e-06, | |
"loss": 0.6601, | |
"step": 82 | |
}, | |
{ | |
"epoch": 9.11111111111111, | |
"eval_accuracy": 0.3333333333333333, | |
"eval_loss": 0.7762587070465088, | |
"eval_runtime": 0.6069, | |
"eval_samples_per_second": 118.636, | |
"eval_steps_per_second": 8.239, | |
"step": 82 | |
}, | |
{ | |
"epoch": 9.222222222222221, | |
"grad_norm": 3.813633918762207, | |
"learning_rate": 3.9772727272727275e-06, | |
"loss": 0.7732, | |
"step": 83 | |
}, | |
{ | |
"epoch": 9.222222222222221, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.7770453691482544, | |
"eval_runtime": 0.7076, | |
"eval_samples_per_second": 101.748, | |
"eval_steps_per_second": 7.066, | |
"step": 83 | |
}, | |
{ | |
"epoch": 9.333333333333334, | |
"grad_norm": 10.45854377746582, | |
"learning_rate": 3.409090909090909e-06, | |
"loss": 0.6657, | |
"step": 84 | |
}, | |
{ | |
"epoch": 9.333333333333334, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.7784559726715088, | |
"eval_runtime": 0.7109, | |
"eval_samples_per_second": 101.287, | |
"eval_steps_per_second": 7.034, | |
"step": 84 | |
}, | |
{ | |
"epoch": 9.444444444444445, | |
"grad_norm": 4.11740255355835, | |
"learning_rate": 2.840909090909091e-06, | |
"loss": 0.716, | |
"step": 85 | |
}, | |
{ | |
"epoch": 9.444444444444445, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.7778727412223816, | |
"eval_runtime": 0.5543, | |
"eval_samples_per_second": 129.897, | |
"eval_steps_per_second": 9.021, | |
"step": 85 | |
}, | |
{ | |
"epoch": 9.555555555555555, | |
"grad_norm": 10.107967376708984, | |
"learning_rate": 2.2727272727272728e-06, | |
"loss": 0.7518, | |
"step": 86 | |
}, | |
{ | |
"epoch": 9.555555555555555, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.7781982421875, | |
"eval_runtime": 0.6493, | |
"eval_samples_per_second": 110.892, | |
"eval_steps_per_second": 7.701, | |
"step": 86 | |
}, | |
{ | |
"epoch": 9.666666666666666, | |
"grad_norm": 6.475906848907471, | |
"learning_rate": 1.7045454545454546e-06, | |
"loss": 0.713, | |
"step": 87 | |
}, | |
{ | |
"epoch": 9.666666666666666, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.7775471806526184, | |
"eval_runtime": 0.5511, | |
"eval_samples_per_second": 130.655, | |
"eval_steps_per_second": 9.073, | |
"step": 87 | |
}, | |
{ | |
"epoch": 9.777777777777779, | |
"grad_norm": 11.71182632446289, | |
"learning_rate": 1.1363636363636364e-06, | |
"loss": 0.6931, | |
"step": 88 | |
}, | |
{ | |
"epoch": 9.777777777777779, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.7794325351715088, | |
"eval_runtime": 0.5536, | |
"eval_samples_per_second": 130.057, | |
"eval_steps_per_second": 9.032, | |
"step": 88 | |
}, | |
{ | |
"epoch": 9.88888888888889, | |
"grad_norm": 8.962738037109375, | |
"learning_rate": 5.681818181818182e-07, | |
"loss": 0.7001, | |
"step": 89 | |
}, | |
{ | |
"epoch": 9.88888888888889, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.7806532382965088, | |
"eval_runtime": 0.6003, | |
"eval_samples_per_second": 119.933, | |
"eval_steps_per_second": 8.329, | |
"step": 89 | |
}, | |
{ | |
"epoch": 10.0, | |
"grad_norm": 8.880528450012207, | |
"learning_rate": 0.0, | |
"loss": 0.6525, | |
"step": 90 | |
}, | |
{ | |
"epoch": 10.0, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.7818332314491272, | |
"eval_runtime": 0.6538, | |
"eval_samples_per_second": 110.129, | |
"eval_steps_per_second": 7.648, | |
"step": 90 | |
}, | |
{ | |
"epoch": 10.0, | |
"step": 90, | |
"total_flos": 13264513597440.0, | |
"train_loss": 0.7229804992675781, | |
"train_runtime": 239.59, | |
"train_samples_per_second": 11.854, | |
"train_steps_per_second": 0.376 | |
} | |
], | |
"logging_steps": 1, | |
"max_steps": 90, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 10, | |
"save_steps": 500, | |
"total_flos": 13264513597440.0, | |
"train_batch_size": 4, | |
"trial_name": null, | |
"trial_params": null | |
} | |