{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 1, "global_step": 90, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1111111111111111, "grad_norm": 34.7608757019043, "learning_rate": 2.5e-05, "loss": 1.2054, "step": 1 }, { "epoch": 0.1111111111111111, "eval_accuracy": 0.5277777777777778, "eval_loss": 1.0094876289367676, "eval_runtime": 0.7089, "eval_samples_per_second": 101.565, "eval_steps_per_second": 7.053, "step": 1 }, { "epoch": 0.2222222222222222, "grad_norm": 30.633609771728516, "learning_rate": 5e-05, "loss": 1.1567, "step": 2 }, { "epoch": 0.2222222222222222, "eval_accuracy": 0.5277777777777778, "eval_loss": 0.9987521767616272, "eval_runtime": 0.603, "eval_samples_per_second": 119.406, "eval_steps_per_second": 8.292, "step": 2 }, { "epoch": 0.3333333333333333, "grad_norm": 25.385608673095703, "learning_rate": 4.943181818181818e-05, "loss": 1.0806, "step": 3 }, { "epoch": 0.3333333333333333, "eval_accuracy": 0.5277777777777778, "eval_loss": 0.9628159999847412, "eval_runtime": 0.7075, "eval_samples_per_second": 101.767, "eval_steps_per_second": 7.067, "step": 3 }, { "epoch": 0.4444444444444444, "grad_norm": 22.26402473449707, "learning_rate": 4.886363636363637e-05, "loss": 0.9883, "step": 4 }, { "epoch": 0.4444444444444444, "eval_accuracy": 0.5277777777777778, "eval_loss": 0.9259067177772522, "eval_runtime": 0.7031, "eval_samples_per_second": 102.402, "eval_steps_per_second": 7.111, "step": 4 }, { "epoch": 0.5555555555555556, "grad_norm": 26.522153854370117, "learning_rate": 4.829545454545455e-05, "loss": 0.9736, "step": 5 }, { "epoch": 0.5555555555555556, "eval_accuracy": 0.4722222222222222, "eval_loss": 0.8975830078125, "eval_runtime": 0.7036, "eval_samples_per_second": 102.336, "eval_steps_per_second": 7.107, "step": 5 }, { "epoch": 0.6666666666666666, "grad_norm": 18.14297866821289, "learning_rate": 4.772727272727273e-05, "loss": 0.8267, "step": 6 }, { "epoch": 0.6666666666666666, "eval_accuracy": 0.4583333333333333, "eval_loss": 0.8745185136795044, "eval_runtime": 0.6043, "eval_samples_per_second": 119.149, "eval_steps_per_second": 8.274, "step": 6 }, { "epoch": 0.7777777777777778, "grad_norm": 10.767016410827637, "learning_rate": 4.715909090909091e-05, "loss": 0.8047, "step": 7 }, { "epoch": 0.7777777777777778, "eval_accuracy": 0.4166666666666667, "eval_loss": 0.8637288212776184, "eval_runtime": 0.7076, "eval_samples_per_second": 101.757, "eval_steps_per_second": 7.066, "step": 7 }, { "epoch": 0.8888888888888888, "grad_norm": 9.212442398071289, "learning_rate": 4.659090909090909e-05, "loss": 0.6807, "step": 8 }, { "epoch": 0.8888888888888888, "eval_accuracy": 0.4027777777777778, "eval_loss": 0.8712022304534912, "eval_runtime": 0.7061, "eval_samples_per_second": 101.97, "eval_steps_per_second": 7.081, "step": 8 }, { "epoch": 1.0, "grad_norm": 10.467320442199707, "learning_rate": 4.602272727272727e-05, "loss": 0.7425, "step": 9 }, { "epoch": 1.0, "eval_accuracy": 0.3611111111111111, "eval_loss": 0.8966064453125, "eval_runtime": 0.7054, "eval_samples_per_second": 102.068, "eval_steps_per_second": 7.088, "step": 9 }, { "epoch": 1.1111111111111112, "grad_norm": 8.758502960205078, "learning_rate": 4.545454545454546e-05, "loss": 0.7941, "step": 10 }, { "epoch": 1.1111111111111112, "eval_accuracy": 0.3611111111111111, "eval_loss": 0.896728515625, "eval_runtime": 0.7073, "eval_samples_per_second": 101.79, "eval_steps_per_second": 7.069, "step": 10 }, { "epoch": 1.2222222222222223, "grad_norm": 2.704101800918579, "learning_rate": 4.488636363636364e-05, "loss": 0.6092, "step": 11 }, { "epoch": 1.2222222222222223, "eval_accuracy": 0.3333333333333333, "eval_loss": 0.8883192539215088, "eval_runtime": 0.603, "eval_samples_per_second": 119.401, "eval_steps_per_second": 8.292, "step": 11 }, { "epoch": 1.3333333333333333, "grad_norm": 3.9579689502716064, "learning_rate": 4.431818181818182e-05, "loss": 0.7372, "step": 12 }, { "epoch": 1.3333333333333333, "eval_accuracy": 0.3333333333333333, "eval_loss": 0.8713921308517456, "eval_runtime": 0.6557, "eval_samples_per_second": 109.8, "eval_steps_per_second": 7.625, "step": 12 }, { "epoch": 1.4444444444444444, "grad_norm": 4.962625980377197, "learning_rate": 4.375e-05, "loss": 0.6553, "step": 13 }, { "epoch": 1.4444444444444444, "eval_accuracy": 0.3194444444444444, "eval_loss": 0.8686930537223816, "eval_runtime": 0.7054, "eval_samples_per_second": 102.073, "eval_steps_per_second": 7.088, "step": 13 }, { "epoch": 1.5555555555555556, "grad_norm": 2.9537837505340576, "learning_rate": 4.318181818181819e-05, "loss": 0.7266, "step": 14 }, { "epoch": 1.5555555555555556, "eval_accuracy": 0.2777777777777778, "eval_loss": 0.8620063066482544, "eval_runtime": 0.7075, "eval_samples_per_second": 101.766, "eval_steps_per_second": 7.067, "step": 14 }, { "epoch": 1.6666666666666665, "grad_norm": 4.937398910522461, "learning_rate": 4.261363636363637e-05, "loss": 0.7462, "step": 15 }, { "epoch": 1.6666666666666665, "eval_accuracy": 0.3333333333333333, "eval_loss": 0.8434787392616272, "eval_runtime": 0.7043, "eval_samples_per_second": 102.225, "eval_steps_per_second": 7.099, "step": 15 }, { "epoch": 1.7777777777777777, "grad_norm": 8.647956848144531, "learning_rate": 4.204545454545455e-05, "loss": 0.66, "step": 16 }, { "epoch": 1.7777777777777777, "eval_accuracy": 0.375, "eval_loss": 0.8198378086090088, "eval_runtime": 0.7049, "eval_samples_per_second": 102.141, "eval_steps_per_second": 7.093, "step": 16 }, { "epoch": 1.8888888888888888, "grad_norm": 6.325115203857422, "learning_rate": 4.1477272727272734e-05, "loss": 0.6908, "step": 17 }, { "epoch": 1.8888888888888888, "eval_accuracy": 0.4444444444444444, "eval_loss": 0.8034396767616272, "eval_runtime": 0.6506, "eval_samples_per_second": 110.674, "eval_steps_per_second": 7.686, "step": 17 }, { "epoch": 2.0, "grad_norm": 2.925394058227539, "learning_rate": 4.0909090909090915e-05, "loss": 0.6993, "step": 18 }, { "epoch": 2.0, "eval_accuracy": 0.5277777777777778, "eval_loss": 0.7982177734375, "eval_runtime": 0.7049, "eval_samples_per_second": 102.145, "eval_steps_per_second": 7.093, "step": 18 }, { "epoch": 2.111111111111111, "grad_norm": 4.028661251068115, "learning_rate": 4.034090909090909e-05, "loss": 0.7788, "step": 19 }, { "epoch": 2.111111111111111, "eval_accuracy": 0.5277777777777778, "eval_loss": 0.787353515625, "eval_runtime": 0.7038, "eval_samples_per_second": 102.296, "eval_steps_per_second": 7.104, "step": 19 }, { "epoch": 2.2222222222222223, "grad_norm": 3.346914529800415, "learning_rate": 3.9772727272727275e-05, "loss": 0.6594, "step": 20 }, { "epoch": 2.2222222222222223, "eval_accuracy": 0.5138888888888888, "eval_loss": 0.7767741084098816, "eval_runtime": 0.7031, "eval_samples_per_second": 102.403, "eval_steps_per_second": 7.111, "step": 20 }, { "epoch": 2.3333333333333335, "grad_norm": 5.400989532470703, "learning_rate": 3.9204545454545456e-05, "loss": 0.7303, "step": 21 }, { "epoch": 2.3333333333333335, "eval_accuracy": 0.5, "eval_loss": 0.7682834267616272, "eval_runtime": 0.6498, "eval_samples_per_second": 110.797, "eval_steps_per_second": 7.694, "step": 21 }, { "epoch": 2.4444444444444446, "grad_norm": 2.919682025909424, "learning_rate": 3.8636363636363636e-05, "loss": 0.6688, "step": 22 }, { "epoch": 2.4444444444444446, "eval_accuracy": 0.4861111111111111, "eval_loss": 0.7653401494026184, "eval_runtime": 0.551, "eval_samples_per_second": 130.66, "eval_steps_per_second": 9.074, "step": 22 }, { "epoch": 2.5555555555555554, "grad_norm": 9.212122917175293, "learning_rate": 3.8068181818181816e-05, "loss": 0.7337, "step": 23 }, { "epoch": 2.5555555555555554, "eval_accuracy": 0.5, "eval_loss": 0.7591145634651184, "eval_runtime": 0.7017, "eval_samples_per_second": 102.602, "eval_steps_per_second": 7.125, "step": 23 }, { "epoch": 2.6666666666666665, "grad_norm": 8.884961128234863, "learning_rate": 3.7500000000000003e-05, "loss": 0.6648, "step": 24 }, { "epoch": 2.6666666666666665, "eval_accuracy": 0.5, "eval_loss": 0.7524074912071228, "eval_runtime": 0.7032, "eval_samples_per_second": 102.392, "eval_steps_per_second": 7.111, "step": 24 }, { "epoch": 2.7777777777777777, "grad_norm": 5.935830116271973, "learning_rate": 3.6931818181818184e-05, "loss": 0.6589, "step": 25 }, { "epoch": 2.7777777777777777, "eval_accuracy": 0.5138888888888888, "eval_loss": 0.7492743730545044, "eval_runtime": 0.6514, "eval_samples_per_second": 110.527, "eval_steps_per_second": 7.675, "step": 25 }, { "epoch": 2.888888888888889, "grad_norm": 7.1309027671813965, "learning_rate": 3.6363636363636364e-05, "loss": 0.8163, "step": 26 }, { "epoch": 2.888888888888889, "eval_accuracy": 0.5416666666666666, "eval_loss": 0.7473212480545044, "eval_runtime": 0.706, "eval_samples_per_second": 101.99, "eval_steps_per_second": 7.083, "step": 26 }, { "epoch": 3.0, "grad_norm": 5.935413837432861, "learning_rate": 3.579545454545455e-05, "loss": 0.6456, "step": 27 }, { "epoch": 3.0, "eval_accuracy": 0.5694444444444444, "eval_loss": 0.74639892578125, "eval_runtime": 0.6008, "eval_samples_per_second": 119.848, "eval_steps_per_second": 8.323, "step": 27 }, { "epoch": 3.111111111111111, "grad_norm": 4.931116104125977, "learning_rate": 3.522727272727273e-05, "loss": 0.7576, "step": 28 }, { "epoch": 3.111111111111111, "eval_accuracy": 0.5555555555555556, "eval_loss": 0.7495456337928772, "eval_runtime": 0.6526, "eval_samples_per_second": 110.329, "eval_steps_per_second": 7.662, "step": 28 }, { "epoch": 3.2222222222222223, "grad_norm": 8.242729187011719, "learning_rate": 3.465909090909091e-05, "loss": 0.6763, "step": 29 }, { "epoch": 3.2222222222222223, "eval_accuracy": 0.5555555555555556, "eval_loss": 0.7501423954963684, "eval_runtime": 0.7058, "eval_samples_per_second": 102.009, "eval_steps_per_second": 7.084, "step": 29 }, { "epoch": 3.3333333333333335, "grad_norm": 4.411537170410156, "learning_rate": 3.409090909090909e-05, "loss": 0.7565, "step": 30 }, { "epoch": 3.3333333333333335, "eval_accuracy": 0.5555555555555556, "eval_loss": 0.7513563632965088, "eval_runtime": 0.7022, "eval_samples_per_second": 102.536, "eval_steps_per_second": 7.121, "step": 30 }, { "epoch": 3.4444444444444446, "grad_norm": 5.082301139831543, "learning_rate": 3.352272727272727e-05, "loss": 0.7888, "step": 31 }, { "epoch": 3.4444444444444446, "eval_accuracy": 0.5277777777777778, "eval_loss": 0.7555338740348816, "eval_runtime": 0.7031, "eval_samples_per_second": 102.41, "eval_steps_per_second": 7.112, "step": 31 }, { "epoch": 3.5555555555555554, "grad_norm": 3.5483062267303467, "learning_rate": 3.295454545454545e-05, "loss": 0.6769, "step": 32 }, { "epoch": 3.5555555555555554, "eval_accuracy": 0.5277777777777778, "eval_loss": 0.7550930380821228, "eval_runtime": 0.6042, "eval_samples_per_second": 119.164, "eval_steps_per_second": 8.275, "step": 32 }, { "epoch": 3.6666666666666665, "grad_norm": 3.9409377574920654, "learning_rate": 3.238636363636364e-05, "loss": 0.718, "step": 33 }, { "epoch": 3.6666666666666665, "eval_accuracy": 0.5277777777777778, "eval_loss": 0.7593722939491272, "eval_runtime": 0.7024, "eval_samples_per_second": 102.508, "eval_steps_per_second": 7.119, "step": 33 }, { "epoch": 3.7777777777777777, "grad_norm": 11.279867172241211, "learning_rate": 3.181818181818182e-05, "loss": 0.6424, "step": 34 }, { "epoch": 3.7777777777777777, "eval_accuracy": 0.5138888888888888, "eval_loss": 0.7629123330116272, "eval_runtime": 0.7053, "eval_samples_per_second": 102.077, "eval_steps_per_second": 7.089, "step": 34 }, { "epoch": 3.888888888888889, "grad_norm": 8.519107818603516, "learning_rate": 3.125e-05, "loss": 0.6471, "step": 35 }, { "epoch": 3.888888888888889, "eval_accuracy": 0.4861111111111111, "eval_loss": 0.7659505009651184, "eval_runtime": 0.703, "eval_samples_per_second": 102.418, "eval_steps_per_second": 7.112, "step": 35 }, { "epoch": 4.0, "grad_norm": 6.580870628356934, "learning_rate": 3.068181818181818e-05, "loss": 0.727, "step": 36 }, { "epoch": 4.0, "eval_accuracy": 0.4027777777777778, "eval_loss": 0.7695041298866272, "eval_runtime": 0.6543, "eval_samples_per_second": 110.049, "eval_steps_per_second": 7.642, "step": 36 }, { "epoch": 4.111111111111111, "grad_norm": 11.691934585571289, "learning_rate": 3.0113636363636365e-05, "loss": 0.73, "step": 37 }, { "epoch": 4.111111111111111, "eval_accuracy": 0.4027777777777778, "eval_loss": 0.7756076455116272, "eval_runtime": 0.7028, "eval_samples_per_second": 102.442, "eval_steps_per_second": 7.114, "step": 37 }, { "epoch": 4.222222222222222, "grad_norm": 13.515392303466797, "learning_rate": 2.954545454545455e-05, "loss": 0.687, "step": 38 }, { "epoch": 4.222222222222222, "eval_accuracy": 0.4722222222222222, "eval_loss": 0.7799343466758728, "eval_runtime": 0.705, "eval_samples_per_second": 102.128, "eval_steps_per_second": 7.092, "step": 38 }, { "epoch": 4.333333333333333, "grad_norm": 3.4968955516815186, "learning_rate": 2.8977272727272732e-05, "loss": 0.6573, "step": 39 }, { "epoch": 4.333333333333333, "eval_accuracy": 0.4583333333333333, "eval_loss": 0.7805582880973816, "eval_runtime": 0.7046, "eval_samples_per_second": 102.181, "eval_steps_per_second": 7.096, "step": 39 }, { "epoch": 4.444444444444445, "grad_norm": 6.2186713218688965, "learning_rate": 2.8409090909090912e-05, "loss": 0.6691, "step": 40 }, { "epoch": 4.444444444444445, "eval_accuracy": 0.4861111111111111, "eval_loss": 0.7847764492034912, "eval_runtime": 0.7058, "eval_samples_per_second": 102.009, "eval_steps_per_second": 7.084, "step": 40 }, { "epoch": 4.555555555555555, "grad_norm": 9.075811386108398, "learning_rate": 2.784090909090909e-05, "loss": 0.8023, "step": 41 }, { "epoch": 4.555555555555555, "eval_accuracy": 0.4583333333333333, "eval_loss": 0.78857421875, "eval_runtime": 0.7051, "eval_samples_per_second": 102.113, "eval_steps_per_second": 7.091, "step": 41 }, { "epoch": 4.666666666666667, "grad_norm": 5.013810157775879, "learning_rate": 2.7272727272727273e-05, "loss": 0.6703, "step": 42 }, { "epoch": 4.666666666666667, "eval_accuracy": 0.4861111111111111, "eval_loss": 0.7918837070465088, "eval_runtime": 0.655, "eval_samples_per_second": 109.915, "eval_steps_per_second": 7.633, "step": 42 }, { "epoch": 4.777777777777778, "grad_norm": 5.304051876068115, "learning_rate": 2.6704545454545453e-05, "loss": 0.7019, "step": 43 }, { "epoch": 4.777777777777778, "eval_accuracy": 0.4722222222222222, "eval_loss": 0.7928195595741272, "eval_runtime": 0.6557, "eval_samples_per_second": 109.8, "eval_steps_per_second": 7.625, "step": 43 }, { "epoch": 4.888888888888889, "grad_norm": 7.697951316833496, "learning_rate": 2.6136363636363637e-05, "loss": 0.6973, "step": 44 }, { "epoch": 4.888888888888889, "eval_accuracy": 0.4861111111111111, "eval_loss": 0.8020155429840088, "eval_runtime": 0.6545, "eval_samples_per_second": 110.008, "eval_steps_per_second": 7.639, "step": 44 }, { "epoch": 5.0, "grad_norm": 18.014299392700195, "learning_rate": 2.5568181818181817e-05, "loss": 0.7859, "step": 45 }, { "epoch": 5.0, "eval_accuracy": 0.4166666666666667, "eval_loss": 0.8017985224723816, "eval_runtime": 0.702, "eval_samples_per_second": 102.565, "eval_steps_per_second": 7.123, "step": 45 }, { "epoch": 5.111111111111111, "grad_norm": 5.591033458709717, "learning_rate": 2.5e-05, "loss": 0.6987, "step": 46 }, { "epoch": 5.111111111111111, "eval_accuracy": 0.4166666666666667, "eval_loss": 0.8059489130973816, "eval_runtime": 0.7047, "eval_samples_per_second": 102.172, "eval_steps_per_second": 7.095, "step": 46 }, { "epoch": 5.222222222222222, "grad_norm": 4.745278835296631, "learning_rate": 2.4431818181818185e-05, "loss": 0.6566, "step": 47 }, { "epoch": 5.222222222222222, "eval_accuracy": 0.4305555555555556, "eval_loss": 0.807861328125, "eval_runtime": 0.6555, "eval_samples_per_second": 109.837, "eval_steps_per_second": 7.628, "step": 47 }, { "epoch": 5.333333333333333, "grad_norm": 7.4073486328125, "learning_rate": 2.3863636363636365e-05, "loss": 0.7671, "step": 48 }, { "epoch": 5.333333333333333, "eval_accuracy": 0.4166666666666667, "eval_loss": 0.8123643398284912, "eval_runtime": 0.7034, "eval_samples_per_second": 102.354, "eval_steps_per_second": 7.108, "step": 48 }, { "epoch": 5.444444444444445, "grad_norm": 7.266972541809082, "learning_rate": 2.3295454545454546e-05, "loss": 0.7223, "step": 49 }, { "epoch": 5.444444444444445, "eval_accuracy": 0.4166666666666667, "eval_loss": 0.8111029863357544, "eval_runtime": 0.7066, "eval_samples_per_second": 101.892, "eval_steps_per_second": 7.076, "step": 49 }, { "epoch": 5.555555555555555, "grad_norm": 3.803126573562622, "learning_rate": 2.272727272727273e-05, "loss": 0.637, "step": 50 }, { "epoch": 5.555555555555555, "eval_accuracy": 0.4166666666666667, "eval_loss": 0.8150363564491272, "eval_runtime": 0.5565, "eval_samples_per_second": 129.373, "eval_steps_per_second": 8.984, "step": 50 }, { "epoch": 5.666666666666667, "grad_norm": 6.748312950134277, "learning_rate": 2.215909090909091e-05, "loss": 0.6406, "step": 51 }, { "epoch": 5.666666666666667, "eval_accuracy": 0.3888888888888889, "eval_loss": 0.8139106035232544, "eval_runtime": 0.6526, "eval_samples_per_second": 110.328, "eval_steps_per_second": 7.662, "step": 51 }, { "epoch": 5.777777777777778, "grad_norm": 14.67806625366211, "learning_rate": 2.1590909090909093e-05, "loss": 0.7293, "step": 52 }, { "epoch": 5.777777777777778, "eval_accuracy": 0.3888888888888889, "eval_loss": 0.8112928867340088, "eval_runtime": 0.5507, "eval_samples_per_second": 130.733, "eval_steps_per_second": 9.079, "step": 52 }, { "epoch": 5.888888888888889, "grad_norm": 12.766432762145996, "learning_rate": 2.1022727272727274e-05, "loss": 0.6668, "step": 53 }, { "epoch": 5.888888888888889, "eval_accuracy": 0.3888888888888889, "eval_loss": 0.8097330927848816, "eval_runtime": 0.6502, "eval_samples_per_second": 110.73, "eval_steps_per_second": 7.69, "step": 53 }, { "epoch": 6.0, "grad_norm": 6.800503730773926, "learning_rate": 2.0454545454545457e-05, "loss": 0.69, "step": 54 }, { "epoch": 6.0, "eval_accuracy": 0.3888888888888889, "eval_loss": 0.8104926347732544, "eval_runtime": 0.5514, "eval_samples_per_second": 130.565, "eval_steps_per_second": 9.067, "step": 54 }, { "epoch": 6.111111111111111, "grad_norm": 3.1313846111297607, "learning_rate": 1.9886363636363638e-05, "loss": 0.6828, "step": 55 }, { "epoch": 6.111111111111111, "eval_accuracy": 0.3888888888888889, "eval_loss": 0.8099636435508728, "eval_runtime": 0.5992, "eval_samples_per_second": 120.168, "eval_steps_per_second": 8.345, "step": 55 }, { "epoch": 6.222222222222222, "grad_norm": 4.510275363922119, "learning_rate": 1.9318181818181818e-05, "loss": 0.6665, "step": 56 }, { "epoch": 6.222222222222222, "eval_accuracy": 0.3888888888888889, "eval_loss": 0.8113335371017456, "eval_runtime": 0.5509, "eval_samples_per_second": 130.706, "eval_steps_per_second": 9.077, "step": 56 }, { "epoch": 6.333333333333333, "grad_norm": 8.366003036499023, "learning_rate": 1.8750000000000002e-05, "loss": 0.7423, "step": 57 }, { "epoch": 6.333333333333333, "eval_accuracy": 0.3888888888888889, "eval_loss": 0.8116319179534912, "eval_runtime": 0.5995, "eval_samples_per_second": 120.099, "eval_steps_per_second": 8.34, "step": 57 }, { "epoch": 6.444444444444445, "grad_norm": 2.654489517211914, "learning_rate": 1.8181818181818182e-05, "loss": 0.7059, "step": 58 }, { "epoch": 6.444444444444445, "eval_accuracy": 0.3888888888888889, "eval_loss": 0.8082817792892456, "eval_runtime": 0.5516, "eval_samples_per_second": 130.519, "eval_steps_per_second": 9.064, "step": 58 }, { "epoch": 6.555555555555555, "grad_norm": 9.320557594299316, "learning_rate": 1.7613636363636366e-05, "loss": 0.7396, "step": 59 }, { "epoch": 6.555555555555555, "eval_accuracy": 0.375, "eval_loss": 0.8052436113357544, "eval_runtime": 0.5998, "eval_samples_per_second": 120.045, "eval_steps_per_second": 8.336, "step": 59 }, { "epoch": 6.666666666666667, "grad_norm": 7.24456787109375, "learning_rate": 1.7045454545454546e-05, "loss": 0.6561, "step": 60 }, { "epoch": 6.666666666666667, "eval_accuracy": 0.375, "eval_loss": 0.8018798828125, "eval_runtime": 0.5995, "eval_samples_per_second": 120.107, "eval_steps_per_second": 8.341, "step": 60 }, { "epoch": 6.777777777777778, "grad_norm": 7.286713123321533, "learning_rate": 1.6477272727272726e-05, "loss": 0.7355, "step": 61 }, { "epoch": 6.777777777777778, "eval_accuracy": 0.375, "eval_loss": 0.8011067509651184, "eval_runtime": 0.6, "eval_samples_per_second": 119.998, "eval_steps_per_second": 8.333, "step": 61 }, { "epoch": 6.888888888888889, "grad_norm": 4.26874303817749, "learning_rate": 1.590909090909091e-05, "loss": 0.7353, "step": 62 }, { "epoch": 6.888888888888889, "eval_accuracy": 0.375, "eval_loss": 0.7994384765625, "eval_runtime": 0.5529, "eval_samples_per_second": 130.221, "eval_steps_per_second": 9.043, "step": 62 }, { "epoch": 7.0, "grad_norm": 3.4262943267822266, "learning_rate": 1.534090909090909e-05, "loss": 0.6377, "step": 63 }, { "epoch": 7.0, "eval_accuracy": 0.375, "eval_loss": 0.7936333417892456, "eval_runtime": 0.5486, "eval_samples_per_second": 131.251, "eval_steps_per_second": 9.115, "step": 63 }, { "epoch": 7.111111111111111, "grad_norm": 5.528657913208008, "learning_rate": 1.4772727272727274e-05, "loss": 0.6874, "step": 64 }, { "epoch": 7.111111111111111, "eval_accuracy": 0.375, "eval_loss": 0.7888047695159912, "eval_runtime": 0.6014, "eval_samples_per_second": 119.712, "eval_steps_per_second": 8.313, "step": 64 }, { "epoch": 7.222222222222222, "grad_norm": 5.771848678588867, "learning_rate": 1.4204545454545456e-05, "loss": 0.7112, "step": 65 }, { "epoch": 7.222222222222222, "eval_accuracy": 0.3611111111111111, "eval_loss": 0.7859971523284912, "eval_runtime": 0.6005, "eval_samples_per_second": 119.903, "eval_steps_per_second": 8.327, "step": 65 }, { "epoch": 7.333333333333333, "grad_norm": 8.485419273376465, "learning_rate": 1.3636363636363637e-05, "loss": 0.6282, "step": 66 }, { "epoch": 7.333333333333333, "eval_accuracy": 0.3611111111111111, "eval_loss": 0.7813856601715088, "eval_runtime": 0.6515, "eval_samples_per_second": 110.513, "eval_steps_per_second": 7.675, "step": 66 }, { "epoch": 7.444444444444445, "grad_norm": 10.057698249816895, "learning_rate": 1.3068181818181819e-05, "loss": 0.6793, "step": 67 }, { "epoch": 7.444444444444445, "eval_accuracy": 0.375, "eval_loss": 0.7774929404258728, "eval_runtime": 0.5496, "eval_samples_per_second": 131.003, "eval_steps_per_second": 9.097, "step": 67 }, { "epoch": 7.555555555555555, "grad_norm": 9.190048217773438, "learning_rate": 1.25e-05, "loss": 0.6707, "step": 68 }, { "epoch": 7.555555555555555, "eval_accuracy": 0.3888888888888889, "eval_loss": 0.7748887538909912, "eval_runtime": 0.5517, "eval_samples_per_second": 130.502, "eval_steps_per_second": 9.063, "step": 68 }, { "epoch": 7.666666666666667, "grad_norm": 6.497270107269287, "learning_rate": 1.1931818181818183e-05, "loss": 0.6571, "step": 69 }, { "epoch": 7.666666666666667, "eval_accuracy": 0.375, "eval_loss": 0.77294921875, "eval_runtime": 0.5507, "eval_samples_per_second": 130.752, "eval_steps_per_second": 9.08, "step": 69 }, { "epoch": 7.777777777777778, "grad_norm": 5.636641025543213, "learning_rate": 1.1363636363636365e-05, "loss": 0.6539, "step": 70 }, { "epoch": 7.777777777777778, "eval_accuracy": 0.3611111111111111, "eval_loss": 0.7728678584098816, "eval_runtime": 0.5491, "eval_samples_per_second": 131.117, "eval_steps_per_second": 9.105, "step": 70 }, { "epoch": 7.888888888888889, "grad_norm": 4.822576522827148, "learning_rate": 1.0795454545454547e-05, "loss": 0.6953, "step": 71 }, { "epoch": 7.888888888888889, "eval_accuracy": 0.3472222222222222, "eval_loss": 0.7734103798866272, "eval_runtime": 0.5482, "eval_samples_per_second": 131.348, "eval_steps_per_second": 9.121, "step": 71 }, { "epoch": 8.0, "grad_norm": 7.65333366394043, "learning_rate": 1.0227272727272729e-05, "loss": 0.7214, "step": 72 }, { "epoch": 8.0, "eval_accuracy": 0.3611111111111111, "eval_loss": 0.7741156816482544, "eval_runtime": 0.6017, "eval_samples_per_second": 119.662, "eval_steps_per_second": 8.31, "step": 72 }, { "epoch": 8.11111111111111, "grad_norm": 8.882245063781738, "learning_rate": 9.659090909090909e-06, "loss": 0.722, "step": 73 }, { "epoch": 8.11111111111111, "eval_accuracy": 0.3472222222222222, "eval_loss": 0.7755805253982544, "eval_runtime": 0.5489, "eval_samples_per_second": 131.162, "eval_steps_per_second": 9.108, "step": 73 }, { "epoch": 8.222222222222221, "grad_norm": 3.2345800399780273, "learning_rate": 9.090909090909091e-06, "loss": 0.6578, "step": 74 }, { "epoch": 8.222222222222221, "eval_accuracy": 0.3472222222222222, "eval_loss": 0.771728515625, "eval_runtime": 0.6516, "eval_samples_per_second": 110.506, "eval_steps_per_second": 7.674, "step": 74 }, { "epoch": 8.333333333333334, "grad_norm": 4.6726484298706055, "learning_rate": 8.522727272727273e-06, "loss": 0.7094, "step": 75 }, { "epoch": 8.333333333333334, "eval_accuracy": 0.3472222222222222, "eval_loss": 0.7747802734375, "eval_runtime": 0.5499, "eval_samples_per_second": 130.927, "eval_steps_per_second": 9.092, "step": 75 }, { "epoch": 8.444444444444445, "grad_norm": 16.0637264251709, "learning_rate": 7.954545454545455e-06, "loss": 0.7181, "step": 76 }, { "epoch": 8.444444444444445, "eval_accuracy": 0.375, "eval_loss": 0.7764350175857544, "eval_runtime": 0.7006, "eval_samples_per_second": 102.77, "eval_steps_per_second": 7.137, "step": 76 }, { "epoch": 8.555555555555555, "grad_norm": 4.427946090698242, "learning_rate": 7.386363636363637e-06, "loss": 0.6544, "step": 77 }, { "epoch": 8.555555555555555, "eval_accuracy": 0.3472222222222222, "eval_loss": 0.7750922441482544, "eval_runtime": 0.5503, "eval_samples_per_second": 130.846, "eval_steps_per_second": 9.086, "step": 77 }, { "epoch": 8.666666666666666, "grad_norm": 3.1340301036834717, "learning_rate": 6.818181818181818e-06, "loss": 0.6482, "step": 78 }, { "epoch": 8.666666666666666, "eval_accuracy": 0.3472222222222222, "eval_loss": 0.7764485478401184, "eval_runtime": 0.6071, "eval_samples_per_second": 118.588, "eval_steps_per_second": 8.235, "step": 78 }, { "epoch": 8.777777777777779, "grad_norm": 14.458281517028809, "learning_rate": 6.25e-06, "loss": 0.7313, "step": 79 }, { "epoch": 8.777777777777779, "eval_accuracy": 0.3472222222222222, "eval_loss": 0.7769097089767456, "eval_runtime": 0.6033, "eval_samples_per_second": 119.344, "eval_steps_per_second": 8.288, "step": 79 }, { "epoch": 8.88888888888889, "grad_norm": 10.602180480957031, "learning_rate": 5.681818181818182e-06, "loss": 0.7158, "step": 80 }, { "epoch": 8.88888888888889, "eval_accuracy": 0.3611111111111111, "eval_loss": 0.7765977382659912, "eval_runtime": 0.7058, "eval_samples_per_second": 102.018, "eval_steps_per_second": 7.085, "step": 80 }, { "epoch": 9.0, "grad_norm": 8.127225875854492, "learning_rate": 5.113636363636364e-06, "loss": 0.6447, "step": 81 }, { "epoch": 9.0, "eval_accuracy": 0.3472222222222222, "eval_loss": 0.7755669355392456, "eval_runtime": 0.6543, "eval_samples_per_second": 110.049, "eval_steps_per_second": 7.642, "step": 81 }, { "epoch": 9.11111111111111, "grad_norm": 6.457716464996338, "learning_rate": 4.5454545454545455e-06, "loss": 0.6601, "step": 82 }, { "epoch": 9.11111111111111, "eval_accuracy": 0.3333333333333333, "eval_loss": 0.7762587070465088, "eval_runtime": 0.6069, "eval_samples_per_second": 118.636, "eval_steps_per_second": 8.239, "step": 82 }, { "epoch": 9.222222222222221, "grad_norm": 3.813633918762207, "learning_rate": 3.9772727272727275e-06, "loss": 0.7732, "step": 83 }, { "epoch": 9.222222222222221, "eval_accuracy": 0.3888888888888889, "eval_loss": 0.7770453691482544, "eval_runtime": 0.7076, "eval_samples_per_second": 101.748, "eval_steps_per_second": 7.066, "step": 83 }, { "epoch": 9.333333333333334, "grad_norm": 10.45854377746582, "learning_rate": 3.409090909090909e-06, "loss": 0.6657, "step": 84 }, { "epoch": 9.333333333333334, "eval_accuracy": 0.3888888888888889, "eval_loss": 0.7784559726715088, "eval_runtime": 0.7109, "eval_samples_per_second": 101.287, "eval_steps_per_second": 7.034, "step": 84 }, { "epoch": 9.444444444444445, "grad_norm": 4.11740255355835, "learning_rate": 2.840909090909091e-06, "loss": 0.716, "step": 85 }, { "epoch": 9.444444444444445, "eval_accuracy": 0.3888888888888889, "eval_loss": 0.7778727412223816, "eval_runtime": 0.5543, "eval_samples_per_second": 129.897, "eval_steps_per_second": 9.021, "step": 85 }, { "epoch": 9.555555555555555, "grad_norm": 10.107967376708984, "learning_rate": 2.2727272727272728e-06, "loss": 0.7518, "step": 86 }, { "epoch": 9.555555555555555, "eval_accuracy": 0.3888888888888889, "eval_loss": 0.7781982421875, "eval_runtime": 0.6493, "eval_samples_per_second": 110.892, "eval_steps_per_second": 7.701, "step": 86 }, { "epoch": 9.666666666666666, "grad_norm": 6.475906848907471, "learning_rate": 1.7045454545454546e-06, "loss": 0.713, "step": 87 }, { "epoch": 9.666666666666666, "eval_accuracy": 0.3888888888888889, "eval_loss": 0.7775471806526184, "eval_runtime": 0.5511, "eval_samples_per_second": 130.655, "eval_steps_per_second": 9.073, "step": 87 }, { "epoch": 9.777777777777779, "grad_norm": 11.71182632446289, "learning_rate": 1.1363636363636364e-06, "loss": 0.6931, "step": 88 }, { "epoch": 9.777777777777779, "eval_accuracy": 0.375, "eval_loss": 0.7794325351715088, "eval_runtime": 0.5536, "eval_samples_per_second": 130.057, "eval_steps_per_second": 9.032, "step": 88 }, { "epoch": 9.88888888888889, "grad_norm": 8.962738037109375, "learning_rate": 5.681818181818182e-07, "loss": 0.7001, "step": 89 }, { "epoch": 9.88888888888889, "eval_accuracy": 0.3888888888888889, "eval_loss": 0.7806532382965088, "eval_runtime": 0.6003, "eval_samples_per_second": 119.933, "eval_steps_per_second": 8.329, "step": 89 }, { "epoch": 10.0, "grad_norm": 8.880528450012207, "learning_rate": 0.0, "loss": 0.6525, "step": 90 }, { "epoch": 10.0, "eval_accuracy": 0.3888888888888889, "eval_loss": 0.7818332314491272, "eval_runtime": 0.6538, "eval_samples_per_second": 110.129, "eval_steps_per_second": 7.648, "step": 90 }, { "epoch": 10.0, "step": 90, "total_flos": 13264513597440.0, "train_loss": 0.7229804992675781, "train_runtime": 239.59, "train_samples_per_second": 11.854, "train_steps_per_second": 0.376 } ], "logging_steps": 1, "max_steps": 90, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 13264513597440.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }