{ "best_metric": 0.5025785565376282, "best_model_checkpoint": "models/E-Coli-FFT/KCYHSM/checkpoint-21500", "epoch": 8.96, "eval_steps": 500, "global_step": 28000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16, "grad_norm": 0.560947835445404, "learning_rate": 4.996016e-05, "loss": 2.0872, "step": 500 }, { "epoch": 0.16, "eval_accuracy_per_token": 0.30261626839637756, "eval_loss": 2.0467002391815186, "eval_runtime": 226.6763, "eval_samples_per_second": 110.289, "eval_steps_per_second": 6.895, "step": 500 }, { "epoch": 0.32, "grad_norm": 0.5276467204093933, "learning_rate": 4.992016000000001e-05, "loss": 2.033, "step": 1000 }, { "epoch": 0.32, "eval_accuracy_per_token": 0.31899595260620117, "eval_loss": 2.0086700916290283, "eval_runtime": 215.1833, "eval_samples_per_second": 116.18, "eval_steps_per_second": 7.264, "step": 1000 }, { "epoch": 0.48, "grad_norm": 1.3343302011489868, "learning_rate": 4.988016e-05, "loss": 1.9439, "step": 1500 }, { "epoch": 0.48, "eval_accuracy_per_token": 0.3831162750720978, "eval_loss": 1.8512518405914307, "eval_runtime": 215.5299, "eval_samples_per_second": 115.993, "eval_steps_per_second": 7.252, "step": 1500 }, { "epoch": 0.64, "grad_norm": 1.4562740325927734, "learning_rate": 4.984016e-05, "loss": 1.7198, "step": 2000 }, { "epoch": 0.64, "eval_accuracy_per_token": 0.47227564454078674, "eval_loss": 1.6024138927459717, "eval_runtime": 215.7037, "eval_samples_per_second": 115.9, "eval_steps_per_second": 7.246, "step": 2000 }, { "epoch": 0.8, "grad_norm": 2.136140823364258, "learning_rate": 4.980016e-05, "loss": 1.5124, "step": 2500 }, { "epoch": 0.8, "eval_accuracy_per_token": 0.5323951840400696, "eval_loss": 1.4281255006790161, "eval_runtime": 215.8041, "eval_samples_per_second": 115.846, "eval_steps_per_second": 7.243, "step": 2500 }, { "epoch": 0.96, "grad_norm": 1.6090725660324097, "learning_rate": 4.976016e-05, "loss": 1.3463, "step": 3000 }, { "epoch": 0.96, "eval_accuracy_per_token": 0.5815901160240173, "eval_loss": 1.2842084169387817, "eval_runtime": 215.7888, "eval_samples_per_second": 115.854, "eval_steps_per_second": 7.243, "step": 3000 }, { "epoch": 1.12, "grad_norm": 2.4967901706695557, "learning_rate": 4.972016e-05, "loss": 1.1657, "step": 3500 }, { "epoch": 1.12, "eval_accuracy_per_token": 0.6198335886001587, "eval_loss": 1.1727232933044434, "eval_runtime": 215.8392, "eval_samples_per_second": 115.827, "eval_steps_per_second": 7.242, "step": 3500 }, { "epoch": 1.28, "grad_norm": 2.750229597091675, "learning_rate": 4.968016e-05, "loss": 1.0624, "step": 4000 }, { "epoch": 1.28, "eval_accuracy_per_token": 0.6496000289916992, "eval_loss": 1.0832889080047607, "eval_runtime": 216.0152, "eval_samples_per_second": 115.733, "eval_steps_per_second": 7.236, "step": 4000 }, { "epoch": 1.44, "grad_norm": 2.4956400394439697, "learning_rate": 4.9640160000000003e-05, "loss": 0.9957, "step": 4500 }, { "epoch": 1.44, "eval_accuracy_per_token": 0.6765108108520508, "eval_loss": 1.0036195516586304, "eval_runtime": 215.7199, "eval_samples_per_second": 115.891, "eval_steps_per_second": 7.246, "step": 4500 }, { "epoch": 1.6, "grad_norm": 2.816002368927002, "learning_rate": 4.9600160000000004e-05, "loss": 0.913, "step": 5000 }, { "epoch": 1.6, "eval_accuracy_per_token": 0.6984472274780273, "eval_loss": 0.9418182969093323, "eval_runtime": 215.8333, "eval_samples_per_second": 115.83, "eval_steps_per_second": 7.242, "step": 5000 }, { "epoch": 1.76, "grad_norm": 2.293124198913574, "learning_rate": 4.956016e-05, "loss": 0.8747, "step": 5500 }, { "epoch": 1.76, "eval_accuracy_per_token": 0.7186679244041443, "eval_loss": 0.8788002133369446, "eval_runtime": 215.9244, "eval_samples_per_second": 115.781, "eval_steps_per_second": 7.239, "step": 5500 }, { "epoch": 1.92, "grad_norm": 2.6259233951568604, "learning_rate": 4.9520160000000005e-05, "loss": 0.8064, "step": 6000 }, { "epoch": 1.92, "eval_accuracy_per_token": 0.7395341992378235, "eval_loss": 0.8194286823272705, "eval_runtime": 215.8802, "eval_samples_per_second": 115.805, "eval_steps_per_second": 7.24, "step": 6000 }, { "epoch": 2.08, "grad_norm": 2.7774460315704346, "learning_rate": 4.9480160000000005e-05, "loss": 0.7329, "step": 6500 }, { "epoch": 2.08, "eval_accuracy_per_token": 0.7511833310127258, "eval_loss": 0.7874646782875061, "eval_runtime": 215.9936, "eval_samples_per_second": 115.744, "eval_steps_per_second": 7.236, "step": 6500 }, { "epoch": 2.24, "grad_norm": 2.8111910820007324, "learning_rate": 4.9440160000000005e-05, "loss": 0.6541, "step": 7000 }, { "epoch": 2.24, "eval_accuracy_per_token": 0.7654264569282532, "eval_loss": 0.7462261915206909, "eval_runtime": 215.7774, "eval_samples_per_second": 115.86, "eval_steps_per_second": 7.244, "step": 7000 }, { "epoch": 2.4, "grad_norm": 2.734023332595825, "learning_rate": 4.940016e-05, "loss": 0.6368, "step": 7500 }, { "epoch": 2.4, "eval_accuracy_per_token": 0.7762272953987122, "eval_loss": 0.7165001630783081, "eval_runtime": 215.5714, "eval_samples_per_second": 115.971, "eval_steps_per_second": 7.25, "step": 7500 }, { "epoch": 2.56, "grad_norm": 3.085864782333374, "learning_rate": 4.936016e-05, "loss": 0.6019, "step": 8000 }, { "epoch": 2.56, "eval_accuracy_per_token": 0.7840808033943176, "eval_loss": 0.6941312551498413, "eval_runtime": 215.5763, "eval_samples_per_second": 115.968, "eval_steps_per_second": 7.25, "step": 8000 }, { "epoch": 2.7199999999999998, "grad_norm": 2.5468504428863525, "learning_rate": 4.9320320000000004e-05, "loss": 0.5855, "step": 8500 }, { "epoch": 2.7199999999999998, "eval_accuracy_per_token": 0.7905924916267395, "eval_loss": 0.6726610064506531, "eval_runtime": 215.5941, "eval_samples_per_second": 115.959, "eval_steps_per_second": 7.25, "step": 8500 }, { "epoch": 2.88, "grad_norm": 2.7991392612457275, "learning_rate": 4.9280320000000005e-05, "loss": 0.565, "step": 9000 }, { "epoch": 2.88, "eval_accuracy_per_token": 0.7970213890075684, "eval_loss": 0.6574403047561646, "eval_runtime": 215.6086, "eval_samples_per_second": 115.951, "eval_steps_per_second": 7.249, "step": 9000 }, { "epoch": 3.04, "grad_norm": 2.0875983238220215, "learning_rate": 4.924032e-05, "loss": 0.5437, "step": 9500 }, { "epoch": 3.04, "eval_accuracy_per_token": 0.8036227226257324, "eval_loss": 0.6450381278991699, "eval_runtime": 215.7698, "eval_samples_per_second": 115.864, "eval_steps_per_second": 7.244, "step": 9500 }, { "epoch": 3.2, "grad_norm": 2.820004940032959, "learning_rate": 4.920032e-05, "loss": 0.4618, "step": 10000 }, { "epoch": 3.2, "eval_accuracy_per_token": 0.8065482378005981, "eval_loss": 0.6290712952613831, "eval_runtime": 215.5858, "eval_samples_per_second": 115.963, "eval_steps_per_second": 7.25, "step": 10000 }, { "epoch": 3.36, "grad_norm": 2.534975051879883, "learning_rate": 4.916048e-05, "loss": 0.4836, "step": 10500 }, { "epoch": 3.36, "eval_accuracy_per_token": 0.8109478950500488, "eval_loss": 0.621654748916626, "eval_runtime": 215.6832, "eval_samples_per_second": 115.911, "eval_steps_per_second": 7.247, "step": 10500 }, { "epoch": 3.52, "grad_norm": 2.2555694580078125, "learning_rate": 4.9120480000000004e-05, "loss": 0.4793, "step": 11000 }, { "epoch": 3.52, "eval_accuracy_per_token": 0.8153020739555359, "eval_loss": 0.6032074093818665, "eval_runtime": 215.7864, "eval_samples_per_second": 115.855, "eval_steps_per_second": 7.243, "step": 11000 }, { "epoch": 3.68, "grad_norm": 2.6011011600494385, "learning_rate": 4.9080480000000004e-05, "loss": 0.4639, "step": 11500 }, { "epoch": 3.68, "eval_accuracy_per_token": 0.8189014792442322, "eval_loss": 0.5884661078453064, "eval_runtime": 215.9181, "eval_samples_per_second": 115.785, "eval_steps_per_second": 7.239, "step": 11500 }, { "epoch": 3.84, "grad_norm": 2.201638698577881, "learning_rate": 4.904048e-05, "loss": 0.4734, "step": 12000 }, { "epoch": 3.84, "eval_accuracy_per_token": 0.8225154280662537, "eval_loss": 0.5901506543159485, "eval_runtime": 215.7481, "eval_samples_per_second": 115.876, "eval_steps_per_second": 7.245, "step": 12000 }, { "epoch": 4.0, "grad_norm": 2.115771770477295, "learning_rate": 4.9000480000000005e-05, "loss": 0.4589, "step": 12500 }, { "epoch": 4.0, "eval_accuracy_per_token": 0.824912428855896, "eval_loss": 0.57036292552948, "eval_runtime": 215.9409, "eval_samples_per_second": 115.772, "eval_steps_per_second": 7.238, "step": 12500 }, { "epoch": 4.16, "grad_norm": 2.13425350189209, "learning_rate": 4.8960480000000005e-05, "loss": 0.3846, "step": 13000 }, { "epoch": 4.16, "eval_accuracy_per_token": 0.8280689716339111, "eval_loss": 0.5732296705245972, "eval_runtime": 215.8629, "eval_samples_per_second": 115.814, "eval_steps_per_second": 7.241, "step": 13000 }, { "epoch": 4.32, "grad_norm": 2.227823257446289, "learning_rate": 4.892048e-05, "loss": 0.3907, "step": 13500 }, { "epoch": 4.32, "eval_accuracy_per_token": 0.8309552073478699, "eval_loss": 0.5606500506401062, "eval_runtime": 215.8455, "eval_samples_per_second": 115.824, "eval_steps_per_second": 7.241, "step": 13500 }, { "epoch": 4.48, "grad_norm": 1.9877572059631348, "learning_rate": 4.888048e-05, "loss": 0.3781, "step": 14000 }, { "epoch": 4.48, "eval_accuracy_per_token": 0.833513617515564, "eval_loss": 0.5528165102005005, "eval_runtime": 216.0002, "eval_samples_per_second": 115.741, "eval_steps_per_second": 7.236, "step": 14000 }, { "epoch": 4.64, "grad_norm": 2.0553159713745117, "learning_rate": 4.884048e-05, "loss": 0.3914, "step": 14500 }, { "epoch": 4.64, "eval_accuracy_per_token": 0.8335487246513367, "eval_loss": 0.549156904220581, "eval_runtime": 215.7201, "eval_samples_per_second": 115.891, "eval_steps_per_second": 7.246, "step": 14500 }, { "epoch": 4.8, "grad_norm": 2.2741925716400146, "learning_rate": 4.880048000000001e-05, "loss": 0.3894, "step": 15000 }, { "epoch": 4.8, "eval_accuracy_per_token": 0.8362560272216797, "eval_loss": 0.5504783391952515, "eval_runtime": 215.7979, "eval_samples_per_second": 115.849, "eval_steps_per_second": 7.243, "step": 15000 }, { "epoch": 4.96, "grad_norm": 2.08858585357666, "learning_rate": 4.876048e-05, "loss": 0.3944, "step": 15500 }, { "epoch": 4.96, "eval_accuracy_per_token": 0.8395103812217712, "eval_loss": 0.5324631929397583, "eval_runtime": 215.9188, "eval_samples_per_second": 115.784, "eval_steps_per_second": 7.239, "step": 15500 }, { "epoch": 5.12, "grad_norm": 2.1368110179901123, "learning_rate": 4.872048e-05, "loss": 0.3307, "step": 16000 }, { "epoch": 5.12, "eval_accuracy_per_token": 0.839469850063324, "eval_loss": 0.5465147495269775, "eval_runtime": 216.142, "eval_samples_per_second": 115.665, "eval_steps_per_second": 7.231, "step": 16000 }, { "epoch": 5.28, "grad_norm": 2.2641913890838623, "learning_rate": 4.868048e-05, "loss": 0.3125, "step": 16500 }, { "epoch": 5.28, "eval_accuracy_per_token": 0.8418903946876526, "eval_loss": 0.5389049053192139, "eval_runtime": 216.0271, "eval_samples_per_second": 115.726, "eval_steps_per_second": 7.235, "step": 16500 }, { "epoch": 5.44, "grad_norm": 1.6883608102798462, "learning_rate": 4.864048e-05, "loss": 0.3213, "step": 17000 }, { "epoch": 5.44, "eval_accuracy_per_token": 0.8448163270950317, "eval_loss": 0.5376200675964355, "eval_runtime": 216.0239, "eval_samples_per_second": 115.728, "eval_steps_per_second": 7.235, "step": 17000 }, { "epoch": 5.6, "grad_norm": 2.2113685607910156, "learning_rate": 4.860048e-05, "loss": 0.3269, "step": 17500 }, { "epoch": 5.6, "eval_accuracy_per_token": 0.8447655439376831, "eval_loss": 0.5422470569610596, "eval_runtime": 215.062, "eval_samples_per_second": 116.246, "eval_steps_per_second": 7.268, "step": 17500 }, { "epoch": 5.76, "grad_norm": 1.9853556156158447, "learning_rate": 4.856048e-05, "loss": 0.3374, "step": 18000 }, { "epoch": 5.76, "eval_accuracy_per_token": 0.847917914390564, "eval_loss": 0.5255292057991028, "eval_runtime": 213.7043, "eval_samples_per_second": 116.984, "eval_steps_per_second": 7.314, "step": 18000 }, { "epoch": 5.92, "grad_norm": 2.584245204925537, "learning_rate": 4.852048e-05, "loss": 0.3229, "step": 18500 }, { "epoch": 5.92, "eval_accuracy_per_token": 0.8491668701171875, "eval_loss": 0.5113908052444458, "eval_runtime": 214.1955, "eval_samples_per_second": 116.716, "eval_steps_per_second": 7.297, "step": 18500 }, { "epoch": 6.08, "grad_norm": 2.0866963863372803, "learning_rate": 4.848056e-05, "loss": 0.3061, "step": 19000 }, { "epoch": 6.08, "eval_accuracy_per_token": 0.8502768874168396, "eval_loss": 0.5418649911880493, "eval_runtime": 214.0933, "eval_samples_per_second": 116.771, "eval_steps_per_second": 7.301, "step": 19000 }, { "epoch": 6.24, "grad_norm": 2.0000994205474854, "learning_rate": 4.844056e-05, "loss": 0.2556, "step": 19500 }, { "epoch": 6.24, "eval_accuracy_per_token": 0.85043865442276, "eval_loss": 0.542536199092865, "eval_runtime": 213.1241, "eval_samples_per_second": 117.303, "eval_steps_per_second": 7.334, "step": 19500 }, { "epoch": 6.4, "grad_norm": 2.849112033843994, "learning_rate": 4.840064e-05, "loss": 0.2672, "step": 20000 }, { "epoch": 6.4, "eval_accuracy_per_token": 0.8512768149375916, "eval_loss": 0.5348747372627258, "eval_runtime": 215.5865, "eval_samples_per_second": 115.963, "eval_steps_per_second": 7.25, "step": 20000 }, { "epoch": 6.5600000000000005, "grad_norm": 2.216937303543091, "learning_rate": 4.836064e-05, "loss": 0.2791, "step": 20500 }, { "epoch": 6.5600000000000005, "eval_accuracy_per_token": 0.8536346554756165, "eval_loss": 0.5185777544975281, "eval_runtime": 215.5365, "eval_samples_per_second": 115.99, "eval_steps_per_second": 7.252, "step": 20500 }, { "epoch": 6.72, "grad_norm": 2.2237415313720703, "learning_rate": 4.832064e-05, "loss": 0.2792, "step": 21000 }, { "epoch": 6.72, "eval_accuracy_per_token": 0.8542323112487793, "eval_loss": 0.5137789249420166, "eval_runtime": 215.6897, "eval_samples_per_second": 115.907, "eval_steps_per_second": 7.247, "step": 21000 }, { "epoch": 6.88, "grad_norm": 2.6220571994781494, "learning_rate": 4.828064e-05, "loss": 0.2867, "step": 21500 }, { "epoch": 6.88, "eval_accuracy_per_token": 0.8552057147026062, "eval_loss": 0.5025785565376282, "eval_runtime": 215.6135, "eval_samples_per_second": 115.948, "eval_steps_per_second": 7.249, "step": 21500 }, { "epoch": 7.04, "grad_norm": 2.4813005924224854, "learning_rate": 4.82408e-05, "loss": 0.2628, "step": 22000 }, { "epoch": 7.04, "eval_accuracy_per_token": 0.8553439974784851, "eval_loss": 0.5604137778282166, "eval_runtime": 215.6781, "eval_samples_per_second": 115.913, "eval_steps_per_second": 7.247, "step": 22000 }, { "epoch": 7.2, "grad_norm": 2.316831588745117, "learning_rate": 4.82008e-05, "loss": 0.2124, "step": 22500 }, { "epoch": 7.2, "eval_accuracy_per_token": 0.8566614389419556, "eval_loss": 0.5436919927597046, "eval_runtime": 215.5335, "eval_samples_per_second": 115.991, "eval_steps_per_second": 7.252, "step": 22500 }, { "epoch": 7.36, "grad_norm": 2.0445761680603027, "learning_rate": 4.81608e-05, "loss": 0.2246, "step": 23000 }, { "epoch": 7.36, "eval_accuracy_per_token": 0.8579392433166504, "eval_loss": 0.5327755212783813, "eval_runtime": 215.6442, "eval_samples_per_second": 115.932, "eval_steps_per_second": 7.248, "step": 23000 }, { "epoch": 7.52, "grad_norm": 1.8908203840255737, "learning_rate": 4.81208e-05, "loss": 0.2266, "step": 23500 }, { "epoch": 7.52, "eval_accuracy_per_token": 0.8593510389328003, "eval_loss": 0.5242588520050049, "eval_runtime": 215.5467, "eval_samples_per_second": 115.984, "eval_steps_per_second": 7.251, "step": 23500 }, { "epoch": 7.68, "grad_norm": 1.698264241218567, "learning_rate": 4.80808e-05, "loss": 0.2335, "step": 24000 }, { "epoch": 7.68, "eval_accuracy_per_token": 0.8600181341171265, "eval_loss": 0.5188168883323669, "eval_runtime": 215.5491, "eval_samples_per_second": 115.983, "eval_steps_per_second": 7.251, "step": 24000 }, { "epoch": 7.84, "grad_norm": 2.196866035461426, "learning_rate": 4.80408e-05, "loss": 0.2375, "step": 24500 }, { "epoch": 7.84, "eval_accuracy_per_token": 0.8607679605484009, "eval_loss": 0.5174950957298279, "eval_runtime": 215.872, "eval_samples_per_second": 115.809, "eval_steps_per_second": 7.24, "step": 24500 }, { "epoch": 8.0, "grad_norm": 2.599553346633911, "learning_rate": 4.800088e-05, "loss": 0.2426, "step": 25000 }, { "epoch": 8.0, "eval_accuracy_per_token": 0.8615555763244629, "eval_loss": 0.5121429562568665, "eval_runtime": 215.6323, "eval_samples_per_second": 115.938, "eval_steps_per_second": 7.248, "step": 25000 }, { "epoch": 8.16, "grad_norm": 2.2151308059692383, "learning_rate": 4.796088e-05, "loss": 0.1708, "step": 25500 }, { "epoch": 8.16, "eval_accuracy_per_token": 0.8606916666030884, "eval_loss": 0.5612675547599792, "eval_runtime": 215.6202, "eval_samples_per_second": 115.945, "eval_steps_per_second": 7.249, "step": 25500 }, { "epoch": 8.32, "grad_norm": 2.0478286743164062, "learning_rate": 4.792088e-05, "loss": 0.1739, "step": 26000 }, { "epoch": 8.32, "eval_accuracy_per_token": 0.8610015511512756, "eval_loss": 0.5620591640472412, "eval_runtime": 215.5503, "eval_samples_per_second": 115.982, "eval_steps_per_second": 7.251, "step": 26000 }, { "epoch": 8.48, "grad_norm": 2.2109131813049316, "learning_rate": 4.788088e-05, "loss": 0.1893, "step": 26500 }, { "epoch": 8.48, "eval_accuracy_per_token": 0.861874520778656, "eval_loss": 0.5581731200218201, "eval_runtime": 215.6555, "eval_samples_per_second": 115.926, "eval_steps_per_second": 7.248, "step": 26500 }, { "epoch": 8.64, "grad_norm": 2.688985586166382, "learning_rate": 4.784096e-05, "loss": 0.1948, "step": 27000 }, { "epoch": 8.64, "eval_accuracy_per_token": 0.864142119884491, "eval_loss": 0.5396531820297241, "eval_runtime": 215.5944, "eval_samples_per_second": 115.959, "eval_steps_per_second": 7.25, "step": 27000 }, { "epoch": 8.8, "grad_norm": 2.5816333293914795, "learning_rate": 4.780096e-05, "loss": 0.2022, "step": 27500 }, { "epoch": 8.8, "eval_accuracy_per_token": 0.8647276759147644, "eval_loss": 0.537101149559021, "eval_runtime": 215.522, "eval_samples_per_second": 115.997, "eval_steps_per_second": 7.252, "step": 27500 }, { "epoch": 8.96, "grad_norm": 2.262164354324341, "learning_rate": 4.776096e-05, "loss": 0.2027, "step": 28000 }, { "epoch": 8.96, "eval_accuracy_per_token": 0.8652188777923584, "eval_loss": 0.5241075158119202, "eval_runtime": 215.8588, "eval_samples_per_second": 115.816, "eval_steps_per_second": 7.241, "step": 28000 } ], "logging_steps": 500, "max_steps": 625000, "num_input_tokens_seen": 0, "num_train_epochs": 200, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 19, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.043038124900352e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }