{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.010214409722222, "eval_steps": 1000, "global_step": 36000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0086102430555555, "grad_norm": 0.3148804008960724, "learning_rate": 0.0009998758966336297, "loss": 5.1098, "num_input_tokens_seen": 521485760, "step": 1000 }, { "epoch": 1.0086102430555555, "eval_loss": 4.390458106994629, "eval_runtime": 157.9404, "eval_samples_per_second": 116.5, "eval_steps_per_second": 7.281, "num_input_tokens_seen": 521485760, "step": 1000 }, { "epoch": 2.0169140625, "grad_norm": 0.24177567660808563, "learning_rate": 0.0009969004950996173, "loss": 3.7649, "num_input_tokens_seen": 1042810727, "step": 2000 }, { "epoch": 2.0169140625, "eval_loss": 4.018285274505615, "eval_runtime": 157.619, "eval_samples_per_second": 116.737, "eval_steps_per_second": 7.296, "num_input_tokens_seen": 1042810727, "step": 2000 }, { "epoch": 4.006030381944444, "grad_norm": 0.22102612257003784, "learning_rate": 0.0009899808525182935, "loss": 3.3244, "num_input_tokens_seen": 1564614407, "step": 3000 }, { "epoch": 4.006030381944444, "eval_loss": 3.883763074874878, "eval_runtime": 158.112, "eval_samples_per_second": 116.373, "eval_steps_per_second": 7.273, "num_input_tokens_seen": 1564614407, "step": 3000 }, { "epoch": 5.0143046875, "grad_norm": 0.25155559182167053, "learning_rate": 0.0009791718948528457, "loss": 3.2291, "num_input_tokens_seen": 2086545237, "step": 4000 }, { "epoch": 5.0143046875, "eval_loss": 3.7907443046569824, "eval_runtime": 157.3668, "eval_samples_per_second": 116.924, "eval_steps_per_second": 7.308, "num_input_tokens_seen": 2086545237, "step": 4000 }, { "epoch": 7.002637152777778, "grad_norm": 0.23016224801540375, "learning_rate": 0.0009645594202357438, "loss": 3.1779, "num_input_tokens_seen": 2608572303, "step": 5000 }, { "epoch": 7.002637152777778, "eval_loss": 3.7254457473754883, "eval_runtime": 158.0679, "eval_samples_per_second": 116.406, "eval_steps_per_second": 7.275, "num_input_tokens_seen": 2608572303, "step": 5000 }, { "epoch": 8.010796006944444, "grad_norm": 0.1911187618970871, "learning_rate": 0.0009462594179299406, "loss": 3.1106, "num_input_tokens_seen": 3130451615, "step": 6000 }, { "epoch": 8.010796006944444, "eval_loss": 3.6576924324035645, "eval_runtime": 156.4171, "eval_samples_per_second": 117.634, "eval_steps_per_second": 7.352, "num_input_tokens_seen": 3130451615, "step": 6000 }, { "epoch": 9.0191015625, "grad_norm": 0.24357692897319794, "learning_rate": 0.0009244171476423036, "loss": 3.0493, "num_input_tokens_seen": 3652271439, "step": 7000 }, { "epoch": 9.0191015625, "eval_loss": 3.6375908851623535, "eval_runtime": 157.211, "eval_samples_per_second": 117.04, "eval_steps_per_second": 7.315, "num_input_tokens_seen": 3652271439, "step": 7000 }, { "epoch": 1.0083489583333334, "grad_norm": 0.17816534638404846, "learning_rate": 0.0008992059864973972, "loss": 3.0005, "num_input_tokens_seen": 4174145711, "step": 8000 }, { "epoch": 1.0083489583333334, "eval_loss": 3.5855612754821777, "eval_runtime": 155.3476, "eval_samples_per_second": 118.444, "eval_steps_per_second": 7.403, "num_input_tokens_seen": 4174145711, "step": 8000 }, { "epoch": 2.016631076388889, "grad_norm": 0.1740865856409073, "learning_rate": 0.0008708260528239789, "loss": 3.0155, "num_input_tokens_seen": 4696186623, "step": 9000 }, { "epoch": 2.016631076388889, "eval_loss": 3.580111265182495, "eval_runtime": 154.9402, "eval_samples_per_second": 118.755, "eval_steps_per_second": 7.422, "num_input_tokens_seen": 4696186623, "step": 9000 }, { "epoch": 4.005178819444445, "grad_norm": 0.19117484986782074, "learning_rate": 0.0008395026176781626, "loss": 2.9609, "num_input_tokens_seen": 5217948966, "step": 10000 }, { "epoch": 4.005178819444445, "eval_loss": 3.5479941368103027, "eval_runtime": 154.297, "eval_samples_per_second": 119.251, "eval_steps_per_second": 7.453, "num_input_tokens_seen": 5217948966, "step": 10000 }, { "epoch": 5.013779513888889, "grad_norm": 0.16193148493766785, "learning_rate": 0.0008054843167120826, "loss": 2.9376, "num_input_tokens_seen": 5739503318, "step": 11000 }, { "epoch": 5.013779513888889, "eval_loss": 3.5245985984802246, "eval_runtime": 154.2866, "eval_samples_per_second": 119.259, "eval_steps_per_second": 7.454, "num_input_tokens_seen": 5739503318, "step": 11000 }, { "epoch": 7.002284722222222, "grad_norm": 0.16273680329322815, "learning_rate": 0.0007690411765816864, "loss": 2.9641, "num_input_tokens_seen": 6261603062, "step": 12000 }, { "epoch": 7.002284722222222, "eval_loss": 3.5051820278167725, "eval_runtime": 153.5212, "eval_samples_per_second": 119.853, "eval_steps_per_second": 7.491, "num_input_tokens_seen": 6261603062, "step": 12000 }, { "epoch": 8.010577256944444, "grad_norm": 0.1603420227766037, "learning_rate": 0.0007304624715594139, "loss": 2.9166, "num_input_tokens_seen": 6783593829, "step": 13000 }, { "epoch": 8.010577256944444, "eval_loss": 3.497760772705078, "eval_runtime": 153.4387, "eval_samples_per_second": 119.918, "eval_steps_per_second": 7.495, "num_input_tokens_seen": 6783593829, "step": 13000 }, { "epoch": 9.018918402777778, "grad_norm": 0.1665274053812027, "learning_rate": 0.0006900544273653075, "loss": 2.8821, "num_input_tokens_seen": 7305434352, "step": 14000 }, { "epoch": 9.018918402777778, "eval_loss": 3.4890758991241455, "eval_runtime": 153.9326, "eval_samples_per_second": 119.533, "eval_steps_per_second": 7.471, "num_input_tokens_seen": 7305434352, "step": 14000 }, { "epoch": 1.0083055555555556, "grad_norm": 0.16452226042747498, "learning_rate": 0.000648137790442817, "loss": 2.8766, "num_input_tokens_seen": 7827274720, "step": 15000 }, { "epoch": 1.0083055555555556, "eval_loss": 3.4698522090911865, "eval_runtime": 153.8232, "eval_samples_per_second": 119.618, "eval_steps_per_second": 7.476, "num_input_tokens_seen": 7827274720, "step": 15000 }, { "epoch": 2.0166302083333334, "grad_norm": 0.17168712615966797, "learning_rate": 0.0006050452819736389, "loss": 2.8809, "num_input_tokens_seen": 8348682560, "step": 16000 }, { "epoch": 2.0166302083333334, "eval_loss": 3.4530444145202637, "eval_runtime": 152.7846, "eval_samples_per_second": 120.431, "eval_steps_per_second": 7.527, "num_input_tokens_seen": 8348682560, "step": 16000 }, { "epoch": 4.005391493055556, "grad_norm": 0.16454088687896729, "learning_rate": 0.0005611189568408173, "loss": 2.885, "num_input_tokens_seen": 8870476792, "step": 17000 }, { "epoch": 4.005391493055556, "eval_loss": 3.442610025405884, "eval_runtime": 153.4312, "eval_samples_per_second": 119.923, "eval_steps_per_second": 7.495, "num_input_tokens_seen": 8870476792, "step": 17000 }, { "epoch": 5.013413194444444, "grad_norm": 0.1566428244113922, "learning_rate": 0.0005167074885038374, "loss": 2.8616, "num_input_tokens_seen": 9392395768, "step": 18000 }, { "epoch": 5.013413194444444, "eval_loss": 3.4214282035827637, "eval_runtime": 153.1872, "eval_samples_per_second": 120.115, "eval_steps_per_second": 7.507, "num_input_tokens_seen": 9392395768, "step": 18000 }, { "epoch": 7.001969618055556, "grad_norm": 0.16596154868602753, "learning_rate": 0.000472163401337526, "loss": 2.8546, "num_input_tokens_seen": 9914322356, "step": 19000 }, { "epoch": 7.001969618055556, "eval_loss": 3.399761438369751, "eval_runtime": 153.8483, "eval_samples_per_second": 119.598, "eval_steps_per_second": 7.475, "num_input_tokens_seen": 9914322356, "step": 19000 }, { "epoch": 8.010303819444445, "grad_norm": 0.16305798292160034, "learning_rate": 0.00042784027240358674, "loss": 2.8039, "num_input_tokens_seen": 10436233756, "step": 20000 }, { "epoch": 8.010303819444445, "eval_loss": 3.3940844535827637, "eval_runtime": 158.2602, "eval_samples_per_second": 116.264, "eval_steps_per_second": 7.267, "num_input_tokens_seen": 10436233756, "step": 20000 }, { "epoch": 1.0082630208333334, "grad_norm": 0.2344750463962555, "learning_rate": 0.00038408992486623584, "loss": 2.7959, "num_input_tokens_seen": 10958156105, "step": 21000 }, { "epoch": 1.0082630208333334, "eval_loss": 3.4070310592651367, "eval_runtime": 155.4587, "eval_samples_per_second": 118.359, "eval_steps_per_second": 7.397, "num_input_tokens_seen": 10958156105, "step": 21000 }, { "epoch": 2.0165078125, "grad_norm": 0.16524210572242737, "learning_rate": 0.0003412596353297288, "loss": 2.791, "num_input_tokens_seen": 11479508780, "step": 22000 }, { "epoch": 2.0165078125, "eval_loss": 3.3836262226104736, "eval_runtime": 155.5662, "eval_samples_per_second": 118.278, "eval_steps_per_second": 7.392, "num_input_tokens_seen": 11479508780, "step": 22000 }, { "epoch": 4.005475694444445, "grad_norm": 0.18112713098526, "learning_rate": 0.0002996893772650602, "loss": 2.7732, "num_input_tokens_seen": 12001335662, "step": 23000 }, { "epoch": 4.005475694444445, "eval_loss": 3.374772548675537, "eval_runtime": 158.5198, "eval_samples_per_second": 116.074, "eval_steps_per_second": 7.255, "num_input_tokens_seen": 12001335662, "step": 23000 }, { "epoch": 5.013543402777778, "grad_norm": 0.20000149309635162, "learning_rate": 0.0002597091224066581, "loss": 2.776, "num_input_tokens_seen": 12523295262, "step": 24000 }, { "epoch": 5.013543402777778, "eval_loss": 3.3714466094970703, "eval_runtime": 156.2603, "eval_samples_per_second": 117.752, "eval_steps_per_second": 7.36, "num_input_tokens_seen": 12523295262, "step": 24000 }, { "epoch": 1.0082881944444444, "grad_norm": 0.17906545102596283, "learning_rate": 0.0002216362215397393, "loss": 2.7496, "num_input_tokens_seen": 13045103310, "step": 25000 }, { "epoch": 1.0082881944444444, "eval_loss": 3.3571267127990723, "eval_runtime": 156.601, "eval_samples_per_second": 117.496, "eval_steps_per_second": 7.344, "num_input_tokens_seen": 13045103310, "step": 25000 }, { "epoch": 2.0165208333333333, "grad_norm": 0.17018218338489532, "learning_rate": 0.00018577288546882165, "loss": 2.7435, "num_input_tokens_seen": 13566410636, "step": 26000 }, { "epoch": 2.0165208333333333, "eval_loss": 3.3441579341888428, "eval_runtime": 157.974, "eval_samples_per_second": 116.475, "eval_steps_per_second": 7.28, "num_input_tokens_seen": 13566410636, "step": 26000 }, { "epoch": 4.005661458333333, "grad_norm": 0.16987864673137665, "learning_rate": 0.00015240378616267886, "loss": 2.7321, "num_input_tokens_seen": 14088234739, "step": 27000 }, { "epoch": 4.005661458333333, "eval_loss": 3.335869550704956, "eval_runtime": 156.5095, "eval_samples_per_second": 117.565, "eval_steps_per_second": 7.348, "num_input_tokens_seen": 14088234739, "step": 27000 }, { "epoch": 5.013948784722222, "grad_norm": 0.18387989699840546, "learning_rate": 0.00012179379711709738, "loss": 2.7137, "num_input_tokens_seen": 14609933027, "step": 28000 }, { "epoch": 5.013948784722222, "eval_loss": 3.323387861251831, "eval_runtime": 156.9047, "eval_samples_per_second": 117.269, "eval_steps_per_second": 7.329, "num_input_tokens_seen": 14609933027, "step": 28000 }, { "epoch": 7.002777777777778, "grad_norm": 0.17292186617851257, "learning_rate": 9.418589087173441e-05, "loss": 2.7152, "num_input_tokens_seen": 15131802301, "step": 29000 }, { "epoch": 7.002777777777778, "eval_loss": 3.318844795227051, "eval_runtime": 156.7029, "eval_samples_per_second": 117.42, "eval_steps_per_second": 7.339, "num_input_tokens_seen": 15131802301, "step": 29000 }, { "epoch": 8.0111015625, "grad_norm": 0.17619270086288452, "learning_rate": 6.979921036993042e-05, "loss": 2.7042, "num_input_tokens_seen": 15653732413, "step": 30000 }, { "epoch": 8.0111015625, "eval_loss": 3.3149850368499756, "eval_runtime": 157.8074, "eval_samples_per_second": 116.598, "eval_steps_per_second": 7.287, "num_input_tokens_seen": 15653732413, "step": 30000 }, { "epoch": 1.0080494791666668, "grad_norm": 0.20021264255046844, "learning_rate": 4.882732947041818e-05, "loss": 2.7114, "num_input_tokens_seen": 16175778125, "step": 31000 }, { "epoch": 1.0080494791666668, "eval_loss": 3.3138480186462402, "eval_runtime": 155.6732, "eval_samples_per_second": 118.196, "eval_steps_per_second": 7.387, "num_input_tokens_seen": 16175778125, "step": 31000 }, { "epoch": 2.0162526041666666, "grad_norm": 0.16703246533870697, "learning_rate": 3.143671641844831e-05, "loss": 2.6986, "num_input_tokens_seen": 16697712121, "step": 32000 }, { "epoch": 2.0162526041666666, "eval_loss": 3.3022472858428955, "eval_runtime": 156.1087, "eval_samples_per_second": 117.867, "eval_steps_per_second": 7.367, "num_input_tokens_seen": 16697712121, "step": 32000 }, { "epoch": 4.0051015625, "grad_norm": 0.16916561126708984, "learning_rate": 1.776541247281177e-05, "loss": 2.6798, "num_input_tokens_seen": 17219624329, "step": 33000 }, { "epoch": 4.0051015625, "eval_loss": 3.301964521408081, "eval_runtime": 154.9317, "eval_samples_per_second": 118.762, "eval_steps_per_second": 7.423, "num_input_tokens_seen": 17219624329, "step": 33000 }, { "epoch": 5.013414930555555, "grad_norm": 0.16186361014842987, "learning_rate": 7.921936177411049e-06, "loss": 2.6798, "num_input_tokens_seen": 17741407609, "step": 34000 }, { "epoch": 5.013414930555555, "eval_loss": 3.300182580947876, "eval_runtime": 155.5972, "eval_samples_per_second": 118.254, "eval_steps_per_second": 7.391, "num_input_tokens_seen": 17741407609, "step": 34000 }, { "epoch": 7.002003472222222, "grad_norm": 0.1644630879163742, "learning_rate": 1.984421974927375e-06, "loss": 2.7195, "num_input_tokens_seen": 18263254841, "step": 35000 }, { "epoch": 7.002003472222222, "eval_loss": 3.3019583225250244, "eval_runtime": 155.6463, "eval_samples_per_second": 118.217, "eval_steps_per_second": 7.389, "num_input_tokens_seen": 18263254841, "step": 35000 }, { "epoch": 8.010214409722222, "grad_norm": 0.1579432338476181, "learning_rate": 0.0, "loss": 2.683, "num_input_tokens_seen": 18785262825, "step": 36000 }, { "epoch": 8.010214409722222, "eval_loss": 3.3019728660583496, "eval_runtime": 155.7512, "eval_samples_per_second": 118.137, "eval_steps_per_second": 7.384, "num_input_tokens_seen": 18785262825, "step": 36000 } ], "logging_steps": 1000, "max_steps": 36000, "num_input_tokens_seen": 18785262825, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "total_flos": 1.2950229157653494e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }