{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.192, "eval_steps": 500, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 0.7807257175445557, "learning_rate": 4e-05, "loss": 2.509, "step": 1 }, { "epoch": 0.0064, "grad_norm": 0.7869868874549866, "learning_rate": 8e-05, "loss": 2.5141, "step": 2 }, { "epoch": 0.0096, "grad_norm": 0.7782315611839294, "learning_rate": 0.00012, "loss": 2.4717, "step": 3 }, { "epoch": 0.0128, "grad_norm": 0.8121834993362427, "learning_rate": 0.00016, "loss": 2.3125, "step": 4 }, { "epoch": 0.016, "grad_norm": 0.8421923518180847, "learning_rate": 0.0002, "loss": 2.087, "step": 5 }, { "epoch": 0.0192, "grad_norm": 1.1386044025421143, "learning_rate": 0.00019636363636363636, "loss": 1.8111, "step": 6 }, { "epoch": 0.0224, "grad_norm": 1.2734416723251343, "learning_rate": 0.00019272727272727274, "loss": 1.497, "step": 7 }, { "epoch": 0.0256, "grad_norm": 1.3242508172988892, "learning_rate": 0.0001890909090909091, "loss": 1.1827, "step": 8 }, { "epoch": 0.0288, "grad_norm": 1.5871871709823608, "learning_rate": 0.00018545454545454545, "loss": 0.9281, "step": 9 }, { "epoch": 0.032, "grad_norm": 1.4778683185577393, "learning_rate": 0.00018181818181818183, "loss": 0.7139, "step": 10 }, { "epoch": 0.0352, "grad_norm": 1.833769679069519, "learning_rate": 0.0001781818181818182, "loss": 0.572, "step": 11 }, { "epoch": 0.0384, "grad_norm": 1.3144052028656006, "learning_rate": 0.00017454545454545454, "loss": 0.4447, "step": 12 }, { "epoch": 0.0416, "grad_norm": 3.582252264022827, "learning_rate": 0.0001709090909090909, "loss": 0.38, "step": 13 }, { "epoch": 0.0448, "grad_norm": 2.804332971572876, "learning_rate": 0.00016727272727272728, "loss": 0.3184, "step": 14 }, { "epoch": 0.048, "grad_norm": 1.5338329076766968, "learning_rate": 0.00016363636363636366, "loss": 0.2658, "step": 15 }, { "epoch": 0.0512, "grad_norm": 0.9025959968566895, "learning_rate": 0.00016, "loss": 0.2266, "step": 16 }, { "epoch": 0.0544, "grad_norm": 0.9067522883415222, "learning_rate": 0.00015636363636363637, "loss": 0.1958, "step": 17 }, { "epoch": 0.0576, "grad_norm": 0.5175400376319885, "learning_rate": 0.00015272727272727275, "loss": 0.1932, "step": 18 }, { "epoch": 0.0608, "grad_norm": 0.30395233631134033, "learning_rate": 0.0001490909090909091, "loss": 0.1829, "step": 19 }, { "epoch": 0.064, "grad_norm": 0.28186506032943726, "learning_rate": 0.00014545454545454546, "loss": 0.1788, "step": 20 }, { "epoch": 0.0672, "grad_norm": 0.21213509142398834, "learning_rate": 0.00014181818181818184, "loss": 0.1711, "step": 21 }, { "epoch": 0.0704, "grad_norm": 0.2244960516691208, "learning_rate": 0.0001381818181818182, "loss": 0.1712, "step": 22 }, { "epoch": 0.0736, "grad_norm": 0.22173970937728882, "learning_rate": 0.00013454545454545455, "loss": 0.1652, "step": 23 }, { "epoch": 0.0768, "grad_norm": 0.16787873208522797, "learning_rate": 0.00013090909090909093, "loss": 0.166, "step": 24 }, { "epoch": 0.08, "grad_norm": 0.20377585291862488, "learning_rate": 0.00012727272727272728, "loss": 0.1593, "step": 25 }, { "epoch": 0.0832, "grad_norm": 0.17485077679157257, "learning_rate": 0.00012363636363636364, "loss": 0.1591, "step": 26 }, { "epoch": 0.0864, "grad_norm": 0.1908087134361267, "learning_rate": 0.00012, "loss": 0.1585, "step": 27 }, { "epoch": 0.0896, "grad_norm": 0.21306680142879486, "learning_rate": 0.00011636363636363636, "loss": 0.1612, "step": 28 }, { "epoch": 0.0928, "grad_norm": 0.1579781174659729, "learning_rate": 0.00011272727272727272, "loss": 0.1532, "step": 29 }, { "epoch": 0.096, "grad_norm": 0.21352051198482513, "learning_rate": 0.00010909090909090909, "loss": 0.1464, "step": 30 }, { "epoch": 0.0992, "grad_norm": 0.2137880176305771, "learning_rate": 0.00010545454545454545, "loss": 0.1527, "step": 31 }, { "epoch": 0.1024, "grad_norm": 0.21048834919929504, "learning_rate": 0.00010181818181818181, "loss": 0.1499, "step": 32 }, { "epoch": 0.1056, "grad_norm": 0.206934854388237, "learning_rate": 9.818181818181818e-05, "loss": 0.1457, "step": 33 }, { "epoch": 0.1088, "grad_norm": 0.1933310180902481, "learning_rate": 9.454545454545455e-05, "loss": 0.1487, "step": 34 }, { "epoch": 0.112, "grad_norm": 0.19178827106952667, "learning_rate": 9.090909090909092e-05, "loss": 0.1474, "step": 35 }, { "epoch": 0.1152, "grad_norm": 0.24533748626708984, "learning_rate": 8.727272727272727e-05, "loss": 0.1495, "step": 36 }, { "epoch": 0.1184, "grad_norm": 0.29513201117515564, "learning_rate": 8.363636363636364e-05, "loss": 0.1469, "step": 37 }, { "epoch": 0.1216, "grad_norm": 0.19339747726917267, "learning_rate": 8e-05, "loss": 0.1409, "step": 38 }, { "epoch": 0.1248, "grad_norm": 0.21743986010551453, "learning_rate": 7.636363636363637e-05, "loss": 0.1418, "step": 39 }, { "epoch": 0.128, "grad_norm": 0.23053643107414246, "learning_rate": 7.272727272727273e-05, "loss": 0.1418, "step": 40 }, { "epoch": 0.1312, "grad_norm": 0.22498559951782227, "learning_rate": 6.90909090909091e-05, "loss": 0.1403, "step": 41 }, { "epoch": 0.1344, "grad_norm": 0.2134191244840622, "learning_rate": 6.545454545454546e-05, "loss": 0.1416, "step": 42 }, { "epoch": 0.1376, "grad_norm": 0.20188340544700623, "learning_rate": 6.181818181818182e-05, "loss": 0.1388, "step": 43 }, { "epoch": 0.1408, "grad_norm": 0.22138598561286926, "learning_rate": 5.818181818181818e-05, "loss": 0.137, "step": 44 }, { "epoch": 0.144, "grad_norm": 0.2358243465423584, "learning_rate": 5.4545454545454546e-05, "loss": 0.1386, "step": 45 }, { "epoch": 0.1472, "grad_norm": 0.2473626732826233, "learning_rate": 5.090909090909091e-05, "loss": 0.1367, "step": 46 }, { "epoch": 0.1504, "grad_norm": 0.23792380094528198, "learning_rate": 4.7272727272727275e-05, "loss": 0.1335, "step": 47 }, { "epoch": 0.1536, "grad_norm": 0.2288408726453781, "learning_rate": 4.3636363636363636e-05, "loss": 0.1345, "step": 48 }, { "epoch": 0.1568, "grad_norm": 0.2344062626361847, "learning_rate": 4e-05, "loss": 0.1319, "step": 49 }, { "epoch": 0.16, "grad_norm": 0.23560936748981476, "learning_rate": 3.6363636363636364e-05, "loss": 0.131, "step": 50 }, { "epoch": 0.1632, "grad_norm": 0.24920542538166046, "learning_rate": 3.272727272727273e-05, "loss": 0.1351, "step": 51 }, { "epoch": 0.1664, "grad_norm": 0.2517445683479309, "learning_rate": 2.909090909090909e-05, "loss": 0.1344, "step": 52 }, { "epoch": 0.1696, "grad_norm": 0.24988840520381927, "learning_rate": 2.5454545454545454e-05, "loss": 0.1302, "step": 53 }, { "epoch": 0.1728, "grad_norm": 0.2461981624364853, "learning_rate": 2.1818181818181818e-05, "loss": 0.1284, "step": 54 }, { "epoch": 0.176, "grad_norm": 0.25435972213745117, "learning_rate": 1.8181818181818182e-05, "loss": 0.1302, "step": 55 }, { "epoch": 0.1792, "grad_norm": 0.2487025409936905, "learning_rate": 1.4545454545454545e-05, "loss": 0.1309, "step": 56 }, { "epoch": 0.1824, "grad_norm": 0.24456636607646942, "learning_rate": 1.0909090909090909e-05, "loss": 0.1275, "step": 57 }, { "epoch": 0.1856, "grad_norm": 0.2547883689403534, "learning_rate": 7.272727272727272e-06, "loss": 0.1254, "step": 58 }, { "epoch": 0.1888, "grad_norm": 0.2569814622402191, "learning_rate": 3.636363636363636e-06, "loss": 0.1296, "step": 59 }, { "epoch": 0.192, "grad_norm": 0.2542433440685272, "learning_rate": 0.0, "loss": 0.126, "step": 60 } ], "logging_steps": 1, "max_steps": 60, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.55328713719808e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }