{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.19, "grad_norm": 8.953761100769043, "learning_rate": 1.888888888888889e-05, "loss": 1.3623, "step": 17 }, { "epoch": 0.38, "grad_norm": 10.632478713989258, "learning_rate": 3.777777777777778e-05, "loss": 1.0225, "step": 34 }, { "epoch": 0.57, "grad_norm": 16.316967010498047, "learning_rate": 4.925925925925926e-05, "loss": 0.6166, "step": 51 }, { "epoch": 0.76, "grad_norm": 21.347354888916016, "learning_rate": 4.7160493827160495e-05, "loss": 0.5825, "step": 68 }, { "epoch": 0.94, "grad_norm": 8.19096565246582, "learning_rate": 4.506172839506173e-05, "loss": 0.4259, "step": 85 }, { "epoch": 1.13, "grad_norm": 12.711194038391113, "learning_rate": 4.296296296296296e-05, "loss": 0.2901, "step": 102 }, { "epoch": 1.32, "grad_norm": 18.733543395996094, "learning_rate": 4.0864197530864204e-05, "loss": 0.2684, "step": 119 }, { "epoch": 1.51, "grad_norm": 6.416273593902588, "learning_rate": 3.876543209876544e-05, "loss": 0.2797, "step": 136 }, { "epoch": 1.7, "grad_norm": 4.896417617797852, "learning_rate": 3.6666666666666666e-05, "loss": 0.3023, "step": 153 }, { "epoch": 1.89, "grad_norm": 17.5325984954834, "learning_rate": 3.45679012345679e-05, "loss": 0.2397, "step": 170 }, { "epoch": 2.08, "grad_norm": 9.514878273010254, "learning_rate": 3.2469135802469134e-05, "loss": 0.2499, "step": 187 }, { "epoch": 2.27, "grad_norm": 6.783230304718018, "learning_rate": 3.037037037037037e-05, "loss": 0.2137, "step": 204 }, { "epoch": 2.46, "grad_norm": 9.813042640686035, "learning_rate": 2.8271604938271606e-05, "loss": 0.1946, "step": 221 }, { "epoch": 2.64, "grad_norm": 4.386660099029541, "learning_rate": 2.617283950617284e-05, "loss": 0.2326, "step": 238 }, { "epoch": 2.83, "grad_norm": 13.430244445800781, "learning_rate": 2.4074074074074074e-05, "loss": 0.1931, "step": 255 }, { "epoch": 3.02, "grad_norm": 10.026385307312012, "learning_rate": 2.1975308641975308e-05, "loss": 0.1584, "step": 272 }, { "epoch": 3.21, "grad_norm": 5.96959924697876, "learning_rate": 1.9876543209876546e-05, "loss": 0.1612, "step": 289 }, { "epoch": 3.4, "grad_norm": 8.849898338317871, "learning_rate": 1.777777777777778e-05, "loss": 0.1843, "step": 306 }, { "epoch": 3.59, "grad_norm": 4.787600517272949, "learning_rate": 1.5679012345679014e-05, "loss": 0.1395, "step": 323 }, { "epoch": 3.78, "grad_norm": 5.98640251159668, "learning_rate": 1.3580246913580247e-05, "loss": 0.1351, "step": 340 }, { "epoch": 3.97, "grad_norm": 8.119280815124512, "learning_rate": 1.1481481481481482e-05, "loss": 0.1626, "step": 357 }, { "epoch": 4.16, "grad_norm": 7.947086334228516, "learning_rate": 9.382716049382717e-06, "loss": 0.1658, "step": 374 }, { "epoch": 4.34, "grad_norm": 2.4981260299682617, "learning_rate": 7.283950617283951e-06, "loss": 0.1147, "step": 391 }, { "epoch": 4.53, "grad_norm": 5.954956531524658, "learning_rate": 5.185185185185185e-06, "loss": 0.1295, "step": 408 }, { "epoch": 4.72, "grad_norm": 9.320396423339844, "learning_rate": 3.0864197530864196e-06, "loss": 0.1309, "step": 425 }, { "epoch": 4.91, "grad_norm": 9.413307189941406, "learning_rate": 9.876543209876544e-07, "loss": 0.095, "step": 442 } ], "logging_steps": 17, "max_steps": 450, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.1242852922068992e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }