{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.936936936936937, "eval_steps": 500, "global_step": 26, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07207207207207207, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 110.8848, "step": 1 }, { "epoch": 0.14414414414414414, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 111.377, "step": 2 }, { "epoch": 0.21621621621621623, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 110.6699, "step": 3 }, { "epoch": 0.2882882882882883, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 111.3672, "step": 4 }, { "epoch": 0.36036036036036034, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 110.9941, "step": 5 }, { "epoch": 0.43243243243243246, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 111.1211, "step": 6 }, { "epoch": 0.5045045045045045, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 111.4082, "step": 7 }, { "epoch": 0.5765765765765766, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 111.418, "step": 8 }, { "epoch": 0.6486486486486487, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 110.4473, "step": 9 }, { "epoch": 0.7207207207207207, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 111.0137, "step": 10 }, { "epoch": 0.7927927927927928, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 111.3359, "step": 11 }, { "epoch": 0.8648648648648649, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 111.1465, "step": 12 }, { "epoch": 0.9369369369369369, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 110.6699, "step": 13 }, { "epoch": 1.072072072072072, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 138.8008, "step": 14 }, { "epoch": 1.1441441441441442, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 111.1465, "step": 15 }, { "epoch": 1.2162162162162162, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 110.8516, "step": 16 }, { "epoch": 1.2882882882882882, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 111.6328, "step": 17 }, { "epoch": 1.3603603603603602, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 111.127, "step": 18 }, { "epoch": 1.4324324324324325, "grad_norm": 123.0553207397461, "learning_rate": 0.0001, "loss": 111.041, "step": 19 }, { "epoch": 1.5045045045045045, "grad_norm": 121.99684143066406, "learning_rate": 9.96057350657239e-05, "loss": 92.9346, "step": 20 }, { "epoch": 1.5765765765765765, "grad_norm": 44.45335006713867, "learning_rate": 9.842915805643155e-05, "loss": 36.5908, "step": 21 }, { "epoch": 1.6486486486486487, "grad_norm": 44.45335006713867, "learning_rate": 9.842915805643155e-05, "loss": 26.5054, "step": 22 }, { "epoch": 1.7207207207207207, "grad_norm": 44.45335006713867, "learning_rate": 9.842915805643155e-05, "loss": 26.6528, "step": 23 }, { "epoch": 1.7927927927927927, "grad_norm": 71.70808410644531, "learning_rate": 9.648882429441257e-05, "loss": 25.5659, "step": 24 }, { "epoch": 1.864864864864865, "grad_norm": 45.514339447021484, "learning_rate": 9.381533400219318e-05, "loss": 25.1484, "step": 25 }, { "epoch": 1.936936936936937, "grad_norm": 45.514339447021484, "learning_rate": 9.381533400219318e-05, "loss": 31.6045, "step": 26 }, { "epoch": 1.936936936936937, "step": 26, "total_flos": 3.909058176221184e+16, "train_loss": 92.4405987079327, "train_runtime": 5129.5954, "train_samples_per_second": 0.344, "train_steps_per_second": 0.005 } ], "logging_steps": 1.0, "max_steps": 26, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.909058176221184e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }