{ "best_metric": 0.78855299949646, "best_model_checkpoint": "./results_v1/checkpoint-3724", "epoch": 4.0, "eval_steps": 500, "global_step": 3724, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10741138560687433, "grad_norm": 12224.1005859375, "learning_rate": 0.0004865735767991407, "loss": 0.9019, "step": 100 }, { "epoch": 0.21482277121374865, "grad_norm": 11560.216796875, "learning_rate": 0.00047314715359828143, "loss": 0.9088, "step": 200 }, { "epoch": 0.322234156820623, "grad_norm": 10229.27734375, "learning_rate": 0.0004597207303974221, "loss": 0.9132, "step": 300 }, { "epoch": 0.4296455424274973, "grad_norm": 23398.751953125, "learning_rate": 0.00044629430719656286, "loss": 0.9093, "step": 400 }, { "epoch": 0.5370569280343717, "grad_norm": 9462.7333984375, "learning_rate": 0.00043286788399570354, "loss": 0.9045, "step": 500 }, { "epoch": 0.644468313641246, "grad_norm": 11711.173828125, "learning_rate": 0.00041944146079484423, "loss": 0.9007, "step": 600 }, { "epoch": 0.7518796992481203, "grad_norm": 12136.8037109375, "learning_rate": 0.00040601503759398497, "loss": 0.896, "step": 700 }, { "epoch": 0.8592910848549946, "grad_norm": 17582.23828125, "learning_rate": 0.00039258861439312565, "loss": 0.8989, "step": 800 }, { "epoch": 0.966702470461869, "grad_norm": 10597.923828125, "learning_rate": 0.0003791621911922664, "loss": 0.8925, "step": 900 }, { "epoch": 1.0, "eval_loss": 0.8314220905303955, "eval_runtime": 18.8784, "eval_samples_per_second": 423.764, "eval_steps_per_second": 3.337, "step": 931 }, { "epoch": 1.0741138560687433, "grad_norm": 10806.78125, "learning_rate": 0.00036573576799140713, "loss": 0.8738, "step": 1000 }, { "epoch": 1.1815252416756177, "grad_norm": 11682.1171875, "learning_rate": 0.0003523093447905478, "loss": 0.8611, "step": 1100 }, { "epoch": 1.2889366272824918, "grad_norm": 14700.29296875, "learning_rate": 0.00033888292158968855, "loss": 0.8755, "step": 1200 }, { "epoch": 1.3963480128893662, "grad_norm": 11728.32421875, "learning_rate": 0.00032545649838882924, "loss": 0.8683, "step": 1300 }, { "epoch": 1.5037593984962405, "grad_norm": 12242.0029296875, "learning_rate": 0.0003120300751879699, "loss": 0.8745, "step": 1400 }, { "epoch": 1.6111707841031149, "grad_norm": 11073.6376953125, "learning_rate": 0.00029860365198711066, "loss": 0.848, "step": 1500 }, { "epoch": 1.7185821697099892, "grad_norm": 10194.4599609375, "learning_rate": 0.00028517722878625135, "loss": 0.8516, "step": 1600 }, { "epoch": 1.8259935553168636, "grad_norm": 12174.0341796875, "learning_rate": 0.00027175080558539203, "loss": 0.8597, "step": 1700 }, { "epoch": 1.933404940923738, "grad_norm": 11925.640625, "learning_rate": 0.00025832438238453277, "loss": 0.859, "step": 1800 }, { "epoch": 2.0, "eval_loss": 0.8080422878265381, "eval_runtime": 19.478, "eval_samples_per_second": 410.72, "eval_steps_per_second": 3.234, "step": 1862 }, { "epoch": 2.0408163265306123, "grad_norm": 12538.783203125, "learning_rate": 0.00024489795918367346, "loss": 0.8473, "step": 1900 }, { "epoch": 2.1482277121374866, "grad_norm": 11984.9697265625, "learning_rate": 0.00023147153598281417, "loss": 0.8413, "step": 2000 }, { "epoch": 2.255639097744361, "grad_norm": 12034.09765625, "learning_rate": 0.00021804511278195488, "loss": 0.8344, "step": 2100 }, { "epoch": 2.3630504833512354, "grad_norm": 10639.9912109375, "learning_rate": 0.00020461868958109562, "loss": 0.8344, "step": 2200 }, { "epoch": 2.4704618689581097, "grad_norm": 10943.125, "learning_rate": 0.0001911922663802363, "loss": 0.8309, "step": 2300 }, { "epoch": 2.5778732545649836, "grad_norm": 12608.966796875, "learning_rate": 0.00017776584317937702, "loss": 0.835, "step": 2400 }, { "epoch": 2.685284640171858, "grad_norm": 12656.138671875, "learning_rate": 0.00016433941997851773, "loss": 0.8338, "step": 2500 }, { "epoch": 2.7926960257787323, "grad_norm": 13147.025390625, "learning_rate": 0.00015091299677765844, "loss": 0.8361, "step": 2600 }, { "epoch": 2.9001074113856067, "grad_norm": 11629.3115234375, "learning_rate": 0.00013748657357679915, "loss": 0.8314, "step": 2700 }, { "epoch": 3.0, "eval_loss": 0.793174684047699, "eval_runtime": 18.8558, "eval_samples_per_second": 424.272, "eval_steps_per_second": 3.341, "step": 2793 }, { "epoch": 3.007518796992481, "grad_norm": 11106.541015625, "learning_rate": 0.00012406015037593984, "loss": 0.8261, "step": 2800 }, { "epoch": 3.1149301825993554, "grad_norm": 11267.052734375, "learning_rate": 0.00011063372717508056, "loss": 0.8172, "step": 2900 }, { "epoch": 3.2223415682062297, "grad_norm": 10196.681640625, "learning_rate": 9.720730397422128e-05, "loss": 0.8162, "step": 3000 }, { "epoch": 3.329752953813104, "grad_norm": 11563.2431640625, "learning_rate": 8.378088077336197e-05, "loss": 0.8181, "step": 3100 }, { "epoch": 3.4371643394199785, "grad_norm": 12632.6240234375, "learning_rate": 7.035445757250269e-05, "loss": 0.8176, "step": 3200 }, { "epoch": 3.544575725026853, "grad_norm": 13568.869140625, "learning_rate": 5.692803437164339e-05, "loss": 0.8165, "step": 3300 }, { "epoch": 3.651987110633727, "grad_norm": 12489.8134765625, "learning_rate": 4.35016111707841e-05, "loss": 0.8205, "step": 3400 }, { "epoch": 3.7593984962406015, "grad_norm": 10397.326171875, "learning_rate": 3.007518796992481e-05, "loss": 0.819, "step": 3500 }, { "epoch": 3.866809881847476, "grad_norm": 9453.00390625, "learning_rate": 1.664876476906552e-05, "loss": 0.8167, "step": 3600 }, { "epoch": 3.9742212674543502, "grad_norm": 11430.806640625, "learning_rate": 3.22234156820623e-06, "loss": 0.8097, "step": 3700 }, { "epoch": 4.0, "eval_loss": 0.78855299949646, "eval_runtime": 18.9148, "eval_samples_per_second": 422.95, "eval_steps_per_second": 3.331, "step": 3724 } ], "logging_steps": 100, "max_steps": 3724, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6121847720443904e+16, "train_batch_size": 256, "trial_name": null, "trial_params": null }