{
  "best_metric": 0.78855299949646,
  "best_model_checkpoint": "./results_v1/checkpoint-3724",
  "epoch": 4.0,
  "eval_steps": 500,
  "global_step": 3724,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.10741138560687433,
      "grad_norm": 12224.1005859375,
      "learning_rate": 0.0004865735767991407,
      "loss": 0.9019,
      "step": 100
    },
    {
      "epoch": 0.21482277121374865,
      "grad_norm": 11560.216796875,
      "learning_rate": 0.00047314715359828143,
      "loss": 0.9088,
      "step": 200
    },
    {
      "epoch": 0.322234156820623,
      "grad_norm": 10229.27734375,
      "learning_rate": 0.0004597207303974221,
      "loss": 0.9132,
      "step": 300
    },
    {
      "epoch": 0.4296455424274973,
      "grad_norm": 23398.751953125,
      "learning_rate": 0.00044629430719656286,
      "loss": 0.9093,
      "step": 400
    },
    {
      "epoch": 0.5370569280343717,
      "grad_norm": 9462.7333984375,
      "learning_rate": 0.00043286788399570354,
      "loss": 0.9045,
      "step": 500
    },
    {
      "epoch": 0.644468313641246,
      "grad_norm": 11711.173828125,
      "learning_rate": 0.00041944146079484423,
      "loss": 0.9007,
      "step": 600
    },
    {
      "epoch": 0.7518796992481203,
      "grad_norm": 12136.8037109375,
      "learning_rate": 0.00040601503759398497,
      "loss": 0.896,
      "step": 700
    },
    {
      "epoch": 0.8592910848549946,
      "grad_norm": 17582.23828125,
      "learning_rate": 0.00039258861439312565,
      "loss": 0.8989,
      "step": 800
    },
    {
      "epoch": 0.966702470461869,
      "grad_norm": 10597.923828125,
      "learning_rate": 0.0003791621911922664,
      "loss": 0.8925,
      "step": 900
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.8314220905303955,
      "eval_runtime": 18.8784,
      "eval_samples_per_second": 423.764,
      "eval_steps_per_second": 3.337,
      "step": 931
    },
    {
      "epoch": 1.0741138560687433,
      "grad_norm": 10806.78125,
      "learning_rate": 0.00036573576799140713,
      "loss": 0.8738,
      "step": 1000
    },
    {
      "epoch": 1.1815252416756177,
      "grad_norm": 11682.1171875,
      "learning_rate": 0.0003523093447905478,
      "loss": 0.8611,
      "step": 1100
    },
    {
      "epoch": 1.2889366272824918,
      "grad_norm": 14700.29296875,
      "learning_rate": 0.00033888292158968855,
      "loss": 0.8755,
      "step": 1200
    },
    {
      "epoch": 1.3963480128893662,
      "grad_norm": 11728.32421875,
      "learning_rate": 0.00032545649838882924,
      "loss": 0.8683,
      "step": 1300
    },
    {
      "epoch": 1.5037593984962405,
      "grad_norm": 12242.0029296875,
      "learning_rate": 0.0003120300751879699,
      "loss": 0.8745,
      "step": 1400
    },
    {
      "epoch": 1.6111707841031149,
      "grad_norm": 11073.6376953125,
      "learning_rate": 0.00029860365198711066,
      "loss": 0.848,
      "step": 1500
    },
    {
      "epoch": 1.7185821697099892,
      "grad_norm": 10194.4599609375,
      "learning_rate": 0.00028517722878625135,
      "loss": 0.8516,
      "step": 1600
    },
    {
      "epoch": 1.8259935553168636,
      "grad_norm": 12174.0341796875,
      "learning_rate": 0.00027175080558539203,
      "loss": 0.8597,
      "step": 1700
    },
    {
      "epoch": 1.933404940923738,
      "grad_norm": 11925.640625,
      "learning_rate": 0.00025832438238453277,
      "loss": 0.859,
      "step": 1800
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.8080422878265381,
      "eval_runtime": 19.478,
      "eval_samples_per_second": 410.72,
      "eval_steps_per_second": 3.234,
      "step": 1862
    },
    {
      "epoch": 2.0408163265306123,
      "grad_norm": 12538.783203125,
      "learning_rate": 0.00024489795918367346,
      "loss": 0.8473,
      "step": 1900
    },
    {
      "epoch": 2.1482277121374866,
      "grad_norm": 11984.9697265625,
      "learning_rate": 0.00023147153598281417,
      "loss": 0.8413,
      "step": 2000
    },
    {
      "epoch": 2.255639097744361,
      "grad_norm": 12034.09765625,
      "learning_rate": 0.00021804511278195488,
      "loss": 0.8344,
      "step": 2100
    },
    {
      "epoch": 2.3630504833512354,
      "grad_norm": 10639.9912109375,
      "learning_rate": 0.00020461868958109562,
      "loss": 0.8344,
      "step": 2200
    },
    {
      "epoch": 2.4704618689581097,
      "grad_norm": 10943.125,
      "learning_rate": 0.0001911922663802363,
      "loss": 0.8309,
      "step": 2300
    },
    {
      "epoch": 2.5778732545649836,
      "grad_norm": 12608.966796875,
      "learning_rate": 0.00017776584317937702,
      "loss": 0.835,
      "step": 2400
    },
    {
      "epoch": 2.685284640171858,
      "grad_norm": 12656.138671875,
      "learning_rate": 0.00016433941997851773,
      "loss": 0.8338,
      "step": 2500
    },
    {
      "epoch": 2.7926960257787323,
      "grad_norm": 13147.025390625,
      "learning_rate": 0.00015091299677765844,
      "loss": 0.8361,
      "step": 2600
    },
    {
      "epoch": 2.9001074113856067,
      "grad_norm": 11629.3115234375,
      "learning_rate": 0.00013748657357679915,
      "loss": 0.8314,
      "step": 2700
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.793174684047699,
      "eval_runtime": 18.8558,
      "eval_samples_per_second": 424.272,
      "eval_steps_per_second": 3.341,
      "step": 2793
    },
    {
      "epoch": 3.007518796992481,
      "grad_norm": 11106.541015625,
      "learning_rate": 0.00012406015037593984,
      "loss": 0.8261,
      "step": 2800
    },
    {
      "epoch": 3.1149301825993554,
      "grad_norm": 11267.052734375,
      "learning_rate": 0.00011063372717508056,
      "loss": 0.8172,
      "step": 2900
    },
    {
      "epoch": 3.2223415682062297,
      "grad_norm": 10196.681640625,
      "learning_rate": 9.720730397422128e-05,
      "loss": 0.8162,
      "step": 3000
    },
    {
      "epoch": 3.329752953813104,
      "grad_norm": 11563.2431640625,
      "learning_rate": 8.378088077336197e-05,
      "loss": 0.8181,
      "step": 3100
    },
    {
      "epoch": 3.4371643394199785,
      "grad_norm": 12632.6240234375,
      "learning_rate": 7.035445757250269e-05,
      "loss": 0.8176,
      "step": 3200
    },
    {
      "epoch": 3.544575725026853,
      "grad_norm": 13568.869140625,
      "learning_rate": 5.692803437164339e-05,
      "loss": 0.8165,
      "step": 3300
    },
    {
      "epoch": 3.651987110633727,
      "grad_norm": 12489.8134765625,
      "learning_rate": 4.35016111707841e-05,
      "loss": 0.8205,
      "step": 3400
    },
    {
      "epoch": 3.7593984962406015,
      "grad_norm": 10397.326171875,
      "learning_rate": 3.007518796992481e-05,
      "loss": 0.819,
      "step": 3500
    },
    {
      "epoch": 3.866809881847476,
      "grad_norm": 9453.00390625,
      "learning_rate": 1.664876476906552e-05,
      "loss": 0.8167,
      "step": 3600
    },
    {
      "epoch": 3.9742212674543502,
      "grad_norm": 11430.806640625,
      "learning_rate": 3.22234156820623e-06,
      "loss": 0.8097,
      "step": 3700
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.78855299949646,
      "eval_runtime": 18.9148,
      "eval_samples_per_second": 422.95,
      "eval_steps_per_second": 3.331,
      "step": 3724
    }
  ],
  "logging_steps": 100,
  "max_steps": 3724,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 2,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.6121847720443904e+16,
  "train_batch_size": 256,
  "trial_name": null,
  "trial_params": null
}