{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 13.0,
  "eval_steps": 500,
  "global_step": 4134,
  "is_hyper_param_search": true,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.9968553459119497,
      "grad_norm": 1.7784031629562378,
      "learning_rate": 1.8466376390904693e-05,
      "loss": 1.7687,
      "step": 317
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.67,
      "eval_loss": 0.9660505652427673,
      "eval_runtime": 1.6131,
      "eval_samples_per_second": 1921.733,
      "eval_steps_per_second": 40.294,
      "step": 318
    },
    {
      "epoch": 1.9937106918238994,
      "grad_norm": 1.4204388856887817,
      "learning_rate": 1.6932752781809388e-05,
      "loss": 0.6811,
      "step": 634
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.83,
      "eval_loss": 0.2671310305595398,
      "eval_runtime": 1.5805,
      "eval_samples_per_second": 1961.395,
      "eval_steps_per_second": 41.126,
      "step": 636
    },
    {
      "epoch": 2.990566037735849,
      "grad_norm": 1.0138155221939087,
      "learning_rate": 1.539912917271408e-05,
      "loss": 0.2272,
      "step": 951
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.9067741935483871,
      "eval_loss": 0.11616384238004684,
      "eval_runtime": 1.5765,
      "eval_samples_per_second": 1966.361,
      "eval_steps_per_second": 41.23,
      "step": 954
    },
    {
      "epoch": 3.9874213836477987,
      "grad_norm": 0.9868770837783813,
      "learning_rate": 1.3865505563618772e-05,
      "loss": 0.1117,
      "step": 1268
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.9245161290322581,
      "eval_loss": 0.07967381924390793,
      "eval_runtime": 1.6048,
      "eval_samples_per_second": 1931.717,
      "eval_steps_per_second": 40.504,
      "step": 1272
    },
    {
      "epoch": 4.984276729559748,
      "grad_norm": 0.5568020939826965,
      "learning_rate": 1.2331881954523466e-05,
      "loss": 0.0762,
      "step": 1585
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.9332258064516129,
      "eval_loss": 0.0637175664305687,
      "eval_runtime": 1.6245,
      "eval_samples_per_second": 1908.268,
      "eval_steps_per_second": 40.012,
      "step": 1590
    },
    {
      "epoch": 5.981132075471698,
      "grad_norm": 0.9021977782249451,
      "learning_rate": 1.0798258345428159e-05,
      "loss": 0.0609,
      "step": 1902
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.9341935483870968,
      "eval_loss": 0.05572959780693054,
      "eval_runtime": 1.5863,
      "eval_samples_per_second": 1954.213,
      "eval_steps_per_second": 40.975,
      "step": 1908
    },
    {
      "epoch": 6.977987421383648,
      "grad_norm": 0.6561667323112488,
      "learning_rate": 9.26463473633285e-06,
      "loss": 0.0529,
      "step": 2219
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.9380645161290323,
      "eval_loss": 0.05086366832256317,
      "eval_runtime": 1.5942,
      "eval_samples_per_second": 1944.608,
      "eval_steps_per_second": 40.774,
      "step": 2226
    },
    {
      "epoch": 7.9748427672955975,
      "grad_norm": 0.39448273181915283,
      "learning_rate": 7.731011127237543e-06,
      "loss": 0.0471,
      "step": 2536
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.9390322580645162,
      "eval_loss": 0.04851401969790459,
      "eval_runtime": 1.5838,
      "eval_samples_per_second": 1957.35,
      "eval_steps_per_second": 41.041,
      "step": 2544
    },
    {
      "epoch": 8.971698113207546,
      "grad_norm": 0.562044620513916,
      "learning_rate": 6.197387518142236e-06,
      "loss": 0.0439,
      "step": 2853
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.9345161290322581,
      "eval_loss": 0.04487784206867218,
      "eval_runtime": 1.5718,
      "eval_samples_per_second": 1972.247,
      "eval_steps_per_second": 41.354,
      "step": 2862
    },
    {
      "epoch": 9.968553459119496,
      "grad_norm": 0.3506462275981903,
      "learning_rate": 4.6637639090469286e-06,
      "loss": 0.0413,
      "step": 3170
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.9396774193548387,
      "eval_loss": 0.043323710560798645,
      "eval_runtime": 1.5787,
      "eval_samples_per_second": 1963.58,
      "eval_steps_per_second": 41.172,
      "step": 3180
    },
    {
      "epoch": 10.965408805031446,
      "grad_norm": 0.33104729652404785,
      "learning_rate": 3.1301402999516213e-06,
      "loss": 0.0394,
      "step": 3487
    },
    {
      "epoch": 11.0,
      "eval_accuracy": 0.9396774193548387,
      "eval_loss": 0.04240846261382103,
      "eval_runtime": 1.5874,
      "eval_samples_per_second": 1952.868,
      "eval_steps_per_second": 40.947,
      "step": 3498
    },
    {
      "epoch": 11.962264150943396,
      "grad_norm": 0.37558791041374207,
      "learning_rate": 1.5965166908563137e-06,
      "loss": 0.0382,
      "step": 3804
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.9403225806451613,
      "eval_loss": 0.04175195470452309,
      "eval_runtime": 1.593,
      "eval_samples_per_second": 1945.992,
      "eval_steps_per_second": 40.803,
      "step": 3816
    },
    {
      "epoch": 12.959119496855346,
      "grad_norm": 0.4860643446445465,
      "learning_rate": 6.289308176100629e-08,
      "loss": 0.0376,
      "step": 4121
    }
  ],
  "logging_steps": 317,
  "max_steps": 4134,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 13,
  "save_steps": 1000000000.0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1072259767520340.0,
  "train_batch_size": 48,
  "trial_name": null,
  "trial_params": {
    "alpha": 0.3858447457522741,
    "num_train_epochs": 13,
    "temperature": 1
  }
}