oldiday's picture
Training in progress, step 100, checkpoint
ca5852b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.015958507879513265,
"eval_steps": 9,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00015958507879513265,
"eval_loss": 2.050989866256714,
"eval_runtime": 454.429,
"eval_samples_per_second": 23.225,
"eval_steps_per_second": 2.905,
"step": 1
},
{
"epoch": 0.00047875523638539794,
"grad_norm": 0.9697971343994141,
"learning_rate": 1.5e-05,
"loss": 1.9725,
"step": 3
},
{
"epoch": 0.0009575104727707959,
"grad_norm": 0.901890754699707,
"learning_rate": 3e-05,
"loss": 2.0065,
"step": 6
},
{
"epoch": 0.001436265709156194,
"grad_norm": 0.9856873154640198,
"learning_rate": 4.5e-05,
"loss": 1.931,
"step": 9
},
{
"epoch": 0.001436265709156194,
"eval_loss": 1.9257014989852905,
"eval_runtime": 457.2135,
"eval_samples_per_second": 23.083,
"eval_steps_per_second": 2.887,
"step": 9
},
{
"epoch": 0.0019150209455415918,
"grad_norm": 0.7096590399742126,
"learning_rate": 4.993910125649561e-05,
"loss": 1.7845,
"step": 12
},
{
"epoch": 0.0023937761819269898,
"grad_norm": 0.726519763469696,
"learning_rate": 4.962019382530521e-05,
"loss": 1.8313,
"step": 15
},
{
"epoch": 0.002872531418312388,
"grad_norm": 0.6210381388664246,
"learning_rate": 4.9031542398457974e-05,
"loss": 1.8015,
"step": 18
},
{
"epoch": 0.002872531418312388,
"eval_loss": 1.7303786277770996,
"eval_runtime": 456.9299,
"eval_samples_per_second": 23.098,
"eval_steps_per_second": 2.889,
"step": 18
},
{
"epoch": 0.0033512866546977858,
"grad_norm": 0.6193668246269226,
"learning_rate": 4.817959636416969e-05,
"loss": 1.7996,
"step": 21
},
{
"epoch": 0.0038300418910831835,
"grad_norm": 0.6268954873085022,
"learning_rate": 4.707368982147318e-05,
"loss": 1.6603,
"step": 24
},
{
"epoch": 0.004308797127468581,
"grad_norm": 0.5985816121101379,
"learning_rate": 4.572593931387604e-05,
"loss": 1.716,
"step": 27
},
{
"epoch": 0.004308797127468581,
"eval_loss": 1.707382321357727,
"eval_runtime": 456.9479,
"eval_samples_per_second": 23.097,
"eval_steps_per_second": 2.889,
"step": 27
},
{
"epoch": 0.0047875523638539795,
"grad_norm": 0.5452908873558044,
"learning_rate": 4.415111107797445e-05,
"loss": 1.7306,
"step": 30
},
{
"epoch": 0.005266307600239378,
"grad_norm": 0.5148115754127502,
"learning_rate": 4.2366459261474933e-05,
"loss": 1.658,
"step": 33
},
{
"epoch": 0.005745062836624776,
"grad_norm": 0.5095010995864868,
"learning_rate": 4.039153688314145e-05,
"loss": 1.7653,
"step": 36
},
{
"epoch": 0.005745062836624776,
"eval_loss": 1.6911619901657104,
"eval_runtime": 457.3928,
"eval_samples_per_second": 23.074,
"eval_steps_per_second": 2.886,
"step": 36
},
{
"epoch": 0.006223818073010173,
"grad_norm": 0.480747789144516,
"learning_rate": 3.824798160583012e-05,
"loss": 1.6363,
"step": 39
},
{
"epoch": 0.0067025733093955715,
"grad_norm": 0.4569284915924072,
"learning_rate": 3.5959278669726935e-05,
"loss": 1.6942,
"step": 42
},
{
"epoch": 0.00718132854578097,
"grad_norm": 0.5147221684455872,
"learning_rate": 3.355050358314172e-05,
"loss": 1.7732,
"step": 45
},
{
"epoch": 0.00718132854578097,
"eval_loss": 1.6826040744781494,
"eval_runtime": 457.1352,
"eval_samples_per_second": 23.087,
"eval_steps_per_second": 2.888,
"step": 45
},
{
"epoch": 0.007660083782166367,
"grad_norm": 0.46845507621765137,
"learning_rate": 3.104804738999169e-05,
"loss": 1.6482,
"step": 48
},
{
"epoch": 0.008138839018551766,
"grad_norm": 0.5122058391571045,
"learning_rate": 2.8479327524001636e-05,
"loss": 1.6855,
"step": 51
},
{
"epoch": 0.008617594254937163,
"grad_norm": 0.46340513229370117,
"learning_rate": 2.587248741756253e-05,
"loss": 1.5771,
"step": 54
},
{
"epoch": 0.008617594254937163,
"eval_loss": 1.678298830986023,
"eval_runtime": 457.7537,
"eval_samples_per_second": 23.056,
"eval_steps_per_second": 2.884,
"step": 54
},
{
"epoch": 0.00909634949132256,
"grad_norm": 0.4864901602268219,
"learning_rate": 2.3256088156396868e-05,
"loss": 1.6974,
"step": 57
},
{
"epoch": 0.009575104727707959,
"grad_norm": 0.47456270456314087,
"learning_rate": 2.0658795558326743e-05,
"loss": 1.7266,
"step": 60
},
{
"epoch": 0.010053859964093357,
"grad_norm": 0.44960471987724304,
"learning_rate": 1.8109066104575023e-05,
"loss": 1.6703,
"step": 63
},
{
"epoch": 0.010053859964093357,
"eval_loss": 1.6748311519622803,
"eval_runtime": 457.1158,
"eval_samples_per_second": 23.088,
"eval_steps_per_second": 2.888,
"step": 63
},
{
"epoch": 0.010532615200478756,
"grad_norm": 0.5045154690742493,
"learning_rate": 1.56348351646022e-05,
"loss": 1.6842,
"step": 66
},
{
"epoch": 0.011011370436864154,
"grad_norm": 0.49012699723243713,
"learning_rate": 1.3263210930352737e-05,
"loss": 1.6247,
"step": 69
},
{
"epoch": 0.011490125673249552,
"grad_norm": 0.5065352320671082,
"learning_rate": 1.1020177413231334e-05,
"loss": 1.7605,
"step": 72
},
{
"epoch": 0.011490125673249552,
"eval_loss": 1.6728583574295044,
"eval_runtime": 457.0738,
"eval_samples_per_second": 23.09,
"eval_steps_per_second": 2.888,
"step": 72
},
{
"epoch": 0.011968880909634948,
"grad_norm": 0.5055304169654846,
"learning_rate": 8.930309757836517e-06,
"loss": 1.5792,
"step": 75
},
{
"epoch": 0.012447636146020347,
"grad_norm": 0.46882903575897217,
"learning_rate": 7.016504991533726e-06,
"loss": 1.714,
"step": 78
},
{
"epoch": 0.012926391382405745,
"grad_norm": 0.5146296620368958,
"learning_rate": 5.299731159831953e-06,
"loss": 1.5839,
"step": 81
},
{
"epoch": 0.012926391382405745,
"eval_loss": 1.6719353199005127,
"eval_runtime": 456.9575,
"eval_samples_per_second": 23.096,
"eval_steps_per_second": 2.889,
"step": 81
},
{
"epoch": 0.013405146618791143,
"grad_norm": 0.4731264114379883,
"learning_rate": 3.798797596089351e-06,
"loss": 1.7055,
"step": 84
},
{
"epoch": 0.013883901855176541,
"grad_norm": 0.5054563879966736,
"learning_rate": 2.5301488425208296e-06,
"loss": 1.5845,
"step": 87
},
{
"epoch": 0.01436265709156194,
"grad_norm": 0.46672162413597107,
"learning_rate": 1.5076844803522922e-06,
"loss": 1.7207,
"step": 90
},
{
"epoch": 0.01436265709156194,
"eval_loss": 1.671474575996399,
"eval_runtime": 456.9871,
"eval_samples_per_second": 23.095,
"eval_steps_per_second": 2.888,
"step": 90
},
{
"epoch": 0.014841412327947338,
"grad_norm": 0.5002412796020508,
"learning_rate": 7.426068431000882e-07,
"loss": 1.6392,
"step": 93
},
{
"epoch": 0.015320167564332734,
"grad_norm": 0.47195887565612793,
"learning_rate": 2.4329828146074095e-07,
"loss": 1.6559,
"step": 96
},
{
"epoch": 0.015798922800718134,
"grad_norm": 0.513578474521637,
"learning_rate": 1.522932452260595e-08,
"loss": 1.7077,
"step": 99
},
{
"epoch": 0.015798922800718134,
"eval_loss": 1.6713879108428955,
"eval_runtime": 457.4203,
"eval_samples_per_second": 23.073,
"eval_steps_per_second": 2.886,
"step": 99
}
],
"logging_steps": 3,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 9,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.75001861668864e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}