{
"best_metric": 1.613356113433838,
"best_model_checkpoint": "output/checkpoint-6000",
"epoch": 1.8028846153846154,
"eval_steps": 2000,
"global_step": 6000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.030048076923076924,
"grad_norm": 2.3787589073181152,
"learning_rate": 2e-05,
"loss": 3.6173,
"step": 100
},
{
"epoch": 0.06009615384615385,
"grad_norm": 1.996222734451294,
"learning_rate": 4e-05,
"loss": 1.7498,
"step": 200
},
{
"epoch": 0.09014423076923077,
"grad_norm": 1.5797070264816284,
"learning_rate": 6e-05,
"loss": 1.7224,
"step": 300
},
{
"epoch": 0.1201923076923077,
"grad_norm": 1.4705523252487183,
"learning_rate": 8e-05,
"loss": 1.6887,
"step": 400
},
{
"epoch": 0.1502403846153846,
"grad_norm": 1.807113766670227,
"learning_rate": 0.0001,
"loss": 1.6883,
"step": 500
},
{
"epoch": 0.18028846153846154,
"grad_norm": 1.211004376411438,
"learning_rate": 9.99725705593595e-05,
"loss": 1.6908,
"step": 600
},
{
"epoch": 0.21033653846153846,
"grad_norm": 1.194908857345581,
"learning_rate": 9.989031233240653e-05,
"loss": 1.6865,
"step": 700
},
{
"epoch": 0.2403846153846154,
"grad_norm": 1.125349998474121,
"learning_rate": 9.975331557102723e-05,
"loss": 1.6796,
"step": 800
},
{
"epoch": 0.2704326923076923,
"grad_norm": 1.2960246801376343,
"learning_rate": 9.9561730585003e-05,
"loss": 1.6697,
"step": 900
},
{
"epoch": 0.3004807692307692,
"grad_norm": 1.1631627082824707,
"learning_rate": 9.931576757709384e-05,
"loss": 1.6675,
"step": 1000
},
{
"epoch": 0.33052884615384615,
"grad_norm": 1.0823206901550293,
"learning_rate": 9.901569641240883e-05,
"loss": 1.6532,
"step": 1100
},
{
"epoch": 0.3605769230769231,
"grad_norm": 1.1251226663589478,
"learning_rate": 9.866184632231592e-05,
"loss": 1.6625,
"step": 1200
},
{
"epoch": 0.390625,
"grad_norm": 1.27994704246521,
"learning_rate": 9.825460554321679e-05,
"loss": 1.6463,
"step": 1300
},
{
"epoch": 0.4206730769230769,
"grad_norm": 1.3841296434402466,
"learning_rate": 9.779442089058252e-05,
"loss": 1.6462,
"step": 1400
},
{
"epoch": 0.45072115384615385,
"grad_norm": 1.1315901279449463,
"learning_rate": 9.728179726871762e-05,
"loss": 1.6452,
"step": 1500
},
{
"epoch": 0.4807692307692308,
"grad_norm": 1.2269665002822876,
"learning_rate": 9.671729711679036e-05,
"loss": 1.6449,
"step": 1600
},
{
"epoch": 0.5108173076923077,
"grad_norm": 1.135764479637146,
"learning_rate": 9.610153979173711e-05,
"loss": 1.6367,
"step": 1700
},
{
"epoch": 0.5408653846153846,
"grad_norm": 1.2129323482513428,
"learning_rate": 9.543520088871773e-05,
"loss": 1.6271,
"step": 1800
},
{
"epoch": 0.5709134615384616,
"grad_norm": 1.2092684507369995,
"learning_rate": 9.471901149986767e-05,
"loss": 1.6353,
"step": 1900
},
{
"epoch": 0.6009615384615384,
"grad_norm": 1.2054263353347778,
"learning_rate": 9.39537574121601e-05,
"loss": 1.6386,
"step": 2000
},
{
"epoch": 0.6009615384615384,
"eval_loss": 1.627854585647583,
"eval_runtime": 242.6417,
"eval_samples_per_second": 73.141,
"eval_steps_per_second": 9.145,
"step": 2000
},
{
"epoch": 0.6310096153846154,
"grad_norm": 1.4155864715576172,
"learning_rate": 9.314027824525798e-05,
"loss": 1.6322,
"step": 2100
},
{
"epoch": 0.6610576923076923,
"grad_norm": 1.2875721454620361,
"learning_rate": 9.22794665303021e-05,
"loss": 1.6205,
"step": 2200
},
{
"epoch": 0.6911057692307693,
"grad_norm": 1.2036750316619873,
"learning_rate": 9.137226673064603e-05,
"loss": 1.6201,
"step": 2300
},
{
"epoch": 0.7211538461538461,
"grad_norm": 1.3741754293441772,
"learning_rate": 9.04196742056119e-05,
"loss": 1.6197,
"step": 2400
},
{
"epoch": 0.7512019230769231,
"grad_norm": 1.3001148700714111,
"learning_rate": 8.942273411840452e-05,
"loss": 1.6285,
"step": 2500
},
{
"epoch": 0.78125,
"grad_norm": 1.3223652839660645,
"learning_rate": 8.838254028938162e-05,
"loss": 1.6323,
"step": 2600
},
{
"epoch": 0.8112980769230769,
"grad_norm": 1.399418592453003,
"learning_rate": 8.730023399593876e-05,
"loss": 1.6184,
"step": 2700
},
{
"epoch": 0.8413461538461539,
"grad_norm": 1.2166377305984497,
"learning_rate": 8.617700272032516e-05,
"loss": 1.6165,
"step": 2800
},
{
"epoch": 0.8713942307692307,
"grad_norm": 1.208473563194275,
"learning_rate": 8.501407884676479e-05,
"loss": 1.616,
"step": 2900
},
{
"epoch": 0.9014423076923077,
"grad_norm": 1.2277066707611084,
"learning_rate": 8.381273830931207e-05,
"loss": 1.6122,
"step": 3000
},
{
"epoch": 0.9314903846153846,
"grad_norm": 1.204302191734314,
"learning_rate": 8.257429919192542e-05,
"loss": 1.6186,
"step": 3100
},
{
"epoch": 0.9615384615384616,
"grad_norm": 1.2529523372650146,
"learning_rate": 8.130012028229512e-05,
"loss": 1.6164,
"step": 3200
},
{
"epoch": 0.9915865384615384,
"grad_norm": 1.513980507850647,
"learning_rate": 7.999159958101186e-05,
"loss": 1.5971,
"step": 3300
},
{
"epoch": 1.0216346153846154,
"grad_norm": 1.420433521270752,
"learning_rate": 7.865017276771173e-05,
"loss": 1.4976,
"step": 3400
},
{
"epoch": 1.0516826923076923,
"grad_norm": 1.4616764783859253,
"learning_rate": 7.727731162588074e-05,
"loss": 1.4486,
"step": 3500
},
{
"epoch": 1.0817307692307692,
"grad_norm": 1.4156601428985596,
"learning_rate": 7.587452242804676e-05,
"loss": 1.4467,
"step": 3600
},
{
"epoch": 1.1117788461538463,
"grad_norm": 1.3352571725845337,
"learning_rate": 7.444334428313112e-05,
"loss": 1.4516,
"step": 3700
},
{
"epoch": 1.1418269230769231,
"grad_norm": 1.4259686470031738,
"learning_rate": 7.298534744777267e-05,
"loss": 1.4466,
"step": 3800
},
{
"epoch": 1.171875,
"grad_norm": 1.4755374193191528,
"learning_rate": 7.150213160347743e-05,
"loss": 1.446,
"step": 3900
},
{
"epoch": 1.2019230769230769,
"grad_norm": 1.4399892091751099,
"learning_rate": 6.999532410148371e-05,
"loss": 1.4331,
"step": 4000
},
{
"epoch": 1.2019230769230769,
"eval_loss": 1.6244958639144897,
"eval_runtime": 248.0586,
"eval_samples_per_second": 71.544,
"eval_steps_per_second": 8.945,
"step": 4000
},
{
"epoch": 1.2319711538461537,
"grad_norm": 1.629622220993042,
"learning_rate": 6.846657817726882e-05,
"loss": 1.4356,
"step": 4100
},
{
"epoch": 1.2620192307692308,
"grad_norm": 1.5240803956985474,
"learning_rate": 6.691757113665606e-05,
"loss": 1.4403,
"step": 4200
},
{
"epoch": 1.2920673076923077,
"grad_norm": 1.947218894958496,
"learning_rate": 6.535000251551231e-05,
"loss": 1.452,
"step": 4300
},
{
"epoch": 1.3221153846153846,
"grad_norm": 1.6493359804153442,
"learning_rate": 6.376559221505535e-05,
"loss": 1.4435,
"step": 4400
},
{
"epoch": 1.3521634615384617,
"grad_norm": 1.6366957426071167,
"learning_rate": 6.216607861481659e-05,
"loss": 1.4385,
"step": 4500
},
{
"epoch": 1.3822115384615383,
"grad_norm": 1.679699182510376,
"learning_rate": 6.055321666533013e-05,
"loss": 1.4509,
"step": 4600
},
{
"epoch": 1.4122596153846154,
"grad_norm": 1.5405994653701782,
"learning_rate": 5.8928775962640146e-05,
"loss": 1.4375,
"step": 4700
},
{
"epoch": 1.4423076923076923,
"grad_norm": 1.5734689235687256,
"learning_rate": 5.7294538806739775e-05,
"loss": 1.4315,
"step": 4800
},
{
"epoch": 1.4723557692307692,
"grad_norm": 1.6284011602401733,
"learning_rate": 5.565229824607143e-05,
"loss": 1.4457,
"step": 4900
},
{
"epoch": 1.5024038461538463,
"grad_norm": 1.5765283107757568,
"learning_rate": 5.400385611023416e-05,
"loss": 1.4374,
"step": 5000
},
{
"epoch": 1.5324519230769231,
"grad_norm": 1.7722498178482056,
"learning_rate": 5.235102103305654e-05,
"loss": 1.4513,
"step": 5100
},
{
"epoch": 1.5625,
"grad_norm": 1.6080434322357178,
"learning_rate": 5.0695606468204095e-05,
"loss": 1.4322,
"step": 5200
},
{
"epoch": 1.5925480769230769,
"grad_norm": 1.7113045454025269,
"learning_rate": 4.90394286994985e-05,
"loss": 1.4372,
"step": 5300
},
{
"epoch": 1.6225961538461537,
"grad_norm": 1.595045566558838,
"learning_rate": 4.738430484813162e-05,
"loss": 1.4391,
"step": 5400
},
{
"epoch": 1.6526442307692308,
"grad_norm": 1.5261282920837402,
"learning_rate": 4.5732050878960816e-05,
"loss": 1.4375,
"step": 5500
},
{
"epoch": 1.6826923076923077,
"grad_norm": 1.6319518089294434,
"learning_rate": 4.40844796080729e-05,
"loss": 1.4269,
"step": 5600
},
{
"epoch": 1.7127403846153846,
"grad_norm": 1.6978216171264648,
"learning_rate": 4.244339871380291e-05,
"loss": 1.4261,
"step": 5700
},
{
"epoch": 1.7427884615384617,
"grad_norm": 1.748810887336731,
"learning_rate": 4.0810608753389864e-05,
"loss": 1.4349,
"step": 5800
},
{
"epoch": 1.7728365384615383,
"grad_norm": 1.4340012073516846,
"learning_rate": 3.9187901187445675e-05,
"loss": 1.4349,
"step": 5900
},
{
"epoch": 1.8028846153846154,
"grad_norm": 1.7568167448043823,
"learning_rate": 3.757705641440461e-05,
"loss": 1.4318,
"step": 6000
},
{
"epoch": 1.8028846153846154,
"eval_loss": 1.613356113433838,
"eval_runtime": 246.395,
"eval_samples_per_second": 72.027,
"eval_steps_per_second": 9.006,
"step": 6000
}
],
"logging_steps": 100,
"max_steps": 9984,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.830080251756462e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}