|
{ |
|
"best_metric": 1.613356113433838, |
|
"best_model_checkpoint": "output/checkpoint-6000", |
|
"epoch": 1.8028846153846154, |
|
"eval_steps": 2000, |
|
"global_step": 6000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.030048076923076924, |
|
"grad_norm": 2.3787589073181152, |
|
"learning_rate": 2e-05, |
|
"loss": 3.6173, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06009615384615385, |
|
"grad_norm": 1.996222734451294, |
|
"learning_rate": 4e-05, |
|
"loss": 1.7498, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09014423076923077, |
|
"grad_norm": 1.5797070264816284, |
|
"learning_rate": 6e-05, |
|
"loss": 1.7224, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1201923076923077, |
|
"grad_norm": 1.4705523252487183, |
|
"learning_rate": 8e-05, |
|
"loss": 1.6887, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1502403846153846, |
|
"grad_norm": 1.807113766670227, |
|
"learning_rate": 0.0001, |
|
"loss": 1.6883, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.18028846153846154, |
|
"grad_norm": 1.211004376411438, |
|
"learning_rate": 9.99725705593595e-05, |
|
"loss": 1.6908, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.21033653846153846, |
|
"grad_norm": 1.194908857345581, |
|
"learning_rate": 9.989031233240653e-05, |
|
"loss": 1.6865, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.2403846153846154, |
|
"grad_norm": 1.125349998474121, |
|
"learning_rate": 9.975331557102723e-05, |
|
"loss": 1.6796, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.2704326923076923, |
|
"grad_norm": 1.2960246801376343, |
|
"learning_rate": 9.9561730585003e-05, |
|
"loss": 1.6697, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.3004807692307692, |
|
"grad_norm": 1.1631627082824707, |
|
"learning_rate": 9.931576757709384e-05, |
|
"loss": 1.6675, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.33052884615384615, |
|
"grad_norm": 1.0823206901550293, |
|
"learning_rate": 9.901569641240883e-05, |
|
"loss": 1.6532, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.3605769230769231, |
|
"grad_norm": 1.1251226663589478, |
|
"learning_rate": 9.866184632231592e-05, |
|
"loss": 1.6625, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.390625, |
|
"grad_norm": 1.27994704246521, |
|
"learning_rate": 9.825460554321679e-05, |
|
"loss": 1.6463, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.4206730769230769, |
|
"grad_norm": 1.3841296434402466, |
|
"learning_rate": 9.779442089058252e-05, |
|
"loss": 1.6462, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.45072115384615385, |
|
"grad_norm": 1.1315901279449463, |
|
"learning_rate": 9.728179726871762e-05, |
|
"loss": 1.6452, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4807692307692308, |
|
"grad_norm": 1.2269665002822876, |
|
"learning_rate": 9.671729711679036e-05, |
|
"loss": 1.6449, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.5108173076923077, |
|
"grad_norm": 1.135764479637146, |
|
"learning_rate": 9.610153979173711e-05, |
|
"loss": 1.6367, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.5408653846153846, |
|
"grad_norm": 1.2129323482513428, |
|
"learning_rate": 9.543520088871773e-05, |
|
"loss": 1.6271, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5709134615384616, |
|
"grad_norm": 1.2092684507369995, |
|
"learning_rate": 9.471901149986767e-05, |
|
"loss": 1.6353, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.6009615384615384, |
|
"grad_norm": 1.2054263353347778, |
|
"learning_rate": 9.39537574121601e-05, |
|
"loss": 1.6386, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.6009615384615384, |
|
"eval_loss": 1.627854585647583, |
|
"eval_runtime": 242.6417, |
|
"eval_samples_per_second": 73.141, |
|
"eval_steps_per_second": 9.145, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.6310096153846154, |
|
"grad_norm": 1.4155864715576172, |
|
"learning_rate": 9.314027824525798e-05, |
|
"loss": 1.6322, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6610576923076923, |
|
"grad_norm": 1.2875721454620361, |
|
"learning_rate": 9.22794665303021e-05, |
|
"loss": 1.6205, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.6911057692307693, |
|
"grad_norm": 1.2036750316619873, |
|
"learning_rate": 9.137226673064603e-05, |
|
"loss": 1.6201, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.7211538461538461, |
|
"grad_norm": 1.3741754293441772, |
|
"learning_rate": 9.04196742056119e-05, |
|
"loss": 1.6197, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.7512019230769231, |
|
"grad_norm": 1.3001148700714111, |
|
"learning_rate": 8.942273411840452e-05, |
|
"loss": 1.6285, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 1.3223652839660645, |
|
"learning_rate": 8.838254028938162e-05, |
|
"loss": 1.6323, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.8112980769230769, |
|
"grad_norm": 1.399418592453003, |
|
"learning_rate": 8.730023399593876e-05, |
|
"loss": 1.6184, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.8413461538461539, |
|
"grad_norm": 1.2166377305984497, |
|
"learning_rate": 8.617700272032516e-05, |
|
"loss": 1.6165, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.8713942307692307, |
|
"grad_norm": 1.208473563194275, |
|
"learning_rate": 8.501407884676479e-05, |
|
"loss": 1.616, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.9014423076923077, |
|
"grad_norm": 1.2277066707611084, |
|
"learning_rate": 8.381273830931207e-05, |
|
"loss": 1.6122, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.9314903846153846, |
|
"grad_norm": 1.204302191734314, |
|
"learning_rate": 8.257429919192542e-05, |
|
"loss": 1.6186, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.9615384615384616, |
|
"grad_norm": 1.2529523372650146, |
|
"learning_rate": 8.130012028229512e-05, |
|
"loss": 1.6164, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.9915865384615384, |
|
"grad_norm": 1.513980507850647, |
|
"learning_rate": 7.999159958101186e-05, |
|
"loss": 1.5971, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.0216346153846154, |
|
"grad_norm": 1.420433521270752, |
|
"learning_rate": 7.865017276771173e-05, |
|
"loss": 1.4976, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.0516826923076923, |
|
"grad_norm": 1.4616764783859253, |
|
"learning_rate": 7.727731162588074e-05, |
|
"loss": 1.4486, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.0817307692307692, |
|
"grad_norm": 1.4156601428985596, |
|
"learning_rate": 7.587452242804676e-05, |
|
"loss": 1.4467, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.1117788461538463, |
|
"grad_norm": 1.3352571725845337, |
|
"learning_rate": 7.444334428313112e-05, |
|
"loss": 1.4516, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.1418269230769231, |
|
"grad_norm": 1.4259686470031738, |
|
"learning_rate": 7.298534744777267e-05, |
|
"loss": 1.4466, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.171875, |
|
"grad_norm": 1.4755374193191528, |
|
"learning_rate": 7.150213160347743e-05, |
|
"loss": 1.446, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.2019230769230769, |
|
"grad_norm": 1.4399892091751099, |
|
"learning_rate": 6.999532410148371e-05, |
|
"loss": 1.4331, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.2019230769230769, |
|
"eval_loss": 1.6244958639144897, |
|
"eval_runtime": 248.0586, |
|
"eval_samples_per_second": 71.544, |
|
"eval_steps_per_second": 8.945, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.2319711538461537, |
|
"grad_norm": 1.629622220993042, |
|
"learning_rate": 6.846657817726882e-05, |
|
"loss": 1.4356, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.2620192307692308, |
|
"grad_norm": 1.5240803956985474, |
|
"learning_rate": 6.691757113665606e-05, |
|
"loss": 1.4403, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.2920673076923077, |
|
"grad_norm": 1.947218894958496, |
|
"learning_rate": 6.535000251551231e-05, |
|
"loss": 1.452, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.3221153846153846, |
|
"grad_norm": 1.6493359804153442, |
|
"learning_rate": 6.376559221505535e-05, |
|
"loss": 1.4435, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.3521634615384617, |
|
"grad_norm": 1.6366957426071167, |
|
"learning_rate": 6.216607861481659e-05, |
|
"loss": 1.4385, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.3822115384615383, |
|
"grad_norm": 1.679699182510376, |
|
"learning_rate": 6.055321666533013e-05, |
|
"loss": 1.4509, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.4122596153846154, |
|
"grad_norm": 1.5405994653701782, |
|
"learning_rate": 5.8928775962640146e-05, |
|
"loss": 1.4375, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.4423076923076923, |
|
"grad_norm": 1.5734689235687256, |
|
"learning_rate": 5.7294538806739775e-05, |
|
"loss": 1.4315, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.4723557692307692, |
|
"grad_norm": 1.6284011602401733, |
|
"learning_rate": 5.565229824607143e-05, |
|
"loss": 1.4457, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.5024038461538463, |
|
"grad_norm": 1.5765283107757568, |
|
"learning_rate": 5.400385611023416e-05, |
|
"loss": 1.4374, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.5324519230769231, |
|
"grad_norm": 1.7722498178482056, |
|
"learning_rate": 5.235102103305654e-05, |
|
"loss": 1.4513, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 1.6080434322357178, |
|
"learning_rate": 5.0695606468204095e-05, |
|
"loss": 1.4322, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.5925480769230769, |
|
"grad_norm": 1.7113045454025269, |
|
"learning_rate": 4.90394286994985e-05, |
|
"loss": 1.4372, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.6225961538461537, |
|
"grad_norm": 1.595045566558838, |
|
"learning_rate": 4.738430484813162e-05, |
|
"loss": 1.4391, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.6526442307692308, |
|
"grad_norm": 1.5261282920837402, |
|
"learning_rate": 4.5732050878960816e-05, |
|
"loss": 1.4375, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.6826923076923077, |
|
"grad_norm": 1.6319518089294434, |
|
"learning_rate": 4.40844796080729e-05, |
|
"loss": 1.4269, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.7127403846153846, |
|
"grad_norm": 1.6978216171264648, |
|
"learning_rate": 4.244339871380291e-05, |
|
"loss": 1.4261, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.7427884615384617, |
|
"grad_norm": 1.748810887336731, |
|
"learning_rate": 4.0810608753389864e-05, |
|
"loss": 1.4349, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.7728365384615383, |
|
"grad_norm": 1.4340012073516846, |
|
"learning_rate": 3.9187901187445675e-05, |
|
"loss": 1.4349, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.8028846153846154, |
|
"grad_norm": 1.7568167448043823, |
|
"learning_rate": 3.757705641440461e-05, |
|
"loss": 1.4318, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.8028846153846154, |
|
"eval_loss": 1.613356113433838, |
|
"eval_runtime": 246.395, |
|
"eval_samples_per_second": 72.027, |
|
"eval_steps_per_second": 9.006, |
|
"step": 6000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 9984, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.830080251756462e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|