|
{ |
|
"best_metric": 2.9304451942443848, |
|
"best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy-32k-earlystop_seed-42_1e-3/checkpoint-38620", |
|
"epoch": 19.992235020059532, |
|
"eval_steps": 500, |
|
"global_step": 38620, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.5176653293645658, |
|
"grad_norm": 0.516268789768219, |
|
"learning_rate": 3.125e-05, |
|
"loss": 5.9216, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.9996117510029766, |
|
"eval_accuracy": 0.32528310960329715, |
|
"eval_loss": 4.013359069824219, |
|
"eval_runtime": 112.2514, |
|
"eval_samples_per_second": 464.939, |
|
"eval_steps_per_second": 7.269, |
|
"step": 1931 |
|
}, |
|
{ |
|
"epoch": 1.0353306587291315, |
|
"grad_norm": 0.6370756030082703, |
|
"learning_rate": 6.25e-05, |
|
"loss": 4.1987, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.5529959880936974, |
|
"grad_norm": 0.6017232537269592, |
|
"learning_rate": 9.375e-05, |
|
"loss": 3.7977, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.9997411673353178, |
|
"eval_accuracy": 0.3639096213308086, |
|
"eval_loss": 3.544811725616455, |
|
"eval_runtime": 112.2075, |
|
"eval_samples_per_second": 465.12, |
|
"eval_steps_per_second": 7.272, |
|
"step": 3863 |
|
}, |
|
{ |
|
"epoch": 2.070661317458263, |
|
"grad_norm": 0.5702515840530396, |
|
"learning_rate": 0.000125, |
|
"loss": 3.5582, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.588326646822829, |
|
"grad_norm": 0.48817870020866394, |
|
"learning_rate": 0.00015625, |
|
"loss": 3.3887, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.9998705836676587, |
|
"eval_accuracy": 0.3840780857274889, |
|
"eval_loss": 3.324249744415283, |
|
"eval_runtime": 112.382, |
|
"eval_samples_per_second": 464.398, |
|
"eval_steps_per_second": 7.261, |
|
"step": 5795 |
|
}, |
|
{ |
|
"epoch": 3.105991976187395, |
|
"grad_norm": 0.45357462763786316, |
|
"learning_rate": 0.0001875, |
|
"loss": 3.2719, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.6236573055519608, |
|
"grad_norm": 0.42539018392562866, |
|
"learning_rate": 0.00021875, |
|
"loss": 3.1805, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.3949032381682315, |
|
"eval_loss": 3.2081618309020996, |
|
"eval_runtime": 112.1977, |
|
"eval_samples_per_second": 465.161, |
|
"eval_steps_per_second": 7.273, |
|
"step": 7727 |
|
}, |
|
{ |
|
"epoch": 4.141322634916526, |
|
"grad_norm": 0.41741499304771423, |
|
"learning_rate": 0.00025, |
|
"loss": 3.1173, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 4.658987964281092, |
|
"grad_norm": 0.3810145854949951, |
|
"learning_rate": 0.00028125000000000003, |
|
"loss": 3.0632, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.999611751002977, |
|
"eval_accuracy": 0.401180377880219, |
|
"eval_loss": 3.143218517303467, |
|
"eval_runtime": 112.3679, |
|
"eval_samples_per_second": 464.457, |
|
"eval_steps_per_second": 7.262, |
|
"step": 9658 |
|
}, |
|
{ |
|
"epoch": 5.176653293645658, |
|
"grad_norm": 0.3555419445037842, |
|
"learning_rate": 0.0003125, |
|
"loss": 3.0212, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 5.694318623010224, |
|
"grad_norm": 0.3318658173084259, |
|
"learning_rate": 0.00034375, |
|
"loss": 2.9865, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 5.999741167335317, |
|
"eval_accuracy": 0.4055885546400971, |
|
"eval_loss": 3.101013422012329, |
|
"eval_runtime": 112.2779, |
|
"eval_samples_per_second": 464.829, |
|
"eval_steps_per_second": 7.268, |
|
"step": 11590 |
|
}, |
|
{ |
|
"epoch": 6.21198395237479, |
|
"grad_norm": 0.3243854343891144, |
|
"learning_rate": 0.000375, |
|
"loss": 2.9568, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 6.729649281739356, |
|
"grad_norm": 0.3086845874786377, |
|
"learning_rate": 0.00040625000000000004, |
|
"loss": 2.9347, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 6.999870583667659, |
|
"eval_accuracy": 0.4087078510269791, |
|
"eval_loss": 3.071547746658325, |
|
"eval_runtime": 111.3256, |
|
"eval_samples_per_second": 468.805, |
|
"eval_steps_per_second": 7.33, |
|
"step": 13522 |
|
}, |
|
{ |
|
"epoch": 7.2473146111039215, |
|
"grad_norm": 0.29632025957107544, |
|
"learning_rate": 0.0004375, |
|
"loss": 2.9084, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 7.764979940468487, |
|
"grad_norm": 0.28605663776397705, |
|
"learning_rate": 0.0004686875, |
|
"loss": 2.8953, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.4107785654978604, |
|
"eval_loss": 3.053938388824463, |
|
"eval_runtime": 110.44, |
|
"eval_samples_per_second": 472.564, |
|
"eval_steps_per_second": 7.389, |
|
"step": 15454 |
|
}, |
|
{ |
|
"epoch": 8.282645269833052, |
|
"grad_norm": 0.2786637246608734, |
|
"learning_rate": 0.0004999375, |
|
"loss": 2.8698, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 8.80031059919762, |
|
"grad_norm": 0.2667602002620697, |
|
"learning_rate": 0.00053115625, |
|
"loss": 2.8689, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 8.999611751002977, |
|
"eval_accuracy": 0.4122456033572655, |
|
"eval_loss": 3.039193868637085, |
|
"eval_runtime": 110.7811, |
|
"eval_samples_per_second": 471.109, |
|
"eval_steps_per_second": 7.366, |
|
"step": 17385 |
|
}, |
|
{ |
|
"epoch": 9.317975928562184, |
|
"grad_norm": 0.25813835859298706, |
|
"learning_rate": 0.00056240625, |
|
"loss": 2.8401, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 9.835641257926751, |
|
"grad_norm": 0.2392367571592331, |
|
"learning_rate": 0.00059365625, |
|
"loss": 2.8456, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 9.999741167335317, |
|
"eval_accuracy": 0.4133619617611367, |
|
"eval_loss": 3.0309925079345703, |
|
"eval_runtime": 110.6518, |
|
"eval_samples_per_second": 471.66, |
|
"eval_steps_per_second": 7.374, |
|
"step": 19317 |
|
}, |
|
{ |
|
"epoch": 10.353306587291316, |
|
"grad_norm": 0.2465026080608368, |
|
"learning_rate": 0.00062490625, |
|
"loss": 2.8163, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 10.870971916655883, |
|
"grad_norm": 0.21547040343284607, |
|
"learning_rate": 0.000656125, |
|
"loss": 2.8298, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 10.99987058366766, |
|
"eval_accuracy": 0.41438394403555634, |
|
"eval_loss": 3.0251340866088867, |
|
"eval_runtime": 110.5195, |
|
"eval_samples_per_second": 472.224, |
|
"eval_steps_per_second": 7.383, |
|
"step": 21249 |
|
}, |
|
{ |
|
"epoch": 11.388637246020448, |
|
"grad_norm": 0.23142270743846893, |
|
"learning_rate": 0.0006873749999999999, |
|
"loss": 2.798, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 11.906302575385013, |
|
"grad_norm": 0.2184012234210968, |
|
"learning_rate": 0.00071859375, |
|
"loss": 2.817, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.4152235609706615, |
|
"eval_loss": 3.0175206661224365, |
|
"eval_runtime": 110.5797, |
|
"eval_samples_per_second": 471.967, |
|
"eval_steps_per_second": 7.379, |
|
"step": 23181 |
|
}, |
|
{ |
|
"epoch": 12.42396790474958, |
|
"grad_norm": 0.211518794298172, |
|
"learning_rate": 0.0007498437500000001, |
|
"loss": 2.7828, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 12.941633234114144, |
|
"grad_norm": 0.2092377245426178, |
|
"learning_rate": 0.00078109375, |
|
"loss": 2.8069, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 12.999611751002977, |
|
"eval_accuracy": 0.41580334298885296, |
|
"eval_loss": 3.0118961334228516, |
|
"eval_runtime": 110.6039, |
|
"eval_samples_per_second": 471.864, |
|
"eval_steps_per_second": 7.378, |
|
"step": 25112 |
|
}, |
|
{ |
|
"epoch": 13.459298563478711, |
|
"grad_norm": 0.20688970386981964, |
|
"learning_rate": 0.00081234375, |
|
"loss": 2.7707, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 13.976963892843276, |
|
"grad_norm": 0.20183704793453217, |
|
"learning_rate": 0.00084353125, |
|
"loss": 2.7996, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 13.999741167335317, |
|
"eval_accuracy": 0.4162821365373128, |
|
"eval_loss": 3.005990743637085, |
|
"eval_runtime": 110.6858, |
|
"eval_samples_per_second": 471.515, |
|
"eval_steps_per_second": 7.372, |
|
"step": 27044 |
|
}, |
|
{ |
|
"epoch": 14.494629222207843, |
|
"grad_norm": 0.19293886423110962, |
|
"learning_rate": 0.00087478125, |
|
"loss": 2.7615, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 14.99987058366766, |
|
"eval_accuracy": 0.4170801257847458, |
|
"eval_loss": 3.0038492679595947, |
|
"eval_runtime": 110.7527, |
|
"eval_samples_per_second": 471.23, |
|
"eval_steps_per_second": 7.368, |
|
"step": 28976 |
|
}, |
|
{ |
|
"epoch": 15.012294551572408, |
|
"grad_norm": 0.1958475559949875, |
|
"learning_rate": 0.0009060312499999999, |
|
"loss": 2.7934, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 15.529959880936975, |
|
"grad_norm": 0.18981003761291504, |
|
"learning_rate": 0.00093728125, |
|
"loss": 2.7575, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.4168810041740398, |
|
"eval_loss": 3.0022430419921875, |
|
"eval_runtime": 110.7398, |
|
"eval_samples_per_second": 471.285, |
|
"eval_steps_per_second": 7.369, |
|
"step": 30908 |
|
}, |
|
{ |
|
"epoch": 16.04762521030154, |
|
"grad_norm": 0.21319861710071564, |
|
"learning_rate": 0.00096853125, |
|
"loss": 2.7826, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 16.565290539666105, |
|
"grad_norm": 0.188828244805336, |
|
"learning_rate": 0.00099975, |
|
"loss": 2.7573, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 16.999611751002977, |
|
"eval_accuracy": 0.41896870033700395, |
|
"eval_loss": 2.9876515865325928, |
|
"eval_runtime": 110.8493, |
|
"eval_samples_per_second": 470.819, |
|
"eval_steps_per_second": 7.361, |
|
"step": 32839 |
|
}, |
|
{ |
|
"epoch": 17.082955869030673, |
|
"grad_norm": 0.18218275904655457, |
|
"learning_rate": 0.0008501510574018127, |
|
"loss": 2.764, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 17.60062119839524, |
|
"grad_norm": 0.1826547086238861, |
|
"learning_rate": 0.0006992447129909366, |
|
"loss": 2.71, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 17.99974116733532, |
|
"eval_accuracy": 0.42300027426184117, |
|
"eval_loss": 2.9554731845855713, |
|
"eval_runtime": 110.9201, |
|
"eval_samples_per_second": 470.519, |
|
"eval_steps_per_second": 7.357, |
|
"step": 34771 |
|
}, |
|
{ |
|
"epoch": 18.118286527759803, |
|
"grad_norm": 0.198472797870636, |
|
"learning_rate": 0.0005483383685800605, |
|
"loss": 2.6857, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 18.63595185712437, |
|
"grad_norm": 0.1863366812467575, |
|
"learning_rate": 0.00039728096676737163, |
|
"loss": 2.6248, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 18.999870583667658, |
|
"eval_accuracy": 0.42677494373875247, |
|
"eval_loss": 2.9324660301208496, |
|
"eval_runtime": 110.7979, |
|
"eval_samples_per_second": 471.038, |
|
"eval_steps_per_second": 7.365, |
|
"step": 36703 |
|
}, |
|
{ |
|
"epoch": 19.153617186488933, |
|
"grad_norm": 0.19636893272399902, |
|
"learning_rate": 0.0002462235649546828, |
|
"loss": 2.5874, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 19.671282515853502, |
|
"grad_norm": 0.19793163239955902, |
|
"learning_rate": 9.516616314199396e-05, |
|
"loss": 2.5172, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 19.992235020059532, |
|
"eval_accuracy": 0.42861324947683616, |
|
"eval_loss": 2.9304451942443848, |
|
"eval_runtime": 110.6301, |
|
"eval_samples_per_second": 471.752, |
|
"eval_steps_per_second": 7.376, |
|
"step": 38620 |
|
}, |
|
{ |
|
"epoch": 19.992235020059532, |
|
"step": 38620, |
|
"total_flos": 1.291633274290176e+18, |
|
"train_loss": 3.0124951987362847, |
|
"train_runtime": 40662.5012, |
|
"train_samples_per_second": 243.23, |
|
"train_steps_per_second": 0.95 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 38620, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.291633274290176e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|