{ "best_metric": 2.9304451942443848, "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy-32k-earlystop_seed-42_1e-3/checkpoint-38620", "epoch": 19.992235020059532, "eval_steps": 500, "global_step": 38620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5176653293645658, "grad_norm": 0.516268789768219, "learning_rate": 3.125e-05, "loss": 5.9216, "step": 1000 }, { "epoch": 0.9996117510029766, "eval_accuracy": 0.32528310960329715, "eval_loss": 4.013359069824219, "eval_runtime": 112.2514, "eval_samples_per_second": 464.939, "eval_steps_per_second": 7.269, "step": 1931 }, { "epoch": 1.0353306587291315, "grad_norm": 0.6370756030082703, "learning_rate": 6.25e-05, "loss": 4.1987, "step": 2000 }, { "epoch": 1.5529959880936974, "grad_norm": 0.6017232537269592, "learning_rate": 9.375e-05, "loss": 3.7977, "step": 3000 }, { "epoch": 1.9997411673353178, "eval_accuracy": 0.3639096213308086, "eval_loss": 3.544811725616455, "eval_runtime": 112.2075, "eval_samples_per_second": 465.12, "eval_steps_per_second": 7.272, "step": 3863 }, { "epoch": 2.070661317458263, "grad_norm": 0.5702515840530396, "learning_rate": 0.000125, "loss": 3.5582, "step": 4000 }, { "epoch": 2.588326646822829, "grad_norm": 0.48817870020866394, "learning_rate": 0.00015625, "loss": 3.3887, "step": 5000 }, { "epoch": 2.9998705836676587, "eval_accuracy": 0.3840780857274889, "eval_loss": 3.324249744415283, "eval_runtime": 112.382, "eval_samples_per_second": 464.398, "eval_steps_per_second": 7.261, "step": 5795 }, { "epoch": 3.105991976187395, "grad_norm": 0.45357462763786316, "learning_rate": 0.0001875, "loss": 3.2719, "step": 6000 }, { "epoch": 3.6236573055519608, "grad_norm": 0.42539018392562866, "learning_rate": 0.00021875, "loss": 3.1805, "step": 7000 }, { "epoch": 4.0, "eval_accuracy": 0.3949032381682315, "eval_loss": 3.2081618309020996, "eval_runtime": 112.1977, "eval_samples_per_second": 465.161, "eval_steps_per_second": 7.273, "step": 7727 }, { "epoch": 4.141322634916526, "grad_norm": 0.41741499304771423, "learning_rate": 0.00025, "loss": 3.1173, "step": 8000 }, { "epoch": 4.658987964281092, "grad_norm": 0.3810145854949951, "learning_rate": 0.00028125000000000003, "loss": 3.0632, "step": 9000 }, { "epoch": 4.999611751002977, "eval_accuracy": 0.401180377880219, "eval_loss": 3.143218517303467, "eval_runtime": 112.3679, "eval_samples_per_second": 464.457, "eval_steps_per_second": 7.262, "step": 9658 }, { "epoch": 5.176653293645658, "grad_norm": 0.3555419445037842, "learning_rate": 0.0003125, "loss": 3.0212, "step": 10000 }, { "epoch": 5.694318623010224, "grad_norm": 0.3318658173084259, "learning_rate": 0.00034375, "loss": 2.9865, "step": 11000 }, { "epoch": 5.999741167335317, "eval_accuracy": 0.4055885546400971, "eval_loss": 3.101013422012329, "eval_runtime": 112.2779, "eval_samples_per_second": 464.829, "eval_steps_per_second": 7.268, "step": 11590 }, { "epoch": 6.21198395237479, "grad_norm": 0.3243854343891144, "learning_rate": 0.000375, "loss": 2.9568, "step": 12000 }, { "epoch": 6.729649281739356, "grad_norm": 0.3086845874786377, "learning_rate": 0.00040625000000000004, "loss": 2.9347, "step": 13000 }, { "epoch": 6.999870583667659, "eval_accuracy": 0.4087078510269791, "eval_loss": 3.071547746658325, "eval_runtime": 111.3256, "eval_samples_per_second": 468.805, "eval_steps_per_second": 7.33, "step": 13522 }, { "epoch": 7.2473146111039215, "grad_norm": 0.29632025957107544, "learning_rate": 0.0004375, "loss": 2.9084, "step": 14000 }, { "epoch": 7.764979940468487, "grad_norm": 0.28605663776397705, "learning_rate": 0.0004686875, "loss": 2.8953, "step": 15000 }, { "epoch": 8.0, "eval_accuracy": 0.4107785654978604, "eval_loss": 3.053938388824463, "eval_runtime": 110.44, "eval_samples_per_second": 472.564, "eval_steps_per_second": 7.389, "step": 15454 }, { "epoch": 8.282645269833052, "grad_norm": 0.2786637246608734, "learning_rate": 0.0004999375, "loss": 2.8698, "step": 16000 }, { "epoch": 8.80031059919762, "grad_norm": 0.2667602002620697, "learning_rate": 0.00053115625, "loss": 2.8689, "step": 17000 }, { "epoch": 8.999611751002977, "eval_accuracy": 0.4122456033572655, "eval_loss": 3.039193868637085, "eval_runtime": 110.7811, "eval_samples_per_second": 471.109, "eval_steps_per_second": 7.366, "step": 17385 }, { "epoch": 9.317975928562184, "grad_norm": 0.25813835859298706, "learning_rate": 0.00056240625, "loss": 2.8401, "step": 18000 }, { "epoch": 9.835641257926751, "grad_norm": 0.2392367571592331, "learning_rate": 0.00059365625, "loss": 2.8456, "step": 19000 }, { "epoch": 9.999741167335317, "eval_accuracy": 0.4133619617611367, "eval_loss": 3.0309925079345703, "eval_runtime": 110.6518, "eval_samples_per_second": 471.66, "eval_steps_per_second": 7.374, "step": 19317 }, { "epoch": 10.353306587291316, "grad_norm": 0.2465026080608368, "learning_rate": 0.00062490625, "loss": 2.8163, "step": 20000 }, { "epoch": 10.870971916655883, "grad_norm": 0.21547040343284607, "learning_rate": 0.000656125, "loss": 2.8298, "step": 21000 }, { "epoch": 10.99987058366766, "eval_accuracy": 0.41438394403555634, "eval_loss": 3.0251340866088867, "eval_runtime": 110.5195, "eval_samples_per_second": 472.224, "eval_steps_per_second": 7.383, "step": 21249 }, { "epoch": 11.388637246020448, "grad_norm": 0.23142270743846893, "learning_rate": 0.0006873749999999999, "loss": 2.798, "step": 22000 }, { "epoch": 11.906302575385013, "grad_norm": 0.2184012234210968, "learning_rate": 0.00071859375, "loss": 2.817, "step": 23000 }, { "epoch": 12.0, "eval_accuracy": 0.4152235609706615, "eval_loss": 3.0175206661224365, "eval_runtime": 110.5797, "eval_samples_per_second": 471.967, "eval_steps_per_second": 7.379, "step": 23181 }, { "epoch": 12.42396790474958, "grad_norm": 0.211518794298172, "learning_rate": 0.0007498437500000001, "loss": 2.7828, "step": 24000 }, { "epoch": 12.941633234114144, "grad_norm": 0.2092377245426178, "learning_rate": 0.00078109375, "loss": 2.8069, "step": 25000 }, { "epoch": 12.999611751002977, "eval_accuracy": 0.41580334298885296, "eval_loss": 3.0118961334228516, "eval_runtime": 110.6039, "eval_samples_per_second": 471.864, "eval_steps_per_second": 7.378, "step": 25112 }, { "epoch": 13.459298563478711, "grad_norm": 0.20688970386981964, "learning_rate": 0.00081234375, "loss": 2.7707, "step": 26000 }, { "epoch": 13.976963892843276, "grad_norm": 0.20183704793453217, "learning_rate": 0.00084353125, "loss": 2.7996, "step": 27000 }, { "epoch": 13.999741167335317, "eval_accuracy": 0.4162821365373128, "eval_loss": 3.005990743637085, "eval_runtime": 110.6858, "eval_samples_per_second": 471.515, "eval_steps_per_second": 7.372, "step": 27044 }, { "epoch": 14.494629222207843, "grad_norm": 0.19293886423110962, "learning_rate": 0.00087478125, "loss": 2.7615, "step": 28000 }, { "epoch": 14.99987058366766, "eval_accuracy": 0.4170801257847458, "eval_loss": 3.0038492679595947, "eval_runtime": 110.7527, "eval_samples_per_second": 471.23, "eval_steps_per_second": 7.368, "step": 28976 }, { "epoch": 15.012294551572408, "grad_norm": 0.1958475559949875, "learning_rate": 0.0009060312499999999, "loss": 2.7934, "step": 29000 }, { "epoch": 15.529959880936975, "grad_norm": 0.18981003761291504, "learning_rate": 0.00093728125, "loss": 2.7575, "step": 30000 }, { "epoch": 16.0, "eval_accuracy": 0.4168810041740398, "eval_loss": 3.0022430419921875, "eval_runtime": 110.7398, "eval_samples_per_second": 471.285, "eval_steps_per_second": 7.369, "step": 30908 }, { "epoch": 16.04762521030154, "grad_norm": 0.21319861710071564, "learning_rate": 0.00096853125, "loss": 2.7826, "step": 31000 }, { "epoch": 16.565290539666105, "grad_norm": 0.188828244805336, "learning_rate": 0.00099975, "loss": 2.7573, "step": 32000 }, { "epoch": 16.999611751002977, "eval_accuracy": 0.41896870033700395, "eval_loss": 2.9876515865325928, "eval_runtime": 110.8493, "eval_samples_per_second": 470.819, "eval_steps_per_second": 7.361, "step": 32839 }, { "epoch": 17.082955869030673, "grad_norm": 0.18218275904655457, "learning_rate": 0.0008501510574018127, "loss": 2.764, "step": 33000 }, { "epoch": 17.60062119839524, "grad_norm": 0.1826547086238861, "learning_rate": 0.0006992447129909366, "loss": 2.71, "step": 34000 }, { "epoch": 17.99974116733532, "eval_accuracy": 0.42300027426184117, "eval_loss": 2.9554731845855713, "eval_runtime": 110.9201, "eval_samples_per_second": 470.519, "eval_steps_per_second": 7.357, "step": 34771 }, { "epoch": 18.118286527759803, "grad_norm": 0.198472797870636, "learning_rate": 0.0005483383685800605, "loss": 2.6857, "step": 35000 }, { "epoch": 18.63595185712437, "grad_norm": 0.1863366812467575, "learning_rate": 0.00039728096676737163, "loss": 2.6248, "step": 36000 }, { "epoch": 18.999870583667658, "eval_accuracy": 0.42677494373875247, "eval_loss": 2.9324660301208496, "eval_runtime": 110.7979, "eval_samples_per_second": 471.038, "eval_steps_per_second": 7.365, "step": 36703 }, { "epoch": 19.153617186488933, "grad_norm": 0.19636893272399902, "learning_rate": 0.0002462235649546828, "loss": 2.5874, "step": 37000 }, { "epoch": 19.671282515853502, "grad_norm": 0.19793163239955902, "learning_rate": 9.516616314199396e-05, "loss": 2.5172, "step": 38000 }, { "epoch": 19.992235020059532, "eval_accuracy": 0.42861324947683616, "eval_loss": 2.9304451942443848, "eval_runtime": 110.6301, "eval_samples_per_second": 471.752, "eval_steps_per_second": 7.376, "step": 38620 }, { "epoch": 19.992235020059532, "step": 38620, "total_flos": 1.291633274290176e+18, "train_loss": 3.0124951987362847, "train_runtime": 40662.5012, "train_samples_per_second": 243.23, "train_steps_per_second": 0.95 } ], "logging_steps": 1000, "max_steps": 38620, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.291633274290176e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }