{
  "best_metric": 0.40528106689453125,
  "best_model_checkpoint": "/home/datawork-iot-nos/Seatizen/models/multilabel/drone/drone-DinoVdeau-from-binary-large-2024_11_14-batch-size16_freeze_probs/checkpoint-22776",
  "epoch": 62.0,
  "eval_steps": 500,
  "global_step": 27156,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "eval_explained_variance": 0.2881631553173065,
      "eval_kl_divergence": 1.006906509399414,
      "eval_loss": 0.43063807487487793,
      "eval_mae": 0.16208958625793457,
      "eval_rmse": 0.22103922069072723,
      "eval_runtime": 65.2687,
      "eval_samples_per_second": 36.066,
      "eval_steps_per_second": 2.268,
      "learning_rate": 0.001,
      "step": 438
    },
    {
      "epoch": 1.1415525114155252,
      "grad_norm": 0.5616265535354614,
      "learning_rate": 0.001,
      "loss": 0.4808,
      "step": 500
    },
    {
      "epoch": 2.0,
      "eval_explained_variance": 0.31177183985710144,
      "eval_kl_divergence": 1.3118820190429688,
      "eval_loss": 0.4245865046977997,
      "eval_mae": 0.15473191440105438,
      "eval_rmse": 0.21785493195056915,
      "eval_runtime": 70.2445,
      "eval_samples_per_second": 33.512,
      "eval_steps_per_second": 2.107,
      "learning_rate": 0.001,
      "step": 876
    },
    {
      "epoch": 2.2831050228310503,
      "grad_norm": 0.5421963930130005,
      "learning_rate": 0.001,
      "loss": 0.421,
      "step": 1000
    },
    {
      "epoch": 3.0,
      "eval_explained_variance": 0.3191607892513275,
      "eval_kl_divergence": 1.0982407331466675,
      "eval_loss": 0.422325998544693,
      "eval_mae": 0.1554209440946579,
      "eval_rmse": 0.21583305299282074,
      "eval_runtime": 63.3078,
      "eval_samples_per_second": 37.183,
      "eval_steps_per_second": 2.338,
      "learning_rate": 0.001,
      "step": 1314
    },
    {
      "epoch": 3.4246575342465753,
      "grad_norm": 0.4156647324562073,
      "learning_rate": 0.001,
      "loss": 0.4151,
      "step": 1500
    },
    {
      "epoch": 4.0,
      "eval_explained_variance": 0.3350948095321655,
      "eval_kl_divergence": 1.041384220123291,
      "eval_loss": 0.41912660002708435,
      "eval_mae": 0.15517595410346985,
      "eval_rmse": 0.21416835486888885,
      "eval_runtime": 63.7743,
      "eval_samples_per_second": 36.911,
      "eval_steps_per_second": 2.321,
      "learning_rate": 0.001,
      "step": 1752
    },
    {
      "epoch": 4.566210045662101,
      "grad_norm": 0.2765987813472748,
      "learning_rate": 0.001,
      "loss": 0.4114,
      "step": 2000
    },
    {
      "epoch": 5.0,
      "eval_explained_variance": 0.33842501044273376,
      "eval_kl_divergence": 1.0698424577713013,
      "eval_loss": 0.41713497042655945,
      "eval_mae": 0.15411676466464996,
      "eval_rmse": 0.21232052147388458,
      "eval_runtime": 61.7723,
      "eval_samples_per_second": 38.108,
      "eval_steps_per_second": 2.396,
      "learning_rate": 0.001,
      "step": 2190
    },
    {
      "epoch": 5.707762557077626,
      "grad_norm": 0.34299173951148987,
      "learning_rate": 0.001,
      "loss": 0.4089,
      "step": 2500
    },
    {
      "epoch": 6.0,
      "eval_explained_variance": 0.3310842514038086,
      "eval_kl_divergence": 1.1958788633346558,
      "eval_loss": 0.42093637585639954,
      "eval_mae": 0.1519818753004074,
      "eval_rmse": 0.21403205394744873,
      "eval_runtime": 61.4619,
      "eval_samples_per_second": 38.3,
      "eval_steps_per_second": 2.408,
      "learning_rate": 0.001,
      "step": 2628
    },
    {
      "epoch": 6.8493150684931505,
      "grad_norm": 0.30921000242233276,
      "learning_rate": 0.001,
      "loss": 0.4091,
      "step": 3000
    },
    {
      "epoch": 7.0,
      "eval_explained_variance": 0.33822229504585266,
      "eval_kl_divergence": 1.1708621978759766,
      "eval_loss": 0.4166290760040283,
      "eval_mae": 0.153007373213768,
      "eval_rmse": 0.21260716021060944,
      "eval_runtime": 60.3411,
      "eval_samples_per_second": 39.012,
      "eval_steps_per_second": 2.453,
      "learning_rate": 0.001,
      "step": 3066
    },
    {
      "epoch": 7.9908675799086755,
      "grad_norm": 0.21716275811195374,
      "learning_rate": 0.001,
      "loss": 0.4071,
      "step": 3500
    },
    {
      "epoch": 8.0,
      "eval_explained_variance": 0.33456894755363464,
      "eval_kl_divergence": 0.971220850944519,
      "eval_loss": 0.41946443915367126,
      "eval_mae": 0.15562371909618378,
      "eval_rmse": 0.2142825573682785,
      "eval_runtime": 62.8353,
      "eval_samples_per_second": 37.463,
      "eval_steps_per_second": 2.355,
      "learning_rate": 0.001,
      "step": 3504
    },
    {
      "epoch": 9.0,
      "eval_explained_variance": 0.3415004014968872,
      "eval_kl_divergence": 1.1432474851608276,
      "eval_loss": 0.41668570041656494,
      "eval_mae": 0.1524006426334381,
      "eval_rmse": 0.21208135783672333,
      "eval_runtime": 62.325,
      "eval_samples_per_second": 37.77,
      "eval_steps_per_second": 2.375,
      "learning_rate": 0.001,
      "step": 3942
    },
    {
      "epoch": 9.132420091324201,
      "grad_norm": 0.2371012270450592,
      "learning_rate": 0.001,
      "loss": 0.4062,
      "step": 4000
    },
    {
      "epoch": 10.0,
      "eval_explained_variance": 0.34203192591667175,
      "eval_kl_divergence": 0.9120630025863647,
      "eval_loss": 0.4186115860939026,
      "eval_mae": 0.15351708233356476,
      "eval_rmse": 0.2138604372739792,
      "eval_runtime": 60.5397,
      "eval_samples_per_second": 38.884,
      "eval_steps_per_second": 2.445,
      "learning_rate": 0.001,
      "step": 4380
    },
    {
      "epoch": 10.273972602739725,
      "grad_norm": 0.2552158236503601,
      "learning_rate": 0.001,
      "loss": 0.4052,
      "step": 4500
    },
    {
      "epoch": 11.0,
      "eval_explained_variance": 0.34416234493255615,
      "eval_kl_divergence": 0.995019793510437,
      "eval_loss": 0.41557687520980835,
      "eval_mae": 0.15356659889221191,
      "eval_rmse": 0.2114415019750595,
      "eval_runtime": 61.7293,
      "eval_samples_per_second": 38.134,
      "eval_steps_per_second": 2.398,
      "learning_rate": 0.001,
      "step": 4818
    },
    {
      "epoch": 11.415525114155251,
      "grad_norm": 0.20953956246376038,
      "learning_rate": 0.001,
      "loss": 0.406,
      "step": 5000
    },
    {
      "epoch": 12.0,
      "eval_explained_variance": 0.3389909565448761,
      "eval_kl_divergence": 1.0105773210525513,
      "eval_loss": 0.41883811354637146,
      "eval_mae": 0.1555173546075821,
      "eval_rmse": 0.21388684213161469,
      "eval_runtime": 62.5745,
      "eval_samples_per_second": 37.619,
      "eval_steps_per_second": 2.365,
      "learning_rate": 0.001,
      "step": 5256
    },
    {
      "epoch": 12.557077625570777,
      "grad_norm": 0.18659397959709167,
      "learning_rate": 0.001,
      "loss": 0.4058,
      "step": 5500
    },
    {
      "epoch": 13.0,
      "eval_explained_variance": 0.34248629212379456,
      "eval_kl_divergence": 1.1481796503067017,
      "eval_loss": 0.41630858182907104,
      "eval_mae": 0.15531976521015167,
      "eval_rmse": 0.21213315427303314,
      "eval_runtime": 61.6003,
      "eval_samples_per_second": 38.214,
      "eval_steps_per_second": 2.403,
      "learning_rate": 0.001,
      "step": 5694
    },
    {
      "epoch": 13.698630136986301,
      "grad_norm": 0.19523686170578003,
      "learning_rate": 0.001,
      "loss": 0.4056,
      "step": 6000
    },
    {
      "epoch": 14.0,
      "eval_explained_variance": 0.3286344110965729,
      "eval_kl_divergence": 1.211091160774231,
      "eval_loss": 0.4193180799484253,
      "eval_mae": 0.15458153188228607,
      "eval_rmse": 0.21381880342960358,
      "eval_runtime": 62.0339,
      "eval_samples_per_second": 37.947,
      "eval_steps_per_second": 2.386,
      "learning_rate": 0.001,
      "step": 6132
    },
    {
      "epoch": 14.840182648401827,
      "grad_norm": 0.18541939556598663,
      "learning_rate": 0.001,
      "loss": 0.4033,
      "step": 6500
    },
    {
      "epoch": 15.0,
      "eval_explained_variance": 0.3402325212955475,
      "eval_kl_divergence": 1.2042615413665771,
      "eval_loss": 0.416218638420105,
      "eval_mae": 0.15419499576091766,
      "eval_rmse": 0.2121332883834839,
      "eval_runtime": 62.9591,
      "eval_samples_per_second": 37.389,
      "eval_steps_per_second": 2.351,
      "learning_rate": 0.001,
      "step": 6570
    },
    {
      "epoch": 15.981735159817351,
      "grad_norm": 0.16085268557071686,
      "learning_rate": 0.001,
      "loss": 0.4057,
      "step": 7000
    },
    {
      "epoch": 16.0,
      "eval_explained_variance": 0.35001620650291443,
      "eval_kl_divergence": 1.0827727317810059,
      "eval_loss": 0.41389620304107666,
      "eval_mae": 0.1527981460094452,
      "eval_rmse": 0.21022744476795197,
      "eval_runtime": 62.4108,
      "eval_samples_per_second": 37.718,
      "eval_steps_per_second": 2.371,
      "learning_rate": 0.001,
      "step": 7008
    },
    {
      "epoch": 17.0,
      "eval_explained_variance": 0.3429690897464752,
      "eval_kl_divergence": 1.0005594491958618,
      "eval_loss": 0.4171081781387329,
      "eval_mae": 0.15638333559036255,
      "eval_rmse": 0.21180683374404907,
      "eval_runtime": 63.4048,
      "eval_samples_per_second": 37.127,
      "eval_steps_per_second": 2.334,
      "learning_rate": 0.001,
      "step": 7446
    },
    {
      "epoch": 17.123287671232877,
      "grad_norm": 0.17030780017375946,
      "learning_rate": 0.001,
      "loss": 0.405,
      "step": 7500
    },
    {
      "epoch": 18.0,
      "eval_explained_variance": 0.3499327600002289,
      "eval_kl_divergence": 1.0514436960220337,
      "eval_loss": 0.4146382212638855,
      "eval_mae": 0.1507440060377121,
      "eval_rmse": 0.2107054442167282,
      "eval_runtime": 64.4758,
      "eval_samples_per_second": 36.51,
      "eval_steps_per_second": 2.295,
      "learning_rate": 0.001,
      "step": 7884
    },
    {
      "epoch": 18.264840182648403,
      "grad_norm": 0.16620762646198273,
      "learning_rate": 0.001,
      "loss": 0.4035,
      "step": 8000
    },
    {
      "epoch": 19.0,
      "eval_explained_variance": 0.3467938005924225,
      "eval_kl_divergence": 0.9575299024581909,
      "eval_loss": 0.41857486963272095,
      "eval_mae": 0.1531781703233719,
      "eval_rmse": 0.21135376393795013,
      "eval_runtime": 65.1272,
      "eval_samples_per_second": 36.145,
      "eval_steps_per_second": 2.272,
      "learning_rate": 0.001,
      "step": 8322
    },
    {
      "epoch": 19.40639269406393,
      "grad_norm": 0.21431417763233185,
      "learning_rate": 0.001,
      "loss": 0.4031,
      "step": 8500
    },
    {
      "epoch": 20.0,
      "eval_explained_variance": 0.34868308901786804,
      "eval_kl_divergence": 1.164780855178833,
      "eval_loss": 0.41434723138809204,
      "eval_mae": 0.15129883587360382,
      "eval_rmse": 0.21083922684192657,
      "eval_runtime": 62.809,
      "eval_samples_per_second": 37.479,
      "eval_steps_per_second": 2.356,
      "learning_rate": 0.001,
      "step": 8760
    },
    {
      "epoch": 20.54794520547945,
      "grad_norm": 0.16674350202083588,
      "learning_rate": 0.001,
      "loss": 0.4048,
      "step": 9000
    },
    {
      "epoch": 21.0,
      "eval_explained_variance": 0.3385157585144043,
      "eval_kl_divergence": 1.2949873208999634,
      "eval_loss": 0.4195358157157898,
      "eval_mae": 0.15333952009677887,
      "eval_rmse": 0.21233241260051727,
      "eval_runtime": 62.2788,
      "eval_samples_per_second": 37.798,
      "eval_steps_per_second": 2.376,
      "learning_rate": 0.001,
      "step": 9198
    },
    {
      "epoch": 21.689497716894977,
      "grad_norm": 0.2121485322713852,
      "learning_rate": 0.001,
      "loss": 0.4055,
      "step": 9500
    },
    {
      "epoch": 22.0,
      "eval_explained_variance": 0.34627434611320496,
      "eval_kl_divergence": Infinity,
      "eval_loss": 0.4339658319950104,
      "eval_mae": 0.15240180492401123,
      "eval_rmse": 0.21100641787052155,
      "eval_runtime": 63.2767,
      "eval_samples_per_second": 37.202,
      "eval_steps_per_second": 2.339,
      "learning_rate": 0.001,
      "step": 9636
    },
    {
      "epoch": 22.831050228310502,
      "grad_norm": 0.17502234876155853,
      "learning_rate": 0.0001,
      "loss": 0.4022,
      "step": 10000
    },
    {
      "epoch": 23.0,
      "eval_explained_variance": 0.362075537443161,
      "eval_kl_divergence": NaN,
      "eval_loss": 0.43265336751937866,
      "eval_mae": 0.1517171412706375,
      "eval_rmse": 0.2084527164697647,
      "eval_runtime": 61.7803,
      "eval_samples_per_second": 38.103,
      "eval_steps_per_second": 2.396,
      "learning_rate": 0.0001,
      "step": 10074
    },
    {
      "epoch": 23.972602739726028,
      "grad_norm": 0.20596392452716827,
      "learning_rate": 0.0001,
      "loss": 0.3978,
      "step": 10500
    },
    {
      "epoch": 24.0,
      "eval_explained_variance": 0.3582542836666107,
      "eval_kl_divergence": NaN,
      "eval_loss": 0.4384593963623047,
      "eval_mae": 0.14925144612789154,
      "eval_rmse": 0.20924808084964752,
      "eval_runtime": 62.266,
      "eval_samples_per_second": 37.806,
      "eval_steps_per_second": 2.377,
      "learning_rate": 0.0001,
      "step": 10512
    },
    {
      "epoch": 25.0,
      "eval_explained_variance": 0.3649435043334961,
      "eval_kl_divergence": Infinity,
      "eval_loss": 0.4271779954433441,
      "eval_mae": 0.14897416532039642,
      "eval_rmse": 0.20736177265644073,
      "eval_runtime": 63.0259,
      "eval_samples_per_second": 37.35,
      "eval_steps_per_second": 2.348,
      "learning_rate": 0.0001,
      "step": 10950
    },
    {
      "epoch": 25.114155251141554,
      "grad_norm": 0.14978627860546112,
      "learning_rate": 0.0001,
      "loss": 0.3988,
      "step": 11000
    },
    {
      "epoch": 26.0,
      "eval_explained_variance": 0.36444517970085144,
      "eval_kl_divergence": 1.1902661323547363,
      "eval_loss": 0.41048941016197205,
      "eval_mae": 0.148028165102005,
      "eval_rmse": 0.20754428207874298,
      "eval_runtime": 62.2088,
      "eval_samples_per_second": 37.84,
      "eval_steps_per_second": 2.379,
      "learning_rate": 0.0001,
      "step": 11388
    },
    {
      "epoch": 26.255707762557076,
      "grad_norm": 0.13278695940971375,
      "learning_rate": 0.0001,
      "loss": 0.3958,
      "step": 11500
    },
    {
      "epoch": 27.0,
      "eval_explained_variance": 0.3687790632247925,
      "eval_kl_divergence": 0.9915334582328796,
      "eval_loss": 0.4096038341522217,
      "eval_mae": 0.1493707150220871,
      "eval_rmse": 0.20674215257167816,
      "eval_runtime": 63.9932,
      "eval_samples_per_second": 36.785,
      "eval_steps_per_second": 2.313,
      "learning_rate": 0.0001,
      "step": 11826
    },
    {
      "epoch": 27.397260273972602,
      "grad_norm": 0.16862636804580688,
      "learning_rate": 0.0001,
      "loss": 0.3965,
      "step": 12000
    },
    {
      "epoch": 28.0,
      "eval_explained_variance": 0.3680773675441742,
      "eval_kl_divergence": 0.9668822288513184,
      "eval_loss": 0.4104350507259369,
      "eval_mae": 0.1493188589811325,
      "eval_rmse": 0.20746104419231415,
      "eval_runtime": 64.0647,
      "eval_samples_per_second": 36.744,
      "eval_steps_per_second": 2.31,
      "learning_rate": 0.0001,
      "step": 12264
    },
    {
      "epoch": 28.538812785388128,
      "grad_norm": 0.16052192449569702,
      "learning_rate": 0.0001,
      "loss": 0.396,
      "step": 12500
    },
    {
      "epoch": 29.0,
      "eval_explained_variance": 0.3695773184299469,
      "eval_kl_divergence": 1.0432541370391846,
      "eval_loss": 0.40966179966926575,
      "eval_mae": 0.1468651443719864,
      "eval_rmse": 0.20694835484027863,
      "eval_runtime": 63.2767,
      "eval_samples_per_second": 37.202,
      "eval_steps_per_second": 2.339,
      "learning_rate": 0.0001,
      "step": 12702
    },
    {
      "epoch": 29.680365296803654,
      "grad_norm": 0.14418508112430573,
      "learning_rate": 0.0001,
      "loss": 0.3936,
      "step": 13000
    },
    {
      "epoch": 30.0,
      "eval_explained_variance": 0.373136430978775,
      "eval_kl_divergence": 0.908222496509552,
      "eval_loss": 0.4094092547893524,
      "eval_mae": 0.14899054169654846,
      "eval_rmse": 0.20645444095134735,
      "eval_runtime": 62.5038,
      "eval_samples_per_second": 37.662,
      "eval_steps_per_second": 2.368,
      "learning_rate": 0.0001,
      "step": 13140
    },
    {
      "epoch": 30.82191780821918,
      "grad_norm": 0.19649599492549896,
      "learning_rate": 0.0001,
      "loss": 0.3944,
      "step": 13500
    },
    {
      "epoch": 31.0,
      "eval_explained_variance": 0.3705109655857086,
      "eval_kl_divergence": 1.0120004415512085,
      "eval_loss": 0.40909385681152344,
      "eval_mae": 0.14699043333530426,
      "eval_rmse": 0.20654882490634918,
      "eval_runtime": 63.2971,
      "eval_samples_per_second": 37.19,
      "eval_steps_per_second": 2.338,
      "learning_rate": 0.0001,
      "step": 13578
    },
    {
      "epoch": 31.963470319634702,
      "grad_norm": 0.228424534201622,
      "learning_rate": 0.0001,
      "loss": 0.3941,
      "step": 14000
    },
    {
      "epoch": 32.0,
      "eval_explained_variance": 0.37417080998420715,
      "eval_kl_divergence": 0.9708234071731567,
      "eval_loss": 0.4084269404411316,
      "eval_mae": 0.14826728403568268,
      "eval_rmse": 0.2059999257326126,
      "eval_runtime": 64.3761,
      "eval_samples_per_second": 36.566,
      "eval_steps_per_second": 2.299,
      "learning_rate": 0.0001,
      "step": 14016
    },
    {
      "epoch": 33.0,
      "eval_explained_variance": 0.37551748752593994,
      "eval_kl_divergence": 0.9317126870155334,
      "eval_loss": 0.40824124217033386,
      "eval_mae": 0.14738227427005768,
      "eval_rmse": 0.20570062100887299,
      "eval_runtime": 63.4848,
      "eval_samples_per_second": 37.08,
      "eval_steps_per_second": 2.331,
      "learning_rate": 0.0001,
      "step": 14454
    },
    {
      "epoch": 33.10502283105023,
      "grad_norm": 0.2595873773097992,
      "learning_rate": 0.0001,
      "loss": 0.3933,
      "step": 14500
    },
    {
      "epoch": 34.0,
      "eval_explained_variance": 0.37467464804649353,
      "eval_kl_divergence": 0.9618669748306274,
      "eval_loss": 0.40851354598999023,
      "eval_mae": 0.14805640280246735,
      "eval_rmse": 0.20609329640865326,
      "eval_runtime": 65.3615,
      "eval_samples_per_second": 36.015,
      "eval_steps_per_second": 2.264,
      "learning_rate": 0.0001,
      "step": 14892
    },
    {
      "epoch": 34.24657534246575,
      "grad_norm": 0.26568445563316345,
      "learning_rate": 0.0001,
      "loss": 0.3926,
      "step": 15000
    },
    {
      "epoch": 35.0,
      "eval_explained_variance": 0.375776082277298,
      "eval_kl_divergence": 1.0522711277008057,
      "eval_loss": 0.4072923958301544,
      "eval_mae": 0.14664247632026672,
      "eval_rmse": 0.20538650453090668,
      "eval_runtime": 64.7697,
      "eval_samples_per_second": 36.344,
      "eval_steps_per_second": 2.285,
      "learning_rate": 0.0001,
      "step": 15330
    },
    {
      "epoch": 35.38812785388128,
      "grad_norm": 0.15931576490402222,
      "learning_rate": 0.0001,
      "loss": 0.3936,
      "step": 15500
    },
    {
      "epoch": 36.0,
      "eval_explained_variance": 0.3770906925201416,
      "eval_kl_divergence": 1.0621892213821411,
      "eval_loss": 0.40741708874702454,
      "eval_mae": 0.1460237056016922,
      "eval_rmse": 0.20519912242889404,
      "eval_runtime": 64.23,
      "eval_samples_per_second": 36.65,
      "eval_steps_per_second": 2.304,
      "learning_rate": 0.0001,
      "step": 15768
    },
    {
      "epoch": 36.529680365296805,
      "grad_norm": 0.22164444625377655,
      "learning_rate": 0.0001,
      "loss": 0.3935,
      "step": 16000
    },
    {
      "epoch": 37.0,
      "eval_explained_variance": 0.38024798035621643,
      "eval_kl_divergence": 1.020066261291504,
      "eval_loss": 0.40657544136047363,
      "eval_mae": 0.1456020027399063,
      "eval_rmse": 0.20468135178089142,
      "eval_runtime": 63.8016,
      "eval_samples_per_second": 36.896,
      "eval_steps_per_second": 2.32,
      "learning_rate": 0.0001,
      "step": 16206
    },
    {
      "epoch": 37.67123287671233,
      "grad_norm": 0.2097047120332718,
      "learning_rate": 0.0001,
      "loss": 0.3927,
      "step": 16500
    },
    {
      "epoch": 38.0,
      "eval_explained_variance": 0.3799835741519928,
      "eval_kl_divergence": 1.0557153224945068,
      "eval_loss": 0.406360387802124,
      "eval_mae": 0.14585663378238678,
      "eval_rmse": 0.20454762876033783,
      "eval_runtime": 63.2021,
      "eval_samples_per_second": 37.246,
      "eval_steps_per_second": 2.342,
      "learning_rate": 0.0001,
      "step": 16644
    },
    {
      "epoch": 38.81278538812786,
      "grad_norm": 0.34068891406059265,
      "learning_rate": 0.0001,
      "loss": 0.392,
      "step": 17000
    },
    {
      "epoch": 39.0,
      "eval_explained_variance": 0.377095103263855,
      "eval_kl_divergence": 1.005536675453186,
      "eval_loss": 0.4077896773815155,
      "eval_mae": 0.14692139625549316,
      "eval_rmse": 0.2055957317352295,
      "eval_runtime": 62.5136,
      "eval_samples_per_second": 37.656,
      "eval_steps_per_second": 2.367,
      "learning_rate": 0.0001,
      "step": 17082
    },
    {
      "epoch": 39.954337899543376,
      "grad_norm": 0.23111671209335327,
      "learning_rate": 0.0001,
      "loss": 0.3915,
      "step": 17500
    },
    {
      "epoch": 40.0,
      "eval_explained_variance": 0.38054999709129333,
      "eval_kl_divergence": 0.9849128723144531,
      "eval_loss": 0.4068063199520111,
      "eval_mae": 0.14637430012226105,
      "eval_rmse": 0.20490336418151855,
      "eval_runtime": 62.8552,
      "eval_samples_per_second": 37.451,
      "eval_steps_per_second": 2.355,
      "learning_rate": 0.0001,
      "step": 17520
    },
    {
      "epoch": 41.0,
      "eval_explained_variance": 0.3777576982975006,
      "eval_kl_divergence": 0.899895191192627,
      "eval_loss": 0.40890073776245117,
      "eval_mae": 0.1488751471042633,
      "eval_rmse": 0.20631897449493408,
      "eval_runtime": 63.9481,
      "eval_samples_per_second": 36.811,
      "eval_steps_per_second": 2.314,
      "learning_rate": 0.0001,
      "step": 17958
    },
    {
      "epoch": 41.0958904109589,
      "grad_norm": 0.28402578830718994,
      "learning_rate": 0.0001,
      "loss": 0.3907,
      "step": 18000
    },
    {
      "epoch": 42.0,
      "eval_explained_variance": 0.37971171736717224,
      "eval_kl_divergence": 1.0616570711135864,
      "eval_loss": 0.4068816602230072,
      "eval_mae": 0.14634381234645844,
      "eval_rmse": 0.20491831004619598,
      "eval_runtime": 63.1884,
      "eval_samples_per_second": 37.254,
      "eval_steps_per_second": 2.342,
      "learning_rate": 0.0001,
      "step": 18396
    },
    {
      "epoch": 42.23744292237443,
      "grad_norm": 0.24103382229804993,
      "learning_rate": 0.0001,
      "loss": 0.3919,
      "step": 18500
    },
    {
      "epoch": 43.0,
      "eval_explained_variance": 0.3829738199710846,
      "eval_kl_divergence": 1.0520097017288208,
      "eval_loss": 0.40578988194465637,
      "eval_mae": 0.14498426020145416,
      "eval_rmse": 0.2040938138961792,
      "eval_runtime": 64.2301,
      "eval_samples_per_second": 36.649,
      "eval_steps_per_second": 2.304,
      "learning_rate": 0.0001,
      "step": 18834
    },
    {
      "epoch": 43.37899543378995,
      "grad_norm": 0.3461155891418457,
      "learning_rate": 0.0001,
      "loss": 0.3902,
      "step": 19000
    },
    {
      "epoch": 44.0,
      "eval_explained_variance": 0.3809111416339874,
      "eval_kl_divergence": 1.0053679943084717,
      "eval_loss": 0.4070681035518646,
      "eval_mae": 0.14748047292232513,
      "eval_rmse": 0.20503848791122437,
      "eval_runtime": 63.682,
      "eval_samples_per_second": 36.965,
      "eval_steps_per_second": 2.324,
      "learning_rate": 0.0001,
      "step": 19272
    },
    {
      "epoch": 44.52054794520548,
      "grad_norm": 0.21600213646888733,
      "learning_rate": 0.0001,
      "loss": 0.3896,
      "step": 19500
    },
    {
      "epoch": 45.0,
      "eval_explained_variance": 0.38130107522010803,
      "eval_kl_divergence": 1.13860285282135,
      "eval_loss": 0.40669572353363037,
      "eval_mae": 0.14402073621749878,
      "eval_rmse": 0.2047145813703537,
      "eval_runtime": 61.9143,
      "eval_samples_per_second": 38.02,
      "eval_steps_per_second": 2.39,
      "learning_rate": 0.0001,
      "step": 19710
    },
    {
      "epoch": 45.662100456621005,
      "grad_norm": 0.2100251168012619,
      "learning_rate": 0.0001,
      "loss": 0.3925,
      "step": 20000
    },
    {
      "epoch": 46.0,
      "eval_explained_variance": 0.3830677270889282,
      "eval_kl_divergence": 1.0252840518951416,
      "eval_loss": 0.40670666098594666,
      "eval_mae": 0.14572028815746307,
      "eval_rmse": 0.20469875633716583,
      "eval_runtime": 61.3533,
      "eval_samples_per_second": 38.368,
      "eval_steps_per_second": 2.412,
      "learning_rate": 0.0001,
      "step": 20148
    },
    {
      "epoch": 46.80365296803653,
      "grad_norm": 0.16854612529277802,
      "learning_rate": 0.0001,
      "loss": 0.3896,
      "step": 20500
    },
    {
      "epoch": 47.0,
      "eval_explained_variance": 0.3834179639816284,
      "eval_kl_divergence": 1.0430312156677246,
      "eval_loss": 0.4062415659427643,
      "eval_mae": 0.14726205170154572,
      "eval_rmse": 0.20429861545562744,
      "eval_runtime": 62.7532,
      "eval_samples_per_second": 37.512,
      "eval_steps_per_second": 2.358,
      "learning_rate": 0.0001,
      "step": 20586
    },
    {
      "epoch": 47.945205479452056,
      "grad_norm": 0.2040056735277176,
      "learning_rate": 0.0001,
      "loss": 0.3902,
      "step": 21000
    },
    {
      "epoch": 48.0,
      "eval_explained_variance": 0.38119378685951233,
      "eval_kl_divergence": 1.104145884513855,
      "eval_loss": 0.4064981937408447,
      "eval_mae": 0.14571230113506317,
      "eval_rmse": 0.20479492843151093,
      "eval_runtime": 66.5743,
      "eval_samples_per_second": 35.359,
      "eval_steps_per_second": 2.223,
      "learning_rate": 0.0001,
      "step": 21024
    },
    {
      "epoch": 49.0,
      "eval_explained_variance": 0.37976840138435364,
      "eval_kl_divergence": 1.0702213048934937,
      "eval_loss": 0.40709760785102844,
      "eval_mae": 0.14625640213489532,
      "eval_rmse": 0.20520327985286713,
      "eval_runtime": 62.1191,
      "eval_samples_per_second": 37.895,
      "eval_steps_per_second": 2.383,
      "learning_rate": 0.0001,
      "step": 21462
    },
    {
      "epoch": 49.08675799086758,
      "grad_norm": 0.2242765724658966,
      "learning_rate": 1e-05,
      "loss": 0.3897,
      "step": 21500
    },
    {
      "epoch": 50.0,
      "eval_explained_variance": 0.38569536805152893,
      "eval_kl_divergence": 0.8917386531829834,
      "eval_loss": 0.40644556283950806,
      "eval_mae": 0.1479080468416214,
      "eval_rmse": 0.2042473703622818,
      "eval_runtime": 62.3011,
      "eval_samples_per_second": 37.784,
      "eval_steps_per_second": 2.376,
      "learning_rate": 1e-05,
      "step": 21900
    },
    {
      "epoch": 50.22831050228311,
      "grad_norm": 0.21291576325893402,
      "learning_rate": 1e-05,
      "loss": 0.3875,
      "step": 22000
    },
    {
      "epoch": 51.0,
      "eval_explained_variance": 0.3844810426235199,
      "eval_kl_divergence": 0.9960101842880249,
      "eval_loss": 0.40579161047935486,
      "eval_mae": 0.14372152090072632,
      "eval_rmse": 0.20405276119709015,
      "eval_runtime": 61.2114,
      "eval_samples_per_second": 38.457,
      "eval_steps_per_second": 2.418,
      "learning_rate": 1e-05,
      "step": 22338
    },
    {
      "epoch": 51.36986301369863,
      "grad_norm": 0.24317112565040588,
      "learning_rate": 1e-05,
      "loss": 0.3874,
      "step": 22500
    },
    {
      "epoch": 52.0,
      "eval_explained_variance": 0.385125994682312,
      "eval_kl_divergence": 1.0567286014556885,
      "eval_loss": 0.40528106689453125,
      "eval_mae": 0.14458806812763214,
      "eval_rmse": 0.20368923246860504,
      "eval_runtime": 62.8042,
      "eval_samples_per_second": 37.482,
      "eval_steps_per_second": 2.357,
      "learning_rate": 1e-05,
      "step": 22776
    },
    {
      "epoch": 52.51141552511415,
      "grad_norm": 0.30417612195014954,
      "learning_rate": 1e-05,
      "loss": 0.3899,
      "step": 23000
    },
    {
      "epoch": 53.0,
      "eval_explained_variance": 0.3858625590801239,
      "eval_kl_divergence": 1.0205212831497192,
      "eval_loss": 0.4056229293346405,
      "eval_mae": 0.14624176919460297,
      "eval_rmse": 0.20387189090251923,
      "eval_runtime": 62.9117,
      "eval_samples_per_second": 37.418,
      "eval_steps_per_second": 2.353,
      "learning_rate": 1e-05,
      "step": 23214
    },
    {
      "epoch": 53.65296803652968,
      "grad_norm": 0.24982061982154846,
      "learning_rate": 1e-05,
      "loss": 0.3892,
      "step": 23500
    },
    {
      "epoch": 54.0,
      "eval_explained_variance": 0.3853992521762848,
      "eval_kl_divergence": 0.9905322194099426,
      "eval_loss": 0.4058997631072998,
      "eval_mae": 0.14412301778793335,
      "eval_rmse": 0.20410750806331635,
      "eval_runtime": 63.4824,
      "eval_samples_per_second": 37.081,
      "eval_steps_per_second": 2.331,
      "learning_rate": 1e-05,
      "step": 23652
    },
    {
      "epoch": 54.794520547945204,
      "grad_norm": 0.2903271019458771,
      "learning_rate": 1e-05,
      "loss": 0.3892,
      "step": 24000
    },
    {
      "epoch": 55.0,
      "eval_explained_variance": 0.38560736179351807,
      "eval_kl_divergence": 0.937917947769165,
      "eval_loss": 0.4060685932636261,
      "eval_mae": 0.1471087485551834,
      "eval_rmse": 0.20407529175281525,
      "eval_runtime": 64.4026,
      "eval_samples_per_second": 36.551,
      "eval_steps_per_second": 2.298,
      "learning_rate": 1e-05,
      "step": 24090
    },
    {
      "epoch": 55.93607305936073,
      "grad_norm": 0.2701994776725769,
      "learning_rate": 1e-05,
      "loss": 0.3869,
      "step": 24500
    },
    {
      "epoch": 56.0,
      "eval_explained_variance": 0.3853694200515747,
      "eval_kl_divergence": 0.9695614576339722,
      "eval_loss": 0.40592971444129944,
      "eval_mae": 0.14540034532546997,
      "eval_rmse": 0.20410047471523285,
      "eval_runtime": 63.4818,
      "eval_samples_per_second": 37.081,
      "eval_steps_per_second": 2.331,
      "learning_rate": 1e-05,
      "step": 24528
    },
    {
      "epoch": 57.0,
      "eval_explained_variance": 0.3842361867427826,
      "eval_kl_divergence": 1.0590680837631226,
      "eval_loss": 0.4058408737182617,
      "eval_mae": 0.1459987610578537,
      "eval_rmse": 0.20412230491638184,
      "eval_runtime": 62.5651,
      "eval_samples_per_second": 37.625,
      "eval_steps_per_second": 2.366,
      "learning_rate": 1e-05,
      "step": 24966
    },
    {
      "epoch": 57.077625570776256,
      "grad_norm": 0.20055490732192993,
      "learning_rate": 1e-05,
      "loss": 0.3874,
      "step": 25000
    },
    {
      "epoch": 58.0,
      "eval_explained_variance": 0.38601794838905334,
      "eval_kl_divergence": 0.9275628328323364,
      "eval_loss": 0.4063320457935333,
      "eval_mae": 0.14603658020496368,
      "eval_rmse": 0.20428447425365448,
      "eval_runtime": 62.6353,
      "eval_samples_per_second": 37.583,
      "eval_steps_per_second": 2.363,
      "learning_rate": 1e-05,
      "step": 25404
    },
    {
      "epoch": 58.21917808219178,
      "grad_norm": 0.24670056998729706,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.3887,
      "step": 25500
    },
    {
      "epoch": 59.0,
      "eval_explained_variance": 0.3867626488208771,
      "eval_kl_divergence": 0.9793874621391296,
      "eval_loss": 0.4056239724159241,
      "eval_mae": 0.14530591666698456,
      "eval_rmse": 0.20382745563983917,
      "eval_runtime": 63.6318,
      "eval_samples_per_second": 36.994,
      "eval_steps_per_second": 2.326,
      "learning_rate": 1.0000000000000002e-06,
      "step": 25842
    },
    {
      "epoch": 59.36073059360731,
      "grad_norm": 0.27373573184013367,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.3882,
      "step": 26000
    },
    {
      "epoch": 60.0,
      "eval_explained_variance": 0.3851200044155121,
      "eval_kl_divergence": 1.0348856449127197,
      "eval_loss": 0.40571752190589905,
      "eval_mae": 0.1446085125207901,
      "eval_rmse": 0.20402370393276215,
      "eval_runtime": 63.8531,
      "eval_samples_per_second": 36.866,
      "eval_steps_per_second": 2.318,
      "learning_rate": 1.0000000000000002e-06,
      "step": 26280
    },
    {
      "epoch": 60.50228310502283,
      "grad_norm": 0.23867332935333252,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.389,
      "step": 26500
    },
    {
      "epoch": 61.0,
      "eval_explained_variance": 0.38573384284973145,
      "eval_kl_divergence": 0.9859956502914429,
      "eval_loss": 0.4058452248573303,
      "eval_mae": 0.14494158327579498,
      "eval_rmse": 0.2040751427412033,
      "eval_runtime": 61.8751,
      "eval_samples_per_second": 38.044,
      "eval_steps_per_second": 2.392,
      "learning_rate": 1.0000000000000002e-06,
      "step": 26718
    },
    {
      "epoch": 61.64383561643836,
      "grad_norm": 0.21306726336479187,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.3882,
      "step": 27000
    },
    {
      "epoch": 62.0,
      "eval_explained_variance": 0.3864554166793823,
      "eval_kl_divergence": 0.9528394937515259,
      "eval_loss": 0.4054276943206787,
      "eval_mae": 0.14455263316631317,
      "eval_rmse": 0.20368416607379913,
      "eval_runtime": 61.7886,
      "eval_samples_per_second": 38.098,
      "eval_steps_per_second": 2.395,
      "learning_rate": 1.0000000000000002e-06,
      "step": 27156
    },
    {
      "epoch": 62.0,
      "learning_rate": 1.0000000000000002e-06,
      "step": 27156,
      "total_flos": 6.42634409963284e+19,
      "train_loss": 0.3985773164651095,
      "train_runtime": 16834.9641,
      "train_samples_per_second": 62.397,
      "train_steps_per_second": 3.903
    }
  ],
  "logging_steps": 500,
  "max_steps": 65700,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 150,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 10,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.42634409963284e+19,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}