{ "best_metric": 0.40528106689453125, "best_model_checkpoint": "/home/datawork-iot-nos/Seatizen/models/multilabel/drone/drone-DinoVdeau-from-binary-large-2024_11_14-batch-size16_freeze_probs/checkpoint-22776", "epoch": 62.0, "eval_steps": 500, "global_step": 27156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "eval_explained_variance": 0.2881631553173065, "eval_kl_divergence": 1.006906509399414, "eval_loss": 0.43063807487487793, "eval_mae": 0.16208958625793457, "eval_rmse": 0.22103922069072723, "eval_runtime": 65.2687, "eval_samples_per_second": 36.066, "eval_steps_per_second": 2.268, "learning_rate": 0.001, "step": 438 }, { "epoch": 1.1415525114155252, "grad_norm": 0.5616265535354614, "learning_rate": 0.001, "loss": 0.4808, "step": 500 }, { "epoch": 2.0, "eval_explained_variance": 0.31177183985710144, "eval_kl_divergence": 1.3118820190429688, "eval_loss": 0.4245865046977997, "eval_mae": 0.15473191440105438, "eval_rmse": 0.21785493195056915, "eval_runtime": 70.2445, "eval_samples_per_second": 33.512, "eval_steps_per_second": 2.107, "learning_rate": 0.001, "step": 876 }, { "epoch": 2.2831050228310503, "grad_norm": 0.5421963930130005, "learning_rate": 0.001, "loss": 0.421, "step": 1000 }, { "epoch": 3.0, "eval_explained_variance": 0.3191607892513275, "eval_kl_divergence": 1.0982407331466675, "eval_loss": 0.422325998544693, "eval_mae": 0.1554209440946579, "eval_rmse": 0.21583305299282074, "eval_runtime": 63.3078, "eval_samples_per_second": 37.183, "eval_steps_per_second": 2.338, "learning_rate": 0.001, "step": 1314 }, { "epoch": 3.4246575342465753, "grad_norm": 0.4156647324562073, "learning_rate": 0.001, "loss": 0.4151, "step": 1500 }, { "epoch": 4.0, "eval_explained_variance": 0.3350948095321655, "eval_kl_divergence": 1.041384220123291, "eval_loss": 0.41912660002708435, "eval_mae": 0.15517595410346985, "eval_rmse": 0.21416835486888885, "eval_runtime": 63.7743, "eval_samples_per_second": 36.911, "eval_steps_per_second": 2.321, "learning_rate": 0.001, "step": 1752 }, { "epoch": 4.566210045662101, "grad_norm": 0.2765987813472748, "learning_rate": 0.001, "loss": 0.4114, "step": 2000 }, { "epoch": 5.0, "eval_explained_variance": 0.33842501044273376, "eval_kl_divergence": 1.0698424577713013, "eval_loss": 0.41713497042655945, "eval_mae": 0.15411676466464996, "eval_rmse": 0.21232052147388458, "eval_runtime": 61.7723, "eval_samples_per_second": 38.108, "eval_steps_per_second": 2.396, "learning_rate": 0.001, "step": 2190 }, { "epoch": 5.707762557077626, "grad_norm": 0.34299173951148987, "learning_rate": 0.001, "loss": 0.4089, "step": 2500 }, { "epoch": 6.0, "eval_explained_variance": 0.3310842514038086, "eval_kl_divergence": 1.1958788633346558, "eval_loss": 0.42093637585639954, "eval_mae": 0.1519818753004074, "eval_rmse": 0.21403205394744873, "eval_runtime": 61.4619, "eval_samples_per_second": 38.3, "eval_steps_per_second": 2.408, "learning_rate": 0.001, "step": 2628 }, { "epoch": 6.8493150684931505, "grad_norm": 0.30921000242233276, "learning_rate": 0.001, "loss": 0.4091, "step": 3000 }, { "epoch": 7.0, "eval_explained_variance": 0.33822229504585266, "eval_kl_divergence": 1.1708621978759766, "eval_loss": 0.4166290760040283, "eval_mae": 0.153007373213768, "eval_rmse": 0.21260716021060944, "eval_runtime": 60.3411, "eval_samples_per_second": 39.012, "eval_steps_per_second": 2.453, "learning_rate": 0.001, "step": 3066 }, { "epoch": 7.9908675799086755, "grad_norm": 0.21716275811195374, "learning_rate": 0.001, "loss": 0.4071, "step": 3500 }, { "epoch": 8.0, "eval_explained_variance": 0.33456894755363464, "eval_kl_divergence": 0.971220850944519, "eval_loss": 0.41946443915367126, "eval_mae": 0.15562371909618378, "eval_rmse": 0.2142825573682785, "eval_runtime": 62.8353, "eval_samples_per_second": 37.463, "eval_steps_per_second": 2.355, "learning_rate": 0.001, "step": 3504 }, { "epoch": 9.0, "eval_explained_variance": 0.3415004014968872, "eval_kl_divergence": 1.1432474851608276, "eval_loss": 0.41668570041656494, "eval_mae": 0.1524006426334381, "eval_rmse": 0.21208135783672333, "eval_runtime": 62.325, "eval_samples_per_second": 37.77, "eval_steps_per_second": 2.375, "learning_rate": 0.001, "step": 3942 }, { "epoch": 9.132420091324201, "grad_norm": 0.2371012270450592, "learning_rate": 0.001, "loss": 0.4062, "step": 4000 }, { "epoch": 10.0, "eval_explained_variance": 0.34203192591667175, "eval_kl_divergence": 0.9120630025863647, "eval_loss": 0.4186115860939026, "eval_mae": 0.15351708233356476, "eval_rmse": 0.2138604372739792, "eval_runtime": 60.5397, "eval_samples_per_second": 38.884, "eval_steps_per_second": 2.445, "learning_rate": 0.001, "step": 4380 }, { "epoch": 10.273972602739725, "grad_norm": 0.2552158236503601, "learning_rate": 0.001, "loss": 0.4052, "step": 4500 }, { "epoch": 11.0, "eval_explained_variance": 0.34416234493255615, "eval_kl_divergence": 0.995019793510437, "eval_loss": 0.41557687520980835, "eval_mae": 0.15356659889221191, "eval_rmse": 0.2114415019750595, "eval_runtime": 61.7293, "eval_samples_per_second": 38.134, "eval_steps_per_second": 2.398, "learning_rate": 0.001, "step": 4818 }, { "epoch": 11.415525114155251, "grad_norm": 0.20953956246376038, "learning_rate": 0.001, "loss": 0.406, "step": 5000 }, { "epoch": 12.0, "eval_explained_variance": 0.3389909565448761, "eval_kl_divergence": 1.0105773210525513, "eval_loss": 0.41883811354637146, "eval_mae": 0.1555173546075821, "eval_rmse": 0.21388684213161469, "eval_runtime": 62.5745, "eval_samples_per_second": 37.619, "eval_steps_per_second": 2.365, "learning_rate": 0.001, "step": 5256 }, { "epoch": 12.557077625570777, "grad_norm": 0.18659397959709167, "learning_rate": 0.001, "loss": 0.4058, "step": 5500 }, { "epoch": 13.0, "eval_explained_variance": 0.34248629212379456, "eval_kl_divergence": 1.1481796503067017, "eval_loss": 0.41630858182907104, "eval_mae": 0.15531976521015167, "eval_rmse": 0.21213315427303314, "eval_runtime": 61.6003, "eval_samples_per_second": 38.214, "eval_steps_per_second": 2.403, "learning_rate": 0.001, "step": 5694 }, { "epoch": 13.698630136986301, "grad_norm": 0.19523686170578003, "learning_rate": 0.001, "loss": 0.4056, "step": 6000 }, { "epoch": 14.0, "eval_explained_variance": 0.3286344110965729, "eval_kl_divergence": 1.211091160774231, "eval_loss": 0.4193180799484253, "eval_mae": 0.15458153188228607, "eval_rmse": 0.21381880342960358, "eval_runtime": 62.0339, "eval_samples_per_second": 37.947, "eval_steps_per_second": 2.386, "learning_rate": 0.001, "step": 6132 }, { "epoch": 14.840182648401827, "grad_norm": 0.18541939556598663, "learning_rate": 0.001, "loss": 0.4033, "step": 6500 }, { "epoch": 15.0, "eval_explained_variance": 0.3402325212955475, "eval_kl_divergence": 1.2042615413665771, "eval_loss": 0.416218638420105, "eval_mae": 0.15419499576091766, "eval_rmse": 0.2121332883834839, "eval_runtime": 62.9591, "eval_samples_per_second": 37.389, "eval_steps_per_second": 2.351, "learning_rate": 0.001, "step": 6570 }, { "epoch": 15.981735159817351, "grad_norm": 0.16085268557071686, "learning_rate": 0.001, "loss": 0.4057, "step": 7000 }, { "epoch": 16.0, "eval_explained_variance": 0.35001620650291443, "eval_kl_divergence": 1.0827727317810059, "eval_loss": 0.41389620304107666, "eval_mae": 0.1527981460094452, "eval_rmse": 0.21022744476795197, "eval_runtime": 62.4108, "eval_samples_per_second": 37.718, "eval_steps_per_second": 2.371, "learning_rate": 0.001, "step": 7008 }, { "epoch": 17.0, "eval_explained_variance": 0.3429690897464752, "eval_kl_divergence": 1.0005594491958618, "eval_loss": 0.4171081781387329, "eval_mae": 0.15638333559036255, "eval_rmse": 0.21180683374404907, "eval_runtime": 63.4048, "eval_samples_per_second": 37.127, "eval_steps_per_second": 2.334, "learning_rate": 0.001, "step": 7446 }, { "epoch": 17.123287671232877, "grad_norm": 0.17030780017375946, "learning_rate": 0.001, "loss": 0.405, "step": 7500 }, { "epoch": 18.0, "eval_explained_variance": 0.3499327600002289, "eval_kl_divergence": 1.0514436960220337, "eval_loss": 0.4146382212638855, "eval_mae": 0.1507440060377121, "eval_rmse": 0.2107054442167282, "eval_runtime": 64.4758, "eval_samples_per_second": 36.51, "eval_steps_per_second": 2.295, "learning_rate": 0.001, "step": 7884 }, { "epoch": 18.264840182648403, "grad_norm": 0.16620762646198273, "learning_rate": 0.001, "loss": 0.4035, "step": 8000 }, { "epoch": 19.0, "eval_explained_variance": 0.3467938005924225, "eval_kl_divergence": 0.9575299024581909, "eval_loss": 0.41857486963272095, "eval_mae": 0.1531781703233719, "eval_rmse": 0.21135376393795013, "eval_runtime": 65.1272, "eval_samples_per_second": 36.145, "eval_steps_per_second": 2.272, "learning_rate": 0.001, "step": 8322 }, { "epoch": 19.40639269406393, "grad_norm": 0.21431417763233185, "learning_rate": 0.001, "loss": 0.4031, "step": 8500 }, { "epoch": 20.0, "eval_explained_variance": 0.34868308901786804, "eval_kl_divergence": 1.164780855178833, "eval_loss": 0.41434723138809204, "eval_mae": 0.15129883587360382, "eval_rmse": 0.21083922684192657, "eval_runtime": 62.809, "eval_samples_per_second": 37.479, "eval_steps_per_second": 2.356, "learning_rate": 0.001, "step": 8760 }, { "epoch": 20.54794520547945, "grad_norm": 0.16674350202083588, "learning_rate": 0.001, "loss": 0.4048, "step": 9000 }, { "epoch": 21.0, "eval_explained_variance": 0.3385157585144043, "eval_kl_divergence": 1.2949873208999634, "eval_loss": 0.4195358157157898, "eval_mae": 0.15333952009677887, "eval_rmse": 0.21233241260051727, "eval_runtime": 62.2788, "eval_samples_per_second": 37.798, "eval_steps_per_second": 2.376, "learning_rate": 0.001, "step": 9198 }, { "epoch": 21.689497716894977, "grad_norm": 0.2121485322713852, "learning_rate": 0.001, "loss": 0.4055, "step": 9500 }, { "epoch": 22.0, "eval_explained_variance": 0.34627434611320496, "eval_kl_divergence": Infinity, "eval_loss": 0.4339658319950104, "eval_mae": 0.15240180492401123, "eval_rmse": 0.21100641787052155, "eval_runtime": 63.2767, "eval_samples_per_second": 37.202, "eval_steps_per_second": 2.339, "learning_rate": 0.001, "step": 9636 }, { "epoch": 22.831050228310502, "grad_norm": 0.17502234876155853, "learning_rate": 0.0001, "loss": 0.4022, "step": 10000 }, { "epoch": 23.0, "eval_explained_variance": 0.362075537443161, "eval_kl_divergence": NaN, "eval_loss": 0.43265336751937866, "eval_mae": 0.1517171412706375, "eval_rmse": 0.2084527164697647, "eval_runtime": 61.7803, "eval_samples_per_second": 38.103, "eval_steps_per_second": 2.396, "learning_rate": 0.0001, "step": 10074 }, { "epoch": 23.972602739726028, "grad_norm": 0.20596392452716827, "learning_rate": 0.0001, "loss": 0.3978, "step": 10500 }, { "epoch": 24.0, "eval_explained_variance": 0.3582542836666107, "eval_kl_divergence": NaN, "eval_loss": 0.4384593963623047, "eval_mae": 0.14925144612789154, "eval_rmse": 0.20924808084964752, "eval_runtime": 62.266, "eval_samples_per_second": 37.806, "eval_steps_per_second": 2.377, "learning_rate": 0.0001, "step": 10512 }, { "epoch": 25.0, "eval_explained_variance": 0.3649435043334961, "eval_kl_divergence": Infinity, "eval_loss": 0.4271779954433441, "eval_mae": 0.14897416532039642, "eval_rmse": 0.20736177265644073, "eval_runtime": 63.0259, "eval_samples_per_second": 37.35, "eval_steps_per_second": 2.348, "learning_rate": 0.0001, "step": 10950 }, { "epoch": 25.114155251141554, "grad_norm": 0.14978627860546112, "learning_rate": 0.0001, "loss": 0.3988, "step": 11000 }, { "epoch": 26.0, "eval_explained_variance": 0.36444517970085144, "eval_kl_divergence": 1.1902661323547363, "eval_loss": 0.41048941016197205, "eval_mae": 0.148028165102005, "eval_rmse": 0.20754428207874298, "eval_runtime": 62.2088, "eval_samples_per_second": 37.84, "eval_steps_per_second": 2.379, "learning_rate": 0.0001, "step": 11388 }, { "epoch": 26.255707762557076, "grad_norm": 0.13278695940971375, "learning_rate": 0.0001, "loss": 0.3958, "step": 11500 }, { "epoch": 27.0, "eval_explained_variance": 0.3687790632247925, "eval_kl_divergence": 0.9915334582328796, "eval_loss": 0.4096038341522217, "eval_mae": 0.1493707150220871, "eval_rmse": 0.20674215257167816, "eval_runtime": 63.9932, "eval_samples_per_second": 36.785, "eval_steps_per_second": 2.313, "learning_rate": 0.0001, "step": 11826 }, { "epoch": 27.397260273972602, "grad_norm": 0.16862636804580688, "learning_rate": 0.0001, "loss": 0.3965, "step": 12000 }, { "epoch": 28.0, "eval_explained_variance": 0.3680773675441742, "eval_kl_divergence": 0.9668822288513184, "eval_loss": 0.4104350507259369, "eval_mae": 0.1493188589811325, "eval_rmse": 0.20746104419231415, "eval_runtime": 64.0647, "eval_samples_per_second": 36.744, "eval_steps_per_second": 2.31, "learning_rate": 0.0001, "step": 12264 }, { "epoch": 28.538812785388128, "grad_norm": 0.16052192449569702, "learning_rate": 0.0001, "loss": 0.396, "step": 12500 }, { "epoch": 29.0, "eval_explained_variance": 0.3695773184299469, "eval_kl_divergence": 1.0432541370391846, "eval_loss": 0.40966179966926575, "eval_mae": 0.1468651443719864, "eval_rmse": 0.20694835484027863, "eval_runtime": 63.2767, "eval_samples_per_second": 37.202, "eval_steps_per_second": 2.339, "learning_rate": 0.0001, "step": 12702 }, { "epoch": 29.680365296803654, "grad_norm": 0.14418508112430573, "learning_rate": 0.0001, "loss": 0.3936, "step": 13000 }, { "epoch": 30.0, "eval_explained_variance": 0.373136430978775, "eval_kl_divergence": 0.908222496509552, "eval_loss": 0.4094092547893524, "eval_mae": 0.14899054169654846, "eval_rmse": 0.20645444095134735, "eval_runtime": 62.5038, "eval_samples_per_second": 37.662, "eval_steps_per_second": 2.368, "learning_rate": 0.0001, "step": 13140 }, { "epoch": 30.82191780821918, "grad_norm": 0.19649599492549896, "learning_rate": 0.0001, "loss": 0.3944, "step": 13500 }, { "epoch": 31.0, "eval_explained_variance": 0.3705109655857086, "eval_kl_divergence": 1.0120004415512085, "eval_loss": 0.40909385681152344, "eval_mae": 0.14699043333530426, "eval_rmse": 0.20654882490634918, "eval_runtime": 63.2971, "eval_samples_per_second": 37.19, "eval_steps_per_second": 2.338, "learning_rate": 0.0001, "step": 13578 }, { "epoch": 31.963470319634702, "grad_norm": 0.228424534201622, "learning_rate": 0.0001, "loss": 0.3941, "step": 14000 }, { "epoch": 32.0, "eval_explained_variance": 0.37417080998420715, "eval_kl_divergence": 0.9708234071731567, "eval_loss": 0.4084269404411316, "eval_mae": 0.14826728403568268, "eval_rmse": 0.2059999257326126, "eval_runtime": 64.3761, "eval_samples_per_second": 36.566, "eval_steps_per_second": 2.299, "learning_rate": 0.0001, "step": 14016 }, { "epoch": 33.0, "eval_explained_variance": 0.37551748752593994, "eval_kl_divergence": 0.9317126870155334, "eval_loss": 0.40824124217033386, "eval_mae": 0.14738227427005768, "eval_rmse": 0.20570062100887299, "eval_runtime": 63.4848, "eval_samples_per_second": 37.08, "eval_steps_per_second": 2.331, "learning_rate": 0.0001, "step": 14454 }, { "epoch": 33.10502283105023, "grad_norm": 0.2595873773097992, "learning_rate": 0.0001, "loss": 0.3933, "step": 14500 }, { "epoch": 34.0, "eval_explained_variance": 0.37467464804649353, "eval_kl_divergence": 0.9618669748306274, "eval_loss": 0.40851354598999023, "eval_mae": 0.14805640280246735, "eval_rmse": 0.20609329640865326, "eval_runtime": 65.3615, "eval_samples_per_second": 36.015, "eval_steps_per_second": 2.264, "learning_rate": 0.0001, "step": 14892 }, { "epoch": 34.24657534246575, "grad_norm": 0.26568445563316345, "learning_rate": 0.0001, "loss": 0.3926, "step": 15000 }, { "epoch": 35.0, "eval_explained_variance": 0.375776082277298, "eval_kl_divergence": 1.0522711277008057, "eval_loss": 0.4072923958301544, "eval_mae": 0.14664247632026672, "eval_rmse": 0.20538650453090668, "eval_runtime": 64.7697, "eval_samples_per_second": 36.344, "eval_steps_per_second": 2.285, "learning_rate": 0.0001, "step": 15330 }, { "epoch": 35.38812785388128, "grad_norm": 0.15931576490402222, "learning_rate": 0.0001, "loss": 0.3936, "step": 15500 }, { "epoch": 36.0, "eval_explained_variance": 0.3770906925201416, "eval_kl_divergence": 1.0621892213821411, "eval_loss": 0.40741708874702454, "eval_mae": 0.1460237056016922, "eval_rmse": 0.20519912242889404, "eval_runtime": 64.23, "eval_samples_per_second": 36.65, "eval_steps_per_second": 2.304, "learning_rate": 0.0001, "step": 15768 }, { "epoch": 36.529680365296805, "grad_norm": 0.22164444625377655, "learning_rate": 0.0001, "loss": 0.3935, "step": 16000 }, { "epoch": 37.0, "eval_explained_variance": 0.38024798035621643, "eval_kl_divergence": 1.020066261291504, "eval_loss": 0.40657544136047363, "eval_mae": 0.1456020027399063, "eval_rmse": 0.20468135178089142, "eval_runtime": 63.8016, "eval_samples_per_second": 36.896, "eval_steps_per_second": 2.32, "learning_rate": 0.0001, "step": 16206 }, { "epoch": 37.67123287671233, "grad_norm": 0.2097047120332718, "learning_rate": 0.0001, "loss": 0.3927, "step": 16500 }, { "epoch": 38.0, "eval_explained_variance": 0.3799835741519928, "eval_kl_divergence": 1.0557153224945068, "eval_loss": 0.406360387802124, "eval_mae": 0.14585663378238678, "eval_rmse": 0.20454762876033783, "eval_runtime": 63.2021, "eval_samples_per_second": 37.246, "eval_steps_per_second": 2.342, "learning_rate": 0.0001, "step": 16644 }, { "epoch": 38.81278538812786, "grad_norm": 0.34068891406059265, "learning_rate": 0.0001, "loss": 0.392, "step": 17000 }, { "epoch": 39.0, "eval_explained_variance": 0.377095103263855, "eval_kl_divergence": 1.005536675453186, "eval_loss": 0.4077896773815155, "eval_mae": 0.14692139625549316, "eval_rmse": 0.2055957317352295, "eval_runtime": 62.5136, "eval_samples_per_second": 37.656, "eval_steps_per_second": 2.367, "learning_rate": 0.0001, "step": 17082 }, { "epoch": 39.954337899543376, "grad_norm": 0.23111671209335327, "learning_rate": 0.0001, "loss": 0.3915, "step": 17500 }, { "epoch": 40.0, "eval_explained_variance": 0.38054999709129333, "eval_kl_divergence": 0.9849128723144531, "eval_loss": 0.4068063199520111, "eval_mae": 0.14637430012226105, "eval_rmse": 0.20490336418151855, "eval_runtime": 62.8552, "eval_samples_per_second": 37.451, "eval_steps_per_second": 2.355, "learning_rate": 0.0001, "step": 17520 }, { "epoch": 41.0, "eval_explained_variance": 0.3777576982975006, "eval_kl_divergence": 0.899895191192627, "eval_loss": 0.40890073776245117, "eval_mae": 0.1488751471042633, "eval_rmse": 0.20631897449493408, "eval_runtime": 63.9481, "eval_samples_per_second": 36.811, "eval_steps_per_second": 2.314, "learning_rate": 0.0001, "step": 17958 }, { "epoch": 41.0958904109589, "grad_norm": 0.28402578830718994, "learning_rate": 0.0001, "loss": 0.3907, "step": 18000 }, { "epoch": 42.0, "eval_explained_variance": 0.37971171736717224, "eval_kl_divergence": 1.0616570711135864, "eval_loss": 0.4068816602230072, "eval_mae": 0.14634381234645844, "eval_rmse": 0.20491831004619598, "eval_runtime": 63.1884, "eval_samples_per_second": 37.254, "eval_steps_per_second": 2.342, "learning_rate": 0.0001, "step": 18396 }, { "epoch": 42.23744292237443, "grad_norm": 0.24103382229804993, "learning_rate": 0.0001, "loss": 0.3919, "step": 18500 }, { "epoch": 43.0, "eval_explained_variance": 0.3829738199710846, "eval_kl_divergence": 1.0520097017288208, "eval_loss": 0.40578988194465637, "eval_mae": 0.14498426020145416, "eval_rmse": 0.2040938138961792, "eval_runtime": 64.2301, "eval_samples_per_second": 36.649, "eval_steps_per_second": 2.304, "learning_rate": 0.0001, "step": 18834 }, { "epoch": 43.37899543378995, "grad_norm": 0.3461155891418457, "learning_rate": 0.0001, "loss": 0.3902, "step": 19000 }, { "epoch": 44.0, "eval_explained_variance": 0.3809111416339874, "eval_kl_divergence": 1.0053679943084717, "eval_loss": 0.4070681035518646, "eval_mae": 0.14748047292232513, "eval_rmse": 0.20503848791122437, "eval_runtime": 63.682, "eval_samples_per_second": 36.965, "eval_steps_per_second": 2.324, "learning_rate": 0.0001, "step": 19272 }, { "epoch": 44.52054794520548, "grad_norm": 0.21600213646888733, "learning_rate": 0.0001, "loss": 0.3896, "step": 19500 }, { "epoch": 45.0, "eval_explained_variance": 0.38130107522010803, "eval_kl_divergence": 1.13860285282135, "eval_loss": 0.40669572353363037, "eval_mae": 0.14402073621749878, "eval_rmse": 0.2047145813703537, "eval_runtime": 61.9143, "eval_samples_per_second": 38.02, "eval_steps_per_second": 2.39, "learning_rate": 0.0001, "step": 19710 }, { "epoch": 45.662100456621005, "grad_norm": 0.2100251168012619, "learning_rate": 0.0001, "loss": 0.3925, "step": 20000 }, { "epoch": 46.0, "eval_explained_variance": 0.3830677270889282, "eval_kl_divergence": 1.0252840518951416, "eval_loss": 0.40670666098594666, "eval_mae": 0.14572028815746307, "eval_rmse": 0.20469875633716583, "eval_runtime": 61.3533, "eval_samples_per_second": 38.368, "eval_steps_per_second": 2.412, "learning_rate": 0.0001, "step": 20148 }, { "epoch": 46.80365296803653, "grad_norm": 0.16854612529277802, "learning_rate": 0.0001, "loss": 0.3896, "step": 20500 }, { "epoch": 47.0, "eval_explained_variance": 0.3834179639816284, "eval_kl_divergence": 1.0430312156677246, "eval_loss": 0.4062415659427643, "eval_mae": 0.14726205170154572, "eval_rmse": 0.20429861545562744, "eval_runtime": 62.7532, "eval_samples_per_second": 37.512, "eval_steps_per_second": 2.358, "learning_rate": 0.0001, "step": 20586 }, { "epoch": 47.945205479452056, "grad_norm": 0.2040056735277176, "learning_rate": 0.0001, "loss": 0.3902, "step": 21000 }, { "epoch": 48.0, "eval_explained_variance": 0.38119378685951233, "eval_kl_divergence": 1.104145884513855, "eval_loss": 0.4064981937408447, "eval_mae": 0.14571230113506317, "eval_rmse": 0.20479492843151093, "eval_runtime": 66.5743, "eval_samples_per_second": 35.359, "eval_steps_per_second": 2.223, "learning_rate": 0.0001, "step": 21024 }, { "epoch": 49.0, "eval_explained_variance": 0.37976840138435364, "eval_kl_divergence": 1.0702213048934937, "eval_loss": 0.40709760785102844, "eval_mae": 0.14625640213489532, "eval_rmse": 0.20520327985286713, "eval_runtime": 62.1191, "eval_samples_per_second": 37.895, "eval_steps_per_second": 2.383, "learning_rate": 0.0001, "step": 21462 }, { "epoch": 49.08675799086758, "grad_norm": 0.2242765724658966, "learning_rate": 1e-05, "loss": 0.3897, "step": 21500 }, { "epoch": 50.0, "eval_explained_variance": 0.38569536805152893, "eval_kl_divergence": 0.8917386531829834, "eval_loss": 0.40644556283950806, "eval_mae": 0.1479080468416214, "eval_rmse": 0.2042473703622818, "eval_runtime": 62.3011, "eval_samples_per_second": 37.784, "eval_steps_per_second": 2.376, "learning_rate": 1e-05, "step": 21900 }, { "epoch": 50.22831050228311, "grad_norm": 0.21291576325893402, "learning_rate": 1e-05, "loss": 0.3875, "step": 22000 }, { "epoch": 51.0, "eval_explained_variance": 0.3844810426235199, "eval_kl_divergence": 0.9960101842880249, "eval_loss": 0.40579161047935486, "eval_mae": 0.14372152090072632, "eval_rmse": 0.20405276119709015, "eval_runtime": 61.2114, "eval_samples_per_second": 38.457, "eval_steps_per_second": 2.418, "learning_rate": 1e-05, "step": 22338 }, { "epoch": 51.36986301369863, "grad_norm": 0.24317112565040588, "learning_rate": 1e-05, "loss": 0.3874, "step": 22500 }, { "epoch": 52.0, "eval_explained_variance": 0.385125994682312, "eval_kl_divergence": 1.0567286014556885, "eval_loss": 0.40528106689453125, "eval_mae": 0.14458806812763214, "eval_rmse": 0.20368923246860504, "eval_runtime": 62.8042, "eval_samples_per_second": 37.482, "eval_steps_per_second": 2.357, "learning_rate": 1e-05, "step": 22776 }, { "epoch": 52.51141552511415, "grad_norm": 0.30417612195014954, "learning_rate": 1e-05, "loss": 0.3899, "step": 23000 }, { "epoch": 53.0, "eval_explained_variance": 0.3858625590801239, "eval_kl_divergence": 1.0205212831497192, "eval_loss": 0.4056229293346405, "eval_mae": 0.14624176919460297, "eval_rmse": 0.20387189090251923, "eval_runtime": 62.9117, "eval_samples_per_second": 37.418, "eval_steps_per_second": 2.353, "learning_rate": 1e-05, "step": 23214 }, { "epoch": 53.65296803652968, "grad_norm": 0.24982061982154846, "learning_rate": 1e-05, "loss": 0.3892, "step": 23500 }, { "epoch": 54.0, "eval_explained_variance": 0.3853992521762848, "eval_kl_divergence": 0.9905322194099426, "eval_loss": 0.4058997631072998, "eval_mae": 0.14412301778793335, "eval_rmse": 0.20410750806331635, "eval_runtime": 63.4824, "eval_samples_per_second": 37.081, "eval_steps_per_second": 2.331, "learning_rate": 1e-05, "step": 23652 }, { "epoch": 54.794520547945204, "grad_norm": 0.2903271019458771, "learning_rate": 1e-05, "loss": 0.3892, "step": 24000 }, { "epoch": 55.0, "eval_explained_variance": 0.38560736179351807, "eval_kl_divergence": 0.937917947769165, "eval_loss": 0.4060685932636261, "eval_mae": 0.1471087485551834, "eval_rmse": 0.20407529175281525, "eval_runtime": 64.4026, "eval_samples_per_second": 36.551, "eval_steps_per_second": 2.298, "learning_rate": 1e-05, "step": 24090 }, { "epoch": 55.93607305936073, "grad_norm": 0.2701994776725769, "learning_rate": 1e-05, "loss": 0.3869, "step": 24500 }, { "epoch": 56.0, "eval_explained_variance": 0.3853694200515747, "eval_kl_divergence": 0.9695614576339722, "eval_loss": 0.40592971444129944, "eval_mae": 0.14540034532546997, "eval_rmse": 0.20410047471523285, "eval_runtime": 63.4818, "eval_samples_per_second": 37.081, "eval_steps_per_second": 2.331, "learning_rate": 1e-05, "step": 24528 }, { "epoch": 57.0, "eval_explained_variance": 0.3842361867427826, "eval_kl_divergence": 1.0590680837631226, "eval_loss": 0.4058408737182617, "eval_mae": 0.1459987610578537, "eval_rmse": 0.20412230491638184, "eval_runtime": 62.5651, "eval_samples_per_second": 37.625, "eval_steps_per_second": 2.366, "learning_rate": 1e-05, "step": 24966 }, { "epoch": 57.077625570776256, "grad_norm": 0.20055490732192993, "learning_rate": 1e-05, "loss": 0.3874, "step": 25000 }, { "epoch": 58.0, "eval_explained_variance": 0.38601794838905334, "eval_kl_divergence": 0.9275628328323364, "eval_loss": 0.4063320457935333, "eval_mae": 0.14603658020496368, "eval_rmse": 0.20428447425365448, "eval_runtime": 62.6353, "eval_samples_per_second": 37.583, "eval_steps_per_second": 2.363, "learning_rate": 1e-05, "step": 25404 }, { "epoch": 58.21917808219178, "grad_norm": 0.24670056998729706, "learning_rate": 1.0000000000000002e-06, "loss": 0.3887, "step": 25500 }, { "epoch": 59.0, "eval_explained_variance": 0.3867626488208771, "eval_kl_divergence": 0.9793874621391296, "eval_loss": 0.4056239724159241, "eval_mae": 0.14530591666698456, "eval_rmse": 0.20382745563983917, "eval_runtime": 63.6318, "eval_samples_per_second": 36.994, "eval_steps_per_second": 2.326, "learning_rate": 1.0000000000000002e-06, "step": 25842 }, { "epoch": 59.36073059360731, "grad_norm": 0.27373573184013367, "learning_rate": 1.0000000000000002e-06, "loss": 0.3882, "step": 26000 }, { "epoch": 60.0, "eval_explained_variance": 0.3851200044155121, "eval_kl_divergence": 1.0348856449127197, "eval_loss": 0.40571752190589905, "eval_mae": 0.1446085125207901, "eval_rmse": 0.20402370393276215, "eval_runtime": 63.8531, "eval_samples_per_second": 36.866, "eval_steps_per_second": 2.318, "learning_rate": 1.0000000000000002e-06, "step": 26280 }, { "epoch": 60.50228310502283, "grad_norm": 0.23867332935333252, "learning_rate": 1.0000000000000002e-06, "loss": 0.389, "step": 26500 }, { "epoch": 61.0, "eval_explained_variance": 0.38573384284973145, "eval_kl_divergence": 0.9859956502914429, "eval_loss": 0.4058452248573303, "eval_mae": 0.14494158327579498, "eval_rmse": 0.2040751427412033, "eval_runtime": 61.8751, "eval_samples_per_second": 38.044, "eval_steps_per_second": 2.392, "learning_rate": 1.0000000000000002e-06, "step": 26718 }, { "epoch": 61.64383561643836, "grad_norm": 0.21306726336479187, "learning_rate": 1.0000000000000002e-06, "loss": 0.3882, "step": 27000 }, { "epoch": 62.0, "eval_explained_variance": 0.3864554166793823, "eval_kl_divergence": 0.9528394937515259, "eval_loss": 0.4054276943206787, "eval_mae": 0.14455263316631317, "eval_rmse": 0.20368416607379913, "eval_runtime": 61.7886, "eval_samples_per_second": 38.098, "eval_steps_per_second": 2.395, "learning_rate": 1.0000000000000002e-06, "step": 27156 }, { "epoch": 62.0, "learning_rate": 1.0000000000000002e-06, "step": 27156, "total_flos": 6.42634409963284e+19, "train_loss": 0.3985773164651095, "train_runtime": 16834.9641, "train_samples_per_second": 62.397, "train_steps_per_second": 3.903 } ], "logging_steps": 500, "max_steps": 65700, "num_input_tokens_seen": 0, "num_train_epochs": 150, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.42634409963284e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }