lombardata's picture
Evaluation on the test set completed on 2024_11_14.
3ea82a1 verified
{
"best_metric": 0.40528106689453125,
"best_model_checkpoint": "/home/datawork-iot-nos/Seatizen/models/multilabel/drone/drone-DinoVdeau-from-binary-large-2024_11_14-batch-size16_freeze_probs/checkpoint-22776",
"epoch": 62.0,
"eval_steps": 500,
"global_step": 27156,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"eval_explained_variance": 0.2881631553173065,
"eval_kl_divergence": 1.006906509399414,
"eval_loss": 0.43063807487487793,
"eval_mae": 0.16208958625793457,
"eval_rmse": 0.22103922069072723,
"eval_runtime": 65.2687,
"eval_samples_per_second": 36.066,
"eval_steps_per_second": 2.268,
"learning_rate": 0.001,
"step": 438
},
{
"epoch": 1.1415525114155252,
"grad_norm": 0.5616265535354614,
"learning_rate": 0.001,
"loss": 0.4808,
"step": 500
},
{
"epoch": 2.0,
"eval_explained_variance": 0.31177183985710144,
"eval_kl_divergence": 1.3118820190429688,
"eval_loss": 0.4245865046977997,
"eval_mae": 0.15473191440105438,
"eval_rmse": 0.21785493195056915,
"eval_runtime": 70.2445,
"eval_samples_per_second": 33.512,
"eval_steps_per_second": 2.107,
"learning_rate": 0.001,
"step": 876
},
{
"epoch": 2.2831050228310503,
"grad_norm": 0.5421963930130005,
"learning_rate": 0.001,
"loss": 0.421,
"step": 1000
},
{
"epoch": 3.0,
"eval_explained_variance": 0.3191607892513275,
"eval_kl_divergence": 1.0982407331466675,
"eval_loss": 0.422325998544693,
"eval_mae": 0.1554209440946579,
"eval_rmse": 0.21583305299282074,
"eval_runtime": 63.3078,
"eval_samples_per_second": 37.183,
"eval_steps_per_second": 2.338,
"learning_rate": 0.001,
"step": 1314
},
{
"epoch": 3.4246575342465753,
"grad_norm": 0.4156647324562073,
"learning_rate": 0.001,
"loss": 0.4151,
"step": 1500
},
{
"epoch": 4.0,
"eval_explained_variance": 0.3350948095321655,
"eval_kl_divergence": 1.041384220123291,
"eval_loss": 0.41912660002708435,
"eval_mae": 0.15517595410346985,
"eval_rmse": 0.21416835486888885,
"eval_runtime": 63.7743,
"eval_samples_per_second": 36.911,
"eval_steps_per_second": 2.321,
"learning_rate": 0.001,
"step": 1752
},
{
"epoch": 4.566210045662101,
"grad_norm": 0.2765987813472748,
"learning_rate": 0.001,
"loss": 0.4114,
"step": 2000
},
{
"epoch": 5.0,
"eval_explained_variance": 0.33842501044273376,
"eval_kl_divergence": 1.0698424577713013,
"eval_loss": 0.41713497042655945,
"eval_mae": 0.15411676466464996,
"eval_rmse": 0.21232052147388458,
"eval_runtime": 61.7723,
"eval_samples_per_second": 38.108,
"eval_steps_per_second": 2.396,
"learning_rate": 0.001,
"step": 2190
},
{
"epoch": 5.707762557077626,
"grad_norm": 0.34299173951148987,
"learning_rate": 0.001,
"loss": 0.4089,
"step": 2500
},
{
"epoch": 6.0,
"eval_explained_variance": 0.3310842514038086,
"eval_kl_divergence": 1.1958788633346558,
"eval_loss": 0.42093637585639954,
"eval_mae": 0.1519818753004074,
"eval_rmse": 0.21403205394744873,
"eval_runtime": 61.4619,
"eval_samples_per_second": 38.3,
"eval_steps_per_second": 2.408,
"learning_rate": 0.001,
"step": 2628
},
{
"epoch": 6.8493150684931505,
"grad_norm": 0.30921000242233276,
"learning_rate": 0.001,
"loss": 0.4091,
"step": 3000
},
{
"epoch": 7.0,
"eval_explained_variance": 0.33822229504585266,
"eval_kl_divergence": 1.1708621978759766,
"eval_loss": 0.4166290760040283,
"eval_mae": 0.153007373213768,
"eval_rmse": 0.21260716021060944,
"eval_runtime": 60.3411,
"eval_samples_per_second": 39.012,
"eval_steps_per_second": 2.453,
"learning_rate": 0.001,
"step": 3066
},
{
"epoch": 7.9908675799086755,
"grad_norm": 0.21716275811195374,
"learning_rate": 0.001,
"loss": 0.4071,
"step": 3500
},
{
"epoch": 8.0,
"eval_explained_variance": 0.33456894755363464,
"eval_kl_divergence": 0.971220850944519,
"eval_loss": 0.41946443915367126,
"eval_mae": 0.15562371909618378,
"eval_rmse": 0.2142825573682785,
"eval_runtime": 62.8353,
"eval_samples_per_second": 37.463,
"eval_steps_per_second": 2.355,
"learning_rate": 0.001,
"step": 3504
},
{
"epoch": 9.0,
"eval_explained_variance": 0.3415004014968872,
"eval_kl_divergence": 1.1432474851608276,
"eval_loss": 0.41668570041656494,
"eval_mae": 0.1524006426334381,
"eval_rmse": 0.21208135783672333,
"eval_runtime": 62.325,
"eval_samples_per_second": 37.77,
"eval_steps_per_second": 2.375,
"learning_rate": 0.001,
"step": 3942
},
{
"epoch": 9.132420091324201,
"grad_norm": 0.2371012270450592,
"learning_rate": 0.001,
"loss": 0.4062,
"step": 4000
},
{
"epoch": 10.0,
"eval_explained_variance": 0.34203192591667175,
"eval_kl_divergence": 0.9120630025863647,
"eval_loss": 0.4186115860939026,
"eval_mae": 0.15351708233356476,
"eval_rmse": 0.2138604372739792,
"eval_runtime": 60.5397,
"eval_samples_per_second": 38.884,
"eval_steps_per_second": 2.445,
"learning_rate": 0.001,
"step": 4380
},
{
"epoch": 10.273972602739725,
"grad_norm": 0.2552158236503601,
"learning_rate": 0.001,
"loss": 0.4052,
"step": 4500
},
{
"epoch": 11.0,
"eval_explained_variance": 0.34416234493255615,
"eval_kl_divergence": 0.995019793510437,
"eval_loss": 0.41557687520980835,
"eval_mae": 0.15356659889221191,
"eval_rmse": 0.2114415019750595,
"eval_runtime": 61.7293,
"eval_samples_per_second": 38.134,
"eval_steps_per_second": 2.398,
"learning_rate": 0.001,
"step": 4818
},
{
"epoch": 11.415525114155251,
"grad_norm": 0.20953956246376038,
"learning_rate": 0.001,
"loss": 0.406,
"step": 5000
},
{
"epoch": 12.0,
"eval_explained_variance": 0.3389909565448761,
"eval_kl_divergence": 1.0105773210525513,
"eval_loss": 0.41883811354637146,
"eval_mae": 0.1555173546075821,
"eval_rmse": 0.21388684213161469,
"eval_runtime": 62.5745,
"eval_samples_per_second": 37.619,
"eval_steps_per_second": 2.365,
"learning_rate": 0.001,
"step": 5256
},
{
"epoch": 12.557077625570777,
"grad_norm": 0.18659397959709167,
"learning_rate": 0.001,
"loss": 0.4058,
"step": 5500
},
{
"epoch": 13.0,
"eval_explained_variance": 0.34248629212379456,
"eval_kl_divergence": 1.1481796503067017,
"eval_loss": 0.41630858182907104,
"eval_mae": 0.15531976521015167,
"eval_rmse": 0.21213315427303314,
"eval_runtime": 61.6003,
"eval_samples_per_second": 38.214,
"eval_steps_per_second": 2.403,
"learning_rate": 0.001,
"step": 5694
},
{
"epoch": 13.698630136986301,
"grad_norm": 0.19523686170578003,
"learning_rate": 0.001,
"loss": 0.4056,
"step": 6000
},
{
"epoch": 14.0,
"eval_explained_variance": 0.3286344110965729,
"eval_kl_divergence": 1.211091160774231,
"eval_loss": 0.4193180799484253,
"eval_mae": 0.15458153188228607,
"eval_rmse": 0.21381880342960358,
"eval_runtime": 62.0339,
"eval_samples_per_second": 37.947,
"eval_steps_per_second": 2.386,
"learning_rate": 0.001,
"step": 6132
},
{
"epoch": 14.840182648401827,
"grad_norm": 0.18541939556598663,
"learning_rate": 0.001,
"loss": 0.4033,
"step": 6500
},
{
"epoch": 15.0,
"eval_explained_variance": 0.3402325212955475,
"eval_kl_divergence": 1.2042615413665771,
"eval_loss": 0.416218638420105,
"eval_mae": 0.15419499576091766,
"eval_rmse": 0.2121332883834839,
"eval_runtime": 62.9591,
"eval_samples_per_second": 37.389,
"eval_steps_per_second": 2.351,
"learning_rate": 0.001,
"step": 6570
},
{
"epoch": 15.981735159817351,
"grad_norm": 0.16085268557071686,
"learning_rate": 0.001,
"loss": 0.4057,
"step": 7000
},
{
"epoch": 16.0,
"eval_explained_variance": 0.35001620650291443,
"eval_kl_divergence": 1.0827727317810059,
"eval_loss": 0.41389620304107666,
"eval_mae": 0.1527981460094452,
"eval_rmse": 0.21022744476795197,
"eval_runtime": 62.4108,
"eval_samples_per_second": 37.718,
"eval_steps_per_second": 2.371,
"learning_rate": 0.001,
"step": 7008
},
{
"epoch": 17.0,
"eval_explained_variance": 0.3429690897464752,
"eval_kl_divergence": 1.0005594491958618,
"eval_loss": 0.4171081781387329,
"eval_mae": 0.15638333559036255,
"eval_rmse": 0.21180683374404907,
"eval_runtime": 63.4048,
"eval_samples_per_second": 37.127,
"eval_steps_per_second": 2.334,
"learning_rate": 0.001,
"step": 7446
},
{
"epoch": 17.123287671232877,
"grad_norm": 0.17030780017375946,
"learning_rate": 0.001,
"loss": 0.405,
"step": 7500
},
{
"epoch": 18.0,
"eval_explained_variance": 0.3499327600002289,
"eval_kl_divergence": 1.0514436960220337,
"eval_loss": 0.4146382212638855,
"eval_mae": 0.1507440060377121,
"eval_rmse": 0.2107054442167282,
"eval_runtime": 64.4758,
"eval_samples_per_second": 36.51,
"eval_steps_per_second": 2.295,
"learning_rate": 0.001,
"step": 7884
},
{
"epoch": 18.264840182648403,
"grad_norm": 0.16620762646198273,
"learning_rate": 0.001,
"loss": 0.4035,
"step": 8000
},
{
"epoch": 19.0,
"eval_explained_variance": 0.3467938005924225,
"eval_kl_divergence": 0.9575299024581909,
"eval_loss": 0.41857486963272095,
"eval_mae": 0.1531781703233719,
"eval_rmse": 0.21135376393795013,
"eval_runtime": 65.1272,
"eval_samples_per_second": 36.145,
"eval_steps_per_second": 2.272,
"learning_rate": 0.001,
"step": 8322
},
{
"epoch": 19.40639269406393,
"grad_norm": 0.21431417763233185,
"learning_rate": 0.001,
"loss": 0.4031,
"step": 8500
},
{
"epoch": 20.0,
"eval_explained_variance": 0.34868308901786804,
"eval_kl_divergence": 1.164780855178833,
"eval_loss": 0.41434723138809204,
"eval_mae": 0.15129883587360382,
"eval_rmse": 0.21083922684192657,
"eval_runtime": 62.809,
"eval_samples_per_second": 37.479,
"eval_steps_per_second": 2.356,
"learning_rate": 0.001,
"step": 8760
},
{
"epoch": 20.54794520547945,
"grad_norm": 0.16674350202083588,
"learning_rate": 0.001,
"loss": 0.4048,
"step": 9000
},
{
"epoch": 21.0,
"eval_explained_variance": 0.3385157585144043,
"eval_kl_divergence": 1.2949873208999634,
"eval_loss": 0.4195358157157898,
"eval_mae": 0.15333952009677887,
"eval_rmse": 0.21233241260051727,
"eval_runtime": 62.2788,
"eval_samples_per_second": 37.798,
"eval_steps_per_second": 2.376,
"learning_rate": 0.001,
"step": 9198
},
{
"epoch": 21.689497716894977,
"grad_norm": 0.2121485322713852,
"learning_rate": 0.001,
"loss": 0.4055,
"step": 9500
},
{
"epoch": 22.0,
"eval_explained_variance": 0.34627434611320496,
"eval_kl_divergence": Infinity,
"eval_loss": 0.4339658319950104,
"eval_mae": 0.15240180492401123,
"eval_rmse": 0.21100641787052155,
"eval_runtime": 63.2767,
"eval_samples_per_second": 37.202,
"eval_steps_per_second": 2.339,
"learning_rate": 0.001,
"step": 9636
},
{
"epoch": 22.831050228310502,
"grad_norm": 0.17502234876155853,
"learning_rate": 0.0001,
"loss": 0.4022,
"step": 10000
},
{
"epoch": 23.0,
"eval_explained_variance": 0.362075537443161,
"eval_kl_divergence": NaN,
"eval_loss": 0.43265336751937866,
"eval_mae": 0.1517171412706375,
"eval_rmse": 0.2084527164697647,
"eval_runtime": 61.7803,
"eval_samples_per_second": 38.103,
"eval_steps_per_second": 2.396,
"learning_rate": 0.0001,
"step": 10074
},
{
"epoch": 23.972602739726028,
"grad_norm": 0.20596392452716827,
"learning_rate": 0.0001,
"loss": 0.3978,
"step": 10500
},
{
"epoch": 24.0,
"eval_explained_variance": 0.3582542836666107,
"eval_kl_divergence": NaN,
"eval_loss": 0.4384593963623047,
"eval_mae": 0.14925144612789154,
"eval_rmse": 0.20924808084964752,
"eval_runtime": 62.266,
"eval_samples_per_second": 37.806,
"eval_steps_per_second": 2.377,
"learning_rate": 0.0001,
"step": 10512
},
{
"epoch": 25.0,
"eval_explained_variance": 0.3649435043334961,
"eval_kl_divergence": Infinity,
"eval_loss": 0.4271779954433441,
"eval_mae": 0.14897416532039642,
"eval_rmse": 0.20736177265644073,
"eval_runtime": 63.0259,
"eval_samples_per_second": 37.35,
"eval_steps_per_second": 2.348,
"learning_rate": 0.0001,
"step": 10950
},
{
"epoch": 25.114155251141554,
"grad_norm": 0.14978627860546112,
"learning_rate": 0.0001,
"loss": 0.3988,
"step": 11000
},
{
"epoch": 26.0,
"eval_explained_variance": 0.36444517970085144,
"eval_kl_divergence": 1.1902661323547363,
"eval_loss": 0.41048941016197205,
"eval_mae": 0.148028165102005,
"eval_rmse": 0.20754428207874298,
"eval_runtime": 62.2088,
"eval_samples_per_second": 37.84,
"eval_steps_per_second": 2.379,
"learning_rate": 0.0001,
"step": 11388
},
{
"epoch": 26.255707762557076,
"grad_norm": 0.13278695940971375,
"learning_rate": 0.0001,
"loss": 0.3958,
"step": 11500
},
{
"epoch": 27.0,
"eval_explained_variance": 0.3687790632247925,
"eval_kl_divergence": 0.9915334582328796,
"eval_loss": 0.4096038341522217,
"eval_mae": 0.1493707150220871,
"eval_rmse": 0.20674215257167816,
"eval_runtime": 63.9932,
"eval_samples_per_second": 36.785,
"eval_steps_per_second": 2.313,
"learning_rate": 0.0001,
"step": 11826
},
{
"epoch": 27.397260273972602,
"grad_norm": 0.16862636804580688,
"learning_rate": 0.0001,
"loss": 0.3965,
"step": 12000
},
{
"epoch": 28.0,
"eval_explained_variance": 0.3680773675441742,
"eval_kl_divergence": 0.9668822288513184,
"eval_loss": 0.4104350507259369,
"eval_mae": 0.1493188589811325,
"eval_rmse": 0.20746104419231415,
"eval_runtime": 64.0647,
"eval_samples_per_second": 36.744,
"eval_steps_per_second": 2.31,
"learning_rate": 0.0001,
"step": 12264
},
{
"epoch": 28.538812785388128,
"grad_norm": 0.16052192449569702,
"learning_rate": 0.0001,
"loss": 0.396,
"step": 12500
},
{
"epoch": 29.0,
"eval_explained_variance": 0.3695773184299469,
"eval_kl_divergence": 1.0432541370391846,
"eval_loss": 0.40966179966926575,
"eval_mae": 0.1468651443719864,
"eval_rmse": 0.20694835484027863,
"eval_runtime": 63.2767,
"eval_samples_per_second": 37.202,
"eval_steps_per_second": 2.339,
"learning_rate": 0.0001,
"step": 12702
},
{
"epoch": 29.680365296803654,
"grad_norm": 0.14418508112430573,
"learning_rate": 0.0001,
"loss": 0.3936,
"step": 13000
},
{
"epoch": 30.0,
"eval_explained_variance": 0.373136430978775,
"eval_kl_divergence": 0.908222496509552,
"eval_loss": 0.4094092547893524,
"eval_mae": 0.14899054169654846,
"eval_rmse": 0.20645444095134735,
"eval_runtime": 62.5038,
"eval_samples_per_second": 37.662,
"eval_steps_per_second": 2.368,
"learning_rate": 0.0001,
"step": 13140
},
{
"epoch": 30.82191780821918,
"grad_norm": 0.19649599492549896,
"learning_rate": 0.0001,
"loss": 0.3944,
"step": 13500
},
{
"epoch": 31.0,
"eval_explained_variance": 0.3705109655857086,
"eval_kl_divergence": 1.0120004415512085,
"eval_loss": 0.40909385681152344,
"eval_mae": 0.14699043333530426,
"eval_rmse": 0.20654882490634918,
"eval_runtime": 63.2971,
"eval_samples_per_second": 37.19,
"eval_steps_per_second": 2.338,
"learning_rate": 0.0001,
"step": 13578
},
{
"epoch": 31.963470319634702,
"grad_norm": 0.228424534201622,
"learning_rate": 0.0001,
"loss": 0.3941,
"step": 14000
},
{
"epoch": 32.0,
"eval_explained_variance": 0.37417080998420715,
"eval_kl_divergence": 0.9708234071731567,
"eval_loss": 0.4084269404411316,
"eval_mae": 0.14826728403568268,
"eval_rmse": 0.2059999257326126,
"eval_runtime": 64.3761,
"eval_samples_per_second": 36.566,
"eval_steps_per_second": 2.299,
"learning_rate": 0.0001,
"step": 14016
},
{
"epoch": 33.0,
"eval_explained_variance": 0.37551748752593994,
"eval_kl_divergence": 0.9317126870155334,
"eval_loss": 0.40824124217033386,
"eval_mae": 0.14738227427005768,
"eval_rmse": 0.20570062100887299,
"eval_runtime": 63.4848,
"eval_samples_per_second": 37.08,
"eval_steps_per_second": 2.331,
"learning_rate": 0.0001,
"step": 14454
},
{
"epoch": 33.10502283105023,
"grad_norm": 0.2595873773097992,
"learning_rate": 0.0001,
"loss": 0.3933,
"step": 14500
},
{
"epoch": 34.0,
"eval_explained_variance": 0.37467464804649353,
"eval_kl_divergence": 0.9618669748306274,
"eval_loss": 0.40851354598999023,
"eval_mae": 0.14805640280246735,
"eval_rmse": 0.20609329640865326,
"eval_runtime": 65.3615,
"eval_samples_per_second": 36.015,
"eval_steps_per_second": 2.264,
"learning_rate": 0.0001,
"step": 14892
},
{
"epoch": 34.24657534246575,
"grad_norm": 0.26568445563316345,
"learning_rate": 0.0001,
"loss": 0.3926,
"step": 15000
},
{
"epoch": 35.0,
"eval_explained_variance": 0.375776082277298,
"eval_kl_divergence": 1.0522711277008057,
"eval_loss": 0.4072923958301544,
"eval_mae": 0.14664247632026672,
"eval_rmse": 0.20538650453090668,
"eval_runtime": 64.7697,
"eval_samples_per_second": 36.344,
"eval_steps_per_second": 2.285,
"learning_rate": 0.0001,
"step": 15330
},
{
"epoch": 35.38812785388128,
"grad_norm": 0.15931576490402222,
"learning_rate": 0.0001,
"loss": 0.3936,
"step": 15500
},
{
"epoch": 36.0,
"eval_explained_variance": 0.3770906925201416,
"eval_kl_divergence": 1.0621892213821411,
"eval_loss": 0.40741708874702454,
"eval_mae": 0.1460237056016922,
"eval_rmse": 0.20519912242889404,
"eval_runtime": 64.23,
"eval_samples_per_second": 36.65,
"eval_steps_per_second": 2.304,
"learning_rate": 0.0001,
"step": 15768
},
{
"epoch": 36.529680365296805,
"grad_norm": 0.22164444625377655,
"learning_rate": 0.0001,
"loss": 0.3935,
"step": 16000
},
{
"epoch": 37.0,
"eval_explained_variance": 0.38024798035621643,
"eval_kl_divergence": 1.020066261291504,
"eval_loss": 0.40657544136047363,
"eval_mae": 0.1456020027399063,
"eval_rmse": 0.20468135178089142,
"eval_runtime": 63.8016,
"eval_samples_per_second": 36.896,
"eval_steps_per_second": 2.32,
"learning_rate": 0.0001,
"step": 16206
},
{
"epoch": 37.67123287671233,
"grad_norm": 0.2097047120332718,
"learning_rate": 0.0001,
"loss": 0.3927,
"step": 16500
},
{
"epoch": 38.0,
"eval_explained_variance": 0.3799835741519928,
"eval_kl_divergence": 1.0557153224945068,
"eval_loss": 0.406360387802124,
"eval_mae": 0.14585663378238678,
"eval_rmse": 0.20454762876033783,
"eval_runtime": 63.2021,
"eval_samples_per_second": 37.246,
"eval_steps_per_second": 2.342,
"learning_rate": 0.0001,
"step": 16644
},
{
"epoch": 38.81278538812786,
"grad_norm": 0.34068891406059265,
"learning_rate": 0.0001,
"loss": 0.392,
"step": 17000
},
{
"epoch": 39.0,
"eval_explained_variance": 0.377095103263855,
"eval_kl_divergence": 1.005536675453186,
"eval_loss": 0.4077896773815155,
"eval_mae": 0.14692139625549316,
"eval_rmse": 0.2055957317352295,
"eval_runtime": 62.5136,
"eval_samples_per_second": 37.656,
"eval_steps_per_second": 2.367,
"learning_rate": 0.0001,
"step": 17082
},
{
"epoch": 39.954337899543376,
"grad_norm": 0.23111671209335327,
"learning_rate": 0.0001,
"loss": 0.3915,
"step": 17500
},
{
"epoch": 40.0,
"eval_explained_variance": 0.38054999709129333,
"eval_kl_divergence": 0.9849128723144531,
"eval_loss": 0.4068063199520111,
"eval_mae": 0.14637430012226105,
"eval_rmse": 0.20490336418151855,
"eval_runtime": 62.8552,
"eval_samples_per_second": 37.451,
"eval_steps_per_second": 2.355,
"learning_rate": 0.0001,
"step": 17520
},
{
"epoch": 41.0,
"eval_explained_variance": 0.3777576982975006,
"eval_kl_divergence": 0.899895191192627,
"eval_loss": 0.40890073776245117,
"eval_mae": 0.1488751471042633,
"eval_rmse": 0.20631897449493408,
"eval_runtime": 63.9481,
"eval_samples_per_second": 36.811,
"eval_steps_per_second": 2.314,
"learning_rate": 0.0001,
"step": 17958
},
{
"epoch": 41.0958904109589,
"grad_norm": 0.28402578830718994,
"learning_rate": 0.0001,
"loss": 0.3907,
"step": 18000
},
{
"epoch": 42.0,
"eval_explained_variance": 0.37971171736717224,
"eval_kl_divergence": 1.0616570711135864,
"eval_loss": 0.4068816602230072,
"eval_mae": 0.14634381234645844,
"eval_rmse": 0.20491831004619598,
"eval_runtime": 63.1884,
"eval_samples_per_second": 37.254,
"eval_steps_per_second": 2.342,
"learning_rate": 0.0001,
"step": 18396
},
{
"epoch": 42.23744292237443,
"grad_norm": 0.24103382229804993,
"learning_rate": 0.0001,
"loss": 0.3919,
"step": 18500
},
{
"epoch": 43.0,
"eval_explained_variance": 0.3829738199710846,
"eval_kl_divergence": 1.0520097017288208,
"eval_loss": 0.40578988194465637,
"eval_mae": 0.14498426020145416,
"eval_rmse": 0.2040938138961792,
"eval_runtime": 64.2301,
"eval_samples_per_second": 36.649,
"eval_steps_per_second": 2.304,
"learning_rate": 0.0001,
"step": 18834
},
{
"epoch": 43.37899543378995,
"grad_norm": 0.3461155891418457,
"learning_rate": 0.0001,
"loss": 0.3902,
"step": 19000
},
{
"epoch": 44.0,
"eval_explained_variance": 0.3809111416339874,
"eval_kl_divergence": 1.0053679943084717,
"eval_loss": 0.4070681035518646,
"eval_mae": 0.14748047292232513,
"eval_rmse": 0.20503848791122437,
"eval_runtime": 63.682,
"eval_samples_per_second": 36.965,
"eval_steps_per_second": 2.324,
"learning_rate": 0.0001,
"step": 19272
},
{
"epoch": 44.52054794520548,
"grad_norm": 0.21600213646888733,
"learning_rate": 0.0001,
"loss": 0.3896,
"step": 19500
},
{
"epoch": 45.0,
"eval_explained_variance": 0.38130107522010803,
"eval_kl_divergence": 1.13860285282135,
"eval_loss": 0.40669572353363037,
"eval_mae": 0.14402073621749878,
"eval_rmse": 0.2047145813703537,
"eval_runtime": 61.9143,
"eval_samples_per_second": 38.02,
"eval_steps_per_second": 2.39,
"learning_rate": 0.0001,
"step": 19710
},
{
"epoch": 45.662100456621005,
"grad_norm": 0.2100251168012619,
"learning_rate": 0.0001,
"loss": 0.3925,
"step": 20000
},
{
"epoch": 46.0,
"eval_explained_variance": 0.3830677270889282,
"eval_kl_divergence": 1.0252840518951416,
"eval_loss": 0.40670666098594666,
"eval_mae": 0.14572028815746307,
"eval_rmse": 0.20469875633716583,
"eval_runtime": 61.3533,
"eval_samples_per_second": 38.368,
"eval_steps_per_second": 2.412,
"learning_rate": 0.0001,
"step": 20148
},
{
"epoch": 46.80365296803653,
"grad_norm": 0.16854612529277802,
"learning_rate": 0.0001,
"loss": 0.3896,
"step": 20500
},
{
"epoch": 47.0,
"eval_explained_variance": 0.3834179639816284,
"eval_kl_divergence": 1.0430312156677246,
"eval_loss": 0.4062415659427643,
"eval_mae": 0.14726205170154572,
"eval_rmse": 0.20429861545562744,
"eval_runtime": 62.7532,
"eval_samples_per_second": 37.512,
"eval_steps_per_second": 2.358,
"learning_rate": 0.0001,
"step": 20586
},
{
"epoch": 47.945205479452056,
"grad_norm": 0.2040056735277176,
"learning_rate": 0.0001,
"loss": 0.3902,
"step": 21000
},
{
"epoch": 48.0,
"eval_explained_variance": 0.38119378685951233,
"eval_kl_divergence": 1.104145884513855,
"eval_loss": 0.4064981937408447,
"eval_mae": 0.14571230113506317,
"eval_rmse": 0.20479492843151093,
"eval_runtime": 66.5743,
"eval_samples_per_second": 35.359,
"eval_steps_per_second": 2.223,
"learning_rate": 0.0001,
"step": 21024
},
{
"epoch": 49.0,
"eval_explained_variance": 0.37976840138435364,
"eval_kl_divergence": 1.0702213048934937,
"eval_loss": 0.40709760785102844,
"eval_mae": 0.14625640213489532,
"eval_rmse": 0.20520327985286713,
"eval_runtime": 62.1191,
"eval_samples_per_second": 37.895,
"eval_steps_per_second": 2.383,
"learning_rate": 0.0001,
"step": 21462
},
{
"epoch": 49.08675799086758,
"grad_norm": 0.2242765724658966,
"learning_rate": 1e-05,
"loss": 0.3897,
"step": 21500
},
{
"epoch": 50.0,
"eval_explained_variance": 0.38569536805152893,
"eval_kl_divergence": 0.8917386531829834,
"eval_loss": 0.40644556283950806,
"eval_mae": 0.1479080468416214,
"eval_rmse": 0.2042473703622818,
"eval_runtime": 62.3011,
"eval_samples_per_second": 37.784,
"eval_steps_per_second": 2.376,
"learning_rate": 1e-05,
"step": 21900
},
{
"epoch": 50.22831050228311,
"grad_norm": 0.21291576325893402,
"learning_rate": 1e-05,
"loss": 0.3875,
"step": 22000
},
{
"epoch": 51.0,
"eval_explained_variance": 0.3844810426235199,
"eval_kl_divergence": 0.9960101842880249,
"eval_loss": 0.40579161047935486,
"eval_mae": 0.14372152090072632,
"eval_rmse": 0.20405276119709015,
"eval_runtime": 61.2114,
"eval_samples_per_second": 38.457,
"eval_steps_per_second": 2.418,
"learning_rate": 1e-05,
"step": 22338
},
{
"epoch": 51.36986301369863,
"grad_norm": 0.24317112565040588,
"learning_rate": 1e-05,
"loss": 0.3874,
"step": 22500
},
{
"epoch": 52.0,
"eval_explained_variance": 0.385125994682312,
"eval_kl_divergence": 1.0567286014556885,
"eval_loss": 0.40528106689453125,
"eval_mae": 0.14458806812763214,
"eval_rmse": 0.20368923246860504,
"eval_runtime": 62.8042,
"eval_samples_per_second": 37.482,
"eval_steps_per_second": 2.357,
"learning_rate": 1e-05,
"step": 22776
},
{
"epoch": 52.51141552511415,
"grad_norm": 0.30417612195014954,
"learning_rate": 1e-05,
"loss": 0.3899,
"step": 23000
},
{
"epoch": 53.0,
"eval_explained_variance": 0.3858625590801239,
"eval_kl_divergence": 1.0205212831497192,
"eval_loss": 0.4056229293346405,
"eval_mae": 0.14624176919460297,
"eval_rmse": 0.20387189090251923,
"eval_runtime": 62.9117,
"eval_samples_per_second": 37.418,
"eval_steps_per_second": 2.353,
"learning_rate": 1e-05,
"step": 23214
},
{
"epoch": 53.65296803652968,
"grad_norm": 0.24982061982154846,
"learning_rate": 1e-05,
"loss": 0.3892,
"step": 23500
},
{
"epoch": 54.0,
"eval_explained_variance": 0.3853992521762848,
"eval_kl_divergence": 0.9905322194099426,
"eval_loss": 0.4058997631072998,
"eval_mae": 0.14412301778793335,
"eval_rmse": 0.20410750806331635,
"eval_runtime": 63.4824,
"eval_samples_per_second": 37.081,
"eval_steps_per_second": 2.331,
"learning_rate": 1e-05,
"step": 23652
},
{
"epoch": 54.794520547945204,
"grad_norm": 0.2903271019458771,
"learning_rate": 1e-05,
"loss": 0.3892,
"step": 24000
},
{
"epoch": 55.0,
"eval_explained_variance": 0.38560736179351807,
"eval_kl_divergence": 0.937917947769165,
"eval_loss": 0.4060685932636261,
"eval_mae": 0.1471087485551834,
"eval_rmse": 0.20407529175281525,
"eval_runtime": 64.4026,
"eval_samples_per_second": 36.551,
"eval_steps_per_second": 2.298,
"learning_rate": 1e-05,
"step": 24090
},
{
"epoch": 55.93607305936073,
"grad_norm": 0.2701994776725769,
"learning_rate": 1e-05,
"loss": 0.3869,
"step": 24500
},
{
"epoch": 56.0,
"eval_explained_variance": 0.3853694200515747,
"eval_kl_divergence": 0.9695614576339722,
"eval_loss": 0.40592971444129944,
"eval_mae": 0.14540034532546997,
"eval_rmse": 0.20410047471523285,
"eval_runtime": 63.4818,
"eval_samples_per_second": 37.081,
"eval_steps_per_second": 2.331,
"learning_rate": 1e-05,
"step": 24528
},
{
"epoch": 57.0,
"eval_explained_variance": 0.3842361867427826,
"eval_kl_divergence": 1.0590680837631226,
"eval_loss": 0.4058408737182617,
"eval_mae": 0.1459987610578537,
"eval_rmse": 0.20412230491638184,
"eval_runtime": 62.5651,
"eval_samples_per_second": 37.625,
"eval_steps_per_second": 2.366,
"learning_rate": 1e-05,
"step": 24966
},
{
"epoch": 57.077625570776256,
"grad_norm": 0.20055490732192993,
"learning_rate": 1e-05,
"loss": 0.3874,
"step": 25000
},
{
"epoch": 58.0,
"eval_explained_variance": 0.38601794838905334,
"eval_kl_divergence": 0.9275628328323364,
"eval_loss": 0.4063320457935333,
"eval_mae": 0.14603658020496368,
"eval_rmse": 0.20428447425365448,
"eval_runtime": 62.6353,
"eval_samples_per_second": 37.583,
"eval_steps_per_second": 2.363,
"learning_rate": 1e-05,
"step": 25404
},
{
"epoch": 58.21917808219178,
"grad_norm": 0.24670056998729706,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.3887,
"step": 25500
},
{
"epoch": 59.0,
"eval_explained_variance": 0.3867626488208771,
"eval_kl_divergence": 0.9793874621391296,
"eval_loss": 0.4056239724159241,
"eval_mae": 0.14530591666698456,
"eval_rmse": 0.20382745563983917,
"eval_runtime": 63.6318,
"eval_samples_per_second": 36.994,
"eval_steps_per_second": 2.326,
"learning_rate": 1.0000000000000002e-06,
"step": 25842
},
{
"epoch": 59.36073059360731,
"grad_norm": 0.27373573184013367,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.3882,
"step": 26000
},
{
"epoch": 60.0,
"eval_explained_variance": 0.3851200044155121,
"eval_kl_divergence": 1.0348856449127197,
"eval_loss": 0.40571752190589905,
"eval_mae": 0.1446085125207901,
"eval_rmse": 0.20402370393276215,
"eval_runtime": 63.8531,
"eval_samples_per_second": 36.866,
"eval_steps_per_second": 2.318,
"learning_rate": 1.0000000000000002e-06,
"step": 26280
},
{
"epoch": 60.50228310502283,
"grad_norm": 0.23867332935333252,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.389,
"step": 26500
},
{
"epoch": 61.0,
"eval_explained_variance": 0.38573384284973145,
"eval_kl_divergence": 0.9859956502914429,
"eval_loss": 0.4058452248573303,
"eval_mae": 0.14494158327579498,
"eval_rmse": 0.2040751427412033,
"eval_runtime": 61.8751,
"eval_samples_per_second": 38.044,
"eval_steps_per_second": 2.392,
"learning_rate": 1.0000000000000002e-06,
"step": 26718
},
{
"epoch": 61.64383561643836,
"grad_norm": 0.21306726336479187,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.3882,
"step": 27000
},
{
"epoch": 62.0,
"eval_explained_variance": 0.3864554166793823,
"eval_kl_divergence": 0.9528394937515259,
"eval_loss": 0.4054276943206787,
"eval_mae": 0.14455263316631317,
"eval_rmse": 0.20368416607379913,
"eval_runtime": 61.7886,
"eval_samples_per_second": 38.098,
"eval_steps_per_second": 2.395,
"learning_rate": 1.0000000000000002e-06,
"step": 27156
},
{
"epoch": 62.0,
"learning_rate": 1.0000000000000002e-06,
"step": 27156,
"total_flos": 6.42634409963284e+19,
"train_loss": 0.3985773164651095,
"train_runtime": 16834.9641,
"train_samples_per_second": 62.397,
"train_steps_per_second": 3.903
}
],
"logging_steps": 500,
"max_steps": 65700,
"num_input_tokens_seen": 0,
"num_train_epochs": 150,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 10,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.42634409963284e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}