{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 7.988183161004431,
  "eval_steps": 500,
  "global_step": 1352,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.059084194977843424,
      "grad_norm": 0.6387189059092414,
      "learning_rate": 9.998650208062712e-05,
      "loss": 0.6972,
      "step": 10
    },
    {
      "epoch": 0.11816838995568685,
      "grad_norm": 0.48601036335500336,
      "learning_rate": 9.994601561026155e-05,
      "loss": 0.5423,
      "step": 20
    },
    {
      "epoch": 0.17725258493353027,
      "grad_norm": 0.40437591619558344,
      "learning_rate": 9.98785624482278e-05,
      "loss": 0.5149,
      "step": 30
    },
    {
      "epoch": 0.2363367799113737,
      "grad_norm": 0.5030211090458666,
      "learning_rate": 9.978417901361958e-05,
      "loss": 0.513,
      "step": 40
    },
    {
      "epoch": 0.29542097488921715,
      "grad_norm": 0.49098083558596206,
      "learning_rate": 9.96629162656365e-05,
      "loss": 0.4957,
      "step": 50
    },
    {
      "epoch": 0.35450516986706054,
      "grad_norm": 0.6329012343929873,
      "learning_rate": 9.951483967607041e-05,
      "loss": 0.495,
      "step": 60
    },
    {
      "epoch": 0.413589364844904,
      "grad_norm": 0.48252149699962754,
      "learning_rate": 9.934002919395592e-05,
      "loss": 0.4943,
      "step": 70
    },
    {
      "epoch": 0.4726735598227474,
      "grad_norm": 0.5221971068930835,
      "learning_rate": 9.91385792024048e-05,
      "loss": 0.4738,
      "step": 80
    },
    {
      "epoch": 0.5317577548005908,
      "grad_norm": 0.42549092560536134,
      "learning_rate": 9.891059846764679e-05,
      "loss": 0.4565,
      "step": 90
    },
    {
      "epoch": 0.5908419497784343,
      "grad_norm": 0.4606171719652433,
      "learning_rate": 9.865621008030492e-05,
      "loss": 0.4674,
      "step": 100
    },
    {
      "epoch": 0.6499261447562777,
      "grad_norm": 0.46353540699946943,
      "learning_rate": 9.83755513889369e-05,
      "loss": 0.4727,
      "step": 110
    },
    {
      "epoch": 0.7090103397341211,
      "grad_norm": 0.4815928480268326,
      "learning_rate": 9.80687739258782e-05,
      "loss": 0.4736,
      "step": 120
    },
    {
      "epoch": 0.7680945347119645,
      "grad_norm": 0.485838392040906,
      "learning_rate": 9.773604332542729e-05,
      "loss": 0.47,
      "step": 130
    },
    {
      "epoch": 0.827178729689808,
      "grad_norm": 0.5025850953484241,
      "learning_rate": 9.737753923441688e-05,
      "loss": 0.467,
      "step": 140
    },
    {
      "epoch": 0.8862629246676514,
      "grad_norm": 0.45997226542102815,
      "learning_rate": 9.69934552152196e-05,
      "loss": 0.4522,
      "step": 150
    },
    {
      "epoch": 0.9453471196454948,
      "grad_norm": 0.4546309634161405,
      "learning_rate": 9.658399864124037e-05,
      "loss": 0.4613,
      "step": 160
    },
    {
      "epoch": 1.0044313146233383,
      "grad_norm": 0.6577670848472944,
      "learning_rate": 9.61493905849521e-05,
      "loss": 0.4641,
      "step": 170
    },
    {
      "epoch": 1.0635155096011817,
      "grad_norm": 0.5743711541659764,
      "learning_rate": 9.568986569853487e-05,
      "loss": 0.3946,
      "step": 180
    },
    {
      "epoch": 1.122599704579025,
      "grad_norm": 0.5374571463070855,
      "learning_rate": 9.520567208718337e-05,
      "loss": 0.3882,
      "step": 190
    },
    {
      "epoch": 1.1816838995568686,
      "grad_norm": 0.6531134738909915,
      "learning_rate": 9.469707117515067e-05,
      "loss": 0.4205,
      "step": 200
    },
    {
      "epoch": 1.240768094534712,
      "grad_norm": 0.5201345622237219,
      "learning_rate": 9.416433756460091e-05,
      "loss": 0.386,
      "step": 210
    },
    {
      "epoch": 1.2998522895125553,
      "grad_norm": 0.7267530776250423,
      "learning_rate": 9.360775888734698e-05,
      "loss": 0.4096,
      "step": 220
    },
    {
      "epoch": 1.3589364844903988,
      "grad_norm": 0.5480147887972682,
      "learning_rate": 9.302763564955331e-05,
      "loss": 0.3921,
      "step": 230
    },
    {
      "epoch": 1.4180206794682422,
      "grad_norm": 0.6228502280992079,
      "learning_rate": 9.242428106948749e-05,
      "loss": 0.3788,
      "step": 240
    },
    {
      "epoch": 1.4771048744460857,
      "grad_norm": 0.7119116813192373,
      "learning_rate": 9.179802090840853e-05,
      "loss": 0.3894,
      "step": 250
    },
    {
      "epoch": 1.536189069423929,
      "grad_norm": 0.5987495739217404,
      "learning_rate": 9.114919329468282e-05,
      "loss": 0.3707,
      "step": 260
    },
    {
      "epoch": 1.5952732644017726,
      "grad_norm": 0.6278387167291306,
      "learning_rate": 9.04781485412231e-05,
      "loss": 0.3938,
      "step": 270
    },
    {
      "epoch": 1.654357459379616,
      "grad_norm": 0.5312391005336979,
      "learning_rate": 8.978524895634842e-05,
      "loss": 0.3799,
      "step": 280
    },
    {
      "epoch": 1.7134416543574593,
      "grad_norm": 0.7329625126762797,
      "learning_rate": 8.907086864816803e-05,
      "loss": 0.403,
      "step": 290
    },
    {
      "epoch": 1.7725258493353029,
      "grad_norm": 0.5600401734580108,
      "learning_rate": 8.833539332259398e-05,
      "loss": 0.3758,
      "step": 300
    },
    {
      "epoch": 1.8316100443131462,
      "grad_norm": 0.503395394356135,
      "learning_rate": 8.757922007509207e-05,
      "loss": 0.3963,
      "step": 310
    },
    {
      "epoch": 1.8906942392909896,
      "grad_norm": 0.6324775953771359,
      "learning_rate": 8.680275717628337e-05,
      "loss": 0.3858,
      "step": 320
    },
    {
      "epoch": 1.9497784342688331,
      "grad_norm": 0.5574107174736728,
      "learning_rate": 8.600642385151205e-05,
      "loss": 0.3799,
      "step": 330
    },
    {
      "epoch": 2.0088626292466767,
      "grad_norm": 0.5250864362886176,
      "learning_rate": 8.519065005449858e-05,
      "loss": 0.3763,
      "step": 340
    },
    {
      "epoch": 2.06794682422452,
      "grad_norm": 0.7754155552722259,
      "learning_rate": 8.43558762352005e-05,
      "loss": 0.2934,
      "step": 350
    },
    {
      "epoch": 2.1270310192023634,
      "grad_norm": 0.6889925166837503,
      "learning_rate": 8.350255310200612e-05,
      "loss": 0.3078,
      "step": 360
    },
    {
      "epoch": 2.186115214180207,
      "grad_norm": 0.6281519102970855,
      "learning_rate": 8.263114137838947e-05,
      "loss": 0.3028,
      "step": 370
    },
    {
      "epoch": 2.24519940915805,
      "grad_norm": 0.7619069663070173,
      "learning_rate": 8.174211155415799e-05,
      "loss": 0.2972,
      "step": 380
    },
    {
      "epoch": 2.3042836041358936,
      "grad_norm": 0.6469214076306162,
      "learning_rate": 8.083594363142717e-05,
      "loss": 0.2995,
      "step": 390
    },
    {
      "epoch": 2.363367799113737,
      "grad_norm": 0.7761443706661119,
      "learning_rate": 7.991312686545937e-05,
      "loss": 0.2963,
      "step": 400
    },
    {
      "epoch": 2.4224519940915803,
      "grad_norm": 0.6652219653359559,
      "learning_rate": 7.897415950050676e-05,
      "loss": 0.2987,
      "step": 410
    },
    {
      "epoch": 2.481536189069424,
      "grad_norm": 0.7586700003856579,
      "learning_rate": 7.801954850080075e-05,
      "loss": 0.3092,
      "step": 420
    },
    {
      "epoch": 2.5406203840472674,
      "grad_norm": 0.6663785963305503,
      "learning_rate": 7.704980927683359e-05,
      "loss": 0.2951,
      "step": 430
    },
    {
      "epoch": 2.5997045790251105,
      "grad_norm": 0.6913254608481674,
      "learning_rate": 7.60654654070796e-05,
      "loss": 0.3097,
      "step": 440
    },
    {
      "epoch": 2.658788774002954,
      "grad_norm": 0.6943652391419598,
      "learning_rate": 7.506704835530634e-05,
      "loss": 0.2999,
      "step": 450
    },
    {
      "epoch": 2.7178729689807977,
      "grad_norm": 0.644096596125316,
      "learning_rate": 7.405509718362842e-05,
      "loss": 0.2905,
      "step": 460
    },
    {
      "epoch": 2.7769571639586412,
      "grad_norm": 0.7660517390503399,
      "learning_rate": 7.303015826145885e-05,
      "loss": 0.309,
      "step": 470
    },
    {
      "epoch": 2.8360413589364843,
      "grad_norm": 0.7708918433168639,
      "learning_rate": 7.199278497051498e-05,
      "loss": 0.302,
      "step": 480
    },
    {
      "epoch": 2.895125553914328,
      "grad_norm": 0.626692120748867,
      "learning_rate": 7.094353740603839e-05,
      "loss": 0.297,
      "step": 490
    },
    {
      "epoch": 2.9542097488921715,
      "grad_norm": 0.7926892798861292,
      "learning_rate": 6.988298207439021e-05,
      "loss": 0.3101,
      "step": 500
    },
    {
      "epoch": 2.9542097488921715,
      "eval_loss": 0.4939613938331604,
      "eval_runtime": 53.5552,
      "eval_samples_per_second": 2.073,
      "eval_steps_per_second": 0.523,
      "step": 500
    },
    {
      "epoch": 3.0132939438700146,
      "grad_norm": 0.5747285632843896,
      "learning_rate": 6.881169158718474e-05,
      "loss": 0.2736,
      "step": 510
    },
    {
      "epoch": 3.072378138847858,
      "grad_norm": 0.7661842329328333,
      "learning_rate": 6.773024435212678e-05,
      "loss": 0.2187,
      "step": 520
    },
    {
      "epoch": 3.1314623338257017,
      "grad_norm": 0.8401289392682185,
      "learning_rate": 6.663922426071977e-05,
      "loss": 0.2057,
      "step": 530
    },
    {
      "epoch": 3.1905465288035453,
      "grad_norm": 0.7506650372127406,
      "learning_rate": 6.553922037301283e-05,
      "loss": 0.2067,
      "step": 540
    },
    {
      "epoch": 3.2496307237813884,
      "grad_norm": 0.7842941173009434,
      "learning_rate": 6.443082659955738e-05,
      "loss": 0.1989,
      "step": 550
    },
    {
      "epoch": 3.308714918759232,
      "grad_norm": 0.7593744868915973,
      "learning_rate": 6.331464138074493e-05,
      "loss": 0.2179,
      "step": 560
    },
    {
      "epoch": 3.3677991137370755,
      "grad_norm": 0.756091123285611,
      "learning_rate": 6.219126736369903e-05,
      "loss": 0.2176,
      "step": 570
    },
    {
      "epoch": 3.4268833087149186,
      "grad_norm": 0.7869512689224245,
      "learning_rate": 6.106131107689599e-05,
      "loss": 0.2215,
      "step": 580
    },
    {
      "epoch": 3.485967503692762,
      "grad_norm": 0.7636724676762149,
      "learning_rate": 5.9925382602689974e-05,
      "loss": 0.2153,
      "step": 590
    },
    {
      "epoch": 3.5450516986706058,
      "grad_norm": 0.6815318566179079,
      "learning_rate": 5.8784095247919305e-05,
      "loss": 0.2133,
      "step": 600
    },
    {
      "epoch": 3.604135893648449,
      "grad_norm": 0.7848948829508617,
      "learning_rate": 5.763806521277184e-05,
      "loss": 0.2109,
      "step": 610
    },
    {
      "epoch": 3.6632200886262924,
      "grad_norm": 0.7715200213470335,
      "learning_rate": 5.648791125808809e-05,
      "loss": 0.2214,
      "step": 620
    },
    {
      "epoch": 3.722304283604136,
      "grad_norm": 0.6480973448749833,
      "learning_rate": 5.5334254371281934e-05,
      "loss": 0.212,
      "step": 630
    },
    {
      "epoch": 3.781388478581979,
      "grad_norm": 0.6618796234383102,
      "learning_rate": 5.417771743105907e-05,
      "loss": 0.2196,
      "step": 640
    },
    {
      "epoch": 3.8404726735598227,
      "grad_norm": 0.9462710180746308,
      "learning_rate": 5.3018924871114305e-05,
      "loss": 0.2145,
      "step": 650
    },
    {
      "epoch": 3.8995568685376663,
      "grad_norm": 0.7674586984793125,
      "learning_rate": 5.185850234298942e-05,
      "loss": 0.2199,
      "step": 660
    },
    {
      "epoch": 3.9586410635155094,
      "grad_norm": 0.8420752213128357,
      "learning_rate": 5.0697076378273354e-05,
      "loss": 0.218,
      "step": 670
    },
    {
      "epoch": 4.017725258493353,
      "grad_norm": 0.6235542870137168,
      "learning_rate": 4.953527405032723e-05,
      "loss": 0.1987,
      "step": 680
    },
    {
      "epoch": 4.0768094534711965,
      "grad_norm": 0.8247620064480204,
      "learning_rate": 4.8373722635717086e-05,
      "loss": 0.1425,
      "step": 690
    },
    {
      "epoch": 4.13589364844904,
      "grad_norm": 0.806070554149328,
      "learning_rate": 4.721304927553658e-05,
      "loss": 0.1313,
      "step": 700
    },
    {
      "epoch": 4.194977843426884,
      "grad_norm": 1.1960115947293068,
      "learning_rate": 4.60538806368031e-05,
      "loss": 0.1397,
      "step": 710
    },
    {
      "epoch": 4.254062038404727,
      "grad_norm": 0.7571134276917469,
      "learning_rate": 4.489684257410958e-05,
      "loss": 0.1421,
      "step": 720
    },
    {
      "epoch": 4.31314623338257,
      "grad_norm": 0.8865025891479803,
      "learning_rate": 4.374255979171538e-05,
      "loss": 0.1386,
      "step": 730
    },
    {
      "epoch": 4.372230428360414,
      "grad_norm": 0.9290220213968287,
      "learning_rate": 4.2591655506257645e-05,
      "loss": 0.1444,
      "step": 740
    },
    {
      "epoch": 4.431314623338257,
      "grad_norm": 0.8896418597603776,
      "learning_rate": 4.144475111026643e-05,
      "loss": 0.1391,
      "step": 750
    },
    {
      "epoch": 4.4903988183161,
      "grad_norm": 0.9053211649213782,
      "learning_rate": 4.030246583666437e-05,
      "loss": 0.1438,
      "step": 760
    },
    {
      "epoch": 4.549483013293944,
      "grad_norm": 0.8453862385052026,
      "learning_rate": 3.9165416424432414e-05,
      "loss": 0.1415,
      "step": 770
    },
    {
      "epoch": 4.608567208271787,
      "grad_norm": 0.8724405655899441,
      "learning_rate": 3.803421678562213e-05,
      "loss": 0.1492,
      "step": 780
    },
    {
      "epoch": 4.66765140324963,
      "grad_norm": 0.98302153579186,
      "learning_rate": 3.690947767389426e-05,
      "loss": 0.1512,
      "step": 790
    },
    {
      "epoch": 4.726735598227474,
      "grad_norm": 0.8137008256198869,
      "learning_rate": 3.57918063547627e-05,
      "loss": 0.1481,
      "step": 800
    },
    {
      "epoch": 4.7858197932053175,
      "grad_norm": 0.8604546990181478,
      "learning_rate": 3.468180627772144e-05,
      "loss": 0.1418,
      "step": 810
    },
    {
      "epoch": 4.844903988183161,
      "grad_norm": 0.8028241587093687,
      "learning_rate": 3.358007675043224e-05,
      "loss": 0.146,
      "step": 820
    },
    {
      "epoch": 4.903988183161005,
      "grad_norm": 0.8209334713352179,
      "learning_rate": 3.2487212615148316e-05,
      "loss": 0.1407,
      "step": 830
    },
    {
      "epoch": 4.963072378138848,
      "grad_norm": 0.9318035909918932,
      "learning_rate": 3.1403803927549006e-05,
      "loss": 0.1502,
      "step": 840
    },
    {
      "epoch": 5.022156573116692,
      "grad_norm": 0.6982555180872486,
      "learning_rate": 3.0330435638158806e-05,
      "loss": 0.1322,
      "step": 850
    },
    {
      "epoch": 5.081240768094535,
      "grad_norm": 0.9099529668785485,
      "learning_rate": 2.9267687276522876e-05,
      "loss": 0.0985,
      "step": 860
    },
    {
      "epoch": 5.140324963072378,
      "grad_norm": 0.8093436219539469,
      "learning_rate": 2.821613263830912e-05,
      "loss": 0.0929,
      "step": 870
    },
    {
      "epoch": 5.199409158050222,
      "grad_norm": 0.908675419764728,
      "learning_rate": 2.717633947550651e-05,
      "loss": 0.0941,
      "step": 880
    },
    {
      "epoch": 5.258493353028065,
      "grad_norm": 0.9144782506889362,
      "learning_rate": 2.614886918988604e-05,
      "loss": 0.0951,
      "step": 890
    },
    {
      "epoch": 5.317577548005908,
      "grad_norm": 0.8302439773379418,
      "learning_rate": 2.5134276529890644e-05,
      "loss": 0.0926,
      "step": 900
    },
    {
      "epoch": 5.376661742983752,
      "grad_norm": 0.8145999759475107,
      "learning_rate": 2.4133109291117156e-05,
      "loss": 0.095,
      "step": 910
    },
    {
      "epoch": 5.435745937961595,
      "grad_norm": 0.7459552675334057,
      "learning_rate": 2.314590802055232e-05,
      "loss": 0.0886,
      "step": 920
    },
    {
      "epoch": 5.4948301329394384,
      "grad_norm": 0.9033419203896008,
      "learning_rate": 2.2173205724722318e-05,
      "loss": 0.096,
      "step": 930
    },
    {
      "epoch": 5.5539143279172825,
      "grad_norm": 0.882377183289936,
      "learning_rate": 2.121552758191366e-05,
      "loss": 0.0962,
      "step": 940
    },
    {
      "epoch": 5.612998522895126,
      "grad_norm": 0.8033415953079253,
      "learning_rate": 2.027339065862064e-05,
      "loss": 0.0985,
      "step": 950
    },
    {
      "epoch": 5.672082717872969,
      "grad_norm": 0.9269892443045443,
      "learning_rate": 1.934730363037237e-05,
      "loss": 0.0939,
      "step": 960
    },
    {
      "epoch": 5.731166912850813,
      "grad_norm": 0.8483635995759108,
      "learning_rate": 1.843776650709046e-05,
      "loss": 0.0975,
      "step": 970
    },
    {
      "epoch": 5.790251107828656,
      "grad_norm": 0.7664349269568396,
      "learning_rate": 1.7545270363125153e-05,
      "loss": 0.093,
      "step": 980
    },
    {
      "epoch": 5.849335302806499,
      "grad_norm": 0.7270945706246518,
      "learning_rate": 1.6670297072116165e-05,
      "loss": 0.0959,
      "step": 990
    },
    {
      "epoch": 5.908419497784343,
      "grad_norm": 0.4763779090409354,
      "learning_rate": 1.581331904682089e-05,
      "loss": 0.0918,
      "step": 1000
    },
    {
      "epoch": 5.908419497784343,
      "eval_loss": 0.7311862707138062,
      "eval_runtime": 52.641,
      "eval_samples_per_second": 2.109,
      "eval_steps_per_second": 0.532,
      "step": 1000
    },
    {
      "epoch": 5.967503692762186,
      "grad_norm": 0.7880059330281542,
      "learning_rate": 1.4974798984050942e-05,
      "loss": 0.0933,
      "step": 1010
    },
    {
      "epoch": 6.026587887740029,
      "grad_norm": 0.5434228078562925,
      "learning_rate": 1.4155189614854275e-05,
      "loss": 0.0828,
      "step": 1020
    },
    {
      "epoch": 6.085672082717873,
      "grad_norm": 0.6734629959345333,
      "learning_rate": 1.3354933460078217e-05,
      "loss": 0.0678,
      "step": 1030
    },
    {
      "epoch": 6.144756277695716,
      "grad_norm": 0.6754051572707525,
      "learning_rate": 1.257446259144494e-05,
      "loss": 0.0629,
      "step": 1040
    },
    {
      "epoch": 6.203840472673559,
      "grad_norm": 0.6799650760230072,
      "learning_rate": 1.1814198398268794e-05,
      "loss": 0.0697,
      "step": 1050
    },
    {
      "epoch": 6.262924667651403,
      "grad_norm": 0.6540795313123376,
      "learning_rate": 1.1074551359941021e-05,
      "loss": 0.0644,
      "step": 1060
    },
    {
      "epoch": 6.3220088626292466,
      "grad_norm": 0.814973115457644,
      "learning_rate": 1.0355920824305127e-05,
      "loss": 0.069,
      "step": 1070
    },
    {
      "epoch": 6.381093057607091,
      "grad_norm": 0.8090154097885933,
      "learning_rate": 9.658694792042284e-06,
      "loss": 0.0666,
      "step": 1080
    },
    {
      "epoch": 6.440177252584934,
      "grad_norm": 0.7016078216972328,
      "learning_rate": 8.98324970718319e-06,
      "loss": 0.0679,
      "step": 1090
    },
    {
      "epoch": 6.499261447562777,
      "grad_norm": 0.7288879724763213,
      "learning_rate": 8.329950253859703e-06,
      "loss": 0.0656,
      "step": 1100
    },
    {
      "epoch": 6.558345642540621,
      "grad_norm": 0.7247263147770627,
      "learning_rate": 7.699149159405734e-06,
      "loss": 0.0664,
      "step": 1110
    },
    {
      "epoch": 6.617429837518464,
      "grad_norm": 0.6219352284185693,
      "learning_rate": 7.0911870039138015e-06,
      "loss": 0.0673,
      "step": 1120
    },
    {
      "epoch": 6.676514032496307,
      "grad_norm": 0.7481718513639776,
      "learning_rate": 6.506392036350167e-06,
      "loss": 0.0697,
      "step": 1130
    },
    {
      "epoch": 6.735598227474151,
      "grad_norm": 0.7642809598885449,
      "learning_rate": 5.945079997327713e-06,
      "loss": 0.0669,
      "step": 1140
    },
    {
      "epoch": 6.794682422451994,
      "grad_norm": 0.7060296135322504,
      "learning_rate": 5.407553948632277e-06,
      "loss": 0.0683,
      "step": 1150
    },
    {
      "epoch": 6.853766617429837,
      "grad_norm": 0.7218214547563072,
      "learning_rate": 4.894104109594466e-06,
      "loss": 0.0684,
      "step": 1160
    },
    {
      "epoch": 6.912850812407681,
      "grad_norm": 0.6944520001481999,
      "learning_rate": 4.405007700395497e-06,
      "loss": 0.0687,
      "step": 1170
    },
    {
      "epoch": 6.971935007385524,
      "grad_norm": 0.8260640922228497,
      "learning_rate": 3.940528792391223e-06,
      "loss": 0.0721,
      "step": 1180
    },
    {
      "epoch": 7.0310192023633675,
      "grad_norm": 0.6707004455651014,
      "learning_rate": 3.5009181655356826e-06,
      "loss": 0.0653,
      "step": 1190
    },
    {
      "epoch": 7.0901033973412115,
      "grad_norm": 0.6805257469848828,
      "learning_rate": 3.0864131729807398e-06,
      "loss": 0.0579,
      "step": 1200
    },
    {
      "epoch": 7.149187592319055,
      "grad_norm": 0.5495171159123228,
      "learning_rate": 2.6972376129251686e-06,
      "loss": 0.0585,
      "step": 1210
    },
    {
      "epoch": 7.208271787296898,
      "grad_norm": 0.6136512130241976,
      "learning_rate": 2.3336016077822154e-06,
      "loss": 0.0562,
      "step": 1220
    },
    {
      "epoch": 7.267355982274742,
      "grad_norm": 0.7591755724568,
      "learning_rate": 1.9957014907310224e-06,
      "loss": 0.0572,
      "step": 1230
    },
    {
      "epoch": 7.326440177252585,
      "grad_norm": 0.6022538817881757,
      "learning_rate": 1.6837196997130434e-06,
      "loss": 0.06,
      "step": 1240
    },
    {
      "epoch": 7.385524372230428,
      "grad_norm": 0.774444698651241,
      "learning_rate": 1.3978246789307149e-06,
      "loss": 0.0565,
      "step": 1250
    },
    {
      "epoch": 7.444608567208272,
      "grad_norm": 0.7442025153147653,
      "learning_rate": 1.1381707879016157e-06,
      "loss": 0.0562,
      "step": 1260
    },
    {
      "epoch": 7.503692762186115,
      "grad_norm": 0.5700289390004409,
      "learning_rate": 9.048982181171894e-07,
      "loss": 0.0556,
      "step": 1270
    },
    {
      "epoch": 7.562776957163958,
      "grad_norm": 0.695724523693832,
      "learning_rate": 6.98132917350991e-07,
      "loss": 0.0525,
      "step": 1280
    },
    {
      "epoch": 7.621861152141802,
      "grad_norm": 0.6867233808424225,
      "learning_rate": 5.179865216573654e-07,
      "loss": 0.0549,
      "step": 1290
    },
    {
      "epoch": 7.680945347119645,
      "grad_norm": 0.7051791342497011,
      "learning_rate": 3.6455629509730136e-07,
      "loss": 0.0583,
      "step": 1300
    },
    {
      "epoch": 7.7400295420974885,
      "grad_norm": 0.6772707185024253,
      "learning_rate": 2.3792507722388835e-07,
      "loss": 0.0553,
      "step": 1310
    },
    {
      "epoch": 7.7991137370753325,
      "grad_norm": 0.6815226755163512,
      "learning_rate": 1.3816123835588834e-07,
      "loss": 0.0556,
      "step": 1320
    },
    {
      "epoch": 7.858197932053176,
      "grad_norm": 0.6475756485632287,
      "learning_rate": 6.531864266343113e-08,
      "loss": 0.0589,
      "step": 1330
    },
    {
      "epoch": 7.917282127031019,
      "grad_norm": 0.7007100620520975,
      "learning_rate": 1.943661908586636e-08,
      "loss": 0.0603,
      "step": 1340
    },
    {
      "epoch": 7.976366322008863,
      "grad_norm": 0.7030879428421536,
      "learning_rate": 5.399400973882251e-10,
      "loss": 0.0568,
      "step": 1350
    },
    {
      "epoch": 7.988183161004431,
      "step": 1352,
      "total_flos": 1435586865135616.0,
      "train_loss": 0.22072081432606167,
      "train_runtime": 22818.9262,
      "train_samples_per_second": 1.896,
      "train_steps_per_second": 0.059
    }
  ],
  "logging_steps": 10,
  "max_steps": 1352,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1435586865135616.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}