{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.988183161004431, "eval_steps": 500, "global_step": 1352, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.059084194977843424, "grad_norm": 0.6387189059092414, "learning_rate": 9.998650208062712e-05, "loss": 0.6972, "step": 10 }, { "epoch": 0.11816838995568685, "grad_norm": 0.48601036335500336, "learning_rate": 9.994601561026155e-05, "loss": 0.5423, "step": 20 }, { "epoch": 0.17725258493353027, "grad_norm": 0.40437591619558344, "learning_rate": 9.98785624482278e-05, "loss": 0.5149, "step": 30 }, { "epoch": 0.2363367799113737, "grad_norm": 0.5030211090458666, "learning_rate": 9.978417901361958e-05, "loss": 0.513, "step": 40 }, { "epoch": 0.29542097488921715, "grad_norm": 0.49098083558596206, "learning_rate": 9.96629162656365e-05, "loss": 0.4957, "step": 50 }, { "epoch": 0.35450516986706054, "grad_norm": 0.6329012343929873, "learning_rate": 9.951483967607041e-05, "loss": 0.495, "step": 60 }, { "epoch": 0.413589364844904, "grad_norm": 0.48252149699962754, "learning_rate": 9.934002919395592e-05, "loss": 0.4943, "step": 70 }, { "epoch": 0.4726735598227474, "grad_norm": 0.5221971068930835, "learning_rate": 9.91385792024048e-05, "loss": 0.4738, "step": 80 }, { "epoch": 0.5317577548005908, "grad_norm": 0.42549092560536134, "learning_rate": 9.891059846764679e-05, "loss": 0.4565, "step": 90 }, { "epoch": 0.5908419497784343, "grad_norm": 0.4606171719652433, "learning_rate": 9.865621008030492e-05, "loss": 0.4674, "step": 100 }, { "epoch": 0.6499261447562777, "grad_norm": 0.46353540699946943, "learning_rate": 9.83755513889369e-05, "loss": 0.4727, "step": 110 }, { "epoch": 0.7090103397341211, "grad_norm": 0.4815928480268326, "learning_rate": 9.80687739258782e-05, "loss": 0.4736, "step": 120 }, { "epoch": 0.7680945347119645, "grad_norm": 0.485838392040906, "learning_rate": 9.773604332542729e-05, "loss": 0.47, "step": 130 }, { "epoch": 0.827178729689808, "grad_norm": 0.5025850953484241, "learning_rate": 9.737753923441688e-05, "loss": 0.467, "step": 140 }, { "epoch": 0.8862629246676514, "grad_norm": 0.45997226542102815, "learning_rate": 9.69934552152196e-05, "loss": 0.4522, "step": 150 }, { "epoch": 0.9453471196454948, "grad_norm": 0.4546309634161405, "learning_rate": 9.658399864124037e-05, "loss": 0.4613, "step": 160 }, { "epoch": 1.0044313146233383, "grad_norm": 0.6577670848472944, "learning_rate": 9.61493905849521e-05, "loss": 0.4641, "step": 170 }, { "epoch": 1.0635155096011817, "grad_norm": 0.5743711541659764, "learning_rate": 9.568986569853487e-05, "loss": 0.3946, "step": 180 }, { "epoch": 1.122599704579025, "grad_norm": 0.5374571463070855, "learning_rate": 9.520567208718337e-05, "loss": 0.3882, "step": 190 }, { "epoch": 1.1816838995568686, "grad_norm": 0.6531134738909915, "learning_rate": 9.469707117515067e-05, "loss": 0.4205, "step": 200 }, { "epoch": 1.240768094534712, "grad_norm": 0.5201345622237219, "learning_rate": 9.416433756460091e-05, "loss": 0.386, "step": 210 }, { "epoch": 1.2998522895125553, "grad_norm": 0.7267530776250423, "learning_rate": 9.360775888734698e-05, "loss": 0.4096, "step": 220 }, { "epoch": 1.3589364844903988, "grad_norm": 0.5480147887972682, "learning_rate": 9.302763564955331e-05, "loss": 0.3921, "step": 230 }, { "epoch": 1.4180206794682422, "grad_norm": 0.6228502280992079, "learning_rate": 9.242428106948749e-05, "loss": 0.3788, "step": 240 }, { "epoch": 1.4771048744460857, "grad_norm": 0.7119116813192373, "learning_rate": 9.179802090840853e-05, "loss": 0.3894, "step": 250 }, { "epoch": 1.536189069423929, "grad_norm": 0.5987495739217404, "learning_rate": 9.114919329468282e-05, "loss": 0.3707, "step": 260 }, { "epoch": 1.5952732644017726, "grad_norm": 0.6278387167291306, "learning_rate": 9.04781485412231e-05, "loss": 0.3938, "step": 270 }, { "epoch": 1.654357459379616, "grad_norm": 0.5312391005336979, "learning_rate": 8.978524895634842e-05, "loss": 0.3799, "step": 280 }, { "epoch": 1.7134416543574593, "grad_norm": 0.7329625126762797, "learning_rate": 8.907086864816803e-05, "loss": 0.403, "step": 290 }, { "epoch": 1.7725258493353029, "grad_norm": 0.5600401734580108, "learning_rate": 8.833539332259398e-05, "loss": 0.3758, "step": 300 }, { "epoch": 1.8316100443131462, "grad_norm": 0.503395394356135, "learning_rate": 8.757922007509207e-05, "loss": 0.3963, "step": 310 }, { "epoch": 1.8906942392909896, "grad_norm": 0.6324775953771359, "learning_rate": 8.680275717628337e-05, "loss": 0.3858, "step": 320 }, { "epoch": 1.9497784342688331, "grad_norm": 0.5574107174736728, "learning_rate": 8.600642385151205e-05, "loss": 0.3799, "step": 330 }, { "epoch": 2.0088626292466767, "grad_norm": 0.5250864362886176, "learning_rate": 8.519065005449858e-05, "loss": 0.3763, "step": 340 }, { "epoch": 2.06794682422452, "grad_norm": 0.7754155552722259, "learning_rate": 8.43558762352005e-05, "loss": 0.2934, "step": 350 }, { "epoch": 2.1270310192023634, "grad_norm": 0.6889925166837503, "learning_rate": 8.350255310200612e-05, "loss": 0.3078, "step": 360 }, { "epoch": 2.186115214180207, "grad_norm": 0.6281519102970855, "learning_rate": 8.263114137838947e-05, "loss": 0.3028, "step": 370 }, { "epoch": 2.24519940915805, "grad_norm": 0.7619069663070173, "learning_rate": 8.174211155415799e-05, "loss": 0.2972, "step": 380 }, { "epoch": 2.3042836041358936, "grad_norm": 0.6469214076306162, "learning_rate": 8.083594363142717e-05, "loss": 0.2995, "step": 390 }, { "epoch": 2.363367799113737, "grad_norm": 0.7761443706661119, "learning_rate": 7.991312686545937e-05, "loss": 0.2963, "step": 400 }, { "epoch": 2.4224519940915803, "grad_norm": 0.6652219653359559, "learning_rate": 7.897415950050676e-05, "loss": 0.2987, "step": 410 }, { "epoch": 2.481536189069424, "grad_norm": 0.7586700003856579, "learning_rate": 7.801954850080075e-05, "loss": 0.3092, "step": 420 }, { "epoch": 2.5406203840472674, "grad_norm": 0.6663785963305503, "learning_rate": 7.704980927683359e-05, "loss": 0.2951, "step": 430 }, { "epoch": 2.5997045790251105, "grad_norm": 0.6913254608481674, "learning_rate": 7.60654654070796e-05, "loss": 0.3097, "step": 440 }, { "epoch": 2.658788774002954, "grad_norm": 0.6943652391419598, "learning_rate": 7.506704835530634e-05, "loss": 0.2999, "step": 450 }, { "epoch": 2.7178729689807977, "grad_norm": 0.644096596125316, "learning_rate": 7.405509718362842e-05, "loss": 0.2905, "step": 460 }, { "epoch": 2.7769571639586412, "grad_norm": 0.7660517390503399, "learning_rate": 7.303015826145885e-05, "loss": 0.309, "step": 470 }, { "epoch": 2.8360413589364843, "grad_norm": 0.7708918433168639, "learning_rate": 7.199278497051498e-05, "loss": 0.302, "step": 480 }, { "epoch": 2.895125553914328, "grad_norm": 0.626692120748867, "learning_rate": 7.094353740603839e-05, "loss": 0.297, "step": 490 }, { "epoch": 2.9542097488921715, "grad_norm": 0.7926892798861292, "learning_rate": 6.988298207439021e-05, "loss": 0.3101, "step": 500 }, { "epoch": 2.9542097488921715, "eval_loss": 0.4939613938331604, "eval_runtime": 53.5552, "eval_samples_per_second": 2.073, "eval_steps_per_second": 0.523, "step": 500 }, { "epoch": 3.0132939438700146, "grad_norm": 0.5747285632843896, "learning_rate": 6.881169158718474e-05, "loss": 0.2736, "step": 510 }, { "epoch": 3.072378138847858, "grad_norm": 0.7661842329328333, "learning_rate": 6.773024435212678e-05, "loss": 0.2187, "step": 520 }, { "epoch": 3.1314623338257017, "grad_norm": 0.8401289392682185, "learning_rate": 6.663922426071977e-05, "loss": 0.2057, "step": 530 }, { "epoch": 3.1905465288035453, "grad_norm": 0.7506650372127406, "learning_rate": 6.553922037301283e-05, "loss": 0.2067, "step": 540 }, { "epoch": 3.2496307237813884, "grad_norm": 0.7842941173009434, "learning_rate": 6.443082659955738e-05, "loss": 0.1989, "step": 550 }, { "epoch": 3.308714918759232, "grad_norm": 0.7593744868915973, "learning_rate": 6.331464138074493e-05, "loss": 0.2179, "step": 560 }, { "epoch": 3.3677991137370755, "grad_norm": 0.756091123285611, "learning_rate": 6.219126736369903e-05, "loss": 0.2176, "step": 570 }, { "epoch": 3.4268833087149186, "grad_norm": 0.7869512689224245, "learning_rate": 6.106131107689599e-05, "loss": 0.2215, "step": 580 }, { "epoch": 3.485967503692762, "grad_norm": 0.7636724676762149, "learning_rate": 5.9925382602689974e-05, "loss": 0.2153, "step": 590 }, { "epoch": 3.5450516986706058, "grad_norm": 0.6815318566179079, "learning_rate": 5.8784095247919305e-05, "loss": 0.2133, "step": 600 }, { "epoch": 3.604135893648449, "grad_norm": 0.7848948829508617, "learning_rate": 5.763806521277184e-05, "loss": 0.2109, "step": 610 }, { "epoch": 3.6632200886262924, "grad_norm": 0.7715200213470335, "learning_rate": 5.648791125808809e-05, "loss": 0.2214, "step": 620 }, { "epoch": 3.722304283604136, "grad_norm": 0.6480973448749833, "learning_rate": 5.5334254371281934e-05, "loss": 0.212, "step": 630 }, { "epoch": 3.781388478581979, "grad_norm": 0.6618796234383102, "learning_rate": 5.417771743105907e-05, "loss": 0.2196, "step": 640 }, { "epoch": 3.8404726735598227, "grad_norm": 0.9462710180746308, "learning_rate": 5.3018924871114305e-05, "loss": 0.2145, "step": 650 }, { "epoch": 3.8995568685376663, "grad_norm": 0.7674586984793125, "learning_rate": 5.185850234298942e-05, "loss": 0.2199, "step": 660 }, { "epoch": 3.9586410635155094, "grad_norm": 0.8420752213128357, "learning_rate": 5.0697076378273354e-05, "loss": 0.218, "step": 670 }, { "epoch": 4.017725258493353, "grad_norm": 0.6235542870137168, "learning_rate": 4.953527405032723e-05, "loss": 0.1987, "step": 680 }, { "epoch": 4.0768094534711965, "grad_norm": 0.8247620064480204, "learning_rate": 4.8373722635717086e-05, "loss": 0.1425, "step": 690 }, { "epoch": 4.13589364844904, "grad_norm": 0.806070554149328, "learning_rate": 4.721304927553658e-05, "loss": 0.1313, "step": 700 }, { "epoch": 4.194977843426884, "grad_norm": 1.1960115947293068, "learning_rate": 4.60538806368031e-05, "loss": 0.1397, "step": 710 }, { "epoch": 4.254062038404727, "grad_norm": 0.7571134276917469, "learning_rate": 4.489684257410958e-05, "loss": 0.1421, "step": 720 }, { "epoch": 4.31314623338257, "grad_norm": 0.8865025891479803, "learning_rate": 4.374255979171538e-05, "loss": 0.1386, "step": 730 }, { "epoch": 4.372230428360414, "grad_norm": 0.9290220213968287, "learning_rate": 4.2591655506257645e-05, "loss": 0.1444, "step": 740 }, { "epoch": 4.431314623338257, "grad_norm": 0.8896418597603776, "learning_rate": 4.144475111026643e-05, "loss": 0.1391, "step": 750 }, { "epoch": 4.4903988183161, "grad_norm": 0.9053211649213782, "learning_rate": 4.030246583666437e-05, "loss": 0.1438, "step": 760 }, { "epoch": 4.549483013293944, "grad_norm": 0.8453862385052026, "learning_rate": 3.9165416424432414e-05, "loss": 0.1415, "step": 770 }, { "epoch": 4.608567208271787, "grad_norm": 0.8724405655899441, "learning_rate": 3.803421678562213e-05, "loss": 0.1492, "step": 780 }, { "epoch": 4.66765140324963, "grad_norm": 0.98302153579186, "learning_rate": 3.690947767389426e-05, "loss": 0.1512, "step": 790 }, { "epoch": 4.726735598227474, "grad_norm": 0.8137008256198869, "learning_rate": 3.57918063547627e-05, "loss": 0.1481, "step": 800 }, { "epoch": 4.7858197932053175, "grad_norm": 0.8604546990181478, "learning_rate": 3.468180627772144e-05, "loss": 0.1418, "step": 810 }, { "epoch": 4.844903988183161, "grad_norm": 0.8028241587093687, "learning_rate": 3.358007675043224e-05, "loss": 0.146, "step": 820 }, { "epoch": 4.903988183161005, "grad_norm": 0.8209334713352179, "learning_rate": 3.2487212615148316e-05, "loss": 0.1407, "step": 830 }, { "epoch": 4.963072378138848, "grad_norm": 0.9318035909918932, "learning_rate": 3.1403803927549006e-05, "loss": 0.1502, "step": 840 }, { "epoch": 5.022156573116692, "grad_norm": 0.6982555180872486, "learning_rate": 3.0330435638158806e-05, "loss": 0.1322, "step": 850 }, { "epoch": 5.081240768094535, "grad_norm": 0.9099529668785485, "learning_rate": 2.9267687276522876e-05, "loss": 0.0985, "step": 860 }, { "epoch": 5.140324963072378, "grad_norm": 0.8093436219539469, "learning_rate": 2.821613263830912e-05, "loss": 0.0929, "step": 870 }, { "epoch": 5.199409158050222, "grad_norm": 0.908675419764728, "learning_rate": 2.717633947550651e-05, "loss": 0.0941, "step": 880 }, { "epoch": 5.258493353028065, "grad_norm": 0.9144782506889362, "learning_rate": 2.614886918988604e-05, "loss": 0.0951, "step": 890 }, { "epoch": 5.317577548005908, "grad_norm": 0.8302439773379418, "learning_rate": 2.5134276529890644e-05, "loss": 0.0926, "step": 900 }, { "epoch": 5.376661742983752, "grad_norm": 0.8145999759475107, "learning_rate": 2.4133109291117156e-05, "loss": 0.095, "step": 910 }, { "epoch": 5.435745937961595, "grad_norm": 0.7459552675334057, "learning_rate": 2.314590802055232e-05, "loss": 0.0886, "step": 920 }, { "epoch": 5.4948301329394384, "grad_norm": 0.9033419203896008, "learning_rate": 2.2173205724722318e-05, "loss": 0.096, "step": 930 }, { "epoch": 5.5539143279172825, "grad_norm": 0.882377183289936, "learning_rate": 2.121552758191366e-05, "loss": 0.0962, "step": 940 }, { "epoch": 5.612998522895126, "grad_norm": 0.8033415953079253, "learning_rate": 2.027339065862064e-05, "loss": 0.0985, "step": 950 }, { "epoch": 5.672082717872969, "grad_norm": 0.9269892443045443, "learning_rate": 1.934730363037237e-05, "loss": 0.0939, "step": 960 }, { "epoch": 5.731166912850813, "grad_norm": 0.8483635995759108, "learning_rate": 1.843776650709046e-05, "loss": 0.0975, "step": 970 }, { "epoch": 5.790251107828656, "grad_norm": 0.7664349269568396, "learning_rate": 1.7545270363125153e-05, "loss": 0.093, "step": 980 }, { "epoch": 5.849335302806499, "grad_norm": 0.7270945706246518, "learning_rate": 1.6670297072116165e-05, "loss": 0.0959, "step": 990 }, { "epoch": 5.908419497784343, "grad_norm": 0.4763779090409354, "learning_rate": 1.581331904682089e-05, "loss": 0.0918, "step": 1000 }, { "epoch": 5.908419497784343, "eval_loss": 0.7311862707138062, "eval_runtime": 52.641, "eval_samples_per_second": 2.109, "eval_steps_per_second": 0.532, "step": 1000 }, { "epoch": 5.967503692762186, "grad_norm": 0.7880059330281542, "learning_rate": 1.4974798984050942e-05, "loss": 0.0933, "step": 1010 }, { "epoch": 6.026587887740029, "grad_norm": 0.5434228078562925, "learning_rate": 1.4155189614854275e-05, "loss": 0.0828, "step": 1020 }, { "epoch": 6.085672082717873, "grad_norm": 0.6734629959345333, "learning_rate": 1.3354933460078217e-05, "loss": 0.0678, "step": 1030 }, { "epoch": 6.144756277695716, "grad_norm": 0.6754051572707525, "learning_rate": 1.257446259144494e-05, "loss": 0.0629, "step": 1040 }, { "epoch": 6.203840472673559, "grad_norm": 0.6799650760230072, "learning_rate": 1.1814198398268794e-05, "loss": 0.0697, "step": 1050 }, { "epoch": 6.262924667651403, "grad_norm": 0.6540795313123376, "learning_rate": 1.1074551359941021e-05, "loss": 0.0644, "step": 1060 }, { "epoch": 6.3220088626292466, "grad_norm": 0.814973115457644, "learning_rate": 1.0355920824305127e-05, "loss": 0.069, "step": 1070 }, { "epoch": 6.381093057607091, "grad_norm": 0.8090154097885933, "learning_rate": 9.658694792042284e-06, "loss": 0.0666, "step": 1080 }, { "epoch": 6.440177252584934, "grad_norm": 0.7016078216972328, "learning_rate": 8.98324970718319e-06, "loss": 0.0679, "step": 1090 }, { "epoch": 6.499261447562777, "grad_norm": 0.7288879724763213, "learning_rate": 8.329950253859703e-06, "loss": 0.0656, "step": 1100 }, { "epoch": 6.558345642540621, "grad_norm": 0.7247263147770627, "learning_rate": 7.699149159405734e-06, "loss": 0.0664, "step": 1110 }, { "epoch": 6.617429837518464, "grad_norm": 0.6219352284185693, "learning_rate": 7.0911870039138015e-06, "loss": 0.0673, "step": 1120 }, { "epoch": 6.676514032496307, "grad_norm": 0.7481718513639776, "learning_rate": 6.506392036350167e-06, "loss": 0.0697, "step": 1130 }, { "epoch": 6.735598227474151, "grad_norm": 0.7642809598885449, "learning_rate": 5.945079997327713e-06, "loss": 0.0669, "step": 1140 }, { "epoch": 6.794682422451994, "grad_norm": 0.7060296135322504, "learning_rate": 5.407553948632277e-06, "loss": 0.0683, "step": 1150 }, { "epoch": 6.853766617429837, "grad_norm": 0.7218214547563072, "learning_rate": 4.894104109594466e-06, "loss": 0.0684, "step": 1160 }, { "epoch": 6.912850812407681, "grad_norm": 0.6944520001481999, "learning_rate": 4.405007700395497e-06, "loss": 0.0687, "step": 1170 }, { "epoch": 6.971935007385524, "grad_norm": 0.8260640922228497, "learning_rate": 3.940528792391223e-06, "loss": 0.0721, "step": 1180 }, { "epoch": 7.0310192023633675, "grad_norm": 0.6707004455651014, "learning_rate": 3.5009181655356826e-06, "loss": 0.0653, "step": 1190 }, { "epoch": 7.0901033973412115, "grad_norm": 0.6805257469848828, "learning_rate": 3.0864131729807398e-06, "loss": 0.0579, "step": 1200 }, { "epoch": 7.149187592319055, "grad_norm": 0.5495171159123228, "learning_rate": 2.6972376129251686e-06, "loss": 0.0585, "step": 1210 }, { "epoch": 7.208271787296898, "grad_norm": 0.6136512130241976, "learning_rate": 2.3336016077822154e-06, "loss": 0.0562, "step": 1220 }, { "epoch": 7.267355982274742, "grad_norm": 0.7591755724568, "learning_rate": 1.9957014907310224e-06, "loss": 0.0572, "step": 1230 }, { "epoch": 7.326440177252585, "grad_norm": 0.6022538817881757, "learning_rate": 1.6837196997130434e-06, "loss": 0.06, "step": 1240 }, { "epoch": 7.385524372230428, "grad_norm": 0.774444698651241, "learning_rate": 1.3978246789307149e-06, "loss": 0.0565, "step": 1250 }, { "epoch": 7.444608567208272, "grad_norm": 0.7442025153147653, "learning_rate": 1.1381707879016157e-06, "loss": 0.0562, "step": 1260 }, { "epoch": 7.503692762186115, "grad_norm": 0.5700289390004409, "learning_rate": 9.048982181171894e-07, "loss": 0.0556, "step": 1270 }, { "epoch": 7.562776957163958, "grad_norm": 0.695724523693832, "learning_rate": 6.98132917350991e-07, "loss": 0.0525, "step": 1280 }, { "epoch": 7.621861152141802, "grad_norm": 0.6867233808424225, "learning_rate": 5.179865216573654e-07, "loss": 0.0549, "step": 1290 }, { "epoch": 7.680945347119645, "grad_norm": 0.7051791342497011, "learning_rate": 3.6455629509730136e-07, "loss": 0.0583, "step": 1300 }, { "epoch": 7.7400295420974885, "grad_norm": 0.6772707185024253, "learning_rate": 2.3792507722388835e-07, "loss": 0.0553, "step": 1310 }, { "epoch": 7.7991137370753325, "grad_norm": 0.6815226755163512, "learning_rate": 1.3816123835588834e-07, "loss": 0.0556, "step": 1320 }, { "epoch": 7.858197932053176, "grad_norm": 0.6475756485632287, "learning_rate": 6.531864266343113e-08, "loss": 0.0589, "step": 1330 }, { "epoch": 7.917282127031019, "grad_norm": 0.7007100620520975, "learning_rate": 1.943661908586636e-08, "loss": 0.0603, "step": 1340 }, { "epoch": 7.976366322008863, "grad_norm": 0.7030879428421536, "learning_rate": 5.399400973882251e-10, "loss": 0.0568, "step": 1350 }, { "epoch": 7.988183161004431, "step": 1352, "total_flos": 1435586865135616.0, "train_loss": 0.22072081432606167, "train_runtime": 22818.9262, "train_samples_per_second": 1.896, "train_steps_per_second": 0.059 } ], "logging_steps": 10, "max_steps": 1352, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1435586865135616.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }