{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990884229717412, "eval_steps": 500, "global_step": 274, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0036463081130355514, "grad_norm": 1.5564329067676301, "learning_rate": 1.818181818181818e-06, "loss": 1.699, "step": 1 }, { "epoch": 0.007292616226071103, "grad_norm": 44902.29063456451, "learning_rate": 3.636363636363636e-06, "loss": 1.7642, "step": 2 }, { "epoch": 0.014585232452142206, "grad_norm": 1.8353451125445739, "learning_rate": 7.272727272727272e-06, "loss": 1.7172, "step": 4 }, { "epoch": 0.02187784867821331, "grad_norm": 1.8226533138713261, "learning_rate": 1.0909090909090909e-05, "loss": 1.7472, "step": 6 }, { "epoch": 0.02917046490428441, "grad_norm": 51.31275799965548, "learning_rate": 1.4545454545454545e-05, "loss": 1.6743, "step": 8 }, { "epoch": 0.03646308113035551, "grad_norm": 2.4177896118244924, "learning_rate": 1.8181818181818182e-05, "loss": 1.6505, "step": 10 }, { "epoch": 0.04375569735642662, "grad_norm": 1.4056780909392894, "learning_rate": 2.1818181818181818e-05, "loss": 1.662, "step": 12 }, { "epoch": 0.05104831358249772, "grad_norm": 1.123663560927214, "learning_rate": 2.5454545454545454e-05, "loss": 1.6014, "step": 14 }, { "epoch": 0.05834092980856882, "grad_norm": 0.660170892307739, "learning_rate": 2.909090909090909e-05, "loss": 1.5094, "step": 16 }, { "epoch": 0.06563354603463993, "grad_norm": 0.42755156023092894, "learning_rate": 3.272727272727273e-05, "loss": 1.5272, "step": 18 }, { "epoch": 0.07292616226071102, "grad_norm": 0.3829807576012777, "learning_rate": 3.6363636363636364e-05, "loss": 1.4067, "step": 20 }, { "epoch": 0.08021877848678213, "grad_norm": 0.41242113134912567, "learning_rate": 4e-05, "loss": 1.3437, "step": 22 }, { "epoch": 0.08751139471285324, "grad_norm": 0.4213871682292563, "learning_rate": 4.3636363636363636e-05, "loss": 1.3318, "step": 24 }, { "epoch": 0.09480401093892434, "grad_norm": 0.3915162267949715, "learning_rate": 4.7272727272727275e-05, "loss": 1.3009, "step": 26 }, { "epoch": 0.10209662716499544, "grad_norm": 0.37294398118323746, "learning_rate": 5.090909090909091e-05, "loss": 1.2709, "step": 28 }, { "epoch": 0.10938924339106655, "grad_norm": 0.3180941760614004, "learning_rate": 5.4545454545454546e-05, "loss": 1.1872, "step": 30 }, { "epoch": 0.11668185961713765, "grad_norm": 0.22743090816603856, "learning_rate": 5.818181818181818e-05, "loss": 1.1793, "step": 32 }, { "epoch": 0.12397447584320875, "grad_norm": 0.15136600844382636, "learning_rate": 6.181818181818182e-05, "loss": 1.137, "step": 34 }, { "epoch": 0.13126709206927986, "grad_norm": 0.14922343265678947, "learning_rate": 6.545454545454546e-05, "loss": 1.1545, "step": 36 }, { "epoch": 0.13855970829535097, "grad_norm": 0.11950915139776307, "learning_rate": 6.90909090909091e-05, "loss": 1.0923, "step": 38 }, { "epoch": 0.14585232452142205, "grad_norm": 0.12975286429339442, "learning_rate": 7.272727272727273e-05, "loss": 1.0946, "step": 40 }, { "epoch": 0.15314494074749316, "grad_norm": 0.11636352912910496, "learning_rate": 7.636363636363637e-05, "loss": 1.0651, "step": 42 }, { "epoch": 0.16043755697356427, "grad_norm": 0.12013810386339561, "learning_rate": 8e-05, "loss": 1.0725, "step": 44 }, { "epoch": 0.16773017319963537, "grad_norm": 0.09545625176758893, "learning_rate": 8.363636363636364e-05, "loss": 1.1092, "step": 46 }, { "epoch": 0.17502278942570648, "grad_norm": 0.08240688428637283, "learning_rate": 8.727272727272727e-05, "loss": 1.0449, "step": 48 }, { "epoch": 0.18231540565177756, "grad_norm": 0.08336026564156031, "learning_rate": 9.090909090909092e-05, "loss": 1.053, "step": 50 }, { "epoch": 0.18960802187784867, "grad_norm": 0.08733239972775936, "learning_rate": 9.454545454545455e-05, "loss": 1.0629, "step": 52 }, { "epoch": 0.19690063810391978, "grad_norm": 0.07845494536138678, "learning_rate": 9.818181818181818e-05, "loss": 1.0642, "step": 54 }, { "epoch": 0.2041932543299909, "grad_norm": 0.0792882467275035, "learning_rate": 9.999977231314127e-05, "loss": 1.0824, "step": 56 }, { "epoch": 0.211485870556062, "grad_norm": 0.07831448467807114, "learning_rate": 9.999795083071328e-05, "loss": 1.0554, "step": 58 }, { "epoch": 0.2187784867821331, "grad_norm": 0.09325192568500952, "learning_rate": 9.999430793221355e-05, "loss": 1.0361, "step": 60 }, { "epoch": 0.22607110300820418, "grad_norm": 0.08001023877218082, "learning_rate": 9.998884375035221e-05, "loss": 1.0501, "step": 62 }, { "epoch": 0.2333637192342753, "grad_norm": 0.08525503053749453, "learning_rate": 9.99815584841884e-05, "loss": 1.0421, "step": 64 }, { "epoch": 0.2406563354603464, "grad_norm": 0.18952595666861716, "learning_rate": 9.997245239912299e-05, "loss": 1.0598, "step": 66 }, { "epoch": 0.2479489516864175, "grad_norm": 0.5356478609043368, "learning_rate": 9.996152582688898e-05, "loss": 1.0195, "step": 68 }, { "epoch": 0.2552415679124886, "grad_norm": 0.07439723408480747, "learning_rate": 9.994877916553938e-05, "loss": 1.0119, "step": 70 }, { "epoch": 0.2625341841385597, "grad_norm": 0.08769208744835676, "learning_rate": 9.993421287943269e-05, "loss": 1.0367, "step": 72 }, { "epoch": 0.2698268003646308, "grad_norm": 0.07727464499454006, "learning_rate": 9.991782749921601e-05, "loss": 1.0222, "step": 74 }, { "epoch": 0.27711941659070194, "grad_norm": 0.07684989155110282, "learning_rate": 9.98996236218057e-05, "loss": 1.0291, "step": 76 }, { "epoch": 0.284412032816773, "grad_norm": 0.074797273929808, "learning_rate": 9.987960191036562e-05, "loss": 1.0265, "step": 78 }, { "epoch": 0.2917046490428441, "grad_norm": 0.07808867527853167, "learning_rate": 9.985776309428305e-05, "loss": 1.0394, "step": 80 }, { "epoch": 0.29899726526891524, "grad_norm": 0.08366367602394825, "learning_rate": 9.983410796914196e-05, "loss": 0.9918, "step": 82 }, { "epoch": 0.3062898814949863, "grad_norm": 0.08235936275195418, "learning_rate": 9.98086373966942e-05, "loss": 1.0093, "step": 84 }, { "epoch": 0.31358249772105745, "grad_norm": 0.07114879422193107, "learning_rate": 9.978135230482797e-05, "loss": 1.0331, "step": 86 }, { "epoch": 0.32087511394712853, "grad_norm": 0.0789659528845, "learning_rate": 9.975225368753412e-05, "loss": 1.0083, "step": 88 }, { "epoch": 0.3281677301731996, "grad_norm": 0.12722043213844833, "learning_rate": 9.972134260486988e-05, "loss": 0.9868, "step": 90 }, { "epoch": 0.33546034639927075, "grad_norm": 0.07460212363414274, "learning_rate": 9.968862018292026e-05, "loss": 0.9787, "step": 92 }, { "epoch": 0.34275296262534183, "grad_norm": 0.0794228233950649, "learning_rate": 9.965408761375701e-05, "loss": 0.9983, "step": 94 }, { "epoch": 0.35004557885141296, "grad_norm": 0.07805234354253066, "learning_rate": 9.961774615539522e-05, "loss": 0.9712, "step": 96 }, { "epoch": 0.35733819507748404, "grad_norm": 0.07531557083466454, "learning_rate": 9.957959713174748e-05, "loss": 0.9888, "step": 98 }, { "epoch": 0.3646308113035551, "grad_norm": 0.07667315677213174, "learning_rate": 9.953964193257563e-05, "loss": 0.9782, "step": 100 }, { "epoch": 0.37192342752962626, "grad_norm": 0.0769553697010856, "learning_rate": 9.949788201344019e-05, "loss": 0.9615, "step": 102 }, { "epoch": 0.37921604375569734, "grad_norm": 0.08212042393870912, "learning_rate": 9.945431889564723e-05, "loss": 0.9755, "step": 104 }, { "epoch": 0.3865086599817685, "grad_norm": 0.08314540306193914, "learning_rate": 9.940895416619309e-05, "loss": 1.0069, "step": 106 }, { "epoch": 0.39380127620783956, "grad_norm": 0.07970050714561497, "learning_rate": 9.936178947770641e-05, "loss": 0.9686, "step": 108 }, { "epoch": 0.4010938924339107, "grad_norm": 0.2747778373047559, "learning_rate": 9.931282654838803e-05, "loss": 0.9878, "step": 110 }, { "epoch": 0.4083865086599818, "grad_norm": 0.08106771126002589, "learning_rate": 9.926206716194842e-05, "loss": 0.9948, "step": 112 }, { "epoch": 0.41567912488605285, "grad_norm": 0.08133105923791639, "learning_rate": 9.920951316754259e-05, "loss": 0.9621, "step": 114 }, { "epoch": 0.422971741112124, "grad_norm": 0.07751316025392309, "learning_rate": 9.915516647970282e-05, "loss": 1.009, "step": 116 }, { "epoch": 0.43026435733819507, "grad_norm": 0.07429775187060293, "learning_rate": 9.909902907826884e-05, "loss": 0.9564, "step": 118 }, { "epoch": 0.4375569735642662, "grad_norm": 0.06885396447375916, "learning_rate": 9.904110300831577e-05, "loss": 0.9516, "step": 120 }, { "epoch": 0.4448495897903373, "grad_norm": 0.11911641611028377, "learning_rate": 9.898139038007961e-05, "loss": 0.9501, "step": 122 }, { "epoch": 0.45214220601640837, "grad_norm": 0.08134204746867607, "learning_rate": 9.891989336888032e-05, "loss": 0.9787, "step": 124 }, { "epoch": 0.4594348222424795, "grad_norm": 0.06703677306759141, "learning_rate": 9.88566142150426e-05, "loss": 0.9383, "step": 126 }, { "epoch": 0.4667274384685506, "grad_norm": 0.08875939081498214, "learning_rate": 9.87915552238143e-05, "loss": 0.9565, "step": 128 }, { "epoch": 0.4740200546946217, "grad_norm": 0.06975398482601901, "learning_rate": 9.872471876528236e-05, "loss": 0.9351, "step": 130 }, { "epoch": 0.4813126709206928, "grad_norm": 0.08668686890512382, "learning_rate": 9.865610727428661e-05, "loss": 0.9619, "step": 132 }, { "epoch": 0.4886052871467639, "grad_norm": 0.07299762770901298, "learning_rate": 9.858572325033089e-05, "loss": 0.9666, "step": 134 }, { "epoch": 0.495897903372835, "grad_norm": 0.22619188588419292, "learning_rate": 9.851356925749217e-05, "loss": 0.9564, "step": 136 }, { "epoch": 0.5031905195989061, "grad_norm": 0.06929553898073587, "learning_rate": 9.843964792432702e-05, "loss": 0.9416, "step": 138 }, { "epoch": 0.5104831358249772, "grad_norm": 0.07296219603236209, "learning_rate": 9.836396194377586e-05, "loss": 0.9606, "step": 140 }, { "epoch": 0.5177757520510483, "grad_norm": 0.17258152427782444, "learning_rate": 9.828651407306495e-05, "loss": 0.9405, "step": 142 }, { "epoch": 0.5250683682771194, "grad_norm": 0.08048053517750381, "learning_rate": 9.820730713360584e-05, "loss": 0.9308, "step": 144 }, { "epoch": 0.5323609845031905, "grad_norm": 0.07478421518396797, "learning_rate": 9.812634401089265e-05, "loss": 0.9433, "step": 146 }, { "epoch": 0.5396536007292616, "grad_norm": 0.0744608512059185, "learning_rate": 9.804362765439688e-05, "loss": 0.9545, "step": 148 }, { "epoch": 0.5469462169553327, "grad_norm": 0.07446299592606315, "learning_rate": 9.795916107746009e-05, "loss": 0.925, "step": 150 }, { "epoch": 0.5542388331814039, "grad_norm": 0.07071717466745271, "learning_rate": 9.787294735718397e-05, "loss": 0.9173, "step": 152 }, { "epoch": 0.5615314494074749, "grad_norm": 0.0701984606480579, "learning_rate": 9.778498963431837e-05, "loss": 0.9082, "step": 154 }, { "epoch": 0.568824065633546, "grad_norm": 0.07669702930168845, "learning_rate": 9.769529111314682e-05, "loss": 0.9497, "step": 156 }, { "epoch": 0.5761166818596172, "grad_norm": 0.11697404287375447, "learning_rate": 9.76038550613698e-05, "loss": 0.9144, "step": 158 }, { "epoch": 0.5834092980856882, "grad_norm": 0.07825369109874582, "learning_rate": 9.75106848099857e-05, "loss": 0.8793, "step": 160 }, { "epoch": 0.5907019143117593, "grad_norm": 1.1005245093691354, "learning_rate": 9.741578375316952e-05, "loss": 0.9145, "step": 162 }, { "epoch": 0.5979945305378305, "grad_norm": 0.08370330168443421, "learning_rate": 9.731915534814912e-05, "loss": 0.9511, "step": 164 }, { "epoch": 0.6052871467639015, "grad_norm": 0.08122736176843745, "learning_rate": 9.722080311507937e-05, "loss": 0.9584, "step": 166 }, { "epoch": 0.6125797629899726, "grad_norm": 0.07648634858832534, "learning_rate": 9.712073063691386e-05, "loss": 0.9262, "step": 168 }, { "epoch": 0.6198723792160438, "grad_norm": 0.08053849014890098, "learning_rate": 9.701894155927445e-05, "loss": 0.9313, "step": 170 }, { "epoch": 0.6271649954421149, "grad_norm": 0.07210396245886411, "learning_rate": 9.69154395903183e-05, "loss": 0.9174, "step": 172 }, { "epoch": 0.6344576116681859, "grad_norm": 0.07286947087570095, "learning_rate": 9.681022850060296e-05, "loss": 0.8893, "step": 174 }, { "epoch": 0.6417502278942571, "grad_norm": 0.06923686161728522, "learning_rate": 9.670331212294889e-05, "loss": 0.9395, "step": 176 }, { "epoch": 0.6490428441203282, "grad_norm": 0.0673865610450944, "learning_rate": 9.659469435229992e-05, "loss": 0.91, "step": 178 }, { "epoch": 0.6563354603463992, "grad_norm": 0.06948906848590711, "learning_rate": 9.648437914558124e-05, "loss": 0.9168, "step": 180 }, { "epoch": 0.6636280765724704, "grad_norm": 0.07289843532121563, "learning_rate": 9.63723705215554e-05, "loss": 0.9552, "step": 182 }, { "epoch": 0.6709206927985415, "grad_norm": 0.06990802284713049, "learning_rate": 9.625867256067578e-05, "loss": 0.9033, "step": 184 }, { "epoch": 0.6782133090246126, "grad_norm": 0.07555680912326869, "learning_rate": 9.614328940493798e-05, "loss": 0.9206, "step": 186 }, { "epoch": 0.6855059252506837, "grad_norm": 0.07743540693217674, "learning_rate": 9.602622525772895e-05, "loss": 0.9005, "step": 188 }, { "epoch": 0.6927985414767548, "grad_norm": 0.07177992266999077, "learning_rate": 9.590748438367388e-05, "loss": 0.9041, "step": 190 }, { "epoch": 0.7000911577028259, "grad_norm": 0.06957828378926412, "learning_rate": 9.578707110848078e-05, "loss": 0.8957, "step": 192 }, { "epoch": 0.707383773928897, "grad_norm": 0.06622017193744313, "learning_rate": 9.56649898187829e-05, "loss": 0.9108, "step": 194 }, { "epoch": 0.7146763901549681, "grad_norm": 0.06939738922827116, "learning_rate": 9.554124496197898e-05, "loss": 0.9467, "step": 196 }, { "epoch": 0.7219690063810392, "grad_norm": 0.06510687273754218, "learning_rate": 9.54158410460712e-05, "loss": 0.9101, "step": 198 }, { "epoch": 0.7292616226071102, "grad_norm": 0.12122407982819745, "learning_rate": 9.528878263950094e-05, "loss": 0.9271, "step": 200 }, { "epoch": 0.7365542388331814, "grad_norm": 0.0703901780763089, "learning_rate": 9.516007437098237e-05, "loss": 0.9162, "step": 202 }, { "epoch": 0.7438468550592525, "grad_norm": 0.13768411678101625, "learning_rate": 9.502972092933384e-05, "loss": 0.8917, "step": 204 }, { "epoch": 0.7511394712853237, "grad_norm": 0.13103063463158354, "learning_rate": 9.489772706330706e-05, "loss": 0.9074, "step": 206 }, { "epoch": 0.7584320875113947, "grad_norm": 0.07056505425629164, "learning_rate": 9.476409758141405e-05, "loss": 0.9288, "step": 208 }, { "epoch": 0.7657247037374658, "grad_norm": 0.07383435280021707, "learning_rate": 9.462883735175205e-05, "loss": 0.9059, "step": 210 }, { "epoch": 0.773017319963537, "grad_norm": 0.1015687322518938, "learning_rate": 9.449195130182613e-05, "loss": 0.9193, "step": 212 }, { "epoch": 0.780309936189608, "grad_norm": 0.07104451767439372, "learning_rate": 9.435344441836968e-05, "loss": 0.9097, "step": 214 }, { "epoch": 0.7876025524156791, "grad_norm": 0.06852107601664209, "learning_rate": 9.42133217471628e-05, "loss": 0.9126, "step": 216 }, { "epoch": 0.7948951686417502, "grad_norm": 0.07368568637779982, "learning_rate": 9.407158839284835e-05, "loss": 0.945, "step": 218 }, { "epoch": 0.8021877848678214, "grad_norm": 0.0648178325805628, "learning_rate": 9.392824951874617e-05, "loss": 0.8912, "step": 220 }, { "epoch": 0.8094804010938924, "grad_norm": 0.06456581944586823, "learning_rate": 9.378331034666484e-05, "loss": 0.8899, "step": 222 }, { "epoch": 0.8167730173199635, "grad_norm": 0.06900542842249857, "learning_rate": 9.363677615671148e-05, "loss": 0.9119, "step": 224 }, { "epoch": 0.8240656335460347, "grad_norm": 0.06967833214883962, "learning_rate": 9.348865228709947e-05, "loss": 0.889, "step": 226 }, { "epoch": 0.8313582497721057, "grad_norm": 0.08360540263683244, "learning_rate": 9.333894413395387e-05, "loss": 0.865, "step": 228 }, { "epoch": 0.8386508659981768, "grad_norm": 0.07306384866137224, "learning_rate": 9.318765715111497e-05, "loss": 0.9074, "step": 230 }, { "epoch": 0.845943482224248, "grad_norm": 0.07245184365008314, "learning_rate": 9.303479684993942e-05, "loss": 0.908, "step": 232 }, { "epoch": 0.853236098450319, "grad_norm": 0.08363970365051233, "learning_rate": 9.288036879909968e-05, "loss": 0.8873, "step": 234 }, { "epoch": 0.8605287146763901, "grad_norm": 0.07030389813918717, "learning_rate": 9.272437862438094e-05, "loss": 0.8869, "step": 236 }, { "epoch": 0.8678213309024613, "grad_norm": 0.08293652025190876, "learning_rate": 9.256683200847638e-05, "loss": 0.871, "step": 238 }, { "epoch": 0.8751139471285324, "grad_norm": 0.07471078064157549, "learning_rate": 9.240773469077993e-05, "loss": 0.8742, "step": 240 }, { "epoch": 0.8824065633546034, "grad_norm": 0.07710454753144116, "learning_rate": 9.22470924671774e-05, "loss": 0.8743, "step": 242 }, { "epoch": 0.8896991795806746, "grad_norm": 0.06889403226940291, "learning_rate": 9.208491118983514e-05, "loss": 0.8367, "step": 244 }, { "epoch": 0.8969917958067457, "grad_norm": 0.07079796419035034, "learning_rate": 9.192119676698703e-05, "loss": 0.8699, "step": 246 }, { "epoch": 0.9042844120328167, "grad_norm": 0.08107811116365189, "learning_rate": 9.17559551627191e-05, "loss": 0.8794, "step": 248 }, { "epoch": 0.9115770282588879, "grad_norm": 0.07964800640935288, "learning_rate": 9.158919239675236e-05, "loss": 0.9364, "step": 250 }, { "epoch": 0.918869644484959, "grad_norm": 0.07802559563260855, "learning_rate": 9.14209145442234e-05, "loss": 0.8561, "step": 252 }, { "epoch": 0.92616226071103, "grad_norm": 0.08673021093662099, "learning_rate": 9.125112773546315e-05, "loss": 0.8854, "step": 254 }, { "epoch": 0.9334548769371012, "grad_norm": 0.073859691158192, "learning_rate": 9.107983815577359e-05, "loss": 0.8949, "step": 256 }, { "epoch": 0.9407474931631723, "grad_norm": 0.06585824434560679, "learning_rate": 9.090705204520231e-05, "loss": 0.8642, "step": 258 }, { "epoch": 0.9480401093892434, "grad_norm": 0.07928660195604606, "learning_rate": 9.073277569831526e-05, "loss": 0.8936, "step": 260 }, { "epoch": 0.9553327256153145, "grad_norm": 0.07629458882620191, "learning_rate": 9.05570154639674e-05, "loss": 0.889, "step": 262 }, { "epoch": 0.9626253418413856, "grad_norm": 0.12177477305372565, "learning_rate": 9.03797777450715e-05, "loss": 0.8869, "step": 264 }, { "epoch": 0.9699179580674567, "grad_norm": 0.07512333956042923, "learning_rate": 9.020106899836472e-05, "loss": 0.8821, "step": 266 }, { "epoch": 0.9772105742935278, "grad_norm": 0.07673096729175252, "learning_rate": 9.002089573417356e-05, "loss": 0.8406, "step": 268 }, { "epoch": 0.9845031905195989, "grad_norm": 0.13567692134785442, "learning_rate": 8.983926451617664e-05, "loss": 0.8644, "step": 270 }, { "epoch": 0.99179580674567, "grad_norm": 0.08759393012824235, "learning_rate": 8.965618196116549e-05, "loss": 0.844, "step": 272 }, { "epoch": 0.9990884229717412, "grad_norm": 0.08369780561981159, "learning_rate": 8.947165473880363e-05, "loss": 0.8516, "step": 274 } ], "logging_steps": 2, "max_steps": 1096, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1698152759427072.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }