{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.997297681695349, "eval_steps": 500, "global_step": 2634, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011378182335371924, "grad_norm": 4.778728008270264, "learning_rate": 7.633587786259543e-07, "loss": 1.0811, "step": 10 }, { "epoch": 0.022756364670743847, "grad_norm": 4.217845439910889, "learning_rate": 1.5267175572519086e-06, "loss": 1.0744, "step": 20 }, { "epoch": 0.034134547006115776, "grad_norm": 4.523660182952881, "learning_rate": 2.2900763358778625e-06, "loss": 1.0751, "step": 30 }, { "epoch": 0.045512729341487694, "grad_norm": 4.545449256896973, "learning_rate": 3.0534351145038173e-06, "loss": 1.0621, "step": 40 }, { "epoch": 0.05689091167685962, "grad_norm": 4.702221393585205, "learning_rate": 3.816793893129772e-06, "loss": 1.0629, "step": 50 }, { "epoch": 0.06826909401223155, "grad_norm": 5.066348552703857, "learning_rate": 4.580152671755725e-06, "loss": 1.0345, "step": 60 }, { "epoch": 0.07964727634760348, "grad_norm": 4.052086353302002, "learning_rate": 5.34351145038168e-06, "loss": 0.9583, "step": 70 }, { "epoch": 0.09102545868297539, "grad_norm": 3.1136085987091064, "learning_rate": 6.1068702290076346e-06, "loss": 0.8734, "step": 80 }, { "epoch": 0.10240364101834731, "grad_norm": 2.0436317920684814, "learning_rate": 6.870229007633589e-06, "loss": 0.798, "step": 90 }, { "epoch": 0.11378182335371924, "grad_norm": 1.6023002862930298, "learning_rate": 7.633587786259543e-06, "loss": 0.7381, "step": 100 }, { "epoch": 0.12516000568909116, "grad_norm": 1.3594775199890137, "learning_rate": 8.396946564885497e-06, "loss": 0.6903, "step": 110 }, { "epoch": 0.1365381880244631, "grad_norm": 1.1952372789382935, "learning_rate": 9.16030534351145e-06, "loss": 0.6682, "step": 120 }, { "epoch": 0.14791637035983501, "grad_norm": 1.0661766529083252, "learning_rate": 9.923664122137405e-06, "loss": 0.6561, "step": 130 }, { "epoch": 0.15929455269520695, "grad_norm": 1.1098530292510986, "learning_rate": 9.999680994290835e-06, "loss": 0.652, "step": 140 }, { "epoch": 0.17067273503057886, "grad_norm": 1.070675015449524, "learning_rate": 9.998578310766133e-06, "loss": 0.625, "step": 150 }, { "epoch": 0.18205091736595078, "grad_norm": 1.066764235496521, "learning_rate": 9.996688184751814e-06, "loss": 0.6181, "step": 160 }, { "epoch": 0.19342909970132272, "grad_norm": 1.1886961460113525, "learning_rate": 9.994010914005644e-06, "loss": 0.6246, "step": 170 }, { "epoch": 0.20480728203669463, "grad_norm": 1.1360214948654175, "learning_rate": 9.990546920286894e-06, "loss": 0.6058, "step": 180 }, { "epoch": 0.21618546437206657, "grad_norm": 1.2674109935760498, "learning_rate": 9.986296749289894e-06, "loss": 0.6091, "step": 190 }, { "epoch": 0.22756364670743848, "grad_norm": 1.0798505544662476, "learning_rate": 9.981261070558069e-06, "loss": 0.6091, "step": 200 }, { "epoch": 0.23894182904281042, "grad_norm": 1.795167088508606, "learning_rate": 9.975440677378455e-06, "loss": 0.6095, "step": 210 }, { "epoch": 0.2503200113781823, "grad_norm": 1.219923496246338, "learning_rate": 9.968836486656746e-06, "loss": 0.6158, "step": 220 }, { "epoch": 0.26169819371355424, "grad_norm": 1.474357008934021, "learning_rate": 9.961449538772838e-06, "loss": 0.6111, "step": 230 }, { "epoch": 0.2730763760489262, "grad_norm": 1.2231380939483643, "learning_rate": 9.953280997416937e-06, "loss": 0.6134, "step": 240 }, { "epoch": 0.2844545583842981, "grad_norm": 1.2513004541397095, "learning_rate": 9.944332149406242e-06, "loss": 0.6106, "step": 250 }, { "epoch": 0.29583274071967003, "grad_norm": 1.2386839389801025, "learning_rate": 9.93460440448223e-06, "loss": 0.5959, "step": 260 }, { "epoch": 0.30721092305504194, "grad_norm": 2.128643274307251, "learning_rate": 9.924099295088565e-06, "loss": 0.6016, "step": 270 }, { "epoch": 0.3185891053904139, "grad_norm": 1.2879629135131836, "learning_rate": 9.9128184761297e-06, "loss": 0.6064, "step": 280 }, { "epoch": 0.3299672877257858, "grad_norm": 1.2779061794281006, "learning_rate": 9.900763724710169e-06, "loss": 0.5975, "step": 290 }, { "epoch": 0.34134547006115773, "grad_norm": 2.398998975753784, "learning_rate": 9.887936939854625e-06, "loss": 0.5923, "step": 300 }, { "epoch": 0.35272365239652964, "grad_norm": 1.3152679204940796, "learning_rate": 9.8743401422087e-06, "loss": 0.5961, "step": 310 }, { "epoch": 0.36410183473190155, "grad_norm": 1.3303252458572388, "learning_rate": 9.859975473720664e-06, "loss": 0.5988, "step": 320 }, { "epoch": 0.3754800170672735, "grad_norm": 1.3672901391983032, "learning_rate": 9.844845197304013e-06, "loss": 0.6062, "step": 330 }, { "epoch": 0.38685819940264543, "grad_norm": 1.2618701457977295, "learning_rate": 9.828951696480982e-06, "loss": 0.5993, "step": 340 }, { "epoch": 0.39823638173801734, "grad_norm": 5.503425121307373, "learning_rate": 9.81229747500705e-06, "loss": 0.5942, "step": 350 }, { "epoch": 0.40961456407338925, "grad_norm": 2.4071738719940186, "learning_rate": 9.794885156476532e-06, "loss": 0.5893, "step": 360 }, { "epoch": 0.4209927464087612, "grad_norm": 1.395342469215393, "learning_rate": 9.776717483909261e-06, "loss": 0.5861, "step": 370 }, { "epoch": 0.43237092874413313, "grad_norm": 1.5855286121368408, "learning_rate": 9.75779731931848e-06, "loss": 0.6038, "step": 380 }, { "epoch": 0.44374911107950504, "grad_norm": 1.3623788356781006, "learning_rate": 9.738127643259978e-06, "loss": 0.5767, "step": 390 }, { "epoch": 0.45512729341487695, "grad_norm": 1.9213502407073975, "learning_rate": 9.717711554362544e-06, "loss": 0.5927, "step": 400 }, { "epoch": 0.4665054757502489, "grad_norm": 2.3041162490844727, "learning_rate": 9.696552268839846e-06, "loss": 0.5869, "step": 410 }, { "epoch": 0.47788365808562083, "grad_norm": 1.4046136140823364, "learning_rate": 9.67465311998376e-06, "loss": 0.589, "step": 420 }, { "epoch": 0.48926184042099274, "grad_norm": 1.43635094165802, "learning_rate": 9.652017557639269e-06, "loss": 0.5869, "step": 430 }, { "epoch": 0.5006400227563647, "grad_norm": 1.453419804573059, "learning_rate": 9.628649147660989e-06, "loss": 0.5862, "step": 440 }, { "epoch": 0.5120182050917366, "grad_norm": 1.5567063093185425, "learning_rate": 9.604551571351442e-06, "loss": 0.5821, "step": 450 }, { "epoch": 0.5233963874271085, "grad_norm": 1.4351816177368164, "learning_rate": 9.579728624881117e-06, "loss": 0.5969, "step": 460 }, { "epoch": 0.5347745697624804, "grad_norm": 1.4502918720245361, "learning_rate": 9.554184218690458e-06, "loss": 0.5796, "step": 470 }, { "epoch": 0.5461527520978524, "grad_norm": 1.4197890758514404, "learning_rate": 9.527922376873828e-06, "loss": 0.5882, "step": 480 }, { "epoch": 0.5575309344332243, "grad_norm": 1.5547962188720703, "learning_rate": 9.500947236545583e-06, "loss": 0.5734, "step": 490 }, { "epoch": 0.5689091167685962, "grad_norm": 1.4449543952941895, "learning_rate": 9.473263047188344e-06, "loss": 0.5788, "step": 500 }, { "epoch": 0.5802872991039681, "grad_norm": 1.480697751045227, "learning_rate": 9.444874169983562e-06, "loss": 0.568, "step": 510 }, { "epoch": 0.5916654814393401, "grad_norm": 1.7815808057785034, "learning_rate": 9.415785077124474e-06, "loss": 0.5866, "step": 520 }, { "epoch": 0.603043663774712, "grad_norm": 1.5812064409255981, "learning_rate": 9.386000351111602e-06, "loss": 0.6007, "step": 530 }, { "epoch": 0.6144218461100839, "grad_norm": 1.4948545694351196, "learning_rate": 9.355524684030841e-06, "loss": 0.5773, "step": 540 }, { "epoch": 0.6258000284454558, "grad_norm": 2.1749398708343506, "learning_rate": 9.324362876814315e-06, "loss": 0.5891, "step": 550 }, { "epoch": 0.6371782107808278, "grad_norm": 2.3155527114868164, "learning_rate": 9.292519838484052e-06, "loss": 0.5753, "step": 560 }, { "epoch": 0.6485563931161997, "grad_norm": 1.5408862829208374, "learning_rate": 9.260000585378664e-06, "loss": 0.583, "step": 570 }, { "epoch": 0.6599345754515716, "grad_norm": 2.3708317279815674, "learning_rate": 9.2268102403631e-06, "loss": 0.582, "step": 580 }, { "epoch": 0.6713127577869435, "grad_norm": 1.5664266347885132, "learning_rate": 9.192954032021626e-06, "loss": 0.5687, "step": 590 }, { "epoch": 0.6826909401223155, "grad_norm": 1.8958492279052734, "learning_rate": 9.15843729383415e-06, "loss": 0.5689, "step": 600 }, { "epoch": 0.6940691224576874, "grad_norm": 1.5079455375671387, "learning_rate": 9.123265463336022e-06, "loss": 0.5787, "step": 610 }, { "epoch": 0.7054473047930593, "grad_norm": 1.612349033355713, "learning_rate": 9.087444081261433e-06, "loss": 0.584, "step": 620 }, { "epoch": 0.7168254871284312, "grad_norm": 1.7752846479415894, "learning_rate": 9.050978790670575e-06, "loss": 0.5744, "step": 630 }, { "epoch": 0.7282036694638031, "grad_norm": 1.5847499370574951, "learning_rate": 9.013875336060671e-06, "loss": 0.5908, "step": 640 }, { "epoch": 0.7395818517991751, "grad_norm": 1.6264406442642212, "learning_rate": 8.976139562461018e-06, "loss": 0.57, "step": 650 }, { "epoch": 0.750960034134547, "grad_norm": 1.783176064491272, "learning_rate": 8.937777414512198e-06, "loss": 0.5759, "step": 660 }, { "epoch": 0.762338216469919, "grad_norm": 1.7870570421218872, "learning_rate": 8.89879493552962e-06, "loss": 0.5782, "step": 670 }, { "epoch": 0.7737163988052909, "grad_norm": 1.666524052619934, "learning_rate": 8.85919826655147e-06, "loss": 0.5734, "step": 680 }, { "epoch": 0.7850945811406628, "grad_norm": 1.7140533924102783, "learning_rate": 8.81899364537131e-06, "loss": 0.5854, "step": 690 }, { "epoch": 0.7964727634760347, "grad_norm": 1.655806303024292, "learning_rate": 8.778187405555412e-06, "loss": 0.581, "step": 700 }, { "epoch": 0.8078509458114066, "grad_norm": 1.8484113216400146, "learning_rate": 8.736785975445009e-06, "loss": 0.5722, "step": 710 }, { "epoch": 0.8192291281467785, "grad_norm": 1.7415412664413452, "learning_rate": 8.694795877143625e-06, "loss": 0.5804, "step": 720 }, { "epoch": 0.8306073104821505, "grad_norm": 1.6867784261703491, "learning_rate": 8.652223725489614e-06, "loss": 0.5675, "step": 730 }, { "epoch": 0.8419854928175224, "grad_norm": 1.7585080862045288, "learning_rate": 8.60907622701412e-06, "loss": 0.5766, "step": 740 }, { "epoch": 0.8533636751528944, "grad_norm": 1.9183036088943481, "learning_rate": 8.565360178884554e-06, "loss": 0.5763, "step": 750 }, { "epoch": 0.8647418574882663, "grad_norm": 1.7552123069763184, "learning_rate": 8.521082467833832e-06, "loss": 0.5734, "step": 760 }, { "epoch": 0.8761200398236382, "grad_norm": 1.8437159061431885, "learning_rate": 8.476250069075476e-06, "loss": 0.5796, "step": 770 }, { "epoch": 0.8874982221590101, "grad_norm": 1.8383750915527344, "learning_rate": 8.430870045204787e-06, "loss": 0.5711, "step": 780 }, { "epoch": 0.898876404494382, "grad_norm": 1.6691679954528809, "learning_rate": 8.384949545086255e-06, "loss": 0.5922, "step": 790 }, { "epoch": 0.9102545868297539, "grad_norm": 2.3490517139434814, "learning_rate": 8.338495802727364e-06, "loss": 0.5772, "step": 800 }, { "epoch": 0.9216327691651258, "grad_norm": 1.8429322242736816, "learning_rate": 8.291516136139017e-06, "loss": 0.561, "step": 810 }, { "epoch": 0.9330109515004978, "grad_norm": 1.7698525190353394, "learning_rate": 8.244017946182676e-06, "loss": 0.5755, "step": 820 }, { "epoch": 0.9443891338358698, "grad_norm": 1.839783787727356, "learning_rate": 8.196008715404506e-06, "loss": 0.5781, "step": 830 }, { "epoch": 0.9557673161712417, "grad_norm": 1.793582558631897, "learning_rate": 8.14749600685661e-06, "loss": 0.5734, "step": 840 }, { "epoch": 0.9671454985066136, "grad_norm": 1.85206139087677, "learning_rate": 8.098487462905603e-06, "loss": 0.5633, "step": 850 }, { "epoch": 0.9785236808419855, "grad_norm": 1.9638901948928833, "learning_rate": 8.048990804028678e-06, "loss": 0.5568, "step": 860 }, { "epoch": 0.9899018631773574, "grad_norm": 2.9568979740142822, "learning_rate": 7.999013827597381e-06, "loss": 0.573, "step": 870 }, { "epoch": 1.0014222727919215, "grad_norm": 3.1088480949401855, "learning_rate": 7.948564406649268e-06, "loss": 0.5728, "step": 880 }, { "epoch": 1.0128004551272933, "grad_norm": 2.6320693492889404, "learning_rate": 7.897650488647628e-06, "loss": 0.5593, "step": 890 }, { "epoch": 1.0241786374626654, "grad_norm": 3.3647408485412598, "learning_rate": 7.846280094229498e-06, "loss": 0.5741, "step": 900 }, { "epoch": 1.0355568197980372, "grad_norm": 3.51078724861145, "learning_rate": 7.794461315942154e-06, "loss": 0.5617, "step": 910 }, { "epoch": 1.0469350021334092, "grad_norm": 3.4864909648895264, "learning_rate": 7.742202316968254e-06, "loss": 0.5707, "step": 920 }, { "epoch": 1.0583131844687812, "grad_norm": 3.092726469039917, "learning_rate": 7.689511329839868e-06, "loss": 0.5787, "step": 930 }, { "epoch": 1.069691366804153, "grad_norm": 5.743341445922852, "learning_rate": 7.636396655141585e-06, "loss": 0.5667, "step": 940 }, { "epoch": 1.081069549139525, "grad_norm": 3.3752024173736572, "learning_rate": 7.5828666602028886e-06, "loss": 0.5665, "step": 950 }, { "epoch": 1.0924477314748968, "grad_norm": 3.439068555831909, "learning_rate": 7.5289297777800275e-06, "loss": 0.5666, "step": 960 }, { "epoch": 1.1038259138102688, "grad_norm": 2.710096597671509, "learning_rate": 7.474594504727576e-06, "loss": 0.5643, "step": 970 }, { "epoch": 1.1152040961456406, "grad_norm": 3.292506456375122, "learning_rate": 7.419869400659889e-06, "loss": 0.5651, "step": 980 }, { "epoch": 1.1265822784810127, "grad_norm": 4.223087787628174, "learning_rate": 7.364763086602696e-06, "loss": 0.5614, "step": 990 }, { "epoch": 1.1379604608163847, "grad_norm": 3.193408966064453, "learning_rate": 7.3092842436349776e-06, "loss": 0.5671, "step": 1000 }, { "epoch": 1.1493386431517565, "grad_norm": 3.6121826171875, "learning_rate": 7.253441611521426e-06, "loss": 0.582, "step": 1010 }, { "epoch": 1.1607168254871285, "grad_norm": 3.511631965637207, "learning_rate": 7.197243987335636e-06, "loss": 0.5687, "step": 1020 }, { "epoch": 1.1720950078225003, "grad_norm": 3.4007201194763184, "learning_rate": 7.140700224074269e-06, "loss": 0.5655, "step": 1030 }, { "epoch": 1.1834731901578723, "grad_norm": 3.5121917724609375, "learning_rate": 7.083819229262413e-06, "loss": 0.5763, "step": 1040 }, { "epoch": 1.1948513724932441, "grad_norm": 2.648864507675171, "learning_rate": 7.026609963550353e-06, "loss": 0.5648, "step": 1050 }, { "epoch": 1.2062295548286162, "grad_norm": 3.010129451751709, "learning_rate": 6.969081439301975e-06, "loss": 0.559, "step": 1060 }, { "epoch": 1.217607737163988, "grad_norm": 2.5858263969421387, "learning_rate": 6.9112427191749975e-06, "loss": 0.5686, "step": 1070 }, { "epoch": 1.22898591949936, "grad_norm": 3.160468578338623, "learning_rate": 6.853102914693336e-06, "loss": 0.5676, "step": 1080 }, { "epoch": 1.2403641018347318, "grad_norm": 3.4577555656433105, "learning_rate": 6.794671184811699e-06, "loss": 0.5652, "step": 1090 }, { "epoch": 1.2517422841701038, "grad_norm": 2.875277042388916, "learning_rate": 6.735956734472772e-06, "loss": 0.5652, "step": 1100 }, { "epoch": 1.2631204665054758, "grad_norm": 3.1883442401885986, "learning_rate": 6.6769688131571255e-06, "loss": 0.575, "step": 1110 }, { "epoch": 1.2744986488408476, "grad_norm": 2.6882236003875732, "learning_rate": 6.617716713426105e-06, "loss": 0.5731, "step": 1120 }, { "epoch": 1.2858768311762196, "grad_norm": 2.7670583724975586, "learning_rate": 6.55820976945796e-06, "loss": 0.5742, "step": 1130 }, { "epoch": 1.2972550135115914, "grad_norm": 3.168198823928833, "learning_rate": 6.498457355577385e-06, "loss": 0.5652, "step": 1140 }, { "epoch": 1.3086331958469635, "grad_norm": 3.868013620376587, "learning_rate": 6.438468884778762e-06, "loss": 0.5647, "step": 1150 }, { "epoch": 1.3200113781823353, "grad_norm": 2.759697437286377, "learning_rate": 6.3782538072432935e-06, "loss": 0.5648, "step": 1160 }, { "epoch": 1.3313895605177073, "grad_norm": 3.8222415447235107, "learning_rate": 6.317821608850287e-06, "loss": 0.5529, "step": 1170 }, { "epoch": 1.3427677428530793, "grad_norm": 2.72162127494812, "learning_rate": 6.257181809682813e-06, "loss": 0.5659, "step": 1180 }, { "epoch": 1.3541459251884511, "grad_norm": 3.477682590484619, "learning_rate": 6.196343962527975e-06, "loss": 0.5576, "step": 1190 }, { "epoch": 1.3655241075238231, "grad_norm": 58.64210891723633, "learning_rate": 6.135317651372031e-06, "loss": 0.5649, "step": 1200 }, { "epoch": 1.376902289859195, "grad_norm": 4.460225582122803, "learning_rate": 6.07411248989059e-06, "loss": 0.5554, "step": 1210 }, { "epoch": 1.388280472194567, "grad_norm": 3.014174461364746, "learning_rate": 6.012738119934141e-06, "loss": 0.5624, "step": 1220 }, { "epoch": 1.3996586545299388, "grad_norm": 3.1429977416992188, "learning_rate": 5.951204210009141e-06, "loss": 0.5656, "step": 1230 }, { "epoch": 1.4110368368653108, "grad_norm": 2.938356399536133, "learning_rate": 5.889520453754904e-06, "loss": 0.5623, "step": 1240 }, { "epoch": 1.4224150192006828, "grad_norm": 3.665616512298584, "learning_rate": 5.827696568416532e-06, "loss": 0.5623, "step": 1250 }, { "epoch": 1.4337932015360546, "grad_norm": 3.3361687660217285, "learning_rate": 5.765742293314119e-06, "loss": 0.5649, "step": 1260 }, { "epoch": 1.4451713838714264, "grad_norm": 2.919668436050415, "learning_rate": 5.703667388308487e-06, "loss": 0.5633, "step": 1270 }, { "epoch": 1.4565495662067984, "grad_norm": 2.943481683731079, "learning_rate": 5.641481632263687e-06, "loss": 0.5662, "step": 1280 }, { "epoch": 1.4679277485421705, "grad_norm": 3.160897731781006, "learning_rate": 5.579194821506497e-06, "loss": 0.5564, "step": 1290 }, { "epoch": 1.4793059308775423, "grad_norm": 3.4720726013183594, "learning_rate": 5.516816768283174e-06, "loss": 0.5624, "step": 1300 }, { "epoch": 1.4906841132129143, "grad_norm": 3.757178544998169, "learning_rate": 5.454357299213708e-06, "loss": 0.5597, "step": 1310 }, { "epoch": 1.5020622955482863, "grad_norm": 3.1537322998046875, "learning_rate": 5.391826253743788e-06, "loss": 0.5587, "step": 1320 }, { "epoch": 1.513440477883658, "grad_norm": 3.478196382522583, "learning_rate": 5.329233482594771e-06, "loss": 0.5532, "step": 1330 }, { "epoch": 1.52481866021903, "grad_norm": 4.120450496673584, "learning_rate": 5.266588846211865e-06, "loss": 0.5504, "step": 1340 }, { "epoch": 1.536196842554402, "grad_norm": 2.9817895889282227, "learning_rate": 5.20390221321078e-06, "loss": 0.5551, "step": 1350 }, { "epoch": 1.547575024889774, "grad_norm": 3.2641355991363525, "learning_rate": 5.141183458823085e-06, "loss": 0.5533, "step": 1360 }, { "epoch": 1.5589532072251457, "grad_norm": 3.226193428039551, "learning_rate": 5.078442463340543e-06, "loss": 0.5521, "step": 1370 }, { "epoch": 1.5703313895605175, "grad_norm": 2.790149211883545, "learning_rate": 5.01568911055863e-06, "loss": 0.5627, "step": 1380 }, { "epoch": 1.5817095718958898, "grad_norm": 2.948485851287842, "learning_rate": 4.952933286219503e-06, "loss": 0.5697, "step": 1390 }, { "epoch": 1.5930877542312616, "grad_norm": 2.847878932952881, "learning_rate": 4.890184876454675e-06, "loss": 0.5624, "step": 1400 }, { "epoch": 1.6044659365666334, "grad_norm": 3.067723512649536, "learning_rate": 4.827453766227617e-06, "loss": 0.5634, "step": 1410 }, { "epoch": 1.6158441189020054, "grad_norm": 3.5816497802734375, "learning_rate": 4.764749837776545e-06, "loss": 0.5607, "step": 1420 }, { "epoch": 1.6272223012373774, "grad_norm": 3.231588840484619, "learning_rate": 4.702082969057646e-06, "loss": 0.5555, "step": 1430 }, { "epoch": 1.6386004835727492, "grad_norm": 3.1863789558410645, "learning_rate": 4.6394630321889535e-06, "loss": 0.5555, "step": 1440 }, { "epoch": 1.649978665908121, "grad_norm": 3.0723366737365723, "learning_rate": 4.576899891895185e-06, "loss": 0.5543, "step": 1450 }, { "epoch": 1.661356848243493, "grad_norm": 2.7250912189483643, "learning_rate": 4.5144034039537e-06, "loss": 0.5568, "step": 1460 }, { "epoch": 1.672735030578865, "grad_norm": 3.2123124599456787, "learning_rate": 4.451983413641894e-06, "loss": 0.5556, "step": 1470 }, { "epoch": 1.6841132129142369, "grad_norm": 3.832956552505493, "learning_rate": 4.389649754186242e-06, "loss": 0.5606, "step": 1480 }, { "epoch": 1.695491395249609, "grad_norm": 3.1121504306793213, "learning_rate": 4.327412245213232e-06, "loss": 0.5494, "step": 1490 }, { "epoch": 1.706869577584981, "grad_norm": 3.0781760215759277, "learning_rate": 4.265280691202449e-06, "loss": 0.5625, "step": 1500 }, { "epoch": 1.7182477599203527, "grad_norm": 3.1622416973114014, "learning_rate": 4.20326487994205e-06, "loss": 0.5525, "step": 1510 }, { "epoch": 1.7296259422557245, "grad_norm": 3.425814628601074, "learning_rate": 4.141374580986853e-06, "loss": 0.5642, "step": 1520 }, { "epoch": 1.7410041245910965, "grad_norm": 3.0332634449005127, "learning_rate": 4.079619544119303e-06, "loss": 0.5682, "step": 1530 }, { "epoch": 1.7523823069264686, "grad_norm": 3.9311952590942383, "learning_rate": 4.018009497813575e-06, "loss": 0.5496, "step": 1540 }, { "epoch": 1.7637604892618404, "grad_norm": 3.6143958568573, "learning_rate": 3.956554147702994e-06, "loss": 0.5607, "step": 1550 }, { "epoch": 1.7751386715972124, "grad_norm": 3.6590614318847656, "learning_rate": 3.895263175051089e-06, "loss": 0.5686, "step": 1560 }, { "epoch": 1.7865168539325844, "grad_norm": 3.8076541423797607, "learning_rate": 3.834146235226467e-06, "loss": 0.5724, "step": 1570 }, { "epoch": 1.7978950362679562, "grad_norm": 3.879667043685913, "learning_rate": 3.7732129561817687e-06, "loss": 0.5481, "step": 1580 }, { "epoch": 1.809273218603328, "grad_norm": 2.696838855743408, "learning_rate": 3.7124729369369473e-06, "loss": 0.5684, "step": 1590 }, { "epoch": 1.8206514009387, "grad_norm": 3.3719265460968018, "learning_rate": 3.6519357460671096e-06, "loss": 0.5645, "step": 1600 }, { "epoch": 1.832029583274072, "grad_norm": 2.9818122386932373, "learning_rate": 3.5916109201951355e-06, "loss": 0.5591, "step": 1610 }, { "epoch": 1.8434077656094439, "grad_norm": 2.6824517250061035, "learning_rate": 3.531507962489349e-06, "loss": 0.5672, "step": 1620 }, { "epoch": 1.8547859479448157, "grad_norm": 3.0721547603607178, "learning_rate": 3.4716363411664517e-06, "loss": 0.5513, "step": 1630 }, { "epoch": 1.866164130280188, "grad_norm": 3.090949535369873, "learning_rate": 3.4120054879999654e-06, "loss": 0.568, "step": 1640 }, { "epoch": 1.8775423126155597, "grad_norm": 3.357748031616211, "learning_rate": 3.3526247968344084e-06, "loss": 0.5602, "step": 1650 }, { "epoch": 1.8889204949509315, "grad_norm": 3.1600940227508545, "learning_rate": 3.2935036221054638e-06, "loss": 0.5555, "step": 1660 }, { "epoch": 1.9002986772863035, "grad_norm": 4.0066447257995605, "learning_rate": 3.234651277366325e-06, "loss": 0.5771, "step": 1670 }, { "epoch": 1.9116768596216756, "grad_norm": 3.6013309955596924, "learning_rate": 3.1760770338205204e-06, "loss": 0.5724, "step": 1680 }, { "epoch": 1.9230550419570474, "grad_norm": 3.687427043914795, "learning_rate": 3.11779011886138e-06, "loss": 0.5551, "step": 1690 }, { "epoch": 1.9344332242924192, "grad_norm": 3.2290844917297363, "learning_rate": 3.0597997146184183e-06, "loss": 0.5619, "step": 1700 }, { "epoch": 1.9458114066277912, "grad_norm": 3.9673118591308594, "learning_rate": 3.0021149565108355e-06, "loss": 0.555, "step": 1710 }, { "epoch": 1.9571895889631632, "grad_norm": 3.0431714057922363, "learning_rate": 2.9447449318083964e-06, "loss": 0.5549, "step": 1720 }, { "epoch": 1.968567771298535, "grad_norm": 2.991881847381592, "learning_rate": 2.887698678199872e-06, "loss": 0.5507, "step": 1730 }, { "epoch": 1.979945953633907, "grad_norm": 3.0121707916259766, "learning_rate": 2.8309851823693114e-06, "loss": 0.5609, "step": 1740 }, { "epoch": 1.991324135969279, "grad_norm": 3.5490665435791016, "learning_rate": 2.7746133785803363e-06, "loss": 0.5728, "step": 1750 }, { "epoch": 2.002844545583843, "grad_norm": 5.156729221343994, "learning_rate": 2.71859214726869e-06, "loss": 0.5611, "step": 1760 }, { "epoch": 2.014222727919215, "grad_norm": 4.809017181396484, "learning_rate": 2.6629303136432873e-06, "loss": 0.5519, "step": 1770 }, { "epoch": 2.0256009102545867, "grad_norm": 4.811421871185303, "learning_rate": 2.6076366462959368e-06, "loss": 0.5576, "step": 1780 }, { "epoch": 2.036979092589959, "grad_norm": 3.962195634841919, "learning_rate": 2.5527198558200124e-06, "loss": 0.556, "step": 1790 }, { "epoch": 2.0483572749253307, "grad_norm": 5.088486194610596, "learning_rate": 2.4981885934382198e-06, "loss": 0.5571, "step": 1800 }, { "epoch": 2.0597354572607025, "grad_norm": 4.567300319671631, "learning_rate": 2.444051449639766e-06, "loss": 0.5554, "step": 1810 }, { "epoch": 2.0711136395960743, "grad_norm": 4.686933517456055, "learning_rate": 2.3903169528270564e-06, "loss": 0.5549, "step": 1820 }, { "epoch": 2.0824918219314466, "grad_norm": 4.070396423339844, "learning_rate": 2.336993567972193e-06, "loss": 0.5686, "step": 1830 }, { "epoch": 2.0938700042668184, "grad_norm": 4.52924108505249, "learning_rate": 2.2840896952834573e-06, "loss": 0.5661, "step": 1840 }, { "epoch": 2.10524818660219, "grad_norm": 4.499566078186035, "learning_rate": 2.2316136688820006e-06, "loss": 0.5767, "step": 1850 }, { "epoch": 2.1166263689375624, "grad_norm": 4.242886543273926, "learning_rate": 2.1795737554889384e-06, "loss": 0.5567, "step": 1860 }, { "epoch": 2.128004551272934, "grad_norm": 4.661407947540283, "learning_rate": 2.127978153123071e-06, "loss": 0.5549, "step": 1870 }, { "epoch": 2.139382733608306, "grad_norm": 4.505501747131348, "learning_rate": 2.0768349898094204e-06, "loss": 0.5615, "step": 1880 }, { "epoch": 2.150760915943678, "grad_norm": 4.7061357498168945, "learning_rate": 2.026152322298791e-06, "loss": 0.5568, "step": 1890 }, { "epoch": 2.16213909827905, "grad_norm": 4.6892476081848145, "learning_rate": 1.975938134798577e-06, "loss": 0.5377, "step": 1900 }, { "epoch": 2.173517280614422, "grad_norm": 4.749743938446045, "learning_rate": 1.926200337714968e-06, "loss": 0.5402, "step": 1910 }, { "epoch": 2.1848954629497936, "grad_norm": 3.927170991897583, "learning_rate": 1.8769467664068058e-06, "loss": 0.5572, "step": 1920 }, { "epoch": 2.196273645285166, "grad_norm": 4.769582748413086, "learning_rate": 1.8281851799512551e-06, "loss": 0.5634, "step": 1930 }, { "epoch": 2.2076518276205377, "grad_norm": 5.719987869262695, "learning_rate": 1.779923259921481e-06, "loss": 0.5544, "step": 1940 }, { "epoch": 2.2190300099559095, "grad_norm": 4.865458965301514, "learning_rate": 1.732168609176556e-06, "loss": 0.5572, "step": 1950 }, { "epoch": 2.2304081922912813, "grad_norm": 24.83760643005371, "learning_rate": 1.684928750663758e-06, "loss": 0.5581, "step": 1960 }, { "epoch": 2.2417863746266535, "grad_norm": 5.071547031402588, "learning_rate": 1.6382111262334454e-06, "loss": 0.559, "step": 1970 }, { "epoch": 2.2531645569620253, "grad_norm": 4.077290058135986, "learning_rate": 1.5920230954667198e-06, "loss": 0.5471, "step": 1980 }, { "epoch": 2.264542739297397, "grad_norm": 3.790095090866089, "learning_rate": 1.5463719345160571e-06, "loss": 0.558, "step": 1990 }, { "epoch": 2.2759209216327694, "grad_norm": 5.027350425720215, "learning_rate": 1.501264834959061e-06, "loss": 0.5627, "step": 2000 }, { "epoch": 2.287299103968141, "grad_norm": 4.2677083015441895, "learning_rate": 1.4567089026655524e-06, "loss": 0.5497, "step": 2010 }, { "epoch": 2.298677286303513, "grad_norm": 5.105195045471191, "learning_rate": 1.4127111566781664e-06, "loss": 0.5419, "step": 2020 }, { "epoch": 2.310055468638885, "grad_norm": 4.892566680908203, "learning_rate": 1.369278528106603e-06, "loss": 0.553, "step": 2030 }, { "epoch": 2.321433650974257, "grad_norm": 4.721883296966553, "learning_rate": 1.3264178590357607e-06, "loss": 0.5481, "step": 2040 }, { "epoch": 2.332811833309629, "grad_norm": 4.8671722412109375, "learning_rate": 1.2841359014478772e-06, "loss": 0.5647, "step": 2050 }, { "epoch": 2.3441900156450006, "grad_norm": 4.7961201667785645, "learning_rate": 1.2424393161588632e-06, "loss": 0.5465, "step": 2060 }, { "epoch": 2.3555681979803724, "grad_norm": 5.445614337921143, "learning_rate": 1.2013346717690072e-06, "loss": 0.5523, "step": 2070 }, { "epoch": 2.3669463803157447, "grad_norm": 5.255151271820068, "learning_rate": 1.1608284436281974e-06, "loss": 0.574, "step": 2080 }, { "epoch": 2.3783245626511165, "grad_norm": 3.8508081436157227, "learning_rate": 1.120927012815844e-06, "loss": 0.539, "step": 2090 }, { "epoch": 2.3897027449864883, "grad_norm": 5.08436393737793, "learning_rate": 1.081636665135642e-06, "loss": 0.5392, "step": 2100 }, { "epoch": 2.4010809273218605, "grad_norm": 5.37895393371582, "learning_rate": 1.0429635901253516e-06, "loss": 0.5636, "step": 2110 }, { "epoch": 2.4124591096572323, "grad_norm": 3.8906028270721436, "learning_rate": 1.0049138800817332e-06, "loss": 0.567, "step": 2120 }, { "epoch": 2.423837291992604, "grad_norm": 4.598055362701416, "learning_rate": 9.674935291008198e-07, "loss": 0.5522, "step": 2130 }, { "epoch": 2.435215474327976, "grad_norm": 4.944962501525879, "learning_rate": 9.307084321336357e-07, "loss": 0.5586, "step": 2140 }, { "epoch": 2.446593656663348, "grad_norm": 4.658571243286133, "learning_rate": 8.94564384057553e-07, "loss": 0.5517, "step": 2150 }, { "epoch": 2.45797183899872, "grad_norm": 4.260778427124023, "learning_rate": 8.59067078763397e-07, "loss": 0.5527, "step": 2160 }, { "epoch": 2.4693500213340918, "grad_norm": 4.492344856262207, "learning_rate": 8.242221082584811e-07, "loss": 0.5593, "step": 2170 }, { "epoch": 2.4807282036694636, "grad_norm": 4.839461326599121, "learning_rate": 7.900349617856684e-07, "loss": 0.5572, "step": 2180 }, { "epoch": 2.492106386004836, "grad_norm": 4.276406764984131, "learning_rate": 7.5651102495864e-07, "loss": 0.5506, "step": 2190 }, { "epoch": 2.5034845683402076, "grad_norm": 4.654806137084961, "learning_rate": 7.236555789134769e-07, "loss": 0.5449, "step": 2200 }, { "epoch": 2.5148627506755794, "grad_norm": 4.328060626983643, "learning_rate": 6.914737994767101e-07, "loss": 0.5614, "step": 2210 }, { "epoch": 2.5262409330109517, "grad_norm": 6.862987041473389, "learning_rate": 6.599707563499514e-07, "loss": 0.556, "step": 2220 }, { "epoch": 2.5376191153463235, "grad_norm": 4.085236072540283, "learning_rate": 6.291514123112474e-07, "loss": 0.5636, "step": 2230 }, { "epoch": 2.5489972976816953, "grad_norm": 4.0781025886535645, "learning_rate": 5.990206224332801e-07, "loss": 0.5598, "step": 2240 }, { "epoch": 2.5603754800170675, "grad_norm": 4.941834449768066, "learning_rate": 5.695831333185265e-07, "loss": 0.5484, "step": 2250 }, { "epoch": 2.5717536623524393, "grad_norm": 4.038600444793701, "learning_rate": 5.408435823515145e-07, "loss": 0.575, "step": 2260 }, { "epoch": 2.583131844687811, "grad_norm": 4.425263404846191, "learning_rate": 5.128064969682778e-07, "loss": 0.5693, "step": 2270 }, { "epoch": 2.594510027023183, "grad_norm": 4.681412696838379, "learning_rate": 4.854762939431362e-07, "loss": 0.5397, "step": 2280 }, { "epoch": 2.6058882093585547, "grad_norm": 4.6591267585754395, "learning_rate": 4.5885727869290354e-07, "loss": 0.5561, "step": 2290 }, { "epoch": 2.617266391693927, "grad_norm": 4.0130157470703125, "learning_rate": 4.32953644598646e-07, "loss": 0.5588, "step": 2300 }, { "epoch": 2.6286445740292987, "grad_norm": 5.125436305999756, "learning_rate": 4.077694723450815e-07, "loss": 0.5537, "step": 2310 }, { "epoch": 2.6400227563646705, "grad_norm": 4.90755558013916, "learning_rate": 3.8330872927773544e-07, "loss": 0.5465, "step": 2320 }, { "epoch": 2.651400938700043, "grad_norm": 4.263889312744141, "learning_rate": 3.595752687779541e-07, "loss": 0.5507, "step": 2330 }, { "epoch": 2.6627791210354146, "grad_norm": 4.307155609130859, "learning_rate": 3.365728296558668e-07, "loss": 0.5518, "step": 2340 }, { "epoch": 2.6741573033707864, "grad_norm": 4.401244640350342, "learning_rate": 3.1430503556140026e-07, "loss": 0.5637, "step": 2350 }, { "epoch": 2.6855354857061586, "grad_norm": 4.537615776062012, "learning_rate": 2.9277539441343204e-07, "loss": 0.5376, "step": 2360 }, { "epoch": 2.6969136680415304, "grad_norm": 4.655327796936035, "learning_rate": 2.7198729784718047e-07, "loss": 0.5606, "step": 2370 }, { "epoch": 2.7082918503769022, "grad_norm": 4.768523216247559, "learning_rate": 2.519440206799001e-07, "loss": 0.5588, "step": 2380 }, { "epoch": 2.7196700327122745, "grad_norm": 5.224249839782715, "learning_rate": 2.326487203949984e-07, "loss": 0.5555, "step": 2390 }, { "epoch": 2.7310482150476463, "grad_norm": 3.7500874996185303, "learning_rate": 2.141044366446221e-07, "loss": 0.5449, "step": 2400 }, { "epoch": 2.742426397383018, "grad_norm": 8.473684310913086, "learning_rate": 1.9631409077081277e-07, "loss": 0.558, "step": 2410 }, { "epoch": 2.75380457971839, "grad_norm": 4.536637783050537, "learning_rate": 1.792804853452962e-07, "loss": 0.5598, "step": 2420 }, { "epoch": 2.7651827620537617, "grad_norm": 4.456907749176025, "learning_rate": 1.6300630372798666e-07, "loss": 0.5552, "step": 2430 }, { "epoch": 2.776560944389134, "grad_norm": 4.3202385902404785, "learning_rate": 1.4749410964426558e-07, "loss": 0.5515, "step": 2440 }, { "epoch": 2.7879391267245057, "grad_norm": 5.083593845367432, "learning_rate": 1.3274634678111187e-07, "loss": 0.5564, "step": 2450 }, { "epoch": 2.7993173090598775, "grad_norm": 4.5219221115112305, "learning_rate": 1.187653384021381e-07, "loss": 0.5554, "step": 2460 }, { "epoch": 2.8106954913952498, "grad_norm": 4.617882251739502, "learning_rate": 1.0555328698159951e-07, "loss": 0.5624, "step": 2470 }, { "epoch": 2.8220736737306216, "grad_norm": 4.936917781829834, "learning_rate": 9.311227385743249e-08, "loss": 0.5511, "step": 2480 }, { "epoch": 2.8334518560659934, "grad_norm": 4.036168575286865, "learning_rate": 8.144425890337371e-08, "loss": 0.5472, "step": 2490 }, { "epoch": 2.8448300384013656, "grad_norm": 5.006382465362549, "learning_rate": 7.05510802202125e-08, "loss": 0.5461, "step": 2500 }, { "epoch": 2.8562082207367374, "grad_norm": 6.169336795806885, "learning_rate": 6.043445384623259e-08, "loss": 0.5496, "step": 2510 }, { "epoch": 2.867586403072109, "grad_norm": 4.207286834716797, "learning_rate": 5.109597348687545e-08, "loss": 0.554, "step": 2520 }, { "epoch": 2.878964585407481, "grad_norm": 4.170133113861084, "learning_rate": 4.2537110263682343e-08, "loss": 0.5514, "step": 2530 }, { "epoch": 2.890342767742853, "grad_norm": 4.8024582862854, "learning_rate": 3.475921248254022e-08, "loss": 0.5471, "step": 2540 }, { "epoch": 2.901720950078225, "grad_norm": 4.820261478424072, "learning_rate": 2.7763505421281035e-08, "loss": 0.5487, "step": 2550 }, { "epoch": 2.913099132413597, "grad_norm": 4.229453086853027, "learning_rate": 2.155109113665732e-08, "loss": 0.5615, "step": 2560 }, { "epoch": 2.9244773147489687, "grad_norm": 4.419139862060547, "learning_rate": 1.612294829073491e-08, "loss": 0.5671, "step": 2570 }, { "epoch": 2.935855497084341, "grad_norm": 4.717645168304443, "learning_rate": 1.147993199671682e-08, "loss": 0.5434, "step": 2580 }, { "epoch": 2.9472336794197127, "grad_norm": 4.646683692932129, "learning_rate": 7.62277368423936e-09, "loss": 0.5543, "step": 2590 }, { "epoch": 2.9586118617550845, "grad_norm": 4.718910217285156, "learning_rate": 4.552080984143725e-09, "loss": 0.553, "step": 2600 }, { "epoch": 2.9699900440904567, "grad_norm": 3.963477849960327, "learning_rate": 2.2683376327548022e-09, "loss": 0.5507, "step": 2610 }, { "epoch": 2.9813682264258285, "grad_norm": 8.025522232055664, "learning_rate": 7.719033956782307e-10, "loss": 0.5677, "step": 2620 }, { "epoch": 2.9927464087612003, "grad_norm": 5.158026695251465, "learning_rate": 6.301401112296379e-11, "loss": 0.5586, "step": 2630 } ], "logging_steps": 10, "max_steps": 2634, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.685410418372693e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }