{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9942418426103647, "eval_steps": 500, "global_step": 585, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025591810620601407, "grad_norm": 0.3465089797973633, "learning_rate": 4.999098819177214e-05, "loss": 0.8049, "step": 5 }, { "epoch": 0.05118362124120281, "grad_norm": 0.288526713848114, "learning_rate": 4.9963959264103544e-05, "loss": 0.7441, "step": 10 }, { "epoch": 0.07677543186180422, "grad_norm": 0.29372313618659973, "learning_rate": 4.9918932703355256e-05, "loss": 0.6875, "step": 15 }, { "epoch": 0.10236724248240563, "grad_norm": 0.28075122833251953, "learning_rate": 4.98559409711857e-05, "loss": 0.6468, "step": 20 }, { "epoch": 0.12795905310300704, "grad_norm": 0.2833946645259857, "learning_rate": 4.977502948114772e-05, "loss": 0.6117, "step": 25 }, { "epoch": 0.15355086372360843, "grad_norm": 0.2722305953502655, "learning_rate": 4.967625656594782e-05, "loss": 0.5909, "step": 30 }, { "epoch": 0.17914267434420986, "grad_norm": 0.25874456763267517, "learning_rate": 4.955969343539162e-05, "loss": 0.5685, "step": 35 }, { "epoch": 0.20473448496481125, "grad_norm": 0.2593054473400116, "learning_rate": 4.942542412504543e-05, "loss": 0.5595, "step": 40 }, { "epoch": 0.23032629558541268, "grad_norm": 0.2561085522174835, "learning_rate": 4.92735454356513e-05, "loss": 0.5419, "step": 45 }, { "epoch": 0.2559181062060141, "grad_norm": 0.2852307856082916, "learning_rate": 4.910416686333906e-05, "loss": 0.5384, "step": 50 }, { "epoch": 0.28150991682661547, "grad_norm": 0.29978978633880615, "learning_rate": 4.8917410520685635e-05, "loss": 0.5243, "step": 55 }, { "epoch": 0.30710172744721687, "grad_norm": 0.30773016810417175, "learning_rate": 4.8713411048678635e-05, "loss": 0.5229, "step": 60 }, { "epoch": 0.3326935380678183, "grad_norm": 0.3103509247303009, "learning_rate": 4.849231551964771e-05, "loss": 0.5124, "step": 65 }, { "epoch": 0.3582853486884197, "grad_norm": 0.32078924775123596, "learning_rate": 4.8254283331233464e-05, "loss": 0.5118, "step": 70 }, { "epoch": 0.3838771593090211, "grad_norm": 0.3069196939468384, "learning_rate": 4.799948609147061e-05, "loss": 0.501, "step": 75 }, { "epoch": 0.4094689699296225, "grad_norm": 0.30792945623397827, "learning_rate": 4.77281074950681e-05, "loss": 0.5015, "step": 80 }, { "epoch": 0.4350607805502239, "grad_norm": 0.3218609094619751, "learning_rate": 4.744034319097535e-05, "loss": 0.5099, "step": 85 }, { "epoch": 0.46065259117082535, "grad_norm": 0.35433730483055115, "learning_rate": 4.713640064133025e-05, "loss": 0.4912, "step": 90 }, { "epoch": 0.48624440179142675, "grad_norm": 0.3382019102573395, "learning_rate": 4.681649897189036e-05, "loss": 0.4951, "step": 95 }, { "epoch": 0.5118362124120281, "grad_norm": 0.37824833393096924, "learning_rate": 4.6480868814055424e-05, "loss": 0.4889, "step": 100 }, { "epoch": 0.5374280230326296, "grad_norm": 0.3094428479671478, "learning_rate": 4.6129752138594874e-05, "loss": 0.4878, "step": 105 }, { "epoch": 0.5630198336532309, "grad_norm": 0.39037370681762695, "learning_rate": 4.5763402081200294e-05, "loss": 0.4913, "step": 110 }, { "epoch": 0.5886116442738324, "grad_norm": 0.34269848465919495, "learning_rate": 4.538208275998861e-05, "loss": 0.4926, "step": 115 }, { "epoch": 0.6142034548944337, "grad_norm": 0.3708631098270416, "learning_rate": 4.498606908508754e-05, "loss": 0.4931, "step": 120 }, { "epoch": 0.6397952655150352, "grad_norm": 0.3521125614643097, "learning_rate": 4.457564656044056e-05, "loss": 0.481, "step": 125 }, { "epoch": 0.6653870761356366, "grad_norm": 0.3537481427192688, "learning_rate": 4.415111107797445e-05, "loss": 0.4842, "step": 130 }, { "epoch": 0.690978886756238, "grad_norm": 0.38676100969314575, "learning_rate": 4.371276870427753e-05, "loss": 0.4765, "step": 135 }, { "epoch": 0.7165706973768394, "grad_norm": 0.4416983723640442, "learning_rate": 4.3260935459942584e-05, "loss": 0.4652, "step": 140 }, { "epoch": 0.7421625079974408, "grad_norm": 0.36901989579200745, "learning_rate": 4.2795937091733515e-05, "loss": 0.4819, "step": 145 }, { "epoch": 0.7677543186180422, "grad_norm": 0.3908597528934479, "learning_rate": 4.231810883773999e-05, "loss": 0.4612, "step": 150 }, { "epoch": 0.7933461292386437, "grad_norm": 0.4273853003978729, "learning_rate": 4.182779518568926e-05, "loss": 0.4713, "step": 155 }, { "epoch": 0.818937939859245, "grad_norm": 0.36905694007873535, "learning_rate": 4.132534962458962e-05, "loss": 0.4734, "step": 160 }, { "epoch": 0.8445297504798465, "grad_norm": 0.3763083517551422, "learning_rate": 4.0811134389884433e-05, "loss": 0.4659, "step": 165 }, { "epoch": 0.8701215611004478, "grad_norm": 0.3596407175064087, "learning_rate": 4.028552020230031e-05, "loss": 0.4631, "step": 170 }, { "epoch": 0.8957133717210493, "grad_norm": 0.396517276763916, "learning_rate": 3.974888600057808e-05, "loss": 0.4634, "step": 175 }, { "epoch": 0.9213051823416507, "grad_norm": 0.38290080428123474, "learning_rate": 3.920161866827889e-05, "loss": 0.4634, "step": 180 }, { "epoch": 0.946896992962252, "grad_norm": 0.39791831374168396, "learning_rate": 3.8644112754862614e-05, "loss": 0.4638, "step": 185 }, { "epoch": 0.9724888035828535, "grad_norm": 0.3460437059402466, "learning_rate": 3.807677019123944e-05, "loss": 0.4647, "step": 190 }, { "epoch": 0.9980806142034548, "grad_norm": 0.39563411474227905, "learning_rate": 3.7500000000000003e-05, "loss": 0.465, "step": 195 }, { "epoch": 1.0236724248240563, "grad_norm": 0.3726441264152527, "learning_rate": 3.69142180005327e-05, "loss": 0.5229, "step": 200 }, { "epoch": 1.0492642354446577, "grad_norm": 0.402972936630249, "learning_rate": 3.631984650924094e-05, "loss": 0.4445, "step": 205 }, { "epoch": 1.0748560460652592, "grad_norm": 0.4153399169445038, "learning_rate": 3.5717314035076355e-05, "loss": 0.4516, "step": 210 }, { "epoch": 1.1004478566858604, "grad_norm": 0.5092347860336304, "learning_rate": 3.510705497060762e-05, "loss": 0.4528, "step": 215 }, { "epoch": 1.1260396673064619, "grad_norm": 0.40579304099082947, "learning_rate": 3.4489509278847414e-05, "loss": 0.4395, "step": 220 }, { "epoch": 1.1516314779270633, "grad_norm": 0.43368446826934814, "learning_rate": 3.386512217606339e-05, "loss": 0.4484, "step": 225 }, { "epoch": 1.1772232885476648, "grad_norm": 0.41571882367134094, "learning_rate": 3.323434381080199e-05, "loss": 0.4376, "step": 230 }, { "epoch": 1.2028150991682662, "grad_norm": 0.44565585255622864, "learning_rate": 3.2597628939356175e-05, "loss": 0.4578, "step": 235 }, { "epoch": 1.2284069097888675, "grad_norm": 0.4231709837913513, "learning_rate": 3.195543659791132e-05, "loss": 0.439, "step": 240 }, { "epoch": 1.253998720409469, "grad_norm": 0.4322074055671692, "learning_rate": 3.130822977160554e-05, "loss": 0.4429, "step": 245 }, { "epoch": 1.2795905310300704, "grad_norm": 0.42521893978118896, "learning_rate": 3.065647506074306e-05, "loss": 0.4427, "step": 250 }, { "epoch": 1.3051823416506718, "grad_norm": 0.416797012090683, "learning_rate": 3.0000642344401113e-05, "loss": 0.4315, "step": 255 }, { "epoch": 1.3307741522712733, "grad_norm": 0.41196638345718384, "learning_rate": 2.9341204441673266e-05, "loss": 0.4425, "step": 260 }, { "epoch": 1.3563659628918745, "grad_norm": 0.4643464982509613, "learning_rate": 2.8678636770792906e-05, "loss": 0.4488, "step": 265 }, { "epoch": 1.381957773512476, "grad_norm": 0.40053775906562805, "learning_rate": 2.8013417006383076e-05, "loss": 0.4289, "step": 270 }, { "epoch": 1.4075495841330774, "grad_norm": 0.4310891032218933, "learning_rate": 2.7346024735079486e-05, "loss": 0.4348, "step": 275 }, { "epoch": 1.4331413947536789, "grad_norm": 0.5526339411735535, "learning_rate": 2.667694110977506e-05, "loss": 0.4294, "step": 280 }, { "epoch": 1.4587332053742803, "grad_norm": 0.473822683095932, "learning_rate": 2.600664850273538e-05, "loss": 0.4361, "step": 285 }, { "epoch": 1.4843250159948815, "grad_norm": 0.4439009726047516, "learning_rate": 2.5335630157834937e-05, "loss": 0.4413, "step": 290 }, { "epoch": 1.5099168266154832, "grad_norm": 0.4225783944129944, "learning_rate": 2.4664369842165068e-05, "loss": 0.4296, "step": 295 }, { "epoch": 1.5355086372360844, "grad_norm": 0.497530460357666, "learning_rate": 2.399335149726463e-05, "loss": 0.4447, "step": 300 }, { "epoch": 1.561100447856686, "grad_norm": 0.4586053490638733, "learning_rate": 2.3323058890224938e-05, "loss": 0.4207, "step": 305 }, { "epoch": 1.5866922584772873, "grad_norm": 0.5011757612228394, "learning_rate": 2.265397526492052e-05, "loss": 0.4398, "step": 310 }, { "epoch": 1.6122840690978886, "grad_norm": 0.4521833658218384, "learning_rate": 2.1986582993616926e-05, "loss": 0.4367, "step": 315 }, { "epoch": 1.6378758797184902, "grad_norm": 0.4077029526233673, "learning_rate": 2.1321363229207096e-05, "loss": 0.4274, "step": 320 }, { "epoch": 1.6634676903390915, "grad_norm": 0.4575510621070862, "learning_rate": 2.0658795558326743e-05, "loss": 0.4412, "step": 325 }, { "epoch": 1.689059500959693, "grad_norm": 0.48359596729278564, "learning_rate": 1.9999357655598893e-05, "loss": 0.4408, "step": 330 }, { "epoch": 1.7146513115802944, "grad_norm": 0.4537913203239441, "learning_rate": 1.934352493925695e-05, "loss": 0.4306, "step": 335 }, { "epoch": 1.7402431222008956, "grad_norm": 0.4479089379310608, "learning_rate": 1.8691770228394456e-05, "loss": 0.4347, "step": 340 }, { "epoch": 1.7658349328214973, "grad_norm": 0.432325154542923, "learning_rate": 1.8044563402088684e-05, "loss": 0.4252, "step": 345 }, { "epoch": 1.7914267434420985, "grad_norm": 0.4653799831867218, "learning_rate": 1.740237106064383e-05, "loss": 0.4306, "step": 350 }, { "epoch": 1.8170185540627, "grad_norm": 0.46541592478752136, "learning_rate": 1.6765656189198013e-05, "loss": 0.4201, "step": 355 }, { "epoch": 1.8426103646833014, "grad_norm": 0.5030815005302429, "learning_rate": 1.613487782393661e-05, "loss": 0.426, "step": 360 }, { "epoch": 1.8682021753039026, "grad_norm": 0.5130462646484375, "learning_rate": 1.5510490721152592e-05, "loss": 0.4276, "step": 365 }, { "epoch": 1.8937939859245043, "grad_norm": 0.48607420921325684, "learning_rate": 1.489294502939238e-05, "loss": 0.418, "step": 370 }, { "epoch": 1.9193857965451055, "grad_norm": 0.43401479721069336, "learning_rate": 1.4282685964923642e-05, "loss": 0.4326, "step": 375 }, { "epoch": 1.944977607165707, "grad_norm": 0.4771016538143158, "learning_rate": 1.3680153490759073e-05, "loss": 0.433, "step": 380 }, { "epoch": 1.9705694177863085, "grad_norm": 0.503084123134613, "learning_rate": 1.3085781999467303e-05, "loss": 0.4278, "step": 385 }, { "epoch": 1.9961612284069097, "grad_norm": 0.504097580909729, "learning_rate": 1.2500000000000006e-05, "loss": 0.4221, "step": 390 }, { "epoch": 2.0217530390275114, "grad_norm": 0.46889811754226685, "learning_rate": 1.1923229808760564e-05, "loss": 0.4834, "step": 395 }, { "epoch": 2.0473448496481126, "grad_norm": 0.45117413997650146, "learning_rate": 1.1355887245137383e-05, "loss": 0.4055, "step": 400 }, { "epoch": 2.072936660268714, "grad_norm": 0.47571200132369995, "learning_rate": 1.0798381331721109e-05, "loss": 0.4143, "step": 405 }, { "epoch": 2.0985284708893155, "grad_norm": 0.5209171175956726, "learning_rate": 1.0251113999421935e-05, "loss": 0.4189, "step": 410 }, { "epoch": 2.1241202815099167, "grad_norm": 0.49102234840393066, "learning_rate": 9.714479797699694e-06, "loss": 0.4051, "step": 415 }, { "epoch": 2.1497120921305184, "grad_norm": 0.5026312470436096, "learning_rate": 9.18886561011557e-06, "loss": 0.4181, "step": 420 }, { "epoch": 2.1753039027511196, "grad_norm": 0.4433274269104004, "learning_rate": 8.67465037541038e-06, "loss": 0.4141, "step": 425 }, { "epoch": 2.200895713371721, "grad_norm": 0.4839063286781311, "learning_rate": 8.172204814310742e-06, "loss": 0.4148, "step": 430 }, { "epoch": 2.2264875239923225, "grad_norm": 0.46064066886901855, "learning_rate": 7.681891162260015e-06, "loss": 0.4093, "step": 435 }, { "epoch": 2.2520793346129238, "grad_norm": 0.47170594334602356, "learning_rate": 7.20406290826649e-06, "loss": 0.4152, "step": 440 }, { "epoch": 2.2776711452335254, "grad_norm": 0.4596922695636749, "learning_rate": 6.739064540057424e-06, "loss": 0.4219, "step": 445 }, { "epoch": 2.3032629558541267, "grad_norm": 0.45822423696517944, "learning_rate": 6.28723129572247e-06, "loss": 0.4169, "step": 450 }, { "epoch": 2.328854766474728, "grad_norm": 0.46449440717697144, "learning_rate": 5.848888922025553e-06, "loss": 0.4033, "step": 455 }, { "epoch": 2.3544465770953296, "grad_norm": 0.4573959708213806, "learning_rate": 5.424353439559446e-06, "loss": 0.4139, "step": 460 }, { "epoch": 2.380038387715931, "grad_norm": 0.48029977083206177, "learning_rate": 5.013930914912476e-06, "loss": 0.4184, "step": 465 }, { "epoch": 2.4056301983365325, "grad_norm": 0.5101778507232666, "learning_rate": 4.617917240011394e-06, "loss": 0.4142, "step": 470 }, { "epoch": 2.4312220089571337, "grad_norm": 0.5021511912345886, "learning_rate": 4.236597918799709e-06, "loss": 0.4105, "step": 475 }, { "epoch": 2.456813819577735, "grad_norm": 0.46041542291641235, "learning_rate": 3.8702478614051355e-06, "loss": 0.4033, "step": 480 }, { "epoch": 2.4824056301983366, "grad_norm": 0.4977727234363556, "learning_rate": 3.5191311859445796e-06, "loss": 0.4136, "step": 485 }, { "epoch": 2.507997440818938, "grad_norm": 0.4449191093444824, "learning_rate": 3.183501028109642e-06, "loss": 0.4026, "step": 490 }, { "epoch": 2.5335892514395395, "grad_norm": 0.4880577027797699, "learning_rate": 2.8635993586697553e-06, "loss": 0.4144, "step": 495 }, { "epoch": 2.5591810620601407, "grad_norm": 0.4731748402118683, "learning_rate": 2.5596568090246548e-06, "loss": 0.4156, "step": 500 }, { "epoch": 2.584772872680742, "grad_norm": 0.4792173206806183, "learning_rate": 2.271892504931905e-06, "loss": 0.4129, "step": 505 }, { "epoch": 2.6103646833013436, "grad_norm": 0.4491451680660248, "learning_rate": 2.0005139085293945e-06, "loss": 0.3978, "step": 510 }, { "epoch": 2.635956493921945, "grad_norm": 0.5214707851409912, "learning_rate": 1.7457166687665449e-06, "loss": 0.4162, "step": 515 }, { "epoch": 2.6615483045425465, "grad_norm": 0.44630053639411926, "learning_rate": 1.5076844803522922e-06, "loss": 0.4048, "step": 520 }, { "epoch": 2.6871401151631478, "grad_norm": 0.45181167125701904, "learning_rate": 1.286588951321363e-06, "loss": 0.4159, "step": 525 }, { "epoch": 2.712731925783749, "grad_norm": 0.4882276952266693, "learning_rate": 1.0825894793143721e-06, "loss": 0.4171, "step": 530 }, { "epoch": 2.7383237364043507, "grad_norm": 0.470037043094635, "learning_rate": 8.958331366609423e-07, "loss": 0.4049, "step": 535 }, { "epoch": 2.763915547024952, "grad_norm": 0.4617624580860138, "learning_rate": 7.264545643486997e-07, "loss": 0.4062, "step": 540 }, { "epoch": 2.7895073576455536, "grad_norm": 0.47536271810531616, "learning_rate": 5.745758749545749e-07, "loss": 0.4153, "step": 545 }, { "epoch": 2.815099168266155, "grad_norm": 0.4715920388698578, "learning_rate": 4.403065646083809e-07, "loss": 0.4208, "step": 550 }, { "epoch": 2.840690978886756, "grad_norm": 0.4840976297855377, "learning_rate": 3.237434340521789e-07, "loss": 0.4083, "step": 555 }, { "epoch": 2.8662827895073577, "grad_norm": 0.4537844657897949, "learning_rate": 2.2497051885228827e-07, "loss": 0.4085, "step": 560 }, { "epoch": 2.891874600127959, "grad_norm": 0.4958924651145935, "learning_rate": 1.4405902881430288e-07, "loss": 0.417, "step": 565 }, { "epoch": 2.9174664107485606, "grad_norm": 0.48199719190597534, "learning_rate": 8.106729664475176e-08, "loss": 0.415, "step": 570 }, { "epoch": 2.943058221369162, "grad_norm": 0.45770469307899475, "learning_rate": 3.604073589645596e-08, "loss": 0.4043, "step": 575 }, { "epoch": 2.968650031989763, "grad_norm": 0.5054221153259277, "learning_rate": 9.011808227865625e-09, "loss": 0.4034, "step": 580 }, { "epoch": 2.9942418426103647, "grad_norm": 0.4703996777534485, "learning_rate": 0.0, "loss": 0.4106, "step": 585 } ], "logging_steps": 5, "max_steps": 585, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5149343615798477e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }