{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996954314720812, "eval_steps": 500, "global_step": 2214, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00676818950930626, "grad_norm": 7.503111624582946, "learning_rate": 4.504504504504504e-08, "loss": 0.7511, "step": 5 }, { "epoch": 0.01353637901861252, "grad_norm": 7.290232887177767, "learning_rate": 9.009009009009008e-08, "loss": 0.7468, "step": 10 }, { "epoch": 0.02030456852791878, "grad_norm": 6.95340550567438, "learning_rate": 1.3513513513513515e-07, "loss": 0.7048, "step": 15 }, { "epoch": 0.02707275803722504, "grad_norm": 6.910978359875767, "learning_rate": 1.8018018018018017e-07, "loss": 0.734, "step": 20 }, { "epoch": 0.0338409475465313, "grad_norm": 5.990453108110331, "learning_rate": 2.2522522522522522e-07, "loss": 0.7075, "step": 25 }, { "epoch": 0.04060913705583756, "grad_norm": 6.315135953523189, "learning_rate": 2.702702702702703e-07, "loss": 0.7034, "step": 30 }, { "epoch": 0.047377326565143825, "grad_norm": 4.938125726164199, "learning_rate": 3.153153153153153e-07, "loss": 0.6955, "step": 35 }, { "epoch": 0.05414551607445008, "grad_norm": 4.514212008344891, "learning_rate": 3.6036036036036033e-07, "loss": 0.6675, "step": 40 }, { "epoch": 0.06091370558375635, "grad_norm": 2.5175394937632234, "learning_rate": 4.054054054054054e-07, "loss": 0.6121, "step": 45 }, { "epoch": 0.0676818950930626, "grad_norm": 2.044111131735211, "learning_rate": 4.5045045045045043e-07, "loss": 0.5689, "step": 50 }, { "epoch": 0.07445008460236886, "grad_norm": 1.9607335603547595, "learning_rate": 4.954954954954955e-07, "loss": 0.5681, "step": 55 }, { "epoch": 0.08121827411167512, "grad_norm": 1.7055910206402423, "learning_rate": 5.405405405405406e-07, "loss": 0.5632, "step": 60 }, { "epoch": 0.08798646362098139, "grad_norm": 1.6356611418313087, "learning_rate": 5.855855855855856e-07, "loss": 0.5455, "step": 65 }, { "epoch": 0.09475465313028765, "grad_norm": 1.5898009813340985, "learning_rate": 6.306306306306306e-07, "loss": 0.5124, "step": 70 }, { "epoch": 0.10152284263959391, "grad_norm": 1.6241164657421379, "learning_rate": 6.756756756756756e-07, "loss": 0.5235, "step": 75 }, { "epoch": 0.10829103214890017, "grad_norm": 1.5281172683623996, "learning_rate": 7.207207207207207e-07, "loss": 0.5274, "step": 80 }, { "epoch": 0.11505922165820642, "grad_norm": 1.467101912720914, "learning_rate": 7.657657657657657e-07, "loss": 0.5152, "step": 85 }, { "epoch": 0.1218274111675127, "grad_norm": 1.4144943645830494, "learning_rate": 8.108108108108108e-07, "loss": 0.4903, "step": 90 }, { "epoch": 0.12859560067681894, "grad_norm": 1.5168733264951235, "learning_rate": 8.558558558558558e-07, "loss": 0.5235, "step": 95 }, { "epoch": 0.1353637901861252, "grad_norm": 1.4719778866974211, "learning_rate": 9.009009009009009e-07, "loss": 0.4955, "step": 100 }, { "epoch": 0.14213197969543148, "grad_norm": 1.5617750862395916, "learning_rate": 9.459459459459459e-07, "loss": 0.5023, "step": 105 }, { "epoch": 0.14890016920473773, "grad_norm": 1.608427652038671, "learning_rate": 9.90990990990991e-07, "loss": 0.5161, "step": 110 }, { "epoch": 0.155668358714044, "grad_norm": 1.470675404237294, "learning_rate": 1.0360360360360361e-06, "loss": 0.4629, "step": 115 }, { "epoch": 0.16243654822335024, "grad_norm": 1.5907906676415646, "learning_rate": 1.0810810810810812e-06, "loss": 0.5272, "step": 120 }, { "epoch": 0.1692047377326565, "grad_norm": 1.4139201116022069, "learning_rate": 1.1261261261261262e-06, "loss": 0.4828, "step": 125 }, { "epoch": 0.17597292724196278, "grad_norm": 1.572210590864309, "learning_rate": 1.1711711711711712e-06, "loss": 0.4932, "step": 130 }, { "epoch": 0.18274111675126903, "grad_norm": 1.8317582344702474, "learning_rate": 1.2162162162162162e-06, "loss": 0.4851, "step": 135 }, { "epoch": 0.1895093062605753, "grad_norm": 1.4834369904658524, "learning_rate": 1.2612612612612613e-06, "loss": 0.4823, "step": 140 }, { "epoch": 0.19627749576988154, "grad_norm": 1.487385439852692, "learning_rate": 1.3063063063063063e-06, "loss": 0.4835, "step": 145 }, { "epoch": 0.20304568527918782, "grad_norm": 1.6282121013160264, "learning_rate": 1.3513513513513513e-06, "loss": 0.4845, "step": 150 }, { "epoch": 0.2098138747884941, "grad_norm": 1.410710327011968, "learning_rate": 1.3963963963963963e-06, "loss": 0.4604, "step": 155 }, { "epoch": 0.21658206429780033, "grad_norm": 1.5959737364732594, "learning_rate": 1.4414414414414413e-06, "loss": 0.49, "step": 160 }, { "epoch": 0.2233502538071066, "grad_norm": 1.5229055220751309, "learning_rate": 1.4864864864864864e-06, "loss": 0.4907, "step": 165 }, { "epoch": 0.23011844331641285, "grad_norm": 1.5262092330780643, "learning_rate": 1.5315315315315314e-06, "loss": 0.4603, "step": 170 }, { "epoch": 0.23688663282571912, "grad_norm": 1.6342082189425051, "learning_rate": 1.5765765765765766e-06, "loss": 0.4872, "step": 175 }, { "epoch": 0.2436548223350254, "grad_norm": 1.4374951298863459, "learning_rate": 1.6216216216216216e-06, "loss": 0.4648, "step": 180 }, { "epoch": 0.25042301184433163, "grad_norm": 1.5280129895322156, "learning_rate": 1.6666666666666667e-06, "loss": 0.4467, "step": 185 }, { "epoch": 0.2571912013536379, "grad_norm": 1.5525834339213638, "learning_rate": 1.7117117117117117e-06, "loss": 0.4738, "step": 190 }, { "epoch": 0.2639593908629442, "grad_norm": 1.5093982724761799, "learning_rate": 1.7567567567567567e-06, "loss": 0.483, "step": 195 }, { "epoch": 0.2707275803722504, "grad_norm": 1.6721932829256383, "learning_rate": 1.8018018018018017e-06, "loss": 0.463, "step": 200 }, { "epoch": 0.27749576988155666, "grad_norm": 1.5493470193453733, "learning_rate": 1.8468468468468467e-06, "loss": 0.4432, "step": 205 }, { "epoch": 0.28426395939086296, "grad_norm": 1.5282256717827059, "learning_rate": 1.8918918918918918e-06, "loss": 0.4659, "step": 210 }, { "epoch": 0.2910321489001692, "grad_norm": 1.493666122191364, "learning_rate": 1.936936936936937e-06, "loss": 0.4789, "step": 215 }, { "epoch": 0.29780033840947545, "grad_norm": 1.7222146450605926, "learning_rate": 1.981981981981982e-06, "loss": 0.4636, "step": 220 }, { "epoch": 0.30456852791878175, "grad_norm": 1.6787579180811572, "learning_rate": 1.999988807353673e-06, "loss": 0.4697, "step": 225 }, { "epoch": 0.311336717428088, "grad_norm": 1.5374929160562005, "learning_rate": 1.999920408755684e-06, "loss": 0.4702, "step": 230 }, { "epoch": 0.31810490693739424, "grad_norm": 1.5486529898439687, "learning_rate": 1.9997898339445025e-06, "loss": 0.4545, "step": 235 }, { "epoch": 0.3248730964467005, "grad_norm": 1.5982741872566966, "learning_rate": 1.9995970910394226e-06, "loss": 0.4769, "step": 240 }, { "epoch": 0.3316412859560068, "grad_norm": 1.5356403333143016, "learning_rate": 1.999342192025422e-06, "loss": 0.456, "step": 245 }, { "epoch": 0.338409475465313, "grad_norm": 1.5906824266272626, "learning_rate": 1.9990251527524177e-06, "loss": 0.4803, "step": 250 }, { "epoch": 0.34517766497461927, "grad_norm": 1.5359520161986076, "learning_rate": 1.99864599293428e-06, "loss": 0.4475, "step": 255 }, { "epoch": 0.35194585448392557, "grad_norm": 1.5036681203646087, "learning_rate": 1.9982047361476074e-06, "loss": 0.464, "step": 260 }, { "epoch": 0.3587140439932318, "grad_norm": 1.461884922691461, "learning_rate": 1.9977014098302594e-06, "loss": 0.4593, "step": 265 }, { "epoch": 0.36548223350253806, "grad_norm": 1.434636405242703, "learning_rate": 1.997136045279652e-06, "loss": 0.4524, "step": 270 }, { "epoch": 0.37225042301184436, "grad_norm": 1.4806026186739891, "learning_rate": 1.996508677650809e-06, "loss": 0.455, "step": 275 }, { "epoch": 0.3790186125211506, "grad_norm": 1.5577278096965192, "learning_rate": 1.9958193459541803e-06, "loss": 0.4715, "step": 280 }, { "epoch": 0.38578680203045684, "grad_norm": 1.6997591630335047, "learning_rate": 1.9950680930532106e-06, "loss": 0.4613, "step": 285 }, { "epoch": 0.3925549915397631, "grad_norm": 1.750274534222129, "learning_rate": 1.9942549656616785e-06, "loss": 0.4654, "step": 290 }, { "epoch": 0.3993231810490694, "grad_norm": 1.8009950734484617, "learning_rate": 1.9933800143407914e-06, "loss": 0.449, "step": 295 }, { "epoch": 0.40609137055837563, "grad_norm": 1.5530961439855866, "learning_rate": 1.992443293496038e-06, "loss": 0.4458, "step": 300 }, { "epoch": 0.4128595600676819, "grad_norm": 1.5081709456372263, "learning_rate": 1.9914448613738106e-06, "loss": 0.4565, "step": 305 }, { "epoch": 0.4196277495769882, "grad_norm": 1.6772043840290556, "learning_rate": 1.9903847800577777e-06, "loss": 0.4804, "step": 310 }, { "epoch": 0.4263959390862944, "grad_norm": 1.727246671277696, "learning_rate": 1.9892631154650277e-06, "loss": 0.4576, "step": 315 }, { "epoch": 0.43316412859560066, "grad_norm": 1.5355208962246418, "learning_rate": 1.9880799373419697e-06, "loss": 0.4453, "step": 320 }, { "epoch": 0.43993231810490696, "grad_norm": 1.5063427073085303, "learning_rate": 1.986835319259994e-06, "loss": 0.4452, "step": 325 }, { "epoch": 0.4467005076142132, "grad_norm": 1.5195960228713277, "learning_rate": 1.985529338610899e-06, "loss": 0.465, "step": 330 }, { "epoch": 0.45346869712351945, "grad_norm": 1.5343721052947665, "learning_rate": 1.98416207660208e-06, "loss": 0.436, "step": 335 }, { "epoch": 0.4602368866328257, "grad_norm": 1.5094089297695503, "learning_rate": 1.982733618251478e-06, "loss": 0.4583, "step": 340 }, { "epoch": 0.467005076142132, "grad_norm": 1.3787923632280212, "learning_rate": 1.981244052382293e-06, "loss": 0.4512, "step": 345 }, { "epoch": 0.47377326565143824, "grad_norm": 1.5868715153799644, "learning_rate": 1.9796934716174616e-06, "loss": 0.427, "step": 350 }, { "epoch": 0.4805414551607445, "grad_norm": 1.5348278464191702, "learning_rate": 1.978081972373899e-06, "loss": 0.4593, "step": 355 }, { "epoch": 0.4873096446700508, "grad_norm": 1.587810620443399, "learning_rate": 1.976409654856501e-06, "loss": 0.4453, "step": 360 }, { "epoch": 0.494077834179357, "grad_norm": 1.6669439346571866, "learning_rate": 1.9746766230519137e-06, "loss": 0.4726, "step": 365 }, { "epoch": 0.5008460236886633, "grad_norm": 1.6972318618519455, "learning_rate": 1.9728829847220696e-06, "loss": 0.454, "step": 370 }, { "epoch": 0.5076142131979695, "grad_norm": 1.5739750068827298, "learning_rate": 1.9710288513974846e-06, "loss": 0.4592, "step": 375 }, { "epoch": 0.5143824027072758, "grad_norm": 1.5240777641912773, "learning_rate": 1.969114338370324e-06, "loss": 0.4297, "step": 380 }, { "epoch": 0.5211505922165821, "grad_norm": 1.6105894445618845, "learning_rate": 1.9671395646872323e-06, "loss": 0.447, "step": 385 }, { "epoch": 0.5279187817258884, "grad_norm": 1.7397039341357436, "learning_rate": 1.965104653141933e-06, "loss": 0.4516, "step": 390 }, { "epoch": 0.5346869712351946, "grad_norm": 1.667337409755205, "learning_rate": 1.9630097302675913e-06, "loss": 0.4497, "step": 395 }, { "epoch": 0.5414551607445008, "grad_norm": 1.472174322913246, "learning_rate": 1.9608549263289456e-06, "loss": 0.4396, "step": 400 }, { "epoch": 0.5482233502538071, "grad_norm": 1.390664891183549, "learning_rate": 1.95864037531421e-06, "loss": 0.4584, "step": 405 }, { "epoch": 0.5549915397631133, "grad_norm": 1.7716596272823615, "learning_rate": 1.9563662149267405e-06, "loss": 0.4417, "step": 410 }, { "epoch": 0.5617597292724196, "grad_norm": 1.7181597011497438, "learning_rate": 1.9540325865764725e-06, "loss": 0.4566, "step": 415 }, { "epoch": 0.5685279187817259, "grad_norm": 1.5659915803916853, "learning_rate": 1.951639635371129e-06, "loss": 0.4636, "step": 420 }, { "epoch": 0.5752961082910322, "grad_norm": 1.3420492760813085, "learning_rate": 1.9491875101071985e-06, "loss": 0.4392, "step": 425 }, { "epoch": 0.5820642978003384, "grad_norm": 1.7068688103479133, "learning_rate": 1.946676363260679e-06, "loss": 0.4442, "step": 430 }, { "epoch": 0.5888324873096447, "grad_norm": 1.5986442251841577, "learning_rate": 1.9441063509776e-06, "loss": 0.4461, "step": 435 }, { "epoch": 0.5956006768189509, "grad_norm": 1.3829526404505894, "learning_rate": 1.9414776330643123e-06, "loss": 0.4342, "step": 440 }, { "epoch": 0.6023688663282571, "grad_norm": 1.5703877223063394, "learning_rate": 1.9387903729775516e-06, "loss": 0.4508, "step": 445 }, { "epoch": 0.6091370558375635, "grad_norm": 1.6558038588752109, "learning_rate": 1.9360447378142724e-06, "loss": 0.4223, "step": 450 }, { "epoch": 0.6159052453468697, "grad_norm": 1.4868682565255178, "learning_rate": 1.9332408983012616e-06, "loss": 0.4452, "step": 455 }, { "epoch": 0.622673434856176, "grad_norm": 1.7326589310573917, "learning_rate": 1.930379028784518e-06, "loss": 0.4488, "step": 460 }, { "epoch": 0.6294416243654822, "grad_norm": 1.5873739274578285, "learning_rate": 1.9274593072184147e-06, "loss": 0.4605, "step": 465 }, { "epoch": 0.6362098138747885, "grad_norm": 1.334324454741449, "learning_rate": 1.924481915154632e-06, "loss": 0.4098, "step": 470 }, { "epoch": 0.6429780033840947, "grad_norm": 1.4711516821260495, "learning_rate": 1.9214470377308698e-06, "loss": 0.4512, "step": 475 }, { "epoch": 0.649746192893401, "grad_norm": 1.5422978596049304, "learning_rate": 1.918354863659332e-06, "loss": 0.4434, "step": 480 }, { "epoch": 0.6565143824027073, "grad_norm": 1.6734420314479854, "learning_rate": 1.915205585214998e-06, "loss": 0.4656, "step": 485 }, { "epoch": 0.6632825719120136, "grad_norm": 1.4869130554051895, "learning_rate": 1.9119993982236605e-06, "loss": 0.4658, "step": 490 }, { "epoch": 0.6700507614213198, "grad_norm": 1.4648439577812231, "learning_rate": 1.908736502049754e-06, "loss": 0.4485, "step": 495 }, { "epoch": 0.676818950930626, "grad_norm": 1.4924798254155862, "learning_rate": 1.9054170995839543e-06, "loss": 0.4434, "step": 500 }, { "epoch": 0.6835871404399323, "grad_norm": 1.5718324313516365, "learning_rate": 1.9020413972305652e-06, "loss": 0.4141, "step": 505 }, { "epoch": 0.6903553299492385, "grad_norm": 1.6927420013265664, "learning_rate": 1.8986096048946822e-06, "loss": 0.4188, "step": 510 }, { "epoch": 0.6971235194585449, "grad_norm": 1.9638689317654023, "learning_rate": 1.8951219359691416e-06, "loss": 0.4085, "step": 515 }, { "epoch": 0.7038917089678511, "grad_norm": 1.7008350118895288, "learning_rate": 1.8915786073212506e-06, "loss": 0.425, "step": 520 }, { "epoch": 0.7106598984771574, "grad_norm": 1.5729307770372762, "learning_rate": 1.887979839279303e-06, "loss": 0.4486, "step": 525 }, { "epoch": 0.7174280879864636, "grad_norm": 1.6239899543583527, "learning_rate": 1.8843258556188783e-06, "loss": 0.4314, "step": 530 }, { "epoch": 0.7241962774957699, "grad_norm": 1.5552777279734928, "learning_rate": 1.8806168835489277e-06, "loss": 0.426, "step": 535 }, { "epoch": 0.7309644670050761, "grad_norm": 1.4332159863043232, "learning_rate": 1.876853153697645e-06, "loss": 0.4297, "step": 540 }, { "epoch": 0.7377326565143824, "grad_norm": 1.3873749896369056, "learning_rate": 1.8730349000981267e-06, "loss": 0.445, "step": 545 }, { "epoch": 0.7445008460236887, "grad_norm": 1.4780965073836605, "learning_rate": 1.8691623601738197e-06, "loss": 0.458, "step": 550 }, { "epoch": 0.751269035532995, "grad_norm": 1.5268831705229313, "learning_rate": 1.8652357747237578e-06, "loss": 0.4354, "step": 555 }, { "epoch": 0.7580372250423012, "grad_norm": 1.777684181196839, "learning_rate": 1.8612553879075873e-06, "loss": 0.4521, "step": 560 }, { "epoch": 0.7648054145516074, "grad_norm": 1.5369674855501423, "learning_rate": 1.8572214472303868e-06, "loss": 0.4403, "step": 565 }, { "epoch": 0.7715736040609137, "grad_norm": 1.4081724058322669, "learning_rate": 1.8531342035272765e-06, "loss": 0.432, "step": 570 }, { "epoch": 0.7783417935702199, "grad_norm": 1.6328252561599401, "learning_rate": 1.8489939109478203e-06, "loss": 0.4447, "step": 575 }, { "epoch": 0.7851099830795262, "grad_norm": 1.4953859268513063, "learning_rate": 1.8448008269402226e-06, "loss": 0.4239, "step": 580 }, { "epoch": 0.7918781725888325, "grad_norm": 1.5304225811890608, "learning_rate": 1.840555212235321e-06, "loss": 0.4293, "step": 585 }, { "epoch": 0.7986463620981388, "grad_norm": 1.4486153048909152, "learning_rate": 1.8362573308303717e-06, "loss": 0.4304, "step": 590 }, { "epoch": 0.805414551607445, "grad_norm": 1.4602386588010192, "learning_rate": 1.831907449972636e-06, "loss": 0.4472, "step": 595 }, { "epoch": 0.8121827411167513, "grad_norm": 1.4965634086484065, "learning_rate": 1.8275058401427618e-06, "loss": 0.4409, "step": 600 }, { "epoch": 0.8189509306260575, "grad_norm": 1.5872590957255623, "learning_rate": 1.823052775037964e-06, "loss": 0.4522, "step": 605 }, { "epoch": 0.8257191201353637, "grad_norm": 1.4920706655623288, "learning_rate": 1.818548531555006e-06, "loss": 0.4289, "step": 610 }, { "epoch": 0.8324873096446701, "grad_norm": 1.4808293841541855, "learning_rate": 1.8139933897729832e-06, "loss": 0.4404, "step": 615 }, { "epoch": 0.8392554991539763, "grad_norm": 1.788578348932924, "learning_rate": 1.8093876329359058e-06, "loss": 0.4199, "step": 620 }, { "epoch": 0.8460236886632826, "grad_norm": 1.5479379288714532, "learning_rate": 1.8047315474350868e-06, "loss": 0.4199, "step": 625 }, { "epoch": 0.8527918781725888, "grad_norm": 1.4758005354779442, "learning_rate": 1.8000254227913344e-06, "loss": 0.4503, "step": 630 }, { "epoch": 0.8595600676818951, "grad_norm": 1.5709164106643387, "learning_rate": 1.7952695516369488e-06, "loss": 0.436, "step": 635 }, { "epoch": 0.8663282571912013, "grad_norm": 1.4869937326587306, "learning_rate": 1.7904642296975261e-06, "loss": 0.4416, "step": 640 }, { "epoch": 0.8730964467005076, "grad_norm": 1.5189251087228999, "learning_rate": 1.7856097557735694e-06, "loss": 0.4367, "step": 645 }, { "epoch": 0.8798646362098139, "grad_norm": 1.5390825637238734, "learning_rate": 1.7807064317219093e-06, "loss": 0.4385, "step": 650 }, { "epoch": 0.8866328257191202, "grad_norm": 1.500939175744208, "learning_rate": 1.7757545624369347e-06, "loss": 0.4112, "step": 655 }, { "epoch": 0.8934010152284264, "grad_norm": 1.5891603689470157, "learning_rate": 1.770754455831633e-06, "loss": 0.4496, "step": 660 }, { "epoch": 0.9001692047377327, "grad_norm": 1.6353444834776032, "learning_rate": 1.7657064228184444e-06, "loss": 0.4222, "step": 665 }, { "epoch": 0.9069373942470389, "grad_norm": 1.6181070946717884, "learning_rate": 1.7606107772899285e-06, "loss": 0.4296, "step": 670 }, { "epoch": 0.9137055837563451, "grad_norm": 1.633235008124872, "learning_rate": 1.7554678360992475e-06, "loss": 0.4308, "step": 675 }, { "epoch": 0.9204737732656514, "grad_norm": 1.4850942271908985, "learning_rate": 1.7502779190404611e-06, "loss": 0.425, "step": 680 }, { "epoch": 0.9272419627749577, "grad_norm": 1.553921180752237, "learning_rate": 1.745041348828645e-06, "loss": 0.4228, "step": 685 }, { "epoch": 0.934010152284264, "grad_norm": 1.5091342123918963, "learning_rate": 1.7397584510798206e-06, "loss": 0.4447, "step": 690 }, { "epoch": 0.9407783417935702, "grad_norm": 1.5470959471430534, "learning_rate": 1.7344295542907105e-06, "loss": 0.4515, "step": 695 }, { "epoch": 0.9475465313028765, "grad_norm": 1.584257899917925, "learning_rate": 1.7290549898183109e-06, "loss": 0.4276, "step": 700 }, { "epoch": 0.9543147208121827, "grad_norm": 1.5896926257834991, "learning_rate": 1.7236350918592866e-06, "loss": 0.4429, "step": 705 }, { "epoch": 0.961082910321489, "grad_norm": 1.4741862947637046, "learning_rate": 1.7181701974291928e-06, "loss": 0.417, "step": 710 }, { "epoch": 0.9678510998307953, "grad_norm": 1.5194416495298846, "learning_rate": 1.7126606463415164e-06, "loss": 0.4299, "step": 715 }, { "epoch": 0.9746192893401016, "grad_norm": 1.6092727303865384, "learning_rate": 1.7071067811865474e-06, "loss": 0.4351, "step": 720 }, { "epoch": 0.9813874788494078, "grad_norm": 1.4151798722540438, "learning_rate": 1.701508947310077e-06, "loss": 0.4194, "step": 725 }, { "epoch": 0.988155668358714, "grad_norm": 1.5504577940513988, "learning_rate": 1.695867492791921e-06, "loss": 0.4269, "step": 730 }, { "epoch": 0.9949238578680203, "grad_norm": 1.5420736691888262, "learning_rate": 1.690182768424279e-06, "loss": 0.4406, "step": 735 }, { "epoch": 1.0016920473773265, "grad_norm": 1.7092059209864483, "learning_rate": 1.6844551276899184e-06, "loss": 0.3988, "step": 740 }, { "epoch": 1.0084602368866329, "grad_norm": 1.5056006562387958, "learning_rate": 1.6786849267401978e-06, "loss": 0.3558, "step": 745 }, { "epoch": 1.015228426395939, "grad_norm": 1.767638909765125, "learning_rate": 1.6728725243729187e-06, "loss": 0.3396, "step": 750 }, { "epoch": 1.0219966159052454, "grad_norm": 1.6366144961909075, "learning_rate": 1.667018282010016e-06, "loss": 0.3238, "step": 755 }, { "epoch": 1.0287648054145515, "grad_norm": 1.5321955881585025, "learning_rate": 1.6611225636750836e-06, "loss": 0.3422, "step": 760 }, { "epoch": 1.0355329949238579, "grad_norm": 1.680806770640699, "learning_rate": 1.6551857359707405e-06, "loss": 0.3309, "step": 765 }, { "epoch": 1.0423011844331642, "grad_norm": 1.5471253871545012, "learning_rate": 1.649208168055833e-06, "loss": 0.324, "step": 770 }, { "epoch": 1.0490693739424704, "grad_norm": 1.4947975015447832, "learning_rate": 1.6431902316224818e-06, "loss": 0.3329, "step": 775 }, { "epoch": 1.0558375634517767, "grad_norm": 1.6086982480144758, "learning_rate": 1.6371323008729687e-06, "loss": 0.3196, "step": 780 }, { "epoch": 1.0626057529610828, "grad_norm": 1.580457843877164, "learning_rate": 1.6310347524964687e-06, "loss": 0.3262, "step": 785 }, { "epoch": 1.0693739424703892, "grad_norm": 1.6566966310786062, "learning_rate": 1.6248979656456272e-06, "loss": 0.3438, "step": 790 }, { "epoch": 1.0761421319796955, "grad_norm": 1.6737061189158637, "learning_rate": 1.6187223219129823e-06, "loss": 0.343, "step": 795 }, { "epoch": 1.0829103214890017, "grad_norm": 1.7888201683292473, "learning_rate": 1.6125082053072405e-06, "loss": 0.3381, "step": 800 }, { "epoch": 1.089678510998308, "grad_norm": 1.6140042434311925, "learning_rate": 1.6062560022293933e-06, "loss": 0.3299, "step": 805 }, { "epoch": 1.0964467005076142, "grad_norm": 1.5841270947078938, "learning_rate": 1.5999661014486955e-06, "loss": 0.3312, "step": 810 }, { "epoch": 1.1032148900169205, "grad_norm": 1.6193416591297873, "learning_rate": 1.5936388940784883e-06, "loss": 0.3523, "step": 815 }, { "epoch": 1.1099830795262267, "grad_norm": 1.550619017130258, "learning_rate": 1.5872747735518798e-06, "loss": 0.3228, "step": 820 }, { "epoch": 1.116751269035533, "grad_norm": 1.7508231230240185, "learning_rate": 1.5808741355972807e-06, "loss": 0.3324, "step": 825 }, { "epoch": 1.1235194585448394, "grad_norm": 1.6682501120844164, "learning_rate": 1.574437378213799e-06, "loss": 0.3239, "step": 830 }, { "epoch": 1.1302876480541455, "grad_norm": 1.8314883569950966, "learning_rate": 1.5679649016464895e-06, "loss": 0.3296, "step": 835 }, { "epoch": 1.1370558375634519, "grad_norm": 1.59630852333473, "learning_rate": 1.561457108361468e-06, "loss": 0.3289, "step": 840 }, { "epoch": 1.143824027072758, "grad_norm": 1.5574912791068813, "learning_rate": 1.5549144030208855e-06, "loss": 0.3346, "step": 845 }, { "epoch": 1.1505922165820643, "grad_norm": 1.6059497129644689, "learning_rate": 1.5483371924577634e-06, "loss": 0.3381, "step": 850 }, { "epoch": 1.1573604060913705, "grad_norm": 1.7201015765204164, "learning_rate": 1.5417258856506994e-06, "loss": 0.3271, "step": 855 }, { "epoch": 1.1641285956006768, "grad_norm": 1.6165299860217694, "learning_rate": 1.535080893698435e-06, "loss": 0.3312, "step": 860 }, { "epoch": 1.1708967851099832, "grad_norm": 1.6502865565791853, "learning_rate": 1.5284026297942926e-06, "loss": 0.3362, "step": 865 }, { "epoch": 1.1776649746192893, "grad_norm": 1.7002581521681002, "learning_rate": 1.5216915092004844e-06, "loss": 0.3215, "step": 870 }, { "epoch": 1.1844331641285957, "grad_norm": 1.7420409861182249, "learning_rate": 1.5149479492222886e-06, "loss": 0.3464, "step": 875 }, { "epoch": 1.1912013536379018, "grad_norm": 1.707048857027911, "learning_rate": 1.5081723691821026e-06, "loss": 0.3455, "step": 880 }, { "epoch": 1.1979695431472082, "grad_norm": 1.6420304279171187, "learning_rate": 1.5013651903933683e-06, "loss": 0.3332, "step": 885 }, { "epoch": 1.2047377326565143, "grad_norm": 1.8125800875620734, "learning_rate": 1.4945268361343746e-06, "loss": 0.3382, "step": 890 }, { "epoch": 1.2115059221658206, "grad_norm": 1.6640857688992343, "learning_rate": 1.4876577316219374e-06, "loss": 0.3369, "step": 895 }, { "epoch": 1.218274111675127, "grad_norm": 1.6451257316850152, "learning_rate": 1.4807583039849586e-06, "loss": 0.3539, "step": 900 }, { "epoch": 1.2250423011844331, "grad_norm": 1.6610764662131192, "learning_rate": 1.4738289822378683e-06, "loss": 0.3369, "step": 905 }, { "epoch": 1.2318104906937395, "grad_norm": 1.74944774821556, "learning_rate": 1.4668701972539456e-06, "loss": 0.3414, "step": 910 }, { "epoch": 1.2385786802030456, "grad_norm": 1.6889905704095276, "learning_rate": 1.4598823817385296e-06, "loss": 0.3462, "step": 915 }, { "epoch": 1.245346869712352, "grad_norm": 1.87748003800123, "learning_rate": 1.4528659702021106e-06, "loss": 0.347, "step": 920 }, { "epoch": 1.252115059221658, "grad_norm": 1.7676519334092846, "learning_rate": 1.4458213989333125e-06, "loss": 0.3344, "step": 925 }, { "epoch": 1.2588832487309645, "grad_norm": 1.8625421673744915, "learning_rate": 1.4387491059717651e-06, "loss": 0.3259, "step": 930 }, { "epoch": 1.2656514382402708, "grad_norm": 1.5243120020428504, "learning_rate": 1.431649531080864e-06, "loss": 0.3286, "step": 935 }, { "epoch": 1.272419627749577, "grad_norm": 1.773494740626271, "learning_rate": 1.424523115720428e-06, "loss": 0.3366, "step": 940 }, { "epoch": 1.2791878172588833, "grad_norm": 1.642323556539902, "learning_rate": 1.4173703030192466e-06, "loss": 0.3381, "step": 945 }, { "epoch": 1.2859560067681894, "grad_norm": 1.8298130052806405, "learning_rate": 1.4101915377475273e-06, "loss": 0.3472, "step": 950 }, { "epoch": 1.2927241962774958, "grad_norm": 1.5564171598002208, "learning_rate": 1.4029872662892382e-06, "loss": 0.3378, "step": 955 }, { "epoch": 1.299492385786802, "grad_norm": 1.7850954669361399, "learning_rate": 1.3957579366143519e-06, "loss": 0.3363, "step": 960 }, { "epoch": 1.3062605752961083, "grad_norm": 1.614939575319601, "learning_rate": 1.3885039982509905e-06, "loss": 0.3166, "step": 965 }, { "epoch": 1.3130287648054146, "grad_norm": 1.7656883518798847, "learning_rate": 1.3812259022574715e-06, "loss": 0.3426, "step": 970 }, { "epoch": 1.3197969543147208, "grad_norm": 1.4996842720105086, "learning_rate": 1.373924101194264e-06, "loss": 0.3377, "step": 975 }, { "epoch": 1.3265651438240271, "grad_norm": 1.834478494924892, "learning_rate": 1.3665990490958437e-06, "loss": 0.3408, "step": 980 }, { "epoch": 1.3333333333333333, "grad_norm": 1.687498482197505, "learning_rate": 1.3592512014424644e-06, "loss": 0.3341, "step": 985 }, { "epoch": 1.3401015228426396, "grad_norm": 1.4779395904473713, "learning_rate": 1.351881015131833e-06, "loss": 0.3319, "step": 990 }, { "epoch": 1.3468697123519457, "grad_norm": 1.5491258438326576, "learning_rate": 1.3444889484507009e-06, "loss": 0.3287, "step": 995 }, { "epoch": 1.353637901861252, "grad_norm": 1.753194944328746, "learning_rate": 1.3370754610463652e-06, "loss": 0.3264, "step": 1000 }, { "epoch": 1.3604060913705585, "grad_norm": 1.8046926820280387, "learning_rate": 1.32964101389809e-06, "loss": 0.3453, "step": 1005 }, { "epoch": 1.3671742808798646, "grad_norm": 1.5582819679996394, "learning_rate": 1.3221860692884395e-06, "loss": 0.3185, "step": 1010 }, { "epoch": 1.373942470389171, "grad_norm": 1.734615015555365, "learning_rate": 1.3147110907745336e-06, "loss": 0.3209, "step": 1015 }, { "epoch": 1.380710659898477, "grad_norm": 1.8370614645263001, "learning_rate": 1.3072165431592248e-06, "loss": 0.3389, "step": 1020 }, { "epoch": 1.3874788494077834, "grad_norm": 1.6416288844308489, "learning_rate": 1.2997028924621943e-06, "loss": 0.3465, "step": 1025 }, { "epoch": 1.3942470389170896, "grad_norm": 1.7141835707827855, "learning_rate": 1.2921706058909756e-06, "loss": 0.3379, "step": 1030 }, { "epoch": 1.401015228426396, "grad_norm": 1.7703336159956253, "learning_rate": 1.2846201518119017e-06, "loss": 0.3331, "step": 1035 }, { "epoch": 1.4077834179357023, "grad_norm": 1.7164709273217806, "learning_rate": 1.2770519997209835e-06, "loss": 0.3316, "step": 1040 }, { "epoch": 1.4145516074450084, "grad_norm": 1.698294459133158, "learning_rate": 1.2694666202147137e-06, "loss": 0.3407, "step": 1045 }, { "epoch": 1.4213197969543148, "grad_norm": 1.7231395084021628, "learning_rate": 1.2618644849608067e-06, "loss": 0.3383, "step": 1050 }, { "epoch": 1.4280879864636211, "grad_norm": 1.6225747755972384, "learning_rate": 1.2542460666688678e-06, "loss": 0.3272, "step": 1055 }, { "epoch": 1.4348561759729273, "grad_norm": 1.6273808164138512, "learning_rate": 1.246611839061002e-06, "loss": 0.3307, "step": 1060 }, { "epoch": 1.4416243654822334, "grad_norm": 1.640412382244569, "learning_rate": 1.2389622768423536e-06, "loss": 0.3326, "step": 1065 }, { "epoch": 1.4483925549915397, "grad_norm": 1.5742322851792212, "learning_rate": 1.231297855671593e-06, "loss": 0.311, "step": 1070 }, { "epoch": 1.455160744500846, "grad_norm": 1.6398609226586531, "learning_rate": 1.223619052131337e-06, "loss": 0.3417, "step": 1075 }, { "epoch": 1.4619289340101522, "grad_norm": 1.5919173149091699, "learning_rate": 1.2159263436985136e-06, "loss": 0.3311, "step": 1080 }, { "epoch": 1.4686971235194586, "grad_norm": 1.7663834242591079, "learning_rate": 1.2082202087146751e-06, "loss": 0.3404, "step": 1085 }, { "epoch": 1.475465313028765, "grad_norm": 1.6365555810498733, "learning_rate": 1.2005011263562513e-06, "loss": 0.3211, "step": 1090 }, { "epoch": 1.482233502538071, "grad_norm": 1.7543606709062083, "learning_rate": 1.1927695766047538e-06, "loss": 0.3345, "step": 1095 }, { "epoch": 1.4890016920473772, "grad_norm": 1.6454656998875175, "learning_rate": 1.185026040216934e-06, "loss": 0.329, "step": 1100 }, { "epoch": 1.4957698815566836, "grad_norm": 1.6242171627203073, "learning_rate": 1.1772709986948827e-06, "loss": 0.3274, "step": 1105 }, { "epoch": 1.50253807106599, "grad_norm": 2.0678978985333596, "learning_rate": 1.1695049342560967e-06, "loss": 0.3544, "step": 1110 }, { "epoch": 1.509306260575296, "grad_norm": 1.7262447342718426, "learning_rate": 1.161728329803488e-06, "loss": 0.341, "step": 1115 }, { "epoch": 1.5160744500846024, "grad_norm": 1.6907890988982508, "learning_rate": 1.153941668895361e-06, "loss": 0.3292, "step": 1120 }, { "epoch": 1.5228426395939088, "grad_norm": 1.6131818091865402, "learning_rate": 1.1461454357153406e-06, "loss": 0.3273, "step": 1125 }, { "epoch": 1.5296108291032149, "grad_norm": 1.970023298749538, "learning_rate": 1.1383401150422668e-06, "loss": 0.3389, "step": 1130 }, { "epoch": 1.536379018612521, "grad_norm": 1.7477654667475575, "learning_rate": 1.1305261922200517e-06, "loss": 0.336, "step": 1135 }, { "epoch": 1.5431472081218274, "grad_norm": 1.8260233194529998, "learning_rate": 1.1227041531274977e-06, "loss": 0.3394, "step": 1140 }, { "epoch": 1.5499153976311337, "grad_norm": 1.533061734694472, "learning_rate": 1.1148744841480873e-06, "loss": 0.3274, "step": 1145 }, { "epoch": 1.5566835871404399, "grad_norm": 1.769403392681689, "learning_rate": 1.1070376721397372e-06, "loss": 0.3438, "step": 1150 }, { "epoch": 1.5634517766497462, "grad_norm": 1.6263236224467823, "learning_rate": 1.0991942044045274e-06, "loss": 0.3437, "step": 1155 }, { "epoch": 1.5702199661590526, "grad_norm": 1.9050438393576472, "learning_rate": 1.0913445686583974e-06, "loss": 0.3208, "step": 1160 }, { "epoch": 1.5769881556683587, "grad_norm": 1.7107041476766611, "learning_rate": 1.0834892530008214e-06, "loss": 0.3192, "step": 1165 }, { "epoch": 1.5837563451776648, "grad_norm": 1.5694513216338701, "learning_rate": 1.0756287458844569e-06, "loss": 0.3339, "step": 1170 }, { "epoch": 1.5905245346869712, "grad_norm": 1.5469737030155013, "learning_rate": 1.0677635360847722e-06, "loss": 0.3323, "step": 1175 }, { "epoch": 1.5972927241962775, "grad_norm": 1.652088656809816, "learning_rate": 1.0598941126696543e-06, "loss": 0.3331, "step": 1180 }, { "epoch": 1.6040609137055837, "grad_norm": 1.696570545718917, "learning_rate": 1.0520209649689977e-06, "loss": 0.3258, "step": 1185 }, { "epoch": 1.61082910321489, "grad_norm": 1.759912674332406, "learning_rate": 1.0441445825442771e-06, "loss": 0.3379, "step": 1190 }, { "epoch": 1.6175972927241964, "grad_norm": 1.7089027285892213, "learning_rate": 1.0362654551581062e-06, "loss": 0.3449, "step": 1195 }, { "epoch": 1.6243654822335025, "grad_norm": 1.5565566000298192, "learning_rate": 1.0283840727437832e-06, "loss": 0.338, "step": 1200 }, { "epoch": 1.6311336717428087, "grad_norm": 1.6464360907618232, "learning_rate": 1.0205009253748272e-06, "loss": 0.3327, "step": 1205 }, { "epoch": 1.637901861252115, "grad_norm": 1.6450744022431256, "learning_rate": 1.0126165032345037e-06, "loss": 0.3411, "step": 1210 }, { "epoch": 1.6446700507614214, "grad_norm": 1.8547448481156448, "learning_rate": 1.0047312965853454e-06, "loss": 0.3383, "step": 1215 }, { "epoch": 1.6514382402707275, "grad_norm": 1.6495177484989427, "learning_rate": 9.968457957386662e-07, "loss": 0.3263, "step": 1220 }, { "epoch": 1.6582064297800339, "grad_norm": 1.6895491290871958, "learning_rate": 9.88960491024074e-07, "loss": 0.3325, "step": 1225 }, { "epoch": 1.6649746192893402, "grad_norm": 1.627812788457914, "learning_rate": 9.810758727589813e-07, "loss": 0.3291, "step": 1230 }, { "epoch": 1.6717428087986463, "grad_norm": 1.7996421239110534, "learning_rate": 9.731924312181148e-07, "loss": 0.3354, "step": 1235 }, { "epoch": 1.6785109983079525, "grad_norm": 1.8622024185022505, "learning_rate": 9.653106566030328e-07, "loss": 0.3459, "step": 1240 }, { "epoch": 1.6852791878172588, "grad_norm": 1.7964469673524814, "learning_rate": 9.574310390116418e-07, "loss": 0.3205, "step": 1245 }, { "epoch": 1.6920473773265652, "grad_norm": 1.6712003471107053, "learning_rate": 9.495540684077214e-07, "loss": 0.3368, "step": 1250 }, { "epoch": 1.6988155668358713, "grad_norm": 1.6956557822203489, "learning_rate": 9.41680234590459e-07, "loss": 0.3249, "step": 1255 }, { "epoch": 1.7055837563451777, "grad_norm": 1.694408958921992, "learning_rate": 9.338100271639931e-07, "loss": 0.3498, "step": 1260 }, { "epoch": 1.712351945854484, "grad_norm": 1.6701624264401975, "learning_rate": 9.25943935506969e-07, "loss": 0.3257, "step": 1265 }, { "epoch": 1.7191201353637902, "grad_norm": 1.7930456020095138, "learning_rate": 9.180824487421076e-07, "loss": 0.3261, "step": 1270 }, { "epoch": 1.7258883248730963, "grad_norm": 1.5441215575090625, "learning_rate": 9.102260557057935e-07, "loss": 0.336, "step": 1275 }, { "epoch": 1.7326565143824029, "grad_norm": 1.64455257154265, "learning_rate": 9.023752449176772e-07, "loss": 0.3269, "step": 1280 }, { "epoch": 1.739424703891709, "grad_norm": 1.5448960299784444, "learning_rate": 8.945305045502984e-07, "loss": 0.3288, "step": 1285 }, { "epoch": 1.7461928934010151, "grad_norm": 1.6868399015307785, "learning_rate": 8.866923223987302e-07, "loss": 0.3196, "step": 1290 }, { "epoch": 1.7529610829103215, "grad_norm": 1.6844238790169903, "learning_rate": 8.788611858502489e-07, "loss": 0.3524, "step": 1295 }, { "epoch": 1.7597292724196278, "grad_norm": 1.536238947791522, "learning_rate": 8.710375818540279e-07, "loss": 0.323, "step": 1300 }, { "epoch": 1.766497461928934, "grad_norm": 1.6594155852789063, "learning_rate": 8.632219968908555e-07, "loss": 0.3388, "step": 1305 }, { "epoch": 1.77326565143824, "grad_norm": 1.8357910067878633, "learning_rate": 8.554149169428892e-07, "loss": 0.319, "step": 1310 }, { "epoch": 1.7800338409475467, "grad_norm": 1.770060103182254, "learning_rate": 8.476168274634341e-07, "loss": 0.3533, "step": 1315 }, { "epoch": 1.7868020304568528, "grad_norm": 1.6650622145152638, "learning_rate": 8.398282133467578e-07, "loss": 0.3313, "step": 1320 }, { "epoch": 1.793570219966159, "grad_norm": 1.6934630286297077, "learning_rate": 8.320495588979377e-07, "loss": 0.3273, "step": 1325 }, { "epoch": 1.8003384094754653, "grad_norm": 1.6802792500913968, "learning_rate": 8.242813478027491e-07, "loss": 0.3425, "step": 1330 }, { "epoch": 1.8071065989847717, "grad_norm": 1.6829953200882048, "learning_rate": 8.165240630975861e-07, "loss": 0.351, "step": 1335 }, { "epoch": 1.8138747884940778, "grad_norm": 1.7159022109793864, "learning_rate": 8.087781871394279e-07, "loss": 0.3211, "step": 1340 }, { "epoch": 1.8206429780033841, "grad_norm": 1.6151668145042095, "learning_rate": 8.010442015758445e-07, "loss": 0.316, "step": 1345 }, { "epoch": 1.8274111675126905, "grad_norm": 1.6174069637779651, "learning_rate": 7.93322587315047e-07, "loss": 0.332, "step": 1350 }, { "epoch": 1.8341793570219966, "grad_norm": 1.6680877411690365, "learning_rate": 7.856138244959849e-07, "loss": 0.3243, "step": 1355 }, { "epoch": 1.8409475465313028, "grad_norm": 1.7974753943714166, "learning_rate": 7.7791839245849e-07, "loss": 0.3294, "step": 1360 }, { "epoch": 1.8477157360406091, "grad_norm": 1.646510776278098, "learning_rate": 7.702367697134701e-07, "loss": 0.3304, "step": 1365 }, { "epoch": 1.8544839255499155, "grad_norm": 1.7474734525141256, "learning_rate": 7.625694339131563e-07, "loss": 0.3588, "step": 1370 }, { "epoch": 1.8612521150592216, "grad_norm": 1.658323098173442, "learning_rate": 7.549168618213994e-07, "loss": 0.3362, "step": 1375 }, { "epoch": 1.868020304568528, "grad_norm": 1.67431452331962, "learning_rate": 7.472795292840269e-07, "loss": 0.3427, "step": 1380 }, { "epoch": 1.8747884940778343, "grad_norm": 1.6444897700367918, "learning_rate": 7.396579111992522e-07, "loss": 0.3552, "step": 1385 }, { "epoch": 1.8815566835871405, "grad_norm": 1.8283646757854843, "learning_rate": 7.32052481488147e-07, "loss": 0.3312, "step": 1390 }, { "epoch": 1.8883248730964466, "grad_norm": 1.755531433154494, "learning_rate": 7.244637130651693e-07, "loss": 0.3366, "step": 1395 }, { "epoch": 1.895093062605753, "grad_norm": 1.634727248259316, "learning_rate": 7.168920778087601e-07, "loss": 0.3323, "step": 1400 }, { "epoch": 1.9018612521150593, "grad_norm": 1.6187492591322967, "learning_rate": 7.093380465320008e-07, "loss": 0.345, "step": 1405 }, { "epoch": 1.9086294416243654, "grad_norm": 2.046018137628728, "learning_rate": 7.018020889533347e-07, "loss": 0.3316, "step": 1410 }, { "epoch": 1.9153976311336718, "grad_norm": 1.7585386765032196, "learning_rate": 6.942846736673633e-07, "loss": 0.3404, "step": 1415 }, { "epoch": 1.9221658206429781, "grad_norm": 1.600209455454873, "learning_rate": 6.867862681157066e-07, "loss": 0.3319, "step": 1420 }, { "epoch": 1.9289340101522843, "grad_norm": 1.6939356089504074, "learning_rate": 6.793073385579372e-07, "loss": 0.3353, "step": 1425 }, { "epoch": 1.9357021996615904, "grad_norm": 1.730441931386039, "learning_rate": 6.718483500425866e-07, "loss": 0.3448, "step": 1430 }, { "epoch": 1.9424703891708968, "grad_norm": 1.602246560839999, "learning_rate": 6.644097663782308e-07, "loss": 0.3207, "step": 1435 }, { "epoch": 1.9492385786802031, "grad_norm": 1.5255973835779064, "learning_rate": 6.569920501046473e-07, "loss": 0.3211, "step": 1440 }, { "epoch": 1.9560067681895092, "grad_norm": 1.787975436248423, "learning_rate": 6.495956624640558e-07, "loss": 0.3331, "step": 1445 }, { "epoch": 1.9627749576988156, "grad_norm": 1.635182796509772, "learning_rate": 6.422210633724359e-07, "loss": 0.3151, "step": 1450 }, { "epoch": 1.969543147208122, "grad_norm": 1.745570057757413, "learning_rate": 6.348687113909303e-07, "loss": 0.3166, "step": 1455 }, { "epoch": 1.976311336717428, "grad_norm": 1.666654337456338, "learning_rate": 6.275390636973315e-07, "loss": 0.3287, "step": 1460 }, { "epoch": 1.9830795262267342, "grad_norm": 1.7830502067885774, "learning_rate": 6.20232576057651e-07, "loss": 0.3374, "step": 1465 }, { "epoch": 1.9898477157360406, "grad_norm": 1.6063864832196357, "learning_rate": 6.129497027977828e-07, "loss": 0.3333, "step": 1470 }, { "epoch": 1.996615905245347, "grad_norm": 1.84518355863186, "learning_rate": 6.05690896775251e-07, "loss": 0.3338, "step": 1475 }, { "epoch": 2.003384094754653, "grad_norm": 1.996243516154583, "learning_rate": 5.984566093510508e-07, "loss": 0.3076, "step": 1480 }, { "epoch": 2.010152284263959, "grad_norm": 1.6441418715481781, "learning_rate": 5.91247290361582e-07, "loss": 0.2734, "step": 1485 }, { "epoch": 2.0169204737732658, "grad_norm": 1.7461011995129512, "learning_rate": 5.840633880906787e-07, "loss": 0.2483, "step": 1490 }, { "epoch": 2.023688663282572, "grad_norm": 1.780208702006572, "learning_rate": 5.769053492417341e-07, "loss": 0.2597, "step": 1495 }, { "epoch": 2.030456852791878, "grad_norm": 1.6957870173539042, "learning_rate": 5.69773618909923e-07, "loss": 0.2534, "step": 1500 }, { "epoch": 2.0372250423011846, "grad_norm": 1.7614796776092347, "learning_rate": 5.62668640554526e-07, "loss": 0.2684, "step": 1505 }, { "epoch": 2.0439932318104908, "grad_norm": 1.8361067674268434, "learning_rate": 5.55590855971356e-07, "loss": 0.2645, "step": 1510 }, { "epoch": 2.050761421319797, "grad_norm": 1.803442468077519, "learning_rate": 5.485407052652844e-07, "loss": 0.2637, "step": 1515 }, { "epoch": 2.057529610829103, "grad_norm": 1.6146644628968327, "learning_rate": 5.415186268228762e-07, "loss": 0.2657, "step": 1520 }, { "epoch": 2.0642978003384096, "grad_norm": 2.478325459542897, "learning_rate": 5.3452505728513e-07, "loss": 0.2528, "step": 1525 }, { "epoch": 2.0710659898477157, "grad_norm": 1.902805376038303, "learning_rate": 5.275604315203292e-07, "loss": 0.2625, "step": 1530 }, { "epoch": 2.077834179357022, "grad_norm": 1.707278374587984, "learning_rate": 5.206251825969973e-07, "loss": 0.2557, "step": 1535 }, { "epoch": 2.0846023688663284, "grad_norm": 1.7063185136076124, "learning_rate": 5.137197417569738e-07, "loss": 0.2397, "step": 1540 }, { "epoch": 2.0913705583756346, "grad_norm": 1.8210841183819555, "learning_rate": 5.068445383885961e-07, "loss": 0.2511, "step": 1545 }, { "epoch": 2.0981387478849407, "grad_norm": 1.8554223864028092, "learning_rate": 5.000000000000002e-07, "loss": 0.2553, "step": 1550 }, { "epoch": 2.104906937394247, "grad_norm": 1.7188069419320222, "learning_rate": 4.931865521925383e-07, "loss": 0.2454, "step": 1555 }, { "epoch": 2.1116751269035534, "grad_norm": 1.8040405864378724, "learning_rate": 4.864046186343139e-07, "loss": 0.2608, "step": 1560 }, { "epoch": 2.1184433164128595, "grad_norm": 1.9594461805438623, "learning_rate": 4.796546210338387e-07, "loss": 0.262, "step": 1565 }, { "epoch": 2.1252115059221657, "grad_norm": 1.6763587503898223, "learning_rate": 4.7293697911380846e-07, "loss": 0.2622, "step": 1570 }, { "epoch": 2.1319796954314723, "grad_norm": 1.966220531714013, "learning_rate": 4.662521105850055e-07, "loss": 0.2512, "step": 1575 }, { "epoch": 2.1387478849407784, "grad_norm": 1.755764907373717, "learning_rate": 4.596004311203242e-07, "loss": 0.249, "step": 1580 }, { "epoch": 2.1455160744500845, "grad_norm": 1.8223089125739613, "learning_rate": 4.5298235432892575e-07, "loss": 0.2465, "step": 1585 }, { "epoch": 2.152284263959391, "grad_norm": 1.7206718832666286, "learning_rate": 4.463982917305155e-07, "loss": 0.2458, "step": 1590 }, { "epoch": 2.1590524534686972, "grad_norm": 1.8397041856544176, "learning_rate": 4.398486527297595e-07, "loss": 0.2577, "step": 1595 }, { "epoch": 2.1658206429780034, "grad_norm": 1.8789732250278144, "learning_rate": 4.3333384459082247e-07, "loss": 0.2547, "step": 1600 }, { "epoch": 2.1725888324873095, "grad_norm": 1.9842491893922207, "learning_rate": 4.268542724120475e-07, "loss": 0.2407, "step": 1605 }, { "epoch": 2.179357021996616, "grad_norm": 1.6840880785917902, "learning_rate": 4.204103391007623e-07, "loss": 0.2453, "step": 1610 }, { "epoch": 2.186125211505922, "grad_norm": 1.7102283764854913, "learning_rate": 4.140024453482307e-07, "loss": 0.2531, "step": 1615 }, { "epoch": 2.1928934010152283, "grad_norm": 1.9705580867409258, "learning_rate": 4.076309896047336e-07, "loss": 0.239, "step": 1620 }, { "epoch": 2.199661590524535, "grad_norm": 1.793405155653999, "learning_rate": 4.012963680547946e-07, "loss": 0.2565, "step": 1625 }, { "epoch": 2.206429780033841, "grad_norm": 1.720247219871775, "learning_rate": 3.949989745925437e-07, "loss": 0.2675, "step": 1630 }, { "epoch": 2.213197969543147, "grad_norm": 1.6800247601578724, "learning_rate": 3.8873920079722644e-07, "loss": 0.2568, "step": 1635 }, { "epoch": 2.2199661590524533, "grad_norm": 1.8843811057218098, "learning_rate": 3.8251743590885256e-07, "loss": 0.2431, "step": 1640 }, { "epoch": 2.22673434856176, "grad_norm": 1.6769092970745525, "learning_rate": 3.7633406680399416e-07, "loss": 0.2513, "step": 1645 }, { "epoch": 2.233502538071066, "grad_norm": 1.9149891397189558, "learning_rate": 3.701894779717286e-07, "loss": 0.2441, "step": 1650 }, { "epoch": 2.240270727580372, "grad_norm": 1.8352949058879366, "learning_rate": 3.640840514897322e-07, "loss": 0.2512, "step": 1655 }, { "epoch": 2.2470389170896787, "grad_norm": 1.7191969563079663, "learning_rate": 3.580181670005182e-07, "loss": 0.2514, "step": 1660 }, { "epoch": 2.253807106598985, "grad_norm": 1.7862905463536183, "learning_rate": 3.519922016878356e-07, "loss": 0.2523, "step": 1665 }, { "epoch": 2.260575296108291, "grad_norm": 1.7759093954687228, "learning_rate": 3.460065302532108e-07, "loss": 0.2455, "step": 1670 }, { "epoch": 2.267343485617597, "grad_norm": 1.8251542509381542, "learning_rate": 3.400615248926506e-07, "loss": 0.2628, "step": 1675 }, { "epoch": 2.2741116751269037, "grad_norm": 1.8854432012171292, "learning_rate": 3.341575552734978e-07, "loss": 0.2496, "step": 1680 }, { "epoch": 2.28087986463621, "grad_norm": 1.7098687857977533, "learning_rate": 3.2829498851144577e-07, "loss": 0.2486, "step": 1685 }, { "epoch": 2.287648054145516, "grad_norm": 1.9407175434129038, "learning_rate": 3.224741891477095e-07, "loss": 0.254, "step": 1690 }, { "epoch": 2.2944162436548226, "grad_norm": 1.8048961105532955, "learning_rate": 3.166955191263587e-07, "loss": 0.2596, "step": 1695 }, { "epoch": 2.3011844331641287, "grad_norm": 1.9500522244148442, "learning_rate": 3.109593377718116e-07, "loss": 0.2674, "step": 1700 }, { "epoch": 2.307952622673435, "grad_norm": 1.7430933854279251, "learning_rate": 3.0526600176649153e-07, "loss": 0.2426, "step": 1705 }, { "epoch": 2.314720812182741, "grad_norm": 1.7044330031218278, "learning_rate": 2.9961586512864944e-07, "loss": 0.2545, "step": 1710 }, { "epoch": 2.3214890016920475, "grad_norm": 1.750375443426282, "learning_rate": 2.9400927919034726e-07, "loss": 0.2408, "step": 1715 }, { "epoch": 2.3282571912013537, "grad_norm": 1.8950517301756609, "learning_rate": 2.884465925756159e-07, "loss": 0.2489, "step": 1720 }, { "epoch": 2.33502538071066, "grad_norm": 1.8482803305522688, "learning_rate": 2.829281511787739e-07, "loss": 0.2625, "step": 1725 }, { "epoch": 2.3417935702199664, "grad_norm": 1.9547518786899865, "learning_rate": 2.774542981429214e-07, "loss": 0.2539, "step": 1730 }, { "epoch": 2.3485617597292725, "grad_norm": 1.681029190992801, "learning_rate": 2.7202537383860193e-07, "loss": 0.2569, "step": 1735 }, { "epoch": 2.3553299492385786, "grad_norm": 1.8386556962717924, "learning_rate": 2.6664171584263927e-07, "loss": 0.2738, "step": 1740 }, { "epoch": 2.3620981387478848, "grad_norm": 1.8580649687840483, "learning_rate": 2.613036589171443e-07, "loss": 0.2587, "step": 1745 }, { "epoch": 2.3688663282571913, "grad_norm": 1.8932490915132734, "learning_rate": 2.560115349887013e-07, "loss": 0.2597, "step": 1750 }, { "epoch": 2.3756345177664975, "grad_norm": 1.8291342664374226, "learning_rate": 2.5076567312772636e-07, "loss": 0.2514, "step": 1755 }, { "epoch": 2.3824027072758036, "grad_norm": 1.8170289718176122, "learning_rate": 2.4556639952800784e-07, "loss": 0.2508, "step": 1760 }, { "epoch": 2.38917089678511, "grad_norm": 1.770454255610108, "learning_rate": 2.4041403748642085e-07, "loss": 0.2607, "step": 1765 }, { "epoch": 2.3959390862944163, "grad_norm": 1.920241090922087, "learning_rate": 2.353089073828255e-07, "loss": 0.2497, "step": 1770 }, { "epoch": 2.4027072758037225, "grad_norm": 1.9199730244133009, "learning_rate": 2.302513266601449e-07, "loss": 0.2534, "step": 1775 }, { "epoch": 2.4094754653130286, "grad_norm": 1.770783110899942, "learning_rate": 2.2524160980462747e-07, "loss": 0.2577, "step": 1780 }, { "epoch": 2.416243654822335, "grad_norm": 1.779049283308491, "learning_rate": 2.2028006832628876e-07, "loss": 0.2648, "step": 1785 }, { "epoch": 2.4230118443316413, "grad_norm": 1.8951806130885847, "learning_rate": 2.1536701073954556e-07, "loss": 0.2552, "step": 1790 }, { "epoch": 2.4297800338409474, "grad_norm": 1.890797368455652, "learning_rate": 2.1050274254402812e-07, "loss": 0.2533, "step": 1795 }, { "epoch": 2.436548223350254, "grad_norm": 1.8879449556370194, "learning_rate": 2.0568756620558736e-07, "loss": 0.2621, "step": 1800 }, { "epoch": 2.44331641285956, "grad_norm": 1.9716199735140634, "learning_rate": 2.0092178113748348e-07, "loss": 0.251, "step": 1805 }, { "epoch": 2.4500846023688663, "grad_norm": 1.664484183169836, "learning_rate": 1.962056836817718e-07, "loss": 0.2451, "step": 1810 }, { "epoch": 2.4568527918781724, "grad_norm": 1.7827762385482862, "learning_rate": 1.9153956709087337e-07, "loss": 0.2561, "step": 1815 }, { "epoch": 2.463620981387479, "grad_norm": 1.776234828523817, "learning_rate": 1.8692372150934111e-07, "loss": 0.2396, "step": 1820 }, { "epoch": 2.470389170896785, "grad_norm": 1.745340346927932, "learning_rate": 1.8235843395581795e-07, "loss": 0.2494, "step": 1825 }, { "epoch": 2.4771573604060912, "grad_norm": 1.8045126818551005, "learning_rate": 1.7784398830519e-07, "loss": 0.2522, "step": 1830 }, { "epoch": 2.483925549915398, "grad_norm": 1.89151528655817, "learning_rate": 1.733806652709351e-07, "loss": 0.2528, "step": 1835 }, { "epoch": 2.490693739424704, "grad_norm": 1.7633235435458878, "learning_rate": 1.68968742387667e-07, "loss": 0.2518, "step": 1840 }, { "epoch": 2.49746192893401, "grad_norm": 1.7902731019901965, "learning_rate": 1.6460849399387845e-07, "loss": 0.2552, "step": 1845 }, { "epoch": 2.504230118443316, "grad_norm": 1.8529889195491502, "learning_rate": 1.6030019121488226e-07, "loss": 0.2588, "step": 1850 }, { "epoch": 2.510998307952623, "grad_norm": 1.7334848080126557, "learning_rate": 1.5604410194595264e-07, "loss": 0.2495, "step": 1855 }, { "epoch": 2.517766497461929, "grad_norm": 1.8562652968231852, "learning_rate": 1.5184049083566687e-07, "loss": 0.2468, "step": 1860 }, { "epoch": 2.524534686971235, "grad_norm": 1.7483934506844119, "learning_rate": 1.476896192694499e-07, "loss": 0.2527, "step": 1865 }, { "epoch": 2.5313028764805416, "grad_norm": 1.8012786807901255, "learning_rate": 1.4359174535331998e-07, "loss": 0.2495, "step": 1870 }, { "epoch": 2.5380710659898478, "grad_norm": 1.882303438003816, "learning_rate": 1.3954712389783996e-07, "loss": 0.2633, "step": 1875 }, { "epoch": 2.544839255499154, "grad_norm": 1.9004686167123348, "learning_rate": 1.3555600640227283e-07, "loss": 0.2482, "step": 1880 }, { "epoch": 2.55160744500846, "grad_norm": 1.9008963364397549, "learning_rate": 1.3161864103894361e-07, "loss": 0.2601, "step": 1885 }, { "epoch": 2.5583756345177666, "grad_norm": 1.7910128571435278, "learning_rate": 1.2773527263780626e-07, "loss": 0.2483, "step": 1890 }, { "epoch": 2.5651438240270727, "grad_norm": 1.7108145043264886, "learning_rate": 1.23906142671222e-07, "loss": 0.2468, "step": 1895 }, { "epoch": 2.571912013536379, "grad_norm": 1.8225256818804154, "learning_rate": 1.2013148923894212e-07, "loss": 0.2543, "step": 1900 }, { "epoch": 2.5786802030456855, "grad_norm": 1.79411507183842, "learning_rate": 1.1641154705330502e-07, "loss": 0.2409, "step": 1905 }, { "epoch": 2.5854483925549916, "grad_norm": 1.7242811103863596, "learning_rate": 1.127465474246384e-07, "loss": 0.2571, "step": 1910 }, { "epoch": 2.5922165820642977, "grad_norm": 1.9407815471081673, "learning_rate": 1.0913671824687953e-07, "loss": 0.251, "step": 1915 }, { "epoch": 2.598984771573604, "grad_norm": 1.6296473201174055, "learning_rate": 1.0558228398340186e-07, "loss": 0.2388, "step": 1920 }, { "epoch": 2.6057529610829104, "grad_norm": 1.6835802138496847, "learning_rate": 1.020834656530597e-07, "loss": 0.2427, "step": 1925 }, { "epoch": 2.6125211505922166, "grad_norm": 1.8446991621937052, "learning_rate": 9.86404808164426e-08, "loss": 0.24, "step": 1930 }, { "epoch": 2.6192893401015227, "grad_norm": 1.762353539890661, "learning_rate": 9.525354356235004e-08, "loss": 0.24, "step": 1935 }, { "epoch": 2.6260575296108293, "grad_norm": 1.9831610169369156, "learning_rate": 9.192286449447684e-08, "loss": 0.2451, "step": 1940 }, { "epoch": 2.6328257191201354, "grad_norm": 1.6135822284867207, "learning_rate": 8.864865071831829e-08, "loss": 0.2534, "step": 1945 }, { "epoch": 2.6395939086294415, "grad_norm": 1.6037304176671419, "learning_rate": 8.543110582829272e-08, "loss": 0.243, "step": 1950 }, { "epoch": 2.6463620981387477, "grad_norm": 1.7531920017607536, "learning_rate": 8.227042989508104e-08, "loss": 0.2482, "step": 1955 }, { "epoch": 2.6531302876480543, "grad_norm": 1.723522478352686, "learning_rate": 7.916681945318648e-08, "loss": 0.2477, "step": 1960 }, { "epoch": 2.6598984771573604, "grad_norm": 1.7614779368425475, "learning_rate": 7.612046748871326e-08, "loss": 0.253, "step": 1965 }, { "epoch": 2.6666666666666665, "grad_norm": 1.887871045430256, "learning_rate": 7.313156342736738e-08, "loss": 0.2508, "step": 1970 }, { "epoch": 2.673434856175973, "grad_norm": 1.9069055836407365, "learning_rate": 7.020029312267727e-08, "loss": 0.267, "step": 1975 }, { "epoch": 2.6802030456852792, "grad_norm": 1.8939133923811016, "learning_rate": 6.732683884443735e-08, "loss": 0.2692, "step": 1980 }, { "epoch": 2.6869712351945854, "grad_norm": 1.8130887567930416, "learning_rate": 6.451137926737415e-08, "loss": 0.2527, "step": 1985 }, { "epoch": 2.6937394247038915, "grad_norm": 2.101117018466412, "learning_rate": 6.175408946003702e-08, "loss": 0.2497, "step": 1990 }, { "epoch": 2.700507614213198, "grad_norm": 1.654995259770582, "learning_rate": 5.90551408739105e-08, "loss": 0.2306, "step": 1995 }, { "epoch": 2.707275803722504, "grad_norm": 1.83930610346023, "learning_rate": 5.641470133275472e-08, "loss": 0.2573, "step": 2000 }, { "epoch": 2.7140439932318103, "grad_norm": 1.810068940701123, "learning_rate": 5.3832935022169015e-08, "loss": 0.236, "step": 2005 }, { "epoch": 2.720812182741117, "grad_norm": 1.8885737097550355, "learning_rate": 5.1310002479383665e-08, "loss": 0.2543, "step": 2010 }, { "epoch": 2.727580372250423, "grad_norm": 1.722305340814142, "learning_rate": 4.884606058327612e-08, "loss": 0.2472, "step": 2015 }, { "epoch": 2.734348561759729, "grad_norm": 1.9391080536086678, "learning_rate": 4.644126254461755e-08, "loss": 0.259, "step": 2020 }, { "epoch": 2.7411167512690353, "grad_norm": 1.6499649417235132, "learning_rate": 4.409575789654474e-08, "loss": 0.2566, "step": 2025 }, { "epoch": 2.747884940778342, "grad_norm": 1.6917223527602334, "learning_rate": 4.180969248526334e-08, "loss": 0.2626, "step": 2030 }, { "epoch": 2.754653130287648, "grad_norm": 1.598065120848905, "learning_rate": 3.958320846097685e-08, "loss": 0.2428, "step": 2035 }, { "epoch": 2.761421319796954, "grad_norm": 1.9249388241762717, "learning_rate": 3.7416444269050326e-08, "loss": 0.2589, "step": 2040 }, { "epoch": 2.7681895093062607, "grad_norm": 1.7249000353236292, "learning_rate": 3.530953464139919e-08, "loss": 0.2381, "step": 2045 }, { "epoch": 2.774957698815567, "grad_norm": 1.8216343143692146, "learning_rate": 3.3262610588113305e-08, "loss": 0.2519, "step": 2050 }, { "epoch": 2.781725888324873, "grad_norm": 1.7835249600977168, "learning_rate": 3.127579938930891e-08, "loss": 0.2498, "step": 2055 }, { "epoch": 2.788494077834179, "grad_norm": 1.9602449026673896, "learning_rate": 2.934922458721578e-08, "loss": 0.2609, "step": 2060 }, { "epoch": 2.7952622673434857, "grad_norm": 1.7932782220063819, "learning_rate": 2.748300597849429e-08, "loss": 0.2463, "step": 2065 }, { "epoch": 2.802030456852792, "grad_norm": 1.8373837478135608, "learning_rate": 2.5677259606786682e-08, "loss": 0.2587, "step": 2070 }, { "epoch": 2.808798646362098, "grad_norm": 1.7579583567771166, "learning_rate": 2.393209775550087e-08, "loss": 0.2409, "step": 2075 }, { "epoch": 2.8155668358714045, "grad_norm": 1.7193835474755472, "learning_rate": 2.224762894082921e-08, "loss": 0.2558, "step": 2080 }, { "epoch": 2.8223350253807107, "grad_norm": 1.9046287715832801, "learning_rate": 2.06239579050006e-08, "loss": 0.2551, "step": 2085 }, { "epoch": 2.829103214890017, "grad_norm": 2.0268943381701585, "learning_rate": 1.9061185609766995e-08, "loss": 0.2412, "step": 2090 }, { "epoch": 2.835871404399323, "grad_norm": 1.9877508850932446, "learning_rate": 1.7559409230125997e-08, "loss": 0.2554, "step": 2095 }, { "epoch": 2.8426395939086295, "grad_norm": 1.8920868869430731, "learning_rate": 1.6118722148278584e-08, "loss": 0.2563, "step": 2100 }, { "epoch": 2.8494077834179357, "grad_norm": 2.075343149112532, "learning_rate": 1.4739213947821737e-08, "loss": 0.2524, "step": 2105 }, { "epoch": 2.8561759729272422, "grad_norm": 2.055837498560825, "learning_rate": 1.342097040817891e-08, "loss": 0.2502, "step": 2110 }, { "epoch": 2.8629441624365484, "grad_norm": 1.7560367998970217, "learning_rate": 1.2164073499265403e-08, "loss": 0.2619, "step": 2115 }, { "epoch": 2.8697123519458545, "grad_norm": 1.892609707767642, "learning_rate": 1.0968601376391995e-08, "loss": 0.2583, "step": 2120 }, { "epoch": 2.8764805414551606, "grad_norm": 1.990394034627918, "learning_rate": 9.834628375404718e-09, "loss": 0.2644, "step": 2125 }, { "epoch": 2.8832487309644668, "grad_norm": 1.828132568376944, "learning_rate": 8.762225008062673e-09, "loss": 0.2532, "step": 2130 }, { "epoch": 2.8900169204737733, "grad_norm": 1.912512170265383, "learning_rate": 7.75145795765375e-09, "loss": 0.2561, "step": 2135 }, { "epoch": 2.8967851099830795, "grad_norm": 1.8627242489328315, "learning_rate": 6.80239007484773e-09, "loss": 0.2524, "step": 2140 }, { "epoch": 2.903553299492386, "grad_norm": 1.532454933071209, "learning_rate": 5.915080373788961e-09, "loss": 0.2369, "step": 2145 }, { "epoch": 2.910321489001692, "grad_norm": 1.799514001407587, "learning_rate": 5.089584028425742e-09, "loss": 0.2561, "step": 2150 }, { "epoch": 2.9170896785109983, "grad_norm": 1.6517806725559891, "learning_rate": 4.325952369080288e-09, "loss": 0.2491, "step": 2155 }, { "epoch": 2.9238578680203045, "grad_norm": 1.8524564319779193, "learning_rate": 3.6242328792567278e-09, "loss": 0.2548, "step": 2160 }, { "epoch": 2.9306260575296106, "grad_norm": 1.7195279301746678, "learning_rate": 2.984469192688577e-09, "loss": 0.2588, "step": 2165 }, { "epoch": 2.937394247038917, "grad_norm": 1.9136832821313436, "learning_rate": 2.4067010906254628e-09, "loss": 0.2537, "step": 2170 }, { "epoch": 2.9441624365482233, "grad_norm": 2.0082222078514844, "learning_rate": 1.8909644993593267e-09, "loss": 0.2654, "step": 2175 }, { "epoch": 2.95093062605753, "grad_norm": 1.8775828520860633, "learning_rate": 1.4372914879909881e-09, "loss": 0.2486, "step": 2180 }, { "epoch": 2.957698815566836, "grad_norm": 1.8912270182535766, "learning_rate": 1.0457102664356288e-09, "loss": 0.2471, "step": 2185 }, { "epoch": 2.964467005076142, "grad_norm": 1.8579123732190646, "learning_rate": 7.162451836685291e-10, "loss": 0.2366, "step": 2190 }, { "epoch": 2.9712351945854483, "grad_norm": 1.7842037052182547, "learning_rate": 4.4891672621161226e-10, "loss": 0.2467, "step": 2195 }, { "epoch": 2.9780033840947544, "grad_norm": 1.875106833427518, "learning_rate": 2.4374151685913057e-10, "loss": 0.2626, "step": 2200 }, { "epoch": 2.984771573604061, "grad_norm": 1.7694776387255406, "learning_rate": 1.007323136438254e-10, "loss": 0.2515, "step": 2205 }, { "epoch": 2.991539763113367, "grad_norm": 1.8122490703088023, "learning_rate": 1.9898009044450048e-11, "loss": 0.2448, "step": 2210 }, { "epoch": 2.996954314720812, "step": 2214, "total_flos": 865276955983872.0, "train_loss": 0.3529874208604525, "train_runtime": 40097.4129, "train_samples_per_second": 7.073, "train_steps_per_second": 0.055 } ], "logging_steps": 5, "max_steps": 2214, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10086, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 865276955983872.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }