{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997049277072882, "eval_steps": 500, "global_step": 847, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011802891708468575, "grad_norm": null, "learning_rate": 4.11764705882353e-06, "loss": 2.4846, "step": 10 }, { "epoch": 0.02360578341693715, "grad_norm": 1.4208085536956787, "learning_rate": 9.411764705882354e-06, "loss": 2.1584, "step": 20 }, { "epoch": 0.03540867512540572, "grad_norm": 0.5537272095680237, "learning_rate": 1.5294117647058826e-05, "loss": 1.6811, "step": 30 }, { "epoch": 0.0472115668338743, "grad_norm": 0.7016017436981201, "learning_rate": 2.1176470588235296e-05, "loss": 1.4406, "step": 40 }, { "epoch": 0.05901445854234287, "grad_norm": 1.1232593059539795, "learning_rate": 2.7058823529411766e-05, "loss": 1.3102, "step": 50 }, { "epoch": 0.07081735025081144, "grad_norm": 2.9730782508850098, "learning_rate": 3.294117647058824e-05, "loss": 1.2155, "step": 60 }, { "epoch": 0.08262024195928003, "grad_norm": 0.692528247833252, "learning_rate": 3.882352941176471e-05, "loss": 1.1449, "step": 70 }, { "epoch": 0.0944231336677486, "grad_norm": 1.3414469957351685, "learning_rate": 4.470588235294118e-05, "loss": 1.1908, "step": 80 }, { "epoch": 0.10622602537621717, "grad_norm": 0.6828320622444153, "learning_rate": 4.999978752922572e-05, "loss": 1.1663, "step": 90 }, { "epoch": 0.11802891708468574, "grad_norm": null, "learning_rate": 4.998640308387074e-05, "loss": 1.1033, "step": 100 }, { "epoch": 0.1298318087931543, "grad_norm": 9.297845840454102, "learning_rate": 4.993119095936937e-05, "loss": 1.1636, "step": 110 }, { "epoch": 0.14163470050162288, "grad_norm": 0.8314323425292969, "learning_rate": 4.983360758155341e-05, "loss": 1.1268, "step": 120 }, { "epoch": 0.15343759221009148, "grad_norm": 0.6411977410316467, "learning_rate": 4.9693818796090927e-05, "loss": 1.1321, "step": 130 }, { "epoch": 0.16524048391856005, 
"grad_norm": 0.5500069856643677, "learning_rate": 4.951206217791564e-05, "loss": 1.0142, "step": 140 }, { "epoch": 0.17704337562702863, "grad_norm": 0.6950828433036804, "learning_rate": 4.9288646627461645e-05, "loss": 1.1114, "step": 150 }, { "epoch": 0.1888462673354972, "grad_norm": 0.6412365436553955, "learning_rate": 4.902395184567859e-05, "loss": 1.0839, "step": 160 }, { "epoch": 0.20064915904396577, "grad_norm": 0.6320505738258362, "learning_rate": 4.871842768871928e-05, "loss": 1.0705, "step": 170 }, { "epoch": 0.21245205075243434, "grad_norm": 0.7705304622650146, "learning_rate": 4.837259340339665e-05, "loss": 1.0312, "step": 180 }, { "epoch": 0.2242549424609029, "grad_norm": 0.6497272849082947, "learning_rate": 4.7987036744709326e-05, "loss": 1.0162, "step": 190 }, { "epoch": 0.23605783416937148, "grad_norm": 2.37109112739563, "learning_rate": 4.756241297693566e-05, "loss": 1.0297, "step": 200 }, { "epoch": 0.24786072587784008, "grad_norm": 0.523530125617981, "learning_rate": 4.7099443759993837e-05, "loss": 1.0039, "step": 210 }, { "epoch": 0.2596636175863086, "grad_norm": 0.8198627233505249, "learning_rate": 4.659891592296071e-05, "loss": 0.9934, "step": 220 }, { "epoch": 0.2714665092947772, "grad_norm": 0.7993373870849609, "learning_rate": 4.606168012683394e-05, "loss": 1.0378, "step": 230 }, { "epoch": 0.28326940100324577, "grad_norm": 0.7412521243095398, "learning_rate": 4.548864941880988e-05, "loss": 1.007, "step": 240 }, { "epoch": 0.29507229271171437, "grad_norm": 0.715391993522644, "learning_rate": 4.488079768053447e-05, "loss": 0.987, "step": 250 }, { "epoch": 0.30687518442018297, "grad_norm": 1.2131056785583496, "learning_rate": 4.423915797296425e-05, "loss": 0.9728, "step": 260 }, { "epoch": 0.3186780761286515, "grad_norm": 0.7467273473739624, "learning_rate": 4.3564820780650496e-05, "loss": 0.9608, "step": 270 }, { "epoch": 0.3304809678371201, "grad_norm": 0.8544695377349854, "learning_rate": 4.285893215843037e-05, "loss": 0.9591, "step": 280 }, 
{ "epoch": 0.34228385954558865, "grad_norm": 0.7751792669296265, "learning_rate": 4.2122691783674786e-05, "loss": 0.9466, "step": 290 }, { "epoch": 0.35408675125405725, "grad_norm": 2.504579544067383, "learning_rate": 4.1357350917403314e-05, "loss": 0.9384, "step": 300 }, { "epoch": 0.3658896429625258, "grad_norm": 22.522701263427734, "learning_rate": 4.056421027773126e-05, "loss": 0.9517, "step": 310 }, { "epoch": 0.3776925346709944, "grad_norm": 0.9187076091766357, "learning_rate": 3.974461782926299e-05, "loss": 0.9449, "step": 320 }, { "epoch": 0.389495426379463, "grad_norm": 1.2791095972061157, "learning_rate": 3.889996649218852e-05, "loss": 0.9171, "step": 330 }, { "epoch": 0.40129831808793154, "grad_norm": 0.7749762535095215, "learning_rate": 3.8031691774976904e-05, "loss": 0.9611, "step": 340 }, { "epoch": 0.41310120979640014, "grad_norm": 0.917222797870636, "learning_rate": 3.714126933468959e-05, "loss": 0.9678, "step": 350 }, { "epoch": 0.4249041015048687, "grad_norm": 0.552499532699585, "learning_rate": 3.623021246906018e-05, "loss": 0.9134, "step": 360 }, { "epoch": 0.4367069932133373, "grad_norm": 0.5541794896125793, "learning_rate": 3.530006954460274e-05, "loss": 0.928, "step": 370 }, { "epoch": 0.4485098849218058, "grad_norm": 0.5857645869255066, "learning_rate": 3.435242136511984e-05, "loss": 0.8963, "step": 380 }, { "epoch": 0.4603127766302744, "grad_norm": 0.5237164497375488, "learning_rate": 3.338887848508242e-05, "loss": 0.8931, "step": 390 }, { "epoch": 0.47211566833874297, "grad_norm": 0.49927598237991333, "learning_rate": 3.241107847244769e-05, "loss": 0.9113, "step": 400 }, { "epoch": 0.48391856004721157, "grad_norm": 1.1223820447921753, "learning_rate": 3.14206831255667e-05, "loss": 0.9172, "step": 410 }, { "epoch": 0.49572145175568016, "grad_norm": 0.7306515574455261, "learning_rate": 3.041937564891183e-05, "loss": 0.903, "step": 420 }, { "epoch": 0.5075243434641488, "grad_norm": 1.0228462219238281, "learning_rate": 2.940885779242387e-05, 
"loss": 0.8676, "step": 430 }, { "epoch": 0.5193272351726173, "grad_norm": 0.5814535021781921, "learning_rate": 2.8390846959340638e-05, "loss": 0.8709, "step": 440 }, { "epoch": 0.5311301268810859, "grad_norm": 0.7466275095939636, "learning_rate": 2.736707328742234e-05, "loss": 0.8659, "step": 450 }, { "epoch": 0.5429330185895545, "grad_norm": 0.7288652062416077, "learning_rate": 2.633927670853425e-05, "loss": 0.8683, "step": 460 }, { "epoch": 0.554735910298023, "grad_norm": 0.5865510106086731, "learning_rate": 2.5309203991584073e-05, "loss": 0.871, "step": 470 }, { "epoch": 0.5665388020064915, "grad_norm": 0.5332498550415039, "learning_rate": 2.4278605773839548e-05, "loss": 0.8574, "step": 480 }, { "epoch": 0.5783416937149601, "grad_norm": 0.634554922580719, "learning_rate": 2.3249233585671636e-05, "loss": 0.9529, "step": 490 }, { "epoch": 0.5901445854234287, "grad_norm": 0.655619740486145, "learning_rate": 2.2222836873779888e-05, "loss": 0.8546, "step": 500 }, { "epoch": 0.6019474771318973, "grad_norm": 0.5780109167098999, "learning_rate": 2.1201160027959077e-05, "loss": 0.8452, "step": 510 }, { "epoch": 0.6137503688403659, "grad_norm": 0.49844804406166077, "learning_rate": 2.0185939416460133e-05, "loss": 0.876, "step": 520 }, { "epoch": 0.6255532605488344, "grad_norm": 0.430317223072052, "learning_rate": 1.917890043498397e-05, "loss": 0.8406, "step": 530 }, { "epoch": 0.637356152257303, "grad_norm": 0.4516479969024658, "learning_rate": 1.8181754574323446e-05, "loss": 0.8226, "step": 540 }, { "epoch": 0.6491590439657716, "grad_norm": 0.5004287362098694, "learning_rate": 1.7196196511637084e-05, "loss": 0.9068, "step": 550 }, { "epoch": 0.6609619356742402, "grad_norm": 0.6133776307106018, "learning_rate": 1.6223901230298062e-05, "loss": 0.8361, "step": 560 }, { "epoch": 0.6727648273827088, "grad_norm": 1.207897663116455, "learning_rate": 1.5266521173213306e-05, "loss": 0.8515, "step": 570 }, { "epoch": 0.6845677190911773, "grad_norm": 0.5756068825721741, 
"learning_rate": 1.432568343445077e-05, "loss": 0.8553, "step": 580 }, { "epoch": 0.6963706107996459, "grad_norm": 0.5454249382019043, "learning_rate": 1.340298699394777e-05, "loss": 0.8731, "step": 590 }, { "epoch": 0.7081735025081145, "grad_norm": 0.527680516242981, "learning_rate": 1.2500000000000006e-05, "loss": 0.8545, "step": 600 }, { "epoch": 0.7199763942165831, "grad_norm": 0.6664750576019287, "learning_rate": 1.1618257104149898e-05, "loss": 0.8259, "step": 610 }, { "epoch": 0.7317792859250516, "grad_norm": 0.7168521881103516, "learning_rate": 1.0759256853003578e-05, "loss": 0.856, "step": 620 }, { "epoch": 0.7435821776335202, "grad_norm": 0.4886401891708374, "learning_rate": 9.92445914140912e-06, "loss": 0.8448, "step": 630 }, { "epoch": 0.7553850693419888, "grad_norm": 0.541182279586792, "learning_rate": 9.115282731324696e-06, "loss": 0.8142, "step": 640 }, { "epoch": 0.7671879610504574, "grad_norm": 0.5412532091140747, "learning_rate": 8.333102840593015e-06, "loss": 0.8607, "step": 650 }, { "epoch": 0.778990852758926, "grad_norm": 0.603594183921814, "learning_rate": 7.579248805720396e-06, "loss": 0.8283, "step": 660 }, { "epoch": 0.7907937444673945, "grad_norm": 0.7654862403869629, "learning_rate": 6.855001822632278e-06, "loss": 0.8514, "step": 670 }, { "epoch": 0.8025966361758631, "grad_norm": 0.5184491872787476, "learning_rate": 6.161592769245114e-06, "loss": 0.8125, "step": 680 }, { "epoch": 0.8143995278843317, "grad_norm": 0.5151306986808777, "learning_rate": 5.500200113555071e-06, "loss": 0.8122, "step": 690 }, { "epoch": 0.8262024195928003, "grad_norm": 0.5323730111122131, "learning_rate": 4.871947910798818e-06, "loss": 0.7771, "step": 700 }, { "epoch": 0.8380053113012688, "grad_norm": 0.6055124402046204, "learning_rate": 4.277903893090407e-06, "loss": 0.8132, "step": 710 }, { "epoch": 0.8498082030097374, "grad_norm": 0.5266196727752686, "learning_rate": 3.7190776547807447e-06, "loss": 0.8095, "step": 720 }, { "epoch": 0.861611094718206, 
"grad_norm": 0.5904248356819153, "learning_rate": 3.1964189366239377e-06, "loss": 0.8285, "step": 730 }, { "epoch": 0.8734139864266746, "grad_norm": 0.6103527545928955, "learning_rate": 2.7108160116663893e-06, "loss": 0.8297, "step": 740 }, { "epoch": 0.8852168781351432, "grad_norm": 0.5870690941810608, "learning_rate": 2.2630941756020512e-06, "loss": 0.8621, "step": 750 }, { "epoch": 0.8970197698436116, "grad_norm": 0.49599605798721313, "learning_rate": 1.8540143441593854e-06, "loss": 0.8191, "step": 760 }, { "epoch": 0.9088226615520802, "grad_norm": 0.5189609527587891, "learning_rate": 1.4842717599039047e-06, "loss": 0.8048, "step": 770 }, { "epoch": 0.9206255532605488, "grad_norm": 0.485893577337265, "learning_rate": 1.1544948106540775e-06, "loss": 0.782, "step": 780 }, { "epoch": 0.9324284449690174, "grad_norm": 0.5677834749221802, "learning_rate": 8.652439615187163e-07, "loss": 0.8249, "step": 790 }, { "epoch": 0.9442313366774859, "grad_norm": 0.5319679379463196, "learning_rate": 6.170108023709348e-07, "loss": 0.817, "step": 800 }, { "epoch": 0.9560342283859545, "grad_norm": 0.5445159673690796, "learning_rate": 4.1021721237745337e-07, "loss": 0.8029, "step": 810 }, { "epoch": 0.9678371200944231, "grad_norm": 0.618567943572998, "learning_rate": 2.452146430032165e-07, "loss": 0.8493, "step": 820 }, { "epoch": 0.9796400118028917, "grad_norm": 0.5149559378623962, "learning_rate": 1.2228352070983719e-07, "loss": 0.8134, "step": 830 }, { "epoch": 0.9914429035113603, "grad_norm": 0.4901800751686096, "learning_rate": 4.1632770363012056e-08, "loss": 0.8336, "step": 840 }, { "epoch": 0.9997049277072882, "step": 847, "total_flos": 1.1591373314404123e+19, "train_loss": 0.973909290228149, "train_runtime": 71632.8605, "train_samples_per_second": 0.378, "train_steps_per_second": 0.012 } ], "logging_steps": 10, "max_steps": 847, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": 
false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1591373314404123e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }