|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9997049277072882, |
|
"eval_steps": 500, |
|
"global_step": 847, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.011802891708468575, |
|
"grad_norm": null,
|
"learning_rate": 4.11764705882353e-06, |
|
"loss": 2.4846, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02360578341693715, |
|
"grad_norm": 1.4208085536956787, |
|
"learning_rate": 9.411764705882354e-06, |
|
"loss": 2.1584, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03540867512540572, |
|
"grad_norm": 0.5537272095680237, |
|
"learning_rate": 1.5294117647058826e-05, |
|
"loss": 1.6811, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0472115668338743, |
|
"grad_norm": 0.7016017436981201, |
|
"learning_rate": 2.1176470588235296e-05, |
|
"loss": 1.4406, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05901445854234287, |
|
"grad_norm": 1.1232593059539795, |
|
"learning_rate": 2.7058823529411766e-05, |
|
"loss": 1.3102, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07081735025081144, |
|
"grad_norm": 2.9730782508850098, |
|
"learning_rate": 3.294117647058824e-05, |
|
"loss": 1.2155, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.08262024195928003, |
|
"grad_norm": 0.692528247833252, |
|
"learning_rate": 3.882352941176471e-05, |
|
"loss": 1.1449, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0944231336677486, |
|
"grad_norm": 1.3414469957351685, |
|
"learning_rate": 4.470588235294118e-05, |
|
"loss": 1.1908, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.10622602537621717, |
|
"grad_norm": 0.6828320622444153, |
|
"learning_rate": 4.999978752922572e-05, |
|
"loss": 1.1663, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.11802891708468574, |
|
"grad_norm": null,
|
"learning_rate": 4.998640308387074e-05, |
|
"loss": 1.1033, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1298318087931543, |
|
"grad_norm": 9.297845840454102, |
|
"learning_rate": 4.993119095936937e-05, |
|
"loss": 1.1636, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.14163470050162288, |
|
"grad_norm": 0.8314323425292969, |
|
"learning_rate": 4.983360758155341e-05, |
|
"loss": 1.1268, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.15343759221009148, |
|
"grad_norm": 0.6411977410316467, |
|
"learning_rate": 4.9693818796090927e-05, |
|
"loss": 1.1321, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.16524048391856005, |
|
"grad_norm": 0.5500069856643677, |
|
"learning_rate": 4.951206217791564e-05, |
|
"loss": 1.0142, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.17704337562702863, |
|
"grad_norm": 0.6950828433036804, |
|
"learning_rate": 4.9288646627461645e-05, |
|
"loss": 1.1114, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.1888462673354972, |
|
"grad_norm": 0.6412365436553955, |
|
"learning_rate": 4.902395184567859e-05, |
|
"loss": 1.0839, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.20064915904396577, |
|
"grad_norm": 0.6320505738258362, |
|
"learning_rate": 4.871842768871928e-05, |
|
"loss": 1.0705, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.21245205075243434, |
|
"grad_norm": 0.7705304622650146, |
|
"learning_rate": 4.837259340339665e-05, |
|
"loss": 1.0312, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2242549424609029, |
|
"grad_norm": 0.6497272849082947, |
|
"learning_rate": 4.7987036744709326e-05, |
|
"loss": 1.0162, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.23605783416937148, |
|
"grad_norm": 2.37109112739563, |
|
"learning_rate": 4.756241297693566e-05, |
|
"loss": 1.0297, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.24786072587784008, |
|
"grad_norm": 0.523530125617981, |
|
"learning_rate": 4.7099443759993837e-05, |
|
"loss": 1.0039, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.2596636175863086, |
|
"grad_norm": 0.8198627233505249, |
|
"learning_rate": 4.659891592296071e-05, |
|
"loss": 0.9934, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2714665092947772, |
|
"grad_norm": 0.7993373870849609, |
|
"learning_rate": 4.606168012683394e-05, |
|
"loss": 1.0378, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.28326940100324577, |
|
"grad_norm": 0.7412521243095398, |
|
"learning_rate": 4.548864941880988e-05, |
|
"loss": 1.007, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.29507229271171437, |
|
"grad_norm": 0.715391993522644, |
|
"learning_rate": 4.488079768053447e-05, |
|
"loss": 0.987, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.30687518442018297, |
|
"grad_norm": 1.2131056785583496, |
|
"learning_rate": 4.423915797296425e-05, |
|
"loss": 0.9728, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3186780761286515, |
|
"grad_norm": 0.7467273473739624, |
|
"learning_rate": 4.3564820780650496e-05, |
|
"loss": 0.9608, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.3304809678371201, |
|
"grad_norm": 0.8544695377349854, |
|
"learning_rate": 4.285893215843037e-05, |
|
"loss": 0.9591, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.34228385954558865, |
|
"grad_norm": 0.7751792669296265, |
|
"learning_rate": 4.2122691783674786e-05, |
|
"loss": 0.9466, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.35408675125405725, |
|
"grad_norm": 2.504579544067383, |
|
"learning_rate": 4.1357350917403314e-05, |
|
"loss": 0.9384, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3658896429625258, |
|
"grad_norm": 22.522701263427734, |
|
"learning_rate": 4.056421027773126e-05, |
|
"loss": 0.9517, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.3776925346709944, |
|
"grad_norm": 0.9187076091766357, |
|
"learning_rate": 3.974461782926299e-05, |
|
"loss": 0.9449, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.389495426379463, |
|
"grad_norm": 1.2791095972061157, |
|
"learning_rate": 3.889996649218852e-05, |
|
"loss": 0.9171, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.40129831808793154, |
|
"grad_norm": 0.7749762535095215, |
|
"learning_rate": 3.8031691774976904e-05, |
|
"loss": 0.9611, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.41310120979640014, |
|
"grad_norm": 0.917222797870636, |
|
"learning_rate": 3.714126933468959e-05, |
|
"loss": 0.9678, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4249041015048687, |
|
"grad_norm": 0.552499532699585, |
|
"learning_rate": 3.623021246906018e-05, |
|
"loss": 0.9134, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.4367069932133373, |
|
"grad_norm": 0.5541794896125793, |
|
"learning_rate": 3.530006954460274e-05, |
|
"loss": 0.928, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.4485098849218058, |
|
"grad_norm": 0.5857645869255066, |
|
"learning_rate": 3.435242136511984e-05, |
|
"loss": 0.8963, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4603127766302744, |
|
"grad_norm": 0.5237164497375488, |
|
"learning_rate": 3.338887848508242e-05, |
|
"loss": 0.8931, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.47211566833874297, |
|
"grad_norm": 0.49927598237991333, |
|
"learning_rate": 3.241107847244769e-05, |
|
"loss": 0.9113, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.48391856004721157, |
|
"grad_norm": 1.1223820447921753, |
|
"learning_rate": 3.14206831255667e-05, |
|
"loss": 0.9172, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.49572145175568016, |
|
"grad_norm": 0.7306515574455261, |
|
"learning_rate": 3.041937564891183e-05, |
|
"loss": 0.903, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5075243434641488, |
|
"grad_norm": 1.0228462219238281, |
|
"learning_rate": 2.940885779242387e-05, |
|
"loss": 0.8676, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5193272351726173, |
|
"grad_norm": 0.5814535021781921, |
|
"learning_rate": 2.8390846959340638e-05, |
|
"loss": 0.8709, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5311301268810859, |
|
"grad_norm": 0.7466275095939636, |
|
"learning_rate": 2.736707328742234e-05, |
|
"loss": 0.8659, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5429330185895545, |
|
"grad_norm": 0.7288652062416077, |
|
"learning_rate": 2.633927670853425e-05, |
|
"loss": 0.8683, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.554735910298023, |
|
"grad_norm": 0.5865510106086731, |
|
"learning_rate": 2.5309203991584073e-05, |
|
"loss": 0.871, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5665388020064915, |
|
"grad_norm": 0.5332498550415039, |
|
"learning_rate": 2.4278605773839548e-05, |
|
"loss": 0.8574, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5783416937149601, |
|
"grad_norm": 0.634554922580719, |
|
"learning_rate": 2.3249233585671636e-05, |
|
"loss": 0.9529, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5901445854234287, |
|
"grad_norm": 0.655619740486145, |
|
"learning_rate": 2.2222836873779888e-05, |
|
"loss": 0.8546, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6019474771318973, |
|
"grad_norm": 0.5780109167098999, |
|
"learning_rate": 2.1201160027959077e-05, |
|
"loss": 0.8452, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6137503688403659, |
|
"grad_norm": 0.49844804406166077, |
|
"learning_rate": 2.0185939416460133e-05, |
|
"loss": 0.876, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.6255532605488344, |
|
"grad_norm": 0.430317223072052, |
|
"learning_rate": 1.917890043498397e-05, |
|
"loss": 0.8406, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.637356152257303, |
|
"grad_norm": 0.4516479969024658, |
|
"learning_rate": 1.8181754574323446e-05, |
|
"loss": 0.8226, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.6491590439657716, |
|
"grad_norm": 0.5004287362098694, |
|
"learning_rate": 1.7196196511637084e-05, |
|
"loss": 0.9068, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6609619356742402, |
|
"grad_norm": 0.6133776307106018, |
|
"learning_rate": 1.6223901230298062e-05, |
|
"loss": 0.8361, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6727648273827088, |
|
"grad_norm": 1.207897663116455, |
|
"learning_rate": 1.5266521173213306e-05, |
|
"loss": 0.8515, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6845677190911773, |
|
"grad_norm": 0.5756068825721741, |
|
"learning_rate": 1.432568343445077e-05, |
|
"loss": 0.8553, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6963706107996459, |
|
"grad_norm": 0.5454249382019043, |
|
"learning_rate": 1.340298699394777e-05, |
|
"loss": 0.8731, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.7081735025081145, |
|
"grad_norm": 0.527680516242981, |
|
"learning_rate": 1.2500000000000006e-05, |
|
"loss": 0.8545, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7199763942165831, |
|
"grad_norm": 0.6664750576019287, |
|
"learning_rate": 1.1618257104149898e-05, |
|
"loss": 0.8259, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.7317792859250516, |
|
"grad_norm": 0.7168521881103516, |
|
"learning_rate": 1.0759256853003578e-05, |
|
"loss": 0.856, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.7435821776335202, |
|
"grad_norm": 0.4886401891708374, |
|
"learning_rate": 9.92445914140912e-06, |
|
"loss": 0.8448, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.7553850693419888, |
|
"grad_norm": 0.541182279586792, |
|
"learning_rate": 9.115282731324696e-06, |
|
"loss": 0.8142, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.7671879610504574, |
|
"grad_norm": 0.5412532091140747, |
|
"learning_rate": 8.333102840593015e-06, |
|
"loss": 0.8607, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.778990852758926, |
|
"grad_norm": 0.603594183921814, |
|
"learning_rate": 7.579248805720396e-06, |
|
"loss": 0.8283, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.7907937444673945, |
|
"grad_norm": 0.7654862403869629, |
|
"learning_rate": 6.855001822632278e-06, |
|
"loss": 0.8514, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.8025966361758631, |
|
"grad_norm": 0.5184491872787476, |
|
"learning_rate": 6.161592769245114e-06, |
|
"loss": 0.8125, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.8143995278843317, |
|
"grad_norm": 0.5151306986808777, |
|
"learning_rate": 5.500200113555071e-06, |
|
"loss": 0.8122, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.8262024195928003, |
|
"grad_norm": 0.5323730111122131, |
|
"learning_rate": 4.871947910798818e-06, |
|
"loss": 0.7771, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8380053113012688, |
|
"grad_norm": 0.6055124402046204, |
|
"learning_rate": 4.277903893090407e-06, |
|
"loss": 0.8132, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.8498082030097374, |
|
"grad_norm": 0.5266196727752686, |
|
"learning_rate": 3.7190776547807447e-06, |
|
"loss": 0.8095, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.861611094718206, |
|
"grad_norm": 0.5904248356819153, |
|
"learning_rate": 3.1964189366239377e-06, |
|
"loss": 0.8285, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.8734139864266746, |
|
"grad_norm": 0.6103527545928955, |
|
"learning_rate": 2.7108160116663893e-06, |
|
"loss": 0.8297, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.8852168781351432, |
|
"grad_norm": 0.5870690941810608, |
|
"learning_rate": 2.2630941756020512e-06, |
|
"loss": 0.8621, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8970197698436116, |
|
"grad_norm": 0.49599605798721313, |
|
"learning_rate": 1.8540143441593854e-06, |
|
"loss": 0.8191, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.9088226615520802, |
|
"grad_norm": 0.5189609527587891, |
|
"learning_rate": 1.4842717599039047e-06, |
|
"loss": 0.8048, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.9206255532605488, |
|
"grad_norm": 0.485893577337265, |
|
"learning_rate": 1.1544948106540775e-06, |
|
"loss": 0.782, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.9324284449690174, |
|
"grad_norm": 0.5677834749221802, |
|
"learning_rate": 8.652439615187163e-07, |
|
"loss": 0.8249, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.9442313366774859, |
|
"grad_norm": 0.5319679379463196, |
|
"learning_rate": 6.170108023709348e-07, |
|
"loss": 0.817, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9560342283859545, |
|
"grad_norm": 0.5445159673690796, |
|
"learning_rate": 4.1021721237745337e-07, |
|
"loss": 0.8029, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.9678371200944231, |
|
"grad_norm": 0.618567943572998, |
|
"learning_rate": 2.452146430032165e-07, |
|
"loss": 0.8493, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.9796400118028917, |
|
"grad_norm": 0.5149559378623962, |
|
"learning_rate": 1.2228352070983719e-07, |
|
"loss": 0.8134, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.9914429035113603, |
|
"grad_norm": 0.4901800751686096, |
|
"learning_rate": 4.1632770363012056e-08, |
|
"loss": 0.8336, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.9997049277072882, |
|
"step": 847, |
|
"total_flos": 1.1591373314404123e+19, |
|
"train_loss": 0.973909290228149, |
|
"train_runtime": 71632.8605, |
|
"train_samples_per_second": 0.378, |
|
"train_steps_per_second": 0.012 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 847, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1591373314404123e+19, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|