{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03176872369152569, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003176872369152569, "grad_norm": 0.6423737406730652, "learning_rate": 0.0002, "loss": 1.894, "step": 10 }, { "epoch": 0.0006353744738305139, "grad_norm": 0.43151596188545227, "learning_rate": 0.00019994965423831854, "loss": 1.4999, "step": 20 }, { "epoch": 0.0009530617107457708, "grad_norm": 0.5214523673057556, "learning_rate": 0.00019979866764718843, "loss": 1.4241, "step": 30 }, { "epoch": 0.0012707489476610277, "grad_norm": 0.6065482497215271, "learning_rate": 0.00019954719225730847, "loss": 1.4224, "step": 40 }, { "epoch": 0.0015884361845762847, "grad_norm": 0.5870187878608704, "learning_rate": 0.00019919548128307954, "loss": 1.3358, "step": 50 }, { "epoch": 0.0019061234214915416, "grad_norm": 0.5518338680267334, "learning_rate": 0.00019874388886763944, "loss": 1.4493, "step": 60 }, { "epoch": 0.0022238106584067984, "grad_norm": 0.5498958230018616, "learning_rate": 0.00019819286972627066, "loss": 1.3476, "step": 70 }, { "epoch": 0.0025414978953220554, "grad_norm": 0.4921424388885498, "learning_rate": 0.00019754297868854073, "loss": 1.3261, "step": 80 }, { "epoch": 0.0028591851322373124, "grad_norm": 0.4568871855735779, "learning_rate": 0.00019679487013963564, "loss": 1.3331, "step": 90 }, { "epoch": 0.0031768723691525695, "grad_norm": 0.4740012288093567, "learning_rate": 0.00019594929736144976, "loss": 1.3137, "step": 100 }, { "epoch": 0.003494559606067826, "grad_norm": 0.3695839047431946, "learning_rate": 0.00019500711177409454, "loss": 1.3602, "step": 110 }, { "epoch": 0.003812246842983083, "grad_norm": 0.4483315050601959, "learning_rate": 0.00019396926207859084, "loss": 1.3655, "step": 120 }, { "epoch": 0.00412993407989834, "grad_norm": 0.2625809609889984, "learning_rate": 0.00019283679330160726, "loss": 1.342, "step": 130 }, { "epoch": 0.004447621316813597, "grad_norm": 0.3869137465953827, "learning_rate": 0.00019161084574320696, "loss": 1.3862, "step": 140 }, { "epoch": 0.004765308553728854, "grad_norm": 0.3571557402610779, "learning_rate": 0.00019029265382866214, "loss": 1.3556, "step": 150 }, { "epoch": 0.005082995790644111, "grad_norm": 0.3197844624519348, "learning_rate": 0.00018888354486549237, "loss": 1.3683, "step": 160 }, { "epoch": 0.005400683027559368, "grad_norm": 0.3757419288158417, "learning_rate": 0.00018738493770697852, "loss": 1.3256, "step": 170 }, { "epoch": 0.005718370264474625, "grad_norm": 0.4688069820404053, "learning_rate": 0.00018579834132349772, "loss": 1.2979, "step": 180 }, { "epoch": 0.006036057501389882, "grad_norm": 0.41569846868515015, "learning_rate": 0.00018412535328311814, "loss": 1.3002, "step": 190 }, { "epoch": 0.006353744738305139, "grad_norm": 0.30035635828971863, "learning_rate": 0.0001823676581429833, "loss": 1.3526, "step": 200 }, { "epoch": 0.006671431975220395, "grad_norm": 0.48865431547164917, "learning_rate": 0.00018052702575310588, "loss": 1.3166, "step": 210 }, { "epoch": 0.006989119212135652, "grad_norm": 0.35794293880462646, "learning_rate": 0.00017860530947427875, "loss": 1.2699, "step": 220 }, { "epoch": 0.007306806449050909, "grad_norm": 0.34798431396484375, "learning_rate": 0.0001766044443118978, "loss": 1.3589, "step": 230 }, { "epoch": 0.007624493685966166, "grad_norm": 0.411737859249115, "learning_rate": 0.0001745264449675755, "loss": 1.2895, "step": 240 }, { "epoch": 0.007942180922881422, "grad_norm": 0.4812396764755249, "learning_rate": 0.00017237340381050703, "loss": 1.3143, "step": 250 }, { "epoch": 0.00825986815979668, "grad_norm": 0.4005419611930847, "learning_rate": 0.00017014748877063214, "loss": 1.3313, "step": 260 }, { "epoch": 0.008577555396711936, "grad_norm": 0.5115179419517517, "learning_rate": 0.00016785094115571322, "loss": 1.304, "step": 270 }, { "epoch": 0.008895242633627194, "grad_norm": 0.4151971936225891, "learning_rate": 0.00016548607339452853, "loss": 1.3317, "step": 280 }, { "epoch": 0.00921292987054245, "grad_norm": 0.3900114893913269, "learning_rate": 0.00016305526670845226, "loss": 1.295, "step": 290 }, { "epoch": 0.009530617107457708, "grad_norm": 0.39282795786857605, "learning_rate": 0.00016056096871376667, "loss": 1.2736, "step": 300 }, { "epoch": 0.009848304344372965, "grad_norm": 0.38796964287757874, "learning_rate": 0.00015800569095711982, "loss": 1.4088, "step": 310 }, { "epoch": 0.010165991581288222, "grad_norm": 0.503355860710144, "learning_rate": 0.00015539200638661104, "loss": 1.2614, "step": 320 }, { "epoch": 0.010483678818203479, "grad_norm": 0.2735162079334259, "learning_rate": 0.00015272254676105025, "loss": 1.3718, "step": 330 }, { "epoch": 0.010801366055118736, "grad_norm": 0.4685092866420746, "learning_rate": 0.00015000000000000001, "loss": 1.3372, "step": 340 }, { "epoch": 0.011119053292033993, "grad_norm": 0.39124029874801636, "learning_rate": 0.0001472271074772683, "loss": 1.3087, "step": 350 }, { "epoch": 0.01143674052894925, "grad_norm": 0.41046321392059326, "learning_rate": 0.00014440666126057744, "loss": 1.3672, "step": 360 }, { "epoch": 0.011754427765864507, "grad_norm": 0.4849304258823395, "learning_rate": 0.00014154150130018866, "loss": 1.3599, "step": 370 }, { "epoch": 0.012072115002779764, "grad_norm": 0.4031984508037567, "learning_rate": 0.00013863451256931287, "loss": 1.3568, "step": 380 }, { "epoch": 0.012389802239695021, "grad_norm": 0.4110875427722931, "learning_rate": 0.00013568862215918717, "loss": 1.3258, "step": 390 }, { "epoch": 0.012707489476610278, "grad_norm": 0.34926387667655945, "learning_rate": 0.00013270679633174218, "loss": 1.348, "step": 400 }, { "epoch": 0.013025176713525535, "grad_norm": 0.3700936436653137, "learning_rate": 0.0001296920375328275, "loss": 1.387, "step": 410 }, { "epoch": 0.01334286395044079, "grad_norm": 0.3237079977989197, "learning_rate": 0.00012664738136900348, "loss": 1.3228, "step": 420 }, { "epoch": 0.013660551187356047, "grad_norm": 0.3236962556838989, "learning_rate": 0.00012357589355094275, "loss": 1.2783, "step": 430 }, { "epoch": 0.013978238424271304, "grad_norm": 0.5266190767288208, "learning_rate": 0.00012048066680651908, "loss": 1.2917, "step": 440 }, { "epoch": 0.014295925661186561, "grad_norm": 0.3163311779499054, "learning_rate": 0.00011736481776669306, "loss": 1.4051, "step": 450 }, { "epoch": 0.014613612898101818, "grad_norm": 0.385373055934906, "learning_rate": 0.00011423148382732853, "loss": 1.2769, "step": 460 }, { "epoch": 0.014931300135017075, "grad_norm": 0.4726508557796478, "learning_rate": 0.00011108381999010111, "loss": 1.2518, "step": 470 }, { "epoch": 0.015248987371932332, "grad_norm": 0.3889691233634949, "learning_rate": 0.00010792499568567884, "loss": 1.3434, "step": 480 }, { "epoch": 0.01556667460884759, "grad_norm": 0.2896924316883087, "learning_rate": 0.00010475819158237425, "loss": 1.3799, "step": 490 }, { "epoch": 0.015884361845762845, "grad_norm": 0.468254029750824, "learning_rate": 0.00010158659638348081, "loss": 1.3084, "step": 500 }, { "epoch": 0.016202049082678104, "grad_norm": 0.37335512042045593, "learning_rate": 9.84134036165192e-05, "loss": 1.2532, "step": 510 }, { "epoch": 0.01651973631959336, "grad_norm": 0.44291970133781433, "learning_rate": 9.524180841762577e-05, "loss": 1.3034, "step": 520 }, { "epoch": 0.016837423556508618, "grad_norm": 0.37135663628578186, "learning_rate": 9.207500431432115e-05, "loss": 1.3557, "step": 530 }, { "epoch": 0.017155110793423873, "grad_norm": 0.2865736484527588, "learning_rate": 8.891618000989891e-05, "loss": 1.3384, "step": 540 }, { "epoch": 0.01747279803033913, "grad_norm": 0.37501224875450134, "learning_rate": 8.57685161726715e-05, "loss": 1.2738, "step": 550 }, { "epoch": 0.017790485267254387, "grad_norm": 0.4823606014251709, "learning_rate": 8.263518223330697e-05, "loss": 1.2745, "step": 560 }, { "epoch": 0.018108172504169646, "grad_norm": 0.43968015909194946, "learning_rate": 7.951933319348095e-05, "loss": 1.2699, "step": 570 }, { "epoch": 0.0184258597410849, "grad_norm": 0.4717681407928467, "learning_rate": 7.642410644905726e-05, "loss": 1.3567, "step": 580 }, { "epoch": 0.01874354697800016, "grad_norm": 0.4859773516654968, "learning_rate": 7.335261863099651e-05, "loss": 1.3445, "step": 590 }, { "epoch": 0.019061234214915415, "grad_norm": 0.45088648796081543, "learning_rate": 7.030796246717255e-05, "loss": 1.3573, "step": 600 }, { "epoch": 0.019378921451830674, "grad_norm": 0.34462448954582214, "learning_rate": 6.729320366825784e-05, "loss": 1.3151, "step": 610 }, { "epoch": 0.01969660868874593, "grad_norm": 0.392468124628067, "learning_rate": 6.431137784081282e-05, "loss": 1.295, "step": 620 }, { "epoch": 0.020014295925661188, "grad_norm": 0.3955654799938202, "learning_rate": 6.136548743068713e-05, "loss": 1.3995, "step": 630 }, { "epoch": 0.020331983162576443, "grad_norm": 0.47426915168762207, "learning_rate": 5.845849869981137e-05, "loss": 1.364, "step": 640 }, { "epoch": 0.020649670399491702, "grad_norm": 0.42022913694381714, "learning_rate": 5.559333873942259e-05, "loss": 1.4191, "step": 650 }, { "epoch": 0.020967357636406957, "grad_norm": 0.42357268929481506, "learning_rate": 5.277289252273174e-05, "loss": 1.295, "step": 660 }, { "epoch": 0.021285044873322213, "grad_norm": 0.5024418234825134, "learning_rate": 5.000000000000002e-05, "loss": 1.2788, "step": 670 }, { "epoch": 0.02160273211023747, "grad_norm": 0.37930434942245483, "learning_rate": 4.727745323894976e-05, "loss": 1.3805, "step": 680 }, { "epoch": 0.021920419347152727, "grad_norm": 0.4679397642612457, "learning_rate": 4.4607993613388976e-05, "loss": 1.2376, "step": 690 }, { "epoch": 0.022238106584067985, "grad_norm": 0.3530070185661316, "learning_rate": 4.19943090428802e-05, "loss": 1.329, "step": 700 }, { "epoch": 0.02255579382098324, "grad_norm": 0.4302089512348175, "learning_rate": 3.943903128623335e-05, "loss": 1.3312, "step": 710 }, { "epoch": 0.0228734810578985, "grad_norm": 0.32680538296699524, "learning_rate": 3.694473329154778e-05, "loss": 1.3037, "step": 720 }, { "epoch": 0.023191168294813755, "grad_norm": 0.3701346516609192, "learning_rate": 3.45139266054715e-05, "loss": 1.279, "step": 730 }, { "epoch": 0.023508855531729014, "grad_norm": 0.4951785206794739, "learning_rate": 3.21490588442868e-05, "loss": 1.2554, "step": 740 }, { "epoch": 0.02382654276864427, "grad_norm": 0.4673095643520355, "learning_rate": 2.9852511229367865e-05, "loss": 1.246, "step": 750 }, { "epoch": 0.024144230005559528, "grad_norm": 0.3411801755428314, "learning_rate": 2.7626596189492983e-05, "loss": 1.2991, "step": 760 }, { "epoch": 0.024461917242474783, "grad_norm": 0.3999599814414978, "learning_rate": 2.5473555032424533e-05, "loss": 1.3437, "step": 770 }, { "epoch": 0.024779604479390042, "grad_norm": 0.4905704855918884, "learning_rate": 2.339555568810221e-05, "loss": 1.2865, "step": 780 }, { "epoch": 0.025097291716305297, "grad_norm": 0.32417967915534973, "learning_rate": 2.139469052572127e-05, "loss": 1.3154, "step": 790 }, { "epoch": 0.025414978953220556, "grad_norm": 0.45307573676109314, "learning_rate": 1.947297424689414e-05, "loss": 1.3177, "step": 800 }, { "epoch": 0.02573266619013581, "grad_norm": 0.36498716473579407, "learning_rate": 1.763234185701673e-05, "loss": 1.2842, "step": 810 }, { "epoch": 0.02605035342705107, "grad_norm": 0.38874876499176025, "learning_rate": 1.587464671688187e-05, "loss": 1.2899, "step": 820 }, { "epoch": 0.026368040663966325, "grad_norm": 0.2834196388721466, "learning_rate": 1.4201658676502294e-05, "loss": 1.2695, "step": 830 }, { "epoch": 0.02668572790088158, "grad_norm": 0.4036119282245636, "learning_rate": 1.2615062293021507e-05, "loss": 1.2822, "step": 840 }, { "epoch": 0.02700341513779684, "grad_norm": 0.4357603192329407, "learning_rate": 1.1116455134507664e-05, "loss": 1.249, "step": 850 }, { "epoch": 0.027321102374712095, "grad_norm": 0.33189335465431213, "learning_rate": 9.707346171337894e-06, "loss": 1.3145, "step": 860 }, { "epoch": 0.027638789611627353, "grad_norm": 0.4761389493942261, "learning_rate": 8.38915425679304e-06, "loss": 1.3295, "step": 870 }, { "epoch": 0.02795647684854261, "grad_norm": 0.29026517271995544, "learning_rate": 7.163206698392744e-06, "loss": 1.3258, "step": 880 }, { "epoch": 0.028274164085457867, "grad_norm": 0.4444005787372589, "learning_rate": 6.030737921409169e-06, "loss": 1.3555, "step": 890 }, { "epoch": 0.028591851322373123, "grad_norm": 0.4796519875526428, "learning_rate": 4.992888225905468e-06, "loss": 1.3859, "step": 900 }, { "epoch": 0.02890953855928838, "grad_norm": 0.3482547402381897, "learning_rate": 4.050702638550275e-06, "loss": 1.2922, "step": 910 }, { "epoch": 0.029227225796203637, "grad_norm": 0.4299076199531555, "learning_rate": 3.2051298603643753e-06, "loss": 1.2522, "step": 920 }, { "epoch": 0.029544913033118896, "grad_norm": 0.3982202708721161, "learning_rate": 2.4570213114592954e-06, "loss": 1.3475, "step": 930 }, { "epoch": 0.02986260027003415, "grad_norm": 0.2972886264324188, "learning_rate": 1.8071302737293295e-06, "loss": 1.2685, "step": 940 }, { "epoch": 0.03018028750694941, "grad_norm": 0.5200194716453552, "learning_rate": 1.2561111323605712e-06, "loss": 1.3588, "step": 950 }, { "epoch": 0.030497974743864665, "grad_norm": 0.35589033365249634, "learning_rate": 8.04518716920466e-07, "loss": 1.3555, "step": 960 }, { "epoch": 0.030815661980779924, "grad_norm": 0.33646684885025024, "learning_rate": 4.5280774269154115e-07, "loss": 1.2939, "step": 970 }, { "epoch": 0.03113334921769518, "grad_norm": 0.36592164635658264, "learning_rate": 2.0133235281156736e-07, "loss": 1.2658, "step": 980 }, { "epoch": 0.031451036454610434, "grad_norm": 0.40712451934814453, "learning_rate": 5.0345761681491746e-08, "loss": 1.2341, "step": 990 }, { "epoch": 0.03176872369152569, "grad_norm": 0.2817166745662689, "learning_rate": 0.0, "loss": 1.2603, "step": 1000 } ], "logging_steps": 10, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.838672882171904e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }