{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9934123847167324, "eval_steps": 500, "global_step": 852, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03513394817742644, "grad_norm": 0.43717896938323975, "learning_rate": 0.00019230769230769233, "loss": 1.994, "step": 10 }, { "epoch": 0.07026789635485288, "grad_norm": 0.4105484187602997, "learning_rate": 0.00038461538461538467, "loss": 1.7122, "step": 20 }, { "epoch": 0.10540184453227931, "grad_norm": 0.39099714159965515, "learning_rate": 0.0004999710691449165, "loss": 1.7017, "step": 30 }, { "epoch": 0.14053579270970576, "grad_norm": 0.418653279542923, "learning_rate": 0.0004996456739191905, "loss": 1.7533, "step": 40 }, { "epoch": 0.1756697408871322, "grad_norm": 0.402630478143692, "learning_rate": 0.0004989591921187147, "loss": 1.6842, "step": 50 }, { "epoch": 0.21080368906455862, "grad_norm": 0.4017012119293213, "learning_rate": 0.0004979126166682133, "loss": 1.6915, "step": 60 }, { "epoch": 0.24593763724198506, "grad_norm": 0.5224213004112244, "learning_rate": 0.0004965074613305277, "loss": 1.7208, "step": 70 }, { "epoch": 0.2810715854194115, "grad_norm": 0.5111001133918762, "learning_rate": 0.0004947457585171148, "loss": 1.6386, "step": 80 }, { "epoch": 0.31620553359683795, "grad_norm": 0.6671903133392334, "learning_rate": 0.000492630056348375, "loss": 1.6592, "step": 90 }, { "epoch": 0.3513394817742644, "grad_norm": 0.4950112998485565, "learning_rate": 0.0004901634149680608, "loss": 1.6741, "step": 100 }, { "epoch": 0.3864734299516908, "grad_norm": 0.5058407783508301, "learning_rate": 0.0004873494021170953, "loss": 1.7568, "step": 110 }, { "epoch": 0.42160737812911725, "grad_norm": 0.8665825724601746, "learning_rate": 0.00048419208797320564, "loss": 1.7356, "step": 120 }, { "epoch": 0.4567413263065437, "grad_norm": 0.5124475955963135, "learning_rate": 0.00048069603926383277, "loss": 1.7199, "step": 130 }, { "epoch": 0.4918752744839701, "grad_norm": 0.505696177482605, "learning_rate": 0.0004768663126608342, "loss": 1.6813, "step": 140 }, { "epoch": 0.5270092226613966, "grad_norm": 0.509445071220398, "learning_rate": 0.0004727084474665322, "loss": 1.7074, "step": 150 }, { "epoch": 0.562143170838823, "grad_norm": 0.5511056184768677, "learning_rate": 0.00046822845760168783, "loss": 1.6766, "step": 160 }, { "epoch": 0.5972771190162495, "grad_norm": 0.46946030855178833, "learning_rate": 0.0004634328229069881, "loss": 1.7018, "step": 170 }, { "epoch": 0.6324110671936759, "grad_norm": 0.49886614084243774, "learning_rate": 0.00045832847977062875, "loss": 1.7293, "step": 180 }, { "epoch": 0.6675450153711023, "grad_norm": 0.49424228072166443, "learning_rate": 0.0004529228110955478, "loss": 1.7306, "step": 190 }, { "epoch": 0.7026789635485288, "grad_norm": 0.5640605092048645, "learning_rate": 0.00044722363562082237, "loss": 1.7369, "step": 200 }, { "epoch": 0.7378129117259552, "grad_norm": 0.5802103281021118, "learning_rate": 0.0004412391966126735, "loss": 1.7463, "step": 210 }, { "epoch": 0.7729468599033816, "grad_norm": 0.5365092158317566, "learning_rate": 0.0004349781499414369, "loss": 1.7198, "step": 220 }, { "epoch": 0.8080808080808081, "grad_norm": 0.5591799020767212, "learning_rate": 0.00042844955156174345, "loss": 1.7298, "step": 230 }, { "epoch": 0.8432147562582345, "grad_norm": 0.5256664156913757, "learning_rate": 0.000421662844414021, "loss": 1.6863, "step": 240 }, { "epoch": 0.8783487044356609, "grad_norm": 0.6382088661193848, "learning_rate": 0.0004146278447662597, "loss": 1.7195, "step": 250 }, { "epoch": 0.9134826526130874, "grad_norm": 0.5260637402534485, "learning_rate": 0.00040735472801579887, "loss": 1.7135, "step": 260 }, { "epoch": 0.9486166007905138, "grad_norm": 0.5244185328483582, "learning_rate": 0.0003998540139716701, "loss": 1.6944, "step": 270 }, { "epoch": 0.9837505489679402, "grad_norm": 0.5998988747596741, "learning_rate": 0.00039213655163878436, "loss": 1.6982, "step": 280 }, { "epoch": 1.0188844971453668, "grad_norm": 0.6315314173698425, "learning_rate": 0.00038421350352597195, "loss": 1.5473, "step": 290 }, { "epoch": 1.0540184453227932, "grad_norm": 0.6495606899261475, "learning_rate": 0.00037609632950057095, "loss": 1.3535, "step": 300 }, { "epoch": 1.0891523935002196, "grad_norm": 0.6094574928283691, "learning_rate": 0.0003677967702129177, "loss": 1.3452, "step": 310 }, { "epoch": 1.124286341677646, "grad_norm": 0.5786643028259277, "learning_rate": 0.0003593268301147139, "loss": 1.3433, "step": 320 }, { "epoch": 1.1594202898550725, "grad_norm": 0.6357799172401428, "learning_rate": 0.00035069876009583234, "loss": 1.4166, "step": 330 }, { "epoch": 1.194554238032499, "grad_norm": 0.6568606495857239, "learning_rate": 0.00034192503976467525, "loss": 1.323, "step": 340 }, { "epoch": 1.2296881862099254, "grad_norm": 0.6073617339134216, "learning_rate": 0.0003330183593977152, "loss": 1.389, "step": 350 }, { "epoch": 1.2648221343873518, "grad_norm": 0.5338005423545837, "learning_rate": 0.00032399160158432606, "loss": 1.3739, "step": 360 }, { "epoch": 1.2999560825647782, "grad_norm": 0.6769421100616455, "learning_rate": 0.00031485782259345406, "loss": 1.4024, "step": 370 }, { "epoch": 1.3350900307422047, "grad_norm": 0.6495899558067322, "learning_rate": 0.0003056302334890786, "loss": 1.3615, "step": 380 }, { "epoch": 1.370223978919631, "grad_norm": 0.6488791108131409, "learning_rate": 0.0002963221810217786, "loss": 1.3548, "step": 390 }, { "epoch": 1.4053579270970575, "grad_norm": 0.746868908405304, "learning_rate": 0.00028694712832404195, "loss": 1.3749, "step": 400 }, { "epoch": 1.440491875274484, "grad_norm": 0.6373124122619629, "learning_rate": 0.0002775186354372408, "loss": 1.3555, "step": 410 }, { "epoch": 1.4756258234519104, "grad_norm": 0.6247098445892334, "learning_rate": 0.0002680503396984382, "loss": 1.3977, "step": 420 }, { "epoch": 1.5107597716293368, "grad_norm": 0.6373061537742615, "learning_rate": 0.00025855593601539415, "loss": 1.3637, "step": 430 }, { "epoch": 1.5458937198067633, "grad_norm": 0.7269704341888428, "learning_rate": 0.00024904915705830234, "loss": 1.4263, "step": 440 }, { "epoch": 1.5810276679841897, "grad_norm": 0.6946704983711243, "learning_rate": 0.0002395437533969069, "loss": 1.3822, "step": 450 }, { "epoch": 1.6161616161616161, "grad_norm": 0.6399605870246887, "learning_rate": 0.0002300534736117292, "loss": 1.4348, "step": 460 }, { "epoch": 1.6512955643390426, "grad_norm": 0.6449073553085327, "learning_rate": 0.00022059204440817246, "loss": 1.3793, "step": 470 }, { "epoch": 1.686429512516469, "grad_norm": 0.616384744644165, "learning_rate": 0.00021117315076226557, "loss": 1.3917, "step": 480 }, { "epoch": 1.7215634606938954, "grad_norm": 0.6383651494979858, "learning_rate": 0.0002018104161267652, "loss": 1.4097, "step": 490 }, { "epoch": 1.7566974088713219, "grad_norm": 0.720451295375824, "learning_rate": 0.00019251738272624416, "loss": 1.3997, "step": 500 }, { "epoch": 1.7918313570487485, "grad_norm": 0.6487829685211182, "learning_rate": 0.00018330749196966806, "loss": 1.4366, "step": 510 }, { "epoch": 1.826965305226175, "grad_norm": 0.6398463249206543, "learning_rate": 0.00017419406500879115, "loss": 1.3536, "step": 520 }, { "epoch": 1.8620992534036014, "grad_norm": 0.7006503939628601, "learning_rate": 0.00016519028347049242, "loss": 1.3934, "step": 530 }, { "epoch": 1.8972332015810278, "grad_norm": 0.6170542240142822, "learning_rate": 0.00015630917039091919, "loss": 1.4171, "step": 540 }, { "epoch": 1.9323671497584543, "grad_norm": 0.6998418569564819, "learning_rate": 0.00014756357137901604, "loss": 1.3809, "step": 550 }, { "epoch": 1.9675010979358807, "grad_norm": 0.6567032933235168, "learning_rate": 0.00013896613603668365, "loss": 1.3223, "step": 560 }, { "epoch": 2.002635046113307, "grad_norm": 0.6512529253959656, "learning_rate": 0.00013052929966244216, "loss": 1.3693, "step": 570 }, { "epoch": 2.0377689942907335, "grad_norm": 0.8671336770057678, "learning_rate": 0.00012226526526506093, "loss": 1.0046, "step": 580 }, { "epoch": 2.07290294246816, "grad_norm": 0.7728586792945862, "learning_rate": 0.00011418598591317242, "loss": 1.0138, "step": 590 }, { "epoch": 2.1080368906455864, "grad_norm": 0.8196272253990173, "learning_rate": 0.0001063031474463983, "loss": 0.9985, "step": 600 }, { "epoch": 2.143170838823013, "grad_norm": 0.899488091468811, "learning_rate": 9.862815157299391e-05, "loss": 0.9397, "step": 610 }, { "epoch": 2.1783047870004393, "grad_norm": 0.7770416736602783, "learning_rate": 9.117209937846053e-05, "loss": 0.9307, "step": 620 }, { "epoch": 2.2134387351778657, "grad_norm": 0.8353050947189331, "learning_rate": 8.394577526897565e-05, "loss": 0.9334, "step": 630 }, { "epoch": 2.248572683355292, "grad_norm": 0.7152834534645081, "learning_rate": 7.69596313728691e-05, "loss": 0.9483, "step": 640 }, { "epoch": 2.2837066315327186, "grad_norm": 1.0104659795761108, "learning_rate": 7.022377242270251e-05, "loss": 0.942, "step": 650 }, { "epoch": 2.318840579710145, "grad_norm": 0.7319175004959106, "learning_rate": 6.374794113982232e-05, "loss": 0.9242, "step": 660 }, { "epoch": 2.3539745278875714, "grad_norm": 0.7895592451095581, "learning_rate": 5.7541504142523406e-05, "loss": 0.99, "step": 670 }, { "epoch": 2.389108476064998, "grad_norm": 0.9462332725524902, "learning_rate": 5.161343839820762e-05, "loss": 0.9733, "step": 680 }, { "epoch": 2.4242424242424243, "grad_norm": 0.9174964427947998, "learning_rate": 4.597231823913112e-05, "loss": 0.9478, "step": 690 }, { "epoch": 2.4593763724198507, "grad_norm": 0.7435886859893799, "learning_rate": 4.062630296052222e-05, "loss": 0.9487, "step": 700 }, { "epoch": 2.494510320597277, "grad_norm": 0.908430278301239, "learning_rate": 3.558312501900718e-05, "loss": 0.9517, "step": 710 }, { "epoch": 2.5296442687747036, "grad_norm": 0.8427926301956177, "learning_rate": 3.0850078848413704e-05, "loss": 0.93, "step": 720 }, { "epoch": 2.56477821695213, "grad_norm": 0.8396946787834167, "learning_rate": 2.643401030912876e-05, "loss": 0.9528, "step": 730 }, { "epoch": 2.5999121651295565, "grad_norm": 0.8085779547691345, "learning_rate": 2.234130678627169e-05, "loss": 0.9257, "step": 740 }, { "epoch": 2.635046113306983, "grad_norm": 0.872416079044342, "learning_rate": 1.8577887951004264e-05, "loss": 0.9294, "step": 750 }, { "epoch": 2.6701800614844093, "grad_norm": 0.8148744702339172, "learning_rate": 1.5149197198340014e-05, "loss": 0.9166, "step": 760 }, { "epoch": 2.7053140096618358, "grad_norm": 0.8573042750358582, "learning_rate": 1.206019377383813e-05, "loss": 0.9481, "step": 770 }, { "epoch": 2.740447957839262, "grad_norm": 0.7604958415031433, "learning_rate": 9.315345600569069e-06, "loss": 0.9425, "step": 780 }, { "epoch": 2.7755819060166886, "grad_norm": 0.870971143245697, "learning_rate": 6.918622816727255e-06, "loss": 0.9087, "step": 790 }, { "epoch": 2.810715854194115, "grad_norm": 0.9157975316047668, "learning_rate": 4.873492033237864e-06, "loss": 0.9547, "step": 800 }, { "epoch": 2.8458498023715415, "grad_norm": 0.9207829236984253, "learning_rate": 3.1829113196638614e-06, "loss": 0.9275, "step": 810 }, { "epoch": 2.880983750548968, "grad_norm": 0.9513605237007141, "learning_rate": 1.8493259256649187e-06, "loss": 0.981, "step": 820 }, { "epoch": 2.9161176987263944, "grad_norm": 0.854158878326416, "learning_rate": 8.746647441975619e-07, "loss": 0.9124, "step": 830 }, { "epoch": 2.951251646903821, "grad_norm": 0.8659394383430481, "learning_rate": 2.603375215716186e-07, "loss": 0.9643, "step": 840 }, { "epoch": 2.9863855950812472, "grad_norm": 0.94295734167099, "learning_rate": 7.23281839820622e-09, "loss": 0.9282, "step": 850 } ], "logging_steps": 10, "max_steps": 852, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0935759953200589e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }