{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.927710843373493, "eval_steps": 500, "global_step": 1030, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0963855421686747, "grad_norm": 0.8919573686450439, "learning_rate": 9.997674418116758e-05, "loss": 1.9774, "step": 10 }, { "epoch": 0.1927710843373494, "grad_norm": 0.910704767308804, "learning_rate": 9.990699835799469e-05, "loss": 1.6692, "step": 20 }, { "epoch": 0.2891566265060241, "grad_norm": 0.812298320227036, "learning_rate": 9.979082741033047e-05, "loss": 1.5706, "step": 30 }, { "epoch": 0.3855421686746988, "grad_norm": 0.7732472432905145, "learning_rate": 9.96283394041954e-05, "loss": 1.5094, "step": 40 }, { "epoch": 0.4819277108433735, "grad_norm": 0.8531022514130023, "learning_rate": 9.941968549125481e-05, "loss": 1.463, "step": 50 }, { "epoch": 0.5783132530120482, "grad_norm": 0.9116145183088319, "learning_rate": 9.916505976821263e-05, "loss": 1.4822, "step": 60 }, { "epoch": 0.6746987951807228, "grad_norm": 0.8879028360760914, "learning_rate": 9.886469909625625e-05, "loss": 1.4681, "step": 70 }, { "epoch": 0.7710843373493976, "grad_norm": 0.9147335792078393, "learning_rate": 9.851888288072053e-05, "loss": 1.3971, "step": 80 }, { "epoch": 0.8674698795180723, "grad_norm": 0.887118205770356, "learning_rate": 9.81279328111758e-05, "loss": 1.4472, "step": 90 }, { "epoch": 0.963855421686747, "grad_norm": 0.8531475377514153, "learning_rate": 9.769221256218164e-05, "loss": 1.4119, "step": 100 }, { "epoch": 1.0602409638554218, "grad_norm": 0.8430581950762995, "learning_rate": 9.721212745498494e-05, "loss": 1.3639, "step": 110 }, { "epoch": 1.1566265060240963, "grad_norm": 0.9779626124304455, "learning_rate": 9.668812408047679e-05, "loss": 1.3209, "step": 120 }, { "epoch": 1.2530120481927711, "grad_norm": 1.0822085428116495, "learning_rate": 9.612068988375897e-05, "loss": 1.3343, "step": 130 }, { "epoch": 1.3493975903614457, "grad_norm": 1.0414395683595772, "learning_rate": 9.551035271070664e-05, "loss": 1.2922, "step": 140 }, { "epoch": 1.4457831325301205, "grad_norm": 1.1344579456156207, "learning_rate": 9.485768031694872e-05, "loss": 1.3155, "step": 150 }, { "epoch": 1.5421686746987953, "grad_norm": 1.1349887667909784, "learning_rate": 9.416327983972304e-05, "loss": 1.2964, "step": 160 }, { "epoch": 1.6385542168674698, "grad_norm": 1.0901371692538764, "learning_rate": 9.342779723309745e-05, "loss": 1.3039, "step": 170 }, { "epoch": 1.7349397590361446, "grad_norm": 1.1237203962852838, "learning_rate": 9.265191666708209e-05, "loss": 1.3382, "step": 180 }, { "epoch": 1.8313253012048194, "grad_norm": 1.153171162807342, "learning_rate": 9.18363598911921e-05, "loss": 1.2993, "step": 190 }, { "epoch": 1.927710843373494, "grad_norm": 1.0603198764699773, "learning_rate": 9.098188556305263e-05, "loss": 1.2905, "step": 200 }, { "epoch": 2.0240963855421685, "grad_norm": 1.0860380934510327, "learning_rate": 9.008928854267054e-05, "loss": 1.2749, "step": 210 }, { "epoch": 2.1204819277108435, "grad_norm": 1.3046592302747626, "learning_rate": 8.915939915302968e-05, "loss": 1.1839, "step": 220 }, { "epoch": 2.216867469879518, "grad_norm": 1.2724868030271497, "learning_rate": 8.819308240769724e-05, "loss": 1.1675, "step": 230 }, { "epoch": 2.3132530120481927, "grad_norm": 1.4063722146992672, "learning_rate": 8.71912372061598e-05, "loss": 1.1765, "step": 240 }, { "epoch": 2.4096385542168672, "grad_norm": 1.4911158254860226, "learning_rate": 8.615479549763756e-05, "loss": 1.1846, "step": 250 }, { "epoch": 2.5060240963855422, "grad_norm": 1.4235766805578518, "learning_rate": 8.508472141415467e-05, "loss": 1.169, "step": 260 }, { "epoch": 2.602409638554217, "grad_norm": 1.38772225064988, "learning_rate": 8.398201037367201e-05, "loss": 1.1542, "step": 270 }, { "epoch": 2.6987951807228914, "grad_norm": 1.451662491739912, "learning_rate": 8.284768815411692e-05, "loss": 1.1561, "step": 280 }, { "epoch": 2.7951807228915664, "grad_norm": 1.4370300928384818, "learning_rate": 8.168280993917077e-05, "loss": 1.1738, "step": 290 }, { "epoch": 2.891566265060241, "grad_norm": 1.40886699437724, "learning_rate": 8.048845933670273e-05, "loss": 1.1636, "step": 300 }, { "epoch": 2.9879518072289155, "grad_norm": 1.4108567826491814, "learning_rate": 7.926574737076211e-05, "loss": 1.1971, "step": 310 }, { "epoch": 3.0843373493975905, "grad_norm": 1.7067948053286397, "learning_rate": 7.801581144806752e-05, "loss": 1.0335, "step": 320 }, { "epoch": 3.180722891566265, "grad_norm": 1.6598822585089195, "learning_rate": 7.673981429995372e-05, "loss": 1.0365, "step": 330 }, { "epoch": 3.2771084337349397, "grad_norm": 1.7326188759467145, "learning_rate": 7.543894290076103e-05, "loss": 1.0218, "step": 340 }, { "epoch": 3.3734939759036147, "grad_norm": 1.7737673244546652, "learning_rate": 7.411440736367281e-05, "loss": 1.0187, "step": 350 }, { "epoch": 3.4698795180722892, "grad_norm": 1.7613273642895082, "learning_rate": 7.276743981502856e-05, "loss": 1.0217, "step": 360 }, { "epoch": 3.566265060240964, "grad_norm": 1.880423633009527, "learning_rate": 7.139929324815965e-05, "loss": 1.0378, "step": 370 }, { "epoch": 3.662650602409639, "grad_norm": 1.9055198219521683, "learning_rate": 7.00112403578139e-05, "loss": 1.0406, "step": 380 }, { "epoch": 3.7590361445783134, "grad_norm": 1.926797264762729, "learning_rate": 6.860457235625322e-05, "loss": 1.0331, "step": 390 }, { "epoch": 3.855421686746988, "grad_norm": 1.9427994346863178, "learning_rate": 6.718059777212567e-05, "loss": 1.0354, "step": 400 }, { "epoch": 3.9518072289156625, "grad_norm": 1.9647182978122792, "learning_rate": 6.574064123322925e-05, "loss": 1.0445, "step": 410 }, { "epoch": 4.048192771084337, "grad_norm": 1.837995364067002, "learning_rate": 6.42860422342998e-05, "loss": 0.9521, "step": 420 }, { "epoch": 4.144578313253012, "grad_norm": 2.148688266798288, "learning_rate": 6.281815389096903e-05, "loss": 0.8772, "step": 430 }, { "epoch": 4.240963855421687, "grad_norm": 2.376205483254627, "learning_rate": 6.133834168105206e-05, "loss": 0.8616, "step": 440 }, { "epoch": 4.337349397590361, "grad_norm": 2.2364751098182527, "learning_rate": 5.9847982174335316e-05, "loss": 0.8602, "step": 450 }, { "epoch": 4.433734939759036, "grad_norm": 2.240796410509819, "learning_rate": 5.8348461752046116e-05, "loss": 0.8754, "step": 460 }, { "epoch": 4.530120481927711, "grad_norm": 2.2490246329419574, "learning_rate": 5.6841175317195515e-05, "loss": 0.8893, "step": 470 }, { "epoch": 4.626506024096385, "grad_norm": 2.328620328311832, "learning_rate": 5.532752499699381e-05, "loss": 0.8905, "step": 480 }, { "epoch": 4.72289156626506, "grad_norm": 2.4232879309790314, "learning_rate": 5.380891883854591e-05, "loss": 0.8761, "step": 490 }, { "epoch": 4.8192771084337345, "grad_norm": 2.330347626994359, "learning_rate": 5.228676949903973e-05, "loss": 0.8947, "step": 500 }, { "epoch": 4.9156626506024095, "grad_norm": 2.352453843969406, "learning_rate": 5.07624929316463e-05, "loss": 0.8807, "step": 510 }, { "epoch": 5.0120481927710845, "grad_norm": 2.2677109085585476, "learning_rate": 4.923750706835371e-05, "loss": 0.877, "step": 520 }, { "epoch": 5.108433734939759, "grad_norm": 2.929089690621969, "learning_rate": 4.771323050096028e-05, "loss": 0.6911, "step": 530 }, { "epoch": 5.204819277108434, "grad_norm": 2.6683793778473053, "learning_rate": 4.619108116145411e-05, "loss": 0.7144, "step": 540 }, { "epoch": 5.301204819277109, "grad_norm": 2.7047381812494806, "learning_rate": 4.46724750030062e-05, "loss": 0.725, "step": 550 }, { "epoch": 5.397590361445783, "grad_norm": 2.851421639172732, "learning_rate": 4.31588246828045e-05, "loss": 0.7243, "step": 560 }, { "epoch": 5.493975903614458, "grad_norm": 2.776878874955075, "learning_rate": 4.16515382479539e-05, "loss": 0.6986, "step": 570 }, { "epoch": 5.590361445783133, "grad_norm": 2.9830553324496734, "learning_rate": 4.015201782566471e-05, "loss": 0.7311, "step": 580 }, { "epoch": 5.686746987951807, "grad_norm": 2.8075065703920905, "learning_rate": 3.866165831894796e-05, "loss": 0.7511, "step": 590 }, { "epoch": 5.783132530120482, "grad_norm": 3.060856835395072, "learning_rate": 3.7181846109031005e-05, "loss": 0.7277, "step": 600 }, { "epoch": 5.879518072289157, "grad_norm": 2.9964402401459407, "learning_rate": 3.571395776570023e-05, "loss": 0.7507, "step": 610 }, { "epoch": 5.975903614457831, "grad_norm": 2.840908376303377, "learning_rate": 3.4259358766770766e-05, "loss": 0.7488, "step": 620 }, { "epoch": 6.072289156626506, "grad_norm": 3.4568397677126326, "learning_rate": 3.2819402227874365e-05, "loss": 0.6285, "step": 630 }, { "epoch": 6.168674698795181, "grad_norm": 3.1654598373448137, "learning_rate": 3.1395427643746796e-05, "loss": 0.5836, "step": 640 }, { "epoch": 6.265060240963855, "grad_norm": 3.2675864044433585, "learning_rate": 2.9988759642186097e-05, "loss": 0.5849, "step": 650 }, { "epoch": 6.36144578313253, "grad_norm": 3.2183821392920664, "learning_rate": 2.860070675184036e-05, "loss": 0.5949, "step": 660 }, { "epoch": 6.457831325301205, "grad_norm": 3.0719394770792885, "learning_rate": 2.7232560184971434e-05, "loss": 0.5786, "step": 670 }, { "epoch": 6.554216867469879, "grad_norm": 3.3384530164789523, "learning_rate": 2.588559263632719e-05, "loss": 0.5883, "step": 680 }, { "epoch": 6.650602409638554, "grad_norm": 3.257889301524805, "learning_rate": 2.456105709923897e-05, "loss": 0.5929, "step": 690 }, { "epoch": 6.746987951807229, "grad_norm": 3.3823887425006545, "learning_rate": 2.3260185700046294e-05, "loss": 0.594, "step": 700 }, { "epoch": 6.843373493975903, "grad_norm": 3.435900725711256, "learning_rate": 2.1984188551932512e-05, "loss": 0.5924, "step": 710 }, { "epoch": 6.9397590361445785, "grad_norm": 3.4675705465638473, "learning_rate": 2.0734252629237894e-05, "loss": 0.5927, "step": 720 }, { "epoch": 7.036144578313253, "grad_norm": 2.905863499929125, "learning_rate": 1.9511540663297285e-05, "loss": 0.5441, "step": 730 }, { "epoch": 7.132530120481928, "grad_norm": 3.525660740561049, "learning_rate": 1.831719006082924e-05, "loss": 0.4634, "step": 740 }, { "epoch": 7.228915662650603, "grad_norm": 3.293264937231894, "learning_rate": 1.7152311845883095e-05, "loss": 0.4671, "step": 750 }, { "epoch": 7.325301204819277, "grad_norm": 3.4643723238577904, "learning_rate": 1.601798962632799e-05, "loss": 0.476, "step": 760 }, { "epoch": 7.421686746987952, "grad_norm": 3.188250058544125, "learning_rate": 1.491527858584535e-05, "loss": 0.4981, "step": 770 }, { "epoch": 7.518072289156627, "grad_norm": 3.4201215516664516, "learning_rate": 1.384520450236244e-05, "loss": 0.4714, "step": 780 }, { "epoch": 7.614457831325301, "grad_norm": 3.2016173663444145, "learning_rate": 1.2808762793840201e-05, "loss": 0.4878, "step": 790 }, { "epoch": 7.710843373493976, "grad_norm": 3.513424701410591, "learning_rate": 1.1806917592302762e-05, "loss": 0.4914, "step": 800 }, { "epoch": 7.807228915662651, "grad_norm": 3.4642537282355543, "learning_rate": 1.0840600846970334e-05, "loss": 0.4742, "step": 810 }, { "epoch": 7.903614457831325, "grad_norm": 3.4479929396821443, "learning_rate": 9.91071145732948e-06, "loss": 0.4749, "step": 820 }, { "epoch": 8.0, "grad_norm": 3.417147882208991, "learning_rate": 9.018114436947373e-06, "loss": 0.4918, "step": 830 }, { "epoch": 8.096385542168674, "grad_norm": 3.152072043029231, "learning_rate": 8.163640108807896e-06, "loss": 0.4106, "step": 840 }, { "epoch": 8.19277108433735, "grad_norm": 3.5086623296717816, "learning_rate": 7.348083332917926e-06, "loss": 0.4141, "step": 850 }, { "epoch": 8.289156626506024, "grad_norm": 3.727099779414922, "learning_rate": 6.572202766902569e-06, "loss": 0.4287, "step": 860 }, { "epoch": 8.385542168674698, "grad_norm": 3.356502113157289, "learning_rate": 5.83672016027697e-06, "loss": 0.3974, "step": 870 }, { "epoch": 8.481927710843374, "grad_norm": 3.5114290988775427, "learning_rate": 5.1423196830513e-06, "loss": 0.4111, "step": 880 }, { "epoch": 8.578313253012048, "grad_norm": 3.3470018501511447, "learning_rate": 4.489647289293369e-06, "loss": 0.4165, "step": 890 }, { "epoch": 8.674698795180722, "grad_norm": 3.1455647605463666, "learning_rate": 3.879310116241042e-06, "loss": 0.4156, "step": 900 }, { "epoch": 8.771084337349398, "grad_norm": 3.4300111085823124, "learning_rate": 3.3118759195232275e-06, "loss": 0.3935, "step": 910 }, { "epoch": 8.867469879518072, "grad_norm": 3.2501800712068825, "learning_rate": 2.787872545015069e-06, "loss": 0.4103, "step": 920 }, { "epoch": 8.963855421686747, "grad_norm": 3.2829723029519666, "learning_rate": 2.307787437818365e-06, "loss": 0.4131, "step": 930 }, { "epoch": 9.060240963855422, "grad_norm": 3.145951812862308, "learning_rate": 1.8720671888242059e-06, "loss": 0.3828, "step": 940 }, { "epoch": 9.156626506024097, "grad_norm": 3.1956957268186112, "learning_rate": 1.4811171192794627e-06, "loss": 0.365, "step": 950 }, { "epoch": 9.25301204819277, "grad_norm": 3.024863424355031, "learning_rate": 1.1353009037437523e-06, "loss": 0.3736, "step": 960 }, { "epoch": 9.349397590361447, "grad_norm": 3.4426367987042106, "learning_rate": 8.349402317873789e-07, "loss": 0.378, "step": 970 }, { "epoch": 9.44578313253012, "grad_norm": 3.3566957382084515, "learning_rate": 5.803145087451945e-07, "loss": 0.4013, "step": 980 }, { "epoch": 9.542168674698795, "grad_norm": 3.616442803141579, "learning_rate": 3.716605958046071e-07, "loss": 0.386, "step": 990 }, { "epoch": 9.638554216867469, "grad_norm": 3.1110352724634835, "learning_rate": 2.0917258966953733e-07, "loss": 0.3862, "step": 1000 }, { "epoch": 9.734939759036145, "grad_norm": 3.3766596212586504, "learning_rate": 9.300164200530814e-08, "loss": 0.3808, "step": 1010 }, { "epoch": 9.831325301204819, "grad_norm": 3.202391216065192, "learning_rate": 2.3255818832423894e-08, "loss": 0.3844, "step": 1020 }, { "epoch": 9.927710843373493, "grad_norm": 3.3375553267537876, "learning_rate": 0.0, "loss": 0.367, "step": 1030 }, { "epoch": 9.927710843373493, "step": 1030, "total_flos": 144813601193984.0, "train_loss": 0.8549524587334938, "train_runtime": 9342.774, "train_samples_per_second": 0.888, "train_steps_per_second": 0.11 } ], "logging_steps": 10, "max_steps": 1030, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 144813601193984.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }