{ "best_metric": null, "best_model_checkpoint": null, "epoch": 29.850746268656717, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15, "learning_rate": 2.7034552830322406e-05, "loss": 4.2127, "step": 10 }, { "epoch": 0.3, "learning_rate": 3.5172764151612024e-05, "loss": 3.3881, "step": 20 }, { "epoch": 0.45, "learning_rate": 3.993331259751083e-05, "loss": 3.0315, "step": 30 }, { "epoch": 0.6, "learning_rate": 4.331097547290165e-05, "loss": 2.8793, "step": 40 }, { "epoch": 0.75, "learning_rate": 4.5930894339355186e-05, "loss": 2.7883, "step": 50 }, { "epoch": 0.9, "learning_rate": 4.8071523918800455e-05, "loss": 2.7364, "step": 60 }, { "epoch": 1.04, "learning_rate": 4.9881400439889756e-05, "loss": 2.6109, "step": 70 }, { "epoch": 1.19, "learning_rate": 5.1449186794191275e-05, "loss": 2.5014, "step": 80 }, { "epoch": 1.34, "learning_rate": 5.283207236469926e-05, "loss": 2.4571, "step": 90 }, { "epoch": 1.49, "learning_rate": 5.406910566064481e-05, "loss": 2.5222, "step": 100 }, { "epoch": 1.64, "learning_rate": 5.518813839434375e-05, "loss": 2.4918, "step": 110 }, { "epoch": 1.79, "learning_rate": 5.620973524009008e-05, "loss": 2.4835, "step": 120 }, { "epoch": 1.94, "learning_rate": 5.714951323824802e-05, "loss": 2.4628, "step": 130 }, { "epoch": 2.09, "learning_rate": 5.8019611761179374e-05, "loss": 2.3972, "step": 140 }, { "epoch": 2.24, "learning_rate": 5.882965410654361e-05, "loss": 2.3233, "step": 150 }, { "epoch": 2.39, "learning_rate": 5.95873981154809e-05, "loss": 2.3075, "step": 160 }, { "epoch": 2.54, "learning_rate": 6.029918920033657e-05, "loss": 2.321, "step": 170 }, { "epoch": 2.69, "learning_rate": 6.0970283685988885e-05, "loss": 2.2946, "step": 180 }, { "epoch": 2.84, "learning_rate": 6.160508461224668e-05, "loss": 2.2593, "step": 190 }, { "epoch": 2.99, "learning_rate": 6.220731698193443e-05, "loss": 2.2308, "step": 200 }, { "epoch": 3.13, "learning_rate": 6.278016020707817e-05, "loss": 2.0929, "step": 210 }, { "epoch": 3.28, "learning_rate": 6.332634971563337e-05, "loss": 2.156, "step": 220 }, { "epoch": 3.43, "learning_rate": 6.384825595366063e-05, "loss": 2.1504, "step": 230 }, { "epoch": 3.58, "learning_rate": 6.43479465613797e-05, "loss": 2.182, "step": 240 }, { "epoch": 3.73, "learning_rate": 6.482723584838796e-05, "loss": 2.1635, "step": 250 }, { "epoch": 3.88, "learning_rate": 6.528772455953764e-05, "loss": 2.1302, "step": 260 }, { "epoch": 4.03, "learning_rate": 6.573083213188768e-05, "loss": 2.1364, "step": 270 }, { "epoch": 4.18, "learning_rate": 6.6157823082469e-05, "loss": 2.015, "step": 280 }, { "epoch": 4.33, "learning_rate": 6.656982876347945e-05, "loss": 2.0479, "step": 290 }, { "epoch": 4.48, "learning_rate": 6.696786542783324e-05, "loss": 1.9968, "step": 300 }, { "epoch": 4.63, "learning_rate": 6.735284933140416e-05, "loss": 2.012, "step": 310 }, { "epoch": 4.78, "learning_rate": 6.772560943677052e-05, "loss": 2.0458, "step": 320 }, { "epoch": 4.93, "learning_rate": 6.808689816153217e-05, "loss": 2.0991, "step": 330 }, { "epoch": 5.07, "learning_rate": 6.84374005216262e-05, "loss": 1.9845, "step": 340 }, { "epoch": 5.22, "learning_rate": 6.877774194892253e-05, "loss": 1.9325, "step": 350 }, { "epoch": 5.37, "learning_rate": 6.910849500727851e-05, "loss": 1.9603, "step": 360 }, { "epoch": 5.52, "learning_rate": 6.943018518821426e-05, "loss": 1.9095, "step": 370 }, { "epoch": 5.67, "learning_rate": 6.97432959335363e-05, "loss": 1.9443, "step": 380 }, { "epoch": 5.82, "learning_rate": 7.004827300543644e-05, "loss": 1.9461, "step": 390 }, { "epoch": 5.97, "learning_rate": 7.034552830322405e-05, "loss": 1.9462, "step": 400 }, { "epoch": 6.12, "learning_rate": 7.063544320870321e-05, "loss": 1.8685, "step": 410 }, { "epoch": 6.27, "learning_rate": 7.09183715283678e-05, "loss": 1.8694, "step": 420 }, { "epoch": 6.42, "learning_rate": 7.119464208935388e-05, "loss": 1.8429, "step": 430 }, { "epoch": 6.57, "learning_rate": 7.146456103692298e-05, "loss": 1.8458, "step": 440 }, { "epoch": 6.72, "learning_rate": 7.172841387373204e-05, "loss": 1.9065, "step": 450 }, { "epoch": 6.87, "learning_rate": 7.198646727495026e-05, "loss": 1.911, "step": 460 }, { "epoch": 7.01, "learning_rate": 7.223897070815449e-05, "loss": 1.8694, "step": 470 }, { "epoch": 7.16, "learning_rate": 7.248615788266932e-05, "loss": 1.7973, "step": 480 }, { "epoch": 7.31, "learning_rate": 7.272824804945709e-05, "loss": 1.8114, "step": 490 }, { "epoch": 7.46, "learning_rate": 7.296544716967758e-05, "loss": 1.7718, "step": 500 }, { "epoch": 7.46, "eval_loss": 2.8234732151031494, "eval_runtime": 35.5202, "eval_samples_per_second": 18.328, "eval_steps_per_second": 0.253, "step": 500 }, { "epoch": 7.61, "learning_rate": 7.319794896752499e-05, "loss": 1.7947, "step": 510 }, { "epoch": 7.76, "learning_rate": 7.342593588082727e-05, "loss": 1.8117, "step": 520 }, { "epoch": 7.91, "learning_rate": 7.364957992109503e-05, "loss": 1.8188, "step": 530 }, { "epoch": 8.06, "learning_rate": 7.386904345317732e-05, "loss": 1.8015, "step": 540 }, { "epoch": 8.21, "learning_rate": 7.408447990337652e-05, "loss": 1.734, "step": 550 }, { "epoch": 8.36, "learning_rate": 7.429603440375862e-05, "loss": 1.7217, "step": 560 }, { "epoch": 8.51, "learning_rate": 7.450384437943511e-05, "loss": 1.7398, "step": 570 }, { "epoch": 8.66, "learning_rate": 7.470804008476907e-05, "loss": 1.7452, "step": 580 }, { "epoch": 8.81, "learning_rate": 7.490874509374465e-05, "loss": 1.778, "step": 590 }, { "epoch": 8.96, "learning_rate": 7.510607674912285e-05, "loss": 1.7332, "step": 600 }, { "epoch": 9.1, "learning_rate": 7.530014657447177e-05, "loss": 1.6922, "step": 610 }, { "epoch": 9.25, "learning_rate": 7.549106065269378e-05, "loss": 1.6818, "step": 620 }, { "epoch": 9.4, "learning_rate": 7.567891997426661e-05, "loss": 1.6757, "step": 630 }, { "epoch": 9.55, "learning_rate": 7.586382075806015e-05, "loss": 1.752, "step": 640 }, { "epoch": 9.7, "learning_rate": 7.604585474728082e-05, "loss": 1.7074, "step": 650 }, { "epoch": 9.85, "learning_rate": 7.62251094828218e-05, "loss": 1.7052, "step": 660 }, { "epoch": 10.0, "learning_rate": 7.640166855605846e-05, "loss": 1.742, "step": 670 }, { "epoch": 10.15, "learning_rate": 7.65756118429158e-05, "loss": 1.6759, "step": 680 }, { "epoch": 10.3, "learning_rate": 7.674701572084905e-05, "loss": 1.6935, "step": 690 }, { "epoch": 10.45, "learning_rate": 7.691595327021215e-05, "loss": 1.6563, "step": 700 }, { "epoch": 10.6, "learning_rate": 7.708249446134367e-05, "loss": 1.6941, "step": 710 }, { "epoch": 10.75, "learning_rate": 7.724670632856813e-05, "loss": 1.676, "step": 720 }, { "epoch": 10.9, "learning_rate": 7.740865313219632e-05, "loss": 1.6948, "step": 730 }, { "epoch": 11.04, "learning_rate": 7.756839650950389e-05, "loss": 1.6687, "step": 740 }, { "epoch": 11.19, "learning_rate": 7.772599561557638e-05, "loss": 1.6469, "step": 750 }, { "epoch": 11.34, "learning_rate": 7.788150725482592e-05, "loss": 1.6783, "step": 760 }, { "epoch": 11.49, "learning_rate": 7.803498600391108e-05, "loss": 1.6408, "step": 770 }, { "epoch": 11.64, "learning_rate": 7.818648432672608e-05, "loss": 1.6521, "step": 780 }, { "epoch": 11.79, "learning_rate": 7.833605268206489e-05, "loss": 1.6451, "step": 790 }, { "epoch": 11.94, "learning_rate": 7.848373962451368e-05, "loss": 1.6504, "step": 800 }, { "epoch": 12.09, "learning_rate": 7.862959189907611e-05, "loss": 1.6431, "step": 810 }, { "epoch": 12.24, "learning_rate": 7.877365452999284e-05, "loss": 1.6131, "step": 820 }, { "epoch": 12.39, "learning_rate": 7.89159709041777e-05, "loss": 1.6256, "step": 830 }, { "epoch": 12.54, "learning_rate": 7.905658284965742e-05, "loss": 1.6257, "step": 840 }, { "epoch": 12.69, "learning_rate": 7.919553070936936e-05, "loss": 1.6143, "step": 850 }, { "epoch": 12.84, "learning_rate": 7.933285341064351e-05, "loss": 1.6383, "step": 860 }, { "epoch": 12.99, "learning_rate": 7.946858853066788e-05, "loss": 1.6234, "step": 870 }, { "epoch": 13.13, "learning_rate": 7.960277235821263e-05, "loss": 1.5871, "step": 880 }, { "epoch": 13.28, "learning_rate": 7.973543995186684e-05, "loss": 1.6028, "step": 890 }, { "epoch": 13.43, "learning_rate": 7.986662519502166e-05, "loss": 1.5723, "step": 900 }, { "epoch": 13.58, "learning_rate": 7.999636084781537e-05, "loss": 1.5936, "step": 910 }, { "epoch": 13.73, "learning_rate": 8.012467859623988e-05, "loss": 1.5869, "step": 920 }, { "epoch": 13.88, "learning_rate": 8.025160909859258e-05, "loss": 1.6018, "step": 930 }, { "epoch": 14.03, "learning_rate": 8.037718202944411e-05, "loss": 1.5926, "step": 940 }, { "epoch": 14.18, "learning_rate": 8.050142612127945e-05, "loss": 1.5546, "step": 950 }, { "epoch": 14.33, "learning_rate": 8.062436920395896e-05, "loss": 1.5601, "step": 960 }, { "epoch": 14.48, "learning_rate": 8.074603824213446e-05, "loss": 1.5668, "step": 970 }, { "epoch": 14.63, "learning_rate": 8.086645937074672e-05, "loss": 1.5623, "step": 980 }, { "epoch": 14.78, "learning_rate": 8.09856579287206e-05, "loss": 1.579, "step": 990 }, { "epoch": 14.93, "learning_rate": 8.110365849096721e-05, "loss": 1.5637, "step": 1000 }, { "epoch": 14.93, "eval_loss": 3.1092050075531006, "eval_runtime": 35.1461, "eval_samples_per_second": 18.523, "eval_steps_per_second": 0.256, "step": 1000 }, { "epoch": 15.07, "learning_rate": 8.122048489879363e-05, "loss": 1.5647, "step": 1010 }, { "epoch": 15.22, "learning_rate": 8.133616028881462e-05, "loss": 1.5349, "step": 1020 }, { "epoch": 15.37, "learning_rate": 8.145070712045392e-05, "loss": 1.542, "step": 1030 }, { "epoch": 15.52, "learning_rate": 8.15641472021169e-05, "loss": 1.5345, "step": 1040 }, { "epoch": 15.67, "learning_rate": 8.167650171611095e-05, "loss": 1.5491, "step": 1050 }, { "epoch": 15.82, "learning_rate": 8.178779124238466e-05, "loss": 1.5469, "step": 1060 }, { "epoch": 15.97, "learning_rate": 8.189803578115246e-05, "loss": 1.5825, "step": 1070 }, { "epoch": 16.12, "learning_rate": 8.200725477446693e-05, "loss": 1.5314, "step": 1080 }, { "epoch": 16.27, "learning_rate": 8.211546712679696e-05, "loss": 1.5126, "step": 1090 }, { "epoch": 16.42, "learning_rate": 8.222269122466616e-05, "loss": 1.5194, "step": 1100 }, { "epoch": 16.57, "learning_rate": 8.232894495540269e-05, "loss": 1.5276, "step": 1110 }, { "epoch": 16.72, "learning_rate": 8.243424572504824e-05, "loss": 1.5376, "step": 1120 }, { "epoch": 16.87, "learning_rate": 8.2538610475471e-05, "loss": 1.5393, "step": 1130 }, { "epoch": 17.01, "learning_rate": 8.264205570072473e-05, "loss": 1.5298, "step": 1140 }, { "epoch": 17.16, "learning_rate": 8.27445974626934e-05, "loss": 1.5135, "step": 1150 }, { "epoch": 17.31, "learning_rate": 8.284625140605869e-05, "loss": 1.5175, "step": 1160 }, { "epoch": 17.46, "learning_rate": 8.294703277262488e-05, "loss": 1.5106, "step": 1170 }, { "epoch": 17.61, "learning_rate": 8.304695641503428e-05, "loss": 1.5276, "step": 1180 }, { "epoch": 17.76, "learning_rate": 8.31460368099039e-05, "loss": 1.5227, "step": 1190 }, { "epoch": 17.91, "learning_rate": 8.324428807041249e-05, "loss": 1.5241, "step": 1200 }, { "epoch": 18.06, "learning_rate": 8.334172395836509e-05, "loss": 1.5187, "step": 1210 }, { "epoch": 18.21, "learning_rate": 8.34383578957614e-05, "loss": 1.4929, "step": 1220 }, { "epoch": 18.36, "learning_rate": 8.353420297589165e-05, "loss": 1.4934, "step": 1230 }, { "epoch": 18.51, "learning_rate": 8.362927197398341e-05, "loss": 1.5061, "step": 1240 }, { "epoch": 18.66, "learning_rate": 8.372357735742074e-05, "loss": 1.5068, "step": 1250 }, { "epoch": 18.81, "learning_rate": 8.381713129555623e-05, "loss": 1.5058, "step": 1260 }, { "epoch": 18.96, "learning_rate": 8.390994566913507e-05, "loss": 1.4944, "step": 1270 }, { "epoch": 19.1, "learning_rate": 8.400203207934977e-05, "loss": 1.4905, "step": 1280 }, { "epoch": 19.25, "learning_rate": 8.409340185654231e-05, "loss": 1.4908, "step": 1290 }, { "epoch": 19.4, "learning_rate": 8.418406606857043e-05, "loss": 1.4788, "step": 1300 }, { "epoch": 19.55, "learning_rate": 8.427403552885332e-05, "loss": 1.4851, "step": 1310 }, { "epoch": 19.7, "learning_rate": 8.436332080411142e-05, "loss": 1.4934, "step": 1320 }, { "epoch": 19.85, "learning_rate": 8.445193222181402e-05, "loss": 1.4862, "step": 1330 }, { "epoch": 20.0, "learning_rate": 8.453987987734808e-05, "loss": 1.4922, "step": 1340 }, { "epoch": 20.15, "learning_rate": 8.462717364092046e-05, "loss": 1.48, "step": 1350 }, { "epoch": 20.3, "learning_rate": 8.471382316420545e-05, "loss": 1.4731, "step": 1360 }, { "epoch": 20.45, "learning_rate": 8.479983788674874e-05, "loss": 1.4746, "step": 1370 }, { "epoch": 20.6, "learning_rate": 8.488522704213867e-05, "loss": 1.48, "step": 1380 }, { "epoch": 20.75, "learning_rate": 8.496999966395455e-05, "loss": 1.4743, "step": 1390 }, { "epoch": 20.9, "learning_rate": 8.505416459150177e-05, "loss": 1.4758, "step": 1400 }, { "epoch": 21.04, "learning_rate": 8.513773047534291e-05, "loss": 1.4738, "step": 1410 }, { "epoch": 21.19, "learning_rate": 8.522070578263329e-05, "loss": 1.4589, "step": 1420 }, { "epoch": 21.34, "learning_rate": 8.530309880226936e-05, "loss": 1.4783, "step": 1430 }, { "epoch": 21.49, "learning_rate": 8.538491764985775e-05, "loss": 1.4656, "step": 1440 }, { "epoch": 21.64, "learning_rate": 8.546617027251222e-05, "loss": 1.4702, "step": 1450 }, { "epoch": 21.79, "learning_rate": 8.554686445348594e-05, "loss": 1.4768, "step": 1460 }, { "epoch": 21.94, "learning_rate": 8.562700781664552e-05, "loss": 1.4802, "step": 1470 }, { "epoch": 22.09, "learning_rate": 8.57066078307935e-05, "loss": 1.463, "step": 1480 }, { "epoch": 22.24, "learning_rate": 8.578567181384524e-05, "loss": 1.4582, "step": 1490 }, { "epoch": 22.39, "learning_rate": 8.586420693686602e-05, "loss": 1.4588, "step": 1500 }, { "epoch": 22.39, "eval_loss": 3.2750725746154785, "eval_runtime": 34.9007, "eval_samples_per_second": 18.653, "eval_steps_per_second": 0.258, "step": 1500 }, { "epoch": 22.54, "learning_rate": 8.594222022797423e-05, "loss": 1.462, "step": 1510 }, { "epoch": 22.69, "learning_rate": 8.601971857611555e-05, "loss": 1.4671, "step": 1520 }, { "epoch": 22.84, "learning_rate": 8.609670873471342e-05, "loss": 1.4637, "step": 1530 }, { "epoch": 22.99, "learning_rate": 8.617319732520071e-05, "loss": 1.4661, "step": 1540 }, { "epoch": 23.13, "learning_rate": 8.624919084043694e-05, "loss": 1.4601, "step": 1550 }, { "epoch": 23.28, "learning_rate": 8.632469564801571e-05, "loss": 1.4553, "step": 1560 }, { "epoch": 23.43, "learning_rate": 8.639971799346644e-05, "loss": 1.4543, "step": 1570 }, { "epoch": 23.58, "learning_rate": 8.647426400335451e-05, "loss": 1.4667, "step": 1580 }, { "epoch": 23.73, "learning_rate": 8.654833968828348e-05, "loss": 1.4622, "step": 1590 }, { "epoch": 23.88, "learning_rate": 8.66219509458033e-05, "loss": 1.4654, "step": 1600 }, { "epoch": 24.03, "learning_rate": 8.669510356322798e-05, "loss": 1.4532, "step": 1610 }, { "epoch": 24.18, "learning_rate": 8.676780322036573e-05, "loss": 1.4525, "step": 1620 }, { "epoch": 24.33, "learning_rate": 8.684005549216557e-05, "loss": 1.4508, "step": 1630 }, { "epoch": 24.48, "learning_rate": 8.691186585128246e-05, "loss": 1.4526, "step": 1640 }, { "epoch": 24.63, "learning_rate": 8.698323967056495e-05, "loss": 1.4499, "step": 1650 }, { "epoch": 24.78, "learning_rate": 8.705418222546732e-05, "loss": 1.4633, "step": 1660 }, { "epoch": 24.93, "learning_rate": 8.712469869638952e-05, "loss": 1.4513, "step": 1670 }, { "epoch": 25.07, "learning_rate": 8.719479417094704e-05, "loss": 1.4543, "step": 1680 }, { "epoch": 25.22, "learning_rate": 8.726447364617366e-05, "loss": 1.4454, "step": 1690 }, { "epoch": 25.37, "learning_rate": 8.733374203065898e-05, "loss": 1.4462, "step": 1700 }, { "epoch": 25.52, "learning_rate": 8.740260414662352e-05, "loss": 1.4561, "step": 1710 }, { "epoch": 25.67, "learning_rate": 8.747106473193313e-05, "loss": 1.4503, "step": 1720 }, { "epoch": 25.82, "learning_rate": 8.753912844205501e-05, "loss": 1.453, "step": 1730 }, { "epoch": 25.97, "learning_rate": 8.76067998519575e-05, "loss": 1.4593, "step": 1740 }, { "epoch": 26.12, "learning_rate": 8.76740834579553e-05, "loss": 1.4412, "step": 1750 }, { "epoch": 26.27, "learning_rate": 8.774098367950224e-05, "loss": 1.4476, "step": 1760 }, { "epoch": 26.42, "learning_rate": 8.780750486093308e-05, "loss": 1.4412, "step": 1770 }, { "epoch": 26.57, "learning_rate": 8.787365127315646e-05, "loss": 1.4481, "step": 1780 }, { "epoch": 26.72, "learning_rate": 8.79394271153003e-05, "loss": 1.4471, "step": 1790 }, { "epoch": 26.87, "learning_rate": 8.800483651631128e-05, "loss": 1.447, "step": 1800 }, { "epoch": 27.01, "learning_rate": 8.806988353651037e-05, "loss": 1.4507, "step": 1810 }, { "epoch": 27.16, "learning_rate": 8.813457216910499e-05, "loss": 1.435, "step": 1820 }, { "epoch": 27.31, "learning_rate": 8.81989063416602e-05, "loss": 1.4361, "step": 1830 }, { "epoch": 27.46, "learning_rate": 8.82628899175295e-05, "loss": 1.4359, "step": 1840 }, { "epoch": 27.61, "learning_rate": 8.832652669724704e-05, "loss": 1.4379, "step": 1850 }, { "epoch": 27.76, "learning_rate": 8.838982041988221e-05, "loss": 1.4476, "step": 1860 }, { "epoch": 27.91, "learning_rate": 8.845277476435792e-05, "loss": 1.4395, "step": 1870 }, { "epoch": 28.06, "learning_rate": 8.851539335073373e-05, "loss": 1.4403, "step": 1880 }, { "epoch": 28.21, "learning_rate": 8.857767974145503e-05, "loss": 1.4387, "step": 1890 }, { "epoch": 28.36, "learning_rate": 8.863963744256908e-05, "loss": 1.4388, "step": 1900 }, { "epoch": 28.51, "learning_rate": 8.87012699049093e-05, "loss": 1.4377, "step": 1910 }, { "epoch": 28.66, "learning_rate": 8.876258052524857e-05, "loss": 1.4367, "step": 1920 }, { "epoch": 28.81, "learning_rate": 8.882357264742258e-05, "loss": 1.4482, "step": 1930 }, { "epoch": 28.96, "learning_rate": 8.88842495634241e-05, "loss": 1.4354, "step": 1940 }, { "epoch": 29.1, "learning_rate": 8.894461451446924e-05, "loss": 1.4333, "step": 1950 }, { "epoch": 29.25, "learning_rate": 8.900467069203634e-05, "loss": 1.4334, "step": 1960 }, { "epoch": 29.4, "learning_rate": 8.906442123887845e-05, "loss": 1.4454, "step": 1970 }, { "epoch": 29.55, "learning_rate": 8.912386925001022e-05, "loss": 1.4368, "step": 1980 }, { "epoch": 29.7, "learning_rate": 8.918301777366981e-05, "loss": 1.4319, "step": 1990 }, { "epoch": 29.85, "learning_rate": 8.924186981225684e-05, "loss": 1.4337, "step": 2000 }, { "epoch": 29.85, "eval_loss": 3.362933874130249, "eval_runtime": 35.3655, "eval_samples_per_second": 18.408, "eval_steps_per_second": 0.254, "step": 2000 } ], "max_steps": 50000, "num_train_epochs": 747, "total_flos": 348961395840.0, "trial_name": null, "trial_params": null }