diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5590 @@ +{ + "best_metric": 0.1484375, + "best_model_checkpoint": "/mnt/vdc/metamath_leaderboard/checkpoint-6168", + "epoch": 3.0, + "global_step": 9252, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 7.194244604316547e-07, + "loss": 0.7531, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 1.4388489208633094e-06, + "loss": 0.6605, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 2.158273381294964e-06, + "loss": 0.4646, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 2.877697841726619e-06, + "loss": 0.3787, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 3.5971223021582737e-06, + "loss": 0.3369, + "step": 50 + }, + { + "epoch": 0.02, + "learning_rate": 4.316546762589928e-06, + "loss": 0.3264, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 5.035971223021583e-06, + "loss": 0.3008, + "step": 70 + }, + { + "epoch": 0.03, + "learning_rate": 5.755395683453238e-06, + "loss": 0.2889, + "step": 80 + }, + { + "epoch": 0.03, + "learning_rate": 6.474820143884892e-06, + "loss": 0.2497, + "step": 90 + }, + { + "epoch": 0.03, + "learning_rate": 7.194244604316547e-06, + "loss": 0.2774, + "step": 100 + }, + { + "epoch": 0.04, + "learning_rate": 7.913669064748202e-06, + "loss": 0.2653, + "step": 110 + }, + { + "epoch": 0.04, + "learning_rate": 8.633093525179856e-06, + "loss": 0.2691, + "step": 120 + }, + { + "epoch": 0.04, + "learning_rate": 9.35251798561151e-06, + "loss": 0.2661, + "step": 130 + }, + { + "epoch": 0.05, + "learning_rate": 1.0071942446043167e-05, + "loss": 0.2732, + "step": 140 + }, + { + "epoch": 0.05, + "learning_rate": 1.0791366906474821e-05, + "loss": 0.2507, + "step": 150 + }, + { + "epoch": 0.05, + "learning_rate": 1.1510791366906475e-05, + "loss": 0.2482, + "step": 160 + }, + { + "epoch": 0.06, + "learning_rate": 1.223021582733813e-05, + "loss": 0.2368, + "step": 170 + }, + { + "epoch": 0.06, + "learning_rate": 1.2949640287769784e-05, + "loss": 0.2357, + "step": 180 + }, + { + "epoch": 0.06, + "learning_rate": 1.3669064748201439e-05, + "loss": 0.2404, + "step": 190 + }, + { + "epoch": 0.06, + "learning_rate": 1.4388489208633095e-05, + "loss": 0.2479, + "step": 200 + }, + { + "epoch": 0.07, + "learning_rate": 1.5107913669064749e-05, + "loss": 0.2501, + "step": 210 + }, + { + "epoch": 0.07, + "learning_rate": 1.5827338129496403e-05, + "loss": 0.2502, + "step": 220 + }, + { + "epoch": 0.07, + "learning_rate": 1.6546762589928058e-05, + "loss": 0.232, + "step": 230 + }, + { + "epoch": 0.08, + "learning_rate": 1.7266187050359712e-05, + "loss": 0.237, + "step": 240 + }, + { + "epoch": 0.08, + "learning_rate": 1.7985611510791367e-05, + "loss": 0.2469, + "step": 250 + }, + { + "epoch": 0.08, + "learning_rate": 1.870503597122302e-05, + "loss": 0.2302, + "step": 260 + }, + { + "epoch": 0.09, + "learning_rate": 1.9424460431654675e-05, + "loss": 0.2409, + "step": 270 + }, + { + "epoch": 0.09, + "learning_rate": 1.99999975489194e-05, + "loss": 0.2106, + "step": 280 + }, + { + "epoch": 0.09, + "learning_rate": 1.9999911761224496e-05, + "loss": 0.2308, + "step": 290 + }, + { + "epoch": 0.1, + "learning_rate": 1.999970342070106e-05, + "loss": 0.2361, + "step": 300 + }, + { + "epoch": 0.1, + "learning_rate": 1.9999372529902386e-05, + "loss": 0.2277, + "step": 310 + }, + { + "epoch": 0.1, + "learning_rate": 1.9998919092883666e-05, + "loss": 0.2204, + "step": 320 + }, + { + "epoch": 0.11, + "learning_rate": 1.9998343115201945e-05, + "loss": 0.2374, + "step": 330 + }, + { + "epoch": 0.11, + "learning_rate": 1.999764460391606e-05, + "loss": 0.2073, + "step": 340 + }, + { + "epoch": 0.11, + "learning_rate": 1.999682356758654e-05, + "loss": 0.2217, + "step": 350 + }, + { + "epoch": 0.12, + "learning_rate": 1.9995880016275502e-05, + "loss": 0.2327, + "step": 360 + }, + { + "epoch": 0.12, + "learning_rate": 1.9994813961546543e-05, + "loss": 0.2303, + "step": 370 + }, + { + "epoch": 0.12, + "learning_rate": 1.9993625416464575e-05, + "loss": 0.2229, + "step": 380 + }, + { + "epoch": 0.13, + "learning_rate": 1.9992314395595686e-05, + "loss": 0.2188, + "step": 390 + }, + { + "epoch": 0.13, + "learning_rate": 1.9990880915006945e-05, + "loss": 0.2244, + "step": 400 + }, + { + "epoch": 0.13, + "learning_rate": 1.998932499226622e-05, + "loss": 0.2188, + "step": 410 + }, + { + "epoch": 0.14, + "learning_rate": 1.9987646646441956e-05, + "loss": 0.2196, + "step": 420 + }, + { + "epoch": 0.14, + "learning_rate": 1.9985845898102933e-05, + "loss": 0.2022, + "step": 430 + }, + { + "epoch": 0.14, + "learning_rate": 1.9983922769318024e-05, + "loss": 0.2219, + "step": 440 + }, + { + "epoch": 0.15, + "learning_rate": 1.9981877283655924e-05, + "loss": 0.2014, + "step": 450 + }, + { + "epoch": 0.15, + "learning_rate": 1.997970946618487e-05, + "loss": 0.1935, + "step": 460 + }, + { + "epoch": 0.15, + "learning_rate": 1.99774193434723e-05, + "loss": 0.2011, + "step": 470 + }, + { + "epoch": 0.16, + "learning_rate": 1.997500694358457e-05, + "loss": 0.2003, + "step": 480 + }, + { + "epoch": 0.16, + "learning_rate": 1.9972472296086583e-05, + "loss": 0.1996, + "step": 490 + }, + { + "epoch": 0.16, + "learning_rate": 1.9969815432041434e-05, + "loss": 0.2131, + "step": 500 + }, + { + "epoch": 0.17, + "learning_rate": 1.996703638401003e-05, + "loss": 0.2119, + "step": 510 + }, + { + "epoch": 0.17, + "learning_rate": 1.9964135186050692e-05, + "loss": 0.2192, + "step": 520 + }, + { + "epoch": 0.17, + "learning_rate": 1.996111187371874e-05, + "loss": 0.2051, + "step": 530 + }, + { + "epoch": 0.18, + "learning_rate": 1.995796648406604e-05, + "loss": 0.1961, + "step": 540 + }, + { + "epoch": 0.18, + "learning_rate": 1.9954699055640576e-05, + "loss": 0.2017, + "step": 550 + }, + { + "epoch": 0.18, + "learning_rate": 1.9951309628485963e-05, + "loss": 0.1997, + "step": 560 + }, + { + "epoch": 0.18, + "learning_rate": 1.9947798244140954e-05, + "loss": 0.2003, + "step": 570 + }, + { + "epoch": 0.19, + "learning_rate": 1.994416494563894e-05, + "loss": 0.2025, + "step": 580 + }, + { + "epoch": 0.19, + "learning_rate": 1.9940409777507407e-05, + "loss": 0.2038, + "step": 590 + }, + { + "epoch": 0.19, + "learning_rate": 1.9936532785767416e-05, + "loss": 0.2068, + "step": 600 + }, + { + "epoch": 0.2, + "learning_rate": 1.9932534017933015e-05, + "loss": 0.205, + "step": 610 + }, + { + "epoch": 0.2, + "learning_rate": 1.9928413523010667e-05, + "loss": 0.2122, + "step": 620 + }, + { + "epoch": 0.2, + "learning_rate": 1.9924171351498645e-05, + "loss": 0.1979, + "step": 630 + }, + { + "epoch": 0.21, + "learning_rate": 1.9919807555386426e-05, + "loss": 0.1921, + "step": 640 + }, + { + "epoch": 0.21, + "learning_rate": 1.9915322188154033e-05, + "loss": 0.2027, + "step": 650 + }, + { + "epoch": 0.21, + "learning_rate": 1.9910715304771396e-05, + "loss": 0.1852, + "step": 660 + }, + { + "epoch": 0.22, + "learning_rate": 1.9905986961697675e-05, + "loss": 0.1957, + "step": 670 + }, + { + "epoch": 0.22, + "learning_rate": 1.9901137216880556e-05, + "loss": 0.1945, + "step": 680 + }, + { + "epoch": 0.22, + "learning_rate": 1.989616612975557e-05, + "loss": 0.209, + "step": 690 + }, + { + "epoch": 0.23, + "learning_rate": 1.9891073761245318e-05, + "loss": 0.1963, + "step": 700 + }, + { + "epoch": 0.23, + "learning_rate": 1.988586017375878e-05, + "loss": 0.1757, + "step": 710 + }, + { + "epoch": 0.23, + "learning_rate": 1.9880525431190503e-05, + "loss": 0.1856, + "step": 720 + }, + { + "epoch": 0.24, + "learning_rate": 1.9875069598919844e-05, + "loss": 0.179, + "step": 730 + }, + { + "epoch": 0.24, + "learning_rate": 1.9869492743810163e-05, + "loss": 0.1891, + "step": 740 + }, + { + "epoch": 0.24, + "learning_rate": 1.9863794934207994e-05, + "loss": 0.1975, + "step": 750 + }, + { + "epoch": 0.25, + "learning_rate": 1.9857976239942228e-05, + "loss": 0.1819, + "step": 760 + }, + { + "epoch": 0.25, + "learning_rate": 1.9852036732323237e-05, + "loss": 0.2062, + "step": 770 + }, + { + "epoch": 0.25, + "learning_rate": 1.9845976484142003e-05, + "loss": 0.1912, + "step": 780 + }, + { + "epoch": 0.26, + "learning_rate": 1.9839795569669246e-05, + "loss": 0.1938, + "step": 790 + }, + { + "epoch": 0.26, + "learning_rate": 1.9833494064654485e-05, + "loss": 0.1864, + "step": 800 + }, + { + "epoch": 0.26, + "learning_rate": 1.982707204632513e-05, + "loss": 0.1907, + "step": 810 + }, + { + "epoch": 0.27, + "learning_rate": 1.9820529593385516e-05, + "loss": 0.188, + "step": 820 + }, + { + "epoch": 0.27, + "learning_rate": 1.981386678601598e-05, + "loss": 0.1889, + "step": 830 + }, + { + "epoch": 0.27, + "learning_rate": 1.980708370587182e-05, + "loss": 0.1871, + "step": 840 + }, + { + "epoch": 0.28, + "learning_rate": 1.9800180436082335e-05, + "loss": 0.1772, + "step": 850 + }, + { + "epoch": 0.28, + "learning_rate": 1.97931570612498e-05, + "loss": 0.1817, + "step": 860 + }, + { + "epoch": 0.28, + "learning_rate": 1.9786013667448416e-05, + "loss": 0.1765, + "step": 870 + }, + { + "epoch": 0.29, + "learning_rate": 1.977875034222327e-05, + "loss": 0.1987, + "step": 880 + }, + { + "epoch": 0.29, + "learning_rate": 1.977136717458925e-05, + "loss": 0.2069, + "step": 890 + }, + { + "epoch": 0.29, + "learning_rate": 1.9763864255029962e-05, + "loss": 0.1817, + "step": 900 + }, + { + "epoch": 0.3, + "learning_rate": 1.975624167549662e-05, + "loss": 0.1883, + "step": 910 + }, + { + "epoch": 0.3, + "learning_rate": 1.9748499529406918e-05, + "loss": 0.1738, + "step": 920 + }, + { + "epoch": 0.3, + "learning_rate": 1.9740637911643882e-05, + "loss": 0.1873, + "step": 930 + }, + { + "epoch": 0.3, + "learning_rate": 1.973265691855471e-05, + "loss": 0.193, + "step": 940 + }, + { + "epoch": 0.31, + "learning_rate": 1.9724556647949597e-05, + "loss": 0.1725, + "step": 950 + }, + { + "epoch": 0.31, + "learning_rate": 1.971633719910052e-05, + "loss": 0.1896, + "step": 960 + }, + { + "epoch": 0.31, + "learning_rate": 1.9707998672740045e-05, + "loss": 0.185, + "step": 970 + }, + { + "epoch": 0.32, + "learning_rate": 1.9699541171060068e-05, + "loss": 0.1745, + "step": 980 + }, + { + "epoch": 0.32, + "learning_rate": 1.9690964797710585e-05, + "loss": 0.1862, + "step": 990 + }, + { + "epoch": 0.32, + "learning_rate": 1.9682269657798395e-05, + "loss": 0.1801, + "step": 1000 + }, + { + "epoch": 0.33, + "learning_rate": 1.9673455857885846e-05, + "loss": 0.1754, + "step": 1010 + }, + { + "epoch": 0.33, + "learning_rate": 1.9664523505989498e-05, + "loss": 0.1882, + "step": 1020 + }, + { + "epoch": 0.33, + "learning_rate": 1.965547271157882e-05, + "loss": 0.1888, + "step": 1030 + }, + { + "epoch": 0.34, + "learning_rate": 1.9646303585574832e-05, + "loss": 0.1965, + "step": 1040 + }, + { + "epoch": 0.34, + "learning_rate": 1.9637016240348755e-05, + "loss": 0.1785, + "step": 1050 + }, + { + "epoch": 0.34, + "learning_rate": 1.9627610789720647e-05, + "loss": 0.19, + "step": 1060 + }, + { + "epoch": 0.35, + "learning_rate": 1.9618087348957973e-05, + "loss": 0.1789, + "step": 1070 + }, + { + "epoch": 0.35, + "learning_rate": 1.9608446034774225e-05, + "loss": 0.1785, + "step": 1080 + }, + { + "epoch": 0.35, + "learning_rate": 1.9598686965327483e-05, + "loss": 0.2006, + "step": 1090 + }, + { + "epoch": 0.36, + "learning_rate": 1.9588810260218955e-05, + "loss": 0.1937, + "step": 1100 + }, + { + "epoch": 0.36, + "learning_rate": 1.9578816040491526e-05, + "loss": 0.183, + "step": 1110 + }, + { + "epoch": 0.36, + "learning_rate": 1.956870442862826e-05, + "loss": 0.1834, + "step": 1120 + }, + { + "epoch": 0.37, + "learning_rate": 1.9558475548550924e-05, + "loss": 0.1784, + "step": 1130 + }, + { + "epoch": 0.37, + "learning_rate": 1.9548129525618434e-05, + "loss": 0.1753, + "step": 1140 + }, + { + "epoch": 0.37, + "learning_rate": 1.9537666486625352e-05, + "loss": 0.1813, + "step": 1150 + }, + { + "epoch": 0.38, + "learning_rate": 1.9527086559800307e-05, + "loss": 0.191, + "step": 1160 + }, + { + "epoch": 0.38, + "learning_rate": 1.9516389874804442e-05, + "loss": 0.1749, + "step": 1170 + }, + { + "epoch": 0.38, + "learning_rate": 1.9505576562729818e-05, + "loss": 0.184, + "step": 1180 + }, + { + "epoch": 0.39, + "learning_rate": 1.949464675609779e-05, + "loss": 0.1711, + "step": 1190 + }, + { + "epoch": 0.39, + "learning_rate": 1.9483600588857428e-05, + "loss": 0.1784, + "step": 1200 + }, + { + "epoch": 0.39, + "learning_rate": 1.9472438196383817e-05, + "loss": 0.1721, + "step": 1210 + }, + { + "epoch": 0.4, + "learning_rate": 1.946115971547645e-05, + "loss": 0.1883, + "step": 1220 + }, + { + "epoch": 0.4, + "learning_rate": 1.9449765284357514e-05, + "loss": 0.181, + "step": 1230 + }, + { + "epoch": 0.4, + "learning_rate": 1.943825504267022e-05, + "loss": 0.1884, + "step": 1240 + }, + { + "epoch": 0.41, + "learning_rate": 1.942662913147708e-05, + "loss": 0.1586, + "step": 1250 + }, + { + "epoch": 0.41, + "learning_rate": 1.9414887693258185e-05, + "loss": 0.1689, + "step": 1260 + }, + { + "epoch": 0.41, + "learning_rate": 1.9403030871909443e-05, + "loss": 0.1663, + "step": 1270 + }, + { + "epoch": 0.42, + "learning_rate": 1.9391058812740845e-05, + "loss": 0.1652, + "step": 1280 + }, + { + "epoch": 0.42, + "learning_rate": 1.9378971662474652e-05, + "loss": 0.1728, + "step": 1290 + }, + { + "epoch": 0.42, + "learning_rate": 1.9366769569243614e-05, + "loss": 0.1883, + "step": 1300 + }, + { + "epoch": 0.42, + "learning_rate": 1.9354452682589162e-05, + "loss": 0.183, + "step": 1310 + }, + { + "epoch": 0.43, + "learning_rate": 1.9342021153459554e-05, + "loss": 0.1786, + "step": 1320 + }, + { + "epoch": 0.43, + "learning_rate": 1.9329475134208037e-05, + "loss": 0.158, + "step": 1330 + }, + { + "epoch": 0.43, + "learning_rate": 1.9316814778590984e-05, + "loss": 0.1811, + "step": 1340 + }, + { + "epoch": 0.44, + "learning_rate": 1.9304040241766008e-05, + "loss": 0.1834, + "step": 1350 + }, + { + "epoch": 0.44, + "learning_rate": 1.9291151680290045e-05, + "loss": 0.1691, + "step": 1360 + }, + { + "epoch": 0.44, + "learning_rate": 1.927814925211746e-05, + "loss": 0.1707, + "step": 1370 + }, + { + "epoch": 0.45, + "learning_rate": 1.9265033116598096e-05, + "loss": 0.1738, + "step": 1380 + }, + { + "epoch": 0.45, + "learning_rate": 1.9251803434475317e-05, + "loss": 0.1783, + "step": 1390 + }, + { + "epoch": 0.45, + "learning_rate": 1.923846036788405e-05, + "loss": 0.1722, + "step": 1400 + }, + { + "epoch": 0.46, + "learning_rate": 1.92250040803488e-05, + "loss": 0.1752, + "step": 1410 + }, + { + "epoch": 0.46, + "learning_rate": 1.9211434736781624e-05, + "loss": 0.1702, + "step": 1420 + }, + { + "epoch": 0.46, + "learning_rate": 1.919775250348014e-05, + "loss": 0.1676, + "step": 1430 + }, + { + "epoch": 0.47, + "learning_rate": 1.918395754812546e-05, + "loss": 0.1676, + "step": 1440 + }, + { + "epoch": 0.47, + "learning_rate": 1.9170050039780158e-05, + "loss": 0.1753, + "step": 1450 + }, + { + "epoch": 0.47, + "learning_rate": 1.9156030148886193e-05, + "loss": 0.1604, + "step": 1460 + }, + { + "epoch": 0.48, + "learning_rate": 1.91418980472628e-05, + "loss": 0.184, + "step": 1470 + }, + { + "epoch": 0.48, + "learning_rate": 1.9127653908104414e-05, + "loss": 0.1724, + "step": 1480 + }, + { + "epoch": 0.48, + "learning_rate": 1.911329790597853e-05, + "loss": 0.1765, + "step": 1490 + }, + { + "epoch": 0.49, + "learning_rate": 1.9098830216823568e-05, + "loss": 0.1708, + "step": 1500 + }, + { + "epoch": 0.49, + "learning_rate": 1.9084251017946713e-05, + "loss": 0.1725, + "step": 1510 + }, + { + "epoch": 0.49, + "learning_rate": 1.9069560488021744e-05, + "loss": 0.178, + "step": 1520 + }, + { + "epoch": 0.5, + "learning_rate": 1.905475880708686e-05, + "loss": 0.1853, + "step": 1530 + }, + { + "epoch": 0.5, + "learning_rate": 1.9039846156542442e-05, + "loss": 0.1619, + "step": 1540 + }, + { + "epoch": 0.5, + "learning_rate": 1.9024822719148853e-05, + "loss": 0.1616, + "step": 1550 + }, + { + "epoch": 0.51, + "learning_rate": 1.900968867902419e-05, + "loss": 0.1689, + "step": 1560 + }, + { + "epoch": 0.51, + "learning_rate": 1.899444422164204e-05, + "loss": 0.1567, + "step": 1570 + }, + { + "epoch": 0.51, + "learning_rate": 1.8979089533829182e-05, + "loss": 0.1683, + "step": 1580 + }, + { + "epoch": 0.52, + "learning_rate": 1.8963624803763318e-05, + "loss": 0.1677, + "step": 1590 + }, + { + "epoch": 0.52, + "learning_rate": 1.8948050220970763e-05, + "loss": 0.1642, + "step": 1600 + }, + { + "epoch": 0.52, + "learning_rate": 1.893236597632412e-05, + "loss": 0.1792, + "step": 1610 + }, + { + "epoch": 0.53, + "learning_rate": 1.891657226203994e-05, + "loss": 0.1805, + "step": 1620 + }, + { + "epoch": 0.53, + "learning_rate": 1.8900669271676367e-05, + "loss": 0.1573, + "step": 1630 + }, + { + "epoch": 0.53, + "learning_rate": 1.8884657200130763e-05, + "loss": 0.1696, + "step": 1640 + }, + { + "epoch": 0.54, + "learning_rate": 1.8868536243637327e-05, + "loss": 0.1725, + "step": 1650 + }, + { + "epoch": 0.54, + "learning_rate": 1.8852306599764683e-05, + "loss": 0.1755, + "step": 1660 + }, + { + "epoch": 0.54, + "learning_rate": 1.8835968467413465e-05, + "loss": 0.1597, + "step": 1670 + }, + { + "epoch": 0.54, + "learning_rate": 1.8819522046813873e-05, + "loss": 0.1741, + "step": 1680 + }, + { + "epoch": 0.55, + "learning_rate": 1.8802967539523215e-05, + "loss": 0.1712, + "step": 1690 + }, + { + "epoch": 0.55, + "learning_rate": 1.8786305148423463e-05, + "loss": 0.1759, + "step": 1700 + }, + { + "epoch": 0.55, + "learning_rate": 1.8769535077718725e-05, + "loss": 0.1602, + "step": 1710 + }, + { + "epoch": 0.56, + "learning_rate": 1.8752657532932774e-05, + "loss": 0.1693, + "step": 1720 + }, + { + "epoch": 0.56, + "learning_rate": 1.8735672720906527e-05, + "loss": 0.1539, + "step": 1730 + }, + { + "epoch": 0.56, + "learning_rate": 1.8718580849795494e-05, + "loss": 0.166, + "step": 1740 + }, + { + "epoch": 0.57, + "learning_rate": 1.8701382129067232e-05, + "loss": 0.1695, + "step": 1750 + }, + { + "epoch": 0.57, + "learning_rate": 1.86840767694988e-05, + "loss": 0.1664, + "step": 1760 + }, + { + "epoch": 0.57, + "learning_rate": 1.8666664983174137e-05, + "loss": 0.1693, + "step": 1770 + }, + { + "epoch": 0.58, + "learning_rate": 1.864914698348149e-05, + "loss": 0.168, + "step": 1780 + }, + { + "epoch": 0.58, + "learning_rate": 1.8631522985110803e-05, + "loss": 0.161, + "step": 1790 + }, + { + "epoch": 0.58, + "learning_rate": 1.8613793204051066e-05, + "loss": 0.1825, + "step": 1800 + }, + { + "epoch": 0.59, + "learning_rate": 1.859595785758767e-05, + "loss": 0.1688, + "step": 1810 + }, + { + "epoch": 0.59, + "learning_rate": 1.8578017164299767e-05, + "loss": 0.1584, + "step": 1820 + }, + { + "epoch": 0.59, + "learning_rate": 1.8559971344057562e-05, + "loss": 0.1602, + "step": 1830 + }, + { + "epoch": 0.6, + "learning_rate": 1.8541820618019647e-05, + "loss": 0.1773, + "step": 1840 + }, + { + "epoch": 0.6, + "learning_rate": 1.8523565208630257e-05, + "loss": 0.1665, + "step": 1850 + }, + { + "epoch": 0.6, + "learning_rate": 1.8505205339616577e-05, + "loss": 0.1706, + "step": 1860 + }, + { + "epoch": 0.61, + "learning_rate": 1.848674123598598e-05, + "loss": 0.1699, + "step": 1870 + }, + { + "epoch": 0.61, + "learning_rate": 1.846817312402327e-05, + "loss": 0.1613, + "step": 1880 + }, + { + "epoch": 0.61, + "learning_rate": 1.8449501231287926e-05, + "loss": 0.1678, + "step": 1890 + }, + { + "epoch": 0.62, + "learning_rate": 1.8430725786611293e-05, + "loss": 0.1777, + "step": 1900 + }, + { + "epoch": 0.62, + "learning_rate": 1.8411847020093784e-05, + "loss": 0.1729, + "step": 1910 + }, + { + "epoch": 0.62, + "learning_rate": 1.8392865163102065e-05, + "loss": 0.1619, + "step": 1920 + }, + { + "epoch": 0.63, + "learning_rate": 1.8373780448266213e-05, + "loss": 0.1723, + "step": 1930 + }, + { + "epoch": 0.63, + "learning_rate": 1.8354593109476877e-05, + "loss": 0.1561, + "step": 1940 + }, + { + "epoch": 0.63, + "learning_rate": 1.833530338188239e-05, + "loss": 0.166, + "step": 1950 + }, + { + "epoch": 0.64, + "learning_rate": 1.8315911501885905e-05, + "loss": 0.1684, + "step": 1960 + }, + { + "epoch": 0.64, + "learning_rate": 1.82964177071425e-05, + "loss": 0.1627, + "step": 1970 + }, + { + "epoch": 0.64, + "learning_rate": 1.8276822236556246e-05, + "loss": 0.171, + "step": 1980 + }, + { + "epoch": 0.65, + "learning_rate": 1.82571253302773e-05, + "loss": 0.1604, + "step": 1990 + }, + { + "epoch": 0.65, + "learning_rate": 1.8237327229698943e-05, + "loss": 0.176, + "step": 2000 + }, + { + "epoch": 0.65, + "learning_rate": 1.821742817745465e-05, + "loss": 0.1724, + "step": 2010 + }, + { + "epoch": 0.65, + "learning_rate": 1.8197428417415075e-05, + "loss": 0.1688, + "step": 2020 + }, + { + "epoch": 0.66, + "learning_rate": 1.8177328194685108e-05, + "loss": 0.1579, + "step": 2030 + }, + { + "epoch": 0.66, + "learning_rate": 1.8157127755600826e-05, + "loss": 0.1561, + "step": 2040 + }, + { + "epoch": 0.66, + "learning_rate": 1.8136827347726516e-05, + "loss": 0.1663, + "step": 2050 + }, + { + "epoch": 0.67, + "learning_rate": 1.8116427219851615e-05, + "loss": 0.1621, + "step": 2060 + }, + { + "epoch": 0.67, + "learning_rate": 1.8095927621987658e-05, + "loss": 0.1647, + "step": 2070 + }, + { + "epoch": 0.67, + "learning_rate": 1.807532880536524e-05, + "loss": 0.1773, + "step": 2080 + }, + { + "epoch": 0.68, + "learning_rate": 1.8054631022430913e-05, + "loss": 0.1668, + "step": 2090 + }, + { + "epoch": 0.68, + "learning_rate": 1.8033834526844095e-05, + "loss": 0.1496, + "step": 2100 + }, + { + "epoch": 0.68, + "learning_rate": 1.8012939573473972e-05, + "loss": 0.169, + "step": 2110 + }, + { + "epoch": 0.69, + "learning_rate": 1.7991946418396365e-05, + "loss": 0.1706, + "step": 2120 + }, + { + "epoch": 0.69, + "learning_rate": 1.7970855318890606e-05, + "loss": 0.1599, + "step": 2130 + }, + { + "epoch": 0.69, + "learning_rate": 1.7949666533436358e-05, + "loss": 0.1673, + "step": 2140 + }, + { + "epoch": 0.7, + "learning_rate": 1.792838032171047e-05, + "loss": 0.1586, + "step": 2150 + }, + { + "epoch": 0.7, + "learning_rate": 1.79069969445838e-05, + "loss": 0.1623, + "step": 2160 + }, + { + "epoch": 0.7, + "learning_rate": 1.7885516664117982e-05, + "loss": 0.1572, + "step": 2170 + }, + { + "epoch": 0.71, + "learning_rate": 1.7863939743562266e-05, + "loss": 0.1637, + "step": 2180 + }, + { + "epoch": 0.71, + "learning_rate": 1.7842266447350236e-05, + "loss": 0.1637, + "step": 2190 + }, + { + "epoch": 0.71, + "learning_rate": 1.782049704109662e-05, + "loss": 0.1568, + "step": 2200 + }, + { + "epoch": 0.72, + "learning_rate": 1.7798631791594e-05, + "loss": 0.1585, + "step": 2210 + }, + { + "epoch": 0.72, + "learning_rate": 1.777667096680956e-05, + "loss": 0.1649, + "step": 2220 + }, + { + "epoch": 0.72, + "learning_rate": 1.7754614835881795e-05, + "loss": 0.1646, + "step": 2230 + }, + { + "epoch": 0.73, + "learning_rate": 1.7732463669117206e-05, + "loss": 0.1605, + "step": 2240 + }, + { + "epoch": 0.73, + "learning_rate": 1.7710217737987008e-05, + "loss": 0.1515, + "step": 2250 + }, + { + "epoch": 0.73, + "learning_rate": 1.768787731512379e-05, + "loss": 0.1458, + "step": 2260 + }, + { + "epoch": 0.74, + "learning_rate": 1.766544267431816e-05, + "loss": 0.1674, + "step": 2270 + }, + { + "epoch": 0.74, + "learning_rate": 1.7642914090515423e-05, + "loss": 0.1659, + "step": 2280 + }, + { + "epoch": 0.74, + "learning_rate": 1.762029183981217e-05, + "loss": 0.1512, + "step": 2290 + }, + { + "epoch": 0.75, + "learning_rate": 1.759757619945294e-05, + "loss": 0.1736, + "step": 2300 + }, + { + "epoch": 0.75, + "learning_rate": 1.7574767447826776e-05, + "loss": 0.1656, + "step": 2310 + }, + { + "epoch": 0.75, + "learning_rate": 1.7551865864463857e-05, + "loss": 0.157, + "step": 2320 + }, + { + "epoch": 0.76, + "learning_rate": 1.7528871730032034e-05, + "loss": 0.1588, + "step": 2330 + }, + { + "epoch": 0.76, + "learning_rate": 1.750578532633342e-05, + "loss": 0.1547, + "step": 2340 + }, + { + "epoch": 0.76, + "learning_rate": 1.748260693630092e-05, + "loss": 0.1528, + "step": 2350 + }, + { + "epoch": 0.77, + "learning_rate": 1.7459336843994758e-05, + "loss": 0.1541, + "step": 2360 + }, + { + "epoch": 0.77, + "learning_rate": 1.7435975334599026e-05, + "loss": 0.1554, + "step": 2370 + }, + { + "epoch": 0.77, + "learning_rate": 1.741252269441815e-05, + "loss": 0.1728, + "step": 2380 + }, + { + "epoch": 0.77, + "learning_rate": 1.73889792108734e-05, + "loss": 0.1678, + "step": 2390 + }, + { + "epoch": 0.78, + "learning_rate": 1.736534517249938e-05, + "loss": 0.1586, + "step": 2400 + }, + { + "epoch": 0.78, + "learning_rate": 1.7341620868940467e-05, + "loss": 0.1549, + "step": 2410 + }, + { + "epoch": 0.78, + "learning_rate": 1.731780659094728e-05, + "loss": 0.1561, + "step": 2420 + }, + { + "epoch": 0.79, + "learning_rate": 1.7293902630373103e-05, + "loss": 0.1624, + "step": 2430 + }, + { + "epoch": 0.79, + "learning_rate": 1.726990928017032e-05, + "loss": 0.1561, + "step": 2440 + }, + { + "epoch": 0.79, + "learning_rate": 1.7245826834386825e-05, + "loss": 0.1424, + "step": 2450 + }, + { + "epoch": 0.8, + "learning_rate": 1.7221655588162397e-05, + "loss": 0.1605, + "step": 2460 + }, + { + "epoch": 0.8, + "learning_rate": 1.7197395837725118e-05, + "loss": 0.1547, + "step": 2470 + }, + { + "epoch": 0.8, + "learning_rate": 1.717304788038771e-05, + "loss": 0.164, + "step": 2480 + }, + { + "epoch": 0.81, + "learning_rate": 1.7148612014543915e-05, + "loss": 0.1569, + "step": 2490 + }, + { + "epoch": 0.81, + "learning_rate": 1.712408853966482e-05, + "loss": 0.1527, + "step": 2500 + }, + { + "epoch": 0.81, + "learning_rate": 1.7099477756295195e-05, + "loss": 0.154, + "step": 2510 + }, + { + "epoch": 0.82, + "learning_rate": 1.7074779966049818e-05, + "loss": 0.1588, + "step": 2520 + }, + { + "epoch": 0.82, + "learning_rate": 1.7049995471609765e-05, + "loss": 0.1595, + "step": 2530 + }, + { + "epoch": 0.82, + "learning_rate": 1.70251245767187e-05, + "loss": 0.1761, + "step": 2540 + }, + { + "epoch": 0.83, + "learning_rate": 1.7000167586179173e-05, + "loss": 0.1563, + "step": 2550 + }, + { + "epoch": 0.83, + "learning_rate": 1.6975124805848852e-05, + "loss": 0.1592, + "step": 2560 + }, + { + "epoch": 0.83, + "learning_rate": 1.694999654263681e-05, + "loss": 0.1597, + "step": 2570 + }, + { + "epoch": 0.84, + "learning_rate": 1.692478310449973e-05, + "loss": 0.1611, + "step": 2580 + }, + { + "epoch": 0.84, + "learning_rate": 1.689948480043816e-05, + "loss": 0.1712, + "step": 2590 + }, + { + "epoch": 0.84, + "learning_rate": 1.6874101940492707e-05, + "loss": 0.1603, + "step": 2600 + }, + { + "epoch": 0.85, + "learning_rate": 1.684863483574024e-05, + "loss": 0.1666, + "step": 2610 + }, + { + "epoch": 0.85, + "learning_rate": 1.6823083798290092e-05, + "loss": 0.1599, + "step": 2620 + }, + { + "epoch": 0.85, + "learning_rate": 1.6797449141280213e-05, + "loss": 0.1468, + "step": 2630 + }, + { + "epoch": 0.86, + "learning_rate": 1.6771731178873344e-05, + "loss": 0.1519, + "step": 2640 + }, + { + "epoch": 0.86, + "learning_rate": 1.674593022625318e-05, + "loss": 0.1565, + "step": 2650 + }, + { + "epoch": 0.86, + "learning_rate": 1.6720046599620476e-05, + "loss": 0.1513, + "step": 2660 + }, + { + "epoch": 0.87, + "learning_rate": 1.6694080616189197e-05, + "loss": 0.1616, + "step": 2670 + }, + { + "epoch": 0.87, + "learning_rate": 1.6668032594182623e-05, + "loss": 0.1642, + "step": 2680 + }, + { + "epoch": 0.87, + "learning_rate": 1.664190285282945e-05, + "loss": 0.1564, + "step": 2690 + }, + { + "epoch": 0.88, + "learning_rate": 1.661569171235988e-05, + "loss": 0.1604, + "step": 2700 + }, + { + "epoch": 0.88, + "learning_rate": 1.658939949400167e-05, + "loss": 0.1552, + "step": 2710 + }, + { + "epoch": 0.88, + "learning_rate": 1.656302651997626e-05, + "loss": 0.1526, + "step": 2720 + }, + { + "epoch": 0.89, + "learning_rate": 1.6536573113494737e-05, + "loss": 0.16, + "step": 2730 + }, + { + "epoch": 0.89, + "learning_rate": 1.6510039598753953e-05, + "loss": 0.155, + "step": 2740 + }, + { + "epoch": 0.89, + "learning_rate": 1.64834263009325e-05, + "loss": 0.1641, + "step": 2750 + }, + { + "epoch": 0.89, + "learning_rate": 1.6456733546186755e-05, + "loss": 0.1423, + "step": 2760 + }, + { + "epoch": 0.9, + "learning_rate": 1.6429961661646858e-05, + "loss": 0.1604, + "step": 2770 + }, + { + "epoch": 0.9, + "learning_rate": 1.6403110975412723e-05, + "loss": 0.1698, + "step": 2780 + }, + { + "epoch": 0.9, + "learning_rate": 1.637618181655001e-05, + "loss": 0.1537, + "step": 2790 + }, + { + "epoch": 0.91, + "learning_rate": 1.6349174515086087e-05, + "loss": 0.158, + "step": 2800 + }, + { + "epoch": 0.91, + "learning_rate": 1.6322089402005995e-05, + "loss": 0.145, + "step": 2810 + }, + { + "epoch": 0.91, + "learning_rate": 1.629492680924839e-05, + "loss": 0.1462, + "step": 2820 + }, + { + "epoch": 0.92, + "learning_rate": 1.6267687069701455e-05, + "loss": 0.1536, + "step": 2830 + }, + { + "epoch": 0.92, + "learning_rate": 1.6240370517198855e-05, + "loss": 0.1456, + "step": 2840 + }, + { + "epoch": 0.92, + "learning_rate": 1.6212977486515626e-05, + "loss": 0.1576, + "step": 2850 + }, + { + "epoch": 0.93, + "learning_rate": 1.618550831336406e-05, + "loss": 0.1555, + "step": 2860 + }, + { + "epoch": 0.93, + "learning_rate": 1.6157963334389623e-05, + "loss": 0.1593, + "step": 2870 + }, + { + "epoch": 0.93, + "learning_rate": 1.61303428871668e-05, + "loss": 0.155, + "step": 2880 + }, + { + "epoch": 0.94, + "learning_rate": 1.6102647310194964e-05, + "loss": 0.1502, + "step": 2890 + }, + { + "epoch": 0.94, + "learning_rate": 1.607487694289425e-05, + "loss": 0.144, + "step": 2900 + }, + { + "epoch": 0.94, + "learning_rate": 1.6047032125601364e-05, + "loss": 0.1422, + "step": 2910 + }, + { + "epoch": 0.95, + "learning_rate": 1.6019113199565424e-05, + "loss": 0.1594, + "step": 2920 + }, + { + "epoch": 0.95, + "learning_rate": 1.599112050694379e-05, + "loss": 0.1488, + "step": 2930 + }, + { + "epoch": 0.95, + "learning_rate": 1.596305439079785e-05, + "loss": 0.1631, + "step": 2940 + }, + { + "epoch": 0.96, + "learning_rate": 1.5934915195088842e-05, + "loss": 0.1401, + "step": 2950 + }, + { + "epoch": 0.96, + "learning_rate": 1.5906703264673598e-05, + "loss": 0.1526, + "step": 2960 + }, + { + "epoch": 0.96, + "learning_rate": 1.5878418945300363e-05, + "loss": 0.15, + "step": 2970 + }, + { + "epoch": 0.97, + "learning_rate": 1.5850062583604534e-05, + "loss": 0.1589, + "step": 2980 + }, + { + "epoch": 0.97, + "learning_rate": 1.58216345271044e-05, + "loss": 0.1551, + "step": 2990 + }, + { + "epoch": 0.97, + "learning_rate": 1.5793135124196916e-05, + "loss": 0.1482, + "step": 3000 + }, + { + "epoch": 0.98, + "learning_rate": 1.5764564724153406e-05, + "loss": 0.1518, + "step": 3010 + }, + { + "epoch": 0.98, + "learning_rate": 1.5735923677115298e-05, + "loss": 0.1495, + "step": 3020 + }, + { + "epoch": 0.98, + "learning_rate": 1.570721233408981e-05, + "loss": 0.1492, + "step": 3030 + }, + { + "epoch": 0.99, + "learning_rate": 1.567843104694569e-05, + "loss": 0.1538, + "step": 3040 + }, + { + "epoch": 0.99, + "learning_rate": 1.5649580168408854e-05, + "loss": 0.1521, + "step": 3050 + }, + { + "epoch": 0.99, + "learning_rate": 1.5620660052058108e-05, + "loss": 0.1593, + "step": 3060 + }, + { + "epoch": 1.0, + "learning_rate": 1.5591671052320784e-05, + "loss": 0.1604, + "step": 3070 + }, + { + "epoch": 1.0, + "learning_rate": 1.55626135244684e-05, + "loss": 0.1405, + "step": 3080 + }, + { + "epoch": 1.0, + "eval_loss": 0.1611328125, + "eval_runtime": 6.2849, + "eval_samples_per_second": 20.366, + "eval_steps_per_second": 0.159, + "step": 3084 + }, + { + "epoch": 1.0, + "learning_rate": 1.553348782461233e-05, + "loss": 0.1398, + "step": 3090 + }, + { + "epoch": 1.01, + "learning_rate": 1.550429430969941e-05, + "loss": 0.1198, + "step": 3100 + }, + { + "epoch": 1.01, + "learning_rate": 1.5475033337507583e-05, + "loss": 0.109, + "step": 3110 + }, + { + "epoch": 1.01, + "learning_rate": 1.54457052666415e-05, + "loss": 0.1167, + "step": 3120 + }, + { + "epoch": 1.01, + "learning_rate": 1.541631045652814e-05, + "loss": 0.108, + "step": 3130 + }, + { + "epoch": 1.02, + "learning_rate": 1.5386849267412388e-05, + "loss": 0.1184, + "step": 3140 + }, + { + "epoch": 1.02, + "learning_rate": 1.5357322060352646e-05, + "loss": 0.1193, + "step": 3150 + }, + { + "epoch": 1.02, + "learning_rate": 1.5327729197216373e-05, + "loss": 0.1218, + "step": 3160 + }, + { + "epoch": 1.03, + "learning_rate": 1.529807104067568e-05, + "loss": 0.1152, + "step": 3170 + }, + { + "epoch": 1.03, + "learning_rate": 1.5268347954202872e-05, + "loss": 0.1079, + "step": 3180 + }, + { + "epoch": 1.03, + "learning_rate": 1.5238560302065992e-05, + "loss": 0.1128, + "step": 3190 + }, + { + "epoch": 1.04, + "learning_rate": 1.5208708449324369e-05, + "loss": 0.1158, + "step": 3200 + }, + { + "epoch": 1.04, + "learning_rate": 1.5178792761824129e-05, + "loss": 0.1204, + "step": 3210 + }, + { + "epoch": 1.04, + "learning_rate": 1.5148813606193715e-05, + "loss": 0.111, + "step": 3220 + }, + { + "epoch": 1.05, + "learning_rate": 1.5118771349839402e-05, + "loss": 0.1161, + "step": 3230 + }, + { + "epoch": 1.05, + "learning_rate": 1.5088666360940795e-05, + "loss": 0.1158, + "step": 3240 + }, + { + "epoch": 1.05, + "learning_rate": 1.5058499008446296e-05, + "loss": 0.1143, + "step": 3250 + }, + { + "epoch": 1.06, + "learning_rate": 1.502826966206861e-05, + "loss": 0.113, + "step": 3260 + }, + { + "epoch": 1.06, + "learning_rate": 1.4997978692280191e-05, + "loss": 0.122, + "step": 3270 + }, + { + "epoch": 1.06, + "learning_rate": 1.496762647030872e-05, + "loss": 0.1213, + "step": 3280 + }, + { + "epoch": 1.07, + "learning_rate": 1.4937213368132549e-05, + "loss": 0.125, + "step": 3290 + }, + { + "epoch": 1.07, + "learning_rate": 1.490673975847613e-05, + "loss": 0.1162, + "step": 3300 + }, + { + "epoch": 1.07, + "learning_rate": 1.4876206014805465e-05, + "loss": 0.1181, + "step": 3310 + }, + { + "epoch": 1.08, + "learning_rate": 1.4845612511323526e-05, + "loss": 0.1216, + "step": 3320 + }, + { + "epoch": 1.08, + "learning_rate": 1.4814959622965657e-05, + "loss": 0.1216, + "step": 3330 + }, + { + "epoch": 1.08, + "learning_rate": 1.478424772539499e-05, + "loss": 0.106, + "step": 3340 + }, + { + "epoch": 1.09, + "learning_rate": 1.4753477194997836e-05, + "loss": 0.1239, + "step": 3350 + }, + { + "epoch": 1.09, + "learning_rate": 1.4722648408879078e-05, + "loss": 0.1101, + "step": 3360 + }, + { + "epoch": 1.09, + "learning_rate": 1.4691761744857545e-05, + "loss": 0.1233, + "step": 3370 + }, + { + "epoch": 1.1, + "learning_rate": 1.466081758146138e-05, + "loss": 0.117, + "step": 3380 + }, + { + "epoch": 1.1, + "learning_rate": 1.4629816297923404e-05, + "loss": 0.1162, + "step": 3390 + }, + { + "epoch": 1.1, + "learning_rate": 1.4598758274176467e-05, + "loss": 0.1214, + "step": 3400 + }, + { + "epoch": 1.11, + "learning_rate": 1.4567643890848796e-05, + "loss": 0.1139, + "step": 3410 + }, + { + "epoch": 1.11, + "learning_rate": 1.4536473529259325e-05, + "loss": 0.1191, + "step": 3420 + }, + { + "epoch": 1.11, + "learning_rate": 1.4505247571413019e-05, + "loss": 0.1132, + "step": 3430 + }, + { + "epoch": 1.12, + "learning_rate": 1.4473966399996203e-05, + "loss": 0.1151, + "step": 3440 + }, + { + "epoch": 1.12, + "learning_rate": 1.444263039837186e-05, + "loss": 0.1244, + "step": 3450 + }, + { + "epoch": 1.12, + "learning_rate": 1.4411239950574946e-05, + "loss": 0.113, + "step": 3460 + }, + { + "epoch": 1.13, + "learning_rate": 1.4379795441307673e-05, + "loss": 0.1155, + "step": 3470 + }, + { + "epoch": 1.13, + "learning_rate": 1.4348297255934793e-05, + "loss": 0.12, + "step": 3480 + }, + { + "epoch": 1.13, + "learning_rate": 1.4316745780478885e-05, + "loss": 0.1129, + "step": 3490 + }, + { + "epoch": 1.13, + "learning_rate": 1.4285141401615619e-05, + "loss": 0.1191, + "step": 3500 + }, + { + "epoch": 1.14, + "learning_rate": 1.4253484506669012e-05, + "loss": 0.1143, + "step": 3510 + }, + { + "epoch": 1.14, + "learning_rate": 1.422177548360669e-05, + "loss": 0.124, + "step": 3520 + }, + { + "epoch": 1.14, + "learning_rate": 1.4190014721035127e-05, + "loss": 0.1236, + "step": 3530 + }, + { + "epoch": 1.15, + "learning_rate": 1.4158202608194893e-05, + "loss": 0.116, + "step": 3540 + }, + { + "epoch": 1.15, + "learning_rate": 1.4126339534955863e-05, + "loss": 0.1128, + "step": 3550 + }, + { + "epoch": 1.15, + "learning_rate": 1.4094425891812457e-05, + "loss": 0.1196, + "step": 3560 + }, + { + "epoch": 1.16, + "learning_rate": 1.4062462069878855e-05, + "loss": 0.1128, + "step": 3570 + }, + { + "epoch": 1.16, + "learning_rate": 1.4030448460884191e-05, + "loss": 0.1163, + "step": 3580 + }, + { + "epoch": 1.16, + "learning_rate": 1.3998385457167758e-05, + "loss": 0.1178, + "step": 3590 + }, + { + "epoch": 1.17, + "learning_rate": 1.3966273451674203e-05, + "loss": 0.1128, + "step": 3600 + }, + { + "epoch": 1.17, + "learning_rate": 1.3934112837948712e-05, + "loss": 0.1167, + "step": 3610 + }, + { + "epoch": 1.17, + "learning_rate": 1.3901904010132178e-05, + "loss": 0.1181, + "step": 3620 + }, + { + "epoch": 1.18, + "learning_rate": 1.3869647362956381e-05, + "loss": 0.1124, + "step": 3630 + }, + { + "epoch": 1.18, + "learning_rate": 1.3837343291739143e-05, + "loss": 0.1189, + "step": 3640 + }, + { + "epoch": 1.18, + "learning_rate": 1.3804992192379487e-05, + "loss": 0.121, + "step": 3650 + }, + { + "epoch": 1.19, + "learning_rate": 1.3772594461352786e-05, + "loss": 0.1185, + "step": 3660 + }, + { + "epoch": 1.19, + "learning_rate": 1.3740150495705904e-05, + "loss": 0.1208, + "step": 3670 + }, + { + "epoch": 1.19, + "learning_rate": 1.3707660693052318e-05, + "loss": 0.1214, + "step": 3680 + }, + { + "epoch": 1.2, + "learning_rate": 1.3675125451567268e-05, + "loss": 0.1103, + "step": 3690 + }, + { + "epoch": 1.2, + "learning_rate": 1.364254516998286e-05, + "loss": 0.1119, + "step": 3700 + }, + { + "epoch": 1.2, + "learning_rate": 1.3609920247583182e-05, + "loss": 0.1192, + "step": 3710 + }, + { + "epoch": 1.21, + "learning_rate": 1.3577251084199412e-05, + "loss": 0.1249, + "step": 3720 + }, + { + "epoch": 1.21, + "learning_rate": 1.3544538080204922e-05, + "loss": 0.1212, + "step": 3730 + }, + { + "epoch": 1.21, + "learning_rate": 1.351178163651037e-05, + "loss": 0.115, + "step": 3740 + }, + { + "epoch": 1.22, + "learning_rate": 1.3478982154558778e-05, + "loss": 0.1195, + "step": 3750 + }, + { + "epoch": 1.22, + "learning_rate": 1.3446140036320621e-05, + "loss": 0.1264, + "step": 3760 + }, + { + "epoch": 1.22, + "learning_rate": 1.34132556842889e-05, + "loss": 0.1165, + "step": 3770 + }, + { + "epoch": 1.23, + "learning_rate": 1.3380329501474207e-05, + "loss": 0.1211, + "step": 3780 + }, + { + "epoch": 1.23, + "learning_rate": 1.3347361891399786e-05, + "loss": 0.113, + "step": 3790 + }, + { + "epoch": 1.23, + "learning_rate": 1.3314353258096588e-05, + "loss": 0.1135, + "step": 3800 + }, + { + "epoch": 1.24, + "learning_rate": 1.3281304006098324e-05, + "loss": 0.1125, + "step": 3810 + }, + { + "epoch": 1.24, + "learning_rate": 1.3248214540436495e-05, + "loss": 0.1245, + "step": 3820 + }, + { + "epoch": 1.24, + "learning_rate": 1.3215085266635442e-05, + "loss": 0.1112, + "step": 3830 + }, + { + "epoch": 1.25, + "learning_rate": 1.3181916590707366e-05, + "loss": 0.1209, + "step": 3840 + }, + { + "epoch": 1.25, + "learning_rate": 1.3148708919147364e-05, + "loss": 0.117, + "step": 3850 + }, + { + "epoch": 1.25, + "learning_rate": 1.3115462658928434e-05, + "loss": 0.1164, + "step": 3860 + }, + { + "epoch": 1.25, + "learning_rate": 1.3082178217496488e-05, + "loss": 0.1148, + "step": 3870 + }, + { + "epoch": 1.26, + "learning_rate": 1.304885600276538e-05, + "loss": 0.1159, + "step": 3880 + }, + { + "epoch": 1.26, + "learning_rate": 1.3015496423111871e-05, + "loss": 0.1198, + "step": 3890 + }, + { + "epoch": 1.26, + "learning_rate": 1.298209988737066e-05, + "loss": 0.1186, + "step": 3900 + }, + { + "epoch": 1.27, + "learning_rate": 1.2948666804829345e-05, + "loss": 0.1093, + "step": 3910 + }, + { + "epoch": 1.27, + "learning_rate": 1.2915197585223427e-05, + "loss": 0.1189, + "step": 3920 + }, + { + "epoch": 1.27, + "learning_rate": 1.288169263873128e-05, + "loss": 0.1027, + "step": 3930 + }, + { + "epoch": 1.28, + "learning_rate": 1.284815237596912e-05, + "loss": 0.114, + "step": 3940 + }, + { + "epoch": 1.28, + "learning_rate": 1.2814577207985984e-05, + "loss": 0.11, + "step": 3950 + }, + { + "epoch": 1.28, + "learning_rate": 1.2780967546258683e-05, + "loss": 0.1129, + "step": 3960 + }, + { + "epoch": 1.29, + "learning_rate": 1.2747323802686761e-05, + "loss": 0.1159, + "step": 3970 + }, + { + "epoch": 1.29, + "learning_rate": 1.2713646389587453e-05, + "loss": 0.1213, + "step": 3980 + }, + { + "epoch": 1.29, + "learning_rate": 1.267993571969062e-05, + "loss": 0.1117, + "step": 3990 + }, + { + "epoch": 1.3, + "learning_rate": 1.2646192206133705e-05, + "loss": 0.1187, + "step": 4000 + }, + { + "epoch": 1.3, + "learning_rate": 1.2612416262456659e-05, + "loss": 0.1165, + "step": 4010 + }, + { + "epoch": 1.3, + "learning_rate": 1.2578608302596878e-05, + "loss": 0.1277, + "step": 4020 + }, + { + "epoch": 1.31, + "learning_rate": 1.254476874088413e-05, + "loss": 0.1223, + "step": 4030 + }, + { + "epoch": 1.31, + "learning_rate": 1.2510897992035475e-05, + "loss": 0.1187, + "step": 4040 + }, + { + "epoch": 1.31, + "learning_rate": 1.2476996471150183e-05, + "loss": 0.1177, + "step": 4050 + }, + { + "epoch": 1.32, + "learning_rate": 1.2443064593704645e-05, + "loss": 0.1202, + "step": 4060 + }, + { + "epoch": 1.32, + "learning_rate": 1.240910277554729e-05, + "loss": 0.1163, + "step": 4070 + }, + { + "epoch": 1.32, + "learning_rate": 1.2375111432893479e-05, + "loss": 0.1062, + "step": 4080 + }, + { + "epoch": 1.33, + "learning_rate": 1.2341090982320398e-05, + "loss": 0.1186, + "step": 4090 + }, + { + "epoch": 1.33, + "learning_rate": 1.2307041840761983e-05, + "loss": 0.1193, + "step": 4100 + }, + { + "epoch": 1.33, + "learning_rate": 1.2272964425503768e-05, + "loss": 0.1174, + "step": 4110 + }, + { + "epoch": 1.34, + "learning_rate": 1.2238859154177805e-05, + "loss": 0.109, + "step": 4120 + }, + { + "epoch": 1.34, + "learning_rate": 1.2204726444757527e-05, + "loss": 0.1251, + "step": 4130 + }, + { + "epoch": 1.34, + "learning_rate": 1.2170566715552634e-05, + "loss": 0.1166, + "step": 4140 + }, + { + "epoch": 1.35, + "learning_rate": 1.2136380385203965e-05, + "loss": 0.1123, + "step": 4150 + }, + { + "epoch": 1.35, + "learning_rate": 1.2102167872678366e-05, + "loss": 0.1273, + "step": 4160 + }, + { + "epoch": 1.35, + "learning_rate": 1.2067929597263552e-05, + "loss": 0.1201, + "step": 4170 + }, + { + "epoch": 1.36, + "learning_rate": 1.2033665978562973e-05, + "loss": 0.1197, + "step": 4180 + }, + { + "epoch": 1.36, + "learning_rate": 1.1999377436490682e-05, + "loss": 0.1126, + "step": 4190 + }, + { + "epoch": 1.36, + "learning_rate": 1.1965064391266158e-05, + "loss": 0.1264, + "step": 4200 + }, + { + "epoch": 1.37, + "learning_rate": 1.1930727263409194e-05, + "loss": 0.1153, + "step": 4210 + }, + { + "epoch": 1.37, + "learning_rate": 1.1896366473734715e-05, + "loss": 0.1085, + "step": 4220 + }, + { + "epoch": 1.37, + "learning_rate": 1.1861982443347633e-05, + "loss": 0.1116, + "step": 4230 + }, + { + "epoch": 1.37, + "learning_rate": 1.1827575593637683e-05, + "loss": 0.1107, + "step": 4240 + }, + { + "epoch": 1.38, + "learning_rate": 1.1793146346274262e-05, + "loss": 0.121, + "step": 4250 + }, + { + "epoch": 1.38, + "learning_rate": 1.1758695123201262e-05, + "loss": 0.1179, + "step": 4260 + }, + { + "epoch": 1.38, + "learning_rate": 1.1724222346631886e-05, + "loss": 0.1118, + "step": 4270 + }, + { + "epoch": 1.39, + "learning_rate": 1.1689728439043495e-05, + "loss": 0.1135, + "step": 4280 + }, + { + "epoch": 1.39, + "learning_rate": 1.1655213823172407e-05, + "loss": 0.1168, + "step": 4290 + }, + { + "epoch": 1.39, + "learning_rate": 1.1620678922008736e-05, + "loss": 0.1076, + "step": 4300 + }, + { + "epoch": 1.4, + "learning_rate": 1.1586124158791205e-05, + "loss": 0.1145, + "step": 4310 + }, + { + "epoch": 1.4, + "learning_rate": 1.1551549957001944e-05, + "loss": 0.1222, + "step": 4320 + }, + { + "epoch": 1.4, + "learning_rate": 1.151695674036131e-05, + "loss": 0.1219, + "step": 4330 + }, + { + "epoch": 1.41, + "learning_rate": 1.1482344932822706e-05, + "loss": 0.1145, + "step": 4340 + }, + { + "epoch": 1.41, + "learning_rate": 1.1447714958567361e-05, + "loss": 0.1201, + "step": 4350 + }, + { + "epoch": 1.41, + "learning_rate": 1.1413067241999153e-05, + "loss": 0.1203, + "step": 4360 + }, + { + "epoch": 1.42, + "learning_rate": 1.1378402207739394e-05, + "loss": 0.1135, + "step": 4370 + }, + { + "epoch": 1.42, + "learning_rate": 1.134372028062163e-05, + "loss": 0.1151, + "step": 4380 + }, + { + "epoch": 1.42, + "learning_rate": 1.1309021885686446e-05, + "loss": 0.1167, + "step": 4390 + }, + { + "epoch": 1.43, + "learning_rate": 1.1274307448176227e-05, + "loss": 0.1125, + "step": 4400 + }, + { + "epoch": 1.43, + "learning_rate": 1.1239577393529988e-05, + "loss": 0.1128, + "step": 4410 + }, + { + "epoch": 1.43, + "learning_rate": 1.1204832147378125e-05, + "loss": 0.1201, + "step": 4420 + }, + { + "epoch": 1.44, + "learning_rate": 1.1170072135537213e-05, + "loss": 0.1081, + "step": 4430 + }, + { + "epoch": 1.44, + "learning_rate": 1.113529778400479e-05, + "loss": 0.1055, + "step": 4440 + }, + { + "epoch": 1.44, + "learning_rate": 1.110050951895413e-05, + "loss": 0.1167, + "step": 4450 + }, + { + "epoch": 1.45, + "learning_rate": 1.1065707766729024e-05, + "loss": 0.1257, + "step": 4460 + }, + { + "epoch": 1.45, + "learning_rate": 1.1030892953838548e-05, + "loss": 0.1137, + "step": 4470 + }, + { + "epoch": 1.45, + "learning_rate": 1.0996065506951854e-05, + "loss": 0.1106, + "step": 4480 + }, + { + "epoch": 1.46, + "learning_rate": 1.0961225852892914e-05, + "loss": 0.111, + "step": 4490 + }, + { + "epoch": 1.46, + "learning_rate": 1.0926374418635317e-05, + "loss": 0.107, + "step": 4500 + }, + { + "epoch": 1.46, + "learning_rate": 1.0891511631297009e-05, + "loss": 0.117, + "step": 4510 + }, + { + "epoch": 1.47, + "learning_rate": 1.0856637918135087e-05, + "loss": 0.1237, + "step": 4520 + }, + { + "epoch": 1.47, + "learning_rate": 1.0821753706540539e-05, + "loss": 0.1168, + "step": 4530 + }, + { + "epoch": 1.47, + "learning_rate": 1.0786859424033014e-05, + "loss": 0.1055, + "step": 4540 + }, + { + "epoch": 1.48, + "learning_rate": 1.0751955498255595e-05, + "loss": 0.1207, + "step": 4550 + }, + { + "epoch": 1.48, + "learning_rate": 1.0717042356969529e-05, + "loss": 0.1104, + "step": 4560 + }, + { + "epoch": 1.48, + "learning_rate": 1.0682120428049025e-05, + "loss": 0.1231, + "step": 4570 + }, + { + "epoch": 1.49, + "learning_rate": 1.0647190139475967e-05, + "loss": 0.1176, + "step": 4580 + }, + { + "epoch": 1.49, + "learning_rate": 1.0612251919334703e-05, + "loss": 0.1168, + "step": 4590 + }, + { + "epoch": 1.49, + "learning_rate": 1.057730619580678e-05, + "loss": 0.1098, + "step": 4600 + }, + { + "epoch": 1.49, + "learning_rate": 1.0542353397165706e-05, + "loss": 0.1119, + "step": 4610 + }, + { + "epoch": 1.5, + "learning_rate": 1.0507393951771695e-05, + "loss": 0.111, + "step": 4620 + }, + { + "epoch": 1.5, + "learning_rate": 1.0472428288066413e-05, + "loss": 0.1134, + "step": 4630 + }, + { + "epoch": 1.5, + "learning_rate": 1.043745683456775e-05, + "loss": 0.1146, + "step": 4640 + }, + { + "epoch": 1.51, + "learning_rate": 1.040248001986453e-05, + "loss": 0.1133, + "step": 4650 + }, + { + "epoch": 1.51, + "learning_rate": 1.0367498272611303e-05, + "loss": 0.1121, + "step": 4660 + }, + { + "epoch": 1.51, + "learning_rate": 1.0332512021523054e-05, + "loss": 0.1174, + "step": 4670 + }, + { + "epoch": 1.52, + "learning_rate": 1.0297521695369974e-05, + "loss": 0.1161, + "step": 4680 + }, + { + "epoch": 1.52, + "learning_rate": 1.0262527722972185e-05, + "loss": 0.1004, + "step": 4690 + }, + { + "epoch": 1.52, + "learning_rate": 1.0227530533194508e-05, + "loss": 0.1155, + "step": 4700 + }, + { + "epoch": 1.53, + "learning_rate": 1.0192530554941177e-05, + "loss": 0.1261, + "step": 4710 + }, + { + "epoch": 1.53, + "learning_rate": 1.0157528217150624e-05, + "loss": 0.1201, + "step": 4720 + }, + { + "epoch": 1.53, + "learning_rate": 1.0122523948790174e-05, + "loss": 0.1192, + "step": 4730 + }, + { + "epoch": 1.54, + "learning_rate": 1.0087518178850824e-05, + "loss": 0.1115, + "step": 4740 + }, + { + "epoch": 1.54, + "learning_rate": 1.005251133634198e-05, + "loss": 0.1127, + "step": 4750 + }, + { + "epoch": 1.54, + "learning_rate": 1.0017503850286167e-05, + "loss": 0.1117, + "step": 4760 + }, + { + "epoch": 1.55, + "learning_rate": 9.982496149713835e-06, + "loss": 0.1112, + "step": 4770 + }, + { + "epoch": 1.55, + "learning_rate": 9.947488663658027e-06, + "loss": 0.1084, + "step": 4780 + }, + { + "epoch": 1.55, + "learning_rate": 9.912481821149176e-06, + "loss": 0.1109, + "step": 4790 + }, + { + "epoch": 1.56, + "learning_rate": 9.877476051209827e-06, + "loss": 0.1051, + "step": 4800 + }, + { + "epoch": 1.56, + "learning_rate": 9.842471782849381e-06, + "loss": 0.1187, + "step": 4810 + }, + { + "epoch": 1.56, + "learning_rate": 9.807469445058824e-06, + "loss": 0.1246, + "step": 4820 + }, + { + "epoch": 1.57, + "learning_rate": 9.772469466805499e-06, + "loss": 0.1111, + "step": 4830 + }, + { + "epoch": 1.57, + "learning_rate": 9.737472277027817e-06, + "loss": 0.112, + "step": 4840 + }, + { + "epoch": 1.57, + "learning_rate": 9.702478304630028e-06, + "loss": 0.112, + "step": 4850 + }, + { + "epoch": 1.58, + "learning_rate": 9.66748797847695e-06, + "loss": 0.115, + "step": 4860 + }, + { + "epoch": 1.58, + "learning_rate": 9.6325017273887e-06, + "loss": 0.1166, + "step": 4870 + }, + { + "epoch": 1.58, + "learning_rate": 9.597519980135472e-06, + "loss": 0.1186, + "step": 4880 + }, + { + "epoch": 1.59, + "learning_rate": 9.562543165432255e-06, + "loss": 0.1185, + "step": 4890 + }, + { + "epoch": 1.59, + "learning_rate": 9.52757171193359e-06, + "loss": 0.1133, + "step": 4900 + }, + { + "epoch": 1.59, + "learning_rate": 9.49260604822831e-06, + "loss": 0.1193, + "step": 4910 + }, + { + "epoch": 1.6, + "learning_rate": 9.457646602834295e-06, + "loss": 0.1076, + "step": 4920 + }, + { + "epoch": 1.6, + "learning_rate": 9.42269380419322e-06, + "loss": 0.1147, + "step": 4930 + }, + { + "epoch": 1.6, + "learning_rate": 9.387748080665298e-06, + "loss": 0.1067, + "step": 4940 + }, + { + "epoch": 1.61, + "learning_rate": 9.352809860524037e-06, + "loss": 0.1146, + "step": 4950 + }, + { + "epoch": 1.61, + "learning_rate": 9.31787957195098e-06, + "loss": 0.1094, + "step": 4960 + }, + { + "epoch": 1.61, + "learning_rate": 9.28295764303047e-06, + "loss": 0.1011, + "step": 4970 + }, + { + "epoch": 1.61, + "learning_rate": 9.248044501744409e-06, + "loss": 0.1108, + "step": 4980 + }, + { + "epoch": 1.62, + "learning_rate": 9.21314057596699e-06, + "loss": 0.1108, + "step": 4990 + }, + { + "epoch": 1.62, + "learning_rate": 9.178246293459466e-06, + "loss": 0.1078, + "step": 5000 + }, + { + "epoch": 1.62, + "learning_rate": 9.143362081864917e-06, + "loss": 0.1123, + "step": 5010 + }, + { + "epoch": 1.63, + "learning_rate": 9.108488368702991e-06, + "loss": 0.1079, + "step": 5020 + }, + { + "epoch": 1.63, + "learning_rate": 9.073625581364686e-06, + "loss": 0.1053, + "step": 5030 + }, + { + "epoch": 1.63, + "learning_rate": 9.03877414710709e-06, + "loss": 0.1116, + "step": 5040 + }, + { + "epoch": 1.64, + "learning_rate": 9.00393449304815e-06, + "loss": 0.1056, + "step": 5050 + }, + { + "epoch": 1.64, + "learning_rate": 8.969107046161452e-06, + "loss": 0.1082, + "step": 5060 + }, + { + "epoch": 1.64, + "learning_rate": 8.93429223327098e-06, + "loss": 0.1106, + "step": 5070 + }, + { + "epoch": 1.65, + "learning_rate": 8.899490481045873e-06, + "loss": 0.1157, + "step": 5080 + }, + { + "epoch": 1.65, + "learning_rate": 8.864702215995213e-06, + "loss": 0.1134, + "step": 5090 + }, + { + "epoch": 1.65, + "learning_rate": 8.82992786446279e-06, + "loss": 0.111, + "step": 5100 + }, + { + "epoch": 1.66, + "learning_rate": 8.795167852621877e-06, + "loss": 0.1267, + "step": 5110 + }, + { + "epoch": 1.66, + "learning_rate": 8.760422606470015e-06, + "loss": 0.1096, + "step": 5120 + }, + { + "epoch": 1.66, + "learning_rate": 8.725692551823776e-06, + "loss": 0.111, + "step": 5130 + }, + { + "epoch": 1.67, + "learning_rate": 8.69097811431356e-06, + "loss": 0.1127, + "step": 5140 + }, + { + "epoch": 1.67, + "learning_rate": 8.65627971937837e-06, + "loss": 0.1062, + "step": 5150 + }, + { + "epoch": 1.67, + "learning_rate": 8.621597792260608e-06, + "loss": 0.1128, + "step": 5160 + }, + { + "epoch": 1.68, + "learning_rate": 8.58693275800085e-06, + "loss": 0.1089, + "step": 5170 + }, + { + "epoch": 1.68, + "learning_rate": 8.55228504143264e-06, + "loss": 0.1109, + "step": 5180 + }, + { + "epoch": 1.68, + "learning_rate": 8.517655067177295e-06, + "loss": 0.1125, + "step": 5190 + }, + { + "epoch": 1.69, + "learning_rate": 8.48304325963869e-06, + "loss": 0.1176, + "step": 5200 + }, + { + "epoch": 1.69, + "learning_rate": 8.44845004299806e-06, + "loss": 0.1101, + "step": 5210 + }, + { + "epoch": 1.69, + "learning_rate": 8.413875841208797e-06, + "loss": 0.1122, + "step": 5220 + }, + { + "epoch": 1.7, + "learning_rate": 8.379321077991265e-06, + "loss": 0.1115, + "step": 5230 + }, + { + "epoch": 1.7, + "learning_rate": 8.344786176827594e-06, + "loss": 0.1139, + "step": 5240 + }, + { + "epoch": 1.7, + "learning_rate": 8.310271560956509e-06, + "loss": 0.1117, + "step": 5250 + }, + { + "epoch": 1.71, + "learning_rate": 8.275777653368119e-06, + "loss": 0.1073, + "step": 5260 + }, + { + "epoch": 1.71, + "learning_rate": 8.241304876798742e-06, + "loss": 0.1193, + "step": 5270 + }, + { + "epoch": 1.71, + "learning_rate": 8.20685365372574e-06, + "loss": 0.1171, + "step": 5280 + }, + { + "epoch": 1.72, + "learning_rate": 8.172424406362319e-06, + "loss": 0.1189, + "step": 5290 + }, + { + "epoch": 1.72, + "learning_rate": 8.13801755665237e-06, + "loss": 0.1183, + "step": 5300 + }, + { + "epoch": 1.72, + "learning_rate": 8.103633526265289e-06, + "loss": 0.1169, + "step": 5310 + }, + { + "epoch": 1.73, + "learning_rate": 8.069272736590809e-06, + "loss": 0.1044, + "step": 5320 + }, + { + "epoch": 1.73, + "learning_rate": 8.034935608733843e-06, + "loss": 0.1128, + "step": 5330 + }, + { + "epoch": 1.73, + "learning_rate": 8.00062256350932e-06, + "loss": 0.1086, + "step": 5340 + }, + { + "epoch": 1.73, + "learning_rate": 7.966334021437028e-06, + "loss": 0.1181, + "step": 5350 + }, + { + "epoch": 1.74, + "learning_rate": 7.932070402736451e-06, + "loss": 0.1153, + "step": 5360 + }, + { + "epoch": 1.74, + "learning_rate": 7.897832127321639e-06, + "loss": 0.1158, + "step": 5370 + }, + { + "epoch": 1.74, + "learning_rate": 7.863619614796035e-06, + "loss": 0.1068, + "step": 5380 + }, + { + "epoch": 1.75, + "learning_rate": 7.829433284447367e-06, + "loss": 0.1138, + "step": 5390 + }, + { + "epoch": 1.75, + "learning_rate": 7.795273555242476e-06, + "loss": 0.1123, + "step": 5400 + }, + { + "epoch": 1.75, + "learning_rate": 7.761140845822199e-06, + "loss": 0.1093, + "step": 5410 + }, + { + "epoch": 1.76, + "learning_rate": 7.727035574496234e-06, + "loss": 0.1094, + "step": 5420 + }, + { + "epoch": 1.76, + "learning_rate": 7.69295815923802e-06, + "loss": 0.113, + "step": 5430 + }, + { + "epoch": 1.76, + "learning_rate": 7.658909017679604e-06, + "loss": 0.1124, + "step": 5440 + }, + { + "epoch": 1.77, + "learning_rate": 7.6248885671065264e-06, + "loss": 0.1058, + "step": 5450 + }, + { + "epoch": 1.77, + "learning_rate": 7.590897224452716e-06, + "loss": 0.1107, + "step": 5460 + }, + { + "epoch": 1.77, + "learning_rate": 7.556935406295356e-06, + "loss": 0.106, + "step": 5470 + }, + { + "epoch": 1.78, + "learning_rate": 7.5230035288498204e-06, + "loss": 0.115, + "step": 5480 + }, + { + "epoch": 1.78, + "learning_rate": 7.4891020079645285e-06, + "loss": 0.1082, + "step": 5490 + }, + { + "epoch": 1.78, + "learning_rate": 7.455231259115872e-06, + "loss": 0.1146, + "step": 5500 + }, + { + "epoch": 1.79, + "learning_rate": 7.421391697403122e-06, + "loss": 0.1126, + "step": 5510 + }, + { + "epoch": 1.79, + "learning_rate": 7.3875837375433445e-06, + "loss": 0.1119, + "step": 5520 + }, + { + "epoch": 1.79, + "learning_rate": 7.353807793866299e-06, + "loss": 0.1081, + "step": 5530 + }, + { + "epoch": 1.8, + "learning_rate": 7.3200642803093835e-06, + "loss": 0.1127, + "step": 5540 + }, + { + "epoch": 1.8, + "learning_rate": 7.286353610412553e-06, + "loss": 0.1146, + "step": 5550 + }, + { + "epoch": 1.8, + "learning_rate": 7.2526761973132395e-06, + "loss": 0.1079, + "step": 5560 + }, + { + "epoch": 1.81, + "learning_rate": 7.2190324537413196e-06, + "loss": 0.1059, + "step": 5570 + }, + { + "epoch": 1.81, + "learning_rate": 7.185422792014019e-06, + "loss": 0.1072, + "step": 5580 + }, + { + "epoch": 1.81, + "learning_rate": 7.151847624030882e-06, + "loss": 0.1123, + "step": 5590 + }, + { + "epoch": 1.82, + "learning_rate": 7.118307361268721e-06, + "loss": 0.108, + "step": 5600 + }, + { + "epoch": 1.82, + "learning_rate": 7.084802414776575e-06, + "loss": 0.1056, + "step": 5610 + }, + { + "epoch": 1.82, + "learning_rate": 7.051333195170658e-06, + "loss": 0.099, + "step": 5620 + }, + { + "epoch": 1.83, + "learning_rate": 7.0179001126293435e-06, + "loss": 0.1123, + "step": 5630 + }, + { + "epoch": 1.83, + "learning_rate": 6.9845035768881285e-06, + "loss": 0.1089, + "step": 5640 + }, + { + "epoch": 1.83, + "learning_rate": 6.951143997234622e-06, + "loss": 0.1123, + "step": 5650 + }, + { + "epoch": 1.84, + "learning_rate": 6.917821782503513e-06, + "loss": 0.1081, + "step": 5660 + }, + { + "epoch": 1.84, + "learning_rate": 6.884537341071571e-06, + "loss": 0.1112, + "step": 5670 + }, + { + "epoch": 1.84, + "learning_rate": 6.85129108085264e-06, + "loss": 0.1064, + "step": 5680 + }, + { + "epoch": 1.85, + "learning_rate": 6.818083409292634e-06, + "loss": 0.1145, + "step": 5690 + }, + { + "epoch": 1.85, + "learning_rate": 6.784914733364563e-06, + "loss": 0.1083, + "step": 5700 + }, + { + "epoch": 1.85, + "learning_rate": 6.751785459563509e-06, + "loss": 0.119, + "step": 5710 + }, + { + "epoch": 1.85, + "learning_rate": 6.718695993901678e-06, + "loss": 0.1134, + "step": 5720 + }, + { + "epoch": 1.86, + "learning_rate": 6.685646741903411e-06, + "loss": 0.1154, + "step": 5730 + }, + { + "epoch": 1.86, + "learning_rate": 6.652638108600215e-06, + "loss": 0.1128, + "step": 5740 + }, + { + "epoch": 1.86, + "learning_rate": 6.619670498525796e-06, + "loss": 0.1043, + "step": 5750 + }, + { + "epoch": 1.87, + "learning_rate": 6.586744315711102e-06, + "loss": 0.1103, + "step": 5760 + }, + { + "epoch": 1.87, + "learning_rate": 6.5538599636793846e-06, + "loss": 0.1063, + "step": 5770 + }, + { + "epoch": 1.87, + "learning_rate": 6.521017845441225e-06, + "loss": 0.1125, + "step": 5780 + }, + { + "epoch": 1.88, + "learning_rate": 6.488218363489633e-06, + "loss": 0.105, + "step": 5790 + }, + { + "epoch": 1.88, + "learning_rate": 6.455461919795079e-06, + "loss": 0.1096, + "step": 5800 + }, + { + "epoch": 1.88, + "learning_rate": 6.422748915800592e-06, + "loss": 0.1126, + "step": 5810 + }, + { + "epoch": 1.89, + "learning_rate": 6.39007975241682e-06, + "loss": 0.1078, + "step": 5820 + }, + { + "epoch": 1.89, + "learning_rate": 6.357454830017143e-06, + "loss": 0.1161, + "step": 5830 + }, + { + "epoch": 1.89, + "learning_rate": 6.324874548432734e-06, + "loss": 0.1121, + "step": 5840 + }, + { + "epoch": 1.9, + "learning_rate": 6.292339306947685e-06, + "loss": 0.1067, + "step": 5850 + }, + { + "epoch": 1.9, + "learning_rate": 6.259849504294102e-06, + "loss": 0.1119, + "step": 5860 + }, + { + "epoch": 1.9, + "learning_rate": 6.227405538647213e-06, + "loss": 0.1046, + "step": 5870 + }, + { + "epoch": 1.91, + "learning_rate": 6.195007807620514e-06, + "loss": 0.1049, + "step": 5880 + }, + { + "epoch": 1.91, + "learning_rate": 6.16265670826086e-06, + "loss": 0.1111, + "step": 5890 + }, + { + "epoch": 1.91, + "learning_rate": 6.130352637043622e-06, + "loss": 0.0993, + "step": 5900 + }, + { + "epoch": 1.92, + "learning_rate": 6.098095989867822e-06, + "loss": 0.1073, + "step": 5910 + }, + { + "epoch": 1.92, + "learning_rate": 6.065887162051291e-06, + "loss": 0.1219, + "step": 5920 + }, + { + "epoch": 1.92, + "learning_rate": 6.033726548325798e-06, + "loss": 0.1139, + "step": 5930 + }, + { + "epoch": 1.93, + "learning_rate": 6.0016145428322445e-06, + "loss": 0.1108, + "step": 5940 + }, + { + "epoch": 1.93, + "learning_rate": 5.969551539115814e-06, + "loss": 0.1118, + "step": 5950 + }, + { + "epoch": 1.93, + "learning_rate": 5.937537930121145e-06, + "loss": 0.1002, + "step": 5960 + }, + { + "epoch": 1.94, + "learning_rate": 5.905574108187544e-06, + "loss": 0.1038, + "step": 5970 + }, + { + "epoch": 1.94, + "learning_rate": 5.873660465044141e-06, + "loss": 0.1023, + "step": 5980 + }, + { + "epoch": 1.94, + "learning_rate": 5.841797391805113e-06, + "loss": 0.1099, + "step": 5990 + }, + { + "epoch": 1.95, + "learning_rate": 5.809985278964875e-06, + "loss": 0.104, + "step": 6000 + }, + { + "epoch": 1.95, + "learning_rate": 5.778224516393312e-06, + "loss": 0.1036, + "step": 6010 + }, + { + "epoch": 1.95, + "learning_rate": 5.746515493330992e-06, + "loss": 0.1053, + "step": 6020 + }, + { + "epoch": 1.96, + "learning_rate": 5.714858598384387e-06, + "loss": 0.1099, + "step": 6030 + }, + { + "epoch": 1.96, + "learning_rate": 5.683254219521117e-06, + "loss": 0.1014, + "step": 6040 + }, + { + "epoch": 1.96, + "learning_rate": 5.651702744065207e-06, + "loss": 0.1054, + "step": 6050 + }, + { + "epoch": 1.96, + "learning_rate": 5.620204558692331e-06, + "loss": 0.102, + "step": 6060 + }, + { + "epoch": 1.97, + "learning_rate": 5.588760049425057e-06, + "loss": 0.1084, + "step": 6070 + }, + { + "epoch": 1.97, + "learning_rate": 5.557369601628142e-06, + "loss": 0.1095, + "step": 6080 + }, + { + "epoch": 1.97, + "learning_rate": 5.5260336000038e-06, + "loss": 0.1104, + "step": 6090 + }, + { + "epoch": 1.98, + "learning_rate": 5.494752428586985e-06, + "loss": 0.1011, + "step": 6100 + }, + { + "epoch": 1.98, + "learning_rate": 5.46352647074068e-06, + "loss": 0.1108, + "step": 6110 + }, + { + "epoch": 1.98, + "learning_rate": 5.4323561091512045e-06, + "loss": 0.1034, + "step": 6120 + }, + { + "epoch": 1.99, + "learning_rate": 5.401241725823536e-06, + "loss": 0.1085, + "step": 6130 + }, + { + "epoch": 1.99, + "learning_rate": 5.370183702076599e-06, + "loss": 0.116, + "step": 6140 + }, + { + "epoch": 1.99, + "learning_rate": 5.33918241853862e-06, + "loss": 0.1138, + "step": 6150 + }, + { + "epoch": 2.0, + "learning_rate": 5.308238255142457e-06, + "loss": 0.112, + "step": 6160 + }, + { + "epoch": 2.0, + "eval_loss": 0.1484375, + "eval_runtime": 6.4637, + "eval_samples_per_second": 19.803, + "eval_steps_per_second": 0.155, + "step": 6168 + }, + { + "epoch": 2.0, + "learning_rate": 5.277351591120926e-06, + "loss": 0.0985, + "step": 6170 + }, + { + "epoch": 2.0, + "learning_rate": 5.246522805002168e-06, + "loss": 0.0714, + "step": 6180 + }, + { + "epoch": 2.01, + "learning_rate": 5.215752274605012e-06, + "loss": 0.0702, + "step": 6190 + }, + { + "epoch": 2.01, + "learning_rate": 5.185040377034347e-06, + "loss": 0.0655, + "step": 6200 + }, + { + "epoch": 2.01, + "learning_rate": 5.1543874886764774e-06, + "loss": 0.0648, + "step": 6210 + }, + { + "epoch": 2.02, + "learning_rate": 5.123793985194536e-06, + "loss": 0.0666, + "step": 6220 + }, + { + "epoch": 2.02, + "learning_rate": 5.093260241523872e-06, + "loss": 0.0688, + "step": 6230 + }, + { + "epoch": 2.02, + "learning_rate": 5.0627866318674544e-06, + "loss": 0.0657, + "step": 6240 + }, + { + "epoch": 2.03, + "learning_rate": 5.032373529691283e-06, + "loss": 0.0696, + "step": 6250 + }, + { + "epoch": 2.03, + "learning_rate": 5.002021307719811e-06, + "loss": 0.0691, + "step": 6260 + }, + { + "epoch": 2.03, + "learning_rate": 4.971730337931391e-06, + "loss": 0.065, + "step": 6270 + }, + { + "epoch": 2.04, + "learning_rate": 4.9415009915537045e-06, + "loss": 0.0648, + "step": 6280 + }, + { + "epoch": 2.04, + "learning_rate": 4.911333639059208e-06, + "loss": 0.0624, + "step": 6290 + }, + { + "epoch": 2.04, + "learning_rate": 4.881228650160598e-06, + "loss": 0.0708, + "step": 6300 + }, + { + "epoch": 2.05, + "learning_rate": 4.85118639380629e-06, + "loss": 0.0691, + "step": 6310 + }, + { + "epoch": 2.05, + "learning_rate": 4.8212072381758744e-06, + "loss": 0.0708, + "step": 6320 + }, + { + "epoch": 2.05, + "learning_rate": 4.791291550675635e-06, + "loss": 0.0716, + "step": 6330 + }, + { + "epoch": 2.06, + "learning_rate": 4.761439697934009e-06, + "loss": 0.0712, + "step": 6340 + }, + { + "epoch": 2.06, + "learning_rate": 4.731652045797134e-06, + "loss": 0.0689, + "step": 6350 + }, + { + "epoch": 2.06, + "learning_rate": 4.701928959324323e-06, + "loss": 0.0662, + "step": 6360 + }, + { + "epoch": 2.07, + "learning_rate": 4.672270802783628e-06, + "loss": 0.0718, + "step": 6370 + }, + { + "epoch": 2.07, + "learning_rate": 4.642677939647356e-06, + "loss": 0.0733, + "step": 6380 + }, + { + "epoch": 2.07, + "learning_rate": 4.6131507325876144e-06, + "loss": 0.0686, + "step": 6390 + }, + { + "epoch": 2.08, + "learning_rate": 4.583689543471863e-06, + "loss": 0.0706, + "step": 6400 + }, + { + "epoch": 2.08, + "learning_rate": 4.5542947333585e-06, + "loss": 0.0649, + "step": 6410 + }, + { + "epoch": 2.08, + "learning_rate": 4.5249666624924195e-06, + "loss": 0.0677, + "step": 6420 + }, + { + "epoch": 2.08, + "learning_rate": 4.495705690300593e-06, + "loss": 0.0675, + "step": 6430 + }, + { + "epoch": 2.09, + "learning_rate": 4.466512175387672e-06, + "loss": 0.0642, + "step": 6440 + }, + { + "epoch": 2.09, + "learning_rate": 4.437386475531601e-06, + "loss": 0.0714, + "step": 6450 + }, + { + "epoch": 2.09, + "learning_rate": 4.408328947679221e-06, + "loss": 0.0693, + "step": 6460 + }, + { + "epoch": 2.1, + "learning_rate": 4.379339947941896e-06, + "loss": 0.0676, + "step": 6470 + }, + { + "epoch": 2.1, + "learning_rate": 4.350419831591147e-06, + "loss": 0.068, + "step": 6480 + }, + { + "epoch": 2.1, + "learning_rate": 4.321568953054316e-06, + "loss": 0.0696, + "step": 6490 + }, + { + "epoch": 2.11, + "learning_rate": 4.2927876659101905e-06, + "loss": 0.0699, + "step": 6500 + }, + { + "epoch": 2.11, + "learning_rate": 4.264076322884708e-06, + "loss": 0.0683, + "step": 6510 + }, + { + "epoch": 2.11, + "learning_rate": 4.2354352758465945e-06, + "loss": 0.0673, + "step": 6520 + }, + { + "epoch": 2.12, + "learning_rate": 4.206864875803086e-06, + "loss": 0.0702, + "step": 6530 + }, + { + "epoch": 2.12, + "learning_rate": 4.178365472895602e-06, + "loss": 0.0692, + "step": 6540 + }, + { + "epoch": 2.12, + "learning_rate": 4.149937416395468e-06, + "loss": 0.0699, + "step": 6550 + }, + { + "epoch": 2.13, + "learning_rate": 4.121581054699636e-06, + "loss": 0.0651, + "step": 6560 + }, + { + "epoch": 2.13, + "learning_rate": 4.093296735326404e-06, + "loss": 0.07, + "step": 6570 + }, + { + "epoch": 2.13, + "learning_rate": 4.065084804911165e-06, + "loss": 0.0689, + "step": 6580 + }, + { + "epoch": 2.14, + "learning_rate": 4.036945609202146e-06, + "loss": 0.0759, + "step": 6590 + }, + { + "epoch": 2.14, + "learning_rate": 4.008879493056212e-06, + "loss": 0.0721, + "step": 6600 + }, + { + "epoch": 2.14, + "learning_rate": 3.98088680043458e-06, + "loss": 0.0662, + "step": 6610 + }, + { + "epoch": 2.15, + "learning_rate": 3.95296787439864e-06, + "loss": 0.0758, + "step": 6620 + }, + { + "epoch": 2.15, + "learning_rate": 3.9251230571057495e-06, + "loss": 0.0663, + "step": 6630 + }, + { + "epoch": 2.15, + "learning_rate": 3.897352689805036e-06, + "loss": 0.069, + "step": 6640 + }, + { + "epoch": 2.16, + "learning_rate": 3.869657112833206e-06, + "loss": 0.069, + "step": 6650 + }, + { + "epoch": 2.16, + "learning_rate": 3.842036665610379e-06, + "loss": 0.0711, + "step": 6660 + }, + { + "epoch": 2.16, + "learning_rate": 3.814491686635943e-06, + "loss": 0.0671, + "step": 6670 + }, + { + "epoch": 2.17, + "learning_rate": 3.7870225134843776e-06, + "loss": 0.0706, + "step": 6680 + }, + { + "epoch": 2.17, + "learning_rate": 3.7596294828011483e-06, + "loss": 0.0685, + "step": 6690 + }, + { + "epoch": 2.17, + "learning_rate": 3.7323129302985485e-06, + "loss": 0.0659, + "step": 6700 + }, + { + "epoch": 2.18, + "learning_rate": 3.705073190751617e-06, + "loss": 0.0664, + "step": 6710 + }, + { + "epoch": 2.18, + "learning_rate": 3.6779105979940056e-06, + "loss": 0.0702, + "step": 6720 + }, + { + "epoch": 2.18, + "learning_rate": 3.650825484913916e-06, + "loss": 0.0657, + "step": 6730 + }, + { + "epoch": 2.19, + "learning_rate": 3.623818183449992e-06, + "loss": 0.0666, + "step": 6740 + }, + { + "epoch": 2.19, + "learning_rate": 3.59688902458728e-06, + "loss": 0.0668, + "step": 6750 + }, + { + "epoch": 2.19, + "learning_rate": 3.5700383383531467e-06, + "loss": 0.0643, + "step": 6760 + }, + { + "epoch": 2.2, + "learning_rate": 3.5432664538132446e-06, + "loss": 0.0618, + "step": 6770 + }, + { + "epoch": 2.2, + "learning_rate": 3.516573699067499e-06, + "loss": 0.0685, + "step": 6780 + }, + { + "epoch": 2.2, + "learning_rate": 3.48996040124605e-06, + "loss": 0.0664, + "step": 6790 + }, + { + "epoch": 2.2, + "learning_rate": 3.463426886505268e-06, + "loss": 0.0704, + "step": 6800 + }, + { + "epoch": 2.21, + "learning_rate": 3.436973480023743e-06, + "loss": 0.0671, + "step": 6810 + }, + { + "epoch": 2.21, + "learning_rate": 3.4106005059983283e-06, + "loss": 0.068, + "step": 6820 + }, + { + "epoch": 2.21, + "learning_rate": 3.3843082876401265e-06, + "loss": 0.0685, + "step": 6830 + }, + { + "epoch": 2.22, + "learning_rate": 3.3580971471705492e-06, + "loss": 0.0677, + "step": 6840 + }, + { + "epoch": 2.22, + "learning_rate": 3.331967405817379e-06, + "loss": 0.07, + "step": 6850 + }, + { + "epoch": 2.22, + "learning_rate": 3.3059193838108037e-06, + "loss": 0.0684, + "step": 6860 + }, + { + "epoch": 2.23, + "learning_rate": 3.2799534003795274e-06, + "loss": 0.0677, + "step": 6870 + }, + { + "epoch": 2.23, + "learning_rate": 3.254069773746822e-06, + "loss": 0.0719, + "step": 6880 + }, + { + "epoch": 2.23, + "learning_rate": 3.2282688211266568e-06, + "loss": 0.0706, + "step": 6890 + }, + { + "epoch": 2.24, + "learning_rate": 3.2025508587197907e-06, + "loss": 0.067, + "step": 6900 + }, + { + "epoch": 2.24, + "learning_rate": 3.176916201709912e-06, + "loss": 0.069, + "step": 6910 + }, + { + "epoch": 2.24, + "learning_rate": 3.1513651642597607e-06, + "loss": 0.065, + "step": 6920 + }, + { + "epoch": 2.25, + "learning_rate": 3.1258980595072976e-06, + "loss": 0.0708, + "step": 6930 + }, + { + "epoch": 2.25, + "learning_rate": 3.1005151995618454e-06, + "loss": 0.0684, + "step": 6940 + }, + { + "epoch": 2.25, + "learning_rate": 3.0752168955002735e-06, + "loss": 0.068, + "step": 6950 + }, + { + "epoch": 2.26, + "learning_rate": 3.0500034573631943e-06, + "loss": 0.0661, + "step": 6960 + }, + { + "epoch": 2.26, + "learning_rate": 3.024875194151151e-06, + "loss": 0.0669, + "step": 6970 + }, + { + "epoch": 2.26, + "learning_rate": 2.9998324138208336e-06, + "loss": 0.0672, + "step": 6980 + }, + { + "epoch": 2.27, + "learning_rate": 2.974875423281299e-06, + "loss": 0.0702, + "step": 6990 + }, + { + "epoch": 2.27, + "learning_rate": 2.950004528390238e-06, + "loss": 0.0674, + "step": 7000 + }, + { + "epoch": 2.27, + "learning_rate": 2.9252200339501847e-06, + "loss": 0.0687, + "step": 7010 + }, + { + "epoch": 2.28, + "learning_rate": 2.9005222437048054e-06, + "loss": 0.0689, + "step": 7020 + }, + { + "epoch": 2.28, + "learning_rate": 2.8759114603351836e-06, + "loss": 0.0695, + "step": 7030 + }, + { + "epoch": 2.28, + "learning_rate": 2.8513879854560856e-06, + "loss": 0.0667, + "step": 7040 + }, + { + "epoch": 2.29, + "learning_rate": 2.8269521196122907e-06, + "loss": 0.072, + "step": 7050 + }, + { + "epoch": 2.29, + "learning_rate": 2.8026041622748822e-06, + "loss": 0.0665, + "step": 7060 + }, + { + "epoch": 2.29, + "learning_rate": 2.7783444118376046e-06, + "loss": 0.0633, + "step": 7070 + }, + { + "epoch": 2.3, + "learning_rate": 2.754173165613179e-06, + "loss": 0.0663, + "step": 7080 + }, + { + "epoch": 2.3, + "learning_rate": 2.730090719829682e-06, + "loss": 0.0682, + "step": 7090 + }, + { + "epoch": 2.3, + "learning_rate": 2.7060973696269e-06, + "loss": 0.0688, + "step": 7100 + }, + { + "epoch": 2.31, + "learning_rate": 2.6821934090527245e-06, + "loss": 0.0683, + "step": 7110 + }, + { + "epoch": 2.31, + "learning_rate": 2.6583791310595376e-06, + "loss": 0.0666, + "step": 7120 + }, + { + "epoch": 2.31, + "learning_rate": 2.6346548275006232e-06, + "loss": 0.0664, + "step": 7130 + }, + { + "epoch": 2.32, + "learning_rate": 2.6110207891266013e-06, + "loss": 0.0667, + "step": 7140 + }, + { + "epoch": 2.32, + "learning_rate": 2.5874773055818557e-06, + "loss": 0.0663, + "step": 7150 + }, + { + "epoch": 2.32, + "learning_rate": 2.564024665400978e-06, + "loss": 0.0671, + "step": 7160 + }, + { + "epoch": 2.32, + "learning_rate": 2.5406631560052396e-06, + "loss": 0.0703, + "step": 7170 + }, + { + "epoch": 2.33, + "learning_rate": 2.517393063699084e-06, + "loss": 0.0665, + "step": 7180 + }, + { + "epoch": 2.33, + "learning_rate": 2.4942146736665827e-06, + "loss": 0.0667, + "step": 7190 + }, + { + "epoch": 2.33, + "learning_rate": 2.4711282699679718e-06, + "loss": 0.0665, + "step": 7200 + }, + { + "epoch": 2.34, + "learning_rate": 2.4481341355361487e-06, + "loss": 0.0656, + "step": 7210 + }, + { + "epoch": 2.34, + "learning_rate": 2.4252325521732267e-06, + "loss": 0.0676, + "step": 7220 + }, + { + "epoch": 2.34, + "learning_rate": 2.402423800547067e-06, + "loss": 0.0703, + "step": 7230 + }, + { + "epoch": 2.35, + "learning_rate": 2.3797081601878315e-06, + "loss": 0.0702, + "step": 7240 + }, + { + "epoch": 2.35, + "learning_rate": 2.3570859094845823e-06, + "loss": 0.0661, + "step": 7250 + }, + { + "epoch": 2.35, + "learning_rate": 2.33455732568184e-06, + "loss": 0.0714, + "step": 7260 + }, + { + "epoch": 2.36, + "learning_rate": 2.3121226848762124e-06, + "loss": 0.0687, + "step": 7270 + }, + { + "epoch": 2.36, + "learning_rate": 2.2897822620129904e-06, + "loss": 0.0669, + "step": 7280 + }, + { + "epoch": 2.36, + "learning_rate": 2.267536330882797e-06, + "loss": 0.0683, + "step": 7290 + }, + { + "epoch": 2.37, + "learning_rate": 2.2453851641182124e-06, + "loss": 0.0663, + "step": 7300 + }, + { + "epoch": 2.37, + "learning_rate": 2.2233290331904432e-06, + "loss": 0.0669, + "step": 7310 + }, + { + "epoch": 2.37, + "learning_rate": 2.2013682084060008e-06, + "loss": 0.0673, + "step": 7320 + }, + { + "epoch": 2.38, + "learning_rate": 2.1795029589033835e-06, + "loss": 0.061, + "step": 7330 + }, + { + "epoch": 2.38, + "learning_rate": 2.1577335526497677e-06, + "loss": 0.065, + "step": 7340 + }, + { + "epoch": 2.38, + "learning_rate": 2.1360602564377386e-06, + "loss": 0.0653, + "step": 7350 + }, + { + "epoch": 2.39, + "learning_rate": 2.114483335882017e-06, + "loss": 0.0653, + "step": 7360 + }, + { + "epoch": 2.39, + "learning_rate": 2.093003055416204e-06, + "loss": 0.0671, + "step": 7370 + }, + { + "epoch": 2.39, + "learning_rate": 2.0716196782895326e-06, + "loss": 0.0707, + "step": 7380 + }, + { + "epoch": 2.4, + "learning_rate": 2.050333466563643e-06, + "loss": 0.0639, + "step": 7390 + }, + { + "epoch": 2.4, + "learning_rate": 2.0291446811093964e-06, + "loss": 0.0665, + "step": 7400 + }, + { + "epoch": 2.4, + "learning_rate": 2.0080535816036363e-06, + "loss": 0.0657, + "step": 7410 + }, + { + "epoch": 2.41, + "learning_rate": 1.987060426526033e-06, + "loss": 0.0671, + "step": 7420 + }, + { + "epoch": 2.41, + "learning_rate": 1.9661654731559086e-06, + "loss": 0.0618, + "step": 7430 + }, + { + "epoch": 2.41, + "learning_rate": 1.945368977569089e-06, + "loss": 0.0697, + "step": 7440 + }, + { + "epoch": 2.42, + "learning_rate": 1.924671194634761e-06, + "loss": 0.0662, + "step": 7450 + }, + { + "epoch": 2.42, + "learning_rate": 1.9040723780123416e-06, + "loss": 0.0629, + "step": 7460 + }, + { + "epoch": 2.42, + "learning_rate": 1.8835727801483894e-06, + "loss": 0.0706, + "step": 7470 + }, + { + "epoch": 2.43, + "learning_rate": 1.863172652273485e-06, + "loss": 0.065, + "step": 7480 + }, + { + "epoch": 2.43, + "learning_rate": 1.8428722443991764e-06, + "loss": 0.0634, + "step": 7490 + }, + { + "epoch": 2.43, + "learning_rate": 1.8226718053148951e-06, + "loss": 0.0637, + "step": 7500 + }, + { + "epoch": 2.44, + "learning_rate": 1.8025715825849266e-06, + "loss": 0.0683, + "step": 7510 + }, + { + "epoch": 2.44, + "learning_rate": 1.7825718225453547e-06, + "loss": 0.0688, + "step": 7520 + }, + { + "epoch": 2.44, + "learning_rate": 1.762672770301057e-06, + "loss": 0.0666, + "step": 7530 + }, + { + "epoch": 2.44, + "learning_rate": 1.742874669722703e-06, + "loss": 0.0679, + "step": 7540 + }, + { + "epoch": 2.45, + "learning_rate": 1.7231777634437563e-06, + "loss": 0.0645, + "step": 7550 + }, + { + "epoch": 2.45, + "learning_rate": 1.703582292857503e-06, + "loss": 0.0667, + "step": 7560 + }, + { + "epoch": 2.45, + "learning_rate": 1.6840884981140948e-06, + "loss": 0.0674, + "step": 7570 + }, + { + "epoch": 2.46, + "learning_rate": 1.6646966181176117e-06, + "loss": 0.0671, + "step": 7580 + }, + { + "epoch": 2.46, + "learning_rate": 1.6454068905231258e-06, + "loss": 0.0687, + "step": 7590 + }, + { + "epoch": 2.46, + "learning_rate": 1.6262195517337887e-06, + "loss": 0.0637, + "step": 7600 + }, + { + "epoch": 2.47, + "learning_rate": 1.6071348368979377e-06, + "loss": 0.0636, + "step": 7610 + }, + { + "epoch": 2.47, + "learning_rate": 1.5881529799062167e-06, + "loss": 0.0675, + "step": 7620 + }, + { + "epoch": 2.47, + "learning_rate": 1.5692742133887095e-06, + "loss": 0.0637, + "step": 7630 + }, + { + "epoch": 2.48, + "learning_rate": 1.550498768712073e-06, + "loss": 0.065, + "step": 7640 + }, + { + "epoch": 2.48, + "learning_rate": 1.5318268759767307e-06, + "loss": 0.0623, + "step": 7650 + }, + { + "epoch": 2.48, + "learning_rate": 1.5132587640140227e-06, + "loss": 0.0681, + "step": 7660 + }, + { + "epoch": 2.49, + "learning_rate": 1.494794660383425e-06, + "loss": 0.0692, + "step": 7670 + }, + { + "epoch": 2.49, + "learning_rate": 1.4764347913697441e-06, + "loss": 0.0678, + "step": 7680 + }, + { + "epoch": 2.49, + "learning_rate": 1.4581793819803559e-06, + "loss": 0.0617, + "step": 7690 + }, + { + "epoch": 2.5, + "learning_rate": 1.4400286559424392e-06, + "loss": 0.0675, + "step": 7700 + }, + { + "epoch": 2.5, + "learning_rate": 1.4219828357002351e-06, + "loss": 0.0681, + "step": 7710 + }, + { + "epoch": 2.5, + "learning_rate": 1.4040421424123308e-06, + "loss": 0.0651, + "step": 7720 + }, + { + "epoch": 2.51, + "learning_rate": 1.3862067959489377e-06, + "loss": 0.0666, + "step": 7730 + }, + { + "epoch": 2.51, + "learning_rate": 1.368477014889199e-06, + "loss": 0.0637, + "step": 7740 + }, + { + "epoch": 2.51, + "learning_rate": 1.3508530165185096e-06, + "loss": 0.0677, + "step": 7750 + }, + { + "epoch": 2.52, + "learning_rate": 1.3333350168258651e-06, + "loss": 0.0639, + "step": 7760 + }, + { + "epoch": 2.52, + "learning_rate": 1.3159232305012027e-06, + "loss": 0.064, + "step": 7770 + }, + { + "epoch": 2.52, + "learning_rate": 1.298617870932769e-06, + "loss": 0.0643, + "step": 7780 + }, + { + "epoch": 2.53, + "learning_rate": 1.2814191502045093e-06, + "loss": 0.0655, + "step": 7790 + }, + { + "epoch": 2.53, + "learning_rate": 1.2643272790934735e-06, + "loss": 0.0666, + "step": 7800 + }, + { + "epoch": 2.53, + "learning_rate": 1.2473424670672264e-06, + "loss": 0.0645, + "step": 7810 + }, + { + "epoch": 2.54, + "learning_rate": 1.2304649222812792e-06, + "loss": 0.0662, + "step": 7820 + }, + { + "epoch": 2.54, + "learning_rate": 1.2136948515765402e-06, + "loss": 0.0688, + "step": 7830 + }, + { + "epoch": 2.54, + "learning_rate": 1.1970324604767836e-06, + "loss": 0.0655, + "step": 7840 + }, + { + "epoch": 2.55, + "learning_rate": 1.180477953186131e-06, + "loss": 0.0646, + "step": 7850 + }, + { + "epoch": 2.55, + "learning_rate": 1.1640315325865358e-06, + "loss": 0.069, + "step": 7860 + }, + { + "epoch": 2.55, + "learning_rate": 1.1476934002353191e-06, + "loss": 0.0635, + "step": 7870 + }, + { + "epoch": 2.56, + "learning_rate": 1.1314637563626774e-06, + "loss": 0.0638, + "step": 7880 + }, + { + "epoch": 2.56, + "learning_rate": 1.1153427998692401e-06, + "loss": 0.0656, + "step": 7890 + }, + { + "epoch": 2.56, + "learning_rate": 1.0993307283236355e-06, + "loss": 0.0647, + "step": 7900 + }, + { + "epoch": 2.56, + "learning_rate": 1.083427737960062e-06, + "loss": 0.0664, + "step": 7910 + }, + { + "epoch": 2.57, + "learning_rate": 1.067634023675882e-06, + "loss": 0.0653, + "step": 7920 + }, + { + "epoch": 2.57, + "learning_rate": 1.0519497790292388e-06, + "loss": 0.0662, + "step": 7930 + }, + { + "epoch": 2.57, + "learning_rate": 1.036375196236684e-06, + "loss": 0.0632, + "step": 7940 + }, + { + "epoch": 2.58, + "learning_rate": 1.0209104661708225e-06, + "loss": 0.0627, + "step": 7950 + }, + { + "epoch": 2.58, + "learning_rate": 1.0055557783579627e-06, + "loss": 0.0646, + "step": 7960 + }, + { + "epoch": 2.58, + "learning_rate": 9.903113209758098e-07, + "loss": 0.0689, + "step": 7970 + }, + { + "epoch": 2.59, + "learning_rate": 9.751772808511474e-07, + "loss": 0.0667, + "step": 7980 + }, + { + "epoch": 2.59, + "learning_rate": 9.601538434575586e-07, + "loss": 0.0589, + "step": 7990 + }, + { + "epoch": 2.59, + "learning_rate": 9.452411929131411e-07, + "loss": 0.0668, + "step": 8000 + }, + { + "epoch": 2.6, + "learning_rate": 9.30439511978255e-07, + "loss": 0.0633, + "step": 8010 + }, + { + "epoch": 2.6, + "learning_rate": 9.157489820532905e-07, + "loss": 0.0669, + "step": 8020 + }, + { + "epoch": 2.6, + "learning_rate": 9.011697831764366e-07, + "loss": 0.0614, + "step": 8030 + }, + { + "epoch": 2.61, + "learning_rate": 8.867020940214743e-07, + "loss": 0.0641, + "step": 8040 + }, + { + "epoch": 2.61, + "learning_rate": 8.723460918955895e-07, + "loss": 0.0676, + "step": 8050 + }, + { + "epoch": 2.61, + "learning_rate": 8.581019527372037e-07, + "loss": 0.0687, + "step": 8060 + }, + { + "epoch": 2.62, + "learning_rate": 8.439698511138106e-07, + "loss": 0.0665, + "step": 8070 + }, + { + "epoch": 2.62, + "learning_rate": 8.299499602198413e-07, + "loss": 0.0664, + "step": 8080 + }, + { + "epoch": 2.62, + "learning_rate": 8.160424518745425e-07, + "loss": 0.0693, + "step": 8090 + }, + { + "epoch": 2.63, + "learning_rate": 8.022474965198635e-07, + "loss": 0.0628, + "step": 8100 + }, + { + "epoch": 2.63, + "learning_rate": 7.885652632183771e-07, + "loss": 0.0626, + "step": 8110 + }, + { + "epoch": 2.63, + "learning_rate": 7.749959196512014e-07, + "loss": 0.0674, + "step": 8120 + }, + { + "epoch": 2.64, + "learning_rate": 7.615396321159496e-07, + "loss": 0.0685, + "step": 8130 + }, + { + "epoch": 2.64, + "learning_rate": 7.481965655246859e-07, + "loss": 0.0663, + "step": 8140 + }, + { + "epoch": 2.64, + "learning_rate": 7.349668834019063e-07, + "loss": 0.0663, + "step": 8150 + }, + { + "epoch": 2.65, + "learning_rate": 7.218507478825387e-07, + "loss": 0.0636, + "step": 8160 + }, + { + "epoch": 2.65, + "learning_rate": 7.088483197099561e-07, + "loss": 0.0643, + "step": 8170 + }, + { + "epoch": 2.65, + "learning_rate": 6.95959758233995e-07, + "loss": 0.0625, + "step": 8180 + }, + { + "epoch": 2.66, + "learning_rate": 6.831852214090163e-07, + "loss": 0.064, + "step": 8190 + }, + { + "epoch": 2.66, + "learning_rate": 6.705248657919638e-07, + "loss": 0.064, + "step": 8200 + }, + { + "epoch": 2.66, + "learning_rate": 6.579788465404491e-07, + "loss": 0.064, + "step": 8210 + }, + { + "epoch": 2.67, + "learning_rate": 6.455473174108396e-07, + "loss": 0.0641, + "step": 8220 + }, + { + "epoch": 2.67, + "learning_rate": 6.332304307563853e-07, + "loss": 0.0647, + "step": 8230 + }, + { + "epoch": 2.67, + "learning_rate": 6.210283375253512e-07, + "loss": 0.0653, + "step": 8240 + }, + { + "epoch": 2.68, + "learning_rate": 6.089411872591566e-07, + "loss": 0.0643, + "step": 8250 + }, + { + "epoch": 2.68, + "learning_rate": 5.969691280905565e-07, + "loss": 0.0635, + "step": 8260 + }, + { + "epoch": 2.68, + "learning_rate": 5.851123067418185e-07, + "loss": 0.0685, + "step": 8270 + }, + { + "epoch": 2.68, + "learning_rate": 5.733708685229222e-07, + "loss": 0.0644, + "step": 8280 + }, + { + "epoch": 2.69, + "learning_rate": 5.617449573297828e-07, + "loss": 0.0665, + "step": 8290 + }, + { + "epoch": 2.69, + "learning_rate": 5.502347156424881e-07, + "loss": 0.064, + "step": 8300 + }, + { + "epoch": 2.69, + "learning_rate": 5.388402845235541e-07, + "loss": 0.0673, + "step": 8310 + }, + { + "epoch": 2.7, + "learning_rate": 5.275618036161856e-07, + "loss": 0.0619, + "step": 8320 + }, + { + "epoch": 2.7, + "learning_rate": 5.163994111425752e-07, + "loss": 0.0654, + "step": 8330 + }, + { + "epoch": 2.7, + "learning_rate": 5.05353243902208e-07, + "loss": 0.0627, + "step": 8340 + }, + { + "epoch": 2.71, + "learning_rate": 4.944234372701851e-07, + "loss": 0.067, + "step": 8350 + }, + { + "epoch": 2.71, + "learning_rate": 4.836101251955583e-07, + "loss": 0.0639, + "step": 8360 + }, + { + "epoch": 2.71, + "learning_rate": 4.7291344019969374e-07, + "loss": 0.0671, + "step": 8370 + }, + { + "epoch": 2.72, + "learning_rate": 4.6233351337464984e-07, + "loss": 0.066, + "step": 8380 + }, + { + "epoch": 2.72, + "learning_rate": 4.518704743815672e-07, + "loss": 0.0688, + "step": 8390 + }, + { + "epoch": 2.72, + "learning_rate": 4.415244514490791e-07, + "loss": 0.0644, + "step": 8400 + }, + { + "epoch": 2.73, + "learning_rate": 4.312955713717404e-07, + "loss": 0.0632, + "step": 8410 + }, + { + "epoch": 2.73, + "learning_rate": 4.2118395950847767e-07, + "loss": 0.0641, + "step": 8420 + }, + { + "epoch": 2.73, + "learning_rate": 4.1118973978104603e-07, + "loss": 0.0665, + "step": 8430 + }, + { + "epoch": 2.74, + "learning_rate": 4.0131303467251804e-07, + "loss": 0.0643, + "step": 8440 + }, + { + "epoch": 2.74, + "learning_rate": 3.9155396522577496e-07, + "loss": 0.0628, + "step": 8450 + }, + { + "epoch": 2.74, + "learning_rate": 3.8191265104203014e-07, + "loss": 0.0638, + "step": 8460 + }, + { + "epoch": 2.75, + "learning_rate": 3.723892102793558e-07, + "loss": 0.0648, + "step": 8470 + }, + { + "epoch": 2.75, + "learning_rate": 3.629837596512453e-07, + "loss": 0.0665, + "step": 8480 + }, + { + "epoch": 2.75, + "learning_rate": 3.53696414425172e-07, + "loss": 0.0661, + "step": 8490 + }, + { + "epoch": 2.76, + "learning_rate": 3.445272884211837e-07, + "loss": 0.0691, + "step": 8500 + }, + { + "epoch": 2.76, + "learning_rate": 3.3547649401050265e-07, + "loss": 0.0628, + "step": 8510 + }, + { + "epoch": 2.76, + "learning_rate": 3.2654414211415463e-07, + "loss": 0.0624, + "step": 8520 + }, + { + "epoch": 2.77, + "learning_rate": 3.177303422016065e-07, + "loss": 0.0662, + "step": 8530 + }, + { + "epoch": 2.77, + "learning_rate": 3.0903520228941944e-07, + "loss": 0.0636, + "step": 8540 + }, + { + "epoch": 2.77, + "learning_rate": 3.004588289399324e-07, + "loss": 0.0657, + "step": 8550 + }, + { + "epoch": 2.78, + "learning_rate": 2.9200132725995644e-07, + "loss": 0.0639, + "step": 8560 + }, + { + "epoch": 2.78, + "learning_rate": 2.8366280089948126e-07, + "loss": 0.0648, + "step": 8570 + }, + { + "epoch": 2.78, + "learning_rate": 2.7544335205040626e-07, + "loss": 0.0648, + "step": 8580 + }, + { + "epoch": 2.79, + "learning_rate": 2.6734308144529154e-07, + "loss": 0.0644, + "step": 8590 + }, + { + "epoch": 2.79, + "learning_rate": 2.59362088356121e-07, + "loss": 0.0631, + "step": 8600 + }, + { + "epoch": 2.79, + "learning_rate": 2.515004705930835e-07, + "loss": 0.0628, + "step": 8610 + }, + { + "epoch": 2.8, + "learning_rate": 2.437583245033814e-07, + "loss": 0.0631, + "step": 8620 + }, + { + "epoch": 2.8, + "learning_rate": 2.3613574497003967e-07, + "loss": 0.0624, + "step": 8630 + }, + { + "epoch": 2.8, + "learning_rate": 2.2863282541075394e-07, + "loss": 0.0617, + "step": 8640 + }, + { + "epoch": 2.8, + "learning_rate": 2.2124965777673313e-07, + "loss": 0.067, + "step": 8650 + }, + { + "epoch": 2.81, + "learning_rate": 2.1398633255158675e-07, + "loss": 0.0618, + "step": 8660 + }, + { + "epoch": 2.81, + "learning_rate": 2.0684293875020245e-07, + "loss": 0.0644, + "step": 8670 + }, + { + "epoch": 2.81, + "learning_rate": 1.99819563917667e-07, + "loss": 0.0698, + "step": 8680 + }, + { + "epoch": 2.82, + "learning_rate": 1.9291629412818368e-07, + "loss": 0.0672, + "step": 8690 + }, + { + "epoch": 2.82, + "learning_rate": 1.8613321398402107e-07, + "loss": 0.0694, + "step": 8700 + }, + { + "epoch": 2.82, + "learning_rate": 1.7947040661448256e-07, + "loss": 0.0661, + "step": 8710 + }, + { + "epoch": 2.83, + "learning_rate": 1.7292795367487513e-07, + "loss": 0.0628, + "step": 8720 + }, + { + "epoch": 2.83, + "learning_rate": 1.6650593534551673e-07, + "loss": 0.0646, + "step": 8730 + }, + { + "epoch": 2.83, + "learning_rate": 1.6020443033075485e-07, + "loss": 0.0627, + "step": 8740 + }, + { + "epoch": 2.84, + "learning_rate": 1.5402351585799725e-07, + "loss": 0.0647, + "step": 8750 + }, + { + "epoch": 2.84, + "learning_rate": 1.4796326767676617e-07, + "loss": 0.0675, + "step": 8760 + }, + { + "epoch": 2.84, + "learning_rate": 1.420237600577734e-07, + "loss": 0.0627, + "step": 8770 + }, + { + "epoch": 2.85, + "learning_rate": 1.3620506579200777e-07, + "loss": 0.0614, + "step": 8780 + }, + { + "epoch": 2.85, + "learning_rate": 1.3050725618984017e-07, + "loss": 0.0644, + "step": 8790 + }, + { + "epoch": 2.85, + "learning_rate": 1.2493040108015774e-07, + "loss": 0.0634, + "step": 8800 + }, + { + "epoch": 2.86, + "learning_rate": 1.1947456880949893e-07, + "loss": 0.0649, + "step": 8810 + }, + { + "epoch": 2.86, + "learning_rate": 1.1413982624122189e-07, + "loss": 0.0632, + "step": 8820 + }, + { + "epoch": 2.86, + "learning_rate": 1.08926238754683e-07, + "loss": 0.0639, + "step": 8830 + }, + { + "epoch": 2.87, + "learning_rate": 1.0383387024443414e-07, + "loss": 0.0619, + "step": 8840 + }, + { + "epoch": 2.87, + "learning_rate": 9.88627831194433e-08, + "loss": 0.0627, + "step": 8850 + }, + { + "epoch": 2.87, + "learning_rate": 9.401303830232855e-08, + "loss": 0.0657, + "step": 8860 + }, + { + "epoch": 2.88, + "learning_rate": 8.928469522860527e-08, + "loss": 0.0646, + "step": 8870 + }, + { + "epoch": 2.88, + "learning_rate": 8.467781184596901e-08, + "loss": 0.0635, + "step": 8880 + }, + { + "epoch": 2.88, + "learning_rate": 8.0192444613576e-08, + "loss": 0.0671, + "step": 8890 + }, + { + "epoch": 2.89, + "learning_rate": 7.582864850135707e-08, + "loss": 0.0652, + "step": 8900 + }, + { + "epoch": 2.89, + "learning_rate": 7.158647698933707e-08, + "loss": 0.0676, + "step": 8910 + }, + { + "epoch": 2.89, + "learning_rate": 6.746598206698762e-08, + "loss": 0.0631, + "step": 8920 + }, + { + "epoch": 2.9, + "learning_rate": 6.34672142325865e-08, + "loss": 0.065, + "step": 8930 + }, + { + "epoch": 2.9, + "learning_rate": 5.959022249259594e-08, + "loss": 0.0669, + "step": 8940 + }, + { + "epoch": 2.9, + "learning_rate": 5.583505436106529e-08, + "loss": 0.067, + "step": 8950 + }, + { + "epoch": 2.91, + "learning_rate": 5.220175585904819e-08, + "loss": 0.061, + "step": 8960 + }, + { + "epoch": 2.91, + "learning_rate": 4.8690371514039656e-08, + "loss": 0.0661, + "step": 8970 + }, + { + "epoch": 2.91, + "learning_rate": 4.5300944359425446e-08, + "loss": 0.0626, + "step": 8980 + }, + { + "epoch": 2.92, + "learning_rate": 4.203351593396354e-08, + "loss": 0.0633, + "step": 8990 + }, + { + "epoch": 2.92, + "learning_rate": 3.8888126281264593e-08, + "loss": 0.0652, + "step": 9000 + }, + { + "epoch": 2.92, + "learning_rate": 3.586481394930896e-08, + "loss": 0.065, + "step": 9010 + }, + { + "epoch": 2.92, + "learning_rate": 3.2963615989971553e-08, + "loss": 0.0652, + "step": 9020 + }, + { + "epoch": 2.93, + "learning_rate": 3.0184567958567724e-08, + "loss": 0.0655, + "step": 9030 + }, + { + "epoch": 2.93, + "learning_rate": 2.752770391341919e-08, + "loss": 0.066, + "step": 9040 + }, + { + "epoch": 2.93, + "learning_rate": 2.499305641543104e-08, + "loss": 0.0654, + "step": 9050 + }, + { + "epoch": 2.94, + "learning_rate": 2.2580656527700916e-08, + "loss": 0.0673, + "step": 9060 + }, + { + "epoch": 2.94, + "learning_rate": 2.0290533815132683e-08, + "loss": 0.0658, + "step": 9070 + }, + { + "epoch": 2.94, + "learning_rate": 1.8122716344074476e-08, + "loss": 0.0628, + "step": 9080 + }, + { + "epoch": 2.95, + "learning_rate": 1.6077230681978972e-08, + "loss": 0.0635, + "step": 9090 + }, + { + "epoch": 2.95, + "learning_rate": 1.4154101897070338e-08, + "loss": 0.0635, + "step": 9100 + }, + { + "epoch": 2.95, + "learning_rate": 1.2353353558045566e-08, + "loss": 0.0679, + "step": 9110 + }, + { + "epoch": 2.96, + "learning_rate": 1.0675007733780273e-08, + "loss": 0.0656, + "step": 9120 + }, + { + "epoch": 2.96, + "learning_rate": 9.119084993055583e-09, + "loss": 0.0652, + "step": 9130 + }, + { + "epoch": 2.96, + "learning_rate": 7.685604404316094e-09, + "loss": 0.0594, + "step": 9140 + }, + { + "epoch": 2.97, + "learning_rate": 6.374583535426748e-09, + "loss": 0.0645, + "step": 9150 + }, + { + "epoch": 2.97, + "learning_rate": 5.186038453458553e-09, + "loss": 0.0657, + "step": 9160 + }, + { + "epoch": 2.97, + "learning_rate": 4.119983724497623e-09, + "loss": 0.0668, + "step": 9170 + }, + { + "epoch": 2.98, + "learning_rate": 3.1764324134631043e-09, + "loss": 0.0617, + "step": 9180 + }, + { + "epoch": 2.98, + "learning_rate": 2.355396083941752e-09, + "loss": 0.0632, + "step": 9190 + }, + { + "epoch": 2.98, + "learning_rate": 1.656884798058034e-09, + "loss": 0.0633, + "step": 9200 + }, + { + "epoch": 2.99, + "learning_rate": 1.0809071163386808e-09, + "loss": 0.0661, + "step": 9210 + }, + { + "epoch": 2.99, + "learning_rate": 6.274700976161008e-10, + "loss": 0.0656, + "step": 9220 + }, + { + "epoch": 2.99, + "learning_rate": 2.9657929893955886e-10, + "loss": 0.0652, + "step": 9230 + }, + { + "epoch": 3.0, + "learning_rate": 8.823877550301341e-11, + "loss": 0.0603, + "step": 9240 + }, + { + "epoch": 3.0, + "learning_rate": 2.4510806018174237e-12, + "loss": 0.06, + "step": 9250 + }, + { + "epoch": 3.0, + "eval_loss": 0.1728515625, + "eval_runtime": 6.4727, + "eval_samples_per_second": 19.775, + "eval_steps_per_second": 0.154, + "step": 9252 + } + ], + "max_steps": 9252, + "num_train_epochs": 3, + "total_flos": 9.615097431625354e+19, + "trial_name": null, + "trial_params": null +}