{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 1000, "global_step": 1922, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 3.0927835051546394e-07, "loss": 0.5665, "step": 1 }, { "epoch": 0.01, "learning_rate": 3.092783505154639e-06, "loss": 0.7248, "step": 10 }, { "epoch": 0.02, "learning_rate": 6.185567010309278e-06, "loss": 0.7059, "step": 20 }, { "epoch": 0.03, "learning_rate": 9.278350515463918e-06, "loss": 0.6139, "step": 30 }, { "epoch": 0.04, "learning_rate": 1.2371134020618556e-05, "loss": 0.5087, "step": 40 }, { "epoch": 0.05, "learning_rate": 1.5463917525773194e-05, "loss": 0.4724, "step": 50 }, { "epoch": 0.06, "learning_rate": 1.8556701030927837e-05, "loss": 0.4124, "step": 60 }, { "epoch": 0.07, "learning_rate": 2.1649484536082473e-05, "loss": 0.4254, "step": 70 }, { "epoch": 0.08, "learning_rate": 2.4742268041237112e-05, "loss": 0.3885, "step": 80 }, { "epoch": 0.09, "learning_rate": 2.7835051546391755e-05, "loss": 0.3624, "step": 90 }, { "epoch": 0.1, "learning_rate": 2.999979997843754e-05, "loss": 0.4005, "step": 100 }, { "epoch": 0.11, "learning_rate": 2.999624418794702e-05, "loss": 0.3947, "step": 110 }, { "epoch": 0.12, "learning_rate": 2.998824468665815e-05, "loss": 0.383, "step": 120 }, { "epoch": 0.14, "learning_rate": 2.997580384499581e-05, "loss": 0.3477, "step": 130 }, { "epoch": 0.15, "learning_rate": 2.9958925349449884e-05, "loss": 0.3627, "step": 140 }, { "epoch": 0.16, "learning_rate": 2.9937614201482864e-05, "loss": 0.3744, "step": 150 }, { "epoch": 0.17, "learning_rate": 2.9911876716047828e-05, "loss": 0.3545, "step": 160 }, { "epoch": 0.18, "learning_rate": 2.988172051971717e-05, "loss": 0.3598, "step": 170 }, { "epoch": 0.19, "learning_rate": 2.9847154548422685e-05, "loss": 0.336, "step": 180 }, { "epoch": 0.2, "learning_rate": 2.9808189044807664e-05, "loss": 0.3482, "step": 190 }, { "epoch": 0.21, "learning_rate": 2.976483555519177e-05, "loss": 0.3675, "step": 200 }, { "epoch": 0.22, "learning_rate": 2.9717106926149626e-05, "loss": 0.3417, "step": 210 }, { "epoch": 0.23, "learning_rate": 2.9665017300704087e-05, "loss": 0.3401, "step": 220 }, { "epoch": 0.24, "learning_rate": 2.9608582114135352e-05, "loss": 0.3315, "step": 230 }, { "epoch": 0.25, "learning_rate": 2.954781808940717e-05, "loss": 0.366, "step": 240 }, { "epoch": 0.26, "learning_rate": 2.9482743232211473e-05, "loss": 0.3604, "step": 250 }, { "epoch": 0.27, "learning_rate": 2.9413376825632873e-05, "loss": 0.3548, "step": 260 }, { "epoch": 0.28, "learning_rate": 2.9339739424434687e-05, "loss": 0.3615, "step": 270 }, { "epoch": 0.29, "learning_rate": 2.9261852848968123e-05, "loss": 0.3681, "step": 280 }, { "epoch": 0.3, "learning_rate": 2.9179740178706412e-05, "loss": 0.368, "step": 290 }, { "epoch": 0.31, "learning_rate": 2.9093425745405897e-05, "loss": 0.3509, "step": 300 }, { "epoch": 0.32, "learning_rate": 2.9012171361681493e-05, "loss": 0.3321, "step": 310 }, { "epoch": 0.33, "learning_rate": 2.8917945065723644e-05, "loss": 0.3653, "step": 320 }, { "epoch": 0.34, "learning_rate": 2.8819594582268972e-05, "loss": 0.3548, "step": 330 }, { "epoch": 0.35, "learning_rate": 2.8717149054688286e-05, "loss": 0.3602, "step": 340 }, { "epoch": 0.36, "learning_rate": 2.8610638839802383e-05, "loss": 0.3333, "step": 350 }, { "epoch": 0.37, "learning_rate": 2.85000954988866e-05, "loss": 0.3642, "step": 360 }, { "epoch": 0.39, "learning_rate": 2.838555178831857e-05, "loss": 0.346, "step": 370 }, { "epoch": 0.4, "learning_rate": 2.8267041649871796e-05, "loss": 0.3542, "step": 380 }, { "epoch": 0.41, "learning_rate": 2.8144600200657953e-05, "loss": 0.3364, "step": 390 }, { "epoch": 0.42, "learning_rate": 2.8018263722720953e-05, "loss": 0.3526, "step": 400 }, { "epoch": 0.43, "learning_rate": 2.7888069652285765e-05, "loss": 0.3457, "step": 410 }, { "epoch": 0.44, "learning_rate": 2.775405656866529e-05, "loss": 0.328, "step": 420 }, { "epoch": 0.45, "learning_rate": 2.7616264182828423e-05, "loss": 0.3445, "step": 430 }, { "epoch": 0.46, "learning_rate": 2.747473332563291e-05, "loss": 0.3503, "step": 440 }, { "epoch": 0.47, "learning_rate": 2.7329505935726215e-05, "loss": 0.3631, "step": 450 }, { "epoch": 0.48, "learning_rate": 2.718062504711823e-05, "loss": 0.3456, "step": 460 }, { "epoch": 0.49, "learning_rate": 2.7028134776429372e-05, "loss": 0.3373, "step": 470 }, { "epoch": 0.5, "learning_rate": 2.6872080309817844e-05, "loss": 0.3477, "step": 480 }, { "epoch": 0.51, "learning_rate": 2.6712507889590042e-05, "loss": 0.3603, "step": 490 }, { "epoch": 0.52, "learning_rate": 2.654946480049793e-05, "loss": 0.3242, "step": 500 }, { "epoch": 0.53, "learning_rate": 2.6382999355727565e-05, "loss": 0.3277, "step": 510 }, { "epoch": 0.54, "learning_rate": 2.6213160882582855e-05, "loss": 0.3424, "step": 520 }, { "epoch": 0.55, "learning_rate": 2.6039999707868797e-05, "loss": 0.3277, "step": 530 }, { "epoch": 0.56, "learning_rate": 2.5863567142978596e-05, "loss": 0.34, "step": 540 }, { "epoch": 0.57, "learning_rate": 2.5683915468688945e-05, "loss": 0.3086, "step": 550 }, { "epoch": 0.58, "learning_rate": 2.5501097919668147e-05, "loss": 0.3462, "step": 560 }, { "epoch": 0.59, "learning_rate": 2.531516866870149e-05, "loss": 0.3497, "step": 570 }, { "epoch": 0.6, "learning_rate": 2.512618281063873e-05, "loss": 0.3328, "step": 580 }, { "epoch": 0.61, "learning_rate": 2.4934196346068248e-05, "loss": 0.339, "step": 590 }, { "epoch": 0.62, "learning_rate": 2.4739266164722916e-05, "loss": 0.3323, "step": 600 }, { "epoch": 0.63, "learning_rate": 2.4541450028622397e-05, "loss": 0.344, "step": 610 }, { "epoch": 0.65, "learning_rate": 2.4340806554957033e-05, "loss": 0.3494, "step": 620 }, { "epoch": 0.66, "learning_rate": 2.413739519871833e-05, "loss": 0.3224, "step": 630 }, { "epoch": 0.67, "learning_rate": 2.39312762350811e-05, "loss": 0.3296, "step": 640 }, { "epoch": 0.68, "learning_rate": 2.372251074154269e-05, "loss": 0.3324, "step": 650 }, { "epoch": 0.69, "learning_rate": 2.351116057982436e-05, "loss": 0.3217, "step": 660 }, { "epoch": 0.7, "learning_rate": 2.329728837754034e-05, "loss": 0.344, "step": 670 }, { "epoch": 0.71, "learning_rate": 2.3080957509639908e-05, "loss": 0.3426, "step": 680 }, { "epoch": 0.72, "learning_rate": 2.286223207962802e-05, "loss": 0.3166, "step": 690 }, { "epoch": 0.73, "learning_rate": 2.264117690057007e-05, "loss": 0.3145, "step": 700 }, { "epoch": 0.74, "learning_rate": 2.2417857475886383e-05, "loss": 0.342, "step": 710 }, { "epoch": 0.75, "learning_rate": 2.219233997994211e-05, "loss": 0.3259, "step": 720 }, { "epoch": 0.76, "learning_rate": 2.1964691238438374e-05, "loss": 0.3195, "step": 730 }, { "epoch": 0.77, "learning_rate": 2.173497870861034e-05, "loss": 0.3442, "step": 740 }, { "epoch": 0.78, "learning_rate": 2.1503270459238204e-05, "loss": 0.3387, "step": 750 }, { "epoch": 0.79, "learning_rate": 2.1269635150476917e-05, "loss": 0.337, "step": 760 }, { "epoch": 0.8, "learning_rate": 2.1034142013510735e-05, "loss": 0.32, "step": 770 }, { "epoch": 0.81, "learning_rate": 2.0796860830038506e-05, "loss": 0.3168, "step": 780 }, { "epoch": 0.82, "learning_rate": 2.055786191159583e-05, "loss": 0.3229, "step": 790 }, { "epoch": 0.83, "learning_rate": 2.0317216078720264e-05, "loss": 0.3341, "step": 800 }, { "epoch": 0.84, "learning_rate": 2.0074994639965654e-05, "loss": 0.3204, "step": 810 }, { "epoch": 0.85, "learning_rate": 1.9831269370771864e-05, "loss": 0.3518, "step": 820 }, { "epoch": 0.86, "learning_rate": 1.9586112492196187e-05, "loss": 0.317, "step": 830 }, { "epoch": 0.87, "learning_rate": 1.9339596649512653e-05, "loss": 0.3297, "step": 840 }, { "epoch": 0.88, "learning_rate": 1.9091794890685704e-05, "loss": 0.3332, "step": 850 }, { "epoch": 0.89, "learning_rate": 1.884278064472448e-05, "loss": 0.3375, "step": 860 }, { "epoch": 0.91, "learning_rate": 1.8592627699924218e-05, "loss": 0.3372, "step": 870 }, { "epoch": 0.92, "learning_rate": 1.8341410182001225e-05, "loss": 0.3291, "step": 880 }, { "epoch": 0.93, "learning_rate": 1.8089202532127756e-05, "loss": 0.33, "step": 890 }, { "epoch": 0.94, "learning_rate": 1.783607948487357e-05, "loss": 0.3214, "step": 900 }, { "epoch": 0.95, "learning_rate": 1.758211604606041e-05, "loss": 0.3423, "step": 910 }, { "epoch": 0.96, "learning_rate": 1.7327387470536174e-05, "loss": 0.3171, "step": 920 }, { "epoch": 0.97, "learning_rate": 1.7071969239875282e-05, "loss": 0.3308, "step": 930 }, { "epoch": 0.98, "learning_rate": 1.6815937040011855e-05, "loss": 0.3234, "step": 940 }, { "epoch": 0.99, "learning_rate": 1.6559366738812345e-05, "loss": 0.3103, "step": 950 }, { "epoch": 1.0, "learning_rate": 1.6302334363594227e-05, "loss": 0.3305, "step": 960 }, { "epoch": 1.01, "learning_rate": 1.6044916078597505e-05, "loss": 0.289, "step": 970 }, { "epoch": 1.02, "learning_rate": 1.578718816241556e-05, "loss": 0.3036, "step": 980 }, { "epoch": 1.03, "learning_rate": 1.5529226985392172e-05, "loss": 0.2895, "step": 990 }, { "epoch": 1.04, "learning_rate": 1.527110898699136e-05, "loss": 0.2954, "step": 1000 }, { "epoch": 1.04, "eval_loss": 0.450633704662323, "eval_runtime": 13.7441, "eval_samples_per_second": 3.201, "eval_steps_per_second": 0.8, "step": 1000 }, { "epoch": 1.05, "learning_rate": 1.5012910653146696e-05, "loss": 0.2868, "step": 1010 }, { "epoch": 1.06, "learning_rate": 1.475470849359687e-05, "loss": 0.3139, "step": 1020 }, { "epoch": 1.07, "learning_rate": 1.449657901921421e-05, "loss": 0.2902, "step": 1030 }, { "epoch": 1.08, "learning_rate": 1.4238598719332875e-05, "loss": 0.3213, "step": 1040 }, { "epoch": 1.09, "learning_rate": 1.398084403908339e-05, "loss": 0.2874, "step": 1050 }, { "epoch": 1.1, "learning_rate": 1.372339135674031e-05, "loss": 0.3001, "step": 1060 }, { "epoch": 1.11, "learning_rate": 1.3466316961089692e-05, "loss": 0.3068, "step": 1070 }, { "epoch": 1.12, "learning_rate": 1.3209697028823101e-05, "loss": 0.281, "step": 1080 }, { "epoch": 1.13, "learning_rate": 1.295360760196477e-05, "loss": 0.2817, "step": 1090 }, { "epoch": 1.14, "learning_rate": 1.2698124565338744e-05, "loss": 0.2874, "step": 1100 }, { "epoch": 1.16, "learning_rate": 1.244332362408255e-05, "loss": 0.2759, "step": 1110 }, { "epoch": 1.17, "learning_rate": 1.2189280281214128e-05, "loss": 0.289, "step": 1120 }, { "epoch": 1.18, "learning_rate": 1.193606981525869e-05, "loss": 0.3084, "step": 1130 }, { "epoch": 1.19, "learning_rate": 1.1683767257942017e-05, "loss": 0.27, "step": 1140 }, { "epoch": 1.2, "learning_rate": 1.143244737195698e-05, "loss": 0.3034, "step": 1150 }, { "epoch": 1.21, "learning_rate": 1.1182184628809725e-05, "loss": 0.3149, "step": 1160 }, { "epoch": 1.22, "learning_rate": 1.093305318675215e-05, "loss": 0.3223, "step": 1170 }, { "epoch": 1.23, "learning_rate": 1.068512686880725e-05, "loss": 0.2817, "step": 1180 }, { "epoch": 1.24, "learning_rate": 1.0438479140893711e-05, "loss": 0.2904, "step": 1190 }, { "epoch": 1.25, "learning_rate": 1.0193183090056407e-05, "loss": 0.3009, "step": 1200 }, { "epoch": 1.26, "learning_rate": 9.949311402809125e-06, "loss": 0.2932, "step": 1210 }, { "epoch": 1.27, "learning_rate": 9.706936343595973e-06, "loss": 0.3013, "step": 1220 }, { "epoch": 1.28, "learning_rate": 9.466129733377887e-06, "loss": 0.2935, "step": 1230 }, { "epoch": 1.29, "learning_rate": 9.226962928350503e-06, "loss": 0.289, "step": 1240 }, { "epoch": 1.3, "learning_rate": 8.989506798799793e-06, "loss": 0.2822, "step": 1250 }, { "epoch": 1.31, "learning_rate": 8.753831708101669e-06, "loss": 0.3064, "step": 1260 }, { "epoch": 1.32, "learning_rate": 8.520007491871777e-06, "loss": 0.3173, "step": 1270 }, { "epoch": 1.33, "learning_rate": 8.288103437271743e-06, "loss": 0.2797, "step": 1280 }, { "epoch": 1.34, "learning_rate": 8.058188262477835e-06, "loss": 0.2604, "step": 1290 }, { "epoch": 1.35, "learning_rate": 7.830330096318326e-06, "loss": 0.2976, "step": 1300 }, { "epoch": 1.36, "learning_rate": 7.604596458085425e-06, "loss": 0.2776, "step": 1310 }, { "epoch": 1.37, "learning_rate": 7.381054237527869e-06, "loss": 0.288, "step": 1320 }, { "epoch": 1.38, "learning_rate": 7.159769675030054e-06, "loss": 0.2975, "step": 1330 }, { "epoch": 1.39, "learning_rate": 6.940808341983558e-06, "loss": 0.2845, "step": 1340 }, { "epoch": 1.4, "learning_rate": 6.724235121356932e-06, "loss": 0.2831, "step": 1350 }, { "epoch": 1.42, "learning_rate": 6.510114188469482e-06, "loss": 0.2933, "step": 1360 }, { "epoch": 1.43, "learning_rate": 6.298508991974726e-06, "loss": 0.2913, "step": 1370 }, { "epoch": 1.44, "learning_rate": 6.089482235059209e-06, "loss": 0.2855, "step": 1380 }, { "epoch": 1.45, "learning_rate": 5.8830958568621676e-06, "loss": 0.3165, "step": 1390 }, { "epoch": 1.46, "learning_rate": 5.679411014121645e-06, "loss": 0.2847, "step": 1400 }, { "epoch": 1.47, "learning_rate": 5.478488063052423e-06, "loss": 0.2904, "step": 1410 }, { "epoch": 1.48, "learning_rate": 5.280386541461171e-06, "loss": 0.2815, "step": 1420 }, { "epoch": 1.49, "learning_rate": 5.085165151104125e-06, "loss": 0.3055, "step": 1430 }, { "epoch": 1.5, "learning_rate": 4.892881740292471e-06, "loss": 0.2837, "step": 1440 }, { "epoch": 1.51, "learning_rate": 4.7035932867506446e-06, "loss": 0.2791, "step": 1450 }, { "epoch": 1.52, "learning_rate": 4.51735588073261e-06, "loss": 0.2597, "step": 1460 }, { "epoch": 1.53, "learning_rate": 4.334224708401099e-06, "loss": 0.2961, "step": 1470 }, { "epoch": 1.54, "learning_rate": 4.154254035474777e-06, "loss": 0.2974, "step": 1480 }, { "epoch": 1.55, "learning_rate": 3.977497191148111e-06, "loss": 0.2864, "step": 1490 }, { "epoch": 1.56, "learning_rate": 3.804006552288802e-06, "loss": 0.2961, "step": 1500 }, { "epoch": 1.57, "learning_rate": 3.633833527917379e-06, "loss": 0.2924, "step": 1510 }, { "epoch": 1.58, "learning_rate": 3.467028543973591e-06, "loss": 0.2942, "step": 1520 }, { "epoch": 1.59, "learning_rate": 3.303641028374138e-06, "loss": 0.2837, "step": 1530 }, { "epoch": 1.6, "learning_rate": 3.1437193963660797e-06, "loss": 0.3026, "step": 1540 }, { "epoch": 1.61, "learning_rate": 2.987311036180369e-06, "loss": 0.2928, "step": 1550 }, { "epoch": 1.62, "learning_rate": 2.8344622949896986e-06, "loss": 0.2862, "step": 1560 }, { "epoch": 1.63, "learning_rate": 2.685218465174828e-06, "loss": 0.2872, "step": 1570 }, { "epoch": 1.64, "learning_rate": 2.5396237709034816e-06, "loss": 0.3098, "step": 1580 }, { "epoch": 1.65, "learning_rate": 2.3977213550257583e-06, "loss": 0.2866, "step": 1590 }, { "epoch": 1.66, "learning_rate": 2.2595532662899915e-06, "loss": 0.3035, "step": 1600 }, { "epoch": 1.68, "learning_rate": 2.1251604468827823e-06, "loss": 0.3049, "step": 1610 }, { "epoch": 1.69, "learning_rate": 1.994582720296954e-06, "loss": 0.2678, "step": 1620 }, { "epoch": 1.7, "learning_rate": 1.8678587795309971e-06, "loss": 0.2966, "step": 1630 }, { "epoch": 1.71, "learning_rate": 1.7450261756234849e-06, "loss": 0.2787, "step": 1640 }, { "epoch": 1.72, "learning_rate": 1.6261213065258984e-06, "loss": 0.2957, "step": 1650 }, { "epoch": 1.73, "learning_rate": 1.511179406317131e-06, "loss": 0.2747, "step": 1660 }, { "epoch": 1.74, "learning_rate": 1.4002345347628598e-06, "loss": 0.2764, "step": 1670 }, { "epoch": 1.75, "learning_rate": 1.293319567222918e-06, "loss": 0.2983, "step": 1680 }, { "epoch": 1.76, "learning_rate": 1.1904661849095993e-06, "loss": 0.2981, "step": 1690 }, { "epoch": 1.77, "learning_rate": 1.0917048654998407e-06, "loss": 0.3148, "step": 1700 }, { "epoch": 1.78, "learning_rate": 9.970648741040273e-07, "loss": 0.3123, "step": 1710 }, { "epoch": 1.79, "learning_rate": 9.065742545941052e-07, "loss": 0.2972, "step": 1720 }, { "epoch": 1.8, "learning_rate": 8.202598212935897e-07, "loss": 0.271, "step": 1730 }, { "epoch": 1.81, "learning_rate": 7.381471510318799e-07, "loss": 0.2761, "step": 1740 }, { "epoch": 1.82, "learning_rate": 6.602605755653124e-07, "loss": 0.3004, "step": 1750 }, { "epoch": 1.83, "learning_rate": 5.866231743671296e-07, "loss": 0.2771, "step": 1760 }, { "epoch": 1.84, "learning_rate": 5.172567677885276e-07, "loss": 0.2985, "step": 1770 }, { "epoch": 1.85, "learning_rate": 4.521819105928293e-07, "loss": 0.2873, "step": 1780 }, { "epoch": 1.86, "learning_rate": 3.9141788586465224e-07, "loss": 0.2869, "step": 1790 }, { "epoch": 1.87, "learning_rate": 3.349826992959154e-07, "loss": 0.2814, "step": 1800 }, { "epoch": 1.88, "learning_rate": 2.828930738503727e-07, "loss": 0.2875, "step": 1810 }, { "epoch": 1.89, "learning_rate": 2.3516444480822962e-07, "loss": 0.3106, "step": 1820 }, { "epoch": 1.9, "learning_rate": 1.9181095519233748e-07, "loss": 0.2933, "step": 1830 }, { "epoch": 1.91, "learning_rate": 1.52845451577317e-07, "loss": 0.2798, "step": 1840 }, { "epoch": 1.93, "learning_rate": 1.1827948028283353e-07, "loss": 0.3051, "step": 1850 }, { "epoch": 1.94, "learning_rate": 8.812328395217395e-08, "loss": 0.2644, "step": 1860 }, { "epoch": 1.95, "learning_rate": 6.238579851713611e-08, "loss": 0.3041, "step": 1870 }, { "epoch": 1.96, "learning_rate": 4.1074650550116946e-08, "loss": 0.2848, "step": 1880 }, { "epoch": 1.97, "learning_rate": 2.4196155004190256e-08, "loss": 0.2695, "step": 1890 }, { "epoch": 1.98, "learning_rate": 1.175531334185187e-08, "loss": 0.2958, "step": 1900 }, { "epoch": 1.99, "learning_rate": 3.755812052983476e-09, "loss": 0.2994, "step": 1910 }, { "epoch": 2.0, "learning_rate": 2.0002156246146096e-10, "loss": 0.2929, "step": 1920 }, { "epoch": 2.0, "step": 1922, "total_flos": 3.4599459174219776e+17, "train_loss": 0.3247208358966101, "train_runtime": 7447.4364, "train_samples_per_second": 1.032, "train_steps_per_second": 0.258 } ], "logging_steps": 10, "max_steps": 1922, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "total_flos": 3.4599459174219776e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }