{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 310860, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06433764395547835, "grad_norm": 0.9469536542892456, "learning_rate": 9.375e-06, "loss": 7.0597, "step": 1000 }, { "epoch": 0.1286752879109567, "grad_norm": 0.9877486824989319, "learning_rate": 1.875e-05, "loss": 4.8574, "step": 2000 }, { "epoch": 0.19301293186643506, "grad_norm": 1.1811100244522095, "learning_rate": 2.8125e-05, "loss": 4.5464, "step": 3000 }, { "epoch": 0.2573505758219134, "grad_norm": 1.155553936958313, "learning_rate": 3.75e-05, "loss": 4.3086, "step": 4000 }, { "epoch": 0.32168821977739176, "grad_norm": 1.0062898397445679, "learning_rate": 4.6874999999999994e-05, "loss": 4.1307, "step": 5000 }, { "epoch": 0.3860258637328701, "grad_norm": 0.9749443531036377, "learning_rate": 5.625e-05, "loss": 3.986, "step": 6000 }, { "epoch": 0.45036350768834843, "grad_norm": 0.9870838522911072, "learning_rate": 6.5625e-05, "loss": 3.8708, "step": 7000 }, { "epoch": 0.5147011516438268, "grad_norm": 1.0740858316421509, "learning_rate": 7.5e-05, "loss": 3.781, "step": 8000 }, { "epoch": 0.5790387955993052, "grad_norm": 0.969571053981781, "learning_rate": 8.437499999999999e-05, "loss": 3.6942, "step": 9000 }, { "epoch": 0.6433764395547835, "grad_norm": 0.923062801361084, "learning_rate": 9.374999999999999e-05, "loss": 3.6225, "step": 10000 }, { "epoch": 0.7077140835102619, "grad_norm": 0.87486732006073, "learning_rate": 0.00010312499999999999, "loss": 3.5667, "step": 11000 }, { "epoch": 0.7720517274657402, "grad_norm": 0.8343172073364258, "learning_rate": 0.000112490625, "loss": 3.5107, "step": 12000 }, { "epoch": 0.8363893714212186, "grad_norm": 0.8089198470115662, "learning_rate": 0.000121865625, "loss": 3.4681, "step": 13000 }, { "epoch": 0.9007270153766969, "grad_norm": 0.8141182661056519, "learning_rate": 0.00013123125, "loss": 3.4337, "step": 14000 }, { "epoch": 0.9650646593321752, "grad_norm": 0.7596079707145691, "learning_rate": 0.00014060625, "loss": 3.3944, "step": 15000 }, { "epoch": 1.0, "eval_accuracy": 0.37339323372369543, "eval_loss": 3.4211647510528564, "eval_runtime": 112.5567, "eval_samples_per_second": 465.898, "eval_steps_per_second": 7.285, "step": 15543 }, { "epoch": 1.0294023032876536, "grad_norm": 0.7583508491516113, "learning_rate": 0.000149971875, "loss": 3.345, "step": 16000 }, { "epoch": 1.093739947243132, "grad_norm": 0.7395954728126526, "learning_rate": 0.00015933749999999996, "loss": 3.3182, "step": 17000 }, { "epoch": 1.1580775911986103, "grad_norm": 0.7119142413139343, "learning_rate": 0.00016871249999999996, "loss": 3.304, "step": 18000 }, { "epoch": 1.2224152351540887, "grad_norm": 0.7133814692497253, "learning_rate": 0.00017808749999999999, "loss": 3.2808, "step": 19000 }, { "epoch": 1.286752879109567, "grad_norm": 0.6662284731864929, "learning_rate": 0.00018745312499999998, "loss": 3.2624, "step": 20000 }, { "epoch": 1.3510905230650454, "grad_norm": 0.6821054816246033, "learning_rate": 0.00019682812499999998, "loss": 3.2468, "step": 21000 }, { "epoch": 1.4154281670205238, "grad_norm": 0.6423399448394775, "learning_rate": 0.00020619374999999998, "loss": 3.2323, "step": 22000 }, { "epoch": 1.4797658109760021, "grad_norm": 0.6489351987838745, "learning_rate": 0.00021556874999999998, "loss": 3.218, "step": 23000 }, { "epoch": 1.5441034549314803, "grad_norm": 0.6388360261917114, "learning_rate": 0.00022493437499999998, "loss": 3.2063, "step": 24000 }, { "epoch": 1.6084410988869586, "grad_norm": 0.6035541296005249, "learning_rate": 0.00023430937499999997, "loss": 3.1971, "step": 25000 }, { "epoch": 1.672778742842437, "grad_norm": 0.5949345231056213, "learning_rate": 0.00024367499999999997, "loss": 3.1683, "step": 26000 }, { "epoch": 1.7371163867979154, "grad_norm": 0.5953760147094727, "learning_rate": 0.00025305, "loss": 3.1728, "step": 27000 }, { "epoch": 1.8014540307533937, "grad_norm": 0.5276063680648804, "learning_rate": 0.000262415625, "loss": 3.1607, "step": 28000 }, { "epoch": 1.865791674708872, "grad_norm": 0.5257272124290466, "learning_rate": 0.000271790625, "loss": 3.1472, "step": 29000 }, { "epoch": 1.9301293186643504, "grad_norm": 0.49043259024620056, "learning_rate": 0.000281165625, "loss": 3.1367, "step": 30000 }, { "epoch": 1.9944669626198288, "grad_norm": 0.5030378699302673, "learning_rate": 0.000290521875, "loss": 3.1245, "step": 31000 }, { "epoch": 2.0, "eval_accuracy": 0.3939620256950988, "eval_loss": 3.2037432193756104, "eval_runtime": 113.0293, "eval_samples_per_second": 463.951, "eval_steps_per_second": 7.255, "step": 31086 }, { "epoch": 2.058804606575307, "grad_norm": 0.5003546476364136, "learning_rate": 0.000299896875, "loss": 3.0828, "step": 32000 }, { "epoch": 2.1231422505307855, "grad_norm": 0.48286330699920654, "learning_rate": 0.00029893602524564295, "loss": 3.08, "step": 33000 }, { "epoch": 2.187479894486264, "grad_norm": 0.4852472245693207, "learning_rate": 0.0002978602165961414, "loss": 3.0633, "step": 34000 }, { "epoch": 2.2518175384417423, "grad_norm": 0.4629572927951813, "learning_rate": 0.00029678548375528934, "loss": 3.063, "step": 35000 }, { "epoch": 2.3161551823972206, "grad_norm": 0.4571368992328644, "learning_rate": 0.0002957096751057878, "loss": 3.0453, "step": 36000 }, { "epoch": 2.380492826352699, "grad_norm": 0.44331055879592896, "learning_rate": 0.0002946349422649358, "loss": 3.0408, "step": 37000 }, { "epoch": 2.4448304703081774, "grad_norm": 0.4230923354625702, "learning_rate": 0.00029355913361543424, "loss": 3.0359, "step": 38000 }, { "epoch": 2.5091681142636557, "grad_norm": 0.4260108768939972, "learning_rate": 0.0002924833249659327, "loss": 3.0316, "step": 39000 }, { "epoch": 2.573505758219134, "grad_norm": 0.41887935996055603, "learning_rate": 0.0002914085921250807, "loss": 3.0299, "step": 40000 }, { "epoch": 2.6378434021746124, "grad_norm": 0.41068920493125916, "learning_rate": 0.00029033278347557914, "loss": 3.0138, "step": 41000 }, { "epoch": 2.702181046130091, "grad_norm": 0.39430394768714905, "learning_rate": 0.0002892591264433766, "loss": 3.0038, "step": 42000 }, { "epoch": 2.766518690085569, "grad_norm": 0.4100017547607422, "learning_rate": 0.00028818331779387505, "loss": 3.0088, "step": 43000 }, { "epoch": 2.8308563340410475, "grad_norm": 0.4101816415786743, "learning_rate": 0.0002871075091443735, "loss": 2.9937, "step": 44000 }, { "epoch": 2.895193977996526, "grad_norm": 0.38294607400894165, "learning_rate": 0.000286031700494872, "loss": 2.9898, "step": 45000 }, { "epoch": 2.9595316219520043, "grad_norm": 0.37260037660598755, "learning_rate": 0.00028495589184537043, "loss": 2.9807, "step": 46000 }, { "epoch": 3.0, "eval_accuracy": 0.40728960081362825, "eval_loss": 3.079435110092163, "eval_runtime": 112.8117, "eval_samples_per_second": 464.845, "eval_steps_per_second": 7.269, "step": 46629 }, { "epoch": 3.0238692659074826, "grad_norm": 0.40783312916755676, "learning_rate": 0.00028388115900451836, "loss": 2.9534, "step": 47000 }, { "epoch": 3.088206909862961, "grad_norm": 0.38361623883247375, "learning_rate": 0.0002828053503550168, "loss": 2.9122, "step": 48000 }, { "epoch": 3.1525445538184393, "grad_norm": 0.3766690790653229, "learning_rate": 0.0002817306175141648, "loss": 2.9119, "step": 49000 }, { "epoch": 3.2168821977739177, "grad_norm": 0.38536399602890015, "learning_rate": 0.00028065480886466326, "loss": 2.9135, "step": 50000 }, { "epoch": 3.2812198417293956, "grad_norm": 0.38374361395835876, "learning_rate": 0.0002795800760238112, "loss": 2.9094, "step": 51000 }, { "epoch": 3.345557485684874, "grad_norm": 0.3872029185295105, "learning_rate": 0.00027850426737430965, "loss": 2.9069, "step": 52000 }, { "epoch": 3.4098951296403524, "grad_norm": 0.37401601672172546, "learning_rate": 0.0002774284587248081, "loss": 2.9092, "step": 53000 }, { "epoch": 3.4742327735958307, "grad_norm": 0.34951257705688477, "learning_rate": 0.0002763537258839561, "loss": 2.9124, "step": 54000 }, { "epoch": 3.538570417551309, "grad_norm": 0.36252182722091675, "learning_rate": 0.000275278993043104, "loss": 2.9086, "step": 55000 }, { "epoch": 3.6029080615067874, "grad_norm": 0.3610841631889343, "learning_rate": 0.0002742031843936025, "loss": 2.9044, "step": 56000 }, { "epoch": 3.667245705462266, "grad_norm": 0.356315016746521, "learning_rate": 0.00027312737574410094, "loss": 2.8984, "step": 57000 }, { "epoch": 3.731583349417744, "grad_norm": 0.3501368761062622, "learning_rate": 0.0002720515670945994, "loss": 2.8991, "step": 58000 }, { "epoch": 3.7959209933732225, "grad_norm": 0.3654986619949341, "learning_rate": 0.00027097575844509786, "loss": 2.8946, "step": 59000 }, { "epoch": 3.860258637328701, "grad_norm": 0.34233444929122925, "learning_rate": 0.00026990102560424585, "loss": 2.8981, "step": 60000 }, { "epoch": 3.9245962812841793, "grad_norm": 0.35118335485458374, "learning_rate": 0.0002688252169547443, "loss": 2.8932, "step": 61000 }, { "epoch": 3.9889339252396576, "grad_norm": 0.3542274236679077, "learning_rate": 0.00026775048411389223, "loss": 2.8872, "step": 62000 }, { "epoch": 4.0, "eval_accuracy": 0.4139775803532702, "eval_loss": 3.0204551219940186, "eval_runtime": 113.294, "eval_samples_per_second": 462.866, "eval_steps_per_second": 7.238, "step": 62172 }, { "epoch": 4.053271569195136, "grad_norm": 0.359250545501709, "learning_rate": 0.0002666746754643907, "loss": 2.8242, "step": 63000 }, { "epoch": 4.117609213150614, "grad_norm": 0.34917619824409485, "learning_rate": 0.00026559886681488915, "loss": 2.8144, "step": 64000 }, { "epoch": 4.181946857106093, "grad_norm": 0.351457417011261, "learning_rate": 0.00026452413397403714, "loss": 2.8132, "step": 65000 }, { "epoch": 4.246284501061571, "grad_norm": 0.35231146216392517, "learning_rate": 0.0002634483253245356, "loss": 2.8203, "step": 66000 }, { "epoch": 4.310622145017049, "grad_norm": 0.354030579328537, "learning_rate": 0.0002623735924836836, "loss": 2.8273, "step": 67000 }, { "epoch": 4.374959788972528, "grad_norm": 0.3434860408306122, "learning_rate": 0.00026129778383418204, "loss": 2.8221, "step": 68000 }, { "epoch": 4.439297432928006, "grad_norm": 0.35598379373550415, "learning_rate": 0.0002602219751846805, "loss": 2.8283, "step": 69000 }, { "epoch": 4.5036350768834845, "grad_norm": 0.350340873003006, "learning_rate": 0.00025914616653517896, "loss": 2.8242, "step": 70000 }, { "epoch": 4.567972720838963, "grad_norm": 0.34078752994537354, "learning_rate": 0.0002580714336943269, "loss": 2.8309, "step": 71000 }, { "epoch": 4.632310364794441, "grad_norm": 0.3571733832359314, "learning_rate": 0.00025699670085347487, "loss": 2.8248, "step": 72000 }, { "epoch": 4.69664800874992, "grad_norm": 0.35940021276474, "learning_rate": 0.00025592089220397333, "loss": 2.8334, "step": 73000 }, { "epoch": 4.760985652705398, "grad_norm": 0.3354775607585907, "learning_rate": 0.0002548450835544718, "loss": 2.8263, "step": 74000 }, { "epoch": 4.825323296660876, "grad_norm": 0.330805242061615, "learning_rate": 0.0002537703507136197, "loss": 2.8296, "step": 75000 }, { "epoch": 4.889660940616355, "grad_norm": 0.32566189765930176, "learning_rate": 0.0002526945420641182, "loss": 2.8208, "step": 76000 }, { "epoch": 4.953998584571833, "grad_norm": 0.32299116253852844, "learning_rate": 0.00025161980922326616, "loss": 2.8286, "step": 77000 }, { "epoch": 5.0, "eval_accuracy": 0.417990981289541, "eval_loss": 2.988518238067627, "eval_runtime": 113.0062, "eval_samples_per_second": 464.045, "eval_steps_per_second": 7.256, "step": 77715 }, { "epoch": 5.018336228527311, "grad_norm": 0.332711786031723, "learning_rate": 0.00025054400057376457, "loss": 2.797, "step": 78000 }, { "epoch": 5.08267387248279, "grad_norm": 0.3597155809402466, "learning_rate": 0.000249468191924263, "loss": 2.7461, "step": 79000 }, { "epoch": 5.147011516438268, "grad_norm": 0.3411096930503845, "learning_rate": 0.000248393459083411, "loss": 2.7493, "step": 80000 }, { "epoch": 5.2113491603937465, "grad_norm": 0.35248109698295593, "learning_rate": 0.00024731765043390947, "loss": 2.7584, "step": 81000 }, { "epoch": 5.275686804349225, "grad_norm": 0.3520190417766571, "learning_rate": 0.00024624184178440793, "loss": 2.755, "step": 82000 }, { "epoch": 5.340024448304703, "grad_norm": 0.34867680072784424, "learning_rate": 0.00024516710894355586, "loss": 2.7649, "step": 83000 }, { "epoch": 5.404362092260182, "grad_norm": 0.3400154709815979, "learning_rate": 0.00024409130029405434, "loss": 2.7586, "step": 84000 }, { "epoch": 5.46869973621566, "grad_norm": 0.3640024662017822, "learning_rate": 0.0002430154916445528, "loss": 2.7606, "step": 85000 }, { "epoch": 5.533037380171138, "grad_norm": 0.3456322252750397, "learning_rate": 0.00024193968299505126, "loss": 2.767, "step": 86000 }, { "epoch": 5.597375024126617, "grad_norm": 0.3284786343574524, "learning_rate": 0.00024086495015419922, "loss": 2.7687, "step": 87000 }, { "epoch": 5.661712668082095, "grad_norm": 0.3351786732673645, "learning_rate": 0.00023978914150469768, "loss": 2.7705, "step": 88000 }, { "epoch": 5.726050312037573, "grad_norm": 0.3189627528190613, "learning_rate": 0.00023871440866384563, "loss": 2.7743, "step": 89000 }, { "epoch": 5.790387955993052, "grad_norm": 0.3447468876838684, "learning_rate": 0.0002376386000143441, "loss": 2.7712, "step": 90000 }, { "epoch": 5.85472559994853, "grad_norm": 0.3212040364742279, "learning_rate": 0.00023656386717349205, "loss": 2.7741, "step": 91000 }, { "epoch": 5.9190632439040085, "grad_norm": 0.3384701609611511, "learning_rate": 0.0002354880585239905, "loss": 2.7747, "step": 92000 }, { "epoch": 5.983400887859487, "grad_norm": 0.33266380429267883, "learning_rate": 0.00023441224987448894, "loss": 2.779, "step": 93000 }, { "epoch": 6.0, "eval_accuracy": 0.4206485843765424, "eval_loss": 2.969926595687866, "eval_runtime": 113.3066, "eval_samples_per_second": 462.815, "eval_steps_per_second": 7.237, "step": 93258 }, { "epoch": 6.047738531814965, "grad_norm": 0.35645657777786255, "learning_rate": 0.0002333364412249874, "loss": 2.7059, "step": 94000 }, { "epoch": 6.112076175770444, "grad_norm": 0.35733386874198914, "learning_rate": 0.0002322617083841354, "loss": 2.6966, "step": 95000 }, { "epoch": 6.176413819725922, "grad_norm": 0.3435540199279785, "learning_rate": 0.00023118589973463387, "loss": 2.6986, "step": 96000 }, { "epoch": 6.2407514636814, "grad_norm": 0.3479596972465515, "learning_rate": 0.0002301100910851323, "loss": 2.7068, "step": 97000 }, { "epoch": 6.305089107636879, "grad_norm": 0.3150424659252167, "learning_rate": 0.00022903428243563077, "loss": 2.7074, "step": 98000 }, { "epoch": 6.369426751592357, "grad_norm": 0.34055858850479126, "learning_rate": 0.00022795954959477872, "loss": 2.7065, "step": 99000 }, { "epoch": 6.433764395547835, "grad_norm": 0.3491341769695282, "learning_rate": 0.0002268848167539267, "loss": 2.7156, "step": 100000 }, { "epoch": 6.498102039503314, "grad_norm": 0.3347100019454956, "learning_rate": 0.00022580900810442514, "loss": 2.714, "step": 101000 }, { "epoch": 6.562439683458791, "grad_norm": 0.35210439562797546, "learning_rate": 0.00022473427526357312, "loss": 2.7194, "step": 102000 }, { "epoch": 6.6267773274142705, "grad_norm": 0.3326897919178009, "learning_rate": 0.00022365846661407155, "loss": 2.727, "step": 103000 }, { "epoch": 6.691114971369748, "grad_norm": 0.3269229531288147, "learning_rate": 0.00022258265796457, "loss": 2.7203, "step": 104000 }, { "epoch": 6.755452615325227, "grad_norm": 0.34183254837989807, "learning_rate": 0.00022150684931506847, "loss": 2.7328, "step": 105000 }, { "epoch": 6.819790259280705, "grad_norm": 0.33449244499206543, "learning_rate": 0.00022043211647421643, "loss": 2.7291, "step": 106000 }, { "epoch": 6.884127903236184, "grad_norm": 0.33734798431396484, "learning_rate": 0.0002193563078247149, "loss": 2.7277, "step": 107000 }, { "epoch": 6.948465547191661, "grad_norm": 0.34088870882987976, "learning_rate": 0.00021828157498386284, "loss": 2.7316, "step": 108000 }, { "epoch": 7.0, "eval_accuracy": 0.42224443247932275, "eval_loss": 2.958820104598999, "eval_runtime": 113.0743, "eval_samples_per_second": 463.766, "eval_steps_per_second": 7.252, "step": 108801 }, { "epoch": 7.01280319114714, "grad_norm": 0.3517482876777649, "learning_rate": 0.0002172057663343613, "loss": 2.7116, "step": 109000 }, { "epoch": 7.077140835102618, "grad_norm": 0.3411085903644562, "learning_rate": 0.00021613103349350926, "loss": 2.6469, "step": 110000 }, { "epoch": 7.1414784790580965, "grad_norm": 0.3486618399620056, "learning_rate": 0.00021505522484400772, "loss": 2.6546, "step": 111000 }, { "epoch": 7.205816123013575, "grad_norm": 0.35618531703948975, "learning_rate": 0.00021397941619450618, "loss": 2.6603, "step": 112000 }, { "epoch": 7.270153766969053, "grad_norm": 0.34740447998046875, "learning_rate": 0.00021290468335365413, "loss": 2.6632, "step": 113000 }, { "epoch": 7.334491410924532, "grad_norm": 0.339108407497406, "learning_rate": 0.0002118288747041526, "loss": 2.6682, "step": 114000 }, { "epoch": 7.39882905488001, "grad_norm": 0.36686399579048157, "learning_rate": 0.00021075306605465105, "loss": 2.6718, "step": 115000 }, { "epoch": 7.463166698835488, "grad_norm": 0.3336213529109955, "learning_rate": 0.000209678333213799, "loss": 2.6806, "step": 116000 }, { "epoch": 7.527504342790967, "grad_norm": 0.34256553649902344, "learning_rate": 0.00020860252456429747, "loss": 2.6772, "step": 117000 }, { "epoch": 7.591841986746445, "grad_norm": 0.3527204096317291, "learning_rate": 0.00020752671591479593, "loss": 2.6786, "step": 118000 }, { "epoch": 7.656179630701923, "grad_norm": 0.34285178780555725, "learning_rate": 0.0002064509072652944, "loss": 2.6816, "step": 119000 }, { "epoch": 7.720517274657402, "grad_norm": 0.3418208658695221, "learning_rate": 0.00020537617442444234, "loss": 2.6893, "step": 120000 }, { "epoch": 7.78485491861288, "grad_norm": 0.34486138820648193, "learning_rate": 0.0002043003657749408, "loss": 2.6847, "step": 121000 }, { "epoch": 7.8491925625683585, "grad_norm": 0.348530650138855, "learning_rate": 0.00020322563293408876, "loss": 2.6826, "step": 122000 }, { "epoch": 7.913530206523837, "grad_norm": 0.33808425068855286, "learning_rate": 0.00020215090009323674, "loss": 2.6905, "step": 123000 }, { "epoch": 7.977867850479315, "grad_norm": 0.3486366868019104, "learning_rate": 0.0002010750914437352, "loss": 2.6909, "step": 124000 }, { "epoch": 8.0, "eval_accuracy": 0.4232837528604119, "eval_loss": 2.9554243087768555, "eval_runtime": 111.9904, "eval_samples_per_second": 468.254, "eval_steps_per_second": 7.322, "step": 124344 }, { "epoch": 8.042205494434794, "grad_norm": 0.35380104184150696, "learning_rate": 0.00020000035860288316, "loss": 2.6303, "step": 125000 }, { "epoch": 8.106543138390272, "grad_norm": 0.3654320240020752, "learning_rate": 0.00019892454995338162, "loss": 2.6128, "step": 126000 }, { "epoch": 8.170880782345751, "grad_norm": 0.3670574724674225, "learning_rate": 0.00019784874130388008, "loss": 2.617, "step": 127000 }, { "epoch": 8.235218426301229, "grad_norm": 0.38059455156326294, "learning_rate": 0.00019677400846302803, "loss": 2.6274, "step": 128000 }, { "epoch": 8.299556070256708, "grad_norm": 0.3698261082172394, "learning_rate": 0.00019569927562217599, "loss": 2.6309, "step": 129000 }, { "epoch": 8.363893714212185, "grad_norm": 0.3583601117134094, "learning_rate": 0.00019462346697267445, "loss": 2.6312, "step": 130000 }, { "epoch": 8.428231358167665, "grad_norm": 0.3602234721183777, "learning_rate": 0.0001935476583231729, "loss": 2.6368, "step": 131000 }, { "epoch": 8.492569002123142, "grad_norm": 0.3441711664199829, "learning_rate": 0.00019247184967367137, "loss": 2.6372, "step": 132000 }, { "epoch": 8.556906646078621, "grad_norm": 0.3533187508583069, "learning_rate": 0.00019139604102416983, "loss": 2.6443, "step": 133000 }, { "epoch": 8.621244290034099, "grad_norm": 0.3579193651676178, "learning_rate": 0.00019032130818331778, "loss": 2.6481, "step": 134000 }, { "epoch": 8.685581933989578, "grad_norm": 0.3524502217769623, "learning_rate": 0.00018924549953381624, "loss": 2.6509, "step": 135000 }, { "epoch": 8.749919577945056, "grad_norm": 0.36159747838974, "learning_rate": 0.0001881707666929642, "loss": 2.6456, "step": 136000 }, { "epoch": 8.814257221900533, "grad_norm": 0.34249147772789, "learning_rate": 0.00018709495804346266, "loss": 2.6538, "step": 137000 }, { "epoch": 8.878594865856012, "grad_norm": 0.34867429733276367, "learning_rate": 0.0001860202252026106, "loss": 2.6558, "step": 138000 }, { "epoch": 8.942932509811492, "grad_norm": 0.3351230025291443, "learning_rate": 0.00018494441655310907, "loss": 2.6504, "step": 139000 }, { "epoch": 9.0, "eval_accuracy": 0.4238085730096768, "eval_loss": 2.9544410705566406, "eval_runtime": 112.4181, "eval_samples_per_second": 466.473, "eval_steps_per_second": 7.294, "step": 139887 }, { "epoch": 9.007270153766969, "grad_norm": 0.36276528239250183, "learning_rate": 0.00018386968371225703, "loss": 2.6469, "step": 140000 }, { "epoch": 9.071607797722447, "grad_norm": 0.36368831992149353, "learning_rate": 0.0001827938750627555, "loss": 2.5666, "step": 141000 }, { "epoch": 9.135945441677926, "grad_norm": 0.36417004466056824, "learning_rate": 0.00018171806641325395, "loss": 2.5832, "step": 142000 }, { "epoch": 9.200283085633403, "grad_norm": 0.3550620973110199, "learning_rate": 0.0001806422577637524, "loss": 2.5888, "step": 143000 }, { "epoch": 9.264620729588882, "grad_norm": 0.3513035178184509, "learning_rate": 0.00017956644911425084, "loss": 2.5872, "step": 144000 }, { "epoch": 9.32895837354436, "grad_norm": 0.3576969802379608, "learning_rate": 0.00017849279208204832, "loss": 2.599, "step": 145000 }, { "epoch": 9.39329601749984, "grad_norm": 0.3496710956096649, "learning_rate": 0.00017741698343254678, "loss": 2.6042, "step": 146000 }, { "epoch": 9.457633661455317, "grad_norm": 0.3502206802368164, "learning_rate": 0.00017634225059169476, "loss": 2.6069, "step": 147000 }, { "epoch": 9.521971305410796, "grad_norm": 0.3516786992549896, "learning_rate": 0.00017526644194219322, "loss": 2.606, "step": 148000 }, { "epoch": 9.586308949366273, "grad_norm": 0.3671824336051941, "learning_rate": 0.00017419063329269168, "loss": 2.6151, "step": 149000 }, { "epoch": 9.650646593321753, "grad_norm": 0.36615684628486633, "learning_rate": 0.00017311590045183964, "loss": 2.6174, "step": 150000 }, { "epoch": 9.71498423727723, "grad_norm": 0.369759202003479, "learning_rate": 0.0001720400918023381, "loss": 2.6162, "step": 151000 }, { "epoch": 9.77932188123271, "grad_norm": 0.3495037257671356, "learning_rate": 0.00017096428315283656, "loss": 2.6186, "step": 152000 }, { "epoch": 9.843659525188187, "grad_norm": 0.3635868728160858, "learning_rate": 0.0001698895503119845, "loss": 2.616, "step": 153000 }, { "epoch": 9.907997169143666, "grad_norm": 0.352250337600708, "learning_rate": 0.00016881374166248297, "loss": 2.626, "step": 154000 }, { "epoch": 9.972334813099144, "grad_norm": 0.3688776195049286, "learning_rate": 0.00016773900882163093, "loss": 2.6246, "step": 155000 }, { "epoch": 10.0, "eval_accuracy": 0.424411016885778, "eval_loss": 2.9523308277130127, "eval_runtime": 112.0636, "eval_samples_per_second": 467.948, "eval_steps_per_second": 7.317, "step": 155430 }, { "epoch": 10.036672457054623, "grad_norm": 0.3961314558982849, "learning_rate": 0.0001666632001721294, "loss": 2.5827, "step": 156000 }, { "epoch": 10.1010101010101, "grad_norm": 0.3705954849720001, "learning_rate": 0.00016558739152262782, "loss": 2.5413, "step": 157000 }, { "epoch": 10.16534774496558, "grad_norm": 0.37091416120529175, "learning_rate": 0.00016451158287312628, "loss": 2.5502, "step": 158000 }, { "epoch": 10.229685388921057, "grad_norm": 0.38428565859794617, "learning_rate": 0.00016343685003227424, "loss": 2.5592, "step": 159000 }, { "epoch": 10.294023032876536, "grad_norm": 0.3688577115535736, "learning_rate": 0.0001623610413827727, "loss": 2.5673, "step": 160000 }, { "epoch": 10.358360676832014, "grad_norm": 0.38183775544166565, "learning_rate": 0.00016128630854192065, "loss": 2.5697, "step": 161000 }, { "epoch": 10.422698320787493, "grad_norm": 0.37677517533302307, "learning_rate": 0.0001602104998924191, "loss": 2.5713, "step": 162000 }, { "epoch": 10.48703596474297, "grad_norm": 0.3694332540035248, "learning_rate": 0.00015913576705156707, "loss": 2.5751, "step": 163000 }, { "epoch": 10.55137360869845, "grad_norm": 0.3814958333969116, "learning_rate": 0.00015806103421071502, "loss": 2.5792, "step": 164000 }, { "epoch": 10.615711252653927, "grad_norm": 0.38280004262924194, "learning_rate": 0.00015698522556121348, "loss": 2.5782, "step": 165000 }, { "epoch": 10.680048896609406, "grad_norm": 0.3659280240535736, "learning_rate": 0.00015590941691171194, "loss": 2.5862, "step": 166000 }, { "epoch": 10.744386540564884, "grad_norm": 0.34562841057777405, "learning_rate": 0.0001548336082622104, "loss": 2.5869, "step": 167000 }, { "epoch": 10.808724184520363, "grad_norm": 0.3570345938205719, "learning_rate": 0.00015375887542135836, "loss": 2.59, "step": 168000 }, { "epoch": 10.87306182847584, "grad_norm": 0.360215961933136, "learning_rate": 0.00015268306677185682, "loss": 2.5979, "step": 169000 }, { "epoch": 10.93739947243132, "grad_norm": 0.370670884847641, "learning_rate": 0.00015160725812235528, "loss": 2.5988, "step": 170000 }, { "epoch": 11.0, "eval_accuracy": 0.4248191770987571, "eval_loss": 2.9567785263061523, "eval_runtime": 112.9375, "eval_samples_per_second": 464.328, "eval_steps_per_second": 7.261, "step": 170973 }, { "epoch": 11.001737116386797, "grad_norm": 0.38218948245048523, "learning_rate": 0.00015053252528150323, "loss": 2.5933, "step": 171000 }, { "epoch": 11.066074760342277, "grad_norm": 0.396331787109375, "learning_rate": 0.00014945671663200172, "loss": 2.5023, "step": 172000 }, { "epoch": 11.130412404297754, "grad_norm": 0.3751789927482605, "learning_rate": 0.00014838090798250018, "loss": 2.5227, "step": 173000 }, { "epoch": 11.194750048253233, "grad_norm": 0.37265828251838684, "learning_rate": 0.00014730509933299864, "loss": 2.5299, "step": 174000 }, { "epoch": 11.25908769220871, "grad_norm": 0.37080228328704834, "learning_rate": 0.0001462303664921466, "loss": 2.5333, "step": 175000 }, { "epoch": 11.32342533616419, "grad_norm": 0.3808966875076294, "learning_rate": 0.00014515563365129455, "loss": 2.5376, "step": 176000 }, { "epoch": 11.387762980119668, "grad_norm": 0.38901346921920776, "learning_rate": 0.000144079825001793, "loss": 2.5422, "step": 177000 }, { "epoch": 11.452100624075147, "grad_norm": 0.380100816488266, "learning_rate": 0.00014300401635229144, "loss": 2.5533, "step": 178000 }, { "epoch": 11.516438268030624, "grad_norm": 0.39306920766830444, "learning_rate": 0.0001419282077027899, "loss": 2.5507, "step": 179000 }, { "epoch": 11.580775911986104, "grad_norm": 0.3917422890663147, "learning_rate": 0.00014085239905328836, "loss": 2.5579, "step": 180000 }, { "epoch": 11.645113555941581, "grad_norm": 0.38742849230766296, "learning_rate": 0.00013977766621243632, "loss": 2.5531, "step": 181000 }, { "epoch": 11.70945119989706, "grad_norm": 0.3767852187156677, "learning_rate": 0.00013870185756293478, "loss": 2.5633, "step": 182000 }, { "epoch": 11.773788843852538, "grad_norm": 0.39576900005340576, "learning_rate": 0.00013762604891343324, "loss": 2.5648, "step": 183000 }, { "epoch": 11.838126487808017, "grad_norm": 0.37659791111946106, "learning_rate": 0.00013655131607258122, "loss": 2.5631, "step": 184000 }, { "epoch": 11.902464131763494, "grad_norm": 0.38377416133880615, "learning_rate": 0.00013547658323172918, "loss": 2.5631, "step": 185000 }, { "epoch": 11.966801775718974, "grad_norm": 0.37857234477996826, "learning_rate": 0.00013440077458222764, "loss": 2.5639, "step": 186000 }, { "epoch": 12.0, "eval_accuracy": 0.4247610714766456, "eval_loss": 2.9595353603363037, "eval_runtime": 112.0415, "eval_samples_per_second": 468.041, "eval_steps_per_second": 7.319, "step": 186516 }, { "epoch": 12.031139419674451, "grad_norm": 0.4024442136287689, "learning_rate": 0.0001333249659327261, "loss": 2.5273, "step": 187000 }, { "epoch": 12.09547706362993, "grad_norm": 0.4137458801269531, "learning_rate": 0.00013225023309187405, "loss": 2.4933, "step": 188000 }, { "epoch": 12.159814707585408, "grad_norm": 0.409184992313385, "learning_rate": 0.0001311744244423725, "loss": 2.4967, "step": 189000 }, { "epoch": 12.224152351540887, "grad_norm": 0.41316309571266174, "learning_rate": 0.00013009861579287097, "loss": 2.5063, "step": 190000 }, { "epoch": 12.288489995496365, "grad_norm": 0.3909110724925995, "learning_rate": 0.00012902280714336943, "loss": 2.5153, "step": 191000 }, { "epoch": 12.352827639451844, "grad_norm": 0.39046111702919006, "learning_rate": 0.0001279469984938679, "loss": 2.5115, "step": 192000 }, { "epoch": 12.417165283407321, "grad_norm": 0.40070855617523193, "learning_rate": 0.00012687226565301585, "loss": 2.5157, "step": 193000 }, { "epoch": 12.4815029273628, "grad_norm": 0.3970703184604645, "learning_rate": 0.00012579645700351428, "loss": 2.5198, "step": 194000 }, { "epoch": 12.545840571318278, "grad_norm": 0.40202242136001587, "learning_rate": 0.00012472064835401274, "loss": 2.526, "step": 195000 }, { "epoch": 12.610178215273757, "grad_norm": 0.3841732144355774, "learning_rate": 0.0001236459155131607, "loss": 2.5295, "step": 196000 }, { "epoch": 12.674515859229235, "grad_norm": 0.40759024024009705, "learning_rate": 0.00012257010686365916, "loss": 2.5307, "step": 197000 }, { "epoch": 12.738853503184714, "grad_norm": 0.3963831663131714, "learning_rate": 0.00012149429821415763, "loss": 2.534, "step": 198000 }, { "epoch": 12.803191147140192, "grad_norm": 0.37255486845970154, "learning_rate": 0.0001204195653733056, "loss": 2.5354, "step": 199000 }, { "epoch": 12.86752879109567, "grad_norm": 0.397368460893631, "learning_rate": 0.00011934375672380406, "loss": 2.5352, "step": 200000 }, { "epoch": 12.931866435051148, "grad_norm": 0.379574716091156, "learning_rate": 0.00011826902388295201, "loss": 2.5397, "step": 201000 }, { "epoch": 12.996204079006628, "grad_norm": 0.3803842067718506, "learning_rate": 0.00011719321523345048, "loss": 2.5361, "step": 202000 }, { "epoch": 13.0, "eval_accuracy": 0.42475613586395655, "eval_loss": 2.9698119163513184, "eval_runtime": 112.2708, "eval_samples_per_second": 467.085, "eval_steps_per_second": 7.304, "step": 202059 }, { "epoch": 13.060541722962105, "grad_norm": 0.40816885232925415, "learning_rate": 0.00011611740658394894, "loss": 2.4669, "step": 203000 }, { "epoch": 13.124879366917584, "grad_norm": 0.42818671464920044, "learning_rate": 0.00011504159793444738, "loss": 2.467, "step": 204000 }, { "epoch": 13.189217010873062, "grad_norm": 0.40255987644195557, "learning_rate": 0.00011396686509359535, "loss": 2.4753, "step": 205000 }, { "epoch": 13.253554654828541, "grad_norm": 0.4254453778266907, "learning_rate": 0.0001128921322527433, "loss": 2.4808, "step": 206000 }, { "epoch": 13.317892298784018, "grad_norm": 0.4060657322406769, "learning_rate": 0.00011181632360324175, "loss": 2.4932, "step": 207000 }, { "epoch": 13.382229942739498, "grad_norm": 0.4138365387916565, "learning_rate": 0.00011074051495374021, "loss": 2.4922, "step": 208000 }, { "epoch": 13.446567586694975, "grad_norm": 0.4098254442214966, "learning_rate": 0.00010966578211288817, "loss": 2.4948, "step": 209000 }, { "epoch": 13.510905230650454, "grad_norm": 0.4242159128189087, "learning_rate": 0.00010858997346338663, "loss": 2.5012, "step": 210000 }, { "epoch": 13.575242874605932, "grad_norm": 0.42177829146385193, "learning_rate": 0.00010751416481388509, "loss": 2.4998, "step": 211000 }, { "epoch": 13.63958051856141, "grad_norm": 0.4196189045906067, "learning_rate": 0.00010643943197303304, "loss": 2.5048, "step": 212000 }, { "epoch": 13.703918162516889, "grad_norm": 0.3965640366077423, "learning_rate": 0.0001053636233235315, "loss": 2.5092, "step": 213000 }, { "epoch": 13.768255806472368, "grad_norm": 0.39778339862823486, "learning_rate": 0.00010428781467402996, "loss": 2.5121, "step": 214000 }, { "epoch": 13.832593450427845, "grad_norm": 0.40292391180992126, "learning_rate": 0.00010321308183317793, "loss": 2.5119, "step": 215000 }, { "epoch": 13.896931094383323, "grad_norm": 0.41673198342323303, "learning_rate": 0.00010213727318367639, "loss": 2.5112, "step": 216000 }, { "epoch": 13.961268738338802, "grad_norm": 0.40400612354278564, "learning_rate": 0.00010106254034282435, "loss": 2.5098, "step": 217000 }, { "epoch": 14.0, "eval_accuracy": 0.424743796832234, "eval_loss": 2.9747180938720703, "eval_runtime": 112.0802, "eval_samples_per_second": 467.879, "eval_steps_per_second": 7.316, "step": 217602 }, { "epoch": 14.02560638229428, "grad_norm": 0.40745100378990173, "learning_rate": 9.998673169332281e-05, "loss": 2.4894, "step": 218000 }, { "epoch": 14.089944026249759, "grad_norm": 0.42399463057518005, "learning_rate": 9.891092304382127e-05, "loss": 2.449, "step": 219000 }, { "epoch": 14.154281670205236, "grad_norm": 0.4149724841117859, "learning_rate": 9.783511439431973e-05, "loss": 2.4534, "step": 220000 }, { "epoch": 14.218619314160716, "grad_norm": 0.40756285190582275, "learning_rate": 9.676145736211718e-05, "loss": 2.4576, "step": 221000 }, { "epoch": 14.282956958116193, "grad_norm": 0.4224795997142792, "learning_rate": 9.568564871261564e-05, "loss": 2.4584, "step": 222000 }, { "epoch": 14.347294602071672, "grad_norm": 0.41213053464889526, "learning_rate": 9.461091587176359e-05, "loss": 2.4707, "step": 223000 }, { "epoch": 14.41163224602715, "grad_norm": 0.4161031246185303, "learning_rate": 9.353510722226205e-05, "loss": 2.4701, "step": 224000 }, { "epoch": 14.475969889982629, "grad_norm": 0.42417025566101074, "learning_rate": 9.245929857276051e-05, "loss": 2.4706, "step": 225000 }, { "epoch": 14.540307533938106, "grad_norm": 0.4227360785007477, "learning_rate": 9.138348992325897e-05, "loss": 2.4678, "step": 226000 }, { "epoch": 14.604645177893586, "grad_norm": 0.3956305682659149, "learning_rate": 9.030768127375742e-05, "loss": 2.4816, "step": 227000 }, { "epoch": 14.668982821849063, "grad_norm": 0.42013561725616455, "learning_rate": 8.92329484329054e-05, "loss": 2.4791, "step": 228000 }, { "epoch": 14.733320465804542, "grad_norm": 0.41232335567474365, "learning_rate": 8.815713978340386e-05, "loss": 2.4861, "step": 229000 }, { "epoch": 14.79765810976002, "grad_norm": 0.398253858089447, "learning_rate": 8.708240694255182e-05, "loss": 2.4857, "step": 230000 }, { "epoch": 14.8619957537155, "grad_norm": 0.41056615114212036, "learning_rate": 8.600659829305028e-05, "loss": 2.4826, "step": 231000 }, { "epoch": 14.926333397670977, "grad_norm": 0.4065124988555908, "learning_rate": 8.493186545219823e-05, "loss": 2.4791, "step": 232000 }, { "epoch": 14.990671041626456, "grad_norm": 0.42194780707359314, "learning_rate": 8.385605680269669e-05, "loss": 2.4899, "step": 233000 }, { "epoch": 15.0, "eval_accuracy": 0.4246625087868862, "eval_loss": 2.9792003631591797, "eval_runtime": 112.3403, "eval_samples_per_second": 466.796, "eval_steps_per_second": 7.299, "step": 233145 }, { "epoch": 15.055008685581933, "grad_norm": 0.444181889295578, "learning_rate": 8.278024815319514e-05, "loss": 2.4309, "step": 234000 }, { "epoch": 15.119346329537413, "grad_norm": 0.4177301526069641, "learning_rate": 8.17044395036936e-05, "loss": 2.4254, "step": 235000 }, { "epoch": 15.18368397349289, "grad_norm": 0.43864157795906067, "learning_rate": 8.062970666284155e-05, "loss": 2.432, "step": 236000 }, { "epoch": 15.24802161744837, "grad_norm": 0.43071264028549194, "learning_rate": 7.955497382198951e-05, "loss": 2.4372, "step": 237000 }, { "epoch": 15.312359261403847, "grad_norm": 0.44551989436149597, "learning_rate": 7.847916517248797e-05, "loss": 2.4441, "step": 238000 }, { "epoch": 15.376696905359326, "grad_norm": 0.42598387598991394, "learning_rate": 7.740335652298643e-05, "loss": 2.4448, "step": 239000 }, { "epoch": 15.441034549314804, "grad_norm": 0.4412069618701935, "learning_rate": 7.632754787348489e-05, "loss": 2.4481, "step": 240000 }, { "epoch": 15.505372193270283, "grad_norm": 0.4257245361804962, "learning_rate": 7.525173922398335e-05, "loss": 2.4496, "step": 241000 }, { "epoch": 15.56970983722576, "grad_norm": 0.4463740885257721, "learning_rate": 7.417593057448181e-05, "loss": 2.4583, "step": 242000 }, { "epoch": 15.63404748118124, "grad_norm": 0.40843266248703003, "learning_rate": 7.310119773362977e-05, "loss": 2.4549, "step": 243000 }, { "epoch": 15.698385125136717, "grad_norm": 0.43823161721229553, "learning_rate": 7.202538908412823e-05, "loss": 2.4565, "step": 244000 }, { "epoch": 15.762722769092196, "grad_norm": 0.4224304258823395, "learning_rate": 7.09506562432762e-05, "loss": 2.4664, "step": 245000 }, { "epoch": 15.827060413047674, "grad_norm": 0.42779698967933655, "learning_rate": 6.987484759377464e-05, "loss": 2.4607, "step": 246000 }, { "epoch": 15.891398057003153, "grad_norm": 0.41904374957084656, "learning_rate": 6.880011475292261e-05, "loss": 2.463, "step": 247000 }, { "epoch": 15.95573570095863, "grad_norm": 0.4636126458644867, "learning_rate": 6.772430610342107e-05, "loss": 2.4626, "step": 248000 }, { "epoch": 16.0, "eval_accuracy": 0.4244173733566653, "eval_loss": 2.9882283210754395, "eval_runtime": 112.1119, "eval_samples_per_second": 467.747, "eval_steps_per_second": 7.314, "step": 248688 }, { "epoch": 16.02007334491411, "grad_norm": 0.44689562916755676, "learning_rate": 6.664849745391953e-05, "loss": 2.4432, "step": 249000 }, { "epoch": 16.08441098886959, "grad_norm": 0.45889049768447876, "learning_rate": 6.557376461306749e-05, "loss": 2.4048, "step": 250000 }, { "epoch": 16.148748632825065, "grad_norm": 0.4538269639015198, "learning_rate": 6.449795596356593e-05, "loss": 2.4123, "step": 251000 }, { "epoch": 16.213086276780544, "grad_norm": 0.44775742292404175, "learning_rate": 6.342214731406439e-05, "loss": 2.4137, "step": 252000 }, { "epoch": 16.277423920736023, "grad_norm": 0.4506843090057373, "learning_rate": 6.234741447321236e-05, "loss": 2.4152, "step": 253000 }, { "epoch": 16.341761564691502, "grad_norm": 0.4564642310142517, "learning_rate": 6.127160582371082e-05, "loss": 2.4236, "step": 254000 }, { "epoch": 16.406099208646978, "grad_norm": 0.4492376744747162, "learning_rate": 6.0195797174209275e-05, "loss": 2.4222, "step": 255000 }, { "epoch": 16.470436852602457, "grad_norm": 0.44002753496170044, "learning_rate": 5.9119988524707736e-05, "loss": 2.4277, "step": 256000 }, { "epoch": 16.534774496557937, "grad_norm": 0.437580406665802, "learning_rate": 5.8044179875206196e-05, "loss": 2.4303, "step": 257000 }, { "epoch": 16.599112140513416, "grad_norm": 0.42502424120903015, "learning_rate": 5.697052284300365e-05, "loss": 2.4359, "step": 258000 }, { "epoch": 16.66344978446889, "grad_norm": 0.44441190361976624, "learning_rate": 5.5894714193502106e-05, "loss": 2.4306, "step": 259000 }, { "epoch": 16.72778742842437, "grad_norm": 0.4539526700973511, "learning_rate": 5.4818905544000566e-05, "loss": 2.4342, "step": 260000 }, { "epoch": 16.79212507237985, "grad_norm": 0.4554595947265625, "learning_rate": 5.374417270314853e-05, "loss": 2.4388, "step": 261000 }, { "epoch": 16.85646271633533, "grad_norm": 0.4573330283164978, "learning_rate": 5.266836405364699e-05, "loss": 2.441, "step": 262000 }, { "epoch": 16.920800360290805, "grad_norm": 0.449770450592041, "learning_rate": 5.159255540414545e-05, "loss": 2.4411, "step": 263000 }, { "epoch": 16.985138004246284, "grad_norm": 0.48139625787734985, "learning_rate": 5.05178225632934e-05, "loss": 2.4399, "step": 264000 }, { "epoch": 17.0, "eval_accuracy": 0.4242649676193895, "eval_loss": 2.9961202144622803, "eval_runtime": 112.3106, "eval_samples_per_second": 466.919, "eval_steps_per_second": 7.301, "step": 264231 }, { "epoch": 17.049475648201764, "grad_norm": 0.4543195366859436, "learning_rate": 4.9442013913791863e-05, "loss": 2.4013, "step": 265000 }, { "epoch": 17.113813292157243, "grad_norm": 0.4699794054031372, "learning_rate": 4.836620526429032e-05, "loss": 2.3928, "step": 266000 }, { "epoch": 17.17815093611272, "grad_norm": 0.4636929929256439, "learning_rate": 4.7291472423438285e-05, "loss": 2.3989, "step": 267000 }, { "epoch": 17.242488580068198, "grad_norm": 0.4614698886871338, "learning_rate": 4.6215663773936746e-05, "loss": 2.4004, "step": 268000 }, { "epoch": 17.306826224023677, "grad_norm": 0.46002906560897827, "learning_rate": 4.513985512443519e-05, "loss": 2.3982, "step": 269000 }, { "epoch": 17.371163867979156, "grad_norm": 0.42619064450263977, "learning_rate": 4.4065122283583154e-05, "loss": 2.4053, "step": 270000 }, { "epoch": 17.435501511934632, "grad_norm": 0.45975300669670105, "learning_rate": 4.2989313634081614e-05, "loss": 2.4041, "step": 271000 }, { "epoch": 17.49983915589011, "grad_norm": 0.4545740485191345, "learning_rate": 4.1913504984580075e-05, "loss": 2.406, "step": 272000 }, { "epoch": 17.56417679984559, "grad_norm": 0.458011269569397, "learning_rate": 4.083769633507853e-05, "loss": 2.4168, "step": 273000 }, { "epoch": 17.62851444380107, "grad_norm": 0.4604107439517975, "learning_rate": 3.976296349422649e-05, "loss": 2.411, "step": 274000 }, { "epoch": 17.692852087756545, "grad_norm": 0.4420773684978485, "learning_rate": 3.8687154844724944e-05, "loss": 2.4144, "step": 275000 }, { "epoch": 17.757189731712025, "grad_norm": 0.45774900913238525, "learning_rate": 3.7611346195223404e-05, "loss": 2.412, "step": 276000 }, { "epoch": 17.821527375667504, "grad_norm": 0.4509606659412384, "learning_rate": 3.6536613354371366e-05, "loss": 2.4086, "step": 277000 }, { "epoch": 17.885865019622983, "grad_norm": 0.4442935883998871, "learning_rate": 3.5460804704869826e-05, "loss": 2.4134, "step": 278000 }, { "epoch": 17.95020266357846, "grad_norm": 0.42292436957359314, "learning_rate": 3.438607186401778e-05, "loss": 2.4186, "step": 279000 }, { "epoch": 18.0, "eval_accuracy": 0.42388941236296196, "eval_loss": 3.0051016807556152, "eval_runtime": 112.3705, "eval_samples_per_second": 466.67, "eval_steps_per_second": 7.297, "step": 279774 }, { "epoch": 18.014540307533938, "grad_norm": 0.48824623227119446, "learning_rate": 3.331026321451624e-05, "loss": 2.4055, "step": 280000 }, { "epoch": 18.078877951489417, "grad_norm": 0.46934977173805237, "learning_rate": 3.22355303736642e-05, "loss": 2.3736, "step": 281000 }, { "epoch": 18.143215595444893, "grad_norm": 0.5045217275619507, "learning_rate": 3.115972172416266e-05, "loss": 2.382, "step": 282000 }, { "epoch": 18.207553239400372, "grad_norm": 0.46461954712867737, "learning_rate": 3.008391307466112e-05, "loss": 2.3806, "step": 283000 }, { "epoch": 18.27189088335585, "grad_norm": 0.4565331041812897, "learning_rate": 2.9009180233809078e-05, "loss": 2.3813, "step": 284000 }, { "epoch": 18.33622852731133, "grad_norm": 0.4561784863471985, "learning_rate": 2.793337158430754e-05, "loss": 2.3863, "step": 285000 }, { "epoch": 18.400566171266806, "grad_norm": 0.4438989758491516, "learning_rate": 2.6858638743455493e-05, "loss": 2.3845, "step": 286000 }, { "epoch": 18.464903815222286, "grad_norm": 0.461086630821228, "learning_rate": 2.578283009395395e-05, "loss": 2.3833, "step": 287000 }, { "epoch": 18.529241459177765, "grad_norm": 0.4639764726161957, "learning_rate": 2.470702144445241e-05, "loss": 2.3918, "step": 288000 }, { "epoch": 18.593579103133244, "grad_norm": 0.4645422697067261, "learning_rate": 2.3631212794950868e-05, "loss": 2.3953, "step": 289000 }, { "epoch": 18.65791674708872, "grad_norm": 0.47392553091049194, "learning_rate": 2.2555404145449328e-05, "loss": 2.3829, "step": 290000 }, { "epoch": 18.7222543910442, "grad_norm": 0.4530762732028961, "learning_rate": 2.148174711324679e-05, "loss": 2.3904, "step": 291000 }, { "epoch": 18.78659203499968, "grad_norm": 0.47473639249801636, "learning_rate": 2.0405938463745248e-05, "loss": 2.3966, "step": 292000 }, { "epoch": 18.850929678955158, "grad_norm": 0.43500351905822754, "learning_rate": 1.9330129814243705e-05, "loss": 2.396, "step": 293000 }, { "epoch": 18.915267322910633, "grad_norm": 0.45157596468925476, "learning_rate": 1.8254321164742165e-05, "loss": 2.3959, "step": 294000 }, { "epoch": 18.979604966866113, "grad_norm": 0.4546051621437073, "learning_rate": 1.7180664132539624e-05, "loss": 2.3869, "step": 295000 }, { "epoch": 19.0, "eval_accuracy": 0.42373925008599933, "eval_loss": 3.011887311935425, "eval_runtime": 112.2632, "eval_samples_per_second": 467.116, "eval_steps_per_second": 7.304, "step": 295317 }, { "epoch": 19.043942610821592, "grad_norm": 0.46901893615722656, "learning_rate": 1.610485548303808e-05, "loss": 2.3726, "step": 296000 }, { "epoch": 19.10828025477707, "grad_norm": 0.43862438201904297, "learning_rate": 1.502904683353654e-05, "loss": 2.3688, "step": 297000 }, { "epoch": 19.172617898732547, "grad_norm": 0.4580424427986145, "learning_rate": 1.3954313992684501e-05, "loss": 2.3682, "step": 298000 }, { "epoch": 19.236955542688026, "grad_norm": 0.47557470202445984, "learning_rate": 1.2878505343182957e-05, "loss": 2.3687, "step": 299000 }, { "epoch": 19.301293186643505, "grad_norm": 0.48615992069244385, "learning_rate": 1.1802696693681415e-05, "loss": 2.3636, "step": 300000 }, { "epoch": 19.365630830598985, "grad_norm": 0.5019800662994385, "learning_rate": 1.0726888044179874e-05, "loss": 2.3668, "step": 301000 }, { "epoch": 19.42996847455446, "grad_norm": 0.4481401741504669, "learning_rate": 9.652155203327834e-06, "loss": 2.3721, "step": 302000 }, { "epoch": 19.49430611850994, "grad_norm": 0.4632056653499603, "learning_rate": 8.577422362475793e-06, "loss": 2.372, "step": 303000 }, { "epoch": 19.55864376246542, "grad_norm": 0.4590476453304291, "learning_rate": 7.5016137129742514e-06, "loss": 2.3725, "step": 304000 }, { "epoch": 19.622981406420898, "grad_norm": 0.4774569272994995, "learning_rate": 6.42580506347271e-06, "loss": 2.37, "step": 305000 }, { "epoch": 19.687319050376374, "grad_norm": 0.47048863768577576, "learning_rate": 5.351072222620669e-06, "loss": 2.371, "step": 306000 }, { "epoch": 19.751656694331853, "grad_norm": 0.4567144215106964, "learning_rate": 4.275263573119128e-06, "loss": 2.3706, "step": 307000 }, { "epoch": 19.815994338287332, "grad_norm": 0.4492277503013611, "learning_rate": 3.200530732267087e-06, "loss": 2.3714, "step": 308000 }, { "epoch": 19.88033198224281, "grad_norm": 0.44562822580337524, "learning_rate": 2.1247220827655454e-06, "loss": 2.3711, "step": 309000 }, { "epoch": 19.944669626198287, "grad_norm": 0.4758046269416809, "learning_rate": 1.0499892419135049e-06, "loss": 2.3686, "step": 310000 }, { "epoch": 20.0, "eval_accuracy": 0.4234014597448438, "eval_loss": 3.0190186500549316, "eval_runtime": 112.3566, "eval_samples_per_second": 466.728, "eval_steps_per_second": 7.298, "step": 310860 }, { "epoch": 20.0, "step": 310860, "total_flos": 1.29957250203648e+18, "train_loss": 2.7038125842232534, "train_runtime": 44110.0674, "train_samples_per_second": 225.51, "train_steps_per_second": 7.047 } ], "logging_steps": 1000, "max_steps": 310860, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.29957250203648e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }