{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 20.0,
  "eval_steps": 500,
  "global_step": 310860,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.06433764395547835,
      "grad_norm": 0.9469536542892456,
      "learning_rate": 9.375e-06,
      "loss": 7.0597,
      "step": 1000
    },
    {
      "epoch": 0.1286752879109567,
      "grad_norm": 0.9877486824989319,
      "learning_rate": 1.875e-05,
      "loss": 4.8574,
      "step": 2000
    },
    {
      "epoch": 0.19301293186643506,
      "grad_norm": 1.1811100244522095,
      "learning_rate": 2.8125e-05,
      "loss": 4.5464,
      "step": 3000
    },
    {
      "epoch": 0.2573505758219134,
      "grad_norm": 1.155553936958313,
      "learning_rate": 3.75e-05,
      "loss": 4.3086,
      "step": 4000
    },
    {
      "epoch": 0.32168821977739176,
      "grad_norm": 1.0062898397445679,
      "learning_rate": 4.6874999999999994e-05,
      "loss": 4.1307,
      "step": 5000
    },
    {
      "epoch": 0.3860258637328701,
      "grad_norm": 0.9749443531036377,
      "learning_rate": 5.625e-05,
      "loss": 3.986,
      "step": 6000
    },
    {
      "epoch": 0.45036350768834843,
      "grad_norm": 0.9870838522911072,
      "learning_rate": 6.5625e-05,
      "loss": 3.8708,
      "step": 7000
    },
    {
      "epoch": 0.5147011516438268,
      "grad_norm": 1.0740858316421509,
      "learning_rate": 7.5e-05,
      "loss": 3.781,
      "step": 8000
    },
    {
      "epoch": 0.5790387955993052,
      "grad_norm": 0.969571053981781,
      "learning_rate": 8.437499999999999e-05,
      "loss": 3.6942,
      "step": 9000
    },
    {
      "epoch": 0.6433764395547835,
      "grad_norm": 0.923062801361084,
      "learning_rate": 9.374999999999999e-05,
      "loss": 3.6225,
      "step": 10000
    },
    {
      "epoch": 0.7077140835102619,
      "grad_norm": 0.87486732006073,
      "learning_rate": 0.00010312499999999999,
      "loss": 3.5667,
      "step": 11000
    },
    {
      "epoch": 0.7720517274657402,
      "grad_norm": 0.8343172073364258,
      "learning_rate": 0.000112490625,
      "loss": 3.5107,
      "step": 12000
    },
    {
      "epoch": 0.8363893714212186,
      "grad_norm": 0.8089198470115662,
      "learning_rate": 0.000121865625,
      "loss": 3.4681,
      "step": 13000
    },
    {
      "epoch": 0.9007270153766969,
      "grad_norm": 0.8141182661056519,
      "learning_rate": 0.00013123125,
      "loss": 3.4337,
      "step": 14000
    },
    {
      "epoch": 0.9650646593321752,
      "grad_norm": 0.7596079707145691,
      "learning_rate": 0.00014060625,
      "loss": 3.3944,
      "step": 15000
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.37339323372369543,
      "eval_loss": 3.4211647510528564,
      "eval_runtime": 111.2734,
      "eval_samples_per_second": 471.271,
      "eval_steps_per_second": 7.369,
      "step": 15543
    },
    {
      "epoch": 1.0294023032876536,
      "grad_norm": 0.7583508491516113,
      "learning_rate": 0.000149971875,
      "loss": 3.345,
      "step": 16000
    },
    {
      "epoch": 1.093739947243132,
      "grad_norm": 0.7395954728126526,
      "learning_rate": 0.00015933749999999996,
      "loss": 3.3182,
      "step": 17000
    },
    {
      "epoch": 1.1580775911986103,
      "grad_norm": 0.7119142413139343,
      "learning_rate": 0.00016871249999999996,
      "loss": 3.304,
      "step": 18000
    },
    {
      "epoch": 1.2224152351540887,
      "grad_norm": 0.7133814692497253,
      "learning_rate": 0.00017808749999999999,
      "loss": 3.2808,
      "step": 19000
    },
    {
      "epoch": 1.286752879109567,
      "grad_norm": 0.6662284731864929,
      "learning_rate": 0.00018745312499999998,
      "loss": 3.2624,
      "step": 20000
    },
    {
      "epoch": 1.3510905230650454,
      "grad_norm": 0.6821054816246033,
      "learning_rate": 0.00019682812499999998,
      "loss": 3.2468,
      "step": 21000
    },
    {
      "epoch": 1.4154281670205238,
      "grad_norm": 0.6423399448394775,
      "learning_rate": 0.00020619374999999998,
      "loss": 3.2323,
      "step": 22000
    },
    {
      "epoch": 1.4797658109760021,
      "grad_norm": 0.6489351987838745,
      "learning_rate": 0.00021556874999999998,
      "loss": 3.218,
      "step": 23000
    },
    {
      "epoch": 1.5441034549314803,
      "grad_norm": 0.6388360261917114,
      "learning_rate": 0.00022493437499999998,
      "loss": 3.2063,
      "step": 24000
    },
    {
      "epoch": 1.6084410988869586,
      "grad_norm": 0.6035541296005249,
      "learning_rate": 0.00023430937499999997,
      "loss": 3.1971,
      "step": 25000
    },
    {
      "epoch": 1.672778742842437,
      "grad_norm": 0.5949345231056213,
      "learning_rate": 0.00024367499999999997,
      "loss": 3.1683,
      "step": 26000
    },
    {
      "epoch": 1.7371163867979154,
      "grad_norm": 0.5953760147094727,
      "learning_rate": 0.00025305,
      "loss": 3.1728,
      "step": 27000
    },
    {
      "epoch": 1.8014540307533937,
      "grad_norm": 0.5276063680648804,
      "learning_rate": 0.000262415625,
      "loss": 3.1607,
      "step": 28000
    },
    {
      "epoch": 1.865791674708872,
      "grad_norm": 0.5257272124290466,
      "learning_rate": 0.000271790625,
      "loss": 3.1472,
      "step": 29000
    },
    {
      "epoch": 1.9301293186643504,
      "grad_norm": 0.49043259024620056,
      "learning_rate": 0.000281165625,
      "loss": 3.1367,
      "step": 30000
    },
    {
      "epoch": 1.9944669626198288,
      "grad_norm": 0.5030378699302673,
      "learning_rate": 0.000290521875,
      "loss": 3.1245,
      "step": 31000
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.3939620256950988,
      "eval_loss": 3.2037432193756104,
      "eval_runtime": 111.7392,
      "eval_samples_per_second": 469.307,
      "eval_steps_per_second": 7.339,
      "step": 31086
    },
    {
      "epoch": 2.058804606575307,
      "grad_norm": 0.5003546476364136,
      "learning_rate": 0.000299896875,
      "loss": 3.0828,
      "step": 32000
    },
    {
      "epoch": 2.1231422505307855,
      "grad_norm": 0.48286330699920654,
      "learning_rate": 0.00029893602524564295,
      "loss": 3.08,
      "step": 33000
    },
    {
      "epoch": 2.187479894486264,
      "grad_norm": 0.4852472245693207,
      "learning_rate": 0.0002978602165961414,
      "loss": 3.0633,
      "step": 34000
    },
    {
      "epoch": 2.2518175384417423,
      "grad_norm": 0.4629572927951813,
      "learning_rate": 0.00029678548375528934,
      "loss": 3.063,
      "step": 35000
    },
    {
      "epoch": 2.3161551823972206,
      "grad_norm": 0.4571368992328644,
      "learning_rate": 0.0002957096751057878,
      "loss": 3.0453,
      "step": 36000
    },
    {
      "epoch": 2.380492826352699,
      "grad_norm": 0.44331055879592896,
      "learning_rate": 0.0002946349422649358,
      "loss": 3.0408,
      "step": 37000
    },
    {
      "epoch": 2.4448304703081774,
      "grad_norm": 0.4230923354625702,
      "learning_rate": 0.00029355913361543424,
      "loss": 3.0359,
      "step": 38000
    },
    {
      "epoch": 2.5091681142636557,
      "grad_norm": 0.4260108768939972,
      "learning_rate": 0.0002924833249659327,
      "loss": 3.0316,
      "step": 39000
    },
    {
      "epoch": 2.573505758219134,
      "grad_norm": 0.41887935996055603,
      "learning_rate": 0.0002914085921250807,
      "loss": 3.0299,
      "step": 40000
    },
    {
      "epoch": 2.6378434021746124,
      "grad_norm": 0.41068920493125916,
      "learning_rate": 0.00029033278347557914,
      "loss": 3.0138,
      "step": 41000
    },
    {
      "epoch": 2.702181046130091,
      "grad_norm": 0.39430394768714905,
      "learning_rate": 0.0002892591264433766,
      "loss": 3.0038,
      "step": 42000
    },
    {
      "epoch": 2.766518690085569,
      "grad_norm": 0.4100017547607422,
      "learning_rate": 0.00028818331779387505,
      "loss": 3.0088,
      "step": 43000
    },
    {
      "epoch": 2.8308563340410475,
      "grad_norm": 0.4101816415786743,
      "learning_rate": 0.0002871075091443735,
      "loss": 2.9937,
      "step": 44000
    },
    {
      "epoch": 2.895193977996526,
      "grad_norm": 0.38294607400894165,
      "learning_rate": 0.000286031700494872,
      "loss": 2.9898,
      "step": 45000
    },
    {
      "epoch": 2.9595316219520043,
      "grad_norm": 0.37260037660598755,
      "learning_rate": 0.00028495589184537043,
      "loss": 2.9807,
      "step": 46000
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.40728960081362825,
      "eval_loss": 3.079435110092163,
      "eval_runtime": 111.6836,
      "eval_samples_per_second": 469.541,
      "eval_steps_per_second": 7.342,
      "step": 46629
    },
    {
      "epoch": 3.0238692659074826,
      "grad_norm": 0.40783312916755676,
      "learning_rate": 0.00028388115900451836,
      "loss": 2.9534,
      "step": 47000
    },
    {
      "epoch": 3.088206909862961,
      "grad_norm": 0.38361623883247375,
      "learning_rate": 0.0002828053503550168,
      "loss": 2.9122,
      "step": 48000
    },
    {
      "epoch": 3.1525445538184393,
      "grad_norm": 0.3766690790653229,
      "learning_rate": 0.0002817306175141648,
      "loss": 2.9119,
      "step": 49000
    },
    {
      "epoch": 3.2168821977739177,
      "grad_norm": 0.38536399602890015,
      "learning_rate": 0.00028065480886466326,
      "loss": 2.9135,
      "step": 50000
    },
    {
      "epoch": 3.2812198417293956,
      "grad_norm": 0.38374361395835876,
      "learning_rate": 0.0002795800760238112,
      "loss": 2.9094,
      "step": 51000
    },
    {
      "epoch": 3.345557485684874,
      "grad_norm": 0.3872029185295105,
      "learning_rate": 0.00027850426737430965,
      "loss": 2.9069,
      "step": 52000
    },
    {
      "epoch": 3.4098951296403524,
      "grad_norm": 0.37401601672172546,
      "learning_rate": 0.0002774284587248081,
      "loss": 2.9092,
      "step": 53000
    },
    {
      "epoch": 3.4742327735958307,
      "grad_norm": 0.34951257705688477,
      "learning_rate": 0.0002763537258839561,
      "loss": 2.9124,
      "step": 54000
    },
    {
      "epoch": 3.538570417551309,
      "grad_norm": 0.36252182722091675,
      "learning_rate": 0.000275278993043104,
      "loss": 2.9086,
      "step": 55000
    },
    {
      "epoch": 3.6029080615067874,
      "grad_norm": 0.3610841631889343,
      "learning_rate": 0.0002742031843936025,
      "loss": 2.9044,
      "step": 56000
    },
    {
      "epoch": 3.667245705462266,
      "grad_norm": 0.356315016746521,
      "learning_rate": 0.00027312737574410094,
      "loss": 2.8984,
      "step": 57000
    },
    {
      "epoch": 3.731583349417744,
      "grad_norm": 0.3501368761062622,
      "learning_rate": 0.0002720515670945994,
      "loss": 2.8991,
      "step": 58000
    },
    {
      "epoch": 3.7959209933732225,
      "grad_norm": 0.3654986619949341,
      "learning_rate": 0.00027097575844509786,
      "loss": 2.8946,
      "step": 59000
    },
    {
      "epoch": 3.860258637328701,
      "grad_norm": 0.34233444929122925,
      "learning_rate": 0.00026990102560424585,
      "loss": 2.8981,
      "step": 60000
    },
    {
      "epoch": 3.9245962812841793,
      "grad_norm": 0.35118335485458374,
      "learning_rate": 0.0002688252169547443,
      "loss": 2.8932,
      "step": 61000
    },
    {
      "epoch": 3.9889339252396576,
      "grad_norm": 0.3542274236679077,
      "learning_rate": 0.00026775048411389223,
      "loss": 2.8872,
      "step": 62000
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.4139775803532702,
      "eval_loss": 3.0204551219940186,
      "eval_runtime": 111.8979,
      "eval_samples_per_second": 468.642,
      "eval_steps_per_second": 7.328,
      "step": 62172
    },
    {
      "epoch": 4.053271569195136,
      "grad_norm": 0.359250545501709,
      "learning_rate": 0.0002666746754643907,
      "loss": 2.8242,
      "step": 63000
    },
    {
      "epoch": 4.117609213150614,
      "grad_norm": 0.34917619824409485,
      "learning_rate": 0.00026559886681488915,
      "loss": 2.8144,
      "step": 64000
    },
    {
      "epoch": 4.181946857106093,
      "grad_norm": 0.351457417011261,
      "learning_rate": 0.00026452413397403714,
      "loss": 2.8132,
      "step": 65000
    },
    {
      "epoch": 4.246284501061571,
      "grad_norm": 0.35231146216392517,
      "learning_rate": 0.0002634483253245356,
      "loss": 2.8203,
      "step": 66000
    },
    {
      "epoch": 4.310622145017049,
      "grad_norm": 0.354030579328537,
      "learning_rate": 0.0002623735924836836,
      "loss": 2.8273,
      "step": 67000
    },
    {
      "epoch": 4.374959788972528,
      "grad_norm": 0.3434860408306122,
      "learning_rate": 0.00026129778383418204,
      "loss": 2.8221,
      "step": 68000
    },
    {
      "epoch": 4.439297432928006,
      "grad_norm": 0.35598379373550415,
      "learning_rate": 0.0002602219751846805,
      "loss": 2.8283,
      "step": 69000
    },
    {
      "epoch": 4.5036350768834845,
      "grad_norm": 0.350340873003006,
      "learning_rate": 0.00025914616653517896,
      "loss": 2.8242,
      "step": 70000
    },
    {
      "epoch": 4.567972720838963,
      "grad_norm": 0.34078752994537354,
      "learning_rate": 0.0002580714336943269,
      "loss": 2.8309,
      "step": 71000
    },
    {
      "epoch": 4.632310364794441,
      "grad_norm": 0.3571733832359314,
      "learning_rate": 0.00025699670085347487,
      "loss": 2.8248,
      "step": 72000
    },
    {
      "epoch": 4.69664800874992,
      "grad_norm": 0.35940021276474,
      "learning_rate": 0.00025592089220397333,
      "loss": 2.8334,
      "step": 73000
    },
    {
      "epoch": 4.760985652705398,
      "grad_norm": 0.3354775607585907,
      "learning_rate": 0.0002548450835544718,
      "loss": 2.8263,
      "step": 74000
    },
    {
      "epoch": 4.825323296660876,
      "grad_norm": 0.330805242061615,
      "learning_rate": 0.0002537703507136197,
      "loss": 2.8296,
      "step": 75000
    },
    {
      "epoch": 4.889660940616355,
      "grad_norm": 0.32566189765930176,
      "learning_rate": 0.0002526945420641182,
      "loss": 2.8208,
      "step": 76000
    },
    {
      "epoch": 4.953998584571833,
      "grad_norm": 0.32299116253852844,
      "learning_rate": 0.00025161980922326616,
      "loss": 2.8286,
      "step": 77000
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.417990981289541,
      "eval_loss": 2.988518238067627,
      "eval_runtime": 111.9091,
      "eval_samples_per_second": 468.595,
      "eval_steps_per_second": 7.327,
      "step": 77715
    },
    {
      "epoch": 5.018336228527311,
      "grad_norm": 0.332711786031723,
      "learning_rate": 0.00025054400057376457,
      "loss": 2.797,
      "step": 78000
    },
    {
      "epoch": 5.08267387248279,
      "grad_norm": 0.3597155809402466,
      "learning_rate": 0.000249468191924263,
      "loss": 2.7461,
      "step": 79000
    },
    {
      "epoch": 5.147011516438268,
      "grad_norm": 0.3411096930503845,
      "learning_rate": 0.000248393459083411,
      "loss": 2.7493,
      "step": 80000
    },
    {
      "epoch": 5.2113491603937465,
      "grad_norm": 0.35248109698295593,
      "learning_rate": 0.00024731765043390947,
      "loss": 2.7584,
      "step": 81000
    },
    {
      "epoch": 5.275686804349225,
      "grad_norm": 0.3520190417766571,
      "learning_rate": 0.00024624184178440793,
      "loss": 2.755,
      "step": 82000
    },
    {
      "epoch": 5.340024448304703,
      "grad_norm": 0.34867680072784424,
      "learning_rate": 0.00024516710894355586,
      "loss": 2.7649,
      "step": 83000
    },
    {
      "epoch": 5.404362092260182,
      "grad_norm": 0.3400154709815979,
      "learning_rate": 0.00024409130029405434,
      "loss": 2.7586,
      "step": 84000
    },
    {
      "epoch": 5.46869973621566,
      "grad_norm": 0.3640024662017822,
      "learning_rate": 0.0002430154916445528,
      "loss": 2.7606,
      "step": 85000
    },
    {
      "epoch": 5.533037380171138,
      "grad_norm": 0.3456322252750397,
      "learning_rate": 0.00024193968299505126,
      "loss": 2.767,
      "step": 86000
    },
    {
      "epoch": 5.597375024126617,
      "grad_norm": 0.3284786343574524,
      "learning_rate": 0.00024086495015419922,
      "loss": 2.7687,
      "step": 87000
    },
    {
      "epoch": 5.661712668082095,
      "grad_norm": 0.3351786732673645,
      "learning_rate": 0.00023978914150469768,
      "loss": 2.7705,
      "step": 88000
    },
    {
      "epoch": 5.726050312037573,
      "grad_norm": 0.3189627528190613,
      "learning_rate": 0.00023871440866384563,
      "loss": 2.7743,
      "step": 89000
    },
    {
      "epoch": 5.790387955993052,
      "grad_norm": 0.3447468876838684,
      "learning_rate": 0.0002376386000143441,
      "loss": 2.7712,
      "step": 90000
    },
    {
      "epoch": 5.85472559994853,
      "grad_norm": 0.3212040364742279,
      "learning_rate": 0.00023656386717349205,
      "loss": 2.7741,
      "step": 91000
    },
    {
      "epoch": 5.9190632439040085,
      "grad_norm": 0.3384701609611511,
      "learning_rate": 0.0002354880585239905,
      "loss": 2.7747,
      "step": 92000
    },
    {
      "epoch": 5.983400887859487,
      "grad_norm": 0.33266380429267883,
      "learning_rate": 0.00023441224987448894,
      "loss": 2.779,
      "step": 93000
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.4206485843765424,
      "eval_loss": 2.969926595687866,
      "eval_runtime": 111.9683,
      "eval_samples_per_second": 468.347,
      "eval_steps_per_second": 7.324,
      "step": 93258
    },
    {
      "epoch": 6.047738531814965,
      "grad_norm": 0.35645657777786255,
      "learning_rate": 0.0002333364412249874,
      "loss": 2.7059,
      "step": 94000
    },
    {
      "epoch": 6.112076175770444,
      "grad_norm": 0.35733386874198914,
      "learning_rate": 0.0002322617083841354,
      "loss": 2.6966,
      "step": 95000
    },
    {
      "epoch": 6.176413819725922,
      "grad_norm": 0.3435540199279785,
      "learning_rate": 0.00023118589973463387,
      "loss": 2.6986,
      "step": 96000
    },
    {
      "epoch": 6.2407514636814,
      "grad_norm": 0.3479596972465515,
      "learning_rate": 0.0002301100910851323,
      "loss": 2.7068,
      "step": 97000
    },
    {
      "epoch": 6.305089107636879,
      "grad_norm": 0.3150424659252167,
      "learning_rate": 0.00022903428243563077,
      "loss": 2.7074,
      "step": 98000
    },
    {
      "epoch": 6.369426751592357,
      "grad_norm": 0.34055858850479126,
      "learning_rate": 0.00022795954959477872,
      "loss": 2.7065,
      "step": 99000
    },
    {
      "epoch": 6.433764395547835,
      "grad_norm": 0.3491341769695282,
      "learning_rate": 0.0002268848167539267,
      "loss": 2.7156,
      "step": 100000
    },
    {
      "epoch": 6.498102039503314,
      "grad_norm": 0.3347100019454956,
      "learning_rate": 0.00022580900810442514,
      "loss": 2.714,
      "step": 101000
    },
    {
      "epoch": 6.562439683458791,
      "grad_norm": 0.35210439562797546,
      "learning_rate": 0.00022473427526357312,
      "loss": 2.7194,
      "step": 102000
    },
    {
      "epoch": 6.6267773274142705,
      "grad_norm": 0.3326897919178009,
      "learning_rate": 0.00022365846661407155,
      "loss": 2.727,
      "step": 103000
    },
    {
      "epoch": 6.691114971369748,
      "grad_norm": 0.3269229531288147,
      "learning_rate": 0.00022258265796457,
      "loss": 2.7203,
      "step": 104000
    },
    {
      "epoch": 6.755452615325227,
      "grad_norm": 0.34183254837989807,
      "learning_rate": 0.00022150684931506847,
      "loss": 2.7328,
      "step": 105000
    },
    {
      "epoch": 6.819790259280705,
      "grad_norm": 0.33449244499206543,
      "learning_rate": 0.00022043211647421643,
      "loss": 2.7291,
      "step": 106000
    },
    {
      "epoch": 6.884127903236184,
      "grad_norm": 0.33734798431396484,
      "learning_rate": 0.0002193563078247149,
      "loss": 2.7277,
      "step": 107000
    },
    {
      "epoch": 6.948465547191661,
      "grad_norm": 0.34088870882987976,
      "learning_rate": 0.00021828157498386284,
      "loss": 2.7316,
      "step": 108000
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.42224443247932275,
      "eval_loss": 2.958820104598999,
      "eval_runtime": 111.7467,
      "eval_samples_per_second": 469.276,
      "eval_steps_per_second": 7.338,
      "step": 108801
    },
    {
      "epoch": 7.01280319114714,
      "grad_norm": 0.3517482876777649,
      "learning_rate": 0.0002172057663343613,
      "loss": 2.7116,
      "step": 109000
    },
    {
      "epoch": 7.077140835102618,
      "grad_norm": 0.3411085903644562,
      "learning_rate": 0.00021613103349350926,
      "loss": 2.6469,
      "step": 110000
    },
    {
      "epoch": 7.1414784790580965,
      "grad_norm": 0.3486618399620056,
      "learning_rate": 0.00021505522484400772,
      "loss": 2.6546,
      "step": 111000
    },
    {
      "epoch": 7.205816123013575,
      "grad_norm": 0.35618531703948975,
      "learning_rate": 0.00021397941619450618,
      "loss": 2.6603,
      "step": 112000
    },
    {
      "epoch": 7.270153766969053,
      "grad_norm": 0.34740447998046875,
      "learning_rate": 0.00021290468335365413,
      "loss": 2.6632,
      "step": 113000
    },
    {
      "epoch": 7.334491410924532,
      "grad_norm": 0.339108407497406,
      "learning_rate": 0.0002118288747041526,
      "loss": 2.6682,
      "step": 114000
    },
    {
      "epoch": 7.39882905488001,
      "grad_norm": 0.36686399579048157,
      "learning_rate": 0.00021075306605465105,
      "loss": 2.6718,
      "step": 115000
    },
    {
      "epoch": 7.463166698835488,
      "grad_norm": 0.3336213529109955,
      "learning_rate": 0.000209678333213799,
      "loss": 2.6806,
      "step": 116000
    },
    {
      "epoch": 7.527504342790967,
      "grad_norm": 0.34256553649902344,
      "learning_rate": 0.00020860252456429747,
      "loss": 2.6772,
      "step": 117000
    },
    {
      "epoch": 7.591841986746445,
      "grad_norm": 0.3527204096317291,
      "learning_rate": 0.00020752671591479593,
      "loss": 2.6786,
      "step": 118000
    },
    {
      "epoch": 7.656179630701923,
      "grad_norm": 0.34285178780555725,
      "learning_rate": 0.0002064509072652944,
      "loss": 2.6816,
      "step": 119000
    },
    {
      "epoch": 7.720517274657402,
      "grad_norm": 0.3418208658695221,
      "learning_rate": 0.00020537617442444234,
      "loss": 2.6893,
      "step": 120000
    },
    {
      "epoch": 7.78485491861288,
      "grad_norm": 0.34486138820648193,
      "learning_rate": 0.0002043003657749408,
      "loss": 2.6847,
      "step": 121000
    },
    {
      "epoch": 7.8491925625683585,
      "grad_norm": 0.348530650138855,
      "learning_rate": 0.00020322563293408876,
      "loss": 2.6826,
      "step": 122000
    },
    {
      "epoch": 7.913530206523837,
      "grad_norm": 0.33808425068855286,
      "learning_rate": 0.00020215090009323674,
      "loss": 2.6905,
      "step": 123000
    },
    {
      "epoch": 7.977867850479315,
      "grad_norm": 0.3486366868019104,
      "learning_rate": 0.0002010750914437352,
      "loss": 2.6909,
      "step": 124000
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.4232837528604119,
      "eval_loss": 2.9554243087768555,
      "eval_runtime": 112.1041,
      "eval_samples_per_second": 467.779,
      "eval_steps_per_second": 7.315,
      "step": 124344
    },
    {
      "epoch": 8.042205494434794,
      "grad_norm": 0.35380104184150696,
      "learning_rate": 0.00020000035860288316,
      "loss": 2.6303,
      "step": 125000
    },
    {
      "epoch": 8.106543138390272,
      "grad_norm": 0.3654320240020752,
      "learning_rate": 0.00019892454995338162,
      "loss": 2.6128,
      "step": 126000
    },
    {
      "epoch": 8.170880782345751,
      "grad_norm": 0.3670574724674225,
      "learning_rate": 0.00019784874130388008,
      "loss": 2.617,
      "step": 127000
    },
    {
      "epoch": 8.235218426301229,
      "grad_norm": 0.38059455156326294,
      "learning_rate": 0.00019677400846302803,
      "loss": 2.6274,
      "step": 128000
    },
    {
      "epoch": 8.299556070256708,
      "grad_norm": 0.3698261082172394,
      "learning_rate": 0.00019569927562217599,
      "loss": 2.6309,
      "step": 129000
    },
    {
      "epoch": 8.363893714212185,
      "grad_norm": 0.3583601117134094,
      "learning_rate": 0.00019462346697267445,
      "loss": 2.6312,
      "step": 130000
    },
    {
      "epoch": 8.428231358167665,
      "grad_norm": 0.3602234721183777,
      "learning_rate": 0.0001935476583231729,
      "loss": 2.6368,
      "step": 131000
    },
    {
      "epoch": 8.492569002123142,
      "grad_norm": 0.3441711664199829,
      "learning_rate": 0.00019247184967367137,
      "loss": 2.6372,
      "step": 132000
    },
    {
      "epoch": 8.556906646078621,
      "grad_norm": 0.3533187508583069,
      "learning_rate": 0.00019139604102416983,
      "loss": 2.6443,
      "step": 133000
    },
    {
      "epoch": 8.621244290034099,
      "grad_norm": 0.3579193651676178,
      "learning_rate": 0.00019032130818331778,
      "loss": 2.6481,
      "step": 134000
    },
    {
      "epoch": 8.685581933989578,
      "grad_norm": 0.3524502217769623,
      "learning_rate": 0.00018924549953381624,
      "loss": 2.6509,
      "step": 135000
    },
    {
      "epoch": 8.749919577945056,
      "grad_norm": 0.36159747838974,
      "learning_rate": 0.0001881707666929642,
      "loss": 2.6456,
      "step": 136000
    },
    {
      "epoch": 8.814257221900533,
      "grad_norm": 0.34249147772789,
      "learning_rate": 0.00018709495804346266,
      "loss": 2.6538,
      "step": 137000
    },
    {
      "epoch": 8.878594865856012,
      "grad_norm": 0.34867429733276367,
      "learning_rate": 0.0001860202252026106,
      "loss": 2.6558,
      "step": 138000
    },
    {
      "epoch": 8.942932509811492,
      "grad_norm": 0.3351230025291443,
      "learning_rate": 0.00018494441655310907,
      "loss": 2.6504,
      "step": 139000
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.4238085730096768,
      "eval_loss": 2.9544410705566406,
      "eval_runtime": 111.7789,
      "eval_samples_per_second": 469.14,
      "eval_steps_per_second": 7.336,
      "step": 139887
    },
    {
      "epoch": 9.007270153766969,
      "grad_norm": 0.36276528239250183,
      "learning_rate": 0.00018386968371225703,
      "loss": 2.6469,
      "step": 140000
    },
    {
      "epoch": 9.071607797722447,
      "grad_norm": 0.36368831992149353,
      "learning_rate": 0.0001827938750627555,
      "loss": 2.5666,
      "step": 141000
    },
    {
      "epoch": 9.135945441677926,
      "grad_norm": 0.36417004466056824,
      "learning_rate": 0.00018171806641325395,
      "loss": 2.5832,
      "step": 142000
    },
    {
      "epoch": 9.200283085633403,
      "grad_norm": 0.3550620973110199,
      "learning_rate": 0.0001806422577637524,
      "loss": 2.5888,
      "step": 143000
    },
    {
      "epoch": 9.264620729588882,
      "grad_norm": 0.3513035178184509,
      "learning_rate": 0.00017956644911425084,
      "loss": 2.5872,
      "step": 144000
    },
    {
      "epoch": 9.32895837354436,
      "grad_norm": 0.3576969802379608,
      "learning_rate": 0.00017849279208204832,
      "loss": 2.599,
      "step": 145000
    },
    {
      "epoch": 9.39329601749984,
      "grad_norm": 0.3496710956096649,
      "learning_rate": 0.00017741698343254678,
      "loss": 2.6042,
      "step": 146000
    },
    {
      "epoch": 9.457633661455317,
      "grad_norm": 0.3502206802368164,
      "learning_rate": 0.00017634225059169476,
      "loss": 2.6069,
      "step": 147000
    },
    {
      "epoch": 9.521971305410796,
      "grad_norm": 0.3516786992549896,
      "learning_rate": 0.00017526644194219322,
      "loss": 2.606,
      "step": 148000
    },
    {
      "epoch": 9.586308949366273,
      "grad_norm": 0.3671824336051941,
      "learning_rate": 0.00017419063329269168,
      "loss": 2.6151,
      "step": 149000
    },
    {
      "epoch": 9.650646593321753,
      "grad_norm": 0.36615684628486633,
      "learning_rate": 0.00017311590045183964,
      "loss": 2.6174,
      "step": 150000
    },
    {
      "epoch": 9.71498423727723,
      "grad_norm": 0.369759202003479,
      "learning_rate": 0.0001720400918023381,
      "loss": 2.6162,
      "step": 151000
    },
    {
      "epoch": 9.77932188123271,
      "grad_norm": 0.3495037257671356,
      "learning_rate": 0.00017096428315283656,
      "loss": 2.6186,
      "step": 152000
    },
    {
      "epoch": 9.843659525188187,
      "grad_norm": 0.3635868728160858,
      "learning_rate": 0.0001698895503119845,
      "loss": 2.616,
      "step": 153000
    },
    {
      "epoch": 9.907997169143666,
      "grad_norm": 0.352250337600708,
      "learning_rate": 0.00016881374166248297,
      "loss": 2.626,
      "step": 154000
    },
    {
      "epoch": 9.972334813099144,
      "grad_norm": 0.3688776195049286,
      "learning_rate": 0.00016773900882163093,
      "loss": 2.6246,
      "step": 155000
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.424411016885778,
      "eval_loss": 2.9523308277130127,
      "eval_runtime": 111.8379,
      "eval_samples_per_second": 468.893,
      "eval_steps_per_second": 7.332,
      "step": 155430
    },
    {
      "epoch": 10.036672457054623,
      "grad_norm": 0.3961314558982849,
      "learning_rate": 0.0001666632001721294,
      "loss": 2.5827,
      "step": 156000
    },
    {
      "epoch": 10.1010101010101,
      "grad_norm": 0.3705954849720001,
      "learning_rate": 0.00016558739152262782,
      "loss": 2.5413,
      "step": 157000
    },
    {
      "epoch": 10.16534774496558,
      "grad_norm": 0.37091416120529175,
      "learning_rate": 0.00016451158287312628,
      "loss": 2.5502,
      "step": 158000
    },
    {
      "epoch": 10.229685388921057,
      "grad_norm": 0.38428565859794617,
      "learning_rate": 0.00016343685003227424,
      "loss": 2.5592,
      "step": 159000
    },
    {
      "epoch": 10.294023032876536,
      "grad_norm": 0.3688577115535736,
      "learning_rate": 0.0001623610413827727,
      "loss": 2.5673,
      "step": 160000
    },
    {
      "epoch": 10.358360676832014,
      "grad_norm": 0.38183775544166565,
      "learning_rate": 0.00016128630854192065,
      "loss": 2.5697,
      "step": 161000
    },
    {
      "epoch": 10.422698320787493,
      "grad_norm": 0.37677517533302307,
      "learning_rate": 0.0001602104998924191,
      "loss": 2.5713,
      "step": 162000
    },
    {
      "epoch": 10.48703596474297,
      "grad_norm": 0.3694332540035248,
      "learning_rate": 0.00015913576705156707,
      "loss": 2.5751,
      "step": 163000
    },
    {
      "epoch": 10.55137360869845,
      "grad_norm": 0.3814958333969116,
      "learning_rate": 0.00015806103421071502,
      "loss": 2.5792,
      "step": 164000
    },
    {
      "epoch": 10.615711252653927,
      "grad_norm": 0.38280004262924194,
      "learning_rate": 0.00015698522556121348,
      "loss": 2.5782,
      "step": 165000
    },
    {
      "epoch": 10.680048896609406,
      "grad_norm": 0.3659280240535736,
      "learning_rate": 0.00015590941691171194,
      "loss": 2.5862,
      "step": 166000
    },
    {
      "epoch": 10.744386540564884,
      "grad_norm": 0.34562841057777405,
      "learning_rate": 0.0001548336082622104,
      "loss": 2.5869,
      "step": 167000
    },
    {
      "epoch": 10.808724184520363,
      "grad_norm": 0.3570345938205719,
      "learning_rate": 0.00015375887542135836,
      "loss": 2.59,
      "step": 168000
    },
    {
      "epoch": 10.87306182847584,
      "grad_norm": 0.360215961933136,
      "learning_rate": 0.00015268306677185682,
      "loss": 2.5979,
      "step": 169000
    },
    {
      "epoch": 10.93739947243132,
      "grad_norm": 0.370670884847641,
      "learning_rate": 0.00015160725812235528,
      "loss": 2.5988,
      "step": 170000
    },
    {
      "epoch": 11.0,
      "eval_accuracy": 0.4248191770987571,
      "eval_loss": 2.9567785263061523,
      "eval_runtime": 112.2402,
      "eval_samples_per_second": 467.212,
      "eval_steps_per_second": 7.306,
      "step": 170973
    },
    {
      "epoch": 11.001737116386797,
      "grad_norm": 0.38218948245048523,
      "learning_rate": 0.00015053252528150323,
      "loss": 2.5933,
      "step": 171000
    },
    {
      "epoch": 11.066074760342277,
      "grad_norm": 0.396331787109375,
      "learning_rate": 0.00014945671663200172,
      "loss": 2.5023,
      "step": 172000
    },
    {
      "epoch": 11.130412404297754,
      "grad_norm": 0.3751789927482605,
      "learning_rate": 0.00014838090798250018,
      "loss": 2.5227,
      "step": 173000
    },
    {
      "epoch": 11.194750048253233,
      "grad_norm": 0.37265828251838684,
      "learning_rate": 0.00014730509933299864,
      "loss": 2.5299,
      "step": 174000
    },
    {
      "epoch": 11.25908769220871,
      "grad_norm": 0.37080228328704834,
      "learning_rate": 0.0001462303664921466,
      "loss": 2.5333,
      "step": 175000
    },
    {
      "epoch": 11.32342533616419,
      "grad_norm": 0.3808966875076294,
      "learning_rate": 0.00014515563365129455,
      "loss": 2.5376,
      "step": 176000
    },
    {
      "epoch": 11.387762980119668,
      "grad_norm": 0.38901346921920776,
      "learning_rate": 0.000144079825001793,
      "loss": 2.5422,
      "step": 177000
    },
    {
      "epoch": 11.452100624075147,
      "grad_norm": 0.380100816488266,
      "learning_rate": 0.00014300401635229144,
      "loss": 2.5533,
      "step": 178000
    },
    {
      "epoch": 11.516438268030624,
      "grad_norm": 0.39306920766830444,
      "learning_rate": 0.0001419282077027899,
      "loss": 2.5507,
      "step": 179000
    },
    {
      "epoch": 11.580775911986104,
      "grad_norm": 0.3917422890663147,
      "learning_rate": 0.00014085239905328836,
      "loss": 2.5579,
      "step": 180000
    },
    {
      "epoch": 11.645113555941581,
      "grad_norm": 0.38742849230766296,
      "learning_rate": 0.00013977766621243632,
      "loss": 2.5531,
      "step": 181000
    },
    {
      "epoch": 11.70945119989706,
      "grad_norm": 0.3767852187156677,
      "learning_rate": 0.00013870185756293478,
      "loss": 2.5633,
      "step": 182000
    },
    {
      "epoch": 11.773788843852538,
      "grad_norm": 0.39576900005340576,
      "learning_rate": 0.00013762604891343324,
      "loss": 2.5648,
      "step": 183000
    },
    {
      "epoch": 11.838126487808017,
      "grad_norm": 0.37659791111946106,
      "learning_rate": 0.00013655131607258122,
      "loss": 2.5631,
      "step": 184000
    },
    {
      "epoch": 11.902464131763494,
      "grad_norm": 0.38377416133880615,
      "learning_rate": 0.00013547658323172918,
      "loss": 2.5631,
      "step": 185000
    },
    {
      "epoch": 11.966801775718974,
      "grad_norm": 0.37857234477996826,
      "learning_rate": 0.00013440077458222764,
      "loss": 2.5639,
      "step": 186000
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.4247610714766456,
      "eval_loss": 2.9595353603363037,
      "eval_runtime": 111.9131,
      "eval_samples_per_second": 468.578,
      "eval_steps_per_second": 7.327,
      "step": 186516
    },
    {
      "epoch": 12.031139419674451,
      "grad_norm": 0.4024442136287689,
      "learning_rate": 0.0001333249659327261,
      "loss": 2.5273,
      "step": 187000
    },
    {
      "epoch": 12.09547706362993,
      "grad_norm": 0.4137458801269531,
      "learning_rate": 0.00013225023309187405,
      "loss": 2.4933,
      "step": 188000
    },
    {
      "epoch": 12.159814707585408,
      "grad_norm": 0.409184992313385,
      "learning_rate": 0.0001311744244423725,
      "loss": 2.4967,
      "step": 189000
    },
    {
      "epoch": 12.224152351540887,
      "grad_norm": 0.41316309571266174,
      "learning_rate": 0.00013009861579287097,
      "loss": 2.5063,
      "step": 190000
    },
    {
      "epoch": 12.288489995496365,
      "grad_norm": 0.3909110724925995,
      "learning_rate": 0.00012902280714336943,
      "loss": 2.5153,
      "step": 191000
    },
    {
      "epoch": 12.352827639451844,
      "grad_norm": 0.39046111702919006,
      "learning_rate": 0.0001279469984938679,
      "loss": 2.5115,
      "step": 192000
    },
    {
      "epoch": 12.417165283407321,
      "grad_norm": 0.40070855617523193,
      "learning_rate": 0.00012687226565301585,
      "loss": 2.5157,
      "step": 193000
    },
    {
      "epoch": 12.4815029273628,
      "grad_norm": 0.3970703184604645,
      "learning_rate": 0.00012579645700351428,
      "loss": 2.5198,
      "step": 194000
    },
    {
      "epoch": 12.545840571318278,
      "grad_norm": 0.40202242136001587,
      "learning_rate": 0.00012472064835401274,
      "loss": 2.526,
      "step": 195000
    },
    {
      "epoch": 12.610178215273757,
      "grad_norm": 0.3841732144355774,
      "learning_rate": 0.0001236459155131607,
      "loss": 2.5295,
      "step": 196000
    },
    {
      "epoch": 12.674515859229235,
      "grad_norm": 0.40759024024009705,
      "learning_rate": 0.00012257010686365916,
      "loss": 2.5307,
      "step": 197000
    },
    {
      "epoch": 12.738853503184714,
      "grad_norm": 0.3963831663131714,
      "learning_rate": 0.00012149429821415763,
      "loss": 2.534,
      "step": 198000
    },
    {
      "epoch": 12.803191147140192,
      "grad_norm": 0.37255486845970154,
      "learning_rate": 0.0001204195653733056,
      "loss": 2.5354,
      "step": 199000
    },
    {
      "epoch": 12.86752879109567,
      "grad_norm": 0.397368460893631,
      "learning_rate": 0.00011934375672380406,
      "loss": 2.5352,
      "step": 200000
    },
    {
      "epoch": 12.931866435051148,
      "grad_norm": 0.379574716091156,
      "learning_rate": 0.00011826902388295201,
      "loss": 2.5397,
      "step": 201000
    },
    {
      "epoch": 12.996204079006628,
      "grad_norm": 0.3803842067718506,
      "learning_rate": 0.00011719321523345048,
      "loss": 2.5361,
      "step": 202000
    },
    {
      "epoch": 13.0,
      "eval_accuracy": 0.42475613586395655,
      "eval_loss": 2.9698119163513184,
      "eval_runtime": 112.0848,
      "eval_samples_per_second": 467.86,
      "eval_steps_per_second": 7.316,
      "step": 202059
    },
    {
      "epoch": 13.060541722962105,
      "grad_norm": 0.40816885232925415,
      "learning_rate": 0.00011611740658394894,
      "loss": 2.4669,
      "step": 203000
    },
    {
      "epoch": 13.124879366917584,
      "grad_norm": 0.42818671464920044,
      "learning_rate": 0.00011504159793444738,
      "loss": 2.467,
      "step": 204000
    },
    {
      "epoch": 13.189217010873062,
      "grad_norm": 0.40255987644195557,
      "learning_rate": 0.00011396686509359535,
      "loss": 2.4753,
      "step": 205000
    },
    {
      "epoch": 13.253554654828541,
      "grad_norm": 0.4254453778266907,
      "learning_rate": 0.0001128921322527433,
      "loss": 2.4808,
      "step": 206000
    },
    {
      "epoch": 13.317892298784018,
      "grad_norm": 0.4060657322406769,
      "learning_rate": 0.00011181632360324175,
      "loss": 2.4932,
      "step": 207000
    },
    {
      "epoch": 13.382229942739498,
      "grad_norm": 0.4138365387916565,
      "learning_rate": 0.00011074051495374021,
      "loss": 2.4922,
      "step": 208000
    },
    {
      "epoch": 13.446567586694975,
      "grad_norm": 0.4098254442214966,
      "learning_rate": 0.00010966578211288817,
      "loss": 2.4948,
      "step": 209000
    },
    {
      "epoch": 13.510905230650454,
      "grad_norm": 0.4242159128189087,
      "learning_rate": 0.00010858997346338663,
      "loss": 2.5012,
      "step": 210000
    },
    {
      "epoch": 13.575242874605932,
      "grad_norm": 0.42177829146385193,
      "learning_rate": 0.00010751416481388509,
      "loss": 2.4998,
      "step": 211000
    },
    {
      "epoch": 13.63958051856141,
      "grad_norm": 0.4196189045906067,
      "learning_rate": 0.00010643943197303304,
      "loss": 2.5048,
      "step": 212000
    },
    {
      "epoch": 13.703918162516889,
      "grad_norm": 0.3965640366077423,
      "learning_rate": 0.0001053636233235315,
      "loss": 2.5092,
      "step": 213000
    },
    {
      "epoch": 13.768255806472368,
      "grad_norm": 0.39778339862823486,
      "learning_rate": 0.00010428781467402996,
      "loss": 2.5121,
      "step": 214000
    },
    {
      "epoch": 13.832593450427845,
      "grad_norm": 0.40292391180992126,
      "learning_rate": 0.00010321308183317793,
      "loss": 2.5119,
      "step": 215000
    },
    {
      "epoch": 13.896931094383323,
      "grad_norm": 0.41673198342323303,
      "learning_rate": 0.00010213727318367639,
      "loss": 2.5112,
      "step": 216000
    },
    {
      "epoch": 13.961268738338802,
      "grad_norm": 0.40400612354278564,
      "learning_rate": 0.00010106254034282435,
      "loss": 2.5098,
      "step": 217000
    },
    {
      "epoch": 14.0,
      "eval_accuracy": 0.424743796832234,
      "eval_loss": 2.9747180938720703,
      "eval_runtime": 111.9808,
      "eval_samples_per_second": 468.295,
      "eval_steps_per_second": 7.323,
      "step": 217602
    },
    {
      "epoch": 14.02560638229428,
      "grad_norm": 0.40745100378990173,
      "learning_rate": 9.998673169332281e-05,
      "loss": 2.4894,
      "step": 218000
    },
    {
      "epoch": 14.089944026249759,
      "grad_norm": 0.42399463057518005,
      "learning_rate": 9.891092304382127e-05,
      "loss": 2.449,
      "step": 219000
    },
    {
      "epoch": 14.154281670205236,
      "grad_norm": 0.4149724841117859,
      "learning_rate": 9.783511439431973e-05,
      "loss": 2.4534,
      "step": 220000
    },
    {
      "epoch": 14.218619314160716,
      "grad_norm": 0.40756285190582275,
      "learning_rate": 9.676145736211718e-05,
      "loss": 2.4576,
      "step": 221000
    },
    {
      "epoch": 14.282956958116193,
      "grad_norm": 0.4224795997142792,
      "learning_rate": 9.568564871261564e-05,
      "loss": 2.4584,
      "step": 222000
    },
    {
      "epoch": 14.347294602071672,
      "grad_norm": 0.41213053464889526,
      "learning_rate": 9.461091587176359e-05,
      "loss": 2.4707,
      "step": 223000
    },
    {
      "epoch": 14.41163224602715,
      "grad_norm": 0.4161031246185303,
      "learning_rate": 9.353510722226205e-05,
      "loss": 2.4701,
      "step": 224000
    },
    {
      "epoch": 14.475969889982629,
      "grad_norm": 0.42417025566101074,
      "learning_rate": 9.245929857276051e-05,
      "loss": 2.4706,
      "step": 225000
    },
    {
      "epoch": 14.540307533938106,
      "grad_norm": 0.4227360785007477,
      "learning_rate": 9.138348992325897e-05,
      "loss": 2.4678,
      "step": 226000
    },
    {
      "epoch": 14.604645177893586,
      "grad_norm": 0.3956305682659149,
      "learning_rate": 9.030768127375742e-05,
      "loss": 2.4816,
      "step": 227000
    },
    {
      "epoch": 14.668982821849063,
      "grad_norm": 0.42013561725616455,
      "learning_rate": 8.92329484329054e-05,
      "loss": 2.4791,
      "step": 228000
    },
    {
      "epoch": 14.733320465804542,
      "grad_norm": 0.41232335567474365,
      "learning_rate": 8.815713978340386e-05,
      "loss": 2.4861,
      "step": 229000
    },
    {
      "epoch": 14.79765810976002,
      "grad_norm": 0.398253858089447,
      "learning_rate": 8.708240694255182e-05,
      "loss": 2.4857,
      "step": 230000
    },
    {
      "epoch": 14.8619957537155,
      "grad_norm": 0.41056615114212036,
      "learning_rate": 8.600659829305028e-05,
      "loss": 2.4826,
      "step": 231000
    },
    {
      "epoch": 14.926333397670977,
      "grad_norm": 0.4065124988555908,
      "learning_rate": 8.493186545219823e-05,
      "loss": 2.4791,
      "step": 232000
    },
    {
      "epoch": 14.990671041626456,
      "grad_norm": 0.42194780707359314,
      "learning_rate": 8.385605680269669e-05,
      "loss": 2.4899,
      "step": 233000
    },
    {
      "epoch": 15.0,
      "eval_accuracy": 0.4246625087868862,
      "eval_loss": 2.9792003631591797,
      "eval_runtime": 111.7676,
      "eval_samples_per_second": 469.188,
      "eval_steps_per_second": 7.337,
      "step": 233145
    },
    {
      "epoch": 15.055008685581933,
      "grad_norm": 0.444181889295578,
      "learning_rate": 8.278024815319514e-05,
      "loss": 2.4309,
      "step": 234000
    },
    {
      "epoch": 15.119346329537413,
      "grad_norm": 0.4177301526069641,
      "learning_rate": 8.17044395036936e-05,
      "loss": 2.4254,
      "step": 235000
    },
    {
      "epoch": 15.18368397349289,
      "grad_norm": 0.43864157795906067,
      "learning_rate": 8.062970666284155e-05,
      "loss": 2.432,
      "step": 236000
    },
    {
      "epoch": 15.24802161744837,
      "grad_norm": 0.43071264028549194,
      "learning_rate": 7.955497382198951e-05,
      "loss": 2.4372,
      "step": 237000
    },
    {
      "epoch": 15.312359261403847,
      "grad_norm": 0.44551989436149597,
      "learning_rate": 7.847916517248797e-05,
      "loss": 2.4441,
      "step": 238000
    },
    {
      "epoch": 15.376696905359326,
      "grad_norm": 0.42598387598991394,
      "learning_rate": 7.740335652298643e-05,
      "loss": 2.4448,
      "step": 239000
    },
    {
      "epoch": 15.441034549314804,
      "grad_norm": 0.4412069618701935,
      "learning_rate": 7.632754787348489e-05,
      "loss": 2.4481,
      "step": 240000
    },
    {
      "epoch": 15.505372193270283,
      "grad_norm": 0.4257245361804962,
      "learning_rate": 7.525173922398335e-05,
      "loss": 2.4496,
      "step": 241000
    },
    {
      "epoch": 15.56970983722576,
      "grad_norm": 0.4463740885257721,
      "learning_rate": 7.417593057448181e-05,
      "loss": 2.4583,
      "step": 242000
    },
    {
      "epoch": 15.63404748118124,
      "grad_norm": 0.40843266248703003,
      "learning_rate": 7.310119773362977e-05,
      "loss": 2.4549,
      "step": 243000
    },
    {
      "epoch": 15.698385125136717,
      "grad_norm": 0.43823161721229553,
      "learning_rate": 7.202538908412823e-05,
      "loss": 2.4565,
      "step": 244000
    },
    {
      "epoch": 15.762722769092196,
      "grad_norm": 0.4224304258823395,
      "learning_rate": 7.09506562432762e-05,
      "loss": 2.4664,
      "step": 245000
    },
    {
      "epoch": 15.827060413047674,
      "grad_norm": 0.42779698967933655,
      "learning_rate": 6.987484759377464e-05,
      "loss": 2.4607,
      "step": 246000
    },
    {
      "epoch": 15.891398057003153,
      "grad_norm": 0.41904374957084656,
      "learning_rate": 6.880011475292261e-05,
      "loss": 2.463,
      "step": 247000
    },
    {
      "epoch": 15.95573570095863,
      "grad_norm": 0.4636126458644867,
      "learning_rate": 6.772430610342107e-05,
      "loss": 2.4626,
      "step": 248000
    },
    {
      "epoch": 16.0,
      "eval_accuracy": 0.4244173733566653,
      "eval_loss": 2.9882283210754395,
      "eval_runtime": 112.1651,
      "eval_samples_per_second": 467.525,
      "eval_steps_per_second": 7.311,
      "step": 248688
    },
    {
      "epoch": 16.02007334491411,
      "grad_norm": 0.44689562916755676,
      "learning_rate": 6.664849745391953e-05,
      "loss": 2.4432,
      "step": 249000
    },
    {
      "epoch": 16.08441098886959,
      "grad_norm": 0.45889049768447876,
      "learning_rate": 6.557376461306749e-05,
      "loss": 2.4048,
      "step": 250000
    },
    {
      "epoch": 16.148748632825065,
      "grad_norm": 0.4538269639015198,
      "learning_rate": 6.449795596356593e-05,
      "loss": 2.4123,
      "step": 251000
    },
    {
      "epoch": 16.213086276780544,
      "grad_norm": 0.44775742292404175,
      "learning_rate": 6.342214731406439e-05,
      "loss": 2.4137,
      "step": 252000
    },
    {
      "epoch": 16.277423920736023,
      "grad_norm": 0.4506843090057373,
      "learning_rate": 6.234741447321236e-05,
      "loss": 2.4152,
      "step": 253000
    },
    {
      "epoch": 16.341761564691502,
      "grad_norm": 0.4564642310142517,
      "learning_rate": 6.127160582371082e-05,
      "loss": 2.4236,
      "step": 254000
    },
    {
      "epoch": 16.406099208646978,
      "grad_norm": 0.4492376744747162,
      "learning_rate": 6.0195797174209275e-05,
      "loss": 2.4222,
      "step": 255000
    },
    {
      "epoch": 16.470436852602457,
      "grad_norm": 0.44002753496170044,
      "learning_rate": 5.9119988524707736e-05,
      "loss": 2.4277,
      "step": 256000
    },
    {
      "epoch": 16.534774496557937,
      "grad_norm": 0.437580406665802,
      "learning_rate": 5.8044179875206196e-05,
      "loss": 2.4303,
      "step": 257000
    },
    {
      "epoch": 16.599112140513416,
      "grad_norm": 0.42502424120903015,
      "learning_rate": 5.697052284300365e-05,
      "loss": 2.4359,
      "step": 258000
    },
    {
      "epoch": 16.66344978446889,
      "grad_norm": 0.44441190361976624,
      "learning_rate": 5.5894714193502106e-05,
      "loss": 2.4306,
      "step": 259000
    },
    {
      "epoch": 16.72778742842437,
      "grad_norm": 0.4539526700973511,
      "learning_rate": 5.4818905544000566e-05,
      "loss": 2.4342,
      "step": 260000
    },
    {
      "epoch": 16.79212507237985,
      "grad_norm": 0.4554595947265625,
      "learning_rate": 5.374417270314853e-05,
      "loss": 2.4388,
      "step": 261000
    },
    {
      "epoch": 16.85646271633533,
      "grad_norm": 0.4573330283164978,
      "learning_rate": 5.266836405364699e-05,
      "loss": 2.441,
      "step": 262000
    },
    {
      "epoch": 16.920800360290805,
      "grad_norm": 0.449770450592041,
      "learning_rate": 5.159255540414545e-05,
      "loss": 2.4411,
      "step": 263000
    },
    {
      "epoch": 16.985138004246284,
      "grad_norm": 0.48139625787734985,
      "learning_rate": 5.05178225632934e-05,
      "loss": 2.4399,
      "step": 264000
    },
    {
      "epoch": 17.0,
      "eval_accuracy": 0.4242649676193895,
      "eval_loss": 2.9961202144622803,
      "eval_runtime": 111.9399,
      "eval_samples_per_second": 468.466,
      "eval_steps_per_second": 7.325,
      "step": 264231
    },
    {
      "epoch": 17.049475648201764,
      "grad_norm": 0.4543195366859436,
      "learning_rate": 4.9442013913791863e-05,
      "loss": 2.4013,
      "step": 265000
    },
    {
      "epoch": 17.113813292157243,
      "grad_norm": 0.4699794054031372,
      "learning_rate": 4.836620526429032e-05,
      "loss": 2.3928,
      "step": 266000
    },
    {
      "epoch": 17.17815093611272,
      "grad_norm": 0.4636929929256439,
      "learning_rate": 4.7291472423438285e-05,
      "loss": 2.3989,
      "step": 267000
    },
    {
      "epoch": 17.242488580068198,
      "grad_norm": 0.4614698886871338,
      "learning_rate": 4.6215663773936746e-05,
      "loss": 2.4004,
      "step": 268000
    },
    {
      "epoch": 17.306826224023677,
      "grad_norm": 0.46002906560897827,
      "learning_rate": 4.513985512443519e-05,
      "loss": 2.3982,
      "step": 269000
    },
    {
      "epoch": 17.371163867979156,
      "grad_norm": 0.42619064450263977,
      "learning_rate": 4.4065122283583154e-05,
      "loss": 2.4053,
      "step": 270000
    },
    {
      "epoch": 17.435501511934632,
      "grad_norm": 0.45975300669670105,
      "learning_rate": 4.2989313634081614e-05,
      "loss": 2.4041,
      "step": 271000
    },
    {
      "epoch": 17.49983915589011,
      "grad_norm": 0.4545740485191345,
      "learning_rate": 4.1913504984580075e-05,
      "loss": 2.406,
      "step": 272000
    },
    {
      "epoch": 17.56417679984559,
      "grad_norm": 0.458011269569397,
      "learning_rate": 4.083769633507853e-05,
      "loss": 2.4168,
      "step": 273000
    },
    {
      "epoch": 17.62851444380107,
      "grad_norm": 0.4604107439517975,
      "learning_rate": 3.976296349422649e-05,
      "loss": 2.411,
      "step": 274000
    },
    {
      "epoch": 17.692852087756545,
      "grad_norm": 0.4420773684978485,
      "learning_rate": 3.8687154844724944e-05,
      "loss": 2.4144,
      "step": 275000
    },
    {
      "epoch": 17.757189731712025,
      "grad_norm": 0.45774900913238525,
      "learning_rate": 3.7611346195223404e-05,
      "loss": 2.412,
      "step": 276000
    },
    {
      "epoch": 17.821527375667504,
      "grad_norm": 0.4509606659412384,
      "learning_rate": 3.6536613354371366e-05,
      "loss": 2.4086,
      "step": 277000
    },
    {
      "epoch": 17.885865019622983,
      "grad_norm": 0.4442935883998871,
      "learning_rate": 3.5460804704869826e-05,
      "loss": 2.4134,
      "step": 278000
    },
    {
      "epoch": 17.95020266357846,
      "grad_norm": 0.42292436957359314,
      "learning_rate": 3.438607186401778e-05,
      "loss": 2.4186,
      "step": 279000
    },
    {
      "epoch": 18.0,
      "eval_accuracy": 0.42388941236296196,
      "eval_loss": 3.0051016807556152,
      "eval_runtime": 111.9834,
      "eval_samples_per_second": 468.284,
      "eval_steps_per_second": 7.323,
      "step": 279774
    },
    {
      "epoch": 18.014540307533938,
      "grad_norm": 0.48824623227119446,
      "learning_rate": 3.331026321451624e-05,
      "loss": 2.4055,
      "step": 280000
    },
    {
      "epoch": 18.078877951489417,
      "grad_norm": 0.46934977173805237,
      "learning_rate": 3.22355303736642e-05,
      "loss": 2.3736,
      "step": 281000
    },
    {
      "epoch": 18.143215595444893,
      "grad_norm": 0.5045217275619507,
      "learning_rate": 3.115972172416266e-05,
      "loss": 2.382,
      "step": 282000
    },
    {
      "epoch": 18.207553239400372,
      "grad_norm": 0.46461954712867737,
      "learning_rate": 3.008391307466112e-05,
      "loss": 2.3806,
      "step": 283000
    },
    {
      "epoch": 18.27189088335585,
      "grad_norm": 0.4565331041812897,
      "learning_rate": 2.9009180233809078e-05,
      "loss": 2.3813,
      "step": 284000
    },
    {
      "epoch": 18.33622852731133,
      "grad_norm": 0.4561784863471985,
      "learning_rate": 2.793337158430754e-05,
      "loss": 2.3863,
      "step": 285000
    },
    {
      "epoch": 18.400566171266806,
      "grad_norm": 0.4438989758491516,
      "learning_rate": 2.6858638743455493e-05,
      "loss": 2.3845,
      "step": 286000
    },
    {
      "epoch": 18.464903815222286,
      "grad_norm": 0.461086630821228,
      "learning_rate": 2.578283009395395e-05,
      "loss": 2.3833,
      "step": 287000
    },
    {
      "epoch": 18.529241459177765,
      "grad_norm": 0.4639764726161957,
      "learning_rate": 2.470702144445241e-05,
      "loss": 2.3918,
      "step": 288000
    },
    {
      "epoch": 18.593579103133244,
      "grad_norm": 0.4645422697067261,
      "learning_rate": 2.3631212794950868e-05,
      "loss": 2.3953,
      "step": 289000
    },
    {
      "epoch": 18.65791674708872,
      "grad_norm": 0.47392553091049194,
      "learning_rate": 2.2555404145449328e-05,
      "loss": 2.3829,
      "step": 290000
    },
    {
      "epoch": 18.7222543910442,
      "grad_norm": 0.4530762732028961,
      "learning_rate": 2.148174711324679e-05,
      "loss": 2.3904,
      "step": 291000
    },
    {
      "epoch": 18.78659203499968,
      "grad_norm": 0.47473639249801636,
      "learning_rate": 2.0405938463745248e-05,
      "loss": 2.3966,
      "step": 292000
    },
    {
      "epoch": 18.850929678955158,
      "grad_norm": 0.43500351905822754,
      "learning_rate": 1.9330129814243705e-05,
      "loss": 2.396,
      "step": 293000
    },
    {
      "epoch": 18.915267322910633,
      "grad_norm": 0.45157596468925476,
      "learning_rate": 1.8254321164742165e-05,
      "loss": 2.3959,
      "step": 294000
    },
    {
      "epoch": 18.979604966866113,
      "grad_norm": 0.4546051621437073,
      "learning_rate": 1.7180664132539624e-05,
      "loss": 2.3869,
      "step": 295000
    },
    {
      "epoch": 19.0,
      "eval_accuracy": 0.42373925008599933,
      "eval_loss": 3.011887311935425,
      "eval_runtime": 112.1189,
      "eval_samples_per_second": 467.718,
      "eval_steps_per_second": 7.314,
      "step": 295317
    },
    {
      "epoch": 19.043942610821592,
      "grad_norm": 0.46901893615722656,
      "learning_rate": 1.610485548303808e-05,
      "loss": 2.3726,
      "step": 296000
    },
    {
      "epoch": 19.10828025477707,
      "grad_norm": 0.43862438201904297,
      "learning_rate": 1.502904683353654e-05,
      "loss": 2.3688,
      "step": 297000
    },
    {
      "epoch": 19.172617898732547,
      "grad_norm": 0.4580424427986145,
      "learning_rate": 1.3954313992684501e-05,
      "loss": 2.3682,
      "step": 298000
    },
    {
      "epoch": 19.236955542688026,
      "grad_norm": 0.47557470202445984,
      "learning_rate": 1.2878505343182957e-05,
      "loss": 2.3687,
      "step": 299000
    },
    {
      "epoch": 19.301293186643505,
      "grad_norm": 0.48615992069244385,
      "learning_rate": 1.1802696693681415e-05,
      "loss": 2.3636,
      "step": 300000
    },
    {
      "epoch": 19.365630830598985,
      "grad_norm": 0.5019800662994385,
      "learning_rate": 1.0726888044179874e-05,
      "loss": 2.3668,
      "step": 301000
    },
    {
      "epoch": 19.42996847455446,
      "grad_norm": 0.4481401741504669,
      "learning_rate": 9.652155203327834e-06,
      "loss": 2.3721,
      "step": 302000
    },
    {
      "epoch": 19.49430611850994,
      "grad_norm": 0.4632056653499603,
      "learning_rate": 8.577422362475793e-06,
      "loss": 2.372,
      "step": 303000
    },
    {
      "epoch": 19.55864376246542,
      "grad_norm": 0.4590476453304291,
      "learning_rate": 7.5016137129742514e-06,
      "loss": 2.3725,
      "step": 304000
    },
    {
      "epoch": 19.622981406420898,
      "grad_norm": 0.4774569272994995,
      "learning_rate": 6.42580506347271e-06,
      "loss": 2.37,
      "step": 305000
    },
    {
      "epoch": 19.687319050376374,
      "grad_norm": 0.47048863768577576,
      "learning_rate": 5.351072222620669e-06,
      "loss": 2.371,
      "step": 306000
    },
    {
      "epoch": 19.751656694331853,
      "grad_norm": 0.4567144215106964,
      "learning_rate": 4.275263573119128e-06,
      "loss": 2.3706,
      "step": 307000
    },
    {
      "epoch": 19.815994338287332,
      "grad_norm": 0.4492277503013611,
      "learning_rate": 3.200530732267087e-06,
      "loss": 2.3714,
      "step": 308000
    },
    {
      "epoch": 19.88033198224281,
      "grad_norm": 0.44562822580337524,
      "learning_rate": 2.1247220827655454e-06,
      "loss": 2.3711,
      "step": 309000
    },
    {
      "epoch": 19.944669626198287,
      "grad_norm": 0.4758046269416809,
      "learning_rate": 1.0499892419135049e-06,
      "loss": 2.3686,
      "step": 310000
    },
    {
      "epoch": 20.0,
      "eval_accuracy": 0.4234014597448438,
      "eval_loss": 3.0190186500549316,
      "eval_runtime": 111.9505,
      "eval_samples_per_second": 468.421,
      "eval_steps_per_second": 7.325,
      "step": 310860
    },
    {
      "epoch": 20.0,
      "step": 310860,
      "total_flos": 1.29957250203648e+18,
      "train_loss": 2.7038125842232534,
      "train_runtime": 43992.4165,
      "train_samples_per_second": 226.114,
      "train_steps_per_second": 7.066
    }
  ],
  "logging_steps": 1000,
  "max_steps": 310860,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 5000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.29957250203648e+18,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}