|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.991296779808529, |
|
"eval_steps": 500, |
|
"global_step": 5740, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0017406440382941688, |
|
"grad_norm": 600.0, |
|
"learning_rate": 3.4843205574912896e-07, |
|
"loss": 32.6735, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.008703220191470844, |
|
"grad_norm": 548.0, |
|
"learning_rate": 1.7421602787456445e-06, |
|
"loss": 30.8839, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.017406440382941687, |
|
"grad_norm": 338.0, |
|
"learning_rate": 3.484320557491289e-06, |
|
"loss": 28.8366, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02610966057441253, |
|
"grad_norm": 197.0, |
|
"learning_rate": 5.226480836236934e-06, |
|
"loss": 24.2042, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.034812880765883375, |
|
"grad_norm": 82.5, |
|
"learning_rate": 6.968641114982578e-06, |
|
"loss": 20.0294, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04351610095735422, |
|
"grad_norm": 52.75, |
|
"learning_rate": 8.710801393728225e-06, |
|
"loss": 18.1416, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.05221932114882506, |
|
"grad_norm": 25.375, |
|
"learning_rate": 1.0452961672473868e-05, |
|
"loss": 16.3408, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.060922541340295906, |
|
"grad_norm": 11.5625, |
|
"learning_rate": 1.2195121951219513e-05, |
|
"loss": 15.4579, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.06962576153176675, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 1.3937282229965156e-05, |
|
"loss": 14.8065, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0783289817232376, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 1.56794425087108e-05, |
|
"loss": 14.3068, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.08703220191470844, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 1.742160278745645e-05, |
|
"loss": 13.9, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09573542210617929, |
|
"grad_norm": 12.875, |
|
"learning_rate": 1.9163763066202093e-05, |
|
"loss": 13.1213, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.10443864229765012, |
|
"grad_norm": 21.25, |
|
"learning_rate": 2.0905923344947736e-05, |
|
"loss": 12.0425, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11314186248912098, |
|
"grad_norm": 44.5, |
|
"learning_rate": 2.264808362369338e-05, |
|
"loss": 10.0004, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.12184508268059181, |
|
"grad_norm": 42.75, |
|
"learning_rate": 2.4390243902439026e-05, |
|
"loss": 5.8601, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.13054830287206268, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 2.6132404181184672e-05, |
|
"loss": 2.3269, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.1392515230635335, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 2.7874564459930312e-05, |
|
"loss": 1.8038, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.14795474325500435, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 2.9616724738675962e-05, |
|
"loss": 1.6296, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.1566579634464752, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 3.13588850174216e-05, |
|
"loss": 1.4928, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.16536118363794605, |
|
"grad_norm": 8.0, |
|
"learning_rate": 3.310104529616725e-05, |
|
"loss": 1.4006, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.17406440382941687, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 3.48432055749129e-05, |
|
"loss": 1.366, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18276762402088773, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.6585365853658535e-05, |
|
"loss": 1.2998, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.19147084421235858, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 3.8327526132404185e-05, |
|
"loss": 1.2497, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.20017406440382943, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 4.006968641114983e-05, |
|
"loss": 1.2106, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.20887728459530025, |
|
"grad_norm": 5.65625, |
|
"learning_rate": 4.181184668989547e-05, |
|
"loss": 1.2119, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2175805047867711, |
|
"grad_norm": 22.75, |
|
"learning_rate": 4.3554006968641115e-05, |
|
"loss": 1.1897, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.22628372497824195, |
|
"grad_norm": 16.375, |
|
"learning_rate": 4.529616724738676e-05, |
|
"loss": 1.1921, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2349869451697128, |
|
"grad_norm": 11.1875, |
|
"learning_rate": 4.703832752613241e-05, |
|
"loss": 1.1743, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.24369016536118362, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 4.878048780487805e-05, |
|
"loss": 1.1545, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2523933855526545, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 5.0522648083623695e-05, |
|
"loss": 1.1263, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.26109660574412535, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 5.2264808362369345e-05, |
|
"loss": 1.1232, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.26979982593559615, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 5.400696864111499e-05, |
|
"loss": 1.0888, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.278503046127067, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 5.5749128919860624e-05, |
|
"loss": 1.0734, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.28720626631853785, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 5.749128919860628e-05, |
|
"loss": 1.0545, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.2959094865100087, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 5.9233449477351924e-05, |
|
"loss": 1.0451, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.30461270670147955, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 6.097560975609756e-05, |
|
"loss": 1.023, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.3133159268929504, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 6.27177700348432e-05, |
|
"loss": 1.0475, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.32201914708442125, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 6.445993031358886e-05, |
|
"loss": 1.008, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.3307223672758921, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 6.62020905923345e-05, |
|
"loss": 0.9875, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3394255874673629, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 6.794425087108013e-05, |
|
"loss": 1.0139, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.34812880765883375, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 6.96864111498258e-05, |
|
"loss": 0.9675, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3568320278503046, |
|
"grad_norm": 2.25, |
|
"learning_rate": 7.142857142857143e-05, |
|
"loss": 0.9668, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.36553524804177545, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 7.317073170731707e-05, |
|
"loss": 0.9589, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3742384682332463, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 7.491289198606272e-05, |
|
"loss": 0.9686, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.38294168842471715, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 7.665505226480837e-05, |
|
"loss": 0.9702, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.391644908616188, |
|
"grad_norm": 2.875, |
|
"learning_rate": 7.839721254355401e-05, |
|
"loss": 0.9665, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.40034812880765885, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 8.013937282229966e-05, |
|
"loss": 0.9598, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.4090513489991297, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 8.188153310104531e-05, |
|
"loss": 0.944, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.4177545691906005, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 8.362369337979094e-05, |
|
"loss": 0.9368, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.42645778938207135, |
|
"grad_norm": 3.625, |
|
"learning_rate": 8.53658536585366e-05, |
|
"loss": 0.9467, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.4351610095735422, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 8.710801393728223e-05, |
|
"loss": 0.9282, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.44386422976501305, |
|
"grad_norm": 5.0, |
|
"learning_rate": 8.885017421602788e-05, |
|
"loss": 0.9065, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.4525674499564839, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 9.059233449477352e-05, |
|
"loss": 0.911, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.46127067014795475, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 9.233449477351917e-05, |
|
"loss": 0.9055, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.4699738903394256, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 9.407665505226482e-05, |
|
"loss": 0.9053, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.47867711053089645, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 9.581881533101045e-05, |
|
"loss": 0.8986, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.48738033072236725, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 9.75609756097561e-05, |
|
"loss": 0.8968, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4960835509138381, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 9.930313588850174e-05, |
|
"loss": 0.9147, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.504786771105309, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 0.00010104529616724739, |
|
"loss": 0.8892, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5134899912967799, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 0.00010278745644599304, |
|
"loss": 0.8896, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.5221932114882507, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00010452961672473869, |
|
"loss": 0.8926, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5308964316797214, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.00010627177700348431, |
|
"loss": 0.8943, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.5395996518711923, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00010801393728222998, |
|
"loss": 0.8807, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5483028720626631, |
|
"grad_norm": 20.125, |
|
"learning_rate": 0.00010975609756097563, |
|
"loss": 0.882, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.557006092254134, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 0.00011149825783972125, |
|
"loss": 0.9149, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5657093124456049, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 0.00011324041811846691, |
|
"loss": 0.8841, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.5744125326370757, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 0.00011498257839721256, |
|
"loss": 0.891, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5831157528285466, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 0.00011672473867595819, |
|
"loss": 0.8778, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.5918189730200174, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 0.00011846689895470385, |
|
"loss": 0.8664, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6005221932114883, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 0.00012020905923344947, |
|
"loss": 0.8838, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.6092254134029591, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00012195121951219512, |
|
"loss": 0.8502, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.61792863359443, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 0.00012369337979094077, |
|
"loss": 0.8579, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.6266318537859008, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.0001254355400696864, |
|
"loss": 0.8601, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6353350739773717, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.00012717770034843207, |
|
"loss": 0.8575, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.6440382941688425, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 0.00012891986062717772, |
|
"loss": 0.8595, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6527415143603134, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.00013066202090592334, |
|
"loss": 0.8814, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.6614447345517842, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.000132404181184669, |
|
"loss": 0.8418, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6701479547432551, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.00013414634146341464, |
|
"loss": 0.846, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.6788511749347258, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.00013588850174216027, |
|
"loss": 0.8663, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6875543951261966, |
|
"grad_norm": 2.625, |
|
"learning_rate": 0.00013763066202090594, |
|
"loss": 0.8521, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.6962576153176675, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 0.0001393728222996516, |
|
"loss": 0.8726, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7049608355091384, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 0.00014111498257839722, |
|
"loss": 0.8606, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.7136640557006092, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.00014285714285714287, |
|
"loss": 0.8517, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.72236727589208, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 0.00014459930313588852, |
|
"loss": 0.8638, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.7310704960835509, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.00014634146341463414, |
|
"loss": 0.8448, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.7397737162750218, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 0.0001480836236933798, |
|
"loss": 0.8351, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.7484769364664926, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 0.00014982578397212544, |
|
"loss": 0.8616, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.7571801566579635, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 0.0001515679442508711, |
|
"loss": 0.8569, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.7658833768494343, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 0.00015331010452961674, |
|
"loss": 0.8531, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.7745865970409052, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.00015505226480836236, |
|
"loss": 0.8483, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.783289817232376, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 0.00015679442508710801, |
|
"loss": 0.8444, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.7919930374238469, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 0.00015853658536585366, |
|
"loss": 0.8513, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.8006962576153177, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 0.00016027874564459931, |
|
"loss": 0.8326, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.8093994778067886, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 0.00016202090592334496, |
|
"loss": 0.8554, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.8181026979982594, |
|
"grad_norm": 3.0, |
|
"learning_rate": 0.00016376306620209061, |
|
"loss": 0.8334, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.8268059181897301, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.00016550522648083624, |
|
"loss": 0.8547, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.835509138381201, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 0.0001672473867595819, |
|
"loss": 0.8496, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.8442123585726719, |
|
"grad_norm": 42.75, |
|
"learning_rate": 0.00016898954703832754, |
|
"loss": 0.8443, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.8529155787641427, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.0001707317073170732, |
|
"loss": 0.8419, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.8616187989556136, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.00017247386759581884, |
|
"loss": 0.8447, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.8703220191470844, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 0.00017421602787456446, |
|
"loss": 0.836, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.8790252393385553, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 0.0001759581881533101, |
|
"loss": 0.8499, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.8877284595300261, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 0.00017770034843205576, |
|
"loss": 0.8429, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.896431679721497, |
|
"grad_norm": 1.875, |
|
"learning_rate": 0.00017944250871080138, |
|
"loss": 0.8562, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.9051348999129678, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 0.00018118466898954703, |
|
"loss": 0.8553, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.9138381201044387, |
|
"grad_norm": 8.25, |
|
"learning_rate": 0.0001829268292682927, |
|
"loss": 0.8297, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.9225413402959095, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.00018466898954703833, |
|
"loss": 0.8288, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.9312445604873804, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.00018641114982578398, |
|
"loss": 0.8325, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.9399477806788512, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.00018815331010452963, |
|
"loss": 0.8317, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.9486510008703221, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 0.00018989547038327526, |
|
"loss": 0.8422, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.9573542210617929, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 0.0001916376306620209, |
|
"loss": 0.8528, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.9660574412532638, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.00019337979094076658, |
|
"loss": 0.8385, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.9747606614447345, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 0.0001951219512195122, |
|
"loss": 0.8434, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.9834638816362054, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 0.00019686411149825786, |
|
"loss": 0.8416, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.9921671018276762, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 0.00019860627177700348, |
|
"loss": 0.8445, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.999129677980853, |
|
"eval_loss": 2.1106536388397217, |
|
"eval_runtime": 1.1027, |
|
"eval_samples_per_second": 5.441, |
|
"eval_steps_per_second": 0.907, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 1.000870322019147, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.00019999998150897728, |
|
"loss": 0.8531, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.009573542210618, |
|
"grad_norm": 2.625, |
|
"learning_rate": 0.00019999933432389942, |
|
"loss": 0.7707, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.0182767624020888, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 0.00019999776259452297, |
|
"loss": 0.7908, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.0269799825935597, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 0.00019999526633537938, |
|
"loss": 0.7832, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.0356832027850305, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 0.00019999184556954776, |
|
"loss": 0.7502, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.0443864229765012, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 0.00019998750032865483, |
|
"loss": 0.7704, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.0530896431679722, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.00019998223065287456, |
|
"loss": 0.7887, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.061792863359443, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 0.00019997603659092773, |
|
"loss": 0.7848, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.0704960835509139, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.00019996891820008164, |
|
"loss": 0.7635, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.0791993037423846, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.00019996087554614934, |
|
"loss": 0.7591, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.0879025239338556, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.00019995190870348922, |
|
"loss": 0.7569, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.0966057441253263, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.0001999420177550043, |
|
"loss": 0.7677, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.1053089643167973, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.00019993120279214135, |
|
"loss": 0.7648, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.114012184508268, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 0.00019991946391489018, |
|
"loss": 0.7819, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.122715404699739, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 0.00019990680123178263, |
|
"loss": 0.7606, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.1314186248912097, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.00019989321485989163, |
|
"loss": 0.796, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.1401218450826807, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 0.00019987870492482997, |
|
"loss": 0.7866, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.1488250652741514, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.00019986327156074939, |
|
"loss": 0.7824, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.1575282854656224, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 0.00019984691491033906, |
|
"loss": 0.7748, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.166231505657093, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 0.00019982963512482453, |
|
"loss": 0.794, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.174934725848564, |
|
"grad_norm": 7.3125, |
|
"learning_rate": 0.00019981143236396612, |
|
"loss": 0.7733, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.1836379460400348, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.00019979230679605749, |
|
"loss": 0.7919, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.1923411662315058, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.0001997722585979242, |
|
"loss": 0.7668, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.2010443864229765, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.000199751287954922, |
|
"loss": 0.7746, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.2097476066144472, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.000199729395060935, |
|
"loss": 0.778, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.2184508268059182, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00019970658011837404, |
|
"loss": 0.7742, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.227154046997389, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.00019968284333817486, |
|
"loss": 0.7856, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.23585726718886, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.00019965818493979586, |
|
"loss": 0.78, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.2445604873803306, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.00019963260515121648, |
|
"loss": 0.804, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.2532637075718016, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0001996061042089347, |
|
"loss": 0.7713, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.2619669277632724, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.00019957868235796514, |
|
"loss": 0.7725, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.2706701479547433, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0001995503398518366, |
|
"loss": 0.7738, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.279373368146214, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.00019952107695258992, |
|
"loss": 0.7935, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.288076588337685, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.0001994908939307753, |
|
"loss": 0.7573, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.2967798085291558, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00019945979106545002, |
|
"loss": 0.8069, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 1.3054830287206267, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 0.0001994277686441758, |
|
"loss": 0.7752, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.3141862489120975, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 0.00019939482696301606, |
|
"loss": 0.7989, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 1.3228894691035684, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00019936096632653324, |
|
"loss": 0.7946, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.3315926892950392, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 0.000199326187047786, |
|
"loss": 0.7781, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 1.34029590948651, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 0.00019929048944832638, |
|
"loss": 0.7819, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.3489991296779809, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.00019925387385819664, |
|
"loss": 0.7702, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.3577023498694518, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.00019921634061592644, |
|
"loss": 0.7759, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.3664055700609226, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.0001991778900685295, |
|
"loss": 0.7683, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 1.3751087902523933, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00019913852257150052, |
|
"loss": 0.7831, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.3838120104438643, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.0001990982384888119, |
|
"loss": 0.7823, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 1.392515230635335, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.0001990570381929103, |
|
"loss": 0.7698, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.401218450826806, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.00019901492206471325, |
|
"loss": 0.7663, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 1.4099216710182767, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00019897189049360557, |
|
"loss": 0.7966, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.4186248912097477, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.00019892794387743593, |
|
"loss": 0.7792, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 1.4273281114012184, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00019888308262251285, |
|
"loss": 0.7761, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.4360313315926894, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 0.00019883730714360137, |
|
"loss": 0.772, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.44473455178416, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 0.00019879061786391881, |
|
"loss": 0.7705, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.453437771975631, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 0.0001987430152151312, |
|
"loss": 0.7637, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 1.4621409921671018, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 0.00019869449963734893, |
|
"loss": 0.7647, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.4708442123585725, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.0001986450715791231, |
|
"loss": 0.7772, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 1.4795474325500435, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.000198594731497441, |
|
"loss": 0.7538, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.4882506527415145, |
|
"grad_norm": 3.5, |
|
"learning_rate": 0.00019854347985772208, |
|
"loss": 0.7732, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.4969538729329852, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.00019849131713381364, |
|
"loss": 0.7777, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.505657093124456, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.00019843824380798633, |
|
"loss": 0.7742, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 1.514360313315927, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.00019838426037092988, |
|
"loss": 0.7596, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.5230635335073979, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 0.00019832936732174834, |
|
"loss": 0.7668, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.5317667536988686, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.0001982735651679557, |
|
"loss": 0.7635, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.5404699738903394, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.000198216854425471, |
|
"loss": 0.7745, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 1.5491731940818103, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.0001981592356186137, |
|
"loss": 0.7905, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.5578764142732813, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00019810070928009867, |
|
"loss": 0.7773, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 1.566579634464752, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 0.0001980412759510315, |
|
"loss": 0.7611, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.5752828546562228, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.00019798093618090328, |
|
"loss": 0.7705, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 1.5839860748476937, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.00019791969052758562, |
|
"loss": 0.7895, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.5926892950391645, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 0.0001978575395573255, |
|
"loss": 0.7738, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 1.6013925152306352, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.00019779448384474, |
|
"loss": 0.7661, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.6100957354221062, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.000197730523972811, |
|
"loss": 0.7561, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.6187989556135771, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.00019766566053287975, |
|
"loss": 0.7742, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.6275021758050479, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.00019759989412464153, |
|
"loss": 0.7742, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 1.6362053959965186, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.0001975332253561399, |
|
"loss": 0.769, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.6449086161879896, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00019746565484376132, |
|
"loss": 0.7564, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 1.6536118363794605, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.00019739718321222928, |
|
"loss": 0.7574, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.6623150565709313, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.00019732781109459846, |
|
"loss": 0.7702, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 1.671018276762402, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00019725753913224918, |
|
"loss": 0.7785, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.679721496953873, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 0.0001971863679748812, |
|
"loss": 0.7694, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 1.688424717145344, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.00019711429828050769, |
|
"loss": 0.7802, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.6971279373368147, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.00019704133071544942, |
|
"loss": 0.7629, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.7058311575282854, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00019696746595432828, |
|
"loss": 0.7739, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.7145343777197564, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.00019689270468006132, |
|
"loss": 0.7794, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 1.723237597911227, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.00019681704758385418, |
|
"loss": 0.7575, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.7319408181026978, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.0001967404953651949, |
|
"loss": 0.7673, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 1.7406440382941688, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.00019666304873184739, |
|
"loss": 0.7734, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.7493472584856398, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 0.0001965847083998448, |
|
"loss": 0.7785, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 1.7580504786771105, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 0.00019650547509348306, |
|
"loss": 0.7652, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.7667536988685812, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 0.0001964253495453141, |
|
"loss": 0.7631, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 1.7754569190600522, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00019634433249613898, |
|
"loss": 0.7819, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.7841601392515232, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0001962624246950012, |
|
"loss": 0.7774, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 1.792863359442994, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00019617962689917975, |
|
"loss": 0.7723, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.8015665796344646, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00019609593987418198, |
|
"loss": 0.7645, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 1.8102697998259356, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00019601136439373668, |
|
"loss": 0.7653, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.8189730200174066, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0001959259012397868, |
|
"loss": 0.7756, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 1.8276762402088773, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00019583955120248237, |
|
"loss": 0.7656, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.836379460400348, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.00019575231508017307, |
|
"loss": 0.761, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 1.845082680591819, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.0001956641936794008, |
|
"loss": 0.7584, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.85378590078329, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.00019557518781489238, |
|
"loss": 0.749, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 1.8624891209747607, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00019548529830955196, |
|
"loss": 0.7635, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.8711923411662315, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.00019539452599445336, |
|
"loss": 0.7601, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 1.8798955613577024, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.0001953028717088324, |
|
"loss": 0.7869, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.8885987815491732, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 0.00019521033630007928, |
|
"loss": 0.766, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 1.897302001740644, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.00019511692062373044, |
|
"loss": 0.7744, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.9060052219321149, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.000195022625543461, |
|
"loss": 0.7749, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 1.9147084421235858, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.0001949274519310765, |
|
"loss": 0.7684, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.9234116623150566, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.00019483140066650507, |
|
"loss": 0.7596, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 1.9321148825065273, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00019473447263778905, |
|
"loss": 0.768, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.9408181026979983, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 0.00019463666874107704, |
|
"loss": 0.7563, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 1.9495213228894692, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00019453798988061535, |
|
"loss": 0.7834, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.95822454308094, |
|
"grad_norm": 7.6875, |
|
"learning_rate": 0.00019443843696873985, |
|
"loss": 0.7471, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 1.9669277632724107, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 0.00019433801092586742, |
|
"loss": 0.768, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.9756309834638817, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.00019423671268048754, |
|
"loss": 0.7806, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 1.9843342036553526, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.00019413454316915356, |
|
"loss": 0.7543, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.9930374238468234, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00019403150333647417, |
|
"loss": 0.784, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 2.230104684829712, |
|
"eval_runtime": 0.7759, |
|
"eval_samples_per_second": 7.733, |
|
"eval_steps_per_second": 1.289, |
|
"step": 1149 |
|
}, |
|
{ |
|
"epoch": 2.001740644038294, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.0001939275941351046, |
|
"loss": 0.7099, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.010443864229765, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.00019382281652573785, |
|
"loss": 0.6306, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 2.019147084421236, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.00019371717147709583, |
|
"loss": 0.6241, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.0278503046127065, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.0001936106599659202, |
|
"loss": 0.6167, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 2.0365535248041775, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00019350328297696373, |
|
"loss": 0.6173, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.0452567449956485, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.00019339504150298084, |
|
"loss": 0.6234, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 2.0539599651871194, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.00019328593654471848, |
|
"loss": 0.6151, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.06266318537859, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00019317596911090713, |
|
"loss": 0.6386, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 2.071366405570061, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.00019306514021825118, |
|
"loss": 0.6209, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.080069625761532, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00019295345089141963, |
|
"loss": 0.625, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 2.0887728459530024, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.00019284090216303666, |
|
"loss": 0.6336, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.0974760661444734, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00019272749507367212, |
|
"loss": 0.6266, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 2.1061792863359443, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00019261323067183166, |
|
"loss": 0.6286, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.1148825065274153, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0001924981100139474, |
|
"loss": 0.6458, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 2.123585726718886, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 0.00019238213416436785, |
|
"loss": 0.6328, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.1322889469103568, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.00019226530419534833, |
|
"loss": 0.6398, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 2.1409921671018277, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 0.00019214762118704076, |
|
"loss": 0.6361, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.1496953872932987, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.000192029086227484, |
|
"loss": 0.6357, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 2.158398607484769, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.00019190970041259352, |
|
"loss": 0.6277, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.16710182767624, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.0001917894648461514, |
|
"loss": 0.6455, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 2.175805047867711, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.00019166838063979614, |
|
"loss": 0.6374, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.184508268059182, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.0001915464489130123, |
|
"loss": 0.6343, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 2.1932114882506526, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.00019142367079312021, |
|
"loss": 0.623, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.2019147084421236, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00019130004741526558, |
|
"loss": 0.6359, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 2.2106179286335945, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.00019117557992240887, |
|
"loss": 0.6344, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.2193211488250655, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00019105026946531482, |
|
"loss": 0.6511, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 2.228024369016536, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.0001909241172025419, |
|
"loss": 0.636, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.236727589208007, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.00019079712430043134, |
|
"loss": 0.6374, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 2.245430809399478, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.0001906692919330967, |
|
"loss": 0.6359, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.254134029590949, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.00019054062128241264, |
|
"loss": 0.6518, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 2.2628372497824194, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.00019041111353800425, |
|
"loss": 0.6428, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.2715404699738904, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.00019028076989723597, |
|
"loss": 0.6562, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 2.2802436901653613, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.00019014959156520052, |
|
"loss": 0.6495, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.288946910356832, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0001900175797547078, |
|
"loss": 0.6466, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 2.297650130548303, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.00018988473568627354, |
|
"loss": 0.6603, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.3063533507397738, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.00018975106058810823, |
|
"loss": 0.6352, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 2.3150565709312447, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 0.00018961655569610557, |
|
"loss": 0.6592, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.3237597911227152, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00018948122225383114, |
|
"loss": 0.6515, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 2.332463011314186, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00018934506151251093, |
|
"loss": 0.6534, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.341166231505657, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.00018920807473101964, |
|
"loss": 0.6558, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 2.349869451697128, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.00018907026317586923, |
|
"loss": 0.6547, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.3585726718885986, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00018893162812119702, |
|
"loss": 0.6541, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 2.3672758920800696, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00018879217084875408, |
|
"loss": 0.655, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.3759791122715406, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0001886518926478932, |
|
"loss": 0.648, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 2.3846823324630115, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00018851079481555714, |
|
"loss": 0.6474, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.393385552654482, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.00018836887865626654, |
|
"loss": 0.6543, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 2.402088772845953, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.00018822614548210797, |
|
"loss": 0.6529, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.410791993037424, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.00018808259661272153, |
|
"loss": 0.6612, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 2.4194952132288945, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.000187938233375289, |
|
"loss": 0.6519, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.4281984334203655, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 0.00018779305710452132, |
|
"loss": 0.6558, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 2.4369016536118364, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.00018764706914264635, |
|
"loss": 0.6532, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.4456048738033074, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00018750027083939654, |
|
"loss": 0.6443, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 2.454308093994778, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00018735266355199618, |
|
"loss": 0.6544, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.463011314186249, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00018720424864514913, |
|
"loss": 0.6663, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 2.47171453437772, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 0.0001870550274910261, |
|
"loss": 0.6654, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.480417754569191, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00018690500146925193, |
|
"loss": 0.6456, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 2.4891209747606613, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.00018675417196689292, |
|
"loss": 0.6495, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.4978241949521323, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 0.6551, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 2.506527415143603, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.00018645010810581535, |
|
"loss": 0.6432, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.515230635335074, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00018629687655832063, |
|
"loss": 0.6521, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 2.5239338555265447, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.00018614284715266264, |
|
"loss": 0.6626, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.5326370757180157, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.00018598802131292093, |
|
"loss": 0.6451, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 2.5413402959094866, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.00018583240047053863, |
|
"loss": 0.6627, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.550043516100957, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00018567598606430882, |
|
"loss": 0.6756, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 2.558746736292428, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.00018551877954036162, |
|
"loss": 0.6734, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.567449956483899, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0001853607823521507, |
|
"loss": 0.6495, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 2.57615317667537, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.00018520199596043976, |
|
"loss": 0.6459, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.584856396866841, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.0001850424218332891, |
|
"loss": 0.6665, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 2.5935596170583115, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.00018488206144604203, |
|
"loss": 0.6637, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.6022628372497825, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 0.00018472091628131125, |
|
"loss": 0.6705, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 2.6109660574412534, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00018455898782896511, |
|
"loss": 0.6601, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.619669277632724, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.00018439627758611385, |
|
"loss": 0.6591, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 2.628372497824195, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00018423278705709573, |
|
"loss": 0.6574, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.637075718015666, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.00018406851775346322, |
|
"loss": 0.6665, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 2.645778938207137, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.0001839034711939689, |
|
"loss": 0.6591, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.6544821583986073, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00018373764890455146, |
|
"loss": 0.6505, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 2.6631853785900783, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00018357105241832163, |
|
"loss": 0.6654, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.6718885987815493, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.000183403683275548, |
|
"loss": 0.6551, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 2.68059181897302, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00018323554302364272, |
|
"loss": 0.6647, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.6892950391644908, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 0.0001830666332171473, |
|
"loss": 0.6658, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 2.6979982593559617, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 0.00018289695541771802, |
|
"loss": 0.6584, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.7067014795474327, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00018272651119411186, |
|
"loss": 0.6661, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 2.7154046997389036, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 0.0001825553021221716, |
|
"loss": 0.6695, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.724107919930374, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.00018238332978481148, |
|
"loss": 0.6592, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 2.732811140121845, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.0001822105957720025, |
|
"loss": 0.6587, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.741514360313316, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00018203710168075788, |
|
"loss": 0.6635, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 2.7502175805047866, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.00018186284911511787, |
|
"loss": 0.6567, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.7589208006962576, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.0001816878396861355, |
|
"loss": 0.6543, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 2.7676240208877285, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.0001815120750118611, |
|
"loss": 0.6662, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.7763272410791995, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.0001813355567173279, |
|
"loss": 0.6637, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 2.78503046127067, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.00018115828643453647, |
|
"loss": 0.6598, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.793733681462141, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.0001809802658024401, |
|
"loss": 0.6734, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 2.802436901653612, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 0.0001808014964669293, |
|
"loss": 0.6547, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.8111401218450824, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.0001806219800808168, |
|
"loss": 0.6662, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 2.8198433420365534, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00018044171830382215, |
|
"loss": 0.658, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.8285465622280244, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.0001802607128025564, |
|
"loss": 0.6574, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 2.8372497824194953, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.0001800789652505068, |
|
"loss": 0.6631, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.8459530026109663, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.00017989647732802113, |
|
"loss": 0.6606, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 2.854656222802437, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00017971325072229226, |
|
"loss": 0.6759, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.8633594429939078, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00017952928712734268, |
|
"loss": 0.6751, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 2.8720626631853787, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 0.00017934458824400858, |
|
"loss": 0.6604, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.8807658833768492, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.00017915915577992433, |
|
"loss": 0.6528, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 2.88946910356832, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.00017897299144950662, |
|
"loss": 0.653, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.898172323759791, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.00017878609697393868, |
|
"loss": 0.6757, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 2.906875543951262, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00017859847408115414, |
|
"loss": 0.6608, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.9155787641427326, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 0.00017841012450582134, |
|
"loss": 0.6624, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 2.9242819843342036, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.00017822104998932713, |
|
"loss": 0.671, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.9329852045256746, |
|
"grad_norm": 2.0, |
|
"learning_rate": 0.00017803125227976082, |
|
"loss": 0.6495, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 2.941688424717145, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.00017784073313189795, |
|
"loss": 0.6729, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.950391644908616, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00017764949430718426, |
|
"loss": 0.6656, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 2.959094865100087, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00017745753757371905, |
|
"loss": 0.6674, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.967798085291558, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.00017726486470623926, |
|
"loss": 0.6585, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 2.976501305483029, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 0.00017707147748610274, |
|
"loss": 0.6659, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.9852045256744995, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.00017687737770127185, |
|
"loss": 0.67, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 2.9939077458659704, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00017668256714629713, |
|
"loss": 0.6545, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.9991296779808527, |
|
"eval_loss": 2.432891607284546, |
|
"eval_runtime": 1.0987, |
|
"eval_samples_per_second": 5.461, |
|
"eval_steps_per_second": 0.91, |
|
"step": 1723 |
|
}, |
|
{ |
|
"epoch": 3.0026109660574414, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.00017648704762230036, |
|
"loss": 0.6195, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 3.011314186248912, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.00017629082093695823, |
|
"loss": 0.5228, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 3.020017406440383, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 0.00017609388890448547, |
|
"loss": 0.5116, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 3.028720626631854, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00017589625334561801, |
|
"loss": 0.5045, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 3.037423846823325, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.00017569791608759635, |
|
"loss": 0.51, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 3.0461270670147953, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00017549887896414851, |
|
"loss": 0.5144, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.0548302872062663, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0001752991438154731, |
|
"loss": 0.5033, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 3.063533507397737, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.00017509871248822236, |
|
"loss": 0.5268, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 3.072236727589208, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00017489758683548502, |
|
"loss": 0.5163, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 3.0809399477806787, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00017469576871676922, |
|
"loss": 0.5165, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 3.0896431679721497, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00017449325999798528, |
|
"loss": 0.5237, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 3.0983463881636206, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.00017429006255142851, |
|
"loss": 0.5108, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 3.1070496083550916, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.0001740861782557618, |
|
"loss": 0.5086, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 3.115752828546562, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.0001738816089959983, |
|
"loss": 0.523, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 3.124456048738033, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 0.00017367635666348406, |
|
"loss": 0.5265, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 3.133159268929504, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.00017347042315588046, |
|
"loss": 0.5328, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.1418624891209745, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00017326381037714668, |
|
"loss": 0.5294, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 3.1505657093124455, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.00017305652023752205, |
|
"loss": 0.5264, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 3.1592689295039165, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.00017284855465350856, |
|
"loss": 0.5164, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 3.1679721496953874, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.0001726399155478529, |
|
"loss": 0.5269, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 3.176675369886858, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00017243060484952894, |
|
"loss": 0.5237, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 3.185378590078329, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00017222062449371962, |
|
"loss": 0.5189, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 3.1940818102698, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0001720099764217993, |
|
"loss": 0.5306, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 3.202785030461271, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00017179866258131568, |
|
"loss": 0.5401, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 3.2114882506527413, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00017158668492597186, |
|
"loss": 0.5254, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 3.2201914708442123, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00017137404541560817, |
|
"loss": 0.5306, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 3.2288946910356833, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.00017116074601618417, |
|
"loss": 0.5299, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 3.2375979112271542, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00017094678869976045, |
|
"loss": 0.53, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 3.2463011314186248, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0001707321754444803, |
|
"loss": 0.5422, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 3.2550043516100957, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00017051690823455162, |
|
"loss": 0.5357, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 3.2637075718015667, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00017030098906022832, |
|
"loss": 0.5355, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 3.272410791993037, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0001700844199177921, |
|
"loss": 0.5439, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 3.281114012184508, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00016986720280953396, |
|
"loss": 0.5294, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 3.289817232375979, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.0001696493397437357, |
|
"loss": 0.5485, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 3.29852045256745, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 0.0001694308327346512, |
|
"loss": 0.5429, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 3.307223672758921, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.0001692116838024881, |
|
"loss": 0.5518, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.3159268929503916, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.00016899189497338876, |
|
"loss": 0.5429, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 3.3246301131418625, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.00016877146827941187, |
|
"loss": 0.5392, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00016855040575851335, |
|
"loss": 0.5338, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 3.342036553524804, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.00016832870945452776, |
|
"loss": 0.545, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 3.350739773716275, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00016810638141714934, |
|
"loss": 0.56, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 3.359442993907746, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 0.0001678834237019129, |
|
"loss": 0.5483, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 3.368146214099217, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00016765983837017503, |
|
"loss": 0.5448, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 3.3768494342906874, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.00016743562748909493, |
|
"loss": 0.5463, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 3.3855526544821584, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00016721079313161534, |
|
"loss": 0.5518, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 3.3942558746736293, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.00016698533737644327, |
|
"loss": 0.551, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.4029590948651, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.000166759262308031, |
|
"loss": 0.5452, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 3.411662315056571, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00016653257001655652, |
|
"loss": 0.5371, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 3.4203655352480418, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00016630526259790455, |
|
"loss": 0.5615, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 3.4290687554395127, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.00016607734215364674, |
|
"loss": 0.5466, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 3.4377719756309837, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.00016584881079102263, |
|
"loss": 0.554, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 3.446475195822454, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.00016561967062292, |
|
"loss": 0.5541, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.455178416013925, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00016538992376785529, |
|
"loss": 0.5476, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 3.463881636205396, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.0001651595723499541, |
|
"loss": 0.5543, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 3.4725848563968666, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.0001649286184989315, |
|
"loss": 0.5547, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 3.4812880765883376, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.00016469706435007236, |
|
"loss": 0.5467, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.4899912967798086, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 0.0001644649120442116, |
|
"loss": 0.539, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 3.4986945169712795, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.00016423216372771443, |
|
"loss": 0.5448, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 3.5073977371627505, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0001639988215524565, |
|
"loss": 0.5639, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 3.516100957354221, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0001637648876758039, |
|
"loss": 0.5511, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 3.524804177545692, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.00016353036426059334, |
|
"loss": 0.5438, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 3.5335073977371625, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.0001632952534751122, |
|
"loss": 0.548, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 3.5422106179286335, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00016305955749307816, |
|
"loss": 0.5532, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 3.5509138381201044, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.00016282327849361967, |
|
"loss": 0.5432, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.5596170583115754, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.00016258641866125518, |
|
"loss": 0.551, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 3.5683202785030463, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00016234898018587337, |
|
"loss": 0.5454, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.577023498694517, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00016211096526271273, |
|
"loss": 0.5555, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 3.585726718885988, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00016187237609234132, |
|
"loss": 0.5503, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.594429939077459, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 0.00016163321488063637, |
|
"loss": 0.5432, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 3.6031331592689293, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.000161393483838764, |
|
"loss": 0.5531, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 3.6118363794604003, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0001611531851831586, |
|
"loss": 0.5479, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 3.620539599651871, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0001609123211355025, |
|
"loss": 0.553, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 3.629242819843342, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 0.00016067089392270533, |
|
"loss": 0.5554, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 3.637946040034813, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00016042890577688349, |
|
"loss": 0.5501, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 3.6466492602262837, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.0001601863589353395, |
|
"loss": 0.5488, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 3.6553524804177546, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.00015994325564054122, |
|
"loss": 0.5618, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.664055700609225, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00015969959814010132, |
|
"loss": 0.5526, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 3.672758920800696, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.00015945538868675628, |
|
"loss": 0.5492, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 3.681462140992167, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.0001592106295383458, |
|
"loss": 0.5558, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 3.690165361183638, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00015896532295779157, |
|
"loss": 0.5576, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 3.698868581375109, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.00015871947121307676, |
|
"loss": 0.5514, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 3.7075718015665795, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.0001584730765772248, |
|
"loss": 0.5615, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 3.7162750217580505, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.00015822614132827837, |
|
"loss": 0.5489, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 3.7249782419495214, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.00015797866774927848, |
|
"loss": 0.5507, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 3.733681462140992, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0001577306581282432, |
|
"loss": 0.5574, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 3.742384682332463, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.00015748211475814658, |
|
"loss": 0.5579, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.751087902523934, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.00015723303993689754, |
|
"loss": 0.5736, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 3.759791122715405, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.0001569834359673184, |
|
"loss": 0.553, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 3.768494342906876, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00015673330515712382, |
|
"loss": 0.5617, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 3.7771975630983463, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.00015648264981889934, |
|
"loss": 0.5583, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 3.7859007832898173, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00015623147227008006, |
|
"loss": 0.5584, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 3.7946040034812882, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00015597977483292907, |
|
"loss": 0.5559, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 3.8033072236727588, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.00015572755983451626, |
|
"loss": 0.5543, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 3.8120104438642297, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00015547482960669645, |
|
"loss": 0.5554, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 3.8207136640557007, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00015522158648608817, |
|
"loss": 0.5665, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 3.8294168842471716, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.00015496783281405177, |
|
"loss": 0.5614, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.838120104438642, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00015471357093666804, |
|
"loss": 0.5596, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 3.846823324630113, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.0001544588032047163, |
|
"loss": 0.553, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 3.855526544821584, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.0001542035319736528, |
|
"loss": 0.549, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 3.8642297650130546, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0001539477596035888, |
|
"loss": 0.5562, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 3.8729329852045256, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.00015369148845926893, |
|
"loss": 0.5658, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 3.8816362053959965, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00015343472091004925, |
|
"loss": 0.5625, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 3.8903394255874675, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00015317745932987524, |
|
"loss": 0.5613, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 3.8990426457789384, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.00015291970609726007, |
|
"loss": 0.567, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 3.907745865970409, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.0001526614635952624, |
|
"loss": 0.568, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 3.91644908616188, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.0001524027342114644, |
|
"loss": 0.5671, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 3.925152306353351, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.0001521435203379498, |
|
"loss": 0.5538, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 3.9338555265448214, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00015188382437128167, |
|
"loss": 0.5624, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 3.9425587467362924, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.00015162364871248023, |
|
"loss": 0.5491, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 3.9512619669277633, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0001513629957670007, |
|
"loss": 0.5575, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 3.9599651871192343, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00015110186794471103, |
|
"loss": 0.5639, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 3.968668407310705, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.00015084026765986979, |
|
"loss": 0.564, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 3.9773716275021758, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.00015057819733110348, |
|
"loss": 0.569, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 3.9860748476936467, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 0.00015031565938138458, |
|
"loss": 0.5676, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 3.9947780678851172, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.0001500526562380089, |
|
"loss": 0.5693, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.70300030708313, |
|
"eval_runtime": 0.778, |
|
"eval_samples_per_second": 7.712, |
|
"eval_steps_per_second": 1.285, |
|
"step": 2298 |
|
}, |
|
{ |
|
"epoch": 4.003481288076588, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00014978919033257316, |
|
"loss": 0.5013, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.012184508268059, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.00014952526410095258, |
|
"loss": 0.412, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 4.02088772845953, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00014926087998327837, |
|
"loss": 0.4225, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 4.029590948651001, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.00014899604042391506, |
|
"loss": 0.4255, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 4.038294168842472, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.000148730747871438, |
|
"loss": 0.4108, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 4.046997389033942, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.0001484650047786107, |
|
"loss": 0.4152, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 4.055700609225413, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00014819881360236207, |
|
"loss": 0.4197, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 4.064403829416884, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.00014793217680376394, |
|
"loss": 0.4203, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 4.073107049608355, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.00014766509684800794, |
|
"loss": 0.4138, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 4.081810269799826, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.00014739757620438307, |
|
"loss": 0.4167, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 4.090513489991297, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.00014712961734625264, |
|
"loss": 0.4183, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 4.099216710182768, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.0001468612227510315, |
|
"loss": 0.4302, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 4.107919930374239, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00014659239490016302, |
|
"loss": 0.4329, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 4.116623150565709, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.00014632313627909642, |
|
"loss": 0.4304, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 4.12532637075718, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.00014605344937726345, |
|
"loss": 0.4194, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 4.134029590948651, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00014578333668805558, |
|
"loss": 0.4195, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 4.142732811140122, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.0001455128007088009, |
|
"loss": 0.4354, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 4.151436031331593, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.00014524184394074102, |
|
"loss": 0.442, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 4.160139251523064, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00014497046888900801, |
|
"loss": 0.433, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 4.168842471714535, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.00014469867806260115, |
|
"loss": 0.4325, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 4.177545691906005, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.00014442647397436365, |
|
"loss": 0.4255, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.186248912097476, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.0001441538591409598, |
|
"loss": 0.4419, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 4.194952132288947, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.00014388083608285113, |
|
"loss": 0.4354, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 4.203655352480418, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00014360740732427367, |
|
"loss": 0.4308, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 4.212358572671889, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00014333357539321416, |
|
"loss": 0.434, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 4.22106179286336, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00014305934282138701, |
|
"loss": 0.4402, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 4.2297650130548305, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.00014278471214421073, |
|
"loss": 0.4298, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 4.2384682332463015, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.0001425096859007844, |
|
"loss": 0.4332, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 4.247171453437772, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.0001422342666338645, |
|
"loss": 0.4441, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 4.2558746736292425, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.00014195845688984104, |
|
"loss": 0.435, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 4.2645778938207135, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00014168225921871433, |
|
"loss": 0.4355, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 4.2732811140121845, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00014140567617407105, |
|
"loss": 0.4422, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 4.281984334203655, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.00014112871031306119, |
|
"loss": 0.4347, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 4.290687554395126, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00014085136419637369, |
|
"loss": 0.4353, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 4.299390774586597, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.00014057364038821347, |
|
"loss": 0.4425, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 4.308093994778067, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.00014029554145627714, |
|
"loss": 0.4419, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 4.316797214969538, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.00014001706997172973, |
|
"loss": 0.4403, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 4.325500435161009, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00013973822850918055, |
|
"loss": 0.4427, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 4.33420365535248, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.0001394590196466596, |
|
"loss": 0.4351, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 4.342906875543951, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00013917944596559376, |
|
"loss": 0.437, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 4.351610095735422, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.0001388995100507827, |
|
"loss": 0.4383, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.360313315926893, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0001386192144903752, |
|
"loss": 0.4403, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 4.369016536118364, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00013833856187584514, |
|
"loss": 0.4474, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 4.377719756309834, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.00013805755480196755, |
|
"loss": 0.4424, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 4.386422976501305, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.0001377761958667946, |
|
"loss": 0.4495, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 4.395126196692776, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.00013749448767163156, |
|
"loss": 0.4468, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 4.403829416884247, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.0001372124328210129, |
|
"loss": 0.4472, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 4.412532637075718, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.0001369300339226779, |
|
"loss": 0.4459, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 4.421235857267189, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.000136647293587547, |
|
"loss": 0.4462, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 4.42993907745866, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.00013636421442969718, |
|
"loss": 0.4439, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 4.438642297650131, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.00013608079906633807, |
|
"loss": 0.4468, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 4.447345517841601, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.00013579705011778766, |
|
"loss": 0.4528, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 4.456048738033072, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.00013551297020744825, |
|
"loss": 0.4449, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 4.464751958224543, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.0001352285619617818, |
|
"loss": 0.4475, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 4.473455178416014, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00013494382801028615, |
|
"loss": 0.4431, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 4.482158398607485, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.00013465877098547033, |
|
"loss": 0.4472, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 4.490861618798956, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00013437339352283026, |
|
"loss": 0.4492, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 4.499564838990427, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00013408769826082467, |
|
"loss": 0.46, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 4.508268059181898, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00013380168784085027, |
|
"loss": 0.449, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 4.516971279373368, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00013351536490721784, |
|
"loss": 0.4548, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 4.525674499564839, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00013322873210712727, |
|
"loss": 0.4428, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 4.53437771975631, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.00013294179209064348, |
|
"loss": 0.4523, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 4.543080939947781, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.0001326545475106716, |
|
"loss": 0.4523, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 4.551784160139252, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.0001323670010229328, |
|
"loss": 0.4463, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 4.560487380330723, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00013207915528593933, |
|
"loss": 0.4485, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 4.569190600522193, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00013179101296097035, |
|
"loss": 0.4508, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 4.577893820713664, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00013150257671204696, |
|
"loss": 0.446, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 4.586597040905135, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00013121384920590786, |
|
"loss": 0.448, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 4.595300261096606, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00013092483311198444, |
|
"loss": 0.4522, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 4.604003481288077, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00013063553110237642, |
|
"loss": 0.4565, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 4.6127067014795475, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00013034594585182677, |
|
"loss": 0.4575, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 4.6214099216710185, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 0.00013005608003769718, |
|
"loss": 0.4544, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 4.6301131418624895, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00012976593633994346, |
|
"loss": 0.457, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 4.63881636205396, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.00012947551744109043, |
|
"loss": 0.4478, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 4.6475195822454305, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00012918482602620733, |
|
"loss": 0.4591, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 4.6562228024369015, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.00012889386478288299, |
|
"loss": 0.4549, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 4.664926022628372, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00012860263640120085, |
|
"loss": 0.4468, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 4.673629242819843, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00012831114357371426, |
|
"loss": 0.444, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 4.682332463011314, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.0001280193889954215, |
|
"loss": 0.4649, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 4.691035683202785, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.0001277273753637408, |
|
"loss": 0.4608, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 4.699738903394256, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00012743510537848555, |
|
"loss": 0.4522, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 4.708442123585726, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.0001271425817418392, |
|
"loss": 0.4637, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 4.717145343777197, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00012684980715833039, |
|
"loss": 0.4589, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 4.725848563968668, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.0001265567843348078, |
|
"loss": 0.4552, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 4.734551784160139, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00012626351598041532, |
|
"loss": 0.4555, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 4.74325500435161, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.00012597000480656684, |
|
"loss": 0.463, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 4.751958224543081, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00012567625352692127, |
|
"loss": 0.462, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 4.760661444734552, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00012538226485735735, |
|
"loss": 0.4553, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 4.769364664926023, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00012508804151594867, |
|
"loss": 0.4525, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 4.778067885117493, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.0001247935862229385, |
|
"loss": 0.4609, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 4.786771105308964, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00012449890170071454, |
|
"loss": 0.4491, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 4.795474325500435, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00012420399067378392, |
|
"loss": 0.4502, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 4.804177545691906, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.00012390885586874783, |
|
"loss": 0.4527, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 4.812880765883377, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0001236135000142765, |
|
"loss": 0.4531, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 4.821583986074848, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00012331792584108374, |
|
"loss": 0.4511, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 4.830287206266319, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00012302213608190202, |
|
"loss": 0.4504, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 4.838990426457789, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.0001227261334714568, |
|
"loss": 0.4538, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 4.84769364664926, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00012242992074644162, |
|
"loss": 0.4585, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 4.856396866840731, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.0001221335006454925, |
|
"loss": 0.4518, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 4.865100087032202, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.00012183687590916291, |
|
"loss": 0.4534, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 4.873803307223673, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00012154004927989815, |
|
"loss": 0.4543, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.882506527415144, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00012124302350201016, |
|
"loss": 0.4549, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 4.891209747606615, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.00012094580132165211, |
|
"loss": 0.4405, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 4.899912967798086, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00012064838548679307, |
|
"loss": 0.4501, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 4.908616187989556, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.00012035077874719242, |
|
"loss": 0.4574, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 4.917319408181027, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 0.00012005298385437467, |
|
"loss": 0.4515, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 4.926022628372498, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00011975500356160383, |
|
"loss": 0.4532, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 4.934725848563969, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.00011945684062385803, |
|
"loss": 0.4533, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 4.94342906875544, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00011915849779780408, |
|
"loss": 0.4633, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 4.952132288946911, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 0.00011885997784177196, |
|
"loss": 0.4568, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 4.960835509138382, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00011856128351572921, |
|
"loss": 0.4543, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 4.969538729329852, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00011826241758125565, |
|
"loss": 0.4576, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 4.978241949521323, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.00011796338280151756, |
|
"loss": 0.4595, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 4.986945169712794, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0001176641819412424, |
|
"loss": 0.4549, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 4.9956483899042645, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.4555, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 4.999129677980853, |
|
"eval_loss": 3.144505500793457, |
|
"eval_runtime": 1.1115, |
|
"eval_samples_per_second": 5.398, |
|
"eval_steps_per_second": 0.9, |
|
"step": 2872 |
|
}, |
|
{ |
|
"epoch": 5.0043516100957355, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00011706529304564235, |
|
"loss": 0.4042, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 5.013054830287206, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.00011676561054734749, |
|
"loss": 0.3352, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 5.021758050478677, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00011646577304252433, |
|
"loss": 0.3304, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 5.030461270670148, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.0001161657833033219, |
|
"loss": 0.3354, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 5.039164490861618, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.0001158656441032967, |
|
"loss": 0.3342, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 5.047867711053089, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00011556535821738705, |
|
"loss": 0.3344, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 5.05657093124456, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00011526492842188745, |
|
"loss": 0.3339, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 5.065274151436031, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.000114964357494423, |
|
"loss": 0.3343, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 5.073977371627502, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00011466364821392348, |
|
"loss": 0.3391, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 5.082680591818973, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.00011436280336059799, |
|
"loss": 0.34, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 5.091383812010444, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00011406182571590893, |
|
"loss": 0.3388, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 5.100087032201914, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00011376071806254651, |
|
"loss": 0.3371, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 5.108790252393385, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00011345948318440289, |
|
"loss": 0.3496, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 5.117493472584856, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.0001131581238665465, |
|
"loss": 0.3433, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 5.126196692776327, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00011285664289519626, |
|
"loss": 0.3426, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 5.134899912967798, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00011255504305769589, |
|
"loss": 0.3352, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 5.143603133159269, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00011225332714248804, |
|
"loss": 0.3492, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 5.15230635335074, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00011195149793908856, |
|
"loss": 0.338, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 5.161009573542211, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00011164955823806079, |
|
"loss": 0.343, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 5.169712793733681, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00011134751083098946, |
|
"loss": 0.3407, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 5.178416013925152, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.00011104535851045539, |
|
"loss": 0.3391, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 5.187119234116623, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00011074310407000914, |
|
"loss": 0.3438, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 5.195822454308094, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00011044075030414553, |
|
"loss": 0.3394, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 5.204525674499565, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00011013830000827767, |
|
"loss": 0.3471, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 5.213228894691036, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.00010983575597871114, |
|
"loss": 0.3392, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 5.221932114882507, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00010953312101261815, |
|
"loss": 0.3436, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.230635335073977, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00010923039790801164, |
|
"loss": 0.3398, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 5.239338555265448, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00010892758946371944, |
|
"loss": 0.3469, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 5.248041775456919, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00010862469847935841, |
|
"loss": 0.3444, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 5.25674499564839, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00010832172775530851, |
|
"loss": 0.3431, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 5.265448215839861, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00010801868009268691, |
|
"loss": 0.3513, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 5.274151436031332, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00010771555829332223, |
|
"loss": 0.3476, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 5.282854656222803, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.00010741236515972839, |
|
"loss": 0.3471, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 5.291557876414274, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.0001071091034950788, |
|
"loss": 0.3416, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 5.300261096605744, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00010680577610318072, |
|
"loss": 0.3454, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 5.308964316797215, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.0001065023857884488, |
|
"loss": 0.3486, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 5.317667536988686, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 0.00010619893535587964, |
|
"loss": 0.3386, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 5.326370757180157, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.00010589542761102553, |
|
"loss": 0.3418, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 5.335073977371628, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00010559186535996873, |
|
"loss": 0.3522, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 5.3437771975630985, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.00010528825140929541, |
|
"loss": 0.3449, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 5.3524804177545695, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.00010498458856606972, |
|
"loss": 0.3473, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 5.36118363794604, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00010468087963780789, |
|
"loss": 0.353, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 5.3698868581375105, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 0.00010437712743245209, |
|
"loss": 0.352, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 5.3785900783289815, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00010407333475834487, |
|
"loss": 0.354, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 5.3872932985204525, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.00010376950442420259, |
|
"loss": 0.3436, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 5.395996518711923, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00010346563923909014, |
|
"loss": 0.3511, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 5.404699738903394, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 0.00010316174201239437, |
|
"loss": 0.3472, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 5.413402959094865, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00010285781555379852, |
|
"loss": 0.3449, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 5.422106179286336, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00010255386267325602, |
|
"loss": 0.3471, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 5.430809399477806, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00010224988618096458, |
|
"loss": 0.3523, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 5.439512619669277, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00010194588888734027, |
|
"loss": 0.3492, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 5.448215839860748, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00010164187360299142, |
|
"loss": 0.3465, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 5.456919060052219, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00010133784313869277, |
|
"loss": 0.3472, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 5.46562228024369, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00010103380030535929, |
|
"loss": 0.3558, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 5.474325500435161, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.0001007297479140204, |
|
"loss": 0.3539, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 5.483028720626632, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.00010042568877579388, |
|
"loss": 0.3486, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 5.491731940818102, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.00010012162570185983, |
|
"loss": 0.3573, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 5.500435161009573, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 9.981756150343485e-05, |
|
"loss": 0.3473, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 5.509138381201044, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 9.951349899174577e-05, |
|
"loss": 0.3558, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 5.517841601392515, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 9.920944097800398e-05, |
|
"loss": 0.3542, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 5.526544821583986, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 9.890539027337924e-05, |
|
"loss": 0.3471, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 5.535248041775457, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 9.860134968897366e-05, |
|
"loss": 0.3553, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 5.543951261966928, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 9.829732203579584e-05, |
|
"loss": 0.3558, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 5.552654482158399, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 9.799331012473493e-05, |
|
"loss": 0.3526, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 5.56135770234987, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 9.768931676653427e-05, |
|
"loss": 0.3499, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 5.57006092254134, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 9.738534477176596e-05, |
|
"loss": 0.3447, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 5.578764142732811, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 9.708139695080441e-05, |
|
"loss": 0.3568, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 5.587467362924282, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 9.677747611380058e-05, |
|
"loss": 0.3575, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 5.596170583115753, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 9.647358507065594e-05, |
|
"loss": 0.3536, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 5.604873803307224, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 9.616972663099647e-05, |
|
"loss": 0.3524, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 5.613577023498695, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 9.58659036041468e-05, |
|
"loss": 0.3541, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 5.622280243690165, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 9.556211879910414e-05, |
|
"loss": 0.3519, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 5.630983463881636, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 9.52583750245122e-05, |
|
"loss": 0.3514, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 5.639686684073107, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 9.495467508863542e-05, |
|
"loss": 0.3485, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 5.648389904264578, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 9.465102179933302e-05, |
|
"loss": 0.3547, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 5.657093124456049, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 9.434741796403282e-05, |
|
"loss": 0.3549, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 5.66579634464752, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 9.404386638970542e-05, |
|
"loss": 0.3502, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 5.674499564838991, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 9.37403698828383e-05, |
|
"loss": 0.354, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 5.683202785030462, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 9.343693124940977e-05, |
|
"loss": 0.3499, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 5.691906005221933, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 9.313355329486318e-05, |
|
"loss": 0.3535, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 5.700609225413403, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 9.283023882408065e-05, |
|
"loss": 0.3487, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 5.709312445604874, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 9.252699064135758e-05, |
|
"loss": 0.3458, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 5.718015665796345, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 9.22238115503764e-05, |
|
"loss": 0.3518, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 5.7267188859878155, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 9.192070435418079e-05, |
|
"loss": 0.3488, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 5.7354221061792865, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 9.161767185514964e-05, |
|
"loss": 0.3529, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 5.7441253263707575, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 9.131471685497134e-05, |
|
"loss": 0.3553, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 5.7528285465622275, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 9.101184215461774e-05, |
|
"loss": 0.3494, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 5.7615317667536985, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 9.070905055431822e-05, |
|
"loss": 0.357, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 5.7702349869451695, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 9.040634485353389e-05, |
|
"loss": 0.3592, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 5.77893820713664, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 9.010372785093167e-05, |
|
"loss": 0.3521, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 5.787641427328111, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 8.980120234435849e-05, |
|
"loss": 0.3605, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 5.796344647519582, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 8.949877113081521e-05, |
|
"loss": 0.35, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 5.805047867711053, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 8.919643700643103e-05, |
|
"loss": 0.3483, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 5.813751087902524, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 8.889420276643746e-05, |
|
"loss": 0.3505, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 5.822454308093995, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 8.859207120514255e-05, |
|
"loss": 0.3468, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 5.831157528285465, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 8.829004511590501e-05, |
|
"loss": 0.3539, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 5.839860748476936, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 8.798812729110837e-05, |
|
"loss": 0.3481, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 5.848563968668407, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 8.768632052213531e-05, |
|
"loss": 0.3551, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 5.857267188859878, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 8.738462759934168e-05, |
|
"loss": 0.3509, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 5.865970409051349, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 8.708305131203072e-05, |
|
"loss": 0.3551, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 5.87467362924282, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 8.678159444842737e-05, |
|
"loss": 0.3469, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 5.883376849434291, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 8.648025979565245e-05, |
|
"loss": 0.3544, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 5.892080069625761, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 8.617905013969688e-05, |
|
"loss": 0.3476, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 5.900783289817232, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 8.587796826539585e-05, |
|
"loss": 0.3531, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 5.909486510008703, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 8.557701695640321e-05, |
|
"loss": 0.3401, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 5.918189730200174, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 8.527619899516567e-05, |
|
"loss": 0.35, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 5.926892950391645, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 8.497551716289703e-05, |
|
"loss": 0.3474, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 5.935596170583116, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 8.467497423955249e-05, |
|
"loss": 0.35, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 5.944299390774587, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 8.437457300380309e-05, |
|
"loss": 0.3564, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 5.953002610966058, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 8.407431623300983e-05, |
|
"loss": 0.3516, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 5.961705831157528, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 8.377420670319795e-05, |
|
"loss": 0.356, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 5.970409051348999, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 8.347424718903151e-05, |
|
"loss": 0.3538, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 5.97911227154047, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 8.317444046378757e-05, |
|
"loss": 0.3491, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 5.987815491731941, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 8.28747892993306e-05, |
|
"loss": 0.3559, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 5.996518711923412, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 8.257529646608672e-05, |
|
"loss": 0.3504, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 3.7196741104125977, |
|
"eval_runtime": 0.7785, |
|
"eval_samples_per_second": 7.707, |
|
"eval_steps_per_second": 1.285, |
|
"step": 3447 |
|
}, |
|
{ |
|
"epoch": 6.005221932114883, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 8.227596473301835e-05, |
|
"loss": 0.2993, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 6.013925152306354, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 8.19767968675983e-05, |
|
"loss": 0.2552, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 6.022628372497824, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 8.167779563578456e-05, |
|
"loss": 0.2635, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 6.031331592689295, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 8.13789638019942e-05, |
|
"loss": 0.2613, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 6.040034812880766, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 8.108030412907844e-05, |
|
"loss": 0.2631, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 6.048738033072237, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 8.078181937829656e-05, |
|
"loss": 0.2646, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 6.057441253263708, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 8.048351230929074e-05, |
|
"loss": 0.2621, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 6.066144473455179, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 8.018538568006027e-05, |
|
"loss": 0.267, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 6.07484769364665, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 7.988744224693625e-05, |
|
"loss": 0.2599, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 6.0835509138381205, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 7.958968476455608e-05, |
|
"loss": 0.2643, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 6.092254134029591, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 7.929211598583794e-05, |
|
"loss": 0.269, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 6.100957354221062, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 7.899473866195526e-05, |
|
"loss": 0.2622, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 6.1096605744125325, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 7.869755554231145e-05, |
|
"loss": 0.2633, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 6.1183637946040035, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 7.840056937451444e-05, |
|
"loss": 0.2687, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 6.127067014795474, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 7.810378290435108e-05, |
|
"loss": 0.2622, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 6.135770234986945, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 7.780719887576213e-05, |
|
"loss": 0.2652, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 6.144473455178416, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 7.751082003081653e-05, |
|
"loss": 0.267, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 6.153176675369886, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 7.721464910968627e-05, |
|
"loss": 0.2621, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 6.161879895561357, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 7.691868885062088e-05, |
|
"loss": 0.2614, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 6.170583115752828, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 7.662294198992228e-05, |
|
"loss": 0.264, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 6.179286335944299, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 7.632741126191947e-05, |
|
"loss": 0.267, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 6.18798955613577, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 7.603209939894312e-05, |
|
"loss": 0.2638, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 6.196692776327241, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 7.573700913130035e-05, |
|
"loss": 0.2614, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 6.205395996518712, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 7.544214318724961e-05, |
|
"loss": 0.2659, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 6.214099216710183, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 7.514750429297528e-05, |
|
"loss": 0.2686, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 6.222802436901653, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 7.485309517256267e-05, |
|
"loss": 0.268, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 6.231505657093124, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 7.455891854797256e-05, |
|
"loss": 0.2652, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 6.240208877284595, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 7.426497713901629e-05, |
|
"loss": 0.2638, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 6.248912097476066, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 7.397127366333048e-05, |
|
"loss": 0.2649, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 6.257615317667537, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 7.3677810836352e-05, |
|
"loss": 0.271, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 6.266318537859008, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 7.338459137129266e-05, |
|
"loss": 0.2661, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 6.275021758050479, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 7.309161797911441e-05, |
|
"loss": 0.2693, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 6.283724978241949, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 7.279889336850408e-05, |
|
"loss": 0.2668, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 6.29242819843342, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 7.250642024584835e-05, |
|
"loss": 0.2709, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 6.301131418624891, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 7.22142013152088e-05, |
|
"loss": 0.2682, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 6.309834638816362, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 7.192223927829689e-05, |
|
"loss": 0.264, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 6.318537859007833, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 7.163053683444901e-05, |
|
"loss": 0.2719, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 6.327241079199304, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 7.133909668060131e-05, |
|
"loss": 0.2715, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 6.335944299390775, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 7.104792151126515e-05, |
|
"loss": 0.263, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 6.344647519582246, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 7.075701401850183e-05, |
|
"loss": 0.2629, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 6.353350739773716, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 7.046637689189794e-05, |
|
"loss": 0.2674, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 6.362053959965187, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 7.017601281854027e-05, |
|
"loss": 0.2684, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 6.370757180156658, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 6.988592448299124e-05, |
|
"loss": 0.2652, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 6.379460400348129, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 6.959611456726387e-05, |
|
"loss": 0.2642, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 6.3881636205396, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 6.930658575079705e-05, |
|
"loss": 0.2696, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 6.396866840731071, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 6.901734071043071e-05, |
|
"loss": 0.27, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 6.405570060922542, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 6.872838212038122e-05, |
|
"loss": 0.2699, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 6.414273281114012, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 6.843971265221655e-05, |
|
"loss": 0.2687, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 6.422976501305483, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 6.815133497483157e-05, |
|
"loss": 0.2681, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 6.431679721496954, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 6.786325175442339e-05, |
|
"loss": 0.2631, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 6.440382941688425, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 6.75754656544667e-05, |
|
"loss": 0.2619, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 6.449086161879896, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 6.728797933568924e-05, |
|
"loss": 0.2658, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 6.4577893820713665, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 6.700079545604708e-05, |
|
"loss": 0.2696, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 6.4664926022628375, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 6.671391667070002e-05, |
|
"loss": 0.2707, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 6.4751958224543085, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 6.642734563198723e-05, |
|
"loss": 0.2653, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 6.4838990426457785, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 6.614108498940252e-05, |
|
"loss": 0.2721, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 6.4926022628372495, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 6.585513738956996e-05, |
|
"loss": 0.2674, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 6.5013054830287205, |
|
"grad_norm": 0.75, |
|
"learning_rate": 6.556950547621936e-05, |
|
"loss": 0.2689, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 6.510008703220191, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 6.52841918901619e-05, |
|
"loss": 0.2695, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 6.518711923411662, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 6.499919926926566e-05, |
|
"loss": 0.269, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 6.527415143603133, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 6.471453024843113e-05, |
|
"loss": 0.2655, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 6.536118363794604, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 6.44301874595671e-05, |
|
"loss": 0.265, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 6.544821583986074, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 6.414617353156605e-05, |
|
"loss": 0.2627, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 6.553524804177545, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 6.386249109028013e-05, |
|
"loss": 0.2724, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 6.562228024369016, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 6.357914275849652e-05, |
|
"loss": 0.2693, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 6.570931244560487, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 6.329613115591359e-05, |
|
"loss": 0.273, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 6.579634464751958, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 6.301345889911637e-05, |
|
"loss": 0.2665, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 6.588337684943429, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 6.273112860155251e-05, |
|
"loss": 0.2676, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 6.5970409051349, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 6.2449142873508e-05, |
|
"loss": 0.2659, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 6.605744125326371, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 6.21675043220832e-05, |
|
"loss": 0.2691, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 6.614447345517842, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 6.188621555116865e-05, |
|
"loss": 0.273, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 6.623150565709312, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 6.160527916142093e-05, |
|
"loss": 0.2637, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 6.631853785900783, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 6.132469775023867e-05, |
|
"loss": 0.2665, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 6.640557006092254, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 6.104447391173858e-05, |
|
"loss": 0.2675, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 6.649260226283725, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 6.0764610236731524e-05, |
|
"loss": 0.2696, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 6.657963446475196, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 6.048510931269824e-05, |
|
"loss": 0.2654, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 6.020597372376589e-05, |
|
"loss": 0.2746, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 6.675369886858137, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 5.992720605068378e-05, |
|
"loss": 0.2731, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 6.684073107049608, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 5.964880887079972e-05, |
|
"loss": 0.2694, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 6.692776327241079, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 5.937078475803607e-05, |
|
"loss": 0.2718, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 6.70147954743255, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 5.909313628286601e-05, |
|
"loss": 0.2679, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 6.710182767624021, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 5.881586601228983e-05, |
|
"loss": 0.2644, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 6.718885987815492, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 5.853897650981107e-05, |
|
"loss": 0.2712, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 6.727589208006963, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 5.8262470335412834e-05, |
|
"loss": 0.2645, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 6.736292428198434, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 5.798635004553421e-05, |
|
"loss": 0.2668, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 6.744995648389905, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 5.771061819304664e-05, |
|
"loss": 0.2735, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 6.753698868581375, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 5.7435277327230206e-05, |
|
"loss": 0.2721, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 6.762402088772846, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 5.716032999375006e-05, |
|
"loss": 0.2654, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 6.771105308964317, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 5.6885778734633074e-05, |
|
"loss": 0.2701, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 6.779808529155788, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 5.6611626088244194e-05, |
|
"loss": 0.2684, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 6.788511749347259, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 5.6337874589262915e-05, |
|
"loss": 0.2686, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 6.79721496953873, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 5.606452676865993e-05, |
|
"loss": 0.2666, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 6.8059181897302, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 5.5791585153673774e-05, |
|
"loss": 0.2687, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 6.814621409921671, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 5.5519052267787444e-05, |
|
"loss": 0.2667, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 6.823324630113142, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 5.524693063070492e-05, |
|
"loss": 0.2689, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 6.832027850304613, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 5.497522275832799e-05, |
|
"loss": 0.2666, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 6.8407310704960835, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 5.4703931162733116e-05, |
|
"loss": 0.265, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 6.8494342906875545, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 5.4433058352147914e-05, |
|
"loss": 0.2667, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 6.8581375108790255, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 5.416260683092814e-05, |
|
"loss": 0.2629, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 6.866840731070496, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 5.389257909953462e-05, |
|
"loss": 0.2712, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 6.875543951261967, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 5.362297765450999e-05, |
|
"loss": 0.2671, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 6.8842471714534375, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 5.335380498845559e-05, |
|
"loss": 0.261, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 6.892950391644908, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 5.308506359000851e-05, |
|
"loss": 0.2663, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 6.901653611836379, |
|
"grad_norm": 0.75, |
|
"learning_rate": 5.281675594381859e-05, |
|
"loss": 0.2673, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 6.91035683202785, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 5.25488845305254e-05, |
|
"loss": 0.2691, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 6.919060052219321, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 5.228145182673532e-05, |
|
"loss": 0.2725, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 6.927763272410792, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 5.2014460304998545e-05, |
|
"loss": 0.2653, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 6.936466492602263, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 5.1747912433786497e-05, |
|
"loss": 0.2661, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 6.945169712793733, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 5.148181067746862e-05, |
|
"loss": 0.2707, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 6.953872932985204, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 5.121615749629003e-05, |
|
"loss": 0.267, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 6.962576153176675, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 5.0950955346348314e-05, |
|
"loss": 0.2662, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 6.971279373368146, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 5.068620667957123e-05, |
|
"loss": 0.2695, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 6.979982593559617, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 5.042191394369371e-05, |
|
"loss": 0.266, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 6.988685813751088, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 5.01580795822355e-05, |
|
"loss": 0.2737, |
|
"step": 4015 |
|
}, |
|
{ |
|
"epoch": 6.997389033942559, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.989470603447835e-05, |
|
"loss": 0.2672, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 6.999129677980853, |
|
"eval_loss": 4.307767391204834, |
|
"eval_runtime": 1.1109, |
|
"eval_samples_per_second": 5.401, |
|
"eval_steps_per_second": 0.9, |
|
"step": 4021 |
|
}, |
|
{ "epoch": 7.00609225413403, "grad_norm": 0.57421875, "learning_rate": 4.963179573544357e-05, "loss": 0.2314, "step": 4025 },
{ "epoch": 7.0147954743255, "grad_norm": 0.6796875, "learning_rate": 4.9369351115869535e-05, "loss": 0.2146, "step": 4030 },
{ "epoch": 7.023498694516971, "grad_norm": 0.75390625, "learning_rate": 4.9107374602189216e-05, "loss": 0.2171, "step": 4035 },
{ "epoch": 7.032201914708442, "grad_norm": 0.73828125, "learning_rate": 4.8845868616507617e-05, "loss": 0.2179, "step": 4040 },
{ "epoch": 7.040905134899913, "grad_norm": 0.69921875, "learning_rate": 4.8584835576579466e-05, "loss": 0.2184, "step": 4045 },
{ "epoch": 7.049608355091384, "grad_norm": 0.734375, "learning_rate": 4.832427789578701e-05, "loss": 0.2178, "step": 4050 },
{ "epoch": 7.058311575282855, "grad_norm": 0.73828125, "learning_rate": 4.806419798311739e-05, "loss": 0.214, "step": 4055 },
{ "epoch": 7.067014795474326, "grad_norm": 0.703125, "learning_rate": 4.7804598243140666e-05, "loss": 0.2176, "step": 4060 },
{ "epoch": 7.075718015665796, "grad_norm": 0.73046875, "learning_rate": 4.754548107598736e-05, "loss": 0.2158, "step": 4065 },
{ "epoch": 7.084421235857267, "grad_norm": 0.71484375, "learning_rate": 4.728684887732649e-05, "loss": 0.2175, "step": 4070 },
{ "epoch": 7.093124456048738, "grad_norm": 0.70703125, "learning_rate": 4.702870403834317e-05, "loss": 0.2162, "step": 4075 },
{ "epoch": 7.101827676240209, "grad_norm": 0.75, "learning_rate": 4.6771048945716664e-05, "loss": 0.2189, "step": 4080 },
{ "epoch": 7.11053089643168, "grad_norm": 0.7421875, "learning_rate": 4.65138859815983e-05, "loss": 0.2187, "step": 4085 },
{ "epoch": 7.119234116623151, "grad_norm": 0.734375, "learning_rate": 4.62572175235895e-05, "loss": 0.2207, "step": 4090 },
{ "epoch": 7.127937336814622, "grad_norm": 0.7109375, "learning_rate": 4.60010459447196e-05, "loss": 0.2111, "step": 4095 },
{ "epoch": 7.136640557006093, "grad_norm": 0.7265625, "learning_rate": 4.574537361342407e-05, "loss": 0.2194, "step": 4100 },
{ "epoch": 7.145343777197563, "grad_norm": 0.6796875, "learning_rate": 4.5490202893522614e-05, "loss": 0.2172, "step": 4105 },
{ "epoch": 7.154046997389034, "grad_norm": 0.765625, "learning_rate": 4.5235536144197353e-05, "loss": 0.2194, "step": 4110 },
{ "epoch": 7.162750217580505, "grad_norm": 0.74609375, "learning_rate": 4.498137571997081e-05, "loss": 0.2166, "step": 4115 },
{ "epoch": 7.171453437771976, "grad_norm": 0.80859375, "learning_rate": 4.472772397068431e-05, "loss": 0.2176, "step": 4120 },
{ "epoch": 7.180156657963447, "grad_norm": 0.71484375, "learning_rate": 4.447458324147629e-05, "loss": 0.225, "step": 4125 },
{ "epoch": 7.188859878154918, "grad_norm": 0.73046875, "learning_rate": 4.422195587276058e-05, "loss": 0.217, "step": 4130 },
{ "epoch": 7.1975630983463885, "grad_norm": 0.75, "learning_rate": 4.396984420020451e-05, "loss": 0.2182, "step": 4135 },
{ "epoch": 7.206266318537859, "grad_norm": 0.7265625, "learning_rate": 4.3718250554707784e-05, "loss": 0.2171, "step": 4140 },
{ "epoch": 7.21496953872933, "grad_norm": 0.71875, "learning_rate": 4.34671772623806e-05, "loss": 0.2155, "step": 4145 },
{ "epoch": 7.2236727589208005, "grad_norm": 0.71484375, "learning_rate": 4.321662664452221e-05, "loss": 0.217, "step": 4150 },
{ "epoch": 7.2323759791122715, "grad_norm": 0.734375, "learning_rate": 4.296660101759942e-05, "loss": 0.2158, "step": 4155 },
{ "epoch": 7.241079199303742, "grad_norm": 0.74609375, "learning_rate": 4.271710269322536e-05, "loss": 0.2191, "step": 4160 },
{ "epoch": 7.249782419495213, "grad_norm": 0.71484375, "learning_rate": 4.2468133978137945e-05, "loss": 0.2119, "step": 4165 },
{ "epoch": 7.258485639686684, "grad_norm": 0.73046875, "learning_rate": 4.221969717417852e-05, "loss": 0.2125, "step": 4170 },
{ "epoch": 7.267188859878155, "grad_norm": 0.734375, "learning_rate": 4.1971794578270654e-05, "loss": 0.2176, "step": 4175 },
{ "epoch": 7.275892080069625, "grad_norm": 0.73828125, "learning_rate": 4.1724428482398945e-05, "loss": 0.2171, "step": 4180 },
{ "epoch": 7.284595300261096, "grad_norm": 0.71875, "learning_rate": 4.1477601173587836e-05, "loss": 0.2168, "step": 4185 },
{ "epoch": 7.293298520452567, "grad_norm": 0.73046875, "learning_rate": 4.1231314933880175e-05, "loss": 0.2171, "step": 4190 },
{ "epoch": 7.302001740644038, "grad_norm": 0.734375, "learning_rate": 4.098557204031658e-05, "loss": 0.217, "step": 4195 },
{ "epoch": 7.310704960835509, "grad_norm": 0.72265625, "learning_rate": 4.0740374764914136e-05, "loss": 0.2184, "step": 4200 },
{ "epoch": 7.31940818102698, "grad_norm": 0.7734375, "learning_rate": 4.049572537464531e-05, "loss": 0.2126, "step": 4205 },
{ "epoch": 7.328111401218451, "grad_norm": 0.7734375, "learning_rate": 4.025162613141713e-05, "loss": 0.2173, "step": 4210 },
{ "epoch": 7.336814621409921, "grad_norm": 0.78125, "learning_rate": 4.000807929205027e-05, "loss": 0.2113, "step": 4215 },
{ "epoch": 7.345517841601392, "grad_norm": 0.73046875, "learning_rate": 3.9765087108258204e-05, "loss": 0.2215, "step": 4220 },
{ "epoch": 7.354221061792863, "grad_norm": 0.75, "learning_rate": 3.95226518266262e-05, "loss": 0.2204, "step": 4225 },
{ "epoch": 7.362924281984334, "grad_norm": 0.73828125, "learning_rate": 3.9280775688590735e-05, "loss": 0.2169, "step": 4230 },
{ "epoch": 7.371627502175805, "grad_norm": 0.71875, "learning_rate": 3.903946093041877e-05, "loss": 0.2188, "step": 4235 },
{ "epoch": 7.380330722367276, "grad_norm": 0.7421875, "learning_rate": 3.8798709783187036e-05, "loss": 0.2162, "step": 4240 },
{ "epoch": 7.389033942558747, "grad_norm": 0.69921875, "learning_rate": 3.85585244727613e-05, "loss": 0.2163, "step": 4245 },
{ "epoch": 7.397737162750218, "grad_norm": 0.7109375, "learning_rate": 3.8318907219775935e-05, "loss": 0.2179, "step": 4250 },
{ "epoch": 7.406440382941688, "grad_norm": 0.76171875, "learning_rate": 3.8079860239613395e-05, "loss": 0.2197, "step": 4255 },
{ "epoch": 7.415143603133159, "grad_norm": 0.703125, "learning_rate": 3.784138574238357e-05, "loss": 0.2177, "step": 4260 },
{ "epoch": 7.42384682332463, "grad_norm": 0.71875, "learning_rate": 3.760348593290348e-05, "loss": 0.2188, "step": 4265 },
{ "epoch": 7.432550043516101, "grad_norm": 0.69921875, "learning_rate": 3.736616301067694e-05, "loss": 0.2187, "step": 4270 },
{ "epoch": 7.441253263707572, "grad_norm": 0.73046875, "learning_rate": 3.7129419169874114e-05, "loss": 0.221, "step": 4275 },
{ "epoch": 7.449956483899043, "grad_norm": 0.80078125, "learning_rate": 3.689325659931123e-05, "loss": 0.2236, "step": 4280 },
{ "epoch": 7.458659704090514, "grad_norm": 0.73828125, "learning_rate": 3.6657677482430377e-05, "loss": 0.2188, "step": 4285 },
{ "epoch": 7.467362924281984, "grad_norm": 0.76171875, "learning_rate": 3.642268399727941e-05, "loss": 0.2165, "step": 4290 },
{ "epoch": 7.476066144473455, "grad_norm": 0.74609375, "learning_rate": 3.618827831649158e-05, "loss": 0.2183, "step": 4295 },
{ "epoch": 7.484769364664926, "grad_norm": 0.71875, "learning_rate": 3.595446260726576e-05, "loss": 0.2117, "step": 4300 },
{ "epoch": 7.493472584856397, "grad_norm": 0.73046875, "learning_rate": 3.5721239031346066e-05, "loss": 0.2167, "step": 4305 },
{ "epoch": 7.502175805047868, "grad_norm": 0.69140625, "learning_rate": 3.5488609745002214e-05, "loss": 0.219, "step": 4310 },
{ "epoch": 7.510879025239339, "grad_norm": 0.7109375, "learning_rate": 3.525657689900923e-05, "loss": 0.2145, "step": 4315 },
{ "epoch": 7.51958224543081, "grad_norm": 0.76171875, "learning_rate": 3.502514263862793e-05, "loss": 0.2159, "step": 4320 },
{ "epoch": 7.528285465622281, "grad_norm": 0.76953125, "learning_rate": 3.479430910358474e-05, "loss": 0.2177, "step": 4325 },
{ "epoch": 7.536988685813752, "grad_norm": 0.734375, "learning_rate": 3.456407842805223e-05, "loss": 0.2154, "step": 4330 },
{ "epoch": 7.545691906005222, "grad_norm": 0.73828125, "learning_rate": 3.433445274062907e-05, "loss": 0.2157, "step": 4335 },
{ "epoch": 7.554395126196693, "grad_norm": 0.71875, "learning_rate": 3.410543416432069e-05, "loss": 0.2122, "step": 4340 },
{ "epoch": 7.563098346388164, "grad_norm": 0.71875, "learning_rate": 3.387702481651931e-05, "loss": 0.2215, "step": 4345 },
{ "epoch": 7.5718015665796345, "grad_norm": 0.75, "learning_rate": 3.364922680898458e-05, "loss": 0.2192, "step": 4350 },
{ "epoch": 7.5805047867711055, "grad_norm": 0.7265625, "learning_rate": 3.342204224782406e-05, "loss": 0.2168, "step": 4355 },
{ "epoch": 7.5892080069625765, "grad_norm": 0.72265625, "learning_rate": 3.3195473233473584e-05, "loss": 0.2163, "step": 4360 },
{ "epoch": 7.5979112271540465, "grad_norm": 0.69921875, "learning_rate": 3.2969521860678066e-05, "loss": 0.2162, "step": 4365 },
{ "epoch": 7.6066144473455175, "grad_norm": 0.73046875, "learning_rate": 3.2744190218471884e-05, "loss": 0.2178, "step": 4370 },
{ "epoch": 7.6153176675369885, "grad_norm": 0.765625, "learning_rate": 3.2519480390159806e-05, "loss": 0.218, "step": 4375 },
{ "epoch": 7.624020887728459, "grad_norm": 0.8203125, "learning_rate": 3.229539445329752e-05, "loss": 0.216, "step": 4380 },
{ "epoch": 7.63272410791993, "grad_norm": 0.7265625, "learning_rate": 3.207193447967264e-05, "loss": 0.2207, "step": 4385 },
{ "epoch": 7.641427328111401, "grad_norm": 0.75390625, "learning_rate": 3.184910253528528e-05, "loss": 0.217, "step": 4390 },
{ "epoch": 7.650130548302872, "grad_norm": 0.74609375, "learning_rate": 3.162690068032926e-05, "loss": 0.2183, "step": 4395 },
{ "epoch": 7.658833768494343, "grad_norm": 0.7578125, "learning_rate": 3.140533096917282e-05, "loss": 0.2197, "step": 4400 },
{ "epoch": 7.667536988685814, "grad_norm": 0.75, "learning_rate": 3.118439545033969e-05, "loss": 0.2204, "step": 4405 },
{ "epoch": 7.676240208877284, "grad_norm": 0.73828125, "learning_rate": 3.096409616649023e-05, "loss": 0.2194, "step": 4410 },
{ "epoch": 7.684943429068755, "grad_norm": 0.7421875, "learning_rate": 3.074443515440252e-05, "loss": 0.2211, "step": 4415 },
{ "epoch": 7.693646649260226, "grad_norm": 0.8046875, "learning_rate": 3.0525414444953396e-05, "loss": 0.219, "step": 4420 },
{ "epoch": 7.702349869451697, "grad_norm": 0.75390625, "learning_rate": 3.0307036063099782e-05, "loss": 0.2131, "step": 4425 },
{ "epoch": 7.711053089643168, "grad_norm": 0.76171875, "learning_rate": 3.0089302027860044e-05, "loss": 0.2141, "step": 4430 },
{ "epoch": 7.719756309834639, "grad_norm": 0.7890625, "learning_rate": 2.9872214352295213e-05, "loss": 0.2192, "step": 4435 },
{ "epoch": 7.728459530026109, "grad_norm": 0.73828125, "learning_rate": 2.965577504349035e-05, "loss": 0.2214, "step": 4440 },
{ "epoch": 7.73716275021758, "grad_norm": 0.6953125, "learning_rate": 2.9439986102536043e-05, "loss": 0.2188, "step": 4445 },
{ "epoch": 7.745865970409051, "grad_norm": 0.71875, "learning_rate": 2.9224849524509936e-05, "loss": 0.2155, "step": 4450 },
{ "epoch": 7.754569190600522, "grad_norm": 0.74609375, "learning_rate": 2.901036729845831e-05, "loss": 0.2156, "step": 4455 },
{ "epoch": 7.763272410791993, "grad_norm": 0.75, "learning_rate": 2.879654140737743e-05, "loss": 0.2161, "step": 4460 },
{ "epoch": 7.771975630983464, "grad_norm": 0.80859375, "learning_rate": 2.8583373828195603e-05, "loss": 0.2185, "step": 4465 },
{ "epoch": 7.780678851174935, "grad_norm": 0.69921875, "learning_rate": 2.837086653175468e-05, "loss": 0.2226, "step": 4470 },
{ "epoch": 7.789382071366406, "grad_norm": 0.72265625, "learning_rate": 2.8159021482791802e-05, "loss": 0.2171, "step": 4475 },
{ "epoch": 7.798085291557877, "grad_norm": 0.73828125, "learning_rate": 2.794784063992131e-05, "loss": 0.2204, "step": 4480 },
{ "epoch": 7.806788511749347, "grad_norm": 0.69921875, "learning_rate": 2.7737325955616643e-05, "loss": 0.215, "step": 4485 },
{ "epoch": 7.815491731940818, "grad_norm": 0.765625, "learning_rate": 2.7527479376192366e-05, "loss": 0.2161, "step": 4490 },
{ "epoch": 7.824194952132289, "grad_norm": 0.75, "learning_rate": 2.7318302841785827e-05, "loss": 0.2187, "step": 4495 },
{ "epoch": 7.83289817232376, "grad_norm": 0.76171875, "learning_rate": 2.7109798286339705e-05, "loss": 0.2214, "step": 4500 },
{ "epoch": 7.841601392515231, "grad_norm": 0.80078125, "learning_rate": 2.6901967637583835e-05, "loss": 0.2142, "step": 4505 },
{ "epoch": 7.850304612706702, "grad_norm": 0.69921875, "learning_rate": 2.669481281701739e-05, "loss": 0.2194, "step": 4510 },
{ "epoch": 7.859007832898172, "grad_norm": 0.71875, "learning_rate": 2.6488335739891178e-05, "loss": 0.2228, "step": 4515 },
{ "epoch": 7.867711053089643, "grad_norm": 0.75390625, "learning_rate": 2.6282538315189974e-05, "loss": 0.2196, "step": 4520 },
{ "epoch": 7.876414273281114, "grad_norm": 0.7734375, "learning_rate": 2.607742244561484e-05, "loss": 0.2225, "step": 4525 },
{ "epoch": 7.885117493472585, "grad_norm": 0.75390625, "learning_rate": 2.5872990027565434e-05, "loss": 0.2163, "step": 4530 },
{ "epoch": 7.893820713664056, "grad_norm": 0.6796875, "learning_rate": 2.5669242951122586e-05, "loss": 0.2155, "step": 4535 },
{ "epoch": 7.902523933855527, "grad_norm": 0.7578125, "learning_rate": 2.5466183100030837e-05, "loss": 0.2167, "step": 4540 },
{ "epoch": 7.911227154046998, "grad_norm": 0.75390625, "learning_rate": 2.5263812351680995e-05, "loss": 0.2184, "step": 4545 },
{ "epoch": 7.919930374238469, "grad_norm": 0.70703125, "learning_rate": 2.50621325770927e-05, "loss": 0.2132, "step": 4550 },
{ "epoch": 7.9286335944299395, "grad_norm": 0.75, "learning_rate": 2.4861145640897188e-05, "loss": 0.2144, "step": 4555 },
{ "epoch": 7.93733681462141, "grad_norm": 0.7109375, "learning_rate": 2.466085340132014e-05, "loss": 0.2171, "step": 4560 },
{ "epoch": 7.946040034812881, "grad_norm": 0.73046875, "learning_rate": 2.446125771016433e-05, "loss": 0.2167, "step": 4565 },
{ "epoch": 7.9547432550043515, "grad_norm": 0.71484375, "learning_rate": 2.426236041279266e-05, "loss": 0.2196, "step": 4570 },
{ "epoch": 7.9634464751958225, "grad_norm": 0.7265625, "learning_rate": 2.4064163348110956e-05, "loss": 0.2196, "step": 4575 },
{ "epoch": 7.9721496953872935, "grad_norm": 0.734375, "learning_rate": 2.3866668348551112e-05, "loss": 0.212, "step": 4580 },
{ "epoch": 7.980852915578764, "grad_norm": 0.7109375, "learning_rate": 2.366987724005404e-05, "loss": 0.2119, "step": 4585 },
{ "epoch": 7.9895561357702345, "grad_norm": 0.7109375, "learning_rate": 2.3473791842052774e-05, "loss": 0.2194, "step": 4590 },
{ "epoch": 7.9982593559617055, "grad_norm": 0.7578125, "learning_rate": 2.327841396745578e-05, "loss": 0.2167, "step": 4595 },
{ "epoch": 8.0, "eval_loss": 4.845585823059082, "eval_runtime": 0.7795, "eval_samples_per_second": 7.697, "eval_steps_per_second": 1.283, "step": 4596 },
{ "epoch": 8.006962576153176, "grad_norm": 0.62109375, "learning_rate": 2.3083745422630122e-05, "loss": 0.2056, "step": 4600 },
{ "epoch": 8.015665796344647, "grad_norm": 0.6328125, "learning_rate": 2.2889788007384683e-05, "loss": 0.1978, "step": 4605 },
{ "epoch": 8.024369016536118, "grad_norm": 0.63671875, "learning_rate": 2.2696543514953595e-05, "loss": 0.2014, "step": 4610 },
{ "epoch": 8.03307223672759, "grad_norm": 0.62890625, "learning_rate": 2.2504013731979732e-05, "loss": 0.1991, "step": 4615 },
{ "epoch": 8.04177545691906, "grad_norm": 0.6640625, "learning_rate": 2.2312200438498043e-05, "loss": 0.2006, "step": 4620 },
{ "epoch": 8.050478677110531, "grad_norm": 0.66796875, "learning_rate": 2.212110540791924e-05, "loss": 0.2018, "step": 4625 },
{ "epoch": 8.059181897302002, "grad_norm": 0.66796875, "learning_rate": 2.1930730407013245e-05, "loss": 0.1963, "step": 4630 },
{ "epoch": 8.067885117493473, "grad_norm": 0.68359375, "learning_rate": 2.1741077195893043e-05, "loss": 0.1995, "step": 4635 },
{ "epoch": 8.076588337684944, "grad_norm": 0.66796875, "learning_rate": 2.1552147527998213e-05, "loss": 0.1984, "step": 4640 },
{ "epoch": 8.085291557876415, "grad_norm": 0.69921875, "learning_rate": 2.136394315007889e-05, "loss": 0.2005, "step": 4645 },
{ "epoch": 8.093994778067884, "grad_norm": 0.69140625, "learning_rate": 2.1176465802179467e-05, "loss": 0.203, "step": 4650 },
{ "epoch": 8.102697998259355, "grad_norm": 0.69140625, "learning_rate": 2.0989717217622652e-05, "loss": 0.1967, "step": 4655 },
{ "epoch": 8.111401218450826, "grad_norm": 0.8125, "learning_rate": 2.0803699122993293e-05, "loss": 0.2029, "step": 4660 },
{ "epoch": 8.120104438642297, "grad_norm": 0.69140625, "learning_rate": 2.061841323812257e-05, "loss": 0.2005, "step": 4665 },
{ "epoch": 8.128807658833768, "grad_norm": 0.6484375, "learning_rate": 2.0433861276071942e-05, "loss": 0.1966, "step": 4670 },
{ "epoch": 8.137510879025239, "grad_norm": 0.71484375, "learning_rate": 2.0250044943117385e-05, "loss": 0.2023, "step": 4675 },
{ "epoch": 8.14621409921671, "grad_norm": 0.66796875, "learning_rate": 2.0066965938733707e-05, "loss": 0.198, "step": 4680 },
{ "epoch": 8.154917319408181, "grad_norm": 0.8125, "learning_rate": 1.9884625955578594e-05, "loss": 0.196, "step": 4685 },
{ "epoch": 8.163620539599652, "grad_norm": 0.66796875, "learning_rate": 1.9703026679477256e-05, "loss": 0.1954, "step": 4690 },
{ "epoch": 8.172323759791123, "grad_norm": 0.640625, "learning_rate": 1.9522169789406575e-05, "loss": 0.196, "step": 4695 },
{ "epoch": 8.181026979982594, "grad_norm": 0.71875, "learning_rate": 1.934205695747978e-05, "loss": 0.2014, "step": 4700 },
{ "epoch": 8.189730200174065, "grad_norm": 0.71484375, "learning_rate": 1.916268984893086e-05, "loss": 0.1984, "step": 4705 },
{ "epoch": 8.198433420365536, "grad_norm": 0.6953125, "learning_rate": 1.8984070122099218e-05, "loss": 0.1994, "step": 4710 },
{ "epoch": 8.207136640557007, "grad_norm": 0.71484375, "learning_rate": 1.880619942841435e-05, "loss": 0.2002, "step": 4715 },
{ "epoch": 8.215839860748478, "grad_norm": 0.7578125, "learning_rate": 1.862907941238059e-05, "loss": 0.197, "step": 4720 },
{ "epoch": 8.224543080939949, "grad_norm": 0.8125, "learning_rate": 1.8452711711561842e-05, "loss": 0.2023, "step": 4725 },
{ "epoch": 8.233246301131418, "grad_norm": 0.6796875, "learning_rate": 1.8277097956566437e-05, "loss": 0.201, "step": 4730 },
{ "epoch": 8.241949521322889, "grad_norm": 0.7265625, "learning_rate": 1.810223977103217e-05, "loss": 0.1982, "step": 4735 },
{ "epoch": 8.25065274151436, "grad_norm": 0.7109375, "learning_rate": 1.7928138771611225e-05, "loss": 0.1983, "step": 4740 },
{ "epoch": 8.25935596170583, "grad_norm": 0.671875, "learning_rate": 1.7754796567955155e-05, "loss": 0.2005, "step": 4745 },
{ "epoch": 8.268059181897302, "grad_norm": 0.734375, "learning_rate": 1.7582214762700054e-05, "loss": 0.1974, "step": 4750 },
{ "epoch": 8.276762402088773, "grad_norm": 0.71484375, "learning_rate": 1.7410394951451814e-05, "loss": 0.1993, "step": 4755 },
{ "epoch": 8.285465622280244, "grad_norm": 0.73046875, "learning_rate": 1.7239338722771327e-05, "loss": 0.2046, "step": 4760 },
{ "epoch": 8.294168842471715, "grad_norm": 0.69140625, "learning_rate": 1.706904765815963e-05, "loss": 0.2007, "step": 4765 },
{ "epoch": 8.302872062663186, "grad_norm": 0.6796875, "learning_rate": 1.6899523332043586e-05, "loss": 0.2041, "step": 4770 },
{ "epoch": 8.311575282854657, "grad_norm": 0.703125, "learning_rate": 1.673076731176114e-05, "loss": 0.2024, "step": 4775 },
{ "epoch": 8.320278503046127, "grad_norm": 0.671875, "learning_rate": 1.6562781157546835e-05, "loss": 0.2025, "step": 4780 },
{ "epoch": 8.328981723237598, "grad_norm": 0.68359375, "learning_rate": 1.639556642251737e-05, "loss": 0.1961, "step": 4785 },
{ "epoch": 8.33768494342907, "grad_norm": 0.75, "learning_rate": 1.622912465265738e-05, "loss": 0.1966, "step": 4790 },
{ "epoch": 8.34638816362054, "grad_norm": 0.703125, "learning_rate": 1.6063457386805004e-05, "loss": 0.1987, "step": 4795 },
{ "epoch": 8.35509138381201, "grad_norm": 0.67578125, "learning_rate": 1.5898566156637708e-05, "loss": 0.2005, "step": 4800 },
{ "epoch": 8.36379460400348, "grad_norm": 0.734375, "learning_rate": 1.573445248665806e-05, "loss": 0.1993, "step": 4805 },
{ "epoch": 8.372497824194951, "grad_norm": 0.68359375, "learning_rate": 1.5571117894179754e-05, "loss": 0.2004, "step": 4810 },
{ "epoch": 8.381201044386422, "grad_norm": 0.74609375, "learning_rate": 1.540856388931359e-05, "loss": 0.1989, "step": 4815 },
{ "epoch": 8.389904264577893, "grad_norm": 0.703125, "learning_rate": 1.5246791974953223e-05, "loss": 0.1935, "step": 4820 },
{ "epoch": 8.398607484769364, "grad_norm": 0.625, "learning_rate": 1.5085803646761687e-05, "loss": 0.1989, "step": 4825 },
{ "epoch": 8.407310704960835, "grad_norm": 0.7421875, "learning_rate": 1.4925600393157324e-05, "loss": 0.1976, "step": 4830 },
{ "epoch": 8.416013925152306, "grad_norm": 0.7578125, "learning_rate": 1.4766183695300006e-05, "loss": 0.2008, "step": 4835 },
{ "epoch": 8.424717145343777, "grad_norm": 0.73828125, "learning_rate": 1.4607555027077525e-05, "loss": 0.2007, "step": 4840 },
{ "epoch": 8.433420365535248, "grad_norm": 0.73046875, "learning_rate": 1.4449715855091972e-05, "loss": 0.1992, "step": 4845 },
{ "epoch": 8.44212358572672, "grad_norm": 0.69140625, "learning_rate": 1.429266763864614e-05, "loss": 0.1959, "step": 4850 },
{ "epoch": 8.45082680591819, "grad_norm": 0.6875, "learning_rate": 1.4136411829730023e-05, "loss": 0.1981, "step": 4855 },
{ "epoch": 8.459530026109661, "grad_norm": 0.7109375, "learning_rate": 1.3980949873007364e-05, "loss": 0.2006, "step": 4860 },
{ "epoch": 8.468233246301132, "grad_norm": 0.69140625, "learning_rate": 1.3826283205802427e-05, "loss": 0.1991, "step": 4865 },
{ "epoch": 8.476936466492603, "grad_norm": 0.69140625, "learning_rate": 1.3672413258086592e-05, "loss": 0.1991, "step": 4870 },
{ "epoch": 8.485639686684074, "grad_norm": 0.70703125, "learning_rate": 1.3519341452465151e-05, "loss": 0.2025, "step": 4875 },
{ "epoch": 8.494342906875543, "grad_norm": 0.70703125, "learning_rate": 1.336706920416415e-05, "loss": 0.2, "step": 4880 },
{ "epoch": 8.503046127067014, "grad_norm": 0.6953125, "learning_rate": 1.3215597921017387e-05, "loss": 0.2004, "step": 4885 },
{ "epoch": 8.511749347258485, "grad_norm": 0.6484375, "learning_rate": 1.3064929003453286e-05, "loss": 0.1985, "step": 4890 },
{ "epoch": 8.520452567449956, "grad_norm": 0.6875, "learning_rate": 1.2915063844481989e-05, "loss": 0.1978, "step": 4895 },
{ "epoch": 8.529155787641427, "grad_norm": 0.71484375, "learning_rate": 1.2766003829682505e-05, "loss": 0.1972, "step": 4900 },
{ "epoch": 8.537859007832898, "grad_norm": 0.734375, "learning_rate": 1.2617750337189904e-05, "loss": 0.1993, "step": 4905 },
{ "epoch": 8.546562228024369, "grad_norm": 0.6796875, "learning_rate": 1.2470304737682514e-05, "loss": 0.1956, "step": 4910 },
{ "epoch": 8.55526544821584, "grad_norm": 0.7109375, "learning_rate": 1.232366839436926e-05, "loss": 0.1976, "step": 4915 },
{ "epoch": 8.56396866840731, "grad_norm": 0.71875, "learning_rate": 1.2177842662977135e-05, "loss": 0.192, "step": 4920 },
{ "epoch": 8.572671888598782, "grad_norm": 0.78515625, "learning_rate": 1.2032828891738646e-05, "loss": 0.2021, "step": 4925 },
{ "epoch": 8.581375108790253, "grad_norm": 0.734375, "learning_rate": 1.1888628421379221e-05, "loss": 0.1987, "step": 4930 },
{ "epoch": 8.590078328981724, "grad_norm": 0.69140625, "learning_rate": 1.1745242585104955e-05, "loss": 0.2024, "step": 4935 },
{ "epoch": 8.598781549173195, "grad_norm": 0.69921875, "learning_rate": 1.160267270859029e-05, "loss": 0.2027, "step": 4940 },
{ "epoch": 8.607484769364666, "grad_norm": 0.7421875, "learning_rate": 1.1460920109965612e-05, "loss": 0.2012, "step": 4945 },
{ "epoch": 8.616187989556135, "grad_norm": 0.69140625, "learning_rate": 1.1319986099805279e-05, "loss": 0.2001, "step": 4950 },
{ "epoch": 8.624891209747606, "grad_norm": 0.7109375, "learning_rate": 1.1179871981115253e-05, "loss": 0.2014, "step": 4955 },
{ "epoch": 8.633594429939077, "grad_norm": 0.74609375, "learning_rate": 1.1040579049321309e-05, "loss": 0.2014, "step": 4960 },
{ "epoch": 8.642297650130548, "grad_norm": 0.7109375, "learning_rate": 1.0902108592256831e-05, "loss": 0.2002, "step": 4965 },
{ "epoch": 8.651000870322019, "grad_norm": 0.7421875, "learning_rate": 1.0764461890151112e-05, "loss": 0.1967, "step": 4970 },
{ "epoch": 8.65970409051349, "grad_norm": 0.73046875, "learning_rate": 1.062764021561733e-05, "loss": 0.2005, "step": 4975 },
{ "epoch": 8.66840731070496, "grad_norm": 0.71875, "learning_rate": 1.0491644833640868e-05, "loss": 0.2013, "step": 4980 },
{ "epoch": 8.677110530896432, "grad_norm": 0.69921875, "learning_rate": 1.0356477001567677e-05, "loss": 0.197, "step": 4985 },
{ "epoch": 8.685813751087903, "grad_norm": 0.69140625, "learning_rate": 1.0222137969092581e-05, "loss": 0.2012, "step": 4990 },
{ "epoch": 8.694516971279374, "grad_norm": 0.6875, "learning_rate": 1.0088628978247694e-05, "loss": 0.2006, "step": 4995 },
{ "epoch": 8.703220191470844, "grad_norm": 0.68359375, "learning_rate": 9.955951263390972e-06, "loss": 0.1987, "step": 5000 },
{ "epoch": 8.711923411662315, "grad_norm": 0.62890625, "learning_rate": 9.824106051194859e-06, "loss": 0.1977, "step": 5005 },
{ "epoch": 8.720626631853786, "grad_norm": 0.70703125, "learning_rate": 9.69309456063484e-06, "loss": 0.1986, "step": 5010 },
{ "epoch": 8.729329852045257, "grad_norm": 0.71484375, "learning_rate": 9.562918002978283e-06, "loss": 0.2016, "step": 5015 },
{ "epoch": 8.738033072236728, "grad_norm": 0.66015625, "learning_rate": 9.43357758177309e-06, "loss": 0.1969, "step": 5020 },
{ "epoch": 8.7467362924282, "grad_norm": 0.72265625, "learning_rate": 9.305074492836763e-06, "loss": 0.197, "step": 5025 },
{ "epoch": 8.755439512619668, "grad_norm": 0.73046875, "learning_rate": 9.177409924245161e-06, "loss": 0.1953, "step": 5030 },
{ "epoch": 8.76414273281114, "grad_norm": 0.71484375, "learning_rate": 9.050585056321626e-06, "loss": 0.1979, "step": 5035 },
{ "epoch": 8.77284595300261, "grad_norm": 0.72265625, "learning_rate": 8.924601061626048e-06, "loss": 0.1969, "step": 5040 },
{ "epoch": 8.781549173194081, "grad_norm": 0.66796875, "learning_rate": 8.799459104944064e-06, "loss": 0.1983, "step": 5045 },
{ "epoch": 8.790252393385552, "grad_norm": 0.7421875, "learning_rate": 8.675160343276167e-06, "loss": 0.1982, "step": 5050 },
{ "epoch": 8.798955613577023, "grad_norm": 0.7421875, "learning_rate": 8.551705925827103e-06, "loss": 0.1989, "step": 5055 },
{ "epoch": 8.807658833768494, "grad_norm": 0.6875, "learning_rate": 8.429096993995277e-06, "loss": 0.1958, "step": 5060 },
{ "epoch": 8.816362053959965, "grad_norm": 0.68359375, "learning_rate": 8.307334681362133e-06, "loss": 0.1996, "step": 5065 },
{ "epoch": 8.825065274151436, "grad_norm": 0.71484375, "learning_rate": 8.18642011368167e-06, "loss": 0.2031, "step": 5070 },
{ "epoch": 8.833768494342907, "grad_norm": 0.734375, "learning_rate": 8.066354408870048e-06, "loss": 0.201, "step": 5075 },
{ "epoch": 8.842471714534378, "grad_norm": 0.67578125, "learning_rate": 7.947138676995302e-06, "loss": 0.2003, "step": 5080 },
{ "epoch": 8.851174934725849, "grad_norm": 0.6953125, "learning_rate": 7.828774020267072e-06, "loss": 0.1989, "step": 5085 },
{ "epoch": 8.85987815491732, "grad_norm": 0.7265625, "learning_rate": 7.711261533026238e-06, "loss": 0.2007, "step": 5090 },
{ "epoch": 8.868581375108791, "grad_norm": 0.72265625, "learning_rate": 7.594602301735087e-06, "loss": 0.204, "step": 5095 },
{ "epoch": 8.877284595300262, "grad_norm": 0.6796875, "learning_rate": 7.478797404967075e-06, "loss": 0.1964, "step": 5100 },
{ "epoch": 8.885987815491731, "grad_norm": 0.6953125, "learning_rate": 7.363847913396882e-06, "loss": 0.1953, "step": 5105 },
{ "epoch": 8.894691035683202, "grad_norm": 0.74609375, "learning_rate": 7.249754889790539e-06, "loss": 0.2054, "step": 5110 },
{ "epoch": 8.903394255874673, "grad_norm": 0.734375, "learning_rate": 7.136519388995633e-06, "loss": 0.1996, "step": 5115 },
{ "epoch": 8.912097476066144, "grad_norm": 0.6796875, "learning_rate": 7.024142457931504e-06, "loss": 0.198, "step": 5120 },
{ "epoch": 8.920800696257615, "grad_norm": 0.67578125, "learning_rate": 6.9126251355795864e-06, "loss": 0.1938, "step": 5125 },
{ "epoch": 8.929503916449086, "grad_norm": 0.6875, "learning_rate": 6.8019684529737505e-06, "loss": 0.2041, "step": 5130 },
{ "epoch": 8.938207136640557, "grad_norm": 0.75, "learning_rate": 6.6921734331908735e-06, "loss": 0.199, "step": 5135 },
{ "epoch": 8.946910356832028, "grad_norm": 0.671875, "learning_rate": 6.583241091341353e-06, "loss": 0.1971, "step": 5140 },
{ "epoch": 8.955613577023499, "grad_norm": 0.67578125, "learning_rate": 6.475172434559573e-06, "loss": 0.1962, "step": 5145 },
{ "epoch": 8.96431679721497, "grad_norm": 0.69140625, "learning_rate": 6.367968461994833e-06, "loss": 0.1993, "step": 5150 },
{ "epoch": 8.97302001740644, "grad_norm": 0.6953125, "learning_rate": 6.261630164801957e-06, "loss": 0.2026, "step": 5155 },
{ "epoch": 8.981723237597912, "grad_norm": 0.71875, "learning_rate": 6.156158526132139e-06, "loss": 0.1999, "step": 5160 },
{ "epoch": 8.990426457789383, "grad_norm": 0.76953125, "learning_rate": 6.05155452112387e-06, "loss": 0.1983, "step": 5165 },
{ "epoch": 8.999129677980854, "grad_norm": 0.73828125, "learning_rate": 5.947819116893971e-06, "loss": 0.2037, "step": 5170 },
{ "epoch": 8.999129677980854, "eval_loss": 5.056090831756592, "eval_runtime": 1.1157, "eval_samples_per_second": 5.378, "eval_steps_per_second": 0.896, "step": 5170 },
{ "epoch": 9.007832898172325, "grad_norm": 0.67578125, "learning_rate": 5.8449532725286196e-06, "loss": 0.1957, "step": 5175 },
{ "epoch": 9.016536118363794, "grad_norm": 0.71484375, "learning_rate": 5.742957939074412e-06, "loss": 0.1967, "step": 5180 },
{ "epoch": 9.025239338555265, "grad_norm": 0.671875, "learning_rate": 5.641834059529661e-06, "loss": 0.1998, "step": 5185 },
{ "epoch": 9.033942558746736, "grad_norm": 0.66796875, "learning_rate": 5.541582568835679e-06, "loss": 0.2032, "step": 5190 },
{ "epoch": 9.042645778938207, "grad_norm": 0.671875, "learning_rate": 5.442204393868056e-06, "loss": 0.1979, "step": 5195 },
{ "epoch": 9.051348999129678, "grad_norm": 0.6484375, "learning_rate": 5.343700453428168e-06, "loss": 0.1942, "step": 5200 },
{ "epoch": 9.060052219321149, "grad_norm": 0.703125, "learning_rate": 5.246071658234642e-06, "loss": 0.2022, "step": 5205 },
{ "epoch": 9.06875543951262, "grad_norm": 0.69140625, "learning_rate": 5.1493189109149575e-06, "loss": 0.2016, "step": 5210 },
{ "epoch": 9.07745865970409, "grad_norm": 0.6875, "learning_rate": 5.0534431059970685e-06, "loss": 0.1946, "step": 5215 },
{ "epoch": 9.086161879895561, "grad_norm": 0.6796875, "learning_rate": 4.958445129901146e-06, "loss": 0.2002, "step": 5220 },
{ "epoch": 9.094865100087032, "grad_norm": 0.6484375, "learning_rate": 4.864325860931429e-06, "loss": 0.1978, "step": 5225 },
{ "epoch": 9.103568320278503, "grad_norm": 0.6953125, "learning_rate": 4.771086169268057e-06, "loss": 0.1992, "step": 5230 },
{ "epoch": 9.112271540469974, "grad_norm": 0.68359375, "learning_rate": 4.678726916958998e-06, "loss": 0.1997, "step": 5235 },
{ "epoch": 9.120974760661445, "grad_norm": 0.70703125, "learning_rate": 4.587248957912138e-06, "loss": 0.1998, "step": 5240 },
{ "epoch": 9.129677980852916, "grad_norm": 0.66015625, "learning_rate": 4.496653137887386e-06, "loss": 0.1923, "step": 5245 },
{ "epoch": 9.138381201044387, "grad_norm": 0.76171875, "learning_rate": 4.40694029448877e-06, "loss": 0.1998, "step": 5250 },
{ "epoch": 9.147084421235856, "grad_norm": 0.6328125, "learning_rate": 4.318111257156831e-06, "loss": 0.1911, "step": 5255 },
{ "epoch": 9.155787641427327, "grad_norm": 0.73828125, "learning_rate": 4.230166847160799e-06, "loss": 0.1949, "step": 5260 },
{ "epoch": 9.164490861618798, "grad_norm": 0.66796875, "learning_rate": 4.143107877591135e-06, "loss": 0.1974, "step": 5265 },
{ "epoch": 9.17319408181027, "grad_norm": 0.69140625, "learning_rate": 4.056935153351937e-06, "loss": 0.1964, "step": 5270 },
{ "epoch": 9.18189730200174, "grad_norm": 0.73046875, "learning_rate": 3.971649471153516e-06, "loss": 0.1956, "step": 5275 },
{ "epoch": 9.190600522193211, "grad_norm": 0.6484375, "learning_rate": 3.887251619505028e-06, "loss": 0.1969, "step": 5280 },
{ "epoch": 9.199303742384682, "grad_norm": 0.65234375, "learning_rate": 3.803742378707198e-06, "loss": 0.1992, "step": 5285 },
{ "epoch": 9.208006962576153, "grad_norm": 0.64453125, "learning_rate": 3.7211225208450774e-06, "loss": 0.1945, "step": 5290 },
{ "epoch": 9.216710182767624, "grad_norm": 0.71484375, "learning_rate": 3.6393928097809617e-06, "loss": 0.199, "step": 5295 },
{ "epoch": 9.225413402959095, "grad_norm": 0.65625, "learning_rate": 3.5585540011472516e-06, "loss": 0.1956, "step": 5300 },
{ "epoch": 9.234116623150566, "grad_norm": 0.6953125, "learning_rate": 3.4786068423395044e-06, "loss": 0.1991, "step": 5305 },
{ "epoch": 9.242819843342037, "grad_norm": 0.6875, "learning_rate": 3.3995520725095486e-06, "loss": 0.1943, "step": 5310 },
{ "epoch": 9.251523063533508, "grad_norm": 0.7109375, "learning_rate": 3.3213904225586346e-06, "loss": 0.1973, "step": 5315 },
{ "epoch": 9.260226283724979, "grad_norm": 0.65234375, "learning_rate": 3.2441226151306404e-06, "loss": 0.1907, "step": 5320 },
{ "epoch": 9.26892950391645, "grad_norm": 0.66015625, "learning_rate": 3.16774936460541e-06, "loss": 0.1968, "step": 5325 },
{ "epoch": 9.27763272410792, "grad_norm": 0.6484375, "learning_rate": 3.092271377092215e-06, "loss": 0.1968, "step": 5330 },
{ "epoch": 9.28633594429939, "grad_norm": 0.6484375, "learning_rate": 3.0176893504230807e-06, "loss": 0.1955, "step": 5335 },
{ "epoch": 9.295039164490861, "grad_norm": 0.66796875, "learning_rate": 2.944003974146525e-06, "loss": 0.1939, "step": 5340 },
{ "epoch": 9.303742384682332, "grad_norm": 0.703125, "learning_rate": 2.8712159295209873e-06, "loss": 0.1955, "step": 5345 },
{ "epoch": 9.312445604873803, "grad_norm": 0.65234375, "learning_rate": 2.7993258895086973e-06, "loss": 0.1925, "step": 5350 },
{ "epoch": 9.321148825065274, "grad_norm": 0.6875, "learning_rate": 2.7283345187693264e-06, "loss": 0.196, "step": 5355 },
{ "epoch": 9.329852045256745, "grad_norm": 0.63671875, "learning_rate": 2.658242473653905e-06, "loss": 0.1929, "step": 5360 },
{ "epoch": 9.338555265448216, "grad_norm": 0.65625, "learning_rate": 2.589050402198767e-06, "loss": 0.1958, "step": 5365 },
{ "epoch": 9.347258485639687, "grad_norm": 0.671875, "learning_rate": 2.520758944119539e-06, "loss": 0.1939, "step": 5370 },
{ "epoch": 9.355961705831158, "grad_norm": 0.6640625, "learning_rate": 2.4533687308051835e-06, "loss": 0.1917, "step": 5375 },
{ "epoch": 9.364664926022629, "grad_norm": 0.6953125, "learning_rate": 2.386880385312218e-06, "loss": 0.1937, "step": 5380 },
{ "epoch": 9.3733681462141, "grad_norm": 0.6484375, "learning_rate": 2.321294522358952e-06, "loss": 0.1988, "step": 5385 },
{ "epoch": 9.38207136640557, "grad_norm": 0.65625, "learning_rate": 2.256611748319792e-06, "loss": 0.1943, "step": 5390 },
{ "epoch": 9.390774586597042, "grad_norm": 0.65625, "learning_rate": 2.1928326612196015e-06, "loss": 0.1964, "step": 5395 },
{ "epoch": 9.399477806788513, "grad_norm": 0.640625, "learning_rate": 2.1299578507282147e-06, "loss": 0.196, "step": 5400 },
{ "epoch": 9.408181026979982, "grad_norm": 0.7265625, "learning_rate": 2.0679878981549993e-06, "loss": 0.1921, "step": 5405 },
{ "epoch": 9.416884247171453, "grad_norm": 0.71875, "learning_rate": 2.006923376443415e-06, "loss": 0.1983, "step": 5410 },
{ "epoch": 9.425587467362924, "grad_norm": 0.67578125, "learning_rate": 1.946764850165772e-06, "loss": 0.1984, "step": 5415 },
{ "epoch": 9.434290687554395, "grad_norm": 0.6640625, "learning_rate": 1.8875128755179938e-06, "loss": 0.198, "step": 5420 },
{ "epoch": 9.442993907745866, "grad_norm": 0.71875, "learning_rate": 1.8291680003145073e-06, "loss": 0.1977, "step": 5425 },
{ "epoch": 9.451697127937337, "grad_norm": 0.671875, "learning_rate": 1.7717307639831037e-06, "loss": 0.1966, "step": 5430 },
{ "epoch": 9.460400348128807, "grad_norm": 0.6875, "learning_rate": 1.7152016975599983e-06, "loss": 0.1959, "step": 5435 },
{ "epoch": 9.469103568320278, "grad_norm": 0.68359375, "learning_rate": 1.6595813236849556e-06, "loss": 0.1946, "step": 5440 },
{ "epoch": 9.47780678851175, "grad_norm": 0.71875, "learning_rate": 1.604870156596383e-06, "loss": 0.194, "step": 5445 },
{ "epoch": 9.48651000870322, "grad_norm": 0.71875, "learning_rate": 1.5510687021266234e-06, "loss": 0.1926, "step": 5450 },
{ "epoch": 9.495213228894691, "grad_norm": 0.73046875, "learning_rate": 1.4981774576972584e-06, "loss": 0.1963, "step": 5455 },
{ "epoch": 9.503916449086162, "grad_norm": 0.69140625, "learning_rate": 1.4461969123145457e-06, "loss": 0.1973, "step": 5460 },
{ "epoch": 9.512619669277633, "grad_norm": 0.703125, "learning_rate": 1.395127546564845e-06, "loss": 0.1963, "step": 5465 },
{ "epoch": 9.521322889469104, "grad_norm": 0.73046875, "learning_rate": 1.344969832610199e-06, "loss": 0.1932, "step": 5470 },
{ "epoch": 9.530026109660575, "grad_norm": 0.71875, "learning_rate": 1.2957242341839927e-06, "loss": 0.197, "step": 5475 },
{ "epoch": 9.538729329852046, "grad_norm": 0.828125, "learning_rate": 1.2473912065866345e-06, "loss": 0.1921, "step": 5480 },
{ "epoch": 9.547432550043515, "grad_norm": 0.65234375, "learning_rate": 1.1999711966813377e-06, "loss": 0.1969, "step": 5485 },
{ "epoch": 9.556135770234986, "grad_norm": 0.6875, "learning_rate": 1.1534646428900232e-06, "loss": 0.1981, "step": 5490 },
{ "epoch": 9.564838990426457, "grad_norm": 0.69140625, "learning_rate": 1.107871975189234e-06, "loss": 0.2015, "step": 5495 },
{ "epoch": 9.573542210617928, "grad_norm": 0.7265625, "learning_rate": 1.0631936151062172e-06, "loss": 0.1953, "step": 5500 },
{ "epoch": 9.5822454308094, "grad_norm": 0.67578125, "learning_rate": 1.019429975714914e-06, "loss": 0.1969, "step": 5505 },
{ "epoch": 9.59094865100087, "grad_norm": 0.65234375, "learning_rate": 9.765814616322755e-07, "loss": 0.1956, "step": 5510 },
{ "epoch": 9.599651871192341, "grad_norm": 0.6796875, "learning_rate": 9.346484690144319e-07, "loss": 0.1987, "step": 5515 },
{ "epoch": 9.608355091383812, "grad_norm": 0.65234375, "learning_rate": 8.936313855530398e-07, "loss": 0.1944, "step": 5520 },
{ "epoch": 9.617058311575283, "grad_norm": 0.6796875, "learning_rate": 8.535305904717517e-07, "loss": 0.1932, "step": 5525 },
{ "epoch": 9.625761531766754, "grad_norm": 0.6875, "learning_rate": 8.143464545226298e-07, "loss": 0.196, "step": 5530 },
{ "epoch": 9.634464751958225, "grad_norm": 0.6875, "learning_rate": 7.760793399827937e-07, "loss": 0.1967, "step": 5535 },
{ "epoch": 9.643167972149696, "grad_norm": 0.703125, "learning_rate": 7.387296006510225e-07, "loss": 0.1958, "step": 5540 },
{ "epoch": 9.651871192341167, "grad_norm": 0.69140625, "learning_rate": 7.022975818445022e-07, "loss": 0.1933, "step": 5545 },
{ "epoch": 9.660574412532638, "grad_norm": 0.73046875, "learning_rate": 6.667836203956168e-07, "loss": 0.1972, "step": 5550 },
{ "epoch": 9.669277632724107, "grad_norm": 0.65234375, "learning_rate": 6.321880446488737e-07, "loss": 0.1932, "step": 5555 },
{ "epoch": 9.677980852915578, "grad_norm": 0.69921875, "learning_rate": 5.985111744578165e-07, "loss": 0.1977, "step": 5560 },
{ "epoch": 9.686684073107049, "grad_norm": 0.67578125, "learning_rate": 5.657533211820942e-07, "loss": 0.1979, "step": 5565 },
{ "epoch": 9.69538729329852, "grad_norm": 0.66796875, "learning_rate": 5.339147876845974e-07, "loss": 0.1961, "step": 5570 },
{ "epoch": 9.70409051348999, "grad_norm": 0.7265625, "learning_rate": 5.029958683286263e-07, "loss": 0.197, "step": 5575 },
{ "epoch": 9.712793733681462, "grad_norm": 0.6875, "learning_rate": 4.7299684897520456e-07, "loss": 0.193, "step": 5580 },
{ "epoch": 9.721496953872933, "grad_norm": 0.69140625, "learning_rate": 4.4391800698038165e-07, "loss": 0.1961, "step": 5585
}, |
|
{ |
|
"epoch": 9.730200174064404, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 4.157596111927342e-07, |
|
"loss": 0.1903, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 9.738903394255875, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 3.8852192195083516e-07, |
|
"loss": 0.1948, |
|
"step": 5595 |
|
}, |
|
{ |
|
"epoch": 9.747606614447346, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 3.622051910808666e-07, |
|
"loss": 0.1969, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 9.756309834638817, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 3.368096618942773e-07, |
|
"loss": 0.1948, |
|
"step": 5605 |
|
}, |
|
{ |
|
"epoch": 9.765013054830288, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 3.1233556918555117e-07, |
|
"loss": 0.1982, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 9.773716275021759, |
|
"grad_norm": 0.625, |
|
"learning_rate": 2.8878313923002e-07, |
|
"loss": 0.1929, |
|
"step": 5615 |
|
}, |
|
{ |
|
"epoch": 9.78241949521323, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 2.661525897817874e-07, |
|
"loss": 0.1987, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 9.7911227154047, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 2.444441300717082e-07, |
|
"loss": 0.1953, |
|
"step": 5625 |
|
}, |
|
{ |
|
"epoch": 9.799825935596171, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 2.2365796080542345e-07, |
|
"loss": 0.2007, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 9.80852915578764, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 2.037942741615617e-07, |
|
"loss": 0.2001, |
|
"step": 5635 |
|
}, |
|
{ |
|
"epoch": 9.817232375979112, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 1.8485325378994056e-07, |
|
"loss": 0.198, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 9.825935596170583, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 1.6683507480983462e-07, |
|
"loss": 0.1958, |
|
"step": 5645 |
|
}, |
|
{ |
|
"epoch": 9.834638816362054, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 1.4973990380841019e-07, |
|
"loss": 0.1938, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 9.843342036553524, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 1.3356789883914865e-07, |
|
"loss": 0.1938, |
|
"step": 5655 |
|
}, |
|
{ |
|
"epoch": 9.852045256744995, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.1831920942039221e-07, |
|
"loss": 0.1973, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 9.860748476936466, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 1.0399397653395593e-07, |
|
"loss": 0.2024, |
|
"step": 5665 |
|
}, |
|
{ |
|
"epoch": 9.869451697127937, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 9.059233262386225e-08, |
|
"loss": 0.1995, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 9.878154917319408, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 7.811440159507522e-08, |
|
"loss": 0.1972, |
|
"step": 5675 |
|
}, |
|
{ |
|
"epoch": 9.88685813751088, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 6.656029881233483e-08, |
|
"loss": 0.1938, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 9.89556135770235, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 5.593013109917999e-08, |
|
"loss": 0.1974, |
|
"step": 5685 |
|
}, |
|
{ |
|
"epoch": 9.904264577893821, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 4.6223996736860506e-08, |
|
"loss": 0.1957, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 9.912967798085292, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 3.744198546348221e-08, |
|
"loss": 0.1971, |
|
"step": 5695 |
|
}, |
|
{ |
|
"epoch": 9.921671018276763, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 2.9584178473174296e-08, |
|
"loss": 0.1977, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 9.930374238468232, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 2.2650648415334376e-08, |
|
"loss": 0.1934, |
|
"step": 5705 |
|
}, |
|
{ |
|
"epoch": 9.939077458659703, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 1.664145939394013e-08, |
|
"loss": 0.1937, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 9.947780678851174, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 1.1556666966971997e-08, |
|
"loss": 0.1996, |
|
"step": 5715 |
|
}, |
|
{ |
|
"epoch": 9.956483899042645, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 7.39631814590247e-09, |
|
"loss": 0.1984, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 9.965187119234116, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 4.160451395263109e-09, |
|
"loss": 0.1927, |
|
"step": 5725 |
|
}, |
|
{ |
|
"epoch": 9.973890339425587, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 1.8490966322670666e-09, |
|
"loss": 0.2, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 9.982593559617058, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 4.6227522655373223e-10, |
|
"loss": 0.1946, |
|
"step": 5735 |
|
}, |
|
{ |
|
"epoch": 9.991296779808529, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0, |
|
"loss": 0.1899, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 9.991296779808529, |
|
"eval_loss": 5.076513767242432, |
|
"eval_runtime": 0.7783, |
|
"eval_samples_per_second": 7.709, |
|
"eval_steps_per_second": 1.285, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 9.991296779808529, |
|
"step": 5740, |
|
"total_flos": 6.645284010274587e+18, |
|
"train_loss": 0.6571785643956387, |
|
"train_runtime": 32584.0572, |
|
"train_samples_per_second": 4.229, |
|
"train_steps_per_second": 0.176 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 5740, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.645284010274587e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|