{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 1100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00909090909090909,
      "grad_norm": 548.0,
      "learning_rate": 1.818181818181818e-06,
      "loss": 53.7965,
      "step": 1
    },
    {
      "epoch": 0.045454545454545456,
      "grad_norm": 668.0,
      "learning_rate": 9.090909090909091e-06,
      "loss": 55.3182,
      "step": 5
    },
    {
      "epoch": 0.09090909090909091,
      "grad_norm": 486.0,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 51.173,
      "step": 10
    },
    {
      "epoch": 0.13636363636363635,
      "grad_norm": 159.0,
      "learning_rate": 2.7272727272727273e-05,
      "loss": 33.7467,
      "step": 15
    },
    {
      "epoch": 0.18181818181818182,
      "grad_norm": 37.5,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 24.0423,
      "step": 20
    },
    {
      "epoch": 0.22727272727272727,
      "grad_norm": 32.25,
      "learning_rate": 4.545454545454546e-05,
      "loss": 22.1424,
      "step": 25
    },
    {
      "epoch": 0.2727272727272727,
      "grad_norm": 21.25,
      "learning_rate": 5.4545454545454546e-05,
      "loss": 20.3818,
      "step": 30
    },
    {
      "epoch": 0.3181818181818182,
      "grad_norm": 7.90625,
      "learning_rate": 6.363636363636364e-05,
      "loss": 18.8064,
      "step": 35
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 11.625,
      "learning_rate": 7.272727272727273e-05,
      "loss": 17.8515,
      "step": 40
    },
    {
      "epoch": 0.4090909090909091,
      "grad_norm": 27.125,
      "learning_rate": 8.181818181818183e-05,
      "loss": 16.8604,
      "step": 45
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 56.25,
      "learning_rate": 9.090909090909092e-05,
      "loss": 12.4796,
      "step": 50
    },
    {
      "epoch": 0.5,
      "grad_norm": 11.4375,
      "learning_rate": 0.0001,
      "loss": 4.4979,
      "step": 55
    },
    {
      "epoch": 0.5454545454545454,
      "grad_norm": 3.25,
      "learning_rate": 0.00010909090909090909,
      "loss": 1.9919,
      "step": 60
    },
    {
      "epoch": 0.5909090909090909,
      "grad_norm": 25.375,
      "learning_rate": 0.0001181818181818182,
      "loss": 1.6619,
      "step": 65
    },
    {
      "epoch": 0.6363636363636364,
      "grad_norm": 3.125,
      "learning_rate": 0.00012727272727272728,
      "loss": 1.527,
      "step": 70
    },
    {
      "epoch": 0.6818181818181818,
      "grad_norm": 9.6875,
      "learning_rate": 0.00013636363636363637,
      "loss": 1.3688,
      "step": 75
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 8.0625,
      "learning_rate": 0.00014545454545454546,
      "loss": 1.2618,
      "step": 80
    },
    {
      "epoch": 0.7727272727272727,
      "grad_norm": 6.9375,
      "learning_rate": 0.00015454545454545454,
      "loss": 1.238,
      "step": 85
    },
    {
      "epoch": 0.8181818181818182,
      "grad_norm": 3.15625,
      "learning_rate": 0.00016363636363636366,
      "loss": 1.1871,
      "step": 90
    },
    {
      "epoch": 0.8636363636363636,
      "grad_norm": 4.34375,
      "learning_rate": 0.00017272727272727275,
      "loss": 1.154,
      "step": 95
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 2.65625,
      "learning_rate": 0.00018181818181818183,
      "loss": 1.1375,
      "step": 100
    },
    {
      "epoch": 0.9545454545454546,
      "grad_norm": 3.296875,
      "learning_rate": 0.00019090909090909092,
      "loss": 1.0352,
      "step": 105
    },
    {
      "epoch": 1.0,
      "grad_norm": 9.25,
      "learning_rate": 0.0002,
      "loss": 1.0256,
      "step": 110
    },
    {
      "epoch": 1.0,
      "eval_loss": 2.388709783554077,
      "eval_runtime": 1.0069,
      "eval_samples_per_second": 4.966,
      "eval_steps_per_second": 1.986,
      "step": 110
    },
    {
      "epoch": 1.0454545454545454,
      "grad_norm": 1.53125,
      "learning_rate": 0.00019998741276738754,
      "loss": 1.0186,
      "step": 115
    },
    {
      "epoch": 1.0909090909090908,
      "grad_norm": 2.5625,
      "learning_rate": 0.00019994965423831854,
      "loss": 0.9466,
      "step": 120
    },
    {
      "epoch": 1.1363636363636362,
      "grad_norm": 7.21875,
      "learning_rate": 0.0001998867339183008,
      "loss": 0.9645,
      "step": 125
    },
    {
      "epoch": 1.1818181818181819,
      "grad_norm": 86.5,
      "learning_rate": 0.00019979866764718843,
      "loss": 0.9547,
      "step": 130
    },
    {
      "epoch": 1.2272727272727273,
      "grad_norm": 1.203125,
      "learning_rate": 0.00019968547759519425,
      "loss": 0.9527,
      "step": 135
    },
    {
      "epoch": 1.2727272727272727,
      "grad_norm": 1.265625,
      "learning_rate": 0.00019954719225730847,
      "loss": 0.9091,
      "step": 140
    },
    {
      "epoch": 1.3181818181818181,
      "grad_norm": 1.703125,
      "learning_rate": 0.00019938384644612543,
      "loss": 0.8789,
      "step": 145
    },
    {
      "epoch": 1.3636363636363638,
      "grad_norm": 2.40625,
      "learning_rate": 0.00019919548128307954,
      "loss": 0.9036,
      "step": 150
    },
    {
      "epoch": 1.4090909090909092,
      "grad_norm": 5.09375,
      "learning_rate": 0.0001989821441880933,
      "loss": 0.8787,
      "step": 155
    },
    {
      "epoch": 1.4545454545454546,
      "grad_norm": 1.0859375,
      "learning_rate": 0.00019874388886763944,
      "loss": 0.8671,
      "step": 160
    },
    {
      "epoch": 1.5,
      "grad_norm": 2.046875,
      "learning_rate": 0.00019848077530122083,
      "loss": 0.8872,
      "step": 165
    },
    {
      "epoch": 1.5454545454545454,
      "grad_norm": 4.0625,
      "learning_rate": 0.00019819286972627066,
      "loss": 0.9179,
      "step": 170
    },
    {
      "epoch": 1.5909090909090908,
      "grad_norm": 3.25,
      "learning_rate": 0.00019788024462147788,
      "loss": 0.8857,
      "step": 175
    },
    {
      "epoch": 1.6363636363636362,
      "grad_norm": 1.9140625,
      "learning_rate": 0.00019754297868854073,
      "loss": 0.8474,
      "step": 180
    },
    {
      "epoch": 1.6818181818181817,
      "grad_norm": 3.03125,
      "learning_rate": 0.00019718115683235417,
      "loss": 0.861,
      "step": 185
    },
    {
      "epoch": 1.7272727272727273,
      "grad_norm": 3.515625,
      "learning_rate": 0.00019679487013963564,
      "loss": 0.8266,
      "step": 190
    },
    {
      "epoch": 1.7727272727272727,
      "grad_norm": 0.9140625,
      "learning_rate": 0.00019638421585599423,
      "loss": 0.8515,
      "step": 195
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 2.015625,
      "learning_rate": 0.00019594929736144976,
      "loss": 0.8328,
      "step": 200
    },
    {
      "epoch": 1.8636363636363638,
      "grad_norm": 1.7578125,
      "learning_rate": 0.0001954902241444074,
      "loss": 0.8601,
      "step": 205
    },
    {
      "epoch": 1.9090909090909092,
      "grad_norm": 1.65625,
      "learning_rate": 0.00019500711177409454,
      "loss": 0.8435,
      "step": 210
    },
    {
      "epoch": 1.9545454545454546,
      "grad_norm": 2.421875,
      "learning_rate": 0.00019450008187146684,
      "loss": 0.8082,
      "step": 215
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.75390625,
      "learning_rate": 0.00019396926207859084,
      "loss": 0.8325,
      "step": 220
    },
    {
      "epoch": 2.0,
      "eval_loss": 2.227036952972412,
      "eval_runtime": 1.0053,
      "eval_samples_per_second": 4.974,
      "eval_steps_per_second": 1.989,
      "step": 220
    },
    {
      "epoch": 2.0454545454545454,
      "grad_norm": 0.70703125,
      "learning_rate": 0.00019341478602651069,
      "loss": 0.7413,
      "step": 225
    },
    {
      "epoch": 2.090909090909091,
      "grad_norm": 3.796875,
      "learning_rate": 0.00019283679330160726,
      "loss": 0.7526,
      "step": 230
    },
    {
      "epoch": 2.1363636363636362,
      "grad_norm": 0.69921875,
      "learning_rate": 0.00019223542941045817,
      "loss": 0.739,
      "step": 235
    },
    {
      "epoch": 2.1818181818181817,
      "grad_norm": 1.09375,
      "learning_rate": 0.00019161084574320696,
      "loss": 0.7665,
      "step": 240
    },
    {
      "epoch": 2.227272727272727,
      "grad_norm": 0.859375,
      "learning_rate": 0.00019096319953545185,
      "loss": 0.7655,
      "step": 245
    },
    {
      "epoch": 2.2727272727272725,
      "grad_norm": 2.53125,
      "learning_rate": 0.00019029265382866214,
      "loss": 0.753,
      "step": 250
    },
    {
      "epoch": 2.3181818181818183,
      "grad_norm": 2.09375,
      "learning_rate": 0.00018959937742913359,
      "loss": 0.7557,
      "step": 255
    },
    {
      "epoch": 2.3636363636363638,
      "grad_norm": 1.2109375,
      "learning_rate": 0.00018888354486549237,
      "loss": 0.7805,
      "step": 260
    },
    {
      "epoch": 2.409090909090909,
      "grad_norm": 4.75,
      "learning_rate": 0.00018814533634475822,
      "loss": 0.7451,
      "step": 265
    },
    {
      "epoch": 2.4545454545454546,
      "grad_norm": 1.109375,
      "learning_rate": 0.00018738493770697852,
      "loss": 0.7308,
      "step": 270
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.0703125,
      "learning_rate": 0.00018660254037844388,
      "loss": 0.8117,
      "step": 275
    },
    {
      "epoch": 2.5454545454545454,
      "grad_norm": 1.515625,
      "learning_rate": 0.00018579834132349772,
      "loss": 0.7756,
      "step": 280
    },
    {
      "epoch": 2.590909090909091,
      "grad_norm": 0.64453125,
      "learning_rate": 0.00018497254299495146,
      "loss": 0.7503,
      "step": 285
    },
    {
      "epoch": 2.6363636363636362,
      "grad_norm": 1.25,
      "learning_rate": 0.00018412535328311814,
      "loss": 0.7886,
      "step": 290
    },
    {
      "epoch": 2.6818181818181817,
      "grad_norm": 1.1328125,
      "learning_rate": 0.00018325698546347715,
      "loss": 0.7452,
      "step": 295
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 4.0,
      "learning_rate": 0.0001823676581429833,
      "loss": 0.7579,
      "step": 300
    },
    {
      "epoch": 2.7727272727272725,
      "grad_norm": 1.59375,
      "learning_rate": 0.00018145759520503358,
      "loss": 0.7494,
      "step": 305
    },
    {
      "epoch": 2.8181818181818183,
      "grad_norm": 1.6953125,
      "learning_rate": 0.00018052702575310588,
      "loss": 0.7343,
      "step": 310
    },
    {
      "epoch": 2.8636363636363638,
      "grad_norm": 3.28125,
      "learning_rate": 0.00017957618405308324,
      "loss": 0.7402,
      "step": 315
    },
    {
      "epoch": 2.909090909090909,
      "grad_norm": 1.1328125,
      "learning_rate": 0.00017860530947427875,
      "loss": 0.7428,
      "step": 320
    },
    {
      "epoch": 2.9545454545454546,
      "grad_norm": 0.828125,
      "learning_rate": 0.0001776146464291757,
      "loss": 0.7699,
      "step": 325
    },
    {
      "epoch": 3.0,
      "grad_norm": 1.21875,
      "learning_rate": 0.0001766044443118978,
      "loss": 0.749,
      "step": 330
    },
    {
      "epoch": 3.0,
      "eval_loss": 2.2333221435546875,
      "eval_runtime": 1.0046,
      "eval_samples_per_second": 4.977,
      "eval_steps_per_second": 1.991,
      "step": 330
    },
    {
      "epoch": 3.0454545454545454,
      "grad_norm": 1.1328125,
      "learning_rate": 0.00017557495743542585,
      "loss": 0.6475,
      "step": 335
    },
    {
      "epoch": 3.090909090909091,
      "grad_norm": 0.63671875,
      "learning_rate": 0.0001745264449675755,
      "loss": 0.6833,
      "step": 340
    },
    {
      "epoch": 3.1363636363636362,
      "grad_norm": 0.671875,
      "learning_rate": 0.00017345917086575332,
      "loss": 0.6776,
      "step": 345
    },
    {
      "epoch": 3.1818181818181817,
      "grad_norm": 0.921875,
      "learning_rate": 0.00017237340381050703,
      "loss": 0.6798,
      "step": 350
    },
    {
      "epoch": 3.227272727272727,
      "grad_norm": 1.5078125,
      "learning_rate": 0.00017126941713788632,
      "loss": 0.6606,
      "step": 355
    },
    {
      "epoch": 3.2727272727272725,
      "grad_norm": 1.203125,
      "learning_rate": 0.00017014748877063214,
      "loss": 0.6629,
      "step": 360
    },
    {
      "epoch": 3.3181818181818183,
      "grad_norm": 0.9375,
      "learning_rate": 0.00016900790114821122,
      "loss": 0.6871,
      "step": 365
    },
    {
      "epoch": 3.3636363636363638,
      "grad_norm": 0.578125,
      "learning_rate": 0.00016785094115571322,
      "loss": 0.6575,
      "step": 370
    },
    {
      "epoch": 3.409090909090909,
      "grad_norm": 0.9140625,
      "learning_rate": 0.00016667690005162916,
      "loss": 0.6674,
      "step": 375
    },
    {
      "epoch": 3.4545454545454546,
      "grad_norm": 0.6171875,
      "learning_rate": 0.00016548607339452853,
      "loss": 0.7054,
      "step": 380
    },
    {
      "epoch": 3.5,
      "grad_norm": 1.0546875,
      "learning_rate": 0.00016427876096865394,
      "loss": 0.6646,
      "step": 385
    },
    {
      "epoch": 3.5454545454545454,
      "grad_norm": 1.453125,
      "learning_rate": 0.00016305526670845226,
      "loss": 0.6766,
      "step": 390
    },
    {
      "epoch": 3.590909090909091,
      "grad_norm": 1.5703125,
      "learning_rate": 0.00016181589862206052,
      "loss": 0.6716,
      "step": 395
    },
    {
      "epoch": 3.6363636363636362,
      "grad_norm": 0.9140625,
      "learning_rate": 0.00016056096871376667,
      "loss": 0.6976,
      "step": 400
    },
    {
      "epoch": 3.6818181818181817,
      "grad_norm": 1.734375,
      "learning_rate": 0.00015929079290546408,
      "loss": 0.6624,
      "step": 405
    },
    {
      "epoch": 3.7272727272727275,
      "grad_norm": 0.83984375,
      "learning_rate": 0.00015800569095711982,
      "loss": 0.6907,
      "step": 410
    },
    {
      "epoch": 3.7727272727272725,
      "grad_norm": 1.078125,
      "learning_rate": 0.00015670598638627706,
      "loss": 0.6664,
      "step": 415
    },
    {
      "epoch": 3.8181818181818183,
      "grad_norm": 0.88671875,
      "learning_rate": 0.00015539200638661104,
      "loss": 0.6642,
      "step": 420
    },
    {
      "epoch": 3.8636363636363638,
      "grad_norm": 3.203125,
      "learning_rate": 0.00015406408174555976,
      "loss": 0.7091,
      "step": 425
    },
    {
      "epoch": 3.909090909090909,
      "grad_norm": 1.9296875,
      "learning_rate": 0.00015272254676105025,
      "loss": 0.7034,
      "step": 430
    },
    {
      "epoch": 3.9545454545454546,
      "grad_norm": 0.98828125,
      "learning_rate": 0.00015136773915734066,
      "loss": 0.6679,
      "step": 435
    },
    {
      "epoch": 4.0,
      "grad_norm": 2.140625,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.6755,
      "step": 440
    },
    {
      "epoch": 4.0,
      "eval_loss": 2.29927659034729,
      "eval_runtime": 1.0067,
      "eval_samples_per_second": 4.967,
      "eval_steps_per_second": 1.987,
      "step": 440
    },
    {
      "epoch": 4.045454545454546,
      "grad_norm": 2.75,
      "learning_rate": 0.00014861967361004687,
      "loss": 0.6121,
      "step": 445
    },
    {
      "epoch": 4.090909090909091,
      "grad_norm": 0.625,
      "learning_rate": 0.0001472271074772683,
      "loss": 0.598,
      "step": 450
    },
    {
      "epoch": 4.136363636363637,
      "grad_norm": 0.94140625,
      "learning_rate": 0.00014582265217274104,
      "loss": 0.5832,
      "step": 455
    },
    {
      "epoch": 4.181818181818182,
      "grad_norm": 1.1640625,
      "learning_rate": 0.00014440666126057744,
      "loss": 0.591,
      "step": 460
    },
    {
      "epoch": 4.2272727272727275,
      "grad_norm": 0.61328125,
      "learning_rate": 0.00014297949120891718,
      "loss": 0.5767,
      "step": 465
    },
    {
      "epoch": 4.2727272727272725,
      "grad_norm": 0.69140625,
      "learning_rate": 0.00014154150130018866,
      "loss": 0.5742,
      "step": 470
    },
    {
      "epoch": 4.318181818181818,
      "grad_norm": 0.6328125,
      "learning_rate": 0.00014009305354066137,
      "loss": 0.5806,
      "step": 475
    },
    {
      "epoch": 4.363636363636363,
      "grad_norm": 0.7109375,
      "learning_rate": 0.00013863451256931287,
      "loss": 0.605,
      "step": 480
    },
    {
      "epoch": 4.409090909090909,
      "grad_norm": 0.6796875,
      "learning_rate": 0.00013716624556603274,
      "loss": 0.6142,
      "step": 485
    },
    {
      "epoch": 4.454545454545454,
      "grad_norm": 0.80859375,
      "learning_rate": 0.00013568862215918717,
      "loss": 0.586,
      "step": 490
    },
    {
      "epoch": 4.5,
      "grad_norm": 1.34375,
      "learning_rate": 0.00013420201433256689,
      "loss": 0.6088,
      "step": 495
    },
    {
      "epoch": 4.545454545454545,
      "grad_norm": 0.9765625,
      "learning_rate": 0.00013270679633174218,
      "loss": 0.5974,
      "step": 500
    },
    {
      "epoch": 4.590909090909091,
      "grad_norm": 1.171875,
      "learning_rate": 0.0001312033445698487,
      "loss": 0.5808,
      "step": 505
    },
    {
      "epoch": 4.636363636363637,
      "grad_norm": 1.515625,
      "learning_rate": 0.0001296920375328275,
      "loss": 0.6008,
      "step": 510
    },
    {
      "epoch": 4.681818181818182,
      "grad_norm": 1.6640625,
      "learning_rate": 0.00012817325568414297,
      "loss": 0.6069,
      "step": 515
    },
    {
      "epoch": 4.7272727272727275,
      "grad_norm": 0.6484375,
      "learning_rate": 0.00012664738136900348,
      "loss": 0.6111,
      "step": 520
    },
    {
      "epoch": 4.7727272727272725,
      "grad_norm": 0.640625,
      "learning_rate": 0.0001251147987181079,
      "loss": 0.6114,
      "step": 525
    },
    {
      "epoch": 4.818181818181818,
      "grad_norm": 0.6015625,
      "learning_rate": 0.00012357589355094275,
      "loss": 0.5905,
      "step": 530
    },
    {
      "epoch": 4.863636363636363,
      "grad_norm": 0.6171875,
      "learning_rate": 0.00012203105327865407,
      "loss": 0.6153,
      "step": 535
    },
    {
      "epoch": 4.909090909090909,
      "grad_norm": 0.73828125,
      "learning_rate": 0.00012048066680651908,
      "loss": 0.6179,
      "step": 540
    },
    {
      "epoch": 4.954545454545455,
      "grad_norm": 0.62890625,
      "learning_rate": 0.00011892512443604102,
      "loss": 0.5833,
      "step": 545
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.54296875,
      "learning_rate": 0.00011736481776669306,
      "loss": 0.6197,
      "step": 550
    },
    {
      "epoch": 5.0,
      "eval_loss": 2.3820290565490723,
      "eval_runtime": 1.0056,
      "eval_samples_per_second": 4.972,
      "eval_steps_per_second": 1.989,
      "step": 550
    },
    {
      "epoch": 5.045454545454546,
      "grad_norm": 0.71484375,
      "learning_rate": 0.000115800139597335,
      "loss": 0.5034,
      "step": 555
    },
    {
      "epoch": 5.090909090909091,
      "grad_norm": 0.66796875,
      "learning_rate": 0.00011423148382732853,
      "loss": 0.5171,
      "step": 560
    },
    {
      "epoch": 5.136363636363637,
      "grad_norm": 0.6875,
      "learning_rate": 0.00011265924535737493,
      "loss": 0.5233,
      "step": 565
    },
    {
      "epoch": 5.181818181818182,
      "grad_norm": 0.61328125,
      "learning_rate": 0.00011108381999010111,
      "loss": 0.5103,
      "step": 570
    },
    {
      "epoch": 5.2272727272727275,
      "grad_norm": 0.63671875,
      "learning_rate": 0.00010950560433041826,
      "loss": 0.5371,
      "step": 575
    },
    {
      "epoch": 5.2727272727272725,
      "grad_norm": 0.6171875,
      "learning_rate": 0.00010792499568567884,
      "loss": 0.5228,
      "step": 580
    },
    {
      "epoch": 5.318181818181818,
      "grad_norm": 0.7734375,
      "learning_rate": 0.00010634239196565646,
      "loss": 0.5308,
      "step": 585
    },
    {
      "epoch": 5.363636363636363,
      "grad_norm": 0.6015625,
      "learning_rate": 0.00010475819158237425,
      "loss": 0.5349,
      "step": 590
    },
    {
      "epoch": 5.409090909090909,
      "grad_norm": 0.59765625,
      "learning_rate": 0.00010317279334980678,
      "loss": 0.5204,
      "step": 595
    },
    {
      "epoch": 5.454545454545454,
      "grad_norm": 0.70703125,
      "learning_rate": 0.00010158659638348081,
      "loss": 0.5182,
      "step": 600
    },
    {
      "epoch": 5.5,
      "grad_norm": 0.58984375,
      "learning_rate": 0.0001,
      "loss": 0.5215,
      "step": 605
    },
    {
      "epoch": 5.545454545454545,
      "grad_norm": 0.6484375,
      "learning_rate": 9.84134036165192e-05,
      "loss": 0.5282,
      "step": 610
    },
    {
      "epoch": 5.590909090909091,
      "grad_norm": 0.6015625,
      "learning_rate": 9.682720665019325e-05,
      "loss": 0.5103,
      "step": 615
    },
    {
      "epoch": 5.636363636363637,
      "grad_norm": 0.63671875,
      "learning_rate": 9.524180841762577e-05,
      "loss": 0.5257,
      "step": 620
    },
    {
      "epoch": 5.681818181818182,
      "grad_norm": 0.6640625,
      "learning_rate": 9.365760803434355e-05,
      "loss": 0.5228,
      "step": 625
    },
    {
      "epoch": 5.7272727272727275,
      "grad_norm": 0.60546875,
      "learning_rate": 9.207500431432115e-05,
      "loss": 0.5487,
      "step": 630
    },
    {
      "epoch": 5.7727272727272725,
      "grad_norm": 0.7734375,
      "learning_rate": 9.049439566958175e-05,
      "loss": 0.528,
      "step": 635
    },
    {
      "epoch": 5.818181818181818,
      "grad_norm": 0.79296875,
      "learning_rate": 8.891618000989891e-05,
      "loss": 0.5213,
      "step": 640
    },
    {
      "epoch": 5.863636363636363,
      "grad_norm": 0.640625,
      "learning_rate": 8.734075464262507e-05,
      "loss": 0.5312,
      "step": 645
    },
    {
      "epoch": 5.909090909090909,
      "grad_norm": 0.62890625,
      "learning_rate": 8.57685161726715e-05,
      "loss": 0.5208,
      "step": 650
    },
    {
      "epoch": 5.954545454545455,
      "grad_norm": 0.6953125,
      "learning_rate": 8.4199860402665e-05,
      "loss": 0.5372,
      "step": 655
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.640625,
      "learning_rate": 8.263518223330697e-05,
      "loss": 0.5208,
      "step": 660
    },
    {
      "epoch": 6.0,
      "eval_loss": 2.586942672729492,
      "eval_runtime": 1.0065,
      "eval_samples_per_second": 4.968,
      "eval_steps_per_second": 1.987,
      "step": 660
    },
    {
      "epoch": 6.045454545454546,
      "grad_norm": 0.7734375,
      "learning_rate": 8.107487556395901e-05,
      "loss": 0.46,
      "step": 665
    },
    {
      "epoch": 6.090909090909091,
      "grad_norm": 0.703125,
      "learning_rate": 7.951933319348095e-05,
      "loss": 0.4614,
      "step": 670
    },
    {
      "epoch": 6.136363636363637,
      "grad_norm": 0.6640625,
      "learning_rate": 7.796894672134594e-05,
      "loss": 0.4648,
      "step": 675
    },
    {
      "epoch": 6.181818181818182,
      "grad_norm": 0.76171875,
      "learning_rate": 7.642410644905726e-05,
      "loss": 0.4445,
      "step": 680
    },
    {
      "epoch": 6.2272727272727275,
      "grad_norm": 0.67578125,
      "learning_rate": 7.488520128189209e-05,
      "loss": 0.4483,
      "step": 685
    },
    {
      "epoch": 6.2727272727272725,
      "grad_norm": 0.6875,
      "learning_rate": 7.335261863099651e-05,
      "loss": 0.4484,
      "step": 690
    },
    {
      "epoch": 6.318181818181818,
      "grad_norm": 0.65234375,
      "learning_rate": 7.182674431585704e-05,
      "loss": 0.4452,
      "step": 695
    },
    {
      "epoch": 6.363636363636363,
      "grad_norm": 0.69140625,
      "learning_rate": 7.030796246717255e-05,
      "loss": 0.4615,
      "step": 700
    },
    {
      "epoch": 6.409090909090909,
      "grad_norm": 0.66796875,
      "learning_rate": 6.87966554301513e-05,
      "loss": 0.4443,
      "step": 705
    },
    {
      "epoch": 6.454545454545454,
      "grad_norm": 0.65625,
      "learning_rate": 6.729320366825784e-05,
      "loss": 0.4509,
      "step": 710
    },
    {
      "epoch": 6.5,
      "grad_norm": 0.734375,
      "learning_rate": 6.579798566743314e-05,
      "loss": 0.461,
      "step": 715
    },
    {
      "epoch": 6.545454545454545,
      "grad_norm": 0.66015625,
      "learning_rate": 6.431137784081282e-05,
      "loss": 0.4571,
      "step": 720
    },
    {
      "epoch": 6.590909090909091,
      "grad_norm": 0.63671875,
      "learning_rate": 6.283375443396726e-05,
      "loss": 0.4698,
      "step": 725
    },
    {
      "epoch": 6.636363636363637,
      "grad_norm": 0.62890625,
      "learning_rate": 6.136548743068713e-05,
      "loss": 0.441,
      "step": 730
    },
    {
      "epoch": 6.681818181818182,
      "grad_norm": 0.65625,
      "learning_rate": 5.9906946459338656e-05,
      "loss": 0.4464,
      "step": 735
    },
    {
      "epoch": 6.7272727272727275,
      "grad_norm": 0.71875,
      "learning_rate": 5.845849869981137e-05,
      "loss": 0.4455,
      "step": 740
    },
    {
      "epoch": 6.7727272727272725,
      "grad_norm": 0.8359375,
      "learning_rate": 5.702050879108284e-05,
      "loss": 0.4541,
      "step": 745
    },
    {
      "epoch": 6.818181818181818,
      "grad_norm": 0.69921875,
      "learning_rate": 5.559333873942259e-05,
      "loss": 0.4546,
      "step": 750
    },
    {
      "epoch": 6.863636363636363,
      "grad_norm": 0.66015625,
      "learning_rate": 5.417734782725896e-05,
      "loss": 0.4427,
      "step": 755
    },
    {
      "epoch": 6.909090909090909,
      "grad_norm": 0.703125,
      "learning_rate": 5.277289252273174e-05,
      "loss": 0.4497,
      "step": 760
    },
    {
      "epoch": 6.954545454545455,
      "grad_norm": 0.66796875,
      "learning_rate": 5.138032638995315e-05,
      "loss": 0.4519,
      "step": 765
    },
    {
      "epoch": 7.0,
      "grad_norm": 0.6953125,
      "learning_rate": 5.000000000000002e-05,
      "loss": 0.4474,
      "step": 770
    },
    {
      "epoch": 7.0,
      "eval_loss": 2.838920831680298,
      "eval_runtime": 1.0059,
      "eval_samples_per_second": 4.971,
      "eval_steps_per_second": 1.988,
      "step": 770
    },
    {
      "epoch": 7.045454545454546,
      "grad_norm": 0.66015625,
      "learning_rate": 4.8632260842659393e-05,
      "loss": 0.3915,
      "step": 775
    },
    {
      "epoch": 7.090909090909091,
      "grad_norm": 0.69140625,
      "learning_rate": 4.727745323894976e-05,
      "loss": 0.3938,
      "step": 780
    },
    {
      "epoch": 7.136363636363637,
      "grad_norm": 0.81640625,
      "learning_rate": 4.593591825444028e-05,
      "loss": 0.3943,
      "step": 785
    },
    {
      "epoch": 7.181818181818182,
      "grad_norm": 0.8515625,
      "learning_rate": 4.4607993613388976e-05,
      "loss": 0.3993,
      "step": 790
    },
    {
      "epoch": 7.2272727272727275,
      "grad_norm": 0.7109375,
      "learning_rate": 4.329401361372294e-05,
      "loss": 0.397,
      "step": 795
    },
    {
      "epoch": 7.2727272727272725,
      "grad_norm": 0.609375,
      "learning_rate": 4.19943090428802e-05,
      "loss": 0.3952,
      "step": 800
    },
    {
      "epoch": 7.318181818181818,
      "grad_norm": 0.703125,
      "learning_rate": 4.070920709453597e-05,
      "loss": 0.3992,
      "step": 805
    },
    {
      "epoch": 7.363636363636363,
      "grad_norm": 0.6953125,
      "learning_rate": 3.943903128623335e-05,
      "loss": 0.3869,
      "step": 810
    },
    {
      "epoch": 7.409090909090909,
      "grad_norm": 0.65234375,
      "learning_rate": 3.8184101377939476e-05,
      "loss": 0.3931,
      "step": 815
    },
    {
      "epoch": 7.454545454545454,
      "grad_norm": 0.65625,
      "learning_rate": 3.694473329154778e-05,
      "loss": 0.398,
      "step": 820
    },
    {
      "epoch": 7.5,
      "grad_norm": 0.66015625,
      "learning_rate": 3.5721239031346066e-05,
      "loss": 0.4006,
      "step": 825
    },
    {
      "epoch": 7.545454545454545,
      "grad_norm": 0.6640625,
      "learning_rate": 3.45139266054715e-05,
      "loss": 0.3935,
      "step": 830
    },
    {
      "epoch": 7.590909090909091,
      "grad_norm": 0.74609375,
      "learning_rate": 3.332309994837085e-05,
      "loss": 0.3984,
      "step": 835
    },
    {
      "epoch": 7.636363636363637,
      "grad_norm": 0.6953125,
      "learning_rate": 3.21490588442868e-05,
      "loss": 0.3981,
      "step": 840
    },
    {
      "epoch": 7.681818181818182,
      "grad_norm": 0.640625,
      "learning_rate": 3.099209885178882e-05,
      "loss": 0.3981,
      "step": 845
    },
    {
      "epoch": 7.7272727272727275,
      "grad_norm": 0.65234375,
      "learning_rate": 2.9852511229367865e-05,
      "loss": 0.3978,
      "step": 850
    },
    {
      "epoch": 7.7727272727272725,
      "grad_norm": 0.6796875,
      "learning_rate": 2.8730582862113742e-05,
      "loss": 0.3943,
      "step": 855
    },
    {
      "epoch": 7.818181818181818,
      "grad_norm": 0.6484375,
      "learning_rate": 2.7626596189492983e-05,
      "loss": 0.3913,
      "step": 860
    },
    {
      "epoch": 7.863636363636363,
      "grad_norm": 0.73046875,
      "learning_rate": 2.654082913424668e-05,
      "loss": 0.4045,
      "step": 865
    },
    {
      "epoch": 7.909090909090909,
      "grad_norm": 0.6796875,
      "learning_rate": 2.5473555032424533e-05,
      "loss": 0.3828,
      "step": 870
    },
    {
      "epoch": 7.954545454545455,
      "grad_norm": 0.6953125,
      "learning_rate": 2.4425042564574184e-05,
      "loss": 0.388,
      "step": 875
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.7734375,
      "learning_rate": 2.339555568810221e-05,
      "loss": 0.4044,
      "step": 880
    },
    {
      "epoch": 8.0,
      "eval_loss": 3.102931499481201,
      "eval_runtime": 1.0057,
      "eval_samples_per_second": 4.972,
      "eval_steps_per_second": 1.989,
      "step": 880
    },
    {
      "epoch": 8.045454545454545,
      "grad_norm": 0.59375,
      "learning_rate": 2.2385353570824308e-05,
      "loss": 0.3693,
      "step": 885
    },
    {
      "epoch": 8.090909090909092,
      "grad_norm": 0.65625,
      "learning_rate": 2.139469052572127e-05,
      "loss": 0.3641,
      "step": 890
    },
    {
      "epoch": 8.136363636363637,
      "grad_norm": 0.65234375,
      "learning_rate": 2.042381594691678e-05,
      "loss": 0.3647,
      "step": 895
    },
    {
      "epoch": 8.181818181818182,
      "grad_norm": 0.671875,
      "learning_rate": 1.947297424689414e-05,
      "loss": 0.3619,
      "step": 900
    },
    {
      "epoch": 8.227272727272727,
      "grad_norm": 0.64453125,
      "learning_rate": 1.854240479496643e-05,
      "loss": 0.3566,
      "step": 905
    },
    {
      "epoch": 8.272727272727273,
      "grad_norm": 0.6640625,
      "learning_rate": 1.763234185701673e-05,
      "loss": 0.354,
      "step": 910
    },
    {
      "epoch": 8.318181818181818,
      "grad_norm": 0.6953125,
      "learning_rate": 1.6743014536522873e-05,
      "loss": 0.3703,
      "step": 915
    },
    {
      "epoch": 8.363636363636363,
      "grad_norm": 0.75,
      "learning_rate": 1.587464671688187e-05,
      "loss": 0.3597,
      "step": 920
    },
    {
      "epoch": 8.409090909090908,
      "grad_norm": 0.6875,
      "learning_rate": 1.5027457005048573e-05,
      "loss": 0.3705,
      "step": 925
    },
    {
      "epoch": 8.454545454545455,
      "grad_norm": 0.70703125,
      "learning_rate": 1.4201658676502294e-05,
      "loss": 0.367,
      "step": 930
    },
    {
      "epoch": 8.5,
      "grad_norm": 0.640625,
      "learning_rate": 1.339745962155613e-05,
      "loss": 0.3607,
      "step": 935
    },
    {
      "epoch": 8.545454545454545,
      "grad_norm": 0.6484375,
      "learning_rate": 1.2615062293021507e-05,
      "loss": 0.3633,
      "step": 940
    },
    {
      "epoch": 8.590909090909092,
      "grad_norm": 0.66015625,
      "learning_rate": 1.1854663655241805e-05,
      "loss": 0.3622,
      "step": 945
    },
    {
      "epoch": 8.636363636363637,
      "grad_norm": 0.65625,
      "learning_rate": 1.1116455134507664e-05,
      "loss": 0.3734,
      "step": 950
    },
    {
      "epoch": 8.681818181818182,
      "grad_norm": 0.66015625,
      "learning_rate": 1.0400622570866425e-05,
      "loss": 0.367,
      "step": 955
    },
    {
      "epoch": 8.727272727272727,
      "grad_norm": 0.63671875,
      "learning_rate": 9.707346171337894e-06,
      "loss": 0.3578,
      "step": 960
    },
    {
      "epoch": 8.772727272727273,
      "grad_norm": 0.6640625,
      "learning_rate": 9.036800464548157e-06,
      "loss": 0.364,
      "step": 965
    },
    {
      "epoch": 8.818181818181818,
      "grad_norm": 0.61328125,
      "learning_rate": 8.38915425679304e-06,
      "loss": 0.3558,
      "step": 970
    },
    {
      "epoch": 8.863636363636363,
      "grad_norm": 0.65625,
      "learning_rate": 7.764570589541875e-06,
      "loss": 0.3664,
      "step": 975
    },
    {
      "epoch": 8.909090909090908,
      "grad_norm": 0.61328125,
      "learning_rate": 7.163206698392744e-06,
      "loss": 0.3573,
      "step": 980
    },
    {
      "epoch": 8.954545454545455,
      "grad_norm": 0.6484375,
      "learning_rate": 6.585213973489335e-06,
      "loss": 0.3717,
      "step": 985
    },
    {
      "epoch": 9.0,
      "grad_norm": 0.640625,
      "learning_rate": 6.030737921409169e-06,
      "loss": 0.3573,
      "step": 990
    },
    {
      "epoch": 9.0,
      "eval_loss": 3.357294797897339,
      "eval_runtime": 1.005,
      "eval_samples_per_second": 4.975,
      "eval_steps_per_second": 1.99,
      "step": 990
    },
    {
      "epoch": 9.045454545454545,
      "grad_norm": 0.58984375,
      "learning_rate": 5.499918128533155e-06,
      "loss": 0.3595,
      "step": 995
    },
    {
      "epoch": 9.090909090909092,
      "grad_norm": 0.5859375,
      "learning_rate": 4.992888225905468e-06,
      "loss": 0.359,
      "step": 1000
    },
    {
      "epoch": 9.136363636363637,
      "grad_norm": 0.6328125,
      "learning_rate": 4.509775855592613e-06,
      "loss": 0.3584,
      "step": 1005
    },
    {
      "epoch": 9.181818181818182,
      "grad_norm": 0.6328125,
      "learning_rate": 4.050702638550275e-06,
      "loss": 0.3575,
      "step": 1010
    },
    {
      "epoch": 9.227272727272727,
      "grad_norm": 0.640625,
      "learning_rate": 3.615784144005796e-06,
      "loss": 0.3449,
      "step": 1015
    },
    {
      "epoch": 9.272727272727273,
      "grad_norm": 0.61328125,
      "learning_rate": 3.2051298603643753e-06,
      "loss": 0.354,
      "step": 1020
    },
    {
      "epoch": 9.318181818181818,
      "grad_norm": 0.64453125,
      "learning_rate": 2.818843167645835e-06,
      "loss": 0.3463,
      "step": 1025
    },
    {
      "epoch": 9.363636363636363,
      "grad_norm": 0.65625,
      "learning_rate": 2.4570213114592954e-06,
      "loss": 0.351,
      "step": 1030
    },
    {
      "epoch": 9.409090909090908,
      "grad_norm": 0.6640625,
      "learning_rate": 2.119755378522137e-06,
      "loss": 0.3594,
      "step": 1035
    },
    {
      "epoch": 9.454545454545455,
      "grad_norm": 0.60546875,
      "learning_rate": 1.8071302737293295e-06,
      "loss": 0.3541,
      "step": 1040
    },
    {
      "epoch": 9.5,
      "grad_norm": 0.66796875,
      "learning_rate": 1.5192246987791981e-06,
      "loss": 0.36,
      "step": 1045
    },
    {
      "epoch": 9.545454545454545,
      "grad_norm": 0.609375,
      "learning_rate": 1.2561111323605712e-06,
      "loss": 0.3573,
      "step": 1050
    },
    {
      "epoch": 9.590909090909092,
      "grad_norm": 0.62109375,
      "learning_rate": 1.0178558119067315e-06,
      "loss": 0.3601,
      "step": 1055
    },
    {
      "epoch": 9.636363636363637,
      "grad_norm": 0.6171875,
      "learning_rate": 8.04518716920466e-07,
      "loss": 0.3574,
      "step": 1060
    },
    {
      "epoch": 9.681818181818182,
      "grad_norm": 0.64453125,
      "learning_rate": 6.161535538745878e-07,
      "loss": 0.3664,
      "step": 1065
    },
    {
      "epoch": 9.727272727272727,
      "grad_norm": 0.578125,
      "learning_rate": 4.5280774269154115e-07,
      "loss": 0.3542,
      "step": 1070
    },
    {
      "epoch": 9.772727272727273,
      "grad_norm": 0.62109375,
      "learning_rate": 3.145224048057727e-07,
      "loss": 0.36,
      "step": 1075
    },
    {
      "epoch": 9.818181818181818,
      "grad_norm": 0.63671875,
      "learning_rate": 2.0133235281156736e-07,
      "loss": 0.3565,
      "step": 1080
    },
    {
      "epoch": 9.863636363636363,
      "grad_norm": 0.62109375,
      "learning_rate": 1.1326608169920372e-07,
      "loss": 0.3694,
      "step": 1085
    },
    {
      "epoch": 9.909090909090908,
      "grad_norm": 0.62109375,
      "learning_rate": 5.0345761681491746e-08,
      "loss": 0.354,
      "step": 1090
    },
    {
      "epoch": 9.954545454545455,
      "grad_norm": 0.59765625,
      "learning_rate": 1.2587232612493172e-08,
      "loss": 0.3516,
      "step": 1095
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.5859375,
      "learning_rate": 0.0,
      "loss": 0.354,
      "step": 1100
    },
    {
      "epoch": 10.0,
      "eval_loss": 3.380370616912842,
      "eval_runtime": 1.0228,
      "eval_samples_per_second": 4.888,
      "eval_steps_per_second": 1.955,
      "step": 1100
    },
    {
      "epoch": 10.0,
      "step": 1100,
      "total_flos": 1.6817604900715233e+18,
      "train_loss": 1.8264882094209844,
      "train_runtime": 8654.8771,
      "train_samples_per_second": 2.032,
      "train_steps_per_second": 0.127
    }
  ],
  "logging_steps": 5,
  "max_steps": 1100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.6817604900715233e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}