|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.893337455557273,
  "eval_steps": 200,
  "global_step": 16000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.12,
      "learning_rate": 0.0002981331673926571,
      "loss": 2.3038,
      "step": 200
    },
    {
      "epoch": 0.12,
      "eval_loss": 1.589147925376892,
      "eval_runtime": 2.1684,
      "eval_samples_per_second": 461.163,
      "eval_steps_per_second": 57.645,
      "step": 200
    },
    {
      "epoch": 0.25,
      "learning_rate": 0.00029439950217797134,
      "loss": 1.5398,
      "step": 400
    },
    {
      "epoch": 0.25,
      "eval_loss": 1.4633430242538452,
      "eval_runtime": 2.1071,
      "eval_samples_per_second": 474.58,
      "eval_steps_per_second": 59.323,
      "step": 400
    },
    {
      "epoch": 0.37,
      "learning_rate": 0.00029066583696328563,
      "loss": 1.4615,
      "step": 600
    },
    {
      "epoch": 0.37,
      "eval_loss": 1.4296711683273315,
      "eval_runtime": 2.4812,
      "eval_samples_per_second": 403.032,
      "eval_steps_per_second": 50.379,
      "step": 600
    },
    {
      "epoch": 0.49,
      "learning_rate": 0.0002869321717485998,
      "loss": 1.4244,
      "step": 800
    },
    {
      "epoch": 0.49,
      "eval_loss": 1.3793567419052124,
      "eval_runtime": 2.9698,
      "eval_samples_per_second": 336.72,
      "eval_steps_per_second": 42.09,
      "step": 800
    },
    {
      "epoch": 0.62,
      "learning_rate": 0.0002831985065339141,
      "loss": 1.3921,
      "step": 1000
    },
    {
      "epoch": 0.62,
      "eval_loss": 1.3315461874008179,
      "eval_runtime": 2.7793,
      "eval_samples_per_second": 359.806,
      "eval_steps_per_second": 44.976,
      "step": 1000
    },
    {
      "epoch": 0.74,
      "learning_rate": 0.00027946484131922836,
      "loss": 1.0958,
      "step": 1200
    },
    {
      "epoch": 0.74,
      "eval_loss": 0.7548955082893372,
      "eval_runtime": 2.7656,
      "eval_samples_per_second": 361.592,
      "eval_steps_per_second": 45.199,
      "step": 1200
    },
    {
      "epoch": 0.87,
      "learning_rate": 0.0002757311761045426,
      "loss": 0.6312,
      "step": 1400
    },
    {
      "epoch": 0.87,
      "eval_loss": 0.3990221917629242,
      "eval_runtime": 2.1502,
      "eval_samples_per_second": 465.071,
      "eval_steps_per_second": 58.134,
      "step": 1400
    },
    {
      "epoch": 0.99,
      "learning_rate": 0.00027199751088985685,
      "loss": 0.4093,
      "step": 1600
    },
    {
      "epoch": 0.99,
      "eval_loss": 0.26113563776016235,
      "eval_runtime": 3.5103,
      "eval_samples_per_second": 284.875,
      "eval_steps_per_second": 35.609,
      "step": 1600
    },
    {
      "epoch": 1.11,
      "learning_rate": 0.00026826384567517114,
      "loss": 0.2959,
      "step": 1800
    },
    {
      "epoch": 1.11,
      "eval_loss": 0.2783801257610321,
      "eval_runtime": 2.8441,
      "eval_samples_per_second": 351.603,
      "eval_steps_per_second": 43.95,
      "step": 1800
    },
    {
      "epoch": 1.24,
      "learning_rate": 0.00026453018046048533,
      "loss": 0.2589,
      "step": 2000
    },
    {
      "epoch": 1.24,
      "eval_loss": 0.20705343782901764,
      "eval_runtime": 2.7524,
      "eval_samples_per_second": 363.322,
      "eval_steps_per_second": 45.415,
      "step": 2000
    },
    {
      "epoch": 1.36,
      "learning_rate": 0.0002607965152457996,
      "loss": 0.2246,
      "step": 2200
    },
    {
      "epoch": 1.36,
      "eval_loss": 0.15551678836345673,
      "eval_runtime": 2.15,
      "eval_samples_per_second": 465.117,
      "eval_steps_per_second": 58.14,
      "step": 2200
    },
    {
      "epoch": 1.48,
      "learning_rate": 0.00025706285003111387,
      "loss": 0.1991,
      "step": 2400
    },
    {
      "epoch": 1.48,
      "eval_loss": 0.15825262665748596,
      "eval_runtime": 2.1344,
      "eval_samples_per_second": 468.515,
      "eval_steps_per_second": 58.564,
      "step": 2400
    },
    {
      "epoch": 1.61,
      "learning_rate": 0.0002533291848164281,
      "loss": 0.1784,
      "step": 2600
    },
    {
      "epoch": 1.61,
      "eval_loss": 0.12008943408727646,
      "eval_runtime": 2.1414,
      "eval_samples_per_second": 466.985,
      "eval_steps_per_second": 58.373,
      "step": 2600
    },
    {
      "epoch": 1.73,
      "learning_rate": 0.00024959551960174235,
      "loss": 0.1598,
      "step": 2800
    },
    {
      "epoch": 1.73,
      "eval_loss": 0.12511701881885529,
      "eval_runtime": 2.55,
      "eval_samples_per_second": 392.155,
      "eval_steps_per_second": 49.019,
      "step": 2800
    },
    {
      "epoch": 1.86,
      "learning_rate": 0.0002458618543870566,
      "loss": 0.164,
      "step": 3000
    },
    {
      "epoch": 1.86,
      "eval_loss": 0.11049681156873703,
      "eval_runtime": 2.9765,
      "eval_samples_per_second": 335.97,
      "eval_steps_per_second": 41.996,
      "step": 3000
    },
    {
      "epoch": 1.98,
      "learning_rate": 0.00024212818917237084,
      "loss": 0.1475,
      "step": 3200
    },
    {
      "epoch": 1.98,
      "eval_loss": 0.0954003781080246,
      "eval_runtime": 2.8437,
      "eval_samples_per_second": 351.659,
      "eval_steps_per_second": 43.957,
      "step": 3200
    },
    {
      "epoch": 2.1,
      "learning_rate": 0.0002383945239576851,
      "loss": 0.1388,
      "step": 3400
    },
    {
      "epoch": 2.1,
      "eval_loss": 0.10116879642009735,
      "eval_runtime": 2.7628,
      "eval_samples_per_second": 361.954,
      "eval_steps_per_second": 45.244,
      "step": 3400
    },
    {
      "epoch": 2.23,
      "learning_rate": 0.00023466085874299935,
      "loss": 0.1346,
      "step": 3600
    },
    {
      "epoch": 2.23,
      "eval_loss": 0.10693109035491943,
      "eval_runtime": 3.1697,
      "eval_samples_per_second": 315.491,
      "eval_steps_per_second": 39.436,
      "step": 3600
    },
    {
      "epoch": 2.35,
      "learning_rate": 0.00023092719352831362,
      "loss": 0.1232,
      "step": 3800
    },
    {
      "epoch": 2.35,
      "eval_loss": 0.09901304543018341,
      "eval_runtime": 2.1178,
      "eval_samples_per_second": 472.183,
      "eval_steps_per_second": 59.023,
      "step": 3800
    },
    {
      "epoch": 2.47,
      "learning_rate": 0.00022719352831362786,
      "loss": 0.1187,
      "step": 4000
    },
    {
      "epoch": 2.47,
      "eval_loss": 0.11418598890304565,
      "eval_runtime": 2.1348,
      "eval_samples_per_second": 468.423,
      "eval_steps_per_second": 58.553,
      "step": 4000
    },
    {
      "epoch": 2.6,
      "learning_rate": 0.0002234598630989421,
      "loss": 0.1133,
      "step": 4200
    },
    {
      "epoch": 2.6,
      "eval_loss": 0.0984039306640625,
      "eval_runtime": 2.1382,
      "eval_samples_per_second": 467.676,
      "eval_steps_per_second": 58.459,
      "step": 4200
    },
    {
      "epoch": 2.72,
      "learning_rate": 0.00021972619788425635,
      "loss": 0.1088,
      "step": 4400
    },
    {
      "epoch": 2.72,
      "eval_loss": 0.07466612011194229,
      "eval_runtime": 2.8862,
      "eval_samples_per_second": 346.477,
      "eval_steps_per_second": 43.31,
      "step": 4400
    },
    {
      "epoch": 2.84,
      "learning_rate": 0.00021599253266957062,
      "loss": 0.1025,
      "step": 4600
    },
    {
      "epoch": 2.84,
      "eval_loss": 0.1227998435497284,
      "eval_runtime": 2.8738,
      "eval_samples_per_second": 347.966,
      "eval_steps_per_second": 43.496,
      "step": 4600
    },
    {
      "epoch": 2.97,
      "learning_rate": 0.00021225886745488486,
      "loss": 0.0971,
      "step": 4800
    },
    {
      "epoch": 2.97,
      "eval_loss": 0.07324225455522537,
      "eval_runtime": 2.2831,
      "eval_samples_per_second": 437.994,
      "eval_steps_per_second": 54.749,
      "step": 4800
    },
    {
      "epoch": 3.09,
      "learning_rate": 0.00020852520224019913,
      "loss": 0.0853,
      "step": 5000
    },
    {
      "epoch": 3.09,
      "eval_loss": 0.07788190990686417,
      "eval_runtime": 2.1358,
      "eval_samples_per_second": 468.199,
      "eval_steps_per_second": 58.525,
      "step": 5000
    },
    {
      "epoch": 3.22,
      "learning_rate": 0.00020479153702551337,
      "loss": 0.0865,
      "step": 5200
    },
    {
      "epoch": 3.22,
      "eval_loss": 0.06575259566307068,
      "eval_runtime": 2.1474,
      "eval_samples_per_second": 465.679,
      "eval_steps_per_second": 58.21,
      "step": 5200
    },
    {
      "epoch": 3.34,
      "learning_rate": 0.0002010578718108276,
      "loss": 0.0768,
      "step": 5400
    },
    {
      "epoch": 3.34,
      "eval_loss": 0.08183684200048447,
      "eval_runtime": 2.1211,
      "eval_samples_per_second": 471.453,
      "eval_steps_per_second": 58.932,
      "step": 5400
    },
    {
      "epoch": 3.46,
      "learning_rate": 0.00019732420659614186,
      "loss": 0.0738,
      "step": 5600
    },
    {
      "epoch": 3.46,
      "eval_loss": 0.04662672430276871,
      "eval_runtime": 2.7913,
      "eval_samples_per_second": 358.253,
      "eval_steps_per_second": 44.782,
      "step": 5600
    },
    {
      "epoch": 3.59,
      "learning_rate": 0.0001935905413814561,
      "loss": 0.0622,
      "step": 5800
    },
    {
      "epoch": 3.59,
      "eval_loss": 0.0433196946978569,
      "eval_runtime": 3.1597,
      "eval_samples_per_second": 316.49,
      "eval_steps_per_second": 39.561,
      "step": 5800
    },
    {
      "epoch": 3.71,
      "learning_rate": 0.00018985687616677037,
      "loss": 0.0671,
      "step": 6000
    },
    {
      "epoch": 3.71,
      "eval_loss": 0.038382936269044876,
      "eval_runtime": 2.1009,
      "eval_samples_per_second": 475.976,
      "eval_steps_per_second": 59.497,
      "step": 6000
    },
    {
      "epoch": 3.83,
      "learning_rate": 0.0001861232109520846,
      "loss": 0.0545,
      "step": 6200
    },
    {
      "epoch": 3.83,
      "eval_loss": 0.04082392156124115,
      "eval_runtime": 2.1346,
      "eval_samples_per_second": 468.481,
      "eval_steps_per_second": 58.56,
      "step": 6200
    },
    {
      "epoch": 3.96,
      "learning_rate": 0.00018238954573739888,
      "loss": 0.0564,
      "step": 6400
    },
    {
      "epoch": 3.96,
      "eval_loss": 0.043197453022003174,
      "eval_runtime": 2.1169,
      "eval_samples_per_second": 472.389,
      "eval_steps_per_second": 59.049,
      "step": 6400
    },
    {
      "epoch": 4.08,
      "learning_rate": 0.0001786558805227131,
      "loss": 0.0523,
      "step": 6600
    },
    {
      "epoch": 4.08,
      "eval_loss": 0.03342806547880173,
      "eval_runtime": 2.4926,
      "eval_samples_per_second": 401.182,
      "eval_steps_per_second": 50.148,
      "step": 6600
    },
    {
      "epoch": 4.2,
      "learning_rate": 0.00017492221530802736,
      "loss": 0.0456,
      "step": 6800
    },
    {
      "epoch": 4.2,
      "eval_loss": 0.02744474820792675,
      "eval_runtime": 3.0,
      "eval_samples_per_second": 333.335,
      "eval_steps_per_second": 41.667,
      "step": 6800
    },
    {
      "epoch": 4.33,
      "learning_rate": 0.0001711885500933416,
      "loss": 0.0442,
      "step": 7000
    },
    {
      "epoch": 4.33,
      "eval_loss": 0.024560416117310524,
      "eval_runtime": 2.6752,
      "eval_samples_per_second": 373.806,
      "eval_steps_per_second": 46.726,
      "step": 7000
    },
    {
      "epoch": 4.45,
      "learning_rate": 0.00016745488487865588,
      "loss": 0.0383,
      "step": 7200
    },
    {
      "epoch": 4.45,
      "eval_loss": 0.018605533987283707,
      "eval_runtime": 2.1117,
      "eval_samples_per_second": 473.559,
      "eval_steps_per_second": 59.195,
      "step": 7200
    },
    {
      "epoch": 4.58,
      "learning_rate": 0.00016372121966397012,
      "loss": 0.0348,
      "step": 7400
    },
    {
      "epoch": 4.58,
      "eval_loss": 0.01473915483802557,
      "eval_runtime": 2.1223,
      "eval_samples_per_second": 471.193,
      "eval_steps_per_second": 58.899,
      "step": 7400
    },
    {
      "epoch": 4.7,
      "learning_rate": 0.0001599875544492844,
      "loss": 0.0299,
      "step": 7600
    },
    {
      "epoch": 4.7,
      "eval_loss": 0.025838036090135574,
      "eval_runtime": 2.1138,
      "eval_samples_per_second": 473.088,
      "eval_steps_per_second": 59.136,
      "step": 7600
    },
    {
      "epoch": 4.82,
      "learning_rate": 0.0001562538892345986,
      "loss": 0.0268,
      "step": 7800
    },
    {
      "epoch": 4.82,
      "eval_loss": 0.01688736118376255,
      "eval_runtime": 2.1658,
      "eval_samples_per_second": 461.718,
      "eval_steps_per_second": 57.715,
      "step": 7800
    },
    {
      "epoch": 4.95,
      "learning_rate": 0.00015252022401991287,
      "loss": 0.0272,
      "step": 8000
    },
    {
      "epoch": 4.95,
      "eval_loss": 0.020514091476798058,
      "eval_runtime": 2.1415,
      "eval_samples_per_second": 466.966,
      "eval_steps_per_second": 58.371,
      "step": 8000
    },
    {
      "epoch": 5.07,
      "learning_rate": 0.00014878655880522712,
      "loss": 0.0277,
      "step": 8200
    },
    {
      "epoch": 5.07,
      "eval_loss": 0.018993763253092766,
      "eval_runtime": 2.3074,
      "eval_samples_per_second": 433.383,
      "eval_steps_per_second": 54.173,
      "step": 8200
    },
    {
      "epoch": 5.19,
      "learning_rate": 0.00014505289359054139,
      "loss": 0.0253,
      "step": 8400
    },
    {
      "epoch": 5.19,
      "eval_loss": 0.0132982786744833,
      "eval_runtime": 2.7723,
      "eval_samples_per_second": 360.706,
      "eval_steps_per_second": 45.088,
      "step": 8400
    },
    {
      "epoch": 5.32,
      "learning_rate": 0.00014131922837585563,
      "loss": 0.0208,
      "step": 8600
    },
    {
      "epoch": 5.32,
      "eval_loss": 0.011603164486587048,
      "eval_runtime": 2.2147,
      "eval_samples_per_second": 451.518,
      "eval_steps_per_second": 56.44,
      "step": 8600
    },
    {
      "epoch": 5.44,
      "learning_rate": 0.00013758556316116987,
      "loss": 0.019,
      "step": 8800
    },
    {
      "epoch": 5.44,
      "eval_loss": 0.007933158427476883,
      "eval_runtime": 2.565,
      "eval_samples_per_second": 389.858,
      "eval_steps_per_second": 48.732,
      "step": 8800
    },
    {
      "epoch": 5.57,
      "learning_rate": 0.00013385189794648414,
      "loss": 0.0179,
      "step": 9000
    },
    {
      "epoch": 5.57,
      "eval_loss": 0.00808796752244234,
      "eval_runtime": 2.157,
      "eval_samples_per_second": 463.605,
      "eval_steps_per_second": 57.951,
      "step": 9000
    },
    {
      "epoch": 5.69,
      "learning_rate": 0.00013011823273179835,
      "loss": 0.0136,
      "step": 9200
    },
    {
      "epoch": 5.69,
      "eval_loss": 0.02137412503361702,
      "eval_runtime": 2.1642,
      "eval_samples_per_second": 462.06,
      "eval_steps_per_second": 57.758,
      "step": 9200
    },
    {
      "epoch": 5.81,
      "learning_rate": 0.00012638456751711262,
      "loss": 0.0196,
      "step": 9400
    },
    {
      "epoch": 5.81,
      "eval_loss": 0.009271830320358276,
      "eval_runtime": 2.7483,
      "eval_samples_per_second": 363.865,
      "eval_steps_per_second": 45.483,
      "step": 9400
    },
    {
      "epoch": 5.94,
      "learning_rate": 0.00012265090230242687,
      "loss": 0.015,
      "step": 9600
    },
    {
      "epoch": 5.94,
      "eval_loss": 0.011388062499463558,
      "eval_runtime": 3.1063,
      "eval_samples_per_second": 321.931,
      "eval_steps_per_second": 40.241,
      "step": 9600
    },
    {
      "epoch": 6.06,
      "learning_rate": 0.00011891723708774112,
      "loss": 0.0196,
      "step": 9800
    },
    {
      "epoch": 6.06,
      "eval_loss": 0.009324445389211178,
      "eval_runtime": 2.9695,
      "eval_samples_per_second": 336.759,
      "eval_steps_per_second": 42.095,
      "step": 9800
    },
    {
      "epoch": 6.18,
      "learning_rate": 0.00011518357187305538,
      "loss": 0.0192,
      "step": 10000
    },
    {
      "epoch": 6.18,
      "eval_loss": 0.008494062349200249,
      "eval_runtime": 2.1785,
      "eval_samples_per_second": 459.035,
      "eval_steps_per_second": 57.379,
      "step": 10000
    },
    {
      "epoch": 6.31,
      "learning_rate": 0.00011144990665836963,
      "loss": 0.0155,
      "step": 10200
    },
    {
      "epoch": 6.31,
      "eval_loss": 0.005131287965923548,
      "eval_runtime": 2.2151,
      "eval_samples_per_second": 451.441,
      "eval_steps_per_second": 56.43,
      "step": 10200
    },
    {
      "epoch": 6.43,
      "learning_rate": 0.00010771624144368388,
      "loss": 0.0182,
      "step": 10400
    },
    {
      "epoch": 6.43,
      "eval_loss": 0.01033452432602644,
      "eval_runtime": 2.204,
      "eval_samples_per_second": 453.729,
      "eval_steps_per_second": 56.716,
      "step": 10400
    },
    {
      "epoch": 6.55,
      "learning_rate": 0.00010398257622899813,
      "loss": 0.0149,
      "step": 10600
    },
    {
      "epoch": 6.55,
      "eval_loss": 0.006081216037273407,
      "eval_runtime": 2.6138,
      "eval_samples_per_second": 382.582,
      "eval_steps_per_second": 47.823,
      "step": 10600
    },
    {
      "epoch": 6.68,
      "learning_rate": 0.00010024891101431236,
      "loss": 0.0155,
      "step": 10800
    },
    {
      "epoch": 6.68,
      "eval_loss": 0.008235114626586437,
      "eval_runtime": 2.9799,
      "eval_samples_per_second": 335.587,
      "eval_steps_per_second": 41.948,
      "step": 10800
    },
    {
      "epoch": 6.8,
      "learning_rate": 9.651524579962662e-05,
      "loss": 0.0125,
      "step": 11000
    },
    {
      "epoch": 6.8,
      "eval_loss": 0.0061024767346680164,
      "eval_runtime": 3.1763,
      "eval_samples_per_second": 314.832,
      "eval_steps_per_second": 39.354,
      "step": 11000
    },
    {
      "epoch": 6.93,
      "learning_rate": 9.278158058494087e-05,
      "loss": 0.0126,
      "step": 11200
    },
    {
      "epoch": 6.93,
      "eval_loss": 0.0077368393540382385,
      "eval_runtime": 2.1677,
      "eval_samples_per_second": 461.31,
      "eval_steps_per_second": 57.664,
      "step": 11200
    },
    {
      "epoch": 7.05,
      "learning_rate": 8.904791537025512e-05,
      "loss": 0.016,
      "step": 11400
    },
    {
      "epoch": 7.05,
      "eval_loss": 0.01462015975266695,
      "eval_runtime": 2.163,
      "eval_samples_per_second": 462.313,
      "eval_steps_per_second": 57.789,
      "step": 11400
    },
    {
      "epoch": 7.17,
      "learning_rate": 8.531425015556937e-05,
      "loss": 0.0168,
      "step": 11600
    },
    {
      "epoch": 7.17,
      "eval_loss": 0.013114248402416706,
      "eval_runtime": 2.177,
      "eval_samples_per_second": 459.355,
      "eval_steps_per_second": 57.419,
      "step": 11600
    },
    {
      "epoch": 7.3,
      "learning_rate": 8.158058494088363e-05,
      "loss": 0.0115,
      "step": 11800
    },
    {
      "epoch": 7.3,
      "eval_loss": 0.0058467877097427845,
      "eval_runtime": 2.8432,
      "eval_samples_per_second": 351.72,
      "eval_steps_per_second": 43.965,
      "step": 11800
    },
    {
      "epoch": 7.42,
      "learning_rate": 7.784691972619787e-05,
      "loss": 0.0109,
      "step": 12000
    },
    {
      "epoch": 7.42,
      "eval_loss": 0.007328983396291733,
      "eval_runtime": 2.9781,
      "eval_samples_per_second": 335.785,
      "eval_steps_per_second": 41.973,
      "step": 12000
    },
    {
      "epoch": 7.54,
      "learning_rate": 7.411325451151213e-05,
      "loss": 0.01,
      "step": 12200
    },
    {
      "epoch": 7.54,
      "eval_loss": 0.00543447770178318,
      "eval_runtime": 2.13,
      "eval_samples_per_second": 469.492,
      "eval_steps_per_second": 58.686,
      "step": 12200
    },
    {
      "epoch": 7.67,
      "learning_rate": 7.037958929682637e-05,
      "loss": 0.0085,
      "step": 12400
    },
    {
      "epoch": 7.67,
      "eval_loss": 0.005294375587254763,
      "eval_runtime": 2.1484,
      "eval_samples_per_second": 465.459,
      "eval_steps_per_second": 58.182,
      "step": 12400
    },
    {
      "epoch": 7.79,
      "learning_rate": 6.664592408214062e-05,
      "loss": 0.0105,
      "step": 12600
    },
    {
      "epoch": 7.79,
      "eval_loss": 0.0051603252068161964,
      "eval_runtime": 2.1621,
      "eval_samples_per_second": 462.523,
      "eval_steps_per_second": 57.815,
      "step": 12600
    },
    {
      "epoch": 7.91,
      "learning_rate": 6.291225886745488e-05,
      "loss": 0.01,
      "step": 12800
    },
    {
      "epoch": 7.91,
      "eval_loss": 0.005722519941627979,
      "eval_runtime": 2.7684,
      "eval_samples_per_second": 361.216,
      "eval_steps_per_second": 45.152,
      "step": 12800
    },
    {
      "epoch": 8.04,
      "learning_rate": 5.917859365276913e-05,
      "loss": 0.0071,
      "step": 13000
    },
    {
      "epoch": 8.04,
      "eval_loss": 0.004564732778817415,
      "eval_runtime": 2.7551,
      "eval_samples_per_second": 362.961,
      "eval_steps_per_second": 45.37,
      "step": 13000
    },
    {
      "epoch": 8.16,
      "learning_rate": 5.5444928438083385e-05,
      "loss": 0.0065,
      "step": 13200
    },
    {
      "epoch": 8.16,
      "eval_loss": 0.004461783915758133,
      "eval_runtime": 3.1705,
      "eval_samples_per_second": 315.412,
      "eval_steps_per_second": 39.426,
      "step": 13200
    },
    {
      "epoch": 8.29,
      "learning_rate": 5.171126322339763e-05,
      "loss": 0.0075,
      "step": 13400
    },
    {
      "epoch": 8.29,
      "eval_loss": 0.004132562782615423,
      "eval_runtime": 3.5027,
      "eval_samples_per_second": 285.498,
      "eval_steps_per_second": 35.687,
      "step": 13400
    },
    {
      "epoch": 8.41,
      "learning_rate": 4.797759800871188e-05,
      "loss": 0.0072,
      "step": 13600
    },
    {
      "epoch": 8.41,
      "eval_loss": 0.004298557061702013,
      "eval_runtime": 2.1516,
      "eval_samples_per_second": 464.775,
      "eval_steps_per_second": 58.097,
      "step": 13600
    },
    {
      "epoch": 8.53,
      "learning_rate": 4.424393279402613e-05,
      "loss": 0.0077,
      "step": 13800
    },
    {
      "epoch": 8.53,
      "eval_loss": 0.005747557617723942,
      "eval_runtime": 2.1174,
      "eval_samples_per_second": 472.272,
      "eval_steps_per_second": 59.034,
      "step": 13800
    },
    {
      "epoch": 8.66,
      "learning_rate": 4.051026757934038e-05,
      "loss": 0.009,
      "step": 14000
    },
    {
      "epoch": 8.66,
      "eval_loss": 0.005076244939118624,
      "eval_runtime": 2.1715,
      "eval_samples_per_second": 460.514,
      "eval_steps_per_second": 57.564,
      "step": 14000
    },
    {
      "epoch": 8.78,
      "learning_rate": 3.677660236465463e-05,
      "loss": 0.0066,
      "step": 14200
    },
    {
      "epoch": 8.78,
      "eval_loss": 0.004328867886215448,
      "eval_runtime": 2.1457,
      "eval_samples_per_second": 466.038,
      "eval_steps_per_second": 58.255,
      "step": 14200
    },
    {
      "epoch": 8.9,
      "learning_rate": 3.304293714996888e-05,
      "loss": 0.0065,
      "step": 14400
    },
    {
      "epoch": 8.9,
      "eval_loss": 0.004579309374094009,
      "eval_runtime": 2.5023,
      "eval_samples_per_second": 399.626,
      "eval_steps_per_second": 49.953,
      "step": 14400
    },
    {
      "epoch": 9.03,
      "learning_rate": 2.9309271935283136e-05,
      "loss": 0.0047,
      "step": 14600
    },
    {
      "epoch": 9.03,
      "eval_loss": 0.00406376738101244,
      "eval_runtime": 3.0193,
      "eval_samples_per_second": 331.204,
      "eval_steps_per_second": 41.401,
      "step": 14600
    },
    {
      "epoch": 9.15,
      "learning_rate": 2.5575606720597382e-05,
      "loss": 0.0049,
      "step": 14800
    },
    {
      "epoch": 9.15,
      "eval_loss": 0.0037133253645151854,
      "eval_runtime": 2.5419,
      "eval_samples_per_second": 393.406,
      "eval_steps_per_second": 49.176,
      "step": 14800
    },
    {
      "epoch": 9.28,
      "learning_rate": 2.1841941505911635e-05,
      "loss": 0.0048,
      "step": 15000
    },
    {
      "epoch": 9.28,
      "eval_loss": 0.0035180081613361835,
      "eval_runtime": 2.1535,
      "eval_samples_per_second": 464.362,
      "eval_steps_per_second": 58.045,
      "step": 15000
    },
    {
      "epoch": 9.4,
      "learning_rate": 1.8108276291225884e-05,
      "loss": 0.0045,
      "step": 15200
    },
    {
      "epoch": 9.4,
      "eval_loss": 0.0041992985643446445,
      "eval_runtime": 2.1652,
      "eval_samples_per_second": 461.858,
      "eval_steps_per_second": 57.732,
      "step": 15200
    },
    {
      "epoch": 9.52,
      "learning_rate": 1.4374611076540135e-05,
      "loss": 0.0041,
      "step": 15400
    },
    {
      "epoch": 9.52,
      "eval_loss": 0.003915323410183191,
      "eval_runtime": 2.7057,
      "eval_samples_per_second": 369.59,
      "eval_steps_per_second": 46.199,
      "step": 15400
    },
    {
      "epoch": 9.65,
      "learning_rate": 1.0640945861854385e-05,
      "loss": 0.0042,
      "step": 15600
    },
    {
      "epoch": 9.65,
      "eval_loss": 0.0032798268366605043,
      "eval_runtime": 3.0263,
      "eval_samples_per_second": 330.438,
      "eval_steps_per_second": 41.305,
      "step": 15600
    },
    {
      "epoch": 9.77,
      "learning_rate": 6.907280647168636e-06,
      "loss": 0.0041,
      "step": 15800
    },
    {
      "epoch": 9.77,
      "eval_loss": 0.003197046695277095,
      "eval_runtime": 2.2279,
      "eval_samples_per_second": 448.855,
      "eval_steps_per_second": 56.107,
      "step": 15800
    },
    {
      "epoch": 9.89,
      "learning_rate": 3.173615432482887e-06,
      "loss": 0.0039,
      "step": 16000
    },
    {
      "epoch": 9.89,
      "eval_loss": 0.003054018598049879,
      "eval_runtime": 2.2116,
      "eval_samples_per_second": 452.155,
      "eval_steps_per_second": 56.519,
      "step": 16000
    }
  ],
  "logging_steps": 200,
  "max_steps": 16170,
  "num_train_epochs": 10,
  "save_steps": 200,
  "total_flos": 6146864391499776.0,
  "trial_name": null,
  "trial_params": null
}
|
|