{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.42533081285444235,
  "eval_steps": 25,
  "global_step": 1125,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00945179584120983,
      "grad_norm": 0.48126843571662903,
      "learning_rate": 0.0002,
      "loss": 1.4186,
      "step": 25
    },
    {
      "epoch": 0.00945179584120983,
      "eval_loss": 1.2822998762130737,
      "eval_runtime": 1560.6226,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 25
    },
    {
      "epoch": 0.01890359168241966,
      "grad_norm": 0.8693311810493469,
      "learning_rate": 0.0002,
      "loss": 1.2478,
      "step": 50
    },
    {
      "epoch": 0.01890359168241966,
      "eval_loss": 1.261049747467041,
      "eval_runtime": 1561.6033,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 50
    },
    {
      "epoch": 0.02835538752362949,
      "grad_norm": 0.4594016969203949,
      "learning_rate": 0.0002,
      "loss": 1.1961,
      "step": 75
    },
    {
      "epoch": 0.02835538752362949,
      "eval_loss": 1.2359907627105713,
      "eval_runtime": 1561.4875,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 75
    },
    {
      "epoch": 0.03780718336483932,
      "grad_norm": 0.7460442185401917,
      "learning_rate": 0.0002,
      "loss": 1.245,
      "step": 100
    },
    {
      "epoch": 0.03780718336483932,
      "eval_loss": 1.2357805967330933,
      "eval_runtime": 1561.6317,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 100
    },
    {
      "epoch": 0.04725897920604915,
      "grad_norm": 0.37976986169815063,
      "learning_rate": 0.0002,
      "loss": 1.2213,
      "step": 125
    },
    {
      "epoch": 0.04725897920604915,
      "eval_loss": 1.2154258489608765,
      "eval_runtime": 1561.4032,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 125
    },
    {
      "epoch": 0.05671077504725898,
      "grad_norm": 0.6762637495994568,
      "learning_rate": 0.0002,
      "loss": 1.199,
      "step": 150
    },
    {
      "epoch": 0.05671077504725898,
      "eval_loss": 1.2192034721374512,
      "eval_runtime": 1561.5162,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 150
    },
    {
      "epoch": 0.0661625708884688,
      "grad_norm": 0.3414202034473419,
      "learning_rate": 0.0002,
      "loss": 1.1825,
      "step": 175
    },
    {
      "epoch": 0.0661625708884688,
      "eval_loss": 1.199916124343872,
      "eval_runtime": 1561.7104,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 175
    },
    {
      "epoch": 0.07561436672967864,
      "grad_norm": 0.8801635503768921,
      "learning_rate": 0.0002,
      "loss": 1.1358,
      "step": 200
    },
    {
      "epoch": 0.07561436672967864,
      "eval_loss": 1.201659083366394,
      "eval_runtime": 1561.6394,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 200
    },
    {
      "epoch": 0.08506616257088846,
      "grad_norm": 0.31596821546554565,
      "learning_rate": 0.0002,
      "loss": 1.2173,
      "step": 225
    },
    {
      "epoch": 0.08506616257088846,
      "eval_loss": 1.18569016456604,
      "eval_runtime": 1561.8408,
      "eval_samples_per_second": 0.846,
      "eval_steps_per_second": 0.212,
      "step": 225
    },
    {
      "epoch": 0.0945179584120983,
      "grad_norm": 0.9426243305206299,
      "learning_rate": 0.0002,
      "loss": 1.1652,
      "step": 250
    },
    {
      "epoch": 0.0945179584120983,
      "eval_loss": 1.1847585439682007,
      "eval_runtime": 1561.46,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 250
    },
    {
      "epoch": 0.10396975425330812,
      "grad_norm": 0.3340831398963928,
      "learning_rate": 0.0002,
      "loss": 1.1563,
      "step": 275
    },
    {
      "epoch": 0.10396975425330812,
      "eval_loss": 1.1764663457870483,
      "eval_runtime": 1563.551,
      "eval_samples_per_second": 0.846,
      "eval_steps_per_second": 0.212,
      "step": 275
    },
    {
      "epoch": 0.11342155009451796,
      "grad_norm": 1.1844408512115479,
      "learning_rate": 0.0002,
      "loss": 1.1976,
      "step": 300
    },
    {
      "epoch": 0.11342155009451796,
      "eval_loss": 1.182220697402954,
      "eval_runtime": 1562.1264,
      "eval_samples_per_second": 0.846,
      "eval_steps_per_second": 0.212,
      "step": 300
    },
    {
      "epoch": 0.12287334593572778,
      "grad_norm": 0.35529959201812744,
      "learning_rate": 0.0002,
      "loss": 1.197,
      "step": 325
    },
    {
      "epoch": 0.12287334593572778,
      "eval_loss": 1.170316219329834,
      "eval_runtime": 1561.289,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 325
    },
    {
      "epoch": 0.1323251417769376,
      "grad_norm": 0.644234836101532,
      "learning_rate": 0.0002,
      "loss": 1.1317,
      "step": 350
    },
    {
      "epoch": 0.1323251417769376,
      "eval_loss": 1.173732876777649,
      "eval_runtime": 1561.2179,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 350
    },
    {
      "epoch": 0.14177693761814744,
      "grad_norm": 0.38344722986221313,
      "learning_rate": 0.0002,
      "loss": 1.2229,
      "step": 375
    },
    {
      "epoch": 0.14177693761814744,
      "eval_loss": 1.1632750034332275,
      "eval_runtime": 1562.0523,
      "eval_samples_per_second": 0.846,
      "eval_steps_per_second": 0.212,
      "step": 375
    },
    {
      "epoch": 0.15122873345935728,
      "grad_norm": 0.709377646446228,
      "learning_rate": 0.0002,
      "loss": 1.1853,
      "step": 400
    },
    {
      "epoch": 0.15122873345935728,
      "eval_loss": 1.1692676544189453,
      "eval_runtime": 1561.2568,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 400
    },
    {
      "epoch": 0.16068052930056712,
      "grad_norm": 0.34974658489227295,
      "learning_rate": 0.0002,
      "loss": 1.1479,
      "step": 425
    },
    {
      "epoch": 0.16068052930056712,
      "eval_loss": 1.1600748300552368,
      "eval_runtime": 1562.1908,
      "eval_samples_per_second": 0.846,
      "eval_steps_per_second": 0.212,
      "step": 425
    },
    {
      "epoch": 0.17013232514177692,
      "grad_norm": 0.8809393644332886,
      "learning_rate": 0.0002,
      "loss": 1.1047,
      "step": 450
    },
    {
      "epoch": 0.17013232514177692,
      "eval_loss": 1.1649720668792725,
      "eval_runtime": 1563.0297,
      "eval_samples_per_second": 0.846,
      "eval_steps_per_second": 0.212,
      "step": 450
    },
    {
      "epoch": 0.17958412098298676,
      "grad_norm": 0.319968581199646,
      "learning_rate": 0.0002,
      "loss": 1.1477,
      "step": 475
    },
    {
      "epoch": 0.17958412098298676,
      "eval_loss": 1.1558725833892822,
      "eval_runtime": 1561.2187,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 475
    },
    {
      "epoch": 0.1890359168241966,
      "grad_norm": 0.7769630551338196,
      "learning_rate": 0.0002,
      "loss": 1.1831,
      "step": 500
    },
    {
      "epoch": 0.1890359168241966,
      "eval_loss": 1.162941336631775,
      "eval_runtime": 1561.4384,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 500
    },
    {
      "epoch": 0.19848771266540643,
      "grad_norm": 0.3040992319583893,
      "learning_rate": 0.0002,
      "loss": 1.134,
      "step": 525
    },
    {
      "epoch": 0.19848771266540643,
      "eval_loss": 1.153849720954895,
      "eval_runtime": 1561.176,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 525
    },
    {
      "epoch": 0.20793950850661624,
      "grad_norm": 0.656995415687561,
      "learning_rate": 0.0002,
      "loss": 1.1366,
      "step": 550
    },
    {
      "epoch": 0.20793950850661624,
      "eval_loss": 1.156500220298767,
      "eval_runtime": 1561.228,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 550
    },
    {
      "epoch": 0.21739130434782608,
      "grad_norm": 0.32160601019859314,
      "learning_rate": 0.0002,
      "loss": 1.1581,
      "step": 575
    },
    {
      "epoch": 0.21739130434782608,
      "eval_loss": 1.1488285064697266,
      "eval_runtime": 1561.286,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 575
    },
    {
      "epoch": 0.22684310018903592,
      "grad_norm": 0.5169605016708374,
      "learning_rate": 0.0002,
      "loss": 1.1179,
      "step": 600
    },
    {
      "epoch": 0.22684310018903592,
      "eval_loss": 1.1587059497833252,
      "eval_runtime": 1561.443,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 600
    },
    {
      "epoch": 0.23629489603024575,
      "grad_norm": 0.3807673156261444,
      "learning_rate": 0.0002,
      "loss": 1.1654,
      "step": 625
    },
    {
      "epoch": 0.23629489603024575,
      "eval_loss": 1.146795630455017,
      "eval_runtime": 1561.4729,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 625
    },
    {
      "epoch": 0.24574669187145556,
      "grad_norm": 1.206275224685669,
      "learning_rate": 0.0002,
      "loss": 1.1549,
      "step": 650
    },
    {
      "epoch": 0.24574669187145556,
      "eval_loss": 1.149159550666809,
      "eval_runtime": 1561.5158,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 650
    },
    {
      "epoch": 0.2551984877126654,
      "grad_norm": 0.3218563497066498,
      "learning_rate": 0.0002,
      "loss": 1.147,
      "step": 675
    },
    {
      "epoch": 0.2551984877126654,
      "eval_loss": 1.1431602239608765,
      "eval_runtime": 1561.424,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 675
    },
    {
      "epoch": 0.2646502835538752,
      "grad_norm": 0.7758462429046631,
      "learning_rate": 0.0002,
      "loss": 1.1113,
      "step": 700
    },
    {
      "epoch": 0.2646502835538752,
      "eval_loss": 1.1470929384231567,
      "eval_runtime": 1561.3413,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 700
    },
    {
      "epoch": 0.2741020793950851,
      "grad_norm": 0.3400532901287079,
      "learning_rate": 0.0002,
      "loss": 1.1684,
      "step": 725
    },
    {
      "epoch": 0.2741020793950851,
      "eval_loss": 1.1409646272659302,
      "eval_runtime": 1561.1615,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 725
    },
    {
      "epoch": 0.2835538752362949,
      "grad_norm": 0.48636239767074585,
      "learning_rate": 0.0002,
      "loss": 1.1016,
      "step": 750
    },
    {
      "epoch": 0.2835538752362949,
      "eval_loss": 1.1419570446014404,
      "eval_runtime": 1561.247,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 750
    },
    {
      "epoch": 0.29300567107750475,
      "grad_norm": 0.3466539978981018,
      "learning_rate": 0.0002,
      "loss": 1.1589,
      "step": 775
    },
    {
      "epoch": 0.29300567107750475,
      "eval_loss": 1.137436032295227,
      "eval_runtime": 1561.3303,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 775
    },
    {
      "epoch": 0.30245746691871456,
      "grad_norm": 1.0184762477874756,
      "learning_rate": 0.0002,
      "loss": 1.1275,
      "step": 800
    },
    {
      "epoch": 0.30245746691871456,
      "eval_loss": 1.1429524421691895,
      "eval_runtime": 1561.4223,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 800
    },
    {
      "epoch": 0.31190926275992437,
      "grad_norm": 0.3569687306880951,
      "learning_rate": 0.0002,
      "loss": 1.2014,
      "step": 825
    },
    {
      "epoch": 0.31190926275992437,
      "eval_loss": 1.134521722793579,
      "eval_runtime": 1561.5607,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 825
    },
    {
      "epoch": 0.32136105860113423,
      "grad_norm": 0.503614068031311,
      "learning_rate": 0.0002,
      "loss": 1.0947,
      "step": 850
    },
    {
      "epoch": 0.32136105860113423,
      "eval_loss": 1.1380345821380615,
      "eval_runtime": 1561.4636,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 850
    },
    {
      "epoch": 0.33081285444234404,
      "grad_norm": 0.4224971532821655,
      "learning_rate": 0.0002,
      "loss": 1.1505,
      "step": 875
    },
    {
      "epoch": 0.33081285444234404,
      "eval_loss": 1.1311566829681396,
      "eval_runtime": 1561.5445,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 875
    },
    {
      "epoch": 0.34026465028355385,
      "grad_norm": 0.6001178026199341,
      "learning_rate": 0.0002,
      "loss": 1.1121,
      "step": 900
    },
    {
      "epoch": 0.34026465028355385,
      "eval_loss": 1.1359593868255615,
      "eval_runtime": 1561.55,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 900
    },
    {
      "epoch": 0.3497164461247637,
      "grad_norm": 0.3645350933074951,
      "learning_rate": 0.0002,
      "loss": 1.1452,
      "step": 925
    },
    {
      "epoch": 0.3497164461247637,
      "eval_loss": 1.1279844045639038,
      "eval_runtime": 1561.6948,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 925
    },
    {
      "epoch": 0.3591682419659735,
      "grad_norm": 0.6315143704414368,
      "learning_rate": 0.0002,
      "loss": 1.0865,
      "step": 950
    },
    {
      "epoch": 0.3591682419659735,
      "eval_loss": 1.1323318481445312,
      "eval_runtime": 1561.5263,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 950
    },
    {
      "epoch": 0.3686200378071834,
      "grad_norm": 0.3632996380329132,
      "learning_rate": 0.0002,
      "loss": 1.1383,
      "step": 975
    },
    {
      "epoch": 0.3686200378071834,
      "eval_loss": 1.1256133317947388,
      "eval_runtime": 1561.413,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 975
    },
    {
      "epoch": 0.3780718336483932,
      "grad_norm": 0.8775736689567566,
      "learning_rate": 0.0002,
      "loss": 1.1071,
      "step": 1000
    },
    {
      "epoch": 0.3780718336483932,
      "eval_loss": 1.130606770515442,
      "eval_runtime": 1561.5903,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 1000
    },
    {
      "epoch": 0.387523629489603,
      "grad_norm": 0.32248276472091675,
      "learning_rate": 0.0002,
      "loss": 1.1603,
      "step": 1025
    },
    {
      "epoch": 0.387523629489603,
      "eval_loss": 1.122152328491211,
      "eval_runtime": 1561.5582,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 1025
    },
    {
      "epoch": 0.39697542533081287,
      "grad_norm": 1.2496217489242554,
      "learning_rate": 0.0002,
      "loss": 1.0542,
      "step": 1050
    },
    {
      "epoch": 0.39697542533081287,
      "eval_loss": 1.129094123840332,
      "eval_runtime": 1561.5299,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 1050
    },
    {
      "epoch": 0.4064272211720227,
      "grad_norm": 0.31586310267448425,
      "learning_rate": 0.0002,
      "loss": 1.1224,
      "step": 1075
    },
    {
      "epoch": 0.4064272211720227,
      "eval_loss": 1.1187065839767456,
      "eval_runtime": 1561.5901,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 1075
    },
    {
      "epoch": 0.4158790170132325,
      "grad_norm": 0.944985032081604,
      "learning_rate": 0.0002,
      "loss": 1.133,
      "step": 1100
    },
    {
      "epoch": 0.4158790170132325,
      "eval_loss": 1.122226595878601,
      "eval_runtime": 1561.6201,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 1100
    },
    {
      "epoch": 0.42533081285444235,
      "grad_norm": 0.3063657879829407,
      "learning_rate": 0.0002,
      "loss": 1.1122,
      "step": 1125
    },
    {
      "epoch": 0.42533081285444235,
      "eval_loss": 1.1147044897079468,
      "eval_runtime": 1561.5596,
      "eval_samples_per_second": 0.847,
      "eval_steps_per_second": 0.212,
      "step": 1125
    }
  ],
  "logging_steps": 25,
  "max_steps": 2645,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "total_flos": 8.654158217045606e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}