{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 43.146067415730336,
  "eval_steps": 500,
  "global_step": 960,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.9,
      "grad_norm": 7.640738010406494,
      "learning_rate": 0.0002,
      "loss": 4.2661,
      "step": 20
    },
    {
      "epoch": 1.8,
      "grad_norm": 2.9313442707061768,
      "learning_rate": 0.0002,
      "loss": 2.306,
      "step": 40
    },
    {
      "epoch": 2.7,
      "grad_norm": 0.9562789797782898,
      "learning_rate": 0.0002,
      "loss": 2.0093,
      "step": 60
    },
    {
      "epoch": 3.6,
      "grad_norm": 1.4072426557540894,
      "learning_rate": 0.0002,
      "loss": 1.8391,
      "step": 80
    },
    {
      "epoch": 4.49,
      "grad_norm": 1.1522574424743652,
      "learning_rate": 0.0002,
      "loss": 1.7152,
      "step": 100
    },
    {
      "epoch": 5.39,
      "grad_norm": 0.9669118523597717,
      "learning_rate": 0.0002,
      "loss": 1.5656,
      "step": 120
    },
    {
      "epoch": 6.29,
      "grad_norm": 1.1254830360412598,
      "learning_rate": 0.0002,
      "loss": 1.4272,
      "step": 140
    },
    {
      "epoch": 7.19,
      "grad_norm": 1.4298350811004639,
      "learning_rate": 0.0002,
      "loss": 1.2586,
      "step": 160
    },
    {
      "epoch": 8.09,
      "grad_norm": 1.3048124313354492,
      "learning_rate": 0.0002,
      "loss": 1.1003,
      "step": 180
    },
    {
      "epoch": 8.99,
      "grad_norm": 1.5784626007080078,
      "learning_rate": 0.0002,
      "loss": 0.941,
      "step": 200
    },
    {
      "epoch": 9.89,
      "grad_norm": 1.6184762716293335,
      "learning_rate": 0.0002,
      "loss": 0.7854,
      "step": 220
    },
    {
      "epoch": 10.79,
      "grad_norm": 2.039607048034668,
      "learning_rate": 0.0002,
      "loss": 0.6663,
      "step": 240
    },
    {
      "epoch": 11.69,
      "grad_norm": 1.9069631099700928,
      "learning_rate": 0.0002,
      "loss": 0.5598,
      "step": 260
    },
    {
      "epoch": 12.58,
      "grad_norm": 3.269792079925537,
      "learning_rate": 0.0002,
      "loss": 0.4746,
      "step": 280
    },
    {
      "epoch": 13.48,
      "grad_norm": 1.695237159729004,
      "learning_rate": 0.0002,
      "loss": 0.3884,
      "step": 300
    },
    {
      "epoch": 14.38,
      "grad_norm": 1.7961617708206177,
      "learning_rate": 0.0002,
      "loss": 0.3197,
      "step": 320
    },
    {
      "epoch": 15.28,
      "grad_norm": 1.6906554698944092,
      "learning_rate": 0.0002,
      "loss": 0.2876,
      "step": 340
    },
    {
      "epoch": 16.18,
      "grad_norm": 1.5338362455368042,
      "learning_rate": 0.0002,
      "loss": 0.2476,
      "step": 360
    },
    {
      "epoch": 17.08,
      "grad_norm": 1.482823371887207,
      "learning_rate": 0.0002,
      "loss": 0.2152,
      "step": 380
    },
    {
      "epoch": 17.98,
      "grad_norm": 1.6050206422805786,
      "learning_rate": 0.0002,
      "loss": 0.1864,
      "step": 400
    },
    {
      "epoch": 18.88,
      "grad_norm": 1.7870419025421143,
      "learning_rate": 0.0002,
      "loss": 0.1527,
      "step": 420
    },
    {
      "epoch": 19.78,
      "grad_norm": 1.6181118488311768,
      "learning_rate": 0.0002,
      "loss": 0.1387,
      "step": 440
    },
    {
      "epoch": 20.67,
      "grad_norm": 1.545577049255371,
      "learning_rate": 0.0002,
      "loss": 0.1291,
      "step": 460
    },
    {
      "epoch": 21.57,
      "grad_norm": 1.4766790866851807,
      "learning_rate": 0.0002,
      "loss": 0.1216,
      "step": 480
    },
    {
      "epoch": 22.47,
      "grad_norm": 1.2652430534362793,
      "learning_rate": 0.0002,
      "loss": 0.1125,
      "step": 500
    },
    {
      "epoch": 23.37,
      "grad_norm": 1.3792601823806763,
      "learning_rate": 0.0002,
      "loss": 0.1064,
      "step": 520
    },
    {
      "epoch": 24.27,
      "grad_norm": 1.1617250442504883,
      "learning_rate": 0.0002,
      "loss": 0.0965,
      "step": 540
    },
    {
      "epoch": 25.17,
      "grad_norm": 1.0318264961242676,
      "learning_rate": 0.0002,
      "loss": 0.0892,
      "step": 560
    },
    {
      "epoch": 26.07,
      "grad_norm": 1.0102779865264893,
      "learning_rate": 0.0002,
      "loss": 0.0866,
      "step": 580
    },
    {
      "epoch": 26.97,
      "grad_norm": 1.2883203029632568,
      "learning_rate": 0.0002,
      "loss": 0.0805,
      "step": 600
    },
    {
      "epoch": 27.87,
      "grad_norm": 1.1580032110214233,
      "learning_rate": 0.0002,
      "loss": 0.0748,
      "step": 620
    },
    {
      "epoch": 28.76,
      "grad_norm": 1.114597201347351,
      "learning_rate": 0.0002,
      "loss": 0.0745,
      "step": 640
    },
    {
      "epoch": 29.66,
      "grad_norm": 1.0546940565109253,
      "learning_rate": 0.0002,
      "loss": 0.0721,
      "step": 660
    },
    {
      "epoch": 30.56,
      "grad_norm": 1.0050326585769653,
      "learning_rate": 0.0002,
      "loss": 0.0697,
      "step": 680
    },
    {
      "epoch": 31.46,
      "grad_norm": 1.0160025358200073,
      "learning_rate": 0.0002,
      "loss": 0.0658,
      "step": 700
    },
    {
      "epoch": 32.36,
      "grad_norm": 0.9212460517883301,
      "learning_rate": 0.0002,
      "loss": 0.0643,
      "step": 720
    },
    {
      "epoch": 33.26,
      "grad_norm": 0.8616517186164856,
      "learning_rate": 0.0002,
      "loss": 0.0621,
      "step": 740
    },
    {
      "epoch": 34.16,
      "grad_norm": 0.8040679693222046,
      "learning_rate": 0.0002,
      "loss": 0.061,
      "step": 760
    },
    {
      "epoch": 35.06,
      "grad_norm": 0.7591003179550171,
      "learning_rate": 0.0002,
      "loss": 0.0589,
      "step": 780
    },
    {
      "epoch": 35.96,
      "grad_norm": 1.0100669860839844,
      "learning_rate": 0.0002,
      "loss": 0.055,
      "step": 800
    },
    {
      "epoch": 36.85,
      "grad_norm": 0.9912341237068176,
      "learning_rate": 0.0002,
      "loss": 0.0513,
      "step": 820
    },
    {
      "epoch": 37.75,
      "grad_norm": 0.9290223121643066,
      "learning_rate": 0.0002,
      "loss": 0.0515,
      "step": 840
    },
    {
      "epoch": 38.65,
      "grad_norm": 0.8802034854888916,
      "learning_rate": 0.0002,
      "loss": 0.0511,
      "step": 860
    },
    {
      "epoch": 39.55,
      "grad_norm": 0.8020614981651306,
      "learning_rate": 0.0002,
      "loss": 0.0506,
      "step": 880
    },
    {
      "epoch": 40.45,
      "grad_norm": 0.8280277848243713,
      "learning_rate": 0.0002,
      "loss": 0.0502,
      "step": 900
    },
    {
      "epoch": 41.35,
      "grad_norm": 0.7979443669319153,
      "learning_rate": 0.0002,
      "loss": 0.0496,
      "step": 920
    },
    {
      "epoch": 42.25,
      "grad_norm": 0.7503458857536316,
      "learning_rate": 0.0002,
      "loss": 0.0487,
      "step": 940
    },
    {
      "epoch": 43.15,
      "grad_norm": 0.7012200355529785,
      "learning_rate": 0.0002,
      "loss": 0.0469,
      "step": 960
    }
  ],
  "logging_steps": 20,
  "max_steps": 1000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 46,
  "save_steps": 20,
  "total_flos": 2.927218722298921e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}