{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1221,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02457002457002457,
      "grad_norm": 6.1998985197847745,
      "learning_rate": 5e-06,
      "loss": 0.8819,
      "step": 10
    },
    {
      "epoch": 0.04914004914004914,
      "grad_norm": 58.73927160489986,
      "learning_rate": 5e-06,
      "loss": 0.7904,
      "step": 20
    },
    {
      "epoch": 0.07371007371007371,
      "grad_norm": 0.7748310375081953,
      "learning_rate": 5e-06,
      "loss": 0.7623,
      "step": 30
    },
    {
      "epoch": 0.09828009828009827,
      "grad_norm": 1.0183734838338003,
      "learning_rate": 5e-06,
      "loss": 0.7382,
      "step": 40
    },
    {
      "epoch": 0.12285012285012285,
      "grad_norm": 1.4797556953661242,
      "learning_rate": 5e-06,
      "loss": 0.7184,
      "step": 50
    },
    {
      "epoch": 0.14742014742014742,
      "grad_norm": 0.9599955498815156,
      "learning_rate": 5e-06,
      "loss": 0.714,
      "step": 60
    },
    {
      "epoch": 0.171990171990172,
      "grad_norm": 1.0089273827026637,
      "learning_rate": 5e-06,
      "loss": 0.7048,
      "step": 70
    },
    {
      "epoch": 0.19656019656019655,
      "grad_norm": 0.7283907495801997,
      "learning_rate": 5e-06,
      "loss": 0.7015,
      "step": 80
    },
    {
      "epoch": 0.22113022113022113,
      "grad_norm": 0.665386593224946,
      "learning_rate": 5e-06,
      "loss": 0.6941,
      "step": 90
    },
    {
      "epoch": 0.2457002457002457,
      "grad_norm": 0.6931565576787211,
      "learning_rate": 5e-06,
      "loss": 0.6901,
      "step": 100
    },
    {
      "epoch": 0.2702702702702703,
      "grad_norm": 0.5017251950296421,
      "learning_rate": 5e-06,
      "loss": 0.6844,
      "step": 110
    },
    {
      "epoch": 0.29484029484029484,
      "grad_norm": 0.5071354456321502,
      "learning_rate": 5e-06,
      "loss": 0.6879,
      "step": 120
    },
    {
      "epoch": 0.3194103194103194,
      "grad_norm": 0.6070247331517776,
      "learning_rate": 5e-06,
      "loss": 0.6715,
      "step": 130
    },
    {
      "epoch": 0.343980343980344,
      "grad_norm": 0.627106293158071,
      "learning_rate": 5e-06,
      "loss": 0.6774,
      "step": 140
    },
    {
      "epoch": 0.36855036855036855,
      "grad_norm": 0.7310651144047745,
      "learning_rate": 5e-06,
      "loss": 0.6675,
      "step": 150
    },
    {
      "epoch": 0.3931203931203931,
      "grad_norm": 0.5686459982063066,
      "learning_rate": 5e-06,
      "loss": 0.6755,
      "step": 160
    },
    {
      "epoch": 0.4176904176904177,
      "grad_norm": 0.5320144044432522,
      "learning_rate": 5e-06,
      "loss": 0.6711,
      "step": 170
    },
    {
      "epoch": 0.44226044226044225,
      "grad_norm": 0.7374032221533481,
      "learning_rate": 5e-06,
      "loss": 0.6781,
      "step": 180
    },
    {
      "epoch": 0.4668304668304668,
      "grad_norm": 0.7028348920446017,
      "learning_rate": 5e-06,
      "loss": 0.6655,
      "step": 190
    },
    {
      "epoch": 0.4914004914004914,
      "grad_norm": 0.6462713543254518,
      "learning_rate": 5e-06,
      "loss": 0.6725,
      "step": 200
    },
    {
      "epoch": 0.515970515970516,
      "grad_norm": 0.5063976505878001,
      "learning_rate": 5e-06,
      "loss": 0.6623,
      "step": 210
    },
    {
      "epoch": 0.5405405405405406,
      "grad_norm": 0.5560125316540017,
      "learning_rate": 5e-06,
      "loss": 0.6583,
      "step": 220
    },
    {
      "epoch": 0.5651105651105651,
      "grad_norm": 0.7044183401160061,
      "learning_rate": 5e-06,
      "loss": 0.6669,
      "step": 230
    },
    {
      "epoch": 0.5896805896805897,
      "grad_norm": 0.547110632127531,
      "learning_rate": 5e-06,
      "loss": 0.6604,
      "step": 240
    },
    {
      "epoch": 0.6142506142506142,
      "grad_norm": 0.45751487478129493,
      "learning_rate": 5e-06,
      "loss": 0.6496,
      "step": 250
    },
    {
      "epoch": 0.6388206388206388,
      "grad_norm": 0.4993294423942927,
      "learning_rate": 5e-06,
      "loss": 0.663,
      "step": 260
    },
    {
      "epoch": 0.6633906633906634,
      "grad_norm": 0.5942361162546903,
      "learning_rate": 5e-06,
      "loss": 0.6679,
      "step": 270
    },
    {
      "epoch": 0.687960687960688,
      "grad_norm": 0.7411082671948905,
      "learning_rate": 5e-06,
      "loss": 0.66,
      "step": 280
    },
    {
      "epoch": 0.7125307125307125,
      "grad_norm": 0.5402865637174907,
      "learning_rate": 5e-06,
      "loss": 0.656,
      "step": 290
    },
    {
      "epoch": 0.7371007371007371,
      "grad_norm": 0.7940852714316793,
      "learning_rate": 5e-06,
      "loss": 0.6559,
      "step": 300
    },
    {
      "epoch": 0.7616707616707616,
      "grad_norm": 0.4440401110733847,
      "learning_rate": 5e-06,
      "loss": 0.6609,
      "step": 310
    },
    {
      "epoch": 0.7862407862407862,
      "grad_norm": 0.5659551226784064,
      "learning_rate": 5e-06,
      "loss": 0.6493,
      "step": 320
    },
    {
      "epoch": 0.8108108108108109,
      "grad_norm": 0.5872037056746705,
      "learning_rate": 5e-06,
      "loss": 0.6537,
      "step": 330
    },
    {
      "epoch": 0.8353808353808354,
      "grad_norm": 0.5670870147361556,
      "learning_rate": 5e-06,
      "loss": 0.6521,
      "step": 340
    },
    {
      "epoch": 0.85995085995086,
      "grad_norm": 0.45655325491327986,
      "learning_rate": 5e-06,
      "loss": 0.6532,
      "step": 350
    },
    {
      "epoch": 0.8845208845208845,
      "grad_norm": 0.4447712119508118,
      "learning_rate": 5e-06,
      "loss": 0.6535,
      "step": 360
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.49332039079874884,
      "learning_rate": 5e-06,
      "loss": 0.6544,
      "step": 370
    },
    {
      "epoch": 0.9336609336609336,
      "grad_norm": 0.4881054319905847,
      "learning_rate": 5e-06,
      "loss": 0.651,
      "step": 380
    },
    {
      "epoch": 0.9582309582309583,
      "grad_norm": 0.5876239994992207,
      "learning_rate": 5e-06,
      "loss": 0.6559,
      "step": 390
    },
    {
      "epoch": 0.9828009828009828,
      "grad_norm": 0.5399792337431831,
      "learning_rate": 5e-06,
      "loss": 0.65,
      "step": 400
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.6456817984580994,
      "eval_runtime": 217.9368,
      "eval_samples_per_second": 50.299,
      "eval_steps_per_second": 0.395,
      "step": 407
    },
    {
      "epoch": 1.0073710073710074,
      "grad_norm": 0.9036800627518505,
      "learning_rate": 5e-06,
      "loss": 0.6391,
      "step": 410
    },
    {
      "epoch": 1.031941031941032,
      "grad_norm": 0.5211566701261678,
      "learning_rate": 5e-06,
      "loss": 0.6082,
      "step": 420
    },
    {
      "epoch": 1.0565110565110565,
      "grad_norm": 0.49201394538696464,
      "learning_rate": 5e-06,
      "loss": 0.6047,
      "step": 430
    },
    {
      "epoch": 1.0810810810810811,
      "grad_norm": 0.5230677761506219,
      "learning_rate": 5e-06,
      "loss": 0.605,
      "step": 440
    },
    {
      "epoch": 1.1056511056511056,
      "grad_norm": 0.5061353049548358,
      "learning_rate": 5e-06,
      "loss": 0.6117,
      "step": 450
    },
    {
      "epoch": 1.1302211302211302,
      "grad_norm": 0.5308414282748481,
      "learning_rate": 5e-06,
      "loss": 0.6038,
      "step": 460
    },
    {
      "epoch": 1.154791154791155,
      "grad_norm": 0.5669662460358014,
      "learning_rate": 5e-06,
      "loss": 0.6046,
      "step": 470
    },
    {
      "epoch": 1.1793611793611793,
      "grad_norm": 0.4422388402451124,
      "learning_rate": 5e-06,
      "loss": 0.6094,
      "step": 480
    },
    {
      "epoch": 1.203931203931204,
      "grad_norm": 0.6204673536874101,
      "learning_rate": 5e-06,
      "loss": 0.6098,
      "step": 490
    },
    {
      "epoch": 1.2285012285012284,
      "grad_norm": 0.5083801522633119,
      "learning_rate": 5e-06,
      "loss": 0.6054,
      "step": 500
    },
    {
      "epoch": 1.253071253071253,
      "grad_norm": 0.6670278402124389,
      "learning_rate": 5e-06,
      "loss": 0.6119,
      "step": 510
    },
    {
      "epoch": 1.2776412776412776,
      "grad_norm": 0.4939088110674659,
      "learning_rate": 5e-06,
      "loss": 0.6165,
      "step": 520
    },
    {
      "epoch": 1.3022113022113022,
      "grad_norm": 0.560793275781157,
      "learning_rate": 5e-06,
      "loss": 0.6045,
      "step": 530
    },
    {
      "epoch": 1.3267813267813269,
      "grad_norm": 0.7550429966767501,
      "learning_rate": 5e-06,
      "loss": 0.6131,
      "step": 540
    },
    {
      "epoch": 1.3513513513513513,
      "grad_norm": 0.5972489241212914,
      "learning_rate": 5e-06,
      "loss": 0.6086,
      "step": 550
    },
    {
      "epoch": 1.375921375921376,
      "grad_norm": 0.5508921516494931,
      "learning_rate": 5e-06,
      "loss": 0.6093,
      "step": 560
    },
    {
      "epoch": 1.4004914004914004,
      "grad_norm": 0.5721594827498228,
      "learning_rate": 5e-06,
      "loss": 0.6083,
      "step": 570
    },
    {
      "epoch": 1.425061425061425,
      "grad_norm": 0.44567791335112333,
      "learning_rate": 5e-06,
      "loss": 0.6088,
      "step": 580
    },
    {
      "epoch": 1.4496314496314495,
      "grad_norm": 0.4714094356929579,
      "learning_rate": 5e-06,
      "loss": 0.608,
      "step": 590
    },
    {
      "epoch": 1.4742014742014742,
      "grad_norm": 0.45010217795717233,
      "learning_rate": 5e-06,
      "loss": 0.6023,
      "step": 600
    },
    {
      "epoch": 1.4987714987714988,
      "grad_norm": 0.4507933071597356,
      "learning_rate": 5e-06,
      "loss": 0.6111,
      "step": 610
    },
    {
      "epoch": 1.5233415233415233,
      "grad_norm": 0.5050862616941423,
      "learning_rate": 5e-06,
      "loss": 0.6093,
      "step": 620
    },
    {
      "epoch": 1.547911547911548,
      "grad_norm": 0.4815368134390591,
      "learning_rate": 5e-06,
      "loss": 0.612,
      "step": 630
    },
    {
      "epoch": 1.5724815724815726,
      "grad_norm": 0.4550986333720554,
      "learning_rate": 5e-06,
      "loss": 0.6134,
      "step": 640
    },
    {
      "epoch": 1.597051597051597,
      "grad_norm": 0.49348414741752084,
      "learning_rate": 5e-06,
      "loss": 0.6126,
      "step": 650
    },
    {
      "epoch": 1.6216216216216215,
      "grad_norm": 0.46259072485753394,
      "learning_rate": 5e-06,
      "loss": 0.6102,
      "step": 660
    },
    {
      "epoch": 1.6461916461916462,
      "grad_norm": 0.4489893330738159,
      "learning_rate": 5e-06,
      "loss": 0.6059,
      "step": 670
    },
    {
      "epoch": 1.6707616707616708,
      "grad_norm": 0.532020034080249,
      "learning_rate": 5e-06,
      "loss": 0.6021,
      "step": 680
    },
    {
      "epoch": 1.6953316953316953,
      "grad_norm": 0.5291299108019797,
      "learning_rate": 5e-06,
      "loss": 0.6092,
      "step": 690
    },
    {
      "epoch": 1.71990171990172,
      "grad_norm": 0.4977355779144285,
      "learning_rate": 5e-06,
      "loss": 0.6105,
      "step": 700
    },
    {
      "epoch": 1.7444717444717446,
      "grad_norm": 0.422516384063118,
      "learning_rate": 5e-06,
      "loss": 0.6066,
      "step": 710
    },
    {
      "epoch": 1.769041769041769,
      "grad_norm": 0.5230305107651955,
      "learning_rate": 5e-06,
      "loss": 0.616,
      "step": 720
    },
    {
      "epoch": 1.7936117936117935,
      "grad_norm": 0.6480321772002665,
      "learning_rate": 5e-06,
      "loss": 0.6053,
      "step": 730
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 0.46981041595278134,
      "learning_rate": 5e-06,
      "loss": 0.6147,
      "step": 740
    },
    {
      "epoch": 1.8427518427518428,
      "grad_norm": 0.5219860110928298,
      "learning_rate": 5e-06,
      "loss": 0.6061,
      "step": 750
    },
    {
      "epoch": 1.8673218673218672,
      "grad_norm": 0.6090315148152918,
      "learning_rate": 5e-06,
      "loss": 0.6062,
      "step": 760
    },
    {
      "epoch": 1.8918918918918919,
      "grad_norm": 0.45264061321802845,
      "learning_rate": 5e-06,
      "loss": 0.6086,
      "step": 770
    },
    {
      "epoch": 1.9164619164619165,
      "grad_norm": 0.4862492186635227,
      "learning_rate": 5e-06,
      "loss": 0.6062,
      "step": 780
    },
    {
      "epoch": 1.941031941031941,
      "grad_norm": 0.4262515523099187,
      "learning_rate": 5e-06,
      "loss": 0.609,
      "step": 790
    },
    {
      "epoch": 1.9656019656019657,
      "grad_norm": 0.540857759125553,
      "learning_rate": 5e-06,
      "loss": 0.6003,
      "step": 800
    },
    {
      "epoch": 1.9901719901719903,
      "grad_norm": 0.5033832069244489,
      "learning_rate": 5e-06,
      "loss": 0.6074,
      "step": 810
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.6354637742042542,
      "eval_runtime": 218.2757,
      "eval_samples_per_second": 50.221,
      "eval_steps_per_second": 0.394,
      "step": 814
    },
    {
      "epoch": 2.0147420147420148,
      "grad_norm": 0.7307156038164634,
      "learning_rate": 5e-06,
      "loss": 0.5822,
      "step": 820
    },
    {
      "epoch": 2.039312039312039,
      "grad_norm": 0.5232318359619684,
      "learning_rate": 5e-06,
      "loss": 0.5664,
      "step": 830
    },
    {
      "epoch": 2.063882063882064,
      "grad_norm": 0.5745655618201481,
      "learning_rate": 5e-06,
      "loss": 0.5624,
      "step": 840
    },
    {
      "epoch": 2.0884520884520885,
      "grad_norm": 0.4968769160797791,
      "learning_rate": 5e-06,
      "loss": 0.5606,
      "step": 850
    },
    {
      "epoch": 2.113022113022113,
      "grad_norm": 0.5327370268989237,
      "learning_rate": 5e-06,
      "loss": 0.5643,
      "step": 860
    },
    {
      "epoch": 2.1375921375921374,
      "grad_norm": 0.561176488765928,
      "learning_rate": 5e-06,
      "loss": 0.5582,
      "step": 870
    },
    {
      "epoch": 2.1621621621621623,
      "grad_norm": 0.5155315136467447,
      "learning_rate": 5e-06,
      "loss": 0.5579,
      "step": 880
    },
    {
      "epoch": 2.1867321867321867,
      "grad_norm": 0.5150732586456199,
      "learning_rate": 5e-06,
      "loss": 0.5687,
      "step": 890
    },
    {
      "epoch": 2.211302211302211,
      "grad_norm": 0.44289319818662953,
      "learning_rate": 5e-06,
      "loss": 0.559,
      "step": 900
    },
    {
      "epoch": 2.235872235872236,
      "grad_norm": 0.4685578082551758,
      "learning_rate": 5e-06,
      "loss": 0.5702,
      "step": 910
    },
    {
      "epoch": 2.2604422604422605,
      "grad_norm": 0.5785484819826471,
      "learning_rate": 5e-06,
      "loss": 0.5748,
      "step": 920
    },
    {
      "epoch": 2.285012285012285,
      "grad_norm": 0.5830747436786318,
      "learning_rate": 5e-06,
      "loss": 0.5714,
      "step": 930
    },
    {
      "epoch": 2.30958230958231,
      "grad_norm": 0.6088954402907469,
      "learning_rate": 5e-06,
      "loss": 0.5699,
      "step": 940
    },
    {
      "epoch": 2.3341523341523343,
      "grad_norm": 0.5065740936571793,
      "learning_rate": 5e-06,
      "loss": 0.57,
      "step": 950
    },
    {
      "epoch": 2.3587223587223587,
      "grad_norm": 0.4385965614068684,
      "learning_rate": 5e-06,
      "loss": 0.5698,
      "step": 960
    },
    {
      "epoch": 2.383292383292383,
      "grad_norm": 0.5105642133338127,
      "learning_rate": 5e-06,
      "loss": 0.5658,
      "step": 970
    },
    {
      "epoch": 2.407862407862408,
      "grad_norm": 0.44745755786742697,
      "learning_rate": 5e-06,
      "loss": 0.5638,
      "step": 980
    },
    {
      "epoch": 2.4324324324324325,
      "grad_norm": 0.4744160372716773,
      "learning_rate": 5e-06,
      "loss": 0.5676,
      "step": 990
    },
    {
      "epoch": 2.457002457002457,
      "grad_norm": 0.5308528263789132,
      "learning_rate": 5e-06,
      "loss": 0.5658,
      "step": 1000
    },
    {
      "epoch": 2.4815724815724813,
      "grad_norm": 0.5707200384455066,
      "learning_rate": 5e-06,
      "loss": 0.5734,
      "step": 1010
    },
    {
      "epoch": 2.506142506142506,
      "grad_norm": 0.5519286283869067,
      "learning_rate": 5e-06,
      "loss": 0.5654,
      "step": 1020
    },
    {
      "epoch": 2.5307125307125307,
      "grad_norm": 0.4588518825063357,
      "learning_rate": 5e-06,
      "loss": 0.572,
      "step": 1030
    },
    {
      "epoch": 2.555282555282555,
      "grad_norm": 0.4919724825790031,
      "learning_rate": 5e-06,
      "loss": 0.5646,
      "step": 1040
    },
    {
      "epoch": 2.57985257985258,
      "grad_norm": 0.4839147831831718,
      "learning_rate": 5e-06,
      "loss": 0.5686,
      "step": 1050
    },
    {
      "epoch": 2.6044226044226044,
      "grad_norm": 0.4855007349272119,
      "learning_rate": 5e-06,
      "loss": 0.5711,
      "step": 1060
    },
    {
      "epoch": 2.628992628992629,
      "grad_norm": 0.4652801035899528,
      "learning_rate": 5e-06,
      "loss": 0.5655,
      "step": 1070
    },
    {
      "epoch": 2.6535626535626538,
      "grad_norm": 0.4816575537658345,
      "learning_rate": 5e-06,
      "loss": 0.5672,
      "step": 1080
    },
    {
      "epoch": 2.678132678132678,
      "grad_norm": 0.4924974787000159,
      "learning_rate": 5e-06,
      "loss": 0.5717,
      "step": 1090
    },
    {
      "epoch": 2.7027027027027026,
      "grad_norm": 0.46678604918391425,
      "learning_rate": 5e-06,
      "loss": 0.5761,
      "step": 1100
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 0.4815650588760393,
      "learning_rate": 5e-06,
      "loss": 0.5592,
      "step": 1110
    },
    {
      "epoch": 2.751842751842752,
      "grad_norm": 0.451576443537888,
      "learning_rate": 5e-06,
      "loss": 0.5714,
      "step": 1120
    },
    {
      "epoch": 2.7764127764127764,
      "grad_norm": 0.49058626754375095,
      "learning_rate": 5e-06,
      "loss": 0.564,
      "step": 1130
    },
    {
      "epoch": 2.800982800982801,
      "grad_norm": 0.5407476269825076,
      "learning_rate": 5e-06,
      "loss": 0.5715,
      "step": 1140
    },
    {
      "epoch": 2.8255528255528253,
      "grad_norm": 0.5145580657005562,
      "learning_rate": 5e-06,
      "loss": 0.5713,
      "step": 1150
    },
    {
      "epoch": 2.85012285012285,
      "grad_norm": 0.5608100121904279,
      "learning_rate": 5e-06,
      "loss": 0.5646,
      "step": 1160
    },
    {
      "epoch": 2.8746928746928746,
      "grad_norm": 0.5060915278089009,
      "learning_rate": 5e-06,
      "loss": 0.5682,
      "step": 1170
    },
    {
      "epoch": 2.899262899262899,
      "grad_norm": 0.6967844756678558,
      "learning_rate": 5e-06,
      "loss": 0.5666,
      "step": 1180
    },
    {
      "epoch": 2.923832923832924,
      "grad_norm": 0.5006587440021466,
      "learning_rate": 5e-06,
      "loss": 0.5698,
      "step": 1190
    },
    {
      "epoch": 2.9484029484029484,
      "grad_norm": 0.47107358105339453,
      "learning_rate": 5e-06,
      "loss": 0.5706,
      "step": 1200
    },
    {
      "epoch": 2.972972972972973,
      "grad_norm": 0.46151336241822494,
      "learning_rate": 5e-06,
      "loss": 0.5665,
      "step": 1210
    },
    {
      "epoch": 2.9975429975429977,
      "grad_norm": 0.5264094237400985,
      "learning_rate": 5e-06,
      "loss": 0.561,
      "step": 1220
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.6375428438186646,
      "eval_runtime": 218.7931,
      "eval_samples_per_second": 50.102,
      "eval_steps_per_second": 0.393,
      "step": 1221
    },
    {
      "epoch": 3.0,
      "step": 1221,
      "total_flos": 2044801717370880.0,
      "train_loss": 0.6189516479367608,
      "train_runtime": 36570.9767,
      "train_samples_per_second": 17.085,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 1221,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2044801717370880.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}