|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 24920, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.4730186462402344, |
|
"learning_rate": 2e-05, |
|
"loss": 0.3314, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.50852108001709, |
|
"learning_rate": 1.9999872447769624e-05, |
|
"loss": 0.0313, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.40423300862312317, |
|
"learning_rate": 1.9999489794332404e-05, |
|
"loss": 0.0226, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.4923788011074066, |
|
"learning_rate": 1.9998852049449998e-05, |
|
"loss": 0.0144, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.039026230573654175, |
|
"learning_rate": 1.9997959229391567e-05, |
|
"loss": 0.0088, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.13339382410049438, |
|
"learning_rate": 1.9996811356933346e-05, |
|
"loss": 0.0097, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.03016798384487629, |
|
"learning_rate": 1.9995408461358074e-05, |
|
"loss": 0.0063, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.12673313915729523, |
|
"learning_rate": 1.9993750578454248e-05, |
|
"loss": 0.0084, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.012224284932017326, |
|
"learning_rate": 1.999183775051519e-05, |
|
"loss": 0.006, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.144702434539795, |
|
"learning_rate": 1.9989670026338002e-05, |
|
"loss": 0.0076, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.02996153011918068, |
|
"learning_rate": 1.9987247461222297e-05, |
|
"loss": 0.0052, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.008165411651134491, |
|
"learning_rate": 1.9984570116968785e-05, |
|
"loss": 0.0052, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.07781720906496048, |
|
"learning_rate": 1.9981638061877714e-05, |
|
"loss": 0.0056, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.016230745241045952, |
|
"learning_rate": 1.9978451370747122e-05, |
|
"loss": 0.0052, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.58638858795166, |
|
"learning_rate": 1.997501012487091e-05, |
|
"loss": 0.0037, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.029858984053134918, |
|
"learning_rate": 1.9971314412036807e-05, |
|
"loss": 0.0037, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.2630612850189209, |
|
"learning_rate": 1.996736432652409e-05, |
|
"loss": 0.0046, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.013704606331884861, |
|
"learning_rate": 1.9963159969101207e-05, |
|
"loss": 0.0056, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.09422887861728668, |
|
"learning_rate": 1.9958701447023188e-05, |
|
"loss": 0.0041, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.02264581061899662, |
|
"learning_rate": 1.9953988874028917e-05, |
|
"loss": 0.0039, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.020697351545095444, |
|
"learning_rate": 1.994902237033824e-05, |
|
"loss": 0.0048, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.1434525102376938, |
|
"learning_rate": 1.9943802062648877e-05, |
|
"loss": 0.0051, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.06430936604738235, |
|
"learning_rate": 1.9938328084133206e-05, |
|
"loss": 0.0023, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.008740647695958614, |
|
"learning_rate": 1.9932600574434864e-05, |
|
"loss": 0.004, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.014448602683842182, |
|
"learning_rate": 1.9926619679665175e-05, |
|
"loss": 0.0043, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.02901746705174446, |
|
"learning_rate": 1.9920385552399434e-05, |
|
"loss": 0.0019, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.6094048023223877, |
|
"learning_rate": 1.9913898351673006e-05, |
|
"loss": 0.0022, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.36414438486099243, |
|
"learning_rate": 1.990715824297728e-05, |
|
"loss": 0.0056, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.1421067714691162, |
|
"learning_rate": 1.9900165398255434e-05, |
|
"loss": 0.003, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.028148509562015533, |
|
"learning_rate": 1.9892919995898052e-05, |
|
"loss": 0.0016, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.015177663415670395, |
|
"learning_rate": 1.9885422220738583e-05, |
|
"loss": 0.004, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.25167518854141235, |
|
"learning_rate": 1.9877672264048618e-05, |
|
"loss": 0.0027, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.03997454047203064, |
|
"learning_rate": 1.9869670323533005e-05, |
|
"loss": 0.003, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.0028598604258149862, |
|
"learning_rate": 1.986141660332482e-05, |
|
"loss": 0.0019, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.1343453824520111, |
|
"learning_rate": 1.9852911313980146e-05, |
|
"loss": 0.0034, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.12534217536449432, |
|
"learning_rate": 1.9844154672472707e-05, |
|
"loss": 0.0026, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.24615176022052765, |
|
"learning_rate": 1.9835146902188336e-05, |
|
"loss": 0.0018, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.013632328249514103, |
|
"learning_rate": 1.9825888232919268e-05, |
|
"loss": 0.0019, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.4895065724849701, |
|
"learning_rate": 1.9816378900858288e-05, |
|
"loss": 0.0037, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.3317159414291382, |
|
"learning_rate": 1.98066191485927e-05, |
|
"loss": 0.0022, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.008369619026780128, |
|
"learning_rate": 1.9796609225098136e-05, |
|
"loss": 0.0008, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.007471293676644564, |
|
"learning_rate": 1.9786349385732212e-05, |
|
"loss": 0.0031, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.011221354827284813, |
|
"learning_rate": 1.9775839892228004e-05, |
|
"loss": 0.0013, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.006684939842671156, |
|
"learning_rate": 1.976508101268738e-05, |
|
"loss": 0.0024, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7115832567214966, |
|
"learning_rate": 1.9754073021574153e-05, |
|
"loss": 0.0027, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.031222190707921982, |
|
"learning_rate": 1.9742816199707096e-05, |
|
"loss": 0.0013, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.033377427607774734, |
|
"learning_rate": 1.9731310834252747e-05, |
|
"loss": 0.0046, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.3707186281681061, |
|
"learning_rate": 1.9719557218718116e-05, |
|
"loss": 0.0025, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.030659371986985207, |
|
"learning_rate": 1.970755565294318e-05, |
|
"loss": 0.0033, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.031716570258140564, |
|
"learning_rate": 1.969530644309323e-05, |
|
"loss": 0.0017, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.018681248649954796, |
|
"learning_rate": 1.9682809901651074e-05, |
|
"loss": 0.0025, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.031807415187358856, |
|
"learning_rate": 1.9670066347409063e-05, |
|
"loss": 0.0029, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.00439372006803751, |
|
"learning_rate": 1.9657076105460945e-05, |
|
"loss": 0.0024, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.2656748592853546, |
|
"learning_rate": 1.964383950719359e-05, |
|
"loss": 0.0019, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.18905937671661377, |
|
"learning_rate": 1.9630356890278527e-05, |
|
"loss": 0.0015, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.1290869414806366, |
|
"learning_rate": 1.9616628598663322e-05, |
|
"loss": 0.0034, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.004318041726946831, |
|
"learning_rate": 1.9602654982562822e-05, |
|
"loss": 0.0014, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.15103724598884583, |
|
"learning_rate": 1.9588436398450206e-05, |
|
"loss": 0.0013, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.039078064262866974, |
|
"learning_rate": 1.9573973209047893e-05, |
|
"loss": 0.0015, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 2.7625222206115723, |
|
"learning_rate": 1.9559265783318304e-05, |
|
"loss": 0.0018, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.32574662566185, |
|
"learning_rate": 1.9544314496454423e-05, |
|
"loss": 0.0023, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.014155671931803226, |
|
"learning_rate": 1.9529119729870253e-05, |
|
"loss": 0.0011, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.007221277803182602, |
|
"learning_rate": 1.9513681871191063e-05, |
|
"loss": 0.0033, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.013470535166561604, |
|
"learning_rate": 1.949800131424352e-05, |
|
"loss": 0.003, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.003437698120251298, |
|
"learning_rate": 1.9482078459045617e-05, |
|
"loss": 0.0012, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.006796371191740036, |
|
"learning_rate": 1.9465913711796502e-05, |
|
"loss": 0.001, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.7563895583152771, |
|
"learning_rate": 1.9449507484866084e-05, |
|
"loss": 0.0018, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.016867786645889282, |
|
"learning_rate": 1.9432860196784533e-05, |
|
"loss": 0.0016, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.006416236516088247, |
|
"learning_rate": 1.941597227223159e-05, |
|
"loss": 0.0008, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.0008421270758844912, |
|
"learning_rate": 1.9398844142025746e-05, |
|
"loss": 0.0011, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.012384007684886456, |
|
"learning_rate": 1.9381476243113243e-05, |
|
"loss": 0.0017, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.004778598435223103, |
|
"learning_rate": 1.9363869018556928e-05, |
|
"loss": 0.0016, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.00394839234650135, |
|
"learning_rate": 1.9346022917524958e-05, |
|
"loss": 0.0017, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.006818379741162062, |
|
"learning_rate": 1.9327938395279325e-05, |
|
"loss": 0.0019, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.028212955221533775, |
|
"learning_rate": 1.9309615913164262e-05, |
|
"loss": 0.0013, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.017115216702222824, |
|
"learning_rate": 1.9291055938594464e-05, |
|
"loss": 0.0017, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.02033141627907753, |
|
"learning_rate": 1.9272258945043154e-05, |
|
"loss": 0.0006, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.2443268299102783, |
|
"learning_rate": 1.9253225412030028e-05, |
|
"loss": 0.0011, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.0025956807658076286, |
|
"learning_rate": 1.9233955825109e-05, |
|
"loss": 0.001, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 13.584095001220703, |
|
"learning_rate": 1.9214450675855832e-05, |
|
"loss": 0.0029, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.0022025350481271744, |
|
"learning_rate": 1.919471046185558e-05, |
|
"loss": 0.001, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.002522263675928116, |
|
"learning_rate": 1.917473568668991e-05, |
|
"loss": 0.0024, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.3481580018997192, |
|
"learning_rate": 1.9154526859924242e-05, |
|
"loss": 0.0017, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.010974978096783161, |
|
"learning_rate": 1.9134084497094766e-05, |
|
"loss": 0.0008, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.005469560623168945, |
|
"learning_rate": 1.9113409119695276e-05, |
|
"loss": 0.0012, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.08603531867265701, |
|
"learning_rate": 1.9092501255163874e-05, |
|
"loss": 0.0025, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.0022059655748307705, |
|
"learning_rate": 1.907136143686951e-05, |
|
"loss": 0.0013, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.0005606790073215961, |
|
"learning_rate": 1.904999020409837e-05, |
|
"loss": 0.0014, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.012699578888714314, |
|
"learning_rate": 1.902838810204015e-05, |
|
"loss": 0.0012, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.00312532065436244, |
|
"learning_rate": 1.90065556817741e-05, |
|
"loss": 0.0012, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.008270219899713993, |
|
"learning_rate": 1.8984493500255e-05, |
|
"loss": 0.0014, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.01274858694523573, |
|
"learning_rate": 1.8962202120298948e-05, |
|
"loss": 0.0013, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.012512357905507088, |
|
"learning_rate": 1.8939682110568982e-05, |
|
"loss": 0.001, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.0043404679745435715, |
|
"learning_rate": 1.8916934045560603e-05, |
|
"loss": 0.0023, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 3.700385332107544, |
|
"learning_rate": 1.8893958505587093e-05, |
|
"loss": 0.0031, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.016660619527101517, |
|
"learning_rate": 1.8870756076764728e-05, |
|
"loss": 0.0019, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.047528013586997986, |
|
"learning_rate": 1.8847327350997814e-05, |
|
"loss": 0.0008, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.01734699122607708, |
|
"learning_rate": 1.8823672925963598e-05, |
|
"loss": 0.0022, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.019478172063827515, |
|
"learning_rate": 1.879979340509701e-05, |
|
"loss": 0.0009, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.1385781317949295, |
|
"learning_rate": 1.877568939757529e-05, |
|
"loss": 0.0015, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.143458753824234, |
|
"learning_rate": 1.8751361518302413e-05, |
|
"loss": 0.0008, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.003875449998304248, |
|
"learning_rate": 1.8726810387893438e-05, |
|
"loss": 0.0029, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.0030744324903935194, |
|
"learning_rate": 1.8702036632658646e-05, |
|
"loss": 0.0012, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.012408553622663021, |
|
"learning_rate": 1.867704088458759e-05, |
|
"loss": 0.0017, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.001399656874127686, |
|
"learning_rate": 1.8651823781332948e-05, |
|
"loss": 0.0006, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.001192159834317863, |
|
"learning_rate": 1.8626385966194275e-05, |
|
"loss": 0.001, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.6697074770927429, |
|
"learning_rate": 1.8600728088101587e-05, |
|
"loss": 0.0021, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.007071709726005793, |
|
"learning_rate": 1.857485080159879e-05, |
|
"loss": 0.0016, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.005609508138149977, |
|
"learning_rate": 1.8548754766827016e-05, |
|
"loss": 0.0022, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.0166325643658638, |
|
"learning_rate": 1.852244064950775e-05, |
|
"loss": 0.0032, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.0896034687757492, |
|
"learning_rate": 1.8495909120925857e-05, |
|
"loss": 0.0015, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.24882763624191284, |
|
"learning_rate": 1.846916085791247e-05, |
|
"loss": 0.0017, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.013551519252359867, |
|
"learning_rate": 1.8442196542827712e-05, |
|
"loss": 0.0007, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.01764736883342266, |
|
"learning_rate": 1.8415016863543286e-05, |
|
"loss": 0.0005, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.001765413791872561, |
|
"learning_rate": 1.8387622513424942e-05, |
|
"loss": 0.0044, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.0014637404819950461, |
|
"learning_rate": 1.836001419131476e-05, |
|
"loss": 0.0012, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.005813998635858297, |
|
"learning_rate": 1.8332192601513358e-05, |
|
"loss": 0.0006, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.0008558441768400371, |
|
"learning_rate": 1.8304158453761904e-05, |
|
"loss": 0.002, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.0033321972005069256, |
|
"learning_rate": 1.827591246322401e-05, |
|
"loss": 0.0005, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.5678846836090088, |
|
"learning_rate": 1.8247455350467496e-05, |
|
"loss": 0.0014, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.009241198189556599, |
|
"learning_rate": 1.8218787841446003e-05, |
|
"loss": 0.0004, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.07260189205408096, |
|
"learning_rate": 1.8189910667480476e-05, |
|
"loss": 0.0015, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.0022115109022706747, |
|
"learning_rate": 1.8160824565240495e-05, |
|
"loss": 0.0029, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.020148636773228645, |
|
"learning_rate": 1.8131530276725514e-05, |
|
"loss": 0.0015, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.2839891314506531, |
|
"learning_rate": 1.8102028549245894e-05, |
|
"loss": 0.0015, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.004473233129829168, |
|
"learning_rate": 1.8072320135403862e-05, |
|
"loss": 0.0012, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.0016079695196822286, |
|
"learning_rate": 1.804240579307431e-05, |
|
"loss": 0.0009, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.015555166639387608, |
|
"learning_rate": 1.8012286285385456e-05, |
|
"loss": 0.0015, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.014301074668765068, |
|
"learning_rate": 1.7981962380699376e-05, |
|
"loss": 0.0006, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.005016946699470282, |
|
"learning_rate": 1.7951434852592406e-05, |
|
"loss": 0.0008, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.006782655604183674, |
|
"learning_rate": 1.79207044798354e-05, |
|
"loss": 0.0004, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.006875937804579735, |
|
"learning_rate": 1.788977204637388e-05, |
|
"loss": 0.0027, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.007184536661952734, |
|
"learning_rate": 1.7858638341308026e-05, |
|
"loss": 0.0013, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.0019093825249001384, |
|
"learning_rate": 1.7827304158872538e-05, |
|
"loss": 0.0014, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 1.7438774108886719, |
|
"learning_rate": 1.779577029841638e-05, |
|
"loss": 0.0027, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.0496426597237587, |
|
"learning_rate": 1.776403756438241e-05, |
|
"loss": 0.0016, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.005185109097510576, |
|
"learning_rate": 1.773210676628682e-05, |
|
"loss": 0.0004, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.0010710001224651933, |
|
"learning_rate": 1.769997871869852e-05, |
|
"loss": 0.0009, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.013586360029876232, |
|
"learning_rate": 1.7667654241218332e-05, |
|
"loss": 0.0028, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.021679196506738663, |
|
"learning_rate": 1.7635134158458095e-05, |
|
"loss": 0.0011, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.2464137077331543, |
|
"learning_rate": 1.7602419300019627e-05, |
|
"loss": 0.0014, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.0015559865860268474, |
|
"learning_rate": 1.7569510500473566e-05, |
|
"loss": 0.0003, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.007820005528628826, |
|
"learning_rate": 1.753640859933806e-05, |
|
"loss": 0.001, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.0021381524857133627, |
|
"learning_rate": 1.7503114441057374e-05, |
|
"loss": 0.0019, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.004830517340451479, |
|
"learning_rate": 1.746962887498034e-05, |
|
"loss": 0.0006, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.004048046190291643, |
|
"learning_rate": 1.743595275533869e-05, |
|
"loss": 0.001, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.0010604397393763065, |
|
"learning_rate": 1.7402086941225246e-05, |
|
"loss": 0.0007, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.10798583179712296, |
|
"learning_rate": 1.736803229657204e-05, |
|
"loss": 0.0016, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.00545172905549407, |
|
"learning_rate": 1.7333789690128252e-05, |
|
"loss": 0.0003, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.051401443779468536, |
|
"learning_rate": 1.7299359995438046e-05, |
|
"loss": 0.0007, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.0022059613838791847, |
|
"learning_rate": 1.7264744090818284e-05, |
|
"loss": 0.0009, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.0031678322702646255, |
|
"learning_rate": 1.7229942859336142e-05, |
|
"loss": 0.0003, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.4575800597667694, |
|
"learning_rate": 1.719495718878655e-05, |
|
"loss": 0.0008, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.0009642325458116829, |
|
"learning_rate": 1.7159787971669586e-05, |
|
"loss": 0.001, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.0024611325934529305, |
|
"learning_rate": 1.712443610516765e-05, |
|
"loss": 0.0007, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.007828389294445515, |
|
"learning_rate": 1.7088902491122636e-05, |
|
"loss": 0.0003, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.0010621993569657207, |
|
"learning_rate": 1.7053188036012885e-05, |
|
"loss": 0.0022, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.003037663409486413, |
|
"learning_rate": 1.7017293650930083e-05, |
|
"loss": 0.0011, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.021871333941817284, |
|
"learning_rate": 1.6981220251555996e-05, |
|
"loss": 0.0005, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.2676248848438263, |
|
"learning_rate": 1.6944968758139144e-05, |
|
"loss": 0.0012, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.002139603951945901, |
|
"learning_rate": 1.6908540095471288e-05, |
|
"loss": 0.0016, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.00880183931440115, |
|
"learning_rate": 1.6871935192863862e-05, |
|
"loss": 0.0007, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.006027602590620518, |
|
"learning_rate": 1.6835154984124266e-05, |
|
"loss": 0.0027, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.002160316100344062, |
|
"learning_rate": 1.6798200407532025e-05, |
|
"loss": 0.0004, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.004634434822946787, |
|
"learning_rate": 1.676107240581488e-05, |
|
"loss": 0.0005, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.013142119161784649, |
|
"learning_rate": 1.6723771926124704e-05, |
|
"loss": 0.0012, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.0017228337237611413, |
|
"learning_rate": 1.6686299920013388e-05, |
|
"loss": 0.001, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.011891527101397514, |
|
"learning_rate": 1.6648657343408517e-05, |
|
"loss": 0.0004, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.03485196456313133, |
|
"learning_rate": 1.661084515658901e-05, |
|
"loss": 0.0008, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.006045693065971136, |
|
"learning_rate": 1.6572864324160617e-05, |
|
"loss": 0.0016, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.0319632813334465, |
|
"learning_rate": 1.6534715815031325e-05, |
|
"loss": 0.001, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.000651439419016242, |
|
"learning_rate": 1.649640060238661e-05, |
|
"loss": 0.0008, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.013289058580994606, |
|
"learning_rate": 1.645791966366464e-05, |
|
"loss": 0.0004, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 1.1903555393218994, |
|
"learning_rate": 1.6419273980531333e-05, |
|
"loss": 0.0009, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.0004181715485174209, |
|
"learning_rate": 1.63804645388553e-05, |
|
"loss": 0.0008, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.0005597823183052242, |
|
"learning_rate": 1.6341492328682703e-05, |
|
"loss": 0.0015, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.003250558627769351, |
|
"learning_rate": 1.6302358344212025e-05, |
|
"loss": 0.0015, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.00633718678727746, |
|
"learning_rate": 1.6263063583768652e-05, |
|
"loss": 0.0007, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.0012848949991166592, |
|
"learning_rate": 1.622360904977946e-05, |
|
"loss": 0.001, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.03510229289531708, |
|
"learning_rate": 1.6183995748747204e-05, |
|
"loss": 0.001, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.00044461427023634315, |
|
"learning_rate": 1.6144224691224868e-05, |
|
"loss": 0.0005, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.056695085018873215, |
|
"learning_rate": 1.6104296891789867e-05, |
|
"loss": 0.0011, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.002499540336430073, |
|
"learning_rate": 1.606421336901818e-05, |
|
"loss": 0.0005, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.00433159526437521, |
|
"learning_rate": 1.6023975145458352e-05, |
|
"loss": 0.0007, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.0020198116544634104, |
|
"learning_rate": 1.5983583247605414e-05, |
|
"loss": 0.0005, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.004110960755497217, |
|
"learning_rate": 1.5943038705874697e-05, |
|
"loss": 0.001, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.0036564720794558525, |
|
"learning_rate": 1.590234255457555e-05, |
|
"loss": 0.0014, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.0010110210860148072, |
|
"learning_rate": 1.5861495831884942e-05, |
|
"loss": 0.0008, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.11147645115852356, |
|
"learning_rate": 1.582049957982099e-05, |
|
"loss": 0.0004, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.002563257934525609, |
|
"learning_rate": 1.5779354844216377e-05, |
|
"loss": 0.0003, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.002472821157425642, |
|
"learning_rate": 1.5738062674691657e-05, |
|
"loss": 0.0003, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.08172155171632767, |
|
"learning_rate": 1.5696624124628495e-05, |
|
"loss": 0.0005, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.009228968061506748, |
|
"learning_rate": 1.5655040251142787e-05, |
|
"loss": 0.0008, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.027170058339834213, |
|
"learning_rate": 1.5613312115057697e-05, |
|
"loss": 0.0002, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.003953781444579363, |
|
"learning_rate": 1.5571440780876588e-05, |
|
"loss": 0.0009, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.2073652297258377, |
|
"learning_rate": 1.5529427316755876e-05, |
|
"loss": 0.001, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.000581290340051055, |
|
"learning_rate": 1.548727279447777e-05, |
|
"loss": 0.0007, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.001082689268514514, |
|
"learning_rate": 1.5444978289422937e-05, |
|
"loss": 0.0011, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.020316652953624725, |
|
"learning_rate": 1.540254488054307e-05, |
|
"loss": 0.0007, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.0026674780528992414, |
|
"learning_rate": 1.5359973650333352e-05, |
|
"loss": 0.0006, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.007332425098866224, |
|
"learning_rate": 1.5317265684804865e-05, |
|
"loss": 0.001, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.0030178299639374018, |
|
"learning_rate": 1.5274422073456853e-05, |
|
"loss": 0.0002, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.0009229824645444751, |
|
"learning_rate": 1.5231443909248956e-05, |
|
"loss": 0.0006, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.000619510596152395, |
|
"learning_rate": 1.5188332288573313e-05, |
|
"loss": 0.0011, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.0273954626172781, |
|
"learning_rate": 1.5145088311226599e-05, |
|
"loss": 0.0004, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.009902927093207836, |
|
"learning_rate": 1.510171308038197e-05, |
|
"loss": 0.0006, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.012104487977921963, |
|
"learning_rate": 1.5058207702560907e-05, |
|
"loss": 0.0004, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.8639243245124817, |
|
"learning_rate": 1.501457328760501e-05, |
|
"loss": 0.0007, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.07374394685029984, |
|
"learning_rate": 1.4970810948647664e-05, |
|
"loss": 0.0007, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.0010848238598555326, |
|
"learning_rate": 1.4926921802085662e-05, |
|
"loss": 0.0008, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.03615230694413185, |
|
"learning_rate": 1.4882906967550708e-05, |
|
"loss": 0.0002, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.00893151480704546, |
|
"learning_rate": 1.4838767567880865e-05, |
|
"loss": 0.0012, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.0023355227895081043, |
|
"learning_rate": 1.479450472909191e-05, |
|
"loss": 0.0023, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.7856515645980835, |
|
"learning_rate": 1.4750119580348601e-05, |
|
"loss": 0.0008, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.0007943073869682848, |
|
"learning_rate": 1.4705613253935886e-05, |
|
"loss": 0.0002, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.000991353183053434, |
|
"learning_rate": 1.4660986885230002e-05, |
|
"loss": 0.0005, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.0023457545321434736, |
|
"learning_rate": 1.4616241612669523e-05, |
|
"loss": 0.0035, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.0007867334061302245, |
|
"learning_rate": 1.4571378577726317e-05, |
|
"loss": 0.0006, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.0013897059252485633, |
|
"learning_rate": 1.4526398924876407e-05, |
|
"loss": 0.0005, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.0020729138050228357, |
|
"learning_rate": 1.4481303801570805e-05, |
|
"loss": 0.0009, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.002158819232136011, |
|
"learning_rate": 1.4436094358206224e-05, |
|
"loss": 0.0005, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.0038599702529609203, |
|
"learning_rate": 1.4390771748095735e-05, |
|
"loss": 0.0007, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.0022886607330292463, |
|
"learning_rate": 1.4345337127439333e-05, |
|
"loss": 0.0002, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.011882874183356762, |
|
"learning_rate": 1.4299791655294461e-05, |
|
"loss": 0.0021, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.011625411920249462, |
|
"learning_rate": 1.4254136493546432e-05, |
|
"loss": 0.0003, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.004132369067519903, |
|
"learning_rate": 1.4208372806878782e-05, |
|
"loss": 0.0004, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.0008625888731330633, |
|
"learning_rate": 1.4162501762743579e-05, |
|
"loss": 0.0013, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.02038603462278843, |
|
"learning_rate": 1.4116524531331616e-05, |
|
"loss": 0.001, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.021469444036483765, |
|
"learning_rate": 1.4070442285542579e-05, |
|
"loss": 0.0004, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.7221536040306091, |
|
"learning_rate": 1.402425620095511e-05, |
|
"loss": 0.0007, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.0006885859766043723, |
|
"learning_rate": 1.3977967455796828e-05, |
|
"loss": 0.0009, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.0010358589934185147, |
|
"learning_rate": 1.393157723091428e-05, |
|
"loss": 0.0003, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.0008139715064316988, |
|
"learning_rate": 1.3885086709742788e-05, |
|
"loss": 0.0005, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.6740529537200928, |
|
"learning_rate": 1.3838497078276288e-05, |
|
"loss": 0.0018, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.0022279045078903437, |
|
"learning_rate": 1.3791809525037057e-05, |
|
"loss": 0.0005, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.0020128132309764624, |
|
"learning_rate": 1.3745025241045414e-05, |
|
"loss": 0.0002, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.000531968311406672, |
|
"learning_rate": 1.3698145419789302e-05, |
|
"loss": 0.0007, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.00039000410470180213, |
|
"learning_rate": 1.3651171257193883e-05, |
|
"loss": 0.0006, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.041364409029483795, |
|
"learning_rate": 1.3604103951590993e-05, |
|
"loss": 0.0003, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.0011203879257664084, |
|
"learning_rate": 1.3556944703688592e-05, |
|
"loss": 0.0003, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.001194770447909832, |
|
"learning_rate": 1.3509694716540135e-05, |
|
"loss": 0.0002, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.002378718461841345, |
|
"learning_rate": 1.3462355195513868e-05, |
|
"loss": 0.0006, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.32328903675079346, |
|
"learning_rate": 1.341492734826209e-05, |
|
"loss": 0.001, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.002817463595420122, |
|
"learning_rate": 1.3367412384690346e-05, |
|
"loss": 0.0017, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.0024801292456686497, |
|
"learning_rate": 1.3319811516926541e-05, |
|
"loss": 0.0005, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.004569421522319317, |
|
"learning_rate": 1.3272125959290059e-05, |
|
"loss": 0.0008, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.1658785045146942, |
|
"learning_rate": 1.3224356928260735e-05, |
|
"loss": 0.0005, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.001219902653247118, |
|
"learning_rate": 1.317650564244787e-05, |
|
"loss": 0.0005, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.002142696175724268, |
|
"learning_rate": 1.3128573322559097e-05, |
|
"loss": 0.0022, |
|
"step": 24900 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 62300, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"total_flos": 1.706415322300416e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|