|
{ |
|
"best_metric": 0.2916193902492523, |
|
"best_model_checkpoint": "./convnext-base/checkpoint-8792", |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 10990, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 12.231302261352539, |
|
"learning_rate": 9.99795725199423e-05, |
|
"loss": 1.8816, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 15.359071731567383, |
|
"learning_rate": 9.991830677104683e-05, |
|
"loss": 0.9793, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 18.62959098815918, |
|
"learning_rate": 9.981625281350813e-05, |
|
"loss": 0.8021, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 15.989691734313965, |
|
"learning_rate": 9.967349403553353e-05, |
|
"loss": 0.6843, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 13.825401306152344, |
|
"learning_rate": 9.949014708520663e-05, |
|
"loss": 0.7125, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 17.203922271728516, |
|
"learning_rate": 9.926636177517427e-05, |
|
"loss": 0.5937, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 13.448897361755371, |
|
"learning_rate": 9.900232096023477e-05, |
|
"loss": 0.6399, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 9.717751502990723, |
|
"learning_rate": 9.869824038792741e-05, |
|
"loss": 0.6142, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 9.631546020507812, |
|
"learning_rate": 9.835436852224525e-05, |
|
"loss": 0.5774, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 14.26085376739502, |
|
"learning_rate": 9.797098634061542e-05, |
|
"loss": 0.5947, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.868389662027833, |
|
"eval_loss": 0.42419883608818054, |
|
"eval_runtime": 109.2833, |
|
"eval_samples_per_second": 23.014, |
|
"eval_steps_per_second": 1.446, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 11.6874361038208, |
|
"learning_rate": 9.754840710431274e-05, |
|
"loss": 0.6197, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 7.372115135192871, |
|
"learning_rate": 9.708697610249406e-05, |
|
"loss": 0.4679, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 12.95173168182373, |
|
"learning_rate": 9.658707037006294e-05, |
|
"loss": 0.4685, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 8.779945373535156, |
|
"learning_rate": 9.604909837959455e-05, |
|
"loss": 0.4719, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 16.67346954345703, |
|
"learning_rate": 9.547349970757317e-05, |
|
"loss": 0.4539, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 20.67113494873047, |
|
"learning_rate": 9.486074467521456e-05, |
|
"loss": 0.455, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 13.069211959838867, |
|
"learning_rate": 9.421133396416686e-05, |
|
"loss": 0.5008, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 16.976469039916992, |
|
"learning_rate": 9.352579820740405e-05, |
|
"loss": 0.4652, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 10.576595306396484, |
|
"learning_rate": 9.280469755564613e-05, |
|
"loss": 0.4612, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 12.198715209960938, |
|
"learning_rate": 9.204862121966044e-05, |
|
"loss": 0.3841, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 12.983030319213867, |
|
"learning_rate": 9.125818698881798e-05, |
|
"loss": 0.4798, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.8727634194831014, |
|
"eval_loss": 0.42417293787002563, |
|
"eval_runtime": 109.7743, |
|
"eval_samples_per_second": 22.911, |
|
"eval_steps_per_second": 1.439, |
|
"step": 2198 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 10.284225463867188, |
|
"learning_rate": 9.043404072629829e-05, |
|
"loss": 0.4405, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 10.147916793823242, |
|
"learning_rate": 8.957685584135502e-05, |
|
"loss": 0.3597, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 17.661441802978516, |
|
"learning_rate": 8.86873327390739e-05, |
|
"loss": 0.3931, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 11.3087797164917, |
|
"learning_rate": 8.776619824807224e-05, |
|
"loss": 0.3019, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 7.633837699890137, |
|
"learning_rate": 8.681420502660786e-05, |
|
"loss": 0.3771, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 13.283084869384766, |
|
"learning_rate": 8.583213094758261e-05, |
|
"loss": 0.3434, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 2.092094898223877, |
|
"learning_rate": 8.482077846294308e-05, |
|
"loss": 0.3502, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 7.117873191833496, |
|
"learning_rate": 8.378097394799773e-05, |
|
"loss": 0.4205, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 4.9123215675354, |
|
"learning_rate": 8.271356702618626e-05, |
|
"loss": 0.359, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 6.197904109954834, |
|
"learning_rate": 8.161942987485303e-05, |
|
"loss": 0.3543, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 15.387776374816895, |
|
"learning_rate": 8.049945651259163e-05, |
|
"loss": 0.3625, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.9077534791252485, |
|
"eval_loss": 0.3553118109703064, |
|
"eval_runtime": 109.5619, |
|
"eval_samples_per_second": 22.955, |
|
"eval_steps_per_second": 1.442, |
|
"step": 3297 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 12.840432167053223, |
|
"learning_rate": 7.935456206874292e-05, |
|
"loss": 0.3386, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 17.542098999023438, |
|
"learning_rate": 7.818568203564374e-05, |
|
"loss": 0.2708, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 2.256465196609497, |
|
"learning_rate": 7.699377150423672e-05, |
|
"loss": 0.2731, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 13.450770378112793, |
|
"learning_rate": 7.577980438366628e-05, |
|
"loss": 0.2903, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 12.049773216247559, |
|
"learning_rate": 7.454477260549828e-05, |
|
"loss": 0.2849, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 5.758849143981934, |
|
"learning_rate": 7.32896853132135e-05, |
|
"loss": 0.2638, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 24.772600173950195, |
|
"learning_rate": 7.201556803763725e-05, |
|
"loss": 0.3027, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 4.821949481964111, |
|
"learning_rate": 7.07234618589791e-05, |
|
"loss": 0.2988, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 2.069714069366455, |
|
"learning_rate": 6.94144225561669e-05, |
|
"loss": 0.2806, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.6293389201164246, |
|
"learning_rate": 6.808951974417078e-05, |
|
"loss": 0.2627, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 8.25537109375, |
|
"learning_rate": 6.674983600002155e-05, |
|
"loss": 0.2777, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.9184890656063618, |
|
"eval_loss": 0.32412779331207275, |
|
"eval_runtime": 109.2619, |
|
"eval_samples_per_second": 23.018, |
|
"eval_steps_per_second": 1.446, |
|
"step": 4396 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 2.3740968704223633, |
|
"learning_rate": 6.539646597823791e-05, |
|
"loss": 0.2558, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 8.975493431091309, |
|
"learning_rate": 6.403051551638508e-05, |
|
"loss": 0.2225, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 7.33804988861084, |
|
"learning_rate": 6.265310073149584e-05, |
|
"loss": 0.2356, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 6.3810834884643555, |
|
"learning_rate": 6.126534710809216e-05, |
|
"loss": 0.2136, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 0.08626483380794525, |
|
"learning_rate": 5.9868388578552734e-05, |
|
"loss": 0.2346, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 8.195035934448242, |
|
"learning_rate": 5.8463366596577706e-05, |
|
"loss": 0.2214, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 2.5158026218414307, |
|
"learning_rate": 5.705142920450777e-05, |
|
"loss": 0.1991, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 1.2365189790725708, |
|
"learning_rate": 5.5633730095259695e-05, |
|
"loss": 0.2105, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 3.150585889816284, |
|
"learning_rate": 5.421142766964474e-05, |
|
"loss": 0.2264, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 21.01511001586914, |
|
"learning_rate": 5.278568408984037e-05, |
|
"loss": 0.213, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"grad_norm": 16.65435791015625, |
|
"learning_rate": 5.135766432978829e-05, |
|
"loss": 0.2368, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.9244532803180915, |
|
"eval_loss": 0.34125077724456787, |
|
"eval_runtime": 104.85, |
|
"eval_samples_per_second": 23.987, |
|
"eval_steps_per_second": 1.507, |
|
"step": 5495 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 4.176883697509766, |
|
"learning_rate": 4.9928535223295344e-05, |
|
"loss": 0.2106, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"grad_norm": 0.6758583188056946, |
|
"learning_rate": 4.849946451061443e-05, |
|
"loss": 0.1445, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 5.19, |
|
"grad_norm": 14.350722312927246, |
|
"learning_rate": 4.707161988428495e-05, |
|
"loss": 0.1812, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 1.7648365497589111, |
|
"learning_rate": 4.564616803501205e-05, |
|
"loss": 0.1644, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 5.37, |
|
"grad_norm": 0.08469072729349136, |
|
"learning_rate": 4.4224273698364735e-05, |
|
"loss": 0.1678, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 5.46, |
|
"grad_norm": 4.34414005279541, |
|
"learning_rate": 4.2807098703071255e-05, |
|
"loss": 0.1434, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 5.55, |
|
"grad_norm": 5.815812110900879, |
|
"learning_rate": 4.1395801021689746e-05, |
|
"loss": 0.188, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 1.176138997077942, |
|
"learning_rate": 3.999153382442995e-05, |
|
"loss": 0.1407, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"grad_norm": 10.93614387512207, |
|
"learning_rate": 3.859544453689853e-05, |
|
"loss": 0.192, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"grad_norm": 1.6799432039260864, |
|
"learning_rate": 3.7208673902538706e-05, |
|
"loss": 0.1552, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 5.91, |
|
"grad_norm": 15.9625883102417, |
|
"learning_rate": 3.583235505052955e-05, |
|
"loss": 0.1635, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.9355864811133201, |
|
"eval_loss": 0.3116415739059448, |
|
"eval_runtime": 105.6961, |
|
"eval_samples_per_second": 23.795, |
|
"eval_steps_per_second": 1.495, |
|
"step": 6594 |
|
}, |
|
{ |
|
"epoch": 6.01, |
|
"grad_norm": 8.467019081115723, |
|
"learning_rate": 3.446761256990723e-05, |
|
"loss": 0.1804, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 6.1, |
|
"grad_norm": 17.132722854614258, |
|
"learning_rate": 3.311556159066397e-05, |
|
"loss": 0.1311, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 6.19, |
|
"grad_norm": 16.02684783935547, |
|
"learning_rate": 3.177730687257639e-05, |
|
"loss": 0.139, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"grad_norm": 13.188374519348145, |
|
"learning_rate": 3.0453941902507177e-05, |
|
"loss": 0.1163, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 6.37, |
|
"grad_norm": 0.007096346002072096, |
|
"learning_rate": 2.914654800091768e-05, |
|
"loss": 0.1489, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 6.46, |
|
"grad_norm": 0.02945764735341072, |
|
"learning_rate": 2.7856193438321986e-05, |
|
"loss": 0.1576, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 6.55, |
|
"grad_norm": 1.3139564990997314, |
|
"learning_rate": 2.6583932562403957e-05, |
|
"loss": 0.1117, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 6.64, |
|
"grad_norm": 4.071590900421143, |
|
"learning_rate": 2.5330804936510373e-05, |
|
"loss": 0.1233, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 6.73, |
|
"grad_norm": 11.481673240661621, |
|
"learning_rate": 2.409783449022475e-05, |
|
"loss": 0.1515, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"grad_norm": 13.781807899475098, |
|
"learning_rate": 2.2886028682715217e-05, |
|
"loss": 0.1426, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"grad_norm": 1.7268832921981812, |
|
"learning_rate": 2.169637767954048e-05, |
|
"loss": 0.1564, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.9359840954274354, |
|
"eval_loss": 0.2996560335159302, |
|
"eval_runtime": 107.9382, |
|
"eval_samples_per_second": 23.3, |
|
"eval_steps_per_second": 1.464, |
|
"step": 7693 |
|
}, |
|
{ |
|
"epoch": 7.01, |
|
"grad_norm": 7.562883377075195, |
|
"learning_rate": 2.052985354358622e-05, |
|
"loss": 0.1016, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 12.858866691589355, |
|
"learning_rate": 1.9387409440793386e-05, |
|
"loss": 0.1068, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 7.19, |
|
"grad_norm": 0.6010186076164246, |
|
"learning_rate": 1.82699788613271e-05, |
|
"loss": 0.1298, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 7.28, |
|
"grad_norm": 3.0228309631347656, |
|
"learning_rate": 1.7178474856822456e-05, |
|
"loss": 0.1028, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"grad_norm": 1.0708427429199219, |
|
"learning_rate": 1.611378929433083e-05, |
|
"loss": 0.1333, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 7.46, |
|
"grad_norm": 0.3288826048374176, |
|
"learning_rate": 1.5076792127576073e-05, |
|
"loss": 0.0871, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 7.55, |
|
"grad_norm": 17.829463958740234, |
|
"learning_rate": 1.4068330686115943e-05, |
|
"loss": 0.1159, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 0.05485250800848007, |
|
"learning_rate": 1.308922898298977e-05, |
|
"loss": 0.1167, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 7.73, |
|
"grad_norm": 0.1920652985572815, |
|
"learning_rate": 1.2140287041418203e-05, |
|
"loss": 0.12, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"grad_norm": 0.21363206207752228, |
|
"learning_rate": 1.1222280241104716e-05, |
|
"loss": 0.1011, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 7.92, |
|
"grad_norm": 0.3207741379737854, |
|
"learning_rate": 1.0335958684673574e-05, |
|
"loss": 0.1082, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.9451292246520875, |
|
"eval_loss": 0.2916193902492523, |
|
"eval_runtime": 106.9736, |
|
"eval_samples_per_second": 23.51, |
|
"eval_steps_per_second": 1.477, |
|
"step": 8792 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"grad_norm": 0.5695350766181946, |
|
"learning_rate": 9.482046584761495e-06, |
|
"loss": 0.0866, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 8.1, |
|
"grad_norm": 0.629421591758728, |
|
"learning_rate": 8.661241672264192e-06, |
|
"loss": 0.1015, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 8.19, |
|
"grad_norm": 0.16660259664058685, |
|
"learning_rate": 7.874214626220899e-06, |
|
"loss": 0.0913, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 4.9127607345581055, |
|
"learning_rate": 7.1216085258031414e-06, |
|
"loss": 0.0762, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 8.37, |
|
"grad_norm": 0.2511623501777649, |
|
"learning_rate": 6.404038324855222e-06, |
|
"loss": 0.1046, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 8.46, |
|
"grad_norm": 0.004342870321124792, |
|
"learning_rate": 5.7220903494159316e-06, |
|
"loss": 0.0764, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"grad_norm": 7.590545177459717, |
|
"learning_rate": 5.076321818632018e-06, |
|
"loss": 0.0868, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 8.64, |
|
"grad_norm": 0.1453438103199005, |
|
"learning_rate": 4.467260389454864e-06, |
|
"loss": 0.0799, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 8.74, |
|
"grad_norm": 0.7127562165260315, |
|
"learning_rate": 3.895403725492402e-06, |
|
"loss": 0.0994, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 8.83, |
|
"grad_norm": 8.983748435974121, |
|
"learning_rate": 3.3612190903686005e-06, |
|
"loss": 0.113, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 7.194179534912109, |
|
"learning_rate": 2.86514296592269e-06, |
|
"loss": 0.1146, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.9431411530815109, |
|
"eval_loss": 0.2962559461593628, |
|
"eval_runtime": 107.8198, |
|
"eval_samples_per_second": 23.326, |
|
"eval_steps_per_second": 1.465, |
|
"step": 9891 |
|
}, |
|
{ |
|
"epoch": 9.01, |
|
"grad_norm": 4.514409065246582, |
|
"learning_rate": 2.407580695560252e-06, |
|
"loss": 0.0801, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 9.1, |
|
"grad_norm": 0.0045217666774988174, |
|
"learning_rate": 1.9889061530473986e-06, |
|
"loss": 0.0669, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 9.19, |
|
"grad_norm": 0.0040066540241241455, |
|
"learning_rate": 1.6094614370188499e-06, |
|
"loss": 0.0772, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 9.28, |
|
"grad_norm": 3.6479694843292236, |
|
"learning_rate": 1.269556591449389e-06, |
|
"loss": 0.0743, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 9.37, |
|
"grad_norm": 0.03315883129835129, |
|
"learning_rate": 9.694693523171927e-07, |
|
"loss": 0.0791, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"grad_norm": 0.5166642665863037, |
|
"learning_rate": 7.094449206659748e-07, |
|
"loss": 0.0764, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"grad_norm": 1.8740558624267578, |
|
"learning_rate": 4.896957622514298e-07, |
|
"loss": 0.1069, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 9.65, |
|
"grad_norm": 9.233416557312012, |
|
"learning_rate": 3.104014339355921e-07, |
|
"loss": 0.0785, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"grad_norm": 5.323319911956787, |
|
"learning_rate": 1.7170843697111304e-07, |
|
"loss": 0.1104, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 9.83, |
|
"grad_norm": 11.701455116271973, |
|
"learning_rate": 7.37300972951771e-08, |
|
"loss": 0.1072, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 9.92, |
|
"grad_norm": 0.15105102956295013, |
|
"learning_rate": 1.654647293098388e-08, |
|
"loss": 0.0801, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.9439363817097416, |
|
"eval_loss": 0.2945658564567566, |
|
"eval_runtime": 107.8038, |
|
"eval_samples_per_second": 23.329, |
|
"eval_steps_per_second": 1.466, |
|
"step": 10990 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 10990, |
|
"total_flos": 4.09349935387607e+19, |
|
"train_loss": 0.269900291632044, |
|
"train_runtime": 17210.4334, |
|
"train_samples_per_second": 10.215, |
|
"train_steps_per_second": 0.639 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 10990, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 4.09349935387607e+19, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|