|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 200, |
|
"global_step": 488, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004098360655737705, |
|
"grad_norm": 0.10885735931524154, |
|
"learning_rate": 2.040816326530612e-06, |
|
"loss": 0.5811, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00819672131147541, |
|
"grad_norm": 0.09543984170626801, |
|
"learning_rate": 4.081632653061224e-06, |
|
"loss": 0.5563, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.012295081967213115, |
|
"grad_norm": 0.10737563422882106, |
|
"learning_rate": 6.122448979591837e-06, |
|
"loss": 0.6131, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.01639344262295082, |
|
"grad_norm": 0.10229836665000518, |
|
"learning_rate": 8.163265306122448e-06, |
|
"loss": 0.5505, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.020491803278688523, |
|
"grad_norm": 0.10892687282476174, |
|
"learning_rate": 1.0204081632653061e-05, |
|
"loss": 0.6017, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.02459016393442623, |
|
"grad_norm": 0.10094785009524743, |
|
"learning_rate": 1.2244897959183674e-05, |
|
"loss": 0.5789, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.028688524590163935, |
|
"grad_norm": 0.10851202963964662, |
|
"learning_rate": 1.4285714285714285e-05, |
|
"loss": 0.5539, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.03278688524590164, |
|
"grad_norm": 0.12753413559171203, |
|
"learning_rate": 1.6326530612244897e-05, |
|
"loss": 0.5965, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.036885245901639344, |
|
"grad_norm": 0.0919789082515372, |
|
"learning_rate": 1.836734693877551e-05, |
|
"loss": 0.4591, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.040983606557377046, |
|
"grad_norm": 0.14112664773884412, |
|
"learning_rate": 2.0408163265306123e-05, |
|
"loss": 0.5436, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.045081967213114756, |
|
"grad_norm": 0.1499072991458913, |
|
"learning_rate": 2.2448979591836737e-05, |
|
"loss": 0.5932, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.04918032786885246, |
|
"grad_norm": 0.16694128474867986, |
|
"learning_rate": 2.448979591836735e-05, |
|
"loss": 0.6225, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.05327868852459016, |
|
"grad_norm": 0.1588995885349821, |
|
"learning_rate": 2.6530612244897963e-05, |
|
"loss": 0.5069, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.05737704918032787, |
|
"grad_norm": 0.18129701774719173, |
|
"learning_rate": 2.857142857142857e-05, |
|
"loss": 0.5482, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.06147540983606557, |
|
"grad_norm": 0.2183572552487636, |
|
"learning_rate": 3.061224489795919e-05, |
|
"loss": 0.5687, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.06557377049180328, |
|
"grad_norm": 0.18334307701927258, |
|
"learning_rate": 3.265306122448979e-05, |
|
"loss": 0.5238, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.06967213114754098, |
|
"grad_norm": 0.217210631302923, |
|
"learning_rate": 3.469387755102041e-05, |
|
"loss": 0.5037, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.07377049180327869, |
|
"grad_norm": 0.2215307239284415, |
|
"learning_rate": 3.673469387755102e-05, |
|
"loss": 0.5432, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0778688524590164, |
|
"grad_norm": 0.17636357532215036, |
|
"learning_rate": 3.8775510204081634e-05, |
|
"loss": 0.539, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.08196721311475409, |
|
"grad_norm": 0.1328655781521927, |
|
"learning_rate": 4.0816326530612245e-05, |
|
"loss": 0.424, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0860655737704918, |
|
"grad_norm": 0.14615441733689596, |
|
"learning_rate": 4.2857142857142856e-05, |
|
"loss": 0.4999, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.09016393442622951, |
|
"grad_norm": 0.14657086694880161, |
|
"learning_rate": 4.4897959183673474e-05, |
|
"loss": 0.4599, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0942622950819672, |
|
"grad_norm": 0.18601844475495982, |
|
"learning_rate": 4.6938775510204086e-05, |
|
"loss": 0.4769, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.09836065573770492, |
|
"grad_norm": 0.14918312992832872, |
|
"learning_rate": 4.89795918367347e-05, |
|
"loss": 0.524, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.10245901639344263, |
|
"grad_norm": 0.12351557422248065, |
|
"learning_rate": 5.102040816326531e-05, |
|
"loss": 0.4415, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.10655737704918032, |
|
"grad_norm": 0.1426398947034758, |
|
"learning_rate": 5.3061224489795926e-05, |
|
"loss": 0.4785, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.11065573770491803, |
|
"grad_norm": 0.10410317953089403, |
|
"learning_rate": 5.510204081632653e-05, |
|
"loss": 0.4654, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.11475409836065574, |
|
"grad_norm": 0.10825565630969619, |
|
"learning_rate": 5.714285714285714e-05, |
|
"loss": 0.3948, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.11885245901639344, |
|
"grad_norm": 0.10198414371268509, |
|
"learning_rate": 5.918367346938776e-05, |
|
"loss": 0.3919, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.12295081967213115, |
|
"grad_norm": 0.11312291852027019, |
|
"learning_rate": 6.122448979591838e-05, |
|
"loss": 0.3686, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.12704918032786885, |
|
"grad_norm": 0.09311311604978195, |
|
"learning_rate": 6.326530612244899e-05, |
|
"loss": 0.3657, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.13114754098360656, |
|
"grad_norm": 0.09879942365923487, |
|
"learning_rate": 6.530612244897959e-05, |
|
"loss": 0.3603, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.13524590163934427, |
|
"grad_norm": 0.10475690388929891, |
|
"learning_rate": 6.73469387755102e-05, |
|
"loss": 0.3936, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.13934426229508196, |
|
"grad_norm": 0.10155707550196463, |
|
"learning_rate": 6.938775510204082e-05, |
|
"loss": 0.3886, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.14344262295081966, |
|
"grad_norm": 0.0926126977092899, |
|
"learning_rate": 7.142857142857143e-05, |
|
"loss": 0.3437, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.14754098360655737, |
|
"grad_norm": 0.10340542063874245, |
|
"learning_rate": 7.346938775510205e-05, |
|
"loss": 0.3732, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.15163934426229508, |
|
"grad_norm": 0.08967895723482798, |
|
"learning_rate": 7.551020408163266e-05, |
|
"loss": 0.3843, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.1557377049180328, |
|
"grad_norm": 0.08021185638722013, |
|
"learning_rate": 7.755102040816327e-05, |
|
"loss": 0.3499, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.1598360655737705, |
|
"grad_norm": 0.08290688475528614, |
|
"learning_rate": 7.959183673469388e-05, |
|
"loss": 0.3199, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.16393442622950818, |
|
"grad_norm": 0.1025320920521767, |
|
"learning_rate": 8.163265306122449e-05, |
|
"loss": 0.3394, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1680327868852459, |
|
"grad_norm": 0.10293043209271259, |
|
"learning_rate": 8.367346938775511e-05, |
|
"loss": 0.3431, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.1721311475409836, |
|
"grad_norm": 0.09025360865879611, |
|
"learning_rate": 8.571428571428571e-05, |
|
"loss": 0.3199, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.1762295081967213, |
|
"grad_norm": 0.0798997188410711, |
|
"learning_rate": 8.775510204081632e-05, |
|
"loss": 0.2775, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.18032786885245902, |
|
"grad_norm": 0.0897092505499119, |
|
"learning_rate": 8.979591836734695e-05, |
|
"loss": 0.3086, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.18442622950819673, |
|
"grad_norm": 0.09029803208911406, |
|
"learning_rate": 9.183673469387756e-05, |
|
"loss": 0.2814, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1885245901639344, |
|
"grad_norm": 0.11450944516178525, |
|
"learning_rate": 9.387755102040817e-05, |
|
"loss": 0.3394, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.19262295081967212, |
|
"grad_norm": 0.09462426859596793, |
|
"learning_rate": 9.591836734693878e-05, |
|
"loss": 0.2795, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.19672131147540983, |
|
"grad_norm": 0.09550025936843255, |
|
"learning_rate": 9.79591836734694e-05, |
|
"loss": 0.2901, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.20081967213114754, |
|
"grad_norm": 0.10107778614583034, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2772, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.20491803278688525, |
|
"grad_norm": 0.10654444377790294, |
|
"learning_rate": 9.999871970850594e-05, |
|
"loss": 0.2801, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.20901639344262296, |
|
"grad_norm": 0.09722504544564385, |
|
"learning_rate": 9.99948788995896e-05, |
|
"loss": 0.2908, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.21311475409836064, |
|
"grad_norm": 0.09187510891379361, |
|
"learning_rate": 9.998847776994521e-05, |
|
"loss": 0.2662, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.21721311475409835, |
|
"grad_norm": 0.09903882074067999, |
|
"learning_rate": 9.99795166473852e-05, |
|
"loss": 0.3133, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.22131147540983606, |
|
"grad_norm": 0.10275668687634087, |
|
"learning_rate": 9.996799599082358e-05, |
|
"loss": 0.2974, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.22540983606557377, |
|
"grad_norm": 0.08972023342314824, |
|
"learning_rate": 9.995391639025224e-05, |
|
"loss": 0.251, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.22950819672131148, |
|
"grad_norm": 0.09464401309394976, |
|
"learning_rate": 9.993727856671093e-05, |
|
"loss": 0.2477, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.2336065573770492, |
|
"grad_norm": 0.10011396555704247, |
|
"learning_rate": 9.99180833722502e-05, |
|
"loss": 0.2493, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.23770491803278687, |
|
"grad_norm": 0.11226340201350676, |
|
"learning_rate": 9.989633178988782e-05, |
|
"loss": 0.263, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.24180327868852458, |
|
"grad_norm": 0.09792926575267794, |
|
"learning_rate": 9.98720249335584e-05, |
|
"loss": 0.2326, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.2459016393442623, |
|
"grad_norm": 0.11495988550279207, |
|
"learning_rate": 9.984516404805643e-05, |
|
"loss": 0.2696, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.0832562354006034, |
|
"learning_rate": 9.981575050897245e-05, |
|
"loss": 0.2272, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.2540983606557377, |
|
"grad_norm": 0.09272122094979285, |
|
"learning_rate": 9.978378582262258e-05, |
|
"loss": 0.2277, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.2581967213114754, |
|
"grad_norm": 0.0926643761595329, |
|
"learning_rate": 9.974927162597147e-05, |
|
"loss": 0.2222, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.26229508196721313, |
|
"grad_norm": 0.09529461804236374, |
|
"learning_rate": 9.971220968654842e-05, |
|
"loss": 0.2147, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.26639344262295084, |
|
"grad_norm": 0.10578872457966666, |
|
"learning_rate": 9.967260190235686e-05, |
|
"loss": 0.2395, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.27049180327868855, |
|
"grad_norm": 0.08716137251502158, |
|
"learning_rate": 9.963045030177716e-05, |
|
"loss": 0.191, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.27459016393442626, |
|
"grad_norm": 0.0933351028764273, |
|
"learning_rate": 9.958575704346275e-05, |
|
"loss": 0.2106, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.2786885245901639, |
|
"grad_norm": 0.09293060930004443, |
|
"learning_rate": 9.953852441622958e-05, |
|
"loss": 0.2045, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.2827868852459016, |
|
"grad_norm": 0.11576298641567612, |
|
"learning_rate": 9.948875483893885e-05, |
|
"loss": 0.2129, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.28688524590163933, |
|
"grad_norm": 0.10041163065836892, |
|
"learning_rate": 9.943645086037325e-05, |
|
"loss": 0.2115, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.29098360655737704, |
|
"grad_norm": 0.11798299846536875, |
|
"learning_rate": 9.938161515910633e-05, |
|
"loss": 0.2481, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.29508196721311475, |
|
"grad_norm": 0.10102114528623451, |
|
"learning_rate": 9.932425054336536e-05, |
|
"loss": 0.2177, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.29918032786885246, |
|
"grad_norm": 0.09976403649412756, |
|
"learning_rate": 9.926435995088751e-05, |
|
"loss": 0.1893, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.30327868852459017, |
|
"grad_norm": 0.11235776084128149, |
|
"learning_rate": 9.920194644876948e-05, |
|
"loss": 0.2272, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.3073770491803279, |
|
"grad_norm": 0.11132462529983536, |
|
"learning_rate": 9.913701323331024e-05, |
|
"loss": 0.1853, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.3114754098360656, |
|
"grad_norm": 0.09930579681430861, |
|
"learning_rate": 9.906956362984754e-05, |
|
"loss": 0.1728, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.3155737704918033, |
|
"grad_norm": 0.09952860431385552, |
|
"learning_rate": 9.899960109258755e-05, |
|
"loss": 0.1731, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.319672131147541, |
|
"grad_norm": 0.12343923967205338, |
|
"learning_rate": 9.89271292044279e-05, |
|
"loss": 0.2246, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.3237704918032787, |
|
"grad_norm": 0.12308319513476172, |
|
"learning_rate": 9.88521516767743e-05, |
|
"loss": 0.2027, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.32786885245901637, |
|
"grad_norm": 0.13087972401563355, |
|
"learning_rate": 9.877467234935035e-05, |
|
"loss": 0.2154, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.3319672131147541, |
|
"grad_norm": 0.1045904395470389, |
|
"learning_rate": 9.869469519000103e-05, |
|
"loss": 0.178, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.3360655737704918, |
|
"grad_norm": 0.14375976436812823, |
|
"learning_rate": 9.861222429448939e-05, |
|
"loss": 0.1973, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.3401639344262295, |
|
"grad_norm": 0.11828496216829337, |
|
"learning_rate": 9.852726388628689e-05, |
|
"loss": 0.1611, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.3442622950819672, |
|
"grad_norm": 0.11221666880904256, |
|
"learning_rate": 9.843981831635704e-05, |
|
"loss": 0.1885, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.3483606557377049, |
|
"grad_norm": 0.12144099494087787, |
|
"learning_rate": 9.834989206293264e-05, |
|
"loss": 0.1896, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.3524590163934426, |
|
"grad_norm": 0.11011171362279154, |
|
"learning_rate": 9.825748973128633e-05, |
|
"loss": 0.1811, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.35655737704918034, |
|
"grad_norm": 0.12633893642514252, |
|
"learning_rate": 9.816261605349493e-05, |
|
"loss": 0.1992, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.36065573770491804, |
|
"grad_norm": 0.12484980936203433, |
|
"learning_rate": 9.806527588819692e-05, |
|
"loss": 0.1956, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.36475409836065575, |
|
"grad_norm": 0.12944319616467373, |
|
"learning_rate": 9.796547422034374e-05, |
|
"loss": 0.1825, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.36885245901639346, |
|
"grad_norm": 0.13039578366779397, |
|
"learning_rate": 9.786321616094444e-05, |
|
"loss": 0.1749, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3729508196721312, |
|
"grad_norm": 0.16458414758299922, |
|
"learning_rate": 9.775850694680397e-05, |
|
"loss": 0.1944, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.3770491803278688, |
|
"grad_norm": 0.13219546608772784, |
|
"learning_rate": 9.765135194025499e-05, |
|
"loss": 0.1771, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.38114754098360654, |
|
"grad_norm": 0.14006629744621393, |
|
"learning_rate": 9.754175662888321e-05, |
|
"loss": 0.1661, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.38524590163934425, |
|
"grad_norm": 0.14470317387177975, |
|
"learning_rate": 9.742972662524644e-05, |
|
"loss": 0.1981, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.38934426229508196, |
|
"grad_norm": 0.13843078519217147, |
|
"learning_rate": 9.731526766658712e-05, |
|
"loss": 0.1735, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.39344262295081966, |
|
"grad_norm": 0.1353061977442374, |
|
"learning_rate": 9.719838561453848e-05, |
|
"loss": 0.1582, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.3975409836065574, |
|
"grad_norm": 0.1357950229192003, |
|
"learning_rate": 9.707908645482442e-05, |
|
"loss": 0.18, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.4016393442622951, |
|
"grad_norm": 0.14178080733722706, |
|
"learning_rate": 9.695737629695292e-05, |
|
"loss": 0.158, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.4057377049180328, |
|
"grad_norm": 0.1515071621238269, |
|
"learning_rate": 9.683326137390314e-05, |
|
"loss": 0.1829, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.4098360655737705, |
|
"grad_norm": 0.12126660667998342, |
|
"learning_rate": 9.670674804180633e-05, |
|
"loss": 0.158, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.4139344262295082, |
|
"grad_norm": 0.15274368057486823, |
|
"learning_rate": 9.657784277962017e-05, |
|
"loss": 0.1912, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.4180327868852459, |
|
"grad_norm": 0.14586661447264065, |
|
"learning_rate": 9.644655218879713e-05, |
|
"loss": 0.1585, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.42213114754098363, |
|
"grad_norm": 0.13918569664369232, |
|
"learning_rate": 9.631288299294625e-05, |
|
"loss": 0.1642, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.4262295081967213, |
|
"grad_norm": 0.13062238504926085, |
|
"learning_rate": 9.617684203748894e-05, |
|
"loss": 0.1719, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.430327868852459, |
|
"grad_norm": 0.13095311012132155, |
|
"learning_rate": 9.603843628930827e-05, |
|
"loss": 0.1366, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.4344262295081967, |
|
"grad_norm": 0.1378443383619325, |
|
"learning_rate": 9.589767283639238e-05, |
|
"loss": 0.1484, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.4385245901639344, |
|
"grad_norm": 0.1588568831317976, |
|
"learning_rate": 9.575455888747129e-05, |
|
"loss": 0.1426, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.4426229508196721, |
|
"grad_norm": 0.14141099713374855, |
|
"learning_rate": 9.560910177164788e-05, |
|
"loss": 0.1442, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.44672131147540983, |
|
"grad_norm": 0.15511211544124912, |
|
"learning_rate": 9.546130893802246e-05, |
|
"loss": 0.1495, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.45081967213114754, |
|
"grad_norm": 0.13807932571830756, |
|
"learning_rate": 9.531118795531136e-05, |
|
"loss": 0.1395, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.45491803278688525, |
|
"grad_norm": 0.15789654305622566, |
|
"learning_rate": 9.515874651145926e-05, |
|
"loss": 0.1871, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.45901639344262296, |
|
"grad_norm": 0.15796719793789657, |
|
"learning_rate": 9.50039924132455e-05, |
|
"loss": 0.1634, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.46311475409836067, |
|
"grad_norm": 0.14938155771658035, |
|
"learning_rate": 9.484693358588435e-05, |
|
"loss": 0.1819, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.4672131147540984, |
|
"grad_norm": 0.18259563364597392, |
|
"learning_rate": 9.468757807261899e-05, |
|
"loss": 0.1749, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.4713114754098361, |
|
"grad_norm": 0.17556492537142315, |
|
"learning_rate": 9.452593403430978e-05, |
|
"loss": 0.1856, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.47540983606557374, |
|
"grad_norm": 0.1602448986838981, |
|
"learning_rate": 9.436200974901619e-05, |
|
"loss": 0.1723, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.47950819672131145, |
|
"grad_norm": 0.1691861443696351, |
|
"learning_rate": 9.419581361157295e-05, |
|
"loss": 0.1927, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.48360655737704916, |
|
"grad_norm": 0.16086945108806114, |
|
"learning_rate": 9.402735413316012e-05, |
|
"loss": 0.1593, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.48770491803278687, |
|
"grad_norm": 0.17451078487412305, |
|
"learning_rate": 9.385663994086717e-05, |
|
"loss": 0.1463, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.4918032786885246, |
|
"grad_norm": 0.1702475093023926, |
|
"learning_rate": 9.368367977725126e-05, |
|
"loss": 0.1757, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4959016393442623, |
|
"grad_norm": 0.16478933308887905, |
|
"learning_rate": 9.350848249988942e-05, |
|
"loss": 0.1488, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.16273089994599113, |
|
"learning_rate": 9.333105708092499e-05, |
|
"loss": 0.1689, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.5040983606557377, |
|
"grad_norm": 0.1409960739640557, |
|
"learning_rate": 9.315141260660823e-05, |
|
"loss": 0.1389, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.5081967213114754, |
|
"grad_norm": 0.15124176821591653, |
|
"learning_rate": 9.296955827683075e-05, |
|
"loss": 0.1521, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.5122950819672131, |
|
"grad_norm": 0.1587729268931332, |
|
"learning_rate": 9.278550340465469e-05, |
|
"loss": 0.1717, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.5163934426229508, |
|
"grad_norm": 0.13589035641321393, |
|
"learning_rate": 9.259925741583549e-05, |
|
"loss": 0.1347, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.5204918032786885, |
|
"grad_norm": 0.13759923174432373, |
|
"learning_rate": 9.241082984833937e-05, |
|
"loss": 0.1378, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.5245901639344263, |
|
"grad_norm": 0.15855139799675505, |
|
"learning_rate": 9.222023035185481e-05, |
|
"loss": 0.1545, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.5286885245901639, |
|
"grad_norm": 0.15545461669283303, |
|
"learning_rate": 9.20274686872984e-05, |
|
"loss": 0.1502, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.5327868852459017, |
|
"grad_norm": 0.15510775952098196, |
|
"learning_rate": 9.183255472631486e-05, |
|
"loss": 0.1338, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5368852459016393, |
|
"grad_norm": 0.2008082231462037, |
|
"learning_rate": 9.163549845077172e-05, |
|
"loss": 0.1671, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.5409836065573771, |
|
"grad_norm": 0.16870678428794317, |
|
"learning_rate": 9.143630995224785e-05, |
|
"loss": 0.1512, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.5450819672131147, |
|
"grad_norm": 0.16476629914112087, |
|
"learning_rate": 9.123499943151692e-05, |
|
"loss": 0.1565, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.5491803278688525, |
|
"grad_norm": 0.17078350636291376, |
|
"learning_rate": 9.10315771980248e-05, |
|
"loss": 0.153, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.5532786885245902, |
|
"grad_norm": 0.15909313462691074, |
|
"learning_rate": 9.082605366936168e-05, |
|
"loss": 0.1392, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.5573770491803278, |
|
"grad_norm": 0.16223350919827026, |
|
"learning_rate": 9.061843937072861e-05, |
|
"loss": 0.1677, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.5614754098360656, |
|
"grad_norm": 0.17376681067395994, |
|
"learning_rate": 9.040874493439839e-05, |
|
"loss": 0.1383, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.5655737704918032, |
|
"grad_norm": 0.1641172902680086, |
|
"learning_rate": 9.019698109917119e-05, |
|
"loss": 0.1668, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.569672131147541, |
|
"grad_norm": 0.15234776968115493, |
|
"learning_rate": 8.998315870982444e-05, |
|
"loss": 0.1218, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.5737704918032787, |
|
"grad_norm": 0.1872820726954906, |
|
"learning_rate": 8.976728871655762e-05, |
|
"loss": 0.1676, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5778688524590164, |
|
"grad_norm": 0.16023446148249956, |
|
"learning_rate": 8.954938217443136e-05, |
|
"loss": 0.1437, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.5819672131147541, |
|
"grad_norm": 0.19633500296640014, |
|
"learning_rate": 8.932945024280139e-05, |
|
"loss": 0.1739, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.5860655737704918, |
|
"grad_norm": 0.1704501675079385, |
|
"learning_rate": 8.910750418474693e-05, |
|
"loss": 0.1537, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.5901639344262295, |
|
"grad_norm": 0.1800207222497811, |
|
"learning_rate": 8.8883555366494e-05, |
|
"loss": 0.1475, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.5942622950819673, |
|
"grad_norm": 0.19139439132267907, |
|
"learning_rate": 8.865761525683329e-05, |
|
"loss": 0.1733, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.5983606557377049, |
|
"grad_norm": 0.1812519339640167, |
|
"learning_rate": 8.842969542653281e-05, |
|
"loss": 0.1416, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.6024590163934426, |
|
"grad_norm": 0.168458137590727, |
|
"learning_rate": 8.819980754774539e-05, |
|
"loss": 0.1408, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.6065573770491803, |
|
"grad_norm": 0.17241876574464468, |
|
"learning_rate": 8.796796339341083e-05, |
|
"loss": 0.1404, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.610655737704918, |
|
"grad_norm": 0.19062819140366627, |
|
"learning_rate": 8.773417483665309e-05, |
|
"loss": 0.1442, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.6147540983606558, |
|
"grad_norm": 0.15972796819072785, |
|
"learning_rate": 8.749845385017221e-05, |
|
"loss": 0.1373, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6188524590163934, |
|
"grad_norm": 0.16127899772885856, |
|
"learning_rate": 8.726081250563114e-05, |
|
"loss": 0.1386, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.6229508196721312, |
|
"grad_norm": 0.18918243767524542, |
|
"learning_rate": 8.702126297303754e-05, |
|
"loss": 0.1563, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.6270491803278688, |
|
"grad_norm": 0.18155811710350117, |
|
"learning_rate": 8.677981752012061e-05, |
|
"loss": 0.1457, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.6311475409836066, |
|
"grad_norm": 0.17684911682878873, |
|
"learning_rate": 8.65364885117027e-05, |
|
"loss": 0.1451, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.6352459016393442, |
|
"grad_norm": 0.17235430323704784, |
|
"learning_rate": 8.629128840906622e-05, |
|
"loss": 0.1346, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.639344262295082, |
|
"grad_norm": 0.18565663737598154, |
|
"learning_rate": 8.604422976931538e-05, |
|
"loss": 0.1447, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.6434426229508197, |
|
"grad_norm": 0.20944409129337707, |
|
"learning_rate": 8.579532524473322e-05, |
|
"loss": 0.1274, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.6475409836065574, |
|
"grad_norm": 0.17754246041424776, |
|
"learning_rate": 8.554458758213352e-05, |
|
"loss": 0.1329, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.6516393442622951, |
|
"grad_norm": 0.17810845667806482, |
|
"learning_rate": 8.529202962220818e-05, |
|
"loss": 0.1361, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.6557377049180327, |
|
"grad_norm": 0.1608582250209417, |
|
"learning_rate": 8.50376642988695e-05, |
|
"loss": 0.1197, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.6598360655737705, |
|
"grad_norm": 0.17739164241648425, |
|
"learning_rate": 8.478150463858788e-05, |
|
"loss": 0.1593, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.6639344262295082, |
|
"grad_norm": 0.1656512173363593, |
|
"learning_rate": 8.452356375972466e-05, |
|
"loss": 0.1339, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.6680327868852459, |
|
"grad_norm": 0.1788275198288848, |
|
"learning_rate": 8.42638548718604e-05, |
|
"loss": 0.1658, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.6721311475409836, |
|
"grad_norm": 0.25206698616186196, |
|
"learning_rate": 8.40023912751183e-05, |
|
"loss": 0.1391, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.6762295081967213, |
|
"grad_norm": 0.17369829387561495, |
|
"learning_rate": 8.373918635948311e-05, |
|
"loss": 0.1332, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.680327868852459, |
|
"grad_norm": 0.16861239567737674, |
|
"learning_rate": 8.34742536041154e-05, |
|
"loss": 0.1263, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.6844262295081968, |
|
"grad_norm": 0.1929344032900212, |
|
"learning_rate": 8.320760657666133e-05, |
|
"loss": 0.1278, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.6885245901639344, |
|
"grad_norm": 0.17898592779538816, |
|
"learning_rate": 8.293925893255771e-05, |
|
"loss": 0.1336, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.6926229508196722, |
|
"grad_norm": 0.19635180613433656, |
|
"learning_rate": 8.266922441433284e-05, |
|
"loss": 0.1453, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.6967213114754098, |
|
"grad_norm": 0.16858384058703138, |
|
"learning_rate": 8.239751685090253e-05, |
|
"loss": 0.1139, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.7008196721311475, |
|
"grad_norm": 0.19395668797648288, |
|
"learning_rate": 8.212415015686213e-05, |
|
"loss": 0.1251, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.7049180327868853, |
|
"grad_norm": 0.19396674290108995, |
|
"learning_rate": 8.184913833177372e-05, |
|
"loss": 0.1569, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.7090163934426229, |
|
"grad_norm": 0.18760069818410455, |
|
"learning_rate": 8.157249545944934e-05, |
|
"loss": 0.1519, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.7131147540983607, |
|
"grad_norm": 0.18143260363629768, |
|
"learning_rate": 8.129423570722964e-05, |
|
"loss": 0.1338, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.7172131147540983, |
|
"grad_norm": 0.1853756885298579, |
|
"learning_rate": 8.101437332525837e-05, |
|
"loss": 0.1359, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.7213114754098361, |
|
"grad_norm": 0.17905848511103561, |
|
"learning_rate": 8.073292264575263e-05, |
|
"loss": 0.1197, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.7254098360655737, |
|
"grad_norm": 0.1693931071052849, |
|
"learning_rate": 8.044989808226885e-05, |
|
"loss": 0.1183, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.7295081967213115, |
|
"grad_norm": 0.19298527281872263, |
|
"learning_rate": 8.016531412896468e-05, |
|
"loss": 0.1372, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.7336065573770492, |
|
"grad_norm": 0.18769205530531333, |
|
"learning_rate": 7.98791853598567e-05, |
|
"loss": 0.1214, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.7377049180327869, |
|
"grad_norm": 0.18199077690644846, |
|
"learning_rate": 7.959152642807411e-05, |
|
"loss": 0.1364, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.7418032786885246, |
|
"grad_norm": 0.18257163156810308, |
|
"learning_rate": 7.930235206510821e-05, |
|
"loss": 0.1209, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.7459016393442623, |
|
"grad_norm": 0.18002525316518497, |
|
"learning_rate": 7.901167708005812e-05, |
|
"loss": 0.1264, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.18519588230164483, |
|
"learning_rate": 7.871951635887228e-05, |
|
"loss": 0.1287, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.7540983606557377, |
|
"grad_norm": 0.1848979809474253, |
|
"learning_rate": 7.842588486358611e-05, |
|
"loss": 0.1268, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.7581967213114754, |
|
"grad_norm": 0.22212797312114452, |
|
"learning_rate": 7.813079763155587e-05, |
|
"loss": 0.149, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.7622950819672131, |
|
"grad_norm": 0.17739677448000538, |
|
"learning_rate": 7.783426977468847e-05, |
|
"loss": 0.1154, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.7663934426229508, |
|
"grad_norm": 0.18735600614636536, |
|
"learning_rate": 7.753631647866764e-05, |
|
"loss": 0.1287, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.7704918032786885, |
|
"grad_norm": 0.1927980118919512, |
|
"learning_rate": 7.723695300217619e-05, |
|
"loss": 0.1442, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.7745901639344263, |
|
"grad_norm": 0.17103075338378548, |
|
"learning_rate": 7.693619467611464e-05, |
|
"loss": 0.1122, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.7786885245901639, |
|
"grad_norm": 0.19226189850908318, |
|
"learning_rate": 7.663405690281602e-05, |
|
"loss": 0.1434, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.7827868852459017, |
|
"grad_norm": 0.2022866385878345, |
|
"learning_rate": 7.633055515525721e-05, |
|
"loss": 0.1387, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.7868852459016393, |
|
"grad_norm": 0.17280425691572138, |
|
"learning_rate": 7.602570497626641e-05, |
|
"loss": 0.1247, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.7909836065573771, |
|
"grad_norm": 0.1953392846226545, |
|
"learning_rate": 7.571952197772733e-05, |
|
"loss": 0.1346, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.7950819672131147, |
|
"grad_norm": 0.1819827161854772, |
|
"learning_rate": 7.541202183977944e-05, |
|
"loss": 0.1343, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.7991803278688525, |
|
"grad_norm": 0.18956673745527652, |
|
"learning_rate": 7.510322031001523e-05, |
|
"loss": 0.1403, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.8032786885245902, |
|
"grad_norm": 0.17710158780741328, |
|
"learning_rate": 7.479313320267356e-05, |
|
"loss": 0.1203, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.8073770491803278, |
|
"grad_norm": 0.17208339576484202, |
|
"learning_rate": 7.448177639782988e-05, |
|
"loss": 0.1021, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.8114754098360656, |
|
"grad_norm": 0.19679964840557646, |
|
"learning_rate": 7.416916584058291e-05, |
|
"loss": 0.1324, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.8155737704918032, |
|
"grad_norm": 0.18208263603534253, |
|
"learning_rate": 7.385531754023818e-05, |
|
"loss": 0.1246, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.819672131147541, |
|
"grad_norm": 0.17562187461702322, |
|
"learning_rate": 7.354024756948805e-05, |
|
"loss": 0.1143, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.819672131147541, |
|
"eval_loss": 0.13085269927978516, |
|
"eval_runtime": 15.8068, |
|
"eval_samples_per_second": 1.265, |
|
"eval_steps_per_second": 0.316, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8237704918032787, |
|
"grad_norm": 0.17110972100979424, |
|
"learning_rate": 7.322397206358868e-05, |
|
"loss": 0.1242, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.8278688524590164, |
|
"grad_norm": 0.18437063894283692, |
|
"learning_rate": 7.290650721953365e-05, |
|
"loss": 0.1102, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.8319672131147541, |
|
"grad_norm": 0.20407404831982653, |
|
"learning_rate": 7.258786929522454e-05, |
|
"loss": 0.1402, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.8360655737704918, |
|
"grad_norm": 0.18871267379413306, |
|
"learning_rate": 7.226807460863834e-05, |
|
"loss": 0.1188, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.8401639344262295, |
|
"grad_norm": 0.1818608820461858, |
|
"learning_rate": 7.194713953699171e-05, |
|
"loss": 0.1271, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.8442622950819673, |
|
"grad_norm": 0.1963272550808891, |
|
"learning_rate": 7.162508051590236e-05, |
|
"loss": 0.1391, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.8483606557377049, |
|
"grad_norm": 0.17373436467685702, |
|
"learning_rate": 7.130191403854728e-05, |
|
"loss": 0.1153, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.8524590163934426, |
|
"grad_norm": 0.19266380445159845, |
|
"learning_rate": 7.097765665481818e-05, |
|
"loss": 0.1227, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.8565573770491803, |
|
"grad_norm": 0.18106479081023585, |
|
"learning_rate": 7.065232497047384e-05, |
|
"loss": 0.1144, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.860655737704918, |
|
"grad_norm": 0.19507966902057225, |
|
"learning_rate": 7.032593564628982e-05, |
|
"loss": 0.1199, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.8647540983606558, |
|
"grad_norm": 0.17636125591066742, |
|
"learning_rate": 6.999850539720514e-05, |
|
"loss": 0.1183, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.8688524590163934, |
|
"grad_norm": 0.19919789500462576, |
|
"learning_rate": 6.967005099146629e-05, |
|
"loss": 0.1342, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.8729508196721312, |
|
"grad_norm": 0.19745218824351637, |
|
"learning_rate": 6.934058924976855e-05, |
|
"loss": 0.1171, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.8770491803278688, |
|
"grad_norm": 0.19184610328273202, |
|
"learning_rate": 6.901013704439456e-05, |
|
"loss": 0.1339, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.8811475409836066, |
|
"grad_norm": 0.1892246102487988, |
|
"learning_rate": 6.86787112983502e-05, |
|
"loss": 0.1251, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.8852459016393442, |
|
"grad_norm": 0.19330230046016966, |
|
"learning_rate": 6.834632898449804e-05, |
|
"loss": 0.1516, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.889344262295082, |
|
"grad_norm": 0.17725855021655712, |
|
"learning_rate": 6.801300712468802e-05, |
|
"loss": 0.1158, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.8934426229508197, |
|
"grad_norm": 0.1897706357212999, |
|
"learning_rate": 6.767876278888585e-05, |
|
"loss": 0.134, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.8975409836065574, |
|
"grad_norm": 0.20290492492222809, |
|
"learning_rate": 6.734361309429871e-05, |
|
"loss": 0.1277, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.9016393442622951, |
|
"grad_norm": 0.18241237783566233, |
|
"learning_rate": 6.700757520449873e-05, |
|
"loss": 0.104, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.9057377049180327, |
|
"grad_norm": 0.19405703008792743, |
|
"learning_rate": 6.6670666328544e-05, |
|
"loss": 0.1405, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.9098360655737705, |
|
"grad_norm": 0.17961334851449848, |
|
"learning_rate": 6.633290372009722e-05, |
|
"loss": 0.119, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.9139344262295082, |
|
"grad_norm": 0.20202960404041628, |
|
"learning_rate": 6.599430467654222e-05, |
|
"loss": 0.1426, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.9180327868852459, |
|
"grad_norm": 0.19111258609207246, |
|
"learning_rate": 6.565488653809797e-05, |
|
"loss": 0.1241, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.9221311475409836, |
|
"grad_norm": 0.18774100787950773, |
|
"learning_rate": 6.531466668693071e-05, |
|
"loss": 0.1217, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.9262295081967213, |
|
"grad_norm": 0.20807525080372385, |
|
"learning_rate": 6.497366254626372e-05, |
|
"loss": 0.1444, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.930327868852459, |
|
"grad_norm": 0.17361340661409613, |
|
"learning_rate": 6.463189157948499e-05, |
|
"loss": 0.1157, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.9344262295081968, |
|
"grad_norm": 0.17667512844881067, |
|
"learning_rate": 6.428937128925303e-05, |
|
"loss": 0.1137, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.9385245901639344, |
|
"grad_norm": 0.19522411577627083, |
|
"learning_rate": 6.394611921660036e-05, |
|
"loss": 0.1142, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.9426229508196722, |
|
"grad_norm": 0.19061253813497658, |
|
"learning_rate": 6.360215294003538e-05, |
|
"loss": 0.1147, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.9467213114754098, |
|
"grad_norm": 0.1889320875988051, |
|
"learning_rate": 6.325749007464201e-05, |
|
"loss": 0.1285, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.9508196721311475, |
|
"grad_norm": 0.20098732591185248, |
|
"learning_rate": 6.291214827117761e-05, |
|
"loss": 0.1502, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.9549180327868853, |
|
"grad_norm": 0.1946804559624313, |
|
"learning_rate": 6.256614521516915e-05, |
|
"loss": 0.1349, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.9590163934426229, |
|
"grad_norm": 0.2146414059717948, |
|
"learning_rate": 6.221949862600741e-05, |
|
"loss": 0.1453, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.9631147540983607, |
|
"grad_norm": 0.19717528258719821, |
|
"learning_rate": 6.187222625603957e-05, |
|
"loss": 0.1193, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.9672131147540983, |
|
"grad_norm": 0.20399136486459374, |
|
"learning_rate": 6.15243458896601e-05, |
|
"loss": 0.1391, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.9713114754098361, |
|
"grad_norm": 0.1926288686899339, |
|
"learning_rate": 6.117587534239992e-05, |
|
"loss": 0.1117, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.9754098360655737, |
|
"grad_norm": 0.19247883868686658, |
|
"learning_rate": 6.082683246001416e-05, |
|
"loss": 0.1252, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.9795081967213115, |
|
"grad_norm": 0.19242584781802, |
|
"learning_rate": 6.047723511756815e-05, |
|
"loss": 0.1282, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.9836065573770492, |
|
"grad_norm": 0.21275544312497674, |
|
"learning_rate": 6.012710121852205e-05, |
|
"loss": 0.1341, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.9877049180327869, |
|
"grad_norm": 0.1804664142069766, |
|
"learning_rate": 5.977644869381398e-05, |
|
"loss": 0.1249, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.9918032786885246, |
|
"grad_norm": 0.18351682596698626, |
|
"learning_rate": 5.9425295500941704e-05, |
|
"loss": 0.1093, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.9959016393442623, |
|
"grad_norm": 0.19849556475127272, |
|
"learning_rate": 5.907365962304308e-05, |
|
"loss": 0.1183, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.2024234506725076, |
|
"learning_rate": 5.872155906797503e-05, |
|
"loss": 0.109, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.0040983606557377, |
|
"grad_norm": 0.17645238784331063, |
|
"learning_rate": 5.83690118673914e-05, |
|
"loss": 0.0887, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.0081967213114753, |
|
"grad_norm": 0.17707033906237873, |
|
"learning_rate": 5.801603607581947e-05, |
|
"loss": 0.0734, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.0122950819672132, |
|
"grad_norm": 0.1879118674622085, |
|
"learning_rate": 5.766264976973538e-05, |
|
"loss": 0.0823, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.0163934426229508, |
|
"grad_norm": 0.20061285756993943, |
|
"learning_rate": 5.73088710466384e-05, |
|
"loss": 0.0866, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.0204918032786885, |
|
"grad_norm": 0.19444493978087618, |
|
"learning_rate": 5.695471802412413e-05, |
|
"loss": 0.0928, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.0245901639344261, |
|
"grad_norm": 0.19996457964971276, |
|
"learning_rate": 5.660020883895668e-05, |
|
"loss": 0.0804, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.028688524590164, |
|
"grad_norm": 0.21425344554956094, |
|
"learning_rate": 5.6245361646139794e-05, |
|
"loss": 0.0934, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.0327868852459017, |
|
"grad_norm": 0.22898083674351027, |
|
"learning_rate": 5.58901946179872e-05, |
|
"loss": 0.0921, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.0368852459016393, |
|
"grad_norm": 0.290540627425331, |
|
"learning_rate": 5.553472594319189e-05, |
|
"loss": 0.0934, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.040983606557377, |
|
"grad_norm": 0.273931261917051, |
|
"learning_rate": 5.5178973825894706e-05, |
|
"loss": 0.1117, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.0450819672131149, |
|
"grad_norm": 0.25430931712778365, |
|
"learning_rate": 5.482295648475203e-05, |
|
"loss": 0.1049, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.0491803278688525, |
|
"grad_norm": 0.24194932502272815, |
|
"learning_rate": 5.446669215200281e-05, |
|
"loss": 0.1146, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.0532786885245902, |
|
"grad_norm": 0.20579577668490157, |
|
"learning_rate": 5.411019907253482e-05, |
|
"loss": 0.0853, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.0573770491803278, |
|
"grad_norm": 0.2185751015406012, |
|
"learning_rate": 5.375349550295038e-05, |
|
"loss": 0.0916, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.0614754098360655, |
|
"grad_norm": 0.21677244042516453, |
|
"learning_rate": 5.339659971063132e-05, |
|
"loss": 0.0905, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.0655737704918034, |
|
"grad_norm": 0.2109428012580277, |
|
"learning_rate": 5.303952997280355e-05, |
|
"loss": 0.0837, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.069672131147541, |
|
"grad_norm": 0.1988852883385447, |
|
"learning_rate": 5.268230457560095e-05, |
|
"loss": 0.0848, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.0737704918032787, |
|
"grad_norm": 0.20185225614786018, |
|
"learning_rate": 5.232494181312906e-05, |
|
"loss": 0.0882, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.0778688524590163, |
|
"grad_norm": 0.21775228450593942, |
|
"learning_rate": 5.196745998652807e-05, |
|
"loss": 0.0886, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.0819672131147542, |
|
"grad_norm": 0.23117101550005217, |
|
"learning_rate": 5.160987740303564e-05, |
|
"loss": 0.1024, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.0860655737704918, |
|
"grad_norm": 0.21437209879423688, |
|
"learning_rate": 5.12522123750494e-05, |
|
"loss": 0.0803, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.0901639344262295, |
|
"grad_norm": 0.2265139026178152, |
|
"learning_rate": 5.0894483219189046e-05, |
|
"loss": 0.0855, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.0942622950819672, |
|
"grad_norm": 0.22534362681306327, |
|
"learning_rate": 5.053670825535842e-05, |
|
"loss": 0.0865, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.098360655737705, |
|
"grad_norm": 0.23493559614373521, |
|
"learning_rate": 5.017890580580723e-05, |
|
"loss": 0.0954, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.1024590163934427, |
|
"grad_norm": 0.22639122256659971, |
|
"learning_rate": 4.982109419419277e-05, |
|
"loss": 0.0861, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.1065573770491803, |
|
"grad_norm": 0.20366634594892993, |
|
"learning_rate": 4.946329174464158e-05, |
|
"loss": 0.0811, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.110655737704918, |
|
"grad_norm": 0.20832107440198078, |
|
"learning_rate": 4.9105516780810946e-05, |
|
"loss": 0.0738, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.1147540983606556, |
|
"grad_norm": 0.24035085358430688, |
|
"learning_rate": 4.8747787624950604e-05, |
|
"loss": 0.1044, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.1188524590163935, |
|
"grad_norm": 0.1941737888109515, |
|
"learning_rate": 4.8390122596964355e-05, |
|
"loss": 0.0824, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.1229508196721312, |
|
"grad_norm": 0.20919358980409994, |
|
"learning_rate": 4.803254001347193e-05, |
|
"loss": 0.0862, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.1270491803278688, |
|
"grad_norm": 0.231047601436464, |
|
"learning_rate": 4.7675058186870944e-05, |
|
"loss": 0.0907, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.1311475409836065, |
|
"grad_norm": 0.21531381241133493, |
|
"learning_rate": 4.7317695424399044e-05, |
|
"loss": 0.0932, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.1352459016393444, |
|
"grad_norm": 0.24193852025077817, |
|
"learning_rate": 4.6960470027196456e-05, |
|
"loss": 0.085, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.139344262295082, |
|
"grad_norm": 0.20448950800710466, |
|
"learning_rate": 4.6603400289368676e-05, |
|
"loss": 0.088, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.1434426229508197, |
|
"grad_norm": 0.2257853631598955, |
|
"learning_rate": 4.624650449704962e-05, |
|
"loss": 0.0914, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.1475409836065573, |
|
"grad_norm": 0.23959875777441447, |
|
"learning_rate": 4.588980092746518e-05, |
|
"loss": 0.1042, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.151639344262295, |
|
"grad_norm": 0.21621082794043103, |
|
"learning_rate": 4.553330784799721e-05, |
|
"loss": 0.0801, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.1557377049180328, |
|
"grad_norm": 0.23154362074682183, |
|
"learning_rate": 4.517704351524798e-05, |
|
"loss": 0.116, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.1598360655737705, |
|
"grad_norm": 0.21749721579038517, |
|
"learning_rate": 4.48210261741053e-05, |
|
"loss": 0.08, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.1639344262295082, |
|
"grad_norm": 0.22621057695770447, |
|
"learning_rate": 4.446527405680812e-05, |
|
"loss": 0.0843, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.1680327868852458, |
|
"grad_norm": 0.21629529519346516, |
|
"learning_rate": 4.410980538201282e-05, |
|
"loss": 0.0783, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.1721311475409837, |
|
"grad_norm": 0.2583962088351615, |
|
"learning_rate": 4.375463835386022e-05, |
|
"loss": 0.0832, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.1762295081967213, |
|
"grad_norm": 0.2550952219128452, |
|
"learning_rate": 4.339979116104334e-05, |
|
"loss": 0.1102, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.180327868852459, |
|
"grad_norm": 0.22046670945180735, |
|
"learning_rate": 4.3045281975875875e-05, |
|
"loss": 0.0687, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.1844262295081966, |
|
"grad_norm": 0.21963199970259858, |
|
"learning_rate": 4.269112895336161e-05, |
|
"loss": 0.0765, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.1885245901639343, |
|
"grad_norm": 0.20549051860300943, |
|
"learning_rate": 4.2337350230264635e-05, |
|
"loss": 0.0773, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.1926229508196722, |
|
"grad_norm": 0.20173502922198425, |
|
"learning_rate": 4.198396392418054e-05, |
|
"loss": 0.0603, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.1967213114754098, |
|
"grad_norm": 0.20756479162187122, |
|
"learning_rate": 4.1630988132608614e-05, |
|
"loss": 0.0698, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.2008196721311475, |
|
"grad_norm": 0.19579961343255203, |
|
"learning_rate": 4.127844093202498e-05, |
|
"loss": 0.0721, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.2049180327868854, |
|
"grad_norm": 0.22373098356251547, |
|
"learning_rate": 4.092634037695694e-05, |
|
"loss": 0.0793, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.209016393442623, |
|
"grad_norm": 0.22555565622897975, |
|
"learning_rate": 4.057470449905831e-05, |
|
"loss": 0.0901, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.2131147540983607, |
|
"grad_norm": 0.22693118605112406, |
|
"learning_rate": 4.022355130618604e-05, |
|
"loss": 0.0826, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.2172131147540983, |
|
"grad_norm": 0.23153954853832903, |
|
"learning_rate": 3.9872898781477954e-05, |
|
"loss": 0.0886, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.221311475409836, |
|
"grad_norm": 0.22314167716303715, |
|
"learning_rate": 3.952276488243186e-05, |
|
"loss": 0.0685, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.2254098360655739, |
|
"grad_norm": 0.22032532570326238, |
|
"learning_rate": 3.917316753998585e-05, |
|
"loss": 0.0736, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.2295081967213115, |
|
"grad_norm": 0.23039264717337804, |
|
"learning_rate": 3.882412465760009e-05, |
|
"loss": 0.0793, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.2336065573770492, |
|
"grad_norm": 0.21565650717021032, |
|
"learning_rate": 3.847565411033992e-05, |
|
"loss": 0.0675, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.2377049180327868, |
|
"grad_norm": 0.22078315656307607, |
|
"learning_rate": 3.8127773743960426e-05, |
|
"loss": 0.0825, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.2418032786885247, |
|
"grad_norm": 0.277778960283362, |
|
"learning_rate": 3.7780501373992596e-05, |
|
"loss": 0.0909, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.2459016393442623, |
|
"grad_norm": 0.25457025493811986, |
|
"learning_rate": 3.7433854784830854e-05, |
|
"loss": 0.0844, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.230488602289965, |
|
"learning_rate": 3.7087851728822405e-05, |
|
"loss": 0.0732, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.2540983606557377, |
|
"grad_norm": 0.2769088300832485, |
|
"learning_rate": 3.674250992535802e-05, |
|
"loss": 0.1133, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.2581967213114753, |
|
"grad_norm": 0.25238428313016165, |
|
"learning_rate": 3.639784705996463e-05, |
|
"loss": 0.1028, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.2622950819672132, |
|
"grad_norm": 0.22474085546390393, |
|
"learning_rate": 3.6053880783399654e-05, |
|
"loss": 0.0748, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.2663934426229508, |
|
"grad_norm": 0.2306185209891483, |
|
"learning_rate": 3.5710628710747e-05, |
|
"loss": 0.0909, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.2704918032786885, |
|
"grad_norm": 0.21441826461789973, |
|
"learning_rate": 3.5368108420515036e-05, |
|
"loss": 0.0827, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.2745901639344264, |
|
"grad_norm": 0.21674907407231, |
|
"learning_rate": 3.5026337453736314e-05, |
|
"loss": 0.0843, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.278688524590164, |
|
"grad_norm": 0.23693824083399773, |
|
"learning_rate": 3.4685333313069315e-05, |
|
"loss": 0.0908, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.2827868852459017, |
|
"grad_norm": 0.22068000066238444, |
|
"learning_rate": 3.4345113461902055e-05, |
|
"loss": 0.0859, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.2868852459016393, |
|
"grad_norm": 0.2141000026469413, |
|
"learning_rate": 3.400569532345781e-05, |
|
"loss": 0.0837, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.290983606557377, |
|
"grad_norm": 0.20219236400730675, |
|
"learning_rate": 3.3667096279902794e-05, |
|
"loss": 0.0708, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.2950819672131146, |
|
"grad_norm": 0.20804695200056297, |
|
"learning_rate": 3.3329333671456024e-05, |
|
"loss": 0.0739, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.2991803278688525, |
|
"grad_norm": 0.2343703975098053, |
|
"learning_rate": 3.2992424795501284e-05, |
|
"loss": 0.089, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.3032786885245902, |
|
"grad_norm": 0.21489328846354372, |
|
"learning_rate": 3.26563869057013e-05, |
|
"loss": 0.0804, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.3073770491803278, |
|
"grad_norm": 0.2198465103470338, |
|
"learning_rate": 3.232123721111415e-05, |
|
"loss": 0.0816, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.3114754098360657, |
|
"grad_norm": 0.2484592052479722, |
|
"learning_rate": 3.198699287531198e-05, |
|
"loss": 0.1007, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.3155737704918034, |
|
"grad_norm": 0.25565736054210597, |
|
"learning_rate": 3.165367101550197e-05, |
|
"loss": 0.1016, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.319672131147541, |
|
"grad_norm": 0.19989451433971764, |
|
"learning_rate": 3.13212887016498e-05, |
|
"loss": 0.0698, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.3237704918032787, |
|
"grad_norm": 0.21152406721818276, |
|
"learning_rate": 3.098986295560545e-05, |
|
"loss": 0.092, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.3278688524590163, |
|
"grad_norm": 0.24438562628885818, |
|
"learning_rate": 3.0659410750231454e-05, |
|
"loss": 0.0965, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.331967213114754, |
|
"grad_norm": 0.21428460598845647, |
|
"learning_rate": 3.032994900853372e-05, |
|
"loss": 0.0861, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.3360655737704918, |
|
"grad_norm": 0.2459292100893367, |
|
"learning_rate": 3.0001494602794867e-05, |
|
"loss": 0.1088, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 1.3401639344262295, |
|
"grad_norm": 0.22552854130941782, |
|
"learning_rate": 2.967406435371018e-05, |
|
"loss": 0.0862, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.3442622950819672, |
|
"grad_norm": 0.22398158633984547, |
|
"learning_rate": 2.934767502952616e-05, |
|
"loss": 0.084, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.348360655737705, |
|
"grad_norm": 0.22664651281034953, |
|
"learning_rate": 2.9022343345181846e-05, |
|
"loss": 0.0748, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.3524590163934427, |
|
"grad_norm": 0.22062475345873886, |
|
"learning_rate": 2.8698085961452724e-05, |
|
"loss": 0.087, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.3565573770491803, |
|
"grad_norm": 0.22114071809701424, |
|
"learning_rate": 2.8374919484097663e-05, |
|
"loss": 0.0798, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 1.360655737704918, |
|
"grad_norm": 0.2318013572074171, |
|
"learning_rate": 2.8052860463008295e-05, |
|
"loss": 0.0778, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.3647540983606556, |
|
"grad_norm": 0.21592005617557827, |
|
"learning_rate": 2.7731925391361673e-05, |
|
"loss": 0.0699, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.3688524590163935, |
|
"grad_norm": 0.22284005643229798, |
|
"learning_rate": 2.741213070477545e-05, |
|
"loss": 0.0688, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 1.3729508196721312, |
|
"grad_norm": 0.24331228121733403, |
|
"learning_rate": 2.7093492780466355e-05, |
|
"loss": 0.1021, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.3770491803278688, |
|
"grad_norm": 0.22916717485660554, |
|
"learning_rate": 2.6776027936411318e-05, |
|
"loss": 0.0787, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.3811475409836065, |
|
"grad_norm": 0.2486820570939316, |
|
"learning_rate": 2.6459752430511952e-05, |
|
"loss": 0.0968, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.3852459016393444, |
|
"grad_norm": 0.21399877551380306, |
|
"learning_rate": 2.6144682459761814e-05, |
|
"loss": 0.0689, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 1.389344262295082, |
|
"grad_norm": 0.23817514427496975, |
|
"learning_rate": 2.58308341594171e-05, |
|
"loss": 0.076, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.3934426229508197, |
|
"grad_norm": 0.19580850481427564, |
|
"learning_rate": 2.5518223602170134e-05, |
|
"loss": 0.066, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.3975409836065573, |
|
"grad_norm": 0.23341882458103203, |
|
"learning_rate": 2.5206866797326446e-05, |
|
"loss": 0.0795, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.401639344262295, |
|
"grad_norm": 0.24905408203075094, |
|
"learning_rate": 2.4896779689984783e-05, |
|
"loss": 0.0979, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.4057377049180328, |
|
"grad_norm": 0.2373253447398963, |
|
"learning_rate": 2.4587978160220563e-05, |
|
"loss": 0.0849, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 1.4098360655737705, |
|
"grad_norm": 0.23312222655260098, |
|
"learning_rate": 2.4280478022272696e-05, |
|
"loss": 0.0833, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.4139344262295082, |
|
"grad_norm": 0.24464678291649106, |
|
"learning_rate": 2.3974295023733577e-05, |
|
"loss": 0.0924, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.418032786885246, |
|
"grad_norm": 0.2423574669974183, |
|
"learning_rate": 2.3669444844742812e-05, |
|
"loss": 0.0876, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.4221311475409837, |
|
"grad_norm": 0.24768803714943216, |
|
"learning_rate": 2.336594309718399e-05, |
|
"loss": 0.0911, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.4262295081967213, |
|
"grad_norm": 0.2353256965607498, |
|
"learning_rate": 2.3063805323885383e-05, |
|
"loss": 0.0791, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.430327868852459, |
|
"grad_norm": 0.2184971836745015, |
|
"learning_rate": 2.276304699782381e-05, |
|
"loss": 0.071, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.4344262295081966, |
|
"grad_norm": 0.22589388512628691, |
|
"learning_rate": 2.2463683521332374e-05, |
|
"loss": 0.0883, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.4385245901639343, |
|
"grad_norm": 0.23861282261739694, |
|
"learning_rate": 2.2165730225311532e-05, |
|
"loss": 0.0949, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.4426229508196722, |
|
"grad_norm": 0.21125222504207164, |
|
"learning_rate": 2.1869202368444146e-05, |
|
"loss": 0.0669, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.4467213114754098, |
|
"grad_norm": 0.23839842143172482, |
|
"learning_rate": 2.1574115136413892e-05, |
|
"loss": 0.1115, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.4508196721311475, |
|
"grad_norm": 0.23021878248583671, |
|
"learning_rate": 2.128048364112774e-05, |
|
"loss": 0.0824, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.4549180327868854, |
|
"grad_norm": 0.21922671215563078, |
|
"learning_rate": 2.098832291994188e-05, |
|
"loss": 0.0805, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.459016393442623, |
|
"grad_norm": 0.22973971719352562, |
|
"learning_rate": 2.0697647934891807e-05, |
|
"loss": 0.0809, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.4631147540983607, |
|
"grad_norm": 0.23888720332729385, |
|
"learning_rate": 2.0408473571925908e-05, |
|
"loss": 0.0795, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.4672131147540983, |
|
"grad_norm": 0.20105531202188945, |
|
"learning_rate": 2.0120814640143314e-05, |
|
"loss": 0.0738, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.471311475409836, |
|
"grad_norm": 0.20815592735234387, |
|
"learning_rate": 1.983468587103533e-05, |
|
"loss": 0.0747, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.4754098360655736, |
|
"grad_norm": 0.21875945763540222, |
|
"learning_rate": 1.9550101917731166e-05, |
|
"loss": 0.0782, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.4795081967213115, |
|
"grad_norm": 0.23652904185716042, |
|
"learning_rate": 1.9267077354247394e-05, |
|
"loss": 0.0785, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.4836065573770492, |
|
"grad_norm": 0.22747137570450954, |
|
"learning_rate": 1.8985626674741643e-05, |
|
"loss": 0.1122, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.4877049180327868, |
|
"grad_norm": 0.2331279262069605, |
|
"learning_rate": 1.8705764292770383e-05, |
|
"loss": 0.0915, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.4918032786885247, |
|
"grad_norm": 0.22030724266502305, |
|
"learning_rate": 1.8427504540550677e-05, |
|
"loss": 0.0665, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.4959016393442623, |
|
"grad_norm": 0.22505977083225506, |
|
"learning_rate": 1.8150861668226304e-05, |
|
"loss": 0.075, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.21199992455583846, |
|
"learning_rate": 1.7875849843137893e-05, |
|
"loss": 0.0811, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.5040983606557377, |
|
"grad_norm": 0.21810290159002446, |
|
"learning_rate": 1.760248314909747e-05, |
|
"loss": 0.0756, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.5081967213114753, |
|
"grad_norm": 0.2558311827574221, |
|
"learning_rate": 1.7330775585667164e-05, |
|
"loss": 0.0843, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.512295081967213, |
|
"grad_norm": 0.2233574367138228, |
|
"learning_rate": 1.7060741067442288e-05, |
|
"loss": 0.0762, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.5163934426229508, |
|
"grad_norm": 0.21907681063831233, |
|
"learning_rate": 1.679239342333867e-05, |
|
"loss": 0.0741, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.5204918032786885, |
|
"grad_norm": 0.2386759157009457, |
|
"learning_rate": 1.6525746395884605e-05, |
|
"loss": 0.0922, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.5245901639344264, |
|
"grad_norm": 0.22163173702404845, |
|
"learning_rate": 1.626081364051691e-05, |
|
"loss": 0.0783, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.528688524590164, |
|
"grad_norm": 0.2610954803411835, |
|
"learning_rate": 1.599760872488171e-05, |
|
"loss": 0.0724, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.5327868852459017, |
|
"grad_norm": 0.2439120392285164, |
|
"learning_rate": 1.573614512813961e-05, |
|
"loss": 0.0787, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.5368852459016393, |
|
"grad_norm": 0.22669981293370467, |
|
"learning_rate": 1.5476436240275344e-05, |
|
"loss": 0.0713, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.540983606557377, |
|
"grad_norm": 0.22032890300346933, |
|
"learning_rate": 1.5218495361412145e-05, |
|
"loss": 0.0699, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.5450819672131146, |
|
"grad_norm": 0.23028230012736722, |
|
"learning_rate": 1.4962335701130509e-05, |
|
"loss": 0.0807, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.5491803278688525, |
|
"grad_norm": 0.2357458200653599, |
|
"learning_rate": 1.470797037779183e-05, |
|
"loss": 0.0833, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.5532786885245902, |
|
"grad_norm": 0.23495810636275852, |
|
"learning_rate": 1.4455412417866476e-05, |
|
"loss": 0.0908, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.5573770491803278, |
|
"grad_norm": 0.24624217602429513, |
|
"learning_rate": 1.4204674755266789e-05, |
|
"loss": 0.0884, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.5614754098360657, |
|
"grad_norm": 0.24094166363923858, |
|
"learning_rate": 1.3955770230684611e-05, |
|
"loss": 0.0817, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.5655737704918034, |
|
"grad_norm": 0.2304769705042891, |
|
"learning_rate": 1.3708711590933792e-05, |
|
"loss": 0.0774, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.569672131147541, |
|
"grad_norm": 0.2377015225778547, |
|
"learning_rate": 1.3463511488297304e-05, |
|
"loss": 0.0961, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.5737704918032787, |
|
"grad_norm": 0.25107490882186423, |
|
"learning_rate": 1.3220182479879406e-05, |
|
"loss": 0.0939, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.5778688524590163, |
|
"grad_norm": 0.23260968536054955, |
|
"learning_rate": 1.2978737026962456e-05, |
|
"loss": 0.0837, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.581967213114754, |
|
"grad_norm": 0.23108056293448653, |
|
"learning_rate": 1.2739187494368877e-05, |
|
"loss": 0.0875, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.5860655737704918, |
|
"grad_norm": 0.23368547866832753, |
|
"learning_rate": 1.2501546149827792e-05, |
|
"loss": 0.0854, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.5901639344262295, |
|
"grad_norm": 0.24320292213137776, |
|
"learning_rate": 1.2265825163346911e-05, |
|
"loss": 0.0893, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.5942622950819674, |
|
"grad_norm": 0.21570274646869986, |
|
"learning_rate": 1.2032036606589175e-05, |
|
"loss": 0.0685, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.598360655737705, |
|
"grad_norm": 0.22099512747621203, |
|
"learning_rate": 1.1800192452254627e-05, |
|
"loss": 0.0897, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.6024590163934427, |
|
"grad_norm": 0.23793535397292723, |
|
"learning_rate": 1.15703045734672e-05, |
|
"loss": 0.0957, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.6065573770491803, |
|
"grad_norm": 0.24731877068153732, |
|
"learning_rate": 1.1342384743166723e-05, |
|
"loss": 0.1042, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.610655737704918, |
|
"grad_norm": 0.22927608698188615, |
|
"learning_rate": 1.1116444633506019e-05, |
|
"loss": 0.0716, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.6147540983606556, |
|
"grad_norm": 0.23857230000679394, |
|
"learning_rate": 1.0892495815253085e-05, |
|
"loss": 0.0768, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.6188524590163933, |
|
"grad_norm": 0.24372403659065045, |
|
"learning_rate": 1.0670549757198633e-05, |
|
"loss": 0.0834, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.6229508196721312, |
|
"grad_norm": 0.22414275748214862, |
|
"learning_rate": 1.0450617825568642e-05, |
|
"loss": 0.0786, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.6270491803278688, |
|
"grad_norm": 0.21841238944069888, |
|
"learning_rate": 1.0232711283442403e-05, |
|
"loss": 0.075, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.6311475409836067, |
|
"grad_norm": 0.23509123920348915, |
|
"learning_rate": 1.0016841290175572e-05, |
|
"loss": 0.077, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.6352459016393444, |
|
"grad_norm": 0.2357206152062922, |
|
"learning_rate": 9.803018900828837e-06, |
|
"loss": 0.0879, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.639344262295082, |
|
"grad_norm": 0.2192628484070166, |
|
"learning_rate": 9.591255065601612e-06, |
|
"loss": 0.0814, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.639344262295082, |
|
"eval_loss": 0.11391327530145645, |
|
"eval_runtime": 15.815, |
|
"eval_samples_per_second": 1.265, |
|
"eval_steps_per_second": 0.316, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.6434426229508197, |
|
"grad_norm": 0.22464264802842762, |
|
"learning_rate": 9.381560629271407e-06, |
|
"loss": 0.0816, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.6475409836065573, |
|
"grad_norm": 0.261870721310818, |
|
"learning_rate": 9.173946330638328e-06, |
|
"loss": 0.0816, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.651639344262295, |
|
"grad_norm": 0.2149823369960777, |
|
"learning_rate": 8.968422801975223e-06, |
|
"loss": 0.0705, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.6557377049180326, |
|
"grad_norm": 0.2474828411009011, |
|
"learning_rate": 8.765000568483084e-06, |
|
"loss": 0.097, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.6598360655737705, |
|
"grad_norm": 0.22154288128693178, |
|
"learning_rate": 8.563690047752148e-06, |
|
"loss": 0.0765, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.6639344262295082, |
|
"grad_norm": 0.23677782055144925, |
|
"learning_rate": 8.364501549228288e-06, |
|
"loss": 0.0991, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.668032786885246, |
|
"grad_norm": 0.2197162495146224, |
|
"learning_rate": 8.167445273685143e-06, |
|
"loss": 0.062, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.6721311475409837, |
|
"grad_norm": 0.23637971545996592, |
|
"learning_rate": 7.97253131270162e-06, |
|
"loss": 0.0873, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.6762295081967213, |
|
"grad_norm": 0.22398519500886505, |
|
"learning_rate": 7.779769648145201e-06, |
|
"loss": 0.0712, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.680327868852459, |
|
"grad_norm": 0.23937512798359475, |
|
"learning_rate": 7.589170151660657e-06, |
|
"loss": 0.0847, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.6844262295081966, |
|
"grad_norm": 0.22939957435275588, |
|
"learning_rate": 7.400742584164533e-06, |
|
"loss": 0.0832, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.6885245901639343, |
|
"grad_norm": 0.22699971772595506, |
|
"learning_rate": 7.2144965953453385e-06, |
|
"loss": 0.0813, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.6926229508196722, |
|
"grad_norm": 0.23071893713415165, |
|
"learning_rate": 7.030441723169251e-06, |
|
"loss": 0.0816, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.6967213114754098, |
|
"grad_norm": 0.23702923002235138, |
|
"learning_rate": 6.848587393391792e-06, |
|
"loss": 0.0853, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.7008196721311475, |
|
"grad_norm": 0.22339505663904513, |
|
"learning_rate": 6.668942919074994e-06, |
|
"loss": 0.0717, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.7049180327868854, |
|
"grad_norm": 0.24483844945562266, |
|
"learning_rate": 6.491517500110589e-06, |
|
"loss": 0.0945, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.709016393442623, |
|
"grad_norm": 0.25539399462778006, |
|
"learning_rate": 6.31632022274874e-06, |
|
"loss": 0.0979, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.7131147540983607, |
|
"grad_norm": 0.24626473814812097, |
|
"learning_rate": 6.1433600591328296e-06, |
|
"loss": 0.0889, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.7172131147540983, |
|
"grad_norm": 0.20413994594867838, |
|
"learning_rate": 5.972645866839882e-06, |
|
"loss": 0.0643, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.721311475409836, |
|
"grad_norm": 0.22121713216766659, |
|
"learning_rate": 5.804186388427052e-06, |
|
"loss": 0.0909, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.7254098360655736, |
|
"grad_norm": 0.23283310959693312, |
|
"learning_rate": 5.637990250983821e-06, |
|
"loss": 0.0906, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.7295081967213115, |
|
"grad_norm": 0.2443306803824525, |
|
"learning_rate": 5.4740659656902284e-06, |
|
"loss": 0.0811, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.7336065573770492, |
|
"grad_norm": 0.22756772215296253, |
|
"learning_rate": 5.312421927381017e-06, |
|
"loss": 0.0827, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.737704918032787, |
|
"grad_norm": 0.2228814962421585, |
|
"learning_rate": 5.153066414115659e-06, |
|
"loss": 0.0669, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.7418032786885247, |
|
"grad_norm": 0.2661863386779688, |
|
"learning_rate": 4.9960075867544974e-06, |
|
"loss": 0.1088, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.7459016393442623, |
|
"grad_norm": 0.2561903847946149, |
|
"learning_rate": 4.841253488540748e-06, |
|
"loss": 0.0945, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.2267206036798771, |
|
"learning_rate": 4.688812044688645e-06, |
|
"loss": 0.0719, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.7540983606557377, |
|
"grad_norm": 0.22796746097335438, |
|
"learning_rate": 4.53869106197754e-06, |
|
"loss": 0.068, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.7581967213114753, |
|
"grad_norm": 0.26106758617331294, |
|
"learning_rate": 4.390898228352131e-06, |
|
"loss": 0.1061, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.762295081967213, |
|
"grad_norm": 0.247381900560375, |
|
"learning_rate": 4.245441112528714e-06, |
|
"loss": 0.0908, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.7663934426229508, |
|
"grad_norm": 0.24294560089179307, |
|
"learning_rate": 4.1023271636076335e-06, |
|
"loss": 0.0926, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.7704918032786885, |
|
"grad_norm": 0.24215296053576252, |
|
"learning_rate": 3.961563710691729e-06, |
|
"loss": 0.0779, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.7745901639344264, |
|
"grad_norm": 0.23374549340723078, |
|
"learning_rate": 3.823157962511076e-06, |
|
"loss": 0.0728, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.778688524590164, |
|
"grad_norm": 0.24289701548347858, |
|
"learning_rate": 3.687117007053742e-06, |
|
"loss": 0.0867, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.7827868852459017, |
|
"grad_norm": 0.2522452950320934, |
|
"learning_rate": 3.553447811202876e-06, |
|
"loss": 0.0929, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.7868852459016393, |
|
"grad_norm": 0.22114275777413536, |
|
"learning_rate": 3.4221572203798234e-06, |
|
"loss": 0.0743, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.790983606557377, |
|
"grad_norm": 0.21366511169828778, |
|
"learning_rate": 3.293251958193683e-06, |
|
"loss": 0.0706, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.7950819672131146, |
|
"grad_norm": 0.21948702570848402, |
|
"learning_rate": 3.1667386260968657e-06, |
|
"loss": 0.0681, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.7991803278688525, |
|
"grad_norm": 0.23774843706100457, |
|
"learning_rate": 3.0426237030470984e-06, |
|
"loss": 0.0895, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.8032786885245902, |
|
"grad_norm": 0.22399865620122292, |
|
"learning_rate": 2.9209135451755854e-06, |
|
"loss": 0.0667, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.8073770491803278, |
|
"grad_norm": 0.21426419849873798, |
|
"learning_rate": 2.8016143854615207e-06, |
|
"loss": 0.0811, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.8114754098360657, |
|
"grad_norm": 0.21259033673347263, |
|
"learning_rate": 2.6847323334128927e-06, |
|
"loss": 0.0676, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.8155737704918034, |
|
"grad_norm": 0.24929681837203466, |
|
"learning_rate": 2.570273374753568e-06, |
|
"loss": 0.0808, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.819672131147541, |
|
"grad_norm": 0.23655301701967882, |
|
"learning_rate": 2.458243371116803e-06, |
|
"loss": 0.0716, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.8237704918032787, |
|
"grad_norm": 0.21403845251592035, |
|
"learning_rate": 2.3486480597450233e-06, |
|
"loss": 0.0666, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.8278688524590163, |
|
"grad_norm": 0.23585733424654748, |
|
"learning_rate": 2.2414930531960366e-06, |
|
"loss": 0.0804, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.831967213114754, |
|
"grad_norm": 0.23393497320481016, |
|
"learning_rate": 2.1367838390555615e-06, |
|
"loss": 0.0822, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.8360655737704918, |
|
"grad_norm": 0.22648840130944828, |
|
"learning_rate": 2.0345257796562657e-06, |
|
"loss": 0.0697, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.8401639344262295, |
|
"grad_norm": 0.26501948838229905, |
|
"learning_rate": 1.9347241118030823e-06, |
|
"loss": 0.1039, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.8442622950819674, |
|
"grad_norm": 0.24866758119966959, |
|
"learning_rate": 1.8373839465050779e-06, |
|
"loss": 0.0945, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.848360655737705, |
|
"grad_norm": 0.23782724905733918, |
|
"learning_rate": 1.7425102687136708e-06, |
|
"loss": 0.0717, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.8524590163934427, |
|
"grad_norm": 0.24694288793510444, |
|
"learning_rate": 1.65010793706738e-06, |
|
"loss": 0.0846, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.8565573770491803, |
|
"grad_norm": 0.23687783352322112, |
|
"learning_rate": 1.5601816836429584e-06, |
|
"loss": 0.0864, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.860655737704918, |
|
"grad_norm": 0.22117276028464855, |
|
"learning_rate": 1.4727361137131136e-06, |
|
"loss": 0.0769, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.8647540983606556, |
|
"grad_norm": 0.22732784122754088, |
|
"learning_rate": 1.3877757055106132e-06, |
|
"loss": 0.0752, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.8688524590163933, |
|
"grad_norm": 0.23983266256314234, |
|
"learning_rate": 1.3053048099989807e-06, |
|
"loss": 0.0817, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.8729508196721312, |
|
"grad_norm": 0.29876460611279054, |
|
"learning_rate": 1.2253276506496547e-06, |
|
"loss": 0.0928, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.8770491803278688, |
|
"grad_norm": 0.25065773079557147, |
|
"learning_rate": 1.1478483232257088e-06, |
|
"loss": 0.0685, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.8811475409836067, |
|
"grad_norm": 0.22154067972076463, |
|
"learning_rate": 1.0728707955721006e-06, |
|
"loss": 0.0677, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.8852459016393444, |
|
"grad_norm": 0.24820921053712106, |
|
"learning_rate": 1.000398907412453e-06, |
|
"loss": 0.0807, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.889344262295082, |
|
"grad_norm": 0.23265461838690038, |
|
"learning_rate": 9.304363701524654e-07, |
|
"loss": 0.076, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 1.8934426229508197, |
|
"grad_norm": 0.2212340843550487, |
|
"learning_rate": 8.629867666897773e-07, |
|
"loss": 0.0735, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.8975409836065573, |
|
"grad_norm": 0.22696045078695706, |
|
"learning_rate": 7.980535512305376e-07, |
|
"loss": 0.0749, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 1.901639344262295, |
|
"grad_norm": 0.23353372696636585, |
|
"learning_rate": 7.356400491124737e-07, |
|
"loss": 0.0739, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.9057377049180326, |
|
"grad_norm": 0.22465718167778462, |
|
"learning_rate": 6.757494566346445e-07, |
|
"loss": 0.0635, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.9098360655737705, |
|
"grad_norm": 0.2714574406648733, |
|
"learning_rate": 6.183848408936709e-07, |
|
"loss": 0.0978, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.9139344262295082, |
|
"grad_norm": 0.2241827281383872, |
|
"learning_rate": 5.635491396267456e-07, |
|
"loss": 0.0728, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 1.918032786885246, |
|
"grad_norm": 0.22303567853923986, |
|
"learning_rate": 5.112451610611469e-07, |
|
"loss": 0.0735, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.9221311475409837, |
|
"grad_norm": 0.25184864069673707, |
|
"learning_rate": 4.614755837704321e-07, |
|
"loss": 0.0845, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 1.9262295081967213, |
|
"grad_norm": 0.2238797951927238, |
|
"learning_rate": 4.14242956537253e-07, |
|
"loss": 0.079, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.930327868852459, |
|
"grad_norm": 0.2323006006995677, |
|
"learning_rate": 3.69549698222843e-07, |
|
"loss": 0.0787, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.9344262295081966, |
|
"grad_norm": 0.23426756288417372, |
|
"learning_rate": 3.273980976431501e-07, |
|
"loss": 0.0901, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.9385245901639343, |
|
"grad_norm": 0.23048072960491015, |
|
"learning_rate": 2.8779031345159136e-07, |
|
"loss": 0.0767, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.9426229508196722, |
|
"grad_norm": 0.24308841255499397, |
|
"learning_rate": 2.5072837402854157e-07, |
|
"loss": 0.0928, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.9467213114754098, |
|
"grad_norm": 0.2889749002741685, |
|
"learning_rate": 2.162141773774329e-07, |
|
"loss": 0.0944, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.9508196721311475, |
|
"grad_norm": 0.24338145335001735, |
|
"learning_rate": 1.842494910275605e-07, |
|
"loss": 0.0818, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.9549180327868854, |
|
"grad_norm": 0.20651636363797357, |
|
"learning_rate": 1.5483595194356048e-07, |
|
"loss": 0.0633, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.959016393442623, |
|
"grad_norm": 0.22304906627189303, |
|
"learning_rate": 1.2797506644159351e-07, |
|
"loss": 0.0694, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.9631147540983607, |
|
"grad_norm": 0.20982729632198355, |
|
"learning_rate": 1.0366821011218997e-07, |
|
"loss": 0.065, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 1.9672131147540983, |
|
"grad_norm": 0.23590436160688563, |
|
"learning_rate": 8.191662774980625e-08, |
|
"loss": 0.0831, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.971311475409836, |
|
"grad_norm": 0.2550649137151106, |
|
"learning_rate": 6.272143328907575e-08, |
|
"loss": 0.0849, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 1.9754098360655736, |
|
"grad_norm": 0.2280158394119794, |
|
"learning_rate": 4.608360974776571e-08, |
|
"loss": 0.0773, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.9795081967213115, |
|
"grad_norm": 0.24398300931602282, |
|
"learning_rate": 3.200400917643398e-08, |
|
"loss": 0.1115, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.9836065573770492, |
|
"grad_norm": 0.23215824795287876, |
|
"learning_rate": 2.048335261479739e-08, |
|
"loss": 0.0668, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.987704918032787, |
|
"grad_norm": 0.23261181672658787, |
|
"learning_rate": 1.152223005479458e-08, |
|
"loss": 0.0768, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.9918032786885247, |
|
"grad_norm": 0.22434385789931668, |
|
"learning_rate": 5.121100410393487e-09, |
|
"loss": 0.0691, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.9959016393442623, |
|
"grad_norm": 0.23535013520501705, |
|
"learning_rate": 1.2802914940601707e-09, |
|
"loss": 0.082, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.2273460494952869, |
|
"learning_rate": 0.0, |
|
"loss": 0.0848, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 488, |
|
"total_flos": 2471609260572672.0, |
|
"train_loss": 0.1494467844423212, |
|
"train_runtime": 5820.7005, |
|
"train_samples_per_second": 0.671, |
|
"train_steps_per_second": 0.084 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 488, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2471609260572672.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|