{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.9996597771540356, |
|
"eval_steps": 500, |
|
"global_step": 11756, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.003402228459641065, |
|
"grad_norm": 7.6875, |
|
"learning_rate": 3.809091090277921e-07, |
|
"loss": 4.24, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00680445691928213, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 7.618182180555842e-07, |
|
"loss": 4.4323, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.010206685378923195, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 1.1427273270833762e-06, |
|
"loss": 4.2758, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01360891383856426, |
|
"grad_norm": 7.53125, |
|
"learning_rate": 1.5236364361111684e-06, |
|
"loss": 4.1231, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.017011142298205325, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 1.9045455451389605e-06, |
|
"loss": 4.097, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02041337075784639, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 2.2854546541667524e-06, |
|
"loss": 4.0712, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.023815599217487455, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 2.6663637631945448e-06, |
|
"loss": 3.8851, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02721782767712852, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 3.0472728722223367e-06, |
|
"loss": 3.6937, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.030620056136769585, |
|
"grad_norm": 8.25, |
|
"learning_rate": 3.4281819812501286e-06, |
|
"loss": 3.6468, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.03402228459641065, |
|
"grad_norm": 9.625, |
|
"learning_rate": 3.809091090277921e-06, |
|
"loss": 3.4787, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03742451305605171, |
|
"grad_norm": 7.53125, |
|
"learning_rate": 4.190000199305713e-06, |
|
"loss": 3.3235, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.04082674151569278, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 4.570909308333505e-06, |
|
"loss": 3.2806, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.04422896997533384, |
|
"grad_norm": 10.3125, |
|
"learning_rate": 4.951818417361297e-06, |
|
"loss": 3.0432, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.04763119843497491, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 5.3327275263890896e-06, |
|
"loss": 2.8991, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.05103342689461597, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 5.7136366354168815e-06, |
|
"loss": 2.8202, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.05443565535425704, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 6.094545744444673e-06, |
|
"loss": 2.6361, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0578378838138981, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 6.475454853472465e-06, |
|
"loss": 2.5525, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.06124011227353917, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 6.856363962500257e-06, |
|
"loss": 2.5685, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.06464234073318023, |
|
"grad_norm": 2.125, |
|
"learning_rate": 7.237273071528049e-06, |
|
"loss": 2.5133, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.0680445691928213, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 7.618182180555842e-06, |
|
"loss": 2.4096, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07144679765246237, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 7.999091289583632e-06, |
|
"loss": 2.4864, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.07484902611210342, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 8.380000398611426e-06, |
|
"loss": 2.4321, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.07825125457174449, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 8.760909507639218e-06, |
|
"loss": 2.3582, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.08165348303138556, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 9.14181861666701e-06, |
|
"loss": 2.3401, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.08505571149102663, |
|
"grad_norm": 2.625, |
|
"learning_rate": 9.522727725694802e-06, |
|
"loss": 2.3312, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.08845793995066768, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 9.903636834722594e-06, |
|
"loss": 2.3672, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.09186016841030875, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.0284545943750385e-05, |
|
"loss": 2.3025, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.09526239686994982, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 1.0665455052778179e-05, |
|
"loss": 2.3273, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.09866462532959089, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.104636416180597e-05, |
|
"loss": 2.2746, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.10206685378923194, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.1427273270833763e-05, |
|
"loss": 2.3196, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.10546908224887301, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.1808182379861553e-05, |
|
"loss": 2.2645, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.10887131070851408, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.2189091488889347e-05, |
|
"loss": 2.2902, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.11227353916815515, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 1.2570000597917139e-05, |
|
"loss": 2.2503, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.1156757676277962, |
|
"grad_norm": 1.5, |
|
"learning_rate": 1.295090970694493e-05, |
|
"loss": 2.1882, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.11907799608743727, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 1.3331818815972723e-05, |
|
"loss": 2.2266, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.12248022454707834, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.344607904627746e-05, |
|
"loss": 2.2011, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.1258824530067194, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.3446017810126854e-05, |
|
"loss": 2.1828, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.12928468146636046, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.3445905544333626e-05, |
|
"loss": 2.2727, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.13268690992600152, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.344574224974991e-05, |
|
"loss": 2.2222, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1360891383856426, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.3445527927615165e-05, |
|
"loss": 2.2107, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.13949136684528365, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.3445262579556173e-05, |
|
"loss": 2.1671, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.14289359530492474, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 1.3444946207587011e-05, |
|
"loss": 2.1878, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.1462958237645658, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 1.3444578814109056e-05, |
|
"loss": 2.1358, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.14969805222420685, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.3444160401910943e-05, |
|
"loss": 2.1564, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.15310028068384793, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.3443690974168565e-05, |
|
"loss": 2.1756, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.15650250914348898, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.344317053444504e-05, |
|
"loss": 2.1606, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.15990473760313004, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.344259908669068e-05, |
|
"loss": 2.2352, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.16330696606277112, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.3441976635242969e-05, |
|
"loss": 2.1258, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.16670919452241217, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.3441303184826526e-05, |
|
"loss": 2.1533, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.17011142298205326, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.3440578740553065e-05, |
|
"loss": 2.1179, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1735136514416943, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 1.3439803307921367e-05, |
|
"loss": 2.1868, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.17691587990133537, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.343897689281723e-05, |
|
"loss": 2.1144, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.18031810836097645, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.343809950151342e-05, |
|
"loss": 2.1722, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.1837203368206175, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.3437171140669643e-05, |
|
"loss": 2.1725, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.18712256528025856, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.3436191817332471e-05, |
|
"loss": 2.1871, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.19052479373989964, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.3435161538935297e-05, |
|
"loss": 2.2134, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.1939270221995407, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.3434080313298288e-05, |
|
"loss": 2.1545, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.19732925065918178, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.3432948148628312e-05, |
|
"loss": 2.1173, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.20073147911882283, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.3431765053518884e-05, |
|
"loss": 2.1703, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.20413370757846389, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.3430531036950099e-05, |
|
"loss": 2.1662, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.20753593603810497, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.3429246108288562e-05, |
|
"loss": 2.153, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.21093816449774602, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.3427910277287318e-05, |
|
"loss": 2.1421, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.21434039295738708, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 1.3426523554085776e-05, |
|
"loss": 2.1315, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.21774262141702816, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.342508594920964e-05, |
|
"loss": 2.1187, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.22114484987666921, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.342359747357082e-05, |
|
"loss": 2.1447, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.2245470783363103, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.3422058138467349e-05, |
|
"loss": 2.1614, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.22794930679595135, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.3420467955583304e-05, |
|
"loss": 2.1521, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.2313515352555924, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.3418826936988714e-05, |
|
"loss": 2.1474, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.2347537637152335, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.3417135095139467e-05, |
|
"loss": 2.1887, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.23815599217487454, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.341539244287722e-05, |
|
"loss": 2.1432, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.2415582206345156, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.3413598993429295e-05, |
|
"loss": 2.1202, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.24496044909415668, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.3411754760408584e-05, |
|
"loss": 2.201, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.24836267755379773, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.3409859757813437e-05, |
|
"loss": 2.104, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.2517649060134388, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.3407914000027573e-05, |
|
"loss": 2.1118, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.25516713447307987, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.3405917501819956e-05, |
|
"loss": 2.1533, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.2585693629327209, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 1.340387027834468e-05, |
|
"loss": 2.0738, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.261971591392362, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.3401772345140874e-05, |
|
"loss": 2.1696, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.26537381985200303, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.3399623718132557e-05, |
|
"loss": 2.0847, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.26877604831164414, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.3397424413628542e-05, |
|
"loss": 2.1644, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.2721782767712852, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.3395174448322298e-05, |
|
"loss": 2.0891, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.27558050523092625, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.3392873839291825e-05, |
|
"loss": 2.1638, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.2789827336905673, |
|
"grad_norm": 1.625, |
|
"learning_rate": 1.339052260399953e-05, |
|
"loss": 2.078, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.28238496215020836, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.3388120760292085e-05, |
|
"loss": 2.1191, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.2857871906098495, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.33856683264003e-05, |
|
"loss": 2.0554, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.2891894190694905, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.3383165320938983e-05, |
|
"loss": 2.0385, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.2925916475291316, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.3380611762906796e-05, |
|
"loss": 2.1071, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.29599387598877264, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.3378007671686113e-05, |
|
"loss": 2.1171, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.2993961044484137, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 1.337535306704287e-05, |
|
"loss": 2.1264, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.3027983329080548, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.337264796912642e-05, |
|
"loss": 2.0562, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.30620056136769586, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.3369892398469373e-05, |
|
"loss": 2.1343, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.3096027898273369, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 1.3367086375987447e-05, |
|
"loss": 2.0563, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.31300501828697797, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.3364229922979311e-05, |
|
"loss": 2.1302, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.316407246746619, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.3361323061126409e-05, |
|
"loss": 2.0733, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.3198094752062601, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.3358365812492812e-05, |
|
"loss": 2.1027, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.3232117036659012, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.3355358199525042e-05, |
|
"loss": 2.0455, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.32661393212554224, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.3352300245051904e-05, |
|
"loss": 2.0785, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.3300161605851833, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.3349191972284314e-05, |
|
"loss": 2.1594, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.33341838904482435, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.3346033404815114e-05, |
|
"loss": 2.066, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.3368206175044654, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 1.3342824566618907e-05, |
|
"loss": 2.1451, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.3402228459641065, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.3339565482051866e-05, |
|
"loss": 2.152, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.34362507442374757, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.3336256175851549e-05, |
|
"loss": 2.1232, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.3470273028833886, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.3332896673136717e-05, |
|
"loss": 2.1158, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.3504295313430297, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.3329486999407136e-05, |
|
"loss": 2.102, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.35383175980267073, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.3326027180543387e-05, |
|
"loss": 2.1266, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.35723398826231184, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 1.3322517242806673e-05, |
|
"loss": 2.0884, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.3606362167219529, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.3318957212838615e-05, |
|
"loss": 2.0793, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.36403844518159395, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.3315347117661048e-05, |
|
"loss": 2.0574, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.367440673641235, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.3311686984675822e-05, |
|
"loss": 2.0716, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.37084290210087606, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.3307976841664591e-05, |
|
"loss": 2.0523, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.3742451305605171, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.33042167167886e-05, |
|
"loss": 2.0203, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.3776473590201582, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.330040663858848e-05, |
|
"loss": 2.0823, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.3810495874797993, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.3296546635984012e-05, |
|
"loss": 2.0758, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.38445181593944033, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.3292636738273931e-05, |
|
"loss": 2.1138, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.3878540443990814, |
|
"grad_norm": 1.5, |
|
"learning_rate": 1.3288676975135689e-05, |
|
"loss": 2.0277, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.39125627285872244, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.3284667376625236e-05, |
|
"loss": 2.042, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.39465850131836355, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.3280607973176785e-05, |
|
"loss": 2.114, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.3980607297780046, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.327649879560259e-05, |
|
"loss": 2.0477, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.40146295823764566, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.3272339875092701e-05, |
|
"loss": 2.0101, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.4048651866972867, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.3268131243214744e-05, |
|
"loss": 2.1261, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.40826741515692777, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.326387293191366e-05, |
|
"loss": 2.0788, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.4116696436165688, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.325956497351148e-05, |
|
"loss": 2.0694, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.41507187207620994, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.3255207400707076e-05, |
|
"loss": 2.11, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.418474100535851, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.3250800246575906e-05, |
|
"loss": 2.0621, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.42187632899549204, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.3246343544569764e-05, |
|
"loss": 2.0923, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.4252785574551331, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.3241837328516535e-05, |
|
"loss": 2.1005, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.42868078591477415, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.323728163261993e-05, |
|
"loss": 2.0634, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.43208301437441526, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.323267649145923e-05, |
|
"loss": 2.0635, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.4354852428340563, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.3228021939989018e-05, |
|
"loss": 2.131, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.4388874712936974, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.3223318013538927e-05, |
|
"loss": 2.1021, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.44228969975333843, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.3218564747813355e-05, |
|
"loss": 2.0758, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.4456919282129795, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.3213762178891202e-05, |
|
"loss": 2.0198, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.4490941566726206, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.3208910343225603e-05, |
|
"loss": 2.1226, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.45249638513226165, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.3204009277643636e-05, |
|
"loss": 2.077, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.4558986135919027, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.3199059019346055e-05, |
|
"loss": 2.1154, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.45930084205154376, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.3194059605907003e-05, |
|
"loss": 2.1109, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.4627030705111848, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.318901107527373e-05, |
|
"loss": 2.1108, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.46610529897082587, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.3183913465766294e-05, |
|
"loss": 2.1203, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.469507527430467, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.3178766816077288e-05, |
|
"loss": 2.0667, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.47290975589010803, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.317357116527153e-05, |
|
"loss": 2.0428, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.4763119843497491, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.3168326552785775e-05, |
|
"loss": 2.0836, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.47971421280939014, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.3163033018428418e-05, |
|
"loss": 2.0031, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.4831164412690312, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.315769060237918e-05, |
|
"loss": 2.096, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.4865186697286723, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.3152299345188815e-05, |
|
"loss": 2.0325, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.48992089818831336, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.3146859287778799e-05, |
|
"loss": 2.0444, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.4933231266479544, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.3141370471441016e-05, |
|
"loss": 2.0971, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.49672535510759547, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.3135832937837444e-05, |
|
"loss": 2.0014, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.5001275835672365, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.3130246728999852e-05, |
|
"loss": 2.0086, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.5035298120268776, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.3124611887329459e-05, |
|
"loss": 2.0079, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.5069320404865186, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.3118928455596627e-05, |
|
"loss": 2.0654, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.5103342689461597, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.3113196476940538e-05, |
|
"loss": 2.0195, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.5137364974058009, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.3107415994868855e-05, |
|
"loss": 2.0196, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.5171387258654419, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.3101587053257404e-05, |
|
"loss": 2.0552, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.520540954325083, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.3095709696349833e-05, |
|
"loss": 2.0833, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.523943182784724, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.3089783968757277e-05, |
|
"loss": 2.1067, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.5273454112443651, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 1.308380991545802e-05, |
|
"loss": 2.0313, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.5307476397040061, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.3077787581797163e-05, |
|
"loss": 2.0918, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.5341498681636472, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.3071717013486259e-05, |
|
"loss": 2.0505, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.5375520966232883, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 1.3065598256602989e-05, |
|
"loss": 2.1166, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.5409543250829293, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.3059431357590797e-05, |
|
"loss": 2.1196, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.5443565535425704, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.3053216363258537e-05, |
|
"loss": 2.0623, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.5477587820022114, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.3046953320780136e-05, |
|
"loss": 2.051, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.5511610104618525, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.304064227769421e-05, |
|
"loss": 2.0341, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.5545632389214936, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.3034283281903722e-05, |
|
"loss": 2.001, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.5579654673811346, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.3027876381675611e-05, |
|
"loss": 1.9871, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.5613676958407757, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.3021421625640427e-05, |
|
"loss": 2.0712, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.5647699243004167, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.3014919062791965e-05, |
|
"loss": 2.0444, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.5681721527600578, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.3008368742486882e-05, |
|
"loss": 2.0598, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.571574381219699, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.300177071444434e-05, |
|
"loss": 2.0744, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.57497660967934, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.299512502874561e-05, |
|
"loss": 1.9854, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.578378838138981, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.2988431735833709e-05, |
|
"loss": 2.0348, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.581781066598622, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.2981690886513001e-05, |
|
"loss": 2.0189, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.5851832950582632, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.2974902531948826e-05, |
|
"loss": 1.9997, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.5885855235179043, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.2968066723667104e-05, |
|
"loss": 1.9861, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.5919877519775453, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.2961183513553937e-05, |
|
"loss": 2.0284, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.5953899804371864, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.2954252953855236e-05, |
|
"loss": 2.0376, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.5987922088968274, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.2947275097176301e-05, |
|
"loss": 2.0059, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.6021944373564685, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.2940249996481436e-05, |
|
"loss": 2.0906, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.6055966658161096, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.2933177705093541e-05, |
|
"loss": 2.0076, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.6089988942757506, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.2926058276693715e-05, |
|
"loss": 2.0247, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.6124011227353917, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.2918891765320837e-05, |
|
"loss": 2.113, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.6158033511950327, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.2911678225371164e-05, |
|
"loss": 2.0201, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.6192055796546738, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.2904417711597916e-05, |
|
"loss": 2.0172, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.6226078081143149, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.289711027911086e-05, |
|
"loss": 2.1396, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.6260100365739559, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.2889755983375892e-05, |
|
"loss": 2.045, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.629412265033597, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.2882354880214616e-05, |
|
"loss": 2.012, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.632814493493238, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.2874907025803922e-05, |
|
"loss": 2.058, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.6362167219528791, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.2867412476675554e-05, |
|
"loss": 2.0796, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.6396189504125201, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.2859871289715688e-05, |
|
"loss": 2.0956, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.6430211788721613, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.2852283522164496e-05, |
|
"loss": 1.983, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.6464234073318024, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.2844649231615713e-05, |
|
"loss": 1.9861, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.6498256357914434, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.2836968476016196e-05, |
|
"loss": 2.0683, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.6532278642510845, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.2829241313665494e-05, |
|
"loss": 2.0916, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.6566300927107255, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.2821467803215395e-05, |
|
"loss": 2.0254, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.6600323211703666, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.2813648003669482e-05, |
|
"loss": 2.0332, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.6634345496300077, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.2805781974382694e-05, |
|
"loss": 2.0225, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.6668367780896487, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.2797869775060866e-05, |
|
"loss": 2.0563, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.6702390065492898, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.2789911465760281e-05, |
|
"loss": 2.0027, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.6736412350089308, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.2781907106887209e-05, |
|
"loss": 1.9895, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.6770434634685719, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.2773856759197455e-05, |
|
"loss": 2.0175, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.680445691928213, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.2765760483795895e-05, |
|
"loss": 2.0702, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.683847920387854, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.275761834213601e-05, |
|
"loss": 2.023, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.6872501488474951, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.2749430396019423e-05, |
|
"loss": 2.0051, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.6906523773071361, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.2741196707595429e-05, |
|
"loss": 2.017, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.6940546057667772, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.273291733936052e-05, |
|
"loss": 2.0481, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.6974568342264184, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.2724592354157912e-05, |
|
"loss": 2.0281, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.7008590626860594, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.2716221815177076e-05, |
|
"loss": 2.0459, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.7042612911457005, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 1.2707805785953245e-05, |
|
"loss": 2.0705, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.7076635196053415, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.2699344330366942e-05, |
|
"loss": 2.0759, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.7110657480649826, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.2690837512643495e-05, |
|
"loss": 2.0324, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.7144679765246237, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.2682285397352535e-05, |
|
"loss": 1.9784, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.7178702049842647, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.2673688049407526e-05, |
|
"loss": 1.9902, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.7212724334439058, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.266504553406526e-05, |
|
"loss": 2.0631, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.7246746619035468, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.2656357916925368e-05, |
|
"loss": 2.0039, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.7280768903631879, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.2647625263929817e-05, |
|
"loss": 1.9975, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.7314791188228289, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.2638847641362408e-05, |
|
"loss": 2.0368, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.73488134728247, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.2630025115848282e-05, |
|
"loss": 2.0954, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.7382835757421111, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.2621157754353404e-05, |
|
"loss": 2.0297, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.7416858042017521, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.2612245624184062e-05, |
|
"loss": 2.0445, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.7450880326613932, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.2603288792986354e-05, |
|
"loss": 2.0587, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.7484902611210342, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.2594287328745672e-05, |
|
"loss": 2.0126, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.7518924895806753, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.258524129978619e-05, |
|
"loss": 2.0213, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.7552947180403164, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.257615077477034e-05, |
|
"loss": 1.9826, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.7586969464999574, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.25670158226983e-05, |
|
"loss": 2.0467, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.7620991749595986, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.2557836512907456e-05, |
|
"loss": 1.9924, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.7655014034192396, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.2548612915071894e-05, |
|
"loss": 1.9864, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.7689036318788807, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.2539345099201851e-05, |
|
"loss": 1.9966, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.7723058603385218, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.2530033135643203e-05, |
|
"loss": 2.0092, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.7757080887981628, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.2520677095076918e-05, |
|
"loss": 1.97, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.7791103172578039, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.2511277048518522e-05, |
|
"loss": 1.9781, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.7825125457174449, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.2501833067317562e-05, |
|
"loss": 2.0167, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.785914774177086, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.2492345223157068e-05, |
|
"loss": 2.0108, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.7893170026367271, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.2482813588053004e-05, |
|
"loss": 2.0094, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.7927192310963681, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 1.2473238234353713e-05, |
|
"loss": 1.9266, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.7961214595560092, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.2463619234739388e-05, |
|
"loss": 1.9982, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.7995236880156502, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.2453956662221504e-05, |
|
"loss": 2.0688, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.8029259164752913, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.2444250590142271e-05, |
|
"loss": 1.9658, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.8063281449349324, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.2434501092174075e-05, |
|
"loss": 1.9954, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.8097303733945734, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.242470824231892e-05, |
|
"loss": 2.0507, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.8131326018542145, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.241487211490786e-05, |
|
"loss": 2.0469, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.8165348303138555, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.2404992784600451e-05, |
|
"loss": 2.0436, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.8199370587734967, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.2395070326384164e-05, |
|
"loss": 2.0195, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.8233392872331377, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 1.238510481557383e-05, |
|
"loss": 1.9674, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.8267415156927788, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.2375096327811061e-05, |
|
"loss": 1.9918, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.8301437441524199, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.2365044939063687e-05, |
|
"loss": 2.0161, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.8335459726120609, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.2354950725625158e-05, |
|
"loss": 2.0303, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.836948201071702, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.2344813764113985e-05, |
|
"loss": 1.973, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.840350429531343, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.2334634131473154e-05, |
|
"loss": 2.0389, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.8437526579909841, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.2324411904969535e-05, |
|
"loss": 2.0597, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.8471548864506252, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.2314147162193302e-05, |
|
"loss": 2.029, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.8505571149102662, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.2303839981057342e-05, |
|
"loss": 2.0216, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.8539593433699073, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.2293490439796658e-05, |
|
"loss": 1.9839, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.8573615718295483, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.2283098616967793e-05, |
|
"loss": 2.0373, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.8607638002891894, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.2272664591448208e-05, |
|
"loss": 2.075, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.8641660287488305, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.2262188442435706e-05, |
|
"loss": 2.071, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.8675682572084715, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.2251670249447816e-05, |
|
"loss": 2.0474, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.8709704856681126, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.22411100923212e-05, |
|
"loss": 1.9866, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.8743727141277536, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.2230508051211039e-05, |
|
"loss": 2.0365, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.8777749425873947, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.2219864206590427e-05, |
|
"loss": 2.0041, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.8811771710470359, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 1.2209178639249763e-05, |
|
"loss": 2.0164, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.8845793995066769, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.2198451430296135e-05, |
|
"loss": 2.0469, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.887981627966318, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.2187682661152705e-05, |
|
"loss": 1.9873, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.891383856425959, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.2176872413558087e-05, |
|
"loss": 2.0442, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.8947860848856001, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.2166020769565741e-05, |
|
"loss": 2.0356, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.8981883133452412, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.2155127811543326e-05, |
|
"loss": 2.0253, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.9015905418048822, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.2144193622172099e-05, |
|
"loss": 1.974, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.9049927702645233, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.2133218284446276e-05, |
|
"loss": 2.0084, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.9083949987241643, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.2122201881672392e-05, |
|
"loss": 2.1215, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.9117972271838054, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.2111144497468698e-05, |
|
"loss": 1.9749, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.9151994556434464, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.2100046215764493e-05, |
|
"loss": 1.9601, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.9186016841030875, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.2088907120799507e-05, |
|
"loss": 1.9761, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.9220039125627286, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.2077727297123258e-05, |
|
"loss": 2.0309, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.9254061410223696, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.2066506829594404e-05, |
|
"loss": 2.0306, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.9288083694820107, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.2055245803380112e-05, |
|
"loss": 2.0073, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.9322105979416517, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.2043944303955393e-05, |
|
"loss": 1.9904, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.9356128264012928, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.2032602417102472e-05, |
|
"loss": 2.0916, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.939015054860934, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.2021220228910125e-05, |
|
"loss": 1.9665, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.942417283320575, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.2009797825773027e-05, |
|
"loss": 1.9822, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.9458195117802161, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.1998335294391099e-05, |
|
"loss": 1.9947, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.9492217402398571, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.1986832721768856e-05, |
|
"loss": 1.9626, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.9526239686994982, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.1975290195214724e-05, |
|
"loss": 1.9772, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.9560261971591393, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.1963707802340409e-05, |
|
"loss": 2.0471, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.9594284256187803, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.1952085631060207e-05, |
|
"loss": 1.9514, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.9628306540784214, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.1940423769590349e-05, |
|
"loss": 1.9974, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.9662328825380624, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.1928722306448326e-05, |
|
"loss": 2.0036, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.9696351109977035, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.1916981330452221e-05, |
|
"loss": 1.9803, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.9730373394573446, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.1905200930720032e-05, |
|
"loss": 2.0608, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.9764395679169856, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.1893381196668997e-05, |
|
"loss": 1.9857, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.9798417963766267, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.1881522218014912e-05, |
|
"loss": 2.0197, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.9832440248362677, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.1869624084771457e-05, |
|
"loss": 1.9883, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.9866462532959088, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.185768688724951e-05, |
|
"loss": 2.0941, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.9900484817555499, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.184571071605645e-05, |
|
"loss": 1.9953, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.9934507102151909, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.1833695662095493e-05, |
|
"loss": 1.9833, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.996852938674832, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.1821641816564982e-05, |
|
"loss": 2.0431, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 1.000255167134473, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.1809549270957697e-05, |
|
"loss": 1.886, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 1.0036573955941142, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.1797418117060173e-05, |
|
"loss": 1.9804, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.0070596240537553, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.1785248446951988e-05, |
|
"loss": 2.0657, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 1.0104618525133964, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.1773040353005074e-05, |
|
"loss": 2.0112, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 1.0138640809730373, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.1760793927883016e-05, |
|
"loss": 2.0262, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 1.0172663094326784, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.174850926454034e-05, |
|
"loss": 2.0007, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 1.0206685378923195, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.1736186456221816e-05, |
|
"loss": 1.9723, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.0240707663519606, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.1723825596461751e-05, |
|
"loss": 1.9384, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 1.0274729948116017, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.1711426779083267e-05, |
|
"loss": 1.9556, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 1.0308752232712426, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.1698990098197604e-05, |
|
"loss": 1.9963, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 1.0342774517308837, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.1686515648203396e-05, |
|
"loss": 1.9429, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 1.0376796801905248, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.1674003523785957e-05, |
|
"loss": 1.8885, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 1.041081908650166, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.1661453819916565e-05, |
|
"loss": 1.9456, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 1.0444841371098068, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.1648866631851738e-05, |
|
"loss": 1.9386, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 1.047886365569448, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.1636242055132511e-05, |
|
"loss": 1.9569, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 1.051288594029089, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.1623580185583711e-05, |
|
"loss": 1.9159, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 1.0546908224887301, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.1610881119313231e-05, |
|
"loss": 1.9094, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.0580930509483712, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.1598144952711302e-05, |
|
"loss": 2.0189, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 1.0614952794080121, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.1585371782449755e-05, |
|
"loss": 2.0053, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 1.0648975078676532, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.1572561705481294e-05, |
|
"loss": 1.9826, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 1.0682997363272944, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.1559714819038756e-05, |
|
"loss": 1.9597, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 1.0717019647869355, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.1546831220634377e-05, |
|
"loss": 1.9255, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 1.0751041932465766, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.1533911008059046e-05, |
|
"loss": 1.9859, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 1.0785064217062175, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.1520954279381567e-05, |
|
"loss": 1.9651, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 1.0819086501658586, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.1507961132947917e-05, |
|
"loss": 1.9321, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 1.0853108786254997, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.1494931667380492e-05, |
|
"loss": 1.9215, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 1.0887131070851408, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.1481865981577362e-05, |
|
"loss": 1.982, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.092115335544782, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.1468764174711526e-05, |
|
"loss": 1.9728, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 1.0955175640044228, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.1455626346230147e-05, |
|
"loss": 2.0267, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 1.098919792464064, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.1442452595853809e-05, |
|
"loss": 1.9484, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 1.102322020923705, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.1429243023575758e-05, |
|
"loss": 1.9867, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 1.1057242493833461, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.1415997729661134e-05, |
|
"loss": 1.9269, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 1.1091264778429872, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.140271681464622e-05, |
|
"loss": 1.9095, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 1.1125287063026281, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.1389400379337676e-05, |
|
"loss": 2.0021, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 1.1159309347622692, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.137604852481177e-05, |
|
"loss": 2.0117, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 1.1193331632219103, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 1.1362661352413616e-05, |
|
"loss": 1.9835, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 1.1227353916815515, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.1349238963756402e-05, |
|
"loss": 1.9492, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.1261376201411926, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.1335781460720621e-05, |
|
"loss": 1.9394, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 1.1295398486008335, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.1322288945453292e-05, |
|
"loss": 1.9442, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 1.1329420770604746, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.1308761520367196e-05, |
|
"loss": 1.9256, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 1.1363443055201157, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.1295199288140082e-05, |
|
"loss": 1.9861, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 1.1397465339797568, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 1.1281602351713905e-05, |
|
"loss": 1.9598, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 1.143148762439398, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.1267970814294032e-05, |
|
"loss": 1.9839, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 1.1465509908990388, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.1254304779348466e-05, |
|
"loss": 1.9654, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 1.14995321935868, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.1240604350607055e-05, |
|
"loss": 1.9536, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 1.153355447818321, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.122686963206071e-05, |
|
"loss": 1.9331, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 1.156757676277962, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.1213100727960614e-05, |
|
"loss": 1.9218, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.1601599047376032, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.1199297742817428e-05, |
|
"loss": 1.9979, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 1.163562133197244, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.11854607814005e-05, |
|
"loss": 2.02, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 1.1669643616568852, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.117158994873707e-05, |
|
"loss": 2.0195, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 1.1703665901165263, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.1157685350111472e-05, |
|
"loss": 2.0053, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 1.1737688185761674, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.1143747091064334e-05, |
|
"loss": 2.014, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 1.1771710470358085, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.1129775277391782e-05, |
|
"loss": 1.9057, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 1.1805732754954494, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.1115770015144628e-05, |
|
"loss": 1.9496, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 1.1839755039550905, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.1101731410627574e-05, |
|
"loss": 1.9163, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 1.1873777324147317, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.1087659570398397e-05, |
|
"loss": 1.9717, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 1.1907799608743728, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.1073554601267147e-05, |
|
"loss": 2.0302, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.1941821893340139, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.1059416610295336e-05, |
|
"loss": 1.9523, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 1.1975844177936548, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.104524570479512e-05, |
|
"loss": 1.9842, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 1.2009866462532959, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.1031041992328483e-05, |
|
"loss": 2.0036, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 1.204388874712937, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.1016805580706439e-05, |
|
"loss": 2.048, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 1.207791103172578, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.1002536577988182e-05, |
|
"loss": 1.9545, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 1.2111933316322192, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 1.0988235092480297e-05, |
|
"loss": 1.9575, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 1.21459556009186, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.0973901232735917e-05, |
|
"loss": 1.9759, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 1.2179977885515012, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.0959535107553909e-05, |
|
"loss": 1.9737, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 1.2214000170111423, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.0945136825978049e-05, |
|
"loss": 2.0414, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 1.2248022454707834, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.0930706497296186e-05, |
|
"loss": 1.9566, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.2282044739304245, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.0916244231039415e-05, |
|
"loss": 1.9614, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 1.2316067023900654, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.0901750136981258e-05, |
|
"loss": 2.0045, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 1.2350089308497065, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.0887224325136807e-05, |
|
"loss": 1.9703, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 1.2384111593093476, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 1.0872666905761921e-05, |
|
"loss": 1.9609, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 1.2418133877689888, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.0858077989352354e-05, |
|
"loss": 1.9865, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 1.2452156162286299, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.084345768664294e-05, |
|
"loss": 1.9276, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 1.2486178446882708, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.0828806108606748e-05, |
|
"loss": 1.9673, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 1.2520200731479119, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.081412336645423e-05, |
|
"loss": 1.9522, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 1.255422301607553, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.0799409571632395e-05, |
|
"loss": 1.8882, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 1.258824530067194, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.0784664835823945e-05, |
|
"loss": 1.9378, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.2622267585268352, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.076988927094643e-05, |
|
"loss": 2.0231, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 1.265628986986476, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.0755082989151417e-05, |
|
"loss": 1.925, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 1.2690312154461172, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.0740246102823613e-05, |
|
"loss": 1.8958, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 1.2724334439057583, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.0725378724580027e-05, |
|
"loss": 1.9536, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 1.2758356723653994, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.0710480967269115e-05, |
|
"loss": 1.9541, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 1.2792379008250405, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.0695552943969919e-05, |
|
"loss": 1.9327, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 1.2826401292846814, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.0680594767991203e-05, |
|
"loss": 1.9935, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 1.2860423577443225, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.0665606552870612e-05, |
|
"loss": 1.9933, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 1.2894445862039636, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.0650588412373792e-05, |
|
"loss": 1.9314, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 1.2928468146636047, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.0635540460493534e-05, |
|
"loss": 1.9136, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.2962490431232458, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.0620462811448904e-05, |
|
"loss": 1.9175, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 1.2996512715828867, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.0605355579684382e-05, |
|
"loss": 1.9929, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 1.3030535000425278, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.0590218879868998e-05, |
|
"loss": 1.9072, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 1.306455728502169, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.0575052826895442e-05, |
|
"loss": 1.9315, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 1.30985795696181, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.0559857535879212e-05, |
|
"loss": 2.045, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 1.3132601854214512, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.0544633122157734e-05, |
|
"loss": 1.9443, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 1.316662413881092, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.0529379701289476e-05, |
|
"loss": 1.9742, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 1.3200646423407332, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.051409738905309e-05, |
|
"loss": 1.9852, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 1.3234668708003743, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.0498786301446519e-05, |
|
"loss": 1.997, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 1.3268690992600152, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.0483446554686125e-05, |
|
"loss": 1.9083, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.3302713277196565, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.0468078265205796e-05, |
|
"loss": 1.974, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 1.3336735561792974, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.0452681549656073e-05, |
|
"loss": 1.9885, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 1.3370757846389385, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.0437256524903258e-05, |
|
"loss": 1.9872, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 1.3404780130985796, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.0421803308028533e-05, |
|
"loss": 1.9477, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 1.3438802415582205, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.0406322016327067e-05, |
|
"loss": 2.0032, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.3472824700178618, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.0390812767307123e-05, |
|
"loss": 1.9942, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 1.3506846984775027, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.0375275678689174e-05, |
|
"loss": 2.0242, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 1.3540869269371438, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.0359710868405e-05, |
|
"loss": 2.0306, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 1.357489155396785, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.0344118454596807e-05, |
|
"loss": 1.9709, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 1.3608913838564258, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.032849855561631e-05, |
|
"loss": 1.9812, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.3642936123160672, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.0312851290023851e-05, |
|
"loss": 2.0006, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 1.367695840775708, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.0297176776587497e-05, |
|
"loss": 1.9679, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 1.3710980692353492, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.028147513428213e-05, |
|
"loss": 1.934, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 1.3745002976949903, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.026574648228855e-05, |
|
"loss": 1.9867, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 1.3779025261546312, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.0249990939992573e-05, |
|
"loss": 1.899, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.3813047546142723, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.023420862698412e-05, |
|
"loss": 1.9799, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 1.3847069830739134, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.021839966305631e-05, |
|
"loss": 2.0251, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 1.3881092115335545, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.0202564168204549e-05, |
|
"loss": 1.9332, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 1.3915114399931956, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.0186702262625632e-05, |
|
"loss": 1.971, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 1.3949136684528365, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.0170814066716807e-05, |
|
"loss": 1.9266, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.3983158969124776, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.0154899701074883e-05, |
|
"loss": 1.9282, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 1.4017181253721187, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.0138959286495303e-05, |
|
"loss": 2.0014, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 1.4051203538317598, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.0122992943971232e-05, |
|
"loss": 1.9463, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 1.408522582291401, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.0107000794692637e-05, |
|
"loss": 2.003, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 1.4119248107510418, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.0090982960045363e-05, |
|
"loss": 2.0, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.415327039210683, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.0074939561610221e-05, |
|
"loss": 1.9832, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 1.418729267670324, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.005887072116206e-05, |
|
"loss": 1.8977, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 1.4221314961299651, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.0042776560668832e-05, |
|
"loss": 1.9778, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 1.4255337245896063, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 1.0026657202290696e-05, |
|
"loss": 1.9389, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 1.4289359530492471, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 1.0010512768379053e-05, |
|
"loss": 1.909, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.4323381815088883, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 9.994343381475644e-06, |
|
"loss": 1.9563, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 1.4357404099685294, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 9.978149164311613e-06, |
|
"loss": 1.9725, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 1.4391426384281705, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 9.961930239806571e-06, |
|
"loss": 2.0237, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 1.4425448668878116, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 9.945686731067668e-06, |
|
"loss": 1.9415, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 1.4459470953474525, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 9.929418761388654e-06, |
|
"loss": 1.9221, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 1.4493493238070936, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 9.91312645424895e-06, |
|
"loss": 1.9062, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 1.4527515522667347, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 9.896809933312702e-06, |
|
"loss": 1.9621, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 1.4561537807263758, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 9.88046932242785e-06, |
|
"loss": 1.9721, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 1.459556009186017, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 9.864104745625186e-06, |
|
"loss": 2.0143, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 1.4629582376456578, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 9.847716327117408e-06, |
|
"loss": 1.9356, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.466360466105299, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 9.831304191298181e-06, |
|
"loss": 1.9466, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 1.46976269456494, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 9.814868462741196e-06, |
|
"loss": 1.9112, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 1.4731649230245811, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 9.798409266199217e-06, |
|
"loss": 1.9464, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 1.4765671514842222, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 9.781926726603141e-06, |
|
"loss": 1.9421, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 1.4799693799438631, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 9.765420969061045e-06, |
|
"loss": 2.0682, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 1.4833716084035042, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 9.748892118857236e-06, |
|
"loss": 1.9912, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 1.4867738368631453, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 9.73234030145131e-06, |
|
"loss": 1.9594, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 1.4901760653227865, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 9.71576564247718e-06, |
|
"loss": 1.9444, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 1.4935782937824276, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 9.699168267742144e-06, |
|
"loss": 1.9882, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 1.4969805222420685, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 9.682548303225915e-06, |
|
"loss": 1.9076, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.5003827507017096, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 9.665905875079679e-06, |
|
"loss": 1.9594, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 1.5037849791613507, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 9.649241109625111e-06, |
|
"loss": 2.0808, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 1.5071872076209918, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 9.632554133353453e-06, |
|
"loss": 1.9688, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 1.510589436080633, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 9.615845072924522e-06, |
|
"loss": 1.971, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 1.5139916645402738, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 9.59911405516577e-06, |
|
"loss": 1.9759, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 1.517393892999915, |
|
"grad_norm": 2.125, |
|
"learning_rate": 9.582361207071299e-06, |
|
"loss": 1.975, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 1.520796121459556, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 9.565586655800928e-06, |
|
"loss": 1.9975, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 1.5241983499191971, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 9.5487905286792e-06, |
|
"loss": 1.966, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 1.5276005783788382, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 9.531972953194425e-06, |
|
"loss": 1.9374, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 1.5310028068384791, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 9.51513405699772e-06, |
|
"loss": 1.9567, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.5344050352981202, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 9.498273967902033e-06, |
|
"loss": 1.9704, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 1.5378072637577613, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 9.481392813881164e-06, |
|
"loss": 1.9064, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 1.5412094922174024, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 9.464490723068811e-06, |
|
"loss": 1.9553, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 1.5446117206770436, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 9.447567823757589e-06, |
|
"loss": 1.9416, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 1.5480139491366844, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 9.430624244398053e-06, |
|
"loss": 2.0401, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 1.5514161775963256, |
|
"grad_norm": 2.125, |
|
"learning_rate": 9.413660113597731e-06, |
|
"loss": 1.9495, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 1.5548184060559667, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 9.396675560120143e-06, |
|
"loss": 2.0093, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 1.5582206345156078, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 9.379670712883817e-06, |
|
"loss": 1.974, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 1.5616228629752489, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 9.362645700961327e-06, |
|
"loss": 1.935, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 1.5650250914348898, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 9.345600653578297e-06, |
|
"loss": 1.9727, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.5684273198945309, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 9.328535700112433e-06, |
|
"loss": 1.9115, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 1.571829548354172, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 9.311450970092529e-06, |
|
"loss": 1.9329, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 1.575231776813813, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 9.294346593197489e-06, |
|
"loss": 1.9534, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 1.5786340052734542, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 9.277222699255353e-06, |
|
"loss": 1.9047, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 1.582036233733095, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 9.260079418242293e-06, |
|
"loss": 1.9975, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 1.5854384621927362, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 9.242916880281638e-06, |
|
"loss": 1.9347, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 1.5888406906523773, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 9.225735215642885e-06, |
|
"loss": 1.9552, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 1.5922429191120182, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 9.208534554740706e-06, |
|
"loss": 1.9052, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 1.5956451475716595, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 9.191315028133966e-06, |
|
"loss": 1.9881, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 1.5990473760313004, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 9.17407676652472e-06, |
|
"loss": 1.9671, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.6024496044909415, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 9.156819900757237e-06, |
|
"loss": 1.9753, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 1.6058518329505826, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 9.139544561816991e-06, |
|
"loss": 1.9516, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 1.6092540614102235, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 9.122250880829674e-06, |
|
"loss": 1.9615, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 1.6126562898698649, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 9.104938989060205e-06, |
|
"loss": 1.9325, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 1.6160585183295058, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 9.087609017911725e-06, |
|
"loss": 1.9227, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 1.6194607467891469, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 9.070261098924604e-06, |
|
"loss": 1.9796, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 1.622862975248788, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 9.052895363775442e-06, |
|
"loss": 1.977, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 1.6262652037084289, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 9.035511944276075e-06, |
|
"loss": 1.8778, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 1.6296674321680702, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 9.018110972372563e-06, |
|
"loss": 1.924, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 1.633069660627711, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 9.000692580144194e-06, |
|
"loss": 1.9173, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.6364718890873522, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 8.983256899802485e-06, |
|
"loss": 1.9433, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 1.6398741175469933, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 8.96580406369018e-06, |
|
"loss": 1.9947, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 1.6432763460066342, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 8.948334204280234e-06, |
|
"loss": 1.9073, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 1.6466785744662755, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 8.930847454174817e-06, |
|
"loss": 1.9565, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 1.6500808029259164, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 8.913343946104305e-06, |
|
"loss": 1.8945, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 1.6534830313855575, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 8.895823812926273e-06, |
|
"loss": 1.9491, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 1.6568852598451986, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 8.878287187624486e-06, |
|
"loss": 1.8916, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 1.6602874883048395, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 8.860734203307893e-06, |
|
"loss": 1.9758, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 1.6636897167644809, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 8.84316499320961e-06, |
|
"loss": 1.9791, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 1.6670919452241217, |
|
"grad_norm": 2.0, |
|
"learning_rate": 8.825579690685907e-06, |
|
"loss": 2.0407, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.6704941736837629, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 8.807978429215212e-06, |
|
"loss": 2.0039, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 1.673896402143404, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 8.79036134239708e-06, |
|
"loss": 2.0093, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 1.6772986306030448, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 8.772728563951189e-06, |
|
"loss": 1.8997, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 1.6807008590626862, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 8.755080227716316e-06, |
|
"loss": 1.908, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 1.684103087522327, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 8.737416467649337e-06, |
|
"loss": 1.9478, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 1.6875053159819682, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 8.71973741782419e-06, |
|
"loss": 1.9497, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 1.6909075444416093, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 8.70204321243087e-06, |
|
"loss": 1.9035, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 1.6943097729012502, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 8.684333985774413e-06, |
|
"loss": 1.9666, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 1.6977120013608915, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 8.666609872273867e-06, |
|
"loss": 1.9943, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 1.7011142298205324, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 8.648871006461278e-06, |
|
"loss": 1.9293, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.7045164582801735, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 8.631117522980663e-06, |
|
"loss": 1.9369, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 1.7079186867398146, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 8.613349556587001e-06, |
|
"loss": 1.9117, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 1.7113209151994555, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 8.59556724214519e-06, |
|
"loss": 1.9757, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 1.7147231436590968, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 8.577770714629042e-06, |
|
"loss": 1.9838, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 1.7181253721187377, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 8.559960109120251e-06, |
|
"loss": 1.9571, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 1.7215276005783788, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 8.542135560807365e-06, |
|
"loss": 1.9588, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 1.72492982903802, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 8.524297204984759e-06, |
|
"loss": 1.9542, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 1.7283320574976608, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 8.506445177051624e-06, |
|
"loss": 1.9691, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 1.7317342859573022, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 8.488579612510915e-06, |
|
"loss": 1.9141, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 1.735136514416943, |
|
"grad_norm": 2.0, |
|
"learning_rate": 8.470700646968339e-06, |
|
"loss": 2.0129, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.7385387428765842, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 8.452808416131319e-06, |
|
"loss": 1.9424, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 1.7419409713362253, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 8.434903055807971e-06, |
|
"loss": 1.9041, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 1.7453431997958662, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 8.416984701906065e-06, |
|
"loss": 1.9514, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 1.7487454282555075, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 8.399053490431994e-06, |
|
"loss": 1.9846, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 1.7521476567151484, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 8.38110955748975e-06, |
|
"loss": 1.9242, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 1.7555498851747895, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 8.363153039279882e-06, |
|
"loss": 1.9853, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 1.7589521136344306, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 8.345184072098464e-06, |
|
"loss": 2.0005, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 1.7623543420940715, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 8.327202792336068e-06, |
|
"loss": 2.0181, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 1.7657565705537128, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 8.309209336476713e-06, |
|
"loss": 1.9119, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 1.7691587990133537, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 8.29120384109685e-06, |
|
"loss": 1.9061, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.7725610274729948, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 8.273186442864303e-06, |
|
"loss": 1.9584, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 1.775963255932636, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 8.25515727853725e-06, |
|
"loss": 1.9456, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 1.7793654843922768, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 8.23711648496318e-06, |
|
"loss": 1.9162, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 1.7827677128519182, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 8.219064199077837e-06, |
|
"loss": 1.9735, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 1.786169941311559, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 8.201000557904217e-06, |
|
"loss": 1.9512, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 1.7895721697712001, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 8.182925698551491e-06, |
|
"loss": 1.9886, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 1.7929743982308413, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 8.164839758213986e-06, |
|
"loss": 1.9956, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 1.7963766266904821, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 8.14674287417013e-06, |
|
"loss": 1.9076, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 1.7997788551501235, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 8.128635183781433e-06, |
|
"loss": 1.912, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 1.8031810836097644, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 8.11051682449141e-06, |
|
"loss": 1.9582, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.8065833120694055, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 8.092387933824571e-06, |
|
"loss": 1.979, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 1.8099855405290466, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 8.074248649385357e-06, |
|
"loss": 1.9679, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 1.8133877689886875, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 8.056099108857101e-06, |
|
"loss": 1.9288, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 1.8167899974483288, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 8.037939450000985e-06, |
|
"loss": 1.922, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 1.8201922259079697, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 8.019769810654989e-06, |
|
"loss": 1.9022, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 1.8235944543676108, |
|
"grad_norm": 2.0, |
|
"learning_rate": 8.00159032873285e-06, |
|
"loss": 1.9698, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 1.826996682827252, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 7.98340114222302e-06, |
|
"loss": 1.9087, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 1.8303989112868928, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 7.9652023891876e-06, |
|
"loss": 1.9785, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 1.8338011397465341, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 7.946994207761316e-06, |
|
"loss": 1.9983, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 1.837203368206175, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 7.928776736150451e-06, |
|
"loss": 2.0148, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.8406055966658161, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 7.910550112631802e-06, |
|
"loss": 1.9808, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 1.8440078251254572, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 7.892314475551641e-06, |
|
"loss": 1.9153, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 1.8474100535850981, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 7.87406996332465e-06, |
|
"loss": 1.9285, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 1.8508122820447395, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 7.855816714432878e-06, |
|
"loss": 1.952, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 1.8542145105043804, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 7.837554867424685e-06, |
|
"loss": 1.9335, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 1.8576167389640215, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 7.8192845609137e-06, |
|
"loss": 1.943, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 1.8610189674236626, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 7.801005933577753e-06, |
|
"loss": 2.0204, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 1.8644211958833035, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 7.782719124157842e-06, |
|
"loss": 1.915, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 1.8678234243429448, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 7.764424271457067e-06, |
|
"loss": 1.9207, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 1.8712256528025857, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 7.746121514339576e-06, |
|
"loss": 1.9593, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.8746278812622268, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 7.727810991729512e-06, |
|
"loss": 1.904, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 1.878030109721868, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 7.709492842609971e-06, |
|
"loss": 1.9757, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 1.8814323381815088, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 7.691167206021928e-06, |
|
"loss": 1.938, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 1.88483456664115, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 7.67283422106319e-06, |
|
"loss": 1.956, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 1.888236795100791, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 7.654494026887346e-06, |
|
"loss": 1.9298, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 1.8916390235604321, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 7.636146762702703e-06, |
|
"loss": 1.8893, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 1.8950412520200732, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 7.617792567771233e-06, |
|
"loss": 1.9309, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 1.8984434804797141, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 7.59943158140751e-06, |
|
"loss": 1.9064, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 1.9018457089393552, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 7.581063942977662e-06, |
|
"loss": 1.9647, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 1.9052479373989963, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 7.56268979189831e-06, |
|
"loss": 1.9417, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.9086501658586374, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 7.544309267635502e-06, |
|
"loss": 1.96, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 1.9120523943182786, |
|
"grad_norm": 2.25, |
|
"learning_rate": 7.525922509703665e-06, |
|
"loss": 1.9672, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 1.9154546227779194, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 7.507529657664538e-06, |
|
"loss": 1.9975, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 1.9188568512375606, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 7.489130851126123e-06, |
|
"loss": 1.9763, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 1.9222590796972017, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 7.470726229741613e-06, |
|
"loss": 1.9206, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 1.9256613081568428, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 7.45231593320834e-06, |
|
"loss": 2.0314, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 1.9290635366164839, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 7.433900101266712e-06, |
|
"loss": 1.9449, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 1.9324657650761248, |
|
"grad_norm": 2.0, |
|
"learning_rate": 7.415478873699151e-06, |
|
"loss": 1.9294, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 1.9358679935357659, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 7.3970523903290335e-06, |
|
"loss": 1.8888, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 1.939270221995407, |
|
"grad_norm": 2.25, |
|
"learning_rate": 7.378620791019634e-06, |
|
"loss": 1.9365, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.942672450455048, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 7.360184215673055e-06, |
|
"loss": 1.9441, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 1.9460746789146892, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 7.341742804229166e-06, |
|
"loss": 1.9156, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 1.94947690737433, |
|
"grad_norm": 2.375, |
|
"learning_rate": 7.32329669666455e-06, |
|
"loss": 1.9051, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 1.9528791358339712, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 7.304846032991432e-06, |
|
"loss": 2.0019, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 1.9562813642936123, |
|
"grad_norm": 1.875, |
|
"learning_rate": 7.2863909532566196e-06, |
|
"loss": 1.8679, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 1.9596835927532534, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 7.2679315975404405e-06, |
|
"loss": 1.9605, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 1.9630858212128945, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 7.249468105955679e-06, |
|
"loss": 1.9355, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 1.9664880496725354, |
|
"grad_norm": 2.0, |
|
"learning_rate": 7.231000618646511e-06, |
|
"loss": 1.8908, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 1.9698902781321765, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 7.212529275787436e-06, |
|
"loss": 1.9578, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 1.9732925065918177, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 7.194054217582234e-06, |
|
"loss": 1.9287, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.9766947350514585, |
|
"grad_norm": 2.375, |
|
"learning_rate": 7.17557558426287e-06, |
|
"loss": 1.9672, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 1.9800969635110999, |
|
"grad_norm": 2.0, |
|
"learning_rate": 7.157093516088451e-06, |
|
"loss": 1.9581, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 1.9834991919707408, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 7.138608153344156e-06, |
|
"loss": 1.9872, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 1.9869014204303819, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 7.120119636340172e-06, |
|
"loss": 1.9525, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 1.990303648890023, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 7.101628105410625e-06, |
|
"loss": 1.9093, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 1.9937058773496639, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 7.0831337009125195e-06, |
|
"loss": 1.9706, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 1.9971081058093052, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 7.064636563224674e-06, |
|
"loss": 1.9331, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 2.000510334268946, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 7.046136832746647e-06, |
|
"loss": 1.9434, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 2.0039125627285874, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 7.027634649897679e-06, |
|
"loss": 1.8678, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 2.0073147911882283, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 7.009130155115627e-06, |
|
"loss": 1.9193, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.010717019647869, |
|
"grad_norm": 2.125, |
|
"learning_rate": 6.990623488855899e-06, |
|
"loss": 1.9459, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 2.0141192481075105, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 6.972114791590378e-06, |
|
"loss": 1.9229, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 2.0175214765671514, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 6.953604203806366e-06, |
|
"loss": 1.9008, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 2.0209237050267927, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 6.935091866005518e-06, |
|
"loss": 1.9513, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 2.0243259334864336, |
|
"grad_norm": 2.125, |
|
"learning_rate": 6.9165779187027685e-06, |
|
"loss": 1.9013, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 2.0277281619460745, |
|
"grad_norm": 2.25, |
|
"learning_rate": 6.898062502425267e-06, |
|
"loss": 1.914, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 2.031130390405716, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 6.87954575771132e-06, |
|
"loss": 1.8773, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 2.0345326188653567, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 6.861027825109312e-06, |
|
"loss": 1.9337, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 2.037934847324998, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 6.842508845176642e-06, |
|
"loss": 1.9866, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 2.041337075784639, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 6.8239889584786644e-06, |
|
"loss": 1.9557, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.04473930424428, |
|
"grad_norm": 2.0, |
|
"learning_rate": 6.805468305587612e-06, |
|
"loss": 1.9082, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 2.048141532703921, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 6.786947027081537e-06, |
|
"loss": 1.8822, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 2.051543761163562, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 6.768425263543234e-06, |
|
"loss": 1.9611, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 2.0549459896232034, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 6.7499031555591875e-06, |
|
"loss": 1.9623, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 2.0583482180828443, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 6.7313808437184895e-06, |
|
"loss": 1.9902, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 2.061750446542485, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 6.71285846861178e-06, |
|
"loss": 1.9358, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 2.0651526750021265, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 6.694336170830184e-06, |
|
"loss": 1.8377, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 2.0685549034617674, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 6.675814090964238e-06, |
|
"loss": 1.9771, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 2.0719571319214087, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 6.6572923696028185e-06, |
|
"loss": 1.8634, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 2.0753593603810496, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 6.638771147332086e-06, |
|
"loss": 1.9388, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.0787615888406905, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 6.62025056473442e-06, |
|
"loss": 1.918, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 2.082163817300332, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 6.601730762387327e-06, |
|
"loss": 1.9617, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 2.0855660457599727, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 6.583211880862406e-06, |
|
"loss": 1.9056, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 2.0889682742196136, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 6.56469406072426e-06, |
|
"loss": 1.9458, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 2.092370502679255, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 6.546177442529437e-06, |
|
"loss": 1.9393, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 2.095772731138896, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 6.5276621668253645e-06, |
|
"loss": 1.9038, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 2.099174959598537, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 6.509148374149276e-06, |
|
"loss": 1.9621, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 2.102577188058178, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 6.490636205027152e-06, |
|
"loss": 1.9206, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 2.105979416517819, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 6.472125799972643e-06, |
|
"loss": 1.9409, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 2.1093816449774603, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 6.453617299486017e-06, |
|
"loss": 1.9348, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.112783873437101, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 6.435110844053086e-06, |
|
"loss": 1.9364, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 2.1161861018967425, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 6.416606574144131e-06, |
|
"loss": 1.9042, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 2.1195883303563834, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 6.398104630212853e-06, |
|
"loss": 1.9547, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 2.1229905588160243, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 6.379605152695294e-06, |
|
"loss": 1.9768, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 2.1263927872756656, |
|
"grad_norm": 2.125, |
|
"learning_rate": 6.361108282008776e-06, |
|
"loss": 1.9522, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 2.1297950157353065, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 6.342614158550832e-06, |
|
"loss": 1.9168, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 2.133197244194948, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 6.324122922698143e-06, |
|
"loss": 1.9871, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 2.1365994726545887, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 6.305634714805481e-06, |
|
"loss": 1.9398, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 2.1400017011142296, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 6.287149675204619e-06, |
|
"loss": 1.9629, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 2.143403929573871, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 6.268667944203294e-06, |
|
"loss": 1.9102, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 2.146806158033512, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 6.2501896620841255e-06, |
|
"loss": 1.8596, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 2.150208386493153, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 6.231714969103553e-06, |
|
"loss": 1.7886, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 2.153610614952794, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 6.213244005490776e-06, |
|
"loss": 1.9695, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 2.157012843412435, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 6.194776911446687e-06, |
|
"loss": 1.971, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 2.1604150718720763, |
|
"grad_norm": 2.375, |
|
"learning_rate": 6.176313827142807e-06, |
|
"loss": 1.9136, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 2.163817300331717, |
|
"grad_norm": 2.25, |
|
"learning_rate": 6.157854892720216e-06, |
|
"loss": 1.9184, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 2.1672195287913585, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 6.139400248288503e-06, |
|
"loss": 1.9933, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 2.1706217572509994, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 6.120950033924691e-06, |
|
"loss": 1.9114, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 2.1740239857106403, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 6.102504389672177e-06, |
|
"loss": 1.9974, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 2.1774262141702816, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 6.084063455539671e-06, |
|
"loss": 1.8925, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 2.1808284426299225, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 6.065627371500128e-06, |
|
"loss": 1.9208, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 2.184230671089564, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 6.0471962774896946e-06, |
|
"loss": 1.8757, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 2.1876328995492047, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 6.0287703134066385e-06, |
|
"loss": 1.905, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 2.1910351280088456, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 6.010349619110283e-06, |
|
"loss": 1.8878, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 2.194437356468487, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 5.991934334419968e-06, |
|
"loss": 1.9549, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 2.197839584928128, |
|
"grad_norm": 2.125, |
|
"learning_rate": 5.973524599113954e-06, |
|
"loss": 1.9137, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 2.201241813387769, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 5.9551205529283955e-06, |
|
"loss": 1.9856, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 2.20464404184741, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 5.936722335556252e-06, |
|
"loss": 1.9262, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 2.208046270307051, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 5.91833008664625e-06, |
|
"loss": 1.9596, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 2.2114484987666922, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 5.89994394580181e-06, |
|
"loss": 1.907, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.214850727226333, |
|
"grad_norm": 2.125, |
|
"learning_rate": 5.881564052579987e-06, |
|
"loss": 1.938, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 2.2182529556859745, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 5.863190546490422e-06, |
|
"loss": 1.9615, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 2.2216551841456154, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 5.844823566994264e-06, |
|
"loss": 1.9353, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 2.2250574126052562, |
|
"grad_norm": 2.75, |
|
"learning_rate": 5.826463253503132e-06, |
|
"loss": 1.98, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 2.2284596410648976, |
|
"grad_norm": 2.25, |
|
"learning_rate": 5.808109745378048e-06, |
|
"loss": 1.8649, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 2.2318618695245385, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 5.789763181928373e-06, |
|
"loss": 1.9079, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 2.23526409798418, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 5.771423702410762e-06, |
|
"loss": 1.9156, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 2.2386663264438207, |
|
"grad_norm": 2.0, |
|
"learning_rate": 5.753091446028094e-06, |
|
"loss": 1.9416, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 2.2420685549034616, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 5.734766551928427e-06, |
|
"loss": 1.8595, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 2.245470783363103, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 5.716449159203939e-06, |
|
"loss": 1.9292, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 2.248873011822744, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 5.698139406889855e-06, |
|
"loss": 1.9578, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 2.252275240282385, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 5.679837433963432e-06, |
|
"loss": 1.9706, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 2.255677468742026, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 5.661543379342855e-06, |
|
"loss": 1.9641, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 2.259079697201667, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 5.643257381886218e-06, |
|
"loss": 1.9505, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 2.2624819256613082, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 5.624979580390459e-06, |
|
"loss": 1.9631, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 2.265884154120949, |
|
"grad_norm": 2.375, |
|
"learning_rate": 5.6067101135902996e-06, |
|
"loss": 1.9767, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 2.2692863825805905, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 5.588449120157205e-06, |
|
"loss": 1.9077, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 2.2726886110402313, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 5.57019673869832e-06, |
|
"loss": 1.9133, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 2.2760908394998722, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 5.5519531077554244e-06, |
|
"loss": 1.8405, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 2.2794930679595136, |
|
"grad_norm": 2.375, |
|
"learning_rate": 5.533718365803875e-06, |
|
"loss": 1.8948, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 2.2828952964191545, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 5.51549265125156e-06, |
|
"loss": 1.9344, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 2.286297524878796, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 5.4972761024378514e-06, |
|
"loss": 1.842, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 2.2896997533384367, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 5.479068857632542e-06, |
|
"loss": 1.9172, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 2.2931019817980776, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 5.46087105503481e-06, |
|
"loss": 1.9252, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 2.296504210257719, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 5.4426828327721594e-06, |
|
"loss": 1.9356, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 2.29990643871736, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 5.4245043288993795e-06, |
|
"loss": 1.9462, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 2.303308667177001, |
|
"grad_norm": 2.375, |
|
"learning_rate": 5.406335681397498e-06, |
|
"loss": 1.9788, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 2.306710895636642, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 5.388177028172714e-06, |
|
"loss": 1.9221, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 2.310113124096283, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 5.370028507055387e-06, |
|
"loss": 1.9344, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 2.313515352555924, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 5.351890255798953e-06, |
|
"loss": 1.871, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 2.316917581015565, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 5.333762412078907e-06, |
|
"loss": 1.975, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 2.3203198094752064, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 5.315645113491743e-06, |
|
"loss": 1.9103, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 2.3237220379348473, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 5.2975384975539145e-06, |
|
"loss": 1.9036, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 2.327124266394488, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 5.279442701700792e-06, |
|
"loss": 1.9292, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 2.3305264948541295, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 5.261357863285613e-06, |
|
"loss": 1.9181, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 2.3339287233137704, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 5.243284119578448e-06, |
|
"loss": 1.8917, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 2.3373309517734118, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 5.225221607765159e-06, |
|
"loss": 1.9389, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 2.3407331802330527, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 5.207170464946342e-06, |
|
"loss": 1.9298, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 2.3441354086926935, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 5.189130828136312e-06, |
|
"loss": 1.9011, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 2.347537637152335, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 5.1711028342620375e-06, |
|
"loss": 1.908, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 2.3509398656119758, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 5.153086620162123e-06, |
|
"loss": 1.8829, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 2.354342094071617, |
|
"grad_norm": 2.25, |
|
"learning_rate": 5.135082322585758e-06, |
|
"loss": 1.9441, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 2.357744322531258, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 5.117090078191676e-06, |
|
"loss": 1.9403, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 2.361146550990899, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 5.09911002354713e-06, |
|
"loss": 1.9478, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 2.36454877945054, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 5.081142295126842e-06, |
|
"loss": 1.8916, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 2.367951007910181, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 5.063187029311983e-06, |
|
"loss": 1.9323, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 2.3713532363698224, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 5.045244362389115e-06, |
|
"loss": 1.9571, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 2.3747554648294633, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 5.027314430549185e-06, |
|
"loss": 1.9486, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 2.378157693289104, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 5.009397369886466e-06, |
|
"loss": 1.944, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 2.3815599217487455, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 4.991493316397536e-06, |
|
"loss": 1.9539, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.3849621502083864, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 4.973602405980251e-06, |
|
"loss": 1.8877, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 2.3883643786680278, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 4.955724774432697e-06, |
|
"loss": 1.9579, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 2.3917666071276686, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 4.937860557452174e-06, |
|
"loss": 1.9066, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 2.3951688355873095, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 4.920009890634164e-06, |
|
"loss": 1.9488, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 2.398571064046951, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 4.902172909471289e-06, |
|
"loss": 1.9939, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 2.4019732925065918, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 4.884349749352304e-06, |
|
"loss": 1.9718, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 2.405375520966233, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 4.866540545561045e-06, |
|
"loss": 1.9198, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 2.408777749425874, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 4.848745433275427e-06, |
|
"loss": 1.8993, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 2.412179977885515, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 4.830964547566399e-06, |
|
"loss": 1.9977, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 2.415582206345156, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 4.813198023396925e-06, |
|
"loss": 1.911, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 2.418984434804797, |
|
"grad_norm": 2.25, |
|
"learning_rate": 4.795445995620965e-06, |
|
"loss": 1.977, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 2.4223866632644384, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 4.777708598982436e-06, |
|
"loss": 1.9065, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 2.4257888917240793, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 4.759985968114213e-06, |
|
"loss": 1.9569, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 2.42919112018372, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 4.742278237537088e-06, |
|
"loss": 1.9151, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 2.4325933486433615, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 4.72458554165875e-06, |
|
"loss": 1.984, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 2.4359955771030024, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 4.706908014772776e-06, |
|
"loss": 1.9921, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 2.4393978055626437, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 4.689245791057602e-06, |
|
"loss": 1.9753, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 2.4428000340222846, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 4.671599004575511e-06, |
|
"loss": 1.9305, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 2.4462022624819255, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 4.653967789271607e-06, |
|
"loss": 1.8709, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 2.449604490941567, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 4.636352278972806e-06, |
|
"loss": 1.9123, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 2.4530067194012077, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.618752607386824e-06, |
|
"loss": 1.8976, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 2.456408947860849, |
|
"grad_norm": 2.375, |
|
"learning_rate": 4.601168908101142e-06, |
|
"loss": 2.0117, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 2.45981117632049, |
|
"grad_norm": 2.25, |
|
"learning_rate": 4.5836013145820175e-06, |
|
"loss": 1.8844, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 2.463213404780131, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 4.5660499601734545e-06, |
|
"loss": 1.9541, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 2.466615633239772, |
|
"grad_norm": 2.375, |
|
"learning_rate": 4.548514978096198e-06, |
|
"loss": 1.9029, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 2.470017861699413, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 4.5309965014467246e-06, |
|
"loss": 1.9122, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 2.4734200901590544, |
|
"grad_norm": 2.125, |
|
"learning_rate": 4.513494663196221e-06, |
|
"loss": 1.8935, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 2.4768223186186953, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 4.496009596189593e-06, |
|
"loss": 1.9198, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 2.480224547078336, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 4.478541433144435e-06, |
|
"loss": 1.8702, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 2.4836267755379775, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.461090306650046e-06, |
|
"loss": 1.9336, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 2.4870290039976184, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 4.443656349166409e-06, |
|
"loss": 1.9156, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 2.4904312324572597, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 4.426239693023181e-06, |
|
"loss": 1.949, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 2.4938334609169006, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 4.408840470418706e-06, |
|
"loss": 1.9331, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 2.4972356893765415, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.391458813418992e-06, |
|
"loss": 1.9376, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 2.500637917836183, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.374094853956726e-06, |
|
"loss": 1.8894, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 2.5040401462958237, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 4.3567487238302625e-06, |
|
"loss": 2.0008, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 2.507442374755465, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.3394205547026224e-06, |
|
"loss": 1.8901, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 2.510844603215106, |
|
"grad_norm": 2.25, |
|
"learning_rate": 4.322110478100502e-06, |
|
"loss": 1.9533, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 2.514246831674747, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.3048186254132606e-06, |
|
"loss": 1.9216, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 2.517649060134388, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 4.287545127891939e-06, |
|
"loss": 1.9397, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 2.521051288594029, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 4.270290116648254e-06, |
|
"loss": 1.9161, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 2.5244535170536704, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 4.2530537226536075e-06, |
|
"loss": 1.8427, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 2.5278557455133113, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 4.235836076738085e-06, |
|
"loss": 1.917, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 2.531257973972952, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 4.218637309589471e-06, |
|
"loss": 1.8681, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 2.5346602024325935, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.201457551752256e-06, |
|
"loss": 1.9049, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 2.5380624308922344, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 4.184296933626636e-06, |
|
"loss": 1.9001, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 2.5414646593518757, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 4.167155585467538e-06, |
|
"loss": 1.895, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 2.5448668878115166, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 4.150033637383623e-06, |
|
"loss": 1.9132, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 2.5482691162711575, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 4.132931219336289e-06, |
|
"loss": 1.9031, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 2.551671344730799, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 4.115848461138707e-06, |
|
"loss": 1.8727, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.5550735731904397, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.0987854924548134e-06, |
|
"loss": 1.8808, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 2.558475801650081, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.081742442798342e-06, |
|
"loss": 1.9265, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 2.561878030109722, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 4.064719441531834e-06, |
|
"loss": 1.9463, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 2.565280258569363, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 4.04771661786565e-06, |
|
"loss": 1.9341, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 2.568682487029004, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 4.030734100857004e-06, |
|
"loss": 1.9036, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 2.572084715488645, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 4.013772019408969e-06, |
|
"loss": 1.9604, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 2.5754869439482864, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 3.9968305022695076e-06, |
|
"loss": 1.8938, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 2.5788891724079273, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 3.979909678030498e-06, |
|
"loss": 1.976, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 2.582291400867568, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 3.9630096751267395e-06, |
|
"loss": 1.9534, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 2.5856936293272095, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 3.946130621835003e-06, |
|
"loss": 1.9374, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 2.5890958577868504, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 3.929272646273037e-06, |
|
"loss": 1.9044, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 2.5924980862464917, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 3.9124358763986045e-06, |
|
"loss": 1.9723, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 2.5959003147061326, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 3.895620440008517e-06, |
|
"loss": 1.8593, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 2.5993025431657735, |
|
"grad_norm": 2.5, |
|
"learning_rate": 3.878826464737643e-06, |
|
"loss": 1.9203, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 2.602704771625415, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 3.862054078057968e-06, |
|
"loss": 1.9127, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 2.6061070000850557, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 3.845303407277605e-06, |
|
"loss": 1.8969, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 2.609509228544697, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 3.828574579539842e-06, |
|
"loss": 1.957, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 2.612911457004338, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 3.811867721822161e-06, |
|
"loss": 1.9497, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 2.616313685463979, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 3.7951829609352926e-06, |
|
"loss": 1.9144, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 2.61971591392362, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 3.778520423522247e-06, |
|
"loss": 1.9252, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 2.623118142383261, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 3.7618802360573384e-06, |
|
"loss": 1.9192, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 2.6265203708429024, |
|
"grad_norm": 2.0, |
|
"learning_rate": 3.7452625248452478e-06, |
|
"loss": 1.887, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 2.6299225993025432, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 3.728667416020052e-06, |
|
"loss": 1.9326, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 2.633324827762184, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 3.7120950355442677e-06, |
|
"loss": 1.9739, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 2.6367270562218255, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 3.6955455092078956e-06, |
|
"loss": 1.9417, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 2.6401292846814663, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 3.679018962627461e-06, |
|
"loss": 1.9288, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 2.6435315131411077, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 3.6625155212450754e-06, |
|
"loss": 1.9062, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 2.6469337416007486, |
|
"grad_norm": 2.625, |
|
"learning_rate": 3.6460353103274615e-06, |
|
"loss": 1.9304, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 2.6503359700603895, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 3.6295784549650233e-06, |
|
"loss": 1.9378, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 2.6537381985200303, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 3.613145080070886e-06, |
|
"loss": 1.9244, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 2.6571404269796717, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 3.59673531037995e-06, |
|
"loss": 1.8997, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 2.660542655439313, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 3.5803492704479488e-06, |
|
"loss": 1.9715, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 2.663944883898954, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 3.5639870846504873e-06, |
|
"loss": 1.917, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 2.667347112358595, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 3.54764887718212e-06, |
|
"loss": 1.9122, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 2.6707493408182357, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 3.5313347720553963e-06, |
|
"loss": 1.9234, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 2.674151569277877, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 3.5150448930999113e-06, |
|
"loss": 1.9519, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 2.6775537977375183, |
|
"grad_norm": 2.25, |
|
"learning_rate": 3.4987793639613926e-06, |
|
"loss": 1.9065, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 2.6809560261971592, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 3.482538308100727e-06, |
|
"loss": 1.8604, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 2.6843582546568, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 3.4663218487930547e-06, |
|
"loss": 1.8554, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 2.687760483116441, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 3.4501301091268043e-06, |
|
"loss": 1.936, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 2.6911627115760823, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 3.433963212002789e-06, |
|
"loss": 1.8966, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 2.6945649400357237, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 3.41782128013325e-06, |
|
"loss": 1.9634, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 2.6979671684953646, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 3.4017044360409375e-06, |
|
"loss": 1.922, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 2.7013693969550054, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 3.3856128020581783e-06, |
|
"loss": 1.9411, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 2.7047716254146463, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 3.3695465003259376e-06, |
|
"loss": 1.8679, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 2.7081738538742877, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 3.353505652792909e-06, |
|
"loss": 1.906, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 2.711576082333929, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 3.3374903812145784e-06, |
|
"loss": 1.8951, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 2.71497831079357, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 3.3215008071522965e-06, |
|
"loss": 1.9556, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 2.7183805392532108, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 3.3055370519723652e-06, |
|
"loss": 1.9427, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 2.7217827677128517, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 3.289599236845113e-06, |
|
"loss": 1.9533, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.725184996172493, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 3.273687482743974e-06, |
|
"loss": 1.9608, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 2.7285872246321343, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 3.2578019104445702e-06, |
|
"loss": 1.9894, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 2.731989453091775, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 3.241942640523791e-06, |
|
"loss": 1.864, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 2.735391681551416, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 3.2261097933588893e-06, |
|
"loss": 1.9567, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 2.738793910011057, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 3.210303489126551e-06, |
|
"loss": 1.9093, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 2.7421961384706983, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 3.1945238478020003e-06, |
|
"loss": 1.9673, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 2.745598366930339, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 3.1787709891580763e-06, |
|
"loss": 1.9712, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 2.7490005953899805, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 3.1630450327643315e-06, |
|
"loss": 1.9127, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 2.7524028238496214, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 3.147346097986121e-06, |
|
"loss": 1.9763, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 2.7558050523092623, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 3.1316743039836908e-06, |
|
"loss": 1.8313, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 2.7592072807689036, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 3.1160297697112855e-06, |
|
"loss": 1.9062, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 2.7626095092285445, |
|
"grad_norm": 2.25, |
|
"learning_rate": 3.10041261391624e-06, |
|
"loss": 1.9072, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 2.766011737688186, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 3.0848229551380702e-06, |
|
"loss": 1.932, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 2.7694139661478268, |
|
"grad_norm": 2.375, |
|
"learning_rate": 3.069260911707586e-06, |
|
"loss": 1.9311, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 2.7728161946074676, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 3.0537266017459856e-06, |
|
"loss": 1.9067, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 2.776218423067109, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 3.0382201431639656e-06, |
|
"loss": 1.978, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 2.77962065152675, |
|
"grad_norm": 2.375, |
|
"learning_rate": 3.0227416536608095e-06, |
|
"loss": 1.9084, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 2.783022879986391, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 3.0072912507235167e-06, |
|
"loss": 1.8865, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 2.786425108446032, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 2.991869051625898e-06, |
|
"loss": 1.9293, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 2.789827336905673, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.9764751734276803e-06, |
|
"loss": 1.9127, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 2.7932295653653143, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 2.9611097329736394e-06, |
|
"loss": 1.9198, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 2.796631793824955, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 2.9457728468926836e-06, |
|
"loss": 1.9261, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 2.8000340222845965, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.930464631596993e-06, |
|
"loss": 1.9068, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 2.8034362507442374, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.915185203281126e-06, |
|
"loss": 1.947, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 2.8068384792038783, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 2.899934677921133e-06, |
|
"loss": 1.9014, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 2.8102407076635196, |
|
"grad_norm": 2.25, |
|
"learning_rate": 2.884713171273686e-06, |
|
"loss": 1.9012, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 2.8136429361231605, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 2.869520798875194e-06, |
|
"loss": 1.9299, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 2.817045164582802, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 2.8543576760409264e-06, |
|
"loss": 1.9472, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 2.8204473930424427, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.839223917864142e-06, |
|
"loss": 1.9323, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 2.8238496215020836, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 2.824119639215203e-06, |
|
"loss": 1.9394, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 2.827251849961725, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 2.809044954740723e-06, |
|
"loss": 1.9369, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 2.830654078421366, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 2.7939999788626755e-06, |
|
"loss": 1.9025, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 2.834056306881007, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 2.778984825777543e-06, |
|
"loss": 1.908, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 2.837458535340648, |
|
"grad_norm": 2.5, |
|
"learning_rate": 2.763999609455441e-06, |
|
"loss": 1.9814, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 2.840860763800289, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 2.7490444436392535e-06, |
|
"loss": 1.9804, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 2.8442629922599303, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 2.7341194418437747e-06, |
|
"loss": 1.9187, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 2.847665220719571, |
|
"grad_norm": 2.25, |
|
"learning_rate": 2.7192247173548356e-06, |
|
"loss": 1.8885, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 2.8510674491792125, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 2.7043603832284616e-06, |
|
"loss": 1.9056, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 2.8544696776388534, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 2.689526552289997e-06, |
|
"loss": 1.9068, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 2.8578719060984943, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 2.6747233371332606e-06, |
|
"loss": 2.0559, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 2.8612741345581356, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.6599508501196876e-06, |
|
"loss": 1.9102, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 2.8646763630177765, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 2.6452092033774744e-06, |
|
"loss": 1.878, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 2.868078591477418, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 2.630498508800734e-06, |
|
"loss": 1.9412, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 2.8714808199370587, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.6158188780486312e-06, |
|
"loss": 1.8957, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 2.8748830483966996, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 2.6011704225445548e-06, |
|
"loss": 1.8656, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 2.878285276856341, |
|
"grad_norm": 2.5, |
|
"learning_rate": 2.586553253475264e-06, |
|
"loss": 1.9598, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 2.881687505315982, |
|
"grad_norm": 2.25, |
|
"learning_rate": 2.5719674817900346e-06, |
|
"loss": 1.957, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 2.885089733775623, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 2.5574132181998334e-06, |
|
"loss": 1.9725, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 2.888491962235264, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 2.5428905731764664e-06, |
|
"loss": 1.9228, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 2.891894190694905, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.5283996569517464e-06, |
|
"loss": 1.938, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.8952964191545463, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 2.5139405795166538e-06, |
|
"loss": 1.9243, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 2.898698647614187, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 2.4995134506204964e-06, |
|
"loss": 1.9328, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 2.9021008760738285, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 2.48511837977009e-06, |
|
"loss": 1.9199, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 2.9055031045334694, |
|
"grad_norm": 2.625, |
|
"learning_rate": 2.4707554762289077e-06, |
|
"loss": 1.9613, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 2.9089053329931103, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 2.4564248490162763e-06, |
|
"loss": 1.9547, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 2.9123075614527516, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 2.442126606906526e-06, |
|
"loss": 2.0251, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 2.9157097899123925, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.4278608584281694e-06, |
|
"loss": 1.9231, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 2.919112018372034, |
|
"grad_norm": 2.625, |
|
"learning_rate": 2.413627711863091e-06, |
|
"loss": 1.9295, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 2.9225142468316747, |
|
"grad_norm": 2.5, |
|
"learning_rate": 2.399427275245705e-06, |
|
"loss": 1.9444, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 2.9259164752913156, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 2.3852596563621536e-06, |
|
"loss": 1.9794, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 2.929318703750957, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 2.3711249627494803e-06, |
|
"loss": 1.9096, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 2.932720932210598, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 2.3570233016948133e-06, |
|
"loss": 1.9062, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 2.936123160670239, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 2.3429547802345537e-06, |
|
"loss": 1.8779, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 2.93952538912988, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 2.3289195051535584e-06, |
|
"loss": 1.8901, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 2.942927617589521, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 2.3149175829843367e-06, |
|
"loss": 1.9073, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 2.9463298460491623, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 2.3009491200062343e-06, |
|
"loss": 1.9434, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 2.949732074508803, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 2.287014222244634e-06, |
|
"loss": 1.88, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 2.9531343029684445, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 2.273112995470147e-06, |
|
"loss": 1.968, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 2.9565365314280854, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 2.259245545197807e-06, |
|
"loss": 1.9048, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 2.9599387598877263, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 2.245411976686278e-06, |
|
"loss": 1.9502, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 2.9633409883473676, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 2.231612394937042e-06, |
|
"loss": 1.87, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 2.9667432168070085, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 2.217846904693616e-06, |
|
"loss": 1.9337, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 2.97014544526665, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 2.2041156104407518e-06, |
|
"loss": 1.9095, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 2.9735476737262907, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 2.1904186164036358e-06, |
|
"loss": 1.9346, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 2.9769499021859316, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 2.1767560265471087e-06, |
|
"loss": 1.9296, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 2.980352130645573, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 2.163127944574872e-06, |
|
"loss": 1.9386, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 2.983754359105214, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.149534473928699e-06, |
|
"loss": 1.9189, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 2.987156587564855, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 2.135975717787654e-06, |
|
"loss": 1.8996, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 2.990558816024496, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 2.1224517790673003e-06, |
|
"loss": 1.937, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 2.993961044484137, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 2.108962760418933e-06, |
|
"loss": 1.9724, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 2.9973632729437782, |
|
"grad_norm": 2.5, |
|
"learning_rate": 2.0955087642287833e-06, |
|
"loss": 1.9497, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 3.000765501403419, |
|
"grad_norm": 2.5, |
|
"learning_rate": 2.0820898926172546e-06, |
|
"loss": 1.9683, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 3.0041677298630605, |
|
"grad_norm": 2.375, |
|
"learning_rate": 2.0687062474381516e-06, |
|
"loss": 1.9146, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 3.0075699583227014, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 2.05535793027788e-06, |
|
"loss": 1.9749, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 3.0109721867823422, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 2.042045042454711e-06, |
|
"loss": 1.9554, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 3.0143744152419836, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 2.028767685017981e-06, |
|
"loss": 1.8963, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 3.0177766437016245, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 2.015525958747352e-06, |
|
"loss": 1.938, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 3.021178872161266, |
|
"grad_norm": 2.625, |
|
"learning_rate": 2.0023199641520177e-06, |
|
"loss": 1.9223, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 3.0245811006209067, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.989149801469974e-06, |
|
"loss": 1.8825, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 3.0279833290805476, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.97601557066723e-06, |
|
"loss": 1.9489, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 3.031385557540189, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.9629173714370583e-06, |
|
"loss": 1.9236, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 3.03478778599983, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.949855303199246e-06, |
|
"loss": 1.9561, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 3.038190014459471, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.9368294650993263e-06, |
|
"loss": 1.8969, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 3.041592242919112, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.92383995600784e-06, |
|
"loss": 1.9331, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 3.044994471378753, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.910886874519575e-06, |
|
"loss": 1.9734, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 3.0483966998383942, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.8979703189528225e-06, |
|
"loss": 1.918, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 3.051798928298035, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.885090387348631e-06, |
|
"loss": 1.9162, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 3.0552011567576765, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.8722471774700541e-06, |
|
"loss": 1.9047, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 3.0586033852173173, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.8594407868014222e-06, |
|
"loss": 1.9391, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 3.0620056136769582, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.8466713125475953e-06, |
|
"loss": 1.9597, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 3.0654078421365996, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.8339388516332183e-06, |
|
"loss": 1.9123, |
|
"step": 9010 |
|
}, |
|
{ |
|
"epoch": 3.0688100705962404, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 1.8212435007019987e-06, |
|
"loss": 1.9063, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 3.072212299055882, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.8085853561159651e-06, |
|
"loss": 1.8604, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 3.0756145275155227, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.7959645139547367e-06, |
|
"loss": 1.9165, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 3.0790167559751636, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.7833810700147973e-06, |
|
"loss": 1.9096, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 3.082418984434805, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.770835119808758e-06, |
|
"loss": 1.9433, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 3.0858212128944458, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.7583267585646496e-06, |
|
"loss": 1.972, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 3.089223441354087, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.7458560812251807e-06, |
|
"loss": 1.9191, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 3.092625669813728, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.7334231824470327e-06, |
|
"loss": 1.882, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 3.096027898273369, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.7210281566001321e-06, |
|
"loss": 1.9086, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 3.09943012673301, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.7086710977669391e-06, |
|
"loss": 1.9225, |
|
"step": 9110 |
|
}, |
|
{ |
|
"epoch": 3.102832355192651, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.6963520997417304e-06, |
|
"loss": 1.9364, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 3.1062345836522924, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.684071256029885e-06, |
|
"loss": 1.962, |
|
"step": 9130 |
|
}, |
|
{ |
|
"epoch": 3.1096368121119333, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.6718286598471834e-06, |
|
"loss": 1.9557, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 3.113039040571574, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.6596244041190884e-06, |
|
"loss": 1.963, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 3.1164412690312155, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.6474585814800486e-06, |
|
"loss": 1.8665, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 3.1198434974908564, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.6353312842727971e-06, |
|
"loss": 1.9364, |
|
"step": 9170 |
|
}, |
|
{ |
|
"epoch": 3.1232457259504978, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 1.6232426045476368e-06, |
|
"loss": 1.9379, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 3.1266479544101387, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.6111926340617594e-06, |
|
"loss": 1.8696, |
|
"step": 9190 |
|
}, |
|
{ |
|
"epoch": 3.1300501828697795, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.599181464278531e-06, |
|
"loss": 1.9511, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 3.133452411329421, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.587209186366815e-06, |
|
"loss": 1.9289, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 3.1368546397890618, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.5752758912002694e-06, |
|
"loss": 1.8937, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 3.140256868248703, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 1.5633816693566608e-06, |
|
"loss": 1.8763, |
|
"step": 9230 |
|
}, |
|
{ |
|
"epoch": 3.143659096708344, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.5515266111171768e-06, |
|
"loss": 1.9913, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 3.147061325167985, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.5397108064657348e-06, |
|
"loss": 1.8861, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 3.150463553627626, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.5279343450883104e-06, |
|
"loss": 1.9029, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 3.153865782087267, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.5161973163722477e-06, |
|
"loss": 1.9382, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 3.1572680105469084, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.5044998094055818e-06, |
|
"loss": 1.8859, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 3.1606702390065493, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.4928419129763672e-06, |
|
"loss": 1.8785, |
|
"step": 9290 |
|
}, |
|
{ |
|
"epoch": 3.16407246746619, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.4812237155720006e-06, |
|
"loss": 1.8864, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 3.1674746959258315, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.4696453053785496e-06, |
|
"loss": 1.8698, |
|
"step": 9310 |
|
}, |
|
{ |
|
"epoch": 3.1708769243854724, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.4581067702800793e-06, |
|
"loss": 1.9852, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 3.1742791528451137, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.4466081978579942e-06, |
|
"loss": 1.98, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 3.1776813813047546, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.4351496753903699e-06, |
|
"loss": 1.925, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 3.1810836097643955, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.4237312898512816e-06, |
|
"loss": 1.9355, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 3.184485838224037, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.4123531279101576e-06, |
|
"loss": 1.9966, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 3.1878880666836777, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.4010152759311148e-06, |
|
"loss": 1.8377, |
|
"step": 9370 |
|
}, |
|
{ |
|
"epoch": 3.191290295143319, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.3897178199723027e-06, |
|
"loss": 1.9501, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 3.19469252360296, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.3784608457852537e-06, |
|
"loss": 1.9103, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 3.198094752062601, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.3672444388142238e-06, |
|
"loss": 1.9575, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 3.201496980522242, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.3560686841955576e-06, |
|
"loss": 1.929, |
|
"step": 9410 |
|
}, |
|
{ |
|
"epoch": 3.204899208981883, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.3449336667570272e-06, |
|
"loss": 1.9606, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 3.2083014374415244, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.3338394710172017e-06, |
|
"loss": 1.9379, |
|
"step": 9430 |
|
}, |
|
{ |
|
"epoch": 3.2117036659011653, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.3227861811847961e-06, |
|
"loss": 1.8995, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 3.215105894360806, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.3117738811580378e-06, |
|
"loss": 1.9038, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 3.2185081228204475, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.3008026545240273e-06, |
|
"loss": 1.9499, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 3.2219103512800884, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.2898725845581015e-06, |
|
"loss": 1.9625, |
|
"step": 9470 |
|
}, |
|
{ |
|
"epoch": 3.2253125797397297, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.2789837542232062e-06, |
|
"loss": 2.0014, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 3.2287148081993706, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.2681362461692674e-06, |
|
"loss": 1.9227, |
|
"step": 9490 |
|
}, |
|
{ |
|
"epoch": 3.2321170366590115, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.2573301427325523e-06, |
|
"loss": 1.9411, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 3.235519265118653, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.246565525935065e-06, |
|
"loss": 1.8898, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 3.2389214935782937, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.2358424774839005e-06, |
|
"loss": 1.8962, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 3.242323722037935, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.2251610787706435e-06, |
|
"loss": 1.9404, |
|
"step": 9530 |
|
}, |
|
{ |
|
"epoch": 3.245725950497576, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 1.2145214108707407e-06, |
|
"loss": 1.8978, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 3.249128178957217, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.2039235545428843e-06, |
|
"loss": 1.9312, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 3.252530407416858, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.1933675902284088e-06, |
|
"loss": 1.8721, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 3.255932635876499, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.182853598050669e-06, |
|
"loss": 1.9304, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 3.2593348643361404, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.1723816578144417e-06, |
|
"loss": 1.8912, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 3.2627370927957813, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.1619518490053083e-06, |
|
"loss": 1.8852, |
|
"step": 9590 |
|
}, |
|
{ |
|
"epoch": 3.266139321255422, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.1515642507890646e-06, |
|
"loss": 1.9256, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 3.2695415497150635, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.141218942011112e-06, |
|
"loss": 1.8988, |
|
"step": 9610 |
|
}, |
|
{ |
|
"epoch": 3.2729437781747044, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.1309160011958583e-06, |
|
"loss": 1.9262, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 3.2763460066343457, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.1206555065461265e-06, |
|
"loss": 1.9177, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 3.2797482350939866, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.1104375359425585e-06, |
|
"loss": 1.9117, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 3.2831504635536275, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.100262166943023e-06, |
|
"loss": 1.9711, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 3.286552692013269, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.0901294767820318e-06, |
|
"loss": 1.9243, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 3.2899549204729097, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.0800395423701436e-06, |
|
"loss": 1.9023, |
|
"step": 9670 |
|
}, |
|
{ |
|
"epoch": 3.293357148932551, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.0699924402933917e-06, |
|
"loss": 1.938, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 3.296759377392192, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.0599882468126933e-06, |
|
"loss": 1.9328, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 3.300161605851833, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.0500270378632782e-06, |
|
"loss": 1.9429, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 3.303563834311474, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.0401088890541082e-06, |
|
"loss": 1.9068, |
|
"step": 9710 |
|
}, |
|
{ |
|
"epoch": 3.306966062771115, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.0302338756673032e-06, |
|
"loss": 1.9121, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 3.3103682912307564, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.0204020726575725e-06, |
|
"loss": 1.9197, |
|
"step": 9730 |
|
}, |
|
{ |
|
"epoch": 3.3137705196903973, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.0106135546516385e-06, |
|
"loss": 1.9347, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 3.317172748150038, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.0008683959476827e-06, |
|
"loss": 1.929, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 3.3205749766096795, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 9.911666705147721e-07, |
|
"loss": 1.8878, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 3.3239772050693204, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 9.815084519922975e-07, |
|
"loss": 1.8525, |
|
"step": 9770 |
|
}, |
|
{ |
|
"epoch": 3.3273794335289617, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 9.718938136894211e-07, |
|
"loss": 1.8368, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 3.3307816619886026, |
|
"grad_norm": 2.0, |
|
"learning_rate": 9.623228285845155e-07, |
|
"loss": 1.8964, |
|
"step": 9790 |
|
}, |
|
{ |
|
"epoch": 3.3341838904482435, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 9.527955693246117e-07, |
|
"loss": 1.9062, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 3.337586118907885, |
|
"grad_norm": 2.125, |
|
"learning_rate": 9.433121082248422e-07, |
|
"loss": 1.87, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 3.3409883473675257, |
|
"grad_norm": 2.5, |
|
"learning_rate": 9.33872517267902e-07, |
|
"loss": 1.9351, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 3.344390575827167, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 9.244768681034954e-07, |
|
"loss": 1.9826, |
|
"step": 9830 |
|
}, |
|
{ |
|
"epoch": 3.347792804286808, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 9.151252320477888e-07, |
|
"loss": 1.9788, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 3.351195032746449, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 9.058176800828842e-07, |
|
"loss": 1.9306, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 3.35459726120609, |
|
"grad_norm": 2.375, |
|
"learning_rate": 8.965542828562589e-07, |
|
"loss": 1.9304, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 3.357999489665731, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 8.873351106802486e-07, |
|
"loss": 1.9565, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 3.3614017181253724, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 8.781602335315041e-07, |
|
"loss": 1.9325, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 3.3648039465850133, |
|
"grad_norm": 2.25, |
|
"learning_rate": 8.690297210504589e-07, |
|
"loss": 1.9074, |
|
"step": 9890 |
|
}, |
|
{ |
|
"epoch": 3.368206175044654, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 8.599436425408064e-07, |
|
"loss": 1.9338, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 3.3716084035042955, |
|
"grad_norm": 2.625, |
|
"learning_rate": 8.509020669689717e-07, |
|
"loss": 1.9236, |
|
"step": 9910 |
|
}, |
|
{ |
|
"epoch": 3.3750106319639364, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 8.419050629635849e-07, |
|
"loss": 1.9387, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 3.3784128604235777, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 8.329526988149661e-07, |
|
"loss": 1.9503, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 3.3818150888832186, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 8.240450424745993e-07, |
|
"loss": 1.9232, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 3.3852173173428595, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 8.151821615546263e-07, |
|
"loss": 1.9435, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 3.388619545802501, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 8.063641233273221e-07, |
|
"loss": 1.9005, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 3.3920217742621417, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 7.975909947245956e-07, |
|
"loss": 1.864, |
|
"step": 9970 |
|
}, |
|
{ |
|
"epoch": 3.3954240027217826, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 7.888628423374738e-07, |
|
"loss": 1.9707, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 3.398826231181424, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 7.801797324156009e-07, |
|
"loss": 1.9314, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 3.402228459641065, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 7.715417308667326e-07, |
|
"loss": 1.9229, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 3.405630688100706, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 7.629489032562336e-07, |
|
"loss": 1.86, |
|
"step": 10010 |
|
}, |
|
{ |
|
"epoch": 3.409032916560347, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 7.544013148065898e-07, |
|
"loss": 1.9123, |
|
"step": 10020 |
|
}, |
|
{ |
|
"epoch": 3.412435145019988, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 7.45899030396898e-07, |
|
"loss": 1.8735, |
|
"step": 10030 |
|
}, |
|
{ |
|
"epoch": 3.4158373734796292, |
|
"grad_norm": 2.375, |
|
"learning_rate": 7.374421145623891e-07, |
|
"loss": 1.9386, |
|
"step": 10040 |
|
}, |
|
{ |
|
"epoch": 3.41923960193927, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 7.290306314939283e-07, |
|
"loss": 1.8794, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 3.4226418303989115, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 7.206646450375306e-07, |
|
"loss": 1.9236, |
|
"step": 10060 |
|
}, |
|
{ |
|
"epoch": 3.4260440588585523, |
|
"grad_norm": 2.25, |
|
"learning_rate": 7.123442186938769e-07, |
|
"loss": 1.9224, |
|
"step": 10070 |
|
}, |
|
{ |
|
"epoch": 3.4294462873181932, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 7.040694156178301e-07, |
|
"loss": 1.9089, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 3.4328485157778346, |
|
"grad_norm": 2.125, |
|
"learning_rate": 6.958402986179579e-07, |
|
"loss": 1.9395, |
|
"step": 10090 |
|
}, |
|
{ |
|
"epoch": 3.4362507442374755, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 6.87656930156057e-07, |
|
"loss": 1.9217, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 3.439652972697117, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 6.795193723466726e-07, |
|
"loss": 1.9458, |
|
"step": 10110 |
|
}, |
|
{ |
|
"epoch": 3.4430552011567577, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 6.714276869566347e-07, |
|
"loss": 1.9698, |
|
"step": 10120 |
|
}, |
|
{ |
|
"epoch": 3.4464574296163986, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 6.633819354045855e-07, |
|
"loss": 1.9773, |
|
"step": 10130 |
|
}, |
|
{ |
|
"epoch": 3.44985965807604, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 6.553821787605149e-07, |
|
"loss": 1.8458, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 3.453261886535681, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 6.474284777452948e-07, |
|
"loss": 1.9633, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 3.456664114995322, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 6.395208927302167e-07, |
|
"loss": 1.9253, |
|
"step": 10160 |
|
}, |
|
{ |
|
"epoch": 3.460066343454963, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 6.31659483736541e-07, |
|
"loss": 1.8867, |
|
"step": 10170 |
|
}, |
|
{ |
|
"epoch": 3.463468571914604, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 6.238443104350302e-07, |
|
"loss": 1.9415, |
|
"step": 10180 |
|
}, |
|
{ |
|
"epoch": 3.466870800374245, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 6.160754321455092e-07, |
|
"loss": 1.8688, |
|
"step": 10190 |
|
}, |
|
{ |
|
"epoch": 3.470273028833886, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 6.083529078364046e-07, |
|
"loss": 1.8777, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 3.4736752572935274, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 6.006767961242978e-07, |
|
"loss": 1.8808, |
|
"step": 10210 |
|
}, |
|
{ |
|
"epoch": 3.4770774857531683, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 5.930471552734888e-07, |
|
"loss": 1.9203, |
|
"step": 10220 |
|
}, |
|
{ |
|
"epoch": 3.480479714212809, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 5.854640431955407e-07, |
|
"loss": 1.9427, |
|
"step": 10230 |
|
}, |
|
{ |
|
"epoch": 3.4838819426724505, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 5.779275174488542e-07, |
|
"loss": 1.9229, |
|
"step": 10240 |
|
}, |
|
{ |
|
"epoch": 3.4872841711320914, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 5.704376352382198e-07, |
|
"loss": 1.8909, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 3.4906863995917328, |
|
"grad_norm": 2.25, |
|
"learning_rate": 5.629944534143905e-07, |
|
"loss": 1.9481, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 3.4940886280513737, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 5.555980284736454e-07, |
|
"loss": 1.9152, |
|
"step": 10270 |
|
}, |
|
{ |
|
"epoch": 3.4974908565110145, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 5.482484165573627e-07, |
|
"loss": 1.9002, |
|
"step": 10280 |
|
}, |
|
{ |
|
"epoch": 3.500893084970656, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 5.409456734515961e-07, |
|
"loss": 1.9427, |
|
"step": 10290 |
|
}, |
|
{ |
|
"epoch": 3.5042953134302968, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 5.336898545866455e-07, |
|
"loss": 1.9312, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 3.5076975418899377, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 5.264810150366431e-07, |
|
"loss": 1.9146, |
|
"step": 10310 |
|
}, |
|
{ |
|
"epoch": 3.511099770349579, |
|
"grad_norm": 2.625, |
|
"learning_rate": 5.193192095191315e-07, |
|
"loss": 1.932, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 3.51450199880922, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 5.122044923946488e-07, |
|
"loss": 1.9544, |
|
"step": 10330 |
|
}, |
|
{ |
|
"epoch": 3.517904227268861, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 5.051369176663161e-07, |
|
"loss": 1.9132, |
|
"step": 10340 |
|
}, |
|
{ |
|
"epoch": 3.521306455728502, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 4.981165389794265e-07, |
|
"loss": 1.9379, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 3.524708684188143, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 4.911434096210408e-07, |
|
"loss": 1.8495, |
|
"step": 10360 |
|
}, |
|
{ |
|
"epoch": 3.5281109126477843, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 4.842175825195817e-07, |
|
"loss": 1.964, |
|
"step": 10370 |
|
}, |
|
{ |
|
"epoch": 3.531513141107425, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 4.773391102444278e-07, |
|
"loss": 1.8755, |
|
"step": 10380 |
|
}, |
|
{ |
|
"epoch": 3.5349153695670665, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 4.705080450055242e-07, |
|
"loss": 1.902, |
|
"step": 10390 |
|
}, |
|
{ |
|
"epoch": 3.5383175980267074, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 4.63724438652977e-07, |
|
"loss": 1.9428, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 3.5417198264863483, |
|
"grad_norm": 2.125, |
|
"learning_rate": 4.5698834267666295e-07, |
|
"loss": 1.8812, |
|
"step": 10410 |
|
}, |
|
{ |
|
"epoch": 3.5451220549459896, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 4.502998082058419e-07, |
|
"loss": 1.9378, |
|
"step": 10420 |
|
}, |
|
{ |
|
"epoch": 3.5485242834056305, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 4.4365888600876105e-07, |
|
"loss": 1.8586, |
|
"step": 10430 |
|
}, |
|
{ |
|
"epoch": 3.551926511865272, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.3706562649227966e-07, |
|
"loss": 1.9303, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 3.5553287403249128, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 4.305200797014755e-07, |
|
"loss": 1.8785, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 3.5587309687845536, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 4.2402229531927284e-07, |
|
"loss": 1.8698, |
|
"step": 10460 |
|
}, |
|
{ |
|
"epoch": 3.562133197244195, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 4.1757232266606775e-07, |
|
"loss": 1.9134, |
|
"step": 10470 |
|
}, |
|
{ |
|
"epoch": 3.565535425703836, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.1117021069934086e-07, |
|
"loss": 1.9092, |
|
"step": 10480 |
|
}, |
|
{ |
|
"epoch": 3.568937654163477, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 4.048160080133004e-07, |
|
"loss": 1.8521, |
|
"step": 10490 |
|
}, |
|
{ |
|
"epoch": 3.572339882623118, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 3.985097628385017e-07, |
|
"loss": 1.9322, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 3.575742111082759, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 3.9225152304149186e-07, |
|
"loss": 1.95, |
|
"step": 10510 |
|
}, |
|
{ |
|
"epoch": 3.5791443395424003, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 3.8604133612443344e-07, |
|
"loss": 1.8966, |
|
"step": 10520 |
|
}, |
|
{ |
|
"epoch": 3.582546568002041, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 3.798792492247598e-07, |
|
"loss": 1.8615, |
|
"step": 10530 |
|
}, |
|
{ |
|
"epoch": 3.5859487964616825, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 3.737653091148046e-07, |
|
"loss": 1.9687, |
|
"step": 10540 |
|
}, |
|
{ |
|
"epoch": 3.5893510249213234, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 3.6769956220144835e-07, |
|
"loss": 1.9133, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 3.5927532533809643, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 3.61682054525775e-07, |
|
"loss": 1.9313, |
|
"step": 10560 |
|
}, |
|
{ |
|
"epoch": 3.5961554818406056, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 3.5571283176270955e-07, |
|
"loss": 2.0094, |
|
"step": 10570 |
|
}, |
|
{ |
|
"epoch": 3.5995577103002465, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 3.4979193922068417e-07, |
|
"loss": 1.9955, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 3.602959938759888, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 3.439194218412834e-07, |
|
"loss": 1.9294, |
|
"step": 10590 |
|
}, |
|
{ |
|
"epoch": 3.6063621672195287, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 3.380953241989119e-07, |
|
"loss": 1.8658, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 3.6097643956791696, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 3.3231969050044987e-07, |
|
"loss": 1.9264, |
|
"step": 10610 |
|
}, |
|
{ |
|
"epoch": 3.613166624138811, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 3.2659256458491855e-07, |
|
"loss": 1.9539, |
|
"step": 10620 |
|
}, |
|
{ |
|
"epoch": 3.616568852598452, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 3.209139899231508e-07, |
|
"loss": 1.9833, |
|
"step": 10630 |
|
}, |
|
{ |
|
"epoch": 3.619971081058093, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 3.1528400961745953e-07, |
|
"loss": 1.9088, |
|
"step": 10640 |
|
}, |
|
{ |
|
"epoch": 3.623373309517734, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 3.0970266640130633e-07, |
|
"loss": 1.9261, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 3.626775537977375, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 3.0417000263898494e-07, |
|
"loss": 1.8439, |
|
"step": 10660 |
|
}, |
|
{ |
|
"epoch": 3.6301777664370163, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 2.9868606032529224e-07, |
|
"loss": 1.9474, |
|
"step": 10670 |
|
}, |
|
{ |
|
"epoch": 3.633579994896657, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 2.932508810852159e-07, |
|
"loss": 1.9432, |
|
"step": 10680 |
|
}, |
|
{ |
|
"epoch": 3.6369822233562985, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 2.8786450617361245e-07, |
|
"loss": 1.8769, |
|
"step": 10690 |
|
}, |
|
{ |
|
"epoch": 3.6403844518159394, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.825269764748977e-07, |
|
"loss": 1.9754, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 3.6437866802755803, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 2.772383325027377e-07, |
|
"loss": 1.9327, |
|
"step": 10710 |
|
}, |
|
{ |
|
"epoch": 3.6471889087352216, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 2.719986143997357e-07, |
|
"loss": 1.916, |
|
"step": 10720 |
|
}, |
|
{ |
|
"epoch": 3.6505911371948625, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 2.668078619371333e-07, |
|
"loss": 1.8941, |
|
"step": 10730 |
|
}, |
|
{ |
|
"epoch": 3.653993365654504, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 2.616661145145063e-07, |
|
"loss": 1.9525, |
|
"step": 10740 |
|
}, |
|
{ |
|
"epoch": 3.6573955941141447, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 2.5657341115946487e-07, |
|
"loss": 1.8995, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 3.6607978225737856, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 2.5152979052736e-07, |
|
"loss": 1.9815, |
|
"step": 10760 |
|
}, |
|
{ |
|
"epoch": 3.664200051033427, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 2.46535290900983e-07, |
|
"loss": 1.8823, |
|
"step": 10770 |
|
}, |
|
{ |
|
"epoch": 3.667602279493068, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 2.4158995019028676e-07, |
|
"loss": 1.9158, |
|
"step": 10780 |
|
}, |
|
{ |
|
"epoch": 3.671004507952709, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 2.3669380593208516e-07, |
|
"loss": 1.8857, |
|
"step": 10790 |
|
}, |
|
{ |
|
"epoch": 3.67440673641235, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.3184689528977832e-07, |
|
"loss": 1.8922, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 3.677808964871991, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 2.270492550530667e-07, |
|
"loss": 1.9044, |
|
"step": 10810 |
|
}, |
|
{ |
|
"epoch": 3.6812111933316323, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 2.2230092163766907e-07, |
|
"loss": 1.9365, |
|
"step": 10820 |
|
}, |
|
{ |
|
"epoch": 3.684613421791273, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 2.1760193108504913e-07, |
|
"loss": 1.894, |
|
"step": 10830 |
|
}, |
|
{ |
|
"epoch": 3.6880156502509145, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 2.1295231906214332e-07, |
|
"loss": 1.9366, |
|
"step": 10840 |
|
}, |
|
{ |
|
"epoch": 3.6914178787105554, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 2.0835212086108594e-07, |
|
"loss": 1.9098, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 3.6948201071701963, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 2.038013713989457e-07, |
|
"loss": 1.9487, |
|
"step": 10860 |
|
}, |
|
{ |
|
"epoch": 3.6982223356298376, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.9930010521745713e-07, |
|
"loss": 1.8716, |
|
"step": 10870 |
|
}, |
|
{ |
|
"epoch": 3.7016245640894785, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 1.9484835648276147e-07, |
|
"loss": 1.8958, |
|
"step": 10880 |
|
}, |
|
{ |
|
"epoch": 3.70502679254912, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.904461589851424e-07, |
|
"loss": 1.8943, |
|
"step": 10890 |
|
}, |
|
{ |
|
"epoch": 3.7084290210087607, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.8609354613877697e-07, |
|
"loss": 1.8747, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 3.7118312494684016, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.817905509814755e-07, |
|
"loss": 1.9229, |
|
"step": 10910 |
|
}, |
|
{ |
|
"epoch": 3.715233477928043, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.7753720617443335e-07, |
|
"loss": 1.9303, |
|
"step": 10920 |
|
}, |
|
{ |
|
"epoch": 3.718635706387684, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.7333354400198364e-07, |
|
"loss": 1.9388, |
|
"step": 10930 |
|
}, |
|
{ |
|
"epoch": 3.722037934847325, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.691795963713496e-07, |
|
"loss": 1.892, |
|
"step": 10940 |
|
}, |
|
{ |
|
"epoch": 3.725440163306966, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.6507539481240707e-07, |
|
"loss": 1.9215, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 3.728842391766607, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.6102097047744054e-07, |
|
"loss": 1.9803, |
|
"step": 10960 |
|
}, |
|
{ |
|
"epoch": 3.7322446202262483, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.5701635414090798e-07, |
|
"loss": 1.9324, |
|
"step": 10970 |
|
}, |
|
{ |
|
"epoch": 3.735646848685889, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.530615761992094e-07, |
|
"loss": 1.8066, |
|
"step": 10980 |
|
}, |
|
{ |
|
"epoch": 3.7390490771455305, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.4915666667045188e-07, |
|
"loss": 1.8818, |
|
"step": 10990 |
|
}, |
|
{ |
|
"epoch": 3.7424513056051714, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.4530165519422625e-07, |
|
"loss": 1.9121, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 3.7458535340648123, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.4149657103138097e-07, |
|
"loss": 1.9224, |
|
"step": 11010 |
|
}, |
|
{ |
|
"epoch": 3.7492557625244536, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.377414430637975e-07, |
|
"loss": 1.9537, |
|
"step": 11020 |
|
}, |
|
{ |
|
"epoch": 3.7526579909840945, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.3403629979417308e-07, |
|
"loss": 1.9439, |
|
"step": 11030 |
|
}, |
|
{ |
|
"epoch": 3.756060219443736, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.303811693458042e-07, |
|
"loss": 1.9555, |
|
"step": 11040 |
|
}, |
|
{ |
|
"epoch": 3.7594624479033767, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.2677607946237328e-07, |
|
"loss": 1.9296, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 3.7628646763630176, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.2322105750773803e-07, |
|
"loss": 1.9048, |
|
"step": 11060 |
|
}, |
|
{ |
|
"epoch": 3.766266904822659, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.1971613046572323e-07, |
|
"loss": 1.9255, |
|
"step": 11070 |
|
}, |
|
{ |
|
"epoch": 3.7696691332823, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.1626132493991633e-07, |
|
"loss": 1.9011, |
|
"step": 11080 |
|
}, |
|
{ |
|
"epoch": 3.773071361741941, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.1285666715346502e-07, |
|
"loss": 1.8918, |
|
"step": 11090 |
|
}, |
|
{ |
|
"epoch": 3.776473590201582, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.0950218294888028e-07, |
|
"loss": 1.84, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 3.779875818661223, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.0619789778783557e-07, |
|
"loss": 1.979, |
|
"step": 11110 |
|
}, |
|
{ |
|
"epoch": 3.7832780471208642, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.0294383675097872e-07, |
|
"loss": 1.9141, |
|
"step": 11120 |
|
}, |
|
{ |
|
"epoch": 3.786680275580505, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 9.974002453774011e-08, |
|
"loss": 1.98, |
|
"step": 11130 |
|
}, |
|
{ |
|
"epoch": 3.7900825040401465, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 9.658648546614084e-08, |
|
"loss": 1.9723, |
|
"step": 11140 |
|
}, |
|
{ |
|
"epoch": 3.7934847324997873, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 9.348324347261734e-08, |
|
"loss": 1.8887, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 3.7968869609594282, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 9.04303221118288e-08, |
|
"loss": 1.8763, |
|
"step": 11160 |
|
}, |
|
{ |
|
"epoch": 3.8002891894190696, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 8.742774455648695e-08, |
|
"loss": 1.9326, |
|
"step": 11170 |
|
}, |
|
{ |
|
"epoch": 3.8036914178787105, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 8.447553359717545e-08, |
|
"loss": 1.8815, |
|
"step": 11180 |
|
}, |
|
{ |
|
"epoch": 3.807093646338352, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 8.157371164217902e-08, |
|
"loss": 1.971, |
|
"step": 11190 |
|
}, |
|
{ |
|
"epoch": 3.8104958747979927, |
|
"grad_norm": 2.375, |
|
"learning_rate": 7.872230071731239e-08, |
|
"loss": 1.9483, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 3.8138981032576336, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 7.592132246575323e-08, |
|
"loss": 1.9457, |
|
"step": 11210 |
|
}, |
|
{ |
|
"epoch": 3.817300331717275, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 7.317079814787934e-08, |
|
"loss": 1.9193, |
|
"step": 11220 |
|
}, |
|
{ |
|
"epoch": 3.820702560176916, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 7.047074864110375e-08, |
|
"loss": 1.9131, |
|
"step": 11230 |
|
}, |
|
{ |
|
"epoch": 3.824104788636557, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 6.782119443972094e-08, |
|
"loss": 1.9334, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 3.827507017096198, |
|
"grad_norm": 2.625, |
|
"learning_rate": 6.522215565474712e-08, |
|
"loss": 1.958, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 3.830909245555839, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 6.267365201377092e-08, |
|
"loss": 1.9266, |
|
"step": 11260 |
|
}, |
|
{ |
|
"epoch": 3.8343114740154802, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 6.017570286079965e-08, |
|
"loss": 1.9022, |
|
"step": 11270 |
|
}, |
|
{ |
|
"epoch": 3.837713702475121, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 5.77283271561175e-08, |
|
"loss": 1.8612, |
|
"step": 11280 |
|
}, |
|
{ |
|
"epoch": 3.8411159309347624, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 5.5331543476137706e-08, |
|
"loss": 1.9326, |
|
"step": 11290 |
|
}, |
|
{ |
|
"epoch": 3.8445181593944033, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 5.298537001326303e-08, |
|
"loss": 1.8951, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 3.847920387854044, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 5.068982457574685e-08, |
|
"loss": 1.9788, |
|
"step": 11310 |
|
}, |
|
{ |
|
"epoch": 3.8513226163136856, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 4.8444924587559654e-08, |
|
"loss": 1.9643, |
|
"step": 11320 |
|
}, |
|
{ |
|
"epoch": 3.8547248447733264, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 4.625068708825534e-08, |
|
"loss": 1.9245, |
|
"step": 11330 |
|
}, |
|
{ |
|
"epoch": 3.8581270732329678, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 4.4107128732841385e-08, |
|
"loss": 1.8401, |
|
"step": 11340 |
|
}, |
|
{ |
|
"epoch": 3.8615293016926087, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 4.20142657916557e-08, |
|
"loss": 1.9087, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 3.8649315301522496, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 3.99721141502382e-08, |
|
"loss": 1.9401, |
|
"step": 11360 |
|
}, |
|
{ |
|
"epoch": 3.868333758611891, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 3.798068930921441e-08, |
|
"loss": 1.9699, |
|
"step": 11370 |
|
}, |
|
{ |
|
"epoch": 3.8717359870715318, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 3.6040006384174545e-08, |
|
"loss": 1.954, |
|
"step": 11380 |
|
}, |
|
{ |
|
"epoch": 3.875138215531173, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 3.4150080105563755e-08, |
|
"loss": 1.8693, |
|
"step": 11390 |
|
}, |
|
{ |
|
"epoch": 3.878540443990814, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 3.231092481856271e-08, |
|
"loss": 1.9307, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 3.881942672450455, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 3.052255448298612e-08, |
|
"loss": 1.956, |
|
"step": 11410 |
|
}, |
|
{ |
|
"epoch": 3.885344900910096, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 2.878498267317298e-08, |
|
"loss": 1.9185, |
|
"step": 11420 |
|
}, |
|
{ |
|
"epoch": 3.888747129369737, |
|
"grad_norm": 2.5, |
|
"learning_rate": 2.7098222577882825e-08, |
|
"loss": 1.8685, |
|
"step": 11430 |
|
}, |
|
{ |
|
"epoch": 3.8921493578293784, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 2.5462287000197963e-08, |
|
"loss": 1.9734, |
|
"step": 11440 |
|
}, |
|
{ |
|
"epoch": 3.8955515862890193, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 2.3877188357427174e-08, |
|
"loss": 1.8995, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 3.89895381474866, |
|
"grad_norm": 2.25, |
|
"learning_rate": 2.2342938681005695e-08, |
|
"loss": 1.8764, |
|
"step": 11460 |
|
}, |
|
{ |
|
"epoch": 3.9023560432083015, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 2.085954961641164e-08, |
|
"loss": 1.8865, |
|
"step": 11470 |
|
}, |
|
{ |
|
"epoch": 3.9057582716679424, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.9427032423071165e-08, |
|
"loss": 1.8932, |
|
"step": 11480 |
|
}, |
|
{ |
|
"epoch": 3.9091605001275838, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.8045397974277166e-08, |
|
"loss": 1.9042, |
|
"step": 11490 |
|
}, |
|
{ |
|
"epoch": 3.9125627285872246, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.6714656757104883e-08, |
|
"loss": 1.94, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 3.9159649570468655, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.5434818872331314e-08, |
|
"loss": 1.8879, |
|
"step": 11510 |
|
}, |
|
{ |
|
"epoch": 3.919367185506507, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.4205894034362065e-08, |
|
"loss": 1.9147, |
|
"step": 11520 |
|
}, |
|
{ |
|
"epoch": 3.9227694139661478, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.3027891571153722e-08, |
|
"loss": 1.8714, |
|
"step": 11530 |
|
}, |
|
{ |
|
"epoch": 3.926171642425789, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.1900820424145176e-08, |
|
"loss": 1.9371, |
|
"step": 11540 |
|
}, |
|
{ |
|
"epoch": 3.92957387088543, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.0824689148190455e-08, |
|
"loss": 1.9505, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 3.932976099345071, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 9.799505911490794e-09, |
|
"loss": 1.8738, |
|
"step": 11560 |
|
}, |
|
{ |
|
"epoch": 3.936378327804712, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 8.825278495535672e-09, |
|
"loss": 1.8447, |
|
"step": 11570 |
|
}, |
|
{ |
|
"epoch": 3.939780556264353, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 7.902014295042352e-09, |
|
"loss": 1.8987, |
|
"step": 11580 |
|
}, |
|
{ |
|
"epoch": 3.9431827847239944, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 7.029720317899902e-09, |
|
"loss": 1.9864, |
|
"step": 11590 |
|
}, |
|
{ |
|
"epoch": 3.9465850131836353, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 6.20840318511545e-09, |
|
"loss": 1.9454, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 3.949987241643276, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 5.438069130766418e-09, |
|
"loss": 1.9871, |
|
"step": 11610 |
|
}, |
|
{ |
|
"epoch": 3.9533894701029175, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 4.718724001949017e-09, |
|
"loss": 1.8746, |
|
"step": 11620 |
|
}, |
|
{ |
|
"epoch": 3.9567916985625584, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 4.050373258737196e-09, |
|
"loss": 1.9578, |
|
"step": 11630 |
|
}, |
|
{ |
|
"epoch": 3.9601939270221997, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 3.4330219741408427e-09, |
|
"loss": 1.9242, |
|
"step": 11640 |
|
}, |
|
{ |
|
"epoch": 3.9635961554818406, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 2.8666748340662245e-09, |
|
"loss": 1.9133, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 3.9669983839414815, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 2.351336137279413e-09, |
|
"loss": 1.9196, |
|
"step": 11660 |
|
}, |
|
{ |
|
"epoch": 3.970400612401123, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.887009795377922e-09, |
|
"loss": 1.9906, |
|
"step": 11670 |
|
}, |
|
{ |
|
"epoch": 3.9738028408607637, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.473699332754879e-09, |
|
"loss": 1.8989, |
|
"step": 11680 |
|
}, |
|
{ |
|
"epoch": 3.977205069320405, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.1114078865781264e-09, |
|
"loss": 1.8962, |
|
"step": 11690 |
|
}, |
|
{ |
|
"epoch": 3.980607297780046, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 8.001382067626036e-10, |
|
"loss": 1.944, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 3.984009526239687, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 5.398926559516878e-10, |
|
"loss": 1.8959, |
|
"step": 11710 |
|
}, |
|
{ |
|
"epoch": 3.987411754699328, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 3.306732094962939e-10, |
|
"loss": 1.9388, |
|
"step": 11720 |
|
}, |
|
{ |
|
"epoch": 3.990813983158969, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.7248145544367861e-10, |
|
"loss": 1.9133, |
|
"step": 11730 |
|
}, |
|
{ |
|
"epoch": 3.9942162116186104, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 6.531859452325864e-11, |
|
"loss": 1.957, |
|
"step": 11740 |
|
}, |
|
{ |
|
"epoch": 3.9976184400782513, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 9.185440136907336e-12, |
|
"loss": 1.9494, |
|
"step": 11750 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 11756, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0768921731962634e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |