DeepCNCF2BAdapter / trainer_state.json
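The JSON below is the checkpoint log written by the Hugging Face Trainer; each entry in "log_history" records step, epoch, loss, learning_rate, and grad_norm. A minimal sketch for inspecting it, assuming the file is saved locally as trainer_state.json (path is hypothetical):

    import json

    # Load the Trainer state; layout assumed to match the fields shown below.
    with open("trainer_state.json") as f:
        state = json.load(f)

    print("global_step:", state["global_step"])
    print("epoch:", state["epoch"])

    # Collect (step, loss) pairs from the logged training history.
    losses = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
    print("first logged loss:", losses[0])
    print("last logged loss:", losses[-1])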
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.9996597771540356,
"eval_steps": 500,
"global_step": 11756,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003402228459641065,
"grad_norm": 7.6875,
"learning_rate": 3.809091090277921e-07,
"loss": 4.24,
"step": 10
},
{
"epoch": 0.00680445691928213,
"grad_norm": 6.46875,
"learning_rate": 7.618182180555842e-07,
"loss": 4.4323,
"step": 20
},
{
"epoch": 0.010206685378923195,
"grad_norm": 8.4375,
"learning_rate": 1.1427273270833762e-06,
"loss": 4.2758,
"step": 30
},
{
"epoch": 0.01360891383856426,
"grad_norm": 7.53125,
"learning_rate": 1.5236364361111684e-06,
"loss": 4.1231,
"step": 40
},
{
"epoch": 0.017011142298205325,
"grad_norm": 5.90625,
"learning_rate": 1.9045455451389605e-06,
"loss": 4.097,
"step": 50
},
{
"epoch": 0.02041337075784639,
"grad_norm": 5.15625,
"learning_rate": 2.2854546541667524e-06,
"loss": 4.0712,
"step": 60
},
{
"epoch": 0.023815599217487455,
"grad_norm": 4.5625,
"learning_rate": 2.6663637631945448e-06,
"loss": 3.8851,
"step": 70
},
{
"epoch": 0.02721782767712852,
"grad_norm": 6.78125,
"learning_rate": 3.0472728722223367e-06,
"loss": 3.6937,
"step": 80
},
{
"epoch": 0.030620056136769585,
"grad_norm": 8.25,
"learning_rate": 3.4281819812501286e-06,
"loss": 3.6468,
"step": 90
},
{
"epoch": 0.03402228459641065,
"grad_norm": 9.625,
"learning_rate": 3.809091090277921e-06,
"loss": 3.4787,
"step": 100
},
{
"epoch": 0.03742451305605171,
"grad_norm": 7.53125,
"learning_rate": 4.190000199305713e-06,
"loss": 3.3235,
"step": 110
},
{
"epoch": 0.04082674151569278,
"grad_norm": 9.4375,
"learning_rate": 4.570909308333505e-06,
"loss": 3.2806,
"step": 120
},
{
"epoch": 0.04422896997533384,
"grad_norm": 10.3125,
"learning_rate": 4.951818417361297e-06,
"loss": 3.0432,
"step": 130
},
{
"epoch": 0.04763119843497491,
"grad_norm": 5.84375,
"learning_rate": 5.3327275263890896e-06,
"loss": 2.8991,
"step": 140
},
{
"epoch": 0.05103342689461597,
"grad_norm": 4.1875,
"learning_rate": 5.7136366354168815e-06,
"loss": 2.8202,
"step": 150
},
{
"epoch": 0.05443565535425704,
"grad_norm": 1.828125,
"learning_rate": 6.094545744444673e-06,
"loss": 2.6361,
"step": 160
},
{
"epoch": 0.0578378838138981,
"grad_norm": 1.8359375,
"learning_rate": 6.475454853472465e-06,
"loss": 2.5525,
"step": 170
},
{
"epoch": 0.06124011227353917,
"grad_norm": 1.765625,
"learning_rate": 6.856363962500257e-06,
"loss": 2.5685,
"step": 180
},
{
"epoch": 0.06464234073318023,
"grad_norm": 2.125,
"learning_rate": 7.237273071528049e-06,
"loss": 2.5133,
"step": 190
},
{
"epoch": 0.0680445691928213,
"grad_norm": 1.71875,
"learning_rate": 7.618182180555842e-06,
"loss": 2.4096,
"step": 200
},
{
"epoch": 0.07144679765246237,
"grad_norm": 1.9140625,
"learning_rate": 7.999091289583632e-06,
"loss": 2.4864,
"step": 210
},
{
"epoch": 0.07484902611210342,
"grad_norm": 1.9765625,
"learning_rate": 8.380000398611426e-06,
"loss": 2.4321,
"step": 220
},
{
"epoch": 0.07825125457174449,
"grad_norm": 2.3125,
"learning_rate": 8.760909507639218e-06,
"loss": 2.3582,
"step": 230
},
{
"epoch": 0.08165348303138556,
"grad_norm": 2.3125,
"learning_rate": 9.14181861666701e-06,
"loss": 2.3401,
"step": 240
},
{
"epoch": 0.08505571149102663,
"grad_norm": 2.625,
"learning_rate": 9.522727725694802e-06,
"loss": 2.3312,
"step": 250
},
{
"epoch": 0.08845793995066768,
"grad_norm": 1.9609375,
"learning_rate": 9.903636834722594e-06,
"loss": 2.3672,
"step": 260
},
{
"epoch": 0.09186016841030875,
"grad_norm": 1.453125,
"learning_rate": 1.0284545943750385e-05,
"loss": 2.3025,
"step": 270
},
{
"epoch": 0.09526239686994982,
"grad_norm": 1.46875,
"learning_rate": 1.0665455052778179e-05,
"loss": 2.3273,
"step": 280
},
{
"epoch": 0.09866462532959089,
"grad_norm": 2.25,
"learning_rate": 1.104636416180597e-05,
"loss": 2.2746,
"step": 290
},
{
"epoch": 0.10206685378923194,
"grad_norm": 1.5859375,
"learning_rate": 1.1427273270833763e-05,
"loss": 2.3196,
"step": 300
},
{
"epoch": 0.10546908224887301,
"grad_norm": 1.5078125,
"learning_rate": 1.1808182379861553e-05,
"loss": 2.2645,
"step": 310
},
{
"epoch": 0.10887131070851408,
"grad_norm": 1.6640625,
"learning_rate": 1.2189091488889347e-05,
"loss": 2.2902,
"step": 320
},
{
"epoch": 0.11227353916815515,
"grad_norm": 1.5859375,
"learning_rate": 1.2570000597917139e-05,
"loss": 2.2503,
"step": 330
},
{
"epoch": 0.1156757676277962,
"grad_norm": 1.5,
"learning_rate": 1.295090970694493e-05,
"loss": 2.1882,
"step": 340
},
{
"epoch": 0.11907799608743727,
"grad_norm": 1.359375,
"learning_rate": 1.3331818815972723e-05,
"loss": 2.2266,
"step": 350
},
{
"epoch": 0.12248022454707834,
"grad_norm": 1.8125,
"learning_rate": 1.344607904627746e-05,
"loss": 2.2011,
"step": 360
},
{
"epoch": 0.1258824530067194,
"grad_norm": 1.4765625,
"learning_rate": 1.3446017810126854e-05,
"loss": 2.1828,
"step": 370
},
{
"epoch": 0.12928468146636046,
"grad_norm": 1.5234375,
"learning_rate": 1.3445905544333626e-05,
"loss": 2.2727,
"step": 380
},
{
"epoch": 0.13268690992600152,
"grad_norm": 1.6328125,
"learning_rate": 1.344574224974991e-05,
"loss": 2.2222,
"step": 390
},
{
"epoch": 0.1360891383856426,
"grad_norm": 1.59375,
"learning_rate": 1.3445527927615165e-05,
"loss": 2.2107,
"step": 400
},
{
"epoch": 0.13949136684528365,
"grad_norm": 1.515625,
"learning_rate": 1.3445262579556173e-05,
"loss": 2.1671,
"step": 410
},
{
"epoch": 0.14289359530492474,
"grad_norm": 1.3671875,
"learning_rate": 1.3444946207587011e-05,
"loss": 2.1878,
"step": 420
},
{
"epoch": 0.1462958237645658,
"grad_norm": 1.4453125,
"learning_rate": 1.3444578814109056e-05,
"loss": 2.1358,
"step": 430
},
{
"epoch": 0.14969805222420685,
"grad_norm": 1.734375,
"learning_rate": 1.3444160401910943e-05,
"loss": 2.1564,
"step": 440
},
{
"epoch": 0.15310028068384793,
"grad_norm": 1.4765625,
"learning_rate": 1.3443690974168565e-05,
"loss": 2.1756,
"step": 450
},
{
"epoch": 0.15650250914348898,
"grad_norm": 1.546875,
"learning_rate": 1.344317053444504e-05,
"loss": 2.1606,
"step": 460
},
{
"epoch": 0.15990473760313004,
"grad_norm": 1.78125,
"learning_rate": 1.344259908669068e-05,
"loss": 2.2352,
"step": 470
},
{
"epoch": 0.16330696606277112,
"grad_norm": 1.5078125,
"learning_rate": 1.3441976635242969e-05,
"loss": 2.1258,
"step": 480
},
{
"epoch": 0.16670919452241217,
"grad_norm": 1.6484375,
"learning_rate": 1.3441303184826526e-05,
"loss": 2.1533,
"step": 490
},
{
"epoch": 0.17011142298205326,
"grad_norm": 1.78125,
"learning_rate": 1.3440578740553065e-05,
"loss": 2.1179,
"step": 500
},
{
"epoch": 0.1735136514416943,
"grad_norm": 1.484375,
"learning_rate": 1.3439803307921367e-05,
"loss": 2.1868,
"step": 510
},
{
"epoch": 0.17691587990133537,
"grad_norm": 1.671875,
"learning_rate": 1.343897689281723e-05,
"loss": 2.1144,
"step": 520
},
{
"epoch": 0.18031810836097645,
"grad_norm": 1.5078125,
"learning_rate": 1.343809950151342e-05,
"loss": 2.1722,
"step": 530
},
{
"epoch": 0.1837203368206175,
"grad_norm": 1.6171875,
"learning_rate": 1.3437171140669643e-05,
"loss": 2.1725,
"step": 540
},
{
"epoch": 0.18712256528025856,
"grad_norm": 1.5234375,
"learning_rate": 1.3436191817332471e-05,
"loss": 2.1871,
"step": 550
},
{
"epoch": 0.19052479373989964,
"grad_norm": 1.7890625,
"learning_rate": 1.3435161538935297e-05,
"loss": 2.2134,
"step": 560
},
{
"epoch": 0.1939270221995407,
"grad_norm": 1.78125,
"learning_rate": 1.3434080313298288e-05,
"loss": 2.1545,
"step": 570
},
{
"epoch": 0.19732925065918178,
"grad_norm": 1.6328125,
"learning_rate": 1.3432948148628312e-05,
"loss": 2.1173,
"step": 580
},
{
"epoch": 0.20073147911882283,
"grad_norm": 1.640625,
"learning_rate": 1.3431765053518884e-05,
"loss": 2.1703,
"step": 590
},
{
"epoch": 0.20413370757846389,
"grad_norm": 1.6796875,
"learning_rate": 1.3430531036950099e-05,
"loss": 2.1662,
"step": 600
},
{
"epoch": 0.20753593603810497,
"grad_norm": 1.6171875,
"learning_rate": 1.3429246108288562e-05,
"loss": 2.153,
"step": 610
},
{
"epoch": 0.21093816449774602,
"grad_norm": 1.6328125,
"learning_rate": 1.3427910277287318e-05,
"loss": 2.1421,
"step": 620
},
{
"epoch": 0.21434039295738708,
"grad_norm": 1.4453125,
"learning_rate": 1.3426523554085776e-05,
"loss": 2.1315,
"step": 630
},
{
"epoch": 0.21774262141702816,
"grad_norm": 1.5703125,
"learning_rate": 1.342508594920964e-05,
"loss": 2.1187,
"step": 640
},
{
"epoch": 0.22114484987666921,
"grad_norm": 1.7578125,
"learning_rate": 1.342359747357082e-05,
"loss": 2.1447,
"step": 650
},
{
"epoch": 0.2245470783363103,
"grad_norm": 1.671875,
"learning_rate": 1.3422058138467349e-05,
"loss": 2.1614,
"step": 660
},
{
"epoch": 0.22794930679595135,
"grad_norm": 1.5390625,
"learning_rate": 1.3420467955583304e-05,
"loss": 2.1521,
"step": 670
},
{
"epoch": 0.2313515352555924,
"grad_norm": 1.6953125,
"learning_rate": 1.3418826936988714e-05,
"loss": 2.1474,
"step": 680
},
{
"epoch": 0.2347537637152335,
"grad_norm": 1.6484375,
"learning_rate": 1.3417135095139467e-05,
"loss": 2.1887,
"step": 690
},
{
"epoch": 0.23815599217487454,
"grad_norm": 1.71875,
"learning_rate": 1.341539244287722e-05,
"loss": 2.1432,
"step": 700
},
{
"epoch": 0.2415582206345156,
"grad_norm": 1.8046875,
"learning_rate": 1.3413598993429295e-05,
"loss": 2.1202,
"step": 710
},
{
"epoch": 0.24496044909415668,
"grad_norm": 1.7578125,
"learning_rate": 1.3411754760408584e-05,
"loss": 2.201,
"step": 720
},
{
"epoch": 0.24836267755379773,
"grad_norm": 1.5390625,
"learning_rate": 1.3409859757813437e-05,
"loss": 2.104,
"step": 730
},
{
"epoch": 0.2517649060134388,
"grad_norm": 1.703125,
"learning_rate": 1.3407914000027573e-05,
"loss": 2.1118,
"step": 740
},
{
"epoch": 0.25516713447307987,
"grad_norm": 1.5546875,
"learning_rate": 1.3405917501819956e-05,
"loss": 2.1533,
"step": 750
},
{
"epoch": 0.2585693629327209,
"grad_norm": 1.3828125,
"learning_rate": 1.340387027834468e-05,
"loss": 2.0738,
"step": 760
},
{
"epoch": 0.261971591392362,
"grad_norm": 1.625,
"learning_rate": 1.3401772345140874e-05,
"loss": 2.1696,
"step": 770
},
{
"epoch": 0.26537381985200303,
"grad_norm": 1.921875,
"learning_rate": 1.3399623718132557e-05,
"loss": 2.0847,
"step": 780
},
{
"epoch": 0.26877604831164414,
"grad_norm": 1.5390625,
"learning_rate": 1.3397424413628542e-05,
"loss": 2.1644,
"step": 790
},
{
"epoch": 0.2721782767712852,
"grad_norm": 1.640625,
"learning_rate": 1.3395174448322298e-05,
"loss": 2.0891,
"step": 800
},
{
"epoch": 0.27558050523092625,
"grad_norm": 1.9453125,
"learning_rate": 1.3392873839291825e-05,
"loss": 2.1638,
"step": 810
},
{
"epoch": 0.2789827336905673,
"grad_norm": 1.625,
"learning_rate": 1.339052260399953e-05,
"loss": 2.078,
"step": 820
},
{
"epoch": 0.28238496215020836,
"grad_norm": 1.7890625,
"learning_rate": 1.3388120760292085e-05,
"loss": 2.1191,
"step": 830
},
{
"epoch": 0.2857871906098495,
"grad_norm": 1.765625,
"learning_rate": 1.33856683264003e-05,
"loss": 2.0554,
"step": 840
},
{
"epoch": 0.2891894190694905,
"grad_norm": 1.8203125,
"learning_rate": 1.3383165320938983e-05,
"loss": 2.0385,
"step": 850
},
{
"epoch": 0.2925916475291316,
"grad_norm": 1.7109375,
"learning_rate": 1.3380611762906796e-05,
"loss": 2.1071,
"step": 860
},
{
"epoch": 0.29599387598877264,
"grad_norm": 1.6640625,
"learning_rate": 1.3378007671686113e-05,
"loss": 2.1171,
"step": 870
},
{
"epoch": 0.2993961044484137,
"grad_norm": 1.4609375,
"learning_rate": 1.337535306704287e-05,
"loss": 2.1264,
"step": 880
},
{
"epoch": 0.3027983329080548,
"grad_norm": 1.75,
"learning_rate": 1.337264796912642e-05,
"loss": 2.0562,
"step": 890
},
{
"epoch": 0.30620056136769586,
"grad_norm": 1.78125,
"learning_rate": 1.3369892398469373e-05,
"loss": 2.1343,
"step": 900
},
{
"epoch": 0.3096027898273369,
"grad_norm": 1.53125,
"learning_rate": 1.3367086375987447e-05,
"loss": 2.0563,
"step": 910
},
{
"epoch": 0.31300501828697797,
"grad_norm": 1.7578125,
"learning_rate": 1.3364229922979311e-05,
"loss": 2.1302,
"step": 920
},
{
"epoch": 0.316407246746619,
"grad_norm": 1.609375,
"learning_rate": 1.3361323061126409e-05,
"loss": 2.0733,
"step": 930
},
{
"epoch": 0.3198094752062601,
"grad_norm": 1.921875,
"learning_rate": 1.3358365812492812e-05,
"loss": 2.1027,
"step": 940
},
{
"epoch": 0.3232117036659012,
"grad_norm": 1.7265625,
"learning_rate": 1.3355358199525042e-05,
"loss": 2.0455,
"step": 950
},
{
"epoch": 0.32661393212554224,
"grad_norm": 1.6953125,
"learning_rate": 1.3352300245051904e-05,
"loss": 2.0785,
"step": 960
},
{
"epoch": 0.3300161605851833,
"grad_norm": 1.671875,
"learning_rate": 1.3349191972284314e-05,
"loss": 2.1594,
"step": 970
},
{
"epoch": 0.33341838904482435,
"grad_norm": 1.78125,
"learning_rate": 1.3346033404815114e-05,
"loss": 2.066,
"step": 980
},
{
"epoch": 0.3368206175044654,
"grad_norm": 1.59375,
"learning_rate": 1.3342824566618907e-05,
"loss": 2.1451,
"step": 990
},
{
"epoch": 0.3402228459641065,
"grad_norm": 1.6953125,
"learning_rate": 1.3339565482051866e-05,
"loss": 2.152,
"step": 1000
},
{
"epoch": 0.34362507442374757,
"grad_norm": 1.7109375,
"learning_rate": 1.3336256175851549e-05,
"loss": 2.1232,
"step": 1010
},
{
"epoch": 0.3470273028833886,
"grad_norm": 1.8828125,
"learning_rate": 1.3332896673136717e-05,
"loss": 2.1158,
"step": 1020
},
{
"epoch": 0.3504295313430297,
"grad_norm": 1.7421875,
"learning_rate": 1.3329486999407136e-05,
"loss": 2.102,
"step": 1030
},
{
"epoch": 0.35383175980267073,
"grad_norm": 1.8125,
"learning_rate": 1.3326027180543387e-05,
"loss": 2.1266,
"step": 1040
},
{
"epoch": 0.35723398826231184,
"grad_norm": 1.421875,
"learning_rate": 1.3322517242806673e-05,
"loss": 2.0884,
"step": 1050
},
{
"epoch": 0.3606362167219529,
"grad_norm": 1.5546875,
"learning_rate": 1.3318957212838615e-05,
"loss": 2.0793,
"step": 1060
},
{
"epoch": 0.36403844518159395,
"grad_norm": 1.78125,
"learning_rate": 1.3315347117661048e-05,
"loss": 2.0574,
"step": 1070
},
{
"epoch": 0.367440673641235,
"grad_norm": 1.6171875,
"learning_rate": 1.3311686984675822e-05,
"loss": 2.0716,
"step": 1080
},
{
"epoch": 0.37084290210087606,
"grad_norm": 1.8671875,
"learning_rate": 1.3307976841664591e-05,
"loss": 2.0523,
"step": 1090
},
{
"epoch": 0.3742451305605171,
"grad_norm": 1.703125,
"learning_rate": 1.33042167167886e-05,
"loss": 2.0203,
"step": 1100
},
{
"epoch": 0.3776473590201582,
"grad_norm": 1.546875,
"learning_rate": 1.330040663858848e-05,
"loss": 2.0823,
"step": 1110
},
{
"epoch": 0.3810495874797993,
"grad_norm": 1.796875,
"learning_rate": 1.3296546635984012e-05,
"loss": 2.0758,
"step": 1120
},
{
"epoch": 0.38445181593944033,
"grad_norm": 1.7421875,
"learning_rate": 1.3292636738273931e-05,
"loss": 2.1138,
"step": 1130
},
{
"epoch": 0.3878540443990814,
"grad_norm": 1.5,
"learning_rate": 1.3288676975135689e-05,
"loss": 2.0277,
"step": 1140
},
{
"epoch": 0.39125627285872244,
"grad_norm": 1.5703125,
"learning_rate": 1.3284667376625236e-05,
"loss": 2.042,
"step": 1150
},
{
"epoch": 0.39465850131836355,
"grad_norm": 1.8515625,
"learning_rate": 1.3280607973176785e-05,
"loss": 2.114,
"step": 1160
},
{
"epoch": 0.3980607297780046,
"grad_norm": 1.796875,
"learning_rate": 1.327649879560259e-05,
"loss": 2.0477,
"step": 1170
},
{
"epoch": 0.40146295823764566,
"grad_norm": 1.8046875,
"learning_rate": 1.3272339875092701e-05,
"loss": 2.0101,
"step": 1180
},
{
"epoch": 0.4048651866972867,
"grad_norm": 1.984375,
"learning_rate": 1.3268131243214744e-05,
"loss": 2.1261,
"step": 1190
},
{
"epoch": 0.40826741515692777,
"grad_norm": 1.9375,
"learning_rate": 1.326387293191366e-05,
"loss": 2.0788,
"step": 1200
},
{
"epoch": 0.4116696436165688,
"grad_norm": 1.78125,
"learning_rate": 1.325956497351148e-05,
"loss": 2.0694,
"step": 1210
},
{
"epoch": 0.41507187207620994,
"grad_norm": 1.9296875,
"learning_rate": 1.3255207400707076e-05,
"loss": 2.11,
"step": 1220
},
{
"epoch": 0.418474100535851,
"grad_norm": 1.796875,
"learning_rate": 1.3250800246575906e-05,
"loss": 2.0621,
"step": 1230
},
{
"epoch": 0.42187632899549204,
"grad_norm": 1.6875,
"learning_rate": 1.3246343544569764e-05,
"loss": 2.0923,
"step": 1240
},
{
"epoch": 0.4252785574551331,
"grad_norm": 1.6640625,
"learning_rate": 1.3241837328516535e-05,
"loss": 2.1005,
"step": 1250
},
{
"epoch": 0.42868078591477415,
"grad_norm": 1.953125,
"learning_rate": 1.323728163261993e-05,
"loss": 2.0634,
"step": 1260
},
{
"epoch": 0.43208301437441526,
"grad_norm": 1.859375,
"learning_rate": 1.323267649145923e-05,
"loss": 2.0635,
"step": 1270
},
{
"epoch": 0.4354852428340563,
"grad_norm": 1.640625,
"learning_rate": 1.3228021939989018e-05,
"loss": 2.131,
"step": 1280
},
{
"epoch": 0.4388874712936974,
"grad_norm": 1.7421875,
"learning_rate": 1.3223318013538927e-05,
"loss": 2.1021,
"step": 1290
},
{
"epoch": 0.44228969975333843,
"grad_norm": 1.734375,
"learning_rate": 1.3218564747813355e-05,
"loss": 2.0758,
"step": 1300
},
{
"epoch": 0.4456919282129795,
"grad_norm": 1.6953125,
"learning_rate": 1.3213762178891202e-05,
"loss": 2.0198,
"step": 1310
},
{
"epoch": 0.4490941566726206,
"grad_norm": 1.8515625,
"learning_rate": 1.3208910343225603e-05,
"loss": 2.1226,
"step": 1320
},
{
"epoch": 0.45249638513226165,
"grad_norm": 1.703125,
"learning_rate": 1.3204009277643636e-05,
"loss": 2.077,
"step": 1330
},
{
"epoch": 0.4558986135919027,
"grad_norm": 1.6953125,
"learning_rate": 1.3199059019346055e-05,
"loss": 2.1154,
"step": 1340
},
{
"epoch": 0.45930084205154376,
"grad_norm": 1.8984375,
"learning_rate": 1.3194059605907003e-05,
"loss": 2.1109,
"step": 1350
},
{
"epoch": 0.4627030705111848,
"grad_norm": 1.8203125,
"learning_rate": 1.318901107527373e-05,
"loss": 2.1108,
"step": 1360
},
{
"epoch": 0.46610529897082587,
"grad_norm": 2.09375,
"learning_rate": 1.3183913465766294e-05,
"loss": 2.1203,
"step": 1370
},
{
"epoch": 0.469507527430467,
"grad_norm": 1.8671875,
"learning_rate": 1.3178766816077288e-05,
"loss": 2.0667,
"step": 1380
},
{
"epoch": 0.47290975589010803,
"grad_norm": 1.8671875,
"learning_rate": 1.317357116527153e-05,
"loss": 2.0428,
"step": 1390
},
{
"epoch": 0.4763119843497491,
"grad_norm": 1.703125,
"learning_rate": 1.3168326552785775e-05,
"loss": 2.0836,
"step": 1400
},
{
"epoch": 0.47971421280939014,
"grad_norm": 1.6015625,
"learning_rate": 1.3163033018428418e-05,
"loss": 2.0031,
"step": 1410
},
{
"epoch": 0.4831164412690312,
"grad_norm": 2.0625,
"learning_rate": 1.315769060237918e-05,
"loss": 2.096,
"step": 1420
},
{
"epoch": 0.4865186697286723,
"grad_norm": 1.828125,
"learning_rate": 1.3152299345188815e-05,
"loss": 2.0325,
"step": 1430
},
{
"epoch": 0.48992089818831336,
"grad_norm": 1.65625,
"learning_rate": 1.3146859287778799e-05,
"loss": 2.0444,
"step": 1440
},
{
"epoch": 0.4933231266479544,
"grad_norm": 2.140625,
"learning_rate": 1.3141370471441016e-05,
"loss": 2.0971,
"step": 1450
},
{
"epoch": 0.49672535510759547,
"grad_norm": 2.0,
"learning_rate": 1.3135832937837444e-05,
"loss": 2.0014,
"step": 1460
},
{
"epoch": 0.5001275835672365,
"grad_norm": 1.6796875,
"learning_rate": 1.3130246728999852e-05,
"loss": 2.0086,
"step": 1470
},
{
"epoch": 0.5035298120268776,
"grad_norm": 1.78125,
"learning_rate": 1.3124611887329459e-05,
"loss": 2.0079,
"step": 1480
},
{
"epoch": 0.5069320404865186,
"grad_norm": 1.9296875,
"learning_rate": 1.3118928455596627e-05,
"loss": 2.0654,
"step": 1490
},
{
"epoch": 0.5103342689461597,
"grad_norm": 1.875,
"learning_rate": 1.3113196476940538e-05,
"loss": 2.0195,
"step": 1500
},
{
"epoch": 0.5137364974058009,
"grad_norm": 1.8203125,
"learning_rate": 1.3107415994868855e-05,
"loss": 2.0196,
"step": 1510
},
{
"epoch": 0.5171387258654419,
"grad_norm": 2.125,
"learning_rate": 1.3101587053257404e-05,
"loss": 2.0552,
"step": 1520
},
{
"epoch": 0.520540954325083,
"grad_norm": 1.734375,
"learning_rate": 1.3095709696349833e-05,
"loss": 2.0833,
"step": 1530
},
{
"epoch": 0.523943182784724,
"grad_norm": 1.765625,
"learning_rate": 1.3089783968757277e-05,
"loss": 2.1067,
"step": 1540
},
{
"epoch": 0.5273454112443651,
"grad_norm": 1.9921875,
"learning_rate": 1.308380991545802e-05,
"loss": 2.0313,
"step": 1550
},
{
"epoch": 0.5307476397040061,
"grad_norm": 1.9296875,
"learning_rate": 1.3077787581797163e-05,
"loss": 2.0918,
"step": 1560
},
{
"epoch": 0.5341498681636472,
"grad_norm": 1.609375,
"learning_rate": 1.3071717013486259e-05,
"loss": 2.0505,
"step": 1570
},
{
"epoch": 0.5375520966232883,
"grad_norm": 1.421875,
"learning_rate": 1.3065598256602989e-05,
"loss": 2.1166,
"step": 1580
},
{
"epoch": 0.5409543250829293,
"grad_norm": 1.6015625,
"learning_rate": 1.3059431357590797e-05,
"loss": 2.1196,
"step": 1590
},
{
"epoch": 0.5443565535425704,
"grad_norm": 1.765625,
"learning_rate": 1.3053216363258537e-05,
"loss": 2.0623,
"step": 1600
},
{
"epoch": 0.5477587820022114,
"grad_norm": 1.671875,
"learning_rate": 1.3046953320780136e-05,
"loss": 2.051,
"step": 1610
},
{
"epoch": 0.5511610104618525,
"grad_norm": 1.734375,
"learning_rate": 1.304064227769421e-05,
"loss": 2.0341,
"step": 1620
},
{
"epoch": 0.5545632389214936,
"grad_norm": 1.8671875,
"learning_rate": 1.3034283281903722e-05,
"loss": 2.001,
"step": 1630
},
{
"epoch": 0.5579654673811346,
"grad_norm": 2.125,
"learning_rate": 1.3027876381675611e-05,
"loss": 1.9871,
"step": 1640
},
{
"epoch": 0.5613676958407757,
"grad_norm": 1.8359375,
"learning_rate": 1.3021421625640427e-05,
"loss": 2.0712,
"step": 1650
},
{
"epoch": 0.5647699243004167,
"grad_norm": 1.8671875,
"learning_rate": 1.3014919062791965e-05,
"loss": 2.0444,
"step": 1660
},
{
"epoch": 0.5681721527600578,
"grad_norm": 1.9609375,
"learning_rate": 1.3008368742486882e-05,
"loss": 2.0598,
"step": 1670
},
{
"epoch": 0.571574381219699,
"grad_norm": 1.8828125,
"learning_rate": 1.300177071444434e-05,
"loss": 2.0744,
"step": 1680
},
{
"epoch": 0.57497660967934,
"grad_norm": 2.109375,
"learning_rate": 1.299512502874561e-05,
"loss": 1.9854,
"step": 1690
},
{
"epoch": 0.578378838138981,
"grad_norm": 2.0,
"learning_rate": 1.2988431735833709e-05,
"loss": 2.0348,
"step": 1700
},
{
"epoch": 0.581781066598622,
"grad_norm": 1.84375,
"learning_rate": 1.2981690886513001e-05,
"loss": 2.0189,
"step": 1710
},
{
"epoch": 0.5851832950582632,
"grad_norm": 1.875,
"learning_rate": 1.2974902531948826e-05,
"loss": 1.9997,
"step": 1720
},
{
"epoch": 0.5885855235179043,
"grad_norm": 1.6640625,
"learning_rate": 1.2968066723667104e-05,
"loss": 1.9861,
"step": 1730
},
{
"epoch": 0.5919877519775453,
"grad_norm": 1.796875,
"learning_rate": 1.2961183513553937e-05,
"loss": 2.0284,
"step": 1740
},
{
"epoch": 0.5953899804371864,
"grad_norm": 1.734375,
"learning_rate": 1.2954252953855236e-05,
"loss": 2.0376,
"step": 1750
},
{
"epoch": 0.5987922088968274,
"grad_norm": 1.7734375,
"learning_rate": 1.2947275097176301e-05,
"loss": 2.0059,
"step": 1760
},
{
"epoch": 0.6021944373564685,
"grad_norm": 2.09375,
"learning_rate": 1.2940249996481436e-05,
"loss": 2.0906,
"step": 1770
},
{
"epoch": 0.6055966658161096,
"grad_norm": 1.8359375,
"learning_rate": 1.2933177705093541e-05,
"loss": 2.0076,
"step": 1780
},
{
"epoch": 0.6089988942757506,
"grad_norm": 1.7265625,
"learning_rate": 1.2926058276693715e-05,
"loss": 2.0247,
"step": 1790
},
{
"epoch": 0.6124011227353917,
"grad_norm": 1.8359375,
"learning_rate": 1.2918891765320837e-05,
"loss": 2.113,
"step": 1800
},
{
"epoch": 0.6158033511950327,
"grad_norm": 1.671875,
"learning_rate": 1.2911678225371164e-05,
"loss": 2.0201,
"step": 1810
},
{
"epoch": 0.6192055796546738,
"grad_norm": 1.8828125,
"learning_rate": 1.2904417711597916e-05,
"loss": 2.0172,
"step": 1820
},
{
"epoch": 0.6226078081143149,
"grad_norm": 1.9609375,
"learning_rate": 1.289711027911086e-05,
"loss": 2.1396,
"step": 1830
},
{
"epoch": 0.6260100365739559,
"grad_norm": 1.75,
"learning_rate": 1.2889755983375892e-05,
"loss": 2.045,
"step": 1840
},
{
"epoch": 0.629412265033597,
"grad_norm": 1.9375,
"learning_rate": 1.2882354880214616e-05,
"loss": 2.012,
"step": 1850
},
{
"epoch": 0.632814493493238,
"grad_norm": 1.8671875,
"learning_rate": 1.2874907025803922e-05,
"loss": 2.058,
"step": 1860
},
{
"epoch": 0.6362167219528791,
"grad_norm": 1.8359375,
"learning_rate": 1.2867412476675554e-05,
"loss": 2.0796,
"step": 1870
},
{
"epoch": 0.6396189504125201,
"grad_norm": 1.8671875,
"learning_rate": 1.2859871289715688e-05,
"loss": 2.0956,
"step": 1880
},
{
"epoch": 0.6430211788721613,
"grad_norm": 1.7421875,
"learning_rate": 1.2852283522164496e-05,
"loss": 1.983,
"step": 1890
},
{
"epoch": 0.6464234073318024,
"grad_norm": 1.921875,
"learning_rate": 1.2844649231615713e-05,
"loss": 1.9861,
"step": 1900
},
{
"epoch": 0.6498256357914434,
"grad_norm": 1.890625,
"learning_rate": 1.2836968476016196e-05,
"loss": 2.0683,
"step": 1910
},
{
"epoch": 0.6532278642510845,
"grad_norm": 1.6875,
"learning_rate": 1.2829241313665494e-05,
"loss": 2.0916,
"step": 1920
},
{
"epoch": 0.6566300927107255,
"grad_norm": 1.609375,
"learning_rate": 1.2821467803215395e-05,
"loss": 2.0254,
"step": 1930
},
{
"epoch": 0.6600323211703666,
"grad_norm": 1.9765625,
"learning_rate": 1.2813648003669482e-05,
"loss": 2.0332,
"step": 1940
},
{
"epoch": 0.6634345496300077,
"grad_norm": 1.9140625,
"learning_rate": 1.2805781974382694e-05,
"loss": 2.0225,
"step": 1950
},
{
"epoch": 0.6668367780896487,
"grad_norm": 1.859375,
"learning_rate": 1.2797869775060866e-05,
"loss": 2.0563,
"step": 1960
},
{
"epoch": 0.6702390065492898,
"grad_norm": 1.6953125,
"learning_rate": 1.2789911465760281e-05,
"loss": 2.0027,
"step": 1970
},
{
"epoch": 0.6736412350089308,
"grad_norm": 1.890625,
"learning_rate": 1.2781907106887209e-05,
"loss": 1.9895,
"step": 1980
},
{
"epoch": 0.6770434634685719,
"grad_norm": 2.015625,
"learning_rate": 1.2773856759197455e-05,
"loss": 2.0175,
"step": 1990
},
{
"epoch": 0.680445691928213,
"grad_norm": 1.7890625,
"learning_rate": 1.2765760483795895e-05,
"loss": 2.0702,
"step": 2000
},
{
"epoch": 0.683847920387854,
"grad_norm": 1.796875,
"learning_rate": 1.275761834213601e-05,
"loss": 2.023,
"step": 2010
},
{
"epoch": 0.6872501488474951,
"grad_norm": 1.9140625,
"learning_rate": 1.2749430396019423e-05,
"loss": 2.0051,
"step": 2020
},
{
"epoch": 0.6906523773071361,
"grad_norm": 1.9765625,
"learning_rate": 1.2741196707595429e-05,
"loss": 2.017,
"step": 2030
},
{
"epoch": 0.6940546057667772,
"grad_norm": 1.9296875,
"learning_rate": 1.273291733936052e-05,
"loss": 2.0481,
"step": 2040
},
{
"epoch": 0.6974568342264184,
"grad_norm": 1.7265625,
"learning_rate": 1.2724592354157912e-05,
"loss": 2.0281,
"step": 2050
},
{
"epoch": 0.7008590626860594,
"grad_norm": 1.8984375,
"learning_rate": 1.2716221815177076e-05,
"loss": 2.0459,
"step": 2060
},
{
"epoch": 0.7042612911457005,
"grad_norm": 2.21875,
"learning_rate": 1.2707805785953245e-05,
"loss": 2.0705,
"step": 2070
},
{
"epoch": 0.7076635196053415,
"grad_norm": 2.109375,
"learning_rate": 1.2699344330366942e-05,
"loss": 2.0759,
"step": 2080
},
{
"epoch": 0.7110657480649826,
"grad_norm": 1.765625,
"learning_rate": 1.2690837512643495e-05,
"loss": 2.0324,
"step": 2090
},
{
"epoch": 0.7144679765246237,
"grad_norm": 1.75,
"learning_rate": 1.2682285397352535e-05,
"loss": 1.9784,
"step": 2100
},
{
"epoch": 0.7178702049842647,
"grad_norm": 1.9140625,
"learning_rate": 1.2673688049407526e-05,
"loss": 1.9902,
"step": 2110
},
{
"epoch": 0.7212724334439058,
"grad_norm": 1.890625,
"learning_rate": 1.266504553406526e-05,
"loss": 2.0631,
"step": 2120
},
{
"epoch": 0.7246746619035468,
"grad_norm": 2.015625,
"learning_rate": 1.2656357916925368e-05,
"loss": 2.0039,
"step": 2130
},
{
"epoch": 0.7280768903631879,
"grad_norm": 2.15625,
"learning_rate": 1.2647625263929817e-05,
"loss": 1.9975,
"step": 2140
},
{
"epoch": 0.7314791188228289,
"grad_norm": 1.71875,
"learning_rate": 1.2638847641362408e-05,
"loss": 2.0368,
"step": 2150
},
{
"epoch": 0.73488134728247,
"grad_norm": 1.9296875,
"learning_rate": 1.2630025115848282e-05,
"loss": 2.0954,
"step": 2160
},
{
"epoch": 0.7382835757421111,
"grad_norm": 1.6484375,
"learning_rate": 1.2621157754353404e-05,
"loss": 2.0297,
"step": 2170
},
{
"epoch": 0.7416858042017521,
"grad_norm": 1.65625,
"learning_rate": 1.2612245624184062e-05,
"loss": 2.0445,
"step": 2180
},
{
"epoch": 0.7450880326613932,
"grad_norm": 1.7578125,
"learning_rate": 1.2603288792986354e-05,
"loss": 2.0587,
"step": 2190
},
{
"epoch": 0.7484902611210342,
"grad_norm": 1.8203125,
"learning_rate": 1.2594287328745672e-05,
"loss": 2.0126,
"step": 2200
},
{
"epoch": 0.7518924895806753,
"grad_norm": 1.7890625,
"learning_rate": 1.258524129978619e-05,
"loss": 2.0213,
"step": 2210
},
{
"epoch": 0.7552947180403164,
"grad_norm": 1.953125,
"learning_rate": 1.257615077477034e-05,
"loss": 1.9826,
"step": 2220
},
{
"epoch": 0.7586969464999574,
"grad_norm": 1.8515625,
"learning_rate": 1.25670158226983e-05,
"loss": 2.0467,
"step": 2230
},
{
"epoch": 0.7620991749595986,
"grad_norm": 1.9765625,
"learning_rate": 1.2557836512907456e-05,
"loss": 1.9924,
"step": 2240
},
{
"epoch": 0.7655014034192396,
"grad_norm": 2.140625,
"learning_rate": 1.2548612915071894e-05,
"loss": 1.9864,
"step": 2250
},
{
"epoch": 0.7689036318788807,
"grad_norm": 1.921875,
"learning_rate": 1.2539345099201851e-05,
"loss": 1.9966,
"step": 2260
},
{
"epoch": 0.7723058603385218,
"grad_norm": 1.875,
"learning_rate": 1.2530033135643203e-05,
"loss": 2.0092,
"step": 2270
},
{
"epoch": 0.7757080887981628,
"grad_norm": 2.1875,
"learning_rate": 1.2520677095076918e-05,
"loss": 1.97,
"step": 2280
},
{
"epoch": 0.7791103172578039,
"grad_norm": 1.96875,
"learning_rate": 1.2511277048518522e-05,
"loss": 1.9781,
"step": 2290
},
{
"epoch": 0.7825125457174449,
"grad_norm": 1.953125,
"learning_rate": 1.2501833067317562e-05,
"loss": 2.0167,
"step": 2300
},
{
"epoch": 0.785914774177086,
"grad_norm": 2.0,
"learning_rate": 1.2492345223157068e-05,
"loss": 2.0108,
"step": 2310
},
{
"epoch": 0.7893170026367271,
"grad_norm": 1.6328125,
"learning_rate": 1.2482813588053004e-05,
"loss": 2.0094,
"step": 2320
},
{
"epoch": 0.7927192310963681,
"grad_norm": 1.3671875,
"learning_rate": 1.2473238234353713e-05,
"loss": 1.9266,
"step": 2330
},
{
"epoch": 0.7961214595560092,
"grad_norm": 1.765625,
"learning_rate": 1.2463619234739388e-05,
"loss": 1.9982,
"step": 2340
},
{
"epoch": 0.7995236880156502,
"grad_norm": 1.875,
"learning_rate": 1.2453956662221504e-05,
"loss": 2.0688,
"step": 2350
},
{
"epoch": 0.8029259164752913,
"grad_norm": 1.890625,
"learning_rate": 1.2444250590142271e-05,
"loss": 1.9658,
"step": 2360
},
{
"epoch": 0.8063281449349324,
"grad_norm": 1.953125,
"learning_rate": 1.2434501092174075e-05,
"loss": 1.9954,
"step": 2370
},
{
"epoch": 0.8097303733945734,
"grad_norm": 1.7421875,
"learning_rate": 1.242470824231892e-05,
"loss": 2.0507,
"step": 2380
},
{
"epoch": 0.8131326018542145,
"grad_norm": 1.7109375,
"learning_rate": 1.241487211490786e-05,
"loss": 2.0469,
"step": 2390
},
{
"epoch": 0.8165348303138555,
"grad_norm": 1.8203125,
"learning_rate": 1.2404992784600451e-05,
"loss": 2.0436,
"step": 2400
},
{
"epoch": 0.8199370587734967,
"grad_norm": 1.78125,
"learning_rate": 1.2395070326384164e-05,
"loss": 2.0195,
"step": 2410
},
{
"epoch": 0.8233392872331377,
"grad_norm": 2.21875,
"learning_rate": 1.238510481557383e-05,
"loss": 1.9674,
"step": 2420
},
{
"epoch": 0.8267415156927788,
"grad_norm": 1.9609375,
"learning_rate": 1.2375096327811061e-05,
"loss": 1.9918,
"step": 2430
},
{
"epoch": 0.8301437441524199,
"grad_norm": 2.078125,
"learning_rate": 1.2365044939063687e-05,
"loss": 2.0161,
"step": 2440
},
{
"epoch": 0.8335459726120609,
"grad_norm": 1.9140625,
"learning_rate": 1.2354950725625158e-05,
"loss": 2.0303,
"step": 2450
},
{
"epoch": 0.836948201071702,
"grad_norm": 2.109375,
"learning_rate": 1.2344813764113985e-05,
"loss": 1.973,
"step": 2460
},
{
"epoch": 0.840350429531343,
"grad_norm": 1.9296875,
"learning_rate": 1.2334634131473154e-05,
"loss": 2.0389,
"step": 2470
},
{
"epoch": 0.8437526579909841,
"grad_norm": 1.78125,
"learning_rate": 1.2324411904969535e-05,
"loss": 2.0597,
"step": 2480
},
{
"epoch": 0.8471548864506252,
"grad_norm": 1.7734375,
"learning_rate": 1.2314147162193302e-05,
"loss": 2.029,
"step": 2490
},
{
"epoch": 0.8505571149102662,
"grad_norm": 1.921875,
"learning_rate": 1.2303839981057342e-05,
"loss": 2.0216,
"step": 2500
},
{
"epoch": 0.8539593433699073,
"grad_norm": 1.96875,
"learning_rate": 1.2293490439796658e-05,
"loss": 1.9839,
"step": 2510
},
{
"epoch": 0.8573615718295483,
"grad_norm": 1.78125,
"learning_rate": 1.2283098616967793e-05,
"loss": 2.0373,
"step": 2520
},
{
"epoch": 0.8607638002891894,
"grad_norm": 1.75,
"learning_rate": 1.2272664591448208e-05,
"loss": 2.075,
"step": 2530
},
{
"epoch": 0.8641660287488305,
"grad_norm": 1.890625,
"learning_rate": 1.2262188442435706e-05,
"loss": 2.071,
"step": 2540
},
{
"epoch": 0.8675682572084715,
"grad_norm": 1.7734375,
"learning_rate": 1.2251670249447816e-05,
"loss": 2.0474,
"step": 2550
},
{
"epoch": 0.8709704856681126,
"grad_norm": 1.7578125,
"learning_rate": 1.22411100923212e-05,
"loss": 1.9866,
"step": 2560
},
{
"epoch": 0.8743727141277536,
"grad_norm": 1.859375,
"learning_rate": 1.2230508051211039e-05,
"loss": 2.0365,
"step": 2570
},
{
"epoch": 0.8777749425873947,
"grad_norm": 2.03125,
"learning_rate": 1.2219864206590427e-05,
"loss": 2.0041,
"step": 2580
},
{
"epoch": 0.8811771710470359,
"grad_norm": 1.9921875,
"learning_rate": 1.2209178639249763e-05,
"loss": 2.0164,
"step": 2590
},
{
"epoch": 0.8845793995066769,
"grad_norm": 1.7578125,
"learning_rate": 1.2198451430296135e-05,
"loss": 2.0469,
"step": 2600
},
{
"epoch": 0.887981627966318,
"grad_norm": 1.921875,
"learning_rate": 1.2187682661152705e-05,
"loss": 1.9873,
"step": 2610
},
{
"epoch": 0.891383856425959,
"grad_norm": 1.5078125,
"learning_rate": 1.2176872413558087e-05,
"loss": 2.0442,
"step": 2620
},
{
"epoch": 0.8947860848856001,
"grad_norm": 1.6640625,
"learning_rate": 1.2166020769565741e-05,
"loss": 2.0356,
"step": 2630
},
{
"epoch": 0.8981883133452412,
"grad_norm": 1.9453125,
"learning_rate": 1.2155127811543326e-05,
"loss": 2.0253,
"step": 2640
},
{
"epoch": 0.9015905418048822,
"grad_norm": 1.8671875,
"learning_rate": 1.2144193622172099e-05,
"loss": 1.974,
"step": 2650
},
{
"epoch": 0.9049927702645233,
"grad_norm": 1.8203125,
"learning_rate": 1.2133218284446276e-05,
"loss": 2.0084,
"step": 2660
},
{
"epoch": 0.9083949987241643,
"grad_norm": 1.9609375,
"learning_rate": 1.2122201881672392e-05,
"loss": 2.1215,
"step": 2670
},
{
"epoch": 0.9117972271838054,
"grad_norm": 1.9140625,
"learning_rate": 1.2111144497468698e-05,
"loss": 1.9749,
"step": 2680
},
{
"epoch": 0.9151994556434464,
"grad_norm": 1.75,
"learning_rate": 1.2100046215764493e-05,
"loss": 1.9601,
"step": 2690
},
{
"epoch": 0.9186016841030875,
"grad_norm": 2.03125,
"learning_rate": 1.2088907120799507e-05,
"loss": 1.9761,
"step": 2700
},
{
"epoch": 0.9220039125627286,
"grad_norm": 1.90625,
"learning_rate": 1.2077727297123258e-05,
"loss": 2.0309,
"step": 2710
},
{
"epoch": 0.9254061410223696,
"grad_norm": 1.6953125,
"learning_rate": 1.2066506829594404e-05,
"loss": 2.0306,
"step": 2720
},
{
"epoch": 0.9288083694820107,
"grad_norm": 1.765625,
"learning_rate": 1.2055245803380112e-05,
"loss": 2.0073,
"step": 2730
},
{
"epoch": 0.9322105979416517,
"grad_norm": 2.046875,
"learning_rate": 1.2043944303955393e-05,
"loss": 1.9904,
"step": 2740
},
{
"epoch": 0.9356128264012928,
"grad_norm": 1.8984375,
"learning_rate": 1.2032602417102472e-05,
"loss": 2.0916,
"step": 2750
},
{
"epoch": 0.939015054860934,
"grad_norm": 1.8828125,
"learning_rate": 1.2021220228910125e-05,
"loss": 1.9665,
"step": 2760
},
{
"epoch": 0.942417283320575,
"grad_norm": 1.984375,
"learning_rate": 1.2009797825773027e-05,
"loss": 1.9822,
"step": 2770
},
{
"epoch": 0.9458195117802161,
"grad_norm": 2.109375,
"learning_rate": 1.1998335294391099e-05,
"loss": 1.9947,
"step": 2780
},
{
"epoch": 0.9492217402398571,
"grad_norm": 1.7578125,
"learning_rate": 1.1986832721768856e-05,
"loss": 1.9626,
"step": 2790
},
{
"epoch": 0.9526239686994982,
"grad_norm": 1.8515625,
"learning_rate": 1.1975290195214724e-05,
"loss": 1.9772,
"step": 2800
},
{
"epoch": 0.9560261971591393,
"grad_norm": 1.921875,
"learning_rate": 1.1963707802340409e-05,
"loss": 2.0471,
"step": 2810
},
{
"epoch": 0.9594284256187803,
"grad_norm": 1.8984375,
"learning_rate": 1.1952085631060207e-05,
"loss": 1.9514,
"step": 2820
},
{
"epoch": 0.9628306540784214,
"grad_norm": 1.9453125,
"learning_rate": 1.1940423769590349e-05,
"loss": 1.9974,
"step": 2830
},
{
"epoch": 0.9662328825380624,
"grad_norm": 1.7578125,
"learning_rate": 1.1928722306448326e-05,
"loss": 2.0036,
"step": 2840
},
{
"epoch": 0.9696351109977035,
"grad_norm": 1.453125,
"learning_rate": 1.1916981330452221e-05,
"loss": 1.9803,
"step": 2850
},
{
"epoch": 0.9730373394573446,
"grad_norm": 1.8515625,
"learning_rate": 1.1905200930720032e-05,
"loss": 2.0608,
"step": 2860
},
{
"epoch": 0.9764395679169856,
"grad_norm": 1.8984375,
"learning_rate": 1.1893381196668997e-05,
"loss": 1.9857,
"step": 2870
},
{
"epoch": 0.9798417963766267,
"grad_norm": 1.6171875,
"learning_rate": 1.1881522218014912e-05,
"loss": 2.0197,
"step": 2880
},
{
"epoch": 0.9832440248362677,
"grad_norm": 1.8984375,
"learning_rate": 1.1869624084771457e-05,
"loss": 1.9883,
"step": 2890
},
{
"epoch": 0.9866462532959088,
"grad_norm": 1.8203125,
"learning_rate": 1.185768688724951e-05,
"loss": 2.0941,
"step": 2900
},
{
"epoch": 0.9900484817555499,
"grad_norm": 1.7109375,
"learning_rate": 1.184571071605645e-05,
"loss": 1.9953,
"step": 2910
},
{
"epoch": 0.9934507102151909,
"grad_norm": 1.7265625,
"learning_rate": 1.1833695662095493e-05,
"loss": 1.9833,
"step": 2920
},
{
"epoch": 0.996852938674832,
"grad_norm": 1.9765625,
"learning_rate": 1.1821641816564982e-05,
"loss": 2.0431,
"step": 2930
},
{
"epoch": 1.000255167134473,
"grad_norm": 1.71875,
"learning_rate": 1.1809549270957697e-05,
"loss": 1.886,
"step": 2940
},
{
"epoch": 1.0036573955941142,
"grad_norm": 2.078125,
"learning_rate": 1.1797418117060173e-05,
"loss": 1.9804,
"step": 2950
},
{
"epoch": 1.0070596240537553,
"grad_norm": 1.875,
"learning_rate": 1.1785248446951988e-05,
"loss": 2.0657,
"step": 2960
},
{
"epoch": 1.0104618525133964,
"grad_norm": 1.9296875,
"learning_rate": 1.1773040353005074e-05,
"loss": 2.0112,
"step": 2970
},
{
"epoch": 1.0138640809730373,
"grad_norm": 2.015625,
"learning_rate": 1.1760793927883016e-05,
"loss": 2.0262,
"step": 2980
},
{
"epoch": 1.0172663094326784,
"grad_norm": 2.109375,
"learning_rate": 1.174850926454034e-05,
"loss": 2.0007,
"step": 2990
},
{
"epoch": 1.0206685378923195,
"grad_norm": 2.03125,
"learning_rate": 1.1736186456221816e-05,
"loss": 1.9723,
"step": 3000
},
{
"epoch": 1.0240707663519606,
"grad_norm": 2.0625,
"learning_rate": 1.1723825596461751e-05,
"loss": 1.9384,
"step": 3010
},
{
"epoch": 1.0274729948116017,
"grad_norm": 1.96875,
"learning_rate": 1.1711426779083267e-05,
"loss": 1.9556,
"step": 3020
},
{
"epoch": 1.0308752232712426,
"grad_norm": 1.828125,
"learning_rate": 1.1698990098197604e-05,
"loss": 1.9963,
"step": 3030
},
{
"epoch": 1.0342774517308837,
"grad_norm": 2.09375,
"learning_rate": 1.1686515648203396e-05,
"loss": 1.9429,
"step": 3040
},
{
"epoch": 1.0376796801905248,
"grad_norm": 2.203125,
"learning_rate": 1.1674003523785957e-05,
"loss": 1.8885,
"step": 3050
},
{
"epoch": 1.041081908650166,
"grad_norm": 1.9765625,
"learning_rate": 1.1661453819916565e-05,
"loss": 1.9456,
"step": 3060
},
{
"epoch": 1.0444841371098068,
"grad_norm": 2.015625,
"learning_rate": 1.1648866631851738e-05,
"loss": 1.9386,
"step": 3070
},
{
"epoch": 1.047886365569448,
"grad_norm": 2.09375,
"learning_rate": 1.1636242055132511e-05,
"loss": 1.9569,
"step": 3080
},
{
"epoch": 1.051288594029089,
"grad_norm": 1.8671875,
"learning_rate": 1.1623580185583711e-05,
"loss": 1.9159,
"step": 3090
},
{
"epoch": 1.0546908224887301,
"grad_norm": 1.9296875,
"learning_rate": 1.1610881119313231e-05,
"loss": 1.9094,
"step": 3100
},
{
"epoch": 1.0580930509483712,
"grad_norm": 2.078125,
"learning_rate": 1.1598144952711302e-05,
"loss": 2.0189,
"step": 3110
},
{
"epoch": 1.0614952794080121,
"grad_norm": 1.8515625,
"learning_rate": 1.1585371782449755e-05,
"loss": 2.0053,
"step": 3120
},
{
"epoch": 1.0648975078676532,
"grad_norm": 2.15625,
"learning_rate": 1.1572561705481294e-05,
"loss": 1.9826,
"step": 3130
},
{
"epoch": 1.0682997363272944,
"grad_norm": 2.015625,
"learning_rate": 1.1559714819038756e-05,
"loss": 1.9597,
"step": 3140
},
{
"epoch": 1.0717019647869355,
"grad_norm": 1.734375,
"learning_rate": 1.1546831220634377e-05,
"loss": 1.9255,
"step": 3150
},
{
"epoch": 1.0751041932465766,
"grad_norm": 2.109375,
"learning_rate": 1.1533911008059046e-05,
"loss": 1.9859,
"step": 3160
},
{
"epoch": 1.0785064217062175,
"grad_norm": 1.7578125,
"learning_rate": 1.1520954279381567e-05,
"loss": 1.9651,
"step": 3170
},
{
"epoch": 1.0819086501658586,
"grad_norm": 1.9296875,
"learning_rate": 1.1507961132947917e-05,
"loss": 1.9321,
"step": 3180
},
{
"epoch": 1.0853108786254997,
"grad_norm": 1.8046875,
"learning_rate": 1.1494931667380492e-05,
"loss": 1.9215,
"step": 3190
},
{
"epoch": 1.0887131070851408,
"grad_norm": 1.9453125,
"learning_rate": 1.1481865981577362e-05,
"loss": 1.982,
"step": 3200
},
{
"epoch": 1.092115335544782,
"grad_norm": 2.125,
"learning_rate": 1.1468764174711526e-05,
"loss": 1.9728,
"step": 3210
},
{
"epoch": 1.0955175640044228,
"grad_norm": 2.046875,
"learning_rate": 1.1455626346230147e-05,
"loss": 2.0267,
"step": 3220
},
{
"epoch": 1.098919792464064,
"grad_norm": 2.359375,
"learning_rate": 1.1442452595853809e-05,
"loss": 1.9484,
"step": 3230
},
{
"epoch": 1.102322020923705,
"grad_norm": 2.0,
"learning_rate": 1.1429243023575758e-05,
"loss": 1.9867,
"step": 3240
},
{
"epoch": 1.1057242493833461,
"grad_norm": 1.8046875,
"learning_rate": 1.1415997729661134e-05,
"loss": 1.9269,
"step": 3250
},
{
"epoch": 1.1091264778429872,
"grad_norm": 1.953125,
"learning_rate": 1.140271681464622e-05,
"loss": 1.9095,
"step": 3260
},
{
"epoch": 1.1125287063026281,
"grad_norm": 1.8515625,
"learning_rate": 1.1389400379337676e-05,
"loss": 2.0021,
"step": 3270
},
{
"epoch": 1.1159309347622692,
"grad_norm": 2.046875,
"learning_rate": 1.137604852481177e-05,
"loss": 2.0117,
"step": 3280
},
{
"epoch": 1.1193331632219103,
"grad_norm": 1.5546875,
"learning_rate": 1.1362661352413616e-05,
"loss": 1.9835,
"step": 3290
},
{
"epoch": 1.1227353916815515,
"grad_norm": 2.1875,
"learning_rate": 1.1349238963756402e-05,
"loss": 1.9492,
"step": 3300
},
{
"epoch": 1.1261376201411926,
"grad_norm": 2.0,
"learning_rate": 1.1335781460720621e-05,
"loss": 1.9394,
"step": 3310
},
{
"epoch": 1.1295398486008335,
"grad_norm": 1.703125,
"learning_rate": 1.1322288945453292e-05,
"loss": 1.9442,
"step": 3320
},
{
"epoch": 1.1329420770604746,
"grad_norm": 1.84375,
"learning_rate": 1.1308761520367196e-05,
"loss": 1.9256,
"step": 3330
},
{
"epoch": 1.1363443055201157,
"grad_norm": 1.96875,
"learning_rate": 1.1295199288140082e-05,
"loss": 1.9861,
"step": 3340
},
{
"epoch": 1.1397465339797568,
"grad_norm": 2.265625,
"learning_rate": 1.1281602351713905e-05,
"loss": 1.9598,
"step": 3350
},
{
"epoch": 1.143148762439398,
"grad_norm": 2.09375,
"learning_rate": 1.1267970814294032e-05,
"loss": 1.9839,
"step": 3360
},
{
"epoch": 1.1465509908990388,
"grad_norm": 2.125,
"learning_rate": 1.1254304779348466e-05,
"loss": 1.9654,
"step": 3370
},
{
"epoch": 1.14995321935868,
"grad_norm": 1.9296875,
"learning_rate": 1.1240604350607055e-05,
"loss": 1.9536,
"step": 3380
},
{
"epoch": 1.153355447818321,
"grad_norm": 1.9296875,
"learning_rate": 1.122686963206071e-05,
"loss": 1.9331,
"step": 3390
},
{
"epoch": 1.156757676277962,
"grad_norm": 1.921875,
"learning_rate": 1.1213100727960614e-05,
"loss": 1.9218,
"step": 3400
},
{
"epoch": 1.1601599047376032,
"grad_norm": 1.9765625,
"learning_rate": 1.1199297742817428e-05,
"loss": 1.9979,
"step": 3410
},
{
"epoch": 1.163562133197244,
"grad_norm": 2.25,
"learning_rate": 1.11854607814005e-05,
"loss": 2.02,
"step": 3420
},
{
"epoch": 1.1669643616568852,
"grad_norm": 2.09375,
"learning_rate": 1.117158994873707e-05,
"loss": 2.0195,
"step": 3430
},
{
"epoch": 1.1703665901165263,
"grad_norm": 1.984375,
"learning_rate": 1.1157685350111472e-05,
"loss": 2.0053,
"step": 3440
},
{
"epoch": 1.1737688185761674,
"grad_norm": 1.84375,
"learning_rate": 1.1143747091064334e-05,
"loss": 2.014,
"step": 3450
},
{
"epoch": 1.1771710470358085,
"grad_norm": 2.0625,
"learning_rate": 1.1129775277391782e-05,
"loss": 1.9057,
"step": 3460
},
{
"epoch": 1.1805732754954494,
"grad_norm": 2.140625,
"learning_rate": 1.1115770015144628e-05,
"loss": 1.9496,
"step": 3470
},
{
"epoch": 1.1839755039550905,
"grad_norm": 1.828125,
"learning_rate": 1.1101731410627574e-05,
"loss": 1.9163,
"step": 3480
},
{
"epoch": 1.1873777324147317,
"grad_norm": 1.890625,
"learning_rate": 1.1087659570398397e-05,
"loss": 1.9717,
"step": 3490
},
{
"epoch": 1.1907799608743728,
"grad_norm": 2.078125,
"learning_rate": 1.1073554601267147e-05,
"loss": 2.0302,
"step": 3500
},
{
"epoch": 1.1941821893340139,
"grad_norm": 1.796875,
"learning_rate": 1.1059416610295336e-05,
"loss": 1.9523,
"step": 3510
},
{
"epoch": 1.1975844177936548,
"grad_norm": 2.015625,
"learning_rate": 1.104524570479512e-05,
"loss": 1.9842,
"step": 3520
},
{
"epoch": 1.2009866462532959,
"grad_norm": 1.875,
"learning_rate": 1.1031041992328483e-05,
"loss": 2.0036,
"step": 3530
},
{
"epoch": 1.204388874712937,
"grad_norm": 2.03125,
"learning_rate": 1.1016805580706439e-05,
"loss": 2.048,
"step": 3540
},
{
"epoch": 1.207791103172578,
"grad_norm": 2.0625,
"learning_rate": 1.1002536577988182e-05,
"loss": 1.9545,
"step": 3550
},
{
"epoch": 1.2111933316322192,
"grad_norm": 1.9921875,
"learning_rate": 1.0988235092480297e-05,
"loss": 1.9575,
"step": 3560
},
{
"epoch": 1.21459556009186,
"grad_norm": 2.015625,
"learning_rate": 1.0973901232735917e-05,
"loss": 1.9759,
"step": 3570
},
{
"epoch": 1.2179977885515012,
"grad_norm": 2.078125,
"learning_rate": 1.0959535107553909e-05,
"loss": 1.9737,
"step": 3580
},
{
"epoch": 1.2214000170111423,
"grad_norm": 1.890625,
"learning_rate": 1.0945136825978049e-05,
"loss": 2.0414,
"step": 3590
},
{
"epoch": 1.2248022454707834,
"grad_norm": 2.0625,
"learning_rate": 1.0930706497296186e-05,
"loss": 1.9566,
"step": 3600
},
{
"epoch": 1.2282044739304245,
"grad_norm": 1.8125,
"learning_rate": 1.0916244231039415e-05,
"loss": 1.9614,
"step": 3610
},
{
"epoch": 1.2316067023900654,
"grad_norm": 2.09375,
"learning_rate": 1.0901750136981258e-05,
"loss": 2.0045,
"step": 3620
},
{
"epoch": 1.2350089308497065,
"grad_norm": 1.578125,
"learning_rate": 1.0887224325136807e-05,
"loss": 1.9703,
"step": 3630
},
{
"epoch": 1.2384111593093476,
"grad_norm": 2.265625,
"learning_rate": 1.0872666905761921e-05,
"loss": 1.9609,
"step": 3640
},
{
"epoch": 1.2418133877689888,
"grad_norm": 1.9296875,
"learning_rate": 1.0858077989352354e-05,
"loss": 1.9865,
"step": 3650
},
{
"epoch": 1.2452156162286299,
"grad_norm": 1.84375,
"learning_rate": 1.084345768664294e-05,
"loss": 1.9276,
"step": 3660
},
{
"epoch": 1.2486178446882708,
"grad_norm": 2.25,
"learning_rate": 1.0828806108606748e-05,
"loss": 1.9673,
"step": 3670
},
{
"epoch": 1.2520200731479119,
"grad_norm": 2.15625,
"learning_rate": 1.081412336645423e-05,
"loss": 1.9522,
"step": 3680
},
{
"epoch": 1.255422301607553,
"grad_norm": 1.953125,
"learning_rate": 1.0799409571632395e-05,
"loss": 1.8882,
"step": 3690
},
{
"epoch": 1.258824530067194,
"grad_norm": 1.9765625,
"learning_rate": 1.0784664835823945e-05,
"loss": 1.9378,
"step": 3700
},
{
"epoch": 1.2622267585268352,
"grad_norm": 1.7421875,
"learning_rate": 1.076988927094643e-05,
"loss": 2.0231,
"step": 3710
},
{
"epoch": 1.265628986986476,
"grad_norm": 2.03125,
"learning_rate": 1.0755082989151417e-05,
"loss": 1.925,
"step": 3720
},
{
"epoch": 1.2690312154461172,
"grad_norm": 2.15625,
"learning_rate": 1.0740246102823613e-05,
"loss": 1.8958,
"step": 3730
},
{
"epoch": 1.2724334439057583,
"grad_norm": 2.015625,
"learning_rate": 1.0725378724580027e-05,
"loss": 1.9536,
"step": 3740
},
{
"epoch": 1.2758356723653994,
"grad_norm": 1.953125,
"learning_rate": 1.0710480967269115e-05,
"loss": 1.9541,
"step": 3750
},
{
"epoch": 1.2792379008250405,
"grad_norm": 1.734375,
"learning_rate": 1.0695552943969919e-05,
"loss": 1.9327,
"step": 3760
},
{
"epoch": 1.2826401292846814,
"grad_norm": 1.9375,
"learning_rate": 1.0680594767991203e-05,
"loss": 1.9935,
"step": 3770
},
{
"epoch": 1.2860423577443225,
"grad_norm": 2.078125,
"learning_rate": 1.0665606552870612e-05,
"loss": 1.9933,
"step": 3780
},
{
"epoch": 1.2894445862039636,
"grad_norm": 2.125,
"learning_rate": 1.0650588412373792e-05,
"loss": 1.9314,
"step": 3790
},
{
"epoch": 1.2928468146636047,
"grad_norm": 1.609375,
"learning_rate": 1.0635540460493534e-05,
"loss": 1.9136,
"step": 3800
},
{
"epoch": 1.2962490431232458,
"grad_norm": 1.796875,
"learning_rate": 1.0620462811448904e-05,
"loss": 1.9175,
"step": 3810
},
{
"epoch": 1.2996512715828867,
"grad_norm": 2.125,
"learning_rate": 1.0605355579684382e-05,
"loss": 1.9929,
"step": 3820
},
{
"epoch": 1.3030535000425278,
"grad_norm": 2.109375,
"learning_rate": 1.0590218879868998e-05,
"loss": 1.9072,
"step": 3830
},
{
"epoch": 1.306455728502169,
"grad_norm": 2.296875,
"learning_rate": 1.0575052826895442e-05,
"loss": 1.9315,
"step": 3840
},
{
"epoch": 1.30985795696181,
"grad_norm": 1.78125,
"learning_rate": 1.0559857535879212e-05,
"loss": 2.045,
"step": 3850
},
{
"epoch": 1.3132601854214512,
"grad_norm": 2.15625,
"learning_rate": 1.0544633122157734e-05,
"loss": 1.9443,
"step": 3860
},
{
"epoch": 1.316662413881092,
"grad_norm": 1.890625,
"learning_rate": 1.0529379701289476e-05,
"loss": 1.9742,
"step": 3870
},
{
"epoch": 1.3200646423407332,
"grad_norm": 1.7890625,
"learning_rate": 1.051409738905309e-05,
"loss": 1.9852,
"step": 3880
},
{
"epoch": 1.3234668708003743,
"grad_norm": 2.1875,
"learning_rate": 1.0498786301446519e-05,
"loss": 1.997,
"step": 3890
},
{
"epoch": 1.3268690992600152,
"grad_norm": 2.0,
"learning_rate": 1.0483446554686125e-05,
"loss": 1.9083,
"step": 3900
},
{
"epoch": 1.3302713277196565,
"grad_norm": 1.8046875,
"learning_rate": 1.0468078265205796e-05,
"loss": 1.974,
"step": 3910
},
{
"epoch": 1.3336735561792974,
"grad_norm": 1.875,
"learning_rate": 1.0452681549656073e-05,
"loss": 1.9885,
"step": 3920
},
{
"epoch": 1.3370757846389385,
"grad_norm": 1.9609375,
"learning_rate": 1.0437256524903258e-05,
"loss": 1.9872,
"step": 3930
},
{
"epoch": 1.3404780130985796,
"grad_norm": 2.046875,
"learning_rate": 1.0421803308028533e-05,
"loss": 1.9477,
"step": 3940
},
{
"epoch": 1.3438802415582205,
"grad_norm": 1.9296875,
"learning_rate": 1.0406322016327067e-05,
"loss": 2.0032,
"step": 3950
},
{
"epoch": 1.3472824700178618,
"grad_norm": 2.015625,
"learning_rate": 1.0390812767307123e-05,
"loss": 1.9942,
"step": 3960
},
{
"epoch": 1.3506846984775027,
"grad_norm": 1.8984375,
"learning_rate": 1.0375275678689174e-05,
"loss": 2.0242,
"step": 3970
},
{
"epoch": 1.3540869269371438,
"grad_norm": 1.90625,
"learning_rate": 1.0359710868405e-05,
"loss": 2.0306,
"step": 3980
},
{
"epoch": 1.357489155396785,
"grad_norm": 2.140625,
"learning_rate": 1.0344118454596807e-05,
"loss": 1.9709,
"step": 3990
},
{
"epoch": 1.3608913838564258,
"grad_norm": 1.9765625,
"learning_rate": 1.032849855561631e-05,
"loss": 1.9812,
"step": 4000
},
{
"epoch": 1.3642936123160672,
"grad_norm": 2.09375,
"learning_rate": 1.0312851290023851e-05,
"loss": 2.0006,
"step": 4010
},
{
"epoch": 1.367695840775708,
"grad_norm": 2.078125,
"learning_rate": 1.0297176776587497e-05,
"loss": 1.9679,
"step": 4020
},
{
"epoch": 1.3710980692353492,
"grad_norm": 2.375,
"learning_rate": 1.028147513428213e-05,
"loss": 1.934,
"step": 4030
},
{
"epoch": 1.3745002976949903,
"grad_norm": 2.046875,
"learning_rate": 1.026574648228855e-05,
"loss": 1.9867,
"step": 4040
},
{
"epoch": 1.3779025261546312,
"grad_norm": 2.359375,
"learning_rate": 1.0249990939992573e-05,
"loss": 1.899,
"step": 4050
},
{
"epoch": 1.3813047546142723,
"grad_norm": 2.15625,
"learning_rate": 1.023420862698412e-05,
"loss": 1.9799,
"step": 4060
},
{
"epoch": 1.3847069830739134,
"grad_norm": 1.9609375,
"learning_rate": 1.021839966305631e-05,
"loss": 2.0251,
"step": 4070
},
{
"epoch": 1.3881092115335545,
"grad_norm": 2.0625,
"learning_rate": 1.0202564168204549e-05,
"loss": 1.9332,
"step": 4080
},
{
"epoch": 1.3915114399931956,
"grad_norm": 2.1875,
"learning_rate": 1.0186702262625632e-05,
"loss": 1.971,
"step": 4090
},
{
"epoch": 1.3949136684528365,
"grad_norm": 2.0625,
"learning_rate": 1.0170814066716807e-05,
"loss": 1.9266,
"step": 4100
},
{
"epoch": 1.3983158969124776,
"grad_norm": 1.984375,
"learning_rate": 1.0154899701074883e-05,
"loss": 1.9282,
"step": 4110
},
{
"epoch": 1.4017181253721187,
"grad_norm": 2.046875,
"learning_rate": 1.0138959286495303e-05,
"loss": 2.0014,
"step": 4120
},
{
"epoch": 1.4051203538317598,
"grad_norm": 2.125,
"learning_rate": 1.0122992943971232e-05,
"loss": 1.9463,
"step": 4130
},
{
"epoch": 1.408522582291401,
"grad_norm": 1.875,
"learning_rate": 1.0107000794692637e-05,
"loss": 2.003,
"step": 4140
},
{
"epoch": 1.4119248107510418,
"grad_norm": 2.234375,
"learning_rate": 1.0090982960045363e-05,
"loss": 2.0,
"step": 4150
},
{
"epoch": 1.415327039210683,
"grad_norm": 2.203125,
"learning_rate": 1.0074939561610221e-05,
"loss": 1.9832,
"step": 4160
},
{
"epoch": 1.418729267670324,
"grad_norm": 2.078125,
"learning_rate": 1.005887072116206e-05,
"loss": 1.8977,
"step": 4170
},
{
"epoch": 1.4221314961299651,
"grad_norm": 1.65625,
"learning_rate": 1.0042776560668832e-05,
"loss": 1.9778,
"step": 4180
},
{
"epoch": 1.4255337245896063,
"grad_norm": 1.9921875,
"learning_rate": 1.0026657202290696e-05,
"loss": 1.9389,
"step": 4190
},
{
"epoch": 1.4289359530492471,
"grad_norm": 2.21875,
"learning_rate": 1.0010512768379053e-05,
"loss": 1.909,
"step": 4200
},
{
"epoch": 1.4323381815088883,
"grad_norm": 2.109375,
"learning_rate": 9.994343381475644e-06,
"loss": 1.9563,
"step": 4210
},
{
"epoch": 1.4357404099685294,
"grad_norm": 2.09375,
"learning_rate": 9.978149164311613e-06,
"loss": 1.9725,
"step": 4220
},
{
"epoch": 1.4391426384281705,
"grad_norm": 1.71875,
"learning_rate": 9.961930239806571e-06,
"loss": 2.0237,
"step": 4230
},
{
"epoch": 1.4425448668878116,
"grad_norm": 1.953125,
"learning_rate": 9.945686731067668e-06,
"loss": 1.9415,
"step": 4240
},
{
"epoch": 1.4459470953474525,
"grad_norm": 2.0625,
"learning_rate": 9.929418761388654e-06,
"loss": 1.9221,
"step": 4250
},
{
"epoch": 1.4493493238070936,
"grad_norm": 2.046875,
"learning_rate": 9.91312645424895e-06,
"loss": 1.9062,
"step": 4260
},
{
"epoch": 1.4527515522667347,
"grad_norm": 2.40625,
"learning_rate": 9.896809933312702e-06,
"loss": 1.9621,
"step": 4270
},
{
"epoch": 1.4561537807263758,
"grad_norm": 2.265625,
"learning_rate": 9.88046932242785e-06,
"loss": 1.9721,
"step": 4280
},
{
"epoch": 1.459556009186017,
"grad_norm": 1.9765625,
"learning_rate": 9.864104745625186e-06,
"loss": 2.0143,
"step": 4290
},
{
"epoch": 1.4629582376456578,
"grad_norm": 2.359375,
"learning_rate": 9.847716327117408e-06,
"loss": 1.9356,
"step": 4300
},
{
"epoch": 1.466360466105299,
"grad_norm": 2.140625,
"learning_rate": 9.831304191298181e-06,
"loss": 1.9466,
"step": 4310
},
{
"epoch": 1.46976269456494,
"grad_norm": 1.890625,
"learning_rate": 9.814868462741196e-06,
"loss": 1.9112,
"step": 4320
},
{
"epoch": 1.4731649230245811,
"grad_norm": 1.953125,
"learning_rate": 9.798409266199217e-06,
"loss": 1.9464,
"step": 4330
},
{
"epoch": 1.4765671514842222,
"grad_norm": 2.046875,
"learning_rate": 9.781926726603141e-06,
"loss": 1.9421,
"step": 4340
},
{
"epoch": 1.4799693799438631,
"grad_norm": 2.09375,
"learning_rate": 9.765420969061045e-06,
"loss": 2.0682,
"step": 4350
},
{
"epoch": 1.4833716084035042,
"grad_norm": 1.7734375,
"learning_rate": 9.748892118857236e-06,
"loss": 1.9912,
"step": 4360
},
{
"epoch": 1.4867738368631453,
"grad_norm": 1.921875,
"learning_rate": 9.73234030145131e-06,
"loss": 1.9594,
"step": 4370
},
{
"epoch": 1.4901760653227865,
"grad_norm": 2.34375,
"learning_rate": 9.71576564247718e-06,
"loss": 1.9444,
"step": 4380
},
{
"epoch": 1.4935782937824276,
"grad_norm": 2.09375,
"learning_rate": 9.699168267742144e-06,
"loss": 1.9882,
"step": 4390
},
{
"epoch": 1.4969805222420685,
"grad_norm": 1.84375,
"learning_rate": 9.682548303225915e-06,
"loss": 1.9076,
"step": 4400
},
{
"epoch": 1.5003827507017096,
"grad_norm": 2.015625,
"learning_rate": 9.665905875079679e-06,
"loss": 1.9594,
"step": 4410
},
{
"epoch": 1.5037849791613507,
"grad_norm": 2.03125,
"learning_rate": 9.649241109625111e-06,
"loss": 2.0808,
"step": 4420
},
{
"epoch": 1.5071872076209918,
"grad_norm": 1.9375,
"learning_rate": 9.632554133353453e-06,
"loss": 1.9688,
"step": 4430
},
{
"epoch": 1.510589436080633,
"grad_norm": 1.953125,
"learning_rate": 9.615845072924522e-06,
"loss": 1.971,
"step": 4440
},
{
"epoch": 1.5139916645402738,
"grad_norm": 1.9609375,
"learning_rate": 9.59911405516577e-06,
"loss": 1.9759,
"step": 4450
},
{
"epoch": 1.517393892999915,
"grad_norm": 2.125,
"learning_rate": 9.582361207071299e-06,
"loss": 1.975,
"step": 4460
},
{
"epoch": 1.520796121459556,
"grad_norm": 1.90625,
"learning_rate": 9.565586655800928e-06,
"loss": 1.9975,
"step": 4470
},
{
"epoch": 1.5241983499191971,
"grad_norm": 1.9453125,
"learning_rate": 9.5487905286792e-06,
"loss": 1.966,
"step": 4480
},
{
"epoch": 1.5276005783788382,
"grad_norm": 2.078125,
"learning_rate": 9.531972953194425e-06,
"loss": 1.9374,
"step": 4490
},
{
"epoch": 1.5310028068384791,
"grad_norm": 2.0625,
"learning_rate": 9.51513405699772e-06,
"loss": 1.9567,
"step": 4500
},
{
"epoch": 1.5344050352981202,
"grad_norm": 2.359375,
"learning_rate": 9.498273967902033e-06,
"loss": 1.9704,
"step": 4510
},
{
"epoch": 1.5378072637577613,
"grad_norm": 2.078125,
"learning_rate": 9.481392813881164e-06,
"loss": 1.9064,
"step": 4520
},
{
"epoch": 1.5412094922174024,
"grad_norm": 2.21875,
"learning_rate": 9.464490723068811e-06,
"loss": 1.9553,
"step": 4530
},
{
"epoch": 1.5446117206770436,
"grad_norm": 2.171875,
"learning_rate": 9.447567823757589e-06,
"loss": 1.9416,
"step": 4540
},
{
"epoch": 1.5480139491366844,
"grad_norm": 1.859375,
"learning_rate": 9.430624244398053e-06,
"loss": 2.0401,
"step": 4550
},
{
"epoch": 1.5514161775963256,
"grad_norm": 2.125,
"learning_rate": 9.413660113597731e-06,
"loss": 1.9495,
"step": 4560
},
{
"epoch": 1.5548184060559667,
"grad_norm": 2.296875,
"learning_rate": 9.396675560120143e-06,
"loss": 2.0093,
"step": 4570
},
{
"epoch": 1.5582206345156078,
"grad_norm": 2.203125,
"learning_rate": 9.379670712883817e-06,
"loss": 1.974,
"step": 4580
},
{
"epoch": 1.5616228629752489,
"grad_norm": 1.96875,
"learning_rate": 9.362645700961327e-06,
"loss": 1.935,
"step": 4590
},
{
"epoch": 1.5650250914348898,
"grad_norm": 2.171875,
"learning_rate": 9.345600653578297e-06,
"loss": 1.9727,
"step": 4600
},
{
"epoch": 1.5684273198945309,
"grad_norm": 2.34375,
"learning_rate": 9.328535700112433e-06,
"loss": 1.9115,
"step": 4610
},
{
"epoch": 1.571829548354172,
"grad_norm": 2.109375,
"learning_rate": 9.311450970092529e-06,
"loss": 1.9329,
"step": 4620
},
{
"epoch": 1.575231776813813,
"grad_norm": 1.9609375,
"learning_rate": 9.294346593197489e-06,
"loss": 1.9534,
"step": 4630
},
{
"epoch": 1.5786340052734542,
"grad_norm": 1.9609375,
"learning_rate": 9.277222699255353e-06,
"loss": 1.9047,
"step": 4640
},
{
"epoch": 1.582036233733095,
"grad_norm": 1.9765625,
"learning_rate": 9.260079418242293e-06,
"loss": 1.9975,
"step": 4650
},
{
"epoch": 1.5854384621927362,
"grad_norm": 2.359375,
"learning_rate": 9.242916880281638e-06,
"loss": 1.9347,
"step": 4660
},
{
"epoch": 1.5888406906523773,
"grad_norm": 2.1875,
"learning_rate": 9.225735215642885e-06,
"loss": 1.9552,
"step": 4670
},
{
"epoch": 1.5922429191120182,
"grad_norm": 2.109375,
"learning_rate": 9.208534554740706e-06,
"loss": 1.9052,
"step": 4680
},
{
"epoch": 1.5956451475716595,
"grad_norm": 2.1875,
"learning_rate": 9.191315028133966e-06,
"loss": 1.9881,
"step": 4690
},
{
"epoch": 1.5990473760313004,
"grad_norm": 2.0625,
"learning_rate": 9.17407676652472e-06,
"loss": 1.9671,
"step": 4700
},
{
"epoch": 1.6024496044909415,
"grad_norm": 2.203125,
"learning_rate": 9.156819900757237e-06,
"loss": 1.9753,
"step": 4710
},
{
"epoch": 1.6058518329505826,
"grad_norm": 1.9140625,
"learning_rate": 9.139544561816991e-06,
"loss": 1.9516,
"step": 4720
},
{
"epoch": 1.6092540614102235,
"grad_norm": 2.234375,
"learning_rate": 9.122250880829674e-06,
"loss": 1.9615,
"step": 4730
},
{
"epoch": 1.6126562898698649,
"grad_norm": 2.15625,
"learning_rate": 9.104938989060205e-06,
"loss": 1.9325,
"step": 4740
},
{
"epoch": 1.6160585183295058,
"grad_norm": 1.984375,
"learning_rate": 9.087609017911725e-06,
"loss": 1.9227,
"step": 4750
},
{
"epoch": 1.6194607467891469,
"grad_norm": 2.109375,
"learning_rate": 9.070261098924604e-06,
"loss": 1.9796,
"step": 4760
},
{
"epoch": 1.622862975248788,
"grad_norm": 2.1875,
"learning_rate": 9.052895363775442e-06,
"loss": 1.977,
"step": 4770
},
{
"epoch": 1.6262652037084289,
"grad_norm": 2.046875,
"learning_rate": 9.035511944276075e-06,
"loss": 1.8778,
"step": 4780
},
{
"epoch": 1.6296674321680702,
"grad_norm": 2.546875,
"learning_rate": 9.018110972372563e-06,
"loss": 1.924,
"step": 4790
},
{
"epoch": 1.633069660627711,
"grad_norm": 1.9140625,
"learning_rate": 9.000692580144194e-06,
"loss": 1.9173,
"step": 4800
},
{
"epoch": 1.6364718890873522,
"grad_norm": 2.40625,
"learning_rate": 8.983256899802485e-06,
"loss": 1.9433,
"step": 4810
},
{
"epoch": 1.6398741175469933,
"grad_norm": 2.09375,
"learning_rate": 8.96580406369018e-06,
"loss": 1.9947,
"step": 4820
},
{
"epoch": 1.6432763460066342,
"grad_norm": 1.9921875,
"learning_rate": 8.948334204280234e-06,
"loss": 1.9073,
"step": 4830
},
{
"epoch": 1.6466785744662755,
"grad_norm": 2.3125,
"learning_rate": 8.930847454174817e-06,
"loss": 1.9565,
"step": 4840
},
{
"epoch": 1.6500808029259164,
"grad_norm": 2.15625,
"learning_rate": 8.913343946104305e-06,
"loss": 1.8945,
"step": 4850
},
{
"epoch": 1.6534830313855575,
"grad_norm": 2.296875,
"learning_rate": 8.895823812926273e-06,
"loss": 1.9491,
"step": 4860
},
{
"epoch": 1.6568852598451986,
"grad_norm": 2.203125,
"learning_rate": 8.878287187624486e-06,
"loss": 1.8916,
"step": 4870
},
{
"epoch": 1.6602874883048395,
"grad_norm": 1.9296875,
"learning_rate": 8.860734203307893e-06,
"loss": 1.9758,
"step": 4880
},
{
"epoch": 1.6636897167644809,
"grad_norm": 1.9453125,
"learning_rate": 8.84316499320961e-06,
"loss": 1.9791,
"step": 4890
},
{
"epoch": 1.6670919452241217,
"grad_norm": 2.0,
"learning_rate": 8.825579690685907e-06,
"loss": 2.0407,
"step": 4900
},
{
"epoch": 1.6704941736837629,
"grad_norm": 1.953125,
"learning_rate": 8.807978429215212e-06,
"loss": 2.0039,
"step": 4910
},
{
"epoch": 1.673896402143404,
"grad_norm": 2.203125,
"learning_rate": 8.79036134239708e-06,
"loss": 2.0093,
"step": 4920
},
{
"epoch": 1.6772986306030448,
"grad_norm": 2.265625,
"learning_rate": 8.772728563951189e-06,
"loss": 1.8997,
"step": 4930
},
{
"epoch": 1.6807008590626862,
"grad_norm": 2.140625,
"learning_rate": 8.755080227716316e-06,
"loss": 1.908,
"step": 4940
},
{
"epoch": 1.684103087522327,
"grad_norm": 1.8515625,
"learning_rate": 8.737416467649337e-06,
"loss": 1.9478,
"step": 4950
},
{
"epoch": 1.6875053159819682,
"grad_norm": 2.203125,
"learning_rate": 8.71973741782419e-06,
"loss": 1.9497,
"step": 4960
},
{
"epoch": 1.6909075444416093,
"grad_norm": 1.8125,
"learning_rate": 8.70204321243087e-06,
"loss": 1.9035,
"step": 4970
},
{
"epoch": 1.6943097729012502,
"grad_norm": 2.171875,
"learning_rate": 8.684333985774413e-06,
"loss": 1.9666,
"step": 4980
},
{
"epoch": 1.6977120013608915,
"grad_norm": 2.484375,
"learning_rate": 8.666609872273867e-06,
"loss": 1.9943,
"step": 4990
},
{
"epoch": 1.7011142298205324,
"grad_norm": 2.234375,
"learning_rate": 8.648871006461278e-06,
"loss": 1.9293,
"step": 5000
},
{
"epoch": 1.7045164582801735,
"grad_norm": 2.140625,
"learning_rate": 8.631117522980663e-06,
"loss": 1.9369,
"step": 5010
},
{
"epoch": 1.7079186867398146,
"grad_norm": 2.046875,
"learning_rate": 8.613349556587001e-06,
"loss": 1.9117,
"step": 5020
},
{
"epoch": 1.7113209151994555,
"grad_norm": 2.078125,
"learning_rate": 8.59556724214519e-06,
"loss": 1.9757,
"step": 5030
},
{
"epoch": 1.7147231436590968,
"grad_norm": 2.328125,
"learning_rate": 8.577770714629042e-06,
"loss": 1.9838,
"step": 5040
},
{
"epoch": 1.7181253721187377,
"grad_norm": 2.328125,
"learning_rate": 8.559960109120251e-06,
"loss": 1.9571,
"step": 5050
},
{
"epoch": 1.7215276005783788,
"grad_norm": 2.140625,
"learning_rate": 8.542135560807365e-06,
"loss": 1.9588,
"step": 5060
},
{
"epoch": 1.72492982903802,
"grad_norm": 2.15625,
"learning_rate": 8.524297204984759e-06,
"loss": 1.9542,
"step": 5070
},
{
"epoch": 1.7283320574976608,
"grad_norm": 1.9765625,
"learning_rate": 8.506445177051624e-06,
"loss": 1.9691,
"step": 5080
},
{
"epoch": 1.7317342859573022,
"grad_norm": 1.953125,
"learning_rate": 8.488579612510915e-06,
"loss": 1.9141,
"step": 5090
},
{
"epoch": 1.735136514416943,
"grad_norm": 2.0,
"learning_rate": 8.470700646968339e-06,
"loss": 2.0129,
"step": 5100
},
{
"epoch": 1.7385387428765842,
"grad_norm": 2.171875,
"learning_rate": 8.452808416131319e-06,
"loss": 1.9424,
"step": 5110
},
{
"epoch": 1.7419409713362253,
"grad_norm": 1.8984375,
"learning_rate": 8.434903055807971e-06,
"loss": 1.9041,
"step": 5120
},
{
"epoch": 1.7453431997958662,
"grad_norm": 1.859375,
"learning_rate": 8.416984701906065e-06,
"loss": 1.9514,
"step": 5130
},
{
"epoch": 1.7487454282555075,
"grad_norm": 1.7421875,
"learning_rate": 8.399053490431994e-06,
"loss": 1.9846,
"step": 5140
},
{
"epoch": 1.7521476567151484,
"grad_norm": 2.03125,
"learning_rate": 8.38110955748975e-06,
"loss": 1.9242,
"step": 5150
},
{
"epoch": 1.7555498851747895,
"grad_norm": 2.015625,
"learning_rate": 8.363153039279882e-06,
"loss": 1.9853,
"step": 5160
},
{
"epoch": 1.7589521136344306,
"grad_norm": 2.15625,
"learning_rate": 8.345184072098464e-06,
"loss": 2.0005,
"step": 5170
},
{
"epoch": 1.7623543420940715,
"grad_norm": 2.171875,
"learning_rate": 8.327202792336068e-06,
"loss": 2.0181,
"step": 5180
},
{
"epoch": 1.7657565705537128,
"grad_norm": 2.234375,
"learning_rate": 8.309209336476713e-06,
"loss": 1.9119,
"step": 5190
},
{
"epoch": 1.7691587990133537,
"grad_norm": 2.328125,
"learning_rate": 8.29120384109685e-06,
"loss": 1.9061,
"step": 5200
},
{
"epoch": 1.7725610274729948,
"grad_norm": 2.046875,
"learning_rate": 8.273186442864303e-06,
"loss": 1.9584,
"step": 5210
},
{
"epoch": 1.775963255932636,
"grad_norm": 2.1875,
"learning_rate": 8.25515727853725e-06,
"loss": 1.9456,
"step": 5220
},
{
"epoch": 1.7793654843922768,
"grad_norm": 2.109375,
"learning_rate": 8.23711648496318e-06,
"loss": 1.9162,
"step": 5230
},
{
"epoch": 1.7827677128519182,
"grad_norm": 2.3125,
"learning_rate": 8.219064199077837e-06,
"loss": 1.9735,
"step": 5240
},
{
"epoch": 1.786169941311559,
"grad_norm": 2.296875,
"learning_rate": 8.201000557904217e-06,
"loss": 1.9512,
"step": 5250
},
{
"epoch": 1.7895721697712001,
"grad_norm": 2.046875,
"learning_rate": 8.182925698551491e-06,
"loss": 1.9886,
"step": 5260
},
{
"epoch": 1.7929743982308413,
"grad_norm": 2.390625,
"learning_rate": 8.164839758213986e-06,
"loss": 1.9956,
"step": 5270
},
{
"epoch": 1.7963766266904821,
"grad_norm": 2.28125,
"learning_rate": 8.14674287417013e-06,
"loss": 1.9076,
"step": 5280
},
{
"epoch": 1.7997788551501235,
"grad_norm": 1.84375,
"learning_rate": 8.128635183781433e-06,
"loss": 1.912,
"step": 5290
},
{
"epoch": 1.8031810836097644,
"grad_norm": 2.21875,
"learning_rate": 8.11051682449141e-06,
"loss": 1.9582,
"step": 5300
},
{
"epoch": 1.8065833120694055,
"grad_norm": 2.296875,
"learning_rate": 8.092387933824571e-06,
"loss": 1.979,
"step": 5310
},
{
"epoch": 1.8099855405290466,
"grad_norm": 2.46875,
"learning_rate": 8.074248649385357e-06,
"loss": 1.9679,
"step": 5320
},
{
"epoch": 1.8133877689886875,
"grad_norm": 2.21875,
"learning_rate": 8.056099108857101e-06,
"loss": 1.9288,
"step": 5330
},
{
"epoch": 1.8167899974483288,
"grad_norm": 2.296875,
"learning_rate": 8.037939450000985e-06,
"loss": 1.922,
"step": 5340
},
{
"epoch": 1.8201922259079697,
"grad_norm": 2.1875,
"learning_rate": 8.019769810654989e-06,
"loss": 1.9022,
"step": 5350
},
{
"epoch": 1.8235944543676108,
"grad_norm": 2.0,
"learning_rate": 8.00159032873285e-06,
"loss": 1.9698,
"step": 5360
},
{
"epoch": 1.826996682827252,
"grad_norm": 2.171875,
"learning_rate": 7.98340114222302e-06,
"loss": 1.9087,
"step": 5370
},
{
"epoch": 1.8303989112868928,
"grad_norm": 2.140625,
"learning_rate": 7.9652023891876e-06,
"loss": 1.9785,
"step": 5380
},
{
"epoch": 1.8338011397465341,
"grad_norm": 2.015625,
"learning_rate": 7.946994207761316e-06,
"loss": 1.9983,
"step": 5390
},
{
"epoch": 1.837203368206175,
"grad_norm": 2.328125,
"learning_rate": 7.928776736150451e-06,
"loss": 2.0148,
"step": 5400
},
{
"epoch": 1.8406055966658161,
"grad_norm": 2.109375,
"learning_rate": 7.910550112631802e-06,
"loss": 1.9808,
"step": 5410
},
{
"epoch": 1.8440078251254572,
"grad_norm": 2.15625,
"learning_rate": 7.892314475551641e-06,
"loss": 1.9153,
"step": 5420
},
{
"epoch": 1.8474100535850981,
"grad_norm": 2.109375,
"learning_rate": 7.87406996332465e-06,
"loss": 1.9285,
"step": 5430
},
{
"epoch": 1.8508122820447395,
"grad_norm": 2.34375,
"learning_rate": 7.855816714432878e-06,
"loss": 1.952,
"step": 5440
},
{
"epoch": 1.8542145105043804,
"grad_norm": 2.203125,
"learning_rate": 7.837554867424685e-06,
"loss": 1.9335,
"step": 5450
},
{
"epoch": 1.8576167389640215,
"grad_norm": 2.34375,
"learning_rate": 7.8192845609137e-06,
"loss": 1.943,
"step": 5460
},
{
"epoch": 1.8610189674236626,
"grad_norm": 2.203125,
"learning_rate": 7.801005933577753e-06,
"loss": 2.0204,
"step": 5470
},
{
"epoch": 1.8644211958833035,
"grad_norm": 2.1875,
"learning_rate": 7.782719124157842e-06,
"loss": 1.915,
"step": 5480
},
{
"epoch": 1.8678234243429448,
"grad_norm": 2.21875,
"learning_rate": 7.764424271457067e-06,
"loss": 1.9207,
"step": 5490
},
{
"epoch": 1.8712256528025857,
"grad_norm": 2.015625,
"learning_rate": 7.746121514339576e-06,
"loss": 1.9593,
"step": 5500
},
{
"epoch": 1.8746278812622268,
"grad_norm": 1.828125,
"learning_rate": 7.727810991729512e-06,
"loss": 1.904,
"step": 5510
},
{
"epoch": 1.878030109721868,
"grad_norm": 1.9140625,
"learning_rate": 7.709492842609971e-06,
"loss": 1.9757,
"step": 5520
},
{
"epoch": 1.8814323381815088,
"grad_norm": 1.9140625,
"learning_rate": 7.691167206021928e-06,
"loss": 1.938,
"step": 5530
},
{
"epoch": 1.88483456664115,
"grad_norm": 2.484375,
"learning_rate": 7.67283422106319e-06,
"loss": 1.956,
"step": 5540
},
{
"epoch": 1.888236795100791,
"grad_norm": 1.7578125,
"learning_rate": 7.654494026887346e-06,
"loss": 1.9298,
"step": 5550
},
{
"epoch": 1.8916390235604321,
"grad_norm": 1.890625,
"learning_rate": 7.636146762702703e-06,
"loss": 1.8893,
"step": 5560
},
{
"epoch": 1.8950412520200732,
"grad_norm": 2.15625,
"learning_rate": 7.617792567771233e-06,
"loss": 1.9309,
"step": 5570
},
{
"epoch": 1.8984434804797141,
"grad_norm": 2.578125,
"learning_rate": 7.59943158140751e-06,
"loss": 1.9064,
"step": 5580
},
{
"epoch": 1.9018457089393552,
"grad_norm": 2.203125,
"learning_rate": 7.581063942977662e-06,
"loss": 1.9647,
"step": 5590
},
{
"epoch": 1.9052479373989963,
"grad_norm": 2.1875,
"learning_rate": 7.56268979189831e-06,
"loss": 1.9417,
"step": 5600
},
{
"epoch": 1.9086501658586374,
"grad_norm": 2.421875,
"learning_rate": 7.544309267635502e-06,
"loss": 1.96,
"step": 5610
},
{
"epoch": 1.9120523943182786,
"grad_norm": 2.25,
"learning_rate": 7.525922509703665e-06,
"loss": 1.9672,
"step": 5620
},
{
"epoch": 1.9154546227779194,
"grad_norm": 2.1875,
"learning_rate": 7.507529657664538e-06,
"loss": 1.9975,
"step": 5630
},
{
"epoch": 1.9188568512375606,
"grad_norm": 2.078125,
"learning_rate": 7.489130851126123e-06,
"loss": 1.9763,
"step": 5640
},
{
"epoch": 1.9222590796972017,
"grad_norm": 2.171875,
"learning_rate": 7.470726229741613e-06,
"loss": 1.9206,
"step": 5650
},
{
"epoch": 1.9256613081568428,
"grad_norm": 2.484375,
"learning_rate": 7.45231593320834e-06,
"loss": 2.0314,
"step": 5660
},
{
"epoch": 1.9290635366164839,
"grad_norm": 2.109375,
"learning_rate": 7.433900101266712e-06,
"loss": 1.9449,
"step": 5670
},
{
"epoch": 1.9324657650761248,
"grad_norm": 2.0,
"learning_rate": 7.415478873699151e-06,
"loss": 1.9294,
"step": 5680
},
{
"epoch": 1.9358679935357659,
"grad_norm": 1.8828125,
"learning_rate": 7.3970523903290335e-06,
"loss": 1.8888,
"step": 5690
},
{
"epoch": 1.939270221995407,
"grad_norm": 2.25,
"learning_rate": 7.378620791019634e-06,
"loss": 1.9365,
"step": 5700
},
{
"epoch": 1.942672450455048,
"grad_norm": 1.8828125,
"learning_rate": 7.360184215673055e-06,
"loss": 1.9441,
"step": 5710
},
{
"epoch": 1.9460746789146892,
"grad_norm": 2.28125,
"learning_rate": 7.341742804229166e-06,
"loss": 1.9156,
"step": 5720
},
{
"epoch": 1.94947690737433,
"grad_norm": 2.375,
"learning_rate": 7.32329669666455e-06,
"loss": 1.9051,
"step": 5730
},
{
"epoch": 1.9528791358339712,
"grad_norm": 2.109375,
"learning_rate": 7.304846032991432e-06,
"loss": 2.0019,
"step": 5740
},
{
"epoch": 1.9562813642936123,
"grad_norm": 1.875,
"learning_rate": 7.2863909532566196e-06,
"loss": 1.8679,
"step": 5750
},
{
"epoch": 1.9596835927532534,
"grad_norm": 2.234375,
"learning_rate": 7.2679315975404405e-06,
"loss": 1.9605,
"step": 5760
},
{
"epoch": 1.9630858212128945,
"grad_norm": 1.9375,
"learning_rate": 7.249468105955679e-06,
"loss": 1.9355,
"step": 5770
},
{
"epoch": 1.9664880496725354,
"grad_norm": 2.0,
"learning_rate": 7.231000618646511e-06,
"loss": 1.8908,
"step": 5780
},
{
"epoch": 1.9698902781321765,
"grad_norm": 2.203125,
"learning_rate": 7.212529275787436e-06,
"loss": 1.9578,
"step": 5790
},
{
"epoch": 1.9732925065918177,
"grad_norm": 2.265625,
"learning_rate": 7.194054217582234e-06,
"loss": 1.9287,
"step": 5800
},
{
"epoch": 1.9766947350514585,
"grad_norm": 2.375,
"learning_rate": 7.17557558426287e-06,
"loss": 1.9672,
"step": 5810
},
{
"epoch": 1.9800969635110999,
"grad_norm": 2.0,
"learning_rate": 7.157093516088451e-06,
"loss": 1.9581,
"step": 5820
},
{
"epoch": 1.9834991919707408,
"grad_norm": 2.015625,
"learning_rate": 7.138608153344156e-06,
"loss": 1.9872,
"step": 5830
},
{
"epoch": 1.9869014204303819,
"grad_norm": 1.921875,
"learning_rate": 7.120119636340172e-06,
"loss": 1.9525,
"step": 5840
},
{
"epoch": 1.990303648890023,
"grad_norm": 1.890625,
"learning_rate": 7.101628105410625e-06,
"loss": 1.9093,
"step": 5850
},
{
"epoch": 1.9937058773496639,
"grad_norm": 2.234375,
"learning_rate": 7.0831337009125195e-06,
"loss": 1.9706,
"step": 5860
},
{
"epoch": 1.9971081058093052,
"grad_norm": 2.3125,
"learning_rate": 7.064636563224674e-06,
"loss": 1.9331,
"step": 5870
},
{
"epoch": 2.000510334268946,
"grad_norm": 2.203125,
"learning_rate": 7.046136832746647e-06,
"loss": 1.9434,
"step": 5880
},
{
"epoch": 2.0039125627285874,
"grad_norm": 2.265625,
"learning_rate": 7.027634649897679e-06,
"loss": 1.8678,
"step": 5890
},
{
"epoch": 2.0073147911882283,
"grad_norm": 2.421875,
"learning_rate": 7.009130155115627e-06,
"loss": 1.9193,
"step": 5900
},
{
"epoch": 2.010717019647869,
"grad_norm": 2.125,
"learning_rate": 6.990623488855899e-06,
"loss": 1.9459,
"step": 5910
},
{
"epoch": 2.0141192481075105,
"grad_norm": 2.46875,
"learning_rate": 6.972114791590378e-06,
"loss": 1.9229,
"step": 5920
},
{
"epoch": 2.0175214765671514,
"grad_norm": 2.03125,
"learning_rate": 6.953604203806366e-06,
"loss": 1.9008,
"step": 5930
},
{
"epoch": 2.0209237050267927,
"grad_norm": 2.5625,
"learning_rate": 6.935091866005518e-06,
"loss": 1.9513,
"step": 5940
},
{
"epoch": 2.0243259334864336,
"grad_norm": 2.125,
"learning_rate": 6.9165779187027685e-06,
"loss": 1.9013,
"step": 5950
},
{
"epoch": 2.0277281619460745,
"grad_norm": 2.25,
"learning_rate": 6.898062502425267e-06,
"loss": 1.914,
"step": 5960
},
{
"epoch": 2.031130390405716,
"grad_norm": 2.015625,
"learning_rate": 6.87954575771132e-06,
"loss": 1.8773,
"step": 5970
},
{
"epoch": 2.0345326188653567,
"grad_norm": 2.234375,
"learning_rate": 6.861027825109312e-06,
"loss": 1.9337,
"step": 5980
},
{
"epoch": 2.037934847324998,
"grad_norm": 2.234375,
"learning_rate": 6.842508845176642e-06,
"loss": 1.9866,
"step": 5990
},
{
"epoch": 2.041337075784639,
"grad_norm": 1.9921875,
"learning_rate": 6.8239889584786644e-06,
"loss": 1.9557,
"step": 6000
},
{
"epoch": 2.04473930424428,
"grad_norm": 2.0,
"learning_rate": 6.805468305587612e-06,
"loss": 1.9082,
"step": 6010
},
{
"epoch": 2.048141532703921,
"grad_norm": 2.234375,
"learning_rate": 6.786947027081537e-06,
"loss": 1.8822,
"step": 6020
},
{
"epoch": 2.051543761163562,
"grad_norm": 2.296875,
"learning_rate": 6.768425263543234e-06,
"loss": 1.9611,
"step": 6030
},
{
"epoch": 2.0549459896232034,
"grad_norm": 2.171875,
"learning_rate": 6.7499031555591875e-06,
"loss": 1.9623,
"step": 6040
},
{
"epoch": 2.0583482180828443,
"grad_norm": 2.328125,
"learning_rate": 6.7313808437184895e-06,
"loss": 1.9902,
"step": 6050
},
{
"epoch": 2.061750446542485,
"grad_norm": 2.21875,
"learning_rate": 6.71285846861178e-06,
"loss": 1.9358,
"step": 6060
},
{
"epoch": 2.0651526750021265,
"grad_norm": 2.40625,
"learning_rate": 6.694336170830184e-06,
"loss": 1.8377,
"step": 6070
},
{
"epoch": 2.0685549034617674,
"grad_norm": 2.359375,
"learning_rate": 6.675814090964238e-06,
"loss": 1.9771,
"step": 6080
},
{
"epoch": 2.0719571319214087,
"grad_norm": 2.0625,
"learning_rate": 6.6572923696028185e-06,
"loss": 1.8634,
"step": 6090
},
{
"epoch": 2.0753593603810496,
"grad_norm": 2.609375,
"learning_rate": 6.638771147332086e-06,
"loss": 1.9388,
"step": 6100
},
{
"epoch": 2.0787615888406905,
"grad_norm": 2.203125,
"learning_rate": 6.62025056473442e-06,
"loss": 1.918,
"step": 6110
},
{
"epoch": 2.082163817300332,
"grad_norm": 2.234375,
"learning_rate": 6.601730762387327e-06,
"loss": 1.9617,
"step": 6120
},
{
"epoch": 2.0855660457599727,
"grad_norm": 2.234375,
"learning_rate": 6.583211880862406e-06,
"loss": 1.9056,
"step": 6130
},
{
"epoch": 2.0889682742196136,
"grad_norm": 2.15625,
"learning_rate": 6.56469406072426e-06,
"loss": 1.9458,
"step": 6140
},
{
"epoch": 2.092370502679255,
"grad_norm": 2.109375,
"learning_rate": 6.546177442529437e-06,
"loss": 1.9393,
"step": 6150
},
{
"epoch": 2.095772731138896,
"grad_norm": 2.140625,
"learning_rate": 6.5276621668253645e-06,
"loss": 1.9038,
"step": 6160
},
{
"epoch": 2.099174959598537,
"grad_norm": 2.265625,
"learning_rate": 6.509148374149276e-06,
"loss": 1.9621,
"step": 6170
},
{
"epoch": 2.102577188058178,
"grad_norm": 2.015625,
"learning_rate": 6.490636205027152e-06,
"loss": 1.9206,
"step": 6180
},
{
"epoch": 2.105979416517819,
"grad_norm": 2.515625,
"learning_rate": 6.472125799972643e-06,
"loss": 1.9409,
"step": 6190
},
{
"epoch": 2.1093816449774603,
"grad_norm": 2.53125,
"learning_rate": 6.453617299486017e-06,
"loss": 1.9348,
"step": 6200
},
{
"epoch": 2.112783873437101,
"grad_norm": 2.109375,
"learning_rate": 6.435110844053086e-06,
"loss": 1.9364,
"step": 6210
},
{
"epoch": 2.1161861018967425,
"grad_norm": 2.46875,
"learning_rate": 6.416606574144131e-06,
"loss": 1.9042,
"step": 6220
},
{
"epoch": 2.1195883303563834,
"grad_norm": 2.34375,
"learning_rate": 6.398104630212853e-06,
"loss": 1.9547,
"step": 6230
},
{
"epoch": 2.1229905588160243,
"grad_norm": 2.4375,
"learning_rate": 6.379605152695294e-06,
"loss": 1.9768,
"step": 6240
},
{
"epoch": 2.1263927872756656,
"grad_norm": 2.125,
"learning_rate": 6.361108282008776e-06,
"loss": 1.9522,
"step": 6250
},
{
"epoch": 2.1297950157353065,
"grad_norm": 1.8359375,
"learning_rate": 6.342614158550832e-06,
"loss": 1.9168,
"step": 6260
},
{
"epoch": 2.133197244194948,
"grad_norm": 2.3125,
"learning_rate": 6.324122922698143e-06,
"loss": 1.9871,
"step": 6270
},
{
"epoch": 2.1365994726545887,
"grad_norm": 2.28125,
"learning_rate": 6.305634714805481e-06,
"loss": 1.9398,
"step": 6280
},
{
"epoch": 2.1400017011142296,
"grad_norm": 1.921875,
"learning_rate": 6.287149675204619e-06,
"loss": 1.9629,
"step": 6290
},
{
"epoch": 2.143403929573871,
"grad_norm": 2.421875,
"learning_rate": 6.268667944203294e-06,
"loss": 1.9102,
"step": 6300
},
{
"epoch": 2.146806158033512,
"grad_norm": 2.28125,
"learning_rate": 6.2501896620841255e-06,
"loss": 1.8596,
"step": 6310
},
{
"epoch": 2.150208386493153,
"grad_norm": 2.265625,
"learning_rate": 6.231714969103553e-06,
"loss": 1.7886,
"step": 6320
},
{
"epoch": 2.153610614952794,
"grad_norm": 2.3125,
"learning_rate": 6.213244005490776e-06,
"loss": 1.9695,
"step": 6330
},
{
"epoch": 2.157012843412435,
"grad_norm": 2.09375,
"learning_rate": 6.194776911446687e-06,
"loss": 1.971,
"step": 6340
},
{
"epoch": 2.1604150718720763,
"grad_norm": 2.375,
"learning_rate": 6.176313827142807e-06,
"loss": 1.9136,
"step": 6350
},
{
"epoch": 2.163817300331717,
"grad_norm": 2.25,
"learning_rate": 6.157854892720216e-06,
"loss": 1.9184,
"step": 6360
},
{
"epoch": 2.1672195287913585,
"grad_norm": 2.09375,
"learning_rate": 6.139400248288503e-06,
"loss": 1.9933,
"step": 6370
},
{
"epoch": 2.1706217572509994,
"grad_norm": 1.8984375,
"learning_rate": 6.120950033924691e-06,
"loss": 1.9114,
"step": 6380
},
{
"epoch": 2.1740239857106403,
"grad_norm": 2.078125,
"learning_rate": 6.102504389672177e-06,
"loss": 1.9974,
"step": 6390
},
{
"epoch": 2.1774262141702816,
"grad_norm": 1.9140625,
"learning_rate": 6.084063455539671e-06,
"loss": 1.8925,
"step": 6400
},
{
"epoch": 2.1808284426299225,
"grad_norm": 2.40625,
"learning_rate": 6.065627371500128e-06,
"loss": 1.9208,
"step": 6410
},
{
"epoch": 2.184230671089564,
"grad_norm": 2.609375,
"learning_rate": 6.0471962774896946e-06,
"loss": 1.8757,
"step": 6420
},
{
"epoch": 2.1876328995492047,
"grad_norm": 1.8203125,
"learning_rate": 6.0287703134066385e-06,
"loss": 1.905,
"step": 6430
},
{
"epoch": 2.1910351280088456,
"grad_norm": 2.46875,
"learning_rate": 6.010349619110283e-06,
"loss": 1.8878,
"step": 6440
},
{
"epoch": 2.194437356468487,
"grad_norm": 2.15625,
"learning_rate": 5.991934334419968e-06,
"loss": 1.9549,
"step": 6450
},
{
"epoch": 2.197839584928128,
"grad_norm": 2.125,
"learning_rate": 5.973524599113954e-06,
"loss": 1.9137,
"step": 6460
},
{
"epoch": 2.201241813387769,
"grad_norm": 2.453125,
"learning_rate": 5.9551205529283955e-06,
"loss": 1.9856,
"step": 6470
},
{
"epoch": 2.20464404184741,
"grad_norm": 2.09375,
"learning_rate": 5.936722335556252e-06,
"loss": 1.9262,
"step": 6480
},
{
"epoch": 2.208046270307051,
"grad_norm": 1.9609375,
"learning_rate": 5.91833008664625e-06,
"loss": 1.9596,
"step": 6490
},
{
"epoch": 2.2114484987666922,
"grad_norm": 2.28125,
"learning_rate": 5.89994394580181e-06,
"loss": 1.907,
"step": 6500
},
{
"epoch": 2.214850727226333,
"grad_norm": 2.125,
"learning_rate": 5.881564052579987e-06,
"loss": 1.938,
"step": 6510
},
{
"epoch": 2.2182529556859745,
"grad_norm": 2.1875,
"learning_rate": 5.863190546490422e-06,
"loss": 1.9615,
"step": 6520
},
{
"epoch": 2.2216551841456154,
"grad_norm": 2.078125,
"learning_rate": 5.844823566994264e-06,
"loss": 1.9353,
"step": 6530
},
{
"epoch": 2.2250574126052562,
"grad_norm": 2.75,
"learning_rate": 5.826463253503132e-06,
"loss": 1.98,
"step": 6540
},
{
"epoch": 2.2284596410648976,
"grad_norm": 2.25,
"learning_rate": 5.808109745378048e-06,
"loss": 1.8649,
"step": 6550
},
{
"epoch": 2.2318618695245385,
"grad_norm": 2.265625,
"learning_rate": 5.789763181928373e-06,
"loss": 1.9079,
"step": 6560
},
{
"epoch": 2.23526409798418,
"grad_norm": 2.421875,
"learning_rate": 5.771423702410762e-06,
"loss": 1.9156,
"step": 6570
},
{
"epoch": 2.2386663264438207,
"grad_norm": 2.0,
"learning_rate": 5.753091446028094e-06,
"loss": 1.9416,
"step": 6580
},
{
"epoch": 2.2420685549034616,
"grad_norm": 2.265625,
"learning_rate": 5.734766551928427e-06,
"loss": 1.8595,
"step": 6590
},
{
"epoch": 2.245470783363103,
"grad_norm": 2.3125,
"learning_rate": 5.716449159203939e-06,
"loss": 1.9292,
"step": 6600
},
{
"epoch": 2.248873011822744,
"grad_norm": 2.15625,
"learning_rate": 5.698139406889855e-06,
"loss": 1.9578,
"step": 6610
},
{
"epoch": 2.252275240282385,
"grad_norm": 2.203125,
"learning_rate": 5.679837433963432e-06,
"loss": 1.9706,
"step": 6620
},
{
"epoch": 2.255677468742026,
"grad_norm": 2.359375,
"learning_rate": 5.661543379342855e-06,
"loss": 1.9641,
"step": 6630
},
{
"epoch": 2.259079697201667,
"grad_norm": 2.328125,
"learning_rate": 5.643257381886218e-06,
"loss": 1.9505,
"step": 6640
},
{
"epoch": 2.2624819256613082,
"grad_norm": 2.046875,
"learning_rate": 5.624979580390459e-06,
"loss": 1.9631,
"step": 6650
},
{
"epoch": 2.265884154120949,
"grad_norm": 2.375,
"learning_rate": 5.6067101135902996e-06,
"loss": 1.9767,
"step": 6660
},
{
"epoch": 2.2692863825805905,
"grad_norm": 1.8515625,
"learning_rate": 5.588449120157205e-06,
"loss": 1.9077,
"step": 6670
},
{
"epoch": 2.2726886110402313,
"grad_norm": 2.3125,
"learning_rate": 5.57019673869832e-06,
"loss": 1.9133,
"step": 6680
},
{
"epoch": 2.2760908394998722,
"grad_norm": 2.265625,
"learning_rate": 5.5519531077554244e-06,
"loss": 1.8405,
"step": 6690
},
{
"epoch": 2.2794930679595136,
"grad_norm": 2.375,
"learning_rate": 5.533718365803875e-06,
"loss": 1.8948,
"step": 6700
},
{
"epoch": 2.2828952964191545,
"grad_norm": 2.265625,
"learning_rate": 5.51549265125156e-06,
"loss": 1.9344,
"step": 6710
},
{
"epoch": 2.286297524878796,
"grad_norm": 2.015625,
"learning_rate": 5.4972761024378514e-06,
"loss": 1.842,
"step": 6720
},
{
"epoch": 2.2896997533384367,
"grad_norm": 2.28125,
"learning_rate": 5.479068857632542e-06,
"loss": 1.9172,
"step": 6730
},
{
"epoch": 2.2931019817980776,
"grad_norm": 2.171875,
"learning_rate": 5.46087105503481e-06,
"loss": 1.9252,
"step": 6740
},
{
"epoch": 2.296504210257719,
"grad_norm": 2.21875,
"learning_rate": 5.4426828327721594e-06,
"loss": 1.9356,
"step": 6750
},
{
"epoch": 2.29990643871736,
"grad_norm": 2.3125,
"learning_rate": 5.4245043288993795e-06,
"loss": 1.9462,
"step": 6760
},
{
"epoch": 2.303308667177001,
"grad_norm": 2.375,
"learning_rate": 5.406335681397498e-06,
"loss": 1.9788,
"step": 6770
},
{
"epoch": 2.306710895636642,
"grad_norm": 2.578125,
"learning_rate": 5.388177028172714e-06,
"loss": 1.9221,
"step": 6780
},
{
"epoch": 2.310113124096283,
"grad_norm": 1.9609375,
"learning_rate": 5.370028507055387e-06,
"loss": 1.9344,
"step": 6790
},
{
"epoch": 2.313515352555924,
"grad_norm": 2.140625,
"learning_rate": 5.351890255798953e-06,
"loss": 1.871,
"step": 6800
},
{
"epoch": 2.316917581015565,
"grad_norm": 1.984375,
"learning_rate": 5.333762412078907e-06,
"loss": 1.975,
"step": 6810
},
{
"epoch": 2.3203198094752064,
"grad_norm": 2.21875,
"learning_rate": 5.315645113491743e-06,
"loss": 1.9103,
"step": 6820
},
{
"epoch": 2.3237220379348473,
"grad_norm": 2.203125,
"learning_rate": 5.2975384975539145e-06,
"loss": 1.9036,
"step": 6830
},
{
"epoch": 2.327124266394488,
"grad_norm": 2.140625,
"learning_rate": 5.279442701700792e-06,
"loss": 1.9292,
"step": 6840
},
{
"epoch": 2.3305264948541295,
"grad_norm": 2.34375,
"learning_rate": 5.261357863285613e-06,
"loss": 1.9181,
"step": 6850
},
{
"epoch": 2.3339287233137704,
"grad_norm": 2.359375,
"learning_rate": 5.243284119578448e-06,
"loss": 1.8917,
"step": 6860
},
{
"epoch": 2.3373309517734118,
"grad_norm": 2.484375,
"learning_rate": 5.225221607765159e-06,
"loss": 1.9389,
"step": 6870
},
{
"epoch": 2.3407331802330527,
"grad_norm": 2.6875,
"learning_rate": 5.207170464946342e-06,
"loss": 1.9298,
"step": 6880
},
{
"epoch": 2.3441354086926935,
"grad_norm": 2.078125,
"learning_rate": 5.189130828136312e-06,
"loss": 1.9011,
"step": 6890
},
{
"epoch": 2.347537637152335,
"grad_norm": 2.40625,
"learning_rate": 5.1711028342620375e-06,
"loss": 1.908,
"step": 6900
},
{
"epoch": 2.3509398656119758,
"grad_norm": 2.65625,
"learning_rate": 5.153086620162123e-06,
"loss": 1.8829,
"step": 6910
},
{
"epoch": 2.354342094071617,
"grad_norm": 2.25,
"learning_rate": 5.135082322585758e-06,
"loss": 1.9441,
"step": 6920
},
{
"epoch": 2.357744322531258,
"grad_norm": 2.4375,
"learning_rate": 5.117090078191676e-06,
"loss": 1.9403,
"step": 6930
},
{
"epoch": 2.361146550990899,
"grad_norm": 2.46875,
"learning_rate": 5.09911002354713e-06,
"loss": 1.9478,
"step": 6940
},
{
"epoch": 2.36454877945054,
"grad_norm": 2.0625,
"learning_rate": 5.081142295126842e-06,
"loss": 1.8916,
"step": 6950
},
{
"epoch": 2.367951007910181,
"grad_norm": 2.4375,
"learning_rate": 5.063187029311983e-06,
"loss": 1.9323,
"step": 6960
},
{
"epoch": 2.3713532363698224,
"grad_norm": 1.9375,
"learning_rate": 5.045244362389115e-06,
"loss": 1.9571,
"step": 6970
},
{
"epoch": 2.3747554648294633,
"grad_norm": 1.8359375,
"learning_rate": 5.027314430549185e-06,
"loss": 1.9486,
"step": 6980
},
{
"epoch": 2.378157693289104,
"grad_norm": 2.4375,
"learning_rate": 5.009397369886466e-06,
"loss": 1.944,
"step": 6990
},
{
"epoch": 2.3815599217487455,
"grad_norm": 2.390625,
"learning_rate": 4.991493316397536e-06,
"loss": 1.9539,
"step": 7000
},
{
"epoch": 2.3849621502083864,
"grad_norm": 2.21875,
"learning_rate": 4.973602405980251e-06,
"loss": 1.8877,
"step": 7010
},
{
"epoch": 2.3883643786680278,
"grad_norm": 2.1875,
"learning_rate": 4.955724774432697e-06,
"loss": 1.9579,
"step": 7020
},
{
"epoch": 2.3917666071276686,
"grad_norm": 2.4375,
"learning_rate": 4.937860557452174e-06,
"loss": 1.9066,
"step": 7030
},
{
"epoch": 2.3951688355873095,
"grad_norm": 2.328125,
"learning_rate": 4.920009890634164e-06,
"loss": 1.9488,
"step": 7040
},
{
"epoch": 2.398571064046951,
"grad_norm": 2.203125,
"learning_rate": 4.902172909471289e-06,
"loss": 1.9939,
"step": 7050
},
{
"epoch": 2.4019732925065918,
"grad_norm": 2.390625,
"learning_rate": 4.884349749352304e-06,
"loss": 1.9718,
"step": 7060
},
{
"epoch": 2.405375520966233,
"grad_norm": 2.53125,
"learning_rate": 4.866540545561045e-06,
"loss": 1.9198,
"step": 7070
},
{
"epoch": 2.408777749425874,
"grad_norm": 2.421875,
"learning_rate": 4.848745433275427e-06,
"loss": 1.8993,
"step": 7080
},
{
"epoch": 2.412179977885515,
"grad_norm": 2.65625,
"learning_rate": 4.830964547566399e-06,
"loss": 1.9977,
"step": 7090
},
{
"epoch": 2.415582206345156,
"grad_norm": 2.265625,
"learning_rate": 4.813198023396925e-06,
"loss": 1.911,
"step": 7100
},
{
"epoch": 2.418984434804797,
"grad_norm": 2.25,
"learning_rate": 4.795445995620965e-06,
"loss": 1.977,
"step": 7110
},
{
"epoch": 2.4223866632644384,
"grad_norm": 2.203125,
"learning_rate": 4.777708598982436e-06,
"loss": 1.9065,
"step": 7120
},
{
"epoch": 2.4257888917240793,
"grad_norm": 2.28125,
"learning_rate": 4.759985968114213e-06,
"loss": 1.9569,
"step": 7130
},
{
"epoch": 2.42919112018372,
"grad_norm": 2.59375,
"learning_rate": 4.742278237537088e-06,
"loss": 1.9151,
"step": 7140
},
{
"epoch": 2.4325933486433615,
"grad_norm": 1.90625,
"learning_rate": 4.72458554165875e-06,
"loss": 1.984,
"step": 7150
},
{
"epoch": 2.4359955771030024,
"grad_norm": 1.9453125,
"learning_rate": 4.706908014772776e-06,
"loss": 1.9921,
"step": 7160
},
{
"epoch": 2.4393978055626437,
"grad_norm": 2.515625,
"learning_rate": 4.689245791057602e-06,
"loss": 1.9753,
"step": 7170
},
{
"epoch": 2.4428000340222846,
"grad_norm": 1.9765625,
"learning_rate": 4.671599004575511e-06,
"loss": 1.9305,
"step": 7180
},
{
"epoch": 2.4462022624819255,
"grad_norm": 2.34375,
"learning_rate": 4.653967789271607e-06,
"loss": 1.8709,
"step": 7190
},
{
"epoch": 2.449604490941567,
"grad_norm": 2.359375,
"learning_rate": 4.636352278972806e-06,
"loss": 1.9123,
"step": 7200
},
{
"epoch": 2.4530067194012077,
"grad_norm": 2.046875,
"learning_rate": 4.618752607386824e-06,
"loss": 1.8976,
"step": 7210
},
{
"epoch": 2.456408947860849,
"grad_norm": 2.375,
"learning_rate": 4.601168908101142e-06,
"loss": 2.0117,
"step": 7220
},
{
"epoch": 2.45981117632049,
"grad_norm": 2.25,
"learning_rate": 4.5836013145820175e-06,
"loss": 1.8844,
"step": 7230
},
{
"epoch": 2.463213404780131,
"grad_norm": 2.40625,
"learning_rate": 4.5660499601734545e-06,
"loss": 1.9541,
"step": 7240
},
{
"epoch": 2.466615633239772,
"grad_norm": 2.375,
"learning_rate": 4.548514978096198e-06,
"loss": 1.9029,
"step": 7250
},
{
"epoch": 2.470017861699413,
"grad_norm": 2.34375,
"learning_rate": 4.5309965014467246e-06,
"loss": 1.9122,
"step": 7260
},
{
"epoch": 2.4734200901590544,
"grad_norm": 2.125,
"learning_rate": 4.513494663196221e-06,
"loss": 1.8935,
"step": 7270
},
{
"epoch": 2.4768223186186953,
"grad_norm": 2.546875,
"learning_rate": 4.496009596189593e-06,
"loss": 1.9198,
"step": 7280
},
{
"epoch": 2.480224547078336,
"grad_norm": 2.71875,
"learning_rate": 4.478541433144435e-06,
"loss": 1.8702,
"step": 7290
},
{
"epoch": 2.4836267755379775,
"grad_norm": 2.171875,
"learning_rate": 4.461090306650046e-06,
"loss": 1.9336,
"step": 7300
},
{
"epoch": 2.4870290039976184,
"grad_norm": 2.40625,
"learning_rate": 4.443656349166409e-06,
"loss": 1.9156,
"step": 7310
},
{
"epoch": 2.4904312324572597,
"grad_norm": 2.078125,
"learning_rate": 4.426239693023181e-06,
"loss": 1.949,
"step": 7320
},
{
"epoch": 2.4938334609169006,
"grad_norm": 2.34375,
"learning_rate": 4.408840470418706e-06,
"loss": 1.9331,
"step": 7330
},
{
"epoch": 2.4972356893765415,
"grad_norm": 2.046875,
"learning_rate": 4.391458813418992e-06,
"loss": 1.9376,
"step": 7340
},
{
"epoch": 2.500637917836183,
"grad_norm": 2.171875,
"learning_rate": 4.374094853956726e-06,
"loss": 1.8894,
"step": 7350
},
{
"epoch": 2.5040401462958237,
"grad_norm": 2.40625,
"learning_rate": 4.3567487238302625e-06,
"loss": 2.0008,
"step": 7360
},
{
"epoch": 2.507442374755465,
"grad_norm": 2.5,
"learning_rate": 4.3394205547026224e-06,
"loss": 1.8901,
"step": 7370
},
{
"epoch": 2.510844603215106,
"grad_norm": 2.25,
"learning_rate": 4.322110478100502e-06,
"loss": 1.9533,
"step": 7380
},
{
"epoch": 2.514246831674747,
"grad_norm": 2.171875,
"learning_rate": 4.3048186254132606e-06,
"loss": 1.9216,
"step": 7390
},
{
"epoch": 2.517649060134388,
"grad_norm": 2.453125,
"learning_rate": 4.287545127891939e-06,
"loss": 1.9397,
"step": 7400
},
{
"epoch": 2.521051288594029,
"grad_norm": 2.1875,
"learning_rate": 4.270290116648254e-06,
"loss": 1.9161,
"step": 7410
},
{
"epoch": 2.5244535170536704,
"grad_norm": 2.484375,
"learning_rate": 4.2530537226536075e-06,
"loss": 1.8427,
"step": 7420
},
{
"epoch": 2.5278557455133113,
"grad_norm": 2.84375,
"learning_rate": 4.235836076738085e-06,
"loss": 1.917,
"step": 7430
},
{
"epoch": 2.531257973972952,
"grad_norm": 2.453125,
"learning_rate": 4.218637309589471e-06,
"loss": 1.8681,
"step": 7440
},
{
"epoch": 2.5346602024325935,
"grad_norm": 2.171875,
"learning_rate": 4.201457551752256e-06,
"loss": 1.9049,
"step": 7450
},
{
"epoch": 2.5380624308922344,
"grad_norm": 2.1875,
"learning_rate": 4.184296933626636e-06,
"loss": 1.9001,
"step": 7460
},
{
"epoch": 2.5414646593518757,
"grad_norm": 2.46875,
"learning_rate": 4.167155585467538e-06,
"loss": 1.895,
"step": 7470
},
{
"epoch": 2.5448668878115166,
"grad_norm": 1.890625,
"learning_rate": 4.150033637383623e-06,
"loss": 1.9132,
"step": 7480
},
{
"epoch": 2.5482691162711575,
"grad_norm": 2.296875,
"learning_rate": 4.132931219336289e-06,
"loss": 1.9031,
"step": 7490
},
{
"epoch": 2.551671344730799,
"grad_norm": 2.15625,
"learning_rate": 4.115848461138707e-06,
"loss": 1.8727,
"step": 7500
},
{
"epoch": 2.5550735731904397,
"grad_norm": 2.5,
"learning_rate": 4.0987854924548134e-06,
"loss": 1.8808,
"step": 7510
},
{
"epoch": 2.558475801650081,
"grad_norm": 2.5,
"learning_rate": 4.081742442798342e-06,
"loss": 1.9265,
"step": 7520
},
{
"epoch": 2.561878030109722,
"grad_norm": 2.390625,
"learning_rate": 4.064719441531834e-06,
"loss": 1.9463,
"step": 7530
},
{
"epoch": 2.565280258569363,
"grad_norm": 2.6875,
"learning_rate": 4.04771661786565e-06,
"loss": 1.9341,
"step": 7540
},
{
"epoch": 2.568682487029004,
"grad_norm": 1.9296875,
"learning_rate": 4.030734100857004e-06,
"loss": 1.9036,
"step": 7550
},
{
"epoch": 2.572084715488645,
"grad_norm": 2.21875,
"learning_rate": 4.013772019408969e-06,
"loss": 1.9604,
"step": 7560
},
{
"epoch": 2.5754869439482864,
"grad_norm": 2.171875,
"learning_rate": 3.9968305022695076e-06,
"loss": 1.8938,
"step": 7570
},
{
"epoch": 2.5788891724079273,
"grad_norm": 2.0625,
"learning_rate": 3.979909678030498e-06,
"loss": 1.976,
"step": 7580
},
{
"epoch": 2.582291400867568,
"grad_norm": 2.609375,
"learning_rate": 3.9630096751267395e-06,
"loss": 1.9534,
"step": 7590
},
{
"epoch": 2.5856936293272095,
"grad_norm": 2.1875,
"learning_rate": 3.946130621835003e-06,
"loss": 1.9374,
"step": 7600
},
{
"epoch": 2.5890958577868504,
"grad_norm": 2.359375,
"learning_rate": 3.929272646273037e-06,
"loss": 1.9044,
"step": 7610
},
{
"epoch": 2.5924980862464917,
"grad_norm": 2.265625,
"learning_rate": 3.9124358763986045e-06,
"loss": 1.9723,
"step": 7620
},
{
"epoch": 2.5959003147061326,
"grad_norm": 2.578125,
"learning_rate": 3.895620440008517e-06,
"loss": 1.8593,
"step": 7630
},
{
"epoch": 2.5993025431657735,
"grad_norm": 2.5,
"learning_rate": 3.878826464737643e-06,
"loss": 1.9203,
"step": 7640
},
{
"epoch": 2.602704771625415,
"grad_norm": 2.5625,
"learning_rate": 3.862054078057968e-06,
"loss": 1.9127,
"step": 7650
},
{
"epoch": 2.6061070000850557,
"grad_norm": 2.421875,
"learning_rate": 3.845303407277605e-06,
"loss": 1.8969,
"step": 7660
},
{
"epoch": 2.609509228544697,
"grad_norm": 2.078125,
"learning_rate": 3.828574579539842e-06,
"loss": 1.957,
"step": 7670
},
{
"epoch": 2.612911457004338,
"grad_norm": 2.046875,
"learning_rate": 3.811867721822161e-06,
"loss": 1.9497,
"step": 7680
},
{
"epoch": 2.616313685463979,
"grad_norm": 2.484375,
"learning_rate": 3.7951829609352926e-06,
"loss": 1.9144,
"step": 7690
},
{
"epoch": 2.61971591392362,
"grad_norm": 2.640625,
"learning_rate": 3.778520423522247e-06,
"loss": 1.9252,
"step": 7700
},
{
"epoch": 2.623118142383261,
"grad_norm": 2.390625,
"learning_rate": 3.7618802360573384e-06,
"loss": 1.9192,
"step": 7710
},
{
"epoch": 2.6265203708429024,
"grad_norm": 2.0,
"learning_rate": 3.7452625248452478e-06,
"loss": 1.887,
"step": 7720
},
{
"epoch": 2.6299225993025432,
"grad_norm": 2.390625,
"learning_rate": 3.728667416020052e-06,
"loss": 1.9326,
"step": 7730
},
{
"epoch": 2.633324827762184,
"grad_norm": 2.484375,
"learning_rate": 3.7120950355442677e-06,
"loss": 1.9739,
"step": 7740
},
{
"epoch": 2.6367270562218255,
"grad_norm": 2.1875,
"learning_rate": 3.6955455092078956e-06,
"loss": 1.9417,
"step": 7750
},
{
"epoch": 2.6401292846814663,
"grad_norm": 2.078125,
"learning_rate": 3.679018962627461e-06,
"loss": 1.9288,
"step": 7760
},
{
"epoch": 2.6435315131411077,
"grad_norm": 2.0625,
"learning_rate": 3.6625155212450754e-06,
"loss": 1.9062,
"step": 7770
},
{
"epoch": 2.6469337416007486,
"grad_norm": 2.625,
"learning_rate": 3.6460353103274615e-06,
"loss": 1.9304,
"step": 7780
},
{
"epoch": 2.6503359700603895,
"grad_norm": 2.109375,
"learning_rate": 3.6295784549650233e-06,
"loss": 1.9378,
"step": 7790
},
{
"epoch": 2.6537381985200303,
"grad_norm": 2.234375,
"learning_rate": 3.613145080070886e-06,
"loss": 1.9244,
"step": 7800
},
{
"epoch": 2.6571404269796717,
"grad_norm": 2.328125,
"learning_rate": 3.59673531037995e-06,
"loss": 1.8997,
"step": 7810
},
{
"epoch": 2.660542655439313,
"grad_norm": 2.203125,
"learning_rate": 3.5803492704479488e-06,
"loss": 1.9715,
"step": 7820
},
{
"epoch": 2.663944883898954,
"grad_norm": 2.0625,
"learning_rate": 3.5639870846504873e-06,
"loss": 1.917,
"step": 7830
},
{
"epoch": 2.667347112358595,
"grad_norm": 2.4375,
"learning_rate": 3.54764887718212e-06,
"loss": 1.9122,
"step": 7840
},
{
"epoch": 2.6707493408182357,
"grad_norm": 2.265625,
"learning_rate": 3.5313347720553963e-06,
"loss": 1.9234,
"step": 7850
},
{
"epoch": 2.674151569277877,
"grad_norm": 2.359375,
"learning_rate": 3.5150448930999113e-06,
"loss": 1.9519,
"step": 7860
},
{
"epoch": 2.6775537977375183,
"grad_norm": 2.25,
"learning_rate": 3.4987793639613926e-06,
"loss": 1.9065,
"step": 7870
},
{
"epoch": 2.6809560261971592,
"grad_norm": 2.171875,
"learning_rate": 3.482538308100727e-06,
"loss": 1.8604,
"step": 7880
},
{
"epoch": 2.6843582546568,
"grad_norm": 2.328125,
"learning_rate": 3.4663218487930547e-06,
"loss": 1.8554,
"step": 7890
},
{
"epoch": 2.687760483116441,
"grad_norm": 2.4375,
"learning_rate": 3.4501301091268043e-06,
"loss": 1.936,
"step": 7900
},
{
"epoch": 2.6911627115760823,
"grad_norm": 2.328125,
"learning_rate": 3.433963212002789e-06,
"loss": 1.8966,
"step": 7910
},
{
"epoch": 2.6945649400357237,
"grad_norm": 2.15625,
"learning_rate": 3.41782128013325e-06,
"loss": 1.9634,
"step": 7920
},
{
"epoch": 2.6979671684953646,
"grad_norm": 2.546875,
"learning_rate": 3.4017044360409375e-06,
"loss": 1.922,
"step": 7930
},
{
"epoch": 2.7013693969550054,
"grad_norm": 2.4375,
"learning_rate": 3.3856128020581783e-06,
"loss": 1.9411,
"step": 7940
},
{
"epoch": 2.7047716254146463,
"grad_norm": 2.265625,
"learning_rate": 3.3695465003259376e-06,
"loss": 1.8679,
"step": 7950
},
{
"epoch": 2.7081738538742877,
"grad_norm": 1.953125,
"learning_rate": 3.353505652792909e-06,
"loss": 1.906,
"step": 7960
},
{
"epoch": 2.711576082333929,
"grad_norm": 2.421875,
"learning_rate": 3.3374903812145784e-06,
"loss": 1.8951,
"step": 7970
},
{
"epoch": 2.71497831079357,
"grad_norm": 2.546875,
"learning_rate": 3.3215008071522965e-06,
"loss": 1.9556,
"step": 7980
},
{
"epoch": 2.7183805392532108,
"grad_norm": 2.21875,
"learning_rate": 3.3055370519723652e-06,
"loss": 1.9427,
"step": 7990
},
{
"epoch": 2.7217827677128517,
"grad_norm": 2.71875,
"learning_rate": 3.289599236845113e-06,
"loss": 1.9533,
"step": 8000
},
{
"epoch": 2.725184996172493,
"grad_norm": 2.609375,
"learning_rate": 3.273687482743974e-06,
"loss": 1.9608,
"step": 8010
},
{
"epoch": 2.7285872246321343,
"grad_norm": 1.9609375,
"learning_rate": 3.2578019104445702e-06,
"loss": 1.9894,
"step": 8020
},
{
"epoch": 2.731989453091775,
"grad_norm": 2.46875,
"learning_rate": 3.241942640523791e-06,
"loss": 1.864,
"step": 8030
},
{
"epoch": 2.735391681551416,
"grad_norm": 2.40625,
"learning_rate": 3.2261097933588893e-06,
"loss": 1.9567,
"step": 8040
},
{
"epoch": 2.738793910011057,
"grad_norm": 2.65625,
"learning_rate": 3.210303489126551e-06,
"loss": 1.9093,
"step": 8050
},
{
"epoch": 2.7421961384706983,
"grad_norm": 2.4375,
"learning_rate": 3.1945238478020003e-06,
"loss": 1.9673,
"step": 8060
},
{
"epoch": 2.745598366930339,
"grad_norm": 2.265625,
"learning_rate": 3.1787709891580763e-06,
"loss": 1.9712,
"step": 8070
},
{
"epoch": 2.7490005953899805,
"grad_norm": 2.265625,
"learning_rate": 3.1630450327643315e-06,
"loss": 1.9127,
"step": 8080
},
{
"epoch": 2.7524028238496214,
"grad_norm": 2.234375,
"learning_rate": 3.147346097986121e-06,
"loss": 1.9763,
"step": 8090
},
{
"epoch": 2.7558050523092623,
"grad_norm": 1.9453125,
"learning_rate": 3.1316743039836908e-06,
"loss": 1.8313,
"step": 8100
},
{
"epoch": 2.7592072807689036,
"grad_norm": 2.0625,
"learning_rate": 3.1160297697112855e-06,
"loss": 1.9062,
"step": 8110
},
{
"epoch": 2.7626095092285445,
"grad_norm": 2.25,
"learning_rate": 3.10041261391624e-06,
"loss": 1.9072,
"step": 8120
},
{
"epoch": 2.766011737688186,
"grad_norm": 2.546875,
"learning_rate": 3.0848229551380702e-06,
"loss": 1.932,
"step": 8130
},
{
"epoch": 2.7694139661478268,
"grad_norm": 2.375,
"learning_rate": 3.069260911707586e-06,
"loss": 1.9311,
"step": 8140
},
{
"epoch": 2.7728161946074676,
"grad_norm": 2.6875,
"learning_rate": 3.0537266017459856e-06,
"loss": 1.9067,
"step": 8150
},
{
"epoch": 2.776218423067109,
"grad_norm": 2.203125,
"learning_rate": 3.0382201431639656e-06,
"loss": 1.978,
"step": 8160
},
{
"epoch": 2.77962065152675,
"grad_norm": 2.375,
"learning_rate": 3.0227416536608095e-06,
"loss": 1.9084,
"step": 8170
},
{
"epoch": 2.783022879986391,
"grad_norm": 2.203125,
"learning_rate": 3.0072912507235167e-06,
"loss": 1.8865,
"step": 8180
},
{
"epoch": 2.786425108446032,
"grad_norm": 2.015625,
"learning_rate": 2.991869051625898e-06,
"loss": 1.9293,
"step": 8190
},
{
"epoch": 2.789827336905673,
"grad_norm": 2.59375,
"learning_rate": 2.9764751734276803e-06,
"loss": 1.9127,
"step": 8200
},
{
"epoch": 2.7932295653653143,
"grad_norm": 2.453125,
"learning_rate": 2.9611097329736394e-06,
"loss": 1.9198,
"step": 8210
},
{
"epoch": 2.796631793824955,
"grad_norm": 2.3125,
"learning_rate": 2.9457728468926836e-06,
"loss": 1.9261,
"step": 8220
},
{
"epoch": 2.8000340222845965,
"grad_norm": 2.59375,
"learning_rate": 2.930464631596993e-06,
"loss": 1.9068,
"step": 8230
},
{
"epoch": 2.8034362507442374,
"grad_norm": 2.40625,
"learning_rate": 2.915185203281126e-06,
"loss": 1.947,
"step": 8240
},
{
"epoch": 2.8068384792038783,
"grad_norm": 2.34375,
"learning_rate": 2.899934677921133e-06,
"loss": 1.9014,
"step": 8250
},
{
"epoch": 2.8102407076635196,
"grad_norm": 2.25,
"learning_rate": 2.884713171273686e-06,
"loss": 1.9012,
"step": 8260
},
{
"epoch": 2.8136429361231605,
"grad_norm": 2.3125,
"learning_rate": 2.869520798875194e-06,
"loss": 1.9299,
"step": 8270
},
{
"epoch": 2.817045164582802,
"grad_norm": 2.046875,
"learning_rate": 2.8543576760409264e-06,
"loss": 1.9472,
"step": 8280
},
{
"epoch": 2.8204473930424427,
"grad_norm": 2.140625,
"learning_rate": 2.839223917864142e-06,
"loss": 1.9323,
"step": 8290
},
{
"epoch": 2.8238496215020836,
"grad_norm": 2.203125,
"learning_rate": 2.824119639215203e-06,
"loss": 1.9394,
"step": 8300
},
{
"epoch": 2.827251849961725,
"grad_norm": 2.515625,
"learning_rate": 2.809044954740723e-06,
"loss": 1.9369,
"step": 8310
},
{
"epoch": 2.830654078421366,
"grad_norm": 2.46875,
"learning_rate": 2.7939999788626755e-06,
"loss": 1.9025,
"step": 8320
},
{
"epoch": 2.834056306881007,
"grad_norm": 2.390625,
"learning_rate": 2.778984825777543e-06,
"loss": 1.908,
"step": 8330
},
{
"epoch": 2.837458535340648,
"grad_norm": 2.5,
"learning_rate": 2.763999609455441e-06,
"loss": 1.9814,
"step": 8340
},
{
"epoch": 2.840860763800289,
"grad_norm": 2.421875,
"learning_rate": 2.7490444436392535e-06,
"loss": 1.9804,
"step": 8350
},
{
"epoch": 2.8442629922599303,
"grad_norm": 2.359375,
"learning_rate": 2.7341194418437747e-06,
"loss": 1.9187,
"step": 8360
},
{
"epoch": 2.847665220719571,
"grad_norm": 2.25,
"learning_rate": 2.7192247173548356e-06,
"loss": 1.8885,
"step": 8370
},
{
"epoch": 2.8510674491792125,
"grad_norm": 2.515625,
"learning_rate": 2.7043603832284616e-06,
"loss": 1.9056,
"step": 8380
},
{
"epoch": 2.8544696776388534,
"grad_norm": 2.5625,
"learning_rate": 2.689526552289997e-06,
"loss": 1.9068,
"step": 8390
},
{
"epoch": 2.8578719060984943,
"grad_norm": 1.9375,
"learning_rate": 2.6747233371332606e-06,
"loss": 2.0559,
"step": 8400
},
{
"epoch": 2.8612741345581356,
"grad_norm": 2.140625,
"learning_rate": 2.6599508501196876e-06,
"loss": 1.9102,
"step": 8410
},
{
"epoch": 2.8646763630177765,
"grad_norm": 2.3125,
"learning_rate": 2.6452092033774744e-06,
"loss": 1.878,
"step": 8420
},
{
"epoch": 2.868078591477418,
"grad_norm": 2.21875,
"learning_rate": 2.630498508800734e-06,
"loss": 1.9412,
"step": 8430
},
{
"epoch": 2.8714808199370587,
"grad_norm": 2.59375,
"learning_rate": 2.6158188780486312e-06,
"loss": 1.8957,
"step": 8440
},
{
"epoch": 2.8748830483966996,
"grad_norm": 2.65625,
"learning_rate": 2.6011704225445548e-06,
"loss": 1.8656,
"step": 8450
},
{
"epoch": 2.878285276856341,
"grad_norm": 2.5,
"learning_rate": 2.586553253475264e-06,
"loss": 1.9598,
"step": 8460
},
{
"epoch": 2.881687505315982,
"grad_norm": 2.25,
"learning_rate": 2.5719674817900346e-06,
"loss": 1.957,
"step": 8470
},
{
"epoch": 2.885089733775623,
"grad_norm": 2.296875,
"learning_rate": 2.5574132181998334e-06,
"loss": 1.9725,
"step": 8480
},
{
"epoch": 2.888491962235264,
"grad_norm": 1.9765625,
"learning_rate": 2.5428905731764664e-06,
"loss": 1.9228,
"step": 8490
},
{
"epoch": 2.891894190694905,
"grad_norm": 2.40625,
"learning_rate": 2.5283996569517464e-06,
"loss": 1.938,
"step": 8500
},
{
"epoch": 2.8952964191545463,
"grad_norm": 2.21875,
"learning_rate": 2.5139405795166538e-06,
"loss": 1.9243,
"step": 8510
},
{
"epoch": 2.898698647614187,
"grad_norm": 2.3125,
"learning_rate": 2.4995134506204964e-06,
"loss": 1.9328,
"step": 8520
},
{
"epoch": 2.9021008760738285,
"grad_norm": 2.15625,
"learning_rate": 2.48511837977009e-06,
"loss": 1.9199,
"step": 8530
},
{
"epoch": 2.9055031045334694,
"grad_norm": 2.625,
"learning_rate": 2.4707554762289077e-06,
"loss": 1.9613,
"step": 8540
},
{
"epoch": 2.9089053329931103,
"grad_norm": 2.046875,
"learning_rate": 2.4564248490162763e-06,
"loss": 1.9547,
"step": 8550
},
{
"epoch": 2.9123075614527516,
"grad_norm": 2.328125,
"learning_rate": 2.442126606906526e-06,
"loss": 2.0251,
"step": 8560
},
{
"epoch": 2.9157097899123925,
"grad_norm": 2.40625,
"learning_rate": 2.4278608584281694e-06,
"loss": 1.9231,
"step": 8570
},
{
"epoch": 2.919112018372034,
"grad_norm": 2.625,
"learning_rate": 2.413627711863091e-06,
"loss": 1.9295,
"step": 8580
},
{
"epoch": 2.9225142468316747,
"grad_norm": 2.5,
"learning_rate": 2.399427275245705e-06,
"loss": 1.9444,
"step": 8590
},
{
"epoch": 2.9259164752913156,
"grad_norm": 2.328125,
"learning_rate": 2.3852596563621536e-06,
"loss": 1.9794,
"step": 8600
},
{
"epoch": 2.929318703750957,
"grad_norm": 2.1875,
"learning_rate": 2.3711249627494803e-06,
"loss": 1.9096,
"step": 8610
},
{
"epoch": 2.932720932210598,
"grad_norm": 2.578125,
"learning_rate": 2.3570233016948133e-06,
"loss": 1.9062,
"step": 8620
},
{
"epoch": 2.936123160670239,
"grad_norm": 2.34375,
"learning_rate": 2.3429547802345537e-06,
"loss": 1.8779,
"step": 8630
},
{
"epoch": 2.93952538912988,
"grad_norm": 2.265625,
"learning_rate": 2.3289195051535584e-06,
"loss": 1.8901,
"step": 8640
},
{
"epoch": 2.942927617589521,
"grad_norm": 2.203125,
"learning_rate": 2.3149175829843367e-06,
"loss": 1.9073,
"step": 8650
},
{
"epoch": 2.9463298460491623,
"grad_norm": 2.46875,
"learning_rate": 2.3009491200062343e-06,
"loss": 1.9434,
"step": 8660
},
{
"epoch": 2.949732074508803,
"grad_norm": 2.1875,
"learning_rate": 2.287014222244634e-06,
"loss": 1.88,
"step": 8670
},
{
"epoch": 2.9531343029684445,
"grad_norm": 2.109375,
"learning_rate": 2.273112995470147e-06,
"loss": 1.968,
"step": 8680
},
{
"epoch": 2.9565365314280854,
"grad_norm": 2.03125,
"learning_rate": 2.259245545197807e-06,
"loss": 1.9048,
"step": 8690
},
{
"epoch": 2.9599387598877263,
"grad_norm": 2.46875,
"learning_rate": 2.245411976686278e-06,
"loss": 1.9502,
"step": 8700
},
{
"epoch": 2.9633409883473676,
"grad_norm": 2.546875,
"learning_rate": 2.231612394937042e-06,
"loss": 1.87,
"step": 8710
},
{
"epoch": 2.9667432168070085,
"grad_norm": 2.234375,
"learning_rate": 2.217846904693616e-06,
"loss": 1.9337,
"step": 8720
},
{
"epoch": 2.97014544526665,
"grad_norm": 2.609375,
"learning_rate": 2.2041156104407518e-06,
"loss": 1.9095,
"step": 8730
},
{
"epoch": 2.9735476737262907,
"grad_norm": 2.4375,
"learning_rate": 2.1904186164036358e-06,
"loss": 1.9346,
"step": 8740
},
{
"epoch": 2.9769499021859316,
"grad_norm": 2.09375,
"learning_rate": 2.1767560265471087e-06,
"loss": 1.9296,
"step": 8750
},
{
"epoch": 2.980352130645573,
"grad_norm": 2.484375,
"learning_rate": 2.163127944574872e-06,
"loss": 1.9386,
"step": 8760
},
{
"epoch": 2.983754359105214,
"grad_norm": 2.40625,
"learning_rate": 2.149534473928699e-06,
"loss": 1.9189,
"step": 8770
},
{
"epoch": 2.987156587564855,
"grad_norm": 2.46875,
"learning_rate": 2.135975717787654e-06,
"loss": 1.8996,
"step": 8780
},
{
"epoch": 2.990558816024496,
"grad_norm": 2.1875,
"learning_rate": 2.1224517790673003e-06,
"loss": 1.937,
"step": 8790
},
{
"epoch": 2.993961044484137,
"grad_norm": 2.234375,
"learning_rate": 2.108962760418933e-06,
"loss": 1.9724,
"step": 8800
},
{
"epoch": 2.9973632729437782,
"grad_norm": 2.5,
"learning_rate": 2.0955087642287833e-06,
"loss": 1.9497,
"step": 8810
},
{
"epoch": 3.000765501403419,
"grad_norm": 2.5,
"learning_rate": 2.0820898926172546e-06,
"loss": 1.9683,
"step": 8820
},
{
"epoch": 3.0041677298630605,
"grad_norm": 2.375,
"learning_rate": 2.0687062474381516e-06,
"loss": 1.9146,
"step": 8830
},
{
"epoch": 3.0075699583227014,
"grad_norm": 2.515625,
"learning_rate": 2.05535793027788e-06,
"loss": 1.9749,
"step": 8840
},
{
"epoch": 3.0109721867823422,
"grad_norm": 2.46875,
"learning_rate": 2.042045042454711e-06,
"loss": 1.9554,
"step": 8850
},
{
"epoch": 3.0143744152419836,
"grad_norm": 2.53125,
"learning_rate": 2.028767685017981e-06,
"loss": 1.8963,
"step": 8860
},
{
"epoch": 3.0177766437016245,
"grad_norm": 2.671875,
"learning_rate": 2.015525958747352e-06,
"loss": 1.938,
"step": 8870
},
{
"epoch": 3.021178872161266,
"grad_norm": 2.625,
"learning_rate": 2.0023199641520177e-06,
"loss": 1.9223,
"step": 8880
},
{
"epoch": 3.0245811006209067,
"grad_norm": 2.625,
"learning_rate": 1.989149801469974e-06,
"loss": 1.8825,
"step": 8890
},
{
"epoch": 3.0279833290805476,
"grad_norm": 2.703125,
"learning_rate": 1.97601557066723e-06,
"loss": 1.9489,
"step": 8900
},
{
"epoch": 3.031385557540189,
"grad_norm": 2.109375,
"learning_rate": 1.9629173714370583e-06,
"loss": 1.9236,
"step": 8910
},
{
"epoch": 3.03478778599983,
"grad_norm": 2.078125,
"learning_rate": 1.949855303199246e-06,
"loss": 1.9561,
"step": 8920
},
{
"epoch": 3.038190014459471,
"grad_norm": 2.484375,
"learning_rate": 1.9368294650993263e-06,
"loss": 1.8969,
"step": 8930
},
{
"epoch": 3.041592242919112,
"grad_norm": 2.125,
"learning_rate": 1.92383995600784e-06,
"loss": 1.9331,
"step": 8940
},
{
"epoch": 3.044994471378753,
"grad_norm": 2.40625,
"learning_rate": 1.910886874519575e-06,
"loss": 1.9734,
"step": 8950
},
{
"epoch": 3.0483966998383942,
"grad_norm": 2.0625,
"learning_rate": 1.8979703189528225e-06,
"loss": 1.918,
"step": 8960
},
{
"epoch": 3.051798928298035,
"grad_norm": 2.40625,
"learning_rate": 1.885090387348631e-06,
"loss": 1.9162,
"step": 8970
},
{
"epoch": 3.0552011567576765,
"grad_norm": 2.421875,
"learning_rate": 1.8722471774700541e-06,
"loss": 1.9047,
"step": 8980
},
{
"epoch": 3.0586033852173173,
"grad_norm": 2.40625,
"learning_rate": 1.8594407868014222e-06,
"loss": 1.9391,
"step": 8990
},
{
"epoch": 3.0620056136769582,
"grad_norm": 2.53125,
"learning_rate": 1.8466713125475953e-06,
"loss": 1.9597,
"step": 9000
},
{
"epoch": 3.0654078421365996,
"grad_norm": 2.125,
"learning_rate": 1.8339388516332183e-06,
"loss": 1.9123,
"step": 9010
},
{
"epoch": 3.0688100705962404,
"grad_norm": 2.265625,
"learning_rate": 1.8212435007019987e-06,
"loss": 1.9063,
"step": 9020
},
{
"epoch": 3.072212299055882,
"grad_norm": 2.0625,
"learning_rate": 1.8085853561159651e-06,
"loss": 1.8604,
"step": 9030
},
{
"epoch": 3.0756145275155227,
"grad_norm": 2.203125,
"learning_rate": 1.7959645139547367e-06,
"loss": 1.9165,
"step": 9040
},
{
"epoch": 3.0790167559751636,
"grad_norm": 2.8125,
"learning_rate": 1.7833810700147973e-06,
"loss": 1.9096,
"step": 9050
},
{
"epoch": 3.082418984434805,
"grad_norm": 2.203125,
"learning_rate": 1.770835119808758e-06,
"loss": 1.9433,
"step": 9060
},
{
"epoch": 3.0858212128944458,
"grad_norm": 2.46875,
"learning_rate": 1.7583267585646496e-06,
"loss": 1.972,
"step": 9070
},
{
"epoch": 3.089223441354087,
"grad_norm": 2.40625,
"learning_rate": 1.7458560812251807e-06,
"loss": 1.9191,
"step": 9080
},
{
"epoch": 3.092625669813728,
"grad_norm": 2.046875,
"learning_rate": 1.7334231824470327e-06,
"loss": 1.882,
"step": 9090
},
{
"epoch": 3.096027898273369,
"grad_norm": 2.40625,
"learning_rate": 1.7210281566001321e-06,
"loss": 1.9086,
"step": 9100
},
{
"epoch": 3.09943012673301,
"grad_norm": 2.09375,
"learning_rate": 1.7086710977669391e-06,
"loss": 1.9225,
"step": 9110
},
{
"epoch": 3.102832355192651,
"grad_norm": 2.515625,
"learning_rate": 1.6963520997417304e-06,
"loss": 1.9364,
"step": 9120
},
{
"epoch": 3.1062345836522924,
"grad_norm": 2.40625,
"learning_rate": 1.684071256029885e-06,
"loss": 1.962,
"step": 9130
},
{
"epoch": 3.1096368121119333,
"grad_norm": 2.25,
"learning_rate": 1.6718286598471834e-06,
"loss": 1.9557,
"step": 9140
},
{
"epoch": 3.113039040571574,
"grad_norm": 2.234375,
"learning_rate": 1.6596244041190884e-06,
"loss": 1.963,
"step": 9150
},
{
"epoch": 3.1164412690312155,
"grad_norm": 2.453125,
"learning_rate": 1.6474585814800486e-06,
"loss": 1.8665,
"step": 9160
},
{
"epoch": 3.1198434974908564,
"grad_norm": 2.234375,
"learning_rate": 1.6353312842727971e-06,
"loss": 1.9364,
"step": 9170
},
{
"epoch": 3.1232457259504978,
"grad_norm": 1.9921875,
"learning_rate": 1.6232426045476368e-06,
"loss": 1.9379,
"step": 9180
},
{
"epoch": 3.1266479544101387,
"grad_norm": 2.484375,
"learning_rate": 1.6111926340617594e-06,
"loss": 1.8696,
"step": 9190
},
{
"epoch": 3.1300501828697795,
"grad_norm": 2.546875,
"learning_rate": 1.599181464278531e-06,
"loss": 1.9511,
"step": 9200
},
{
"epoch": 3.133452411329421,
"grad_norm": 2.125,
"learning_rate": 1.587209186366815e-06,
"loss": 1.9289,
"step": 9210
},
{
"epoch": 3.1368546397890618,
"grad_norm": 2.296875,
"learning_rate": 1.5752758912002694e-06,
"loss": 1.8937,
"step": 9220
},
{
"epoch": 3.140256868248703,
"grad_norm": 2.265625,
"learning_rate": 1.5633816693566608e-06,
"loss": 1.8763,
"step": 9230
},
{
"epoch": 3.143659096708344,
"grad_norm": 2.3125,
"learning_rate": 1.5515266111171768e-06,
"loss": 1.9913,
"step": 9240
},
{
"epoch": 3.147061325167985,
"grad_norm": 2.5,
"learning_rate": 1.5397108064657348e-06,
"loss": 1.8861,
"step": 9250
},
{
"epoch": 3.150463553627626,
"grad_norm": 2.109375,
"learning_rate": 1.5279343450883104e-06,
"loss": 1.9029,
"step": 9260
},
{
"epoch": 3.153865782087267,
"grad_norm": 2.4375,
"learning_rate": 1.5161973163722477e-06,
"loss": 1.9382,
"step": 9270
},
{
"epoch": 3.1572680105469084,
"grad_norm": 2.421875,
"learning_rate": 1.5044998094055818e-06,
"loss": 1.8859,
"step": 9280
},
{
"epoch": 3.1606702390065493,
"grad_norm": 2.375,
"learning_rate": 1.4928419129763672e-06,
"loss": 1.8785,
"step": 9290
},
{
"epoch": 3.16407246746619,
"grad_norm": 2.6875,
"learning_rate": 1.4812237155720006e-06,
"loss": 1.8864,
"step": 9300
},
{
"epoch": 3.1674746959258315,
"grad_norm": 2.53125,
"learning_rate": 1.4696453053785496e-06,
"loss": 1.8698,
"step": 9310
},
{
"epoch": 3.1708769243854724,
"grad_norm": 2.296875,
"learning_rate": 1.4581067702800793e-06,
"loss": 1.9852,
"step": 9320
},
{
"epoch": 3.1742791528451137,
"grad_norm": 2.3125,
"learning_rate": 1.4466081978579942e-06,
"loss": 1.98,
"step": 9330
},
{
"epoch": 3.1776813813047546,
"grad_norm": 2.34375,
"learning_rate": 1.4351496753903699e-06,
"loss": 1.925,
"step": 9340
},
{
"epoch": 3.1810836097643955,
"grad_norm": 2.5,
"learning_rate": 1.4237312898512816e-06,
"loss": 1.9355,
"step": 9350
},
{
"epoch": 3.184485838224037,
"grad_norm": 2.703125,
"learning_rate": 1.4123531279101576e-06,
"loss": 1.9966,
"step": 9360
},
{
"epoch": 3.1878880666836777,
"grad_norm": 2.578125,
"learning_rate": 1.4010152759311148e-06,
"loss": 1.8377,
"step": 9370
},
{
"epoch": 3.191290295143319,
"grad_norm": 2.296875,
"learning_rate": 1.3897178199723027e-06,
"loss": 1.9501,
"step": 9380
},
{
"epoch": 3.19469252360296,
"grad_norm": 2.390625,
"learning_rate": 1.3784608457852537e-06,
"loss": 1.9103,
"step": 9390
},
{
"epoch": 3.198094752062601,
"grad_norm": 2.578125,
"learning_rate": 1.3672444388142238e-06,
"loss": 1.9575,
"step": 9400
},
{
"epoch": 3.201496980522242,
"grad_norm": 2.328125,
"learning_rate": 1.3560686841955576e-06,
"loss": 1.929,
"step": 9410
},
{
"epoch": 3.204899208981883,
"grad_norm": 2.375,
"learning_rate": 1.3449336667570272e-06,
"loss": 1.9606,
"step": 9420
},
{
"epoch": 3.2083014374415244,
"grad_norm": 2.3125,
"learning_rate": 1.3338394710172017e-06,
"loss": 1.9379,
"step": 9430
},
{
"epoch": 3.2117036659011653,
"grad_norm": 2.640625,
"learning_rate": 1.3227861811847961e-06,
"loss": 1.8995,
"step": 9440
},
{
"epoch": 3.215105894360806,
"grad_norm": 2.203125,
"learning_rate": 1.3117738811580378e-06,
"loss": 1.9038,
"step": 9450
},
{
"epoch": 3.2185081228204475,
"grad_norm": 2.234375,
"learning_rate": 1.3008026545240273e-06,
"loss": 1.9499,
"step": 9460
},
{
"epoch": 3.2219103512800884,
"grad_norm": 2.234375,
"learning_rate": 1.2898725845581015e-06,
"loss": 1.9625,
"step": 9470
},
{
"epoch": 3.2253125797397297,
"grad_norm": 2.234375,
"learning_rate": 1.2789837542232062e-06,
"loss": 2.0014,
"step": 9480
},
{
"epoch": 3.2287148081993706,
"grad_norm": 2.375,
"learning_rate": 1.2681362461692674e-06,
"loss": 1.9227,
"step": 9490
},
{
"epoch": 3.2321170366590115,
"grad_norm": 1.90625,
"learning_rate": 1.2573301427325523e-06,
"loss": 1.9411,
"step": 9500
},
{
"epoch": 3.235519265118653,
"grad_norm": 1.9375,
"learning_rate": 1.246565525935065e-06,
"loss": 1.8898,
"step": 9510
},
{
"epoch": 3.2389214935782937,
"grad_norm": 2.25,
"learning_rate": 1.2358424774839005e-06,
"loss": 1.8962,
"step": 9520
},
{
"epoch": 3.242323722037935,
"grad_norm": 2.5,
"learning_rate": 1.2251610787706435e-06,
"loss": 1.9404,
"step": 9530
},
{
"epoch": 3.245725950497576,
"grad_norm": 2.265625,
"learning_rate": 1.2145214108707407e-06,
"loss": 1.8978,
"step": 9540
},
{
"epoch": 3.249128178957217,
"grad_norm": 2.140625,
"learning_rate": 1.2039235545428843e-06,
"loss": 1.9312,
"step": 9550
},
{
"epoch": 3.252530407416858,
"grad_norm": 2.140625,
"learning_rate": 1.1933675902284088e-06,
"loss": 1.8721,
"step": 9560
},
{
"epoch": 3.255932635876499,
"grad_norm": 2.171875,
"learning_rate": 1.182853598050669e-06,
"loss": 1.9304,
"step": 9570
},
{
"epoch": 3.2593348643361404,
"grad_norm": 2.34375,
"learning_rate": 1.1723816578144417e-06,
"loss": 1.8912,
"step": 9580
},
{
"epoch": 3.2627370927957813,
"grad_norm": 2.375,
"learning_rate": 1.1619518490053083e-06,
"loss": 1.8852,
"step": 9590
},
{
"epoch": 3.266139321255422,
"grad_norm": 2.359375,
"learning_rate": 1.1515642507890646e-06,
"loss": 1.9256,
"step": 9600
},
{
"epoch": 3.2695415497150635,
"grad_norm": 2.375,
"learning_rate": 1.141218942011112e-06,
"loss": 1.8988,
"step": 9610
},
{
"epoch": 3.2729437781747044,
"grad_norm": 2.4375,
"learning_rate": 1.1309160011958583e-06,
"loss": 1.9262,
"step": 9620
},
{
"epoch": 3.2763460066343457,
"grad_norm": 2.078125,
"learning_rate": 1.1206555065461265e-06,
"loss": 1.9177,
"step": 9630
},
{
"epoch": 3.2797482350939866,
"grad_norm": 2.28125,
"learning_rate": 1.1104375359425585e-06,
"loss": 1.9117,
"step": 9640
},
{
"epoch": 3.2831504635536275,
"grad_norm": 2.703125,
"learning_rate": 1.100262166943023e-06,
"loss": 1.9711,
"step": 9650
},
{
"epoch": 3.286552692013269,
"grad_norm": 2.296875,
"learning_rate": 1.0901294767820318e-06,
"loss": 1.9243,
"step": 9660
},
{
"epoch": 3.2899549204729097,
"grad_norm": 2.4375,
"learning_rate": 1.0800395423701436e-06,
"loss": 1.9023,
"step": 9670
},
{
"epoch": 3.293357148932551,
"grad_norm": 2.140625,
"learning_rate": 1.0699924402933917e-06,
"loss": 1.938,
"step": 9680
},
{
"epoch": 3.296759377392192,
"grad_norm": 2.359375,
"learning_rate": 1.0599882468126933e-06,
"loss": 1.9328,
"step": 9690
},
{
"epoch": 3.300161605851833,
"grad_norm": 2.109375,
"learning_rate": 1.0500270378632782e-06,
"loss": 1.9429,
"step": 9700
},
{
"epoch": 3.303563834311474,
"grad_norm": 2.171875,
"learning_rate": 1.0401088890541082e-06,
"loss": 1.9068,
"step": 9710
},
{
"epoch": 3.306966062771115,
"grad_norm": 2.28125,
"learning_rate": 1.0302338756673032e-06,
"loss": 1.9121,
"step": 9720
},
{
"epoch": 3.3103682912307564,
"grad_norm": 2.28125,
"learning_rate": 1.0204020726575725e-06,
"loss": 1.9197,
"step": 9730
},
{
"epoch": 3.3137705196903973,
"grad_norm": 2.09375,
"learning_rate": 1.0106135546516385e-06,
"loss": 1.9347,
"step": 9740
},
{
"epoch": 3.317172748150038,
"grad_norm": 1.9375,
"learning_rate": 1.0008683959476827e-06,
"loss": 1.929,
"step": 9750
},
{
"epoch": 3.3205749766096795,
"grad_norm": 2.203125,
"learning_rate": 9.911666705147721e-07,
"loss": 1.8878,
"step": 9760
},
{
"epoch": 3.3239772050693204,
"grad_norm": 2.359375,
"learning_rate": 9.815084519922975e-07,
"loss": 1.8525,
"step": 9770
},
{
"epoch": 3.3273794335289617,
"grad_norm": 2.03125,
"learning_rate": 9.718938136894211e-07,
"loss": 1.8368,
"step": 9780
},
{
"epoch": 3.3307816619886026,
"grad_norm": 2.0,
"learning_rate": 9.623228285845155e-07,
"loss": 1.8964,
"step": 9790
},
{
"epoch": 3.3341838904482435,
"grad_norm": 2.796875,
"learning_rate": 9.527955693246117e-07,
"loss": 1.9062,
"step": 9800
},
{
"epoch": 3.337586118907885,
"grad_norm": 2.125,
"learning_rate": 9.433121082248422e-07,
"loss": 1.87,
"step": 9810
},
{
"epoch": 3.3409883473675257,
"grad_norm": 2.5,
"learning_rate": 9.33872517267902e-07,
"loss": 1.9351,
"step": 9820
},
{
"epoch": 3.344390575827167,
"grad_norm": 2.21875,
"learning_rate": 9.244768681034954e-07,
"loss": 1.9826,
"step": 9830
},
{
"epoch": 3.347792804286808,
"grad_norm": 2.5625,
"learning_rate": 9.151252320477888e-07,
"loss": 1.9788,
"step": 9840
},
{
"epoch": 3.351195032746449,
"grad_norm": 1.9765625,
"learning_rate": 9.058176800828842e-07,
"loss": 1.9306,
"step": 9850
},
{
"epoch": 3.35459726120609,
"grad_norm": 2.375,
"learning_rate": 8.965542828562589e-07,
"loss": 1.9304,
"step": 9860
},
{
"epoch": 3.357999489665731,
"grad_norm": 2.546875,
"learning_rate": 8.873351106802486e-07,
"loss": 1.9565,
"step": 9870
},
{
"epoch": 3.3614017181253724,
"grad_norm": 2.28125,
"learning_rate": 8.781602335315041e-07,
"loss": 1.9325,
"step": 9880
},
{
"epoch": 3.3648039465850133,
"grad_norm": 2.25,
"learning_rate": 8.690297210504589e-07,
"loss": 1.9074,
"step": 9890
},
{
"epoch": 3.368206175044654,
"grad_norm": 2.65625,
"learning_rate": 8.599436425408064e-07,
"loss": 1.9338,
"step": 9900
},
{
"epoch": 3.3716084035042955,
"grad_norm": 2.625,
"learning_rate": 8.509020669689717e-07,
"loss": 1.9236,
"step": 9910
},
{
"epoch": 3.3750106319639364,
"grad_norm": 2.5625,
"learning_rate": 8.419050629635849e-07,
"loss": 1.9387,
"step": 9920
},
{
"epoch": 3.3784128604235777,
"grad_norm": 2.4375,
"learning_rate": 8.329526988149661e-07,
"loss": 1.9503,
"step": 9930
},
{
"epoch": 3.3818150888832186,
"grad_norm": 2.1875,
"learning_rate": 8.240450424745993e-07,
"loss": 1.9232,
"step": 9940
},
{
"epoch": 3.3852173173428595,
"grad_norm": 2.546875,
"learning_rate": 8.151821615546263e-07,
"loss": 1.9435,
"step": 9950
},
{
"epoch": 3.388619545802501,
"grad_norm": 2.203125,
"learning_rate": 8.063641233273221e-07,
"loss": 1.9005,
"step": 9960
},
{
"epoch": 3.3920217742621417,
"grad_norm": 2.609375,
"learning_rate": 7.975909947245956e-07,
"loss": 1.864,
"step": 9970
},
{
"epoch": 3.3954240027217826,
"grad_norm": 2.15625,
"learning_rate": 7.888628423374738e-07,
"loss": 1.9707,
"step": 9980
},
{
"epoch": 3.398826231181424,
"grad_norm": 2.53125,
"learning_rate": 7.801797324156009e-07,
"loss": 1.9314,
"step": 9990
},
{
"epoch": 3.402228459641065,
"grad_norm": 2.546875,
"learning_rate": 7.715417308667326e-07,
"loss": 1.9229,
"step": 10000
},
{
"epoch": 3.405630688100706,
"grad_norm": 2.5625,
"learning_rate": 7.629489032562336e-07,
"loss": 1.86,
"step": 10010
},
{
"epoch": 3.409032916560347,
"grad_norm": 2.4375,
"learning_rate": 7.544013148065898e-07,
"loss": 1.9123,
"step": 10020
},
{
"epoch": 3.412435145019988,
"grad_norm": 1.8515625,
"learning_rate": 7.45899030396898e-07,
"loss": 1.8735,
"step": 10030
},
{
"epoch": 3.4158373734796292,
"grad_norm": 2.375,
"learning_rate": 7.374421145623891e-07,
"loss": 1.9386,
"step": 10040
},
{
"epoch": 3.41923960193927,
"grad_norm": 2.5625,
"learning_rate": 7.290306314939283e-07,
"loss": 1.8794,
"step": 10050
},
{
"epoch": 3.4226418303989115,
"grad_norm": 2.296875,
"learning_rate": 7.206646450375306e-07,
"loss": 1.9236,
"step": 10060
},
{
"epoch": 3.4260440588585523,
"grad_norm": 2.25,
"learning_rate": 7.123442186938769e-07,
"loss": 1.9224,
"step": 10070
},
{
"epoch": 3.4294462873181932,
"grad_norm": 2.28125,
"learning_rate": 7.040694156178301e-07,
"loss": 1.9089,
"step": 10080
},
{
"epoch": 3.4328485157778346,
"grad_norm": 2.125,
"learning_rate": 6.958402986179579e-07,
"loss": 1.9395,
"step": 10090
},
{
"epoch": 3.4362507442374755,
"grad_norm": 2.703125,
"learning_rate": 6.87656930156057e-07,
"loss": 1.9217,
"step": 10100
},
{
"epoch": 3.439652972697117,
"grad_norm": 2.203125,
"learning_rate": 6.795193723466726e-07,
"loss": 1.9458,
"step": 10110
},
{
"epoch": 3.4430552011567577,
"grad_norm": 1.828125,
"learning_rate": 6.714276869566347e-07,
"loss": 1.9698,
"step": 10120
},
{
"epoch": 3.4464574296163986,
"grad_norm": 2.3125,
"learning_rate": 6.633819354045855e-07,
"loss": 1.9773,
"step": 10130
},
{
"epoch": 3.44985965807604,
"grad_norm": 2.34375,
"learning_rate": 6.553821787605149e-07,
"loss": 1.8458,
"step": 10140
},
{
"epoch": 3.453261886535681,
"grad_norm": 2.265625,
"learning_rate": 6.474284777452948e-07,
"loss": 1.9633,
"step": 10150
},
{
"epoch": 3.456664114995322,
"grad_norm": 2.234375,
"learning_rate": 6.395208927302167e-07,
"loss": 1.9253,
"step": 10160
},
{
"epoch": 3.460066343454963,
"grad_norm": 1.984375,
"learning_rate": 6.31659483736541e-07,
"loss": 1.8867,
"step": 10170
},
{
"epoch": 3.463468571914604,
"grad_norm": 2.46875,
"learning_rate": 6.238443104350302e-07,
"loss": 1.9415,
"step": 10180
},
{
"epoch": 3.466870800374245,
"grad_norm": 2.4375,
"learning_rate": 6.160754321455092e-07,
"loss": 1.8688,
"step": 10190
},
{
"epoch": 3.470273028833886,
"grad_norm": 2.359375,
"learning_rate": 6.083529078364046e-07,
"loss": 1.8777,
"step": 10200
},
{
"epoch": 3.4736752572935274,
"grad_norm": 2.046875,
"learning_rate": 6.006767961242978e-07,
"loss": 1.8808,
"step": 10210
},
{
"epoch": 3.4770774857531683,
"grad_norm": 2.140625,
"learning_rate": 5.930471552734888e-07,
"loss": 1.9203,
"step": 10220
},
{
"epoch": 3.480479714212809,
"grad_norm": 2.21875,
"learning_rate": 5.854640431955407e-07,
"loss": 1.9427,
"step": 10230
},
{
"epoch": 3.4838819426724505,
"grad_norm": 2.609375,
"learning_rate": 5.779275174488542e-07,
"loss": 1.9229,
"step": 10240
},
{
"epoch": 3.4872841711320914,
"grad_norm": 2.328125,
"learning_rate": 5.704376352382198e-07,
"loss": 1.8909,
"step": 10250
},
{
"epoch": 3.4906863995917328,
"grad_norm": 2.25,
"learning_rate": 5.629944534143905e-07,
"loss": 1.9481,
"step": 10260
},
{
"epoch": 3.4940886280513737,
"grad_norm": 2.390625,
"learning_rate": 5.555980284736454e-07,
"loss": 1.9152,
"step": 10270
},
{
"epoch": 3.4974908565110145,
"grad_norm": 2.03125,
"learning_rate": 5.482484165573627e-07,
"loss": 1.9002,
"step": 10280
},
{
"epoch": 3.500893084970656,
"grad_norm": 2.34375,
"learning_rate": 5.409456734515961e-07,
"loss": 1.9427,
"step": 10290
},
{
"epoch": 3.5042953134302968,
"grad_norm": 2.390625,
"learning_rate": 5.336898545866455e-07,
"loss": 1.9312,
"step": 10300
},
{
"epoch": 3.5076975418899377,
"grad_norm": 2.3125,
"learning_rate": 5.264810150366431e-07,
"loss": 1.9146,
"step": 10310
},
{
"epoch": 3.511099770349579,
"grad_norm": 2.625,
"learning_rate": 5.193192095191315e-07,
"loss": 1.932,
"step": 10320
},
{
"epoch": 3.51450199880922,
"grad_norm": 2.21875,
"learning_rate": 5.122044923946488e-07,
"loss": 1.9544,
"step": 10330
},
{
"epoch": 3.517904227268861,
"grad_norm": 2.21875,
"learning_rate": 5.051369176663161e-07,
"loss": 1.9132,
"step": 10340
},
{
"epoch": 3.521306455728502,
"grad_norm": 2.09375,
"learning_rate": 4.981165389794265e-07,
"loss": 1.9379,
"step": 10350
},
{
"epoch": 3.524708684188143,
"grad_norm": 2.359375,
"learning_rate": 4.911434096210408e-07,
"loss": 1.8495,
"step": 10360
},
{
"epoch": 3.5281109126477843,
"grad_norm": 2.53125,
"learning_rate": 4.842175825195817e-07,
"loss": 1.964,
"step": 10370
},
{
"epoch": 3.531513141107425,
"grad_norm": 2.09375,
"learning_rate": 4.773391102444278e-07,
"loss": 1.8755,
"step": 10380
},
{
"epoch": 3.5349153695670665,
"grad_norm": 2.8125,
"learning_rate": 4.705080450055242e-07,
"loss": 1.902,
"step": 10390
},
{
"epoch": 3.5383175980267074,
"grad_norm": 3.03125,
"learning_rate": 4.63724438652977e-07,
"loss": 1.9428,
"step": 10400
},
{
"epoch": 3.5417198264863483,
"grad_norm": 2.125,
"learning_rate": 4.5698834267666295e-07,
"loss": 1.8812,
"step": 10410
},
{
"epoch": 3.5451220549459896,
"grad_norm": 2.265625,
"learning_rate": 4.502998082058419e-07,
"loss": 1.9378,
"step": 10420
},
{
"epoch": 3.5485242834056305,
"grad_norm": 2.546875,
"learning_rate": 4.4365888600876105e-07,
"loss": 1.8586,
"step": 10430
},
{
"epoch": 3.551926511865272,
"grad_norm": 2.5,
"learning_rate": 4.3706562649227966e-07,
"loss": 1.9303,
"step": 10440
},
{
"epoch": 3.5553287403249128,
"grad_norm": 2.28125,
"learning_rate": 4.305200797014755e-07,
"loss": 1.8785,
"step": 10450
},
{
"epoch": 3.5587309687845536,
"grad_norm": 2.296875,
"learning_rate": 4.2402229531927284e-07,
"loss": 1.8698,
"step": 10460
},
{
"epoch": 3.562133197244195,
"grad_norm": 2.203125,
"learning_rate": 4.1757232266606775e-07,
"loss": 1.9134,
"step": 10470
},
{
"epoch": 3.565535425703836,
"grad_norm": 2.0,
"learning_rate": 4.1117021069934086e-07,
"loss": 1.9092,
"step": 10480
},
{
"epoch": 3.568937654163477,
"grad_norm": 2.578125,
"learning_rate": 4.048160080133004e-07,
"loss": 1.8521,
"step": 10490
},
{
"epoch": 3.572339882623118,
"grad_norm": 2.046875,
"learning_rate": 3.985097628385017e-07,
"loss": 1.9322,
"step": 10500
},
{
"epoch": 3.575742111082759,
"grad_norm": 2.265625,
"learning_rate": 3.9225152304149186e-07,
"loss": 1.95,
"step": 10510
},
{
"epoch": 3.5791443395424003,
"grad_norm": 2.40625,
"learning_rate": 3.8604133612443344e-07,
"loss": 1.8966,
"step": 10520
},
{
"epoch": 3.582546568002041,
"grad_norm": 2.28125,
"learning_rate": 3.798792492247598e-07,
"loss": 1.8615,
"step": 10530
},
{
"epoch": 3.5859487964616825,
"grad_norm": 2.203125,
"learning_rate": 3.737653091148046e-07,
"loss": 1.9687,
"step": 10540
},
{
"epoch": 3.5893510249213234,
"grad_norm": 2.109375,
"learning_rate": 3.6769956220144835e-07,
"loss": 1.9133,
"step": 10550
},
{
"epoch": 3.5927532533809643,
"grad_norm": 2.203125,
"learning_rate": 3.61682054525775e-07,
"loss": 1.9313,
"step": 10560
},
{
"epoch": 3.5961554818406056,
"grad_norm": 2.359375,
"learning_rate": 3.5571283176270955e-07,
"loss": 2.0094,
"step": 10570
},
{
"epoch": 3.5995577103002465,
"grad_norm": 2.328125,
"learning_rate": 3.4979193922068417e-07,
"loss": 1.9955,
"step": 10580
},
{
"epoch": 3.602959938759888,
"grad_norm": 2.359375,
"learning_rate": 3.439194218412834e-07,
"loss": 1.9294,
"step": 10590
},
{
"epoch": 3.6063621672195287,
"grad_norm": 2.390625,
"learning_rate": 3.380953241989119e-07,
"loss": 1.8658,
"step": 10600
},
{
"epoch": 3.6097643956791696,
"grad_norm": 2.859375,
"learning_rate": 3.3231969050044987e-07,
"loss": 1.9264,
"step": 10610
},
{
"epoch": 3.613166624138811,
"grad_norm": 2.15625,
"learning_rate": 3.2659256458491855e-07,
"loss": 1.9539,
"step": 10620
},
{
"epoch": 3.616568852598452,
"grad_norm": 2.609375,
"learning_rate": 3.209139899231508e-07,
"loss": 1.9833,
"step": 10630
},
{
"epoch": 3.619971081058093,
"grad_norm": 2.328125,
"learning_rate": 3.1528400961745953e-07,
"loss": 1.9088,
"step": 10640
},
{
"epoch": 3.623373309517734,
"grad_norm": 2.359375,
"learning_rate": 3.0970266640130633e-07,
"loss": 1.9261,
"step": 10650
},
{
"epoch": 3.626775537977375,
"grad_norm": 2.1875,
"learning_rate": 3.0417000263898494e-07,
"loss": 1.8439,
"step": 10660
},
{
"epoch": 3.6301777664370163,
"grad_norm": 2.421875,
"learning_rate": 2.9868606032529224e-07,
"loss": 1.9474,
"step": 10670
},
{
"epoch": 3.633579994896657,
"grad_norm": 2.296875,
"learning_rate": 2.932508810852159e-07,
"loss": 1.9432,
"step": 10680
},
{
"epoch": 3.6369822233562985,
"grad_norm": 2.84375,
"learning_rate": 2.8786450617361245e-07,
"loss": 1.8769,
"step": 10690
},
{
"epoch": 3.6403844518159394,
"grad_norm": 2.40625,
"learning_rate": 2.825269764748977e-07,
"loss": 1.9754,
"step": 10700
},
{
"epoch": 3.6437866802755803,
"grad_norm": 2.109375,
"learning_rate": 2.772383325027377e-07,
"loss": 1.9327,
"step": 10710
},
{
"epoch": 3.6471889087352216,
"grad_norm": 2.421875,
"learning_rate": 2.719986143997357e-07,
"loss": 1.916,
"step": 10720
},
{
"epoch": 3.6505911371948625,
"grad_norm": 2.328125,
"learning_rate": 2.668078619371333e-07,
"loss": 1.8941,
"step": 10730
},
{
"epoch": 3.653993365654504,
"grad_norm": 2.4375,
"learning_rate": 2.616661145145063e-07,
"loss": 1.9525,
"step": 10740
},
{
"epoch": 3.6573955941141447,
"grad_norm": 2.546875,
"learning_rate": 2.5657341115946487e-07,
"loss": 1.8995,
"step": 10750
},
{
"epoch": 3.6607978225737856,
"grad_norm": 2.65625,
"learning_rate": 2.5152979052736e-07,
"loss": 1.9815,
"step": 10760
},
{
"epoch": 3.664200051033427,
"grad_norm": 2.765625,
"learning_rate": 2.46535290900983e-07,
"loss": 1.8823,
"step": 10770
},
{
"epoch": 3.667602279493068,
"grad_norm": 2.171875,
"learning_rate": 2.4158995019028676e-07,
"loss": 1.9158,
"step": 10780
},
{
"epoch": 3.671004507952709,
"grad_norm": 2.671875,
"learning_rate": 2.3669380593208516e-07,
"loss": 1.8857,
"step": 10790
},
{
"epoch": 3.67440673641235,
"grad_norm": 2.40625,
"learning_rate": 2.3184689528977832e-07,
"loss": 1.8922,
"step": 10800
},
{
"epoch": 3.677808964871991,
"grad_norm": 2.3125,
"learning_rate": 2.270492550530667e-07,
"loss": 1.9044,
"step": 10810
},
{
"epoch": 3.6812111933316323,
"grad_norm": 2.1875,
"learning_rate": 2.2230092163766907e-07,
"loss": 1.9365,
"step": 10820
},
{
"epoch": 3.684613421791273,
"grad_norm": 2.15625,
"learning_rate": 2.1760193108504913e-07,
"loss": 1.894,
"step": 10830
},
{
"epoch": 3.6880156502509145,
"grad_norm": 2.265625,
"learning_rate": 2.1295231906214332e-07,
"loss": 1.9366,
"step": 10840
},
{
"epoch": 3.6914178787105554,
"grad_norm": 1.921875,
"learning_rate": 2.0835212086108594e-07,
"loss": 1.9098,
"step": 10850
},
{
"epoch": 3.6948201071701963,
"grad_norm": 2.390625,
"learning_rate": 2.038013713989457e-07,
"loss": 1.9487,
"step": 10860
},
{
"epoch": 3.6982223356298376,
"grad_norm": 2.328125,
"learning_rate": 1.9930010521745713e-07,
"loss": 1.8716,
"step": 10870
},
{
"epoch": 3.7016245640894785,
"grad_norm": 2.21875,
"learning_rate": 1.9484835648276147e-07,
"loss": 1.8958,
"step": 10880
},
{
"epoch": 3.70502679254912,
"grad_norm": 2.390625,
"learning_rate": 1.904461589851424e-07,
"loss": 1.8943,
"step": 10890
},
{
"epoch": 3.7084290210087607,
"grad_norm": 1.9296875,
"learning_rate": 1.8609354613877697e-07,
"loss": 1.8747,
"step": 10900
},
{
"epoch": 3.7118312494684016,
"grad_norm": 2.296875,
"learning_rate": 1.817905509814755e-07,
"loss": 1.9229,
"step": 10910
},
{
"epoch": 3.715233477928043,
"grad_norm": 2.25,
"learning_rate": 1.7753720617443335e-07,
"loss": 1.9303,
"step": 10920
},
{
"epoch": 3.718635706387684,
"grad_norm": 2.328125,
"learning_rate": 1.7333354400198364e-07,
"loss": 1.9388,
"step": 10930
},
{
"epoch": 3.722037934847325,
"grad_norm": 2.015625,
"learning_rate": 1.691795963713496e-07,
"loss": 1.892,
"step": 10940
},
{
"epoch": 3.725440163306966,
"grad_norm": 2.3125,
"learning_rate": 1.6507539481240707e-07,
"loss": 1.9215,
"step": 10950
},
{
"epoch": 3.728842391766607,
"grad_norm": 2.28125,
"learning_rate": 1.6102097047744054e-07,
"loss": 1.9803,
"step": 10960
},
{
"epoch": 3.7322446202262483,
"grad_norm": 2.046875,
"learning_rate": 1.5701635414090798e-07,
"loss": 1.9324,
"step": 10970
},
{
"epoch": 3.735646848685889,
"grad_norm": 2.515625,
"learning_rate": 1.530615761992094e-07,
"loss": 1.8066,
"step": 10980
},
{
"epoch": 3.7390490771455305,
"grad_norm": 2.171875,
"learning_rate": 1.4915666667045188e-07,
"loss": 1.8818,
"step": 10990
},
{
"epoch": 3.7424513056051714,
"grad_norm": 2.390625,
"learning_rate": 1.4530165519422625e-07,
"loss": 1.9121,
"step": 11000
},
{
"epoch": 3.7458535340648123,
"grad_norm": 2.359375,
"learning_rate": 1.4149657103138097e-07,
"loss": 1.9224,
"step": 11010
},
{
"epoch": 3.7492557625244536,
"grad_norm": 2.5,
"learning_rate": 1.377414430637975e-07,
"loss": 1.9537,
"step": 11020
},
{
"epoch": 3.7526579909840945,
"grad_norm": 2.5,
"learning_rate": 1.3403629979417308e-07,
"loss": 1.9439,
"step": 11030
},
{
"epoch": 3.756060219443736,
"grad_norm": 2.375,
"learning_rate": 1.303811693458042e-07,
"loss": 1.9555,
"step": 11040
},
{
"epoch": 3.7594624479033767,
"grad_norm": 2.171875,
"learning_rate": 1.2677607946237328e-07,
"loss": 1.9296,
"step": 11050
},
{
"epoch": 3.7628646763630176,
"grad_norm": 2.46875,
"learning_rate": 1.2322105750773803e-07,
"loss": 1.9048,
"step": 11060
},
{
"epoch": 3.766266904822659,
"grad_norm": 2.609375,
"learning_rate": 1.1971613046572323e-07,
"loss": 1.9255,
"step": 11070
},
{
"epoch": 3.7696691332823,
"grad_norm": 2.34375,
"learning_rate": 1.1626132493991633e-07,
"loss": 1.9011,
"step": 11080
},
{
"epoch": 3.773071361741941,
"grad_norm": 2.28125,
"learning_rate": 1.1285666715346502e-07,
"loss": 1.8918,
"step": 11090
},
{
"epoch": 3.776473590201582,
"grad_norm": 2.484375,
"learning_rate": 1.0950218294888028e-07,
"loss": 1.84,
"step": 11100
},
{
"epoch": 3.779875818661223,
"grad_norm": 2.65625,
"learning_rate": 1.0619789778783557e-07,
"loss": 1.979,
"step": 11110
},
{
"epoch": 3.7832780471208642,
"grad_norm": 2.4375,
"learning_rate": 1.0294383675097872e-07,
"loss": 1.9141,
"step": 11120
},
{
"epoch": 3.786680275580505,
"grad_norm": 2.09375,
"learning_rate": 9.974002453774011e-08,
"loss": 1.98,
"step": 11130
},
{
"epoch": 3.7900825040401465,
"grad_norm": 2.484375,
"learning_rate": 9.658648546614084e-08,
"loss": 1.9723,
"step": 11140
},
{
"epoch": 3.7934847324997873,
"grad_norm": 2.421875,
"learning_rate": 9.348324347261734e-08,
"loss": 1.8887,
"step": 11150
},
{
"epoch": 3.7968869609594282,
"grad_norm": 2.546875,
"learning_rate": 9.04303221118288e-08,
"loss": 1.8763,
"step": 11160
},
{
"epoch": 3.8002891894190696,
"grad_norm": 2.46875,
"learning_rate": 8.742774455648695e-08,
"loss": 1.9326,
"step": 11170
},
{
"epoch": 3.8036914178787105,
"grad_norm": 1.9765625,
"learning_rate": 8.447553359717545e-08,
"loss": 1.8815,
"step": 11180
},
{
"epoch": 3.807093646338352,
"grad_norm": 2.296875,
"learning_rate": 8.157371164217902e-08,
"loss": 1.971,
"step": 11190
},
{
"epoch": 3.8104958747979927,
"grad_norm": 2.375,
"learning_rate": 7.872230071731239e-08,
"loss": 1.9483,
"step": 11200
},
{
"epoch": 3.8138981032576336,
"grad_norm": 2.609375,
"learning_rate": 7.592132246575323e-08,
"loss": 1.9457,
"step": 11210
},
{
"epoch": 3.817300331717275,
"grad_norm": 2.28125,
"learning_rate": 7.317079814787934e-08,
"loss": 1.9193,
"step": 11220
},
{
"epoch": 3.820702560176916,
"grad_norm": 2.203125,
"learning_rate": 7.047074864110375e-08,
"loss": 1.9131,
"step": 11230
},
{
"epoch": 3.824104788636557,
"grad_norm": 2.21875,
"learning_rate": 6.782119443972094e-08,
"loss": 1.9334,
"step": 11240
},
{
"epoch": 3.827507017096198,
"grad_norm": 2.625,
"learning_rate": 6.522215565474712e-08,
"loss": 1.958,
"step": 11250
},
{
"epoch": 3.830909245555839,
"grad_norm": 2.421875,
"learning_rate": 6.267365201377092e-08,
"loss": 1.9266,
"step": 11260
},
{
"epoch": 3.8343114740154802,
"grad_norm": 2.53125,
"learning_rate": 6.017570286079965e-08,
"loss": 1.9022,
"step": 11270
},
{
"epoch": 3.837713702475121,
"grad_norm": 2.34375,
"learning_rate": 5.77283271561175e-08,
"loss": 1.8612,
"step": 11280
},
{
"epoch": 3.8411159309347624,
"grad_norm": 2.453125,
"learning_rate": 5.5331543476137706e-08,
"loss": 1.9326,
"step": 11290
},
{
"epoch": 3.8445181593944033,
"grad_norm": 2.296875,
"learning_rate": 5.298537001326303e-08,
"loss": 1.8951,
"step": 11300
},
{
"epoch": 3.847920387854044,
"grad_norm": 2.40625,
"learning_rate": 5.068982457574685e-08,
"loss": 1.9788,
"step": 11310
},
{
"epoch": 3.8513226163136856,
"grad_norm": 2.609375,
"learning_rate": 4.8444924587559654e-08,
"loss": 1.9643,
"step": 11320
},
{
"epoch": 3.8547248447733264,
"grad_norm": 2.5625,
"learning_rate": 4.625068708825534e-08,
"loss": 1.9245,
"step": 11330
},
{
"epoch": 3.8581270732329678,
"grad_norm": 2.34375,
"learning_rate": 4.4107128732841385e-08,
"loss": 1.8401,
"step": 11340
},
{
"epoch": 3.8615293016926087,
"grad_norm": 2.09375,
"learning_rate": 4.20142657916557e-08,
"loss": 1.9087,
"step": 11350
},
{
"epoch": 3.8649315301522496,
"grad_norm": 2.140625,
"learning_rate": 3.99721141502382e-08,
"loss": 1.9401,
"step": 11360
},
{
"epoch": 3.868333758611891,
"grad_norm": 2.328125,
"learning_rate": 3.798068930921441e-08,
"loss": 1.9699,
"step": 11370
},
{
"epoch": 3.8717359870715318,
"grad_norm": 2.0625,
"learning_rate": 3.6040006384174545e-08,
"loss": 1.954,
"step": 11380
},
{
"epoch": 3.875138215531173,
"grad_norm": 2.40625,
"learning_rate": 3.4150080105563755e-08,
"loss": 1.8693,
"step": 11390
},
{
"epoch": 3.878540443990814,
"grad_norm": 2.078125,
"learning_rate": 3.231092481856271e-08,
"loss": 1.9307,
"step": 11400
},
{
"epoch": 3.881942672450455,
"grad_norm": 2.328125,
"learning_rate": 3.052255448298612e-08,
"loss": 1.956,
"step": 11410
},
{
"epoch": 3.885344900910096,
"grad_norm": 2.234375,
"learning_rate": 2.878498267317298e-08,
"loss": 1.9185,
"step": 11420
},
{
"epoch": 3.888747129369737,
"grad_norm": 2.5,
"learning_rate": 2.7098222577882825e-08,
"loss": 1.8685,
"step": 11430
},
{
"epoch": 3.8921493578293784,
"grad_norm": 2.328125,
"learning_rate": 2.5462287000197963e-08,
"loss": 1.9734,
"step": 11440
},
{
"epoch": 3.8955515862890193,
"grad_norm": 2.09375,
"learning_rate": 2.3877188357427174e-08,
"loss": 1.8995,
"step": 11450
},
{
"epoch": 3.89895381474866,
"grad_norm": 2.25,
"learning_rate": 2.2342938681005695e-08,
"loss": 1.8764,
"step": 11460
},
{
"epoch": 3.9023560432083015,
"grad_norm": 2.265625,
"learning_rate": 2.085954961641164e-08,
"loss": 1.8865,
"step": 11470
},
{
"epoch": 3.9057582716679424,
"grad_norm": 2.359375,
"learning_rate": 1.9427032423071165e-08,
"loss": 1.8932,
"step": 11480
},
{
"epoch": 3.9091605001275838,
"grad_norm": 2.25,
"learning_rate": 1.8045397974277166e-08,
"loss": 1.9042,
"step": 11490
},
{
"epoch": 3.9125627285872246,
"grad_norm": 2.1875,
"learning_rate": 1.6714656757104883e-08,
"loss": 1.94,
"step": 11500
},
{
"epoch": 3.9159649570468655,
"grad_norm": 2.28125,
"learning_rate": 1.5434818872331314e-08,
"loss": 1.8879,
"step": 11510
},
{
"epoch": 3.919367185506507,
"grad_norm": 2.046875,
"learning_rate": 1.4205894034362065e-08,
"loss": 1.9147,
"step": 11520
},
{
"epoch": 3.9227694139661478,
"grad_norm": 2.484375,
"learning_rate": 1.3027891571153722e-08,
"loss": 1.8714,
"step": 11530
},
{
"epoch": 3.926171642425789,
"grad_norm": 2.03125,
"learning_rate": 1.1900820424145176e-08,
"loss": 1.9371,
"step": 11540
},
{
"epoch": 3.92957387088543,
"grad_norm": 2.0,
"learning_rate": 1.0824689148190455e-08,
"loss": 1.9505,
"step": 11550
},
{
"epoch": 3.932976099345071,
"grad_norm": 2.453125,
"learning_rate": 9.799505911490794e-09,
"loss": 1.8738,
"step": 11560
},
{
"epoch": 3.936378327804712,
"grad_norm": 2.328125,
"learning_rate": 8.825278495535672e-09,
"loss": 1.8447,
"step": 11570
},
{
"epoch": 3.939780556264353,
"grad_norm": 2.28125,
"learning_rate": 7.902014295042352e-09,
"loss": 1.8987,
"step": 11580
},
{
"epoch": 3.9431827847239944,
"grad_norm": 2.46875,
"learning_rate": 7.029720317899902e-09,
"loss": 1.9864,
"step": 11590
},
{
"epoch": 3.9465850131836353,
"grad_norm": 2.796875,
"learning_rate": 6.20840318511545e-09,
"loss": 1.9454,
"step": 11600
},
{
"epoch": 3.949987241643276,
"grad_norm": 2.59375,
"learning_rate": 5.438069130766418e-09,
"loss": 1.9871,
"step": 11610
},
{
"epoch": 3.9533894701029175,
"grad_norm": 2.40625,
"learning_rate": 4.718724001949017e-09,
"loss": 1.8746,
"step": 11620
},
{
"epoch": 3.9567916985625584,
"grad_norm": 2.46875,
"learning_rate": 4.050373258737196e-09,
"loss": 1.9578,
"step": 11630
},
{
"epoch": 3.9601939270221997,
"grad_norm": 2.171875,
"learning_rate": 3.4330219741408427e-09,
"loss": 1.9242,
"step": 11640
},
{
"epoch": 3.9635961554818406,
"grad_norm": 2.703125,
"learning_rate": 2.8666748340662245e-09,
"loss": 1.9133,
"step": 11650
},
{
"epoch": 3.9669983839414815,
"grad_norm": 2.0625,
"learning_rate": 2.351336137279413e-09,
"loss": 1.9196,
"step": 11660
},
{
"epoch": 3.970400612401123,
"grad_norm": 1.78125,
"learning_rate": 1.887009795377922e-09,
"loss": 1.9906,
"step": 11670
},
{
"epoch": 3.9738028408607637,
"grad_norm": 2.296875,
"learning_rate": 1.473699332754879e-09,
"loss": 1.8989,
"step": 11680
},
{
"epoch": 3.977205069320405,
"grad_norm": 2.609375,
"learning_rate": 1.1114078865781264e-09,
"loss": 1.8962,
"step": 11690
},
{
"epoch": 3.980607297780046,
"grad_norm": 2.34375,
"learning_rate": 8.001382067626036e-10,
"loss": 1.944,
"step": 11700
},
{
"epoch": 3.984009526239687,
"grad_norm": 2.265625,
"learning_rate": 5.398926559516878e-10,
"loss": 1.8959,
"step": 11710
},
{
"epoch": 3.987411754699328,
"grad_norm": 2.328125,
"learning_rate": 3.306732094962939e-10,
"loss": 1.9388,
"step": 11720
},
{
"epoch": 3.990813983158969,
"grad_norm": 2.359375,
"learning_rate": 1.7248145544367861e-10,
"loss": 1.9133,
"step": 11730
},
{
"epoch": 3.9942162116186104,
"grad_norm": 1.96875,
"learning_rate": 6.531859452325864e-11,
"loss": 1.957,
"step": 11740
},
{
"epoch": 3.9976184400782513,
"grad_norm": 2.3125,
"learning_rate": 9.185440136907336e-12,
"loss": 1.9494,
"step": 11750
}
],
"logging_steps": 10,
"max_steps": 11756,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0768921731962634e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}