{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 900, "global_step": 4500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00022222222222222223, "grad_norm": 2.652693033218384, "learning_rate": 2e-05, "loss": 1.3346, "step": 1 }, { "epoch": 0.00044444444444444447, "grad_norm": 2.6040241718292236, "learning_rate": 4e-05, "loss": 1.1227, "step": 2 }, { "epoch": 0.0006666666666666666, "grad_norm": 3.2344393730163574, "learning_rate": 6e-05, "loss": 0.181, "step": 3 }, { "epoch": 0.0008888888888888889, "grad_norm": 1.2059385776519775, "learning_rate": 8e-05, "loss": 1.0497, "step": 4 }, { "epoch": 0.0011111111111111111, "grad_norm": 0.9265973567962646, "learning_rate": 0.0001, "loss": 2.3025, "step": 5 }, { "epoch": 0.0013333333333333333, "grad_norm": 0.6584568023681641, "learning_rate": 0.00012, "loss": 1.3183, "step": 6 }, { "epoch": 0.0015555555555555555, "grad_norm": 1.0159577131271362, "learning_rate": 0.00014, "loss": 2.3477, "step": 7 }, { "epoch": 0.0017777777777777779, "grad_norm": 0.8150708675384521, "learning_rate": 0.00016, "loss": 1.1444, "step": 8 }, { "epoch": 0.002, "grad_norm": 0.8650357127189636, "learning_rate": 0.00018, "loss": 0.1126, "step": 9 }, { "epoch": 0.0022222222222222222, "grad_norm": 0.5120269656181335, "learning_rate": 0.0002, "loss": 0.0703, "step": 10 }, { "epoch": 0.0024444444444444444, "grad_norm": 0.8459653258323669, "learning_rate": 0.0001999554565701559, "loss": 2.1607, "step": 11 }, { "epoch": 0.0026666666666666666, "grad_norm": 1.0426557064056396, "learning_rate": 0.0001999109131403118, "loss": 2.2511, "step": 12 }, { "epoch": 0.0028888888888888888, "grad_norm": 0.9789963960647583, "learning_rate": 0.0001998663697104677, "loss": 2.3192, "step": 13 }, { "epoch": 0.003111111111111111, "grad_norm": 0.9778504967689514, "learning_rate": 0.00019982182628062363, "loss": 2.3259, "step": 14 }, { "epoch": 0.0033333333333333335, "grad_norm": 0.9376258850097656, "learning_rate": 0.00019977728285077952, "loss": 2.3107, "step": 15 }, { "epoch": 0.0035555555555555557, "grad_norm": 0.958590567111969, "learning_rate": 0.00019973273942093541, "loss": 2.1426, "step": 16 }, { "epoch": 0.003777777777777778, "grad_norm": 1.1192786693572998, "learning_rate": 0.00019968819599109133, "loss": 1.8911, "step": 17 }, { "epoch": 0.004, "grad_norm": 1.112155556678772, "learning_rate": 0.00019964365256124723, "loss": 2.3746, "step": 18 }, { "epoch": 0.004222222222222222, "grad_norm": 1.0468113422393799, "learning_rate": 0.00019959910913140312, "loss": 2.3151, "step": 19 }, { "epoch": 0.0044444444444444444, "grad_norm": 0.907065212726593, "learning_rate": 0.00019955456570155904, "loss": 2.1126, "step": 20 }, { "epoch": 0.004666666666666667, "grad_norm": 1.0177619457244873, "learning_rate": 0.00019951002227171493, "loss": 1.7835, "step": 21 }, { "epoch": 0.004888888888888889, "grad_norm": 0.9592558741569519, "learning_rate": 0.00019946547884187085, "loss": 1.8765, "step": 22 }, { "epoch": 0.005111111111111111, "grad_norm": 1.0939193964004517, "learning_rate": 0.00019942093541202674, "loss": 1.4541, "step": 23 }, { "epoch": 0.005333333333333333, "grad_norm": 1.4333382844924927, "learning_rate": 0.00019937639198218263, "loss": 0.5901, "step": 24 }, { "epoch": 0.005555555555555556, "grad_norm": 1.2383116483688354, "learning_rate": 0.00019933184855233852, "loss": 0.4609, "step": 25 }, { "epoch": 0.0057777777777777775, "grad_norm": 1.2145708799362183, "learning_rate": 0.00019928730512249444, "loss": 0.3003, "step": 26 }, { "epoch": 0.006, "grad_norm": 1.2296050786972046, "learning_rate": 0.00019924276169265036, "loss": 1.8733, "step": 27 }, { "epoch": 0.006222222222222222, "grad_norm": 1.5386277437210083, "learning_rate": 0.00019919821826280625, "loss": 1.9739, "step": 28 }, { "epoch": 0.0064444444444444445, "grad_norm": 1.691746473312378, "learning_rate": 0.00019915367483296214, "loss": 2.3246, "step": 29 }, { "epoch": 0.006666666666666667, "grad_norm": 1.5347216129302979, "learning_rate": 0.00019910913140311804, "loss": 1.9747, "step": 30 }, { "epoch": 0.006888888888888889, "grad_norm": 1.1143240928649902, "learning_rate": 0.00019906458797327395, "loss": 1.8893, "step": 31 }, { "epoch": 0.0071111111111111115, "grad_norm": 1.443770170211792, "learning_rate": 0.00019902004454342987, "loss": 2.1026, "step": 32 }, { "epoch": 0.007333333333333333, "grad_norm": 1.1426650285720825, "learning_rate": 0.00019897550111358577, "loss": 1.1449, "step": 33 }, { "epoch": 0.007555555555555556, "grad_norm": 1.4505339860916138, "learning_rate": 0.00019893095768374166, "loss": 0.1506, "step": 34 }, { "epoch": 0.0077777777777777776, "grad_norm": 0.7057297825813293, "learning_rate": 0.00019888641425389755, "loss": 0.0606, "step": 35 }, { "epoch": 0.008, "grad_norm": 0.3842390775680542, "learning_rate": 0.00019884187082405347, "loss": 0.0288, "step": 36 }, { "epoch": 0.008222222222222223, "grad_norm": 1.8523081541061401, "learning_rate": 0.00019879732739420936, "loss": 1.0721, "step": 37 }, { "epoch": 0.008444444444444444, "grad_norm": 2.3438615798950195, "learning_rate": 0.00019875278396436528, "loss": 0.3206, "step": 38 }, { "epoch": 0.008666666666666666, "grad_norm": 1.7265911102294922, "learning_rate": 0.00019870824053452117, "loss": 0.2136, "step": 39 }, { "epoch": 0.008888888888888889, "grad_norm": 1.1597121953964233, "learning_rate": 0.00019866369710467706, "loss": 0.1069, "step": 40 }, { "epoch": 0.009111111111111111, "grad_norm": 1.5598105192184448, "learning_rate": 0.00019861915367483298, "loss": 2.1067, "step": 41 }, { "epoch": 0.009333333333333334, "grad_norm": 1.8397672176361084, "learning_rate": 0.00019857461024498887, "loss": 1.3043, "step": 42 }, { "epoch": 0.009555555555555555, "grad_norm": 1.7002127170562744, "learning_rate": 0.00019853006681514476, "loss": 1.1985, "step": 43 }, { "epoch": 0.009777777777777778, "grad_norm": 1.86135733127594, "learning_rate": 0.00019848552338530068, "loss": 1.8315, "step": 44 }, { "epoch": 0.01, "grad_norm": 1.30124032497406, "learning_rate": 0.00019844097995545658, "loss": 1.7591, "step": 45 }, { "epoch": 0.010222222222222223, "grad_norm": 1.6460219621658325, "learning_rate": 0.0001983964365256125, "loss": 1.9981, "step": 46 }, { "epoch": 0.010444444444444444, "grad_norm": 1.2923930883407593, "learning_rate": 0.00019835189309576839, "loss": 1.2989, "step": 47 }, { "epoch": 0.010666666666666666, "grad_norm": 1.448328971862793, "learning_rate": 0.00019830734966592428, "loss": 1.2251, "step": 48 }, { "epoch": 0.010888888888888889, "grad_norm": 1.767919659614563, "learning_rate": 0.00019826280623608017, "loss": 1.1289, "step": 49 }, { "epoch": 0.011111111111111112, "grad_norm": 1.786415696144104, "learning_rate": 0.0001982182628062361, "loss": 1.3542, "step": 50 }, { "epoch": 0.011333333333333334, "grad_norm": 1.26632559299469, "learning_rate": 0.000198173719376392, "loss": 1.5781, "step": 51 }, { "epoch": 0.011555555555555555, "grad_norm": 1.0629119873046875, "learning_rate": 0.0001981291759465479, "loss": 1.2804, "step": 52 }, { "epoch": 0.011777777777777778, "grad_norm": 1.2844982147216797, "learning_rate": 0.0001980846325167038, "loss": 0.2304, "step": 53 }, { "epoch": 0.012, "grad_norm": 0.7769795656204224, "learning_rate": 0.00019804008908685968, "loss": 1.302, "step": 54 }, { "epoch": 0.012222222222222223, "grad_norm": 0.5583917498588562, "learning_rate": 0.0001979955456570156, "loss": 0.0753, "step": 55 }, { "epoch": 0.012444444444444444, "grad_norm": 0.9252032041549683, "learning_rate": 0.00019795100222717152, "loss": 2.0616, "step": 56 }, { "epoch": 0.012666666666666666, "grad_norm": 4.016125202178955, "learning_rate": 0.0001979064587973274, "loss": 0.3289, "step": 57 }, { "epoch": 0.012888888888888889, "grad_norm": 1.1086289882659912, "learning_rate": 0.0001978619153674833, "loss": 0.9885, "step": 58 }, { "epoch": 0.013111111111111112, "grad_norm": 1.0204805135726929, "learning_rate": 0.0001978173719376392, "loss": 2.0717, "step": 59 }, { "epoch": 0.013333333333333334, "grad_norm": 1.1669329404830933, "learning_rate": 0.00019777282850779511, "loss": 2.2568, "step": 60 }, { "epoch": 0.013555555555555555, "grad_norm": 1.0386414527893066, "learning_rate": 0.000197728285077951, "loss": 2.3931, "step": 61 }, { "epoch": 0.013777777777777778, "grad_norm": 0.9788153767585754, "learning_rate": 0.00019768374164810693, "loss": 2.0631, "step": 62 }, { "epoch": 0.014, "grad_norm": 0.9021984338760376, "learning_rate": 0.00019763919821826282, "loss": 2.3083, "step": 63 }, { "epoch": 0.014222222222222223, "grad_norm": 1.1166595220565796, "learning_rate": 0.0001975946547884187, "loss": 1.932, "step": 64 }, { "epoch": 0.014444444444444444, "grad_norm": 1.7329879999160767, "learning_rate": 0.00019755011135857463, "loss": 1.599, "step": 65 }, { "epoch": 0.014666666666666666, "grad_norm": 1.1422115564346313, "learning_rate": 0.00019750556792873052, "loss": 1.8565, "step": 66 }, { "epoch": 0.014888888888888889, "grad_norm": 1.0347861051559448, "learning_rate": 0.0001974610244988864, "loss": 2.108, "step": 67 }, { "epoch": 0.015111111111111112, "grad_norm": 1.5094088315963745, "learning_rate": 0.00019741648106904233, "loss": 1.0669, "step": 68 }, { "epoch": 0.015333333333333332, "grad_norm": 1.7448095083236694, "learning_rate": 0.00019737193763919822, "loss": 0.2599, "step": 69 }, { "epoch": 0.015555555555555555, "grad_norm": 0.9938380718231201, "learning_rate": 0.00019732739420935414, "loss": 0.1311, "step": 70 }, { "epoch": 0.01577777777777778, "grad_norm": 1.0205384492874146, "learning_rate": 0.00019728285077951003, "loss": 1.7412, "step": 71 }, { "epoch": 0.016, "grad_norm": 1.2222613096237183, "learning_rate": 0.00019723830734966592, "loss": 1.7811, "step": 72 }, { "epoch": 0.01622222222222222, "grad_norm": 1.2196162939071655, "learning_rate": 0.00019719376391982182, "loss": 1.6904, "step": 73 }, { "epoch": 0.016444444444444446, "grad_norm": 1.3248560428619385, "learning_rate": 0.00019714922048997774, "loss": 1.7129, "step": 74 }, { "epoch": 0.016666666666666666, "grad_norm": 2.0687692165374756, "learning_rate": 0.00019710467706013365, "loss": 0.1651, "step": 75 }, { "epoch": 0.016888888888888887, "grad_norm": 0.9671218395233154, "learning_rate": 0.00019706013363028955, "loss": 0.0788, "step": 76 }, { "epoch": 0.01711111111111111, "grad_norm": 0.2219647616147995, "learning_rate": 0.00019701559020044544, "loss": 0.0253, "step": 77 }, { "epoch": 0.017333333333333333, "grad_norm": 1.0968049764633179, "learning_rate": 0.00019697104677060133, "loss": 1.058, "step": 78 }, { "epoch": 0.017555555555555557, "grad_norm": 0.9246222376823425, "learning_rate": 0.00019692650334075725, "loss": 0.1111, "step": 79 }, { "epoch": 0.017777777777777778, "grad_norm": 0.7714378237724304, "learning_rate": 0.00019688195991091317, "loss": 0.9722, "step": 80 }, { "epoch": 0.018, "grad_norm": 0.25117895007133484, "learning_rate": 0.00019683741648106906, "loss": 0.0352, "step": 81 }, { "epoch": 0.018222222222222223, "grad_norm": 1.1857657432556152, "learning_rate": 0.00019679287305122495, "loss": 1.997, "step": 82 }, { "epoch": 0.018444444444444444, "grad_norm": 1.196076512336731, "learning_rate": 0.00019674832962138084, "loss": 1.7684, "step": 83 }, { "epoch": 0.018666666666666668, "grad_norm": 1.5178613662719727, "learning_rate": 0.00019670378619153676, "loss": 1.93, "step": 84 }, { "epoch": 0.01888888888888889, "grad_norm": 1.5289626121520996, "learning_rate": 0.00019665924276169265, "loss": 1.4133, "step": 85 }, { "epoch": 0.01911111111111111, "grad_norm": 1.3246040344238281, "learning_rate": 0.00019661469933184855, "loss": 1.7251, "step": 86 }, { "epoch": 0.019333333333333334, "grad_norm": 1.296377420425415, "learning_rate": 0.00019657015590200446, "loss": 1.8894, "step": 87 }, { "epoch": 0.019555555555555555, "grad_norm": 1.5035158395767212, "learning_rate": 0.00019652561247216036, "loss": 1.6791, "step": 88 }, { "epoch": 0.019777777777777776, "grad_norm": 1.1684895753860474, "learning_rate": 0.00019648106904231628, "loss": 1.4391, "step": 89 }, { "epoch": 0.02, "grad_norm": 1.2564208507537842, "learning_rate": 0.00019643652561247217, "loss": 1.5629, "step": 90 }, { "epoch": 0.02022222222222222, "grad_norm": 1.1524401903152466, "learning_rate": 0.00019639198218262806, "loss": 1.5727, "step": 91 }, { "epoch": 0.020444444444444446, "grad_norm": 1.2944073677062988, "learning_rate": 0.00019634743875278395, "loss": 1.4461, "step": 92 }, { "epoch": 0.020666666666666667, "grad_norm": 1.0988140106201172, "learning_rate": 0.0001963028953229399, "loss": 1.7277, "step": 93 }, { "epoch": 0.020888888888888887, "grad_norm": 1.2961751222610474, "learning_rate": 0.0001962583518930958, "loss": 1.5801, "step": 94 }, { "epoch": 0.021111111111111112, "grad_norm": 1.103636622428894, "learning_rate": 0.00019621380846325168, "loss": 1.2372, "step": 95 }, { "epoch": 0.021333333333333333, "grad_norm": 1.108388900756836, "learning_rate": 0.00019616926503340757, "loss": 1.2511, "step": 96 }, { "epoch": 0.021555555555555557, "grad_norm": 1.27703857421875, "learning_rate": 0.00019612472160356346, "loss": 0.4245, "step": 97 }, { "epoch": 0.021777777777777778, "grad_norm": 1.0161255598068237, "learning_rate": 0.00019608017817371938, "loss": 0.9912, "step": 98 }, { "epoch": 0.022, "grad_norm": 1.1940312385559082, "learning_rate": 0.0001960356347438753, "loss": 1.0144, "step": 99 }, { "epoch": 0.022222222222222223, "grad_norm": 1.552917242050171, "learning_rate": 0.0001959910913140312, "loss": 0.8394, "step": 100 }, { "epoch": 0.022444444444444444, "grad_norm": 1.560513973236084, "learning_rate": 0.00019594654788418709, "loss": 1.3496, "step": 101 }, { "epoch": 0.02266666666666667, "grad_norm": 0.8733224868774414, "learning_rate": 0.000195902004454343, "loss": 1.2893, "step": 102 }, { "epoch": 0.02288888888888889, "grad_norm": 0.7647473216056824, "learning_rate": 0.0001958574610244989, "loss": 1.309, "step": 103 }, { "epoch": 0.02311111111111111, "grad_norm": 0.36057984828948975, "learning_rate": 0.0001958129175946548, "loss": 0.0338, "step": 104 }, { "epoch": 0.023333333333333334, "grad_norm": 0.6094343066215515, "learning_rate": 0.0001957683741648107, "loss": 1.1273, "step": 105 }, { "epoch": 0.023555555555555555, "grad_norm": 0.8449940085411072, "learning_rate": 0.0001957238307349666, "loss": 0.0724, "step": 106 }, { "epoch": 0.023777777777777776, "grad_norm": 0.6553965210914612, "learning_rate": 0.00019567928730512252, "loss": 0.0731, "step": 107 }, { "epoch": 0.024, "grad_norm": 0.7489123940467834, "learning_rate": 0.0001956347438752784, "loss": 1.2152, "step": 108 }, { "epoch": 0.02422222222222222, "grad_norm": 1.0028694868087769, "learning_rate": 0.0001955902004454343, "loss": 2.3775, "step": 109 }, { "epoch": 0.024444444444444446, "grad_norm": 1.1270484924316406, "learning_rate": 0.0001955456570155902, "loss": 2.1361, "step": 110 }, { "epoch": 0.024666666666666667, "grad_norm": 1.0289149284362793, "learning_rate": 0.0001955011135857461, "loss": 1.892, "step": 111 }, { "epoch": 0.024888888888888887, "grad_norm": 1.0495026111602783, "learning_rate": 0.00019545657015590203, "loss": 1.9626, "step": 112 }, { "epoch": 0.025111111111111112, "grad_norm": 0.8400951623916626, "learning_rate": 0.00019541202672605792, "loss": 1.9681, "step": 113 }, { "epoch": 0.025333333333333333, "grad_norm": 3.4133801460266113, "learning_rate": 0.00019536748329621381, "loss": 0.8974, "step": 114 }, { "epoch": 0.025555555555555557, "grad_norm": 1.6891502141952515, "learning_rate": 0.0001953229398663697, "loss": 0.7522, "step": 115 }, { "epoch": 0.025777777777777778, "grad_norm": 0.8611025810241699, "learning_rate": 0.00019527839643652563, "loss": 2.0537, "step": 116 }, { "epoch": 0.026, "grad_norm": 0.9971293210983276, "learning_rate": 0.00019523385300668154, "loss": 2.2649, "step": 117 }, { "epoch": 0.026222222222222223, "grad_norm": 0.9530083537101746, "learning_rate": 0.00019518930957683744, "loss": 2.0625, "step": 118 }, { "epoch": 0.026444444444444444, "grad_norm": 1.045301079750061, "learning_rate": 0.00019514476614699333, "loss": 2.006, "step": 119 }, { "epoch": 0.02666666666666667, "grad_norm": 0.9277514815330505, "learning_rate": 0.00019510022271714922, "loss": 2.0909, "step": 120 }, { "epoch": 0.02688888888888889, "grad_norm": 1.1190154552459717, "learning_rate": 0.00019505567928730514, "loss": 1.8509, "step": 121 }, { "epoch": 0.02711111111111111, "grad_norm": 1.1135308742523193, "learning_rate": 0.00019501113585746103, "loss": 2.0486, "step": 122 }, { "epoch": 0.027333333333333334, "grad_norm": 1.0613086223602295, "learning_rate": 0.00019496659242761695, "loss": 1.2045, "step": 123 }, { "epoch": 0.027555555555555555, "grad_norm": 1.2695746421813965, "learning_rate": 0.00019492204899777284, "loss": 0.133, "step": 124 }, { "epoch": 0.027777777777777776, "grad_norm": 1.5150560140609741, "learning_rate": 0.00019487750556792873, "loss": 1.3502, "step": 125 }, { "epoch": 0.028, "grad_norm": 1.473061203956604, "learning_rate": 0.00019483296213808465, "loss": 1.2591, "step": 126 }, { "epoch": 0.02822222222222222, "grad_norm": 1.287636637687683, "learning_rate": 0.00019478841870824054, "loss": 1.8108, "step": 127 }, { "epoch": 0.028444444444444446, "grad_norm": 1.033453345298767, "learning_rate": 0.00019474387527839644, "loss": 1.8811, "step": 128 }, { "epoch": 0.028666666666666667, "grad_norm": 1.2280066013336182, "learning_rate": 0.00019469933184855235, "loss": 1.9518, "step": 129 }, { "epoch": 0.028888888888888888, "grad_norm": 1.2945783138275146, "learning_rate": 0.00019465478841870825, "loss": 1.8723, "step": 130 }, { "epoch": 0.029111111111111112, "grad_norm": 1.2305806875228882, "learning_rate": 0.00019461024498886416, "loss": 1.7272, "step": 131 }, { "epoch": 0.029333333333333333, "grad_norm": 1.3530161380767822, "learning_rate": 0.00019456570155902006, "loss": 1.0613, "step": 132 }, { "epoch": 0.029555555555555557, "grad_norm": 0.6455280184745789, "learning_rate": 0.00019452115812917595, "loss": 0.0564, "step": 133 }, { "epoch": 0.029777777777777778, "grad_norm": 0.27219173312187195, "learning_rate": 0.00019447661469933184, "loss": 0.0297, "step": 134 }, { "epoch": 0.03, "grad_norm": 0.8533250093460083, "learning_rate": 0.00019443207126948776, "loss": 1.055, "step": 135 }, { "epoch": 0.030222222222222223, "grad_norm": 2.0201220512390137, "learning_rate": 0.00019438752783964368, "loss": 0.0575, "step": 136 }, { "epoch": 0.030444444444444444, "grad_norm": 0.22158296406269073, "learning_rate": 0.00019434298440979957, "loss": 0.0267, "step": 137 }, { "epoch": 0.030666666666666665, "grad_norm": 0.6912283301353455, "learning_rate": 0.00019429844097995546, "loss": 0.759, "step": 138 }, { "epoch": 0.03088888888888889, "grad_norm": 1.0244457721710205, "learning_rate": 0.00019425389755011135, "loss": 1.8124, "step": 139 }, { "epoch": 0.03111111111111111, "grad_norm": 1.0154163837432861, "learning_rate": 0.00019420935412026727, "loss": 1.2565, "step": 140 }, { "epoch": 0.03133333333333333, "grad_norm": 0.9750798940658569, "learning_rate": 0.0001941648106904232, "loss": 1.3334, "step": 141 }, { "epoch": 0.03155555555555556, "grad_norm": 1.3760290145874023, "learning_rate": 0.00019412026726057908, "loss": 1.674, "step": 142 }, { "epoch": 0.03177777777777778, "grad_norm": 1.2384613752365112, "learning_rate": 0.00019407572383073498, "loss": 1.5803, "step": 143 }, { "epoch": 0.032, "grad_norm": 1.452026128768921, "learning_rate": 0.00019403118040089087, "loss": 1.5312, "step": 144 }, { "epoch": 0.03222222222222222, "grad_norm": 1.0377565622329712, "learning_rate": 0.00019398663697104679, "loss": 1.3206, "step": 145 }, { "epoch": 0.03244444444444444, "grad_norm": 1.2706232070922852, "learning_rate": 0.00019394209354120268, "loss": 0.6841, "step": 146 }, { "epoch": 0.03266666666666666, "grad_norm": 1.1973495483398438, "learning_rate": 0.0001938975501113586, "loss": 1.3782, "step": 147 }, { "epoch": 0.03288888888888889, "grad_norm": 1.2757797241210938, "learning_rate": 0.0001938530066815145, "loss": 1.5044, "step": 148 }, { "epoch": 0.03311111111111111, "grad_norm": 1.0980628728866577, "learning_rate": 0.00019380846325167038, "loss": 0.6844, "step": 149 }, { "epoch": 0.03333333333333333, "grad_norm": 1.2653899192810059, "learning_rate": 0.0001937639198218263, "loss": 1.1744, "step": 150 }, { "epoch": 0.033555555555555554, "grad_norm": 2.2348453998565674, "learning_rate": 0.0001937193763919822, "loss": 0.1349, "step": 151 }, { "epoch": 0.033777777777777775, "grad_norm": 1.0379369258880615, "learning_rate": 0.00019367483296213808, "loss": 1.6603, "step": 152 }, { "epoch": 0.034, "grad_norm": 0.735359251499176, "learning_rate": 0.000193630289532294, "loss": 1.0719, "step": 153 }, { "epoch": 0.03422222222222222, "grad_norm": 1.1146577596664429, "learning_rate": 0.0001935857461024499, "loss": 2.4996, "step": 154 }, { "epoch": 0.034444444444444444, "grad_norm": 0.7083627581596375, "learning_rate": 0.0001935412026726058, "loss": 1.1257, "step": 155 }, { "epoch": 0.034666666666666665, "grad_norm": 0.12564276158809662, "learning_rate": 0.0001934966592427617, "loss": 0.0168, "step": 156 }, { "epoch": 0.034888888888888886, "grad_norm": 0.7023375630378723, "learning_rate": 0.0001934521158129176, "loss": 1.0538, "step": 157 }, { "epoch": 0.035111111111111114, "grad_norm": 0.5180396437644958, "learning_rate": 0.0001934075723830735, "loss": 0.046, "step": 158 }, { "epoch": 0.035333333333333335, "grad_norm": 0.6033398509025574, "learning_rate": 0.0001933630289532294, "loss": 1.0375, "step": 159 }, { "epoch": 0.035555555555555556, "grad_norm": 0.8683068156242371, "learning_rate": 0.00019331848552338533, "loss": 2.1418, "step": 160 }, { "epoch": 0.035777777777777776, "grad_norm": 0.9552950859069824, "learning_rate": 0.00019327394209354122, "loss": 2.2581, "step": 161 }, { "epoch": 0.036, "grad_norm": 0.9460126757621765, "learning_rate": 0.0001932293986636971, "loss": 2.012, "step": 162 }, { "epoch": 0.036222222222222225, "grad_norm": 0.9581560492515564, "learning_rate": 0.000193184855233853, "loss": 2.2566, "step": 163 }, { "epoch": 0.036444444444444446, "grad_norm": 1.3161773681640625, "learning_rate": 0.00019314031180400892, "loss": 1.3755, "step": 164 }, { "epoch": 0.03666666666666667, "grad_norm": 1.008534550666809, "learning_rate": 0.0001930957683741648, "loss": 2.1225, "step": 165 }, { "epoch": 0.03688888888888889, "grad_norm": 0.965151309967041, "learning_rate": 0.00019305122494432073, "loss": 2.0691, "step": 166 }, { "epoch": 0.03711111111111111, "grad_norm": 1.0121870040893555, "learning_rate": 0.00019300668151447662, "loss": 1.7821, "step": 167 }, { "epoch": 0.037333333333333336, "grad_norm": 1.084385871887207, "learning_rate": 0.00019296213808463251, "loss": 0.0566, "step": 168 }, { "epoch": 0.03755555555555556, "grad_norm": 0.6437819600105286, "learning_rate": 0.00019291759465478843, "loss": 0.9437, "step": 169 }, { "epoch": 0.03777777777777778, "grad_norm": 0.8647774457931519, "learning_rate": 0.00019287305122494432, "loss": 1.2514, "step": 170 }, { "epoch": 0.038, "grad_norm": 1.2839748859405518, "learning_rate": 0.00019282850779510022, "loss": 2.1381, "step": 171 }, { "epoch": 0.03822222222222222, "grad_norm": 1.1602987051010132, "learning_rate": 0.00019278396436525614, "loss": 2.0459, "step": 172 }, { "epoch": 0.03844444444444445, "grad_norm": 1.0964981317520142, "learning_rate": 0.00019273942093541203, "loss": 1.8051, "step": 173 }, { "epoch": 0.03866666666666667, "grad_norm": 0.9932026267051697, "learning_rate": 0.00019269487750556795, "loss": 2.048, "step": 174 }, { "epoch": 0.03888888888888889, "grad_norm": 0.9929348826408386, "learning_rate": 0.00019265033407572384, "loss": 1.6021, "step": 175 }, { "epoch": 0.03911111111111111, "grad_norm": 0.9955350756645203, "learning_rate": 0.00019260579064587973, "loss": 1.8276, "step": 176 }, { "epoch": 0.03933333333333333, "grad_norm": 1.1119470596313477, "learning_rate": 0.00019256124721603562, "loss": 2.2424, "step": 177 }, { "epoch": 0.03955555555555555, "grad_norm": 0.9925389885902405, "learning_rate": 0.00019251670378619157, "loss": 1.9117, "step": 178 }, { "epoch": 0.03977777777777778, "grad_norm": 0.7970108985900879, "learning_rate": 0.00019247216035634746, "loss": 0.8588, "step": 179 }, { "epoch": 0.04, "grad_norm": 0.25734761357307434, "learning_rate": 0.00019242761692650335, "loss": 0.0239, "step": 180 }, { "epoch": 0.04022222222222222, "grad_norm": 0.13014006614685059, "learning_rate": 0.00019238307349665924, "loss": 0.0209, "step": 181 }, { "epoch": 0.04044444444444444, "grad_norm": 0.10182005167007446, "learning_rate": 0.00019233853006681513, "loss": 0.0186, "step": 182 }, { "epoch": 0.04066666666666666, "grad_norm": 0.07455065846443176, "learning_rate": 0.00019229398663697105, "loss": 0.0176, "step": 183 }, { "epoch": 0.04088888888888889, "grad_norm": 0.06727147102355957, "learning_rate": 0.00019224944320712697, "loss": 0.0166, "step": 184 }, { "epoch": 0.04111111111111111, "grad_norm": 0.7398682236671448, "learning_rate": 0.00019220489977728286, "loss": 0.7252, "step": 185 }, { "epoch": 0.04133333333333333, "grad_norm": 0.9568517804145813, "learning_rate": 0.00019216035634743876, "loss": 1.0639, "step": 186 }, { "epoch": 0.041555555555555554, "grad_norm": 0.7644314169883728, "learning_rate": 0.00019211581291759468, "loss": 1.0938, "step": 187 }, { "epoch": 0.041777777777777775, "grad_norm": 1.0712711811065674, "learning_rate": 0.00019207126948775057, "loss": 1.9997, "step": 188 }, { "epoch": 0.042, "grad_norm": 1.1801820993423462, "learning_rate": 0.00019202672605790646, "loss": 1.684, "step": 189 }, { "epoch": 0.042222222222222223, "grad_norm": 1.134307861328125, "learning_rate": 0.00019198218262806238, "loss": 1.4712, "step": 190 }, { "epoch": 0.042444444444444444, "grad_norm": 1.0281476974487305, "learning_rate": 0.00019193763919821827, "loss": 1.7053, "step": 191 }, { "epoch": 0.042666666666666665, "grad_norm": 1.0464823246002197, "learning_rate": 0.0001918930957683742, "loss": 1.4212, "step": 192 }, { "epoch": 0.042888888888888886, "grad_norm": 1.1084800958633423, "learning_rate": 0.00019184855233853008, "loss": 1.6954, "step": 193 }, { "epoch": 0.043111111111111114, "grad_norm": 1.5273072719573975, "learning_rate": 0.00019180400890868597, "loss": 1.5331, "step": 194 }, { "epoch": 0.043333333333333335, "grad_norm": 1.0163640975952148, "learning_rate": 0.00019175946547884186, "loss": 0.8302, "step": 195 }, { "epoch": 0.043555555555555556, "grad_norm": 1.1570039987564087, "learning_rate": 0.00019171492204899778, "loss": 1.1332, "step": 196 }, { "epoch": 0.04377777777777778, "grad_norm": 1.3068771362304688, "learning_rate": 0.0001916703786191537, "loss": 0.1635, "step": 197 }, { "epoch": 0.044, "grad_norm": 1.343957781791687, "learning_rate": 0.0001916258351893096, "loss": 1.3181, "step": 198 }, { "epoch": 0.044222222222222225, "grad_norm": 1.087274193763733, "learning_rate": 0.00019158129175946549, "loss": 1.2172, "step": 199 }, { "epoch": 0.044444444444444446, "grad_norm": 1.6285357475280762, "learning_rate": 0.00019153674832962138, "loss": 0.8885, "step": 200 }, { "epoch": 0.04466666666666667, "grad_norm": 0.8212461471557617, "learning_rate": 0.0001914922048997773, "loss": 1.9517, "step": 201 }, { "epoch": 0.04488888888888889, "grad_norm": 0.8975577354431152, "learning_rate": 0.00019144766146993322, "loss": 2.0502, "step": 202 }, { "epoch": 0.04511111111111111, "grad_norm": 0.931280255317688, "learning_rate": 0.0001914031180400891, "loss": 1.0111, "step": 203 }, { "epoch": 0.04533333333333334, "grad_norm": 0.6608829498291016, "learning_rate": 0.000191358574610245, "loss": 1.1096, "step": 204 }, { "epoch": 0.04555555555555556, "grad_norm": 0.7915617227554321, "learning_rate": 0.0001913140311804009, "loss": 2.6039, "step": 205 }, { "epoch": 0.04577777777777778, "grad_norm": 0.6403900980949402, "learning_rate": 0.0001912694877505568, "loss": 1.13, "step": 206 }, { "epoch": 0.046, "grad_norm": 0.9232172966003418, "learning_rate": 0.0001912249443207127, "loss": 2.668, "step": 207 }, { "epoch": 0.04622222222222222, "grad_norm": 0.6349806785583496, "learning_rate": 0.00019118040089086862, "loss": 1.1026, "step": 208 }, { "epoch": 0.04644444444444444, "grad_norm": 0.8131903409957886, "learning_rate": 0.0001911358574610245, "loss": 2.2254, "step": 209 }, { "epoch": 0.04666666666666667, "grad_norm": 0.2445099800825119, "learning_rate": 0.0001910913140311804, "loss": 0.0209, "step": 210 }, { "epoch": 0.04688888888888889, "grad_norm": 0.11807701736688614, "learning_rate": 0.00019104677060133632, "loss": 0.0192, "step": 211 }, { "epoch": 0.04711111111111111, "grad_norm": 0.10483487695455551, "learning_rate": 0.00019100222717149221, "loss": 0.0177, "step": 212 }, { "epoch": 0.04733333333333333, "grad_norm": 0.5437442064285278, "learning_rate": 0.0001909576837416481, "loss": 0.9199, "step": 213 }, { "epoch": 0.04755555555555555, "grad_norm": 0.8952361941337585, "learning_rate": 0.00019091314031180403, "loss": 2.6062, "step": 214 }, { "epoch": 0.04777777777777778, "grad_norm": 0.87530916929245, "learning_rate": 0.00019086859688195992, "loss": 1.98, "step": 215 }, { "epoch": 0.048, "grad_norm": 0.9103797078132629, "learning_rate": 0.00019082405345211584, "loss": 2.021, "step": 216 }, { "epoch": 0.04822222222222222, "grad_norm": 0.8201193809509277, "learning_rate": 0.00019077951002227173, "loss": 2.0562, "step": 217 }, { "epoch": 0.04844444444444444, "grad_norm": 1.1453330516815186, "learning_rate": 0.00019073496659242762, "loss": 1.1635, "step": 218 }, { "epoch": 0.048666666666666664, "grad_norm": 1.24114990234375, "learning_rate": 0.0001906904231625835, "loss": 0.2453, "step": 219 }, { "epoch": 0.04888888888888889, "grad_norm": 0.9633159637451172, "learning_rate": 0.00019064587973273943, "loss": 2.1915, "step": 220 }, { "epoch": 0.04911111111111111, "grad_norm": 0.9862430691719055, "learning_rate": 0.00019060133630289535, "loss": 2.058, "step": 221 }, { "epoch": 0.04933333333333333, "grad_norm": 0.9856882691383362, "learning_rate": 0.00019055679287305124, "loss": 2.0013, "step": 222 }, { "epoch": 0.049555555555555554, "grad_norm": 0.8800470232963562, "learning_rate": 0.00019051224944320713, "loss": 1.7867, "step": 223 }, { "epoch": 0.049777777777777775, "grad_norm": 1.1182115077972412, "learning_rate": 0.00019046770601336302, "loss": 2.261, "step": 224 }, { "epoch": 0.05, "grad_norm": 0.9679086804389954, "learning_rate": 0.00019042316258351894, "loss": 1.9116, "step": 225 }, { "epoch": 0.050222222222222224, "grad_norm": 0.9191752076148987, "learning_rate": 0.00019037861915367486, "loss": 2.0246, "step": 226 }, { "epoch": 0.050444444444444445, "grad_norm": 1.2083994150161743, "learning_rate": 0.00019033407572383075, "loss": 1.1371, "step": 227 }, { "epoch": 0.050666666666666665, "grad_norm": 0.16673077642917633, "learning_rate": 0.00019028953229398665, "loss": 0.0239, "step": 228 }, { "epoch": 0.050888888888888886, "grad_norm": 0.6332354545593262, "learning_rate": 0.00019024498886414254, "loss": 0.8717, "step": 229 }, { "epoch": 0.051111111111111114, "grad_norm": 1.3816114664077759, "learning_rate": 0.00019020044543429846, "loss": 1.0904, "step": 230 }, { "epoch": 0.051333333333333335, "grad_norm": 0.9588896632194519, "learning_rate": 0.00019015590200445435, "loss": 1.856, "step": 231 }, { "epoch": 0.051555555555555556, "grad_norm": 1.0486406087875366, "learning_rate": 0.00019011135857461024, "loss": 2.0886, "step": 232 }, { "epoch": 0.05177777777777778, "grad_norm": 1.0114916563034058, "learning_rate": 0.00019006681514476616, "loss": 1.9553, "step": 233 }, { "epoch": 0.052, "grad_norm": 1.2497199773788452, "learning_rate": 0.00019002227171492205, "loss": 1.8981, "step": 234 }, { "epoch": 0.052222222222222225, "grad_norm": 1.0667015314102173, "learning_rate": 0.00018997772828507797, "loss": 1.8742, "step": 235 }, { "epoch": 0.052444444444444446, "grad_norm": 1.1447402238845825, "learning_rate": 0.00018993318485523386, "loss": 0.0403, "step": 236 }, { "epoch": 0.05266666666666667, "grad_norm": 0.6644120216369629, "learning_rate": 0.00018988864142538975, "loss": 0.9821, "step": 237 }, { "epoch": 0.05288888888888889, "grad_norm": 0.17127251625061035, "learning_rate": 0.00018984409799554565, "loss": 0.0277, "step": 238 }, { "epoch": 0.05311111111111111, "grad_norm": 0.22570157051086426, "learning_rate": 0.00018979955456570156, "loss": 0.0325, "step": 239 }, { "epoch": 0.05333333333333334, "grad_norm": 0.1881849616765976, "learning_rate": 0.00018975501113585748, "loss": 0.0283, "step": 240 }, { "epoch": 0.05355555555555556, "grad_norm": 0.15113665163516998, "learning_rate": 0.00018971046770601337, "loss": 0.0244, "step": 241 }, { "epoch": 0.05377777777777778, "grad_norm": 1.3109371662139893, "learning_rate": 0.00018966592427616927, "loss": 1.7582, "step": 242 }, { "epoch": 0.054, "grad_norm": 1.0388661623001099, "learning_rate": 0.00018962138084632516, "loss": 1.8784, "step": 243 }, { "epoch": 0.05422222222222222, "grad_norm": 1.4733574390411377, "learning_rate": 0.00018957683741648108, "loss": 0.5269, "step": 244 }, { "epoch": 0.05444444444444444, "grad_norm": 1.2060288190841675, "learning_rate": 0.000189532293986637, "loss": 1.9426, "step": 245 }, { "epoch": 0.05466666666666667, "grad_norm": 1.2038888931274414, "learning_rate": 0.0001894877505567929, "loss": 1.7192, "step": 246 }, { "epoch": 0.05488888888888889, "grad_norm": 1.2904036045074463, "learning_rate": 0.00018944320712694878, "loss": 1.5118, "step": 247 }, { "epoch": 0.05511111111111111, "grad_norm": 1.381375789642334, "learning_rate": 0.00018939866369710467, "loss": 0.5868, "step": 248 }, { "epoch": 0.05533333333333333, "grad_norm": 1.606933832168579, "learning_rate": 0.0001893541202672606, "loss": 0.9528, "step": 249 }, { "epoch": 0.05555555555555555, "grad_norm": 1.470360517501831, "learning_rate": 0.00018930957683741648, "loss": 1.0201, "step": 250 }, { "epoch": 0.05577777777777778, "grad_norm": 0.794981837272644, "learning_rate": 0.0001892650334075724, "loss": 1.1911, "step": 251 }, { "epoch": 0.056, "grad_norm": 0.828644871711731, "learning_rate": 0.0001892204899777283, "loss": 2.1317, "step": 252 }, { "epoch": 0.05622222222222222, "grad_norm": 0.10859879851341248, "learning_rate": 0.00018917594654788419, "loss": 0.0136, "step": 253 }, { "epoch": 0.05644444444444444, "grad_norm": 0.9162502288818359, "learning_rate": 0.0001891314031180401, "loss": 2.0738, "step": 254 }, { "epoch": 0.056666666666666664, "grad_norm": 0.7586551308631897, "learning_rate": 0.000189086859688196, "loss": 2.2056, "step": 255 }, { "epoch": 0.05688888888888889, "grad_norm": 0.07891134172677994, "learning_rate": 0.0001890423162583519, "loss": 0.0118, "step": 256 }, { "epoch": 0.05711111111111111, "grad_norm": 0.10793828219175339, "learning_rate": 0.0001889977728285078, "loss": 0.0169, "step": 257 }, { "epoch": 0.05733333333333333, "grad_norm": 0.1842554360628128, "learning_rate": 0.0001889532293986637, "loss": 0.0224, "step": 258 }, { "epoch": 0.057555555555555554, "grad_norm": 0.93645840883255, "learning_rate": 0.00018890868596881962, "loss": 2.4856, "step": 259 }, { "epoch": 0.057777777777777775, "grad_norm": 0.943806529045105, "learning_rate": 0.0001888641425389755, "loss": 2.0643, "step": 260 }, { "epoch": 0.058, "grad_norm": 0.9614866971969604, "learning_rate": 0.0001888195991091314, "loss": 2.2795, "step": 261 }, { "epoch": 0.058222222222222224, "grad_norm": 2.3811917304992676, "learning_rate": 0.0001887750556792873, "loss": 0.1832, "step": 262 }, { "epoch": 0.058444444444444445, "grad_norm": 0.8956352472305298, "learning_rate": 0.00018873051224944324, "loss": 2.0827, "step": 263 }, { "epoch": 0.058666666666666666, "grad_norm": 1.0169123411178589, "learning_rate": 0.00018868596881959913, "loss": 1.9675, "step": 264 }, { "epoch": 0.058888888888888886, "grad_norm": 1.000707745552063, "learning_rate": 0.00018864142538975502, "loss": 2.0002, "step": 265 }, { "epoch": 0.059111111111111114, "grad_norm": 1.0070831775665283, "learning_rate": 0.00018859688195991091, "loss": 1.7638, "step": 266 }, { "epoch": 0.059333333333333335, "grad_norm": 0.9460271596908569, "learning_rate": 0.0001885523385300668, "loss": 1.8534, "step": 267 }, { "epoch": 0.059555555555555556, "grad_norm": 0.7087247371673584, "learning_rate": 0.00018850779510022272, "loss": 1.0192, "step": 268 }, { "epoch": 0.05977777777777778, "grad_norm": 0.16115489602088928, "learning_rate": 0.00018846325167037864, "loss": 0.0249, "step": 269 }, { "epoch": 0.06, "grad_norm": 0.12376585602760315, "learning_rate": 0.00018841870824053454, "loss": 0.0232, "step": 270 }, { "epoch": 0.060222222222222226, "grad_norm": 0.1018003597855568, "learning_rate": 0.00018837416481069043, "loss": 0.0212, "step": 271 }, { "epoch": 0.060444444444444446, "grad_norm": 0.09464185684919357, "learning_rate": 0.00018832962138084635, "loss": 0.0198, "step": 272 }, { "epoch": 0.06066666666666667, "grad_norm": 0.6736340522766113, "learning_rate": 0.00018828507795100224, "loss": 0.9684, "step": 273 }, { "epoch": 0.06088888888888889, "grad_norm": 1.5240976810455322, "learning_rate": 0.00018824053452115813, "loss": 1.2137, "step": 274 }, { "epoch": 0.06111111111111111, "grad_norm": 1.1830003261566162, "learning_rate": 0.00018819599109131405, "loss": 2.0444, "step": 275 }, { "epoch": 0.06133333333333333, "grad_norm": 1.0134773254394531, "learning_rate": 0.00018815144766146994, "loss": 1.898, "step": 276 }, { "epoch": 0.06155555555555556, "grad_norm": 1.1037492752075195, "learning_rate": 0.00018810690423162586, "loss": 1.9418, "step": 277 }, { "epoch": 0.06177777777777778, "grad_norm": 1.1346395015716553, "learning_rate": 0.00018806236080178175, "loss": 1.855, "step": 278 }, { "epoch": 0.062, "grad_norm": 0.996393084526062, "learning_rate": 0.00018801781737193764, "loss": 1.6958, "step": 279 }, { "epoch": 0.06222222222222222, "grad_norm": 0.9884223937988281, "learning_rate": 0.00018797327394209353, "loss": 1.6775, "step": 280 }, { "epoch": 0.06244444444444444, "grad_norm": 1.2472928762435913, "learning_rate": 0.00018792873051224945, "loss": 1.8322, "step": 281 }, { "epoch": 0.06266666666666666, "grad_norm": 0.9096193313598633, "learning_rate": 0.00018788418708240537, "loss": 1.0656, "step": 282 }, { "epoch": 0.06288888888888888, "grad_norm": 0.1850435882806778, "learning_rate": 0.00018783964365256126, "loss": 0.0229, "step": 283 }, { "epoch": 0.06311111111111112, "grad_norm": 0.90726238489151, "learning_rate": 0.00018779510022271716, "loss": 1.882, "step": 284 }, { "epoch": 0.06333333333333334, "grad_norm": 0.9707450270652771, "learning_rate": 0.00018775055679287305, "loss": 1.5804, "step": 285 }, { "epoch": 0.06355555555555556, "grad_norm": 0.7034225463867188, "learning_rate": 0.00018770601336302897, "loss": 0.9516, "step": 286 }, { "epoch": 0.06377777777777778, "grad_norm": 0.742444634437561, "learning_rate": 0.00018766146993318489, "loss": 0.7442, "step": 287 }, { "epoch": 0.064, "grad_norm": 0.6448124647140503, "learning_rate": 0.00018761692650334078, "loss": 0.8479, "step": 288 }, { "epoch": 0.06422222222222222, "grad_norm": 0.9502848982810974, "learning_rate": 0.00018757238307349667, "loss": 1.9858, "step": 289 }, { "epoch": 0.06444444444444444, "grad_norm": 0.966400146484375, "learning_rate": 0.00018752783964365256, "loss": 1.8273, "step": 290 }, { "epoch": 0.06466666666666666, "grad_norm": 0.9974849224090576, "learning_rate": 0.00018748329621380848, "loss": 1.0585, "step": 291 }, { "epoch": 0.06488888888888888, "grad_norm": 1.0357645750045776, "learning_rate": 0.00018743875278396437, "loss": 1.7982, "step": 292 }, { "epoch": 0.0651111111111111, "grad_norm": 1.1910970211029053, "learning_rate": 0.0001873942093541203, "loss": 1.6365, "step": 293 }, { "epoch": 0.06533333333333333, "grad_norm": 1.1395729780197144, "learning_rate": 0.00018734966592427618, "loss": 1.3202, "step": 294 }, { "epoch": 0.06555555555555556, "grad_norm": 1.0338728427886963, "learning_rate": 0.00018730512249443207, "loss": 1.6425, "step": 295 }, { "epoch": 0.06577777777777778, "grad_norm": 1.1223074197769165, "learning_rate": 0.000187260579064588, "loss": 1.7752, "step": 296 }, { "epoch": 0.066, "grad_norm": 1.4117039442062378, "learning_rate": 0.00018721603563474389, "loss": 1.4247, "step": 297 }, { "epoch": 0.06622222222222222, "grad_norm": 1.1682217121124268, "learning_rate": 0.00018717149220489978, "loss": 1.1538, "step": 298 }, { "epoch": 0.06644444444444444, "grad_norm": 0.995919942855835, "learning_rate": 0.0001871269487750557, "loss": 0.6315, "step": 299 }, { "epoch": 0.06666666666666667, "grad_norm": 1.2403305768966675, "learning_rate": 0.0001870824053452116, "loss": 1.0913, "step": 300 }, { "epoch": 0.06688888888888889, "grad_norm": 0.836203932762146, "learning_rate": 0.0001870378619153675, "loss": 1.2099, "step": 301 }, { "epoch": 0.06711111111111111, "grad_norm": 0.5399565696716309, "learning_rate": 0.0001869933184855234, "loss": 1.0186, "step": 302 }, { "epoch": 0.06733333333333333, "grad_norm": 0.5899677276611328, "learning_rate": 0.0001869487750556793, "loss": 1.0829, "step": 303 }, { "epoch": 0.06755555555555555, "grad_norm": 0.8014405965805054, "learning_rate": 0.00018690423162583518, "loss": 2.2654, "step": 304 }, { "epoch": 0.06777777777777778, "grad_norm": 0.6197260022163391, "learning_rate": 0.0001868596881959911, "loss": 1.3951, "step": 305 }, { "epoch": 0.068, "grad_norm": 0.8079295754432678, "learning_rate": 0.00018681514476614702, "loss": 2.2529, "step": 306 }, { "epoch": 0.06822222222222223, "grad_norm": 0.05890868231654167, "learning_rate": 0.0001867706013363029, "loss": 0.0114, "step": 307 }, { "epoch": 0.06844444444444445, "grad_norm": 0.7747896909713745, "learning_rate": 0.0001867260579064588, "loss": 2.1361, "step": 308 }, { "epoch": 0.06866666666666667, "grad_norm": 0.5295421481132507, "learning_rate": 0.0001866815144766147, "loss": 0.0238, "step": 309 }, { "epoch": 0.06888888888888889, "grad_norm": 0.12555040419101715, "learning_rate": 0.00018663697104677061, "loss": 0.019, "step": 310 }, { "epoch": 0.06911111111111111, "grad_norm": 0.11239951103925705, "learning_rate": 0.0001865924276169265, "loss": 0.0172, "step": 311 }, { "epoch": 0.06933333333333333, "grad_norm": 0.6421605944633484, "learning_rate": 0.00018654788418708243, "loss": 1.1458, "step": 312 }, { "epoch": 0.06955555555555555, "grad_norm": 0.9105124473571777, "learning_rate": 0.00018650334075723832, "loss": 1.9677, "step": 313 }, { "epoch": 0.06977777777777777, "grad_norm": 0.8628215789794922, "learning_rate": 0.0001864587973273942, "loss": 2.1423, "step": 314 }, { "epoch": 0.07, "grad_norm": 0.829418957233429, "learning_rate": 0.00018641425389755013, "loss": 1.9332, "step": 315 }, { "epoch": 0.07022222222222223, "grad_norm": 0.8867872357368469, "learning_rate": 0.00018636971046770602, "loss": 1.8072, "step": 316 }, { "epoch": 0.07044444444444445, "grad_norm": 0.9867467284202576, "learning_rate": 0.0001863251670378619, "loss": 1.8567, "step": 317 }, { "epoch": 0.07066666666666667, "grad_norm": 0.9062153100967407, "learning_rate": 0.00018628062360801783, "loss": 1.8609, "step": 318 }, { "epoch": 0.07088888888888889, "grad_norm": 1.1173990964889526, "learning_rate": 0.00018623608017817372, "loss": 2.1317, "step": 319 }, { "epoch": 0.07111111111111111, "grad_norm": 0.10674909502267838, "learning_rate": 0.00018619153674832964, "loss": 0.021, "step": 320 }, { "epoch": 0.07133333333333333, "grad_norm": 0.1044670045375824, "learning_rate": 0.00018614699331848553, "loss": 0.0203, "step": 321 }, { "epoch": 0.07155555555555555, "grad_norm": 0.09776726365089417, "learning_rate": 0.00018610244988864142, "loss": 0.0192, "step": 322 }, { "epoch": 0.07177777777777777, "grad_norm": 1.127001404762268, "learning_rate": 0.00018605790645879732, "loss": 1.9981, "step": 323 }, { "epoch": 0.072, "grad_norm": 0.9693049788475037, "learning_rate": 0.00018601336302895324, "loss": 1.9003, "step": 324 }, { "epoch": 0.07222222222222222, "grad_norm": 0.9398884773254395, "learning_rate": 0.00018596881959910915, "loss": 1.791, "step": 325 }, { "epoch": 0.07244444444444445, "grad_norm": 1.1868557929992676, "learning_rate": 0.00018592427616926505, "loss": 1.7433, "step": 326 }, { "epoch": 0.07266666666666667, "grad_norm": 0.8618929982185364, "learning_rate": 0.00018587973273942094, "loss": 1.9456, "step": 327 }, { "epoch": 0.07288888888888889, "grad_norm": 0.9172239899635315, "learning_rate": 0.00018583518930957683, "loss": 1.7215, "step": 328 }, { "epoch": 0.07311111111111111, "grad_norm": 0.7097941040992737, "learning_rate": 0.00018579064587973275, "loss": 0.9636, "step": 329 }, { "epoch": 0.07333333333333333, "grad_norm": 0.7206116318702698, "learning_rate": 0.00018574610244988867, "loss": 0.9052, "step": 330 }, { "epoch": 0.07355555555555555, "grad_norm": 0.16659016907215118, "learning_rate": 0.00018570155902004456, "loss": 0.0253, "step": 331 }, { "epoch": 0.07377777777777778, "grad_norm": 1.0400179624557495, "learning_rate": 0.00018565701559020045, "loss": 1.6796, "step": 332 }, { "epoch": 0.074, "grad_norm": 0.9435672163963318, "learning_rate": 0.00018561247216035634, "loss": 0.8521, "step": 333 }, { "epoch": 0.07422222222222222, "grad_norm": 0.17228728532791138, "learning_rate": 0.00018556792873051226, "loss": 0.0243, "step": 334 }, { "epoch": 0.07444444444444444, "grad_norm": 0.7825049161911011, "learning_rate": 0.00018552338530066815, "loss": 0.8863, "step": 335 }, { "epoch": 0.07466666666666667, "grad_norm": 1.0093753337860107, "learning_rate": 0.00018547884187082407, "loss": 1.7281, "step": 336 }, { "epoch": 0.0748888888888889, "grad_norm": 1.0921690464019775, "learning_rate": 0.00018543429844097996, "loss": 1.9082, "step": 337 }, { "epoch": 0.07511111111111111, "grad_norm": 0.9549766182899475, "learning_rate": 0.00018538975501113586, "loss": 1.5888, "step": 338 }, { "epoch": 0.07533333333333334, "grad_norm": 1.225373387336731, "learning_rate": 0.00018534521158129177, "loss": 0.9245, "step": 339 }, { "epoch": 0.07555555555555556, "grad_norm": 1.1169060468673706, "learning_rate": 0.00018530066815144767, "loss": 0.1607, "step": 340 }, { "epoch": 0.07577777777777778, "grad_norm": 1.0006355047225952, "learning_rate": 0.00018525612472160356, "loss": 1.6492, "step": 341 }, { "epoch": 0.076, "grad_norm": 1.0142918825149536, "learning_rate": 0.00018521158129175948, "loss": 1.5253, "step": 342 }, { "epoch": 0.07622222222222222, "grad_norm": 0.9755372405052185, "learning_rate": 0.00018516703786191537, "loss": 1.6062, "step": 343 }, { "epoch": 0.07644444444444444, "grad_norm": 1.1996437311172485, "learning_rate": 0.0001851224944320713, "loss": 1.6435, "step": 344 }, { "epoch": 0.07666666666666666, "grad_norm": 1.0422567129135132, "learning_rate": 0.00018507795100222718, "loss": 1.6154, "step": 345 }, { "epoch": 0.0768888888888889, "grad_norm": 1.4139487743377686, "learning_rate": 0.00018503340757238307, "loss": 1.6726, "step": 346 }, { "epoch": 0.07711111111111112, "grad_norm": 1.4194035530090332, "learning_rate": 0.00018498886414253896, "loss": 0.7896, "step": 347 }, { "epoch": 0.07733333333333334, "grad_norm": 1.0651965141296387, "learning_rate": 0.0001849443207126949, "loss": 1.315, "step": 348 }, { "epoch": 0.07755555555555556, "grad_norm": 1.0679761171340942, "learning_rate": 0.0001848997772828508, "loss": 0.6501, "step": 349 }, { "epoch": 0.07777777777777778, "grad_norm": 1.3450720310211182, "learning_rate": 0.0001848552338530067, "loss": 0.7368, "step": 350 }, { "epoch": 0.078, "grad_norm": 0.6705034375190735, "learning_rate": 0.00018481069042316258, "loss": 1.0454, "step": 351 }, { "epoch": 0.07822222222222222, "grad_norm": 0.6062948703765869, "learning_rate": 0.00018476614699331848, "loss": 1.2369, "step": 352 }, { "epoch": 0.07844444444444444, "grad_norm": 0.562647819519043, "learning_rate": 0.0001847216035634744, "loss": 1.186, "step": 353 }, { "epoch": 0.07866666666666666, "grad_norm": 0.8215838670730591, "learning_rate": 0.00018467706013363031, "loss": 0.035, "step": 354 }, { "epoch": 0.07888888888888888, "grad_norm": 0.6498701572418213, "learning_rate": 0.0001846325167037862, "loss": 1.097, "step": 355 }, { "epoch": 0.0791111111111111, "grad_norm": 0.9627291560173035, "learning_rate": 0.0001845879732739421, "loss": 2.1706, "step": 356 }, { "epoch": 0.07933333333333334, "grad_norm": 0.8392488956451416, "learning_rate": 0.00018454342984409802, "loss": 2.1539, "step": 357 }, { "epoch": 0.07955555555555556, "grad_norm": 0.9004356265068054, "learning_rate": 0.0001844988864142539, "loss": 2.0944, "step": 358 }, { "epoch": 0.07977777777777778, "grad_norm": 0.8827551603317261, "learning_rate": 0.0001844543429844098, "loss": 2.1067, "step": 359 }, { "epoch": 0.08, "grad_norm": 0.9497137665748596, "learning_rate": 0.00018440979955456572, "loss": 2.0297, "step": 360 }, { "epoch": 0.08022222222222222, "grad_norm": 0.9877284169197083, "learning_rate": 0.0001843652561247216, "loss": 2.2244, "step": 361 }, { "epoch": 0.08044444444444444, "grad_norm": 0.9175549149513245, "learning_rate": 0.00018432071269487753, "loss": 1.9345, "step": 362 }, { "epoch": 0.08066666666666666, "grad_norm": 0.9076710343360901, "learning_rate": 0.00018427616926503342, "loss": 1.0216, "step": 363 }, { "epoch": 0.08088888888888889, "grad_norm": 0.9432811141014099, "learning_rate": 0.00018423162583518931, "loss": 1.9628, "step": 364 }, { "epoch": 0.0811111111111111, "grad_norm": 0.9499055743217468, "learning_rate": 0.0001841870824053452, "loss": 1.7118, "step": 365 }, { "epoch": 0.08133333333333333, "grad_norm": 0.8877860903739929, "learning_rate": 0.00018414253897550112, "loss": 1.9912, "step": 366 }, { "epoch": 0.08155555555555556, "grad_norm": 0.8878340125083923, "learning_rate": 0.00018409799554565704, "loss": 1.9927, "step": 367 }, { "epoch": 0.08177777777777778, "grad_norm": 0.9800185561180115, "learning_rate": 0.00018405345211581294, "loss": 1.9707, "step": 368 }, { "epoch": 0.082, "grad_norm": 0.7973767518997192, "learning_rate": 0.00018400890868596883, "loss": 1.0049, "step": 369 }, { "epoch": 0.08222222222222222, "grad_norm": 0.5313000679016113, "learning_rate": 0.00018396436525612472, "loss": 0.0383, "step": 370 }, { "epoch": 0.08244444444444445, "grad_norm": 0.6772900223731995, "learning_rate": 0.00018391982182628064, "loss": 0.9766, "step": 371 }, { "epoch": 0.08266666666666667, "grad_norm": 0.8833468556404114, "learning_rate": 0.00018387527839643656, "loss": 1.6439, "step": 372 }, { "epoch": 0.08288888888888889, "grad_norm": 1.4322277307510376, "learning_rate": 0.00018383073496659245, "loss": 0.9063, "step": 373 }, { "epoch": 0.08311111111111111, "grad_norm": 0.9009195566177368, "learning_rate": 0.00018378619153674834, "loss": 1.6793, "step": 374 }, { "epoch": 0.08333333333333333, "grad_norm": 1.0847877264022827, "learning_rate": 0.00018374164810690423, "loss": 1.9706, "step": 375 }, { "epoch": 0.08355555555555555, "grad_norm": 1.0555421113967896, "learning_rate": 0.00018369710467706015, "loss": 1.7983, "step": 376 }, { "epoch": 0.08377777777777778, "grad_norm": 1.0266549587249756, "learning_rate": 0.00018365256124721604, "loss": 1.5035, "step": 377 }, { "epoch": 0.084, "grad_norm": 1.0186165571212769, "learning_rate": 0.00018360801781737193, "loss": 1.6932, "step": 378 }, { "epoch": 0.08422222222222223, "grad_norm": 0.9408406019210815, "learning_rate": 0.00018356347438752785, "loss": 0.8905, "step": 379 }, { "epoch": 0.08444444444444445, "grad_norm": 0.11006919294595718, "learning_rate": 0.00018351893095768375, "loss": 0.0205, "step": 380 }, { "epoch": 0.08466666666666667, "grad_norm": 0.6134664416313171, "learning_rate": 0.00018347438752783966, "loss": 0.8471, "step": 381 }, { "epoch": 0.08488888888888889, "grad_norm": 0.9574106931686401, "learning_rate": 0.00018342984409799556, "loss": 1.5896, "step": 382 }, { "epoch": 0.08511111111111111, "grad_norm": 0.7427169680595398, "learning_rate": 0.00018338530066815145, "loss": 0.827, "step": 383 }, { "epoch": 0.08533333333333333, "grad_norm": 1.155964970588684, "learning_rate": 0.00018334075723830734, "loss": 1.7048, "step": 384 }, { "epoch": 0.08555555555555555, "grad_norm": 0.9930922389030457, "learning_rate": 0.00018329621380846326, "loss": 1.7281, "step": 385 }, { "epoch": 0.08577777777777777, "grad_norm": 1.097965955734253, "learning_rate": 0.00018325167037861918, "loss": 1.4611, "step": 386 }, { "epoch": 0.086, "grad_norm": 3.5689327716827393, "learning_rate": 0.00018320712694877507, "loss": 0.2311, "step": 387 }, { "epoch": 0.08622222222222223, "grad_norm": 1.0425963401794434, "learning_rate": 0.00018316258351893096, "loss": 1.7319, "step": 388 }, { "epoch": 0.08644444444444445, "grad_norm": 1.156557559967041, "learning_rate": 0.00018311804008908685, "loss": 1.505, "step": 389 }, { "epoch": 0.08666666666666667, "grad_norm": 1.0377581119537354, "learning_rate": 0.00018307349665924277, "loss": 1.6431, "step": 390 }, { "epoch": 0.08688888888888889, "grad_norm": 1.3024321794509888, "learning_rate": 0.0001830289532293987, "loss": 1.7298, "step": 391 }, { "epoch": 0.08711111111111111, "grad_norm": 1.0300354957580566, "learning_rate": 0.00018298440979955458, "loss": 1.6316, "step": 392 }, { "epoch": 0.08733333333333333, "grad_norm": 1.046774983406067, "learning_rate": 0.00018293986636971047, "loss": 1.5083, "step": 393 }, { "epoch": 0.08755555555555555, "grad_norm": 1.32634437084198, "learning_rate": 0.00018289532293986637, "loss": 1.6296, "step": 394 }, { "epoch": 0.08777777777777777, "grad_norm": 2.0546815395355225, "learning_rate": 0.00018285077951002229, "loss": 0.1777, "step": 395 }, { "epoch": 0.088, "grad_norm": 0.8895161151885986, "learning_rate": 0.00018280623608017818, "loss": 0.6762, "step": 396 }, { "epoch": 0.08822222222222222, "grad_norm": 0.8843004703521729, "learning_rate": 0.0001827616926503341, "loss": 0.8162, "step": 397 }, { "epoch": 0.08844444444444445, "grad_norm": 1.2154911756515503, "learning_rate": 0.00018271714922049, "loss": 1.4117, "step": 398 }, { "epoch": 0.08866666666666667, "grad_norm": 0.9329721331596375, "learning_rate": 0.00018267260579064588, "loss": 0.7163, "step": 399 }, { "epoch": 0.08888888888888889, "grad_norm": 1.130262017250061, "learning_rate": 0.0001826280623608018, "loss": 1.2238, "step": 400 }, { "epoch": 0.08911111111111111, "grad_norm": 0.5905479192733765, "learning_rate": 0.0001825835189309577, "loss": 1.2077, "step": 401 }, { "epoch": 0.08933333333333333, "grad_norm": 0.8485738039016724, "learning_rate": 0.00018253897550111358, "loss": 2.3062, "step": 402 }, { "epoch": 0.08955555555555555, "grad_norm": 0.16456733644008636, "learning_rate": 0.0001824944320712695, "loss": 0.0196, "step": 403 }, { "epoch": 0.08977777777777778, "grad_norm": 0.540153443813324, "learning_rate": 0.0001824498886414254, "loss": 1.0257, "step": 404 }, { "epoch": 0.09, "grad_norm": 0.09294180572032928, "learning_rate": 0.0001824053452115813, "loss": 0.0164, "step": 405 }, { "epoch": 0.09022222222222222, "grad_norm": 0.7190051078796387, "learning_rate": 0.0001823608017817372, "loss": 2.529, "step": 406 }, { "epoch": 0.09044444444444444, "grad_norm": 0.7573959231376648, "learning_rate": 0.0001823162583518931, "loss": 0.95, "step": 407 }, { "epoch": 0.09066666666666667, "grad_norm": 0.13494077324867249, "learning_rate": 0.000182271714922049, "loss": 0.0194, "step": 408 }, { "epoch": 0.0908888888888889, "grad_norm": 0.11461230367422104, "learning_rate": 0.0001822271714922049, "loss": 0.0177, "step": 409 }, { "epoch": 0.09111111111111111, "grad_norm": 0.6490556001663208, "learning_rate": 0.00018218262806236082, "loss": 0.9504, "step": 410 }, { "epoch": 0.09133333333333334, "grad_norm": 0.8228305578231812, "learning_rate": 0.00018213808463251672, "loss": 2.0239, "step": 411 }, { "epoch": 0.09155555555555556, "grad_norm": 0.898671567440033, "learning_rate": 0.0001820935412026726, "loss": 2.1288, "step": 412 }, { "epoch": 0.09177777777777778, "grad_norm": 0.8291831016540527, "learning_rate": 0.0001820489977728285, "loss": 2.1028, "step": 413 }, { "epoch": 0.092, "grad_norm": 0.937248706817627, "learning_rate": 0.00018200445434298442, "loss": 2.1206, "step": 414 }, { "epoch": 0.09222222222222222, "grad_norm": 0.8091291785240173, "learning_rate": 0.00018195991091314034, "loss": 2.1107, "step": 415 }, { "epoch": 0.09244444444444444, "grad_norm": 0.9196256995201111, "learning_rate": 0.00018191536748329623, "loss": 2.1269, "step": 416 }, { "epoch": 0.09266666666666666, "grad_norm": 0.7829540371894836, "learning_rate": 0.00018187082405345212, "loss": 1.0987, "step": 417 }, { "epoch": 0.09288888888888888, "grad_norm": 0.9346635937690735, "learning_rate": 0.00018182628062360801, "loss": 1.8687, "step": 418 }, { "epoch": 0.09311111111111112, "grad_norm": 0.906173825263977, "learning_rate": 0.00018178173719376393, "loss": 1.9742, "step": 419 }, { "epoch": 0.09333333333333334, "grad_norm": 0.833707869052887, "learning_rate": 0.00018173719376391982, "loss": 1.8876, "step": 420 }, { "epoch": 0.09355555555555556, "grad_norm": 0.8492452502250671, "learning_rate": 0.00018169265033407574, "loss": 1.9469, "step": 421 }, { "epoch": 0.09377777777777778, "grad_norm": 0.846052885055542, "learning_rate": 0.00018164810690423164, "loss": 1.887, "step": 422 }, { "epoch": 0.094, "grad_norm": 0.7105516195297241, "learning_rate": 0.00018160356347438753, "loss": 0.8668, "step": 423 }, { "epoch": 0.09422222222222222, "grad_norm": 0.40860867500305176, "learning_rate": 0.00018155902004454345, "loss": 0.025, "step": 424 }, { "epoch": 0.09444444444444444, "grad_norm": 0.08055032789707184, "learning_rate": 0.00018151447661469934, "loss": 0.0187, "step": 425 }, { "epoch": 0.09466666666666666, "grad_norm": 0.9038375616073608, "learning_rate": 0.00018146993318485523, "loss": 1.8731, "step": 426 }, { "epoch": 0.09488888888888888, "grad_norm": 1.0034536123275757, "learning_rate": 0.00018142538975501115, "loss": 1.6233, "step": 427 }, { "epoch": 0.0951111111111111, "grad_norm": 1.02449631690979, "learning_rate": 0.00018138084632516704, "loss": 1.9286, "step": 428 }, { "epoch": 0.09533333333333334, "grad_norm": 0.9773498773574829, "learning_rate": 0.00018133630289532296, "loss": 1.7909, "step": 429 }, { "epoch": 0.09555555555555556, "grad_norm": 1.028286099433899, "learning_rate": 0.00018129175946547885, "loss": 1.5749, "step": 430 }, { "epoch": 0.09577777777777778, "grad_norm": 0.18419918417930603, "learning_rate": 0.00018124721603563474, "loss": 0.0204, "step": 431 }, { "epoch": 0.096, "grad_norm": 0.08870963752269745, "learning_rate": 0.00018120267260579063, "loss": 0.0192, "step": 432 }, { "epoch": 0.09622222222222222, "grad_norm": 0.08107131719589233, "learning_rate": 0.00018115812917594658, "loss": 0.0187, "step": 433 }, { "epoch": 0.09644444444444444, "grad_norm": 0.07466530054807663, "learning_rate": 0.00018111358574610247, "loss": 0.0179, "step": 434 }, { "epoch": 0.09666666666666666, "grad_norm": 0.7753478288650513, "learning_rate": 0.00018106904231625836, "loss": 0.9339, "step": 435 }, { "epoch": 0.09688888888888889, "grad_norm": 1.0347474813461304, "learning_rate": 0.00018102449888641426, "loss": 1.786, "step": 436 }, { "epoch": 0.0971111111111111, "grad_norm": 1.0018888711929321, "learning_rate": 0.00018097995545657015, "loss": 1.0405, "step": 437 }, { "epoch": 0.09733333333333333, "grad_norm": 0.12741631269454956, "learning_rate": 0.00018093541202672607, "loss": 0.0246, "step": 438 }, { "epoch": 0.09755555555555556, "grad_norm": 0.720927357673645, "learning_rate": 0.00018089086859688199, "loss": 0.7423, "step": 439 }, { "epoch": 0.09777777777777778, "grad_norm": 1.0436959266662598, "learning_rate": 0.00018084632516703788, "loss": 1.7813, "step": 440 }, { "epoch": 0.098, "grad_norm": 1.33415687084198, "learning_rate": 0.00018080178173719377, "loss": 0.1569, "step": 441 }, { "epoch": 0.09822222222222222, "grad_norm": 1.0239771604537964, "learning_rate": 0.0001807572383073497, "loss": 1.4739, "step": 442 }, { "epoch": 0.09844444444444445, "grad_norm": 1.022449254989624, "learning_rate": 0.00018071269487750558, "loss": 1.4619, "step": 443 }, { "epoch": 0.09866666666666667, "grad_norm": 1.7489303350448608, "learning_rate": 0.00018066815144766147, "loss": 0.1832, "step": 444 }, { "epoch": 0.09888888888888889, "grad_norm": 0.7640416026115417, "learning_rate": 0.00018062360801781736, "loss": 0.0761, "step": 445 }, { "epoch": 0.09911111111111111, "grad_norm": 1.0594218969345093, "learning_rate": 0.00018057906458797328, "loss": 1.0882, "step": 446 }, { "epoch": 0.09933333333333333, "grad_norm": 1.0107744932174683, "learning_rate": 0.0001805345211581292, "loss": 1.2503, "step": 447 }, { "epoch": 0.09955555555555555, "grad_norm": 1.194696307182312, "learning_rate": 0.0001804899777282851, "loss": 1.3876, "step": 448 }, { "epoch": 0.09977777777777778, "grad_norm": 0.9181436896324158, "learning_rate": 0.00018044543429844098, "loss": 0.6689, "step": 449 }, { "epoch": 0.1, "grad_norm": 0.9961203932762146, "learning_rate": 0.00018040089086859688, "loss": 0.9308, "step": 450 }, { "epoch": 0.10022222222222223, "grad_norm": 0.6209729909896851, "learning_rate": 0.00018035634743875277, "loss": 1.0967, "step": 451 }, { "epoch": 0.10044444444444445, "grad_norm": 0.7032824158668518, "learning_rate": 0.00018031180400890871, "loss": 1.3153, "step": 452 }, { "epoch": 0.10066666666666667, "grad_norm": 0.6422486305236816, "learning_rate": 0.0001802672605790646, "loss": 1.1794, "step": 453 }, { "epoch": 0.10088888888888889, "grad_norm": 0.5714130401611328, "learning_rate": 0.0001802227171492205, "loss": 1.1083, "step": 454 }, { "epoch": 0.10111111111111111, "grad_norm": 0.9129486680030823, "learning_rate": 0.0001801781737193764, "loss": 2.1855, "step": 455 }, { "epoch": 0.10133333333333333, "grad_norm": 0.7295732498168945, "learning_rate": 0.0001801336302895323, "loss": 0.0452, "step": 456 }, { "epoch": 0.10155555555555555, "grad_norm": 0.5325131416320801, "learning_rate": 0.0001800890868596882, "loss": 0.0173, "step": 457 }, { "epoch": 0.10177777777777777, "grad_norm": 1.0180753469467163, "learning_rate": 0.00018004454342984412, "loss": 2.1597, "step": 458 }, { "epoch": 0.102, "grad_norm": 0.9186641573905945, "learning_rate": 0.00018, "loss": 2.1838, "step": 459 }, { "epoch": 0.10222222222222223, "grad_norm": 0.9034368991851807, "learning_rate": 0.0001799554565701559, "loss": 2.1579, "step": 460 }, { "epoch": 0.10244444444444445, "grad_norm": 1.2893496751785278, "learning_rate": 0.00017991091314031182, "loss": 2.286, "step": 461 }, { "epoch": 0.10266666666666667, "grad_norm": 0.9287530183792114, "learning_rate": 0.00017986636971046771, "loss": 2.3557, "step": 462 }, { "epoch": 0.10288888888888889, "grad_norm": 1.0746686458587646, "learning_rate": 0.0001798218262806236, "loss": 2.1398, "step": 463 }, { "epoch": 0.10311111111111111, "grad_norm": 0.9413710236549377, "learning_rate": 0.00017977728285077952, "loss": 1.0248, "step": 464 }, { "epoch": 0.10333333333333333, "grad_norm": 0.9778950810432434, "learning_rate": 0.00017973273942093542, "loss": 2.1938, "step": 465 }, { "epoch": 0.10355555555555555, "grad_norm": 0.9324243068695068, "learning_rate": 0.00017968819599109134, "loss": 2.0434, "step": 466 }, { "epoch": 0.10377777777777777, "grad_norm": 0.8695129156112671, "learning_rate": 0.00017964365256124723, "loss": 1.858, "step": 467 }, { "epoch": 0.104, "grad_norm": 1.1168073415756226, "learning_rate": 0.00017959910913140312, "loss": 2.002, "step": 468 }, { "epoch": 0.10422222222222222, "grad_norm": 0.9692973494529724, "learning_rate": 0.000179554565701559, "loss": 2.1136, "step": 469 }, { "epoch": 0.10444444444444445, "grad_norm": 0.23175086081027985, "learning_rate": 0.00017951002227171493, "loss": 0.0257, "step": 470 }, { "epoch": 0.10466666666666667, "grad_norm": 0.12664885818958282, "learning_rate": 0.00017946547884187085, "loss": 0.024, "step": 471 }, { "epoch": 0.10488888888888889, "grad_norm": 0.11350343376398087, "learning_rate": 0.00017942093541202674, "loss": 0.0225, "step": 472 }, { "epoch": 0.10511111111111111, "grad_norm": 0.0981689915060997, "learning_rate": 0.00017937639198218263, "loss": 0.021, "step": 473 }, { "epoch": 0.10533333333333333, "grad_norm": 0.7038472294807434, "learning_rate": 0.00017933184855233852, "loss": 0.8939, "step": 474 }, { "epoch": 0.10555555555555556, "grad_norm": 1.1517345905303955, "learning_rate": 0.00017928730512249444, "loss": 1.8588, "step": 475 }, { "epoch": 0.10577777777777778, "grad_norm": 1.0799177885055542, "learning_rate": 0.00017924276169265036, "loss": 1.8593, "step": 476 }, { "epoch": 0.106, "grad_norm": 1.1787912845611572, "learning_rate": 0.00017919821826280625, "loss": 1.9679, "step": 477 }, { "epoch": 0.10622222222222222, "grad_norm": 0.9503030776977539, "learning_rate": 0.00017915367483296215, "loss": 1.7726, "step": 478 }, { "epoch": 0.10644444444444444, "grad_norm": 0.9899899959564209, "learning_rate": 0.00017910913140311804, "loss": 1.6583, "step": 479 }, { "epoch": 0.10666666666666667, "grad_norm": 0.8547096848487854, "learning_rate": 0.00017906458797327396, "loss": 0.9547, "step": 480 }, { "epoch": 0.1068888888888889, "grad_norm": 0.08154784888029099, "learning_rate": 0.00017902004454342985, "loss": 0.0182, "step": 481 }, { "epoch": 0.10711111111111112, "grad_norm": 1.203589916229248, "learning_rate": 0.00017897550111358577, "loss": 2.0223, "step": 482 }, { "epoch": 0.10733333333333334, "grad_norm": 0.5660258531570435, "learning_rate": 0.00017893095768374166, "loss": 0.0261, "step": 483 }, { "epoch": 0.10755555555555556, "grad_norm": 0.12165828794240952, "learning_rate": 0.00017888641425389755, "loss": 0.0219, "step": 484 }, { "epoch": 0.10777777777777778, "grad_norm": 0.1016518846154213, "learning_rate": 0.00017884187082405347, "loss": 0.0209, "step": 485 }, { "epoch": 0.108, "grad_norm": 0.7895167469978333, "learning_rate": 0.00017879732739420936, "loss": 0.781, "step": 486 }, { "epoch": 0.10822222222222222, "grad_norm": 0.9849477410316467, "learning_rate": 0.00017875278396436525, "loss": 1.7806, "step": 487 }, { "epoch": 0.10844444444444444, "grad_norm": 0.9108963012695312, "learning_rate": 0.00017870824053452117, "loss": 1.4698, "step": 488 }, { "epoch": 0.10866666666666666, "grad_norm": 1.078587532043457, "learning_rate": 0.00017866369710467706, "loss": 1.5214, "step": 489 }, { "epoch": 0.10888888888888888, "grad_norm": 1.0179158449172974, "learning_rate": 0.00017861915367483298, "loss": 1.8058, "step": 490 }, { "epoch": 0.10911111111111112, "grad_norm": 1.0984735488891602, "learning_rate": 0.00017857461024498887, "loss": 1.5218, "step": 491 }, { "epoch": 0.10933333333333334, "grad_norm": 1.056720495223999, "learning_rate": 0.00017853006681514477, "loss": 1.5723, "step": 492 }, { "epoch": 0.10955555555555556, "grad_norm": 1.1733689308166504, "learning_rate": 0.00017848552338530066, "loss": 1.5902, "step": 493 }, { "epoch": 0.10977777777777778, "grad_norm": 1.0748651027679443, "learning_rate": 0.00017844097995545658, "loss": 0.7313, "step": 494 }, { "epoch": 0.11, "grad_norm": 1.1146743297576904, "learning_rate": 0.0001783964365256125, "loss": 1.5575, "step": 495 }, { "epoch": 0.11022222222222222, "grad_norm": 0.9686447978019714, "learning_rate": 0.0001783518930957684, "loss": 1.26, "step": 496 }, { "epoch": 0.11044444444444444, "grad_norm": 0.9595248103141785, "learning_rate": 0.00017830734966592428, "loss": 1.2048, "step": 497 }, { "epoch": 0.11066666666666666, "grad_norm": 0.655178964138031, "learning_rate": 0.00017826280623608017, "loss": 0.0465, "step": 498 }, { "epoch": 0.11088888888888888, "grad_norm": 0.8750471472740173, "learning_rate": 0.0001782182628062361, "loss": 0.8124, "step": 499 }, { "epoch": 0.1111111111111111, "grad_norm": 0.869178831577301, "learning_rate": 0.000178173719376392, "loss": 0.5776, "step": 500 }, { "epoch": 0.11133333333333334, "grad_norm": 0.641015887260437, "learning_rate": 0.0001781291759465479, "loss": 0.9489, "step": 501 }, { "epoch": 0.11155555555555556, "grad_norm": 0.6184130907058716, "learning_rate": 0.0001780846325167038, "loss": 1.1484, "step": 502 }, { "epoch": 0.11177777777777778, "grad_norm": 0.6213683485984802, "learning_rate": 0.00017804008908685968, "loss": 1.2721, "step": 503 }, { "epoch": 0.112, "grad_norm": 0.857179582118988, "learning_rate": 0.0001779955456570156, "loss": 2.2039, "step": 504 }, { "epoch": 0.11222222222222222, "grad_norm": 0.9155113101005554, "learning_rate": 0.0001779510022271715, "loss": 2.0145, "step": 505 }, { "epoch": 0.11244444444444444, "grad_norm": 0.8243066668510437, "learning_rate": 0.00017790645879732741, "loss": 0.0276, "step": 506 }, { "epoch": 0.11266666666666666, "grad_norm": 0.13185575604438782, "learning_rate": 0.0001778619153674833, "loss": 0.0172, "step": 507 }, { "epoch": 0.11288888888888889, "grad_norm": 0.5929608345031738, "learning_rate": 0.0001778173719376392, "loss": 1.1099, "step": 508 }, { "epoch": 0.1131111111111111, "grad_norm": 0.821940004825592, "learning_rate": 0.00017777282850779512, "loss": 2.2026, "step": 509 }, { "epoch": 0.11333333333333333, "grad_norm": 0.9007627964019775, "learning_rate": 0.000177728285077951, "loss": 2.0279, "step": 510 }, { "epoch": 0.11355555555555556, "grad_norm": 0.8976812958717346, "learning_rate": 0.0001776837416481069, "loss": 2.1622, "step": 511 }, { "epoch": 0.11377777777777778, "grad_norm": 0.8461301326751709, "learning_rate": 0.00017763919821826282, "loss": 2.0182, "step": 512 }, { "epoch": 0.114, "grad_norm": 0.9555535912513733, "learning_rate": 0.0001775946547884187, "loss": 2.3448, "step": 513 }, { "epoch": 0.11422222222222222, "grad_norm": 0.6102076768875122, "learning_rate": 0.00017755011135857463, "loss": 0.8332, "step": 514 }, { "epoch": 0.11444444444444445, "grad_norm": 0.9315692782402039, "learning_rate": 0.00017750556792873052, "loss": 1.9508, "step": 515 }, { "epoch": 0.11466666666666667, "grad_norm": 0.7640470266342163, "learning_rate": 0.0001774610244988864, "loss": 1.9453, "step": 516 }, { "epoch": 0.11488888888888889, "grad_norm": 0.9399512410163879, "learning_rate": 0.0001774164810690423, "loss": 1.8861, "step": 517 }, { "epoch": 0.11511111111111111, "grad_norm": 0.9297406673431396, "learning_rate": 0.00017737193763919822, "loss": 1.956, "step": 518 }, { "epoch": 0.11533333333333333, "grad_norm": 0.9970346689224243, "learning_rate": 0.00017732739420935414, "loss": 1.9636, "step": 519 }, { "epoch": 0.11555555555555555, "grad_norm": 0.1605161428451538, "learning_rate": 0.00017728285077951003, "loss": 0.0191, "step": 520 }, { "epoch": 0.11577777777777777, "grad_norm": 0.09369970113039017, "learning_rate": 0.00017723830734966593, "loss": 0.0181, "step": 521 }, { "epoch": 0.116, "grad_norm": 0.6291989088058472, "learning_rate": 0.00017719376391982182, "loss": 1.0345, "step": 522 }, { "epoch": 0.11622222222222223, "grad_norm": 1.45033597946167, "learning_rate": 0.00017714922048997774, "loss": 0.9683, "step": 523 }, { "epoch": 0.11644444444444445, "grad_norm": 0.7875054478645325, "learning_rate": 0.00017710467706013363, "loss": 0.087, "step": 524 }, { "epoch": 0.11666666666666667, "grad_norm": 0.9223285913467407, "learning_rate": 0.00017706013363028955, "loss": 1.9496, "step": 525 }, { "epoch": 0.11688888888888889, "grad_norm": 1.0288112163543701, "learning_rate": 0.00017701559020044544, "loss": 1.9752, "step": 526 }, { "epoch": 0.11711111111111111, "grad_norm": 0.9108681678771973, "learning_rate": 0.00017697104677060133, "loss": 1.8513, "step": 527 }, { "epoch": 0.11733333333333333, "grad_norm": 0.996670126914978, "learning_rate": 0.00017692650334075725, "loss": 1.9809, "step": 528 }, { "epoch": 0.11755555555555555, "grad_norm": 0.9788877964019775, "learning_rate": 0.00017688195991091314, "loss": 1.8143, "step": 529 }, { "epoch": 0.11777777777777777, "grad_norm": 0.751679539680481, "learning_rate": 0.00017683741648106903, "loss": 0.9896, "step": 530 }, { "epoch": 0.118, "grad_norm": 0.07957503199577332, "learning_rate": 0.00017679287305122495, "loss": 0.019, "step": 531 }, { "epoch": 0.11822222222222223, "grad_norm": 0.06963716447353363, "learning_rate": 0.00017674832962138087, "loss": 0.0187, "step": 532 }, { "epoch": 0.11844444444444445, "grad_norm": 0.6804819703102112, "learning_rate": 0.00017670378619153676, "loss": 0.8408, "step": 533 }, { "epoch": 0.11866666666666667, "grad_norm": 1.1230911016464233, "learning_rate": 0.00017665924276169266, "loss": 2.0286, "step": 534 }, { "epoch": 0.11888888888888889, "grad_norm": 0.2663741409778595, "learning_rate": 0.00017661469933184855, "loss": 0.0247, "step": 535 }, { "epoch": 0.11911111111111111, "grad_norm": 0.09472407400608063, "learning_rate": 0.00017657015590200444, "loss": 0.0229, "step": 536 }, { "epoch": 0.11933333333333333, "grad_norm": 0.08869557082653046, "learning_rate": 0.00017652561247216039, "loss": 0.0217, "step": 537 }, { "epoch": 0.11955555555555555, "grad_norm": 0.9782452583312988, "learning_rate": 0.00017648106904231628, "loss": 1.7486, "step": 538 }, { "epoch": 0.11977777777777777, "grad_norm": 1.1253771781921387, "learning_rate": 0.00017643652561247217, "loss": 1.7666, "step": 539 }, { "epoch": 0.12, "grad_norm": 1.0017434358596802, "learning_rate": 0.00017639198218262806, "loss": 1.6833, "step": 540 }, { "epoch": 0.12022222222222222, "grad_norm": 1.0089191198349, "learning_rate": 0.00017634743875278398, "loss": 1.7523, "step": 541 }, { "epoch": 0.12044444444444445, "grad_norm": 1.0553641319274902, "learning_rate": 0.00017630289532293987, "loss": 1.647, "step": 542 }, { "epoch": 0.12066666666666667, "grad_norm": 1.0351313352584839, "learning_rate": 0.0001762583518930958, "loss": 1.6262, "step": 543 }, { "epoch": 0.12088888888888889, "grad_norm": 1.1512898206710815, "learning_rate": 0.00017621380846325168, "loss": 1.7247, "step": 544 }, { "epoch": 0.12111111111111111, "grad_norm": 1.083144187927246, "learning_rate": 0.00017616926503340757, "loss": 1.491, "step": 545 }, { "epoch": 0.12133333333333333, "grad_norm": 1.0339199304580688, "learning_rate": 0.0001761247216035635, "loss": 1.31, "step": 546 }, { "epoch": 0.12155555555555556, "grad_norm": 1.1164246797561646, "learning_rate": 0.00017608017817371938, "loss": 1.3082, "step": 547 }, { "epoch": 0.12177777777777778, "grad_norm": 1.182440161705017, "learning_rate": 0.00017603563474387528, "loss": 0.9501, "step": 548 }, { "epoch": 0.122, "grad_norm": 2.1058719158172607, "learning_rate": 0.0001759910913140312, "loss": 0.073, "step": 549 }, { "epoch": 0.12222222222222222, "grad_norm": 1.0356662273406982, "learning_rate": 0.0001759465478841871, "loss": 0.8053, "step": 550 }, { "epoch": 0.12244444444444444, "grad_norm": 0.09837248176336288, "learning_rate": 0.000175902004454343, "loss": 0.0157, "step": 551 }, { "epoch": 0.12266666666666666, "grad_norm": 0.5652306079864502, "learning_rate": 0.0001758574610244989, "loss": 1.1591, "step": 552 }, { "epoch": 0.1228888888888889, "grad_norm": 0.07858684659004211, "learning_rate": 0.0001758129175946548, "loss": 0.0148, "step": 553 }, { "epoch": 0.12311111111111112, "grad_norm": 0.8684788942337036, "learning_rate": 0.00017576837416481068, "loss": 2.0654, "step": 554 }, { "epoch": 0.12333333333333334, "grad_norm": 0.067326620221138, "learning_rate": 0.0001757238307349666, "loss": 0.0133, "step": 555 }, { "epoch": 0.12355555555555556, "grad_norm": 0.9072126150131226, "learning_rate": 0.00017567928730512252, "loss": 2.1198, "step": 556 }, { "epoch": 0.12377777777777778, "grad_norm": 0.6821531653404236, "learning_rate": 0.0001756347438752784, "loss": 1.1766, "step": 557 }, { "epoch": 0.124, "grad_norm": 0.8932785987854004, "learning_rate": 0.0001755902004454343, "loss": 2.179, "step": 558 }, { "epoch": 0.12422222222222222, "grad_norm": 0.8057580590248108, "learning_rate": 0.0001755456570155902, "loss": 1.7459, "step": 559 }, { "epoch": 0.12444444444444444, "grad_norm": 0.9386541843414307, "learning_rate": 0.00017550111358574611, "loss": 2.0665, "step": 560 }, { "epoch": 0.12466666666666666, "grad_norm": 0.820952296257019, "learning_rate": 0.00017545657015590203, "loss": 2.0064, "step": 561 }, { "epoch": 0.12488888888888888, "grad_norm": 0.8475558757781982, "learning_rate": 0.00017541202672605792, "loss": 2.1712, "step": 562 }, { "epoch": 0.12511111111111112, "grad_norm": 0.9829273223876953, "learning_rate": 0.00017536748329621382, "loss": 2.2744, "step": 563 }, { "epoch": 0.12533333333333332, "grad_norm": 0.9070901870727539, "learning_rate": 0.0001753229398663697, "loss": 1.0953, "step": 564 }, { "epoch": 0.12555555555555556, "grad_norm": 0.6405161619186401, "learning_rate": 0.00017527839643652563, "loss": 0.967, "step": 565 }, { "epoch": 0.12577777777777777, "grad_norm": 1.083997368812561, "learning_rate": 0.00017523385300668152, "loss": 2.2172, "step": 566 }, { "epoch": 0.126, "grad_norm": 0.8409084677696228, "learning_rate": 0.00017518930957683744, "loss": 1.9096, "step": 567 }, { "epoch": 0.12622222222222224, "grad_norm": 0.9304640889167786, "learning_rate": 0.00017514476614699333, "loss": 1.9116, "step": 568 }, { "epoch": 0.12644444444444444, "grad_norm": 0.07137549668550491, "learning_rate": 0.00017510022271714922, "loss": 0.0168, "step": 569 }, { "epoch": 0.12666666666666668, "grad_norm": 0.06879518926143646, "learning_rate": 0.00017505567928730514, "loss": 0.0169, "step": 570 }, { "epoch": 0.12688888888888888, "grad_norm": 0.7226182818412781, "learning_rate": 0.00017501113585746103, "loss": 0.9012, "step": 571 }, { "epoch": 0.12711111111111112, "grad_norm": 0.9200636148452759, "learning_rate": 0.00017496659242761692, "loss": 1.8988, "step": 572 }, { "epoch": 0.12733333333333333, "grad_norm": 1.2475532293319702, "learning_rate": 0.00017492204899777284, "loss": 2.0776, "step": 573 }, { "epoch": 0.12755555555555556, "grad_norm": 0.07300052046775818, "learning_rate": 0.00017487750556792873, "loss": 0.019, "step": 574 }, { "epoch": 0.12777777777777777, "grad_norm": 0.06808430701494217, "learning_rate": 0.00017483296213808465, "loss": 0.0189, "step": 575 }, { "epoch": 0.128, "grad_norm": 1.7508424520492554, "learning_rate": 0.00017478841870824055, "loss": 2.1227, "step": 576 }, { "epoch": 0.1282222222222222, "grad_norm": 1.0882468223571777, "learning_rate": 0.00017474387527839644, "loss": 2.0598, "step": 577 }, { "epoch": 0.12844444444444444, "grad_norm": 0.14829707145690918, "learning_rate": 0.00017469933184855233, "loss": 0.0228, "step": 578 }, { "epoch": 0.12866666666666668, "grad_norm": 0.13807116448879242, "learning_rate": 0.00017465478841870825, "loss": 0.0218, "step": 579 }, { "epoch": 0.1288888888888889, "grad_norm": 0.7114683985710144, "learning_rate": 0.00017461024498886417, "loss": 1.0263, "step": 580 }, { "epoch": 0.12911111111111112, "grad_norm": 0.7122150659561157, "learning_rate": 0.00017456570155902006, "loss": 0.8802, "step": 581 }, { "epoch": 0.12933333333333333, "grad_norm": 0.08912209421396255, "learning_rate": 0.00017452115812917595, "loss": 0.0176, "step": 582 }, { "epoch": 0.12955555555555556, "grad_norm": 0.0869002416729927, "learning_rate": 0.00017447661469933184, "loss": 0.0162, "step": 583 }, { "epoch": 0.12977777777777777, "grad_norm": 0.8936485052108765, "learning_rate": 0.00017443207126948776, "loss": 1.4582, "step": 584 }, { "epoch": 0.13, "grad_norm": 0.9836113452911377, "learning_rate": 0.00017438752783964368, "loss": 1.0513, "step": 585 }, { "epoch": 0.1302222222222222, "grad_norm": 0.9093931317329407, "learning_rate": 0.00017434298440979957, "loss": 0.1017, "step": 586 }, { "epoch": 0.13044444444444445, "grad_norm": 0.7786886692047119, "learning_rate": 0.00017429844097995546, "loss": 0.7586, "step": 587 }, { "epoch": 0.13066666666666665, "grad_norm": 1.0884865522384644, "learning_rate": 0.00017425389755011136, "loss": 1.6373, "step": 588 }, { "epoch": 0.1308888888888889, "grad_norm": 1.0809407234191895, "learning_rate": 0.00017420935412026727, "loss": 1.7491, "step": 589 }, { "epoch": 0.13111111111111112, "grad_norm": 1.408219814300537, "learning_rate": 0.00017416481069042317, "loss": 1.6617, "step": 590 }, { "epoch": 0.13133333333333333, "grad_norm": 0.9930030107498169, "learning_rate": 0.00017412026726057906, "loss": 1.385, "step": 591 }, { "epoch": 0.13155555555555556, "grad_norm": 1.1311132907867432, "learning_rate": 0.00017407572383073498, "loss": 1.7926, "step": 592 }, { "epoch": 0.13177777777777777, "grad_norm": 1.1787432432174683, "learning_rate": 0.00017403118040089087, "loss": 1.7932, "step": 593 }, { "epoch": 0.132, "grad_norm": 0.9596449732780457, "learning_rate": 0.0001739866369710468, "loss": 1.4321, "step": 594 }, { "epoch": 0.1322222222222222, "grad_norm": 0.9271344542503357, "learning_rate": 0.00017394209354120268, "loss": 0.0491, "step": 595 }, { "epoch": 0.13244444444444445, "grad_norm": 0.2056574523448944, "learning_rate": 0.00017389755011135857, "loss": 0.0362, "step": 596 }, { "epoch": 0.13266666666666665, "grad_norm": 1.0598512887954712, "learning_rate": 0.00017385300668151446, "loss": 1.1693, "step": 597 }, { "epoch": 0.1328888888888889, "grad_norm": 1.6673024892807007, "learning_rate": 0.00017380846325167038, "loss": 0.5844, "step": 598 }, { "epoch": 0.13311111111111112, "grad_norm": 1.1234303712844849, "learning_rate": 0.0001737639198218263, "loss": 0.923, "step": 599 }, { "epoch": 0.13333333333333333, "grad_norm": 1.1665160655975342, "learning_rate": 0.0001737193763919822, "loss": 1.1461, "step": 600 }, { "epoch": 0.13355555555555557, "grad_norm": 0.761227548122406, "learning_rate": 0.00017367483296213808, "loss": 1.2228, "step": 601 }, { "epoch": 0.13377777777777777, "grad_norm": 0.7757149338722229, "learning_rate": 0.00017363028953229398, "loss": 2.3424, "step": 602 }, { "epoch": 0.134, "grad_norm": 0.6811862587928772, "learning_rate": 0.0001735857461024499, "loss": 1.3366, "step": 603 }, { "epoch": 0.13422222222222221, "grad_norm": 0.6670216917991638, "learning_rate": 0.00017354120267260581, "loss": 0.5969, "step": 604 }, { "epoch": 0.13444444444444445, "grad_norm": 0.7554097771644592, "learning_rate": 0.0001734966592427617, "loss": 2.1288, "step": 605 }, { "epoch": 0.13466666666666666, "grad_norm": 0.7412663698196411, "learning_rate": 0.0001734521158129176, "loss": 2.1018, "step": 606 }, { "epoch": 0.1348888888888889, "grad_norm": 0.14801454544067383, "learning_rate": 0.0001734075723830735, "loss": 0.0229, "step": 607 }, { "epoch": 0.1351111111111111, "grad_norm": 0.1401568502187729, "learning_rate": 0.0001733630289532294, "loss": 0.0219, "step": 608 }, { "epoch": 0.13533333333333333, "grad_norm": 0.1252715289592743, "learning_rate": 0.0001733184855233853, "loss": 0.0204, "step": 609 }, { "epoch": 0.13555555555555557, "grad_norm": 0.8645430207252502, "learning_rate": 0.00017327394209354122, "loss": 2.121, "step": 610 }, { "epoch": 0.13577777777777778, "grad_norm": 0.651623010635376, "learning_rate": 0.0001732293986636971, "loss": 2.0482, "step": 611 }, { "epoch": 0.136, "grad_norm": 0.8796990513801575, "learning_rate": 0.000173184855233853, "loss": 1.9341, "step": 612 }, { "epoch": 0.13622222222222222, "grad_norm": 0.7759367227554321, "learning_rate": 0.00017314031180400892, "loss": 2.0492, "step": 613 }, { "epoch": 0.13644444444444445, "grad_norm": 0.758148729801178, "learning_rate": 0.0001730957683741648, "loss": 1.8866, "step": 614 }, { "epoch": 0.13666666666666666, "grad_norm": 0.7975385785102844, "learning_rate": 0.0001730512249443207, "loss": 1.946, "step": 615 }, { "epoch": 0.1368888888888889, "grad_norm": 1.0211257934570312, "learning_rate": 0.00017300668151447662, "loss": 2.042, "step": 616 }, { "epoch": 0.1371111111111111, "grad_norm": 0.8569313883781433, "learning_rate": 0.00017296213808463254, "loss": 1.9781, "step": 617 }, { "epoch": 0.13733333333333334, "grad_norm": 0.9642595052719116, "learning_rate": 0.00017291759465478843, "loss": 1.9, "step": 618 }, { "epoch": 0.13755555555555554, "grad_norm": 0.24051399528980255, "learning_rate": 0.00017287305122494433, "loss": 0.0233, "step": 619 }, { "epoch": 0.13777777777777778, "grad_norm": 0.13666865229606628, "learning_rate": 0.00017282850779510022, "loss": 0.0208, "step": 620 }, { "epoch": 0.138, "grad_norm": 0.7538687586784363, "learning_rate": 0.0001727839643652561, "loss": 1.0854, "step": 621 }, { "epoch": 0.13822222222222222, "grad_norm": 1.1495360136032104, "learning_rate": 0.00017273942093541206, "loss": 1.1, "step": 622 }, { "epoch": 0.13844444444444445, "grad_norm": 0.6540763974189758, "learning_rate": 0.00017269487750556795, "loss": 0.8361, "step": 623 }, { "epoch": 0.13866666666666666, "grad_norm": 1.0281493663787842, "learning_rate": 0.00017265033407572384, "loss": 1.9939, "step": 624 }, { "epoch": 0.1388888888888889, "grad_norm": 1.0349078178405762, "learning_rate": 0.00017260579064587973, "loss": 1.778, "step": 625 }, { "epoch": 0.1391111111111111, "grad_norm": 0.9192053079605103, "learning_rate": 0.00017256124721603565, "loss": 1.8163, "step": 626 }, { "epoch": 0.13933333333333334, "grad_norm": 1.1019245386123657, "learning_rate": 0.00017251670378619154, "loss": 1.8964, "step": 627 }, { "epoch": 0.13955555555555554, "grad_norm": 0.9387298226356506, "learning_rate": 0.00017247216035634746, "loss": 1.8093, "step": 628 }, { "epoch": 0.13977777777777778, "grad_norm": 1.047958254814148, "learning_rate": 0.00017242761692650335, "loss": 1.7654, "step": 629 }, { "epoch": 0.14, "grad_norm": 1.093125820159912, "learning_rate": 0.00017238307349665924, "loss": 1.1535, "step": 630 }, { "epoch": 0.14022222222222222, "grad_norm": 0.12394639104604721, "learning_rate": 0.00017233853006681516, "loss": 0.0218, "step": 631 }, { "epoch": 0.14044444444444446, "grad_norm": 0.09156838059425354, "learning_rate": 0.00017229398663697106, "loss": 0.0206, "step": 632 }, { "epoch": 0.14066666666666666, "grad_norm": 1.1327197551727295, "learning_rate": 0.00017224944320712695, "loss": 1.872, "step": 633 }, { "epoch": 0.1408888888888889, "grad_norm": 0.38535261154174805, "learning_rate": 0.00017220489977728287, "loss": 0.0376, "step": 634 }, { "epoch": 0.1411111111111111, "grad_norm": 0.2830149531364441, "learning_rate": 0.00017216035634743876, "loss": 0.0275, "step": 635 }, { "epoch": 0.14133333333333334, "grad_norm": 0.09745927155017853, "learning_rate": 0.00017211581291759468, "loss": 0.0205, "step": 636 }, { "epoch": 0.14155555555555555, "grad_norm": 0.998629093170166, "learning_rate": 0.00017207126948775057, "loss": 1.4353, "step": 637 }, { "epoch": 0.14177777777777778, "grad_norm": 1.019489049911499, "learning_rate": 0.00017202672605790646, "loss": 1.4431, "step": 638 }, { "epoch": 0.142, "grad_norm": 1.311251163482666, "learning_rate": 0.00017198218262806235, "loss": 1.9691, "step": 639 }, { "epoch": 0.14222222222222222, "grad_norm": 1.1369127035140991, "learning_rate": 0.00017193763919821827, "loss": 1.7863, "step": 640 }, { "epoch": 0.14244444444444446, "grad_norm": 0.9947224855422974, "learning_rate": 0.0001718930957683742, "loss": 1.429, "step": 641 }, { "epoch": 0.14266666666666666, "grad_norm": 0.7390214800834656, "learning_rate": 0.00017184855233853008, "loss": 0.7239, "step": 642 }, { "epoch": 0.1428888888888889, "grad_norm": 0.2698360085487366, "learning_rate": 0.00017180400890868597, "loss": 0.0362, "step": 643 }, { "epoch": 0.1431111111111111, "grad_norm": 0.7463746070861816, "learning_rate": 0.00017175946547884187, "loss": 0.898, "step": 644 }, { "epoch": 0.14333333333333334, "grad_norm": 1.0978554487228394, "learning_rate": 0.00017171492204899778, "loss": 1.2833, "step": 645 }, { "epoch": 0.14355555555555555, "grad_norm": 1.149170160293579, "learning_rate": 0.0001716703786191537, "loss": 1.2463, "step": 646 }, { "epoch": 0.14377777777777778, "grad_norm": 2.201732873916626, "learning_rate": 0.0001716258351893096, "loss": 0.869, "step": 647 }, { "epoch": 0.144, "grad_norm": 2.3375518321990967, "learning_rate": 0.0001715812917594655, "loss": 0.3554, "step": 648 }, { "epoch": 0.14422222222222222, "grad_norm": 1.1565347909927368, "learning_rate": 0.00017153674832962138, "loss": 0.5603, "step": 649 }, { "epoch": 0.14444444444444443, "grad_norm": 1.1238269805908203, "learning_rate": 0.0001714922048997773, "loss": 1.0216, "step": 650 }, { "epoch": 0.14466666666666667, "grad_norm": 0.861092209815979, "learning_rate": 0.0001714476614699332, "loss": 2.2131, "step": 651 }, { "epoch": 0.1448888888888889, "grad_norm": 0.0988718643784523, "learning_rate": 0.0001714031180400891, "loss": 0.0149, "step": 652 }, { "epoch": 0.1451111111111111, "grad_norm": 0.5644282698631287, "learning_rate": 0.000171358574610245, "loss": 0.9802, "step": 653 }, { "epoch": 0.14533333333333334, "grad_norm": 0.6775911450386047, "learning_rate": 0.0001713140311804009, "loss": 1.1163, "step": 654 }, { "epoch": 0.14555555555555555, "grad_norm": 0.6742568016052246, "learning_rate": 0.0001712694877505568, "loss": 1.1633, "step": 655 }, { "epoch": 0.14577777777777778, "grad_norm": 0.10246681421995163, "learning_rate": 0.0001712249443207127, "loss": 0.0136, "step": 656 }, { "epoch": 0.146, "grad_norm": 0.09264359623193741, "learning_rate": 0.0001711804008908686, "loss": 0.0126, "step": 657 }, { "epoch": 0.14622222222222223, "grad_norm": 0.07406888157129288, "learning_rate": 0.00017113585746102451, "loss": 0.0117, "step": 658 }, { "epoch": 0.14644444444444443, "grad_norm": 0.06493738293647766, "learning_rate": 0.0001710913140311804, "loss": 0.0107, "step": 659 }, { "epoch": 0.14666666666666667, "grad_norm": 0.9421645998954773, "learning_rate": 0.00017104677060133632, "loss": 2.1287, "step": 660 }, { "epoch": 0.1468888888888889, "grad_norm": 0.9334849119186401, "learning_rate": 0.00017100222717149222, "loss": 2.1281, "step": 661 }, { "epoch": 0.1471111111111111, "grad_norm": 1.126652717590332, "learning_rate": 0.0001709576837416481, "loss": 2.1985, "step": 662 }, { "epoch": 0.14733333333333334, "grad_norm": 0.8477734327316284, "learning_rate": 0.000170913140311804, "loss": 2.2331, "step": 663 }, { "epoch": 0.14755555555555555, "grad_norm": 0.816444456577301, "learning_rate": 0.00017086859688195992, "loss": 1.9682, "step": 664 }, { "epoch": 0.14777777777777779, "grad_norm": 1.1034094095230103, "learning_rate": 0.00017082405345211584, "loss": 1.0565, "step": 665 }, { "epoch": 0.148, "grad_norm": 0.9575863480567932, "learning_rate": 0.00017077951002227173, "loss": 1.9457, "step": 666 }, { "epoch": 0.14822222222222223, "grad_norm": 0.8643515706062317, "learning_rate": 0.00017073496659242762, "loss": 1.969, "step": 667 }, { "epoch": 0.14844444444444443, "grad_norm": 0.7901911735534668, "learning_rate": 0.0001706904231625835, "loss": 1.6202, "step": 668 }, { "epoch": 0.14866666666666667, "grad_norm": 0.8725628852844238, "learning_rate": 0.00017064587973273943, "loss": 2.0964, "step": 669 }, { "epoch": 0.14888888888888888, "grad_norm": 0.8935587406158447, "learning_rate": 0.00017060133630289532, "loss": 1.7493, "step": 670 }, { "epoch": 0.1491111111111111, "grad_norm": 0.7131580114364624, "learning_rate": 0.00017055679287305124, "loss": 0.9353, "step": 671 }, { "epoch": 0.14933333333333335, "grad_norm": 0.30260053277015686, "learning_rate": 0.00017051224944320713, "loss": 0.0378, "step": 672 }, { "epoch": 0.14955555555555555, "grad_norm": 0.7780633568763733, "learning_rate": 0.00017046770601336303, "loss": 1.0137, "step": 673 }, { "epoch": 0.1497777777777778, "grad_norm": 0.9263336062431335, "learning_rate": 0.00017042316258351895, "loss": 1.3275, "step": 674 }, { "epoch": 0.15, "grad_norm": 0.9188753366470337, "learning_rate": 0.00017037861915367484, "loss": 1.8992, "step": 675 }, { "epoch": 0.15022222222222223, "grad_norm": 0.9691118597984314, "learning_rate": 0.00017033407572383073, "loss": 2.0906, "step": 676 }, { "epoch": 0.15044444444444444, "grad_norm": 0.9938292503356934, "learning_rate": 0.00017028953229398665, "loss": 1.769, "step": 677 }, { "epoch": 0.15066666666666667, "grad_norm": 1.172528862953186, "learning_rate": 0.00017024498886414254, "loss": 1.808, "step": 678 }, { "epoch": 0.15088888888888888, "grad_norm": 0.8911821842193604, "learning_rate": 0.00017020044543429846, "loss": 1.8644, "step": 679 }, { "epoch": 0.1511111111111111, "grad_norm": 0.9470701217651367, "learning_rate": 0.00017015590200445435, "loss": 1.9587, "step": 680 }, { "epoch": 0.15133333333333332, "grad_norm": 1.2212241888046265, "learning_rate": 0.00017011135857461024, "loss": 0.0495, "step": 681 }, { "epoch": 0.15155555555555555, "grad_norm": 0.15241730213165283, "learning_rate": 0.00017006681514476613, "loss": 0.0224, "step": 682 }, { "epoch": 0.1517777777777778, "grad_norm": 1.2180373668670654, "learning_rate": 0.00017002227171492205, "loss": 1.7299, "step": 683 }, { "epoch": 0.152, "grad_norm": 0.9515765905380249, "learning_rate": 0.00016997772828507797, "loss": 1.1014, "step": 684 }, { "epoch": 0.15222222222222223, "grad_norm": 0.10555847734212875, "learning_rate": 0.00016993318485523386, "loss": 0.0218, "step": 685 }, { "epoch": 0.15244444444444444, "grad_norm": 0.0902755856513977, "learning_rate": 0.00016988864142538976, "loss": 0.0213, "step": 686 }, { "epoch": 0.15266666666666667, "grad_norm": 0.08572933077812195, "learning_rate": 0.00016984409799554565, "loss": 0.0203, "step": 687 }, { "epoch": 0.15288888888888888, "grad_norm": 0.6668169498443604, "learning_rate": 0.00016979955456570157, "loss": 0.8351, "step": 688 }, { "epoch": 0.15311111111111111, "grad_norm": 1.3807684183120728, "learning_rate": 0.00016975501113585748, "loss": 2.0387, "step": 689 }, { "epoch": 0.15333333333333332, "grad_norm": 1.0587692260742188, "learning_rate": 0.00016971046770601338, "loss": 1.6551, "step": 690 }, { "epoch": 0.15355555555555556, "grad_norm": 1.378057837486267, "learning_rate": 0.00016966592427616927, "loss": 0.8331, "step": 691 }, { "epoch": 0.1537777777777778, "grad_norm": 1.1262686252593994, "learning_rate": 0.00016962138084632516, "loss": 1.6237, "step": 692 }, { "epoch": 0.154, "grad_norm": 1.0472062826156616, "learning_rate": 0.00016957683741648108, "loss": 1.7357, "step": 693 }, { "epoch": 0.15422222222222223, "grad_norm": 0.9540035128593445, "learning_rate": 0.00016953229398663697, "loss": 1.4822, "step": 694 }, { "epoch": 0.15444444444444444, "grad_norm": 1.032220721244812, "learning_rate": 0.0001694877505567929, "loss": 1.5291, "step": 695 }, { "epoch": 0.15466666666666667, "grad_norm": 0.7844957709312439, "learning_rate": 0.00016944320712694878, "loss": 0.8609, "step": 696 }, { "epoch": 0.15488888888888888, "grad_norm": 1.222839117050171, "learning_rate": 0.00016939866369710467, "loss": 1.2355, "step": 697 }, { "epoch": 0.15511111111111112, "grad_norm": 1.9266964197158813, "learning_rate": 0.0001693541202672606, "loss": 0.6971, "step": 698 }, { "epoch": 0.15533333333333332, "grad_norm": 0.49530452489852905, "learning_rate": 0.00016930957683741648, "loss": 0.0478, "step": 699 }, { "epoch": 0.15555555555555556, "grad_norm": 0.9992147088050842, "learning_rate": 0.00016926503340757238, "loss": 0.8504, "step": 700 }, { "epoch": 0.15577777777777777, "grad_norm": 0.5562880635261536, "learning_rate": 0.0001692204899777283, "loss": 1.2278, "step": 701 }, { "epoch": 0.156, "grad_norm": 0.5927205681800842, "learning_rate": 0.00016917594654788421, "loss": 1.1753, "step": 702 }, { "epoch": 0.15622222222222223, "grad_norm": 0.7892579436302185, "learning_rate": 0.0001691314031180401, "loss": 2.1196, "step": 703 }, { "epoch": 0.15644444444444444, "grad_norm": 0.09951931238174438, "learning_rate": 0.000169086859688196, "loss": 0.0159, "step": 704 }, { "epoch": 0.15666666666666668, "grad_norm": 0.08787377178668976, "learning_rate": 0.0001690423162583519, "loss": 0.0151, "step": 705 }, { "epoch": 0.15688888888888888, "grad_norm": 0.50344318151474, "learning_rate": 0.00016899777282850778, "loss": 1.1949, "step": 706 }, { "epoch": 0.15711111111111112, "grad_norm": 0.8544764518737793, "learning_rate": 0.00016895322939866373, "loss": 2.3102, "step": 707 }, { "epoch": 0.15733333333333333, "grad_norm": 0.9083617925643921, "learning_rate": 0.00016890868596881962, "loss": 2.1908, "step": 708 }, { "epoch": 0.15755555555555556, "grad_norm": 0.7772009968757629, "learning_rate": 0.0001688641425389755, "loss": 2.3081, "step": 709 }, { "epoch": 0.15777777777777777, "grad_norm": 0.7839867472648621, "learning_rate": 0.0001688195991091314, "loss": 1.919, "step": 710 }, { "epoch": 0.158, "grad_norm": 0.8075196743011475, "learning_rate": 0.00016877505567928732, "loss": 1.9511, "step": 711 }, { "epoch": 0.1582222222222222, "grad_norm": 0.9218339920043945, "learning_rate": 0.0001687305122494432, "loss": 1.9041, "step": 712 }, { "epoch": 0.15844444444444444, "grad_norm": 0.7846603989601135, "learning_rate": 0.00016868596881959913, "loss": 1.8966, "step": 713 }, { "epoch": 0.15866666666666668, "grad_norm": 1.1635181903839111, "learning_rate": 0.00016864142538975502, "loss": 2.0407, "step": 714 }, { "epoch": 0.15888888888888889, "grad_norm": 0.7846897840499878, "learning_rate": 0.00016859688195991092, "loss": 0.9193, "step": 715 }, { "epoch": 0.15911111111111112, "grad_norm": 0.31325796246528625, "learning_rate": 0.00016855233853006683, "loss": 0.035, "step": 716 }, { "epoch": 0.15933333333333333, "grad_norm": 1.016501545906067, "learning_rate": 0.00016850779510022273, "loss": 2.0913, "step": 717 }, { "epoch": 0.15955555555555556, "grad_norm": 0.9303516149520874, "learning_rate": 0.00016846325167037862, "loss": 1.9976, "step": 718 }, { "epoch": 0.15977777777777777, "grad_norm": 1.1509366035461426, "learning_rate": 0.00016841870824053454, "loss": 1.6855, "step": 719 }, { "epoch": 0.16, "grad_norm": 1.0439561605453491, "learning_rate": 0.00016837416481069043, "loss": 1.7674, "step": 720 }, { "epoch": 0.1602222222222222, "grad_norm": 0.27185630798339844, "learning_rate": 0.00016832962138084635, "loss": 0.0249, "step": 721 }, { "epoch": 0.16044444444444445, "grad_norm": 0.8196635246276855, "learning_rate": 0.00016828507795100224, "loss": 1.0722, "step": 722 }, { "epoch": 0.16066666666666668, "grad_norm": 0.6817071437835693, "learning_rate": 0.00016824053452115813, "loss": 1.0584, "step": 723 }, { "epoch": 0.1608888888888889, "grad_norm": 0.8211132884025574, "learning_rate": 0.00016819599109131402, "loss": 1.0687, "step": 724 }, { "epoch": 0.16111111111111112, "grad_norm": 0.23781214654445648, "learning_rate": 0.00016815144766146994, "loss": 0.0279, "step": 725 }, { "epoch": 0.16133333333333333, "grad_norm": 0.9902861714363098, "learning_rate": 0.00016810690423162586, "loss": 1.8951, "step": 726 }, { "epoch": 0.16155555555555556, "grad_norm": 1.0102611780166626, "learning_rate": 0.00016806236080178175, "loss": 1.7121, "step": 727 }, { "epoch": 0.16177777777777777, "grad_norm": 1.0301183462142944, "learning_rate": 0.00016801781737193764, "loss": 1.636, "step": 728 }, { "epoch": 0.162, "grad_norm": 0.9955403804779053, "learning_rate": 0.00016797327394209354, "loss": 0.9496, "step": 729 }, { "epoch": 0.1622222222222222, "grad_norm": 0.2391827553510666, "learning_rate": 0.00016792873051224946, "loss": 0.035, "step": 730 }, { "epoch": 0.16244444444444445, "grad_norm": 0.1952289491891861, "learning_rate": 0.00016788418708240537, "loss": 0.0305, "step": 731 }, { "epoch": 0.16266666666666665, "grad_norm": 0.13709303736686707, "learning_rate": 0.00016783964365256127, "loss": 0.0259, "step": 732 }, { "epoch": 0.1628888888888889, "grad_norm": 0.969248354434967, "learning_rate": 0.00016779510022271716, "loss": 1.7231, "step": 733 }, { "epoch": 0.16311111111111112, "grad_norm": 1.2497774362564087, "learning_rate": 0.00016775055679287305, "loss": 1.639, "step": 734 }, { "epoch": 0.16333333333333333, "grad_norm": 0.10687464475631714, "learning_rate": 0.00016770601336302897, "loss": 0.021, "step": 735 }, { "epoch": 0.16355555555555557, "grad_norm": 0.10326875746250153, "learning_rate": 0.00016766146993318486, "loss": 0.021, "step": 736 }, { "epoch": 0.16377777777777777, "grad_norm": 0.08747762441635132, "learning_rate": 0.00016761692650334075, "loss": 0.0201, "step": 737 }, { "epoch": 0.164, "grad_norm": 0.09365817159414291, "learning_rate": 0.00016757238307349667, "loss": 0.0191, "step": 738 }, { "epoch": 0.16422222222222221, "grad_norm": 0.9810875058174133, "learning_rate": 0.00016752783964365256, "loss": 1.5407, "step": 739 }, { "epoch": 0.16444444444444445, "grad_norm": 1.0869085788726807, "learning_rate": 0.00016748329621380848, "loss": 1.8049, "step": 740 }, { "epoch": 0.16466666666666666, "grad_norm": 1.1577770709991455, "learning_rate": 0.00016743875278396437, "loss": 1.9633, "step": 741 }, { "epoch": 0.1648888888888889, "grad_norm": 1.1007702350616455, "learning_rate": 0.00016739420935412027, "loss": 1.6518, "step": 742 }, { "epoch": 0.1651111111111111, "grad_norm": 1.4112728834152222, "learning_rate": 0.00016734966592427616, "loss": 1.7209, "step": 743 }, { "epoch": 0.16533333333333333, "grad_norm": 1.1396406888961792, "learning_rate": 0.00016730512249443208, "loss": 1.666, "step": 744 }, { "epoch": 0.16555555555555557, "grad_norm": 1.080653190612793, "learning_rate": 0.000167260579064588, "loss": 1.5255, "step": 745 }, { "epoch": 0.16577777777777777, "grad_norm": 1.300510048866272, "learning_rate": 0.0001672160356347439, "loss": 1.8507, "step": 746 }, { "epoch": 0.166, "grad_norm": 0.9009153842926025, "learning_rate": 0.00016717149220489978, "loss": 0.5545, "step": 747 }, { "epoch": 0.16622222222222222, "grad_norm": 0.20857150852680206, "learning_rate": 0.00016712694877505567, "loss": 0.0369, "step": 748 }, { "epoch": 0.16644444444444445, "grad_norm": 1.3490118980407715, "learning_rate": 0.0001670824053452116, "loss": 1.6188, "step": 749 }, { "epoch": 0.16666666666666666, "grad_norm": 1.054196834564209, "learning_rate": 0.0001670378619153675, "loss": 1.438, "step": 750 }, { "epoch": 0.1668888888888889, "grad_norm": 0.49820706248283386, "learning_rate": 0.0001669933184855234, "loss": 1.0425, "step": 751 }, { "epoch": 0.1671111111111111, "grad_norm": 0.10511091351509094, "learning_rate": 0.0001669487750556793, "loss": 0.0161, "step": 752 }, { "epoch": 0.16733333333333333, "grad_norm": 0.5921303033828735, "learning_rate": 0.00016690423162583518, "loss": 1.0911, "step": 753 }, { "epoch": 0.16755555555555557, "grad_norm": 0.09364970773458481, "learning_rate": 0.0001668596881959911, "loss": 0.0152, "step": 754 }, { "epoch": 0.16777777777777778, "grad_norm": 0.6593067646026611, "learning_rate": 0.000166815144766147, "loss": 1.1768, "step": 755 }, { "epoch": 0.168, "grad_norm": 0.5098649859428406, "learning_rate": 0.00016677060133630291, "loss": 0.9257, "step": 756 }, { "epoch": 0.16822222222222222, "grad_norm": 0.7384516000747681, "learning_rate": 0.0001667260579064588, "loss": 0.0302, "step": 757 }, { "epoch": 0.16844444444444445, "grad_norm": 0.11098629981279373, "learning_rate": 0.0001666815144766147, "loss": 0.0209, "step": 758 }, { "epoch": 0.16866666666666666, "grad_norm": 0.5349358916282654, "learning_rate": 0.00016663697104677062, "loss": 0.9604, "step": 759 }, { "epoch": 0.1688888888888889, "grad_norm": 0.8677853941917419, "learning_rate": 0.0001665924276169265, "loss": 2.01, "step": 760 }, { "epoch": 0.1691111111111111, "grad_norm": 0.8876023292541504, "learning_rate": 0.0001665478841870824, "loss": 2.1739, "step": 761 }, { "epoch": 0.16933333333333334, "grad_norm": 0.7748745679855347, "learning_rate": 0.00016650334075723832, "loss": 2.0633, "step": 762 }, { "epoch": 0.16955555555555554, "grad_norm": 0.8775037527084351, "learning_rate": 0.0001664587973273942, "loss": 2.0544, "step": 763 }, { "epoch": 0.16977777777777778, "grad_norm": 0.8547599911689758, "learning_rate": 0.00016641425389755013, "loss": 1.6622, "step": 764 }, { "epoch": 0.17, "grad_norm": 1.0196506977081299, "learning_rate": 0.00016636971046770602, "loss": 2.2227, "step": 765 }, { "epoch": 0.17022222222222222, "grad_norm": 0.9069720506668091, "learning_rate": 0.0001663251670378619, "loss": 2.2042, "step": 766 }, { "epoch": 0.17044444444444445, "grad_norm": 0.8413894772529602, "learning_rate": 0.0001662806236080178, "loss": 2.0638, "step": 767 }, { "epoch": 0.17066666666666666, "grad_norm": 0.5857881903648376, "learning_rate": 0.00016623608017817372, "loss": 0.8278, "step": 768 }, { "epoch": 0.1708888888888889, "grad_norm": 0.084866464138031, "learning_rate": 0.00016619153674832964, "loss": 0.0179, "step": 769 }, { "epoch": 0.1711111111111111, "grad_norm": 0.08603781461715698, "learning_rate": 0.00016614699331848553, "loss": 0.0177, "step": 770 }, { "epoch": 0.17133333333333334, "grad_norm": 0.07927680015563965, "learning_rate": 0.00016610244988864143, "loss": 0.0174, "step": 771 }, { "epoch": 0.17155555555555554, "grad_norm": 0.09696035087108612, "learning_rate": 0.00016605790645879732, "loss": 0.0208, "step": 772 }, { "epoch": 0.17177777777777778, "grad_norm": 0.8104578852653503, "learning_rate": 0.00016601336302895324, "loss": 0.89, "step": 773 }, { "epoch": 0.172, "grad_norm": 1.0266926288604736, "learning_rate": 0.00016596881959910916, "loss": 1.8188, "step": 774 }, { "epoch": 0.17222222222222222, "grad_norm": 1.1254407167434692, "learning_rate": 0.00016592427616926505, "loss": 2.1854, "step": 775 }, { "epoch": 0.17244444444444446, "grad_norm": 1.0652568340301514, "learning_rate": 0.00016587973273942094, "loss": 1.8448, "step": 776 }, { "epoch": 0.17266666666666666, "grad_norm": 0.9953740239143372, "learning_rate": 0.00016583518930957683, "loss": 1.7428, "step": 777 }, { "epoch": 0.1728888888888889, "grad_norm": 0.9609451293945312, "learning_rate": 0.00016579064587973275, "loss": 1.7552, "step": 778 }, { "epoch": 0.1731111111111111, "grad_norm": 1.3759422302246094, "learning_rate": 0.00016574610244988864, "loss": 1.0064, "step": 779 }, { "epoch": 0.17333333333333334, "grad_norm": 0.103799007833004, "learning_rate": 0.00016570155902004456, "loss": 0.0194, "step": 780 }, { "epoch": 0.17355555555555555, "grad_norm": 0.6549257040023804, "learning_rate": 0.00016565701559020045, "loss": 0.7385, "step": 781 }, { "epoch": 0.17377777777777778, "grad_norm": 0.8397656679153442, "learning_rate": 0.00016561247216035634, "loss": 0.0268, "step": 782 }, { "epoch": 0.174, "grad_norm": 0.07175108045339584, "learning_rate": 0.00016556792873051226, "loss": 0.0176, "step": 783 }, { "epoch": 0.17422222222222222, "grad_norm": 0.07930734753608704, "learning_rate": 0.00016552338530066816, "loss": 0.0173, "step": 784 }, { "epoch": 0.17444444444444446, "grad_norm": 0.6065148115158081, "learning_rate": 0.00016547884187082405, "loss": 0.819, "step": 785 }, { "epoch": 0.17466666666666666, "grad_norm": 0.8312206864356995, "learning_rate": 0.00016543429844097997, "loss": 0.9267, "step": 786 }, { "epoch": 0.1748888888888889, "grad_norm": 0.2022327035665512, "learning_rate": 0.00016538975501113588, "loss": 0.0356, "step": 787 }, { "epoch": 0.1751111111111111, "grad_norm": 0.996240496635437, "learning_rate": 0.00016534521158129178, "loss": 1.6089, "step": 788 }, { "epoch": 0.17533333333333334, "grad_norm": 0.9978050589561462, "learning_rate": 0.00016530066815144767, "loss": 1.7166, "step": 789 }, { "epoch": 0.17555555555555555, "grad_norm": 1.1079049110412598, "learning_rate": 0.00016525612472160356, "loss": 1.5275, "step": 790 }, { "epoch": 0.17577777777777778, "grad_norm": 0.9986259937286377, "learning_rate": 0.00016521158129175945, "loss": 1.4903, "step": 791 }, { "epoch": 0.176, "grad_norm": 0.7572327852249146, "learning_rate": 0.0001651670378619154, "loss": 0.7548, "step": 792 }, { "epoch": 0.17622222222222222, "grad_norm": 0.16977067291736603, "learning_rate": 0.0001651224944320713, "loss": 0.0267, "step": 793 }, { "epoch": 0.17644444444444443, "grad_norm": 0.16834843158721924, "learning_rate": 0.00016507795100222718, "loss": 0.0259, "step": 794 }, { "epoch": 0.17666666666666667, "grad_norm": 0.8039578199386597, "learning_rate": 0.00016503340757238307, "loss": 0.7745, "step": 795 }, { "epoch": 0.1768888888888889, "grad_norm": 1.0637822151184082, "learning_rate": 0.000164988864142539, "loss": 1.2708, "step": 796 }, { "epoch": 0.1771111111111111, "grad_norm": 1.1139543056488037, "learning_rate": 0.00016494432071269488, "loss": 1.1626, "step": 797 }, { "epoch": 0.17733333333333334, "grad_norm": 0.8471236824989319, "learning_rate": 0.0001648997772828508, "loss": 0.5387, "step": 798 }, { "epoch": 0.17755555555555555, "grad_norm": 1.1162381172180176, "learning_rate": 0.0001648552338530067, "loss": 0.9235, "step": 799 }, { "epoch": 0.17777777777777778, "grad_norm": 1.160738229751587, "learning_rate": 0.0001648106904231626, "loss": 0.9536, "step": 800 }, { "epoch": 0.178, "grad_norm": 0.055902622640132904, "learning_rate": 0.0001647661469933185, "loss": 0.011, "step": 801 }, { "epoch": 0.17822222222222223, "grad_norm": 0.6805006861686707, "learning_rate": 0.0001647216035634744, "loss": 1.1912, "step": 802 }, { "epoch": 0.17844444444444443, "grad_norm": 0.5978904962539673, "learning_rate": 0.0001646770601336303, "loss": 1.242, "step": 803 }, { "epoch": 0.17866666666666667, "grad_norm": 1.012514352798462, "learning_rate": 0.0001646325167037862, "loss": 2.2625, "step": 804 }, { "epoch": 0.17888888888888888, "grad_norm": 0.06615343689918518, "learning_rate": 0.0001645879732739421, "loss": 0.0109, "step": 805 }, { "epoch": 0.1791111111111111, "grad_norm": 0.8926467299461365, "learning_rate": 0.00016454342984409802, "loss": 2.1026, "step": 806 }, { "epoch": 0.17933333333333334, "grad_norm": 0.7982742786407471, "learning_rate": 0.0001644988864142539, "loss": 1.7316, "step": 807 }, { "epoch": 0.17955555555555555, "grad_norm": 1.4942423105239868, "learning_rate": 0.0001644543429844098, "loss": 0.039, "step": 808 }, { "epoch": 0.1797777777777778, "grad_norm": 0.26765769720077515, "learning_rate": 0.0001644097995545657, "loss": 0.0198, "step": 809 }, { "epoch": 0.18, "grad_norm": 0.9778783321380615, "learning_rate": 0.0001643652561247216, "loss": 2.0191, "step": 810 }, { "epoch": 0.18022222222222223, "grad_norm": 1.1812803745269775, "learning_rate": 0.00016432071269487753, "loss": 2.7094, "step": 811 }, { "epoch": 0.18044444444444444, "grad_norm": 0.9629987478256226, "learning_rate": 0.00016427616926503342, "loss": 2.0157, "step": 812 }, { "epoch": 0.18066666666666667, "grad_norm": 0.895087718963623, "learning_rate": 0.00016423162583518932, "loss": 2.144, "step": 813 }, { "epoch": 0.18088888888888888, "grad_norm": 0.9080528616905212, "learning_rate": 0.0001641870824053452, "loss": 2.1558, "step": 814 }, { "epoch": 0.1811111111111111, "grad_norm": 1.078696846961975, "learning_rate": 0.00016414253897550113, "loss": 0.0447, "step": 815 }, { "epoch": 0.18133333333333335, "grad_norm": 0.9236059784889221, "learning_rate": 0.00016409799554565702, "loss": 2.0536, "step": 816 }, { "epoch": 0.18155555555555555, "grad_norm": 0.7201200127601624, "learning_rate": 0.00016405345211581294, "loss": 0.8334, "step": 817 }, { "epoch": 0.1817777777777778, "grad_norm": 0.08575476706027985, "learning_rate": 0.00016400890868596883, "loss": 0.0198, "step": 818 }, { "epoch": 0.182, "grad_norm": 1.1828711032867432, "learning_rate": 0.00016396436525612472, "loss": 0.9102, "step": 819 }, { "epoch": 0.18222222222222223, "grad_norm": 0.7130089998245239, "learning_rate": 0.00016391982182628064, "loss": 1.0971, "step": 820 }, { "epoch": 0.18244444444444444, "grad_norm": 0.9672996997833252, "learning_rate": 0.00016387527839643653, "loss": 1.9789, "step": 821 }, { "epoch": 0.18266666666666667, "grad_norm": 1.0078965425491333, "learning_rate": 0.00016383073496659242, "loss": 1.7371, "step": 822 }, { "epoch": 0.18288888888888888, "grad_norm": 1.0774242877960205, "learning_rate": 0.00016378619153674834, "loss": 1.7296, "step": 823 }, { "epoch": 0.1831111111111111, "grad_norm": 0.9907069802284241, "learning_rate": 0.00016374164810690423, "loss": 1.8363, "step": 824 }, { "epoch": 0.18333333333333332, "grad_norm": 0.9560150504112244, "learning_rate": 0.00016369710467706015, "loss": 1.6966, "step": 825 }, { "epoch": 0.18355555555555556, "grad_norm": 0.9652445912361145, "learning_rate": 0.00016365256124721604, "loss": 1.7625, "step": 826 }, { "epoch": 0.1837777777777778, "grad_norm": 0.9431360960006714, "learning_rate": 0.00016360801781737194, "loss": 1.6979, "step": 827 }, { "epoch": 0.184, "grad_norm": 0.9380690455436707, "learning_rate": 0.00016356347438752783, "loss": 1.7109, "step": 828 }, { "epoch": 0.18422222222222223, "grad_norm": 1.3345482349395752, "learning_rate": 0.00016351893095768375, "loss": 0.083, "step": 829 }, { "epoch": 0.18444444444444444, "grad_norm": 0.8567286729812622, "learning_rate": 0.00016347438752783967, "loss": 0.0372, "step": 830 }, { "epoch": 0.18466666666666667, "grad_norm": 0.14056961238384247, "learning_rate": 0.00016342984409799556, "loss": 0.0209, "step": 831 }, { "epoch": 0.18488888888888888, "grad_norm": 0.6902725696563721, "learning_rate": 0.00016338530066815145, "loss": 0.8186, "step": 832 }, { "epoch": 0.18511111111111112, "grad_norm": 0.8135867118835449, "learning_rate": 0.00016334075723830734, "loss": 0.8957, "step": 833 }, { "epoch": 0.18533333333333332, "grad_norm": 0.1166531890630722, "learning_rate": 0.00016329621380846326, "loss": 0.0205, "step": 834 }, { "epoch": 0.18555555555555556, "grad_norm": 0.09129589051008224, "learning_rate": 0.00016325167037861918, "loss": 0.0199, "step": 835 }, { "epoch": 0.18577777777777776, "grad_norm": 0.9971749782562256, "learning_rate": 0.00016320712694877507, "loss": 1.7846, "step": 836 }, { "epoch": 0.186, "grad_norm": 1.054129719734192, "learning_rate": 0.00016316258351893096, "loss": 1.4707, "step": 837 }, { "epoch": 0.18622222222222223, "grad_norm": 1.067037582397461, "learning_rate": 0.00016311804008908685, "loss": 1.8865, "step": 838 }, { "epoch": 0.18644444444444444, "grad_norm": 1.2100924253463745, "learning_rate": 0.00016307349665924277, "loss": 1.8347, "step": 839 }, { "epoch": 0.18666666666666668, "grad_norm": 1.0970559120178223, "learning_rate": 0.00016302895322939867, "loss": 1.7943, "step": 840 }, { "epoch": 0.18688888888888888, "grad_norm": 1.0739829540252686, "learning_rate": 0.00016298440979955458, "loss": 1.6663, "step": 841 }, { "epoch": 0.18711111111111112, "grad_norm": 0.9962330460548401, "learning_rate": 0.00016293986636971048, "loss": 1.6769, "step": 842 }, { "epoch": 0.18733333333333332, "grad_norm": 0.9708060026168823, "learning_rate": 0.00016289532293986637, "loss": 1.4129, "step": 843 }, { "epoch": 0.18755555555555556, "grad_norm": 0.9281109571456909, "learning_rate": 0.0001628507795100223, "loss": 1.276, "step": 844 }, { "epoch": 0.18777777777777777, "grad_norm": 0.7527890801429749, "learning_rate": 0.00016280623608017818, "loss": 0.8151, "step": 845 }, { "epoch": 0.188, "grad_norm": 1.0176119804382324, "learning_rate": 0.00016276169265033407, "loss": 1.1774, "step": 846 }, { "epoch": 0.18822222222222224, "grad_norm": 1.2225992679595947, "learning_rate": 0.00016271714922049, "loss": 1.2259, "step": 847 }, { "epoch": 0.18844444444444444, "grad_norm": 0.7326065897941589, "learning_rate": 0.00016267260579064588, "loss": 0.5637, "step": 848 }, { "epoch": 0.18866666666666668, "grad_norm": 0.3323253095149994, "learning_rate": 0.0001626280623608018, "loss": 0.05, "step": 849 }, { "epoch": 0.18888888888888888, "grad_norm": 1.0131728649139404, "learning_rate": 0.0001625835189309577, "loss": 0.8704, "step": 850 }, { "epoch": 0.18911111111111112, "grad_norm": 0.9873238205909729, "learning_rate": 0.00016253897550111358, "loss": 2.2498, "step": 851 }, { "epoch": 0.18933333333333333, "grad_norm": 0.8620486855506897, "learning_rate": 0.00016249443207126948, "loss": 2.1671, "step": 852 }, { "epoch": 0.18955555555555556, "grad_norm": 0.5899505019187927, "learning_rate": 0.0001624498886414254, "loss": 0.9804, "step": 853 }, { "epoch": 0.18977777777777777, "grad_norm": 0.5602113604545593, "learning_rate": 0.0001624053452115813, "loss": 1.1528, "step": 854 }, { "epoch": 0.19, "grad_norm": 0.9018243551254272, "learning_rate": 0.0001623608017817372, "loss": 2.1603, "step": 855 }, { "epoch": 0.1902222222222222, "grad_norm": 1.0384292602539062, "learning_rate": 0.0001623162583518931, "loss": 0.0507, "step": 856 }, { "epoch": 0.19044444444444444, "grad_norm": 0.16755282878875732, "learning_rate": 0.000162271714922049, "loss": 0.0188, "step": 857 }, { "epoch": 0.19066666666666668, "grad_norm": 0.5630192756652832, "learning_rate": 0.0001622271714922049, "loss": 1.1861, "step": 858 }, { "epoch": 0.19088888888888889, "grad_norm": 0.8935820460319519, "learning_rate": 0.00016218262806236083, "loss": 2.1745, "step": 859 }, { "epoch": 0.19111111111111112, "grad_norm": 0.8242104649543762, "learning_rate": 0.00016213808463251672, "loss": 1.8877, "step": 860 }, { "epoch": 0.19133333333333333, "grad_norm": 0.8368860483169556, "learning_rate": 0.0001620935412026726, "loss": 2.219, "step": 861 }, { "epoch": 0.19155555555555556, "grad_norm": 0.9018285870552063, "learning_rate": 0.0001620489977728285, "loss": 2.1182, "step": 862 }, { "epoch": 0.19177777777777777, "grad_norm": 0.8542325496673584, "learning_rate": 0.00016200445434298442, "loss": 1.9889, "step": 863 }, { "epoch": 0.192, "grad_norm": 1.7638332843780518, "learning_rate": 0.0001619599109131403, "loss": 0.1212, "step": 864 }, { "epoch": 0.1922222222222222, "grad_norm": 0.8848968148231506, "learning_rate": 0.00016191536748329623, "loss": 1.8857, "step": 865 }, { "epoch": 0.19244444444444445, "grad_norm": 0.8312684893608093, "learning_rate": 0.00016187082405345212, "loss": 2.09, "step": 866 }, { "epoch": 0.19266666666666668, "grad_norm": 0.8967249989509583, "learning_rate": 0.00016182628062360802, "loss": 1.8655, "step": 867 }, { "epoch": 0.1928888888888889, "grad_norm": 1.0011157989501953, "learning_rate": 0.00016178173719376393, "loss": 2.1108, "step": 868 }, { "epoch": 0.19311111111111112, "grad_norm": 0.8654418587684631, "learning_rate": 0.00016173719376391983, "loss": 1.8368, "step": 869 }, { "epoch": 0.19333333333333333, "grad_norm": 0.6689291596412659, "learning_rate": 0.00016169265033407572, "loss": 0.9421, "step": 870 }, { "epoch": 0.19355555555555556, "grad_norm": 0.19246675074100494, "learning_rate": 0.00016164810690423164, "loss": 0.0235, "step": 871 }, { "epoch": 0.19377777777777777, "grad_norm": 0.17772454023361206, "learning_rate": 0.00016160356347438756, "loss": 0.0224, "step": 872 }, { "epoch": 0.194, "grad_norm": 0.145878866314888, "learning_rate": 0.00016155902004454345, "loss": 0.0196, "step": 873 }, { "epoch": 0.1942222222222222, "grad_norm": 0.6575565338134766, "learning_rate": 0.00016151447661469934, "loss": 0.9732, "step": 874 }, { "epoch": 0.19444444444444445, "grad_norm": 0.949036180973053, "learning_rate": 0.00016146993318485523, "loss": 0.8626, "step": 875 }, { "epoch": 0.19466666666666665, "grad_norm": 0.9357439279556274, "learning_rate": 0.00016142538975501112, "loss": 1.8252, "step": 876 }, { "epoch": 0.1948888888888889, "grad_norm": 1.0359864234924316, "learning_rate": 0.00016138084632516707, "loss": 1.9521, "step": 877 }, { "epoch": 0.19511111111111112, "grad_norm": 0.9912081360816956, "learning_rate": 0.00016133630289532296, "loss": 1.7392, "step": 878 }, { "epoch": 0.19533333333333333, "grad_norm": 0.839015543460846, "learning_rate": 0.00016129175946547885, "loss": 0.9531, "step": 879 }, { "epoch": 0.19555555555555557, "grad_norm": 0.40147385001182556, "learning_rate": 0.00016124721603563474, "loss": 0.0272, "step": 880 }, { "epoch": 0.19577777777777777, "grad_norm": 0.09291915595531464, "learning_rate": 0.00016120267260579066, "loss": 0.0201, "step": 881 }, { "epoch": 0.196, "grad_norm": 0.7130599021911621, "learning_rate": 0.00016115812917594656, "loss": 0.9825, "step": 882 }, { "epoch": 0.19622222222222221, "grad_norm": 0.6923748254776001, "learning_rate": 0.00016111358574610245, "loss": 0.6706, "step": 883 }, { "epoch": 0.19644444444444445, "grad_norm": 0.09329139441251755, "learning_rate": 0.00016106904231625837, "loss": 0.0186, "step": 884 }, { "epoch": 0.19666666666666666, "grad_norm": 0.6442102789878845, "learning_rate": 0.00016102449888641426, "loss": 0.6065, "step": 885 }, { "epoch": 0.1968888888888889, "grad_norm": 1.2954368591308594, "learning_rate": 0.00016097995545657018, "loss": 1.7283, "step": 886 }, { "epoch": 0.1971111111111111, "grad_norm": 0.8830384612083435, "learning_rate": 0.00016093541202672607, "loss": 0.8123, "step": 887 }, { "epoch": 0.19733333333333333, "grad_norm": 1.0833762884140015, "learning_rate": 0.00016089086859688196, "loss": 1.6151, "step": 888 }, { "epoch": 0.19755555555555557, "grad_norm": 1.110963225364685, "learning_rate": 0.00016084632516703785, "loss": 1.5561, "step": 889 }, { "epoch": 0.19777777777777777, "grad_norm": 0.9899112582206726, "learning_rate": 0.00016080178173719377, "loss": 1.2833, "step": 890 }, { "epoch": 0.198, "grad_norm": 1.1277340650558472, "learning_rate": 0.0001607572383073497, "loss": 1.8641, "step": 891 }, { "epoch": 0.19822222222222222, "grad_norm": 1.1691646575927734, "learning_rate": 0.00016071269487750558, "loss": 1.3711, "step": 892 }, { "epoch": 0.19844444444444445, "grad_norm": 1.13434636592865, "learning_rate": 0.00016066815144766147, "loss": 1.6223, "step": 893 }, { "epoch": 0.19866666666666666, "grad_norm": 0.7454355955123901, "learning_rate": 0.00016062360801781737, "loss": 0.6587, "step": 894 }, { "epoch": 0.1988888888888889, "grad_norm": 1.034938931465149, "learning_rate": 0.00016057906458797328, "loss": 1.203, "step": 895 }, { "epoch": 0.1991111111111111, "grad_norm": 1.118506908416748, "learning_rate": 0.0001605345211581292, "loss": 1.1357, "step": 896 }, { "epoch": 0.19933333333333333, "grad_norm": 0.6530042290687561, "learning_rate": 0.0001604899777282851, "loss": 0.5032, "step": 897 }, { "epoch": 0.19955555555555557, "grad_norm": 0.6382874846458435, "learning_rate": 0.000160445434298441, "loss": 0.475, "step": 898 }, { "epoch": 0.19977777777777778, "grad_norm": 1.1674174070358276, "learning_rate": 0.00016040089086859688, "loss": 0.9756, "step": 899 }, { "epoch": 0.2, "grad_norm": 1.1725280284881592, "learning_rate": 0.0001603563474387528, "loss": 0.7818, "step": 900 }, { "epoch": 0.2, "eval_loss": 1.222800612449646, "eval_runtime": 242.0579, "eval_samples_per_second": 4.131, "eval_steps_per_second": 4.131, "step": 900 }, { "epoch": 0.20022222222222222, "grad_norm": 0.5205540060997009, "learning_rate": 0.0001603118040089087, "loss": 0.9559, "step": 901 }, { "epoch": 0.20044444444444445, "grad_norm": 0.08372654020786285, "learning_rate": 0.0001602672605790646, "loss": 0.0132, "step": 902 }, { "epoch": 0.20066666666666666, "grad_norm": 0.07322391122579575, "learning_rate": 0.0001602227171492205, "loss": 0.0129, "step": 903 }, { "epoch": 0.2008888888888889, "grad_norm": 0.07906320691108704, "learning_rate": 0.0001601781737193764, "loss": 0.0128, "step": 904 }, { "epoch": 0.2011111111111111, "grad_norm": 0.546818196773529, "learning_rate": 0.0001601336302895323, "loss": 1.0331, "step": 905 }, { "epoch": 0.20133333333333334, "grad_norm": 0.8553550243377686, "learning_rate": 0.0001600890868596882, "loss": 1.9287, "step": 906 }, { "epoch": 0.20155555555555554, "grad_norm": 0.1751287281513214, "learning_rate": 0.0001600445434298441, "loss": 0.017, "step": 907 }, { "epoch": 0.20177777777777778, "grad_norm": 0.6139826774597168, "learning_rate": 0.00016, "loss": 1.0172, "step": 908 }, { "epoch": 0.202, "grad_norm": 0.8161941766738892, "learning_rate": 0.0001599554565701559, "loss": 2.0837, "step": 909 }, { "epoch": 0.20222222222222222, "grad_norm": 0.9680670499801636, "learning_rate": 0.00015991091314031182, "loss": 2.0895, "step": 910 }, { "epoch": 0.20244444444444445, "grad_norm": 1.0817011594772339, "learning_rate": 0.00015986636971046772, "loss": 2.2301, "step": 911 }, { "epoch": 0.20266666666666666, "grad_norm": 0.9138790369033813, "learning_rate": 0.0001598218262806236, "loss": 1.9302, "step": 912 }, { "epoch": 0.2028888888888889, "grad_norm": 0.8514297604560852, "learning_rate": 0.0001597772828507795, "loss": 0.9703, "step": 913 }, { "epoch": 0.2031111111111111, "grad_norm": 0.9511028528213501, "learning_rate": 0.00015973273942093542, "loss": 2.1424, "step": 914 }, { "epoch": 0.20333333333333334, "grad_norm": 0.8551965355873108, "learning_rate": 0.00015968819599109134, "loss": 1.825, "step": 915 }, { "epoch": 0.20355555555555555, "grad_norm": 0.8839070200920105, "learning_rate": 0.00015964365256124723, "loss": 2.013, "step": 916 }, { "epoch": 0.20377777777777778, "grad_norm": 0.9161024689674377, "learning_rate": 0.00015959910913140312, "loss": 1.9285, "step": 917 }, { "epoch": 0.204, "grad_norm": 0.8351966738700867, "learning_rate": 0.000159554565701559, "loss": 1.7185, "step": 918 }, { "epoch": 0.20422222222222222, "grad_norm": 0.9648137092590332, "learning_rate": 0.00015951002227171493, "loss": 2.0408, "step": 919 }, { "epoch": 0.20444444444444446, "grad_norm": 1.0050605535507202, "learning_rate": 0.00015946547884187085, "loss": 1.9834, "step": 920 }, { "epoch": 0.20466666666666666, "grad_norm": 0.5988847613334656, "learning_rate": 0.00015942093541202674, "loss": 0.8184, "step": 921 }, { "epoch": 0.2048888888888889, "grad_norm": 0.39480581879615784, "learning_rate": 0.00015937639198218263, "loss": 0.0215, "step": 922 }, { "epoch": 0.2051111111111111, "grad_norm": 0.08290968835353851, "learning_rate": 0.00015933184855233853, "loss": 0.0178, "step": 923 }, { "epoch": 0.20533333333333334, "grad_norm": 0.0785195529460907, "learning_rate": 0.00015928730512249444, "loss": 0.0172, "step": 924 }, { "epoch": 0.20555555555555555, "grad_norm": 0.901236891746521, "learning_rate": 0.00015924276169265034, "loss": 1.7259, "step": 925 }, { "epoch": 0.20577777777777778, "grad_norm": 1.0050063133239746, "learning_rate": 0.00015919821826280626, "loss": 1.8985, "step": 926 }, { "epoch": 0.206, "grad_norm": 1.1211670637130737, "learning_rate": 0.00015915367483296215, "loss": 2.0145, "step": 927 }, { "epoch": 0.20622222222222222, "grad_norm": 0.9888872504234314, "learning_rate": 0.00015910913140311804, "loss": 1.8903, "step": 928 }, { "epoch": 0.20644444444444446, "grad_norm": 1.1626695394515991, "learning_rate": 0.00015906458797327396, "loss": 1.8271, "step": 929 }, { "epoch": 0.20666666666666667, "grad_norm": 1.0316535234451294, "learning_rate": 0.00015902004454342985, "loss": 1.6792, "step": 930 }, { "epoch": 0.2068888888888889, "grad_norm": 0.9496293663978577, "learning_rate": 0.00015897550111358574, "loss": 1.825, "step": 931 }, { "epoch": 0.2071111111111111, "grad_norm": 0.29808974266052246, "learning_rate": 0.00015893095768374166, "loss": 0.0233, "step": 932 }, { "epoch": 0.20733333333333334, "grad_norm": 0.11366426944732666, "learning_rate": 0.00015888641425389755, "loss": 0.0209, "step": 933 }, { "epoch": 0.20755555555555555, "grad_norm": 1.003063678741455, "learning_rate": 0.00015884187082405347, "loss": 1.7181, "step": 934 }, { "epoch": 0.20777777777777778, "grad_norm": 1.1148592233657837, "learning_rate": 0.00015879732739420936, "loss": 2.0231, "step": 935 }, { "epoch": 0.208, "grad_norm": 0.08369893580675125, "learning_rate": 0.00015875278396436525, "loss": 0.0197, "step": 936 }, { "epoch": 0.20822222222222223, "grad_norm": 0.6577330827713013, "learning_rate": 0.00015870824053452115, "loss": 0.7512, "step": 937 }, { "epoch": 0.20844444444444443, "grad_norm": 0.9851068258285522, "learning_rate": 0.00015866369710467707, "loss": 0.724, "step": 938 }, { "epoch": 0.20866666666666667, "grad_norm": 0.9157700538635254, "learning_rate": 0.00015861915367483298, "loss": 1.6358, "step": 939 }, { "epoch": 0.2088888888888889, "grad_norm": 1.2965058088302612, "learning_rate": 0.00015857461024498888, "loss": 1.4643, "step": 940 }, { "epoch": 0.2091111111111111, "grad_norm": 1.1065465211868286, "learning_rate": 0.00015853006681514477, "loss": 1.4903, "step": 941 }, { "epoch": 0.20933333333333334, "grad_norm": 1.0369428396224976, "learning_rate": 0.00015848552338530066, "loss": 1.4679, "step": 942 }, { "epoch": 0.20955555555555555, "grad_norm": 1.2130650281906128, "learning_rate": 0.00015844097995545658, "loss": 1.4029, "step": 943 }, { "epoch": 0.20977777777777779, "grad_norm": 0.7914889454841614, "learning_rate": 0.0001583964365256125, "loss": 0.7555, "step": 944 }, { "epoch": 0.21, "grad_norm": 0.36863523721694946, "learning_rate": 0.0001583518930957684, "loss": 0.0382, "step": 945 }, { "epoch": 0.21022222222222223, "grad_norm": 0.6304244995117188, "learning_rate": 0.00015830734966592428, "loss": 0.6299, "step": 946 }, { "epoch": 0.21044444444444443, "grad_norm": 1.0273418426513672, "learning_rate": 0.00015826280623608017, "loss": 1.5365, "step": 947 }, { "epoch": 0.21066666666666667, "grad_norm": 0.9680097103118896, "learning_rate": 0.0001582182628062361, "loss": 1.1998, "step": 948 }, { "epoch": 0.21088888888888888, "grad_norm": 0.338010311126709, "learning_rate": 0.00015817371937639198, "loss": 0.0447, "step": 949 }, { "epoch": 0.2111111111111111, "grad_norm": 1.1934006214141846, "learning_rate": 0.0001581291759465479, "loss": 1.1876, "step": 950 }, { "epoch": 0.21133333333333335, "grad_norm": 0.6406352519989014, "learning_rate": 0.0001580846325167038, "loss": 1.2899, "step": 951 }, { "epoch": 0.21155555555555555, "grad_norm": 0.06388068944215775, "learning_rate": 0.00015804008908685969, "loss": 0.0118, "step": 952 }, { "epoch": 0.2117777777777778, "grad_norm": 0.5354200005531311, "learning_rate": 0.0001579955456570156, "loss": 1.1527, "step": 953 }, { "epoch": 0.212, "grad_norm": 0.7551606893539429, "learning_rate": 0.0001579510022271715, "loss": 2.0341, "step": 954 }, { "epoch": 0.21222222222222223, "grad_norm": 0.5737783908843994, "learning_rate": 0.0001579064587973274, "loss": 1.0804, "step": 955 }, { "epoch": 0.21244444444444444, "grad_norm": 0.8123324513435364, "learning_rate": 0.00015786191536748328, "loss": 2.1932, "step": 956 }, { "epoch": 0.21266666666666667, "grad_norm": 0.6195405721664429, "learning_rate": 0.00015781737193763923, "loss": 1.1485, "step": 957 }, { "epoch": 0.21288888888888888, "grad_norm": 0.08890359103679657, "learning_rate": 0.00015777282850779512, "loss": 0.0144, "step": 958 }, { "epoch": 0.2131111111111111, "grad_norm": 0.5360710024833679, "learning_rate": 0.000157728285077951, "loss": 1.2024, "step": 959 }, { "epoch": 0.21333333333333335, "grad_norm": 0.8111135959625244, "learning_rate": 0.0001576837416481069, "loss": 2.0705, "step": 960 }, { "epoch": 0.21355555555555555, "grad_norm": 0.8970634937286377, "learning_rate": 0.0001576391982182628, "loss": 1.9576, "step": 961 }, { "epoch": 0.2137777777777778, "grad_norm": 0.9263898730278015, "learning_rate": 0.0001575946547884187, "loss": 2.3869, "step": 962 }, { "epoch": 0.214, "grad_norm": 0.9336531758308411, "learning_rate": 0.00015755011135857463, "loss": 2.125, "step": 963 }, { "epoch": 0.21422222222222223, "grad_norm": 0.7852495908737183, "learning_rate": 0.00015750556792873052, "loss": 0.9513, "step": 964 }, { "epoch": 0.21444444444444444, "grad_norm": 0.17854809761047363, "learning_rate": 0.00015746102449888642, "loss": 0.0245, "step": 965 }, { "epoch": 0.21466666666666667, "grad_norm": 0.9751142263412476, "learning_rate": 0.00015741648106904233, "loss": 1.6707, "step": 966 }, { "epoch": 0.21488888888888888, "grad_norm": 0.8332890272140503, "learning_rate": 0.00015737193763919823, "loss": 1.9127, "step": 967 }, { "epoch": 0.21511111111111111, "grad_norm": 0.9040923118591309, "learning_rate": 0.00015732739420935412, "loss": 2.0763, "step": 968 }, { "epoch": 0.21533333333333332, "grad_norm": 1.0410791635513306, "learning_rate": 0.00015728285077951004, "loss": 2.3181, "step": 969 }, { "epoch": 0.21555555555555556, "grad_norm": 0.9526614546775818, "learning_rate": 0.00015723830734966593, "loss": 1.8509, "step": 970 }, { "epoch": 0.2157777777777778, "grad_norm": 0.07503191381692886, "learning_rate": 0.00015719376391982185, "loss": 0.0165, "step": 971 }, { "epoch": 0.216, "grad_norm": 0.07039511948823929, "learning_rate": 0.00015714922048997774, "loss": 0.0168, "step": 972 }, { "epoch": 0.21622222222222223, "grad_norm": 0.6502916216850281, "learning_rate": 0.00015710467706013363, "loss": 0.9638, "step": 973 }, { "epoch": 0.21644444444444444, "grad_norm": 0.8625080585479736, "learning_rate": 0.00015706013363028952, "loss": 1.9742, "step": 974 }, { "epoch": 0.21666666666666667, "grad_norm": 0.9066895842552185, "learning_rate": 0.00015701559020044544, "loss": 1.6882, "step": 975 }, { "epoch": 0.21688888888888888, "grad_norm": 0.8513925075531006, "learning_rate": 0.00015697104677060136, "loss": 1.7857, "step": 976 }, { "epoch": 0.21711111111111112, "grad_norm": 0.9731569886207581, "learning_rate": 0.00015692650334075725, "loss": 1.6924, "step": 977 }, { "epoch": 0.21733333333333332, "grad_norm": 1.0422254800796509, "learning_rate": 0.00015688195991091314, "loss": 2.0664, "step": 978 }, { "epoch": 0.21755555555555556, "grad_norm": 0.9497055411338806, "learning_rate": 0.00015683741648106904, "loss": 1.7705, "step": 979 }, { "epoch": 0.21777777777777776, "grad_norm": 1.1931400299072266, "learning_rate": 0.00015679287305122495, "loss": 1.861, "step": 980 }, { "epoch": 0.218, "grad_norm": 0.40133345127105713, "learning_rate": 0.00015674832962138087, "loss": 0.0212, "step": 981 }, { "epoch": 0.21822222222222223, "grad_norm": 0.0913856029510498, "learning_rate": 0.00015670378619153677, "loss": 0.0189, "step": 982 }, { "epoch": 0.21844444444444444, "grad_norm": 0.08609500527381897, "learning_rate": 0.00015665924276169266, "loss": 0.0203, "step": 983 }, { "epoch": 0.21866666666666668, "grad_norm": 0.11373770982027054, "learning_rate": 0.00015661469933184855, "loss": 0.0218, "step": 984 }, { "epoch": 0.21888888888888888, "grad_norm": 0.09455129504203796, "learning_rate": 0.00015657015590200447, "loss": 0.0197, "step": 985 }, { "epoch": 0.21911111111111112, "grad_norm": 0.11562435328960419, "learning_rate": 0.00015652561247216036, "loss": 0.0198, "step": 986 }, { "epoch": 0.21933333333333332, "grad_norm": 0.07547328621149063, "learning_rate": 0.00015648106904231628, "loss": 0.0189, "step": 987 }, { "epoch": 0.21955555555555556, "grad_norm": 1.1698229312896729, "learning_rate": 0.00015643652561247217, "loss": 1.5906, "step": 988 }, { "epoch": 0.21977777777777777, "grad_norm": 0.8595744371414185, "learning_rate": 0.00015639198218262806, "loss": 0.7687, "step": 989 }, { "epoch": 0.22, "grad_norm": 1.0414891242980957, "learning_rate": 0.00015634743875278398, "loss": 1.5007, "step": 990 }, { "epoch": 0.22022222222222224, "grad_norm": 1.2998589277267456, "learning_rate": 0.00015630289532293987, "loss": 1.5281, "step": 991 }, { "epoch": 0.22044444444444444, "grad_norm": 1.1212791204452515, "learning_rate": 0.00015625835189309577, "loss": 1.6504, "step": 992 }, { "epoch": 0.22066666666666668, "grad_norm": 1.2405270338058472, "learning_rate": 0.00015621380846325168, "loss": 1.4231, "step": 993 }, { "epoch": 0.22088888888888888, "grad_norm": 0.7674121856689453, "learning_rate": 0.00015616926503340758, "loss": 0.709, "step": 994 }, { "epoch": 0.22111111111111112, "grad_norm": 0.19164682924747467, "learning_rate": 0.0001561247216035635, "loss": 0.0275, "step": 995 }, { "epoch": 0.22133333333333333, "grad_norm": 1.1732245683670044, "learning_rate": 0.0001560801781737194, "loss": 1.419, "step": 996 }, { "epoch": 0.22155555555555556, "grad_norm": 1.1951122283935547, "learning_rate": 0.00015603563474387528, "loss": 1.0835, "step": 997 }, { "epoch": 0.22177777777777777, "grad_norm": 0.1803571879863739, "learning_rate": 0.00015599109131403117, "loss": 0.0346, "step": 998 }, { "epoch": 0.222, "grad_norm": 0.9627526998519897, "learning_rate": 0.0001559465478841871, "loss": 1.0206, "step": 999 }, { "epoch": 0.2222222222222222, "grad_norm": 1.1441150903701782, "learning_rate": 0.000155902004454343, "loss": 0.8732, "step": 1000 }, { "epoch": 0.22244444444444444, "grad_norm": 0.6593534350395203, "learning_rate": 0.0001558574610244989, "loss": 1.1676, "step": 1001 }, { "epoch": 0.22266666666666668, "grad_norm": 0.664578914642334, "learning_rate": 0.0001558129175946548, "loss": 1.1948, "step": 1002 }, { "epoch": 0.2228888888888889, "grad_norm": 0.0607873909175396, "learning_rate": 0.00015576837416481068, "loss": 0.0118, "step": 1003 }, { "epoch": 0.22311111111111112, "grad_norm": 1.1000303030014038, "learning_rate": 0.0001557238307349666, "loss": 2.4529, "step": 1004 }, { "epoch": 0.22333333333333333, "grad_norm": 0.6257951259613037, "learning_rate": 0.00015567928730512252, "loss": 1.1959, "step": 1005 }, { "epoch": 0.22355555555555556, "grad_norm": 0.540494978427887, "learning_rate": 0.0001556347438752784, "loss": 1.1178, "step": 1006 }, { "epoch": 0.22377777777777777, "grad_norm": 0.8437963128089905, "learning_rate": 0.0001555902004454343, "loss": 2.0957, "step": 1007 }, { "epoch": 0.224, "grad_norm": 0.8972681760787964, "learning_rate": 0.0001555456570155902, "loss": 2.3074, "step": 1008 }, { "epoch": 0.2242222222222222, "grad_norm": 0.837619960308075, "learning_rate": 0.00015550111358574612, "loss": 1.8041, "step": 1009 }, { "epoch": 0.22444444444444445, "grad_norm": 0.6094018816947937, "learning_rate": 0.000155456570155902, "loss": 1.1169, "step": 1010 }, { "epoch": 0.22466666666666665, "grad_norm": 1.000135064125061, "learning_rate": 0.00015541202672605793, "loss": 2.1065, "step": 1011 }, { "epoch": 0.2248888888888889, "grad_norm": 0.8690905570983887, "learning_rate": 0.00015536748329621382, "loss": 2.1307, "step": 1012 }, { "epoch": 0.22511111111111112, "grad_norm": 0.7830039858818054, "learning_rate": 0.0001553229398663697, "loss": 2.0187, "step": 1013 }, { "epoch": 0.22533333333333333, "grad_norm": 0.9192244410514832, "learning_rate": 0.00015527839643652563, "loss": 2.1958, "step": 1014 }, { "epoch": 0.22555555555555556, "grad_norm": 1.086327314376831, "learning_rate": 0.00015523385300668152, "loss": 2.2896, "step": 1015 }, { "epoch": 0.22577777777777777, "grad_norm": 0.6661816239356995, "learning_rate": 0.0001551893095768374, "loss": 1.0338, "step": 1016 }, { "epoch": 0.226, "grad_norm": 0.1229943260550499, "learning_rate": 0.00015514476614699333, "loss": 0.019, "step": 1017 }, { "epoch": 0.2262222222222222, "grad_norm": 0.6619732975959778, "learning_rate": 0.00015510022271714922, "loss": 0.8572, "step": 1018 }, { "epoch": 0.22644444444444445, "grad_norm": 0.9510409832000732, "learning_rate": 0.00015505567928730514, "loss": 1.8705, "step": 1019 }, { "epoch": 0.22666666666666666, "grad_norm": 0.9156755805015564, "learning_rate": 0.00015501113585746103, "loss": 1.7297, "step": 1020 }, { "epoch": 0.2268888888888889, "grad_norm": 0.8265693187713623, "learning_rate": 0.00015496659242761693, "loss": 1.8998, "step": 1021 }, { "epoch": 0.22711111111111112, "grad_norm": 0.9348383545875549, "learning_rate": 0.00015492204899777282, "loss": 1.7829, "step": 1022 }, { "epoch": 0.22733333333333333, "grad_norm": 0.07994972914457321, "learning_rate": 0.00015487750556792874, "loss": 0.0171, "step": 1023 }, { "epoch": 0.22755555555555557, "grad_norm": 0.9954615831375122, "learning_rate": 0.00015483296213808466, "loss": 2.0246, "step": 1024 }, { "epoch": 0.22777777777777777, "grad_norm": 0.811485230922699, "learning_rate": 0.00015478841870824055, "loss": 0.9906, "step": 1025 }, { "epoch": 0.228, "grad_norm": 0.9506434202194214, "learning_rate": 0.00015474387527839644, "loss": 1.8778, "step": 1026 }, { "epoch": 0.22822222222222222, "grad_norm": 1.0921201705932617, "learning_rate": 0.00015469933184855233, "loss": 1.7785, "step": 1027 }, { "epoch": 0.22844444444444445, "grad_norm": 1.0791798830032349, "learning_rate": 0.00015465478841870825, "loss": 1.7389, "step": 1028 }, { "epoch": 0.22866666666666666, "grad_norm": 0.8696101307868958, "learning_rate": 0.00015461024498886414, "loss": 1.6503, "step": 1029 }, { "epoch": 0.2288888888888889, "grad_norm": 1.0953142642974854, "learning_rate": 0.00015456570155902006, "loss": 1.5829, "step": 1030 }, { "epoch": 0.2291111111111111, "grad_norm": 1.0775700807571411, "learning_rate": 0.00015452115812917595, "loss": 1.7792, "step": 1031 }, { "epoch": 0.22933333333333333, "grad_norm": 1.0957459211349487, "learning_rate": 0.00015447661469933184, "loss": 1.6229, "step": 1032 }, { "epoch": 0.22955555555555557, "grad_norm": 0.9933871030807495, "learning_rate": 0.00015443207126948776, "loss": 1.9686, "step": 1033 }, { "epoch": 0.22977777777777778, "grad_norm": 0.6201068758964539, "learning_rate": 0.00015438752783964365, "loss": 0.9276, "step": 1034 }, { "epoch": 0.23, "grad_norm": 0.08361168205738068, "learning_rate": 0.00015434298440979955, "loss": 0.0183, "step": 1035 }, { "epoch": 0.23022222222222222, "grad_norm": 0.0638362318277359, "learning_rate": 0.00015429844097995547, "loss": 0.0174, "step": 1036 }, { "epoch": 0.23044444444444445, "grad_norm": 0.6617216467857361, "learning_rate": 0.00015425389755011136, "loss": 0.9611, "step": 1037 }, { "epoch": 0.23066666666666666, "grad_norm": 0.6723819375038147, "learning_rate": 0.00015420935412026728, "loss": 0.7742, "step": 1038 }, { "epoch": 0.2308888888888889, "grad_norm": 1.1894159317016602, "learning_rate": 0.00015416481069042317, "loss": 1.9163, "step": 1039 }, { "epoch": 0.2311111111111111, "grad_norm": 0.8032247424125671, "learning_rate": 0.00015412026726057906, "loss": 0.9474, "step": 1040 }, { "epoch": 0.23133333333333334, "grad_norm": 0.6486397981643677, "learning_rate": 0.00015407572383073495, "loss": 0.8002, "step": 1041 }, { "epoch": 0.23155555555555554, "grad_norm": 0.9920786619186401, "learning_rate": 0.0001540311804008909, "loss": 1.2296, "step": 1042 }, { "epoch": 0.23177777777777778, "grad_norm": 1.0267969369888306, "learning_rate": 0.0001539866369710468, "loss": 1.479, "step": 1043 }, { "epoch": 0.232, "grad_norm": 1.1731008291244507, "learning_rate": 0.00015394209354120268, "loss": 1.6044, "step": 1044 }, { "epoch": 0.23222222222222222, "grad_norm": 0.9902610182762146, "learning_rate": 0.00015389755011135857, "loss": 1.4472, "step": 1045 }, { "epoch": 0.23244444444444445, "grad_norm": 1.1087284088134766, "learning_rate": 0.00015385300668151446, "loss": 1.2586, "step": 1046 }, { "epoch": 0.23266666666666666, "grad_norm": 0.7335534691810608, "learning_rate": 0.00015380846325167038, "loss": 0.7099, "step": 1047 }, { "epoch": 0.2328888888888889, "grad_norm": 1.1326829195022583, "learning_rate": 0.0001537639198218263, "loss": 1.0497, "step": 1048 }, { "epoch": 0.2331111111111111, "grad_norm": 0.7942183613777161, "learning_rate": 0.0001537193763919822, "loss": 0.6312, "step": 1049 }, { "epoch": 0.23333333333333334, "grad_norm": 1.005306601524353, "learning_rate": 0.00015367483296213809, "loss": 0.7768, "step": 1050 }, { "epoch": 0.23355555555555554, "grad_norm": 0.9126147031784058, "learning_rate": 0.000153630289532294, "loss": 2.3264, "step": 1051 }, { "epoch": 0.23377777777777778, "grad_norm": 0.6329047083854675, "learning_rate": 0.0001535857461024499, "loss": 1.0818, "step": 1052 }, { "epoch": 0.234, "grad_norm": 0.05343164876103401, "learning_rate": 0.0001535412026726058, "loss": 0.0112, "step": 1053 }, { "epoch": 0.23422222222222222, "grad_norm": 0.9046483039855957, "learning_rate": 0.0001534966592427617, "loss": 2.37, "step": 1054 }, { "epoch": 0.23444444444444446, "grad_norm": 0.8647105693817139, "learning_rate": 0.0001534521158129176, "loss": 2.4517, "step": 1055 }, { "epoch": 0.23466666666666666, "grad_norm": 0.648027241230011, "learning_rate": 0.00015340757238307352, "loss": 1.0719, "step": 1056 }, { "epoch": 0.2348888888888889, "grad_norm": 0.5614693760871887, "learning_rate": 0.0001533630289532294, "loss": 1.1226, "step": 1057 }, { "epoch": 0.2351111111111111, "grad_norm": 0.8491237163543701, "learning_rate": 0.0001533184855233853, "loss": 2.069, "step": 1058 }, { "epoch": 0.23533333333333334, "grad_norm": 0.6382585167884827, "learning_rate": 0.0001532739420935412, "loss": 1.3122, "step": 1059 }, { "epoch": 0.23555555555555555, "grad_norm": 0.12528251111507416, "learning_rate": 0.0001532293986636971, "loss": 0.0166, "step": 1060 }, { "epoch": 0.23577777777777778, "grad_norm": 0.09820661693811417, "learning_rate": 0.00015318485523385303, "loss": 0.0157, "step": 1061 }, { "epoch": 0.236, "grad_norm": 0.537966787815094, "learning_rate": 0.00015314031180400892, "loss": 0.887, "step": 1062 }, { "epoch": 0.23622222222222222, "grad_norm": 0.869125485420227, "learning_rate": 0.00015309576837416482, "loss": 2.243, "step": 1063 }, { "epoch": 0.23644444444444446, "grad_norm": 0.9121571779251099, "learning_rate": 0.0001530512249443207, "loss": 2.0393, "step": 1064 }, { "epoch": 0.23666666666666666, "grad_norm": 0.8999320864677429, "learning_rate": 0.00015300668151447663, "loss": 2.0973, "step": 1065 }, { "epoch": 0.2368888888888889, "grad_norm": 1.0168380737304688, "learning_rate": 0.00015296213808463254, "loss": 1.9858, "step": 1066 }, { "epoch": 0.2371111111111111, "grad_norm": 0.10989035665988922, "learning_rate": 0.00015291759465478844, "loss": 0.0164, "step": 1067 }, { "epoch": 0.23733333333333334, "grad_norm": 0.09613081812858582, "learning_rate": 0.00015287305122494433, "loss": 0.0164, "step": 1068 }, { "epoch": 0.23755555555555555, "grad_norm": 1.0540229082107544, "learning_rate": 0.00015282850779510022, "loss": 2.1282, "step": 1069 }, { "epoch": 0.23777777777777778, "grad_norm": 0.6315290927886963, "learning_rate": 0.00015278396436525614, "loss": 0.9098, "step": 1070 }, { "epoch": 0.238, "grad_norm": 0.08182475715875626, "learning_rate": 0.00015273942093541203, "loss": 0.018, "step": 1071 }, { "epoch": 0.23822222222222222, "grad_norm": 0.07903318852186203, "learning_rate": 0.00015269487750556795, "loss": 0.0176, "step": 1072 }, { "epoch": 0.23844444444444443, "grad_norm": 0.07375822216272354, "learning_rate": 0.00015265033407572384, "loss": 0.017, "step": 1073 }, { "epoch": 0.23866666666666667, "grad_norm": 1.087067723274231, "learning_rate": 0.00015260579064587973, "loss": 1.0629, "step": 1074 }, { "epoch": 0.2388888888888889, "grad_norm": 0.9791780114173889, "learning_rate": 0.00015256124721603565, "loss": 1.8542, "step": 1075 }, { "epoch": 0.2391111111111111, "grad_norm": 0.920712411403656, "learning_rate": 0.00015251670378619154, "loss": 1.8871, "step": 1076 }, { "epoch": 0.23933333333333334, "grad_norm": 0.9495246410369873, "learning_rate": 0.00015247216035634744, "loss": 1.798, "step": 1077 }, { "epoch": 0.23955555555555555, "grad_norm": 0.927691638469696, "learning_rate": 0.00015242761692650335, "loss": 1.5861, "step": 1078 }, { "epoch": 0.23977777777777778, "grad_norm": 0.9088736176490784, "learning_rate": 0.00015238307349665925, "loss": 1.4221, "step": 1079 }, { "epoch": 0.24, "grad_norm": 0.7401149868965149, "learning_rate": 0.00015233853006681517, "loss": 0.9053, "step": 1080 }, { "epoch": 0.24022222222222223, "grad_norm": 0.5902130007743835, "learning_rate": 0.00015229398663697106, "loss": 0.5401, "step": 1081 }, { "epoch": 0.24044444444444443, "grad_norm": 0.06568460166454315, "learning_rate": 0.00015224944320712695, "loss": 0.0187, "step": 1082 }, { "epoch": 0.24066666666666667, "grad_norm": 0.08122013509273529, "learning_rate": 0.00015220489977728284, "loss": 0.0183, "step": 1083 }, { "epoch": 0.2408888888888889, "grad_norm": 1.0280604362487793, "learning_rate": 0.00015216035634743876, "loss": 1.7743, "step": 1084 }, { "epoch": 0.2411111111111111, "grad_norm": 1.0982590913772583, "learning_rate": 0.00015211581291759468, "loss": 1.8326, "step": 1085 }, { "epoch": 0.24133333333333334, "grad_norm": 0.109102763235569, "learning_rate": 0.00015207126948775057, "loss": 0.0195, "step": 1086 }, { "epoch": 0.24155555555555555, "grad_norm": 0.1620292216539383, "learning_rate": 0.00015202672605790646, "loss": 0.0194, "step": 1087 }, { "epoch": 0.24177777777777779, "grad_norm": 0.11929841339588165, "learning_rate": 0.00015198218262806235, "loss": 0.0187, "step": 1088 }, { "epoch": 0.242, "grad_norm": 1.131895661354065, "learning_rate": 0.00015193763919821827, "loss": 1.6243, "step": 1089 }, { "epoch": 0.24222222222222223, "grad_norm": 0.714919924736023, "learning_rate": 0.0001518930957683742, "loss": 0.8017, "step": 1090 }, { "epoch": 0.24244444444444443, "grad_norm": 0.9816338419914246, "learning_rate": 0.00015184855233853008, "loss": 1.386, "step": 1091 }, { "epoch": 0.24266666666666667, "grad_norm": 1.0071079730987549, "learning_rate": 0.00015180400890868598, "loss": 1.5263, "step": 1092 }, { "epoch": 0.24288888888888888, "grad_norm": 1.1087195873260498, "learning_rate": 0.00015175946547884187, "loss": 1.7932, "step": 1093 }, { "epoch": 0.2431111111111111, "grad_norm": 0.9575614333152771, "learning_rate": 0.00015171492204899779, "loss": 1.5521, "step": 1094 }, { "epoch": 0.24333333333333335, "grad_norm": 0.7373946309089661, "learning_rate": 0.00015167037861915368, "loss": 0.7751, "step": 1095 }, { "epoch": 0.24355555555555555, "grad_norm": 1.1326210498809814, "learning_rate": 0.0001516258351893096, "loss": 1.3552, "step": 1096 }, { "epoch": 0.2437777777777778, "grad_norm": 1.0046254396438599, "learning_rate": 0.0001515812917594655, "loss": 1.3028, "step": 1097 }, { "epoch": 0.244, "grad_norm": 1.095888376235962, "learning_rate": 0.00015153674832962138, "loss": 1.2255, "step": 1098 }, { "epoch": 0.24422222222222223, "grad_norm": 0.9542917609214783, "learning_rate": 0.0001514922048997773, "loss": 0.6265, "step": 1099 }, { "epoch": 0.24444444444444444, "grad_norm": 0.17893174290657043, "learning_rate": 0.0001514476614699332, "loss": 0.037, "step": 1100 }, { "epoch": 0.24466666666666667, "grad_norm": 0.8549418449401855, "learning_rate": 0.00015140311804008908, "loss": 2.1731, "step": 1101 }, { "epoch": 0.24488888888888888, "grad_norm": 0.7608986496925354, "learning_rate": 0.00015135857461024498, "loss": 1.9913, "step": 1102 }, { "epoch": 0.2451111111111111, "grad_norm": 0.9019181728363037, "learning_rate": 0.0001513140311804009, "loss": 2.2666, "step": 1103 }, { "epoch": 0.24533333333333332, "grad_norm": 0.07070691883563995, "learning_rate": 0.0001512694877505568, "loss": 0.0127, "step": 1104 }, { "epoch": 0.24555555555555555, "grad_norm": 0.07447178661823273, "learning_rate": 0.0001512249443207127, "loss": 0.0126, "step": 1105 }, { "epoch": 0.2457777777777778, "grad_norm": 0.07123079895973206, "learning_rate": 0.0001511804008908686, "loss": 0.0125, "step": 1106 }, { "epoch": 0.246, "grad_norm": 0.06680039316415787, "learning_rate": 0.0001511358574610245, "loss": 0.012, "step": 1107 }, { "epoch": 0.24622222222222223, "grad_norm": 0.8252882361412048, "learning_rate": 0.0001510913140311804, "loss": 1.9517, "step": 1108 }, { "epoch": 0.24644444444444444, "grad_norm": 0.8782687783241272, "learning_rate": 0.00015104677060133633, "loss": 2.119, "step": 1109 }, { "epoch": 0.24666666666666667, "grad_norm": 0.9111925363540649, "learning_rate": 0.00015100222717149222, "loss": 2.3125, "step": 1110 }, { "epoch": 0.24688888888888888, "grad_norm": 0.9598534107208252, "learning_rate": 0.0001509576837416481, "loss": 1.9542, "step": 1111 }, { "epoch": 0.24711111111111111, "grad_norm": 0.9928382635116577, "learning_rate": 0.000150913140311804, "loss": 2.1632, "step": 1112 }, { "epoch": 0.24733333333333332, "grad_norm": 0.8307510614395142, "learning_rate": 0.00015086859688195992, "loss": 2.0449, "step": 1113 }, { "epoch": 0.24755555555555556, "grad_norm": 0.8130167722702026, "learning_rate": 0.0001508240534521158, "loss": 1.6968, "step": 1114 }, { "epoch": 0.2477777777777778, "grad_norm": 0.9309992790222168, "learning_rate": 0.00015077951002227173, "loss": 2.1483, "step": 1115 }, { "epoch": 0.248, "grad_norm": 0.7260227203369141, "learning_rate": 0.00015073496659242762, "loss": 1.0392, "step": 1116 }, { "epoch": 0.24822222222222223, "grad_norm": 0.6399407386779785, "learning_rate": 0.00015069042316258351, "loss": 1.005, "step": 1117 }, { "epoch": 0.24844444444444444, "grad_norm": 0.9045966863632202, "learning_rate": 0.00015064587973273943, "loss": 1.9107, "step": 1118 }, { "epoch": 0.24866666666666667, "grad_norm": 0.9134828448295593, "learning_rate": 0.00015060133630289533, "loss": 1.8531, "step": 1119 }, { "epoch": 0.24888888888888888, "grad_norm": 0.8679183721542358, "learning_rate": 0.00015055679287305122, "loss": 1.8696, "step": 1120 }, { "epoch": 0.24911111111111112, "grad_norm": 0.9295245409011841, "learning_rate": 0.00015051224944320714, "loss": 1.8919, "step": 1121 }, { "epoch": 0.24933333333333332, "grad_norm": 0.7018031477928162, "learning_rate": 0.00015046770601336303, "loss": 0.8568, "step": 1122 }, { "epoch": 0.24955555555555556, "grad_norm": 0.9831565618515015, "learning_rate": 0.00015042316258351895, "loss": 1.9167, "step": 1123 }, { "epoch": 0.24977777777777777, "grad_norm": 0.788474977016449, "learning_rate": 0.00015037861915367484, "loss": 1.0294, "step": 1124 }, { "epoch": 0.25, "grad_norm": 0.9335272908210754, "learning_rate": 0.00015033407572383073, "loss": 1.7164, "step": 1125 }, { "epoch": 0.25022222222222223, "grad_norm": 1.0208749771118164, "learning_rate": 0.00015028953229398662, "loss": 1.7482, "step": 1126 }, { "epoch": 0.25044444444444447, "grad_norm": 1.1490461826324463, "learning_rate": 0.00015024498886414257, "loss": 2.2458, "step": 1127 }, { "epoch": 0.25066666666666665, "grad_norm": 0.07004405558109283, "learning_rate": 0.00015020044543429846, "loss": 0.018, "step": 1128 }, { "epoch": 0.2508888888888889, "grad_norm": 0.07559281587600708, "learning_rate": 0.00015015590200445435, "loss": 0.0176, "step": 1129 }, { "epoch": 0.2511111111111111, "grad_norm": 0.6543799638748169, "learning_rate": 0.00015011135857461024, "loss": 0.8183, "step": 1130 }, { "epoch": 0.25133333333333335, "grad_norm": 0.9669918417930603, "learning_rate": 0.00015006681514476614, "loss": 1.591, "step": 1131 }, { "epoch": 0.25155555555555553, "grad_norm": 1.1905044317245483, "learning_rate": 0.00015002227171492205, "loss": 2.0181, "step": 1132 }, { "epoch": 0.25177777777777777, "grad_norm": 0.6645126938819885, "learning_rate": 0.00014997772828507797, "loss": 0.9216, "step": 1133 }, { "epoch": 0.252, "grad_norm": 0.7185103893280029, "learning_rate": 0.00014993318485523387, "loss": 0.867, "step": 1134 }, { "epoch": 0.25222222222222224, "grad_norm": 1.1038732528686523, "learning_rate": 0.00014988864142538976, "loss": 1.8593, "step": 1135 }, { "epoch": 0.25244444444444447, "grad_norm": 1.0466231107711792, "learning_rate": 0.00014984409799554568, "loss": 1.6207, "step": 1136 }, { "epoch": 0.25266666666666665, "grad_norm": 1.0683057308197021, "learning_rate": 0.00014979955456570157, "loss": 1.6742, "step": 1137 }, { "epoch": 0.2528888888888889, "grad_norm": 0.14429320394992828, "learning_rate": 0.00014975501113585746, "loss": 0.0322, "step": 1138 }, { "epoch": 0.2531111111111111, "grad_norm": 1.1202466487884521, "learning_rate": 0.00014971046770601338, "loss": 1.7064, "step": 1139 }, { "epoch": 0.25333333333333335, "grad_norm": 1.1153877973556519, "learning_rate": 0.00014966592427616927, "loss": 1.5018, "step": 1140 }, { "epoch": 0.25355555555555553, "grad_norm": 1.383662223815918, "learning_rate": 0.0001496213808463252, "loss": 1.4785, "step": 1141 }, { "epoch": 0.25377777777777777, "grad_norm": 1.060515284538269, "learning_rate": 0.00014957683741648108, "loss": 1.5914, "step": 1142 }, { "epoch": 0.254, "grad_norm": 0.6889761090278625, "learning_rate": 0.00014953229398663697, "loss": 0.686, "step": 1143 }, { "epoch": 0.25422222222222224, "grad_norm": 0.8010234832763672, "learning_rate": 0.00014948775055679286, "loss": 0.6522, "step": 1144 }, { "epoch": 0.2544444444444444, "grad_norm": 0.9428964257240295, "learning_rate": 0.00014944320712694878, "loss": 1.1183, "step": 1145 }, { "epoch": 0.25466666666666665, "grad_norm": 1.2112162113189697, "learning_rate": 0.0001493986636971047, "loss": 1.4202, "step": 1146 }, { "epoch": 0.2548888888888889, "grad_norm": 0.8112894892692566, "learning_rate": 0.0001493541202672606, "loss": 0.7603, "step": 1147 }, { "epoch": 0.2551111111111111, "grad_norm": 0.18849223852157593, "learning_rate": 0.00014930957683741649, "loss": 0.0361, "step": 1148 }, { "epoch": 0.25533333333333336, "grad_norm": 1.0638991594314575, "learning_rate": 0.00014926503340757238, "loss": 1.2604, "step": 1149 }, { "epoch": 0.25555555555555554, "grad_norm": 1.0759894847869873, "learning_rate": 0.0001492204899777283, "loss": 0.8551, "step": 1150 }, { "epoch": 0.25577777777777777, "grad_norm": 0.8875473737716675, "learning_rate": 0.00014917594654788422, "loss": 2.2251, "step": 1151 }, { "epoch": 0.256, "grad_norm": 0.06436305493116379, "learning_rate": 0.0001491314031180401, "loss": 0.0134, "step": 1152 }, { "epoch": 0.25622222222222224, "grad_norm": 0.06132598966360092, "learning_rate": 0.000149086859688196, "loss": 0.013, "step": 1153 }, { "epoch": 0.2564444444444444, "grad_norm": 0.6333918571472168, "learning_rate": 0.0001490423162583519, "loss": 1.2197, "step": 1154 }, { "epoch": 0.25666666666666665, "grad_norm": 0.6512543559074402, "learning_rate": 0.0001489977728285078, "loss": 1.0077, "step": 1155 }, { "epoch": 0.2568888888888889, "grad_norm": 0.8547171354293823, "learning_rate": 0.0001489532293986637, "loss": 1.9731, "step": 1156 }, { "epoch": 0.2571111111111111, "grad_norm": 0.9840943217277527, "learning_rate": 0.00014890868596881962, "loss": 2.437, "step": 1157 }, { "epoch": 0.25733333333333336, "grad_norm": 0.08678994327783585, "learning_rate": 0.0001488641425389755, "loss": 0.0136, "step": 1158 }, { "epoch": 0.25755555555555554, "grad_norm": 0.891431450843811, "learning_rate": 0.0001488195991091314, "loss": 1.6169, "step": 1159 }, { "epoch": 0.2577777777777778, "grad_norm": 1.0839104652404785, "learning_rate": 0.00014877505567928732, "loss": 1.8314, "step": 1160 }, { "epoch": 0.258, "grad_norm": 1.0359727144241333, "learning_rate": 0.00014873051224944322, "loss": 2.0611, "step": 1161 }, { "epoch": 0.25822222222222224, "grad_norm": 0.1595863550901413, "learning_rate": 0.0001486859688195991, "loss": 0.0189, "step": 1162 }, { "epoch": 0.2584444444444444, "grad_norm": 0.09386439621448517, "learning_rate": 0.00014864142538975503, "loss": 0.0174, "step": 1163 }, { "epoch": 0.25866666666666666, "grad_norm": 0.9504187107086182, "learning_rate": 0.00014859688195991092, "loss": 1.9366, "step": 1164 }, { "epoch": 0.2588888888888889, "grad_norm": 0.9891676902770996, "learning_rate": 0.00014855233853006684, "loss": 2.3224, "step": 1165 }, { "epoch": 0.2591111111111111, "grad_norm": 0.817260205745697, "learning_rate": 0.00014850779510022273, "loss": 1.9071, "step": 1166 }, { "epoch": 0.25933333333333336, "grad_norm": 0.9042514562606812, "learning_rate": 0.00014846325167037862, "loss": 1.9343, "step": 1167 }, { "epoch": 0.25955555555555554, "grad_norm": 0.9342247843742371, "learning_rate": 0.0001484187082405345, "loss": 2.2231, "step": 1168 }, { "epoch": 0.2597777777777778, "grad_norm": 0.5495959520339966, "learning_rate": 0.00014837416481069043, "loss": 0.6888, "step": 1169 }, { "epoch": 0.26, "grad_norm": 0.081369549036026, "learning_rate": 0.00014832962138084635, "loss": 0.0182, "step": 1170 }, { "epoch": 0.26022222222222224, "grad_norm": 0.08859319984912872, "learning_rate": 0.00014828507795100224, "loss": 0.0182, "step": 1171 }, { "epoch": 0.2604444444444444, "grad_norm": 0.07650327682495117, "learning_rate": 0.00014824053452115813, "loss": 0.0176, "step": 1172 }, { "epoch": 0.26066666666666666, "grad_norm": 0.8651443123817444, "learning_rate": 0.00014819599109131403, "loss": 1.6714, "step": 1173 }, { "epoch": 0.2608888888888889, "grad_norm": 0.7669498920440674, "learning_rate": 0.00014815144766146994, "loss": 0.0361, "step": 1174 }, { "epoch": 0.2611111111111111, "grad_norm": 0.9396262764930725, "learning_rate": 0.00014810690423162584, "loss": 1.8296, "step": 1175 }, { "epoch": 0.2613333333333333, "grad_norm": 1.0305185317993164, "learning_rate": 0.00014806236080178175, "loss": 1.9626, "step": 1176 }, { "epoch": 0.26155555555555554, "grad_norm": 0.9989243745803833, "learning_rate": 0.00014801781737193765, "loss": 2.113, "step": 1177 }, { "epoch": 0.2617777777777778, "grad_norm": 0.8748054504394531, "learning_rate": 0.00014797327394209354, "loss": 1.5958, "step": 1178 }, { "epoch": 0.262, "grad_norm": 1.3041480779647827, "learning_rate": 0.00014792873051224946, "loss": 1.7549, "step": 1179 }, { "epoch": 0.26222222222222225, "grad_norm": 1.0158839225769043, "learning_rate": 0.00014788418708240535, "loss": 1.9756, "step": 1180 }, { "epoch": 0.2624444444444444, "grad_norm": 0.7090574502944946, "learning_rate": 0.00014783964365256124, "loss": 1.051, "step": 1181 }, { "epoch": 0.26266666666666666, "grad_norm": 0.13157020509243011, "learning_rate": 0.00014779510022271716, "loss": 0.0176, "step": 1182 }, { "epoch": 0.2628888888888889, "grad_norm": 0.738917350769043, "learning_rate": 0.00014775055679287305, "loss": 0.9179, "step": 1183 }, { "epoch": 0.26311111111111113, "grad_norm": 0.14277049899101257, "learning_rate": 0.00014770601336302897, "loss": 0.0199, "step": 1184 }, { "epoch": 0.2633333333333333, "grad_norm": 0.1282823085784912, "learning_rate": 0.00014766146993318486, "loss": 0.0193, "step": 1185 }, { "epoch": 0.26355555555555554, "grad_norm": 0.10668764263391495, "learning_rate": 0.00014761692650334075, "loss": 0.018, "step": 1186 }, { "epoch": 0.2637777777777778, "grad_norm": 0.0911671444773674, "learning_rate": 0.00014757238307349665, "loss": 0.0169, "step": 1187 }, { "epoch": 0.264, "grad_norm": 0.7024106979370117, "learning_rate": 0.00014752783964365256, "loss": 0.9606, "step": 1188 }, { "epoch": 0.26422222222222225, "grad_norm": 0.9707425236701965, "learning_rate": 0.00014748329621380848, "loss": 1.8245, "step": 1189 }, { "epoch": 0.2644444444444444, "grad_norm": 0.25269874930381775, "learning_rate": 0.00014743875278396438, "loss": 0.0316, "step": 1190 }, { "epoch": 0.26466666666666666, "grad_norm": 0.9807192087173462, "learning_rate": 0.00014739420935412027, "loss": 1.659, "step": 1191 }, { "epoch": 0.2648888888888889, "grad_norm": 1.0586344003677368, "learning_rate": 0.00014734966592427616, "loss": 1.6508, "step": 1192 }, { "epoch": 0.26511111111111113, "grad_norm": 1.1077786684036255, "learning_rate": 0.00014730512249443208, "loss": 1.7849, "step": 1193 }, { "epoch": 0.2653333333333333, "grad_norm": 0.9380425810813904, "learning_rate": 0.000147260579064588, "loss": 1.4617, "step": 1194 }, { "epoch": 0.26555555555555554, "grad_norm": 0.8665103912353516, "learning_rate": 0.0001472160356347439, "loss": 0.8163, "step": 1195 }, { "epoch": 0.2657777777777778, "grad_norm": 0.20836298167705536, "learning_rate": 0.00014717149220489978, "loss": 0.0295, "step": 1196 }, { "epoch": 0.266, "grad_norm": 0.7868085503578186, "learning_rate": 0.00014712694877505567, "loss": 0.6978, "step": 1197 }, { "epoch": 0.26622222222222225, "grad_norm": 0.9609499573707581, "learning_rate": 0.0001470824053452116, "loss": 1.2007, "step": 1198 }, { "epoch": 0.26644444444444443, "grad_norm": 0.6641119718551636, "learning_rate": 0.00014703786191536748, "loss": 0.5312, "step": 1199 }, { "epoch": 0.26666666666666666, "grad_norm": 0.5177371501922607, "learning_rate": 0.0001469933184855234, "loss": 0.2728, "step": 1200 }, { "epoch": 0.2668888888888889, "grad_norm": 0.6398945450782776, "learning_rate": 0.0001469487750556793, "loss": 1.1822, "step": 1201 }, { "epoch": 0.26711111111111113, "grad_norm": 0.5665084719657898, "learning_rate": 0.00014690423162583519, "loss": 1.0326, "step": 1202 }, { "epoch": 0.2673333333333333, "grad_norm": 0.806073009967804, "learning_rate": 0.0001468596881959911, "loss": 2.2595, "step": 1203 }, { "epoch": 0.26755555555555555, "grad_norm": 0.05791214853525162, "learning_rate": 0.000146815144766147, "loss": 0.0123, "step": 1204 }, { "epoch": 0.2677777777777778, "grad_norm": 0.5717757344245911, "learning_rate": 0.0001467706013363029, "loss": 1.0509, "step": 1205 }, { "epoch": 0.268, "grad_norm": 0.8858723044395447, "learning_rate": 0.0001467260579064588, "loss": 2.0359, "step": 1206 }, { "epoch": 0.2682222222222222, "grad_norm": 0.14826105535030365, "learning_rate": 0.0001466815144766147, "loss": 0.0167, "step": 1207 }, { "epoch": 0.26844444444444443, "grad_norm": 0.099408358335495, "learning_rate": 0.00014663697104677062, "loss": 0.0163, "step": 1208 }, { "epoch": 0.26866666666666666, "grad_norm": 0.09596351534128189, "learning_rate": 0.0001465924276169265, "loss": 0.0153, "step": 1209 }, { "epoch": 0.2688888888888889, "grad_norm": 0.8464707732200623, "learning_rate": 0.0001465478841870824, "loss": 2.4877, "step": 1210 }, { "epoch": 0.26911111111111113, "grad_norm": 0.8414135575294495, "learning_rate": 0.0001465033407572383, "loss": 2.4347, "step": 1211 }, { "epoch": 0.2693333333333333, "grad_norm": 0.8445218205451965, "learning_rate": 0.00014645879732739424, "loss": 2.0295, "step": 1212 }, { "epoch": 0.26955555555555555, "grad_norm": 0.8376966118812561, "learning_rate": 0.00014641425389755013, "loss": 1.9265, "step": 1213 }, { "epoch": 0.2697777777777778, "grad_norm": 1.121559977531433, "learning_rate": 0.00014636971046770602, "loss": 1.9569, "step": 1214 }, { "epoch": 0.27, "grad_norm": 0.9244825839996338, "learning_rate": 0.00014632516703786191, "loss": 1.9792, "step": 1215 }, { "epoch": 0.2702222222222222, "grad_norm": 0.9264352321624756, "learning_rate": 0.0001462806236080178, "loss": 1.965, "step": 1216 }, { "epoch": 0.27044444444444443, "grad_norm": 0.9068411588668823, "learning_rate": 0.00014623608017817373, "loss": 1.9279, "step": 1217 }, { "epoch": 0.27066666666666667, "grad_norm": 0.9238573312759399, "learning_rate": 0.00014619153674832964, "loss": 1.9785, "step": 1218 }, { "epoch": 0.2708888888888889, "grad_norm": 0.8850834965705872, "learning_rate": 0.00014614699331848554, "loss": 1.8371, "step": 1219 }, { "epoch": 0.27111111111111114, "grad_norm": 0.8664029240608215, "learning_rate": 0.00014610244988864143, "loss": 1.6893, "step": 1220 }, { "epoch": 0.2713333333333333, "grad_norm": 0.9488045573234558, "learning_rate": 0.00014605790645879735, "loss": 1.9064, "step": 1221 }, { "epoch": 0.27155555555555555, "grad_norm": 0.11144151538610458, "learning_rate": 0.00014601336302895324, "loss": 0.0165, "step": 1222 }, { "epoch": 0.2717777777777778, "grad_norm": 0.11891952157020569, "learning_rate": 0.00014596881959910913, "loss": 0.0163, "step": 1223 }, { "epoch": 0.272, "grad_norm": 0.07421109825372696, "learning_rate": 0.00014592427616926505, "loss": 0.016, "step": 1224 }, { "epoch": 0.2722222222222222, "grad_norm": 0.5813721418380737, "learning_rate": 0.00014587973273942094, "loss": 1.0181, "step": 1225 }, { "epoch": 0.27244444444444443, "grad_norm": 0.720731258392334, "learning_rate": 0.00014583518930957686, "loss": 0.8357, "step": 1226 }, { "epoch": 0.27266666666666667, "grad_norm": 0.8886854648590088, "learning_rate": 0.00014579064587973275, "loss": 1.6687, "step": 1227 }, { "epoch": 0.2728888888888889, "grad_norm": 1.0154438018798828, "learning_rate": 0.00014574610244988864, "loss": 2.0414, "step": 1228 }, { "epoch": 0.27311111111111114, "grad_norm": 1.0460702180862427, "learning_rate": 0.00014570155902004454, "loss": 1.8346, "step": 1229 }, { "epoch": 0.2733333333333333, "grad_norm": 1.0006906986236572, "learning_rate": 0.00014565701559020045, "loss": 0.9797, "step": 1230 }, { "epoch": 0.27355555555555555, "grad_norm": 0.08748811483383179, "learning_rate": 0.00014561247216035637, "loss": 0.0191, "step": 1231 }, { "epoch": 0.2737777777777778, "grad_norm": 0.08443690091371536, "learning_rate": 0.00014556792873051227, "loss": 0.0191, "step": 1232 }, { "epoch": 0.274, "grad_norm": 0.8554977774620056, "learning_rate": 0.00014552338530066816, "loss": 0.8648, "step": 1233 }, { "epoch": 0.2742222222222222, "grad_norm": 0.2030288428068161, "learning_rate": 0.00014547884187082405, "loss": 0.0251, "step": 1234 }, { "epoch": 0.27444444444444444, "grad_norm": 0.1689607948064804, "learning_rate": 0.00014543429844097997, "loss": 0.0226, "step": 1235 }, { "epoch": 0.27466666666666667, "grad_norm": 0.14935432374477386, "learning_rate": 0.0001453897550111359, "loss": 0.0201, "step": 1236 }, { "epoch": 0.2748888888888889, "grad_norm": 0.9350739121437073, "learning_rate": 0.00014534521158129178, "loss": 1.5576, "step": 1237 }, { "epoch": 0.2751111111111111, "grad_norm": 0.11490830034017563, "learning_rate": 0.00014530066815144767, "loss": 0.0268, "step": 1238 }, { "epoch": 0.2753333333333333, "grad_norm": 1.1253389120101929, "learning_rate": 0.00014525612472160356, "loss": 1.6596, "step": 1239 }, { "epoch": 0.27555555555555555, "grad_norm": 1.0991276502609253, "learning_rate": 0.00014521158129175948, "loss": 1.4434, "step": 1240 }, { "epoch": 0.2757777777777778, "grad_norm": 1.245217204093933, "learning_rate": 0.00014516703786191537, "loss": 1.6334, "step": 1241 }, { "epoch": 0.276, "grad_norm": 1.0247350931167603, "learning_rate": 0.00014512249443207126, "loss": 1.5174, "step": 1242 }, { "epoch": 0.2762222222222222, "grad_norm": 1.1460856199264526, "learning_rate": 0.00014507795100222718, "loss": 1.4504, "step": 1243 }, { "epoch": 0.27644444444444444, "grad_norm": 0.8328757286071777, "learning_rate": 0.00014503340757238308, "loss": 0.7809, "step": 1244 }, { "epoch": 0.27666666666666667, "grad_norm": 0.726498007774353, "learning_rate": 0.000144988864142539, "loss": 0.6805, "step": 1245 }, { "epoch": 0.2768888888888889, "grad_norm": 1.1462013721466064, "learning_rate": 0.00014494432071269489, "loss": 1.3072, "step": 1246 }, { "epoch": 0.2771111111111111, "grad_norm": 1.0997343063354492, "learning_rate": 0.00014489977728285078, "loss": 1.18, "step": 1247 }, { "epoch": 0.2773333333333333, "grad_norm": 0.16613906621932983, "learning_rate": 0.00014485523385300667, "loss": 0.0333, "step": 1248 }, { "epoch": 0.27755555555555556, "grad_norm": 1.0108569860458374, "learning_rate": 0.0001448106904231626, "loss": 1.0494, "step": 1249 }, { "epoch": 0.2777777777777778, "grad_norm": 1.0045305490493774, "learning_rate": 0.0001447661469933185, "loss": 0.7517, "step": 1250 }, { "epoch": 0.278, "grad_norm": 0.591677188873291, "learning_rate": 0.0001447216035634744, "loss": 1.1579, "step": 1251 }, { "epoch": 0.2782222222222222, "grad_norm": 0.6461545825004578, "learning_rate": 0.0001446770601336303, "loss": 1.2558, "step": 1252 }, { "epoch": 0.27844444444444444, "grad_norm": 0.5436459183692932, "learning_rate": 0.00014463251670378618, "loss": 1.0179, "step": 1253 }, { "epoch": 0.2786666666666667, "grad_norm": 0.5476921200752258, "learning_rate": 0.0001445879732739421, "loss": 1.1344, "step": 1254 }, { "epoch": 0.2788888888888889, "grad_norm": 0.8634714484214783, "learning_rate": 0.00014454342984409802, "loss": 2.2497, "step": 1255 }, { "epoch": 0.2791111111111111, "grad_norm": 0.8563636541366577, "learning_rate": 0.0001444988864142539, "loss": 2.2022, "step": 1256 }, { "epoch": 0.2793333333333333, "grad_norm": 0.6468019485473633, "learning_rate": 0.0001444543429844098, "loss": 1.102, "step": 1257 }, { "epoch": 0.27955555555555556, "grad_norm": 0.0794794037938118, "learning_rate": 0.0001444097995545657, "loss": 0.013, "step": 1258 }, { "epoch": 0.2797777777777778, "grad_norm": 0.07662985473871231, "learning_rate": 0.00014436525612472161, "loss": 0.0129, "step": 1259 }, { "epoch": 0.28, "grad_norm": 0.5852623581886292, "learning_rate": 0.0001443207126948775, "loss": 1.1717, "step": 1260 }, { "epoch": 0.2802222222222222, "grad_norm": 0.8723194003105164, "learning_rate": 0.00014427616926503343, "loss": 2.0411, "step": 1261 }, { "epoch": 0.28044444444444444, "grad_norm": 0.927684485912323, "learning_rate": 0.00014423162583518932, "loss": 2.4348, "step": 1262 }, { "epoch": 0.2806666666666667, "grad_norm": 0.9913063645362854, "learning_rate": 0.0001441870824053452, "loss": 2.1211, "step": 1263 }, { "epoch": 0.2808888888888889, "grad_norm": 0.8557515144348145, "learning_rate": 0.00014414253897550113, "loss": 2.1376, "step": 1264 }, { "epoch": 0.2811111111111111, "grad_norm": 0.9847201704978943, "learning_rate": 0.00014409799554565702, "loss": 1.9602, "step": 1265 }, { "epoch": 0.2813333333333333, "grad_norm": 0.9515376687049866, "learning_rate": 0.0001440534521158129, "loss": 1.8319, "step": 1266 }, { "epoch": 0.28155555555555556, "grad_norm": 0.12117951363325119, "learning_rate": 0.00014400890868596883, "loss": 0.0204, "step": 1267 }, { "epoch": 0.2817777777777778, "grad_norm": 0.8738093376159668, "learning_rate": 0.00014396436525612472, "loss": 2.0693, "step": 1268 }, { "epoch": 0.282, "grad_norm": 0.9916730523109436, "learning_rate": 0.00014391982182628064, "loss": 1.8292, "step": 1269 }, { "epoch": 0.2822222222222222, "grad_norm": 0.9203552007675171, "learning_rate": 0.00014387527839643653, "loss": 1.9077, "step": 1270 }, { "epoch": 0.28244444444444444, "grad_norm": 0.9378885626792908, "learning_rate": 0.00014383073496659243, "loss": 1.6729, "step": 1271 }, { "epoch": 0.2826666666666667, "grad_norm": 0.8718151450157166, "learning_rate": 0.00014378619153674832, "loss": 1.8453, "step": 1272 }, { "epoch": 0.2828888888888889, "grad_norm": 0.545022189617157, "learning_rate": 0.00014374164810690424, "loss": 0.9028, "step": 1273 }, { "epoch": 0.2831111111111111, "grad_norm": 0.10532009601593018, "learning_rate": 0.00014369710467706015, "loss": 0.0177, "step": 1274 }, { "epoch": 0.2833333333333333, "grad_norm": 0.08250175416469574, "learning_rate": 0.00014365256124721605, "loss": 0.0174, "step": 1275 }, { "epoch": 0.28355555555555556, "grad_norm": 0.08483847975730896, "learning_rate": 0.00014360801781737194, "loss": 0.017, "step": 1276 }, { "epoch": 0.2837777777777778, "grad_norm": 0.07746873795986176, "learning_rate": 0.00014356347438752783, "loss": 0.0165, "step": 1277 }, { "epoch": 0.284, "grad_norm": 0.0772603377699852, "learning_rate": 0.00014351893095768375, "loss": 0.0162, "step": 1278 }, { "epoch": 0.2842222222222222, "grad_norm": 0.9422643780708313, "learning_rate": 0.00014347438752783967, "loss": 1.9495, "step": 1279 }, { "epoch": 0.28444444444444444, "grad_norm": 0.563165009021759, "learning_rate": 0.00014342984409799556, "loss": 0.79, "step": 1280 }, { "epoch": 0.2846666666666667, "grad_norm": 0.9257560968399048, "learning_rate": 0.00014338530066815145, "loss": 1.8217, "step": 1281 }, { "epoch": 0.2848888888888889, "grad_norm": 1.079615831375122, "learning_rate": 0.00014334075723830734, "loss": 1.9557, "step": 1282 }, { "epoch": 0.2851111111111111, "grad_norm": 0.8131044507026672, "learning_rate": 0.00014329621380846326, "loss": 1.0458, "step": 1283 }, { "epoch": 0.2853333333333333, "grad_norm": 0.793594241142273, "learning_rate": 0.00014325167037861915, "loss": 0.9087, "step": 1284 }, { "epoch": 0.28555555555555556, "grad_norm": 0.6490100622177124, "learning_rate": 0.00014320712694877507, "loss": 0.7678, "step": 1285 }, { "epoch": 0.2857777777777778, "grad_norm": 0.07962695509195328, "learning_rate": 0.00014316258351893096, "loss": 0.0181, "step": 1286 }, { "epoch": 0.286, "grad_norm": 0.08379562944173813, "learning_rate": 0.00014311804008908686, "loss": 0.0177, "step": 1287 }, { "epoch": 0.2862222222222222, "grad_norm": 0.7064594030380249, "learning_rate": 0.00014307349665924278, "loss": 0.7159, "step": 1288 }, { "epoch": 0.28644444444444445, "grad_norm": 1.0957010984420776, "learning_rate": 0.00014302895322939867, "loss": 1.9004, "step": 1289 }, { "epoch": 0.2866666666666667, "grad_norm": 1.0294493436813354, "learning_rate": 0.00014298440979955456, "loss": 1.4984, "step": 1290 }, { "epoch": 0.2868888888888889, "grad_norm": 1.0541222095489502, "learning_rate": 0.00014293986636971048, "loss": 1.4094, "step": 1291 }, { "epoch": 0.2871111111111111, "grad_norm": 0.9578663110733032, "learning_rate": 0.00014289532293986637, "loss": 1.7305, "step": 1292 }, { "epoch": 0.28733333333333333, "grad_norm": 1.0165977478027344, "learning_rate": 0.0001428507795100223, "loss": 1.4897, "step": 1293 }, { "epoch": 0.28755555555555556, "grad_norm": 0.6953179240226746, "learning_rate": 0.00014280623608017818, "loss": 0.7403, "step": 1294 }, { "epoch": 0.2877777777777778, "grad_norm": 0.6080345511436462, "learning_rate": 0.00014276169265033407, "loss": 0.5275, "step": 1295 }, { "epoch": 0.288, "grad_norm": 1.2395416498184204, "learning_rate": 0.00014271714922048996, "loss": 1.2051, "step": 1296 }, { "epoch": 0.2882222222222222, "grad_norm": 0.724810779094696, "learning_rate": 0.0001426726057906459, "loss": 0.5437, "step": 1297 }, { "epoch": 0.28844444444444445, "grad_norm": 0.7165504693984985, "learning_rate": 0.0001426280623608018, "loss": 0.5968, "step": 1298 }, { "epoch": 0.2886666666666667, "grad_norm": 1.0052456855773926, "learning_rate": 0.0001425835189309577, "loss": 1.0547, "step": 1299 }, { "epoch": 0.28888888888888886, "grad_norm": 0.8707148432731628, "learning_rate": 0.00014253897550111359, "loss": 0.6631, "step": 1300 }, { "epoch": 0.2891111111111111, "grad_norm": 0.811651349067688, "learning_rate": 0.00014249443207126948, "loss": 2.4015, "step": 1301 }, { "epoch": 0.28933333333333333, "grad_norm": 0.10851640999317169, "learning_rate": 0.0001424498886414254, "loss": 0.0133, "step": 1302 }, { "epoch": 0.28955555555555557, "grad_norm": 0.628270149230957, "learning_rate": 0.00014240534521158132, "loss": 1.2922, "step": 1303 }, { "epoch": 0.2897777777777778, "grad_norm": 1.039566993713379, "learning_rate": 0.0001423608017817372, "loss": 2.5175, "step": 1304 }, { "epoch": 0.29, "grad_norm": 0.3334747552871704, "learning_rate": 0.0001423162583518931, "loss": 0.0157, "step": 1305 }, { "epoch": 0.2902222222222222, "grad_norm": 0.14356687664985657, "learning_rate": 0.00014227171492204902, "loss": 0.0136, "step": 1306 }, { "epoch": 0.29044444444444445, "grad_norm": 0.07359285652637482, "learning_rate": 0.0001422271714922049, "loss": 0.012, "step": 1307 }, { "epoch": 0.2906666666666667, "grad_norm": 0.683387815952301, "learning_rate": 0.0001421826280623608, "loss": 0.9133, "step": 1308 }, { "epoch": 0.29088888888888886, "grad_norm": 0.8599250316619873, "learning_rate": 0.00014213808463251672, "loss": 1.9407, "step": 1309 }, { "epoch": 0.2911111111111111, "grad_norm": 0.9184352159500122, "learning_rate": 0.0001420935412026726, "loss": 2.1807, "step": 1310 }, { "epoch": 0.29133333333333333, "grad_norm": 1.0117696523666382, "learning_rate": 0.00014204899777282853, "loss": 2.1512, "step": 1311 }, { "epoch": 0.29155555555555557, "grad_norm": 0.7668831944465637, "learning_rate": 0.00014200445434298442, "loss": 1.7311, "step": 1312 }, { "epoch": 0.2917777777777778, "grad_norm": 0.7954047918319702, "learning_rate": 0.00014195991091314031, "loss": 1.8633, "step": 1313 }, { "epoch": 0.292, "grad_norm": 1.201979637145996, "learning_rate": 0.0001419153674832962, "loss": 1.9811, "step": 1314 }, { "epoch": 0.2922222222222222, "grad_norm": 0.8454548716545105, "learning_rate": 0.0001418708240534521, "loss": 1.9974, "step": 1315 }, { "epoch": 0.29244444444444445, "grad_norm": 1.0249965190887451, "learning_rate": 0.00014182628062360804, "loss": 2.3543, "step": 1316 }, { "epoch": 0.2926666666666667, "grad_norm": 1.0549813508987427, "learning_rate": 0.00014178173719376394, "loss": 2.3438, "step": 1317 }, { "epoch": 0.29288888888888887, "grad_norm": 0.9025498628616333, "learning_rate": 0.00014173719376391983, "loss": 1.8958, "step": 1318 }, { "epoch": 0.2931111111111111, "grad_norm": 0.9086618423461914, "learning_rate": 0.00014169265033407572, "loss": 2.1565, "step": 1319 }, { "epoch": 0.29333333333333333, "grad_norm": 0.9326392412185669, "learning_rate": 0.00014164810690423164, "loss": 1.8965, "step": 1320 }, { "epoch": 0.29355555555555557, "grad_norm": 0.8608342409133911, "learning_rate": 0.00014160356347438753, "loss": 1.8891, "step": 1321 }, { "epoch": 0.2937777777777778, "grad_norm": 0.8682828545570374, "learning_rate": 0.00014155902004454345, "loss": 1.9273, "step": 1322 }, { "epoch": 0.294, "grad_norm": 0.7296971082687378, "learning_rate": 0.00014151447661469934, "loss": 1.0999, "step": 1323 }, { "epoch": 0.2942222222222222, "grad_norm": 0.09005673974752426, "learning_rate": 0.00014146993318485523, "loss": 0.0146, "step": 1324 }, { "epoch": 0.29444444444444445, "grad_norm": 0.08192627131938934, "learning_rate": 0.00014142538975501115, "loss": 0.0152, "step": 1325 }, { "epoch": 0.2946666666666667, "grad_norm": 0.08258790522813797, "learning_rate": 0.00014138084632516704, "loss": 0.0148, "step": 1326 }, { "epoch": 0.29488888888888887, "grad_norm": 0.07657041400671005, "learning_rate": 0.00014133630289532294, "loss": 0.0146, "step": 1327 }, { "epoch": 0.2951111111111111, "grad_norm": 0.09436430037021637, "learning_rate": 0.00014129175946547885, "loss": 0.0206, "step": 1328 }, { "epoch": 0.29533333333333334, "grad_norm": 0.6281077265739441, "learning_rate": 0.00014124721603563475, "loss": 0.9227, "step": 1329 }, { "epoch": 0.29555555555555557, "grad_norm": 0.9210866093635559, "learning_rate": 0.00014120267260579067, "loss": 1.4825, "step": 1330 }, { "epoch": 0.29577777777777775, "grad_norm": 0.9637800455093384, "learning_rate": 0.00014115812917594656, "loss": 1.84, "step": 1331 }, { "epoch": 0.296, "grad_norm": 1.1256612539291382, "learning_rate": 0.00014111358574610245, "loss": 1.732, "step": 1332 }, { "epoch": 0.2962222222222222, "grad_norm": 1.3389207124710083, "learning_rate": 0.00014106904231625834, "loss": 2.1392, "step": 1333 }, { "epoch": 0.29644444444444445, "grad_norm": 0.770871639251709, "learning_rate": 0.00014102449888641426, "loss": 0.8635, "step": 1334 }, { "epoch": 0.2966666666666667, "grad_norm": 0.08359196037054062, "learning_rate": 0.00014097995545657018, "loss": 0.0214, "step": 1335 }, { "epoch": 0.29688888888888887, "grad_norm": 0.07316497713327408, "learning_rate": 0.00014093541202672607, "loss": 0.0209, "step": 1336 }, { "epoch": 0.2971111111111111, "grad_norm": 1.0184942483901978, "learning_rate": 0.00014089086859688196, "loss": 1.8638, "step": 1337 }, { "epoch": 0.29733333333333334, "grad_norm": 0.7653923630714417, "learning_rate": 0.00014084632516703785, "loss": 1.0134, "step": 1338 }, { "epoch": 0.2975555555555556, "grad_norm": 0.07775040715932846, "learning_rate": 0.00014080178173719377, "loss": 0.0195, "step": 1339 }, { "epoch": 0.29777777777777775, "grad_norm": 1.0716129541397095, "learning_rate": 0.0001407572383073497, "loss": 1.75, "step": 1340 }, { "epoch": 0.298, "grad_norm": 1.022926688194275, "learning_rate": 0.00014071269487750558, "loss": 1.6189, "step": 1341 }, { "epoch": 0.2982222222222222, "grad_norm": 1.1022212505340576, "learning_rate": 0.00014066815144766148, "loss": 1.5175, "step": 1342 }, { "epoch": 0.29844444444444446, "grad_norm": 0.8157255053520203, "learning_rate": 0.00014062360801781737, "loss": 0.0359, "step": 1343 }, { "epoch": 0.2986666666666667, "grad_norm": 1.034464955329895, "learning_rate": 0.00014057906458797329, "loss": 1.6205, "step": 1344 }, { "epoch": 0.29888888888888887, "grad_norm": 1.1047078371047974, "learning_rate": 0.00014053452115812918, "loss": 1.4756, "step": 1345 }, { "epoch": 0.2991111111111111, "grad_norm": 1.0624406337738037, "learning_rate": 0.0001404899777282851, "loss": 1.4263, "step": 1346 }, { "epoch": 0.29933333333333334, "grad_norm": 0.9821346402168274, "learning_rate": 0.000140445434298441, "loss": 1.4288, "step": 1347 }, { "epoch": 0.2995555555555556, "grad_norm": 1.1753016710281372, "learning_rate": 0.00014040089086859688, "loss": 1.6721, "step": 1348 }, { "epoch": 0.29977777777777775, "grad_norm": 1.0248849391937256, "learning_rate": 0.0001403563474387528, "loss": 1.3122, "step": 1349 }, { "epoch": 0.3, "grad_norm": 1.0691323280334473, "learning_rate": 0.0001403118040089087, "loss": 1.3309, "step": 1350 }, { "epoch": 0.3002222222222222, "grad_norm": 0.055053651332855225, "learning_rate": 0.00014026726057906458, "loss": 0.0123, "step": 1351 }, { "epoch": 0.30044444444444446, "grad_norm": 0.05664655193686485, "learning_rate": 0.0001402227171492205, "loss": 0.0123, "step": 1352 }, { "epoch": 0.3006666666666667, "grad_norm": 0.8870833516120911, "learning_rate": 0.0001401781737193764, "loss": 2.1987, "step": 1353 }, { "epoch": 0.3008888888888889, "grad_norm": 0.5931410193443298, "learning_rate": 0.0001401336302895323, "loss": 0.9983, "step": 1354 }, { "epoch": 0.3011111111111111, "grad_norm": 0.8538689017295837, "learning_rate": 0.0001400890868596882, "loss": 2.2606, "step": 1355 }, { "epoch": 0.30133333333333334, "grad_norm": 0.05451615899801254, "learning_rate": 0.0001400445434298441, "loss": 0.0118, "step": 1356 }, { "epoch": 0.3015555555555556, "grad_norm": 0.881386935710907, "learning_rate": 0.00014, "loss": 2.3174, "step": 1357 }, { "epoch": 0.30177777777777776, "grad_norm": 0.8224138617515564, "learning_rate": 0.0001399554565701559, "loss": 0.0327, "step": 1358 }, { "epoch": 0.302, "grad_norm": 0.08270707726478577, "learning_rate": 0.00013991091314031183, "loss": 0.0139, "step": 1359 }, { "epoch": 0.3022222222222222, "grad_norm": 0.0797787606716156, "learning_rate": 0.00013986636971046772, "loss": 0.014, "step": 1360 }, { "epoch": 0.30244444444444446, "grad_norm": 0.6289453506469727, "learning_rate": 0.0001398218262806236, "loss": 0.973, "step": 1361 }, { "epoch": 0.30266666666666664, "grad_norm": 0.9311420917510986, "learning_rate": 0.0001397772828507795, "loss": 2.1816, "step": 1362 }, { "epoch": 0.3028888888888889, "grad_norm": 0.8370685577392578, "learning_rate": 0.00013973273942093542, "loss": 2.128, "step": 1363 }, { "epoch": 0.3031111111111111, "grad_norm": 0.7551019787788391, "learning_rate": 0.00013968819599109134, "loss": 1.9265, "step": 1364 }, { "epoch": 0.30333333333333334, "grad_norm": 0.8512743711471558, "learning_rate": 0.00013964365256124723, "loss": 1.9408, "step": 1365 }, { "epoch": 0.3035555555555556, "grad_norm": 0.9641419649124146, "learning_rate": 0.00013959910913140312, "loss": 2.1117, "step": 1366 }, { "epoch": 0.30377777777777776, "grad_norm": 0.841379702091217, "learning_rate": 0.00013955456570155901, "loss": 1.7673, "step": 1367 }, { "epoch": 0.304, "grad_norm": 0.660455584526062, "learning_rate": 0.00013951002227171493, "loss": 0.9827, "step": 1368 }, { "epoch": 0.3042222222222222, "grad_norm": 0.11817409098148346, "learning_rate": 0.00013946547884187082, "loss": 0.0225, "step": 1369 }, { "epoch": 0.30444444444444446, "grad_norm": 0.7172605395317078, "learning_rate": 0.00013942093541202674, "loss": 1.3491, "step": 1370 }, { "epoch": 0.30466666666666664, "grad_norm": 0.9483502507209778, "learning_rate": 0.00013937639198218264, "loss": 2.0678, "step": 1371 }, { "epoch": 0.3048888888888889, "grad_norm": 1.0518776178359985, "learning_rate": 0.00013933184855233853, "loss": 2.0944, "step": 1372 }, { "epoch": 0.3051111111111111, "grad_norm": 0.9454103708267212, "learning_rate": 0.00013928730512249445, "loss": 1.8151, "step": 1373 }, { "epoch": 0.30533333333333335, "grad_norm": 0.6248193383216858, "learning_rate": 0.00013924276169265034, "loss": 0.9771, "step": 1374 }, { "epoch": 0.3055555555555556, "grad_norm": 0.08333655446767807, "learning_rate": 0.00013919821826280623, "loss": 0.0158, "step": 1375 }, { "epoch": 0.30577777777777776, "grad_norm": 0.08563945442438126, "learning_rate": 0.00013915367483296215, "loss": 0.0191, "step": 1376 }, { "epoch": 0.306, "grad_norm": 0.9253622889518738, "learning_rate": 0.00013910913140311804, "loss": 1.7993, "step": 1377 }, { "epoch": 0.30622222222222223, "grad_norm": 0.9118351936340332, "learning_rate": 0.00013906458797327396, "loss": 1.855, "step": 1378 }, { "epoch": 0.30644444444444446, "grad_norm": 1.037188172340393, "learning_rate": 0.00013902004454342985, "loss": 1.8305, "step": 1379 }, { "epoch": 0.30666666666666664, "grad_norm": 0.9743615984916687, "learning_rate": 0.00013897550111358574, "loss": 1.7063, "step": 1380 }, { "epoch": 0.3068888888888889, "grad_norm": 1.013454794883728, "learning_rate": 0.00013893095768374164, "loss": 1.9382, "step": 1381 }, { "epoch": 0.3071111111111111, "grad_norm": 0.9891089797019958, "learning_rate": 0.00013888641425389758, "loss": 2.0535, "step": 1382 }, { "epoch": 0.30733333333333335, "grad_norm": 0.10896483063697815, "learning_rate": 0.00013884187082405347, "loss": 0.02, "step": 1383 }, { "epoch": 0.3075555555555556, "grad_norm": 0.07615262269973755, "learning_rate": 0.00013879732739420936, "loss": 0.02, "step": 1384 }, { "epoch": 0.30777777777777776, "grad_norm": 0.6620470881462097, "learning_rate": 0.00013875278396436526, "loss": 0.8813, "step": 1385 }, { "epoch": 0.308, "grad_norm": 1.031408667564392, "learning_rate": 0.00013870824053452115, "loss": 1.9278, "step": 1386 }, { "epoch": 0.30822222222222223, "grad_norm": 0.08425939083099365, "learning_rate": 0.00013866369710467707, "loss": 0.0183, "step": 1387 }, { "epoch": 0.30844444444444447, "grad_norm": 0.07298826426267624, "learning_rate": 0.00013861915367483296, "loss": 0.0176, "step": 1388 }, { "epoch": 0.30866666666666664, "grad_norm": 0.6326761245727539, "learning_rate": 0.00013857461024498888, "loss": 0.7375, "step": 1389 }, { "epoch": 0.3088888888888889, "grad_norm": 0.7388807535171509, "learning_rate": 0.00013853006681514477, "loss": 0.8331, "step": 1390 }, { "epoch": 0.3091111111111111, "grad_norm": 1.3735177516937256, "learning_rate": 0.00013848552338530066, "loss": 1.5142, "step": 1391 }, { "epoch": 0.30933333333333335, "grad_norm": 1.0506902933120728, "learning_rate": 0.00013844097995545658, "loss": 1.5543, "step": 1392 }, { "epoch": 0.30955555555555553, "grad_norm": 0.9635607600212097, "learning_rate": 0.00013839643652561247, "loss": 1.3675, "step": 1393 }, { "epoch": 0.30977777777777776, "grad_norm": 1.2587249279022217, "learning_rate": 0.00013835189309576836, "loss": 1.7267, "step": 1394 }, { "epoch": 0.31, "grad_norm": 1.1692713499069214, "learning_rate": 0.00013830734966592428, "loss": 1.442, "step": 1395 }, { "epoch": 0.31022222222222223, "grad_norm": 1.0046931505203247, "learning_rate": 0.0001382628062360802, "loss": 1.2172, "step": 1396 }, { "epoch": 0.31044444444444447, "grad_norm": 0.7264860272407532, "learning_rate": 0.0001382182628062361, "loss": 0.5203, "step": 1397 }, { "epoch": 0.31066666666666665, "grad_norm": 1.1405805349349976, "learning_rate": 0.00013817371937639199, "loss": 1.0399, "step": 1398 }, { "epoch": 0.3108888888888889, "grad_norm": 1.1510692834854126, "learning_rate": 0.00013812917594654788, "loss": 1.1632, "step": 1399 }, { "epoch": 0.3111111111111111, "grad_norm": 1.4786934852600098, "learning_rate": 0.00013808463251670377, "loss": 1.1631, "step": 1400 }, { "epoch": 0.31133333333333335, "grad_norm": 0.7321925759315491, "learning_rate": 0.00013804008908685972, "loss": 1.2846, "step": 1401 }, { "epoch": 0.31155555555555553, "grad_norm": 0.8748204708099365, "learning_rate": 0.0001379955456570156, "loss": 1.9939, "step": 1402 }, { "epoch": 0.31177777777777776, "grad_norm": 0.6573612689971924, "learning_rate": 0.0001379510022271715, "loss": 1.1956, "step": 1403 }, { "epoch": 0.312, "grad_norm": 0.049669332802295685, "learning_rate": 0.0001379064587973274, "loss": 0.0108, "step": 1404 }, { "epoch": 0.31222222222222223, "grad_norm": 0.5930540561676025, "learning_rate": 0.0001378619153674833, "loss": 1.1255, "step": 1405 }, { "epoch": 0.31244444444444447, "grad_norm": 0.585092306137085, "learning_rate": 0.0001378173719376392, "loss": 0.9278, "step": 1406 }, { "epoch": 0.31266666666666665, "grad_norm": 0.7510188221931458, "learning_rate": 0.00013777282850779512, "loss": 1.3691, "step": 1407 }, { "epoch": 0.3128888888888889, "grad_norm": 0.675545334815979, "learning_rate": 0.000137728285077951, "loss": 0.0309, "step": 1408 }, { "epoch": 0.3131111111111111, "grad_norm": 0.13560031354427338, "learning_rate": 0.0001376837416481069, "loss": 0.0159, "step": 1409 }, { "epoch": 0.31333333333333335, "grad_norm": 0.8208870887756348, "learning_rate": 0.00013763919821826282, "loss": 1.9514, "step": 1410 }, { "epoch": 0.31355555555555553, "grad_norm": 0.8589663505554199, "learning_rate": 0.00013759465478841871, "loss": 2.0799, "step": 1411 }, { "epoch": 0.31377777777777777, "grad_norm": 0.9203690886497498, "learning_rate": 0.0001375501113585746, "loss": 2.1629, "step": 1412 }, { "epoch": 0.314, "grad_norm": 1.0017417669296265, "learning_rate": 0.00013750556792873053, "loss": 2.403, "step": 1413 }, { "epoch": 0.31422222222222224, "grad_norm": 0.9020522236824036, "learning_rate": 0.00013746102449888642, "loss": 1.8001, "step": 1414 }, { "epoch": 0.31444444444444447, "grad_norm": 0.8959416747093201, "learning_rate": 0.00013741648106904234, "loss": 1.9272, "step": 1415 }, { "epoch": 0.31466666666666665, "grad_norm": 0.8453971147537231, "learning_rate": 0.00013737193763919823, "loss": 2.0584, "step": 1416 }, { "epoch": 0.3148888888888889, "grad_norm": 1.0558977127075195, "learning_rate": 0.00013732739420935412, "loss": 2.1863, "step": 1417 }, { "epoch": 0.3151111111111111, "grad_norm": 0.9428475499153137, "learning_rate": 0.00013728285077951, "loss": 1.8684, "step": 1418 }, { "epoch": 0.31533333333333335, "grad_norm": 1.0137181282043457, "learning_rate": 0.00013723830734966593, "loss": 1.9836, "step": 1419 }, { "epoch": 0.31555555555555553, "grad_norm": 0.9282086491584778, "learning_rate": 0.00013719376391982185, "loss": 1.9903, "step": 1420 }, { "epoch": 0.31577777777777777, "grad_norm": 0.6798798441886902, "learning_rate": 0.00013714922048997774, "loss": 0.0244, "step": 1421 }, { "epoch": 0.316, "grad_norm": 0.07246652245521545, "learning_rate": 0.00013710467706013363, "loss": 0.0155, "step": 1422 }, { "epoch": 0.31622222222222224, "grad_norm": 0.7266292572021484, "learning_rate": 0.00013706013363028952, "loss": 0.74, "step": 1423 }, { "epoch": 0.3164444444444444, "grad_norm": 0.30071863532066345, "learning_rate": 0.00013701559020044544, "loss": 0.0299, "step": 1424 }, { "epoch": 0.31666666666666665, "grad_norm": 0.6319347620010376, "learning_rate": 0.00013697104677060136, "loss": 0.9216, "step": 1425 }, { "epoch": 0.3168888888888889, "grad_norm": 0.9551006555557251, "learning_rate": 0.00013692650334075725, "loss": 1.9096, "step": 1426 }, { "epoch": 0.3171111111111111, "grad_norm": 0.9880414009094238, "learning_rate": 0.00013688195991091315, "loss": 1.8509, "step": 1427 }, { "epoch": 0.31733333333333336, "grad_norm": 0.9100226163864136, "learning_rate": 0.00013683741648106904, "loss": 1.8315, "step": 1428 }, { "epoch": 0.31755555555555554, "grad_norm": 1.0499440431594849, "learning_rate": 0.00013679287305122496, "loss": 2.1878, "step": 1429 }, { "epoch": 0.31777777777777777, "grad_norm": 1.0114299058914185, "learning_rate": 0.00013674832962138085, "loss": 1.5405, "step": 1430 }, { "epoch": 0.318, "grad_norm": 0.0601423941552639, "learning_rate": 0.00013670378619153677, "loss": 0.018, "step": 1431 }, { "epoch": 0.31822222222222224, "grad_norm": 0.05997453257441521, "learning_rate": 0.00013665924276169266, "loss": 0.0182, "step": 1432 }, { "epoch": 0.3184444444444444, "grad_norm": 0.8638304471969604, "learning_rate": 0.00013661469933184855, "loss": 0.829, "step": 1433 }, { "epoch": 0.31866666666666665, "grad_norm": 0.9641528725624084, "learning_rate": 0.00013657015590200447, "loss": 1.7045, "step": 1434 }, { "epoch": 0.3188888888888889, "grad_norm": 1.0239591598510742, "learning_rate": 0.00013652561247216036, "loss": 1.6333, "step": 1435 }, { "epoch": 0.3191111111111111, "grad_norm": 0.07839752733707428, "learning_rate": 0.00013648106904231625, "loss": 0.0199, "step": 1436 }, { "epoch": 0.31933333333333336, "grad_norm": 0.07986389845609665, "learning_rate": 0.00013643652561247217, "loss": 0.0205, "step": 1437 }, { "epoch": 0.31955555555555554, "grad_norm": 0.0760846883058548, "learning_rate": 0.00013639198218262806, "loss": 0.0196, "step": 1438 }, { "epoch": 0.31977777777777777, "grad_norm": 1.220151424407959, "learning_rate": 0.00013634743875278398, "loss": 1.8283, "step": 1439 }, { "epoch": 0.32, "grad_norm": 1.1636601686477661, "learning_rate": 0.00013630289532293988, "loss": 1.661, "step": 1440 }, { "epoch": 0.32022222222222224, "grad_norm": 1.003257393836975, "learning_rate": 0.00013625835189309577, "loss": 1.5869, "step": 1441 }, { "epoch": 0.3204444444444444, "grad_norm": 0.850940465927124, "learning_rate": 0.00013621380846325166, "loss": 0.8508, "step": 1442 }, { "epoch": 0.32066666666666666, "grad_norm": 0.8343439102172852, "learning_rate": 0.00013616926503340758, "loss": 0.9602, "step": 1443 }, { "epoch": 0.3208888888888889, "grad_norm": 1.075269103050232, "learning_rate": 0.0001361247216035635, "loss": 1.7471, "step": 1444 }, { "epoch": 0.3211111111111111, "grad_norm": 1.0553182363510132, "learning_rate": 0.0001360801781737194, "loss": 1.465, "step": 1445 }, { "epoch": 0.32133333333333336, "grad_norm": 1.0383347272872925, "learning_rate": 0.00013603563474387528, "loss": 1.332, "step": 1446 }, { "epoch": 0.32155555555555554, "grad_norm": 1.3711236715316772, "learning_rate": 0.00013599109131403117, "loss": 1.7505, "step": 1447 }, { "epoch": 0.3217777777777778, "grad_norm": 1.1211059093475342, "learning_rate": 0.0001359465478841871, "loss": 1.3927, "step": 1448 }, { "epoch": 0.322, "grad_norm": 1.0040308237075806, "learning_rate": 0.000135902004454343, "loss": 1.2536, "step": 1449 }, { "epoch": 0.32222222222222224, "grad_norm": 0.7612625956535339, "learning_rate": 0.0001358574610244989, "loss": 0.4927, "step": 1450 }, { "epoch": 0.3224444444444444, "grad_norm": 0.8916938900947571, "learning_rate": 0.0001358129175946548, "loss": 2.2834, "step": 1451 }, { "epoch": 0.32266666666666666, "grad_norm": 0.6277791261672974, "learning_rate": 0.00013576837416481069, "loss": 1.0415, "step": 1452 }, { "epoch": 0.3228888888888889, "grad_norm": 0.8357473611831665, "learning_rate": 0.0001357238307349666, "loss": 2.2224, "step": 1453 }, { "epoch": 0.3231111111111111, "grad_norm": 0.051700394600629807, "learning_rate": 0.0001356792873051225, "loss": 0.0112, "step": 1454 }, { "epoch": 0.3233333333333333, "grad_norm": 0.6184151768684387, "learning_rate": 0.00013563474387527841, "loss": 1.1171, "step": 1455 }, { "epoch": 0.32355555555555554, "grad_norm": 0.8683602809906006, "learning_rate": 0.0001355902004454343, "loss": 2.2732, "step": 1456 }, { "epoch": 0.3237777777777778, "grad_norm": 0.810332179069519, "learning_rate": 0.0001355456570155902, "loss": 2.1909, "step": 1457 }, { "epoch": 0.324, "grad_norm": 0.9960488677024841, "learning_rate": 0.00013550111358574612, "loss": 2.3706, "step": 1458 }, { "epoch": 0.32422222222222224, "grad_norm": 0.8388908505439758, "learning_rate": 0.000135456570155902, "loss": 2.0544, "step": 1459 }, { "epoch": 0.3244444444444444, "grad_norm": 1.571382999420166, "learning_rate": 0.0001354120267260579, "loss": 0.0447, "step": 1460 }, { "epoch": 0.32466666666666666, "grad_norm": 0.8016864657402039, "learning_rate": 0.0001353674832962138, "loss": 1.9393, "step": 1461 }, { "epoch": 0.3248888888888889, "grad_norm": 0.973527729511261, "learning_rate": 0.0001353229398663697, "loss": 2.0011, "step": 1462 }, { "epoch": 0.32511111111111113, "grad_norm": 0.896851658821106, "learning_rate": 0.00013527839643652563, "loss": 2.0065, "step": 1463 }, { "epoch": 0.3253333333333333, "grad_norm": 1.0477668046951294, "learning_rate": 0.00013523385300668152, "loss": 1.781, "step": 1464 }, { "epoch": 0.32555555555555554, "grad_norm": 0.9915692806243896, "learning_rate": 0.00013518930957683741, "loss": 2.0669, "step": 1465 }, { "epoch": 0.3257777777777778, "grad_norm": 0.1883118599653244, "learning_rate": 0.0001351447661469933, "loss": 0.0247, "step": 1466 }, { "epoch": 0.326, "grad_norm": 0.69676274061203, "learning_rate": 0.00013510022271714922, "loss": 1.0799, "step": 1467 }, { "epoch": 0.32622222222222225, "grad_norm": 0.6046315431594849, "learning_rate": 0.00013505567928730514, "loss": 0.9688, "step": 1468 }, { "epoch": 0.3264444444444444, "grad_norm": 1.0804486274719238, "learning_rate": 0.00013501113585746104, "loss": 1.9587, "step": 1469 }, { "epoch": 0.32666666666666666, "grad_norm": 0.8974558115005493, "learning_rate": 0.00013496659242761693, "loss": 1.7077, "step": 1470 }, { "epoch": 0.3268888888888889, "grad_norm": 0.8613162040710449, "learning_rate": 0.00013492204899777282, "loss": 1.8079, "step": 1471 }, { "epoch": 0.32711111111111113, "grad_norm": 0.9599412083625793, "learning_rate": 0.00013487750556792874, "loss": 2.1691, "step": 1472 }, { "epoch": 0.3273333333333333, "grad_norm": 1.0441149473190308, "learning_rate": 0.00013483296213808463, "loss": 2.1019, "step": 1473 }, { "epoch": 0.32755555555555554, "grad_norm": 0.9434587359428406, "learning_rate": 0.00013478841870824055, "loss": 1.6825, "step": 1474 }, { "epoch": 0.3277777777777778, "grad_norm": 0.17473283410072327, "learning_rate": 0.00013474387527839644, "loss": 0.0173, "step": 1475 }, { "epoch": 0.328, "grad_norm": 0.10064394772052765, "learning_rate": 0.00013469933184855233, "loss": 0.0165, "step": 1476 }, { "epoch": 0.32822222222222225, "grad_norm": 0.6770084500312805, "learning_rate": 0.00013465478841870825, "loss": 0.9154, "step": 1477 }, { "epoch": 0.32844444444444443, "grad_norm": 1.1016879081726074, "learning_rate": 0.00013461024498886414, "loss": 1.7554, "step": 1478 }, { "epoch": 0.32866666666666666, "grad_norm": 1.096827745437622, "learning_rate": 0.00013456570155902003, "loss": 1.6961, "step": 1479 }, { "epoch": 0.3288888888888889, "grad_norm": 0.9961020350456238, "learning_rate": 0.00013452115812917595, "loss": 1.9033, "step": 1480 }, { "epoch": 0.32911111111111113, "grad_norm": 0.08242473751306534, "learning_rate": 0.00013447661469933187, "loss": 0.0181, "step": 1481 }, { "epoch": 0.3293333333333333, "grad_norm": 0.07623957842588425, "learning_rate": 0.00013443207126948776, "loss": 0.018, "step": 1482 }, { "epoch": 0.32955555555555555, "grad_norm": 0.06432683765888214, "learning_rate": 0.00013438752783964366, "loss": 0.0177, "step": 1483 }, { "epoch": 0.3297777777777778, "grad_norm": 0.06077761575579643, "learning_rate": 0.00013434298440979955, "loss": 0.0173, "step": 1484 }, { "epoch": 0.33, "grad_norm": 0.0834859162569046, "learning_rate": 0.00013429844097995544, "loss": 0.0173, "step": 1485 }, { "epoch": 0.3302222222222222, "grad_norm": 1.0607445240020752, "learning_rate": 0.00013425389755011139, "loss": 1.8938, "step": 1486 }, { "epoch": 0.33044444444444443, "grad_norm": 0.7781770825386047, "learning_rate": 0.00013420935412026728, "loss": 0.8223, "step": 1487 }, { "epoch": 0.33066666666666666, "grad_norm": 0.1246686577796936, "learning_rate": 0.00013416481069042317, "loss": 0.0205, "step": 1488 }, { "epoch": 0.3308888888888889, "grad_norm": 0.10837747901678085, "learning_rate": 0.00013412026726057906, "loss": 0.0194, "step": 1489 }, { "epoch": 0.33111111111111113, "grad_norm": 0.09982477873563766, "learning_rate": 0.00013407572383073498, "loss": 0.0186, "step": 1490 }, { "epoch": 0.3313333333333333, "grad_norm": 0.9533151984214783, "learning_rate": 0.00013403118040089087, "loss": 1.656, "step": 1491 }, { "epoch": 0.33155555555555555, "grad_norm": 0.8519287109375, "learning_rate": 0.0001339866369710468, "loss": 0.7699, "step": 1492 }, { "epoch": 0.3317777777777778, "grad_norm": 1.026831865310669, "learning_rate": 0.00013394209354120268, "loss": 1.4599, "step": 1493 }, { "epoch": 0.332, "grad_norm": 1.4534752368927002, "learning_rate": 0.00013389755011135857, "loss": 2.2483, "step": 1494 }, { "epoch": 0.3322222222222222, "grad_norm": 1.1159143447875977, "learning_rate": 0.0001338530066815145, "loss": 1.3013, "step": 1495 }, { "epoch": 0.33244444444444443, "grad_norm": 0.7417425513267517, "learning_rate": 0.00013380846325167039, "loss": 0.6602, "step": 1496 }, { "epoch": 0.33266666666666667, "grad_norm": 0.7993981242179871, "learning_rate": 0.00013376391982182628, "loss": 0.5802, "step": 1497 }, { "epoch": 0.3328888888888889, "grad_norm": 0.962468147277832, "learning_rate": 0.0001337193763919822, "loss": 1.0806, "step": 1498 }, { "epoch": 0.33311111111111114, "grad_norm": 1.2233699560165405, "learning_rate": 0.0001336748329621381, "loss": 1.338, "step": 1499 }, { "epoch": 0.3333333333333333, "grad_norm": 0.9210373163223267, "learning_rate": 0.000133630289532294, "loss": 1.0784, "step": 1500 }, { "epoch": 0.33355555555555555, "grad_norm": 0.5745635032653809, "learning_rate": 0.0001335857461024499, "loss": 1.0502, "step": 1501 }, { "epoch": 0.3337777777777778, "grad_norm": 0.06606602668762207, "learning_rate": 0.0001335412026726058, "loss": 0.0111, "step": 1502 }, { "epoch": 0.334, "grad_norm": 0.5875163078308105, "learning_rate": 0.00013349665924276168, "loss": 1.1621, "step": 1503 }, { "epoch": 0.3342222222222222, "grad_norm": 0.0689874216914177, "learning_rate": 0.0001334521158129176, "loss": 0.0108, "step": 1504 }, { "epoch": 0.33444444444444443, "grad_norm": 0.4859806299209595, "learning_rate": 0.00013340757238307352, "loss": 0.9335, "step": 1505 }, { "epoch": 0.33466666666666667, "grad_norm": 0.7597318291664124, "learning_rate": 0.0001333630289532294, "loss": 0.9311, "step": 1506 }, { "epoch": 0.3348888888888889, "grad_norm": 0.9831368327140808, "learning_rate": 0.0001333184855233853, "loss": 2.4321, "step": 1507 }, { "epoch": 0.33511111111111114, "grad_norm": 0.9348986148834229, "learning_rate": 0.0001332739420935412, "loss": 1.38, "step": 1508 }, { "epoch": 0.3353333333333333, "grad_norm": 0.7181216478347778, "learning_rate": 0.00013322939866369711, "loss": 0.0187, "step": 1509 }, { "epoch": 0.33555555555555555, "grad_norm": 1.021496057510376, "learning_rate": 0.00013318485523385303, "loss": 2.2939, "step": 1510 }, { "epoch": 0.3357777777777778, "grad_norm": 0.8876065015792847, "learning_rate": 0.00013314031180400893, "loss": 1.9854, "step": 1511 }, { "epoch": 0.336, "grad_norm": 0.9143947958946228, "learning_rate": 0.00013309576837416482, "loss": 1.9859, "step": 1512 }, { "epoch": 0.3362222222222222, "grad_norm": 0.9279030561447144, "learning_rate": 0.0001330512249443207, "loss": 2.1078, "step": 1513 }, { "epoch": 0.33644444444444443, "grad_norm": 0.9055482149124146, "learning_rate": 0.00013300668151447663, "loss": 2.1865, "step": 1514 }, { "epoch": 0.33666666666666667, "grad_norm": 0.9191523790359497, "learning_rate": 0.00013296213808463252, "loss": 2.2237, "step": 1515 }, { "epoch": 0.3368888888888889, "grad_norm": 0.8818835020065308, "learning_rate": 0.00013291759465478844, "loss": 1.8723, "step": 1516 }, { "epoch": 0.3371111111111111, "grad_norm": 0.9102388024330139, "learning_rate": 0.00013287305122494433, "loss": 1.9411, "step": 1517 }, { "epoch": 0.3373333333333333, "grad_norm": 1.2459015846252441, "learning_rate": 0.00013282850779510022, "loss": 1.9948, "step": 1518 }, { "epoch": 0.33755555555555555, "grad_norm": 0.9848894476890564, "learning_rate": 0.00013278396436525614, "loss": 2.0083, "step": 1519 }, { "epoch": 0.3377777777777778, "grad_norm": 0.900453507900238, "learning_rate": 0.00013273942093541203, "loss": 1.9048, "step": 1520 }, { "epoch": 0.338, "grad_norm": 0.9033414721488953, "learning_rate": 0.00013269487750556792, "loss": 1.8965, "step": 1521 }, { "epoch": 0.3382222222222222, "grad_norm": 0.8703753352165222, "learning_rate": 0.00013265033407572384, "loss": 2.0901, "step": 1522 }, { "epoch": 0.33844444444444444, "grad_norm": 0.15832370519638062, "learning_rate": 0.00013260579064587974, "loss": 0.0187, "step": 1523 }, { "epoch": 0.33866666666666667, "grad_norm": 0.11121733486652374, "learning_rate": 0.00013256124721603565, "loss": 0.0181, "step": 1524 }, { "epoch": 0.3388888888888889, "grad_norm": 0.131880983710289, "learning_rate": 0.00013251670378619155, "loss": 0.0212, "step": 1525 }, { "epoch": 0.3391111111111111, "grad_norm": 1.021878957748413, "learning_rate": 0.00013247216035634744, "loss": 2.0158, "step": 1526 }, { "epoch": 0.3393333333333333, "grad_norm": 0.9855987429618835, "learning_rate": 0.00013242761692650333, "loss": 1.7378, "step": 1527 }, { "epoch": 0.33955555555555555, "grad_norm": 0.9633174538612366, "learning_rate": 0.00013238307349665925, "loss": 1.9319, "step": 1528 }, { "epoch": 0.3397777777777778, "grad_norm": 0.9162039160728455, "learning_rate": 0.00013233853006681517, "loss": 1.6992, "step": 1529 }, { "epoch": 0.34, "grad_norm": 0.06041941046714783, "learning_rate": 0.00013229398663697106, "loss": 0.0164, "step": 1530 }, { "epoch": 0.3402222222222222, "grad_norm": 0.06718610972166061, "learning_rate": 0.00013224944320712695, "loss": 0.0163, "step": 1531 }, { "epoch": 0.34044444444444444, "grad_norm": 0.06589064002037048, "learning_rate": 0.00013220489977728284, "loss": 0.0165, "step": 1532 }, { "epoch": 0.3406666666666667, "grad_norm": 0.7173296809196472, "learning_rate": 0.00013216035634743876, "loss": 0.9545, "step": 1533 }, { "epoch": 0.3408888888888889, "grad_norm": 0.6503753662109375, "learning_rate": 0.00013211581291759465, "loss": 0.8755, "step": 1534 }, { "epoch": 0.3411111111111111, "grad_norm": 0.09492537379264832, "learning_rate": 0.00013207126948775057, "loss": 0.0178, "step": 1535 }, { "epoch": 0.3413333333333333, "grad_norm": 0.07086139172315598, "learning_rate": 0.00013202672605790646, "loss": 0.0174, "step": 1536 }, { "epoch": 0.34155555555555556, "grad_norm": 0.7318459749221802, "learning_rate": 0.00013198218262806236, "loss": 0.7707, "step": 1537 }, { "epoch": 0.3417777777777778, "grad_norm": 0.9774269461631775, "learning_rate": 0.00013193763919821827, "loss": 1.4131, "step": 1538 }, { "epoch": 0.342, "grad_norm": 0.6436899900436401, "learning_rate": 0.00013189309576837417, "loss": 0.7927, "step": 1539 }, { "epoch": 0.3422222222222222, "grad_norm": 0.12925714254379272, "learning_rate": 0.00013184855233853006, "loss": 0.0295, "step": 1540 }, { "epoch": 0.34244444444444444, "grad_norm": 0.6985756158828735, "learning_rate": 0.00013180400890868598, "loss": 0.6512, "step": 1541 }, { "epoch": 0.3426666666666667, "grad_norm": 1.2356915473937988, "learning_rate": 0.00013175946547884187, "loss": 1.9498, "step": 1542 }, { "epoch": 0.3428888888888889, "grad_norm": 1.0016330480575562, "learning_rate": 0.0001317149220489978, "loss": 0.8292, "step": 1543 }, { "epoch": 0.3431111111111111, "grad_norm": 1.0059810876846313, "learning_rate": 0.00013167037861915368, "loss": 1.1816, "step": 1544 }, { "epoch": 0.3433333333333333, "grad_norm": 1.080196738243103, "learning_rate": 0.00013162583518930957, "loss": 1.1594, "step": 1545 }, { "epoch": 0.34355555555555556, "grad_norm": 0.25775691866874695, "learning_rate": 0.00013158129175946546, "loss": 0.0452, "step": 1546 }, { "epoch": 0.3437777777777778, "grad_norm": 0.2299448549747467, "learning_rate": 0.00013153674832962138, "loss": 0.0418, "step": 1547 }, { "epoch": 0.344, "grad_norm": 1.0654411315917969, "learning_rate": 0.0001314922048997773, "loss": 0.8916, "step": 1548 }, { "epoch": 0.3442222222222222, "grad_norm": 1.1717489957809448, "learning_rate": 0.0001314476614699332, "loss": 1.1441, "step": 1549 }, { "epoch": 0.34444444444444444, "grad_norm": 0.9188210964202881, "learning_rate": 0.00013140311804008909, "loss": 0.7552, "step": 1550 }, { "epoch": 0.3446666666666667, "grad_norm": 0.04411787912249565, "learning_rate": 0.00013135857461024498, "loss": 0.0097, "step": 1551 }, { "epoch": 0.3448888888888889, "grad_norm": 0.923112690448761, "learning_rate": 0.0001313140311804009, "loss": 2.3056, "step": 1552 }, { "epoch": 0.3451111111111111, "grad_norm": 0.5338932275772095, "learning_rate": 0.00013126948775055681, "loss": 1.0079, "step": 1553 }, { "epoch": 0.3453333333333333, "grad_norm": 0.6365249156951904, "learning_rate": 0.0001312249443207127, "loss": 1.1061, "step": 1554 }, { "epoch": 0.34555555555555556, "grad_norm": 0.6134782433509827, "learning_rate": 0.0001311804008908686, "loss": 1.1132, "step": 1555 }, { "epoch": 0.3457777777777778, "grad_norm": 0.9963126182556152, "learning_rate": 0.0001311358574610245, "loss": 2.2424, "step": 1556 }, { "epoch": 0.346, "grad_norm": 0.6900335550308228, "learning_rate": 0.0001310913140311804, "loss": 0.0327, "step": 1557 }, { "epoch": 0.3462222222222222, "grad_norm": 0.5578808188438416, "learning_rate": 0.0001310467706013363, "loss": 1.0124, "step": 1558 }, { "epoch": 0.34644444444444444, "grad_norm": 0.9253682494163513, "learning_rate": 0.00013100222717149222, "loss": 2.1314, "step": 1559 }, { "epoch": 0.3466666666666667, "grad_norm": 0.9170551896095276, "learning_rate": 0.0001309576837416481, "loss": 2.1579, "step": 1560 }, { "epoch": 0.3468888888888889, "grad_norm": 0.9059464931488037, "learning_rate": 0.000130913140311804, "loss": 2.0186, "step": 1561 }, { "epoch": 0.3471111111111111, "grad_norm": 0.9621554017066956, "learning_rate": 0.00013086859688195992, "loss": 2.1287, "step": 1562 }, { "epoch": 0.3473333333333333, "grad_norm": 1.1893537044525146, "learning_rate": 0.00013082405345211581, "loss": 2.3136, "step": 1563 }, { "epoch": 0.34755555555555556, "grad_norm": 0.9095190167427063, "learning_rate": 0.0001307795100222717, "loss": 1.9219, "step": 1564 }, { "epoch": 0.3477777777777778, "grad_norm": 1.1559150218963623, "learning_rate": 0.00013073496659242762, "loss": 2.1498, "step": 1565 }, { "epoch": 0.348, "grad_norm": 1.0603296756744385, "learning_rate": 0.00013069042316258354, "loss": 1.8157, "step": 1566 }, { "epoch": 0.3482222222222222, "grad_norm": 0.8160759806632996, "learning_rate": 0.00013064587973273944, "loss": 1.6901, "step": 1567 }, { "epoch": 0.34844444444444445, "grad_norm": 0.8839480876922607, "learning_rate": 0.00013060133630289533, "loss": 2.0199, "step": 1568 }, { "epoch": 0.3486666666666667, "grad_norm": 0.9084996581077576, "learning_rate": 0.00013055679287305122, "loss": 1.6581, "step": 1569 }, { "epoch": 0.3488888888888889, "grad_norm": 1.0160479545593262, "learning_rate": 0.0001305122494432071, "loss": 2.0388, "step": 1570 }, { "epoch": 0.3491111111111111, "grad_norm": 0.949428141117096, "learning_rate": 0.00013046770601336306, "loss": 1.9788, "step": 1571 }, { "epoch": 0.34933333333333333, "grad_norm": 0.08356664329767227, "learning_rate": 0.00013042316258351895, "loss": 0.0148, "step": 1572 }, { "epoch": 0.34955555555555556, "grad_norm": 0.6757658123970032, "learning_rate": 0.00013037861915367484, "loss": 0.7336, "step": 1573 }, { "epoch": 0.3497777777777778, "grad_norm": 1.2868684530258179, "learning_rate": 0.00013033407572383073, "loss": 2.1339, "step": 1574 }, { "epoch": 0.35, "grad_norm": 0.1288681924343109, "learning_rate": 0.00013028953229398665, "loss": 0.0233, "step": 1575 }, { "epoch": 0.3502222222222222, "grad_norm": 0.9786088466644287, "learning_rate": 0.00013024498886414254, "loss": 1.8232, "step": 1576 }, { "epoch": 0.35044444444444445, "grad_norm": 1.028579831123352, "learning_rate": 0.00013020044543429846, "loss": 1.8186, "step": 1577 }, { "epoch": 0.3506666666666667, "grad_norm": 1.0287123918533325, "learning_rate": 0.00013015590200445435, "loss": 1.7949, "step": 1578 }, { "epoch": 0.35088888888888886, "grad_norm": 1.1234492063522339, "learning_rate": 0.00013011135857461025, "loss": 1.7413, "step": 1579 }, { "epoch": 0.3511111111111111, "grad_norm": 0.061823874711990356, "learning_rate": 0.00013006681514476616, "loss": 0.0176, "step": 1580 }, { "epoch": 0.35133333333333333, "grad_norm": 0.6424452662467957, "learning_rate": 0.00013002227171492206, "loss": 0.707, "step": 1581 }, { "epoch": 0.35155555555555557, "grad_norm": 0.9415295124053955, "learning_rate": 0.00012997772828507795, "loss": 1.6596, "step": 1582 }, { "epoch": 0.3517777777777778, "grad_norm": 0.9959827065467834, "learning_rate": 0.00012993318485523387, "loss": 1.7779, "step": 1583 }, { "epoch": 0.352, "grad_norm": 0.7211580872535706, "learning_rate": 0.00012988864142538976, "loss": 1.0332, "step": 1584 }, { "epoch": 0.3522222222222222, "grad_norm": 0.08983895927667618, "learning_rate": 0.00012984409799554568, "loss": 0.0193, "step": 1585 }, { "epoch": 0.35244444444444445, "grad_norm": 0.07530716061592102, "learning_rate": 0.00012979955456570157, "loss": 0.0195, "step": 1586 }, { "epoch": 0.3526666666666667, "grad_norm": 1.0737003087997437, "learning_rate": 0.00012975501113585746, "loss": 1.4139, "step": 1587 }, { "epoch": 0.35288888888888886, "grad_norm": 1.1176050901412964, "learning_rate": 0.00012971046770601335, "loss": 1.6433, "step": 1588 }, { "epoch": 0.3531111111111111, "grad_norm": 1.0836116075515747, "learning_rate": 0.00012966592427616927, "loss": 1.602, "step": 1589 }, { "epoch": 0.35333333333333333, "grad_norm": 0.3639664351940155, "learning_rate": 0.0001296213808463252, "loss": 0.0277, "step": 1590 }, { "epoch": 0.35355555555555557, "grad_norm": 0.9747996926307678, "learning_rate": 0.00012957683741648108, "loss": 1.4553, "step": 1591 }, { "epoch": 0.3537777777777778, "grad_norm": 1.146167278289795, "learning_rate": 0.00012953229398663697, "loss": 1.7357, "step": 1592 }, { "epoch": 0.354, "grad_norm": 1.1450601816177368, "learning_rate": 0.00012948775055679287, "loss": 1.6419, "step": 1593 }, { "epoch": 0.3542222222222222, "grad_norm": 1.1255170106887817, "learning_rate": 0.00012944320712694879, "loss": 1.5562, "step": 1594 }, { "epoch": 0.35444444444444445, "grad_norm": 0.783320963382721, "learning_rate": 0.0001293986636971047, "loss": 0.6857, "step": 1595 }, { "epoch": 0.3546666666666667, "grad_norm": 0.5967231392860413, "learning_rate": 0.0001293541202672606, "loss": 0.5624, "step": 1596 }, { "epoch": 0.35488888888888886, "grad_norm": 1.0421006679534912, "learning_rate": 0.0001293095768374165, "loss": 1.3535, "step": 1597 }, { "epoch": 0.3551111111111111, "grad_norm": 0.8290188908576965, "learning_rate": 0.00012926503340757238, "loss": 0.7833, "step": 1598 }, { "epoch": 0.35533333333333333, "grad_norm": 0.8200139403343201, "learning_rate": 0.0001292204899777283, "loss": 0.6196, "step": 1599 }, { "epoch": 0.35555555555555557, "grad_norm": 0.9781906604766846, "learning_rate": 0.0001291759465478842, "loss": 0.8058, "step": 1600 }, { "epoch": 0.3557777777777778, "grad_norm": 0.0529782809317112, "learning_rate": 0.0001291314031180401, "loss": 0.0104, "step": 1601 }, { "epoch": 0.356, "grad_norm": 0.5451091527938843, "learning_rate": 0.000129086859688196, "loss": 0.9391, "step": 1602 }, { "epoch": 0.3562222222222222, "grad_norm": 0.8948948979377747, "learning_rate": 0.0001290423162583519, "loss": 2.1385, "step": 1603 }, { "epoch": 0.35644444444444445, "grad_norm": 0.0486118420958519, "learning_rate": 0.0001289977728285078, "loss": 0.0104, "step": 1604 }, { "epoch": 0.3566666666666667, "grad_norm": 0.5522439479827881, "learning_rate": 0.0001289532293986637, "loss": 1.1311, "step": 1605 }, { "epoch": 0.35688888888888887, "grad_norm": 0.847810685634613, "learning_rate": 0.0001289086859688196, "loss": 2.2308, "step": 1606 }, { "epoch": 0.3571111111111111, "grad_norm": 0.8593490719795227, "learning_rate": 0.0001288641425389755, "loss": 2.0835, "step": 1607 }, { "epoch": 0.35733333333333334, "grad_norm": 0.8339969515800476, "learning_rate": 0.0001288195991091314, "loss": 2.0113, "step": 1608 }, { "epoch": 0.35755555555555557, "grad_norm": 0.38288450241088867, "learning_rate": 0.00012877505567928732, "loss": 0.0169, "step": 1609 }, { "epoch": 0.35777777777777775, "grad_norm": 0.11315584927797318, "learning_rate": 0.00012873051224944322, "loss": 0.0161, "step": 1610 }, { "epoch": 0.358, "grad_norm": 0.1084010973572731, "learning_rate": 0.0001286859688195991, "loss": 0.015, "step": 1611 }, { "epoch": 0.3582222222222222, "grad_norm": 0.10393639653921127, "learning_rate": 0.000128641425389755, "loss": 0.0146, "step": 1612 }, { "epoch": 0.35844444444444445, "grad_norm": 0.8769986629486084, "learning_rate": 0.00012859688195991092, "loss": 2.0321, "step": 1613 }, { "epoch": 0.3586666666666667, "grad_norm": 0.8985224366188049, "learning_rate": 0.00012855233853006684, "loss": 1.9825, "step": 1614 }, { "epoch": 0.35888888888888887, "grad_norm": 0.8849453926086426, "learning_rate": 0.00012850779510022273, "loss": 1.8286, "step": 1615 }, { "epoch": 0.3591111111111111, "grad_norm": 0.876132607460022, "learning_rate": 0.00012846325167037862, "loss": 2.1886, "step": 1616 }, { "epoch": 0.35933333333333334, "grad_norm": 0.7373232245445251, "learning_rate": 0.00012841870824053451, "loss": 1.0218, "step": 1617 }, { "epoch": 0.3595555555555556, "grad_norm": 0.9138587117195129, "learning_rate": 0.00012837416481069043, "loss": 2.1635, "step": 1618 }, { "epoch": 0.35977777777777775, "grad_norm": 0.8863195180892944, "learning_rate": 0.00012832962138084632, "loss": 1.9095, "step": 1619 }, { "epoch": 0.36, "grad_norm": 0.9143100380897522, "learning_rate": 0.00012828507795100224, "loss": 1.8664, "step": 1620 }, { "epoch": 0.3602222222222222, "grad_norm": 0.9049918055534363, "learning_rate": 0.00012824053452115814, "loss": 1.9491, "step": 1621 }, { "epoch": 0.36044444444444446, "grad_norm": 0.8652317523956299, "learning_rate": 0.00012819599109131403, "loss": 1.7523, "step": 1622 }, { "epoch": 0.3606666666666667, "grad_norm": 0.07398027926683426, "learning_rate": 0.00012815144766146995, "loss": 0.0159, "step": 1623 }, { "epoch": 0.36088888888888887, "grad_norm": 0.07411263883113861, "learning_rate": 0.00012810690423162584, "loss": 0.0185, "step": 1624 }, { "epoch": 0.3611111111111111, "grad_norm": 0.11097452044487, "learning_rate": 0.00012806236080178173, "loss": 0.0206, "step": 1625 }, { "epoch": 0.36133333333333334, "grad_norm": 1.0089179277420044, "learning_rate": 0.00012801781737193765, "loss": 1.8515, "step": 1626 }, { "epoch": 0.3615555555555556, "grad_norm": 0.9654055237770081, "learning_rate": 0.00012797327394209354, "loss": 1.7223, "step": 1627 }, { "epoch": 0.36177777777777775, "grad_norm": 0.9917669892311096, "learning_rate": 0.00012792873051224946, "loss": 1.8853, "step": 1628 }, { "epoch": 0.362, "grad_norm": 1.2126929759979248, "learning_rate": 0.00012788418708240535, "loss": 1.8215, "step": 1629 }, { "epoch": 0.3622222222222222, "grad_norm": 1.3606003522872925, "learning_rate": 0.00012783964365256124, "loss": 1.9842, "step": 1630 }, { "epoch": 0.36244444444444446, "grad_norm": 0.7445697784423828, "learning_rate": 0.00012779510022271713, "loss": 0.9715, "step": 1631 }, { "epoch": 0.3626666666666667, "grad_norm": 0.0977458506822586, "learning_rate": 0.00012775055679287305, "loss": 0.0185, "step": 1632 }, { "epoch": 0.36288888888888887, "grad_norm": 0.07269278168678284, "learning_rate": 0.00012770601336302897, "loss": 0.0188, "step": 1633 }, { "epoch": 0.3631111111111111, "grad_norm": 0.7048798203468323, "learning_rate": 0.00012766146993318486, "loss": 0.8731, "step": 1634 }, { "epoch": 0.36333333333333334, "grad_norm": 0.9043886661529541, "learning_rate": 0.00012761692650334076, "loss": 1.8761, "step": 1635 }, { "epoch": 0.3635555555555556, "grad_norm": 1.0288832187652588, "learning_rate": 0.00012757238307349665, "loss": 1.7554, "step": 1636 }, { "epoch": 0.36377777777777776, "grad_norm": 1.0029970407485962, "learning_rate": 0.00012752783964365257, "loss": 1.8181, "step": 1637 }, { "epoch": 0.364, "grad_norm": 0.0940476730465889, "learning_rate": 0.00012748329621380849, "loss": 0.0175, "step": 1638 }, { "epoch": 0.3642222222222222, "grad_norm": 0.07410628348588943, "learning_rate": 0.00012743875278396438, "loss": 0.0175, "step": 1639 }, { "epoch": 0.36444444444444446, "grad_norm": 1.1884973049163818, "learning_rate": 0.00012739420935412027, "loss": 1.936, "step": 1640 }, { "epoch": 0.36466666666666664, "grad_norm": 0.9561774730682373, "learning_rate": 0.00012734966592427616, "loss": 1.3877, "step": 1641 }, { "epoch": 0.3648888888888889, "grad_norm": 1.1464483737945557, "learning_rate": 0.00012730512249443208, "loss": 1.9865, "step": 1642 }, { "epoch": 0.3651111111111111, "grad_norm": 0.7155196666717529, "learning_rate": 0.00012726057906458797, "loss": 0.6446, "step": 1643 }, { "epoch": 0.36533333333333334, "grad_norm": 1.1351078748703003, "learning_rate": 0.0001272160356347439, "loss": 1.8303, "step": 1644 }, { "epoch": 0.3655555555555556, "grad_norm": 1.0295593738555908, "learning_rate": 0.00012717149220489978, "loss": 1.2797, "step": 1645 }, { "epoch": 0.36577777777777776, "grad_norm": 0.20271006226539612, "learning_rate": 0.00012712694877505567, "loss": 0.0336, "step": 1646 }, { "epoch": 0.366, "grad_norm": 0.2018907517194748, "learning_rate": 0.0001270824053452116, "loss": 0.0321, "step": 1647 }, { "epoch": 0.3662222222222222, "grad_norm": 1.1571309566497803, "learning_rate": 0.00012703786191536748, "loss": 1.2524, "step": 1648 }, { "epoch": 0.36644444444444446, "grad_norm": 0.6432564854621887, "learning_rate": 0.00012699331848552338, "loss": 0.503, "step": 1649 }, { "epoch": 0.36666666666666664, "grad_norm": 0.9266985058784485, "learning_rate": 0.0001269487750556793, "loss": 0.8534, "step": 1650 }, { "epoch": 0.3668888888888889, "grad_norm": 0.581489086151123, "learning_rate": 0.00012690423162583521, "loss": 1.1384, "step": 1651 }, { "epoch": 0.3671111111111111, "grad_norm": 0.5554734468460083, "learning_rate": 0.0001268596881959911, "loss": 1.0044, "step": 1652 }, { "epoch": 0.36733333333333335, "grad_norm": 0.6623711585998535, "learning_rate": 0.000126815144766147, "loss": 1.161, "step": 1653 }, { "epoch": 0.3675555555555556, "grad_norm": 0.060292765498161316, "learning_rate": 0.0001267706013363029, "loss": 0.0109, "step": 1654 }, { "epoch": 0.36777777777777776, "grad_norm": 0.582197368144989, "learning_rate": 0.00012672605790645878, "loss": 1.1099, "step": 1655 }, { "epoch": 0.368, "grad_norm": 0.8612513542175293, "learning_rate": 0.00012668151447661473, "loss": 1.9373, "step": 1656 }, { "epoch": 0.36822222222222223, "grad_norm": 0.252760112285614, "learning_rate": 0.00012663697104677062, "loss": 0.0131, "step": 1657 }, { "epoch": 0.36844444444444446, "grad_norm": 0.07458077371120453, "learning_rate": 0.0001265924276169265, "loss": 0.0122, "step": 1658 }, { "epoch": 0.36866666666666664, "grad_norm": 0.926961362361908, "learning_rate": 0.0001265478841870824, "loss": 2.0833, "step": 1659 }, { "epoch": 0.3688888888888889, "grad_norm": 0.8995304107666016, "learning_rate": 0.00012650334075723832, "loss": 2.253, "step": 1660 }, { "epoch": 0.3691111111111111, "grad_norm": 0.9721949100494385, "learning_rate": 0.00012645879732739421, "loss": 2.0844, "step": 1661 }, { "epoch": 0.36933333333333335, "grad_norm": 0.8844018578529358, "learning_rate": 0.00012641425389755013, "loss": 2.176, "step": 1662 }, { "epoch": 0.3695555555555556, "grad_norm": 1.2494245767593384, "learning_rate": 0.00012636971046770602, "loss": 2.3682, "step": 1663 }, { "epoch": 0.36977777777777776, "grad_norm": 0.8971030712127686, "learning_rate": 0.00012632516703786192, "loss": 1.9487, "step": 1664 }, { "epoch": 0.37, "grad_norm": 0.6521131992340088, "learning_rate": 0.00012628062360801784, "loss": 1.0122, "step": 1665 }, { "epoch": 0.37022222222222223, "grad_norm": 0.9701309204101562, "learning_rate": 0.00012623608017817373, "loss": 2.0993, "step": 1666 }, { "epoch": 0.37044444444444447, "grad_norm": 0.8997014760971069, "learning_rate": 0.00012619153674832962, "loss": 1.8794, "step": 1667 }, { "epoch": 0.37066666666666664, "grad_norm": 1.065514087677002, "learning_rate": 0.00012614699331848554, "loss": 2.1362, "step": 1668 }, { "epoch": 0.3708888888888889, "grad_norm": 0.995553731918335, "learning_rate": 0.00012610244988864143, "loss": 1.7254, "step": 1669 }, { "epoch": 0.3711111111111111, "grad_norm": 0.8794861435890198, "learning_rate": 0.00012605790645879735, "loss": 1.0144, "step": 1670 }, { "epoch": 0.37133333333333335, "grad_norm": 0.0776643380522728, "learning_rate": 0.00012601336302895324, "loss": 0.0167, "step": 1671 }, { "epoch": 0.37155555555555553, "grad_norm": 0.6524280905723572, "learning_rate": 0.00012596881959910913, "loss": 1.0171, "step": 1672 }, { "epoch": 0.37177777777777776, "grad_norm": 0.9473826289176941, "learning_rate": 0.00012592427616926502, "loss": 1.8923, "step": 1673 }, { "epoch": 0.372, "grad_norm": 0.9059301614761353, "learning_rate": 0.00012587973273942094, "loss": 1.8065, "step": 1674 }, { "epoch": 0.37222222222222223, "grad_norm": 0.9567731022834778, "learning_rate": 0.00012583518930957686, "loss": 1.8692, "step": 1675 }, { "epoch": 0.37244444444444447, "grad_norm": 1.0518147945404053, "learning_rate": 0.00012579064587973275, "loss": 1.9397, "step": 1676 }, { "epoch": 0.37266666666666665, "grad_norm": 1.0581625699996948, "learning_rate": 0.00012574610244988865, "loss": 1.9263, "step": 1677 }, { "epoch": 0.3728888888888889, "grad_norm": 1.148897647857666, "learning_rate": 0.00012570155902004454, "loss": 1.8185, "step": 1678 }, { "epoch": 0.3731111111111111, "grad_norm": 0.6279930472373962, "learning_rate": 0.00012565701559020046, "loss": 0.6442, "step": 1679 }, { "epoch": 0.37333333333333335, "grad_norm": 0.06652513891458511, "learning_rate": 0.00012561247216035635, "loss": 0.0181, "step": 1680 }, { "epoch": 0.37355555555555553, "grad_norm": 0.5910684466362, "learning_rate": 0.00012556792873051227, "loss": 0.857, "step": 1681 }, { "epoch": 0.37377777777777776, "grad_norm": 0.09267974644899368, "learning_rate": 0.00012552338530066816, "loss": 0.0185, "step": 1682 }, { "epoch": 0.374, "grad_norm": 0.09794706106185913, "learning_rate": 0.00012547884187082405, "loss": 0.0179, "step": 1683 }, { "epoch": 0.37422222222222223, "grad_norm": 1.141258955001831, "learning_rate": 0.00012543429844097997, "loss": 1.9392, "step": 1684 }, { "epoch": 0.37444444444444447, "grad_norm": 0.9182208776473999, "learning_rate": 0.00012538975501113586, "loss": 1.4379, "step": 1685 }, { "epoch": 0.37466666666666665, "grad_norm": 0.9447404742240906, "learning_rate": 0.00012534521158129175, "loss": 1.3792, "step": 1686 }, { "epoch": 0.3748888888888889, "grad_norm": 0.6784771680831909, "learning_rate": 0.00012530066815144767, "loss": 0.8427, "step": 1687 }, { "epoch": 0.3751111111111111, "grad_norm": 0.10267607867717743, "learning_rate": 0.00012525612472160356, "loss": 0.0257, "step": 1688 }, { "epoch": 0.37533333333333335, "grad_norm": 0.9849367141723633, "learning_rate": 0.00012521158129175948, "loss": 1.5144, "step": 1689 }, { "epoch": 0.37555555555555553, "grad_norm": 1.1103235483169556, "learning_rate": 0.00012516703786191537, "loss": 1.7006, "step": 1690 }, { "epoch": 0.37577777777777777, "grad_norm": 1.041797399520874, "learning_rate": 0.00012512249443207127, "loss": 1.4283, "step": 1691 }, { "epoch": 0.376, "grad_norm": 0.8971735835075378, "learning_rate": 0.00012507795100222716, "loss": 1.3154, "step": 1692 }, { "epoch": 0.37622222222222224, "grad_norm": 1.0990266799926758, "learning_rate": 0.00012503340757238308, "loss": 1.3619, "step": 1693 }, { "epoch": 0.37644444444444447, "grad_norm": 1.2396061420440674, "learning_rate": 0.000124988864142539, "loss": 1.5702, "step": 1694 }, { "epoch": 0.37666666666666665, "grad_norm": 0.661430835723877, "learning_rate": 0.0001249443207126949, "loss": 0.5549, "step": 1695 }, { "epoch": 0.3768888888888889, "grad_norm": 1.2013130187988281, "learning_rate": 0.00012489977728285078, "loss": 1.2375, "step": 1696 }, { "epoch": 0.3771111111111111, "grad_norm": 0.6722013354301453, "learning_rate": 0.00012485523385300667, "loss": 0.6046, "step": 1697 }, { "epoch": 0.37733333333333335, "grad_norm": 0.19260694086551666, "learning_rate": 0.0001248106904231626, "loss": 0.0361, "step": 1698 }, { "epoch": 0.37755555555555553, "grad_norm": 1.0123697519302368, "learning_rate": 0.0001247661469933185, "loss": 1.0244, "step": 1699 }, { "epoch": 0.37777777777777777, "grad_norm": 0.949657678604126, "learning_rate": 0.0001247216035634744, "loss": 0.8659, "step": 1700 }, { "epoch": 0.378, "grad_norm": 0.7017142176628113, "learning_rate": 0.0001246770601336303, "loss": 1.1181, "step": 1701 }, { "epoch": 0.37822222222222224, "grad_norm": 0.9786233305931091, "learning_rate": 0.00012463251670378618, "loss": 2.1521, "step": 1702 }, { "epoch": 0.37844444444444447, "grad_norm": 0.6545943021774292, "learning_rate": 0.0001245879732739421, "loss": 1.2272, "step": 1703 }, { "epoch": 0.37866666666666665, "grad_norm": 1.0188517570495605, "learning_rate": 0.000124543429844098, "loss": 2.0431, "step": 1704 }, { "epoch": 0.3788888888888889, "grad_norm": 0.7633973360061646, "learning_rate": 0.00012449888641425391, "loss": 2.082, "step": 1705 }, { "epoch": 0.3791111111111111, "grad_norm": 0.9081476926803589, "learning_rate": 0.0001244543429844098, "loss": 2.2362, "step": 1706 }, { "epoch": 0.37933333333333336, "grad_norm": 0.08680958300828934, "learning_rate": 0.0001244097995545657, "loss": 0.0136, "step": 1707 }, { "epoch": 0.37955555555555553, "grad_norm": 0.0849529430270195, "learning_rate": 0.00012436525612472162, "loss": 0.0133, "step": 1708 }, { "epoch": 0.37977777777777777, "grad_norm": 0.5420004725456238, "learning_rate": 0.0001243207126948775, "loss": 1.0004, "step": 1709 }, { "epoch": 0.38, "grad_norm": 0.9287367463111877, "learning_rate": 0.0001242761692650334, "loss": 2.0996, "step": 1710 }, { "epoch": 0.38022222222222224, "grad_norm": 1.0013773441314697, "learning_rate": 0.00012423162583518932, "loss": 2.0191, "step": 1711 }, { "epoch": 0.3804444444444444, "grad_norm": 0.9471575021743774, "learning_rate": 0.0001241870824053452, "loss": 1.9956, "step": 1712 }, { "epoch": 0.38066666666666665, "grad_norm": 0.981694221496582, "learning_rate": 0.00012414253897550113, "loss": 2.1098, "step": 1713 }, { "epoch": 0.3808888888888889, "grad_norm": 0.970139741897583, "learning_rate": 0.00012409799554565702, "loss": 2.0178, "step": 1714 }, { "epoch": 0.3811111111111111, "grad_norm": 0.598267138004303, "learning_rate": 0.0001240534521158129, "loss": 1.0541, "step": 1715 }, { "epoch": 0.38133333333333336, "grad_norm": 1.0712449550628662, "learning_rate": 0.0001240089086859688, "loss": 1.9999, "step": 1716 }, { "epoch": 0.38155555555555554, "grad_norm": 0.9940736293792725, "learning_rate": 0.00012396436525612472, "loss": 2.1087, "step": 1717 }, { "epoch": 0.38177777777777777, "grad_norm": 0.0704389214515686, "learning_rate": 0.00012391982182628064, "loss": 0.0179, "step": 1718 }, { "epoch": 0.382, "grad_norm": 0.07912923395633698, "learning_rate": 0.00012387527839643653, "loss": 0.018, "step": 1719 }, { "epoch": 0.38222222222222224, "grad_norm": 0.6652305126190186, "learning_rate": 0.00012383073496659243, "loss": 0.9506, "step": 1720 }, { "epoch": 0.3824444444444444, "grad_norm": 0.2545645833015442, "learning_rate": 0.00012378619153674832, "loss": 0.0274, "step": 1721 }, { "epoch": 0.38266666666666665, "grad_norm": 1.0166536569595337, "learning_rate": 0.00012374164810690424, "loss": 1.8203, "step": 1722 }, { "epoch": 0.3828888888888889, "grad_norm": 0.967710018157959, "learning_rate": 0.00012369710467706016, "loss": 1.8026, "step": 1723 }, { "epoch": 0.3831111111111111, "grad_norm": 0.9521524310112, "learning_rate": 0.00012365256124721605, "loss": 1.8211, "step": 1724 }, { "epoch": 0.38333333333333336, "grad_norm": 0.9643096923828125, "learning_rate": 0.00012360801781737194, "loss": 1.8337, "step": 1725 }, { "epoch": 0.38355555555555554, "grad_norm": 1.082641839981079, "learning_rate": 0.00012356347438752783, "loss": 1.9756, "step": 1726 }, { "epoch": 0.3837777777777778, "grad_norm": 0.2606064975261688, "learning_rate": 0.00012351893095768375, "loss": 0.0189, "step": 1727 }, { "epoch": 0.384, "grad_norm": 0.06352642178535461, "learning_rate": 0.00012347438752783964, "loss": 0.0185, "step": 1728 }, { "epoch": 0.38422222222222224, "grad_norm": 0.9276388883590698, "learning_rate": 0.00012342984409799556, "loss": 1.7712, "step": 1729 }, { "epoch": 0.3844444444444444, "grad_norm": 0.9787095785140991, "learning_rate": 0.00012338530066815145, "loss": 1.6116, "step": 1730 }, { "epoch": 0.38466666666666666, "grad_norm": 1.4584063291549683, "learning_rate": 0.00012334075723830735, "loss": 1.6803, "step": 1731 }, { "epoch": 0.3848888888888889, "grad_norm": 0.10419341921806335, "learning_rate": 0.00012329621380846326, "loss": 0.0198, "step": 1732 }, { "epoch": 0.3851111111111111, "grad_norm": 0.9885858297348022, "learning_rate": 0.00012325167037861916, "loss": 1.5324, "step": 1733 }, { "epoch": 0.38533333333333336, "grad_norm": 0.9763101935386658, "learning_rate": 0.00012320712694877505, "loss": 1.7128, "step": 1734 }, { "epoch": 0.38555555555555554, "grad_norm": 1.02174973487854, "learning_rate": 0.00012316258351893097, "loss": 1.6472, "step": 1735 }, { "epoch": 0.3857777777777778, "grad_norm": 1.0153290033340454, "learning_rate": 0.00012311804008908689, "loss": 1.7305, "step": 1736 }, { "epoch": 0.386, "grad_norm": 0.9473196864128113, "learning_rate": 0.00012307349665924278, "loss": 0.0297, "step": 1737 }, { "epoch": 0.38622222222222224, "grad_norm": 0.9792290329933167, "learning_rate": 0.00012302895322939867, "loss": 1.6719, "step": 1738 }, { "epoch": 0.3864444444444444, "grad_norm": 1.081703782081604, "learning_rate": 0.00012298440979955456, "loss": 1.6265, "step": 1739 }, { "epoch": 0.38666666666666666, "grad_norm": 1.0634948015213013, "learning_rate": 0.00012293986636971045, "loss": 1.4929, "step": 1740 }, { "epoch": 0.3868888888888889, "grad_norm": 1.219645380973816, "learning_rate": 0.0001228953229398664, "loss": 1.3222, "step": 1741 }, { "epoch": 0.38711111111111113, "grad_norm": 1.1311880350112915, "learning_rate": 0.0001228507795100223, "loss": 1.4129, "step": 1742 }, { "epoch": 0.3873333333333333, "grad_norm": 1.1938977241516113, "learning_rate": 0.00012280623608017818, "loss": 1.6393, "step": 1743 }, { "epoch": 0.38755555555555554, "grad_norm": 0.7046709656715393, "learning_rate": 0.00012276169265033407, "loss": 0.6052, "step": 1744 }, { "epoch": 0.3877777777777778, "grad_norm": 1.2281843423843384, "learning_rate": 0.00012271714922049, "loss": 1.361, "step": 1745 }, { "epoch": 0.388, "grad_norm": 1.0567635297775269, "learning_rate": 0.00012267260579064588, "loss": 1.1866, "step": 1746 }, { "epoch": 0.38822222222222225, "grad_norm": 0.9681671261787415, "learning_rate": 0.0001226280623608018, "loss": 1.2505, "step": 1747 }, { "epoch": 0.3884444444444444, "grad_norm": 0.7260466814041138, "learning_rate": 0.0001225835189309577, "loss": 0.5553, "step": 1748 }, { "epoch": 0.38866666666666666, "grad_norm": 0.8488286137580872, "learning_rate": 0.0001225389755011136, "loss": 0.7687, "step": 1749 }, { "epoch": 0.3888888888888889, "grad_norm": 1.0907729864120483, "learning_rate": 0.0001224944320712695, "loss": 0.9859, "step": 1750 }, { "epoch": 0.38911111111111113, "grad_norm": 0.5968582630157471, "learning_rate": 0.0001224498886414254, "loss": 1.0487, "step": 1751 }, { "epoch": 0.3893333333333333, "grad_norm": 0.4760688245296478, "learning_rate": 0.0001224053452115813, "loss": 1.1006, "step": 1752 }, { "epoch": 0.38955555555555554, "grad_norm": 0.880111813545227, "learning_rate": 0.00012236080178173718, "loss": 2.0063, "step": 1753 }, { "epoch": 0.3897777777777778, "grad_norm": 0.6034107804298401, "learning_rate": 0.0001223162583518931, "loss": 1.2052, "step": 1754 }, { "epoch": 0.39, "grad_norm": 0.055661823600530624, "learning_rate": 0.00012227171492204902, "loss": 0.0117, "step": 1755 }, { "epoch": 0.39022222222222225, "grad_norm": 0.05036118999123573, "learning_rate": 0.0001222271714922049, "loss": 0.0128, "step": 1756 }, { "epoch": 0.3904444444444444, "grad_norm": 0.09149646013975143, "learning_rate": 0.0001221826280623608, "loss": 0.014, "step": 1757 }, { "epoch": 0.39066666666666666, "grad_norm": 0.09102200716733932, "learning_rate": 0.0001221380846325167, "loss": 0.0142, "step": 1758 }, { "epoch": 0.3908888888888889, "grad_norm": 0.9077672958374023, "learning_rate": 0.00012209354120267261, "loss": 2.2172, "step": 1759 }, { "epoch": 0.39111111111111113, "grad_norm": 0.9596045613288879, "learning_rate": 0.00012204899777282852, "loss": 2.005, "step": 1760 }, { "epoch": 0.3913333333333333, "grad_norm": 0.9369930624961853, "learning_rate": 0.00012200445434298442, "loss": 1.9653, "step": 1761 }, { "epoch": 0.39155555555555555, "grad_norm": 0.7747904062271118, "learning_rate": 0.00012195991091314032, "loss": 1.9532, "step": 1762 }, { "epoch": 0.3917777777777778, "grad_norm": 0.9672890305519104, "learning_rate": 0.00012191536748329622, "loss": 2.137, "step": 1763 }, { "epoch": 0.392, "grad_norm": 0.1316177099943161, "learning_rate": 0.00012187082405345211, "loss": 0.0221, "step": 1764 }, { "epoch": 0.39222222222222225, "grad_norm": 0.9752106070518494, "learning_rate": 0.00012182628062360802, "loss": 1.8069, "step": 1765 }, { "epoch": 0.39244444444444443, "grad_norm": 0.9790940284729004, "learning_rate": 0.00012178173719376394, "loss": 1.9365, "step": 1766 }, { "epoch": 0.39266666666666666, "grad_norm": 0.9355417490005493, "learning_rate": 0.00012173719376391983, "loss": 2.0998, "step": 1767 }, { "epoch": 0.3928888888888889, "grad_norm": 0.8696015477180481, "learning_rate": 0.00012169265033407573, "loss": 1.8553, "step": 1768 }, { "epoch": 0.39311111111111113, "grad_norm": 1.026228666305542, "learning_rate": 0.00012164810690423163, "loss": 1.7821, "step": 1769 }, { "epoch": 0.3933333333333333, "grad_norm": 1.0769325494766235, "learning_rate": 0.00012160356347438753, "loss": 2.1182, "step": 1770 }, { "epoch": 0.39355555555555555, "grad_norm": 0.9091227650642395, "learning_rate": 0.00012155902004454342, "loss": 1.7079, "step": 1771 }, { "epoch": 0.3937777777777778, "grad_norm": 0.0760640799999237, "learning_rate": 0.00012151447661469934, "loss": 0.0169, "step": 1772 }, { "epoch": 0.394, "grad_norm": 0.07410979270935059, "learning_rate": 0.00012146993318485525, "loss": 0.0165, "step": 1773 }, { "epoch": 0.3942222222222222, "grad_norm": 0.5938198566436768, "learning_rate": 0.00012142538975501114, "loss": 0.9765, "step": 1774 }, { "epoch": 0.39444444444444443, "grad_norm": 0.15130677819252014, "learning_rate": 0.00012138084632516705, "loss": 0.0236, "step": 1775 }, { "epoch": 0.39466666666666667, "grad_norm": 0.6686200499534607, "learning_rate": 0.00012133630289532294, "loss": 0.9716, "step": 1776 }, { "epoch": 0.3948888888888889, "grad_norm": 1.011210322380066, "learning_rate": 0.00012129175946547884, "loss": 1.7901, "step": 1777 }, { "epoch": 0.39511111111111114, "grad_norm": 1.0566589832305908, "learning_rate": 0.00012124721603563476, "loss": 1.6834, "step": 1778 }, { "epoch": 0.3953333333333333, "grad_norm": 1.1285459995269775, "learning_rate": 0.00012120267260579065, "loss": 1.7103, "step": 1779 }, { "epoch": 0.39555555555555555, "grad_norm": 0.9587770104408264, "learning_rate": 0.00012115812917594656, "loss": 2.0388, "step": 1780 }, { "epoch": 0.3957777777777778, "grad_norm": 0.9700530171394348, "learning_rate": 0.00012111358574610245, "loss": 1.7206, "step": 1781 }, { "epoch": 0.396, "grad_norm": 0.6485929489135742, "learning_rate": 0.00012106904231625836, "loss": 0.7268, "step": 1782 }, { "epoch": 0.3962222222222222, "grad_norm": 0.07007116824388504, "learning_rate": 0.00012102449888641425, "loss": 0.0187, "step": 1783 }, { "epoch": 0.39644444444444443, "grad_norm": 0.06163879111409187, "learning_rate": 0.00012097995545657017, "loss": 0.0191, "step": 1784 }, { "epoch": 0.39666666666666667, "grad_norm": 0.9925112128257751, "learning_rate": 0.00012093541202672607, "loss": 1.7063, "step": 1785 }, { "epoch": 0.3968888888888889, "grad_norm": 0.1028611809015274, "learning_rate": 0.00012089086859688196, "loss": 0.0217, "step": 1786 }, { "epoch": 0.39711111111111114, "grad_norm": 0.10675106197595596, "learning_rate": 0.00012084632516703787, "loss": 0.0214, "step": 1787 }, { "epoch": 0.3973333333333333, "grad_norm": 0.09875133633613586, "learning_rate": 0.00012080178173719376, "loss": 0.0206, "step": 1788 }, { "epoch": 0.39755555555555555, "grad_norm": 1.1721303462982178, "learning_rate": 0.00012075723830734967, "loss": 1.4865, "step": 1789 }, { "epoch": 0.3977777777777778, "grad_norm": 1.0789026021957397, "learning_rate": 0.00012071269487750559, "loss": 1.7231, "step": 1790 }, { "epoch": 0.398, "grad_norm": 1.559047818183899, "learning_rate": 0.00012066815144766148, "loss": 0.0602, "step": 1791 }, { "epoch": 0.3982222222222222, "grad_norm": 1.1085612773895264, "learning_rate": 0.00012062360801781738, "loss": 1.6387, "step": 1792 }, { "epoch": 0.39844444444444443, "grad_norm": 1.3449455499649048, "learning_rate": 0.00012057906458797327, "loss": 1.7407, "step": 1793 }, { "epoch": 0.39866666666666667, "grad_norm": 1.04912531375885, "learning_rate": 0.00012053452115812918, "loss": 1.5535, "step": 1794 }, { "epoch": 0.3988888888888889, "grad_norm": 1.0405304431915283, "learning_rate": 0.00012048997772828507, "loss": 1.4078, "step": 1795 }, { "epoch": 0.39911111111111114, "grad_norm": 1.0141276121139526, "learning_rate": 0.00012044543429844099, "loss": 1.2146, "step": 1796 }, { "epoch": 0.3993333333333333, "grad_norm": 0.9934976100921631, "learning_rate": 0.0001204008908685969, "loss": 1.0574, "step": 1797 }, { "epoch": 0.39955555555555555, "grad_norm": 1.1789382696151733, "learning_rate": 0.00012035634743875279, "loss": 1.3544, "step": 1798 }, { "epoch": 0.3997777777777778, "grad_norm": 0.6419237852096558, "learning_rate": 0.00012031180400890869, "loss": 0.4579, "step": 1799 }, { "epoch": 0.4, "grad_norm": 0.6202405095100403, "learning_rate": 0.00012026726057906458, "loss": 0.4062, "step": 1800 }, { "epoch": 0.4, "eval_loss": 1.200887680053711, "eval_runtime": 240.6565, "eval_samples_per_second": 4.155, "eval_steps_per_second": 4.155, "step": 1800 }, { "epoch": 0.4002222222222222, "grad_norm": 0.974485456943512, "learning_rate": 0.00012022271714922049, "loss": 2.0298, "step": 1801 }, { "epoch": 0.40044444444444444, "grad_norm": 0.8032060265541077, "learning_rate": 0.00012017817371937641, "loss": 2.1083, "step": 1802 }, { "epoch": 0.40066666666666667, "grad_norm": 0.054990362375974655, "learning_rate": 0.0001201336302895323, "loss": 0.011, "step": 1803 }, { "epoch": 0.4008888888888889, "grad_norm": 0.8215484619140625, "learning_rate": 0.0001200890868596882, "loss": 2.1251, "step": 1804 }, { "epoch": 0.4011111111111111, "grad_norm": 0.7075866460800171, "learning_rate": 0.0001200445434298441, "loss": 0.9495, "step": 1805 }, { "epoch": 0.4013333333333333, "grad_norm": 0.08694480359554291, "learning_rate": 0.00012, "loss": 0.0124, "step": 1806 }, { "epoch": 0.40155555555555555, "grad_norm": 0.5654726028442383, "learning_rate": 0.0001199554565701559, "loss": 1.032, "step": 1807 }, { "epoch": 0.4017777777777778, "grad_norm": 0.9461843371391296, "learning_rate": 0.00011991091314031181, "loss": 2.3011, "step": 1808 }, { "epoch": 0.402, "grad_norm": 0.9733036160469055, "learning_rate": 0.00011986636971046772, "loss": 2.212, "step": 1809 }, { "epoch": 0.4022222222222222, "grad_norm": 0.9258533716201782, "learning_rate": 0.00011982182628062361, "loss": 2.2234, "step": 1810 }, { "epoch": 0.40244444444444444, "grad_norm": 0.900391697883606, "learning_rate": 0.00011977728285077952, "loss": 2.1527, "step": 1811 }, { "epoch": 0.4026666666666667, "grad_norm": 1.021876573562622, "learning_rate": 0.00011973273942093541, "loss": 2.1505, "step": 1812 }, { "epoch": 0.4028888888888889, "grad_norm": 0.8823310136795044, "learning_rate": 0.00011968819599109131, "loss": 1.9947, "step": 1813 }, { "epoch": 0.4031111111111111, "grad_norm": 0.603898286819458, "learning_rate": 0.00011964365256124723, "loss": 1.0451, "step": 1814 }, { "epoch": 0.4033333333333333, "grad_norm": 0.5974671840667725, "learning_rate": 0.00011959910913140312, "loss": 0.9056, "step": 1815 }, { "epoch": 0.40355555555555556, "grad_norm": 0.8365625143051147, "learning_rate": 0.00011955456570155903, "loss": 1.7608, "step": 1816 }, { "epoch": 0.4037777777777778, "grad_norm": 0.9293599128723145, "learning_rate": 0.00011951002227171492, "loss": 1.8854, "step": 1817 }, { "epoch": 0.404, "grad_norm": 1.0051164627075195, "learning_rate": 0.00011946547884187083, "loss": 1.9833, "step": 1818 }, { "epoch": 0.4042222222222222, "grad_norm": 0.9288824200630188, "learning_rate": 0.00011942093541202672, "loss": 1.8936, "step": 1819 }, { "epoch": 0.40444444444444444, "grad_norm": 0.9323967099189758, "learning_rate": 0.00011937639198218265, "loss": 1.6103, "step": 1820 }, { "epoch": 0.4046666666666667, "grad_norm": 0.9585559964179993, "learning_rate": 0.00011933184855233854, "loss": 1.8573, "step": 1821 }, { "epoch": 0.4048888888888889, "grad_norm": 0.9867689609527588, "learning_rate": 0.00011928730512249445, "loss": 1.8621, "step": 1822 }, { "epoch": 0.4051111111111111, "grad_norm": 0.08060096949338913, "learning_rate": 0.00011924276169265034, "loss": 0.0168, "step": 1823 }, { "epoch": 0.4053333333333333, "grad_norm": 0.08404132723808289, "learning_rate": 0.00011919821826280623, "loss": 0.0171, "step": 1824 }, { "epoch": 0.40555555555555556, "grad_norm": 0.12815998494625092, "learning_rate": 0.00011915367483296214, "loss": 0.0181, "step": 1825 }, { "epoch": 0.4057777777777778, "grad_norm": 0.8509750366210938, "learning_rate": 0.00011910913140311803, "loss": 1.1935, "step": 1826 }, { "epoch": 0.406, "grad_norm": 0.929954469203949, "learning_rate": 0.00011906458797327396, "loss": 1.7651, "step": 1827 }, { "epoch": 0.4062222222222222, "grad_norm": 0.8800256252288818, "learning_rate": 0.00011902004454342985, "loss": 1.4242, "step": 1828 }, { "epoch": 0.40644444444444444, "grad_norm": 0.7143679857254028, "learning_rate": 0.00011897550111358576, "loss": 0.8556, "step": 1829 }, { "epoch": 0.4066666666666667, "grad_norm": 0.06789492070674896, "learning_rate": 0.00011893095768374165, "loss": 0.0177, "step": 1830 }, { "epoch": 0.4068888888888889, "grad_norm": 0.06996285915374756, "learning_rate": 0.00011888641425389756, "loss": 0.0176, "step": 1831 }, { "epoch": 0.4071111111111111, "grad_norm": 0.06404642760753632, "learning_rate": 0.00011884187082405345, "loss": 0.0172, "step": 1832 }, { "epoch": 0.4073333333333333, "grad_norm": 0.6085385680198669, "learning_rate": 0.00011879732739420937, "loss": 0.8157, "step": 1833 }, { "epoch": 0.40755555555555556, "grad_norm": 0.6177558302879333, "learning_rate": 0.00011875278396436527, "loss": 0.6847, "step": 1834 }, { "epoch": 0.4077777777777778, "grad_norm": 1.0499635934829712, "learning_rate": 0.00011870824053452116, "loss": 1.6652, "step": 1835 }, { "epoch": 0.408, "grad_norm": 1.0262913703918457, "learning_rate": 0.00011866369710467707, "loss": 1.6644, "step": 1836 }, { "epoch": 0.4082222222222222, "grad_norm": 0.6902967095375061, "learning_rate": 0.00011861915367483296, "loss": 0.8171, "step": 1837 }, { "epoch": 0.40844444444444444, "grad_norm": 0.07804334908723831, "learning_rate": 0.00011857461024498887, "loss": 0.0196, "step": 1838 }, { "epoch": 0.4086666666666667, "grad_norm": 0.9165345430374146, "learning_rate": 0.00011853006681514479, "loss": 0.8781, "step": 1839 }, { "epoch": 0.4088888888888889, "grad_norm": 0.8940587639808655, "learning_rate": 0.00011848552338530068, "loss": 1.5392, "step": 1840 }, { "epoch": 0.4091111111111111, "grad_norm": 1.045750379562378, "learning_rate": 0.00011844097995545658, "loss": 1.6757, "step": 1841 }, { "epoch": 0.4093333333333333, "grad_norm": 1.0602422952651978, "learning_rate": 0.00011839643652561247, "loss": 1.5899, "step": 1842 }, { "epoch": 0.40955555555555556, "grad_norm": 0.7604432702064514, "learning_rate": 0.00011835189309576838, "loss": 0.9366, "step": 1843 }, { "epoch": 0.4097777777777778, "grad_norm": 1.1251657009124756, "learning_rate": 0.00011830734966592427, "loss": 1.7115, "step": 1844 }, { "epoch": 0.41, "grad_norm": 1.1490317583084106, "learning_rate": 0.00011826280623608019, "loss": 1.644, "step": 1845 }, { "epoch": 0.4102222222222222, "grad_norm": 1.2152633666992188, "learning_rate": 0.0001182182628062361, "loss": 1.8793, "step": 1846 }, { "epoch": 0.41044444444444445, "grad_norm": 1.234496831893921, "learning_rate": 0.00011817371937639199, "loss": 1.2693, "step": 1847 }, { "epoch": 0.4106666666666667, "grad_norm": 1.104956030845642, "learning_rate": 0.00011812917594654789, "loss": 1.3936, "step": 1848 }, { "epoch": 0.4108888888888889, "grad_norm": 0.8974543213844299, "learning_rate": 0.00011808463251670378, "loss": 0.5888, "step": 1849 }, { "epoch": 0.4111111111111111, "grad_norm": 0.5945538282394409, "learning_rate": 0.00011804008908685969, "loss": 0.4158, "step": 1850 }, { "epoch": 0.41133333333333333, "grad_norm": 0.713422417640686, "learning_rate": 0.00011799554565701561, "loss": 1.1928, "step": 1851 }, { "epoch": 0.41155555555555556, "grad_norm": 0.8967742919921875, "learning_rate": 0.0001179510022271715, "loss": 2.1678, "step": 1852 }, { "epoch": 0.4117777777777778, "grad_norm": 0.8929163813591003, "learning_rate": 0.0001179064587973274, "loss": 2.1692, "step": 1853 }, { "epoch": 0.412, "grad_norm": 0.8850563168525696, "learning_rate": 0.0001178619153674833, "loss": 2.1808, "step": 1854 }, { "epoch": 0.4122222222222222, "grad_norm": 0.8336376547813416, "learning_rate": 0.0001178173719376392, "loss": 2.5249, "step": 1855 }, { "epoch": 0.41244444444444445, "grad_norm": 0.5886844396591187, "learning_rate": 0.0001177728285077951, "loss": 1.2025, "step": 1856 }, { "epoch": 0.4126666666666667, "grad_norm": 0.9081274271011353, "learning_rate": 0.00011772828507795101, "loss": 2.1547, "step": 1857 }, { "epoch": 0.4128888888888889, "grad_norm": 0.07262948900461197, "learning_rate": 0.00011768374164810692, "loss": 0.0127, "step": 1858 }, { "epoch": 0.4131111111111111, "grad_norm": 0.5580977201461792, "learning_rate": 0.00011763919821826281, "loss": 1.0184, "step": 1859 }, { "epoch": 0.41333333333333333, "grad_norm": 0.9046309590339661, "learning_rate": 0.00011759465478841872, "loss": 2.1869, "step": 1860 }, { "epoch": 0.41355555555555557, "grad_norm": 1.056998372077942, "learning_rate": 0.00011755011135857461, "loss": 1.9969, "step": 1861 }, { "epoch": 0.4137777777777778, "grad_norm": 1.0445380210876465, "learning_rate": 0.00011750556792873051, "loss": 2.2281, "step": 1862 }, { "epoch": 0.414, "grad_norm": 0.9709343910217285, "learning_rate": 0.00011746102449888643, "loss": 1.7402, "step": 1863 }, { "epoch": 0.4142222222222222, "grad_norm": 0.9131556153297424, "learning_rate": 0.00011741648106904232, "loss": 1.7472, "step": 1864 }, { "epoch": 0.41444444444444445, "grad_norm": 0.8268289566040039, "learning_rate": 0.00011737193763919823, "loss": 2.1226, "step": 1865 }, { "epoch": 0.4146666666666667, "grad_norm": 0.8866710066795349, "learning_rate": 0.00011732739420935412, "loss": 1.819, "step": 1866 }, { "epoch": 0.41488888888888886, "grad_norm": 0.9805562496185303, "learning_rate": 0.00011728285077951003, "loss": 2.2956, "step": 1867 }, { "epoch": 0.4151111111111111, "grad_norm": 1.0550199747085571, "learning_rate": 0.00011723830734966592, "loss": 1.9666, "step": 1868 }, { "epoch": 0.41533333333333333, "grad_norm": 1.027684211730957, "learning_rate": 0.00011719376391982184, "loss": 1.7633, "step": 1869 }, { "epoch": 0.41555555555555557, "grad_norm": 0.07350092381238937, "learning_rate": 0.00011714922048997774, "loss": 0.0169, "step": 1870 }, { "epoch": 0.4157777777777778, "grad_norm": 0.7602355480194092, "learning_rate": 0.00011710467706013363, "loss": 0.8183, "step": 1871 }, { "epoch": 0.416, "grad_norm": 0.21299070119857788, "learning_rate": 0.00011706013363028954, "loss": 0.0191, "step": 1872 }, { "epoch": 0.4162222222222222, "grad_norm": 1.0202507972717285, "learning_rate": 0.00011701559020044543, "loss": 1.8291, "step": 1873 }, { "epoch": 0.41644444444444445, "grad_norm": 0.9267475008964539, "learning_rate": 0.00011697104677060134, "loss": 1.546, "step": 1874 }, { "epoch": 0.4166666666666667, "grad_norm": 1.0164928436279297, "learning_rate": 0.00011692650334075726, "loss": 1.6923, "step": 1875 }, { "epoch": 0.41688888888888886, "grad_norm": 1.030597448348999, "learning_rate": 0.00011688195991091315, "loss": 1.858, "step": 1876 }, { "epoch": 0.4171111111111111, "grad_norm": 0.9687047600746155, "learning_rate": 0.00011683741648106905, "loss": 1.6026, "step": 1877 }, { "epoch": 0.41733333333333333, "grad_norm": 1.1525236368179321, "learning_rate": 0.00011679287305122494, "loss": 1.6997, "step": 1878 }, { "epoch": 0.41755555555555557, "grad_norm": 0.07384829223155975, "learning_rate": 0.00011674832962138085, "loss": 0.0186, "step": 1879 }, { "epoch": 0.4177777777777778, "grad_norm": 0.07113431394100189, "learning_rate": 0.00011670378619153674, "loss": 0.0181, "step": 1880 }, { "epoch": 0.418, "grad_norm": 1.0431753396987915, "learning_rate": 0.00011665924276169266, "loss": 1.8063, "step": 1881 }, { "epoch": 0.4182222222222222, "grad_norm": 1.003762125968933, "learning_rate": 0.00011661469933184857, "loss": 1.8644, "step": 1882 }, { "epoch": 0.41844444444444445, "grad_norm": 0.08568891882896423, "learning_rate": 0.00011657015590200446, "loss": 0.0177, "step": 1883 }, { "epoch": 0.4186666666666667, "grad_norm": 0.08027679473161697, "learning_rate": 0.00011652561247216036, "loss": 0.0176, "step": 1884 }, { "epoch": 0.41888888888888887, "grad_norm": 1.0662174224853516, "learning_rate": 0.00011648106904231626, "loss": 1.7168, "step": 1885 }, { "epoch": 0.4191111111111111, "grad_norm": 1.0285723209381104, "learning_rate": 0.00011643652561247216, "loss": 1.6087, "step": 1886 }, { "epoch": 0.41933333333333334, "grad_norm": 0.7417445182800293, "learning_rate": 0.00011639198218262808, "loss": 0.7676, "step": 1887 }, { "epoch": 0.41955555555555557, "grad_norm": 0.9185432195663452, "learning_rate": 0.00011634743875278397, "loss": 1.336, "step": 1888 }, { "epoch": 0.4197777777777778, "grad_norm": 1.0804965496063232, "learning_rate": 0.00011630289532293988, "loss": 1.5121, "step": 1889 }, { "epoch": 0.42, "grad_norm": 1.3549460172653198, "learning_rate": 0.00011625835189309577, "loss": 1.5925, "step": 1890 }, { "epoch": 0.4202222222222222, "grad_norm": 0.9999265670776367, "learning_rate": 0.00011621380846325167, "loss": 1.3402, "step": 1891 }, { "epoch": 0.42044444444444445, "grad_norm": 0.7168278694152832, "learning_rate": 0.00011616926503340757, "loss": 0.698, "step": 1892 }, { "epoch": 0.4206666666666667, "grad_norm": 0.2091141939163208, "learning_rate": 0.00011612472160356347, "loss": 0.0375, "step": 1893 }, { "epoch": 0.42088888888888887, "grad_norm": 1.0474592447280884, "learning_rate": 0.00011608017817371939, "loss": 1.0801, "step": 1894 }, { "epoch": 0.4211111111111111, "grad_norm": 1.3502936363220215, "learning_rate": 0.00011603563474387528, "loss": 1.3399, "step": 1895 }, { "epoch": 0.42133333333333334, "grad_norm": 1.0498988628387451, "learning_rate": 0.00011599109131403119, "loss": 1.3411, "step": 1896 }, { "epoch": 0.42155555555555557, "grad_norm": 0.9921227693557739, "learning_rate": 0.00011594654788418708, "loss": 1.1275, "step": 1897 }, { "epoch": 0.42177777777777775, "grad_norm": 0.766704261302948, "learning_rate": 0.00011590200445434298, "loss": 0.5538, "step": 1898 }, { "epoch": 0.422, "grad_norm": 1.1626564264297485, "learning_rate": 0.00011585746102449888, "loss": 1.1005, "step": 1899 }, { "epoch": 0.4222222222222222, "grad_norm": 1.2583063840866089, "learning_rate": 0.0001158129175946548, "loss": 1.0355, "step": 1900 }, { "epoch": 0.42244444444444446, "grad_norm": 1.069173812866211, "learning_rate": 0.0001157683741648107, "loss": 1.0298, "step": 1901 }, { "epoch": 0.4226666666666667, "grad_norm": 0.06103937700390816, "learning_rate": 0.00011572383073496659, "loss": 0.012, "step": 1902 }, { "epoch": 0.42288888888888887, "grad_norm": 0.07574369013309479, "learning_rate": 0.0001156792873051225, "loss": 0.0123, "step": 1903 }, { "epoch": 0.4231111111111111, "grad_norm": 0.5607222318649292, "learning_rate": 0.00011563474387527839, "loss": 1.0307, "step": 1904 }, { "epoch": 0.42333333333333334, "grad_norm": 0.619121253490448, "learning_rate": 0.0001155902004454343, "loss": 1.1993, "step": 1905 }, { "epoch": 0.4235555555555556, "grad_norm": 0.07177204638719559, "learning_rate": 0.00011554565701559021, "loss": 0.0122, "step": 1906 }, { "epoch": 0.42377777777777775, "grad_norm": 0.06914813816547394, "learning_rate": 0.00011550111358574612, "loss": 0.0125, "step": 1907 }, { "epoch": 0.424, "grad_norm": 0.06676662713289261, "learning_rate": 0.00011545657015590201, "loss": 0.0123, "step": 1908 }, { "epoch": 0.4242222222222222, "grad_norm": 0.8784866333007812, "learning_rate": 0.0001154120267260579, "loss": 2.1322, "step": 1909 }, { "epoch": 0.42444444444444446, "grad_norm": 0.9178574085235596, "learning_rate": 0.00011536748329621381, "loss": 2.1509, "step": 1910 }, { "epoch": 0.4246666666666667, "grad_norm": 0.8715436458587646, "learning_rate": 0.0001153229398663697, "loss": 1.9367, "step": 1911 }, { "epoch": 0.42488888888888887, "grad_norm": 0.8655092716217041, "learning_rate": 0.00011527839643652563, "loss": 1.8575, "step": 1912 }, { "epoch": 0.4251111111111111, "grad_norm": 1.0920130014419556, "learning_rate": 0.00011523385300668152, "loss": 2.1347, "step": 1913 }, { "epoch": 0.42533333333333334, "grad_norm": 0.8793624639511108, "learning_rate": 0.00011518930957683743, "loss": 1.7823, "step": 1914 }, { "epoch": 0.4255555555555556, "grad_norm": 0.9117141962051392, "learning_rate": 0.00011514476614699332, "loss": 1.7285, "step": 1915 }, { "epoch": 0.42577777777777776, "grad_norm": 0.866205096244812, "learning_rate": 0.00011510022271714921, "loss": 1.7268, "step": 1916 }, { "epoch": 0.426, "grad_norm": 1.1967967748641968, "learning_rate": 0.00011505567928730512, "loss": 2.0665, "step": 1917 }, { "epoch": 0.4262222222222222, "grad_norm": 0.9093246459960938, "learning_rate": 0.00011501113585746104, "loss": 1.8155, "step": 1918 }, { "epoch": 0.42644444444444446, "grad_norm": 1.162400484085083, "learning_rate": 0.00011496659242761694, "loss": 2.068, "step": 1919 }, { "epoch": 0.4266666666666667, "grad_norm": 0.978716254234314, "learning_rate": 0.00011492204899777283, "loss": 2.0323, "step": 1920 }, { "epoch": 0.4268888888888889, "grad_norm": 0.9247249960899353, "learning_rate": 0.00011487750556792874, "loss": 1.707, "step": 1921 }, { "epoch": 0.4271111111111111, "grad_norm": 0.08401922136545181, "learning_rate": 0.00011483296213808463, "loss": 0.0179, "step": 1922 }, { "epoch": 0.42733333333333334, "grad_norm": 0.081658273935318, "learning_rate": 0.00011478841870824054, "loss": 0.0181, "step": 1923 }, { "epoch": 0.4275555555555556, "grad_norm": 1.0231783390045166, "learning_rate": 0.00011474387527839646, "loss": 1.6638, "step": 1924 }, { "epoch": 0.42777777777777776, "grad_norm": 1.0630674362182617, "learning_rate": 0.00011469933184855235, "loss": 1.9074, "step": 1925 }, { "epoch": 0.428, "grad_norm": 1.016446590423584, "learning_rate": 0.00011465478841870825, "loss": 1.5796, "step": 1926 }, { "epoch": 0.4282222222222222, "grad_norm": 1.0378187894821167, "learning_rate": 0.00011461024498886414, "loss": 1.8205, "step": 1927 }, { "epoch": 0.42844444444444446, "grad_norm": 0.06329286843538284, "learning_rate": 0.00011456570155902005, "loss": 0.0175, "step": 1928 }, { "epoch": 0.42866666666666664, "grad_norm": 0.06730126589536667, "learning_rate": 0.00011452115812917594, "loss": 0.0173, "step": 1929 }, { "epoch": 0.4288888888888889, "grad_norm": 0.8092349767684937, "learning_rate": 0.00011447661469933186, "loss": 0.9686, "step": 1930 }, { "epoch": 0.4291111111111111, "grad_norm": 0.6625300049781799, "learning_rate": 0.00011443207126948777, "loss": 0.7906, "step": 1931 }, { "epoch": 0.42933333333333334, "grad_norm": 0.09340567141771317, "learning_rate": 0.00011438752783964366, "loss": 0.0185, "step": 1932 }, { "epoch": 0.4295555555555556, "grad_norm": 0.08747432380914688, "learning_rate": 0.00011434298440979956, "loss": 0.0186, "step": 1933 }, { "epoch": 0.42977777777777776, "grad_norm": 0.08998148888349533, "learning_rate": 0.00011429844097995546, "loss": 0.0179, "step": 1934 }, { "epoch": 0.43, "grad_norm": 0.9794445633888245, "learning_rate": 0.00011425389755011136, "loss": 1.8862, "step": 1935 }, { "epoch": 0.43022222222222223, "grad_norm": 1.0736924409866333, "learning_rate": 0.00011420935412026728, "loss": 1.8727, "step": 1936 }, { "epoch": 0.43044444444444446, "grad_norm": 1.05514395236969, "learning_rate": 0.00011416481069042317, "loss": 1.6888, "step": 1937 }, { "epoch": 0.43066666666666664, "grad_norm": 1.1766986846923828, "learning_rate": 0.00011412026726057908, "loss": 1.7106, "step": 1938 }, { "epoch": 0.4308888888888889, "grad_norm": 0.9431614875793457, "learning_rate": 0.00011407572383073497, "loss": 1.4491, "step": 1939 }, { "epoch": 0.4311111111111111, "grad_norm": 1.1172568798065186, "learning_rate": 0.00011403118040089087, "loss": 1.7868, "step": 1940 }, { "epoch": 0.43133333333333335, "grad_norm": 1.3255879878997803, "learning_rate": 0.00011398663697104677, "loss": 1.7341, "step": 1941 }, { "epoch": 0.4315555555555556, "grad_norm": 0.9136682152748108, "learning_rate": 0.00011394209354120268, "loss": 1.4765, "step": 1942 }, { "epoch": 0.43177777777777776, "grad_norm": 1.1250746250152588, "learning_rate": 0.00011389755011135859, "loss": 1.212, "step": 1943 }, { "epoch": 0.432, "grad_norm": 0.7082473039627075, "learning_rate": 0.00011385300668151448, "loss": 0.6973, "step": 1944 }, { "epoch": 0.43222222222222223, "grad_norm": 1.1732277870178223, "learning_rate": 0.00011380846325167039, "loss": 1.3756, "step": 1945 }, { "epoch": 0.43244444444444446, "grad_norm": 1.1802074909210205, "learning_rate": 0.00011376391982182628, "loss": 1.2276, "step": 1946 }, { "epoch": 0.43266666666666664, "grad_norm": 0.745093584060669, "learning_rate": 0.00011371937639198218, "loss": 0.6378, "step": 1947 }, { "epoch": 0.4328888888888889, "grad_norm": 1.0691252946853638, "learning_rate": 0.0001136748329621381, "loss": 1.076, "step": 1948 }, { "epoch": 0.4331111111111111, "grad_norm": 0.9302070140838623, "learning_rate": 0.000113630289532294, "loss": 0.9216, "step": 1949 }, { "epoch": 0.43333333333333335, "grad_norm": 0.9691843390464783, "learning_rate": 0.0001135857461024499, "loss": 0.6993, "step": 1950 }, { "epoch": 0.4335555555555556, "grad_norm": 0.5687994956970215, "learning_rate": 0.00011354120267260579, "loss": 0.8047, "step": 1951 }, { "epoch": 0.43377777777777776, "grad_norm": 0.5520983338356018, "learning_rate": 0.0001134966592427617, "loss": 1.0069, "step": 1952 }, { "epoch": 0.434, "grad_norm": 0.8625077605247498, "learning_rate": 0.00011345211581291759, "loss": 2.2169, "step": 1953 }, { "epoch": 0.43422222222222223, "grad_norm": 0.6881237030029297, "learning_rate": 0.00011340757238307351, "loss": 1.4639, "step": 1954 }, { "epoch": 0.43444444444444447, "grad_norm": 0.9478729367256165, "learning_rate": 0.00011336302895322941, "loss": 2.1246, "step": 1955 }, { "epoch": 0.43466666666666665, "grad_norm": 0.5995079874992371, "learning_rate": 0.0001133184855233853, "loss": 0.8358, "step": 1956 }, { "epoch": 0.4348888888888889, "grad_norm": 0.8051624298095703, "learning_rate": 0.00011327394209354121, "loss": 2.3354, "step": 1957 }, { "epoch": 0.4351111111111111, "grad_norm": 0.9365907907485962, "learning_rate": 0.0001132293986636971, "loss": 2.3088, "step": 1958 }, { "epoch": 0.43533333333333335, "grad_norm": 0.06474913656711578, "learning_rate": 0.00011318485523385301, "loss": 0.0119, "step": 1959 }, { "epoch": 0.43555555555555553, "grad_norm": 0.06161544471979141, "learning_rate": 0.00011314031180400893, "loss": 0.0116, "step": 1960 }, { "epoch": 0.43577777777777776, "grad_norm": 0.05895036458969116, "learning_rate": 0.00011309576837416482, "loss": 0.0117, "step": 1961 }, { "epoch": 0.436, "grad_norm": 0.058882202953100204, "learning_rate": 0.00011305122494432072, "loss": 0.0114, "step": 1962 }, { "epoch": 0.43622222222222223, "grad_norm": 0.845483124256134, "learning_rate": 0.00011300668151447662, "loss": 1.9667, "step": 1963 }, { "epoch": 0.43644444444444447, "grad_norm": 0.841730535030365, "learning_rate": 0.00011296213808463252, "loss": 2.0258, "step": 1964 }, { "epoch": 0.43666666666666665, "grad_norm": 0.85284024477005, "learning_rate": 0.00011291759465478841, "loss": 1.9691, "step": 1965 }, { "epoch": 0.4368888888888889, "grad_norm": 0.5937424898147583, "learning_rate": 0.00011287305122494432, "loss": 0.8792, "step": 1966 }, { "epoch": 0.4371111111111111, "grad_norm": 0.9278184175491333, "learning_rate": 0.00011282850779510024, "loss": 1.8017, "step": 1967 }, { "epoch": 0.43733333333333335, "grad_norm": 0.9445812106132507, "learning_rate": 0.00011278396436525613, "loss": 1.9442, "step": 1968 }, { "epoch": 0.43755555555555553, "grad_norm": 1.0501065254211426, "learning_rate": 0.00011273942093541203, "loss": 2.2303, "step": 1969 }, { "epoch": 0.43777777777777777, "grad_norm": 1.0860295295715332, "learning_rate": 0.00011269487750556793, "loss": 2.1409, "step": 1970 }, { "epoch": 0.438, "grad_norm": 1.156929850578308, "learning_rate": 0.00011265033407572383, "loss": 1.8208, "step": 1971 }, { "epoch": 0.43822222222222224, "grad_norm": 0.10455144941806793, "learning_rate": 0.00011260579064587972, "loss": 0.0198, "step": 1972 }, { "epoch": 0.43844444444444447, "grad_norm": 0.0996774211525917, "learning_rate": 0.00011256124721603564, "loss": 0.0196, "step": 1973 }, { "epoch": 0.43866666666666665, "grad_norm": 0.0888049378991127, "learning_rate": 0.00011251670378619155, "loss": 0.0188, "step": 1974 }, { "epoch": 0.4388888888888889, "grad_norm": 0.6885740756988525, "learning_rate": 0.00011247216035634744, "loss": 0.9559, "step": 1975 }, { "epoch": 0.4391111111111111, "grad_norm": 0.9182388186454773, "learning_rate": 0.00011242761692650334, "loss": 1.5226, "step": 1976 }, { "epoch": 0.43933333333333335, "grad_norm": 1.1160727739334106, "learning_rate": 0.00011238307349665924, "loss": 1.8567, "step": 1977 }, { "epoch": 0.43955555555555553, "grad_norm": 0.0670580118894577, "learning_rate": 0.00011233853006681514, "loss": 0.0177, "step": 1978 }, { "epoch": 0.43977777777777777, "grad_norm": 0.06605665385723114, "learning_rate": 0.00011229398663697106, "loss": 0.0174, "step": 1979 }, { "epoch": 0.44, "grad_norm": 0.07107221335172653, "learning_rate": 0.00011224944320712695, "loss": 0.0175, "step": 1980 }, { "epoch": 0.44022222222222224, "grad_norm": 0.06550829857587814, "learning_rate": 0.00011220489977728286, "loss": 0.0171, "step": 1981 }, { "epoch": 0.44044444444444447, "grad_norm": 0.06652254611253738, "learning_rate": 0.00011216035634743875, "loss": 0.0169, "step": 1982 }, { "epoch": 0.44066666666666665, "grad_norm": 1.0364837646484375, "learning_rate": 0.00011211581291759466, "loss": 1.8154, "step": 1983 }, { "epoch": 0.4408888888888889, "grad_norm": 1.0154790878295898, "learning_rate": 0.00011207126948775055, "loss": 1.5184, "step": 1984 }, { "epoch": 0.4411111111111111, "grad_norm": 0.985426664352417, "learning_rate": 0.00011202672605790647, "loss": 1.5944, "step": 1985 }, { "epoch": 0.44133333333333336, "grad_norm": 1.0827574729919434, "learning_rate": 0.00011198218262806237, "loss": 1.7389, "step": 1986 }, { "epoch": 0.44155555555555553, "grad_norm": 0.1062050461769104, "learning_rate": 0.00011193763919821826, "loss": 0.0204, "step": 1987 }, { "epoch": 0.44177777777777777, "grad_norm": 0.7836151719093323, "learning_rate": 0.00011189309576837417, "loss": 1.0073, "step": 1988 }, { "epoch": 0.442, "grad_norm": 0.09760961681604385, "learning_rate": 0.00011184855233853006, "loss": 0.0197, "step": 1989 }, { "epoch": 0.44222222222222224, "grad_norm": 1.1412192583084106, "learning_rate": 0.00011180400890868597, "loss": 1.7364, "step": 1990 }, { "epoch": 0.4424444444444444, "grad_norm": 0.9756342172622681, "learning_rate": 0.00011175946547884188, "loss": 1.3964, "step": 1991 }, { "epoch": 0.44266666666666665, "grad_norm": 0.13070262968540192, "learning_rate": 0.00011171492204899779, "loss": 0.0298, "step": 1992 }, { "epoch": 0.4428888888888889, "grad_norm": 0.7475621104240417, "learning_rate": 0.00011167037861915368, "loss": 0.5803, "step": 1993 }, { "epoch": 0.4431111111111111, "grad_norm": 1.2354745864868164, "learning_rate": 0.00011162583518930957, "loss": 1.7266, "step": 1994 }, { "epoch": 0.44333333333333336, "grad_norm": 1.1055632829666138, "learning_rate": 0.00011158129175946548, "loss": 1.3517, "step": 1995 }, { "epoch": 0.44355555555555554, "grad_norm": 1.1422277688980103, "learning_rate": 0.00011153674832962137, "loss": 1.7153, "step": 1996 }, { "epoch": 0.44377777777777777, "grad_norm": 1.0632517337799072, "learning_rate": 0.0001114922048997773, "loss": 1.3935, "step": 1997 }, { "epoch": 0.444, "grad_norm": 1.2863705158233643, "learning_rate": 0.0001114476614699332, "loss": 1.3315, "step": 1998 }, { "epoch": 0.44422222222222224, "grad_norm": 0.7513629198074341, "learning_rate": 0.0001114031180400891, "loss": 0.6531, "step": 1999 }, { "epoch": 0.4444444444444444, "grad_norm": 0.9309746026992798, "learning_rate": 0.00011135857461024499, "loss": 0.6415, "step": 2000 }, { "epoch": 0.44466666666666665, "grad_norm": 0.9425560235977173, "learning_rate": 0.00011131403118040088, "loss": 2.1203, "step": 2001 }, { "epoch": 0.4448888888888889, "grad_norm": 0.8981547951698303, "learning_rate": 0.00011126948775055679, "loss": 2.4304, "step": 2002 }, { "epoch": 0.4451111111111111, "grad_norm": 0.8358199000358582, "learning_rate": 0.00011122494432071271, "loss": 2.1793, "step": 2003 }, { "epoch": 0.44533333333333336, "grad_norm": 0.8026860952377319, "learning_rate": 0.00011118040089086861, "loss": 2.198, "step": 2004 }, { "epoch": 0.44555555555555554, "grad_norm": 0.8084419369697571, "learning_rate": 0.0001111358574610245, "loss": 2.1637, "step": 2005 }, { "epoch": 0.4457777777777778, "grad_norm": 0.0967579036951065, "learning_rate": 0.00011109131403118041, "loss": 0.0117, "step": 2006 }, { "epoch": 0.446, "grad_norm": 0.5831789374351501, "learning_rate": 0.0001110467706013363, "loss": 0.9959, "step": 2007 }, { "epoch": 0.44622222222222224, "grad_norm": 0.8523693084716797, "learning_rate": 0.00011100222717149221, "loss": 1.8045, "step": 2008 }, { "epoch": 0.4464444444444444, "grad_norm": 0.9027776122093201, "learning_rate": 0.00011095768374164813, "loss": 1.922, "step": 2009 }, { "epoch": 0.44666666666666666, "grad_norm": 0.9854663014411926, "learning_rate": 0.00011091314031180402, "loss": 1.8949, "step": 2010 }, { "epoch": 0.4468888888888889, "grad_norm": 1.163071632385254, "learning_rate": 0.00011086859688195992, "loss": 2.5929, "step": 2011 }, { "epoch": 0.4471111111111111, "grad_norm": 0.8055479526519775, "learning_rate": 0.00011082405345211582, "loss": 1.2043, "step": 2012 }, { "epoch": 0.44733333333333336, "grad_norm": 0.8408487439155579, "learning_rate": 0.00011077951002227172, "loss": 1.966, "step": 2013 }, { "epoch": 0.44755555555555554, "grad_norm": 0.8684518337249756, "learning_rate": 0.00011073496659242761, "loss": 1.9108, "step": 2014 }, { "epoch": 0.4477777777777778, "grad_norm": 1.0258240699768066, "learning_rate": 0.00011069042316258353, "loss": 2.0882, "step": 2015 }, { "epoch": 0.448, "grad_norm": 0.9539505839347839, "learning_rate": 0.00011064587973273944, "loss": 2.0714, "step": 2016 }, { "epoch": 0.44822222222222224, "grad_norm": 0.8587532639503479, "learning_rate": 0.00011060133630289533, "loss": 1.7786, "step": 2017 }, { "epoch": 0.4484444444444444, "grad_norm": 0.9189285039901733, "learning_rate": 0.00011055679287305123, "loss": 2.0443, "step": 2018 }, { "epoch": 0.44866666666666666, "grad_norm": 0.1925644874572754, "learning_rate": 0.00011051224944320713, "loss": 0.0213, "step": 2019 }, { "epoch": 0.4488888888888889, "grad_norm": 0.17474225163459778, "learning_rate": 0.00011046770601336303, "loss": 0.0198, "step": 2020 }, { "epoch": 0.4491111111111111, "grad_norm": 0.12866677343845367, "learning_rate": 0.00011042316258351895, "loss": 0.0168, "step": 2021 }, { "epoch": 0.4493333333333333, "grad_norm": 0.9263811111450195, "learning_rate": 0.00011037861915367484, "loss": 2.0245, "step": 2022 }, { "epoch": 0.44955555555555554, "grad_norm": 1.2182332277297974, "learning_rate": 0.00011033407572383075, "loss": 1.944, "step": 2023 }, { "epoch": 0.4497777777777778, "grad_norm": 0.9207272529602051, "learning_rate": 0.00011028953229398664, "loss": 1.7777, "step": 2024 }, { "epoch": 0.45, "grad_norm": 0.9211624264717102, "learning_rate": 0.00011024498886414254, "loss": 1.9517, "step": 2025 }, { "epoch": 0.45022222222222225, "grad_norm": 0.9342603087425232, "learning_rate": 0.00011020044543429844, "loss": 1.7293, "step": 2026 }, { "epoch": 0.4504444444444444, "grad_norm": 0.7406251430511475, "learning_rate": 0.00011015590200445436, "loss": 1.0289, "step": 2027 }, { "epoch": 0.45066666666666666, "grad_norm": 0.09853217005729675, "learning_rate": 0.00011011135857461026, "loss": 0.0172, "step": 2028 }, { "epoch": 0.4508888888888889, "grad_norm": 0.0647294893860817, "learning_rate": 0.00011006681514476615, "loss": 0.0167, "step": 2029 }, { "epoch": 0.45111111111111113, "grad_norm": 0.6249412894248962, "learning_rate": 0.00011002227171492206, "loss": 0.7176, "step": 2030 }, { "epoch": 0.4513333333333333, "grad_norm": 1.0367200374603271, "learning_rate": 0.00010997772828507795, "loss": 1.6925, "step": 2031 }, { "epoch": 0.45155555555555554, "grad_norm": 0.08039866387844086, "learning_rate": 0.00010993318485523386, "loss": 0.0182, "step": 2032 }, { "epoch": 0.4517777777777778, "grad_norm": 0.08283301442861557, "learning_rate": 0.00010988864142538977, "loss": 0.0183, "step": 2033 }, { "epoch": 0.452, "grad_norm": 1.053772211074829, "learning_rate": 0.00010984409799554567, "loss": 1.9159, "step": 2034 }, { "epoch": 0.45222222222222225, "grad_norm": 0.8648183941841125, "learning_rate": 0.00010979955456570157, "loss": 1.4673, "step": 2035 }, { "epoch": 0.4524444444444444, "grad_norm": 1.0042818784713745, "learning_rate": 0.00010975501113585746, "loss": 1.397, "step": 2036 }, { "epoch": 0.45266666666666666, "grad_norm": 0.6784095764160156, "learning_rate": 0.00010971046770601337, "loss": 0.7173, "step": 2037 }, { "epoch": 0.4528888888888889, "grad_norm": 0.09913664311170578, "learning_rate": 0.00010966592427616926, "loss": 0.0251, "step": 2038 }, { "epoch": 0.45311111111111113, "grad_norm": 0.0990590900182724, "learning_rate": 0.00010962138084632517, "loss": 0.0241, "step": 2039 }, { "epoch": 0.4533333333333333, "grad_norm": 0.09208090603351593, "learning_rate": 0.00010957683741648108, "loss": 0.0232, "step": 2040 }, { "epoch": 0.45355555555555555, "grad_norm": 1.100632667541504, "learning_rate": 0.00010953229398663698, "loss": 1.5947, "step": 2041 }, { "epoch": 0.4537777777777778, "grad_norm": 1.0162431001663208, "learning_rate": 0.00010948775055679288, "loss": 1.4251, "step": 2042 }, { "epoch": 0.454, "grad_norm": 1.1525739431381226, "learning_rate": 0.00010944320712694877, "loss": 1.2817, "step": 2043 }, { "epoch": 0.45422222222222225, "grad_norm": 0.8342036008834839, "learning_rate": 0.00010939866369710468, "loss": 0.6935, "step": 2044 }, { "epoch": 0.45444444444444443, "grad_norm": 0.8119909167289734, "learning_rate": 0.00010935412026726057, "loss": 0.6888, "step": 2045 }, { "epoch": 0.45466666666666666, "grad_norm": 1.0022333860397339, "learning_rate": 0.00010930957683741649, "loss": 1.4122, "step": 2046 }, { "epoch": 0.4548888888888889, "grad_norm": 1.043531060218811, "learning_rate": 0.0001092650334075724, "loss": 0.8783, "step": 2047 }, { "epoch": 0.45511111111111113, "grad_norm": 1.1184370517730713, "learning_rate": 0.00010922048997772829, "loss": 1.028, "step": 2048 }, { "epoch": 0.4553333333333333, "grad_norm": 0.9111670851707458, "learning_rate": 0.00010917594654788419, "loss": 0.7038, "step": 2049 }, { "epoch": 0.45555555555555555, "grad_norm": 0.9852802753448486, "learning_rate": 0.00010913140311804008, "loss": 0.68, "step": 2050 }, { "epoch": 0.4557777777777778, "grad_norm": 0.5692037343978882, "learning_rate": 0.00010908685968819599, "loss": 1.1471, "step": 2051 }, { "epoch": 0.456, "grad_norm": 0.060164306312799454, "learning_rate": 0.00010904231625835191, "loss": 0.0125, "step": 2052 }, { "epoch": 0.4562222222222222, "grad_norm": 0.05939817428588867, "learning_rate": 0.0001089977728285078, "loss": 0.0124, "step": 2053 }, { "epoch": 0.45644444444444443, "grad_norm": 0.9658234119415283, "learning_rate": 0.0001089532293986637, "loss": 2.4221, "step": 2054 }, { "epoch": 0.45666666666666667, "grad_norm": 0.608363926410675, "learning_rate": 0.0001089086859688196, "loss": 1.0087, "step": 2055 }, { "epoch": 0.4568888888888889, "grad_norm": 0.6269051432609558, "learning_rate": 0.0001088641425389755, "loss": 1.1338, "step": 2056 }, { "epoch": 0.45711111111111113, "grad_norm": 0.06373189389705658, "learning_rate": 0.0001088195991091314, "loss": 0.0117, "step": 2057 }, { "epoch": 0.4573333333333333, "grad_norm": 0.656608521938324, "learning_rate": 0.00010877505567928731, "loss": 1.0346, "step": 2058 }, { "epoch": 0.45755555555555555, "grad_norm": 0.7977051138877869, "learning_rate": 0.00010873051224944322, "loss": 1.8994, "step": 2059 }, { "epoch": 0.4577777777777778, "grad_norm": 0.8953185677528381, "learning_rate": 0.00010868596881959911, "loss": 1.9597, "step": 2060 }, { "epoch": 0.458, "grad_norm": 0.9071193933486938, "learning_rate": 0.00010864142538975502, "loss": 2.0342, "step": 2061 }, { "epoch": 0.4582222222222222, "grad_norm": 0.9120450019836426, "learning_rate": 0.00010859688195991091, "loss": 2.1583, "step": 2062 }, { "epoch": 0.45844444444444443, "grad_norm": 0.93471759557724, "learning_rate": 0.00010855233853006681, "loss": 1.7983, "step": 2063 }, { "epoch": 0.45866666666666667, "grad_norm": 1.089474081993103, "learning_rate": 0.00010850779510022273, "loss": 2.1283, "step": 2064 }, { "epoch": 0.4588888888888889, "grad_norm": 1.0539686679840088, "learning_rate": 0.00010846325167037862, "loss": 1.8288, "step": 2065 }, { "epoch": 0.45911111111111114, "grad_norm": 0.41104814410209656, "learning_rate": 0.00010841870824053453, "loss": 0.0214, "step": 2066 }, { "epoch": 0.4593333333333333, "grad_norm": 0.8927615284919739, "learning_rate": 0.00010837416481069042, "loss": 1.9371, "step": 2067 }, { "epoch": 0.45955555555555555, "grad_norm": 0.9341305494308472, "learning_rate": 0.00010832962138084633, "loss": 1.921, "step": 2068 }, { "epoch": 0.4597777777777778, "grad_norm": 1.0359078645706177, "learning_rate": 0.00010828507795100222, "loss": 2.3344, "step": 2069 }, { "epoch": 0.46, "grad_norm": 1.0806338787078857, "learning_rate": 0.00010824053452115814, "loss": 1.9512, "step": 2070 }, { "epoch": 0.4602222222222222, "grad_norm": 0.9538819193840027, "learning_rate": 0.00010819599109131404, "loss": 2.1274, "step": 2071 }, { "epoch": 0.46044444444444443, "grad_norm": 0.6135286688804626, "learning_rate": 0.00010815144766146993, "loss": 1.0131, "step": 2072 }, { "epoch": 0.46066666666666667, "grad_norm": 0.6279967427253723, "learning_rate": 0.00010810690423162584, "loss": 0.8183, "step": 2073 }, { "epoch": 0.4608888888888889, "grad_norm": 0.687468409538269, "learning_rate": 0.00010806236080178173, "loss": 0.8753, "step": 2074 }, { "epoch": 0.46111111111111114, "grad_norm": 1.2607934474945068, "learning_rate": 0.00010801781737193764, "loss": 2.009, "step": 2075 }, { "epoch": 0.4613333333333333, "grad_norm": 1.0350881814956665, "learning_rate": 0.00010797327394209356, "loss": 1.9684, "step": 2076 }, { "epoch": 0.46155555555555555, "grad_norm": 0.897770881652832, "learning_rate": 0.00010792873051224946, "loss": 1.678, "step": 2077 }, { "epoch": 0.4617777777777778, "grad_norm": 1.0721005201339722, "learning_rate": 0.00010788418708240535, "loss": 1.6479, "step": 2078 }, { "epoch": 0.462, "grad_norm": 0.6707905530929565, "learning_rate": 0.00010783964365256124, "loss": 0.7607, "step": 2079 }, { "epoch": 0.4622222222222222, "grad_norm": 1.0232561826705933, "learning_rate": 0.00010779510022271715, "loss": 0.9529, "step": 2080 }, { "epoch": 0.46244444444444444, "grad_norm": 1.1841635704040527, "learning_rate": 0.00010775055679287304, "loss": 2.0433, "step": 2081 }, { "epoch": 0.46266666666666667, "grad_norm": 1.104246973991394, "learning_rate": 0.00010770601336302897, "loss": 1.8199, "step": 2082 }, { "epoch": 0.4628888888888889, "grad_norm": 0.7725056409835815, "learning_rate": 0.00010766146993318487, "loss": 0.7594, "step": 2083 }, { "epoch": 0.4631111111111111, "grad_norm": 0.9705109596252441, "learning_rate": 0.00010761692650334077, "loss": 1.6149, "step": 2084 }, { "epoch": 0.4633333333333333, "grad_norm": 1.2132149934768677, "learning_rate": 0.00010757238307349666, "loss": 1.7415, "step": 2085 }, { "epoch": 0.46355555555555555, "grad_norm": 0.6384971737861633, "learning_rate": 0.00010752783964365255, "loss": 0.7416, "step": 2086 }, { "epoch": 0.4637777777777778, "grad_norm": 0.08440492302179337, "learning_rate": 0.00010748329621380846, "loss": 0.0227, "step": 2087 }, { "epoch": 0.464, "grad_norm": 1.0965360403060913, "learning_rate": 0.00010743875278396438, "loss": 1.8355, "step": 2088 }, { "epoch": 0.4642222222222222, "grad_norm": 0.9710419178009033, "learning_rate": 0.00010739420935412028, "loss": 1.4107, "step": 2089 }, { "epoch": 0.46444444444444444, "grad_norm": 1.1657572984695435, "learning_rate": 0.00010734966592427618, "loss": 1.529, "step": 2090 }, { "epoch": 0.4646666666666667, "grad_norm": 1.0341477394104004, "learning_rate": 0.00010730512249443208, "loss": 1.3314, "step": 2091 }, { "epoch": 0.4648888888888889, "grad_norm": 1.185089111328125, "learning_rate": 0.00010726057906458797, "loss": 1.3492, "step": 2092 }, { "epoch": 0.4651111111111111, "grad_norm": 1.172006607055664, "learning_rate": 0.00010721603563474388, "loss": 1.6585, "step": 2093 }, { "epoch": 0.4653333333333333, "grad_norm": 0.19929863512516022, "learning_rate": 0.0001071714922048998, "loss": 0.0307, "step": 2094 }, { "epoch": 0.46555555555555556, "grad_norm": 0.7375540137290955, "learning_rate": 0.00010712694877505569, "loss": 0.6284, "step": 2095 }, { "epoch": 0.4657777777777778, "grad_norm": 1.1733025312423706, "learning_rate": 0.0001070824053452116, "loss": 1.3572, "step": 2096 }, { "epoch": 0.466, "grad_norm": 1.2688745260238647, "learning_rate": 0.00010703786191536749, "loss": 1.1305, "step": 2097 }, { "epoch": 0.4662222222222222, "grad_norm": 0.3591971695423126, "learning_rate": 0.00010699331848552339, "loss": 0.0473, "step": 2098 }, { "epoch": 0.46644444444444444, "grad_norm": 0.7150940299034119, "learning_rate": 0.00010694877505567928, "loss": 0.4544, "step": 2099 }, { "epoch": 0.4666666666666667, "grad_norm": 0.8277695775032043, "learning_rate": 0.0001069042316258352, "loss": 0.8539, "step": 2100 }, { "epoch": 0.4668888888888889, "grad_norm": 0.6670131087303162, "learning_rate": 0.00010685968819599111, "loss": 1.1052, "step": 2101 }, { "epoch": 0.4671111111111111, "grad_norm": 0.06290578842163086, "learning_rate": 0.000106815144766147, "loss": 0.0125, "step": 2102 }, { "epoch": 0.4673333333333333, "grad_norm": 0.058846112340688705, "learning_rate": 0.0001067706013363029, "loss": 0.012, "step": 2103 }, { "epoch": 0.46755555555555556, "grad_norm": 0.537786602973938, "learning_rate": 0.0001067260579064588, "loss": 1.0569, "step": 2104 }, { "epoch": 0.4677777777777778, "grad_norm": 0.9007193446159363, "learning_rate": 0.0001066815144766147, "loss": 2.1002, "step": 2105 }, { "epoch": 0.468, "grad_norm": 0.527800440788269, "learning_rate": 0.00010663697104677062, "loss": 0.9616, "step": 2106 }, { "epoch": 0.4682222222222222, "grad_norm": 0.9083489775657654, "learning_rate": 0.00010659242761692651, "loss": 2.4556, "step": 2107 }, { "epoch": 0.46844444444444444, "grad_norm": 7.447436332702637, "learning_rate": 0.00010654788418708242, "loss": 1.1922, "step": 2108 }, { "epoch": 0.4686666666666667, "grad_norm": 0.6265543699264526, "learning_rate": 0.00010650334075723831, "loss": 1.0863, "step": 2109 }, { "epoch": 0.4688888888888889, "grad_norm": 0.0921320989727974, "learning_rate": 0.00010645879732739422, "loss": 0.0133, "step": 2110 }, { "epoch": 0.4691111111111111, "grad_norm": 0.12606237828731537, "learning_rate": 0.00010641425389755011, "loss": 0.014, "step": 2111 }, { "epoch": 0.4693333333333333, "grad_norm": 0.5374711155891418, "learning_rate": 0.00010636971046770601, "loss": 0.9771, "step": 2112 }, { "epoch": 0.46955555555555556, "grad_norm": 1.0356422662734985, "learning_rate": 0.00010632516703786193, "loss": 2.4527, "step": 2113 }, { "epoch": 0.4697777777777778, "grad_norm": 0.9254876375198364, "learning_rate": 0.00010628062360801782, "loss": 2.0676, "step": 2114 }, { "epoch": 0.47, "grad_norm": 0.7463611960411072, "learning_rate": 0.00010623608017817373, "loss": 0.9246, "step": 2115 }, { "epoch": 0.4702222222222222, "grad_norm": 1.0094149112701416, "learning_rate": 0.00010619153674832962, "loss": 1.8244, "step": 2116 }, { "epoch": 0.47044444444444444, "grad_norm": 0.995177149772644, "learning_rate": 0.00010614699331848553, "loss": 2.0105, "step": 2117 }, { "epoch": 0.4706666666666667, "grad_norm": 0.9415448904037476, "learning_rate": 0.00010610244988864142, "loss": 2.0042, "step": 2118 }, { "epoch": 0.4708888888888889, "grad_norm": 0.9262849688529968, "learning_rate": 0.00010605790645879734, "loss": 1.8853, "step": 2119 }, { "epoch": 0.4711111111111111, "grad_norm": 0.13545557856559753, "learning_rate": 0.00010601336302895324, "loss": 0.0187, "step": 2120 }, { "epoch": 0.4713333333333333, "grad_norm": 0.11557869613170624, "learning_rate": 0.00010596881959910913, "loss": 0.0182, "step": 2121 }, { "epoch": 0.47155555555555556, "grad_norm": 0.10856274515390396, "learning_rate": 0.00010592427616926504, "loss": 0.0175, "step": 2122 }, { "epoch": 0.4717777777777778, "grad_norm": 0.09006939828395844, "learning_rate": 0.00010587973273942093, "loss": 0.0166, "step": 2123 }, { "epoch": 0.472, "grad_norm": 0.08400023728609085, "learning_rate": 0.00010583518930957684, "loss": 0.0155, "step": 2124 }, { "epoch": 0.4722222222222222, "grad_norm": 0.6308079361915588, "learning_rate": 0.00010579064587973276, "loss": 0.9436, "step": 2125 }, { "epoch": 0.47244444444444444, "grad_norm": 0.6802711486816406, "learning_rate": 0.00010574610244988865, "loss": 0.8038, "step": 2126 }, { "epoch": 0.4726666666666667, "grad_norm": 1.0816277265548706, "learning_rate": 0.00010570155902004455, "loss": 1.6768, "step": 2127 }, { "epoch": 0.4728888888888889, "grad_norm": 0.15280470252037048, "learning_rate": 0.00010565701559020044, "loss": 0.0236, "step": 2128 }, { "epoch": 0.4731111111111111, "grad_norm": 0.13545870780944824, "learning_rate": 0.00010561247216035635, "loss": 0.0222, "step": 2129 }, { "epoch": 0.47333333333333333, "grad_norm": 0.9755976796150208, "learning_rate": 0.00010556792873051224, "loss": 2.1984, "step": 2130 }, { "epoch": 0.47355555555555556, "grad_norm": 1.1221860647201538, "learning_rate": 0.00010552338530066816, "loss": 1.9101, "step": 2131 }, { "epoch": 0.4737777777777778, "grad_norm": 1.06197190284729, "learning_rate": 0.00010547884187082407, "loss": 1.9334, "step": 2132 }, { "epoch": 0.474, "grad_norm": 0.6913040280342102, "learning_rate": 0.00010543429844097996, "loss": 0.8087, "step": 2133 }, { "epoch": 0.4742222222222222, "grad_norm": 0.10353131592273712, "learning_rate": 0.00010538975501113586, "loss": 0.0182, "step": 2134 }, { "epoch": 0.47444444444444445, "grad_norm": 0.8617785573005676, "learning_rate": 0.00010534521158129175, "loss": 1.0227, "step": 2135 }, { "epoch": 0.4746666666666667, "grad_norm": 0.9836186170578003, "learning_rate": 0.00010530066815144766, "loss": 1.6456, "step": 2136 }, { "epoch": 0.4748888888888889, "grad_norm": 0.07714372128248215, "learning_rate": 0.00010525612472160358, "loss": 0.0202, "step": 2137 }, { "epoch": 0.4751111111111111, "grad_norm": 0.7352029085159302, "learning_rate": 0.00010521158129175947, "loss": 0.8581, "step": 2138 }, { "epoch": 0.47533333333333333, "grad_norm": 0.8806314468383789, "learning_rate": 0.00010516703786191538, "loss": 1.4621, "step": 2139 }, { "epoch": 0.47555555555555556, "grad_norm": 1.0340739488601685, "learning_rate": 0.00010512249443207127, "loss": 1.6558, "step": 2140 }, { "epoch": 0.4757777777777778, "grad_norm": 0.9357542395591736, "learning_rate": 0.00010507795100222717, "loss": 0.0534, "step": 2141 }, { "epoch": 0.476, "grad_norm": 1.0452251434326172, "learning_rate": 0.00010503340757238307, "loss": 1.7621, "step": 2142 }, { "epoch": 0.4762222222222222, "grad_norm": 1.091395378112793, "learning_rate": 0.00010498886414253898, "loss": 1.5887, "step": 2143 }, { "epoch": 0.47644444444444445, "grad_norm": 1.1353317499160767, "learning_rate": 0.00010494432071269489, "loss": 1.7071, "step": 2144 }, { "epoch": 0.4766666666666667, "grad_norm": 0.9791475534439087, "learning_rate": 0.00010489977728285078, "loss": 1.3087, "step": 2145 }, { "epoch": 0.47688888888888886, "grad_norm": 1.2445948123931885, "learning_rate": 0.00010485523385300669, "loss": 1.6218, "step": 2146 }, { "epoch": 0.4771111111111111, "grad_norm": 0.684476912021637, "learning_rate": 0.00010481069042316258, "loss": 0.5532, "step": 2147 }, { "epoch": 0.47733333333333333, "grad_norm": 1.0223796367645264, "learning_rate": 0.00010476614699331848, "loss": 1.4684, "step": 2148 }, { "epoch": 0.47755555555555557, "grad_norm": 0.9877771735191345, "learning_rate": 0.0001047216035634744, "loss": 0.7886, "step": 2149 }, { "epoch": 0.4777777777777778, "grad_norm": 1.0899747610092163, "learning_rate": 0.0001046770601336303, "loss": 0.9223, "step": 2150 }, { "epoch": 0.478, "grad_norm": 0.7108315825462341, "learning_rate": 0.0001046325167037862, "loss": 1.2016, "step": 2151 }, { "epoch": 0.4782222222222222, "grad_norm": 0.06013140454888344, "learning_rate": 0.00010458797327394209, "loss": 0.0102, "step": 2152 }, { "epoch": 0.47844444444444445, "grad_norm": 0.8427261114120483, "learning_rate": 0.000104543429844098, "loss": 2.311, "step": 2153 }, { "epoch": 0.4786666666666667, "grad_norm": 1.2041535377502441, "learning_rate": 0.00010449888641425389, "loss": 2.3181, "step": 2154 }, { "epoch": 0.47888888888888886, "grad_norm": 0.8807494640350342, "learning_rate": 0.00010445434298440981, "loss": 1.9016, "step": 2155 }, { "epoch": 0.4791111111111111, "grad_norm": 0.5941738486289978, "learning_rate": 0.00010440979955456571, "loss": 1.1593, "step": 2156 }, { "epoch": 0.47933333333333333, "grad_norm": 0.05396668612957001, "learning_rate": 0.0001043652561247216, "loss": 0.0105, "step": 2157 }, { "epoch": 0.47955555555555557, "grad_norm": 0.056162334978580475, "learning_rate": 0.00010432071269487751, "loss": 0.01, "step": 2158 }, { "epoch": 0.4797777777777778, "grad_norm": 0.543596625328064, "learning_rate": 0.0001042761692650334, "loss": 0.9017, "step": 2159 }, { "epoch": 0.48, "grad_norm": 0.8763737082481384, "learning_rate": 0.00010423162583518931, "loss": 2.3425, "step": 2160 }, { "epoch": 0.4802222222222222, "grad_norm": 0.9465508460998535, "learning_rate": 0.00010418708240534523, "loss": 1.934, "step": 2161 }, { "epoch": 0.48044444444444445, "grad_norm": 0.9368391633033752, "learning_rate": 0.00010414253897550113, "loss": 2.1625, "step": 2162 }, { "epoch": 0.4806666666666667, "grad_norm": 0.8468746542930603, "learning_rate": 0.00010409799554565702, "loss": 1.8851, "step": 2163 }, { "epoch": 0.48088888888888887, "grad_norm": 0.9411273002624512, "learning_rate": 0.00010405345211581292, "loss": 1.9171, "step": 2164 }, { "epoch": 0.4811111111111111, "grad_norm": 0.9668144583702087, "learning_rate": 0.00010400890868596882, "loss": 1.8995, "step": 2165 }, { "epoch": 0.48133333333333334, "grad_norm": 1.0552144050598145, "learning_rate": 0.00010396436525612471, "loss": 1.92, "step": 2166 }, { "epoch": 0.48155555555555557, "grad_norm": 0.8945801854133606, "learning_rate": 0.00010391982182628064, "loss": 2.0224, "step": 2167 }, { "epoch": 0.4817777777777778, "grad_norm": 0.8795874714851379, "learning_rate": 0.00010387527839643654, "loss": 1.8932, "step": 2168 }, { "epoch": 0.482, "grad_norm": 0.7880940437316895, "learning_rate": 0.00010383073496659244, "loss": 0.981, "step": 2169 }, { "epoch": 0.4822222222222222, "grad_norm": 0.06655468791723251, "learning_rate": 0.00010378619153674833, "loss": 0.0142, "step": 2170 }, { "epoch": 0.48244444444444445, "grad_norm": 0.06633251905441284, "learning_rate": 0.00010374164810690423, "loss": 0.0146, "step": 2171 }, { "epoch": 0.4826666666666667, "grad_norm": 0.0680522546172142, "learning_rate": 0.00010369710467706013, "loss": 0.0143, "step": 2172 }, { "epoch": 0.48288888888888887, "grad_norm": 0.6770047545433044, "learning_rate": 0.00010365256124721605, "loss": 1.0753, "step": 2173 }, { "epoch": 0.4831111111111111, "grad_norm": 1.0793815851211548, "learning_rate": 0.00010360801781737196, "loss": 1.7203, "step": 2174 }, { "epoch": 0.48333333333333334, "grad_norm": 0.8015415668487549, "learning_rate": 0.00010356347438752785, "loss": 1.7094, "step": 2175 }, { "epoch": 0.48355555555555557, "grad_norm": 1.0239602327346802, "learning_rate": 0.00010351893095768375, "loss": 1.9498, "step": 2176 }, { "epoch": 0.48377777777777775, "grad_norm": 1.029447078704834, "learning_rate": 0.00010347438752783964, "loss": 1.9248, "step": 2177 }, { "epoch": 0.484, "grad_norm": 0.9458478689193726, "learning_rate": 0.00010342984409799555, "loss": 1.7154, "step": 2178 }, { "epoch": 0.4842222222222222, "grad_norm": 0.06287504732608795, "learning_rate": 0.00010338530066815147, "loss": 0.0172, "step": 2179 }, { "epoch": 0.48444444444444446, "grad_norm": 0.07798685878515244, "learning_rate": 0.00010334075723830736, "loss": 0.0179, "step": 2180 }, { "epoch": 0.4846666666666667, "grad_norm": 0.06901486217975616, "learning_rate": 0.00010329621380846327, "loss": 0.0182, "step": 2181 }, { "epoch": 0.48488888888888887, "grad_norm": 1.101205587387085, "learning_rate": 0.00010325167037861916, "loss": 1.6031, "step": 2182 }, { "epoch": 0.4851111111111111, "grad_norm": 1.084505319595337, "learning_rate": 0.00010320712694877506, "loss": 1.6042, "step": 2183 }, { "epoch": 0.48533333333333334, "grad_norm": 0.10013191401958466, "learning_rate": 0.00010316258351893095, "loss": 0.0203, "step": 2184 }, { "epoch": 0.4855555555555556, "grad_norm": 0.08007735759019852, "learning_rate": 0.00010311804008908686, "loss": 0.0207, "step": 2185 }, { "epoch": 0.48577777777777775, "grad_norm": 1.1262269020080566, "learning_rate": 0.00010307349665924278, "loss": 1.7542, "step": 2186 }, { "epoch": 0.486, "grad_norm": 1.2522791624069214, "learning_rate": 0.00010302895322939867, "loss": 1.7026, "step": 2187 }, { "epoch": 0.4862222222222222, "grad_norm": 1.145750880241394, "learning_rate": 0.00010298440979955458, "loss": 1.6401, "step": 2188 }, { "epoch": 0.48644444444444446, "grad_norm": 0.6675021052360535, "learning_rate": 0.00010293986636971047, "loss": 0.7744, "step": 2189 }, { "epoch": 0.4866666666666667, "grad_norm": 0.12333891540765762, "learning_rate": 0.00010289532293986637, "loss": 0.0266, "step": 2190 }, { "epoch": 0.48688888888888887, "grad_norm": 0.678531289100647, "learning_rate": 0.00010285077951002227, "loss": 0.66, "step": 2191 }, { "epoch": 0.4871111111111111, "grad_norm": 1.046586036682129, "learning_rate": 0.00010280623608017818, "loss": 1.3619, "step": 2192 }, { "epoch": 0.48733333333333334, "grad_norm": 1.0906909704208374, "learning_rate": 0.00010276169265033409, "loss": 1.6156, "step": 2193 }, { "epoch": 0.4875555555555556, "grad_norm": 1.0561549663543701, "learning_rate": 0.00010271714922048998, "loss": 1.455, "step": 2194 }, { "epoch": 0.48777777777777775, "grad_norm": 0.9658767580986023, "learning_rate": 0.00010267260579064589, "loss": 1.372, "step": 2195 }, { "epoch": 0.488, "grad_norm": 0.9055988192558289, "learning_rate": 0.00010262806236080178, "loss": 0.8257, "step": 2196 }, { "epoch": 0.4882222222222222, "grad_norm": 0.7421156764030457, "learning_rate": 0.00010258351893095768, "loss": 0.6387, "step": 2197 }, { "epoch": 0.48844444444444446, "grad_norm": 1.0203956365585327, "learning_rate": 0.0001025389755011136, "loss": 1.2457, "step": 2198 }, { "epoch": 0.4886666666666667, "grad_norm": 1.1960588693618774, "learning_rate": 0.0001024944320712695, "loss": 1.2971, "step": 2199 }, { "epoch": 0.4888888888888889, "grad_norm": 0.9425092935562134, "learning_rate": 0.0001024498886414254, "loss": 0.5784, "step": 2200 }, { "epoch": 0.4891111111111111, "grad_norm": 0.8036244511604309, "learning_rate": 0.00010240534521158129, "loss": 2.0153, "step": 2201 }, { "epoch": 0.48933333333333334, "grad_norm": 0.9643092155456543, "learning_rate": 0.0001023608017817372, "loss": 2.0731, "step": 2202 }, { "epoch": 0.4895555555555556, "grad_norm": 0.5496547222137451, "learning_rate": 0.00010231625835189309, "loss": 1.0441, "step": 2203 }, { "epoch": 0.48977777777777776, "grad_norm": 0.052178915590047836, "learning_rate": 0.00010227171492204901, "loss": 0.0109, "step": 2204 }, { "epoch": 0.49, "grad_norm": 0.054271552711725235, "learning_rate": 0.00010222717149220491, "loss": 0.0106, "step": 2205 }, { "epoch": 0.4902222222222222, "grad_norm": 0.6506833434104919, "learning_rate": 0.0001021826280623608, "loss": 1.151, "step": 2206 }, { "epoch": 0.49044444444444446, "grad_norm": 0.9006673097610474, "learning_rate": 0.00010213808463251671, "loss": 2.2136, "step": 2207 }, { "epoch": 0.49066666666666664, "grad_norm": 0.8316347002983093, "learning_rate": 0.0001020935412026726, "loss": 2.2916, "step": 2208 }, { "epoch": 0.4908888888888889, "grad_norm": 0.07608042657375336, "learning_rate": 0.00010204899777282851, "loss": 0.0124, "step": 2209 }, { "epoch": 0.4911111111111111, "grad_norm": 0.07590346783399582, "learning_rate": 0.00010200445434298443, "loss": 0.0125, "step": 2210 }, { "epoch": 0.49133333333333334, "grad_norm": 0.07187937945127487, "learning_rate": 0.00010195991091314032, "loss": 0.012, "step": 2211 }, { "epoch": 0.4915555555555556, "grad_norm": 0.6782304644584656, "learning_rate": 0.00010191536748329622, "loss": 1.0925, "step": 2212 }, { "epoch": 0.49177777777777776, "grad_norm": 0.8945388197898865, "learning_rate": 0.00010187082405345212, "loss": 2.0201, "step": 2213 }, { "epoch": 0.492, "grad_norm": 0.8869574666023254, "learning_rate": 0.00010182628062360802, "loss": 1.7918, "step": 2214 }, { "epoch": 0.4922222222222222, "grad_norm": 0.9882270097732544, "learning_rate": 0.00010178173719376391, "loss": 2.2219, "step": 2215 }, { "epoch": 0.49244444444444446, "grad_norm": 1.0089894533157349, "learning_rate": 0.00010173719376391983, "loss": 2.1878, "step": 2216 }, { "epoch": 0.49266666666666664, "grad_norm": 0.9218000173568726, "learning_rate": 0.00010169265033407574, "loss": 1.8475, "step": 2217 }, { "epoch": 0.4928888888888889, "grad_norm": 0.8486325740814209, "learning_rate": 0.00010164810690423163, "loss": 1.7585, "step": 2218 }, { "epoch": 0.4931111111111111, "grad_norm": 0.9325646162033081, "learning_rate": 0.00010160356347438753, "loss": 1.9068, "step": 2219 }, { "epoch": 0.49333333333333335, "grad_norm": 1.0260847806930542, "learning_rate": 0.00010155902004454343, "loss": 1.8463, "step": 2220 }, { "epoch": 0.4935555555555556, "grad_norm": 0.8245062828063965, "learning_rate": 0.00010151447661469933, "loss": 1.8072, "step": 2221 }, { "epoch": 0.49377777777777776, "grad_norm": 1.05905020236969, "learning_rate": 0.00010146993318485525, "loss": 1.7947, "step": 2222 }, { "epoch": 0.494, "grad_norm": 0.06895928084850311, "learning_rate": 0.00010142538975501114, "loss": 0.0156, "step": 2223 }, { "epoch": 0.49422222222222223, "grad_norm": 0.9304447174072266, "learning_rate": 0.00010138084632516705, "loss": 1.8109, "step": 2224 }, { "epoch": 0.49444444444444446, "grad_norm": 0.6271647810935974, "learning_rate": 0.00010133630289532294, "loss": 0.9784, "step": 2225 }, { "epoch": 0.49466666666666664, "grad_norm": 0.10684725642204285, "learning_rate": 0.00010129175946547884, "loss": 0.0192, "step": 2226 }, { "epoch": 0.4948888888888889, "grad_norm": 0.09946753084659576, "learning_rate": 0.00010124721603563474, "loss": 0.0185, "step": 2227 }, { "epoch": 0.4951111111111111, "grad_norm": 1.025982141494751, "learning_rate": 0.00010120267260579065, "loss": 1.6198, "step": 2228 }, { "epoch": 0.49533333333333335, "grad_norm": 1.0194659233093262, "learning_rate": 0.00010115812917594656, "loss": 2.1222, "step": 2229 }, { "epoch": 0.4955555555555556, "grad_norm": 0.9168062806129456, "learning_rate": 0.00010111358574610245, "loss": 1.8122, "step": 2230 }, { "epoch": 0.49577777777777776, "grad_norm": 0.8633151054382324, "learning_rate": 0.00010106904231625836, "loss": 1.4762, "step": 2231 }, { "epoch": 0.496, "grad_norm": 0.9805095791816711, "learning_rate": 0.00010102449888641425, "loss": 1.8171, "step": 2232 }, { "epoch": 0.49622222222222223, "grad_norm": 0.9416176676750183, "learning_rate": 0.00010097995545657015, "loss": 1.5266, "step": 2233 }, { "epoch": 0.49644444444444447, "grad_norm": 0.6914428472518921, "learning_rate": 0.00010093541202672607, "loss": 0.8921, "step": 2234 }, { "epoch": 0.49666666666666665, "grad_norm": 0.06182475388050079, "learning_rate": 0.00010089086859688197, "loss": 0.018, "step": 2235 }, { "epoch": 0.4968888888888889, "grad_norm": 0.7663688063621521, "learning_rate": 0.00010084632516703787, "loss": 0.9712, "step": 2236 }, { "epoch": 0.4971111111111111, "grad_norm": 0.9623875617980957, "learning_rate": 0.00010080178173719376, "loss": 1.7805, "step": 2237 }, { "epoch": 0.49733333333333335, "grad_norm": 0.812954306602478, "learning_rate": 0.00010075723830734967, "loss": 0.8818, "step": 2238 }, { "epoch": 0.49755555555555553, "grad_norm": 0.8574005961418152, "learning_rate": 0.00010071269487750556, "loss": 0.9017, "step": 2239 }, { "epoch": 0.49777777777777776, "grad_norm": 1.052270770072937, "learning_rate": 0.00010066815144766148, "loss": 1.6329, "step": 2240 }, { "epoch": 0.498, "grad_norm": 1.0629431009292603, "learning_rate": 0.00010062360801781738, "loss": 1.5515, "step": 2241 }, { "epoch": 0.49822222222222223, "grad_norm": 1.1712193489074707, "learning_rate": 0.00010057906458797328, "loss": 1.6491, "step": 2242 }, { "epoch": 0.49844444444444447, "grad_norm": 1.318710207939148, "learning_rate": 0.00010053452115812918, "loss": 1.5213, "step": 2243 }, { "epoch": 0.49866666666666665, "grad_norm": 0.9559906721115112, "learning_rate": 0.00010048997772828507, "loss": 1.2708, "step": 2244 }, { "epoch": 0.4988888888888889, "grad_norm": 0.9218617081642151, "learning_rate": 0.00010044543429844098, "loss": 0.7616, "step": 2245 }, { "epoch": 0.4991111111111111, "grad_norm": 1.2992888689041138, "learning_rate": 0.0001004008908685969, "loss": 1.2365, "step": 2246 }, { "epoch": 0.49933333333333335, "grad_norm": 0.9246402382850647, "learning_rate": 0.00010035634743875279, "loss": 1.2666, "step": 2247 }, { "epoch": 0.49955555555555553, "grad_norm": 1.1523358821868896, "learning_rate": 0.0001003118040089087, "loss": 1.3827, "step": 2248 }, { "epoch": 0.49977777777777777, "grad_norm": 0.19934004545211792, "learning_rate": 0.00010026726057906459, "loss": 0.0396, "step": 2249 }, { "epoch": 0.5, "grad_norm": 0.7492770552635193, "learning_rate": 0.00010022271714922049, "loss": 0.4563, "step": 2250 }, { "epoch": 0.5002222222222222, "grad_norm": 0.051572561264038086, "learning_rate": 0.00010017817371937638, "loss": 0.0103, "step": 2251 }, { "epoch": 0.5004444444444445, "grad_norm": 0.05010450258851051, "learning_rate": 0.00010013363028953232, "loss": 0.0102, "step": 2252 }, { "epoch": 0.5006666666666667, "grad_norm": 0.7738996148109436, "learning_rate": 0.00010008908685968821, "loss": 2.2673, "step": 2253 }, { "epoch": 0.5008888888888889, "grad_norm": 0.5947201251983643, "learning_rate": 0.00010004454342984411, "loss": 1.0109, "step": 2254 }, { "epoch": 0.5011111111111111, "grad_norm": 0.07506557554006577, "learning_rate": 0.0001, "loss": 0.0122, "step": 2255 }, { "epoch": 0.5013333333333333, "grad_norm": 0.0709841400384903, "learning_rate": 9.99554565701559e-05, "loss": 0.0119, "step": 2256 }, { "epoch": 0.5015555555555555, "grad_norm": 0.793897271156311, "learning_rate": 9.991091314031182e-05, "loss": 1.9008, "step": 2257 }, { "epoch": 0.5017777777777778, "grad_norm": 1.026395320892334, "learning_rate": 9.986636971046771e-05, "loss": 2.1726, "step": 2258 }, { "epoch": 0.502, "grad_norm": 0.9329989552497864, "learning_rate": 9.982182628062361e-05, "loss": 2.034, "step": 2259 }, { "epoch": 0.5022222222222222, "grad_norm": 0.5686019062995911, "learning_rate": 9.977728285077952e-05, "loss": 0.9681, "step": 2260 }, { "epoch": 0.5024444444444445, "grad_norm": 0.17110589146614075, "learning_rate": 9.973273942093542e-05, "loss": 0.0237, "step": 2261 }, { "epoch": 0.5026666666666667, "grad_norm": 0.9597615599632263, "learning_rate": 9.968819599109132e-05, "loss": 1.9351, "step": 2262 }, { "epoch": 0.5028888888888889, "grad_norm": 0.8988699913024902, "learning_rate": 9.964365256124722e-05, "loss": 2.0789, "step": 2263 }, { "epoch": 0.5031111111111111, "grad_norm": 1.0947890281677246, "learning_rate": 9.959910913140313e-05, "loss": 2.2087, "step": 2264 }, { "epoch": 0.5033333333333333, "grad_norm": 0.9448829889297485, "learning_rate": 9.955456570155902e-05, "loss": 1.6314, "step": 2265 }, { "epoch": 0.5035555555555555, "grad_norm": 0.6050695776939392, "learning_rate": 9.951002227171494e-05, "loss": 0.7855, "step": 2266 }, { "epoch": 0.5037777777777778, "grad_norm": 0.0719807967543602, "learning_rate": 9.946547884187083e-05, "loss": 0.016, "step": 2267 }, { "epoch": 0.504, "grad_norm": 0.07161426544189453, "learning_rate": 9.942093541202673e-05, "loss": 0.0151, "step": 2268 }, { "epoch": 0.5042222222222222, "grad_norm": 0.06885481625795364, "learning_rate": 9.937639198218264e-05, "loss": 0.0157, "step": 2269 }, { "epoch": 0.5044444444444445, "grad_norm": 0.06610265374183655, "learning_rate": 9.933184855233853e-05, "loss": 0.0163, "step": 2270 }, { "epoch": 0.5046666666666667, "grad_norm": 1.0415818691253662, "learning_rate": 9.928730512249444e-05, "loss": 1.9357, "step": 2271 }, { "epoch": 0.5048888888888889, "grad_norm": 1.081796646118164, "learning_rate": 9.924276169265034e-05, "loss": 1.6581, "step": 2272 }, { "epoch": 0.5051111111111111, "grad_norm": 0.9375271201133728, "learning_rate": 9.919821826280625e-05, "loss": 1.8193, "step": 2273 }, { "epoch": 0.5053333333333333, "grad_norm": 0.9642285108566284, "learning_rate": 9.915367483296214e-05, "loss": 1.6289, "step": 2274 }, { "epoch": 0.5055555555555555, "grad_norm": 1.1919479370117188, "learning_rate": 9.910913140311804e-05, "loss": 2.1311, "step": 2275 }, { "epoch": 0.5057777777777778, "grad_norm": 1.0379412174224854, "learning_rate": 9.906458797327395e-05, "loss": 1.7015, "step": 2276 }, { "epoch": 0.506, "grad_norm": 0.7220401763916016, "learning_rate": 9.902004454342984e-05, "loss": 0.9768, "step": 2277 }, { "epoch": 0.5062222222222222, "grad_norm": 0.0648246705532074, "learning_rate": 9.897550111358576e-05, "loss": 0.0176, "step": 2278 }, { "epoch": 0.5064444444444445, "grad_norm": 0.06818456947803497, "learning_rate": 9.893095768374165e-05, "loss": 0.0179, "step": 2279 }, { "epoch": 0.5066666666666667, "grad_norm": 0.07543423771858215, "learning_rate": 9.888641425389756e-05, "loss": 0.018, "step": 2280 }, { "epoch": 0.5068888888888889, "grad_norm": 1.0633699893951416, "learning_rate": 9.884187082405346e-05, "loss": 1.578, "step": 2281 }, { "epoch": 0.5071111111111111, "grad_norm": 0.11469082534313202, "learning_rate": 9.879732739420935e-05, "loss": 0.0209, "step": 2282 }, { "epoch": 0.5073333333333333, "grad_norm": 0.10379460453987122, "learning_rate": 9.875278396436526e-05, "loss": 0.02, "step": 2283 }, { "epoch": 0.5075555555555555, "grad_norm": 0.09670916199684143, "learning_rate": 9.870824053452117e-05, "loss": 0.019, "step": 2284 }, { "epoch": 0.5077777777777778, "grad_norm": 1.0629053115844727, "learning_rate": 9.866369710467707e-05, "loss": 1.845, "step": 2285 }, { "epoch": 0.508, "grad_norm": 1.166548252105713, "learning_rate": 9.861915367483296e-05, "loss": 1.8971, "step": 2286 }, { "epoch": 0.5082222222222222, "grad_norm": 1.0978573560714722, "learning_rate": 9.857461024498887e-05, "loss": 1.6318, "step": 2287 }, { "epoch": 0.5084444444444445, "grad_norm": 0.13032492995262146, "learning_rate": 9.853006681514477e-05, "loss": 0.0259, "step": 2288 }, { "epoch": 0.5086666666666667, "grad_norm": 1.131226897239685, "learning_rate": 9.848552338530067e-05, "loss": 1.5446, "step": 2289 }, { "epoch": 0.5088888888888888, "grad_norm": 1.1936326026916504, "learning_rate": 9.844097995545658e-05, "loss": 1.6931, "step": 2290 }, { "epoch": 0.5091111111111111, "grad_norm": 1.0433292388916016, "learning_rate": 9.839643652561248e-05, "loss": 1.6301, "step": 2291 }, { "epoch": 0.5093333333333333, "grad_norm": 0.991683840751648, "learning_rate": 9.835189309576838e-05, "loss": 1.4689, "step": 2292 }, { "epoch": 0.5095555555555555, "grad_norm": 1.026808500289917, "learning_rate": 9.830734966592427e-05, "loss": 1.3458, "step": 2293 }, { "epoch": 0.5097777777777778, "grad_norm": 0.7094257473945618, "learning_rate": 9.826280623608018e-05, "loss": 0.5261, "step": 2294 }, { "epoch": 0.51, "grad_norm": 0.722606360912323, "learning_rate": 9.821826280623608e-05, "loss": 0.5935, "step": 2295 }, { "epoch": 0.5102222222222222, "grad_norm": 1.1530025005340576, "learning_rate": 9.817371937639198e-05, "loss": 1.3226, "step": 2296 }, { "epoch": 0.5104444444444445, "grad_norm": 0.14330358803272247, "learning_rate": 9.81291759465479e-05, "loss": 0.0357, "step": 2297 }, { "epoch": 0.5106666666666667, "grad_norm": 1.1050106287002563, "learning_rate": 9.808463251670379e-05, "loss": 1.0314, "step": 2298 }, { "epoch": 0.5108888888888888, "grad_norm": 1.1559122800827026, "learning_rate": 9.804008908685969e-05, "loss": 1.084, "step": 2299 }, { "epoch": 0.5111111111111111, "grad_norm": 0.9388430118560791, "learning_rate": 9.79955456570156e-05, "loss": 0.6384, "step": 2300 }, { "epoch": 0.5113333333333333, "grad_norm": 0.5832968950271606, "learning_rate": 9.79510022271715e-05, "loss": 1.085, "step": 2301 }, { "epoch": 0.5115555555555555, "grad_norm": 0.7767675518989563, "learning_rate": 9.79064587973274e-05, "loss": 2.0396, "step": 2302 }, { "epoch": 0.5117777777777778, "grad_norm": 0.5899970531463623, "learning_rate": 9.78619153674833e-05, "loss": 0.9667, "step": 2303 }, { "epoch": 0.512, "grad_norm": 0.8278191089630127, "learning_rate": 9.78173719376392e-05, "loss": 2.134, "step": 2304 }, { "epoch": 0.5122222222222222, "grad_norm": 0.5901010632514954, "learning_rate": 9.77728285077951e-05, "loss": 1.0668, "step": 2305 }, { "epoch": 0.5124444444444445, "grad_norm": 0.0631280392408371, "learning_rate": 9.772828507795102e-05, "loss": 0.0116, "step": 2306 }, { "epoch": 0.5126666666666667, "grad_norm": 0.06386229395866394, "learning_rate": 9.768374164810691e-05, "loss": 0.0114, "step": 2307 }, { "epoch": 0.5128888888888888, "grad_norm": 0.6113215684890747, "learning_rate": 9.763919821826281e-05, "loss": 0.9675, "step": 2308 }, { "epoch": 0.5131111111111111, "grad_norm": 0.8982253670692444, "learning_rate": 9.759465478841872e-05, "loss": 2.0637, "step": 2309 }, { "epoch": 0.5133333333333333, "grad_norm": 0.8227818608283997, "learning_rate": 9.755011135857461e-05, "loss": 2.1668, "step": 2310 }, { "epoch": 0.5135555555555555, "grad_norm": 0.9095910787582397, "learning_rate": 9.750556792873052e-05, "loss": 2.2258, "step": 2311 }, { "epoch": 0.5137777777777778, "grad_norm": 1.043130874633789, "learning_rate": 9.746102449888642e-05, "loss": 2.5546, "step": 2312 }, { "epoch": 0.514, "grad_norm": 0.9570296406745911, "learning_rate": 9.741648106904233e-05, "loss": 2.1633, "step": 2313 }, { "epoch": 0.5142222222222222, "grad_norm": 0.5847756862640381, "learning_rate": 9.737193763919822e-05, "loss": 0.9857, "step": 2314 }, { "epoch": 0.5144444444444445, "grad_norm": 1.1674765348434448, "learning_rate": 9.732739420935412e-05, "loss": 1.876, "step": 2315 }, { "epoch": 0.5146666666666667, "grad_norm": 1.0518763065338135, "learning_rate": 9.728285077951003e-05, "loss": 2.0836, "step": 2316 }, { "epoch": 0.5148888888888888, "grad_norm": 0.8954287767410278, "learning_rate": 9.723830734966592e-05, "loss": 1.9936, "step": 2317 }, { "epoch": 0.5151111111111111, "grad_norm": 0.9926402568817139, "learning_rate": 9.719376391982184e-05, "loss": 1.9524, "step": 2318 }, { "epoch": 0.5153333333333333, "grad_norm": 1.602445125579834, "learning_rate": 9.714922048997773e-05, "loss": 1.5352, "step": 2319 }, { "epoch": 0.5155555555555555, "grad_norm": 0.09388995170593262, "learning_rate": 9.710467706013364e-05, "loss": 0.0163, "step": 2320 }, { "epoch": 0.5157777777777778, "grad_norm": 0.0820835530757904, "learning_rate": 9.706013363028954e-05, "loss": 0.0164, "step": 2321 }, { "epoch": 0.516, "grad_norm": 0.07170173525810242, "learning_rate": 9.701559020044543e-05, "loss": 0.016, "step": 2322 }, { "epoch": 0.5162222222222222, "grad_norm": 0.6576219201087952, "learning_rate": 9.697104677060134e-05, "loss": 0.8959, "step": 2323 }, { "epoch": 0.5164444444444445, "grad_norm": 1.153394103050232, "learning_rate": 9.692650334075724e-05, "loss": 2.3199, "step": 2324 }, { "epoch": 0.5166666666666667, "grad_norm": 0.977983832359314, "learning_rate": 9.688195991091315e-05, "loss": 1.7632, "step": 2325 }, { "epoch": 0.5168888888888888, "grad_norm": 0.9710626006126404, "learning_rate": 9.683741648106904e-05, "loss": 1.6654, "step": 2326 }, { "epoch": 0.5171111111111111, "grad_norm": 0.9058797955513, "learning_rate": 9.679287305122495e-05, "loss": 1.6685, "step": 2327 }, { "epoch": 0.5173333333333333, "grad_norm": 0.8887612223625183, "learning_rate": 9.674832962138085e-05, "loss": 1.7818, "step": 2328 }, { "epoch": 0.5175555555555555, "grad_norm": 0.9914031028747559, "learning_rate": 9.670378619153674e-05, "loss": 1.7536, "step": 2329 }, { "epoch": 0.5177777777777778, "grad_norm": 0.9197384119033813, "learning_rate": 9.665924276169266e-05, "loss": 1.74, "step": 2330 }, { "epoch": 0.518, "grad_norm": 1.1414260864257812, "learning_rate": 9.661469933184855e-05, "loss": 1.8903, "step": 2331 }, { "epoch": 0.5182222222222223, "grad_norm": 3.304169178009033, "learning_rate": 9.657015590200446e-05, "loss": 0.9065, "step": 2332 }, { "epoch": 0.5184444444444445, "grad_norm": 4.9997334480285645, "learning_rate": 9.652561247216037e-05, "loss": 0.0771, "step": 2333 }, { "epoch": 0.5186666666666667, "grad_norm": 0.09176503121852875, "learning_rate": 9.648106904231626e-05, "loss": 0.0186, "step": 2334 }, { "epoch": 0.5188888888888888, "grad_norm": 1.1866215467453003, "learning_rate": 9.643652561247216e-05, "loss": 1.6262, "step": 2335 }, { "epoch": 0.5191111111111111, "grad_norm": 0.07120410352945328, "learning_rate": 9.639198218262807e-05, "loss": 0.019, "step": 2336 }, { "epoch": 0.5193333333333333, "grad_norm": 0.07103080302476883, "learning_rate": 9.634743875278397e-05, "loss": 0.0195, "step": 2337 }, { "epoch": 0.5195555555555555, "grad_norm": 0.06759145110845566, "learning_rate": 9.630289532293986e-05, "loss": 0.0186, "step": 2338 }, { "epoch": 0.5197777777777778, "grad_norm": 1.2702637910842896, "learning_rate": 9.625835189309578e-05, "loss": 2.0136, "step": 2339 }, { "epoch": 0.52, "grad_norm": 1.1234042644500732, "learning_rate": 9.621380846325168e-05, "loss": 1.5986, "step": 2340 }, { "epoch": 0.5202222222222223, "grad_norm": 1.0025362968444824, "learning_rate": 9.616926503340757e-05, "loss": 1.3816, "step": 2341 }, { "epoch": 0.5204444444444445, "grad_norm": 0.9683412909507751, "learning_rate": 9.612472160356349e-05, "loss": 1.7117, "step": 2342 }, { "epoch": 0.5206666666666667, "grad_norm": 0.6754570603370667, "learning_rate": 9.608017817371938e-05, "loss": 0.7684, "step": 2343 }, { "epoch": 0.5208888888888888, "grad_norm": 1.094579815864563, "learning_rate": 9.603563474387528e-05, "loss": 1.6406, "step": 2344 }, { "epoch": 0.5211111111111111, "grad_norm": 1.0832680463790894, "learning_rate": 9.599109131403119e-05, "loss": 1.2147, "step": 2345 }, { "epoch": 0.5213333333333333, "grad_norm": 0.7980796098709106, "learning_rate": 9.59465478841871e-05, "loss": 0.8512, "step": 2346 }, { "epoch": 0.5215555555555556, "grad_norm": 0.7709640264511108, "learning_rate": 9.590200445434299e-05, "loss": 0.6093, "step": 2347 }, { "epoch": 0.5217777777777778, "grad_norm": 1.0753897428512573, "learning_rate": 9.585746102449889e-05, "loss": 1.26, "step": 2348 }, { "epoch": 0.522, "grad_norm": 1.1210321187973022, "learning_rate": 9.58129175946548e-05, "loss": 1.3757, "step": 2349 }, { "epoch": 0.5222222222222223, "grad_norm": 1.2580091953277588, "learning_rate": 9.576837416481069e-05, "loss": 1.1165, "step": 2350 }, { "epoch": 0.5224444444444445, "grad_norm": 0.12348097562789917, "learning_rate": 9.572383073496661e-05, "loss": 0.0121, "step": 2351 }, { "epoch": 0.5226666666666666, "grad_norm": 0.05580771714448929, "learning_rate": 9.56792873051225e-05, "loss": 0.0115, "step": 2352 }, { "epoch": 0.5228888888888888, "grad_norm": 0.8565195798873901, "learning_rate": 9.56347438752784e-05, "loss": 1.1554, "step": 2353 }, { "epoch": 0.5231111111111111, "grad_norm": 0.0494968518614769, "learning_rate": 9.559020044543431e-05, "loss": 0.0113, "step": 2354 }, { "epoch": 0.5233333333333333, "grad_norm": 1.011706829071045, "learning_rate": 9.55456570155902e-05, "loss": 2.2168, "step": 2355 }, { "epoch": 0.5235555555555556, "grad_norm": 0.7530580759048462, "learning_rate": 9.550111358574611e-05, "loss": 1.9735, "step": 2356 }, { "epoch": 0.5237777777777778, "grad_norm": 0.5390753149986267, "learning_rate": 9.545657015590201e-05, "loss": 0.9808, "step": 2357 }, { "epoch": 0.524, "grad_norm": 0.06390012800693512, "learning_rate": 9.541202672605792e-05, "loss": 0.0113, "step": 2358 }, { "epoch": 0.5242222222222223, "grad_norm": 0.06459398567676544, "learning_rate": 9.536748329621381e-05, "loss": 0.0115, "step": 2359 }, { "epoch": 0.5244444444444445, "grad_norm": 0.779170036315918, "learning_rate": 9.532293986636972e-05, "loss": 1.8657, "step": 2360 }, { "epoch": 0.5246666666666666, "grad_norm": 0.969144344329834, "learning_rate": 9.527839643652562e-05, "loss": 1.8891, "step": 2361 }, { "epoch": 0.5248888888888888, "grad_norm": 0.9192339777946472, "learning_rate": 9.523385300668151e-05, "loss": 1.1894, "step": 2362 }, { "epoch": 0.5251111111111111, "grad_norm": 0.9349969625473022, "learning_rate": 9.518930957683743e-05, "loss": 1.8199, "step": 2363 }, { "epoch": 0.5253333333333333, "grad_norm": 1.0784986019134521, "learning_rate": 9.514476614699332e-05, "loss": 1.6971, "step": 2364 }, { "epoch": 0.5255555555555556, "grad_norm": 0.8992215394973755, "learning_rate": 9.510022271714923e-05, "loss": 1.8382, "step": 2365 }, { "epoch": 0.5257777777777778, "grad_norm": 1.3966472148895264, "learning_rate": 9.505567928730512e-05, "loss": 1.9623, "step": 2366 }, { "epoch": 0.526, "grad_norm": 0.8984045386314392, "learning_rate": 9.501113585746103e-05, "loss": 1.5869, "step": 2367 }, { "epoch": 0.5262222222222223, "grad_norm": 0.9143683314323425, "learning_rate": 9.496659242761693e-05, "loss": 1.9555, "step": 2368 }, { "epoch": 0.5264444444444445, "grad_norm": 0.9720048308372498, "learning_rate": 9.492204899777282e-05, "loss": 2.0711, "step": 2369 }, { "epoch": 0.5266666666666666, "grad_norm": 0.07726185023784637, "learning_rate": 9.487750556792874e-05, "loss": 0.0151, "step": 2370 }, { "epoch": 0.5268888888888889, "grad_norm": 0.06836960464715958, "learning_rate": 9.483296213808463e-05, "loss": 0.0151, "step": 2371 }, { "epoch": 0.5271111111111111, "grad_norm": 0.06594307720661163, "learning_rate": 9.478841870824054e-05, "loss": 0.0152, "step": 2372 }, { "epoch": 0.5273333333333333, "grad_norm": 0.6360456943511963, "learning_rate": 9.474387527839644e-05, "loss": 0.7369, "step": 2373 }, { "epoch": 0.5275555555555556, "grad_norm": 0.18018539249897003, "learning_rate": 9.469933184855234e-05, "loss": 0.023, "step": 2374 }, { "epoch": 0.5277777777777778, "grad_norm": 0.663093090057373, "learning_rate": 9.465478841870824e-05, "loss": 1.0362, "step": 2375 }, { "epoch": 0.528, "grad_norm": 0.9098090529441833, "learning_rate": 9.461024498886415e-05, "loss": 1.6369, "step": 2376 }, { "epoch": 0.5282222222222223, "grad_norm": 1.1311285495758057, "learning_rate": 9.456570155902005e-05, "loss": 1.8766, "step": 2377 }, { "epoch": 0.5284444444444445, "grad_norm": 0.9926170706748962, "learning_rate": 9.452115812917594e-05, "loss": 1.6138, "step": 2378 }, { "epoch": 0.5286666666666666, "grad_norm": 1.0175913572311401, "learning_rate": 9.447661469933185e-05, "loss": 1.6726, "step": 2379 }, { "epoch": 0.5288888888888889, "grad_norm": 1.056575894355774, "learning_rate": 9.443207126948775e-05, "loss": 1.7264, "step": 2380 }, { "epoch": 0.5291111111111111, "grad_norm": 1.1231061220169067, "learning_rate": 9.438752783964365e-05, "loss": 1.0777, "step": 2381 }, { "epoch": 0.5293333333333333, "grad_norm": 0.7852345705032349, "learning_rate": 9.434298440979957e-05, "loss": 0.8974, "step": 2382 }, { "epoch": 0.5295555555555556, "grad_norm": 0.7592169046401978, "learning_rate": 9.429844097995546e-05, "loss": 0.9902, "step": 2383 }, { "epoch": 0.5297777777777778, "grad_norm": 0.07275737076997757, "learning_rate": 9.425389755011136e-05, "loss": 0.0192, "step": 2384 }, { "epoch": 0.53, "grad_norm": 0.9310175180435181, "learning_rate": 9.420935412026727e-05, "loss": 1.7347, "step": 2385 }, { "epoch": 0.5302222222222223, "grad_norm": 1.182883620262146, "learning_rate": 9.416481069042317e-05, "loss": 1.5165, "step": 2386 }, { "epoch": 0.5304444444444445, "grad_norm": 1.1949394941329956, "learning_rate": 9.412026726057906e-05, "loss": 1.8244, "step": 2387 }, { "epoch": 0.5306666666666666, "grad_norm": 1.121900200843811, "learning_rate": 9.407572383073497e-05, "loss": 1.9119, "step": 2388 }, { "epoch": 0.5308888888888889, "grad_norm": 0.10859935730695724, "learning_rate": 9.403118040089088e-05, "loss": 0.0262, "step": 2389 }, { "epoch": 0.5311111111111111, "grad_norm": 0.7552645802497864, "learning_rate": 9.398663697104677e-05, "loss": 0.797, "step": 2390 }, { "epoch": 0.5313333333333333, "grad_norm": 1.0325798988342285, "learning_rate": 9.394209354120269e-05, "loss": 1.3011, "step": 2391 }, { "epoch": 0.5315555555555556, "grad_norm": 1.0349661111831665, "learning_rate": 9.389755011135858e-05, "loss": 1.6267, "step": 2392 }, { "epoch": 0.5317777777777778, "grad_norm": 1.0448760986328125, "learning_rate": 9.385300668151448e-05, "loss": 1.3726, "step": 2393 }, { "epoch": 0.532, "grad_norm": 1.186620831489563, "learning_rate": 9.380846325167039e-05, "loss": 1.5912, "step": 2394 }, { "epoch": 0.5322222222222223, "grad_norm": 0.7071300148963928, "learning_rate": 9.376391982182628e-05, "loss": 0.7175, "step": 2395 }, { "epoch": 0.5324444444444445, "grad_norm": 0.6794847249984741, "learning_rate": 9.371937639198219e-05, "loss": 0.6406, "step": 2396 }, { "epoch": 0.5326666666666666, "grad_norm": 0.978138267993927, "learning_rate": 9.367483296213809e-05, "loss": 1.0755, "step": 2397 }, { "epoch": 0.5328888888888889, "grad_norm": 1.1468720436096191, "learning_rate": 9.3630289532294e-05, "loss": 1.3126, "step": 2398 }, { "epoch": 0.5331111111111111, "grad_norm": 1.0386849641799927, "learning_rate": 9.358574610244989e-05, "loss": 1.1505, "step": 2399 }, { "epoch": 0.5333333333333333, "grad_norm": 1.2117102146148682, "learning_rate": 9.35412026726058e-05, "loss": 1.1134, "step": 2400 }, { "epoch": 0.5335555555555556, "grad_norm": 0.6301453113555908, "learning_rate": 9.34966592427617e-05, "loss": 1.1273, "step": 2401 }, { "epoch": 0.5337777777777778, "grad_norm": 0.5409572124481201, "learning_rate": 9.345211581291759e-05, "loss": 1.0298, "step": 2402 }, { "epoch": 0.534, "grad_norm": 0.05003529414534569, "learning_rate": 9.340757238307351e-05, "loss": 0.0111, "step": 2403 }, { "epoch": 0.5342222222222223, "grad_norm": 0.547648549079895, "learning_rate": 9.33630289532294e-05, "loss": 0.9771, "step": 2404 }, { "epoch": 0.5344444444444445, "grad_norm": 0.8171392679214478, "learning_rate": 9.331848552338531e-05, "loss": 2.2949, "step": 2405 }, { "epoch": 0.5346666666666666, "grad_norm": 0.5529095530509949, "learning_rate": 9.327394209354121e-05, "loss": 1.2102, "step": 2406 }, { "epoch": 0.5348888888888889, "grad_norm": 0.622061550617218, "learning_rate": 9.32293986636971e-05, "loss": 1.2203, "step": 2407 }, { "epoch": 0.5351111111111111, "grad_norm": 0.4969142973423004, "learning_rate": 9.318485523385301e-05, "loss": 1.1624, "step": 2408 }, { "epoch": 0.5353333333333333, "grad_norm": 0.9374632239341736, "learning_rate": 9.314031180400892e-05, "loss": 2.6747, "step": 2409 }, { "epoch": 0.5355555555555556, "grad_norm": 0.8105266690254211, "learning_rate": 9.309576837416482e-05, "loss": 2.2292, "step": 2410 }, { "epoch": 0.5357777777777778, "grad_norm": 0.8644961714744568, "learning_rate": 9.305122494432071e-05, "loss": 1.1513, "step": 2411 }, { "epoch": 0.536, "grad_norm": 0.05260290950536728, "learning_rate": 9.300668151447662e-05, "loss": 0.0104, "step": 2412 }, { "epoch": 0.5362222222222223, "grad_norm": 0.053732600063085556, "learning_rate": 9.296213808463252e-05, "loss": 0.0108, "step": 2413 }, { "epoch": 0.5364444444444444, "grad_norm": 0.05255819112062454, "learning_rate": 9.291759465478841e-05, "loss": 0.0103, "step": 2414 }, { "epoch": 0.5366666666666666, "grad_norm": 0.5198526978492737, "learning_rate": 9.287305122494433e-05, "loss": 0.9938, "step": 2415 }, { "epoch": 0.5368888888888889, "grad_norm": 0.9449132680892944, "learning_rate": 9.282850779510023e-05, "loss": 2.1894, "step": 2416 }, { "epoch": 0.5371111111111111, "grad_norm": 0.8379865288734436, "learning_rate": 9.278396436525613e-05, "loss": 1.9937, "step": 2417 }, { "epoch": 0.5373333333333333, "grad_norm": 0.9000791311264038, "learning_rate": 9.273942093541204e-05, "loss": 1.9498, "step": 2418 }, { "epoch": 0.5375555555555556, "grad_norm": 1.0631499290466309, "learning_rate": 9.269487750556793e-05, "loss": 2.1338, "step": 2419 }, { "epoch": 0.5377777777777778, "grad_norm": 0.9265825748443604, "learning_rate": 9.265033407572383e-05, "loss": 2.1304, "step": 2420 }, { "epoch": 0.538, "grad_norm": 0.8675844669342041, "learning_rate": 9.260579064587974e-05, "loss": 2.0546, "step": 2421 }, { "epoch": 0.5382222222222223, "grad_norm": 0.14311416447162628, "learning_rate": 9.256124721603564e-05, "loss": 0.0225, "step": 2422 }, { "epoch": 0.5384444444444444, "grad_norm": 0.6641564965248108, "learning_rate": 9.251670378619154e-05, "loss": 1.1475, "step": 2423 }, { "epoch": 0.5386666666666666, "grad_norm": 0.9647601246833801, "learning_rate": 9.247216035634745e-05, "loss": 1.9414, "step": 2424 }, { "epoch": 0.5388888888888889, "grad_norm": 0.9718881845474243, "learning_rate": 9.242761692650335e-05, "loss": 2.031, "step": 2425 }, { "epoch": 0.5391111111111111, "grad_norm": 0.9469605088233948, "learning_rate": 9.238307349665924e-05, "loss": 1.9717, "step": 2426 }, { "epoch": 0.5393333333333333, "grad_norm": 0.9883223176002502, "learning_rate": 9.233853006681516e-05, "loss": 1.919, "step": 2427 }, { "epoch": 0.5395555555555556, "grad_norm": 1.31260085105896, "learning_rate": 9.229398663697105e-05, "loss": 2.0415, "step": 2428 }, { "epoch": 0.5397777777777778, "grad_norm": 0.9640005230903625, "learning_rate": 9.224944320712695e-05, "loss": 1.8964, "step": 2429 }, { "epoch": 0.54, "grad_norm": 0.7503467202186584, "learning_rate": 9.220489977728286e-05, "loss": 0.8938, "step": 2430 }, { "epoch": 0.5402222222222223, "grad_norm": 0.8691534996032715, "learning_rate": 9.216035634743877e-05, "loss": 1.9263, "step": 2431 }, { "epoch": 0.5404444444444444, "grad_norm": 1.175347089767456, "learning_rate": 9.211581291759466e-05, "loss": 1.5685, "step": 2432 }, { "epoch": 0.5406666666666666, "grad_norm": 1.2837430238723755, "learning_rate": 9.207126948775056e-05, "loss": 0.0653, "step": 2433 }, { "epoch": 0.5408888888888889, "grad_norm": 0.23923173546791077, "learning_rate": 9.202672605790647e-05, "loss": 0.0208, "step": 2434 }, { "epoch": 0.5411111111111111, "grad_norm": 0.075802281498909, "learning_rate": 9.198218262806236e-05, "loss": 0.0198, "step": 2435 }, { "epoch": 0.5413333333333333, "grad_norm": 0.6716746091842651, "learning_rate": 9.193763919821828e-05, "loss": 0.7959, "step": 2436 }, { "epoch": 0.5415555555555556, "grad_norm": 1.0868823528289795, "learning_rate": 9.189309576837417e-05, "loss": 1.7425, "step": 2437 }, { "epoch": 0.5417777777777778, "grad_norm": 0.9050690531730652, "learning_rate": 9.184855233853008e-05, "loss": 1.725, "step": 2438 }, { "epoch": 0.542, "grad_norm": 1.1277137994766235, "learning_rate": 9.180400890868597e-05, "loss": 1.7227, "step": 2439 }, { "epoch": 0.5422222222222223, "grad_norm": 0.11746444553136826, "learning_rate": 9.175946547884187e-05, "loss": 0.0203, "step": 2440 }, { "epoch": 0.5424444444444444, "grad_norm": 0.8444859981536865, "learning_rate": 9.171492204899778e-05, "loss": 1.0308, "step": 2441 }, { "epoch": 0.5426666666666666, "grad_norm": 0.7121472954750061, "learning_rate": 9.167037861915367e-05, "loss": 1.0558, "step": 2442 }, { "epoch": 0.5428888888888889, "grad_norm": 1.0242067575454712, "learning_rate": 9.162583518930959e-05, "loss": 1.3226, "step": 2443 }, { "epoch": 0.5431111111111111, "grad_norm": 1.067833423614502, "learning_rate": 9.158129175946548e-05, "loss": 1.3312, "step": 2444 }, { "epoch": 0.5433333333333333, "grad_norm": 1.1601375341415405, "learning_rate": 9.153674832962139e-05, "loss": 1.7604, "step": 2445 }, { "epoch": 0.5435555555555556, "grad_norm": 0.9809809327125549, "learning_rate": 9.149220489977729e-05, "loss": 1.3923, "step": 2446 }, { "epoch": 0.5437777777777778, "grad_norm": 0.74070143699646, "learning_rate": 9.144766146993318e-05, "loss": 0.7608, "step": 2447 }, { "epoch": 0.544, "grad_norm": 0.19705651700496674, "learning_rate": 9.140311804008909e-05, "loss": 0.0306, "step": 2448 }, { "epoch": 0.5442222222222223, "grad_norm": 0.8063182234764099, "learning_rate": 9.1358574610245e-05, "loss": 0.7462, "step": 2449 }, { "epoch": 0.5444444444444444, "grad_norm": 0.5909017324447632, "learning_rate": 9.13140311804009e-05, "loss": 0.4208, "step": 2450 }, { "epoch": 0.5446666666666666, "grad_norm": 0.7135075926780701, "learning_rate": 9.126948775055679e-05, "loss": 1.023, "step": 2451 }, { "epoch": 0.5448888888888889, "grad_norm": 0.5500476956367493, "learning_rate": 9.12249443207127e-05, "loss": 0.8607, "step": 2452 }, { "epoch": 0.5451111111111111, "grad_norm": 0.5393461585044861, "learning_rate": 9.11804008908686e-05, "loss": 1.1374, "step": 2453 }, { "epoch": 0.5453333333333333, "grad_norm": 0.8940771222114563, "learning_rate": 9.11358574610245e-05, "loss": 2.0951, "step": 2454 }, { "epoch": 0.5455555555555556, "grad_norm": 0.8712387084960938, "learning_rate": 9.109131403118041e-05, "loss": 2.3463, "step": 2455 }, { "epoch": 0.5457777777777778, "grad_norm": 0.8256047368049622, "learning_rate": 9.10467706013363e-05, "loss": 2.0528, "step": 2456 }, { "epoch": 0.546, "grad_norm": 0.06170797720551491, "learning_rate": 9.100222717149221e-05, "loss": 0.0108, "step": 2457 }, { "epoch": 0.5462222222222223, "grad_norm": 0.06142743304371834, "learning_rate": 9.095768374164811e-05, "loss": 0.0108, "step": 2458 }, { "epoch": 0.5464444444444444, "grad_norm": 0.5574305057525635, "learning_rate": 9.091314031180401e-05, "loss": 0.9544, "step": 2459 }, { "epoch": 0.5466666666666666, "grad_norm": 0.7911089062690735, "learning_rate": 9.086859688195991e-05, "loss": 1.9205, "step": 2460 }, { "epoch": 0.5468888888888889, "grad_norm": 0.9855570197105408, "learning_rate": 9.082405345211582e-05, "loss": 2.1234, "step": 2461 }, { "epoch": 0.5471111111111111, "grad_norm": 0.9141358137130737, "learning_rate": 9.077951002227172e-05, "loss": 2.0343, "step": 2462 }, { "epoch": 0.5473333333333333, "grad_norm": 0.8803722262382507, "learning_rate": 9.073496659242761e-05, "loss": 1.6688, "step": 2463 }, { "epoch": 0.5475555555555556, "grad_norm": 0.8939493894577026, "learning_rate": 9.069042316258352e-05, "loss": 2.0294, "step": 2464 }, { "epoch": 0.5477777777777778, "grad_norm": 1.09419584274292, "learning_rate": 9.064587973273943e-05, "loss": 1.9657, "step": 2465 }, { "epoch": 0.548, "grad_norm": 0.6533779501914978, "learning_rate": 9.060133630289532e-05, "loss": 0.9263, "step": 2466 }, { "epoch": 0.5482222222222223, "grad_norm": 0.8392800688743591, "learning_rate": 9.055679287305124e-05, "loss": 1.602, "step": 2467 }, { "epoch": 0.5484444444444444, "grad_norm": 0.9414380788803101, "learning_rate": 9.051224944320713e-05, "loss": 1.823, "step": 2468 }, { "epoch": 0.5486666666666666, "grad_norm": 0.9360827207565308, "learning_rate": 9.046770601336303e-05, "loss": 1.8465, "step": 2469 }, { "epoch": 0.5488888888888889, "grad_norm": 0.9744712114334106, "learning_rate": 9.042316258351894e-05, "loss": 2.1427, "step": 2470 }, { "epoch": 0.5491111111111111, "grad_norm": 0.7434724569320679, "learning_rate": 9.037861915367484e-05, "loss": 0.9291, "step": 2471 }, { "epoch": 0.5493333333333333, "grad_norm": 0.0730072483420372, "learning_rate": 9.033407572383074e-05, "loss": 0.0156, "step": 2472 }, { "epoch": 0.5495555555555556, "grad_norm": 0.07402710616588593, "learning_rate": 9.028953229398664e-05, "loss": 0.0155, "step": 2473 }, { "epoch": 0.5497777777777778, "grad_norm": 0.146684929728508, "learning_rate": 9.024498886414255e-05, "loss": 0.0244, "step": 2474 }, { "epoch": 0.55, "grad_norm": 0.6161303520202637, "learning_rate": 9.020044543429844e-05, "loss": 0.7681, "step": 2475 }, { "epoch": 0.5502222222222222, "grad_norm": 1.186063289642334, "learning_rate": 9.015590200445436e-05, "loss": 1.8129, "step": 2476 }, { "epoch": 0.5504444444444444, "grad_norm": 0.9506951570510864, "learning_rate": 9.011135857461025e-05, "loss": 1.884, "step": 2477 }, { "epoch": 0.5506666666666666, "grad_norm": 1.1868743896484375, "learning_rate": 9.006681514476615e-05, "loss": 1.974, "step": 2478 }, { "epoch": 0.5508888888888889, "grad_norm": 1.0391061305999756, "learning_rate": 9.002227171492206e-05, "loss": 1.4596, "step": 2479 }, { "epoch": 0.5511111111111111, "grad_norm": 1.0040979385375977, "learning_rate": 8.997772828507795e-05, "loss": 1.7033, "step": 2480 }, { "epoch": 0.5513333333333333, "grad_norm": 1.0166115760803223, "learning_rate": 8.993318485523386e-05, "loss": 1.7985, "step": 2481 }, { "epoch": 0.5515555555555556, "grad_norm": 1.0222774744033813, "learning_rate": 8.988864142538976e-05, "loss": 1.9303, "step": 2482 }, { "epoch": 0.5517777777777778, "grad_norm": 1.0761734247207642, "learning_rate": 8.984409799554567e-05, "loss": 1.6934, "step": 2483 }, { "epoch": 0.552, "grad_norm": 0.5333191156387329, "learning_rate": 8.979955456570156e-05, "loss": 0.0299, "step": 2484 }, { "epoch": 0.5522222222222222, "grad_norm": 0.22201071679592133, "learning_rate": 8.975501113585746e-05, "loss": 0.0227, "step": 2485 }, { "epoch": 0.5524444444444444, "grad_norm": 0.7878185510635376, "learning_rate": 8.971046770601337e-05, "loss": 0.9822, "step": 2486 }, { "epoch": 0.5526666666666666, "grad_norm": 0.6330142617225647, "learning_rate": 8.966592427616926e-05, "loss": 0.7849, "step": 2487 }, { "epoch": 0.5528888888888889, "grad_norm": 0.6124374866485596, "learning_rate": 8.962138084632518e-05, "loss": 0.7545, "step": 2488 }, { "epoch": 0.5531111111111111, "grad_norm": 0.9551767706871033, "learning_rate": 8.957683741648107e-05, "loss": 1.6892, "step": 2489 }, { "epoch": 0.5533333333333333, "grad_norm": 0.9491903781890869, "learning_rate": 8.953229398663698e-05, "loss": 1.5815, "step": 2490 }, { "epoch": 0.5535555555555556, "grad_norm": 0.6652069091796875, "learning_rate": 8.948775055679288e-05, "loss": 0.7771, "step": 2491 }, { "epoch": 0.5537777777777778, "grad_norm": 1.0102852582931519, "learning_rate": 8.944320712694878e-05, "loss": 1.5977, "step": 2492 }, { "epoch": 0.554, "grad_norm": 1.2895917892456055, "learning_rate": 8.939866369710468e-05, "loss": 1.5941, "step": 2493 }, { "epoch": 0.5542222222222222, "grad_norm": 0.7654258012771606, "learning_rate": 8.935412026726059e-05, "loss": 0.7395, "step": 2494 }, { "epoch": 0.5544444444444444, "grad_norm": 1.508339524269104, "learning_rate": 8.930957683741649e-05, "loss": 1.4961, "step": 2495 }, { "epoch": 0.5546666666666666, "grad_norm": 1.0076874494552612, "learning_rate": 8.926503340757238e-05, "loss": 1.1791, "step": 2496 }, { "epoch": 0.5548888888888889, "grad_norm": 1.1195073127746582, "learning_rate": 8.922048997772829e-05, "loss": 1.0248, "step": 2497 }, { "epoch": 0.5551111111111111, "grad_norm": 1.0789536237716675, "learning_rate": 8.91759465478842e-05, "loss": 1.0849, "step": 2498 }, { "epoch": 0.5553333333333333, "grad_norm": 0.8178943991661072, "learning_rate": 8.913140311804009e-05, "loss": 0.4074, "step": 2499 }, { "epoch": 0.5555555555555556, "grad_norm": 1.1042306423187256, "learning_rate": 8.9086859688196e-05, "loss": 0.909, "step": 2500 }, { "epoch": 0.5557777777777778, "grad_norm": 0.6491156816482544, "learning_rate": 8.90423162583519e-05, "loss": 1.2739, "step": 2501 }, { "epoch": 0.556, "grad_norm": 0.04861566051840782, "learning_rate": 8.89977728285078e-05, "loss": 0.0115, "step": 2502 }, { "epoch": 0.5562222222222222, "grad_norm": 0.8540087938308716, "learning_rate": 8.895322939866371e-05, "loss": 2.3845, "step": 2503 }, { "epoch": 0.5564444444444444, "grad_norm": 1.0630886554718018, "learning_rate": 8.89086859688196e-05, "loss": 2.3142, "step": 2504 }, { "epoch": 0.5566666666666666, "grad_norm": 0.7888458371162415, "learning_rate": 8.88641425389755e-05, "loss": 2.1994, "step": 2505 }, { "epoch": 0.5568888888888889, "grad_norm": 0.0522671639919281, "learning_rate": 8.881959910913141e-05, "loss": 0.0114, "step": 2506 }, { "epoch": 0.5571111111111111, "grad_norm": 0.8858940601348877, "learning_rate": 8.877505567928731e-05, "loss": 2.1228, "step": 2507 }, { "epoch": 0.5573333333333333, "grad_norm": 0.9195045232772827, "learning_rate": 8.87305122494432e-05, "loss": 2.1459, "step": 2508 }, { "epoch": 0.5575555555555556, "grad_norm": 0.08028994500637054, "learning_rate": 8.868596881959911e-05, "loss": 0.0116, "step": 2509 }, { "epoch": 0.5577777777777778, "grad_norm": 0.5606245398521423, "learning_rate": 8.864142538975502e-05, "loss": 1.1338, "step": 2510 }, { "epoch": 0.558, "grad_norm": 0.8265031576156616, "learning_rate": 8.859688195991091e-05, "loss": 2.0906, "step": 2511 }, { "epoch": 0.5582222222222222, "grad_norm": 0.8628373742103577, "learning_rate": 8.855233853006681e-05, "loss": 2.1225, "step": 2512 }, { "epoch": 0.5584444444444444, "grad_norm": 0.8644111752510071, "learning_rate": 8.850779510022272e-05, "loss": 1.9861, "step": 2513 }, { "epoch": 0.5586666666666666, "grad_norm": 1.0040253400802612, "learning_rate": 8.846325167037863e-05, "loss": 2.3637, "step": 2514 }, { "epoch": 0.5588888888888889, "grad_norm": 1.0627641677856445, "learning_rate": 8.841870824053452e-05, "loss": 1.7167, "step": 2515 }, { "epoch": 0.5591111111111111, "grad_norm": 0.9366372227668762, "learning_rate": 8.837416481069044e-05, "loss": 1.843, "step": 2516 }, { "epoch": 0.5593333333333333, "grad_norm": 0.9823426604270935, "learning_rate": 8.832962138084633e-05, "loss": 1.7333, "step": 2517 }, { "epoch": 0.5595555555555556, "grad_norm": 0.9016628861427307, "learning_rate": 8.828507795100222e-05, "loss": 1.6935, "step": 2518 }, { "epoch": 0.5597777777777778, "grad_norm": 1.0293992757797241, "learning_rate": 8.824053452115814e-05, "loss": 2.4355, "step": 2519 }, { "epoch": 0.56, "grad_norm": 0.8653985261917114, "learning_rate": 8.819599109131403e-05, "loss": 1.9367, "step": 2520 }, { "epoch": 0.5602222222222222, "grad_norm": 0.9979233145713806, "learning_rate": 8.815144766146994e-05, "loss": 1.9982, "step": 2521 }, { "epoch": 0.5604444444444444, "grad_norm": 0.9345369338989258, "learning_rate": 8.810690423162584e-05, "loss": 1.6632, "step": 2522 }, { "epoch": 0.5606666666666666, "grad_norm": 0.9781597256660461, "learning_rate": 8.806236080178175e-05, "loss": 1.945, "step": 2523 }, { "epoch": 0.5608888888888889, "grad_norm": 0.6337805390357971, "learning_rate": 8.801781737193764e-05, "loss": 1.0621, "step": 2524 }, { "epoch": 0.5611111111111111, "grad_norm": 0.07470656931400299, "learning_rate": 8.797327394209354e-05, "loss": 0.0167, "step": 2525 }, { "epoch": 0.5613333333333334, "grad_norm": 0.8214187622070312, "learning_rate": 8.792873051224945e-05, "loss": 1.1169, "step": 2526 }, { "epoch": 0.5615555555555556, "grad_norm": 0.10050620138645172, "learning_rate": 8.788418708240534e-05, "loss": 0.0203, "step": 2527 }, { "epoch": 0.5617777777777778, "grad_norm": 0.09838547557592392, "learning_rate": 8.783964365256126e-05, "loss": 0.0198, "step": 2528 }, { "epoch": 0.562, "grad_norm": 0.9661954641342163, "learning_rate": 8.779510022271715e-05, "loss": 1.6663, "step": 2529 }, { "epoch": 0.5622222222222222, "grad_norm": 0.9664443731307983, "learning_rate": 8.775055679287306e-05, "loss": 1.6305, "step": 2530 }, { "epoch": 0.5624444444444444, "grad_norm": 1.0631190538406372, "learning_rate": 8.770601336302896e-05, "loss": 2.0862, "step": 2531 }, { "epoch": 0.5626666666666666, "grad_norm": 0.9939605593681335, "learning_rate": 8.766146993318485e-05, "loss": 1.8518, "step": 2532 }, { "epoch": 0.5628888888888889, "grad_norm": 0.06220734864473343, "learning_rate": 8.761692650334076e-05, "loss": 0.0191, "step": 2533 }, { "epoch": 0.5631111111111111, "grad_norm": 0.07106296718120575, "learning_rate": 8.757238307349666e-05, "loss": 0.0181, "step": 2534 }, { "epoch": 0.5633333333333334, "grad_norm": 1.0672193765640259, "learning_rate": 8.752783964365257e-05, "loss": 1.9556, "step": 2535 }, { "epoch": 0.5635555555555556, "grad_norm": 1.0547032356262207, "learning_rate": 8.748329621380846e-05, "loss": 1.848, "step": 2536 }, { "epoch": 0.5637777777777778, "grad_norm": 0.08132950961589813, "learning_rate": 8.743875278396437e-05, "loss": 0.02, "step": 2537 }, { "epoch": 0.564, "grad_norm": 0.08906566351652145, "learning_rate": 8.739420935412027e-05, "loss": 0.0208, "step": 2538 }, { "epoch": 0.5642222222222222, "grad_norm": 0.0810704380273819, "learning_rate": 8.734966592427616e-05, "loss": 0.0199, "step": 2539 }, { "epoch": 0.5644444444444444, "grad_norm": 0.6286865472793579, "learning_rate": 8.730512249443208e-05, "loss": 0.789, "step": 2540 }, { "epoch": 0.5646666666666667, "grad_norm": 1.0283888578414917, "learning_rate": 8.726057906458798e-05, "loss": 1.6464, "step": 2541 }, { "epoch": 0.5648888888888889, "grad_norm": 0.7151986956596375, "learning_rate": 8.721603563474388e-05, "loss": 0.6798, "step": 2542 }, { "epoch": 0.5651111111111111, "grad_norm": 1.166221261024475, "learning_rate": 8.717149220489979e-05, "loss": 1.8092, "step": 2543 }, { "epoch": 0.5653333333333334, "grad_norm": 0.9559175968170166, "learning_rate": 8.712694877505568e-05, "loss": 1.5216, "step": 2544 }, { "epoch": 0.5655555555555556, "grad_norm": 1.0090394020080566, "learning_rate": 8.708240534521158e-05, "loss": 0.7943, "step": 2545 }, { "epoch": 0.5657777777777778, "grad_norm": 0.18629422783851624, "learning_rate": 8.703786191536749e-05, "loss": 0.0287, "step": 2546 }, { "epoch": 0.566, "grad_norm": 0.7050996422767639, "learning_rate": 8.69933184855234e-05, "loss": 0.6541, "step": 2547 }, { "epoch": 0.5662222222222222, "grad_norm": 1.126369833946228, "learning_rate": 8.694877505567929e-05, "loss": 1.3952, "step": 2548 }, { "epoch": 0.5664444444444444, "grad_norm": 0.15956442058086395, "learning_rate": 8.690423162583519e-05, "loss": 0.0382, "step": 2549 }, { "epoch": 0.5666666666666667, "grad_norm": 1.2113162279129028, "learning_rate": 8.68596881959911e-05, "loss": 1.149, "step": 2550 }, { "epoch": 0.5668888888888889, "grad_norm": 0.5767436027526855, "learning_rate": 8.681514476614699e-05, "loss": 0.8876, "step": 2551 }, { "epoch": 0.5671111111111111, "grad_norm": 0.06607482582330704, "learning_rate": 8.677060133630291e-05, "loss": 0.0109, "step": 2552 }, { "epoch": 0.5673333333333334, "grad_norm": 0.5429921746253967, "learning_rate": 8.67260579064588e-05, "loss": 1.1846, "step": 2553 }, { "epoch": 0.5675555555555556, "grad_norm": 0.04655059799551964, "learning_rate": 8.66815144766147e-05, "loss": 0.0108, "step": 2554 }, { "epoch": 0.5677777777777778, "grad_norm": 0.8319535255432129, "learning_rate": 8.663697104677061e-05, "loss": 2.2138, "step": 2555 }, { "epoch": 0.568, "grad_norm": 0.5527912378311157, "learning_rate": 8.65924276169265e-05, "loss": 1.0141, "step": 2556 }, { "epoch": 0.5682222222222222, "grad_norm": 0.07282774150371552, "learning_rate": 8.65478841870824e-05, "loss": 0.0119, "step": 2557 }, { "epoch": 0.5684444444444444, "grad_norm": 0.08080356568098068, "learning_rate": 8.650334075723831e-05, "loss": 0.0117, "step": 2558 }, { "epoch": 0.5686666666666667, "grad_norm": 0.07515871524810791, "learning_rate": 8.645879732739422e-05, "loss": 0.0117, "step": 2559 }, { "epoch": 0.5688888888888889, "grad_norm": 0.07429923862218857, "learning_rate": 8.641425389755011e-05, "loss": 0.011, "step": 2560 }, { "epoch": 0.5691111111111111, "grad_norm": 0.985916793346405, "learning_rate": 8.636971046770603e-05, "loss": 2.1608, "step": 2561 }, { "epoch": 0.5693333333333334, "grad_norm": 1.0047152042388916, "learning_rate": 8.632516703786192e-05, "loss": 2.3047, "step": 2562 }, { "epoch": 0.5695555555555556, "grad_norm": 0.8193495273590088, "learning_rate": 8.628062360801783e-05, "loss": 1.8905, "step": 2563 }, { "epoch": 0.5697777777777778, "grad_norm": 0.8767317533493042, "learning_rate": 8.623608017817373e-05, "loss": 1.795, "step": 2564 }, { "epoch": 0.57, "grad_norm": 0.9993298053741455, "learning_rate": 8.619153674832962e-05, "loss": 2.022, "step": 2565 }, { "epoch": 0.5702222222222222, "grad_norm": 0.9578080773353577, "learning_rate": 8.614699331848553e-05, "loss": 1.7333, "step": 2566 }, { "epoch": 0.5704444444444444, "grad_norm": 0.8383786082267761, "learning_rate": 8.610244988864143e-05, "loss": 1.9282, "step": 2567 }, { "epoch": 0.5706666666666667, "grad_norm": 0.900726854801178, "learning_rate": 8.605790645879734e-05, "loss": 1.835, "step": 2568 }, { "epoch": 0.5708888888888889, "grad_norm": 0.8840144872665405, "learning_rate": 8.601336302895323e-05, "loss": 1.9359, "step": 2569 }, { "epoch": 0.5711111111111111, "grad_norm": 1.0611618757247925, "learning_rate": 8.596881959910914e-05, "loss": 1.7402, "step": 2570 }, { "epoch": 0.5713333333333334, "grad_norm": 0.960759162902832, "learning_rate": 8.592427616926504e-05, "loss": 1.8016, "step": 2571 }, { "epoch": 0.5715555555555556, "grad_norm": 0.962713897228241, "learning_rate": 8.587973273942093e-05, "loss": 1.8722, "step": 2572 }, { "epoch": 0.5717777777777778, "grad_norm": 0.6798233985900879, "learning_rate": 8.583518930957685e-05, "loss": 0.8461, "step": 2573 }, { "epoch": 0.572, "grad_norm": 0.7111622095108032, "learning_rate": 8.579064587973274e-05, "loss": 1.0258, "step": 2574 }, { "epoch": 0.5722222222222222, "grad_norm": 0.7947016358375549, "learning_rate": 8.574610244988865e-05, "loss": 1.0723, "step": 2575 }, { "epoch": 0.5724444444444444, "grad_norm": 0.645455002784729, "learning_rate": 8.570155902004455e-05, "loss": 0.9378, "step": 2576 }, { "epoch": 0.5726666666666667, "grad_norm": 0.9577328562736511, "learning_rate": 8.565701559020045e-05, "loss": 1.7592, "step": 2577 }, { "epoch": 0.5728888888888889, "grad_norm": 0.9869621396064758, "learning_rate": 8.561247216035635e-05, "loss": 1.8991, "step": 2578 }, { "epoch": 0.5731111111111111, "grad_norm": 0.9232078194618225, "learning_rate": 8.556792873051226e-05, "loss": 1.5691, "step": 2579 }, { "epoch": 0.5733333333333334, "grad_norm": 1.0303270816802979, "learning_rate": 8.552338530066816e-05, "loss": 1.6218, "step": 2580 }, { "epoch": 0.5735555555555556, "grad_norm": 0.9738333225250244, "learning_rate": 8.547884187082405e-05, "loss": 1.9149, "step": 2581 }, { "epoch": 0.5737777777777778, "grad_norm": 1.0307059288024902, "learning_rate": 8.543429844097996e-05, "loss": 1.8569, "step": 2582 }, { "epoch": 0.574, "grad_norm": 1.0071452856063843, "learning_rate": 8.538975501113586e-05, "loss": 1.5601, "step": 2583 }, { "epoch": 0.5742222222222222, "grad_norm": 0.06155632063746452, "learning_rate": 8.534521158129176e-05, "loss": 0.0178, "step": 2584 }, { "epoch": 0.5744444444444444, "grad_norm": 0.061177369207143784, "learning_rate": 8.530066815144766e-05, "loss": 0.0178, "step": 2585 }, { "epoch": 0.5746666666666667, "grad_norm": 0.07538451999425888, "learning_rate": 8.525612472160357e-05, "loss": 0.0177, "step": 2586 }, { "epoch": 0.5748888888888889, "grad_norm": 0.8191643357276917, "learning_rate": 8.521158129175947e-05, "loss": 0.99, "step": 2587 }, { "epoch": 0.5751111111111111, "grad_norm": 0.8007138967514038, "learning_rate": 8.516703786191536e-05, "loss": 0.9497, "step": 2588 }, { "epoch": 0.5753333333333334, "grad_norm": 0.0795382633805275, "learning_rate": 8.512249443207127e-05, "loss": 0.0185, "step": 2589 }, { "epoch": 0.5755555555555556, "grad_norm": 0.8413631319999695, "learning_rate": 8.507795100222718e-05, "loss": 0.901, "step": 2590 }, { "epoch": 0.5757777777777778, "grad_norm": 1.0749456882476807, "learning_rate": 8.503340757238307e-05, "loss": 1.6439, "step": 2591 }, { "epoch": 0.576, "grad_norm": 1.0282601118087769, "learning_rate": 8.498886414253899e-05, "loss": 1.7618, "step": 2592 }, { "epoch": 0.5762222222222222, "grad_norm": 1.302951455116272, "learning_rate": 8.494432071269488e-05, "loss": 1.6646, "step": 2593 }, { "epoch": 0.5764444444444444, "grad_norm": 1.0702754259109497, "learning_rate": 8.489977728285078e-05, "loss": 1.5267, "step": 2594 }, { "epoch": 0.5766666666666667, "grad_norm": 1.0590934753417969, "learning_rate": 8.485523385300669e-05, "loss": 1.7966, "step": 2595 }, { "epoch": 0.5768888888888889, "grad_norm": 1.1090075969696045, "learning_rate": 8.481069042316258e-05, "loss": 1.6027, "step": 2596 }, { "epoch": 0.5771111111111111, "grad_norm": 1.0323162078857422, "learning_rate": 8.476614699331849e-05, "loss": 1.8052, "step": 2597 }, { "epoch": 0.5773333333333334, "grad_norm": 1.0176801681518555, "learning_rate": 8.472160356347439e-05, "loss": 1.3763, "step": 2598 }, { "epoch": 0.5775555555555556, "grad_norm": 0.6453489065170288, "learning_rate": 8.46770601336303e-05, "loss": 0.673, "step": 2599 }, { "epoch": 0.5777777777777777, "grad_norm": 0.6275585293769836, "learning_rate": 8.463251670378619e-05, "loss": 0.3759, "step": 2600 }, { "epoch": 0.578, "grad_norm": 0.5658282041549683, "learning_rate": 8.458797327394211e-05, "loss": 1.0322, "step": 2601 }, { "epoch": 0.5782222222222222, "grad_norm": 0.0510311983525753, "learning_rate": 8.4543429844098e-05, "loss": 0.0103, "step": 2602 }, { "epoch": 0.5784444444444444, "grad_norm": 0.7874890565872192, "learning_rate": 8.449888641425389e-05, "loss": 0.8342, "step": 2603 }, { "epoch": 0.5786666666666667, "grad_norm": 0.5974457859992981, "learning_rate": 8.445434298440981e-05, "loss": 1.0163, "step": 2604 }, { "epoch": 0.5788888888888889, "grad_norm": 0.901496171951294, "learning_rate": 8.44097995545657e-05, "loss": 2.1625, "step": 2605 }, { "epoch": 0.5791111111111111, "grad_norm": 0.6653454899787903, "learning_rate": 8.43652561247216e-05, "loss": 1.1271, "step": 2606 }, { "epoch": 0.5793333333333334, "grad_norm": 0.0663490742444992, "learning_rate": 8.432071269487751e-05, "loss": 0.0111, "step": 2607 }, { "epoch": 0.5795555555555556, "grad_norm": 0.062182825058698654, "learning_rate": 8.427616926503342e-05, "loss": 0.011, "step": 2608 }, { "epoch": 0.5797777777777777, "grad_norm": 0.6534640789031982, "learning_rate": 8.423162583518931e-05, "loss": 1.1674, "step": 2609 }, { "epoch": 0.58, "grad_norm": 0.872759222984314, "learning_rate": 8.418708240534521e-05, "loss": 1.9497, "step": 2610 }, { "epoch": 0.5802222222222222, "grad_norm": 0.9230912923812866, "learning_rate": 8.414253897550112e-05, "loss": 1.8053, "step": 2611 }, { "epoch": 0.5804444444444444, "grad_norm": 0.9444859623908997, "learning_rate": 8.409799554565701e-05, "loss": 2.0272, "step": 2612 }, { "epoch": 0.5806666666666667, "grad_norm": 0.8272079825401306, "learning_rate": 8.405345211581293e-05, "loss": 1.9847, "step": 2613 }, { "epoch": 0.5808888888888889, "grad_norm": 0.9048417806625366, "learning_rate": 8.400890868596882e-05, "loss": 1.1981, "step": 2614 }, { "epoch": 0.5811111111111111, "grad_norm": 0.7303177118301392, "learning_rate": 8.396436525612473e-05, "loss": 0.8572, "step": 2615 }, { "epoch": 0.5813333333333334, "grad_norm": 0.6562807559967041, "learning_rate": 8.391982182628063e-05, "loss": 1.179, "step": 2616 }, { "epoch": 0.5815555555555556, "grad_norm": 1.0120536088943481, "learning_rate": 8.387527839643652e-05, "loss": 1.8926, "step": 2617 }, { "epoch": 0.5817777777777777, "grad_norm": 0.9938877820968628, "learning_rate": 8.383073496659243e-05, "loss": 2.0306, "step": 2618 }, { "epoch": 0.582, "grad_norm": 0.9313610196113586, "learning_rate": 8.378619153674834e-05, "loss": 2.047, "step": 2619 }, { "epoch": 0.5822222222222222, "grad_norm": 0.06936561316251755, "learning_rate": 8.374164810690424e-05, "loss": 0.0158, "step": 2620 }, { "epoch": 0.5824444444444444, "grad_norm": 0.06766082346439362, "learning_rate": 8.369710467706013e-05, "loss": 0.016, "step": 2621 }, { "epoch": 0.5826666666666667, "grad_norm": 0.07017278671264648, "learning_rate": 8.365256124721604e-05, "loss": 0.0156, "step": 2622 }, { "epoch": 0.5828888888888889, "grad_norm": 0.7275362014770508, "learning_rate": 8.360801781737194e-05, "loss": 0.8656, "step": 2623 }, { "epoch": 0.5831111111111111, "grad_norm": 1.1176071166992188, "learning_rate": 8.356347438752784e-05, "loss": 2.0985, "step": 2624 }, { "epoch": 0.5833333333333334, "grad_norm": 0.9742321968078613, "learning_rate": 8.351893095768375e-05, "loss": 1.8387, "step": 2625 }, { "epoch": 0.5835555555555556, "grad_norm": 0.940450131893158, "learning_rate": 8.347438752783965e-05, "loss": 1.6521, "step": 2626 }, { "epoch": 0.5837777777777777, "grad_norm": 1.1521259546279907, "learning_rate": 8.342984409799555e-05, "loss": 1.8395, "step": 2627 }, { "epoch": 0.584, "grad_norm": 0.9441390037536621, "learning_rate": 8.338530066815146e-05, "loss": 1.6441, "step": 2628 }, { "epoch": 0.5842222222222222, "grad_norm": 1.1160365343093872, "learning_rate": 8.334075723830735e-05, "loss": 1.5527, "step": 2629 }, { "epoch": 0.5844444444444444, "grad_norm": 0.6974783539772034, "learning_rate": 8.329621380846325e-05, "loss": 0.9513, "step": 2630 }, { "epoch": 0.5846666666666667, "grad_norm": 0.06145935505628586, "learning_rate": 8.325167037861916e-05, "loss": 0.0166, "step": 2631 }, { "epoch": 0.5848888888888889, "grad_norm": 0.06906305998563766, "learning_rate": 8.320712694877506e-05, "loss": 0.0169, "step": 2632 }, { "epoch": 0.5851111111111111, "grad_norm": 0.06415744870901108, "learning_rate": 8.316258351893096e-05, "loss": 0.0169, "step": 2633 }, { "epoch": 0.5853333333333334, "grad_norm": 1.028851866722107, "learning_rate": 8.311804008908686e-05, "loss": 1.9996, "step": 2634 }, { "epoch": 0.5855555555555556, "grad_norm": 0.6387873291969299, "learning_rate": 8.307349665924277e-05, "loss": 1.0506, "step": 2635 }, { "epoch": 0.5857777777777777, "grad_norm": 0.07523876428604126, "learning_rate": 8.302895322939866e-05, "loss": 0.0182, "step": 2636 }, { "epoch": 0.586, "grad_norm": 0.07396306097507477, "learning_rate": 8.298440979955458e-05, "loss": 0.0178, "step": 2637 }, { "epoch": 0.5862222222222222, "grad_norm": 0.6903396248817444, "learning_rate": 8.293986636971047e-05, "loss": 0.7665, "step": 2638 }, { "epoch": 0.5864444444444444, "grad_norm": 1.0969858169555664, "learning_rate": 8.289532293986638e-05, "loss": 1.7489, "step": 2639 }, { "epoch": 0.5866666666666667, "grad_norm": 1.394546389579773, "learning_rate": 8.285077951002228e-05, "loss": 1.6268, "step": 2640 }, { "epoch": 0.5868888888888889, "grad_norm": 1.185672640800476, "learning_rate": 8.280623608017817e-05, "loss": 1.7259, "step": 2641 }, { "epoch": 0.5871111111111111, "grad_norm": 1.1864769458770752, "learning_rate": 8.276169265033408e-05, "loss": 1.4244, "step": 2642 }, { "epoch": 0.5873333333333334, "grad_norm": 1.172638177871704, "learning_rate": 8.271714922048998e-05, "loss": 1.4388, "step": 2643 }, { "epoch": 0.5875555555555556, "grad_norm": 1.2217496633529663, "learning_rate": 8.267260579064589e-05, "loss": 1.4179, "step": 2644 }, { "epoch": 0.5877777777777777, "grad_norm": 1.3695220947265625, "learning_rate": 8.262806236080178e-05, "loss": 1.7871, "step": 2645 }, { "epoch": 0.588, "grad_norm": 1.0224231481552124, "learning_rate": 8.25835189309577e-05, "loss": 1.24, "step": 2646 }, { "epoch": 0.5882222222222222, "grad_norm": 0.9784666299819946, "learning_rate": 8.253897550111359e-05, "loss": 1.1633, "step": 2647 }, { "epoch": 0.5884444444444444, "grad_norm": 0.9947894215583801, "learning_rate": 8.24944320712695e-05, "loss": 1.1847, "step": 2648 }, { "epoch": 0.5886666666666667, "grad_norm": 0.17486204206943512, "learning_rate": 8.24498886414254e-05, "loss": 0.0404, "step": 2649 }, { "epoch": 0.5888888888888889, "grad_norm": 1.0700467824935913, "learning_rate": 8.24053452115813e-05, "loss": 1.1755, "step": 2650 }, { "epoch": 0.5891111111111111, "grad_norm": 0.050411712378263474, "learning_rate": 8.23608017817372e-05, "loss": 0.0105, "step": 2651 }, { "epoch": 0.5893333333333334, "grad_norm": 0.9025883078575134, "learning_rate": 8.23162583518931e-05, "loss": 2.3288, "step": 2652 }, { "epoch": 0.5895555555555556, "grad_norm": 0.6692728996276855, "learning_rate": 8.227171492204901e-05, "loss": 1.1624, "step": 2653 }, { "epoch": 0.5897777777777777, "grad_norm": 0.053395964205265045, "learning_rate": 8.22271714922049e-05, "loss": 0.0103, "step": 2654 }, { "epoch": 0.59, "grad_norm": 0.9292510747909546, "learning_rate": 8.21826280623608e-05, "loss": 2.1717, "step": 2655 }, { "epoch": 0.5902222222222222, "grad_norm": 0.08087780326604843, "learning_rate": 8.213808463251671e-05, "loss": 0.0115, "step": 2656 }, { "epoch": 0.5904444444444444, "grad_norm": 0.635247528553009, "learning_rate": 8.20935412026726e-05, "loss": 1.0443, "step": 2657 }, { "epoch": 0.5906666666666667, "grad_norm": 0.9392407536506653, "learning_rate": 8.204899777282851e-05, "loss": 2.1964, "step": 2658 }, { "epoch": 0.5908888888888889, "grad_norm": 0.9710378646850586, "learning_rate": 8.200445434298441e-05, "loss": 2.2006, "step": 2659 }, { "epoch": 0.5911111111111111, "grad_norm": 0.9601635336875916, "learning_rate": 8.195991091314032e-05, "loss": 2.1372, "step": 2660 }, { "epoch": 0.5913333333333334, "grad_norm": 0.9137882590293884, "learning_rate": 8.191536748329621e-05, "loss": 2.1513, "step": 2661 }, { "epoch": 0.5915555555555555, "grad_norm": 1.1451430320739746, "learning_rate": 8.187082405345212e-05, "loss": 1.9865, "step": 2662 }, { "epoch": 0.5917777777777777, "grad_norm": 0.91966313123703, "learning_rate": 8.182628062360802e-05, "loss": 1.8152, "step": 2663 }, { "epoch": 0.592, "grad_norm": 0.9007231593132019, "learning_rate": 8.178173719376391e-05, "loss": 2.0409, "step": 2664 }, { "epoch": 0.5922222222222222, "grad_norm": 0.6561061143875122, "learning_rate": 8.173719376391983e-05, "loss": 1.1022, "step": 2665 }, { "epoch": 0.5924444444444444, "grad_norm": 0.6513422727584839, "learning_rate": 8.169265033407572e-05, "loss": 0.8374, "step": 2666 }, { "epoch": 0.5926666666666667, "grad_norm": 0.8476526737213135, "learning_rate": 8.164810690423163e-05, "loss": 1.6632, "step": 2667 }, { "epoch": 0.5928888888888889, "grad_norm": 1.1757445335388184, "learning_rate": 8.160356347438754e-05, "loss": 2.2507, "step": 2668 }, { "epoch": 0.5931111111111111, "grad_norm": 0.9867371916770935, "learning_rate": 8.155902004454343e-05, "loss": 1.8475, "step": 2669 }, { "epoch": 0.5933333333333334, "grad_norm": 1.1854937076568604, "learning_rate": 8.151447661469933e-05, "loss": 1.7041, "step": 2670 }, { "epoch": 0.5935555555555555, "grad_norm": 0.721083402633667, "learning_rate": 8.146993318485524e-05, "loss": 0.8732, "step": 2671 }, { "epoch": 0.5937777777777777, "grad_norm": 0.6721528768539429, "learning_rate": 8.142538975501114e-05, "loss": 0.8163, "step": 2672 }, { "epoch": 0.594, "grad_norm": 1.2233518362045288, "learning_rate": 8.138084632516704e-05, "loss": 1.7324, "step": 2673 }, { "epoch": 0.5942222222222222, "grad_norm": 1.009818196296692, "learning_rate": 8.133630289532294e-05, "loss": 1.9385, "step": 2674 }, { "epoch": 0.5944444444444444, "grad_norm": 0.9225603342056274, "learning_rate": 8.129175946547885e-05, "loss": 1.6991, "step": 2675 }, { "epoch": 0.5946666666666667, "grad_norm": 0.07418637722730637, "learning_rate": 8.124721603563474e-05, "loss": 0.0175, "step": 2676 }, { "epoch": 0.5948888888888889, "grad_norm": 0.07073847204446793, "learning_rate": 8.120267260579066e-05, "loss": 0.0174, "step": 2677 }, { "epoch": 0.5951111111111111, "grad_norm": 0.06910637021064758, "learning_rate": 8.115812917594655e-05, "loss": 0.0172, "step": 2678 }, { "epoch": 0.5953333333333334, "grad_norm": 1.236820936203003, "learning_rate": 8.111358574610245e-05, "loss": 1.7179, "step": 2679 }, { "epoch": 0.5955555555555555, "grad_norm": 1.0825942754745483, "learning_rate": 8.106904231625836e-05, "loss": 1.547, "step": 2680 }, { "epoch": 0.5957777777777777, "grad_norm": 0.08058004081249237, "learning_rate": 8.102449888641425e-05, "loss": 0.0191, "step": 2681 }, { "epoch": 0.596, "grad_norm": 0.08336427807807922, "learning_rate": 8.097995545657016e-05, "loss": 0.0191, "step": 2682 }, { "epoch": 0.5962222222222222, "grad_norm": 0.08025740832090378, "learning_rate": 8.093541202672606e-05, "loss": 0.019, "step": 2683 }, { "epoch": 0.5964444444444444, "grad_norm": 0.07246199250221252, "learning_rate": 8.089086859688197e-05, "loss": 0.0186, "step": 2684 }, { "epoch": 0.5966666666666667, "grad_norm": 0.07082468271255493, "learning_rate": 8.084632516703786e-05, "loss": 0.018, "step": 2685 }, { "epoch": 0.5968888888888889, "grad_norm": 0.08357842266559601, "learning_rate": 8.080178173719378e-05, "loss": 0.0182, "step": 2686 }, { "epoch": 0.5971111111111111, "grad_norm": 0.07589254528284073, "learning_rate": 8.075723830734967e-05, "loss": 0.0171, "step": 2687 }, { "epoch": 0.5973333333333334, "grad_norm": 1.0739028453826904, "learning_rate": 8.071269487750556e-05, "loss": 1.6881, "step": 2688 }, { "epoch": 0.5975555555555555, "grad_norm": 1.0699751377105713, "learning_rate": 8.066815144766148e-05, "loss": 1.4065, "step": 2689 }, { "epoch": 0.5977777777777777, "grad_norm": 0.8686650395393372, "learning_rate": 8.062360801781737e-05, "loss": 0.8745, "step": 2690 }, { "epoch": 0.598, "grad_norm": 0.19912366569042206, "learning_rate": 8.057906458797328e-05, "loss": 0.029, "step": 2691 }, { "epoch": 0.5982222222222222, "grad_norm": 1.0016716718673706, "learning_rate": 8.053452115812918e-05, "loss": 1.6162, "step": 2692 }, { "epoch": 0.5984444444444444, "grad_norm": 1.1502323150634766, "learning_rate": 8.048997772828509e-05, "loss": 1.5778, "step": 2693 }, { "epoch": 0.5986666666666667, "grad_norm": 1.196444034576416, "learning_rate": 8.044543429844098e-05, "loss": 1.2702, "step": 2694 }, { "epoch": 0.5988888888888889, "grad_norm": 1.1185574531555176, "learning_rate": 8.040089086859689e-05, "loss": 1.4555, "step": 2695 }, { "epoch": 0.5991111111111111, "grad_norm": 0.9055349230766296, "learning_rate": 8.035634743875279e-05, "loss": 1.1932, "step": 2696 }, { "epoch": 0.5993333333333334, "grad_norm": 1.1257692575454712, "learning_rate": 8.031180400890868e-05, "loss": 1.1586, "step": 2697 }, { "epoch": 0.5995555555555555, "grad_norm": 1.1968945264816284, "learning_rate": 8.02672605790646e-05, "loss": 1.194, "step": 2698 }, { "epoch": 0.5997777777777777, "grad_norm": 0.8461349606513977, "learning_rate": 8.02227171492205e-05, "loss": 0.6877, "step": 2699 }, { "epoch": 0.6, "grad_norm": 1.3101285696029663, "learning_rate": 8.01781737193764e-05, "loss": 1.1574, "step": 2700 }, { "epoch": 0.6, "eval_loss": 1.185476541519165, "eval_runtime": 240.0565, "eval_samples_per_second": 4.166, "eval_steps_per_second": 4.166, "step": 2700 }, { "epoch": 0.6002222222222222, "grad_norm": 0.6387638449668884, "learning_rate": 8.01336302895323e-05, "loss": 1.1591, "step": 2701 }, { "epoch": 0.6004444444444444, "grad_norm": 0.05776821821928024, "learning_rate": 8.00890868596882e-05, "loss": 0.0107, "step": 2702 }, { "epoch": 0.6006666666666667, "grad_norm": 0.05349646508693695, "learning_rate": 8.00445434298441e-05, "loss": 0.0104, "step": 2703 }, { "epoch": 0.6008888888888889, "grad_norm": 0.0507376454770565, "learning_rate": 8e-05, "loss": 0.0103, "step": 2704 }, { "epoch": 0.6011111111111112, "grad_norm": 0.7964654564857483, "learning_rate": 7.995545657015591e-05, "loss": 2.1793, "step": 2705 }, { "epoch": 0.6013333333333334, "grad_norm": 0.9528084397315979, "learning_rate": 7.99109131403118e-05, "loss": 2.3925, "step": 2706 }, { "epoch": 0.6015555555555555, "grad_norm": 0.8756290078163147, "learning_rate": 7.986636971046771e-05, "loss": 2.1808, "step": 2707 }, { "epoch": 0.6017777777777777, "grad_norm": 0.0803305059671402, "learning_rate": 7.982182628062361e-05, "loss": 0.0128, "step": 2708 }, { "epoch": 0.602, "grad_norm": 0.08680860698223114, "learning_rate": 7.97772828507795e-05, "loss": 0.0128, "step": 2709 }, { "epoch": 0.6022222222222222, "grad_norm": 0.08112699538469315, "learning_rate": 7.973273942093543e-05, "loss": 0.0125, "step": 2710 }, { "epoch": 0.6024444444444444, "grad_norm": 0.5621529221534729, "learning_rate": 7.968819599109132e-05, "loss": 0.8918, "step": 2711 }, { "epoch": 0.6026666666666667, "grad_norm": 0.8549271821975708, "learning_rate": 7.964365256124722e-05, "loss": 1.9602, "step": 2712 }, { "epoch": 0.6028888888888889, "grad_norm": 0.8815329670906067, "learning_rate": 7.959910913140313e-05, "loss": 2.0818, "step": 2713 }, { "epoch": 0.6031111111111112, "grad_norm": 0.9298145174980164, "learning_rate": 7.955456570155902e-05, "loss": 2.07, "step": 2714 }, { "epoch": 0.6033333333333334, "grad_norm": 0.9375580549240112, "learning_rate": 7.951002227171492e-05, "loss": 2.0471, "step": 2715 }, { "epoch": 0.6035555555555555, "grad_norm": 0.8987560868263245, "learning_rate": 7.946547884187083e-05, "loss": 2.002, "step": 2716 }, { "epoch": 0.6037777777777777, "grad_norm": 1.3601025342941284, "learning_rate": 7.942093541202674e-05, "loss": 0.0422, "step": 2717 }, { "epoch": 0.604, "grad_norm": 0.9038719534873962, "learning_rate": 7.937639198218263e-05, "loss": 1.99, "step": 2718 }, { "epoch": 0.6042222222222222, "grad_norm": 0.9744821786880493, "learning_rate": 7.933184855233853e-05, "loss": 1.9441, "step": 2719 }, { "epoch": 0.6044444444444445, "grad_norm": 1.00676691532135, "learning_rate": 7.928730512249444e-05, "loss": 1.658, "step": 2720 }, { "epoch": 0.6046666666666667, "grad_norm": 1.040614128112793, "learning_rate": 7.924276169265033e-05, "loss": 1.8342, "step": 2721 }, { "epoch": 0.6048888888888889, "grad_norm": 1.0080763101577759, "learning_rate": 7.919821826280625e-05, "loss": 1.9057, "step": 2722 }, { "epoch": 0.6051111111111112, "grad_norm": 0.6371939182281494, "learning_rate": 7.915367483296214e-05, "loss": 0.7754, "step": 2723 }, { "epoch": 0.6053333333333333, "grad_norm": 0.08159324526786804, "learning_rate": 7.910913140311805e-05, "loss": 0.0167, "step": 2724 }, { "epoch": 0.6055555555555555, "grad_norm": 0.0773693099617958, "learning_rate": 7.906458797327395e-05, "loss": 0.0165, "step": 2725 }, { "epoch": 0.6057777777777777, "grad_norm": 0.6714785695075989, "learning_rate": 7.902004454342984e-05, "loss": 0.824, "step": 2726 }, { "epoch": 0.606, "grad_norm": 0.6955849528312683, "learning_rate": 7.897550111358575e-05, "loss": 0.9747, "step": 2727 }, { "epoch": 0.6062222222222222, "grad_norm": 0.12702590227127075, "learning_rate": 7.893095768374164e-05, "loss": 0.02, "step": 2728 }, { "epoch": 0.6064444444444445, "grad_norm": 0.9727218151092529, "learning_rate": 7.888641425389756e-05, "loss": 1.8577, "step": 2729 }, { "epoch": 0.6066666666666667, "grad_norm": 1.186213493347168, "learning_rate": 7.884187082405345e-05, "loss": 2.0189, "step": 2730 }, { "epoch": 0.6068888888888889, "grad_norm": 0.9478958249092102, "learning_rate": 7.879732739420936e-05, "loss": 1.7233, "step": 2731 }, { "epoch": 0.6071111111111112, "grad_norm": 0.9013267159461975, "learning_rate": 7.875278396436526e-05, "loss": 1.6254, "step": 2732 }, { "epoch": 0.6073333333333333, "grad_norm": 1.2980573177337646, "learning_rate": 7.870824053452117e-05, "loss": 2.0018, "step": 2733 }, { "epoch": 0.6075555555555555, "grad_norm": 0.6931120753288269, "learning_rate": 7.866369710467706e-05, "loss": 0.7662, "step": 2734 }, { "epoch": 0.6077777777777778, "grad_norm": 0.0702509731054306, "learning_rate": 7.861915367483296e-05, "loss": 0.0174, "step": 2735 }, { "epoch": 0.608, "grad_norm": 0.9084820747375488, "learning_rate": 7.857461024498887e-05, "loss": 0.7707, "step": 2736 }, { "epoch": 0.6082222222222222, "grad_norm": 0.06436257809400558, "learning_rate": 7.853006681514476e-05, "loss": 0.0169, "step": 2737 }, { "epoch": 0.6084444444444445, "grad_norm": 1.086901068687439, "learning_rate": 7.848552338530068e-05, "loss": 1.8021, "step": 2738 }, { "epoch": 0.6086666666666667, "grad_norm": 0.799363911151886, "learning_rate": 7.844097995545657e-05, "loss": 0.9215, "step": 2739 }, { "epoch": 0.6088888888888889, "grad_norm": 0.101883664727211, "learning_rate": 7.839643652561248e-05, "loss": 0.0245, "step": 2740 }, { "epoch": 0.6091111111111112, "grad_norm": 1.1138994693756104, "learning_rate": 7.835189309576838e-05, "loss": 1.3771, "step": 2741 }, { "epoch": 0.6093333333333333, "grad_norm": 1.1364567279815674, "learning_rate": 7.830734966592427e-05, "loss": 1.3817, "step": 2742 }, { "epoch": 0.6095555555555555, "grad_norm": 1.1259334087371826, "learning_rate": 7.826280623608018e-05, "loss": 1.5021, "step": 2743 }, { "epoch": 0.6097777777777778, "grad_norm": 1.2044109106063843, "learning_rate": 7.821826280623609e-05, "loss": 1.5266, "step": 2744 }, { "epoch": 0.61, "grad_norm": 1.1969807147979736, "learning_rate": 7.817371937639199e-05, "loss": 0.864, "step": 2745 }, { "epoch": 0.6102222222222222, "grad_norm": 1.0737295150756836, "learning_rate": 7.812917594654788e-05, "loss": 1.3506, "step": 2746 }, { "epoch": 0.6104444444444445, "grad_norm": 1.2049528360366821, "learning_rate": 7.808463251670379e-05, "loss": 1.2664, "step": 2747 }, { "epoch": 0.6106666666666667, "grad_norm": 1.0982084274291992, "learning_rate": 7.80400890868597e-05, "loss": 1.2182, "step": 2748 }, { "epoch": 0.6108888888888889, "grad_norm": 1.0116227865219116, "learning_rate": 7.799554565701559e-05, "loss": 0.919, "step": 2749 }, { "epoch": 0.6111111111111112, "grad_norm": 1.211203932762146, "learning_rate": 7.79510022271715e-05, "loss": 1.043, "step": 2750 }, { "epoch": 0.6113333333333333, "grad_norm": 0.04495595768094063, "learning_rate": 7.79064587973274e-05, "loss": 0.0098, "step": 2751 }, { "epoch": 0.6115555555555555, "grad_norm": 0.885490357875824, "learning_rate": 7.78619153674833e-05, "loss": 2.1379, "step": 2752 }, { "epoch": 0.6117777777777778, "grad_norm": 0.5612982511520386, "learning_rate": 7.78173719376392e-05, "loss": 1.091, "step": 2753 }, { "epoch": 0.612, "grad_norm": 0.8652318120002747, "learning_rate": 7.77728285077951e-05, "loss": 2.1604, "step": 2754 }, { "epoch": 0.6122222222222222, "grad_norm": 0.5881816148757935, "learning_rate": 7.7728285077951e-05, "loss": 0.8575, "step": 2755 }, { "epoch": 0.6124444444444445, "grad_norm": 0.06693144142627716, "learning_rate": 7.768374164810691e-05, "loss": 0.0108, "step": 2756 }, { "epoch": 0.6126666666666667, "grad_norm": 0.0678841769695282, "learning_rate": 7.763919821826281e-05, "loss": 0.0108, "step": 2757 }, { "epoch": 0.6128888888888889, "grad_norm": 0.06649115681648254, "learning_rate": 7.75946547884187e-05, "loss": 0.0105, "step": 2758 }, { "epoch": 0.6131111111111112, "grad_norm": 0.7412600517272949, "learning_rate": 7.755011135857461e-05, "loss": 1.1085, "step": 2759 }, { "epoch": 0.6133333333333333, "grad_norm": 0.8898929357528687, "learning_rate": 7.750556792873052e-05, "loss": 2.0215, "step": 2760 }, { "epoch": 0.6135555555555555, "grad_norm": 0.8269761204719543, "learning_rate": 7.746102449888641e-05, "loss": 1.7194, "step": 2761 }, { "epoch": 0.6137777777777778, "grad_norm": 0.9091681241989136, "learning_rate": 7.741648106904233e-05, "loss": 2.13, "step": 2762 }, { "epoch": 0.614, "grad_norm": 0.9192904233932495, "learning_rate": 7.737193763919822e-05, "loss": 2.057, "step": 2763 }, { "epoch": 0.6142222222222222, "grad_norm": 1.021559476852417, "learning_rate": 7.732739420935412e-05, "loss": 2.085, "step": 2764 }, { "epoch": 0.6144444444444445, "grad_norm": 0.8879882097244263, "learning_rate": 7.728285077951003e-05, "loss": 1.7405, "step": 2765 }, { "epoch": 0.6146666666666667, "grad_norm": 0.9193564057350159, "learning_rate": 7.723830734966592e-05, "loss": 1.7514, "step": 2766 }, { "epoch": 0.6148888888888889, "grad_norm": 0.8495803475379944, "learning_rate": 7.719376391982183e-05, "loss": 1.1789, "step": 2767 }, { "epoch": 0.6151111111111112, "grad_norm": 0.9236475229263306, "learning_rate": 7.714922048997773e-05, "loss": 1.9599, "step": 2768 }, { "epoch": 0.6153333333333333, "grad_norm": 1.0752800703048706, "learning_rate": 7.710467706013364e-05, "loss": 1.8044, "step": 2769 }, { "epoch": 0.6155555555555555, "grad_norm": 0.9333148002624512, "learning_rate": 7.706013363028953e-05, "loss": 2.0566, "step": 2770 }, { "epoch": 0.6157777777777778, "grad_norm": 0.07040537893772125, "learning_rate": 7.701559020044545e-05, "loss": 0.016, "step": 2771 }, { "epoch": 0.616, "grad_norm": 0.07483159750699997, "learning_rate": 7.697104677060134e-05, "loss": 0.0166, "step": 2772 }, { "epoch": 0.6162222222222222, "grad_norm": 0.07214007526636124, "learning_rate": 7.692650334075723e-05, "loss": 0.016, "step": 2773 }, { "epoch": 0.6164444444444445, "grad_norm": 0.07688527554273605, "learning_rate": 7.688195991091315e-05, "loss": 0.0162, "step": 2774 }, { "epoch": 0.6166666666666667, "grad_norm": 0.6190181374549866, "learning_rate": 7.683741648106904e-05, "loss": 0.8162, "step": 2775 }, { "epoch": 0.6168888888888889, "grad_norm": 0.9131618142127991, "learning_rate": 7.679287305122495e-05, "loss": 0.9931, "step": 2776 }, { "epoch": 0.6171111111111112, "grad_norm": 0.9522696137428284, "learning_rate": 7.674832962138085e-05, "loss": 2.0182, "step": 2777 }, { "epoch": 0.6173333333333333, "grad_norm": 0.9142736196517944, "learning_rate": 7.670378619153676e-05, "loss": 1.4818, "step": 2778 }, { "epoch": 0.6175555555555555, "grad_norm": 0.6549271941184998, "learning_rate": 7.665924276169265e-05, "loss": 0.8369, "step": 2779 }, { "epoch": 0.6177777777777778, "grad_norm": 0.5640666484832764, "learning_rate": 7.661469933184856e-05, "loss": 0.6976, "step": 2780 }, { "epoch": 0.618, "grad_norm": 0.9751452803611755, "learning_rate": 7.657015590200446e-05, "loss": 1.9721, "step": 2781 }, { "epoch": 0.6182222222222222, "grad_norm": 1.0083777904510498, "learning_rate": 7.652561247216035e-05, "loss": 1.8079, "step": 2782 }, { "epoch": 0.6184444444444445, "grad_norm": 0.1356029212474823, "learning_rate": 7.648106904231627e-05, "loss": 0.0189, "step": 2783 }, { "epoch": 0.6186666666666667, "grad_norm": 0.09193126112222672, "learning_rate": 7.643652561247216e-05, "loss": 0.0188, "step": 2784 }, { "epoch": 0.6188888888888889, "grad_norm": 0.07181335985660553, "learning_rate": 7.639198218262807e-05, "loss": 0.0176, "step": 2785 }, { "epoch": 0.6191111111111111, "grad_norm": 0.07424760609865189, "learning_rate": 7.634743875278397e-05, "loss": 0.0175, "step": 2786 }, { "epoch": 0.6193333333333333, "grad_norm": 0.07973092794418335, "learning_rate": 7.630289532293987e-05, "loss": 0.0176, "step": 2787 }, { "epoch": 0.6195555555555555, "grad_norm": 0.665320634841919, "learning_rate": 7.625835189309577e-05, "loss": 0.8284, "step": 2788 }, { "epoch": 0.6197777777777778, "grad_norm": 1.122811198234558, "learning_rate": 7.621380846325168e-05, "loss": 1.87, "step": 2789 }, { "epoch": 0.62, "grad_norm": 0.7436560988426208, "learning_rate": 7.616926503340758e-05, "loss": 0.9044, "step": 2790 }, { "epoch": 0.6202222222222222, "grad_norm": 1.0274792909622192, "learning_rate": 7.612472160356347e-05, "loss": 1.7865, "step": 2791 }, { "epoch": 0.6204444444444445, "grad_norm": 1.0092792510986328, "learning_rate": 7.608017817371938e-05, "loss": 1.4015, "step": 2792 }, { "epoch": 0.6206666666666667, "grad_norm": 0.9540352821350098, "learning_rate": 7.603563474387529e-05, "loss": 1.3971, "step": 2793 }, { "epoch": 0.6208888888888889, "grad_norm": 1.1279139518737793, "learning_rate": 7.599109131403118e-05, "loss": 1.6411, "step": 2794 }, { "epoch": 0.6211111111111111, "grad_norm": 0.9098860025405884, "learning_rate": 7.59465478841871e-05, "loss": 0.7837, "step": 2795 }, { "epoch": 0.6213333333333333, "grad_norm": 1.027552843093872, "learning_rate": 7.590200445434299e-05, "loss": 1.2958, "step": 2796 }, { "epoch": 0.6215555555555555, "grad_norm": 0.9595284461975098, "learning_rate": 7.585746102449889e-05, "loss": 0.9841, "step": 2797 }, { "epoch": 0.6217777777777778, "grad_norm": 0.8761973977088928, "learning_rate": 7.58129175946548e-05, "loss": 0.5813, "step": 2798 }, { "epoch": 0.622, "grad_norm": 1.033424973487854, "learning_rate": 7.576837416481069e-05, "loss": 1.3788, "step": 2799 }, { "epoch": 0.6222222222222222, "grad_norm": 0.8536433577537537, "learning_rate": 7.57238307349666e-05, "loss": 0.5428, "step": 2800 }, { "epoch": 0.6224444444444445, "grad_norm": 0.5578122138977051, "learning_rate": 7.567928730512249e-05, "loss": 1.0879, "step": 2801 }, { "epoch": 0.6226666666666667, "grad_norm": 0.09419236332178116, "learning_rate": 7.56347438752784e-05, "loss": 0.0109, "step": 2802 }, { "epoch": 0.6228888888888889, "grad_norm": 0.618521511554718, "learning_rate": 7.55902004454343e-05, "loss": 1.218, "step": 2803 }, { "epoch": 0.6231111111111111, "grad_norm": 0.05352301150560379, "learning_rate": 7.55456570155902e-05, "loss": 0.0103, "step": 2804 }, { "epoch": 0.6233333333333333, "grad_norm": 0.6948210000991821, "learning_rate": 7.550111358574611e-05, "loss": 1.0658, "step": 2805 }, { "epoch": 0.6235555555555555, "grad_norm": 0.6293577551841736, "learning_rate": 7.5456570155902e-05, "loss": 0.9913, "step": 2806 }, { "epoch": 0.6237777777777778, "grad_norm": 0.8518357276916504, "learning_rate": 7.54120267260579e-05, "loss": 2.4278, "step": 2807 }, { "epoch": 0.624, "grad_norm": 0.9139655828475952, "learning_rate": 7.536748329621381e-05, "loss": 2.1537, "step": 2808 }, { "epoch": 0.6242222222222222, "grad_norm": 0.08207522332668304, "learning_rate": 7.532293986636972e-05, "loss": 0.012, "step": 2809 }, { "epoch": 0.6244444444444445, "grad_norm": 0.0847187265753746, "learning_rate": 7.527839643652561e-05, "loss": 0.012, "step": 2810 }, { "epoch": 0.6246666666666667, "grad_norm": 0.658014178276062, "learning_rate": 7.523385300668151e-05, "loss": 0.9869, "step": 2811 }, { "epoch": 0.6248888888888889, "grad_norm": 0.8833963871002197, "learning_rate": 7.518930957683742e-05, "loss": 1.9448, "step": 2812 }, { "epoch": 0.6251111111111111, "grad_norm": 0.8396661877632141, "learning_rate": 7.514476614699331e-05, "loss": 2.1203, "step": 2813 }, { "epoch": 0.6253333333333333, "grad_norm": 0.8181087970733643, "learning_rate": 7.510022271714923e-05, "loss": 1.8757, "step": 2814 }, { "epoch": 0.6255555555555555, "grad_norm": 0.8380526900291443, "learning_rate": 7.505567928730512e-05, "loss": 1.7779, "step": 2815 }, { "epoch": 0.6257777777777778, "grad_norm": 1.1210718154907227, "learning_rate": 7.501113585746103e-05, "loss": 2.1145, "step": 2816 }, { "epoch": 0.626, "grad_norm": 1.0055640935897827, "learning_rate": 7.496659242761693e-05, "loss": 1.8104, "step": 2817 }, { "epoch": 0.6262222222222222, "grad_norm": 0.8134049773216248, "learning_rate": 7.492204899777284e-05, "loss": 0.9472, "step": 2818 }, { "epoch": 0.6264444444444445, "grad_norm": 0.9640253782272339, "learning_rate": 7.487750556792873e-05, "loss": 2.0194, "step": 2819 }, { "epoch": 0.6266666666666667, "grad_norm": 0.901558518409729, "learning_rate": 7.483296213808464e-05, "loss": 1.7276, "step": 2820 }, { "epoch": 0.6268888888888889, "grad_norm": 0.9449894428253174, "learning_rate": 7.478841870824054e-05, "loss": 2.0367, "step": 2821 }, { "epoch": 0.6271111111111111, "grad_norm": 1.145552396774292, "learning_rate": 7.474387527839643e-05, "loss": 2.5524, "step": 2822 }, { "epoch": 0.6273333333333333, "grad_norm": 1.0274474620819092, "learning_rate": 7.469933184855235e-05, "loss": 1.9253, "step": 2823 }, { "epoch": 0.6275555555555555, "grad_norm": 0.9850492477416992, "learning_rate": 7.465478841870824e-05, "loss": 1.848, "step": 2824 }, { "epoch": 0.6277777777777778, "grad_norm": 0.0741516649723053, "learning_rate": 7.461024498886415e-05, "loss": 0.0151, "step": 2825 }, { "epoch": 0.628, "grad_norm": 0.07517000287771225, "learning_rate": 7.456570155902005e-05, "loss": 0.0145, "step": 2826 }, { "epoch": 0.6282222222222222, "grad_norm": 0.07452172785997391, "learning_rate": 7.452115812917595e-05, "loss": 0.0153, "step": 2827 }, { "epoch": 0.6284444444444445, "grad_norm": 1.0018540620803833, "learning_rate": 7.447661469933185e-05, "loss": 1.8204, "step": 2828 }, { "epoch": 0.6286666666666667, "grad_norm": 0.941403865814209, "learning_rate": 7.443207126948776e-05, "loss": 1.9339, "step": 2829 }, { "epoch": 0.6288888888888889, "grad_norm": 1.0924185514450073, "learning_rate": 7.438752783964366e-05, "loss": 1.6405, "step": 2830 }, { "epoch": 0.6291111111111111, "grad_norm": 0.6763534545898438, "learning_rate": 7.434298440979955e-05, "loss": 0.661, "step": 2831 }, { "epoch": 0.6293333333333333, "grad_norm": 0.7361119389533997, "learning_rate": 7.429844097995546e-05, "loss": 0.8457, "step": 2832 }, { "epoch": 0.6295555555555555, "grad_norm": 0.0745445117354393, "learning_rate": 7.425389755011136e-05, "loss": 0.0192, "step": 2833 }, { "epoch": 0.6297777777777778, "grad_norm": 1.0449111461639404, "learning_rate": 7.420935412026726e-05, "loss": 1.599, "step": 2834 }, { "epoch": 0.63, "grad_norm": 1.0760903358459473, "learning_rate": 7.416481069042317e-05, "loss": 1.6259, "step": 2835 }, { "epoch": 0.6302222222222222, "grad_norm": 0.726026713848114, "learning_rate": 7.412026726057907e-05, "loss": 0.7053, "step": 2836 }, { "epoch": 0.6304444444444445, "grad_norm": 0.07237496972084045, "learning_rate": 7.407572383073497e-05, "loss": 0.0162, "step": 2837 }, { "epoch": 0.6306666666666667, "grad_norm": 0.06699542701244354, "learning_rate": 7.403118040089088e-05, "loss": 0.0162, "step": 2838 }, { "epoch": 0.6308888888888889, "grad_norm": 0.7108114957809448, "learning_rate": 7.398663697104677e-05, "loss": 0.727, "step": 2839 }, { "epoch": 0.6311111111111111, "grad_norm": 0.9890654683113098, "learning_rate": 7.394209354120267e-05, "loss": 1.4659, "step": 2840 }, { "epoch": 0.6313333333333333, "grad_norm": 1.0247973203659058, "learning_rate": 7.389755011135858e-05, "loss": 1.8139, "step": 2841 }, { "epoch": 0.6315555555555555, "grad_norm": 1.0868635177612305, "learning_rate": 7.385300668151449e-05, "loss": 1.6254, "step": 2842 }, { "epoch": 0.6317777777777778, "grad_norm": 1.1002624034881592, "learning_rate": 7.380846325167038e-05, "loss": 1.7458, "step": 2843 }, { "epoch": 0.632, "grad_norm": 1.13886296749115, "learning_rate": 7.376391982182628e-05, "loss": 1.5266, "step": 2844 }, { "epoch": 0.6322222222222222, "grad_norm": 1.1197274923324585, "learning_rate": 7.371937639198219e-05, "loss": 1.3135, "step": 2845 }, { "epoch": 0.6324444444444445, "grad_norm": 1.2070372104644775, "learning_rate": 7.367483296213808e-05, "loss": 1.1524, "step": 2846 }, { "epoch": 0.6326666666666667, "grad_norm": 0.9080721139907837, "learning_rate": 7.3630289532294e-05, "loss": 0.7859, "step": 2847 }, { "epoch": 0.6328888888888888, "grad_norm": 1.1644912958145142, "learning_rate": 7.358574610244989e-05, "loss": 1.3371, "step": 2848 }, { "epoch": 0.6331111111111111, "grad_norm": 0.7726056575775146, "learning_rate": 7.35412026726058e-05, "loss": 0.6174, "step": 2849 }, { "epoch": 0.6333333333333333, "grad_norm": 1.1530473232269287, "learning_rate": 7.34966592427617e-05, "loss": 0.6304, "step": 2850 }, { "epoch": 0.6335555555555555, "grad_norm": 0.044335298240184784, "learning_rate": 7.345211581291759e-05, "loss": 0.0101, "step": 2851 }, { "epoch": 0.6337777777777778, "grad_norm": 0.615163266658783, "learning_rate": 7.34075723830735e-05, "loss": 1.1783, "step": 2852 }, { "epoch": 0.634, "grad_norm": 0.7737541794776917, "learning_rate": 7.33630289532294e-05, "loss": 2.0947, "step": 2853 }, { "epoch": 0.6342222222222222, "grad_norm": 0.4981004297733307, "learning_rate": 7.331848552338531e-05, "loss": 0.9559, "step": 2854 }, { "epoch": 0.6344444444444445, "grad_norm": 0.7779539823532104, "learning_rate": 7.32739420935412e-05, "loss": 0.9773, "step": 2855 }, { "epoch": 0.6346666666666667, "grad_norm": 0.9028410315513611, "learning_rate": 7.322939866369712e-05, "loss": 2.1857, "step": 2856 }, { "epoch": 0.6348888888888888, "grad_norm": 0.757631778717041, "learning_rate": 7.318485523385301e-05, "loss": 1.4806, "step": 2857 }, { "epoch": 0.6351111111111111, "grad_norm": 0.8450924754142761, "learning_rate": 7.31403118040089e-05, "loss": 2.0115, "step": 2858 }, { "epoch": 0.6353333333333333, "grad_norm": 0.8707918524742126, "learning_rate": 7.309576837416482e-05, "loss": 1.8007, "step": 2859 }, { "epoch": 0.6355555555555555, "grad_norm": 0.8273522257804871, "learning_rate": 7.305122494432071e-05, "loss": 1.8579, "step": 2860 }, { "epoch": 0.6357777777777778, "grad_norm": 0.9253904223442078, "learning_rate": 7.300668151447662e-05, "loss": 2.0011, "step": 2861 }, { "epoch": 0.636, "grad_norm": 0.8965882658958435, "learning_rate": 7.296213808463252e-05, "loss": 2.0516, "step": 2862 }, { "epoch": 0.6362222222222222, "grad_norm": 0.9190070629119873, "learning_rate": 7.291759465478843e-05, "loss": 1.7634, "step": 2863 }, { "epoch": 0.6364444444444445, "grad_norm": 0.919740617275238, "learning_rate": 7.287305122494432e-05, "loss": 1.7735, "step": 2864 }, { "epoch": 0.6366666666666667, "grad_norm": 1.0333331823349, "learning_rate": 7.282850779510023e-05, "loss": 2.2958, "step": 2865 }, { "epoch": 0.6368888888888888, "grad_norm": 0.9126089811325073, "learning_rate": 7.278396436525613e-05, "loss": 1.9232, "step": 2866 }, { "epoch": 0.6371111111111111, "grad_norm": 0.8983725309371948, "learning_rate": 7.273942093541202e-05, "loss": 1.6496, "step": 2867 }, { "epoch": 0.6373333333333333, "grad_norm": 0.07827930152416229, "learning_rate": 7.269487750556794e-05, "loss": 0.0164, "step": 2868 }, { "epoch": 0.6375555555555555, "grad_norm": 0.07462301850318909, "learning_rate": 7.265033407572384e-05, "loss": 0.0165, "step": 2869 }, { "epoch": 0.6377777777777778, "grad_norm": 0.6954313516616821, "learning_rate": 7.260579064587974e-05, "loss": 0.9052, "step": 2870 }, { "epoch": 0.638, "grad_norm": 0.888656497001648, "learning_rate": 7.256124721603563e-05, "loss": 1.6937, "step": 2871 }, { "epoch": 0.6382222222222222, "grad_norm": 0.1639782041311264, "learning_rate": 7.251670378619154e-05, "loss": 0.0263, "step": 2872 }, { "epoch": 0.6384444444444445, "grad_norm": 0.6530879735946655, "learning_rate": 7.247216035634744e-05, "loss": 0.8118, "step": 2873 }, { "epoch": 0.6386666666666667, "grad_norm": 0.9214199185371399, "learning_rate": 7.242761692650333e-05, "loss": 1.5913, "step": 2874 }, { "epoch": 0.6388888888888888, "grad_norm": 0.9698308706283569, "learning_rate": 7.238307349665925e-05, "loss": 1.9825, "step": 2875 }, { "epoch": 0.6391111111111111, "grad_norm": 1.0415962934494019, "learning_rate": 7.233853006681515e-05, "loss": 1.738, "step": 2876 }, { "epoch": 0.6393333333333333, "grad_norm": 1.2774953842163086, "learning_rate": 7.229398663697105e-05, "loss": 1.7611, "step": 2877 }, { "epoch": 0.6395555555555555, "grad_norm": 0.06831032782793045, "learning_rate": 7.224944320712696e-05, "loss": 0.018, "step": 2878 }, { "epoch": 0.6397777777777778, "grad_norm": 0.07592401653528214, "learning_rate": 7.220489977728285e-05, "loss": 0.0179, "step": 2879 }, { "epoch": 0.64, "grad_norm": 0.06443265825510025, "learning_rate": 7.216035634743875e-05, "loss": 0.0179, "step": 2880 }, { "epoch": 0.6402222222222222, "grad_norm": 0.7548007369041443, "learning_rate": 7.211581291759466e-05, "loss": 0.7391, "step": 2881 }, { "epoch": 0.6404444444444445, "grad_norm": 1.4142894744873047, "learning_rate": 7.207126948775056e-05, "loss": 0.877, "step": 2882 }, { "epoch": 0.6406666666666667, "grad_norm": 0.07341942191123962, "learning_rate": 7.202672605790646e-05, "loss": 0.0166, "step": 2883 }, { "epoch": 0.6408888888888888, "grad_norm": 0.06469117850065231, "learning_rate": 7.198218262806236e-05, "loss": 0.0172, "step": 2884 }, { "epoch": 0.6411111111111111, "grad_norm": 0.94828861951828, "learning_rate": 7.193763919821827e-05, "loss": 1.5546, "step": 2885 }, { "epoch": 0.6413333333333333, "grad_norm": 1.0259699821472168, "learning_rate": 7.189309576837416e-05, "loss": 1.6598, "step": 2886 }, { "epoch": 0.6415555555555555, "grad_norm": 1.1069990396499634, "learning_rate": 7.184855233853008e-05, "loss": 1.8913, "step": 2887 }, { "epoch": 0.6417777777777778, "grad_norm": 1.0191295146942139, "learning_rate": 7.180400890868597e-05, "loss": 1.5918, "step": 2888 }, { "epoch": 0.642, "grad_norm": 1.0177274942398071, "learning_rate": 7.175946547884187e-05, "loss": 1.6859, "step": 2889 }, { "epoch": 0.6422222222222222, "grad_norm": 1.0870459079742432, "learning_rate": 7.171492204899778e-05, "loss": 2.0939, "step": 2890 }, { "epoch": 0.6424444444444445, "grad_norm": 0.7583724856376648, "learning_rate": 7.167037861915367e-05, "loss": 0.8794, "step": 2891 }, { "epoch": 0.6426666666666667, "grad_norm": 1.1080893278121948, "learning_rate": 7.162583518930958e-05, "loss": 1.653, "step": 2892 }, { "epoch": 0.6428888888888888, "grad_norm": 0.999817967414856, "learning_rate": 7.158129175946548e-05, "loss": 1.3451, "step": 2893 }, { "epoch": 0.6431111111111111, "grad_norm": 1.2734150886535645, "learning_rate": 7.153674832962139e-05, "loss": 1.4211, "step": 2894 }, { "epoch": 0.6433333333333333, "grad_norm": 1.1219244003295898, "learning_rate": 7.149220489977728e-05, "loss": 1.2831, "step": 2895 }, { "epoch": 0.6435555555555555, "grad_norm": 1.1147305965423584, "learning_rate": 7.144766146993318e-05, "loss": 1.2474, "step": 2896 }, { "epoch": 0.6437777777777778, "grad_norm": 1.0683484077453613, "learning_rate": 7.140311804008909e-05, "loss": 1.2979, "step": 2897 }, { "epoch": 0.644, "grad_norm": 0.7551613450050354, "learning_rate": 7.135857461024498e-05, "loss": 0.7246, "step": 2898 }, { "epoch": 0.6442222222222223, "grad_norm": 1.0384818315505981, "learning_rate": 7.13140311804009e-05, "loss": 1.0033, "step": 2899 }, { "epoch": 0.6444444444444445, "grad_norm": 1.0102633237838745, "learning_rate": 7.126948775055679e-05, "loss": 0.8248, "step": 2900 }, { "epoch": 0.6446666666666667, "grad_norm": 0.05665779858827591, "learning_rate": 7.12249443207127e-05, "loss": 0.0108, "step": 2901 }, { "epoch": 0.6448888888888888, "grad_norm": 0.054136212915182114, "learning_rate": 7.11804008908686e-05, "loss": 0.0107, "step": 2902 }, { "epoch": 0.6451111111111111, "grad_norm": 0.8140088319778442, "learning_rate": 7.113585746102451e-05, "loss": 2.1089, "step": 2903 }, { "epoch": 0.6453333333333333, "grad_norm": 0.8073779344558716, "learning_rate": 7.10913140311804e-05, "loss": 2.156, "step": 2904 }, { "epoch": 0.6455555555555555, "grad_norm": 0.8879762887954712, "learning_rate": 7.10467706013363e-05, "loss": 2.5526, "step": 2905 }, { "epoch": 0.6457777777777778, "grad_norm": 0.4919484853744507, "learning_rate": 7.100222717149221e-05, "loss": 1.0987, "step": 2906 }, { "epoch": 0.646, "grad_norm": 0.7251628637313843, "learning_rate": 7.09576837416481e-05, "loss": 1.2484, "step": 2907 }, { "epoch": 0.6462222222222223, "grad_norm": 0.10525999963283539, "learning_rate": 7.091314031180402e-05, "loss": 0.0137, "step": 2908 }, { "epoch": 0.6464444444444445, "grad_norm": 0.5602700114250183, "learning_rate": 7.086859688195991e-05, "loss": 1.1304, "step": 2909 }, { "epoch": 0.6466666666666666, "grad_norm": 0.8258494138717651, "learning_rate": 7.082405345211582e-05, "loss": 2.225, "step": 2910 }, { "epoch": 0.6468888888888888, "grad_norm": 0.841549277305603, "learning_rate": 7.077951002227172e-05, "loss": 2.0783, "step": 2911 }, { "epoch": 0.6471111111111111, "grad_norm": 0.9405723214149475, "learning_rate": 7.073496659242762e-05, "loss": 2.3224, "step": 2912 }, { "epoch": 0.6473333333333333, "grad_norm": 0.8422486186027527, "learning_rate": 7.069042316258352e-05, "loss": 1.8426, "step": 2913 }, { "epoch": 0.6475555555555556, "grad_norm": 0.9155295491218567, "learning_rate": 7.064587973273943e-05, "loss": 1.9838, "step": 2914 }, { "epoch": 0.6477777777777778, "grad_norm": 1.0015355348587036, "learning_rate": 7.060133630289533e-05, "loss": 2.0283, "step": 2915 }, { "epoch": 0.648, "grad_norm": 1.0666885375976562, "learning_rate": 7.055679287305122e-05, "loss": 2.1292, "step": 2916 }, { "epoch": 0.6482222222222223, "grad_norm": 0.863190233707428, "learning_rate": 7.051224944320713e-05, "loss": 1.6575, "step": 2917 }, { "epoch": 0.6484444444444445, "grad_norm": 1.0107028484344482, "learning_rate": 7.046770601336304e-05, "loss": 1.6859, "step": 2918 }, { "epoch": 0.6486666666666666, "grad_norm": 0.7367758750915527, "learning_rate": 7.042316258351893e-05, "loss": 1.109, "step": 2919 }, { "epoch": 0.6488888888888888, "grad_norm": 0.17549914121627808, "learning_rate": 7.037861915367485e-05, "loss": 0.0181, "step": 2920 }, { "epoch": 0.6491111111111111, "grad_norm": 0.1470581293106079, "learning_rate": 7.033407572383074e-05, "loss": 0.0229, "step": 2921 }, { "epoch": 0.6493333333333333, "grad_norm": 0.6036903858184814, "learning_rate": 7.028953229398664e-05, "loss": 0.9571, "step": 2922 }, { "epoch": 0.6495555555555556, "grad_norm": 0.9932591319084167, "learning_rate": 7.024498886414255e-05, "loss": 1.8507, "step": 2923 }, { "epoch": 0.6497777777777778, "grad_norm": 1.0042146444320679, "learning_rate": 7.020044543429844e-05, "loss": 1.6924, "step": 2924 }, { "epoch": 0.65, "grad_norm": 1.1565930843353271, "learning_rate": 7.015590200445435e-05, "loss": 1.7762, "step": 2925 }, { "epoch": 0.6502222222222223, "grad_norm": 1.0624140501022339, "learning_rate": 7.011135857461025e-05, "loss": 1.9526, "step": 2926 }, { "epoch": 0.6504444444444445, "grad_norm": 0.9543834328651428, "learning_rate": 7.006681514476616e-05, "loss": 1.832, "step": 2927 }, { "epoch": 0.6506666666666666, "grad_norm": 0.23686853051185608, "learning_rate": 7.002227171492205e-05, "loss": 0.0233, "step": 2928 }, { "epoch": 0.6508888888888889, "grad_norm": 0.21207448840141296, "learning_rate": 6.997772828507795e-05, "loss": 0.0227, "step": 2929 }, { "epoch": 0.6511111111111111, "grad_norm": 0.1674102395772934, "learning_rate": 6.993318485523386e-05, "loss": 0.0203, "step": 2930 }, { "epoch": 0.6513333333333333, "grad_norm": 0.6476364135742188, "learning_rate": 6.988864142538975e-05, "loss": 0.8066, "step": 2931 }, { "epoch": 0.6515555555555556, "grad_norm": 0.0691596120595932, "learning_rate": 6.984409799554567e-05, "loss": 0.0172, "step": 2932 }, { "epoch": 0.6517777777777778, "grad_norm": 0.06929878145456314, "learning_rate": 6.979955456570156e-05, "loss": 0.017, "step": 2933 }, { "epoch": 0.652, "grad_norm": 0.6000028252601624, "learning_rate": 6.975501113585747e-05, "loss": 0.7008, "step": 2934 }, { "epoch": 0.6522222222222223, "grad_norm": 0.9273680448532104, "learning_rate": 6.971046770601337e-05, "loss": 1.6629, "step": 2935 }, { "epoch": 0.6524444444444445, "grad_norm": 1.1437641382217407, "learning_rate": 6.966592427616926e-05, "loss": 1.4685, "step": 2936 }, { "epoch": 0.6526666666666666, "grad_norm": 0.954337477684021, "learning_rate": 6.962138084632517e-05, "loss": 1.4641, "step": 2937 }, { "epoch": 0.6528888888888889, "grad_norm": 1.209396243095398, "learning_rate": 6.957683741648107e-05, "loss": 1.6734, "step": 2938 }, { "epoch": 0.6531111111111111, "grad_norm": 1.0835387706756592, "learning_rate": 6.953229398663698e-05, "loss": 1.4416, "step": 2939 }, { "epoch": 0.6533333333333333, "grad_norm": 1.2384669780731201, "learning_rate": 6.948775055679287e-05, "loss": 1.5449, "step": 2940 }, { "epoch": 0.6535555555555556, "grad_norm": 1.0444763898849487, "learning_rate": 6.944320712694879e-05, "loss": 1.4081, "step": 2941 }, { "epoch": 0.6537777777777778, "grad_norm": 0.19488677382469177, "learning_rate": 6.939866369710468e-05, "loss": 0.0357, "step": 2942 }, { "epoch": 0.654, "grad_norm": 0.7100367546081543, "learning_rate": 6.935412026726057e-05, "loss": 0.6988, "step": 2943 }, { "epoch": 0.6542222222222223, "grad_norm": 1.0661877393722534, "learning_rate": 6.930957683741648e-05, "loss": 1.402, "step": 2944 }, { "epoch": 0.6544444444444445, "grad_norm": 1.0283530950546265, "learning_rate": 6.926503340757238e-05, "loss": 1.2476, "step": 2945 }, { "epoch": 0.6546666666666666, "grad_norm": 1.0251097679138184, "learning_rate": 6.922048997772829e-05, "loss": 1.2837, "step": 2946 }, { "epoch": 0.6548888888888889, "grad_norm": 1.202881932258606, "learning_rate": 6.917594654788418e-05, "loss": 1.5042, "step": 2947 }, { "epoch": 0.6551111111111111, "grad_norm": 0.8322992324829102, "learning_rate": 6.91314031180401e-05, "loss": 0.5849, "step": 2948 }, { "epoch": 0.6553333333333333, "grad_norm": 1.1371495723724365, "learning_rate": 6.908685968819599e-05, "loss": 0.8999, "step": 2949 }, { "epoch": 0.6555555555555556, "grad_norm": 1.1280728578567505, "learning_rate": 6.904231625835188e-05, "loss": 0.8439, "step": 2950 }, { "epoch": 0.6557777777777778, "grad_norm": 0.750573456287384, "learning_rate": 6.89977728285078e-05, "loss": 1.2617, "step": 2951 }, { "epoch": 0.656, "grad_norm": 0.6734370589256287, "learning_rate": 6.89532293986637e-05, "loss": 1.4601, "step": 2952 }, { "epoch": 0.6562222222222223, "grad_norm": 0.8959650993347168, "learning_rate": 6.89086859688196e-05, "loss": 2.0815, "step": 2953 }, { "epoch": 0.6564444444444445, "grad_norm": 0.7922069430351257, "learning_rate": 6.88641425389755e-05, "loss": 1.825, "step": 2954 }, { "epoch": 0.6566666666666666, "grad_norm": 0.8229972720146179, "learning_rate": 6.881959910913141e-05, "loss": 2.1376, "step": 2955 }, { "epoch": 0.6568888888888889, "grad_norm": 0.8326950669288635, "learning_rate": 6.87750556792873e-05, "loss": 1.7849, "step": 2956 }, { "epoch": 0.6571111111111111, "grad_norm": 0.07217428833246231, "learning_rate": 6.873051224944321e-05, "loss": 0.0109, "step": 2957 }, { "epoch": 0.6573333333333333, "grad_norm": 0.07548868656158447, "learning_rate": 6.868596881959911e-05, "loss": 0.0109, "step": 2958 }, { "epoch": 0.6575555555555556, "grad_norm": 1.0550813674926758, "learning_rate": 6.8641425389755e-05, "loss": 2.1406, "step": 2959 }, { "epoch": 0.6577777777777778, "grad_norm": 1.059350848197937, "learning_rate": 6.859688195991092e-05, "loss": 2.3544, "step": 2960 }, { "epoch": 0.658, "grad_norm": 1.044738531112671, "learning_rate": 6.855233853006682e-05, "loss": 1.9748, "step": 2961 }, { "epoch": 0.6582222222222223, "grad_norm": 0.7435956597328186, "learning_rate": 6.850779510022272e-05, "loss": 1.0326, "step": 2962 }, { "epoch": 0.6584444444444445, "grad_norm": 1.0756325721740723, "learning_rate": 6.846325167037863e-05, "loss": 1.902, "step": 2963 }, { "epoch": 0.6586666666666666, "grad_norm": 0.9899616837501526, "learning_rate": 6.841870824053452e-05, "loss": 2.0294, "step": 2964 }, { "epoch": 0.6588888888888889, "grad_norm": 0.9705696105957031, "learning_rate": 6.837416481069042e-05, "loss": 1.7723, "step": 2965 }, { "epoch": 0.6591111111111111, "grad_norm": 0.7591641545295715, "learning_rate": 6.832962138084633e-05, "loss": 1.054, "step": 2966 }, { "epoch": 0.6593333333333333, "grad_norm": 0.6711844801902771, "learning_rate": 6.828507795100223e-05, "loss": 1.002, "step": 2967 }, { "epoch": 0.6595555555555556, "grad_norm": 0.10974638164043427, "learning_rate": 6.824053452115813e-05, "loss": 0.0174, "step": 2968 }, { "epoch": 0.6597777777777778, "grad_norm": 0.6256340742111206, "learning_rate": 6.819599109131403e-05, "loss": 0.7909, "step": 2969 }, { "epoch": 0.66, "grad_norm": 0.9503763914108276, "learning_rate": 6.815144766146994e-05, "loss": 1.6599, "step": 2970 }, { "epoch": 0.6602222222222223, "grad_norm": 0.10511241108179092, "learning_rate": 6.810690423162583e-05, "loss": 0.0191, "step": 2971 }, { "epoch": 0.6604444444444444, "grad_norm": 0.10218800604343414, "learning_rate": 6.806236080178175e-05, "loss": 0.0192, "step": 2972 }, { "epoch": 0.6606666666666666, "grad_norm": 1.087993860244751, "learning_rate": 6.801781737193764e-05, "loss": 2.0295, "step": 2973 }, { "epoch": 0.6608888888888889, "grad_norm": 0.9236508011817932, "learning_rate": 6.797327394209355e-05, "loss": 1.8498, "step": 2974 }, { "epoch": 0.6611111111111111, "grad_norm": 0.9590336680412292, "learning_rate": 6.792873051224945e-05, "loss": 1.8817, "step": 2975 }, { "epoch": 0.6613333333333333, "grad_norm": 1.0234291553497314, "learning_rate": 6.788418708240534e-05, "loss": 1.7545, "step": 2976 }, { "epoch": 0.6615555555555556, "grad_norm": 1.0496752262115479, "learning_rate": 6.783964365256125e-05, "loss": 1.838, "step": 2977 }, { "epoch": 0.6617777777777778, "grad_norm": 1.0736680030822754, "learning_rate": 6.779510022271715e-05, "loss": 1.8839, "step": 2978 }, { "epoch": 0.662, "grad_norm": 0.06254412978887558, "learning_rate": 6.775055679287306e-05, "loss": 0.0165, "step": 2979 }, { "epoch": 0.6622222222222223, "grad_norm": 0.06592579185962677, "learning_rate": 6.770601336302895e-05, "loss": 0.0167, "step": 2980 }, { "epoch": 0.6624444444444444, "grad_norm": 0.0603296123445034, "learning_rate": 6.766146993318486e-05, "loss": 0.0164, "step": 2981 }, { "epoch": 0.6626666666666666, "grad_norm": 0.0696684792637825, "learning_rate": 6.761692650334076e-05, "loss": 0.0163, "step": 2982 }, { "epoch": 0.6628888888888889, "grad_norm": 0.711073637008667, "learning_rate": 6.757238307349665e-05, "loss": 0.7294, "step": 2983 }, { "epoch": 0.6631111111111111, "grad_norm": 1.0906970500946045, "learning_rate": 6.752783964365257e-05, "loss": 1.8489, "step": 2984 }, { "epoch": 0.6633333333333333, "grad_norm": 0.6575995683670044, "learning_rate": 6.748329621380846e-05, "loss": 0.7741, "step": 2985 }, { "epoch": 0.6635555555555556, "grad_norm": 0.9926353096961975, "learning_rate": 6.743875278396437e-05, "loss": 1.7744, "step": 2986 }, { "epoch": 0.6637777777777778, "grad_norm": 1.089295506477356, "learning_rate": 6.739420935412027e-05, "loss": 1.7575, "step": 2987 }, { "epoch": 0.664, "grad_norm": 0.10425405949354172, "learning_rate": 6.734966592427617e-05, "loss": 0.027, "step": 2988 }, { "epoch": 0.6642222222222223, "grad_norm": 0.682433009147644, "learning_rate": 6.730512249443207e-05, "loss": 0.7208, "step": 2989 }, { "epoch": 0.6644444444444444, "grad_norm": 1.045576572418213, "learning_rate": 6.726057906458798e-05, "loss": 1.1317, "step": 2990 }, { "epoch": 0.6646666666666666, "grad_norm": 1.2633000612258911, "learning_rate": 6.721603563474388e-05, "loss": 1.6152, "step": 2991 }, { "epoch": 0.6648888888888889, "grad_norm": 1.0451045036315918, "learning_rate": 6.717149220489977e-05, "loss": 1.412, "step": 2992 }, { "epoch": 0.6651111111111111, "grad_norm": 0.9378172159194946, "learning_rate": 6.712694877505569e-05, "loss": 1.4484, "step": 2993 }, { "epoch": 0.6653333333333333, "grad_norm": 0.9717287421226501, "learning_rate": 6.708240534521158e-05, "loss": 1.2648, "step": 2994 }, { "epoch": 0.6655555555555556, "grad_norm": 0.7894330620765686, "learning_rate": 6.703786191536749e-05, "loss": 0.7415, "step": 2995 }, { "epoch": 0.6657777777777778, "grad_norm": 1.0013213157653809, "learning_rate": 6.69933184855234e-05, "loss": 1.1406, "step": 2996 }, { "epoch": 0.666, "grad_norm": 0.16363666951656342, "learning_rate": 6.694877505567929e-05, "loss": 0.038, "step": 2997 }, { "epoch": 0.6662222222222223, "grad_norm": 0.7024639844894409, "learning_rate": 6.690423162583519e-05, "loss": 0.5375, "step": 2998 }, { "epoch": 0.6664444444444444, "grad_norm": 0.9310855865478516, "learning_rate": 6.68596881959911e-05, "loss": 1.042, "step": 2999 }, { "epoch": 0.6666666666666666, "grad_norm": 1.0746338367462158, "learning_rate": 6.6815144766147e-05, "loss": 0.8101, "step": 3000 }, { "epoch": 0.6668888888888889, "grad_norm": 0.8144944906234741, "learning_rate": 6.67706013363029e-05, "loss": 1.9887, "step": 3001 }, { "epoch": 0.6671111111111111, "grad_norm": 0.8721863627433777, "learning_rate": 6.67260579064588e-05, "loss": 2.3995, "step": 3002 }, { "epoch": 0.6673333333333333, "grad_norm": 0.801108717918396, "learning_rate": 6.66815144766147e-05, "loss": 2.1919, "step": 3003 }, { "epoch": 0.6675555555555556, "grad_norm": 0.6709057688713074, "learning_rate": 6.66369710467706e-05, "loss": 1.0861, "step": 3004 }, { "epoch": 0.6677777777777778, "grad_norm": 0.05186731740832329, "learning_rate": 6.659242761692652e-05, "loss": 0.0109, "step": 3005 }, { "epoch": 0.668, "grad_norm": 0.6289195418357849, "learning_rate": 6.654788418708241e-05, "loss": 1.1656, "step": 3006 }, { "epoch": 0.6682222222222223, "grad_norm": 0.5143423080444336, "learning_rate": 6.650334075723831e-05, "loss": 1.1429, "step": 3007 }, { "epoch": 0.6684444444444444, "grad_norm": 0.7924249768257141, "learning_rate": 6.645879732739422e-05, "loss": 1.8575, "step": 3008 }, { "epoch": 0.6686666666666666, "grad_norm": 0.09778264164924622, "learning_rate": 6.641425389755011e-05, "loss": 0.0128, "step": 3009 }, { "epoch": 0.6688888888888889, "grad_norm": 0.10178276896476746, "learning_rate": 6.636971046770602e-05, "loss": 0.013, "step": 3010 }, { "epoch": 0.6691111111111111, "grad_norm": 0.09697845578193665, "learning_rate": 6.632516703786192e-05, "loss": 0.0126, "step": 3011 }, { "epoch": 0.6693333333333333, "grad_norm": 1.0395288467407227, "learning_rate": 6.628062360801783e-05, "loss": 2.0803, "step": 3012 }, { "epoch": 0.6695555555555556, "grad_norm": 0.8418979048728943, "learning_rate": 6.623608017817372e-05, "loss": 2.1971, "step": 3013 }, { "epoch": 0.6697777777777778, "grad_norm": 0.9855999946594238, "learning_rate": 6.619153674832962e-05, "loss": 1.9663, "step": 3014 }, { "epoch": 0.67, "grad_norm": 0.9182018637657166, "learning_rate": 6.614699331848553e-05, "loss": 2.1056, "step": 3015 }, { "epoch": 0.6702222222222223, "grad_norm": 0.9927064180374146, "learning_rate": 6.610244988864142e-05, "loss": 1.8567, "step": 3016 }, { "epoch": 0.6704444444444444, "grad_norm": 0.8513967394828796, "learning_rate": 6.605790645879733e-05, "loss": 1.8663, "step": 3017 }, { "epoch": 0.6706666666666666, "grad_norm": 0.6829978227615356, "learning_rate": 6.601336302895323e-05, "loss": 0.9329, "step": 3018 }, { "epoch": 0.6708888888888889, "grad_norm": 1.0144321918487549, "learning_rate": 6.596881959910914e-05, "loss": 2.3309, "step": 3019 }, { "epoch": 0.6711111111111111, "grad_norm": 0.9434064030647278, "learning_rate": 6.592427616926503e-05, "loss": 1.841, "step": 3020 }, { "epoch": 0.6713333333333333, "grad_norm": 0.9861494302749634, "learning_rate": 6.587973273942093e-05, "loss": 2.0507, "step": 3021 }, { "epoch": 0.6715555555555556, "grad_norm": 1.0820823907852173, "learning_rate": 6.583518930957684e-05, "loss": 1.5992, "step": 3022 }, { "epoch": 0.6717777777777778, "grad_norm": 0.6171742677688599, "learning_rate": 6.579064587973273e-05, "loss": 0.876, "step": 3023 }, { "epoch": 0.672, "grad_norm": 0.07197222858667374, "learning_rate": 6.574610244988865e-05, "loss": 0.0164, "step": 3024 }, { "epoch": 0.6722222222222223, "grad_norm": 0.7509397268295288, "learning_rate": 6.570155902004454e-05, "loss": 0.9635, "step": 3025 }, { "epoch": 0.6724444444444444, "grad_norm": 0.9577994346618652, "learning_rate": 6.565701559020045e-05, "loss": 1.7846, "step": 3026 }, { "epoch": 0.6726666666666666, "grad_norm": 0.9909307360649109, "learning_rate": 6.561247216035635e-05, "loss": 1.7811, "step": 3027 }, { "epoch": 0.6728888888888889, "grad_norm": 1.076392412185669, "learning_rate": 6.556792873051225e-05, "loss": 1.6848, "step": 3028 }, { "epoch": 0.6731111111111111, "grad_norm": 0.9113189578056335, "learning_rate": 6.552338530066815e-05, "loss": 1.7005, "step": 3029 }, { "epoch": 0.6733333333333333, "grad_norm": 1.2456274032592773, "learning_rate": 6.547884187082406e-05, "loss": 1.7526, "step": 3030 }, { "epoch": 0.6735555555555556, "grad_norm": 1.0734461545944214, "learning_rate": 6.543429844097996e-05, "loss": 1.9902, "step": 3031 }, { "epoch": 0.6737777777777778, "grad_norm": 0.7110247015953064, "learning_rate": 6.538975501113585e-05, "loss": 0.8453, "step": 3032 }, { "epoch": 0.674, "grad_norm": 0.059201423078775406, "learning_rate": 6.534521158129177e-05, "loss": 0.0163, "step": 3033 }, { "epoch": 0.6742222222222222, "grad_norm": 0.7983320355415344, "learning_rate": 6.530066815144766e-05, "loss": 0.9296, "step": 3034 }, { "epoch": 0.6744444444444444, "grad_norm": 0.07402991503477097, "learning_rate": 6.525612472160356e-05, "loss": 0.0193, "step": 3035 }, { "epoch": 0.6746666666666666, "grad_norm": 0.07244686037302017, "learning_rate": 6.521158129175947e-05, "loss": 0.0193, "step": 3036 }, { "epoch": 0.6748888888888889, "grad_norm": 1.0383340120315552, "learning_rate": 6.516703786191537e-05, "loss": 1.4567, "step": 3037 }, { "epoch": 0.6751111111111111, "grad_norm": 1.0180835723876953, "learning_rate": 6.512249443207127e-05, "loss": 1.8275, "step": 3038 }, { "epoch": 0.6753333333333333, "grad_norm": 1.225290298461914, "learning_rate": 6.507795100222718e-05, "loss": 1.7056, "step": 3039 }, { "epoch": 0.6755555555555556, "grad_norm": 0.8802182674407959, "learning_rate": 6.503340757238308e-05, "loss": 1.0935, "step": 3040 }, { "epoch": 0.6757777777777778, "grad_norm": 1.0758693218231201, "learning_rate": 6.498886414253897e-05, "loss": 1.7778, "step": 3041 }, { "epoch": 0.676, "grad_norm": 1.1325352191925049, "learning_rate": 6.494432071269488e-05, "loss": 1.5706, "step": 3042 }, { "epoch": 0.6762222222222222, "grad_norm": 1.0380780696868896, "learning_rate": 6.489977728285078e-05, "loss": 1.535, "step": 3043 }, { "epoch": 0.6764444444444444, "grad_norm": 0.9906545281410217, "learning_rate": 6.485523385300668e-05, "loss": 1.4007, "step": 3044 }, { "epoch": 0.6766666666666666, "grad_norm": 0.17783640325069427, "learning_rate": 6.48106904231626e-05, "loss": 0.0312, "step": 3045 }, { "epoch": 0.6768888888888889, "grad_norm": 0.9812122583389282, "learning_rate": 6.476614699331849e-05, "loss": 1.2594, "step": 3046 }, { "epoch": 0.6771111111111111, "grad_norm": 1.060013771057129, "learning_rate": 6.472160356347439e-05, "loss": 1.1064, "step": 3047 }, { "epoch": 0.6773333333333333, "grad_norm": 0.6272473335266113, "learning_rate": 6.46770601336303e-05, "loss": 0.4809, "step": 3048 }, { "epoch": 0.6775555555555556, "grad_norm": 0.9412599802017212, "learning_rate": 6.463251670378619e-05, "loss": 0.9192, "step": 3049 }, { "epoch": 0.6777777777777778, "grad_norm": 1.0236815214157104, "learning_rate": 6.45879732739421e-05, "loss": 0.8751, "step": 3050 }, { "epoch": 0.678, "grad_norm": 0.05509922653436661, "learning_rate": 6.4543429844098e-05, "loss": 0.0112, "step": 3051 }, { "epoch": 0.6782222222222222, "grad_norm": 0.8858595490455627, "learning_rate": 6.44988864142539e-05, "loss": 2.2657, "step": 3052 }, { "epoch": 0.6784444444444444, "grad_norm": 0.5961353182792664, "learning_rate": 6.44543429844098e-05, "loss": 1.2772, "step": 3053 }, { "epoch": 0.6786666666666666, "grad_norm": 0.05628953129053116, "learning_rate": 6.44097995545657e-05, "loss": 0.011, "step": 3054 }, { "epoch": 0.6788888888888889, "grad_norm": 0.8051088452339172, "learning_rate": 6.436525612472161e-05, "loss": 2.2465, "step": 3055 }, { "epoch": 0.6791111111111111, "grad_norm": 0.9172492027282715, "learning_rate": 6.43207126948775e-05, "loss": 2.1671, "step": 3056 }, { "epoch": 0.6793333333333333, "grad_norm": 0.622685968875885, "learning_rate": 6.427616926503342e-05, "loss": 1.1392, "step": 3057 }, { "epoch": 0.6795555555555556, "grad_norm": 1.047365665435791, "learning_rate": 6.423162583518931e-05, "loss": 2.1024, "step": 3058 }, { "epoch": 0.6797777777777778, "grad_norm": 0.6596314311027527, "learning_rate": 6.418708240534522e-05, "loss": 1.2222, "step": 3059 }, { "epoch": 0.68, "grad_norm": 0.058744728565216064, "learning_rate": 6.414253897550112e-05, "loss": 0.0111, "step": 3060 }, { "epoch": 0.6802222222222222, "grad_norm": 0.9326651692390442, "learning_rate": 6.409799554565701e-05, "loss": 2.1789, "step": 3061 }, { "epoch": 0.6804444444444444, "grad_norm": 0.9691800475120544, "learning_rate": 6.405345211581292e-05, "loss": 2.2824, "step": 3062 }, { "epoch": 0.6806666666666666, "grad_norm": 0.925193190574646, "learning_rate": 6.400890868596882e-05, "loss": 2.431, "step": 3063 }, { "epoch": 0.6808888888888889, "grad_norm": 0.9088225364685059, "learning_rate": 6.396436525612473e-05, "loss": 2.0492, "step": 3064 }, { "epoch": 0.6811111111111111, "grad_norm": 0.8372054696083069, "learning_rate": 6.391982182628062e-05, "loss": 2.2126, "step": 3065 }, { "epoch": 0.6813333333333333, "grad_norm": 0.8477223515510559, "learning_rate": 6.387527839643653e-05, "loss": 1.933, "step": 3066 }, { "epoch": 0.6815555555555556, "grad_norm": 0.6637649536132812, "learning_rate": 6.383073496659243e-05, "loss": 1.0526, "step": 3067 }, { "epoch": 0.6817777777777778, "grad_norm": 0.9227988719940186, "learning_rate": 6.378619153674832e-05, "loss": 1.9949, "step": 3068 }, { "epoch": 0.682, "grad_norm": 0.9380735754966736, "learning_rate": 6.374164810690424e-05, "loss": 1.8282, "step": 3069 }, { "epoch": 0.6822222222222222, "grad_norm": 0.992690920829773, "learning_rate": 6.369710467706013e-05, "loss": 1.7263, "step": 3070 }, { "epoch": 0.6824444444444444, "grad_norm": 0.9732444286346436, "learning_rate": 6.365256124721604e-05, "loss": 2.0818, "step": 3071 }, { "epoch": 0.6826666666666666, "grad_norm": 0.07128032296895981, "learning_rate": 6.360801781737195e-05, "loss": 0.0167, "step": 3072 }, { "epoch": 0.6828888888888889, "grad_norm": 0.9442581534385681, "learning_rate": 6.356347438752784e-05, "loss": 1.9286, "step": 3073 }, { "epoch": 0.6831111111111111, "grad_norm": 0.13606970012187958, "learning_rate": 6.351893095768374e-05, "loss": 0.0205, "step": 3074 }, { "epoch": 0.6833333333333333, "grad_norm": 0.648127555847168, "learning_rate": 6.347438752783965e-05, "loss": 0.8501, "step": 3075 }, { "epoch": 0.6835555555555556, "grad_norm": 0.9328134655952454, "learning_rate": 6.342984409799555e-05, "loss": 1.6076, "step": 3076 }, { "epoch": 0.6837777777777778, "grad_norm": 1.0561175346374512, "learning_rate": 6.338530066815144e-05, "loss": 1.765, "step": 3077 }, { "epoch": 0.684, "grad_norm": 1.1816853284835815, "learning_rate": 6.334075723830736e-05, "loss": 2.1397, "step": 3078 }, { "epoch": 0.6842222222222222, "grad_norm": 1.151865839958191, "learning_rate": 6.329621380846326e-05, "loss": 1.8881, "step": 3079 }, { "epoch": 0.6844444444444444, "grad_norm": 0.07445438951253891, "learning_rate": 6.325167037861916e-05, "loss": 0.0177, "step": 3080 }, { "epoch": 0.6846666666666666, "grad_norm": 0.07042374461889267, "learning_rate": 6.320712694877507e-05, "loss": 0.0171, "step": 3081 }, { "epoch": 0.6848888888888889, "grad_norm": 0.6800836324691772, "learning_rate": 6.316258351893096e-05, "loss": 0.7686, "step": 3082 }, { "epoch": 0.6851111111111111, "grad_norm": 0.7797111868858337, "learning_rate": 6.311804008908686e-05, "loss": 0.9435, "step": 3083 }, { "epoch": 0.6853333333333333, "grad_norm": 0.07849026471376419, "learning_rate": 6.307349665924277e-05, "loss": 0.0194, "step": 3084 }, { "epoch": 0.6855555555555556, "grad_norm": 0.07662785053253174, "learning_rate": 6.302895322939867e-05, "loss": 0.0191, "step": 3085 }, { "epoch": 0.6857777777777778, "grad_norm": 0.0744476169347763, "learning_rate": 6.298440979955457e-05, "loss": 0.0183, "step": 3086 }, { "epoch": 0.686, "grad_norm": 0.0717550590634346, "learning_rate": 6.293986636971047e-05, "loss": 0.0184, "step": 3087 }, { "epoch": 0.6862222222222222, "grad_norm": 1.115823745727539, "learning_rate": 6.289532293986638e-05, "loss": 1.871, "step": 3088 }, { "epoch": 0.6864444444444444, "grad_norm": 0.9394058585166931, "learning_rate": 6.285077951002227e-05, "loss": 1.2818, "step": 3089 }, { "epoch": 0.6866666666666666, "grad_norm": 0.7753637433052063, "learning_rate": 6.280623608017817e-05, "loss": 0.9434, "step": 3090 }, { "epoch": 0.6868888888888889, "grad_norm": 0.7117932438850403, "learning_rate": 6.276169265033408e-05, "loss": 0.7796, "step": 3091 }, { "epoch": 0.6871111111111111, "grad_norm": 1.1098551750183105, "learning_rate": 6.271714922048998e-05, "loss": 1.3902, "step": 3092 }, { "epoch": 0.6873333333333334, "grad_norm": 1.1206355094909668, "learning_rate": 6.267260579064588e-05, "loss": 1.4423, "step": 3093 }, { "epoch": 0.6875555555555556, "grad_norm": 1.0141700506210327, "learning_rate": 6.262806236080178e-05, "loss": 1.587, "step": 3094 }, { "epoch": 0.6877777777777778, "grad_norm": 1.2595239877700806, "learning_rate": 6.258351893095769e-05, "loss": 1.5666, "step": 3095 }, { "epoch": 0.688, "grad_norm": 0.9674675464630127, "learning_rate": 6.253897550111358e-05, "loss": 0.7225, "step": 3096 }, { "epoch": 0.6882222222222222, "grad_norm": 0.1801719069480896, "learning_rate": 6.24944320712695e-05, "loss": 0.0303, "step": 3097 }, { "epoch": 0.6884444444444444, "grad_norm": 1.2141374349594116, "learning_rate": 6.244988864142539e-05, "loss": 1.0987, "step": 3098 }, { "epoch": 0.6886666666666666, "grad_norm": 1.031459093093872, "learning_rate": 6.24053452115813e-05, "loss": 1.0589, "step": 3099 }, { "epoch": 0.6888888888888889, "grad_norm": 0.8709812760353088, "learning_rate": 6.23608017817372e-05, "loss": 0.5863, "step": 3100 }, { "epoch": 0.6891111111111111, "grad_norm": 0.607231616973877, "learning_rate": 6.231625835189309e-05, "loss": 1.0338, "step": 3101 }, { "epoch": 0.6893333333333334, "grad_norm": 0.5942530035972595, "learning_rate": 6.2271714922049e-05, "loss": 1.0758, "step": 3102 }, { "epoch": 0.6895555555555556, "grad_norm": 0.8875899910926819, "learning_rate": 6.22271714922049e-05, "loss": 2.1266, "step": 3103 }, { "epoch": 0.6897777777777778, "grad_norm": 0.8262476325035095, "learning_rate": 6.218262806236081e-05, "loss": 2.2635, "step": 3104 }, { "epoch": 0.69, "grad_norm": 0.0524749718606472, "learning_rate": 6.21380846325167e-05, "loss": 0.0106, "step": 3105 }, { "epoch": 0.6902222222222222, "grad_norm": 0.5803321003913879, "learning_rate": 6.20935412026726e-05, "loss": 1.0884, "step": 3106 }, { "epoch": 0.6904444444444444, "grad_norm": 1.0942132472991943, "learning_rate": 6.204899777282851e-05, "loss": 2.2134, "step": 3107 }, { "epoch": 0.6906666666666667, "grad_norm": 0.07692880928516388, "learning_rate": 6.20044543429844e-05, "loss": 0.0116, "step": 3108 }, { "epoch": 0.6908888888888889, "grad_norm": 0.07232845574617386, "learning_rate": 6.195991091314032e-05, "loss": 0.0117, "step": 3109 }, { "epoch": 0.6911111111111111, "grad_norm": 0.8545564413070679, "learning_rate": 6.191536748329621e-05, "loss": 2.0441, "step": 3110 }, { "epoch": 0.6913333333333334, "grad_norm": 0.8483017086982727, "learning_rate": 6.187082405345212e-05, "loss": 1.9926, "step": 3111 }, { "epoch": 0.6915555555555556, "grad_norm": 0.8519989848136902, "learning_rate": 6.182628062360802e-05, "loss": 2.1871, "step": 3112 }, { "epoch": 0.6917777777777778, "grad_norm": 0.8962295055389404, "learning_rate": 6.178173719376392e-05, "loss": 1.9079, "step": 3113 }, { "epoch": 0.692, "grad_norm": 0.908099353313446, "learning_rate": 6.173719376391982e-05, "loss": 2.2056, "step": 3114 }, { "epoch": 0.6922222222222222, "grad_norm": 0.9471180438995361, "learning_rate": 6.169265033407573e-05, "loss": 1.6695, "step": 3115 }, { "epoch": 0.6924444444444444, "grad_norm": 0.9277594685554504, "learning_rate": 6.164810690423163e-05, "loss": 2.0879, "step": 3116 }, { "epoch": 0.6926666666666667, "grad_norm": 0.6673265695571899, "learning_rate": 6.160356347438752e-05, "loss": 0.9213, "step": 3117 }, { "epoch": 0.6928888888888889, "grad_norm": 0.12814414501190186, "learning_rate": 6.155902004454344e-05, "loss": 0.0211, "step": 3118 }, { "epoch": 0.6931111111111111, "grad_norm": 0.9539985656738281, "learning_rate": 6.151447661469933e-05, "loss": 1.9212, "step": 3119 }, { "epoch": 0.6933333333333334, "grad_norm": 0.927853524684906, "learning_rate": 6.146993318485523e-05, "loss": 1.6054, "step": 3120 }, { "epoch": 0.6935555555555556, "grad_norm": 0.6636569499969482, "learning_rate": 6.142538975501115e-05, "loss": 0.9331, "step": 3121 }, { "epoch": 0.6937777777777778, "grad_norm": 0.07317844778299332, "learning_rate": 6.138084632516704e-05, "loss": 0.0166, "step": 3122 }, { "epoch": 0.694, "grad_norm": 0.07253949344158173, "learning_rate": 6.133630289532294e-05, "loss": 0.0165, "step": 3123 }, { "epoch": 0.6942222222222222, "grad_norm": 0.07455820590257645, "learning_rate": 6.129175946547885e-05, "loss": 0.0168, "step": 3124 }, { "epoch": 0.6944444444444444, "grad_norm": 0.7180811762809753, "learning_rate": 6.124721603563475e-05, "loss": 1.3197, "step": 3125 }, { "epoch": 0.6946666666666667, "grad_norm": 1.0325121879577637, "learning_rate": 6.120267260579064e-05, "loss": 1.8146, "step": 3126 }, { "epoch": 0.6948888888888889, "grad_norm": 1.0472650527954102, "learning_rate": 6.115812917594655e-05, "loss": 1.8477, "step": 3127 }, { "epoch": 0.6951111111111111, "grad_norm": 1.3057109117507935, "learning_rate": 6.111358574610246e-05, "loss": 1.6522, "step": 3128 }, { "epoch": 0.6953333333333334, "grad_norm": 0.9642925262451172, "learning_rate": 6.106904231625835e-05, "loss": 1.9227, "step": 3129 }, { "epoch": 0.6955555555555556, "grad_norm": 0.9852336049079895, "learning_rate": 6.102449888641426e-05, "loss": 1.8925, "step": 3130 }, { "epoch": 0.6957777777777778, "grad_norm": 0.0911262258887291, "learning_rate": 6.097995545657016e-05, "loss": 0.0171, "step": 3131 }, { "epoch": 0.696, "grad_norm": 0.6741465330123901, "learning_rate": 6.093541202672606e-05, "loss": 0.8653, "step": 3132 }, { "epoch": 0.6962222222222222, "grad_norm": 0.9752011895179749, "learning_rate": 6.089086859688197e-05, "loss": 1.4045, "step": 3133 }, { "epoch": 0.6964444444444444, "grad_norm": 0.07129085063934326, "learning_rate": 6.084632516703787e-05, "loss": 0.0192, "step": 3134 }, { "epoch": 0.6966666666666667, "grad_norm": 0.0695280209183693, "learning_rate": 6.0801781737193766e-05, "loss": 0.0186, "step": 3135 }, { "epoch": 0.6968888888888889, "grad_norm": 0.07262587547302246, "learning_rate": 6.075723830734967e-05, "loss": 0.0183, "step": 3136 }, { "epoch": 0.6971111111111111, "grad_norm": 0.9831186532974243, "learning_rate": 6.071269487750557e-05, "loss": 1.4122, "step": 3137 }, { "epoch": 0.6973333333333334, "grad_norm": 0.9442914724349976, "learning_rate": 6.066815144766147e-05, "loss": 1.452, "step": 3138 }, { "epoch": 0.6975555555555556, "grad_norm": 1.1144623756408691, "learning_rate": 6.062360801781738e-05, "loss": 1.6665, "step": 3139 }, { "epoch": 0.6977777777777778, "grad_norm": 1.0614639520645142, "learning_rate": 6.057906458797328e-05, "loss": 1.5621, "step": 3140 }, { "epoch": 0.698, "grad_norm": 1.2415484189987183, "learning_rate": 6.053452115812918e-05, "loss": 1.5338, "step": 3141 }, { "epoch": 0.6982222222222222, "grad_norm": 1.146238923072815, "learning_rate": 6.048997772828508e-05, "loss": 1.8936, "step": 3142 }, { "epoch": 0.6984444444444444, "grad_norm": 1.1693158149719238, "learning_rate": 6.044543429844098e-05, "loss": 1.7386, "step": 3143 }, { "epoch": 0.6986666666666667, "grad_norm": 1.2340409755706787, "learning_rate": 6.040089086859688e-05, "loss": 1.4781, "step": 3144 }, { "epoch": 0.6988888888888889, "grad_norm": 1.0042845010757446, "learning_rate": 6.035634743875279e-05, "loss": 1.3662, "step": 3145 }, { "epoch": 0.6991111111111111, "grad_norm": 0.18454298377037048, "learning_rate": 6.031180400890869e-05, "loss": 0.0276, "step": 3146 }, { "epoch": 0.6993333333333334, "grad_norm": 1.1719262599945068, "learning_rate": 6.026726057906459e-05, "loss": 1.0601, "step": 3147 }, { "epoch": 0.6995555555555556, "grad_norm": 0.9232467412948608, "learning_rate": 6.0222717149220495e-05, "loss": 0.8415, "step": 3148 }, { "epoch": 0.6997777777777778, "grad_norm": 0.194104865193367, "learning_rate": 6.0178173719376394e-05, "loss": 0.0401, "step": 3149 }, { "epoch": 0.7, "grad_norm": 0.7421103119850159, "learning_rate": 6.013363028953229e-05, "loss": 0.6284, "step": 3150 }, { "epoch": 0.7002222222222222, "grad_norm": 0.7694705724716187, "learning_rate": 6.0089086859688204e-05, "loss": 1.1152, "step": 3151 }, { "epoch": 0.7004444444444444, "grad_norm": 0.703349232673645, "learning_rate": 6.00445434298441e-05, "loss": 1.1683, "step": 3152 }, { "epoch": 0.7006666666666667, "grad_norm": 0.05406121537089348, "learning_rate": 6e-05, "loss": 0.0105, "step": 3153 }, { "epoch": 0.7008888888888889, "grad_norm": 0.5842484831809998, "learning_rate": 5.995545657015591e-05, "loss": 1.15, "step": 3154 }, { "epoch": 0.7011111111111111, "grad_norm": 0.05234431475400925, "learning_rate": 5.9910913140311805e-05, "loss": 0.0105, "step": 3155 }, { "epoch": 0.7013333333333334, "grad_norm": 0.5893082618713379, "learning_rate": 5.9866369710467704e-05, "loss": 1.1913, "step": 3156 }, { "epoch": 0.7015555555555556, "grad_norm": 0.5218148231506348, "learning_rate": 5.9821826280623616e-05, "loss": 0.9835, "step": 3157 }, { "epoch": 0.7017777777777777, "grad_norm": 0.5484596490859985, "learning_rate": 5.9777282850779515e-05, "loss": 0.9247, "step": 3158 }, { "epoch": 0.702, "grad_norm": 0.6557696461677551, "learning_rate": 5.973273942093541e-05, "loss": 1.1391, "step": 3159 }, { "epoch": 0.7022222222222222, "grad_norm": 0.5898274779319763, "learning_rate": 5.9688195991091325e-05, "loss": 1.2284, "step": 3160 }, { "epoch": 0.7024444444444444, "grad_norm": 0.09231838583946228, "learning_rate": 5.9643652561247224e-05, "loss": 0.0125, "step": 3161 }, { "epoch": 0.7026666666666667, "grad_norm": 1.012488842010498, "learning_rate": 5.9599109131403116e-05, "loss": 2.0515, "step": 3162 }, { "epoch": 0.7028888888888889, "grad_norm": 0.9501926302909851, "learning_rate": 5.9554565701559014e-05, "loss": 2.3767, "step": 3163 }, { "epoch": 0.7031111111111111, "grad_norm": 0.9576533436775208, "learning_rate": 5.9510022271714927e-05, "loss": 2.3394, "step": 3164 }, { "epoch": 0.7033333333333334, "grad_norm": 0.944797694683075, "learning_rate": 5.9465478841870825e-05, "loss": 1.9659, "step": 3165 }, { "epoch": 0.7035555555555556, "grad_norm": 0.8810012340545654, "learning_rate": 5.9420935412026724e-05, "loss": 1.8656, "step": 3166 }, { "epoch": 0.7037777777777777, "grad_norm": 0.6439220309257507, "learning_rate": 5.9376391982182636e-05, "loss": 0.8335, "step": 3167 }, { "epoch": 0.704, "grad_norm": 0.9962994456291199, "learning_rate": 5.9331848552338534e-05, "loss": 2.0233, "step": 3168 }, { "epoch": 0.7042222222222222, "grad_norm": 0.9703332185745239, "learning_rate": 5.928730512249443e-05, "loss": 1.9519, "step": 3169 }, { "epoch": 0.7044444444444444, "grad_norm": 1.0500884056091309, "learning_rate": 5.924276169265034e-05, "loss": 1.578, "step": 3170 }, { "epoch": 0.7046666666666667, "grad_norm": 0.9718672037124634, "learning_rate": 5.919821826280624e-05, "loss": 2.0021, "step": 3171 }, { "epoch": 0.7048888888888889, "grad_norm": 0.07014777511358261, "learning_rate": 5.9153674832962136e-05, "loss": 0.0162, "step": 3172 }, { "epoch": 0.7051111111111111, "grad_norm": 0.07737057656049728, "learning_rate": 5.910913140311805e-05, "loss": 0.0156, "step": 3173 }, { "epoch": 0.7053333333333334, "grad_norm": 0.12630076706409454, "learning_rate": 5.9064587973273946e-05, "loss": 0.0213, "step": 3174 }, { "epoch": 0.7055555555555556, "grad_norm": 0.7619150876998901, "learning_rate": 5.9020044543429845e-05, "loss": 0.9373, "step": 3175 }, { "epoch": 0.7057777777777777, "grad_norm": 1.0807890892028809, "learning_rate": 5.897550111358575e-05, "loss": 2.0725, "step": 3176 }, { "epoch": 0.706, "grad_norm": 0.9409441351890564, "learning_rate": 5.893095768374165e-05, "loss": 1.6597, "step": 3177 }, { "epoch": 0.7062222222222222, "grad_norm": 1.096917986869812, "learning_rate": 5.888641425389755e-05, "loss": 1.9767, "step": 3178 }, { "epoch": 0.7064444444444444, "grad_norm": 1.091698408126831, "learning_rate": 5.884187082405346e-05, "loss": 1.7166, "step": 3179 }, { "epoch": 0.7066666666666667, "grad_norm": 1.0211970806121826, "learning_rate": 5.879732739420936e-05, "loss": 1.6798, "step": 3180 }, { "epoch": 0.7068888888888889, "grad_norm": 0.6886789202690125, "learning_rate": 5.875278396436526e-05, "loss": 1.0461, "step": 3181 }, { "epoch": 0.7071111111111111, "grad_norm": 0.05880124494433403, "learning_rate": 5.870824053452116e-05, "loss": 0.0178, "step": 3182 }, { "epoch": 0.7073333333333334, "grad_norm": 0.060819823294878006, "learning_rate": 5.866369710467706e-05, "loss": 0.0178, "step": 3183 }, { "epoch": 0.7075555555555556, "grad_norm": 0.724615752696991, "learning_rate": 5.861915367483296e-05, "loss": 1.0519, "step": 3184 }, { "epoch": 0.7077777777777777, "grad_norm": 0.8110787868499756, "learning_rate": 5.857461024498887e-05, "loss": 0.9285, "step": 3185 }, { "epoch": 0.708, "grad_norm": 0.06422421336174011, "learning_rate": 5.853006681514477e-05, "loss": 0.0176, "step": 3186 }, { "epoch": 0.7082222222222222, "grad_norm": 0.067098468542099, "learning_rate": 5.848552338530067e-05, "loss": 0.0175, "step": 3187 }, { "epoch": 0.7084444444444444, "grad_norm": 0.06487097591161728, "learning_rate": 5.8440979955456574e-05, "loss": 0.0172, "step": 3188 }, { "epoch": 0.7086666666666667, "grad_norm": 0.06535470485687256, "learning_rate": 5.839643652561247e-05, "loss": 0.0174, "step": 3189 }, { "epoch": 0.7088888888888889, "grad_norm": 1.172293782234192, "learning_rate": 5.835189309576837e-05, "loss": 1.5616, "step": 3190 }, { "epoch": 0.7091111111111111, "grad_norm": 1.1036264896392822, "learning_rate": 5.830734966592428e-05, "loss": 1.6201, "step": 3191 }, { "epoch": 0.7093333333333334, "grad_norm": 0.7746077179908752, "learning_rate": 5.826280623608018e-05, "loss": 0.8634, "step": 3192 }, { "epoch": 0.7095555555555556, "grad_norm": 0.9545249342918396, "learning_rate": 5.821826280623608e-05, "loss": 1.4337, "step": 3193 }, { "epoch": 0.7097777777777777, "grad_norm": 1.0250579118728638, "learning_rate": 5.8173719376391986e-05, "loss": 1.6208, "step": 3194 }, { "epoch": 0.71, "grad_norm": 1.0089478492736816, "learning_rate": 5.8129175946547884e-05, "loss": 1.2085, "step": 3195 }, { "epoch": 0.7102222222222222, "grad_norm": 1.1248306035995483, "learning_rate": 5.808463251670378e-05, "loss": 1.5717, "step": 3196 }, { "epoch": 0.7104444444444444, "grad_norm": 0.6627147793769836, "learning_rate": 5.8040089086859695e-05, "loss": 0.6573, "step": 3197 }, { "epoch": 0.7106666666666667, "grad_norm": 1.230597972869873, "learning_rate": 5.7995545657015594e-05, "loss": 1.2766, "step": 3198 }, { "epoch": 0.7108888888888889, "grad_norm": 0.1396600902080536, "learning_rate": 5.795100222717149e-05, "loss": 0.0322, "step": 3199 }, { "epoch": 0.7111111111111111, "grad_norm": 0.9573265314102173, "learning_rate": 5.79064587973274e-05, "loss": 0.507, "step": 3200 }, { "epoch": 0.7113333333333334, "grad_norm": 0.8519662618637085, "learning_rate": 5.7861915367483296e-05, "loss": 1.1341, "step": 3201 }, { "epoch": 0.7115555555555556, "grad_norm": 0.04184136167168617, "learning_rate": 5.7817371937639195e-05, "loss": 0.0093, "step": 3202 }, { "epoch": 0.7117777777777777, "grad_norm": 0.6974391341209412, "learning_rate": 5.777282850779511e-05, "loss": 1.3839, "step": 3203 }, { "epoch": 0.712, "grad_norm": 0.8318896293640137, "learning_rate": 5.7728285077951005e-05, "loss": 2.0999, "step": 3204 }, { "epoch": 0.7122222222222222, "grad_norm": 0.5589978694915771, "learning_rate": 5.7683741648106904e-05, "loss": 1.1487, "step": 3205 }, { "epoch": 0.7124444444444444, "grad_norm": 0.07977552711963654, "learning_rate": 5.7639198218262816e-05, "loss": 0.0123, "step": 3206 }, { "epoch": 0.7126666666666667, "grad_norm": 0.07479345053434372, "learning_rate": 5.7594654788418715e-05, "loss": 0.0123, "step": 3207 }, { "epoch": 0.7128888888888889, "grad_norm": 0.7520397305488586, "learning_rate": 5.7550111358574607e-05, "loss": 1.8669, "step": 3208 }, { "epoch": 0.7131111111111111, "grad_norm": 0.891527533531189, "learning_rate": 5.750556792873052e-05, "loss": 1.8476, "step": 3209 }, { "epoch": 0.7133333333333334, "grad_norm": 0.870412290096283, "learning_rate": 5.746102449888642e-05, "loss": 1.9461, "step": 3210 }, { "epoch": 0.7135555555555556, "grad_norm": 0.9231261610984802, "learning_rate": 5.7416481069042316e-05, "loss": 2.1436, "step": 3211 }, { "epoch": 0.7137777777777777, "grad_norm": 0.804538369178772, "learning_rate": 5.737193763919823e-05, "loss": 1.6058, "step": 3212 }, { "epoch": 0.714, "grad_norm": 0.9710292220115662, "learning_rate": 5.7327394209354127e-05, "loss": 1.0738, "step": 3213 }, { "epoch": 0.7142222222222222, "grad_norm": 0.9411685466766357, "learning_rate": 5.7282850779510025e-05, "loss": 2.0708, "step": 3214 }, { "epoch": 0.7144444444444444, "grad_norm": 0.9712237119674683, "learning_rate": 5.723830734966593e-05, "loss": 2.1416, "step": 3215 }, { "epoch": 0.7146666666666667, "grad_norm": 0.6982542872428894, "learning_rate": 5.719376391982183e-05, "loss": 0.7926, "step": 3216 }, { "epoch": 0.7148888888888889, "grad_norm": 0.7483058571815491, "learning_rate": 5.714922048997773e-05, "loss": 0.8696, "step": 3217 }, { "epoch": 0.7151111111111111, "grad_norm": 0.6382774114608765, "learning_rate": 5.710467706013364e-05, "loss": 0.8758, "step": 3218 }, { "epoch": 0.7153333333333334, "grad_norm": 0.09534616768360138, "learning_rate": 5.706013363028954e-05, "loss": 0.0179, "step": 3219 }, { "epoch": 0.7155555555555555, "grad_norm": 0.9931474328041077, "learning_rate": 5.701559020044544e-05, "loss": 1.7448, "step": 3220 }, { "epoch": 0.7157777777777777, "grad_norm": 1.051207184791565, "learning_rate": 5.697104677060134e-05, "loss": 1.8485, "step": 3221 }, { "epoch": 0.716, "grad_norm": 0.9426413178443909, "learning_rate": 5.692650334075724e-05, "loss": 1.6347, "step": 3222 }, { "epoch": 0.7162222222222222, "grad_norm": 0.919272243976593, "learning_rate": 5.688195991091314e-05, "loss": 1.7151, "step": 3223 }, { "epoch": 0.7164444444444444, "grad_norm": 0.9655510783195496, "learning_rate": 5.683741648106905e-05, "loss": 1.6615, "step": 3224 }, { "epoch": 0.7166666666666667, "grad_norm": 1.2728337049484253, "learning_rate": 5.679287305122495e-05, "loss": 1.8277, "step": 3225 }, { "epoch": 0.7168888888888889, "grad_norm": 0.7086578011512756, "learning_rate": 5.674832962138085e-05, "loss": 0.8322, "step": 3226 }, { "epoch": 0.7171111111111111, "grad_norm": 0.06795133650302887, "learning_rate": 5.6703786191536754e-05, "loss": 0.017, "step": 3227 }, { "epoch": 0.7173333333333334, "grad_norm": 0.06331969052553177, "learning_rate": 5.665924276169265e-05, "loss": 0.0171, "step": 3228 }, { "epoch": 0.7175555555555555, "grad_norm": 0.0663456916809082, "learning_rate": 5.661469933184855e-05, "loss": 0.0173, "step": 3229 }, { "epoch": 0.7177777777777777, "grad_norm": 0.8989565968513489, "learning_rate": 5.6570155902004463e-05, "loss": 1.6765, "step": 3230 }, { "epoch": 0.718, "grad_norm": 0.7637456059455872, "learning_rate": 5.652561247216036e-05, "loss": 0.7514, "step": 3231 }, { "epoch": 0.7182222222222222, "grad_norm": 0.08078856021165848, "learning_rate": 5.648106904231626e-05, "loss": 0.0163, "step": 3232 }, { "epoch": 0.7184444444444444, "grad_norm": 0.8078843951225281, "learning_rate": 5.643652561247216e-05, "loss": 0.8599, "step": 3233 }, { "epoch": 0.7186666666666667, "grad_norm": 1.0271605253219604, "learning_rate": 5.6391982182628065e-05, "loss": 1.415, "step": 3234 }, { "epoch": 0.7188888888888889, "grad_norm": 1.2213661670684814, "learning_rate": 5.634743875278396e-05, "loss": 1.8322, "step": 3235 }, { "epoch": 0.7191111111111111, "grad_norm": 1.0940077304840088, "learning_rate": 5.630289532293986e-05, "loss": 1.5287, "step": 3236 }, { "epoch": 0.7193333333333334, "grad_norm": 1.0005013942718506, "learning_rate": 5.6258351893095774e-05, "loss": 1.6254, "step": 3237 }, { "epoch": 0.7195555555555555, "grad_norm": 0.8303656578063965, "learning_rate": 5.621380846325167e-05, "loss": 1.027, "step": 3238 }, { "epoch": 0.7197777777777777, "grad_norm": 0.704897403717041, "learning_rate": 5.616926503340757e-05, "loss": 0.7988, "step": 3239 }, { "epoch": 0.72, "grad_norm": 1.0700993537902832, "learning_rate": 5.6124721603563476e-05, "loss": 1.7471, "step": 3240 }, { "epoch": 0.7202222222222222, "grad_norm": 1.1328794956207275, "learning_rate": 5.6080178173719375e-05, "loss": 1.2742, "step": 3241 }, { "epoch": 0.7204444444444444, "grad_norm": 0.9732044339179993, "learning_rate": 5.6035634743875274e-05, "loss": 1.3644, "step": 3242 }, { "epoch": 0.7206666666666667, "grad_norm": 1.174729347229004, "learning_rate": 5.5991091314031186e-05, "loss": 1.5483, "step": 3243 }, { "epoch": 0.7208888888888889, "grad_norm": 0.78294837474823, "learning_rate": 5.5946547884187084e-05, "loss": 0.6034, "step": 3244 }, { "epoch": 0.7211111111111111, "grad_norm": 0.8941324949264526, "learning_rate": 5.590200445434298e-05, "loss": 1.0701, "step": 3245 }, { "epoch": 0.7213333333333334, "grad_norm": 1.1886690855026245, "learning_rate": 5.5857461024498895e-05, "loss": 1.3666, "step": 3246 }, { "epoch": 0.7215555555555555, "grad_norm": 1.0057522058486938, "learning_rate": 5.581291759465479e-05, "loss": 1.1996, "step": 3247 }, { "epoch": 0.7217777777777777, "grad_norm": 0.839670717716217, "learning_rate": 5.5768374164810685e-05, "loss": 0.6609, "step": 3248 }, { "epoch": 0.722, "grad_norm": 1.1767035722732544, "learning_rate": 5.57238307349666e-05, "loss": 1.0941, "step": 3249 }, { "epoch": 0.7222222222222222, "grad_norm": 1.2154204845428467, "learning_rate": 5.5679287305122496e-05, "loss": 0.7415, "step": 3250 }, { "epoch": 0.7224444444444444, "grad_norm": 0.5861397385597229, "learning_rate": 5.5634743875278395e-05, "loss": 1.1437, "step": 3251 }, { "epoch": 0.7226666666666667, "grad_norm": 0.041759125888347626, "learning_rate": 5.559020044543431e-05, "loss": 0.0097, "step": 3252 }, { "epoch": 0.7228888888888889, "grad_norm": 0.7977886199951172, "learning_rate": 5.5545657015590205e-05, "loss": 2.1774, "step": 3253 }, { "epoch": 0.7231111111111111, "grad_norm": 0.571662425994873, "learning_rate": 5.5501113585746104e-05, "loss": 1.1858, "step": 3254 }, { "epoch": 0.7233333333333334, "grad_norm": 0.7104848027229309, "learning_rate": 5.545657015590201e-05, "loss": 1.0467, "step": 3255 }, { "epoch": 0.7235555555555555, "grad_norm": 0.8153942823410034, "learning_rate": 5.541202672605791e-05, "loss": 1.976, "step": 3256 }, { "epoch": 0.7237777777777777, "grad_norm": 0.08071549981832504, "learning_rate": 5.5367483296213806e-05, "loss": 0.0125, "step": 3257 }, { "epoch": 0.724, "grad_norm": 0.12843948602676392, "learning_rate": 5.532293986636972e-05, "loss": 0.0124, "step": 3258 }, { "epoch": 0.7242222222222222, "grad_norm": 0.07995433360338211, "learning_rate": 5.527839643652562e-05, "loss": 0.0122, "step": 3259 }, { "epoch": 0.7244444444444444, "grad_norm": 0.0713566243648529, "learning_rate": 5.5233853006681516e-05, "loss": 0.0119, "step": 3260 }, { "epoch": 0.7246666666666667, "grad_norm": 0.07306591421365738, "learning_rate": 5.518930957683742e-05, "loss": 0.0118, "step": 3261 }, { "epoch": 0.7248888888888889, "grad_norm": 0.8607704043388367, "learning_rate": 5.514476614699332e-05, "loss": 2.1437, "step": 3262 }, { "epoch": 0.7251111111111112, "grad_norm": 0.8772170543670654, "learning_rate": 5.510022271714922e-05, "loss": 1.9092, "step": 3263 }, { "epoch": 0.7253333333333334, "grad_norm": 0.9902425408363342, "learning_rate": 5.505567928730513e-05, "loss": 2.1999, "step": 3264 }, { "epoch": 0.7255555555555555, "grad_norm": 0.926304817199707, "learning_rate": 5.501113585746103e-05, "loss": 2.0622, "step": 3265 }, { "epoch": 0.7257777777777777, "grad_norm": 0.8717379570007324, "learning_rate": 5.496659242761693e-05, "loss": 1.6769, "step": 3266 }, { "epoch": 0.726, "grad_norm": 1.0354970693588257, "learning_rate": 5.492204899777283e-05, "loss": 1.9093, "step": 3267 }, { "epoch": 0.7262222222222222, "grad_norm": 0.9445512890815735, "learning_rate": 5.487750556792873e-05, "loss": 1.9806, "step": 3268 }, { "epoch": 0.7264444444444444, "grad_norm": 0.9720260500907898, "learning_rate": 5.483296213808463e-05, "loss": 1.793, "step": 3269 }, { "epoch": 0.7266666666666667, "grad_norm": 0.932304859161377, "learning_rate": 5.478841870824054e-05, "loss": 1.8463, "step": 3270 }, { "epoch": 0.7268888888888889, "grad_norm": 0.9925035238265991, "learning_rate": 5.474387527839644e-05, "loss": 1.9726, "step": 3271 }, { "epoch": 0.7271111111111112, "grad_norm": 0.5608296990394592, "learning_rate": 5.469933184855234e-05, "loss": 0.7764, "step": 3272 }, { "epoch": 0.7273333333333334, "grad_norm": 0.6601234674453735, "learning_rate": 5.4654788418708245e-05, "loss": 0.8271, "step": 3273 }, { "epoch": 0.7275555555555555, "grad_norm": 0.6779617071151733, "learning_rate": 5.461024498886414e-05, "loss": 0.9032, "step": 3274 }, { "epoch": 0.7277777777777777, "grad_norm": 0.9753432869911194, "learning_rate": 5.456570155902004e-05, "loss": 1.7793, "step": 3275 }, { "epoch": 0.728, "grad_norm": 0.9676978588104248, "learning_rate": 5.4521158129175954e-05, "loss": 1.6972, "step": 3276 }, { "epoch": 0.7282222222222222, "grad_norm": 1.093235969543457, "learning_rate": 5.447661469933185e-05, "loss": 2.0882, "step": 3277 }, { "epoch": 0.7284444444444444, "grad_norm": 1.0347819328308105, "learning_rate": 5.443207126948775e-05, "loss": 2.039, "step": 3278 }, { "epoch": 0.7286666666666667, "grad_norm": 0.071097731590271, "learning_rate": 5.4387527839643657e-05, "loss": 0.0174, "step": 3279 }, { "epoch": 0.7288888888888889, "grad_norm": 0.9010851383209229, "learning_rate": 5.4342984409799555e-05, "loss": 1.0428, "step": 3280 }, { "epoch": 0.7291111111111112, "grad_norm": 0.07293925434350967, "learning_rate": 5.4298440979955454e-05, "loss": 0.0173, "step": 3281 }, { "epoch": 0.7293333333333333, "grad_norm": 1.1432619094848633, "learning_rate": 5.4253897550111366e-05, "loss": 1.8629, "step": 3282 }, { "epoch": 0.7295555555555555, "grad_norm": 1.1886756420135498, "learning_rate": 5.4209354120267264e-05, "loss": 1.6837, "step": 3283 }, { "epoch": 0.7297777777777777, "grad_norm": 1.0832699537277222, "learning_rate": 5.416481069042316e-05, "loss": 1.536, "step": 3284 }, { "epoch": 0.73, "grad_norm": 0.6643537878990173, "learning_rate": 5.412026726057907e-05, "loss": 0.7855, "step": 3285 }, { "epoch": 0.7302222222222222, "grad_norm": 1.0094225406646729, "learning_rate": 5.407572383073497e-05, "loss": 1.4139, "step": 3286 }, { "epoch": 0.7304444444444445, "grad_norm": 1.14029860496521, "learning_rate": 5.4031180400890866e-05, "loss": 1.5854, "step": 3287 }, { "epoch": 0.7306666666666667, "grad_norm": 0.9698799848556519, "learning_rate": 5.398663697104678e-05, "loss": 1.4863, "step": 3288 }, { "epoch": 0.7308888888888889, "grad_norm": 1.1054226160049438, "learning_rate": 5.3942093541202676e-05, "loss": 1.3324, "step": 3289 }, { "epoch": 0.7311111111111112, "grad_norm": 1.1010569334030151, "learning_rate": 5.3897550111358575e-05, "loss": 1.4656, "step": 3290 }, { "epoch": 0.7313333333333333, "grad_norm": 1.315499186515808, "learning_rate": 5.385300668151449e-05, "loss": 1.4048, "step": 3291 }, { "epoch": 0.7315555555555555, "grad_norm": 1.108127474784851, "learning_rate": 5.3808463251670386e-05, "loss": 1.327, "step": 3292 }, { "epoch": 0.7317777777777777, "grad_norm": 0.6661926507949829, "learning_rate": 5.376391982182628e-05, "loss": 0.6108, "step": 3293 }, { "epoch": 0.732, "grad_norm": 0.9805776476860046, "learning_rate": 5.371937639198219e-05, "loss": 1.1752, "step": 3294 }, { "epoch": 0.7322222222222222, "grad_norm": 1.0693986415863037, "learning_rate": 5.367483296213809e-05, "loss": 1.3078, "step": 3295 }, { "epoch": 0.7324444444444445, "grad_norm": 1.078148603439331, "learning_rate": 5.363028953229399e-05, "loss": 1.2446, "step": 3296 }, { "epoch": 0.7326666666666667, "grad_norm": 1.1625440120697021, "learning_rate": 5.35857461024499e-05, "loss": 1.2387, "step": 3297 }, { "epoch": 0.7328888888888889, "grad_norm": 1.1278488636016846, "learning_rate": 5.35412026726058e-05, "loss": 1.1962, "step": 3298 }, { "epoch": 0.7331111111111112, "grad_norm": 1.182511806488037, "learning_rate": 5.3496659242761696e-05, "loss": 1.1573, "step": 3299 }, { "epoch": 0.7333333333333333, "grad_norm": 1.1381057500839233, "learning_rate": 5.34521158129176e-05, "loss": 0.7817, "step": 3300 }, { "epoch": 0.7335555555555555, "grad_norm": 0.5531929135322571, "learning_rate": 5.34075723830735e-05, "loss": 0.8331, "step": 3301 }, { "epoch": 0.7337777777777778, "grad_norm": 0.8333101868629456, "learning_rate": 5.33630289532294e-05, "loss": 1.9768, "step": 3302 }, { "epoch": 0.734, "grad_norm": 0.6918635964393616, "learning_rate": 5.331848552338531e-05, "loss": 1.0828, "step": 3303 }, { "epoch": 0.7342222222222222, "grad_norm": 0.9859722256660461, "learning_rate": 5.327394209354121e-05, "loss": 2.2754, "step": 3304 }, { "epoch": 0.7344444444444445, "grad_norm": 0.6960622072219849, "learning_rate": 5.322939866369711e-05, "loss": 1.0663, "step": 3305 }, { "epoch": 0.7346666666666667, "grad_norm": 1.1575109958648682, "learning_rate": 5.3184855233853006e-05, "loss": 2.2622, "step": 3306 }, { "epoch": 0.7348888888888889, "grad_norm": 0.5985379219055176, "learning_rate": 5.314031180400891e-05, "loss": 1.0319, "step": 3307 }, { "epoch": 0.7351111111111112, "grad_norm": 0.06290951371192932, "learning_rate": 5.309576837416481e-05, "loss": 0.0109, "step": 3308 }, { "epoch": 0.7353333333333333, "grad_norm": 0.06811843812465668, "learning_rate": 5.305122494432071e-05, "loss": 0.0109, "step": 3309 }, { "epoch": 0.7355555555555555, "grad_norm": 0.06429023295640945, "learning_rate": 5.300668151447662e-05, "loss": 0.0107, "step": 3310 }, { "epoch": 0.7357777777777778, "grad_norm": 0.06323552876710892, "learning_rate": 5.296213808463252e-05, "loss": 0.0107, "step": 3311 }, { "epoch": 0.736, "grad_norm": 0.6487092971801758, "learning_rate": 5.291759465478842e-05, "loss": 0.9286, "step": 3312 }, { "epoch": 0.7362222222222222, "grad_norm": 0.8638578653335571, "learning_rate": 5.2873051224944324e-05, "loss": 1.8427, "step": 3313 }, { "epoch": 0.7364444444444445, "grad_norm": 0.9095218181610107, "learning_rate": 5.282850779510022e-05, "loss": 2.0546, "step": 3314 }, { "epoch": 0.7366666666666667, "grad_norm": 0.87845379114151, "learning_rate": 5.278396436525612e-05, "loss": 1.9648, "step": 3315 }, { "epoch": 0.7368888888888889, "grad_norm": 0.8854038119316101, "learning_rate": 5.273942093541203e-05, "loss": 1.8114, "step": 3316 }, { "epoch": 0.7371111111111112, "grad_norm": 0.5725350379943848, "learning_rate": 5.269487750556793e-05, "loss": 1.0721, "step": 3317 }, { "epoch": 0.7373333333333333, "grad_norm": 0.6683716177940369, "learning_rate": 5.265033407572383e-05, "loss": 0.9192, "step": 3318 }, { "epoch": 0.7375555555555555, "grad_norm": 0.9927780628204346, "learning_rate": 5.2605790645879735e-05, "loss": 1.8748, "step": 3319 }, { "epoch": 0.7377777777777778, "grad_norm": 0.8612250685691833, "learning_rate": 5.2561247216035634e-05, "loss": 1.8307, "step": 3320 }, { "epoch": 0.738, "grad_norm": 0.9024035930633545, "learning_rate": 5.251670378619153e-05, "loss": 1.8448, "step": 3321 }, { "epoch": 0.7382222222222222, "grad_norm": 0.969914436340332, "learning_rate": 5.2472160356347445e-05, "loss": 1.866, "step": 3322 }, { "epoch": 0.7384444444444445, "grad_norm": 0.6315984129905701, "learning_rate": 5.242761692650334e-05, "loss": 0.9124, "step": 3323 }, { "epoch": 0.7386666666666667, "grad_norm": 0.07167524099349976, "learning_rate": 5.238307349665924e-05, "loss": 0.0158, "step": 3324 }, { "epoch": 0.7388888888888889, "grad_norm": 0.07736406475305557, "learning_rate": 5.233853006681515e-05, "loss": 0.0161, "step": 3325 }, { "epoch": 0.7391111111111112, "grad_norm": 0.07857107371091843, "learning_rate": 5.2293986636971046e-05, "loss": 0.0164, "step": 3326 }, { "epoch": 0.7393333333333333, "grad_norm": 0.0633215382695198, "learning_rate": 5.2249443207126944e-05, "loss": 0.0173, "step": 3327 }, { "epoch": 0.7395555555555555, "grad_norm": 0.7630808353424072, "learning_rate": 5.2204899777282857e-05, "loss": 0.9757, "step": 3328 }, { "epoch": 0.7397777777777778, "grad_norm": 0.8969722986221313, "learning_rate": 5.2160356347438755e-05, "loss": 1.6171, "step": 3329 }, { "epoch": 0.74, "grad_norm": 0.9955383539199829, "learning_rate": 5.2115812917594654e-05, "loss": 1.6627, "step": 3330 }, { "epoch": 0.7402222222222222, "grad_norm": 1.0531073808670044, "learning_rate": 5.2071269487750566e-05, "loss": 1.7925, "step": 3331 }, { "epoch": 0.7404444444444445, "grad_norm": 1.1096101999282837, "learning_rate": 5.202672605790646e-05, "loss": 1.4716, "step": 3332 }, { "epoch": 0.7406666666666667, "grad_norm": 0.06471211463212967, "learning_rate": 5.1982182628062356e-05, "loss": 0.0184, "step": 3333 }, { "epoch": 0.7408888888888889, "grad_norm": 0.07156452536582947, "learning_rate": 5.193763919821827e-05, "loss": 0.0183, "step": 3334 }, { "epoch": 0.7411111111111112, "grad_norm": 0.7111669182777405, "learning_rate": 5.189309576837417e-05, "loss": 0.8435, "step": 3335 }, { "epoch": 0.7413333333333333, "grad_norm": 0.784017026424408, "learning_rate": 5.1848552338530066e-05, "loss": 0.9037, "step": 3336 }, { "epoch": 0.7415555555555555, "grad_norm": 0.07464414834976196, "learning_rate": 5.180400890868598e-05, "loss": 0.0173, "step": 3337 }, { "epoch": 0.7417777777777778, "grad_norm": 0.7238468527793884, "learning_rate": 5.1759465478841876e-05, "loss": 0.8807, "step": 3338 }, { "epoch": 0.742, "grad_norm": 0.07420375943183899, "learning_rate": 5.1714922048997775e-05, "loss": 0.0192, "step": 3339 }, { "epoch": 0.7422222222222222, "grad_norm": 0.07133994251489639, "learning_rate": 5.167037861915368e-05, "loss": 0.0189, "step": 3340 }, { "epoch": 0.7424444444444445, "grad_norm": 0.0961189940571785, "learning_rate": 5.162583518930958e-05, "loss": 0.0194, "step": 3341 }, { "epoch": 0.7426666666666667, "grad_norm": 1.0209311246871948, "learning_rate": 5.158129175946548e-05, "loss": 1.7523, "step": 3342 }, { "epoch": 0.7428888888888889, "grad_norm": 1.067814588546753, "learning_rate": 5.153674832962139e-05, "loss": 1.7394, "step": 3343 }, { "epoch": 0.7431111111111111, "grad_norm": 0.10426237434148788, "learning_rate": 5.149220489977729e-05, "loss": 0.0266, "step": 3344 }, { "epoch": 0.7433333333333333, "grad_norm": 1.1256235837936401, "learning_rate": 5.144766146993319e-05, "loss": 1.5493, "step": 3345 }, { "epoch": 0.7435555555555555, "grad_norm": 1.0838463306427002, "learning_rate": 5.140311804008909e-05, "loss": 1.508, "step": 3346 }, { "epoch": 0.7437777777777778, "grad_norm": 1.0034325122833252, "learning_rate": 5.135857461024499e-05, "loss": 1.3716, "step": 3347 }, { "epoch": 0.744, "grad_norm": 1.1057904958724976, "learning_rate": 5.131403118040089e-05, "loss": 0.9587, "step": 3348 }, { "epoch": 0.7442222222222222, "grad_norm": 0.19667142629623413, "learning_rate": 5.12694877505568e-05, "loss": 0.0377, "step": 3349 }, { "epoch": 0.7444444444444445, "grad_norm": 1.0404895544052124, "learning_rate": 5.12249443207127e-05, "loss": 1.0799, "step": 3350 }, { "epoch": 0.7446666666666667, "grad_norm": 0.8521629571914673, "learning_rate": 5.11804008908686e-05, "loss": 2.0826, "step": 3351 }, { "epoch": 0.7448888888888889, "grad_norm": 0.046493686735630035, "learning_rate": 5.1135857461024504e-05, "loss": 0.0101, "step": 3352 }, { "epoch": 0.7451111111111111, "grad_norm": 0.04533799737691879, "learning_rate": 5.10913140311804e-05, "loss": 0.0102, "step": 3353 }, { "epoch": 0.7453333333333333, "grad_norm": 0.6256393194198608, "learning_rate": 5.10467706013363e-05, "loss": 1.2161, "step": 3354 }, { "epoch": 0.7455555555555555, "grad_norm": 0.5878841280937195, "learning_rate": 5.100222717149221e-05, "loss": 1.1603, "step": 3355 }, { "epoch": 0.7457777777777778, "grad_norm": 0.04651748016476631, "learning_rate": 5.095768374164811e-05, "loss": 0.01, "step": 3356 }, { "epoch": 0.746, "grad_norm": 0.03794243186712265, "learning_rate": 5.091314031180401e-05, "loss": 0.01, "step": 3357 }, { "epoch": 0.7462222222222222, "grad_norm": 0.04922659322619438, "learning_rate": 5.0868596881959916e-05, "loss": 0.0097, "step": 3358 }, { "epoch": 0.7464444444444445, "grad_norm": 0.8625622391700745, "learning_rate": 5.0824053452115814e-05, "loss": 1.8859, "step": 3359 }, { "epoch": 0.7466666666666667, "grad_norm": 0.8704177141189575, "learning_rate": 5.077951002227171e-05, "loss": 1.9087, "step": 3360 }, { "epoch": 0.7468888888888889, "grad_norm": 0.9514003992080688, "learning_rate": 5.0734966592427625e-05, "loss": 2.1152, "step": 3361 }, { "epoch": 0.7471111111111111, "grad_norm": 0.9952490925788879, "learning_rate": 5.0690423162583524e-05, "loss": 2.237, "step": 3362 }, { "epoch": 0.7473333333333333, "grad_norm": 1.0425519943237305, "learning_rate": 5.064587973273942e-05, "loss": 2.1412, "step": 3363 }, { "epoch": 0.7475555555555555, "grad_norm": 0.7753322124481201, "learning_rate": 5.060133630289533e-05, "loss": 1.7639, "step": 3364 }, { "epoch": 0.7477777777777778, "grad_norm": 0.9439111351966858, "learning_rate": 5.0556792873051226e-05, "loss": 1.7622, "step": 3365 }, { "epoch": 0.748, "grad_norm": 0.9274625778198242, "learning_rate": 5.0512249443207125e-05, "loss": 2.1017, "step": 3366 }, { "epoch": 0.7482222222222222, "grad_norm": 0.9550508856773376, "learning_rate": 5.046770601336304e-05, "loss": 1.8416, "step": 3367 }, { "epoch": 0.7484444444444445, "grad_norm": 0.8628423810005188, "learning_rate": 5.0423162583518935e-05, "loss": 1.9227, "step": 3368 }, { "epoch": 0.7486666666666667, "grad_norm": 1.0649088621139526, "learning_rate": 5.0378619153674834e-05, "loss": 2.1865, "step": 3369 }, { "epoch": 0.7488888888888889, "grad_norm": 0.9452845454216003, "learning_rate": 5.033407572383074e-05, "loss": 1.9341, "step": 3370 }, { "epoch": 0.7491111111111111, "grad_norm": 0.9852356910705566, "learning_rate": 5.028953229398664e-05, "loss": 1.6767, "step": 3371 }, { "epoch": 0.7493333333333333, "grad_norm": 0.9458546042442322, "learning_rate": 5.0244988864142536e-05, "loss": 1.786, "step": 3372 }, { "epoch": 0.7495555555555555, "grad_norm": 0.07178652286529541, "learning_rate": 5.020044543429845e-05, "loss": 0.015, "step": 3373 }, { "epoch": 0.7497777777777778, "grad_norm": 0.07055787742137909, "learning_rate": 5.015590200445435e-05, "loss": 0.0153, "step": 3374 }, { "epoch": 0.75, "grad_norm": 0.6104269027709961, "learning_rate": 5.0111358574610246e-05, "loss": 0.8618, "step": 3375 }, { "epoch": 0.7502222222222222, "grad_norm": 0.6599386930465698, "learning_rate": 5.006681514476616e-05, "loss": 0.8642, "step": 3376 }, { "epoch": 0.7504444444444445, "grad_norm": 0.6750035881996155, "learning_rate": 5.0022271714922056e-05, "loss": 0.8647, "step": 3377 }, { "epoch": 0.7506666666666667, "grad_norm": 0.9692963361740112, "learning_rate": 4.997772828507795e-05, "loss": 1.8036, "step": 3378 }, { "epoch": 0.7508888888888889, "grad_norm": 1.0836691856384277, "learning_rate": 4.9933184855233854e-05, "loss": 2.039, "step": 3379 }, { "epoch": 0.7511111111111111, "grad_norm": 0.06479348987340927, "learning_rate": 4.988864142538976e-05, "loss": 0.0173, "step": 3380 }, { "epoch": 0.7513333333333333, "grad_norm": 0.06957981735467911, "learning_rate": 4.984409799554566e-05, "loss": 0.0166, "step": 3381 }, { "epoch": 0.7515555555555555, "grad_norm": 0.666901707649231, "learning_rate": 4.979955456570156e-05, "loss": 0.7578, "step": 3382 }, { "epoch": 0.7517777777777778, "grad_norm": 1.0305155515670776, "learning_rate": 4.975501113585747e-05, "loss": 1.6703, "step": 3383 }, { "epoch": 0.752, "grad_norm": 0.9969210624694824, "learning_rate": 4.971046770601337e-05, "loss": 1.7831, "step": 3384 }, { "epoch": 0.7522222222222222, "grad_norm": 0.068308025598526, "learning_rate": 4.9665924276169265e-05, "loss": 0.018, "step": 3385 }, { "epoch": 0.7524444444444445, "grad_norm": 0.06835668534040451, "learning_rate": 4.962138084632517e-05, "loss": 0.0171, "step": 3386 }, { "epoch": 0.7526666666666667, "grad_norm": 0.562114417552948, "learning_rate": 4.957683741648107e-05, "loss": 0.8015, "step": 3387 }, { "epoch": 0.7528888888888889, "grad_norm": 0.9326373338699341, "learning_rate": 4.9532293986636975e-05, "loss": 1.7364, "step": 3388 }, { "epoch": 0.7531111111111111, "grad_norm": 1.0560567378997803, "learning_rate": 4.948775055679288e-05, "loss": 1.3854, "step": 3389 }, { "epoch": 0.7533333333333333, "grad_norm": 1.0617526769638062, "learning_rate": 4.944320712694878e-05, "loss": 1.3826, "step": 3390 }, { "epoch": 0.7535555555555555, "grad_norm": 0.6773163080215454, "learning_rate": 4.939866369710468e-05, "loss": 0.9724, "step": 3391 }, { "epoch": 0.7537777777777778, "grad_norm": 0.8919631838798523, "learning_rate": 4.935412026726058e-05, "loss": 1.4029, "step": 3392 }, { "epoch": 0.754, "grad_norm": 1.0007896423339844, "learning_rate": 4.930957683741648e-05, "loss": 1.3675, "step": 3393 }, { "epoch": 0.7542222222222222, "grad_norm": 1.1181669235229492, "learning_rate": 4.9265033407572387e-05, "loss": 1.5695, "step": 3394 }, { "epoch": 0.7544444444444445, "grad_norm": 1.058223843574524, "learning_rate": 4.922048997772829e-05, "loss": 1.444, "step": 3395 }, { "epoch": 0.7546666666666667, "grad_norm": 1.0917662382125854, "learning_rate": 4.917594654788419e-05, "loss": 1.5776, "step": 3396 }, { "epoch": 0.7548888888888889, "grad_norm": 1.2129132747650146, "learning_rate": 4.913140311804009e-05, "loss": 1.5378, "step": 3397 }, { "epoch": 0.7551111111111111, "grad_norm": 0.7757513523101807, "learning_rate": 4.908685968819599e-05, "loss": 0.7143, "step": 3398 }, { "epoch": 0.7553333333333333, "grad_norm": 1.0675660371780396, "learning_rate": 4.904231625835189e-05, "loss": 1.2625, "step": 3399 }, { "epoch": 0.7555555555555555, "grad_norm": 0.7911191582679749, "learning_rate": 4.89977728285078e-05, "loss": 0.6726, "step": 3400 }, { "epoch": 0.7557777777777778, "grad_norm": 0.936028003692627, "learning_rate": 4.89532293986637e-05, "loss": 2.5741, "step": 3401 }, { "epoch": 0.756, "grad_norm": 0.04625101387500763, "learning_rate": 4.89086859688196e-05, "loss": 0.0098, "step": 3402 }, { "epoch": 0.7562222222222222, "grad_norm": 0.5739651918411255, "learning_rate": 4.886414253897551e-05, "loss": 1.0021, "step": 3403 }, { "epoch": 0.7564444444444445, "grad_norm": 0.874405562877655, "learning_rate": 4.8819599109131406e-05, "loss": 2.1036, "step": 3404 }, { "epoch": 0.7566666666666667, "grad_norm": 0.5654922723770142, "learning_rate": 4.8775055679287305e-05, "loss": 0.9892, "step": 3405 }, { "epoch": 0.7568888888888889, "grad_norm": 0.6591737866401672, "learning_rate": 4.873051224944321e-05, "loss": 0.9575, "step": 3406 }, { "epoch": 0.7571111111111111, "grad_norm": 0.05461383983492851, "learning_rate": 4.868596881959911e-05, "loss": 0.011, "step": 3407 }, { "epoch": 0.7573333333333333, "grad_norm": 0.0622735358774662, "learning_rate": 4.8641425389755014e-05, "loss": 0.0112, "step": 3408 }, { "epoch": 0.7575555555555555, "grad_norm": 0.059408292174339294, "learning_rate": 4.859688195991092e-05, "loss": 0.011, "step": 3409 }, { "epoch": 0.7577777777777778, "grad_norm": 0.6495372653007507, "learning_rate": 4.855233853006682e-05, "loss": 0.8378, "step": 3410 }, { "epoch": 0.758, "grad_norm": 0.9061746001243591, "learning_rate": 4.850779510022272e-05, "loss": 2.2088, "step": 3411 }, { "epoch": 0.7582222222222222, "grad_norm": 0.8633875846862793, "learning_rate": 4.846325167037862e-05, "loss": 1.9511, "step": 3412 }, { "epoch": 0.7584444444444445, "grad_norm": 1.055767297744751, "learning_rate": 4.841870824053452e-05, "loss": 1.9755, "step": 3413 }, { "epoch": 0.7586666666666667, "grad_norm": 0.8679887056350708, "learning_rate": 4.8374164810690426e-05, "loss": 1.8676, "step": 3414 }, { "epoch": 0.7588888888888888, "grad_norm": 0.9158828258514404, "learning_rate": 4.832962138084633e-05, "loss": 2.0772, "step": 3415 }, { "epoch": 0.7591111111111111, "grad_norm": 0.6672974228858948, "learning_rate": 4.828507795100223e-05, "loss": 1.1813, "step": 3416 }, { "epoch": 0.7593333333333333, "grad_norm": 0.9546223282814026, "learning_rate": 4.824053452115813e-05, "loss": 1.9467, "step": 3417 }, { "epoch": 0.7595555555555555, "grad_norm": 1.0391935110092163, "learning_rate": 4.8195991091314034e-05, "loss": 2.041, "step": 3418 }, { "epoch": 0.7597777777777778, "grad_norm": 1.0147621631622314, "learning_rate": 4.815144766146993e-05, "loss": 2.0473, "step": 3419 }, { "epoch": 0.76, "grad_norm": 0.6334058046340942, "learning_rate": 4.810690423162584e-05, "loss": 0.8882, "step": 3420 }, { "epoch": 0.7602222222222222, "grad_norm": 0.06809257715940475, "learning_rate": 4.806236080178174e-05, "loss": 0.0153, "step": 3421 }, { "epoch": 0.7604444444444445, "grad_norm": 0.06833475828170776, "learning_rate": 4.801781737193764e-05, "loss": 0.0156, "step": 3422 }, { "epoch": 0.7606666666666667, "grad_norm": 0.09722508490085602, "learning_rate": 4.797327394209355e-05, "loss": 0.0179, "step": 3423 }, { "epoch": 0.7608888888888888, "grad_norm": 0.92330402135849, "learning_rate": 4.7928730512249446e-05, "loss": 1.7349, "step": 3424 }, { "epoch": 0.7611111111111111, "grad_norm": 1.0066584348678589, "learning_rate": 4.7884187082405344e-05, "loss": 1.6615, "step": 3425 }, { "epoch": 0.7613333333333333, "grad_norm": 0.9122890830039978, "learning_rate": 4.783964365256125e-05, "loss": 1.9283, "step": 3426 }, { "epoch": 0.7615555555555555, "grad_norm": 1.0834369659423828, "learning_rate": 4.7795100222717155e-05, "loss": 1.8457, "step": 3427 }, { "epoch": 0.7617777777777778, "grad_norm": 0.9122326970100403, "learning_rate": 4.7750556792873054e-05, "loss": 1.5779, "step": 3428 }, { "epoch": 0.762, "grad_norm": 0.6459372639656067, "learning_rate": 4.770601336302896e-05, "loss": 0.8526, "step": 3429 }, { "epoch": 0.7622222222222222, "grad_norm": 0.06661590933799744, "learning_rate": 4.766146993318486e-05, "loss": 0.018, "step": 3430 }, { "epoch": 0.7624444444444445, "grad_norm": 0.06595264375209808, "learning_rate": 4.7616926503340756e-05, "loss": 0.0176, "step": 3431 }, { "epoch": 0.7626666666666667, "grad_norm": 0.06258884072303772, "learning_rate": 4.757238307349666e-05, "loss": 0.0175, "step": 3432 }, { "epoch": 0.7628888888888888, "grad_norm": 0.9908372163772583, "learning_rate": 4.752783964365256e-05, "loss": 1.5601, "step": 3433 }, { "epoch": 0.7631111111111111, "grad_norm": 1.1008018255233765, "learning_rate": 4.7483296213808465e-05, "loss": 1.9175, "step": 3434 }, { "epoch": 0.7633333333333333, "grad_norm": 0.06766713410615921, "learning_rate": 4.743875278396437e-05, "loss": 0.0185, "step": 3435 }, { "epoch": 0.7635555555555555, "grad_norm": 0.06862013787031174, "learning_rate": 4.739420935412027e-05, "loss": 0.018, "step": 3436 }, { "epoch": 0.7637777777777778, "grad_norm": 0.995215654373169, "learning_rate": 4.734966592427617e-05, "loss": 1.6609, "step": 3437 }, { "epoch": 0.764, "grad_norm": 1.1150976419448853, "learning_rate": 4.730512249443207e-05, "loss": 1.5773, "step": 3438 }, { "epoch": 0.7642222222222222, "grad_norm": 0.10110121965408325, "learning_rate": 4.726057906458797e-05, "loss": 0.0254, "step": 3439 }, { "epoch": 0.7644444444444445, "grad_norm": 0.8509777188301086, "learning_rate": 4.721603563474388e-05, "loss": 0.8449, "step": 3440 }, { "epoch": 0.7646666666666667, "grad_norm": 1.163260579109192, "learning_rate": 4.717149220489978e-05, "loss": 1.6988, "step": 3441 }, { "epoch": 0.7648888888888888, "grad_norm": 1.1963449716567993, "learning_rate": 4.712694877505568e-05, "loss": 1.6756, "step": 3442 }, { "epoch": 0.7651111111111111, "grad_norm": 1.1867884397506714, "learning_rate": 4.7082405345211587e-05, "loss": 1.6131, "step": 3443 }, { "epoch": 0.7653333333333333, "grad_norm": 1.0478819608688354, "learning_rate": 4.7037861915367485e-05, "loss": 1.4666, "step": 3444 }, { "epoch": 0.7655555555555555, "grad_norm": 1.076615571975708, "learning_rate": 4.6993318485523384e-05, "loss": 1.3148, "step": 3445 }, { "epoch": 0.7657777777777778, "grad_norm": 0.7551054954528809, "learning_rate": 4.694877505567929e-05, "loss": 0.7423, "step": 3446 }, { "epoch": 0.766, "grad_norm": 0.7709291577339172, "learning_rate": 4.6904231625835194e-05, "loss": 0.749, "step": 3447 }, { "epoch": 0.7662222222222222, "grad_norm": 0.9779494404792786, "learning_rate": 4.685968819599109e-05, "loss": 1.1534, "step": 3448 }, { "epoch": 0.7664444444444445, "grad_norm": 0.7176189422607422, "learning_rate": 4.6815144766147e-05, "loss": 0.5791, "step": 3449 }, { "epoch": 0.7666666666666667, "grad_norm": 2.1151397228240967, "learning_rate": 4.67706013363029e-05, "loss": 1.2452, "step": 3450 }, { "epoch": 0.7668888888888888, "grad_norm": 0.04698283597826958, "learning_rate": 4.6726057906458796e-05, "loss": 0.0105, "step": 3451 }, { "epoch": 0.7671111111111111, "grad_norm": 0.806088387966156, "learning_rate": 4.66815144766147e-05, "loss": 2.1148, "step": 3452 }, { "epoch": 0.7673333333333333, "grad_norm": 0.9896338582038879, "learning_rate": 4.6636971046770606e-05, "loss": 2.4891, "step": 3453 }, { "epoch": 0.7675555555555555, "grad_norm": 0.6359859704971313, "learning_rate": 4.6592427616926505e-05, "loss": 0.8817, "step": 3454 }, { "epoch": 0.7677777777777778, "grad_norm": 0.6366167068481445, "learning_rate": 4.654788418708241e-05, "loss": 1.1016, "step": 3455 }, { "epoch": 0.768, "grad_norm": 0.6625463366508484, "learning_rate": 4.650334075723831e-05, "loss": 1.2164, "step": 3456 }, { "epoch": 0.7682222222222223, "grad_norm": 0.6121510863304138, "learning_rate": 4.645879732739421e-05, "loss": 0.8625, "step": 3457 }, { "epoch": 0.7684444444444445, "grad_norm": 1.0105525255203247, "learning_rate": 4.641425389755011e-05, "loss": 2.7758, "step": 3458 }, { "epoch": 0.7686666666666667, "grad_norm": 0.8283724188804626, "learning_rate": 4.636971046770602e-05, "loss": 2.0447, "step": 3459 }, { "epoch": 0.7688888888888888, "grad_norm": 0.06813201308250427, "learning_rate": 4.632516703786192e-05, "loss": 0.0111, "step": 3460 }, { "epoch": 0.7691111111111111, "grad_norm": 0.0665576308965683, "learning_rate": 4.628062360801782e-05, "loss": 0.011, "step": 3461 }, { "epoch": 0.7693333333333333, "grad_norm": 0.06463496387004852, "learning_rate": 4.623608017817373e-05, "loss": 0.0111, "step": 3462 }, { "epoch": 0.7695555555555555, "grad_norm": 0.8166987895965576, "learning_rate": 4.619153674832962e-05, "loss": 2.0366, "step": 3463 }, { "epoch": 0.7697777777777778, "grad_norm": 0.9549795985221863, "learning_rate": 4.6146993318485525e-05, "loss": 1.8478, "step": 3464 }, { "epoch": 0.77, "grad_norm": 0.8335583209991455, "learning_rate": 4.610244988864143e-05, "loss": 1.8226, "step": 3465 }, { "epoch": 0.7702222222222223, "grad_norm": 0.9823237657546997, "learning_rate": 4.605790645879733e-05, "loss": 2.114, "step": 3466 }, { "epoch": 0.7704444444444445, "grad_norm": 0.9316264986991882, "learning_rate": 4.6013363028953234e-05, "loss": 2.0765, "step": 3467 }, { "epoch": 0.7706666666666667, "grad_norm": 0.8862332701683044, "learning_rate": 4.596881959910914e-05, "loss": 1.8899, "step": 3468 }, { "epoch": 0.7708888888888888, "grad_norm": 0.9615729451179504, "learning_rate": 4.592427616926504e-05, "loss": 1.9076, "step": 3469 }, { "epoch": 0.7711111111111111, "grad_norm": 0.107745461165905, "learning_rate": 4.5879732739420936e-05, "loss": 0.0184, "step": 3470 }, { "epoch": 0.7713333333333333, "grad_norm": 0.9588910341262817, "learning_rate": 4.5835189309576835e-05, "loss": 1.7689, "step": 3471 }, { "epoch": 0.7715555555555556, "grad_norm": 0.8808805346488953, "learning_rate": 4.579064587973274e-05, "loss": 2.204, "step": 3472 }, { "epoch": 0.7717777777777778, "grad_norm": 1.1614326238632202, "learning_rate": 4.5746102449888646e-05, "loss": 1.9956, "step": 3473 }, { "epoch": 0.772, "grad_norm": 0.8884471654891968, "learning_rate": 4.5701559020044544e-05, "loss": 1.875, "step": 3474 }, { "epoch": 0.7722222222222223, "grad_norm": 0.9541723728179932, "learning_rate": 4.565701559020045e-05, "loss": 1.8806, "step": 3475 }, { "epoch": 0.7724444444444445, "grad_norm": 0.07305742055177689, "learning_rate": 4.561247216035635e-05, "loss": 0.0169, "step": 3476 }, { "epoch": 0.7726666666666666, "grad_norm": 0.6821660399436951, "learning_rate": 4.556792873051225e-05, "loss": 1.079, "step": 3477 }, { "epoch": 0.7728888888888888, "grad_norm": 0.9830121994018555, "learning_rate": 4.552338530066815e-05, "loss": 1.7285, "step": 3478 }, { "epoch": 0.7731111111111111, "grad_norm": 0.6831437945365906, "learning_rate": 4.547884187082406e-05, "loss": 0.8884, "step": 3479 }, { "epoch": 0.7733333333333333, "grad_norm": 1.0051524639129639, "learning_rate": 4.5434298440979956e-05, "loss": 1.8973, "step": 3480 }, { "epoch": 0.7735555555555556, "grad_norm": 0.9727129340171814, "learning_rate": 4.538975501113586e-05, "loss": 2.042, "step": 3481 }, { "epoch": 0.7737777777777778, "grad_norm": 1.0296839475631714, "learning_rate": 4.534521158129176e-05, "loss": 1.7367, "step": 3482 }, { "epoch": 0.774, "grad_norm": 0.9972522258758545, "learning_rate": 4.530066815144766e-05, "loss": 1.8867, "step": 3483 }, { "epoch": 0.7742222222222223, "grad_norm": 1.0227113962173462, "learning_rate": 4.5256124721603564e-05, "loss": 1.8279, "step": 3484 }, { "epoch": 0.7744444444444445, "grad_norm": 1.061448335647583, "learning_rate": 4.521158129175947e-05, "loss": 1.9245, "step": 3485 }, { "epoch": 0.7746666666666666, "grad_norm": 0.6845740675926208, "learning_rate": 4.516703786191537e-05, "loss": 0.9532, "step": 3486 }, { "epoch": 0.7748888888888888, "grad_norm": 1.010504961013794, "learning_rate": 4.512249443207127e-05, "loss": 1.6469, "step": 3487 }, { "epoch": 0.7751111111111111, "grad_norm": 1.15483820438385, "learning_rate": 4.507795100222718e-05, "loss": 1.735, "step": 3488 }, { "epoch": 0.7753333333333333, "grad_norm": 0.06943599879741669, "learning_rate": 4.503340757238308e-05, "loss": 0.0192, "step": 3489 }, { "epoch": 0.7755555555555556, "grad_norm": 0.06758453696966171, "learning_rate": 4.4988864142538976e-05, "loss": 0.0185, "step": 3490 }, { "epoch": 0.7757777777777778, "grad_norm": 0.06915237754583359, "learning_rate": 4.494432071269488e-05, "loss": 0.0189, "step": 3491 }, { "epoch": 0.776, "grad_norm": 0.7292212843894958, "learning_rate": 4.489977728285078e-05, "loss": 0.8598, "step": 3492 }, { "epoch": 0.7762222222222223, "grad_norm": 0.9773833751678467, "learning_rate": 4.4855233853006685e-05, "loss": 1.5498, "step": 3493 }, { "epoch": 0.7764444444444445, "grad_norm": 1.0763559341430664, "learning_rate": 4.481069042316259e-05, "loss": 1.6527, "step": 3494 }, { "epoch": 0.7766666666666666, "grad_norm": 1.0425339937210083, "learning_rate": 4.476614699331849e-05, "loss": 1.272, "step": 3495 }, { "epoch": 0.7768888888888889, "grad_norm": 1.1225720643997192, "learning_rate": 4.472160356347439e-05, "loss": 1.3202, "step": 3496 }, { "epoch": 0.7771111111111111, "grad_norm": 1.2557756900787354, "learning_rate": 4.467706013363029e-05, "loss": 1.378, "step": 3497 }, { "epoch": 0.7773333333333333, "grad_norm": 0.17433112859725952, "learning_rate": 4.463251670378619e-05, "loss": 0.038, "step": 3498 }, { "epoch": 0.7775555555555556, "grad_norm": 1.008841633796692, "learning_rate": 4.45879732739421e-05, "loss": 0.8367, "step": 3499 }, { "epoch": 0.7777777777777778, "grad_norm": 0.9712222814559937, "learning_rate": 4.4543429844098e-05, "loss": 0.6577, "step": 3500 }, { "epoch": 0.778, "grad_norm": 0.6440428495407104, "learning_rate": 4.44988864142539e-05, "loss": 0.9956, "step": 3501 }, { "epoch": 0.7782222222222223, "grad_norm": 0.5025835633277893, "learning_rate": 4.44543429844098e-05, "loss": 0.9893, "step": 3502 }, { "epoch": 0.7784444444444445, "grad_norm": 0.04510605335235596, "learning_rate": 4.4409799554565705e-05, "loss": 0.0106, "step": 3503 }, { "epoch": 0.7786666666666666, "grad_norm": 0.9423682689666748, "learning_rate": 4.43652561247216e-05, "loss": 2.3701, "step": 3504 }, { "epoch": 0.7788888888888889, "grad_norm": 0.9391410946846008, "learning_rate": 4.432071269487751e-05, "loss": 2.172, "step": 3505 }, { "epoch": 0.7791111111111111, "grad_norm": 0.9086732864379883, "learning_rate": 4.427616926503341e-05, "loss": 2.217, "step": 3506 }, { "epoch": 0.7793333333333333, "grad_norm": 0.9702697396278381, "learning_rate": 4.423162583518931e-05, "loss": 2.1024, "step": 3507 }, { "epoch": 0.7795555555555556, "grad_norm": 0.9364957809448242, "learning_rate": 4.418708240534522e-05, "loss": 1.9179, "step": 3508 }, { "epoch": 0.7797777777777778, "grad_norm": 0.9006823301315308, "learning_rate": 4.414253897550111e-05, "loss": 1.9431, "step": 3509 }, { "epoch": 0.78, "grad_norm": 0.8712829947471619, "learning_rate": 4.4097995545657015e-05, "loss": 1.7075, "step": 3510 }, { "epoch": 0.7802222222222223, "grad_norm": 0.8921668529510498, "learning_rate": 4.405345211581292e-05, "loss": 2.0187, "step": 3511 }, { "epoch": 0.7804444444444445, "grad_norm": 1.2319942712783813, "learning_rate": 4.400890868596882e-05, "loss": 2.2116, "step": 3512 }, { "epoch": 0.7806666666666666, "grad_norm": 1.0687848329544067, "learning_rate": 4.3964365256124724e-05, "loss": 1.827, "step": 3513 }, { "epoch": 0.7808888888888889, "grad_norm": 0.06872207671403885, "learning_rate": 4.391982182628063e-05, "loss": 0.016, "step": 3514 }, { "epoch": 0.7811111111111111, "grad_norm": 0.06946699321269989, "learning_rate": 4.387527839643653e-05, "loss": 0.016, "step": 3515 }, { "epoch": 0.7813333333333333, "grad_norm": 0.06873323768377304, "learning_rate": 4.383073496659243e-05, "loss": 0.0157, "step": 3516 }, { "epoch": 0.7815555555555556, "grad_norm": 0.7765884399414062, "learning_rate": 4.378619153674833e-05, "loss": 0.974, "step": 3517 }, { "epoch": 0.7817777777777778, "grad_norm": 0.7765089869499207, "learning_rate": 4.374164810690423e-05, "loss": 0.9251, "step": 3518 }, { "epoch": 0.782, "grad_norm": 0.6271977424621582, "learning_rate": 4.3697104677060136e-05, "loss": 0.8018, "step": 3519 }, { "epoch": 0.7822222222222223, "grad_norm": 0.726948618888855, "learning_rate": 4.365256124721604e-05, "loss": 0.8888, "step": 3520 }, { "epoch": 0.7824444444444445, "grad_norm": 0.9243329167366028, "learning_rate": 4.360801781737194e-05, "loss": 1.6066, "step": 3521 }, { "epoch": 0.7826666666666666, "grad_norm": 1.2513469457626343, "learning_rate": 4.356347438752784e-05, "loss": 1.9953, "step": 3522 }, { "epoch": 0.7828888888888889, "grad_norm": 0.9244915246963501, "learning_rate": 4.3518930957683744e-05, "loss": 1.8274, "step": 3523 }, { "epoch": 0.7831111111111111, "grad_norm": 1.0428435802459717, "learning_rate": 4.347438752783964e-05, "loss": 1.6283, "step": 3524 }, { "epoch": 0.7833333333333333, "grad_norm": 0.8906724452972412, "learning_rate": 4.342984409799555e-05, "loss": 1.6417, "step": 3525 }, { "epoch": 0.7835555555555556, "grad_norm": 1.084955096244812, "learning_rate": 4.3385300668151454e-05, "loss": 1.6931, "step": 3526 }, { "epoch": 0.7837777777777778, "grad_norm": 0.945055365562439, "learning_rate": 4.334075723830735e-05, "loss": 1.0096, "step": 3527 }, { "epoch": 0.784, "grad_norm": 0.0929998904466629, "learning_rate": 4.329621380846325e-05, "loss": 0.0173, "step": 3528 }, { "epoch": 0.7842222222222223, "grad_norm": 1.0231083631515503, "learning_rate": 4.3251670378619156e-05, "loss": 1.729, "step": 3529 }, { "epoch": 0.7844444444444445, "grad_norm": 1.0118030309677124, "learning_rate": 4.3207126948775055e-05, "loss": 1.6239, "step": 3530 }, { "epoch": 0.7846666666666666, "grad_norm": 0.712536096572876, "learning_rate": 4.316258351893096e-05, "loss": 0.7916, "step": 3531 }, { "epoch": 0.7848888888888889, "grad_norm": 0.0708150640130043, "learning_rate": 4.3118040089086865e-05, "loss": 0.0182, "step": 3532 }, { "epoch": 0.7851111111111111, "grad_norm": 0.08458317071199417, "learning_rate": 4.3073496659242764e-05, "loss": 0.0185, "step": 3533 }, { "epoch": 0.7853333333333333, "grad_norm": 1.0665457248687744, "learning_rate": 4.302895322939867e-05, "loss": 1.6411, "step": 3534 }, { "epoch": 0.7855555555555556, "grad_norm": 1.1385186910629272, "learning_rate": 4.298440979955457e-05, "loss": 1.3018, "step": 3535 }, { "epoch": 0.7857777777777778, "grad_norm": 1.0706120729446411, "learning_rate": 4.2939866369710466e-05, "loss": 1.5641, "step": 3536 }, { "epoch": 0.786, "grad_norm": 1.1300500631332397, "learning_rate": 4.289532293986637e-05, "loss": 1.8248, "step": 3537 }, { "epoch": 0.7862222222222223, "grad_norm": 1.3996295928955078, "learning_rate": 4.285077951002228e-05, "loss": 1.5244, "step": 3538 }, { "epoch": 0.7864444444444444, "grad_norm": 1.079788088798523, "learning_rate": 4.2806236080178176e-05, "loss": 1.6515, "step": 3539 }, { "epoch": 0.7866666666666666, "grad_norm": 1.3016208410263062, "learning_rate": 4.276169265033408e-05, "loss": 1.5737, "step": 3540 }, { "epoch": 0.7868888888888889, "grad_norm": 1.1378430128097534, "learning_rate": 4.271714922048998e-05, "loss": 1.4334, "step": 3541 }, { "epoch": 0.7871111111111111, "grad_norm": 1.0300368070602417, "learning_rate": 4.267260579064588e-05, "loss": 1.4333, "step": 3542 }, { "epoch": 0.7873333333333333, "grad_norm": 1.0914349555969238, "learning_rate": 4.2628062360801784e-05, "loss": 1.3632, "step": 3543 }, { "epoch": 0.7875555555555556, "grad_norm": 1.0840858221054077, "learning_rate": 4.258351893095768e-05, "loss": 1.471, "step": 3544 }, { "epoch": 0.7877777777777778, "grad_norm": 0.19576247036457062, "learning_rate": 4.253897550111359e-05, "loss": 0.0362, "step": 3545 }, { "epoch": 0.788, "grad_norm": 0.19378961622714996, "learning_rate": 4.249443207126949e-05, "loss": 0.035, "step": 3546 }, { "epoch": 0.7882222222222223, "grad_norm": 0.13323235511779785, "learning_rate": 4.244988864142539e-05, "loss": 0.0349, "step": 3547 }, { "epoch": 0.7884444444444444, "grad_norm": 0.1556854248046875, "learning_rate": 4.240534521158129e-05, "loss": 0.0353, "step": 3548 }, { "epoch": 0.7886666666666666, "grad_norm": 1.26119065284729, "learning_rate": 4.2360801781737195e-05, "loss": 1.1486, "step": 3549 }, { "epoch": 0.7888888888888889, "grad_norm": 0.8363838791847229, "learning_rate": 4.2316258351893094e-05, "loss": 0.5973, "step": 3550 }, { "epoch": 0.7891111111111111, "grad_norm": 0.6089571118354797, "learning_rate": 4.2271714922049e-05, "loss": 1.1064, "step": 3551 }, { "epoch": 0.7893333333333333, "grad_norm": 0.04548042267560959, "learning_rate": 4.2227171492204905e-05, "loss": 0.0104, "step": 3552 }, { "epoch": 0.7895555555555556, "grad_norm": 0.6471196413040161, "learning_rate": 4.21826280623608e-05, "loss": 1.1893, "step": 3553 }, { "epoch": 0.7897777777777778, "grad_norm": 0.046915166079998016, "learning_rate": 4.213808463251671e-05, "loss": 0.0107, "step": 3554 }, { "epoch": 0.79, "grad_norm": 0.5992786884307861, "learning_rate": 4.209354120267261e-05, "loss": 0.9432, "step": 3555 }, { "epoch": 0.7902222222222223, "grad_norm": 0.08186815679073334, "learning_rate": 4.2048997772828506e-05, "loss": 0.0118, "step": 3556 }, { "epoch": 0.7904444444444444, "grad_norm": 0.9210121035575867, "learning_rate": 4.200445434298441e-05, "loss": 2.1219, "step": 3557 }, { "epoch": 0.7906666666666666, "grad_norm": 1.006956696510315, "learning_rate": 4.1959910913140317e-05, "loss": 1.8555, "step": 3558 }, { "epoch": 0.7908888888888889, "grad_norm": 0.9316953420639038, "learning_rate": 4.1915367483296215e-05, "loss": 2.0922, "step": 3559 }, { "epoch": 0.7911111111111111, "grad_norm": 0.9348610639572144, "learning_rate": 4.187082405345212e-05, "loss": 2.2086, "step": 3560 }, { "epoch": 0.7913333333333333, "grad_norm": 0.884831964969635, "learning_rate": 4.182628062360802e-05, "loss": 1.712, "step": 3561 }, { "epoch": 0.7915555555555556, "grad_norm": 0.9388924837112427, "learning_rate": 4.178173719376392e-05, "loss": 1.9738, "step": 3562 }, { "epoch": 0.7917777777777778, "grad_norm": 0.8886390924453735, "learning_rate": 4.173719376391982e-05, "loss": 2.0557, "step": 3563 }, { "epoch": 0.792, "grad_norm": 0.9279087781906128, "learning_rate": 4.169265033407573e-05, "loss": 1.8811, "step": 3564 }, { "epoch": 0.7922222222222223, "grad_norm": 0.8624377250671387, "learning_rate": 4.164810690423163e-05, "loss": 1.7352, "step": 3565 }, { "epoch": 0.7924444444444444, "grad_norm": 0.9816845059394836, "learning_rate": 4.160356347438753e-05, "loss": 2.084, "step": 3566 }, { "epoch": 0.7926666666666666, "grad_norm": 1.113109827041626, "learning_rate": 4.155902004454343e-05, "loss": 1.8477, "step": 3567 }, { "epoch": 0.7928888888888889, "grad_norm": 0.9390388131141663, "learning_rate": 4.151447661469933e-05, "loss": 2.0081, "step": 3568 }, { "epoch": 0.7931111111111111, "grad_norm": 0.06967326253652573, "learning_rate": 4.1469933184855235e-05, "loss": 0.0158, "step": 3569 }, { "epoch": 0.7933333333333333, "grad_norm": 0.7333217859268188, "learning_rate": 4.142538975501114e-05, "loss": 0.959, "step": 3570 }, { "epoch": 0.7935555555555556, "grad_norm": 0.9319103360176086, "learning_rate": 4.138084632516704e-05, "loss": 1.7261, "step": 3571 }, { "epoch": 0.7937777777777778, "grad_norm": 1.187261700630188, "learning_rate": 4.1336302895322944e-05, "loss": 2.1856, "step": 3572 }, { "epoch": 0.794, "grad_norm": 0.9051704406738281, "learning_rate": 4.129175946547885e-05, "loss": 1.5193, "step": 3573 }, { "epoch": 0.7942222222222223, "grad_norm": 1.0516455173492432, "learning_rate": 4.124721603563475e-05, "loss": 1.8372, "step": 3574 }, { "epoch": 0.7944444444444444, "grad_norm": 0.9026862978935242, "learning_rate": 4.120267260579065e-05, "loss": 1.7081, "step": 3575 }, { "epoch": 0.7946666666666666, "grad_norm": 1.0022499561309814, "learning_rate": 4.115812917594655e-05, "loss": 1.8029, "step": 3576 }, { "epoch": 0.7948888888888889, "grad_norm": 1.0183892250061035, "learning_rate": 4.111358574610245e-05, "loss": 1.8359, "step": 3577 }, { "epoch": 0.7951111111111111, "grad_norm": 0.06487128883600235, "learning_rate": 4.1069042316258356e-05, "loss": 0.018, "step": 3578 }, { "epoch": 0.7953333333333333, "grad_norm": 0.07973368465900421, "learning_rate": 4.1024498886414255e-05, "loss": 0.0177, "step": 3579 }, { "epoch": 0.7955555555555556, "grad_norm": 0.6818245053291321, "learning_rate": 4.097995545657016e-05, "loss": 0.7042, "step": 3580 }, { "epoch": 0.7957777777777778, "grad_norm": 0.7157871127128601, "learning_rate": 4.093541202672606e-05, "loss": 0.9386, "step": 3581 }, { "epoch": 0.796, "grad_norm": 0.0813339352607727, "learning_rate": 4.089086859688196e-05, "loss": 0.0178, "step": 3582 }, { "epoch": 0.7962222222222223, "grad_norm": 0.07369329035282135, "learning_rate": 4.084632516703786e-05, "loss": 0.0175, "step": 3583 }, { "epoch": 0.7964444444444444, "grad_norm": 0.07479511946439743, "learning_rate": 4.080178173719377e-05, "loss": 0.0181, "step": 3584 }, { "epoch": 0.7966666666666666, "grad_norm": 0.07840964198112488, "learning_rate": 4.0757238307349666e-05, "loss": 0.0169, "step": 3585 }, { "epoch": 0.7968888888888889, "grad_norm": 1.3525023460388184, "learning_rate": 4.071269487750557e-05, "loss": 1.9031, "step": 3586 }, { "epoch": 0.7971111111111111, "grad_norm": 1.6226398944854736, "learning_rate": 4.066815144766147e-05, "loss": 1.763, "step": 3587 }, { "epoch": 0.7973333333333333, "grad_norm": 0.9485257267951965, "learning_rate": 4.062360801781737e-05, "loss": 1.4717, "step": 3588 }, { "epoch": 0.7975555555555556, "grad_norm": 0.9942083358764648, "learning_rate": 4.0579064587973274e-05, "loss": 1.61, "step": 3589 }, { "epoch": 0.7977777777777778, "grad_norm": 0.10811223834753036, "learning_rate": 4.053452115812918e-05, "loss": 0.0255, "step": 3590 }, { "epoch": 0.798, "grad_norm": 0.9328152537345886, "learning_rate": 4.048997772828508e-05, "loss": 1.6944, "step": 3591 }, { "epoch": 0.7982222222222223, "grad_norm": 1.1752344369888306, "learning_rate": 4.0445434298440984e-05, "loss": 1.5285, "step": 3592 }, { "epoch": 0.7984444444444444, "grad_norm": 1.2680740356445312, "learning_rate": 4.040089086859689e-05, "loss": 1.4009, "step": 3593 }, { "epoch": 0.7986666666666666, "grad_norm": 1.1118732690811157, "learning_rate": 4.035634743875278e-05, "loss": 1.3982, "step": 3594 }, { "epoch": 0.7988888888888889, "grad_norm": 1.389929175376892, "learning_rate": 4.0311804008908686e-05, "loss": 1.3998, "step": 3595 }, { "epoch": 0.7991111111111111, "grad_norm": 0.8431739807128906, "learning_rate": 4.026726057906459e-05, "loss": 0.8518, "step": 3596 }, { "epoch": 0.7993333333333333, "grad_norm": 1.3290144205093384, "learning_rate": 4.022271714922049e-05, "loss": 1.4607, "step": 3597 }, { "epoch": 0.7995555555555556, "grad_norm": 0.15491686761379242, "learning_rate": 4.0178173719376395e-05, "loss": 0.0335, "step": 3598 }, { "epoch": 0.7997777777777778, "grad_norm": 0.7583034038543701, "learning_rate": 4.01336302895323e-05, "loss": 0.4833, "step": 3599 }, { "epoch": 0.8, "grad_norm": 0.900272786617279, "learning_rate": 4.00890868596882e-05, "loss": 0.8159, "step": 3600 }, { "epoch": 0.8, "eval_loss": 1.1739096641540527, "eval_runtime": 239.8403, "eval_samples_per_second": 4.169, "eval_steps_per_second": 4.169, "step": 3600 }, { "epoch": 0.8002222222222222, "grad_norm": 0.5315160155296326, "learning_rate": 4.00445434298441e-05, "loss": 1.0474, "step": 3601 }, { "epoch": 0.8004444444444444, "grad_norm": 0.5535997152328491, "learning_rate": 4e-05, "loss": 0.9027, "step": 3602 }, { "epoch": 0.8006666666666666, "grad_norm": 0.7079357504844666, "learning_rate": 3.99554565701559e-05, "loss": 1.1432, "step": 3603 }, { "epoch": 0.8008888888888889, "grad_norm": 0.6734809875488281, "learning_rate": 3.991091314031181e-05, "loss": 1.0644, "step": 3604 }, { "epoch": 0.8011111111111111, "grad_norm": 0.6071887016296387, "learning_rate": 3.986636971046771e-05, "loss": 1.101, "step": 3605 }, { "epoch": 0.8013333333333333, "grad_norm": 0.5823980569839478, "learning_rate": 3.982182628062361e-05, "loss": 0.8761, "step": 3606 }, { "epoch": 0.8015555555555556, "grad_norm": 0.09482403099536896, "learning_rate": 3.977728285077951e-05, "loss": 0.0129, "step": 3607 }, { "epoch": 0.8017777777777778, "grad_norm": 0.588431179523468, "learning_rate": 3.9732739420935415e-05, "loss": 1.0989, "step": 3608 }, { "epoch": 0.802, "grad_norm": 0.9978165626525879, "learning_rate": 3.9688195991091314e-05, "loss": 1.6576, "step": 3609 }, { "epoch": 0.8022222222222222, "grad_norm": 0.9243869185447693, "learning_rate": 3.964365256124722e-05, "loss": 2.1462, "step": 3610 }, { "epoch": 0.8024444444444444, "grad_norm": 0.8666023015975952, "learning_rate": 3.9599109131403124e-05, "loss": 2.0262, "step": 3611 }, { "epoch": 0.8026666666666666, "grad_norm": 0.9008248448371887, "learning_rate": 3.955456570155902e-05, "loss": 2.0854, "step": 3612 }, { "epoch": 0.8028888888888889, "grad_norm": 0.6247386336326599, "learning_rate": 3.951002227171492e-05, "loss": 0.8506, "step": 3613 }, { "epoch": 0.8031111111111111, "grad_norm": 0.13101428747177124, "learning_rate": 3.946547884187082e-05, "loss": 0.0223, "step": 3614 }, { "epoch": 0.8033333333333333, "grad_norm": 0.12358218431472778, "learning_rate": 3.9420935412026726e-05, "loss": 0.0211, "step": 3615 }, { "epoch": 0.8035555555555556, "grad_norm": 0.9996263980865479, "learning_rate": 3.937639198218263e-05, "loss": 2.0538, "step": 3616 }, { "epoch": 0.8037777777777778, "grad_norm": 1.048120379447937, "learning_rate": 3.933184855233853e-05, "loss": 1.6907, "step": 3617 }, { "epoch": 0.804, "grad_norm": 0.9704152941703796, "learning_rate": 3.9287305122494435e-05, "loss": 2.025, "step": 3618 }, { "epoch": 0.8042222222222222, "grad_norm": 0.9544731378555298, "learning_rate": 3.924276169265034e-05, "loss": 1.8855, "step": 3619 }, { "epoch": 0.8044444444444444, "grad_norm": 0.869174599647522, "learning_rate": 3.919821826280624e-05, "loss": 2.1548, "step": 3620 }, { "epoch": 0.8046666666666666, "grad_norm": 0.7145273685455322, "learning_rate": 3.915367483296214e-05, "loss": 1.0897, "step": 3621 }, { "epoch": 0.8048888888888889, "grad_norm": 0.06968183070421219, "learning_rate": 3.910913140311804e-05, "loss": 0.0157, "step": 3622 }, { "epoch": 0.8051111111111111, "grad_norm": 0.636101484298706, "learning_rate": 3.906458797327394e-05, "loss": 0.8512, "step": 3623 }, { "epoch": 0.8053333333333333, "grad_norm": 1.1181496381759644, "learning_rate": 3.902004454342985e-05, "loss": 1.5164, "step": 3624 }, { "epoch": 0.8055555555555556, "grad_norm": 0.6359825730323792, "learning_rate": 3.897550111358575e-05, "loss": 1.0128, "step": 3625 }, { "epoch": 0.8057777777777778, "grad_norm": 0.943658173084259, "learning_rate": 3.893095768374165e-05, "loss": 1.582, "step": 3626 }, { "epoch": 0.806, "grad_norm": 1.047963261604309, "learning_rate": 3.888641425389755e-05, "loss": 1.704, "step": 3627 }, { "epoch": 0.8062222222222222, "grad_norm": 0.9852431416511536, "learning_rate": 3.8841870824053455e-05, "loss": 1.6806, "step": 3628 }, { "epoch": 0.8064444444444444, "grad_norm": 0.8574654459953308, "learning_rate": 3.879732739420935e-05, "loss": 0.916, "step": 3629 }, { "epoch": 0.8066666666666666, "grad_norm": 0.9380385279655457, "learning_rate": 3.875278396436526e-05, "loss": 0.9947, "step": 3630 }, { "epoch": 0.8068888888888889, "grad_norm": 0.06696880608797073, "learning_rate": 3.8708240534521164e-05, "loss": 0.017, "step": 3631 }, { "epoch": 0.8071111111111111, "grad_norm": 0.0664907768368721, "learning_rate": 3.866369710467706e-05, "loss": 0.0173, "step": 3632 }, { "epoch": 0.8073333333333333, "grad_norm": 0.06398806720972061, "learning_rate": 3.861915367483296e-05, "loss": 0.0171, "step": 3633 }, { "epoch": 0.8075555555555556, "grad_norm": 1.21707022190094, "learning_rate": 3.8574610244988866e-05, "loss": 2.0207, "step": 3634 }, { "epoch": 0.8077777777777778, "grad_norm": 1.04575777053833, "learning_rate": 3.8530066815144765e-05, "loss": 1.8143, "step": 3635 }, { "epoch": 0.808, "grad_norm": 0.0705951601266861, "learning_rate": 3.848552338530067e-05, "loss": 0.0166, "step": 3636 }, { "epoch": 0.8082222222222222, "grad_norm": 0.076121024787426, "learning_rate": 3.8440979955456576e-05, "loss": 0.0167, "step": 3637 }, { "epoch": 0.8084444444444444, "grad_norm": 1.0178598165512085, "learning_rate": 3.8396436525612474e-05, "loss": 1.6507, "step": 3638 }, { "epoch": 0.8086666666666666, "grad_norm": 1.102067232131958, "learning_rate": 3.835189309576838e-05, "loss": 1.7494, "step": 3639 }, { "epoch": 0.8088888888888889, "grad_norm": 0.10027281194925308, "learning_rate": 3.830734966592428e-05, "loss": 0.0251, "step": 3640 }, { "epoch": 0.8091111111111111, "grad_norm": 0.6546877026557922, "learning_rate": 3.826280623608018e-05, "loss": 0.6345, "step": 3641 }, { "epoch": 0.8093333333333333, "grad_norm": 1.25735342502594, "learning_rate": 3.821826280623608e-05, "loss": 1.7112, "step": 3642 }, { "epoch": 0.8095555555555556, "grad_norm": 1.1318788528442383, "learning_rate": 3.817371937639199e-05, "loss": 1.5739, "step": 3643 }, { "epoch": 0.8097777777777778, "grad_norm": 1.0459527969360352, "learning_rate": 3.8129175946547886e-05, "loss": 1.3376, "step": 3644 }, { "epoch": 0.81, "grad_norm": 1.0099126100540161, "learning_rate": 3.808463251670379e-05, "loss": 1.2814, "step": 3645 }, { "epoch": 0.8102222222222222, "grad_norm": 0.82170170545578, "learning_rate": 3.804008908685969e-05, "loss": 0.9743, "step": 3646 }, { "epoch": 0.8104444444444444, "grad_norm": 1.0343072414398193, "learning_rate": 3.799554565701559e-05, "loss": 1.4283, "step": 3647 }, { "epoch": 0.8106666666666666, "grad_norm": 0.944911777973175, "learning_rate": 3.7951002227171494e-05, "loss": 1.05, "step": 3648 }, { "epoch": 0.8108888888888889, "grad_norm": 0.7252945303916931, "learning_rate": 3.79064587973274e-05, "loss": 0.6662, "step": 3649 }, { "epoch": 0.8111111111111111, "grad_norm": 1.005825400352478, "learning_rate": 3.78619153674833e-05, "loss": 0.7958, "step": 3650 }, { "epoch": 0.8113333333333334, "grad_norm": 0.82235187292099, "learning_rate": 3.78173719376392e-05, "loss": 2.2468, "step": 3651 }, { "epoch": 0.8115555555555556, "grad_norm": 0.6531208157539368, "learning_rate": 3.77728285077951e-05, "loss": 0.9914, "step": 3652 }, { "epoch": 0.8117777777777778, "grad_norm": 0.8134424686431885, "learning_rate": 3.7728285077951e-05, "loss": 2.2763, "step": 3653 }, { "epoch": 0.812, "grad_norm": 0.043821610510349274, "learning_rate": 3.7683741648106906e-05, "loss": 0.0101, "step": 3654 }, { "epoch": 0.8122222222222222, "grad_norm": 0.6952782273292542, "learning_rate": 3.7639198218262804e-05, "loss": 1.234, "step": 3655 }, { "epoch": 0.8124444444444444, "grad_norm": 0.6162470579147339, "learning_rate": 3.759465478841871e-05, "loss": 1.0527, "step": 3656 }, { "epoch": 0.8126666666666666, "grad_norm": 1.0366730690002441, "learning_rate": 3.7550111358574615e-05, "loss": 2.3824, "step": 3657 }, { "epoch": 0.8128888888888889, "grad_norm": 0.08013699948787689, "learning_rate": 3.7505567928730514e-05, "loss": 0.0114, "step": 3658 }, { "epoch": 0.8131111111111111, "grad_norm": 0.07211296260356903, "learning_rate": 3.746102449888642e-05, "loss": 0.0114, "step": 3659 }, { "epoch": 0.8133333333333334, "grad_norm": 0.07523675262928009, "learning_rate": 3.741648106904232e-05, "loss": 0.0114, "step": 3660 }, { "epoch": 0.8135555555555556, "grad_norm": 0.4986688494682312, "learning_rate": 3.7371937639198216e-05, "loss": 0.946, "step": 3661 }, { "epoch": 0.8137777777777778, "grad_norm": 0.9343963265419006, "learning_rate": 3.732739420935412e-05, "loss": 2.0871, "step": 3662 }, { "epoch": 0.814, "grad_norm": 0.881712794303894, "learning_rate": 3.728285077951003e-05, "loss": 2.0004, "step": 3663 }, { "epoch": 0.8142222222222222, "grad_norm": 1.054946780204773, "learning_rate": 3.7238307349665925e-05, "loss": 2.1722, "step": 3664 }, { "epoch": 0.8144444444444444, "grad_norm": 0.9730517268180847, "learning_rate": 3.719376391982183e-05, "loss": 1.7299, "step": 3665 }, { "epoch": 0.8146666666666667, "grad_norm": 0.8659468293190002, "learning_rate": 3.714922048997773e-05, "loss": 2.1389, "step": 3666 }, { "epoch": 0.8148888888888889, "grad_norm": 0.6216636896133423, "learning_rate": 3.710467706013363e-05, "loss": 1.0111, "step": 3667 }, { "epoch": 0.8151111111111111, "grad_norm": 0.1096658706665039, "learning_rate": 3.706013363028953e-05, "loss": 0.0178, "step": 3668 }, { "epoch": 0.8153333333333334, "grad_norm": 0.6512637138366699, "learning_rate": 3.701559020044544e-05, "loss": 1.0441, "step": 3669 }, { "epoch": 0.8155555555555556, "grad_norm": 1.0497286319732666, "learning_rate": 3.697104677060134e-05, "loss": 1.7955, "step": 3670 }, { "epoch": 0.8157777777777778, "grad_norm": 0.9653757214546204, "learning_rate": 3.692650334075724e-05, "loss": 1.8275, "step": 3671 }, { "epoch": 0.816, "grad_norm": 0.6577117443084717, "learning_rate": 3.688195991091314e-05, "loss": 0.9564, "step": 3672 }, { "epoch": 0.8162222222222222, "grad_norm": 0.07206307351589203, "learning_rate": 3.683741648106904e-05, "loss": 0.0158, "step": 3673 }, { "epoch": 0.8164444444444444, "grad_norm": 0.06910723447799683, "learning_rate": 3.6792873051224945e-05, "loss": 0.0159, "step": 3674 }, { "epoch": 0.8166666666666667, "grad_norm": 0.9517031908035278, "learning_rate": 3.674832962138085e-05, "loss": 1.9874, "step": 3675 }, { "epoch": 0.8168888888888889, "grad_norm": 0.1267796903848648, "learning_rate": 3.670378619153675e-05, "loss": 0.0203, "step": 3676 }, { "epoch": 0.8171111111111111, "grad_norm": 0.6326009631156921, "learning_rate": 3.6659242761692654e-05, "loss": 0.8404, "step": 3677 }, { "epoch": 0.8173333333333334, "grad_norm": 0.950645387172699, "learning_rate": 3.661469933184856e-05, "loss": 1.7112, "step": 3678 }, { "epoch": 0.8175555555555556, "grad_norm": 1.134836196899414, "learning_rate": 3.657015590200445e-05, "loss": 1.8671, "step": 3679 }, { "epoch": 0.8177777777777778, "grad_norm": 0.9950535893440247, "learning_rate": 3.652561247216036e-05, "loss": 1.9121, "step": 3680 }, { "epoch": 0.818, "grad_norm": 0.06296125799417496, "learning_rate": 3.648106904231626e-05, "loss": 0.0175, "step": 3681 }, { "epoch": 0.8182222222222222, "grad_norm": 0.9528807401657104, "learning_rate": 3.643652561247216e-05, "loss": 1.7288, "step": 3682 }, { "epoch": 0.8184444444444444, "grad_norm": 1.0748895406723022, "learning_rate": 3.6391982182628066e-05, "loss": 1.3809, "step": 3683 }, { "epoch": 0.8186666666666667, "grad_norm": 0.07369447499513626, "learning_rate": 3.634743875278397e-05, "loss": 0.0173, "step": 3684 }, { "epoch": 0.8188888888888889, "grad_norm": 0.07020772248506546, "learning_rate": 3.630289532293987e-05, "loss": 0.0172, "step": 3685 }, { "epoch": 0.8191111111111111, "grad_norm": 0.7225638628005981, "learning_rate": 3.625835189309577e-05, "loss": 0.9078, "step": 3686 }, { "epoch": 0.8193333333333334, "grad_norm": 1.205776333808899, "learning_rate": 3.621380846325167e-05, "loss": 2.0067, "step": 3687 }, { "epoch": 0.8195555555555556, "grad_norm": 1.0239263772964478, "learning_rate": 3.616926503340757e-05, "loss": 1.3173, "step": 3688 }, { "epoch": 0.8197777777777778, "grad_norm": 0.09923997521400452, "learning_rate": 3.612472160356348e-05, "loss": 0.0243, "step": 3689 }, { "epoch": 0.82, "grad_norm": 1.150796890258789, "learning_rate": 3.608017817371938e-05, "loss": 1.5561, "step": 3690 }, { "epoch": 0.8202222222222222, "grad_norm": 1.0021523237228394, "learning_rate": 3.603563474387528e-05, "loss": 1.6326, "step": 3691 }, { "epoch": 0.8204444444444444, "grad_norm": 0.9547725319862366, "learning_rate": 3.599109131403118e-05, "loss": 1.2198, "step": 3692 }, { "epoch": 0.8206666666666667, "grad_norm": 0.9803183674812317, "learning_rate": 3.594654788418708e-05, "loss": 1.4813, "step": 3693 }, { "epoch": 0.8208888888888889, "grad_norm": 1.140236258506775, "learning_rate": 3.5902004454342985e-05, "loss": 1.5703, "step": 3694 }, { "epoch": 0.8211111111111111, "grad_norm": 0.1800074428319931, "learning_rate": 3.585746102449889e-05, "loss": 0.0296, "step": 3695 }, { "epoch": 0.8213333333333334, "grad_norm": 0.744806170463562, "learning_rate": 3.581291759465479e-05, "loss": 0.7234, "step": 3696 }, { "epoch": 0.8215555555555556, "grad_norm": 1.0382987260818481, "learning_rate": 3.5768374164810694e-05, "loss": 1.0165, "step": 3697 }, { "epoch": 0.8217777777777778, "grad_norm": 1.01142156124115, "learning_rate": 3.572383073496659e-05, "loss": 1.2093, "step": 3698 }, { "epoch": 0.822, "grad_norm": 0.575733482837677, "learning_rate": 3.567928730512249e-05, "loss": 0.4409, "step": 3699 }, { "epoch": 0.8222222222222222, "grad_norm": 0.9521045684814453, "learning_rate": 3.5634743875278396e-05, "loss": 0.6805, "step": 3700 }, { "epoch": 0.8224444444444444, "grad_norm": 0.9057009220123291, "learning_rate": 3.55902004454343e-05, "loss": 2.3256, "step": 3701 }, { "epoch": 0.8226666666666667, "grad_norm": 0.8539500832557678, "learning_rate": 3.55456570155902e-05, "loss": 2.1458, "step": 3702 }, { "epoch": 0.8228888888888889, "grad_norm": 0.598690390586853, "learning_rate": 3.5501113585746106e-05, "loss": 1.164, "step": 3703 }, { "epoch": 0.8231111111111111, "grad_norm": 1.0174680948257446, "learning_rate": 3.545657015590201e-05, "loss": 2.2128, "step": 3704 }, { "epoch": 0.8233333333333334, "grad_norm": 0.5973488092422485, "learning_rate": 3.541202672605791e-05, "loss": 1.1449, "step": 3705 }, { "epoch": 0.8235555555555556, "grad_norm": 0.07344137132167816, "learning_rate": 3.536748329621381e-05, "loss": 0.0108, "step": 3706 }, { "epoch": 0.8237777777777778, "grad_norm": 0.08576471358537674, "learning_rate": 3.5322939866369714e-05, "loss": 0.0111, "step": 3707 }, { "epoch": 0.824, "grad_norm": 0.0801275447010994, "learning_rate": 3.527839643652561e-05, "loss": 0.0111, "step": 3708 }, { "epoch": 0.8242222222222222, "grad_norm": 0.08194973319768906, "learning_rate": 3.523385300668152e-05, "loss": 0.0109, "step": 3709 }, { "epoch": 0.8244444444444444, "grad_norm": 1.0708433389663696, "learning_rate": 3.518930957683742e-05, "loss": 2.033, "step": 3710 }, { "epoch": 0.8246666666666667, "grad_norm": 1.263612985610962, "learning_rate": 3.514476614699332e-05, "loss": 2.6208, "step": 3711 }, { "epoch": 0.8248888888888889, "grad_norm": 0.9804373383522034, "learning_rate": 3.510022271714922e-05, "loss": 1.8539, "step": 3712 }, { "epoch": 0.8251111111111111, "grad_norm": 0.8642978668212891, "learning_rate": 3.5055679287305125e-05, "loss": 1.7259, "step": 3713 }, { "epoch": 0.8253333333333334, "grad_norm": 0.9090456962585449, "learning_rate": 3.5011135857461024e-05, "loss": 1.8745, "step": 3714 }, { "epoch": 0.8255555555555556, "grad_norm": 0.9151667356491089, "learning_rate": 3.496659242761693e-05, "loss": 1.9903, "step": 3715 }, { "epoch": 0.8257777777777778, "grad_norm": 0.6366732120513916, "learning_rate": 3.4922048997772835e-05, "loss": 0.9141, "step": 3716 }, { "epoch": 0.826, "grad_norm": 0.7700564861297607, "learning_rate": 3.487750556792873e-05, "loss": 0.9737, "step": 3717 }, { "epoch": 0.8262222222222222, "grad_norm": 0.9940738081932068, "learning_rate": 3.483296213808463e-05, "loss": 2.0155, "step": 3718 }, { "epoch": 0.8264444444444444, "grad_norm": 0.8816027641296387, "learning_rate": 3.478841870824054e-05, "loss": 1.7297, "step": 3719 }, { "epoch": 0.8266666666666667, "grad_norm": 1.3799381256103516, "learning_rate": 3.4743875278396436e-05, "loss": 2.159, "step": 3720 }, { "epoch": 0.8268888888888889, "grad_norm": 1.0440785884857178, "learning_rate": 3.469933184855234e-05, "loss": 1.7563, "step": 3721 }, { "epoch": 0.8271111111111111, "grad_norm": 0.6796101331710815, "learning_rate": 3.465478841870824e-05, "loss": 0.9447, "step": 3722 }, { "epoch": 0.8273333333333334, "grad_norm": 0.06959293782711029, "learning_rate": 3.4610244988864145e-05, "loss": 0.0163, "step": 3723 }, { "epoch": 0.8275555555555556, "grad_norm": 0.06835558265447617, "learning_rate": 3.456570155902005e-05, "loss": 0.0164, "step": 3724 }, { "epoch": 0.8277777777777777, "grad_norm": 1.080851435661316, "learning_rate": 3.452115812917594e-05, "loss": 1.9956, "step": 3725 }, { "epoch": 0.828, "grad_norm": 0.6416263580322266, "learning_rate": 3.447661469933185e-05, "loss": 0.6763, "step": 3726 }, { "epoch": 0.8282222222222222, "grad_norm": 1.0977882146835327, "learning_rate": 3.443207126948775e-05, "loss": 1.7526, "step": 3727 }, { "epoch": 0.8284444444444444, "grad_norm": 0.8835853338241577, "learning_rate": 3.438752783964365e-05, "loss": 1.7257, "step": 3728 }, { "epoch": 0.8286666666666667, "grad_norm": 1.0617055892944336, "learning_rate": 3.434298440979956e-05, "loss": 1.9876, "step": 3729 }, { "epoch": 0.8288888888888889, "grad_norm": 1.1054614782333374, "learning_rate": 3.429844097995546e-05, "loss": 1.4692, "step": 3730 }, { "epoch": 0.8291111111111111, "grad_norm": 0.06595687568187714, "learning_rate": 3.425389755011136e-05, "loss": 0.0171, "step": 3731 }, { "epoch": 0.8293333333333334, "grad_norm": 0.06783387809991837, "learning_rate": 3.420935412026726e-05, "loss": 0.0172, "step": 3732 }, { "epoch": 0.8295555555555556, "grad_norm": 0.7828741073608398, "learning_rate": 3.4164810690423165e-05, "loss": 0.8107, "step": 3733 }, { "epoch": 0.8297777777777777, "grad_norm": 0.07702479511499405, "learning_rate": 3.4120267260579063e-05, "loss": 0.0177, "step": 3734 }, { "epoch": 0.83, "grad_norm": 0.6449767351150513, "learning_rate": 3.407572383073497e-05, "loss": 0.7656, "step": 3735 }, { "epoch": 0.8302222222222222, "grad_norm": 1.3564252853393555, "learning_rate": 3.4031180400890874e-05, "loss": 1.9644, "step": 3736 }, { "epoch": 0.8304444444444444, "grad_norm": 0.7747462391853333, "learning_rate": 3.398663697104677e-05, "loss": 1.0032, "step": 3737 }, { "epoch": 0.8306666666666667, "grad_norm": 1.13922119140625, "learning_rate": 3.394209354120267e-05, "loss": 1.545, "step": 3738 }, { "epoch": 0.8308888888888889, "grad_norm": 0.9685484766960144, "learning_rate": 3.389755011135858e-05, "loss": 1.3771, "step": 3739 }, { "epoch": 0.8311111111111111, "grad_norm": 0.7709338665008545, "learning_rate": 3.3853006681514475e-05, "loss": 0.696, "step": 3740 }, { "epoch": 0.8313333333333334, "grad_norm": 1.0451520681381226, "learning_rate": 3.380846325167038e-05, "loss": 1.564, "step": 3741 }, { "epoch": 0.8315555555555556, "grad_norm": 1.1514785289764404, "learning_rate": 3.3763919821826286e-05, "loss": 1.4297, "step": 3742 }, { "epoch": 0.8317777777777777, "grad_norm": 1.3433165550231934, "learning_rate": 3.3719376391982185e-05, "loss": 1.7462, "step": 3743 }, { "epoch": 0.832, "grad_norm": 0.9320109486579895, "learning_rate": 3.367483296213808e-05, "loss": 0.95, "step": 3744 }, { "epoch": 0.8322222222222222, "grad_norm": 0.9835542440414429, "learning_rate": 3.363028953229399e-05, "loss": 1.1297, "step": 3745 }, { "epoch": 0.8324444444444444, "grad_norm": 0.18634167313575745, "learning_rate": 3.358574610244989e-05, "loss": 0.0346, "step": 3746 }, { "epoch": 0.8326666666666667, "grad_norm": 0.1696268618106842, "learning_rate": 3.354120267260579e-05, "loss": 0.034, "step": 3747 }, { "epoch": 0.8328888888888889, "grad_norm": 1.043257236480713, "learning_rate": 3.34966592427617e-05, "loss": 0.9229, "step": 3748 }, { "epoch": 0.8331111111111111, "grad_norm": 1.21977698802948, "learning_rate": 3.3452115812917596e-05, "loss": 1.0685, "step": 3749 }, { "epoch": 0.8333333333333334, "grad_norm": 1.1700232028961182, "learning_rate": 3.34075723830735e-05, "loss": 0.7485, "step": 3750 }, { "epoch": 0.8335555555555556, "grad_norm": 0.4857214093208313, "learning_rate": 3.33630289532294e-05, "loss": 0.9854, "step": 3751 }, { "epoch": 0.8337777777777777, "grad_norm": 0.8265995383262634, "learning_rate": 3.33184855233853e-05, "loss": 1.9879, "step": 3752 }, { "epoch": 0.834, "grad_norm": 0.48844748735427856, "learning_rate": 3.3273942093541204e-05, "loss": 1.0029, "step": 3753 }, { "epoch": 0.8342222222222222, "grad_norm": 0.6757328510284424, "learning_rate": 3.322939866369711e-05, "loss": 1.0597, "step": 3754 }, { "epoch": 0.8344444444444444, "grad_norm": 1.0443055629730225, "learning_rate": 3.318485523385301e-05, "loss": 2.3737, "step": 3755 }, { "epoch": 0.8346666666666667, "grad_norm": 0.8254187703132629, "learning_rate": 3.3140311804008914e-05, "loss": 1.9924, "step": 3756 }, { "epoch": 0.8348888888888889, "grad_norm": 0.09497911483049393, "learning_rate": 3.309576837416481e-05, "loss": 0.0113, "step": 3757 }, { "epoch": 0.8351111111111111, "grad_norm": 0.09401866048574448, "learning_rate": 3.305122494432071e-05, "loss": 0.0116, "step": 3758 }, { "epoch": 0.8353333333333334, "grad_norm": 0.09458266943693161, "learning_rate": 3.3006681514476616e-05, "loss": 0.0116, "step": 3759 }, { "epoch": 0.8355555555555556, "grad_norm": 0.09231861680746078, "learning_rate": 3.2962138084632515e-05, "loss": 0.0114, "step": 3760 }, { "epoch": 0.8357777777777777, "grad_norm": 1.012135624885559, "learning_rate": 3.291759465478842e-05, "loss": 2.2142, "step": 3761 }, { "epoch": 0.836, "grad_norm": 0.8345160484313965, "learning_rate": 3.2873051224944325e-05, "loss": 2.0887, "step": 3762 }, { "epoch": 0.8362222222222222, "grad_norm": 0.777621865272522, "learning_rate": 3.2828507795100224e-05, "loss": 1.7456, "step": 3763 }, { "epoch": 0.8364444444444444, "grad_norm": 0.9471651911735535, "learning_rate": 3.278396436525612e-05, "loss": 1.9976, "step": 3764 }, { "epoch": 0.8366666666666667, "grad_norm": 0.9481960535049438, "learning_rate": 3.273942093541203e-05, "loss": 2.1314, "step": 3765 }, { "epoch": 0.8368888888888889, "grad_norm": 1.0837010145187378, "learning_rate": 3.2694877505567926e-05, "loss": 2.0858, "step": 3766 }, { "epoch": 0.8371111111111111, "grad_norm": 1.0105607509613037, "learning_rate": 3.265033407572383e-05, "loss": 2.1732, "step": 3767 }, { "epoch": 0.8373333333333334, "grad_norm": 0.5977281928062439, "learning_rate": 3.260579064587974e-05, "loss": 0.8871, "step": 3768 }, { "epoch": 0.8375555555555556, "grad_norm": 1.2047114372253418, "learning_rate": 3.2561247216035636e-05, "loss": 2.0168, "step": 3769 }, { "epoch": 0.8377777777777777, "grad_norm": 0.8766410946846008, "learning_rate": 3.251670378619154e-05, "loss": 1.6475, "step": 3770 }, { "epoch": 0.838, "grad_norm": 0.9371228218078613, "learning_rate": 3.247216035634744e-05, "loss": 1.8688, "step": 3771 }, { "epoch": 0.8382222222222222, "grad_norm": 1.0611170530319214, "learning_rate": 3.242761692650334e-05, "loss": 1.8825, "step": 3772 }, { "epoch": 0.8384444444444444, "grad_norm": 1.0075304508209229, "learning_rate": 3.2383073496659244e-05, "loss": 2.1026, "step": 3773 }, { "epoch": 0.8386666666666667, "grad_norm": 0.06949839740991592, "learning_rate": 3.233853006681515e-05, "loss": 0.0162, "step": 3774 }, { "epoch": 0.8388888888888889, "grad_norm": 0.06908978521823883, "learning_rate": 3.229398663697105e-05, "loss": 0.0159, "step": 3775 }, { "epoch": 0.8391111111111111, "grad_norm": 0.905208945274353, "learning_rate": 3.224944320712695e-05, "loss": 1.7153, "step": 3776 }, { "epoch": 0.8393333333333334, "grad_norm": 0.8136224150657654, "learning_rate": 3.220489977728285e-05, "loss": 0.903, "step": 3777 }, { "epoch": 0.8395555555555556, "grad_norm": 0.9069592356681824, "learning_rate": 3.216035634743875e-05, "loss": 1.6758, "step": 3778 }, { "epoch": 0.8397777777777777, "grad_norm": 1.0851026773452759, "learning_rate": 3.2115812917594655e-05, "loss": 1.8553, "step": 3779 }, { "epoch": 0.84, "grad_norm": 0.998877763748169, "learning_rate": 3.207126948775056e-05, "loss": 1.8056, "step": 3780 }, { "epoch": 0.8402222222222222, "grad_norm": 0.0660950317978859, "learning_rate": 3.202672605790646e-05, "loss": 0.0175, "step": 3781 }, { "epoch": 0.8404444444444444, "grad_norm": 0.0653744786977768, "learning_rate": 3.1982182628062365e-05, "loss": 0.0173, "step": 3782 }, { "epoch": 0.8406666666666667, "grad_norm": 0.06411214917898178, "learning_rate": 3.193763919821826e-05, "loss": 0.0175, "step": 3783 }, { "epoch": 0.8408888888888889, "grad_norm": 0.06954985857009888, "learning_rate": 3.189309576837416e-05, "loss": 0.0174, "step": 3784 }, { "epoch": 0.8411111111111111, "grad_norm": 0.7279578447341919, "learning_rate": 3.184855233853007e-05, "loss": 1.0075, "step": 3785 }, { "epoch": 0.8413333333333334, "grad_norm": 1.0854923725128174, "learning_rate": 3.180400890868597e-05, "loss": 1.8502, "step": 3786 }, { "epoch": 0.8415555555555555, "grad_norm": 1.1248599290847778, "learning_rate": 3.175946547884187e-05, "loss": 1.769, "step": 3787 }, { "epoch": 0.8417777777777777, "grad_norm": 0.6535754203796387, "learning_rate": 3.1714922048997777e-05, "loss": 0.922, "step": 3788 }, { "epoch": 0.842, "grad_norm": 1.0051473379135132, "learning_rate": 3.167037861915368e-05, "loss": 1.6551, "step": 3789 }, { "epoch": 0.8422222222222222, "grad_norm": 1.0957441329956055, "learning_rate": 3.162583518930958e-05, "loss": 1.7058, "step": 3790 }, { "epoch": 0.8424444444444444, "grad_norm": 1.0055428743362427, "learning_rate": 3.158129175946548e-05, "loss": 1.5342, "step": 3791 }, { "epoch": 0.8426666666666667, "grad_norm": 0.8064576387405396, "learning_rate": 3.1536748329621384e-05, "loss": 0.8161, "step": 3792 }, { "epoch": 0.8428888888888889, "grad_norm": 1.1807235479354858, "learning_rate": 3.149220489977728e-05, "loss": 1.8853, "step": 3793 }, { "epoch": 0.8431111111111111, "grad_norm": 1.0103986263275146, "learning_rate": 3.144766146993319e-05, "loss": 1.3899, "step": 3794 }, { "epoch": 0.8433333333333334, "grad_norm": 0.2762221693992615, "learning_rate": 3.140311804008909e-05, "loss": 0.0367, "step": 3795 }, { "epoch": 0.8435555555555555, "grad_norm": 0.7262986302375793, "learning_rate": 3.135857461024499e-05, "loss": 0.7566, "step": 3796 }, { "epoch": 0.8437777777777777, "grad_norm": 1.0480473041534424, "learning_rate": 3.131403118040089e-05, "loss": 1.6141, "step": 3797 }, { "epoch": 0.844, "grad_norm": 1.0954132080078125, "learning_rate": 3.126948775055679e-05, "loss": 1.269, "step": 3798 }, { "epoch": 0.8442222222222222, "grad_norm": 0.7563920021057129, "learning_rate": 3.1224944320712695e-05, "loss": 0.6253, "step": 3799 }, { "epoch": 0.8444444444444444, "grad_norm": 0.7246300578117371, "learning_rate": 3.11804008908686e-05, "loss": 0.3778, "step": 3800 }, { "epoch": 0.8446666666666667, "grad_norm": 0.9608231782913208, "learning_rate": 3.11358574610245e-05, "loss": 2.1959, "step": 3801 }, { "epoch": 0.8448888888888889, "grad_norm": 0.04354199394583702, "learning_rate": 3.1091314031180404e-05, "loss": 0.0106, "step": 3802 }, { "epoch": 0.8451111111111111, "grad_norm": 0.8725544810295105, "learning_rate": 3.10467706013363e-05, "loss": 2.4019, "step": 3803 }, { "epoch": 0.8453333333333334, "grad_norm": 0.04339034482836723, "learning_rate": 3.10022271714922e-05, "loss": 0.0107, "step": 3804 }, { "epoch": 0.8455555555555555, "grad_norm": 0.8996299505233765, "learning_rate": 3.095768374164811e-05, "loss": 1.7978, "step": 3805 }, { "epoch": 0.8457777777777777, "grad_norm": 0.08517049998044968, "learning_rate": 3.091314031180401e-05, "loss": 0.0108, "step": 3806 }, { "epoch": 0.846, "grad_norm": 0.08175533264875412, "learning_rate": 3.086859688195991e-05, "loss": 0.0109, "step": 3807 }, { "epoch": 0.8462222222222222, "grad_norm": 0.06705193221569061, "learning_rate": 3.0824053452115816e-05, "loss": 0.0103, "step": 3808 }, { "epoch": 0.8464444444444444, "grad_norm": 0.08038879185914993, "learning_rate": 3.077951002227172e-05, "loss": 0.0106, "step": 3809 }, { "epoch": 0.8466666666666667, "grad_norm": 0.617675244808197, "learning_rate": 3.073496659242761e-05, "loss": 1.0167, "step": 3810 }, { "epoch": 0.8468888888888889, "grad_norm": 0.8487913012504578, "learning_rate": 3.069042316258352e-05, "loss": 1.8088, "step": 3811 }, { "epoch": 0.8471111111111111, "grad_norm": 0.8923436403274536, "learning_rate": 3.0645879732739424e-05, "loss": 1.8555, "step": 3812 }, { "epoch": 0.8473333333333334, "grad_norm": 0.9946725368499756, "learning_rate": 3.060133630289532e-05, "loss": 2.1355, "step": 3813 }, { "epoch": 0.8475555555555555, "grad_norm": 0.8608193397521973, "learning_rate": 3.055679287305123e-05, "loss": 1.8501, "step": 3814 }, { "epoch": 0.8477777777777777, "grad_norm": 1.0533936023712158, "learning_rate": 3.051224944320713e-05, "loss": 2.3759, "step": 3815 }, { "epoch": 0.848, "grad_norm": 0.9395473003387451, "learning_rate": 3.046770601336303e-05, "loss": 1.8055, "step": 3816 }, { "epoch": 0.8482222222222222, "grad_norm": 0.868739902973175, "learning_rate": 3.0423162583518934e-05, "loss": 1.9014, "step": 3817 }, { "epoch": 0.8484444444444444, "grad_norm": 0.9286447167396545, "learning_rate": 3.0378619153674836e-05, "loss": 2.1256, "step": 3818 }, { "epoch": 0.8486666666666667, "grad_norm": 0.9453836679458618, "learning_rate": 3.0334075723830734e-05, "loss": 1.9572, "step": 3819 }, { "epoch": 0.8488888888888889, "grad_norm": 1.1995909214019775, "learning_rate": 3.028953229398664e-05, "loss": 1.7376, "step": 3820 }, { "epoch": 0.8491111111111111, "grad_norm": 1.2044036388397217, "learning_rate": 3.024498886414254e-05, "loss": 2.1343, "step": 3821 }, { "epoch": 0.8493333333333334, "grad_norm": 0.8160643577575684, "learning_rate": 3.020044543429844e-05, "loss": 0.9794, "step": 3822 }, { "epoch": 0.8495555555555555, "grad_norm": 0.06719803065061569, "learning_rate": 3.0155902004454346e-05, "loss": 0.0154, "step": 3823 }, { "epoch": 0.8497777777777777, "grad_norm": 0.06740820407867432, "learning_rate": 3.0111358574610248e-05, "loss": 0.0154, "step": 3824 }, { "epoch": 0.85, "grad_norm": 1.0504337549209595, "learning_rate": 3.0066815144766146e-05, "loss": 1.8658, "step": 3825 }, { "epoch": 0.8502222222222222, "grad_norm": 0.7439045906066895, "learning_rate": 3.002227171492205e-05, "loss": 1.1039, "step": 3826 }, { "epoch": 0.8504444444444444, "grad_norm": 0.1253952980041504, "learning_rate": 2.9977728285077953e-05, "loss": 0.0206, "step": 3827 }, { "epoch": 0.8506666666666667, "grad_norm": 1.0089833736419678, "learning_rate": 2.9933184855233852e-05, "loss": 2.0178, "step": 3828 }, { "epoch": 0.8508888888888889, "grad_norm": 1.0395070314407349, "learning_rate": 2.9888641425389757e-05, "loss": 1.9692, "step": 3829 }, { "epoch": 0.8511111111111112, "grad_norm": 1.0526185035705566, "learning_rate": 2.9844097995545663e-05, "loss": 1.8065, "step": 3830 }, { "epoch": 0.8513333333333334, "grad_norm": 1.0034129619598389, "learning_rate": 2.9799554565701558e-05, "loss": 1.679, "step": 3831 }, { "epoch": 0.8515555555555555, "grad_norm": 0.06555074453353882, "learning_rate": 2.9755011135857463e-05, "loss": 0.017, "step": 3832 }, { "epoch": 0.8517777777777777, "grad_norm": 0.8089559078216553, "learning_rate": 2.9710467706013362e-05, "loss": 0.9377, "step": 3833 }, { "epoch": 0.852, "grad_norm": 0.7607543468475342, "learning_rate": 2.9665924276169267e-05, "loss": 0.873, "step": 3834 }, { "epoch": 0.8522222222222222, "grad_norm": 0.08201993256807327, "learning_rate": 2.962138084632517e-05, "loss": 0.0183, "step": 3835 }, { "epoch": 0.8524444444444444, "grad_norm": 0.6691009402275085, "learning_rate": 2.9576837416481068e-05, "loss": 0.6488, "step": 3836 }, { "epoch": 0.8526666666666667, "grad_norm": 1.0818275213241577, "learning_rate": 2.9532293986636973e-05, "loss": 1.6231, "step": 3837 }, { "epoch": 0.8528888888888889, "grad_norm": 0.10234081745147705, "learning_rate": 2.9487750556792875e-05, "loss": 0.0249, "step": 3838 }, { "epoch": 0.8531111111111112, "grad_norm": 0.7301368117332458, "learning_rate": 2.9443207126948774e-05, "loss": 0.7285, "step": 3839 }, { "epoch": 0.8533333333333334, "grad_norm": 1.020973563194275, "learning_rate": 2.939866369710468e-05, "loss": 1.5453, "step": 3840 }, { "epoch": 0.8535555555555555, "grad_norm": 1.161118745803833, "learning_rate": 2.935412026726058e-05, "loss": 1.5789, "step": 3841 }, { "epoch": 0.8537777777777777, "grad_norm": 1.1855006217956543, "learning_rate": 2.930957683741648e-05, "loss": 1.443, "step": 3842 }, { "epoch": 0.854, "grad_norm": 1.0485907793045044, "learning_rate": 2.9265033407572385e-05, "loss": 1.2626, "step": 3843 }, { "epoch": 0.8542222222222222, "grad_norm": 1.0825096368789673, "learning_rate": 2.9220489977728287e-05, "loss": 1.4062, "step": 3844 }, { "epoch": 0.8544444444444445, "grad_norm": 0.16687047481536865, "learning_rate": 2.9175946547884186e-05, "loss": 0.0262, "step": 3845 }, { "epoch": 0.8546666666666667, "grad_norm": 1.0403611660003662, "learning_rate": 2.913140311804009e-05, "loss": 1.4555, "step": 3846 }, { "epoch": 0.8548888888888889, "grad_norm": 1.069176197052002, "learning_rate": 2.9086859688195993e-05, "loss": 1.2518, "step": 3847 }, { "epoch": 0.8551111111111112, "grad_norm": 1.2168667316436768, "learning_rate": 2.904231625835189e-05, "loss": 1.289, "step": 3848 }, { "epoch": 0.8553333333333333, "grad_norm": 0.5977094769477844, "learning_rate": 2.8997772828507797e-05, "loss": 0.4731, "step": 3849 }, { "epoch": 0.8555555555555555, "grad_norm": 0.543451189994812, "learning_rate": 2.89532293986637e-05, "loss": 0.3367, "step": 3850 }, { "epoch": 0.8557777777777777, "grad_norm": 0.044504791498184204, "learning_rate": 2.8908685968819597e-05, "loss": 0.0103, "step": 3851 }, { "epoch": 0.856, "grad_norm": 0.8173375725746155, "learning_rate": 2.8864142538975503e-05, "loss": 2.0658, "step": 3852 }, { "epoch": 0.8562222222222222, "grad_norm": 0.6008175015449524, "learning_rate": 2.8819599109131408e-05, "loss": 1.0048, "step": 3853 }, { "epoch": 0.8564444444444445, "grad_norm": 0.6246810555458069, "learning_rate": 2.8775055679287303e-05, "loss": 1.0661, "step": 3854 }, { "epoch": 0.8566666666666667, "grad_norm": 0.9632955193519592, "learning_rate": 2.873051224944321e-05, "loss": 2.273, "step": 3855 }, { "epoch": 0.8568888888888889, "grad_norm": 0.8222072720527649, "learning_rate": 2.8685968819599114e-05, "loss": 2.0065, "step": 3856 }, { "epoch": 0.8571111111111112, "grad_norm": 0.6057097911834717, "learning_rate": 2.8641425389755013e-05, "loss": 1.1629, "step": 3857 }, { "epoch": 0.8573333333333333, "grad_norm": 0.06852009892463684, "learning_rate": 2.8596881959910915e-05, "loss": 0.0104, "step": 3858 }, { "epoch": 0.8575555555555555, "grad_norm": 0.06707873195409775, "learning_rate": 2.855233853006682e-05, "loss": 0.0103, "step": 3859 }, { "epoch": 0.8577777777777778, "grad_norm": 0.0674692690372467, "learning_rate": 2.850779510022272e-05, "loss": 0.0099, "step": 3860 }, { "epoch": 0.858, "grad_norm": 0.5297547578811646, "learning_rate": 2.846325167037862e-05, "loss": 1.1045, "step": 3861 }, { "epoch": 0.8582222222222222, "grad_norm": 0.9173485040664673, "learning_rate": 2.8418708240534526e-05, "loss": 1.9225, "step": 3862 }, { "epoch": 0.8584444444444445, "grad_norm": 0.9960424900054932, "learning_rate": 2.8374164810690424e-05, "loss": 2.0529, "step": 3863 }, { "epoch": 0.8586666666666667, "grad_norm": 0.6144242286682129, "learning_rate": 2.8329621380846326e-05, "loss": 1.0756, "step": 3864 }, { "epoch": 0.8588888888888889, "grad_norm": 0.9492395520210266, "learning_rate": 2.8285077951002232e-05, "loss": 1.5811, "step": 3865 }, { "epoch": 0.8591111111111112, "grad_norm": 1.0924068689346313, "learning_rate": 2.824053452115813e-05, "loss": 2.1395, "step": 3866 }, { "epoch": 0.8593333333333333, "grad_norm": 1.214991569519043, "learning_rate": 2.8195991091314032e-05, "loss": 2.3004, "step": 3867 }, { "epoch": 0.8595555555555555, "grad_norm": 0.9233739972114563, "learning_rate": 2.815144766146993e-05, "loss": 1.9741, "step": 3868 }, { "epoch": 0.8597777777777778, "grad_norm": 0.9544225931167603, "learning_rate": 2.8106904231625836e-05, "loss": 2.0073, "step": 3869 }, { "epoch": 0.86, "grad_norm": 1.0409972667694092, "learning_rate": 2.8062360801781738e-05, "loss": 2.0343, "step": 3870 }, { "epoch": 0.8602222222222222, "grad_norm": 1.0500308275222778, "learning_rate": 2.8017817371937637e-05, "loss": 1.8736, "step": 3871 }, { "epoch": 0.8604444444444445, "grad_norm": 1.0792829990386963, "learning_rate": 2.7973273942093542e-05, "loss": 1.9845, "step": 3872 }, { "epoch": 0.8606666666666667, "grad_norm": 1.1363780498504639, "learning_rate": 2.7928730512249447e-05, "loss": 1.8806, "step": 3873 }, { "epoch": 0.8608888888888889, "grad_norm": 0.6764510869979858, "learning_rate": 2.7884187082405343e-05, "loss": 0.9852, "step": 3874 }, { "epoch": 0.8611111111111112, "grad_norm": 0.06674336642026901, "learning_rate": 2.7839643652561248e-05, "loss": 0.0153, "step": 3875 }, { "epoch": 0.8613333333333333, "grad_norm": 0.06754778325557709, "learning_rate": 2.7795100222717153e-05, "loss": 0.0156, "step": 3876 }, { "epoch": 0.8615555555555555, "grad_norm": 0.6871387362480164, "learning_rate": 2.7750556792873052e-05, "loss": 0.8689, "step": 3877 }, { "epoch": 0.8617777777777778, "grad_norm": 1.1654753684997559, "learning_rate": 2.7706013363028954e-05, "loss": 1.9632, "step": 3878 }, { "epoch": 0.862, "grad_norm": 0.13107286393642426, "learning_rate": 2.766146993318486e-05, "loss": 0.0207, "step": 3879 }, { "epoch": 0.8622222222222222, "grad_norm": 1.0098730325698853, "learning_rate": 2.7616926503340758e-05, "loss": 1.7388, "step": 3880 }, { "epoch": 0.8624444444444445, "grad_norm": 0.9629087448120117, "learning_rate": 2.757238307349666e-05, "loss": 1.7969, "step": 3881 }, { "epoch": 0.8626666666666667, "grad_norm": 0.9217532873153687, "learning_rate": 2.7527839643652565e-05, "loss": 1.9322, "step": 3882 }, { "epoch": 0.8628888888888889, "grad_norm": 1.0283830165863037, "learning_rate": 2.7483296213808464e-05, "loss": 1.7681, "step": 3883 }, { "epoch": 0.8631111111111112, "grad_norm": 0.701818585395813, "learning_rate": 2.7438752783964366e-05, "loss": 0.8642, "step": 3884 }, { "epoch": 0.8633333333333333, "grad_norm": 0.7634962201118469, "learning_rate": 2.739420935412027e-05, "loss": 0.7195, "step": 3885 }, { "epoch": 0.8635555555555555, "grad_norm": 0.9613010287284851, "learning_rate": 2.734966592427617e-05, "loss": 1.7299, "step": 3886 }, { "epoch": 0.8637777777777778, "grad_norm": 0.8127443790435791, "learning_rate": 2.730512249443207e-05, "loss": 0.9909, "step": 3887 }, { "epoch": 0.864, "grad_norm": 0.7633342146873474, "learning_rate": 2.7260579064587977e-05, "loss": 0.7784, "step": 3888 }, { "epoch": 0.8642222222222222, "grad_norm": 0.8209825754165649, "learning_rate": 2.7216035634743876e-05, "loss": 0.8605, "step": 3889 }, { "epoch": 0.8644444444444445, "grad_norm": 1.1006879806518555, "learning_rate": 2.7171492204899778e-05, "loss": 1.4274, "step": 3890 }, { "epoch": 0.8646666666666667, "grad_norm": 0.9458972811698914, "learning_rate": 2.7126948775055683e-05, "loss": 1.4813, "step": 3891 }, { "epoch": 0.8648888888888889, "grad_norm": 1.3641767501831055, "learning_rate": 2.708240534521158e-05, "loss": 1.5427, "step": 3892 }, { "epoch": 0.8651111111111112, "grad_norm": 1.0138379335403442, "learning_rate": 2.7037861915367484e-05, "loss": 1.4602, "step": 3893 }, { "epoch": 0.8653333333333333, "grad_norm": 1.0641552209854126, "learning_rate": 2.699331848552339e-05, "loss": 1.2653, "step": 3894 }, { "epoch": 0.8655555555555555, "grad_norm": 1.2748581171035767, "learning_rate": 2.6948775055679287e-05, "loss": 1.3019, "step": 3895 }, { "epoch": 0.8657777777777778, "grad_norm": 1.1393803358078003, "learning_rate": 2.6904231625835193e-05, "loss": 1.331, "step": 3896 }, { "epoch": 0.866, "grad_norm": 1.0819334983825684, "learning_rate": 2.6859688195991095e-05, "loss": 1.1292, "step": 3897 }, { "epoch": 0.8662222222222222, "grad_norm": 0.14373019337654114, "learning_rate": 2.6815144766146993e-05, "loss": 0.0321, "step": 3898 }, { "epoch": 0.8664444444444445, "grad_norm": 0.14658400416374207, "learning_rate": 2.67706013363029e-05, "loss": 0.032, "step": 3899 }, { "epoch": 0.8666666666666667, "grad_norm": 1.1421598196029663, "learning_rate": 2.67260579064588e-05, "loss": 1.0447, "step": 3900 }, { "epoch": 0.8668888888888889, "grad_norm": 0.6876357793807983, "learning_rate": 2.66815144766147e-05, "loss": 1.2227, "step": 3901 }, { "epoch": 0.8671111111111112, "grad_norm": 0.0448576956987381, "learning_rate": 2.6636971046770605e-05, "loss": 0.0104, "step": 3902 }, { "epoch": 0.8673333333333333, "grad_norm": 0.6660778522491455, "learning_rate": 2.6592427616926503e-05, "loss": 1.121, "step": 3903 }, { "epoch": 0.8675555555555555, "grad_norm": 0.043087027966976166, "learning_rate": 2.6547884187082405e-05, "loss": 0.0103, "step": 3904 }, { "epoch": 0.8677777777777778, "grad_norm": 0.5372818112373352, "learning_rate": 2.650334075723831e-05, "loss": 1.0993, "step": 3905 }, { "epoch": 0.868, "grad_norm": 0.9083240628242493, "learning_rate": 2.645879732739421e-05, "loss": 2.459, "step": 3906 }, { "epoch": 0.8682222222222222, "grad_norm": 0.8523256182670593, "learning_rate": 2.641425389755011e-05, "loss": 2.1683, "step": 3907 }, { "epoch": 0.8684444444444445, "grad_norm": 0.6197808384895325, "learning_rate": 2.6369710467706016e-05, "loss": 1.0535, "step": 3908 }, { "epoch": 0.8686666666666667, "grad_norm": 0.8953803181648254, "learning_rate": 2.6325167037861915e-05, "loss": 1.9434, "step": 3909 }, { "epoch": 0.8688888888888889, "grad_norm": 0.9139788150787354, "learning_rate": 2.6280623608017817e-05, "loss": 1.8545, "step": 3910 }, { "epoch": 0.8691111111111111, "grad_norm": 0.8638214468955994, "learning_rate": 2.6236080178173722e-05, "loss": 1.8329, "step": 3911 }, { "epoch": 0.8693333333333333, "grad_norm": 0.8344167470932007, "learning_rate": 2.619153674832962e-05, "loss": 1.7991, "step": 3912 }, { "epoch": 0.8695555555555555, "grad_norm": 0.96803879737854, "learning_rate": 2.6146993318485523e-05, "loss": 1.9849, "step": 3913 }, { "epoch": 0.8697777777777778, "grad_norm": 1.0239784717559814, "learning_rate": 2.6102449888641428e-05, "loss": 1.9256, "step": 3914 }, { "epoch": 0.87, "grad_norm": 0.8905801177024841, "learning_rate": 2.6057906458797327e-05, "loss": 1.6746, "step": 3915 }, { "epoch": 0.8702222222222222, "grad_norm": 1.0133596658706665, "learning_rate": 2.601336302895323e-05, "loss": 2.1594, "step": 3916 }, { "epoch": 0.8704444444444445, "grad_norm": 0.06942284107208252, "learning_rate": 2.5968819599109134e-05, "loss": 0.015, "step": 3917 }, { "epoch": 0.8706666666666667, "grad_norm": 0.07573316246271133, "learning_rate": 2.5924276169265033e-05, "loss": 0.0148, "step": 3918 }, { "epoch": 0.8708888888888889, "grad_norm": 0.07226064801216125, "learning_rate": 2.5879732739420938e-05, "loss": 0.015, "step": 3919 }, { "epoch": 0.8711111111111111, "grad_norm": 1.0551682710647583, "learning_rate": 2.583518930957684e-05, "loss": 1.9451, "step": 3920 }, { "epoch": 0.8713333333333333, "grad_norm": 1.0881084203720093, "learning_rate": 2.579064587973274e-05, "loss": 1.9361, "step": 3921 }, { "epoch": 0.8715555555555555, "grad_norm": 1.029228925704956, "learning_rate": 2.5746102449888644e-05, "loss": 1.97, "step": 3922 }, { "epoch": 0.8717777777777778, "grad_norm": 0.9416628479957581, "learning_rate": 2.5701559020044546e-05, "loss": 1.4815, "step": 3923 }, { "epoch": 0.872, "grad_norm": 1.8949933052062988, "learning_rate": 2.5657015590200445e-05, "loss": 2.0094, "step": 3924 }, { "epoch": 0.8722222222222222, "grad_norm": 0.9487776756286621, "learning_rate": 2.561247216035635e-05, "loss": 1.8348, "step": 3925 }, { "epoch": 0.8724444444444445, "grad_norm": 0.865877091884613, "learning_rate": 2.5567928730512252e-05, "loss": 1.5909, "step": 3926 }, { "epoch": 0.8726666666666667, "grad_norm": 0.9927725195884705, "learning_rate": 2.552338530066815e-05, "loss": 1.9465, "step": 3927 }, { "epoch": 0.8728888888888889, "grad_norm": 0.9912342429161072, "learning_rate": 2.5478841870824056e-05, "loss": 1.3772, "step": 3928 }, { "epoch": 0.8731111111111111, "grad_norm": 0.9611807465553284, "learning_rate": 2.5434298440979958e-05, "loss": 1.5165, "step": 3929 }, { "epoch": 0.8733333333333333, "grad_norm": 0.9328694343566895, "learning_rate": 2.5389755011135856e-05, "loss": 1.3826, "step": 3930 }, { "epoch": 0.8735555555555555, "grad_norm": 0.9587991237640381, "learning_rate": 2.5345211581291762e-05, "loss": 1.6781, "step": 3931 }, { "epoch": 0.8737777777777778, "grad_norm": 0.06626418977975845, "learning_rate": 2.5300668151447664e-05, "loss": 0.0176, "step": 3932 }, { "epoch": 0.874, "grad_norm": 0.06570940464735031, "learning_rate": 2.5256124721603562e-05, "loss": 0.0179, "step": 3933 }, { "epoch": 0.8742222222222222, "grad_norm": 0.06534791737794876, "learning_rate": 2.5211581291759468e-05, "loss": 0.0175, "step": 3934 }, { "epoch": 0.8744444444444445, "grad_norm": 0.07688681036233902, "learning_rate": 2.516703786191537e-05, "loss": 0.0177, "step": 3935 }, { "epoch": 0.8746666666666667, "grad_norm": 0.060970455408096313, "learning_rate": 2.5122494432071268e-05, "loss": 0.0177, "step": 3936 }, { "epoch": 0.8748888888888889, "grad_norm": 1.0032833814620972, "learning_rate": 2.5077951002227174e-05, "loss": 1.7855, "step": 3937 }, { "epoch": 0.8751111111111111, "grad_norm": 0.9916431903839111, "learning_rate": 2.503340757238308e-05, "loss": 1.7102, "step": 3938 }, { "epoch": 0.8753333333333333, "grad_norm": 0.6338675618171692, "learning_rate": 2.4988864142538974e-05, "loss": 0.744, "step": 3939 }, { "epoch": 0.8755555555555555, "grad_norm": 0.732306957244873, "learning_rate": 2.494432071269488e-05, "loss": 0.7834, "step": 3940 }, { "epoch": 0.8757777777777778, "grad_norm": 0.9343276619911194, "learning_rate": 2.489977728285078e-05, "loss": 1.6322, "step": 3941 }, { "epoch": 0.876, "grad_norm": 1.0164755582809448, "learning_rate": 2.4855233853006683e-05, "loss": 1.561, "step": 3942 }, { "epoch": 0.8762222222222222, "grad_norm": 0.968427300453186, "learning_rate": 2.4810690423162585e-05, "loss": 1.2585, "step": 3943 }, { "epoch": 0.8764444444444445, "grad_norm": 0.10157324373722076, "learning_rate": 2.4766146993318487e-05, "loss": 0.025, "step": 3944 }, { "epoch": 0.8766666666666667, "grad_norm": 1.4210426807403564, "learning_rate": 2.472160356347439e-05, "loss": 1.3668, "step": 3945 }, { "epoch": 0.8768888888888889, "grad_norm": 1.2264833450317383, "learning_rate": 2.467706013363029e-05, "loss": 1.2792, "step": 3946 }, { "epoch": 0.8771111111111111, "grad_norm": 1.10524582862854, "learning_rate": 2.4632516703786193e-05, "loss": 1.3331, "step": 3947 }, { "epoch": 0.8773333333333333, "grad_norm": 0.8390571475028992, "learning_rate": 2.4587973273942095e-05, "loss": 0.6529, "step": 3948 }, { "epoch": 0.8775555555555555, "grad_norm": 1.1716080904006958, "learning_rate": 2.4543429844097994e-05, "loss": 1.2058, "step": 3949 }, { "epoch": 0.8777777777777778, "grad_norm": 1.0907095670700073, "learning_rate": 2.44988864142539e-05, "loss": 0.9754, "step": 3950 }, { "epoch": 0.878, "grad_norm": 0.5868102312088013, "learning_rate": 2.44543429844098e-05, "loss": 1.1827, "step": 3951 }, { "epoch": 0.8782222222222222, "grad_norm": 0.8409274220466614, "learning_rate": 2.4409799554565703e-05, "loss": 2.1622, "step": 3952 }, { "epoch": 0.8784444444444445, "grad_norm": 0.6952332258224487, "learning_rate": 2.4365256124721605e-05, "loss": 1.1901, "step": 3953 }, { "epoch": 0.8786666666666667, "grad_norm": 0.519538164138794, "learning_rate": 2.4320712694877507e-05, "loss": 1.2036, "step": 3954 }, { "epoch": 0.8788888888888889, "grad_norm": 0.6376737356185913, "learning_rate": 2.427616926503341e-05, "loss": 1.1078, "step": 3955 }, { "epoch": 0.8791111111111111, "grad_norm": 0.044897519052028656, "learning_rate": 2.423162583518931e-05, "loss": 0.0104, "step": 3956 }, { "epoch": 0.8793333333333333, "grad_norm": 0.9802849292755127, "learning_rate": 2.4187082405345213e-05, "loss": 1.8767, "step": 3957 }, { "epoch": 0.8795555555555555, "grad_norm": 0.08882291615009308, "learning_rate": 2.4142538975501115e-05, "loss": 0.0114, "step": 3958 }, { "epoch": 0.8797777777777778, "grad_norm": 0.9282602667808533, "learning_rate": 2.4097995545657017e-05, "loss": 1.8114, "step": 3959 }, { "epoch": 0.88, "grad_norm": 0.9374412894248962, "learning_rate": 2.405345211581292e-05, "loss": 1.9425, "step": 3960 }, { "epoch": 0.8802222222222222, "grad_norm": 1.0642507076263428, "learning_rate": 2.400890868596882e-05, "loss": 2.585, "step": 3961 }, { "epoch": 0.8804444444444445, "grad_norm": 0.8070052862167358, "learning_rate": 2.3964365256124723e-05, "loss": 1.908, "step": 3962 }, { "epoch": 0.8806666666666667, "grad_norm": 0.8729952573776245, "learning_rate": 2.3919821826280625e-05, "loss": 1.9254, "step": 3963 }, { "epoch": 0.8808888888888889, "grad_norm": 0.12168601900339127, "learning_rate": 2.3875278396436527e-05, "loss": 0.0192, "step": 3964 }, { "epoch": 0.8811111111111111, "grad_norm": 0.6381791830062866, "learning_rate": 2.383073496659243e-05, "loss": 0.9943, "step": 3965 }, { "epoch": 0.8813333333333333, "grad_norm": 1.2023353576660156, "learning_rate": 2.378619153674833e-05, "loss": 1.9608, "step": 3966 }, { "epoch": 0.8815555555555555, "grad_norm": 0.9587229490280151, "learning_rate": 2.3741648106904233e-05, "loss": 1.8919, "step": 3967 }, { "epoch": 0.8817777777777778, "grad_norm": 1.0025968551635742, "learning_rate": 2.3697104677060135e-05, "loss": 1.953, "step": 3968 }, { "epoch": 0.882, "grad_norm": 0.9075009226799011, "learning_rate": 2.3652561247216037e-05, "loss": 1.9855, "step": 3969 }, { "epoch": 0.8822222222222222, "grad_norm": 0.06732242554426193, "learning_rate": 2.360801781737194e-05, "loss": 0.0149, "step": 3970 }, { "epoch": 0.8824444444444445, "grad_norm": 0.06586241722106934, "learning_rate": 2.356347438752784e-05, "loss": 0.015, "step": 3971 }, { "epoch": 0.8826666666666667, "grad_norm": 0.06589429080486298, "learning_rate": 2.3518930957683743e-05, "loss": 0.015, "step": 3972 }, { "epoch": 0.8828888888888888, "grad_norm": 0.7274507284164429, "learning_rate": 2.3474387527839645e-05, "loss": 0.9097, "step": 3973 }, { "epoch": 0.8831111111111111, "grad_norm": 0.9447082877159119, "learning_rate": 2.3429844097995547e-05, "loss": 1.8058, "step": 3974 }, { "epoch": 0.8833333333333333, "grad_norm": 0.13066767156124115, "learning_rate": 2.338530066815145e-05, "loss": 0.0211, "step": 3975 }, { "epoch": 0.8835555555555555, "grad_norm": 0.7804778218269348, "learning_rate": 2.334075723830735e-05, "loss": 1.0144, "step": 3976 }, { "epoch": 0.8837777777777778, "grad_norm": 1.0640380382537842, "learning_rate": 2.3296213808463252e-05, "loss": 1.7064, "step": 3977 }, { "epoch": 0.884, "grad_norm": 1.0175601243972778, "learning_rate": 2.3251670378619154e-05, "loss": 1.9517, "step": 3978 }, { "epoch": 0.8842222222222222, "grad_norm": 1.1040079593658447, "learning_rate": 2.3207126948775056e-05, "loss": 1.8058, "step": 3979 }, { "epoch": 0.8844444444444445, "grad_norm": 0.728284478187561, "learning_rate": 2.316258351893096e-05, "loss": 0.7271, "step": 3980 }, { "epoch": 0.8846666666666667, "grad_norm": 0.9347479939460754, "learning_rate": 2.3118040089086864e-05, "loss": 1.4855, "step": 3981 }, { "epoch": 0.8848888888888888, "grad_norm": 0.07804767787456512, "learning_rate": 2.3073496659242762e-05, "loss": 0.0192, "step": 3982 }, { "epoch": 0.8851111111111111, "grad_norm": 0.08066914230585098, "learning_rate": 2.3028953229398664e-05, "loss": 0.0188, "step": 3983 }, { "epoch": 0.8853333333333333, "grad_norm": 0.7905464768409729, "learning_rate": 2.298440979955457e-05, "loss": 0.8492, "step": 3984 }, { "epoch": 0.8855555555555555, "grad_norm": 1.2028931379318237, "learning_rate": 2.2939866369710468e-05, "loss": 0.9801, "step": 3985 }, { "epoch": 0.8857777777777778, "grad_norm": 0.10054640471935272, "learning_rate": 2.289532293986637e-05, "loss": 0.0247, "step": 3986 }, { "epoch": 0.886, "grad_norm": 1.1364169120788574, "learning_rate": 2.2850779510022272e-05, "loss": 1.6362, "step": 3987 }, { "epoch": 0.8862222222222222, "grad_norm": 1.2560831308364868, "learning_rate": 2.2806236080178174e-05, "loss": 1.8384, "step": 3988 }, { "epoch": 0.8864444444444445, "grad_norm": 1.1757941246032715, "learning_rate": 2.2761692650334076e-05, "loss": 1.4828, "step": 3989 }, { "epoch": 0.8866666666666667, "grad_norm": 1.120353102684021, "learning_rate": 2.2717149220489978e-05, "loss": 1.3649, "step": 3990 }, { "epoch": 0.8868888888888888, "grad_norm": 1.0847200155258179, "learning_rate": 2.267260579064588e-05, "loss": 1.7217, "step": 3991 }, { "epoch": 0.8871111111111111, "grad_norm": 1.1451468467712402, "learning_rate": 2.2628062360801782e-05, "loss": 1.6961, "step": 3992 }, { "epoch": 0.8873333333333333, "grad_norm": 1.0738978385925293, "learning_rate": 2.2583518930957684e-05, "loss": 1.4236, "step": 3993 }, { "epoch": 0.8875555555555555, "grad_norm": 1.3635321855545044, "learning_rate": 2.253897550111359e-05, "loss": 1.4417, "step": 3994 }, { "epoch": 0.8877777777777778, "grad_norm": 0.19308915734291077, "learning_rate": 2.2494432071269488e-05, "loss": 0.0304, "step": 3995 }, { "epoch": 0.888, "grad_norm": 1.2087732553482056, "learning_rate": 2.244988864142539e-05, "loss": 1.4158, "step": 3996 }, { "epoch": 0.8882222222222222, "grad_norm": 1.0328425168991089, "learning_rate": 2.2405345211581295e-05, "loss": 1.014, "step": 3997 }, { "epoch": 0.8884444444444445, "grad_norm": 0.8065721988677979, "learning_rate": 2.2360801781737194e-05, "loss": 0.6411, "step": 3998 }, { "epoch": 0.8886666666666667, "grad_norm": 0.6682571172714233, "learning_rate": 2.2316258351893096e-05, "loss": 0.5168, "step": 3999 }, { "epoch": 0.8888888888888888, "grad_norm": 0.9756750464439392, "learning_rate": 2.2271714922049e-05, "loss": 0.4815, "step": 4000 }, { "epoch": 0.8891111111111111, "grad_norm": 0.5801368355751038, "learning_rate": 2.22271714922049e-05, "loss": 0.8694, "step": 4001 }, { "epoch": 0.8893333333333333, "grad_norm": 0.6287752985954285, "learning_rate": 2.21826280623608e-05, "loss": 1.1768, "step": 4002 }, { "epoch": 0.8895555555555555, "grad_norm": 0.8464820981025696, "learning_rate": 2.2138084632516704e-05, "loss": 2.4031, "step": 4003 }, { "epoch": 0.8897777777777778, "grad_norm": 0.878257691860199, "learning_rate": 2.209354120267261e-05, "loss": 2.1387, "step": 4004 }, { "epoch": 0.89, "grad_norm": 0.6297408938407898, "learning_rate": 2.2048997772828508e-05, "loss": 1.0515, "step": 4005 }, { "epoch": 0.8902222222222222, "grad_norm": 0.8279980421066284, "learning_rate": 2.200445434298441e-05, "loss": 1.992, "step": 4006 }, { "epoch": 0.8904444444444445, "grad_norm": 0.07900725305080414, "learning_rate": 2.1959910913140315e-05, "loss": 0.0116, "step": 4007 }, { "epoch": 0.8906666666666667, "grad_norm": 0.07973389327526093, "learning_rate": 2.1915367483296214e-05, "loss": 0.0113, "step": 4008 }, { "epoch": 0.8908888888888888, "grad_norm": 0.5878556370735168, "learning_rate": 2.1870824053452115e-05, "loss": 1.1704, "step": 4009 }, { "epoch": 0.8911111111111111, "grad_norm": 0.9955252408981323, "learning_rate": 2.182628062360802e-05, "loss": 1.8866, "step": 4010 }, { "epoch": 0.8913333333333333, "grad_norm": 0.876213788986206, "learning_rate": 2.178173719376392e-05, "loss": 1.9363, "step": 4011 }, { "epoch": 0.8915555555555555, "grad_norm": 0.8237855434417725, "learning_rate": 2.173719376391982e-05, "loss": 2.1091, "step": 4012 }, { "epoch": 0.8917777777777778, "grad_norm": 0.6529291868209839, "learning_rate": 2.1692650334075727e-05, "loss": 0.9921, "step": 4013 }, { "epoch": 0.892, "grad_norm": 0.9489926099777222, "learning_rate": 2.1648106904231625e-05, "loss": 1.8424, "step": 4014 }, { "epoch": 0.8922222222222222, "grad_norm": 0.973099946975708, "learning_rate": 2.1603563474387527e-05, "loss": 1.9839, "step": 4015 }, { "epoch": 0.8924444444444445, "grad_norm": 0.8978729248046875, "learning_rate": 2.1559020044543433e-05, "loss": 1.7646, "step": 4016 }, { "epoch": 0.8926666666666667, "grad_norm": 0.9603530764579773, "learning_rate": 2.1514476614699335e-05, "loss": 1.8374, "step": 4017 }, { "epoch": 0.8928888888888888, "grad_norm": 0.07031574100255966, "learning_rate": 2.1469933184855233e-05, "loss": 0.0148, "step": 4018 }, { "epoch": 0.8931111111111111, "grad_norm": 0.06550273299217224, "learning_rate": 2.142538975501114e-05, "loss": 0.0147, "step": 4019 }, { "epoch": 0.8933333333333333, "grad_norm": 0.06782650202512741, "learning_rate": 2.138084632516704e-05, "loss": 0.0149, "step": 4020 }, { "epoch": 0.8935555555555555, "grad_norm": 0.7483673095703125, "learning_rate": 2.133630289532294e-05, "loss": 0.9084, "step": 4021 }, { "epoch": 0.8937777777777778, "grad_norm": 0.9090237617492676, "learning_rate": 2.129175946547884e-05, "loss": 1.8329, "step": 4022 }, { "epoch": 0.894, "grad_norm": 0.7626523971557617, "learning_rate": 2.1247216035634746e-05, "loss": 0.9174, "step": 4023 }, { "epoch": 0.8942222222222223, "grad_norm": 0.6706441640853882, "learning_rate": 2.1202672605790645e-05, "loss": 0.8133, "step": 4024 }, { "epoch": 0.8944444444444445, "grad_norm": 0.9489988684654236, "learning_rate": 2.1158129175946547e-05, "loss": 2.003, "step": 4025 }, { "epoch": 0.8946666666666667, "grad_norm": 0.9574695825576782, "learning_rate": 2.1113585746102452e-05, "loss": 1.5621, "step": 4026 }, { "epoch": 0.8948888888888888, "grad_norm": 1.1038743257522583, "learning_rate": 2.1069042316258354e-05, "loss": 1.8653, "step": 4027 }, { "epoch": 0.8951111111111111, "grad_norm": 1.0262362957000732, "learning_rate": 2.1024498886414253e-05, "loss": 1.6652, "step": 4028 }, { "epoch": 0.8953333333333333, "grad_norm": 0.8741075396537781, "learning_rate": 2.0979955456570158e-05, "loss": 1.6058, "step": 4029 }, { "epoch": 0.8955555555555555, "grad_norm": 0.7687373161315918, "learning_rate": 2.093541202672606e-05, "loss": 0.8818, "step": 4030 }, { "epoch": 0.8957777777777778, "grad_norm": 0.06525861471891403, "learning_rate": 2.089086859688196e-05, "loss": 0.0177, "step": 4031 }, { "epoch": 0.896, "grad_norm": 0.6307370066642761, "learning_rate": 2.0846325167037864e-05, "loss": 0.7724, "step": 4032 }, { "epoch": 0.8962222222222223, "grad_norm": 1.1199438571929932, "learning_rate": 2.0801781737193766e-05, "loss": 1.7534, "step": 4033 }, { "epoch": 0.8964444444444445, "grad_norm": 0.9748408794403076, "learning_rate": 2.0757238307349665e-05, "loss": 1.6166, "step": 4034 }, { "epoch": 0.8966666666666666, "grad_norm": 0.0824805200099945, "learning_rate": 2.071269487750557e-05, "loss": 0.0188, "step": 4035 }, { "epoch": 0.8968888888888888, "grad_norm": 0.09000510722398758, "learning_rate": 2.0668151447661472e-05, "loss": 0.0188, "step": 4036 }, { "epoch": 0.8971111111111111, "grad_norm": 0.08561154454946518, "learning_rate": 2.0623608017817374e-05, "loss": 0.0185, "step": 4037 }, { "epoch": 0.8973333333333333, "grad_norm": 0.7661683559417725, "learning_rate": 2.0579064587973276e-05, "loss": 0.907, "step": 4038 }, { "epoch": 0.8975555555555556, "grad_norm": 0.9890311360359192, "learning_rate": 2.0534521158129178e-05, "loss": 1.3849, "step": 4039 }, { "epoch": 0.8977777777777778, "grad_norm": 0.7973209619522095, "learning_rate": 2.048997772828508e-05, "loss": 0.749, "step": 4040 }, { "epoch": 0.898, "grad_norm": 1.1026244163513184, "learning_rate": 2.044543429844098e-05, "loss": 1.7591, "step": 4041 }, { "epoch": 0.8982222222222223, "grad_norm": 1.1480908393859863, "learning_rate": 2.0400890868596884e-05, "loss": 1.9024, "step": 4042 }, { "epoch": 0.8984444444444445, "grad_norm": 1.0242488384246826, "learning_rate": 2.0356347438752786e-05, "loss": 1.4006, "step": 4043 }, { "epoch": 0.8986666666666666, "grad_norm": 1.117613434791565, "learning_rate": 2.0311804008908684e-05, "loss": 1.3118, "step": 4044 }, { "epoch": 0.8988888888888888, "grad_norm": 1.1382890939712524, "learning_rate": 2.026726057906459e-05, "loss": 1.3193, "step": 4045 }, { "epoch": 0.8991111111111111, "grad_norm": 1.2156895399093628, "learning_rate": 2.0222717149220492e-05, "loss": 1.1794, "step": 4046 }, { "epoch": 0.8993333333333333, "grad_norm": 0.6855819225311279, "learning_rate": 2.017817371937639e-05, "loss": 0.4808, "step": 4047 }, { "epoch": 0.8995555555555556, "grad_norm": 0.1426740288734436, "learning_rate": 2.0133630289532296e-05, "loss": 0.033, "step": 4048 }, { "epoch": 0.8997777777777778, "grad_norm": 1.0770491361618042, "learning_rate": 2.0089086859688198e-05, "loss": 0.8844, "step": 4049 }, { "epoch": 0.9, "grad_norm": 0.9471620917320251, "learning_rate": 2.00445434298441e-05, "loss": 0.9019, "step": 4050 }, { "epoch": 0.9002222222222223, "grad_norm": 0.8870931267738342, "learning_rate": 2e-05, "loss": 1.9518, "step": 4051 }, { "epoch": 0.9004444444444445, "grad_norm": 0.681877851486206, "learning_rate": 1.9955456570155904e-05, "loss": 1.0902, "step": 4052 }, { "epoch": 0.9006666666666666, "grad_norm": 0.8550397157669067, "learning_rate": 1.9910913140311806e-05, "loss": 2.1819, "step": 4053 }, { "epoch": 0.9008888888888889, "grad_norm": 0.5659412741661072, "learning_rate": 1.9866369710467708e-05, "loss": 1.309, "step": 4054 }, { "epoch": 0.9011111111111111, "grad_norm": 0.618087887763977, "learning_rate": 1.982182628062361e-05, "loss": 1.0288, "step": 4055 }, { "epoch": 0.9013333333333333, "grad_norm": 0.5965234637260437, "learning_rate": 1.977728285077951e-05, "loss": 1.0864, "step": 4056 }, { "epoch": 0.9015555555555556, "grad_norm": 0.8751803636550903, "learning_rate": 1.973273942093541e-05, "loss": 2.2079, "step": 4057 }, { "epoch": 0.9017777777777778, "grad_norm": 0.907996416091919, "learning_rate": 1.9688195991091315e-05, "loss": 2.0763, "step": 4058 }, { "epoch": 0.902, "grad_norm": 0.08640366792678833, "learning_rate": 1.9643652561247217e-05, "loss": 0.0116, "step": 4059 }, { "epoch": 0.9022222222222223, "grad_norm": 0.08815193176269531, "learning_rate": 1.959910913140312e-05, "loss": 0.0114, "step": 4060 }, { "epoch": 0.9024444444444445, "grad_norm": 0.08544806391000748, "learning_rate": 1.955456570155902e-05, "loss": 0.0115, "step": 4061 }, { "epoch": 0.9026666666666666, "grad_norm": 0.6461583375930786, "learning_rate": 1.9510022271714923e-05, "loss": 1.1449, "step": 4062 }, { "epoch": 0.9028888888888889, "grad_norm": 0.8666505813598633, "learning_rate": 1.9465478841870825e-05, "loss": 2.0281, "step": 4063 }, { "epoch": 0.9031111111111111, "grad_norm": 1.0003634691238403, "learning_rate": 1.9420935412026727e-05, "loss": 1.929, "step": 4064 }, { "epoch": 0.9033333333333333, "grad_norm": 1.1285505294799805, "learning_rate": 1.937639198218263e-05, "loss": 2.3214, "step": 4065 }, { "epoch": 0.9035555555555556, "grad_norm": 0.6442127823829651, "learning_rate": 1.933184855233853e-05, "loss": 0.79, "step": 4066 }, { "epoch": 0.9037777777777778, "grad_norm": 0.6966649889945984, "learning_rate": 1.9287305122494433e-05, "loss": 0.9614, "step": 4067 }, { "epoch": 0.904, "grad_norm": 1.0646114349365234, "learning_rate": 1.9242761692650335e-05, "loss": 2.0066, "step": 4068 }, { "epoch": 0.9042222222222223, "grad_norm": 1.0722988843917847, "learning_rate": 1.9198218262806237e-05, "loss": 1.8977, "step": 4069 }, { "epoch": 0.9044444444444445, "grad_norm": 0.9870444536209106, "learning_rate": 1.915367483296214e-05, "loss": 1.6182, "step": 4070 }, { "epoch": 0.9046666666666666, "grad_norm": 0.6224427819252014, "learning_rate": 1.910913140311804e-05, "loss": 0.7754, "step": 4071 }, { "epoch": 0.9048888888888889, "grad_norm": 0.06635406613349915, "learning_rate": 1.9064587973273943e-05, "loss": 0.0147, "step": 4072 }, { "epoch": 0.9051111111111111, "grad_norm": 0.07058946043252945, "learning_rate": 1.9020044543429845e-05, "loss": 0.0173, "step": 4073 }, { "epoch": 0.9053333333333333, "grad_norm": 0.6457788348197937, "learning_rate": 1.8975501113585747e-05, "loss": 0.8879, "step": 4074 }, { "epoch": 0.9055555555555556, "grad_norm": 1.0111377239227295, "learning_rate": 1.893095768374165e-05, "loss": 1.617, "step": 4075 }, { "epoch": 0.9057777777777778, "grad_norm": 0.9277496933937073, "learning_rate": 1.888641425389755e-05, "loss": 1.8035, "step": 4076 }, { "epoch": 0.906, "grad_norm": 1.0849852561950684, "learning_rate": 1.8841870824053453e-05, "loss": 1.6758, "step": 4077 }, { "epoch": 0.9062222222222223, "grad_norm": 1.02144455909729, "learning_rate": 1.8797327394209355e-05, "loss": 1.812, "step": 4078 }, { "epoch": 0.9064444444444445, "grad_norm": 1.1183116436004639, "learning_rate": 1.8752783964365257e-05, "loss": 1.8675, "step": 4079 }, { "epoch": 0.9066666666666666, "grad_norm": 1.3064316511154175, "learning_rate": 1.870824053452116e-05, "loss": 1.7242, "step": 4080 }, { "epoch": 0.9068888888888889, "grad_norm": 0.06701880693435669, "learning_rate": 1.866369710467706e-05, "loss": 0.0177, "step": 4081 }, { "epoch": 0.9071111111111111, "grad_norm": 0.06481373310089111, "learning_rate": 1.8619153674832963e-05, "loss": 0.0178, "step": 4082 }, { "epoch": 0.9073333333333333, "grad_norm": 0.7761397361755371, "learning_rate": 1.8574610244988865e-05, "loss": 0.7757, "step": 4083 }, { "epoch": 0.9075555555555556, "grad_norm": 1.0291235446929932, "learning_rate": 1.8530066815144767e-05, "loss": 1.6664, "step": 4084 }, { "epoch": 0.9077777777777778, "grad_norm": 0.7274791598320007, "learning_rate": 1.848552338530067e-05, "loss": 0.979, "step": 4085 }, { "epoch": 0.908, "grad_norm": 0.6331042647361755, "learning_rate": 1.844097995545657e-05, "loss": 0.6799, "step": 4086 }, { "epoch": 0.9082222222222223, "grad_norm": 0.08008535206317902, "learning_rate": 1.8396436525612473e-05, "loss": 0.018, "step": 4087 }, { "epoch": 0.9084444444444445, "grad_norm": 0.08232392370700836, "learning_rate": 1.8351893095768375e-05, "loss": 0.018, "step": 4088 }, { "epoch": 0.9086666666666666, "grad_norm": 0.6026217341423035, "learning_rate": 1.830734966592428e-05, "loss": 0.8012, "step": 4089 }, { "epoch": 0.9088888888888889, "grad_norm": 1.0574473142623901, "learning_rate": 1.826280623608018e-05, "loss": 1.7651, "step": 4090 }, { "epoch": 0.9091111111111111, "grad_norm": 1.1249449253082275, "learning_rate": 1.821826280623608e-05, "loss": 1.6539, "step": 4091 }, { "epoch": 0.9093333333333333, "grad_norm": 0.7003470659255981, "learning_rate": 1.8173719376391986e-05, "loss": 0.764, "step": 4092 }, { "epoch": 0.9095555555555556, "grad_norm": 1.0299309492111206, "learning_rate": 1.8129175946547884e-05, "loss": 1.3027, "step": 4093 }, { "epoch": 0.9097777777777778, "grad_norm": 0.21282123029232025, "learning_rate": 1.8084632516703786e-05, "loss": 0.0309, "step": 4094 }, { "epoch": 0.91, "grad_norm": 1.1035081148147583, "learning_rate": 1.804008908685969e-05, "loss": 1.2723, "step": 4095 }, { "epoch": 0.9102222222222223, "grad_norm": 0.9910405874252319, "learning_rate": 1.799554565701559e-05, "loss": 1.0153, "step": 4096 }, { "epoch": 0.9104444444444444, "grad_norm": 1.0081919431686401, "learning_rate": 1.7951002227171492e-05, "loss": 1.0348, "step": 4097 }, { "epoch": 0.9106666666666666, "grad_norm": 0.1447010040283203, "learning_rate": 1.7906458797327394e-05, "loss": 0.0327, "step": 4098 }, { "epoch": 0.9108888888888889, "grad_norm": 1.1237828731536865, "learning_rate": 1.7861915367483296e-05, "loss": 1.332, "step": 4099 }, { "epoch": 0.9111111111111111, "grad_norm": 1.0061198472976685, "learning_rate": 1.7817371937639198e-05, "loss": 0.9762, "step": 4100 }, { "epoch": 0.9113333333333333, "grad_norm": 0.045394111424684525, "learning_rate": 1.77728285077951e-05, "loss": 0.0105, "step": 4101 }, { "epoch": 0.9115555555555556, "grad_norm": 0.6273143291473389, "learning_rate": 1.7728285077951006e-05, "loss": 0.8231, "step": 4102 }, { "epoch": 0.9117777777777778, "grad_norm": 0.5369709730148315, "learning_rate": 1.7683741648106904e-05, "loss": 1.0971, "step": 4103 }, { "epoch": 0.912, "grad_norm": 0.841785728931427, "learning_rate": 1.7639198218262806e-05, "loss": 2.1345, "step": 4104 }, { "epoch": 0.9122222222222223, "grad_norm": 0.5022440552711487, "learning_rate": 1.759465478841871e-05, "loss": 1.0847, "step": 4105 }, { "epoch": 0.9124444444444444, "grad_norm": 0.5736976265907288, "learning_rate": 1.755011135857461e-05, "loss": 1.0705, "step": 4106 }, { "epoch": 0.9126666666666666, "grad_norm": 0.7846779227256775, "learning_rate": 1.7505567928730512e-05, "loss": 2.2198, "step": 4107 }, { "epoch": 0.9128888888888889, "grad_norm": 0.04545416682958603, "learning_rate": 1.7461024498886417e-05, "loss": 0.0105, "step": 4108 }, { "epoch": 0.9131111111111111, "grad_norm": 0.8995314240455627, "learning_rate": 1.7416481069042316e-05, "loss": 2.0983, "step": 4109 }, { "epoch": 0.9133333333333333, "grad_norm": 0.08467597514390945, "learning_rate": 1.7371937639198218e-05, "loss": 0.0111, "step": 4110 }, { "epoch": 0.9135555555555556, "grad_norm": 0.06848177313804626, "learning_rate": 1.732739420935412e-05, "loss": 0.0109, "step": 4111 }, { "epoch": 0.9137777777777778, "grad_norm": 0.6615252494812012, "learning_rate": 1.7282850779510025e-05, "loss": 1.0423, "step": 4112 }, { "epoch": 0.914, "grad_norm": 0.8934789896011353, "learning_rate": 1.7238307349665924e-05, "loss": 1.8382, "step": 4113 }, { "epoch": 0.9142222222222223, "grad_norm": 0.8137645125389099, "learning_rate": 1.7193763919821826e-05, "loss": 1.9163, "step": 4114 }, { "epoch": 0.9144444444444444, "grad_norm": 0.8993197083473206, "learning_rate": 1.714922048997773e-05, "loss": 2.1383, "step": 4115 }, { "epoch": 0.9146666666666666, "grad_norm": 0.908676028251648, "learning_rate": 1.710467706013363e-05, "loss": 2.0524, "step": 4116 }, { "epoch": 0.9148888888888889, "grad_norm": 0.6348316669464111, "learning_rate": 1.7060133630289532e-05, "loss": 0.9211, "step": 4117 }, { "epoch": 0.9151111111111111, "grad_norm": 0.10803266614675522, "learning_rate": 1.7015590200445437e-05, "loss": 0.0177, "step": 4118 }, { "epoch": 0.9153333333333333, "grad_norm": 0.5778976678848267, "learning_rate": 1.6971046770601336e-05, "loss": 0.7947, "step": 4119 }, { "epoch": 0.9155555555555556, "grad_norm": 0.9023910164833069, "learning_rate": 1.6926503340757238e-05, "loss": 1.6568, "step": 4120 }, { "epoch": 0.9157777777777778, "grad_norm": 0.7427157759666443, "learning_rate": 1.6881959910913143e-05, "loss": 0.6653, "step": 4121 }, { "epoch": 0.916, "grad_norm": 0.8236956000328064, "learning_rate": 1.683741648106904e-05, "loss": 1.0835, "step": 4122 }, { "epoch": 0.9162222222222223, "grad_norm": 0.788445234298706, "learning_rate": 1.6792873051224944e-05, "loss": 1.1082, "step": 4123 }, { "epoch": 0.9164444444444444, "grad_norm": 0.7391776442527771, "learning_rate": 1.674832962138085e-05, "loss": 1.0042, "step": 4124 }, { "epoch": 0.9166666666666666, "grad_norm": 0.993009626865387, "learning_rate": 1.670378619153675e-05, "loss": 1.7091, "step": 4125 }, { "epoch": 0.9168888888888889, "grad_norm": 1.1671327352523804, "learning_rate": 1.665924276169265e-05, "loss": 1.8704, "step": 4126 }, { "epoch": 0.9171111111111111, "grad_norm": 0.9321463704109192, "learning_rate": 1.6614699331848555e-05, "loss": 1.3912, "step": 4127 }, { "epoch": 0.9173333333333333, "grad_norm": 0.7201982140541077, "learning_rate": 1.6570155902004457e-05, "loss": 1.0362, "step": 4128 }, { "epoch": 0.9175555555555556, "grad_norm": 0.06468725949525833, "learning_rate": 1.6525612472160355e-05, "loss": 0.0177, "step": 4129 }, { "epoch": 0.9177777777777778, "grad_norm": 0.06377862393856049, "learning_rate": 1.6481069042316257e-05, "loss": 0.0175, "step": 4130 }, { "epoch": 0.918, "grad_norm": 1.2018589973449707, "learning_rate": 1.6436525612472163e-05, "loss": 2.088, "step": 4131 }, { "epoch": 0.9182222222222223, "grad_norm": 0.08640787750482559, "learning_rate": 1.639198218262806e-05, "loss": 0.0178, "step": 4132 }, { "epoch": 0.9184444444444444, "grad_norm": 0.07626676559448242, "learning_rate": 1.6347438752783963e-05, "loss": 0.0179, "step": 4133 }, { "epoch": 0.9186666666666666, "grad_norm": 0.684622585773468, "learning_rate": 1.630289532293987e-05, "loss": 0.9346, "step": 4134 }, { "epoch": 0.9188888888888889, "grad_norm": 1.07980215549469, "learning_rate": 1.625835189309577e-05, "loss": 1.8093, "step": 4135 }, { "epoch": 0.9191111111111111, "grad_norm": 1.0103257894515991, "learning_rate": 1.621380846325167e-05, "loss": 1.3954, "step": 4136 }, { "epoch": 0.9193333333333333, "grad_norm": 1.0367659330368042, "learning_rate": 1.6169265033407574e-05, "loss": 1.665, "step": 4137 }, { "epoch": 0.9195555555555556, "grad_norm": 1.13039231300354, "learning_rate": 1.6124721603563476e-05, "loss": 1.3939, "step": 4138 }, { "epoch": 0.9197777777777778, "grad_norm": 1.2525602579116821, "learning_rate": 1.6080178173719375e-05, "loss": 1.7406, "step": 4139 }, { "epoch": 0.92, "grad_norm": 1.0886310338974, "learning_rate": 1.603563474387528e-05, "loss": 1.447, "step": 4140 }, { "epoch": 0.9202222222222223, "grad_norm": 1.0128674507141113, "learning_rate": 1.5991091314031182e-05, "loss": 1.3665, "step": 4141 }, { "epoch": 0.9204444444444444, "grad_norm": 1.087297797203064, "learning_rate": 1.594654788418708e-05, "loss": 1.2043, "step": 4142 }, { "epoch": 0.9206666666666666, "grad_norm": 1.0681723356246948, "learning_rate": 1.5902004454342986e-05, "loss": 1.3818, "step": 4143 }, { "epoch": 0.9208888888888889, "grad_norm": 0.8169934153556824, "learning_rate": 1.5857461024498888e-05, "loss": 0.7076, "step": 4144 }, { "epoch": 0.9211111111111111, "grad_norm": 1.1659146547317505, "learning_rate": 1.581291759465479e-05, "loss": 1.3053, "step": 4145 }, { "epoch": 0.9213333333333333, "grad_norm": 1.0384572744369507, "learning_rate": 1.5768374164810692e-05, "loss": 1.1864, "step": 4146 }, { "epoch": 0.9215555555555556, "grad_norm": 1.0471240282058716, "learning_rate": 1.5723830734966594e-05, "loss": 1.066, "step": 4147 }, { "epoch": 0.9217777777777778, "grad_norm": 0.7413065433502197, "learning_rate": 1.5679287305122496e-05, "loss": 0.5414, "step": 4148 }, { "epoch": 0.922, "grad_norm": 0.9853238463401794, "learning_rate": 1.5634743875278395e-05, "loss": 0.9216, "step": 4149 }, { "epoch": 0.9222222222222223, "grad_norm": 0.9708325266838074, "learning_rate": 1.55902004454343e-05, "loss": 0.8306, "step": 4150 }, { "epoch": 0.9224444444444444, "grad_norm": 0.565495491027832, "learning_rate": 1.5545657015590202e-05, "loss": 1.0385, "step": 4151 }, { "epoch": 0.9226666666666666, "grad_norm": 0.046501629054546356, "learning_rate": 1.55011135857461e-05, "loss": 0.0103, "step": 4152 }, { "epoch": 0.9228888888888889, "grad_norm": 0.5920565128326416, "learning_rate": 1.5456570155902006e-05, "loss": 1.1167, "step": 4153 }, { "epoch": 0.9231111111111111, "grad_norm": 0.6111573576927185, "learning_rate": 1.5412026726057908e-05, "loss": 0.9939, "step": 4154 }, { "epoch": 0.9233333333333333, "grad_norm": 0.5594994425773621, "learning_rate": 1.5367483296213807e-05, "loss": 1.1777, "step": 4155 }, { "epoch": 0.9235555555555556, "grad_norm": 0.05240090563893318, "learning_rate": 1.5322939866369712e-05, "loss": 0.0107, "step": 4156 }, { "epoch": 0.9237777777777778, "grad_norm": 0.8332452178001404, "learning_rate": 1.5278396436525614e-05, "loss": 2.1969, "step": 4157 }, { "epoch": 0.924, "grad_norm": 0.60703045129776, "learning_rate": 1.5233853006681514e-05, "loss": 0.8806, "step": 4158 }, { "epoch": 0.9242222222222222, "grad_norm": 0.0834372490644455, "learning_rate": 1.5189309576837418e-05, "loss": 0.0109, "step": 4159 }, { "epoch": 0.9244444444444444, "grad_norm": 0.06800226867198944, "learning_rate": 1.514476614699332e-05, "loss": 0.0111, "step": 4160 }, { "epoch": 0.9246666666666666, "grad_norm": 0.0833854079246521, "learning_rate": 1.510022271714922e-05, "loss": 0.0112, "step": 4161 }, { "epoch": 0.9248888888888889, "grad_norm": 0.6188771724700928, "learning_rate": 1.5055679287305124e-05, "loss": 0.9193, "step": 4162 }, { "epoch": 0.9251111111111111, "grad_norm": 0.9110550284385681, "learning_rate": 1.5011135857461026e-05, "loss": 1.8064, "step": 4163 }, { "epoch": 0.9253333333333333, "grad_norm": 0.9889512658119202, "learning_rate": 1.4966592427616926e-05, "loss": 2.1231, "step": 4164 }, { "epoch": 0.9255555555555556, "grad_norm": 0.700734555721283, "learning_rate": 1.4922048997772831e-05, "loss": 0.9772, "step": 4165 }, { "epoch": 0.9257777777777778, "grad_norm": 0.10837709903717041, "learning_rate": 1.4877505567928732e-05, "loss": 0.0171, "step": 4166 }, { "epoch": 0.926, "grad_norm": 0.11039458960294724, "learning_rate": 1.4832962138084634e-05, "loss": 0.017, "step": 4167 }, { "epoch": 0.9262222222222222, "grad_norm": 1.0548503398895264, "learning_rate": 1.4788418708240534e-05, "loss": 2.02, "step": 4168 }, { "epoch": 0.9264444444444444, "grad_norm": 1.027597427368164, "learning_rate": 1.4743875278396438e-05, "loss": 2.0137, "step": 4169 }, { "epoch": 0.9266666666666666, "grad_norm": 0.9742117524147034, "learning_rate": 1.469933184855234e-05, "loss": 1.6216, "step": 4170 }, { "epoch": 0.9268888888888889, "grad_norm": 1.1034184694290161, "learning_rate": 1.465478841870824e-05, "loss": 1.9359, "step": 4171 }, { "epoch": 0.9271111111111111, "grad_norm": 0.9604677557945251, "learning_rate": 1.4610244988864143e-05, "loss": 1.9562, "step": 4172 }, { "epoch": 0.9273333333333333, "grad_norm": 0.07002148032188416, "learning_rate": 1.4565701559020045e-05, "loss": 0.0148, "step": 4173 }, { "epoch": 0.9275555555555556, "grad_norm": 0.07596537470817566, "learning_rate": 1.4521158129175946e-05, "loss": 0.015, "step": 4174 }, { "epoch": 0.9277777777777778, "grad_norm": 0.07176879793405533, "learning_rate": 1.447661469933185e-05, "loss": 0.0153, "step": 4175 }, { "epoch": 0.928, "grad_norm": 0.6465597152709961, "learning_rate": 1.4432071269487751e-05, "loss": 0.8603, "step": 4176 }, { "epoch": 0.9282222222222222, "grad_norm": 0.7567986249923706, "learning_rate": 1.4387527839643652e-05, "loss": 0.974, "step": 4177 }, { "epoch": 0.9284444444444444, "grad_norm": 0.9745081067085266, "learning_rate": 1.4342984409799557e-05, "loss": 1.7431, "step": 4178 }, { "epoch": 0.9286666666666666, "grad_norm": 1.2606123685836792, "learning_rate": 1.4298440979955457e-05, "loss": 1.8718, "step": 4179 }, { "epoch": 0.9288888888888889, "grad_norm": 1.018596887588501, "learning_rate": 1.425389755011136e-05, "loss": 1.7782, "step": 4180 }, { "epoch": 0.9291111111111111, "grad_norm": 0.961146891117096, "learning_rate": 1.4209354120267263e-05, "loss": 1.6335, "step": 4181 }, { "epoch": 0.9293333333333333, "grad_norm": 0.9671225547790527, "learning_rate": 1.4164810690423163e-05, "loss": 1.507, "step": 4182 }, { "epoch": 0.9295555555555556, "grad_norm": 0.06404636800289154, "learning_rate": 1.4120267260579065e-05, "loss": 0.0178, "step": 4183 }, { "epoch": 0.9297777777777778, "grad_norm": 0.06602248549461365, "learning_rate": 1.4075723830734965e-05, "loss": 0.0179, "step": 4184 }, { "epoch": 0.93, "grad_norm": 0.06565021723508835, "learning_rate": 1.4031180400890869e-05, "loss": 0.0178, "step": 4185 }, { "epoch": 0.9302222222222222, "grad_norm": 1.20155930519104, "learning_rate": 1.3986636971046771e-05, "loss": 1.839, "step": 4186 }, { "epoch": 0.9304444444444444, "grad_norm": 0.08105297386646271, "learning_rate": 1.3942093541202671e-05, "loss": 0.0171, "step": 4187 }, { "epoch": 0.9306666666666666, "grad_norm": 0.07562917470932007, "learning_rate": 1.3897550111358577e-05, "loss": 0.0173, "step": 4188 }, { "epoch": 0.9308888888888889, "grad_norm": 0.9316315054893494, "learning_rate": 1.3853006681514477e-05, "loss": 0.9604, "step": 4189 }, { "epoch": 0.9311111111111111, "grad_norm": 1.0534974336624146, "learning_rate": 1.3808463251670379e-05, "loss": 1.5912, "step": 4190 }, { "epoch": 0.9313333333333333, "grad_norm": 1.1811798810958862, "learning_rate": 1.3763919821826283e-05, "loss": 1.76, "step": 4191 }, { "epoch": 0.9315555555555556, "grad_norm": 1.158895492553711, "learning_rate": 1.3719376391982183e-05, "loss": 1.6761, "step": 4192 }, { "epoch": 0.9317777777777778, "grad_norm": 1.1721011400222778, "learning_rate": 1.3674832962138085e-05, "loss": 1.3907, "step": 4193 }, { "epoch": 0.932, "grad_norm": 0.180902361869812, "learning_rate": 1.3630289532293989e-05, "loss": 0.0291, "step": 4194 }, { "epoch": 0.9322222222222222, "grad_norm": 0.17803487181663513, "learning_rate": 1.3585746102449889e-05, "loss": 0.029, "step": 4195 }, { "epoch": 0.9324444444444444, "grad_norm": 1.2288343906402588, "learning_rate": 1.354120267260579e-05, "loss": 1.5034, "step": 4196 }, { "epoch": 0.9326666666666666, "grad_norm": 1.1452394723892212, "learning_rate": 1.3496659242761694e-05, "loss": 1.3718, "step": 4197 }, { "epoch": 0.9328888888888889, "grad_norm": 1.1846433877944946, "learning_rate": 1.3452115812917596e-05, "loss": 1.1493, "step": 4198 }, { "epoch": 0.9331111111111111, "grad_norm": 1.074758529663086, "learning_rate": 1.3407572383073497e-05, "loss": 1.3701, "step": 4199 }, { "epoch": 0.9333333333333333, "grad_norm": 1.0819156169891357, "learning_rate": 1.33630289532294e-05, "loss": 0.4173, "step": 4200 }, { "epoch": 0.9335555555555556, "grad_norm": 0.04741929471492767, "learning_rate": 1.3318485523385302e-05, "loss": 0.0103, "step": 4201 }, { "epoch": 0.9337777777777778, "grad_norm": 0.5368507504463196, "learning_rate": 1.3273942093541203e-05, "loss": 0.9904, "step": 4202 }, { "epoch": 0.934, "grad_norm": 0.9232761859893799, "learning_rate": 1.3229398663697105e-05, "loss": 2.0784, "step": 4203 }, { "epoch": 0.9342222222222222, "grad_norm": 0.8134653568267822, "learning_rate": 1.3184855233853008e-05, "loss": 1.8798, "step": 4204 }, { "epoch": 0.9344444444444444, "grad_norm": 0.6048870086669922, "learning_rate": 1.3140311804008909e-05, "loss": 1.0963, "step": 4205 }, { "epoch": 0.9346666666666666, "grad_norm": 0.6360740065574646, "learning_rate": 1.309576837416481e-05, "loss": 1.0909, "step": 4206 }, { "epoch": 0.9348888888888889, "grad_norm": 0.6409539580345154, "learning_rate": 1.3051224944320714e-05, "loss": 0.9653, "step": 4207 }, { "epoch": 0.9351111111111111, "grad_norm": 0.8436147570610046, "learning_rate": 1.3006681514476614e-05, "loss": 2.0953, "step": 4208 }, { "epoch": 0.9353333333333333, "grad_norm": 0.06498900055885315, "learning_rate": 1.2962138084632516e-05, "loss": 0.0108, "step": 4209 }, { "epoch": 0.9355555555555556, "grad_norm": 0.06997820734977722, "learning_rate": 1.291759465478842e-05, "loss": 0.0104, "step": 4210 }, { "epoch": 0.9357777777777778, "grad_norm": 0.07132133841514587, "learning_rate": 1.2873051224944322e-05, "loss": 0.0105, "step": 4211 }, { "epoch": 0.936, "grad_norm": 0.880042552947998, "learning_rate": 1.2828507795100222e-05, "loss": 2.0466, "step": 4212 }, { "epoch": 0.9362222222222222, "grad_norm": 0.9125528931617737, "learning_rate": 1.2783964365256126e-05, "loss": 1.4838, "step": 4213 }, { "epoch": 0.9364444444444444, "grad_norm": 0.8474193811416626, "learning_rate": 1.2739420935412028e-05, "loss": 1.9134, "step": 4214 }, { "epoch": 0.9366666666666666, "grad_norm": 0.9230953454971313, "learning_rate": 1.2694877505567928e-05, "loss": 1.9544, "step": 4215 }, { "epoch": 0.9368888888888889, "grad_norm": 0.9458275437355042, "learning_rate": 1.2650334075723832e-05, "loss": 1.9736, "step": 4216 }, { "epoch": 0.9371111111111111, "grad_norm": 1.3373515605926514, "learning_rate": 1.2605790645879734e-05, "loss": 2.2127, "step": 4217 }, { "epoch": 0.9373333333333334, "grad_norm": 0.09605936706066132, "learning_rate": 1.2561247216035634e-05, "loss": 0.0156, "step": 4218 }, { "epoch": 0.9375555555555556, "grad_norm": 0.9786912798881531, "learning_rate": 1.251670378619154e-05, "loss": 1.8977, "step": 4219 }, { "epoch": 0.9377777777777778, "grad_norm": 0.9807332754135132, "learning_rate": 1.247216035634744e-05, "loss": 1.8869, "step": 4220 }, { "epoch": 0.938, "grad_norm": 0.8643897175788879, "learning_rate": 1.2427616926503342e-05, "loss": 1.6852, "step": 4221 }, { "epoch": 0.9382222222222222, "grad_norm": 1.1636970043182373, "learning_rate": 1.2383073496659244e-05, "loss": 1.5879, "step": 4222 }, { "epoch": 0.9384444444444444, "grad_norm": 0.0700439065694809, "learning_rate": 1.2338530066815146e-05, "loss": 0.015, "step": 4223 }, { "epoch": 0.9386666666666666, "grad_norm": 0.07256254553794861, "learning_rate": 1.2293986636971048e-05, "loss": 0.015, "step": 4224 }, { "epoch": 0.9388888888888889, "grad_norm": 0.6184701919555664, "learning_rate": 1.224944320712695e-05, "loss": 0.8797, "step": 4225 }, { "epoch": 0.9391111111111111, "grad_norm": 0.9730594158172607, "learning_rate": 1.2204899777282852e-05, "loss": 1.6952, "step": 4226 }, { "epoch": 0.9393333333333334, "grad_norm": 0.8093335628509521, "learning_rate": 1.2160356347438754e-05, "loss": 1.7324, "step": 4227 }, { "epoch": 0.9395555555555556, "grad_norm": 1.0205293893814087, "learning_rate": 1.2115812917594656e-05, "loss": 1.6117, "step": 4228 }, { "epoch": 0.9397777777777778, "grad_norm": 0.9411073327064514, "learning_rate": 1.2071269487750557e-05, "loss": 1.6911, "step": 4229 }, { "epoch": 0.94, "grad_norm": 1.1085401773452759, "learning_rate": 1.202672605790646e-05, "loss": 1.822, "step": 4230 }, { "epoch": 0.9402222222222222, "grad_norm": 1.0736027956008911, "learning_rate": 1.1982182628062361e-05, "loss": 1.7295, "step": 4231 }, { "epoch": 0.9404444444444444, "grad_norm": 1.0651301145553589, "learning_rate": 1.1937639198218263e-05, "loss": 1.6569, "step": 4232 }, { "epoch": 0.9406666666666667, "grad_norm": 0.9908804297447205, "learning_rate": 1.1893095768374165e-05, "loss": 1.7012, "step": 4233 }, { "epoch": 0.9408888888888889, "grad_norm": 0.9733399748802185, "learning_rate": 1.1848552338530067e-05, "loss": 1.7654, "step": 4234 }, { "epoch": 0.9411111111111111, "grad_norm": 0.8500748872756958, "learning_rate": 1.180400890868597e-05, "loss": 0.8944, "step": 4235 }, { "epoch": 0.9413333333333334, "grad_norm": 0.7698972821235657, "learning_rate": 1.1759465478841871e-05, "loss": 0.6535, "step": 4236 }, { "epoch": 0.9415555555555556, "grad_norm": 0.976285994052887, "learning_rate": 1.1714922048997773e-05, "loss": 1.5464, "step": 4237 }, { "epoch": 0.9417777777777778, "grad_norm": 0.07256097346544266, "learning_rate": 1.1670378619153675e-05, "loss": 0.0177, "step": 4238 }, { "epoch": 0.942, "grad_norm": 0.8034363389015198, "learning_rate": 1.1625835189309577e-05, "loss": 0.7093, "step": 4239 }, { "epoch": 0.9422222222222222, "grad_norm": 1.131561279296875, "learning_rate": 1.158129175946548e-05, "loss": 1.6093, "step": 4240 }, { "epoch": 0.9424444444444444, "grad_norm": 0.6806484460830688, "learning_rate": 1.1536748329621381e-05, "loss": 0.5989, "step": 4241 }, { "epoch": 0.9426666666666667, "grad_norm": 0.7716354727745056, "learning_rate": 1.1492204899777285e-05, "loss": 0.8757, "step": 4242 }, { "epoch": 0.9428888888888889, "grad_norm": 1.0691800117492676, "learning_rate": 1.1447661469933185e-05, "loss": 1.525, "step": 4243 }, { "epoch": 0.9431111111111111, "grad_norm": 1.1259671449661255, "learning_rate": 1.1403118040089087e-05, "loss": 1.3549, "step": 4244 }, { "epoch": 0.9433333333333334, "grad_norm": 1.147953987121582, "learning_rate": 1.1358574610244989e-05, "loss": 1.5969, "step": 4245 }, { "epoch": 0.9435555555555556, "grad_norm": 0.932826578617096, "learning_rate": 1.1314031180400891e-05, "loss": 1.3558, "step": 4246 }, { "epoch": 0.9437777777777778, "grad_norm": 1.2732888460159302, "learning_rate": 1.1269487750556795e-05, "loss": 1.2856, "step": 4247 }, { "epoch": 0.944, "grad_norm": 1.4992557764053345, "learning_rate": 1.1224944320712695e-05, "loss": 1.1909, "step": 4248 }, { "epoch": 0.9442222222222222, "grad_norm": 0.13996893167495728, "learning_rate": 1.1180400890868597e-05, "loss": 0.0324, "step": 4249 }, { "epoch": 0.9444444444444444, "grad_norm": 1.2993894815444946, "learning_rate": 1.11358574610245e-05, "loss": 1.1957, "step": 4250 }, { "epoch": 0.9446666666666667, "grad_norm": 0.5980258584022522, "learning_rate": 1.10913140311804e-05, "loss": 1.0878, "step": 4251 }, { "epoch": 0.9448888888888889, "grad_norm": 0.044246070086956024, "learning_rate": 1.1046770601336305e-05, "loss": 0.0103, "step": 4252 }, { "epoch": 0.9451111111111111, "grad_norm": 0.04438783973455429, "learning_rate": 1.1002227171492205e-05, "loss": 0.0103, "step": 4253 }, { "epoch": 0.9453333333333334, "grad_norm": 0.5702335834503174, "learning_rate": 1.0957683741648107e-05, "loss": 1.2407, "step": 4254 }, { "epoch": 0.9455555555555556, "grad_norm": 0.5994575023651123, "learning_rate": 1.091314031180401e-05, "loss": 0.974, "step": 4255 }, { "epoch": 0.9457777777777778, "grad_norm": 0.916534423828125, "learning_rate": 1.086859688195991e-05, "loss": 2.1087, "step": 4256 }, { "epoch": 0.946, "grad_norm": 0.8901073336601257, "learning_rate": 1.0824053452115813e-05, "loss": 2.4281, "step": 4257 }, { "epoch": 0.9462222222222222, "grad_norm": 0.9570392966270447, "learning_rate": 1.0779510022271716e-05, "loss": 1.9183, "step": 4258 }, { "epoch": 0.9464444444444444, "grad_norm": 0.880129873752594, "learning_rate": 1.0734966592427617e-05, "loss": 2.15, "step": 4259 }, { "epoch": 0.9466666666666667, "grad_norm": 0.06885567307472229, "learning_rate": 1.069042316258352e-05, "loss": 0.0106, "step": 4260 }, { "epoch": 0.9468888888888889, "grad_norm": 0.8540828824043274, "learning_rate": 1.064587973273942e-05, "loss": 1.9726, "step": 4261 }, { "epoch": 0.9471111111111111, "grad_norm": 0.9908187985420227, "learning_rate": 1.0601336302895323e-05, "loss": 2.4167, "step": 4262 }, { "epoch": 0.9473333333333334, "grad_norm": 0.9318075180053711, "learning_rate": 1.0556792873051226e-05, "loss": 1.9713, "step": 4263 }, { "epoch": 0.9475555555555556, "grad_norm": 0.9172812700271606, "learning_rate": 1.0512249443207126e-05, "loss": 1.9912, "step": 4264 }, { "epoch": 0.9477777777777778, "grad_norm": 0.6781771779060364, "learning_rate": 1.046770601336303e-05, "loss": 1.0824, "step": 4265 }, { "epoch": 0.948, "grad_norm": 0.7084238529205322, "learning_rate": 1.0423162583518932e-05, "loss": 0.9626, "step": 4266 }, { "epoch": 0.9482222222222222, "grad_norm": 1.27622389793396, "learning_rate": 1.0378619153674832e-05, "loss": 1.8587, "step": 4267 }, { "epoch": 0.9484444444444444, "grad_norm": 0.9670615792274475, "learning_rate": 1.0334075723830736e-05, "loss": 1.6965, "step": 4268 }, { "epoch": 0.9486666666666667, "grad_norm": 0.07310232520103455, "learning_rate": 1.0289532293986638e-05, "loss": 0.0157, "step": 4269 }, { "epoch": 0.9488888888888889, "grad_norm": 0.9369098544120789, "learning_rate": 1.024498886414254e-05, "loss": 1.7478, "step": 4270 }, { "epoch": 0.9491111111111111, "grad_norm": 0.7294790148735046, "learning_rate": 1.0200445434298442e-05, "loss": 1.1187, "step": 4271 }, { "epoch": 0.9493333333333334, "grad_norm": 0.9501145482063293, "learning_rate": 1.0155902004454342e-05, "loss": 1.6452, "step": 4272 }, { "epoch": 0.9495555555555556, "grad_norm": 1.1477452516555786, "learning_rate": 1.0111358574610246e-05, "loss": 1.8428, "step": 4273 }, { "epoch": 0.9497777777777778, "grad_norm": 0.9523744583129883, "learning_rate": 1.0066815144766148e-05, "loss": 1.8126, "step": 4274 }, { "epoch": 0.95, "grad_norm": 0.9534154534339905, "learning_rate": 1.002227171492205e-05, "loss": 1.7667, "step": 4275 }, { "epoch": 0.9502222222222222, "grad_norm": 1.2276133298873901, "learning_rate": 9.977728285077952e-06, "loss": 1.901, "step": 4276 }, { "epoch": 0.9504444444444444, "grad_norm": 0.7759425044059753, "learning_rate": 9.933184855233854e-06, "loss": 0.7691, "step": 4277 }, { "epoch": 0.9506666666666667, "grad_norm": 0.06685524433851242, "learning_rate": 9.888641425389756e-06, "loss": 0.0175, "step": 4278 }, { "epoch": 0.9508888888888889, "grad_norm": 0.7249411940574646, "learning_rate": 9.844097995545658e-06, "loss": 0.983, "step": 4279 }, { "epoch": 0.9511111111111111, "grad_norm": 0.6953089237213135, "learning_rate": 9.79955456570156e-06, "loss": 0.9221, "step": 4280 }, { "epoch": 0.9513333333333334, "grad_norm": 1.0122225284576416, "learning_rate": 9.755011135857462e-06, "loss": 1.6027, "step": 4281 }, { "epoch": 0.9515555555555556, "grad_norm": 1.0647549629211426, "learning_rate": 9.710467706013364e-06, "loss": 1.6956, "step": 4282 }, { "epoch": 0.9517777777777777, "grad_norm": 1.0427286624908447, "learning_rate": 9.665924276169266e-06, "loss": 1.6593, "step": 4283 }, { "epoch": 0.952, "grad_norm": 0.07722879201173782, "learning_rate": 9.621380846325168e-06, "loss": 0.0186, "step": 4284 }, { "epoch": 0.9522222222222222, "grad_norm": 0.7623983025550842, "learning_rate": 9.57683741648107e-06, "loss": 0.9881, "step": 4285 }, { "epoch": 0.9524444444444444, "grad_norm": 1.0861274003982544, "learning_rate": 9.532293986636972e-06, "loss": 1.5431, "step": 4286 }, { "epoch": 0.9526666666666667, "grad_norm": 0.9466423988342285, "learning_rate": 9.487750556792873e-06, "loss": 1.5866, "step": 4287 }, { "epoch": 0.9528888888888889, "grad_norm": 0.7006486058235168, "learning_rate": 9.443207126948775e-06, "loss": 0.8612, "step": 4288 }, { "epoch": 0.9531111111111111, "grad_norm": 0.9445701241493225, "learning_rate": 9.398663697104677e-06, "loss": 1.4541, "step": 4289 }, { "epoch": 0.9533333333333334, "grad_norm": 1.2460566759109497, "learning_rate": 9.35412026726058e-06, "loss": 1.6395, "step": 4290 }, { "epoch": 0.9535555555555556, "grad_norm": 1.0960421562194824, "learning_rate": 9.309576837416481e-06, "loss": 1.3166, "step": 4291 }, { "epoch": 0.9537777777777777, "grad_norm": 1.22000253200531, "learning_rate": 9.265033407572383e-06, "loss": 1.4362, "step": 4292 }, { "epoch": 0.954, "grad_norm": 1.0577735900878906, "learning_rate": 9.220489977728285e-06, "loss": 1.2362, "step": 4293 }, { "epoch": 0.9542222222222222, "grad_norm": 1.261118769645691, "learning_rate": 9.175946547884187e-06, "loss": 1.5433, "step": 4294 }, { "epoch": 0.9544444444444444, "grad_norm": 1.0835603475570679, "learning_rate": 9.13140311804009e-06, "loss": 1.0706, "step": 4295 }, { "epoch": 0.9546666666666667, "grad_norm": 0.1414426565170288, "learning_rate": 9.086859688195993e-06, "loss": 0.0324, "step": 4296 }, { "epoch": 0.9548888888888889, "grad_norm": 0.15208975970745087, "learning_rate": 9.042316258351893e-06, "loss": 0.0327, "step": 4297 }, { "epoch": 0.9551111111111111, "grad_norm": 0.942937970161438, "learning_rate": 8.997772828507795e-06, "loss": 0.9045, "step": 4298 }, { "epoch": 0.9553333333333334, "grad_norm": 0.9884275197982788, "learning_rate": 8.953229398663697e-06, "loss": 0.9895, "step": 4299 }, { "epoch": 0.9555555555555556, "grad_norm": 0.936667263507843, "learning_rate": 8.908685968819599e-06, "loss": 0.776, "step": 4300 }, { "epoch": 0.9557777777777777, "grad_norm": 0.6337212324142456, "learning_rate": 8.864142538975503e-06, "loss": 1.1042, "step": 4301 }, { "epoch": 0.956, "grad_norm": 0.8414755463600159, "learning_rate": 8.819599109131403e-06, "loss": 1.9703, "step": 4302 }, { "epoch": 0.9562222222222222, "grad_norm": 0.5726562738418579, "learning_rate": 8.775055679287305e-06, "loss": 0.8849, "step": 4303 }, { "epoch": 0.9564444444444444, "grad_norm": 0.6391728520393372, "learning_rate": 8.730512249443209e-06, "loss": 1.1577, "step": 4304 }, { "epoch": 0.9566666666666667, "grad_norm": 0.5593711137771606, "learning_rate": 8.685968819599109e-06, "loss": 0.8922, "step": 4305 }, { "epoch": 0.9568888888888889, "grad_norm": 0.7331346869468689, "learning_rate": 8.641425389755013e-06, "loss": 1.1386, "step": 4306 }, { "epoch": 0.9571111111111111, "grad_norm": 0.5739585757255554, "learning_rate": 8.596881959910913e-06, "loss": 0.8375, "step": 4307 }, { "epoch": 0.9573333333333334, "grad_norm": 0.07684678584337234, "learning_rate": 8.552338530066815e-06, "loss": 0.011, "step": 4308 }, { "epoch": 0.9575555555555556, "grad_norm": 0.07993515580892563, "learning_rate": 8.507795100222719e-06, "loss": 0.0111, "step": 4309 }, { "epoch": 0.9577777777777777, "grad_norm": 0.6196415424346924, "learning_rate": 8.463251670378619e-06, "loss": 1.0166, "step": 4310 }, { "epoch": 0.958, "grad_norm": 0.8932988047599792, "learning_rate": 8.41870824053452e-06, "loss": 2.3441, "step": 4311 }, { "epoch": 0.9582222222222222, "grad_norm": 1.2204405069351196, "learning_rate": 8.374164810690424e-06, "loss": 2.0743, "step": 4312 }, { "epoch": 0.9584444444444444, "grad_norm": 0.9031944870948792, "learning_rate": 8.329621380846325e-06, "loss": 1.9911, "step": 4313 }, { "epoch": 0.9586666666666667, "grad_norm": 1.0615592002868652, "learning_rate": 8.285077951002228e-06, "loss": 2.3094, "step": 4314 }, { "epoch": 0.9588888888888889, "grad_norm": 0.931339681148529, "learning_rate": 8.240534521158129e-06, "loss": 1.8745, "step": 4315 }, { "epoch": 0.9591111111111111, "grad_norm": 1.068681240081787, "learning_rate": 8.19599109131403e-06, "loss": 2.1088, "step": 4316 }, { "epoch": 0.9593333333333334, "grad_norm": 0.9199005365371704, "learning_rate": 8.151447661469934e-06, "loss": 1.9663, "step": 4317 }, { "epoch": 0.9595555555555556, "grad_norm": 0.9643719792366028, "learning_rate": 8.106904231625835e-06, "loss": 2.2083, "step": 4318 }, { "epoch": 0.9597777777777777, "grad_norm": 1.026021122932434, "learning_rate": 8.062360801781738e-06, "loss": 1.688, "step": 4319 }, { "epoch": 0.96, "grad_norm": 0.9671936631202698, "learning_rate": 8.01781737193764e-06, "loss": 2.0219, "step": 4320 }, { "epoch": 0.9602222222222222, "grad_norm": 0.9827919006347656, "learning_rate": 7.97327394209354e-06, "loss": 1.6401, "step": 4321 }, { "epoch": 0.9604444444444444, "grad_norm": 0.07093919813632965, "learning_rate": 7.928730512249444e-06, "loss": 0.0157, "step": 4322 }, { "epoch": 0.9606666666666667, "grad_norm": 0.07090691477060318, "learning_rate": 7.884187082405346e-06, "loss": 0.0154, "step": 4323 }, { "epoch": 0.9608888888888889, "grad_norm": 0.06990091502666473, "learning_rate": 7.839643652561248e-06, "loss": 0.0153, "step": 4324 }, { "epoch": 0.9611111111111111, "grad_norm": 0.06884946674108505, "learning_rate": 7.79510022271715e-06, "loss": 0.0152, "step": 4325 }, { "epoch": 0.9613333333333334, "grad_norm": 0.8022354245185852, "learning_rate": 7.75055679287305e-06, "loss": 0.8116, "step": 4326 }, { "epoch": 0.9615555555555556, "grad_norm": 1.062786340713501, "learning_rate": 7.706013363028954e-06, "loss": 1.7179, "step": 4327 }, { "epoch": 0.9617777777777777, "grad_norm": 1.0791099071502686, "learning_rate": 7.661469933184856e-06, "loss": 1.7474, "step": 4328 }, { "epoch": 0.962, "grad_norm": 1.0296615362167358, "learning_rate": 7.616926503340757e-06, "loss": 1.6771, "step": 4329 }, { "epoch": 0.9622222222222222, "grad_norm": 1.076456904411316, "learning_rate": 7.57238307349666e-06, "loss": 1.8226, "step": 4330 }, { "epoch": 0.9624444444444444, "grad_norm": 0.9866617321968079, "learning_rate": 7.527839643652562e-06, "loss": 1.887, "step": 4331 }, { "epoch": 0.9626666666666667, "grad_norm": 0.06448253244161606, "learning_rate": 7.483296213808463e-06, "loss": 0.0173, "step": 4332 }, { "epoch": 0.9628888888888889, "grad_norm": 0.06699193269014359, "learning_rate": 7.438752783964366e-06, "loss": 0.0175, "step": 4333 }, { "epoch": 0.9631111111111111, "grad_norm": 0.7605263590812683, "learning_rate": 7.394209354120267e-06, "loss": 0.8152, "step": 4334 }, { "epoch": 0.9633333333333334, "grad_norm": 1.077652096748352, "learning_rate": 7.34966592427617e-06, "loss": 1.8169, "step": 4335 }, { "epoch": 0.9635555555555556, "grad_norm": 0.07532133162021637, "learning_rate": 7.305122494432072e-06, "loss": 0.0182, "step": 4336 }, { "epoch": 0.9637777777777777, "grad_norm": 0.07628657668828964, "learning_rate": 7.260579064587973e-06, "loss": 0.0177, "step": 4337 }, { "epoch": 0.964, "grad_norm": 0.07550018280744553, "learning_rate": 7.216035634743876e-06, "loss": 0.0181, "step": 4338 }, { "epoch": 0.9642222222222222, "grad_norm": 0.9071139097213745, "learning_rate": 7.1714922048997785e-06, "loss": 0.8412, "step": 4339 }, { "epoch": 0.9644444444444444, "grad_norm": 0.9595382213592529, "learning_rate": 7.12694877505568e-06, "loss": 1.4946, "step": 4340 }, { "epoch": 0.9646666666666667, "grad_norm": 1.0608892440795898, "learning_rate": 7.082405345211582e-06, "loss": 1.6542, "step": 4341 }, { "epoch": 0.9648888888888889, "grad_norm": 1.1697642803192139, "learning_rate": 7.037861915367483e-06, "loss": 1.7757, "step": 4342 }, { "epoch": 0.9651111111111111, "grad_norm": 1.0154902935028076, "learning_rate": 6.9933184855233855e-06, "loss": 1.2555, "step": 4343 }, { "epoch": 0.9653333333333334, "grad_norm": 1.1377027034759521, "learning_rate": 6.948775055679288e-06, "loss": 1.7557, "step": 4344 }, { "epoch": 0.9655555555555555, "grad_norm": 1.1074367761611938, "learning_rate": 6.9042316258351895e-06, "loss": 1.5758, "step": 4345 }, { "epoch": 0.9657777777777777, "grad_norm": 0.6881236433982849, "learning_rate": 6.8596881959910914e-06, "loss": 0.7293, "step": 4346 }, { "epoch": 0.966, "grad_norm": 0.18515631556510925, "learning_rate": 6.815144766146994e-06, "loss": 0.0304, "step": 4347 }, { "epoch": 0.9662222222222222, "grad_norm": 1.1112456321716309, "learning_rate": 6.770601336302895e-06, "loss": 1.2363, "step": 4348 }, { "epoch": 0.9664444444444444, "grad_norm": 0.8793594837188721, "learning_rate": 6.726057906458798e-06, "loss": 0.8094, "step": 4349 }, { "epoch": 0.9666666666666667, "grad_norm": 0.9809714555740356, "learning_rate": 6.6815144766147e-06, "loss": 0.7478, "step": 4350 }, { "epoch": 0.9668888888888889, "grad_norm": 0.6749547123908997, "learning_rate": 6.636971046770601e-06, "loss": 1.0962, "step": 4351 }, { "epoch": 0.9671111111111111, "grad_norm": 0.8518489599227905, "learning_rate": 6.592427616926504e-06, "loss": 2.0981, "step": 4352 }, { "epoch": 0.9673333333333334, "grad_norm": 0.5904682874679565, "learning_rate": 6.547884187082405e-06, "loss": 0.8928, "step": 4353 }, { "epoch": 0.9675555555555555, "grad_norm": 0.6970412731170654, "learning_rate": 6.503340757238307e-06, "loss": 1.1904, "step": 4354 }, { "epoch": 0.9677777777777777, "grad_norm": 0.8849335312843323, "learning_rate": 6.45879732739421e-06, "loss": 1.9197, "step": 4355 }, { "epoch": 0.968, "grad_norm": 0.7975565791130066, "learning_rate": 6.414253897550111e-06, "loss": 2.0422, "step": 4356 }, { "epoch": 0.9682222222222222, "grad_norm": 0.898343026638031, "learning_rate": 6.369710467706014e-06, "loss": 2.0284, "step": 4357 }, { "epoch": 0.9684444444444444, "grad_norm": 0.08013878762722015, "learning_rate": 6.325167037861916e-06, "loss": 0.011, "step": 4358 }, { "epoch": 0.9686666666666667, "grad_norm": 0.0778137668967247, "learning_rate": 6.280623608017817e-06, "loss": 0.011, "step": 4359 }, { "epoch": 0.9688888888888889, "grad_norm": 0.075262151658535, "learning_rate": 6.23608017817372e-06, "loss": 0.0111, "step": 4360 }, { "epoch": 0.9691111111111111, "grad_norm": 0.8569869995117188, "learning_rate": 6.191536748329622e-06, "loss": 2.1552, "step": 4361 }, { "epoch": 0.9693333333333334, "grad_norm": 0.7958348393440247, "learning_rate": 6.146993318485524e-06, "loss": 2.1074, "step": 4362 }, { "epoch": 0.9695555555555555, "grad_norm": 0.861589789390564, "learning_rate": 6.102449888641426e-06, "loss": 1.8603, "step": 4363 }, { "epoch": 0.9697777777777777, "grad_norm": 0.870924174785614, "learning_rate": 6.057906458797328e-06, "loss": 1.6482, "step": 4364 }, { "epoch": 0.97, "grad_norm": 0.6811211109161377, "learning_rate": 6.01336302895323e-06, "loss": 0.9431, "step": 4365 }, { "epoch": 0.9702222222222222, "grad_norm": 1.2624512910842896, "learning_rate": 5.968819599109132e-06, "loss": 1.2395, "step": 4366 }, { "epoch": 0.9704444444444444, "grad_norm": 0.6082854866981506, "learning_rate": 5.924276169265034e-06, "loss": 0.9055, "step": 4367 }, { "epoch": 0.9706666666666667, "grad_norm": 0.990861713886261, "learning_rate": 5.879732739420936e-06, "loss": 1.96, "step": 4368 }, { "epoch": 0.9708888888888889, "grad_norm": 0.99041748046875, "learning_rate": 5.835189309576838e-06, "loss": 1.6842, "step": 4369 }, { "epoch": 0.9711111111111111, "grad_norm": 0.9319810271263123, "learning_rate": 5.79064587973274e-06, "loss": 1.9125, "step": 4370 }, { "epoch": 0.9713333333333334, "grad_norm": 1.01008141040802, "learning_rate": 5.746102449888642e-06, "loss": 1.9553, "step": 4371 }, { "epoch": 0.9715555555555555, "grad_norm": 0.7167505025863647, "learning_rate": 5.7015590200445435e-06, "loss": 0.972, "step": 4372 }, { "epoch": 0.9717777777777777, "grad_norm": 0.07200965285301208, "learning_rate": 5.6570155902004455e-06, "loss": 0.0151, "step": 4373 }, { "epoch": 0.972, "grad_norm": 0.6893488764762878, "learning_rate": 5.6124721603563475e-06, "loss": 0.8677, "step": 4374 }, { "epoch": 0.9722222222222222, "grad_norm": 0.12305945158004761, "learning_rate": 5.56792873051225e-06, "loss": 0.0204, "step": 4375 }, { "epoch": 0.9724444444444444, "grad_norm": 0.9432768821716309, "learning_rate": 5.523385300668152e-06, "loss": 1.7875, "step": 4376 }, { "epoch": 0.9726666666666667, "grad_norm": 1.0131165981292725, "learning_rate": 5.478841870824053e-06, "loss": 1.6501, "step": 4377 }, { "epoch": 0.9728888888888889, "grad_norm": 1.0048753023147583, "learning_rate": 5.434298440979955e-06, "loss": 1.6584, "step": 4378 }, { "epoch": 0.9731111111111111, "grad_norm": 0.9927910566329956, "learning_rate": 5.389755011135858e-06, "loss": 1.9151, "step": 4379 }, { "epoch": 0.9733333333333334, "grad_norm": 0.7287546396255493, "learning_rate": 5.34521158129176e-06, "loss": 0.7935, "step": 4380 }, { "epoch": 0.9735555555555555, "grad_norm": 0.06438437104225159, "learning_rate": 5.300668151447661e-06, "loss": 0.0174, "step": 4381 }, { "epoch": 0.9737777777777777, "grad_norm": 0.7063573002815247, "learning_rate": 5.256124721603563e-06, "loss": 0.794, "step": 4382 }, { "epoch": 0.974, "grad_norm": 0.07574823498725891, "learning_rate": 5.211581291759466e-06, "loss": 0.0178, "step": 4383 }, { "epoch": 0.9742222222222222, "grad_norm": 0.0726298987865448, "learning_rate": 5.167037861915368e-06, "loss": 0.0177, "step": 4384 }, { "epoch": 0.9744444444444444, "grad_norm": 0.7331129312515259, "learning_rate": 5.12249443207127e-06, "loss": 0.7402, "step": 4385 }, { "epoch": 0.9746666666666667, "grad_norm": 1.0017316341400146, "learning_rate": 5.077951002227171e-06, "loss": 1.6145, "step": 4386 }, { "epoch": 0.9748888888888889, "grad_norm": 0.9680055379867554, "learning_rate": 5.033407572383074e-06, "loss": 1.4337, "step": 4387 }, { "epoch": 0.9751111111111112, "grad_norm": 1.0141950845718384, "learning_rate": 4.988864142538976e-06, "loss": 1.549, "step": 4388 }, { "epoch": 0.9753333333333334, "grad_norm": 0.7489122748374939, "learning_rate": 4.944320712694878e-06, "loss": 0.6078, "step": 4389 }, { "epoch": 0.9755555555555555, "grad_norm": 0.9311794638633728, "learning_rate": 4.89977728285078e-06, "loss": 0.6076, "step": 4390 }, { "epoch": 0.9757777777777777, "grad_norm": 1.1416817903518677, "learning_rate": 4.855233853006682e-06, "loss": 1.7661, "step": 4391 }, { "epoch": 0.976, "grad_norm": 1.3029440641403198, "learning_rate": 4.810690423162584e-06, "loss": 1.5154, "step": 4392 }, { "epoch": 0.9762222222222222, "grad_norm": 1.1306506395339966, "learning_rate": 4.766146993318486e-06, "loss": 1.382, "step": 4393 }, { "epoch": 0.9764444444444444, "grad_norm": 0.9537327885627747, "learning_rate": 4.721603563474388e-06, "loss": 1.432, "step": 4394 }, { "epoch": 0.9766666666666667, "grad_norm": 0.9183233976364136, "learning_rate": 4.67706013363029e-06, "loss": 1.214, "step": 4395 }, { "epoch": 0.9768888888888889, "grad_norm": 0.9410824775695801, "learning_rate": 4.632516703786192e-06, "loss": 1.1194, "step": 4396 }, { "epoch": 0.9771111111111112, "grad_norm": 1.0940196514129639, "learning_rate": 4.587973273942094e-06, "loss": 1.1115, "step": 4397 }, { "epoch": 0.9773333333333334, "grad_norm": 1.0624735355377197, "learning_rate": 4.5434298440979965e-06, "loss": 1.0935, "step": 4398 }, { "epoch": 0.9775555555555555, "grad_norm": 1.0532846450805664, "learning_rate": 4.498886414253898e-06, "loss": 1.1252, "step": 4399 }, { "epoch": 0.9777777777777777, "grad_norm": 1.0047916173934937, "learning_rate": 4.4543429844097995e-06, "loss": 0.6994, "step": 4400 }, { "epoch": 0.978, "grad_norm": 0.7360401153564453, "learning_rate": 4.4097995545657015e-06, "loss": 1.1339, "step": 4401 }, { "epoch": 0.9782222222222222, "grad_norm": 0.8057011961936951, "learning_rate": 4.365256124721604e-06, "loss": 2.1264, "step": 4402 }, { "epoch": 0.9784444444444444, "grad_norm": 0.6433674097061157, "learning_rate": 4.320712694877506e-06, "loss": 1.1829, "step": 4403 }, { "epoch": 0.9786666666666667, "grad_norm": 0.5455031991004944, "learning_rate": 4.2761692650334074e-06, "loss": 1.2065, "step": 4404 }, { "epoch": 0.9788888888888889, "grad_norm": 0.883698582649231, "learning_rate": 4.231625835189309e-06, "loss": 2.0593, "step": 4405 }, { "epoch": 0.9791111111111112, "grad_norm": 0.6390405297279358, "learning_rate": 4.187082405345212e-06, "loss": 0.9819, "step": 4406 }, { "epoch": 0.9793333333333333, "grad_norm": 0.8261483311653137, "learning_rate": 4.142538975501114e-06, "loss": 2.308, "step": 4407 }, { "epoch": 0.9795555555555555, "grad_norm": 0.6975874900817871, "learning_rate": 4.097995545657015e-06, "loss": 1.0839, "step": 4408 }, { "epoch": 0.9797777777777777, "grad_norm": 0.6761125326156616, "learning_rate": 4.053452115812917e-06, "loss": 0.9341, "step": 4409 }, { "epoch": 0.98, "grad_norm": 0.7890470027923584, "learning_rate": 4.00890868596882e-06, "loss": 1.901, "step": 4410 }, { "epoch": 0.9802222222222222, "grad_norm": 0.9101024866104126, "learning_rate": 3.964365256124722e-06, "loss": 1.9368, "step": 4411 }, { "epoch": 0.9804444444444445, "grad_norm": 0.9786936640739441, "learning_rate": 3.919821826280624e-06, "loss": 2.1794, "step": 4412 }, { "epoch": 0.9806666666666667, "grad_norm": 0.7119241952896118, "learning_rate": 3.875278396436525e-06, "loss": 1.0641, "step": 4413 }, { "epoch": 0.9808888888888889, "grad_norm": 0.09762410819530487, "learning_rate": 3.830734966592428e-06, "loss": 0.0164, "step": 4414 }, { "epoch": 0.9811111111111112, "grad_norm": 0.7070305943489075, "learning_rate": 3.78619153674833e-06, "loss": 0.9271, "step": 4415 }, { "epoch": 0.9813333333333333, "grad_norm": 0.9111929535865784, "learning_rate": 3.7416481069042315e-06, "loss": 1.7253, "step": 4416 }, { "epoch": 0.9815555555555555, "grad_norm": 1.0224978923797607, "learning_rate": 3.6971046770601335e-06, "loss": 2.0652, "step": 4417 }, { "epoch": 0.9817777777777777, "grad_norm": 1.2484158277511597, "learning_rate": 3.652561247216036e-06, "loss": 2.263, "step": 4418 }, { "epoch": 0.982, "grad_norm": 0.6740500926971436, "learning_rate": 3.608017817371938e-06, "loss": 0.8257, "step": 4419 }, { "epoch": 0.9822222222222222, "grad_norm": 0.06990643590688705, "learning_rate": 3.56347438752784e-06, "loss": 0.0152, "step": 4420 }, { "epoch": 0.9824444444444445, "grad_norm": 0.728216826915741, "learning_rate": 3.5189309576837414e-06, "loss": 0.8901, "step": 4421 }, { "epoch": 0.9826666666666667, "grad_norm": 0.06907333433628082, "learning_rate": 3.474387527839644e-06, "loss": 0.0152, "step": 4422 }, { "epoch": 0.9828888888888889, "grad_norm": 0.792972981929779, "learning_rate": 3.4298440979955457e-06, "loss": 0.8383, "step": 4423 }, { "epoch": 0.9831111111111112, "grad_norm": 0.9240522384643555, "learning_rate": 3.3853006681514477e-06, "loss": 1.9004, "step": 4424 }, { "epoch": 0.9833333333333333, "grad_norm": 0.9684634208679199, "learning_rate": 3.34075723830735e-06, "loss": 1.7412, "step": 4425 }, { "epoch": 0.9835555555555555, "grad_norm": 1.075197696685791, "learning_rate": 3.296213808463252e-06, "loss": 1.4785, "step": 4426 }, { "epoch": 0.9837777777777778, "grad_norm": 0.9526484608650208, "learning_rate": 3.2516703786191536e-06, "loss": 1.6998, "step": 4427 }, { "epoch": 0.984, "grad_norm": 0.995002269744873, "learning_rate": 3.2071269487750556e-06, "loss": 1.5562, "step": 4428 }, { "epoch": 0.9842222222222222, "grad_norm": 1.0168581008911133, "learning_rate": 3.162583518930958e-06, "loss": 1.722, "step": 4429 }, { "epoch": 0.9844444444444445, "grad_norm": 0.06682226806879044, "learning_rate": 3.11804008908686e-06, "loss": 0.0176, "step": 4430 }, { "epoch": 0.9846666666666667, "grad_norm": 0.06983762979507446, "learning_rate": 3.073496659242762e-06, "loss": 0.0173, "step": 4431 }, { "epoch": 0.9848888888888889, "grad_norm": 0.06633459031581879, "learning_rate": 3.028953229398664e-06, "loss": 0.0174, "step": 4432 }, { "epoch": 0.9851111111111112, "grad_norm": 0.06410173326730728, "learning_rate": 2.984409799554566e-06, "loss": 0.0175, "step": 4433 }, { "epoch": 0.9853333333333333, "grad_norm": 0.8998127579689026, "learning_rate": 2.939866369710468e-06, "loss": 1.6707, "step": 4434 }, { "epoch": 0.9855555555555555, "grad_norm": 0.08308030664920807, "learning_rate": 2.89532293986637e-06, "loss": 0.0179, "step": 4435 }, { "epoch": 0.9857777777777778, "grad_norm": 0.8417572379112244, "learning_rate": 2.8507795100222718e-06, "loss": 0.966, "step": 4436 }, { "epoch": 0.986, "grad_norm": 1.395193338394165, "learning_rate": 2.8062360801781737e-06, "loss": 1.8783, "step": 4437 }, { "epoch": 0.9862222222222222, "grad_norm": 0.9416733384132385, "learning_rate": 2.761692650334076e-06, "loss": 1.5385, "step": 4438 }, { "epoch": 0.9864444444444445, "grad_norm": 1.100425362586975, "learning_rate": 2.7171492204899777e-06, "loss": 1.7733, "step": 4439 }, { "epoch": 0.9866666666666667, "grad_norm": 0.7649857401847839, "learning_rate": 2.67260579064588e-06, "loss": 0.7851, "step": 4440 }, { "epoch": 0.9868888888888889, "grad_norm": 1.1875056028366089, "learning_rate": 2.6280623608017816e-06, "loss": 1.6325, "step": 4441 }, { "epoch": 0.9871111111111112, "grad_norm": 1.1401832103729248, "learning_rate": 2.583518930957684e-06, "loss": 1.6937, "step": 4442 }, { "epoch": 0.9873333333333333, "grad_norm": 1.1035478115081787, "learning_rate": 2.5389755011135856e-06, "loss": 1.526, "step": 4443 }, { "epoch": 0.9875555555555555, "grad_norm": 0.8037136793136597, "learning_rate": 2.494432071269488e-06, "loss": 0.7856, "step": 4444 }, { "epoch": 0.9877777777777778, "grad_norm": 1.0584372282028198, "learning_rate": 2.44988864142539e-06, "loss": 1.3084, "step": 4445 }, { "epoch": 0.988, "grad_norm": 0.1836099475622177, "learning_rate": 2.405345211581292e-06, "loss": 0.0299, "step": 4446 }, { "epoch": 0.9882222222222222, "grad_norm": 1.108872413635254, "learning_rate": 2.360801781737194e-06, "loss": 1.0455, "step": 4447 }, { "epoch": 0.9884444444444445, "grad_norm": 0.6207655072212219, "learning_rate": 2.316258351893096e-06, "loss": 0.4939, "step": 4448 }, { "epoch": 0.9886666666666667, "grad_norm": 0.14554363489151, "learning_rate": 2.2717149220489982e-06, "loss": 0.0318, "step": 4449 }, { "epoch": 0.9888888888888889, "grad_norm": 1.2572603225708008, "learning_rate": 2.2271714922048998e-06, "loss": 1.1098, "step": 4450 }, { "epoch": 0.9891111111111112, "grad_norm": 0.04474545270204544, "learning_rate": 2.182628062360802e-06, "loss": 0.0103, "step": 4451 }, { "epoch": 0.9893333333333333, "grad_norm": 0.04636682942509651, "learning_rate": 2.1380846325167037e-06, "loss": 0.0101, "step": 4452 }, { "epoch": 0.9895555555555555, "grad_norm": 0.5653097033500671, "learning_rate": 2.093541202672606e-06, "loss": 1.0041, "step": 4453 }, { "epoch": 0.9897777777777778, "grad_norm": 0.4789440333843231, "learning_rate": 2.0489977728285077e-06, "loss": 0.979, "step": 4454 }, { "epoch": 0.99, "grad_norm": 0.8047142028808594, "learning_rate": 2.00445434298441e-06, "loss": 2.0886, "step": 4455 }, { "epoch": 0.9902222222222222, "grad_norm": 0.8989213109016418, "learning_rate": 1.959910913140312e-06, "loss": 2.1387, "step": 4456 }, { "epoch": 0.9904444444444445, "grad_norm": 0.06995019316673279, "learning_rate": 1.915367483296214e-06, "loss": 0.0109, "step": 4457 }, { "epoch": 0.9906666666666667, "grad_norm": 0.07215920835733414, "learning_rate": 1.8708240534521158e-06, "loss": 0.0108, "step": 4458 }, { "epoch": 0.9908888888888889, "grad_norm": 0.07202310115098953, "learning_rate": 1.826280623608018e-06, "loss": 0.0109, "step": 4459 }, { "epoch": 0.9911111111111112, "grad_norm": 0.9508035778999329, "learning_rate": 1.78173719376392e-06, "loss": 2.2415, "step": 4460 }, { "epoch": 0.9913333333333333, "grad_norm": 0.891727864742279, "learning_rate": 1.737193763919822e-06, "loss": 1.9116, "step": 4461 }, { "epoch": 0.9915555555555555, "grad_norm": 1.0234503746032715, "learning_rate": 1.6926503340757238e-06, "loss": 2.0408, "step": 4462 }, { "epoch": 0.9917777777777778, "grad_norm": 0.8998834490776062, "learning_rate": 1.648106904231626e-06, "loss": 2.0895, "step": 4463 }, { "epoch": 0.992, "grad_norm": 0.9309079051017761, "learning_rate": 1.6035634743875278e-06, "loss": 1.9546, "step": 4464 }, { "epoch": 0.9922222222222222, "grad_norm": 0.903396725654602, "learning_rate": 1.55902004454343e-06, "loss": 1.0776, "step": 4465 }, { "epoch": 0.9924444444444445, "grad_norm": 1.0036734342575073, "learning_rate": 1.514476614699332e-06, "loss": 1.7439, "step": 4466 }, { "epoch": 0.9926666666666667, "grad_norm": 0.9246737957000732, "learning_rate": 1.469933184855234e-06, "loss": 1.7637, "step": 4467 }, { "epoch": 0.9928888888888889, "grad_norm": 1.0618118047714233, "learning_rate": 1.4253897550111359e-06, "loss": 1.9589, "step": 4468 }, { "epoch": 0.9931111111111111, "grad_norm": 1.1122076511383057, "learning_rate": 1.380846325167038e-06, "loss": 1.9023, "step": 4469 }, { "epoch": 0.9933333333333333, "grad_norm": 1.027601957321167, "learning_rate": 1.33630289532294e-06, "loss": 1.9814, "step": 4470 }, { "epoch": 0.9935555555555555, "grad_norm": 0.06850501894950867, "learning_rate": 1.291759465478842e-06, "loss": 0.0152, "step": 4471 }, { "epoch": 0.9937777777777778, "grad_norm": 0.067985400557518, "learning_rate": 1.247216035634744e-06, "loss": 0.0152, "step": 4472 }, { "epoch": 0.994, "grad_norm": 1.0229130983352661, "learning_rate": 1.202672605790646e-06, "loss": 1.6158, "step": 4473 }, { "epoch": 0.9942222222222222, "grad_norm": 0.6642321944236755, "learning_rate": 1.158129175946548e-06, "loss": 0.7592, "step": 4474 }, { "epoch": 0.9944444444444445, "grad_norm": 1.025769829750061, "learning_rate": 1.1135857461024499e-06, "loss": 1.8864, "step": 4475 }, { "epoch": 0.9946666666666667, "grad_norm": 1.1777735948562622, "learning_rate": 1.0690423162583519e-06, "loss": 1.9098, "step": 4476 }, { "epoch": 0.9948888888888889, "grad_norm": 1.0232651233673096, "learning_rate": 1.0244988864142538e-06, "loss": 1.6622, "step": 4477 }, { "epoch": 0.9951111111111111, "grad_norm": 1.0267844200134277, "learning_rate": 9.79955456570156e-07, "loss": 1.8175, "step": 4478 }, { "epoch": 0.9953333333333333, "grad_norm": 0.7749679684638977, "learning_rate": 9.354120267260579e-07, "loss": 0.906, "step": 4479 }, { "epoch": 0.9955555555555555, "grad_norm": 0.06536448746919632, "learning_rate": 8.9086859688196e-07, "loss": 0.0173, "step": 4480 }, { "epoch": 0.9957777777777778, "grad_norm": 0.6798564195632935, "learning_rate": 8.463251670378619e-07, "loss": 0.7955, "step": 4481 }, { "epoch": 0.996, "grad_norm": 0.06655056774616241, "learning_rate": 8.017817371937639e-07, "loss": 0.0176, "step": 4482 }, { "epoch": 0.9962222222222222, "grad_norm": 0.7525641918182373, "learning_rate": 7.57238307349666e-07, "loss": 0.8103, "step": 4483 }, { "epoch": 0.9964444444444445, "grad_norm": 0.6724408268928528, "learning_rate": 7.126948775055679e-07, "loss": 0.7957, "step": 4484 }, { "epoch": 0.9966666666666667, "grad_norm": 0.99349445104599, "learning_rate": 6.6815144766147e-07, "loss": 1.691, "step": 4485 }, { "epoch": 0.9968888888888889, "grad_norm": 1.0608917474746704, "learning_rate": 6.23608017817372e-07, "loss": 1.6244, "step": 4486 }, { "epoch": 0.9971111111111111, "grad_norm": 0.07752467691898346, "learning_rate": 5.79064587973274e-07, "loss": 0.018, "step": 4487 }, { "epoch": 0.9973333333333333, "grad_norm": 0.7708075046539307, "learning_rate": 5.345211581291759e-07, "loss": 0.8414, "step": 4488 }, { "epoch": 0.9975555555555555, "grad_norm": 0.6976569890975952, "learning_rate": 4.89977728285078e-07, "loss": 0.7637, "step": 4489 }, { "epoch": 0.9977777777777778, "grad_norm": 1.0548564195632935, "learning_rate": 4.4543429844098e-07, "loss": 1.4826, "step": 4490 }, { "epoch": 0.998, "grad_norm": 0.6447573900222778, "learning_rate": 4.0089086859688195e-07, "loss": 0.6006, "step": 4491 }, { "epoch": 0.9982222222222222, "grad_norm": 1.2821402549743652, "learning_rate": 3.5634743875278397e-07, "loss": 1.8731, "step": 4492 }, { "epoch": 0.9984444444444445, "grad_norm": 1.1518702507019043, "learning_rate": 3.11804008908686e-07, "loss": 1.5884, "step": 4493 }, { "epoch": 0.9986666666666667, "grad_norm": 1.11997389793396, "learning_rate": 2.6726057906458796e-07, "loss": 1.4486, "step": 4494 }, { "epoch": 0.9988888888888889, "grad_norm": 1.1092532873153687, "learning_rate": 2.2271714922049e-07, "loss": 1.3308, "step": 4495 }, { "epoch": 0.9991111111111111, "grad_norm": 0.17926262319087982, "learning_rate": 1.7817371937639199e-07, "loss": 0.0297, "step": 4496 }, { "epoch": 0.9993333333333333, "grad_norm": 1.144982933998108, "learning_rate": 1.3363028953229398e-07, "loss": 1.2413, "step": 4497 }, { "epoch": 0.9995555555555555, "grad_norm": 1.1863489151000977, "learning_rate": 8.908685968819599e-08, "loss": 1.1693, "step": 4498 }, { "epoch": 0.9997777777777778, "grad_norm": 0.7812955975532532, "learning_rate": 4.4543429844097996e-08, "loss": 0.5973, "step": 4499 }, { "epoch": 1.0, "grad_norm": 1.0512616634368896, "learning_rate": 0.0, "loss": 0.8343, "step": 4500 }, { "epoch": 1.0, "eval_loss": 1.1682192087173462, "eval_runtime": 240.8872, "eval_samples_per_second": 4.151, "eval_steps_per_second": 4.151, "step": 4500 } ], "logging_steps": 1, "max_steps": 4500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.847769692985754e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }