{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.5, "eval_steps": 500, "global_step": 250000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005, "grad_norm": 55.07521057128906, "learning_rate": 9.99816e-07, "loss": 1.4625, "step": 50 }, { "epoch": 0.001, "grad_norm": 96.95206451416016, "learning_rate": 9.99616e-07, "loss": 1.4538, "step": 100 }, { "epoch": 0.0015, "grad_norm": 49.72062683105469, "learning_rate": 9.99416e-07, "loss": 1.3503, "step": 150 }, { "epoch": 0.002, "grad_norm": 62.22916793823242, "learning_rate": 9.9922e-07, "loss": 1.3127, "step": 200 }, { "epoch": 0.0025, "grad_norm": 38.854434967041016, "learning_rate": 9.99024e-07, "loss": 1.261, "step": 250 }, { "epoch": 0.003, "grad_norm": 49.72938919067383, "learning_rate": 9.98824e-07, "loss": 1.1819, "step": 300 }, { "epoch": 0.0035, "grad_norm": 68.74335479736328, "learning_rate": 9.986239999999999e-07, "loss": 1.1701, "step": 350 }, { "epoch": 0.004, "grad_norm": 51.449588775634766, "learning_rate": 9.984239999999999e-07, "loss": 1.1705, "step": 400 }, { "epoch": 0.0045, "grad_norm": 42.585655212402344, "learning_rate": 9.98224e-07, "loss": 1.0597, "step": 450 }, { "epoch": 0.005, "grad_norm": 62.51144027709961, "learning_rate": 9.98024e-07, "loss": 1.0494, "step": 500 }, { "epoch": 0.0055, "grad_norm": 40.80481719970703, "learning_rate": 9.97824e-07, "loss": 1.027, "step": 550 }, { "epoch": 0.006, "grad_norm": 63.342613220214844, "learning_rate": 9.97628e-07, "loss": 1.048, "step": 600 }, { "epoch": 0.0065, "grad_norm": 38.498329162597656, "learning_rate": 9.97428e-07, "loss": 0.9951, "step": 650 }, { "epoch": 0.007, "grad_norm": 25.70156478881836, "learning_rate": 9.972279999999999e-07, "loss": 0.9348, "step": 700 }, { "epoch": 0.0075, "grad_norm": 54.72833251953125, "learning_rate": 9.97028e-07, "loss": 0.9405, "step": 750 }, { "epoch": 0.008, "grad_norm": 51.39662170410156, "learning_rate": 9.96828e-07, "loss": 0.9214, "step": 800 }, { "epoch": 0.0085, "grad_norm": 57.055030822753906, "learning_rate": 9.966279999999998e-07, "loss": 0.9247, "step": 850 }, { "epoch": 0.009, "grad_norm": 48.74729919433594, "learning_rate": 9.96428e-07, "loss": 0.9254, "step": 900 }, { "epoch": 0.0095, "grad_norm": 69.1877212524414, "learning_rate": 9.96228e-07, "loss": 0.7783, "step": 950 }, { "epoch": 0.01, "grad_norm": 58.19287872314453, "learning_rate": 9.96028e-07, "loss": 0.9523, "step": 1000 }, { "epoch": 0.0105, "grad_norm": 90.27144622802734, "learning_rate": 9.95828e-07, "loss": 0.8884, "step": 1050 }, { "epoch": 0.011, "grad_norm": 47.847496032714844, "learning_rate": 9.95628e-07, "loss": 0.765, "step": 1100 }, { "epoch": 0.0115, "grad_norm": 60.23532485961914, "learning_rate": 9.95428e-07, "loss": 0.9025, "step": 1150 }, { "epoch": 0.012, "grad_norm": 65.98821258544922, "learning_rate": 9.95228e-07, "loss": 0.8743, "step": 1200 }, { "epoch": 0.0125, "grad_norm": 41.296512603759766, "learning_rate": 9.95028e-07, "loss": 0.7958, "step": 1250 }, { "epoch": 0.013, "grad_norm": 10.350058555603027, "learning_rate": 9.94828e-07, "loss": 0.7748, "step": 1300 }, { "epoch": 0.0135, "grad_norm": 100.7204818725586, "learning_rate": 9.94628e-07, "loss": 0.7265, "step": 1350 }, { "epoch": 0.014, "grad_norm": 33.72673797607422, "learning_rate": 9.944279999999999e-07, "loss": 0.8833, "step": 1400 }, { "epoch": 0.0145, "grad_norm": 51.3131217956543, "learning_rate": 9.942279999999999e-07, "loss": 0.6357, "step": 1450 }, { "epoch": 0.015, "grad_norm": 62.02531814575195, "learning_rate": 9.94028e-07, "loss": 0.8208, "step": 1500 }, { "epoch": 0.0155, "grad_norm": 79.17247772216797, "learning_rate": 9.93828e-07, "loss": 0.7981, "step": 1550 }, { "epoch": 0.016, "grad_norm": 45.16271209716797, "learning_rate": 9.936279999999998e-07, "loss": 0.9424, "step": 1600 }, { "epoch": 0.0165, "grad_norm": 25.51656723022461, "learning_rate": 9.93428e-07, "loss": 0.7381, "step": 1650 }, { "epoch": 0.017, "grad_norm": 67.2197265625, "learning_rate": 9.93228e-07, "loss": 0.8223, "step": 1700 }, { "epoch": 0.0175, "grad_norm": 68.64763641357422, "learning_rate": 9.93028e-07, "loss": 0.8728, "step": 1750 }, { "epoch": 0.018, "grad_norm": 73.73970031738281, "learning_rate": 9.92828e-07, "loss": 0.8765, "step": 1800 }, { "epoch": 0.0185, "grad_norm": 91.64149475097656, "learning_rate": 9.92628e-07, "loss": 0.7413, "step": 1850 }, { "epoch": 0.019, "grad_norm": 65.28728485107422, "learning_rate": 9.92428e-07, "loss": 0.8043, "step": 1900 }, { "epoch": 0.0195, "grad_norm": 67.93745422363281, "learning_rate": 9.92228e-07, "loss": 0.7773, "step": 1950 }, { "epoch": 0.02, "grad_norm": 59.84891128540039, "learning_rate": 9.92028e-07, "loss": 0.6151, "step": 2000 }, { "epoch": 0.0205, "grad_norm": 25.66044044494629, "learning_rate": 9.91828e-07, "loss": 0.8568, "step": 2050 }, { "epoch": 0.021, "grad_norm": 88.2499008178711, "learning_rate": 9.916279999999999e-07, "loss": 0.8048, "step": 2100 }, { "epoch": 0.0215, "grad_norm": 80.02753448486328, "learning_rate": 9.914279999999999e-07, "loss": 0.8383, "step": 2150 }, { "epoch": 0.022, "grad_norm": 36.008663177490234, "learning_rate": 9.912279999999999e-07, "loss": 0.7106, "step": 2200 }, { "epoch": 0.0225, "grad_norm": 19.52667999267578, "learning_rate": 9.91028e-07, "loss": 0.5627, "step": 2250 }, { "epoch": 0.023, "grad_norm": 44.50703048706055, "learning_rate": 9.90828e-07, "loss": 0.8129, "step": 2300 }, { "epoch": 0.0235, "grad_norm": 31.829010009765625, "learning_rate": 9.906279999999998e-07, "loss": 0.5897, "step": 2350 }, { "epoch": 0.024, "grad_norm": 80.26560974121094, "learning_rate": 9.90428e-07, "loss": 0.7319, "step": 2400 }, { "epoch": 0.0245, "grad_norm": 46.38948440551758, "learning_rate": 9.90228e-07, "loss": 0.8328, "step": 2450 }, { "epoch": 0.025, "grad_norm": 36.038612365722656, "learning_rate": 9.900319999999999e-07, "loss": 0.8958, "step": 2500 }, { "epoch": 0.0255, "grad_norm": 36.27665328979492, "learning_rate": 9.89832e-07, "loss": 0.7135, "step": 2550 }, { "epoch": 0.026, "grad_norm": 76.19625854492188, "learning_rate": 9.896319999999998e-07, "loss": 0.9015, "step": 2600 }, { "epoch": 0.0265, "grad_norm": 54.22828674316406, "learning_rate": 9.894319999999998e-07, "loss": 0.7887, "step": 2650 }, { "epoch": 0.027, "grad_norm": 77.00263214111328, "learning_rate": 9.89232e-07, "loss": 0.6704, "step": 2700 }, { "epoch": 0.0275, "grad_norm": 65.53499603271484, "learning_rate": 9.89032e-07, "loss": 0.6049, "step": 2750 }, { "epoch": 0.028, "grad_norm": 97.3672103881836, "learning_rate": 9.88832e-07, "loss": 0.7043, "step": 2800 }, { "epoch": 0.0285, "grad_norm": 80.95391082763672, "learning_rate": 9.88632e-07, "loss": 0.7861, "step": 2850 }, { "epoch": 0.029, "grad_norm": 39.608619689941406, "learning_rate": 9.88432e-07, "loss": 0.7095, "step": 2900 }, { "epoch": 0.0295, "grad_norm": 71.16398620605469, "learning_rate": 9.88232e-07, "loss": 0.5963, "step": 2950 }, { "epoch": 0.03, "grad_norm": 66.53075408935547, "learning_rate": 9.88032e-07, "loss": 0.7837, "step": 3000 }, { "epoch": 0.0305, "grad_norm": 77.58058166503906, "learning_rate": 9.87832e-07, "loss": 0.627, "step": 3050 }, { "epoch": 0.031, "grad_norm": 60.44248962402344, "learning_rate": 9.87632e-07, "loss": 0.7445, "step": 3100 }, { "epoch": 0.0315, "grad_norm": 69.0057144165039, "learning_rate": 9.874319999999999e-07, "loss": 0.9611, "step": 3150 }, { "epoch": 0.032, "grad_norm": 103.23316192626953, "learning_rate": 9.872319999999999e-07, "loss": 0.6921, "step": 3200 }, { "epoch": 0.0325, "grad_norm": 63.068687438964844, "learning_rate": 9.87032e-07, "loss": 0.7081, "step": 3250 }, { "epoch": 0.033, "grad_norm": 61.475379943847656, "learning_rate": 9.86832e-07, "loss": 0.8122, "step": 3300 }, { "epoch": 0.0335, "grad_norm": 65.56523895263672, "learning_rate": 9.866319999999998e-07, "loss": 0.7693, "step": 3350 }, { "epoch": 0.034, "grad_norm": 62.29042434692383, "learning_rate": 9.86432e-07, "loss": 0.8332, "step": 3400 }, { "epoch": 0.0345, "grad_norm": 66.55669403076172, "learning_rate": 9.86232e-07, "loss": 0.7765, "step": 3450 }, { "epoch": 0.035, "grad_norm": 41.560089111328125, "learning_rate": 9.86032e-07, "loss": 0.8187, "step": 3500 }, { "epoch": 0.0355, "grad_norm": 59.37902069091797, "learning_rate": 9.85832e-07, "loss": 0.6688, "step": 3550 }, { "epoch": 0.036, "grad_norm": 24.928035736083984, "learning_rate": 9.85632e-07, "loss": 0.6727, "step": 3600 }, { "epoch": 0.0365, "grad_norm": 10.977078437805176, "learning_rate": 9.85432e-07, "loss": 0.7937, "step": 3650 }, { "epoch": 0.037, "grad_norm": 52.790130615234375, "learning_rate": 9.85232e-07, "loss": 0.9761, "step": 3700 }, { "epoch": 0.0375, "grad_norm": 10.147725105285645, "learning_rate": 9.85032e-07, "loss": 0.703, "step": 3750 }, { "epoch": 0.038, "grad_norm": 17.01890754699707, "learning_rate": 9.84832e-07, "loss": 0.7169, "step": 3800 }, { "epoch": 0.0385, "grad_norm": 9.722551345825195, "learning_rate": 9.846319999999999e-07, "loss": 0.802, "step": 3850 }, { "epoch": 0.039, "grad_norm": 39.80437088012695, "learning_rate": 9.844319999999999e-07, "loss": 0.7751, "step": 3900 }, { "epoch": 0.0395, "grad_norm": 43.26958465576172, "learning_rate": 9.842319999999999e-07, "loss": 0.7675, "step": 3950 }, { "epoch": 0.04, "grad_norm": 53.78465270996094, "learning_rate": 9.84032e-07, "loss": 0.683, "step": 4000 }, { "epoch": 0.0405, "grad_norm": 7.1808319091796875, "learning_rate": 9.83832e-07, "loss": 0.7682, "step": 4050 }, { "epoch": 0.041, "grad_norm": 75.9840316772461, "learning_rate": 9.836319999999998e-07, "loss": 0.6552, "step": 4100 }, { "epoch": 0.0415, "grad_norm": 22.792072296142578, "learning_rate": 9.83432e-07, "loss": 0.7268, "step": 4150 }, { "epoch": 0.042, "grad_norm": 29.64811897277832, "learning_rate": 9.83232e-07, "loss": 0.6025, "step": 4200 }, { "epoch": 0.0425, "grad_norm": 93.7188720703125, "learning_rate": 9.83032e-07, "loss": 0.7603, "step": 4250 }, { "epoch": 0.043, "grad_norm": 81.36428833007812, "learning_rate": 9.82832e-07, "loss": 0.7969, "step": 4300 }, { "epoch": 0.0435, "grad_norm": 72.37635040283203, "learning_rate": 9.82632e-07, "loss": 0.6322, "step": 4350 }, { "epoch": 0.044, "grad_norm": 58.41753387451172, "learning_rate": 9.82432e-07, "loss": 0.6965, "step": 4400 }, { "epoch": 0.0445, "grad_norm": 26.843284606933594, "learning_rate": 9.82232e-07, "loss": 0.622, "step": 4450 }, { "epoch": 0.045, "grad_norm": 80.62271881103516, "learning_rate": 9.82032e-07, "loss": 0.7182, "step": 4500 }, { "epoch": 0.0455, "grad_norm": 62.724483489990234, "learning_rate": 9.81832e-07, "loss": 0.6782, "step": 4550 }, { "epoch": 0.046, "grad_norm": 39.624332427978516, "learning_rate": 9.816319999999999e-07, "loss": 0.5628, "step": 4600 }, { "epoch": 0.0465, "grad_norm": 30.11197853088379, "learning_rate": 9.814319999999999e-07, "loss": 0.6594, "step": 4650 }, { "epoch": 0.047, "grad_norm": 47.84878158569336, "learning_rate": 9.812319999999998e-07, "loss": 0.5869, "step": 4700 }, { "epoch": 0.0475, "grad_norm": 31.320173263549805, "learning_rate": 9.81032e-07, "loss": 0.7272, "step": 4750 }, { "epoch": 0.048, "grad_norm": 57.74075698852539, "learning_rate": 9.80832e-07, "loss": 0.7878, "step": 4800 }, { "epoch": 0.0485, "grad_norm": 66.6219711303711, "learning_rate": 9.806319999999998e-07, "loss": 0.7573, "step": 4850 }, { "epoch": 0.049, "grad_norm": 28.821975708007812, "learning_rate": 9.80432e-07, "loss": 0.7238, "step": 4900 }, { "epoch": 0.0495, "grad_norm": 70.03323364257812, "learning_rate": 9.80232e-07, "loss": 0.7167, "step": 4950 }, { "epoch": 0.05, "grad_norm": 42.992374420166016, "learning_rate": 9.80032e-07, "loss": 0.6073, "step": 5000 }, { "epoch": 0.0505, "grad_norm": 80.54195404052734, "learning_rate": 9.79832e-07, "loss": 0.7736, "step": 5050 }, { "epoch": 0.051, "grad_norm": 58.23442459106445, "learning_rate": 9.79632e-07, "loss": 0.7848, "step": 5100 }, { "epoch": 0.0515, "grad_norm": 51.8128662109375, "learning_rate": 9.79432e-07, "loss": 0.7043, "step": 5150 }, { "epoch": 0.052, "grad_norm": 68.98834228515625, "learning_rate": 9.79232e-07, "loss": 0.7293, "step": 5200 }, { "epoch": 0.0525, "grad_norm": 75.1903305053711, "learning_rate": 9.79032e-07, "loss": 0.7225, "step": 5250 }, { "epoch": 0.053, "grad_norm": 46.63593292236328, "learning_rate": 9.78832e-07, "loss": 0.7507, "step": 5300 }, { "epoch": 0.0535, "grad_norm": 54.36689376831055, "learning_rate": 9.786319999999999e-07, "loss": 0.8457, "step": 5350 }, { "epoch": 0.054, "grad_norm": 20.45354461669922, "learning_rate": 9.784319999999999e-07, "loss": 0.621, "step": 5400 }, { "epoch": 0.0545, "grad_norm": 49.178565979003906, "learning_rate": 9.78232e-07, "loss": 0.8933, "step": 5450 }, { "epoch": 0.055, "grad_norm": 1.9964510202407837, "learning_rate": 9.78032e-07, "loss": 0.5691, "step": 5500 }, { "epoch": 0.0555, "grad_norm": 74.9593734741211, "learning_rate": 9.77832e-07, "loss": 0.614, "step": 5550 }, { "epoch": 0.056, "grad_norm": 62.81483459472656, "learning_rate": 9.77632e-07, "loss": 0.8412, "step": 5600 }, { "epoch": 0.0565, "grad_norm": 0.782301664352417, "learning_rate": 9.77432e-07, "loss": 0.6128, "step": 5650 }, { "epoch": 0.057, "grad_norm": 62.49407196044922, "learning_rate": 9.77232e-07, "loss": 0.6208, "step": 5700 }, { "epoch": 0.0575, "grad_norm": 39.935970306396484, "learning_rate": 9.77032e-07, "loss": 0.5399, "step": 5750 }, { "epoch": 0.058, "grad_norm": 25.412355422973633, "learning_rate": 9.76832e-07, "loss": 0.6784, "step": 5800 }, { "epoch": 0.0585, "grad_norm": 64.27447509765625, "learning_rate": 9.76632e-07, "loss": 0.7829, "step": 5850 }, { "epoch": 0.059, "grad_norm": 85.29767608642578, "learning_rate": 9.76432e-07, "loss": 0.649, "step": 5900 }, { "epoch": 0.0595, "grad_norm": 36.06076431274414, "learning_rate": 9.76232e-07, "loss": 0.7838, "step": 5950 }, { "epoch": 0.06, "grad_norm": 84.43746948242188, "learning_rate": 9.760319999999999e-07, "loss": 0.6822, "step": 6000 }, { "epoch": 0.0605, "grad_norm": 74.78901672363281, "learning_rate": 9.75832e-07, "loss": 0.6003, "step": 6050 }, { "epoch": 0.061, "grad_norm": 70.3706283569336, "learning_rate": 9.75632e-07, "loss": 0.6829, "step": 6100 }, { "epoch": 0.0615, "grad_norm": 47.27094650268555, "learning_rate": 9.754319999999998e-07, "loss": 0.6415, "step": 6150 }, { "epoch": 0.062, "grad_norm": 19.190216064453125, "learning_rate": 9.75232e-07, "loss": 0.5789, "step": 6200 }, { "epoch": 0.0625, "grad_norm": 82.82536315917969, "learning_rate": 9.75032e-07, "loss": 0.6222, "step": 6250 }, { "epoch": 0.063, "grad_norm": 72.95569610595703, "learning_rate": 9.74832e-07, "loss": 0.6288, "step": 6300 }, { "epoch": 0.0635, "grad_norm": 19.769786834716797, "learning_rate": 9.74632e-07, "loss": 0.7223, "step": 6350 }, { "epoch": 0.064, "grad_norm": 22.352313995361328, "learning_rate": 9.74432e-07, "loss": 0.6113, "step": 6400 }, { "epoch": 0.0645, "grad_norm": 42.68896484375, "learning_rate": 9.74232e-07, "loss": 0.5704, "step": 6450 }, { "epoch": 0.065, "grad_norm": 8.79783821105957, "learning_rate": 9.74032e-07, "loss": 0.7296, "step": 6500 }, { "epoch": 0.0655, "grad_norm": 19.08173942565918, "learning_rate": 9.73832e-07, "loss": 0.5812, "step": 6550 }, { "epoch": 0.066, "grad_norm": 7.165023326873779, "learning_rate": 9.73632e-07, "loss": 0.7078, "step": 6600 }, { "epoch": 0.0665, "grad_norm": 58.18810272216797, "learning_rate": 9.73432e-07, "loss": 0.6484, "step": 6650 }, { "epoch": 0.067, "grad_norm": 50.902000427246094, "learning_rate": 9.732319999999999e-07, "loss": 0.7417, "step": 6700 }, { "epoch": 0.0675, "grad_norm": 15.194981575012207, "learning_rate": 9.730319999999999e-07, "loss": 0.66, "step": 6750 }, { "epoch": 0.068, "grad_norm": 12.29551887512207, "learning_rate": 9.72832e-07, "loss": 0.6219, "step": 6800 }, { "epoch": 0.0685, "grad_norm": 33.02836608886719, "learning_rate": 9.72632e-07, "loss": 0.7864, "step": 6850 }, { "epoch": 0.069, "grad_norm": 61.996707916259766, "learning_rate": 9.724319999999998e-07, "loss": 0.7458, "step": 6900 }, { "epoch": 0.0695, "grad_norm": 50.779640197753906, "learning_rate": 9.72232e-07, "loss": 0.6969, "step": 6950 }, { "epoch": 0.07, "grad_norm": 63.49092483520508, "learning_rate": 9.72032e-07, "loss": 0.6315, "step": 7000 }, { "epoch": 0.0705, "grad_norm": 2.9683051109313965, "learning_rate": 9.71832e-07, "loss": 0.6708, "step": 7050 }, { "epoch": 0.071, "grad_norm": 63.99093246459961, "learning_rate": 9.71632e-07, "loss": 0.5949, "step": 7100 }, { "epoch": 0.0715, "grad_norm": 59.804561614990234, "learning_rate": 9.71432e-07, "loss": 0.6125, "step": 7150 }, { "epoch": 0.072, "grad_norm": 52.34803009033203, "learning_rate": 9.71232e-07, "loss": 0.6553, "step": 7200 }, { "epoch": 0.0725, "grad_norm": 48.697906494140625, "learning_rate": 9.71032e-07, "loss": 0.7065, "step": 7250 }, { "epoch": 0.073, "grad_norm": 22.473642349243164, "learning_rate": 9.70832e-07, "loss": 0.5137, "step": 7300 }, { "epoch": 0.0735, "grad_norm": 108.86209106445312, "learning_rate": 9.706320000000001e-07, "loss": 0.5885, "step": 7350 }, { "epoch": 0.074, "grad_norm": 97.51536560058594, "learning_rate": 9.704319999999999e-07, "loss": 0.725, "step": 7400 }, { "epoch": 0.0745, "grad_norm": 90.5220947265625, "learning_rate": 9.702319999999999e-07, "loss": 0.6542, "step": 7450 }, { "epoch": 0.075, "grad_norm": 79.1460189819336, "learning_rate": 9.70032e-07, "loss": 0.7872, "step": 7500 }, { "epoch": 0.0755, "grad_norm": 11.08342170715332, "learning_rate": 9.69832e-07, "loss": 0.6952, "step": 7550 }, { "epoch": 0.076, "grad_norm": 70.80834197998047, "learning_rate": 9.69632e-07, "loss": 0.7243, "step": 7600 }, { "epoch": 0.0765, "grad_norm": 51.24228286743164, "learning_rate": 9.69432e-07, "loss": 0.6433, "step": 7650 }, { "epoch": 0.077, "grad_norm": 65.85570526123047, "learning_rate": 9.69232e-07, "loss": 0.7118, "step": 7700 }, { "epoch": 0.0775, "grad_norm": 77.98046112060547, "learning_rate": 9.69032e-07, "loss": 0.6795, "step": 7750 }, { "epoch": 0.078, "grad_norm": 58.70011901855469, "learning_rate": 9.68832e-07, "loss": 0.677, "step": 7800 }, { "epoch": 0.0785, "grad_norm": 47.72407150268555, "learning_rate": 9.68632e-07, "loss": 0.8678, "step": 7850 }, { "epoch": 0.079, "grad_norm": 77.00784301757812, "learning_rate": 9.684359999999998e-07, "loss": 0.61, "step": 7900 }, { "epoch": 0.0795, "grad_norm": 82.65626525878906, "learning_rate": 9.68236e-07, "loss": 0.6014, "step": 7950 }, { "epoch": 0.08, "grad_norm": 64.11064147949219, "learning_rate": 9.68036e-07, "loss": 0.5775, "step": 8000 }, { "epoch": 0.0805, "grad_norm": 75.26988220214844, "learning_rate": 9.67836e-07, "loss": 0.7189, "step": 8050 }, { "epoch": 0.081, "grad_norm": 53.913936614990234, "learning_rate": 9.67636e-07, "loss": 0.6397, "step": 8100 }, { "epoch": 0.0815, "grad_norm": 22.316036224365234, "learning_rate": 9.67436e-07, "loss": 0.5865, "step": 8150 }, { "epoch": 0.082, "grad_norm": 25.347740173339844, "learning_rate": 9.67236e-07, "loss": 0.6304, "step": 8200 }, { "epoch": 0.0825, "grad_norm": 59.95059585571289, "learning_rate": 9.67036e-07, "loss": 0.5163, "step": 8250 }, { "epoch": 0.083, "grad_norm": 35.91787338256836, "learning_rate": 9.66836e-07, "loss": 0.6994, "step": 8300 }, { "epoch": 0.0835, "grad_norm": 10.109200477600098, "learning_rate": 9.66636e-07, "loss": 0.6286, "step": 8350 }, { "epoch": 0.084, "grad_norm": 25.906421661376953, "learning_rate": 9.66436e-07, "loss": 0.6338, "step": 8400 }, { "epoch": 0.0845, "grad_norm": 79.9737777709961, "learning_rate": 9.662359999999999e-07, "loss": 0.8442, "step": 8450 }, { "epoch": 0.085, "grad_norm": 41.02357482910156, "learning_rate": 9.660359999999999e-07, "loss": 0.7414, "step": 8500 }, { "epoch": 0.0855, "grad_norm": 44.978729248046875, "learning_rate": 9.65836e-07, "loss": 0.8366, "step": 8550 }, { "epoch": 0.086, "grad_norm": 55.472808837890625, "learning_rate": 9.65636e-07, "loss": 0.5963, "step": 8600 }, { "epoch": 0.0865, "grad_norm": 11.16412353515625, "learning_rate": 9.654359999999998e-07, "loss": 0.6936, "step": 8650 }, { "epoch": 0.087, "grad_norm": 83.75244903564453, "learning_rate": 9.65236e-07, "loss": 0.6425, "step": 8700 }, { "epoch": 0.0875, "grad_norm": 54.668190002441406, "learning_rate": 9.65036e-07, "loss": 0.6609, "step": 8750 }, { "epoch": 0.088, "grad_norm": 78.21800231933594, "learning_rate": 9.64836e-07, "loss": 0.6267, "step": 8800 }, { "epoch": 0.0885, "grad_norm": 13.47957706451416, "learning_rate": 9.64636e-07, "loss": 0.689, "step": 8850 }, { "epoch": 0.089, "grad_norm": 15.554354667663574, "learning_rate": 9.64436e-07, "loss": 0.7207, "step": 8900 }, { "epoch": 0.0895, "grad_norm": 77.30255126953125, "learning_rate": 9.64236e-07, "loss": 0.5907, "step": 8950 }, { "epoch": 0.09, "grad_norm": 50.847564697265625, "learning_rate": 9.64036e-07, "loss": 0.623, "step": 9000 }, { "epoch": 0.0905, "grad_norm": 20.90938377380371, "learning_rate": 9.63836e-07, "loss": 0.7254, "step": 9050 }, { "epoch": 0.091, "grad_norm": 77.27519989013672, "learning_rate": 9.63636e-07, "loss": 0.6997, "step": 9100 }, { "epoch": 0.0915, "grad_norm": 71.25733947753906, "learning_rate": 9.634359999999999e-07, "loss": 0.6436, "step": 9150 }, { "epoch": 0.092, "grad_norm": 47.3591423034668, "learning_rate": 9.632359999999999e-07, "loss": 0.6289, "step": 9200 }, { "epoch": 0.0925, "grad_norm": 0.31778019666671753, "learning_rate": 9.630359999999999e-07, "loss": 0.58, "step": 9250 }, { "epoch": 0.093, "grad_norm": 73.01952362060547, "learning_rate": 9.62836e-07, "loss": 0.6876, "step": 9300 }, { "epoch": 0.0935, "grad_norm": 79.24958038330078, "learning_rate": 9.62636e-07, "loss": 0.7076, "step": 9350 }, { "epoch": 0.094, "grad_norm": 33.39700698852539, "learning_rate": 9.624359999999998e-07, "loss": 0.6385, "step": 9400 }, { "epoch": 0.0945, "grad_norm": 6.0091376304626465, "learning_rate": 9.62236e-07, "loss": 0.6001, "step": 9450 }, { "epoch": 0.095, "grad_norm": 93.4347152709961, "learning_rate": 9.62036e-07, "loss": 0.5681, "step": 9500 }, { "epoch": 0.0955, "grad_norm": 124.39002227783203, "learning_rate": 9.61836e-07, "loss": 0.5942, "step": 9550 }, { "epoch": 0.096, "grad_norm": 26.654417037963867, "learning_rate": 9.61636e-07, "loss": 0.5937, "step": 9600 }, { "epoch": 0.0965, "grad_norm": 84.6335678100586, "learning_rate": 9.61436e-07, "loss": 0.6569, "step": 9650 }, { "epoch": 0.097, "grad_norm": 68.39822387695312, "learning_rate": 9.61236e-07, "loss": 0.6662, "step": 9700 }, { "epoch": 0.0975, "grad_norm": 46.38802719116211, "learning_rate": 9.61036e-07, "loss": 0.6765, "step": 9750 }, { "epoch": 0.098, "grad_norm": 12.477609634399414, "learning_rate": 9.60836e-07, "loss": 0.6491, "step": 9800 }, { "epoch": 0.0985, "grad_norm": 7.226109027862549, "learning_rate": 9.606360000000001e-07, "loss": 0.6194, "step": 9850 }, { "epoch": 0.099, "grad_norm": 47.4417724609375, "learning_rate": 9.604359999999999e-07, "loss": 0.7301, "step": 9900 }, { "epoch": 0.0995, "grad_norm": 91.45806884765625, "learning_rate": 9.602359999999999e-07, "loss": 0.6963, "step": 9950 }, { "epoch": 0.1, "grad_norm": 40.271846771240234, "learning_rate": 9.60036e-07, "loss": 0.7493, "step": 10000 }, { "epoch": 0.1005, "grad_norm": 87.0700912475586, "learning_rate": 9.59836e-07, "loss": 0.558, "step": 10050 }, { "epoch": 0.101, "grad_norm": 89.44054412841797, "learning_rate": 9.5964e-07, "loss": 0.4637, "step": 10100 }, { "epoch": 0.1015, "grad_norm": 104.4756088256836, "learning_rate": 9.5944e-07, "loss": 0.6641, "step": 10150 }, { "epoch": 0.102, "grad_norm": 77.9275131225586, "learning_rate": 9.5924e-07, "loss": 0.4791, "step": 10200 }, { "epoch": 0.1025, "grad_norm": 113.33071899414062, "learning_rate": 9.590399999999999e-07, "loss": 0.5757, "step": 10250 }, { "epoch": 0.103, "grad_norm": 74.14317321777344, "learning_rate": 9.5884e-07, "loss": 0.7051, "step": 10300 }, { "epoch": 0.1035, "grad_norm": 91.95326232910156, "learning_rate": 9.5864e-07, "loss": 0.7315, "step": 10350 }, { "epoch": 0.104, "grad_norm": 72.8747329711914, "learning_rate": 9.584399999999998e-07, "loss": 0.7391, "step": 10400 }, { "epoch": 0.1045, "grad_norm": 61.38804244995117, "learning_rate": 9.5824e-07, "loss": 0.6073, "step": 10450 }, { "epoch": 0.105, "grad_norm": 65.2311782836914, "learning_rate": 9.5804e-07, "loss": 0.792, "step": 10500 }, { "epoch": 0.1055, "grad_norm": 52.02727508544922, "learning_rate": 9.5784e-07, "loss": 0.5261, "step": 10550 }, { "epoch": 0.106, "grad_norm": 34.90068054199219, "learning_rate": 9.5764e-07, "loss": 0.5306, "step": 10600 }, { "epoch": 0.1065, "grad_norm": 58.16232681274414, "learning_rate": 9.5744e-07, "loss": 0.7673, "step": 10650 }, { "epoch": 0.107, "grad_norm": 15.396655082702637, "learning_rate": 9.5724e-07, "loss": 0.5671, "step": 10700 }, { "epoch": 0.1075, "grad_norm": 15.548702239990234, "learning_rate": 9.5704e-07, "loss": 0.7041, "step": 10750 }, { "epoch": 0.108, "grad_norm": 101.34947204589844, "learning_rate": 9.5684e-07, "loss": 0.7207, "step": 10800 }, { "epoch": 0.1085, "grad_norm": 6.5170207023620605, "learning_rate": 9.5664e-07, "loss": 0.5903, "step": 10850 }, { "epoch": 0.109, "grad_norm": 36.6441764831543, "learning_rate": 9.5644e-07, "loss": 0.6876, "step": 10900 }, { "epoch": 0.1095, "grad_norm": 63.76533508300781, "learning_rate": 9.562399999999999e-07, "loss": 0.6007, "step": 10950 }, { "epoch": 0.11, "grad_norm": 52.73303985595703, "learning_rate": 9.560399999999999e-07, "loss": 0.6017, "step": 11000 }, { "epoch": 0.1105, "grad_norm": 62.94703674316406, "learning_rate": 9.5584e-07, "loss": 0.6171, "step": 11050 }, { "epoch": 0.111, "grad_norm": 60.48774337768555, "learning_rate": 9.5564e-07, "loss": 0.5815, "step": 11100 }, { "epoch": 0.1115, "grad_norm": 84.18730163574219, "learning_rate": 9.554399999999998e-07, "loss": 0.6942, "step": 11150 }, { "epoch": 0.112, "grad_norm": 81.33743286132812, "learning_rate": 9.5524e-07, "loss": 0.5393, "step": 11200 }, { "epoch": 0.1125, "grad_norm": 58.714378356933594, "learning_rate": 9.5504e-07, "loss": 0.532, "step": 11250 }, { "epoch": 0.113, "grad_norm": 97.91901397705078, "learning_rate": 9.5484e-07, "loss": 0.6323, "step": 11300 }, { "epoch": 0.1135, "grad_norm": 5.727772235870361, "learning_rate": 9.5464e-07, "loss": 0.5041, "step": 11350 }, { "epoch": 0.114, "grad_norm": 39.63825988769531, "learning_rate": 9.5444e-07, "loss": 0.5549, "step": 11400 }, { "epoch": 0.1145, "grad_norm": 103.17711639404297, "learning_rate": 9.5424e-07, "loss": 0.5167, "step": 11450 }, { "epoch": 0.115, "grad_norm": 48.15481948852539, "learning_rate": 9.5404e-07, "loss": 0.7287, "step": 11500 }, { "epoch": 0.1155, "grad_norm": 4.032690525054932, "learning_rate": 9.5384e-07, "loss": 0.64, "step": 11550 }, { "epoch": 0.116, "grad_norm": 22.449026107788086, "learning_rate": 9.5364e-07, "loss": 0.6243, "step": 11600 }, { "epoch": 0.1165, "grad_norm": 76.77507019042969, "learning_rate": 9.534399999999999e-07, "loss": 0.6222, "step": 11650 }, { "epoch": 0.117, "grad_norm": 3.4103989601135254, "learning_rate": 9.5324e-07, "loss": 0.7213, "step": 11700 }, { "epoch": 0.1175, "grad_norm": 40.05355453491211, "learning_rate": 9.5304e-07, "loss": 0.6782, "step": 11750 }, { "epoch": 0.118, "grad_norm": 13.29817008972168, "learning_rate": 9.5284e-07, "loss": 0.7741, "step": 11800 }, { "epoch": 0.1185, "grad_norm": 55.80624771118164, "learning_rate": 9.5264e-07, "loss": 0.7785, "step": 11850 }, { "epoch": 0.119, "grad_norm": 75.11266326904297, "learning_rate": 9.524399999999999e-07, "loss": 0.7455, "step": 11900 }, { "epoch": 0.1195, "grad_norm": 28.10260009765625, "learning_rate": 9.522399999999999e-07, "loss": 0.6025, "step": 11950 }, { "epoch": 0.12, "grad_norm": 71.65565490722656, "learning_rate": 9.5204e-07, "loss": 0.7934, "step": 12000 }, { "epoch": 0.1205, "grad_norm": 54.734107971191406, "learning_rate": 9.5184e-07, "loss": 0.6244, "step": 12050 }, { "epoch": 0.121, "grad_norm": 3.3738136291503906, "learning_rate": 9.5164e-07, "loss": 0.51, "step": 12100 }, { "epoch": 0.1215, "grad_norm": 50.44538116455078, "learning_rate": 9.5144e-07, "loss": 0.5917, "step": 12150 }, { "epoch": 0.122, "grad_norm": 22.168771743774414, "learning_rate": 9.512399999999999e-07, "loss": 0.5424, "step": 12200 }, { "epoch": 0.1225, "grad_norm": 55.60480499267578, "learning_rate": 9.510399999999999e-07, "loss": 0.6332, "step": 12250 }, { "epoch": 0.123, "grad_norm": 42.9905891418457, "learning_rate": 9.5084e-07, "loss": 0.5598, "step": 12300 }, { "epoch": 0.1235, "grad_norm": 2.6760189533233643, "learning_rate": 9.5064e-07, "loss": 0.671, "step": 12350 }, { "epoch": 0.124, "grad_norm": 62.84537124633789, "learning_rate": 9.504399999999999e-07, "loss": 0.6672, "step": 12400 }, { "epoch": 0.1245, "grad_norm": 60.886653900146484, "learning_rate": 9.5024e-07, "loss": 0.7067, "step": 12450 }, { "epoch": 0.125, "grad_norm": 84.8931655883789, "learning_rate": 9.5004e-07, "loss": 0.5629, "step": 12500 }, { "epoch": 0.1255, "grad_norm": 0.7383530735969543, "learning_rate": 9.498399999999999e-07, "loss": 0.5731, "step": 12550 }, { "epoch": 0.126, "grad_norm": 146.97189331054688, "learning_rate": 9.4964e-07, "loss": 0.5422, "step": 12600 }, { "epoch": 0.1265, "grad_norm": 13.385457038879395, "learning_rate": 9.494399999999999e-07, "loss": 0.6314, "step": 12650 }, { "epoch": 0.127, "grad_norm": 9.101408004760742, "learning_rate": 9.492399999999999e-07, "loss": 0.6082, "step": 12700 }, { "epoch": 0.1275, "grad_norm": 9.814948081970215, "learning_rate": 9.4904e-07, "loss": 0.6427, "step": 12750 }, { "epoch": 0.128, "grad_norm": 1.8460052013397217, "learning_rate": 9.4884e-07, "loss": 0.6449, "step": 12800 }, { "epoch": 0.1285, "grad_norm": 2.2399051189422607, "learning_rate": 9.4864e-07, "loss": 0.6494, "step": 12850 }, { "epoch": 0.129, "grad_norm": 51.1398811340332, "learning_rate": 9.484399999999999e-07, "loss": 0.5733, "step": 12900 }, { "epoch": 0.1295, "grad_norm": 48.945919036865234, "learning_rate": 9.482399999999999e-07, "loss": 0.6088, "step": 12950 }, { "epoch": 0.13, "grad_norm": 55.05748748779297, "learning_rate": 9.480399999999999e-07, "loss": 0.5011, "step": 13000 }, { "epoch": 0.1305, "grad_norm": 13.456110954284668, "learning_rate": 9.4784e-07, "loss": 0.7739, "step": 13050 }, { "epoch": 0.131, "grad_norm": 76.10543060302734, "learning_rate": 9.4764e-07, "loss": 0.6818, "step": 13100 }, { "epoch": 0.1315, "grad_norm": 51.6418571472168, "learning_rate": 9.474439999999999e-07, "loss": 0.525, "step": 13150 }, { "epoch": 0.132, "grad_norm": 17.212923049926758, "learning_rate": 9.47244e-07, "loss": 0.5746, "step": 13200 }, { "epoch": 0.1325, "grad_norm": 53.69514465332031, "learning_rate": 9.470439999999999e-07, "loss": 0.5003, "step": 13250 }, { "epoch": 0.133, "grad_norm": 40.141632080078125, "learning_rate": 9.468439999999999e-07, "loss": 0.6027, "step": 13300 }, { "epoch": 0.1335, "grad_norm": 61.87038040161133, "learning_rate": 9.46644e-07, "loss": 0.6072, "step": 13350 }, { "epoch": 0.134, "grad_norm": 122.38882446289062, "learning_rate": 9.464439999999999e-07, "loss": 0.6451, "step": 13400 }, { "epoch": 0.1345, "grad_norm": 56.08554458618164, "learning_rate": 9.462439999999999e-07, "loss": 0.6374, "step": 13450 }, { "epoch": 0.135, "grad_norm": 81.84549713134766, "learning_rate": 9.46044e-07, "loss": 0.5401, "step": 13500 }, { "epoch": 0.1355, "grad_norm": 118.67034912109375, "learning_rate": 9.45844e-07, "loss": 0.6624, "step": 13550 }, { "epoch": 0.136, "grad_norm": 57.54740905761719, "learning_rate": 9.45644e-07, "loss": 0.5806, "step": 13600 }, { "epoch": 0.1365, "grad_norm": 105.1045913696289, "learning_rate": 9.454439999999999e-07, "loss": 0.5867, "step": 13650 }, { "epoch": 0.137, "grad_norm": 25.346324920654297, "learning_rate": 9.452439999999999e-07, "loss": 0.6926, "step": 13700 }, { "epoch": 0.1375, "grad_norm": 91.3180160522461, "learning_rate": 9.45044e-07, "loss": 0.5217, "step": 13750 }, { "epoch": 0.138, "grad_norm": 79.1546630859375, "learning_rate": 9.44844e-07, "loss": 0.6172, "step": 13800 }, { "epoch": 0.1385, "grad_norm": 92.9755630493164, "learning_rate": 9.44644e-07, "loss": 0.7169, "step": 13850 }, { "epoch": 0.139, "grad_norm": 33.35745620727539, "learning_rate": 9.44444e-07, "loss": 0.6807, "step": 13900 }, { "epoch": 0.1395, "grad_norm": 14.041089057922363, "learning_rate": 9.442439999999999e-07, "loss": 0.5662, "step": 13950 }, { "epoch": 0.14, "grad_norm": 46.25873947143555, "learning_rate": 9.440439999999999e-07, "loss": 0.6316, "step": 14000 }, { "epoch": 0.1405, "grad_norm": 59.24448013305664, "learning_rate": 9.43844e-07, "loss": 0.6294, "step": 14050 }, { "epoch": 0.141, "grad_norm": 68.7812728881836, "learning_rate": 9.43644e-07, "loss": 0.6128, "step": 14100 }, { "epoch": 0.1415, "grad_norm": 66.03839874267578, "learning_rate": 9.434439999999999e-07, "loss": 0.7409, "step": 14150 }, { "epoch": 0.142, "grad_norm": 28.09392738342285, "learning_rate": 9.43244e-07, "loss": 0.7238, "step": 14200 }, { "epoch": 0.1425, "grad_norm": 11.177745819091797, "learning_rate": 9.43044e-07, "loss": 0.5512, "step": 14250 }, { "epoch": 0.143, "grad_norm": 77.08145141601562, "learning_rate": 9.428439999999999e-07, "loss": 0.558, "step": 14300 }, { "epoch": 0.1435, "grad_norm": 32.226314544677734, "learning_rate": 9.42644e-07, "loss": 0.6093, "step": 14350 }, { "epoch": 0.144, "grad_norm": 39.99115753173828, "learning_rate": 9.424439999999999e-07, "loss": 0.511, "step": 14400 }, { "epoch": 0.1445, "grad_norm": 45.316959381103516, "learning_rate": 9.422439999999999e-07, "loss": 0.6866, "step": 14450 }, { "epoch": 0.145, "grad_norm": 9.914299011230469, "learning_rate": 9.42044e-07, "loss": 0.5704, "step": 14500 }, { "epoch": 0.1455, "grad_norm": 112.46765899658203, "learning_rate": 9.41844e-07, "loss": 0.6679, "step": 14550 }, { "epoch": 0.146, "grad_norm": 18.95020866394043, "learning_rate": 9.41644e-07, "loss": 0.4952, "step": 14600 }, { "epoch": 0.1465, "grad_norm": 74.44670867919922, "learning_rate": 9.414439999999999e-07, "loss": 0.5307, "step": 14650 }, { "epoch": 0.147, "grad_norm": 1.2418839931488037, "learning_rate": 9.412439999999999e-07, "loss": 0.5571, "step": 14700 }, { "epoch": 0.1475, "grad_norm": 49.43461608886719, "learning_rate": 9.410439999999999e-07, "loss": 0.4794, "step": 14750 }, { "epoch": 0.148, "grad_norm": 74.62863159179688, "learning_rate": 9.40844e-07, "loss": 0.5945, "step": 14800 }, { "epoch": 0.1485, "grad_norm": 81.81336212158203, "learning_rate": 9.40644e-07, "loss": 0.62, "step": 14850 }, { "epoch": 0.149, "grad_norm": 63.31863021850586, "learning_rate": 9.404439999999999e-07, "loss": 0.5821, "step": 14900 }, { "epoch": 0.1495, "grad_norm": 42.47512435913086, "learning_rate": 9.40244e-07, "loss": 0.6294, "step": 14950 }, { "epoch": 0.15, "grad_norm": 76.9931869506836, "learning_rate": 9.400439999999999e-07, "loss": 0.4261, "step": 15000 }, { "epoch": 0.1505, "grad_norm": 61.6082763671875, "learning_rate": 9.398439999999999e-07, "loss": 0.671, "step": 15050 }, { "epoch": 0.151, "grad_norm": 64.3666763305664, "learning_rate": 9.39644e-07, "loss": 0.7315, "step": 15100 }, { "epoch": 0.1515, "grad_norm": 50.603111267089844, "learning_rate": 9.394439999999999e-07, "loss": 0.6032, "step": 15150 }, { "epoch": 0.152, "grad_norm": 88.94772338867188, "learning_rate": 9.392439999999999e-07, "loss": 0.5629, "step": 15200 }, { "epoch": 0.1525, "grad_norm": 16.136049270629883, "learning_rate": 9.39048e-07, "loss": 0.547, "step": 15250 }, { "epoch": 0.153, "grad_norm": 14.290252685546875, "learning_rate": 9.38848e-07, "loss": 0.5877, "step": 15300 }, { "epoch": 0.1535, "grad_norm": 72.88977813720703, "learning_rate": 9.38648e-07, "loss": 0.6141, "step": 15350 }, { "epoch": 0.154, "grad_norm": 56.635902404785156, "learning_rate": 9.384479999999999e-07, "loss": 0.5268, "step": 15400 }, { "epoch": 0.1545, "grad_norm": 36.33918762207031, "learning_rate": 9.382479999999999e-07, "loss": 0.6514, "step": 15450 }, { "epoch": 0.155, "grad_norm": 53.50493621826172, "learning_rate": 9.380479999999999e-07, "loss": 0.573, "step": 15500 }, { "epoch": 0.1555, "grad_norm": 2.055042266845703, "learning_rate": 9.37848e-07, "loss": 0.6182, "step": 15550 }, { "epoch": 0.156, "grad_norm": 173.22987365722656, "learning_rate": 9.37648e-07, "loss": 0.6573, "step": 15600 }, { "epoch": 0.1565, "grad_norm": 22.735443115234375, "learning_rate": 9.374479999999999e-07, "loss": 0.6504, "step": 15650 }, { "epoch": 0.157, "grad_norm": 76.63526153564453, "learning_rate": 9.37248e-07, "loss": 0.6736, "step": 15700 }, { "epoch": 0.1575, "grad_norm": 16.873645782470703, "learning_rate": 9.370479999999999e-07, "loss": 0.624, "step": 15750 }, { "epoch": 0.158, "grad_norm": 6.853653430938721, "learning_rate": 9.368479999999999e-07, "loss": 0.705, "step": 15800 }, { "epoch": 0.1585, "grad_norm": 70.6560287475586, "learning_rate": 9.36648e-07, "loss": 0.603, "step": 15850 }, { "epoch": 0.159, "grad_norm": 59.094749450683594, "learning_rate": 9.364479999999999e-07, "loss": 0.7677, "step": 15900 }, { "epoch": 0.1595, "grad_norm": 7.74002742767334, "learning_rate": 9.362479999999999e-07, "loss": 0.5653, "step": 15950 }, { "epoch": 0.16, "grad_norm": 20.42781639099121, "learning_rate": 9.36048e-07, "loss": 0.5747, "step": 16000 }, { "epoch": 0.1605, "grad_norm": 56.63848876953125, "learning_rate": 9.35848e-07, "loss": 0.7402, "step": 16050 }, { "epoch": 0.161, "grad_norm": 62.384342193603516, "learning_rate": 9.35648e-07, "loss": 0.6194, "step": 16100 }, { "epoch": 0.1615, "grad_norm": 53.056766510009766, "learning_rate": 9.354479999999999e-07, "loss": 0.7444, "step": 16150 }, { "epoch": 0.162, "grad_norm": 82.36703491210938, "learning_rate": 9.352479999999999e-07, "loss": 0.5666, "step": 16200 }, { "epoch": 0.1625, "grad_norm": 18.262683868408203, "learning_rate": 9.35048e-07, "loss": 0.5184, "step": 16250 }, { "epoch": 0.163, "grad_norm": 96.85376739501953, "learning_rate": 9.34848e-07, "loss": 0.6913, "step": 16300 }, { "epoch": 0.1635, "grad_norm": 35.92914962768555, "learning_rate": 9.34648e-07, "loss": 0.5972, "step": 16350 }, { "epoch": 0.164, "grad_norm": 33.37955856323242, "learning_rate": 9.34448e-07, "loss": 0.5924, "step": 16400 }, { "epoch": 0.1645, "grad_norm": 33.09463882446289, "learning_rate": 9.342479999999999e-07, "loss": 0.5298, "step": 16450 }, { "epoch": 0.165, "grad_norm": 26.03669548034668, "learning_rate": 9.340479999999999e-07, "loss": 0.4642, "step": 16500 }, { "epoch": 0.1655, "grad_norm": 20.21666717529297, "learning_rate": 9.33848e-07, "loss": 0.5442, "step": 16550 }, { "epoch": 0.166, "grad_norm": 23.451101303100586, "learning_rate": 9.33648e-07, "loss": 0.5649, "step": 16600 }, { "epoch": 0.1665, "grad_norm": 8.985445022583008, "learning_rate": 9.33448e-07, "loss": 0.5812, "step": 16650 }, { "epoch": 0.167, "grad_norm": 94.17129516601562, "learning_rate": 9.33248e-07, "loss": 0.5932, "step": 16700 }, { "epoch": 0.1675, "grad_norm": 55.66262435913086, "learning_rate": 9.33048e-07, "loss": 0.5894, "step": 16750 }, { "epoch": 0.168, "grad_norm": 22.5528507232666, "learning_rate": 9.328479999999999e-07, "loss": 0.6531, "step": 16800 }, { "epoch": 0.1685, "grad_norm": 1.4093486070632935, "learning_rate": 9.32648e-07, "loss": 0.6753, "step": 16850 }, { "epoch": 0.169, "grad_norm": 2.8292222023010254, "learning_rate": 9.32448e-07, "loss": 0.6501, "step": 16900 }, { "epoch": 0.1695, "grad_norm": 55.30830383300781, "learning_rate": 9.322479999999999e-07, "loss": 0.4652, "step": 16950 }, { "epoch": 0.17, "grad_norm": 2.9419615268707275, "learning_rate": 9.32048e-07, "loss": 0.5817, "step": 17000 }, { "epoch": 0.1705, "grad_norm": 21.788589477539062, "learning_rate": 9.31848e-07, "loss": 0.4845, "step": 17050 }, { "epoch": 0.171, "grad_norm": 51.12996292114258, "learning_rate": 9.31648e-07, "loss": 0.6086, "step": 17100 }, { "epoch": 0.1715, "grad_norm": 29.451993942260742, "learning_rate": 9.31448e-07, "loss": 0.609, "step": 17150 }, { "epoch": 0.172, "grad_norm": 26.545246124267578, "learning_rate": 9.312479999999999e-07, "loss": 0.7538, "step": 17200 }, { "epoch": 0.1725, "grad_norm": 88.69632720947266, "learning_rate": 9.310479999999999e-07, "loss": 0.629, "step": 17250 }, { "epoch": 0.173, "grad_norm": 24.869253158569336, "learning_rate": 9.30848e-07, "loss": 0.6385, "step": 17300 }, { "epoch": 0.1735, "grad_norm": 116.06053161621094, "learning_rate": 9.30652e-07, "loss": 0.6501, "step": 17350 }, { "epoch": 0.174, "grad_norm": 89.61216735839844, "learning_rate": 9.304519999999999e-07, "loss": 0.5625, "step": 17400 }, { "epoch": 0.1745, "grad_norm": 0.683408260345459, "learning_rate": 9.30252e-07, "loss": 0.7442, "step": 17450 }, { "epoch": 0.175, "grad_norm": 43.17182540893555, "learning_rate": 9.300519999999999e-07, "loss": 0.7728, "step": 17500 }, { "epoch": 0.1755, "grad_norm": 16.35731315612793, "learning_rate": 9.298519999999999e-07, "loss": 0.5745, "step": 17550 }, { "epoch": 0.176, "grad_norm": 8.577791213989258, "learning_rate": 9.29652e-07, "loss": 0.465, "step": 17600 }, { "epoch": 0.1765, "grad_norm": 55.087440490722656, "learning_rate": 9.294519999999999e-07, "loss": 0.5468, "step": 17650 }, { "epoch": 0.177, "grad_norm": 65.42967987060547, "learning_rate": 9.292519999999999e-07, "loss": 0.597, "step": 17700 }, { "epoch": 0.1775, "grad_norm": 4.98286247253418, "learning_rate": 9.29052e-07, "loss": 0.5523, "step": 17750 }, { "epoch": 0.178, "grad_norm": 39.229949951171875, "learning_rate": 9.28852e-07, "loss": 0.6382, "step": 17800 }, { "epoch": 0.1785, "grad_norm": 33.733394622802734, "learning_rate": 9.286519999999999e-07, "loss": 0.6871, "step": 17850 }, { "epoch": 0.179, "grad_norm": 16.59604263305664, "learning_rate": 9.28452e-07, "loss": 0.6174, "step": 17900 }, { "epoch": 0.1795, "grad_norm": 85.7757797241211, "learning_rate": 9.282519999999999e-07, "loss": 0.7438, "step": 17950 }, { "epoch": 0.18, "grad_norm": 77.86770629882812, "learning_rate": 9.280519999999999e-07, "loss": 0.5513, "step": 18000 }, { "epoch": 0.1805, "grad_norm": 4.501659393310547, "learning_rate": 9.27852e-07, "loss": 0.6059, "step": 18050 }, { "epoch": 0.181, "grad_norm": 54.42782211303711, "learning_rate": 9.27652e-07, "loss": 0.4786, "step": 18100 }, { "epoch": 0.1815, "grad_norm": 45.432151794433594, "learning_rate": 9.274520000000001e-07, "loss": 0.6646, "step": 18150 }, { "epoch": 0.182, "grad_norm": 111.56059265136719, "learning_rate": 9.272519999999999e-07, "loss": 0.6535, "step": 18200 }, { "epoch": 0.1825, "grad_norm": 78.46669006347656, "learning_rate": 9.270519999999999e-07, "loss": 0.5738, "step": 18250 }, { "epoch": 0.183, "grad_norm": 2.2137699127197266, "learning_rate": 9.26852e-07, "loss": 0.55, "step": 18300 }, { "epoch": 0.1835, "grad_norm": 65.4900894165039, "learning_rate": 9.26652e-07, "loss": 0.6607, "step": 18350 }, { "epoch": 0.184, "grad_norm": 45.10904312133789, "learning_rate": 9.26452e-07, "loss": 0.6373, "step": 18400 }, { "epoch": 0.1845, "grad_norm": 61.703250885009766, "learning_rate": 9.26252e-07, "loss": 0.4666, "step": 18450 }, { "epoch": 0.185, "grad_norm": 105.42646026611328, "learning_rate": 9.26052e-07, "loss": 0.6782, "step": 18500 }, { "epoch": 0.1855, "grad_norm": 108.39800262451172, "learning_rate": 9.258519999999999e-07, "loss": 0.6013, "step": 18550 }, { "epoch": 0.186, "grad_norm": 48.339599609375, "learning_rate": 9.25652e-07, "loss": 0.5994, "step": 18600 }, { "epoch": 0.1865, "grad_norm": 61.67754364013672, "learning_rate": 9.25452e-07, "loss": 0.5721, "step": 18650 }, { "epoch": 0.187, "grad_norm": 33.096046447753906, "learning_rate": 9.252519999999999e-07, "loss": 0.6398, "step": 18700 }, { "epoch": 0.1875, "grad_norm": 33.91469192504883, "learning_rate": 9.25052e-07, "loss": 0.4305, "step": 18750 }, { "epoch": 0.188, "grad_norm": 32.4777717590332, "learning_rate": 9.24852e-07, "loss": 0.6306, "step": 18800 }, { "epoch": 0.1885, "grad_norm": 19.212507247924805, "learning_rate": 9.24652e-07, "loss": 0.6042, "step": 18850 }, { "epoch": 0.189, "grad_norm": 193.7104949951172, "learning_rate": 9.24452e-07, "loss": 0.5036, "step": 18900 }, { "epoch": 0.1895, "grad_norm": 77.8702392578125, "learning_rate": 9.242519999999999e-07, "loss": 0.6825, "step": 18950 }, { "epoch": 0.19, "grad_norm": 53.27097702026367, "learning_rate": 9.240519999999999e-07, "loss": 0.6074, "step": 19000 }, { "epoch": 0.1905, "grad_norm": 16.498302459716797, "learning_rate": 9.23852e-07, "loss": 0.497, "step": 19050 }, { "epoch": 0.191, "grad_norm": 75.21617889404297, "learning_rate": 9.23652e-07, "loss": 0.6573, "step": 19100 }, { "epoch": 0.1915, "grad_norm": 4.08469295501709, "learning_rate": 9.23452e-07, "loss": 0.5048, "step": 19150 }, { "epoch": 0.192, "grad_norm": 8.395776748657227, "learning_rate": 9.23252e-07, "loss": 0.5109, "step": 19200 }, { "epoch": 0.1925, "grad_norm": 1.7910493612289429, "learning_rate": 9.230519999999999e-07, "loss": 0.5289, "step": 19250 }, { "epoch": 0.193, "grad_norm": 67.36382293701172, "learning_rate": 9.228519999999999e-07, "loss": 0.7231, "step": 19300 }, { "epoch": 0.1935, "grad_norm": 55.8679313659668, "learning_rate": 9.22652e-07, "loss": 0.5469, "step": 19350 }, { "epoch": 0.194, "grad_norm": 108.610107421875, "learning_rate": 9.22452e-07, "loss": 0.7487, "step": 19400 }, { "epoch": 0.1945, "grad_norm": 60.358062744140625, "learning_rate": 9.222519999999999e-07, "loss": 0.5649, "step": 19450 }, { "epoch": 0.195, "grad_norm": 70.0567398071289, "learning_rate": 9.22052e-07, "loss": 0.526, "step": 19500 }, { "epoch": 0.1955, "grad_norm": 23.887163162231445, "learning_rate": 9.21856e-07, "loss": 0.4628, "step": 19550 }, { "epoch": 0.196, "grad_norm": 15.568492889404297, "learning_rate": 9.21656e-07, "loss": 0.683, "step": 19600 }, { "epoch": 0.1965, "grad_norm": 78.2328872680664, "learning_rate": 9.21456e-07, "loss": 0.6217, "step": 19650 }, { "epoch": 0.197, "grad_norm": 16.458410263061523, "learning_rate": 9.212559999999999e-07, "loss": 0.6503, "step": 19700 }, { "epoch": 0.1975, "grad_norm": 73.25173950195312, "learning_rate": 9.210559999999999e-07, "loss": 0.6609, "step": 19750 }, { "epoch": 0.198, "grad_norm": 47.52989959716797, "learning_rate": 9.20856e-07, "loss": 0.5237, "step": 19800 }, { "epoch": 0.1985, "grad_norm": 62.07141876220703, "learning_rate": 9.20656e-07, "loss": 0.5814, "step": 19850 }, { "epoch": 0.199, "grad_norm": 41.7489013671875, "learning_rate": 9.20456e-07, "loss": 0.6958, "step": 19900 }, { "epoch": 0.1995, "grad_norm": 64.66690063476562, "learning_rate": 9.20256e-07, "loss": 0.5138, "step": 19950 }, { "epoch": 0.2, "grad_norm": 11.058856010437012, "learning_rate": 9.200559999999999e-07, "loss": 0.5906, "step": 20000 }, { "epoch": 0.2005, "grad_norm": 60.96815872192383, "learning_rate": 9.198559999999999e-07, "loss": 0.5804, "step": 20050 }, { "epoch": 0.201, "grad_norm": 30.981765747070312, "learning_rate": 9.19656e-07, "loss": 0.8173, "step": 20100 }, { "epoch": 0.2015, "grad_norm": 54.25685501098633, "learning_rate": 9.19456e-07, "loss": 0.5893, "step": 20150 }, { "epoch": 0.202, "grad_norm": 105.40068054199219, "learning_rate": 9.192559999999999e-07, "loss": 0.5352, "step": 20200 }, { "epoch": 0.2025, "grad_norm": 36.076011657714844, "learning_rate": 9.19056e-07, "loss": 0.4749, "step": 20250 }, { "epoch": 0.203, "grad_norm": 51.85771179199219, "learning_rate": 9.18856e-07, "loss": 0.6509, "step": 20300 }, { "epoch": 0.2035, "grad_norm": 54.14777755737305, "learning_rate": 9.186559999999999e-07, "loss": 0.7679, "step": 20350 }, { "epoch": 0.204, "grad_norm": 90.21994018554688, "learning_rate": 9.18456e-07, "loss": 0.5251, "step": 20400 }, { "epoch": 0.2045, "grad_norm": 2.283958911895752, "learning_rate": 9.182559999999999e-07, "loss": 0.4597, "step": 20450 }, { "epoch": 0.205, "grad_norm": 51.34020233154297, "learning_rate": 9.180559999999999e-07, "loss": 0.6181, "step": 20500 }, { "epoch": 0.2055, "grad_norm": 113.1830062866211, "learning_rate": 9.17856e-07, "loss": 0.5824, "step": 20550 }, { "epoch": 0.206, "grad_norm": 95.66886901855469, "learning_rate": 9.17656e-07, "loss": 0.5657, "step": 20600 }, { "epoch": 0.2065, "grad_norm": 72.63542938232422, "learning_rate": 9.174560000000001e-07, "loss": 0.5979, "step": 20650 }, { "epoch": 0.207, "grad_norm": 62.32326889038086, "learning_rate": 9.172559999999999e-07, "loss": 0.5072, "step": 20700 }, { "epoch": 0.2075, "grad_norm": 59.557228088378906, "learning_rate": 9.170559999999999e-07, "loss": 0.4438, "step": 20750 }, { "epoch": 0.208, "grad_norm": 64.50997161865234, "learning_rate": 9.16856e-07, "loss": 0.6501, "step": 20800 }, { "epoch": 0.2085, "grad_norm": 51.724708557128906, "learning_rate": 9.16656e-07, "loss": 0.6389, "step": 20850 }, { "epoch": 0.209, "grad_norm": 44.57424545288086, "learning_rate": 9.16456e-07, "loss": 0.5444, "step": 20900 }, { "epoch": 0.2095, "grad_norm": 46.1285400390625, "learning_rate": 9.16256e-07, "loss": 0.6703, "step": 20950 }, { "epoch": 0.21, "grad_norm": 62.80519485473633, "learning_rate": 9.16056e-07, "loss": 0.5194, "step": 21000 }, { "epoch": 0.2105, "grad_norm": 97.80638885498047, "learning_rate": 9.158559999999999e-07, "loss": 0.425, "step": 21050 }, { "epoch": 0.211, "grad_norm": 79.29085540771484, "learning_rate": 9.15656e-07, "loss": 0.4748, "step": 21100 }, { "epoch": 0.2115, "grad_norm": 15.490723609924316, "learning_rate": 9.15456e-07, "loss": 0.666, "step": 21150 }, { "epoch": 0.212, "grad_norm": 1.4117364883422852, "learning_rate": 9.152559999999999e-07, "loss": 0.4462, "step": 21200 }, { "epoch": 0.2125, "grad_norm": 64.279541015625, "learning_rate": 9.15056e-07, "loss": 0.462, "step": 21250 }, { "epoch": 0.213, "grad_norm": 50.567176818847656, "learning_rate": 9.14856e-07, "loss": 0.629, "step": 21300 }, { "epoch": 0.2135, "grad_norm": 74.72473907470703, "learning_rate": 9.14656e-07, "loss": 0.7797, "step": 21350 }, { "epoch": 0.214, "grad_norm": 0.2427138090133667, "learning_rate": 9.14456e-07, "loss": 0.6424, "step": 21400 }, { "epoch": 0.2145, "grad_norm": 94.19923400878906, "learning_rate": 9.142559999999999e-07, "loss": 0.5077, "step": 21450 }, { "epoch": 0.215, "grad_norm": 5.564889430999756, "learning_rate": 9.140559999999999e-07, "loss": 0.5594, "step": 21500 }, { "epoch": 0.2155, "grad_norm": 79.15170288085938, "learning_rate": 9.1386e-07, "loss": 0.5204, "step": 21550 }, { "epoch": 0.216, "grad_norm": 34.72391128540039, "learning_rate": 9.1366e-07, "loss": 0.5304, "step": 21600 }, { "epoch": 0.2165, "grad_norm": 45.1458625793457, "learning_rate": 9.1346e-07, "loss": 0.5573, "step": 21650 }, { "epoch": 0.217, "grad_norm": 13.129668235778809, "learning_rate": 9.1326e-07, "loss": 0.5866, "step": 21700 }, { "epoch": 0.2175, "grad_norm": 94.04682159423828, "learning_rate": 9.130599999999999e-07, "loss": 0.5363, "step": 21750 }, { "epoch": 0.218, "grad_norm": 3.027968645095825, "learning_rate": 9.128599999999999e-07, "loss": 0.6121, "step": 21800 }, { "epoch": 0.2185, "grad_norm": 14.584113121032715, "learning_rate": 9.1266e-07, "loss": 0.6755, "step": 21850 }, { "epoch": 0.219, "grad_norm": 90.84115600585938, "learning_rate": 9.1246e-07, "loss": 0.8153, "step": 21900 }, { "epoch": 0.2195, "grad_norm": 54.05946350097656, "learning_rate": 9.122599999999999e-07, "loss": 0.5227, "step": 21950 }, { "epoch": 0.22, "grad_norm": 112.29281616210938, "learning_rate": 9.1206e-07, "loss": 0.5026, "step": 22000 }, { "epoch": 0.2205, "grad_norm": 39.67966842651367, "learning_rate": 9.1186e-07, "loss": 0.6301, "step": 22050 }, { "epoch": 0.221, "grad_norm": 80.91326141357422, "learning_rate": 9.116599999999999e-07, "loss": 0.5802, "step": 22100 }, { "epoch": 0.2215, "grad_norm": 21.68891143798828, "learning_rate": 9.1146e-07, "loss": 0.6474, "step": 22150 }, { "epoch": 0.222, "grad_norm": 112.01923370361328, "learning_rate": 9.112599999999999e-07, "loss": 0.556, "step": 22200 }, { "epoch": 0.2225, "grad_norm": 7.433606147766113, "learning_rate": 9.110599999999999e-07, "loss": 0.6761, "step": 22250 }, { "epoch": 0.223, "grad_norm": 88.78018188476562, "learning_rate": 9.1086e-07, "loss": 0.6357, "step": 22300 }, { "epoch": 0.2235, "grad_norm": 143.90103149414062, "learning_rate": 9.1066e-07, "loss": 0.5232, "step": 22350 }, { "epoch": 0.224, "grad_norm": 7.7126359939575195, "learning_rate": 9.1046e-07, "loss": 0.6542, "step": 22400 }, { "epoch": 0.2245, "grad_norm": 44.35465621948242, "learning_rate": 9.102599999999999e-07, "loss": 0.7115, "step": 22450 }, { "epoch": 0.225, "grad_norm": 7.780481815338135, "learning_rate": 9.100599999999999e-07, "loss": 0.6885, "step": 22500 }, { "epoch": 0.2255, "grad_norm": 120.73460388183594, "learning_rate": 9.098599999999999e-07, "loss": 0.6542, "step": 22550 }, { "epoch": 0.226, "grad_norm": 85.5213623046875, "learning_rate": 9.0966e-07, "loss": 0.6606, "step": 22600 }, { "epoch": 0.2265, "grad_norm": 73.37017822265625, "learning_rate": 9.0946e-07, "loss": 0.6267, "step": 22650 }, { "epoch": 0.227, "grad_norm": 89.31539916992188, "learning_rate": 9.092599999999999e-07, "loss": 0.7014, "step": 22700 }, { "epoch": 0.2275, "grad_norm": 27.36491584777832, "learning_rate": 9.0906e-07, "loss": 0.6383, "step": 22750 }, { "epoch": 0.228, "grad_norm": 57.39030838012695, "learning_rate": 9.0886e-07, "loss": 0.6104, "step": 22800 }, { "epoch": 0.2285, "grad_norm": 6.634609222412109, "learning_rate": 9.086599999999999e-07, "loss": 0.4772, "step": 22850 }, { "epoch": 0.229, "grad_norm": 98.5889892578125, "learning_rate": 9.0846e-07, "loss": 0.6115, "step": 22900 }, { "epoch": 0.2295, "grad_norm": 71.15877532958984, "learning_rate": 9.082599999999999e-07, "loss": 0.4788, "step": 22950 }, { "epoch": 0.23, "grad_norm": 61.014102935791016, "learning_rate": 9.080599999999999e-07, "loss": 0.5738, "step": 23000 }, { "epoch": 0.2305, "grad_norm": 18.213537216186523, "learning_rate": 9.0786e-07, "loss": 0.5771, "step": 23050 }, { "epoch": 0.231, "grad_norm": 62.87067794799805, "learning_rate": 9.0766e-07, "loss": 0.5029, "step": 23100 }, { "epoch": 0.2315, "grad_norm": 52.611976623535156, "learning_rate": 9.074600000000001e-07, "loss": 0.5436, "step": 23150 }, { "epoch": 0.232, "grad_norm": 79.97590637207031, "learning_rate": 9.072599999999999e-07, "loss": 0.5754, "step": 23200 }, { "epoch": 0.2325, "grad_norm": 100.50433349609375, "learning_rate": 9.070599999999999e-07, "loss": 0.7087, "step": 23250 }, { "epoch": 0.233, "grad_norm": 77.63613891601562, "learning_rate": 9.0686e-07, "loss": 0.4804, "step": 23300 }, { "epoch": 0.2335, "grad_norm": 152.4717254638672, "learning_rate": 9.0666e-07, "loss": 0.4687, "step": 23350 }, { "epoch": 0.234, "grad_norm": 1.093911051750183, "learning_rate": 9.0646e-07, "loss": 0.5668, "step": 23400 }, { "epoch": 0.2345, "grad_norm": 80.12809753417969, "learning_rate": 9.0626e-07, "loss": 0.6275, "step": 23450 }, { "epoch": 0.235, "grad_norm": 17.399316787719727, "learning_rate": 9.0606e-07, "loss": 0.5775, "step": 23500 }, { "epoch": 0.2355, "grad_norm": 62.769813537597656, "learning_rate": 9.058599999999999e-07, "loss": 0.5202, "step": 23550 }, { "epoch": 0.236, "grad_norm": 37.83443069458008, "learning_rate": 9.0566e-07, "loss": 0.4658, "step": 23600 }, { "epoch": 0.2365, "grad_norm": 16.844783782958984, "learning_rate": 9.0546e-07, "loss": 0.4983, "step": 23650 }, { "epoch": 0.237, "grad_norm": 56.658695220947266, "learning_rate": 9.052599999999999e-07, "loss": 0.715, "step": 23700 }, { "epoch": 0.2375, "grad_norm": 52.001991271972656, "learning_rate": 9.0506e-07, "loss": 0.481, "step": 23750 }, { "epoch": 0.238, "grad_norm": 44.0219612121582, "learning_rate": 9.0486e-07, "loss": 0.5652, "step": 23800 }, { "epoch": 0.2385, "grad_norm": 23.275821685791016, "learning_rate": 9.0466e-07, "loss": 0.5805, "step": 23850 }, { "epoch": 0.239, "grad_norm": 42.84242630004883, "learning_rate": 9.0446e-07, "loss": 0.579, "step": 23900 }, { "epoch": 0.2395, "grad_norm": 38.506683349609375, "learning_rate": 9.042599999999999e-07, "loss": 0.6933, "step": 23950 }, { "epoch": 0.24, "grad_norm": 97.8546142578125, "learning_rate": 9.040599999999999e-07, "loss": 0.5293, "step": 24000 }, { "epoch": 0.2405, "grad_norm": 34.26162338256836, "learning_rate": 9.0386e-07, "loss": 0.6291, "step": 24050 }, { "epoch": 0.241, "grad_norm": 52.09344482421875, "learning_rate": 9.0366e-07, "loss": 0.5158, "step": 24100 }, { "epoch": 0.2415, "grad_norm": 52.938385009765625, "learning_rate": 9.0346e-07, "loss": 0.4271, "step": 24150 }, { "epoch": 0.242, "grad_norm": 102.01466369628906, "learning_rate": 9.03264e-07, "loss": 0.676, "step": 24200 }, { "epoch": 0.2425, "grad_norm": 92.10142517089844, "learning_rate": 9.030639999999999e-07, "loss": 0.4368, "step": 24250 }, { "epoch": 0.243, "grad_norm": 61.187618255615234, "learning_rate": 9.028639999999999e-07, "loss": 0.593, "step": 24300 }, { "epoch": 0.2435, "grad_norm": 54.55991744995117, "learning_rate": 9.02664e-07, "loss": 0.5802, "step": 24350 }, { "epoch": 0.244, "grad_norm": 66.34545135498047, "learning_rate": 9.02464e-07, "loss": 0.5796, "step": 24400 }, { "epoch": 0.2445, "grad_norm": 57.91377639770508, "learning_rate": 9.022639999999999e-07, "loss": 0.5566, "step": 24450 }, { "epoch": 0.245, "grad_norm": 40.273277282714844, "learning_rate": 9.02064e-07, "loss": 0.5867, "step": 24500 }, { "epoch": 0.2455, "grad_norm": 83.97872161865234, "learning_rate": 9.01864e-07, "loss": 0.6223, "step": 24550 }, { "epoch": 0.246, "grad_norm": 50.599708557128906, "learning_rate": 9.016639999999999e-07, "loss": 0.498, "step": 24600 }, { "epoch": 0.2465, "grad_norm": 14.626625061035156, "learning_rate": 9.01464e-07, "loss": 0.5453, "step": 24650 }, { "epoch": 0.247, "grad_norm": 12.880644798278809, "learning_rate": 9.012639999999999e-07, "loss": 0.5644, "step": 24700 }, { "epoch": 0.2475, "grad_norm": 10.257484436035156, "learning_rate": 9.010639999999999e-07, "loss": 0.4375, "step": 24750 }, { "epoch": 0.248, "grad_norm": 96.48957061767578, "learning_rate": 9.00864e-07, "loss": 0.6164, "step": 24800 }, { "epoch": 0.2485, "grad_norm": 75.5651626586914, "learning_rate": 9.00664e-07, "loss": 0.82, "step": 24850 }, { "epoch": 0.249, "grad_norm": 29.0128116607666, "learning_rate": 9.00464e-07, "loss": 0.4682, "step": 24900 }, { "epoch": 0.2495, "grad_norm": 15.282790184020996, "learning_rate": 9.002639999999999e-07, "loss": 0.512, "step": 24950 }, { "epoch": 0.25, "grad_norm": 70.66602325439453, "learning_rate": 9.000639999999999e-07, "loss": 0.719, "step": 25000 }, { "epoch": 0.2505, "grad_norm": 133.6775665283203, "learning_rate": 8.998639999999999e-07, "loss": 0.583, "step": 25050 }, { "epoch": 0.251, "grad_norm": 53.346988677978516, "learning_rate": 8.99664e-07, "loss": 0.661, "step": 25100 }, { "epoch": 0.2515, "grad_norm": 57.71779251098633, "learning_rate": 8.99464e-07, "loss": 0.4995, "step": 25150 }, { "epoch": 0.252, "grad_norm": 11.651928901672363, "learning_rate": 8.992639999999999e-07, "loss": 0.5597, "step": 25200 }, { "epoch": 0.2525, "grad_norm": 107.9719467163086, "learning_rate": 8.99064e-07, "loss": 0.497, "step": 25250 }, { "epoch": 0.253, "grad_norm": 14.6730375289917, "learning_rate": 8.988639999999999e-07, "loss": 0.6174, "step": 25300 }, { "epoch": 0.2535, "grad_norm": 56.200225830078125, "learning_rate": 8.986639999999999e-07, "loss": 0.6374, "step": 25350 }, { "epoch": 0.254, "grad_norm": 15.914813995361328, "learning_rate": 8.98464e-07, "loss": 0.513, "step": 25400 }, { "epoch": 0.2545, "grad_norm": 3.805548906326294, "learning_rate": 8.982639999999999e-07, "loss": 0.5197, "step": 25450 }, { "epoch": 0.255, "grad_norm": 88.02377319335938, "learning_rate": 8.980639999999999e-07, "loss": 0.7373, "step": 25500 }, { "epoch": 0.2555, "grad_norm": 35.10052490234375, "learning_rate": 8.97864e-07, "loss": 0.626, "step": 25550 }, { "epoch": 0.256, "grad_norm": 40.560298919677734, "learning_rate": 8.97664e-07, "loss": 0.6376, "step": 25600 }, { "epoch": 0.2565, "grad_norm": 36.76740264892578, "learning_rate": 8.97464e-07, "loss": 0.6143, "step": 25650 }, { "epoch": 0.257, "grad_norm": 58.6038703918457, "learning_rate": 8.972639999999999e-07, "loss": 0.6131, "step": 25700 }, { "epoch": 0.2575, "grad_norm": 16.4853515625, "learning_rate": 8.970639999999999e-07, "loss": 0.5266, "step": 25750 }, { "epoch": 0.258, "grad_norm": 2.5067620277404785, "learning_rate": 8.96864e-07, "loss": 0.4183, "step": 25800 }, { "epoch": 0.2585, "grad_norm": 47.051795959472656, "learning_rate": 8.96664e-07, "loss": 0.411, "step": 25850 }, { "epoch": 0.259, "grad_norm": 80.19731140136719, "learning_rate": 8.96464e-07, "loss": 0.6141, "step": 25900 }, { "epoch": 0.2595, "grad_norm": 53.69868469238281, "learning_rate": 8.96264e-07, "loss": 0.5144, "step": 25950 }, { "epoch": 0.26, "grad_norm": 33.09700393676758, "learning_rate": 8.960639999999999e-07, "loss": 0.6407, "step": 26000 }, { "epoch": 0.2605, "grad_norm": 28.68010711669922, "learning_rate": 8.958639999999999e-07, "loss": 0.4799, "step": 26050 }, { "epoch": 0.261, "grad_norm": 31.614540100097656, "learning_rate": 8.95664e-07, "loss": 0.5902, "step": 26100 }, { "epoch": 0.2615, "grad_norm": 46.6901969909668, "learning_rate": 8.95464e-07, "loss": 0.5856, "step": 26150 }, { "epoch": 0.262, "grad_norm": 22.507549285888672, "learning_rate": 8.952639999999999e-07, "loss": 0.6191, "step": 26200 }, { "epoch": 0.2625, "grad_norm": 25.52752113342285, "learning_rate": 8.95064e-07, "loss": 0.5374, "step": 26250 }, { "epoch": 0.263, "grad_norm": 66.70597076416016, "learning_rate": 8.94864e-07, "loss": 0.5224, "step": 26300 }, { "epoch": 0.2635, "grad_norm": 62.320648193359375, "learning_rate": 8.946639999999999e-07, "loss": 0.5897, "step": 26350 }, { "epoch": 0.264, "grad_norm": 15.186070442199707, "learning_rate": 8.94464e-07, "loss": 0.6228, "step": 26400 }, { "epoch": 0.2645, "grad_norm": 111.6026382446289, "learning_rate": 8.942639999999999e-07, "loss": 0.5744, "step": 26450 }, { "epoch": 0.265, "grad_norm": 82.44178771972656, "learning_rate": 8.940639999999999e-07, "loss": 0.5254, "step": 26500 }, { "epoch": 0.2655, "grad_norm": 52.06359100341797, "learning_rate": 8.93864e-07, "loss": 0.4563, "step": 26550 }, { "epoch": 0.266, "grad_norm": 77.35059356689453, "learning_rate": 8.93664e-07, "loss": 0.5443, "step": 26600 }, { "epoch": 0.2665, "grad_norm": 40.24097442626953, "learning_rate": 8.93464e-07, "loss": 0.7312, "step": 26650 }, { "epoch": 0.267, "grad_norm": 12.489123344421387, "learning_rate": 8.932639999999999e-07, "loss": 0.5649, "step": 26700 }, { "epoch": 0.2675, "grad_norm": 87.02991485595703, "learning_rate": 8.930639999999999e-07, "loss": 0.5359, "step": 26750 }, { "epoch": 0.268, "grad_norm": 20.647361755371094, "learning_rate": 8.928639999999999e-07, "loss": 0.5762, "step": 26800 }, { "epoch": 0.2685, "grad_norm": 67.69243621826172, "learning_rate": 8.92664e-07, "loss": 0.4922, "step": 26850 }, { "epoch": 0.269, "grad_norm": 60.18714904785156, "learning_rate": 8.92464e-07, "loss": 0.5516, "step": 26900 }, { "epoch": 0.2695, "grad_norm": 9.904438972473145, "learning_rate": 8.922639999999999e-07, "loss": 0.5436, "step": 26950 }, { "epoch": 0.27, "grad_norm": 74.50110626220703, "learning_rate": 8.92064e-07, "loss": 0.6846, "step": 27000 }, { "epoch": 0.2705, "grad_norm": 64.5053939819336, "learning_rate": 8.918639999999999e-07, "loss": 0.5902, "step": 27050 }, { "epoch": 0.271, "grad_norm": 54.11682891845703, "learning_rate": 8.916679999999999e-07, "loss": 0.6153, "step": 27100 }, { "epoch": 0.2715, "grad_norm": 2.7656333446502686, "learning_rate": 8.91468e-07, "loss": 0.4774, "step": 27150 }, { "epoch": 0.272, "grad_norm": 47.780067443847656, "learning_rate": 8.912679999999999e-07, "loss": 0.6265, "step": 27200 }, { "epoch": 0.2725, "grad_norm": 79.4407958984375, "learning_rate": 8.910679999999999e-07, "loss": 0.4763, "step": 27250 }, { "epoch": 0.273, "grad_norm": 30.775165557861328, "learning_rate": 8.90868e-07, "loss": 0.4733, "step": 27300 }, { "epoch": 0.2735, "grad_norm": 43.01605987548828, "learning_rate": 8.90668e-07, "loss": 0.5996, "step": 27350 }, { "epoch": 0.274, "grad_norm": 25.7813663482666, "learning_rate": 8.90468e-07, "loss": 0.4017, "step": 27400 }, { "epoch": 0.2745, "grad_norm": 70.08576202392578, "learning_rate": 8.902679999999999e-07, "loss": 0.6638, "step": 27450 }, { "epoch": 0.275, "grad_norm": 3.8417301177978516, "learning_rate": 8.900679999999999e-07, "loss": 0.5934, "step": 27500 }, { "epoch": 0.2755, "grad_norm": 43.573211669921875, "learning_rate": 8.898679999999999e-07, "loss": 0.4682, "step": 27550 }, { "epoch": 0.276, "grad_norm": 77.05116271972656, "learning_rate": 8.89668e-07, "loss": 0.852, "step": 27600 }, { "epoch": 0.2765, "grad_norm": 51.25703430175781, "learning_rate": 8.89468e-07, "loss": 0.5171, "step": 27650 }, { "epoch": 0.277, "grad_norm": 79.16458129882812, "learning_rate": 8.892679999999999e-07, "loss": 0.4679, "step": 27700 }, { "epoch": 0.2775, "grad_norm": 52.861976623535156, "learning_rate": 8.89068e-07, "loss": 0.5379, "step": 27750 }, { "epoch": 0.278, "grad_norm": 56.5855598449707, "learning_rate": 8.888679999999999e-07, "loss": 0.5315, "step": 27800 }, { "epoch": 0.2785, "grad_norm": 38.6595458984375, "learning_rate": 8.886679999999999e-07, "loss": 0.5684, "step": 27850 }, { "epoch": 0.279, "grad_norm": 111.93008422851562, "learning_rate": 8.88468e-07, "loss": 0.5945, "step": 27900 }, { "epoch": 0.2795, "grad_norm": 58.56281661987305, "learning_rate": 8.882679999999999e-07, "loss": 0.4908, "step": 27950 }, { "epoch": 0.28, "grad_norm": 6.173264026641846, "learning_rate": 8.880679999999999e-07, "loss": 0.6168, "step": 28000 }, { "epoch": 0.2805, "grad_norm": 75.64866638183594, "learning_rate": 8.87868e-07, "loss": 0.6355, "step": 28050 }, { "epoch": 0.281, "grad_norm": 17.418989181518555, "learning_rate": 8.87668e-07, "loss": 0.4972, "step": 28100 }, { "epoch": 0.2815, "grad_norm": 105.94364929199219, "learning_rate": 8.87468e-07, "loss": 0.4307, "step": 28150 }, { "epoch": 0.282, "grad_norm": 204.995361328125, "learning_rate": 8.872679999999999e-07, "loss": 0.5594, "step": 28200 }, { "epoch": 0.2825, "grad_norm": 64.4760971069336, "learning_rate": 8.870679999999999e-07, "loss": 0.7741, "step": 28250 }, { "epoch": 0.283, "grad_norm": 88.27828216552734, "learning_rate": 8.86868e-07, "loss": 0.6538, "step": 28300 }, { "epoch": 0.2835, "grad_norm": 16.012582778930664, "learning_rate": 8.86668e-07, "loss": 0.6104, "step": 28350 }, { "epoch": 0.284, "grad_norm": 27.557018280029297, "learning_rate": 8.86468e-07, "loss": 0.501, "step": 28400 }, { "epoch": 0.2845, "grad_norm": 54.364566802978516, "learning_rate": 8.86268e-07, "loss": 0.5534, "step": 28450 }, { "epoch": 0.285, "grad_norm": 17.52170753479004, "learning_rate": 8.860679999999999e-07, "loss": 0.5886, "step": 28500 }, { "epoch": 0.2855, "grad_norm": 65.14887237548828, "learning_rate": 8.858679999999999e-07, "loss": 0.5129, "step": 28550 }, { "epoch": 0.286, "grad_norm": 4.750748157501221, "learning_rate": 8.85668e-07, "loss": 0.521, "step": 28600 }, { "epoch": 0.2865, "grad_norm": 69.85326385498047, "learning_rate": 8.85468e-07, "loss": 0.5333, "step": 28650 }, { "epoch": 0.287, "grad_norm": 54.12617874145508, "learning_rate": 8.85268e-07, "loss": 0.481, "step": 28700 }, { "epoch": 0.2875, "grad_norm": 1.6394026279449463, "learning_rate": 8.85068e-07, "loss": 0.5055, "step": 28750 }, { "epoch": 0.288, "grad_norm": 12.818920135498047, "learning_rate": 8.84868e-07, "loss": 0.6489, "step": 28800 }, { "epoch": 0.2885, "grad_norm": 29.769569396972656, "learning_rate": 8.846679999999999e-07, "loss": 0.4749, "step": 28850 }, { "epoch": 0.289, "grad_norm": 62.24850845336914, "learning_rate": 8.84468e-07, "loss": 0.5205, "step": 28900 }, { "epoch": 0.2895, "grad_norm": 6.054821491241455, "learning_rate": 8.84268e-07, "loss": 0.5548, "step": 28950 }, { "epoch": 0.29, "grad_norm": 57.04275894165039, "learning_rate": 8.840679999999999e-07, "loss": 0.435, "step": 29000 }, { "epoch": 0.2905, "grad_norm": 44.01559829711914, "learning_rate": 8.83868e-07, "loss": 0.5515, "step": 29050 }, { "epoch": 0.291, "grad_norm": 102.21464538574219, "learning_rate": 8.83668e-07, "loss": 0.5175, "step": 29100 }, { "epoch": 0.2915, "grad_norm": 16.0670108795166, "learning_rate": 8.83468e-07, "loss": 0.5667, "step": 29150 }, { "epoch": 0.292, "grad_norm": 50.337894439697266, "learning_rate": 8.83268e-07, "loss": 0.6014, "step": 29200 }, { "epoch": 0.2925, "grad_norm": 90.58765411376953, "learning_rate": 8.830679999999999e-07, "loss": 0.5361, "step": 29250 }, { "epoch": 0.293, "grad_norm": 88.60356140136719, "learning_rate": 8.828679999999999e-07, "loss": 0.4992, "step": 29300 }, { "epoch": 0.2935, "grad_norm": 17.39057731628418, "learning_rate": 8.82668e-07, "loss": 0.6699, "step": 29350 }, { "epoch": 0.294, "grad_norm": 9.938417434692383, "learning_rate": 8.82468e-07, "loss": 0.5748, "step": 29400 }, { "epoch": 0.2945, "grad_norm": 86.84083557128906, "learning_rate": 8.82268e-07, "loss": 0.5455, "step": 29450 }, { "epoch": 0.295, "grad_norm": 76.97161102294922, "learning_rate": 8.82068e-07, "loss": 0.5668, "step": 29500 }, { "epoch": 0.2955, "grad_norm": 6.305858135223389, "learning_rate": 8.818679999999999e-07, "loss": 0.4598, "step": 29550 }, { "epoch": 0.296, "grad_norm": 83.46090698242188, "learning_rate": 8.816679999999999e-07, "loss": 0.5434, "step": 29600 }, { "epoch": 0.2965, "grad_norm": 39.0470085144043, "learning_rate": 8.81468e-07, "loss": 0.5519, "step": 29650 }, { "epoch": 0.297, "grad_norm": 74.89450073242188, "learning_rate": 8.81268e-07, "loss": 0.6435, "step": 29700 }, { "epoch": 0.2975, "grad_norm": 77.68589782714844, "learning_rate": 8.810679999999999e-07, "loss": 0.5398, "step": 29750 }, { "epoch": 0.298, "grad_norm": 55.85251235961914, "learning_rate": 8.80868e-07, "loss": 0.5951, "step": 29800 }, { "epoch": 0.2985, "grad_norm": 35.89079666137695, "learning_rate": 8.80668e-07, "loss": 0.5007, "step": 29850 }, { "epoch": 0.299, "grad_norm": 81.23458099365234, "learning_rate": 8.804679999999999e-07, "loss": 0.6661, "step": 29900 }, { "epoch": 0.2995, "grad_norm": 30.005859375, "learning_rate": 8.80268e-07, "loss": 0.5404, "step": 29950 }, { "epoch": 0.3, "grad_norm": 93.9574203491211, "learning_rate": 8.800679999999999e-07, "loss": 0.5561, "step": 30000 }, { "epoch": 0.3005, "grad_norm": 70.64046478271484, "learning_rate": 8.798719999999999e-07, "loss": 0.4643, "step": 30050 }, { "epoch": 0.301, "grad_norm": 23.76947021484375, "learning_rate": 8.79672e-07, "loss": 0.5806, "step": 30100 }, { "epoch": 0.3015, "grad_norm": 54.23265838623047, "learning_rate": 8.79472e-07, "loss": 0.5467, "step": 30150 }, { "epoch": 0.302, "grad_norm": 62.528175354003906, "learning_rate": 8.792720000000001e-07, "loss": 0.6381, "step": 30200 }, { "epoch": 0.3025, "grad_norm": 24.852832794189453, "learning_rate": 8.790719999999999e-07, "loss": 0.4532, "step": 30250 }, { "epoch": 0.303, "grad_norm": 55.6757698059082, "learning_rate": 8.788719999999999e-07, "loss": 0.7508, "step": 30300 }, { "epoch": 0.3035, "grad_norm": 108.9383316040039, "learning_rate": 8.78672e-07, "loss": 0.6378, "step": 30350 }, { "epoch": 0.304, "grad_norm": 44.41104507446289, "learning_rate": 8.78472e-07, "loss": 0.6764, "step": 30400 }, { "epoch": 0.3045, "grad_norm": 34.045413970947266, "learning_rate": 8.78272e-07, "loss": 0.5944, "step": 30450 }, { "epoch": 0.305, "grad_norm": 29.27425765991211, "learning_rate": 8.78072e-07, "loss": 0.5212, "step": 30500 }, { "epoch": 0.3055, "grad_norm": 83.5364761352539, "learning_rate": 8.77872e-07, "loss": 0.3991, "step": 30550 }, { "epoch": 0.306, "grad_norm": 1.8257538080215454, "learning_rate": 8.776719999999999e-07, "loss": 0.5711, "step": 30600 }, { "epoch": 0.3065, "grad_norm": 74.53131866455078, "learning_rate": 8.77472e-07, "loss": 0.5351, "step": 30650 }, { "epoch": 0.307, "grad_norm": 49.41853713989258, "learning_rate": 8.77272e-07, "loss": 0.5694, "step": 30700 }, { "epoch": 0.3075, "grad_norm": 120.75214385986328, "learning_rate": 8.770719999999999e-07, "loss": 0.5498, "step": 30750 }, { "epoch": 0.308, "grad_norm": 46.72524642944336, "learning_rate": 8.76872e-07, "loss": 0.5695, "step": 30800 }, { "epoch": 0.3085, "grad_norm": 88.22650146484375, "learning_rate": 8.76672e-07, "loss": 0.5629, "step": 30850 }, { "epoch": 0.309, "grad_norm": 11.028036117553711, "learning_rate": 8.76472e-07, "loss": 0.4899, "step": 30900 }, { "epoch": 0.3095, "grad_norm": 18.966903686523438, "learning_rate": 8.76272e-07, "loss": 0.5321, "step": 30950 }, { "epoch": 0.31, "grad_norm": 8.517266273498535, "learning_rate": 8.760719999999999e-07, "loss": 0.5198, "step": 31000 }, { "epoch": 0.3105, "grad_norm": 2.6922929286956787, "learning_rate": 8.758719999999999e-07, "loss": 0.4756, "step": 31050 }, { "epoch": 0.311, "grad_norm": 80.45319366455078, "learning_rate": 8.75672e-07, "loss": 0.625, "step": 31100 }, { "epoch": 0.3115, "grad_norm": 50.82831573486328, "learning_rate": 8.75472e-07, "loss": 0.5731, "step": 31150 }, { "epoch": 0.312, "grad_norm": 80.17171478271484, "learning_rate": 8.75272e-07, "loss": 0.4188, "step": 31200 }, { "epoch": 0.3125, "grad_norm": 83.52489471435547, "learning_rate": 8.75072e-07, "loss": 0.6047, "step": 31250 }, { "epoch": 0.313, "grad_norm": 40.45694351196289, "learning_rate": 8.74872e-07, "loss": 0.5314, "step": 31300 }, { "epoch": 0.3135, "grad_norm": 7.2418694496154785, "learning_rate": 8.746719999999999e-07, "loss": 0.6789, "step": 31350 }, { "epoch": 0.314, "grad_norm": 105.38126373291016, "learning_rate": 8.74472e-07, "loss": 0.6344, "step": 31400 }, { "epoch": 0.3145, "grad_norm": 69.45547485351562, "learning_rate": 8.74272e-07, "loss": 0.508, "step": 31450 }, { "epoch": 0.315, "grad_norm": 7.970687389373779, "learning_rate": 8.740719999999999e-07, "loss": 0.6869, "step": 31500 }, { "epoch": 0.3155, "grad_norm": 85.00370788574219, "learning_rate": 8.73872e-07, "loss": 0.5232, "step": 31550 }, { "epoch": 0.316, "grad_norm": 63.42658996582031, "learning_rate": 8.73672e-07, "loss": 0.5965, "step": 31600 }, { "epoch": 0.3165, "grad_norm": 100.0108871459961, "learning_rate": 8.73472e-07, "loss": 0.5686, "step": 31650 }, { "epoch": 0.317, "grad_norm": 76.61862182617188, "learning_rate": 8.73272e-07, "loss": 0.6126, "step": 31700 }, { "epoch": 0.3175, "grad_norm": 23.5545711517334, "learning_rate": 8.730719999999999e-07, "loss": 0.486, "step": 31750 }, { "epoch": 0.318, "grad_norm": 121.40791320800781, "learning_rate": 8.728719999999999e-07, "loss": 0.5634, "step": 31800 }, { "epoch": 0.3185, "grad_norm": 60.57836151123047, "learning_rate": 8.72672e-07, "loss": 0.477, "step": 31850 }, { "epoch": 0.319, "grad_norm": 71.34037017822266, "learning_rate": 8.72472e-07, "loss": 0.6271, "step": 31900 }, { "epoch": 0.3195, "grad_norm": 1.094132661819458, "learning_rate": 8.72272e-07, "loss": 0.6245, "step": 31950 }, { "epoch": 0.32, "grad_norm": 112.46796417236328, "learning_rate": 8.72072e-07, "loss": 0.4673, "step": 32000 }, { "epoch": 0.3205, "grad_norm": 65.74776458740234, "learning_rate": 8.718719999999999e-07, "loss": 0.4702, "step": 32050 }, { "epoch": 0.321, "grad_norm": 133.10072326660156, "learning_rate": 8.716719999999999e-07, "loss": 0.4708, "step": 32100 }, { "epoch": 0.3215, "grad_norm": 88.02649688720703, "learning_rate": 8.71472e-07, "loss": 0.4592, "step": 32150 }, { "epoch": 0.322, "grad_norm": 97.91495513916016, "learning_rate": 8.71272e-07, "loss": 0.6828, "step": 32200 }, { "epoch": 0.3225, "grad_norm": 76.7396011352539, "learning_rate": 8.710719999999999e-07, "loss": 0.4356, "step": 32250 }, { "epoch": 0.323, "grad_norm": 48.60926055908203, "learning_rate": 8.70872e-07, "loss": 0.5898, "step": 32300 }, { "epoch": 0.3235, "grad_norm": 107.64606475830078, "learning_rate": 8.70672e-07, "loss": 0.552, "step": 32350 }, { "epoch": 0.324, "grad_norm": 77.53285217285156, "learning_rate": 8.704719999999999e-07, "loss": 0.4998, "step": 32400 }, { "epoch": 0.3245, "grad_norm": 3.892568349838257, "learning_rate": 8.70272e-07, "loss": 0.5265, "step": 32450 }, { "epoch": 0.325, "grad_norm": 87.63228607177734, "learning_rate": 8.700719999999999e-07, "loss": 0.6641, "step": 32500 }, { "epoch": 0.3255, "grad_norm": 208.2560272216797, "learning_rate": 8.698719999999999e-07, "loss": 0.5142, "step": 32550 }, { "epoch": 0.326, "grad_norm": 39.32928466796875, "learning_rate": 8.69672e-07, "loss": 0.5655, "step": 32600 }, { "epoch": 0.3265, "grad_norm": 121.70012664794922, "learning_rate": 8.69472e-07, "loss": 0.5668, "step": 32650 }, { "epoch": 0.327, "grad_norm": 3.1223995685577393, "learning_rate": 8.692720000000001e-07, "loss": 0.5199, "step": 32700 }, { "epoch": 0.3275, "grad_norm": 103.2892074584961, "learning_rate": 8.690719999999999e-07, "loss": 0.5335, "step": 32750 }, { "epoch": 0.328, "grad_norm": 9.383655548095703, "learning_rate": 8.688719999999999e-07, "loss": 0.6043, "step": 32800 }, { "epoch": 0.3285, "grad_norm": 41.5776252746582, "learning_rate": 8.68672e-07, "loss": 0.671, "step": 32850 }, { "epoch": 0.329, "grad_norm": 12.326675415039062, "learning_rate": 8.68472e-07, "loss": 0.4646, "step": 32900 }, { "epoch": 0.3295, "grad_norm": 42.7502555847168, "learning_rate": 8.68272e-07, "loss": 0.492, "step": 32950 }, { "epoch": 0.33, "grad_norm": 70.67731475830078, "learning_rate": 8.68072e-07, "loss": 0.4932, "step": 33000 }, { "epoch": 0.3305, "grad_norm": 50.19448471069336, "learning_rate": 8.67872e-07, "loss": 0.4514, "step": 33050 }, { "epoch": 0.331, "grad_norm": 73.63932037353516, "learning_rate": 8.676719999999999e-07, "loss": 0.4684, "step": 33100 }, { "epoch": 0.3315, "grad_norm": 22.24913787841797, "learning_rate": 8.67472e-07, "loss": 0.6631, "step": 33150 }, { "epoch": 0.332, "grad_norm": 110.29122161865234, "learning_rate": 8.67272e-07, "loss": 0.714, "step": 33200 }, { "epoch": 0.3325, "grad_norm": 9.803537368774414, "learning_rate": 8.670719999999999e-07, "loss": 0.5666, "step": 33250 }, { "epoch": 0.333, "grad_norm": 1.9911659955978394, "learning_rate": 8.66872e-07, "loss": 0.5231, "step": 33300 }, { "epoch": 0.3335, "grad_norm": 29.713672637939453, "learning_rate": 8.66672e-07, "loss": 0.6263, "step": 33350 }, { "epoch": 0.334, "grad_norm": 58.22745895385742, "learning_rate": 8.66472e-07, "loss": 0.4424, "step": 33400 }, { "epoch": 0.3345, "grad_norm": 71.6273422241211, "learning_rate": 8.66272e-07, "loss": 0.4962, "step": 33450 }, { "epoch": 0.335, "grad_norm": 39.40334701538086, "learning_rate": 8.660719999999999e-07, "loss": 0.5212, "step": 33500 }, { "epoch": 0.3355, "grad_norm": 100.38050842285156, "learning_rate": 8.658719999999999e-07, "loss": 0.432, "step": 33550 }, { "epoch": 0.336, "grad_norm": 57.89727783203125, "learning_rate": 8.65672e-07, "loss": 0.528, "step": 33600 }, { "epoch": 0.3365, "grad_norm": 28.244213104248047, "learning_rate": 8.65472e-07, "loss": 0.5949, "step": 33650 }, { "epoch": 0.337, "grad_norm": 57.93000030517578, "learning_rate": 8.65272e-07, "loss": 0.534, "step": 33700 }, { "epoch": 0.3375, "grad_norm": 38.53422164916992, "learning_rate": 8.65072e-07, "loss": 0.6568, "step": 33750 }, { "epoch": 0.338, "grad_norm": 77.30050659179688, "learning_rate": 8.648719999999999e-07, "loss": 0.5894, "step": 33800 }, { "epoch": 0.3385, "grad_norm": 99.1224594116211, "learning_rate": 8.646719999999999e-07, "loss": 0.4271, "step": 33850 }, { "epoch": 0.339, "grad_norm": 10.921067237854004, "learning_rate": 8.64472e-07, "loss": 0.457, "step": 33900 }, { "epoch": 0.3395, "grad_norm": 83.70796203613281, "learning_rate": 8.64272e-07, "loss": 0.5231, "step": 33950 }, { "epoch": 0.34, "grad_norm": 81.03082275390625, "learning_rate": 8.640719999999999e-07, "loss": 0.6068, "step": 34000 }, { "epoch": 0.3405, "grad_norm": 18.295440673828125, "learning_rate": 8.63872e-07, "loss": 0.5386, "step": 34050 }, { "epoch": 0.341, "grad_norm": 79.17333221435547, "learning_rate": 8.63676e-07, "loss": 0.5819, "step": 34100 }, { "epoch": 0.3415, "grad_norm": 59.78744888305664, "learning_rate": 8.634759999999999e-07, "loss": 0.6588, "step": 34150 }, { "epoch": 0.342, "grad_norm": 87.66487121582031, "learning_rate": 8.63276e-07, "loss": 0.4755, "step": 34200 }, { "epoch": 0.3425, "grad_norm": 40.66081237792969, "learning_rate": 8.630799999999999e-07, "loss": 0.4778, "step": 34250 }, { "epoch": 0.343, "grad_norm": 99.85968780517578, "learning_rate": 8.628799999999999e-07, "loss": 0.631, "step": 34300 }, { "epoch": 0.3435, "grad_norm": 116.81835174560547, "learning_rate": 8.6268e-07, "loss": 0.58, "step": 34350 }, { "epoch": 0.344, "grad_norm": 20.3864803314209, "learning_rate": 8.6248e-07, "loss": 0.5837, "step": 34400 }, { "epoch": 0.3445, "grad_norm": 22.575923919677734, "learning_rate": 8.6228e-07, "loss": 0.519, "step": 34450 }, { "epoch": 0.345, "grad_norm": 95.2472915649414, "learning_rate": 8.6208e-07, "loss": 0.5047, "step": 34500 }, { "epoch": 0.3455, "grad_norm": 24.045143127441406, "learning_rate": 8.618799999999999e-07, "loss": 0.5282, "step": 34550 }, { "epoch": 0.346, "grad_norm": 93.23561096191406, "learning_rate": 8.616799999999999e-07, "loss": 0.5889, "step": 34600 }, { "epoch": 0.3465, "grad_norm": 83.2427978515625, "learning_rate": 8.6148e-07, "loss": 0.395, "step": 34650 }, { "epoch": 0.347, "grad_norm": 75.5633544921875, "learning_rate": 8.6128e-07, "loss": 0.5339, "step": 34700 }, { "epoch": 0.3475, "grad_norm": 46.325660705566406, "learning_rate": 8.610799999999999e-07, "loss": 0.6007, "step": 34750 }, { "epoch": 0.348, "grad_norm": 94.44389343261719, "learning_rate": 8.6088e-07, "loss": 0.5271, "step": 34800 }, { "epoch": 0.3485, "grad_norm": 1.9219623804092407, "learning_rate": 8.6068e-07, "loss": 0.4917, "step": 34850 }, { "epoch": 0.349, "grad_norm": 256.146484375, "learning_rate": 8.604799999999999e-07, "loss": 0.4976, "step": 34900 }, { "epoch": 0.3495, "grad_norm": 42.80830383300781, "learning_rate": 8.6028e-07, "loss": 0.4433, "step": 34950 }, { "epoch": 0.35, "grad_norm": 38.0195426940918, "learning_rate": 8.600799999999999e-07, "loss": 0.615, "step": 35000 }, { "epoch": 0.3505, "grad_norm": 55.40459060668945, "learning_rate": 8.598799999999999e-07, "loss": 0.5231, "step": 35050 }, { "epoch": 0.351, "grad_norm": 75.99671173095703, "learning_rate": 8.5968e-07, "loss": 0.505, "step": 35100 }, { "epoch": 0.3515, "grad_norm": 30.296842575073242, "learning_rate": 8.5948e-07, "loss": 0.5513, "step": 35150 }, { "epoch": 0.352, "grad_norm": 69.02685546875, "learning_rate": 8.592800000000001e-07, "loss": 0.585, "step": 35200 }, { "epoch": 0.3525, "grad_norm": 88.14878845214844, "learning_rate": 8.590799999999999e-07, "loss": 0.6418, "step": 35250 }, { "epoch": 0.353, "grad_norm": 1.586727261543274, "learning_rate": 8.588799999999999e-07, "loss": 0.5362, "step": 35300 }, { "epoch": 0.3535, "grad_norm": 48.83518981933594, "learning_rate": 8.5868e-07, "loss": 0.4468, "step": 35350 }, { "epoch": 0.354, "grad_norm": 98.71307373046875, "learning_rate": 8.5848e-07, "loss": 0.6128, "step": 35400 }, { "epoch": 0.3545, "grad_norm": 44.5135612487793, "learning_rate": 8.5828e-07, "loss": 0.695, "step": 35450 }, { "epoch": 0.355, "grad_norm": 39.28260803222656, "learning_rate": 8.5808e-07, "loss": 0.5119, "step": 35500 }, { "epoch": 0.3555, "grad_norm": 16.02478790283203, "learning_rate": 8.5788e-07, "loss": 0.5504, "step": 35550 }, { "epoch": 0.356, "grad_norm": 73.70425415039062, "learning_rate": 8.576799999999999e-07, "loss": 0.3506, "step": 35600 }, { "epoch": 0.3565, "grad_norm": 60.15646743774414, "learning_rate": 8.5748e-07, "loss": 0.701, "step": 35650 }, { "epoch": 0.357, "grad_norm": 124.11194610595703, "learning_rate": 8.5728e-07, "loss": 0.5455, "step": 35700 }, { "epoch": 0.3575, "grad_norm": 63.276222229003906, "learning_rate": 8.570799999999999e-07, "loss": 0.5035, "step": 35750 }, { "epoch": 0.358, "grad_norm": 70.08747863769531, "learning_rate": 8.5688e-07, "loss": 0.4866, "step": 35800 }, { "epoch": 0.3585, "grad_norm": 85.71963500976562, "learning_rate": 8.5668e-07, "loss": 0.5919, "step": 35850 }, { "epoch": 0.359, "grad_norm": 0.5785985589027405, "learning_rate": 8.5648e-07, "loss": 0.4748, "step": 35900 }, { "epoch": 0.3595, "grad_norm": 54.46210861206055, "learning_rate": 8.5628e-07, "loss": 0.5546, "step": 35950 }, { "epoch": 0.36, "grad_norm": 3.6963469982147217, "learning_rate": 8.560799999999999e-07, "loss": 0.5244, "step": 36000 }, { "epoch": 0.3605, "grad_norm": 75.58008575439453, "learning_rate": 8.558799999999999e-07, "loss": 0.493, "step": 36050 }, { "epoch": 0.361, "grad_norm": 1.6295361518859863, "learning_rate": 8.5568e-07, "loss": 0.4227, "step": 36100 }, { "epoch": 0.3615, "grad_norm": 78.0384292602539, "learning_rate": 8.5548e-07, "loss": 0.5416, "step": 36150 }, { "epoch": 0.362, "grad_norm": 14.516754150390625, "learning_rate": 8.5528e-07, "loss": 0.6593, "step": 36200 }, { "epoch": 0.3625, "grad_norm": 29.74073028564453, "learning_rate": 8.5508e-07, "loss": 0.5811, "step": 36250 }, { "epoch": 0.363, "grad_norm": 94.58187103271484, "learning_rate": 8.548799999999999e-07, "loss": 0.5598, "step": 36300 }, { "epoch": 0.3635, "grad_norm": 71.66839599609375, "learning_rate": 8.546799999999999e-07, "loss": 0.3467, "step": 36350 }, { "epoch": 0.364, "grad_norm": 62.97394943237305, "learning_rate": 8.5448e-07, "loss": 0.668, "step": 36400 }, { "epoch": 0.3645, "grad_norm": 24.426570892333984, "learning_rate": 8.5428e-07, "loss": 0.579, "step": 36450 }, { "epoch": 0.365, "grad_norm": 48.1230354309082, "learning_rate": 8.540799999999999e-07, "loss": 0.5345, "step": 36500 }, { "epoch": 0.3655, "grad_norm": 82.27693176269531, "learning_rate": 8.5388e-07, "loss": 0.6415, "step": 36550 }, { "epoch": 0.366, "grad_norm": 29.432802200317383, "learning_rate": 8.5368e-07, "loss": 0.564, "step": 36600 }, { "epoch": 0.3665, "grad_norm": 15.768404006958008, "learning_rate": 8.534799999999999e-07, "loss": 0.4265, "step": 36650 }, { "epoch": 0.367, "grad_norm": 71.74403381347656, "learning_rate": 8.5328e-07, "loss": 0.5159, "step": 36700 }, { "epoch": 0.3675, "grad_norm": 11.544425010681152, "learning_rate": 8.530799999999999e-07, "loss": 0.4023, "step": 36750 }, { "epoch": 0.368, "grad_norm": 6.664909839630127, "learning_rate": 8.528799999999999e-07, "loss": 0.6547, "step": 36800 }, { "epoch": 0.3685, "grad_norm": 59.17173385620117, "learning_rate": 8.5268e-07, "loss": 0.4626, "step": 36850 }, { "epoch": 0.369, "grad_norm": 45.66848373413086, "learning_rate": 8.5248e-07, "loss": 0.5634, "step": 36900 }, { "epoch": 0.3695, "grad_norm": 34.434417724609375, "learning_rate": 8.5228e-07, "loss": 0.5269, "step": 36950 }, { "epoch": 0.37, "grad_norm": 30.29810905456543, "learning_rate": 8.520799999999999e-07, "loss": 0.7069, "step": 37000 }, { "epoch": 0.3705, "grad_norm": 75.58245849609375, "learning_rate": 8.518799999999999e-07, "loss": 0.5033, "step": 37050 }, { "epoch": 0.371, "grad_norm": 1.9364076852798462, "learning_rate": 8.516799999999999e-07, "loss": 0.4915, "step": 37100 }, { "epoch": 0.3715, "grad_norm": 48.733604431152344, "learning_rate": 8.5148e-07, "loss": 0.4742, "step": 37150 }, { "epoch": 0.372, "grad_norm": 98.77400970458984, "learning_rate": 8.5128e-07, "loss": 0.4848, "step": 37200 }, { "epoch": 0.3725, "grad_norm": 126.41114044189453, "learning_rate": 8.510799999999999e-07, "loss": 0.4387, "step": 37250 }, { "epoch": 0.373, "grad_norm": 93.77721405029297, "learning_rate": 8.5088e-07, "loss": 0.5441, "step": 37300 }, { "epoch": 0.3735, "grad_norm": 22.67792510986328, "learning_rate": 8.506799999999999e-07, "loss": 0.4786, "step": 37350 }, { "epoch": 0.374, "grad_norm": 27.49308967590332, "learning_rate": 8.504799999999999e-07, "loss": 0.5442, "step": 37400 }, { "epoch": 0.3745, "grad_norm": 4.33549690246582, "learning_rate": 8.5028e-07, "loss": 0.4156, "step": 37450 }, { "epoch": 0.375, "grad_norm": 29.745681762695312, "learning_rate": 8.500799999999999e-07, "loss": 0.4352, "step": 37500 }, { "epoch": 0.3755, "grad_norm": 53.99748611450195, "learning_rate": 8.498799999999999e-07, "loss": 0.5147, "step": 37550 }, { "epoch": 0.376, "grad_norm": 11.581042289733887, "learning_rate": 8.4968e-07, "loss": 0.6587, "step": 37600 }, { "epoch": 0.3765, "grad_norm": 47.363216400146484, "learning_rate": 8.4948e-07, "loss": 0.5124, "step": 37650 }, { "epoch": 0.377, "grad_norm": 92.71063232421875, "learning_rate": 8.4928e-07, "loss": 0.5673, "step": 37700 }, { "epoch": 0.3775, "grad_norm": 56.31315994262695, "learning_rate": 8.490799999999999e-07, "loss": 0.5734, "step": 37750 }, { "epoch": 0.378, "grad_norm": 66.45987701416016, "learning_rate": 8.488799999999999e-07, "loss": 0.5247, "step": 37800 }, { "epoch": 0.3785, "grad_norm": 31.38943862915039, "learning_rate": 8.4868e-07, "loss": 0.5605, "step": 37850 }, { "epoch": 0.379, "grad_norm": 51.36092758178711, "learning_rate": 8.4848e-07, "loss": 0.4554, "step": 37900 }, { "epoch": 0.3795, "grad_norm": 31.094507217407227, "learning_rate": 8.4828e-07, "loss": 0.6803, "step": 37950 }, { "epoch": 0.38, "grad_norm": 60.59067916870117, "learning_rate": 8.4808e-07, "loss": 0.6999, "step": 38000 }, { "epoch": 0.3805, "grad_norm": 11.532386779785156, "learning_rate": 8.478799999999999e-07, "loss": 0.4905, "step": 38050 }, { "epoch": 0.381, "grad_norm": 77.91910552978516, "learning_rate": 8.476799999999999e-07, "loss": 0.5046, "step": 38100 }, { "epoch": 0.3815, "grad_norm": 123.77552795410156, "learning_rate": 8.4748e-07, "loss": 0.6066, "step": 38150 }, { "epoch": 0.382, "grad_norm": 80.5956802368164, "learning_rate": 8.4728e-07, "loss": 0.5123, "step": 38200 }, { "epoch": 0.3825, "grad_norm": 18.614927291870117, "learning_rate": 8.4708e-07, "loss": 0.5425, "step": 38250 }, { "epoch": 0.383, "grad_norm": 40.55034255981445, "learning_rate": 8.4688e-07, "loss": 0.5672, "step": 38300 }, { "epoch": 0.3835, "grad_norm": 60.523948669433594, "learning_rate": 8.4668e-07, "loss": 0.5903, "step": 38350 }, { "epoch": 0.384, "grad_norm": 56.68648910522461, "learning_rate": 8.464799999999999e-07, "loss": 0.5536, "step": 38400 }, { "epoch": 0.3845, "grad_norm": 63.2244758605957, "learning_rate": 8.4628e-07, "loss": 0.5863, "step": 38450 }, { "epoch": 0.385, "grad_norm": 73.95377349853516, "learning_rate": 8.4608e-07, "loss": 0.4515, "step": 38500 }, { "epoch": 0.3855, "grad_norm": 18.579113006591797, "learning_rate": 8.458799999999999e-07, "loss": 0.6405, "step": 38550 }, { "epoch": 0.386, "grad_norm": 40.64234161376953, "learning_rate": 8.4568e-07, "loss": 0.5371, "step": 38600 }, { "epoch": 0.3865, "grad_norm": 54.03630447387695, "learning_rate": 8.4548e-07, "loss": 0.5378, "step": 38650 }, { "epoch": 0.387, "grad_norm": 6.5824666023254395, "learning_rate": 8.4528e-07, "loss": 0.4843, "step": 38700 }, { "epoch": 0.3875, "grad_norm": 8.509979248046875, "learning_rate": 8.4508e-07, "loss": 0.4975, "step": 38750 }, { "epoch": 0.388, "grad_norm": 97.18090057373047, "learning_rate": 8.448839999999999e-07, "loss": 0.4974, "step": 38800 }, { "epoch": 0.3885, "grad_norm": 109.9385757446289, "learning_rate": 8.446839999999999e-07, "loss": 0.5312, "step": 38850 }, { "epoch": 0.389, "grad_norm": 2.83178448677063, "learning_rate": 8.44484e-07, "loss": 0.4961, "step": 38900 }, { "epoch": 0.3895, "grad_norm": 1.3601728677749634, "learning_rate": 8.44284e-07, "loss": 0.5074, "step": 38950 }, { "epoch": 0.39, "grad_norm": 49.6685905456543, "learning_rate": 8.440839999999999e-07, "loss": 0.4328, "step": 39000 }, { "epoch": 0.3905, "grad_norm": 58.785987854003906, "learning_rate": 8.43884e-07, "loss": 0.5464, "step": 39050 }, { "epoch": 0.391, "grad_norm": 79.17613220214844, "learning_rate": 8.43688e-07, "loss": 0.5818, "step": 39100 }, { "epoch": 0.3915, "grad_norm": 30.441478729248047, "learning_rate": 8.434879999999999e-07, "loss": 0.5418, "step": 39150 }, { "epoch": 0.392, "grad_norm": 58.006282806396484, "learning_rate": 8.43288e-07, "loss": 0.4618, "step": 39200 }, { "epoch": 0.3925, "grad_norm": 83.23896026611328, "learning_rate": 8.430879999999999e-07, "loss": 0.6328, "step": 39250 }, { "epoch": 0.393, "grad_norm": 30.840930938720703, "learning_rate": 8.428879999999999e-07, "loss": 0.52, "step": 39300 }, { "epoch": 0.3935, "grad_norm": 22.173118591308594, "learning_rate": 8.42688e-07, "loss": 0.4596, "step": 39350 }, { "epoch": 0.394, "grad_norm": 123.26498413085938, "learning_rate": 8.42488e-07, "loss": 0.5246, "step": 39400 }, { "epoch": 0.3945, "grad_norm": 65.1488265991211, "learning_rate": 8.42288e-07, "loss": 0.4235, "step": 39450 }, { "epoch": 0.395, "grad_norm": 0.8173670768737793, "learning_rate": 8.420919999999999e-07, "loss": 0.4922, "step": 39500 }, { "epoch": 0.3955, "grad_norm": 4.30292272567749, "learning_rate": 8.418919999999999e-07, "loss": 0.4174, "step": 39550 }, { "epoch": 0.396, "grad_norm": 55.06961441040039, "learning_rate": 8.416919999999999e-07, "loss": 0.5271, "step": 39600 }, { "epoch": 0.3965, "grad_norm": 7.464923858642578, "learning_rate": 8.41492e-07, "loss": 0.495, "step": 39650 }, { "epoch": 0.397, "grad_norm": 34.88985824584961, "learning_rate": 8.41292e-07, "loss": 0.5638, "step": 39700 }, { "epoch": 0.3975, "grad_norm": 36.899593353271484, "learning_rate": 8.410919999999999e-07, "loss": 0.4877, "step": 39750 }, { "epoch": 0.398, "grad_norm": 70.25818634033203, "learning_rate": 8.40892e-07, "loss": 0.516, "step": 39800 }, { "epoch": 0.3985, "grad_norm": 23.857093811035156, "learning_rate": 8.406919999999999e-07, "loss": 0.4775, "step": 39850 }, { "epoch": 0.399, "grad_norm": 64.42444610595703, "learning_rate": 8.404919999999999e-07, "loss": 0.3856, "step": 39900 }, { "epoch": 0.3995, "grad_norm": 94.81324005126953, "learning_rate": 8.40292e-07, "loss": 0.603, "step": 39950 }, { "epoch": 0.4, "grad_norm": 70.74253845214844, "learning_rate": 8.400919999999999e-07, "loss": 0.4607, "step": 40000 }, { "epoch": 0.4005, "grad_norm": 69.46749877929688, "learning_rate": 8.398919999999999e-07, "loss": 0.5185, "step": 40050 }, { "epoch": 0.401, "grad_norm": 30.19931983947754, "learning_rate": 8.39692e-07, "loss": 0.5854, "step": 40100 }, { "epoch": 0.4015, "grad_norm": 80.08529663085938, "learning_rate": 8.39492e-07, "loss": 0.4856, "step": 40150 }, { "epoch": 0.402, "grad_norm": 40.47534942626953, "learning_rate": 8.39292e-07, "loss": 0.4393, "step": 40200 }, { "epoch": 0.4025, "grad_norm": 92.6363296508789, "learning_rate": 8.390919999999999e-07, "loss": 0.4964, "step": 40250 }, { "epoch": 0.403, "grad_norm": 1.9177128076553345, "learning_rate": 8.388919999999999e-07, "loss": 0.606, "step": 40300 }, { "epoch": 0.4035, "grad_norm": 9.308789253234863, "learning_rate": 8.38692e-07, "loss": 0.4849, "step": 40350 }, { "epoch": 0.404, "grad_norm": 9.60834789276123, "learning_rate": 8.38492e-07, "loss": 0.5118, "step": 40400 }, { "epoch": 0.4045, "grad_norm": 70.33528137207031, "learning_rate": 8.38292e-07, "loss": 0.5834, "step": 40450 }, { "epoch": 0.405, "grad_norm": 93.31282806396484, "learning_rate": 8.38092e-07, "loss": 0.5711, "step": 40500 }, { "epoch": 0.4055, "grad_norm": 56.34794998168945, "learning_rate": 8.378919999999999e-07, "loss": 0.5346, "step": 40550 }, { "epoch": 0.406, "grad_norm": 53.3722038269043, "learning_rate": 8.376919999999999e-07, "loss": 0.5983, "step": 40600 }, { "epoch": 0.4065, "grad_norm": 79.872314453125, "learning_rate": 8.37492e-07, "loss": 0.6024, "step": 40650 }, { "epoch": 0.407, "grad_norm": 43.69386291503906, "learning_rate": 8.37292e-07, "loss": 0.6604, "step": 40700 }, { "epoch": 0.4075, "grad_norm": 82.44725799560547, "learning_rate": 8.370919999999999e-07, "loss": 0.5791, "step": 40750 }, { "epoch": 0.408, "grad_norm": 28.565074920654297, "learning_rate": 8.36892e-07, "loss": 0.4985, "step": 40800 }, { "epoch": 0.4085, "grad_norm": 108.46695709228516, "learning_rate": 8.36692e-07, "loss": 0.559, "step": 40850 }, { "epoch": 0.409, "grad_norm": 68.24771881103516, "learning_rate": 8.364919999999999e-07, "loss": 0.6729, "step": 40900 }, { "epoch": 0.4095, "grad_norm": 106.6298599243164, "learning_rate": 8.36292e-07, "loss": 0.6222, "step": 40950 }, { "epoch": 0.41, "grad_norm": 10.868021011352539, "learning_rate": 8.360919999999999e-07, "loss": 0.4931, "step": 41000 }, { "epoch": 0.4105, "grad_norm": 77.19261169433594, "learning_rate": 8.358919999999999e-07, "loss": 0.5437, "step": 41050 }, { "epoch": 0.411, "grad_norm": 52.96466064453125, "learning_rate": 8.35692e-07, "loss": 0.5333, "step": 41100 }, { "epoch": 0.4115, "grad_norm": 47.27060317993164, "learning_rate": 8.35492e-07, "loss": 0.6691, "step": 41150 }, { "epoch": 0.412, "grad_norm": 65.95133209228516, "learning_rate": 8.35292e-07, "loss": 0.506, "step": 41200 }, { "epoch": 0.4125, "grad_norm": 124.53824615478516, "learning_rate": 8.350919999999999e-07, "loss": 0.5634, "step": 41250 }, { "epoch": 0.413, "grad_norm": 59.79798126220703, "learning_rate": 8.348919999999999e-07, "loss": 0.6084, "step": 41300 }, { "epoch": 0.4135, "grad_norm": 17.437131881713867, "learning_rate": 8.346919999999999e-07, "loss": 0.4448, "step": 41350 }, { "epoch": 0.414, "grad_norm": 0.26926401257514954, "learning_rate": 8.34492e-07, "loss": 0.5695, "step": 41400 }, { "epoch": 0.4145, "grad_norm": 25.987173080444336, "learning_rate": 8.34292e-07, "loss": 0.4574, "step": 41450 }, { "epoch": 0.415, "grad_norm": 77.89625549316406, "learning_rate": 8.340919999999999e-07, "loss": 0.5357, "step": 41500 }, { "epoch": 0.4155, "grad_norm": 37.63655471801758, "learning_rate": 8.33892e-07, "loss": 0.5492, "step": 41550 }, { "epoch": 0.416, "grad_norm": 103.72399139404297, "learning_rate": 8.33696e-07, "loss": 0.4425, "step": 41600 }, { "epoch": 0.4165, "grad_norm": 73.7927017211914, "learning_rate": 8.334959999999999e-07, "loss": 0.54, "step": 41650 }, { "epoch": 0.417, "grad_norm": 43.02588653564453, "learning_rate": 8.33296e-07, "loss": 0.492, "step": 41700 }, { "epoch": 0.4175, "grad_norm": 71.02433776855469, "learning_rate": 8.330959999999999e-07, "loss": 0.5462, "step": 41750 }, { "epoch": 0.418, "grad_norm": 115.1431884765625, "learning_rate": 8.328959999999999e-07, "loss": 0.5912, "step": 41800 }, { "epoch": 0.4185, "grad_norm": 96.75558471679688, "learning_rate": 8.32696e-07, "loss": 0.5876, "step": 41850 }, { "epoch": 0.419, "grad_norm": 36.395286560058594, "learning_rate": 8.32496e-07, "loss": 0.4394, "step": 41900 }, { "epoch": 0.4195, "grad_norm": 168.5052947998047, "learning_rate": 8.32296e-07, "loss": 0.5484, "step": 41950 }, { "epoch": 0.42, "grad_norm": 60.965389251708984, "learning_rate": 8.320959999999999e-07, "loss": 0.4683, "step": 42000 }, { "epoch": 0.4205, "grad_norm": 118.09526062011719, "learning_rate": 8.318959999999999e-07, "loss": 0.5096, "step": 42050 }, { "epoch": 0.421, "grad_norm": 91.55111694335938, "learning_rate": 8.316959999999999e-07, "loss": 0.5228, "step": 42100 }, { "epoch": 0.4215, "grad_norm": 24.228757858276367, "learning_rate": 8.31496e-07, "loss": 0.5189, "step": 42150 }, { "epoch": 0.422, "grad_norm": 5.371842861175537, "learning_rate": 8.31296e-07, "loss": 0.4865, "step": 42200 }, { "epoch": 0.4225, "grad_norm": 74.1497802734375, "learning_rate": 8.310959999999999e-07, "loss": 0.4502, "step": 42250 }, { "epoch": 0.423, "grad_norm": 3.059469223022461, "learning_rate": 8.30896e-07, "loss": 0.4839, "step": 42300 }, { "epoch": 0.4235, "grad_norm": 14.30676555633545, "learning_rate": 8.306959999999999e-07, "loss": 0.4863, "step": 42350 }, { "epoch": 0.424, "grad_norm": 0.32036876678466797, "learning_rate": 8.30496e-07, "loss": 0.4859, "step": 42400 }, { "epoch": 0.4245, "grad_norm": 36.87676239013672, "learning_rate": 8.30296e-07, "loss": 0.3993, "step": 42450 }, { "epoch": 0.425, "grad_norm": 53.69343566894531, "learning_rate": 8.300959999999999e-07, "loss": 0.564, "step": 42500 }, { "epoch": 0.4255, "grad_norm": 49.399715423583984, "learning_rate": 8.29896e-07, "loss": 0.537, "step": 42550 }, { "epoch": 0.426, "grad_norm": 42.28934860229492, "learning_rate": 8.29696e-07, "loss": 0.4264, "step": 42600 }, { "epoch": 0.4265, "grad_norm": 14.933109283447266, "learning_rate": 8.29496e-07, "loss": 0.5168, "step": 42650 }, { "epoch": 0.427, "grad_norm": 69.50049591064453, "learning_rate": 8.29296e-07, "loss": 0.5216, "step": 42700 }, { "epoch": 0.4275, "grad_norm": 6.045228958129883, "learning_rate": 8.29096e-07, "loss": 0.6186, "step": 42750 }, { "epoch": 0.428, "grad_norm": 17.617143630981445, "learning_rate": 8.288959999999999e-07, "loss": 0.5476, "step": 42800 }, { "epoch": 0.4285, "grad_norm": 21.75743293762207, "learning_rate": 8.28696e-07, "loss": 0.4667, "step": 42850 }, { "epoch": 0.429, "grad_norm": 85.2100830078125, "learning_rate": 8.28496e-07, "loss": 0.6098, "step": 42900 }, { "epoch": 0.4295, "grad_norm": 10.500739097595215, "learning_rate": 8.28296e-07, "loss": 0.434, "step": 42950 }, { "epoch": 0.43, "grad_norm": 42.483436584472656, "learning_rate": 8.280960000000001e-07, "loss": 0.4656, "step": 43000 }, { "epoch": 0.4305, "grad_norm": 21.001169204711914, "learning_rate": 8.278959999999999e-07, "loss": 0.5221, "step": 43050 }, { "epoch": 0.431, "grad_norm": 17.075965881347656, "learning_rate": 8.276959999999999e-07, "loss": 0.4657, "step": 43100 }, { "epoch": 0.4315, "grad_norm": 15.272270202636719, "learning_rate": 8.27496e-07, "loss": 0.4914, "step": 43150 }, { "epoch": 0.432, "grad_norm": 51.080135345458984, "learning_rate": 8.27296e-07, "loss": 0.689, "step": 43200 }, { "epoch": 0.4325, "grad_norm": 99.12468719482422, "learning_rate": 8.27096e-07, "loss": 0.4991, "step": 43250 }, { "epoch": 0.433, "grad_norm": 69.47232055664062, "learning_rate": 8.26896e-07, "loss": 0.4888, "step": 43300 }, { "epoch": 0.4335, "grad_norm": 71.0149917602539, "learning_rate": 8.26696e-07, "loss": 0.625, "step": 43350 }, { "epoch": 0.434, "grad_norm": 47.15895080566406, "learning_rate": 8.264959999999999e-07, "loss": 0.5997, "step": 43400 }, { "epoch": 0.4345, "grad_norm": 74.67218017578125, "learning_rate": 8.26296e-07, "loss": 0.5972, "step": 43450 }, { "epoch": 0.435, "grad_norm": 53.72488784790039, "learning_rate": 8.26096e-07, "loss": 0.5028, "step": 43500 }, { "epoch": 0.4355, "grad_norm": 14.094425201416016, "learning_rate": 8.258959999999999e-07, "loss": 0.5297, "step": 43550 }, { "epoch": 0.436, "grad_norm": 12.79079818725586, "learning_rate": 8.25696e-07, "loss": 0.4202, "step": 43600 }, { "epoch": 0.4365, "grad_norm": 28.30698013305664, "learning_rate": 8.25496e-07, "loss": 0.4182, "step": 43650 }, { "epoch": 0.437, "grad_norm": 91.74481964111328, "learning_rate": 8.25296e-07, "loss": 0.4885, "step": 43700 }, { "epoch": 0.4375, "grad_norm": 13.599432945251465, "learning_rate": 8.25096e-07, "loss": 0.6304, "step": 43750 }, { "epoch": 0.438, "grad_norm": 3.0752172470092773, "learning_rate": 8.248959999999999e-07, "loss": 0.449, "step": 43800 }, { "epoch": 0.4385, "grad_norm": 74.79292297363281, "learning_rate": 8.246959999999999e-07, "loss": 0.6306, "step": 43850 }, { "epoch": 0.439, "grad_norm": 43.5394401550293, "learning_rate": 8.24496e-07, "loss": 0.6897, "step": 43900 }, { "epoch": 0.4395, "grad_norm": 25.60984230041504, "learning_rate": 8.24296e-07, "loss": 0.5346, "step": 43950 }, { "epoch": 0.44, "grad_norm": 91.31788635253906, "learning_rate": 8.24096e-07, "loss": 0.6347, "step": 44000 }, { "epoch": 0.4405, "grad_norm": 90.00495147705078, "learning_rate": 8.23896e-07, "loss": 0.5585, "step": 44050 }, { "epoch": 0.441, "grad_norm": 78.50735473632812, "learning_rate": 8.236959999999999e-07, "loss": 0.5065, "step": 44100 }, { "epoch": 0.4415, "grad_norm": 61.63063049316406, "learning_rate": 8.234959999999999e-07, "loss": 0.3718, "step": 44150 }, { "epoch": 0.442, "grad_norm": 64.05858612060547, "learning_rate": 8.23296e-07, "loss": 0.5225, "step": 44200 }, { "epoch": 0.4425, "grad_norm": 46.02941131591797, "learning_rate": 8.23096e-07, "loss": 0.4429, "step": 44250 }, { "epoch": 0.443, "grad_norm": 19.115480422973633, "learning_rate": 8.228959999999999e-07, "loss": 0.6595, "step": 44300 }, { "epoch": 0.4435, "grad_norm": 5.615220069885254, "learning_rate": 8.22696e-07, "loss": 0.3437, "step": 44350 }, { "epoch": 0.444, "grad_norm": 53.1685791015625, "learning_rate": 8.22496e-07, "loss": 0.5388, "step": 44400 }, { "epoch": 0.4445, "grad_norm": 35.044410705566406, "learning_rate": 8.222959999999999e-07, "loss": 0.6554, "step": 44450 }, { "epoch": 0.445, "grad_norm": 98.26656341552734, "learning_rate": 8.22096e-07, "loss": 0.5233, "step": 44500 }, { "epoch": 0.4455, "grad_norm": 109.79158782958984, "learning_rate": 8.218959999999999e-07, "loss": 0.5021, "step": 44550 }, { "epoch": 0.446, "grad_norm": 59.18463134765625, "learning_rate": 8.216959999999999e-07, "loss": 0.5612, "step": 44600 }, { "epoch": 0.4465, "grad_norm": 89.63908386230469, "learning_rate": 8.21496e-07, "loss": 0.4865, "step": 44650 }, { "epoch": 0.447, "grad_norm": 75.61219024658203, "learning_rate": 8.21296e-07, "loss": 0.4486, "step": 44700 }, { "epoch": 0.4475, "grad_norm": 4.093531131744385, "learning_rate": 8.210960000000001e-07, "loss": 0.6416, "step": 44750 }, { "epoch": 0.448, "grad_norm": 37.043434143066406, "learning_rate": 8.208959999999999e-07, "loss": 0.6513, "step": 44800 }, { "epoch": 0.4485, "grad_norm": 51.53557586669922, "learning_rate": 8.206959999999999e-07, "loss": 0.4894, "step": 44850 }, { "epoch": 0.449, "grad_norm": 69.94869232177734, "learning_rate": 8.20496e-07, "loss": 0.4744, "step": 44900 }, { "epoch": 0.4495, "grad_norm": 76.57780456542969, "learning_rate": 8.20296e-07, "loss": 0.5147, "step": 44950 }, { "epoch": 0.45, "grad_norm": 82.7248764038086, "learning_rate": 8.20096e-07, "loss": 0.6523, "step": 45000 }, { "epoch": 0.4505, "grad_norm": 23.24110984802246, "learning_rate": 8.19896e-07, "loss": 0.5502, "step": 45050 }, { "epoch": 0.451, "grad_norm": 60.64106750488281, "learning_rate": 8.19696e-07, "loss": 0.4758, "step": 45100 }, { "epoch": 0.4515, "grad_norm": 35.0093879699707, "learning_rate": 8.194959999999999e-07, "loss": 0.544, "step": 45150 }, { "epoch": 0.452, "grad_norm": 27.198448181152344, "learning_rate": 8.19296e-07, "loss": 0.5395, "step": 45200 }, { "epoch": 0.4525, "grad_norm": 51.811222076416016, "learning_rate": 8.19096e-07, "loss": 0.5858, "step": 45250 }, { "epoch": 0.453, "grad_norm": 59.12669372558594, "learning_rate": 8.188959999999999e-07, "loss": 0.4579, "step": 45300 }, { "epoch": 0.4535, "grad_norm": 35.265716552734375, "learning_rate": 8.18696e-07, "loss": 0.53, "step": 45350 }, { "epoch": 0.454, "grad_norm": 38.40176010131836, "learning_rate": 8.18496e-07, "loss": 0.5556, "step": 45400 }, { "epoch": 0.4545, "grad_norm": 6.6817240715026855, "learning_rate": 8.18296e-07, "loss": 0.4169, "step": 45450 }, { "epoch": 0.455, "grad_norm": 77.1117935180664, "learning_rate": 8.18096e-07, "loss": 0.5975, "step": 45500 }, { "epoch": 0.4555, "grad_norm": 32.58802032470703, "learning_rate": 8.178959999999999e-07, "loss": 0.4934, "step": 45550 }, { "epoch": 0.456, "grad_norm": 80.36177062988281, "learning_rate": 8.176959999999999e-07, "loss": 0.4513, "step": 45600 }, { "epoch": 0.4565, "grad_norm": 38.63931655883789, "learning_rate": 8.17496e-07, "loss": 0.4297, "step": 45650 }, { "epoch": 0.457, "grad_norm": 30.655248641967773, "learning_rate": 8.17296e-07, "loss": 0.6114, "step": 45700 }, { "epoch": 0.4575, "grad_norm": 122.92070007324219, "learning_rate": 8.17096e-07, "loss": 0.5557, "step": 45750 }, { "epoch": 0.458, "grad_norm": 63.41127395629883, "learning_rate": 8.16896e-07, "loss": 0.6203, "step": 45800 }, { "epoch": 0.4585, "grad_norm": 10.74167537689209, "learning_rate": 8.166959999999999e-07, "loss": 0.4382, "step": 45850 }, { "epoch": 0.459, "grad_norm": 70.49405670166016, "learning_rate": 8.164959999999999e-07, "loss": 0.4747, "step": 45900 }, { "epoch": 0.4595, "grad_norm": 3.7027602195739746, "learning_rate": 8.16296e-07, "loss": 0.6623, "step": 45950 }, { "epoch": 0.46, "grad_norm": 25.70686149597168, "learning_rate": 8.16096e-07, "loss": 0.494, "step": 46000 }, { "epoch": 0.4605, "grad_norm": 32.90084457397461, "learning_rate": 8.158959999999999e-07, "loss": 0.4998, "step": 46050 }, { "epoch": 0.461, "grad_norm": 14.209221839904785, "learning_rate": 8.15696e-07, "loss": 0.4964, "step": 46100 }, { "epoch": 0.4615, "grad_norm": 0.3458581268787384, "learning_rate": 8.15496e-07, "loss": 0.5275, "step": 46150 }, { "epoch": 0.462, "grad_norm": 66.87709045410156, "learning_rate": 8.152959999999999e-07, "loss": 0.5533, "step": 46200 }, { "epoch": 0.4625, "grad_norm": 53.677860260009766, "learning_rate": 8.15096e-07, "loss": 0.5109, "step": 46250 }, { "epoch": 0.463, "grad_norm": 43.0468864440918, "learning_rate": 8.148959999999999e-07, "loss": 0.4664, "step": 46300 }, { "epoch": 0.4635, "grad_norm": 70.673095703125, "learning_rate": 8.146959999999999e-07, "loss": 0.6125, "step": 46350 }, { "epoch": 0.464, "grad_norm": 71.34368133544922, "learning_rate": 8.14496e-07, "loss": 0.4832, "step": 46400 }, { "epoch": 0.4645, "grad_norm": 198.13221740722656, "learning_rate": 8.14296e-07, "loss": 0.6594, "step": 46450 }, { "epoch": 0.465, "grad_norm": 35.54419708251953, "learning_rate": 8.14096e-07, "loss": 0.5413, "step": 46500 }, { "epoch": 0.4655, "grad_norm": 78.56597900390625, "learning_rate": 8.138959999999999e-07, "loss": 0.6681, "step": 46550 }, { "epoch": 0.466, "grad_norm": 22.999269485473633, "learning_rate": 8.136959999999999e-07, "loss": 0.4987, "step": 46600 }, { "epoch": 0.4665, "grad_norm": 1.7849301099777222, "learning_rate": 8.134959999999999e-07, "loss": 0.4031, "step": 46650 }, { "epoch": 0.467, "grad_norm": 27.667150497436523, "learning_rate": 8.13296e-07, "loss": 0.5426, "step": 46700 }, { "epoch": 0.4675, "grad_norm": 33.66166687011719, "learning_rate": 8.13096e-07, "loss": 0.4668, "step": 46750 }, { "epoch": 0.468, "grad_norm": 28.29575538635254, "learning_rate": 8.128959999999999e-07, "loss": 0.5348, "step": 46800 }, { "epoch": 0.4685, "grad_norm": 48.07258605957031, "learning_rate": 8.12696e-07, "loss": 0.7829, "step": 46850 }, { "epoch": 0.469, "grad_norm": 42.36520004272461, "learning_rate": 8.124959999999999e-07, "loss": 0.6122, "step": 46900 }, { "epoch": 0.4695, "grad_norm": 106.2151107788086, "learning_rate": 8.122959999999999e-07, "loss": 0.621, "step": 46950 }, { "epoch": 0.47, "grad_norm": 76.08829498291016, "learning_rate": 8.12096e-07, "loss": 0.6716, "step": 47000 }, { "epoch": 0.4705, "grad_norm": 31.28024673461914, "learning_rate": 8.118959999999999e-07, "loss": 0.4718, "step": 47050 }, { "epoch": 0.471, "grad_norm": 55.14548873901367, "learning_rate": 8.116959999999999e-07, "loss": 0.6012, "step": 47100 }, { "epoch": 0.4715, "grad_norm": 8.463032722473145, "learning_rate": 8.11496e-07, "loss": 0.4563, "step": 47150 }, { "epoch": 0.472, "grad_norm": 74.97944641113281, "learning_rate": 8.11296e-07, "loss": 0.568, "step": 47200 }, { "epoch": 0.4725, "grad_norm": 1.081640601158142, "learning_rate": 8.11096e-07, "loss": 0.6412, "step": 47250 }, { "epoch": 0.473, "grad_norm": 77.51432800292969, "learning_rate": 8.108959999999999e-07, "loss": 0.4582, "step": 47300 }, { "epoch": 0.4735, "grad_norm": 45.42144012451172, "learning_rate": 8.106959999999999e-07, "loss": 0.4746, "step": 47350 }, { "epoch": 0.474, "grad_norm": 0.2983810603618622, "learning_rate": 8.105e-07, "loss": 0.6629, "step": 47400 }, { "epoch": 0.4745, "grad_norm": 5.902935981750488, "learning_rate": 8.103e-07, "loss": 0.5522, "step": 47450 }, { "epoch": 0.475, "grad_norm": 0.9811944961547852, "learning_rate": 8.101e-07, "loss": 0.4998, "step": 47500 }, { "epoch": 0.4755, "grad_norm": 21.928115844726562, "learning_rate": 8.099e-07, "loss": 0.6313, "step": 47550 }, { "epoch": 0.476, "grad_norm": 51.325443267822266, "learning_rate": 8.097e-07, "loss": 0.5051, "step": 47600 }, { "epoch": 0.4765, "grad_norm": 10.640209197998047, "learning_rate": 8.094999999999999e-07, "loss": 0.4917, "step": 47650 }, { "epoch": 0.477, "grad_norm": 90.27954864501953, "learning_rate": 8.093e-07, "loss": 0.4371, "step": 47700 }, { "epoch": 0.4775, "grad_norm": 78.23124694824219, "learning_rate": 8.091e-07, "loss": 0.5052, "step": 47750 }, { "epoch": 0.478, "grad_norm": 3.3375375270843506, "learning_rate": 8.088999999999999e-07, "loss": 0.5095, "step": 47800 }, { "epoch": 0.4785, "grad_norm": 17.96718406677246, "learning_rate": 8.087e-07, "loss": 0.4613, "step": 47850 }, { "epoch": 0.479, "grad_norm": 86.63037872314453, "learning_rate": 8.085e-07, "loss": 0.5274, "step": 47900 }, { "epoch": 0.4795, "grad_norm": 15.691425323486328, "learning_rate": 8.083e-07, "loss": 0.5586, "step": 47950 }, { "epoch": 0.48, "grad_norm": 95.55493927001953, "learning_rate": 8.081e-07, "loss": 0.5387, "step": 48000 }, { "epoch": 0.4805, "grad_norm": 86.83247375488281, "learning_rate": 8.078999999999999e-07, "loss": 0.5033, "step": 48050 }, { "epoch": 0.481, "grad_norm": 24.38957405090332, "learning_rate": 8.076999999999999e-07, "loss": 0.7073, "step": 48100 }, { "epoch": 0.4815, "grad_norm": 49.8204231262207, "learning_rate": 8.075e-07, "loss": 0.4825, "step": 48150 }, { "epoch": 0.482, "grad_norm": 81.01677703857422, "learning_rate": 8.07304e-07, "loss": 0.6172, "step": 48200 }, { "epoch": 0.4825, "grad_norm": 9.030482292175293, "learning_rate": 8.07104e-07, "loss": 0.5569, "step": 48250 }, { "epoch": 0.483, "grad_norm": 94.63253021240234, "learning_rate": 8.06904e-07, "loss": 0.5637, "step": 48300 }, { "epoch": 0.4835, "grad_norm": 92.56613159179688, "learning_rate": 8.067039999999999e-07, "loss": 0.6043, "step": 48350 }, { "epoch": 0.484, "grad_norm": 21.853906631469727, "learning_rate": 8.065039999999999e-07, "loss": 0.4438, "step": 48400 }, { "epoch": 0.4845, "grad_norm": 22.982881546020508, "learning_rate": 8.06304e-07, "loss": 0.498, "step": 48450 }, { "epoch": 0.485, "grad_norm": 57.36958312988281, "learning_rate": 8.06104e-07, "loss": 0.5241, "step": 48500 }, { "epoch": 0.4855, "grad_norm": 21.14190673828125, "learning_rate": 8.059039999999999e-07, "loss": 0.6204, "step": 48550 }, { "epoch": 0.486, "grad_norm": 52.20134353637695, "learning_rate": 8.05704e-07, "loss": 0.5012, "step": 48600 }, { "epoch": 0.4865, "grad_norm": 1.349229335784912, "learning_rate": 8.05504e-07, "loss": 0.5246, "step": 48650 }, { "epoch": 0.487, "grad_norm": 66.7760009765625, "learning_rate": 8.053039999999999e-07, "loss": 0.5898, "step": 48700 }, { "epoch": 0.4875, "grad_norm": 20.221620559692383, "learning_rate": 8.05104e-07, "loss": 0.449, "step": 48750 }, { "epoch": 0.488, "grad_norm": 36.600914001464844, "learning_rate": 8.049039999999999e-07, "loss": 0.4766, "step": 48800 }, { "epoch": 0.4885, "grad_norm": 65.7702865600586, "learning_rate": 8.047039999999999e-07, "loss": 0.66, "step": 48850 }, { "epoch": 0.489, "grad_norm": 41.316524505615234, "learning_rate": 8.04504e-07, "loss": 0.549, "step": 48900 }, { "epoch": 0.4895, "grad_norm": 30.280691146850586, "learning_rate": 8.04304e-07, "loss": 0.5504, "step": 48950 }, { "epoch": 0.49, "grad_norm": 34.455535888671875, "learning_rate": 8.04104e-07, "loss": 0.502, "step": 49000 }, { "epoch": 0.4905, "grad_norm": 57.2782096862793, "learning_rate": 8.039039999999999e-07, "loss": 0.4723, "step": 49050 }, { "epoch": 0.491, "grad_norm": 29.322336196899414, "learning_rate": 8.037039999999999e-07, "loss": 0.5335, "step": 49100 }, { "epoch": 0.4915, "grad_norm": 88.96637725830078, "learning_rate": 8.035039999999999e-07, "loss": 0.5582, "step": 49150 }, { "epoch": 0.492, "grad_norm": 59.695770263671875, "learning_rate": 8.03304e-07, "loss": 0.5627, "step": 49200 }, { "epoch": 0.4925, "grad_norm": 96.95304107666016, "learning_rate": 8.03104e-07, "loss": 0.5773, "step": 49250 }, { "epoch": 0.493, "grad_norm": 76.24807739257812, "learning_rate": 8.029039999999999e-07, "loss": 0.5979, "step": 49300 }, { "epoch": 0.4935, "grad_norm": 5.769176483154297, "learning_rate": 8.02704e-07, "loss": 0.4968, "step": 49350 }, { "epoch": 0.494, "grad_norm": 61.573577880859375, "learning_rate": 8.025039999999999e-07, "loss": 0.5445, "step": 49400 }, { "epoch": 0.4945, "grad_norm": 66.46489715576172, "learning_rate": 8.023039999999999e-07, "loss": 0.467, "step": 49450 }, { "epoch": 0.495, "grad_norm": 84.0884780883789, "learning_rate": 8.02104e-07, "loss": 0.5543, "step": 49500 }, { "epoch": 0.4955, "grad_norm": 73.09436798095703, "learning_rate": 8.019039999999999e-07, "loss": 0.5498, "step": 49550 }, { "epoch": 0.496, "grad_norm": 17.9005126953125, "learning_rate": 8.017039999999999e-07, "loss": 0.4723, "step": 49600 }, { "epoch": 0.4965, "grad_norm": 50.205223083496094, "learning_rate": 8.01504e-07, "loss": 0.5561, "step": 49650 }, { "epoch": 0.497, "grad_norm": 52.76295471191406, "learning_rate": 8.01304e-07, "loss": 0.4949, "step": 49700 }, { "epoch": 0.4975, "grad_norm": 62.78417205810547, "learning_rate": 8.01104e-07, "loss": 0.56, "step": 49750 }, { "epoch": 0.498, "grad_norm": 6.838378429412842, "learning_rate": 8.009039999999999e-07, "loss": 0.6055, "step": 49800 }, { "epoch": 0.4985, "grad_norm": 18.722816467285156, "learning_rate": 8.007039999999999e-07, "loss": 0.5358, "step": 49850 }, { "epoch": 0.499, "grad_norm": 72.58301544189453, "learning_rate": 8.00504e-07, "loss": 0.401, "step": 49900 }, { "epoch": 0.4995, "grad_norm": 37.88386535644531, "learning_rate": 8.00304e-07, "loss": 0.5257, "step": 49950 }, { "epoch": 0.5, "grad_norm": 72.8793716430664, "learning_rate": 8.00104e-07, "loss": 0.5824, "step": 50000 }, { "epoch": 0.5005, "grad_norm": 64.095458984375, "learning_rate": 7.99904e-07, "loss": 0.5659, "step": 50050 }, { "epoch": 0.501, "grad_norm": 25.19496726989746, "learning_rate": 7.997039999999999e-07, "loss": 0.4465, "step": 50100 }, { "epoch": 0.5015, "grad_norm": 112.08228302001953, "learning_rate": 7.995039999999999e-07, "loss": 0.6215, "step": 50150 }, { "epoch": 0.502, "grad_norm": 3.157916307449341, "learning_rate": 7.99304e-07, "loss": 0.6333, "step": 50200 }, { "epoch": 0.5025, "grad_norm": 76.75867462158203, "learning_rate": 7.99104e-07, "loss": 0.483, "step": 50250 }, { "epoch": 0.503, "grad_norm": 43.23719024658203, "learning_rate": 7.989039999999999e-07, "loss": 0.5854, "step": 50300 }, { "epoch": 0.5035, "grad_norm": 68.20858001708984, "learning_rate": 7.98708e-07, "loss": 0.6767, "step": 50350 }, { "epoch": 0.504, "grad_norm": 88.28502655029297, "learning_rate": 7.98508e-07, "loss": 0.7013, "step": 50400 }, { "epoch": 0.5045, "grad_norm": 36.20711898803711, "learning_rate": 7.98308e-07, "loss": 0.3669, "step": 50450 }, { "epoch": 0.505, "grad_norm": 36.513954162597656, "learning_rate": 7.98108e-07, "loss": 0.4585, "step": 50500 }, { "epoch": 0.5055, "grad_norm": 58.205039978027344, "learning_rate": 7.979079999999999e-07, "loss": 0.4132, "step": 50550 }, { "epoch": 0.506, "grad_norm": 79.69970703125, "learning_rate": 7.977079999999999e-07, "loss": 0.5174, "step": 50600 }, { "epoch": 0.5065, "grad_norm": 45.726173400878906, "learning_rate": 7.97508e-07, "loss": 0.5019, "step": 50650 }, { "epoch": 0.507, "grad_norm": 88.03544616699219, "learning_rate": 7.97308e-07, "loss": 0.4937, "step": 50700 }, { "epoch": 0.5075, "grad_norm": 47.83407211303711, "learning_rate": 7.97108e-07, "loss": 0.5023, "step": 50750 }, { "epoch": 0.508, "grad_norm": 28.184261322021484, "learning_rate": 7.96908e-07, "loss": 0.4929, "step": 50800 }, { "epoch": 0.5085, "grad_norm": 97.35902404785156, "learning_rate": 7.967079999999999e-07, "loss": 0.4831, "step": 50850 }, { "epoch": 0.509, "grad_norm": 45.97766876220703, "learning_rate": 7.965079999999999e-07, "loss": 0.5077, "step": 50900 }, { "epoch": 0.5095, "grad_norm": 126.22740173339844, "learning_rate": 7.96308e-07, "loss": 0.7102, "step": 50950 }, { "epoch": 0.51, "grad_norm": 47.19316101074219, "learning_rate": 7.96108e-07, "loss": 0.4501, "step": 51000 }, { "epoch": 0.5105, "grad_norm": 7.235782146453857, "learning_rate": 7.959079999999999e-07, "loss": 0.4165, "step": 51050 }, { "epoch": 0.511, "grad_norm": 1.8868838548660278, "learning_rate": 7.95708e-07, "loss": 0.5589, "step": 51100 }, { "epoch": 0.5115, "grad_norm": 48.44756317138672, "learning_rate": 7.95508e-07, "loss": 0.7304, "step": 51150 }, { "epoch": 0.512, "grad_norm": 36.71236038208008, "learning_rate": 7.953079999999999e-07, "loss": 0.5783, "step": 51200 }, { "epoch": 0.5125, "grad_norm": 59.711605072021484, "learning_rate": 7.95108e-07, "loss": 0.5104, "step": 51250 }, { "epoch": 0.513, "grad_norm": 21.43183135986328, "learning_rate": 7.949079999999999e-07, "loss": 0.7175, "step": 51300 }, { "epoch": 0.5135, "grad_norm": 106.07775115966797, "learning_rate": 7.947079999999999e-07, "loss": 0.5952, "step": 51350 }, { "epoch": 0.514, "grad_norm": 25.90865135192871, "learning_rate": 7.94508e-07, "loss": 0.5459, "step": 51400 }, { "epoch": 0.5145, "grad_norm": 3.6912739276885986, "learning_rate": 7.94308e-07, "loss": 0.5168, "step": 51450 }, { "epoch": 0.515, "grad_norm": 89.33435821533203, "learning_rate": 7.94108e-07, "loss": 0.5239, "step": 51500 }, { "epoch": 0.5155, "grad_norm": 32.307071685791016, "learning_rate": 7.939079999999999e-07, "loss": 0.5169, "step": 51550 }, { "epoch": 0.516, "grad_norm": 16.09387969970703, "learning_rate": 7.937079999999999e-07, "loss": 0.4644, "step": 51600 }, { "epoch": 0.5165, "grad_norm": 11.74305534362793, "learning_rate": 7.935079999999999e-07, "loss": 0.3657, "step": 51650 }, { "epoch": 0.517, "grad_norm": 3.9428036212921143, "learning_rate": 7.93308e-07, "loss": 0.4922, "step": 51700 }, { "epoch": 0.5175, "grad_norm": 15.612861633300781, "learning_rate": 7.93108e-07, "loss": 0.3998, "step": 51750 }, { "epoch": 0.518, "grad_norm": 96.3296890258789, "learning_rate": 7.929079999999999e-07, "loss": 0.5107, "step": 51800 }, { "epoch": 0.5185, "grad_norm": 76.5985336303711, "learning_rate": 7.92708e-07, "loss": 0.5332, "step": 51850 }, { "epoch": 0.519, "grad_norm": 17.58921241760254, "learning_rate": 7.925079999999999e-07, "loss": 0.4055, "step": 51900 }, { "epoch": 0.5195, "grad_norm": 15.568597793579102, "learning_rate": 7.923079999999999e-07, "loss": 0.5198, "step": 51950 }, { "epoch": 0.52, "grad_norm": 0.8958696126937866, "learning_rate": 7.92108e-07, "loss": 0.5061, "step": 52000 }, { "epoch": 0.5205, "grad_norm": 1.784369945526123, "learning_rate": 7.919079999999999e-07, "loss": 0.5103, "step": 52050 }, { "epoch": 0.521, "grad_norm": 36.33938980102539, "learning_rate": 7.917079999999999e-07, "loss": 0.574, "step": 52100 }, { "epoch": 0.5215, "grad_norm": 39.72173309326172, "learning_rate": 7.91508e-07, "loss": 0.5163, "step": 52150 }, { "epoch": 0.522, "grad_norm": 56.958919525146484, "learning_rate": 7.91308e-07, "loss": 0.5032, "step": 52200 }, { "epoch": 0.5225, "grad_norm": 103.39403533935547, "learning_rate": 7.91108e-07, "loss": 0.63, "step": 52250 }, { "epoch": 0.523, "grad_norm": 79.32686614990234, "learning_rate": 7.90908e-07, "loss": 0.5285, "step": 52300 }, { "epoch": 0.5235, "grad_norm": 38.18978500366211, "learning_rate": 7.907079999999999e-07, "loss": 0.4923, "step": 52350 }, { "epoch": 0.524, "grad_norm": 17.309484481811523, "learning_rate": 7.90508e-07, "loss": 0.4923, "step": 52400 }, { "epoch": 0.5245, "grad_norm": 92.0141372680664, "learning_rate": 7.90308e-07, "loss": 0.5748, "step": 52450 }, { "epoch": 0.525, "grad_norm": 38.47359848022461, "learning_rate": 7.90108e-07, "loss": 0.5513, "step": 52500 }, { "epoch": 0.5255, "grad_norm": 69.26992797851562, "learning_rate": 7.899080000000001e-07, "loss": 0.6091, "step": 52550 }, { "epoch": 0.526, "grad_norm": 15.796204566955566, "learning_rate": 7.897079999999999e-07, "loss": 0.5999, "step": 52600 }, { "epoch": 0.5265, "grad_norm": 103.76625061035156, "learning_rate": 7.895079999999999e-07, "loss": 0.4535, "step": 52650 }, { "epoch": 0.527, "grad_norm": 6.661074161529541, "learning_rate": 7.89308e-07, "loss": 0.5645, "step": 52700 }, { "epoch": 0.5275, "grad_norm": 92.19075012207031, "learning_rate": 7.89108e-07, "loss": 0.5311, "step": 52750 }, { "epoch": 0.528, "grad_norm": 51.94999313354492, "learning_rate": 7.88908e-07, "loss": 0.4162, "step": 52800 }, { "epoch": 0.5285, "grad_norm": 2.7024729251861572, "learning_rate": 7.88708e-07, "loss": 0.4668, "step": 52850 }, { "epoch": 0.529, "grad_norm": 72.6579360961914, "learning_rate": 7.88508e-07, "loss": 0.5353, "step": 52900 }, { "epoch": 0.5295, "grad_norm": 47.76490783691406, "learning_rate": 7.883079999999999e-07, "loss": 0.3945, "step": 52950 }, { "epoch": 0.53, "grad_norm": 64.03026580810547, "learning_rate": 7.88108e-07, "loss": 0.5546, "step": 53000 }, { "epoch": 0.5305, "grad_norm": 37.60674285888672, "learning_rate": 7.87908e-07, "loss": 0.5378, "step": 53050 }, { "epoch": 0.531, "grad_norm": 53.003170013427734, "learning_rate": 7.877079999999999e-07, "loss": 0.4513, "step": 53100 }, { "epoch": 0.5315, "grad_norm": 5.38227653503418, "learning_rate": 7.87508e-07, "loss": 0.5933, "step": 53150 }, { "epoch": 0.532, "grad_norm": 48.471832275390625, "learning_rate": 7.87308e-07, "loss": 0.476, "step": 53200 }, { "epoch": 0.5325, "grad_norm": 7.869255542755127, "learning_rate": 7.87108e-07, "loss": 0.4092, "step": 53250 }, { "epoch": 0.533, "grad_norm": 18.027494430541992, "learning_rate": 7.86908e-07, "loss": 0.5511, "step": 53300 }, { "epoch": 0.5335, "grad_norm": 2.7823119163513184, "learning_rate": 7.867079999999999e-07, "loss": 0.4462, "step": 53350 }, { "epoch": 0.534, "grad_norm": 84.66714477539062, "learning_rate": 7.865079999999999e-07, "loss": 0.5392, "step": 53400 }, { "epoch": 0.5345, "grad_norm": 39.51984786987305, "learning_rate": 7.86308e-07, "loss": 0.5594, "step": 53450 }, { "epoch": 0.535, "grad_norm": 58.15394973754883, "learning_rate": 7.86112e-07, "loss": 0.49, "step": 53500 }, { "epoch": 0.5355, "grad_norm": 90.69095611572266, "learning_rate": 7.85912e-07, "loss": 0.3737, "step": 53550 }, { "epoch": 0.536, "grad_norm": 6.2065629959106445, "learning_rate": 7.85712e-07, "loss": 0.6176, "step": 53600 }, { "epoch": 0.5365, "grad_norm": 123.52501678466797, "learning_rate": 7.855119999999999e-07, "loss": 0.4808, "step": 53650 }, { "epoch": 0.537, "grad_norm": 122.08316040039062, "learning_rate": 7.853119999999999e-07, "loss": 0.5169, "step": 53700 }, { "epoch": 0.5375, "grad_norm": 1.263464331626892, "learning_rate": 7.85112e-07, "loss": 0.4527, "step": 53750 }, { "epoch": 0.538, "grad_norm": 0.7644511461257935, "learning_rate": 7.84912e-07, "loss": 0.6983, "step": 53800 }, { "epoch": 0.5385, "grad_norm": 4.055157661437988, "learning_rate": 7.847119999999999e-07, "loss": 0.5582, "step": 53850 }, { "epoch": 0.539, "grad_norm": 39.59327697753906, "learning_rate": 7.84512e-07, "loss": 0.4857, "step": 53900 }, { "epoch": 0.5395, "grad_norm": 44.580570220947266, "learning_rate": 7.84312e-07, "loss": 0.5478, "step": 53950 }, { "epoch": 0.54, "grad_norm": 45.88383865356445, "learning_rate": 7.84112e-07, "loss": 0.5136, "step": 54000 }, { "epoch": 0.5405, "grad_norm": 18.86288833618164, "learning_rate": 7.83912e-07, "loss": 0.5359, "step": 54050 }, { "epoch": 0.541, "grad_norm": 78.80966186523438, "learning_rate": 7.837119999999999e-07, "loss": 0.6137, "step": 54100 }, { "epoch": 0.5415, "grad_norm": 17.388288497924805, "learning_rate": 7.835119999999999e-07, "loss": 0.5367, "step": 54150 }, { "epoch": 0.542, "grad_norm": 50.73162078857422, "learning_rate": 7.83312e-07, "loss": 0.5237, "step": 54200 }, { "epoch": 0.5425, "grad_norm": 11.157291412353516, "learning_rate": 7.83112e-07, "loss": 0.4897, "step": 54250 }, { "epoch": 0.543, "grad_norm": 75.02784729003906, "learning_rate": 7.829120000000001e-07, "loss": 0.5842, "step": 54300 }, { "epoch": 0.5435, "grad_norm": 72.39640045166016, "learning_rate": 7.82712e-07, "loss": 0.5354, "step": 54350 }, { "epoch": 0.544, "grad_norm": 0.02550535462796688, "learning_rate": 7.825119999999999e-07, "loss": 0.5047, "step": 54400 }, { "epoch": 0.5445, "grad_norm": 5.766872882843018, "learning_rate": 7.82312e-07, "loss": 0.5554, "step": 54450 }, { "epoch": 0.545, "grad_norm": 41.95703125, "learning_rate": 7.82112e-07, "loss": 0.5145, "step": 54500 }, { "epoch": 0.5455, "grad_norm": 100.95028686523438, "learning_rate": 7.81912e-07, "loss": 0.5323, "step": 54550 }, { "epoch": 0.546, "grad_norm": 107.0252685546875, "learning_rate": 7.81712e-07, "loss": 0.6051, "step": 54600 }, { "epoch": 0.5465, "grad_norm": 73.3103256225586, "learning_rate": 7.81512e-07, "loss": 0.6191, "step": 54650 }, { "epoch": 0.547, "grad_norm": 99.1833724975586, "learning_rate": 7.81312e-07, "loss": 0.5797, "step": 54700 }, { "epoch": 0.5475, "grad_norm": 64.93377685546875, "learning_rate": 7.81112e-07, "loss": 0.4962, "step": 54750 }, { "epoch": 0.548, "grad_norm": 16.584720611572266, "learning_rate": 7.80912e-07, "loss": 0.5693, "step": 54800 }, { "epoch": 0.5485, "grad_norm": 91.99310302734375, "learning_rate": 7.807119999999999e-07, "loss": 0.6053, "step": 54850 }, { "epoch": 0.549, "grad_norm": 91.74974822998047, "learning_rate": 7.80512e-07, "loss": 0.515, "step": 54900 }, { "epoch": 0.5495, "grad_norm": 11.728752136230469, "learning_rate": 7.80312e-07, "loss": 0.499, "step": 54950 }, { "epoch": 0.55, "grad_norm": 39.06698989868164, "learning_rate": 7.80112e-07, "loss": 0.4463, "step": 55000 }, { "epoch": 0.5505, "grad_norm": 88.770751953125, "learning_rate": 7.799120000000001e-07, "loss": 0.5427, "step": 55050 }, { "epoch": 0.551, "grad_norm": 27.026084899902344, "learning_rate": 7.797119999999999e-07, "loss": 0.523, "step": 55100 }, { "epoch": 0.5515, "grad_norm": 1.4966083765029907, "learning_rate": 7.795119999999999e-07, "loss": 0.3774, "step": 55150 }, { "epoch": 0.552, "grad_norm": 48.237422943115234, "learning_rate": 7.79312e-07, "loss": 0.5175, "step": 55200 }, { "epoch": 0.5525, "grad_norm": 55.568851470947266, "learning_rate": 7.79112e-07, "loss": 0.4215, "step": 55250 }, { "epoch": 0.553, "grad_norm": 102.37129211425781, "learning_rate": 7.78912e-07, "loss": 0.64, "step": 55300 }, { "epoch": 0.5535, "grad_norm": 3.732124090194702, "learning_rate": 7.78712e-07, "loss": 0.5539, "step": 55350 }, { "epoch": 0.554, "grad_norm": 49.1021842956543, "learning_rate": 7.78512e-07, "loss": 0.4958, "step": 55400 }, { "epoch": 0.5545, "grad_norm": 85.12141418457031, "learning_rate": 7.783119999999999e-07, "loss": 0.4048, "step": 55450 }, { "epoch": 0.555, "grad_norm": 7.854513168334961, "learning_rate": 7.78112e-07, "loss": 0.4996, "step": 55500 }, { "epoch": 0.5555, "grad_norm": 21.370290756225586, "learning_rate": 7.77912e-07, "loss": 0.6001, "step": 55550 }, { "epoch": 0.556, "grad_norm": 58.50725173950195, "learning_rate": 7.777119999999999e-07, "loss": 0.5398, "step": 55600 }, { "epoch": 0.5565, "grad_norm": 26.769861221313477, "learning_rate": 7.77512e-07, "loss": 0.5714, "step": 55650 }, { "epoch": 0.557, "grad_norm": 7.172065734863281, "learning_rate": 7.77312e-07, "loss": 0.5132, "step": 55700 }, { "epoch": 0.5575, "grad_norm": 28.10909080505371, "learning_rate": 7.77112e-07, "loss": 0.5079, "step": 55750 }, { "epoch": 0.558, "grad_norm": 78.68544006347656, "learning_rate": 7.76912e-07, "loss": 0.575, "step": 55800 }, { "epoch": 0.5585, "grad_norm": 105.1363754272461, "learning_rate": 7.767119999999999e-07, "loss": 0.6106, "step": 55850 }, { "epoch": 0.559, "grad_norm": 67.24320983886719, "learning_rate": 7.765159999999999e-07, "loss": 0.5576, "step": 55900 }, { "epoch": 0.5595, "grad_norm": 8.551283836364746, "learning_rate": 7.76316e-07, "loss": 0.6247, "step": 55950 }, { "epoch": 0.56, "grad_norm": 33.97761535644531, "learning_rate": 7.76116e-07, "loss": 0.5779, "step": 56000 }, { "epoch": 0.5605, "grad_norm": 106.0393295288086, "learning_rate": 7.75916e-07, "loss": 0.5783, "step": 56050 }, { "epoch": 0.561, "grad_norm": 66.49019622802734, "learning_rate": 7.75716e-07, "loss": 0.5892, "step": 56100 }, { "epoch": 0.5615, "grad_norm": 70.9111557006836, "learning_rate": 7.755159999999999e-07, "loss": 0.521, "step": 56150 }, { "epoch": 0.562, "grad_norm": 27.3460750579834, "learning_rate": 7.753159999999999e-07, "loss": 0.4994, "step": 56200 }, { "epoch": 0.5625, "grad_norm": 26.073501586914062, "learning_rate": 7.75116e-07, "loss": 0.4284, "step": 56250 }, { "epoch": 0.563, "grad_norm": 10.379983901977539, "learning_rate": 7.74916e-07, "loss": 0.4984, "step": 56300 }, { "epoch": 0.5635, "grad_norm": 15.902315139770508, "learning_rate": 7.747159999999999e-07, "loss": 0.5109, "step": 56350 }, { "epoch": 0.564, "grad_norm": 13.556990623474121, "learning_rate": 7.74516e-07, "loss": 0.5017, "step": 56400 }, { "epoch": 0.5645, "grad_norm": 32.288238525390625, "learning_rate": 7.74316e-07, "loss": 0.4915, "step": 56450 }, { "epoch": 0.565, "grad_norm": 82.15554809570312, "learning_rate": 7.741159999999999e-07, "loss": 0.5898, "step": 56500 }, { "epoch": 0.5655, "grad_norm": 2.104949951171875, "learning_rate": 7.73916e-07, "loss": 0.4796, "step": 56550 }, { "epoch": 0.566, "grad_norm": 18.454978942871094, "learning_rate": 7.737159999999999e-07, "loss": 0.646, "step": 56600 }, { "epoch": 0.5665, "grad_norm": 1.2304694652557373, "learning_rate": 7.735159999999999e-07, "loss": 0.5653, "step": 56650 }, { "epoch": 0.567, "grad_norm": 87.61625671386719, "learning_rate": 7.73316e-07, "loss": 0.4816, "step": 56700 }, { "epoch": 0.5675, "grad_norm": 39.44472885131836, "learning_rate": 7.73116e-07, "loss": 0.4758, "step": 56750 }, { "epoch": 0.568, "grad_norm": 60.77321243286133, "learning_rate": 7.729160000000001e-07, "loss": 0.5649, "step": 56800 }, { "epoch": 0.5685, "grad_norm": 62.973541259765625, "learning_rate": 7.727159999999999e-07, "loss": 0.5331, "step": 56850 }, { "epoch": 0.569, "grad_norm": 3.172307252883911, "learning_rate": 7.725159999999999e-07, "loss": 0.529, "step": 56900 }, { "epoch": 0.5695, "grad_norm": 82.92523956298828, "learning_rate": 7.72316e-07, "loss": 0.4194, "step": 56950 }, { "epoch": 0.57, "grad_norm": 30.7186222076416, "learning_rate": 7.72116e-07, "loss": 0.4916, "step": 57000 }, { "epoch": 0.5705, "grad_norm": 21.11612319946289, "learning_rate": 7.71916e-07, "loss": 0.5155, "step": 57050 }, { "epoch": 0.571, "grad_norm": 109.33685302734375, "learning_rate": 7.71716e-07, "loss": 0.6148, "step": 57100 }, { "epoch": 0.5715, "grad_norm": 4.589968204498291, "learning_rate": 7.71516e-07, "loss": 0.4523, "step": 57150 }, { "epoch": 0.572, "grad_norm": 50.23842239379883, "learning_rate": 7.713159999999999e-07, "loss": 0.4689, "step": 57200 }, { "epoch": 0.5725, "grad_norm": 5.932251453399658, "learning_rate": 7.71116e-07, "loss": 0.4741, "step": 57250 }, { "epoch": 0.573, "grad_norm": 83.40533447265625, "learning_rate": 7.70916e-07, "loss": 0.522, "step": 57300 }, { "epoch": 0.5735, "grad_norm": 53.40946960449219, "learning_rate": 7.707159999999999e-07, "loss": 0.3644, "step": 57350 }, { "epoch": 0.574, "grad_norm": 107.14583587646484, "learning_rate": 7.70516e-07, "loss": 0.4899, "step": 57400 }, { "epoch": 0.5745, "grad_norm": 81.80699920654297, "learning_rate": 7.70316e-07, "loss": 0.4938, "step": 57450 }, { "epoch": 0.575, "grad_norm": 66.33548736572266, "learning_rate": 7.70116e-07, "loss": 0.5103, "step": 57500 }, { "epoch": 0.5755, "grad_norm": 55.853206634521484, "learning_rate": 7.69916e-07, "loss": 0.5101, "step": 57550 }, { "epoch": 0.576, "grad_norm": 1.4973876476287842, "learning_rate": 7.697159999999999e-07, "loss": 0.5066, "step": 57600 }, { "epoch": 0.5765, "grad_norm": 8.70065689086914, "learning_rate": 7.695159999999999e-07, "loss": 0.523, "step": 57650 }, { "epoch": 0.577, "grad_norm": 68.1734390258789, "learning_rate": 7.69316e-07, "loss": 0.5269, "step": 57700 }, { "epoch": 0.5775, "grad_norm": 47.97027587890625, "learning_rate": 7.69116e-07, "loss": 0.4298, "step": 57750 }, { "epoch": 0.578, "grad_norm": 0.9205309152603149, "learning_rate": 7.68916e-07, "loss": 0.4194, "step": 57800 }, { "epoch": 0.5785, "grad_norm": 73.67825317382812, "learning_rate": 7.68716e-07, "loss": 0.4472, "step": 57850 }, { "epoch": 0.579, "grad_norm": 41.70149612426758, "learning_rate": 7.685159999999999e-07, "loss": 0.4958, "step": 57900 }, { "epoch": 0.5795, "grad_norm": 37.12933349609375, "learning_rate": 7.683159999999999e-07, "loss": 0.3652, "step": 57950 }, { "epoch": 0.58, "grad_norm": 50.90730667114258, "learning_rate": 7.68116e-07, "loss": 0.4357, "step": 58000 }, { "epoch": 0.5805, "grad_norm": 84.36893463134766, "learning_rate": 7.67916e-07, "loss": 0.643, "step": 58050 }, { "epoch": 0.581, "grad_norm": 2.3979945182800293, "learning_rate": 7.677159999999999e-07, "loss": 0.4654, "step": 58100 }, { "epoch": 0.5815, "grad_norm": 82.98129272460938, "learning_rate": 7.67516e-07, "loss": 0.5124, "step": 58150 }, { "epoch": 0.582, "grad_norm": 68.0512924194336, "learning_rate": 7.67316e-07, "loss": 0.5489, "step": 58200 }, { "epoch": 0.5825, "grad_norm": 7.42828893661499, "learning_rate": 7.671159999999999e-07, "loss": 0.6616, "step": 58250 }, { "epoch": 0.583, "grad_norm": 16.994314193725586, "learning_rate": 7.66916e-07, "loss": 0.4413, "step": 58300 }, { "epoch": 0.5835, "grad_norm": 33.765499114990234, "learning_rate": 7.667159999999999e-07, "loss": 0.5723, "step": 58350 }, { "epoch": 0.584, "grad_norm": 114.72591400146484, "learning_rate": 7.665159999999999e-07, "loss": 0.4305, "step": 58400 }, { "epoch": 0.5845, "grad_norm": 60.763423919677734, "learning_rate": 7.66316e-07, "loss": 0.4434, "step": 58450 }, { "epoch": 0.585, "grad_norm": 70.40579223632812, "learning_rate": 7.66116e-07, "loss": 0.5024, "step": 58500 }, { "epoch": 0.5855, "grad_norm": 63.25678634643555, "learning_rate": 7.65916e-07, "loss": 0.4624, "step": 58550 }, { "epoch": 0.586, "grad_norm": 75.37684631347656, "learning_rate": 7.657159999999999e-07, "loss": 0.6335, "step": 58600 }, { "epoch": 0.5865, "grad_norm": 82.71162414550781, "learning_rate": 7.655159999999999e-07, "loss": 0.57, "step": 58650 }, { "epoch": 0.587, "grad_norm": 6.6550445556640625, "learning_rate": 7.653159999999999e-07, "loss": 0.6168, "step": 58700 }, { "epoch": 0.5875, "grad_norm": 21.841920852661133, "learning_rate": 7.65116e-07, "loss": 0.4213, "step": 58750 }, { "epoch": 0.588, "grad_norm": 67.11859893798828, "learning_rate": 7.64916e-07, "loss": 0.5094, "step": 58800 }, { "epoch": 0.5885, "grad_norm": 6.230730056762695, "learning_rate": 7.647159999999999e-07, "loss": 0.5381, "step": 58850 }, { "epoch": 0.589, "grad_norm": 90.90957641601562, "learning_rate": 7.64516e-07, "loss": 0.5987, "step": 58900 }, { "epoch": 0.5895, "grad_norm": 67.64148712158203, "learning_rate": 7.643159999999999e-07, "loss": 0.5335, "step": 58950 }, { "epoch": 0.59, "grad_norm": 82.5676040649414, "learning_rate": 7.641159999999999e-07, "loss": 0.524, "step": 59000 }, { "epoch": 0.5905, "grad_norm": 2.6406068801879883, "learning_rate": 7.63916e-07, "loss": 0.4817, "step": 59050 }, { "epoch": 0.591, "grad_norm": 44.6208381652832, "learning_rate": 7.637159999999999e-07, "loss": 0.5026, "step": 59100 }, { "epoch": 0.5915, "grad_norm": 90.23446655273438, "learning_rate": 7.635159999999999e-07, "loss": 0.5775, "step": 59150 }, { "epoch": 0.592, "grad_norm": 1.308499813079834, "learning_rate": 7.63316e-07, "loss": 0.5328, "step": 59200 }, { "epoch": 0.5925, "grad_norm": 97.81929016113281, "learning_rate": 7.63116e-07, "loss": 0.5452, "step": 59250 }, { "epoch": 0.593, "grad_norm": 21.65294647216797, "learning_rate": 7.629160000000001e-07, "loss": 0.5259, "step": 59300 }, { "epoch": 0.5935, "grad_norm": 15.141357421875, "learning_rate": 7.627159999999999e-07, "loss": 0.3815, "step": 59350 }, { "epoch": 0.594, "grad_norm": 62.881248474121094, "learning_rate": 7.625159999999999e-07, "loss": 0.5372, "step": 59400 }, { "epoch": 0.5945, "grad_norm": 2.6095540523529053, "learning_rate": 7.62316e-07, "loss": 0.4359, "step": 59450 }, { "epoch": 0.595, "grad_norm": 5.5010085105896, "learning_rate": 7.62116e-07, "loss": 0.4372, "step": 59500 }, { "epoch": 0.5955, "grad_norm": 35.81318664550781, "learning_rate": 7.61916e-07, "loss": 0.442, "step": 59550 }, { "epoch": 0.596, "grad_norm": 61.836212158203125, "learning_rate": 7.61716e-07, "loss": 0.6257, "step": 59600 }, { "epoch": 0.5965, "grad_norm": 89.80731201171875, "learning_rate": 7.61516e-07, "loss": 0.5622, "step": 59650 }, { "epoch": 0.597, "grad_norm": 59.49220657348633, "learning_rate": 7.613159999999999e-07, "loss": 0.4933, "step": 59700 }, { "epoch": 0.5975, "grad_norm": 64.80416107177734, "learning_rate": 7.61116e-07, "loss": 0.6054, "step": 59750 }, { "epoch": 0.598, "grad_norm": 42.769248962402344, "learning_rate": 7.60916e-07, "loss": 0.4669, "step": 59800 }, { "epoch": 0.5985, "grad_norm": 4.290161609649658, "learning_rate": 7.607159999999999e-07, "loss": 0.5286, "step": 59850 }, { "epoch": 0.599, "grad_norm": 50.70981216430664, "learning_rate": 7.60516e-07, "loss": 0.5784, "step": 59900 }, { "epoch": 0.5995, "grad_norm": 64.66263580322266, "learning_rate": 7.60316e-07, "loss": 0.558, "step": 59950 }, { "epoch": 0.6, "grad_norm": 56.63187026977539, "learning_rate": 7.60116e-07, "loss": 0.59, "step": 60000 }, { "epoch": 0.6005, "grad_norm": 53.35725402832031, "learning_rate": 7.59916e-07, "loss": 0.6365, "step": 60050 }, { "epoch": 0.601, "grad_norm": 30.334291458129883, "learning_rate": 7.597159999999999e-07, "loss": 0.5164, "step": 60100 }, { "epoch": 0.6015, "grad_norm": 9.290550231933594, "learning_rate": 7.595159999999999e-07, "loss": 0.515, "step": 60150 }, { "epoch": 0.602, "grad_norm": 64.14176940917969, "learning_rate": 7.59316e-07, "loss": 0.614, "step": 60200 }, { "epoch": 0.6025, "grad_norm": 75.271484375, "learning_rate": 7.59116e-07, "loss": 0.6181, "step": 60250 }, { "epoch": 0.603, "grad_norm": 4.262742042541504, "learning_rate": 7.58916e-07, "loss": 0.5013, "step": 60300 }, { "epoch": 0.6035, "grad_norm": 3.081693649291992, "learning_rate": 7.58716e-07, "loss": 0.5908, "step": 60350 }, { "epoch": 0.604, "grad_norm": 39.718345642089844, "learning_rate": 7.585159999999999e-07, "loss": 0.4868, "step": 60400 }, { "epoch": 0.6045, "grad_norm": 84.18822479248047, "learning_rate": 7.583159999999999e-07, "loss": 0.5012, "step": 60450 }, { "epoch": 0.605, "grad_norm": 90.46514892578125, "learning_rate": 7.58116e-07, "loss": 0.5695, "step": 60500 }, { "epoch": 0.6055, "grad_norm": 56.23940658569336, "learning_rate": 7.57916e-07, "loss": 0.5034, "step": 60550 }, { "epoch": 0.606, "grad_norm": 27.244321823120117, "learning_rate": 7.577159999999999e-07, "loss": 0.5065, "step": 60600 }, { "epoch": 0.6065, "grad_norm": 6.311286449432373, "learning_rate": 7.57516e-07, "loss": 0.5169, "step": 60650 }, { "epoch": 0.607, "grad_norm": 1.7884770631790161, "learning_rate": 7.57316e-07, "loss": 0.4025, "step": 60700 }, { "epoch": 0.6075, "grad_norm": 76.4052734375, "learning_rate": 7.571159999999999e-07, "loss": 0.5358, "step": 60750 }, { "epoch": 0.608, "grad_norm": 57.62378692626953, "learning_rate": 7.56916e-07, "loss": 0.5542, "step": 60800 }, { "epoch": 0.6085, "grad_norm": 1.3535525798797607, "learning_rate": 7.56716e-07, "loss": 0.6046, "step": 60850 }, { "epoch": 0.609, "grad_norm": 57.33085632324219, "learning_rate": 7.565159999999999e-07, "loss": 0.5285, "step": 60900 }, { "epoch": 0.6095, "grad_norm": 10.677825927734375, "learning_rate": 7.56316e-07, "loss": 0.4391, "step": 60950 }, { "epoch": 0.61, "grad_norm": 14.921430587768555, "learning_rate": 7.56116e-07, "loss": 0.4501, "step": 61000 }, { "epoch": 0.6105, "grad_norm": 3.2991600036621094, "learning_rate": 7.55916e-07, "loss": 0.5235, "step": 61050 }, { "epoch": 0.611, "grad_norm": 50.49160385131836, "learning_rate": 7.55716e-07, "loss": 0.4383, "step": 61100 }, { "epoch": 0.6115, "grad_norm": 93.91867065429688, "learning_rate": 7.555159999999999e-07, "loss": 0.558, "step": 61150 }, { "epoch": 0.612, "grad_norm": 15.711594581604004, "learning_rate": 7.553159999999999e-07, "loss": 0.5294, "step": 61200 }, { "epoch": 0.6125, "grad_norm": 9.569193840026855, "learning_rate": 7.5512e-07, "loss": 0.4918, "step": 61250 }, { "epoch": 0.613, "grad_norm": 36.533626556396484, "learning_rate": 7.5492e-07, "loss": 0.4458, "step": 61300 }, { "epoch": 0.6135, "grad_norm": 47.60615158081055, "learning_rate": 7.547199999999999e-07, "loss": 0.494, "step": 61350 }, { "epoch": 0.614, "grad_norm": 47.9034538269043, "learning_rate": 7.5452e-07, "loss": 0.5418, "step": 61400 }, { "epoch": 0.6145, "grad_norm": 55.0429573059082, "learning_rate": 7.543199999999999e-07, "loss": 0.6425, "step": 61450 }, { "epoch": 0.615, "grad_norm": 29.267080307006836, "learning_rate": 7.541199999999999e-07, "loss": 0.467, "step": 61500 }, { "epoch": 0.6155, "grad_norm": 47.8558235168457, "learning_rate": 7.5392e-07, "loss": 0.4311, "step": 61550 }, { "epoch": 0.616, "grad_norm": 63.829071044921875, "learning_rate": 7.537199999999999e-07, "loss": 0.564, "step": 61600 }, { "epoch": 0.6165, "grad_norm": 2.9663500785827637, "learning_rate": 7.535199999999999e-07, "loss": 0.5972, "step": 61650 }, { "epoch": 0.617, "grad_norm": 1.3793628215789795, "learning_rate": 7.5332e-07, "loss": 0.4365, "step": 61700 }, { "epoch": 0.6175, "grad_norm": 37.88240432739258, "learning_rate": 7.5312e-07, "loss": 0.596, "step": 61750 }, { "epoch": 0.618, "grad_norm": 71.33352661132812, "learning_rate": 7.5292e-07, "loss": 0.4351, "step": 61800 }, { "epoch": 0.6185, "grad_norm": 59.38632583618164, "learning_rate": 7.527199999999999e-07, "loss": 0.4588, "step": 61850 }, { "epoch": 0.619, "grad_norm": 59.172969818115234, "learning_rate": 7.525199999999999e-07, "loss": 0.5131, "step": 61900 }, { "epoch": 0.6195, "grad_norm": 11.83280086517334, "learning_rate": 7.5232e-07, "loss": 0.5119, "step": 61950 }, { "epoch": 0.62, "grad_norm": 64.36370849609375, "learning_rate": 7.5212e-07, "loss": 0.4334, "step": 62000 }, { "epoch": 0.6205, "grad_norm": 63.02851867675781, "learning_rate": 7.5192e-07, "loss": 0.4878, "step": 62050 }, { "epoch": 0.621, "grad_norm": 1.058779001235962, "learning_rate": 7.517200000000001e-07, "loss": 0.5113, "step": 62100 }, { "epoch": 0.6215, "grad_norm": 2.1751315593719482, "learning_rate": 7.515199999999999e-07, "loss": 0.4869, "step": 62150 }, { "epoch": 0.622, "grad_norm": 100.90425109863281, "learning_rate": 7.513199999999999e-07, "loss": 0.5549, "step": 62200 }, { "epoch": 0.6225, "grad_norm": 116.42280578613281, "learning_rate": 7.5112e-07, "loss": 0.4273, "step": 62250 }, { "epoch": 0.623, "grad_norm": 81.34993743896484, "learning_rate": 7.5092e-07, "loss": 0.5264, "step": 62300 }, { "epoch": 0.6235, "grad_norm": 27.30585289001465, "learning_rate": 7.5072e-07, "loss": 0.5647, "step": 62350 }, { "epoch": 0.624, "grad_norm": 8.460829734802246, "learning_rate": 7.5052e-07, "loss": 0.5671, "step": 62400 }, { "epoch": 0.6245, "grad_norm": 23.786176681518555, "learning_rate": 7.5032e-07, "loss": 0.4409, "step": 62450 }, { "epoch": 0.625, "grad_norm": 100.85468292236328, "learning_rate": 7.501199999999999e-07, "loss": 0.5269, "step": 62500 }, { "epoch": 0.6255, "grad_norm": 55.07974624633789, "learning_rate": 7.4992e-07, "loss": 0.3994, "step": 62550 }, { "epoch": 0.626, "grad_norm": 50.99776840209961, "learning_rate": 7.4972e-07, "loss": 0.5842, "step": 62600 }, { "epoch": 0.6265, "grad_norm": 6.158398151397705, "learning_rate": 7.495199999999999e-07, "loss": 0.4597, "step": 62650 }, { "epoch": 0.627, "grad_norm": 47.69706344604492, "learning_rate": 7.4932e-07, "loss": 0.5979, "step": 62700 }, { "epoch": 0.6275, "grad_norm": 42.70079040527344, "learning_rate": 7.4912e-07, "loss": 0.4116, "step": 62750 }, { "epoch": 0.628, "grad_norm": 109.9066162109375, "learning_rate": 7.4892e-07, "loss": 0.4997, "step": 62800 }, { "epoch": 0.6285, "grad_norm": 51.17443084716797, "learning_rate": 7.4872e-07, "loss": 0.5929, "step": 62850 }, { "epoch": 0.629, "grad_norm": 24.46744155883789, "learning_rate": 7.485199999999999e-07, "loss": 0.5733, "step": 62900 }, { "epoch": 0.6295, "grad_norm": 16.23428726196289, "learning_rate": 7.483199999999999e-07, "loss": 0.3725, "step": 62950 }, { "epoch": 0.63, "grad_norm": 4.631766319274902, "learning_rate": 7.4812e-07, "loss": 0.5679, "step": 63000 }, { "epoch": 0.6305, "grad_norm": 127.42314147949219, "learning_rate": 7.4792e-07, "loss": 0.4697, "step": 63050 }, { "epoch": 0.631, "grad_norm": 85.11431121826172, "learning_rate": 7.4772e-07, "loss": 0.4926, "step": 63100 }, { "epoch": 0.6315, "grad_norm": 18.65448570251465, "learning_rate": 7.4752e-07, "loss": 0.3696, "step": 63150 }, { "epoch": 0.632, "grad_norm": 52.924503326416016, "learning_rate": 7.473199999999999e-07, "loss": 0.6384, "step": 63200 }, { "epoch": 0.6325, "grad_norm": 81.14250946044922, "learning_rate": 7.471199999999999e-07, "loss": 0.4821, "step": 63250 }, { "epoch": 0.633, "grad_norm": 36.8980827331543, "learning_rate": 7.4692e-07, "loss": 0.456, "step": 63300 }, { "epoch": 0.6335, "grad_norm": 39.605323791503906, "learning_rate": 7.4672e-07, "loss": 0.4922, "step": 63350 }, { "epoch": 0.634, "grad_norm": 31.341976165771484, "learning_rate": 7.465199999999999e-07, "loss": 0.466, "step": 63400 }, { "epoch": 0.6345, "grad_norm": 87.95755004882812, "learning_rate": 7.4632e-07, "loss": 0.3872, "step": 63450 }, { "epoch": 0.635, "grad_norm": 59.00345993041992, "learning_rate": 7.4612e-07, "loss": 0.5011, "step": 63500 }, { "epoch": 0.6355, "grad_norm": 31.053945541381836, "learning_rate": 7.459199999999999e-07, "loss": 0.6692, "step": 63550 }, { "epoch": 0.636, "grad_norm": 47.0079345703125, "learning_rate": 7.45724e-07, "loss": 0.6124, "step": 63600 }, { "epoch": 0.6365, "grad_norm": 49.89374542236328, "learning_rate": 7.455239999999999e-07, "loss": 0.4711, "step": 63650 }, { "epoch": 0.637, "grad_norm": 93.30340576171875, "learning_rate": 7.453239999999999e-07, "loss": 0.5194, "step": 63700 }, { "epoch": 0.6375, "grad_norm": 95.00665283203125, "learning_rate": 7.45124e-07, "loss": 0.5065, "step": 63750 }, { "epoch": 0.638, "grad_norm": 50.189510345458984, "learning_rate": 7.44924e-07, "loss": 0.3856, "step": 63800 }, { "epoch": 0.6385, "grad_norm": 37.73897933959961, "learning_rate": 7.447240000000001e-07, "loss": 0.4225, "step": 63850 }, { "epoch": 0.639, "grad_norm": 28.45961570739746, "learning_rate": 7.44524e-07, "loss": 0.5747, "step": 63900 }, { "epoch": 0.6395, "grad_norm": 44.61785888671875, "learning_rate": 7.443239999999999e-07, "loss": 0.6641, "step": 63950 }, { "epoch": 0.64, "grad_norm": 48.99622344970703, "learning_rate": 7.44124e-07, "loss": 0.6024, "step": 64000 }, { "epoch": 0.6405, "grad_norm": 22.556676864624023, "learning_rate": 7.43924e-07, "loss": 0.5411, "step": 64050 }, { "epoch": 0.641, "grad_norm": 72.16722106933594, "learning_rate": 7.43724e-07, "loss": 0.5184, "step": 64100 }, { "epoch": 0.6415, "grad_norm": 32.8302116394043, "learning_rate": 7.43524e-07, "loss": 0.4113, "step": 64150 }, { "epoch": 0.642, "grad_norm": 47.202213287353516, "learning_rate": 7.43324e-07, "loss": 0.6019, "step": 64200 }, { "epoch": 0.6425, "grad_norm": 58.76848602294922, "learning_rate": 7.43124e-07, "loss": 0.5887, "step": 64250 }, { "epoch": 0.643, "grad_norm": 54.78934097290039, "learning_rate": 7.42924e-07, "loss": 0.5743, "step": 64300 }, { "epoch": 0.6435, "grad_norm": 16.40074348449707, "learning_rate": 7.42724e-07, "loss": 0.531, "step": 64350 }, { "epoch": 0.644, "grad_norm": 30.374088287353516, "learning_rate": 7.425239999999999e-07, "loss": 0.3552, "step": 64400 }, { "epoch": 0.6445, "grad_norm": 14.19670295715332, "learning_rate": 7.42324e-07, "loss": 0.6266, "step": 64450 }, { "epoch": 0.645, "grad_norm": 86.16085052490234, "learning_rate": 7.42124e-07, "loss": 0.5935, "step": 64500 }, { "epoch": 0.6455, "grad_norm": 84.91423797607422, "learning_rate": 7.41924e-07, "loss": 0.4733, "step": 64550 }, { "epoch": 0.646, "grad_norm": 72.64830780029297, "learning_rate": 7.417240000000001e-07, "loss": 0.4532, "step": 64600 }, { "epoch": 0.6465, "grad_norm": 6.822329998016357, "learning_rate": 7.415239999999999e-07, "loss": 0.4741, "step": 64650 }, { "epoch": 0.647, "grad_norm": 36.594993591308594, "learning_rate": 7.413239999999999e-07, "loss": 0.531, "step": 64700 }, { "epoch": 0.6475, "grad_norm": 75.4520263671875, "learning_rate": 7.41124e-07, "loss": 0.4991, "step": 64750 }, { "epoch": 0.648, "grad_norm": 44.896141052246094, "learning_rate": 7.40924e-07, "loss": 0.4959, "step": 64800 }, { "epoch": 0.6485, "grad_norm": 47.80727767944336, "learning_rate": 7.40724e-07, "loss": 0.5498, "step": 64850 }, { "epoch": 0.649, "grad_norm": 2.48860502243042, "learning_rate": 7.40524e-07, "loss": 0.5012, "step": 64900 }, { "epoch": 0.6495, "grad_norm": 16.638439178466797, "learning_rate": 7.40324e-07, "loss": 0.6177, "step": 64950 }, { "epoch": 0.65, "grad_norm": 44.449302673339844, "learning_rate": 7.401239999999999e-07, "loss": 0.4293, "step": 65000 }, { "epoch": 0.6505, "grad_norm": 161.3533935546875, "learning_rate": 7.39924e-07, "loss": 0.4377, "step": 65050 }, { "epoch": 0.651, "grad_norm": 12.957307815551758, "learning_rate": 7.39724e-07, "loss": 0.5337, "step": 65100 }, { "epoch": 0.6515, "grad_norm": 13.819578170776367, "learning_rate": 7.395239999999999e-07, "loss": 0.4665, "step": 65150 }, { "epoch": 0.652, "grad_norm": 108.1066665649414, "learning_rate": 7.39324e-07, "loss": 0.61, "step": 65200 }, { "epoch": 0.6525, "grad_norm": 18.782867431640625, "learning_rate": 7.39124e-07, "loss": 0.5585, "step": 65250 }, { "epoch": 0.653, "grad_norm": 14.486738204956055, "learning_rate": 7.38924e-07, "loss": 0.5132, "step": 65300 }, { "epoch": 0.6535, "grad_norm": 28.351959228515625, "learning_rate": 7.38724e-07, "loss": 0.4924, "step": 65350 }, { "epoch": 0.654, "grad_norm": 49.564491271972656, "learning_rate": 7.385239999999999e-07, "loss": 0.485, "step": 65400 }, { "epoch": 0.6545, "grad_norm": 55.472877502441406, "learning_rate": 7.383239999999999e-07, "loss": 0.5007, "step": 65450 }, { "epoch": 0.655, "grad_norm": 37.1424446105957, "learning_rate": 7.38124e-07, "loss": 0.518, "step": 65500 }, { "epoch": 0.6555, "grad_norm": 30.962854385375977, "learning_rate": 7.37924e-07, "loss": 0.5072, "step": 65550 }, { "epoch": 0.656, "grad_norm": 63.9365348815918, "learning_rate": 7.37724e-07, "loss": 0.5801, "step": 65600 }, { "epoch": 0.6565, "grad_norm": 6.695558547973633, "learning_rate": 7.37524e-07, "loss": 0.6023, "step": 65650 }, { "epoch": 0.657, "grad_norm": 26.7724666595459, "learning_rate": 7.373239999999999e-07, "loss": 0.5249, "step": 65700 }, { "epoch": 0.6575, "grad_norm": 81.64033508300781, "learning_rate": 7.371239999999999e-07, "loss": 0.5675, "step": 65750 }, { "epoch": 0.658, "grad_norm": 32.74658966064453, "learning_rate": 7.36924e-07, "loss": 0.5071, "step": 65800 }, { "epoch": 0.6585, "grad_norm": 58.301597595214844, "learning_rate": 7.36724e-07, "loss": 0.5389, "step": 65850 }, { "epoch": 0.659, "grad_norm": 53.7860221862793, "learning_rate": 7.365279999999999e-07, "loss": 0.4365, "step": 65900 }, { "epoch": 0.6595, "grad_norm": 20.44268798828125, "learning_rate": 7.36328e-07, "loss": 0.4764, "step": 65950 }, { "epoch": 0.66, "grad_norm": 23.711496353149414, "learning_rate": 7.36128e-07, "loss": 0.4768, "step": 66000 }, { "epoch": 0.6605, "grad_norm": 61.34829330444336, "learning_rate": 7.359279999999999e-07, "loss": 0.3809, "step": 66050 }, { "epoch": 0.661, "grad_norm": 95.46158599853516, "learning_rate": 7.35732e-07, "loss": 0.4626, "step": 66100 }, { "epoch": 0.6615, "grad_norm": 50.04861068725586, "learning_rate": 7.355319999999999e-07, "loss": 0.6196, "step": 66150 }, { "epoch": 0.662, "grad_norm": 111.31083679199219, "learning_rate": 7.353319999999999e-07, "loss": 0.4418, "step": 66200 }, { "epoch": 0.6625, "grad_norm": 0.3906168043613434, "learning_rate": 7.35132e-07, "loss": 0.3852, "step": 66250 }, { "epoch": 0.663, "grad_norm": 45.034507751464844, "learning_rate": 7.34932e-07, "loss": 0.4873, "step": 66300 }, { "epoch": 0.6635, "grad_norm": 39.52790069580078, "learning_rate": 7.347320000000001e-07, "loss": 0.332, "step": 66350 }, { "epoch": 0.664, "grad_norm": 12.687024116516113, "learning_rate": 7.34532e-07, "loss": 0.4187, "step": 66400 }, { "epoch": 0.6645, "grad_norm": 79.7389144897461, "learning_rate": 7.343319999999999e-07, "loss": 0.3296, "step": 66450 }, { "epoch": 0.665, "grad_norm": 99.39762115478516, "learning_rate": 7.34132e-07, "loss": 0.44, "step": 66500 }, { "epoch": 0.6655, "grad_norm": 1.2789992094039917, "learning_rate": 7.33932e-07, "loss": 0.4709, "step": 66550 }, { "epoch": 0.666, "grad_norm": 102.41425323486328, "learning_rate": 7.33732e-07, "loss": 0.4785, "step": 66600 }, { "epoch": 0.6665, "grad_norm": 1.850731611251831, "learning_rate": 7.33532e-07, "loss": 0.5505, "step": 66650 }, { "epoch": 0.667, "grad_norm": 6.710739612579346, "learning_rate": 7.33332e-07, "loss": 0.4497, "step": 66700 }, { "epoch": 0.6675, "grad_norm": 27.49475860595703, "learning_rate": 7.33132e-07, "loss": 0.4216, "step": 66750 }, { "epoch": 0.668, "grad_norm": 109.29817962646484, "learning_rate": 7.32932e-07, "loss": 0.5599, "step": 66800 }, { "epoch": 0.6685, "grad_norm": 2.05843186378479, "learning_rate": 7.32732e-07, "loss": 0.4372, "step": 66850 }, { "epoch": 0.669, "grad_norm": 19.476158142089844, "learning_rate": 7.325319999999999e-07, "loss": 0.573, "step": 66900 }, { "epoch": 0.6695, "grad_norm": 78.01306915283203, "learning_rate": 7.32332e-07, "loss": 0.3769, "step": 66950 }, { "epoch": 0.67, "grad_norm": 64.33741760253906, "learning_rate": 7.32132e-07, "loss": 0.4052, "step": 67000 }, { "epoch": 0.6705, "grad_norm": 2.576007843017578, "learning_rate": 7.31932e-07, "loss": 0.5676, "step": 67050 }, { "epoch": 0.671, "grad_norm": 22.989377975463867, "learning_rate": 7.317320000000001e-07, "loss": 0.511, "step": 67100 }, { "epoch": 0.6715, "grad_norm": 0.35649439692497253, "learning_rate": 7.315319999999999e-07, "loss": 0.5617, "step": 67150 }, { "epoch": 0.672, "grad_norm": 95.40701293945312, "learning_rate": 7.313319999999999e-07, "loss": 0.5461, "step": 67200 }, { "epoch": 0.6725, "grad_norm": 80.1086196899414, "learning_rate": 7.31132e-07, "loss": 0.6098, "step": 67250 }, { "epoch": 0.673, "grad_norm": 119.30028533935547, "learning_rate": 7.30932e-07, "loss": 0.4937, "step": 67300 }, { "epoch": 0.6735, "grad_norm": 38.92840576171875, "learning_rate": 7.30732e-07, "loss": 0.4933, "step": 67350 }, { "epoch": 0.674, "grad_norm": 113.92001342773438, "learning_rate": 7.30532e-07, "loss": 0.421, "step": 67400 }, { "epoch": 0.6745, "grad_norm": 0.898191511631012, "learning_rate": 7.30332e-07, "loss": 0.4647, "step": 67450 }, { "epoch": 0.675, "grad_norm": 62.35942077636719, "learning_rate": 7.301319999999999e-07, "loss": 0.5297, "step": 67500 }, { "epoch": 0.6755, "grad_norm": 10.209319114685059, "learning_rate": 7.29932e-07, "loss": 0.4762, "step": 67550 }, { "epoch": 0.676, "grad_norm": 84.57727813720703, "learning_rate": 7.29732e-07, "loss": 0.6114, "step": 67600 }, { "epoch": 0.6765, "grad_norm": 60.19552230834961, "learning_rate": 7.295319999999999e-07, "loss": 0.499, "step": 67650 }, { "epoch": 0.677, "grad_norm": 12.361583709716797, "learning_rate": 7.29332e-07, "loss": 0.3799, "step": 67700 }, { "epoch": 0.6775, "grad_norm": 72.01377868652344, "learning_rate": 7.29132e-07, "loss": 0.505, "step": 67750 }, { "epoch": 0.678, "grad_norm": 66.44666290283203, "learning_rate": 7.28932e-07, "loss": 0.6133, "step": 67800 }, { "epoch": 0.6785, "grad_norm": 50.07161331176758, "learning_rate": 7.28732e-07, "loss": 0.4261, "step": 67850 }, { "epoch": 0.679, "grad_norm": 104.22811889648438, "learning_rate": 7.285319999999999e-07, "loss": 0.5914, "step": 67900 }, { "epoch": 0.6795, "grad_norm": 6.429470062255859, "learning_rate": 7.283319999999999e-07, "loss": 0.345, "step": 67950 }, { "epoch": 0.68, "grad_norm": 65.51649475097656, "learning_rate": 7.28132e-07, "loss": 0.4633, "step": 68000 }, { "epoch": 0.6805, "grad_norm": 2.1752583980560303, "learning_rate": 7.27932e-07, "loss": 0.4383, "step": 68050 }, { "epoch": 0.681, "grad_norm": 22.099319458007812, "learning_rate": 7.27732e-07, "loss": 0.5433, "step": 68100 }, { "epoch": 0.6815, "grad_norm": 117.40200805664062, "learning_rate": 7.27532e-07, "loss": 0.5817, "step": 68150 }, { "epoch": 0.682, "grad_norm": 57.13771057128906, "learning_rate": 7.273319999999999e-07, "loss": 0.4602, "step": 68200 }, { "epoch": 0.6825, "grad_norm": 16.235754013061523, "learning_rate": 7.271319999999999e-07, "loss": 0.6009, "step": 68250 }, { "epoch": 0.683, "grad_norm": 6.888073921203613, "learning_rate": 7.26932e-07, "loss": 0.4454, "step": 68300 }, { "epoch": 0.6835, "grad_norm": 4.0116705894470215, "learning_rate": 7.26732e-07, "loss": 0.3861, "step": 68350 }, { "epoch": 0.684, "grad_norm": 36.852413177490234, "learning_rate": 7.265319999999999e-07, "loss": 0.6036, "step": 68400 }, { "epoch": 0.6845, "grad_norm": 23.3376522064209, "learning_rate": 7.26332e-07, "loss": 0.4487, "step": 68450 }, { "epoch": 0.685, "grad_norm": 12.914100646972656, "learning_rate": 7.26132e-07, "loss": 0.5701, "step": 68500 }, { "epoch": 0.6855, "grad_norm": 39.22294616699219, "learning_rate": 7.259319999999999e-07, "loss": 0.6363, "step": 68550 }, { "epoch": 0.686, "grad_norm": 38.24320602416992, "learning_rate": 7.25732e-07, "loss": 0.3829, "step": 68600 }, { "epoch": 0.6865, "grad_norm": 28.683549880981445, "learning_rate": 7.255319999999999e-07, "loss": 0.444, "step": 68650 }, { "epoch": 0.687, "grad_norm": 14.346872329711914, "learning_rate": 7.253319999999999e-07, "loss": 0.5185, "step": 68700 }, { "epoch": 0.6875, "grad_norm": 0.12592576444149017, "learning_rate": 7.25132e-07, "loss": 0.512, "step": 68750 }, { "epoch": 0.688, "grad_norm": 21.978870391845703, "learning_rate": 7.24932e-07, "loss": 0.4442, "step": 68800 }, { "epoch": 0.6885, "grad_norm": 0.3890920579433441, "learning_rate": 7.247320000000001e-07, "loss": 0.4323, "step": 68850 }, { "epoch": 0.689, "grad_norm": 46.296112060546875, "learning_rate": 7.245319999999999e-07, "loss": 0.5979, "step": 68900 }, { "epoch": 0.6895, "grad_norm": 3.0906097888946533, "learning_rate": 7.243319999999999e-07, "loss": 0.4466, "step": 68950 }, { "epoch": 0.69, "grad_norm": 113.38265991210938, "learning_rate": 7.24132e-07, "loss": 0.4887, "step": 69000 }, { "epoch": 0.6905, "grad_norm": 81.5102310180664, "learning_rate": 7.23932e-07, "loss": 0.6562, "step": 69050 }, { "epoch": 0.691, "grad_norm": 0.32676833868026733, "learning_rate": 7.23732e-07, "loss": 0.4413, "step": 69100 }, { "epoch": 0.6915, "grad_norm": 2.3341333866119385, "learning_rate": 7.23532e-07, "loss": 0.4187, "step": 69150 }, { "epoch": 0.692, "grad_norm": 51.32745361328125, "learning_rate": 7.23332e-07, "loss": 0.5124, "step": 69200 }, { "epoch": 0.6925, "grad_norm": 106.96540069580078, "learning_rate": 7.231319999999999e-07, "loss": 0.4508, "step": 69250 }, { "epoch": 0.693, "grad_norm": 28.311433792114258, "learning_rate": 7.22932e-07, "loss": 0.4263, "step": 69300 }, { "epoch": 0.6935, "grad_norm": 31.820341110229492, "learning_rate": 7.22732e-07, "loss": 0.5164, "step": 69350 }, { "epoch": 0.694, "grad_norm": 56.47795867919922, "learning_rate": 7.225319999999999e-07, "loss": 0.535, "step": 69400 }, { "epoch": 0.6945, "grad_norm": 10.806572914123535, "learning_rate": 7.22332e-07, "loss": 0.4787, "step": 69450 }, { "epoch": 0.695, "grad_norm": 41.71363067626953, "learning_rate": 7.22132e-07, "loss": 0.5698, "step": 69500 }, { "epoch": 0.6955, "grad_norm": 14.216658592224121, "learning_rate": 7.21932e-07, "loss": 0.4295, "step": 69550 }, { "epoch": 0.696, "grad_norm": 73.08179473876953, "learning_rate": 7.21732e-07, "loss": 0.4647, "step": 69600 }, { "epoch": 0.6965, "grad_norm": 32.81566619873047, "learning_rate": 7.215319999999999e-07, "loss": 0.4516, "step": 69650 }, { "epoch": 0.697, "grad_norm": 74.85893249511719, "learning_rate": 7.213319999999999e-07, "loss": 0.5258, "step": 69700 }, { "epoch": 0.6975, "grad_norm": 15.480995178222656, "learning_rate": 7.21132e-07, "loss": 0.48, "step": 69750 }, { "epoch": 0.698, "grad_norm": 36.054656982421875, "learning_rate": 7.20932e-07, "loss": 0.5223, "step": 69800 }, { "epoch": 0.6985, "grad_norm": 6.590498447418213, "learning_rate": 7.20732e-07, "loss": 0.6518, "step": 69850 }, { "epoch": 0.699, "grad_norm": 34.523651123046875, "learning_rate": 7.20532e-07, "loss": 0.6054, "step": 69900 }, { "epoch": 0.6995, "grad_norm": 413.315185546875, "learning_rate": 7.203319999999999e-07, "loss": 0.3861, "step": 69950 }, { "epoch": 0.7, "grad_norm": 28.01694679260254, "learning_rate": 7.201319999999999e-07, "loss": 0.5878, "step": 70000 }, { "epoch": 0.7005, "grad_norm": 4.377991676330566, "learning_rate": 7.19932e-07, "loss": 0.5515, "step": 70050 }, { "epoch": 0.701, "grad_norm": 55.94136428833008, "learning_rate": 7.19732e-07, "loss": 0.4695, "step": 70100 }, { "epoch": 0.7015, "grad_norm": 68.08269500732422, "learning_rate": 7.195359999999999e-07, "loss": 0.5501, "step": 70150 }, { "epoch": 0.702, "grad_norm": 13.8864107131958, "learning_rate": 7.19336e-07, "loss": 0.4736, "step": 70200 }, { "epoch": 0.7025, "grad_norm": 12.973597526550293, "learning_rate": 7.19136e-07, "loss": 0.6711, "step": 70250 }, { "epoch": 0.703, "grad_norm": 59.53081512451172, "learning_rate": 7.189359999999999e-07, "loss": 0.6633, "step": 70300 }, { "epoch": 0.7035, "grad_norm": 33.69902420043945, "learning_rate": 7.18736e-07, "loss": 0.5304, "step": 70350 }, { "epoch": 0.704, "grad_norm": 4.359379768371582, "learning_rate": 7.185359999999999e-07, "loss": 0.5243, "step": 70400 }, { "epoch": 0.7045, "grad_norm": 42.60222625732422, "learning_rate": 7.183359999999999e-07, "loss": 0.4602, "step": 70450 }, { "epoch": 0.705, "grad_norm": 1.3176884651184082, "learning_rate": 7.18136e-07, "loss": 0.4459, "step": 70500 }, { "epoch": 0.7055, "grad_norm": 3.021962881088257, "learning_rate": 7.17936e-07, "loss": 0.4491, "step": 70550 }, { "epoch": 0.706, "grad_norm": 274.6659851074219, "learning_rate": 7.17736e-07, "loss": 0.5348, "step": 70600 }, { "epoch": 0.7065, "grad_norm": 59.16705322265625, "learning_rate": 7.17536e-07, "loss": 0.4527, "step": 70650 }, { "epoch": 0.707, "grad_norm": 19.327486038208008, "learning_rate": 7.173359999999999e-07, "loss": 0.6739, "step": 70700 }, { "epoch": 0.7075, "grad_norm": 10.360088348388672, "learning_rate": 7.171359999999999e-07, "loss": 0.621, "step": 70750 }, { "epoch": 0.708, "grad_norm": 65.7010726928711, "learning_rate": 7.16936e-07, "loss": 0.3637, "step": 70800 }, { "epoch": 0.7085, "grad_norm": 7.891746997833252, "learning_rate": 7.16736e-07, "loss": 0.4372, "step": 70850 }, { "epoch": 0.709, "grad_norm": 46.58807373046875, "learning_rate": 7.165359999999999e-07, "loss": 0.4355, "step": 70900 }, { "epoch": 0.7095, "grad_norm": 82.19285583496094, "learning_rate": 7.16336e-07, "loss": 0.3844, "step": 70950 }, { "epoch": 0.71, "grad_norm": 44.19703674316406, "learning_rate": 7.16136e-07, "loss": 0.4022, "step": 71000 }, { "epoch": 0.7105, "grad_norm": 64.81764221191406, "learning_rate": 7.159359999999999e-07, "loss": 0.4597, "step": 71050 }, { "epoch": 0.711, "grad_norm": 4.582440376281738, "learning_rate": 7.15736e-07, "loss": 0.3086, "step": 71100 }, { "epoch": 0.7115, "grad_norm": 95.07144165039062, "learning_rate": 7.155359999999999e-07, "loss": 0.4794, "step": 71150 }, { "epoch": 0.712, "grad_norm": 47.082794189453125, "learning_rate": 7.153359999999999e-07, "loss": 0.5136, "step": 71200 }, { "epoch": 0.7125, "grad_norm": 58.89522171020508, "learning_rate": 7.15136e-07, "loss": 0.4111, "step": 71250 }, { "epoch": 0.713, "grad_norm": 76.09228515625, "learning_rate": 7.14936e-07, "loss": 0.493, "step": 71300 }, { "epoch": 0.7135, "grad_norm": 7.7800211906433105, "learning_rate": 7.147360000000001e-07, "loss": 0.3391, "step": 71350 }, { "epoch": 0.714, "grad_norm": 24.84239959716797, "learning_rate": 7.145359999999999e-07, "loss": 0.5557, "step": 71400 }, { "epoch": 0.7145, "grad_norm": 1.11128568649292, "learning_rate": 7.143359999999999e-07, "loss": 0.4902, "step": 71450 }, { "epoch": 0.715, "grad_norm": 67.38085174560547, "learning_rate": 7.14136e-07, "loss": 0.5157, "step": 71500 }, { "epoch": 0.7155, "grad_norm": 58.82173538208008, "learning_rate": 7.13936e-07, "loss": 0.4311, "step": 71550 }, { "epoch": 0.716, "grad_norm": 10.34243106842041, "learning_rate": 7.13736e-07, "loss": 0.5631, "step": 71600 }, { "epoch": 0.7165, "grad_norm": 31.23766326904297, "learning_rate": 7.13536e-07, "loss": 0.4615, "step": 71650 }, { "epoch": 0.717, "grad_norm": 90.4206314086914, "learning_rate": 7.13336e-07, "loss": 0.6139, "step": 71700 }, { "epoch": 0.7175, "grad_norm": 17.178178787231445, "learning_rate": 7.131359999999999e-07, "loss": 0.5126, "step": 71750 }, { "epoch": 0.718, "grad_norm": 60.4676399230957, "learning_rate": 7.12936e-07, "loss": 0.6, "step": 71800 }, { "epoch": 0.7185, "grad_norm": 1.8041740655899048, "learning_rate": 7.12736e-07, "loss": 0.4853, "step": 71850 }, { "epoch": 0.719, "grad_norm": 60.83064270019531, "learning_rate": 7.125359999999999e-07, "loss": 0.6092, "step": 71900 }, { "epoch": 0.7195, "grad_norm": 59.2396354675293, "learning_rate": 7.12336e-07, "loss": 0.5124, "step": 71950 }, { "epoch": 0.72, "grad_norm": 66.30610656738281, "learning_rate": 7.12136e-07, "loss": 0.5062, "step": 72000 }, { "epoch": 0.7205, "grad_norm": 116.58683013916016, "learning_rate": 7.11936e-07, "loss": 0.561, "step": 72050 }, { "epoch": 0.721, "grad_norm": 64.6193618774414, "learning_rate": 7.11736e-07, "loss": 0.5903, "step": 72100 }, { "epoch": 0.7215, "grad_norm": 29.871625900268555, "learning_rate": 7.115359999999999e-07, "loss": 0.3967, "step": 72150 }, { "epoch": 0.722, "grad_norm": 48.294960021972656, "learning_rate": 7.113359999999999e-07, "loss": 0.4418, "step": 72200 }, { "epoch": 0.7225, "grad_norm": 1.5724999904632568, "learning_rate": 7.11136e-07, "loss": 0.5528, "step": 72250 }, { "epoch": 0.723, "grad_norm": 10.528818130493164, "learning_rate": 7.10936e-07, "loss": 0.3585, "step": 72300 }, { "epoch": 0.7235, "grad_norm": 26.029348373413086, "learning_rate": 7.10736e-07, "loss": 0.5177, "step": 72350 }, { "epoch": 0.724, "grad_norm": 92.28126525878906, "learning_rate": 7.10536e-07, "loss": 0.3724, "step": 72400 }, { "epoch": 0.7245, "grad_norm": 14.755386352539062, "learning_rate": 7.103359999999999e-07, "loss": 0.4839, "step": 72450 }, { "epoch": 0.725, "grad_norm": 32.792755126953125, "learning_rate": 7.101359999999999e-07, "loss": 0.5051, "step": 72500 }, { "epoch": 0.7255, "grad_norm": 44.9954948425293, "learning_rate": 7.09936e-07, "loss": 0.572, "step": 72550 }, { "epoch": 0.726, "grad_norm": 61.472633361816406, "learning_rate": 7.09736e-07, "loss": 0.3723, "step": 72600 }, { "epoch": 0.7265, "grad_norm": 56.659156799316406, "learning_rate": 7.095359999999999e-07, "loss": 0.5166, "step": 72650 }, { "epoch": 0.727, "grad_norm": 66.07804107666016, "learning_rate": 7.09336e-07, "loss": 0.5227, "step": 72700 }, { "epoch": 0.7275, "grad_norm": 18.027997970581055, "learning_rate": 7.09136e-07, "loss": 0.5235, "step": 72750 }, { "epoch": 0.728, "grad_norm": 71.68598937988281, "learning_rate": 7.089359999999999e-07, "loss": 0.4355, "step": 72800 }, { "epoch": 0.7285, "grad_norm": 0.8782522082328796, "learning_rate": 7.08736e-07, "loss": 0.4416, "step": 72850 }, { "epoch": 0.729, "grad_norm": 66.14490509033203, "learning_rate": 7.08536e-07, "loss": 0.4943, "step": 72900 }, { "epoch": 0.7295, "grad_norm": 17.617177963256836, "learning_rate": 7.083359999999999e-07, "loss": 0.597, "step": 72950 }, { "epoch": 0.73, "grad_norm": 95.89069366455078, "learning_rate": 7.08136e-07, "loss": 0.5914, "step": 73000 }, { "epoch": 0.7305, "grad_norm": 6.234703063964844, "learning_rate": 7.07936e-07, "loss": 0.432, "step": 73050 }, { "epoch": 0.731, "grad_norm": 81.79715728759766, "learning_rate": 7.07736e-07, "loss": 0.5353, "step": 73100 }, { "epoch": 0.7315, "grad_norm": 68.71401977539062, "learning_rate": 7.07536e-07, "loss": 0.4773, "step": 73150 }, { "epoch": 0.732, "grad_norm": 20.033000946044922, "learning_rate": 7.073359999999999e-07, "loss": 0.3961, "step": 73200 }, { "epoch": 0.7325, "grad_norm": 0.8714961409568787, "learning_rate": 7.071359999999999e-07, "loss": 0.4119, "step": 73250 }, { "epoch": 0.733, "grad_norm": 67.67875671386719, "learning_rate": 7.06936e-07, "loss": 0.4616, "step": 73300 }, { "epoch": 0.7335, "grad_norm": 170.6661376953125, "learning_rate": 7.06736e-07, "loss": 0.4623, "step": 73350 }, { "epoch": 0.734, "grad_norm": 22.065210342407227, "learning_rate": 7.06536e-07, "loss": 0.5301, "step": 73400 }, { "epoch": 0.7345, "grad_norm": 116.2767562866211, "learning_rate": 7.06336e-07, "loss": 0.6367, "step": 73450 }, { "epoch": 0.735, "grad_norm": 118.61943817138672, "learning_rate": 7.061359999999999e-07, "loss": 0.4786, "step": 73500 }, { "epoch": 0.7355, "grad_norm": 10.633493423461914, "learning_rate": 7.05936e-07, "loss": 0.5042, "step": 73550 }, { "epoch": 0.736, "grad_norm": 15.222790718078613, "learning_rate": 7.05736e-07, "loss": 0.5449, "step": 73600 }, { "epoch": 0.7365, "grad_norm": 4.541469097137451, "learning_rate": 7.05536e-07, "loss": 0.4413, "step": 73650 }, { "epoch": 0.737, "grad_norm": 50.30397033691406, "learning_rate": 7.05336e-07, "loss": 0.4317, "step": 73700 }, { "epoch": 0.7375, "grad_norm": 4.888488292694092, "learning_rate": 7.05136e-07, "loss": 0.4241, "step": 73750 }, { "epoch": 0.738, "grad_norm": 46.71607971191406, "learning_rate": 7.04936e-07, "loss": 0.4109, "step": 73800 }, { "epoch": 0.7385, "grad_norm": 139.61004638671875, "learning_rate": 7.04736e-07, "loss": 0.5672, "step": 73850 }, { "epoch": 0.739, "grad_norm": 96.24576568603516, "learning_rate": 7.04536e-07, "loss": 0.5308, "step": 73900 }, { "epoch": 0.7395, "grad_norm": 3.3251993656158447, "learning_rate": 7.043359999999999e-07, "loss": 0.5206, "step": 73950 }, { "epoch": 0.74, "grad_norm": 64.65046691894531, "learning_rate": 7.04136e-07, "loss": 0.3965, "step": 74000 }, { "epoch": 0.7405, "grad_norm": 79.69112396240234, "learning_rate": 7.03936e-07, "loss": 0.571, "step": 74050 }, { "epoch": 0.741, "grad_norm": 86.38766479492188, "learning_rate": 7.03736e-07, "loss": 0.6437, "step": 74100 }, { "epoch": 0.7415, "grad_norm": 43.816471099853516, "learning_rate": 7.035360000000001e-07, "loss": 0.4205, "step": 74150 }, { "epoch": 0.742, "grad_norm": 41.845672607421875, "learning_rate": 7.033359999999999e-07, "loss": 0.3473, "step": 74200 }, { "epoch": 0.7425, "grad_norm": 0.8183789849281311, "learning_rate": 7.031359999999999e-07, "loss": 0.4469, "step": 74250 }, { "epoch": 0.743, "grad_norm": 3.786691427230835, "learning_rate": 7.02936e-07, "loss": 0.5015, "step": 74300 }, { "epoch": 0.7435, "grad_norm": 78.48735046386719, "learning_rate": 7.02736e-07, "loss": 0.624, "step": 74350 }, { "epoch": 0.744, "grad_norm": 104.10353088378906, "learning_rate": 7.02536e-07, "loss": 0.4875, "step": 74400 }, { "epoch": 0.7445, "grad_norm": 6.0870819091796875, "learning_rate": 7.02336e-07, "loss": 0.4004, "step": 74450 }, { "epoch": 0.745, "grad_norm": 13.105294227600098, "learning_rate": 7.02136e-07, "loss": 0.4386, "step": 74500 }, { "epoch": 0.7455, "grad_norm": 51.39162826538086, "learning_rate": 7.019359999999999e-07, "loss": 0.4182, "step": 74550 }, { "epoch": 0.746, "grad_norm": 91.84526824951172, "learning_rate": 7.01736e-07, "loss": 0.5552, "step": 74600 }, { "epoch": 0.7465, "grad_norm": 9.574994087219238, "learning_rate": 7.01536e-07, "loss": 0.4372, "step": 74650 }, { "epoch": 0.747, "grad_norm": 75.70382690429688, "learning_rate": 7.013359999999999e-07, "loss": 0.5477, "step": 74700 }, { "epoch": 0.7475, "grad_norm": 59.122093200683594, "learning_rate": 7.01136e-07, "loss": 0.5786, "step": 74750 }, { "epoch": 0.748, "grad_norm": 92.53333282470703, "learning_rate": 7.00936e-07, "loss": 0.5653, "step": 74800 }, { "epoch": 0.7485, "grad_norm": 93.97532653808594, "learning_rate": 7.00736e-07, "loss": 0.3929, "step": 74850 }, { "epoch": 0.749, "grad_norm": 1.357946515083313, "learning_rate": 7.00536e-07, "loss": 0.3806, "step": 74900 }, { "epoch": 0.7495, "grad_norm": 1.4514474868774414, "learning_rate": 7.003359999999999e-07, "loss": 0.4542, "step": 74950 }, { "epoch": 0.75, "grad_norm": 60.96038055419922, "learning_rate": 7.001359999999999e-07, "loss": 0.4032, "step": 75000 }, { "epoch": 0.7505, "grad_norm": 54.15022659301758, "learning_rate": 6.99936e-07, "loss": 0.5132, "step": 75050 }, { "epoch": 0.751, "grad_norm": 92.10535430908203, "learning_rate": 6.99736e-07, "loss": 0.5717, "step": 75100 }, { "epoch": 0.7515, "grad_norm": 67.41671752929688, "learning_rate": 6.99536e-07, "loss": 0.4319, "step": 75150 }, { "epoch": 0.752, "grad_norm": 27.999591827392578, "learning_rate": 6.99336e-07, "loss": 0.5642, "step": 75200 }, { "epoch": 0.7525, "grad_norm": 54.86029815673828, "learning_rate": 6.991359999999999e-07, "loss": 0.6475, "step": 75250 }, { "epoch": 0.753, "grad_norm": 38.79415512084961, "learning_rate": 6.989359999999999e-07, "loss": 0.3514, "step": 75300 }, { "epoch": 0.7535, "grad_norm": 2.452829360961914, "learning_rate": 6.98736e-07, "loss": 0.5446, "step": 75350 }, { "epoch": 0.754, "grad_norm": 118.6248550415039, "learning_rate": 6.98536e-07, "loss": 0.5, "step": 75400 }, { "epoch": 0.7545, "grad_norm": 70.82340240478516, "learning_rate": 6.983359999999999e-07, "loss": 0.5008, "step": 75450 }, { "epoch": 0.755, "grad_norm": 42.31960678100586, "learning_rate": 6.9814e-07, "loss": 0.4178, "step": 75500 }, { "epoch": 0.7555, "grad_norm": 90.04888916015625, "learning_rate": 6.9794e-07, "loss": 0.5661, "step": 75550 }, { "epoch": 0.756, "grad_norm": 21.198366165161133, "learning_rate": 6.977399999999999e-07, "loss": 0.4164, "step": 75600 }, { "epoch": 0.7565, "grad_norm": 78.9493408203125, "learning_rate": 6.9754e-07, "loss": 0.5132, "step": 75650 }, { "epoch": 0.757, "grad_norm": 103.06893920898438, "learning_rate": 6.973399999999999e-07, "loss": 0.5996, "step": 75700 }, { "epoch": 0.7575, "grad_norm": 4.515233993530273, "learning_rate": 6.971399999999999e-07, "loss": 0.4079, "step": 75750 }, { "epoch": 0.758, "grad_norm": 68.41087341308594, "learning_rate": 6.9694e-07, "loss": 0.557, "step": 75800 }, { "epoch": 0.7585, "grad_norm": 4.536024570465088, "learning_rate": 6.9674e-07, "loss": 0.4946, "step": 75850 }, { "epoch": 0.759, "grad_norm": 43.321983337402344, "learning_rate": 6.965400000000001e-07, "loss": 0.4212, "step": 75900 }, { "epoch": 0.7595, "grad_norm": 57.4993896484375, "learning_rate": 6.9634e-07, "loss": 0.4643, "step": 75950 }, { "epoch": 0.76, "grad_norm": 58.03748321533203, "learning_rate": 6.961399999999999e-07, "loss": 0.5878, "step": 76000 }, { "epoch": 0.7605, "grad_norm": 5.033849239349365, "learning_rate": 6.9594e-07, "loss": 0.6593, "step": 76050 }, { "epoch": 0.761, "grad_norm": 75.7629623413086, "learning_rate": 6.9574e-07, "loss": 0.4768, "step": 76100 }, { "epoch": 0.7615, "grad_norm": 60.79175567626953, "learning_rate": 6.9554e-07, "loss": 0.5528, "step": 76150 }, { "epoch": 0.762, "grad_norm": 71.26956176757812, "learning_rate": 6.9534e-07, "loss": 0.587, "step": 76200 }, { "epoch": 0.7625, "grad_norm": 3.231938362121582, "learning_rate": 6.9514e-07, "loss": 0.4488, "step": 76250 }, { "epoch": 0.763, "grad_norm": 2.168018102645874, "learning_rate": 6.9494e-07, "loss": 0.4011, "step": 76300 }, { "epoch": 0.7635, "grad_norm": 2.409412145614624, "learning_rate": 6.9474e-07, "loss": 0.5966, "step": 76350 }, { "epoch": 0.764, "grad_norm": 8.960734367370605, "learning_rate": 6.9454e-07, "loss": 0.4323, "step": 76400 }, { "epoch": 0.7645, "grad_norm": 74.19822692871094, "learning_rate": 6.943399999999999e-07, "loss": 0.3678, "step": 76450 }, { "epoch": 0.765, "grad_norm": 75.17100524902344, "learning_rate": 6.9414e-07, "loss": 0.4418, "step": 76500 }, { "epoch": 0.7655, "grad_norm": 23.91132164001465, "learning_rate": 6.9394e-07, "loss": 0.4939, "step": 76550 }, { "epoch": 0.766, "grad_norm": 33.52251052856445, "learning_rate": 6.9374e-07, "loss": 0.4463, "step": 76600 }, { "epoch": 0.7665, "grad_norm": 119.65557861328125, "learning_rate": 6.935400000000001e-07, "loss": 0.6833, "step": 76650 }, { "epoch": 0.767, "grad_norm": 46.15104675292969, "learning_rate": 6.933399999999999e-07, "loss": 0.4756, "step": 76700 }, { "epoch": 0.7675, "grad_norm": 1.8948994874954224, "learning_rate": 6.931399999999999e-07, "loss": 0.535, "step": 76750 }, { "epoch": 0.768, "grad_norm": 38.17501449584961, "learning_rate": 6.9294e-07, "loss": 0.428, "step": 76800 }, { "epoch": 0.7685, "grad_norm": 65.1424789428711, "learning_rate": 6.9274e-07, "loss": 0.5542, "step": 76850 }, { "epoch": 0.769, "grad_norm": 88.36298370361328, "learning_rate": 6.9254e-07, "loss": 0.3903, "step": 76900 }, { "epoch": 0.7695, "grad_norm": 152.0764923095703, "learning_rate": 6.9234e-07, "loss": 0.4659, "step": 76950 }, { "epoch": 0.77, "grad_norm": 47.97085189819336, "learning_rate": 6.9214e-07, "loss": 0.5331, "step": 77000 }, { "epoch": 0.7705, "grad_norm": 91.40066528320312, "learning_rate": 6.919399999999999e-07, "loss": 0.5595, "step": 77050 }, { "epoch": 0.771, "grad_norm": 8.514924049377441, "learning_rate": 6.9174e-07, "loss": 0.3657, "step": 77100 }, { "epoch": 0.7715, "grad_norm": 24.74837303161621, "learning_rate": 6.9154e-07, "loss": 0.4459, "step": 77150 }, { "epoch": 0.772, "grad_norm": 87.70143127441406, "learning_rate": 6.913399999999999e-07, "loss": 0.4602, "step": 77200 }, { "epoch": 0.7725, "grad_norm": 20.411006927490234, "learning_rate": 6.9114e-07, "loss": 0.4812, "step": 77250 }, { "epoch": 0.773, "grad_norm": 74.98430633544922, "learning_rate": 6.9094e-07, "loss": 0.4783, "step": 77300 }, { "epoch": 0.7735, "grad_norm": 15.123334884643555, "learning_rate": 6.9074e-07, "loss": 0.289, "step": 77350 }, { "epoch": 0.774, "grad_norm": 81.70301818847656, "learning_rate": 6.9054e-07, "loss": 0.5672, "step": 77400 }, { "epoch": 0.7745, "grad_norm": 0.1056661456823349, "learning_rate": 6.903399999999999e-07, "loss": 0.3384, "step": 77450 }, { "epoch": 0.775, "grad_norm": 40.181251525878906, "learning_rate": 6.901399999999999e-07, "loss": 0.5383, "step": 77500 }, { "epoch": 0.7755, "grad_norm": 77.13603973388672, "learning_rate": 6.8994e-07, "loss": 0.5013, "step": 77550 }, { "epoch": 0.776, "grad_norm": 12.230585098266602, "learning_rate": 6.8974e-07, "loss": 0.4305, "step": 77600 }, { "epoch": 0.7765, "grad_norm": 130.13206481933594, "learning_rate": 6.8954e-07, "loss": 0.4474, "step": 77650 }, { "epoch": 0.777, "grad_norm": 7.258871555328369, "learning_rate": 6.8934e-07, "loss": 0.5191, "step": 77700 }, { "epoch": 0.7775, "grad_norm": 74.94843292236328, "learning_rate": 6.891399999999999e-07, "loss": 0.4981, "step": 77750 }, { "epoch": 0.778, "grad_norm": 37.118385314941406, "learning_rate": 6.889399999999999e-07, "loss": 0.5874, "step": 77800 }, { "epoch": 0.7785, "grad_norm": 66.5859146118164, "learning_rate": 6.8874e-07, "loss": 0.4819, "step": 77850 }, { "epoch": 0.779, "grad_norm": 44.33268356323242, "learning_rate": 6.8854e-07, "loss": 0.5725, "step": 77900 }, { "epoch": 0.7795, "grad_norm": 23.16948890686035, "learning_rate": 6.883399999999999e-07, "loss": 0.4699, "step": 77950 }, { "epoch": 0.78, "grad_norm": 84.33336639404297, "learning_rate": 6.8814e-07, "loss": 0.5684, "step": 78000 }, { "epoch": 0.7805, "grad_norm": 52.04594039916992, "learning_rate": 6.8794e-07, "loss": 0.5498, "step": 78050 }, { "epoch": 0.781, "grad_norm": 9.365368843078613, "learning_rate": 6.877399999999999e-07, "loss": 0.4686, "step": 78100 }, { "epoch": 0.7815, "grad_norm": 20.72178077697754, "learning_rate": 6.8754e-07, "loss": 0.5012, "step": 78150 }, { "epoch": 0.782, "grad_norm": 35.4743537902832, "learning_rate": 6.873399999999999e-07, "loss": 0.5656, "step": 78200 }, { "epoch": 0.7825, "grad_norm": 58.698486328125, "learning_rate": 6.871399999999999e-07, "loss": 0.5637, "step": 78250 }, { "epoch": 0.783, "grad_norm": 4.087533950805664, "learning_rate": 6.8694e-07, "loss": 0.4656, "step": 78300 }, { "epoch": 0.7835, "grad_norm": 114.83053588867188, "learning_rate": 6.8674e-07, "loss": 0.5119, "step": 78350 }, { "epoch": 0.784, "grad_norm": 22.25564193725586, "learning_rate": 6.865400000000001e-07, "loss": 0.4064, "step": 78400 }, { "epoch": 0.7845, "grad_norm": 25.038494110107422, "learning_rate": 6.863399999999999e-07, "loss": 0.4126, "step": 78450 }, { "epoch": 0.785, "grad_norm": 77.9610824584961, "learning_rate": 6.861399999999999e-07, "loss": 0.4798, "step": 78500 }, { "epoch": 0.7855, "grad_norm": 48.64069366455078, "learning_rate": 6.8594e-07, "loss": 0.4212, "step": 78550 }, { "epoch": 0.786, "grad_norm": 33.35960006713867, "learning_rate": 6.8574e-07, "loss": 0.4711, "step": 78600 }, { "epoch": 0.7865, "grad_norm": 8.562115669250488, "learning_rate": 6.8554e-07, "loss": 0.5208, "step": 78650 }, { "epoch": 0.787, "grad_norm": 84.58731079101562, "learning_rate": 6.8534e-07, "loss": 0.5262, "step": 78700 }, { "epoch": 0.7875, "grad_norm": 35.85997009277344, "learning_rate": 6.8514e-07, "loss": 0.6896, "step": 78750 }, { "epoch": 0.788, "grad_norm": 84.28465270996094, "learning_rate": 6.849399999999999e-07, "loss": 0.6728, "step": 78800 }, { "epoch": 0.7885, "grad_norm": 105.6515121459961, "learning_rate": 6.8474e-07, "loss": 0.5276, "step": 78850 }, { "epoch": 0.789, "grad_norm": 74.69842529296875, "learning_rate": 6.8454e-07, "loss": 0.5207, "step": 78900 }, { "epoch": 0.7895, "grad_norm": 60.617088317871094, "learning_rate": 6.843399999999999e-07, "loss": 0.4344, "step": 78950 }, { "epoch": 0.79, "grad_norm": 64.80194854736328, "learning_rate": 6.8414e-07, "loss": 0.3582, "step": 79000 }, { "epoch": 0.7905, "grad_norm": 9.421799659729004, "learning_rate": 6.8394e-07, "loss": 0.3847, "step": 79050 }, { "epoch": 0.791, "grad_norm": 27.81099510192871, "learning_rate": 6.8374e-07, "loss": 0.5735, "step": 79100 }, { "epoch": 0.7915, "grad_norm": 11.743189811706543, "learning_rate": 6.8354e-07, "loss": 0.3546, "step": 79150 }, { "epoch": 0.792, "grad_norm": 2.8396387100219727, "learning_rate": 6.833399999999999e-07, "loss": 0.3229, "step": 79200 }, { "epoch": 0.7925, "grad_norm": 0.9796394109725952, "learning_rate": 6.831399999999999e-07, "loss": 0.4796, "step": 79250 }, { "epoch": 0.793, "grad_norm": 64.07000732421875, "learning_rate": 6.8294e-07, "loss": 0.4583, "step": 79300 }, { "epoch": 0.7935, "grad_norm": 91.14384460449219, "learning_rate": 6.8274e-07, "loss": 0.5557, "step": 79350 }, { "epoch": 0.794, "grad_norm": 67.72103881835938, "learning_rate": 6.8254e-07, "loss": 0.4973, "step": 79400 }, { "epoch": 0.7945, "grad_norm": 76.02549743652344, "learning_rate": 6.8234e-07, "loss": 0.4128, "step": 79450 }, { "epoch": 0.795, "grad_norm": 14.491253852844238, "learning_rate": 6.821399999999999e-07, "loss": 0.5534, "step": 79500 }, { "epoch": 0.7955, "grad_norm": 0.9683569073677063, "learning_rate": 6.819399999999999e-07, "loss": 0.6786, "step": 79550 }, { "epoch": 0.796, "grad_norm": 73.29056549072266, "learning_rate": 6.8174e-07, "loss": 0.4019, "step": 79600 }, { "epoch": 0.7965, "grad_norm": 44.23388671875, "learning_rate": 6.8154e-07, "loss": 0.4544, "step": 79650 }, { "epoch": 0.797, "grad_norm": 0.06450020521879196, "learning_rate": 6.813439999999999e-07, "loss": 0.3565, "step": 79700 }, { "epoch": 0.7975, "grad_norm": 79.00003814697266, "learning_rate": 6.81144e-07, "loss": 0.5056, "step": 79750 }, { "epoch": 0.798, "grad_norm": 25.8265438079834, "learning_rate": 6.80944e-07, "loss": 0.4529, "step": 79800 }, { "epoch": 0.7985, "grad_norm": 67.03202056884766, "learning_rate": 6.807439999999999e-07, "loss": 0.4031, "step": 79850 }, { "epoch": 0.799, "grad_norm": 67.00475311279297, "learning_rate": 6.80544e-07, "loss": 0.4486, "step": 79900 }, { "epoch": 0.7995, "grad_norm": 11.94206714630127, "learning_rate": 6.803439999999999e-07, "loss": 0.4968, "step": 79950 }, { "epoch": 0.8, "grad_norm": 73.1527099609375, "learning_rate": 6.801439999999999e-07, "loss": 0.379, "step": 80000 }, { "epoch": 0.8005, "grad_norm": 34.18701934814453, "learning_rate": 6.79944e-07, "loss": 0.4137, "step": 80050 }, { "epoch": 0.801, "grad_norm": 64.62896728515625, "learning_rate": 6.79744e-07, "loss": 0.6624, "step": 80100 }, { "epoch": 0.8015, "grad_norm": 52.73872375488281, "learning_rate": 6.79544e-07, "loss": 0.4575, "step": 80150 }, { "epoch": 0.802, "grad_norm": 76.4136734008789, "learning_rate": 6.793439999999999e-07, "loss": 0.5554, "step": 80200 }, { "epoch": 0.8025, "grad_norm": 27.00578498840332, "learning_rate": 6.791439999999999e-07, "loss": 0.5405, "step": 80250 }, { "epoch": 0.803, "grad_norm": 33.01068115234375, "learning_rate": 6.789439999999999e-07, "loss": 0.5504, "step": 80300 }, { "epoch": 0.8035, "grad_norm": 0.1820012480020523, "learning_rate": 6.78752e-07, "loss": 0.5296, "step": 80350 }, { "epoch": 0.804, "grad_norm": 73.24763488769531, "learning_rate": 6.78552e-07, "loss": 0.4904, "step": 80400 }, { "epoch": 0.8045, "grad_norm": 53.44038772583008, "learning_rate": 6.783519999999999e-07, "loss": 0.283, "step": 80450 }, { "epoch": 0.805, "grad_norm": 9.23548412322998, "learning_rate": 6.78152e-07, "loss": 0.524, "step": 80500 }, { "epoch": 0.8055, "grad_norm": 8.429581642150879, "learning_rate": 6.77952e-07, "loss": 0.3238, "step": 80550 }, { "epoch": 0.806, "grad_norm": 0.890905499458313, "learning_rate": 6.777519999999999e-07, "loss": 0.4394, "step": 80600 }, { "epoch": 0.8065, "grad_norm": 61.318058013916016, "learning_rate": 6.77552e-07, "loss": 0.4028, "step": 80650 }, { "epoch": 0.807, "grad_norm": 32.087276458740234, "learning_rate": 6.773519999999999e-07, "loss": 0.6014, "step": 80700 }, { "epoch": 0.8075, "grad_norm": 14.302053451538086, "learning_rate": 6.771519999999999e-07, "loss": 0.5009, "step": 80750 }, { "epoch": 0.808, "grad_norm": 20.004087448120117, "learning_rate": 6.76952e-07, "loss": 0.5234, "step": 80800 }, { "epoch": 0.8085, "grad_norm": 58.03341293334961, "learning_rate": 6.76752e-07, "loss": 0.5761, "step": 80850 }, { "epoch": 0.809, "grad_norm": 42.07494354248047, "learning_rate": 6.765560000000001e-07, "loss": 0.4014, "step": 80900 }, { "epoch": 0.8095, "grad_norm": 11.237667083740234, "learning_rate": 6.763559999999999e-07, "loss": 0.6565, "step": 80950 }, { "epoch": 0.81, "grad_norm": 54.023685455322266, "learning_rate": 6.761559999999999e-07, "loss": 0.5733, "step": 81000 }, { "epoch": 0.8105, "grad_norm": 26.939653396606445, "learning_rate": 6.75956e-07, "loss": 0.5418, "step": 81050 }, { "epoch": 0.811, "grad_norm": 72.97735595703125, "learning_rate": 6.75756e-07, "loss": 0.6132, "step": 81100 }, { "epoch": 0.8115, "grad_norm": 30.797290802001953, "learning_rate": 6.75556e-07, "loss": 0.4381, "step": 81150 }, { "epoch": 0.812, "grad_norm": 193.12718200683594, "learning_rate": 6.75356e-07, "loss": 0.5038, "step": 81200 }, { "epoch": 0.8125, "grad_norm": 42.33784103393555, "learning_rate": 6.75156e-07, "loss": 0.5907, "step": 81250 }, { "epoch": 0.813, "grad_norm": 60.777767181396484, "learning_rate": 6.749559999999999e-07, "loss": 0.5999, "step": 81300 }, { "epoch": 0.8135, "grad_norm": 4.074952602386475, "learning_rate": 6.74756e-07, "loss": 0.5535, "step": 81350 }, { "epoch": 0.814, "grad_norm": 19.895477294921875, "learning_rate": 6.74556e-07, "loss": 0.4116, "step": 81400 }, { "epoch": 0.8145, "grad_norm": 8.462628364562988, "learning_rate": 6.743559999999999e-07, "loss": 0.4759, "step": 81450 }, { "epoch": 0.815, "grad_norm": 55.037052154541016, "learning_rate": 6.74156e-07, "loss": 0.5654, "step": 81500 }, { "epoch": 0.8155, "grad_norm": 65.07530975341797, "learning_rate": 6.73956e-07, "loss": 0.5132, "step": 81550 }, { "epoch": 0.816, "grad_norm": 64.06502532958984, "learning_rate": 6.73756e-07, "loss": 0.5159, "step": 81600 }, { "epoch": 0.8165, "grad_norm": 73.82300567626953, "learning_rate": 6.73556e-07, "loss": 0.5446, "step": 81650 }, { "epoch": 0.817, "grad_norm": 40.699676513671875, "learning_rate": 6.733559999999999e-07, "loss": 0.4982, "step": 81700 }, { "epoch": 0.8175, "grad_norm": 94.0560531616211, "learning_rate": 6.731559999999999e-07, "loss": 0.5136, "step": 81750 }, { "epoch": 0.818, "grad_norm": 77.07228088378906, "learning_rate": 6.72956e-07, "loss": 0.5026, "step": 81800 }, { "epoch": 0.8185, "grad_norm": 54.43427658081055, "learning_rate": 6.72756e-07, "loss": 0.5811, "step": 81850 }, { "epoch": 0.819, "grad_norm": 32.514156341552734, "learning_rate": 6.72556e-07, "loss": 0.4385, "step": 81900 }, { "epoch": 0.8195, "grad_norm": 8.645065307617188, "learning_rate": 6.72356e-07, "loss": 0.3586, "step": 81950 }, { "epoch": 0.82, "grad_norm": 42.03317642211914, "learning_rate": 6.72156e-07, "loss": 0.413, "step": 82000 }, { "epoch": 0.8205, "grad_norm": 0.5245881676673889, "learning_rate": 6.719559999999999e-07, "loss": 0.4099, "step": 82050 }, { "epoch": 0.821, "grad_norm": 66.3309326171875, "learning_rate": 6.71756e-07, "loss": 0.7605, "step": 82100 }, { "epoch": 0.8215, "grad_norm": 26.570140838623047, "learning_rate": 6.71556e-07, "loss": 0.4468, "step": 82150 }, { "epoch": 0.822, "grad_norm": 71.70770263671875, "learning_rate": 6.713559999999999e-07, "loss": 0.3865, "step": 82200 }, { "epoch": 0.8225, "grad_norm": 29.463645935058594, "learning_rate": 6.71156e-07, "loss": 0.5206, "step": 82250 }, { "epoch": 0.823, "grad_norm": 61.58549880981445, "learning_rate": 6.70956e-07, "loss": 0.5122, "step": 82300 }, { "epoch": 0.8235, "grad_norm": 52.86707305908203, "learning_rate": 6.70756e-07, "loss": 0.5216, "step": 82350 }, { "epoch": 0.824, "grad_norm": 67.04204559326172, "learning_rate": 6.70556e-07, "loss": 0.5536, "step": 82400 }, { "epoch": 0.8245, "grad_norm": 1.1900335550308228, "learning_rate": 6.703559999999999e-07, "loss": 0.4011, "step": 82450 }, { "epoch": 0.825, "grad_norm": 13.605680465698242, "learning_rate": 6.701559999999999e-07, "loss": 0.412, "step": 82500 }, { "epoch": 0.8255, "grad_norm": 68.13459777832031, "learning_rate": 6.69956e-07, "loss": 0.5592, "step": 82550 }, { "epoch": 0.826, "grad_norm": 41.138648986816406, "learning_rate": 6.69756e-07, "loss": 0.5622, "step": 82600 }, { "epoch": 0.8265, "grad_norm": 50.51581954956055, "learning_rate": 6.69556e-07, "loss": 0.408, "step": 82650 }, { "epoch": 0.827, "grad_norm": 63.566707611083984, "learning_rate": 6.69356e-07, "loss": 0.4288, "step": 82700 }, { "epoch": 0.8275, "grad_norm": 3.45732045173645, "learning_rate": 6.691559999999999e-07, "loss": 0.5257, "step": 82750 }, { "epoch": 0.828, "grad_norm": 13.433048248291016, "learning_rate": 6.689559999999999e-07, "loss": 0.4753, "step": 82800 }, { "epoch": 0.8285, "grad_norm": 0.32884418964385986, "learning_rate": 6.68756e-07, "loss": 0.4934, "step": 82850 }, { "epoch": 0.829, "grad_norm": 13.126819610595703, "learning_rate": 6.68556e-07, "loss": 0.3574, "step": 82900 }, { "epoch": 0.8295, "grad_norm": 48.53691482543945, "learning_rate": 6.683559999999999e-07, "loss": 0.7363, "step": 82950 }, { "epoch": 0.83, "grad_norm": 87.21624755859375, "learning_rate": 6.68156e-07, "loss": 0.4372, "step": 83000 }, { "epoch": 0.8305, "grad_norm": 53.67840576171875, "learning_rate": 6.67956e-07, "loss": 0.5366, "step": 83050 }, { "epoch": 0.831, "grad_norm": 72.83885192871094, "learning_rate": 6.677559999999999e-07, "loss": 0.5919, "step": 83100 }, { "epoch": 0.8315, "grad_norm": 24.180238723754883, "learning_rate": 6.67556e-07, "loss": 0.5037, "step": 83150 }, { "epoch": 0.832, "grad_norm": 68.72847747802734, "learning_rate": 6.673559999999999e-07, "loss": 0.5177, "step": 83200 }, { "epoch": 0.8325, "grad_norm": 8.159306526184082, "learning_rate": 6.671559999999999e-07, "loss": 0.4894, "step": 83250 }, { "epoch": 0.833, "grad_norm": 77.16256713867188, "learning_rate": 6.66956e-07, "loss": 0.4338, "step": 83300 }, { "epoch": 0.8335, "grad_norm": 15.30505657196045, "learning_rate": 6.66756e-07, "loss": 0.4723, "step": 83350 }, { "epoch": 0.834, "grad_norm": 56.12632751464844, "learning_rate": 6.665560000000001e-07, "loss": 0.4587, "step": 83400 }, { "epoch": 0.8345, "grad_norm": 16.000368118286133, "learning_rate": 6.663559999999999e-07, "loss": 0.5214, "step": 83450 }, { "epoch": 0.835, "grad_norm": 2.457681655883789, "learning_rate": 6.661559999999999e-07, "loss": 0.5707, "step": 83500 }, { "epoch": 0.8355, "grad_norm": 107.4878921508789, "learning_rate": 6.65956e-07, "loss": 0.5638, "step": 83550 }, { "epoch": 0.836, "grad_norm": 133.30532836914062, "learning_rate": 6.65756e-07, "loss": 0.5926, "step": 83600 }, { "epoch": 0.8365, "grad_norm": 63.642738342285156, "learning_rate": 6.65556e-07, "loss": 0.3443, "step": 83650 }, { "epoch": 0.837, "grad_norm": 10.244129180908203, "learning_rate": 6.65356e-07, "loss": 0.4525, "step": 83700 }, { "epoch": 0.8375, "grad_norm": 21.855087280273438, "learning_rate": 6.65156e-07, "loss": 0.5493, "step": 83750 }, { "epoch": 0.838, "grad_norm": 8.698675155639648, "learning_rate": 6.649559999999999e-07, "loss": 0.4376, "step": 83800 }, { "epoch": 0.8385, "grad_norm": 56.18486785888672, "learning_rate": 6.64756e-07, "loss": 0.4455, "step": 83850 }, { "epoch": 0.839, "grad_norm": 3.442760705947876, "learning_rate": 6.64556e-07, "loss": 0.3941, "step": 83900 }, { "epoch": 0.8395, "grad_norm": 108.05015563964844, "learning_rate": 6.643559999999999e-07, "loss": 0.4177, "step": 83950 }, { "epoch": 0.84, "grad_norm": 11.338204383850098, "learning_rate": 6.64156e-07, "loss": 0.4218, "step": 84000 }, { "epoch": 0.8405, "grad_norm": 53.90700149536133, "learning_rate": 6.6396e-07, "loss": 0.3908, "step": 84050 }, { "epoch": 0.841, "grad_norm": 67.98553466796875, "learning_rate": 6.6376e-07, "loss": 0.3739, "step": 84100 }, { "epoch": 0.8415, "grad_norm": 45.681941986083984, "learning_rate": 6.6356e-07, "loss": 0.4754, "step": 84150 }, { "epoch": 0.842, "grad_norm": 84.54562377929688, "learning_rate": 6.633599999999999e-07, "loss": 0.5086, "step": 84200 }, { "epoch": 0.8425, "grad_norm": 101.58403778076172, "learning_rate": 6.631599999999999e-07, "loss": 0.4529, "step": 84250 }, { "epoch": 0.843, "grad_norm": 87.35352325439453, "learning_rate": 6.6296e-07, "loss": 0.561, "step": 84300 }, { "epoch": 0.8435, "grad_norm": 64.42286682128906, "learning_rate": 6.6276e-07, "loss": 0.4564, "step": 84350 }, { "epoch": 0.844, "grad_norm": 12.244241714477539, "learning_rate": 6.6256e-07, "loss": 0.3562, "step": 84400 }, { "epoch": 0.8445, "grad_norm": 38.84530258178711, "learning_rate": 6.6236e-07, "loss": 0.537, "step": 84450 }, { "epoch": 0.845, "grad_norm": 71.45441436767578, "learning_rate": 6.621599999999999e-07, "loss": 0.4359, "step": 84500 }, { "epoch": 0.8455, "grad_norm": 38.31736755371094, "learning_rate": 6.619599999999999e-07, "loss": 0.3036, "step": 84550 }, { "epoch": 0.846, "grad_norm": 1.6229268312454224, "learning_rate": 6.6176e-07, "loss": 0.5366, "step": 84600 }, { "epoch": 0.8465, "grad_norm": 65.19337463378906, "learning_rate": 6.6156e-07, "loss": 0.5766, "step": 84650 }, { "epoch": 0.847, "grad_norm": 96.37982940673828, "learning_rate": 6.613599999999999e-07, "loss": 0.4957, "step": 84700 }, { "epoch": 0.8475, "grad_norm": 117.63880920410156, "learning_rate": 6.6116e-07, "loss": 0.5433, "step": 84750 }, { "epoch": 0.848, "grad_norm": 107.3805923461914, "learning_rate": 6.6096e-07, "loss": 0.4758, "step": 84800 }, { "epoch": 0.8485, "grad_norm": 105.60696411132812, "learning_rate": 6.607599999999999e-07, "loss": 0.562, "step": 84850 }, { "epoch": 0.849, "grad_norm": 0.5521818995475769, "learning_rate": 6.6056e-07, "loss": 0.3867, "step": 84900 }, { "epoch": 0.8495, "grad_norm": 15.534207344055176, "learning_rate": 6.603599999999999e-07, "loss": 0.3862, "step": 84950 }, { "epoch": 0.85, "grad_norm": 4.9523701667785645, "learning_rate": 6.601599999999999e-07, "loss": 0.4487, "step": 85000 }, { "epoch": 0.8505, "grad_norm": 107.40753936767578, "learning_rate": 6.5996e-07, "loss": 0.4253, "step": 85050 }, { "epoch": 0.851, "grad_norm": 2.598055362701416, "learning_rate": 6.5976e-07, "loss": 0.5241, "step": 85100 }, { "epoch": 0.8515, "grad_norm": 55.7864875793457, "learning_rate": 6.5956e-07, "loss": 0.5934, "step": 85150 }, { "epoch": 0.852, "grad_norm": 4.6609697341918945, "learning_rate": 6.593599999999999e-07, "loss": 0.4759, "step": 85200 }, { "epoch": 0.8525, "grad_norm": 103.4588851928711, "learning_rate": 6.591599999999999e-07, "loss": 0.4338, "step": 85250 }, { "epoch": 0.853, "grad_norm": 4.739608287811279, "learning_rate": 6.589599999999999e-07, "loss": 0.5771, "step": 85300 }, { "epoch": 0.8535, "grad_norm": 81.03630828857422, "learning_rate": 6.5876e-07, "loss": 0.4308, "step": 85350 }, { "epoch": 0.854, "grad_norm": 10.07264232635498, "learning_rate": 6.5856e-07, "loss": 0.5954, "step": 85400 }, { "epoch": 0.8545, "grad_norm": 87.12622833251953, "learning_rate": 6.583599999999999e-07, "loss": 0.5724, "step": 85450 }, { "epoch": 0.855, "grad_norm": 12.88304615020752, "learning_rate": 6.5816e-07, "loss": 0.3812, "step": 85500 }, { "epoch": 0.8555, "grad_norm": 2.8463549613952637, "learning_rate": 6.579599999999999e-07, "loss": 0.5091, "step": 85550 }, { "epoch": 0.856, "grad_norm": 107.44549560546875, "learning_rate": 6.577599999999999e-07, "loss": 0.5983, "step": 85600 }, { "epoch": 0.8565, "grad_norm": 25.965686798095703, "learning_rate": 6.5756e-07, "loss": 0.4518, "step": 85650 }, { "epoch": 0.857, "grad_norm": 13.483121871948242, "learning_rate": 6.5736e-07, "loss": 0.4494, "step": 85700 }, { "epoch": 0.8575, "grad_norm": 57.970027923583984, "learning_rate": 6.571599999999999e-07, "loss": 0.5211, "step": 85750 }, { "epoch": 0.858, "grad_norm": 61.619537353515625, "learning_rate": 6.5696e-07, "loss": 0.4131, "step": 85800 }, { "epoch": 0.8585, "grad_norm": 72.63780975341797, "learning_rate": 6.5676e-07, "loss": 0.4786, "step": 85850 }, { "epoch": 0.859, "grad_norm": 32.84150314331055, "learning_rate": 6.5656e-07, "loss": 0.3868, "step": 85900 }, { "epoch": 0.8595, "grad_norm": 11.908856391906738, "learning_rate": 6.5636e-07, "loss": 0.4036, "step": 85950 }, { "epoch": 0.86, "grad_norm": 124.65275573730469, "learning_rate": 6.561599999999999e-07, "loss": 0.5248, "step": 86000 }, { "epoch": 0.8605, "grad_norm": 77.87248992919922, "learning_rate": 6.5596e-07, "loss": 0.631, "step": 86050 }, { "epoch": 0.861, "grad_norm": 88.47559356689453, "learning_rate": 6.5576e-07, "loss": 0.5183, "step": 86100 }, { "epoch": 0.8615, "grad_norm": 35.03434371948242, "learning_rate": 6.5556e-07, "loss": 0.536, "step": 86150 }, { "epoch": 0.862, "grad_norm": 49.06086349487305, "learning_rate": 6.553600000000001e-07, "loss": 0.4981, "step": 86200 }, { "epoch": 0.8625, "grad_norm": 22.367053985595703, "learning_rate": 6.551599999999999e-07, "loss": 0.5554, "step": 86250 }, { "epoch": 0.863, "grad_norm": 46.18075180053711, "learning_rate": 6.549599999999999e-07, "loss": 0.4415, "step": 86300 }, { "epoch": 0.8635, "grad_norm": 85.1124496459961, "learning_rate": 6.5476e-07, "loss": 0.6429, "step": 86350 }, { "epoch": 0.864, "grad_norm": 75.1630630493164, "learning_rate": 6.5456e-07, "loss": 0.4281, "step": 86400 }, { "epoch": 0.8645, "grad_norm": 23.679262161254883, "learning_rate": 6.5436e-07, "loss": 0.4745, "step": 86450 }, { "epoch": 0.865, "grad_norm": 40.03208541870117, "learning_rate": 6.5416e-07, "loss": 0.3905, "step": 86500 }, { "epoch": 0.8655, "grad_norm": 65.0140151977539, "learning_rate": 6.5396e-07, "loss": 0.6504, "step": 86550 }, { "epoch": 0.866, "grad_norm": 85.9790267944336, "learning_rate": 6.537599999999999e-07, "loss": 0.3979, "step": 86600 }, { "epoch": 0.8665, "grad_norm": 3.0157697200775146, "learning_rate": 6.5356e-07, "loss": 0.4499, "step": 86650 }, { "epoch": 0.867, "grad_norm": 86.25167846679688, "learning_rate": 6.5336e-07, "loss": 0.5585, "step": 86700 }, { "epoch": 0.8675, "grad_norm": 26.297157287597656, "learning_rate": 6.531599999999999e-07, "loss": 0.5065, "step": 86750 }, { "epoch": 0.868, "grad_norm": 53.790348052978516, "learning_rate": 6.5296e-07, "loss": 0.507, "step": 86800 }, { "epoch": 0.8685, "grad_norm": 45.35681915283203, "learning_rate": 6.5276e-07, "loss": 0.4223, "step": 86850 }, { "epoch": 0.869, "grad_norm": 108.29571533203125, "learning_rate": 6.5256e-07, "loss": 0.4757, "step": 86900 }, { "epoch": 0.8695, "grad_norm": 1.806410312652588, "learning_rate": 6.5236e-07, "loss": 0.4832, "step": 86950 }, { "epoch": 0.87, "grad_norm": 46.172607421875, "learning_rate": 6.521599999999999e-07, "loss": 0.599, "step": 87000 }, { "epoch": 0.8705, "grad_norm": 37.954742431640625, "learning_rate": 6.519599999999999e-07, "loss": 0.6213, "step": 87050 }, { "epoch": 0.871, "grad_norm": 11.543988227844238, "learning_rate": 6.5176e-07, "loss": 0.4723, "step": 87100 }, { "epoch": 0.8715, "grad_norm": 26.00443458557129, "learning_rate": 6.5156e-07, "loss": 0.4994, "step": 87150 }, { "epoch": 0.872, "grad_norm": 144.9601593017578, "learning_rate": 6.5136e-07, "loss": 0.5887, "step": 87200 }, { "epoch": 0.8725, "grad_norm": 53.76115417480469, "learning_rate": 6.5116e-07, "loss": 0.4368, "step": 87250 }, { "epoch": 0.873, "grad_norm": 56.003292083740234, "learning_rate": 6.5096e-07, "loss": 0.49, "step": 87300 }, { "epoch": 0.8735, "grad_norm": 81.86634063720703, "learning_rate": 6.507599999999999e-07, "loss": 0.4194, "step": 87350 }, { "epoch": 0.874, "grad_norm": 46.207969665527344, "learning_rate": 6.5056e-07, "loss": 0.5626, "step": 87400 }, { "epoch": 0.8745, "grad_norm": 93.14911651611328, "learning_rate": 6.5036e-07, "loss": 0.5803, "step": 87450 }, { "epoch": 0.875, "grad_norm": 91.9137954711914, "learning_rate": 6.501599999999999e-07, "loss": 0.4872, "step": 87500 }, { "epoch": 0.8755, "grad_norm": 73.91366577148438, "learning_rate": 6.4996e-07, "loss": 0.5283, "step": 87550 }, { "epoch": 0.876, "grad_norm": 35.279666900634766, "learning_rate": 6.4976e-07, "loss": 0.4378, "step": 87600 }, { "epoch": 0.8765, "grad_norm": 15.952431678771973, "learning_rate": 6.4956e-07, "loss": 0.4564, "step": 87650 }, { "epoch": 0.877, "grad_norm": 2.8610007762908936, "learning_rate": 6.4936e-07, "loss": 0.4478, "step": 87700 }, { "epoch": 0.8775, "grad_norm": 64.05665588378906, "learning_rate": 6.491599999999999e-07, "loss": 0.4061, "step": 87750 }, { "epoch": 0.878, "grad_norm": 3.5503172874450684, "learning_rate": 6.489599999999999e-07, "loss": 0.4837, "step": 87800 }, { "epoch": 0.8785, "grad_norm": 11.314192771911621, "learning_rate": 6.4876e-07, "loss": 0.3553, "step": 87850 }, { "epoch": 0.879, "grad_norm": 23.217226028442383, "learning_rate": 6.4856e-07, "loss": 0.4392, "step": 87900 }, { "epoch": 0.8795, "grad_norm": 2.8893821239471436, "learning_rate": 6.483600000000001e-07, "loss": 0.4047, "step": 87950 }, { "epoch": 0.88, "grad_norm": 5.448762893676758, "learning_rate": 6.4816e-07, "loss": 0.4604, "step": 88000 }, { "epoch": 0.8805, "grad_norm": 84.17955017089844, "learning_rate": 6.479599999999999e-07, "loss": 0.4875, "step": 88050 }, { "epoch": 0.881, "grad_norm": 68.11402130126953, "learning_rate": 6.4776e-07, "loss": 0.5875, "step": 88100 }, { "epoch": 0.8815, "grad_norm": 77.2765884399414, "learning_rate": 6.4756e-07, "loss": 0.4419, "step": 88150 }, { "epoch": 0.882, "grad_norm": 16.410972595214844, "learning_rate": 6.4736e-07, "loss": 0.4844, "step": 88200 }, { "epoch": 0.8825, "grad_norm": 31.93629264831543, "learning_rate": 6.4716e-07, "loss": 0.4829, "step": 88250 }, { "epoch": 0.883, "grad_norm": 68.09761047363281, "learning_rate": 6.4696e-07, "loss": 0.3913, "step": 88300 }, { "epoch": 0.8835, "grad_norm": 91.0275650024414, "learning_rate": 6.4676e-07, "loss": 0.6128, "step": 88350 }, { "epoch": 0.884, "grad_norm": 67.89716339111328, "learning_rate": 6.4656e-07, "loss": 0.5768, "step": 88400 }, { "epoch": 0.8845, "grad_norm": 21.36505126953125, "learning_rate": 6.4636e-07, "loss": 0.3613, "step": 88450 }, { "epoch": 0.885, "grad_norm": 2.784221887588501, "learning_rate": 6.461599999999999e-07, "loss": 0.3029, "step": 88500 }, { "epoch": 0.8855, "grad_norm": 4.084001064300537, "learning_rate": 6.4596e-07, "loss": 0.3698, "step": 88550 }, { "epoch": 0.886, "grad_norm": 72.70446014404297, "learning_rate": 6.4576e-07, "loss": 0.4915, "step": 88600 }, { "epoch": 0.8865, "grad_norm": 55.41581726074219, "learning_rate": 6.4556e-07, "loss": 0.5785, "step": 88650 }, { "epoch": 0.887, "grad_norm": 35.189048767089844, "learning_rate": 6.453600000000001e-07, "loss": 0.5671, "step": 88700 }, { "epoch": 0.8875, "grad_norm": 57.02684783935547, "learning_rate": 6.451599999999999e-07, "loss": 0.4981, "step": 88750 }, { "epoch": 0.888, "grad_norm": 0.43074750900268555, "learning_rate": 6.449599999999999e-07, "loss": 0.4555, "step": 88800 }, { "epoch": 0.8885, "grad_norm": 13.582013130187988, "learning_rate": 6.4476e-07, "loss": 0.3789, "step": 88850 }, { "epoch": 0.889, "grad_norm": 65.70917510986328, "learning_rate": 6.4456e-07, "loss": 0.5623, "step": 88900 }, { "epoch": 0.8895, "grad_norm": 48.792179107666016, "learning_rate": 6.4436e-07, "loss": 0.5911, "step": 88950 }, { "epoch": 0.89, "grad_norm": 11.270796775817871, "learning_rate": 6.4416e-07, "loss": 0.4401, "step": 89000 }, { "epoch": 0.8905, "grad_norm": 89.74335479736328, "learning_rate": 6.4396e-07, "loss": 0.3221, "step": 89050 }, { "epoch": 0.891, "grad_norm": 46.43724822998047, "learning_rate": 6.437599999999999e-07, "loss": 0.3968, "step": 89100 }, { "epoch": 0.8915, "grad_norm": 2.0606515407562256, "learning_rate": 6.4356e-07, "loss": 0.447, "step": 89150 }, { "epoch": 0.892, "grad_norm": 91.6527099609375, "learning_rate": 6.4336e-07, "loss": 0.4425, "step": 89200 }, { "epoch": 0.8925, "grad_norm": 42.8084716796875, "learning_rate": 6.431599999999999e-07, "loss": 0.4977, "step": 89250 }, { "epoch": 0.893, "grad_norm": 13.197081565856934, "learning_rate": 6.4296e-07, "loss": 0.395, "step": 89300 }, { "epoch": 0.8935, "grad_norm": 97.76970672607422, "learning_rate": 6.4276e-07, "loss": 0.4206, "step": 89350 }, { "epoch": 0.894, "grad_norm": 103.70655822753906, "learning_rate": 6.4256e-07, "loss": 0.6174, "step": 89400 }, { "epoch": 0.8945, "grad_norm": 25.918859481811523, "learning_rate": 6.4236e-07, "loss": 0.5479, "step": 89450 }, { "epoch": 0.895, "grad_norm": 28.53264808654785, "learning_rate": 6.421599999999999e-07, "loss": 0.5709, "step": 89500 }, { "epoch": 0.8955, "grad_norm": 38.950416564941406, "learning_rate": 6.419599999999999e-07, "loss": 0.5301, "step": 89550 }, { "epoch": 0.896, "grad_norm": 95.71314239501953, "learning_rate": 6.4176e-07, "loss": 0.5001, "step": 89600 }, { "epoch": 0.8965, "grad_norm": 7.237802505493164, "learning_rate": 6.4156e-07, "loss": 0.5101, "step": 89650 }, { "epoch": 0.897, "grad_norm": 104.18899536132812, "learning_rate": 6.4136e-07, "loss": 0.445, "step": 89700 }, { "epoch": 0.8975, "grad_norm": 88.33003997802734, "learning_rate": 6.4116e-07, "loss": 0.4632, "step": 89750 }, { "epoch": 0.898, "grad_norm": 0.11056256294250488, "learning_rate": 6.409599999999999e-07, "loss": 0.4456, "step": 89800 }, { "epoch": 0.8985, "grad_norm": 23.158008575439453, "learning_rate": 6.407599999999999e-07, "loss": 0.5806, "step": 89850 }, { "epoch": 0.899, "grad_norm": 45.30116271972656, "learning_rate": 6.4056e-07, "loss": 0.4927, "step": 89900 }, { "epoch": 0.8995, "grad_norm": 13.431071281433105, "learning_rate": 6.4036e-07, "loss": 0.5075, "step": 89950 }, { "epoch": 0.9, "grad_norm": 42.31669235229492, "learning_rate": 6.401599999999999e-07, "loss": 0.4129, "step": 90000 }, { "epoch": 0.9005, "grad_norm": 35.61188888549805, "learning_rate": 6.3996e-07, "loss": 0.5002, "step": 90050 }, { "epoch": 0.901, "grad_norm": 33.90731430053711, "learning_rate": 6.3976e-07, "loss": 0.4587, "step": 90100 }, { "epoch": 0.9015, "grad_norm": 17.363262176513672, "learning_rate": 6.395599999999999e-07, "loss": 0.3567, "step": 90150 }, { "epoch": 0.902, "grad_norm": 88.20648193359375, "learning_rate": 6.3936e-07, "loss": 0.3825, "step": 90200 }, { "epoch": 0.9025, "grad_norm": 105.39450073242188, "learning_rate": 6.391599999999999e-07, "loss": 0.4482, "step": 90250 }, { "epoch": 0.903, "grad_norm": 41.72043991088867, "learning_rate": 6.389599999999999e-07, "loss": 0.4216, "step": 90300 }, { "epoch": 0.9035, "grad_norm": 52.67734909057617, "learning_rate": 6.3876e-07, "loss": 0.5691, "step": 90350 }, { "epoch": 0.904, "grad_norm": 2.5297904014587402, "learning_rate": 6.3856e-07, "loss": 0.4965, "step": 90400 }, { "epoch": 0.9045, "grad_norm": 100.61029815673828, "learning_rate": 6.383600000000001e-07, "loss": 0.3907, "step": 90450 }, { "epoch": 0.905, "grad_norm": 123.27056884765625, "learning_rate": 6.381599999999999e-07, "loss": 0.6531, "step": 90500 }, { "epoch": 0.9055, "grad_norm": 57.43628692626953, "learning_rate": 6.379599999999999e-07, "loss": 0.5612, "step": 90550 }, { "epoch": 0.906, "grad_norm": 6.510050296783447, "learning_rate": 6.3776e-07, "loss": 0.6138, "step": 90600 }, { "epoch": 0.9065, "grad_norm": 71.36262512207031, "learning_rate": 6.3756e-07, "loss": 0.3697, "step": 90650 }, { "epoch": 0.907, "grad_norm": 92.18431091308594, "learning_rate": 6.3736e-07, "loss": 0.4635, "step": 90700 }, { "epoch": 0.9075, "grad_norm": 73.8544692993164, "learning_rate": 6.3716e-07, "loss": 0.4039, "step": 90750 }, { "epoch": 0.908, "grad_norm": 84.32909393310547, "learning_rate": 6.3696e-07, "loss": 0.5271, "step": 90800 }, { "epoch": 0.9085, "grad_norm": 71.52722930908203, "learning_rate": 6.367599999999999e-07, "loss": 0.4169, "step": 90850 }, { "epoch": 0.909, "grad_norm": 10.915364265441895, "learning_rate": 6.36564e-07, "loss": 0.467, "step": 90900 }, { "epoch": 0.9095, "grad_norm": 74.15825653076172, "learning_rate": 6.36364e-07, "loss": 0.4426, "step": 90950 }, { "epoch": 0.91, "grad_norm": 91.29158020019531, "learning_rate": 6.361639999999999e-07, "loss": 0.6179, "step": 91000 }, { "epoch": 0.9105, "grad_norm": 0.4594174027442932, "learning_rate": 6.35964e-07, "loss": 0.5343, "step": 91050 }, { "epoch": 0.911, "grad_norm": 124.07490539550781, "learning_rate": 6.35764e-07, "loss": 0.4171, "step": 91100 }, { "epoch": 0.9115, "grad_norm": 43.403846740722656, "learning_rate": 6.35564e-07, "loss": 0.5054, "step": 91150 }, { "epoch": 0.912, "grad_norm": 43.1138801574707, "learning_rate": 6.35364e-07, "loss": 0.4225, "step": 91200 }, { "epoch": 0.9125, "grad_norm": 117.5290756225586, "learning_rate": 6.351639999999999e-07, "loss": 0.5131, "step": 91250 }, { "epoch": 0.913, "grad_norm": 67.10313415527344, "learning_rate": 6.349679999999999e-07, "loss": 0.3869, "step": 91300 }, { "epoch": 0.9135, "grad_norm": 12.4064359664917, "learning_rate": 6.34768e-07, "loss": 0.4939, "step": 91350 }, { "epoch": 0.914, "grad_norm": 20.02683448791504, "learning_rate": 6.34568e-07, "loss": 0.5706, "step": 91400 }, { "epoch": 0.9145, "grad_norm": 64.59272766113281, "learning_rate": 6.34368e-07, "loss": 0.3377, "step": 91450 }, { "epoch": 0.915, "grad_norm": 2.4483537673950195, "learning_rate": 6.34168e-07, "loss": 0.349, "step": 91500 }, { "epoch": 0.9155, "grad_norm": 0.7086917757987976, "learning_rate": 6.33968e-07, "loss": 0.4049, "step": 91550 }, { "epoch": 0.916, "grad_norm": 17.961669921875, "learning_rate": 6.337679999999999e-07, "loss": 0.3951, "step": 91600 }, { "epoch": 0.9165, "grad_norm": 62.844703674316406, "learning_rate": 6.33568e-07, "loss": 0.4574, "step": 91650 }, { "epoch": 0.917, "grad_norm": 4.404021263122559, "learning_rate": 6.33372e-07, "loss": 0.4018, "step": 91700 }, { "epoch": 0.9175, "grad_norm": 1.865087866783142, "learning_rate": 6.331719999999999e-07, "loss": 0.5116, "step": 91750 }, { "epoch": 0.918, "grad_norm": 49.26223373413086, "learning_rate": 6.32976e-07, "loss": 0.5372, "step": 91800 }, { "epoch": 0.9185, "grad_norm": 87.36351013183594, "learning_rate": 6.32776e-07, "loss": 0.594, "step": 91850 }, { "epoch": 0.919, "grad_norm": 13.864863395690918, "learning_rate": 6.32576e-07, "loss": 0.385, "step": 91900 }, { "epoch": 0.9195, "grad_norm": 85.708251953125, "learning_rate": 6.32376e-07, "loss": 0.4528, "step": 91950 }, { "epoch": 0.92, "grad_norm": 66.8047866821289, "learning_rate": 6.321759999999999e-07, "loss": 0.5978, "step": 92000 }, { "epoch": 0.9205, "grad_norm": 91.00402069091797, "learning_rate": 6.319759999999999e-07, "loss": 0.512, "step": 92050 }, { "epoch": 0.921, "grad_norm": 56.01173400878906, "learning_rate": 6.31776e-07, "loss": 0.5682, "step": 92100 }, { "epoch": 0.9215, "grad_norm": 71.80009460449219, "learning_rate": 6.31576e-07, "loss": 0.5471, "step": 92150 }, { "epoch": 0.922, "grad_norm": 49.644187927246094, "learning_rate": 6.31376e-07, "loss": 0.6007, "step": 92200 }, { "epoch": 0.9225, "grad_norm": 6.2205119132995605, "learning_rate": 6.31176e-07, "loss": 0.3881, "step": 92250 }, { "epoch": 0.923, "grad_norm": 34.30057907104492, "learning_rate": 6.309759999999999e-07, "loss": 0.4156, "step": 92300 }, { "epoch": 0.9235, "grad_norm": 65.61951446533203, "learning_rate": 6.307759999999999e-07, "loss": 0.5385, "step": 92350 }, { "epoch": 0.924, "grad_norm": 96.61206817626953, "learning_rate": 6.30576e-07, "loss": 0.5514, "step": 92400 }, { "epoch": 0.9245, "grad_norm": 21.83513641357422, "learning_rate": 6.30376e-07, "loss": 0.5608, "step": 92450 }, { "epoch": 0.925, "grad_norm": 17.049848556518555, "learning_rate": 6.301759999999999e-07, "loss": 0.4524, "step": 92500 }, { "epoch": 0.9255, "grad_norm": 19.969083786010742, "learning_rate": 6.29976e-07, "loss": 0.5688, "step": 92550 }, { "epoch": 0.926, "grad_norm": 52.34061050415039, "learning_rate": 6.29776e-07, "loss": 0.476, "step": 92600 }, { "epoch": 0.9265, "grad_norm": 86.44994354248047, "learning_rate": 6.295759999999999e-07, "loss": 0.3973, "step": 92650 }, { "epoch": 0.927, "grad_norm": 19.09813117980957, "learning_rate": 6.29376e-07, "loss": 0.4407, "step": 92700 }, { "epoch": 0.9275, "grad_norm": 17.017074584960938, "learning_rate": 6.291759999999999e-07, "loss": 0.4964, "step": 92750 }, { "epoch": 0.928, "grad_norm": 68.21665954589844, "learning_rate": 6.289759999999999e-07, "loss": 0.4977, "step": 92800 }, { "epoch": 0.9285, "grad_norm": 15.754688262939453, "learning_rate": 6.28776e-07, "loss": 0.5211, "step": 92850 }, { "epoch": 0.929, "grad_norm": 39.87798309326172, "learning_rate": 6.28576e-07, "loss": 0.4764, "step": 92900 }, { "epoch": 0.9295, "grad_norm": 66.07242584228516, "learning_rate": 6.283760000000001e-07, "loss": 0.4276, "step": 92950 }, { "epoch": 0.93, "grad_norm": 68.16017150878906, "learning_rate": 6.281759999999999e-07, "loss": 0.4338, "step": 93000 }, { "epoch": 0.9305, "grad_norm": 92.84886932373047, "learning_rate": 6.279759999999999e-07, "loss": 0.3994, "step": 93050 }, { "epoch": 0.931, "grad_norm": 7.953521251678467, "learning_rate": 6.27776e-07, "loss": 0.5969, "step": 93100 }, { "epoch": 0.9315, "grad_norm": 6.302267551422119, "learning_rate": 6.27576e-07, "loss": 0.5746, "step": 93150 }, { "epoch": 0.932, "grad_norm": 57.08115005493164, "learning_rate": 6.27376e-07, "loss": 0.6096, "step": 93200 }, { "epoch": 0.9325, "grad_norm": 74.17098236083984, "learning_rate": 6.27176e-07, "loss": 0.5127, "step": 93250 }, { "epoch": 0.933, "grad_norm": 3.4939818382263184, "learning_rate": 6.26976e-07, "loss": 0.4822, "step": 93300 }, { "epoch": 0.9335, "grad_norm": 15.249403953552246, "learning_rate": 6.26776e-07, "loss": 0.4677, "step": 93350 }, { "epoch": 0.934, "grad_norm": 8.158519744873047, "learning_rate": 6.26576e-07, "loss": 0.637, "step": 93400 }, { "epoch": 0.9345, "grad_norm": 3.797173023223877, "learning_rate": 6.26376e-07, "loss": 0.6001, "step": 93450 }, { "epoch": 0.935, "grad_norm": 9.297698020935059, "learning_rate": 6.261759999999999e-07, "loss": 0.4783, "step": 93500 }, { "epoch": 0.9355, "grad_norm": 97.72147369384766, "learning_rate": 6.25976e-07, "loss": 0.5935, "step": 93550 }, { "epoch": 0.936, "grad_norm": 30.139896392822266, "learning_rate": 6.25776e-07, "loss": 0.4528, "step": 93600 }, { "epoch": 0.9365, "grad_norm": 93.17426300048828, "learning_rate": 6.25576e-07, "loss": 0.519, "step": 93650 }, { "epoch": 0.937, "grad_norm": 105.38163757324219, "learning_rate": 6.253760000000001e-07, "loss": 0.5527, "step": 93700 }, { "epoch": 0.9375, "grad_norm": 68.39093780517578, "learning_rate": 6.251759999999999e-07, "loss": 0.4141, "step": 93750 }, { "epoch": 0.938, "grad_norm": 70.97985076904297, "learning_rate": 6.249759999999999e-07, "loss": 0.4535, "step": 93800 }, { "epoch": 0.9385, "grad_norm": 74.9505615234375, "learning_rate": 6.24776e-07, "loss": 0.5361, "step": 93850 }, { "epoch": 0.939, "grad_norm": 21.2496395111084, "learning_rate": 6.24576e-07, "loss": 0.4383, "step": 93900 }, { "epoch": 0.9395, "grad_norm": 69.52516174316406, "learning_rate": 6.24376e-07, "loss": 0.3776, "step": 93950 }, { "epoch": 0.94, "grad_norm": 42.33454132080078, "learning_rate": 6.24176e-07, "loss": 0.4317, "step": 94000 }, { "epoch": 0.9405, "grad_norm": 3.589440107345581, "learning_rate": 6.23976e-07, "loss": 0.4107, "step": 94050 }, { "epoch": 0.941, "grad_norm": 108.18557739257812, "learning_rate": 6.237759999999999e-07, "loss": 0.4974, "step": 94100 }, { "epoch": 0.9415, "grad_norm": 17.219356536865234, "learning_rate": 6.23576e-07, "loss": 0.4116, "step": 94150 }, { "epoch": 0.942, "grad_norm": 19.18585777282715, "learning_rate": 6.23376e-07, "loss": 0.4121, "step": 94200 }, { "epoch": 0.9425, "grad_norm": 118.1676254272461, "learning_rate": 6.231759999999999e-07, "loss": 0.5545, "step": 94250 }, { "epoch": 0.943, "grad_norm": 47.58047103881836, "learning_rate": 6.22976e-07, "loss": 0.4113, "step": 94300 }, { "epoch": 0.9435, "grad_norm": 81.94721221923828, "learning_rate": 6.22776e-07, "loss": 0.5893, "step": 94350 }, { "epoch": 0.944, "grad_norm": 4.948315143585205, "learning_rate": 6.22576e-07, "loss": 0.3997, "step": 94400 }, { "epoch": 0.9445, "grad_norm": 14.120973587036133, "learning_rate": 6.22376e-07, "loss": 0.4992, "step": 94450 }, { "epoch": 0.945, "grad_norm": 63.99724197387695, "learning_rate": 6.221759999999999e-07, "loss": 0.5603, "step": 94500 }, { "epoch": 0.9455, "grad_norm": 46.79153060913086, "learning_rate": 6.219759999999999e-07, "loss": 0.4607, "step": 94550 }, { "epoch": 0.946, "grad_norm": 8.22097110748291, "learning_rate": 6.21776e-07, "loss": 0.557, "step": 94600 }, { "epoch": 0.9465, "grad_norm": 98.2745361328125, "learning_rate": 6.21576e-07, "loss": 0.5769, "step": 94650 }, { "epoch": 0.947, "grad_norm": 40.03475570678711, "learning_rate": 6.21376e-07, "loss": 0.5056, "step": 94700 }, { "epoch": 0.9475, "grad_norm": 74.36143493652344, "learning_rate": 6.21176e-07, "loss": 0.4523, "step": 94750 }, { "epoch": 0.948, "grad_norm": 51.59547805786133, "learning_rate": 6.209759999999999e-07, "loss": 0.5666, "step": 94800 }, { "epoch": 0.9485, "grad_norm": 61.492488861083984, "learning_rate": 6.207759999999999e-07, "loss": 0.536, "step": 94850 }, { "epoch": 0.949, "grad_norm": 3.265573024749756, "learning_rate": 6.20576e-07, "loss": 0.3787, "step": 94900 }, { "epoch": 0.9495, "grad_norm": 29.715877532958984, "learning_rate": 6.20376e-07, "loss": 0.5144, "step": 94950 }, { "epoch": 0.95, "grad_norm": 4.610234260559082, "learning_rate": 6.201759999999999e-07, "loss": 0.4496, "step": 95000 }, { "epoch": 0.9505, "grad_norm": 3.437190532684326, "learning_rate": 6.19976e-07, "loss": 0.5684, "step": 95050 }, { "epoch": 0.951, "grad_norm": 48.969093322753906, "learning_rate": 6.19776e-07, "loss": 0.4846, "step": 95100 }, { "epoch": 0.9515, "grad_norm": 72.4744873046875, "learning_rate": 6.195759999999999e-07, "loss": 0.4036, "step": 95150 }, { "epoch": 0.952, "grad_norm": 2.2708630561828613, "learning_rate": 6.19376e-07, "loss": 0.4133, "step": 95200 }, { "epoch": 0.9525, "grad_norm": 4.139185905456543, "learning_rate": 6.191759999999999e-07, "loss": 0.5245, "step": 95250 }, { "epoch": 0.953, "grad_norm": 131.5601806640625, "learning_rate": 6.189759999999999e-07, "loss": 0.3855, "step": 95300 }, { "epoch": 0.9535, "grad_norm": 27.90871238708496, "learning_rate": 6.18776e-07, "loss": 0.4754, "step": 95350 }, { "epoch": 0.954, "grad_norm": 82.64385986328125, "learning_rate": 6.18576e-07, "loss": 0.4861, "step": 95400 }, { "epoch": 0.9545, "grad_norm": 51.01675033569336, "learning_rate": 6.183760000000001e-07, "loss": 0.4428, "step": 95450 }, { "epoch": 0.955, "grad_norm": 56.236717224121094, "learning_rate": 6.181759999999999e-07, "loss": 0.5291, "step": 95500 }, { "epoch": 0.9555, "grad_norm": 62.634403228759766, "learning_rate": 6.179759999999999e-07, "loss": 0.4693, "step": 95550 }, { "epoch": 0.956, "grad_norm": 29.53840446472168, "learning_rate": 6.17776e-07, "loss": 0.5259, "step": 95600 }, { "epoch": 0.9565, "grad_norm": 41.582767486572266, "learning_rate": 6.17576e-07, "loss": 0.4315, "step": 95650 }, { "epoch": 0.957, "grad_norm": 20.49850845336914, "learning_rate": 6.17376e-07, "loss": 0.4245, "step": 95700 }, { "epoch": 0.9575, "grad_norm": 17.59610939025879, "learning_rate": 6.17176e-07, "loss": 0.4987, "step": 95750 }, { "epoch": 0.958, "grad_norm": 80.921630859375, "learning_rate": 6.16976e-07, "loss": 0.3194, "step": 95800 }, { "epoch": 0.9585, "grad_norm": 84.2488784790039, "learning_rate": 6.167759999999999e-07, "loss": 0.5606, "step": 95850 }, { "epoch": 0.959, "grad_norm": 0.6134811639785767, "learning_rate": 6.16576e-07, "loss": 0.4759, "step": 95900 }, { "epoch": 0.9595, "grad_norm": 15.289545059204102, "learning_rate": 6.16376e-07, "loss": 0.5179, "step": 95950 }, { "epoch": 0.96, "grad_norm": 41.45950698852539, "learning_rate": 6.161759999999999e-07, "loss": 0.6042, "step": 96000 }, { "epoch": 0.9605, "grad_norm": 2.4903767108917236, "learning_rate": 6.15976e-07, "loss": 0.6048, "step": 96050 }, { "epoch": 0.961, "grad_norm": 82.29048919677734, "learning_rate": 6.15776e-07, "loss": 0.4689, "step": 96100 }, { "epoch": 0.9615, "grad_norm": 10.290043830871582, "learning_rate": 6.15576e-07, "loss": 0.4352, "step": 96150 }, { "epoch": 0.962, "grad_norm": 56.273529052734375, "learning_rate": 6.15376e-07, "loss": 0.5958, "step": 96200 }, { "epoch": 0.9625, "grad_norm": 58.597835540771484, "learning_rate": 6.151759999999999e-07, "loss": 0.4868, "step": 96250 }, { "epoch": 0.963, "grad_norm": 15.38319206237793, "learning_rate": 6.149759999999999e-07, "loss": 0.4837, "step": 96300 }, { "epoch": 0.9635, "grad_norm": 40.752864837646484, "learning_rate": 6.14776e-07, "loss": 0.4492, "step": 96350 }, { "epoch": 0.964, "grad_norm": 7.580194473266602, "learning_rate": 6.14576e-07, "loss": 0.5223, "step": 96400 }, { "epoch": 0.9645, "grad_norm": 33.51999282836914, "learning_rate": 6.14376e-07, "loss": 0.5388, "step": 96450 }, { "epoch": 0.965, "grad_norm": 60.9385986328125, "learning_rate": 6.141760000000001e-07, "loss": 0.5853, "step": 96500 }, { "epoch": 0.9655, "grad_norm": 72.08675384521484, "learning_rate": 6.139759999999999e-07, "loss": 0.567, "step": 96550 }, { "epoch": 0.966, "grad_norm": 27.145662307739258, "learning_rate": 6.137759999999999e-07, "loss": 0.5532, "step": 96600 }, { "epoch": 0.9665, "grad_norm": 91.9208984375, "learning_rate": 6.1358e-07, "loss": 0.4526, "step": 96650 }, { "epoch": 0.967, "grad_norm": 6.773651123046875, "learning_rate": 6.1338e-07, "loss": 0.467, "step": 96700 }, { "epoch": 0.9675, "grad_norm": 54.172584533691406, "learning_rate": 6.131799999999999e-07, "loss": 0.5223, "step": 96750 }, { "epoch": 0.968, "grad_norm": 23.472658157348633, "learning_rate": 6.1298e-07, "loss": 0.4758, "step": 96800 }, { "epoch": 0.9685, "grad_norm": 104.0491714477539, "learning_rate": 6.1278e-07, "loss": 0.5201, "step": 96850 }, { "epoch": 0.969, "grad_norm": 11.634442329406738, "learning_rate": 6.125799999999999e-07, "loss": 0.4486, "step": 96900 }, { "epoch": 0.9695, "grad_norm": 24.342527389526367, "learning_rate": 6.1238e-07, "loss": 0.5842, "step": 96950 }, { "epoch": 0.97, "grad_norm": 15.33423900604248, "learning_rate": 6.121799999999999e-07, "loss": 0.4822, "step": 97000 }, { "epoch": 0.9705, "grad_norm": 73.634033203125, "learning_rate": 6.119799999999999e-07, "loss": 0.4194, "step": 97050 }, { "epoch": 0.971, "grad_norm": 90.969970703125, "learning_rate": 6.1178e-07, "loss": 0.6029, "step": 97100 }, { "epoch": 0.9715, "grad_norm": 17.80976104736328, "learning_rate": 6.1158e-07, "loss": 0.4477, "step": 97150 }, { "epoch": 0.972, "grad_norm": 81.571533203125, "learning_rate": 6.1138e-07, "loss": 0.7005, "step": 97200 }, { "epoch": 0.9725, "grad_norm": 41.37546157836914, "learning_rate": 6.111799999999999e-07, "loss": 0.5194, "step": 97250 }, { "epoch": 0.973, "grad_norm": 100.39408874511719, "learning_rate": 6.109799999999999e-07, "loss": 0.3624, "step": 97300 }, { "epoch": 0.9735, "grad_norm": 65.12748718261719, "learning_rate": 6.107799999999999e-07, "loss": 0.5751, "step": 97350 }, { "epoch": 0.974, "grad_norm": 33.728546142578125, "learning_rate": 6.1058e-07, "loss": 0.521, "step": 97400 }, { "epoch": 0.9745, "grad_norm": 69.31006622314453, "learning_rate": 6.10384e-07, "loss": 0.4695, "step": 97450 }, { "epoch": 0.975, "grad_norm": 38.2792854309082, "learning_rate": 6.101839999999999e-07, "loss": 0.3575, "step": 97500 }, { "epoch": 0.9755, "grad_norm": 69.43871307373047, "learning_rate": 6.09984e-07, "loss": 0.3759, "step": 97550 }, { "epoch": 0.976, "grad_norm": 68.89128112792969, "learning_rate": 6.09784e-07, "loss": 0.4355, "step": 97600 }, { "epoch": 0.9765, "grad_norm": 41.74790573120117, "learning_rate": 6.095839999999999e-07, "loss": 0.6918, "step": 97650 }, { "epoch": 0.977, "grad_norm": 12.850433349609375, "learning_rate": 6.09384e-07, "loss": 0.556, "step": 97700 }, { "epoch": 0.9775, "grad_norm": 43.75520324707031, "learning_rate": 6.091839999999999e-07, "loss": 0.4272, "step": 97750 }, { "epoch": 0.978, "grad_norm": 9.665956497192383, "learning_rate": 6.089839999999999e-07, "loss": 0.4407, "step": 97800 }, { "epoch": 0.9785, "grad_norm": 22.188209533691406, "learning_rate": 6.08784e-07, "loss": 0.4733, "step": 97850 }, { "epoch": 0.979, "grad_norm": 108.53839111328125, "learning_rate": 6.08584e-07, "loss": 0.5335, "step": 97900 }, { "epoch": 0.9795, "grad_norm": 58.352508544921875, "learning_rate": 6.083840000000001e-07, "loss": 0.478, "step": 97950 }, { "epoch": 0.98, "grad_norm": 57.22300338745117, "learning_rate": 6.081839999999999e-07, "loss": 0.543, "step": 98000 }, { "epoch": 0.9805, "grad_norm": 42.58542251586914, "learning_rate": 6.079839999999999e-07, "loss": 0.4557, "step": 98050 }, { "epoch": 0.981, "grad_norm": 73.4317626953125, "learning_rate": 6.07784e-07, "loss": 0.4937, "step": 98100 }, { "epoch": 0.9815, "grad_norm": 24.243736267089844, "learning_rate": 6.07584e-07, "loss": 0.4762, "step": 98150 }, { "epoch": 0.982, "grad_norm": 0.41077935695648193, "learning_rate": 6.07384e-07, "loss": 0.5536, "step": 98200 }, { "epoch": 0.9825, "grad_norm": 3.0135037899017334, "learning_rate": 6.07184e-07, "loss": 0.3889, "step": 98250 }, { "epoch": 0.983, "grad_norm": 27.06598472595215, "learning_rate": 6.06984e-07, "loss": 0.5288, "step": 98300 }, { "epoch": 0.9835, "grad_norm": 79.37501525878906, "learning_rate": 6.067839999999999e-07, "loss": 0.5145, "step": 98350 }, { "epoch": 0.984, "grad_norm": 10.967093467712402, "learning_rate": 6.06584e-07, "loss": 0.5057, "step": 98400 }, { "epoch": 0.9845, "grad_norm": 86.88298797607422, "learning_rate": 6.06384e-07, "loss": 0.4921, "step": 98450 }, { "epoch": 0.985, "grad_norm": 83.58354187011719, "learning_rate": 6.061839999999999e-07, "loss": 0.6873, "step": 98500 }, { "epoch": 0.9855, "grad_norm": 54.29396057128906, "learning_rate": 6.05984e-07, "loss": 0.5628, "step": 98550 }, { "epoch": 0.986, "grad_norm": 93.75994873046875, "learning_rate": 6.05784e-07, "loss": 0.4604, "step": 98600 }, { "epoch": 0.9865, "grad_norm": 26.21271324157715, "learning_rate": 6.05584e-07, "loss": 0.47, "step": 98650 }, { "epoch": 0.987, "grad_norm": 75.35508728027344, "learning_rate": 6.05384e-07, "loss": 0.4449, "step": 98700 }, { "epoch": 0.9875, "grad_norm": 34.0965690612793, "learning_rate": 6.05184e-07, "loss": 0.4985, "step": 98750 }, { "epoch": 0.988, "grad_norm": 15.451560974121094, "learning_rate": 6.049839999999999e-07, "loss": 0.4956, "step": 98800 }, { "epoch": 0.9885, "grad_norm": 6.660648822784424, "learning_rate": 6.04784e-07, "loss": 0.3919, "step": 98850 }, { "epoch": 0.989, "grad_norm": 39.9170036315918, "learning_rate": 6.04584e-07, "loss": 0.497, "step": 98900 }, { "epoch": 0.9895, "grad_norm": 33.44998550415039, "learning_rate": 6.04384e-07, "loss": 0.6224, "step": 98950 }, { "epoch": 0.99, "grad_norm": 28.230859756469727, "learning_rate": 6.041840000000001e-07, "loss": 0.4241, "step": 99000 }, { "epoch": 0.9905, "grad_norm": 104.45681762695312, "learning_rate": 6.039839999999999e-07, "loss": 0.4589, "step": 99050 }, { "epoch": 0.991, "grad_norm": 8.24346923828125, "learning_rate": 6.037839999999999e-07, "loss": 0.4088, "step": 99100 }, { "epoch": 0.9915, "grad_norm": 73.47013092041016, "learning_rate": 6.03584e-07, "loss": 0.4545, "step": 99150 }, { "epoch": 0.992, "grad_norm": 88.34424591064453, "learning_rate": 6.03384e-07, "loss": 0.4967, "step": 99200 }, { "epoch": 0.9925, "grad_norm": 19.245553970336914, "learning_rate": 6.03184e-07, "loss": 0.3781, "step": 99250 }, { "epoch": 0.993, "grad_norm": 24.192604064941406, "learning_rate": 6.02984e-07, "loss": 0.5745, "step": 99300 }, { "epoch": 0.9935, "grad_norm": 30.267501831054688, "learning_rate": 6.02784e-07, "loss": 0.4486, "step": 99350 }, { "epoch": 0.994, "grad_norm": 102.20484161376953, "learning_rate": 6.025839999999999e-07, "loss": 0.5407, "step": 99400 }, { "epoch": 0.9945, "grad_norm": 65.89080810546875, "learning_rate": 6.02384e-07, "loss": 0.5351, "step": 99450 }, { "epoch": 0.995, "grad_norm": 84.1611557006836, "learning_rate": 6.02184e-07, "loss": 0.4555, "step": 99500 }, { "epoch": 0.9955, "grad_norm": 4.047856330871582, "learning_rate": 6.019839999999999e-07, "loss": 0.4759, "step": 99550 }, { "epoch": 0.996, "grad_norm": 8.221104621887207, "learning_rate": 6.01784e-07, "loss": 0.3095, "step": 99600 }, { "epoch": 0.9965, "grad_norm": 56.98789978027344, "learning_rate": 6.01584e-07, "loss": 0.4643, "step": 99650 }, { "epoch": 0.997, "grad_norm": 12.39257526397705, "learning_rate": 6.01384e-07, "loss": 0.3251, "step": 99700 }, { "epoch": 0.9975, "grad_norm": 20.75105094909668, "learning_rate": 6.01184e-07, "loss": 0.5136, "step": 99750 }, { "epoch": 0.998, "grad_norm": 47.02289962768555, "learning_rate": 6.009839999999999e-07, "loss": 0.4591, "step": 99800 }, { "epoch": 0.9985, "grad_norm": 97.42501831054688, "learning_rate": 6.007839999999999e-07, "loss": 0.5596, "step": 99850 }, { "epoch": 0.999, "grad_norm": 66.5621566772461, "learning_rate": 6.00584e-07, "loss": 0.4794, "step": 99900 }, { "epoch": 0.9995, "grad_norm": 9.777603149414062, "learning_rate": 6.00384e-07, "loss": 0.4818, "step": 99950 }, { "epoch": 1.0, "grad_norm": 66.84056854248047, "learning_rate": 6.001840000000001e-07, "loss": 0.4902, "step": 100000 }, { "epoch": 1.0005, "grad_norm": 0.26840323209762573, "learning_rate": 5.99984e-07, "loss": 0.4989, "step": 100050 }, { "epoch": 1.001, "grad_norm": 1.4899623394012451, "learning_rate": 5.997839999999999e-07, "loss": 0.466, "step": 100100 }, { "epoch": 1.0015, "grad_norm": 56.374874114990234, "learning_rate": 5.99584e-07, "loss": 0.5739, "step": 100150 }, { "epoch": 1.002, "grad_norm": 28.144792556762695, "learning_rate": 5.99384e-07, "loss": 0.5244, "step": 100200 }, { "epoch": 1.0025, "grad_norm": 41.527069091796875, "learning_rate": 5.99184e-07, "loss": 0.4076, "step": 100250 }, { "epoch": 1.003, "grad_norm": 24.504615783691406, "learning_rate": 5.98984e-07, "loss": 0.4945, "step": 100300 }, { "epoch": 1.0035, "grad_norm": 59.61265563964844, "learning_rate": 5.98784e-07, "loss": 0.5471, "step": 100350 }, { "epoch": 1.004, "grad_norm": 4.118966102600098, "learning_rate": 5.98584e-07, "loss": 0.5698, "step": 100400 }, { "epoch": 1.0045, "grad_norm": 83.80474853515625, "learning_rate": 5.98384e-07, "loss": 0.4028, "step": 100450 }, { "epoch": 1.005, "grad_norm": 82.36299133300781, "learning_rate": 5.98184e-07, "loss": 0.3842, "step": 100500 }, { "epoch": 1.0055, "grad_norm": 26.00364112854004, "learning_rate": 5.979839999999999e-07, "loss": 0.4643, "step": 100550 }, { "epoch": 1.006, "grad_norm": 3.6861441135406494, "learning_rate": 5.97784e-07, "loss": 0.5716, "step": 100600 }, { "epoch": 1.0065, "grad_norm": 9.791892051696777, "learning_rate": 5.97584e-07, "loss": 0.4626, "step": 100650 }, { "epoch": 1.007, "grad_norm": 32.55017852783203, "learning_rate": 5.97384e-07, "loss": 0.3589, "step": 100700 }, { "epoch": 1.0075, "grad_norm": 102.46571350097656, "learning_rate": 5.971840000000001e-07, "loss": 0.5368, "step": 100750 }, { "epoch": 1.008, "grad_norm": 3.5553598403930664, "learning_rate": 5.969839999999999e-07, "loss": 0.4756, "step": 100800 }, { "epoch": 1.0085, "grad_norm": 84.53421020507812, "learning_rate": 5.967839999999999e-07, "loss": 0.4855, "step": 100850 }, { "epoch": 1.009, "grad_norm": 50.297672271728516, "learning_rate": 5.96584e-07, "loss": 0.4461, "step": 100900 }, { "epoch": 1.0095, "grad_norm": 46.95347213745117, "learning_rate": 5.96384e-07, "loss": 0.2661, "step": 100950 }, { "epoch": 1.01, "grad_norm": 46.71750259399414, "learning_rate": 5.96184e-07, "loss": 0.5056, "step": 101000 }, { "epoch": 1.0105, "grad_norm": 65.1976547241211, "learning_rate": 5.95984e-07, "loss": 0.4613, "step": 101050 }, { "epoch": 1.011, "grad_norm": 53.80318069458008, "learning_rate": 5.95784e-07, "loss": 0.4472, "step": 101100 }, { "epoch": 1.0115, "grad_norm": 82.39476776123047, "learning_rate": 5.955839999999999e-07, "loss": 0.5717, "step": 101150 }, { "epoch": 1.012, "grad_norm": 39.730159759521484, "learning_rate": 5.95384e-07, "loss": 0.4549, "step": 101200 }, { "epoch": 1.0125, "grad_norm": 106.95108795166016, "learning_rate": 5.95184e-07, "loss": 0.489, "step": 101250 }, { "epoch": 1.013, "grad_norm": 48.25342559814453, "learning_rate": 5.949839999999999e-07, "loss": 0.4537, "step": 101300 }, { "epoch": 1.0135, "grad_norm": 2.1384692192077637, "learning_rate": 5.94784e-07, "loss": 0.6137, "step": 101350 }, { "epoch": 1.014, "grad_norm": 59.659515380859375, "learning_rate": 5.94584e-07, "loss": 0.4321, "step": 101400 }, { "epoch": 1.0145, "grad_norm": 60.75330352783203, "learning_rate": 5.94384e-07, "loss": 0.4137, "step": 101450 }, { "epoch": 1.015, "grad_norm": 84.83941650390625, "learning_rate": 5.94184e-07, "loss": 0.7533, "step": 101500 }, { "epoch": 1.0155, "grad_norm": 13.40371036529541, "learning_rate": 5.939839999999999e-07, "loss": 0.4591, "step": 101550 }, { "epoch": 1.016, "grad_norm": 38.117313385009766, "learning_rate": 5.937839999999999e-07, "loss": 0.5045, "step": 101600 }, { "epoch": 1.0165, "grad_norm": 1.0856471061706543, "learning_rate": 5.93584e-07, "loss": 0.3855, "step": 101650 }, { "epoch": 1.017, "grad_norm": 1.0429091453552246, "learning_rate": 5.93384e-07, "loss": 0.4512, "step": 101700 }, { "epoch": 1.0175, "grad_norm": 19.322357177734375, "learning_rate": 5.93184e-07, "loss": 0.3774, "step": 101750 }, { "epoch": 1.018, "grad_norm": 104.85911560058594, "learning_rate": 5.92984e-07, "loss": 0.4906, "step": 101800 }, { "epoch": 1.0185, "grad_norm": 48.950130462646484, "learning_rate": 5.927839999999999e-07, "loss": 0.4659, "step": 101850 }, { "epoch": 1.019, "grad_norm": 40.40967559814453, "learning_rate": 5.925839999999999e-07, "loss": 0.4826, "step": 101900 }, { "epoch": 1.0195, "grad_norm": 32.901214599609375, "learning_rate": 5.92384e-07, "loss": 0.3543, "step": 101950 }, { "epoch": 1.02, "grad_norm": 22.917282104492188, "learning_rate": 5.92184e-07, "loss": 0.3971, "step": 102000 }, { "epoch": 1.0205, "grad_norm": 70.27293395996094, "learning_rate": 5.919839999999999e-07, "loss": 0.4203, "step": 102050 }, { "epoch": 1.021, "grad_norm": 58.08955001831055, "learning_rate": 5.91784e-07, "loss": 0.4806, "step": 102100 }, { "epoch": 1.0215, "grad_norm": 0.6320064663887024, "learning_rate": 5.91584e-07, "loss": 0.4064, "step": 102150 }, { "epoch": 1.022, "grad_norm": 13.316550254821777, "learning_rate": 5.913839999999999e-07, "loss": 0.4039, "step": 102200 }, { "epoch": 1.0225, "grad_norm": 86.25184631347656, "learning_rate": 5.91184e-07, "loss": 0.389, "step": 102250 }, { "epoch": 1.023, "grad_norm": 11.370412826538086, "learning_rate": 5.909839999999999e-07, "loss": 0.4349, "step": 102300 }, { "epoch": 1.0235, "grad_norm": 7.496046543121338, "learning_rate": 5.907839999999999e-07, "loss": 0.4676, "step": 102350 }, { "epoch": 1.024, "grad_norm": 15.750993728637695, "learning_rate": 5.90584e-07, "loss": 0.5462, "step": 102400 }, { "epoch": 1.0245, "grad_norm": 108.76589965820312, "learning_rate": 5.90384e-07, "loss": 0.4425, "step": 102450 }, { "epoch": 1.025, "grad_norm": 85.9919204711914, "learning_rate": 5.901840000000001e-07, "loss": 0.4755, "step": 102500 }, { "epoch": 1.0255, "grad_norm": 84.9974594116211, "learning_rate": 5.899839999999999e-07, "loss": 0.3698, "step": 102550 }, { "epoch": 1.026, "grad_norm": 53.898643493652344, "learning_rate": 5.897839999999999e-07, "loss": 0.5112, "step": 102600 }, { "epoch": 1.0265, "grad_norm": 50.896507263183594, "learning_rate": 5.89584e-07, "loss": 0.3905, "step": 102650 }, { "epoch": 1.027, "grad_norm": 78.91048431396484, "learning_rate": 5.89384e-07, "loss": 0.4481, "step": 102700 }, { "epoch": 1.0275, "grad_norm": 82.20804595947266, "learning_rate": 5.89184e-07, "loss": 0.4744, "step": 102750 }, { "epoch": 1.028, "grad_norm": 72.1124038696289, "learning_rate": 5.88984e-07, "loss": 0.5971, "step": 102800 }, { "epoch": 1.0285, "grad_norm": 6.744050979614258, "learning_rate": 5.88784e-07, "loss": 0.4036, "step": 102850 }, { "epoch": 1.029, "grad_norm": 102.9438247680664, "learning_rate": 5.885839999999999e-07, "loss": 0.3638, "step": 102900 }, { "epoch": 1.0295, "grad_norm": 91.29195404052734, "learning_rate": 5.88384e-07, "loss": 0.4288, "step": 102950 }, { "epoch": 1.03, "grad_norm": 10.420190811157227, "learning_rate": 5.88184e-07, "loss": 0.5381, "step": 103000 }, { "epoch": 1.0305, "grad_norm": 3.8758227825164795, "learning_rate": 5.879839999999999e-07, "loss": 0.4904, "step": 103050 }, { "epoch": 1.031, "grad_norm": 42.36983871459961, "learning_rate": 5.87784e-07, "loss": 0.4687, "step": 103100 }, { "epoch": 1.0315, "grad_norm": 4.480782508850098, "learning_rate": 5.87584e-07, "loss": 0.3783, "step": 103150 }, { "epoch": 1.032, "grad_norm": 16.480192184448242, "learning_rate": 5.87384e-07, "loss": 0.4104, "step": 103200 }, { "epoch": 1.0325, "grad_norm": 56.70964050292969, "learning_rate": 5.87184e-07, "loss": 0.4744, "step": 103250 }, { "epoch": 1.033, "grad_norm": 79.56087493896484, "learning_rate": 5.869839999999999e-07, "loss": 0.4447, "step": 103300 }, { "epoch": 1.0335, "grad_norm": 60.962284088134766, "learning_rate": 5.867839999999999e-07, "loss": 0.5019, "step": 103350 }, { "epoch": 1.034, "grad_norm": 58.27140808105469, "learning_rate": 5.86584e-07, "loss": 0.5927, "step": 103400 }, { "epoch": 1.0345, "grad_norm": 39.9395637512207, "learning_rate": 5.86384e-07, "loss": 0.4556, "step": 103450 }, { "epoch": 1.035, "grad_norm": 10.879745483398438, "learning_rate": 5.86184e-07, "loss": 0.5386, "step": 103500 }, { "epoch": 1.0355, "grad_norm": 14.985457420349121, "learning_rate": 5.85984e-07, "loss": 0.4561, "step": 103550 }, { "epoch": 1.036, "grad_norm": 0.901373565196991, "learning_rate": 5.85788e-07, "loss": 0.486, "step": 103600 }, { "epoch": 1.0365, "grad_norm": 76.16960906982422, "learning_rate": 5.855879999999999e-07, "loss": 0.5131, "step": 103650 }, { "epoch": 1.037, "grad_norm": 71.22393798828125, "learning_rate": 5.85388e-07, "loss": 0.6253, "step": 103700 }, { "epoch": 1.0375, "grad_norm": 72.47522735595703, "learning_rate": 5.85188e-07, "loss": 0.4038, "step": 103750 }, { "epoch": 1.038, "grad_norm": 35.06593322753906, "learning_rate": 5.849879999999999e-07, "loss": 0.4235, "step": 103800 }, { "epoch": 1.0385, "grad_norm": 58.12506866455078, "learning_rate": 5.84788e-07, "loss": 0.4734, "step": 103850 }, { "epoch": 1.039, "grad_norm": 70.32412719726562, "learning_rate": 5.84588e-07, "loss": 0.4489, "step": 103900 }, { "epoch": 1.0395, "grad_norm": 94.34310150146484, "learning_rate": 5.84388e-07, "loss": 0.5206, "step": 103950 }, { "epoch": 1.04, "grad_norm": 96.84242248535156, "learning_rate": 5.84188e-07, "loss": 0.4214, "step": 104000 }, { "epoch": 1.0405, "grad_norm": 50.48087692260742, "learning_rate": 5.839879999999999e-07, "loss": 0.4325, "step": 104050 }, { "epoch": 1.041, "grad_norm": 15.453256607055664, "learning_rate": 5.837879999999999e-07, "loss": 0.4099, "step": 104100 }, { "epoch": 1.0415, "grad_norm": 7.770763397216797, "learning_rate": 5.83588e-07, "loss": 0.5572, "step": 104150 }, { "epoch": 1.042, "grad_norm": 75.37487030029297, "learning_rate": 5.83388e-07, "loss": 0.6112, "step": 104200 }, { "epoch": 1.0425, "grad_norm": 3.0867295265197754, "learning_rate": 5.83188e-07, "loss": 0.4438, "step": 104250 }, { "epoch": 1.043, "grad_norm": 82.33965301513672, "learning_rate": 5.82988e-07, "loss": 0.5488, "step": 104300 }, { "epoch": 1.0435, "grad_norm": 108.06182861328125, "learning_rate": 5.827879999999999e-07, "loss": 0.4588, "step": 104350 }, { "epoch": 1.044, "grad_norm": 7.8545660972595215, "learning_rate": 5.825879999999999e-07, "loss": 0.4324, "step": 104400 }, { "epoch": 1.0445, "grad_norm": 105.07747650146484, "learning_rate": 5.82388e-07, "loss": 0.4492, "step": 104450 }, { "epoch": 1.045, "grad_norm": 50.81571578979492, "learning_rate": 5.82188e-07, "loss": 0.6157, "step": 104500 }, { "epoch": 1.0455, "grad_norm": 102.53087615966797, "learning_rate": 5.819879999999999e-07, "loss": 0.3951, "step": 104550 }, { "epoch": 1.046, "grad_norm": 40.119380950927734, "learning_rate": 5.81788e-07, "loss": 0.4466, "step": 104600 }, { "epoch": 1.0465, "grad_norm": 98.8756103515625, "learning_rate": 5.81588e-07, "loss": 0.4351, "step": 104650 }, { "epoch": 1.047, "grad_norm": 7.848785400390625, "learning_rate": 5.813879999999999e-07, "loss": 0.4286, "step": 104700 }, { "epoch": 1.0475, "grad_norm": 57.9787712097168, "learning_rate": 5.81188e-07, "loss": 0.4914, "step": 104750 }, { "epoch": 1.048, "grad_norm": 34.086063385009766, "learning_rate": 5.809879999999999e-07, "loss": 0.5624, "step": 104800 }, { "epoch": 1.0485, "grad_norm": 38.03075408935547, "learning_rate": 5.807879999999999e-07, "loss": 0.4121, "step": 104850 }, { "epoch": 1.049, "grad_norm": 113.10247039794922, "learning_rate": 5.80588e-07, "loss": 0.5234, "step": 104900 }, { "epoch": 1.0495, "grad_norm": 67.31755828857422, "learning_rate": 5.80388e-07, "loss": 0.4457, "step": 104950 }, { "epoch": 1.05, "grad_norm": 4.373214244842529, "learning_rate": 5.801880000000001e-07, "loss": 0.4674, "step": 105000 }, { "epoch": 1.0505, "grad_norm": 24.029144287109375, "learning_rate": 5.799879999999999e-07, "loss": 0.4393, "step": 105050 }, { "epoch": 1.051, "grad_norm": 76.45175170898438, "learning_rate": 5.797879999999999e-07, "loss": 0.6139, "step": 105100 }, { "epoch": 1.0515, "grad_norm": 90.8389663696289, "learning_rate": 5.79588e-07, "loss": 0.4096, "step": 105150 }, { "epoch": 1.052, "grad_norm": 52.22407531738281, "learning_rate": 5.79388e-07, "loss": 0.4983, "step": 105200 }, { "epoch": 1.0525, "grad_norm": 28.466354370117188, "learning_rate": 5.79188e-07, "loss": 0.4473, "step": 105250 }, { "epoch": 1.053, "grad_norm": 74.26171112060547, "learning_rate": 5.78988e-07, "loss": 0.5071, "step": 105300 }, { "epoch": 1.0535, "grad_norm": 45.36643981933594, "learning_rate": 5.78788e-07, "loss": 0.4621, "step": 105350 }, { "epoch": 1.054, "grad_norm": 22.525741577148438, "learning_rate": 5.785879999999999e-07, "loss": 0.5238, "step": 105400 }, { "epoch": 1.0545, "grad_norm": 25.078907012939453, "learning_rate": 5.78388e-07, "loss": 0.472, "step": 105450 }, { "epoch": 1.055, "grad_norm": 29.455020904541016, "learning_rate": 5.78188e-07, "loss": 0.3387, "step": 105500 }, { "epoch": 1.0555, "grad_norm": 4.957467555999756, "learning_rate": 5.779879999999999e-07, "loss": 0.4357, "step": 105550 }, { "epoch": 1.056, "grad_norm": 56.372745513916016, "learning_rate": 5.77788e-07, "loss": 0.5183, "step": 105600 }, { "epoch": 1.0565, "grad_norm": 1.047702670097351, "learning_rate": 5.77588e-07, "loss": 0.3755, "step": 105650 }, { "epoch": 1.057, "grad_norm": 26.11679458618164, "learning_rate": 5.77388e-07, "loss": 0.431, "step": 105700 }, { "epoch": 1.0575, "grad_norm": 81.7731704711914, "learning_rate": 5.77192e-07, "loss": 0.642, "step": 105750 }, { "epoch": 1.058, "grad_norm": 2.761612892150879, "learning_rate": 5.769919999999999e-07, "loss": 0.4243, "step": 105800 }, { "epoch": 1.0585, "grad_norm": 16.649229049682617, "learning_rate": 5.767919999999999e-07, "loss": 0.4652, "step": 105850 }, { "epoch": 1.059, "grad_norm": 74.75067138671875, "learning_rate": 5.76592e-07, "loss": 0.4756, "step": 105900 }, { "epoch": 1.0594999999999999, "grad_norm": 6.984798431396484, "learning_rate": 5.76392e-07, "loss": 0.4214, "step": 105950 }, { "epoch": 1.06, "grad_norm": 40.673606872558594, "learning_rate": 5.76192e-07, "loss": 0.5516, "step": 106000 }, { "epoch": 1.0605, "grad_norm": 111.4358901977539, "learning_rate": 5.75992e-07, "loss": 0.4571, "step": 106050 }, { "epoch": 1.061, "grad_norm": 22.08611297607422, "learning_rate": 5.757919999999999e-07, "loss": 0.4519, "step": 106100 }, { "epoch": 1.0615, "grad_norm": 6.3416666984558105, "learning_rate": 5.755919999999999e-07, "loss": 0.3882, "step": 106150 }, { "epoch": 1.062, "grad_norm": 18.259502410888672, "learning_rate": 5.75392e-07, "loss": 0.3961, "step": 106200 }, { "epoch": 1.0625, "grad_norm": 76.18441772460938, "learning_rate": 5.75192e-07, "loss": 0.4029, "step": 106250 }, { "epoch": 1.063, "grad_norm": 47.05423355102539, "learning_rate": 5.749919999999999e-07, "loss": 0.3685, "step": 106300 }, { "epoch": 1.0635, "grad_norm": 69.71207427978516, "learning_rate": 5.74792e-07, "loss": 0.4922, "step": 106350 }, { "epoch": 1.064, "grad_norm": 32.429664611816406, "learning_rate": 5.74592e-07, "loss": 0.6055, "step": 106400 }, { "epoch": 1.0645, "grad_norm": 105.841552734375, "learning_rate": 5.743919999999999e-07, "loss": 0.4959, "step": 106450 }, { "epoch": 1.065, "grad_norm": 65.44140625, "learning_rate": 5.74192e-07, "loss": 0.4779, "step": 106500 }, { "epoch": 1.0655000000000001, "grad_norm": 31.591449737548828, "learning_rate": 5.739919999999999e-07, "loss": 0.5328, "step": 106550 }, { "epoch": 1.066, "grad_norm": 39.16038513183594, "learning_rate": 5.737919999999999e-07, "loss": 0.4452, "step": 106600 }, { "epoch": 1.0665, "grad_norm": 0.7136451601982117, "learning_rate": 5.73592e-07, "loss": 0.4545, "step": 106650 }, { "epoch": 1.067, "grad_norm": 78.04670715332031, "learning_rate": 5.73392e-07, "loss": 0.5004, "step": 106700 }, { "epoch": 1.0675, "grad_norm": 77.70978546142578, "learning_rate": 5.73192e-07, "loss": 0.4688, "step": 106750 }, { "epoch": 1.068, "grad_norm": 2.930210590362549, "learning_rate": 5.729919999999999e-07, "loss": 0.504, "step": 106800 }, { "epoch": 1.0685, "grad_norm": 2.333472967147827, "learning_rate": 5.727919999999999e-07, "loss": 0.4608, "step": 106850 }, { "epoch": 1.069, "grad_norm": 96.40602111816406, "learning_rate": 5.725919999999999e-07, "loss": 0.4855, "step": 106900 }, { "epoch": 1.0695000000000001, "grad_norm": 147.89405822753906, "learning_rate": 5.72392e-07, "loss": 0.6196, "step": 106950 }, { "epoch": 1.07, "grad_norm": 1.7213540077209473, "learning_rate": 5.72192e-07, "loss": 0.4273, "step": 107000 }, { "epoch": 1.0705, "grad_norm": 105.5814437866211, "learning_rate": 5.719919999999999e-07, "loss": 0.5419, "step": 107050 }, { "epoch": 1.071, "grad_norm": 104.46412658691406, "learning_rate": 5.71792e-07, "loss": 0.4807, "step": 107100 }, { "epoch": 1.0715, "grad_norm": 51.38224792480469, "learning_rate": 5.71592e-07, "loss": 0.505, "step": 107150 }, { "epoch": 1.072, "grad_norm": 59.694828033447266, "learning_rate": 5.713919999999999e-07, "loss": 0.3088, "step": 107200 }, { "epoch": 1.0725, "grad_norm": 17.387590408325195, "learning_rate": 5.71192e-07, "loss": 0.5795, "step": 107250 }, { "epoch": 1.073, "grad_norm": 70.23807525634766, "learning_rate": 5.70992e-07, "loss": 0.4044, "step": 107300 }, { "epoch": 1.0735, "grad_norm": 10.148516654968262, "learning_rate": 5.707919999999999e-07, "loss": 0.4085, "step": 107350 }, { "epoch": 1.074, "grad_norm": 2.2347519397735596, "learning_rate": 5.70592e-07, "loss": 0.5751, "step": 107400 }, { "epoch": 1.0745, "grad_norm": 59.01057434082031, "learning_rate": 5.70392e-07, "loss": 0.4502, "step": 107450 }, { "epoch": 1.075, "grad_norm": 41.65180206298828, "learning_rate": 5.701920000000001e-07, "loss": 0.4107, "step": 107500 }, { "epoch": 1.0755, "grad_norm": 63.487892150878906, "learning_rate": 5.69992e-07, "loss": 0.4103, "step": 107550 }, { "epoch": 1.076, "grad_norm": 44.388309478759766, "learning_rate": 5.697919999999999e-07, "loss": 0.5648, "step": 107600 }, { "epoch": 1.0765, "grad_norm": 15.944334030151367, "learning_rate": 5.69592e-07, "loss": 0.434, "step": 107650 }, { "epoch": 1.077, "grad_norm": 69.580322265625, "learning_rate": 5.69392e-07, "loss": 0.4692, "step": 107700 }, { "epoch": 1.0775, "grad_norm": 7.532376289367676, "learning_rate": 5.69196e-07, "loss": 0.486, "step": 107750 }, { "epoch": 1.078, "grad_norm": 117.07515716552734, "learning_rate": 5.68996e-07, "loss": 0.5425, "step": 107800 }, { "epoch": 1.0785, "grad_norm": 74.83606719970703, "learning_rate": 5.68796e-07, "loss": 0.4081, "step": 107850 }, { "epoch": 1.079, "grad_norm": 66.38423919677734, "learning_rate": 5.685959999999999e-07, "loss": 0.4735, "step": 107900 }, { "epoch": 1.0795, "grad_norm": 18.5911808013916, "learning_rate": 5.68396e-07, "loss": 0.5111, "step": 107950 }, { "epoch": 1.08, "grad_norm": 41.88191223144531, "learning_rate": 5.68196e-07, "loss": 0.4556, "step": 108000 }, { "epoch": 1.0805, "grad_norm": 81.14200592041016, "learning_rate": 5.679959999999999e-07, "loss": 0.3716, "step": 108050 }, { "epoch": 1.081, "grad_norm": 35.856998443603516, "learning_rate": 5.67796e-07, "loss": 0.4, "step": 108100 }, { "epoch": 1.0815, "grad_norm": 43.836421966552734, "learning_rate": 5.67596e-07, "loss": 0.4874, "step": 108150 }, { "epoch": 1.082, "grad_norm": 84.13600158691406, "learning_rate": 5.67396e-07, "loss": 0.4165, "step": 108200 }, { "epoch": 1.0825, "grad_norm": 22.291542053222656, "learning_rate": 5.67196e-07, "loss": 0.5321, "step": 108250 }, { "epoch": 1.083, "grad_norm": 94.12808227539062, "learning_rate": 5.66996e-07, "loss": 0.624, "step": 108300 }, { "epoch": 1.0835, "grad_norm": 53.173072814941406, "learning_rate": 5.667959999999999e-07, "loss": 0.4123, "step": 108350 }, { "epoch": 1.084, "grad_norm": 78.43643188476562, "learning_rate": 5.66596e-07, "loss": 0.5123, "step": 108400 }, { "epoch": 1.0845, "grad_norm": 39.595787048339844, "learning_rate": 5.66396e-07, "loss": 0.4563, "step": 108450 }, { "epoch": 1.085, "grad_norm": 14.890007972717285, "learning_rate": 5.66196e-07, "loss": 0.4253, "step": 108500 }, { "epoch": 1.0855, "grad_norm": 91.4941635131836, "learning_rate": 5.659960000000001e-07, "loss": 0.3835, "step": 108550 }, { "epoch": 1.086, "grad_norm": 74.25895690917969, "learning_rate": 5.657959999999999e-07, "loss": 0.4304, "step": 108600 }, { "epoch": 1.0865, "grad_norm": 48.94237518310547, "learning_rate": 5.655959999999999e-07, "loss": 0.3277, "step": 108650 }, { "epoch": 1.087, "grad_norm": 56.9554328918457, "learning_rate": 5.65396e-07, "loss": 0.5296, "step": 108700 }, { "epoch": 1.0875, "grad_norm": 47.54026412963867, "learning_rate": 5.65196e-07, "loss": 0.3747, "step": 108750 }, { "epoch": 1.088, "grad_norm": 67.42796325683594, "learning_rate": 5.64996e-07, "loss": 0.4032, "step": 108800 }, { "epoch": 1.0885, "grad_norm": 39.55683898925781, "learning_rate": 5.64796e-07, "loss": 0.3196, "step": 108850 }, { "epoch": 1.089, "grad_norm": 96.23863220214844, "learning_rate": 5.64596e-07, "loss": 0.4986, "step": 108900 }, { "epoch": 1.0895, "grad_norm": 6.769164085388184, "learning_rate": 5.643959999999999e-07, "loss": 0.4685, "step": 108950 }, { "epoch": 1.09, "grad_norm": 23.501625061035156, "learning_rate": 5.64196e-07, "loss": 0.6104, "step": 109000 }, { "epoch": 1.0905, "grad_norm": 9.114831924438477, "learning_rate": 5.63996e-07, "loss": 0.5997, "step": 109050 }, { "epoch": 1.091, "grad_norm": 12.518210411071777, "learning_rate": 5.637959999999999e-07, "loss": 0.3271, "step": 109100 }, { "epoch": 1.0915, "grad_norm": 99.00508117675781, "learning_rate": 5.63596e-07, "loss": 0.5053, "step": 109150 }, { "epoch": 1.092, "grad_norm": 10.864371299743652, "learning_rate": 5.63396e-07, "loss": 0.5697, "step": 109200 }, { "epoch": 1.0925, "grad_norm": 43.38995361328125, "learning_rate": 5.63196e-07, "loss": 0.369, "step": 109250 }, { "epoch": 1.093, "grad_norm": 1.3382948637008667, "learning_rate": 5.62996e-07, "loss": 0.5112, "step": 109300 }, { "epoch": 1.0935, "grad_norm": 78.29591369628906, "learning_rate": 5.627959999999999e-07, "loss": 0.4358, "step": 109350 }, { "epoch": 1.094, "grad_norm": 93.0485610961914, "learning_rate": 5.625959999999999e-07, "loss": 0.38, "step": 109400 }, { "epoch": 1.0945, "grad_norm": 20.810575485229492, "learning_rate": 5.62396e-07, "loss": 0.5316, "step": 109450 }, { "epoch": 1.095, "grad_norm": 0.7341397404670715, "learning_rate": 5.62196e-07, "loss": 0.3763, "step": 109500 }, { "epoch": 1.0955, "grad_norm": 18.946903228759766, "learning_rate": 5.619960000000001e-07, "loss": 0.562, "step": 109550 }, { "epoch": 1.096, "grad_norm": 95.75009155273438, "learning_rate": 5.61796e-07, "loss": 0.4775, "step": 109600 }, { "epoch": 1.0965, "grad_norm": 72.53479766845703, "learning_rate": 5.615959999999999e-07, "loss": 0.4117, "step": 109650 }, { "epoch": 1.097, "grad_norm": 55.721282958984375, "learning_rate": 5.61396e-07, "loss": 0.4769, "step": 109700 }, { "epoch": 1.0975, "grad_norm": 29.421995162963867, "learning_rate": 5.61196e-07, "loss": 0.4423, "step": 109750 }, { "epoch": 1.098, "grad_norm": 62.25779342651367, "learning_rate": 5.60996e-07, "loss": 0.4081, "step": 109800 }, { "epoch": 1.0985, "grad_norm": 80.15348052978516, "learning_rate": 5.60796e-07, "loss": 0.3587, "step": 109850 }, { "epoch": 1.099, "grad_norm": 0.7332010865211487, "learning_rate": 5.60596e-07, "loss": 0.3441, "step": 109900 }, { "epoch": 1.0995, "grad_norm": 62.10441207885742, "learning_rate": 5.60396e-07, "loss": 0.5722, "step": 109950 }, { "epoch": 1.1, "grad_norm": 4.652379035949707, "learning_rate": 5.60196e-07, "loss": 0.3688, "step": 110000 }, { "epoch": 1.1005, "grad_norm": 56.87358093261719, "learning_rate": 5.59996e-07, "loss": 0.4241, "step": 110050 }, { "epoch": 1.101, "grad_norm": 93.23614501953125, "learning_rate": 5.597959999999999e-07, "loss": 0.5574, "step": 110100 }, { "epoch": 1.1015, "grad_norm": 34.96125793457031, "learning_rate": 5.59596e-07, "loss": 0.5044, "step": 110150 }, { "epoch": 1.102, "grad_norm": 79.12371063232422, "learning_rate": 5.59396e-07, "loss": 0.4684, "step": 110200 }, { "epoch": 1.1025, "grad_norm": 130.24989318847656, "learning_rate": 5.59196e-07, "loss": 0.4631, "step": 110250 }, { "epoch": 1.103, "grad_norm": 121.72173309326172, "learning_rate": 5.589960000000001e-07, "loss": 0.4232, "step": 110300 }, { "epoch": 1.1035, "grad_norm": 66.10565948486328, "learning_rate": 5.587959999999999e-07, "loss": 0.4178, "step": 110350 }, { "epoch": 1.104, "grad_norm": 70.6107406616211, "learning_rate": 5.585959999999999e-07, "loss": 0.5394, "step": 110400 }, { "epoch": 1.1045, "grad_norm": 28.195026397705078, "learning_rate": 5.58396e-07, "loss": 0.5433, "step": 110450 }, { "epoch": 1.105, "grad_norm": 73.64238739013672, "learning_rate": 5.58196e-07, "loss": 0.418, "step": 110500 }, { "epoch": 1.1055, "grad_norm": 90.38440704345703, "learning_rate": 5.57996e-07, "loss": 0.6524, "step": 110550 }, { "epoch": 1.106, "grad_norm": 7.275987148284912, "learning_rate": 5.57796e-07, "loss": 0.4582, "step": 110600 }, { "epoch": 1.1065, "grad_norm": 15.238624572753906, "learning_rate": 5.576e-07, "loss": 0.476, "step": 110650 }, { "epoch": 1.107, "grad_norm": 70.75611877441406, "learning_rate": 5.574e-07, "loss": 0.4448, "step": 110700 }, { "epoch": 1.1075, "grad_norm": 87.86063385009766, "learning_rate": 5.572e-07, "loss": 0.5077, "step": 110750 }, { "epoch": 1.108, "grad_norm": 0.4845068156719208, "learning_rate": 5.57e-07, "loss": 0.4965, "step": 110800 }, { "epoch": 1.1085, "grad_norm": 2.1131792068481445, "learning_rate": 5.567999999999999e-07, "loss": 0.5516, "step": 110850 }, { "epoch": 1.109, "grad_norm": 16.047956466674805, "learning_rate": 5.566e-07, "loss": 0.4468, "step": 110900 }, { "epoch": 1.1095, "grad_norm": 17.67030143737793, "learning_rate": 5.564e-07, "loss": 0.5287, "step": 110950 }, { "epoch": 1.11, "grad_norm": 82.67327117919922, "learning_rate": 5.562e-07, "loss": 0.3612, "step": 111000 }, { "epoch": 1.1105, "grad_norm": 1.270891785621643, "learning_rate": 5.560000000000001e-07, "loss": 0.396, "step": 111050 }, { "epoch": 1.111, "grad_norm": 75.45249938964844, "learning_rate": 5.557999999999999e-07, "loss": 0.3984, "step": 111100 }, { "epoch": 1.1115, "grad_norm": 19.672801971435547, "learning_rate": 5.555999999999999e-07, "loss": 0.5507, "step": 111150 }, { "epoch": 1.112, "grad_norm": 6.31002140045166, "learning_rate": 5.554e-07, "loss": 0.3859, "step": 111200 }, { "epoch": 1.1125, "grad_norm": 85.23663330078125, "learning_rate": 5.552e-07, "loss": 0.4728, "step": 111250 }, { "epoch": 1.113, "grad_norm": 88.14482879638672, "learning_rate": 5.55e-07, "loss": 0.5598, "step": 111300 }, { "epoch": 1.1135, "grad_norm": 94.62075805664062, "learning_rate": 5.548e-07, "loss": 0.4012, "step": 111350 }, { "epoch": 1.114, "grad_norm": 51.954891204833984, "learning_rate": 5.546e-07, "loss": 0.3856, "step": 111400 }, { "epoch": 1.1145, "grad_norm": 75.34689331054688, "learning_rate": 5.543999999999999e-07, "loss": 0.3746, "step": 111450 }, { "epoch": 1.115, "grad_norm": 72.60891723632812, "learning_rate": 5.542e-07, "loss": 0.5083, "step": 111500 }, { "epoch": 1.1155, "grad_norm": 39.89189147949219, "learning_rate": 5.54e-07, "loss": 0.4479, "step": 111550 }, { "epoch": 1.116, "grad_norm": 4.4385833740234375, "learning_rate": 5.537999999999999e-07, "loss": 0.3594, "step": 111600 }, { "epoch": 1.1165, "grad_norm": 0.6604699492454529, "learning_rate": 5.536e-07, "loss": 0.3674, "step": 111650 }, { "epoch": 1.117, "grad_norm": 49.04147720336914, "learning_rate": 5.534e-07, "loss": 0.4568, "step": 111700 }, { "epoch": 1.1175, "grad_norm": 42.06256866455078, "learning_rate": 5.532e-07, "loss": 0.5162, "step": 111750 }, { "epoch": 1.1179999999999999, "grad_norm": 102.51911163330078, "learning_rate": 5.53e-07, "loss": 0.4049, "step": 111800 }, { "epoch": 1.1185, "grad_norm": 44.5997428894043, "learning_rate": 5.527999999999999e-07, "loss": 0.4188, "step": 111850 }, { "epoch": 1.119, "grad_norm": 25.700366973876953, "learning_rate": 5.525999999999999e-07, "loss": 0.3842, "step": 111900 }, { "epoch": 1.1195, "grad_norm": 16.742462158203125, "learning_rate": 5.524e-07, "loss": 0.5652, "step": 111950 }, { "epoch": 1.12, "grad_norm": 52.463157653808594, "learning_rate": 5.522e-07, "loss": 0.4582, "step": 112000 }, { "epoch": 1.1205, "grad_norm": 37.043521881103516, "learning_rate": 5.520000000000001e-07, "loss": 0.4157, "step": 112050 }, { "epoch": 1.121, "grad_norm": 1.2325105667114258, "learning_rate": 5.518e-07, "loss": 0.5246, "step": 112100 }, { "epoch": 1.1215, "grad_norm": 21.83551597595215, "learning_rate": 5.515999999999999e-07, "loss": 0.4453, "step": 112150 }, { "epoch": 1.1219999999999999, "grad_norm": 3.408167839050293, "learning_rate": 5.514e-07, "loss": 0.4424, "step": 112200 }, { "epoch": 1.1225, "grad_norm": 86.52948760986328, "learning_rate": 5.512e-07, "loss": 0.516, "step": 112250 }, { "epoch": 1.123, "grad_norm": 2.4071269035339355, "learning_rate": 5.51e-07, "loss": 0.304, "step": 112300 }, { "epoch": 1.1235, "grad_norm": 74.5964126586914, "learning_rate": 5.508e-07, "loss": 0.2992, "step": 112350 }, { "epoch": 1.124, "grad_norm": 52.07612991333008, "learning_rate": 5.506e-07, "loss": 0.4152, "step": 112400 }, { "epoch": 1.1245, "grad_norm": 66.46135711669922, "learning_rate": 5.504e-07, "loss": 0.4892, "step": 112450 }, { "epoch": 1.125, "grad_norm": 19.547279357910156, "learning_rate": 5.502e-07, "loss": 0.3387, "step": 112500 }, { "epoch": 1.1255, "grad_norm": 44.79277420043945, "learning_rate": 5.5e-07, "loss": 0.459, "step": 112550 }, { "epoch": 1.126, "grad_norm": 112.7216567993164, "learning_rate": 5.497999999999999e-07, "loss": 0.3898, "step": 112600 }, { "epoch": 1.1265, "grad_norm": 14.867293357849121, "learning_rate": 5.496e-07, "loss": 0.5735, "step": 112650 }, { "epoch": 1.127, "grad_norm": 98.45852661132812, "learning_rate": 5.494e-07, "loss": 0.5208, "step": 112700 }, { "epoch": 1.1275, "grad_norm": 27.679582595825195, "learning_rate": 5.492e-07, "loss": 0.429, "step": 112750 }, { "epoch": 1.1280000000000001, "grad_norm": 0.8725939989089966, "learning_rate": 5.490000000000001e-07, "loss": 0.4026, "step": 112800 }, { "epoch": 1.1285, "grad_norm": 9.201322555541992, "learning_rate": 5.487999999999999e-07, "loss": 0.5991, "step": 112850 }, { "epoch": 1.129, "grad_norm": 42.13934326171875, "learning_rate": 5.485999999999999e-07, "loss": 0.4351, "step": 112900 }, { "epoch": 1.1295, "grad_norm": 22.487741470336914, "learning_rate": 5.484e-07, "loss": 0.3695, "step": 112950 }, { "epoch": 1.13, "grad_norm": 78.89250183105469, "learning_rate": 5.482e-07, "loss": 0.6346, "step": 113000 }, { "epoch": 1.1305, "grad_norm": 123.7279052734375, "learning_rate": 5.48e-07, "loss": 0.5355, "step": 113050 }, { "epoch": 1.131, "grad_norm": 19.976831436157227, "learning_rate": 5.478e-07, "loss": 0.3888, "step": 113100 }, { "epoch": 1.1315, "grad_norm": 36.473289489746094, "learning_rate": 5.476e-07, "loss": 0.6061, "step": 113150 }, { "epoch": 1.1320000000000001, "grad_norm": 33.37117004394531, "learning_rate": 5.473999999999999e-07, "loss": 0.3772, "step": 113200 }, { "epoch": 1.1325, "grad_norm": 45.76275634765625, "learning_rate": 5.472e-07, "loss": 0.4798, "step": 113250 }, { "epoch": 1.133, "grad_norm": 67.44389343261719, "learning_rate": 5.47e-07, "loss": 0.5103, "step": 113300 }, { "epoch": 1.1335, "grad_norm": 77.15213775634766, "learning_rate": 5.467999999999999e-07, "loss": 0.4921, "step": 113350 }, { "epoch": 1.134, "grad_norm": 109.74034881591797, "learning_rate": 5.466e-07, "loss": 0.5541, "step": 113400 }, { "epoch": 1.1345, "grad_norm": 49.160316467285156, "learning_rate": 5.464e-07, "loss": 0.4071, "step": 113450 }, { "epoch": 1.135, "grad_norm": 0.6659963130950928, "learning_rate": 5.462e-07, "loss": 0.4504, "step": 113500 }, { "epoch": 1.1355, "grad_norm": 129.61778259277344, "learning_rate": 5.46e-07, "loss": 0.4083, "step": 113550 }, { "epoch": 1.1360000000000001, "grad_norm": 51.32670211791992, "learning_rate": 5.457999999999999e-07, "loss": 0.4323, "step": 113600 }, { "epoch": 1.1365, "grad_norm": 94.61491394042969, "learning_rate": 5.455999999999999e-07, "loss": 0.4553, "step": 113650 }, { "epoch": 1.137, "grad_norm": 94.95795440673828, "learning_rate": 5.454e-07, "loss": 0.3422, "step": 113700 }, { "epoch": 1.1375, "grad_norm": 45.39287185668945, "learning_rate": 5.452e-07, "loss": 0.4915, "step": 113750 }, { "epoch": 1.138, "grad_norm": 29.25931739807129, "learning_rate": 5.45e-07, "loss": 0.3838, "step": 113800 }, { "epoch": 1.1385, "grad_norm": 60.96989822387695, "learning_rate": 5.448e-07, "loss": 0.5589, "step": 113850 }, { "epoch": 1.139, "grad_norm": 45.03044128417969, "learning_rate": 5.445999999999999e-07, "loss": 0.4151, "step": 113900 }, { "epoch": 1.1395, "grad_norm": 6.3732404708862305, "learning_rate": 5.443999999999999e-07, "loss": 0.3358, "step": 113950 }, { "epoch": 1.1400000000000001, "grad_norm": 18.30240249633789, "learning_rate": 5.442e-07, "loss": 0.5567, "step": 114000 }, { "epoch": 1.1405, "grad_norm": 47.34935760498047, "learning_rate": 5.44e-07, "loss": 0.4934, "step": 114050 }, { "epoch": 1.141, "grad_norm": 12.485174179077148, "learning_rate": 5.437999999999999e-07, "loss": 0.4781, "step": 114100 }, { "epoch": 1.1415, "grad_norm": 6.659138202667236, "learning_rate": 5.436e-07, "loss": 0.4272, "step": 114150 }, { "epoch": 1.142, "grad_norm": 17.311927795410156, "learning_rate": 5.434e-07, "loss": 0.3993, "step": 114200 }, { "epoch": 1.1425, "grad_norm": 70.42869567871094, "learning_rate": 5.431999999999999e-07, "loss": 0.4323, "step": 114250 }, { "epoch": 1.143, "grad_norm": 44.21244812011719, "learning_rate": 5.43e-07, "loss": 0.4421, "step": 114300 }, { "epoch": 1.1435, "grad_norm": 66.3972396850586, "learning_rate": 5.427999999999999e-07, "loss": 0.4448, "step": 114350 }, { "epoch": 1.144, "grad_norm": 31.286418914794922, "learning_rate": 5.425999999999999e-07, "loss": 0.5305, "step": 114400 }, { "epoch": 1.1445, "grad_norm": 79.53046417236328, "learning_rate": 5.424e-07, "loss": 0.5406, "step": 114450 }, { "epoch": 1.145, "grad_norm": 1.268337607383728, "learning_rate": 5.422e-07, "loss": 0.3917, "step": 114500 }, { "epoch": 1.1455, "grad_norm": 27.05938148498535, "learning_rate": 5.420000000000001e-07, "loss": 0.5285, "step": 114550 }, { "epoch": 1.146, "grad_norm": 24.848316192626953, "learning_rate": 5.417999999999999e-07, "loss": 0.5525, "step": 114600 }, { "epoch": 1.1465, "grad_norm": 21.377756118774414, "learning_rate": 5.415999999999999e-07, "loss": 0.5307, "step": 114650 }, { "epoch": 1.147, "grad_norm": 3.2962486743927, "learning_rate": 5.414e-07, "loss": 0.4409, "step": 114700 }, { "epoch": 1.1475, "grad_norm": 77.59668731689453, "learning_rate": 5.41204e-07, "loss": 0.5064, "step": 114750 }, { "epoch": 1.148, "grad_norm": 82.37496185302734, "learning_rate": 5.41004e-07, "loss": 0.3953, "step": 114800 }, { "epoch": 1.1485, "grad_norm": 85.80392456054688, "learning_rate": 5.40804e-07, "loss": 0.5122, "step": 114850 }, { "epoch": 1.149, "grad_norm": 81.59890747070312, "learning_rate": 5.40604e-07, "loss": 0.5849, "step": 114900 }, { "epoch": 1.1495, "grad_norm": 58.08201217651367, "learning_rate": 5.404039999999999e-07, "loss": 0.4393, "step": 114950 }, { "epoch": 1.15, "grad_norm": 78.7620849609375, "learning_rate": 5.40204e-07, "loss": 0.4161, "step": 115000 }, { "epoch": 1.1505, "grad_norm": 84.27385711669922, "learning_rate": 5.40004e-07, "loss": 0.3664, "step": 115050 }, { "epoch": 1.151, "grad_norm": 62.63862609863281, "learning_rate": 5.398039999999999e-07, "loss": 0.4925, "step": 115100 }, { "epoch": 1.1515, "grad_norm": 67.16043090820312, "learning_rate": 5.39604e-07, "loss": 0.5412, "step": 115150 }, { "epoch": 1.152, "grad_norm": 53.16383361816406, "learning_rate": 5.39404e-07, "loss": 0.404, "step": 115200 }, { "epoch": 1.1525, "grad_norm": 39.47855758666992, "learning_rate": 5.39204e-07, "loss": 0.4474, "step": 115250 }, { "epoch": 1.153, "grad_norm": 1.1669477224349976, "learning_rate": 5.390040000000001e-07, "loss": 0.4366, "step": 115300 }, { "epoch": 1.1535, "grad_norm": 67.96946716308594, "learning_rate": 5.388039999999999e-07, "loss": 0.478, "step": 115350 }, { "epoch": 1.154, "grad_norm": 26.770727157592773, "learning_rate": 5.386039999999999e-07, "loss": 0.3817, "step": 115400 }, { "epoch": 1.1545, "grad_norm": 57.50968933105469, "learning_rate": 5.38404e-07, "loss": 0.3984, "step": 115450 }, { "epoch": 1.155, "grad_norm": 33.6568603515625, "learning_rate": 5.38204e-07, "loss": 0.4302, "step": 115500 }, { "epoch": 1.1555, "grad_norm": 1.8125121593475342, "learning_rate": 5.38004e-07, "loss": 0.4788, "step": 115550 }, { "epoch": 1.156, "grad_norm": 57.99422073364258, "learning_rate": 5.37804e-07, "loss": 0.2844, "step": 115600 }, { "epoch": 1.1565, "grad_norm": 63.1536865234375, "learning_rate": 5.37608e-07, "loss": 0.3902, "step": 115650 }, { "epoch": 1.157, "grad_norm": 15.688610076904297, "learning_rate": 5.374079999999999e-07, "loss": 0.3493, "step": 115700 }, { "epoch": 1.1575, "grad_norm": 77.63614654541016, "learning_rate": 5.37208e-07, "loss": 0.5133, "step": 115750 }, { "epoch": 1.158, "grad_norm": 36.73105239868164, "learning_rate": 5.37008e-07, "loss": 0.4251, "step": 115800 }, { "epoch": 1.1585, "grad_norm": 42.460872650146484, "learning_rate": 5.368079999999999e-07, "loss": 0.5139, "step": 115850 }, { "epoch": 1.159, "grad_norm": 73.16018676757812, "learning_rate": 5.36608e-07, "loss": 0.3794, "step": 115900 }, { "epoch": 1.1595, "grad_norm": 42.55072021484375, "learning_rate": 5.36408e-07, "loss": 0.3831, "step": 115950 }, { "epoch": 1.16, "grad_norm": 32.332515716552734, "learning_rate": 5.36208e-07, "loss": 0.3343, "step": 116000 }, { "epoch": 1.1605, "grad_norm": 47.2687873840332, "learning_rate": 5.36008e-07, "loss": 0.5091, "step": 116050 }, { "epoch": 1.161, "grad_norm": 95.39241790771484, "learning_rate": 5.358079999999999e-07, "loss": 0.4826, "step": 116100 }, { "epoch": 1.1615, "grad_norm": 83.02808380126953, "learning_rate": 5.356079999999999e-07, "loss": 0.4376, "step": 116150 }, { "epoch": 1.162, "grad_norm": 59.88267517089844, "learning_rate": 5.35408e-07, "loss": 0.4086, "step": 116200 }, { "epoch": 1.1625, "grad_norm": 62.45161056518555, "learning_rate": 5.35208e-07, "loss": 0.4202, "step": 116250 }, { "epoch": 1.163, "grad_norm": 117.84101104736328, "learning_rate": 5.35008e-07, "loss": 0.464, "step": 116300 }, { "epoch": 1.1635, "grad_norm": 65.64312744140625, "learning_rate": 5.34808e-07, "loss": 0.5111, "step": 116350 }, { "epoch": 1.164, "grad_norm": 45.327972412109375, "learning_rate": 5.346079999999999e-07, "loss": 0.4727, "step": 116400 }, { "epoch": 1.1645, "grad_norm": 61.47356414794922, "learning_rate": 5.344079999999999e-07, "loss": 0.3288, "step": 116450 }, { "epoch": 1.165, "grad_norm": 34.35345458984375, "learning_rate": 5.34208e-07, "loss": 0.365, "step": 116500 }, { "epoch": 1.1655, "grad_norm": 5.033834934234619, "learning_rate": 5.34008e-07, "loss": 0.4617, "step": 116550 }, { "epoch": 1.166, "grad_norm": 0.281574010848999, "learning_rate": 5.338079999999999e-07, "loss": 0.4622, "step": 116600 }, { "epoch": 1.1665, "grad_norm": 51.69894027709961, "learning_rate": 5.33608e-07, "loss": 0.4423, "step": 116650 }, { "epoch": 1.167, "grad_norm": 36.404632568359375, "learning_rate": 5.33408e-07, "loss": 0.5443, "step": 116700 }, { "epoch": 1.1675, "grad_norm": 78.08043670654297, "learning_rate": 5.332079999999999e-07, "loss": 0.4774, "step": 116750 }, { "epoch": 1.168, "grad_norm": 30.812881469726562, "learning_rate": 5.33008e-07, "loss": 0.5483, "step": 116800 }, { "epoch": 1.1685, "grad_norm": 0.9321984648704529, "learning_rate": 5.328079999999999e-07, "loss": 0.3477, "step": 116850 }, { "epoch": 1.169, "grad_norm": 72.7139892578125, "learning_rate": 5.326079999999999e-07, "loss": 0.5847, "step": 116900 }, { "epoch": 1.1695, "grad_norm": 50.76666259765625, "learning_rate": 5.32408e-07, "loss": 0.3907, "step": 116950 }, { "epoch": 1.17, "grad_norm": 96.60762786865234, "learning_rate": 5.32208e-07, "loss": 0.672, "step": 117000 }, { "epoch": 1.1705, "grad_norm": 65.07868957519531, "learning_rate": 5.320080000000001e-07, "loss": 0.4654, "step": 117050 }, { "epoch": 1.171, "grad_norm": 41.71099090576172, "learning_rate": 5.318079999999999e-07, "loss": 0.3896, "step": 117100 }, { "epoch": 1.1715, "grad_norm": 30.449907302856445, "learning_rate": 5.316079999999999e-07, "loss": 0.5155, "step": 117150 }, { "epoch": 1.172, "grad_norm": 103.2293701171875, "learning_rate": 5.31408e-07, "loss": 0.3301, "step": 117200 }, { "epoch": 1.1724999999999999, "grad_norm": 54.75368881225586, "learning_rate": 5.31208e-07, "loss": 0.4718, "step": 117250 }, { "epoch": 1.173, "grad_norm": 75.55032348632812, "learning_rate": 5.31008e-07, "loss": 0.4543, "step": 117300 }, { "epoch": 1.1735, "grad_norm": 54.67997741699219, "learning_rate": 5.30808e-07, "loss": 0.4419, "step": 117350 }, { "epoch": 1.174, "grad_norm": 91.2918472290039, "learning_rate": 5.30608e-07, "loss": 0.4074, "step": 117400 }, { "epoch": 1.1745, "grad_norm": 73.0489501953125, "learning_rate": 5.304079999999999e-07, "loss": 0.4729, "step": 117450 }, { "epoch": 1.175, "grad_norm": 3.4424524307250977, "learning_rate": 5.30208e-07, "loss": 0.4709, "step": 117500 }, { "epoch": 1.1755, "grad_norm": 64.21658325195312, "learning_rate": 5.30008e-07, "loss": 0.4925, "step": 117550 }, { "epoch": 1.176, "grad_norm": 24.437042236328125, "learning_rate": 5.298079999999999e-07, "loss": 0.4695, "step": 117600 }, { "epoch": 1.1764999999999999, "grad_norm": 85.16154479980469, "learning_rate": 5.29608e-07, "loss": 0.4352, "step": 117650 }, { "epoch": 1.177, "grad_norm": 34.37142562866211, "learning_rate": 5.29408e-07, "loss": 0.4595, "step": 117700 }, { "epoch": 1.1775, "grad_norm": 103.63298034667969, "learning_rate": 5.29208e-07, "loss": 0.4311, "step": 117750 }, { "epoch": 1.178, "grad_norm": 72.38675689697266, "learning_rate": 5.29008e-07, "loss": 0.4235, "step": 117800 }, { "epoch": 1.1785, "grad_norm": 4.414365291595459, "learning_rate": 5.288079999999999e-07, "loss": 0.462, "step": 117850 }, { "epoch": 1.179, "grad_norm": 74.01151275634766, "learning_rate": 5.286079999999999e-07, "loss": 0.5448, "step": 117900 }, { "epoch": 1.1795, "grad_norm": 88.3164291381836, "learning_rate": 5.28408e-07, "loss": 0.2708, "step": 117950 }, { "epoch": 1.18, "grad_norm": 75.72642517089844, "learning_rate": 5.28208e-07, "loss": 0.3894, "step": 118000 }, { "epoch": 1.1804999999999999, "grad_norm": 63.216453552246094, "learning_rate": 5.28008e-07, "loss": 0.4749, "step": 118050 }, { "epoch": 1.181, "grad_norm": 39.14889144897461, "learning_rate": 5.278080000000001e-07, "loss": 0.3814, "step": 118100 }, { "epoch": 1.1815, "grad_norm": 56.19570541381836, "learning_rate": 5.276079999999999e-07, "loss": 0.349, "step": 118150 }, { "epoch": 1.182, "grad_norm": 99.52997589111328, "learning_rate": 5.274079999999999e-07, "loss": 0.6551, "step": 118200 }, { "epoch": 1.1825, "grad_norm": 27.029600143432617, "learning_rate": 5.27208e-07, "loss": 0.5198, "step": 118250 }, { "epoch": 1.183, "grad_norm": 33.4561882019043, "learning_rate": 5.27008e-07, "loss": 0.4874, "step": 118300 }, { "epoch": 1.1835, "grad_norm": 24.337610244750977, "learning_rate": 5.26808e-07, "loss": 0.5505, "step": 118350 }, { "epoch": 1.184, "grad_norm": 39.1830940246582, "learning_rate": 5.26608e-07, "loss": 0.4899, "step": 118400 }, { "epoch": 1.1844999999999999, "grad_norm": 32.634098052978516, "learning_rate": 5.26408e-07, "loss": 0.354, "step": 118450 }, { "epoch": 1.185, "grad_norm": 102.42017364501953, "learning_rate": 5.262079999999999e-07, "loss": 0.3999, "step": 118500 }, { "epoch": 1.1855, "grad_norm": 1.6400700807571411, "learning_rate": 5.26008e-07, "loss": 0.4722, "step": 118550 }, { "epoch": 1.186, "grad_norm": 56.43583297729492, "learning_rate": 5.25808e-07, "loss": 0.5107, "step": 118600 }, { "epoch": 1.1865, "grad_norm": 4.176393508911133, "learning_rate": 5.256079999999999e-07, "loss": 0.5332, "step": 118650 }, { "epoch": 1.187, "grad_norm": 74.65667724609375, "learning_rate": 5.25408e-07, "loss": 0.347, "step": 118700 }, { "epoch": 1.1875, "grad_norm": 61.597930908203125, "learning_rate": 5.25208e-07, "loss": 0.5037, "step": 118750 }, { "epoch": 1.188, "grad_norm": 20.81285285949707, "learning_rate": 5.25008e-07, "loss": 0.4662, "step": 118800 }, { "epoch": 1.1885, "grad_norm": 16.560958862304688, "learning_rate": 5.24808e-07, "loss": 0.4709, "step": 118850 }, { "epoch": 1.189, "grad_norm": 85.1524429321289, "learning_rate": 5.246079999999999e-07, "loss": 0.3761, "step": 118900 }, { "epoch": 1.1895, "grad_norm": 38.864070892333984, "learning_rate": 5.244079999999999e-07, "loss": 0.3661, "step": 118950 }, { "epoch": 1.19, "grad_norm": 25.94007682800293, "learning_rate": 5.24208e-07, "loss": 0.3894, "step": 119000 }, { "epoch": 1.1905000000000001, "grad_norm": 89.40354919433594, "learning_rate": 5.24008e-07, "loss": 0.6054, "step": 119050 }, { "epoch": 1.191, "grad_norm": 28.081167221069336, "learning_rate": 5.238080000000001e-07, "loss": 0.4478, "step": 119100 }, { "epoch": 1.1915, "grad_norm": 86.49946594238281, "learning_rate": 5.23608e-07, "loss": 0.4845, "step": 119150 }, { "epoch": 1.192, "grad_norm": 29.06024169921875, "learning_rate": 5.234079999999999e-07, "loss": 0.3214, "step": 119200 }, { "epoch": 1.1925, "grad_norm": 21.522964477539062, "learning_rate": 5.23208e-07, "loss": 0.3218, "step": 119250 }, { "epoch": 1.193, "grad_norm": 77.7065200805664, "learning_rate": 5.23008e-07, "loss": 0.422, "step": 119300 }, { "epoch": 1.1935, "grad_norm": 6.1106343269348145, "learning_rate": 5.22808e-07, "loss": 0.5586, "step": 119350 }, { "epoch": 1.194, "grad_norm": 9.23585319519043, "learning_rate": 5.22608e-07, "loss": 0.4294, "step": 119400 }, { "epoch": 1.1945000000000001, "grad_norm": 79.62310791015625, "learning_rate": 5.22408e-07, "loss": 0.4129, "step": 119450 }, { "epoch": 1.195, "grad_norm": 18.00088882446289, "learning_rate": 5.22208e-07, "loss": 0.5814, "step": 119500 }, { "epoch": 1.1955, "grad_norm": 110.30831146240234, "learning_rate": 5.22008e-07, "loss": 0.5494, "step": 119550 }, { "epoch": 1.196, "grad_norm": 53.79940414428711, "learning_rate": 5.21808e-07, "loss": 0.3932, "step": 119600 }, { "epoch": 1.1965, "grad_norm": 76.86608123779297, "learning_rate": 5.216079999999999e-07, "loss": 0.5098, "step": 119650 }, { "epoch": 1.197, "grad_norm": 17.632843017578125, "learning_rate": 5.21408e-07, "loss": 0.5619, "step": 119700 }, { "epoch": 1.1975, "grad_norm": 121.02203369140625, "learning_rate": 5.21208e-07, "loss": 0.478, "step": 119750 }, { "epoch": 1.198, "grad_norm": 7.7555084228515625, "learning_rate": 5.21008e-07, "loss": 0.3932, "step": 119800 }, { "epoch": 1.1985000000000001, "grad_norm": 79.47029113769531, "learning_rate": 5.208080000000001e-07, "loss": 0.4755, "step": 119850 }, { "epoch": 1.199, "grad_norm": 85.48977661132812, "learning_rate": 5.206079999999999e-07, "loss": 0.5045, "step": 119900 }, { "epoch": 1.1995, "grad_norm": 27.96647834777832, "learning_rate": 5.204079999999999e-07, "loss": 0.4199, "step": 119950 }, { "epoch": 1.2, "grad_norm": 93.02487182617188, "learning_rate": 5.20208e-07, "loss": 0.471, "step": 120000 }, { "epoch": 1.2005, "grad_norm": 27.727758407592773, "learning_rate": 5.20008e-07, "loss": 0.4306, "step": 120050 }, { "epoch": 1.201, "grad_norm": 82.77517700195312, "learning_rate": 5.19808e-07, "loss": 0.4908, "step": 120100 }, { "epoch": 1.2015, "grad_norm": 83.77193450927734, "learning_rate": 5.19608e-07, "loss": 0.415, "step": 120150 }, { "epoch": 1.202, "grad_norm": 1.6518832445144653, "learning_rate": 5.19408e-07, "loss": 0.6042, "step": 120200 }, { "epoch": 1.2025000000000001, "grad_norm": 39.49478530883789, "learning_rate": 5.192079999999999e-07, "loss": 0.4554, "step": 120250 }, { "epoch": 1.203, "grad_norm": 62.075286865234375, "learning_rate": 5.19008e-07, "loss": 0.4344, "step": 120300 }, { "epoch": 1.2035, "grad_norm": 0.7993803024291992, "learning_rate": 5.18808e-07, "loss": 0.3913, "step": 120350 }, { "epoch": 1.204, "grad_norm": 0.20502905547618866, "learning_rate": 5.186079999999999e-07, "loss": 0.2958, "step": 120400 }, { "epoch": 1.2045, "grad_norm": 23.898794174194336, "learning_rate": 5.18412e-07, "loss": 0.4188, "step": 120450 }, { "epoch": 1.205, "grad_norm": 4.550911903381348, "learning_rate": 5.18212e-07, "loss": 0.4426, "step": 120500 }, { "epoch": 1.2055, "grad_norm": 39.06218719482422, "learning_rate": 5.18012e-07, "loss": 0.4951, "step": 120550 }, { "epoch": 1.206, "grad_norm": 77.20858001708984, "learning_rate": 5.178120000000001e-07, "loss": 0.4668, "step": 120600 }, { "epoch": 1.2065, "grad_norm": 27.37639617919922, "learning_rate": 5.176119999999999e-07, "loss": 0.5083, "step": 120650 }, { "epoch": 1.207, "grad_norm": 102.03683471679688, "learning_rate": 5.174119999999999e-07, "loss": 0.4479, "step": 120700 }, { "epoch": 1.2075, "grad_norm": 10.521049499511719, "learning_rate": 5.17212e-07, "loss": 0.4924, "step": 120750 }, { "epoch": 1.208, "grad_norm": 50.4854621887207, "learning_rate": 5.17012e-07, "loss": 0.595, "step": 120800 }, { "epoch": 1.2085, "grad_norm": 60.808013916015625, "learning_rate": 5.16812e-07, "loss": 0.3966, "step": 120850 }, { "epoch": 1.209, "grad_norm": 35.996952056884766, "learning_rate": 5.16612e-07, "loss": 0.3948, "step": 120900 }, { "epoch": 1.2095, "grad_norm": 83.63671112060547, "learning_rate": 5.16412e-07, "loss": 0.6021, "step": 120950 }, { "epoch": 1.21, "grad_norm": 92.66004180908203, "learning_rate": 5.162119999999999e-07, "loss": 0.3842, "step": 121000 }, { "epoch": 1.2105, "grad_norm": 13.310367584228516, "learning_rate": 5.16012e-07, "loss": 0.6077, "step": 121050 }, { "epoch": 1.211, "grad_norm": 4.458984375, "learning_rate": 5.15812e-07, "loss": 0.3837, "step": 121100 }, { "epoch": 1.2115, "grad_norm": 39.17095184326172, "learning_rate": 5.156119999999999e-07, "loss": 0.4516, "step": 121150 }, { "epoch": 1.212, "grad_norm": 77.5641860961914, "learning_rate": 5.15412e-07, "loss": 0.4531, "step": 121200 }, { "epoch": 1.2125, "grad_norm": 31.501514434814453, "learning_rate": 5.15212e-07, "loss": 0.4466, "step": 121250 }, { "epoch": 1.213, "grad_norm": 5.8556413650512695, "learning_rate": 5.15012e-07, "loss": 0.473, "step": 121300 }, { "epoch": 1.2135, "grad_norm": 37.5422477722168, "learning_rate": 5.14812e-07, "loss": 0.4125, "step": 121350 }, { "epoch": 1.214, "grad_norm": 2.340611457824707, "learning_rate": 5.146119999999999e-07, "loss": 0.4659, "step": 121400 }, { "epoch": 1.2145, "grad_norm": 16.7429141998291, "learning_rate": 5.144119999999999e-07, "loss": 0.4964, "step": 121450 }, { "epoch": 1.215, "grad_norm": 122.42613220214844, "learning_rate": 5.14212e-07, "loss": 0.403, "step": 121500 }, { "epoch": 1.2155, "grad_norm": 89.16505432128906, "learning_rate": 5.14012e-07, "loss": 0.474, "step": 121550 }, { "epoch": 1.216, "grad_norm": 41.55615997314453, "learning_rate": 5.138120000000001e-07, "loss": 0.4665, "step": 121600 }, { "epoch": 1.2165, "grad_norm": 4.50972843170166, "learning_rate": 5.13612e-07, "loss": 0.4449, "step": 121650 }, { "epoch": 1.217, "grad_norm": 45.18336486816406, "learning_rate": 5.134119999999999e-07, "loss": 0.4837, "step": 121700 }, { "epoch": 1.2175, "grad_norm": 15.911096572875977, "learning_rate": 5.13212e-07, "loss": 0.4563, "step": 121750 }, { "epoch": 1.218, "grad_norm": 32.39420700073242, "learning_rate": 5.13012e-07, "loss": 0.3398, "step": 121800 }, { "epoch": 1.2185, "grad_norm": 67.41966247558594, "learning_rate": 5.12812e-07, "loss": 0.4374, "step": 121850 }, { "epoch": 1.219, "grad_norm": 5.196559429168701, "learning_rate": 5.12612e-07, "loss": 0.5112, "step": 121900 }, { "epoch": 1.2195, "grad_norm": 50.85737991333008, "learning_rate": 5.12412e-07, "loss": 0.5262, "step": 121950 }, { "epoch": 1.22, "grad_norm": 1.6516265869140625, "learning_rate": 5.12212e-07, "loss": 0.3481, "step": 122000 }, { "epoch": 1.2205, "grad_norm": 39.09980392456055, "learning_rate": 5.12012e-07, "loss": 0.3903, "step": 122050 }, { "epoch": 1.221, "grad_norm": 45.83763122558594, "learning_rate": 5.11812e-07, "loss": 0.572, "step": 122100 }, { "epoch": 1.2215, "grad_norm": 78.31942749023438, "learning_rate": 5.116119999999999e-07, "loss": 0.4883, "step": 122150 }, { "epoch": 1.222, "grad_norm": 64.6844253540039, "learning_rate": 5.11412e-07, "loss": 0.5336, "step": 122200 }, { "epoch": 1.2225, "grad_norm": 1.514631986618042, "learning_rate": 5.11212e-07, "loss": 0.4101, "step": 122250 }, { "epoch": 1.223, "grad_norm": 47.222328186035156, "learning_rate": 5.11012e-07, "loss": 0.5162, "step": 122300 }, { "epoch": 1.2235, "grad_norm": 54.371952056884766, "learning_rate": 5.108120000000001e-07, "loss": 0.5621, "step": 122350 }, { "epoch": 1.224, "grad_norm": 62.68058395385742, "learning_rate": 5.106119999999999e-07, "loss": 0.4995, "step": 122400 }, { "epoch": 1.2245, "grad_norm": 39.96584701538086, "learning_rate": 5.104119999999999e-07, "loss": 0.3692, "step": 122450 }, { "epoch": 1.225, "grad_norm": 3.750762462615967, "learning_rate": 5.10212e-07, "loss": 0.4632, "step": 122500 }, { "epoch": 1.2255, "grad_norm": 5.8588080406188965, "learning_rate": 5.10012e-07, "loss": 0.4385, "step": 122550 }, { "epoch": 1.226, "grad_norm": 27.323911666870117, "learning_rate": 5.09812e-07, "loss": 0.3817, "step": 122600 }, { "epoch": 1.2265, "grad_norm": 78.1037826538086, "learning_rate": 5.09612e-07, "loss": 0.5332, "step": 122650 }, { "epoch": 1.227, "grad_norm": 30.815229415893555, "learning_rate": 5.09412e-07, "loss": 0.517, "step": 122700 }, { "epoch": 1.2275, "grad_norm": 8.353984832763672, "learning_rate": 5.092119999999999e-07, "loss": 0.4976, "step": 122750 }, { "epoch": 1.228, "grad_norm": 119.27523040771484, "learning_rate": 5.09012e-07, "loss": 0.4711, "step": 122800 }, { "epoch": 1.2285, "grad_norm": 73.34131622314453, "learning_rate": 5.08812e-07, "loss": 0.5385, "step": 122850 }, { "epoch": 1.229, "grad_norm": 0.49966078996658325, "learning_rate": 5.086159999999999e-07, "loss": 0.4272, "step": 122900 }, { "epoch": 1.2295, "grad_norm": 60.721370697021484, "learning_rate": 5.0842e-07, "loss": 0.4648, "step": 122950 }, { "epoch": 1.23, "grad_norm": 87.89384460449219, "learning_rate": 5.0822e-07, "loss": 0.4228, "step": 123000 }, { "epoch": 1.2305, "grad_norm": 79.43232727050781, "learning_rate": 5.0802e-07, "loss": 0.4643, "step": 123050 }, { "epoch": 1.231, "grad_norm": 22.63487434387207, "learning_rate": 5.078200000000001e-07, "loss": 0.5683, "step": 123100 }, { "epoch": 1.2315, "grad_norm": 79.83702850341797, "learning_rate": 5.076199999999999e-07, "loss": 0.5672, "step": 123150 }, { "epoch": 1.232, "grad_norm": 56.245948791503906, "learning_rate": 5.074199999999999e-07, "loss": 0.4281, "step": 123200 }, { "epoch": 1.2325, "grad_norm": 17.55223846435547, "learning_rate": 5.0722e-07, "loss": 0.4716, "step": 123250 }, { "epoch": 1.233, "grad_norm": 10.06242561340332, "learning_rate": 5.0702e-07, "loss": 0.4489, "step": 123300 }, { "epoch": 1.2335, "grad_norm": 11.893889427185059, "learning_rate": 5.0682e-07, "loss": 0.4702, "step": 123350 }, { "epoch": 1.234, "grad_norm": 105.41064453125, "learning_rate": 5.0662e-07, "loss": 0.4019, "step": 123400 }, { "epoch": 1.2345, "grad_norm": 86.82225799560547, "learning_rate": 5.0642e-07, "loss": 0.4129, "step": 123450 }, { "epoch": 1.2349999999999999, "grad_norm": 68.03509521484375, "learning_rate": 5.062199999999999e-07, "loss": 0.5526, "step": 123500 }, { "epoch": 1.2355, "grad_norm": 27.85662841796875, "learning_rate": 5.0602e-07, "loss": 0.388, "step": 123550 }, { "epoch": 1.236, "grad_norm": 15.7693452835083, "learning_rate": 5.0582e-07, "loss": 0.6257, "step": 123600 }, { "epoch": 1.2365, "grad_norm": 8.797830581665039, "learning_rate": 5.056199999999999e-07, "loss": 0.3529, "step": 123650 }, { "epoch": 1.237, "grad_norm": 46.14826202392578, "learning_rate": 5.0542e-07, "loss": 0.4523, "step": 123700 }, { "epoch": 1.2375, "grad_norm": 16.78753662109375, "learning_rate": 5.0522e-07, "loss": 0.4304, "step": 123750 }, { "epoch": 1.238, "grad_norm": 90.13909912109375, "learning_rate": 5.0502e-07, "loss": 0.4219, "step": 123800 }, { "epoch": 1.2385, "grad_norm": 20.657495498657227, "learning_rate": 5.0482e-07, "loss": 0.4275, "step": 123850 }, { "epoch": 1.2389999999999999, "grad_norm": 117.53194427490234, "learning_rate": 5.046199999999999e-07, "loss": 0.5234, "step": 123900 }, { "epoch": 1.2395, "grad_norm": 44.009521484375, "learning_rate": 5.044199999999999e-07, "loss": 0.3952, "step": 123950 }, { "epoch": 1.24, "grad_norm": 62.8895263671875, "learning_rate": 5.0422e-07, "loss": 0.3973, "step": 124000 }, { "epoch": 1.2405, "grad_norm": 83.4673080444336, "learning_rate": 5.0402e-07, "loss": 0.4622, "step": 124050 }, { "epoch": 1.241, "grad_norm": 69.27863311767578, "learning_rate": 5.038200000000001e-07, "loss": 0.3788, "step": 124100 }, { "epoch": 1.2415, "grad_norm": 27.948728561401367, "learning_rate": 5.0362e-07, "loss": 0.4593, "step": 124150 }, { "epoch": 1.242, "grad_norm": 17.01951789855957, "learning_rate": 5.034199999999999e-07, "loss": 0.4347, "step": 124200 }, { "epoch": 1.2425, "grad_norm": 95.70691680908203, "learning_rate": 5.0322e-07, "loss": 0.4042, "step": 124250 }, { "epoch": 1.2429999999999999, "grad_norm": 120.20599365234375, "learning_rate": 5.0302e-07, "loss": 0.4595, "step": 124300 }, { "epoch": 1.2435, "grad_norm": 119.31159973144531, "learning_rate": 5.0282e-07, "loss": 0.4708, "step": 124350 }, { "epoch": 1.244, "grad_norm": 72.69075775146484, "learning_rate": 5.0262e-07, "loss": 0.5223, "step": 124400 }, { "epoch": 1.2445, "grad_norm": 65.67170715332031, "learning_rate": 5.0242e-07, "loss": 0.5593, "step": 124450 }, { "epoch": 1.245, "grad_norm": 32.64443588256836, "learning_rate": 5.0222e-07, "loss": 0.532, "step": 124500 }, { "epoch": 1.2455, "grad_norm": 6.584482192993164, "learning_rate": 5.0202e-07, "loss": 0.4046, "step": 124550 }, { "epoch": 1.246, "grad_norm": 26.74454116821289, "learning_rate": 5.0182e-07, "loss": 0.493, "step": 124600 }, { "epoch": 1.2465, "grad_norm": 3.494159460067749, "learning_rate": 5.016199999999999e-07, "loss": 0.395, "step": 124650 }, { "epoch": 1.2469999999999999, "grad_norm": 12.93493938446045, "learning_rate": 5.0142e-07, "loss": 0.3823, "step": 124700 }, { "epoch": 1.2475, "grad_norm": 58.11399459838867, "learning_rate": 5.0122e-07, "loss": 0.4471, "step": 124750 }, { "epoch": 1.248, "grad_norm": 51.41734313964844, "learning_rate": 5.0102e-07, "loss": 0.4074, "step": 124800 }, { "epoch": 1.2485, "grad_norm": 76.81995391845703, "learning_rate": 5.008200000000001e-07, "loss": 0.4224, "step": 124850 }, { "epoch": 1.249, "grad_norm": 48.812599182128906, "learning_rate": 5.006199999999999e-07, "loss": 0.4964, "step": 124900 }, { "epoch": 1.2495, "grad_norm": 62.64362716674805, "learning_rate": 5.004199999999999e-07, "loss": 0.423, "step": 124950 }, { "epoch": 1.25, "grad_norm": 88.53785705566406, "learning_rate": 5.0022e-07, "loss": 0.4533, "step": 125000 }, { "epoch": 1.2505, "grad_norm": 88.8245620727539, "learning_rate": 5.0002e-07, "loss": 0.5211, "step": 125050 }, { "epoch": 1.251, "grad_norm": 43.50419616699219, "learning_rate": 4.99824e-07, "loss": 0.4007, "step": 125100 }, { "epoch": 1.2515, "grad_norm": 11.175210952758789, "learning_rate": 4.99624e-07, "loss": 0.5217, "step": 125150 }, { "epoch": 1.252, "grad_norm": 50.50872039794922, "learning_rate": 4.99424e-07, "loss": 0.4395, "step": 125200 }, { "epoch": 1.2525, "grad_norm": 28.890705108642578, "learning_rate": 4.992239999999999e-07, "loss": 0.3575, "step": 125250 }, { "epoch": 1.2530000000000001, "grad_norm": 2.7780516147613525, "learning_rate": 4.99024e-07, "loss": 0.4679, "step": 125300 }, { "epoch": 1.2535, "grad_norm": 74.0434341430664, "learning_rate": 4.988239999999999e-07, "loss": 0.4339, "step": 125350 }, { "epoch": 1.254, "grad_norm": 1.51035737991333, "learning_rate": 4.98624e-07, "loss": 0.4149, "step": 125400 }, { "epoch": 1.2545, "grad_norm": 11.029925346374512, "learning_rate": 4.98424e-07, "loss": 0.4752, "step": 125450 }, { "epoch": 1.255, "grad_norm": 4.947063446044922, "learning_rate": 4.98224e-07, "loss": 0.5536, "step": 125500 }, { "epoch": 1.2555, "grad_norm": 1.3308128118515015, "learning_rate": 4.98024e-07, "loss": 0.3657, "step": 125550 }, { "epoch": 1.256, "grad_norm": 61.55348205566406, "learning_rate": 4.978239999999999e-07, "loss": 0.5046, "step": 125600 }, { "epoch": 1.2565, "grad_norm": 128.6259765625, "learning_rate": 4.97624e-07, "loss": 0.4097, "step": 125650 }, { "epoch": 1.2570000000000001, "grad_norm": 7.346100807189941, "learning_rate": 4.974239999999999e-07, "loss": 0.4103, "step": 125700 }, { "epoch": 1.2575, "grad_norm": 8.92379093170166, "learning_rate": 4.97224e-07, "loss": 0.2637, "step": 125750 }, { "epoch": 1.258, "grad_norm": 42.0355110168457, "learning_rate": 4.97024e-07, "loss": 0.3719, "step": 125800 }, { "epoch": 1.2585, "grad_norm": 77.65760803222656, "learning_rate": 4.96824e-07, "loss": 0.3427, "step": 125850 }, { "epoch": 1.259, "grad_norm": 95.41388702392578, "learning_rate": 4.96624e-07, "loss": 0.4427, "step": 125900 }, { "epoch": 1.2595, "grad_norm": 38.658050537109375, "learning_rate": 4.964239999999999e-07, "loss": 0.5894, "step": 125950 }, { "epoch": 1.26, "grad_norm": 1.92863130569458, "learning_rate": 4.962239999999999e-07, "loss": 0.5318, "step": 126000 }, { "epoch": 1.2605, "grad_norm": 132.43011474609375, "learning_rate": 4.96024e-07, "loss": 0.4031, "step": 126050 }, { "epoch": 1.2610000000000001, "grad_norm": 51.750030517578125, "learning_rate": 4.95824e-07, "loss": 0.4338, "step": 126100 }, { "epoch": 1.2615, "grad_norm": 4.630794525146484, "learning_rate": 4.95624e-07, "loss": 0.4925, "step": 126150 }, { "epoch": 1.262, "grad_norm": 91.11925506591797, "learning_rate": 4.95424e-07, "loss": 0.4702, "step": 126200 }, { "epoch": 1.2625, "grad_norm": 68.52376556396484, "learning_rate": 4.95224e-07, "loss": 0.6044, "step": 126250 }, { "epoch": 1.263, "grad_norm": 84.54537200927734, "learning_rate": 4.950239999999999e-07, "loss": 0.5117, "step": 126300 }, { "epoch": 1.2635, "grad_norm": 6.372779846191406, "learning_rate": 4.948239999999999e-07, "loss": 0.3654, "step": 126350 }, { "epoch": 1.264, "grad_norm": 10.201987266540527, "learning_rate": 4.94624e-07, "loss": 0.5223, "step": 126400 }, { "epoch": 1.2645, "grad_norm": 48.930599212646484, "learning_rate": 4.944239999999999e-07, "loss": 0.4524, "step": 126450 }, { "epoch": 1.2650000000000001, "grad_norm": 63.114070892333984, "learning_rate": 4.94224e-07, "loss": 0.5026, "step": 126500 }, { "epoch": 1.2655, "grad_norm": 64.93250274658203, "learning_rate": 4.94024e-07, "loss": 0.4449, "step": 126550 }, { "epoch": 1.266, "grad_norm": 18.260643005371094, "learning_rate": 4.93824e-07, "loss": 0.4546, "step": 126600 }, { "epoch": 1.2665, "grad_norm": 56.187164306640625, "learning_rate": 4.93624e-07, "loss": 0.4692, "step": 126650 }, { "epoch": 1.267, "grad_norm": 75.86105346679688, "learning_rate": 4.934239999999999e-07, "loss": 0.4319, "step": 126700 }, { "epoch": 1.2675, "grad_norm": 59.311798095703125, "learning_rate": 4.93228e-07, "loss": 0.4846, "step": 126750 }, { "epoch": 1.268, "grad_norm": 34.68928146362305, "learning_rate": 4.93028e-07, "loss": 0.3693, "step": 126800 }, { "epoch": 1.2685, "grad_norm": 58.48672866821289, "learning_rate": 4.92828e-07, "loss": 0.5163, "step": 126850 }, { "epoch": 1.2690000000000001, "grad_norm": 6.793260097503662, "learning_rate": 4.92628e-07, "loss": 0.4033, "step": 126900 }, { "epoch": 1.2695, "grad_norm": 108.26202392578125, "learning_rate": 4.92428e-07, "loss": 0.4662, "step": 126950 }, { "epoch": 1.27, "grad_norm": 62.59315872192383, "learning_rate": 4.92228e-07, "loss": 0.427, "step": 127000 }, { "epoch": 1.2705, "grad_norm": 7.085406303405762, "learning_rate": 4.92028e-07, "loss": 0.356, "step": 127050 }, { "epoch": 1.271, "grad_norm": 27.684703826904297, "learning_rate": 4.918279999999999e-07, "loss": 0.3432, "step": 127100 }, { "epoch": 1.2715, "grad_norm": 89.28202819824219, "learning_rate": 4.91628e-07, "loss": 0.4642, "step": 127150 }, { "epoch": 1.272, "grad_norm": 81.8940658569336, "learning_rate": 4.91428e-07, "loss": 0.4793, "step": 127200 }, { "epoch": 1.2725, "grad_norm": 57.01283645629883, "learning_rate": 4.91228e-07, "loss": 0.6866, "step": 127250 }, { "epoch": 1.2730000000000001, "grad_norm": 7.828011989593506, "learning_rate": 4.91028e-07, "loss": 0.4175, "step": 127300 }, { "epoch": 1.2735, "grad_norm": 6.988823890686035, "learning_rate": 4.90828e-07, "loss": 0.4843, "step": 127350 }, { "epoch": 1.274, "grad_norm": 129.013916015625, "learning_rate": 4.906279999999999e-07, "loss": 0.4135, "step": 127400 }, { "epoch": 1.2745, "grad_norm": 2.377220392227173, "learning_rate": 4.904279999999999e-07, "loss": 0.4675, "step": 127450 }, { "epoch": 1.275, "grad_norm": 73.6028060913086, "learning_rate": 4.90228e-07, "loss": 0.4821, "step": 127500 }, { "epoch": 1.2755, "grad_norm": 53.05596160888672, "learning_rate": 4.90028e-07, "loss": 0.3646, "step": 127550 }, { "epoch": 1.276, "grad_norm": 2.0764973163604736, "learning_rate": 4.89828e-07, "loss": 0.4229, "step": 127600 }, { "epoch": 1.2765, "grad_norm": 36.13945770263672, "learning_rate": 4.89628e-07, "loss": 0.4127, "step": 127650 }, { "epoch": 1.2770000000000001, "grad_norm": 66.94934844970703, "learning_rate": 4.89428e-07, "loss": 0.4685, "step": 127700 }, { "epoch": 1.2775, "grad_norm": 100.14254760742188, "learning_rate": 4.892279999999999e-07, "loss": 0.5389, "step": 127750 }, { "epoch": 1.278, "grad_norm": 56.245418548583984, "learning_rate": 4.89028e-07, "loss": 0.4782, "step": 127800 }, { "epoch": 1.2785, "grad_norm": 2.368191957473755, "learning_rate": 4.88828e-07, "loss": 0.3856, "step": 127850 }, { "epoch": 1.279, "grad_norm": 125.4609375, "learning_rate": 4.88628e-07, "loss": 0.5209, "step": 127900 }, { "epoch": 1.2795, "grad_norm": 9.785893440246582, "learning_rate": 4.88428e-07, "loss": 0.3902, "step": 127950 }, { "epoch": 1.28, "grad_norm": 76.34190368652344, "learning_rate": 4.88228e-07, "loss": 0.4521, "step": 128000 }, { "epoch": 1.2805, "grad_norm": 37.4355354309082, "learning_rate": 4.88028e-07, "loss": 0.3555, "step": 128050 }, { "epoch": 1.2810000000000001, "grad_norm": 9.940116882324219, "learning_rate": 4.878279999999999e-07, "loss": 0.3704, "step": 128100 }, { "epoch": 1.2814999999999999, "grad_norm": 6.346107006072998, "learning_rate": 4.87628e-07, "loss": 0.3613, "step": 128150 }, { "epoch": 1.282, "grad_norm": 20.553491592407227, "learning_rate": 4.874279999999999e-07, "loss": 0.4186, "step": 128200 }, { "epoch": 1.2825, "grad_norm": 31.738750457763672, "learning_rate": 4.87228e-07, "loss": 0.5275, "step": 128250 }, { "epoch": 1.283, "grad_norm": 32.1572151184082, "learning_rate": 4.87028e-07, "loss": 0.5016, "step": 128300 }, { "epoch": 1.2835, "grad_norm": 60.900787353515625, "learning_rate": 4.86828e-07, "loss": 0.5527, "step": 128350 }, { "epoch": 1.284, "grad_norm": 37.76549530029297, "learning_rate": 4.86628e-07, "loss": 0.3499, "step": 128400 }, { "epoch": 1.2845, "grad_norm": 16.48982810974121, "learning_rate": 4.86428e-07, "loss": 0.3264, "step": 128450 }, { "epoch": 1.285, "grad_norm": 11.211465835571289, "learning_rate": 4.862279999999999e-07, "loss": 0.3776, "step": 128500 }, { "epoch": 1.2854999999999999, "grad_norm": 4.8228583335876465, "learning_rate": 4.86028e-07, "loss": 0.497, "step": 128550 }, { "epoch": 1.286, "grad_norm": 51.52220916748047, "learning_rate": 4.85828e-07, "loss": 0.4283, "step": 128600 }, { "epoch": 1.2865, "grad_norm": 116.35945892333984, "learning_rate": 4.85628e-07, "loss": 0.4819, "step": 128650 }, { "epoch": 1.287, "grad_norm": 16.744569778442383, "learning_rate": 4.85428e-07, "loss": 0.4903, "step": 128700 }, { "epoch": 1.2875, "grad_norm": 83.41133880615234, "learning_rate": 4.85228e-07, "loss": 0.4459, "step": 128750 }, { "epoch": 1.288, "grad_norm": 14.692625999450684, "learning_rate": 4.850279999999999e-07, "loss": 0.3043, "step": 128800 }, { "epoch": 1.2885, "grad_norm": 56.206844329833984, "learning_rate": 4.848279999999999e-07, "loss": 0.4554, "step": 128850 }, { "epoch": 1.289, "grad_norm": 89.15912628173828, "learning_rate": 4.84628e-07, "loss": 0.5114, "step": 128900 }, { "epoch": 1.2894999999999999, "grad_norm": 101.62841033935547, "learning_rate": 4.84428e-07, "loss": 0.491, "step": 128950 }, { "epoch": 1.29, "grad_norm": 65.63429260253906, "learning_rate": 4.84228e-07, "loss": 0.5268, "step": 129000 }, { "epoch": 1.2905, "grad_norm": 50.63749694824219, "learning_rate": 4.84028e-07, "loss": 0.4863, "step": 129050 }, { "epoch": 1.291, "grad_norm": 9.616476058959961, "learning_rate": 4.83828e-07, "loss": 0.3873, "step": 129100 }, { "epoch": 1.2915, "grad_norm": 38.35820388793945, "learning_rate": 4.836279999999999e-07, "loss": 0.2707, "step": 129150 }, { "epoch": 1.292, "grad_norm": 7.243841171264648, "learning_rate": 4.83428e-07, "loss": 0.3993, "step": 129200 }, { "epoch": 1.2925, "grad_norm": 4.215170860290527, "learning_rate": 4.83228e-07, "loss": 0.3448, "step": 129250 }, { "epoch": 1.293, "grad_norm": 34.597991943359375, "learning_rate": 4.83028e-07, "loss": 0.3508, "step": 129300 }, { "epoch": 1.2934999999999999, "grad_norm": 65.05717468261719, "learning_rate": 4.82828e-07, "loss": 0.3527, "step": 129350 }, { "epoch": 1.294, "grad_norm": 13.01252269744873, "learning_rate": 4.82628e-07, "loss": 0.4451, "step": 129400 }, { "epoch": 1.2945, "grad_norm": 13.404302597045898, "learning_rate": 4.82428e-07, "loss": 0.3919, "step": 129450 }, { "epoch": 1.295, "grad_norm": 62.99480056762695, "learning_rate": 4.822279999999999e-07, "loss": 0.4649, "step": 129500 }, { "epoch": 1.2955, "grad_norm": 45.989017486572266, "learning_rate": 4.82028e-07, "loss": 0.479, "step": 129550 }, { "epoch": 1.296, "grad_norm": 34.542762756347656, "learning_rate": 4.818279999999999e-07, "loss": 0.4426, "step": 129600 }, { "epoch": 1.2965, "grad_norm": 41.29684066772461, "learning_rate": 4.81628e-07, "loss": 0.4677, "step": 129650 }, { "epoch": 1.297, "grad_norm": 92.34574127197266, "learning_rate": 4.81428e-07, "loss": 0.3883, "step": 129700 }, { "epoch": 1.2974999999999999, "grad_norm": 27.71408462524414, "learning_rate": 4.81228e-07, "loss": 0.4247, "step": 129750 }, { "epoch": 1.298, "grad_norm": 6.5036492347717285, "learning_rate": 4.81028e-07, "loss": 0.4296, "step": 129800 }, { "epoch": 1.2985, "grad_norm": 17.653621673583984, "learning_rate": 4.808279999999999e-07, "loss": 0.5943, "step": 129850 }, { "epoch": 1.299, "grad_norm": 17.503652572631836, "learning_rate": 4.806319999999999e-07, "loss": 0.4773, "step": 129900 }, { "epoch": 1.2995, "grad_norm": 34.05784225463867, "learning_rate": 4.80432e-07, "loss": 0.455, "step": 129950 }, { "epoch": 1.3, "grad_norm": 3.221923351287842, "learning_rate": 4.80232e-07, "loss": 0.3534, "step": 130000 }, { "epoch": 1.3005, "grad_norm": 44.165382385253906, "learning_rate": 4.80032e-07, "loss": 0.4089, "step": 130050 }, { "epoch": 1.301, "grad_norm": 19.523441314697266, "learning_rate": 4.79832e-07, "loss": 0.5113, "step": 130100 }, { "epoch": 1.3014999999999999, "grad_norm": 15.177743911743164, "learning_rate": 4.79632e-07, "loss": 0.4199, "step": 130150 }, { "epoch": 1.302, "grad_norm": 0.24970543384552002, "learning_rate": 4.794320000000001e-07, "loss": 0.2723, "step": 130200 }, { "epoch": 1.3025, "grad_norm": 80.9247817993164, "learning_rate": 4.792319999999999e-07, "loss": 0.4628, "step": 130250 }, { "epoch": 1.303, "grad_norm": 24.861129760742188, "learning_rate": 4.79032e-07, "loss": 0.4166, "step": 130300 }, { "epoch": 1.3035, "grad_norm": 4.55272912979126, "learning_rate": 4.78832e-07, "loss": 0.4463, "step": 130350 }, { "epoch": 1.304, "grad_norm": 11.789888381958008, "learning_rate": 4.78632e-07, "loss": 0.5848, "step": 130400 }, { "epoch": 1.3045, "grad_norm": 3.2823047637939453, "learning_rate": 4.78432e-07, "loss": 0.4772, "step": 130450 }, { "epoch": 1.305, "grad_norm": 8.338168144226074, "learning_rate": 4.78232e-07, "loss": 0.3896, "step": 130500 }, { "epoch": 1.3054999999999999, "grad_norm": 84.38629150390625, "learning_rate": 4.78032e-07, "loss": 0.4665, "step": 130550 }, { "epoch": 1.306, "grad_norm": 79.32563781738281, "learning_rate": 4.778319999999999e-07, "loss": 0.3935, "step": 130600 }, { "epoch": 1.3065, "grad_norm": 12.994901657104492, "learning_rate": 4.77632e-07, "loss": 0.4861, "step": 130650 }, { "epoch": 1.307, "grad_norm": 34.566383361816406, "learning_rate": 4.77432e-07, "loss": 0.4927, "step": 130700 }, { "epoch": 1.3075, "grad_norm": 80.90245056152344, "learning_rate": 4.77232e-07, "loss": 0.5533, "step": 130750 }, { "epoch": 1.308, "grad_norm": 7.320152282714844, "learning_rate": 4.77032e-07, "loss": 0.4052, "step": 130800 }, { "epoch": 1.3085, "grad_norm": 9.89566421508789, "learning_rate": 4.7683199999999996e-07, "loss": 0.411, "step": 130850 }, { "epoch": 1.309, "grad_norm": 32.64478302001953, "learning_rate": 4.76632e-07, "loss": 0.4795, "step": 130900 }, { "epoch": 1.3094999999999999, "grad_norm": 4.066441059112549, "learning_rate": 4.76432e-07, "loss": 0.637, "step": 130950 }, { "epoch": 1.31, "grad_norm": 147.58811950683594, "learning_rate": 4.7623199999999997e-07, "loss": 0.6026, "step": 131000 }, { "epoch": 1.3105, "grad_norm": 11.256807327270508, "learning_rate": 4.76032e-07, "loss": 0.5771, "step": 131050 }, { "epoch": 1.311, "grad_norm": 52.247196197509766, "learning_rate": 4.7583199999999994e-07, "loss": 0.4587, "step": 131100 }, { "epoch": 1.3115, "grad_norm": 1.7685823440551758, "learning_rate": 4.75632e-07, "loss": 0.3916, "step": 131150 }, { "epoch": 1.312, "grad_norm": 72.10371398925781, "learning_rate": 4.75432e-07, "loss": 0.3049, "step": 131200 }, { "epoch": 1.3125, "grad_norm": 86.47714233398438, "learning_rate": 4.7523199999999995e-07, "loss": 0.4716, "step": 131250 }, { "epoch": 1.313, "grad_norm": 120.87084197998047, "learning_rate": 4.75032e-07, "loss": 0.4428, "step": 131300 }, { "epoch": 1.3135, "grad_norm": 21.09920883178711, "learning_rate": 4.7483199999999997e-07, "loss": 0.521, "step": 131350 }, { "epoch": 1.314, "grad_norm": 88.9284439086914, "learning_rate": 4.7463199999999996e-07, "loss": 0.5036, "step": 131400 }, { "epoch": 1.3145, "grad_norm": 56.702247619628906, "learning_rate": 4.74432e-07, "loss": 0.5041, "step": 131450 }, { "epoch": 1.315, "grad_norm": 0.35143765807151794, "learning_rate": 4.74232e-07, "loss": 0.4163, "step": 131500 }, { "epoch": 1.3155000000000001, "grad_norm": 92.69734191894531, "learning_rate": 4.7403199999999997e-07, "loss": 0.4294, "step": 131550 }, { "epoch": 1.316, "grad_norm": 50.940093994140625, "learning_rate": 4.7383199999999995e-07, "loss": 0.4826, "step": 131600 }, { "epoch": 1.3165, "grad_norm": 82.14228820800781, "learning_rate": 4.73632e-07, "loss": 0.4999, "step": 131650 }, { "epoch": 1.317, "grad_norm": 75.90847778320312, "learning_rate": 4.73432e-07, "loss": 0.4275, "step": 131700 }, { "epoch": 1.3175, "grad_norm": 34.303688049316406, "learning_rate": 4.7323199999999996e-07, "loss": 0.453, "step": 131750 }, { "epoch": 1.318, "grad_norm": 47.25891876220703, "learning_rate": 4.73032e-07, "loss": 0.5394, "step": 131800 }, { "epoch": 1.3185, "grad_norm": 0.128638356924057, "learning_rate": 4.7283199999999993e-07, "loss": 0.3947, "step": 131850 }, { "epoch": 1.319, "grad_norm": 4.6934967041015625, "learning_rate": 4.7263199999999997e-07, "loss": 0.5127, "step": 131900 }, { "epoch": 1.3195000000000001, "grad_norm": 38.059913635253906, "learning_rate": 4.72432e-07, "loss": 0.4648, "step": 131950 }, { "epoch": 1.32, "grad_norm": 42.113609313964844, "learning_rate": 4.72232e-07, "loss": 0.3855, "step": 132000 }, { "epoch": 1.3205, "grad_norm": 88.45838928222656, "learning_rate": 4.72032e-07, "loss": 0.3796, "step": 132050 }, { "epoch": 1.321, "grad_norm": 60.03329086303711, "learning_rate": 4.7183199999999996e-07, "loss": 0.4678, "step": 132100 }, { "epoch": 1.3215, "grad_norm": 92.38176727294922, "learning_rate": 4.71632e-07, "loss": 0.4438, "step": 132150 }, { "epoch": 1.322, "grad_norm": 5.0574235916137695, "learning_rate": 4.71432e-07, "loss": 0.4548, "step": 132200 }, { "epoch": 1.3225, "grad_norm": 45.51777648925781, "learning_rate": 4.7123199999999997e-07, "loss": 0.477, "step": 132250 }, { "epoch": 1.323, "grad_norm": 58.61941146850586, "learning_rate": 4.71032e-07, "loss": 0.4616, "step": 132300 }, { "epoch": 1.3235000000000001, "grad_norm": 9.93522834777832, "learning_rate": 4.7083199999999994e-07, "loss": 0.3566, "step": 132350 }, { "epoch": 1.324, "grad_norm": 73.70626068115234, "learning_rate": 4.70632e-07, "loss": 0.5039, "step": 132400 }, { "epoch": 1.3245, "grad_norm": 119.5567626953125, "learning_rate": 4.70432e-07, "loss": 0.5233, "step": 132450 }, { "epoch": 1.325, "grad_norm": 93.0530014038086, "learning_rate": 4.7023199999999995e-07, "loss": 0.3723, "step": 132500 }, { "epoch": 1.3255, "grad_norm": 94.75311279296875, "learning_rate": 4.70032e-07, "loss": 0.4238, "step": 132550 }, { "epoch": 1.326, "grad_norm": 24.6379337310791, "learning_rate": 4.69832e-07, "loss": 0.5475, "step": 132600 }, { "epoch": 1.3265, "grad_norm": 128.80043029785156, "learning_rate": 4.6963199999999995e-07, "loss": 0.5309, "step": 132650 }, { "epoch": 1.327, "grad_norm": 11.744426727294922, "learning_rate": 4.69432e-07, "loss": 0.4684, "step": 132700 }, { "epoch": 1.3275000000000001, "grad_norm": 68.0375747680664, "learning_rate": 4.69232e-07, "loss": 0.4713, "step": 132750 }, { "epoch": 1.328, "grad_norm": 63.520179748535156, "learning_rate": 4.6903199999999996e-07, "loss": 0.4248, "step": 132800 }, { "epoch": 1.3285, "grad_norm": 10.20051383972168, "learning_rate": 4.68832e-07, "loss": 0.3877, "step": 132850 }, { "epoch": 1.329, "grad_norm": 94.5232925415039, "learning_rate": 4.68632e-07, "loss": 0.5337, "step": 132900 }, { "epoch": 1.3295, "grad_norm": 118.54349517822266, "learning_rate": 4.6843199999999997e-07, "loss": 0.3081, "step": 132950 }, { "epoch": 1.33, "grad_norm": 1.4136791229248047, "learning_rate": 4.6823199999999995e-07, "loss": 0.4322, "step": 133000 }, { "epoch": 1.3305, "grad_norm": 13.606802940368652, "learning_rate": 4.68032e-07, "loss": 0.5477, "step": 133050 }, { "epoch": 1.331, "grad_norm": 58.811222076416016, "learning_rate": 4.6783200000000003e-07, "loss": 0.5183, "step": 133100 }, { "epoch": 1.3315000000000001, "grad_norm": 1.774475336074829, "learning_rate": 4.6763199999999996e-07, "loss": 0.3814, "step": 133150 }, { "epoch": 1.332, "grad_norm": 3.9559104442596436, "learning_rate": 4.67432e-07, "loss": 0.4848, "step": 133200 }, { "epoch": 1.3325, "grad_norm": 44.5838623046875, "learning_rate": 4.67232e-07, "loss": 0.4987, "step": 133250 }, { "epoch": 1.333, "grad_norm": 45.46150207519531, "learning_rate": 4.6703199999999997e-07, "loss": 0.4701, "step": 133300 }, { "epoch": 1.3335, "grad_norm": 4.442442417144775, "learning_rate": 4.66832e-07, "loss": 0.4129, "step": 133350 }, { "epoch": 1.334, "grad_norm": 83.65077209472656, "learning_rate": 4.66632e-07, "loss": 0.4445, "step": 133400 }, { "epoch": 1.3345, "grad_norm": 95.16340637207031, "learning_rate": 4.66432e-07, "loss": 0.4481, "step": 133450 }, { "epoch": 1.335, "grad_norm": 12.23143482208252, "learning_rate": 4.6623199999999996e-07, "loss": 0.4512, "step": 133500 }, { "epoch": 1.3355000000000001, "grad_norm": 38.25416564941406, "learning_rate": 4.66032e-07, "loss": 0.3672, "step": 133550 }, { "epoch": 1.336, "grad_norm": 57.15376281738281, "learning_rate": 4.65832e-07, "loss": 0.5406, "step": 133600 }, { "epoch": 1.3365, "grad_norm": 58.421695709228516, "learning_rate": 4.6563199999999997e-07, "loss": 0.5141, "step": 133650 }, { "epoch": 1.337, "grad_norm": 87.27682495117188, "learning_rate": 4.65432e-07, "loss": 0.4306, "step": 133700 }, { "epoch": 1.3375, "grad_norm": 71.72251892089844, "learning_rate": 4.6523199999999994e-07, "loss": 0.4374, "step": 133750 }, { "epoch": 1.338, "grad_norm": 49.080322265625, "learning_rate": 4.65032e-07, "loss": 0.5072, "step": 133800 }, { "epoch": 1.3385, "grad_norm": 22.312482833862305, "learning_rate": 4.64832e-07, "loss": 0.4124, "step": 133850 }, { "epoch": 1.339, "grad_norm": 36.95961380004883, "learning_rate": 4.6463199999999995e-07, "loss": 0.4986, "step": 133900 }, { "epoch": 1.3395000000000001, "grad_norm": 2.186569929122925, "learning_rate": 4.64432e-07, "loss": 0.3585, "step": 133950 }, { "epoch": 1.34, "grad_norm": 38.58488464355469, "learning_rate": 4.6423199999999997e-07, "loss": 0.4046, "step": 134000 }, { "epoch": 1.3405, "grad_norm": 83.42021942138672, "learning_rate": 4.6403199999999996e-07, "loss": 0.4514, "step": 134050 }, { "epoch": 1.341, "grad_norm": 5.808516979217529, "learning_rate": 4.63832e-07, "loss": 0.4749, "step": 134100 }, { "epoch": 1.3415, "grad_norm": 6.701382637023926, "learning_rate": 4.63632e-07, "loss": 0.4058, "step": 134150 }, { "epoch": 1.342, "grad_norm": 54.70067596435547, "learning_rate": 4.6343199999999996e-07, "loss": 0.4774, "step": 134200 }, { "epoch": 1.3425, "grad_norm": 0.1387663632631302, "learning_rate": 4.6323199999999995e-07, "loss": 0.3653, "step": 134250 }, { "epoch": 1.343, "grad_norm": 32.30421829223633, "learning_rate": 4.63032e-07, "loss": 0.3759, "step": 134300 }, { "epoch": 1.3435000000000001, "grad_norm": 60.436344146728516, "learning_rate": 4.62832e-07, "loss": 0.5095, "step": 134350 }, { "epoch": 1.3439999999999999, "grad_norm": 83.42176818847656, "learning_rate": 4.6263199999999996e-07, "loss": 0.425, "step": 134400 }, { "epoch": 1.3445, "grad_norm": 3.811023473739624, "learning_rate": 4.62432e-07, "loss": 0.407, "step": 134450 }, { "epoch": 1.345, "grad_norm": 74.2744369506836, "learning_rate": 4.62232e-07, "loss": 0.4088, "step": 134500 }, { "epoch": 1.3455, "grad_norm": 7.530820846557617, "learning_rate": 4.6203199999999996e-07, "loss": 0.401, "step": 134550 }, { "epoch": 1.346, "grad_norm": 143.85891723632812, "learning_rate": 4.61832e-07, "loss": 0.5301, "step": 134600 }, { "epoch": 1.3465, "grad_norm": 82.85350036621094, "learning_rate": 4.61632e-07, "loss": 0.4794, "step": 134650 }, { "epoch": 1.347, "grad_norm": 94.02362823486328, "learning_rate": 4.6143199999999997e-07, "loss": 0.408, "step": 134700 }, { "epoch": 1.3475, "grad_norm": 1.712831974029541, "learning_rate": 4.6123199999999996e-07, "loss": 0.3879, "step": 134750 }, { "epoch": 1.3479999999999999, "grad_norm": 91.60545349121094, "learning_rate": 4.61032e-07, "loss": 0.4832, "step": 134800 }, { "epoch": 1.3485, "grad_norm": 107.8651123046875, "learning_rate": 4.60832e-07, "loss": 0.4833, "step": 134850 }, { "epoch": 1.349, "grad_norm": 70.89888000488281, "learning_rate": 4.6063199999999997e-07, "loss": 0.4173, "step": 134900 }, { "epoch": 1.3495, "grad_norm": 100.18441772460938, "learning_rate": 4.60432e-07, "loss": 0.5035, "step": 134950 }, { "epoch": 1.35, "grad_norm": 12.220166206359863, "learning_rate": 4.6023599999999994e-07, "loss": 0.3394, "step": 135000 }, { "epoch": 1.3505, "grad_norm": 76.76039123535156, "learning_rate": 4.60036e-07, "loss": 0.4507, "step": 135050 }, { "epoch": 1.351, "grad_norm": 94.28003692626953, "learning_rate": 4.59836e-07, "loss": 0.3727, "step": 135100 }, { "epoch": 1.3515, "grad_norm": 109.33968353271484, "learning_rate": 4.5963599999999995e-07, "loss": 0.4583, "step": 135150 }, { "epoch": 1.3519999999999999, "grad_norm": 50.13473892211914, "learning_rate": 4.59436e-07, "loss": 0.4678, "step": 135200 }, { "epoch": 1.3525, "grad_norm": 30.904821395874023, "learning_rate": 4.5923599999999997e-07, "loss": 0.4636, "step": 135250 }, { "epoch": 1.353, "grad_norm": 115.35250091552734, "learning_rate": 4.5903599999999996e-07, "loss": 0.3913, "step": 135300 }, { "epoch": 1.3535, "grad_norm": 116.52228546142578, "learning_rate": 4.58836e-07, "loss": 0.553, "step": 135350 }, { "epoch": 1.354, "grad_norm": 86.03593444824219, "learning_rate": 4.58636e-07, "loss": 0.5031, "step": 135400 }, { "epoch": 1.3545, "grad_norm": 72.3880386352539, "learning_rate": 4.5843599999999996e-07, "loss": 0.4295, "step": 135450 }, { "epoch": 1.355, "grad_norm": 45.91992950439453, "learning_rate": 4.5823599999999995e-07, "loss": 0.396, "step": 135500 }, { "epoch": 1.3555, "grad_norm": 17.957754135131836, "learning_rate": 4.58036e-07, "loss": 0.3191, "step": 135550 }, { "epoch": 1.3559999999999999, "grad_norm": 3.7718541622161865, "learning_rate": 4.57836e-07, "loss": 0.5687, "step": 135600 }, { "epoch": 1.3565, "grad_norm": 16.24213409423828, "learning_rate": 4.5763599999999996e-07, "loss": 0.4716, "step": 135650 }, { "epoch": 1.357, "grad_norm": 3.7620809078216553, "learning_rate": 4.57436e-07, "loss": 0.456, "step": 135700 }, { "epoch": 1.3575, "grad_norm": 86.84635925292969, "learning_rate": 4.57236e-07, "loss": 0.3689, "step": 135750 }, { "epoch": 1.358, "grad_norm": 84.04347229003906, "learning_rate": 4.5703599999999996e-07, "loss": 0.4508, "step": 135800 }, { "epoch": 1.3585, "grad_norm": 29.693422317504883, "learning_rate": 4.56836e-07, "loss": 0.5614, "step": 135850 }, { "epoch": 1.359, "grad_norm": 45.12217712402344, "learning_rate": 4.56636e-07, "loss": 0.3916, "step": 135900 }, { "epoch": 1.3595, "grad_norm": 31.29984474182129, "learning_rate": 4.5643599999999997e-07, "loss": 0.4284, "step": 135950 }, { "epoch": 1.3599999999999999, "grad_norm": 33.480751037597656, "learning_rate": 4.5623599999999996e-07, "loss": 0.3547, "step": 136000 }, { "epoch": 1.3605, "grad_norm": 2.545854091644287, "learning_rate": 4.56036e-07, "loss": 0.418, "step": 136050 }, { "epoch": 1.361, "grad_norm": 91.34917449951172, "learning_rate": 4.55836e-07, "loss": 0.4277, "step": 136100 }, { "epoch": 1.3615, "grad_norm": 36.96833419799805, "learning_rate": 4.5563599999999996e-07, "loss": 0.4769, "step": 136150 }, { "epoch": 1.362, "grad_norm": 53.53211975097656, "learning_rate": 4.55436e-07, "loss": 0.4037, "step": 136200 }, { "epoch": 1.3625, "grad_norm": 16.555437088012695, "learning_rate": 4.5523599999999993e-07, "loss": 0.4622, "step": 136250 }, { "epoch": 1.363, "grad_norm": 2.20904803276062, "learning_rate": 4.5503599999999997e-07, "loss": 0.4856, "step": 136300 }, { "epoch": 1.3635, "grad_norm": 67.71829986572266, "learning_rate": 4.54836e-07, "loss": 0.4294, "step": 136350 }, { "epoch": 1.3639999999999999, "grad_norm": 27.427248001098633, "learning_rate": 4.5463599999999994e-07, "loss": 0.5167, "step": 136400 }, { "epoch": 1.3645, "grad_norm": 80.44205474853516, "learning_rate": 4.54436e-07, "loss": 0.4855, "step": 136450 }, { "epoch": 1.365, "grad_norm": 15.163834571838379, "learning_rate": 4.5423599999999997e-07, "loss": 0.3511, "step": 136500 }, { "epoch": 1.3655, "grad_norm": 12.879931449890137, "learning_rate": 4.5403599999999995e-07, "loss": 0.4367, "step": 136550 }, { "epoch": 1.366, "grad_norm": 68.53999328613281, "learning_rate": 4.53836e-07, "loss": 0.5996, "step": 136600 }, { "epoch": 1.3665, "grad_norm": 121.40241241455078, "learning_rate": 4.5363599999999997e-07, "loss": 0.5384, "step": 136650 }, { "epoch": 1.367, "grad_norm": 0.31804296374320984, "learning_rate": 4.53436e-07, "loss": 0.6281, "step": 136700 }, { "epoch": 1.3675, "grad_norm": 13.64835262298584, "learning_rate": 4.53236e-07, "loss": 0.4431, "step": 136750 }, { "epoch": 1.3679999999999999, "grad_norm": 39.9966926574707, "learning_rate": 4.53036e-07, "loss": 0.4694, "step": 136800 }, { "epoch": 1.3685, "grad_norm": 29.257156372070312, "learning_rate": 4.52836e-07, "loss": 0.5316, "step": 136850 }, { "epoch": 1.369, "grad_norm": 68.866943359375, "learning_rate": 4.5263599999999995e-07, "loss": 0.4085, "step": 136900 }, { "epoch": 1.3695, "grad_norm": 1.851599097251892, "learning_rate": 4.52436e-07, "loss": 0.4832, "step": 136950 }, { "epoch": 1.37, "grad_norm": 66.2961654663086, "learning_rate": 4.5223600000000003e-07, "loss": 0.4054, "step": 137000 }, { "epoch": 1.3705, "grad_norm": 12.196954727172852, "learning_rate": 4.5203599999999996e-07, "loss": 0.5535, "step": 137050 }, { "epoch": 1.371, "grad_norm": 60.95892333984375, "learning_rate": 4.51836e-07, "loss": 0.3948, "step": 137100 }, { "epoch": 1.3715, "grad_norm": 0.48031020164489746, "learning_rate": 4.51636e-07, "loss": 0.5336, "step": 137150 }, { "epoch": 1.3719999999999999, "grad_norm": 88.17898559570312, "learning_rate": 4.5143599999999997e-07, "loss": 0.553, "step": 137200 }, { "epoch": 1.3725, "grad_norm": 15.11093807220459, "learning_rate": 4.51236e-07, "loss": 0.4888, "step": 137250 }, { "epoch": 1.373, "grad_norm": 27.76417350769043, "learning_rate": 4.51036e-07, "loss": 0.5925, "step": 137300 }, { "epoch": 1.3735, "grad_norm": 2.448547840118408, "learning_rate": 4.50836e-07, "loss": 0.3783, "step": 137350 }, { "epoch": 1.374, "grad_norm": 7.2902607917785645, "learning_rate": 4.5063599999999996e-07, "loss": 0.4221, "step": 137400 }, { "epoch": 1.3745, "grad_norm": 15.440958976745605, "learning_rate": 4.50436e-07, "loss": 0.3223, "step": 137450 }, { "epoch": 1.375, "grad_norm": 9.393592834472656, "learning_rate": 4.50236e-07, "loss": 0.393, "step": 137500 }, { "epoch": 1.3755, "grad_norm": 22.066003799438477, "learning_rate": 4.5003599999999997e-07, "loss": 0.4808, "step": 137550 }, { "epoch": 1.376, "grad_norm": 2.079468250274658, "learning_rate": 4.49836e-07, "loss": 0.4312, "step": 137600 }, { "epoch": 1.3765, "grad_norm": 74.9188461303711, "learning_rate": 4.4963599999999994e-07, "loss": 0.4377, "step": 137650 }, { "epoch": 1.377, "grad_norm": 0.9050947427749634, "learning_rate": 4.49436e-07, "loss": 0.4035, "step": 137700 }, { "epoch": 1.3775, "grad_norm": 104.04845428466797, "learning_rate": 4.49236e-07, "loss": 0.3991, "step": 137750 }, { "epoch": 1.3780000000000001, "grad_norm": 47.782188415527344, "learning_rate": 4.4903599999999995e-07, "loss": 0.4328, "step": 137800 }, { "epoch": 1.3785, "grad_norm": 66.12909698486328, "learning_rate": 4.48836e-07, "loss": 0.4564, "step": 137850 }, { "epoch": 1.379, "grad_norm": 99.37031555175781, "learning_rate": 4.4863599999999997e-07, "loss": 0.4904, "step": 137900 }, { "epoch": 1.3795, "grad_norm": 110.17735290527344, "learning_rate": 4.48436e-07, "loss": 0.5182, "step": 137950 }, { "epoch": 1.38, "grad_norm": 50.46833038330078, "learning_rate": 4.48236e-07, "loss": 0.5215, "step": 138000 }, { "epoch": 1.3805, "grad_norm": 110.3285140991211, "learning_rate": 4.48036e-07, "loss": 0.5516, "step": 138050 }, { "epoch": 1.381, "grad_norm": 1.3695465326309204, "learning_rate": 4.47836e-07, "loss": 0.4337, "step": 138100 }, { "epoch": 1.3815, "grad_norm": 38.30186462402344, "learning_rate": 4.4763599999999995e-07, "loss": 0.4659, "step": 138150 }, { "epoch": 1.3820000000000001, "grad_norm": 70.28520202636719, "learning_rate": 4.47436e-07, "loss": 0.48, "step": 138200 }, { "epoch": 1.3825, "grad_norm": 9.3742036819458, "learning_rate": 4.47236e-07, "loss": 0.4788, "step": 138250 }, { "epoch": 1.383, "grad_norm": 0.6705490350723267, "learning_rate": 4.4703599999999995e-07, "loss": 0.4825, "step": 138300 }, { "epoch": 1.3835, "grad_norm": 30.913881301879883, "learning_rate": 4.46836e-07, "loss": 0.4678, "step": 138350 }, { "epoch": 1.384, "grad_norm": 71.79946899414062, "learning_rate": 4.46636e-07, "loss": 0.5039, "step": 138400 }, { "epoch": 1.3845, "grad_norm": 25.541101455688477, "learning_rate": 4.4643599999999996e-07, "loss": 0.4999, "step": 138450 }, { "epoch": 1.385, "grad_norm": 88.34212493896484, "learning_rate": 4.46236e-07, "loss": 0.3772, "step": 138500 }, { "epoch": 1.3855, "grad_norm": 28.59035873413086, "learning_rate": 4.46036e-07, "loss": 0.3627, "step": 138550 }, { "epoch": 1.3860000000000001, "grad_norm": 60.05609130859375, "learning_rate": 4.4583599999999997e-07, "loss": 0.458, "step": 138600 }, { "epoch": 1.3865, "grad_norm": 52.41706085205078, "learning_rate": 4.4563599999999995e-07, "loss": 0.3964, "step": 138650 }, { "epoch": 1.387, "grad_norm": 121.22614288330078, "learning_rate": 4.45436e-07, "loss": 0.3593, "step": 138700 }, { "epoch": 1.3875, "grad_norm": 9.27535629272461, "learning_rate": 4.45236e-07, "loss": 0.5142, "step": 138750 }, { "epoch": 1.388, "grad_norm": 5.494357109069824, "learning_rate": 4.4503599999999996e-07, "loss": 0.4425, "step": 138800 }, { "epoch": 1.3885, "grad_norm": 100.4722671508789, "learning_rate": 4.44836e-07, "loss": 0.4553, "step": 138850 }, { "epoch": 1.389, "grad_norm": 6.942700386047363, "learning_rate": 4.4463599999999993e-07, "loss": 0.4525, "step": 138900 }, { "epoch": 1.3895, "grad_norm": 38.53964614868164, "learning_rate": 4.4443599999999997e-07, "loss": 0.5523, "step": 138950 }, { "epoch": 1.3900000000000001, "grad_norm": 38.89802169799805, "learning_rate": 4.44236e-07, "loss": 0.3422, "step": 139000 }, { "epoch": 1.3905, "grad_norm": 123.1884765625, "learning_rate": 4.4403599999999994e-07, "loss": 0.6059, "step": 139050 }, { "epoch": 1.391, "grad_norm": 3.0012309551239014, "learning_rate": 4.43836e-07, "loss": 0.3437, "step": 139100 }, { "epoch": 1.3915, "grad_norm": 1.8544474840164185, "learning_rate": 4.4363599999999996e-07, "loss": 0.3483, "step": 139150 }, { "epoch": 1.392, "grad_norm": 81.5112533569336, "learning_rate": 4.43436e-07, "loss": 0.4729, "step": 139200 }, { "epoch": 1.3925, "grad_norm": 88.00301361083984, "learning_rate": 4.43236e-07, "loss": 0.4897, "step": 139250 }, { "epoch": 1.393, "grad_norm": 7.775111675262451, "learning_rate": 4.4303599999999997e-07, "loss": 0.4474, "step": 139300 }, { "epoch": 1.3935, "grad_norm": 60.3265266418457, "learning_rate": 4.42836e-07, "loss": 0.5121, "step": 139350 }, { "epoch": 1.3940000000000001, "grad_norm": 79.7757568359375, "learning_rate": 4.4263599999999994e-07, "loss": 0.512, "step": 139400 }, { "epoch": 1.3945, "grad_norm": 13.68470287322998, "learning_rate": 4.42436e-07, "loss": 0.3414, "step": 139450 }, { "epoch": 1.395, "grad_norm": 60.95335388183594, "learning_rate": 4.42236e-07, "loss": 0.4045, "step": 139500 }, { "epoch": 1.3955, "grad_norm": 47.811424255371094, "learning_rate": 4.4203599999999995e-07, "loss": 0.384, "step": 139550 }, { "epoch": 1.396, "grad_norm": 17.77873420715332, "learning_rate": 4.41836e-07, "loss": 0.4314, "step": 139600 }, { "epoch": 1.3965, "grad_norm": 0.8219741582870483, "learning_rate": 4.4163599999999997e-07, "loss": 0.4474, "step": 139650 }, { "epoch": 1.397, "grad_norm": 44.76259231567383, "learning_rate": 4.4143599999999996e-07, "loss": 0.4728, "step": 139700 }, { "epoch": 1.3975, "grad_norm": 32.0296745300293, "learning_rate": 4.41236e-07, "loss": 0.2824, "step": 139750 }, { "epoch": 1.3980000000000001, "grad_norm": 6.995255947113037, "learning_rate": 4.41036e-07, "loss": 0.4395, "step": 139800 }, { "epoch": 1.3985, "grad_norm": 3.6521756649017334, "learning_rate": 4.4083599999999996e-07, "loss": 0.4282, "step": 139850 }, { "epoch": 1.399, "grad_norm": 0.5228773951530457, "learning_rate": 4.4063999999999995e-07, "loss": 0.5437, "step": 139900 }, { "epoch": 1.3995, "grad_norm": 0.48555630445480347, "learning_rate": 4.4044e-07, "loss": 0.5651, "step": 139950 }, { "epoch": 1.4, "grad_norm": 81.31806182861328, "learning_rate": 4.4024e-07, "loss": 0.4659, "step": 140000 }, { "epoch": 1.4005, "grad_norm": 1.3937146663665771, "learning_rate": 4.4003999999999996e-07, "loss": 0.4486, "step": 140050 }, { "epoch": 1.401, "grad_norm": 21.106035232543945, "learning_rate": 4.3984e-07, "loss": 0.4779, "step": 140100 }, { "epoch": 1.4015, "grad_norm": 13.850019454956055, "learning_rate": 4.3963999999999993e-07, "loss": 0.3871, "step": 140150 }, { "epoch": 1.4020000000000001, "grad_norm": 1.2949334383010864, "learning_rate": 4.3943999999999997e-07, "loss": 0.5905, "step": 140200 }, { "epoch": 1.4025, "grad_norm": 85.25445556640625, "learning_rate": 4.3924e-07, "loss": 0.5164, "step": 140250 }, { "epoch": 1.403, "grad_norm": 21.79319190979004, "learning_rate": 4.3903999999999994e-07, "loss": 0.4014, "step": 140300 }, { "epoch": 1.4035, "grad_norm": 16.37718391418457, "learning_rate": 4.3884e-07, "loss": 0.4419, "step": 140350 }, { "epoch": 1.404, "grad_norm": 3.0240814685821533, "learning_rate": 4.3863999999999996e-07, "loss": 0.5172, "step": 140400 }, { "epoch": 1.4045, "grad_norm": 106.63111877441406, "learning_rate": 4.3844e-07, "loss": 0.4104, "step": 140450 }, { "epoch": 1.405, "grad_norm": 17.3591251373291, "learning_rate": 4.3824e-07, "loss": 0.4768, "step": 140500 }, { "epoch": 1.4055, "grad_norm": 76.1080322265625, "learning_rate": 4.3803999999999997e-07, "loss": 0.4747, "step": 140550 }, { "epoch": 1.4060000000000001, "grad_norm": 3.34743595123291, "learning_rate": 4.3784e-07, "loss": 0.3191, "step": 140600 }, { "epoch": 1.4064999999999999, "grad_norm": 38.74766159057617, "learning_rate": 4.3763999999999994e-07, "loss": 0.4569, "step": 140650 }, { "epoch": 1.407, "grad_norm": 46.4010124206543, "learning_rate": 4.3744e-07, "loss": 0.3989, "step": 140700 }, { "epoch": 1.4075, "grad_norm": 69.19367980957031, "learning_rate": 4.3724e-07, "loss": 0.4423, "step": 140750 }, { "epoch": 1.408, "grad_norm": 86.27606964111328, "learning_rate": 4.3703999999999995e-07, "loss": 0.3281, "step": 140800 }, { "epoch": 1.4085, "grad_norm": 0.331612229347229, "learning_rate": 4.3684e-07, "loss": 0.552, "step": 140850 }, { "epoch": 1.409, "grad_norm": 17.078815460205078, "learning_rate": 4.3663999999999997e-07, "loss": 0.4963, "step": 140900 }, { "epoch": 1.4095, "grad_norm": 17.343914031982422, "learning_rate": 4.3643999999999996e-07, "loss": 0.3181, "step": 140950 }, { "epoch": 1.41, "grad_norm": 75.39768981933594, "learning_rate": 4.3624e-07, "loss": 0.3793, "step": 141000 }, { "epoch": 1.4104999999999999, "grad_norm": 0.2830001711845398, "learning_rate": 4.3604e-07, "loss": 0.4037, "step": 141050 }, { "epoch": 1.411, "grad_norm": 30.401926040649414, "learning_rate": 4.3583999999999996e-07, "loss": 0.427, "step": 141100 }, { "epoch": 1.4115, "grad_norm": 10.625136375427246, "learning_rate": 4.3564e-07, "loss": 0.4488, "step": 141150 }, { "epoch": 1.412, "grad_norm": 8.427471160888672, "learning_rate": 4.3544e-07, "loss": 0.3985, "step": 141200 }, { "epoch": 1.4125, "grad_norm": 26.40860366821289, "learning_rate": 4.3523999999999997e-07, "loss": 0.3947, "step": 141250 }, { "epoch": 1.413, "grad_norm": 87.355224609375, "learning_rate": 4.3503999999999996e-07, "loss": 0.3551, "step": 141300 }, { "epoch": 1.4135, "grad_norm": 31.92610740661621, "learning_rate": 4.3484e-07, "loss": 0.5354, "step": 141350 }, { "epoch": 1.414, "grad_norm": 24.853439331054688, "learning_rate": 4.3464000000000003e-07, "loss": 0.4651, "step": 141400 }, { "epoch": 1.4144999999999999, "grad_norm": 73.6370849609375, "learning_rate": 4.3443999999999996e-07, "loss": 0.4862, "step": 141450 }, { "epoch": 1.415, "grad_norm": 35.83401870727539, "learning_rate": 4.3424e-07, "loss": 0.4366, "step": 141500 }, { "epoch": 1.4155, "grad_norm": 30.853010177612305, "learning_rate": 4.3404e-07, "loss": 0.5262, "step": 141550 }, { "epoch": 1.416, "grad_norm": 118.53955078125, "learning_rate": 4.3383999999999997e-07, "loss": 0.5409, "step": 141600 }, { "epoch": 1.4165, "grad_norm": 40.07814025878906, "learning_rate": 4.3364399999999996e-07, "loss": 0.4521, "step": 141650 }, { "epoch": 1.417, "grad_norm": 9.961045265197754, "learning_rate": 4.33444e-07, "loss": 0.3655, "step": 141700 }, { "epoch": 1.4175, "grad_norm": 141.18646240234375, "learning_rate": 4.33244e-07, "loss": 0.6208, "step": 141750 }, { "epoch": 1.418, "grad_norm": 58.834197998046875, "learning_rate": 4.3304399999999997e-07, "loss": 0.3911, "step": 141800 }, { "epoch": 1.4184999999999999, "grad_norm": 63.38532257080078, "learning_rate": 4.32844e-07, "loss": 0.3999, "step": 141850 }, { "epoch": 1.419, "grad_norm": 77.08700561523438, "learning_rate": 4.3264399999999994e-07, "loss": 0.5151, "step": 141900 }, { "epoch": 1.4195, "grad_norm": 0.9496738910675049, "learning_rate": 4.32444e-07, "loss": 0.4697, "step": 141950 }, { "epoch": 1.42, "grad_norm": 35.80956268310547, "learning_rate": 4.32244e-07, "loss": 0.3883, "step": 142000 }, { "epoch": 1.4205, "grad_norm": 82.25574493408203, "learning_rate": 4.3204399999999995e-07, "loss": 0.5685, "step": 142050 }, { "epoch": 1.421, "grad_norm": 8.83198356628418, "learning_rate": 4.31844e-07, "loss": 0.4466, "step": 142100 }, { "epoch": 1.4215, "grad_norm": 84.17395782470703, "learning_rate": 4.31644e-07, "loss": 0.4718, "step": 142150 }, { "epoch": 1.422, "grad_norm": 94.86622619628906, "learning_rate": 4.3144399999999996e-07, "loss": 0.3978, "step": 142200 }, { "epoch": 1.4224999999999999, "grad_norm": 77.70457458496094, "learning_rate": 4.31244e-07, "loss": 0.4577, "step": 142250 }, { "epoch": 1.423, "grad_norm": 95.18637084960938, "learning_rate": 4.31044e-07, "loss": 0.4149, "step": 142300 }, { "epoch": 1.4235, "grad_norm": 63.49544143676758, "learning_rate": 4.3084399999999996e-07, "loss": 0.5862, "step": 142350 }, { "epoch": 1.424, "grad_norm": 16.329381942749023, "learning_rate": 4.30644e-07, "loss": 0.3519, "step": 142400 }, { "epoch": 1.4245, "grad_norm": 8.99925708770752, "learning_rate": 4.30444e-07, "loss": 0.3929, "step": 142450 }, { "epoch": 1.425, "grad_norm": 3.1051368713378906, "learning_rate": 4.3024399999999997e-07, "loss": 0.3946, "step": 142500 }, { "epoch": 1.4255, "grad_norm": 116.58087921142578, "learning_rate": 4.3004399999999996e-07, "loss": 0.5392, "step": 142550 }, { "epoch": 1.426, "grad_norm": 11.166736602783203, "learning_rate": 4.29848e-07, "loss": 0.4117, "step": 142600 }, { "epoch": 1.4264999999999999, "grad_norm": 45.40515899658203, "learning_rate": 4.2964799999999993e-07, "loss": 0.4275, "step": 142650 }, { "epoch": 1.427, "grad_norm": 49.694183349609375, "learning_rate": 4.2944799999999997e-07, "loss": 0.3936, "step": 142700 }, { "epoch": 1.4275, "grad_norm": 52.98374557495117, "learning_rate": 4.29248e-07, "loss": 0.4006, "step": 142750 }, { "epoch": 1.428, "grad_norm": 23.433656692504883, "learning_rate": 4.29048e-07, "loss": 0.4007, "step": 142800 }, { "epoch": 1.4285, "grad_norm": 88.89615631103516, "learning_rate": 4.28848e-07, "loss": 0.442, "step": 142850 }, { "epoch": 1.429, "grad_norm": 2.4820308685302734, "learning_rate": 4.2864799999999996e-07, "loss": 0.4532, "step": 142900 }, { "epoch": 1.4295, "grad_norm": 26.804426193237305, "learning_rate": 4.28448e-07, "loss": 0.4837, "step": 142950 }, { "epoch": 1.43, "grad_norm": 9.690469741821289, "learning_rate": 4.28248e-07, "loss": 0.4484, "step": 143000 }, { "epoch": 1.4304999999999999, "grad_norm": 31.82974624633789, "learning_rate": 4.2804799999999997e-07, "loss": 0.4718, "step": 143050 }, { "epoch": 1.431, "grad_norm": 76.13028717041016, "learning_rate": 4.27848e-07, "loss": 0.4575, "step": 143100 }, { "epoch": 1.4315, "grad_norm": 79.27205657958984, "learning_rate": 4.2764799999999994e-07, "loss": 0.4532, "step": 143150 }, { "epoch": 1.432, "grad_norm": 1.6054680347442627, "learning_rate": 4.27448e-07, "loss": 0.4233, "step": 143200 }, { "epoch": 1.4325, "grad_norm": 5.973708629608154, "learning_rate": 4.27248e-07, "loss": 0.4445, "step": 143250 }, { "epoch": 1.433, "grad_norm": 97.4744873046875, "learning_rate": 4.2704799999999995e-07, "loss": 0.5598, "step": 143300 }, { "epoch": 1.4335, "grad_norm": 7.280982494354248, "learning_rate": 4.26848e-07, "loss": 0.4821, "step": 143350 }, { "epoch": 1.434, "grad_norm": 30.4831485748291, "learning_rate": 4.26648e-07, "loss": 0.3772, "step": 143400 }, { "epoch": 1.4344999999999999, "grad_norm": 6.623522758483887, "learning_rate": 4.2644799999999996e-07, "loss": 0.4908, "step": 143450 }, { "epoch": 1.435, "grad_norm": 26.69012451171875, "learning_rate": 4.26248e-07, "loss": 0.4367, "step": 143500 }, { "epoch": 1.4355, "grad_norm": 48.03159713745117, "learning_rate": 4.26048e-07, "loss": 0.57, "step": 143550 }, { "epoch": 1.436, "grad_norm": 0.36997270584106445, "learning_rate": 4.2584799999999996e-07, "loss": 0.381, "step": 143600 }, { "epoch": 1.4365, "grad_norm": 75.4528579711914, "learning_rate": 4.25648e-07, "loss": 0.3569, "step": 143650 }, { "epoch": 1.437, "grad_norm": 23.47142219543457, "learning_rate": 4.25448e-07, "loss": 0.4182, "step": 143700 }, { "epoch": 1.4375, "grad_norm": 40.28401184082031, "learning_rate": 4.2524799999999997e-07, "loss": 0.4932, "step": 143750 }, { "epoch": 1.438, "grad_norm": 8.089696884155273, "learning_rate": 4.2504799999999996e-07, "loss": 0.5582, "step": 143800 }, { "epoch": 1.4385, "grad_norm": 93.84317779541016, "learning_rate": 4.24848e-07, "loss": 0.3421, "step": 143850 }, { "epoch": 1.439, "grad_norm": 78.79322052001953, "learning_rate": 4.2464800000000003e-07, "loss": 0.5463, "step": 143900 }, { "epoch": 1.4395, "grad_norm": 153.0576171875, "learning_rate": 4.2444799999999996e-07, "loss": 0.4011, "step": 143950 }, { "epoch": 1.44, "grad_norm": 4.934996604919434, "learning_rate": 4.24248e-07, "loss": 0.4774, "step": 144000 }, { "epoch": 1.4405000000000001, "grad_norm": 86.63742065429688, "learning_rate": 4.24048e-07, "loss": 0.4094, "step": 144050 }, { "epoch": 1.441, "grad_norm": 79.47491455078125, "learning_rate": 4.2384799999999997e-07, "loss": 0.3831, "step": 144100 }, { "epoch": 1.4415, "grad_norm": 55.07907485961914, "learning_rate": 4.23648e-07, "loss": 0.4505, "step": 144150 }, { "epoch": 1.442, "grad_norm": 7.138432025909424, "learning_rate": 4.23448e-07, "loss": 0.4995, "step": 144200 }, { "epoch": 1.4425, "grad_norm": 31.88157844543457, "learning_rate": 4.23248e-07, "loss": 0.674, "step": 144250 }, { "epoch": 1.443, "grad_norm": 24.490238189697266, "learning_rate": 4.2304799999999996e-07, "loss": 0.4163, "step": 144300 }, { "epoch": 1.4435, "grad_norm": 67.5228042602539, "learning_rate": 4.22848e-07, "loss": 0.3403, "step": 144350 }, { "epoch": 1.444, "grad_norm": 1.8344687223434448, "learning_rate": 4.22648e-07, "loss": 0.5146, "step": 144400 }, { "epoch": 1.4445000000000001, "grad_norm": 52.78595733642578, "learning_rate": 4.2244799999999997e-07, "loss": 0.5185, "step": 144450 }, { "epoch": 1.445, "grad_norm": 8.644986152648926, "learning_rate": 4.22248e-07, "loss": 0.5884, "step": 144500 }, { "epoch": 1.4455, "grad_norm": 28.123138427734375, "learning_rate": 4.2204799999999994e-07, "loss": 0.4497, "step": 144550 }, { "epoch": 1.446, "grad_norm": 67.64913177490234, "learning_rate": 4.21848e-07, "loss": 0.4375, "step": 144600 }, { "epoch": 1.4465, "grad_norm": 30.597097396850586, "learning_rate": 4.21648e-07, "loss": 0.5279, "step": 144650 }, { "epoch": 1.447, "grad_norm": 100.2088851928711, "learning_rate": 4.2144799999999995e-07, "loss": 0.5009, "step": 144700 }, { "epoch": 1.4475, "grad_norm": 55.302974700927734, "learning_rate": 4.21248e-07, "loss": 0.455, "step": 144750 }, { "epoch": 1.448, "grad_norm": 67.07560729980469, "learning_rate": 4.2104799999999997e-07, "loss": 0.4658, "step": 144800 }, { "epoch": 1.4485000000000001, "grad_norm": 13.170785903930664, "learning_rate": 4.2084799999999996e-07, "loss": 0.3561, "step": 144850 }, { "epoch": 1.449, "grad_norm": 7.08843994140625, "learning_rate": 4.20648e-07, "loss": 0.3366, "step": 144900 }, { "epoch": 1.4495, "grad_norm": 3.2631988525390625, "learning_rate": 4.20448e-07, "loss": 0.435, "step": 144950 }, { "epoch": 1.45, "grad_norm": 14.511624336242676, "learning_rate": 4.2024799999999997e-07, "loss": 0.2825, "step": 145000 }, { "epoch": 1.4505, "grad_norm": 13.6480073928833, "learning_rate": 4.2004799999999995e-07, "loss": 0.4737, "step": 145050 }, { "epoch": 1.451, "grad_norm": 84.39595794677734, "learning_rate": 4.19848e-07, "loss": 0.478, "step": 145100 }, { "epoch": 1.4515, "grad_norm": 112.03206634521484, "learning_rate": 4.1964800000000003e-07, "loss": 0.3953, "step": 145150 }, { "epoch": 1.452, "grad_norm": 13.859879493713379, "learning_rate": 4.1944799999999996e-07, "loss": 0.4724, "step": 145200 }, { "epoch": 1.4525000000000001, "grad_norm": 51.57634735107422, "learning_rate": 4.19248e-07, "loss": 0.4693, "step": 145250 }, { "epoch": 1.453, "grad_norm": 1.5918205976486206, "learning_rate": 4.19048e-07, "loss": 0.3019, "step": 145300 }, { "epoch": 1.4535, "grad_norm": 86.82250213623047, "learning_rate": 4.1884799999999997e-07, "loss": 0.4294, "step": 145350 }, { "epoch": 1.454, "grad_norm": 29.19518280029297, "learning_rate": 4.18648e-07, "loss": 0.4965, "step": 145400 }, { "epoch": 1.4545, "grad_norm": 38.55145263671875, "learning_rate": 4.18448e-07, "loss": 0.3507, "step": 145450 }, { "epoch": 1.455, "grad_norm": 96.74462127685547, "learning_rate": 4.18248e-07, "loss": 0.5309, "step": 145500 }, { "epoch": 1.4555, "grad_norm": 71.57083129882812, "learning_rate": 4.1804799999999996e-07, "loss": 0.5173, "step": 145550 }, { "epoch": 1.456, "grad_norm": 142.42713928222656, "learning_rate": 4.17848e-07, "loss": 0.4896, "step": 145600 }, { "epoch": 1.4565000000000001, "grad_norm": 71.31710815429688, "learning_rate": 4.17648e-07, "loss": 0.4368, "step": 145650 }, { "epoch": 1.457, "grad_norm": 40.85289001464844, "learning_rate": 4.1744799999999997e-07, "loss": 0.4845, "step": 145700 }, { "epoch": 1.4575, "grad_norm": 92.76287841796875, "learning_rate": 4.17248e-07, "loss": 0.4203, "step": 145750 }, { "epoch": 1.458, "grad_norm": 34.11021423339844, "learning_rate": 4.1704799999999994e-07, "loss": 0.3603, "step": 145800 }, { "epoch": 1.4585, "grad_norm": 19.930692672729492, "learning_rate": 4.16848e-07, "loss": 0.4356, "step": 145850 }, { "epoch": 1.459, "grad_norm": 9.085647583007812, "learning_rate": 4.16648e-07, "loss": 0.3679, "step": 145900 }, { "epoch": 1.4595, "grad_norm": 127.162841796875, "learning_rate": 4.1644799999999994e-07, "loss": 0.4983, "step": 145950 }, { "epoch": 1.46, "grad_norm": 26.87095069885254, "learning_rate": 4.16248e-07, "loss": 0.452, "step": 146000 }, { "epoch": 1.4605000000000001, "grad_norm": 6.62654972076416, "learning_rate": 4.1604799999999997e-07, "loss": 0.4967, "step": 146050 }, { "epoch": 1.461, "grad_norm": 0.27412235736846924, "learning_rate": 4.1584799999999995e-07, "loss": 0.35, "step": 146100 }, { "epoch": 1.4615, "grad_norm": 0.39492130279541016, "learning_rate": 4.15648e-07, "loss": 0.5392, "step": 146150 }, { "epoch": 1.462, "grad_norm": 0.3583661615848541, "learning_rate": 4.15448e-07, "loss": 0.4021, "step": 146200 }, { "epoch": 1.4625, "grad_norm": 66.29902648925781, "learning_rate": 4.15248e-07, "loss": 0.4798, "step": 146250 }, { "epoch": 1.463, "grad_norm": 14.90131664276123, "learning_rate": 4.1504799999999995e-07, "loss": 0.4671, "step": 146300 }, { "epoch": 1.4635, "grad_norm": 8.83768367767334, "learning_rate": 4.14848e-07, "loss": 0.5414, "step": 146350 }, { "epoch": 1.464, "grad_norm": 39.12753677368164, "learning_rate": 4.14648e-07, "loss": 0.5358, "step": 146400 }, { "epoch": 1.4645000000000001, "grad_norm": 50.82841110229492, "learning_rate": 4.1444799999999995e-07, "loss": 0.4129, "step": 146450 }, { "epoch": 1.465, "grad_norm": 39.67839431762695, "learning_rate": 4.14248e-07, "loss": 0.4745, "step": 146500 }, { "epoch": 1.4655, "grad_norm": 89.63794708251953, "learning_rate": 4.1404800000000003e-07, "loss": 0.5237, "step": 146550 }, { "epoch": 1.466, "grad_norm": 65.97050476074219, "learning_rate": 4.1384799999999996e-07, "loss": 0.4113, "step": 146600 }, { "epoch": 1.4665, "grad_norm": 30.679176330566406, "learning_rate": 4.13648e-07, "loss": 0.5226, "step": 146650 }, { "epoch": 1.467, "grad_norm": 75.3191909790039, "learning_rate": 4.13448e-07, "loss": 0.4449, "step": 146700 }, { "epoch": 1.4675, "grad_norm": 63.889930725097656, "learning_rate": 4.1324799999999997e-07, "loss": 0.4736, "step": 146750 }, { "epoch": 1.468, "grad_norm": 61.48563003540039, "learning_rate": 4.13048e-07, "loss": 0.3692, "step": 146800 }, { "epoch": 1.4685000000000001, "grad_norm": 34.46501541137695, "learning_rate": 4.12848e-07, "loss": 0.3301, "step": 146850 }, { "epoch": 1.4689999999999999, "grad_norm": 11.682842254638672, "learning_rate": 4.12648e-07, "loss": 0.4875, "step": 146900 }, { "epoch": 1.4695, "grad_norm": 6.080808639526367, "learning_rate": 4.1244799999999996e-07, "loss": 0.5323, "step": 146950 }, { "epoch": 1.47, "grad_norm": 29.79290008544922, "learning_rate": 4.12248e-07, "loss": 0.5893, "step": 147000 }, { "epoch": 1.4705, "grad_norm": 41.09132766723633, "learning_rate": 4.12048e-07, "loss": 0.4248, "step": 147050 }, { "epoch": 1.471, "grad_norm": 62.73756408691406, "learning_rate": 4.1184799999999997e-07, "loss": 0.4393, "step": 147100 }, { "epoch": 1.4715, "grad_norm": 41.83452224731445, "learning_rate": 4.11648e-07, "loss": 0.396, "step": 147150 }, { "epoch": 1.472, "grad_norm": 59.541011810302734, "learning_rate": 4.1144799999999994e-07, "loss": 0.4265, "step": 147200 }, { "epoch": 1.4725, "grad_norm": 23.928707122802734, "learning_rate": 4.11248e-07, "loss": 0.3726, "step": 147250 }, { "epoch": 1.4729999999999999, "grad_norm": 70.83897399902344, "learning_rate": 4.11048e-07, "loss": 0.3918, "step": 147300 }, { "epoch": 1.4735, "grad_norm": 21.586318969726562, "learning_rate": 4.1084799999999995e-07, "loss": 0.3456, "step": 147350 }, { "epoch": 1.474, "grad_norm": 11.594403266906738, "learning_rate": 4.10648e-07, "loss": 0.3408, "step": 147400 }, { "epoch": 1.4745, "grad_norm": 47.519081115722656, "learning_rate": 4.1044799999999997e-07, "loss": 0.3752, "step": 147450 }, { "epoch": 1.475, "grad_norm": 128.55673217773438, "learning_rate": 4.10248e-07, "loss": 0.4493, "step": 147500 }, { "epoch": 1.4755, "grad_norm": 67.64390563964844, "learning_rate": 4.10048e-07, "loss": 0.5414, "step": 147550 }, { "epoch": 1.476, "grad_norm": 35.612281799316406, "learning_rate": 4.09848e-07, "loss": 0.4347, "step": 147600 }, { "epoch": 1.4765, "grad_norm": 71.22669219970703, "learning_rate": 4.09648e-07, "loss": 0.4211, "step": 147650 }, { "epoch": 1.4769999999999999, "grad_norm": 79.02700805664062, "learning_rate": 4.0944799999999995e-07, "loss": 0.3902, "step": 147700 }, { "epoch": 1.4775, "grad_norm": 118.43605041503906, "learning_rate": 4.09248e-07, "loss": 0.4709, "step": 147750 }, { "epoch": 1.478, "grad_norm": 42.166351318359375, "learning_rate": 4.09048e-07, "loss": 0.2732, "step": 147800 }, { "epoch": 1.4785, "grad_norm": 12.510908126831055, "learning_rate": 4.0884799999999996e-07, "loss": 0.3895, "step": 147850 }, { "epoch": 1.479, "grad_norm": 91.17105102539062, "learning_rate": 4.08648e-07, "loss": 0.4198, "step": 147900 }, { "epoch": 1.4795, "grad_norm": 69.4258041381836, "learning_rate": 4.08448e-07, "loss": 0.4962, "step": 147950 }, { "epoch": 1.48, "grad_norm": 45.617069244384766, "learning_rate": 4.0824799999999996e-07, "loss": 0.3913, "step": 148000 }, { "epoch": 1.4805, "grad_norm": 88.43484497070312, "learning_rate": 4.08048e-07, "loss": 0.3713, "step": 148050 }, { "epoch": 1.4809999999999999, "grad_norm": 28.707489013671875, "learning_rate": 4.07848e-07, "loss": 0.377, "step": 148100 }, { "epoch": 1.4815, "grad_norm": 29.1148681640625, "learning_rate": 4.0764799999999997e-07, "loss": 0.3606, "step": 148150 }, { "epoch": 1.482, "grad_norm": 11.052894592285156, "learning_rate": 4.0744799999999996e-07, "loss": 0.4278, "step": 148200 }, { "epoch": 1.4825, "grad_norm": 94.74927520751953, "learning_rate": 4.07248e-07, "loss": 0.3981, "step": 148250 }, { "epoch": 1.483, "grad_norm": 28.74058723449707, "learning_rate": 4.07048e-07, "loss": 0.5204, "step": 148300 }, { "epoch": 1.4835, "grad_norm": 44.29704284667969, "learning_rate": 4.0684799999999996e-07, "loss": 0.473, "step": 148350 }, { "epoch": 1.484, "grad_norm": 3.45829439163208, "learning_rate": 4.06648e-07, "loss": 0.423, "step": 148400 }, { "epoch": 1.4845, "grad_norm": 112.15509796142578, "learning_rate": 4.0644799999999993e-07, "loss": 0.4297, "step": 148450 }, { "epoch": 1.4849999999999999, "grad_norm": 37.767704010009766, "learning_rate": 4.0624799999999997e-07, "loss": 0.407, "step": 148500 }, { "epoch": 1.4855, "grad_norm": 114.10420227050781, "learning_rate": 4.06048e-07, "loss": 0.49, "step": 148550 }, { "epoch": 1.486, "grad_norm": 81.723388671875, "learning_rate": 4.0584799999999994e-07, "loss": 0.3653, "step": 148600 }, { "epoch": 1.4865, "grad_norm": 2.658217191696167, "learning_rate": 4.05652e-07, "loss": 0.4765, "step": 148650 }, { "epoch": 1.487, "grad_norm": 0.6626757979393005, "learning_rate": 4.0545199999999997e-07, "loss": 0.4804, "step": 148700 }, { "epoch": 1.4875, "grad_norm": 71.02579498291016, "learning_rate": 4.05252e-07, "loss": 0.4608, "step": 148750 }, { "epoch": 1.488, "grad_norm": 7.002717018127441, "learning_rate": 4.05052e-07, "loss": 0.4567, "step": 148800 }, { "epoch": 1.4885, "grad_norm": 159.69973754882812, "learning_rate": 4.04852e-07, "loss": 0.4012, "step": 148850 }, { "epoch": 1.4889999999999999, "grad_norm": 33.81310272216797, "learning_rate": 4.04652e-07, "loss": 0.4608, "step": 148900 }, { "epoch": 1.4895, "grad_norm": 79.68433380126953, "learning_rate": 4.0445199999999995e-07, "loss": 0.4923, "step": 148950 }, { "epoch": 1.49, "grad_norm": 3.4748802185058594, "learning_rate": 4.04252e-07, "loss": 0.39, "step": 149000 }, { "epoch": 1.4905, "grad_norm": 53.5771598815918, "learning_rate": 4.04052e-07, "loss": 0.5652, "step": 149050 }, { "epoch": 1.491, "grad_norm": 46.88816452026367, "learning_rate": 4.0385199999999996e-07, "loss": 0.4258, "step": 149100 }, { "epoch": 1.4915, "grad_norm": 32.95225524902344, "learning_rate": 4.03652e-07, "loss": 0.4477, "step": 149150 }, { "epoch": 1.492, "grad_norm": 68.19593048095703, "learning_rate": 4.03452e-07, "loss": 0.4861, "step": 149200 }, { "epoch": 1.4925, "grad_norm": 78.71890258789062, "learning_rate": 4.0325199999999996e-07, "loss": 0.5034, "step": 149250 }, { "epoch": 1.4929999999999999, "grad_norm": 4.197335720062256, "learning_rate": 4.03052e-07, "loss": 0.5138, "step": 149300 }, { "epoch": 1.4935, "grad_norm": 6.181484222412109, "learning_rate": 4.02852e-07, "loss": 0.4473, "step": 149350 }, { "epoch": 1.494, "grad_norm": 5.397106170654297, "learning_rate": 4.0265199999999997e-07, "loss": 0.4057, "step": 149400 }, { "epoch": 1.4945, "grad_norm": 6.172823905944824, "learning_rate": 4.0245199999999996e-07, "loss": 0.4547, "step": 149450 }, { "epoch": 1.495, "grad_norm": 30.402366638183594, "learning_rate": 4.02252e-07, "loss": 0.4936, "step": 149500 }, { "epoch": 1.4955, "grad_norm": 90.25249481201172, "learning_rate": 4.02052e-07, "loss": 0.4147, "step": 149550 }, { "epoch": 1.496, "grad_norm": 5.896697521209717, "learning_rate": 4.0185199999999996e-07, "loss": 0.4432, "step": 149600 }, { "epoch": 1.4965, "grad_norm": 66.12384796142578, "learning_rate": 4.01652e-07, "loss": 0.4179, "step": 149650 }, { "epoch": 1.4969999999999999, "grad_norm": 26.684412002563477, "learning_rate": 4.0145199999999993e-07, "loss": 0.5063, "step": 149700 }, { "epoch": 1.4975, "grad_norm": 0.9913628697395325, "learning_rate": 4.0125199999999997e-07, "loss": 0.3879, "step": 149750 }, { "epoch": 1.498, "grad_norm": 98.07878875732422, "learning_rate": 4.01052e-07, "loss": 0.4664, "step": 149800 }, { "epoch": 1.4985, "grad_norm": 78.66146850585938, "learning_rate": 4.0085199999999994e-07, "loss": 0.5213, "step": 149850 }, { "epoch": 1.499, "grad_norm": 18.173500061035156, "learning_rate": 4.00652e-07, "loss": 0.3256, "step": 149900 }, { "epoch": 1.4995, "grad_norm": 0.6916254758834839, "learning_rate": 4.0045199999999997e-07, "loss": 0.4523, "step": 149950 }, { "epoch": 1.5, "grad_norm": 6.336428642272949, "learning_rate": 4.00252e-07, "loss": 0.3649, "step": 150000 }, { "epoch": 1.5005, "grad_norm": 72.81371307373047, "learning_rate": 4.00052e-07, "loss": 0.4134, "step": 150050 }, { "epoch": 1.501, "grad_norm": 53.876487731933594, "learning_rate": 3.9985199999999997e-07, "loss": 0.4047, "step": 150100 }, { "epoch": 1.5015, "grad_norm": 146.40159606933594, "learning_rate": 3.99652e-07, "loss": 0.4675, "step": 150150 }, { "epoch": 1.502, "grad_norm": 65.77076721191406, "learning_rate": 3.9945199999999994e-07, "loss": 0.5416, "step": 150200 }, { "epoch": 1.5025, "grad_norm": 41.62353515625, "learning_rate": 3.99252e-07, "loss": 0.5011, "step": 150250 }, { "epoch": 1.5030000000000001, "grad_norm": 75.19416809082031, "learning_rate": 3.99052e-07, "loss": 0.3859, "step": 150300 }, { "epoch": 1.5034999999999998, "grad_norm": 26.303030014038086, "learning_rate": 3.9885199999999995e-07, "loss": 0.4655, "step": 150350 }, { "epoch": 1.504, "grad_norm": 97.27670288085938, "learning_rate": 3.98652e-07, "loss": 0.4718, "step": 150400 }, { "epoch": 1.5045, "grad_norm": 120.45020294189453, "learning_rate": 3.98456e-07, "loss": 0.402, "step": 150450 }, { "epoch": 1.505, "grad_norm": 57.528079986572266, "learning_rate": 3.9825599999999996e-07, "loss": 0.4117, "step": 150500 }, { "epoch": 1.5055, "grad_norm": 110.59357452392578, "learning_rate": 3.98056e-07, "loss": 0.4865, "step": 150550 }, { "epoch": 1.506, "grad_norm": 20.42061996459961, "learning_rate": 3.97856e-07, "loss": 0.5312, "step": 150600 }, { "epoch": 1.5065, "grad_norm": 19.152402877807617, "learning_rate": 3.9765599999999997e-07, "loss": 0.5577, "step": 150650 }, { "epoch": 1.5070000000000001, "grad_norm": 144.61178588867188, "learning_rate": 3.9745599999999996e-07, "loss": 0.4414, "step": 150700 }, { "epoch": 1.5074999999999998, "grad_norm": 29.284423828125, "learning_rate": 3.97256e-07, "loss": 0.4616, "step": 150750 }, { "epoch": 1.508, "grad_norm": 5.666861057281494, "learning_rate": 3.97056e-07, "loss": 0.5011, "step": 150800 }, { "epoch": 1.5085, "grad_norm": 68.09160614013672, "learning_rate": 3.9685599999999996e-07, "loss": 0.5303, "step": 150850 }, { "epoch": 1.509, "grad_norm": 17.154300689697266, "learning_rate": 3.96656e-07, "loss": 0.5095, "step": 150900 }, { "epoch": 1.5095, "grad_norm": 76.5134048461914, "learning_rate": 3.9645599999999993e-07, "loss": 0.446, "step": 150950 }, { "epoch": 1.51, "grad_norm": 40.559085845947266, "learning_rate": 3.9625599999999997e-07, "loss": 0.5092, "step": 151000 }, { "epoch": 1.5105, "grad_norm": 60.698848724365234, "learning_rate": 3.96056e-07, "loss": 0.5773, "step": 151050 }, { "epoch": 1.5110000000000001, "grad_norm": 122.44525909423828, "learning_rate": 3.9585599999999994e-07, "loss": 0.4852, "step": 151100 }, { "epoch": 1.5114999999999998, "grad_norm": 31.7929744720459, "learning_rate": 3.95656e-07, "loss": 0.3784, "step": 151150 }, { "epoch": 1.512, "grad_norm": 73.54200744628906, "learning_rate": 3.9545599999999996e-07, "loss": 0.3659, "step": 151200 }, { "epoch": 1.5125, "grad_norm": 45.57496643066406, "learning_rate": 3.95256e-07, "loss": 0.4095, "step": 151250 }, { "epoch": 1.513, "grad_norm": 35.19536590576172, "learning_rate": 3.95056e-07, "loss": 0.3262, "step": 151300 }, { "epoch": 1.5135, "grad_norm": 29.406625747680664, "learning_rate": 3.9485599999999997e-07, "loss": 0.2914, "step": 151350 }, { "epoch": 1.514, "grad_norm": 11.98690128326416, "learning_rate": 3.94656e-07, "loss": 0.5087, "step": 151400 }, { "epoch": 1.5145, "grad_norm": 404.19842529296875, "learning_rate": 3.9445599999999994e-07, "loss": 0.5389, "step": 151450 }, { "epoch": 1.5150000000000001, "grad_norm": 71.88636016845703, "learning_rate": 3.94256e-07, "loss": 0.5309, "step": 151500 }, { "epoch": 1.5154999999999998, "grad_norm": 130.74053955078125, "learning_rate": 3.94056e-07, "loss": 0.4026, "step": 151550 }, { "epoch": 1.516, "grad_norm": 43.46409606933594, "learning_rate": 3.9385599999999995e-07, "loss": 0.5388, "step": 151600 }, { "epoch": 1.5165, "grad_norm": 7.446013927459717, "learning_rate": 3.93656e-07, "loss": 0.5019, "step": 151650 }, { "epoch": 1.517, "grad_norm": 99.18240356445312, "learning_rate": 3.9345599999999997e-07, "loss": 0.5287, "step": 151700 }, { "epoch": 1.5175, "grad_norm": 16.316425323486328, "learning_rate": 3.9325599999999996e-07, "loss": 0.409, "step": 151750 }, { "epoch": 1.518, "grad_norm": 3.0078415870666504, "learning_rate": 3.93056e-07, "loss": 0.4389, "step": 151800 }, { "epoch": 1.5185, "grad_norm": 57.24897384643555, "learning_rate": 3.92856e-07, "loss": 0.4834, "step": 151850 }, { "epoch": 1.5190000000000001, "grad_norm": 9.619451522827148, "learning_rate": 3.9265599999999997e-07, "loss": 0.4941, "step": 151900 }, { "epoch": 1.5194999999999999, "grad_norm": 66.24259185791016, "learning_rate": 3.92456e-07, "loss": 0.4241, "step": 151950 }, { "epoch": 1.52, "grad_norm": 64.54350280761719, "learning_rate": 3.92256e-07, "loss": 0.4539, "step": 152000 }, { "epoch": 1.5205, "grad_norm": 3.8642959594726562, "learning_rate": 3.92056e-07, "loss": 0.4306, "step": 152050 }, { "epoch": 1.521, "grad_norm": 88.06119537353516, "learning_rate": 3.9185599999999996e-07, "loss": 0.4, "step": 152100 }, { "epoch": 1.5215, "grad_norm": 0.5030847191810608, "learning_rate": 3.91656e-07, "loss": 0.4478, "step": 152150 }, { "epoch": 1.522, "grad_norm": 74.51904296875, "learning_rate": 3.9145600000000003e-07, "loss": 0.4082, "step": 152200 }, { "epoch": 1.5225, "grad_norm": 7.973733425140381, "learning_rate": 3.9125599999999997e-07, "loss": 0.4605, "step": 152250 }, { "epoch": 1.5230000000000001, "grad_norm": 52.102630615234375, "learning_rate": 3.91056e-07, "loss": 0.4287, "step": 152300 }, { "epoch": 1.5234999999999999, "grad_norm": 10.355042457580566, "learning_rate": 3.90856e-07, "loss": 0.532, "step": 152350 }, { "epoch": 1.524, "grad_norm": 13.536386489868164, "learning_rate": 3.90656e-07, "loss": 0.3487, "step": 152400 }, { "epoch": 1.5245, "grad_norm": 17.59664535522461, "learning_rate": 3.90456e-07, "loss": 0.5859, "step": 152450 }, { "epoch": 1.525, "grad_norm": 37.894744873046875, "learning_rate": 3.90256e-07, "loss": 0.3717, "step": 152500 }, { "epoch": 1.5255, "grad_norm": 87.29824829101562, "learning_rate": 3.90056e-07, "loss": 0.3377, "step": 152550 }, { "epoch": 1.526, "grad_norm": 150.49517822265625, "learning_rate": 3.8985599999999997e-07, "loss": 0.4282, "step": 152600 }, { "epoch": 1.5265, "grad_norm": 22.18712615966797, "learning_rate": 3.89656e-07, "loss": 0.3628, "step": 152650 }, { "epoch": 1.5270000000000001, "grad_norm": 7.895515441894531, "learning_rate": 3.8945999999999994e-07, "loss": 0.4914, "step": 152700 }, { "epoch": 1.5274999999999999, "grad_norm": 98.30242919921875, "learning_rate": 3.8926e-07, "loss": 0.4066, "step": 152750 }, { "epoch": 1.528, "grad_norm": 71.57797241210938, "learning_rate": 3.8906e-07, "loss": 0.4606, "step": 152800 }, { "epoch": 1.5285, "grad_norm": 52.828861236572266, "learning_rate": 3.8885999999999995e-07, "loss": 0.4835, "step": 152850 }, { "epoch": 1.529, "grad_norm": 1.3210022449493408, "learning_rate": 3.8866e-07, "loss": 0.6443, "step": 152900 }, { "epoch": 1.5295, "grad_norm": 0.20396688580513, "learning_rate": 3.8846e-07, "loss": 0.3461, "step": 152950 }, { "epoch": 1.53, "grad_norm": 5.065512180328369, "learning_rate": 3.8825999999999996e-07, "loss": 0.5639, "step": 153000 }, { "epoch": 1.5305, "grad_norm": 0.27754420042037964, "learning_rate": 3.8806e-07, "loss": 0.357, "step": 153050 }, { "epoch": 1.5310000000000001, "grad_norm": 105.62935638427734, "learning_rate": 3.8786e-07, "loss": 0.51, "step": 153100 }, { "epoch": 1.5314999999999999, "grad_norm": 78.91966247558594, "learning_rate": 3.8765999999999997e-07, "loss": 0.3173, "step": 153150 }, { "epoch": 1.532, "grad_norm": 22.052091598510742, "learning_rate": 3.8746e-07, "loss": 0.4488, "step": 153200 }, { "epoch": 1.5325, "grad_norm": 105.67029571533203, "learning_rate": 3.8726e-07, "loss": 0.3544, "step": 153250 }, { "epoch": 1.533, "grad_norm": 68.83394622802734, "learning_rate": 3.8705999999999997e-07, "loss": 0.4906, "step": 153300 }, { "epoch": 1.5335, "grad_norm": 35.794700622558594, "learning_rate": 3.8685999999999996e-07, "loss": 0.3602, "step": 153350 }, { "epoch": 1.534, "grad_norm": 8.180852890014648, "learning_rate": 3.8666e-07, "loss": 0.4035, "step": 153400 }, { "epoch": 1.5345, "grad_norm": 18.921524047851562, "learning_rate": 3.8646000000000003e-07, "loss": 0.4177, "step": 153450 }, { "epoch": 1.5350000000000001, "grad_norm": 28.274133682250977, "learning_rate": 3.8625999999999997e-07, "loss": 0.3924, "step": 153500 }, { "epoch": 1.5354999999999999, "grad_norm": 60.59059524536133, "learning_rate": 3.8606e-07, "loss": 0.641, "step": 153550 }, { "epoch": 1.536, "grad_norm": 13.755937576293945, "learning_rate": 3.8586e-07, "loss": 0.3756, "step": 153600 }, { "epoch": 1.5365, "grad_norm": 28.67469596862793, "learning_rate": 3.8566e-07, "loss": 0.5329, "step": 153650 }, { "epoch": 1.537, "grad_norm": 67.26752471923828, "learning_rate": 3.8546e-07, "loss": 0.4442, "step": 153700 }, { "epoch": 1.5375, "grad_norm": 76.13204956054688, "learning_rate": 3.8526e-07, "loss": 0.3637, "step": 153750 }, { "epoch": 1.538, "grad_norm": 108.6531753540039, "learning_rate": 3.8506e-07, "loss": 0.4456, "step": 153800 }, { "epoch": 1.5385, "grad_norm": 82.92815399169922, "learning_rate": 3.8485999999999997e-07, "loss": 0.3671, "step": 153850 }, { "epoch": 1.5390000000000001, "grad_norm": 3.92008376121521, "learning_rate": 3.8466e-07, "loss": 0.354, "step": 153900 }, { "epoch": 1.5394999999999999, "grad_norm": 46.99858093261719, "learning_rate": 3.8446e-07, "loss": 0.5856, "step": 153950 }, { "epoch": 1.54, "grad_norm": 16.050018310546875, "learning_rate": 3.8426e-07, "loss": 0.5328, "step": 154000 }, { "epoch": 1.5405, "grad_norm": 59.63269805908203, "learning_rate": 3.8406e-07, "loss": 0.5117, "step": 154050 }, { "epoch": 1.541, "grad_norm": 12.401506423950195, "learning_rate": 3.8385999999999994e-07, "loss": 0.4722, "step": 154100 }, { "epoch": 1.5415, "grad_norm": 104.35944366455078, "learning_rate": 3.8366e-07, "loss": 0.5007, "step": 154150 }, { "epoch": 1.542, "grad_norm": 32.52841567993164, "learning_rate": 3.8346e-07, "loss": 0.4704, "step": 154200 }, { "epoch": 1.5425, "grad_norm": 101.7193374633789, "learning_rate": 3.8325999999999995e-07, "loss": 0.4382, "step": 154250 }, { "epoch": 1.5430000000000001, "grad_norm": 89.42278289794922, "learning_rate": 3.8306e-07, "loss": 0.5037, "step": 154300 }, { "epoch": 1.5434999999999999, "grad_norm": 35.02275466918945, "learning_rate": 3.8286e-07, "loss": 0.4409, "step": 154350 }, { "epoch": 1.544, "grad_norm": 78.44393157958984, "learning_rate": 3.8265999999999996e-07, "loss": 0.4195, "step": 154400 }, { "epoch": 1.5445, "grad_norm": 14.024569511413574, "learning_rate": 3.8246e-07, "loss": 0.3931, "step": 154450 }, { "epoch": 1.545, "grad_norm": 29.970157623291016, "learning_rate": 3.8226e-07, "loss": 0.4982, "step": 154500 }, { "epoch": 1.5455, "grad_norm": 92.17186737060547, "learning_rate": 3.8205999999999997e-07, "loss": 0.489, "step": 154550 }, { "epoch": 1.546, "grad_norm": 103.02266693115234, "learning_rate": 3.8185999999999995e-07, "loss": 0.4268, "step": 154600 }, { "epoch": 1.5465, "grad_norm": 2.3112690448760986, "learning_rate": 3.8166e-07, "loss": 0.4287, "step": 154650 }, { "epoch": 1.5470000000000002, "grad_norm": 71.17611694335938, "learning_rate": 3.8146000000000003e-07, "loss": 0.356, "step": 154700 }, { "epoch": 1.5474999999999999, "grad_norm": 21.637741088867188, "learning_rate": 3.8125999999999996e-07, "loss": 0.4844, "step": 154750 }, { "epoch": 1.548, "grad_norm": 26.230649948120117, "learning_rate": 3.8106e-07, "loss": 0.5267, "step": 154800 }, { "epoch": 1.5485, "grad_norm": 91.8410415649414, "learning_rate": 3.8086e-07, "loss": 0.4135, "step": 154850 }, { "epoch": 1.549, "grad_norm": 12.015192985534668, "learning_rate": 3.8065999999999997e-07, "loss": 0.4938, "step": 154900 }, { "epoch": 1.5495, "grad_norm": 5.9891533851623535, "learning_rate": 3.8046e-07, "loss": 0.3827, "step": 154950 }, { "epoch": 1.55, "grad_norm": 0.673921525478363, "learning_rate": 3.8026e-07, "loss": 0.4642, "step": 155000 }, { "epoch": 1.5505, "grad_norm": 48.262916564941406, "learning_rate": 3.8006e-07, "loss": 0.2987, "step": 155050 }, { "epoch": 1.5510000000000002, "grad_norm": 15.329411506652832, "learning_rate": 3.7985999999999996e-07, "loss": 0.4899, "step": 155100 }, { "epoch": 1.5514999999999999, "grad_norm": 54.15485763549805, "learning_rate": 3.7966e-07, "loss": 0.3895, "step": 155150 }, { "epoch": 1.552, "grad_norm": 76.88223266601562, "learning_rate": 3.7946e-07, "loss": 0.3881, "step": 155200 }, { "epoch": 1.5525, "grad_norm": 19.991403579711914, "learning_rate": 3.7925999999999997e-07, "loss": 0.398, "step": 155250 }, { "epoch": 1.553, "grad_norm": 94.80240631103516, "learning_rate": 3.7906e-07, "loss": 0.4081, "step": 155300 }, { "epoch": 1.5535, "grad_norm": 122.23736572265625, "learning_rate": 3.7885999999999994e-07, "loss": 0.3894, "step": 155350 }, { "epoch": 1.554, "grad_norm": 57.91024398803711, "learning_rate": 3.7866e-07, "loss": 0.4474, "step": 155400 }, { "epoch": 1.5545, "grad_norm": 8.4732666015625, "learning_rate": 3.7846e-07, "loss": 0.4155, "step": 155450 }, { "epoch": 1.5550000000000002, "grad_norm": 12.340983390808105, "learning_rate": 3.7825999999999995e-07, "loss": 0.4288, "step": 155500 }, { "epoch": 1.5554999999999999, "grad_norm": 13.673118591308594, "learning_rate": 3.7806e-07, "loss": 0.5154, "step": 155550 }, { "epoch": 1.556, "grad_norm": 112.43163299560547, "learning_rate": 3.7785999999999997e-07, "loss": 0.3473, "step": 155600 }, { "epoch": 1.5565, "grad_norm": 10.982392311096191, "learning_rate": 3.7765999999999996e-07, "loss": 0.3726, "step": 155650 }, { "epoch": 1.557, "grad_norm": 105.76234436035156, "learning_rate": 3.7746e-07, "loss": 0.3479, "step": 155700 }, { "epoch": 1.5575, "grad_norm": 76.24043273925781, "learning_rate": 3.7726e-07, "loss": 0.4958, "step": 155750 }, { "epoch": 1.558, "grad_norm": 6.342814922332764, "learning_rate": 3.7705999999999996e-07, "loss": 0.3893, "step": 155800 }, { "epoch": 1.5585, "grad_norm": 74.10182189941406, "learning_rate": 3.7685999999999995e-07, "loss": 0.4655, "step": 155850 }, { "epoch": 1.5590000000000002, "grad_norm": 0.016593320295214653, "learning_rate": 3.7666e-07, "loss": 0.4724, "step": 155900 }, { "epoch": 1.5594999999999999, "grad_norm": 3.463813304901123, "learning_rate": 3.7646e-07, "loss": 0.3882, "step": 155950 }, { "epoch": 1.56, "grad_norm": 28.10302734375, "learning_rate": 3.7625999999999996e-07, "loss": 0.4819, "step": 156000 }, { "epoch": 1.5605, "grad_norm": 90.9014892578125, "learning_rate": 3.7606e-07, "loss": 0.5103, "step": 156050 }, { "epoch": 1.561, "grad_norm": 5.384343147277832, "learning_rate": 3.7586000000000003e-07, "loss": 0.3814, "step": 156100 }, { "epoch": 1.5615, "grad_norm": 63.378196716308594, "learning_rate": 3.7565999999999996e-07, "loss": 0.3121, "step": 156150 }, { "epoch": 1.562, "grad_norm": 67.78466796875, "learning_rate": 3.7546e-07, "loss": 0.3317, "step": 156200 }, { "epoch": 1.5625, "grad_norm": 38.204132080078125, "learning_rate": 3.7526e-07, "loss": 0.6073, "step": 156250 }, { "epoch": 1.563, "grad_norm": 29.50925636291504, "learning_rate": 3.7505999999999997e-07, "loss": 0.3818, "step": 156300 }, { "epoch": 1.5635, "grad_norm": 55.05265426635742, "learning_rate": 3.7486e-07, "loss": 0.4596, "step": 156350 }, { "epoch": 1.564, "grad_norm": 124.0997085571289, "learning_rate": 3.7466e-07, "loss": 0.4736, "step": 156400 }, { "epoch": 1.5645, "grad_norm": 38.588619232177734, "learning_rate": 3.7446e-07, "loss": 0.4511, "step": 156450 }, { "epoch": 1.565, "grad_norm": 80.09603118896484, "learning_rate": 3.7425999999999996e-07, "loss": 0.4403, "step": 156500 }, { "epoch": 1.5655000000000001, "grad_norm": 1.9069684743881226, "learning_rate": 3.7406e-07, "loss": 0.5459, "step": 156550 }, { "epoch": 1.5659999999999998, "grad_norm": 1.7578861713409424, "learning_rate": 3.7386e-07, "loss": 0.5356, "step": 156600 }, { "epoch": 1.5665, "grad_norm": 13.54135513305664, "learning_rate": 3.7365999999999997e-07, "loss": 0.4722, "step": 156650 }, { "epoch": 1.567, "grad_norm": 42.47798156738281, "learning_rate": 3.7346e-07, "loss": 0.4462, "step": 156700 }, { "epoch": 1.5675, "grad_norm": 32.1161003112793, "learning_rate": 3.7326399999999995e-07, "loss": 0.4484, "step": 156750 }, { "epoch": 1.568, "grad_norm": 44.59100341796875, "learning_rate": 3.73064e-07, "loss": 0.3855, "step": 156800 }, { "epoch": 1.5685, "grad_norm": 15.064719200134277, "learning_rate": 3.7286399999999997e-07, "loss": 0.4979, "step": 156850 }, { "epoch": 1.569, "grad_norm": 82.406982421875, "learning_rate": 3.7266399999999995e-07, "loss": 0.3934, "step": 156900 }, { "epoch": 1.5695000000000001, "grad_norm": 1.5106115341186523, "learning_rate": 3.72464e-07, "loss": 0.4888, "step": 156950 }, { "epoch": 1.5699999999999998, "grad_norm": 34.84396743774414, "learning_rate": 3.72264e-07, "loss": 0.3313, "step": 157000 }, { "epoch": 1.5705, "grad_norm": 56.88155746459961, "learning_rate": 3.7206399999999996e-07, "loss": 0.6379, "step": 157050 }, { "epoch": 1.571, "grad_norm": 47.09871292114258, "learning_rate": 3.7186399999999995e-07, "loss": 0.4519, "step": 157100 }, { "epoch": 1.5715, "grad_norm": 44.833335876464844, "learning_rate": 3.71664e-07, "loss": 0.3936, "step": 157150 }, { "epoch": 1.572, "grad_norm": 85.64391326904297, "learning_rate": 3.71464e-07, "loss": 0.3674, "step": 157200 }, { "epoch": 1.5725, "grad_norm": 69.18328857421875, "learning_rate": 3.7126399999999996e-07, "loss": 0.5126, "step": 157250 }, { "epoch": 1.573, "grad_norm": 98.84770202636719, "learning_rate": 3.71064e-07, "loss": 0.4173, "step": 157300 }, { "epoch": 1.5735000000000001, "grad_norm": 22.579782485961914, "learning_rate": 3.7086400000000003e-07, "loss": 0.4732, "step": 157350 }, { "epoch": 1.5739999999999998, "grad_norm": 27.351652145385742, "learning_rate": 3.7066399999999996e-07, "loss": 0.5637, "step": 157400 }, { "epoch": 1.5745, "grad_norm": 66.01678466796875, "learning_rate": 3.70464e-07, "loss": 0.3713, "step": 157450 }, { "epoch": 1.575, "grad_norm": 4.506946563720703, "learning_rate": 3.70264e-07, "loss": 0.2717, "step": 157500 }, { "epoch": 1.5755, "grad_norm": 61.51058578491211, "learning_rate": 3.7006399999999997e-07, "loss": 0.4417, "step": 157550 }, { "epoch": 1.576, "grad_norm": 3.467181921005249, "learning_rate": 3.69864e-07, "loss": 0.4313, "step": 157600 }, { "epoch": 1.5765, "grad_norm": 14.573429107666016, "learning_rate": 3.69664e-07, "loss": 0.4639, "step": 157650 }, { "epoch": 1.577, "grad_norm": 54.943565368652344, "learning_rate": 3.69464e-07, "loss": 0.428, "step": 157700 }, { "epoch": 1.5775000000000001, "grad_norm": 8.155677795410156, "learning_rate": 3.6926799999999997e-07, "loss": 0.4534, "step": 157750 }, { "epoch": 1.5779999999999998, "grad_norm": 67.54132080078125, "learning_rate": 3.69068e-07, "loss": 0.4839, "step": 157800 }, { "epoch": 1.5785, "grad_norm": 10.213589668273926, "learning_rate": 3.6886799999999994e-07, "loss": 0.3556, "step": 157850 }, { "epoch": 1.579, "grad_norm": 109.822021484375, "learning_rate": 3.68668e-07, "loss": 0.4593, "step": 157900 }, { "epoch": 1.5795, "grad_norm": 82.01921081542969, "learning_rate": 3.68468e-07, "loss": 0.3996, "step": 157950 }, { "epoch": 1.58, "grad_norm": 60.44963836669922, "learning_rate": 3.6826799999999995e-07, "loss": 0.5142, "step": 158000 }, { "epoch": 1.5805, "grad_norm": 101.18560791015625, "learning_rate": 3.68068e-07, "loss": 0.39, "step": 158050 }, { "epoch": 1.581, "grad_norm": 73.67401123046875, "learning_rate": 3.6786799999999997e-07, "loss": 0.4294, "step": 158100 }, { "epoch": 1.5815000000000001, "grad_norm": 38.87802505493164, "learning_rate": 3.6766799999999995e-07, "loss": 0.4877, "step": 158150 }, { "epoch": 1.5819999999999999, "grad_norm": 36.239723205566406, "learning_rate": 3.67468e-07, "loss": 0.3294, "step": 158200 }, { "epoch": 1.5825, "grad_norm": 16.11030387878418, "learning_rate": 3.67268e-07, "loss": 0.5534, "step": 158250 }, { "epoch": 1.583, "grad_norm": 43.37452697753906, "learning_rate": 3.67068e-07, "loss": 0.4388, "step": 158300 }, { "epoch": 1.5835, "grad_norm": 15.790046691894531, "learning_rate": 3.66868e-07, "loss": 0.4595, "step": 158350 }, { "epoch": 1.584, "grad_norm": 49.98896026611328, "learning_rate": 3.66668e-07, "loss": 0.598, "step": 158400 }, { "epoch": 1.5845, "grad_norm": 151.6587371826172, "learning_rate": 3.66468e-07, "loss": 0.5676, "step": 158450 }, { "epoch": 1.585, "grad_norm": 150.25958251953125, "learning_rate": 3.6626799999999995e-07, "loss": 0.4359, "step": 158500 }, { "epoch": 1.5855000000000001, "grad_norm": 79.11650085449219, "learning_rate": 3.66068e-07, "loss": 0.4847, "step": 158550 }, { "epoch": 1.5859999999999999, "grad_norm": 16.014951705932617, "learning_rate": 3.6586800000000003e-07, "loss": 0.4409, "step": 158600 }, { "epoch": 1.5865, "grad_norm": 38.4268798828125, "learning_rate": 3.6566799999999996e-07, "loss": 0.4406, "step": 158650 }, { "epoch": 1.587, "grad_norm": 75.6319808959961, "learning_rate": 3.65468e-07, "loss": 0.4073, "step": 158700 }, { "epoch": 1.5875, "grad_norm": 83.09989166259766, "learning_rate": 3.65268e-07, "loss": 0.6108, "step": 158750 }, { "epoch": 1.588, "grad_norm": 85.7934341430664, "learning_rate": 3.6506799999999997e-07, "loss": 0.4062, "step": 158800 }, { "epoch": 1.5885, "grad_norm": 63.43593978881836, "learning_rate": 3.64868e-07, "loss": 0.4971, "step": 158850 }, { "epoch": 1.589, "grad_norm": 0.3882288336753845, "learning_rate": 3.64668e-07, "loss": 0.4136, "step": 158900 }, { "epoch": 1.5895000000000001, "grad_norm": 69.4052963256836, "learning_rate": 3.64468e-07, "loss": 0.5876, "step": 158950 }, { "epoch": 1.5899999999999999, "grad_norm": 0.12768994271755219, "learning_rate": 3.6426799999999996e-07, "loss": 0.5236, "step": 159000 }, { "epoch": 1.5905, "grad_norm": 123.2761459350586, "learning_rate": 3.64068e-07, "loss": 0.503, "step": 159050 }, { "epoch": 1.591, "grad_norm": 0.08437743782997131, "learning_rate": 3.63868e-07, "loss": 0.5431, "step": 159100 }, { "epoch": 1.5915, "grad_norm": 50.3804817199707, "learning_rate": 3.6366799999999997e-07, "loss": 0.3803, "step": 159150 }, { "epoch": 1.592, "grad_norm": 66.29913330078125, "learning_rate": 3.63468e-07, "loss": 0.441, "step": 159200 }, { "epoch": 1.5925, "grad_norm": 57.80009460449219, "learning_rate": 3.6326799999999994e-07, "loss": 0.5003, "step": 159250 }, { "epoch": 1.593, "grad_norm": 38.19333267211914, "learning_rate": 3.63068e-07, "loss": 0.3942, "step": 159300 }, { "epoch": 1.5935000000000001, "grad_norm": 108.0379638671875, "learning_rate": 3.62868e-07, "loss": 0.5143, "step": 159350 }, { "epoch": 1.5939999999999999, "grad_norm": 37.051204681396484, "learning_rate": 3.6266799999999995e-07, "loss": 0.4615, "step": 159400 }, { "epoch": 1.5945, "grad_norm": 32.642669677734375, "learning_rate": 3.62468e-07, "loss": 0.4396, "step": 159450 }, { "epoch": 1.595, "grad_norm": 64.99166107177734, "learning_rate": 3.6226799999999997e-07, "loss": 0.4363, "step": 159500 }, { "epoch": 1.5955, "grad_norm": 57.88900375366211, "learning_rate": 3.62068e-07, "loss": 0.4541, "step": 159550 }, { "epoch": 1.596, "grad_norm": 0.2389134019613266, "learning_rate": 3.61868e-07, "loss": 0.5371, "step": 159600 }, { "epoch": 1.5965, "grad_norm": 7.437679290771484, "learning_rate": 3.61668e-07, "loss": 0.4728, "step": 159650 }, { "epoch": 1.597, "grad_norm": 78.19436645507812, "learning_rate": 3.61468e-07, "loss": 0.4397, "step": 159700 }, { "epoch": 1.5975000000000001, "grad_norm": 110.06568908691406, "learning_rate": 3.6127199999999995e-07, "loss": 0.4691, "step": 159750 }, { "epoch": 1.5979999999999999, "grad_norm": 67.42326354980469, "learning_rate": 3.61072e-07, "loss": 0.3647, "step": 159800 }, { "epoch": 1.5985, "grad_norm": 82.26587677001953, "learning_rate": 3.6087200000000003e-07, "loss": 0.5464, "step": 159850 }, { "epoch": 1.599, "grad_norm": 92.55001068115234, "learning_rate": 3.6067199999999996e-07, "loss": 0.3611, "step": 159900 }, { "epoch": 1.5995, "grad_norm": 35.45196533203125, "learning_rate": 3.60472e-07, "loss": 0.3369, "step": 159950 }, { "epoch": 1.6, "grad_norm": 38.73994445800781, "learning_rate": 3.60272e-07, "loss": 0.3748, "step": 160000 }, { "epoch": 1.6005, "grad_norm": 2.0758419036865234, "learning_rate": 3.6007199999999997e-07, "loss": 0.4442, "step": 160050 }, { "epoch": 1.601, "grad_norm": 62.947288513183594, "learning_rate": 3.59872e-07, "loss": 0.4279, "step": 160100 }, { "epoch": 1.6015000000000001, "grad_norm": 0.5712276101112366, "learning_rate": 3.59672e-07, "loss": 0.4415, "step": 160150 }, { "epoch": 1.6019999999999999, "grad_norm": 31.01521873474121, "learning_rate": 3.59472e-07, "loss": 0.4596, "step": 160200 }, { "epoch": 1.6025, "grad_norm": 58.51720428466797, "learning_rate": 3.5927199999999996e-07, "loss": 0.4667, "step": 160250 }, { "epoch": 1.603, "grad_norm": 23.069557189941406, "learning_rate": 3.59072e-07, "loss": 0.3335, "step": 160300 }, { "epoch": 1.6035, "grad_norm": 93.3023910522461, "learning_rate": 3.58872e-07, "loss": 0.519, "step": 160350 }, { "epoch": 1.604, "grad_norm": 5.520042896270752, "learning_rate": 3.5867199999999997e-07, "loss": 0.42, "step": 160400 }, { "epoch": 1.6045, "grad_norm": 49.47138214111328, "learning_rate": 3.58472e-07, "loss": 0.5085, "step": 160450 }, { "epoch": 1.605, "grad_norm": 93.62559509277344, "learning_rate": 3.5827199999999994e-07, "loss": 0.3626, "step": 160500 }, { "epoch": 1.6055000000000001, "grad_norm": 28.985837936401367, "learning_rate": 3.58072e-07, "loss": 0.4725, "step": 160550 }, { "epoch": 1.6059999999999999, "grad_norm": 0.5782918334007263, "learning_rate": 3.57872e-07, "loss": 0.344, "step": 160600 }, { "epoch": 1.6065, "grad_norm": 62.849220275878906, "learning_rate": 3.5767199999999995e-07, "loss": 0.3763, "step": 160650 }, { "epoch": 1.607, "grad_norm": 62.66322708129883, "learning_rate": 3.57472e-07, "loss": 0.4729, "step": 160700 }, { "epoch": 1.6075, "grad_norm": 1.8875168561935425, "learning_rate": 3.5727199999999997e-07, "loss": 0.4565, "step": 160750 }, { "epoch": 1.608, "grad_norm": 86.59226989746094, "learning_rate": 3.57072e-07, "loss": 0.3441, "step": 160800 }, { "epoch": 1.6085, "grad_norm": 22.594993591308594, "learning_rate": 3.56872e-07, "loss": 0.5117, "step": 160850 }, { "epoch": 1.609, "grad_norm": 31.01827049255371, "learning_rate": 3.56672e-07, "loss": 0.4471, "step": 160900 }, { "epoch": 1.6095000000000002, "grad_norm": 85.61234283447266, "learning_rate": 3.56472e-07, "loss": 0.3757, "step": 160950 }, { "epoch": 1.6099999999999999, "grad_norm": 87.01544189453125, "learning_rate": 3.5627199999999995e-07, "loss": 0.5461, "step": 161000 }, { "epoch": 1.6105, "grad_norm": 4.279980659484863, "learning_rate": 3.56072e-07, "loss": 0.4856, "step": 161050 }, { "epoch": 1.611, "grad_norm": 121.6936264038086, "learning_rate": 3.55872e-07, "loss": 0.5672, "step": 161100 }, { "epoch": 1.6115, "grad_norm": 12.646116256713867, "learning_rate": 3.5567199999999996e-07, "loss": 0.4158, "step": 161150 }, { "epoch": 1.612, "grad_norm": 166.2145233154297, "learning_rate": 3.55472e-07, "loss": 0.4199, "step": 161200 }, { "epoch": 1.6125, "grad_norm": 37.84153366088867, "learning_rate": 3.55272e-07, "loss": 0.5097, "step": 161250 }, { "epoch": 1.613, "grad_norm": 21.25974464416504, "learning_rate": 3.5507199999999996e-07, "loss": 0.4476, "step": 161300 }, { "epoch": 1.6135000000000002, "grad_norm": 1.0741132497787476, "learning_rate": 3.54872e-07, "loss": 0.3204, "step": 161350 }, { "epoch": 1.6139999999999999, "grad_norm": 76.07339477539062, "learning_rate": 3.54672e-07, "loss": 0.4677, "step": 161400 }, { "epoch": 1.6145, "grad_norm": 37.11135482788086, "learning_rate": 3.5447199999999997e-07, "loss": 0.4673, "step": 161450 }, { "epoch": 1.615, "grad_norm": 29.35950469970703, "learning_rate": 3.5427199999999996e-07, "loss": 0.4879, "step": 161500 }, { "epoch": 1.6155, "grad_norm": 6.429986000061035, "learning_rate": 3.54072e-07, "loss": 0.4428, "step": 161550 }, { "epoch": 1.616, "grad_norm": 4.6186933517456055, "learning_rate": 3.53872e-07, "loss": 0.3503, "step": 161600 }, { "epoch": 1.6165, "grad_norm": 102.59295654296875, "learning_rate": 3.5367199999999997e-07, "loss": 0.3102, "step": 161650 }, { "epoch": 1.617, "grad_norm": 0.6536028981208801, "learning_rate": 3.53472e-07, "loss": 0.4446, "step": 161700 }, { "epoch": 1.6175000000000002, "grad_norm": 25.664196014404297, "learning_rate": 3.5327199999999994e-07, "loss": 0.3685, "step": 161750 }, { "epoch": 1.6179999999999999, "grad_norm": 59.3905143737793, "learning_rate": 3.53072e-07, "loss": 0.5044, "step": 161800 }, { "epoch": 1.6185, "grad_norm": 35.49124526977539, "learning_rate": 3.52872e-07, "loss": 0.3558, "step": 161850 }, { "epoch": 1.619, "grad_norm": 36.11788558959961, "learning_rate": 3.5267199999999994e-07, "loss": 0.3282, "step": 161900 }, { "epoch": 1.6195, "grad_norm": 36.98171615600586, "learning_rate": 3.52472e-07, "loss": 0.4198, "step": 161950 }, { "epoch": 1.62, "grad_norm": 7.539791584014893, "learning_rate": 3.5227199999999997e-07, "loss": 0.4764, "step": 162000 }, { "epoch": 1.6205, "grad_norm": 5.743781566619873, "learning_rate": 3.52072e-07, "loss": 0.3213, "step": 162050 }, { "epoch": 1.621, "grad_norm": 1.3049453496932983, "learning_rate": 3.51872e-07, "loss": 0.2321, "step": 162100 }, { "epoch": 1.6215000000000002, "grad_norm": 17.184179306030273, "learning_rate": 3.51676e-07, "loss": 0.4108, "step": 162150 }, { "epoch": 1.6219999999999999, "grad_norm": 8.404489517211914, "learning_rate": 3.51476e-07, "loss": 0.4013, "step": 162200 }, { "epoch": 1.6225, "grad_norm": 4.2238993644714355, "learning_rate": 3.5127599999999995e-07, "loss": 0.413, "step": 162250 }, { "epoch": 1.623, "grad_norm": 53.541168212890625, "learning_rate": 3.51076e-07, "loss": 0.4472, "step": 162300 }, { "epoch": 1.6235, "grad_norm": 40.160789489746094, "learning_rate": 3.50876e-07, "loss": 0.3242, "step": 162350 }, { "epoch": 1.624, "grad_norm": 2.3333935737609863, "learning_rate": 3.5067599999999996e-07, "loss": 0.4453, "step": 162400 }, { "epoch": 1.6245, "grad_norm": 28.548282623291016, "learning_rate": 3.50476e-07, "loss": 0.4291, "step": 162450 }, { "epoch": 1.625, "grad_norm": 3.4190590381622314, "learning_rate": 3.50276e-07, "loss": 0.3052, "step": 162500 }, { "epoch": 1.6255, "grad_norm": 50.14396286010742, "learning_rate": 3.5007599999999996e-07, "loss": 0.4178, "step": 162550 }, { "epoch": 1.626, "grad_norm": 82.80094146728516, "learning_rate": 3.49876e-07, "loss": 0.4882, "step": 162600 }, { "epoch": 1.6265, "grad_norm": 274.3426208496094, "learning_rate": 3.49676e-07, "loss": 0.4842, "step": 162650 }, { "epoch": 1.627, "grad_norm": 40.102027893066406, "learning_rate": 3.4947599999999997e-07, "loss": 0.4315, "step": 162700 }, { "epoch": 1.6275, "grad_norm": 10.764528274536133, "learning_rate": 3.4927599999999996e-07, "loss": 0.3907, "step": 162750 }, { "epoch": 1.6280000000000001, "grad_norm": 58.64040756225586, "learning_rate": 3.4908e-07, "loss": 0.5452, "step": 162800 }, { "epoch": 1.6284999999999998, "grad_norm": 80.29693603515625, "learning_rate": 3.4888e-07, "loss": 0.4522, "step": 162850 }, { "epoch": 1.629, "grad_norm": 79.97079467773438, "learning_rate": 3.4867999999999997e-07, "loss": 0.5351, "step": 162900 }, { "epoch": 1.6295, "grad_norm": 104.74796295166016, "learning_rate": 3.4848e-07, "loss": 0.5425, "step": 162950 }, { "epoch": 1.63, "grad_norm": 5.748071193695068, "learning_rate": 3.4827999999999994e-07, "loss": 0.3334, "step": 163000 }, { "epoch": 1.6305, "grad_norm": 167.40061950683594, "learning_rate": 3.4808e-07, "loss": 0.4874, "step": 163050 }, { "epoch": 1.631, "grad_norm": 0.46852225065231323, "learning_rate": 3.4788e-07, "loss": 0.4224, "step": 163100 }, { "epoch": 1.6315, "grad_norm": 86.6996078491211, "learning_rate": 3.4767999999999995e-07, "loss": 0.572, "step": 163150 }, { "epoch": 1.6320000000000001, "grad_norm": 17.278841018676758, "learning_rate": 3.4748e-07, "loss": 0.521, "step": 163200 }, { "epoch": 1.6324999999999998, "grad_norm": 26.78488540649414, "learning_rate": 3.4727999999999997e-07, "loss": 0.4149, "step": 163250 }, { "epoch": 1.633, "grad_norm": 13.343032836914062, "learning_rate": 3.4708e-07, "loss": 0.3371, "step": 163300 }, { "epoch": 1.6335, "grad_norm": 0.05451102554798126, "learning_rate": 3.4688e-07, "loss": 0.4209, "step": 163350 }, { "epoch": 1.634, "grad_norm": 67.80818939208984, "learning_rate": 3.4668e-07, "loss": 0.3541, "step": 163400 }, { "epoch": 1.6345, "grad_norm": 77.798828125, "learning_rate": 3.4648e-07, "loss": 0.4772, "step": 163450 }, { "epoch": 1.635, "grad_norm": 70.70539855957031, "learning_rate": 3.4627999999999995e-07, "loss": 0.4017, "step": 163500 }, { "epoch": 1.6355, "grad_norm": 89.26293182373047, "learning_rate": 3.4608e-07, "loss": 0.5131, "step": 163550 }, { "epoch": 1.6360000000000001, "grad_norm": 68.28092956542969, "learning_rate": 3.4588e-07, "loss": 0.4411, "step": 163600 }, { "epoch": 1.6364999999999998, "grad_norm": 14.089746475219727, "learning_rate": 3.4567999999999996e-07, "loss": 0.5162, "step": 163650 }, { "epoch": 1.637, "grad_norm": 87.18695068359375, "learning_rate": 3.4548e-07, "loss": 0.4253, "step": 163700 }, { "epoch": 1.6375, "grad_norm": 7.904430389404297, "learning_rate": 3.4528e-07, "loss": 0.4666, "step": 163750 }, { "epoch": 1.638, "grad_norm": 1.7455227375030518, "learning_rate": 3.4507999999999996e-07, "loss": 0.286, "step": 163800 }, { "epoch": 1.6385, "grad_norm": 26.539167404174805, "learning_rate": 3.4488e-07, "loss": 0.5327, "step": 163850 }, { "epoch": 1.639, "grad_norm": 10.772929191589355, "learning_rate": 3.4468e-07, "loss": 0.3932, "step": 163900 }, { "epoch": 1.6395, "grad_norm": 37.356571197509766, "learning_rate": 3.4447999999999997e-07, "loss": 0.5795, "step": 163950 }, { "epoch": 1.6400000000000001, "grad_norm": 37.573543548583984, "learning_rate": 3.4427999999999996e-07, "loss": 0.5033, "step": 164000 }, { "epoch": 1.6404999999999998, "grad_norm": 4.547316074371338, "learning_rate": 3.4408e-07, "loss": 0.4576, "step": 164050 }, { "epoch": 1.641, "grad_norm": 70.57447814941406, "learning_rate": 3.4388e-07, "loss": 0.4713, "step": 164100 }, { "epoch": 1.6415, "grad_norm": 5.259047508239746, "learning_rate": 3.4367999999999996e-07, "loss": 0.5499, "step": 164150 }, { "epoch": 1.642, "grad_norm": 10.574108123779297, "learning_rate": 3.4348e-07, "loss": 0.2849, "step": 164200 }, { "epoch": 1.6425, "grad_norm": 69.39708709716797, "learning_rate": 3.4327999999999993e-07, "loss": 0.4958, "step": 164250 }, { "epoch": 1.643, "grad_norm": 21.05078887939453, "learning_rate": 3.4307999999999997e-07, "loss": 0.6435, "step": 164300 }, { "epoch": 1.6435, "grad_norm": 41.12791442871094, "learning_rate": 3.4288e-07, "loss": 0.3685, "step": 164350 }, { "epoch": 1.6440000000000001, "grad_norm": 118.48395538330078, "learning_rate": 3.4268e-07, "loss": 0.4217, "step": 164400 }, { "epoch": 1.6444999999999999, "grad_norm": 5.526557922363281, "learning_rate": 3.4248e-07, "loss": 0.3102, "step": 164450 }, { "epoch": 1.645, "grad_norm": 52.724700927734375, "learning_rate": 3.4227999999999997e-07, "loss": 0.4322, "step": 164500 }, { "epoch": 1.6455, "grad_norm": 5.8467631340026855, "learning_rate": 3.4208e-07, "loss": 0.4841, "step": 164550 }, { "epoch": 1.646, "grad_norm": 8.807022094726562, "learning_rate": 3.4188e-07, "loss": 0.3826, "step": 164600 }, { "epoch": 1.6465, "grad_norm": 0.2374558299779892, "learning_rate": 3.4167999999999997e-07, "loss": 0.4602, "step": 164650 }, { "epoch": 1.647, "grad_norm": 4.015384197235107, "learning_rate": 3.4148e-07, "loss": 0.4809, "step": 164700 }, { "epoch": 1.6475, "grad_norm": 0.15211087465286255, "learning_rate": 3.4127999999999994e-07, "loss": 0.3981, "step": 164750 }, { "epoch": 1.6480000000000001, "grad_norm": 62.6292839050293, "learning_rate": 3.4108e-07, "loss": 0.4082, "step": 164800 }, { "epoch": 1.6484999999999999, "grad_norm": 46.557071685791016, "learning_rate": 3.4088e-07, "loss": 0.3833, "step": 164850 }, { "epoch": 1.649, "grad_norm": 109.10408782958984, "learning_rate": 3.4067999999999995e-07, "loss": 0.3742, "step": 164900 }, { "epoch": 1.6495, "grad_norm": 84.44685363769531, "learning_rate": 3.4048e-07, "loss": 0.4855, "step": 164950 }, { "epoch": 1.65, "grad_norm": 3.448240041732788, "learning_rate": 3.4028000000000003e-07, "loss": 0.5069, "step": 165000 }, { "epoch": 1.6505, "grad_norm": 25.00313377380371, "learning_rate": 3.4007999999999996e-07, "loss": 0.4354, "step": 165050 }, { "epoch": 1.651, "grad_norm": 83.84122467041016, "learning_rate": 3.3988e-07, "loss": 0.5135, "step": 165100 }, { "epoch": 1.6515, "grad_norm": 0.6992330551147461, "learning_rate": 3.3968e-07, "loss": 0.3979, "step": 165150 }, { "epoch": 1.6520000000000001, "grad_norm": 13.750035285949707, "learning_rate": 3.3947999999999997e-07, "loss": 0.4341, "step": 165200 }, { "epoch": 1.6524999999999999, "grad_norm": 7.680423736572266, "learning_rate": 3.3928e-07, "loss": 0.4273, "step": 165250 }, { "epoch": 1.653, "grad_norm": 1.2927963733673096, "learning_rate": 3.3908e-07, "loss": 0.3914, "step": 165300 }, { "epoch": 1.6535, "grad_norm": 92.42204284667969, "learning_rate": 3.3888e-07, "loss": 0.5286, "step": 165350 }, { "epoch": 1.654, "grad_norm": 102.65548706054688, "learning_rate": 3.3867999999999996e-07, "loss": 0.485, "step": 165400 }, { "epoch": 1.6545, "grad_norm": 7.027461051940918, "learning_rate": 3.3848e-07, "loss": 0.4433, "step": 165450 }, { "epoch": 1.655, "grad_norm": 74.44886779785156, "learning_rate": 3.3828000000000004e-07, "loss": 0.3997, "step": 165500 }, { "epoch": 1.6555, "grad_norm": 36.282901763916016, "learning_rate": 3.3807999999999997e-07, "loss": 0.389, "step": 165550 }, { "epoch": 1.6560000000000001, "grad_norm": 23.829561233520508, "learning_rate": 3.3788e-07, "loss": 0.312, "step": 165600 }, { "epoch": 1.6564999999999999, "grad_norm": 50.19546127319336, "learning_rate": 3.3768e-07, "loss": 0.4511, "step": 165650 }, { "epoch": 1.657, "grad_norm": 37.749725341796875, "learning_rate": 3.3748e-07, "loss": 0.3837, "step": 165700 }, { "epoch": 1.6575, "grad_norm": 8.897858619689941, "learning_rate": 3.3728e-07, "loss": 0.5425, "step": 165750 }, { "epoch": 1.658, "grad_norm": 3.789098024368286, "learning_rate": 3.3708e-07, "loss": 0.4618, "step": 165800 }, { "epoch": 1.6585, "grad_norm": 73.6874771118164, "learning_rate": 3.3688e-07, "loss": 0.6174, "step": 165850 }, { "epoch": 1.659, "grad_norm": 28.941118240356445, "learning_rate": 3.3667999999999997e-07, "loss": 0.4578, "step": 165900 }, { "epoch": 1.6595, "grad_norm": 104.42362213134766, "learning_rate": 3.3648e-07, "loss": 0.4737, "step": 165950 }, { "epoch": 1.6600000000000001, "grad_norm": 6.5759477615356445, "learning_rate": 3.3628e-07, "loss": 0.4182, "step": 166000 }, { "epoch": 1.6604999999999999, "grad_norm": 3.6129372119903564, "learning_rate": 3.3608e-07, "loss": 0.3286, "step": 166050 }, { "epoch": 1.661, "grad_norm": 32.19172668457031, "learning_rate": 3.3588e-07, "loss": 0.3299, "step": 166100 }, { "epoch": 1.6615, "grad_norm": 34.54439163208008, "learning_rate": 3.3567999999999995e-07, "loss": 0.2821, "step": 166150 }, { "epoch": 1.662, "grad_norm": 7.3551411628723145, "learning_rate": 3.3548e-07, "loss": 0.6133, "step": 166200 }, { "epoch": 1.6625, "grad_norm": 135.5836944580078, "learning_rate": 3.3528e-07, "loss": 0.5549, "step": 166250 }, { "epoch": 1.663, "grad_norm": 65.05741882324219, "learning_rate": 3.3507999999999995e-07, "loss": 0.5788, "step": 166300 }, { "epoch": 1.6635, "grad_norm": 46.17664337158203, "learning_rate": 3.3488e-07, "loss": 0.5116, "step": 166350 }, { "epoch": 1.6640000000000001, "grad_norm": 55.95429611206055, "learning_rate": 3.3468e-07, "loss": 0.4453, "step": 166400 }, { "epoch": 1.6644999999999999, "grad_norm": 1.1684380769729614, "learning_rate": 3.3447999999999996e-07, "loss": 0.4546, "step": 166450 }, { "epoch": 1.665, "grad_norm": 4.0728230476379395, "learning_rate": 3.3428e-07, "loss": 0.4625, "step": 166500 }, { "epoch": 1.6655, "grad_norm": 14.39780330657959, "learning_rate": 3.3408e-07, "loss": 0.3562, "step": 166550 }, { "epoch": 1.666, "grad_norm": 0.6815798282623291, "learning_rate": 3.3387999999999997e-07, "loss": 0.4597, "step": 166600 }, { "epoch": 1.6665, "grad_norm": 95.65667724609375, "learning_rate": 3.3367999999999995e-07, "loss": 0.5346, "step": 166650 }, { "epoch": 1.667, "grad_norm": 13.627656936645508, "learning_rate": 3.3348e-07, "loss": 0.4855, "step": 166700 }, { "epoch": 1.6675, "grad_norm": 5.321529388427734, "learning_rate": 3.3328000000000003e-07, "loss": 0.401, "step": 166750 }, { "epoch": 1.6680000000000001, "grad_norm": 9.042032241821289, "learning_rate": 3.3307999999999996e-07, "loss": 0.3916, "step": 166800 }, { "epoch": 1.6684999999999999, "grad_norm": 2.1465108394622803, "learning_rate": 3.3288e-07, "loss": 0.559, "step": 166850 }, { "epoch": 1.669, "grad_norm": 60.72265625, "learning_rate": 3.3268e-07, "loss": 0.3849, "step": 166900 }, { "epoch": 1.6695, "grad_norm": 68.50015258789062, "learning_rate": 3.3247999999999997e-07, "loss": 0.4911, "step": 166950 }, { "epoch": 1.67, "grad_norm": 15.50178337097168, "learning_rate": 3.3228e-07, "loss": 0.4197, "step": 167000 }, { "epoch": 1.6705, "grad_norm": 3.2465248107910156, "learning_rate": 3.3208e-07, "loss": 0.3693, "step": 167050 }, { "epoch": 1.671, "grad_norm": 7.669544219970703, "learning_rate": 3.3188e-07, "loss": 0.4919, "step": 167100 }, { "epoch": 1.6715, "grad_norm": 50.00425720214844, "learning_rate": 3.3167999999999996e-07, "loss": 0.4257, "step": 167150 }, { "epoch": 1.6720000000000002, "grad_norm": 2.649852991104126, "learning_rate": 3.3148e-07, "loss": 0.43, "step": 167200 }, { "epoch": 1.6724999999999999, "grad_norm": 70.55695343017578, "learning_rate": 3.3128e-07, "loss": 0.3321, "step": 167250 }, { "epoch": 1.673, "grad_norm": 18.895328521728516, "learning_rate": 3.3107999999999997e-07, "loss": 0.5031, "step": 167300 }, { "epoch": 1.6735, "grad_norm": 7.735249042510986, "learning_rate": 3.3088e-07, "loss": 0.501, "step": 167350 }, { "epoch": 1.674, "grad_norm": 60.472511291503906, "learning_rate": 3.3067999999999994e-07, "loss": 0.3583, "step": 167400 }, { "epoch": 1.6745, "grad_norm": 121.01581573486328, "learning_rate": 3.3048e-07, "loss": 0.4813, "step": 167450 }, { "epoch": 1.675, "grad_norm": 31.92609405517578, "learning_rate": 3.3028e-07, "loss": 0.4163, "step": 167500 }, { "epoch": 1.6755, "grad_norm": 20.554677963256836, "learning_rate": 3.3007999999999995e-07, "loss": 0.3398, "step": 167550 }, { "epoch": 1.6760000000000002, "grad_norm": 50.82945251464844, "learning_rate": 3.2988e-07, "loss": 0.5493, "step": 167600 }, { "epoch": 1.6764999999999999, "grad_norm": 13.953234672546387, "learning_rate": 3.2967999999999997e-07, "loss": 0.4574, "step": 167650 }, { "epoch": 1.677, "grad_norm": 16.389623641967773, "learning_rate": 3.2947999999999996e-07, "loss": 0.3857, "step": 167700 }, { "epoch": 1.6775, "grad_norm": 2.509197235107422, "learning_rate": 3.2928e-07, "loss": 0.3919, "step": 167750 }, { "epoch": 1.678, "grad_norm": 4.2857666015625, "learning_rate": 3.2908e-07, "loss": 0.4109, "step": 167800 }, { "epoch": 1.6785, "grad_norm": 35.18456268310547, "learning_rate": 3.2887999999999996e-07, "loss": 0.4017, "step": 167850 }, { "epoch": 1.679, "grad_norm": 76.32749938964844, "learning_rate": 3.2868e-07, "loss": 0.3254, "step": 167900 }, { "epoch": 1.6795, "grad_norm": 11.019416809082031, "learning_rate": 3.2848e-07, "loss": 0.4384, "step": 167950 }, { "epoch": 1.6800000000000002, "grad_norm": 18.828954696655273, "learning_rate": 3.2828e-07, "loss": 0.3532, "step": 168000 }, { "epoch": 1.6804999999999999, "grad_norm": 51.45600891113281, "learning_rate": 3.2807999999999996e-07, "loss": 0.4289, "step": 168050 }, { "epoch": 1.681, "grad_norm": 38.062007904052734, "learning_rate": 3.2788e-07, "loss": 0.3752, "step": 168100 }, { "epoch": 1.6815, "grad_norm": 135.77662658691406, "learning_rate": 3.2768000000000003e-07, "loss": 0.3286, "step": 168150 }, { "epoch": 1.682, "grad_norm": 79.01766967773438, "learning_rate": 3.2747999999999997e-07, "loss": 0.4695, "step": 168200 }, { "epoch": 1.6825, "grad_norm": 61.94942855834961, "learning_rate": 3.2728e-07, "loss": 0.2963, "step": 168250 }, { "epoch": 1.683, "grad_norm": 101.182373046875, "learning_rate": 3.2708e-07, "loss": 0.4316, "step": 168300 }, { "epoch": 1.6835, "grad_norm": 44.68824768066406, "learning_rate": 3.2687999999999997e-07, "loss": 0.2822, "step": 168350 }, { "epoch": 1.6840000000000002, "grad_norm": 97.19388580322266, "learning_rate": 3.2668e-07, "loss": 0.4836, "step": 168400 }, { "epoch": 1.6844999999999999, "grad_norm": 55.13529968261719, "learning_rate": 3.2648e-07, "loss": 0.2778, "step": 168450 }, { "epoch": 1.685, "grad_norm": 7.870936870574951, "learning_rate": 3.2628e-07, "loss": 0.4664, "step": 168500 }, { "epoch": 1.6855, "grad_norm": 2.3908069133758545, "learning_rate": 3.2607999999999997e-07, "loss": 0.5063, "step": 168550 }, { "epoch": 1.686, "grad_norm": 4.8086466789245605, "learning_rate": 3.2588e-07, "loss": 0.3995, "step": 168600 }, { "epoch": 1.6865, "grad_norm": 0.3409883677959442, "learning_rate": 3.2568e-07, "loss": 0.3984, "step": 168650 }, { "epoch": 1.687, "grad_norm": 10.495270729064941, "learning_rate": 3.2548e-07, "loss": 0.4395, "step": 168700 }, { "epoch": 1.6875, "grad_norm": 41.94511032104492, "learning_rate": 3.2528e-07, "loss": 0.4126, "step": 168750 }, { "epoch": 1.688, "grad_norm": 57.92393493652344, "learning_rate": 3.2507999999999994e-07, "loss": 0.318, "step": 168800 }, { "epoch": 1.6885, "grad_norm": 58.26197814941406, "learning_rate": 3.2488e-07, "loss": 0.4174, "step": 168850 }, { "epoch": 1.689, "grad_norm": 7.934047222137451, "learning_rate": 3.2468e-07, "loss": 0.4684, "step": 168900 }, { "epoch": 1.6895, "grad_norm": 76.46321105957031, "learning_rate": 3.2447999999999995e-07, "loss": 0.3206, "step": 168950 }, { "epoch": 1.69, "grad_norm": 55.63337707519531, "learning_rate": 3.2428e-07, "loss": 0.5596, "step": 169000 }, { "epoch": 1.6905000000000001, "grad_norm": 9.17141056060791, "learning_rate": 3.2408e-07, "loss": 0.3573, "step": 169050 }, { "epoch": 1.6909999999999998, "grad_norm": 76.35599517822266, "learning_rate": 3.2388e-07, "loss": 0.6063, "step": 169100 }, { "epoch": 1.6915, "grad_norm": 54.43082809448242, "learning_rate": 3.2368e-07, "loss": 0.4363, "step": 169150 }, { "epoch": 1.692, "grad_norm": 25.601472854614258, "learning_rate": 3.2348e-07, "loss": 0.3161, "step": 169200 }, { "epoch": 1.6925, "grad_norm": 61.37073516845703, "learning_rate": 3.2328e-07, "loss": 0.4411, "step": 169250 }, { "epoch": 1.693, "grad_norm": 73.22949981689453, "learning_rate": 3.2307999999999995e-07, "loss": 0.4661, "step": 169300 }, { "epoch": 1.6935, "grad_norm": 16.108476638793945, "learning_rate": 3.2288e-07, "loss": 0.4321, "step": 169350 }, { "epoch": 1.694, "grad_norm": 83.31065368652344, "learning_rate": 3.2268000000000003e-07, "loss": 0.3823, "step": 169400 }, { "epoch": 1.6945000000000001, "grad_norm": 1.437071681022644, "learning_rate": 3.2248399999999996e-07, "loss": 0.4882, "step": 169450 }, { "epoch": 1.6949999999999998, "grad_norm": 60.666839599609375, "learning_rate": 3.22284e-07, "loss": 0.4287, "step": 169500 }, { "epoch": 1.6955, "grad_norm": 52.73131561279297, "learning_rate": 3.22084e-07, "loss": 0.3638, "step": 169550 }, { "epoch": 1.696, "grad_norm": 67.59291076660156, "learning_rate": 3.2188399999999997e-07, "loss": 0.5091, "step": 169600 }, { "epoch": 1.6965, "grad_norm": 33.913936614990234, "learning_rate": 3.21684e-07, "loss": 0.3616, "step": 169650 }, { "epoch": 1.697, "grad_norm": 6.20016622543335, "learning_rate": 3.21484e-07, "loss": 0.4313, "step": 169700 }, { "epoch": 1.6975, "grad_norm": 78.34620666503906, "learning_rate": 3.21288e-07, "loss": 0.4994, "step": 169750 }, { "epoch": 1.698, "grad_norm": 8.514922142028809, "learning_rate": 3.2108799999999997e-07, "loss": 0.4833, "step": 169800 }, { "epoch": 1.6985000000000001, "grad_norm": 43.1236572265625, "learning_rate": 3.20888e-07, "loss": 0.4908, "step": 169850 }, { "epoch": 1.6989999999999998, "grad_norm": 51.2786750793457, "learning_rate": 3.2068799999999994e-07, "loss": 0.6363, "step": 169900 }, { "epoch": 1.6995, "grad_norm": 102.73619842529297, "learning_rate": 3.20488e-07, "loss": 0.442, "step": 169950 }, { "epoch": 1.7, "grad_norm": 121.2522201538086, "learning_rate": 3.20288e-07, "loss": 0.5802, "step": 170000 }, { "epoch": 1.7005, "grad_norm": 31.411584854125977, "learning_rate": 3.2008799999999995e-07, "loss": 0.3274, "step": 170050 }, { "epoch": 1.701, "grad_norm": 83.65167999267578, "learning_rate": 3.19888e-07, "loss": 0.3685, "step": 170100 }, { "epoch": 1.7015, "grad_norm": 11.413860321044922, "learning_rate": 3.19688e-07, "loss": 0.382, "step": 170150 }, { "epoch": 1.702, "grad_norm": 18.51053237915039, "learning_rate": 3.1948799999999996e-07, "loss": 0.3183, "step": 170200 }, { "epoch": 1.7025000000000001, "grad_norm": 4.223658084869385, "learning_rate": 3.19288e-07, "loss": 0.2734, "step": 170250 }, { "epoch": 1.7029999999999998, "grad_norm": 4.411848545074463, "learning_rate": 3.19088e-07, "loss": 0.3668, "step": 170300 }, { "epoch": 1.7035, "grad_norm": 29.715543746948242, "learning_rate": 3.18888e-07, "loss": 0.3997, "step": 170350 }, { "epoch": 1.704, "grad_norm": 3.227698802947998, "learning_rate": 3.18688e-07, "loss": 0.5644, "step": 170400 }, { "epoch": 1.7045, "grad_norm": 41.32566833496094, "learning_rate": 3.18488e-07, "loss": 0.5333, "step": 170450 }, { "epoch": 1.705, "grad_norm": 11.538256645202637, "learning_rate": 3.18288e-07, "loss": 0.3157, "step": 170500 }, { "epoch": 1.7055, "grad_norm": 0.46403923630714417, "learning_rate": 3.1808799999999996e-07, "loss": 0.534, "step": 170550 }, { "epoch": 1.706, "grad_norm": 18.477012634277344, "learning_rate": 3.17892e-07, "loss": 0.4467, "step": 170600 }, { "epoch": 1.7065000000000001, "grad_norm": 69.50625610351562, "learning_rate": 3.17692e-07, "loss": 0.4261, "step": 170650 }, { "epoch": 1.7069999999999999, "grad_norm": 73.91793060302734, "learning_rate": 3.1749199999999997e-07, "loss": 0.4084, "step": 170700 }, { "epoch": 1.7075, "grad_norm": 43.193870544433594, "learning_rate": 3.17292e-07, "loss": 0.3532, "step": 170750 }, { "epoch": 1.708, "grad_norm": 29.659255981445312, "learning_rate": 3.17092e-07, "loss": 0.4348, "step": 170800 }, { "epoch": 1.7085, "grad_norm": 3.251600980758667, "learning_rate": 3.16892e-07, "loss": 0.4886, "step": 170850 }, { "epoch": 1.709, "grad_norm": 21.75658416748047, "learning_rate": 3.1669199999999996e-07, "loss": 0.515, "step": 170900 }, { "epoch": 1.7095, "grad_norm": 6.0939106941223145, "learning_rate": 3.16492e-07, "loss": 0.5142, "step": 170950 }, { "epoch": 1.71, "grad_norm": 94.46733856201172, "learning_rate": 3.16292e-07, "loss": 0.5497, "step": 171000 }, { "epoch": 1.7105000000000001, "grad_norm": 148.20986938476562, "learning_rate": 3.1609199999999997e-07, "loss": 0.5326, "step": 171050 }, { "epoch": 1.7109999999999999, "grad_norm": 25.23739242553711, "learning_rate": 3.15892e-07, "loss": 0.4548, "step": 171100 }, { "epoch": 1.7115, "grad_norm": 45.993045806884766, "learning_rate": 3.1569199999999994e-07, "loss": 0.3861, "step": 171150 }, { "epoch": 1.712, "grad_norm": 70.52889251708984, "learning_rate": 3.15492e-07, "loss": 0.4945, "step": 171200 }, { "epoch": 1.7125, "grad_norm": 0.17195826768875122, "learning_rate": 3.15292e-07, "loss": 0.4047, "step": 171250 }, { "epoch": 1.713, "grad_norm": 141.29371643066406, "learning_rate": 3.1509199999999995e-07, "loss": 0.4854, "step": 171300 }, { "epoch": 1.7135, "grad_norm": 79.38813781738281, "learning_rate": 3.14892e-07, "loss": 0.4288, "step": 171350 }, { "epoch": 1.714, "grad_norm": 93.10710144042969, "learning_rate": 3.14692e-07, "loss": 0.3964, "step": 171400 }, { "epoch": 1.7145000000000001, "grad_norm": 120.59520721435547, "learning_rate": 3.1449199999999995e-07, "loss": 0.3408, "step": 171450 }, { "epoch": 1.7149999999999999, "grad_norm": 33.953556060791016, "learning_rate": 3.14292e-07, "loss": 0.4579, "step": 171500 }, { "epoch": 1.7155, "grad_norm": 32.431541442871094, "learning_rate": 3.14092e-07, "loss": 0.348, "step": 171550 }, { "epoch": 1.716, "grad_norm": 36.8668327331543, "learning_rate": 3.13892e-07, "loss": 0.4879, "step": 171600 }, { "epoch": 1.7165, "grad_norm": 2.457643747329712, "learning_rate": 3.13692e-07, "loss": 0.3101, "step": 171650 }, { "epoch": 1.717, "grad_norm": 29.928564071655273, "learning_rate": 3.13496e-07, "loss": 0.433, "step": 171700 }, { "epoch": 1.7175, "grad_norm": 46.240840911865234, "learning_rate": 3.1329600000000003e-07, "loss": 0.4581, "step": 171750 }, { "epoch": 1.718, "grad_norm": 13.193766593933105, "learning_rate": 3.1309599999999996e-07, "loss": 0.5739, "step": 171800 }, { "epoch": 1.7185000000000001, "grad_norm": 47.43181228637695, "learning_rate": 3.12896e-07, "loss": 0.4211, "step": 171850 }, { "epoch": 1.7189999999999999, "grad_norm": 50.500213623046875, "learning_rate": 3.12696e-07, "loss": 0.3436, "step": 171900 }, { "epoch": 1.7195, "grad_norm": 5.8976731300354, "learning_rate": 3.1249599999999997e-07, "loss": 0.3893, "step": 171950 }, { "epoch": 1.72, "grad_norm": 71.6220474243164, "learning_rate": 3.12296e-07, "loss": 0.4529, "step": 172000 }, { "epoch": 1.7205, "grad_norm": 32.8338508605957, "learning_rate": 3.12096e-07, "loss": 0.3568, "step": 172050 }, { "epoch": 1.721, "grad_norm": 17.878786087036133, "learning_rate": 3.11896e-07, "loss": 0.2774, "step": 172100 }, { "epoch": 1.7215, "grad_norm": 90.94465637207031, "learning_rate": 3.1169599999999996e-07, "loss": 0.3775, "step": 172150 }, { "epoch": 1.722, "grad_norm": 101.07366180419922, "learning_rate": 3.11496e-07, "loss": 0.452, "step": 172200 }, { "epoch": 1.7225000000000001, "grad_norm": 61.204246520996094, "learning_rate": 3.11296e-07, "loss": 0.4113, "step": 172250 }, { "epoch": 1.7229999999999999, "grad_norm": 92.17945098876953, "learning_rate": 3.1109599999999997e-07, "loss": 0.4296, "step": 172300 }, { "epoch": 1.7235, "grad_norm": 74.0315170288086, "learning_rate": 3.10896e-07, "loss": 0.5087, "step": 172350 }, { "epoch": 1.724, "grad_norm": 91.9222640991211, "learning_rate": 3.10696e-07, "loss": 0.4429, "step": 172400 }, { "epoch": 1.7245, "grad_norm": 33.95732498168945, "learning_rate": 3.10496e-07, "loss": 0.5813, "step": 172450 }, { "epoch": 1.725, "grad_norm": 80.2164535522461, "learning_rate": 3.10296e-07, "loss": 0.4709, "step": 172500 }, { "epoch": 1.7255, "grad_norm": 131.9748992919922, "learning_rate": 3.1009599999999995e-07, "loss": 0.4982, "step": 172550 }, { "epoch": 1.726, "grad_norm": 1.9716410636901855, "learning_rate": 3.09896e-07, "loss": 0.3589, "step": 172600 }, { "epoch": 1.7265000000000001, "grad_norm": 76.4041519165039, "learning_rate": 3.09696e-07, "loss": 0.4375, "step": 172650 }, { "epoch": 1.7269999999999999, "grad_norm": 10.473490715026855, "learning_rate": 3.0949599999999995e-07, "loss": 0.3493, "step": 172700 }, { "epoch": 1.7275, "grad_norm": 44.0904426574707, "learning_rate": 3.09296e-07, "loss": 0.4193, "step": 172750 }, { "epoch": 1.728, "grad_norm": 37.536067962646484, "learning_rate": 3.09096e-07, "loss": 0.4956, "step": 172800 }, { "epoch": 1.7285, "grad_norm": 141.21176147460938, "learning_rate": 3.08896e-07, "loss": 0.3959, "step": 172850 }, { "epoch": 1.729, "grad_norm": 15.123921394348145, "learning_rate": 3.08696e-07, "loss": 0.4004, "step": 172900 }, { "epoch": 1.7295, "grad_norm": 38.241249084472656, "learning_rate": 3.08496e-07, "loss": 0.5414, "step": 172950 }, { "epoch": 1.73, "grad_norm": 87.40081787109375, "learning_rate": 3.08296e-07, "loss": 0.5123, "step": 173000 }, { "epoch": 1.7305000000000001, "grad_norm": 17.00108528137207, "learning_rate": 3.0809599999999996e-07, "loss": 0.4696, "step": 173050 }, { "epoch": 1.7309999999999999, "grad_norm": 5.777243614196777, "learning_rate": 3.07896e-07, "loss": 0.479, "step": 173100 }, { "epoch": 1.7315, "grad_norm": 92.2618179321289, "learning_rate": 3.0769600000000003e-07, "loss": 0.3697, "step": 173150 }, { "epoch": 1.732, "grad_norm": 115.76209259033203, "learning_rate": 3.0749599999999996e-07, "loss": 0.5014, "step": 173200 }, { "epoch": 1.7325, "grad_norm": 10.42154598236084, "learning_rate": 3.07296e-07, "loss": 0.521, "step": 173250 }, { "epoch": 1.733, "grad_norm": 48.72549819946289, "learning_rate": 3.07096e-07, "loss": 0.5236, "step": 173300 }, { "epoch": 1.7335, "grad_norm": 43.980899810791016, "learning_rate": 3.0689599999999997e-07, "loss": 0.5085, "step": 173350 }, { "epoch": 1.734, "grad_norm": 7.443145275115967, "learning_rate": 3.06696e-07, "loss": 0.4113, "step": 173400 }, { "epoch": 1.7345000000000002, "grad_norm": 9.301448822021484, "learning_rate": 3.06496e-07, "loss": 0.4126, "step": 173450 }, { "epoch": 1.7349999999999999, "grad_norm": 85.95564270019531, "learning_rate": 3.06296e-07, "loss": 0.6378, "step": 173500 }, { "epoch": 1.7355, "grad_norm": 20.996845245361328, "learning_rate": 3.0609599999999996e-07, "loss": 0.6383, "step": 173550 }, { "epoch": 1.736, "grad_norm": 50.388126373291016, "learning_rate": 3.05896e-07, "loss": 0.3279, "step": 173600 }, { "epoch": 1.7365, "grad_norm": 0.6028560400009155, "learning_rate": 3.05696e-07, "loss": 0.4151, "step": 173650 }, { "epoch": 1.737, "grad_norm": 31.13410186767578, "learning_rate": 3.0549599999999997e-07, "loss": 0.423, "step": 173700 }, { "epoch": 1.7375, "grad_norm": 72.78054809570312, "learning_rate": 3.05296e-07, "loss": 0.344, "step": 173750 }, { "epoch": 1.738, "grad_norm": 25.89346694946289, "learning_rate": 3.0509599999999994e-07, "loss": 0.6206, "step": 173800 }, { "epoch": 1.7385000000000002, "grad_norm": 48.15966033935547, "learning_rate": 3.04896e-07, "loss": 0.3109, "step": 173850 }, { "epoch": 1.7389999999999999, "grad_norm": 45.8206787109375, "learning_rate": 3.04696e-07, "loss": 0.5, "step": 173900 }, { "epoch": 1.7395, "grad_norm": 9.931618690490723, "learning_rate": 3.0449599999999995e-07, "loss": 0.4324, "step": 173950 }, { "epoch": 1.74, "grad_norm": 107.68231964111328, "learning_rate": 3.04296e-07, "loss": 0.4264, "step": 174000 }, { "epoch": 1.7405, "grad_norm": 23.903282165527344, "learning_rate": 3.0409599999999997e-07, "loss": 0.4645, "step": 174050 }, { "epoch": 1.741, "grad_norm": 60.52227783203125, "learning_rate": 3.03896e-07, "loss": 0.4347, "step": 174100 }, { "epoch": 1.7415, "grad_norm": 3.167391300201416, "learning_rate": 3.03696e-07, "loss": 0.354, "step": 174150 }, { "epoch": 1.742, "grad_norm": 72.62369537353516, "learning_rate": 3.03496e-07, "loss": 0.4243, "step": 174200 }, { "epoch": 1.7425000000000002, "grad_norm": 51.68940734863281, "learning_rate": 3.03296e-07, "loss": 0.4351, "step": 174250 }, { "epoch": 1.7429999999999999, "grad_norm": 93.96479797363281, "learning_rate": 3.0309599999999995e-07, "loss": 0.3287, "step": 174300 }, { "epoch": 1.7435, "grad_norm": 151.80270385742188, "learning_rate": 3.02896e-07, "loss": 0.3704, "step": 174350 }, { "epoch": 1.744, "grad_norm": 6.188260555267334, "learning_rate": 3.02696e-07, "loss": 0.5232, "step": 174400 }, { "epoch": 1.7445, "grad_norm": 19.85384178161621, "learning_rate": 3.0249599999999996e-07, "loss": 0.4082, "step": 174450 }, { "epoch": 1.745, "grad_norm": 2.640712022781372, "learning_rate": 3.02296e-07, "loss": 0.3756, "step": 174500 }, { "epoch": 1.7455, "grad_norm": 20.137554168701172, "learning_rate": 3.02096e-07, "loss": 0.459, "step": 174550 }, { "epoch": 1.746, "grad_norm": 60.7595100402832, "learning_rate": 3.0189599999999997e-07, "loss": 0.4374, "step": 174600 }, { "epoch": 1.7465000000000002, "grad_norm": 51.217464447021484, "learning_rate": 3.01696e-07, "loss": 0.4513, "step": 174650 }, { "epoch": 1.7469999999999999, "grad_norm": 7.342816352844238, "learning_rate": 3.01496e-07, "loss": 0.448, "step": 174700 }, { "epoch": 1.7475, "grad_norm": 1.492277979850769, "learning_rate": 3.01296e-07, "loss": 0.4433, "step": 174750 }, { "epoch": 1.748, "grad_norm": 75.10221099853516, "learning_rate": 3.0109599999999996e-07, "loss": 0.2593, "step": 174800 }, { "epoch": 1.7485, "grad_norm": 6.8189616203308105, "learning_rate": 3.00896e-07, "loss": 0.4845, "step": 174850 }, { "epoch": 1.749, "grad_norm": 33.18309783935547, "learning_rate": 3.00696e-07, "loss": 0.3589, "step": 174900 }, { "epoch": 1.7495, "grad_norm": 149.52420043945312, "learning_rate": 3.0049599999999997e-07, "loss": 0.3238, "step": 174950 }, { "epoch": 1.75, "grad_norm": 33.072242736816406, "learning_rate": 3.00296e-07, "loss": 0.5178, "step": 175000 }, { "epoch": 1.7505, "grad_norm": 19.933521270751953, "learning_rate": 3.0009599999999994e-07, "loss": 0.4674, "step": 175050 }, { "epoch": 1.751, "grad_norm": 14.122361183166504, "learning_rate": 2.99896e-07, "loss": 0.3734, "step": 175100 }, { "epoch": 1.7515, "grad_norm": 0.9820132255554199, "learning_rate": 2.99696e-07, "loss": 0.3835, "step": 175150 }, { "epoch": 1.752, "grad_norm": 0.1489812433719635, "learning_rate": 2.99496e-07, "loss": 0.6293, "step": 175200 }, { "epoch": 1.7525, "grad_norm": 2.5810625553131104, "learning_rate": 2.99296e-07, "loss": 0.335, "step": 175250 }, { "epoch": 1.7530000000000001, "grad_norm": 4.589435577392578, "learning_rate": 2.9909599999999997e-07, "loss": 0.5491, "step": 175300 }, { "epoch": 1.7534999999999998, "grad_norm": 35.78123474121094, "learning_rate": 2.98896e-07, "loss": 0.3732, "step": 175350 }, { "epoch": 1.754, "grad_norm": 7.164456844329834, "learning_rate": 2.98696e-07, "loss": 0.418, "step": 175400 }, { "epoch": 1.7545, "grad_norm": 102.37982177734375, "learning_rate": 2.98496e-07, "loss": 0.5153, "step": 175450 }, { "epoch": 1.755, "grad_norm": 1.655551552772522, "learning_rate": 2.98296e-07, "loss": 0.4579, "step": 175500 }, { "epoch": 1.7555, "grad_norm": 11.854036331176758, "learning_rate": 2.98096e-07, "loss": 0.5165, "step": 175550 }, { "epoch": 1.756, "grad_norm": 5.970990180969238, "learning_rate": 2.97896e-07, "loss": 0.3749, "step": 175600 }, { "epoch": 1.7565, "grad_norm": 40.14838790893555, "learning_rate": 2.97696e-07, "loss": 0.4894, "step": 175650 }, { "epoch": 1.7570000000000001, "grad_norm": 6.2947187423706055, "learning_rate": 2.9749999999999996e-07, "loss": 0.3435, "step": 175700 }, { "epoch": 1.7574999999999998, "grad_norm": 72.81707000732422, "learning_rate": 2.973e-07, "loss": 0.5341, "step": 175750 }, { "epoch": 1.758, "grad_norm": 23.278066635131836, "learning_rate": 2.971e-07, "loss": 0.3967, "step": 175800 }, { "epoch": 1.7585, "grad_norm": 75.2026138305664, "learning_rate": 2.9689999999999997e-07, "loss": 0.4016, "step": 175850 }, { "epoch": 1.759, "grad_norm": 25.339069366455078, "learning_rate": 2.967e-07, "loss": 0.2943, "step": 175900 }, { "epoch": 1.7595, "grad_norm": 44.909751892089844, "learning_rate": 2.965e-07, "loss": 0.3533, "step": 175950 }, { "epoch": 1.76, "grad_norm": 60.30315017700195, "learning_rate": 2.9629999999999997e-07, "loss": 0.5381, "step": 176000 }, { "epoch": 1.7605, "grad_norm": 105.79608917236328, "learning_rate": 2.9609999999999996e-07, "loss": 0.6541, "step": 176050 }, { "epoch": 1.7610000000000001, "grad_norm": 74.8968505859375, "learning_rate": 2.959e-07, "loss": 0.3239, "step": 176100 }, { "epoch": 1.7614999999999998, "grad_norm": 2.4534757137298584, "learning_rate": 2.957e-07, "loss": 0.3273, "step": 176150 }, { "epoch": 1.762, "grad_norm": 35.73273468017578, "learning_rate": 2.9549999999999997e-07, "loss": 0.4744, "step": 176200 }, { "epoch": 1.7625, "grad_norm": 33.48553466796875, "learning_rate": 2.953e-07, "loss": 0.4624, "step": 176250 }, { "epoch": 1.763, "grad_norm": 52.17403793334961, "learning_rate": 2.9509999999999994e-07, "loss": 0.3885, "step": 176300 }, { "epoch": 1.7635, "grad_norm": 103.1676025390625, "learning_rate": 2.949e-07, "loss": 0.5388, "step": 176350 }, { "epoch": 1.764, "grad_norm": 84.96452331542969, "learning_rate": 2.947e-07, "loss": 0.4278, "step": 176400 }, { "epoch": 1.7645, "grad_norm": 1.080453634262085, "learning_rate": 2.945e-07, "loss": 0.5096, "step": 176450 }, { "epoch": 1.7650000000000001, "grad_norm": 1.111628532409668, "learning_rate": 2.943e-07, "loss": 0.3848, "step": 176500 }, { "epoch": 1.7654999999999998, "grad_norm": 31.323686599731445, "learning_rate": 2.9409999999999997e-07, "loss": 0.3379, "step": 176550 }, { "epoch": 1.766, "grad_norm": 42.48777389526367, "learning_rate": 2.939e-07, "loss": 0.3058, "step": 176600 }, { "epoch": 1.7665, "grad_norm": 7.140532970428467, "learning_rate": 2.937e-07, "loss": 0.3894, "step": 176650 }, { "epoch": 1.767, "grad_norm": 18.32383918762207, "learning_rate": 2.935e-07, "loss": 0.2754, "step": 176700 }, { "epoch": 1.7675, "grad_norm": 0.7897120118141174, "learning_rate": 2.933e-07, "loss": 0.5175, "step": 176750 }, { "epoch": 1.768, "grad_norm": 121.51934051513672, "learning_rate": 2.931e-07, "loss": 0.4686, "step": 176800 }, { "epoch": 1.7685, "grad_norm": 0.11282049119472504, "learning_rate": 2.929e-07, "loss": 0.4456, "step": 176850 }, { "epoch": 1.7690000000000001, "grad_norm": 19.87791633605957, "learning_rate": 2.927e-07, "loss": 0.3768, "step": 176900 }, { "epoch": 1.7694999999999999, "grad_norm": 0.5945930480957031, "learning_rate": 2.9249999999999995e-07, "loss": 0.5334, "step": 176950 }, { "epoch": 1.77, "grad_norm": 5.010547161102295, "learning_rate": 2.923e-07, "loss": 0.4443, "step": 177000 }, { "epoch": 1.7705, "grad_norm": 47.688934326171875, "learning_rate": 2.9210000000000003e-07, "loss": 0.4077, "step": 177050 }, { "epoch": 1.771, "grad_norm": 60.86616516113281, "learning_rate": 2.9189999999999996e-07, "loss": 0.4187, "step": 177100 }, { "epoch": 1.7715, "grad_norm": 19.05134391784668, "learning_rate": 2.917e-07, "loss": 0.4585, "step": 177150 }, { "epoch": 1.772, "grad_norm": 41.00670623779297, "learning_rate": 2.915e-07, "loss": 0.3587, "step": 177200 }, { "epoch": 1.7725, "grad_norm": 4.250216007232666, "learning_rate": 2.9129999999999997e-07, "loss": 0.4831, "step": 177250 }, { "epoch": 1.7730000000000001, "grad_norm": 53.11735534667969, "learning_rate": 2.911e-07, "loss": 0.378, "step": 177300 }, { "epoch": 1.7734999999999999, "grad_norm": 79.12381744384766, "learning_rate": 2.909e-07, "loss": 0.3421, "step": 177350 }, { "epoch": 1.774, "grad_norm": 123.16691589355469, "learning_rate": 2.907e-07, "loss": 0.4126, "step": 177400 }, { "epoch": 1.7745, "grad_norm": 83.73424530029297, "learning_rate": 2.9049999999999996e-07, "loss": 0.4596, "step": 177450 }, { "epoch": 1.775, "grad_norm": 47.7365608215332, "learning_rate": 2.903e-07, "loss": 0.3569, "step": 177500 }, { "epoch": 1.7755, "grad_norm": 18.262990951538086, "learning_rate": 2.9010000000000004e-07, "loss": 0.3546, "step": 177550 }, { "epoch": 1.776, "grad_norm": 25.71767807006836, "learning_rate": 2.8989999999999997e-07, "loss": 0.5118, "step": 177600 }, { "epoch": 1.7765, "grad_norm": 43.59213638305664, "learning_rate": 2.897e-07, "loss": 0.4437, "step": 177650 }, { "epoch": 1.7770000000000001, "grad_norm": 3.3139374256134033, "learning_rate": 2.895e-07, "loss": 0.4414, "step": 177700 }, { "epoch": 1.7774999999999999, "grad_norm": 21.859750747680664, "learning_rate": 2.893e-07, "loss": 0.5413, "step": 177750 }, { "epoch": 1.778, "grad_norm": 63.0023307800293, "learning_rate": 2.891e-07, "loss": 0.3183, "step": 177800 }, { "epoch": 1.7785, "grad_norm": 7.4691267013549805, "learning_rate": 2.889e-07, "loss": 0.2339, "step": 177850 }, { "epoch": 1.779, "grad_norm": 51.13058090209961, "learning_rate": 2.887e-07, "loss": 0.5413, "step": 177900 }, { "epoch": 1.7795, "grad_norm": 65.8671875, "learning_rate": 2.8849999999999997e-07, "loss": 0.4583, "step": 177950 }, { "epoch": 1.78, "grad_norm": 10.142685890197754, "learning_rate": 2.883e-07, "loss": 0.4431, "step": 178000 }, { "epoch": 1.7805, "grad_norm": 26.582061767578125, "learning_rate": 2.881e-07, "loss": 0.4004, "step": 178050 }, { "epoch": 1.7810000000000001, "grad_norm": 73.92897033691406, "learning_rate": 2.879e-07, "loss": 0.5293, "step": 178100 }, { "epoch": 1.7814999999999999, "grad_norm": 69.2878646850586, "learning_rate": 2.877e-07, "loss": 0.5733, "step": 178150 }, { "epoch": 1.782, "grad_norm": 99.10835266113281, "learning_rate": 2.8749999999999995e-07, "loss": 0.5006, "step": 178200 }, { "epoch": 1.7825, "grad_norm": 78.11844635009766, "learning_rate": 2.873e-07, "loss": 0.3794, "step": 178250 }, { "epoch": 1.783, "grad_norm": 100.39323425292969, "learning_rate": 2.871e-07, "loss": 0.4381, "step": 178300 }, { "epoch": 1.7835, "grad_norm": 19.325456619262695, "learning_rate": 2.8689999999999996e-07, "loss": 0.4126, "step": 178350 }, { "epoch": 1.784, "grad_norm": 3.960777759552002, "learning_rate": 2.867e-07, "loss": 0.4078, "step": 178400 }, { "epoch": 1.7845, "grad_norm": 59.112300872802734, "learning_rate": 2.865e-07, "loss": 0.3978, "step": 178450 }, { "epoch": 1.7850000000000001, "grad_norm": 17.307994842529297, "learning_rate": 2.8629999999999996e-07, "loss": 0.4808, "step": 178500 }, { "epoch": 1.7854999999999999, "grad_norm": 33.76523971557617, "learning_rate": 2.861e-07, "loss": 0.4348, "step": 178550 }, { "epoch": 1.786, "grad_norm": 80.96756744384766, "learning_rate": 2.859e-07, "loss": 0.4833, "step": 178600 }, { "epoch": 1.7865, "grad_norm": 48.94674301147461, "learning_rate": 2.8569999999999997e-07, "loss": 0.3485, "step": 178650 }, { "epoch": 1.787, "grad_norm": 72.22645568847656, "learning_rate": 2.8549999999999996e-07, "loss": 0.4025, "step": 178700 }, { "epoch": 1.7875, "grad_norm": 4.155689716339111, "learning_rate": 2.853e-07, "loss": 0.405, "step": 178750 }, { "epoch": 1.788, "grad_norm": 0.55622398853302, "learning_rate": 2.8510000000000003e-07, "loss": 0.5012, "step": 178800 }, { "epoch": 1.7885, "grad_norm": 67.4764404296875, "learning_rate": 2.8489999999999996e-07, "loss": 0.3643, "step": 178850 }, { "epoch": 1.7890000000000001, "grad_norm": 85.40705108642578, "learning_rate": 2.847e-07, "loss": 0.5223, "step": 178900 }, { "epoch": 1.7894999999999999, "grad_norm": 85.30449676513672, "learning_rate": 2.845e-07, "loss": 0.4813, "step": 178950 }, { "epoch": 1.79, "grad_norm": 10.022859573364258, "learning_rate": 2.8429999999999997e-07, "loss": 0.4518, "step": 179000 }, { "epoch": 1.7905, "grad_norm": 53.33081817626953, "learning_rate": 2.841e-07, "loss": 0.4076, "step": 179050 }, { "epoch": 1.791, "grad_norm": 1.9716160297393799, "learning_rate": 2.839e-07, "loss": 0.4666, "step": 179100 }, { "epoch": 1.7915, "grad_norm": 59.22763442993164, "learning_rate": 2.837e-07, "loss": 0.5778, "step": 179150 }, { "epoch": 1.792, "grad_norm": 7.315980911254883, "learning_rate": 2.8349999999999996e-07, "loss": 0.4156, "step": 179200 }, { "epoch": 1.7925, "grad_norm": 49.91332244873047, "learning_rate": 2.833e-07, "loss": 0.4988, "step": 179250 }, { "epoch": 1.7930000000000001, "grad_norm": 65.8807144165039, "learning_rate": 2.831e-07, "loss": 0.5433, "step": 179300 }, { "epoch": 1.7934999999999999, "grad_norm": 103.30567169189453, "learning_rate": 2.8289999999999997e-07, "loss": 0.5536, "step": 179350 }, { "epoch": 1.794, "grad_norm": 86.77356719970703, "learning_rate": 2.827e-07, "loss": 0.5217, "step": 179400 }, { "epoch": 1.7945, "grad_norm": 96.92404174804688, "learning_rate": 2.8249999999999994e-07, "loss": 0.4565, "step": 179450 }, { "epoch": 1.795, "grad_norm": 14.877604484558105, "learning_rate": 2.823e-07, "loss": 0.4828, "step": 179500 }, { "epoch": 1.7955, "grad_norm": 24.75067901611328, "learning_rate": 2.821e-07, "loss": 0.4715, "step": 179550 }, { "epoch": 1.796, "grad_norm": 45.08053207397461, "learning_rate": 2.8189999999999995e-07, "loss": 0.4347, "step": 179600 }, { "epoch": 1.7965, "grad_norm": 1.8589259386062622, "learning_rate": 2.817e-07, "loss": 0.4739, "step": 179650 }, { "epoch": 1.7970000000000002, "grad_norm": 128.76307678222656, "learning_rate": 2.8149999999999997e-07, "loss": 0.4551, "step": 179700 }, { "epoch": 1.7974999999999999, "grad_norm": 25.413040161132812, "learning_rate": 2.8129999999999996e-07, "loss": 0.335, "step": 179750 }, { "epoch": 1.798, "grad_norm": 74.6392593383789, "learning_rate": 2.811e-07, "loss": 0.4635, "step": 179800 }, { "epoch": 1.7985, "grad_norm": 69.69477844238281, "learning_rate": 2.809e-07, "loss": 0.4422, "step": 179850 }, { "epoch": 1.799, "grad_norm": 12.26147747039795, "learning_rate": 2.807e-07, "loss": 0.4213, "step": 179900 }, { "epoch": 1.7995, "grad_norm": 90.06591033935547, "learning_rate": 2.805e-07, "loss": 0.3711, "step": 179950 }, { "epoch": 1.8, "grad_norm": 24.045799255371094, "learning_rate": 2.803e-07, "loss": 0.3808, "step": 180000 }, { "epoch": 1.8005, "grad_norm": 42.867767333984375, "learning_rate": 2.8010000000000003e-07, "loss": 0.4572, "step": 180050 }, { "epoch": 1.8010000000000002, "grad_norm": 63.34716033935547, "learning_rate": 2.7989999999999996e-07, "loss": 0.5186, "step": 180100 }, { "epoch": 1.8014999999999999, "grad_norm": 56.33584976196289, "learning_rate": 2.797e-07, "loss": 0.5546, "step": 180150 }, { "epoch": 1.802, "grad_norm": 81.60842895507812, "learning_rate": 2.7950000000000003e-07, "loss": 0.5097, "step": 180200 }, { "epoch": 1.8025, "grad_norm": 12.385924339294434, "learning_rate": 2.7929999999999997e-07, "loss": 0.3856, "step": 180250 }, { "epoch": 1.803, "grad_norm": 54.57160186767578, "learning_rate": 2.791e-07, "loss": 0.398, "step": 180300 }, { "epoch": 1.8035, "grad_norm": 75.23123168945312, "learning_rate": 2.789e-07, "loss": 0.3756, "step": 180350 }, { "epoch": 1.804, "grad_norm": 102.83467102050781, "learning_rate": 2.787e-07, "loss": 0.3975, "step": 180400 }, { "epoch": 1.8045, "grad_norm": 150.63693237304688, "learning_rate": 2.785e-07, "loss": 0.2832, "step": 180450 }, { "epoch": 1.8050000000000002, "grad_norm": 46.95927429199219, "learning_rate": 2.783e-07, "loss": 0.3984, "step": 180500 }, { "epoch": 1.8054999999999999, "grad_norm": 103.29827117919922, "learning_rate": 2.781e-07, "loss": 0.3903, "step": 180550 }, { "epoch": 1.806, "grad_norm": 6.826172351837158, "learning_rate": 2.7789999999999997e-07, "loss": 0.5537, "step": 180600 }, { "epoch": 1.8065, "grad_norm": 71.7900161743164, "learning_rate": 2.777e-07, "loss": 0.421, "step": 180650 }, { "epoch": 1.807, "grad_norm": 78.16989135742188, "learning_rate": 2.775e-07, "loss": 0.4313, "step": 180700 }, { "epoch": 1.8075, "grad_norm": 64.42400360107422, "learning_rate": 2.773e-07, "loss": 0.4864, "step": 180750 }, { "epoch": 1.808, "grad_norm": 27.222681045532227, "learning_rate": 2.771e-07, "loss": 0.3927, "step": 180800 }, { "epoch": 1.8085, "grad_norm": 72.81607818603516, "learning_rate": 2.7689999999999995e-07, "loss": 0.4236, "step": 180850 }, { "epoch": 1.8090000000000002, "grad_norm": 119.21035766601562, "learning_rate": 2.767e-07, "loss": 0.5258, "step": 180900 }, { "epoch": 1.8094999999999999, "grad_norm": 93.99649810791016, "learning_rate": 2.765e-07, "loss": 0.425, "step": 180950 }, { "epoch": 1.81, "grad_norm": 81.92257690429688, "learning_rate": 2.7629999999999995e-07, "loss": 0.4852, "step": 181000 }, { "epoch": 1.8105, "grad_norm": 67.70683288574219, "learning_rate": 2.761e-07, "loss": 0.4518, "step": 181050 }, { "epoch": 1.811, "grad_norm": 8.245046615600586, "learning_rate": 2.759e-07, "loss": 0.4221, "step": 181100 }, { "epoch": 1.8115, "grad_norm": 0.5864673852920532, "learning_rate": 2.757e-07, "loss": 0.3862, "step": 181150 }, { "epoch": 1.812, "grad_norm": 103.44923400878906, "learning_rate": 2.755e-07, "loss": 0.4913, "step": 181200 }, { "epoch": 1.8125, "grad_norm": 0.6303176879882812, "learning_rate": 2.753e-07, "loss": 0.411, "step": 181250 }, { "epoch": 1.813, "grad_norm": 50.171905517578125, "learning_rate": 2.751e-07, "loss": 0.3466, "step": 181300 }, { "epoch": 1.8135, "grad_norm": 31.094560623168945, "learning_rate": 2.7489999999999995e-07, "loss": 0.4811, "step": 181350 }, { "epoch": 1.814, "grad_norm": 3.8934226036071777, "learning_rate": 2.747e-07, "loss": 0.3938, "step": 181400 }, { "epoch": 1.8145, "grad_norm": 37.404117584228516, "learning_rate": 2.7450000000000003e-07, "loss": 0.3969, "step": 181450 }, { "epoch": 1.815, "grad_norm": 13.522012710571289, "learning_rate": 2.7429999999999996e-07, "loss": 0.4785, "step": 181500 }, { "epoch": 1.8155000000000001, "grad_norm": 14.331076622009277, "learning_rate": 2.741e-07, "loss": 0.3068, "step": 181550 }, { "epoch": 1.8159999999999998, "grad_norm": 26.77286148071289, "learning_rate": 2.739e-07, "loss": 0.337, "step": 181600 }, { "epoch": 1.8165, "grad_norm": 48.4984245300293, "learning_rate": 2.7369999999999997e-07, "loss": 0.3276, "step": 181650 }, { "epoch": 1.817, "grad_norm": 60.87279510498047, "learning_rate": 2.735e-07, "loss": 0.4779, "step": 181700 }, { "epoch": 1.8175, "grad_norm": 51.6512336730957, "learning_rate": 2.733e-07, "loss": 0.3513, "step": 181750 }, { "epoch": 1.818, "grad_norm": 6.495523929595947, "learning_rate": 2.731e-07, "loss": 0.5444, "step": 181800 }, { "epoch": 1.8185, "grad_norm": 60.35252380371094, "learning_rate": 2.7289999999999996e-07, "loss": 0.4291, "step": 181850 }, { "epoch": 1.819, "grad_norm": 28.022743225097656, "learning_rate": 2.727e-07, "loss": 0.3474, "step": 181900 }, { "epoch": 1.8195000000000001, "grad_norm": 10.732136726379395, "learning_rate": 2.725e-07, "loss": 0.4097, "step": 181950 }, { "epoch": 1.8199999999999998, "grad_norm": 85.77149200439453, "learning_rate": 2.7229999999999997e-07, "loss": 0.4671, "step": 182000 }, { "epoch": 1.8205, "grad_norm": 69.69800567626953, "learning_rate": 2.721e-07, "loss": 0.4068, "step": 182050 }, { "epoch": 1.821, "grad_norm": 67.32820129394531, "learning_rate": 2.7189999999999994e-07, "loss": 0.434, "step": 182100 }, { "epoch": 1.8215, "grad_norm": 46.35198974609375, "learning_rate": 2.717e-07, "loss": 0.4207, "step": 182150 }, { "epoch": 1.822, "grad_norm": 13.979206085205078, "learning_rate": 2.715e-07, "loss": 0.4041, "step": 182200 }, { "epoch": 1.8225, "grad_norm": 3.441556453704834, "learning_rate": 2.7129999999999995e-07, "loss": 0.4985, "step": 182250 }, { "epoch": 1.823, "grad_norm": 2.4120137691497803, "learning_rate": 2.711e-07, "loss": 0.5843, "step": 182300 }, { "epoch": 1.8235000000000001, "grad_norm": 70.7459716796875, "learning_rate": 2.7089999999999997e-07, "loss": 0.3521, "step": 182350 }, { "epoch": 1.8239999999999998, "grad_norm": 35.97295379638672, "learning_rate": 2.707e-07, "loss": 0.5621, "step": 182400 }, { "epoch": 1.8245, "grad_norm": 89.31857299804688, "learning_rate": 2.705e-07, "loss": 0.3875, "step": 182450 }, { "epoch": 1.825, "grad_norm": 91.55574035644531, "learning_rate": 2.703e-07, "loss": 0.4778, "step": 182500 }, { "epoch": 1.8255, "grad_norm": 4.394865036010742, "learning_rate": 2.701e-07, "loss": 0.3469, "step": 182550 }, { "epoch": 1.826, "grad_norm": 3.864041566848755, "learning_rate": 2.6989999999999995e-07, "loss": 0.3886, "step": 182600 }, { "epoch": 1.8265, "grad_norm": 12.288315773010254, "learning_rate": 2.697e-07, "loss": 0.4518, "step": 182650 }, { "epoch": 1.827, "grad_norm": 2.9489903450012207, "learning_rate": 2.695e-07, "loss": 0.3015, "step": 182700 }, { "epoch": 1.8275000000000001, "grad_norm": 58.115970611572266, "learning_rate": 2.6929999999999996e-07, "loss": 0.3234, "step": 182750 }, { "epoch": 1.8279999999999998, "grad_norm": 62.53365707397461, "learning_rate": 2.691e-07, "loss": 0.4949, "step": 182800 }, { "epoch": 1.8285, "grad_norm": 38.055843353271484, "learning_rate": 2.68904e-07, "loss": 0.4362, "step": 182850 }, { "epoch": 1.829, "grad_norm": 114.48504638671875, "learning_rate": 2.6870399999999997e-07, "loss": 0.4096, "step": 182900 }, { "epoch": 1.8295, "grad_norm": 56.97817611694336, "learning_rate": 2.68504e-07, "loss": 0.4706, "step": 182950 }, { "epoch": 1.83, "grad_norm": 100.63391876220703, "learning_rate": 2.68304e-07, "loss": 0.436, "step": 183000 }, { "epoch": 1.8305, "grad_norm": 0.5884892344474792, "learning_rate": 2.68104e-07, "loss": 0.5211, "step": 183050 }, { "epoch": 1.831, "grad_norm": 39.73606491088867, "learning_rate": 2.6790399999999996e-07, "loss": 0.3914, "step": 183100 }, { "epoch": 1.8315000000000001, "grad_norm": 3.421617269515991, "learning_rate": 2.67704e-07, "loss": 0.3778, "step": 183150 }, { "epoch": 1.8319999999999999, "grad_norm": 61.44535827636719, "learning_rate": 2.67504e-07, "loss": 0.5635, "step": 183200 }, { "epoch": 1.8325, "grad_norm": 80.6475601196289, "learning_rate": 2.6730399999999997e-07, "loss": 0.491, "step": 183250 }, { "epoch": 1.833, "grad_norm": 5.449416160583496, "learning_rate": 2.67104e-07, "loss": 0.3328, "step": 183300 }, { "epoch": 1.8335, "grad_norm": 92.49075317382812, "learning_rate": 2.6690399999999994e-07, "loss": 0.4623, "step": 183350 }, { "epoch": 1.834, "grad_norm": 7.8489990234375, "learning_rate": 2.66704e-07, "loss": 0.4183, "step": 183400 }, { "epoch": 1.8345, "grad_norm": 10.370362281799316, "learning_rate": 2.66504e-07, "loss": 0.4563, "step": 183450 }, { "epoch": 1.835, "grad_norm": 104.78883361816406, "learning_rate": 2.6630399999999995e-07, "loss": 0.4643, "step": 183500 }, { "epoch": 1.8355000000000001, "grad_norm": 1.3594282865524292, "learning_rate": 2.66104e-07, "loss": 0.4734, "step": 183550 }, { "epoch": 1.8359999999999999, "grad_norm": 1.5864472389221191, "learning_rate": 2.6590399999999997e-07, "loss": 0.3522, "step": 183600 }, { "epoch": 1.8365, "grad_norm": 62.5477409362793, "learning_rate": 2.65704e-07, "loss": 0.3794, "step": 183650 }, { "epoch": 1.837, "grad_norm": 53.555747985839844, "learning_rate": 2.65504e-07, "loss": 0.4603, "step": 183700 }, { "epoch": 1.8375, "grad_norm": 81.1266098022461, "learning_rate": 2.65304e-07, "loss": 0.564, "step": 183750 }, { "epoch": 1.838, "grad_norm": 40.866695404052734, "learning_rate": 2.65104e-07, "loss": 0.4407, "step": 183800 }, { "epoch": 1.8385, "grad_norm": 78.56951904296875, "learning_rate": 2.6490399999999995e-07, "loss": 0.435, "step": 183850 }, { "epoch": 1.839, "grad_norm": 6.286223888397217, "learning_rate": 2.64704e-07, "loss": 0.3209, "step": 183900 }, { "epoch": 1.8395000000000001, "grad_norm": 21.405439376831055, "learning_rate": 2.64504e-07, "loss": 0.352, "step": 183950 }, { "epoch": 1.8399999999999999, "grad_norm": 1.1134364604949951, "learning_rate": 2.6430399999999996e-07, "loss": 0.4044, "step": 184000 }, { "epoch": 1.8405, "grad_norm": 0.49174389243125916, "learning_rate": 2.64104e-07, "loss": 0.2879, "step": 184050 }, { "epoch": 1.841, "grad_norm": 7.193144798278809, "learning_rate": 2.6390400000000003e-07, "loss": 0.5735, "step": 184100 }, { "epoch": 1.8415, "grad_norm": 109.04924774169922, "learning_rate": 2.6370399999999996e-07, "loss": 0.4442, "step": 184150 }, { "epoch": 1.842, "grad_norm": 29.094871520996094, "learning_rate": 2.63504e-07, "loss": 0.5491, "step": 184200 }, { "epoch": 1.8425, "grad_norm": 50.769309997558594, "learning_rate": 2.63304e-07, "loss": 0.3682, "step": 184250 }, { "epoch": 1.843, "grad_norm": 93.74052429199219, "learning_rate": 2.6310399999999997e-07, "loss": 0.612, "step": 184300 }, { "epoch": 1.8435000000000001, "grad_norm": 90.94390106201172, "learning_rate": 2.62904e-07, "loss": 0.5965, "step": 184350 }, { "epoch": 1.8439999999999999, "grad_norm": 66.96981811523438, "learning_rate": 2.62704e-07, "loss": 0.4703, "step": 184400 }, { "epoch": 1.8445, "grad_norm": 99.0244369506836, "learning_rate": 2.62504e-07, "loss": 0.4547, "step": 184450 }, { "epoch": 1.845, "grad_norm": 75.41879272460938, "learning_rate": 2.6230399999999996e-07, "loss": 0.3646, "step": 184500 }, { "epoch": 1.8455, "grad_norm": 15.815811157226562, "learning_rate": 2.62104e-07, "loss": 0.432, "step": 184550 }, { "epoch": 1.846, "grad_norm": 86.7995376586914, "learning_rate": 2.6190400000000004e-07, "loss": 0.5396, "step": 184600 }, { "epoch": 1.8465, "grad_norm": 43.598876953125, "learning_rate": 2.6170399999999997e-07, "loss": 0.4494, "step": 184650 }, { "epoch": 1.847, "grad_norm": 55.9370231628418, "learning_rate": 2.61504e-07, "loss": 0.4301, "step": 184700 }, { "epoch": 1.8475000000000001, "grad_norm": 91.31571197509766, "learning_rate": 2.61304e-07, "loss": 0.4079, "step": 184750 }, { "epoch": 1.8479999999999999, "grad_norm": 2.9025886058807373, "learning_rate": 2.61104e-07, "loss": 0.4143, "step": 184800 }, { "epoch": 1.8485, "grad_norm": 76.20918273925781, "learning_rate": 2.60904e-07, "loss": 0.3667, "step": 184850 }, { "epoch": 1.849, "grad_norm": 65.22622680664062, "learning_rate": 2.60704e-07, "loss": 0.3588, "step": 184900 }, { "epoch": 1.8495, "grad_norm": 100.45918273925781, "learning_rate": 2.60504e-07, "loss": 0.4632, "step": 184950 }, { "epoch": 1.85, "grad_norm": 3.1237618923187256, "learning_rate": 2.6030399999999997e-07, "loss": 0.5477, "step": 185000 }, { "epoch": 1.8505, "grad_norm": 64.92867279052734, "learning_rate": 2.60104e-07, "loss": 0.4146, "step": 185050 }, { "epoch": 1.851, "grad_norm": 5.72701358795166, "learning_rate": 2.59904e-07, "loss": 0.3746, "step": 185100 }, { "epoch": 1.8515000000000001, "grad_norm": 86.76318359375, "learning_rate": 2.59704e-07, "loss": 0.4661, "step": 185150 }, { "epoch": 1.8519999999999999, "grad_norm": 68.64383697509766, "learning_rate": 2.59504e-07, "loss": 0.4119, "step": 185200 }, { "epoch": 1.8525, "grad_norm": 10.763185501098633, "learning_rate": 2.5930399999999995e-07, "loss": 0.414, "step": 185250 }, { "epoch": 1.853, "grad_norm": 1.534547209739685, "learning_rate": 2.59104e-07, "loss": 0.3791, "step": 185300 }, { "epoch": 1.8535, "grad_norm": 6.461805820465088, "learning_rate": 2.5890400000000003e-07, "loss": 0.5048, "step": 185350 }, { "epoch": 1.854, "grad_norm": 109.25067138671875, "learning_rate": 2.5870399999999996e-07, "loss": 0.5874, "step": 185400 }, { "epoch": 1.8545, "grad_norm": 24.06308364868164, "learning_rate": 2.58504e-07, "loss": 0.3391, "step": 185450 }, { "epoch": 1.855, "grad_norm": 0.7882880568504333, "learning_rate": 2.58304e-07, "loss": 0.4764, "step": 185500 }, { "epoch": 1.8555000000000001, "grad_norm": 26.011205673217773, "learning_rate": 2.5810399999999997e-07, "loss": 0.4689, "step": 185550 }, { "epoch": 1.8559999999999999, "grad_norm": 111.33613586425781, "learning_rate": 2.57904e-07, "loss": 0.4482, "step": 185600 }, { "epoch": 1.8565, "grad_norm": 7.982728958129883, "learning_rate": 2.57704e-07, "loss": 0.4772, "step": 185650 }, { "epoch": 1.857, "grad_norm": 68.1218032836914, "learning_rate": 2.57504e-07, "loss": 0.4142, "step": 185700 }, { "epoch": 1.8575, "grad_norm": 22.570636749267578, "learning_rate": 2.5730399999999996e-07, "loss": 0.3669, "step": 185750 }, { "epoch": 1.858, "grad_norm": 53.14181137084961, "learning_rate": 2.57104e-07, "loss": 0.3772, "step": 185800 }, { "epoch": 1.8585, "grad_norm": 94.5253677368164, "learning_rate": 2.5690400000000004e-07, "loss": 0.5068, "step": 185850 }, { "epoch": 1.859, "grad_norm": 57.95703887939453, "learning_rate": 2.5670399999999997e-07, "loss": 0.4562, "step": 185900 }, { "epoch": 1.8595000000000002, "grad_norm": 125.20692443847656, "learning_rate": 2.56504e-07, "loss": 0.5206, "step": 185950 }, { "epoch": 1.8599999999999999, "grad_norm": 76.74690246582031, "learning_rate": 2.56304e-07, "loss": 0.6174, "step": 186000 }, { "epoch": 1.8605, "grad_norm": 36.07761764526367, "learning_rate": 2.56104e-07, "loss": 0.5131, "step": 186050 }, { "epoch": 1.861, "grad_norm": 105.98657989501953, "learning_rate": 2.55904e-07, "loss": 0.4608, "step": 186100 }, { "epoch": 1.8615, "grad_norm": 62.74296569824219, "learning_rate": 2.55704e-07, "loss": 0.3984, "step": 186150 }, { "epoch": 1.862, "grad_norm": 7.824224948883057, "learning_rate": 2.55504e-07, "loss": 0.4741, "step": 186200 }, { "epoch": 1.8625, "grad_norm": 0.2132396399974823, "learning_rate": 2.5530399999999997e-07, "loss": 0.4148, "step": 186250 }, { "epoch": 1.863, "grad_norm": 63.773040771484375, "learning_rate": 2.55104e-07, "loss": 0.3911, "step": 186300 }, { "epoch": 1.8635000000000002, "grad_norm": 35.12382888793945, "learning_rate": 2.54904e-07, "loss": 0.3641, "step": 186350 }, { "epoch": 1.8639999999999999, "grad_norm": 13.937716484069824, "learning_rate": 2.54704e-07, "loss": 0.5185, "step": 186400 }, { "epoch": 1.8645, "grad_norm": 139.38047790527344, "learning_rate": 2.54504e-07, "loss": 0.3617, "step": 186450 }, { "epoch": 1.865, "grad_norm": 54.645015716552734, "learning_rate": 2.5430399999999995e-07, "loss": 0.4411, "step": 186500 }, { "epoch": 1.8655, "grad_norm": 77.72416687011719, "learning_rate": 2.54104e-07, "loss": 0.4681, "step": 186550 }, { "epoch": 1.866, "grad_norm": 75.3111801147461, "learning_rate": 2.53904e-07, "loss": 0.3701, "step": 186600 }, { "epoch": 1.8665, "grad_norm": 90.5988540649414, "learning_rate": 2.5370399999999995e-07, "loss": 0.415, "step": 186650 }, { "epoch": 1.867, "grad_norm": 7.769338130950928, "learning_rate": 2.53504e-07, "loss": 0.2835, "step": 186700 }, { "epoch": 1.8675000000000002, "grad_norm": 38.08205032348633, "learning_rate": 2.53304e-07, "loss": 0.5307, "step": 186750 }, { "epoch": 1.8679999999999999, "grad_norm": 17.543153762817383, "learning_rate": 2.5310399999999996e-07, "loss": 0.3548, "step": 186800 }, { "epoch": 1.8685, "grad_norm": 10.238409996032715, "learning_rate": 2.52904e-07, "loss": 0.5478, "step": 186850 }, { "epoch": 1.869, "grad_norm": 73.03883361816406, "learning_rate": 2.52704e-07, "loss": 0.4317, "step": 186900 }, { "epoch": 1.8695, "grad_norm": 50.91599655151367, "learning_rate": 2.5250399999999997e-07, "loss": 0.4353, "step": 186950 }, { "epoch": 1.87, "grad_norm": 1.1760220527648926, "learning_rate": 2.5230399999999995e-07, "loss": 0.4632, "step": 187000 }, { "epoch": 1.8705, "grad_norm": 79.37187194824219, "learning_rate": 2.52104e-07, "loss": 0.4926, "step": 187050 }, { "epoch": 1.871, "grad_norm": 60.9011344909668, "learning_rate": 2.5190400000000003e-07, "loss": 0.5786, "step": 187100 }, { "epoch": 1.8715000000000002, "grad_norm": 108.72712707519531, "learning_rate": 2.5170399999999996e-07, "loss": 0.3917, "step": 187150 }, { "epoch": 1.8719999999999999, "grad_norm": 34.94348907470703, "learning_rate": 2.51504e-07, "loss": 0.4065, "step": 187200 }, { "epoch": 1.8725, "grad_norm": 18.728527069091797, "learning_rate": 2.5130400000000004e-07, "loss": 0.523, "step": 187250 }, { "epoch": 1.873, "grad_norm": 68.10762023925781, "learning_rate": 2.5110399999999997e-07, "loss": 0.372, "step": 187300 }, { "epoch": 1.8735, "grad_norm": 24.434104919433594, "learning_rate": 2.50904e-07, "loss": 0.3331, "step": 187350 }, { "epoch": 1.874, "grad_norm": 27.29010772705078, "learning_rate": 2.50708e-07, "loss": 0.5148, "step": 187400 }, { "epoch": 1.8745, "grad_norm": 19.589698791503906, "learning_rate": 2.50508e-07, "loss": 0.4057, "step": 187450 }, { "epoch": 1.875, "grad_norm": 2.283752679824829, "learning_rate": 2.5031199999999997e-07, "loss": 0.3063, "step": 187500 }, { "epoch": 1.8755, "grad_norm": 21.889986038208008, "learning_rate": 2.50112e-07, "loss": 0.3292, "step": 187550 }, { "epoch": 1.876, "grad_norm": 2.8711414337158203, "learning_rate": 2.49912e-07, "loss": 0.4056, "step": 187600 }, { "epoch": 1.8765, "grad_norm": 53.26054763793945, "learning_rate": 2.49712e-07, "loss": 0.4772, "step": 187650 }, { "epoch": 1.877, "grad_norm": 10.286738395690918, "learning_rate": 2.49512e-07, "loss": 0.4867, "step": 187700 }, { "epoch": 1.8775, "grad_norm": 93.31888580322266, "learning_rate": 2.49312e-07, "loss": 0.3756, "step": 187750 }, { "epoch": 1.8780000000000001, "grad_norm": 86.73455810546875, "learning_rate": 2.49112e-07, "loss": 0.4222, "step": 187800 }, { "epoch": 1.8784999999999998, "grad_norm": 7.344048023223877, "learning_rate": 2.4891199999999997e-07, "loss": 0.4104, "step": 187850 }, { "epoch": 1.879, "grad_norm": 18.443588256835938, "learning_rate": 2.4871199999999996e-07, "loss": 0.5056, "step": 187900 }, { "epoch": 1.8795, "grad_norm": 20.430662155151367, "learning_rate": 2.48512e-07, "loss": 0.7031, "step": 187950 }, { "epoch": 1.88, "grad_norm": 98.71015930175781, "learning_rate": 2.48312e-07, "loss": 0.5261, "step": 188000 }, { "epoch": 1.8805, "grad_norm": 103.14667510986328, "learning_rate": 2.4811199999999997e-07, "loss": 0.4343, "step": 188050 }, { "epoch": 1.881, "grad_norm": 82.66529083251953, "learning_rate": 2.47912e-07, "loss": 0.4226, "step": 188100 }, { "epoch": 1.8815, "grad_norm": 11.182500839233398, "learning_rate": 2.47712e-07, "loss": 0.5237, "step": 188150 }, { "epoch": 1.8820000000000001, "grad_norm": 74.19300842285156, "learning_rate": 2.4751199999999997e-07, "loss": 0.5641, "step": 188200 }, { "epoch": 1.8824999999999998, "grad_norm": 36.28932571411133, "learning_rate": 2.47312e-07, "loss": 0.4777, "step": 188250 }, { "epoch": 1.883, "grad_norm": 80.10577392578125, "learning_rate": 2.47112e-07, "loss": 0.5638, "step": 188300 }, { "epoch": 1.8835, "grad_norm": 48.6932373046875, "learning_rate": 2.46912e-07, "loss": 0.3765, "step": 188350 }, { "epoch": 1.884, "grad_norm": 2.2516491413116455, "learning_rate": 2.4671199999999997e-07, "loss": 0.5383, "step": 188400 }, { "epoch": 1.8845, "grad_norm": 51.581634521484375, "learning_rate": 2.46512e-07, "loss": 0.5575, "step": 188450 }, { "epoch": 1.885, "grad_norm": 29.83396339416504, "learning_rate": 2.46312e-07, "loss": 0.3791, "step": 188500 }, { "epoch": 1.8855, "grad_norm": 30.861713409423828, "learning_rate": 2.46112e-07, "loss": 0.3886, "step": 188550 }, { "epoch": 1.8860000000000001, "grad_norm": 0.5609220862388611, "learning_rate": 2.4591199999999996e-07, "loss": 0.4782, "step": 188600 }, { "epoch": 1.8864999999999998, "grad_norm": 99.23149108886719, "learning_rate": 2.45712e-07, "loss": 0.4121, "step": 188650 }, { "epoch": 1.887, "grad_norm": 75.1487808227539, "learning_rate": 2.45512e-07, "loss": 0.3858, "step": 188700 }, { "epoch": 1.8875, "grad_norm": 52.555572509765625, "learning_rate": 2.4531199999999997e-07, "loss": 0.4526, "step": 188750 }, { "epoch": 1.888, "grad_norm": 3.6657705307006836, "learning_rate": 2.45112e-07, "loss": 0.3795, "step": 188800 }, { "epoch": 1.8885, "grad_norm": 53.775360107421875, "learning_rate": 2.44912e-07, "loss": 0.4186, "step": 188850 }, { "epoch": 1.889, "grad_norm": 82.37354278564453, "learning_rate": 2.4471200000000003e-07, "loss": 0.5195, "step": 188900 }, { "epoch": 1.8895, "grad_norm": 71.7086181640625, "learning_rate": 2.44512e-07, "loss": 0.4569, "step": 188950 }, { "epoch": 1.8900000000000001, "grad_norm": 76.88038635253906, "learning_rate": 2.44312e-07, "loss": 0.3721, "step": 189000 }, { "epoch": 1.8904999999999998, "grad_norm": 16.016008377075195, "learning_rate": 2.44112e-07, "loss": 0.4164, "step": 189050 }, { "epoch": 1.891, "grad_norm": 71.51254272460938, "learning_rate": 2.4391199999999997e-07, "loss": 0.4399, "step": 189100 }, { "epoch": 1.8915, "grad_norm": 9.580039024353027, "learning_rate": 2.43712e-07, "loss": 0.3662, "step": 189150 }, { "epoch": 1.892, "grad_norm": 75.79584503173828, "learning_rate": 2.43512e-07, "loss": 0.3707, "step": 189200 }, { "epoch": 1.8925, "grad_norm": 86.73115539550781, "learning_rate": 2.43312e-07, "loss": 0.4684, "step": 189250 }, { "epoch": 1.893, "grad_norm": 68.81775665283203, "learning_rate": 2.4311199999999996e-07, "loss": 0.3968, "step": 189300 }, { "epoch": 1.8935, "grad_norm": 62.46356201171875, "learning_rate": 2.42912e-07, "loss": 0.3978, "step": 189350 }, { "epoch": 1.8940000000000001, "grad_norm": 78.36255645751953, "learning_rate": 2.42712e-07, "loss": 0.4894, "step": 189400 }, { "epoch": 1.8944999999999999, "grad_norm": 96.74028778076172, "learning_rate": 2.4251199999999997e-07, "loss": 0.3728, "step": 189450 }, { "epoch": 1.895, "grad_norm": 92.6654052734375, "learning_rate": 2.42312e-07, "loss": 0.3868, "step": 189500 }, { "epoch": 1.8955, "grad_norm": 38.866249084472656, "learning_rate": 2.42112e-07, "loss": 0.3451, "step": 189550 }, { "epoch": 1.896, "grad_norm": 63.626548767089844, "learning_rate": 2.41912e-07, "loss": 0.4468, "step": 189600 }, { "epoch": 1.8965, "grad_norm": 65.61091613769531, "learning_rate": 2.41712e-07, "loss": 0.3838, "step": 189650 }, { "epoch": 1.897, "grad_norm": 7.437851428985596, "learning_rate": 2.41512e-07, "loss": 0.3055, "step": 189700 }, { "epoch": 1.8975, "grad_norm": 115.52892303466797, "learning_rate": 2.41312e-07, "loss": 0.3372, "step": 189750 }, { "epoch": 1.8980000000000001, "grad_norm": 18.40287971496582, "learning_rate": 2.4111199999999997e-07, "loss": 0.3989, "step": 189800 }, { "epoch": 1.8984999999999999, "grad_norm": 9.118658065795898, "learning_rate": 2.4091199999999995e-07, "loss": 0.4081, "step": 189850 }, { "epoch": 1.899, "grad_norm": 18.847917556762695, "learning_rate": 2.40712e-07, "loss": 0.395, "step": 189900 }, { "epoch": 1.8995, "grad_norm": 36.21388626098633, "learning_rate": 2.40512e-07, "loss": 0.3841, "step": 189950 }, { "epoch": 1.9, "grad_norm": 65.34298706054688, "learning_rate": 2.4031199999999996e-07, "loss": 0.4045, "step": 190000 }, { "epoch": 1.9005, "grad_norm": 90.31636047363281, "learning_rate": 2.40112e-07, "loss": 0.3998, "step": 190050 }, { "epoch": 1.901, "grad_norm": 80.00827026367188, "learning_rate": 2.39912e-07, "loss": 0.505, "step": 190100 }, { "epoch": 1.9015, "grad_norm": 47.44596862792969, "learning_rate": 2.39712e-07, "loss": 0.3921, "step": 190150 }, { "epoch": 1.9020000000000001, "grad_norm": 19.611328125, "learning_rate": 2.39512e-07, "loss": 0.3527, "step": 190200 }, { "epoch": 1.9024999999999999, "grad_norm": 111.60746765136719, "learning_rate": 2.39316e-07, "loss": 0.4237, "step": 190250 }, { "epoch": 1.903, "grad_norm": 0.5896288156509399, "learning_rate": 2.39116e-07, "loss": 0.6023, "step": 190300 }, { "epoch": 1.9035, "grad_norm": 6.561922550201416, "learning_rate": 2.3891599999999997e-07, "loss": 0.4336, "step": 190350 }, { "epoch": 1.904, "grad_norm": 72.0843734741211, "learning_rate": 2.38716e-07, "loss": 0.4817, "step": 190400 }, { "epoch": 1.9045, "grad_norm": 82.44991302490234, "learning_rate": 2.38516e-07, "loss": 0.5264, "step": 190450 }, { "epoch": 1.905, "grad_norm": 65.08079528808594, "learning_rate": 2.38316e-07, "loss": 0.4905, "step": 190500 }, { "epoch": 1.9055, "grad_norm": 52.838294982910156, "learning_rate": 2.3811599999999999e-07, "loss": 0.5025, "step": 190550 }, { "epoch": 1.9060000000000001, "grad_norm": 16.74088478088379, "learning_rate": 2.3791599999999997e-07, "loss": 0.3235, "step": 190600 }, { "epoch": 1.9064999999999999, "grad_norm": 37.48681640625, "learning_rate": 2.37716e-07, "loss": 0.4151, "step": 190650 }, { "epoch": 1.907, "grad_norm": 84.55889892578125, "learning_rate": 2.37516e-07, "loss": 0.3849, "step": 190700 }, { "epoch": 1.9075, "grad_norm": 13.987595558166504, "learning_rate": 2.3731599999999998e-07, "loss": 0.3796, "step": 190750 }, { "epoch": 1.908, "grad_norm": 63.06429672241211, "learning_rate": 2.37116e-07, "loss": 0.3547, "step": 190800 }, { "epoch": 1.9085, "grad_norm": 41.80989456176758, "learning_rate": 2.3691599999999998e-07, "loss": 0.5286, "step": 190850 }, { "epoch": 1.909, "grad_norm": 83.75984954833984, "learning_rate": 2.36716e-07, "loss": 0.3391, "step": 190900 }, { "epoch": 1.9095, "grad_norm": 51.583290100097656, "learning_rate": 2.36516e-07, "loss": 0.3597, "step": 190950 }, { "epoch": 1.9100000000000001, "grad_norm": 65.20770263671875, "learning_rate": 2.3631599999999998e-07, "loss": 0.5186, "step": 191000 }, { "epoch": 1.9104999999999999, "grad_norm": 123.32764434814453, "learning_rate": 2.36116e-07, "loss": 0.4346, "step": 191050 }, { "epoch": 1.911, "grad_norm": 41.057682037353516, "learning_rate": 2.3591599999999998e-07, "loss": 0.4873, "step": 191100 }, { "epoch": 1.9115, "grad_norm": 40.01293182373047, "learning_rate": 2.35716e-07, "loss": 0.4443, "step": 191150 }, { "epoch": 1.912, "grad_norm": 0.16099533438682556, "learning_rate": 2.35516e-07, "loss": 0.3952, "step": 191200 }, { "epoch": 1.9125, "grad_norm": 48.290771484375, "learning_rate": 2.35316e-07, "loss": 0.3768, "step": 191250 }, { "epoch": 1.913, "grad_norm": 0.869463324546814, "learning_rate": 2.3511599999999997e-07, "loss": 0.5064, "step": 191300 }, { "epoch": 1.9135, "grad_norm": 174.40245056152344, "learning_rate": 2.34916e-07, "loss": 0.4593, "step": 191350 }, { "epoch": 1.9140000000000001, "grad_norm": 64.9591293334961, "learning_rate": 2.34716e-07, "loss": 0.4874, "step": 191400 }, { "epoch": 1.9144999999999999, "grad_norm": 11.59118366241455, "learning_rate": 2.3451599999999998e-07, "loss": 0.5005, "step": 191450 }, { "epoch": 1.915, "grad_norm": 108.15178680419922, "learning_rate": 2.34316e-07, "loss": 0.5062, "step": 191500 }, { "epoch": 1.9155, "grad_norm": 0.83865886926651, "learning_rate": 2.3411599999999998e-07, "loss": 0.3885, "step": 191550 }, { "epoch": 1.916, "grad_norm": 85.95867919921875, "learning_rate": 2.3391600000000001e-07, "loss": 0.3974, "step": 191600 }, { "epoch": 1.9165, "grad_norm": 31.9445858001709, "learning_rate": 2.33716e-07, "loss": 0.34, "step": 191650 }, { "epoch": 1.917, "grad_norm": 2.4149677753448486, "learning_rate": 2.3351599999999998e-07, "loss": 0.6457, "step": 191700 }, { "epoch": 1.9175, "grad_norm": 18.962900161743164, "learning_rate": 2.33316e-07, "loss": 0.5091, "step": 191750 }, { "epoch": 1.9180000000000001, "grad_norm": 73.12571716308594, "learning_rate": 2.3311599999999998e-07, "loss": 0.4173, "step": 191800 }, { "epoch": 1.9184999999999999, "grad_norm": 5.990804195404053, "learning_rate": 2.32916e-07, "loss": 0.4436, "step": 191850 }, { "epoch": 1.919, "grad_norm": 70.40878295898438, "learning_rate": 2.32716e-07, "loss": 0.4218, "step": 191900 }, { "epoch": 1.9195, "grad_norm": 24.912353515625, "learning_rate": 2.32516e-07, "loss": 0.5248, "step": 191950 }, { "epoch": 1.92, "grad_norm": 24.189905166625977, "learning_rate": 2.3231599999999997e-07, "loss": 0.4552, "step": 192000 }, { "epoch": 1.9205, "grad_norm": 56.80104064941406, "learning_rate": 2.3211599999999999e-07, "loss": 0.5258, "step": 192050 }, { "epoch": 1.921, "grad_norm": 69.89954376220703, "learning_rate": 2.31916e-07, "loss": 0.4205, "step": 192100 }, { "epoch": 1.9215, "grad_norm": 72.8506851196289, "learning_rate": 2.3171599999999998e-07, "loss": 0.4343, "step": 192150 }, { "epoch": 1.9220000000000002, "grad_norm": 77.9870376586914, "learning_rate": 2.31516e-07, "loss": 0.3266, "step": 192200 }, { "epoch": 1.9224999999999999, "grad_norm": 128.5535430908203, "learning_rate": 2.3131599999999998e-07, "loss": 0.5274, "step": 192250 }, { "epoch": 1.923, "grad_norm": 34.5822868347168, "learning_rate": 2.31116e-07, "loss": 0.4293, "step": 192300 }, { "epoch": 1.9235, "grad_norm": 1.127679467201233, "learning_rate": 2.30916e-07, "loss": 0.5053, "step": 192350 }, { "epoch": 1.924, "grad_norm": 122.0609130859375, "learning_rate": 2.3071599999999999e-07, "loss": 0.4999, "step": 192400 }, { "epoch": 1.9245, "grad_norm": 4.369318962097168, "learning_rate": 2.30516e-07, "loss": 0.375, "step": 192450 }, { "epoch": 1.925, "grad_norm": 16.14723014831543, "learning_rate": 2.3031599999999998e-07, "loss": 0.4943, "step": 192500 }, { "epoch": 1.9255, "grad_norm": 9.635993957519531, "learning_rate": 2.3011599999999997e-07, "loss": 0.3829, "step": 192550 }, { "epoch": 1.9260000000000002, "grad_norm": 37.19770050048828, "learning_rate": 2.29916e-07, "loss": 0.441, "step": 192600 }, { "epoch": 1.9264999999999999, "grad_norm": 100.09040069580078, "learning_rate": 2.29716e-07, "loss": 0.4527, "step": 192650 }, { "epoch": 1.927, "grad_norm": 73.9195785522461, "learning_rate": 2.2951599999999998e-07, "loss": 0.4453, "step": 192700 }, { "epoch": 1.9275, "grad_norm": 5.231517314910889, "learning_rate": 2.29316e-07, "loss": 0.4418, "step": 192750 }, { "epoch": 1.928, "grad_norm": 86.26935577392578, "learning_rate": 2.2911599999999997e-07, "loss": 0.4282, "step": 192800 }, { "epoch": 1.9285, "grad_norm": 20.850133895874023, "learning_rate": 2.28916e-07, "loss": 0.5655, "step": 192850 }, { "epoch": 1.929, "grad_norm": 0.3520480692386627, "learning_rate": 2.28716e-07, "loss": 0.409, "step": 192900 }, { "epoch": 1.9295, "grad_norm": 3.128180503845215, "learning_rate": 2.2851599999999998e-07, "loss": 0.4842, "step": 192950 }, { "epoch": 1.9300000000000002, "grad_norm": 122.73941040039062, "learning_rate": 2.28316e-07, "loss": 0.4514, "step": 193000 }, { "epoch": 1.9304999999999999, "grad_norm": 9.219717979431152, "learning_rate": 2.28116e-07, "loss": 0.3849, "step": 193050 }, { "epoch": 1.931, "grad_norm": 12.25208854675293, "learning_rate": 2.27916e-07, "loss": 0.4238, "step": 193100 }, { "epoch": 1.9315, "grad_norm": 92.9471435546875, "learning_rate": 2.27716e-07, "loss": 0.3785, "step": 193150 }, { "epoch": 1.932, "grad_norm": 35.56284713745117, "learning_rate": 2.2751599999999998e-07, "loss": 0.4077, "step": 193200 }, { "epoch": 1.9325, "grad_norm": 85.1362075805664, "learning_rate": 2.2731599999999997e-07, "loss": 0.4885, "step": 193250 }, { "epoch": 1.933, "grad_norm": 17.76969337463379, "learning_rate": 2.2711999999999998e-07, "loss": 0.5319, "step": 193300 }, { "epoch": 1.9335, "grad_norm": 77.77349853515625, "learning_rate": 2.2692399999999997e-07, "loss": 0.6296, "step": 193350 }, { "epoch": 1.9340000000000002, "grad_norm": 1.1785967350006104, "learning_rate": 2.2672399999999999e-07, "loss": 0.4844, "step": 193400 }, { "epoch": 1.9344999999999999, "grad_norm": 36.741268157958984, "learning_rate": 2.26524e-07, "loss": 0.3688, "step": 193450 }, { "epoch": 1.935, "grad_norm": 83.8866195678711, "learning_rate": 2.2632399999999998e-07, "loss": 0.5385, "step": 193500 }, { "epoch": 1.9355, "grad_norm": 67.07383728027344, "learning_rate": 2.26124e-07, "loss": 0.3809, "step": 193550 }, { "epoch": 1.936, "grad_norm": 61.0508918762207, "learning_rate": 2.25924e-07, "loss": 0.362, "step": 193600 }, { "epoch": 1.9365, "grad_norm": 95.52999877929688, "learning_rate": 2.25724e-07, "loss": 0.4692, "step": 193650 }, { "epoch": 1.937, "grad_norm": 2.942073345184326, "learning_rate": 2.25524e-07, "loss": 0.4602, "step": 193700 }, { "epoch": 1.9375, "grad_norm": 80.84626770019531, "learning_rate": 2.2532399999999999e-07, "loss": 0.6496, "step": 193750 }, { "epoch": 1.938, "grad_norm": 15.654928207397461, "learning_rate": 2.2512399999999997e-07, "loss": 0.3803, "step": 193800 }, { "epoch": 1.9385, "grad_norm": 61.12825393676758, "learning_rate": 2.24924e-07, "loss": 0.4215, "step": 193850 }, { "epoch": 1.939, "grad_norm": 64.433837890625, "learning_rate": 2.24724e-07, "loss": 0.394, "step": 193900 }, { "epoch": 1.9395, "grad_norm": 7.063285827636719, "learning_rate": 2.2452399999999998e-07, "loss": 0.4053, "step": 193950 }, { "epoch": 1.94, "grad_norm": 67.11646270751953, "learning_rate": 2.24324e-07, "loss": 0.5138, "step": 194000 }, { "epoch": 1.9405000000000001, "grad_norm": 28.027524948120117, "learning_rate": 2.2412399999999998e-07, "loss": 0.3441, "step": 194050 }, { "epoch": 1.9409999999999998, "grad_norm": 124.71054077148438, "learning_rate": 2.2392400000000001e-07, "loss": 0.4387, "step": 194100 }, { "epoch": 1.9415, "grad_norm": 53.5966911315918, "learning_rate": 2.23724e-07, "loss": 0.3617, "step": 194150 }, { "epoch": 1.942, "grad_norm": 52.884151458740234, "learning_rate": 2.2352399999999998e-07, "loss": 0.5952, "step": 194200 }, { "epoch": 1.9425, "grad_norm": 5.888815879821777, "learning_rate": 2.23324e-07, "loss": 0.4729, "step": 194250 }, { "epoch": 1.943, "grad_norm": 50.12318420410156, "learning_rate": 2.2312399999999998e-07, "loss": 0.443, "step": 194300 }, { "epoch": 1.9435, "grad_norm": 53.48051452636719, "learning_rate": 2.22924e-07, "loss": 0.4987, "step": 194350 }, { "epoch": 1.944, "grad_norm": 40.23417663574219, "learning_rate": 2.22724e-07, "loss": 0.477, "step": 194400 }, { "epoch": 1.9445000000000001, "grad_norm": 0.8471763730049133, "learning_rate": 2.22524e-07, "loss": 0.5448, "step": 194450 }, { "epoch": 1.9449999999999998, "grad_norm": 3.5994434356689453, "learning_rate": 2.2232399999999997e-07, "loss": 0.3093, "step": 194500 }, { "epoch": 1.9455, "grad_norm": 88.96204376220703, "learning_rate": 2.2212399999999998e-07, "loss": 0.447, "step": 194550 }, { "epoch": 1.946, "grad_norm": 3.4328107833862305, "learning_rate": 2.21924e-07, "loss": 0.4485, "step": 194600 }, { "epoch": 1.9465, "grad_norm": 0.81305330991745, "learning_rate": 2.21724e-07, "loss": 0.3417, "step": 194650 }, { "epoch": 1.947, "grad_norm": 14.793620109558105, "learning_rate": 2.21524e-07, "loss": 0.2682, "step": 194700 }, { "epoch": 1.9475, "grad_norm": 71.23736572265625, "learning_rate": 2.2132399999999998e-07, "loss": 0.5289, "step": 194750 }, { "epoch": 1.948, "grad_norm": 89.07301330566406, "learning_rate": 2.21124e-07, "loss": 0.4109, "step": 194800 }, { "epoch": 1.9485000000000001, "grad_norm": 169.87252807617188, "learning_rate": 2.20924e-07, "loss": 0.5863, "step": 194850 }, { "epoch": 1.9489999999999998, "grad_norm": 107.96073150634766, "learning_rate": 2.2072399999999999e-07, "loss": 0.416, "step": 194900 }, { "epoch": 1.9495, "grad_norm": 10.294443130493164, "learning_rate": 2.20524e-07, "loss": 0.4665, "step": 194950 }, { "epoch": 1.95, "grad_norm": 62.89139175415039, "learning_rate": 2.2032399999999998e-07, "loss": 0.5773, "step": 195000 }, { "epoch": 1.9505, "grad_norm": 146.08712768554688, "learning_rate": 2.2012399999999997e-07, "loss": 0.5096, "step": 195050 }, { "epoch": 1.951, "grad_norm": 5.977169990539551, "learning_rate": 2.19924e-07, "loss": 0.4419, "step": 195100 }, { "epoch": 1.9515, "grad_norm": 34.73382568359375, "learning_rate": 2.19724e-07, "loss": 0.4073, "step": 195150 }, { "epoch": 1.952, "grad_norm": 111.31939697265625, "learning_rate": 2.1952399999999997e-07, "loss": 0.4008, "step": 195200 }, { "epoch": 1.9525000000000001, "grad_norm": 28.672367095947266, "learning_rate": 2.1932399999999999e-07, "loss": 0.3195, "step": 195250 }, { "epoch": 1.9529999999999998, "grad_norm": 51.09148025512695, "learning_rate": 2.19124e-07, "loss": 0.3417, "step": 195300 }, { "epoch": 1.9535, "grad_norm": 13.32286262512207, "learning_rate": 2.18924e-07, "loss": 0.539, "step": 195350 }, { "epoch": 1.954, "grad_norm": 89.55329895019531, "learning_rate": 2.18724e-07, "loss": 0.3611, "step": 195400 }, { "epoch": 1.9545, "grad_norm": 112.88272857666016, "learning_rate": 2.1852399999999998e-07, "loss": 0.496, "step": 195450 }, { "epoch": 1.955, "grad_norm": 34.98467254638672, "learning_rate": 2.18324e-07, "loss": 0.3647, "step": 195500 }, { "epoch": 1.9555, "grad_norm": 25.51811981201172, "learning_rate": 2.18124e-07, "loss": 0.2741, "step": 195550 }, { "epoch": 1.956, "grad_norm": 35.12691879272461, "learning_rate": 2.1792399999999999e-07, "loss": 0.5114, "step": 195600 }, { "epoch": 1.9565000000000001, "grad_norm": 92.44993591308594, "learning_rate": 2.17724e-07, "loss": 0.4415, "step": 195650 }, { "epoch": 1.9569999999999999, "grad_norm": 58.697418212890625, "learning_rate": 2.1752399999999998e-07, "loss": 0.5425, "step": 195700 }, { "epoch": 1.9575, "grad_norm": 70.51270294189453, "learning_rate": 2.1732399999999997e-07, "loss": 0.4504, "step": 195750 }, { "epoch": 1.958, "grad_norm": 100.30704498291016, "learning_rate": 2.17124e-07, "loss": 0.5119, "step": 195800 }, { "epoch": 1.9585, "grad_norm": 74.18003845214844, "learning_rate": 2.16924e-07, "loss": 0.3975, "step": 195850 }, { "epoch": 1.959, "grad_norm": 67.8548583984375, "learning_rate": 2.16724e-07, "loss": 0.3993, "step": 195900 }, { "epoch": 1.9595, "grad_norm": 25.4310359954834, "learning_rate": 2.16524e-07, "loss": 0.4111, "step": 195950 }, { "epoch": 1.96, "grad_norm": 3.258418321609497, "learning_rate": 2.1632399999999997e-07, "loss": 0.4154, "step": 196000 }, { "epoch": 1.9605000000000001, "grad_norm": 20.04966163635254, "learning_rate": 2.16124e-07, "loss": 0.4066, "step": 196050 }, { "epoch": 1.9609999999999999, "grad_norm": 45.66105651855469, "learning_rate": 2.15924e-07, "loss": 0.503, "step": 196100 }, { "epoch": 1.9615, "grad_norm": 46.578895568847656, "learning_rate": 2.1572399999999998e-07, "loss": 0.4432, "step": 196150 }, { "epoch": 1.962, "grad_norm": 85.224609375, "learning_rate": 2.15524e-07, "loss": 0.5397, "step": 196200 }, { "epoch": 1.9625, "grad_norm": 2.3414793014526367, "learning_rate": 2.1532399999999998e-07, "loss": 0.3125, "step": 196250 }, { "epoch": 1.963, "grad_norm": 4.292512893676758, "learning_rate": 2.15124e-07, "loss": 0.4941, "step": 196300 }, { "epoch": 1.9635, "grad_norm": 67.12167358398438, "learning_rate": 2.14924e-07, "loss": 0.4186, "step": 196350 }, { "epoch": 1.964, "grad_norm": 74.41678619384766, "learning_rate": 2.1472399999999998e-07, "loss": 0.4799, "step": 196400 }, { "epoch": 1.9645000000000001, "grad_norm": 55.400428771972656, "learning_rate": 2.14524e-07, "loss": 0.3932, "step": 196450 }, { "epoch": 1.9649999999999999, "grad_norm": 89.4749526977539, "learning_rate": 2.1432399999999998e-07, "loss": 0.409, "step": 196500 }, { "epoch": 1.9655, "grad_norm": 41.83777618408203, "learning_rate": 2.14124e-07, "loss": 0.3961, "step": 196550 }, { "epoch": 1.966, "grad_norm": 46.31528854370117, "learning_rate": 2.13924e-07, "loss": 0.5221, "step": 196600 }, { "epoch": 1.9665, "grad_norm": 64.41300964355469, "learning_rate": 2.13724e-07, "loss": 0.3646, "step": 196650 }, { "epoch": 1.967, "grad_norm": 73.85462951660156, "learning_rate": 2.1352399999999997e-07, "loss": 0.496, "step": 196700 }, { "epoch": 1.9675, "grad_norm": 15.591062545776367, "learning_rate": 2.13324e-07, "loss": 0.447, "step": 196750 }, { "epoch": 1.968, "grad_norm": 48.92272186279297, "learning_rate": 2.13124e-07, "loss": 0.4293, "step": 196800 }, { "epoch": 1.9685000000000001, "grad_norm": 1.0661996603012085, "learning_rate": 2.1292399999999998e-07, "loss": 0.4461, "step": 196850 }, { "epoch": 1.9689999999999999, "grad_norm": 3.930109739303589, "learning_rate": 2.12724e-07, "loss": 0.5336, "step": 196900 }, { "epoch": 1.9695, "grad_norm": 59.66569519042969, "learning_rate": 2.1252399999999998e-07, "loss": 0.4064, "step": 196950 }, { "epoch": 1.97, "grad_norm": 59.22267532348633, "learning_rate": 2.1232400000000002e-07, "loss": 0.4626, "step": 197000 }, { "epoch": 1.9705, "grad_norm": 61.249202728271484, "learning_rate": 2.12124e-07, "loss": 0.3603, "step": 197050 }, { "epoch": 1.971, "grad_norm": 8.615995407104492, "learning_rate": 2.1192399999999999e-07, "loss": 0.4088, "step": 197100 }, { "epoch": 1.9715, "grad_norm": 74.69889831542969, "learning_rate": 2.11724e-07, "loss": 0.4601, "step": 197150 }, { "epoch": 1.972, "grad_norm": 70.49463653564453, "learning_rate": 2.1152399999999998e-07, "loss": 0.4344, "step": 197200 }, { "epoch": 1.9725000000000001, "grad_norm": 62.61591339111328, "learning_rate": 2.11324e-07, "loss": 0.5932, "step": 197250 }, { "epoch": 1.9729999999999999, "grad_norm": 25.135278701782227, "learning_rate": 2.11124e-07, "loss": 0.4321, "step": 197300 }, { "epoch": 1.9735, "grad_norm": 1.134199619293213, "learning_rate": 2.10924e-07, "loss": 0.2776, "step": 197350 }, { "epoch": 1.974, "grad_norm": 5.215174198150635, "learning_rate": 2.1072399999999998e-07, "loss": 0.3703, "step": 197400 }, { "epoch": 1.9745, "grad_norm": 73.48294830322266, "learning_rate": 2.1052399999999999e-07, "loss": 0.5626, "step": 197450 }, { "epoch": 1.975, "grad_norm": 136.4009552001953, "learning_rate": 2.10324e-07, "loss": 0.3805, "step": 197500 }, { "epoch": 1.9755, "grad_norm": 79.76919555664062, "learning_rate": 2.1012399999999998e-07, "loss": 0.4289, "step": 197550 }, { "epoch": 1.976, "grad_norm": 70.65571594238281, "learning_rate": 2.09924e-07, "loss": 0.3265, "step": 197600 }, { "epoch": 1.9765000000000001, "grad_norm": 83.61393737792969, "learning_rate": 2.0972399999999998e-07, "loss": 0.3778, "step": 197650 }, { "epoch": 1.9769999999999999, "grad_norm": 0.6025605797767639, "learning_rate": 2.09524e-07, "loss": 0.4016, "step": 197700 }, { "epoch": 1.9775, "grad_norm": 3.0272040367126465, "learning_rate": 2.0932799999999998e-07, "loss": 0.4813, "step": 197750 }, { "epoch": 1.978, "grad_norm": 60.68215560913086, "learning_rate": 2.09128e-07, "loss": 0.353, "step": 197800 }, { "epoch": 1.9785, "grad_norm": 140.40936279296875, "learning_rate": 2.08928e-07, "loss": 0.4967, "step": 197850 }, { "epoch": 1.979, "grad_norm": 81.03306579589844, "learning_rate": 2.08728e-07, "loss": 0.6082, "step": 197900 }, { "epoch": 1.9795, "grad_norm": 4.013239860534668, "learning_rate": 2.0852799999999997e-07, "loss": 0.4588, "step": 197950 }, { "epoch": 1.98, "grad_norm": 95.69232940673828, "learning_rate": 2.08328e-07, "loss": 0.4396, "step": 198000 }, { "epoch": 1.9805000000000001, "grad_norm": 91.35757446289062, "learning_rate": 2.08128e-07, "loss": 0.4297, "step": 198050 }, { "epoch": 1.9809999999999999, "grad_norm": 61.077537536621094, "learning_rate": 2.0792799999999998e-07, "loss": 0.4121, "step": 198100 }, { "epoch": 1.9815, "grad_norm": 91.7123031616211, "learning_rate": 2.07728e-07, "loss": 0.3933, "step": 198150 }, { "epoch": 1.982, "grad_norm": 58.250064849853516, "learning_rate": 2.0752799999999998e-07, "loss": 0.5652, "step": 198200 }, { "epoch": 1.9825, "grad_norm": 4.037406921386719, "learning_rate": 2.0732800000000002e-07, "loss": 0.4317, "step": 198250 }, { "epoch": 1.983, "grad_norm": 14.497562408447266, "learning_rate": 2.07128e-07, "loss": 0.3503, "step": 198300 }, { "epoch": 1.9835, "grad_norm": 15.300369262695312, "learning_rate": 2.0692799999999999e-07, "loss": 0.4021, "step": 198350 }, { "epoch": 1.984, "grad_norm": 11.134997367858887, "learning_rate": 2.06728e-07, "loss": 0.45, "step": 198400 }, { "epoch": 1.9845000000000002, "grad_norm": 17.25223731994629, "learning_rate": 2.0652799999999998e-07, "loss": 0.302, "step": 198450 }, { "epoch": 1.9849999999999999, "grad_norm": 50.793540954589844, "learning_rate": 2.06328e-07, "loss": 0.4223, "step": 198500 }, { "epoch": 1.9855, "grad_norm": 1.6654607057571411, "learning_rate": 2.06128e-07, "loss": 0.4057, "step": 198550 }, { "epoch": 1.986, "grad_norm": 15.111828804016113, "learning_rate": 2.05928e-07, "loss": 0.2903, "step": 198600 }, { "epoch": 1.9865, "grad_norm": 60.819488525390625, "learning_rate": 2.0572799999999997e-07, "loss": 0.3924, "step": 198650 }, { "epoch": 1.987, "grad_norm": 58.338111877441406, "learning_rate": 2.0552799999999999e-07, "loss": 0.4706, "step": 198700 }, { "epoch": 1.9875, "grad_norm": 4.499456882476807, "learning_rate": 2.05328e-07, "loss": 0.4668, "step": 198750 }, { "epoch": 1.988, "grad_norm": 26.65963363647461, "learning_rate": 2.0512799999999998e-07, "loss": 0.464, "step": 198800 }, { "epoch": 1.9885000000000002, "grad_norm": 33.08699035644531, "learning_rate": 2.04928e-07, "loss": 0.4391, "step": 198850 }, { "epoch": 1.9889999999999999, "grad_norm": 25.592788696289062, "learning_rate": 2.0472799999999998e-07, "loss": 0.3808, "step": 198900 }, { "epoch": 1.9895, "grad_norm": 58.27476501464844, "learning_rate": 2.04528e-07, "loss": 0.5133, "step": 198950 }, { "epoch": 1.99, "grad_norm": 133.251220703125, "learning_rate": 2.04328e-07, "loss": 0.5155, "step": 199000 }, { "epoch": 1.9905, "grad_norm": 115.85850524902344, "learning_rate": 2.0412799999999999e-07, "loss": 0.3483, "step": 199050 }, { "epoch": 1.991, "grad_norm": 6.595485687255859, "learning_rate": 2.03928e-07, "loss": 0.449, "step": 199100 }, { "epoch": 1.9915, "grad_norm": 19.415233612060547, "learning_rate": 2.0372799999999998e-07, "loss": 0.5, "step": 199150 }, { "epoch": 1.992, "grad_norm": 36.67066192626953, "learning_rate": 2.0352799999999997e-07, "loss": 0.3246, "step": 199200 }, { "epoch": 1.9925000000000002, "grad_norm": 82.13230895996094, "learning_rate": 2.03328e-07, "loss": 0.442, "step": 199250 }, { "epoch": 1.9929999999999999, "grad_norm": 29.641822814941406, "learning_rate": 2.03128e-07, "loss": 0.3297, "step": 199300 }, { "epoch": 1.9935, "grad_norm": 78.03038787841797, "learning_rate": 2.0292799999999998e-07, "loss": 0.5838, "step": 199350 }, { "epoch": 1.994, "grad_norm": 14.143548011779785, "learning_rate": 2.02728e-07, "loss": 0.5141, "step": 199400 }, { "epoch": 1.9945, "grad_norm": 124.76140594482422, "learning_rate": 2.02528e-07, "loss": 0.5651, "step": 199450 }, { "epoch": 1.995, "grad_norm": 27.242807388305664, "learning_rate": 2.02328e-07, "loss": 0.4477, "step": 199500 }, { "epoch": 1.9955, "grad_norm": 55.86709976196289, "learning_rate": 2.02128e-07, "loss": 0.5458, "step": 199550 }, { "epoch": 1.996, "grad_norm": 61.482025146484375, "learning_rate": 2.0192799999999998e-07, "loss": 0.42, "step": 199600 }, { "epoch": 1.9965000000000002, "grad_norm": 21.021808624267578, "learning_rate": 2.01728e-07, "loss": 0.4657, "step": 199650 }, { "epoch": 1.9969999999999999, "grad_norm": 65.79966735839844, "learning_rate": 2.01528e-07, "loss": 0.4816, "step": 199700 }, { "epoch": 1.9975, "grad_norm": 74.81526947021484, "learning_rate": 2.01328e-07, "loss": 0.5426, "step": 199750 }, { "epoch": 1.998, "grad_norm": 78.884765625, "learning_rate": 2.01128e-07, "loss": 0.4315, "step": 199800 }, { "epoch": 1.9985, "grad_norm": 10.851948738098145, "learning_rate": 2.0092799999999998e-07, "loss": 0.4302, "step": 199850 }, { "epoch": 1.999, "grad_norm": 3.022129774093628, "learning_rate": 2.0072799999999997e-07, "loss": 0.3712, "step": 199900 }, { "epoch": 1.9995, "grad_norm": 24.550790786743164, "learning_rate": 2.00528e-07, "loss": 0.4664, "step": 199950 }, { "epoch": 2.0, "grad_norm": 14.006537437438965, "learning_rate": 2.00328e-07, "loss": 0.3405, "step": 200000 }, { "epoch": 2.0005, "grad_norm": 3.087618827819824, "learning_rate": 2.00128e-07, "loss": 0.4074, "step": 200050 }, { "epoch": 2.001, "grad_norm": 1.2099264860153198, "learning_rate": 1.99928e-07, "loss": 0.4809, "step": 200100 }, { "epoch": 2.0015, "grad_norm": 3.2248778343200684, "learning_rate": 1.9972799999999997e-07, "loss": 0.3725, "step": 200150 }, { "epoch": 2.002, "grad_norm": 39.72489929199219, "learning_rate": 1.99528e-07, "loss": 0.3364, "step": 200200 }, { "epoch": 2.0025, "grad_norm": 79.86190032958984, "learning_rate": 1.99328e-07, "loss": 0.3408, "step": 200250 }, { "epoch": 2.003, "grad_norm": 45.70669174194336, "learning_rate": 1.9912799999999998e-07, "loss": 0.4011, "step": 200300 }, { "epoch": 2.0035, "grad_norm": 80.72676849365234, "learning_rate": 1.98928e-07, "loss": 0.5005, "step": 200350 }, { "epoch": 2.004, "grad_norm": 54.08387756347656, "learning_rate": 1.9872799999999998e-07, "loss": 0.552, "step": 200400 }, { "epoch": 2.0045, "grad_norm": 14.636670112609863, "learning_rate": 1.98528e-07, "loss": 0.3083, "step": 200450 }, { "epoch": 2.005, "grad_norm": 65.19412994384766, "learning_rate": 1.98328e-07, "loss": 0.419, "step": 200500 }, { "epoch": 2.0055, "grad_norm": 18.607166290283203, "learning_rate": 1.9812799999999999e-07, "loss": 0.3372, "step": 200550 }, { "epoch": 2.006, "grad_norm": 5.587798595428467, "learning_rate": 1.9792799999999997e-07, "loss": 0.3789, "step": 200600 }, { "epoch": 2.0065, "grad_norm": 9.782854080200195, "learning_rate": 1.9772799999999998e-07, "loss": 0.3832, "step": 200650 }, { "epoch": 2.007, "grad_norm": 92.8857650756836, "learning_rate": 1.97528e-07, "loss": 0.3575, "step": 200700 }, { "epoch": 2.0075, "grad_norm": 157.7519073486328, "learning_rate": 1.97332e-07, "loss": 0.5908, "step": 200750 }, { "epoch": 2.008, "grad_norm": 103.3365478515625, "learning_rate": 1.97132e-07, "loss": 0.3179, "step": 200800 }, { "epoch": 2.0085, "grad_norm": 42.684906005859375, "learning_rate": 1.9693199999999998e-07, "loss": 0.3399, "step": 200850 }, { "epoch": 2.009, "grad_norm": 2.2768945693969727, "learning_rate": 1.96732e-07, "loss": 0.3791, "step": 200900 }, { "epoch": 2.0095, "grad_norm": 8.21125316619873, "learning_rate": 1.96532e-07, "loss": 0.4884, "step": 200950 }, { "epoch": 2.01, "grad_norm": 13.066143035888672, "learning_rate": 1.96332e-07, "loss": 0.5262, "step": 201000 }, { "epoch": 2.0105, "grad_norm": 62.395328521728516, "learning_rate": 1.96132e-07, "loss": 0.3039, "step": 201050 }, { "epoch": 2.011, "grad_norm": 1.7823199033737183, "learning_rate": 1.9593199999999998e-07, "loss": 0.5953, "step": 201100 }, { "epoch": 2.0115, "grad_norm": 84.18726348876953, "learning_rate": 1.9573199999999997e-07, "loss": 0.4766, "step": 201150 }, { "epoch": 2.012, "grad_norm": 119.35281372070312, "learning_rate": 1.95532e-07, "loss": 0.4257, "step": 201200 }, { "epoch": 2.0125, "grad_norm": 24.274431228637695, "learning_rate": 1.95332e-07, "loss": 0.5276, "step": 201250 }, { "epoch": 2.013, "grad_norm": 28.234947204589844, "learning_rate": 1.95132e-07, "loss": 0.4958, "step": 201300 }, { "epoch": 2.0135, "grad_norm": 36.3571891784668, "learning_rate": 1.94932e-07, "loss": 0.3327, "step": 201350 }, { "epoch": 2.014, "grad_norm": 70.78429412841797, "learning_rate": 1.9473199999999997e-07, "loss": 0.3493, "step": 201400 }, { "epoch": 2.0145, "grad_norm": 67.98007202148438, "learning_rate": 1.94532e-07, "loss": 0.5261, "step": 201450 }, { "epoch": 2.015, "grad_norm": 2.617002487182617, "learning_rate": 1.94332e-07, "loss": 0.3285, "step": 201500 }, { "epoch": 2.0155, "grad_norm": 27.248516082763672, "learning_rate": 1.9413199999999998e-07, "loss": 0.4242, "step": 201550 }, { "epoch": 2.016, "grad_norm": 85.0505142211914, "learning_rate": 1.93932e-07, "loss": 0.3836, "step": 201600 }, { "epoch": 2.0165, "grad_norm": 14.559182167053223, "learning_rate": 1.9373199999999998e-07, "loss": 0.31, "step": 201650 }, { "epoch": 2.017, "grad_norm": 82.49097442626953, "learning_rate": 1.93532e-07, "loss": 0.3404, "step": 201700 }, { "epoch": 2.0175, "grad_norm": 46.396629333496094, "learning_rate": 1.93332e-07, "loss": 0.3666, "step": 201750 }, { "epoch": 2.018, "grad_norm": 44.75120162963867, "learning_rate": 1.9313199999999999e-07, "loss": 0.4974, "step": 201800 }, { "epoch": 2.0185, "grad_norm": 6.472919464111328, "learning_rate": 1.92932e-07, "loss": 0.2974, "step": 201850 }, { "epoch": 2.019, "grad_norm": 41.595184326171875, "learning_rate": 1.9273199999999998e-07, "loss": 0.3399, "step": 201900 }, { "epoch": 2.0195, "grad_norm": 119.87677001953125, "learning_rate": 1.92532e-07, "loss": 0.3771, "step": 201950 }, { "epoch": 2.02, "grad_norm": 85.57170104980469, "learning_rate": 1.92336e-07, "loss": 0.4158, "step": 202000 }, { "epoch": 2.0205, "grad_norm": 29.789093017578125, "learning_rate": 1.92136e-07, "loss": 0.4293, "step": 202050 }, { "epoch": 2.021, "grad_norm": 83.55206298828125, "learning_rate": 1.9193599999999998e-07, "loss": 0.4268, "step": 202100 }, { "epoch": 2.0215, "grad_norm": 11.1190824508667, "learning_rate": 1.91736e-07, "loss": 0.4764, "step": 202150 }, { "epoch": 2.022, "grad_norm": 36.4733772277832, "learning_rate": 1.91536e-07, "loss": 0.4419, "step": 202200 }, { "epoch": 2.0225, "grad_norm": 73.61580657958984, "learning_rate": 1.91336e-07, "loss": 0.4973, "step": 202250 }, { "epoch": 2.023, "grad_norm": 46.41292190551758, "learning_rate": 1.91136e-07, "loss": 0.4368, "step": 202300 }, { "epoch": 2.0235, "grad_norm": 85.7939224243164, "learning_rate": 1.9093599999999998e-07, "loss": 0.2774, "step": 202350 }, { "epoch": 2.024, "grad_norm": 5.079582214355469, "learning_rate": 1.9073599999999997e-07, "loss": 0.3701, "step": 202400 }, { "epoch": 2.0245, "grad_norm": 16.522323608398438, "learning_rate": 1.90536e-07, "loss": 0.4191, "step": 202450 }, { "epoch": 2.025, "grad_norm": 44.48244857788086, "learning_rate": 1.90336e-07, "loss": 0.4953, "step": 202500 }, { "epoch": 2.0255, "grad_norm": 46.651466369628906, "learning_rate": 1.90136e-07, "loss": 0.372, "step": 202550 }, { "epoch": 2.026, "grad_norm": 8.931777000427246, "learning_rate": 1.89936e-07, "loss": 0.4437, "step": 202600 }, { "epoch": 2.0265, "grad_norm": 11.248937606811523, "learning_rate": 1.8973599999999997e-07, "loss": 0.4143, "step": 202650 }, { "epoch": 2.027, "grad_norm": 65.09107208251953, "learning_rate": 1.89536e-07, "loss": 0.3685, "step": 202700 }, { "epoch": 2.0275, "grad_norm": 15.731978416442871, "learning_rate": 1.89336e-07, "loss": 0.5076, "step": 202750 }, { "epoch": 2.028, "grad_norm": 117.5165023803711, "learning_rate": 1.8913599999999998e-07, "loss": 0.4142, "step": 202800 }, { "epoch": 2.0285, "grad_norm": 8.751471519470215, "learning_rate": 1.88936e-07, "loss": 0.3654, "step": 202850 }, { "epoch": 2.029, "grad_norm": 85.83335876464844, "learning_rate": 1.8873599999999998e-07, "loss": 0.4755, "step": 202900 }, { "epoch": 2.0295, "grad_norm": 11.942030906677246, "learning_rate": 1.88536e-07, "loss": 0.4555, "step": 202950 }, { "epoch": 2.03, "grad_norm": 2.0720551013946533, "learning_rate": 1.88336e-07, "loss": 0.3455, "step": 203000 }, { "epoch": 2.0305, "grad_norm": 27.060739517211914, "learning_rate": 1.8813599999999998e-07, "loss": 0.4064, "step": 203050 }, { "epoch": 2.031, "grad_norm": 12.625927925109863, "learning_rate": 1.87936e-07, "loss": 0.4275, "step": 203100 }, { "epoch": 2.0315, "grad_norm": 16.742694854736328, "learning_rate": 1.8773599999999998e-07, "loss": 0.4582, "step": 203150 }, { "epoch": 2.032, "grad_norm": 77.07002258300781, "learning_rate": 1.87536e-07, "loss": 0.3919, "step": 203200 }, { "epoch": 2.0325, "grad_norm": 64.95145416259766, "learning_rate": 1.87336e-07, "loss": 0.4802, "step": 203250 }, { "epoch": 2.033, "grad_norm": 124.8387680053711, "learning_rate": 1.87136e-07, "loss": 0.5289, "step": 203300 }, { "epoch": 2.0335, "grad_norm": 38.79643630981445, "learning_rate": 1.8693599999999997e-07, "loss": 0.3996, "step": 203350 }, { "epoch": 2.034, "grad_norm": 64.64935302734375, "learning_rate": 1.86736e-07, "loss": 0.3993, "step": 203400 }, { "epoch": 2.0345, "grad_norm": 62.82065963745117, "learning_rate": 1.86536e-07, "loss": 0.5019, "step": 203450 }, { "epoch": 2.035, "grad_norm": 1.8743690252304077, "learning_rate": 1.8633599999999998e-07, "loss": 0.3265, "step": 203500 }, { "epoch": 2.0355, "grad_norm": 136.758056640625, "learning_rate": 1.86136e-07, "loss": 0.3709, "step": 203550 }, { "epoch": 2.036, "grad_norm": 93.76640319824219, "learning_rate": 1.8593599999999998e-07, "loss": 0.4507, "step": 203600 }, { "epoch": 2.0365, "grad_norm": 2.7972347736358643, "learning_rate": 1.8573600000000002e-07, "loss": 0.4523, "step": 203650 }, { "epoch": 2.037, "grad_norm": 12.453752517700195, "learning_rate": 1.85536e-07, "loss": 0.2875, "step": 203700 }, { "epoch": 2.0375, "grad_norm": 77.1436996459961, "learning_rate": 1.8533599999999999e-07, "loss": 0.3493, "step": 203750 }, { "epoch": 2.038, "grad_norm": 5.663081645965576, "learning_rate": 1.85136e-07, "loss": 0.4532, "step": 203800 }, { "epoch": 2.0385, "grad_norm": 74.7771224975586, "learning_rate": 1.8493599999999998e-07, "loss": 0.3986, "step": 203850 }, { "epoch": 2.039, "grad_norm": 170.24021911621094, "learning_rate": 1.84736e-07, "loss": 0.5018, "step": 203900 }, { "epoch": 2.0395, "grad_norm": 4.51198148727417, "learning_rate": 1.84536e-07, "loss": 0.3477, "step": 203950 }, { "epoch": 2.04, "grad_norm": 4.396271228790283, "learning_rate": 1.84336e-07, "loss": 0.4144, "step": 204000 }, { "epoch": 2.0405, "grad_norm": 38.89625549316406, "learning_rate": 1.8413599999999998e-07, "loss": 0.4672, "step": 204050 }, { "epoch": 2.041, "grad_norm": 63.771202087402344, "learning_rate": 1.83936e-07, "loss": 0.4361, "step": 204100 }, { "epoch": 2.0415, "grad_norm": 85.22208404541016, "learning_rate": 1.83736e-07, "loss": 0.3421, "step": 204150 }, { "epoch": 2.042, "grad_norm": 16.77423095703125, "learning_rate": 1.8353599999999998e-07, "loss": 0.4003, "step": 204200 }, { "epoch": 2.0425, "grad_norm": 6.526354789733887, "learning_rate": 1.83336e-07, "loss": 0.4882, "step": 204250 }, { "epoch": 2.043, "grad_norm": 21.03365135192871, "learning_rate": 1.8313599999999998e-07, "loss": 0.4461, "step": 204300 }, { "epoch": 2.0435, "grad_norm": 99.76431274414062, "learning_rate": 1.82936e-07, "loss": 0.3903, "step": 204350 }, { "epoch": 2.044, "grad_norm": 40.31965637207031, "learning_rate": 1.82736e-07, "loss": 0.4469, "step": 204400 }, { "epoch": 2.0445, "grad_norm": 32.08421325683594, "learning_rate": 1.82536e-07, "loss": 0.285, "step": 204450 }, { "epoch": 2.045, "grad_norm": 106.2055892944336, "learning_rate": 1.82336e-07, "loss": 0.4367, "step": 204500 }, { "epoch": 2.0455, "grad_norm": 81.09613037109375, "learning_rate": 1.8213599999999998e-07, "loss": 0.5206, "step": 204550 }, { "epoch": 2.046, "grad_norm": 74.99372863769531, "learning_rate": 1.8193599999999997e-07, "loss": 0.4927, "step": 204600 }, { "epoch": 2.0465, "grad_norm": 1.6157112121582031, "learning_rate": 1.81736e-07, "loss": 0.463, "step": 204650 }, { "epoch": 2.047, "grad_norm": 95.24854278564453, "learning_rate": 1.81536e-07, "loss": 0.401, "step": 204700 }, { "epoch": 2.0475, "grad_norm": 36.316490173339844, "learning_rate": 1.8133599999999998e-07, "loss": 0.4379, "step": 204750 }, { "epoch": 2.048, "grad_norm": 134.88839721679688, "learning_rate": 1.81136e-07, "loss": 0.3477, "step": 204800 }, { "epoch": 2.0485, "grad_norm": 0.8851105570793152, "learning_rate": 1.80936e-07, "loss": 0.4119, "step": 204850 }, { "epoch": 2.049, "grad_norm": 0.5529405474662781, "learning_rate": 1.80736e-07, "loss": 0.3685, "step": 204900 }, { "epoch": 2.0495, "grad_norm": 2.562713146209717, "learning_rate": 1.80536e-07, "loss": 0.4975, "step": 204950 }, { "epoch": 2.05, "grad_norm": 90.19808197021484, "learning_rate": 1.8033599999999998e-07, "loss": 0.5049, "step": 205000 }, { "epoch": 2.0505, "grad_norm": 39.76176071166992, "learning_rate": 1.80136e-07, "loss": 0.4066, "step": 205050 }, { "epoch": 2.051, "grad_norm": 41.6057014465332, "learning_rate": 1.79936e-07, "loss": 0.5315, "step": 205100 }, { "epoch": 2.0515, "grad_norm": 2.7808868885040283, "learning_rate": 1.79736e-07, "loss": 0.3613, "step": 205150 }, { "epoch": 2.052, "grad_norm": 82.41030883789062, "learning_rate": 1.79536e-07, "loss": 0.407, "step": 205200 }, { "epoch": 2.0525, "grad_norm": 60.45346450805664, "learning_rate": 1.7933599999999999e-07, "loss": 0.529, "step": 205250 }, { "epoch": 2.053, "grad_norm": 41.536399841308594, "learning_rate": 1.7913599999999997e-07, "loss": 0.5233, "step": 205300 }, { "epoch": 2.0535, "grad_norm": 18.10399055480957, "learning_rate": 1.78936e-07, "loss": 0.3914, "step": 205350 }, { "epoch": 2.054, "grad_norm": 2.1177046298980713, "learning_rate": 1.78736e-07, "loss": 0.4353, "step": 205400 }, { "epoch": 2.0545, "grad_norm": 2.495558500289917, "learning_rate": 1.78536e-07, "loss": 0.3869, "step": 205450 }, { "epoch": 2.055, "grad_norm": 30.129528045654297, "learning_rate": 1.78336e-07, "loss": 0.4063, "step": 205500 }, { "epoch": 2.0555, "grad_norm": 85.46407318115234, "learning_rate": 1.7813599999999997e-07, "loss": 0.548, "step": 205550 }, { "epoch": 2.056, "grad_norm": 63.010189056396484, "learning_rate": 1.77936e-07, "loss": 0.4037, "step": 205600 }, { "epoch": 2.0565, "grad_norm": 23.553878784179688, "learning_rate": 1.77736e-07, "loss": 0.4054, "step": 205650 }, { "epoch": 2.057, "grad_norm": 61.06326675415039, "learning_rate": 1.7753599999999998e-07, "loss": 0.3369, "step": 205700 }, { "epoch": 2.0575, "grad_norm": 72.79309844970703, "learning_rate": 1.77336e-07, "loss": 0.4785, "step": 205750 }, { "epoch": 2.058, "grad_norm": 83.79118347167969, "learning_rate": 1.7713599999999998e-07, "loss": 0.4321, "step": 205800 }, { "epoch": 2.0585, "grad_norm": 91.41462707519531, "learning_rate": 1.76936e-07, "loss": 0.3816, "step": 205850 }, { "epoch": 2.059, "grad_norm": 0.9627030491828918, "learning_rate": 1.76736e-07, "loss": 0.3933, "step": 205900 }, { "epoch": 2.0595, "grad_norm": 76.58192443847656, "learning_rate": 1.76536e-07, "loss": 0.4329, "step": 205950 }, { "epoch": 2.06, "grad_norm": 92.036376953125, "learning_rate": 1.7633599999999997e-07, "loss": 0.4328, "step": 206000 }, { "epoch": 2.0605, "grad_norm": 16.09295654296875, "learning_rate": 1.7613599999999998e-07, "loss": 0.4057, "step": 206050 }, { "epoch": 2.061, "grad_norm": 20.907642364501953, "learning_rate": 1.75936e-07, "loss": 0.3151, "step": 206100 }, { "epoch": 2.0615, "grad_norm": 125.13389587402344, "learning_rate": 1.75736e-07, "loss": 0.5345, "step": 206150 }, { "epoch": 2.062, "grad_norm": 10.434032440185547, "learning_rate": 1.75536e-07, "loss": 0.4701, "step": 206200 }, { "epoch": 2.0625, "grad_norm": 88.18071746826172, "learning_rate": 1.7533599999999998e-07, "loss": 0.6219, "step": 206250 }, { "epoch": 2.063, "grad_norm": 107.26880645751953, "learning_rate": 1.75136e-07, "loss": 0.4287, "step": 206300 }, { "epoch": 2.0635, "grad_norm": 64.41120147705078, "learning_rate": 1.74936e-07, "loss": 0.467, "step": 206350 }, { "epoch": 2.064, "grad_norm": 71.87296295166016, "learning_rate": 1.7473599999999998e-07, "loss": 0.5102, "step": 206400 }, { "epoch": 2.0645, "grad_norm": 4.165521621704102, "learning_rate": 1.74536e-07, "loss": 0.5447, "step": 206450 }, { "epoch": 2.065, "grad_norm": 10.817039489746094, "learning_rate": 1.7433599999999998e-07, "loss": 0.415, "step": 206500 }, { "epoch": 2.0655, "grad_norm": 132.01255798339844, "learning_rate": 1.7413600000000002e-07, "loss": 0.4636, "step": 206550 }, { "epoch": 2.066, "grad_norm": 4.783523082733154, "learning_rate": 1.73936e-07, "loss": 0.3517, "step": 206600 }, { "epoch": 2.0665, "grad_norm": 91.72089385986328, "learning_rate": 1.73736e-07, "loss": 0.3424, "step": 206650 }, { "epoch": 2.067, "grad_norm": 1.3586903810501099, "learning_rate": 1.73536e-07, "loss": 0.3763, "step": 206700 }, { "epoch": 2.0675, "grad_norm": 119.87760162353516, "learning_rate": 1.7333599999999998e-07, "loss": 0.4029, "step": 206750 }, { "epoch": 2.068, "grad_norm": 74.30902099609375, "learning_rate": 1.73136e-07, "loss": 0.4613, "step": 206800 }, { "epoch": 2.0685000000000002, "grad_norm": 88.31536865234375, "learning_rate": 1.72936e-07, "loss": 0.4587, "step": 206850 }, { "epoch": 2.069, "grad_norm": 30.35822105407715, "learning_rate": 1.72736e-07, "loss": 0.3296, "step": 206900 }, { "epoch": 2.0695, "grad_norm": 68.1887435913086, "learning_rate": 1.7253599999999998e-07, "loss": 0.3588, "step": 206950 }, { "epoch": 2.07, "grad_norm": 24.027542114257812, "learning_rate": 1.72336e-07, "loss": 0.435, "step": 207000 }, { "epoch": 2.0705, "grad_norm": 1.4884693622589111, "learning_rate": 1.72136e-07, "loss": 0.3958, "step": 207050 }, { "epoch": 2.071, "grad_norm": 76.33940887451172, "learning_rate": 1.7193599999999999e-07, "loss": 0.4019, "step": 207100 }, { "epoch": 2.0715, "grad_norm": 74.91709899902344, "learning_rate": 1.71736e-07, "loss": 0.4541, "step": 207150 }, { "epoch": 2.072, "grad_norm": 91.8819808959961, "learning_rate": 1.7153599999999998e-07, "loss": 0.5439, "step": 207200 }, { "epoch": 2.0725, "grad_norm": 6.123153209686279, "learning_rate": 1.71336e-07, "loss": 0.4287, "step": 207250 }, { "epoch": 2.073, "grad_norm": 0.9759462475776672, "learning_rate": 1.71136e-07, "loss": 0.374, "step": 207300 }, { "epoch": 2.0735, "grad_norm": 15.24844741821289, "learning_rate": 1.70936e-07, "loss": 0.3756, "step": 207350 }, { "epoch": 2.074, "grad_norm": 6.204489707946777, "learning_rate": 1.70736e-07, "loss": 0.4405, "step": 207400 }, { "epoch": 2.0745, "grad_norm": 69.0223388671875, "learning_rate": 1.7053599999999999e-07, "loss": 0.4419, "step": 207450 }, { "epoch": 2.075, "grad_norm": 39.172096252441406, "learning_rate": 1.7033599999999997e-07, "loss": 0.3817, "step": 207500 }, { "epoch": 2.0755, "grad_norm": 59.810447692871094, "learning_rate": 1.70136e-07, "loss": 0.4333, "step": 207550 }, { "epoch": 2.076, "grad_norm": 43.26020050048828, "learning_rate": 1.69936e-07, "loss": 0.4731, "step": 207600 }, { "epoch": 2.0765, "grad_norm": 81.62408447265625, "learning_rate": 1.6973599999999998e-07, "loss": 0.4608, "step": 207650 }, { "epoch": 2.077, "grad_norm": 0.3745923638343811, "learning_rate": 1.69536e-07, "loss": 0.3853, "step": 207700 }, { "epoch": 2.0775, "grad_norm": 90.62264251708984, "learning_rate": 1.6933599999999998e-07, "loss": 0.4131, "step": 207750 }, { "epoch": 2.078, "grad_norm": 12.992121696472168, "learning_rate": 1.69136e-07, "loss": 0.4822, "step": 207800 }, { "epoch": 2.0785, "grad_norm": 54.97868347167969, "learning_rate": 1.68936e-07, "loss": 0.4623, "step": 207850 }, { "epoch": 2.079, "grad_norm": 35.057308197021484, "learning_rate": 1.6873599999999998e-07, "loss": 0.2986, "step": 207900 }, { "epoch": 2.0795, "grad_norm": 98.75119018554688, "learning_rate": 1.68536e-07, "loss": 0.5695, "step": 207950 }, { "epoch": 2.08, "grad_norm": 57.579368591308594, "learning_rate": 1.68336e-07, "loss": 0.5152, "step": 208000 }, { "epoch": 2.0805, "grad_norm": 141.9813232421875, "learning_rate": 1.68136e-07, "loss": 0.4165, "step": 208050 }, { "epoch": 2.081, "grad_norm": 114.23064422607422, "learning_rate": 1.67936e-07, "loss": 0.3704, "step": 208100 }, { "epoch": 2.0815, "grad_norm": 160.40625, "learning_rate": 1.67736e-07, "loss": 0.3956, "step": 208150 }, { "epoch": 2.082, "grad_norm": 47.36935043334961, "learning_rate": 1.6753599999999997e-07, "loss": 0.3628, "step": 208200 }, { "epoch": 2.0825, "grad_norm": 114.87825012207031, "learning_rate": 1.67336e-07, "loss": 0.4358, "step": 208250 }, { "epoch": 2.083, "grad_norm": 112.84601593017578, "learning_rate": 1.67136e-07, "loss": 0.4642, "step": 208300 }, { "epoch": 2.0835, "grad_norm": 9.166009902954102, "learning_rate": 1.66936e-07, "loss": 0.3521, "step": 208350 }, { "epoch": 2.084, "grad_norm": 114.99955749511719, "learning_rate": 1.66736e-07, "loss": 0.4923, "step": 208400 }, { "epoch": 2.0845, "grad_norm": 3.3287408351898193, "learning_rate": 1.6653599999999998e-07, "loss": 0.3185, "step": 208450 }, { "epoch": 2.085, "grad_norm": 6.908358097076416, "learning_rate": 1.6634e-07, "loss": 0.5219, "step": 208500 }, { "epoch": 2.0855, "grad_norm": 24.365503311157227, "learning_rate": 1.6614e-07, "loss": 0.5059, "step": 208550 }, { "epoch": 2.086, "grad_norm": 7.259128093719482, "learning_rate": 1.6594e-07, "loss": 0.4561, "step": 208600 }, { "epoch": 2.0865, "grad_norm": 107.42555236816406, "learning_rate": 1.6574e-07, "loss": 0.3289, "step": 208650 }, { "epoch": 2.087, "grad_norm": 7.124502182006836, "learning_rate": 1.6553999999999999e-07, "loss": 0.3946, "step": 208700 }, { "epoch": 2.0875, "grad_norm": 71.81135559082031, "learning_rate": 1.6533999999999997e-07, "loss": 0.3836, "step": 208750 }, { "epoch": 2.088, "grad_norm": 95.23866271972656, "learning_rate": 1.6514e-07, "loss": 0.3454, "step": 208800 }, { "epoch": 2.0885, "grad_norm": 20.035490036010742, "learning_rate": 1.6494e-07, "loss": 0.4794, "step": 208850 }, { "epoch": 2.089, "grad_norm": 75.28366088867188, "learning_rate": 1.6473999999999998e-07, "loss": 0.5533, "step": 208900 }, { "epoch": 2.0895, "grad_norm": 72.16234588623047, "learning_rate": 1.6454e-07, "loss": 0.4135, "step": 208950 }, { "epoch": 2.09, "grad_norm": 68.55174255371094, "learning_rate": 1.6434e-07, "loss": 0.3546, "step": 209000 }, { "epoch": 2.0905, "grad_norm": 84.08557891845703, "learning_rate": 1.6414e-07, "loss": 0.4366, "step": 209050 }, { "epoch": 2.091, "grad_norm": 119.96469116210938, "learning_rate": 1.6394e-07, "loss": 0.3998, "step": 209100 }, { "epoch": 2.0915, "grad_norm": 15.461919784545898, "learning_rate": 1.6373999999999998e-07, "loss": 0.4224, "step": 209150 }, { "epoch": 2.092, "grad_norm": 95.27957916259766, "learning_rate": 1.6354e-07, "loss": 0.4491, "step": 209200 }, { "epoch": 2.0925, "grad_norm": 9.547892570495605, "learning_rate": 1.6334e-07, "loss": 0.4787, "step": 209250 }, { "epoch": 2.093, "grad_norm": 56.198638916015625, "learning_rate": 1.6314e-07, "loss": 0.4518, "step": 209300 }, { "epoch": 2.0935, "grad_norm": 45.14836502075195, "learning_rate": 1.6294e-07, "loss": 0.3812, "step": 209350 }, { "epoch": 2.094, "grad_norm": 47.17598342895508, "learning_rate": 1.6274e-07, "loss": 0.3314, "step": 209400 }, { "epoch": 2.0945, "grad_norm": 3.217527389526367, "learning_rate": 1.6253999999999997e-07, "loss": 0.4014, "step": 209450 }, { "epoch": 2.095, "grad_norm": 59.638771057128906, "learning_rate": 1.6234e-07, "loss": 0.3964, "step": 209500 }, { "epoch": 2.0955, "grad_norm": 83.21932220458984, "learning_rate": 1.6214e-07, "loss": 0.5206, "step": 209550 }, { "epoch": 2.096, "grad_norm": 15.194162368774414, "learning_rate": 1.6194e-07, "loss": 0.4676, "step": 209600 }, { "epoch": 2.0965, "grad_norm": 107.33583068847656, "learning_rate": 1.6174e-07, "loss": 0.4261, "step": 209650 }, { "epoch": 2.097, "grad_norm": 13.499813079833984, "learning_rate": 1.6153999999999998e-07, "loss": 0.5161, "step": 209700 }, { "epoch": 2.0975, "grad_norm": 82.69535064697266, "learning_rate": 1.6134000000000001e-07, "loss": 0.4069, "step": 209750 }, { "epoch": 2.098, "grad_norm": 41.087249755859375, "learning_rate": 1.6114e-07, "loss": 0.3837, "step": 209800 }, { "epoch": 2.0985, "grad_norm": 19.804964065551758, "learning_rate": 1.6093999999999998e-07, "loss": 0.427, "step": 209850 }, { "epoch": 2.099, "grad_norm": 36.766902923583984, "learning_rate": 1.6074e-07, "loss": 0.3817, "step": 209900 }, { "epoch": 2.0995, "grad_norm": 128.2410125732422, "learning_rate": 1.6053999999999998e-07, "loss": 0.4625, "step": 209950 }, { "epoch": 2.1, "grad_norm": 0.3461100459098816, "learning_rate": 1.6034e-07, "loss": 0.4514, "step": 210000 }, { "epoch": 2.1005, "grad_norm": 39.63062286376953, "learning_rate": 1.6014e-07, "loss": 0.4665, "step": 210050 }, { "epoch": 2.101, "grad_norm": 29.11553382873535, "learning_rate": 1.5994e-07, "loss": 0.3503, "step": 210100 }, { "epoch": 2.1015, "grad_norm": 18.823801040649414, "learning_rate": 1.5973999999999997e-07, "loss": 0.3922, "step": 210150 }, { "epoch": 2.102, "grad_norm": 8.857904434204102, "learning_rate": 1.5953999999999998e-07, "loss": 0.4219, "step": 210200 }, { "epoch": 2.1025, "grad_norm": 1.5444262027740479, "learning_rate": 1.5934e-07, "loss": 0.3796, "step": 210250 }, { "epoch": 2.103, "grad_norm": 17.656465530395508, "learning_rate": 1.59144e-07, "loss": 0.5756, "step": 210300 }, { "epoch": 2.1035, "grad_norm": 70.85519409179688, "learning_rate": 1.58944e-07, "loss": 0.3324, "step": 210350 }, { "epoch": 2.104, "grad_norm": 139.22637939453125, "learning_rate": 1.5874399999999998e-07, "loss": 0.5752, "step": 210400 }, { "epoch": 2.1045, "grad_norm": 56.294315338134766, "learning_rate": 1.58544e-07, "loss": 0.4574, "step": 210450 }, { "epoch": 2.105, "grad_norm": 45.46672821044922, "learning_rate": 1.58344e-07, "loss": 0.3502, "step": 210500 }, { "epoch": 2.1055, "grad_norm": 11.471989631652832, "learning_rate": 1.58144e-07, "loss": 0.3842, "step": 210550 }, { "epoch": 2.106, "grad_norm": 23.504499435424805, "learning_rate": 1.57944e-07, "loss": 0.4154, "step": 210600 }, { "epoch": 2.1065, "grad_norm": 17.586835861206055, "learning_rate": 1.5774399999999999e-07, "loss": 0.4408, "step": 210650 }, { "epoch": 2.107, "grad_norm": 74.19413757324219, "learning_rate": 1.5754399999999997e-07, "loss": 0.5318, "step": 210700 }, { "epoch": 2.1075, "grad_norm": 5.282220840454102, "learning_rate": 1.57344e-07, "loss": 0.5633, "step": 210750 }, { "epoch": 2.108, "grad_norm": 27.199790954589844, "learning_rate": 1.57144e-07, "loss": 0.3807, "step": 210800 }, { "epoch": 2.1085, "grad_norm": 9.653837203979492, "learning_rate": 1.56944e-07, "loss": 0.5071, "step": 210850 }, { "epoch": 2.109, "grad_norm": 2.2887659072875977, "learning_rate": 1.56744e-07, "loss": 0.442, "step": 210900 }, { "epoch": 2.1095, "grad_norm": 130.85214233398438, "learning_rate": 1.5654399999999998e-07, "loss": 0.5694, "step": 210950 }, { "epoch": 2.11, "grad_norm": 71.07162475585938, "learning_rate": 1.5634400000000001e-07, "loss": 0.3873, "step": 211000 }, { "epoch": 2.1105, "grad_norm": 0.9926930665969849, "learning_rate": 1.56144e-07, "loss": 0.4219, "step": 211050 }, { "epoch": 2.111, "grad_norm": 31.29315948486328, "learning_rate": 1.5594399999999998e-07, "loss": 0.406, "step": 211100 }, { "epoch": 2.1115, "grad_norm": 4.280275821685791, "learning_rate": 1.55744e-07, "loss": 0.4414, "step": 211150 }, { "epoch": 2.112, "grad_norm": 22.806076049804688, "learning_rate": 1.5554399999999998e-07, "loss": 0.4081, "step": 211200 }, { "epoch": 2.1125, "grad_norm": 48.29012680053711, "learning_rate": 1.55344e-07, "loss": 0.4448, "step": 211250 }, { "epoch": 2.113, "grad_norm": 28.17279624938965, "learning_rate": 1.55144e-07, "loss": 0.3505, "step": 211300 }, { "epoch": 2.1135, "grad_norm": 4.374676704406738, "learning_rate": 1.54944e-07, "loss": 0.374, "step": 211350 }, { "epoch": 2.114, "grad_norm": 5.979548931121826, "learning_rate": 1.5474399999999997e-07, "loss": 0.304, "step": 211400 }, { "epoch": 2.1145, "grad_norm": 5.3794426918029785, "learning_rate": 1.5454399999999998e-07, "loss": 0.4963, "step": 211450 }, { "epoch": 2.115, "grad_norm": 71.64678955078125, "learning_rate": 1.54344e-07, "loss": 0.4826, "step": 211500 }, { "epoch": 2.1155, "grad_norm": 8.600757598876953, "learning_rate": 1.54144e-07, "loss": 0.3473, "step": 211550 }, { "epoch": 2.116, "grad_norm": 6.017736434936523, "learning_rate": 1.53944e-07, "loss": 0.4768, "step": 211600 }, { "epoch": 2.1165, "grad_norm": 58.22939682006836, "learning_rate": 1.5374399999999998e-07, "loss": 0.4373, "step": 211650 }, { "epoch": 2.117, "grad_norm": 68.43115234375, "learning_rate": 1.5354400000000001e-07, "loss": 0.4691, "step": 211700 }, { "epoch": 2.1175, "grad_norm": 79.82152557373047, "learning_rate": 1.53344e-07, "loss": 0.4244, "step": 211750 }, { "epoch": 2.118, "grad_norm": 31.79875946044922, "learning_rate": 1.5314399999999998e-07, "loss": 0.5947, "step": 211800 }, { "epoch": 2.1185, "grad_norm": 62.042083740234375, "learning_rate": 1.52944e-07, "loss": 0.3467, "step": 211850 }, { "epoch": 2.1189999999999998, "grad_norm": 2.5608060359954834, "learning_rate": 1.5274399999999998e-07, "loss": 0.4214, "step": 211900 }, { "epoch": 2.1195, "grad_norm": 105.99209594726562, "learning_rate": 1.5254400000000002e-07, "loss": 0.5052, "step": 211950 }, { "epoch": 2.12, "grad_norm": 24.472665786743164, "learning_rate": 1.52344e-07, "loss": 0.3036, "step": 212000 }, { "epoch": 2.1205, "grad_norm": 97.40585327148438, "learning_rate": 1.52144e-07, "loss": 0.4331, "step": 212050 }, { "epoch": 2.121, "grad_norm": 68.8331069946289, "learning_rate": 1.51944e-07, "loss": 0.4896, "step": 212100 }, { "epoch": 2.1215, "grad_norm": 0.17462262511253357, "learning_rate": 1.5174399999999999e-07, "loss": 0.2989, "step": 212150 }, { "epoch": 2.122, "grad_norm": 0.09120786935091019, "learning_rate": 1.51544e-07, "loss": 0.49, "step": 212200 }, { "epoch": 2.1225, "grad_norm": 41.664825439453125, "learning_rate": 1.51344e-07, "loss": 0.4313, "step": 212250 }, { "epoch": 2.123, "grad_norm": 99.65406036376953, "learning_rate": 1.51144e-07, "loss": 0.3627, "step": 212300 }, { "epoch": 2.1235, "grad_norm": 71.01948547363281, "learning_rate": 1.5094399999999998e-07, "loss": 0.496, "step": 212350 }, { "epoch": 2.124, "grad_norm": 90.17384338378906, "learning_rate": 1.50744e-07, "loss": 0.3975, "step": 212400 }, { "epoch": 2.1245, "grad_norm": 16.80384635925293, "learning_rate": 1.50544e-07, "loss": 0.3572, "step": 212450 }, { "epoch": 2.125, "grad_norm": 63.9257926940918, "learning_rate": 1.5034399999999999e-07, "loss": 0.5707, "step": 212500 }, { "epoch": 2.1255, "grad_norm": 40.26178741455078, "learning_rate": 1.50144e-07, "loss": 0.4961, "step": 212550 }, { "epoch": 2.126, "grad_norm": 75.82178497314453, "learning_rate": 1.4994399999999998e-07, "loss": 0.4571, "step": 212600 }, { "epoch": 2.1265, "grad_norm": 10.5368070602417, "learning_rate": 1.49744e-07, "loss": 0.4173, "step": 212650 }, { "epoch": 2.127, "grad_norm": 141.17538452148438, "learning_rate": 1.49544e-07, "loss": 0.4677, "step": 212700 }, { "epoch": 2.1275, "grad_norm": 58.944175720214844, "learning_rate": 1.49344e-07, "loss": 0.3237, "step": 212750 }, { "epoch": 2.128, "grad_norm": 43.46791458129883, "learning_rate": 1.49144e-07, "loss": 0.5176, "step": 212800 }, { "epoch": 2.1285, "grad_norm": 1.668687105178833, "learning_rate": 1.48944e-07, "loss": 0.4732, "step": 212850 }, { "epoch": 2.129, "grad_norm": 69.23137664794922, "learning_rate": 1.4874399999999997e-07, "loss": 0.4604, "step": 212900 }, { "epoch": 2.1295, "grad_norm": 39.40574645996094, "learning_rate": 1.48544e-07, "loss": 0.4294, "step": 212950 }, { "epoch": 2.13, "grad_norm": 48.64951705932617, "learning_rate": 1.48344e-07, "loss": 0.3854, "step": 213000 }, { "epoch": 2.1305, "grad_norm": 7.141056537628174, "learning_rate": 1.4814399999999998e-07, "loss": 0.3979, "step": 213050 }, { "epoch": 2.1310000000000002, "grad_norm": 61.7899284362793, "learning_rate": 1.47944e-07, "loss": 0.4668, "step": 213100 }, { "epoch": 2.1315, "grad_norm": 43.82136154174805, "learning_rate": 1.4774399999999998e-07, "loss": 0.4552, "step": 213150 }, { "epoch": 2.132, "grad_norm": 9.3849458694458, "learning_rate": 1.4754400000000001e-07, "loss": 0.4971, "step": 213200 }, { "epoch": 2.1325, "grad_norm": 0.2815798819065094, "learning_rate": 1.47344e-07, "loss": 0.3141, "step": 213250 }, { "epoch": 2.133, "grad_norm": 70.74395751953125, "learning_rate": 1.4714399999999998e-07, "loss": 0.4281, "step": 213300 }, { "epoch": 2.1335, "grad_norm": 36.044673919677734, "learning_rate": 1.46944e-07, "loss": 0.3749, "step": 213350 }, { "epoch": 2.134, "grad_norm": 22.537832260131836, "learning_rate": 1.46744e-07, "loss": 0.4091, "step": 213400 }, { "epoch": 2.1345, "grad_norm": 64.7277603149414, "learning_rate": 1.46544e-07, "loss": 0.4648, "step": 213450 }, { "epoch": 2.135, "grad_norm": 32.96176528930664, "learning_rate": 1.46344e-07, "loss": 0.5325, "step": 213500 }, { "epoch": 2.1355, "grad_norm": 39.71519470214844, "learning_rate": 1.46144e-07, "loss": 0.4756, "step": 213550 }, { "epoch": 2.136, "grad_norm": 27.864206314086914, "learning_rate": 1.4594399999999997e-07, "loss": 0.4653, "step": 213600 }, { "epoch": 2.1365, "grad_norm": 59.47966766357422, "learning_rate": 1.45744e-07, "loss": 0.4488, "step": 213650 }, { "epoch": 2.137, "grad_norm": 30.075504302978516, "learning_rate": 1.45544e-07, "loss": 0.5499, "step": 213700 }, { "epoch": 2.1375, "grad_norm": 0.5983642339706421, "learning_rate": 1.4534399999999998e-07, "loss": 0.5613, "step": 213750 }, { "epoch": 2.138, "grad_norm": 5.750205039978027, "learning_rate": 1.45144e-07, "loss": 0.5196, "step": 213800 }, { "epoch": 2.1385, "grad_norm": 45.518558502197266, "learning_rate": 1.4494399999999998e-07, "loss": 0.2925, "step": 213850 }, { "epoch": 2.1390000000000002, "grad_norm": 144.67080688476562, "learning_rate": 1.4474400000000002e-07, "loss": 0.4629, "step": 213900 }, { "epoch": 2.1395, "grad_norm": 82.38689422607422, "learning_rate": 1.44544e-07, "loss": 0.5278, "step": 213950 }, { "epoch": 2.14, "grad_norm": 1.598920226097107, "learning_rate": 1.4434399999999999e-07, "loss": 0.2595, "step": 214000 }, { "epoch": 2.1405, "grad_norm": 22.938703536987305, "learning_rate": 1.44144e-07, "loss": 0.4807, "step": 214050 }, { "epoch": 2.141, "grad_norm": 50.35197067260742, "learning_rate": 1.4394399999999998e-07, "loss": 0.3687, "step": 214100 }, { "epoch": 2.1415, "grad_norm": 56.01286697387695, "learning_rate": 1.43744e-07, "loss": 0.4494, "step": 214150 }, { "epoch": 2.142, "grad_norm": 117.62816619873047, "learning_rate": 1.43544e-07, "loss": 0.4037, "step": 214200 }, { "epoch": 2.1425, "grad_norm": 26.653152465820312, "learning_rate": 1.43344e-07, "loss": 0.3627, "step": 214250 }, { "epoch": 2.143, "grad_norm": 18.058246612548828, "learning_rate": 1.4314399999999997e-07, "loss": 0.6102, "step": 214300 }, { "epoch": 2.1435, "grad_norm": 6.283153057098389, "learning_rate": 1.4294399999999999e-07, "loss": 0.318, "step": 214350 }, { "epoch": 2.144, "grad_norm": 50.743133544921875, "learning_rate": 1.42748e-07, "loss": 0.3315, "step": 214400 }, { "epoch": 2.1445, "grad_norm": 7.809654712677002, "learning_rate": 1.4254800000000001e-07, "loss": 0.3765, "step": 214450 }, { "epoch": 2.145, "grad_norm": 58.41513442993164, "learning_rate": 1.42348e-07, "loss": 0.3844, "step": 214500 }, { "epoch": 2.1455, "grad_norm": 117.92552185058594, "learning_rate": 1.4214799999999998e-07, "loss": 0.4366, "step": 214550 }, { "epoch": 2.146, "grad_norm": 38.15401077270508, "learning_rate": 1.41948e-07, "loss": 0.4262, "step": 214600 }, { "epoch": 2.1465, "grad_norm": 99.7507553100586, "learning_rate": 1.41748e-07, "loss": 0.4156, "step": 214650 }, { "epoch": 2.147, "grad_norm": 120.47976684570312, "learning_rate": 1.41548e-07, "loss": 0.3704, "step": 214700 }, { "epoch": 2.1475, "grad_norm": 0.7170859575271606, "learning_rate": 1.41348e-07, "loss": 0.5463, "step": 214750 }, { "epoch": 2.148, "grad_norm": 73.54954528808594, "learning_rate": 1.41148e-07, "loss": 0.4351, "step": 214800 }, { "epoch": 2.1485, "grad_norm": 1.4426411390304565, "learning_rate": 1.4094799999999997e-07, "loss": 0.4003, "step": 214850 }, { "epoch": 2.149, "grad_norm": 16.82160758972168, "learning_rate": 1.40748e-07, "loss": 0.334, "step": 214900 }, { "epoch": 2.1495, "grad_norm": 69.73733520507812, "learning_rate": 1.40548e-07, "loss": 0.427, "step": 214950 }, { "epoch": 2.15, "grad_norm": 125.05155944824219, "learning_rate": 1.40348e-07, "loss": 0.3633, "step": 215000 }, { "epoch": 2.1505, "grad_norm": 55.94770812988281, "learning_rate": 1.40148e-07, "loss": 0.5265, "step": 215050 }, { "epoch": 2.151, "grad_norm": 4.276247978210449, "learning_rate": 1.3994799999999998e-07, "loss": 0.3524, "step": 215100 }, { "epoch": 2.1515, "grad_norm": 67.32160949707031, "learning_rate": 1.3974800000000001e-07, "loss": 0.4974, "step": 215150 }, { "epoch": 2.152, "grad_norm": 34.65168762207031, "learning_rate": 1.39548e-07, "loss": 0.3331, "step": 215200 }, { "epoch": 2.1525, "grad_norm": 91.49262237548828, "learning_rate": 1.3934799999999998e-07, "loss": 0.4829, "step": 215250 }, { "epoch": 2.153, "grad_norm": 14.011157989501953, "learning_rate": 1.39148e-07, "loss": 0.4288, "step": 215300 }, { "epoch": 2.1535, "grad_norm": 5.634559631347656, "learning_rate": 1.3894799999999998e-07, "loss": 0.3349, "step": 215350 }, { "epoch": 2.154, "grad_norm": 73.73646545410156, "learning_rate": 1.38748e-07, "loss": 0.4687, "step": 215400 }, { "epoch": 2.1545, "grad_norm": 2.7727954387664795, "learning_rate": 1.38548e-07, "loss": 0.3353, "step": 215450 }, { "epoch": 2.155, "grad_norm": 73.1183090209961, "learning_rate": 1.38348e-07, "loss": 0.5878, "step": 215500 }, { "epoch": 2.1555, "grad_norm": 88.92938232421875, "learning_rate": 1.3814799999999997e-07, "loss": 0.3574, "step": 215550 }, { "epoch": 2.156, "grad_norm": 9.608054161071777, "learning_rate": 1.3794799999999999e-07, "loss": 0.472, "step": 215600 }, { "epoch": 2.1565, "grad_norm": 34.318458557128906, "learning_rate": 1.37748e-07, "loss": 0.3066, "step": 215650 }, { "epoch": 2.157, "grad_norm": 146.12513732910156, "learning_rate": 1.37548e-07, "loss": 0.3393, "step": 215700 }, { "epoch": 2.1575, "grad_norm": 7.567634582519531, "learning_rate": 1.37348e-07, "loss": 0.3595, "step": 215750 }, { "epoch": 2.158, "grad_norm": 14.484745979309082, "learning_rate": 1.3714799999999998e-07, "loss": 0.3492, "step": 215800 }, { "epoch": 2.1585, "grad_norm": 29.185087203979492, "learning_rate": 1.36948e-07, "loss": 0.4477, "step": 215850 }, { "epoch": 2.159, "grad_norm": 3.4553093910217285, "learning_rate": 1.36748e-07, "loss": 0.6068, "step": 215900 }, { "epoch": 2.1595, "grad_norm": 14.842061996459961, "learning_rate": 1.3654799999999999e-07, "loss": 0.4224, "step": 215950 }, { "epoch": 2.16, "grad_norm": 94.66940307617188, "learning_rate": 1.36348e-07, "loss": 0.4909, "step": 216000 }, { "epoch": 2.1605, "grad_norm": 12.455718994140625, "learning_rate": 1.3614799999999998e-07, "loss": 0.4814, "step": 216050 }, { "epoch": 2.161, "grad_norm": 118.89146423339844, "learning_rate": 1.35948e-07, "loss": 0.5104, "step": 216100 }, { "epoch": 2.1615, "grad_norm": 64.86026763916016, "learning_rate": 1.35748e-07, "loss": 0.4312, "step": 216150 }, { "epoch": 2.162, "grad_norm": 8.784263610839844, "learning_rate": 1.35548e-07, "loss": 0.4906, "step": 216200 }, { "epoch": 2.1625, "grad_norm": 90.33892822265625, "learning_rate": 1.35348e-07, "loss": 0.4301, "step": 216250 }, { "epoch": 2.163, "grad_norm": 22.997182846069336, "learning_rate": 1.35148e-07, "loss": 0.2993, "step": 216300 }, { "epoch": 2.1635, "grad_norm": 35.568603515625, "learning_rate": 1.34948e-07, "loss": 0.3766, "step": 216350 }, { "epoch": 2.164, "grad_norm": 28.420759201049805, "learning_rate": 1.34748e-07, "loss": 0.5307, "step": 216400 }, { "epoch": 2.1645, "grad_norm": 19.557161331176758, "learning_rate": 1.34548e-07, "loss": 0.5116, "step": 216450 }, { "epoch": 2.165, "grad_norm": 23.2882022857666, "learning_rate": 1.3434799999999998e-07, "loss": 0.4442, "step": 216500 }, { "epoch": 2.1655, "grad_norm": 22.49976348876953, "learning_rate": 1.34148e-07, "loss": 0.4319, "step": 216550 }, { "epoch": 2.166, "grad_norm": 79.17230987548828, "learning_rate": 1.33948e-07, "loss": 0.4977, "step": 216600 }, { "epoch": 2.1665, "grad_norm": 68.50972747802734, "learning_rate": 1.33748e-07, "loss": 0.3239, "step": 216650 }, { "epoch": 2.167, "grad_norm": 57.847503662109375, "learning_rate": 1.33548e-07, "loss": 0.3964, "step": 216700 }, { "epoch": 2.1675, "grad_norm": 90.66923522949219, "learning_rate": 1.3334799999999998e-07, "loss": 0.3266, "step": 216750 }, { "epoch": 2.168, "grad_norm": 10.624639511108398, "learning_rate": 1.33148e-07, "loss": 0.4769, "step": 216800 }, { "epoch": 2.1685, "grad_norm": 70.89537048339844, "learning_rate": 1.32948e-07, "loss": 0.4799, "step": 216850 }, { "epoch": 2.169, "grad_norm": 52.987213134765625, "learning_rate": 1.32748e-07, "loss": 0.4412, "step": 216900 }, { "epoch": 2.1695, "grad_norm": 69.99027252197266, "learning_rate": 1.32552e-07, "loss": 0.4945, "step": 216950 }, { "epoch": 2.17, "grad_norm": 90.99432373046875, "learning_rate": 1.32356e-07, "loss": 0.4697, "step": 217000 }, { "epoch": 2.1705, "grad_norm": 79.41846466064453, "learning_rate": 1.3215599999999998e-07, "loss": 0.4307, "step": 217050 }, { "epoch": 2.171, "grad_norm": 86.42295837402344, "learning_rate": 1.31956e-07, "loss": 0.4369, "step": 217100 }, { "epoch": 2.1715, "grad_norm": 64.85267639160156, "learning_rate": 1.31756e-07, "loss": 0.4694, "step": 217150 }, { "epoch": 2.172, "grad_norm": 4.685230731964111, "learning_rate": 1.31556e-07, "loss": 0.3749, "step": 217200 }, { "epoch": 2.1725, "grad_norm": 98.96492004394531, "learning_rate": 1.31356e-07, "loss": 0.4746, "step": 217250 }, { "epoch": 2.173, "grad_norm": 58.136985778808594, "learning_rate": 1.3115599999999999e-07, "loss": 0.4622, "step": 217300 }, { "epoch": 2.1734999999999998, "grad_norm": 33.713130950927734, "learning_rate": 1.3095599999999997e-07, "loss": 0.411, "step": 217350 }, { "epoch": 2.174, "grad_norm": 5.95607328414917, "learning_rate": 1.30756e-07, "loss": 0.4671, "step": 217400 }, { "epoch": 2.1745, "grad_norm": 44.79324722290039, "learning_rate": 1.30556e-07, "loss": 0.3069, "step": 217450 }, { "epoch": 2.175, "grad_norm": 21.093141555786133, "learning_rate": 1.30356e-07, "loss": 0.3747, "step": 217500 }, { "epoch": 2.1755, "grad_norm": 27.197893142700195, "learning_rate": 1.30156e-07, "loss": 0.4195, "step": 217550 }, { "epoch": 2.176, "grad_norm": 77.7616195678711, "learning_rate": 1.2995599999999998e-07, "loss": 0.3689, "step": 217600 }, { "epoch": 2.1765, "grad_norm": 0.3446306586265564, "learning_rate": 1.2975600000000001e-07, "loss": 0.4223, "step": 217650 }, { "epoch": 2.177, "grad_norm": 95.07039642333984, "learning_rate": 1.29556e-07, "loss": 0.3375, "step": 217700 }, { "epoch": 2.1775, "grad_norm": 80.28645324707031, "learning_rate": 1.2935599999999998e-07, "loss": 0.4666, "step": 217750 }, { "epoch": 2.178, "grad_norm": 57.870567321777344, "learning_rate": 1.29156e-07, "loss": 0.5303, "step": 217800 }, { "epoch": 2.1785, "grad_norm": 6.648061752319336, "learning_rate": 1.2895599999999998e-07, "loss": 0.3876, "step": 217850 }, { "epoch": 2.179, "grad_norm": 80.23861694335938, "learning_rate": 1.28756e-07, "loss": 0.3716, "step": 217900 }, { "epoch": 2.1795, "grad_norm": 44.08333206176758, "learning_rate": 1.28556e-07, "loss": 0.4331, "step": 217950 }, { "epoch": 2.18, "grad_norm": 54.142154693603516, "learning_rate": 1.28356e-07, "loss": 0.4002, "step": 218000 }, { "epoch": 2.1805, "grad_norm": 13.98410701751709, "learning_rate": 1.28156e-07, "loss": 0.4893, "step": 218050 }, { "epoch": 2.181, "grad_norm": 67.61353302001953, "learning_rate": 1.2795599999999998e-07, "loss": 0.4332, "step": 218100 }, { "epoch": 2.1814999999999998, "grad_norm": 3.3425474166870117, "learning_rate": 1.27756e-07, "loss": 0.4579, "step": 218150 }, { "epoch": 2.182, "grad_norm": 25.27934455871582, "learning_rate": 1.27556e-07, "loss": 0.4718, "step": 218200 }, { "epoch": 2.1825, "grad_norm": 74.17402648925781, "learning_rate": 1.27356e-07, "loss": 0.3044, "step": 218250 }, { "epoch": 2.183, "grad_norm": 12.15510368347168, "learning_rate": 1.2715599999999998e-07, "loss": 0.3911, "step": 218300 }, { "epoch": 2.1835, "grad_norm": 8.140380859375, "learning_rate": 1.2695600000000002e-07, "loss": 0.376, "step": 218350 }, { "epoch": 2.184, "grad_norm": 102.70777130126953, "learning_rate": 1.26756e-07, "loss": 0.3637, "step": 218400 }, { "epoch": 2.1845, "grad_norm": 19.61964225769043, "learning_rate": 1.2655599999999999e-07, "loss": 0.5274, "step": 218450 }, { "epoch": 2.185, "grad_norm": 14.926944732666016, "learning_rate": 1.26356e-07, "loss": 0.5191, "step": 218500 }, { "epoch": 2.1855, "grad_norm": 37.19790267944336, "learning_rate": 1.2615599999999998e-07, "loss": 0.4175, "step": 218550 }, { "epoch": 2.186, "grad_norm": 107.50064849853516, "learning_rate": 1.2595600000000002e-07, "loss": 0.3893, "step": 218600 }, { "epoch": 2.1865, "grad_norm": 125.32817077636719, "learning_rate": 1.25756e-07, "loss": 0.442, "step": 218650 }, { "epoch": 2.187, "grad_norm": 5.219124794006348, "learning_rate": 1.25556e-07, "loss": 0.4723, "step": 218700 }, { "epoch": 2.1875, "grad_norm": 69.8603286743164, "learning_rate": 1.25356e-07, "loss": 0.3469, "step": 218750 }, { "epoch": 2.188, "grad_norm": 61.861080169677734, "learning_rate": 1.2515599999999999e-07, "loss": 0.4402, "step": 218800 }, { "epoch": 2.1885, "grad_norm": 27.252426147460938, "learning_rate": 1.24956e-07, "loss": 0.5054, "step": 218850 }, { "epoch": 2.189, "grad_norm": 90.06917572021484, "learning_rate": 1.24756e-07, "loss": 0.3561, "step": 218900 }, { "epoch": 2.1895, "grad_norm": 0.39669129252433777, "learning_rate": 1.24556e-07, "loss": 0.4879, "step": 218950 }, { "epoch": 2.19, "grad_norm": 15.000761032104492, "learning_rate": 1.2435599999999998e-07, "loss": 0.5528, "step": 219000 }, { "epoch": 2.1905, "grad_norm": 77.33452606201172, "learning_rate": 1.24156e-07, "loss": 0.593, "step": 219050 }, { "epoch": 2.191, "grad_norm": 67.30721282958984, "learning_rate": 1.23956e-07, "loss": 0.4106, "step": 219100 }, { "epoch": 2.1915, "grad_norm": 4.53434944152832, "learning_rate": 1.2375599999999999e-07, "loss": 0.5223, "step": 219150 }, { "epoch": 2.192, "grad_norm": 48.35958480834961, "learning_rate": 1.23556e-07, "loss": 0.362, "step": 219200 }, { "epoch": 2.1925, "grad_norm": 82.18716430664062, "learning_rate": 1.2335599999999998e-07, "loss": 0.4678, "step": 219250 }, { "epoch": 2.193, "grad_norm": 11.158646583557129, "learning_rate": 1.23156e-07, "loss": 0.3892, "step": 219300 }, { "epoch": 2.1935000000000002, "grad_norm": 36.34510040283203, "learning_rate": 1.2295599999999998e-07, "loss": 0.3854, "step": 219350 }, { "epoch": 2.194, "grad_norm": 24.300992965698242, "learning_rate": 1.22756e-07, "loss": 0.3248, "step": 219400 }, { "epoch": 2.1945, "grad_norm": 56.57987976074219, "learning_rate": 1.22556e-07, "loss": 0.4331, "step": 219450 }, { "epoch": 2.195, "grad_norm": 44.11698532104492, "learning_rate": 1.2236e-07, "loss": 0.3961, "step": 219500 }, { "epoch": 2.1955, "grad_norm": 0.06106671318411827, "learning_rate": 1.2216e-07, "loss": 0.357, "step": 219550 }, { "epoch": 2.196, "grad_norm": 22.567058563232422, "learning_rate": 1.2196e-07, "loss": 0.4296, "step": 219600 }, { "epoch": 2.1965, "grad_norm": 95.61750030517578, "learning_rate": 1.2176e-07, "loss": 0.5135, "step": 219650 }, { "epoch": 2.197, "grad_norm": 0.5937201976776123, "learning_rate": 1.2155999999999998e-07, "loss": 0.4198, "step": 219700 }, { "epoch": 2.1975, "grad_norm": 9.500515937805176, "learning_rate": 1.2136e-07, "loss": 0.3876, "step": 219750 }, { "epoch": 2.198, "grad_norm": 18.312498092651367, "learning_rate": 1.2116e-07, "loss": 0.5221, "step": 219800 }, { "epoch": 2.1985, "grad_norm": 108.91061401367188, "learning_rate": 1.2096e-07, "loss": 0.4779, "step": 219850 }, { "epoch": 2.199, "grad_norm": 108.51426696777344, "learning_rate": 1.2076e-07, "loss": 0.3632, "step": 219900 }, { "epoch": 2.1995, "grad_norm": 54.85345458984375, "learning_rate": 1.2056e-07, "loss": 0.2912, "step": 219950 }, { "epoch": 2.2, "grad_norm": 98.45470428466797, "learning_rate": 1.2036e-07, "loss": 0.3563, "step": 220000 }, { "epoch": 2.2005, "grad_norm": 26.045886993408203, "learning_rate": 1.2015999999999999e-07, "loss": 0.5571, "step": 220050 }, { "epoch": 2.201, "grad_norm": 70.3372802734375, "learning_rate": 1.1996e-07, "loss": 0.4871, "step": 220100 }, { "epoch": 2.2015000000000002, "grad_norm": 3.0247647762298584, "learning_rate": 1.1976e-07, "loss": 0.3335, "step": 220150 }, { "epoch": 2.202, "grad_norm": 24.912132263183594, "learning_rate": 1.1956e-07, "loss": 0.4888, "step": 220200 }, { "epoch": 2.2025, "grad_norm": 79.61896514892578, "learning_rate": 1.1935999999999998e-07, "loss": 0.5456, "step": 220250 }, { "epoch": 2.203, "grad_norm": 52.392642974853516, "learning_rate": 1.1916e-07, "loss": 0.4036, "step": 220300 }, { "epoch": 2.2035, "grad_norm": 2.282226085662842, "learning_rate": 1.1895999999999999e-07, "loss": 0.3552, "step": 220350 }, { "epoch": 2.204, "grad_norm": 26.8724365234375, "learning_rate": 1.1876e-07, "loss": 0.3787, "step": 220400 }, { "epoch": 2.2045, "grad_norm": 84.30389404296875, "learning_rate": 1.1856e-07, "loss": 0.3487, "step": 220450 }, { "epoch": 2.205, "grad_norm": 74.3729476928711, "learning_rate": 1.1836e-07, "loss": 0.432, "step": 220500 }, { "epoch": 2.2055, "grad_norm": 25.35677146911621, "learning_rate": 1.1816e-07, "loss": 0.4825, "step": 220550 }, { "epoch": 2.206, "grad_norm": 5.5437116622924805, "learning_rate": 1.1795999999999999e-07, "loss": 0.4293, "step": 220600 }, { "epoch": 2.2065, "grad_norm": 51.029869079589844, "learning_rate": 1.1776e-07, "loss": 0.3243, "step": 220650 }, { "epoch": 2.207, "grad_norm": 12.300873756408691, "learning_rate": 1.1755999999999999e-07, "loss": 0.3076, "step": 220700 }, { "epoch": 2.2075, "grad_norm": 0.5374252796173096, "learning_rate": 1.1736e-07, "loss": 0.4596, "step": 220750 }, { "epoch": 2.208, "grad_norm": 19.82467269897461, "learning_rate": 1.1716e-07, "loss": 0.4194, "step": 220800 }, { "epoch": 2.2085, "grad_norm": 88.30901336669922, "learning_rate": 1.1695999999999998e-07, "loss": 0.4866, "step": 220850 }, { "epoch": 2.209, "grad_norm": 53.6544303894043, "learning_rate": 1.1676e-07, "loss": 0.5025, "step": 220900 }, { "epoch": 2.2095, "grad_norm": 16.769838333129883, "learning_rate": 1.1655999999999999e-07, "loss": 0.3527, "step": 220950 }, { "epoch": 2.21, "grad_norm": 77.25714874267578, "learning_rate": 1.1636e-07, "loss": 0.4571, "step": 221000 }, { "epoch": 2.2105, "grad_norm": 74.47918701171875, "learning_rate": 1.1615999999999999e-07, "loss": 0.4852, "step": 221050 }, { "epoch": 2.211, "grad_norm": 1.7386324405670166, "learning_rate": 1.1595999999999999e-07, "loss": 0.4368, "step": 221100 }, { "epoch": 2.2115, "grad_norm": 43.10595703125, "learning_rate": 1.1576e-07, "loss": 0.472, "step": 221150 }, { "epoch": 2.212, "grad_norm": 195.77264404296875, "learning_rate": 1.1556e-07, "loss": 0.4499, "step": 221200 }, { "epoch": 2.2125, "grad_norm": 41.63266372680664, "learning_rate": 1.1536e-07, "loss": 0.3944, "step": 221250 }, { "epoch": 2.213, "grad_norm": 93.48374938964844, "learning_rate": 1.1516e-07, "loss": 0.4368, "step": 221300 }, { "epoch": 2.2135, "grad_norm": 11.143013000488281, "learning_rate": 1.1496e-07, "loss": 0.4911, "step": 221350 }, { "epoch": 2.214, "grad_norm": 0.3506889045238495, "learning_rate": 1.1475999999999999e-07, "loss": 0.4476, "step": 221400 }, { "epoch": 2.2145, "grad_norm": 5.332769393920898, "learning_rate": 1.1455999999999999e-07, "loss": 0.3238, "step": 221450 }, { "epoch": 2.215, "grad_norm": 69.08348083496094, "learning_rate": 1.1436e-07, "loss": 0.4466, "step": 221500 }, { "epoch": 2.2155, "grad_norm": 62.9998664855957, "learning_rate": 1.1416e-07, "loss": 0.4017, "step": 221550 }, { "epoch": 2.216, "grad_norm": 66.38433074951172, "learning_rate": 1.1396e-07, "loss": 0.4719, "step": 221600 }, { "epoch": 2.2165, "grad_norm": 123.8575668334961, "learning_rate": 1.1376e-07, "loss": 0.3695, "step": 221650 }, { "epoch": 2.217, "grad_norm": 9.564665794372559, "learning_rate": 1.1355999999999999e-07, "loss": 0.3205, "step": 221700 }, { "epoch": 2.2175, "grad_norm": 28.491945266723633, "learning_rate": 1.1335999999999999e-07, "loss": 0.4048, "step": 221750 }, { "epoch": 2.218, "grad_norm": 20.753578186035156, "learning_rate": 1.1315999999999999e-07, "loss": 0.4333, "step": 221800 }, { "epoch": 2.2185, "grad_norm": 77.56809997558594, "learning_rate": 1.1296e-07, "loss": 0.5474, "step": 221850 }, { "epoch": 2.219, "grad_norm": 6.686479091644287, "learning_rate": 1.1276e-07, "loss": 0.4709, "step": 221900 }, { "epoch": 2.2195, "grad_norm": 1.914489507675171, "learning_rate": 1.1255999999999998e-07, "loss": 0.3998, "step": 221950 }, { "epoch": 2.22, "grad_norm": 0.9213142991065979, "learning_rate": 1.1236e-07, "loss": 0.378, "step": 222000 }, { "epoch": 2.2205, "grad_norm": 115.55097198486328, "learning_rate": 1.1215999999999999e-07, "loss": 0.4008, "step": 222050 }, { "epoch": 2.221, "grad_norm": 72.356201171875, "learning_rate": 1.1196e-07, "loss": 0.3891, "step": 222100 }, { "epoch": 2.2215, "grad_norm": 6.992410659790039, "learning_rate": 1.1175999999999999e-07, "loss": 0.3579, "step": 222150 }, { "epoch": 2.222, "grad_norm": 72.04354858398438, "learning_rate": 1.1156e-07, "loss": 0.402, "step": 222200 }, { "epoch": 2.2225, "grad_norm": 112.33955383300781, "learning_rate": 1.1136e-07, "loss": 0.4516, "step": 222250 }, { "epoch": 2.223, "grad_norm": 45.285400390625, "learning_rate": 1.1115999999999998e-07, "loss": 0.3904, "step": 222300 }, { "epoch": 2.2235, "grad_norm": 0.01933535747230053, "learning_rate": 1.1096e-07, "loss": 0.4005, "step": 222350 }, { "epoch": 2.224, "grad_norm": 2.9697704315185547, "learning_rate": 1.1076e-07, "loss": 0.4953, "step": 222400 }, { "epoch": 2.2245, "grad_norm": 143.373779296875, "learning_rate": 1.1056e-07, "loss": 0.3947, "step": 222450 }, { "epoch": 2.225, "grad_norm": 2.298295259475708, "learning_rate": 1.1035999999999999e-07, "loss": 0.3572, "step": 222500 }, { "epoch": 2.2255, "grad_norm": 4.240357875823975, "learning_rate": 1.1015999999999999e-07, "loss": 0.4634, "step": 222550 }, { "epoch": 2.226, "grad_norm": 19.71664810180664, "learning_rate": 1.0996e-07, "loss": 0.4539, "step": 222600 }, { "epoch": 2.2265, "grad_norm": 20.2246150970459, "learning_rate": 1.0975999999999998e-07, "loss": 0.4125, "step": 222650 }, { "epoch": 2.227, "grad_norm": 9.409420013427734, "learning_rate": 1.0956e-07, "loss": 0.4532, "step": 222700 }, { "epoch": 2.2275, "grad_norm": 2.0112721920013428, "learning_rate": 1.0936e-07, "loss": 0.4254, "step": 222750 }, { "epoch": 2.228, "grad_norm": 107.7596664428711, "learning_rate": 1.0915999999999999e-07, "loss": 0.4481, "step": 222800 }, { "epoch": 2.2285, "grad_norm": 64.18367767333984, "learning_rate": 1.0895999999999999e-07, "loss": 0.5081, "step": 222850 }, { "epoch": 2.229, "grad_norm": 31.307912826538086, "learning_rate": 1.0875999999999999e-07, "loss": 0.411, "step": 222900 }, { "epoch": 2.2295, "grad_norm": 62.03761291503906, "learning_rate": 1.0856e-07, "loss": 0.5153, "step": 222950 }, { "epoch": 2.23, "grad_norm": 68.89613342285156, "learning_rate": 1.0836e-07, "loss": 0.3965, "step": 223000 }, { "epoch": 2.2305, "grad_norm": 10.568138122558594, "learning_rate": 1.0816e-07, "loss": 0.3033, "step": 223050 }, { "epoch": 2.231, "grad_norm": 17.958087921142578, "learning_rate": 1.07964e-07, "loss": 0.428, "step": 223100 }, { "epoch": 2.2315, "grad_norm": 1.1294639110565186, "learning_rate": 1.07764e-07, "loss": 0.4191, "step": 223150 }, { "epoch": 2.232, "grad_norm": 31.283260345458984, "learning_rate": 1.07564e-07, "loss": 0.3648, "step": 223200 }, { "epoch": 2.2325, "grad_norm": 74.15252685546875, "learning_rate": 1.07364e-07, "loss": 0.3545, "step": 223250 }, { "epoch": 2.233, "grad_norm": 56.10187911987305, "learning_rate": 1.0716399999999999e-07, "loss": 0.5046, "step": 223300 }, { "epoch": 2.2335, "grad_norm": 19.8614444732666, "learning_rate": 1.06964e-07, "loss": 0.4121, "step": 223350 }, { "epoch": 2.234, "grad_norm": 52.8697395324707, "learning_rate": 1.0676399999999999e-07, "loss": 0.4443, "step": 223400 }, { "epoch": 2.2345, "grad_norm": 15.52030086517334, "learning_rate": 1.06564e-07, "loss": 0.4393, "step": 223450 }, { "epoch": 2.235, "grad_norm": 13.279940605163574, "learning_rate": 1.06364e-07, "loss": 0.5055, "step": 223500 }, { "epoch": 2.2355, "grad_norm": 35.77288818359375, "learning_rate": 1.0616399999999998e-07, "loss": 0.3935, "step": 223550 }, { "epoch": 2.2359999999999998, "grad_norm": 41.148807525634766, "learning_rate": 1.05964e-07, "loss": 0.5071, "step": 223600 }, { "epoch": 2.2365, "grad_norm": 119.54339599609375, "learning_rate": 1.0576399999999999e-07, "loss": 0.5073, "step": 223650 }, { "epoch": 2.237, "grad_norm": 4.346285343170166, "learning_rate": 1.05564e-07, "loss": 0.3924, "step": 223700 }, { "epoch": 2.2375, "grad_norm": 22.041217803955078, "learning_rate": 1.0536399999999999e-07, "loss": 0.4077, "step": 223750 }, { "epoch": 2.238, "grad_norm": 23.932233810424805, "learning_rate": 1.0516399999999999e-07, "loss": 0.4078, "step": 223800 }, { "epoch": 2.2385, "grad_norm": 92.13134765625, "learning_rate": 1.04964e-07, "loss": 0.3978, "step": 223850 }, { "epoch": 2.239, "grad_norm": 5.41093635559082, "learning_rate": 1.04764e-07, "loss": 0.3965, "step": 223900 }, { "epoch": 2.2395, "grad_norm": 32.597747802734375, "learning_rate": 1.04564e-07, "loss": 0.4309, "step": 223950 }, { "epoch": 2.24, "grad_norm": 76.27898406982422, "learning_rate": 1.04364e-07, "loss": 0.4534, "step": 224000 }, { "epoch": 2.2405, "grad_norm": 77.87999725341797, "learning_rate": 1.04164e-07, "loss": 0.364, "step": 224050 }, { "epoch": 2.241, "grad_norm": 83.9505615234375, "learning_rate": 1.0396399999999999e-07, "loss": 0.4332, "step": 224100 }, { "epoch": 2.2415, "grad_norm": 92.03340148925781, "learning_rate": 1.0376399999999999e-07, "loss": 0.4405, "step": 224150 }, { "epoch": 2.242, "grad_norm": 2.194648027420044, "learning_rate": 1.03564e-07, "loss": 0.5773, "step": 224200 }, { "epoch": 2.2425, "grad_norm": 11.85653018951416, "learning_rate": 1.03364e-07, "loss": 0.2945, "step": 224250 }, { "epoch": 2.243, "grad_norm": 77.39554595947266, "learning_rate": 1.03164e-07, "loss": 0.382, "step": 224300 }, { "epoch": 2.2435, "grad_norm": 85.84123229980469, "learning_rate": 1.02964e-07, "loss": 0.3054, "step": 224350 }, { "epoch": 2.2439999999999998, "grad_norm": 95.42002868652344, "learning_rate": 1.0276399999999999e-07, "loss": 0.4038, "step": 224400 }, { "epoch": 2.2445, "grad_norm": 1.0675559043884277, "learning_rate": 1.0256399999999999e-07, "loss": 0.3923, "step": 224450 }, { "epoch": 2.245, "grad_norm": 68.109130859375, "learning_rate": 1.0236399999999999e-07, "loss": 0.5414, "step": 224500 }, { "epoch": 2.2455, "grad_norm": 14.229341506958008, "learning_rate": 1.02164e-07, "loss": 0.4831, "step": 224550 }, { "epoch": 2.246, "grad_norm": 53.71510314941406, "learning_rate": 1.01964e-07, "loss": 0.4338, "step": 224600 }, { "epoch": 2.2465, "grad_norm": 44.91522216796875, "learning_rate": 1.0176399999999998e-07, "loss": 0.4844, "step": 224650 }, { "epoch": 2.247, "grad_norm": 138.286865234375, "learning_rate": 1.01564e-07, "loss": 0.361, "step": 224700 }, { "epoch": 2.2475, "grad_norm": 43.51007080078125, "learning_rate": 1.01364e-07, "loss": 0.4108, "step": 224750 }, { "epoch": 2.248, "grad_norm": 7.015733242034912, "learning_rate": 1.01164e-07, "loss": 0.3845, "step": 224800 }, { "epoch": 2.2485, "grad_norm": 125.538330078125, "learning_rate": 1.0096399999999999e-07, "loss": 0.463, "step": 224850 }, { "epoch": 2.249, "grad_norm": 7.6888346672058105, "learning_rate": 1.00764e-07, "loss": 0.4631, "step": 224900 }, { "epoch": 2.2495, "grad_norm": 6.268250465393066, "learning_rate": 1.00564e-07, "loss": 0.2814, "step": 224950 }, { "epoch": 2.25, "grad_norm": 1.0146398544311523, "learning_rate": 1.0036399999999998e-07, "loss": 0.4237, "step": 225000 }, { "epoch": 2.2505, "grad_norm": 33.02668380737305, "learning_rate": 1.00164e-07, "loss": 0.3576, "step": 225050 }, { "epoch": 2.251, "grad_norm": 97.76544189453125, "learning_rate": 9.9964e-08, "loss": 0.3949, "step": 225100 }, { "epoch": 2.2515, "grad_norm": 78.87450408935547, "learning_rate": 9.9764e-08, "loss": 0.2917, "step": 225150 }, { "epoch": 2.252, "grad_norm": 108.28045654296875, "learning_rate": 9.956399999999999e-08, "loss": 0.5591, "step": 225200 }, { "epoch": 2.2525, "grad_norm": 65.84611511230469, "learning_rate": 9.936399999999999e-08, "loss": 0.3784, "step": 225250 }, { "epoch": 2.253, "grad_norm": 20.012842178344727, "learning_rate": 9.9164e-08, "loss": 0.4402, "step": 225300 }, { "epoch": 2.2535, "grad_norm": 1.915622353553772, "learning_rate": 9.896399999999999e-08, "loss": 0.4123, "step": 225350 }, { "epoch": 2.254, "grad_norm": 19.519975662231445, "learning_rate": 9.8764e-08, "loss": 0.3652, "step": 225400 }, { "epoch": 2.2545, "grad_norm": 87.41107940673828, "learning_rate": 9.8564e-08, "loss": 0.6044, "step": 225450 }, { "epoch": 2.255, "grad_norm": 29.000362396240234, "learning_rate": 9.836399999999999e-08, "loss": 0.4545, "step": 225500 }, { "epoch": 2.2555, "grad_norm": 70.12849426269531, "learning_rate": 9.816399999999999e-08, "loss": 0.4011, "step": 225550 }, { "epoch": 2.2560000000000002, "grad_norm": 0.5577579140663147, "learning_rate": 9.796399999999999e-08, "loss": 0.4326, "step": 225600 }, { "epoch": 2.2565, "grad_norm": 2.2879648208618164, "learning_rate": 9.7764e-08, "loss": 0.3948, "step": 225650 }, { "epoch": 2.257, "grad_norm": 0.7911091446876526, "learning_rate": 9.7564e-08, "loss": 0.5271, "step": 225700 }, { "epoch": 2.2575, "grad_norm": 52.76624298095703, "learning_rate": 9.7364e-08, "loss": 0.5534, "step": 225750 }, { "epoch": 2.258, "grad_norm": 53.3375244140625, "learning_rate": 9.7164e-08, "loss": 0.4334, "step": 225800 }, { "epoch": 2.2585, "grad_norm": 27.924394607543945, "learning_rate": 9.6964e-08, "loss": 0.2814, "step": 225850 }, { "epoch": 2.259, "grad_norm": 48.85221481323242, "learning_rate": 9.676399999999999e-08, "loss": 0.3435, "step": 225900 }, { "epoch": 2.2595, "grad_norm": 7.6830549240112305, "learning_rate": 9.656399999999999e-08, "loss": 0.5507, "step": 225950 }, { "epoch": 2.26, "grad_norm": 2.447178840637207, "learning_rate": 9.6364e-08, "loss": 0.329, "step": 226000 }, { "epoch": 2.2605, "grad_norm": 39.04615783691406, "learning_rate": 9.6164e-08, "loss": 0.4663, "step": 226050 }, { "epoch": 2.261, "grad_norm": 4.852475643157959, "learning_rate": 9.596399999999999e-08, "loss": 0.5203, "step": 226100 }, { "epoch": 2.2615, "grad_norm": 89.7945556640625, "learning_rate": 9.5764e-08, "loss": 0.4516, "step": 226150 }, { "epoch": 2.262, "grad_norm": 35.014644622802734, "learning_rate": 9.5564e-08, "loss": 0.3496, "step": 226200 }, { "epoch": 2.2625, "grad_norm": 57.93213653564453, "learning_rate": 9.5364e-08, "loss": 0.4799, "step": 226250 }, { "epoch": 2.263, "grad_norm": 39.42817306518555, "learning_rate": 9.516399999999999e-08, "loss": 0.4357, "step": 226300 }, { "epoch": 2.2635, "grad_norm": 41.9278450012207, "learning_rate": 9.4964e-08, "loss": 0.437, "step": 226350 }, { "epoch": 2.2640000000000002, "grad_norm": 41.00620651245117, "learning_rate": 9.4764e-08, "loss": 0.417, "step": 226400 }, { "epoch": 2.2645, "grad_norm": 90.83309936523438, "learning_rate": 9.456399999999999e-08, "loss": 0.4885, "step": 226450 }, { "epoch": 2.265, "grad_norm": 36.397216796875, "learning_rate": 9.4364e-08, "loss": 0.4193, "step": 226500 }, { "epoch": 2.2655, "grad_norm": 43.22880554199219, "learning_rate": 9.4164e-08, "loss": 0.4173, "step": 226550 }, { "epoch": 2.266, "grad_norm": 144.54409790039062, "learning_rate": 9.396400000000001e-08, "loss": 0.3163, "step": 226600 }, { "epoch": 2.2665, "grad_norm": 0.3794969916343689, "learning_rate": 9.376399999999999e-08, "loss": 0.3984, "step": 226650 }, { "epoch": 2.267, "grad_norm": 48.13538360595703, "learning_rate": 9.356399999999999e-08, "loss": 0.3953, "step": 226700 }, { "epoch": 2.2675, "grad_norm": 89.7881088256836, "learning_rate": 9.3364e-08, "loss": 0.5402, "step": 226750 }, { "epoch": 2.268, "grad_norm": 87.7471923828125, "learning_rate": 9.316399999999999e-08, "loss": 0.438, "step": 226800 }, { "epoch": 2.2685, "grad_norm": 22.916074752807617, "learning_rate": 9.2964e-08, "loss": 0.3621, "step": 226850 }, { "epoch": 2.269, "grad_norm": 23.842510223388672, "learning_rate": 9.2764e-08, "loss": 0.351, "step": 226900 }, { "epoch": 2.2695, "grad_norm": 86.34049224853516, "learning_rate": 9.2564e-08, "loss": 0.4619, "step": 226950 }, { "epoch": 2.27, "grad_norm": 34.528133392333984, "learning_rate": 9.236399999999999e-08, "loss": 0.367, "step": 227000 }, { "epoch": 2.2705, "grad_norm": 42.39723205566406, "learning_rate": 9.216399999999999e-08, "loss": 0.4345, "step": 227050 }, { "epoch": 2.271, "grad_norm": 4.247285842895508, "learning_rate": 9.1964e-08, "loss": 0.6515, "step": 227100 }, { "epoch": 2.2715, "grad_norm": 34.562137603759766, "learning_rate": 9.1764e-08, "loss": 0.4751, "step": 227150 }, { "epoch": 2.2720000000000002, "grad_norm": 0.823958158493042, "learning_rate": 9.1564e-08, "loss": 0.4003, "step": 227200 }, { "epoch": 2.2725, "grad_norm": 86.37012481689453, "learning_rate": 9.1364e-08, "loss": 0.4309, "step": 227250 }, { "epoch": 2.273, "grad_norm": 108.56483459472656, "learning_rate": 9.1164e-08, "loss": 0.5015, "step": 227300 }, { "epoch": 2.2735, "grad_norm": 73.98970031738281, "learning_rate": 9.096399999999999e-08, "loss": 0.3684, "step": 227350 }, { "epoch": 2.274, "grad_norm": 18.87233543395996, "learning_rate": 9.076399999999999e-08, "loss": 0.4713, "step": 227400 }, { "epoch": 2.2745, "grad_norm": 88.28274536132812, "learning_rate": 9.0564e-08, "loss": 0.5386, "step": 227450 }, { "epoch": 2.275, "grad_norm": 63.17116165161133, "learning_rate": 9.0364e-08, "loss": 0.4483, "step": 227500 }, { "epoch": 2.2755, "grad_norm": 28.934703826904297, "learning_rate": 9.016399999999999e-08, "loss": 0.4424, "step": 227550 }, { "epoch": 2.276, "grad_norm": 61.9217414855957, "learning_rate": 8.9964e-08, "loss": 0.4156, "step": 227600 }, { "epoch": 2.2765, "grad_norm": 75.80464172363281, "learning_rate": 8.9764e-08, "loss": 0.4462, "step": 227650 }, { "epoch": 2.277, "grad_norm": 9.887880325317383, "learning_rate": 8.956400000000001e-08, "loss": 0.4142, "step": 227700 }, { "epoch": 2.2775, "grad_norm": 46.1187858581543, "learning_rate": 8.936399999999999e-08, "loss": 0.3884, "step": 227750 }, { "epoch": 2.278, "grad_norm": 42.99738311767578, "learning_rate": 8.916399999999999e-08, "loss": 0.4486, "step": 227800 }, { "epoch": 2.2785, "grad_norm": 5.804699420928955, "learning_rate": 8.8968e-08, "loss": 0.3833, "step": 227850 }, { "epoch": 2.279, "grad_norm": 20.068557739257812, "learning_rate": 8.876799999999999e-08, "loss": 0.4945, "step": 227900 }, { "epoch": 2.2795, "grad_norm": 0.5861179828643799, "learning_rate": 8.856799999999999e-08, "loss": 0.4478, "step": 227950 }, { "epoch": 2.2800000000000002, "grad_norm": 0.990058958530426, "learning_rate": 8.8368e-08, "loss": 0.3093, "step": 228000 }, { "epoch": 2.2805, "grad_norm": 129.85740661621094, "learning_rate": 8.816799999999999e-08, "loss": 0.429, "step": 228050 }, { "epoch": 2.281, "grad_norm": 29.716796875, "learning_rate": 8.7968e-08, "loss": 0.3996, "step": 228100 }, { "epoch": 2.2815, "grad_norm": 58.8203239440918, "learning_rate": 8.7768e-08, "loss": 0.3056, "step": 228150 }, { "epoch": 2.282, "grad_norm": 41.287166595458984, "learning_rate": 8.7568e-08, "loss": 0.321, "step": 228200 }, { "epoch": 2.2824999999999998, "grad_norm": 9.633661270141602, "learning_rate": 8.736799999999999e-08, "loss": 0.52, "step": 228250 }, { "epoch": 2.283, "grad_norm": 35.10003662109375, "learning_rate": 8.716799999999999e-08, "loss": 0.4273, "step": 228300 }, { "epoch": 2.2835, "grad_norm": 8.269124031066895, "learning_rate": 8.6968e-08, "loss": 0.3968, "step": 228350 }, { "epoch": 2.284, "grad_norm": 47.246002197265625, "learning_rate": 8.6768e-08, "loss": 0.6276, "step": 228400 }, { "epoch": 2.2845, "grad_norm": 106.63582611083984, "learning_rate": 8.6568e-08, "loss": 0.3303, "step": 228450 }, { "epoch": 2.285, "grad_norm": 5.064298152923584, "learning_rate": 8.6368e-08, "loss": 0.46, "step": 228500 }, { "epoch": 2.2855, "grad_norm": 0.2619529962539673, "learning_rate": 8.6168e-08, "loss": 0.3292, "step": 228550 }, { "epoch": 2.286, "grad_norm": 7.505974292755127, "learning_rate": 8.596799999999999e-08, "loss": 0.4411, "step": 228600 }, { "epoch": 2.2865, "grad_norm": 69.65245056152344, "learning_rate": 8.576799999999999e-08, "loss": 0.4029, "step": 228650 }, { "epoch": 2.287, "grad_norm": 57.54338073730469, "learning_rate": 8.5568e-08, "loss": 0.4123, "step": 228700 }, { "epoch": 2.2875, "grad_norm": 78.23273468017578, "learning_rate": 8.5368e-08, "loss": 0.3677, "step": 228750 }, { "epoch": 2.288, "grad_norm": 43.50226593017578, "learning_rate": 8.516799999999999e-08, "loss": 0.4841, "step": 228800 }, { "epoch": 2.2885, "grad_norm": 160.9630126953125, "learning_rate": 8.4968e-08, "loss": 0.3561, "step": 228850 }, { "epoch": 2.289, "grad_norm": 106.46874237060547, "learning_rate": 8.4768e-08, "loss": 0.3973, "step": 228900 }, { "epoch": 2.2895, "grad_norm": 67.45094299316406, "learning_rate": 8.4568e-08, "loss": 0.498, "step": 228950 }, { "epoch": 2.29, "grad_norm": 100.94178771972656, "learning_rate": 8.436799999999999e-08, "loss": 0.542, "step": 229000 }, { "epoch": 2.2904999999999998, "grad_norm": 3.8312125205993652, "learning_rate": 8.4168e-08, "loss": 0.7298, "step": 229050 }, { "epoch": 2.291, "grad_norm": 100.0070571899414, "learning_rate": 8.3968e-08, "loss": 0.4864, "step": 229100 }, { "epoch": 2.2915, "grad_norm": 88.91947937011719, "learning_rate": 8.376799999999999e-08, "loss": 0.4731, "step": 229150 }, { "epoch": 2.292, "grad_norm": 11.505712509155273, "learning_rate": 8.3568e-08, "loss": 0.42, "step": 229200 }, { "epoch": 2.2925, "grad_norm": 50.03477478027344, "learning_rate": 8.3368e-08, "loss": 0.433, "step": 229250 }, { "epoch": 2.293, "grad_norm": 62.71595001220703, "learning_rate": 8.316800000000001e-08, "loss": 0.4322, "step": 229300 }, { "epoch": 2.2935, "grad_norm": 81.32498931884766, "learning_rate": 8.296799999999999e-08, "loss": 0.4962, "step": 229350 }, { "epoch": 2.294, "grad_norm": 80.8553466796875, "learning_rate": 8.276799999999999e-08, "loss": 0.3587, "step": 229400 }, { "epoch": 2.2945, "grad_norm": 77.47965240478516, "learning_rate": 8.2568e-08, "loss": 0.3187, "step": 229450 }, { "epoch": 2.295, "grad_norm": 77.37553405761719, "learning_rate": 8.236799999999999e-08, "loss": 0.5271, "step": 229500 }, { "epoch": 2.2955, "grad_norm": 0.49823853373527527, "learning_rate": 8.2168e-08, "loss": 0.3428, "step": 229550 }, { "epoch": 2.296, "grad_norm": 2.033798933029175, "learning_rate": 8.1968e-08, "loss": 0.32, "step": 229600 }, { "epoch": 2.2965, "grad_norm": 2.363724946975708, "learning_rate": 8.1768e-08, "loss": 0.2763, "step": 229650 }, { "epoch": 2.297, "grad_norm": 42.67679977416992, "learning_rate": 8.156799999999999e-08, "loss": 0.3813, "step": 229700 }, { "epoch": 2.2975, "grad_norm": 93.03001403808594, "learning_rate": 8.1372e-08, "loss": 0.4763, "step": 229750 }, { "epoch": 2.298, "grad_norm": 23.316856384277344, "learning_rate": 8.1172e-08, "loss": 0.3295, "step": 229800 }, { "epoch": 2.2984999999999998, "grad_norm": 57.193267822265625, "learning_rate": 8.097199999999999e-08, "loss": 0.5061, "step": 229850 }, { "epoch": 2.299, "grad_norm": 22.44532585144043, "learning_rate": 8.077199999999999e-08, "loss": 0.3112, "step": 229900 }, { "epoch": 2.2995, "grad_norm": 101.59669494628906, "learning_rate": 8.0572e-08, "loss": 0.3241, "step": 229950 }, { "epoch": 2.3, "grad_norm": 78.62955474853516, "learning_rate": 8.0372e-08, "loss": 0.4214, "step": 230000 }, { "epoch": 2.3005, "grad_norm": 55.72152328491211, "learning_rate": 8.017199999999998e-08, "loss": 0.3366, "step": 230050 }, { "epoch": 2.301, "grad_norm": 105.1781997680664, "learning_rate": 7.9972e-08, "loss": 0.5508, "step": 230100 }, { "epoch": 2.3015, "grad_norm": 32.745849609375, "learning_rate": 7.9772e-08, "loss": 0.3962, "step": 230150 }, { "epoch": 2.302, "grad_norm": 4.188356399536133, "learning_rate": 7.9572e-08, "loss": 0.3405, "step": 230200 }, { "epoch": 2.3025, "grad_norm": 86.73455047607422, "learning_rate": 7.937199999999999e-08, "loss": 0.309, "step": 230250 }, { "epoch": 2.303, "grad_norm": 96.70193481445312, "learning_rate": 7.9172e-08, "loss": 0.3716, "step": 230300 }, { "epoch": 2.3035, "grad_norm": 2.2225139141082764, "learning_rate": 7.8972e-08, "loss": 0.3604, "step": 230350 }, { "epoch": 2.304, "grad_norm": 105.9201431274414, "learning_rate": 7.877199999999999e-08, "loss": 0.4714, "step": 230400 }, { "epoch": 2.3045, "grad_norm": 83.0341567993164, "learning_rate": 7.8572e-08, "loss": 0.5562, "step": 230450 }, { "epoch": 2.305, "grad_norm": 2.381653070449829, "learning_rate": 7.8372e-08, "loss": 0.4385, "step": 230500 }, { "epoch": 2.3055, "grad_norm": 18.078433990478516, "learning_rate": 7.817200000000001e-08, "loss": 0.379, "step": 230550 }, { "epoch": 2.306, "grad_norm": 27.159372329711914, "learning_rate": 7.797199999999999e-08, "loss": 0.3722, "step": 230600 }, { "epoch": 2.3064999999999998, "grad_norm": 67.36116027832031, "learning_rate": 7.777199999999999e-08, "loss": 0.4527, "step": 230650 }, { "epoch": 2.307, "grad_norm": 54.01509475708008, "learning_rate": 7.7572e-08, "loss": 0.3647, "step": 230700 }, { "epoch": 2.3075, "grad_norm": 3.8897550106048584, "learning_rate": 7.737199999999999e-08, "loss": 0.5743, "step": 230750 }, { "epoch": 2.308, "grad_norm": 0.7189392447471619, "learning_rate": 7.7172e-08, "loss": 0.4053, "step": 230800 }, { "epoch": 2.3085, "grad_norm": 53.79637908935547, "learning_rate": 7.6972e-08, "loss": 0.6112, "step": 230850 }, { "epoch": 2.309, "grad_norm": 29.953990936279297, "learning_rate": 7.677200000000001e-08, "loss": 0.4461, "step": 230900 }, { "epoch": 2.3095, "grad_norm": 0.23817132413387299, "learning_rate": 7.657199999999999e-08, "loss": 0.4433, "step": 230950 }, { "epoch": 2.31, "grad_norm": 72.25556945800781, "learning_rate": 7.637199999999999e-08, "loss": 0.3299, "step": 231000 }, { "epoch": 2.3105, "grad_norm": 56.55588150024414, "learning_rate": 7.6172e-08, "loss": 0.4597, "step": 231050 }, { "epoch": 2.311, "grad_norm": 94.5429916381836, "learning_rate": 7.5972e-08, "loss": 0.3237, "step": 231100 }, { "epoch": 2.3115, "grad_norm": 65.64086151123047, "learning_rate": 7.5772e-08, "loss": 0.4391, "step": 231150 }, { "epoch": 2.312, "grad_norm": 57.61141586303711, "learning_rate": 7.5572e-08, "loss": 0.3361, "step": 231200 }, { "epoch": 2.3125, "grad_norm": 73.47406768798828, "learning_rate": 7.5372e-08, "loss": 0.4478, "step": 231250 }, { "epoch": 2.313, "grad_norm": 8.852334976196289, "learning_rate": 7.517199999999999e-08, "loss": 0.56, "step": 231300 }, { "epoch": 2.3135, "grad_norm": 86.10330200195312, "learning_rate": 7.497199999999999e-08, "loss": 0.3869, "step": 231350 }, { "epoch": 2.314, "grad_norm": 26.84288787841797, "learning_rate": 7.4772e-08, "loss": 0.4664, "step": 231400 }, { "epoch": 2.3145, "grad_norm": 28.752120971679688, "learning_rate": 7.4572e-08, "loss": 0.4008, "step": 231450 }, { "epoch": 2.315, "grad_norm": 12.051288604736328, "learning_rate": 7.437199999999999e-08, "loss": 0.4183, "step": 231500 }, { "epoch": 2.3155, "grad_norm": 60.44932556152344, "learning_rate": 7.4172e-08, "loss": 0.3753, "step": 231550 }, { "epoch": 2.316, "grad_norm": 105.76837158203125, "learning_rate": 7.3972e-08, "loss": 0.4456, "step": 231600 }, { "epoch": 2.3165, "grad_norm": 20.609785079956055, "learning_rate": 7.377200000000001e-08, "loss": 0.3352, "step": 231650 }, { "epoch": 2.317, "grad_norm": 23.5540828704834, "learning_rate": 7.357199999999999e-08, "loss": 0.4725, "step": 231700 }, { "epoch": 2.3175, "grad_norm": 9.055368423461914, "learning_rate": 7.3372e-08, "loss": 0.3954, "step": 231750 }, { "epoch": 2.318, "grad_norm": 0.28335410356521606, "learning_rate": 7.3172e-08, "loss": 0.3075, "step": 231800 }, { "epoch": 2.3185000000000002, "grad_norm": 142.01795959472656, "learning_rate": 7.297199999999999e-08, "loss": 0.454, "step": 231850 }, { "epoch": 2.319, "grad_norm": 69.494873046875, "learning_rate": 7.2772e-08, "loss": 0.3512, "step": 231900 }, { "epoch": 2.3195, "grad_norm": 144.35302734375, "learning_rate": 7.2572e-08, "loss": 0.3771, "step": 231950 }, { "epoch": 2.32, "grad_norm": 44.31450653076172, "learning_rate": 7.237200000000001e-08, "loss": 0.3828, "step": 232000 }, { "epoch": 2.3205, "grad_norm": 43.578590393066406, "learning_rate": 7.217199999999999e-08, "loss": 0.4416, "step": 232050 }, { "epoch": 2.321, "grad_norm": 0.5683667659759521, "learning_rate": 7.197199999999999e-08, "loss": 0.3824, "step": 232100 }, { "epoch": 2.3215, "grad_norm": 13.191206932067871, "learning_rate": 7.1772e-08, "loss": 0.4271, "step": 232150 }, { "epoch": 2.322, "grad_norm": 89.80245971679688, "learning_rate": 7.157199999999999e-08, "loss": 0.3694, "step": 232200 }, { "epoch": 2.3225, "grad_norm": 100.60424041748047, "learning_rate": 7.1372e-08, "loss": 0.4274, "step": 232250 }, { "epoch": 2.323, "grad_norm": 21.0079288482666, "learning_rate": 7.1172e-08, "loss": 0.4433, "step": 232300 }, { "epoch": 2.3235, "grad_norm": 80.20172882080078, "learning_rate": 7.0972e-08, "loss": 0.2979, "step": 232350 }, { "epoch": 2.324, "grad_norm": 22.903003692626953, "learning_rate": 7.0772e-08, "loss": 0.3579, "step": 232400 }, { "epoch": 2.3245, "grad_norm": 69.52226257324219, "learning_rate": 7.057199999999999e-08, "loss": 0.524, "step": 232450 }, { "epoch": 2.325, "grad_norm": 54.36795425415039, "learning_rate": 7.0372e-08, "loss": 0.5302, "step": 232500 }, { "epoch": 2.3255, "grad_norm": 3.2600345611572266, "learning_rate": 7.0172e-08, "loss": 0.3502, "step": 232550 }, { "epoch": 2.326, "grad_norm": 8.641088485717773, "learning_rate": 6.9972e-08, "loss": 0.4258, "step": 232600 }, { "epoch": 2.3265000000000002, "grad_norm": 0.2351902574300766, "learning_rate": 6.9772e-08, "loss": 0.4205, "step": 232650 }, { "epoch": 2.327, "grad_norm": 5.2298126220703125, "learning_rate": 6.9572e-08, "loss": 0.2792, "step": 232700 }, { "epoch": 2.3275, "grad_norm": 120.54157257080078, "learning_rate": 6.9372e-08, "loss": 0.3501, "step": 232750 }, { "epoch": 2.328, "grad_norm": 129.25265502929688, "learning_rate": 6.917199999999999e-08, "loss": 0.5156, "step": 232800 }, { "epoch": 2.3285, "grad_norm": 66.50567626953125, "learning_rate": 6.8972e-08, "loss": 0.5007, "step": 232850 }, { "epoch": 2.329, "grad_norm": 80.42890930175781, "learning_rate": 6.8772e-08, "loss": 0.4191, "step": 232900 }, { "epoch": 2.3295, "grad_norm": 84.22823333740234, "learning_rate": 6.857199999999999e-08, "loss": 0.4093, "step": 232950 }, { "epoch": 2.33, "grad_norm": 60.78300094604492, "learning_rate": 6.8372e-08, "loss": 0.3831, "step": 233000 }, { "epoch": 2.3305, "grad_norm": 9.814701080322266, "learning_rate": 6.8172e-08, "loss": 0.3795, "step": 233050 }, { "epoch": 2.331, "grad_norm": 89.87799072265625, "learning_rate": 6.797200000000001e-08, "loss": 0.4236, "step": 233100 }, { "epoch": 2.3315, "grad_norm": 51.194068908691406, "learning_rate": 6.777199999999999e-08, "loss": 0.3, "step": 233150 }, { "epoch": 2.332, "grad_norm": 40.99766159057617, "learning_rate": 6.757199999999999e-08, "loss": 0.5982, "step": 233200 }, { "epoch": 2.3325, "grad_norm": 97.21105194091797, "learning_rate": 6.7372e-08, "loss": 0.4268, "step": 233250 }, { "epoch": 2.333, "grad_norm": 77.22876739501953, "learning_rate": 6.717199999999999e-08, "loss": 0.3962, "step": 233300 }, { "epoch": 2.3335, "grad_norm": 24.11876106262207, "learning_rate": 6.6972e-08, "loss": 0.3584, "step": 233350 }, { "epoch": 2.334, "grad_norm": 49.48302459716797, "learning_rate": 6.6772e-08, "loss": 0.3827, "step": 233400 }, { "epoch": 2.3345000000000002, "grad_norm": 0.11236397176980972, "learning_rate": 6.657200000000001e-08, "loss": 0.2988, "step": 233450 }, { "epoch": 2.335, "grad_norm": 7.154104232788086, "learning_rate": 6.6372e-08, "loss": 0.4928, "step": 233500 }, { "epoch": 2.3355, "grad_norm": 2.846604824066162, "learning_rate": 6.617199999999999e-08, "loss": 0.4661, "step": 233550 }, { "epoch": 2.336, "grad_norm": 93.85885620117188, "learning_rate": 6.5972e-08, "loss": 0.5282, "step": 233600 }, { "epoch": 2.3365, "grad_norm": 14.930121421813965, "learning_rate": 6.577199999999999e-08, "loss": 0.3968, "step": 233650 }, { "epoch": 2.337, "grad_norm": 50.37242889404297, "learning_rate": 6.5572e-08, "loss": 0.4821, "step": 233700 }, { "epoch": 2.3375, "grad_norm": 7.079569339752197, "learning_rate": 6.5372e-08, "loss": 0.4573, "step": 233750 }, { "epoch": 2.338, "grad_norm": 59.11067581176758, "learning_rate": 6.5172e-08, "loss": 0.4407, "step": 233800 }, { "epoch": 2.3385, "grad_norm": 106.90413665771484, "learning_rate": 6.4972e-08, "loss": 0.5134, "step": 233850 }, { "epoch": 2.339, "grad_norm": 88.19550323486328, "learning_rate": 6.477199999999999e-08, "loss": 0.3665, "step": 233900 }, { "epoch": 2.3395, "grad_norm": 3.8314621448516846, "learning_rate": 6.4572e-08, "loss": 0.3177, "step": 233950 }, { "epoch": 2.34, "grad_norm": 78.19933319091797, "learning_rate": 6.4372e-08, "loss": 0.4219, "step": 234000 }, { "epoch": 2.3405, "grad_norm": 62.778011322021484, "learning_rate": 6.4172e-08, "loss": 0.3997, "step": 234050 }, { "epoch": 2.341, "grad_norm": 73.71574401855469, "learning_rate": 6.3972e-08, "loss": 0.3116, "step": 234100 }, { "epoch": 2.3415, "grad_norm": 54.49970245361328, "learning_rate": 6.3772e-08, "loss": 0.4426, "step": 234150 }, { "epoch": 2.342, "grad_norm": 63.628170013427734, "learning_rate": 6.3572e-08, "loss": 0.4592, "step": 234200 }, { "epoch": 2.3425000000000002, "grad_norm": 69.60954284667969, "learning_rate": 6.337199999999999e-08, "loss": 0.3635, "step": 234250 }, { "epoch": 2.343, "grad_norm": 52.40366744995117, "learning_rate": 6.3172e-08, "loss": 0.538, "step": 234300 }, { "epoch": 2.3435, "grad_norm": 86.80823516845703, "learning_rate": 6.2972e-08, "loss": 0.3849, "step": 234350 }, { "epoch": 2.344, "grad_norm": 67.82777404785156, "learning_rate": 6.277199999999999e-08, "loss": 0.5053, "step": 234400 }, { "epoch": 2.3445, "grad_norm": 63.045921325683594, "learning_rate": 6.2572e-08, "loss": 0.3757, "step": 234450 }, { "epoch": 2.3449999999999998, "grad_norm": 64.31387329101562, "learning_rate": 6.2372e-08, "loss": 0.4009, "step": 234500 }, { "epoch": 2.3455, "grad_norm": 43.08381652832031, "learning_rate": 6.2172e-08, "loss": 0.4228, "step": 234550 }, { "epoch": 2.346, "grad_norm": 25.41963768005371, "learning_rate": 6.1972e-08, "loss": 0.3963, "step": 234600 }, { "epoch": 2.3465, "grad_norm": 45.17002487182617, "learning_rate": 6.177199999999999e-08, "loss": 0.4244, "step": 234650 }, { "epoch": 2.347, "grad_norm": 64.86937713623047, "learning_rate": 6.1572e-08, "loss": 0.433, "step": 234700 }, { "epoch": 2.3475, "grad_norm": 122.25826263427734, "learning_rate": 6.1372e-08, "loss": 0.5875, "step": 234750 }, { "epoch": 2.348, "grad_norm": 1.6496562957763672, "learning_rate": 6.1172e-08, "loss": 0.3615, "step": 234800 }, { "epoch": 2.3485, "grad_norm": 34.062599182128906, "learning_rate": 6.0972e-08, "loss": 0.4072, "step": 234850 }, { "epoch": 2.349, "grad_norm": 1.5793612003326416, "learning_rate": 6.0772e-08, "loss": 0.4464, "step": 234900 }, { "epoch": 2.3495, "grad_norm": 17.111398696899414, "learning_rate": 6.0572e-08, "loss": 0.3973, "step": 234950 }, { "epoch": 2.35, "grad_norm": 17.124202728271484, "learning_rate": 6.037199999999999e-08, "loss": 0.4139, "step": 235000 }, { "epoch": 2.3505, "grad_norm": 20.528778076171875, "learning_rate": 6.0172e-08, "loss": 0.4128, "step": 235050 }, { "epoch": 2.351, "grad_norm": 9.12598991394043, "learning_rate": 5.997199999999999e-08, "loss": 0.429, "step": 235100 }, { "epoch": 2.3515, "grad_norm": 89.71222686767578, "learning_rate": 5.9772e-08, "loss": 0.4243, "step": 235150 }, { "epoch": 2.352, "grad_norm": 8.675117492675781, "learning_rate": 5.9576e-08, "loss": 0.5467, "step": 235200 }, { "epoch": 2.3525, "grad_norm": 82.99100494384766, "learning_rate": 5.9375999999999995e-08, "loss": 0.3422, "step": 235250 }, { "epoch": 2.3529999999999998, "grad_norm": 77.30717468261719, "learning_rate": 5.9176e-08, "loss": 0.4311, "step": 235300 }, { "epoch": 2.3535, "grad_norm": 58.811668395996094, "learning_rate": 5.8976e-08, "loss": 0.2901, "step": 235350 }, { "epoch": 2.354, "grad_norm": 18.311758041381836, "learning_rate": 5.8775999999999996e-08, "loss": 0.602, "step": 235400 }, { "epoch": 2.3545, "grad_norm": 54.06341552734375, "learning_rate": 5.8576e-08, "loss": 0.3226, "step": 235450 }, { "epoch": 2.355, "grad_norm": 3.558945655822754, "learning_rate": 5.837599999999999e-08, "loss": 0.5484, "step": 235500 }, { "epoch": 2.3555, "grad_norm": 40.88227081298828, "learning_rate": 5.8176e-08, "loss": 0.2963, "step": 235550 }, { "epoch": 2.356, "grad_norm": 4.487430572509766, "learning_rate": 5.7979999999999994e-08, "loss": 0.4298, "step": 235600 }, { "epoch": 2.3565, "grad_norm": 83.76229858398438, "learning_rate": 5.778e-08, "loss": 0.4383, "step": 235650 }, { "epoch": 2.357, "grad_norm": 25.388702392578125, "learning_rate": 5.758e-08, "loss": 0.4707, "step": 235700 }, { "epoch": 2.3575, "grad_norm": 21.608808517456055, "learning_rate": 5.7379999999999995e-08, "loss": 0.3404, "step": 235750 }, { "epoch": 2.358, "grad_norm": 95.14769744873047, "learning_rate": 5.718e-08, "loss": 0.4012, "step": 235800 }, { "epoch": 2.3585, "grad_norm": 29.87248420715332, "learning_rate": 5.698e-08, "loss": 0.6228, "step": 235850 }, { "epoch": 2.359, "grad_norm": 119.35411834716797, "learning_rate": 5.6779999999999996e-08, "loss": 0.4535, "step": 235900 }, { "epoch": 2.3595, "grad_norm": 22.905563354492188, "learning_rate": 5.6579999999999994e-08, "loss": 0.2687, "step": 235950 }, { "epoch": 2.36, "grad_norm": 112.05744934082031, "learning_rate": 5.638e-08, "loss": 0.3915, "step": 236000 }, { "epoch": 2.3605, "grad_norm": 51.91557693481445, "learning_rate": 5.618e-08, "loss": 0.4123, "step": 236050 }, { "epoch": 2.3609999999999998, "grad_norm": 35.96802520751953, "learning_rate": 5.598e-08, "loss": 0.3404, "step": 236100 }, { "epoch": 2.3615, "grad_norm": 39.31023406982422, "learning_rate": 5.578e-08, "loss": 0.362, "step": 236150 }, { "epoch": 2.362, "grad_norm": 100.35054016113281, "learning_rate": 5.557999999999999e-08, "loss": 0.5586, "step": 236200 }, { "epoch": 2.3625, "grad_norm": 0.3581673502922058, "learning_rate": 5.538e-08, "loss": 0.3348, "step": 236250 }, { "epoch": 2.363, "grad_norm": 7.026209831237793, "learning_rate": 5.5179999999999995e-08, "loss": 0.4421, "step": 236300 }, { "epoch": 2.3635, "grad_norm": 1.0426843166351318, "learning_rate": 5.498e-08, "loss": 0.4958, "step": 236350 }, { "epoch": 2.364, "grad_norm": 86.80310821533203, "learning_rate": 5.478e-08, "loss": 0.3704, "step": 236400 }, { "epoch": 2.3645, "grad_norm": 99.3370590209961, "learning_rate": 5.4579999999999996e-08, "loss": 0.5521, "step": 236450 }, { "epoch": 2.365, "grad_norm": 73.29484558105469, "learning_rate": 5.4384e-08, "loss": 0.4607, "step": 236500 }, { "epoch": 2.3655, "grad_norm": 94.66316986083984, "learning_rate": 5.4184e-08, "loss": 0.4448, "step": 236550 }, { "epoch": 2.366, "grad_norm": 4.878780364990234, "learning_rate": 5.3983999999999996e-08, "loss": 0.3542, "step": 236600 }, { "epoch": 2.3665, "grad_norm": 38.552696228027344, "learning_rate": 5.3783999999999994e-08, "loss": 0.3363, "step": 236650 }, { "epoch": 2.367, "grad_norm": 106.73291015625, "learning_rate": 5.3584e-08, "loss": 0.3413, "step": 236700 }, { "epoch": 2.3675, "grad_norm": 42.5386848449707, "learning_rate": 5.3384e-08, "loss": 0.3877, "step": 236750 }, { "epoch": 2.368, "grad_norm": 53.35752487182617, "learning_rate": 5.3184e-08, "loss": 0.4686, "step": 236800 }, { "epoch": 2.3685, "grad_norm": 7.68386173248291, "learning_rate": 5.2984e-08, "loss": 0.4841, "step": 236850 }, { "epoch": 2.3689999999999998, "grad_norm": 121.2600326538086, "learning_rate": 5.2784e-08, "loss": 0.4746, "step": 236900 }, { "epoch": 2.3695, "grad_norm": 25.649150848388672, "learning_rate": 5.2583999999999996e-08, "loss": 0.365, "step": 236950 }, { "epoch": 2.37, "grad_norm": 19.756162643432617, "learning_rate": 5.2383999999999994e-08, "loss": 0.3344, "step": 237000 }, { "epoch": 2.3705, "grad_norm": 20.484275817871094, "learning_rate": 5.2184e-08, "loss": 0.5115, "step": 237050 }, { "epoch": 2.371, "grad_norm": 111.54764556884766, "learning_rate": 5.1984e-08, "loss": 0.3719, "step": 237100 }, { "epoch": 2.3715, "grad_norm": 2.243807315826416, "learning_rate": 5.1783999999999996e-08, "loss": 0.4457, "step": 237150 }, { "epoch": 2.372, "grad_norm": 76.7511978149414, "learning_rate": 5.1583999999999994e-08, "loss": 0.3587, "step": 237200 }, { "epoch": 2.3725, "grad_norm": 82.79983520507812, "learning_rate": 5.1384e-08, "loss": 0.4723, "step": 237250 }, { "epoch": 2.373, "grad_norm": 76.18513488769531, "learning_rate": 5.1184e-08, "loss": 0.4799, "step": 237300 }, { "epoch": 2.3735, "grad_norm": 94.5965805053711, "learning_rate": 5.0984e-08, "loss": 0.4241, "step": 237350 }, { "epoch": 2.374, "grad_norm": 0.5150562524795532, "learning_rate": 5.0784e-08, "loss": 0.4691, "step": 237400 }, { "epoch": 2.3745, "grad_norm": 3.979444742202759, "learning_rate": 5.058399999999999e-08, "loss": 0.3656, "step": 237450 }, { "epoch": 2.375, "grad_norm": 49.238975524902344, "learning_rate": 5.0383999999999996e-08, "loss": 0.4712, "step": 237500 }, { "epoch": 2.3755, "grad_norm": 82.24160766601562, "learning_rate": 5.0183999999999995e-08, "loss": 0.3209, "step": 237550 }, { "epoch": 2.376, "grad_norm": 106.60427856445312, "learning_rate": 4.9984e-08, "loss": 0.5374, "step": 237600 }, { "epoch": 2.3765, "grad_norm": 5.549964904785156, "learning_rate": 4.9784e-08, "loss": 0.3677, "step": 237650 }, { "epoch": 2.377, "grad_norm": 106.1153564453125, "learning_rate": 4.9584e-08, "loss": 0.5558, "step": 237700 }, { "epoch": 2.3775, "grad_norm": 6.455342769622803, "learning_rate": 4.9383999999999994e-08, "loss": 0.53, "step": 237750 }, { "epoch": 2.378, "grad_norm": 28.48943519592285, "learning_rate": 4.9184e-08, "loss": 0.618, "step": 237800 }, { "epoch": 2.3785, "grad_norm": 3.647404432296753, "learning_rate": 4.8984e-08, "loss": 0.5573, "step": 237850 }, { "epoch": 2.379, "grad_norm": 3.0441486835479736, "learning_rate": 4.8784e-08, "loss": 0.3326, "step": 237900 }, { "epoch": 2.3795, "grad_norm": 44.93407440185547, "learning_rate": 4.8584e-08, "loss": 0.3113, "step": 237950 }, { "epoch": 2.38, "grad_norm": 73.76074981689453, "learning_rate": 4.8384e-08, "loss": 0.3362, "step": 238000 }, { "epoch": 2.3805, "grad_norm": 7.347684383392334, "learning_rate": 4.8183999999999997e-08, "loss": 0.4527, "step": 238050 }, { "epoch": 2.3810000000000002, "grad_norm": 61.60081481933594, "learning_rate": 4.7983999999999995e-08, "loss": 0.464, "step": 238100 }, { "epoch": 2.3815, "grad_norm": 11.561792373657227, "learning_rate": 4.7784e-08, "loss": 0.5532, "step": 238150 }, { "epoch": 2.382, "grad_norm": 1.321579933166504, "learning_rate": 4.7584e-08, "loss": 0.4313, "step": 238200 }, { "epoch": 2.3825, "grad_norm": 35.636898040771484, "learning_rate": 4.7384e-08, "loss": 0.4452, "step": 238250 }, { "epoch": 2.383, "grad_norm": 82.60275268554688, "learning_rate": 4.7183999999999994e-08, "loss": 0.4888, "step": 238300 }, { "epoch": 2.3835, "grad_norm": 10.286293029785156, "learning_rate": 4.6984e-08, "loss": 0.328, "step": 238350 }, { "epoch": 2.384, "grad_norm": 126.57600402832031, "learning_rate": 4.6784e-08, "loss": 0.4364, "step": 238400 }, { "epoch": 2.3845, "grad_norm": 0.09401095658540726, "learning_rate": 4.6583999999999995e-08, "loss": 0.3141, "step": 238450 }, { "epoch": 2.385, "grad_norm": 42.13533401489258, "learning_rate": 4.6384e-08, "loss": 0.5262, "step": 238500 }, { "epoch": 2.3855, "grad_norm": 8.396677017211914, "learning_rate": 4.6184e-08, "loss": 0.3766, "step": 238550 }, { "epoch": 2.386, "grad_norm": 12.288100242614746, "learning_rate": 4.5984e-08, "loss": 0.4144, "step": 238600 }, { "epoch": 2.3865, "grad_norm": 81.89070892333984, "learning_rate": 4.5783999999999995e-08, "loss": 0.3493, "step": 238650 }, { "epoch": 2.387, "grad_norm": 104.77812194824219, "learning_rate": 4.5584e-08, "loss": 0.329, "step": 238700 }, { "epoch": 2.3875, "grad_norm": 111.44982147216797, "learning_rate": 4.5384e-08, "loss": 0.3988, "step": 238750 }, { "epoch": 2.388, "grad_norm": 96.20793151855469, "learning_rate": 4.5184e-08, "loss": 0.2894, "step": 238800 }, { "epoch": 2.3885, "grad_norm": 67.85469818115234, "learning_rate": 4.4984e-08, "loss": 0.3836, "step": 238850 }, { "epoch": 2.3890000000000002, "grad_norm": 53.94377899169922, "learning_rate": 4.478399999999999e-08, "loss": 0.4408, "step": 238900 }, { "epoch": 2.3895, "grad_norm": 174.99449157714844, "learning_rate": 4.4584e-08, "loss": 0.454, "step": 238950 }, { "epoch": 2.39, "grad_norm": 9.08309555053711, "learning_rate": 4.4383999999999996e-08, "loss": 0.4012, "step": 239000 }, { "epoch": 2.3905, "grad_norm": 7.337883949279785, "learning_rate": 4.4184e-08, "loss": 0.4567, "step": 239050 }, { "epoch": 2.391, "grad_norm": 118.68801879882812, "learning_rate": 4.3984e-08, "loss": 0.4913, "step": 239100 }, { "epoch": 2.3915, "grad_norm": 119.5619888305664, "learning_rate": 4.3784e-08, "loss": 0.3864, "step": 239150 }, { "epoch": 2.392, "grad_norm": 63.690818786621094, "learning_rate": 4.3583999999999995e-08, "loss": 0.3754, "step": 239200 }, { "epoch": 2.3925, "grad_norm": 15.803923606872559, "learning_rate": 4.3384e-08, "loss": 0.4492, "step": 239250 }, { "epoch": 2.393, "grad_norm": 13.975971221923828, "learning_rate": 4.3184e-08, "loss": 0.3106, "step": 239300 }, { "epoch": 2.3935, "grad_norm": 6.938758373260498, "learning_rate": 4.2983999999999996e-08, "loss": 0.3647, "step": 239350 }, { "epoch": 2.394, "grad_norm": 80.98222351074219, "learning_rate": 4.2784e-08, "loss": 0.365, "step": 239400 }, { "epoch": 2.3945, "grad_norm": 11.557998657226562, "learning_rate": 4.258399999999999e-08, "loss": 0.4769, "step": 239450 }, { "epoch": 2.395, "grad_norm": 20.222549438476562, "learning_rate": 4.2384e-08, "loss": 0.5143, "step": 239500 }, { "epoch": 2.3955, "grad_norm": 61.61232376098633, "learning_rate": 4.2183999999999996e-08, "loss": 0.4041, "step": 239550 }, { "epoch": 2.396, "grad_norm": 40.21515655517578, "learning_rate": 4.1988e-08, "loss": 0.4189, "step": 239600 }, { "epoch": 2.3965, "grad_norm": 39.18798065185547, "learning_rate": 4.1788e-08, "loss": 0.2413, "step": 239650 }, { "epoch": 2.3970000000000002, "grad_norm": 0.4177449345588684, "learning_rate": 4.1587999999999995e-08, "loss": 0.3866, "step": 239700 }, { "epoch": 2.3975, "grad_norm": 32.55744171142578, "learning_rate": 4.1388e-08, "loss": 0.3252, "step": 239750 }, { "epoch": 2.398, "grad_norm": 93.50446319580078, "learning_rate": 4.1188e-08, "loss": 0.5187, "step": 239800 }, { "epoch": 2.3985, "grad_norm": 21.071910858154297, "learning_rate": 4.0987999999999996e-08, "loss": 0.4274, "step": 239850 }, { "epoch": 2.399, "grad_norm": 156.43629455566406, "learning_rate": 4.0787999999999994e-08, "loss": 0.4526, "step": 239900 }, { "epoch": 2.3995, "grad_norm": 71.36151123046875, "learning_rate": 4.0588e-08, "loss": 0.4707, "step": 239950 }, { "epoch": 2.4, "grad_norm": 17.978219985961914, "learning_rate": 4.0388e-08, "loss": 0.4547, "step": 240000 }, { "epoch": 2.4005, "grad_norm": 17.83655548095703, "learning_rate": 4.0188e-08, "loss": 0.5657, "step": 240050 }, { "epoch": 2.401, "grad_norm": 139.66326904296875, "learning_rate": 3.9988e-08, "loss": 0.356, "step": 240100 }, { "epoch": 2.4015, "grad_norm": 20.223098754882812, "learning_rate": 3.978799999999999e-08, "loss": 0.382, "step": 240150 }, { "epoch": 2.402, "grad_norm": 2.374896287918091, "learning_rate": 3.9588e-08, "loss": 0.4235, "step": 240200 }, { "epoch": 2.4025, "grad_norm": 60.51106643676758, "learning_rate": 3.9387999999999995e-08, "loss": 0.6068, "step": 240250 }, { "epoch": 2.403, "grad_norm": 72.06840515136719, "learning_rate": 3.9188e-08, "loss": 0.474, "step": 240300 }, { "epoch": 2.4035, "grad_norm": 78.26203155517578, "learning_rate": 3.8988e-08, "loss": 0.4211, "step": 240350 }, { "epoch": 2.404, "grad_norm": 30.761234283447266, "learning_rate": 3.8788e-08, "loss": 0.2871, "step": 240400 }, { "epoch": 2.4045, "grad_norm": 20.948881149291992, "learning_rate": 3.8587999999999995e-08, "loss": 0.4948, "step": 240450 }, { "epoch": 2.4050000000000002, "grad_norm": 2.7093822956085205, "learning_rate": 3.8388e-08, "loss": 0.256, "step": 240500 }, { "epoch": 2.4055, "grad_norm": 8.741806983947754, "learning_rate": 3.8188e-08, "loss": 0.4368, "step": 240550 }, { "epoch": 2.406, "grad_norm": 13.452826499938965, "learning_rate": 3.7988e-08, "loss": 0.3249, "step": 240600 }, { "epoch": 2.4065, "grad_norm": 9.17270565032959, "learning_rate": 3.7788e-08, "loss": 0.434, "step": 240650 }, { "epoch": 2.407, "grad_norm": 44.4083366394043, "learning_rate": 3.7588e-08, "loss": 0.37, "step": 240700 }, { "epoch": 2.4074999999999998, "grad_norm": 5.229351997375488, "learning_rate": 3.7388e-08, "loss": 0.3604, "step": 240750 }, { "epoch": 2.408, "grad_norm": 113.33466339111328, "learning_rate": 3.7187999999999995e-08, "loss": 0.3738, "step": 240800 }, { "epoch": 2.4085, "grad_norm": 66.20381927490234, "learning_rate": 3.6988e-08, "loss": 0.3353, "step": 240850 }, { "epoch": 2.409, "grad_norm": 15.30612850189209, "learning_rate": 3.6788e-08, "loss": 0.2977, "step": 240900 }, { "epoch": 2.4095, "grad_norm": 0.48715662956237793, "learning_rate": 3.6588e-08, "loss": 0.3838, "step": 240950 }, { "epoch": 2.41, "grad_norm": 21.8796329498291, "learning_rate": 3.6387999999999995e-08, "loss": 0.5768, "step": 241000 }, { "epoch": 2.4105, "grad_norm": 58.25382995605469, "learning_rate": 3.6188e-08, "loss": 0.222, "step": 241050 }, { "epoch": 2.411, "grad_norm": 75.87273406982422, "learning_rate": 3.5988e-08, "loss": 0.3897, "step": 241100 }, { "epoch": 2.4115, "grad_norm": 132.18612670898438, "learning_rate": 3.5787999999999996e-08, "loss": 0.4991, "step": 241150 }, { "epoch": 2.412, "grad_norm": 93.72396087646484, "learning_rate": 3.5588e-08, "loss": 0.5212, "step": 241200 }, { "epoch": 2.4125, "grad_norm": 73.5746841430664, "learning_rate": 3.5388e-08, "loss": 0.4832, "step": 241250 }, { "epoch": 2.413, "grad_norm": 0.44722920656204224, "learning_rate": 3.5188e-08, "loss": 0.3307, "step": 241300 }, { "epoch": 2.4135, "grad_norm": 12.440936088562012, "learning_rate": 3.4987999999999995e-08, "loss": 0.4114, "step": 241350 }, { "epoch": 2.414, "grad_norm": 5.505441665649414, "learning_rate": 3.4788e-08, "loss": 0.4136, "step": 241400 }, { "epoch": 2.4145, "grad_norm": 54.741905212402344, "learning_rate": 3.4588e-08, "loss": 0.4845, "step": 241450 }, { "epoch": 2.415, "grad_norm": 3.8523218631744385, "learning_rate": 3.4388e-08, "loss": 0.3512, "step": 241500 }, { "epoch": 2.4154999999999998, "grad_norm": 60.24558639526367, "learning_rate": 3.4188e-08, "loss": 0.43, "step": 241550 }, { "epoch": 2.416, "grad_norm": 103.20320129394531, "learning_rate": 3.398799999999999e-08, "loss": 0.5293, "step": 241600 }, { "epoch": 2.4165, "grad_norm": 43.67532730102539, "learning_rate": 3.3788e-08, "loss": 0.4799, "step": 241650 }, { "epoch": 2.417, "grad_norm": 12.198750495910645, "learning_rate": 3.3587999999999996e-08, "loss": 0.4764, "step": 241700 }, { "epoch": 2.4175, "grad_norm": 44.76414489746094, "learning_rate": 3.3388e-08, "loss": 0.3694, "step": 241750 }, { "epoch": 2.418, "grad_norm": 93.81781005859375, "learning_rate": 3.3188e-08, "loss": 0.4577, "step": 241800 }, { "epoch": 2.4185, "grad_norm": 86.02700805664062, "learning_rate": 3.2988000000000004e-08, "loss": 0.337, "step": 241850 }, { "epoch": 2.419, "grad_norm": 23.790729522705078, "learning_rate": 3.2787999999999996e-08, "loss": 0.3581, "step": 241900 }, { "epoch": 2.4195, "grad_norm": 80.7553482055664, "learning_rate": 3.2588e-08, "loss": 0.4819, "step": 241950 }, { "epoch": 2.42, "grad_norm": 100.54222106933594, "learning_rate": 3.2388e-08, "loss": 0.3009, "step": 242000 }, { "epoch": 2.4205, "grad_norm": 69.31678009033203, "learning_rate": 3.2188e-08, "loss": 0.414, "step": 242050 }, { "epoch": 2.421, "grad_norm": 8.001131057739258, "learning_rate": 3.1988e-08, "loss": 0.5114, "step": 242100 }, { "epoch": 2.4215, "grad_norm": 96.0322036743164, "learning_rate": 3.178799999999999e-08, "loss": 0.4587, "step": 242150 }, { "epoch": 2.422, "grad_norm": 8.513936042785645, "learning_rate": 3.1588e-08, "loss": 0.3816, "step": 242200 }, { "epoch": 2.4225, "grad_norm": 53.89067077636719, "learning_rate": 3.1387999999999996e-08, "loss": 0.4928, "step": 242250 }, { "epoch": 2.423, "grad_norm": 98.1548080444336, "learning_rate": 3.1188e-08, "loss": 0.4295, "step": 242300 }, { "epoch": 2.4234999999999998, "grad_norm": 4.35904598236084, "learning_rate": 3.0988e-08, "loss": 0.4473, "step": 242350 }, { "epoch": 2.424, "grad_norm": 72.66747283935547, "learning_rate": 3.0788e-08, "loss": 0.4041, "step": 242400 }, { "epoch": 2.4245, "grad_norm": 275.8028259277344, "learning_rate": 3.0588e-08, "loss": 0.384, "step": 242450 }, { "epoch": 2.425, "grad_norm": 74.81790924072266, "learning_rate": 3.0387999999999994e-08, "loss": 0.4163, "step": 242500 }, { "epoch": 2.4255, "grad_norm": 2.4196512699127197, "learning_rate": 3.0188e-08, "loss": 0.4067, "step": 242550 }, { "epoch": 2.426, "grad_norm": 60.739994049072266, "learning_rate": 2.9988e-08, "loss": 0.4061, "step": 242600 }, { "epoch": 2.4265, "grad_norm": 11.086175918579102, "learning_rate": 2.9788e-08, "loss": 0.343, "step": 242650 }, { "epoch": 2.427, "grad_norm": 9.039807319641113, "learning_rate": 2.9588e-08, "loss": 0.3752, "step": 242700 }, { "epoch": 2.4275, "grad_norm": 1.2201632261276245, "learning_rate": 2.9387999999999998e-08, "loss": 0.3769, "step": 242750 }, { "epoch": 2.428, "grad_norm": 46.109474182128906, "learning_rate": 2.9187999999999996e-08, "loss": 0.4977, "step": 242800 }, { "epoch": 2.4285, "grad_norm": 4.451110363006592, "learning_rate": 2.8987999999999998e-08, "loss": 0.4644, "step": 242850 }, { "epoch": 2.429, "grad_norm": 78.92597961425781, "learning_rate": 2.8788e-08, "loss": 0.3742, "step": 242900 }, { "epoch": 2.4295, "grad_norm": 4.9124908447265625, "learning_rate": 2.8587999999999998e-08, "loss": 0.3717, "step": 242950 }, { "epoch": 2.43, "grad_norm": 6.718141555786133, "learning_rate": 2.8388e-08, "loss": 0.4299, "step": 243000 }, { "epoch": 2.4305, "grad_norm": 95.63214874267578, "learning_rate": 2.8188e-08, "loss": 0.3924, "step": 243050 }, { "epoch": 2.431, "grad_norm": 7.528963088989258, "learning_rate": 2.7988e-08, "loss": 0.5847, "step": 243100 }, { "epoch": 2.4314999999999998, "grad_norm": 2.7887654304504395, "learning_rate": 2.7788e-08, "loss": 0.3462, "step": 243150 }, { "epoch": 2.432, "grad_norm": 17.53578758239746, "learning_rate": 2.7588e-08, "loss": 0.4067, "step": 243200 }, { "epoch": 2.4325, "grad_norm": 88.1779556274414, "learning_rate": 2.7387999999999997e-08, "loss": 0.3385, "step": 243250 }, { "epoch": 2.433, "grad_norm": 100.06237030029297, "learning_rate": 2.7187999999999998e-08, "loss": 0.4189, "step": 243300 }, { "epoch": 2.4335, "grad_norm": 89.0728759765625, "learning_rate": 2.6988e-08, "loss": 0.3789, "step": 243350 }, { "epoch": 2.434, "grad_norm": 0.17192931473255157, "learning_rate": 2.6787999999999998e-08, "loss": 0.3695, "step": 243400 }, { "epoch": 2.4345, "grad_norm": 8.71643352508545, "learning_rate": 2.6588e-08, "loss": 0.5545, "step": 243450 }, { "epoch": 2.435, "grad_norm": 14.25753116607666, "learning_rate": 2.6388e-08, "loss": 0.4481, "step": 243500 }, { "epoch": 2.4355, "grad_norm": 29.616668701171875, "learning_rate": 2.6188e-08, "loss": 0.2906, "step": 243550 }, { "epoch": 2.436, "grad_norm": 126.89151000976562, "learning_rate": 2.5988e-08, "loss": 0.5021, "step": 243600 }, { "epoch": 2.4365, "grad_norm": 36.65034103393555, "learning_rate": 2.5787999999999996e-08, "loss": 0.4752, "step": 243650 }, { "epoch": 2.437, "grad_norm": 52.342201232910156, "learning_rate": 2.5587999999999997e-08, "loss": 0.4993, "step": 243700 }, { "epoch": 2.4375, "grad_norm": 14.278721809387207, "learning_rate": 2.5388e-08, "loss": 0.4432, "step": 243750 }, { "epoch": 2.438, "grad_norm": 100.52146911621094, "learning_rate": 2.5187999999999997e-08, "loss": 0.4158, "step": 243800 }, { "epoch": 2.4385, "grad_norm": 1.8151702880859375, "learning_rate": 2.4988e-08, "loss": 0.3256, "step": 243850 }, { "epoch": 2.439, "grad_norm": 2.068545341491699, "learning_rate": 2.4788e-08, "loss": 0.3985, "step": 243900 }, { "epoch": 2.4395, "grad_norm": 114.05219268798828, "learning_rate": 2.4587999999999998e-08, "loss": 0.5039, "step": 243950 }, { "epoch": 2.44, "grad_norm": 22.46074104309082, "learning_rate": 2.4388e-08, "loss": 0.3068, "step": 244000 }, { "epoch": 2.4405, "grad_norm": 49.89002990722656, "learning_rate": 2.4188e-08, "loss": 0.5444, "step": 244050 }, { "epoch": 2.441, "grad_norm": 116.9295883178711, "learning_rate": 2.3988e-08, "loss": 0.401, "step": 244100 }, { "epoch": 2.4415, "grad_norm": 159.10597229003906, "learning_rate": 2.3787999999999998e-08, "loss": 0.5854, "step": 244150 }, { "epoch": 2.442, "grad_norm": 72.9452896118164, "learning_rate": 2.3588e-08, "loss": 0.5169, "step": 244200 }, { "epoch": 2.4425, "grad_norm": 59.4945068359375, "learning_rate": 2.3387999999999997e-08, "loss": 0.5033, "step": 244250 }, { "epoch": 2.443, "grad_norm": 110.84677124023438, "learning_rate": 2.3188e-08, "loss": 0.446, "step": 244300 }, { "epoch": 2.4435000000000002, "grad_norm": 46.814674377441406, "learning_rate": 2.2988e-08, "loss": 0.4448, "step": 244350 }, { "epoch": 2.444, "grad_norm": 42.03607940673828, "learning_rate": 2.2788e-08, "loss": 0.4175, "step": 244400 }, { "epoch": 2.4445, "grad_norm": 4.896573543548584, "learning_rate": 2.2588e-08, "loss": 0.3549, "step": 244450 }, { "epoch": 2.445, "grad_norm": 66.42455291748047, "learning_rate": 2.2388000000000002e-08, "loss": 0.5136, "step": 244500 }, { "epoch": 2.4455, "grad_norm": 9.06808090209961, "learning_rate": 2.2188e-08, "loss": 0.4, "step": 244550 }, { "epoch": 2.446, "grad_norm": 32.235755920410156, "learning_rate": 2.1987999999999998e-08, "loss": 0.3744, "step": 244600 }, { "epoch": 2.4465, "grad_norm": 87.67384338378906, "learning_rate": 2.1787999999999996e-08, "loss": 0.4273, "step": 244650 }, { "epoch": 2.447, "grad_norm": 72.4257583618164, "learning_rate": 2.1587999999999998e-08, "loss": 0.5571, "step": 244700 }, { "epoch": 2.4475, "grad_norm": 0.4879385530948639, "learning_rate": 2.1388e-08, "loss": 0.5196, "step": 244750 }, { "epoch": 2.448, "grad_norm": 80.70654296875, "learning_rate": 2.1187999999999998e-08, "loss": 0.5049, "step": 244800 }, { "epoch": 2.4485, "grad_norm": 38.23407745361328, "learning_rate": 2.0988e-08, "loss": 0.3874, "step": 244850 }, { "epoch": 2.449, "grad_norm": 31.929292678833008, "learning_rate": 2.0788e-08, "loss": 0.433, "step": 244900 }, { "epoch": 2.4495, "grad_norm": 21.76283836364746, "learning_rate": 2.0588e-08, "loss": 0.41, "step": 244950 }, { "epoch": 2.45, "grad_norm": 8.751602172851562, "learning_rate": 2.0391999999999998e-08, "loss": 0.3788, "step": 245000 }, { "epoch": 2.4505, "grad_norm": 10.776887893676758, "learning_rate": 2.0192e-08, "loss": 0.4291, "step": 245050 }, { "epoch": 2.451, "grad_norm": 0.36848610639572144, "learning_rate": 1.9991999999999998e-08, "loss": 0.4365, "step": 245100 }, { "epoch": 2.4515000000000002, "grad_norm": 23.090919494628906, "learning_rate": 1.9792e-08, "loss": 0.406, "step": 245150 }, { "epoch": 2.452, "grad_norm": 25.339353561401367, "learning_rate": 1.9591999999999998e-08, "loss": 0.4708, "step": 245200 }, { "epoch": 2.4525, "grad_norm": 52.52699661254883, "learning_rate": 1.9392e-08, "loss": 0.4524, "step": 245250 }, { "epoch": 2.453, "grad_norm": 1.461986780166626, "learning_rate": 1.9192e-08, "loss": 0.4664, "step": 245300 }, { "epoch": 2.4535, "grad_norm": 95.0424575805664, "learning_rate": 1.8992e-08, "loss": 0.3294, "step": 245350 }, { "epoch": 2.454, "grad_norm": 91.5193862915039, "learning_rate": 1.8792e-08, "loss": 0.3741, "step": 245400 }, { "epoch": 2.4545, "grad_norm": 95.59048461914062, "learning_rate": 1.8592e-08, "loss": 0.5858, "step": 245450 }, { "epoch": 2.455, "grad_norm": 1.0647406578063965, "learning_rate": 1.8391999999999997e-08, "loss": 0.3868, "step": 245500 }, { "epoch": 2.4555, "grad_norm": 36.12507629394531, "learning_rate": 1.8192e-08, "loss": 0.269, "step": 245550 }, { "epoch": 2.456, "grad_norm": 31.336393356323242, "learning_rate": 1.7992e-08, "loss": 0.388, "step": 245600 }, { "epoch": 2.4565, "grad_norm": 103.9297866821289, "learning_rate": 1.7791999999999998e-08, "loss": 0.4473, "step": 245650 }, { "epoch": 2.457, "grad_norm": 63.19395446777344, "learning_rate": 1.7592e-08, "loss": 0.4356, "step": 245700 }, { "epoch": 2.4575, "grad_norm": 72.82255554199219, "learning_rate": 1.7392e-08, "loss": 0.434, "step": 245750 }, { "epoch": 2.458, "grad_norm": 25.266742706298828, "learning_rate": 1.7192e-08, "loss": 0.4078, "step": 245800 }, { "epoch": 2.4585, "grad_norm": 26.895408630371094, "learning_rate": 1.6992e-08, "loss": 0.5463, "step": 245850 }, { "epoch": 2.459, "grad_norm": 18.793058395385742, "learning_rate": 1.6792e-08, "loss": 0.4784, "step": 245900 }, { "epoch": 2.4595000000000002, "grad_norm": 1.1856598854064941, "learning_rate": 1.6591999999999997e-08, "loss": 0.3484, "step": 245950 }, { "epoch": 2.46, "grad_norm": 1.1075769662857056, "learning_rate": 1.6392e-08, "loss": 0.4504, "step": 246000 }, { "epoch": 2.4605, "grad_norm": 47.581871032714844, "learning_rate": 1.6192e-08, "loss": 0.4725, "step": 246050 }, { "epoch": 2.461, "grad_norm": 132.03213500976562, "learning_rate": 1.5992e-08, "loss": 0.46, "step": 246100 }, { "epoch": 2.4615, "grad_norm": 129.3053741455078, "learning_rate": 1.5792e-08, "loss": 0.4021, "step": 246150 }, { "epoch": 2.462, "grad_norm": 94.52012634277344, "learning_rate": 1.5591999999999998e-08, "loss": 0.5046, "step": 246200 }, { "epoch": 2.4625, "grad_norm": 2.470003128051758, "learning_rate": 1.5392e-08, "loss": 0.4378, "step": 246250 }, { "epoch": 2.463, "grad_norm": 65.46109008789062, "learning_rate": 1.5192e-08, "loss": 0.3905, "step": 246300 }, { "epoch": 2.4635, "grad_norm": 29.295551300048828, "learning_rate": 1.4992e-08, "loss": 0.4354, "step": 246350 }, { "epoch": 2.464, "grad_norm": 46.023048400878906, "learning_rate": 1.4792e-08, "loss": 0.3686, "step": 246400 }, { "epoch": 2.4645, "grad_norm": 2.7170820236206055, "learning_rate": 1.4592000000000001e-08, "loss": 0.4175, "step": 246450 }, { "epoch": 2.465, "grad_norm": 46.63579559326172, "learning_rate": 1.4391999999999999e-08, "loss": 0.476, "step": 246500 }, { "epoch": 2.4655, "grad_norm": 31.72504997253418, "learning_rate": 1.4191999999999999e-08, "loss": 0.4872, "step": 246550 }, { "epoch": 2.466, "grad_norm": 2.155219793319702, "learning_rate": 1.3991999999999999e-08, "loss": 0.3661, "step": 246600 }, { "epoch": 2.4665, "grad_norm": 100.10980987548828, "learning_rate": 1.3792e-08, "loss": 0.4025, "step": 246650 }, { "epoch": 2.467, "grad_norm": 40.943275451660156, "learning_rate": 1.3591999999999999e-08, "loss": 0.4854, "step": 246700 }, { "epoch": 2.4675000000000002, "grad_norm": 68.8076400756836, "learning_rate": 1.3391999999999998e-08, "loss": 0.3888, "step": 246750 }, { "epoch": 2.468, "grad_norm": 53.90141296386719, "learning_rate": 1.3192e-08, "loss": 0.3383, "step": 246800 }, { "epoch": 2.4685, "grad_norm": 65.13260650634766, "learning_rate": 1.2992e-08, "loss": 0.4943, "step": 246850 }, { "epoch": 2.469, "grad_norm": 9.727241516113281, "learning_rate": 1.2792e-08, "loss": 0.5687, "step": 246900 }, { "epoch": 2.4695, "grad_norm": 45.80133056640625, "learning_rate": 1.2592e-08, "loss": 0.2708, "step": 246950 }, { "epoch": 2.4699999999999998, "grad_norm": 95.85955810546875, "learning_rate": 1.2392e-08, "loss": 0.4595, "step": 247000 }, { "epoch": 2.4705, "grad_norm": 1.5538358688354492, "learning_rate": 1.2192e-08, "loss": 0.2399, "step": 247050 }, { "epoch": 2.471, "grad_norm": 87.2671890258789, "learning_rate": 1.1991999999999999e-08, "loss": 0.4663, "step": 247100 }, { "epoch": 2.4715, "grad_norm": 4.15252161026001, "learning_rate": 1.1792e-08, "loss": 0.4194, "step": 247150 }, { "epoch": 2.472, "grad_norm": 51.74559783935547, "learning_rate": 1.1591999999999999e-08, "loss": 0.3372, "step": 247200 }, { "epoch": 2.4725, "grad_norm": 66.37777709960938, "learning_rate": 1.1391999999999999e-08, "loss": 0.3389, "step": 247250 }, { "epoch": 2.473, "grad_norm": 105.33214569091797, "learning_rate": 1.1195999999999998e-08, "loss": 0.4773, "step": 247300 }, { "epoch": 2.4735, "grad_norm": 44.509883880615234, "learning_rate": 1.0996e-08, "loss": 0.3653, "step": 247350 }, { "epoch": 2.474, "grad_norm": 85.23392486572266, "learning_rate": 1.0796e-08, "loss": 0.3496, "step": 247400 }, { "epoch": 2.4745, "grad_norm": 91.08809661865234, "learning_rate": 1.0596e-08, "loss": 0.451, "step": 247450 }, { "epoch": 2.475, "grad_norm": 51.84107208251953, "learning_rate": 1.0396000000000001e-08, "loss": 0.4562, "step": 247500 }, { "epoch": 2.4755, "grad_norm": 89.71752166748047, "learning_rate": 1.0195999999999999e-08, "loss": 0.3289, "step": 247550 }, { "epoch": 2.476, "grad_norm": 30.12227439880371, "learning_rate": 9.995999999999999e-09, "loss": 0.3847, "step": 247600 }, { "epoch": 2.4765, "grad_norm": 138.02243041992188, "learning_rate": 9.795999999999999e-09, "loss": 0.7013, "step": 247650 }, { "epoch": 2.477, "grad_norm": 9.041874885559082, "learning_rate": 9.596e-09, "loss": 0.5115, "step": 247700 }, { "epoch": 2.4775, "grad_norm": 2.327457904815674, "learning_rate": 9.396e-09, "loss": 0.3211, "step": 247750 }, { "epoch": 2.4779999999999998, "grad_norm": 6.0508317947387695, "learning_rate": 9.195999999999998e-09, "loss": 0.5396, "step": 247800 }, { "epoch": 2.4785, "grad_norm": 26.49738883972168, "learning_rate": 8.996e-09, "loss": 0.2972, "step": 247850 }, { "epoch": 2.479, "grad_norm": 70.40245819091797, "learning_rate": 8.796e-09, "loss": 0.526, "step": 247900 }, { "epoch": 2.4795, "grad_norm": 70.888916015625, "learning_rate": 8.596e-09, "loss": 0.4678, "step": 247950 }, { "epoch": 2.48, "grad_norm": 2.654665231704712, "learning_rate": 8.396e-09, "loss": 0.5367, "step": 248000 }, { "epoch": 2.4805, "grad_norm": 16.878707885742188, "learning_rate": 8.196e-09, "loss": 0.2801, "step": 248050 }, { "epoch": 2.481, "grad_norm": 47.633583068847656, "learning_rate": 7.996e-09, "loss": 0.3872, "step": 248100 }, { "epoch": 2.4815, "grad_norm": 77.5364990234375, "learning_rate": 7.795999999999999e-09, "loss": 0.4372, "step": 248150 }, { "epoch": 2.482, "grad_norm": 57.8331184387207, "learning_rate": 7.596e-09, "loss": 0.5912, "step": 248200 }, { "epoch": 2.4825, "grad_norm": 64.51912689208984, "learning_rate": 7.396e-09, "loss": 0.4503, "step": 248250 }, { "epoch": 2.483, "grad_norm": 10.555734634399414, "learning_rate": 7.1959999999999996e-09, "loss": 0.4095, "step": 248300 }, { "epoch": 2.4835, "grad_norm": 58.91648483276367, "learning_rate": 6.9959999999999994e-09, "loss": 0.4893, "step": 248350 }, { "epoch": 2.484, "grad_norm": 81.10791778564453, "learning_rate": 6.795999999999999e-09, "loss": 0.3603, "step": 248400 }, { "epoch": 2.4845, "grad_norm": 69.9582748413086, "learning_rate": 6.596e-09, "loss": 0.3966, "step": 248450 }, { "epoch": 2.485, "grad_norm": 2.605888843536377, "learning_rate": 6.396e-09, "loss": 0.4878, "step": 248500 }, { "epoch": 2.4855, "grad_norm": 92.16374206542969, "learning_rate": 6.196e-09, "loss": 0.4327, "step": 248550 }, { "epoch": 2.4859999999999998, "grad_norm": 90.9197998046875, "learning_rate": 5.9959999999999996e-09, "loss": 0.3959, "step": 248600 }, { "epoch": 2.4865, "grad_norm": 76.15708923339844, "learning_rate": 5.7959999999999994e-09, "loss": 0.3849, "step": 248650 }, { "epoch": 2.487, "grad_norm": 32.75033187866211, "learning_rate": 5.596e-09, "loss": 0.4641, "step": 248700 }, { "epoch": 2.4875, "grad_norm": 103.26265716552734, "learning_rate": 5.395999999999999e-09, "loss": 0.3382, "step": 248750 }, { "epoch": 2.488, "grad_norm": 68.11605072021484, "learning_rate": 5.196e-09, "loss": 0.3231, "step": 248800 }, { "epoch": 2.4885, "grad_norm": 4.257381439208984, "learning_rate": 4.996e-09, "loss": 0.4907, "step": 248850 }, { "epoch": 2.489, "grad_norm": 33.47376251220703, "learning_rate": 4.7959999999999996e-09, "loss": 0.3289, "step": 248900 }, { "epoch": 2.4895, "grad_norm": 49.842369079589844, "learning_rate": 4.596e-09, "loss": 0.234, "step": 248950 }, { "epoch": 2.49, "grad_norm": 3.662029504776001, "learning_rate": 4.395999999999999e-09, "loss": 0.3642, "step": 249000 }, { "epoch": 2.4905, "grad_norm": 112.91320037841797, "learning_rate": 4.196e-09, "loss": 0.437, "step": 249050 }, { "epoch": 2.491, "grad_norm": 2.2020812034606934, "learning_rate": 3.996e-09, "loss": 0.4311, "step": 249100 }, { "epoch": 2.4915, "grad_norm": 20.80237579345703, "learning_rate": 3.796e-09, "loss": 0.4193, "step": 249150 }, { "epoch": 2.492, "grad_norm": 47.812049865722656, "learning_rate": 3.5959999999999996e-09, "loss": 0.2837, "step": 249200 }, { "epoch": 2.4925, "grad_norm": 14.877352714538574, "learning_rate": 3.396e-09, "loss": 0.4079, "step": 249250 }, { "epoch": 2.493, "grad_norm": 31.925203323364258, "learning_rate": 3.1959999999999997e-09, "loss": 0.4154, "step": 249300 }, { "epoch": 2.4935, "grad_norm": 7.781558513641357, "learning_rate": 2.996e-09, "loss": 0.3361, "step": 249350 }, { "epoch": 2.4939999999999998, "grad_norm": 0.3294135630130768, "learning_rate": 2.796e-09, "loss": 0.4179, "step": 249400 }, { "epoch": 2.4945, "grad_norm": 64.53218078613281, "learning_rate": 2.5959999999999997e-09, "loss": 0.4882, "step": 249450 }, { "epoch": 2.495, "grad_norm": 44.542545318603516, "learning_rate": 2.396e-09, "loss": 0.3351, "step": 249500 }, { "epoch": 2.4955, "grad_norm": 18.63250160217285, "learning_rate": 2.196e-09, "loss": 0.4181, "step": 249550 }, { "epoch": 2.496, "grad_norm": 42.67049789428711, "learning_rate": 1.9959999999999997e-09, "loss": 0.4524, "step": 249600 }, { "epoch": 2.4965, "grad_norm": 4.257511615753174, "learning_rate": 1.796e-09, "loss": 0.3714, "step": 249650 }, { "epoch": 2.497, "grad_norm": 38.738487243652344, "learning_rate": 1.5959999999999999e-09, "loss": 0.3591, "step": 249700 }, { "epoch": 2.4975, "grad_norm": 50.39963150024414, "learning_rate": 1.3960000000000001e-09, "loss": 0.3474, "step": 249750 }, { "epoch": 2.498, "grad_norm": 69.15322875976562, "learning_rate": 1.196e-09, "loss": 0.5561, "step": 249800 }, { "epoch": 2.4985, "grad_norm": 7.292196273803711, "learning_rate": 9.959999999999999e-10, "loss": 0.5342, "step": 249850 }, { "epoch": 2.499, "grad_norm": 1.3633363246917725, "learning_rate": 7.96e-10, "loss": 0.4622, "step": 249900 }, { "epoch": 2.4995, "grad_norm": 82.1470947265625, "learning_rate": 5.959999999999999e-10, "loss": 0.5702, "step": 249950 }, { "epoch": 2.5, "grad_norm": 84.26116943359375, "learning_rate": 3.96e-10, "loss": 0.4723, "step": 250000 } ], "logging_steps": 50, "max_steps": 250000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }