diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,35033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.5, + "eval_steps": 500, + "global_step": 250000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005, + "grad_norm": 55.07521057128906, + "learning_rate": 9.99816e-07, + "loss": 1.4625, + "step": 50 + }, + { + "epoch": 0.001, + "grad_norm": 96.95206451416016, + "learning_rate": 9.99616e-07, + "loss": 1.4538, + "step": 100 + }, + { + "epoch": 0.0015, + "grad_norm": 49.72062683105469, + "learning_rate": 9.99416e-07, + "loss": 1.3503, + "step": 150 + }, + { + "epoch": 0.002, + "grad_norm": 62.22916793823242, + "learning_rate": 9.9922e-07, + "loss": 1.3127, + "step": 200 + }, + { + "epoch": 0.0025, + "grad_norm": 38.854434967041016, + "learning_rate": 9.99024e-07, + "loss": 1.261, + "step": 250 + }, + { + "epoch": 0.003, + "grad_norm": 49.72938919067383, + "learning_rate": 9.98824e-07, + "loss": 1.1819, + "step": 300 + }, + { + "epoch": 0.0035, + "grad_norm": 68.74335479736328, + "learning_rate": 9.986239999999999e-07, + "loss": 1.1701, + "step": 350 + }, + { + "epoch": 0.004, + "grad_norm": 51.449588775634766, + "learning_rate": 9.984239999999999e-07, + "loss": 1.1705, + "step": 400 + }, + { + "epoch": 0.0045, + "grad_norm": 42.585655212402344, + "learning_rate": 9.98224e-07, + "loss": 1.0597, + "step": 450 + }, + { + "epoch": 0.005, + "grad_norm": 62.51144027709961, + "learning_rate": 9.98024e-07, + "loss": 1.0494, + "step": 500 + }, + { + "epoch": 0.0055, + "grad_norm": 40.80481719970703, + "learning_rate": 9.97824e-07, + "loss": 1.027, + "step": 550 + }, + { + "epoch": 0.006, + "grad_norm": 63.342613220214844, + "learning_rate": 9.97628e-07, + "loss": 1.048, + "step": 600 + }, + { + "epoch": 0.0065, + "grad_norm": 38.498329162597656, + "learning_rate": 9.97428e-07, + "loss": 0.9951, + "step": 650 + }, + { + "epoch": 0.007, + "grad_norm": 25.70156478881836, + "learning_rate": 9.972279999999999e-07, + "loss": 0.9348, + "step": 700 + }, + { + "epoch": 0.0075, + "grad_norm": 54.72833251953125, + "learning_rate": 9.97028e-07, + "loss": 0.9405, + "step": 750 + }, + { + "epoch": 0.008, + "grad_norm": 51.39662170410156, + "learning_rate": 9.96828e-07, + "loss": 0.9214, + "step": 800 + }, + { + "epoch": 0.0085, + "grad_norm": 57.055030822753906, + "learning_rate": 9.966279999999998e-07, + "loss": 0.9247, + "step": 850 + }, + { + "epoch": 0.009, + "grad_norm": 48.74729919433594, + "learning_rate": 9.96428e-07, + "loss": 0.9254, + "step": 900 + }, + { + "epoch": 0.0095, + "grad_norm": 69.1877212524414, + "learning_rate": 9.96228e-07, + "loss": 0.7783, + "step": 950 + }, + { + "epoch": 0.01, + "grad_norm": 58.19287872314453, + "learning_rate": 9.96028e-07, + "loss": 0.9523, + "step": 1000 + }, + { + "epoch": 0.0105, + "grad_norm": 90.27144622802734, + "learning_rate": 9.95828e-07, + "loss": 0.8884, + "step": 1050 + }, + { + "epoch": 0.011, + "grad_norm": 47.847496032714844, + "learning_rate": 9.95628e-07, + "loss": 0.765, + "step": 1100 + }, + { + "epoch": 0.0115, + "grad_norm": 60.23532485961914, + "learning_rate": 9.95428e-07, + "loss": 0.9025, + "step": 1150 + }, + { + "epoch": 0.012, + "grad_norm": 65.98821258544922, + "learning_rate": 9.95228e-07, + "loss": 0.8743, + "step": 1200 + }, + { + "epoch": 0.0125, + "grad_norm": 41.296512603759766, + "learning_rate": 9.95028e-07, + "loss": 0.7958, + "step": 1250 + }, + { + "epoch": 0.013, + "grad_norm": 10.350058555603027, + "learning_rate": 9.94828e-07, + "loss": 0.7748, + "step": 1300 + }, + { + "epoch": 0.0135, + "grad_norm": 100.7204818725586, + "learning_rate": 9.94628e-07, + "loss": 0.7265, + "step": 1350 + }, + { + "epoch": 0.014, + "grad_norm": 33.72673797607422, + "learning_rate": 9.944279999999999e-07, + "loss": 0.8833, + "step": 1400 + }, + { + "epoch": 0.0145, + "grad_norm": 51.3131217956543, + "learning_rate": 9.942279999999999e-07, + "loss": 0.6357, + "step": 1450 + }, + { + "epoch": 0.015, + "grad_norm": 62.02531814575195, + "learning_rate": 9.94028e-07, + "loss": 0.8208, + "step": 1500 + }, + { + "epoch": 0.0155, + "grad_norm": 79.17247772216797, + "learning_rate": 9.93828e-07, + "loss": 0.7981, + "step": 1550 + }, + { + "epoch": 0.016, + "grad_norm": 45.16271209716797, + "learning_rate": 9.936279999999998e-07, + "loss": 0.9424, + "step": 1600 + }, + { + "epoch": 0.0165, + "grad_norm": 25.51656723022461, + "learning_rate": 9.93428e-07, + "loss": 0.7381, + "step": 1650 + }, + { + "epoch": 0.017, + "grad_norm": 67.2197265625, + "learning_rate": 9.93228e-07, + "loss": 0.8223, + "step": 1700 + }, + { + "epoch": 0.0175, + "grad_norm": 68.64763641357422, + "learning_rate": 9.93028e-07, + "loss": 0.8728, + "step": 1750 + }, + { + "epoch": 0.018, + "grad_norm": 73.73970031738281, + "learning_rate": 9.92828e-07, + "loss": 0.8765, + "step": 1800 + }, + { + "epoch": 0.0185, + "grad_norm": 91.64149475097656, + "learning_rate": 9.92628e-07, + "loss": 0.7413, + "step": 1850 + }, + { + "epoch": 0.019, + "grad_norm": 65.28728485107422, + "learning_rate": 9.92428e-07, + "loss": 0.8043, + "step": 1900 + }, + { + "epoch": 0.0195, + "grad_norm": 67.93745422363281, + "learning_rate": 9.92228e-07, + "loss": 0.7773, + "step": 1950 + }, + { + "epoch": 0.02, + "grad_norm": 59.84891128540039, + "learning_rate": 9.92028e-07, + "loss": 0.6151, + "step": 2000 + }, + { + "epoch": 0.0205, + "grad_norm": 25.66044044494629, + "learning_rate": 9.91828e-07, + "loss": 0.8568, + "step": 2050 + }, + { + "epoch": 0.021, + "grad_norm": 88.2499008178711, + "learning_rate": 9.916279999999999e-07, + "loss": 0.8048, + "step": 2100 + }, + { + "epoch": 0.0215, + "grad_norm": 80.02753448486328, + "learning_rate": 9.914279999999999e-07, + "loss": 0.8383, + "step": 2150 + }, + { + "epoch": 0.022, + "grad_norm": 36.008663177490234, + "learning_rate": 9.912279999999999e-07, + "loss": 0.7106, + "step": 2200 + }, + { + "epoch": 0.0225, + "grad_norm": 19.52667999267578, + "learning_rate": 9.91028e-07, + "loss": 0.5627, + "step": 2250 + }, + { + "epoch": 0.023, + "grad_norm": 44.50703048706055, + "learning_rate": 9.90828e-07, + "loss": 0.8129, + "step": 2300 + }, + { + "epoch": 0.0235, + "grad_norm": 31.829010009765625, + "learning_rate": 9.906279999999998e-07, + "loss": 0.5897, + "step": 2350 + }, + { + "epoch": 0.024, + "grad_norm": 80.26560974121094, + "learning_rate": 9.90428e-07, + "loss": 0.7319, + "step": 2400 + }, + { + "epoch": 0.0245, + "grad_norm": 46.38948440551758, + "learning_rate": 9.90228e-07, + "loss": 0.8328, + "step": 2450 + }, + { + "epoch": 0.025, + "grad_norm": 36.038612365722656, + "learning_rate": 9.900319999999999e-07, + "loss": 0.8958, + "step": 2500 + }, + { + "epoch": 0.0255, + "grad_norm": 36.27665328979492, + "learning_rate": 9.89832e-07, + "loss": 0.7135, + "step": 2550 + }, + { + "epoch": 0.026, + "grad_norm": 76.19625854492188, + "learning_rate": 9.896319999999998e-07, + "loss": 0.9015, + "step": 2600 + }, + { + "epoch": 0.0265, + "grad_norm": 54.22828674316406, + "learning_rate": 9.894319999999998e-07, + "loss": 0.7887, + "step": 2650 + }, + { + "epoch": 0.027, + "grad_norm": 77.00263214111328, + "learning_rate": 9.89232e-07, + "loss": 0.6704, + "step": 2700 + }, + { + "epoch": 0.0275, + "grad_norm": 65.53499603271484, + "learning_rate": 9.89032e-07, + "loss": 0.6049, + "step": 2750 + }, + { + "epoch": 0.028, + "grad_norm": 97.3672103881836, + "learning_rate": 9.88832e-07, + "loss": 0.7043, + "step": 2800 + }, + { + "epoch": 0.0285, + "grad_norm": 80.95391082763672, + "learning_rate": 9.88632e-07, + "loss": 0.7861, + "step": 2850 + }, + { + "epoch": 0.029, + "grad_norm": 39.608619689941406, + "learning_rate": 9.88432e-07, + "loss": 0.7095, + "step": 2900 + }, + { + "epoch": 0.0295, + "grad_norm": 71.16398620605469, + "learning_rate": 9.88232e-07, + "loss": 0.5963, + "step": 2950 + }, + { + "epoch": 0.03, + "grad_norm": 66.53075408935547, + "learning_rate": 9.88032e-07, + "loss": 0.7837, + "step": 3000 + }, + { + "epoch": 0.0305, + "grad_norm": 77.58058166503906, + "learning_rate": 9.87832e-07, + "loss": 0.627, + "step": 3050 + }, + { + "epoch": 0.031, + "grad_norm": 60.44248962402344, + "learning_rate": 9.87632e-07, + "loss": 0.7445, + "step": 3100 + }, + { + "epoch": 0.0315, + "grad_norm": 69.0057144165039, + "learning_rate": 9.874319999999999e-07, + "loss": 0.9611, + "step": 3150 + }, + { + "epoch": 0.032, + "grad_norm": 103.23316192626953, + "learning_rate": 9.872319999999999e-07, + "loss": 0.6921, + "step": 3200 + }, + { + "epoch": 0.0325, + "grad_norm": 63.068687438964844, + "learning_rate": 9.87032e-07, + "loss": 0.7081, + "step": 3250 + }, + { + "epoch": 0.033, + "grad_norm": 61.475379943847656, + "learning_rate": 9.86832e-07, + "loss": 0.8122, + "step": 3300 + }, + { + "epoch": 0.0335, + "grad_norm": 65.56523895263672, + "learning_rate": 9.866319999999998e-07, + "loss": 0.7693, + "step": 3350 + }, + { + "epoch": 0.034, + "grad_norm": 62.29042434692383, + "learning_rate": 9.86432e-07, + "loss": 0.8332, + "step": 3400 + }, + { + "epoch": 0.0345, + "grad_norm": 66.55669403076172, + "learning_rate": 9.86232e-07, + "loss": 0.7765, + "step": 3450 + }, + { + "epoch": 0.035, + "grad_norm": 41.560089111328125, + "learning_rate": 9.86032e-07, + "loss": 0.8187, + "step": 3500 + }, + { + "epoch": 0.0355, + "grad_norm": 59.37902069091797, + "learning_rate": 9.85832e-07, + "loss": 0.6688, + "step": 3550 + }, + { + "epoch": 0.036, + "grad_norm": 24.928035736083984, + "learning_rate": 9.85632e-07, + "loss": 0.6727, + "step": 3600 + }, + { + "epoch": 0.0365, + "grad_norm": 10.977078437805176, + "learning_rate": 9.85432e-07, + "loss": 0.7937, + "step": 3650 + }, + { + "epoch": 0.037, + "grad_norm": 52.790130615234375, + "learning_rate": 9.85232e-07, + "loss": 0.9761, + "step": 3700 + }, + { + "epoch": 0.0375, + "grad_norm": 10.147725105285645, + "learning_rate": 9.85032e-07, + "loss": 0.703, + "step": 3750 + }, + { + "epoch": 0.038, + "grad_norm": 17.01890754699707, + "learning_rate": 9.84832e-07, + "loss": 0.7169, + "step": 3800 + }, + { + "epoch": 0.0385, + "grad_norm": 9.722551345825195, + "learning_rate": 9.846319999999999e-07, + "loss": 0.802, + "step": 3850 + }, + { + "epoch": 0.039, + "grad_norm": 39.80437088012695, + "learning_rate": 9.844319999999999e-07, + "loss": 0.7751, + "step": 3900 + }, + { + "epoch": 0.0395, + "grad_norm": 43.26958465576172, + "learning_rate": 9.842319999999999e-07, + "loss": 0.7675, + "step": 3950 + }, + { + "epoch": 0.04, + "grad_norm": 53.78465270996094, + "learning_rate": 9.84032e-07, + "loss": 0.683, + "step": 4000 + }, + { + "epoch": 0.0405, + "grad_norm": 7.1808319091796875, + "learning_rate": 9.83832e-07, + "loss": 0.7682, + "step": 4050 + }, + { + "epoch": 0.041, + "grad_norm": 75.9840316772461, + "learning_rate": 9.836319999999998e-07, + "loss": 0.6552, + "step": 4100 + }, + { + "epoch": 0.0415, + "grad_norm": 22.792072296142578, + "learning_rate": 9.83432e-07, + "loss": 0.7268, + "step": 4150 + }, + { + "epoch": 0.042, + "grad_norm": 29.64811897277832, + "learning_rate": 9.83232e-07, + "loss": 0.6025, + "step": 4200 + }, + { + "epoch": 0.0425, + "grad_norm": 93.7188720703125, + "learning_rate": 9.83032e-07, + "loss": 0.7603, + "step": 4250 + }, + { + "epoch": 0.043, + "grad_norm": 81.36428833007812, + "learning_rate": 9.82832e-07, + "loss": 0.7969, + "step": 4300 + }, + { + "epoch": 0.0435, + "grad_norm": 72.37635040283203, + "learning_rate": 9.82632e-07, + "loss": 0.6322, + "step": 4350 + }, + { + "epoch": 0.044, + "grad_norm": 58.41753387451172, + "learning_rate": 9.82432e-07, + "loss": 0.6965, + "step": 4400 + }, + { + "epoch": 0.0445, + "grad_norm": 26.843284606933594, + "learning_rate": 9.82232e-07, + "loss": 0.622, + "step": 4450 + }, + { + "epoch": 0.045, + "grad_norm": 80.62271881103516, + "learning_rate": 9.82032e-07, + "loss": 0.7182, + "step": 4500 + }, + { + "epoch": 0.0455, + "grad_norm": 62.724483489990234, + "learning_rate": 9.81832e-07, + "loss": 0.6782, + "step": 4550 + }, + { + "epoch": 0.046, + "grad_norm": 39.624332427978516, + "learning_rate": 9.816319999999999e-07, + "loss": 0.5628, + "step": 4600 + }, + { + "epoch": 0.0465, + "grad_norm": 30.11197853088379, + "learning_rate": 9.814319999999999e-07, + "loss": 0.6594, + "step": 4650 + }, + { + "epoch": 0.047, + "grad_norm": 47.84878158569336, + "learning_rate": 9.812319999999998e-07, + "loss": 0.5869, + "step": 4700 + }, + { + "epoch": 0.0475, + "grad_norm": 31.320173263549805, + "learning_rate": 9.81032e-07, + "loss": 0.7272, + "step": 4750 + }, + { + "epoch": 0.048, + "grad_norm": 57.74075698852539, + "learning_rate": 9.80832e-07, + "loss": 0.7878, + "step": 4800 + }, + { + "epoch": 0.0485, + "grad_norm": 66.6219711303711, + "learning_rate": 9.806319999999998e-07, + "loss": 0.7573, + "step": 4850 + }, + { + "epoch": 0.049, + "grad_norm": 28.821975708007812, + "learning_rate": 9.80432e-07, + "loss": 0.7238, + "step": 4900 + }, + { + "epoch": 0.0495, + "grad_norm": 70.03323364257812, + "learning_rate": 9.80232e-07, + "loss": 0.7167, + "step": 4950 + }, + { + "epoch": 0.05, + "grad_norm": 42.992374420166016, + "learning_rate": 9.80032e-07, + "loss": 0.6073, + "step": 5000 + }, + { + "epoch": 0.0505, + "grad_norm": 80.54195404052734, + "learning_rate": 9.79832e-07, + "loss": 0.7736, + "step": 5050 + }, + { + "epoch": 0.051, + "grad_norm": 58.23442459106445, + "learning_rate": 9.79632e-07, + "loss": 0.7848, + "step": 5100 + }, + { + "epoch": 0.0515, + "grad_norm": 51.8128662109375, + "learning_rate": 9.79432e-07, + "loss": 0.7043, + "step": 5150 + }, + { + "epoch": 0.052, + "grad_norm": 68.98834228515625, + "learning_rate": 9.79232e-07, + "loss": 0.7293, + "step": 5200 + }, + { + "epoch": 0.0525, + "grad_norm": 75.1903305053711, + "learning_rate": 9.79032e-07, + "loss": 0.7225, + "step": 5250 + }, + { + "epoch": 0.053, + "grad_norm": 46.63593292236328, + "learning_rate": 9.78832e-07, + "loss": 0.7507, + "step": 5300 + }, + { + "epoch": 0.0535, + "grad_norm": 54.36689376831055, + "learning_rate": 9.786319999999999e-07, + "loss": 0.8457, + "step": 5350 + }, + { + "epoch": 0.054, + "grad_norm": 20.45354461669922, + "learning_rate": 9.784319999999999e-07, + "loss": 0.621, + "step": 5400 + }, + { + "epoch": 0.0545, + "grad_norm": 49.178565979003906, + "learning_rate": 9.78232e-07, + "loss": 0.8933, + "step": 5450 + }, + { + "epoch": 0.055, + "grad_norm": 1.9964510202407837, + "learning_rate": 9.78032e-07, + "loss": 0.5691, + "step": 5500 + }, + { + "epoch": 0.0555, + "grad_norm": 74.9593734741211, + "learning_rate": 9.77832e-07, + "loss": 0.614, + "step": 5550 + }, + { + "epoch": 0.056, + "grad_norm": 62.81483459472656, + "learning_rate": 9.77632e-07, + "loss": 0.8412, + "step": 5600 + }, + { + "epoch": 0.0565, + "grad_norm": 0.782301664352417, + "learning_rate": 9.77432e-07, + "loss": 0.6128, + "step": 5650 + }, + { + "epoch": 0.057, + "grad_norm": 62.49407196044922, + "learning_rate": 9.77232e-07, + "loss": 0.6208, + "step": 5700 + }, + { + "epoch": 0.0575, + "grad_norm": 39.935970306396484, + "learning_rate": 9.77032e-07, + "loss": 0.5399, + "step": 5750 + }, + { + "epoch": 0.058, + "grad_norm": 25.412355422973633, + "learning_rate": 9.76832e-07, + "loss": 0.6784, + "step": 5800 + }, + { + "epoch": 0.0585, + "grad_norm": 64.27447509765625, + "learning_rate": 9.76632e-07, + "loss": 0.7829, + "step": 5850 + }, + { + "epoch": 0.059, + "grad_norm": 85.29767608642578, + "learning_rate": 9.76432e-07, + "loss": 0.649, + "step": 5900 + }, + { + "epoch": 0.0595, + "grad_norm": 36.06076431274414, + "learning_rate": 9.76232e-07, + "loss": 0.7838, + "step": 5950 + }, + { + "epoch": 0.06, + "grad_norm": 84.43746948242188, + "learning_rate": 9.760319999999999e-07, + "loss": 0.6822, + "step": 6000 + }, + { + "epoch": 0.0605, + "grad_norm": 74.78901672363281, + "learning_rate": 9.75832e-07, + "loss": 0.6003, + "step": 6050 + }, + { + "epoch": 0.061, + "grad_norm": 70.3706283569336, + "learning_rate": 9.75632e-07, + "loss": 0.6829, + "step": 6100 + }, + { + "epoch": 0.0615, + "grad_norm": 47.27094650268555, + "learning_rate": 9.754319999999998e-07, + "loss": 0.6415, + "step": 6150 + }, + { + "epoch": 0.062, + "grad_norm": 19.190216064453125, + "learning_rate": 9.75232e-07, + "loss": 0.5789, + "step": 6200 + }, + { + "epoch": 0.0625, + "grad_norm": 82.82536315917969, + "learning_rate": 9.75032e-07, + "loss": 0.6222, + "step": 6250 + }, + { + "epoch": 0.063, + "grad_norm": 72.95569610595703, + "learning_rate": 9.74832e-07, + "loss": 0.6288, + "step": 6300 + }, + { + "epoch": 0.0635, + "grad_norm": 19.769786834716797, + "learning_rate": 9.74632e-07, + "loss": 0.7223, + "step": 6350 + }, + { + "epoch": 0.064, + "grad_norm": 22.352313995361328, + "learning_rate": 9.74432e-07, + "loss": 0.6113, + "step": 6400 + }, + { + "epoch": 0.0645, + "grad_norm": 42.68896484375, + "learning_rate": 9.74232e-07, + "loss": 0.5704, + "step": 6450 + }, + { + "epoch": 0.065, + "grad_norm": 8.79783821105957, + "learning_rate": 9.74032e-07, + "loss": 0.7296, + "step": 6500 + }, + { + "epoch": 0.0655, + "grad_norm": 19.08173942565918, + "learning_rate": 9.73832e-07, + "loss": 0.5812, + "step": 6550 + }, + { + "epoch": 0.066, + "grad_norm": 7.165023326873779, + "learning_rate": 9.73632e-07, + "loss": 0.7078, + "step": 6600 + }, + { + "epoch": 0.0665, + "grad_norm": 58.18810272216797, + "learning_rate": 9.73432e-07, + "loss": 0.6484, + "step": 6650 + }, + { + "epoch": 0.067, + "grad_norm": 50.902000427246094, + "learning_rate": 9.732319999999999e-07, + "loss": 0.7417, + "step": 6700 + }, + { + "epoch": 0.0675, + "grad_norm": 15.194981575012207, + "learning_rate": 9.730319999999999e-07, + "loss": 0.66, + "step": 6750 + }, + { + "epoch": 0.068, + "grad_norm": 12.29551887512207, + "learning_rate": 9.72832e-07, + "loss": 0.6219, + "step": 6800 + }, + { + "epoch": 0.0685, + "grad_norm": 33.02836608886719, + "learning_rate": 9.72632e-07, + "loss": 0.7864, + "step": 6850 + }, + { + "epoch": 0.069, + "grad_norm": 61.996707916259766, + "learning_rate": 9.724319999999998e-07, + "loss": 0.7458, + "step": 6900 + }, + { + "epoch": 0.0695, + "grad_norm": 50.779640197753906, + "learning_rate": 9.72232e-07, + "loss": 0.6969, + "step": 6950 + }, + { + "epoch": 0.07, + "grad_norm": 63.49092483520508, + "learning_rate": 9.72032e-07, + "loss": 0.6315, + "step": 7000 + }, + { + "epoch": 0.0705, + "grad_norm": 2.9683051109313965, + "learning_rate": 9.71832e-07, + "loss": 0.6708, + "step": 7050 + }, + { + "epoch": 0.071, + "grad_norm": 63.99093246459961, + "learning_rate": 9.71632e-07, + "loss": 0.5949, + "step": 7100 + }, + { + "epoch": 0.0715, + "grad_norm": 59.804561614990234, + "learning_rate": 9.71432e-07, + "loss": 0.6125, + "step": 7150 + }, + { + "epoch": 0.072, + "grad_norm": 52.34803009033203, + "learning_rate": 9.71232e-07, + "loss": 0.6553, + "step": 7200 + }, + { + "epoch": 0.0725, + "grad_norm": 48.697906494140625, + "learning_rate": 9.71032e-07, + "loss": 0.7065, + "step": 7250 + }, + { + "epoch": 0.073, + "grad_norm": 22.473642349243164, + "learning_rate": 9.70832e-07, + "loss": 0.5137, + "step": 7300 + }, + { + "epoch": 0.0735, + "grad_norm": 108.86209106445312, + "learning_rate": 9.706320000000001e-07, + "loss": 0.5885, + "step": 7350 + }, + { + "epoch": 0.074, + "grad_norm": 97.51536560058594, + "learning_rate": 9.704319999999999e-07, + "loss": 0.725, + "step": 7400 + }, + { + "epoch": 0.0745, + "grad_norm": 90.5220947265625, + "learning_rate": 9.702319999999999e-07, + "loss": 0.6542, + "step": 7450 + }, + { + "epoch": 0.075, + "grad_norm": 79.1460189819336, + "learning_rate": 9.70032e-07, + "loss": 0.7872, + "step": 7500 + }, + { + "epoch": 0.0755, + "grad_norm": 11.08342170715332, + "learning_rate": 9.69832e-07, + "loss": 0.6952, + "step": 7550 + }, + { + "epoch": 0.076, + "grad_norm": 70.80834197998047, + "learning_rate": 9.69632e-07, + "loss": 0.7243, + "step": 7600 + }, + { + "epoch": 0.0765, + "grad_norm": 51.24228286743164, + "learning_rate": 9.69432e-07, + "loss": 0.6433, + "step": 7650 + }, + { + "epoch": 0.077, + "grad_norm": 65.85570526123047, + "learning_rate": 9.69232e-07, + "loss": 0.7118, + "step": 7700 + }, + { + "epoch": 0.0775, + "grad_norm": 77.98046112060547, + "learning_rate": 9.69032e-07, + "loss": 0.6795, + "step": 7750 + }, + { + "epoch": 0.078, + "grad_norm": 58.70011901855469, + "learning_rate": 9.68832e-07, + "loss": 0.677, + "step": 7800 + }, + { + "epoch": 0.0785, + "grad_norm": 47.72407150268555, + "learning_rate": 9.68632e-07, + "loss": 0.8678, + "step": 7850 + }, + { + "epoch": 0.079, + "grad_norm": 77.00784301757812, + "learning_rate": 9.684359999999998e-07, + "loss": 0.61, + "step": 7900 + }, + { + "epoch": 0.0795, + "grad_norm": 82.65626525878906, + "learning_rate": 9.68236e-07, + "loss": 0.6014, + "step": 7950 + }, + { + "epoch": 0.08, + "grad_norm": 64.11064147949219, + "learning_rate": 9.68036e-07, + "loss": 0.5775, + "step": 8000 + }, + { + "epoch": 0.0805, + "grad_norm": 75.26988220214844, + "learning_rate": 9.67836e-07, + "loss": 0.7189, + "step": 8050 + }, + { + "epoch": 0.081, + "grad_norm": 53.913936614990234, + "learning_rate": 9.67636e-07, + "loss": 0.6397, + "step": 8100 + }, + { + "epoch": 0.0815, + "grad_norm": 22.316036224365234, + "learning_rate": 9.67436e-07, + "loss": 0.5865, + "step": 8150 + }, + { + "epoch": 0.082, + "grad_norm": 25.347740173339844, + "learning_rate": 9.67236e-07, + "loss": 0.6304, + "step": 8200 + }, + { + "epoch": 0.0825, + "grad_norm": 59.95059585571289, + "learning_rate": 9.67036e-07, + "loss": 0.5163, + "step": 8250 + }, + { + "epoch": 0.083, + "grad_norm": 35.91787338256836, + "learning_rate": 9.66836e-07, + "loss": 0.6994, + "step": 8300 + }, + { + "epoch": 0.0835, + "grad_norm": 10.109200477600098, + "learning_rate": 9.66636e-07, + "loss": 0.6286, + "step": 8350 + }, + { + "epoch": 0.084, + "grad_norm": 25.906421661376953, + "learning_rate": 9.66436e-07, + "loss": 0.6338, + "step": 8400 + }, + { + "epoch": 0.0845, + "grad_norm": 79.9737777709961, + "learning_rate": 9.662359999999999e-07, + "loss": 0.8442, + "step": 8450 + }, + { + "epoch": 0.085, + "grad_norm": 41.02357482910156, + "learning_rate": 9.660359999999999e-07, + "loss": 0.7414, + "step": 8500 + }, + { + "epoch": 0.0855, + "grad_norm": 44.978729248046875, + "learning_rate": 9.65836e-07, + "loss": 0.8366, + "step": 8550 + }, + { + "epoch": 0.086, + "grad_norm": 55.472808837890625, + "learning_rate": 9.65636e-07, + "loss": 0.5963, + "step": 8600 + }, + { + "epoch": 0.0865, + "grad_norm": 11.16412353515625, + "learning_rate": 9.654359999999998e-07, + "loss": 0.6936, + "step": 8650 + }, + { + "epoch": 0.087, + "grad_norm": 83.75244903564453, + "learning_rate": 9.65236e-07, + "loss": 0.6425, + "step": 8700 + }, + { + "epoch": 0.0875, + "grad_norm": 54.668190002441406, + "learning_rate": 9.65036e-07, + "loss": 0.6609, + "step": 8750 + }, + { + "epoch": 0.088, + "grad_norm": 78.21800231933594, + "learning_rate": 9.64836e-07, + "loss": 0.6267, + "step": 8800 + }, + { + "epoch": 0.0885, + "grad_norm": 13.47957706451416, + "learning_rate": 9.64636e-07, + "loss": 0.689, + "step": 8850 + }, + { + "epoch": 0.089, + "grad_norm": 15.554354667663574, + "learning_rate": 9.64436e-07, + "loss": 0.7207, + "step": 8900 + }, + { + "epoch": 0.0895, + "grad_norm": 77.30255126953125, + "learning_rate": 9.64236e-07, + "loss": 0.5907, + "step": 8950 + }, + { + "epoch": 0.09, + "grad_norm": 50.847564697265625, + "learning_rate": 9.64036e-07, + "loss": 0.623, + "step": 9000 + }, + { + "epoch": 0.0905, + "grad_norm": 20.90938377380371, + "learning_rate": 9.63836e-07, + "loss": 0.7254, + "step": 9050 + }, + { + "epoch": 0.091, + "grad_norm": 77.27519989013672, + "learning_rate": 9.63636e-07, + "loss": 0.6997, + "step": 9100 + }, + { + "epoch": 0.0915, + "grad_norm": 71.25733947753906, + "learning_rate": 9.634359999999999e-07, + "loss": 0.6436, + "step": 9150 + }, + { + "epoch": 0.092, + "grad_norm": 47.3591423034668, + "learning_rate": 9.632359999999999e-07, + "loss": 0.6289, + "step": 9200 + }, + { + "epoch": 0.0925, + "grad_norm": 0.31778019666671753, + "learning_rate": 9.630359999999999e-07, + "loss": 0.58, + "step": 9250 + }, + { + "epoch": 0.093, + "grad_norm": 73.01952362060547, + "learning_rate": 9.62836e-07, + "loss": 0.6876, + "step": 9300 + }, + { + "epoch": 0.0935, + "grad_norm": 79.24958038330078, + "learning_rate": 9.62636e-07, + "loss": 0.7076, + "step": 9350 + }, + { + "epoch": 0.094, + "grad_norm": 33.39700698852539, + "learning_rate": 9.624359999999998e-07, + "loss": 0.6385, + "step": 9400 + }, + { + "epoch": 0.0945, + "grad_norm": 6.0091376304626465, + "learning_rate": 9.62236e-07, + "loss": 0.6001, + "step": 9450 + }, + { + "epoch": 0.095, + "grad_norm": 93.4347152709961, + "learning_rate": 9.62036e-07, + "loss": 0.5681, + "step": 9500 + }, + { + "epoch": 0.0955, + "grad_norm": 124.39002227783203, + "learning_rate": 9.61836e-07, + "loss": 0.5942, + "step": 9550 + }, + { + "epoch": 0.096, + "grad_norm": 26.654417037963867, + "learning_rate": 9.61636e-07, + "loss": 0.5937, + "step": 9600 + }, + { + "epoch": 0.0965, + "grad_norm": 84.6335678100586, + "learning_rate": 9.61436e-07, + "loss": 0.6569, + "step": 9650 + }, + { + "epoch": 0.097, + "grad_norm": 68.39822387695312, + "learning_rate": 9.61236e-07, + "loss": 0.6662, + "step": 9700 + }, + { + "epoch": 0.0975, + "grad_norm": 46.38802719116211, + "learning_rate": 9.61036e-07, + "loss": 0.6765, + "step": 9750 + }, + { + "epoch": 0.098, + "grad_norm": 12.477609634399414, + "learning_rate": 9.60836e-07, + "loss": 0.6491, + "step": 9800 + }, + { + "epoch": 0.0985, + "grad_norm": 7.226109027862549, + "learning_rate": 9.606360000000001e-07, + "loss": 0.6194, + "step": 9850 + }, + { + "epoch": 0.099, + "grad_norm": 47.4417724609375, + "learning_rate": 9.604359999999999e-07, + "loss": 0.7301, + "step": 9900 + }, + { + "epoch": 0.0995, + "grad_norm": 91.45806884765625, + "learning_rate": 9.602359999999999e-07, + "loss": 0.6963, + "step": 9950 + }, + { + "epoch": 0.1, + "grad_norm": 40.271846771240234, + "learning_rate": 9.60036e-07, + "loss": 0.7493, + "step": 10000 + }, + { + "epoch": 0.1005, + "grad_norm": 87.0700912475586, + "learning_rate": 9.59836e-07, + "loss": 0.558, + "step": 10050 + }, + { + "epoch": 0.101, + "grad_norm": 89.44054412841797, + "learning_rate": 9.5964e-07, + "loss": 0.4637, + "step": 10100 + }, + { + "epoch": 0.1015, + "grad_norm": 104.4756088256836, + "learning_rate": 9.5944e-07, + "loss": 0.6641, + "step": 10150 + }, + { + "epoch": 0.102, + "grad_norm": 77.9275131225586, + "learning_rate": 9.5924e-07, + "loss": 0.4791, + "step": 10200 + }, + { + "epoch": 0.1025, + "grad_norm": 113.33071899414062, + "learning_rate": 9.590399999999999e-07, + "loss": 0.5757, + "step": 10250 + }, + { + "epoch": 0.103, + "grad_norm": 74.14317321777344, + "learning_rate": 9.5884e-07, + "loss": 0.7051, + "step": 10300 + }, + { + "epoch": 0.1035, + "grad_norm": 91.95326232910156, + "learning_rate": 9.5864e-07, + "loss": 0.7315, + "step": 10350 + }, + { + "epoch": 0.104, + "grad_norm": 72.8747329711914, + "learning_rate": 9.584399999999998e-07, + "loss": 0.7391, + "step": 10400 + }, + { + "epoch": 0.1045, + "grad_norm": 61.38804244995117, + "learning_rate": 9.5824e-07, + "loss": 0.6073, + "step": 10450 + }, + { + "epoch": 0.105, + "grad_norm": 65.2311782836914, + "learning_rate": 9.5804e-07, + "loss": 0.792, + "step": 10500 + }, + { + "epoch": 0.1055, + "grad_norm": 52.02727508544922, + "learning_rate": 9.5784e-07, + "loss": 0.5261, + "step": 10550 + }, + { + "epoch": 0.106, + "grad_norm": 34.90068054199219, + "learning_rate": 9.5764e-07, + "loss": 0.5306, + "step": 10600 + }, + { + "epoch": 0.1065, + "grad_norm": 58.16232681274414, + "learning_rate": 9.5744e-07, + "loss": 0.7673, + "step": 10650 + }, + { + "epoch": 0.107, + "grad_norm": 15.396655082702637, + "learning_rate": 9.5724e-07, + "loss": 0.5671, + "step": 10700 + }, + { + "epoch": 0.1075, + "grad_norm": 15.548702239990234, + "learning_rate": 9.5704e-07, + "loss": 0.7041, + "step": 10750 + }, + { + "epoch": 0.108, + "grad_norm": 101.34947204589844, + "learning_rate": 9.5684e-07, + "loss": 0.7207, + "step": 10800 + }, + { + "epoch": 0.1085, + "grad_norm": 6.5170207023620605, + "learning_rate": 9.5664e-07, + "loss": 0.5903, + "step": 10850 + }, + { + "epoch": 0.109, + "grad_norm": 36.6441764831543, + "learning_rate": 9.5644e-07, + "loss": 0.6876, + "step": 10900 + }, + { + "epoch": 0.1095, + "grad_norm": 63.76533508300781, + "learning_rate": 9.562399999999999e-07, + "loss": 0.6007, + "step": 10950 + }, + { + "epoch": 0.11, + "grad_norm": 52.73303985595703, + "learning_rate": 9.560399999999999e-07, + "loss": 0.6017, + "step": 11000 + }, + { + "epoch": 0.1105, + "grad_norm": 62.94703674316406, + "learning_rate": 9.5584e-07, + "loss": 0.6171, + "step": 11050 + }, + { + "epoch": 0.111, + "grad_norm": 60.48774337768555, + "learning_rate": 9.5564e-07, + "loss": 0.5815, + "step": 11100 + }, + { + "epoch": 0.1115, + "grad_norm": 84.18730163574219, + "learning_rate": 9.554399999999998e-07, + "loss": 0.6942, + "step": 11150 + }, + { + "epoch": 0.112, + "grad_norm": 81.33743286132812, + "learning_rate": 9.5524e-07, + "loss": 0.5393, + "step": 11200 + }, + { + "epoch": 0.1125, + "grad_norm": 58.714378356933594, + "learning_rate": 9.5504e-07, + "loss": 0.532, + "step": 11250 + }, + { + "epoch": 0.113, + "grad_norm": 97.91901397705078, + "learning_rate": 9.5484e-07, + "loss": 0.6323, + "step": 11300 + }, + { + "epoch": 0.1135, + "grad_norm": 5.727772235870361, + "learning_rate": 9.5464e-07, + "loss": 0.5041, + "step": 11350 + }, + { + "epoch": 0.114, + "grad_norm": 39.63825988769531, + "learning_rate": 9.5444e-07, + "loss": 0.5549, + "step": 11400 + }, + { + "epoch": 0.1145, + "grad_norm": 103.17711639404297, + "learning_rate": 9.5424e-07, + "loss": 0.5167, + "step": 11450 + }, + { + "epoch": 0.115, + "grad_norm": 48.15481948852539, + "learning_rate": 9.5404e-07, + "loss": 0.7287, + "step": 11500 + }, + { + "epoch": 0.1155, + "grad_norm": 4.032690525054932, + "learning_rate": 9.5384e-07, + "loss": 0.64, + "step": 11550 + }, + { + "epoch": 0.116, + "grad_norm": 22.449026107788086, + "learning_rate": 9.5364e-07, + "loss": 0.6243, + "step": 11600 + }, + { + "epoch": 0.1165, + "grad_norm": 76.77507019042969, + "learning_rate": 9.534399999999999e-07, + "loss": 0.6222, + "step": 11650 + }, + { + "epoch": 0.117, + "grad_norm": 3.4103989601135254, + "learning_rate": 9.5324e-07, + "loss": 0.7213, + "step": 11700 + }, + { + "epoch": 0.1175, + "grad_norm": 40.05355453491211, + "learning_rate": 9.5304e-07, + "loss": 0.6782, + "step": 11750 + }, + { + "epoch": 0.118, + "grad_norm": 13.29817008972168, + "learning_rate": 9.5284e-07, + "loss": 0.7741, + "step": 11800 + }, + { + "epoch": 0.1185, + "grad_norm": 55.80624771118164, + "learning_rate": 9.5264e-07, + "loss": 0.7785, + "step": 11850 + }, + { + "epoch": 0.119, + "grad_norm": 75.11266326904297, + "learning_rate": 9.524399999999999e-07, + "loss": 0.7455, + "step": 11900 + }, + { + "epoch": 0.1195, + "grad_norm": 28.10260009765625, + "learning_rate": 9.522399999999999e-07, + "loss": 0.6025, + "step": 11950 + }, + { + "epoch": 0.12, + "grad_norm": 71.65565490722656, + "learning_rate": 9.5204e-07, + "loss": 0.7934, + "step": 12000 + }, + { + "epoch": 0.1205, + "grad_norm": 54.734107971191406, + "learning_rate": 9.5184e-07, + "loss": 0.6244, + "step": 12050 + }, + { + "epoch": 0.121, + "grad_norm": 3.3738136291503906, + "learning_rate": 9.5164e-07, + "loss": 0.51, + "step": 12100 + }, + { + "epoch": 0.1215, + "grad_norm": 50.44538116455078, + "learning_rate": 9.5144e-07, + "loss": 0.5917, + "step": 12150 + }, + { + "epoch": 0.122, + "grad_norm": 22.168771743774414, + "learning_rate": 9.512399999999999e-07, + "loss": 0.5424, + "step": 12200 + }, + { + "epoch": 0.1225, + "grad_norm": 55.60480499267578, + "learning_rate": 9.510399999999999e-07, + "loss": 0.6332, + "step": 12250 + }, + { + "epoch": 0.123, + "grad_norm": 42.9905891418457, + "learning_rate": 9.5084e-07, + "loss": 0.5598, + "step": 12300 + }, + { + "epoch": 0.1235, + "grad_norm": 2.6760189533233643, + "learning_rate": 9.5064e-07, + "loss": 0.671, + "step": 12350 + }, + { + "epoch": 0.124, + "grad_norm": 62.84537124633789, + "learning_rate": 9.504399999999999e-07, + "loss": 0.6672, + "step": 12400 + }, + { + "epoch": 0.1245, + "grad_norm": 60.886653900146484, + "learning_rate": 9.5024e-07, + "loss": 0.7067, + "step": 12450 + }, + { + "epoch": 0.125, + "grad_norm": 84.8931655883789, + "learning_rate": 9.5004e-07, + "loss": 0.5629, + "step": 12500 + }, + { + "epoch": 0.1255, + "grad_norm": 0.7383530735969543, + "learning_rate": 9.498399999999999e-07, + "loss": 0.5731, + "step": 12550 + }, + { + "epoch": 0.126, + "grad_norm": 146.97189331054688, + "learning_rate": 9.4964e-07, + "loss": 0.5422, + "step": 12600 + }, + { + "epoch": 0.1265, + "grad_norm": 13.385457038879395, + "learning_rate": 9.494399999999999e-07, + "loss": 0.6314, + "step": 12650 + }, + { + "epoch": 0.127, + "grad_norm": 9.101408004760742, + "learning_rate": 9.492399999999999e-07, + "loss": 0.6082, + "step": 12700 + }, + { + "epoch": 0.1275, + "grad_norm": 9.814948081970215, + "learning_rate": 9.4904e-07, + "loss": 0.6427, + "step": 12750 + }, + { + "epoch": 0.128, + "grad_norm": 1.8460052013397217, + "learning_rate": 9.4884e-07, + "loss": 0.6449, + "step": 12800 + }, + { + "epoch": 0.1285, + "grad_norm": 2.2399051189422607, + "learning_rate": 9.4864e-07, + "loss": 0.6494, + "step": 12850 + }, + { + "epoch": 0.129, + "grad_norm": 51.1398811340332, + "learning_rate": 9.484399999999999e-07, + "loss": 0.5733, + "step": 12900 + }, + { + "epoch": 0.1295, + "grad_norm": 48.945919036865234, + "learning_rate": 9.482399999999999e-07, + "loss": 0.6088, + "step": 12950 + }, + { + "epoch": 0.13, + "grad_norm": 55.05748748779297, + "learning_rate": 9.480399999999999e-07, + "loss": 0.5011, + "step": 13000 + }, + { + "epoch": 0.1305, + "grad_norm": 13.456110954284668, + "learning_rate": 9.4784e-07, + "loss": 0.7739, + "step": 13050 + }, + { + "epoch": 0.131, + "grad_norm": 76.10543060302734, + "learning_rate": 9.4764e-07, + "loss": 0.6818, + "step": 13100 + }, + { + "epoch": 0.1315, + "grad_norm": 51.6418571472168, + "learning_rate": 9.474439999999999e-07, + "loss": 0.525, + "step": 13150 + }, + { + "epoch": 0.132, + "grad_norm": 17.212923049926758, + "learning_rate": 9.47244e-07, + "loss": 0.5746, + "step": 13200 + }, + { + "epoch": 0.1325, + "grad_norm": 53.69514465332031, + "learning_rate": 9.470439999999999e-07, + "loss": 0.5003, + "step": 13250 + }, + { + "epoch": 0.133, + "grad_norm": 40.141632080078125, + "learning_rate": 9.468439999999999e-07, + "loss": 0.6027, + "step": 13300 + }, + { + "epoch": 0.1335, + "grad_norm": 61.87038040161133, + "learning_rate": 9.46644e-07, + "loss": 0.6072, + "step": 13350 + }, + { + "epoch": 0.134, + "grad_norm": 122.38882446289062, + "learning_rate": 9.464439999999999e-07, + "loss": 0.6451, + "step": 13400 + }, + { + "epoch": 0.1345, + "grad_norm": 56.08554458618164, + "learning_rate": 9.462439999999999e-07, + "loss": 0.6374, + "step": 13450 + }, + { + "epoch": 0.135, + "grad_norm": 81.84549713134766, + "learning_rate": 9.46044e-07, + "loss": 0.5401, + "step": 13500 + }, + { + "epoch": 0.1355, + "grad_norm": 118.67034912109375, + "learning_rate": 9.45844e-07, + "loss": 0.6624, + "step": 13550 + }, + { + "epoch": 0.136, + "grad_norm": 57.54740905761719, + "learning_rate": 9.45644e-07, + "loss": 0.5806, + "step": 13600 + }, + { + "epoch": 0.1365, + "grad_norm": 105.1045913696289, + "learning_rate": 9.454439999999999e-07, + "loss": 0.5867, + "step": 13650 + }, + { + "epoch": 0.137, + "grad_norm": 25.346324920654297, + "learning_rate": 9.452439999999999e-07, + "loss": 0.6926, + "step": 13700 + }, + { + "epoch": 0.1375, + "grad_norm": 91.3180160522461, + "learning_rate": 9.45044e-07, + "loss": 0.5217, + "step": 13750 + }, + { + "epoch": 0.138, + "grad_norm": 79.1546630859375, + "learning_rate": 9.44844e-07, + "loss": 0.6172, + "step": 13800 + }, + { + "epoch": 0.1385, + "grad_norm": 92.9755630493164, + "learning_rate": 9.44644e-07, + "loss": 0.7169, + "step": 13850 + }, + { + "epoch": 0.139, + "grad_norm": 33.35745620727539, + "learning_rate": 9.44444e-07, + "loss": 0.6807, + "step": 13900 + }, + { + "epoch": 0.1395, + "grad_norm": 14.041089057922363, + "learning_rate": 9.442439999999999e-07, + "loss": 0.5662, + "step": 13950 + }, + { + "epoch": 0.14, + "grad_norm": 46.25873947143555, + "learning_rate": 9.440439999999999e-07, + "loss": 0.6316, + "step": 14000 + }, + { + "epoch": 0.1405, + "grad_norm": 59.24448013305664, + "learning_rate": 9.43844e-07, + "loss": 0.6294, + "step": 14050 + }, + { + "epoch": 0.141, + "grad_norm": 68.7812728881836, + "learning_rate": 9.43644e-07, + "loss": 0.6128, + "step": 14100 + }, + { + "epoch": 0.1415, + "grad_norm": 66.03839874267578, + "learning_rate": 9.434439999999999e-07, + "loss": 0.7409, + "step": 14150 + }, + { + "epoch": 0.142, + "grad_norm": 28.09392738342285, + "learning_rate": 9.43244e-07, + "loss": 0.7238, + "step": 14200 + }, + { + "epoch": 0.1425, + "grad_norm": 11.177745819091797, + "learning_rate": 9.43044e-07, + "loss": 0.5512, + "step": 14250 + }, + { + "epoch": 0.143, + "grad_norm": 77.08145141601562, + "learning_rate": 9.428439999999999e-07, + "loss": 0.558, + "step": 14300 + }, + { + "epoch": 0.1435, + "grad_norm": 32.226314544677734, + "learning_rate": 9.42644e-07, + "loss": 0.6093, + "step": 14350 + }, + { + "epoch": 0.144, + "grad_norm": 39.99115753173828, + "learning_rate": 9.424439999999999e-07, + "loss": 0.511, + "step": 14400 + }, + { + "epoch": 0.1445, + "grad_norm": 45.316959381103516, + "learning_rate": 9.422439999999999e-07, + "loss": 0.6866, + "step": 14450 + }, + { + "epoch": 0.145, + "grad_norm": 9.914299011230469, + "learning_rate": 9.42044e-07, + "loss": 0.5704, + "step": 14500 + }, + { + "epoch": 0.1455, + "grad_norm": 112.46765899658203, + "learning_rate": 9.41844e-07, + "loss": 0.6679, + "step": 14550 + }, + { + "epoch": 0.146, + "grad_norm": 18.95020866394043, + "learning_rate": 9.41644e-07, + "loss": 0.4952, + "step": 14600 + }, + { + "epoch": 0.1465, + "grad_norm": 74.44670867919922, + "learning_rate": 9.414439999999999e-07, + "loss": 0.5307, + "step": 14650 + }, + { + "epoch": 0.147, + "grad_norm": 1.2418839931488037, + "learning_rate": 9.412439999999999e-07, + "loss": 0.5571, + "step": 14700 + }, + { + "epoch": 0.1475, + "grad_norm": 49.43461608886719, + "learning_rate": 9.410439999999999e-07, + "loss": 0.4794, + "step": 14750 + }, + { + "epoch": 0.148, + "grad_norm": 74.62863159179688, + "learning_rate": 9.40844e-07, + "loss": 0.5945, + "step": 14800 + }, + { + "epoch": 0.1485, + "grad_norm": 81.81336212158203, + "learning_rate": 9.40644e-07, + "loss": 0.62, + "step": 14850 + }, + { + "epoch": 0.149, + "grad_norm": 63.31863021850586, + "learning_rate": 9.404439999999999e-07, + "loss": 0.5821, + "step": 14900 + }, + { + "epoch": 0.1495, + "grad_norm": 42.47512435913086, + "learning_rate": 9.40244e-07, + "loss": 0.6294, + "step": 14950 + }, + { + "epoch": 0.15, + "grad_norm": 76.9931869506836, + "learning_rate": 9.400439999999999e-07, + "loss": 0.4261, + "step": 15000 + }, + { + "epoch": 0.1505, + "grad_norm": 61.6082763671875, + "learning_rate": 9.398439999999999e-07, + "loss": 0.671, + "step": 15050 + }, + { + "epoch": 0.151, + "grad_norm": 64.3666763305664, + "learning_rate": 9.39644e-07, + "loss": 0.7315, + "step": 15100 + }, + { + "epoch": 0.1515, + "grad_norm": 50.603111267089844, + "learning_rate": 9.394439999999999e-07, + "loss": 0.6032, + "step": 15150 + }, + { + "epoch": 0.152, + "grad_norm": 88.94772338867188, + "learning_rate": 9.392439999999999e-07, + "loss": 0.5629, + "step": 15200 + }, + { + "epoch": 0.1525, + "grad_norm": 16.136049270629883, + "learning_rate": 9.39048e-07, + "loss": 0.547, + "step": 15250 + }, + { + "epoch": 0.153, + "grad_norm": 14.290252685546875, + "learning_rate": 9.38848e-07, + "loss": 0.5877, + "step": 15300 + }, + { + "epoch": 0.1535, + "grad_norm": 72.88977813720703, + "learning_rate": 9.38648e-07, + "loss": 0.6141, + "step": 15350 + }, + { + "epoch": 0.154, + "grad_norm": 56.635902404785156, + "learning_rate": 9.384479999999999e-07, + "loss": 0.5268, + "step": 15400 + }, + { + "epoch": 0.1545, + "grad_norm": 36.33918762207031, + "learning_rate": 9.382479999999999e-07, + "loss": 0.6514, + "step": 15450 + }, + { + "epoch": 0.155, + "grad_norm": 53.50493621826172, + "learning_rate": 9.380479999999999e-07, + "loss": 0.573, + "step": 15500 + }, + { + "epoch": 0.1555, + "grad_norm": 2.055042266845703, + "learning_rate": 9.37848e-07, + "loss": 0.6182, + "step": 15550 + }, + { + "epoch": 0.156, + "grad_norm": 173.22987365722656, + "learning_rate": 9.37648e-07, + "loss": 0.6573, + "step": 15600 + }, + { + "epoch": 0.1565, + "grad_norm": 22.735443115234375, + "learning_rate": 9.374479999999999e-07, + "loss": 0.6504, + "step": 15650 + }, + { + "epoch": 0.157, + "grad_norm": 76.63526153564453, + "learning_rate": 9.37248e-07, + "loss": 0.6736, + "step": 15700 + }, + { + "epoch": 0.1575, + "grad_norm": 16.873645782470703, + "learning_rate": 9.370479999999999e-07, + "loss": 0.624, + "step": 15750 + }, + { + "epoch": 0.158, + "grad_norm": 6.853653430938721, + "learning_rate": 9.368479999999999e-07, + "loss": 0.705, + "step": 15800 + }, + { + "epoch": 0.1585, + "grad_norm": 70.6560287475586, + "learning_rate": 9.36648e-07, + "loss": 0.603, + "step": 15850 + }, + { + "epoch": 0.159, + "grad_norm": 59.094749450683594, + "learning_rate": 9.364479999999999e-07, + "loss": 0.7677, + "step": 15900 + }, + { + "epoch": 0.1595, + "grad_norm": 7.74002742767334, + "learning_rate": 9.362479999999999e-07, + "loss": 0.5653, + "step": 15950 + }, + { + "epoch": 0.16, + "grad_norm": 20.42781639099121, + "learning_rate": 9.36048e-07, + "loss": 0.5747, + "step": 16000 + }, + { + "epoch": 0.1605, + "grad_norm": 56.63848876953125, + "learning_rate": 9.35848e-07, + "loss": 0.7402, + "step": 16050 + }, + { + "epoch": 0.161, + "grad_norm": 62.384342193603516, + "learning_rate": 9.35648e-07, + "loss": 0.6194, + "step": 16100 + }, + { + "epoch": 0.1615, + "grad_norm": 53.056766510009766, + "learning_rate": 9.354479999999999e-07, + "loss": 0.7444, + "step": 16150 + }, + { + "epoch": 0.162, + "grad_norm": 82.36703491210938, + "learning_rate": 9.352479999999999e-07, + "loss": 0.5666, + "step": 16200 + }, + { + "epoch": 0.1625, + "grad_norm": 18.262683868408203, + "learning_rate": 9.35048e-07, + "loss": 0.5184, + "step": 16250 + }, + { + "epoch": 0.163, + "grad_norm": 96.85376739501953, + "learning_rate": 9.34848e-07, + "loss": 0.6913, + "step": 16300 + }, + { + "epoch": 0.1635, + "grad_norm": 35.92914962768555, + "learning_rate": 9.34648e-07, + "loss": 0.5972, + "step": 16350 + }, + { + "epoch": 0.164, + "grad_norm": 33.37955856323242, + "learning_rate": 9.34448e-07, + "loss": 0.5924, + "step": 16400 + }, + { + "epoch": 0.1645, + "grad_norm": 33.09463882446289, + "learning_rate": 9.342479999999999e-07, + "loss": 0.5298, + "step": 16450 + }, + { + "epoch": 0.165, + "grad_norm": 26.03669548034668, + "learning_rate": 9.340479999999999e-07, + "loss": 0.4642, + "step": 16500 + }, + { + "epoch": 0.1655, + "grad_norm": 20.21666717529297, + "learning_rate": 9.33848e-07, + "loss": 0.5442, + "step": 16550 + }, + { + "epoch": 0.166, + "grad_norm": 23.451101303100586, + "learning_rate": 9.33648e-07, + "loss": 0.5649, + "step": 16600 + }, + { + "epoch": 0.1665, + "grad_norm": 8.985445022583008, + "learning_rate": 9.33448e-07, + "loss": 0.5812, + "step": 16650 + }, + { + "epoch": 0.167, + "grad_norm": 94.17129516601562, + "learning_rate": 9.33248e-07, + "loss": 0.5932, + "step": 16700 + }, + { + "epoch": 0.1675, + "grad_norm": 55.66262435913086, + "learning_rate": 9.33048e-07, + "loss": 0.5894, + "step": 16750 + }, + { + "epoch": 0.168, + "grad_norm": 22.5528507232666, + "learning_rate": 9.328479999999999e-07, + "loss": 0.6531, + "step": 16800 + }, + { + "epoch": 0.1685, + "grad_norm": 1.4093486070632935, + "learning_rate": 9.32648e-07, + "loss": 0.6753, + "step": 16850 + }, + { + "epoch": 0.169, + "grad_norm": 2.8292222023010254, + "learning_rate": 9.32448e-07, + "loss": 0.6501, + "step": 16900 + }, + { + "epoch": 0.1695, + "grad_norm": 55.30830383300781, + "learning_rate": 9.322479999999999e-07, + "loss": 0.4652, + "step": 16950 + }, + { + "epoch": 0.17, + "grad_norm": 2.9419615268707275, + "learning_rate": 9.32048e-07, + "loss": 0.5817, + "step": 17000 + }, + { + "epoch": 0.1705, + "grad_norm": 21.788589477539062, + "learning_rate": 9.31848e-07, + "loss": 0.4845, + "step": 17050 + }, + { + "epoch": 0.171, + "grad_norm": 51.12996292114258, + "learning_rate": 9.31648e-07, + "loss": 0.6086, + "step": 17100 + }, + { + "epoch": 0.1715, + "grad_norm": 29.451993942260742, + "learning_rate": 9.31448e-07, + "loss": 0.609, + "step": 17150 + }, + { + "epoch": 0.172, + "grad_norm": 26.545246124267578, + "learning_rate": 9.312479999999999e-07, + "loss": 0.7538, + "step": 17200 + }, + { + "epoch": 0.1725, + "grad_norm": 88.69632720947266, + "learning_rate": 9.310479999999999e-07, + "loss": 0.629, + "step": 17250 + }, + { + "epoch": 0.173, + "grad_norm": 24.869253158569336, + "learning_rate": 9.30848e-07, + "loss": 0.6385, + "step": 17300 + }, + { + "epoch": 0.1735, + "grad_norm": 116.06053161621094, + "learning_rate": 9.30652e-07, + "loss": 0.6501, + "step": 17350 + }, + { + "epoch": 0.174, + "grad_norm": 89.61216735839844, + "learning_rate": 9.304519999999999e-07, + "loss": 0.5625, + "step": 17400 + }, + { + "epoch": 0.1745, + "grad_norm": 0.683408260345459, + "learning_rate": 9.30252e-07, + "loss": 0.7442, + "step": 17450 + }, + { + "epoch": 0.175, + "grad_norm": 43.17182540893555, + "learning_rate": 9.300519999999999e-07, + "loss": 0.7728, + "step": 17500 + }, + { + "epoch": 0.1755, + "grad_norm": 16.35731315612793, + "learning_rate": 9.298519999999999e-07, + "loss": 0.5745, + "step": 17550 + }, + { + "epoch": 0.176, + "grad_norm": 8.577791213989258, + "learning_rate": 9.29652e-07, + "loss": 0.465, + "step": 17600 + }, + { + "epoch": 0.1765, + "grad_norm": 55.087440490722656, + "learning_rate": 9.294519999999999e-07, + "loss": 0.5468, + "step": 17650 + }, + { + "epoch": 0.177, + "grad_norm": 65.42967987060547, + "learning_rate": 9.292519999999999e-07, + "loss": 0.597, + "step": 17700 + }, + { + "epoch": 0.1775, + "grad_norm": 4.98286247253418, + "learning_rate": 9.29052e-07, + "loss": 0.5523, + "step": 17750 + }, + { + "epoch": 0.178, + "grad_norm": 39.229949951171875, + "learning_rate": 9.28852e-07, + "loss": 0.6382, + "step": 17800 + }, + { + "epoch": 0.1785, + "grad_norm": 33.733394622802734, + "learning_rate": 9.286519999999999e-07, + "loss": 0.6871, + "step": 17850 + }, + { + "epoch": 0.179, + "grad_norm": 16.59604263305664, + "learning_rate": 9.28452e-07, + "loss": 0.6174, + "step": 17900 + }, + { + "epoch": 0.1795, + "grad_norm": 85.7757797241211, + "learning_rate": 9.282519999999999e-07, + "loss": 0.7438, + "step": 17950 + }, + { + "epoch": 0.18, + "grad_norm": 77.86770629882812, + "learning_rate": 9.280519999999999e-07, + "loss": 0.5513, + "step": 18000 + }, + { + "epoch": 0.1805, + "grad_norm": 4.501659393310547, + "learning_rate": 9.27852e-07, + "loss": 0.6059, + "step": 18050 + }, + { + "epoch": 0.181, + "grad_norm": 54.42782211303711, + "learning_rate": 9.27652e-07, + "loss": 0.4786, + "step": 18100 + }, + { + "epoch": 0.1815, + "grad_norm": 45.432151794433594, + "learning_rate": 9.274520000000001e-07, + "loss": 0.6646, + "step": 18150 + }, + { + "epoch": 0.182, + "grad_norm": 111.56059265136719, + "learning_rate": 9.272519999999999e-07, + "loss": 0.6535, + "step": 18200 + }, + { + "epoch": 0.1825, + "grad_norm": 78.46669006347656, + "learning_rate": 9.270519999999999e-07, + "loss": 0.5738, + "step": 18250 + }, + { + "epoch": 0.183, + "grad_norm": 2.2137699127197266, + "learning_rate": 9.26852e-07, + "loss": 0.55, + "step": 18300 + }, + { + "epoch": 0.1835, + "grad_norm": 65.4900894165039, + "learning_rate": 9.26652e-07, + "loss": 0.6607, + "step": 18350 + }, + { + "epoch": 0.184, + "grad_norm": 45.10904312133789, + "learning_rate": 9.26452e-07, + "loss": 0.6373, + "step": 18400 + }, + { + "epoch": 0.1845, + "grad_norm": 61.703250885009766, + "learning_rate": 9.26252e-07, + "loss": 0.4666, + "step": 18450 + }, + { + "epoch": 0.185, + "grad_norm": 105.42646026611328, + "learning_rate": 9.26052e-07, + "loss": 0.6782, + "step": 18500 + }, + { + "epoch": 0.1855, + "grad_norm": 108.39800262451172, + "learning_rate": 9.258519999999999e-07, + "loss": 0.6013, + "step": 18550 + }, + { + "epoch": 0.186, + "grad_norm": 48.339599609375, + "learning_rate": 9.25652e-07, + "loss": 0.5994, + "step": 18600 + }, + { + "epoch": 0.1865, + "grad_norm": 61.67754364013672, + "learning_rate": 9.25452e-07, + "loss": 0.5721, + "step": 18650 + }, + { + "epoch": 0.187, + "grad_norm": 33.096046447753906, + "learning_rate": 9.252519999999999e-07, + "loss": 0.6398, + "step": 18700 + }, + { + "epoch": 0.1875, + "grad_norm": 33.91469192504883, + "learning_rate": 9.25052e-07, + "loss": 0.4305, + "step": 18750 + }, + { + "epoch": 0.188, + "grad_norm": 32.4777717590332, + "learning_rate": 9.24852e-07, + "loss": 0.6306, + "step": 18800 + }, + { + "epoch": 0.1885, + "grad_norm": 19.212507247924805, + "learning_rate": 9.24652e-07, + "loss": 0.6042, + "step": 18850 + }, + { + "epoch": 0.189, + "grad_norm": 193.7104949951172, + "learning_rate": 9.24452e-07, + "loss": 0.5036, + "step": 18900 + }, + { + "epoch": 0.1895, + "grad_norm": 77.8702392578125, + "learning_rate": 9.242519999999999e-07, + "loss": 0.6825, + "step": 18950 + }, + { + "epoch": 0.19, + "grad_norm": 53.27097702026367, + "learning_rate": 9.240519999999999e-07, + "loss": 0.6074, + "step": 19000 + }, + { + "epoch": 0.1905, + "grad_norm": 16.498302459716797, + "learning_rate": 9.23852e-07, + "loss": 0.497, + "step": 19050 + }, + { + "epoch": 0.191, + "grad_norm": 75.21617889404297, + "learning_rate": 9.23652e-07, + "loss": 0.6573, + "step": 19100 + }, + { + "epoch": 0.1915, + "grad_norm": 4.08469295501709, + "learning_rate": 9.23452e-07, + "loss": 0.5048, + "step": 19150 + }, + { + "epoch": 0.192, + "grad_norm": 8.395776748657227, + "learning_rate": 9.23252e-07, + "loss": 0.5109, + "step": 19200 + }, + { + "epoch": 0.1925, + "grad_norm": 1.7910493612289429, + "learning_rate": 9.230519999999999e-07, + "loss": 0.5289, + "step": 19250 + }, + { + "epoch": 0.193, + "grad_norm": 67.36382293701172, + "learning_rate": 9.228519999999999e-07, + "loss": 0.7231, + "step": 19300 + }, + { + "epoch": 0.1935, + "grad_norm": 55.8679313659668, + "learning_rate": 9.22652e-07, + "loss": 0.5469, + "step": 19350 + }, + { + "epoch": 0.194, + "grad_norm": 108.610107421875, + "learning_rate": 9.22452e-07, + "loss": 0.7487, + "step": 19400 + }, + { + "epoch": 0.1945, + "grad_norm": 60.358062744140625, + "learning_rate": 9.222519999999999e-07, + "loss": 0.5649, + "step": 19450 + }, + { + "epoch": 0.195, + "grad_norm": 70.0567398071289, + "learning_rate": 9.22052e-07, + "loss": 0.526, + "step": 19500 + }, + { + "epoch": 0.1955, + "grad_norm": 23.887163162231445, + "learning_rate": 9.21856e-07, + "loss": 0.4628, + "step": 19550 + }, + { + "epoch": 0.196, + "grad_norm": 15.568492889404297, + "learning_rate": 9.21656e-07, + "loss": 0.683, + "step": 19600 + }, + { + "epoch": 0.1965, + "grad_norm": 78.2328872680664, + "learning_rate": 9.21456e-07, + "loss": 0.6217, + "step": 19650 + }, + { + "epoch": 0.197, + "grad_norm": 16.458410263061523, + "learning_rate": 9.212559999999999e-07, + "loss": 0.6503, + "step": 19700 + }, + { + "epoch": 0.1975, + "grad_norm": 73.25173950195312, + "learning_rate": 9.210559999999999e-07, + "loss": 0.6609, + "step": 19750 + }, + { + "epoch": 0.198, + "grad_norm": 47.52989959716797, + "learning_rate": 9.20856e-07, + "loss": 0.5237, + "step": 19800 + }, + { + "epoch": 0.1985, + "grad_norm": 62.07141876220703, + "learning_rate": 9.20656e-07, + "loss": 0.5814, + "step": 19850 + }, + { + "epoch": 0.199, + "grad_norm": 41.7489013671875, + "learning_rate": 9.20456e-07, + "loss": 0.6958, + "step": 19900 + }, + { + "epoch": 0.1995, + "grad_norm": 64.66690063476562, + "learning_rate": 9.20256e-07, + "loss": 0.5138, + "step": 19950 + }, + { + "epoch": 0.2, + "grad_norm": 11.058856010437012, + "learning_rate": 9.200559999999999e-07, + "loss": 0.5906, + "step": 20000 + }, + { + "epoch": 0.2005, + "grad_norm": 60.96815872192383, + "learning_rate": 9.198559999999999e-07, + "loss": 0.5804, + "step": 20050 + }, + { + "epoch": 0.201, + "grad_norm": 30.981765747070312, + "learning_rate": 9.19656e-07, + "loss": 0.8173, + "step": 20100 + }, + { + "epoch": 0.2015, + "grad_norm": 54.25685501098633, + "learning_rate": 9.19456e-07, + "loss": 0.5893, + "step": 20150 + }, + { + "epoch": 0.202, + "grad_norm": 105.40068054199219, + "learning_rate": 9.192559999999999e-07, + "loss": 0.5352, + "step": 20200 + }, + { + "epoch": 0.2025, + "grad_norm": 36.076011657714844, + "learning_rate": 9.19056e-07, + "loss": 0.4749, + "step": 20250 + }, + { + "epoch": 0.203, + "grad_norm": 51.85771179199219, + "learning_rate": 9.18856e-07, + "loss": 0.6509, + "step": 20300 + }, + { + "epoch": 0.2035, + "grad_norm": 54.14777755737305, + "learning_rate": 9.186559999999999e-07, + "loss": 0.7679, + "step": 20350 + }, + { + "epoch": 0.204, + "grad_norm": 90.21994018554688, + "learning_rate": 9.18456e-07, + "loss": 0.5251, + "step": 20400 + }, + { + "epoch": 0.2045, + "grad_norm": 2.283958911895752, + "learning_rate": 9.182559999999999e-07, + "loss": 0.4597, + "step": 20450 + }, + { + "epoch": 0.205, + "grad_norm": 51.34020233154297, + "learning_rate": 9.180559999999999e-07, + "loss": 0.6181, + "step": 20500 + }, + { + "epoch": 0.2055, + "grad_norm": 113.1830062866211, + "learning_rate": 9.17856e-07, + "loss": 0.5824, + "step": 20550 + }, + { + "epoch": 0.206, + "grad_norm": 95.66886901855469, + "learning_rate": 9.17656e-07, + "loss": 0.5657, + "step": 20600 + }, + { + "epoch": 0.2065, + "grad_norm": 72.63542938232422, + "learning_rate": 9.174560000000001e-07, + "loss": 0.5979, + "step": 20650 + }, + { + "epoch": 0.207, + "grad_norm": 62.32326889038086, + "learning_rate": 9.172559999999999e-07, + "loss": 0.5072, + "step": 20700 + }, + { + "epoch": 0.2075, + "grad_norm": 59.557228088378906, + "learning_rate": 9.170559999999999e-07, + "loss": 0.4438, + "step": 20750 + }, + { + "epoch": 0.208, + "grad_norm": 64.50997161865234, + "learning_rate": 9.16856e-07, + "loss": 0.6501, + "step": 20800 + }, + { + "epoch": 0.2085, + "grad_norm": 51.724708557128906, + "learning_rate": 9.16656e-07, + "loss": 0.6389, + "step": 20850 + }, + { + "epoch": 0.209, + "grad_norm": 44.57424545288086, + "learning_rate": 9.16456e-07, + "loss": 0.5444, + "step": 20900 + }, + { + "epoch": 0.2095, + "grad_norm": 46.1285400390625, + "learning_rate": 9.16256e-07, + "loss": 0.6703, + "step": 20950 + }, + { + "epoch": 0.21, + "grad_norm": 62.80519485473633, + "learning_rate": 9.16056e-07, + "loss": 0.5194, + "step": 21000 + }, + { + "epoch": 0.2105, + "grad_norm": 97.80638885498047, + "learning_rate": 9.158559999999999e-07, + "loss": 0.425, + "step": 21050 + }, + { + "epoch": 0.211, + "grad_norm": 79.29085540771484, + "learning_rate": 9.15656e-07, + "loss": 0.4748, + "step": 21100 + }, + { + "epoch": 0.2115, + "grad_norm": 15.490723609924316, + "learning_rate": 9.15456e-07, + "loss": 0.666, + "step": 21150 + }, + { + "epoch": 0.212, + "grad_norm": 1.4117364883422852, + "learning_rate": 9.152559999999999e-07, + "loss": 0.4462, + "step": 21200 + }, + { + "epoch": 0.2125, + "grad_norm": 64.279541015625, + "learning_rate": 9.15056e-07, + "loss": 0.462, + "step": 21250 + }, + { + "epoch": 0.213, + "grad_norm": 50.567176818847656, + "learning_rate": 9.14856e-07, + "loss": 0.629, + "step": 21300 + }, + { + "epoch": 0.2135, + "grad_norm": 74.72473907470703, + "learning_rate": 9.14656e-07, + "loss": 0.7797, + "step": 21350 + }, + { + "epoch": 0.214, + "grad_norm": 0.2427138090133667, + "learning_rate": 9.14456e-07, + "loss": 0.6424, + "step": 21400 + }, + { + "epoch": 0.2145, + "grad_norm": 94.19923400878906, + "learning_rate": 9.142559999999999e-07, + "loss": 0.5077, + "step": 21450 + }, + { + "epoch": 0.215, + "grad_norm": 5.564889430999756, + "learning_rate": 9.140559999999999e-07, + "loss": 0.5594, + "step": 21500 + }, + { + "epoch": 0.2155, + "grad_norm": 79.15170288085938, + "learning_rate": 9.1386e-07, + "loss": 0.5204, + "step": 21550 + }, + { + "epoch": 0.216, + "grad_norm": 34.72391128540039, + "learning_rate": 9.1366e-07, + "loss": 0.5304, + "step": 21600 + }, + { + "epoch": 0.2165, + "grad_norm": 45.1458625793457, + "learning_rate": 9.1346e-07, + "loss": 0.5573, + "step": 21650 + }, + { + "epoch": 0.217, + "grad_norm": 13.129668235778809, + "learning_rate": 9.1326e-07, + "loss": 0.5866, + "step": 21700 + }, + { + "epoch": 0.2175, + "grad_norm": 94.04682159423828, + "learning_rate": 9.130599999999999e-07, + "loss": 0.5363, + "step": 21750 + }, + { + "epoch": 0.218, + "grad_norm": 3.027968645095825, + "learning_rate": 9.128599999999999e-07, + "loss": 0.6121, + "step": 21800 + }, + { + "epoch": 0.2185, + "grad_norm": 14.584113121032715, + "learning_rate": 9.1266e-07, + "loss": 0.6755, + "step": 21850 + }, + { + "epoch": 0.219, + "grad_norm": 90.84115600585938, + "learning_rate": 9.1246e-07, + "loss": 0.8153, + "step": 21900 + }, + { + "epoch": 0.2195, + "grad_norm": 54.05946350097656, + "learning_rate": 9.122599999999999e-07, + "loss": 0.5227, + "step": 21950 + }, + { + "epoch": 0.22, + "grad_norm": 112.29281616210938, + "learning_rate": 9.1206e-07, + "loss": 0.5026, + "step": 22000 + }, + { + "epoch": 0.2205, + "grad_norm": 39.67966842651367, + "learning_rate": 9.1186e-07, + "loss": 0.6301, + "step": 22050 + }, + { + "epoch": 0.221, + "grad_norm": 80.91326141357422, + "learning_rate": 9.116599999999999e-07, + "loss": 0.5802, + "step": 22100 + }, + { + "epoch": 0.2215, + "grad_norm": 21.68891143798828, + "learning_rate": 9.1146e-07, + "loss": 0.6474, + "step": 22150 + }, + { + "epoch": 0.222, + "grad_norm": 112.01923370361328, + "learning_rate": 9.112599999999999e-07, + "loss": 0.556, + "step": 22200 + }, + { + "epoch": 0.2225, + "grad_norm": 7.433606147766113, + "learning_rate": 9.110599999999999e-07, + "loss": 0.6761, + "step": 22250 + }, + { + "epoch": 0.223, + "grad_norm": 88.78018188476562, + "learning_rate": 9.1086e-07, + "loss": 0.6357, + "step": 22300 + }, + { + "epoch": 0.2235, + "grad_norm": 143.90103149414062, + "learning_rate": 9.1066e-07, + "loss": 0.5232, + "step": 22350 + }, + { + "epoch": 0.224, + "grad_norm": 7.7126359939575195, + "learning_rate": 9.1046e-07, + "loss": 0.6542, + "step": 22400 + }, + { + "epoch": 0.2245, + "grad_norm": 44.35465621948242, + "learning_rate": 9.102599999999999e-07, + "loss": 0.7115, + "step": 22450 + }, + { + "epoch": 0.225, + "grad_norm": 7.780481815338135, + "learning_rate": 9.100599999999999e-07, + "loss": 0.6885, + "step": 22500 + }, + { + "epoch": 0.2255, + "grad_norm": 120.73460388183594, + "learning_rate": 9.098599999999999e-07, + "loss": 0.6542, + "step": 22550 + }, + { + "epoch": 0.226, + "grad_norm": 85.5213623046875, + "learning_rate": 9.0966e-07, + "loss": 0.6606, + "step": 22600 + }, + { + "epoch": 0.2265, + "grad_norm": 73.37017822265625, + "learning_rate": 9.0946e-07, + "loss": 0.6267, + "step": 22650 + }, + { + "epoch": 0.227, + "grad_norm": 89.31539916992188, + "learning_rate": 9.092599999999999e-07, + "loss": 0.7014, + "step": 22700 + }, + { + "epoch": 0.2275, + "grad_norm": 27.36491584777832, + "learning_rate": 9.0906e-07, + "loss": 0.6383, + "step": 22750 + }, + { + "epoch": 0.228, + "grad_norm": 57.39030838012695, + "learning_rate": 9.0886e-07, + "loss": 0.6104, + "step": 22800 + }, + { + "epoch": 0.2285, + "grad_norm": 6.634609222412109, + "learning_rate": 9.086599999999999e-07, + "loss": 0.4772, + "step": 22850 + }, + { + "epoch": 0.229, + "grad_norm": 98.5889892578125, + "learning_rate": 9.0846e-07, + "loss": 0.6115, + "step": 22900 + }, + { + "epoch": 0.2295, + "grad_norm": 71.15877532958984, + "learning_rate": 9.082599999999999e-07, + "loss": 0.4788, + "step": 22950 + }, + { + "epoch": 0.23, + "grad_norm": 61.014102935791016, + "learning_rate": 9.080599999999999e-07, + "loss": 0.5738, + "step": 23000 + }, + { + "epoch": 0.2305, + "grad_norm": 18.213537216186523, + "learning_rate": 9.0786e-07, + "loss": 0.5771, + "step": 23050 + }, + { + "epoch": 0.231, + "grad_norm": 62.87067794799805, + "learning_rate": 9.0766e-07, + "loss": 0.5029, + "step": 23100 + }, + { + "epoch": 0.2315, + "grad_norm": 52.611976623535156, + "learning_rate": 9.074600000000001e-07, + "loss": 0.5436, + "step": 23150 + }, + { + "epoch": 0.232, + "grad_norm": 79.97590637207031, + "learning_rate": 9.072599999999999e-07, + "loss": 0.5754, + "step": 23200 + }, + { + "epoch": 0.2325, + "grad_norm": 100.50433349609375, + "learning_rate": 9.070599999999999e-07, + "loss": 0.7087, + "step": 23250 + }, + { + "epoch": 0.233, + "grad_norm": 77.63613891601562, + "learning_rate": 9.0686e-07, + "loss": 0.4804, + "step": 23300 + }, + { + "epoch": 0.2335, + "grad_norm": 152.4717254638672, + "learning_rate": 9.0666e-07, + "loss": 0.4687, + "step": 23350 + }, + { + "epoch": 0.234, + "grad_norm": 1.093911051750183, + "learning_rate": 9.0646e-07, + "loss": 0.5668, + "step": 23400 + }, + { + "epoch": 0.2345, + "grad_norm": 80.12809753417969, + "learning_rate": 9.0626e-07, + "loss": 0.6275, + "step": 23450 + }, + { + "epoch": 0.235, + "grad_norm": 17.399316787719727, + "learning_rate": 9.0606e-07, + "loss": 0.5775, + "step": 23500 + }, + { + "epoch": 0.2355, + "grad_norm": 62.769813537597656, + "learning_rate": 9.058599999999999e-07, + "loss": 0.5202, + "step": 23550 + }, + { + "epoch": 0.236, + "grad_norm": 37.83443069458008, + "learning_rate": 9.0566e-07, + "loss": 0.4658, + "step": 23600 + }, + { + "epoch": 0.2365, + "grad_norm": 16.844783782958984, + "learning_rate": 9.0546e-07, + "loss": 0.4983, + "step": 23650 + }, + { + "epoch": 0.237, + "grad_norm": 56.658695220947266, + "learning_rate": 9.052599999999999e-07, + "loss": 0.715, + "step": 23700 + }, + { + "epoch": 0.2375, + "grad_norm": 52.001991271972656, + "learning_rate": 9.0506e-07, + "loss": 0.481, + "step": 23750 + }, + { + "epoch": 0.238, + "grad_norm": 44.0219612121582, + "learning_rate": 9.0486e-07, + "loss": 0.5652, + "step": 23800 + }, + { + "epoch": 0.2385, + "grad_norm": 23.275821685791016, + "learning_rate": 9.0466e-07, + "loss": 0.5805, + "step": 23850 + }, + { + "epoch": 0.239, + "grad_norm": 42.84242630004883, + "learning_rate": 9.0446e-07, + "loss": 0.579, + "step": 23900 + }, + { + "epoch": 0.2395, + "grad_norm": 38.506683349609375, + "learning_rate": 9.042599999999999e-07, + "loss": 0.6933, + "step": 23950 + }, + { + "epoch": 0.24, + "grad_norm": 97.8546142578125, + "learning_rate": 9.040599999999999e-07, + "loss": 0.5293, + "step": 24000 + }, + { + "epoch": 0.2405, + "grad_norm": 34.26162338256836, + "learning_rate": 9.0386e-07, + "loss": 0.6291, + "step": 24050 + }, + { + "epoch": 0.241, + "grad_norm": 52.09344482421875, + "learning_rate": 9.0366e-07, + "loss": 0.5158, + "step": 24100 + }, + { + "epoch": 0.2415, + "grad_norm": 52.938385009765625, + "learning_rate": 9.0346e-07, + "loss": 0.4271, + "step": 24150 + }, + { + "epoch": 0.242, + "grad_norm": 102.01466369628906, + "learning_rate": 9.03264e-07, + "loss": 0.676, + "step": 24200 + }, + { + "epoch": 0.2425, + "grad_norm": 92.10142517089844, + "learning_rate": 9.030639999999999e-07, + "loss": 0.4368, + "step": 24250 + }, + { + "epoch": 0.243, + "grad_norm": 61.187618255615234, + "learning_rate": 9.028639999999999e-07, + "loss": 0.593, + "step": 24300 + }, + { + "epoch": 0.2435, + "grad_norm": 54.55991744995117, + "learning_rate": 9.02664e-07, + "loss": 0.5802, + "step": 24350 + }, + { + "epoch": 0.244, + "grad_norm": 66.34545135498047, + "learning_rate": 9.02464e-07, + "loss": 0.5796, + "step": 24400 + }, + { + "epoch": 0.2445, + "grad_norm": 57.91377639770508, + "learning_rate": 9.022639999999999e-07, + "loss": 0.5566, + "step": 24450 + }, + { + "epoch": 0.245, + "grad_norm": 40.273277282714844, + "learning_rate": 9.02064e-07, + "loss": 0.5867, + "step": 24500 + }, + { + "epoch": 0.2455, + "grad_norm": 83.97872161865234, + "learning_rate": 9.01864e-07, + "loss": 0.6223, + "step": 24550 + }, + { + "epoch": 0.246, + "grad_norm": 50.599708557128906, + "learning_rate": 9.016639999999999e-07, + "loss": 0.498, + "step": 24600 + }, + { + "epoch": 0.2465, + "grad_norm": 14.626625061035156, + "learning_rate": 9.01464e-07, + "loss": 0.5453, + "step": 24650 + }, + { + "epoch": 0.247, + "grad_norm": 12.880644798278809, + "learning_rate": 9.012639999999999e-07, + "loss": 0.5644, + "step": 24700 + }, + { + "epoch": 0.2475, + "grad_norm": 10.257484436035156, + "learning_rate": 9.010639999999999e-07, + "loss": 0.4375, + "step": 24750 + }, + { + "epoch": 0.248, + "grad_norm": 96.48957061767578, + "learning_rate": 9.00864e-07, + "loss": 0.6164, + "step": 24800 + }, + { + "epoch": 0.2485, + "grad_norm": 75.5651626586914, + "learning_rate": 9.00664e-07, + "loss": 0.82, + "step": 24850 + }, + { + "epoch": 0.249, + "grad_norm": 29.0128116607666, + "learning_rate": 9.00464e-07, + "loss": 0.4682, + "step": 24900 + }, + { + "epoch": 0.2495, + "grad_norm": 15.282790184020996, + "learning_rate": 9.002639999999999e-07, + "loss": 0.512, + "step": 24950 + }, + { + "epoch": 0.25, + "grad_norm": 70.66602325439453, + "learning_rate": 9.000639999999999e-07, + "loss": 0.719, + "step": 25000 + }, + { + "epoch": 0.2505, + "grad_norm": 133.6775665283203, + "learning_rate": 8.998639999999999e-07, + "loss": 0.583, + "step": 25050 + }, + { + "epoch": 0.251, + "grad_norm": 53.346988677978516, + "learning_rate": 8.99664e-07, + "loss": 0.661, + "step": 25100 + }, + { + "epoch": 0.2515, + "grad_norm": 57.71779251098633, + "learning_rate": 8.99464e-07, + "loss": 0.4995, + "step": 25150 + }, + { + "epoch": 0.252, + "grad_norm": 11.651928901672363, + "learning_rate": 8.992639999999999e-07, + "loss": 0.5597, + "step": 25200 + }, + { + "epoch": 0.2525, + "grad_norm": 107.9719467163086, + "learning_rate": 8.99064e-07, + "loss": 0.497, + "step": 25250 + }, + { + "epoch": 0.253, + "grad_norm": 14.6730375289917, + "learning_rate": 8.988639999999999e-07, + "loss": 0.6174, + "step": 25300 + }, + { + "epoch": 0.2535, + "grad_norm": 56.200225830078125, + "learning_rate": 8.986639999999999e-07, + "loss": 0.6374, + "step": 25350 + }, + { + "epoch": 0.254, + "grad_norm": 15.914813995361328, + "learning_rate": 8.98464e-07, + "loss": 0.513, + "step": 25400 + }, + { + "epoch": 0.2545, + "grad_norm": 3.805548906326294, + "learning_rate": 8.982639999999999e-07, + "loss": 0.5197, + "step": 25450 + }, + { + "epoch": 0.255, + "grad_norm": 88.02377319335938, + "learning_rate": 8.980639999999999e-07, + "loss": 0.7373, + "step": 25500 + }, + { + "epoch": 0.2555, + "grad_norm": 35.10052490234375, + "learning_rate": 8.97864e-07, + "loss": 0.626, + "step": 25550 + }, + { + "epoch": 0.256, + "grad_norm": 40.560298919677734, + "learning_rate": 8.97664e-07, + "loss": 0.6376, + "step": 25600 + }, + { + "epoch": 0.2565, + "grad_norm": 36.76740264892578, + "learning_rate": 8.97464e-07, + "loss": 0.6143, + "step": 25650 + }, + { + "epoch": 0.257, + "grad_norm": 58.6038703918457, + "learning_rate": 8.972639999999999e-07, + "loss": 0.6131, + "step": 25700 + }, + { + "epoch": 0.2575, + "grad_norm": 16.4853515625, + "learning_rate": 8.970639999999999e-07, + "loss": 0.5266, + "step": 25750 + }, + { + "epoch": 0.258, + "grad_norm": 2.5067620277404785, + "learning_rate": 8.96864e-07, + "loss": 0.4183, + "step": 25800 + }, + { + "epoch": 0.2585, + "grad_norm": 47.051795959472656, + "learning_rate": 8.96664e-07, + "loss": 0.411, + "step": 25850 + }, + { + "epoch": 0.259, + "grad_norm": 80.19731140136719, + "learning_rate": 8.96464e-07, + "loss": 0.6141, + "step": 25900 + }, + { + "epoch": 0.2595, + "grad_norm": 53.69868469238281, + "learning_rate": 8.96264e-07, + "loss": 0.5144, + "step": 25950 + }, + { + "epoch": 0.26, + "grad_norm": 33.09700393676758, + "learning_rate": 8.960639999999999e-07, + "loss": 0.6407, + "step": 26000 + }, + { + "epoch": 0.2605, + "grad_norm": 28.68010711669922, + "learning_rate": 8.958639999999999e-07, + "loss": 0.4799, + "step": 26050 + }, + { + "epoch": 0.261, + "grad_norm": 31.614540100097656, + "learning_rate": 8.95664e-07, + "loss": 0.5902, + "step": 26100 + }, + { + "epoch": 0.2615, + "grad_norm": 46.6901969909668, + "learning_rate": 8.95464e-07, + "loss": 0.5856, + "step": 26150 + }, + { + "epoch": 0.262, + "grad_norm": 22.507549285888672, + "learning_rate": 8.952639999999999e-07, + "loss": 0.6191, + "step": 26200 + }, + { + "epoch": 0.2625, + "grad_norm": 25.52752113342285, + "learning_rate": 8.95064e-07, + "loss": 0.5374, + "step": 26250 + }, + { + "epoch": 0.263, + "grad_norm": 66.70597076416016, + "learning_rate": 8.94864e-07, + "loss": 0.5224, + "step": 26300 + }, + { + "epoch": 0.2635, + "grad_norm": 62.320648193359375, + "learning_rate": 8.946639999999999e-07, + "loss": 0.5897, + "step": 26350 + }, + { + "epoch": 0.264, + "grad_norm": 15.186070442199707, + "learning_rate": 8.94464e-07, + "loss": 0.6228, + "step": 26400 + }, + { + "epoch": 0.2645, + "grad_norm": 111.6026382446289, + "learning_rate": 8.942639999999999e-07, + "loss": 0.5744, + "step": 26450 + }, + { + "epoch": 0.265, + "grad_norm": 82.44178771972656, + "learning_rate": 8.940639999999999e-07, + "loss": 0.5254, + "step": 26500 + }, + { + "epoch": 0.2655, + "grad_norm": 52.06359100341797, + "learning_rate": 8.93864e-07, + "loss": 0.4563, + "step": 26550 + }, + { + "epoch": 0.266, + "grad_norm": 77.35059356689453, + "learning_rate": 8.93664e-07, + "loss": 0.5443, + "step": 26600 + }, + { + "epoch": 0.2665, + "grad_norm": 40.24097442626953, + "learning_rate": 8.93464e-07, + "loss": 0.7312, + "step": 26650 + }, + { + "epoch": 0.267, + "grad_norm": 12.489123344421387, + "learning_rate": 8.932639999999999e-07, + "loss": 0.5649, + "step": 26700 + }, + { + "epoch": 0.2675, + "grad_norm": 87.02991485595703, + "learning_rate": 8.930639999999999e-07, + "loss": 0.5359, + "step": 26750 + }, + { + "epoch": 0.268, + "grad_norm": 20.647361755371094, + "learning_rate": 8.928639999999999e-07, + "loss": 0.5762, + "step": 26800 + }, + { + "epoch": 0.2685, + "grad_norm": 67.69243621826172, + "learning_rate": 8.92664e-07, + "loss": 0.4922, + "step": 26850 + }, + { + "epoch": 0.269, + "grad_norm": 60.18714904785156, + "learning_rate": 8.92464e-07, + "loss": 0.5516, + "step": 26900 + }, + { + "epoch": 0.2695, + "grad_norm": 9.904438972473145, + "learning_rate": 8.922639999999999e-07, + "loss": 0.5436, + "step": 26950 + }, + { + "epoch": 0.27, + "grad_norm": 74.50110626220703, + "learning_rate": 8.92064e-07, + "loss": 0.6846, + "step": 27000 + }, + { + "epoch": 0.2705, + "grad_norm": 64.5053939819336, + "learning_rate": 8.918639999999999e-07, + "loss": 0.5902, + "step": 27050 + }, + { + "epoch": 0.271, + "grad_norm": 54.11682891845703, + "learning_rate": 8.916679999999999e-07, + "loss": 0.6153, + "step": 27100 + }, + { + "epoch": 0.2715, + "grad_norm": 2.7656333446502686, + "learning_rate": 8.91468e-07, + "loss": 0.4774, + "step": 27150 + }, + { + "epoch": 0.272, + "grad_norm": 47.780067443847656, + "learning_rate": 8.912679999999999e-07, + "loss": 0.6265, + "step": 27200 + }, + { + "epoch": 0.2725, + "grad_norm": 79.4407958984375, + "learning_rate": 8.910679999999999e-07, + "loss": 0.4763, + "step": 27250 + }, + { + "epoch": 0.273, + "grad_norm": 30.775165557861328, + "learning_rate": 8.90868e-07, + "loss": 0.4733, + "step": 27300 + }, + { + "epoch": 0.2735, + "grad_norm": 43.01605987548828, + "learning_rate": 8.90668e-07, + "loss": 0.5996, + "step": 27350 + }, + { + "epoch": 0.274, + "grad_norm": 25.7813663482666, + "learning_rate": 8.90468e-07, + "loss": 0.4017, + "step": 27400 + }, + { + "epoch": 0.2745, + "grad_norm": 70.08576202392578, + "learning_rate": 8.902679999999999e-07, + "loss": 0.6638, + "step": 27450 + }, + { + "epoch": 0.275, + "grad_norm": 3.8417301177978516, + "learning_rate": 8.900679999999999e-07, + "loss": 0.5934, + "step": 27500 + }, + { + "epoch": 0.2755, + "grad_norm": 43.573211669921875, + "learning_rate": 8.898679999999999e-07, + "loss": 0.4682, + "step": 27550 + }, + { + "epoch": 0.276, + "grad_norm": 77.05116271972656, + "learning_rate": 8.89668e-07, + "loss": 0.852, + "step": 27600 + }, + { + "epoch": 0.2765, + "grad_norm": 51.25703430175781, + "learning_rate": 8.89468e-07, + "loss": 0.5171, + "step": 27650 + }, + { + "epoch": 0.277, + "grad_norm": 79.16458129882812, + "learning_rate": 8.892679999999999e-07, + "loss": 0.4679, + "step": 27700 + }, + { + "epoch": 0.2775, + "grad_norm": 52.861976623535156, + "learning_rate": 8.89068e-07, + "loss": 0.5379, + "step": 27750 + }, + { + "epoch": 0.278, + "grad_norm": 56.5855598449707, + "learning_rate": 8.888679999999999e-07, + "loss": 0.5315, + "step": 27800 + }, + { + "epoch": 0.2785, + "grad_norm": 38.6595458984375, + "learning_rate": 8.886679999999999e-07, + "loss": 0.5684, + "step": 27850 + }, + { + "epoch": 0.279, + "grad_norm": 111.93008422851562, + "learning_rate": 8.88468e-07, + "loss": 0.5945, + "step": 27900 + }, + { + "epoch": 0.2795, + "grad_norm": 58.56281661987305, + "learning_rate": 8.882679999999999e-07, + "loss": 0.4908, + "step": 27950 + }, + { + "epoch": 0.28, + "grad_norm": 6.173264026641846, + "learning_rate": 8.880679999999999e-07, + "loss": 0.6168, + "step": 28000 + }, + { + "epoch": 0.2805, + "grad_norm": 75.64866638183594, + "learning_rate": 8.87868e-07, + "loss": 0.6355, + "step": 28050 + }, + { + "epoch": 0.281, + "grad_norm": 17.418989181518555, + "learning_rate": 8.87668e-07, + "loss": 0.4972, + "step": 28100 + }, + { + "epoch": 0.2815, + "grad_norm": 105.94364929199219, + "learning_rate": 8.87468e-07, + "loss": 0.4307, + "step": 28150 + }, + { + "epoch": 0.282, + "grad_norm": 204.995361328125, + "learning_rate": 8.872679999999999e-07, + "loss": 0.5594, + "step": 28200 + }, + { + "epoch": 0.2825, + "grad_norm": 64.4760971069336, + "learning_rate": 8.870679999999999e-07, + "loss": 0.7741, + "step": 28250 + }, + { + "epoch": 0.283, + "grad_norm": 88.27828216552734, + "learning_rate": 8.86868e-07, + "loss": 0.6538, + "step": 28300 + }, + { + "epoch": 0.2835, + "grad_norm": 16.012582778930664, + "learning_rate": 8.86668e-07, + "loss": 0.6104, + "step": 28350 + }, + { + "epoch": 0.284, + "grad_norm": 27.557018280029297, + "learning_rate": 8.86468e-07, + "loss": 0.501, + "step": 28400 + }, + { + "epoch": 0.2845, + "grad_norm": 54.364566802978516, + "learning_rate": 8.86268e-07, + "loss": 0.5534, + "step": 28450 + }, + { + "epoch": 0.285, + "grad_norm": 17.52170753479004, + "learning_rate": 8.860679999999999e-07, + "loss": 0.5886, + "step": 28500 + }, + { + "epoch": 0.2855, + "grad_norm": 65.14887237548828, + "learning_rate": 8.858679999999999e-07, + "loss": 0.5129, + "step": 28550 + }, + { + "epoch": 0.286, + "grad_norm": 4.750748157501221, + "learning_rate": 8.85668e-07, + "loss": 0.521, + "step": 28600 + }, + { + "epoch": 0.2865, + "grad_norm": 69.85326385498047, + "learning_rate": 8.85468e-07, + "loss": 0.5333, + "step": 28650 + }, + { + "epoch": 0.287, + "grad_norm": 54.12617874145508, + "learning_rate": 8.85268e-07, + "loss": 0.481, + "step": 28700 + }, + { + "epoch": 0.2875, + "grad_norm": 1.6394026279449463, + "learning_rate": 8.85068e-07, + "loss": 0.5055, + "step": 28750 + }, + { + "epoch": 0.288, + "grad_norm": 12.818920135498047, + "learning_rate": 8.84868e-07, + "loss": 0.6489, + "step": 28800 + }, + { + "epoch": 0.2885, + "grad_norm": 29.769569396972656, + "learning_rate": 8.846679999999999e-07, + "loss": 0.4749, + "step": 28850 + }, + { + "epoch": 0.289, + "grad_norm": 62.24850845336914, + "learning_rate": 8.84468e-07, + "loss": 0.5205, + "step": 28900 + }, + { + "epoch": 0.2895, + "grad_norm": 6.054821491241455, + "learning_rate": 8.84268e-07, + "loss": 0.5548, + "step": 28950 + }, + { + "epoch": 0.29, + "grad_norm": 57.04275894165039, + "learning_rate": 8.840679999999999e-07, + "loss": 0.435, + "step": 29000 + }, + { + "epoch": 0.2905, + "grad_norm": 44.01559829711914, + "learning_rate": 8.83868e-07, + "loss": 0.5515, + "step": 29050 + }, + { + "epoch": 0.291, + "grad_norm": 102.21464538574219, + "learning_rate": 8.83668e-07, + "loss": 0.5175, + "step": 29100 + }, + { + "epoch": 0.2915, + "grad_norm": 16.0670108795166, + "learning_rate": 8.83468e-07, + "loss": 0.5667, + "step": 29150 + }, + { + "epoch": 0.292, + "grad_norm": 50.337894439697266, + "learning_rate": 8.83268e-07, + "loss": 0.6014, + "step": 29200 + }, + { + "epoch": 0.2925, + "grad_norm": 90.58765411376953, + "learning_rate": 8.830679999999999e-07, + "loss": 0.5361, + "step": 29250 + }, + { + "epoch": 0.293, + "grad_norm": 88.60356140136719, + "learning_rate": 8.828679999999999e-07, + "loss": 0.4992, + "step": 29300 + }, + { + "epoch": 0.2935, + "grad_norm": 17.39057731628418, + "learning_rate": 8.82668e-07, + "loss": 0.6699, + "step": 29350 + }, + { + "epoch": 0.294, + "grad_norm": 9.938417434692383, + "learning_rate": 8.82468e-07, + "loss": 0.5748, + "step": 29400 + }, + { + "epoch": 0.2945, + "grad_norm": 86.84083557128906, + "learning_rate": 8.82268e-07, + "loss": 0.5455, + "step": 29450 + }, + { + "epoch": 0.295, + "grad_norm": 76.97161102294922, + "learning_rate": 8.82068e-07, + "loss": 0.5668, + "step": 29500 + }, + { + "epoch": 0.2955, + "grad_norm": 6.305858135223389, + "learning_rate": 8.818679999999999e-07, + "loss": 0.4598, + "step": 29550 + }, + { + "epoch": 0.296, + "grad_norm": 83.46090698242188, + "learning_rate": 8.816679999999999e-07, + "loss": 0.5434, + "step": 29600 + }, + { + "epoch": 0.2965, + "grad_norm": 39.0470085144043, + "learning_rate": 8.81468e-07, + "loss": 0.5519, + "step": 29650 + }, + { + "epoch": 0.297, + "grad_norm": 74.89450073242188, + "learning_rate": 8.81268e-07, + "loss": 0.6435, + "step": 29700 + }, + { + "epoch": 0.2975, + "grad_norm": 77.68589782714844, + "learning_rate": 8.810679999999999e-07, + "loss": 0.5398, + "step": 29750 + }, + { + "epoch": 0.298, + "grad_norm": 55.85251235961914, + "learning_rate": 8.80868e-07, + "loss": 0.5951, + "step": 29800 + }, + { + "epoch": 0.2985, + "grad_norm": 35.89079666137695, + "learning_rate": 8.80668e-07, + "loss": 0.5007, + "step": 29850 + }, + { + "epoch": 0.299, + "grad_norm": 81.23458099365234, + "learning_rate": 8.804679999999999e-07, + "loss": 0.6661, + "step": 29900 + }, + { + "epoch": 0.2995, + "grad_norm": 30.005859375, + "learning_rate": 8.80268e-07, + "loss": 0.5404, + "step": 29950 + }, + { + "epoch": 0.3, + "grad_norm": 93.9574203491211, + "learning_rate": 8.800679999999999e-07, + "loss": 0.5561, + "step": 30000 + }, + { + "epoch": 0.3005, + "grad_norm": 70.64046478271484, + "learning_rate": 8.798719999999999e-07, + "loss": 0.4643, + "step": 30050 + }, + { + "epoch": 0.301, + "grad_norm": 23.76947021484375, + "learning_rate": 8.79672e-07, + "loss": 0.5806, + "step": 30100 + }, + { + "epoch": 0.3015, + "grad_norm": 54.23265838623047, + "learning_rate": 8.79472e-07, + "loss": 0.5467, + "step": 30150 + }, + { + "epoch": 0.302, + "grad_norm": 62.528175354003906, + "learning_rate": 8.792720000000001e-07, + "loss": 0.6381, + "step": 30200 + }, + { + "epoch": 0.3025, + "grad_norm": 24.852832794189453, + "learning_rate": 8.790719999999999e-07, + "loss": 0.4532, + "step": 30250 + }, + { + "epoch": 0.303, + "grad_norm": 55.6757698059082, + "learning_rate": 8.788719999999999e-07, + "loss": 0.7508, + "step": 30300 + }, + { + "epoch": 0.3035, + "grad_norm": 108.9383316040039, + "learning_rate": 8.78672e-07, + "loss": 0.6378, + "step": 30350 + }, + { + "epoch": 0.304, + "grad_norm": 44.41104507446289, + "learning_rate": 8.78472e-07, + "loss": 0.6764, + "step": 30400 + }, + { + "epoch": 0.3045, + "grad_norm": 34.045413970947266, + "learning_rate": 8.78272e-07, + "loss": 0.5944, + "step": 30450 + }, + { + "epoch": 0.305, + "grad_norm": 29.27425765991211, + "learning_rate": 8.78072e-07, + "loss": 0.5212, + "step": 30500 + }, + { + "epoch": 0.3055, + "grad_norm": 83.5364761352539, + "learning_rate": 8.77872e-07, + "loss": 0.3991, + "step": 30550 + }, + { + "epoch": 0.306, + "grad_norm": 1.8257538080215454, + "learning_rate": 8.776719999999999e-07, + "loss": 0.5711, + "step": 30600 + }, + { + "epoch": 0.3065, + "grad_norm": 74.53131866455078, + "learning_rate": 8.77472e-07, + "loss": 0.5351, + "step": 30650 + }, + { + "epoch": 0.307, + "grad_norm": 49.41853713989258, + "learning_rate": 8.77272e-07, + "loss": 0.5694, + "step": 30700 + }, + { + "epoch": 0.3075, + "grad_norm": 120.75214385986328, + "learning_rate": 8.770719999999999e-07, + "loss": 0.5498, + "step": 30750 + }, + { + "epoch": 0.308, + "grad_norm": 46.72524642944336, + "learning_rate": 8.76872e-07, + "loss": 0.5695, + "step": 30800 + }, + { + "epoch": 0.3085, + "grad_norm": 88.22650146484375, + "learning_rate": 8.76672e-07, + "loss": 0.5629, + "step": 30850 + }, + { + "epoch": 0.309, + "grad_norm": 11.028036117553711, + "learning_rate": 8.76472e-07, + "loss": 0.4899, + "step": 30900 + }, + { + "epoch": 0.3095, + "grad_norm": 18.966903686523438, + "learning_rate": 8.76272e-07, + "loss": 0.5321, + "step": 30950 + }, + { + "epoch": 0.31, + "grad_norm": 8.517266273498535, + "learning_rate": 8.760719999999999e-07, + "loss": 0.5198, + "step": 31000 + }, + { + "epoch": 0.3105, + "grad_norm": 2.6922929286956787, + "learning_rate": 8.758719999999999e-07, + "loss": 0.4756, + "step": 31050 + }, + { + "epoch": 0.311, + "grad_norm": 80.45319366455078, + "learning_rate": 8.75672e-07, + "loss": 0.625, + "step": 31100 + }, + { + "epoch": 0.3115, + "grad_norm": 50.82831573486328, + "learning_rate": 8.75472e-07, + "loss": 0.5731, + "step": 31150 + }, + { + "epoch": 0.312, + "grad_norm": 80.17171478271484, + "learning_rate": 8.75272e-07, + "loss": 0.4188, + "step": 31200 + }, + { + "epoch": 0.3125, + "grad_norm": 83.52489471435547, + "learning_rate": 8.75072e-07, + "loss": 0.6047, + "step": 31250 + }, + { + "epoch": 0.313, + "grad_norm": 40.45694351196289, + "learning_rate": 8.74872e-07, + "loss": 0.5314, + "step": 31300 + }, + { + "epoch": 0.3135, + "grad_norm": 7.2418694496154785, + "learning_rate": 8.746719999999999e-07, + "loss": 0.6789, + "step": 31350 + }, + { + "epoch": 0.314, + "grad_norm": 105.38126373291016, + "learning_rate": 8.74472e-07, + "loss": 0.6344, + "step": 31400 + }, + { + "epoch": 0.3145, + "grad_norm": 69.45547485351562, + "learning_rate": 8.74272e-07, + "loss": 0.508, + "step": 31450 + }, + { + "epoch": 0.315, + "grad_norm": 7.970687389373779, + "learning_rate": 8.740719999999999e-07, + "loss": 0.6869, + "step": 31500 + }, + { + "epoch": 0.3155, + "grad_norm": 85.00370788574219, + "learning_rate": 8.73872e-07, + "loss": 0.5232, + "step": 31550 + }, + { + "epoch": 0.316, + "grad_norm": 63.42658996582031, + "learning_rate": 8.73672e-07, + "loss": 0.5965, + "step": 31600 + }, + { + "epoch": 0.3165, + "grad_norm": 100.0108871459961, + "learning_rate": 8.73472e-07, + "loss": 0.5686, + "step": 31650 + }, + { + "epoch": 0.317, + "grad_norm": 76.61862182617188, + "learning_rate": 8.73272e-07, + "loss": 0.6126, + "step": 31700 + }, + { + "epoch": 0.3175, + "grad_norm": 23.5545711517334, + "learning_rate": 8.730719999999999e-07, + "loss": 0.486, + "step": 31750 + }, + { + "epoch": 0.318, + "grad_norm": 121.40791320800781, + "learning_rate": 8.728719999999999e-07, + "loss": 0.5634, + "step": 31800 + }, + { + "epoch": 0.3185, + "grad_norm": 60.57836151123047, + "learning_rate": 8.72672e-07, + "loss": 0.477, + "step": 31850 + }, + { + "epoch": 0.319, + "grad_norm": 71.34037017822266, + "learning_rate": 8.72472e-07, + "loss": 0.6271, + "step": 31900 + }, + { + "epoch": 0.3195, + "grad_norm": 1.094132661819458, + "learning_rate": 8.72272e-07, + "loss": 0.6245, + "step": 31950 + }, + { + "epoch": 0.32, + "grad_norm": 112.46796417236328, + "learning_rate": 8.72072e-07, + "loss": 0.4673, + "step": 32000 + }, + { + "epoch": 0.3205, + "grad_norm": 65.74776458740234, + "learning_rate": 8.718719999999999e-07, + "loss": 0.4702, + "step": 32050 + }, + { + "epoch": 0.321, + "grad_norm": 133.10072326660156, + "learning_rate": 8.716719999999999e-07, + "loss": 0.4708, + "step": 32100 + }, + { + "epoch": 0.3215, + "grad_norm": 88.02649688720703, + "learning_rate": 8.71472e-07, + "loss": 0.4592, + "step": 32150 + }, + { + "epoch": 0.322, + "grad_norm": 97.91495513916016, + "learning_rate": 8.71272e-07, + "loss": 0.6828, + "step": 32200 + }, + { + "epoch": 0.3225, + "grad_norm": 76.7396011352539, + "learning_rate": 8.710719999999999e-07, + "loss": 0.4356, + "step": 32250 + }, + { + "epoch": 0.323, + "grad_norm": 48.60926055908203, + "learning_rate": 8.70872e-07, + "loss": 0.5898, + "step": 32300 + }, + { + "epoch": 0.3235, + "grad_norm": 107.64606475830078, + "learning_rate": 8.70672e-07, + "loss": 0.552, + "step": 32350 + }, + { + "epoch": 0.324, + "grad_norm": 77.53285217285156, + "learning_rate": 8.704719999999999e-07, + "loss": 0.4998, + "step": 32400 + }, + { + "epoch": 0.3245, + "grad_norm": 3.892568349838257, + "learning_rate": 8.70272e-07, + "loss": 0.5265, + "step": 32450 + }, + { + "epoch": 0.325, + "grad_norm": 87.63228607177734, + "learning_rate": 8.700719999999999e-07, + "loss": 0.6641, + "step": 32500 + }, + { + "epoch": 0.3255, + "grad_norm": 208.2560272216797, + "learning_rate": 8.698719999999999e-07, + "loss": 0.5142, + "step": 32550 + }, + { + "epoch": 0.326, + "grad_norm": 39.32928466796875, + "learning_rate": 8.69672e-07, + "loss": 0.5655, + "step": 32600 + }, + { + "epoch": 0.3265, + "grad_norm": 121.70012664794922, + "learning_rate": 8.69472e-07, + "loss": 0.5668, + "step": 32650 + }, + { + "epoch": 0.327, + "grad_norm": 3.1223995685577393, + "learning_rate": 8.692720000000001e-07, + "loss": 0.5199, + "step": 32700 + }, + { + "epoch": 0.3275, + "grad_norm": 103.2892074584961, + "learning_rate": 8.690719999999999e-07, + "loss": 0.5335, + "step": 32750 + }, + { + "epoch": 0.328, + "grad_norm": 9.383655548095703, + "learning_rate": 8.688719999999999e-07, + "loss": 0.6043, + "step": 32800 + }, + { + "epoch": 0.3285, + "grad_norm": 41.5776252746582, + "learning_rate": 8.68672e-07, + "loss": 0.671, + "step": 32850 + }, + { + "epoch": 0.329, + "grad_norm": 12.326675415039062, + "learning_rate": 8.68472e-07, + "loss": 0.4646, + "step": 32900 + }, + { + "epoch": 0.3295, + "grad_norm": 42.7502555847168, + "learning_rate": 8.68272e-07, + "loss": 0.492, + "step": 32950 + }, + { + "epoch": 0.33, + "grad_norm": 70.67731475830078, + "learning_rate": 8.68072e-07, + "loss": 0.4932, + "step": 33000 + }, + { + "epoch": 0.3305, + "grad_norm": 50.19448471069336, + "learning_rate": 8.67872e-07, + "loss": 0.4514, + "step": 33050 + }, + { + "epoch": 0.331, + "grad_norm": 73.63932037353516, + "learning_rate": 8.676719999999999e-07, + "loss": 0.4684, + "step": 33100 + }, + { + "epoch": 0.3315, + "grad_norm": 22.24913787841797, + "learning_rate": 8.67472e-07, + "loss": 0.6631, + "step": 33150 + }, + { + "epoch": 0.332, + "grad_norm": 110.29122161865234, + "learning_rate": 8.67272e-07, + "loss": 0.714, + "step": 33200 + }, + { + "epoch": 0.3325, + "grad_norm": 9.803537368774414, + "learning_rate": 8.670719999999999e-07, + "loss": 0.5666, + "step": 33250 + }, + { + "epoch": 0.333, + "grad_norm": 1.9911659955978394, + "learning_rate": 8.66872e-07, + "loss": 0.5231, + "step": 33300 + }, + { + "epoch": 0.3335, + "grad_norm": 29.713672637939453, + "learning_rate": 8.66672e-07, + "loss": 0.6263, + "step": 33350 + }, + { + "epoch": 0.334, + "grad_norm": 58.22745895385742, + "learning_rate": 8.66472e-07, + "loss": 0.4424, + "step": 33400 + }, + { + "epoch": 0.3345, + "grad_norm": 71.6273422241211, + "learning_rate": 8.66272e-07, + "loss": 0.4962, + "step": 33450 + }, + { + "epoch": 0.335, + "grad_norm": 39.40334701538086, + "learning_rate": 8.660719999999999e-07, + "loss": 0.5212, + "step": 33500 + }, + { + "epoch": 0.3355, + "grad_norm": 100.38050842285156, + "learning_rate": 8.658719999999999e-07, + "loss": 0.432, + "step": 33550 + }, + { + "epoch": 0.336, + "grad_norm": 57.89727783203125, + "learning_rate": 8.65672e-07, + "loss": 0.528, + "step": 33600 + }, + { + "epoch": 0.3365, + "grad_norm": 28.244213104248047, + "learning_rate": 8.65472e-07, + "loss": 0.5949, + "step": 33650 + }, + { + "epoch": 0.337, + "grad_norm": 57.93000030517578, + "learning_rate": 8.65272e-07, + "loss": 0.534, + "step": 33700 + }, + { + "epoch": 0.3375, + "grad_norm": 38.53422164916992, + "learning_rate": 8.65072e-07, + "loss": 0.6568, + "step": 33750 + }, + { + "epoch": 0.338, + "grad_norm": 77.30050659179688, + "learning_rate": 8.648719999999999e-07, + "loss": 0.5894, + "step": 33800 + }, + { + "epoch": 0.3385, + "grad_norm": 99.1224594116211, + "learning_rate": 8.646719999999999e-07, + "loss": 0.4271, + "step": 33850 + }, + { + "epoch": 0.339, + "grad_norm": 10.921067237854004, + "learning_rate": 8.64472e-07, + "loss": 0.457, + "step": 33900 + }, + { + "epoch": 0.3395, + "grad_norm": 83.70796203613281, + "learning_rate": 8.64272e-07, + "loss": 0.5231, + "step": 33950 + }, + { + "epoch": 0.34, + "grad_norm": 81.03082275390625, + "learning_rate": 8.640719999999999e-07, + "loss": 0.6068, + "step": 34000 + }, + { + "epoch": 0.3405, + "grad_norm": 18.295440673828125, + "learning_rate": 8.63872e-07, + "loss": 0.5386, + "step": 34050 + }, + { + "epoch": 0.341, + "grad_norm": 79.17333221435547, + "learning_rate": 8.63676e-07, + "loss": 0.5819, + "step": 34100 + }, + { + "epoch": 0.3415, + "grad_norm": 59.78744888305664, + "learning_rate": 8.634759999999999e-07, + "loss": 0.6588, + "step": 34150 + }, + { + "epoch": 0.342, + "grad_norm": 87.66487121582031, + "learning_rate": 8.63276e-07, + "loss": 0.4755, + "step": 34200 + }, + { + "epoch": 0.3425, + "grad_norm": 40.66081237792969, + "learning_rate": 8.630799999999999e-07, + "loss": 0.4778, + "step": 34250 + }, + { + "epoch": 0.343, + "grad_norm": 99.85968780517578, + "learning_rate": 8.628799999999999e-07, + "loss": 0.631, + "step": 34300 + }, + { + "epoch": 0.3435, + "grad_norm": 116.81835174560547, + "learning_rate": 8.6268e-07, + "loss": 0.58, + "step": 34350 + }, + { + "epoch": 0.344, + "grad_norm": 20.3864803314209, + "learning_rate": 8.6248e-07, + "loss": 0.5837, + "step": 34400 + }, + { + "epoch": 0.3445, + "grad_norm": 22.575923919677734, + "learning_rate": 8.6228e-07, + "loss": 0.519, + "step": 34450 + }, + { + "epoch": 0.345, + "grad_norm": 95.2472915649414, + "learning_rate": 8.6208e-07, + "loss": 0.5047, + "step": 34500 + }, + { + "epoch": 0.3455, + "grad_norm": 24.045143127441406, + "learning_rate": 8.618799999999999e-07, + "loss": 0.5282, + "step": 34550 + }, + { + "epoch": 0.346, + "grad_norm": 93.23561096191406, + "learning_rate": 8.616799999999999e-07, + "loss": 0.5889, + "step": 34600 + }, + { + "epoch": 0.3465, + "grad_norm": 83.2427978515625, + "learning_rate": 8.6148e-07, + "loss": 0.395, + "step": 34650 + }, + { + "epoch": 0.347, + "grad_norm": 75.5633544921875, + "learning_rate": 8.6128e-07, + "loss": 0.5339, + "step": 34700 + }, + { + "epoch": 0.3475, + "grad_norm": 46.325660705566406, + "learning_rate": 8.610799999999999e-07, + "loss": 0.6007, + "step": 34750 + }, + { + "epoch": 0.348, + "grad_norm": 94.44389343261719, + "learning_rate": 8.6088e-07, + "loss": 0.5271, + "step": 34800 + }, + { + "epoch": 0.3485, + "grad_norm": 1.9219623804092407, + "learning_rate": 8.6068e-07, + "loss": 0.4917, + "step": 34850 + }, + { + "epoch": 0.349, + "grad_norm": 256.146484375, + "learning_rate": 8.604799999999999e-07, + "loss": 0.4976, + "step": 34900 + }, + { + "epoch": 0.3495, + "grad_norm": 42.80830383300781, + "learning_rate": 8.6028e-07, + "loss": 0.4433, + "step": 34950 + }, + { + "epoch": 0.35, + "grad_norm": 38.0195426940918, + "learning_rate": 8.600799999999999e-07, + "loss": 0.615, + "step": 35000 + }, + { + "epoch": 0.3505, + "grad_norm": 55.40459060668945, + "learning_rate": 8.598799999999999e-07, + "loss": 0.5231, + "step": 35050 + }, + { + "epoch": 0.351, + "grad_norm": 75.99671173095703, + "learning_rate": 8.5968e-07, + "loss": 0.505, + "step": 35100 + }, + { + "epoch": 0.3515, + "grad_norm": 30.296842575073242, + "learning_rate": 8.5948e-07, + "loss": 0.5513, + "step": 35150 + }, + { + "epoch": 0.352, + "grad_norm": 69.02685546875, + "learning_rate": 8.592800000000001e-07, + "loss": 0.585, + "step": 35200 + }, + { + "epoch": 0.3525, + "grad_norm": 88.14878845214844, + "learning_rate": 8.590799999999999e-07, + "loss": 0.6418, + "step": 35250 + }, + { + "epoch": 0.353, + "grad_norm": 1.586727261543274, + "learning_rate": 8.588799999999999e-07, + "loss": 0.5362, + "step": 35300 + }, + { + "epoch": 0.3535, + "grad_norm": 48.83518981933594, + "learning_rate": 8.5868e-07, + "loss": 0.4468, + "step": 35350 + }, + { + "epoch": 0.354, + "grad_norm": 98.71307373046875, + "learning_rate": 8.5848e-07, + "loss": 0.6128, + "step": 35400 + }, + { + "epoch": 0.3545, + "grad_norm": 44.5135612487793, + "learning_rate": 8.5828e-07, + "loss": 0.695, + "step": 35450 + }, + { + "epoch": 0.355, + "grad_norm": 39.28260803222656, + "learning_rate": 8.5808e-07, + "loss": 0.5119, + "step": 35500 + }, + { + "epoch": 0.3555, + "grad_norm": 16.02478790283203, + "learning_rate": 8.5788e-07, + "loss": 0.5504, + "step": 35550 + }, + { + "epoch": 0.356, + "grad_norm": 73.70425415039062, + "learning_rate": 8.576799999999999e-07, + "loss": 0.3506, + "step": 35600 + }, + { + "epoch": 0.3565, + "grad_norm": 60.15646743774414, + "learning_rate": 8.5748e-07, + "loss": 0.701, + "step": 35650 + }, + { + "epoch": 0.357, + "grad_norm": 124.11194610595703, + "learning_rate": 8.5728e-07, + "loss": 0.5455, + "step": 35700 + }, + { + "epoch": 0.3575, + "grad_norm": 63.276222229003906, + "learning_rate": 8.570799999999999e-07, + "loss": 0.5035, + "step": 35750 + }, + { + "epoch": 0.358, + "grad_norm": 70.08747863769531, + "learning_rate": 8.5688e-07, + "loss": 0.4866, + "step": 35800 + }, + { + "epoch": 0.3585, + "grad_norm": 85.71963500976562, + "learning_rate": 8.5668e-07, + "loss": 0.5919, + "step": 35850 + }, + { + "epoch": 0.359, + "grad_norm": 0.5785985589027405, + "learning_rate": 8.5648e-07, + "loss": 0.4748, + "step": 35900 + }, + { + "epoch": 0.3595, + "grad_norm": 54.46210861206055, + "learning_rate": 8.5628e-07, + "loss": 0.5546, + "step": 35950 + }, + { + "epoch": 0.36, + "grad_norm": 3.6963469982147217, + "learning_rate": 8.560799999999999e-07, + "loss": 0.5244, + "step": 36000 + }, + { + "epoch": 0.3605, + "grad_norm": 75.58008575439453, + "learning_rate": 8.558799999999999e-07, + "loss": 0.493, + "step": 36050 + }, + { + "epoch": 0.361, + "grad_norm": 1.6295361518859863, + "learning_rate": 8.5568e-07, + "loss": 0.4227, + "step": 36100 + }, + { + "epoch": 0.3615, + "grad_norm": 78.0384292602539, + "learning_rate": 8.5548e-07, + "loss": 0.5416, + "step": 36150 + }, + { + "epoch": 0.362, + "grad_norm": 14.516754150390625, + "learning_rate": 8.5528e-07, + "loss": 0.6593, + "step": 36200 + }, + { + "epoch": 0.3625, + "grad_norm": 29.74073028564453, + "learning_rate": 8.5508e-07, + "loss": 0.5811, + "step": 36250 + }, + { + "epoch": 0.363, + "grad_norm": 94.58187103271484, + "learning_rate": 8.548799999999999e-07, + "loss": 0.5598, + "step": 36300 + }, + { + "epoch": 0.3635, + "grad_norm": 71.66839599609375, + "learning_rate": 8.546799999999999e-07, + "loss": 0.3467, + "step": 36350 + }, + { + "epoch": 0.364, + "grad_norm": 62.97394943237305, + "learning_rate": 8.5448e-07, + "loss": 0.668, + "step": 36400 + }, + { + "epoch": 0.3645, + "grad_norm": 24.426570892333984, + "learning_rate": 8.5428e-07, + "loss": 0.579, + "step": 36450 + }, + { + "epoch": 0.365, + "grad_norm": 48.1230354309082, + "learning_rate": 8.540799999999999e-07, + "loss": 0.5345, + "step": 36500 + }, + { + "epoch": 0.3655, + "grad_norm": 82.27693176269531, + "learning_rate": 8.5388e-07, + "loss": 0.6415, + "step": 36550 + }, + { + "epoch": 0.366, + "grad_norm": 29.432802200317383, + "learning_rate": 8.5368e-07, + "loss": 0.564, + "step": 36600 + }, + { + "epoch": 0.3665, + "grad_norm": 15.768404006958008, + "learning_rate": 8.534799999999999e-07, + "loss": 0.4265, + "step": 36650 + }, + { + "epoch": 0.367, + "grad_norm": 71.74403381347656, + "learning_rate": 8.5328e-07, + "loss": 0.5159, + "step": 36700 + }, + { + "epoch": 0.3675, + "grad_norm": 11.544425010681152, + "learning_rate": 8.530799999999999e-07, + "loss": 0.4023, + "step": 36750 + }, + { + "epoch": 0.368, + "grad_norm": 6.664909839630127, + "learning_rate": 8.528799999999999e-07, + "loss": 0.6547, + "step": 36800 + }, + { + "epoch": 0.3685, + "grad_norm": 59.17173385620117, + "learning_rate": 8.5268e-07, + "loss": 0.4626, + "step": 36850 + }, + { + "epoch": 0.369, + "grad_norm": 45.66848373413086, + "learning_rate": 8.5248e-07, + "loss": 0.5634, + "step": 36900 + }, + { + "epoch": 0.3695, + "grad_norm": 34.434417724609375, + "learning_rate": 8.5228e-07, + "loss": 0.5269, + "step": 36950 + }, + { + "epoch": 0.37, + "grad_norm": 30.29810905456543, + "learning_rate": 8.520799999999999e-07, + "loss": 0.7069, + "step": 37000 + }, + { + "epoch": 0.3705, + "grad_norm": 75.58245849609375, + "learning_rate": 8.518799999999999e-07, + "loss": 0.5033, + "step": 37050 + }, + { + "epoch": 0.371, + "grad_norm": 1.9364076852798462, + "learning_rate": 8.516799999999999e-07, + "loss": 0.4915, + "step": 37100 + }, + { + "epoch": 0.3715, + "grad_norm": 48.733604431152344, + "learning_rate": 8.5148e-07, + "loss": 0.4742, + "step": 37150 + }, + { + "epoch": 0.372, + "grad_norm": 98.77400970458984, + "learning_rate": 8.5128e-07, + "loss": 0.4848, + "step": 37200 + }, + { + "epoch": 0.3725, + "grad_norm": 126.41114044189453, + "learning_rate": 8.510799999999999e-07, + "loss": 0.4387, + "step": 37250 + }, + { + "epoch": 0.373, + "grad_norm": 93.77721405029297, + "learning_rate": 8.5088e-07, + "loss": 0.5441, + "step": 37300 + }, + { + "epoch": 0.3735, + "grad_norm": 22.67792510986328, + "learning_rate": 8.506799999999999e-07, + "loss": 0.4786, + "step": 37350 + }, + { + "epoch": 0.374, + "grad_norm": 27.49308967590332, + "learning_rate": 8.504799999999999e-07, + "loss": 0.5442, + "step": 37400 + }, + { + "epoch": 0.3745, + "grad_norm": 4.33549690246582, + "learning_rate": 8.5028e-07, + "loss": 0.4156, + "step": 37450 + }, + { + "epoch": 0.375, + "grad_norm": 29.745681762695312, + "learning_rate": 8.500799999999999e-07, + "loss": 0.4352, + "step": 37500 + }, + { + "epoch": 0.3755, + "grad_norm": 53.99748611450195, + "learning_rate": 8.498799999999999e-07, + "loss": 0.5147, + "step": 37550 + }, + { + "epoch": 0.376, + "grad_norm": 11.581042289733887, + "learning_rate": 8.4968e-07, + "loss": 0.6587, + "step": 37600 + }, + { + "epoch": 0.3765, + "grad_norm": 47.363216400146484, + "learning_rate": 8.4948e-07, + "loss": 0.5124, + "step": 37650 + }, + { + "epoch": 0.377, + "grad_norm": 92.71063232421875, + "learning_rate": 8.4928e-07, + "loss": 0.5673, + "step": 37700 + }, + { + "epoch": 0.3775, + "grad_norm": 56.31315994262695, + "learning_rate": 8.490799999999999e-07, + "loss": 0.5734, + "step": 37750 + }, + { + "epoch": 0.378, + "grad_norm": 66.45987701416016, + "learning_rate": 8.488799999999999e-07, + "loss": 0.5247, + "step": 37800 + }, + { + "epoch": 0.3785, + "grad_norm": 31.38943862915039, + "learning_rate": 8.4868e-07, + "loss": 0.5605, + "step": 37850 + }, + { + "epoch": 0.379, + "grad_norm": 51.36092758178711, + "learning_rate": 8.4848e-07, + "loss": 0.4554, + "step": 37900 + }, + { + "epoch": 0.3795, + "grad_norm": 31.094507217407227, + "learning_rate": 8.4828e-07, + "loss": 0.6803, + "step": 37950 + }, + { + "epoch": 0.38, + "grad_norm": 60.59067916870117, + "learning_rate": 8.4808e-07, + "loss": 0.6999, + "step": 38000 + }, + { + "epoch": 0.3805, + "grad_norm": 11.532386779785156, + "learning_rate": 8.478799999999999e-07, + "loss": 0.4905, + "step": 38050 + }, + { + "epoch": 0.381, + "grad_norm": 77.91910552978516, + "learning_rate": 8.476799999999999e-07, + "loss": 0.5046, + "step": 38100 + }, + { + "epoch": 0.3815, + "grad_norm": 123.77552795410156, + "learning_rate": 8.4748e-07, + "loss": 0.6066, + "step": 38150 + }, + { + "epoch": 0.382, + "grad_norm": 80.5956802368164, + "learning_rate": 8.4728e-07, + "loss": 0.5123, + "step": 38200 + }, + { + "epoch": 0.3825, + "grad_norm": 18.614927291870117, + "learning_rate": 8.4708e-07, + "loss": 0.5425, + "step": 38250 + }, + { + "epoch": 0.383, + "grad_norm": 40.55034255981445, + "learning_rate": 8.4688e-07, + "loss": 0.5672, + "step": 38300 + }, + { + "epoch": 0.3835, + "grad_norm": 60.523948669433594, + "learning_rate": 8.4668e-07, + "loss": 0.5903, + "step": 38350 + }, + { + "epoch": 0.384, + "grad_norm": 56.68648910522461, + "learning_rate": 8.464799999999999e-07, + "loss": 0.5536, + "step": 38400 + }, + { + "epoch": 0.3845, + "grad_norm": 63.2244758605957, + "learning_rate": 8.4628e-07, + "loss": 0.5863, + "step": 38450 + }, + { + "epoch": 0.385, + "grad_norm": 73.95377349853516, + "learning_rate": 8.4608e-07, + "loss": 0.4515, + "step": 38500 + }, + { + "epoch": 0.3855, + "grad_norm": 18.579113006591797, + "learning_rate": 8.458799999999999e-07, + "loss": 0.6405, + "step": 38550 + }, + { + "epoch": 0.386, + "grad_norm": 40.64234161376953, + "learning_rate": 8.4568e-07, + "loss": 0.5371, + "step": 38600 + }, + { + "epoch": 0.3865, + "grad_norm": 54.03630447387695, + "learning_rate": 8.4548e-07, + "loss": 0.5378, + "step": 38650 + }, + { + "epoch": 0.387, + "grad_norm": 6.5824666023254395, + "learning_rate": 8.4528e-07, + "loss": 0.4843, + "step": 38700 + }, + { + "epoch": 0.3875, + "grad_norm": 8.509979248046875, + "learning_rate": 8.4508e-07, + "loss": 0.4975, + "step": 38750 + }, + { + "epoch": 0.388, + "grad_norm": 97.18090057373047, + "learning_rate": 8.448839999999999e-07, + "loss": 0.4974, + "step": 38800 + }, + { + "epoch": 0.3885, + "grad_norm": 109.9385757446289, + "learning_rate": 8.446839999999999e-07, + "loss": 0.5312, + "step": 38850 + }, + { + "epoch": 0.389, + "grad_norm": 2.83178448677063, + "learning_rate": 8.44484e-07, + "loss": 0.4961, + "step": 38900 + }, + { + "epoch": 0.3895, + "grad_norm": 1.3601728677749634, + "learning_rate": 8.44284e-07, + "loss": 0.5074, + "step": 38950 + }, + { + "epoch": 0.39, + "grad_norm": 49.6685905456543, + "learning_rate": 8.440839999999999e-07, + "loss": 0.4328, + "step": 39000 + }, + { + "epoch": 0.3905, + "grad_norm": 58.785987854003906, + "learning_rate": 8.43884e-07, + "loss": 0.5464, + "step": 39050 + }, + { + "epoch": 0.391, + "grad_norm": 79.17613220214844, + "learning_rate": 8.43688e-07, + "loss": 0.5818, + "step": 39100 + }, + { + "epoch": 0.3915, + "grad_norm": 30.441478729248047, + "learning_rate": 8.434879999999999e-07, + "loss": 0.5418, + "step": 39150 + }, + { + "epoch": 0.392, + "grad_norm": 58.006282806396484, + "learning_rate": 8.43288e-07, + "loss": 0.4618, + "step": 39200 + }, + { + "epoch": 0.3925, + "grad_norm": 83.23896026611328, + "learning_rate": 8.430879999999999e-07, + "loss": 0.6328, + "step": 39250 + }, + { + "epoch": 0.393, + "grad_norm": 30.840930938720703, + "learning_rate": 8.428879999999999e-07, + "loss": 0.52, + "step": 39300 + }, + { + "epoch": 0.3935, + "grad_norm": 22.173118591308594, + "learning_rate": 8.42688e-07, + "loss": 0.4596, + "step": 39350 + }, + { + "epoch": 0.394, + "grad_norm": 123.26498413085938, + "learning_rate": 8.42488e-07, + "loss": 0.5246, + "step": 39400 + }, + { + "epoch": 0.3945, + "grad_norm": 65.1488265991211, + "learning_rate": 8.42288e-07, + "loss": 0.4235, + "step": 39450 + }, + { + "epoch": 0.395, + "grad_norm": 0.8173670768737793, + "learning_rate": 8.420919999999999e-07, + "loss": 0.4922, + "step": 39500 + }, + { + "epoch": 0.3955, + "grad_norm": 4.30292272567749, + "learning_rate": 8.418919999999999e-07, + "loss": 0.4174, + "step": 39550 + }, + { + "epoch": 0.396, + "grad_norm": 55.06961441040039, + "learning_rate": 8.416919999999999e-07, + "loss": 0.5271, + "step": 39600 + }, + { + "epoch": 0.3965, + "grad_norm": 7.464923858642578, + "learning_rate": 8.41492e-07, + "loss": 0.495, + "step": 39650 + }, + { + "epoch": 0.397, + "grad_norm": 34.88985824584961, + "learning_rate": 8.41292e-07, + "loss": 0.5638, + "step": 39700 + }, + { + "epoch": 0.3975, + "grad_norm": 36.899593353271484, + "learning_rate": 8.410919999999999e-07, + "loss": 0.4877, + "step": 39750 + }, + { + "epoch": 0.398, + "grad_norm": 70.25818634033203, + "learning_rate": 8.40892e-07, + "loss": 0.516, + "step": 39800 + }, + { + "epoch": 0.3985, + "grad_norm": 23.857093811035156, + "learning_rate": 8.406919999999999e-07, + "loss": 0.4775, + "step": 39850 + }, + { + "epoch": 0.399, + "grad_norm": 64.42444610595703, + "learning_rate": 8.404919999999999e-07, + "loss": 0.3856, + "step": 39900 + }, + { + "epoch": 0.3995, + "grad_norm": 94.81324005126953, + "learning_rate": 8.40292e-07, + "loss": 0.603, + "step": 39950 + }, + { + "epoch": 0.4, + "grad_norm": 70.74253845214844, + "learning_rate": 8.400919999999999e-07, + "loss": 0.4607, + "step": 40000 + }, + { + "epoch": 0.4005, + "grad_norm": 69.46749877929688, + "learning_rate": 8.398919999999999e-07, + "loss": 0.5185, + "step": 40050 + }, + { + "epoch": 0.401, + "grad_norm": 30.19931983947754, + "learning_rate": 8.39692e-07, + "loss": 0.5854, + "step": 40100 + }, + { + "epoch": 0.4015, + "grad_norm": 80.08529663085938, + "learning_rate": 8.39492e-07, + "loss": 0.4856, + "step": 40150 + }, + { + "epoch": 0.402, + "grad_norm": 40.47534942626953, + "learning_rate": 8.39292e-07, + "loss": 0.4393, + "step": 40200 + }, + { + "epoch": 0.4025, + "grad_norm": 92.6363296508789, + "learning_rate": 8.390919999999999e-07, + "loss": 0.4964, + "step": 40250 + }, + { + "epoch": 0.403, + "grad_norm": 1.9177128076553345, + "learning_rate": 8.388919999999999e-07, + "loss": 0.606, + "step": 40300 + }, + { + "epoch": 0.4035, + "grad_norm": 9.308789253234863, + "learning_rate": 8.38692e-07, + "loss": 0.4849, + "step": 40350 + }, + { + "epoch": 0.404, + "grad_norm": 9.60834789276123, + "learning_rate": 8.38492e-07, + "loss": 0.5118, + "step": 40400 + }, + { + "epoch": 0.4045, + "grad_norm": 70.33528137207031, + "learning_rate": 8.38292e-07, + "loss": 0.5834, + "step": 40450 + }, + { + "epoch": 0.405, + "grad_norm": 93.31282806396484, + "learning_rate": 8.38092e-07, + "loss": 0.5711, + "step": 40500 + }, + { + "epoch": 0.4055, + "grad_norm": 56.34794998168945, + "learning_rate": 8.378919999999999e-07, + "loss": 0.5346, + "step": 40550 + }, + { + "epoch": 0.406, + "grad_norm": 53.3722038269043, + "learning_rate": 8.376919999999999e-07, + "loss": 0.5983, + "step": 40600 + }, + { + "epoch": 0.4065, + "grad_norm": 79.872314453125, + "learning_rate": 8.37492e-07, + "loss": 0.6024, + "step": 40650 + }, + { + "epoch": 0.407, + "grad_norm": 43.69386291503906, + "learning_rate": 8.37292e-07, + "loss": 0.6604, + "step": 40700 + }, + { + "epoch": 0.4075, + "grad_norm": 82.44725799560547, + "learning_rate": 8.370919999999999e-07, + "loss": 0.5791, + "step": 40750 + }, + { + "epoch": 0.408, + "grad_norm": 28.565074920654297, + "learning_rate": 8.36892e-07, + "loss": 0.4985, + "step": 40800 + }, + { + "epoch": 0.4085, + "grad_norm": 108.46695709228516, + "learning_rate": 8.36692e-07, + "loss": 0.559, + "step": 40850 + }, + { + "epoch": 0.409, + "grad_norm": 68.24771881103516, + "learning_rate": 8.364919999999999e-07, + "loss": 0.6729, + "step": 40900 + }, + { + "epoch": 0.4095, + "grad_norm": 106.6298599243164, + "learning_rate": 8.36292e-07, + "loss": 0.6222, + "step": 40950 + }, + { + "epoch": 0.41, + "grad_norm": 10.868021011352539, + "learning_rate": 8.360919999999999e-07, + "loss": 0.4931, + "step": 41000 + }, + { + "epoch": 0.4105, + "grad_norm": 77.19261169433594, + "learning_rate": 8.358919999999999e-07, + "loss": 0.5437, + "step": 41050 + }, + { + "epoch": 0.411, + "grad_norm": 52.96466064453125, + "learning_rate": 8.35692e-07, + "loss": 0.5333, + "step": 41100 + }, + { + "epoch": 0.4115, + "grad_norm": 47.27060317993164, + "learning_rate": 8.35492e-07, + "loss": 0.6691, + "step": 41150 + }, + { + "epoch": 0.412, + "grad_norm": 65.95133209228516, + "learning_rate": 8.35292e-07, + "loss": 0.506, + "step": 41200 + }, + { + "epoch": 0.4125, + "grad_norm": 124.53824615478516, + "learning_rate": 8.350919999999999e-07, + "loss": 0.5634, + "step": 41250 + }, + { + "epoch": 0.413, + "grad_norm": 59.79798126220703, + "learning_rate": 8.348919999999999e-07, + "loss": 0.6084, + "step": 41300 + }, + { + "epoch": 0.4135, + "grad_norm": 17.437131881713867, + "learning_rate": 8.346919999999999e-07, + "loss": 0.4448, + "step": 41350 + }, + { + "epoch": 0.414, + "grad_norm": 0.26926401257514954, + "learning_rate": 8.34492e-07, + "loss": 0.5695, + "step": 41400 + }, + { + "epoch": 0.4145, + "grad_norm": 25.987173080444336, + "learning_rate": 8.34292e-07, + "loss": 0.4574, + "step": 41450 + }, + { + "epoch": 0.415, + "grad_norm": 77.89625549316406, + "learning_rate": 8.340919999999999e-07, + "loss": 0.5357, + "step": 41500 + }, + { + "epoch": 0.4155, + "grad_norm": 37.63655471801758, + "learning_rate": 8.33892e-07, + "loss": 0.5492, + "step": 41550 + }, + { + "epoch": 0.416, + "grad_norm": 103.72399139404297, + "learning_rate": 8.33696e-07, + "loss": 0.4425, + "step": 41600 + }, + { + "epoch": 0.4165, + "grad_norm": 73.7927017211914, + "learning_rate": 8.334959999999999e-07, + "loss": 0.54, + "step": 41650 + }, + { + "epoch": 0.417, + "grad_norm": 43.02588653564453, + "learning_rate": 8.33296e-07, + "loss": 0.492, + "step": 41700 + }, + { + "epoch": 0.4175, + "grad_norm": 71.02433776855469, + "learning_rate": 8.330959999999999e-07, + "loss": 0.5462, + "step": 41750 + }, + { + "epoch": 0.418, + "grad_norm": 115.1431884765625, + "learning_rate": 8.328959999999999e-07, + "loss": 0.5912, + "step": 41800 + }, + { + "epoch": 0.4185, + "grad_norm": 96.75558471679688, + "learning_rate": 8.32696e-07, + "loss": 0.5876, + "step": 41850 + }, + { + "epoch": 0.419, + "grad_norm": 36.395286560058594, + "learning_rate": 8.32496e-07, + "loss": 0.4394, + "step": 41900 + }, + { + "epoch": 0.4195, + "grad_norm": 168.5052947998047, + "learning_rate": 8.32296e-07, + "loss": 0.5484, + "step": 41950 + }, + { + "epoch": 0.42, + "grad_norm": 60.965389251708984, + "learning_rate": 8.320959999999999e-07, + "loss": 0.4683, + "step": 42000 + }, + { + "epoch": 0.4205, + "grad_norm": 118.09526062011719, + "learning_rate": 8.318959999999999e-07, + "loss": 0.5096, + "step": 42050 + }, + { + "epoch": 0.421, + "grad_norm": 91.55111694335938, + "learning_rate": 8.316959999999999e-07, + "loss": 0.5228, + "step": 42100 + }, + { + "epoch": 0.4215, + "grad_norm": 24.228757858276367, + "learning_rate": 8.31496e-07, + "loss": 0.5189, + "step": 42150 + }, + { + "epoch": 0.422, + "grad_norm": 5.371842861175537, + "learning_rate": 8.31296e-07, + "loss": 0.4865, + "step": 42200 + }, + { + "epoch": 0.4225, + "grad_norm": 74.1497802734375, + "learning_rate": 8.310959999999999e-07, + "loss": 0.4502, + "step": 42250 + }, + { + "epoch": 0.423, + "grad_norm": 3.059469223022461, + "learning_rate": 8.30896e-07, + "loss": 0.4839, + "step": 42300 + }, + { + "epoch": 0.4235, + "grad_norm": 14.30676555633545, + "learning_rate": 8.306959999999999e-07, + "loss": 0.4863, + "step": 42350 + }, + { + "epoch": 0.424, + "grad_norm": 0.32036876678466797, + "learning_rate": 8.30496e-07, + "loss": 0.4859, + "step": 42400 + }, + { + "epoch": 0.4245, + "grad_norm": 36.87676239013672, + "learning_rate": 8.30296e-07, + "loss": 0.3993, + "step": 42450 + }, + { + "epoch": 0.425, + "grad_norm": 53.69343566894531, + "learning_rate": 8.300959999999999e-07, + "loss": 0.564, + "step": 42500 + }, + { + "epoch": 0.4255, + "grad_norm": 49.399715423583984, + "learning_rate": 8.29896e-07, + "loss": 0.537, + "step": 42550 + }, + { + "epoch": 0.426, + "grad_norm": 42.28934860229492, + "learning_rate": 8.29696e-07, + "loss": 0.4264, + "step": 42600 + }, + { + "epoch": 0.4265, + "grad_norm": 14.933109283447266, + "learning_rate": 8.29496e-07, + "loss": 0.5168, + "step": 42650 + }, + { + "epoch": 0.427, + "grad_norm": 69.50049591064453, + "learning_rate": 8.29296e-07, + "loss": 0.5216, + "step": 42700 + }, + { + "epoch": 0.4275, + "grad_norm": 6.045228958129883, + "learning_rate": 8.29096e-07, + "loss": 0.6186, + "step": 42750 + }, + { + "epoch": 0.428, + "grad_norm": 17.617143630981445, + "learning_rate": 8.288959999999999e-07, + "loss": 0.5476, + "step": 42800 + }, + { + "epoch": 0.4285, + "grad_norm": 21.75743293762207, + "learning_rate": 8.28696e-07, + "loss": 0.4667, + "step": 42850 + }, + { + "epoch": 0.429, + "grad_norm": 85.2100830078125, + "learning_rate": 8.28496e-07, + "loss": 0.6098, + "step": 42900 + }, + { + "epoch": 0.4295, + "grad_norm": 10.500739097595215, + "learning_rate": 8.28296e-07, + "loss": 0.434, + "step": 42950 + }, + { + "epoch": 0.43, + "grad_norm": 42.483436584472656, + "learning_rate": 8.280960000000001e-07, + "loss": 0.4656, + "step": 43000 + }, + { + "epoch": 0.4305, + "grad_norm": 21.001169204711914, + "learning_rate": 8.278959999999999e-07, + "loss": 0.5221, + "step": 43050 + }, + { + "epoch": 0.431, + "grad_norm": 17.075965881347656, + "learning_rate": 8.276959999999999e-07, + "loss": 0.4657, + "step": 43100 + }, + { + "epoch": 0.4315, + "grad_norm": 15.272270202636719, + "learning_rate": 8.27496e-07, + "loss": 0.4914, + "step": 43150 + }, + { + "epoch": 0.432, + "grad_norm": 51.080135345458984, + "learning_rate": 8.27296e-07, + "loss": 0.689, + "step": 43200 + }, + { + "epoch": 0.4325, + "grad_norm": 99.12468719482422, + "learning_rate": 8.27096e-07, + "loss": 0.4991, + "step": 43250 + }, + { + "epoch": 0.433, + "grad_norm": 69.47232055664062, + "learning_rate": 8.26896e-07, + "loss": 0.4888, + "step": 43300 + }, + { + "epoch": 0.4335, + "grad_norm": 71.0149917602539, + "learning_rate": 8.26696e-07, + "loss": 0.625, + "step": 43350 + }, + { + "epoch": 0.434, + "grad_norm": 47.15895080566406, + "learning_rate": 8.264959999999999e-07, + "loss": 0.5997, + "step": 43400 + }, + { + "epoch": 0.4345, + "grad_norm": 74.67218017578125, + "learning_rate": 8.26296e-07, + "loss": 0.5972, + "step": 43450 + }, + { + "epoch": 0.435, + "grad_norm": 53.72488784790039, + "learning_rate": 8.26096e-07, + "loss": 0.5028, + "step": 43500 + }, + { + "epoch": 0.4355, + "grad_norm": 14.094425201416016, + "learning_rate": 8.258959999999999e-07, + "loss": 0.5297, + "step": 43550 + }, + { + "epoch": 0.436, + "grad_norm": 12.79079818725586, + "learning_rate": 8.25696e-07, + "loss": 0.4202, + "step": 43600 + }, + { + "epoch": 0.4365, + "grad_norm": 28.30698013305664, + "learning_rate": 8.25496e-07, + "loss": 0.4182, + "step": 43650 + }, + { + "epoch": 0.437, + "grad_norm": 91.74481964111328, + "learning_rate": 8.25296e-07, + "loss": 0.4885, + "step": 43700 + }, + { + "epoch": 0.4375, + "grad_norm": 13.599432945251465, + "learning_rate": 8.25096e-07, + "loss": 0.6304, + "step": 43750 + }, + { + "epoch": 0.438, + "grad_norm": 3.0752172470092773, + "learning_rate": 8.248959999999999e-07, + "loss": 0.449, + "step": 43800 + }, + { + "epoch": 0.4385, + "grad_norm": 74.79292297363281, + "learning_rate": 8.246959999999999e-07, + "loss": 0.6306, + "step": 43850 + }, + { + "epoch": 0.439, + "grad_norm": 43.5394401550293, + "learning_rate": 8.24496e-07, + "loss": 0.6897, + "step": 43900 + }, + { + "epoch": 0.4395, + "grad_norm": 25.60984230041504, + "learning_rate": 8.24296e-07, + "loss": 0.5346, + "step": 43950 + }, + { + "epoch": 0.44, + "grad_norm": 91.31788635253906, + "learning_rate": 8.24096e-07, + "loss": 0.6347, + "step": 44000 + }, + { + "epoch": 0.4405, + "grad_norm": 90.00495147705078, + "learning_rate": 8.23896e-07, + "loss": 0.5585, + "step": 44050 + }, + { + "epoch": 0.441, + "grad_norm": 78.50735473632812, + "learning_rate": 8.236959999999999e-07, + "loss": 0.5065, + "step": 44100 + }, + { + "epoch": 0.4415, + "grad_norm": 61.63063049316406, + "learning_rate": 8.234959999999999e-07, + "loss": 0.3718, + "step": 44150 + }, + { + "epoch": 0.442, + "grad_norm": 64.05858612060547, + "learning_rate": 8.23296e-07, + "loss": 0.5225, + "step": 44200 + }, + { + "epoch": 0.4425, + "grad_norm": 46.02941131591797, + "learning_rate": 8.23096e-07, + "loss": 0.4429, + "step": 44250 + }, + { + "epoch": 0.443, + "grad_norm": 19.115480422973633, + "learning_rate": 8.228959999999999e-07, + "loss": 0.6595, + "step": 44300 + }, + { + "epoch": 0.4435, + "grad_norm": 5.615220069885254, + "learning_rate": 8.22696e-07, + "loss": 0.3437, + "step": 44350 + }, + { + "epoch": 0.444, + "grad_norm": 53.1685791015625, + "learning_rate": 8.22496e-07, + "loss": 0.5388, + "step": 44400 + }, + { + "epoch": 0.4445, + "grad_norm": 35.044410705566406, + "learning_rate": 8.222959999999999e-07, + "loss": 0.6554, + "step": 44450 + }, + { + "epoch": 0.445, + "grad_norm": 98.26656341552734, + "learning_rate": 8.22096e-07, + "loss": 0.5233, + "step": 44500 + }, + { + "epoch": 0.4455, + "grad_norm": 109.79158782958984, + "learning_rate": 8.218959999999999e-07, + "loss": 0.5021, + "step": 44550 + }, + { + "epoch": 0.446, + "grad_norm": 59.18463134765625, + "learning_rate": 8.216959999999999e-07, + "loss": 0.5612, + "step": 44600 + }, + { + "epoch": 0.4465, + "grad_norm": 89.63908386230469, + "learning_rate": 8.21496e-07, + "loss": 0.4865, + "step": 44650 + }, + { + "epoch": 0.447, + "grad_norm": 75.61219024658203, + "learning_rate": 8.21296e-07, + "loss": 0.4486, + "step": 44700 + }, + { + "epoch": 0.4475, + "grad_norm": 4.093531131744385, + "learning_rate": 8.210960000000001e-07, + "loss": 0.6416, + "step": 44750 + }, + { + "epoch": 0.448, + "grad_norm": 37.043434143066406, + "learning_rate": 8.208959999999999e-07, + "loss": 0.6513, + "step": 44800 + }, + { + "epoch": 0.4485, + "grad_norm": 51.53557586669922, + "learning_rate": 8.206959999999999e-07, + "loss": 0.4894, + "step": 44850 + }, + { + "epoch": 0.449, + "grad_norm": 69.94869232177734, + "learning_rate": 8.20496e-07, + "loss": 0.4744, + "step": 44900 + }, + { + "epoch": 0.4495, + "grad_norm": 76.57780456542969, + "learning_rate": 8.20296e-07, + "loss": 0.5147, + "step": 44950 + }, + { + "epoch": 0.45, + "grad_norm": 82.7248764038086, + "learning_rate": 8.20096e-07, + "loss": 0.6523, + "step": 45000 + }, + { + "epoch": 0.4505, + "grad_norm": 23.24110984802246, + "learning_rate": 8.19896e-07, + "loss": 0.5502, + "step": 45050 + }, + { + "epoch": 0.451, + "grad_norm": 60.64106750488281, + "learning_rate": 8.19696e-07, + "loss": 0.4758, + "step": 45100 + }, + { + "epoch": 0.4515, + "grad_norm": 35.0093879699707, + "learning_rate": 8.194959999999999e-07, + "loss": 0.544, + "step": 45150 + }, + { + "epoch": 0.452, + "grad_norm": 27.198448181152344, + "learning_rate": 8.19296e-07, + "loss": 0.5395, + "step": 45200 + }, + { + "epoch": 0.4525, + "grad_norm": 51.811222076416016, + "learning_rate": 8.19096e-07, + "loss": 0.5858, + "step": 45250 + }, + { + "epoch": 0.453, + "grad_norm": 59.12669372558594, + "learning_rate": 8.188959999999999e-07, + "loss": 0.4579, + "step": 45300 + }, + { + "epoch": 0.4535, + "grad_norm": 35.265716552734375, + "learning_rate": 8.18696e-07, + "loss": 0.53, + "step": 45350 + }, + { + "epoch": 0.454, + "grad_norm": 38.40176010131836, + "learning_rate": 8.18496e-07, + "loss": 0.5556, + "step": 45400 + }, + { + "epoch": 0.4545, + "grad_norm": 6.6817240715026855, + "learning_rate": 8.18296e-07, + "loss": 0.4169, + "step": 45450 + }, + { + "epoch": 0.455, + "grad_norm": 77.1117935180664, + "learning_rate": 8.18096e-07, + "loss": 0.5975, + "step": 45500 + }, + { + "epoch": 0.4555, + "grad_norm": 32.58802032470703, + "learning_rate": 8.178959999999999e-07, + "loss": 0.4934, + "step": 45550 + }, + { + "epoch": 0.456, + "grad_norm": 80.36177062988281, + "learning_rate": 8.176959999999999e-07, + "loss": 0.4513, + "step": 45600 + }, + { + "epoch": 0.4565, + "grad_norm": 38.63931655883789, + "learning_rate": 8.17496e-07, + "loss": 0.4297, + "step": 45650 + }, + { + "epoch": 0.457, + "grad_norm": 30.655248641967773, + "learning_rate": 8.17296e-07, + "loss": 0.6114, + "step": 45700 + }, + { + "epoch": 0.4575, + "grad_norm": 122.92070007324219, + "learning_rate": 8.17096e-07, + "loss": 0.5557, + "step": 45750 + }, + { + "epoch": 0.458, + "grad_norm": 63.41127395629883, + "learning_rate": 8.16896e-07, + "loss": 0.6203, + "step": 45800 + }, + { + "epoch": 0.4585, + "grad_norm": 10.74167537689209, + "learning_rate": 8.166959999999999e-07, + "loss": 0.4382, + "step": 45850 + }, + { + "epoch": 0.459, + "grad_norm": 70.49405670166016, + "learning_rate": 8.164959999999999e-07, + "loss": 0.4747, + "step": 45900 + }, + { + "epoch": 0.4595, + "grad_norm": 3.7027602195739746, + "learning_rate": 8.16296e-07, + "loss": 0.6623, + "step": 45950 + }, + { + "epoch": 0.46, + "grad_norm": 25.70686149597168, + "learning_rate": 8.16096e-07, + "loss": 0.494, + "step": 46000 + }, + { + "epoch": 0.4605, + "grad_norm": 32.90084457397461, + "learning_rate": 8.158959999999999e-07, + "loss": 0.4998, + "step": 46050 + }, + { + "epoch": 0.461, + "grad_norm": 14.209221839904785, + "learning_rate": 8.15696e-07, + "loss": 0.4964, + "step": 46100 + }, + { + "epoch": 0.4615, + "grad_norm": 0.3458581268787384, + "learning_rate": 8.15496e-07, + "loss": 0.5275, + "step": 46150 + }, + { + "epoch": 0.462, + "grad_norm": 66.87709045410156, + "learning_rate": 8.152959999999999e-07, + "loss": 0.5533, + "step": 46200 + }, + { + "epoch": 0.4625, + "grad_norm": 53.677860260009766, + "learning_rate": 8.15096e-07, + "loss": 0.5109, + "step": 46250 + }, + { + "epoch": 0.463, + "grad_norm": 43.0468864440918, + "learning_rate": 8.148959999999999e-07, + "loss": 0.4664, + "step": 46300 + }, + { + "epoch": 0.4635, + "grad_norm": 70.673095703125, + "learning_rate": 8.146959999999999e-07, + "loss": 0.6125, + "step": 46350 + }, + { + "epoch": 0.464, + "grad_norm": 71.34368133544922, + "learning_rate": 8.14496e-07, + "loss": 0.4832, + "step": 46400 + }, + { + "epoch": 0.4645, + "grad_norm": 198.13221740722656, + "learning_rate": 8.14296e-07, + "loss": 0.6594, + "step": 46450 + }, + { + "epoch": 0.465, + "grad_norm": 35.54419708251953, + "learning_rate": 8.14096e-07, + "loss": 0.5413, + "step": 46500 + }, + { + "epoch": 0.4655, + "grad_norm": 78.56597900390625, + "learning_rate": 8.138959999999999e-07, + "loss": 0.6681, + "step": 46550 + }, + { + "epoch": 0.466, + "grad_norm": 22.999269485473633, + "learning_rate": 8.136959999999999e-07, + "loss": 0.4987, + "step": 46600 + }, + { + "epoch": 0.4665, + "grad_norm": 1.7849301099777222, + "learning_rate": 8.134959999999999e-07, + "loss": 0.4031, + "step": 46650 + }, + { + "epoch": 0.467, + "grad_norm": 27.667150497436523, + "learning_rate": 8.13296e-07, + "loss": 0.5426, + "step": 46700 + }, + { + "epoch": 0.4675, + "grad_norm": 33.66166687011719, + "learning_rate": 8.13096e-07, + "loss": 0.4668, + "step": 46750 + }, + { + "epoch": 0.468, + "grad_norm": 28.29575538635254, + "learning_rate": 8.128959999999999e-07, + "loss": 0.5348, + "step": 46800 + }, + { + "epoch": 0.4685, + "grad_norm": 48.07258605957031, + "learning_rate": 8.12696e-07, + "loss": 0.7829, + "step": 46850 + }, + { + "epoch": 0.469, + "grad_norm": 42.36520004272461, + "learning_rate": 8.124959999999999e-07, + "loss": 0.6122, + "step": 46900 + }, + { + "epoch": 0.4695, + "grad_norm": 106.2151107788086, + "learning_rate": 8.122959999999999e-07, + "loss": 0.621, + "step": 46950 + }, + { + "epoch": 0.47, + "grad_norm": 76.08829498291016, + "learning_rate": 8.12096e-07, + "loss": 0.6716, + "step": 47000 + }, + { + "epoch": 0.4705, + "grad_norm": 31.28024673461914, + "learning_rate": 8.118959999999999e-07, + "loss": 0.4718, + "step": 47050 + }, + { + "epoch": 0.471, + "grad_norm": 55.14548873901367, + "learning_rate": 8.116959999999999e-07, + "loss": 0.6012, + "step": 47100 + }, + { + "epoch": 0.4715, + "grad_norm": 8.463032722473145, + "learning_rate": 8.11496e-07, + "loss": 0.4563, + "step": 47150 + }, + { + "epoch": 0.472, + "grad_norm": 74.97944641113281, + "learning_rate": 8.11296e-07, + "loss": 0.568, + "step": 47200 + }, + { + "epoch": 0.4725, + "grad_norm": 1.081640601158142, + "learning_rate": 8.11096e-07, + "loss": 0.6412, + "step": 47250 + }, + { + "epoch": 0.473, + "grad_norm": 77.51432800292969, + "learning_rate": 8.108959999999999e-07, + "loss": 0.4582, + "step": 47300 + }, + { + "epoch": 0.4735, + "grad_norm": 45.42144012451172, + "learning_rate": 8.106959999999999e-07, + "loss": 0.4746, + "step": 47350 + }, + { + "epoch": 0.474, + "grad_norm": 0.2983810603618622, + "learning_rate": 8.105e-07, + "loss": 0.6629, + "step": 47400 + }, + { + "epoch": 0.4745, + "grad_norm": 5.902935981750488, + "learning_rate": 8.103e-07, + "loss": 0.5522, + "step": 47450 + }, + { + "epoch": 0.475, + "grad_norm": 0.9811944961547852, + "learning_rate": 8.101e-07, + "loss": 0.4998, + "step": 47500 + }, + { + "epoch": 0.4755, + "grad_norm": 21.928115844726562, + "learning_rate": 8.099e-07, + "loss": 0.6313, + "step": 47550 + }, + { + "epoch": 0.476, + "grad_norm": 51.325443267822266, + "learning_rate": 8.097e-07, + "loss": 0.5051, + "step": 47600 + }, + { + "epoch": 0.4765, + "grad_norm": 10.640209197998047, + "learning_rate": 8.094999999999999e-07, + "loss": 0.4917, + "step": 47650 + }, + { + "epoch": 0.477, + "grad_norm": 90.27954864501953, + "learning_rate": 8.093e-07, + "loss": 0.4371, + "step": 47700 + }, + { + "epoch": 0.4775, + "grad_norm": 78.23124694824219, + "learning_rate": 8.091e-07, + "loss": 0.5052, + "step": 47750 + }, + { + "epoch": 0.478, + "grad_norm": 3.3375375270843506, + "learning_rate": 8.088999999999999e-07, + "loss": 0.5095, + "step": 47800 + }, + { + "epoch": 0.4785, + "grad_norm": 17.96718406677246, + "learning_rate": 8.087e-07, + "loss": 0.4613, + "step": 47850 + }, + { + "epoch": 0.479, + "grad_norm": 86.63037872314453, + "learning_rate": 8.085e-07, + "loss": 0.5274, + "step": 47900 + }, + { + "epoch": 0.4795, + "grad_norm": 15.691425323486328, + "learning_rate": 8.083e-07, + "loss": 0.5586, + "step": 47950 + }, + { + "epoch": 0.48, + "grad_norm": 95.55493927001953, + "learning_rate": 8.081e-07, + "loss": 0.5387, + "step": 48000 + }, + { + "epoch": 0.4805, + "grad_norm": 86.83247375488281, + "learning_rate": 8.078999999999999e-07, + "loss": 0.5033, + "step": 48050 + }, + { + "epoch": 0.481, + "grad_norm": 24.38957405090332, + "learning_rate": 8.076999999999999e-07, + "loss": 0.7073, + "step": 48100 + }, + { + "epoch": 0.4815, + "grad_norm": 49.8204231262207, + "learning_rate": 8.075e-07, + "loss": 0.4825, + "step": 48150 + }, + { + "epoch": 0.482, + "grad_norm": 81.01677703857422, + "learning_rate": 8.07304e-07, + "loss": 0.6172, + "step": 48200 + }, + { + "epoch": 0.4825, + "grad_norm": 9.030482292175293, + "learning_rate": 8.07104e-07, + "loss": 0.5569, + "step": 48250 + }, + { + "epoch": 0.483, + "grad_norm": 94.63253021240234, + "learning_rate": 8.06904e-07, + "loss": 0.5637, + "step": 48300 + }, + { + "epoch": 0.4835, + "grad_norm": 92.56613159179688, + "learning_rate": 8.067039999999999e-07, + "loss": 0.6043, + "step": 48350 + }, + { + "epoch": 0.484, + "grad_norm": 21.853906631469727, + "learning_rate": 8.065039999999999e-07, + "loss": 0.4438, + "step": 48400 + }, + { + "epoch": 0.4845, + "grad_norm": 22.982881546020508, + "learning_rate": 8.06304e-07, + "loss": 0.498, + "step": 48450 + }, + { + "epoch": 0.485, + "grad_norm": 57.36958312988281, + "learning_rate": 8.06104e-07, + "loss": 0.5241, + "step": 48500 + }, + { + "epoch": 0.4855, + "grad_norm": 21.14190673828125, + "learning_rate": 8.059039999999999e-07, + "loss": 0.6204, + "step": 48550 + }, + { + "epoch": 0.486, + "grad_norm": 52.20134353637695, + "learning_rate": 8.05704e-07, + "loss": 0.5012, + "step": 48600 + }, + { + "epoch": 0.4865, + "grad_norm": 1.349229335784912, + "learning_rate": 8.05504e-07, + "loss": 0.5246, + "step": 48650 + }, + { + "epoch": 0.487, + "grad_norm": 66.7760009765625, + "learning_rate": 8.053039999999999e-07, + "loss": 0.5898, + "step": 48700 + }, + { + "epoch": 0.4875, + "grad_norm": 20.221620559692383, + "learning_rate": 8.05104e-07, + "loss": 0.449, + "step": 48750 + }, + { + "epoch": 0.488, + "grad_norm": 36.600914001464844, + "learning_rate": 8.049039999999999e-07, + "loss": 0.4766, + "step": 48800 + }, + { + "epoch": 0.4885, + "grad_norm": 65.7702865600586, + "learning_rate": 8.047039999999999e-07, + "loss": 0.66, + "step": 48850 + }, + { + "epoch": 0.489, + "grad_norm": 41.316524505615234, + "learning_rate": 8.04504e-07, + "loss": 0.549, + "step": 48900 + }, + { + "epoch": 0.4895, + "grad_norm": 30.280691146850586, + "learning_rate": 8.04304e-07, + "loss": 0.5504, + "step": 48950 + }, + { + "epoch": 0.49, + "grad_norm": 34.455535888671875, + "learning_rate": 8.04104e-07, + "loss": 0.502, + "step": 49000 + }, + { + "epoch": 0.4905, + "grad_norm": 57.2782096862793, + "learning_rate": 8.039039999999999e-07, + "loss": 0.4723, + "step": 49050 + }, + { + "epoch": 0.491, + "grad_norm": 29.322336196899414, + "learning_rate": 8.037039999999999e-07, + "loss": 0.5335, + "step": 49100 + }, + { + "epoch": 0.4915, + "grad_norm": 88.96637725830078, + "learning_rate": 8.035039999999999e-07, + "loss": 0.5582, + "step": 49150 + }, + { + "epoch": 0.492, + "grad_norm": 59.695770263671875, + "learning_rate": 8.03304e-07, + "loss": 0.5627, + "step": 49200 + }, + { + "epoch": 0.4925, + "grad_norm": 96.95304107666016, + "learning_rate": 8.03104e-07, + "loss": 0.5773, + "step": 49250 + }, + { + "epoch": 0.493, + "grad_norm": 76.24807739257812, + "learning_rate": 8.029039999999999e-07, + "loss": 0.5979, + "step": 49300 + }, + { + "epoch": 0.4935, + "grad_norm": 5.769176483154297, + "learning_rate": 8.02704e-07, + "loss": 0.4968, + "step": 49350 + }, + { + "epoch": 0.494, + "grad_norm": 61.573577880859375, + "learning_rate": 8.025039999999999e-07, + "loss": 0.5445, + "step": 49400 + }, + { + "epoch": 0.4945, + "grad_norm": 66.46489715576172, + "learning_rate": 8.023039999999999e-07, + "loss": 0.467, + "step": 49450 + }, + { + "epoch": 0.495, + "grad_norm": 84.0884780883789, + "learning_rate": 8.02104e-07, + "loss": 0.5543, + "step": 49500 + }, + { + "epoch": 0.4955, + "grad_norm": 73.09436798095703, + "learning_rate": 8.019039999999999e-07, + "loss": 0.5498, + "step": 49550 + }, + { + "epoch": 0.496, + "grad_norm": 17.9005126953125, + "learning_rate": 8.017039999999999e-07, + "loss": 0.4723, + "step": 49600 + }, + { + "epoch": 0.4965, + "grad_norm": 50.205223083496094, + "learning_rate": 8.01504e-07, + "loss": 0.5561, + "step": 49650 + }, + { + "epoch": 0.497, + "grad_norm": 52.76295471191406, + "learning_rate": 8.01304e-07, + "loss": 0.4949, + "step": 49700 + }, + { + "epoch": 0.4975, + "grad_norm": 62.78417205810547, + "learning_rate": 8.01104e-07, + "loss": 0.56, + "step": 49750 + }, + { + "epoch": 0.498, + "grad_norm": 6.838378429412842, + "learning_rate": 8.009039999999999e-07, + "loss": 0.6055, + "step": 49800 + }, + { + "epoch": 0.4985, + "grad_norm": 18.722816467285156, + "learning_rate": 8.007039999999999e-07, + "loss": 0.5358, + "step": 49850 + }, + { + "epoch": 0.499, + "grad_norm": 72.58301544189453, + "learning_rate": 8.00504e-07, + "loss": 0.401, + "step": 49900 + }, + { + "epoch": 0.4995, + "grad_norm": 37.88386535644531, + "learning_rate": 8.00304e-07, + "loss": 0.5257, + "step": 49950 + }, + { + "epoch": 0.5, + "grad_norm": 72.8793716430664, + "learning_rate": 8.00104e-07, + "loss": 0.5824, + "step": 50000 + }, + { + "epoch": 0.5005, + "grad_norm": 64.095458984375, + "learning_rate": 7.99904e-07, + "loss": 0.5659, + "step": 50050 + }, + { + "epoch": 0.501, + "grad_norm": 25.19496726989746, + "learning_rate": 7.997039999999999e-07, + "loss": 0.4465, + "step": 50100 + }, + { + "epoch": 0.5015, + "grad_norm": 112.08228302001953, + "learning_rate": 7.995039999999999e-07, + "loss": 0.6215, + "step": 50150 + }, + { + "epoch": 0.502, + "grad_norm": 3.157916307449341, + "learning_rate": 7.99304e-07, + "loss": 0.6333, + "step": 50200 + }, + { + "epoch": 0.5025, + "grad_norm": 76.75867462158203, + "learning_rate": 7.99104e-07, + "loss": 0.483, + "step": 50250 + }, + { + "epoch": 0.503, + "grad_norm": 43.23719024658203, + "learning_rate": 7.989039999999999e-07, + "loss": 0.5854, + "step": 50300 + }, + { + "epoch": 0.5035, + "grad_norm": 68.20858001708984, + "learning_rate": 7.98708e-07, + "loss": 0.6767, + "step": 50350 + }, + { + "epoch": 0.504, + "grad_norm": 88.28502655029297, + "learning_rate": 7.98508e-07, + "loss": 0.7013, + "step": 50400 + }, + { + "epoch": 0.5045, + "grad_norm": 36.20711898803711, + "learning_rate": 7.98308e-07, + "loss": 0.3669, + "step": 50450 + }, + { + "epoch": 0.505, + "grad_norm": 36.513954162597656, + "learning_rate": 7.98108e-07, + "loss": 0.4585, + "step": 50500 + }, + { + "epoch": 0.5055, + "grad_norm": 58.205039978027344, + "learning_rate": 7.979079999999999e-07, + "loss": 0.4132, + "step": 50550 + }, + { + "epoch": 0.506, + "grad_norm": 79.69970703125, + "learning_rate": 7.977079999999999e-07, + "loss": 0.5174, + "step": 50600 + }, + { + "epoch": 0.5065, + "grad_norm": 45.726173400878906, + "learning_rate": 7.97508e-07, + "loss": 0.5019, + "step": 50650 + }, + { + "epoch": 0.507, + "grad_norm": 88.03544616699219, + "learning_rate": 7.97308e-07, + "loss": 0.4937, + "step": 50700 + }, + { + "epoch": 0.5075, + "grad_norm": 47.83407211303711, + "learning_rate": 7.97108e-07, + "loss": 0.5023, + "step": 50750 + }, + { + "epoch": 0.508, + "grad_norm": 28.184261322021484, + "learning_rate": 7.96908e-07, + "loss": 0.4929, + "step": 50800 + }, + { + "epoch": 0.5085, + "grad_norm": 97.35902404785156, + "learning_rate": 7.967079999999999e-07, + "loss": 0.4831, + "step": 50850 + }, + { + "epoch": 0.509, + "grad_norm": 45.97766876220703, + "learning_rate": 7.965079999999999e-07, + "loss": 0.5077, + "step": 50900 + }, + { + "epoch": 0.5095, + "grad_norm": 126.22740173339844, + "learning_rate": 7.96308e-07, + "loss": 0.7102, + "step": 50950 + }, + { + "epoch": 0.51, + "grad_norm": 47.19316101074219, + "learning_rate": 7.96108e-07, + "loss": 0.4501, + "step": 51000 + }, + { + "epoch": 0.5105, + "grad_norm": 7.235782146453857, + "learning_rate": 7.959079999999999e-07, + "loss": 0.4165, + "step": 51050 + }, + { + "epoch": 0.511, + "grad_norm": 1.8868838548660278, + "learning_rate": 7.95708e-07, + "loss": 0.5589, + "step": 51100 + }, + { + "epoch": 0.5115, + "grad_norm": 48.44756317138672, + "learning_rate": 7.95508e-07, + "loss": 0.7304, + "step": 51150 + }, + { + "epoch": 0.512, + "grad_norm": 36.71236038208008, + "learning_rate": 7.953079999999999e-07, + "loss": 0.5783, + "step": 51200 + }, + { + "epoch": 0.5125, + "grad_norm": 59.711605072021484, + "learning_rate": 7.95108e-07, + "loss": 0.5104, + "step": 51250 + }, + { + "epoch": 0.513, + "grad_norm": 21.43183135986328, + "learning_rate": 7.949079999999999e-07, + "loss": 0.7175, + "step": 51300 + }, + { + "epoch": 0.5135, + "grad_norm": 106.07775115966797, + "learning_rate": 7.947079999999999e-07, + "loss": 0.5952, + "step": 51350 + }, + { + "epoch": 0.514, + "grad_norm": 25.90865135192871, + "learning_rate": 7.94508e-07, + "loss": 0.5459, + "step": 51400 + }, + { + "epoch": 0.5145, + "grad_norm": 3.6912739276885986, + "learning_rate": 7.94308e-07, + "loss": 0.5168, + "step": 51450 + }, + { + "epoch": 0.515, + "grad_norm": 89.33435821533203, + "learning_rate": 7.94108e-07, + "loss": 0.5239, + "step": 51500 + }, + { + "epoch": 0.5155, + "grad_norm": 32.307071685791016, + "learning_rate": 7.939079999999999e-07, + "loss": 0.5169, + "step": 51550 + }, + { + "epoch": 0.516, + "grad_norm": 16.09387969970703, + "learning_rate": 7.937079999999999e-07, + "loss": 0.4644, + "step": 51600 + }, + { + "epoch": 0.5165, + "grad_norm": 11.74305534362793, + "learning_rate": 7.935079999999999e-07, + "loss": 0.3657, + "step": 51650 + }, + { + "epoch": 0.517, + "grad_norm": 3.9428036212921143, + "learning_rate": 7.93308e-07, + "loss": 0.4922, + "step": 51700 + }, + { + "epoch": 0.5175, + "grad_norm": 15.612861633300781, + "learning_rate": 7.93108e-07, + "loss": 0.3998, + "step": 51750 + }, + { + "epoch": 0.518, + "grad_norm": 96.3296890258789, + "learning_rate": 7.929079999999999e-07, + "loss": 0.5107, + "step": 51800 + }, + { + "epoch": 0.5185, + "grad_norm": 76.5985336303711, + "learning_rate": 7.92708e-07, + "loss": 0.5332, + "step": 51850 + }, + { + "epoch": 0.519, + "grad_norm": 17.58921241760254, + "learning_rate": 7.925079999999999e-07, + "loss": 0.4055, + "step": 51900 + }, + { + "epoch": 0.5195, + "grad_norm": 15.568597793579102, + "learning_rate": 7.923079999999999e-07, + "loss": 0.5198, + "step": 51950 + }, + { + "epoch": 0.52, + "grad_norm": 0.8958696126937866, + "learning_rate": 7.92108e-07, + "loss": 0.5061, + "step": 52000 + }, + { + "epoch": 0.5205, + "grad_norm": 1.784369945526123, + "learning_rate": 7.919079999999999e-07, + "loss": 0.5103, + "step": 52050 + }, + { + "epoch": 0.521, + "grad_norm": 36.33938980102539, + "learning_rate": 7.917079999999999e-07, + "loss": 0.574, + "step": 52100 + }, + { + "epoch": 0.5215, + "grad_norm": 39.72173309326172, + "learning_rate": 7.91508e-07, + "loss": 0.5163, + "step": 52150 + }, + { + "epoch": 0.522, + "grad_norm": 56.958919525146484, + "learning_rate": 7.91308e-07, + "loss": 0.5032, + "step": 52200 + }, + { + "epoch": 0.5225, + "grad_norm": 103.39403533935547, + "learning_rate": 7.91108e-07, + "loss": 0.63, + "step": 52250 + }, + { + "epoch": 0.523, + "grad_norm": 79.32686614990234, + "learning_rate": 7.90908e-07, + "loss": 0.5285, + "step": 52300 + }, + { + "epoch": 0.5235, + "grad_norm": 38.18978500366211, + "learning_rate": 7.907079999999999e-07, + "loss": 0.4923, + "step": 52350 + }, + { + "epoch": 0.524, + "grad_norm": 17.309484481811523, + "learning_rate": 7.90508e-07, + "loss": 0.4923, + "step": 52400 + }, + { + "epoch": 0.5245, + "grad_norm": 92.0141372680664, + "learning_rate": 7.90308e-07, + "loss": 0.5748, + "step": 52450 + }, + { + "epoch": 0.525, + "grad_norm": 38.47359848022461, + "learning_rate": 7.90108e-07, + "loss": 0.5513, + "step": 52500 + }, + { + "epoch": 0.5255, + "grad_norm": 69.26992797851562, + "learning_rate": 7.899080000000001e-07, + "loss": 0.6091, + "step": 52550 + }, + { + "epoch": 0.526, + "grad_norm": 15.796204566955566, + "learning_rate": 7.897079999999999e-07, + "loss": 0.5999, + "step": 52600 + }, + { + "epoch": 0.5265, + "grad_norm": 103.76625061035156, + "learning_rate": 7.895079999999999e-07, + "loss": 0.4535, + "step": 52650 + }, + { + "epoch": 0.527, + "grad_norm": 6.661074161529541, + "learning_rate": 7.89308e-07, + "loss": 0.5645, + "step": 52700 + }, + { + "epoch": 0.5275, + "grad_norm": 92.19075012207031, + "learning_rate": 7.89108e-07, + "loss": 0.5311, + "step": 52750 + }, + { + "epoch": 0.528, + "grad_norm": 51.94999313354492, + "learning_rate": 7.88908e-07, + "loss": 0.4162, + "step": 52800 + }, + { + "epoch": 0.5285, + "grad_norm": 2.7024729251861572, + "learning_rate": 7.88708e-07, + "loss": 0.4668, + "step": 52850 + }, + { + "epoch": 0.529, + "grad_norm": 72.6579360961914, + "learning_rate": 7.88508e-07, + "loss": 0.5353, + "step": 52900 + }, + { + "epoch": 0.5295, + "grad_norm": 47.76490783691406, + "learning_rate": 7.883079999999999e-07, + "loss": 0.3945, + "step": 52950 + }, + { + "epoch": 0.53, + "grad_norm": 64.03026580810547, + "learning_rate": 7.88108e-07, + "loss": 0.5546, + "step": 53000 + }, + { + "epoch": 0.5305, + "grad_norm": 37.60674285888672, + "learning_rate": 7.87908e-07, + "loss": 0.5378, + "step": 53050 + }, + { + "epoch": 0.531, + "grad_norm": 53.003170013427734, + "learning_rate": 7.877079999999999e-07, + "loss": 0.4513, + "step": 53100 + }, + { + "epoch": 0.5315, + "grad_norm": 5.38227653503418, + "learning_rate": 7.87508e-07, + "loss": 0.5933, + "step": 53150 + }, + { + "epoch": 0.532, + "grad_norm": 48.471832275390625, + "learning_rate": 7.87308e-07, + "loss": 0.476, + "step": 53200 + }, + { + "epoch": 0.5325, + "grad_norm": 7.869255542755127, + "learning_rate": 7.87108e-07, + "loss": 0.4092, + "step": 53250 + }, + { + "epoch": 0.533, + "grad_norm": 18.027494430541992, + "learning_rate": 7.86908e-07, + "loss": 0.5511, + "step": 53300 + }, + { + "epoch": 0.5335, + "grad_norm": 2.7823119163513184, + "learning_rate": 7.867079999999999e-07, + "loss": 0.4462, + "step": 53350 + }, + { + "epoch": 0.534, + "grad_norm": 84.66714477539062, + "learning_rate": 7.865079999999999e-07, + "loss": 0.5392, + "step": 53400 + }, + { + "epoch": 0.5345, + "grad_norm": 39.51984786987305, + "learning_rate": 7.86308e-07, + "loss": 0.5594, + "step": 53450 + }, + { + "epoch": 0.535, + "grad_norm": 58.15394973754883, + "learning_rate": 7.86112e-07, + "loss": 0.49, + "step": 53500 + }, + { + "epoch": 0.5355, + "grad_norm": 90.69095611572266, + "learning_rate": 7.85912e-07, + "loss": 0.3737, + "step": 53550 + }, + { + "epoch": 0.536, + "grad_norm": 6.2065629959106445, + "learning_rate": 7.85712e-07, + "loss": 0.6176, + "step": 53600 + }, + { + "epoch": 0.5365, + "grad_norm": 123.52501678466797, + "learning_rate": 7.855119999999999e-07, + "loss": 0.4808, + "step": 53650 + }, + { + "epoch": 0.537, + "grad_norm": 122.08316040039062, + "learning_rate": 7.853119999999999e-07, + "loss": 0.5169, + "step": 53700 + }, + { + "epoch": 0.5375, + "grad_norm": 1.263464331626892, + "learning_rate": 7.85112e-07, + "loss": 0.4527, + "step": 53750 + }, + { + "epoch": 0.538, + "grad_norm": 0.7644511461257935, + "learning_rate": 7.84912e-07, + "loss": 0.6983, + "step": 53800 + }, + { + "epoch": 0.5385, + "grad_norm": 4.055157661437988, + "learning_rate": 7.847119999999999e-07, + "loss": 0.5582, + "step": 53850 + }, + { + "epoch": 0.539, + "grad_norm": 39.59327697753906, + "learning_rate": 7.84512e-07, + "loss": 0.4857, + "step": 53900 + }, + { + "epoch": 0.5395, + "grad_norm": 44.580570220947266, + "learning_rate": 7.84312e-07, + "loss": 0.5478, + "step": 53950 + }, + { + "epoch": 0.54, + "grad_norm": 45.88383865356445, + "learning_rate": 7.84112e-07, + "loss": 0.5136, + "step": 54000 + }, + { + "epoch": 0.5405, + "grad_norm": 18.86288833618164, + "learning_rate": 7.83912e-07, + "loss": 0.5359, + "step": 54050 + }, + { + "epoch": 0.541, + "grad_norm": 78.80966186523438, + "learning_rate": 7.837119999999999e-07, + "loss": 0.6137, + "step": 54100 + }, + { + "epoch": 0.5415, + "grad_norm": 17.388288497924805, + "learning_rate": 7.835119999999999e-07, + "loss": 0.5367, + "step": 54150 + }, + { + "epoch": 0.542, + "grad_norm": 50.73162078857422, + "learning_rate": 7.83312e-07, + "loss": 0.5237, + "step": 54200 + }, + { + "epoch": 0.5425, + "grad_norm": 11.157291412353516, + "learning_rate": 7.83112e-07, + "loss": 0.4897, + "step": 54250 + }, + { + "epoch": 0.543, + "grad_norm": 75.02784729003906, + "learning_rate": 7.829120000000001e-07, + "loss": 0.5842, + "step": 54300 + }, + { + "epoch": 0.5435, + "grad_norm": 72.39640045166016, + "learning_rate": 7.82712e-07, + "loss": 0.5354, + "step": 54350 + }, + { + "epoch": 0.544, + "grad_norm": 0.02550535462796688, + "learning_rate": 7.825119999999999e-07, + "loss": 0.5047, + "step": 54400 + }, + { + "epoch": 0.5445, + "grad_norm": 5.766872882843018, + "learning_rate": 7.82312e-07, + "loss": 0.5554, + "step": 54450 + }, + { + "epoch": 0.545, + "grad_norm": 41.95703125, + "learning_rate": 7.82112e-07, + "loss": 0.5145, + "step": 54500 + }, + { + "epoch": 0.5455, + "grad_norm": 100.95028686523438, + "learning_rate": 7.81912e-07, + "loss": 0.5323, + "step": 54550 + }, + { + "epoch": 0.546, + "grad_norm": 107.0252685546875, + "learning_rate": 7.81712e-07, + "loss": 0.6051, + "step": 54600 + }, + { + "epoch": 0.5465, + "grad_norm": 73.3103256225586, + "learning_rate": 7.81512e-07, + "loss": 0.6191, + "step": 54650 + }, + { + "epoch": 0.547, + "grad_norm": 99.1833724975586, + "learning_rate": 7.81312e-07, + "loss": 0.5797, + "step": 54700 + }, + { + "epoch": 0.5475, + "grad_norm": 64.93377685546875, + "learning_rate": 7.81112e-07, + "loss": 0.4962, + "step": 54750 + }, + { + "epoch": 0.548, + "grad_norm": 16.584720611572266, + "learning_rate": 7.80912e-07, + "loss": 0.5693, + "step": 54800 + }, + { + "epoch": 0.5485, + "grad_norm": 91.99310302734375, + "learning_rate": 7.807119999999999e-07, + "loss": 0.6053, + "step": 54850 + }, + { + "epoch": 0.549, + "grad_norm": 91.74974822998047, + "learning_rate": 7.80512e-07, + "loss": 0.515, + "step": 54900 + }, + { + "epoch": 0.5495, + "grad_norm": 11.728752136230469, + "learning_rate": 7.80312e-07, + "loss": 0.499, + "step": 54950 + }, + { + "epoch": 0.55, + "grad_norm": 39.06698989868164, + "learning_rate": 7.80112e-07, + "loss": 0.4463, + "step": 55000 + }, + { + "epoch": 0.5505, + "grad_norm": 88.770751953125, + "learning_rate": 7.799120000000001e-07, + "loss": 0.5427, + "step": 55050 + }, + { + "epoch": 0.551, + "grad_norm": 27.026084899902344, + "learning_rate": 7.797119999999999e-07, + "loss": 0.523, + "step": 55100 + }, + { + "epoch": 0.5515, + "grad_norm": 1.4966083765029907, + "learning_rate": 7.795119999999999e-07, + "loss": 0.3774, + "step": 55150 + }, + { + "epoch": 0.552, + "grad_norm": 48.237422943115234, + "learning_rate": 7.79312e-07, + "loss": 0.5175, + "step": 55200 + }, + { + "epoch": 0.5525, + "grad_norm": 55.568851470947266, + "learning_rate": 7.79112e-07, + "loss": 0.4215, + "step": 55250 + }, + { + "epoch": 0.553, + "grad_norm": 102.37129211425781, + "learning_rate": 7.78912e-07, + "loss": 0.64, + "step": 55300 + }, + { + "epoch": 0.5535, + "grad_norm": 3.732124090194702, + "learning_rate": 7.78712e-07, + "loss": 0.5539, + "step": 55350 + }, + { + "epoch": 0.554, + "grad_norm": 49.1021842956543, + "learning_rate": 7.78512e-07, + "loss": 0.4958, + "step": 55400 + }, + { + "epoch": 0.5545, + "grad_norm": 85.12141418457031, + "learning_rate": 7.783119999999999e-07, + "loss": 0.4048, + "step": 55450 + }, + { + "epoch": 0.555, + "grad_norm": 7.854513168334961, + "learning_rate": 7.78112e-07, + "loss": 0.4996, + "step": 55500 + }, + { + "epoch": 0.5555, + "grad_norm": 21.370290756225586, + "learning_rate": 7.77912e-07, + "loss": 0.6001, + "step": 55550 + }, + { + "epoch": 0.556, + "grad_norm": 58.50725173950195, + "learning_rate": 7.777119999999999e-07, + "loss": 0.5398, + "step": 55600 + }, + { + "epoch": 0.5565, + "grad_norm": 26.769861221313477, + "learning_rate": 7.77512e-07, + "loss": 0.5714, + "step": 55650 + }, + { + "epoch": 0.557, + "grad_norm": 7.172065734863281, + "learning_rate": 7.77312e-07, + "loss": 0.5132, + "step": 55700 + }, + { + "epoch": 0.5575, + "grad_norm": 28.10909080505371, + "learning_rate": 7.77112e-07, + "loss": 0.5079, + "step": 55750 + }, + { + "epoch": 0.558, + "grad_norm": 78.68544006347656, + "learning_rate": 7.76912e-07, + "loss": 0.575, + "step": 55800 + }, + { + "epoch": 0.5585, + "grad_norm": 105.1363754272461, + "learning_rate": 7.767119999999999e-07, + "loss": 0.6106, + "step": 55850 + }, + { + "epoch": 0.559, + "grad_norm": 67.24320983886719, + "learning_rate": 7.765159999999999e-07, + "loss": 0.5576, + "step": 55900 + }, + { + "epoch": 0.5595, + "grad_norm": 8.551283836364746, + "learning_rate": 7.76316e-07, + "loss": 0.6247, + "step": 55950 + }, + { + "epoch": 0.56, + "grad_norm": 33.97761535644531, + "learning_rate": 7.76116e-07, + "loss": 0.5779, + "step": 56000 + }, + { + "epoch": 0.5605, + "grad_norm": 106.0393295288086, + "learning_rate": 7.75916e-07, + "loss": 0.5783, + "step": 56050 + }, + { + "epoch": 0.561, + "grad_norm": 66.49019622802734, + "learning_rate": 7.75716e-07, + "loss": 0.5892, + "step": 56100 + }, + { + "epoch": 0.5615, + "grad_norm": 70.9111557006836, + "learning_rate": 7.755159999999999e-07, + "loss": 0.521, + "step": 56150 + }, + { + "epoch": 0.562, + "grad_norm": 27.3460750579834, + "learning_rate": 7.753159999999999e-07, + "loss": 0.4994, + "step": 56200 + }, + { + "epoch": 0.5625, + "grad_norm": 26.073501586914062, + "learning_rate": 7.75116e-07, + "loss": 0.4284, + "step": 56250 + }, + { + "epoch": 0.563, + "grad_norm": 10.379983901977539, + "learning_rate": 7.74916e-07, + "loss": 0.4984, + "step": 56300 + }, + { + "epoch": 0.5635, + "grad_norm": 15.902315139770508, + "learning_rate": 7.747159999999999e-07, + "loss": 0.5109, + "step": 56350 + }, + { + "epoch": 0.564, + "grad_norm": 13.556990623474121, + "learning_rate": 7.74516e-07, + "loss": 0.5017, + "step": 56400 + }, + { + "epoch": 0.5645, + "grad_norm": 32.288238525390625, + "learning_rate": 7.74316e-07, + "loss": 0.4915, + "step": 56450 + }, + { + "epoch": 0.565, + "grad_norm": 82.15554809570312, + "learning_rate": 7.741159999999999e-07, + "loss": 0.5898, + "step": 56500 + }, + { + "epoch": 0.5655, + "grad_norm": 2.104949951171875, + "learning_rate": 7.73916e-07, + "loss": 0.4796, + "step": 56550 + }, + { + "epoch": 0.566, + "grad_norm": 18.454978942871094, + "learning_rate": 7.737159999999999e-07, + "loss": 0.646, + "step": 56600 + }, + { + "epoch": 0.5665, + "grad_norm": 1.2304694652557373, + "learning_rate": 7.735159999999999e-07, + "loss": 0.5653, + "step": 56650 + }, + { + "epoch": 0.567, + "grad_norm": 87.61625671386719, + "learning_rate": 7.73316e-07, + "loss": 0.4816, + "step": 56700 + }, + { + "epoch": 0.5675, + "grad_norm": 39.44472885131836, + "learning_rate": 7.73116e-07, + "loss": 0.4758, + "step": 56750 + }, + { + "epoch": 0.568, + "grad_norm": 60.77321243286133, + "learning_rate": 7.729160000000001e-07, + "loss": 0.5649, + "step": 56800 + }, + { + "epoch": 0.5685, + "grad_norm": 62.973541259765625, + "learning_rate": 7.727159999999999e-07, + "loss": 0.5331, + "step": 56850 + }, + { + "epoch": 0.569, + "grad_norm": 3.172307252883911, + "learning_rate": 7.725159999999999e-07, + "loss": 0.529, + "step": 56900 + }, + { + "epoch": 0.5695, + "grad_norm": 82.92523956298828, + "learning_rate": 7.72316e-07, + "loss": 0.4194, + "step": 56950 + }, + { + "epoch": 0.57, + "grad_norm": 30.7186222076416, + "learning_rate": 7.72116e-07, + "loss": 0.4916, + "step": 57000 + }, + { + "epoch": 0.5705, + "grad_norm": 21.11612319946289, + "learning_rate": 7.71916e-07, + "loss": 0.5155, + "step": 57050 + }, + { + "epoch": 0.571, + "grad_norm": 109.33685302734375, + "learning_rate": 7.71716e-07, + "loss": 0.6148, + "step": 57100 + }, + { + "epoch": 0.5715, + "grad_norm": 4.589968204498291, + "learning_rate": 7.71516e-07, + "loss": 0.4523, + "step": 57150 + }, + { + "epoch": 0.572, + "grad_norm": 50.23842239379883, + "learning_rate": 7.713159999999999e-07, + "loss": 0.4689, + "step": 57200 + }, + { + "epoch": 0.5725, + "grad_norm": 5.932251453399658, + "learning_rate": 7.71116e-07, + "loss": 0.4741, + "step": 57250 + }, + { + "epoch": 0.573, + "grad_norm": 83.40533447265625, + "learning_rate": 7.70916e-07, + "loss": 0.522, + "step": 57300 + }, + { + "epoch": 0.5735, + "grad_norm": 53.40946960449219, + "learning_rate": 7.707159999999999e-07, + "loss": 0.3644, + "step": 57350 + }, + { + "epoch": 0.574, + "grad_norm": 107.14583587646484, + "learning_rate": 7.70516e-07, + "loss": 0.4899, + "step": 57400 + }, + { + "epoch": 0.5745, + "grad_norm": 81.80699920654297, + "learning_rate": 7.70316e-07, + "loss": 0.4938, + "step": 57450 + }, + { + "epoch": 0.575, + "grad_norm": 66.33548736572266, + "learning_rate": 7.70116e-07, + "loss": 0.5103, + "step": 57500 + }, + { + "epoch": 0.5755, + "grad_norm": 55.853206634521484, + "learning_rate": 7.69916e-07, + "loss": 0.5101, + "step": 57550 + }, + { + "epoch": 0.576, + "grad_norm": 1.4973876476287842, + "learning_rate": 7.697159999999999e-07, + "loss": 0.5066, + "step": 57600 + }, + { + "epoch": 0.5765, + "grad_norm": 8.70065689086914, + "learning_rate": 7.695159999999999e-07, + "loss": 0.523, + "step": 57650 + }, + { + "epoch": 0.577, + "grad_norm": 68.1734390258789, + "learning_rate": 7.69316e-07, + "loss": 0.5269, + "step": 57700 + }, + { + "epoch": 0.5775, + "grad_norm": 47.97027587890625, + "learning_rate": 7.69116e-07, + "loss": 0.4298, + "step": 57750 + }, + { + "epoch": 0.578, + "grad_norm": 0.9205309152603149, + "learning_rate": 7.68916e-07, + "loss": 0.4194, + "step": 57800 + }, + { + "epoch": 0.5785, + "grad_norm": 73.67825317382812, + "learning_rate": 7.68716e-07, + "loss": 0.4472, + "step": 57850 + }, + { + "epoch": 0.579, + "grad_norm": 41.70149612426758, + "learning_rate": 7.685159999999999e-07, + "loss": 0.4958, + "step": 57900 + }, + { + "epoch": 0.5795, + "grad_norm": 37.12933349609375, + "learning_rate": 7.683159999999999e-07, + "loss": 0.3652, + "step": 57950 + }, + { + "epoch": 0.58, + "grad_norm": 50.90730667114258, + "learning_rate": 7.68116e-07, + "loss": 0.4357, + "step": 58000 + }, + { + "epoch": 0.5805, + "grad_norm": 84.36893463134766, + "learning_rate": 7.67916e-07, + "loss": 0.643, + "step": 58050 + }, + { + "epoch": 0.581, + "grad_norm": 2.3979945182800293, + "learning_rate": 7.677159999999999e-07, + "loss": 0.4654, + "step": 58100 + }, + { + "epoch": 0.5815, + "grad_norm": 82.98129272460938, + "learning_rate": 7.67516e-07, + "loss": 0.5124, + "step": 58150 + }, + { + "epoch": 0.582, + "grad_norm": 68.0512924194336, + "learning_rate": 7.67316e-07, + "loss": 0.5489, + "step": 58200 + }, + { + "epoch": 0.5825, + "grad_norm": 7.42828893661499, + "learning_rate": 7.671159999999999e-07, + "loss": 0.6616, + "step": 58250 + }, + { + "epoch": 0.583, + "grad_norm": 16.994314193725586, + "learning_rate": 7.66916e-07, + "loss": 0.4413, + "step": 58300 + }, + { + "epoch": 0.5835, + "grad_norm": 33.765499114990234, + "learning_rate": 7.667159999999999e-07, + "loss": 0.5723, + "step": 58350 + }, + { + "epoch": 0.584, + "grad_norm": 114.72591400146484, + "learning_rate": 7.665159999999999e-07, + "loss": 0.4305, + "step": 58400 + }, + { + "epoch": 0.5845, + "grad_norm": 60.763423919677734, + "learning_rate": 7.66316e-07, + "loss": 0.4434, + "step": 58450 + }, + { + "epoch": 0.585, + "grad_norm": 70.40579223632812, + "learning_rate": 7.66116e-07, + "loss": 0.5024, + "step": 58500 + }, + { + "epoch": 0.5855, + "grad_norm": 63.25678634643555, + "learning_rate": 7.65916e-07, + "loss": 0.4624, + "step": 58550 + }, + { + "epoch": 0.586, + "grad_norm": 75.37684631347656, + "learning_rate": 7.657159999999999e-07, + "loss": 0.6335, + "step": 58600 + }, + { + "epoch": 0.5865, + "grad_norm": 82.71162414550781, + "learning_rate": 7.655159999999999e-07, + "loss": 0.57, + "step": 58650 + }, + { + "epoch": 0.587, + "grad_norm": 6.6550445556640625, + "learning_rate": 7.653159999999999e-07, + "loss": 0.6168, + "step": 58700 + }, + { + "epoch": 0.5875, + "grad_norm": 21.841920852661133, + "learning_rate": 7.65116e-07, + "loss": 0.4213, + "step": 58750 + }, + { + "epoch": 0.588, + "grad_norm": 67.11859893798828, + "learning_rate": 7.64916e-07, + "loss": 0.5094, + "step": 58800 + }, + { + "epoch": 0.5885, + "grad_norm": 6.230730056762695, + "learning_rate": 7.647159999999999e-07, + "loss": 0.5381, + "step": 58850 + }, + { + "epoch": 0.589, + "grad_norm": 90.90957641601562, + "learning_rate": 7.64516e-07, + "loss": 0.5987, + "step": 58900 + }, + { + "epoch": 0.5895, + "grad_norm": 67.64148712158203, + "learning_rate": 7.643159999999999e-07, + "loss": 0.5335, + "step": 58950 + }, + { + "epoch": 0.59, + "grad_norm": 82.5676040649414, + "learning_rate": 7.641159999999999e-07, + "loss": 0.524, + "step": 59000 + }, + { + "epoch": 0.5905, + "grad_norm": 2.6406068801879883, + "learning_rate": 7.63916e-07, + "loss": 0.4817, + "step": 59050 + }, + { + "epoch": 0.591, + "grad_norm": 44.6208381652832, + "learning_rate": 7.637159999999999e-07, + "loss": 0.5026, + "step": 59100 + }, + { + "epoch": 0.5915, + "grad_norm": 90.23446655273438, + "learning_rate": 7.635159999999999e-07, + "loss": 0.5775, + "step": 59150 + }, + { + "epoch": 0.592, + "grad_norm": 1.308499813079834, + "learning_rate": 7.63316e-07, + "loss": 0.5328, + "step": 59200 + }, + { + "epoch": 0.5925, + "grad_norm": 97.81929016113281, + "learning_rate": 7.63116e-07, + "loss": 0.5452, + "step": 59250 + }, + { + "epoch": 0.593, + "grad_norm": 21.65294647216797, + "learning_rate": 7.629160000000001e-07, + "loss": 0.5259, + "step": 59300 + }, + { + "epoch": 0.5935, + "grad_norm": 15.141357421875, + "learning_rate": 7.627159999999999e-07, + "loss": 0.3815, + "step": 59350 + }, + { + "epoch": 0.594, + "grad_norm": 62.881248474121094, + "learning_rate": 7.625159999999999e-07, + "loss": 0.5372, + "step": 59400 + }, + { + "epoch": 0.5945, + "grad_norm": 2.6095540523529053, + "learning_rate": 7.62316e-07, + "loss": 0.4359, + "step": 59450 + }, + { + "epoch": 0.595, + "grad_norm": 5.5010085105896, + "learning_rate": 7.62116e-07, + "loss": 0.4372, + "step": 59500 + }, + { + "epoch": 0.5955, + "grad_norm": 35.81318664550781, + "learning_rate": 7.61916e-07, + "loss": 0.442, + "step": 59550 + }, + { + "epoch": 0.596, + "grad_norm": 61.836212158203125, + "learning_rate": 7.61716e-07, + "loss": 0.6257, + "step": 59600 + }, + { + "epoch": 0.5965, + "grad_norm": 89.80731201171875, + "learning_rate": 7.61516e-07, + "loss": 0.5622, + "step": 59650 + }, + { + "epoch": 0.597, + "grad_norm": 59.49220657348633, + "learning_rate": 7.613159999999999e-07, + "loss": 0.4933, + "step": 59700 + }, + { + "epoch": 0.5975, + "grad_norm": 64.80416107177734, + "learning_rate": 7.61116e-07, + "loss": 0.6054, + "step": 59750 + }, + { + "epoch": 0.598, + "grad_norm": 42.769248962402344, + "learning_rate": 7.60916e-07, + "loss": 0.4669, + "step": 59800 + }, + { + "epoch": 0.5985, + "grad_norm": 4.290161609649658, + "learning_rate": 7.607159999999999e-07, + "loss": 0.5286, + "step": 59850 + }, + { + "epoch": 0.599, + "grad_norm": 50.70981216430664, + "learning_rate": 7.60516e-07, + "loss": 0.5784, + "step": 59900 + }, + { + "epoch": 0.5995, + "grad_norm": 64.66263580322266, + "learning_rate": 7.60316e-07, + "loss": 0.558, + "step": 59950 + }, + { + "epoch": 0.6, + "grad_norm": 56.63187026977539, + "learning_rate": 7.60116e-07, + "loss": 0.59, + "step": 60000 + }, + { + "epoch": 0.6005, + "grad_norm": 53.35725402832031, + "learning_rate": 7.59916e-07, + "loss": 0.6365, + "step": 60050 + }, + { + "epoch": 0.601, + "grad_norm": 30.334291458129883, + "learning_rate": 7.597159999999999e-07, + "loss": 0.5164, + "step": 60100 + }, + { + "epoch": 0.6015, + "grad_norm": 9.290550231933594, + "learning_rate": 7.595159999999999e-07, + "loss": 0.515, + "step": 60150 + }, + { + "epoch": 0.602, + "grad_norm": 64.14176940917969, + "learning_rate": 7.59316e-07, + "loss": 0.614, + "step": 60200 + }, + { + "epoch": 0.6025, + "grad_norm": 75.271484375, + "learning_rate": 7.59116e-07, + "loss": 0.6181, + "step": 60250 + }, + { + "epoch": 0.603, + "grad_norm": 4.262742042541504, + "learning_rate": 7.58916e-07, + "loss": 0.5013, + "step": 60300 + }, + { + "epoch": 0.6035, + "grad_norm": 3.081693649291992, + "learning_rate": 7.58716e-07, + "loss": 0.5908, + "step": 60350 + }, + { + "epoch": 0.604, + "grad_norm": 39.718345642089844, + "learning_rate": 7.585159999999999e-07, + "loss": 0.4868, + "step": 60400 + }, + { + "epoch": 0.6045, + "grad_norm": 84.18822479248047, + "learning_rate": 7.583159999999999e-07, + "loss": 0.5012, + "step": 60450 + }, + { + "epoch": 0.605, + "grad_norm": 90.46514892578125, + "learning_rate": 7.58116e-07, + "loss": 0.5695, + "step": 60500 + }, + { + "epoch": 0.6055, + "grad_norm": 56.23940658569336, + "learning_rate": 7.57916e-07, + "loss": 0.5034, + "step": 60550 + }, + { + "epoch": 0.606, + "grad_norm": 27.244321823120117, + "learning_rate": 7.577159999999999e-07, + "loss": 0.5065, + "step": 60600 + }, + { + "epoch": 0.6065, + "grad_norm": 6.311286449432373, + "learning_rate": 7.57516e-07, + "loss": 0.5169, + "step": 60650 + }, + { + "epoch": 0.607, + "grad_norm": 1.7884770631790161, + "learning_rate": 7.57316e-07, + "loss": 0.4025, + "step": 60700 + }, + { + "epoch": 0.6075, + "grad_norm": 76.4052734375, + "learning_rate": 7.571159999999999e-07, + "loss": 0.5358, + "step": 60750 + }, + { + "epoch": 0.608, + "grad_norm": 57.62378692626953, + "learning_rate": 7.56916e-07, + "loss": 0.5542, + "step": 60800 + }, + { + "epoch": 0.6085, + "grad_norm": 1.3535525798797607, + "learning_rate": 7.56716e-07, + "loss": 0.6046, + "step": 60850 + }, + { + "epoch": 0.609, + "grad_norm": 57.33085632324219, + "learning_rate": 7.565159999999999e-07, + "loss": 0.5285, + "step": 60900 + }, + { + "epoch": 0.6095, + "grad_norm": 10.677825927734375, + "learning_rate": 7.56316e-07, + "loss": 0.4391, + "step": 60950 + }, + { + "epoch": 0.61, + "grad_norm": 14.921430587768555, + "learning_rate": 7.56116e-07, + "loss": 0.4501, + "step": 61000 + }, + { + "epoch": 0.6105, + "grad_norm": 3.2991600036621094, + "learning_rate": 7.55916e-07, + "loss": 0.5235, + "step": 61050 + }, + { + "epoch": 0.611, + "grad_norm": 50.49160385131836, + "learning_rate": 7.55716e-07, + "loss": 0.4383, + "step": 61100 + }, + { + "epoch": 0.6115, + "grad_norm": 93.91867065429688, + "learning_rate": 7.555159999999999e-07, + "loss": 0.558, + "step": 61150 + }, + { + "epoch": 0.612, + "grad_norm": 15.711594581604004, + "learning_rate": 7.553159999999999e-07, + "loss": 0.5294, + "step": 61200 + }, + { + "epoch": 0.6125, + "grad_norm": 9.569193840026855, + "learning_rate": 7.5512e-07, + "loss": 0.4918, + "step": 61250 + }, + { + "epoch": 0.613, + "grad_norm": 36.533626556396484, + "learning_rate": 7.5492e-07, + "loss": 0.4458, + "step": 61300 + }, + { + "epoch": 0.6135, + "grad_norm": 47.60615158081055, + "learning_rate": 7.547199999999999e-07, + "loss": 0.494, + "step": 61350 + }, + { + "epoch": 0.614, + "grad_norm": 47.9034538269043, + "learning_rate": 7.5452e-07, + "loss": 0.5418, + "step": 61400 + }, + { + "epoch": 0.6145, + "grad_norm": 55.0429573059082, + "learning_rate": 7.543199999999999e-07, + "loss": 0.6425, + "step": 61450 + }, + { + "epoch": 0.615, + "grad_norm": 29.267080307006836, + "learning_rate": 7.541199999999999e-07, + "loss": 0.467, + "step": 61500 + }, + { + "epoch": 0.6155, + "grad_norm": 47.8558235168457, + "learning_rate": 7.5392e-07, + "loss": 0.4311, + "step": 61550 + }, + { + "epoch": 0.616, + "grad_norm": 63.829071044921875, + "learning_rate": 7.537199999999999e-07, + "loss": 0.564, + "step": 61600 + }, + { + "epoch": 0.6165, + "grad_norm": 2.9663500785827637, + "learning_rate": 7.535199999999999e-07, + "loss": 0.5972, + "step": 61650 + }, + { + "epoch": 0.617, + "grad_norm": 1.3793628215789795, + "learning_rate": 7.5332e-07, + "loss": 0.4365, + "step": 61700 + }, + { + "epoch": 0.6175, + "grad_norm": 37.88240432739258, + "learning_rate": 7.5312e-07, + "loss": 0.596, + "step": 61750 + }, + { + "epoch": 0.618, + "grad_norm": 71.33352661132812, + "learning_rate": 7.5292e-07, + "loss": 0.4351, + "step": 61800 + }, + { + "epoch": 0.6185, + "grad_norm": 59.38632583618164, + "learning_rate": 7.527199999999999e-07, + "loss": 0.4588, + "step": 61850 + }, + { + "epoch": 0.619, + "grad_norm": 59.172969818115234, + "learning_rate": 7.525199999999999e-07, + "loss": 0.5131, + "step": 61900 + }, + { + "epoch": 0.6195, + "grad_norm": 11.83280086517334, + "learning_rate": 7.5232e-07, + "loss": 0.5119, + "step": 61950 + }, + { + "epoch": 0.62, + "grad_norm": 64.36370849609375, + "learning_rate": 7.5212e-07, + "loss": 0.4334, + "step": 62000 + }, + { + "epoch": 0.6205, + "grad_norm": 63.02851867675781, + "learning_rate": 7.5192e-07, + "loss": 0.4878, + "step": 62050 + }, + { + "epoch": 0.621, + "grad_norm": 1.058779001235962, + "learning_rate": 7.517200000000001e-07, + "loss": 0.5113, + "step": 62100 + }, + { + "epoch": 0.6215, + "grad_norm": 2.1751315593719482, + "learning_rate": 7.515199999999999e-07, + "loss": 0.4869, + "step": 62150 + }, + { + "epoch": 0.622, + "grad_norm": 100.90425109863281, + "learning_rate": 7.513199999999999e-07, + "loss": 0.5549, + "step": 62200 + }, + { + "epoch": 0.6225, + "grad_norm": 116.42280578613281, + "learning_rate": 7.5112e-07, + "loss": 0.4273, + "step": 62250 + }, + { + "epoch": 0.623, + "grad_norm": 81.34993743896484, + "learning_rate": 7.5092e-07, + "loss": 0.5264, + "step": 62300 + }, + { + "epoch": 0.6235, + "grad_norm": 27.30585289001465, + "learning_rate": 7.5072e-07, + "loss": 0.5647, + "step": 62350 + }, + { + "epoch": 0.624, + "grad_norm": 8.460829734802246, + "learning_rate": 7.5052e-07, + "loss": 0.5671, + "step": 62400 + }, + { + "epoch": 0.6245, + "grad_norm": 23.786176681518555, + "learning_rate": 7.5032e-07, + "loss": 0.4409, + "step": 62450 + }, + { + "epoch": 0.625, + "grad_norm": 100.85468292236328, + "learning_rate": 7.501199999999999e-07, + "loss": 0.5269, + "step": 62500 + }, + { + "epoch": 0.6255, + "grad_norm": 55.07974624633789, + "learning_rate": 7.4992e-07, + "loss": 0.3994, + "step": 62550 + }, + { + "epoch": 0.626, + "grad_norm": 50.99776840209961, + "learning_rate": 7.4972e-07, + "loss": 0.5842, + "step": 62600 + }, + { + "epoch": 0.6265, + "grad_norm": 6.158398151397705, + "learning_rate": 7.495199999999999e-07, + "loss": 0.4597, + "step": 62650 + }, + { + "epoch": 0.627, + "grad_norm": 47.69706344604492, + "learning_rate": 7.4932e-07, + "loss": 0.5979, + "step": 62700 + }, + { + "epoch": 0.6275, + "grad_norm": 42.70079040527344, + "learning_rate": 7.4912e-07, + "loss": 0.4116, + "step": 62750 + }, + { + "epoch": 0.628, + "grad_norm": 109.9066162109375, + "learning_rate": 7.4892e-07, + "loss": 0.4997, + "step": 62800 + }, + { + "epoch": 0.6285, + "grad_norm": 51.17443084716797, + "learning_rate": 7.4872e-07, + "loss": 0.5929, + "step": 62850 + }, + { + "epoch": 0.629, + "grad_norm": 24.46744155883789, + "learning_rate": 7.485199999999999e-07, + "loss": 0.5733, + "step": 62900 + }, + { + "epoch": 0.6295, + "grad_norm": 16.23428726196289, + "learning_rate": 7.483199999999999e-07, + "loss": 0.3725, + "step": 62950 + }, + { + "epoch": 0.63, + "grad_norm": 4.631766319274902, + "learning_rate": 7.4812e-07, + "loss": 0.5679, + "step": 63000 + }, + { + "epoch": 0.6305, + "grad_norm": 127.42314147949219, + "learning_rate": 7.4792e-07, + "loss": 0.4697, + "step": 63050 + }, + { + "epoch": 0.631, + "grad_norm": 85.11431121826172, + "learning_rate": 7.4772e-07, + "loss": 0.4926, + "step": 63100 + }, + { + "epoch": 0.6315, + "grad_norm": 18.65448570251465, + "learning_rate": 7.4752e-07, + "loss": 0.3696, + "step": 63150 + }, + { + "epoch": 0.632, + "grad_norm": 52.924503326416016, + "learning_rate": 7.473199999999999e-07, + "loss": 0.6384, + "step": 63200 + }, + { + "epoch": 0.6325, + "grad_norm": 81.14250946044922, + "learning_rate": 7.471199999999999e-07, + "loss": 0.4821, + "step": 63250 + }, + { + "epoch": 0.633, + "grad_norm": 36.8980827331543, + "learning_rate": 7.4692e-07, + "loss": 0.456, + "step": 63300 + }, + { + "epoch": 0.6335, + "grad_norm": 39.605323791503906, + "learning_rate": 7.4672e-07, + "loss": 0.4922, + "step": 63350 + }, + { + "epoch": 0.634, + "grad_norm": 31.341976165771484, + "learning_rate": 7.465199999999999e-07, + "loss": 0.466, + "step": 63400 + }, + { + "epoch": 0.6345, + "grad_norm": 87.95755004882812, + "learning_rate": 7.4632e-07, + "loss": 0.3872, + "step": 63450 + }, + { + "epoch": 0.635, + "grad_norm": 59.00345993041992, + "learning_rate": 7.4612e-07, + "loss": 0.5011, + "step": 63500 + }, + { + "epoch": 0.6355, + "grad_norm": 31.053945541381836, + "learning_rate": 7.459199999999999e-07, + "loss": 0.6692, + "step": 63550 + }, + { + "epoch": 0.636, + "grad_norm": 47.0079345703125, + "learning_rate": 7.45724e-07, + "loss": 0.6124, + "step": 63600 + }, + { + "epoch": 0.6365, + "grad_norm": 49.89374542236328, + "learning_rate": 7.455239999999999e-07, + "loss": 0.4711, + "step": 63650 + }, + { + "epoch": 0.637, + "grad_norm": 93.30340576171875, + "learning_rate": 7.453239999999999e-07, + "loss": 0.5194, + "step": 63700 + }, + { + "epoch": 0.6375, + "grad_norm": 95.00665283203125, + "learning_rate": 7.45124e-07, + "loss": 0.5065, + "step": 63750 + }, + { + "epoch": 0.638, + "grad_norm": 50.189510345458984, + "learning_rate": 7.44924e-07, + "loss": 0.3856, + "step": 63800 + }, + { + "epoch": 0.6385, + "grad_norm": 37.73897933959961, + "learning_rate": 7.447240000000001e-07, + "loss": 0.4225, + "step": 63850 + }, + { + "epoch": 0.639, + "grad_norm": 28.45961570739746, + "learning_rate": 7.44524e-07, + "loss": 0.5747, + "step": 63900 + }, + { + "epoch": 0.6395, + "grad_norm": 44.61785888671875, + "learning_rate": 7.443239999999999e-07, + "loss": 0.6641, + "step": 63950 + }, + { + "epoch": 0.64, + "grad_norm": 48.99622344970703, + "learning_rate": 7.44124e-07, + "loss": 0.6024, + "step": 64000 + }, + { + "epoch": 0.6405, + "grad_norm": 22.556676864624023, + "learning_rate": 7.43924e-07, + "loss": 0.5411, + "step": 64050 + }, + { + "epoch": 0.641, + "grad_norm": 72.16722106933594, + "learning_rate": 7.43724e-07, + "loss": 0.5184, + "step": 64100 + }, + { + "epoch": 0.6415, + "grad_norm": 32.8302116394043, + "learning_rate": 7.43524e-07, + "loss": 0.4113, + "step": 64150 + }, + { + "epoch": 0.642, + "grad_norm": 47.202213287353516, + "learning_rate": 7.43324e-07, + "loss": 0.6019, + "step": 64200 + }, + { + "epoch": 0.6425, + "grad_norm": 58.76848602294922, + "learning_rate": 7.43124e-07, + "loss": 0.5887, + "step": 64250 + }, + { + "epoch": 0.643, + "grad_norm": 54.78934097290039, + "learning_rate": 7.42924e-07, + "loss": 0.5743, + "step": 64300 + }, + { + "epoch": 0.6435, + "grad_norm": 16.40074348449707, + "learning_rate": 7.42724e-07, + "loss": 0.531, + "step": 64350 + }, + { + "epoch": 0.644, + "grad_norm": 30.374088287353516, + "learning_rate": 7.425239999999999e-07, + "loss": 0.3552, + "step": 64400 + }, + { + "epoch": 0.6445, + "grad_norm": 14.19670295715332, + "learning_rate": 7.42324e-07, + "loss": 0.6266, + "step": 64450 + }, + { + "epoch": 0.645, + "grad_norm": 86.16085052490234, + "learning_rate": 7.42124e-07, + "loss": 0.5935, + "step": 64500 + }, + { + "epoch": 0.6455, + "grad_norm": 84.91423797607422, + "learning_rate": 7.41924e-07, + "loss": 0.4733, + "step": 64550 + }, + { + "epoch": 0.646, + "grad_norm": 72.64830780029297, + "learning_rate": 7.417240000000001e-07, + "loss": 0.4532, + "step": 64600 + }, + { + "epoch": 0.6465, + "grad_norm": 6.822329998016357, + "learning_rate": 7.415239999999999e-07, + "loss": 0.4741, + "step": 64650 + }, + { + "epoch": 0.647, + "grad_norm": 36.594993591308594, + "learning_rate": 7.413239999999999e-07, + "loss": 0.531, + "step": 64700 + }, + { + "epoch": 0.6475, + "grad_norm": 75.4520263671875, + "learning_rate": 7.41124e-07, + "loss": 0.4991, + "step": 64750 + }, + { + "epoch": 0.648, + "grad_norm": 44.896141052246094, + "learning_rate": 7.40924e-07, + "loss": 0.4959, + "step": 64800 + }, + { + "epoch": 0.6485, + "grad_norm": 47.80727767944336, + "learning_rate": 7.40724e-07, + "loss": 0.5498, + "step": 64850 + }, + { + "epoch": 0.649, + "grad_norm": 2.48860502243042, + "learning_rate": 7.40524e-07, + "loss": 0.5012, + "step": 64900 + }, + { + "epoch": 0.6495, + "grad_norm": 16.638439178466797, + "learning_rate": 7.40324e-07, + "loss": 0.6177, + "step": 64950 + }, + { + "epoch": 0.65, + "grad_norm": 44.449302673339844, + "learning_rate": 7.401239999999999e-07, + "loss": 0.4293, + "step": 65000 + }, + { + "epoch": 0.6505, + "grad_norm": 161.3533935546875, + "learning_rate": 7.39924e-07, + "loss": 0.4377, + "step": 65050 + }, + { + "epoch": 0.651, + "grad_norm": 12.957307815551758, + "learning_rate": 7.39724e-07, + "loss": 0.5337, + "step": 65100 + }, + { + "epoch": 0.6515, + "grad_norm": 13.819578170776367, + "learning_rate": 7.395239999999999e-07, + "loss": 0.4665, + "step": 65150 + }, + { + "epoch": 0.652, + "grad_norm": 108.1066665649414, + "learning_rate": 7.39324e-07, + "loss": 0.61, + "step": 65200 + }, + { + "epoch": 0.6525, + "grad_norm": 18.782867431640625, + "learning_rate": 7.39124e-07, + "loss": 0.5585, + "step": 65250 + }, + { + "epoch": 0.653, + "grad_norm": 14.486738204956055, + "learning_rate": 7.38924e-07, + "loss": 0.5132, + "step": 65300 + }, + { + "epoch": 0.6535, + "grad_norm": 28.351959228515625, + "learning_rate": 7.38724e-07, + "loss": 0.4924, + "step": 65350 + }, + { + "epoch": 0.654, + "grad_norm": 49.564491271972656, + "learning_rate": 7.385239999999999e-07, + "loss": 0.485, + "step": 65400 + }, + { + "epoch": 0.6545, + "grad_norm": 55.472877502441406, + "learning_rate": 7.383239999999999e-07, + "loss": 0.5007, + "step": 65450 + }, + { + "epoch": 0.655, + "grad_norm": 37.1424446105957, + "learning_rate": 7.38124e-07, + "loss": 0.518, + "step": 65500 + }, + { + "epoch": 0.6555, + "grad_norm": 30.962854385375977, + "learning_rate": 7.37924e-07, + "loss": 0.5072, + "step": 65550 + }, + { + "epoch": 0.656, + "grad_norm": 63.9365348815918, + "learning_rate": 7.37724e-07, + "loss": 0.5801, + "step": 65600 + }, + { + "epoch": 0.6565, + "grad_norm": 6.695558547973633, + "learning_rate": 7.37524e-07, + "loss": 0.6023, + "step": 65650 + }, + { + "epoch": 0.657, + "grad_norm": 26.7724666595459, + "learning_rate": 7.373239999999999e-07, + "loss": 0.5249, + "step": 65700 + }, + { + "epoch": 0.6575, + "grad_norm": 81.64033508300781, + "learning_rate": 7.371239999999999e-07, + "loss": 0.5675, + "step": 65750 + }, + { + "epoch": 0.658, + "grad_norm": 32.74658966064453, + "learning_rate": 7.36924e-07, + "loss": 0.5071, + "step": 65800 + }, + { + "epoch": 0.6585, + "grad_norm": 58.301597595214844, + "learning_rate": 7.36724e-07, + "loss": 0.5389, + "step": 65850 + }, + { + "epoch": 0.659, + "grad_norm": 53.7860221862793, + "learning_rate": 7.365279999999999e-07, + "loss": 0.4365, + "step": 65900 + }, + { + "epoch": 0.6595, + "grad_norm": 20.44268798828125, + "learning_rate": 7.36328e-07, + "loss": 0.4764, + "step": 65950 + }, + { + "epoch": 0.66, + "grad_norm": 23.711496353149414, + "learning_rate": 7.36128e-07, + "loss": 0.4768, + "step": 66000 + }, + { + "epoch": 0.6605, + "grad_norm": 61.34829330444336, + "learning_rate": 7.359279999999999e-07, + "loss": 0.3809, + "step": 66050 + }, + { + "epoch": 0.661, + "grad_norm": 95.46158599853516, + "learning_rate": 7.35732e-07, + "loss": 0.4626, + "step": 66100 + }, + { + "epoch": 0.6615, + "grad_norm": 50.04861068725586, + "learning_rate": 7.355319999999999e-07, + "loss": 0.6196, + "step": 66150 + }, + { + "epoch": 0.662, + "grad_norm": 111.31083679199219, + "learning_rate": 7.353319999999999e-07, + "loss": 0.4418, + "step": 66200 + }, + { + "epoch": 0.6625, + "grad_norm": 0.3906168043613434, + "learning_rate": 7.35132e-07, + "loss": 0.3852, + "step": 66250 + }, + { + "epoch": 0.663, + "grad_norm": 45.034507751464844, + "learning_rate": 7.34932e-07, + "loss": 0.4873, + "step": 66300 + }, + { + "epoch": 0.6635, + "grad_norm": 39.52790069580078, + "learning_rate": 7.347320000000001e-07, + "loss": 0.332, + "step": 66350 + }, + { + "epoch": 0.664, + "grad_norm": 12.687024116516113, + "learning_rate": 7.34532e-07, + "loss": 0.4187, + "step": 66400 + }, + { + "epoch": 0.6645, + "grad_norm": 79.7389144897461, + "learning_rate": 7.343319999999999e-07, + "loss": 0.3296, + "step": 66450 + }, + { + "epoch": 0.665, + "grad_norm": 99.39762115478516, + "learning_rate": 7.34132e-07, + "loss": 0.44, + "step": 66500 + }, + { + "epoch": 0.6655, + "grad_norm": 1.2789992094039917, + "learning_rate": 7.33932e-07, + "loss": 0.4709, + "step": 66550 + }, + { + "epoch": 0.666, + "grad_norm": 102.41425323486328, + "learning_rate": 7.33732e-07, + "loss": 0.4785, + "step": 66600 + }, + { + "epoch": 0.6665, + "grad_norm": 1.850731611251831, + "learning_rate": 7.33532e-07, + "loss": 0.5505, + "step": 66650 + }, + { + "epoch": 0.667, + "grad_norm": 6.710739612579346, + "learning_rate": 7.33332e-07, + "loss": 0.4497, + "step": 66700 + }, + { + "epoch": 0.6675, + "grad_norm": 27.49475860595703, + "learning_rate": 7.33132e-07, + "loss": 0.4216, + "step": 66750 + }, + { + "epoch": 0.668, + "grad_norm": 109.29817962646484, + "learning_rate": 7.32932e-07, + "loss": 0.5599, + "step": 66800 + }, + { + "epoch": 0.6685, + "grad_norm": 2.05843186378479, + "learning_rate": 7.32732e-07, + "loss": 0.4372, + "step": 66850 + }, + { + "epoch": 0.669, + "grad_norm": 19.476158142089844, + "learning_rate": 7.325319999999999e-07, + "loss": 0.573, + "step": 66900 + }, + { + "epoch": 0.6695, + "grad_norm": 78.01306915283203, + "learning_rate": 7.32332e-07, + "loss": 0.3769, + "step": 66950 + }, + { + "epoch": 0.67, + "grad_norm": 64.33741760253906, + "learning_rate": 7.32132e-07, + "loss": 0.4052, + "step": 67000 + }, + { + "epoch": 0.6705, + "grad_norm": 2.576007843017578, + "learning_rate": 7.31932e-07, + "loss": 0.5676, + "step": 67050 + }, + { + "epoch": 0.671, + "grad_norm": 22.989377975463867, + "learning_rate": 7.317320000000001e-07, + "loss": 0.511, + "step": 67100 + }, + { + "epoch": 0.6715, + "grad_norm": 0.35649439692497253, + "learning_rate": 7.315319999999999e-07, + "loss": 0.5617, + "step": 67150 + }, + { + "epoch": 0.672, + "grad_norm": 95.40701293945312, + "learning_rate": 7.313319999999999e-07, + "loss": 0.5461, + "step": 67200 + }, + { + "epoch": 0.6725, + "grad_norm": 80.1086196899414, + "learning_rate": 7.31132e-07, + "loss": 0.6098, + "step": 67250 + }, + { + "epoch": 0.673, + "grad_norm": 119.30028533935547, + "learning_rate": 7.30932e-07, + "loss": 0.4937, + "step": 67300 + }, + { + "epoch": 0.6735, + "grad_norm": 38.92840576171875, + "learning_rate": 7.30732e-07, + "loss": 0.4933, + "step": 67350 + }, + { + "epoch": 0.674, + "grad_norm": 113.92001342773438, + "learning_rate": 7.30532e-07, + "loss": 0.421, + "step": 67400 + }, + { + "epoch": 0.6745, + "grad_norm": 0.898191511631012, + "learning_rate": 7.30332e-07, + "loss": 0.4647, + "step": 67450 + }, + { + "epoch": 0.675, + "grad_norm": 62.35942077636719, + "learning_rate": 7.301319999999999e-07, + "loss": 0.5297, + "step": 67500 + }, + { + "epoch": 0.6755, + "grad_norm": 10.209319114685059, + "learning_rate": 7.29932e-07, + "loss": 0.4762, + "step": 67550 + }, + { + "epoch": 0.676, + "grad_norm": 84.57727813720703, + "learning_rate": 7.29732e-07, + "loss": 0.6114, + "step": 67600 + }, + { + "epoch": 0.6765, + "grad_norm": 60.19552230834961, + "learning_rate": 7.295319999999999e-07, + "loss": 0.499, + "step": 67650 + }, + { + "epoch": 0.677, + "grad_norm": 12.361583709716797, + "learning_rate": 7.29332e-07, + "loss": 0.3799, + "step": 67700 + }, + { + "epoch": 0.6775, + "grad_norm": 72.01377868652344, + "learning_rate": 7.29132e-07, + "loss": 0.505, + "step": 67750 + }, + { + "epoch": 0.678, + "grad_norm": 66.44666290283203, + "learning_rate": 7.28932e-07, + "loss": 0.6133, + "step": 67800 + }, + { + "epoch": 0.6785, + "grad_norm": 50.07161331176758, + "learning_rate": 7.28732e-07, + "loss": 0.4261, + "step": 67850 + }, + { + "epoch": 0.679, + "grad_norm": 104.22811889648438, + "learning_rate": 7.285319999999999e-07, + "loss": 0.5914, + "step": 67900 + }, + { + "epoch": 0.6795, + "grad_norm": 6.429470062255859, + "learning_rate": 7.283319999999999e-07, + "loss": 0.345, + "step": 67950 + }, + { + "epoch": 0.68, + "grad_norm": 65.51649475097656, + "learning_rate": 7.28132e-07, + "loss": 0.4633, + "step": 68000 + }, + { + "epoch": 0.6805, + "grad_norm": 2.1752583980560303, + "learning_rate": 7.27932e-07, + "loss": 0.4383, + "step": 68050 + }, + { + "epoch": 0.681, + "grad_norm": 22.099319458007812, + "learning_rate": 7.27732e-07, + "loss": 0.5433, + "step": 68100 + }, + { + "epoch": 0.6815, + "grad_norm": 117.40200805664062, + "learning_rate": 7.27532e-07, + "loss": 0.5817, + "step": 68150 + }, + { + "epoch": 0.682, + "grad_norm": 57.13771057128906, + "learning_rate": 7.273319999999999e-07, + "loss": 0.4602, + "step": 68200 + }, + { + "epoch": 0.6825, + "grad_norm": 16.235754013061523, + "learning_rate": 7.271319999999999e-07, + "loss": 0.6009, + "step": 68250 + }, + { + "epoch": 0.683, + "grad_norm": 6.888073921203613, + "learning_rate": 7.26932e-07, + "loss": 0.4454, + "step": 68300 + }, + { + "epoch": 0.6835, + "grad_norm": 4.0116705894470215, + "learning_rate": 7.26732e-07, + "loss": 0.3861, + "step": 68350 + }, + { + "epoch": 0.684, + "grad_norm": 36.852413177490234, + "learning_rate": 7.265319999999999e-07, + "loss": 0.6036, + "step": 68400 + }, + { + "epoch": 0.6845, + "grad_norm": 23.3376522064209, + "learning_rate": 7.26332e-07, + "loss": 0.4487, + "step": 68450 + }, + { + "epoch": 0.685, + "grad_norm": 12.914100646972656, + "learning_rate": 7.26132e-07, + "loss": 0.5701, + "step": 68500 + }, + { + "epoch": 0.6855, + "grad_norm": 39.22294616699219, + "learning_rate": 7.259319999999999e-07, + "loss": 0.6363, + "step": 68550 + }, + { + "epoch": 0.686, + "grad_norm": 38.24320602416992, + "learning_rate": 7.25732e-07, + "loss": 0.3829, + "step": 68600 + }, + { + "epoch": 0.6865, + "grad_norm": 28.683549880981445, + "learning_rate": 7.255319999999999e-07, + "loss": 0.444, + "step": 68650 + }, + { + "epoch": 0.687, + "grad_norm": 14.346872329711914, + "learning_rate": 7.253319999999999e-07, + "loss": 0.5185, + "step": 68700 + }, + { + "epoch": 0.6875, + "grad_norm": 0.12592576444149017, + "learning_rate": 7.25132e-07, + "loss": 0.512, + "step": 68750 + }, + { + "epoch": 0.688, + "grad_norm": 21.978870391845703, + "learning_rate": 7.24932e-07, + "loss": 0.4442, + "step": 68800 + }, + { + "epoch": 0.6885, + "grad_norm": 0.3890920579433441, + "learning_rate": 7.247320000000001e-07, + "loss": 0.4323, + "step": 68850 + }, + { + "epoch": 0.689, + "grad_norm": 46.296112060546875, + "learning_rate": 7.245319999999999e-07, + "loss": 0.5979, + "step": 68900 + }, + { + "epoch": 0.6895, + "grad_norm": 3.0906097888946533, + "learning_rate": 7.243319999999999e-07, + "loss": 0.4466, + "step": 68950 + }, + { + "epoch": 0.69, + "grad_norm": 113.38265991210938, + "learning_rate": 7.24132e-07, + "loss": 0.4887, + "step": 69000 + }, + { + "epoch": 0.6905, + "grad_norm": 81.5102310180664, + "learning_rate": 7.23932e-07, + "loss": 0.6562, + "step": 69050 + }, + { + "epoch": 0.691, + "grad_norm": 0.32676833868026733, + "learning_rate": 7.23732e-07, + "loss": 0.4413, + "step": 69100 + }, + { + "epoch": 0.6915, + "grad_norm": 2.3341333866119385, + "learning_rate": 7.23532e-07, + "loss": 0.4187, + "step": 69150 + }, + { + "epoch": 0.692, + "grad_norm": 51.32745361328125, + "learning_rate": 7.23332e-07, + "loss": 0.5124, + "step": 69200 + }, + { + "epoch": 0.6925, + "grad_norm": 106.96540069580078, + "learning_rate": 7.231319999999999e-07, + "loss": 0.4508, + "step": 69250 + }, + { + "epoch": 0.693, + "grad_norm": 28.311433792114258, + "learning_rate": 7.22932e-07, + "loss": 0.4263, + "step": 69300 + }, + { + "epoch": 0.6935, + "grad_norm": 31.820341110229492, + "learning_rate": 7.22732e-07, + "loss": 0.5164, + "step": 69350 + }, + { + "epoch": 0.694, + "grad_norm": 56.47795867919922, + "learning_rate": 7.225319999999999e-07, + "loss": 0.535, + "step": 69400 + }, + { + "epoch": 0.6945, + "grad_norm": 10.806572914123535, + "learning_rate": 7.22332e-07, + "loss": 0.4787, + "step": 69450 + }, + { + "epoch": 0.695, + "grad_norm": 41.71363067626953, + "learning_rate": 7.22132e-07, + "loss": 0.5698, + "step": 69500 + }, + { + "epoch": 0.6955, + "grad_norm": 14.216658592224121, + "learning_rate": 7.21932e-07, + "loss": 0.4295, + "step": 69550 + }, + { + "epoch": 0.696, + "grad_norm": 73.08179473876953, + "learning_rate": 7.21732e-07, + "loss": 0.4647, + "step": 69600 + }, + { + "epoch": 0.6965, + "grad_norm": 32.81566619873047, + "learning_rate": 7.215319999999999e-07, + "loss": 0.4516, + "step": 69650 + }, + { + "epoch": 0.697, + "grad_norm": 74.85893249511719, + "learning_rate": 7.213319999999999e-07, + "loss": 0.5258, + "step": 69700 + }, + { + "epoch": 0.6975, + "grad_norm": 15.480995178222656, + "learning_rate": 7.21132e-07, + "loss": 0.48, + "step": 69750 + }, + { + "epoch": 0.698, + "grad_norm": 36.054656982421875, + "learning_rate": 7.20932e-07, + "loss": 0.5223, + "step": 69800 + }, + { + "epoch": 0.6985, + "grad_norm": 6.590498447418213, + "learning_rate": 7.20732e-07, + "loss": 0.6518, + "step": 69850 + }, + { + "epoch": 0.699, + "grad_norm": 34.523651123046875, + "learning_rate": 7.20532e-07, + "loss": 0.6054, + "step": 69900 + }, + { + "epoch": 0.6995, + "grad_norm": 413.315185546875, + "learning_rate": 7.203319999999999e-07, + "loss": 0.3861, + "step": 69950 + }, + { + "epoch": 0.7, + "grad_norm": 28.01694679260254, + "learning_rate": 7.201319999999999e-07, + "loss": 0.5878, + "step": 70000 + }, + { + "epoch": 0.7005, + "grad_norm": 4.377991676330566, + "learning_rate": 7.19932e-07, + "loss": 0.5515, + "step": 70050 + }, + { + "epoch": 0.701, + "grad_norm": 55.94136428833008, + "learning_rate": 7.19732e-07, + "loss": 0.4695, + "step": 70100 + }, + { + "epoch": 0.7015, + "grad_norm": 68.08269500732422, + "learning_rate": 7.195359999999999e-07, + "loss": 0.5501, + "step": 70150 + }, + { + "epoch": 0.702, + "grad_norm": 13.8864107131958, + "learning_rate": 7.19336e-07, + "loss": 0.4736, + "step": 70200 + }, + { + "epoch": 0.7025, + "grad_norm": 12.973597526550293, + "learning_rate": 7.19136e-07, + "loss": 0.6711, + "step": 70250 + }, + { + "epoch": 0.703, + "grad_norm": 59.53081512451172, + "learning_rate": 7.189359999999999e-07, + "loss": 0.6633, + "step": 70300 + }, + { + "epoch": 0.7035, + "grad_norm": 33.69902420043945, + "learning_rate": 7.18736e-07, + "loss": 0.5304, + "step": 70350 + }, + { + "epoch": 0.704, + "grad_norm": 4.359379768371582, + "learning_rate": 7.185359999999999e-07, + "loss": 0.5243, + "step": 70400 + }, + { + "epoch": 0.7045, + "grad_norm": 42.60222625732422, + "learning_rate": 7.183359999999999e-07, + "loss": 0.4602, + "step": 70450 + }, + { + "epoch": 0.705, + "grad_norm": 1.3176884651184082, + "learning_rate": 7.18136e-07, + "loss": 0.4459, + "step": 70500 + }, + { + "epoch": 0.7055, + "grad_norm": 3.021962881088257, + "learning_rate": 7.17936e-07, + "loss": 0.4491, + "step": 70550 + }, + { + "epoch": 0.706, + "grad_norm": 274.6659851074219, + "learning_rate": 7.17736e-07, + "loss": 0.5348, + "step": 70600 + }, + { + "epoch": 0.7065, + "grad_norm": 59.16705322265625, + "learning_rate": 7.17536e-07, + "loss": 0.4527, + "step": 70650 + }, + { + "epoch": 0.707, + "grad_norm": 19.327486038208008, + "learning_rate": 7.173359999999999e-07, + "loss": 0.6739, + "step": 70700 + }, + { + "epoch": 0.7075, + "grad_norm": 10.360088348388672, + "learning_rate": 7.171359999999999e-07, + "loss": 0.621, + "step": 70750 + }, + { + "epoch": 0.708, + "grad_norm": 65.7010726928711, + "learning_rate": 7.16936e-07, + "loss": 0.3637, + "step": 70800 + }, + { + "epoch": 0.7085, + "grad_norm": 7.891746997833252, + "learning_rate": 7.16736e-07, + "loss": 0.4372, + "step": 70850 + }, + { + "epoch": 0.709, + "grad_norm": 46.58807373046875, + "learning_rate": 7.165359999999999e-07, + "loss": 0.4355, + "step": 70900 + }, + { + "epoch": 0.7095, + "grad_norm": 82.19285583496094, + "learning_rate": 7.16336e-07, + "loss": 0.3844, + "step": 70950 + }, + { + "epoch": 0.71, + "grad_norm": 44.19703674316406, + "learning_rate": 7.16136e-07, + "loss": 0.4022, + "step": 71000 + }, + { + "epoch": 0.7105, + "grad_norm": 64.81764221191406, + "learning_rate": 7.159359999999999e-07, + "loss": 0.4597, + "step": 71050 + }, + { + "epoch": 0.711, + "grad_norm": 4.582440376281738, + "learning_rate": 7.15736e-07, + "loss": 0.3086, + "step": 71100 + }, + { + "epoch": 0.7115, + "grad_norm": 95.07144165039062, + "learning_rate": 7.155359999999999e-07, + "loss": 0.4794, + "step": 71150 + }, + { + "epoch": 0.712, + "grad_norm": 47.082794189453125, + "learning_rate": 7.153359999999999e-07, + "loss": 0.5136, + "step": 71200 + }, + { + "epoch": 0.7125, + "grad_norm": 58.89522171020508, + "learning_rate": 7.15136e-07, + "loss": 0.4111, + "step": 71250 + }, + { + "epoch": 0.713, + "grad_norm": 76.09228515625, + "learning_rate": 7.14936e-07, + "loss": 0.493, + "step": 71300 + }, + { + "epoch": 0.7135, + "grad_norm": 7.7800211906433105, + "learning_rate": 7.147360000000001e-07, + "loss": 0.3391, + "step": 71350 + }, + { + "epoch": 0.714, + "grad_norm": 24.84239959716797, + "learning_rate": 7.145359999999999e-07, + "loss": 0.5557, + "step": 71400 + }, + { + "epoch": 0.7145, + "grad_norm": 1.11128568649292, + "learning_rate": 7.143359999999999e-07, + "loss": 0.4902, + "step": 71450 + }, + { + "epoch": 0.715, + "grad_norm": 67.38085174560547, + "learning_rate": 7.14136e-07, + "loss": 0.5157, + "step": 71500 + }, + { + "epoch": 0.7155, + "grad_norm": 58.82173538208008, + "learning_rate": 7.13936e-07, + "loss": 0.4311, + "step": 71550 + }, + { + "epoch": 0.716, + "grad_norm": 10.34243106842041, + "learning_rate": 7.13736e-07, + "loss": 0.5631, + "step": 71600 + }, + { + "epoch": 0.7165, + "grad_norm": 31.23766326904297, + "learning_rate": 7.13536e-07, + "loss": 0.4615, + "step": 71650 + }, + { + "epoch": 0.717, + "grad_norm": 90.4206314086914, + "learning_rate": 7.13336e-07, + "loss": 0.6139, + "step": 71700 + }, + { + "epoch": 0.7175, + "grad_norm": 17.178178787231445, + "learning_rate": 7.131359999999999e-07, + "loss": 0.5126, + "step": 71750 + }, + { + "epoch": 0.718, + "grad_norm": 60.4676399230957, + "learning_rate": 7.12936e-07, + "loss": 0.6, + "step": 71800 + }, + { + "epoch": 0.7185, + "grad_norm": 1.8041740655899048, + "learning_rate": 7.12736e-07, + "loss": 0.4853, + "step": 71850 + }, + { + "epoch": 0.719, + "grad_norm": 60.83064270019531, + "learning_rate": 7.125359999999999e-07, + "loss": 0.6092, + "step": 71900 + }, + { + "epoch": 0.7195, + "grad_norm": 59.2396354675293, + "learning_rate": 7.12336e-07, + "loss": 0.5124, + "step": 71950 + }, + { + "epoch": 0.72, + "grad_norm": 66.30610656738281, + "learning_rate": 7.12136e-07, + "loss": 0.5062, + "step": 72000 + }, + { + "epoch": 0.7205, + "grad_norm": 116.58683013916016, + "learning_rate": 7.11936e-07, + "loss": 0.561, + "step": 72050 + }, + { + "epoch": 0.721, + "grad_norm": 64.6193618774414, + "learning_rate": 7.11736e-07, + "loss": 0.5903, + "step": 72100 + }, + { + "epoch": 0.7215, + "grad_norm": 29.871625900268555, + "learning_rate": 7.115359999999999e-07, + "loss": 0.3967, + "step": 72150 + }, + { + "epoch": 0.722, + "grad_norm": 48.294960021972656, + "learning_rate": 7.113359999999999e-07, + "loss": 0.4418, + "step": 72200 + }, + { + "epoch": 0.7225, + "grad_norm": 1.5724999904632568, + "learning_rate": 7.11136e-07, + "loss": 0.5528, + "step": 72250 + }, + { + "epoch": 0.723, + "grad_norm": 10.528818130493164, + "learning_rate": 7.10936e-07, + "loss": 0.3585, + "step": 72300 + }, + { + "epoch": 0.7235, + "grad_norm": 26.029348373413086, + "learning_rate": 7.10736e-07, + "loss": 0.5177, + "step": 72350 + }, + { + "epoch": 0.724, + "grad_norm": 92.28126525878906, + "learning_rate": 7.10536e-07, + "loss": 0.3724, + "step": 72400 + }, + { + "epoch": 0.7245, + "grad_norm": 14.755386352539062, + "learning_rate": 7.103359999999999e-07, + "loss": 0.4839, + "step": 72450 + }, + { + "epoch": 0.725, + "grad_norm": 32.792755126953125, + "learning_rate": 7.101359999999999e-07, + "loss": 0.5051, + "step": 72500 + }, + { + "epoch": 0.7255, + "grad_norm": 44.9954948425293, + "learning_rate": 7.09936e-07, + "loss": 0.572, + "step": 72550 + }, + { + "epoch": 0.726, + "grad_norm": 61.472633361816406, + "learning_rate": 7.09736e-07, + "loss": 0.3723, + "step": 72600 + }, + { + "epoch": 0.7265, + "grad_norm": 56.659156799316406, + "learning_rate": 7.095359999999999e-07, + "loss": 0.5166, + "step": 72650 + }, + { + "epoch": 0.727, + "grad_norm": 66.07804107666016, + "learning_rate": 7.09336e-07, + "loss": 0.5227, + "step": 72700 + }, + { + "epoch": 0.7275, + "grad_norm": 18.027997970581055, + "learning_rate": 7.09136e-07, + "loss": 0.5235, + "step": 72750 + }, + { + "epoch": 0.728, + "grad_norm": 71.68598937988281, + "learning_rate": 7.089359999999999e-07, + "loss": 0.4355, + "step": 72800 + }, + { + "epoch": 0.7285, + "grad_norm": 0.8782522082328796, + "learning_rate": 7.08736e-07, + "loss": 0.4416, + "step": 72850 + }, + { + "epoch": 0.729, + "grad_norm": 66.14490509033203, + "learning_rate": 7.08536e-07, + "loss": 0.4943, + "step": 72900 + }, + { + "epoch": 0.7295, + "grad_norm": 17.617177963256836, + "learning_rate": 7.083359999999999e-07, + "loss": 0.597, + "step": 72950 + }, + { + "epoch": 0.73, + "grad_norm": 95.89069366455078, + "learning_rate": 7.08136e-07, + "loss": 0.5914, + "step": 73000 + }, + { + "epoch": 0.7305, + "grad_norm": 6.234703063964844, + "learning_rate": 7.07936e-07, + "loss": 0.432, + "step": 73050 + }, + { + "epoch": 0.731, + "grad_norm": 81.79715728759766, + "learning_rate": 7.07736e-07, + "loss": 0.5353, + "step": 73100 + }, + { + "epoch": 0.7315, + "grad_norm": 68.71401977539062, + "learning_rate": 7.07536e-07, + "loss": 0.4773, + "step": 73150 + }, + { + "epoch": 0.732, + "grad_norm": 20.033000946044922, + "learning_rate": 7.073359999999999e-07, + "loss": 0.3961, + "step": 73200 + }, + { + "epoch": 0.7325, + "grad_norm": 0.8714961409568787, + "learning_rate": 7.071359999999999e-07, + "loss": 0.4119, + "step": 73250 + }, + { + "epoch": 0.733, + "grad_norm": 67.67875671386719, + "learning_rate": 7.06936e-07, + "loss": 0.4616, + "step": 73300 + }, + { + "epoch": 0.7335, + "grad_norm": 170.6661376953125, + "learning_rate": 7.06736e-07, + "loss": 0.4623, + "step": 73350 + }, + { + "epoch": 0.734, + "grad_norm": 22.065210342407227, + "learning_rate": 7.06536e-07, + "loss": 0.5301, + "step": 73400 + }, + { + "epoch": 0.7345, + "grad_norm": 116.2767562866211, + "learning_rate": 7.06336e-07, + "loss": 0.6367, + "step": 73450 + }, + { + "epoch": 0.735, + "grad_norm": 118.61943817138672, + "learning_rate": 7.061359999999999e-07, + "loss": 0.4786, + "step": 73500 + }, + { + "epoch": 0.7355, + "grad_norm": 10.633493423461914, + "learning_rate": 7.05936e-07, + "loss": 0.5042, + "step": 73550 + }, + { + "epoch": 0.736, + "grad_norm": 15.222790718078613, + "learning_rate": 7.05736e-07, + "loss": 0.5449, + "step": 73600 + }, + { + "epoch": 0.7365, + "grad_norm": 4.541469097137451, + "learning_rate": 7.05536e-07, + "loss": 0.4413, + "step": 73650 + }, + { + "epoch": 0.737, + "grad_norm": 50.30397033691406, + "learning_rate": 7.05336e-07, + "loss": 0.4317, + "step": 73700 + }, + { + "epoch": 0.7375, + "grad_norm": 4.888488292694092, + "learning_rate": 7.05136e-07, + "loss": 0.4241, + "step": 73750 + }, + { + "epoch": 0.738, + "grad_norm": 46.71607971191406, + "learning_rate": 7.04936e-07, + "loss": 0.4109, + "step": 73800 + }, + { + "epoch": 0.7385, + "grad_norm": 139.61004638671875, + "learning_rate": 7.04736e-07, + "loss": 0.5672, + "step": 73850 + }, + { + "epoch": 0.739, + "grad_norm": 96.24576568603516, + "learning_rate": 7.04536e-07, + "loss": 0.5308, + "step": 73900 + }, + { + "epoch": 0.7395, + "grad_norm": 3.3251993656158447, + "learning_rate": 7.043359999999999e-07, + "loss": 0.5206, + "step": 73950 + }, + { + "epoch": 0.74, + "grad_norm": 64.65046691894531, + "learning_rate": 7.04136e-07, + "loss": 0.3965, + "step": 74000 + }, + { + "epoch": 0.7405, + "grad_norm": 79.69112396240234, + "learning_rate": 7.03936e-07, + "loss": 0.571, + "step": 74050 + }, + { + "epoch": 0.741, + "grad_norm": 86.38766479492188, + "learning_rate": 7.03736e-07, + "loss": 0.6437, + "step": 74100 + }, + { + "epoch": 0.7415, + "grad_norm": 43.816471099853516, + "learning_rate": 7.035360000000001e-07, + "loss": 0.4205, + "step": 74150 + }, + { + "epoch": 0.742, + "grad_norm": 41.845672607421875, + "learning_rate": 7.033359999999999e-07, + "loss": 0.3473, + "step": 74200 + }, + { + "epoch": 0.7425, + "grad_norm": 0.8183789849281311, + "learning_rate": 7.031359999999999e-07, + "loss": 0.4469, + "step": 74250 + }, + { + "epoch": 0.743, + "grad_norm": 3.786691427230835, + "learning_rate": 7.02936e-07, + "loss": 0.5015, + "step": 74300 + }, + { + "epoch": 0.7435, + "grad_norm": 78.48735046386719, + "learning_rate": 7.02736e-07, + "loss": 0.624, + "step": 74350 + }, + { + "epoch": 0.744, + "grad_norm": 104.10353088378906, + "learning_rate": 7.02536e-07, + "loss": 0.4875, + "step": 74400 + }, + { + "epoch": 0.7445, + "grad_norm": 6.0870819091796875, + "learning_rate": 7.02336e-07, + "loss": 0.4004, + "step": 74450 + }, + { + "epoch": 0.745, + "grad_norm": 13.105294227600098, + "learning_rate": 7.02136e-07, + "loss": 0.4386, + "step": 74500 + }, + { + "epoch": 0.7455, + "grad_norm": 51.39162826538086, + "learning_rate": 7.019359999999999e-07, + "loss": 0.4182, + "step": 74550 + }, + { + "epoch": 0.746, + "grad_norm": 91.84526824951172, + "learning_rate": 7.01736e-07, + "loss": 0.5552, + "step": 74600 + }, + { + "epoch": 0.7465, + "grad_norm": 9.574994087219238, + "learning_rate": 7.01536e-07, + "loss": 0.4372, + "step": 74650 + }, + { + "epoch": 0.747, + "grad_norm": 75.70382690429688, + "learning_rate": 7.013359999999999e-07, + "loss": 0.5477, + "step": 74700 + }, + { + "epoch": 0.7475, + "grad_norm": 59.122093200683594, + "learning_rate": 7.01136e-07, + "loss": 0.5786, + "step": 74750 + }, + { + "epoch": 0.748, + "grad_norm": 92.53333282470703, + "learning_rate": 7.00936e-07, + "loss": 0.5653, + "step": 74800 + }, + { + "epoch": 0.7485, + "grad_norm": 93.97532653808594, + "learning_rate": 7.00736e-07, + "loss": 0.3929, + "step": 74850 + }, + { + "epoch": 0.749, + "grad_norm": 1.357946515083313, + "learning_rate": 7.00536e-07, + "loss": 0.3806, + "step": 74900 + }, + { + "epoch": 0.7495, + "grad_norm": 1.4514474868774414, + "learning_rate": 7.003359999999999e-07, + "loss": 0.4542, + "step": 74950 + }, + { + "epoch": 0.75, + "grad_norm": 60.96038055419922, + "learning_rate": 7.001359999999999e-07, + "loss": 0.4032, + "step": 75000 + }, + { + "epoch": 0.7505, + "grad_norm": 54.15022659301758, + "learning_rate": 6.99936e-07, + "loss": 0.5132, + "step": 75050 + }, + { + "epoch": 0.751, + "grad_norm": 92.10535430908203, + "learning_rate": 6.99736e-07, + "loss": 0.5717, + "step": 75100 + }, + { + "epoch": 0.7515, + "grad_norm": 67.41671752929688, + "learning_rate": 6.99536e-07, + "loss": 0.4319, + "step": 75150 + }, + { + "epoch": 0.752, + "grad_norm": 27.999591827392578, + "learning_rate": 6.99336e-07, + "loss": 0.5642, + "step": 75200 + }, + { + "epoch": 0.7525, + "grad_norm": 54.86029815673828, + "learning_rate": 6.991359999999999e-07, + "loss": 0.6475, + "step": 75250 + }, + { + "epoch": 0.753, + "grad_norm": 38.79415512084961, + "learning_rate": 6.989359999999999e-07, + "loss": 0.3514, + "step": 75300 + }, + { + "epoch": 0.7535, + "grad_norm": 2.452829360961914, + "learning_rate": 6.98736e-07, + "loss": 0.5446, + "step": 75350 + }, + { + "epoch": 0.754, + "grad_norm": 118.6248550415039, + "learning_rate": 6.98536e-07, + "loss": 0.5, + "step": 75400 + }, + { + "epoch": 0.7545, + "grad_norm": 70.82340240478516, + "learning_rate": 6.983359999999999e-07, + "loss": 0.5008, + "step": 75450 + }, + { + "epoch": 0.755, + "grad_norm": 42.31960678100586, + "learning_rate": 6.9814e-07, + "loss": 0.4178, + "step": 75500 + }, + { + "epoch": 0.7555, + "grad_norm": 90.04888916015625, + "learning_rate": 6.9794e-07, + "loss": 0.5661, + "step": 75550 + }, + { + "epoch": 0.756, + "grad_norm": 21.198366165161133, + "learning_rate": 6.977399999999999e-07, + "loss": 0.4164, + "step": 75600 + }, + { + "epoch": 0.7565, + "grad_norm": 78.9493408203125, + "learning_rate": 6.9754e-07, + "loss": 0.5132, + "step": 75650 + }, + { + "epoch": 0.757, + "grad_norm": 103.06893920898438, + "learning_rate": 6.973399999999999e-07, + "loss": 0.5996, + "step": 75700 + }, + { + "epoch": 0.7575, + "grad_norm": 4.515233993530273, + "learning_rate": 6.971399999999999e-07, + "loss": 0.4079, + "step": 75750 + }, + { + "epoch": 0.758, + "grad_norm": 68.41087341308594, + "learning_rate": 6.9694e-07, + "loss": 0.557, + "step": 75800 + }, + { + "epoch": 0.7585, + "grad_norm": 4.536024570465088, + "learning_rate": 6.9674e-07, + "loss": 0.4946, + "step": 75850 + }, + { + "epoch": 0.759, + "grad_norm": 43.321983337402344, + "learning_rate": 6.965400000000001e-07, + "loss": 0.4212, + "step": 75900 + }, + { + "epoch": 0.7595, + "grad_norm": 57.4993896484375, + "learning_rate": 6.9634e-07, + "loss": 0.4643, + "step": 75950 + }, + { + "epoch": 0.76, + "grad_norm": 58.03748321533203, + "learning_rate": 6.961399999999999e-07, + "loss": 0.5878, + "step": 76000 + }, + { + "epoch": 0.7605, + "grad_norm": 5.033849239349365, + "learning_rate": 6.9594e-07, + "loss": 0.6593, + "step": 76050 + }, + { + "epoch": 0.761, + "grad_norm": 75.7629623413086, + "learning_rate": 6.9574e-07, + "loss": 0.4768, + "step": 76100 + }, + { + "epoch": 0.7615, + "grad_norm": 60.79175567626953, + "learning_rate": 6.9554e-07, + "loss": 0.5528, + "step": 76150 + }, + { + "epoch": 0.762, + "grad_norm": 71.26956176757812, + "learning_rate": 6.9534e-07, + "loss": 0.587, + "step": 76200 + }, + { + "epoch": 0.7625, + "grad_norm": 3.231938362121582, + "learning_rate": 6.9514e-07, + "loss": 0.4488, + "step": 76250 + }, + { + "epoch": 0.763, + "grad_norm": 2.168018102645874, + "learning_rate": 6.9494e-07, + "loss": 0.4011, + "step": 76300 + }, + { + "epoch": 0.7635, + "grad_norm": 2.409412145614624, + "learning_rate": 6.9474e-07, + "loss": 0.5966, + "step": 76350 + }, + { + "epoch": 0.764, + "grad_norm": 8.960734367370605, + "learning_rate": 6.9454e-07, + "loss": 0.4323, + "step": 76400 + }, + { + "epoch": 0.7645, + "grad_norm": 74.19822692871094, + "learning_rate": 6.943399999999999e-07, + "loss": 0.3678, + "step": 76450 + }, + { + "epoch": 0.765, + "grad_norm": 75.17100524902344, + "learning_rate": 6.9414e-07, + "loss": 0.4418, + "step": 76500 + }, + { + "epoch": 0.7655, + "grad_norm": 23.91132164001465, + "learning_rate": 6.9394e-07, + "loss": 0.4939, + "step": 76550 + }, + { + "epoch": 0.766, + "grad_norm": 33.52251052856445, + "learning_rate": 6.9374e-07, + "loss": 0.4463, + "step": 76600 + }, + { + "epoch": 0.7665, + "grad_norm": 119.65557861328125, + "learning_rate": 6.935400000000001e-07, + "loss": 0.6833, + "step": 76650 + }, + { + "epoch": 0.767, + "grad_norm": 46.15104675292969, + "learning_rate": 6.933399999999999e-07, + "loss": 0.4756, + "step": 76700 + }, + { + "epoch": 0.7675, + "grad_norm": 1.8948994874954224, + "learning_rate": 6.931399999999999e-07, + "loss": 0.535, + "step": 76750 + }, + { + "epoch": 0.768, + "grad_norm": 38.17501449584961, + "learning_rate": 6.9294e-07, + "loss": 0.428, + "step": 76800 + }, + { + "epoch": 0.7685, + "grad_norm": 65.1424789428711, + "learning_rate": 6.9274e-07, + "loss": 0.5542, + "step": 76850 + }, + { + "epoch": 0.769, + "grad_norm": 88.36298370361328, + "learning_rate": 6.9254e-07, + "loss": 0.3903, + "step": 76900 + }, + { + "epoch": 0.7695, + "grad_norm": 152.0764923095703, + "learning_rate": 6.9234e-07, + "loss": 0.4659, + "step": 76950 + }, + { + "epoch": 0.77, + "grad_norm": 47.97085189819336, + "learning_rate": 6.9214e-07, + "loss": 0.5331, + "step": 77000 + }, + { + "epoch": 0.7705, + "grad_norm": 91.40066528320312, + "learning_rate": 6.919399999999999e-07, + "loss": 0.5595, + "step": 77050 + }, + { + "epoch": 0.771, + "grad_norm": 8.514924049377441, + "learning_rate": 6.9174e-07, + "loss": 0.3657, + "step": 77100 + }, + { + "epoch": 0.7715, + "grad_norm": 24.74837303161621, + "learning_rate": 6.9154e-07, + "loss": 0.4459, + "step": 77150 + }, + { + "epoch": 0.772, + "grad_norm": 87.70143127441406, + "learning_rate": 6.913399999999999e-07, + "loss": 0.4602, + "step": 77200 + }, + { + "epoch": 0.7725, + "grad_norm": 20.411006927490234, + "learning_rate": 6.9114e-07, + "loss": 0.4812, + "step": 77250 + }, + { + "epoch": 0.773, + "grad_norm": 74.98430633544922, + "learning_rate": 6.9094e-07, + "loss": 0.4783, + "step": 77300 + }, + { + "epoch": 0.7735, + "grad_norm": 15.123334884643555, + "learning_rate": 6.9074e-07, + "loss": 0.289, + "step": 77350 + }, + { + "epoch": 0.774, + "grad_norm": 81.70301818847656, + "learning_rate": 6.9054e-07, + "loss": 0.5672, + "step": 77400 + }, + { + "epoch": 0.7745, + "grad_norm": 0.1056661456823349, + "learning_rate": 6.903399999999999e-07, + "loss": 0.3384, + "step": 77450 + }, + { + "epoch": 0.775, + "grad_norm": 40.181251525878906, + "learning_rate": 6.901399999999999e-07, + "loss": 0.5383, + "step": 77500 + }, + { + "epoch": 0.7755, + "grad_norm": 77.13603973388672, + "learning_rate": 6.8994e-07, + "loss": 0.5013, + "step": 77550 + }, + { + "epoch": 0.776, + "grad_norm": 12.230585098266602, + "learning_rate": 6.8974e-07, + "loss": 0.4305, + "step": 77600 + }, + { + "epoch": 0.7765, + "grad_norm": 130.13206481933594, + "learning_rate": 6.8954e-07, + "loss": 0.4474, + "step": 77650 + }, + { + "epoch": 0.777, + "grad_norm": 7.258871555328369, + "learning_rate": 6.8934e-07, + "loss": 0.5191, + "step": 77700 + }, + { + "epoch": 0.7775, + "grad_norm": 74.94843292236328, + "learning_rate": 6.891399999999999e-07, + "loss": 0.4981, + "step": 77750 + }, + { + "epoch": 0.778, + "grad_norm": 37.118385314941406, + "learning_rate": 6.889399999999999e-07, + "loss": 0.5874, + "step": 77800 + }, + { + "epoch": 0.7785, + "grad_norm": 66.5859146118164, + "learning_rate": 6.8874e-07, + "loss": 0.4819, + "step": 77850 + }, + { + "epoch": 0.779, + "grad_norm": 44.33268356323242, + "learning_rate": 6.8854e-07, + "loss": 0.5725, + "step": 77900 + }, + { + "epoch": 0.7795, + "grad_norm": 23.16948890686035, + "learning_rate": 6.883399999999999e-07, + "loss": 0.4699, + "step": 77950 + }, + { + "epoch": 0.78, + "grad_norm": 84.33336639404297, + "learning_rate": 6.8814e-07, + "loss": 0.5684, + "step": 78000 + }, + { + "epoch": 0.7805, + "grad_norm": 52.04594039916992, + "learning_rate": 6.8794e-07, + "loss": 0.5498, + "step": 78050 + }, + { + "epoch": 0.781, + "grad_norm": 9.365368843078613, + "learning_rate": 6.877399999999999e-07, + "loss": 0.4686, + "step": 78100 + }, + { + "epoch": 0.7815, + "grad_norm": 20.72178077697754, + "learning_rate": 6.8754e-07, + "loss": 0.5012, + "step": 78150 + }, + { + "epoch": 0.782, + "grad_norm": 35.4743537902832, + "learning_rate": 6.873399999999999e-07, + "loss": 0.5656, + "step": 78200 + }, + { + "epoch": 0.7825, + "grad_norm": 58.698486328125, + "learning_rate": 6.871399999999999e-07, + "loss": 0.5637, + "step": 78250 + }, + { + "epoch": 0.783, + "grad_norm": 4.087533950805664, + "learning_rate": 6.8694e-07, + "loss": 0.4656, + "step": 78300 + }, + { + "epoch": 0.7835, + "grad_norm": 114.83053588867188, + "learning_rate": 6.8674e-07, + "loss": 0.5119, + "step": 78350 + }, + { + "epoch": 0.784, + "grad_norm": 22.25564193725586, + "learning_rate": 6.865400000000001e-07, + "loss": 0.4064, + "step": 78400 + }, + { + "epoch": 0.7845, + "grad_norm": 25.038494110107422, + "learning_rate": 6.863399999999999e-07, + "loss": 0.4126, + "step": 78450 + }, + { + "epoch": 0.785, + "grad_norm": 77.9610824584961, + "learning_rate": 6.861399999999999e-07, + "loss": 0.4798, + "step": 78500 + }, + { + "epoch": 0.7855, + "grad_norm": 48.64069366455078, + "learning_rate": 6.8594e-07, + "loss": 0.4212, + "step": 78550 + }, + { + "epoch": 0.786, + "grad_norm": 33.35960006713867, + "learning_rate": 6.8574e-07, + "loss": 0.4711, + "step": 78600 + }, + { + "epoch": 0.7865, + "grad_norm": 8.562115669250488, + "learning_rate": 6.8554e-07, + "loss": 0.5208, + "step": 78650 + }, + { + "epoch": 0.787, + "grad_norm": 84.58731079101562, + "learning_rate": 6.8534e-07, + "loss": 0.5262, + "step": 78700 + }, + { + "epoch": 0.7875, + "grad_norm": 35.85997009277344, + "learning_rate": 6.8514e-07, + "loss": 0.6896, + "step": 78750 + }, + { + "epoch": 0.788, + "grad_norm": 84.28465270996094, + "learning_rate": 6.849399999999999e-07, + "loss": 0.6728, + "step": 78800 + }, + { + "epoch": 0.7885, + "grad_norm": 105.6515121459961, + "learning_rate": 6.8474e-07, + "loss": 0.5276, + "step": 78850 + }, + { + "epoch": 0.789, + "grad_norm": 74.69842529296875, + "learning_rate": 6.8454e-07, + "loss": 0.5207, + "step": 78900 + }, + { + "epoch": 0.7895, + "grad_norm": 60.617088317871094, + "learning_rate": 6.843399999999999e-07, + "loss": 0.4344, + "step": 78950 + }, + { + "epoch": 0.79, + "grad_norm": 64.80194854736328, + "learning_rate": 6.8414e-07, + "loss": 0.3582, + "step": 79000 + }, + { + "epoch": 0.7905, + "grad_norm": 9.421799659729004, + "learning_rate": 6.8394e-07, + "loss": 0.3847, + "step": 79050 + }, + { + "epoch": 0.791, + "grad_norm": 27.81099510192871, + "learning_rate": 6.8374e-07, + "loss": 0.5735, + "step": 79100 + }, + { + "epoch": 0.7915, + "grad_norm": 11.743189811706543, + "learning_rate": 6.8354e-07, + "loss": 0.3546, + "step": 79150 + }, + { + "epoch": 0.792, + "grad_norm": 2.8396387100219727, + "learning_rate": 6.833399999999999e-07, + "loss": 0.3229, + "step": 79200 + }, + { + "epoch": 0.7925, + "grad_norm": 0.9796394109725952, + "learning_rate": 6.831399999999999e-07, + "loss": 0.4796, + "step": 79250 + }, + { + "epoch": 0.793, + "grad_norm": 64.07000732421875, + "learning_rate": 6.8294e-07, + "loss": 0.4583, + "step": 79300 + }, + { + "epoch": 0.7935, + "grad_norm": 91.14384460449219, + "learning_rate": 6.8274e-07, + "loss": 0.5557, + "step": 79350 + }, + { + "epoch": 0.794, + "grad_norm": 67.72103881835938, + "learning_rate": 6.8254e-07, + "loss": 0.4973, + "step": 79400 + }, + { + "epoch": 0.7945, + "grad_norm": 76.02549743652344, + "learning_rate": 6.8234e-07, + "loss": 0.4128, + "step": 79450 + }, + { + "epoch": 0.795, + "grad_norm": 14.491253852844238, + "learning_rate": 6.821399999999999e-07, + "loss": 0.5534, + "step": 79500 + }, + { + "epoch": 0.7955, + "grad_norm": 0.9683569073677063, + "learning_rate": 6.819399999999999e-07, + "loss": 0.6786, + "step": 79550 + }, + { + "epoch": 0.796, + "grad_norm": 73.29056549072266, + "learning_rate": 6.8174e-07, + "loss": 0.4019, + "step": 79600 + }, + { + "epoch": 0.7965, + "grad_norm": 44.23388671875, + "learning_rate": 6.8154e-07, + "loss": 0.4544, + "step": 79650 + }, + { + "epoch": 0.797, + "grad_norm": 0.06450020521879196, + "learning_rate": 6.813439999999999e-07, + "loss": 0.3565, + "step": 79700 + }, + { + "epoch": 0.7975, + "grad_norm": 79.00003814697266, + "learning_rate": 6.81144e-07, + "loss": 0.5056, + "step": 79750 + }, + { + "epoch": 0.798, + "grad_norm": 25.8265438079834, + "learning_rate": 6.80944e-07, + "loss": 0.4529, + "step": 79800 + }, + { + "epoch": 0.7985, + "grad_norm": 67.03202056884766, + "learning_rate": 6.807439999999999e-07, + "loss": 0.4031, + "step": 79850 + }, + { + "epoch": 0.799, + "grad_norm": 67.00475311279297, + "learning_rate": 6.80544e-07, + "loss": 0.4486, + "step": 79900 + }, + { + "epoch": 0.7995, + "grad_norm": 11.94206714630127, + "learning_rate": 6.803439999999999e-07, + "loss": 0.4968, + "step": 79950 + }, + { + "epoch": 0.8, + "grad_norm": 73.1527099609375, + "learning_rate": 6.801439999999999e-07, + "loss": 0.379, + "step": 80000 + }, + { + "epoch": 0.8005, + "grad_norm": 34.18701934814453, + "learning_rate": 6.79944e-07, + "loss": 0.4137, + "step": 80050 + }, + { + "epoch": 0.801, + "grad_norm": 64.62896728515625, + "learning_rate": 6.79744e-07, + "loss": 0.6624, + "step": 80100 + }, + { + "epoch": 0.8015, + "grad_norm": 52.73872375488281, + "learning_rate": 6.79544e-07, + "loss": 0.4575, + "step": 80150 + }, + { + "epoch": 0.802, + "grad_norm": 76.4136734008789, + "learning_rate": 6.793439999999999e-07, + "loss": 0.5554, + "step": 80200 + }, + { + "epoch": 0.8025, + "grad_norm": 27.00578498840332, + "learning_rate": 6.791439999999999e-07, + "loss": 0.5405, + "step": 80250 + }, + { + "epoch": 0.803, + "grad_norm": 33.01068115234375, + "learning_rate": 6.789439999999999e-07, + "loss": 0.5504, + "step": 80300 + }, + { + "epoch": 0.8035, + "grad_norm": 0.1820012480020523, + "learning_rate": 6.78752e-07, + "loss": 0.5296, + "step": 80350 + }, + { + "epoch": 0.804, + "grad_norm": 73.24763488769531, + "learning_rate": 6.78552e-07, + "loss": 0.4904, + "step": 80400 + }, + { + "epoch": 0.8045, + "grad_norm": 53.44038772583008, + "learning_rate": 6.783519999999999e-07, + "loss": 0.283, + "step": 80450 + }, + { + "epoch": 0.805, + "grad_norm": 9.23548412322998, + "learning_rate": 6.78152e-07, + "loss": 0.524, + "step": 80500 + }, + { + "epoch": 0.8055, + "grad_norm": 8.429581642150879, + "learning_rate": 6.77952e-07, + "loss": 0.3238, + "step": 80550 + }, + { + "epoch": 0.806, + "grad_norm": 0.890905499458313, + "learning_rate": 6.777519999999999e-07, + "loss": 0.4394, + "step": 80600 + }, + { + "epoch": 0.8065, + "grad_norm": 61.318058013916016, + "learning_rate": 6.77552e-07, + "loss": 0.4028, + "step": 80650 + }, + { + "epoch": 0.807, + "grad_norm": 32.087276458740234, + "learning_rate": 6.773519999999999e-07, + "loss": 0.6014, + "step": 80700 + }, + { + "epoch": 0.8075, + "grad_norm": 14.302053451538086, + "learning_rate": 6.771519999999999e-07, + "loss": 0.5009, + "step": 80750 + }, + { + "epoch": 0.808, + "grad_norm": 20.004087448120117, + "learning_rate": 6.76952e-07, + "loss": 0.5234, + "step": 80800 + }, + { + "epoch": 0.8085, + "grad_norm": 58.03341293334961, + "learning_rate": 6.76752e-07, + "loss": 0.5761, + "step": 80850 + }, + { + "epoch": 0.809, + "grad_norm": 42.07494354248047, + "learning_rate": 6.765560000000001e-07, + "loss": 0.4014, + "step": 80900 + }, + { + "epoch": 0.8095, + "grad_norm": 11.237667083740234, + "learning_rate": 6.763559999999999e-07, + "loss": 0.6565, + "step": 80950 + }, + { + "epoch": 0.81, + "grad_norm": 54.023685455322266, + "learning_rate": 6.761559999999999e-07, + "loss": 0.5733, + "step": 81000 + }, + { + "epoch": 0.8105, + "grad_norm": 26.939653396606445, + "learning_rate": 6.75956e-07, + "loss": 0.5418, + "step": 81050 + }, + { + "epoch": 0.811, + "grad_norm": 72.97735595703125, + "learning_rate": 6.75756e-07, + "loss": 0.6132, + "step": 81100 + }, + { + "epoch": 0.8115, + "grad_norm": 30.797290802001953, + "learning_rate": 6.75556e-07, + "loss": 0.4381, + "step": 81150 + }, + { + "epoch": 0.812, + "grad_norm": 193.12718200683594, + "learning_rate": 6.75356e-07, + "loss": 0.5038, + "step": 81200 + }, + { + "epoch": 0.8125, + "grad_norm": 42.33784103393555, + "learning_rate": 6.75156e-07, + "loss": 0.5907, + "step": 81250 + }, + { + "epoch": 0.813, + "grad_norm": 60.777767181396484, + "learning_rate": 6.749559999999999e-07, + "loss": 0.5999, + "step": 81300 + }, + { + "epoch": 0.8135, + "grad_norm": 4.074952602386475, + "learning_rate": 6.74756e-07, + "loss": 0.5535, + "step": 81350 + }, + { + "epoch": 0.814, + "grad_norm": 19.895477294921875, + "learning_rate": 6.74556e-07, + "loss": 0.4116, + "step": 81400 + }, + { + "epoch": 0.8145, + "grad_norm": 8.462628364562988, + "learning_rate": 6.743559999999999e-07, + "loss": 0.4759, + "step": 81450 + }, + { + "epoch": 0.815, + "grad_norm": 55.037052154541016, + "learning_rate": 6.74156e-07, + "loss": 0.5654, + "step": 81500 + }, + { + "epoch": 0.8155, + "grad_norm": 65.07530975341797, + "learning_rate": 6.73956e-07, + "loss": 0.5132, + "step": 81550 + }, + { + "epoch": 0.816, + "grad_norm": 64.06502532958984, + "learning_rate": 6.73756e-07, + "loss": 0.5159, + "step": 81600 + }, + { + "epoch": 0.8165, + "grad_norm": 73.82300567626953, + "learning_rate": 6.73556e-07, + "loss": 0.5446, + "step": 81650 + }, + { + "epoch": 0.817, + "grad_norm": 40.699676513671875, + "learning_rate": 6.733559999999999e-07, + "loss": 0.4982, + "step": 81700 + }, + { + "epoch": 0.8175, + "grad_norm": 94.0560531616211, + "learning_rate": 6.731559999999999e-07, + "loss": 0.5136, + "step": 81750 + }, + { + "epoch": 0.818, + "grad_norm": 77.07228088378906, + "learning_rate": 6.72956e-07, + "loss": 0.5026, + "step": 81800 + }, + { + "epoch": 0.8185, + "grad_norm": 54.43427658081055, + "learning_rate": 6.72756e-07, + "loss": 0.5811, + "step": 81850 + }, + { + "epoch": 0.819, + "grad_norm": 32.514156341552734, + "learning_rate": 6.72556e-07, + "loss": 0.4385, + "step": 81900 + }, + { + "epoch": 0.8195, + "grad_norm": 8.645065307617188, + "learning_rate": 6.72356e-07, + "loss": 0.3586, + "step": 81950 + }, + { + "epoch": 0.82, + "grad_norm": 42.03317642211914, + "learning_rate": 6.72156e-07, + "loss": 0.413, + "step": 82000 + }, + { + "epoch": 0.8205, + "grad_norm": 0.5245881676673889, + "learning_rate": 6.719559999999999e-07, + "loss": 0.4099, + "step": 82050 + }, + { + "epoch": 0.821, + "grad_norm": 66.3309326171875, + "learning_rate": 6.71756e-07, + "loss": 0.7605, + "step": 82100 + }, + { + "epoch": 0.8215, + "grad_norm": 26.570140838623047, + "learning_rate": 6.71556e-07, + "loss": 0.4468, + "step": 82150 + }, + { + "epoch": 0.822, + "grad_norm": 71.70770263671875, + "learning_rate": 6.713559999999999e-07, + "loss": 0.3865, + "step": 82200 + }, + { + "epoch": 0.8225, + "grad_norm": 29.463645935058594, + "learning_rate": 6.71156e-07, + "loss": 0.5206, + "step": 82250 + }, + { + "epoch": 0.823, + "grad_norm": 61.58549880981445, + "learning_rate": 6.70956e-07, + "loss": 0.5122, + "step": 82300 + }, + { + "epoch": 0.8235, + "grad_norm": 52.86707305908203, + "learning_rate": 6.70756e-07, + "loss": 0.5216, + "step": 82350 + }, + { + "epoch": 0.824, + "grad_norm": 67.04204559326172, + "learning_rate": 6.70556e-07, + "loss": 0.5536, + "step": 82400 + }, + { + "epoch": 0.8245, + "grad_norm": 1.1900335550308228, + "learning_rate": 6.703559999999999e-07, + "loss": 0.4011, + "step": 82450 + }, + { + "epoch": 0.825, + "grad_norm": 13.605680465698242, + "learning_rate": 6.701559999999999e-07, + "loss": 0.412, + "step": 82500 + }, + { + "epoch": 0.8255, + "grad_norm": 68.13459777832031, + "learning_rate": 6.69956e-07, + "loss": 0.5592, + "step": 82550 + }, + { + "epoch": 0.826, + "grad_norm": 41.138648986816406, + "learning_rate": 6.69756e-07, + "loss": 0.5622, + "step": 82600 + }, + { + "epoch": 0.8265, + "grad_norm": 50.51581954956055, + "learning_rate": 6.69556e-07, + "loss": 0.408, + "step": 82650 + }, + { + "epoch": 0.827, + "grad_norm": 63.566707611083984, + "learning_rate": 6.69356e-07, + "loss": 0.4288, + "step": 82700 + }, + { + "epoch": 0.8275, + "grad_norm": 3.45732045173645, + "learning_rate": 6.691559999999999e-07, + "loss": 0.5257, + "step": 82750 + }, + { + "epoch": 0.828, + "grad_norm": 13.433048248291016, + "learning_rate": 6.689559999999999e-07, + "loss": 0.4753, + "step": 82800 + }, + { + "epoch": 0.8285, + "grad_norm": 0.32884418964385986, + "learning_rate": 6.68756e-07, + "loss": 0.4934, + "step": 82850 + }, + { + "epoch": 0.829, + "grad_norm": 13.126819610595703, + "learning_rate": 6.68556e-07, + "loss": 0.3574, + "step": 82900 + }, + { + "epoch": 0.8295, + "grad_norm": 48.53691482543945, + "learning_rate": 6.683559999999999e-07, + "loss": 0.7363, + "step": 82950 + }, + { + "epoch": 0.83, + "grad_norm": 87.21624755859375, + "learning_rate": 6.68156e-07, + "loss": 0.4372, + "step": 83000 + }, + { + "epoch": 0.8305, + "grad_norm": 53.67840576171875, + "learning_rate": 6.67956e-07, + "loss": 0.5366, + "step": 83050 + }, + { + "epoch": 0.831, + "grad_norm": 72.83885192871094, + "learning_rate": 6.677559999999999e-07, + "loss": 0.5919, + "step": 83100 + }, + { + "epoch": 0.8315, + "grad_norm": 24.180238723754883, + "learning_rate": 6.67556e-07, + "loss": 0.5037, + "step": 83150 + }, + { + "epoch": 0.832, + "grad_norm": 68.72847747802734, + "learning_rate": 6.673559999999999e-07, + "loss": 0.5177, + "step": 83200 + }, + { + "epoch": 0.8325, + "grad_norm": 8.159306526184082, + "learning_rate": 6.671559999999999e-07, + "loss": 0.4894, + "step": 83250 + }, + { + "epoch": 0.833, + "grad_norm": 77.16256713867188, + "learning_rate": 6.66956e-07, + "loss": 0.4338, + "step": 83300 + }, + { + "epoch": 0.8335, + "grad_norm": 15.30505657196045, + "learning_rate": 6.66756e-07, + "loss": 0.4723, + "step": 83350 + }, + { + "epoch": 0.834, + "grad_norm": 56.12632751464844, + "learning_rate": 6.665560000000001e-07, + "loss": 0.4587, + "step": 83400 + }, + { + "epoch": 0.8345, + "grad_norm": 16.000368118286133, + "learning_rate": 6.663559999999999e-07, + "loss": 0.5214, + "step": 83450 + }, + { + "epoch": 0.835, + "grad_norm": 2.457681655883789, + "learning_rate": 6.661559999999999e-07, + "loss": 0.5707, + "step": 83500 + }, + { + "epoch": 0.8355, + "grad_norm": 107.4878921508789, + "learning_rate": 6.65956e-07, + "loss": 0.5638, + "step": 83550 + }, + { + "epoch": 0.836, + "grad_norm": 133.30532836914062, + "learning_rate": 6.65756e-07, + "loss": 0.5926, + "step": 83600 + }, + { + "epoch": 0.8365, + "grad_norm": 63.642738342285156, + "learning_rate": 6.65556e-07, + "loss": 0.3443, + "step": 83650 + }, + { + "epoch": 0.837, + "grad_norm": 10.244129180908203, + "learning_rate": 6.65356e-07, + "loss": 0.4525, + "step": 83700 + }, + { + "epoch": 0.8375, + "grad_norm": 21.855087280273438, + "learning_rate": 6.65156e-07, + "loss": 0.5493, + "step": 83750 + }, + { + "epoch": 0.838, + "grad_norm": 8.698675155639648, + "learning_rate": 6.649559999999999e-07, + "loss": 0.4376, + "step": 83800 + }, + { + "epoch": 0.8385, + "grad_norm": 56.18486785888672, + "learning_rate": 6.64756e-07, + "loss": 0.4455, + "step": 83850 + }, + { + "epoch": 0.839, + "grad_norm": 3.442760705947876, + "learning_rate": 6.64556e-07, + "loss": 0.3941, + "step": 83900 + }, + { + "epoch": 0.8395, + "grad_norm": 108.05015563964844, + "learning_rate": 6.643559999999999e-07, + "loss": 0.4177, + "step": 83950 + }, + { + "epoch": 0.84, + "grad_norm": 11.338204383850098, + "learning_rate": 6.64156e-07, + "loss": 0.4218, + "step": 84000 + }, + { + "epoch": 0.8405, + "grad_norm": 53.90700149536133, + "learning_rate": 6.6396e-07, + "loss": 0.3908, + "step": 84050 + }, + { + "epoch": 0.841, + "grad_norm": 67.98553466796875, + "learning_rate": 6.6376e-07, + "loss": 0.3739, + "step": 84100 + }, + { + "epoch": 0.8415, + "grad_norm": 45.681941986083984, + "learning_rate": 6.6356e-07, + "loss": 0.4754, + "step": 84150 + }, + { + "epoch": 0.842, + "grad_norm": 84.54562377929688, + "learning_rate": 6.633599999999999e-07, + "loss": 0.5086, + "step": 84200 + }, + { + "epoch": 0.8425, + "grad_norm": 101.58403778076172, + "learning_rate": 6.631599999999999e-07, + "loss": 0.4529, + "step": 84250 + }, + { + "epoch": 0.843, + "grad_norm": 87.35352325439453, + "learning_rate": 6.6296e-07, + "loss": 0.561, + "step": 84300 + }, + { + "epoch": 0.8435, + "grad_norm": 64.42286682128906, + "learning_rate": 6.6276e-07, + "loss": 0.4564, + "step": 84350 + }, + { + "epoch": 0.844, + "grad_norm": 12.244241714477539, + "learning_rate": 6.6256e-07, + "loss": 0.3562, + "step": 84400 + }, + { + "epoch": 0.8445, + "grad_norm": 38.84530258178711, + "learning_rate": 6.6236e-07, + "loss": 0.537, + "step": 84450 + }, + { + "epoch": 0.845, + "grad_norm": 71.45441436767578, + "learning_rate": 6.621599999999999e-07, + "loss": 0.4359, + "step": 84500 + }, + { + "epoch": 0.8455, + "grad_norm": 38.31736755371094, + "learning_rate": 6.619599999999999e-07, + "loss": 0.3036, + "step": 84550 + }, + { + "epoch": 0.846, + "grad_norm": 1.6229268312454224, + "learning_rate": 6.6176e-07, + "loss": 0.5366, + "step": 84600 + }, + { + "epoch": 0.8465, + "grad_norm": 65.19337463378906, + "learning_rate": 6.6156e-07, + "loss": 0.5766, + "step": 84650 + }, + { + "epoch": 0.847, + "grad_norm": 96.37982940673828, + "learning_rate": 6.613599999999999e-07, + "loss": 0.4957, + "step": 84700 + }, + { + "epoch": 0.8475, + "grad_norm": 117.63880920410156, + "learning_rate": 6.6116e-07, + "loss": 0.5433, + "step": 84750 + }, + { + "epoch": 0.848, + "grad_norm": 107.3805923461914, + "learning_rate": 6.6096e-07, + "loss": 0.4758, + "step": 84800 + }, + { + "epoch": 0.8485, + "grad_norm": 105.60696411132812, + "learning_rate": 6.607599999999999e-07, + "loss": 0.562, + "step": 84850 + }, + { + "epoch": 0.849, + "grad_norm": 0.5521818995475769, + "learning_rate": 6.6056e-07, + "loss": 0.3867, + "step": 84900 + }, + { + "epoch": 0.8495, + "grad_norm": 15.534207344055176, + "learning_rate": 6.603599999999999e-07, + "loss": 0.3862, + "step": 84950 + }, + { + "epoch": 0.85, + "grad_norm": 4.9523701667785645, + "learning_rate": 6.601599999999999e-07, + "loss": 0.4487, + "step": 85000 + }, + { + "epoch": 0.8505, + "grad_norm": 107.40753936767578, + "learning_rate": 6.5996e-07, + "loss": 0.4253, + "step": 85050 + }, + { + "epoch": 0.851, + "grad_norm": 2.598055362701416, + "learning_rate": 6.5976e-07, + "loss": 0.5241, + "step": 85100 + }, + { + "epoch": 0.8515, + "grad_norm": 55.7864875793457, + "learning_rate": 6.5956e-07, + "loss": 0.5934, + "step": 85150 + }, + { + "epoch": 0.852, + "grad_norm": 4.6609697341918945, + "learning_rate": 6.593599999999999e-07, + "loss": 0.4759, + "step": 85200 + }, + { + "epoch": 0.8525, + "grad_norm": 103.4588851928711, + "learning_rate": 6.591599999999999e-07, + "loss": 0.4338, + "step": 85250 + }, + { + "epoch": 0.853, + "grad_norm": 4.739608287811279, + "learning_rate": 6.589599999999999e-07, + "loss": 0.5771, + "step": 85300 + }, + { + "epoch": 0.8535, + "grad_norm": 81.03630828857422, + "learning_rate": 6.5876e-07, + "loss": 0.4308, + "step": 85350 + }, + { + "epoch": 0.854, + "grad_norm": 10.07264232635498, + "learning_rate": 6.5856e-07, + "loss": 0.5954, + "step": 85400 + }, + { + "epoch": 0.8545, + "grad_norm": 87.12622833251953, + "learning_rate": 6.583599999999999e-07, + "loss": 0.5724, + "step": 85450 + }, + { + "epoch": 0.855, + "grad_norm": 12.88304615020752, + "learning_rate": 6.5816e-07, + "loss": 0.3812, + "step": 85500 + }, + { + "epoch": 0.8555, + "grad_norm": 2.8463549613952637, + "learning_rate": 6.579599999999999e-07, + "loss": 0.5091, + "step": 85550 + }, + { + "epoch": 0.856, + "grad_norm": 107.44549560546875, + "learning_rate": 6.577599999999999e-07, + "loss": 0.5983, + "step": 85600 + }, + { + "epoch": 0.8565, + "grad_norm": 25.965686798095703, + "learning_rate": 6.5756e-07, + "loss": 0.4518, + "step": 85650 + }, + { + "epoch": 0.857, + "grad_norm": 13.483121871948242, + "learning_rate": 6.5736e-07, + "loss": 0.4494, + "step": 85700 + }, + { + "epoch": 0.8575, + "grad_norm": 57.970027923583984, + "learning_rate": 6.571599999999999e-07, + "loss": 0.5211, + "step": 85750 + }, + { + "epoch": 0.858, + "grad_norm": 61.619537353515625, + "learning_rate": 6.5696e-07, + "loss": 0.4131, + "step": 85800 + }, + { + "epoch": 0.8585, + "grad_norm": 72.63780975341797, + "learning_rate": 6.5676e-07, + "loss": 0.4786, + "step": 85850 + }, + { + "epoch": 0.859, + "grad_norm": 32.84150314331055, + "learning_rate": 6.5656e-07, + "loss": 0.3868, + "step": 85900 + }, + { + "epoch": 0.8595, + "grad_norm": 11.908856391906738, + "learning_rate": 6.5636e-07, + "loss": 0.4036, + "step": 85950 + }, + { + "epoch": 0.86, + "grad_norm": 124.65275573730469, + "learning_rate": 6.561599999999999e-07, + "loss": 0.5248, + "step": 86000 + }, + { + "epoch": 0.8605, + "grad_norm": 77.87248992919922, + "learning_rate": 6.5596e-07, + "loss": 0.631, + "step": 86050 + }, + { + "epoch": 0.861, + "grad_norm": 88.47559356689453, + "learning_rate": 6.5576e-07, + "loss": 0.5183, + "step": 86100 + }, + { + "epoch": 0.8615, + "grad_norm": 35.03434371948242, + "learning_rate": 6.5556e-07, + "loss": 0.536, + "step": 86150 + }, + { + "epoch": 0.862, + "grad_norm": 49.06086349487305, + "learning_rate": 6.553600000000001e-07, + "loss": 0.4981, + "step": 86200 + }, + { + "epoch": 0.8625, + "grad_norm": 22.367053985595703, + "learning_rate": 6.551599999999999e-07, + "loss": 0.5554, + "step": 86250 + }, + { + "epoch": 0.863, + "grad_norm": 46.18075180053711, + "learning_rate": 6.549599999999999e-07, + "loss": 0.4415, + "step": 86300 + }, + { + "epoch": 0.8635, + "grad_norm": 85.1124496459961, + "learning_rate": 6.5476e-07, + "loss": 0.6429, + "step": 86350 + }, + { + "epoch": 0.864, + "grad_norm": 75.1630630493164, + "learning_rate": 6.5456e-07, + "loss": 0.4281, + "step": 86400 + }, + { + "epoch": 0.8645, + "grad_norm": 23.679262161254883, + "learning_rate": 6.5436e-07, + "loss": 0.4745, + "step": 86450 + }, + { + "epoch": 0.865, + "grad_norm": 40.03208541870117, + "learning_rate": 6.5416e-07, + "loss": 0.3905, + "step": 86500 + }, + { + "epoch": 0.8655, + "grad_norm": 65.0140151977539, + "learning_rate": 6.5396e-07, + "loss": 0.6504, + "step": 86550 + }, + { + "epoch": 0.866, + "grad_norm": 85.9790267944336, + "learning_rate": 6.537599999999999e-07, + "loss": 0.3979, + "step": 86600 + }, + { + "epoch": 0.8665, + "grad_norm": 3.0157697200775146, + "learning_rate": 6.5356e-07, + "loss": 0.4499, + "step": 86650 + }, + { + "epoch": 0.867, + "grad_norm": 86.25167846679688, + "learning_rate": 6.5336e-07, + "loss": 0.5585, + "step": 86700 + }, + { + "epoch": 0.8675, + "grad_norm": 26.297157287597656, + "learning_rate": 6.531599999999999e-07, + "loss": 0.5065, + "step": 86750 + }, + { + "epoch": 0.868, + "grad_norm": 53.790348052978516, + "learning_rate": 6.5296e-07, + "loss": 0.507, + "step": 86800 + }, + { + "epoch": 0.8685, + "grad_norm": 45.35681915283203, + "learning_rate": 6.5276e-07, + "loss": 0.4223, + "step": 86850 + }, + { + "epoch": 0.869, + "grad_norm": 108.29571533203125, + "learning_rate": 6.5256e-07, + "loss": 0.4757, + "step": 86900 + }, + { + "epoch": 0.8695, + "grad_norm": 1.806410312652588, + "learning_rate": 6.5236e-07, + "loss": 0.4832, + "step": 86950 + }, + { + "epoch": 0.87, + "grad_norm": 46.172607421875, + "learning_rate": 6.521599999999999e-07, + "loss": 0.599, + "step": 87000 + }, + { + "epoch": 0.8705, + "grad_norm": 37.954742431640625, + "learning_rate": 6.519599999999999e-07, + "loss": 0.6213, + "step": 87050 + }, + { + "epoch": 0.871, + "grad_norm": 11.543988227844238, + "learning_rate": 6.5176e-07, + "loss": 0.4723, + "step": 87100 + }, + { + "epoch": 0.8715, + "grad_norm": 26.00443458557129, + "learning_rate": 6.5156e-07, + "loss": 0.4994, + "step": 87150 + }, + { + "epoch": 0.872, + "grad_norm": 144.9601593017578, + "learning_rate": 6.5136e-07, + "loss": 0.5887, + "step": 87200 + }, + { + "epoch": 0.8725, + "grad_norm": 53.76115417480469, + "learning_rate": 6.5116e-07, + "loss": 0.4368, + "step": 87250 + }, + { + "epoch": 0.873, + "grad_norm": 56.003292083740234, + "learning_rate": 6.5096e-07, + "loss": 0.49, + "step": 87300 + }, + { + "epoch": 0.8735, + "grad_norm": 81.86634063720703, + "learning_rate": 6.507599999999999e-07, + "loss": 0.4194, + "step": 87350 + }, + { + "epoch": 0.874, + "grad_norm": 46.207969665527344, + "learning_rate": 6.5056e-07, + "loss": 0.5626, + "step": 87400 + }, + { + "epoch": 0.8745, + "grad_norm": 93.14911651611328, + "learning_rate": 6.5036e-07, + "loss": 0.5803, + "step": 87450 + }, + { + "epoch": 0.875, + "grad_norm": 91.9137954711914, + "learning_rate": 6.501599999999999e-07, + "loss": 0.4872, + "step": 87500 + }, + { + "epoch": 0.8755, + "grad_norm": 73.91366577148438, + "learning_rate": 6.4996e-07, + "loss": 0.5283, + "step": 87550 + }, + { + "epoch": 0.876, + "grad_norm": 35.279666900634766, + "learning_rate": 6.4976e-07, + "loss": 0.4378, + "step": 87600 + }, + { + "epoch": 0.8765, + "grad_norm": 15.952431678771973, + "learning_rate": 6.4956e-07, + "loss": 0.4564, + "step": 87650 + }, + { + "epoch": 0.877, + "grad_norm": 2.8610007762908936, + "learning_rate": 6.4936e-07, + "loss": 0.4478, + "step": 87700 + }, + { + "epoch": 0.8775, + "grad_norm": 64.05665588378906, + "learning_rate": 6.491599999999999e-07, + "loss": 0.4061, + "step": 87750 + }, + { + "epoch": 0.878, + "grad_norm": 3.5503172874450684, + "learning_rate": 6.489599999999999e-07, + "loss": 0.4837, + "step": 87800 + }, + { + "epoch": 0.8785, + "grad_norm": 11.314192771911621, + "learning_rate": 6.4876e-07, + "loss": 0.3553, + "step": 87850 + }, + { + "epoch": 0.879, + "grad_norm": 23.217226028442383, + "learning_rate": 6.4856e-07, + "loss": 0.4392, + "step": 87900 + }, + { + "epoch": 0.8795, + "grad_norm": 2.8893821239471436, + "learning_rate": 6.483600000000001e-07, + "loss": 0.4047, + "step": 87950 + }, + { + "epoch": 0.88, + "grad_norm": 5.448762893676758, + "learning_rate": 6.4816e-07, + "loss": 0.4604, + "step": 88000 + }, + { + "epoch": 0.8805, + "grad_norm": 84.17955017089844, + "learning_rate": 6.479599999999999e-07, + "loss": 0.4875, + "step": 88050 + }, + { + "epoch": 0.881, + "grad_norm": 68.11402130126953, + "learning_rate": 6.4776e-07, + "loss": 0.5875, + "step": 88100 + }, + { + "epoch": 0.8815, + "grad_norm": 77.2765884399414, + "learning_rate": 6.4756e-07, + "loss": 0.4419, + "step": 88150 + }, + { + "epoch": 0.882, + "grad_norm": 16.410972595214844, + "learning_rate": 6.4736e-07, + "loss": 0.4844, + "step": 88200 + }, + { + "epoch": 0.8825, + "grad_norm": 31.93629264831543, + "learning_rate": 6.4716e-07, + "loss": 0.4829, + "step": 88250 + }, + { + "epoch": 0.883, + "grad_norm": 68.09761047363281, + "learning_rate": 6.4696e-07, + "loss": 0.3913, + "step": 88300 + }, + { + "epoch": 0.8835, + "grad_norm": 91.0275650024414, + "learning_rate": 6.4676e-07, + "loss": 0.6128, + "step": 88350 + }, + { + "epoch": 0.884, + "grad_norm": 67.89716339111328, + "learning_rate": 6.4656e-07, + "loss": 0.5768, + "step": 88400 + }, + { + "epoch": 0.8845, + "grad_norm": 21.36505126953125, + "learning_rate": 6.4636e-07, + "loss": 0.3613, + "step": 88450 + }, + { + "epoch": 0.885, + "grad_norm": 2.784221887588501, + "learning_rate": 6.461599999999999e-07, + "loss": 0.3029, + "step": 88500 + }, + { + "epoch": 0.8855, + "grad_norm": 4.084001064300537, + "learning_rate": 6.4596e-07, + "loss": 0.3698, + "step": 88550 + }, + { + "epoch": 0.886, + "grad_norm": 72.70446014404297, + "learning_rate": 6.4576e-07, + "loss": 0.4915, + "step": 88600 + }, + { + "epoch": 0.8865, + "grad_norm": 55.41581726074219, + "learning_rate": 6.4556e-07, + "loss": 0.5785, + "step": 88650 + }, + { + "epoch": 0.887, + "grad_norm": 35.189048767089844, + "learning_rate": 6.453600000000001e-07, + "loss": 0.5671, + "step": 88700 + }, + { + "epoch": 0.8875, + "grad_norm": 57.02684783935547, + "learning_rate": 6.451599999999999e-07, + "loss": 0.4981, + "step": 88750 + }, + { + "epoch": 0.888, + "grad_norm": 0.43074750900268555, + "learning_rate": 6.449599999999999e-07, + "loss": 0.4555, + "step": 88800 + }, + { + "epoch": 0.8885, + "grad_norm": 13.582013130187988, + "learning_rate": 6.4476e-07, + "loss": 0.3789, + "step": 88850 + }, + { + "epoch": 0.889, + "grad_norm": 65.70917510986328, + "learning_rate": 6.4456e-07, + "loss": 0.5623, + "step": 88900 + }, + { + "epoch": 0.8895, + "grad_norm": 48.792179107666016, + "learning_rate": 6.4436e-07, + "loss": 0.5911, + "step": 88950 + }, + { + "epoch": 0.89, + "grad_norm": 11.270796775817871, + "learning_rate": 6.4416e-07, + "loss": 0.4401, + "step": 89000 + }, + { + "epoch": 0.8905, + "grad_norm": 89.74335479736328, + "learning_rate": 6.4396e-07, + "loss": 0.3221, + "step": 89050 + }, + { + "epoch": 0.891, + "grad_norm": 46.43724822998047, + "learning_rate": 6.437599999999999e-07, + "loss": 0.3968, + "step": 89100 + }, + { + "epoch": 0.8915, + "grad_norm": 2.0606515407562256, + "learning_rate": 6.4356e-07, + "loss": 0.447, + "step": 89150 + }, + { + "epoch": 0.892, + "grad_norm": 91.6527099609375, + "learning_rate": 6.4336e-07, + "loss": 0.4425, + "step": 89200 + }, + { + "epoch": 0.8925, + "grad_norm": 42.8084716796875, + "learning_rate": 6.431599999999999e-07, + "loss": 0.4977, + "step": 89250 + }, + { + "epoch": 0.893, + "grad_norm": 13.197081565856934, + "learning_rate": 6.4296e-07, + "loss": 0.395, + "step": 89300 + }, + { + "epoch": 0.8935, + "grad_norm": 97.76970672607422, + "learning_rate": 6.4276e-07, + "loss": 0.4206, + "step": 89350 + }, + { + "epoch": 0.894, + "grad_norm": 103.70655822753906, + "learning_rate": 6.4256e-07, + "loss": 0.6174, + "step": 89400 + }, + { + "epoch": 0.8945, + "grad_norm": 25.918859481811523, + "learning_rate": 6.4236e-07, + "loss": 0.5479, + "step": 89450 + }, + { + "epoch": 0.895, + "grad_norm": 28.53264808654785, + "learning_rate": 6.421599999999999e-07, + "loss": 0.5709, + "step": 89500 + }, + { + "epoch": 0.8955, + "grad_norm": 38.950416564941406, + "learning_rate": 6.419599999999999e-07, + "loss": 0.5301, + "step": 89550 + }, + { + "epoch": 0.896, + "grad_norm": 95.71314239501953, + "learning_rate": 6.4176e-07, + "loss": 0.5001, + "step": 89600 + }, + { + "epoch": 0.8965, + "grad_norm": 7.237802505493164, + "learning_rate": 6.4156e-07, + "loss": 0.5101, + "step": 89650 + }, + { + "epoch": 0.897, + "grad_norm": 104.18899536132812, + "learning_rate": 6.4136e-07, + "loss": 0.445, + "step": 89700 + }, + { + "epoch": 0.8975, + "grad_norm": 88.33003997802734, + "learning_rate": 6.4116e-07, + "loss": 0.4632, + "step": 89750 + }, + { + "epoch": 0.898, + "grad_norm": 0.11056256294250488, + "learning_rate": 6.409599999999999e-07, + "loss": 0.4456, + "step": 89800 + }, + { + "epoch": 0.8985, + "grad_norm": 23.158008575439453, + "learning_rate": 6.407599999999999e-07, + "loss": 0.5806, + "step": 89850 + }, + { + "epoch": 0.899, + "grad_norm": 45.30116271972656, + "learning_rate": 6.4056e-07, + "loss": 0.4927, + "step": 89900 + }, + { + "epoch": 0.8995, + "grad_norm": 13.431071281433105, + "learning_rate": 6.4036e-07, + "loss": 0.5075, + "step": 89950 + }, + { + "epoch": 0.9, + "grad_norm": 42.31669235229492, + "learning_rate": 6.401599999999999e-07, + "loss": 0.4129, + "step": 90000 + }, + { + "epoch": 0.9005, + "grad_norm": 35.61188888549805, + "learning_rate": 6.3996e-07, + "loss": 0.5002, + "step": 90050 + }, + { + "epoch": 0.901, + "grad_norm": 33.90731430053711, + "learning_rate": 6.3976e-07, + "loss": 0.4587, + "step": 90100 + }, + { + "epoch": 0.9015, + "grad_norm": 17.363262176513672, + "learning_rate": 6.395599999999999e-07, + "loss": 0.3567, + "step": 90150 + }, + { + "epoch": 0.902, + "grad_norm": 88.20648193359375, + "learning_rate": 6.3936e-07, + "loss": 0.3825, + "step": 90200 + }, + { + "epoch": 0.9025, + "grad_norm": 105.39450073242188, + "learning_rate": 6.391599999999999e-07, + "loss": 0.4482, + "step": 90250 + }, + { + "epoch": 0.903, + "grad_norm": 41.72043991088867, + "learning_rate": 6.389599999999999e-07, + "loss": 0.4216, + "step": 90300 + }, + { + "epoch": 0.9035, + "grad_norm": 52.67734909057617, + "learning_rate": 6.3876e-07, + "loss": 0.5691, + "step": 90350 + }, + { + "epoch": 0.904, + "grad_norm": 2.5297904014587402, + "learning_rate": 6.3856e-07, + "loss": 0.4965, + "step": 90400 + }, + { + "epoch": 0.9045, + "grad_norm": 100.61029815673828, + "learning_rate": 6.383600000000001e-07, + "loss": 0.3907, + "step": 90450 + }, + { + "epoch": 0.905, + "grad_norm": 123.27056884765625, + "learning_rate": 6.381599999999999e-07, + "loss": 0.6531, + "step": 90500 + }, + { + "epoch": 0.9055, + "grad_norm": 57.43628692626953, + "learning_rate": 6.379599999999999e-07, + "loss": 0.5612, + "step": 90550 + }, + { + "epoch": 0.906, + "grad_norm": 6.510050296783447, + "learning_rate": 6.3776e-07, + "loss": 0.6138, + "step": 90600 + }, + { + "epoch": 0.9065, + "grad_norm": 71.36262512207031, + "learning_rate": 6.3756e-07, + "loss": 0.3697, + "step": 90650 + }, + { + "epoch": 0.907, + "grad_norm": 92.18431091308594, + "learning_rate": 6.3736e-07, + "loss": 0.4635, + "step": 90700 + }, + { + "epoch": 0.9075, + "grad_norm": 73.8544692993164, + "learning_rate": 6.3716e-07, + "loss": 0.4039, + "step": 90750 + }, + { + "epoch": 0.908, + "grad_norm": 84.32909393310547, + "learning_rate": 6.3696e-07, + "loss": 0.5271, + "step": 90800 + }, + { + "epoch": 0.9085, + "grad_norm": 71.52722930908203, + "learning_rate": 6.367599999999999e-07, + "loss": 0.4169, + "step": 90850 + }, + { + "epoch": 0.909, + "grad_norm": 10.915364265441895, + "learning_rate": 6.36564e-07, + "loss": 0.467, + "step": 90900 + }, + { + "epoch": 0.9095, + "grad_norm": 74.15825653076172, + "learning_rate": 6.36364e-07, + "loss": 0.4426, + "step": 90950 + }, + { + "epoch": 0.91, + "grad_norm": 91.29158020019531, + "learning_rate": 6.361639999999999e-07, + "loss": 0.6179, + "step": 91000 + }, + { + "epoch": 0.9105, + "grad_norm": 0.4594174027442932, + "learning_rate": 6.35964e-07, + "loss": 0.5343, + "step": 91050 + }, + { + "epoch": 0.911, + "grad_norm": 124.07490539550781, + "learning_rate": 6.35764e-07, + "loss": 0.4171, + "step": 91100 + }, + { + "epoch": 0.9115, + "grad_norm": 43.403846740722656, + "learning_rate": 6.35564e-07, + "loss": 0.5054, + "step": 91150 + }, + { + "epoch": 0.912, + "grad_norm": 43.1138801574707, + "learning_rate": 6.35364e-07, + "loss": 0.4225, + "step": 91200 + }, + { + "epoch": 0.9125, + "grad_norm": 117.5290756225586, + "learning_rate": 6.351639999999999e-07, + "loss": 0.5131, + "step": 91250 + }, + { + "epoch": 0.913, + "grad_norm": 67.10313415527344, + "learning_rate": 6.349679999999999e-07, + "loss": 0.3869, + "step": 91300 + }, + { + "epoch": 0.9135, + "grad_norm": 12.4064359664917, + "learning_rate": 6.34768e-07, + "loss": 0.4939, + "step": 91350 + }, + { + "epoch": 0.914, + "grad_norm": 20.02683448791504, + "learning_rate": 6.34568e-07, + "loss": 0.5706, + "step": 91400 + }, + { + "epoch": 0.9145, + "grad_norm": 64.59272766113281, + "learning_rate": 6.34368e-07, + "loss": 0.3377, + "step": 91450 + }, + { + "epoch": 0.915, + "grad_norm": 2.4483537673950195, + "learning_rate": 6.34168e-07, + "loss": 0.349, + "step": 91500 + }, + { + "epoch": 0.9155, + "grad_norm": 0.7086917757987976, + "learning_rate": 6.33968e-07, + "loss": 0.4049, + "step": 91550 + }, + { + "epoch": 0.916, + "grad_norm": 17.961669921875, + "learning_rate": 6.337679999999999e-07, + "loss": 0.3951, + "step": 91600 + }, + { + "epoch": 0.9165, + "grad_norm": 62.844703674316406, + "learning_rate": 6.33568e-07, + "loss": 0.4574, + "step": 91650 + }, + { + "epoch": 0.917, + "grad_norm": 4.404021263122559, + "learning_rate": 6.33372e-07, + "loss": 0.4018, + "step": 91700 + }, + { + "epoch": 0.9175, + "grad_norm": 1.865087866783142, + "learning_rate": 6.331719999999999e-07, + "loss": 0.5116, + "step": 91750 + }, + { + "epoch": 0.918, + "grad_norm": 49.26223373413086, + "learning_rate": 6.32976e-07, + "loss": 0.5372, + "step": 91800 + }, + { + "epoch": 0.9185, + "grad_norm": 87.36351013183594, + "learning_rate": 6.32776e-07, + "loss": 0.594, + "step": 91850 + }, + { + "epoch": 0.919, + "grad_norm": 13.864863395690918, + "learning_rate": 6.32576e-07, + "loss": 0.385, + "step": 91900 + }, + { + "epoch": 0.9195, + "grad_norm": 85.708251953125, + "learning_rate": 6.32376e-07, + "loss": 0.4528, + "step": 91950 + }, + { + "epoch": 0.92, + "grad_norm": 66.8047866821289, + "learning_rate": 6.321759999999999e-07, + "loss": 0.5978, + "step": 92000 + }, + { + "epoch": 0.9205, + "grad_norm": 91.00402069091797, + "learning_rate": 6.319759999999999e-07, + "loss": 0.512, + "step": 92050 + }, + { + "epoch": 0.921, + "grad_norm": 56.01173400878906, + "learning_rate": 6.31776e-07, + "loss": 0.5682, + "step": 92100 + }, + { + "epoch": 0.9215, + "grad_norm": 71.80009460449219, + "learning_rate": 6.31576e-07, + "loss": 0.5471, + "step": 92150 + }, + { + "epoch": 0.922, + "grad_norm": 49.644187927246094, + "learning_rate": 6.31376e-07, + "loss": 0.6007, + "step": 92200 + }, + { + "epoch": 0.9225, + "grad_norm": 6.2205119132995605, + "learning_rate": 6.31176e-07, + "loss": 0.3881, + "step": 92250 + }, + { + "epoch": 0.923, + "grad_norm": 34.30057907104492, + "learning_rate": 6.309759999999999e-07, + "loss": 0.4156, + "step": 92300 + }, + { + "epoch": 0.9235, + "grad_norm": 65.61951446533203, + "learning_rate": 6.307759999999999e-07, + "loss": 0.5385, + "step": 92350 + }, + { + "epoch": 0.924, + "grad_norm": 96.61206817626953, + "learning_rate": 6.30576e-07, + "loss": 0.5514, + "step": 92400 + }, + { + "epoch": 0.9245, + "grad_norm": 21.83513641357422, + "learning_rate": 6.30376e-07, + "loss": 0.5608, + "step": 92450 + }, + { + "epoch": 0.925, + "grad_norm": 17.049848556518555, + "learning_rate": 6.301759999999999e-07, + "loss": 0.4524, + "step": 92500 + }, + { + "epoch": 0.9255, + "grad_norm": 19.969083786010742, + "learning_rate": 6.29976e-07, + "loss": 0.5688, + "step": 92550 + }, + { + "epoch": 0.926, + "grad_norm": 52.34061050415039, + "learning_rate": 6.29776e-07, + "loss": 0.476, + "step": 92600 + }, + { + "epoch": 0.9265, + "grad_norm": 86.44994354248047, + "learning_rate": 6.295759999999999e-07, + "loss": 0.3973, + "step": 92650 + }, + { + "epoch": 0.927, + "grad_norm": 19.09813117980957, + "learning_rate": 6.29376e-07, + "loss": 0.4407, + "step": 92700 + }, + { + "epoch": 0.9275, + "grad_norm": 17.017074584960938, + "learning_rate": 6.291759999999999e-07, + "loss": 0.4964, + "step": 92750 + }, + { + "epoch": 0.928, + "grad_norm": 68.21665954589844, + "learning_rate": 6.289759999999999e-07, + "loss": 0.4977, + "step": 92800 + }, + { + "epoch": 0.9285, + "grad_norm": 15.754688262939453, + "learning_rate": 6.28776e-07, + "loss": 0.5211, + "step": 92850 + }, + { + "epoch": 0.929, + "grad_norm": 39.87798309326172, + "learning_rate": 6.28576e-07, + "loss": 0.4764, + "step": 92900 + }, + { + "epoch": 0.9295, + "grad_norm": 66.07242584228516, + "learning_rate": 6.283760000000001e-07, + "loss": 0.4276, + "step": 92950 + }, + { + "epoch": 0.93, + "grad_norm": 68.16017150878906, + "learning_rate": 6.281759999999999e-07, + "loss": 0.4338, + "step": 93000 + }, + { + "epoch": 0.9305, + "grad_norm": 92.84886932373047, + "learning_rate": 6.279759999999999e-07, + "loss": 0.3994, + "step": 93050 + }, + { + "epoch": 0.931, + "grad_norm": 7.953521251678467, + "learning_rate": 6.27776e-07, + "loss": 0.5969, + "step": 93100 + }, + { + "epoch": 0.9315, + "grad_norm": 6.302267551422119, + "learning_rate": 6.27576e-07, + "loss": 0.5746, + "step": 93150 + }, + { + "epoch": 0.932, + "grad_norm": 57.08115005493164, + "learning_rate": 6.27376e-07, + "loss": 0.6096, + "step": 93200 + }, + { + "epoch": 0.9325, + "grad_norm": 74.17098236083984, + "learning_rate": 6.27176e-07, + "loss": 0.5127, + "step": 93250 + }, + { + "epoch": 0.933, + "grad_norm": 3.4939818382263184, + "learning_rate": 6.26976e-07, + "loss": 0.4822, + "step": 93300 + }, + { + "epoch": 0.9335, + "grad_norm": 15.249403953552246, + "learning_rate": 6.26776e-07, + "loss": 0.4677, + "step": 93350 + }, + { + "epoch": 0.934, + "grad_norm": 8.158519744873047, + "learning_rate": 6.26576e-07, + "loss": 0.637, + "step": 93400 + }, + { + "epoch": 0.9345, + "grad_norm": 3.797173023223877, + "learning_rate": 6.26376e-07, + "loss": 0.6001, + "step": 93450 + }, + { + "epoch": 0.935, + "grad_norm": 9.297698020935059, + "learning_rate": 6.261759999999999e-07, + "loss": 0.4783, + "step": 93500 + }, + { + "epoch": 0.9355, + "grad_norm": 97.72147369384766, + "learning_rate": 6.25976e-07, + "loss": 0.5935, + "step": 93550 + }, + { + "epoch": 0.936, + "grad_norm": 30.139896392822266, + "learning_rate": 6.25776e-07, + "loss": 0.4528, + "step": 93600 + }, + { + "epoch": 0.9365, + "grad_norm": 93.17426300048828, + "learning_rate": 6.25576e-07, + "loss": 0.519, + "step": 93650 + }, + { + "epoch": 0.937, + "grad_norm": 105.38163757324219, + "learning_rate": 6.253760000000001e-07, + "loss": 0.5527, + "step": 93700 + }, + { + "epoch": 0.9375, + "grad_norm": 68.39093780517578, + "learning_rate": 6.251759999999999e-07, + "loss": 0.4141, + "step": 93750 + }, + { + "epoch": 0.938, + "grad_norm": 70.97985076904297, + "learning_rate": 6.249759999999999e-07, + "loss": 0.4535, + "step": 93800 + }, + { + "epoch": 0.9385, + "grad_norm": 74.9505615234375, + "learning_rate": 6.24776e-07, + "loss": 0.5361, + "step": 93850 + }, + { + "epoch": 0.939, + "grad_norm": 21.2496395111084, + "learning_rate": 6.24576e-07, + "loss": 0.4383, + "step": 93900 + }, + { + "epoch": 0.9395, + "grad_norm": 69.52516174316406, + "learning_rate": 6.24376e-07, + "loss": 0.3776, + "step": 93950 + }, + { + "epoch": 0.94, + "grad_norm": 42.33454132080078, + "learning_rate": 6.24176e-07, + "loss": 0.4317, + "step": 94000 + }, + { + "epoch": 0.9405, + "grad_norm": 3.589440107345581, + "learning_rate": 6.23976e-07, + "loss": 0.4107, + "step": 94050 + }, + { + "epoch": 0.941, + "grad_norm": 108.18557739257812, + "learning_rate": 6.237759999999999e-07, + "loss": 0.4974, + "step": 94100 + }, + { + "epoch": 0.9415, + "grad_norm": 17.219356536865234, + "learning_rate": 6.23576e-07, + "loss": 0.4116, + "step": 94150 + }, + { + "epoch": 0.942, + "grad_norm": 19.18585777282715, + "learning_rate": 6.23376e-07, + "loss": 0.4121, + "step": 94200 + }, + { + "epoch": 0.9425, + "grad_norm": 118.1676254272461, + "learning_rate": 6.231759999999999e-07, + "loss": 0.5545, + "step": 94250 + }, + { + "epoch": 0.943, + "grad_norm": 47.58047103881836, + "learning_rate": 6.22976e-07, + "loss": 0.4113, + "step": 94300 + }, + { + "epoch": 0.9435, + "grad_norm": 81.94721221923828, + "learning_rate": 6.22776e-07, + "loss": 0.5893, + "step": 94350 + }, + { + "epoch": 0.944, + "grad_norm": 4.948315143585205, + "learning_rate": 6.22576e-07, + "loss": 0.3997, + "step": 94400 + }, + { + "epoch": 0.9445, + "grad_norm": 14.120973587036133, + "learning_rate": 6.22376e-07, + "loss": 0.4992, + "step": 94450 + }, + { + "epoch": 0.945, + "grad_norm": 63.99724197387695, + "learning_rate": 6.221759999999999e-07, + "loss": 0.5603, + "step": 94500 + }, + { + "epoch": 0.9455, + "grad_norm": 46.79153060913086, + "learning_rate": 6.219759999999999e-07, + "loss": 0.4607, + "step": 94550 + }, + { + "epoch": 0.946, + "grad_norm": 8.22097110748291, + "learning_rate": 6.21776e-07, + "loss": 0.557, + "step": 94600 + }, + { + "epoch": 0.9465, + "grad_norm": 98.2745361328125, + "learning_rate": 6.21576e-07, + "loss": 0.5769, + "step": 94650 + }, + { + "epoch": 0.947, + "grad_norm": 40.03475570678711, + "learning_rate": 6.21376e-07, + "loss": 0.5056, + "step": 94700 + }, + { + "epoch": 0.9475, + "grad_norm": 74.36143493652344, + "learning_rate": 6.21176e-07, + "loss": 0.4523, + "step": 94750 + }, + { + "epoch": 0.948, + "grad_norm": 51.59547805786133, + "learning_rate": 6.209759999999999e-07, + "loss": 0.5666, + "step": 94800 + }, + { + "epoch": 0.9485, + "grad_norm": 61.492488861083984, + "learning_rate": 6.207759999999999e-07, + "loss": 0.536, + "step": 94850 + }, + { + "epoch": 0.949, + "grad_norm": 3.265573024749756, + "learning_rate": 6.20576e-07, + "loss": 0.3787, + "step": 94900 + }, + { + "epoch": 0.9495, + "grad_norm": 29.715877532958984, + "learning_rate": 6.20376e-07, + "loss": 0.5144, + "step": 94950 + }, + { + "epoch": 0.95, + "grad_norm": 4.610234260559082, + "learning_rate": 6.201759999999999e-07, + "loss": 0.4496, + "step": 95000 + }, + { + "epoch": 0.9505, + "grad_norm": 3.437190532684326, + "learning_rate": 6.19976e-07, + "loss": 0.5684, + "step": 95050 + }, + { + "epoch": 0.951, + "grad_norm": 48.969093322753906, + "learning_rate": 6.19776e-07, + "loss": 0.4846, + "step": 95100 + }, + { + "epoch": 0.9515, + "grad_norm": 72.4744873046875, + "learning_rate": 6.195759999999999e-07, + "loss": 0.4036, + "step": 95150 + }, + { + "epoch": 0.952, + "grad_norm": 2.2708630561828613, + "learning_rate": 6.19376e-07, + "loss": 0.4133, + "step": 95200 + }, + { + "epoch": 0.9525, + "grad_norm": 4.139185905456543, + "learning_rate": 6.191759999999999e-07, + "loss": 0.5245, + "step": 95250 + }, + { + "epoch": 0.953, + "grad_norm": 131.5601806640625, + "learning_rate": 6.189759999999999e-07, + "loss": 0.3855, + "step": 95300 + }, + { + "epoch": 0.9535, + "grad_norm": 27.90871238708496, + "learning_rate": 6.18776e-07, + "loss": 0.4754, + "step": 95350 + }, + { + "epoch": 0.954, + "grad_norm": 82.64385986328125, + "learning_rate": 6.18576e-07, + "loss": 0.4861, + "step": 95400 + }, + { + "epoch": 0.9545, + "grad_norm": 51.01675033569336, + "learning_rate": 6.183760000000001e-07, + "loss": 0.4428, + "step": 95450 + }, + { + "epoch": 0.955, + "grad_norm": 56.236717224121094, + "learning_rate": 6.181759999999999e-07, + "loss": 0.5291, + "step": 95500 + }, + { + "epoch": 0.9555, + "grad_norm": 62.634403228759766, + "learning_rate": 6.179759999999999e-07, + "loss": 0.4693, + "step": 95550 + }, + { + "epoch": 0.956, + "grad_norm": 29.53840446472168, + "learning_rate": 6.17776e-07, + "loss": 0.5259, + "step": 95600 + }, + { + "epoch": 0.9565, + "grad_norm": 41.582767486572266, + "learning_rate": 6.17576e-07, + "loss": 0.4315, + "step": 95650 + }, + { + "epoch": 0.957, + "grad_norm": 20.49850845336914, + "learning_rate": 6.17376e-07, + "loss": 0.4245, + "step": 95700 + }, + { + "epoch": 0.9575, + "grad_norm": 17.59610939025879, + "learning_rate": 6.17176e-07, + "loss": 0.4987, + "step": 95750 + }, + { + "epoch": 0.958, + "grad_norm": 80.921630859375, + "learning_rate": 6.16976e-07, + "loss": 0.3194, + "step": 95800 + }, + { + "epoch": 0.9585, + "grad_norm": 84.2488784790039, + "learning_rate": 6.167759999999999e-07, + "loss": 0.5606, + "step": 95850 + }, + { + "epoch": 0.959, + "grad_norm": 0.6134811639785767, + "learning_rate": 6.16576e-07, + "loss": 0.4759, + "step": 95900 + }, + { + "epoch": 0.9595, + "grad_norm": 15.289545059204102, + "learning_rate": 6.16376e-07, + "loss": 0.5179, + "step": 95950 + }, + { + "epoch": 0.96, + "grad_norm": 41.45950698852539, + "learning_rate": 6.161759999999999e-07, + "loss": 0.6042, + "step": 96000 + }, + { + "epoch": 0.9605, + "grad_norm": 2.4903767108917236, + "learning_rate": 6.15976e-07, + "loss": 0.6048, + "step": 96050 + }, + { + "epoch": 0.961, + "grad_norm": 82.29048919677734, + "learning_rate": 6.15776e-07, + "loss": 0.4689, + "step": 96100 + }, + { + "epoch": 0.9615, + "grad_norm": 10.290043830871582, + "learning_rate": 6.15576e-07, + "loss": 0.4352, + "step": 96150 + }, + { + "epoch": 0.962, + "grad_norm": 56.273529052734375, + "learning_rate": 6.15376e-07, + "loss": 0.5958, + "step": 96200 + }, + { + "epoch": 0.9625, + "grad_norm": 58.597835540771484, + "learning_rate": 6.151759999999999e-07, + "loss": 0.4868, + "step": 96250 + }, + { + "epoch": 0.963, + "grad_norm": 15.38319206237793, + "learning_rate": 6.149759999999999e-07, + "loss": 0.4837, + "step": 96300 + }, + { + "epoch": 0.9635, + "grad_norm": 40.752864837646484, + "learning_rate": 6.14776e-07, + "loss": 0.4492, + "step": 96350 + }, + { + "epoch": 0.964, + "grad_norm": 7.580194473266602, + "learning_rate": 6.14576e-07, + "loss": 0.5223, + "step": 96400 + }, + { + "epoch": 0.9645, + "grad_norm": 33.51999282836914, + "learning_rate": 6.14376e-07, + "loss": 0.5388, + "step": 96450 + }, + { + "epoch": 0.965, + "grad_norm": 60.9385986328125, + "learning_rate": 6.141760000000001e-07, + "loss": 0.5853, + "step": 96500 + }, + { + "epoch": 0.9655, + "grad_norm": 72.08675384521484, + "learning_rate": 6.139759999999999e-07, + "loss": 0.567, + "step": 96550 + }, + { + "epoch": 0.966, + "grad_norm": 27.145662307739258, + "learning_rate": 6.137759999999999e-07, + "loss": 0.5532, + "step": 96600 + }, + { + "epoch": 0.9665, + "grad_norm": 91.9208984375, + "learning_rate": 6.1358e-07, + "loss": 0.4526, + "step": 96650 + }, + { + "epoch": 0.967, + "grad_norm": 6.773651123046875, + "learning_rate": 6.1338e-07, + "loss": 0.467, + "step": 96700 + }, + { + "epoch": 0.9675, + "grad_norm": 54.172584533691406, + "learning_rate": 6.131799999999999e-07, + "loss": 0.5223, + "step": 96750 + }, + { + "epoch": 0.968, + "grad_norm": 23.472658157348633, + "learning_rate": 6.1298e-07, + "loss": 0.4758, + "step": 96800 + }, + { + "epoch": 0.9685, + "grad_norm": 104.0491714477539, + "learning_rate": 6.1278e-07, + "loss": 0.5201, + "step": 96850 + }, + { + "epoch": 0.969, + "grad_norm": 11.634442329406738, + "learning_rate": 6.125799999999999e-07, + "loss": 0.4486, + "step": 96900 + }, + { + "epoch": 0.9695, + "grad_norm": 24.342527389526367, + "learning_rate": 6.1238e-07, + "loss": 0.5842, + "step": 96950 + }, + { + "epoch": 0.97, + "grad_norm": 15.33423900604248, + "learning_rate": 6.121799999999999e-07, + "loss": 0.4822, + "step": 97000 + }, + { + "epoch": 0.9705, + "grad_norm": 73.634033203125, + "learning_rate": 6.119799999999999e-07, + "loss": 0.4194, + "step": 97050 + }, + { + "epoch": 0.971, + "grad_norm": 90.969970703125, + "learning_rate": 6.1178e-07, + "loss": 0.6029, + "step": 97100 + }, + { + "epoch": 0.9715, + "grad_norm": 17.80976104736328, + "learning_rate": 6.1158e-07, + "loss": 0.4477, + "step": 97150 + }, + { + "epoch": 0.972, + "grad_norm": 81.571533203125, + "learning_rate": 6.1138e-07, + "loss": 0.7005, + "step": 97200 + }, + { + "epoch": 0.9725, + "grad_norm": 41.37546157836914, + "learning_rate": 6.111799999999999e-07, + "loss": 0.5194, + "step": 97250 + }, + { + "epoch": 0.973, + "grad_norm": 100.39408874511719, + "learning_rate": 6.109799999999999e-07, + "loss": 0.3624, + "step": 97300 + }, + { + "epoch": 0.9735, + "grad_norm": 65.12748718261719, + "learning_rate": 6.107799999999999e-07, + "loss": 0.5751, + "step": 97350 + }, + { + "epoch": 0.974, + "grad_norm": 33.728546142578125, + "learning_rate": 6.1058e-07, + "loss": 0.521, + "step": 97400 + }, + { + "epoch": 0.9745, + "grad_norm": 69.31006622314453, + "learning_rate": 6.10384e-07, + "loss": 0.4695, + "step": 97450 + }, + { + "epoch": 0.975, + "grad_norm": 38.2792854309082, + "learning_rate": 6.101839999999999e-07, + "loss": 0.3575, + "step": 97500 + }, + { + "epoch": 0.9755, + "grad_norm": 69.43871307373047, + "learning_rate": 6.09984e-07, + "loss": 0.3759, + "step": 97550 + }, + { + "epoch": 0.976, + "grad_norm": 68.89128112792969, + "learning_rate": 6.09784e-07, + "loss": 0.4355, + "step": 97600 + }, + { + "epoch": 0.9765, + "grad_norm": 41.74790573120117, + "learning_rate": 6.095839999999999e-07, + "loss": 0.6918, + "step": 97650 + }, + { + "epoch": 0.977, + "grad_norm": 12.850433349609375, + "learning_rate": 6.09384e-07, + "loss": 0.556, + "step": 97700 + }, + { + "epoch": 0.9775, + "grad_norm": 43.75520324707031, + "learning_rate": 6.091839999999999e-07, + "loss": 0.4272, + "step": 97750 + }, + { + "epoch": 0.978, + "grad_norm": 9.665956497192383, + "learning_rate": 6.089839999999999e-07, + "loss": 0.4407, + "step": 97800 + }, + { + "epoch": 0.9785, + "grad_norm": 22.188209533691406, + "learning_rate": 6.08784e-07, + "loss": 0.4733, + "step": 97850 + }, + { + "epoch": 0.979, + "grad_norm": 108.53839111328125, + "learning_rate": 6.08584e-07, + "loss": 0.5335, + "step": 97900 + }, + { + "epoch": 0.9795, + "grad_norm": 58.352508544921875, + "learning_rate": 6.083840000000001e-07, + "loss": 0.478, + "step": 97950 + }, + { + "epoch": 0.98, + "grad_norm": 57.22300338745117, + "learning_rate": 6.081839999999999e-07, + "loss": 0.543, + "step": 98000 + }, + { + "epoch": 0.9805, + "grad_norm": 42.58542251586914, + "learning_rate": 6.079839999999999e-07, + "loss": 0.4557, + "step": 98050 + }, + { + "epoch": 0.981, + "grad_norm": 73.4317626953125, + "learning_rate": 6.07784e-07, + "loss": 0.4937, + "step": 98100 + }, + { + "epoch": 0.9815, + "grad_norm": 24.243736267089844, + "learning_rate": 6.07584e-07, + "loss": 0.4762, + "step": 98150 + }, + { + "epoch": 0.982, + "grad_norm": 0.41077935695648193, + "learning_rate": 6.07384e-07, + "loss": 0.5536, + "step": 98200 + }, + { + "epoch": 0.9825, + "grad_norm": 3.0135037899017334, + "learning_rate": 6.07184e-07, + "loss": 0.3889, + "step": 98250 + }, + { + "epoch": 0.983, + "grad_norm": 27.06598472595215, + "learning_rate": 6.06984e-07, + "loss": 0.5288, + "step": 98300 + }, + { + "epoch": 0.9835, + "grad_norm": 79.37501525878906, + "learning_rate": 6.067839999999999e-07, + "loss": 0.5145, + "step": 98350 + }, + { + "epoch": 0.984, + "grad_norm": 10.967093467712402, + "learning_rate": 6.06584e-07, + "loss": 0.5057, + "step": 98400 + }, + { + "epoch": 0.9845, + "grad_norm": 86.88298797607422, + "learning_rate": 6.06384e-07, + "loss": 0.4921, + "step": 98450 + }, + { + "epoch": 0.985, + "grad_norm": 83.58354187011719, + "learning_rate": 6.061839999999999e-07, + "loss": 0.6873, + "step": 98500 + }, + { + "epoch": 0.9855, + "grad_norm": 54.29396057128906, + "learning_rate": 6.05984e-07, + "loss": 0.5628, + "step": 98550 + }, + { + "epoch": 0.986, + "grad_norm": 93.75994873046875, + "learning_rate": 6.05784e-07, + "loss": 0.4604, + "step": 98600 + }, + { + "epoch": 0.9865, + "grad_norm": 26.21271324157715, + "learning_rate": 6.05584e-07, + "loss": 0.47, + "step": 98650 + }, + { + "epoch": 0.987, + "grad_norm": 75.35508728027344, + "learning_rate": 6.05384e-07, + "loss": 0.4449, + "step": 98700 + }, + { + "epoch": 0.9875, + "grad_norm": 34.0965690612793, + "learning_rate": 6.05184e-07, + "loss": 0.4985, + "step": 98750 + }, + { + "epoch": 0.988, + "grad_norm": 15.451560974121094, + "learning_rate": 6.049839999999999e-07, + "loss": 0.4956, + "step": 98800 + }, + { + "epoch": 0.9885, + "grad_norm": 6.660648822784424, + "learning_rate": 6.04784e-07, + "loss": 0.3919, + "step": 98850 + }, + { + "epoch": 0.989, + "grad_norm": 39.9170036315918, + "learning_rate": 6.04584e-07, + "loss": 0.497, + "step": 98900 + }, + { + "epoch": 0.9895, + "grad_norm": 33.44998550415039, + "learning_rate": 6.04384e-07, + "loss": 0.6224, + "step": 98950 + }, + { + "epoch": 0.99, + "grad_norm": 28.230859756469727, + "learning_rate": 6.041840000000001e-07, + "loss": 0.4241, + "step": 99000 + }, + { + "epoch": 0.9905, + "grad_norm": 104.45681762695312, + "learning_rate": 6.039839999999999e-07, + "loss": 0.4589, + "step": 99050 + }, + { + "epoch": 0.991, + "grad_norm": 8.24346923828125, + "learning_rate": 6.037839999999999e-07, + "loss": 0.4088, + "step": 99100 + }, + { + "epoch": 0.9915, + "grad_norm": 73.47013092041016, + "learning_rate": 6.03584e-07, + "loss": 0.4545, + "step": 99150 + }, + { + "epoch": 0.992, + "grad_norm": 88.34424591064453, + "learning_rate": 6.03384e-07, + "loss": 0.4967, + "step": 99200 + }, + { + "epoch": 0.9925, + "grad_norm": 19.245553970336914, + "learning_rate": 6.03184e-07, + "loss": 0.3781, + "step": 99250 + }, + { + "epoch": 0.993, + "grad_norm": 24.192604064941406, + "learning_rate": 6.02984e-07, + "loss": 0.5745, + "step": 99300 + }, + { + "epoch": 0.9935, + "grad_norm": 30.267501831054688, + "learning_rate": 6.02784e-07, + "loss": 0.4486, + "step": 99350 + }, + { + "epoch": 0.994, + "grad_norm": 102.20484161376953, + "learning_rate": 6.025839999999999e-07, + "loss": 0.5407, + "step": 99400 + }, + { + "epoch": 0.9945, + "grad_norm": 65.89080810546875, + "learning_rate": 6.02384e-07, + "loss": 0.5351, + "step": 99450 + }, + { + "epoch": 0.995, + "grad_norm": 84.1611557006836, + "learning_rate": 6.02184e-07, + "loss": 0.4555, + "step": 99500 + }, + { + "epoch": 0.9955, + "grad_norm": 4.047856330871582, + "learning_rate": 6.019839999999999e-07, + "loss": 0.4759, + "step": 99550 + }, + { + "epoch": 0.996, + "grad_norm": 8.221104621887207, + "learning_rate": 6.01784e-07, + "loss": 0.3095, + "step": 99600 + }, + { + "epoch": 0.9965, + "grad_norm": 56.98789978027344, + "learning_rate": 6.01584e-07, + "loss": 0.4643, + "step": 99650 + }, + { + "epoch": 0.997, + "grad_norm": 12.39257526397705, + "learning_rate": 6.01384e-07, + "loss": 0.3251, + "step": 99700 + }, + { + "epoch": 0.9975, + "grad_norm": 20.75105094909668, + "learning_rate": 6.01184e-07, + "loss": 0.5136, + "step": 99750 + }, + { + "epoch": 0.998, + "grad_norm": 47.02289962768555, + "learning_rate": 6.009839999999999e-07, + "loss": 0.4591, + "step": 99800 + }, + { + "epoch": 0.9985, + "grad_norm": 97.42501831054688, + "learning_rate": 6.007839999999999e-07, + "loss": 0.5596, + "step": 99850 + }, + { + "epoch": 0.999, + "grad_norm": 66.5621566772461, + "learning_rate": 6.00584e-07, + "loss": 0.4794, + "step": 99900 + }, + { + "epoch": 0.9995, + "grad_norm": 9.777603149414062, + "learning_rate": 6.00384e-07, + "loss": 0.4818, + "step": 99950 + }, + { + "epoch": 1.0, + "grad_norm": 66.84056854248047, + "learning_rate": 6.001840000000001e-07, + "loss": 0.4902, + "step": 100000 + }, + { + "epoch": 1.0005, + "grad_norm": 0.26840323209762573, + "learning_rate": 5.99984e-07, + "loss": 0.4989, + "step": 100050 + }, + { + "epoch": 1.001, + "grad_norm": 1.4899623394012451, + "learning_rate": 5.997839999999999e-07, + "loss": 0.466, + "step": 100100 + }, + { + "epoch": 1.0015, + "grad_norm": 56.374874114990234, + "learning_rate": 5.99584e-07, + "loss": 0.5739, + "step": 100150 + }, + { + "epoch": 1.002, + "grad_norm": 28.144792556762695, + "learning_rate": 5.99384e-07, + "loss": 0.5244, + "step": 100200 + }, + { + "epoch": 1.0025, + "grad_norm": 41.527069091796875, + "learning_rate": 5.99184e-07, + "loss": 0.4076, + "step": 100250 + }, + { + "epoch": 1.003, + "grad_norm": 24.504615783691406, + "learning_rate": 5.98984e-07, + "loss": 0.4945, + "step": 100300 + }, + { + "epoch": 1.0035, + "grad_norm": 59.61265563964844, + "learning_rate": 5.98784e-07, + "loss": 0.5471, + "step": 100350 + }, + { + "epoch": 1.004, + "grad_norm": 4.118966102600098, + "learning_rate": 5.98584e-07, + "loss": 0.5698, + "step": 100400 + }, + { + "epoch": 1.0045, + "grad_norm": 83.80474853515625, + "learning_rate": 5.98384e-07, + "loss": 0.4028, + "step": 100450 + }, + { + "epoch": 1.005, + "grad_norm": 82.36299133300781, + "learning_rate": 5.98184e-07, + "loss": 0.3842, + "step": 100500 + }, + { + "epoch": 1.0055, + "grad_norm": 26.00364112854004, + "learning_rate": 5.979839999999999e-07, + "loss": 0.4643, + "step": 100550 + }, + { + "epoch": 1.006, + "grad_norm": 3.6861441135406494, + "learning_rate": 5.97784e-07, + "loss": 0.5716, + "step": 100600 + }, + { + "epoch": 1.0065, + "grad_norm": 9.791892051696777, + "learning_rate": 5.97584e-07, + "loss": 0.4626, + "step": 100650 + }, + { + "epoch": 1.007, + "grad_norm": 32.55017852783203, + "learning_rate": 5.97384e-07, + "loss": 0.3589, + "step": 100700 + }, + { + "epoch": 1.0075, + "grad_norm": 102.46571350097656, + "learning_rate": 5.971840000000001e-07, + "loss": 0.5368, + "step": 100750 + }, + { + "epoch": 1.008, + "grad_norm": 3.5553598403930664, + "learning_rate": 5.969839999999999e-07, + "loss": 0.4756, + "step": 100800 + }, + { + "epoch": 1.0085, + "grad_norm": 84.53421020507812, + "learning_rate": 5.967839999999999e-07, + "loss": 0.4855, + "step": 100850 + }, + { + "epoch": 1.009, + "grad_norm": 50.297672271728516, + "learning_rate": 5.96584e-07, + "loss": 0.4461, + "step": 100900 + }, + { + "epoch": 1.0095, + "grad_norm": 46.95347213745117, + "learning_rate": 5.96384e-07, + "loss": 0.2661, + "step": 100950 + }, + { + "epoch": 1.01, + "grad_norm": 46.71750259399414, + "learning_rate": 5.96184e-07, + "loss": 0.5056, + "step": 101000 + }, + { + "epoch": 1.0105, + "grad_norm": 65.1976547241211, + "learning_rate": 5.95984e-07, + "loss": 0.4613, + "step": 101050 + }, + { + "epoch": 1.011, + "grad_norm": 53.80318069458008, + "learning_rate": 5.95784e-07, + "loss": 0.4472, + "step": 101100 + }, + { + "epoch": 1.0115, + "grad_norm": 82.39476776123047, + "learning_rate": 5.955839999999999e-07, + "loss": 0.5717, + "step": 101150 + }, + { + "epoch": 1.012, + "grad_norm": 39.730159759521484, + "learning_rate": 5.95384e-07, + "loss": 0.4549, + "step": 101200 + }, + { + "epoch": 1.0125, + "grad_norm": 106.95108795166016, + "learning_rate": 5.95184e-07, + "loss": 0.489, + "step": 101250 + }, + { + "epoch": 1.013, + "grad_norm": 48.25342559814453, + "learning_rate": 5.949839999999999e-07, + "loss": 0.4537, + "step": 101300 + }, + { + "epoch": 1.0135, + "grad_norm": 2.1384692192077637, + "learning_rate": 5.94784e-07, + "loss": 0.6137, + "step": 101350 + }, + { + "epoch": 1.014, + "grad_norm": 59.659515380859375, + "learning_rate": 5.94584e-07, + "loss": 0.4321, + "step": 101400 + }, + { + "epoch": 1.0145, + "grad_norm": 60.75330352783203, + "learning_rate": 5.94384e-07, + "loss": 0.4137, + "step": 101450 + }, + { + "epoch": 1.015, + "grad_norm": 84.83941650390625, + "learning_rate": 5.94184e-07, + "loss": 0.7533, + "step": 101500 + }, + { + "epoch": 1.0155, + "grad_norm": 13.40371036529541, + "learning_rate": 5.939839999999999e-07, + "loss": 0.4591, + "step": 101550 + }, + { + "epoch": 1.016, + "grad_norm": 38.117313385009766, + "learning_rate": 5.937839999999999e-07, + "loss": 0.5045, + "step": 101600 + }, + { + "epoch": 1.0165, + "grad_norm": 1.0856471061706543, + "learning_rate": 5.93584e-07, + "loss": 0.3855, + "step": 101650 + }, + { + "epoch": 1.017, + "grad_norm": 1.0429091453552246, + "learning_rate": 5.93384e-07, + "loss": 0.4512, + "step": 101700 + }, + { + "epoch": 1.0175, + "grad_norm": 19.322357177734375, + "learning_rate": 5.93184e-07, + "loss": 0.3774, + "step": 101750 + }, + { + "epoch": 1.018, + "grad_norm": 104.85911560058594, + "learning_rate": 5.92984e-07, + "loss": 0.4906, + "step": 101800 + }, + { + "epoch": 1.0185, + "grad_norm": 48.950130462646484, + "learning_rate": 5.927839999999999e-07, + "loss": 0.4659, + "step": 101850 + }, + { + "epoch": 1.019, + "grad_norm": 40.40967559814453, + "learning_rate": 5.925839999999999e-07, + "loss": 0.4826, + "step": 101900 + }, + { + "epoch": 1.0195, + "grad_norm": 32.901214599609375, + "learning_rate": 5.92384e-07, + "loss": 0.3543, + "step": 101950 + }, + { + "epoch": 1.02, + "grad_norm": 22.917282104492188, + "learning_rate": 5.92184e-07, + "loss": 0.3971, + "step": 102000 + }, + { + "epoch": 1.0205, + "grad_norm": 70.27293395996094, + "learning_rate": 5.919839999999999e-07, + "loss": 0.4203, + "step": 102050 + }, + { + "epoch": 1.021, + "grad_norm": 58.08955001831055, + "learning_rate": 5.91784e-07, + "loss": 0.4806, + "step": 102100 + }, + { + "epoch": 1.0215, + "grad_norm": 0.6320064663887024, + "learning_rate": 5.91584e-07, + "loss": 0.4064, + "step": 102150 + }, + { + "epoch": 1.022, + "grad_norm": 13.316550254821777, + "learning_rate": 5.913839999999999e-07, + "loss": 0.4039, + "step": 102200 + }, + { + "epoch": 1.0225, + "grad_norm": 86.25184631347656, + "learning_rate": 5.91184e-07, + "loss": 0.389, + "step": 102250 + }, + { + "epoch": 1.023, + "grad_norm": 11.370412826538086, + "learning_rate": 5.909839999999999e-07, + "loss": 0.4349, + "step": 102300 + }, + { + "epoch": 1.0235, + "grad_norm": 7.496046543121338, + "learning_rate": 5.907839999999999e-07, + "loss": 0.4676, + "step": 102350 + }, + { + "epoch": 1.024, + "grad_norm": 15.750993728637695, + "learning_rate": 5.90584e-07, + "loss": 0.5462, + "step": 102400 + }, + { + "epoch": 1.0245, + "grad_norm": 108.76589965820312, + "learning_rate": 5.90384e-07, + "loss": 0.4425, + "step": 102450 + }, + { + "epoch": 1.025, + "grad_norm": 85.9919204711914, + "learning_rate": 5.901840000000001e-07, + "loss": 0.4755, + "step": 102500 + }, + { + "epoch": 1.0255, + "grad_norm": 84.9974594116211, + "learning_rate": 5.899839999999999e-07, + "loss": 0.3698, + "step": 102550 + }, + { + "epoch": 1.026, + "grad_norm": 53.898643493652344, + "learning_rate": 5.897839999999999e-07, + "loss": 0.5112, + "step": 102600 + }, + { + "epoch": 1.0265, + "grad_norm": 50.896507263183594, + "learning_rate": 5.89584e-07, + "loss": 0.3905, + "step": 102650 + }, + { + "epoch": 1.027, + "grad_norm": 78.91048431396484, + "learning_rate": 5.89384e-07, + "loss": 0.4481, + "step": 102700 + }, + { + "epoch": 1.0275, + "grad_norm": 82.20804595947266, + "learning_rate": 5.89184e-07, + "loss": 0.4744, + "step": 102750 + }, + { + "epoch": 1.028, + "grad_norm": 72.1124038696289, + "learning_rate": 5.88984e-07, + "loss": 0.5971, + "step": 102800 + }, + { + "epoch": 1.0285, + "grad_norm": 6.744050979614258, + "learning_rate": 5.88784e-07, + "loss": 0.4036, + "step": 102850 + }, + { + "epoch": 1.029, + "grad_norm": 102.9438247680664, + "learning_rate": 5.885839999999999e-07, + "loss": 0.3638, + "step": 102900 + }, + { + "epoch": 1.0295, + "grad_norm": 91.29195404052734, + "learning_rate": 5.88384e-07, + "loss": 0.4288, + "step": 102950 + }, + { + "epoch": 1.03, + "grad_norm": 10.420190811157227, + "learning_rate": 5.88184e-07, + "loss": 0.5381, + "step": 103000 + }, + { + "epoch": 1.0305, + "grad_norm": 3.8758227825164795, + "learning_rate": 5.879839999999999e-07, + "loss": 0.4904, + "step": 103050 + }, + { + "epoch": 1.031, + "grad_norm": 42.36983871459961, + "learning_rate": 5.87784e-07, + "loss": 0.4687, + "step": 103100 + }, + { + "epoch": 1.0315, + "grad_norm": 4.480782508850098, + "learning_rate": 5.87584e-07, + "loss": 0.3783, + "step": 103150 + }, + { + "epoch": 1.032, + "grad_norm": 16.480192184448242, + "learning_rate": 5.87384e-07, + "loss": 0.4104, + "step": 103200 + }, + { + "epoch": 1.0325, + "grad_norm": 56.70964050292969, + "learning_rate": 5.87184e-07, + "loss": 0.4744, + "step": 103250 + }, + { + "epoch": 1.033, + "grad_norm": 79.56087493896484, + "learning_rate": 5.869839999999999e-07, + "loss": 0.4447, + "step": 103300 + }, + { + "epoch": 1.0335, + "grad_norm": 60.962284088134766, + "learning_rate": 5.867839999999999e-07, + "loss": 0.5019, + "step": 103350 + }, + { + "epoch": 1.034, + "grad_norm": 58.27140808105469, + "learning_rate": 5.86584e-07, + "loss": 0.5927, + "step": 103400 + }, + { + "epoch": 1.0345, + "grad_norm": 39.9395637512207, + "learning_rate": 5.86384e-07, + "loss": 0.4556, + "step": 103450 + }, + { + "epoch": 1.035, + "grad_norm": 10.879745483398438, + "learning_rate": 5.86184e-07, + "loss": 0.5386, + "step": 103500 + }, + { + "epoch": 1.0355, + "grad_norm": 14.985457420349121, + "learning_rate": 5.85984e-07, + "loss": 0.4561, + "step": 103550 + }, + { + "epoch": 1.036, + "grad_norm": 0.901373565196991, + "learning_rate": 5.85788e-07, + "loss": 0.486, + "step": 103600 + }, + { + "epoch": 1.0365, + "grad_norm": 76.16960906982422, + "learning_rate": 5.855879999999999e-07, + "loss": 0.5131, + "step": 103650 + }, + { + "epoch": 1.037, + "grad_norm": 71.22393798828125, + "learning_rate": 5.85388e-07, + "loss": 0.6253, + "step": 103700 + }, + { + "epoch": 1.0375, + "grad_norm": 72.47522735595703, + "learning_rate": 5.85188e-07, + "loss": 0.4038, + "step": 103750 + }, + { + "epoch": 1.038, + "grad_norm": 35.06593322753906, + "learning_rate": 5.849879999999999e-07, + "loss": 0.4235, + "step": 103800 + }, + { + "epoch": 1.0385, + "grad_norm": 58.12506866455078, + "learning_rate": 5.84788e-07, + "loss": 0.4734, + "step": 103850 + }, + { + "epoch": 1.039, + "grad_norm": 70.32412719726562, + "learning_rate": 5.84588e-07, + "loss": 0.4489, + "step": 103900 + }, + { + "epoch": 1.0395, + "grad_norm": 94.34310150146484, + "learning_rate": 5.84388e-07, + "loss": 0.5206, + "step": 103950 + }, + { + "epoch": 1.04, + "grad_norm": 96.84242248535156, + "learning_rate": 5.84188e-07, + "loss": 0.4214, + "step": 104000 + }, + { + "epoch": 1.0405, + "grad_norm": 50.48087692260742, + "learning_rate": 5.839879999999999e-07, + "loss": 0.4325, + "step": 104050 + }, + { + "epoch": 1.041, + "grad_norm": 15.453256607055664, + "learning_rate": 5.837879999999999e-07, + "loss": 0.4099, + "step": 104100 + }, + { + "epoch": 1.0415, + "grad_norm": 7.770763397216797, + "learning_rate": 5.83588e-07, + "loss": 0.5572, + "step": 104150 + }, + { + "epoch": 1.042, + "grad_norm": 75.37487030029297, + "learning_rate": 5.83388e-07, + "loss": 0.6112, + "step": 104200 + }, + { + "epoch": 1.0425, + "grad_norm": 3.0867295265197754, + "learning_rate": 5.83188e-07, + "loss": 0.4438, + "step": 104250 + }, + { + "epoch": 1.043, + "grad_norm": 82.33965301513672, + "learning_rate": 5.82988e-07, + "loss": 0.5488, + "step": 104300 + }, + { + "epoch": 1.0435, + "grad_norm": 108.06182861328125, + "learning_rate": 5.827879999999999e-07, + "loss": 0.4588, + "step": 104350 + }, + { + "epoch": 1.044, + "grad_norm": 7.8545660972595215, + "learning_rate": 5.825879999999999e-07, + "loss": 0.4324, + "step": 104400 + }, + { + "epoch": 1.0445, + "grad_norm": 105.07747650146484, + "learning_rate": 5.82388e-07, + "loss": 0.4492, + "step": 104450 + }, + { + "epoch": 1.045, + "grad_norm": 50.81571578979492, + "learning_rate": 5.82188e-07, + "loss": 0.6157, + "step": 104500 + }, + { + "epoch": 1.0455, + "grad_norm": 102.53087615966797, + "learning_rate": 5.819879999999999e-07, + "loss": 0.3951, + "step": 104550 + }, + { + "epoch": 1.046, + "grad_norm": 40.119380950927734, + "learning_rate": 5.81788e-07, + "loss": 0.4466, + "step": 104600 + }, + { + "epoch": 1.0465, + "grad_norm": 98.8756103515625, + "learning_rate": 5.81588e-07, + "loss": 0.4351, + "step": 104650 + }, + { + "epoch": 1.047, + "grad_norm": 7.848785400390625, + "learning_rate": 5.813879999999999e-07, + "loss": 0.4286, + "step": 104700 + }, + { + "epoch": 1.0475, + "grad_norm": 57.9787712097168, + "learning_rate": 5.81188e-07, + "loss": 0.4914, + "step": 104750 + }, + { + "epoch": 1.048, + "grad_norm": 34.086063385009766, + "learning_rate": 5.809879999999999e-07, + "loss": 0.5624, + "step": 104800 + }, + { + "epoch": 1.0485, + "grad_norm": 38.03075408935547, + "learning_rate": 5.807879999999999e-07, + "loss": 0.4121, + "step": 104850 + }, + { + "epoch": 1.049, + "grad_norm": 113.10247039794922, + "learning_rate": 5.80588e-07, + "loss": 0.5234, + "step": 104900 + }, + { + "epoch": 1.0495, + "grad_norm": 67.31755828857422, + "learning_rate": 5.80388e-07, + "loss": 0.4457, + "step": 104950 + }, + { + "epoch": 1.05, + "grad_norm": 4.373214244842529, + "learning_rate": 5.801880000000001e-07, + "loss": 0.4674, + "step": 105000 + }, + { + "epoch": 1.0505, + "grad_norm": 24.029144287109375, + "learning_rate": 5.799879999999999e-07, + "loss": 0.4393, + "step": 105050 + }, + { + "epoch": 1.051, + "grad_norm": 76.45175170898438, + "learning_rate": 5.797879999999999e-07, + "loss": 0.6139, + "step": 105100 + }, + { + "epoch": 1.0515, + "grad_norm": 90.8389663696289, + "learning_rate": 5.79588e-07, + "loss": 0.4096, + "step": 105150 + }, + { + "epoch": 1.052, + "grad_norm": 52.22407531738281, + "learning_rate": 5.79388e-07, + "loss": 0.4983, + "step": 105200 + }, + { + "epoch": 1.0525, + "grad_norm": 28.466354370117188, + "learning_rate": 5.79188e-07, + "loss": 0.4473, + "step": 105250 + }, + { + "epoch": 1.053, + "grad_norm": 74.26171112060547, + "learning_rate": 5.78988e-07, + "loss": 0.5071, + "step": 105300 + }, + { + "epoch": 1.0535, + "grad_norm": 45.36643981933594, + "learning_rate": 5.78788e-07, + "loss": 0.4621, + "step": 105350 + }, + { + "epoch": 1.054, + "grad_norm": 22.525741577148438, + "learning_rate": 5.785879999999999e-07, + "loss": 0.5238, + "step": 105400 + }, + { + "epoch": 1.0545, + "grad_norm": 25.078907012939453, + "learning_rate": 5.78388e-07, + "loss": 0.472, + "step": 105450 + }, + { + "epoch": 1.055, + "grad_norm": 29.455020904541016, + "learning_rate": 5.78188e-07, + "loss": 0.3387, + "step": 105500 + }, + { + "epoch": 1.0555, + "grad_norm": 4.957467555999756, + "learning_rate": 5.779879999999999e-07, + "loss": 0.4357, + "step": 105550 + }, + { + "epoch": 1.056, + "grad_norm": 56.372745513916016, + "learning_rate": 5.77788e-07, + "loss": 0.5183, + "step": 105600 + }, + { + "epoch": 1.0565, + "grad_norm": 1.047702670097351, + "learning_rate": 5.77588e-07, + "loss": 0.3755, + "step": 105650 + }, + { + "epoch": 1.057, + "grad_norm": 26.11679458618164, + "learning_rate": 5.77388e-07, + "loss": 0.431, + "step": 105700 + }, + { + "epoch": 1.0575, + "grad_norm": 81.7731704711914, + "learning_rate": 5.77192e-07, + "loss": 0.642, + "step": 105750 + }, + { + "epoch": 1.058, + "grad_norm": 2.761612892150879, + "learning_rate": 5.769919999999999e-07, + "loss": 0.4243, + "step": 105800 + }, + { + "epoch": 1.0585, + "grad_norm": 16.649229049682617, + "learning_rate": 5.767919999999999e-07, + "loss": 0.4652, + "step": 105850 + }, + { + "epoch": 1.059, + "grad_norm": 74.75067138671875, + "learning_rate": 5.76592e-07, + "loss": 0.4756, + "step": 105900 + }, + { + "epoch": 1.0594999999999999, + "grad_norm": 6.984798431396484, + "learning_rate": 5.76392e-07, + "loss": 0.4214, + "step": 105950 + }, + { + "epoch": 1.06, + "grad_norm": 40.673606872558594, + "learning_rate": 5.76192e-07, + "loss": 0.5516, + "step": 106000 + }, + { + "epoch": 1.0605, + "grad_norm": 111.4358901977539, + "learning_rate": 5.75992e-07, + "loss": 0.4571, + "step": 106050 + }, + { + "epoch": 1.061, + "grad_norm": 22.08611297607422, + "learning_rate": 5.757919999999999e-07, + "loss": 0.4519, + "step": 106100 + }, + { + "epoch": 1.0615, + "grad_norm": 6.3416666984558105, + "learning_rate": 5.755919999999999e-07, + "loss": 0.3882, + "step": 106150 + }, + { + "epoch": 1.062, + "grad_norm": 18.259502410888672, + "learning_rate": 5.75392e-07, + "loss": 0.3961, + "step": 106200 + }, + { + "epoch": 1.0625, + "grad_norm": 76.18441772460938, + "learning_rate": 5.75192e-07, + "loss": 0.4029, + "step": 106250 + }, + { + "epoch": 1.063, + "grad_norm": 47.05423355102539, + "learning_rate": 5.749919999999999e-07, + "loss": 0.3685, + "step": 106300 + }, + { + "epoch": 1.0635, + "grad_norm": 69.71207427978516, + "learning_rate": 5.74792e-07, + "loss": 0.4922, + "step": 106350 + }, + { + "epoch": 1.064, + "grad_norm": 32.429664611816406, + "learning_rate": 5.74592e-07, + "loss": 0.6055, + "step": 106400 + }, + { + "epoch": 1.0645, + "grad_norm": 105.841552734375, + "learning_rate": 5.743919999999999e-07, + "loss": 0.4959, + "step": 106450 + }, + { + "epoch": 1.065, + "grad_norm": 65.44140625, + "learning_rate": 5.74192e-07, + "loss": 0.4779, + "step": 106500 + }, + { + "epoch": 1.0655000000000001, + "grad_norm": 31.591449737548828, + "learning_rate": 5.739919999999999e-07, + "loss": 0.5328, + "step": 106550 + }, + { + "epoch": 1.066, + "grad_norm": 39.16038513183594, + "learning_rate": 5.737919999999999e-07, + "loss": 0.4452, + "step": 106600 + }, + { + "epoch": 1.0665, + "grad_norm": 0.7136451601982117, + "learning_rate": 5.73592e-07, + "loss": 0.4545, + "step": 106650 + }, + { + "epoch": 1.067, + "grad_norm": 78.04670715332031, + "learning_rate": 5.73392e-07, + "loss": 0.5004, + "step": 106700 + }, + { + "epoch": 1.0675, + "grad_norm": 77.70978546142578, + "learning_rate": 5.73192e-07, + "loss": 0.4688, + "step": 106750 + }, + { + "epoch": 1.068, + "grad_norm": 2.930210590362549, + "learning_rate": 5.729919999999999e-07, + "loss": 0.504, + "step": 106800 + }, + { + "epoch": 1.0685, + "grad_norm": 2.333472967147827, + "learning_rate": 5.727919999999999e-07, + "loss": 0.4608, + "step": 106850 + }, + { + "epoch": 1.069, + "grad_norm": 96.40602111816406, + "learning_rate": 5.725919999999999e-07, + "loss": 0.4855, + "step": 106900 + }, + { + "epoch": 1.0695000000000001, + "grad_norm": 147.89405822753906, + "learning_rate": 5.72392e-07, + "loss": 0.6196, + "step": 106950 + }, + { + "epoch": 1.07, + "grad_norm": 1.7213540077209473, + "learning_rate": 5.72192e-07, + "loss": 0.4273, + "step": 107000 + }, + { + "epoch": 1.0705, + "grad_norm": 105.5814437866211, + "learning_rate": 5.719919999999999e-07, + "loss": 0.5419, + "step": 107050 + }, + { + "epoch": 1.071, + "grad_norm": 104.46412658691406, + "learning_rate": 5.71792e-07, + "loss": 0.4807, + "step": 107100 + }, + { + "epoch": 1.0715, + "grad_norm": 51.38224792480469, + "learning_rate": 5.71592e-07, + "loss": 0.505, + "step": 107150 + }, + { + "epoch": 1.072, + "grad_norm": 59.694828033447266, + "learning_rate": 5.713919999999999e-07, + "loss": 0.3088, + "step": 107200 + }, + { + "epoch": 1.0725, + "grad_norm": 17.387590408325195, + "learning_rate": 5.71192e-07, + "loss": 0.5795, + "step": 107250 + }, + { + "epoch": 1.073, + "grad_norm": 70.23807525634766, + "learning_rate": 5.70992e-07, + "loss": 0.4044, + "step": 107300 + }, + { + "epoch": 1.0735, + "grad_norm": 10.148516654968262, + "learning_rate": 5.707919999999999e-07, + "loss": 0.4085, + "step": 107350 + }, + { + "epoch": 1.074, + "grad_norm": 2.2347519397735596, + "learning_rate": 5.70592e-07, + "loss": 0.5751, + "step": 107400 + }, + { + "epoch": 1.0745, + "grad_norm": 59.01057434082031, + "learning_rate": 5.70392e-07, + "loss": 0.4502, + "step": 107450 + }, + { + "epoch": 1.075, + "grad_norm": 41.65180206298828, + "learning_rate": 5.701920000000001e-07, + "loss": 0.4107, + "step": 107500 + }, + { + "epoch": 1.0755, + "grad_norm": 63.487892150878906, + "learning_rate": 5.69992e-07, + "loss": 0.4103, + "step": 107550 + }, + { + "epoch": 1.076, + "grad_norm": 44.388309478759766, + "learning_rate": 5.697919999999999e-07, + "loss": 0.5648, + "step": 107600 + }, + { + "epoch": 1.0765, + "grad_norm": 15.944334030151367, + "learning_rate": 5.69592e-07, + "loss": 0.434, + "step": 107650 + }, + { + "epoch": 1.077, + "grad_norm": 69.580322265625, + "learning_rate": 5.69392e-07, + "loss": 0.4692, + "step": 107700 + }, + { + "epoch": 1.0775, + "grad_norm": 7.532376289367676, + "learning_rate": 5.69196e-07, + "loss": 0.486, + "step": 107750 + }, + { + "epoch": 1.078, + "grad_norm": 117.07515716552734, + "learning_rate": 5.68996e-07, + "loss": 0.5425, + "step": 107800 + }, + { + "epoch": 1.0785, + "grad_norm": 74.83606719970703, + "learning_rate": 5.68796e-07, + "loss": 0.4081, + "step": 107850 + }, + { + "epoch": 1.079, + "grad_norm": 66.38423919677734, + "learning_rate": 5.685959999999999e-07, + "loss": 0.4735, + "step": 107900 + }, + { + "epoch": 1.0795, + "grad_norm": 18.5911808013916, + "learning_rate": 5.68396e-07, + "loss": 0.5111, + "step": 107950 + }, + { + "epoch": 1.08, + "grad_norm": 41.88191223144531, + "learning_rate": 5.68196e-07, + "loss": 0.4556, + "step": 108000 + }, + { + "epoch": 1.0805, + "grad_norm": 81.14200592041016, + "learning_rate": 5.679959999999999e-07, + "loss": 0.3716, + "step": 108050 + }, + { + "epoch": 1.081, + "grad_norm": 35.856998443603516, + "learning_rate": 5.67796e-07, + "loss": 0.4, + "step": 108100 + }, + { + "epoch": 1.0815, + "grad_norm": 43.836421966552734, + "learning_rate": 5.67596e-07, + "loss": 0.4874, + "step": 108150 + }, + { + "epoch": 1.082, + "grad_norm": 84.13600158691406, + "learning_rate": 5.67396e-07, + "loss": 0.4165, + "step": 108200 + }, + { + "epoch": 1.0825, + "grad_norm": 22.291542053222656, + "learning_rate": 5.67196e-07, + "loss": 0.5321, + "step": 108250 + }, + { + "epoch": 1.083, + "grad_norm": 94.12808227539062, + "learning_rate": 5.66996e-07, + "loss": 0.624, + "step": 108300 + }, + { + "epoch": 1.0835, + "grad_norm": 53.173072814941406, + "learning_rate": 5.667959999999999e-07, + "loss": 0.4123, + "step": 108350 + }, + { + "epoch": 1.084, + "grad_norm": 78.43643188476562, + "learning_rate": 5.66596e-07, + "loss": 0.5123, + "step": 108400 + }, + { + "epoch": 1.0845, + "grad_norm": 39.595787048339844, + "learning_rate": 5.66396e-07, + "loss": 0.4563, + "step": 108450 + }, + { + "epoch": 1.085, + "grad_norm": 14.890007972717285, + "learning_rate": 5.66196e-07, + "loss": 0.4253, + "step": 108500 + }, + { + "epoch": 1.0855, + "grad_norm": 91.4941635131836, + "learning_rate": 5.659960000000001e-07, + "loss": 0.3835, + "step": 108550 + }, + { + "epoch": 1.086, + "grad_norm": 74.25895690917969, + "learning_rate": 5.657959999999999e-07, + "loss": 0.4304, + "step": 108600 + }, + { + "epoch": 1.0865, + "grad_norm": 48.94237518310547, + "learning_rate": 5.655959999999999e-07, + "loss": 0.3277, + "step": 108650 + }, + { + "epoch": 1.087, + "grad_norm": 56.9554328918457, + "learning_rate": 5.65396e-07, + "loss": 0.5296, + "step": 108700 + }, + { + "epoch": 1.0875, + "grad_norm": 47.54026412963867, + "learning_rate": 5.65196e-07, + "loss": 0.3747, + "step": 108750 + }, + { + "epoch": 1.088, + "grad_norm": 67.42796325683594, + "learning_rate": 5.64996e-07, + "loss": 0.4032, + "step": 108800 + }, + { + "epoch": 1.0885, + "grad_norm": 39.55683898925781, + "learning_rate": 5.64796e-07, + "loss": 0.3196, + "step": 108850 + }, + { + "epoch": 1.089, + "grad_norm": 96.23863220214844, + "learning_rate": 5.64596e-07, + "loss": 0.4986, + "step": 108900 + }, + { + "epoch": 1.0895, + "grad_norm": 6.769164085388184, + "learning_rate": 5.643959999999999e-07, + "loss": 0.4685, + "step": 108950 + }, + { + "epoch": 1.09, + "grad_norm": 23.501625061035156, + "learning_rate": 5.64196e-07, + "loss": 0.6104, + "step": 109000 + }, + { + "epoch": 1.0905, + "grad_norm": 9.114831924438477, + "learning_rate": 5.63996e-07, + "loss": 0.5997, + "step": 109050 + }, + { + "epoch": 1.091, + "grad_norm": 12.518210411071777, + "learning_rate": 5.637959999999999e-07, + "loss": 0.3271, + "step": 109100 + }, + { + "epoch": 1.0915, + "grad_norm": 99.00508117675781, + "learning_rate": 5.63596e-07, + "loss": 0.5053, + "step": 109150 + }, + { + "epoch": 1.092, + "grad_norm": 10.864371299743652, + "learning_rate": 5.63396e-07, + "loss": 0.5697, + "step": 109200 + }, + { + "epoch": 1.0925, + "grad_norm": 43.38995361328125, + "learning_rate": 5.63196e-07, + "loss": 0.369, + "step": 109250 + }, + { + "epoch": 1.093, + "grad_norm": 1.3382948637008667, + "learning_rate": 5.62996e-07, + "loss": 0.5112, + "step": 109300 + }, + { + "epoch": 1.0935, + "grad_norm": 78.29591369628906, + "learning_rate": 5.627959999999999e-07, + "loss": 0.4358, + "step": 109350 + }, + { + "epoch": 1.094, + "grad_norm": 93.0485610961914, + "learning_rate": 5.625959999999999e-07, + "loss": 0.38, + "step": 109400 + }, + { + "epoch": 1.0945, + "grad_norm": 20.810575485229492, + "learning_rate": 5.62396e-07, + "loss": 0.5316, + "step": 109450 + }, + { + "epoch": 1.095, + "grad_norm": 0.7341397404670715, + "learning_rate": 5.62196e-07, + "loss": 0.3763, + "step": 109500 + }, + { + "epoch": 1.0955, + "grad_norm": 18.946903228759766, + "learning_rate": 5.619960000000001e-07, + "loss": 0.562, + "step": 109550 + }, + { + "epoch": 1.096, + "grad_norm": 95.75009155273438, + "learning_rate": 5.61796e-07, + "loss": 0.4775, + "step": 109600 + }, + { + "epoch": 1.0965, + "grad_norm": 72.53479766845703, + "learning_rate": 5.615959999999999e-07, + "loss": 0.4117, + "step": 109650 + }, + { + "epoch": 1.097, + "grad_norm": 55.721282958984375, + "learning_rate": 5.61396e-07, + "loss": 0.4769, + "step": 109700 + }, + { + "epoch": 1.0975, + "grad_norm": 29.421995162963867, + "learning_rate": 5.61196e-07, + "loss": 0.4423, + "step": 109750 + }, + { + "epoch": 1.098, + "grad_norm": 62.25779342651367, + "learning_rate": 5.60996e-07, + "loss": 0.4081, + "step": 109800 + }, + { + "epoch": 1.0985, + "grad_norm": 80.15348052978516, + "learning_rate": 5.60796e-07, + "loss": 0.3587, + "step": 109850 + }, + { + "epoch": 1.099, + "grad_norm": 0.7332010865211487, + "learning_rate": 5.60596e-07, + "loss": 0.3441, + "step": 109900 + }, + { + "epoch": 1.0995, + "grad_norm": 62.10441207885742, + "learning_rate": 5.60396e-07, + "loss": 0.5722, + "step": 109950 + }, + { + "epoch": 1.1, + "grad_norm": 4.652379035949707, + "learning_rate": 5.60196e-07, + "loss": 0.3688, + "step": 110000 + }, + { + "epoch": 1.1005, + "grad_norm": 56.87358093261719, + "learning_rate": 5.59996e-07, + "loss": 0.4241, + "step": 110050 + }, + { + "epoch": 1.101, + "grad_norm": 93.23614501953125, + "learning_rate": 5.597959999999999e-07, + "loss": 0.5574, + "step": 110100 + }, + { + "epoch": 1.1015, + "grad_norm": 34.96125793457031, + "learning_rate": 5.59596e-07, + "loss": 0.5044, + "step": 110150 + }, + { + "epoch": 1.102, + "grad_norm": 79.12371063232422, + "learning_rate": 5.59396e-07, + "loss": 0.4684, + "step": 110200 + }, + { + "epoch": 1.1025, + "grad_norm": 130.24989318847656, + "learning_rate": 5.59196e-07, + "loss": 0.4631, + "step": 110250 + }, + { + "epoch": 1.103, + "grad_norm": 121.72173309326172, + "learning_rate": 5.589960000000001e-07, + "loss": 0.4232, + "step": 110300 + }, + { + "epoch": 1.1035, + "grad_norm": 66.10565948486328, + "learning_rate": 5.587959999999999e-07, + "loss": 0.4178, + "step": 110350 + }, + { + "epoch": 1.104, + "grad_norm": 70.6107406616211, + "learning_rate": 5.585959999999999e-07, + "loss": 0.5394, + "step": 110400 + }, + { + "epoch": 1.1045, + "grad_norm": 28.195026397705078, + "learning_rate": 5.58396e-07, + "loss": 0.5433, + "step": 110450 + }, + { + "epoch": 1.105, + "grad_norm": 73.64238739013672, + "learning_rate": 5.58196e-07, + "loss": 0.418, + "step": 110500 + }, + { + "epoch": 1.1055, + "grad_norm": 90.38440704345703, + "learning_rate": 5.57996e-07, + "loss": 0.6524, + "step": 110550 + }, + { + "epoch": 1.106, + "grad_norm": 7.275987148284912, + "learning_rate": 5.57796e-07, + "loss": 0.4582, + "step": 110600 + }, + { + "epoch": 1.1065, + "grad_norm": 15.238624572753906, + "learning_rate": 5.576e-07, + "loss": 0.476, + "step": 110650 + }, + { + "epoch": 1.107, + "grad_norm": 70.75611877441406, + "learning_rate": 5.574e-07, + "loss": 0.4448, + "step": 110700 + }, + { + "epoch": 1.1075, + "grad_norm": 87.86063385009766, + "learning_rate": 5.572e-07, + "loss": 0.5077, + "step": 110750 + }, + { + "epoch": 1.108, + "grad_norm": 0.4845068156719208, + "learning_rate": 5.57e-07, + "loss": 0.4965, + "step": 110800 + }, + { + "epoch": 1.1085, + "grad_norm": 2.1131792068481445, + "learning_rate": 5.567999999999999e-07, + "loss": 0.5516, + "step": 110850 + }, + { + "epoch": 1.109, + "grad_norm": 16.047956466674805, + "learning_rate": 5.566e-07, + "loss": 0.4468, + "step": 110900 + }, + { + "epoch": 1.1095, + "grad_norm": 17.67030143737793, + "learning_rate": 5.564e-07, + "loss": 0.5287, + "step": 110950 + }, + { + "epoch": 1.11, + "grad_norm": 82.67327117919922, + "learning_rate": 5.562e-07, + "loss": 0.3612, + "step": 111000 + }, + { + "epoch": 1.1105, + "grad_norm": 1.270891785621643, + "learning_rate": 5.560000000000001e-07, + "loss": 0.396, + "step": 111050 + }, + { + "epoch": 1.111, + "grad_norm": 75.45249938964844, + "learning_rate": 5.557999999999999e-07, + "loss": 0.3984, + "step": 111100 + }, + { + "epoch": 1.1115, + "grad_norm": 19.672801971435547, + "learning_rate": 5.555999999999999e-07, + "loss": 0.5507, + "step": 111150 + }, + { + "epoch": 1.112, + "grad_norm": 6.31002140045166, + "learning_rate": 5.554e-07, + "loss": 0.3859, + "step": 111200 + }, + { + "epoch": 1.1125, + "grad_norm": 85.23663330078125, + "learning_rate": 5.552e-07, + "loss": 0.4728, + "step": 111250 + }, + { + "epoch": 1.113, + "grad_norm": 88.14482879638672, + "learning_rate": 5.55e-07, + "loss": 0.5598, + "step": 111300 + }, + { + "epoch": 1.1135, + "grad_norm": 94.62075805664062, + "learning_rate": 5.548e-07, + "loss": 0.4012, + "step": 111350 + }, + { + "epoch": 1.114, + "grad_norm": 51.954891204833984, + "learning_rate": 5.546e-07, + "loss": 0.3856, + "step": 111400 + }, + { + "epoch": 1.1145, + "grad_norm": 75.34689331054688, + "learning_rate": 5.543999999999999e-07, + "loss": 0.3746, + "step": 111450 + }, + { + "epoch": 1.115, + "grad_norm": 72.60891723632812, + "learning_rate": 5.542e-07, + "loss": 0.5083, + "step": 111500 + }, + { + "epoch": 1.1155, + "grad_norm": 39.89189147949219, + "learning_rate": 5.54e-07, + "loss": 0.4479, + "step": 111550 + }, + { + "epoch": 1.116, + "grad_norm": 4.4385833740234375, + "learning_rate": 5.537999999999999e-07, + "loss": 0.3594, + "step": 111600 + }, + { + "epoch": 1.1165, + "grad_norm": 0.6604699492454529, + "learning_rate": 5.536e-07, + "loss": 0.3674, + "step": 111650 + }, + { + "epoch": 1.117, + "grad_norm": 49.04147720336914, + "learning_rate": 5.534e-07, + "loss": 0.4568, + "step": 111700 + }, + { + "epoch": 1.1175, + "grad_norm": 42.06256866455078, + "learning_rate": 5.532e-07, + "loss": 0.5162, + "step": 111750 + }, + { + "epoch": 1.1179999999999999, + "grad_norm": 102.51911163330078, + "learning_rate": 5.53e-07, + "loss": 0.4049, + "step": 111800 + }, + { + "epoch": 1.1185, + "grad_norm": 44.5997428894043, + "learning_rate": 5.527999999999999e-07, + "loss": 0.4188, + "step": 111850 + }, + { + "epoch": 1.119, + "grad_norm": 25.700366973876953, + "learning_rate": 5.525999999999999e-07, + "loss": 0.3842, + "step": 111900 + }, + { + "epoch": 1.1195, + "grad_norm": 16.742462158203125, + "learning_rate": 5.524e-07, + "loss": 0.5652, + "step": 111950 + }, + { + "epoch": 1.12, + "grad_norm": 52.463157653808594, + "learning_rate": 5.522e-07, + "loss": 0.4582, + "step": 112000 + }, + { + "epoch": 1.1205, + "grad_norm": 37.043521881103516, + "learning_rate": 5.520000000000001e-07, + "loss": 0.4157, + "step": 112050 + }, + { + "epoch": 1.121, + "grad_norm": 1.2325105667114258, + "learning_rate": 5.518e-07, + "loss": 0.5246, + "step": 112100 + }, + { + "epoch": 1.1215, + "grad_norm": 21.83551597595215, + "learning_rate": 5.515999999999999e-07, + "loss": 0.4453, + "step": 112150 + }, + { + "epoch": 1.1219999999999999, + "grad_norm": 3.408167839050293, + "learning_rate": 5.514e-07, + "loss": 0.4424, + "step": 112200 + }, + { + "epoch": 1.1225, + "grad_norm": 86.52948760986328, + "learning_rate": 5.512e-07, + "loss": 0.516, + "step": 112250 + }, + { + "epoch": 1.123, + "grad_norm": 2.4071269035339355, + "learning_rate": 5.51e-07, + "loss": 0.304, + "step": 112300 + }, + { + "epoch": 1.1235, + "grad_norm": 74.5964126586914, + "learning_rate": 5.508e-07, + "loss": 0.2992, + "step": 112350 + }, + { + "epoch": 1.124, + "grad_norm": 52.07612991333008, + "learning_rate": 5.506e-07, + "loss": 0.4152, + "step": 112400 + }, + { + "epoch": 1.1245, + "grad_norm": 66.46135711669922, + "learning_rate": 5.504e-07, + "loss": 0.4892, + "step": 112450 + }, + { + "epoch": 1.125, + "grad_norm": 19.547279357910156, + "learning_rate": 5.502e-07, + "loss": 0.3387, + "step": 112500 + }, + { + "epoch": 1.1255, + "grad_norm": 44.79277420043945, + "learning_rate": 5.5e-07, + "loss": 0.459, + "step": 112550 + }, + { + "epoch": 1.126, + "grad_norm": 112.7216567993164, + "learning_rate": 5.497999999999999e-07, + "loss": 0.3898, + "step": 112600 + }, + { + "epoch": 1.1265, + "grad_norm": 14.867293357849121, + "learning_rate": 5.496e-07, + "loss": 0.5735, + "step": 112650 + }, + { + "epoch": 1.127, + "grad_norm": 98.45852661132812, + "learning_rate": 5.494e-07, + "loss": 0.5208, + "step": 112700 + }, + { + "epoch": 1.1275, + "grad_norm": 27.679582595825195, + "learning_rate": 5.492e-07, + "loss": 0.429, + "step": 112750 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.8725939989089966, + "learning_rate": 5.490000000000001e-07, + "loss": 0.4026, + "step": 112800 + }, + { + "epoch": 1.1285, + "grad_norm": 9.201322555541992, + "learning_rate": 5.487999999999999e-07, + "loss": 0.5991, + "step": 112850 + }, + { + "epoch": 1.129, + "grad_norm": 42.13934326171875, + "learning_rate": 5.485999999999999e-07, + "loss": 0.4351, + "step": 112900 + }, + { + "epoch": 1.1295, + "grad_norm": 22.487741470336914, + "learning_rate": 5.484e-07, + "loss": 0.3695, + "step": 112950 + }, + { + "epoch": 1.13, + "grad_norm": 78.89250183105469, + "learning_rate": 5.482e-07, + "loss": 0.6346, + "step": 113000 + }, + { + "epoch": 1.1305, + "grad_norm": 123.7279052734375, + "learning_rate": 5.48e-07, + "loss": 0.5355, + "step": 113050 + }, + { + "epoch": 1.131, + "grad_norm": 19.976831436157227, + "learning_rate": 5.478e-07, + "loss": 0.3888, + "step": 113100 + }, + { + "epoch": 1.1315, + "grad_norm": 36.473289489746094, + "learning_rate": 5.476e-07, + "loss": 0.6061, + "step": 113150 + }, + { + "epoch": 1.1320000000000001, + "grad_norm": 33.37117004394531, + "learning_rate": 5.473999999999999e-07, + "loss": 0.3772, + "step": 113200 + }, + { + "epoch": 1.1325, + "grad_norm": 45.76275634765625, + "learning_rate": 5.472e-07, + "loss": 0.4798, + "step": 113250 + }, + { + "epoch": 1.133, + "grad_norm": 67.44389343261719, + "learning_rate": 5.47e-07, + "loss": 0.5103, + "step": 113300 + }, + { + "epoch": 1.1335, + "grad_norm": 77.15213775634766, + "learning_rate": 5.467999999999999e-07, + "loss": 0.4921, + "step": 113350 + }, + { + "epoch": 1.134, + "grad_norm": 109.74034881591797, + "learning_rate": 5.466e-07, + "loss": 0.5541, + "step": 113400 + }, + { + "epoch": 1.1345, + "grad_norm": 49.160316467285156, + "learning_rate": 5.464e-07, + "loss": 0.4071, + "step": 113450 + }, + { + "epoch": 1.135, + "grad_norm": 0.6659963130950928, + "learning_rate": 5.462e-07, + "loss": 0.4504, + "step": 113500 + }, + { + "epoch": 1.1355, + "grad_norm": 129.61778259277344, + "learning_rate": 5.46e-07, + "loss": 0.4083, + "step": 113550 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 51.32670211791992, + "learning_rate": 5.457999999999999e-07, + "loss": 0.4323, + "step": 113600 + }, + { + "epoch": 1.1365, + "grad_norm": 94.61491394042969, + "learning_rate": 5.455999999999999e-07, + "loss": 0.4553, + "step": 113650 + }, + { + "epoch": 1.137, + "grad_norm": 94.95795440673828, + "learning_rate": 5.454e-07, + "loss": 0.3422, + "step": 113700 + }, + { + "epoch": 1.1375, + "grad_norm": 45.39287185668945, + "learning_rate": 5.452e-07, + "loss": 0.4915, + "step": 113750 + }, + { + "epoch": 1.138, + "grad_norm": 29.25931739807129, + "learning_rate": 5.45e-07, + "loss": 0.3838, + "step": 113800 + }, + { + "epoch": 1.1385, + "grad_norm": 60.96989822387695, + "learning_rate": 5.448e-07, + "loss": 0.5589, + "step": 113850 + }, + { + "epoch": 1.139, + "grad_norm": 45.03044128417969, + "learning_rate": 5.445999999999999e-07, + "loss": 0.4151, + "step": 113900 + }, + { + "epoch": 1.1395, + "grad_norm": 6.3732404708862305, + "learning_rate": 5.443999999999999e-07, + "loss": 0.3358, + "step": 113950 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 18.30240249633789, + "learning_rate": 5.442e-07, + "loss": 0.5567, + "step": 114000 + }, + { + "epoch": 1.1405, + "grad_norm": 47.34935760498047, + "learning_rate": 5.44e-07, + "loss": 0.4934, + "step": 114050 + }, + { + "epoch": 1.141, + "grad_norm": 12.485174179077148, + "learning_rate": 5.437999999999999e-07, + "loss": 0.4781, + "step": 114100 + }, + { + "epoch": 1.1415, + "grad_norm": 6.659138202667236, + "learning_rate": 5.436e-07, + "loss": 0.4272, + "step": 114150 + }, + { + "epoch": 1.142, + "grad_norm": 17.311927795410156, + "learning_rate": 5.434e-07, + "loss": 0.3993, + "step": 114200 + }, + { + "epoch": 1.1425, + "grad_norm": 70.42869567871094, + "learning_rate": 5.431999999999999e-07, + "loss": 0.4323, + "step": 114250 + }, + { + "epoch": 1.143, + "grad_norm": 44.21244812011719, + "learning_rate": 5.43e-07, + "loss": 0.4421, + "step": 114300 + }, + { + "epoch": 1.1435, + "grad_norm": 66.3972396850586, + "learning_rate": 5.427999999999999e-07, + "loss": 0.4448, + "step": 114350 + }, + { + "epoch": 1.144, + "grad_norm": 31.286418914794922, + "learning_rate": 5.425999999999999e-07, + "loss": 0.5305, + "step": 114400 + }, + { + "epoch": 1.1445, + "grad_norm": 79.53046417236328, + "learning_rate": 5.424e-07, + "loss": 0.5406, + "step": 114450 + }, + { + "epoch": 1.145, + "grad_norm": 1.268337607383728, + "learning_rate": 5.422e-07, + "loss": 0.3917, + "step": 114500 + }, + { + "epoch": 1.1455, + "grad_norm": 27.05938148498535, + "learning_rate": 5.420000000000001e-07, + "loss": 0.5285, + "step": 114550 + }, + { + "epoch": 1.146, + "grad_norm": 24.848316192626953, + "learning_rate": 5.417999999999999e-07, + "loss": 0.5525, + "step": 114600 + }, + { + "epoch": 1.1465, + "grad_norm": 21.377756118774414, + "learning_rate": 5.415999999999999e-07, + "loss": 0.5307, + "step": 114650 + }, + { + "epoch": 1.147, + "grad_norm": 3.2962486743927, + "learning_rate": 5.414e-07, + "loss": 0.4409, + "step": 114700 + }, + { + "epoch": 1.1475, + "grad_norm": 77.59668731689453, + "learning_rate": 5.41204e-07, + "loss": 0.5064, + "step": 114750 + }, + { + "epoch": 1.148, + "grad_norm": 82.37496185302734, + "learning_rate": 5.41004e-07, + "loss": 0.3953, + "step": 114800 + }, + { + "epoch": 1.1485, + "grad_norm": 85.80392456054688, + "learning_rate": 5.40804e-07, + "loss": 0.5122, + "step": 114850 + }, + { + "epoch": 1.149, + "grad_norm": 81.59890747070312, + "learning_rate": 5.40604e-07, + "loss": 0.5849, + "step": 114900 + }, + { + "epoch": 1.1495, + "grad_norm": 58.08201217651367, + "learning_rate": 5.404039999999999e-07, + "loss": 0.4393, + "step": 114950 + }, + { + "epoch": 1.15, + "grad_norm": 78.7620849609375, + "learning_rate": 5.40204e-07, + "loss": 0.4161, + "step": 115000 + }, + { + "epoch": 1.1505, + "grad_norm": 84.27385711669922, + "learning_rate": 5.40004e-07, + "loss": 0.3664, + "step": 115050 + }, + { + "epoch": 1.151, + "grad_norm": 62.63862609863281, + "learning_rate": 5.398039999999999e-07, + "loss": 0.4925, + "step": 115100 + }, + { + "epoch": 1.1515, + "grad_norm": 67.16043090820312, + "learning_rate": 5.39604e-07, + "loss": 0.5412, + "step": 115150 + }, + { + "epoch": 1.152, + "grad_norm": 53.16383361816406, + "learning_rate": 5.39404e-07, + "loss": 0.404, + "step": 115200 + }, + { + "epoch": 1.1525, + "grad_norm": 39.47855758666992, + "learning_rate": 5.39204e-07, + "loss": 0.4474, + "step": 115250 + }, + { + "epoch": 1.153, + "grad_norm": 1.1669477224349976, + "learning_rate": 5.390040000000001e-07, + "loss": 0.4366, + "step": 115300 + }, + { + "epoch": 1.1535, + "grad_norm": 67.96946716308594, + "learning_rate": 5.388039999999999e-07, + "loss": 0.478, + "step": 115350 + }, + { + "epoch": 1.154, + "grad_norm": 26.770727157592773, + "learning_rate": 5.386039999999999e-07, + "loss": 0.3817, + "step": 115400 + }, + { + "epoch": 1.1545, + "grad_norm": 57.50968933105469, + "learning_rate": 5.38404e-07, + "loss": 0.3984, + "step": 115450 + }, + { + "epoch": 1.155, + "grad_norm": 33.6568603515625, + "learning_rate": 5.38204e-07, + "loss": 0.4302, + "step": 115500 + }, + { + "epoch": 1.1555, + "grad_norm": 1.8125121593475342, + "learning_rate": 5.38004e-07, + "loss": 0.4788, + "step": 115550 + }, + { + "epoch": 1.156, + "grad_norm": 57.99422073364258, + "learning_rate": 5.37804e-07, + "loss": 0.2844, + "step": 115600 + }, + { + "epoch": 1.1565, + "grad_norm": 63.1536865234375, + "learning_rate": 5.37608e-07, + "loss": 0.3902, + "step": 115650 + }, + { + "epoch": 1.157, + "grad_norm": 15.688610076904297, + "learning_rate": 5.374079999999999e-07, + "loss": 0.3493, + "step": 115700 + }, + { + "epoch": 1.1575, + "grad_norm": 77.63614654541016, + "learning_rate": 5.37208e-07, + "loss": 0.5133, + "step": 115750 + }, + { + "epoch": 1.158, + "grad_norm": 36.73105239868164, + "learning_rate": 5.37008e-07, + "loss": 0.4251, + "step": 115800 + }, + { + "epoch": 1.1585, + "grad_norm": 42.460872650146484, + "learning_rate": 5.368079999999999e-07, + "loss": 0.5139, + "step": 115850 + }, + { + "epoch": 1.159, + "grad_norm": 73.16018676757812, + "learning_rate": 5.36608e-07, + "loss": 0.3794, + "step": 115900 + }, + { + "epoch": 1.1595, + "grad_norm": 42.55072021484375, + "learning_rate": 5.36408e-07, + "loss": 0.3831, + "step": 115950 + }, + { + "epoch": 1.16, + "grad_norm": 32.332515716552734, + "learning_rate": 5.36208e-07, + "loss": 0.3343, + "step": 116000 + }, + { + "epoch": 1.1605, + "grad_norm": 47.2687873840332, + "learning_rate": 5.36008e-07, + "loss": 0.5091, + "step": 116050 + }, + { + "epoch": 1.161, + "grad_norm": 95.39241790771484, + "learning_rate": 5.358079999999999e-07, + "loss": 0.4826, + "step": 116100 + }, + { + "epoch": 1.1615, + "grad_norm": 83.02808380126953, + "learning_rate": 5.356079999999999e-07, + "loss": 0.4376, + "step": 116150 + }, + { + "epoch": 1.162, + "grad_norm": 59.88267517089844, + "learning_rate": 5.35408e-07, + "loss": 0.4086, + "step": 116200 + }, + { + "epoch": 1.1625, + "grad_norm": 62.45161056518555, + "learning_rate": 5.35208e-07, + "loss": 0.4202, + "step": 116250 + }, + { + "epoch": 1.163, + "grad_norm": 117.84101104736328, + "learning_rate": 5.35008e-07, + "loss": 0.464, + "step": 116300 + }, + { + "epoch": 1.1635, + "grad_norm": 65.64312744140625, + "learning_rate": 5.34808e-07, + "loss": 0.5111, + "step": 116350 + }, + { + "epoch": 1.164, + "grad_norm": 45.327972412109375, + "learning_rate": 5.346079999999999e-07, + "loss": 0.4727, + "step": 116400 + }, + { + "epoch": 1.1645, + "grad_norm": 61.47356414794922, + "learning_rate": 5.344079999999999e-07, + "loss": 0.3288, + "step": 116450 + }, + { + "epoch": 1.165, + "grad_norm": 34.35345458984375, + "learning_rate": 5.34208e-07, + "loss": 0.365, + "step": 116500 + }, + { + "epoch": 1.1655, + "grad_norm": 5.033834934234619, + "learning_rate": 5.34008e-07, + "loss": 0.4617, + "step": 116550 + }, + { + "epoch": 1.166, + "grad_norm": 0.281574010848999, + "learning_rate": 5.338079999999999e-07, + "loss": 0.4622, + "step": 116600 + }, + { + "epoch": 1.1665, + "grad_norm": 51.69894027709961, + "learning_rate": 5.33608e-07, + "loss": 0.4423, + "step": 116650 + }, + { + "epoch": 1.167, + "grad_norm": 36.404632568359375, + "learning_rate": 5.33408e-07, + "loss": 0.5443, + "step": 116700 + }, + { + "epoch": 1.1675, + "grad_norm": 78.08043670654297, + "learning_rate": 5.332079999999999e-07, + "loss": 0.4774, + "step": 116750 + }, + { + "epoch": 1.168, + "grad_norm": 30.812881469726562, + "learning_rate": 5.33008e-07, + "loss": 0.5483, + "step": 116800 + }, + { + "epoch": 1.1685, + "grad_norm": 0.9321984648704529, + "learning_rate": 5.328079999999999e-07, + "loss": 0.3477, + "step": 116850 + }, + { + "epoch": 1.169, + "grad_norm": 72.7139892578125, + "learning_rate": 5.326079999999999e-07, + "loss": 0.5847, + "step": 116900 + }, + { + "epoch": 1.1695, + "grad_norm": 50.76666259765625, + "learning_rate": 5.32408e-07, + "loss": 0.3907, + "step": 116950 + }, + { + "epoch": 1.17, + "grad_norm": 96.60762786865234, + "learning_rate": 5.32208e-07, + "loss": 0.672, + "step": 117000 + }, + { + "epoch": 1.1705, + "grad_norm": 65.07868957519531, + "learning_rate": 5.320080000000001e-07, + "loss": 0.4654, + "step": 117050 + }, + { + "epoch": 1.171, + "grad_norm": 41.71099090576172, + "learning_rate": 5.318079999999999e-07, + "loss": 0.3896, + "step": 117100 + }, + { + "epoch": 1.1715, + "grad_norm": 30.449907302856445, + "learning_rate": 5.316079999999999e-07, + "loss": 0.5155, + "step": 117150 + }, + { + "epoch": 1.172, + "grad_norm": 103.2293701171875, + "learning_rate": 5.31408e-07, + "loss": 0.3301, + "step": 117200 + }, + { + "epoch": 1.1724999999999999, + "grad_norm": 54.75368881225586, + "learning_rate": 5.31208e-07, + "loss": 0.4718, + "step": 117250 + }, + { + "epoch": 1.173, + "grad_norm": 75.55032348632812, + "learning_rate": 5.31008e-07, + "loss": 0.4543, + "step": 117300 + }, + { + "epoch": 1.1735, + "grad_norm": 54.67997741699219, + "learning_rate": 5.30808e-07, + "loss": 0.4419, + "step": 117350 + }, + { + "epoch": 1.174, + "grad_norm": 91.2918472290039, + "learning_rate": 5.30608e-07, + "loss": 0.4074, + "step": 117400 + }, + { + "epoch": 1.1745, + "grad_norm": 73.0489501953125, + "learning_rate": 5.304079999999999e-07, + "loss": 0.4729, + "step": 117450 + }, + { + "epoch": 1.175, + "grad_norm": 3.4424524307250977, + "learning_rate": 5.30208e-07, + "loss": 0.4709, + "step": 117500 + }, + { + "epoch": 1.1755, + "grad_norm": 64.21658325195312, + "learning_rate": 5.30008e-07, + "loss": 0.4925, + "step": 117550 + }, + { + "epoch": 1.176, + "grad_norm": 24.437042236328125, + "learning_rate": 5.298079999999999e-07, + "loss": 0.4695, + "step": 117600 + }, + { + "epoch": 1.1764999999999999, + "grad_norm": 85.16154479980469, + "learning_rate": 5.29608e-07, + "loss": 0.4352, + "step": 117650 + }, + { + "epoch": 1.177, + "grad_norm": 34.37142562866211, + "learning_rate": 5.29408e-07, + "loss": 0.4595, + "step": 117700 + }, + { + "epoch": 1.1775, + "grad_norm": 103.63298034667969, + "learning_rate": 5.29208e-07, + "loss": 0.4311, + "step": 117750 + }, + { + "epoch": 1.178, + "grad_norm": 72.38675689697266, + "learning_rate": 5.29008e-07, + "loss": 0.4235, + "step": 117800 + }, + { + "epoch": 1.1785, + "grad_norm": 4.414365291595459, + "learning_rate": 5.288079999999999e-07, + "loss": 0.462, + "step": 117850 + }, + { + "epoch": 1.179, + "grad_norm": 74.01151275634766, + "learning_rate": 5.286079999999999e-07, + "loss": 0.5448, + "step": 117900 + }, + { + "epoch": 1.1795, + "grad_norm": 88.3164291381836, + "learning_rate": 5.28408e-07, + "loss": 0.2708, + "step": 117950 + }, + { + "epoch": 1.18, + "grad_norm": 75.72642517089844, + "learning_rate": 5.28208e-07, + "loss": 0.3894, + "step": 118000 + }, + { + "epoch": 1.1804999999999999, + "grad_norm": 63.216453552246094, + "learning_rate": 5.28008e-07, + "loss": 0.4749, + "step": 118050 + }, + { + "epoch": 1.181, + "grad_norm": 39.14889144897461, + "learning_rate": 5.278080000000001e-07, + "loss": 0.3814, + "step": 118100 + }, + { + "epoch": 1.1815, + "grad_norm": 56.19570541381836, + "learning_rate": 5.276079999999999e-07, + "loss": 0.349, + "step": 118150 + }, + { + "epoch": 1.182, + "grad_norm": 99.52997589111328, + "learning_rate": 5.274079999999999e-07, + "loss": 0.6551, + "step": 118200 + }, + { + "epoch": 1.1825, + "grad_norm": 27.029600143432617, + "learning_rate": 5.27208e-07, + "loss": 0.5198, + "step": 118250 + }, + { + "epoch": 1.183, + "grad_norm": 33.4561882019043, + "learning_rate": 5.27008e-07, + "loss": 0.4874, + "step": 118300 + }, + { + "epoch": 1.1835, + "grad_norm": 24.337610244750977, + "learning_rate": 5.26808e-07, + "loss": 0.5505, + "step": 118350 + }, + { + "epoch": 1.184, + "grad_norm": 39.1830940246582, + "learning_rate": 5.26608e-07, + "loss": 0.4899, + "step": 118400 + }, + { + "epoch": 1.1844999999999999, + "grad_norm": 32.634098052978516, + "learning_rate": 5.26408e-07, + "loss": 0.354, + "step": 118450 + }, + { + "epoch": 1.185, + "grad_norm": 102.42017364501953, + "learning_rate": 5.262079999999999e-07, + "loss": 0.3999, + "step": 118500 + }, + { + "epoch": 1.1855, + "grad_norm": 1.6400700807571411, + "learning_rate": 5.26008e-07, + "loss": 0.4722, + "step": 118550 + }, + { + "epoch": 1.186, + "grad_norm": 56.43583297729492, + "learning_rate": 5.25808e-07, + "loss": 0.5107, + "step": 118600 + }, + { + "epoch": 1.1865, + "grad_norm": 4.176393508911133, + "learning_rate": 5.256079999999999e-07, + "loss": 0.5332, + "step": 118650 + }, + { + "epoch": 1.187, + "grad_norm": 74.65667724609375, + "learning_rate": 5.25408e-07, + "loss": 0.347, + "step": 118700 + }, + { + "epoch": 1.1875, + "grad_norm": 61.597930908203125, + "learning_rate": 5.25208e-07, + "loss": 0.5037, + "step": 118750 + }, + { + "epoch": 1.188, + "grad_norm": 20.81285285949707, + "learning_rate": 5.25008e-07, + "loss": 0.4662, + "step": 118800 + }, + { + "epoch": 1.1885, + "grad_norm": 16.560958862304688, + "learning_rate": 5.24808e-07, + "loss": 0.4709, + "step": 118850 + }, + { + "epoch": 1.189, + "grad_norm": 85.1524429321289, + "learning_rate": 5.246079999999999e-07, + "loss": 0.3761, + "step": 118900 + }, + { + "epoch": 1.1895, + "grad_norm": 38.864070892333984, + "learning_rate": 5.244079999999999e-07, + "loss": 0.3661, + "step": 118950 + }, + { + "epoch": 1.19, + "grad_norm": 25.94007682800293, + "learning_rate": 5.24208e-07, + "loss": 0.3894, + "step": 119000 + }, + { + "epoch": 1.1905000000000001, + "grad_norm": 89.40354919433594, + "learning_rate": 5.24008e-07, + "loss": 0.6054, + "step": 119050 + }, + { + "epoch": 1.191, + "grad_norm": 28.081167221069336, + "learning_rate": 5.238080000000001e-07, + "loss": 0.4478, + "step": 119100 + }, + { + "epoch": 1.1915, + "grad_norm": 86.49946594238281, + "learning_rate": 5.23608e-07, + "loss": 0.4845, + "step": 119150 + }, + { + "epoch": 1.192, + "grad_norm": 29.06024169921875, + "learning_rate": 5.234079999999999e-07, + "loss": 0.3214, + "step": 119200 + }, + { + "epoch": 1.1925, + "grad_norm": 21.522964477539062, + "learning_rate": 5.23208e-07, + "loss": 0.3218, + "step": 119250 + }, + { + "epoch": 1.193, + "grad_norm": 77.7065200805664, + "learning_rate": 5.23008e-07, + "loss": 0.422, + "step": 119300 + }, + { + "epoch": 1.1935, + "grad_norm": 6.1106343269348145, + "learning_rate": 5.22808e-07, + "loss": 0.5586, + "step": 119350 + }, + { + "epoch": 1.194, + "grad_norm": 9.23585319519043, + "learning_rate": 5.22608e-07, + "loss": 0.4294, + "step": 119400 + }, + { + "epoch": 1.1945000000000001, + "grad_norm": 79.62310791015625, + "learning_rate": 5.22408e-07, + "loss": 0.4129, + "step": 119450 + }, + { + "epoch": 1.195, + "grad_norm": 18.00088882446289, + "learning_rate": 5.22208e-07, + "loss": 0.5814, + "step": 119500 + }, + { + "epoch": 1.1955, + "grad_norm": 110.30831146240234, + "learning_rate": 5.22008e-07, + "loss": 0.5494, + "step": 119550 + }, + { + "epoch": 1.196, + "grad_norm": 53.79940414428711, + "learning_rate": 5.21808e-07, + "loss": 0.3932, + "step": 119600 + }, + { + "epoch": 1.1965, + "grad_norm": 76.86608123779297, + "learning_rate": 5.216079999999999e-07, + "loss": 0.5098, + "step": 119650 + }, + { + "epoch": 1.197, + "grad_norm": 17.632843017578125, + "learning_rate": 5.21408e-07, + "loss": 0.5619, + "step": 119700 + }, + { + "epoch": 1.1975, + "grad_norm": 121.02203369140625, + "learning_rate": 5.21208e-07, + "loss": 0.478, + "step": 119750 + }, + { + "epoch": 1.198, + "grad_norm": 7.7555084228515625, + "learning_rate": 5.21008e-07, + "loss": 0.3932, + "step": 119800 + }, + { + "epoch": 1.1985000000000001, + "grad_norm": 79.47029113769531, + "learning_rate": 5.208080000000001e-07, + "loss": 0.4755, + "step": 119850 + }, + { + "epoch": 1.199, + "grad_norm": 85.48977661132812, + "learning_rate": 5.206079999999999e-07, + "loss": 0.5045, + "step": 119900 + }, + { + "epoch": 1.1995, + "grad_norm": 27.96647834777832, + "learning_rate": 5.204079999999999e-07, + "loss": 0.4199, + "step": 119950 + }, + { + "epoch": 1.2, + "grad_norm": 93.02487182617188, + "learning_rate": 5.20208e-07, + "loss": 0.471, + "step": 120000 + }, + { + "epoch": 1.2005, + "grad_norm": 27.727758407592773, + "learning_rate": 5.20008e-07, + "loss": 0.4306, + "step": 120050 + }, + { + "epoch": 1.201, + "grad_norm": 82.77517700195312, + "learning_rate": 5.19808e-07, + "loss": 0.4908, + "step": 120100 + }, + { + "epoch": 1.2015, + "grad_norm": 83.77193450927734, + "learning_rate": 5.19608e-07, + "loss": 0.415, + "step": 120150 + }, + { + "epoch": 1.202, + "grad_norm": 1.6518832445144653, + "learning_rate": 5.19408e-07, + "loss": 0.6042, + "step": 120200 + }, + { + "epoch": 1.2025000000000001, + "grad_norm": 39.49478530883789, + "learning_rate": 5.192079999999999e-07, + "loss": 0.4554, + "step": 120250 + }, + { + "epoch": 1.203, + "grad_norm": 62.075286865234375, + "learning_rate": 5.19008e-07, + "loss": 0.4344, + "step": 120300 + }, + { + "epoch": 1.2035, + "grad_norm": 0.7993803024291992, + "learning_rate": 5.18808e-07, + "loss": 0.3913, + "step": 120350 + }, + { + "epoch": 1.204, + "grad_norm": 0.20502905547618866, + "learning_rate": 5.186079999999999e-07, + "loss": 0.2958, + "step": 120400 + }, + { + "epoch": 1.2045, + "grad_norm": 23.898794174194336, + "learning_rate": 5.18412e-07, + "loss": 0.4188, + "step": 120450 + }, + { + "epoch": 1.205, + "grad_norm": 4.550911903381348, + "learning_rate": 5.18212e-07, + "loss": 0.4426, + "step": 120500 + }, + { + "epoch": 1.2055, + "grad_norm": 39.06218719482422, + "learning_rate": 5.18012e-07, + "loss": 0.4951, + "step": 120550 + }, + { + "epoch": 1.206, + "grad_norm": 77.20858001708984, + "learning_rate": 5.178120000000001e-07, + "loss": 0.4668, + "step": 120600 + }, + { + "epoch": 1.2065, + "grad_norm": 27.37639617919922, + "learning_rate": 5.176119999999999e-07, + "loss": 0.5083, + "step": 120650 + }, + { + "epoch": 1.207, + "grad_norm": 102.03683471679688, + "learning_rate": 5.174119999999999e-07, + "loss": 0.4479, + "step": 120700 + }, + { + "epoch": 1.2075, + "grad_norm": 10.521049499511719, + "learning_rate": 5.17212e-07, + "loss": 0.4924, + "step": 120750 + }, + { + "epoch": 1.208, + "grad_norm": 50.4854621887207, + "learning_rate": 5.17012e-07, + "loss": 0.595, + "step": 120800 + }, + { + "epoch": 1.2085, + "grad_norm": 60.808013916015625, + "learning_rate": 5.16812e-07, + "loss": 0.3966, + "step": 120850 + }, + { + "epoch": 1.209, + "grad_norm": 35.996952056884766, + "learning_rate": 5.16612e-07, + "loss": 0.3948, + "step": 120900 + }, + { + "epoch": 1.2095, + "grad_norm": 83.63671112060547, + "learning_rate": 5.16412e-07, + "loss": 0.6021, + "step": 120950 + }, + { + "epoch": 1.21, + "grad_norm": 92.66004180908203, + "learning_rate": 5.162119999999999e-07, + "loss": 0.3842, + "step": 121000 + }, + { + "epoch": 1.2105, + "grad_norm": 13.310367584228516, + "learning_rate": 5.16012e-07, + "loss": 0.6077, + "step": 121050 + }, + { + "epoch": 1.211, + "grad_norm": 4.458984375, + "learning_rate": 5.15812e-07, + "loss": 0.3837, + "step": 121100 + }, + { + "epoch": 1.2115, + "grad_norm": 39.17095184326172, + "learning_rate": 5.156119999999999e-07, + "loss": 0.4516, + "step": 121150 + }, + { + "epoch": 1.212, + "grad_norm": 77.5641860961914, + "learning_rate": 5.15412e-07, + "loss": 0.4531, + "step": 121200 + }, + { + "epoch": 1.2125, + "grad_norm": 31.501514434814453, + "learning_rate": 5.15212e-07, + "loss": 0.4466, + "step": 121250 + }, + { + "epoch": 1.213, + "grad_norm": 5.8556413650512695, + "learning_rate": 5.15012e-07, + "loss": 0.473, + "step": 121300 + }, + { + "epoch": 1.2135, + "grad_norm": 37.5422477722168, + "learning_rate": 5.14812e-07, + "loss": 0.4125, + "step": 121350 + }, + { + "epoch": 1.214, + "grad_norm": 2.340611457824707, + "learning_rate": 5.146119999999999e-07, + "loss": 0.4659, + "step": 121400 + }, + { + "epoch": 1.2145, + "grad_norm": 16.7429141998291, + "learning_rate": 5.144119999999999e-07, + "loss": 0.4964, + "step": 121450 + }, + { + "epoch": 1.215, + "grad_norm": 122.42613220214844, + "learning_rate": 5.14212e-07, + "loss": 0.403, + "step": 121500 + }, + { + "epoch": 1.2155, + "grad_norm": 89.16505432128906, + "learning_rate": 5.14012e-07, + "loss": 0.474, + "step": 121550 + }, + { + "epoch": 1.216, + "grad_norm": 41.55615997314453, + "learning_rate": 5.138120000000001e-07, + "loss": 0.4665, + "step": 121600 + }, + { + "epoch": 1.2165, + "grad_norm": 4.50972843170166, + "learning_rate": 5.13612e-07, + "loss": 0.4449, + "step": 121650 + }, + { + "epoch": 1.217, + "grad_norm": 45.18336486816406, + "learning_rate": 5.134119999999999e-07, + "loss": 0.4837, + "step": 121700 + }, + { + "epoch": 1.2175, + "grad_norm": 15.911096572875977, + "learning_rate": 5.13212e-07, + "loss": 0.4563, + "step": 121750 + }, + { + "epoch": 1.218, + "grad_norm": 32.39420700073242, + "learning_rate": 5.13012e-07, + "loss": 0.3398, + "step": 121800 + }, + { + "epoch": 1.2185, + "grad_norm": 67.41966247558594, + "learning_rate": 5.12812e-07, + "loss": 0.4374, + "step": 121850 + }, + { + "epoch": 1.219, + "grad_norm": 5.196559429168701, + "learning_rate": 5.12612e-07, + "loss": 0.5112, + "step": 121900 + }, + { + "epoch": 1.2195, + "grad_norm": 50.85737991333008, + "learning_rate": 5.12412e-07, + "loss": 0.5262, + "step": 121950 + }, + { + "epoch": 1.22, + "grad_norm": 1.6516265869140625, + "learning_rate": 5.12212e-07, + "loss": 0.3481, + "step": 122000 + }, + { + "epoch": 1.2205, + "grad_norm": 39.09980392456055, + "learning_rate": 5.12012e-07, + "loss": 0.3903, + "step": 122050 + }, + { + "epoch": 1.221, + "grad_norm": 45.83763122558594, + "learning_rate": 5.11812e-07, + "loss": 0.572, + "step": 122100 + }, + { + "epoch": 1.2215, + "grad_norm": 78.31942749023438, + "learning_rate": 5.116119999999999e-07, + "loss": 0.4883, + "step": 122150 + }, + { + "epoch": 1.222, + "grad_norm": 64.6844253540039, + "learning_rate": 5.11412e-07, + "loss": 0.5336, + "step": 122200 + }, + { + "epoch": 1.2225, + "grad_norm": 1.514631986618042, + "learning_rate": 5.11212e-07, + "loss": 0.4101, + "step": 122250 + }, + { + "epoch": 1.223, + "grad_norm": 47.222328186035156, + "learning_rate": 5.11012e-07, + "loss": 0.5162, + "step": 122300 + }, + { + "epoch": 1.2235, + "grad_norm": 54.371952056884766, + "learning_rate": 5.108120000000001e-07, + "loss": 0.5621, + "step": 122350 + }, + { + "epoch": 1.224, + "grad_norm": 62.68058395385742, + "learning_rate": 5.106119999999999e-07, + "loss": 0.4995, + "step": 122400 + }, + { + "epoch": 1.2245, + "grad_norm": 39.96584701538086, + "learning_rate": 5.104119999999999e-07, + "loss": 0.3692, + "step": 122450 + }, + { + "epoch": 1.225, + "grad_norm": 3.750762462615967, + "learning_rate": 5.10212e-07, + "loss": 0.4632, + "step": 122500 + }, + { + "epoch": 1.2255, + "grad_norm": 5.8588080406188965, + "learning_rate": 5.10012e-07, + "loss": 0.4385, + "step": 122550 + }, + { + "epoch": 1.226, + "grad_norm": 27.323911666870117, + "learning_rate": 5.09812e-07, + "loss": 0.3817, + "step": 122600 + }, + { + "epoch": 1.2265, + "grad_norm": 78.1037826538086, + "learning_rate": 5.09612e-07, + "loss": 0.5332, + "step": 122650 + }, + { + "epoch": 1.227, + "grad_norm": 30.815229415893555, + "learning_rate": 5.09412e-07, + "loss": 0.517, + "step": 122700 + }, + { + "epoch": 1.2275, + "grad_norm": 8.353984832763672, + "learning_rate": 5.092119999999999e-07, + "loss": 0.4976, + "step": 122750 + }, + { + "epoch": 1.228, + "grad_norm": 119.27523040771484, + "learning_rate": 5.09012e-07, + "loss": 0.4711, + "step": 122800 + }, + { + "epoch": 1.2285, + "grad_norm": 73.34131622314453, + "learning_rate": 5.08812e-07, + "loss": 0.5385, + "step": 122850 + }, + { + "epoch": 1.229, + "grad_norm": 0.49966078996658325, + "learning_rate": 5.086159999999999e-07, + "loss": 0.4272, + "step": 122900 + }, + { + "epoch": 1.2295, + "grad_norm": 60.721370697021484, + "learning_rate": 5.0842e-07, + "loss": 0.4648, + "step": 122950 + }, + { + "epoch": 1.23, + "grad_norm": 87.89384460449219, + "learning_rate": 5.0822e-07, + "loss": 0.4228, + "step": 123000 + }, + { + "epoch": 1.2305, + "grad_norm": 79.43232727050781, + "learning_rate": 5.0802e-07, + "loss": 0.4643, + "step": 123050 + }, + { + "epoch": 1.231, + "grad_norm": 22.63487434387207, + "learning_rate": 5.078200000000001e-07, + "loss": 0.5683, + "step": 123100 + }, + { + "epoch": 1.2315, + "grad_norm": 79.83702850341797, + "learning_rate": 5.076199999999999e-07, + "loss": 0.5672, + "step": 123150 + }, + { + "epoch": 1.232, + "grad_norm": 56.245948791503906, + "learning_rate": 5.074199999999999e-07, + "loss": 0.4281, + "step": 123200 + }, + { + "epoch": 1.2325, + "grad_norm": 17.55223846435547, + "learning_rate": 5.0722e-07, + "loss": 0.4716, + "step": 123250 + }, + { + "epoch": 1.233, + "grad_norm": 10.06242561340332, + "learning_rate": 5.0702e-07, + "loss": 0.4489, + "step": 123300 + }, + { + "epoch": 1.2335, + "grad_norm": 11.893889427185059, + "learning_rate": 5.0682e-07, + "loss": 0.4702, + "step": 123350 + }, + { + "epoch": 1.234, + "grad_norm": 105.41064453125, + "learning_rate": 5.0662e-07, + "loss": 0.4019, + "step": 123400 + }, + { + "epoch": 1.2345, + "grad_norm": 86.82225799560547, + "learning_rate": 5.0642e-07, + "loss": 0.4129, + "step": 123450 + }, + { + "epoch": 1.2349999999999999, + "grad_norm": 68.03509521484375, + "learning_rate": 5.062199999999999e-07, + "loss": 0.5526, + "step": 123500 + }, + { + "epoch": 1.2355, + "grad_norm": 27.85662841796875, + "learning_rate": 5.0602e-07, + "loss": 0.388, + "step": 123550 + }, + { + "epoch": 1.236, + "grad_norm": 15.7693452835083, + "learning_rate": 5.0582e-07, + "loss": 0.6257, + "step": 123600 + }, + { + "epoch": 1.2365, + "grad_norm": 8.797830581665039, + "learning_rate": 5.056199999999999e-07, + "loss": 0.3529, + "step": 123650 + }, + { + "epoch": 1.237, + "grad_norm": 46.14826202392578, + "learning_rate": 5.0542e-07, + "loss": 0.4523, + "step": 123700 + }, + { + "epoch": 1.2375, + "grad_norm": 16.78753662109375, + "learning_rate": 5.0522e-07, + "loss": 0.4304, + "step": 123750 + }, + { + "epoch": 1.238, + "grad_norm": 90.13909912109375, + "learning_rate": 5.0502e-07, + "loss": 0.4219, + "step": 123800 + }, + { + "epoch": 1.2385, + "grad_norm": 20.657495498657227, + "learning_rate": 5.0482e-07, + "loss": 0.4275, + "step": 123850 + }, + { + "epoch": 1.2389999999999999, + "grad_norm": 117.53194427490234, + "learning_rate": 5.046199999999999e-07, + "loss": 0.5234, + "step": 123900 + }, + { + "epoch": 1.2395, + "grad_norm": 44.009521484375, + "learning_rate": 5.044199999999999e-07, + "loss": 0.3952, + "step": 123950 + }, + { + "epoch": 1.24, + "grad_norm": 62.8895263671875, + "learning_rate": 5.0422e-07, + "loss": 0.3973, + "step": 124000 + }, + { + "epoch": 1.2405, + "grad_norm": 83.4673080444336, + "learning_rate": 5.0402e-07, + "loss": 0.4622, + "step": 124050 + }, + { + "epoch": 1.241, + "grad_norm": 69.27863311767578, + "learning_rate": 5.038200000000001e-07, + "loss": 0.3788, + "step": 124100 + }, + { + "epoch": 1.2415, + "grad_norm": 27.948728561401367, + "learning_rate": 5.0362e-07, + "loss": 0.4593, + "step": 124150 + }, + { + "epoch": 1.242, + "grad_norm": 17.01951789855957, + "learning_rate": 5.034199999999999e-07, + "loss": 0.4347, + "step": 124200 + }, + { + "epoch": 1.2425, + "grad_norm": 95.70691680908203, + "learning_rate": 5.0322e-07, + "loss": 0.4042, + "step": 124250 + }, + { + "epoch": 1.2429999999999999, + "grad_norm": 120.20599365234375, + "learning_rate": 5.0302e-07, + "loss": 0.4595, + "step": 124300 + }, + { + "epoch": 1.2435, + "grad_norm": 119.31159973144531, + "learning_rate": 5.0282e-07, + "loss": 0.4708, + "step": 124350 + }, + { + "epoch": 1.244, + "grad_norm": 72.69075775146484, + "learning_rate": 5.0262e-07, + "loss": 0.5223, + "step": 124400 + }, + { + "epoch": 1.2445, + "grad_norm": 65.67170715332031, + "learning_rate": 5.0242e-07, + "loss": 0.5593, + "step": 124450 + }, + { + "epoch": 1.245, + "grad_norm": 32.64443588256836, + "learning_rate": 5.0222e-07, + "loss": 0.532, + "step": 124500 + }, + { + "epoch": 1.2455, + "grad_norm": 6.584482192993164, + "learning_rate": 5.0202e-07, + "loss": 0.4046, + "step": 124550 + }, + { + "epoch": 1.246, + "grad_norm": 26.74454116821289, + "learning_rate": 5.0182e-07, + "loss": 0.493, + "step": 124600 + }, + { + "epoch": 1.2465, + "grad_norm": 3.494159460067749, + "learning_rate": 5.016199999999999e-07, + "loss": 0.395, + "step": 124650 + }, + { + "epoch": 1.2469999999999999, + "grad_norm": 12.93493938446045, + "learning_rate": 5.0142e-07, + "loss": 0.3823, + "step": 124700 + }, + { + "epoch": 1.2475, + "grad_norm": 58.11399459838867, + "learning_rate": 5.0122e-07, + "loss": 0.4471, + "step": 124750 + }, + { + "epoch": 1.248, + "grad_norm": 51.41734313964844, + "learning_rate": 5.0102e-07, + "loss": 0.4074, + "step": 124800 + }, + { + "epoch": 1.2485, + "grad_norm": 76.81995391845703, + "learning_rate": 5.008200000000001e-07, + "loss": 0.4224, + "step": 124850 + }, + { + "epoch": 1.249, + "grad_norm": 48.812599182128906, + "learning_rate": 5.006199999999999e-07, + "loss": 0.4964, + "step": 124900 + }, + { + "epoch": 1.2495, + "grad_norm": 62.64362716674805, + "learning_rate": 5.004199999999999e-07, + "loss": 0.423, + "step": 124950 + }, + { + "epoch": 1.25, + "grad_norm": 88.53785705566406, + "learning_rate": 5.0022e-07, + "loss": 0.4533, + "step": 125000 + }, + { + "epoch": 1.2505, + "grad_norm": 88.8245620727539, + "learning_rate": 5.0002e-07, + "loss": 0.5211, + "step": 125050 + }, + { + "epoch": 1.251, + "grad_norm": 43.50419616699219, + "learning_rate": 4.99824e-07, + "loss": 0.4007, + "step": 125100 + }, + { + "epoch": 1.2515, + "grad_norm": 11.175210952758789, + "learning_rate": 4.99624e-07, + "loss": 0.5217, + "step": 125150 + }, + { + "epoch": 1.252, + "grad_norm": 50.50872039794922, + "learning_rate": 4.99424e-07, + "loss": 0.4395, + "step": 125200 + }, + { + "epoch": 1.2525, + "grad_norm": 28.890705108642578, + "learning_rate": 4.992239999999999e-07, + "loss": 0.3575, + "step": 125250 + }, + { + "epoch": 1.2530000000000001, + "grad_norm": 2.7780516147613525, + "learning_rate": 4.99024e-07, + "loss": 0.4679, + "step": 125300 + }, + { + "epoch": 1.2535, + "grad_norm": 74.0434341430664, + "learning_rate": 4.988239999999999e-07, + "loss": 0.4339, + "step": 125350 + }, + { + "epoch": 1.254, + "grad_norm": 1.51035737991333, + "learning_rate": 4.98624e-07, + "loss": 0.4149, + "step": 125400 + }, + { + "epoch": 1.2545, + "grad_norm": 11.029925346374512, + "learning_rate": 4.98424e-07, + "loss": 0.4752, + "step": 125450 + }, + { + "epoch": 1.255, + "grad_norm": 4.947063446044922, + "learning_rate": 4.98224e-07, + "loss": 0.5536, + "step": 125500 + }, + { + "epoch": 1.2555, + "grad_norm": 1.3308128118515015, + "learning_rate": 4.98024e-07, + "loss": 0.3657, + "step": 125550 + }, + { + "epoch": 1.256, + "grad_norm": 61.55348205566406, + "learning_rate": 4.978239999999999e-07, + "loss": 0.5046, + "step": 125600 + }, + { + "epoch": 1.2565, + "grad_norm": 128.6259765625, + "learning_rate": 4.97624e-07, + "loss": 0.4097, + "step": 125650 + }, + { + "epoch": 1.2570000000000001, + "grad_norm": 7.346100807189941, + "learning_rate": 4.974239999999999e-07, + "loss": 0.4103, + "step": 125700 + }, + { + "epoch": 1.2575, + "grad_norm": 8.92379093170166, + "learning_rate": 4.97224e-07, + "loss": 0.2637, + "step": 125750 + }, + { + "epoch": 1.258, + "grad_norm": 42.0355110168457, + "learning_rate": 4.97024e-07, + "loss": 0.3719, + "step": 125800 + }, + { + "epoch": 1.2585, + "grad_norm": 77.65760803222656, + "learning_rate": 4.96824e-07, + "loss": 0.3427, + "step": 125850 + }, + { + "epoch": 1.259, + "grad_norm": 95.41388702392578, + "learning_rate": 4.96624e-07, + "loss": 0.4427, + "step": 125900 + }, + { + "epoch": 1.2595, + "grad_norm": 38.658050537109375, + "learning_rate": 4.964239999999999e-07, + "loss": 0.5894, + "step": 125950 + }, + { + "epoch": 1.26, + "grad_norm": 1.92863130569458, + "learning_rate": 4.962239999999999e-07, + "loss": 0.5318, + "step": 126000 + }, + { + "epoch": 1.2605, + "grad_norm": 132.43011474609375, + "learning_rate": 4.96024e-07, + "loss": 0.4031, + "step": 126050 + }, + { + "epoch": 1.2610000000000001, + "grad_norm": 51.750030517578125, + "learning_rate": 4.95824e-07, + "loss": 0.4338, + "step": 126100 + }, + { + "epoch": 1.2615, + "grad_norm": 4.630794525146484, + "learning_rate": 4.95624e-07, + "loss": 0.4925, + "step": 126150 + }, + { + "epoch": 1.262, + "grad_norm": 91.11925506591797, + "learning_rate": 4.95424e-07, + "loss": 0.4702, + "step": 126200 + }, + { + "epoch": 1.2625, + "grad_norm": 68.52376556396484, + "learning_rate": 4.95224e-07, + "loss": 0.6044, + "step": 126250 + }, + { + "epoch": 1.263, + "grad_norm": 84.54537200927734, + "learning_rate": 4.950239999999999e-07, + "loss": 0.5117, + "step": 126300 + }, + { + "epoch": 1.2635, + "grad_norm": 6.372779846191406, + "learning_rate": 4.948239999999999e-07, + "loss": 0.3654, + "step": 126350 + }, + { + "epoch": 1.264, + "grad_norm": 10.201987266540527, + "learning_rate": 4.94624e-07, + "loss": 0.5223, + "step": 126400 + }, + { + "epoch": 1.2645, + "grad_norm": 48.930599212646484, + "learning_rate": 4.944239999999999e-07, + "loss": 0.4524, + "step": 126450 + }, + { + "epoch": 1.2650000000000001, + "grad_norm": 63.114070892333984, + "learning_rate": 4.94224e-07, + "loss": 0.5026, + "step": 126500 + }, + { + "epoch": 1.2655, + "grad_norm": 64.93250274658203, + "learning_rate": 4.94024e-07, + "loss": 0.4449, + "step": 126550 + }, + { + "epoch": 1.266, + "grad_norm": 18.260643005371094, + "learning_rate": 4.93824e-07, + "loss": 0.4546, + "step": 126600 + }, + { + "epoch": 1.2665, + "grad_norm": 56.187164306640625, + "learning_rate": 4.93624e-07, + "loss": 0.4692, + "step": 126650 + }, + { + "epoch": 1.267, + "grad_norm": 75.86105346679688, + "learning_rate": 4.934239999999999e-07, + "loss": 0.4319, + "step": 126700 + }, + { + "epoch": 1.2675, + "grad_norm": 59.311798095703125, + "learning_rate": 4.93228e-07, + "loss": 0.4846, + "step": 126750 + }, + { + "epoch": 1.268, + "grad_norm": 34.68928146362305, + "learning_rate": 4.93028e-07, + "loss": 0.3693, + "step": 126800 + }, + { + "epoch": 1.2685, + "grad_norm": 58.48672866821289, + "learning_rate": 4.92828e-07, + "loss": 0.5163, + "step": 126850 + }, + { + "epoch": 1.2690000000000001, + "grad_norm": 6.793260097503662, + "learning_rate": 4.92628e-07, + "loss": 0.4033, + "step": 126900 + }, + { + "epoch": 1.2695, + "grad_norm": 108.26202392578125, + "learning_rate": 4.92428e-07, + "loss": 0.4662, + "step": 126950 + }, + { + "epoch": 1.27, + "grad_norm": 62.59315872192383, + "learning_rate": 4.92228e-07, + "loss": 0.427, + "step": 127000 + }, + { + "epoch": 1.2705, + "grad_norm": 7.085406303405762, + "learning_rate": 4.92028e-07, + "loss": 0.356, + "step": 127050 + }, + { + "epoch": 1.271, + "grad_norm": 27.684703826904297, + "learning_rate": 4.918279999999999e-07, + "loss": 0.3432, + "step": 127100 + }, + { + "epoch": 1.2715, + "grad_norm": 89.28202819824219, + "learning_rate": 4.91628e-07, + "loss": 0.4642, + "step": 127150 + }, + { + "epoch": 1.272, + "grad_norm": 81.8940658569336, + "learning_rate": 4.91428e-07, + "loss": 0.4793, + "step": 127200 + }, + { + "epoch": 1.2725, + "grad_norm": 57.01283645629883, + "learning_rate": 4.91228e-07, + "loss": 0.6866, + "step": 127250 + }, + { + "epoch": 1.2730000000000001, + "grad_norm": 7.828011989593506, + "learning_rate": 4.91028e-07, + "loss": 0.4175, + "step": 127300 + }, + { + "epoch": 1.2735, + "grad_norm": 6.988823890686035, + "learning_rate": 4.90828e-07, + "loss": 0.4843, + "step": 127350 + }, + { + "epoch": 1.274, + "grad_norm": 129.013916015625, + "learning_rate": 4.906279999999999e-07, + "loss": 0.4135, + "step": 127400 + }, + { + "epoch": 1.2745, + "grad_norm": 2.377220392227173, + "learning_rate": 4.904279999999999e-07, + "loss": 0.4675, + "step": 127450 + }, + { + "epoch": 1.275, + "grad_norm": 73.6028060913086, + "learning_rate": 4.90228e-07, + "loss": 0.4821, + "step": 127500 + }, + { + "epoch": 1.2755, + "grad_norm": 53.05596160888672, + "learning_rate": 4.90028e-07, + "loss": 0.3646, + "step": 127550 + }, + { + "epoch": 1.276, + "grad_norm": 2.0764973163604736, + "learning_rate": 4.89828e-07, + "loss": 0.4229, + "step": 127600 + }, + { + "epoch": 1.2765, + "grad_norm": 36.13945770263672, + "learning_rate": 4.89628e-07, + "loss": 0.4127, + "step": 127650 + }, + { + "epoch": 1.2770000000000001, + "grad_norm": 66.94934844970703, + "learning_rate": 4.89428e-07, + "loss": 0.4685, + "step": 127700 + }, + { + "epoch": 1.2775, + "grad_norm": 100.14254760742188, + "learning_rate": 4.892279999999999e-07, + "loss": 0.5389, + "step": 127750 + }, + { + "epoch": 1.278, + "grad_norm": 56.245418548583984, + "learning_rate": 4.89028e-07, + "loss": 0.4782, + "step": 127800 + }, + { + "epoch": 1.2785, + "grad_norm": 2.368191957473755, + "learning_rate": 4.88828e-07, + "loss": 0.3856, + "step": 127850 + }, + { + "epoch": 1.279, + "grad_norm": 125.4609375, + "learning_rate": 4.88628e-07, + "loss": 0.5209, + "step": 127900 + }, + { + "epoch": 1.2795, + "grad_norm": 9.785893440246582, + "learning_rate": 4.88428e-07, + "loss": 0.3902, + "step": 127950 + }, + { + "epoch": 1.28, + "grad_norm": 76.34190368652344, + "learning_rate": 4.88228e-07, + "loss": 0.4521, + "step": 128000 + }, + { + "epoch": 1.2805, + "grad_norm": 37.4355354309082, + "learning_rate": 4.88028e-07, + "loss": 0.3555, + "step": 128050 + }, + { + "epoch": 1.2810000000000001, + "grad_norm": 9.940116882324219, + "learning_rate": 4.878279999999999e-07, + "loss": 0.3704, + "step": 128100 + }, + { + "epoch": 1.2814999999999999, + "grad_norm": 6.346107006072998, + "learning_rate": 4.87628e-07, + "loss": 0.3613, + "step": 128150 + }, + { + "epoch": 1.282, + "grad_norm": 20.553491592407227, + "learning_rate": 4.874279999999999e-07, + "loss": 0.4186, + "step": 128200 + }, + { + "epoch": 1.2825, + "grad_norm": 31.738750457763672, + "learning_rate": 4.87228e-07, + "loss": 0.5275, + "step": 128250 + }, + { + "epoch": 1.283, + "grad_norm": 32.1572151184082, + "learning_rate": 4.87028e-07, + "loss": 0.5016, + "step": 128300 + }, + { + "epoch": 1.2835, + "grad_norm": 60.900787353515625, + "learning_rate": 4.86828e-07, + "loss": 0.5527, + "step": 128350 + }, + { + "epoch": 1.284, + "grad_norm": 37.76549530029297, + "learning_rate": 4.86628e-07, + "loss": 0.3499, + "step": 128400 + }, + { + "epoch": 1.2845, + "grad_norm": 16.48982810974121, + "learning_rate": 4.86428e-07, + "loss": 0.3264, + "step": 128450 + }, + { + "epoch": 1.285, + "grad_norm": 11.211465835571289, + "learning_rate": 4.862279999999999e-07, + "loss": 0.3776, + "step": 128500 + }, + { + "epoch": 1.2854999999999999, + "grad_norm": 4.8228583335876465, + "learning_rate": 4.86028e-07, + "loss": 0.497, + "step": 128550 + }, + { + "epoch": 1.286, + "grad_norm": 51.52220916748047, + "learning_rate": 4.85828e-07, + "loss": 0.4283, + "step": 128600 + }, + { + "epoch": 1.2865, + "grad_norm": 116.35945892333984, + "learning_rate": 4.85628e-07, + "loss": 0.4819, + "step": 128650 + }, + { + "epoch": 1.287, + "grad_norm": 16.744569778442383, + "learning_rate": 4.85428e-07, + "loss": 0.4903, + "step": 128700 + }, + { + "epoch": 1.2875, + "grad_norm": 83.41133880615234, + "learning_rate": 4.85228e-07, + "loss": 0.4459, + "step": 128750 + }, + { + "epoch": 1.288, + "grad_norm": 14.692625999450684, + "learning_rate": 4.850279999999999e-07, + "loss": 0.3043, + "step": 128800 + }, + { + "epoch": 1.2885, + "grad_norm": 56.206844329833984, + "learning_rate": 4.848279999999999e-07, + "loss": 0.4554, + "step": 128850 + }, + { + "epoch": 1.289, + "grad_norm": 89.15912628173828, + "learning_rate": 4.84628e-07, + "loss": 0.5114, + "step": 128900 + }, + { + "epoch": 1.2894999999999999, + "grad_norm": 101.62841033935547, + "learning_rate": 4.84428e-07, + "loss": 0.491, + "step": 128950 + }, + { + "epoch": 1.29, + "grad_norm": 65.63429260253906, + "learning_rate": 4.84228e-07, + "loss": 0.5268, + "step": 129000 + }, + { + "epoch": 1.2905, + "grad_norm": 50.63749694824219, + "learning_rate": 4.84028e-07, + "loss": 0.4863, + "step": 129050 + }, + { + "epoch": 1.291, + "grad_norm": 9.616476058959961, + "learning_rate": 4.83828e-07, + "loss": 0.3873, + "step": 129100 + }, + { + "epoch": 1.2915, + "grad_norm": 38.35820388793945, + "learning_rate": 4.836279999999999e-07, + "loss": 0.2707, + "step": 129150 + }, + { + "epoch": 1.292, + "grad_norm": 7.243841171264648, + "learning_rate": 4.83428e-07, + "loss": 0.3993, + "step": 129200 + }, + { + "epoch": 1.2925, + "grad_norm": 4.215170860290527, + "learning_rate": 4.83228e-07, + "loss": 0.3448, + "step": 129250 + }, + { + "epoch": 1.293, + "grad_norm": 34.597991943359375, + "learning_rate": 4.83028e-07, + "loss": 0.3508, + "step": 129300 + }, + { + "epoch": 1.2934999999999999, + "grad_norm": 65.05717468261719, + "learning_rate": 4.82828e-07, + "loss": 0.3527, + "step": 129350 + }, + { + "epoch": 1.294, + "grad_norm": 13.01252269744873, + "learning_rate": 4.82628e-07, + "loss": 0.4451, + "step": 129400 + }, + { + "epoch": 1.2945, + "grad_norm": 13.404302597045898, + "learning_rate": 4.82428e-07, + "loss": 0.3919, + "step": 129450 + }, + { + "epoch": 1.295, + "grad_norm": 62.99480056762695, + "learning_rate": 4.822279999999999e-07, + "loss": 0.4649, + "step": 129500 + }, + { + "epoch": 1.2955, + "grad_norm": 45.989017486572266, + "learning_rate": 4.82028e-07, + "loss": 0.479, + "step": 129550 + }, + { + "epoch": 1.296, + "grad_norm": 34.542762756347656, + "learning_rate": 4.818279999999999e-07, + "loss": 0.4426, + "step": 129600 + }, + { + "epoch": 1.2965, + "grad_norm": 41.29684066772461, + "learning_rate": 4.81628e-07, + "loss": 0.4677, + "step": 129650 + }, + { + "epoch": 1.297, + "grad_norm": 92.34574127197266, + "learning_rate": 4.81428e-07, + "loss": 0.3883, + "step": 129700 + }, + { + "epoch": 1.2974999999999999, + "grad_norm": 27.71408462524414, + "learning_rate": 4.81228e-07, + "loss": 0.4247, + "step": 129750 + }, + { + "epoch": 1.298, + "grad_norm": 6.5036492347717285, + "learning_rate": 4.81028e-07, + "loss": 0.4296, + "step": 129800 + }, + { + "epoch": 1.2985, + "grad_norm": 17.653621673583984, + "learning_rate": 4.808279999999999e-07, + "loss": 0.5943, + "step": 129850 + }, + { + "epoch": 1.299, + "grad_norm": 17.503652572631836, + "learning_rate": 4.806319999999999e-07, + "loss": 0.4773, + "step": 129900 + }, + { + "epoch": 1.2995, + "grad_norm": 34.05784225463867, + "learning_rate": 4.80432e-07, + "loss": 0.455, + "step": 129950 + }, + { + "epoch": 1.3, + "grad_norm": 3.221923351287842, + "learning_rate": 4.80232e-07, + "loss": 0.3534, + "step": 130000 + }, + { + "epoch": 1.3005, + "grad_norm": 44.165382385253906, + "learning_rate": 4.80032e-07, + "loss": 0.4089, + "step": 130050 + }, + { + "epoch": 1.301, + "grad_norm": 19.523441314697266, + "learning_rate": 4.79832e-07, + "loss": 0.5113, + "step": 130100 + }, + { + "epoch": 1.3014999999999999, + "grad_norm": 15.177743911743164, + "learning_rate": 4.79632e-07, + "loss": 0.4199, + "step": 130150 + }, + { + "epoch": 1.302, + "grad_norm": 0.24970543384552002, + "learning_rate": 4.794320000000001e-07, + "loss": 0.2723, + "step": 130200 + }, + { + "epoch": 1.3025, + "grad_norm": 80.9247817993164, + "learning_rate": 4.792319999999999e-07, + "loss": 0.4628, + "step": 130250 + }, + { + "epoch": 1.303, + "grad_norm": 24.861129760742188, + "learning_rate": 4.79032e-07, + "loss": 0.4166, + "step": 130300 + }, + { + "epoch": 1.3035, + "grad_norm": 4.55272912979126, + "learning_rate": 4.78832e-07, + "loss": 0.4463, + "step": 130350 + }, + { + "epoch": 1.304, + "grad_norm": 11.789888381958008, + "learning_rate": 4.78632e-07, + "loss": 0.5848, + "step": 130400 + }, + { + "epoch": 1.3045, + "grad_norm": 3.2823047637939453, + "learning_rate": 4.78432e-07, + "loss": 0.4772, + "step": 130450 + }, + { + "epoch": 1.305, + "grad_norm": 8.338168144226074, + "learning_rate": 4.78232e-07, + "loss": 0.3896, + "step": 130500 + }, + { + "epoch": 1.3054999999999999, + "grad_norm": 84.38629150390625, + "learning_rate": 4.78032e-07, + "loss": 0.4665, + "step": 130550 + }, + { + "epoch": 1.306, + "grad_norm": 79.32563781738281, + "learning_rate": 4.778319999999999e-07, + "loss": 0.3935, + "step": 130600 + }, + { + "epoch": 1.3065, + "grad_norm": 12.994901657104492, + "learning_rate": 4.77632e-07, + "loss": 0.4861, + "step": 130650 + }, + { + "epoch": 1.307, + "grad_norm": 34.566383361816406, + "learning_rate": 4.77432e-07, + "loss": 0.4927, + "step": 130700 + }, + { + "epoch": 1.3075, + "grad_norm": 80.90245056152344, + "learning_rate": 4.77232e-07, + "loss": 0.5533, + "step": 130750 + }, + { + "epoch": 1.308, + "grad_norm": 7.320152282714844, + "learning_rate": 4.77032e-07, + "loss": 0.4052, + "step": 130800 + }, + { + "epoch": 1.3085, + "grad_norm": 9.89566421508789, + "learning_rate": 4.7683199999999996e-07, + "loss": 0.411, + "step": 130850 + }, + { + "epoch": 1.309, + "grad_norm": 32.64478302001953, + "learning_rate": 4.76632e-07, + "loss": 0.4795, + "step": 130900 + }, + { + "epoch": 1.3094999999999999, + "grad_norm": 4.066441059112549, + "learning_rate": 4.76432e-07, + "loss": 0.637, + "step": 130950 + }, + { + "epoch": 1.31, + "grad_norm": 147.58811950683594, + "learning_rate": 4.7623199999999997e-07, + "loss": 0.6026, + "step": 131000 + }, + { + "epoch": 1.3105, + "grad_norm": 11.256807327270508, + "learning_rate": 4.76032e-07, + "loss": 0.5771, + "step": 131050 + }, + { + "epoch": 1.311, + "grad_norm": 52.247196197509766, + "learning_rate": 4.7583199999999994e-07, + "loss": 0.4587, + "step": 131100 + }, + { + "epoch": 1.3115, + "grad_norm": 1.7685823440551758, + "learning_rate": 4.75632e-07, + "loss": 0.3916, + "step": 131150 + }, + { + "epoch": 1.312, + "grad_norm": 72.10371398925781, + "learning_rate": 4.75432e-07, + "loss": 0.3049, + "step": 131200 + }, + { + "epoch": 1.3125, + "grad_norm": 86.47714233398438, + "learning_rate": 4.7523199999999995e-07, + "loss": 0.4716, + "step": 131250 + }, + { + "epoch": 1.313, + "grad_norm": 120.87084197998047, + "learning_rate": 4.75032e-07, + "loss": 0.4428, + "step": 131300 + }, + { + "epoch": 1.3135, + "grad_norm": 21.09920883178711, + "learning_rate": 4.7483199999999997e-07, + "loss": 0.521, + "step": 131350 + }, + { + "epoch": 1.314, + "grad_norm": 88.9284439086914, + "learning_rate": 4.7463199999999996e-07, + "loss": 0.5036, + "step": 131400 + }, + { + "epoch": 1.3145, + "grad_norm": 56.702247619628906, + "learning_rate": 4.74432e-07, + "loss": 0.5041, + "step": 131450 + }, + { + "epoch": 1.315, + "grad_norm": 0.35143765807151794, + "learning_rate": 4.74232e-07, + "loss": 0.4163, + "step": 131500 + }, + { + "epoch": 1.3155000000000001, + "grad_norm": 92.69734191894531, + "learning_rate": 4.7403199999999997e-07, + "loss": 0.4294, + "step": 131550 + }, + { + "epoch": 1.316, + "grad_norm": 50.940093994140625, + "learning_rate": 4.7383199999999995e-07, + "loss": 0.4826, + "step": 131600 + }, + { + "epoch": 1.3165, + "grad_norm": 82.14228820800781, + "learning_rate": 4.73632e-07, + "loss": 0.4999, + "step": 131650 + }, + { + "epoch": 1.317, + "grad_norm": 75.90847778320312, + "learning_rate": 4.73432e-07, + "loss": 0.4275, + "step": 131700 + }, + { + "epoch": 1.3175, + "grad_norm": 34.303688049316406, + "learning_rate": 4.7323199999999996e-07, + "loss": 0.453, + "step": 131750 + }, + { + "epoch": 1.318, + "grad_norm": 47.25891876220703, + "learning_rate": 4.73032e-07, + "loss": 0.5394, + "step": 131800 + }, + { + "epoch": 1.3185, + "grad_norm": 0.128638356924057, + "learning_rate": 4.7283199999999993e-07, + "loss": 0.3947, + "step": 131850 + }, + { + "epoch": 1.319, + "grad_norm": 4.6934967041015625, + "learning_rate": 4.7263199999999997e-07, + "loss": 0.5127, + "step": 131900 + }, + { + "epoch": 1.3195000000000001, + "grad_norm": 38.059913635253906, + "learning_rate": 4.72432e-07, + "loss": 0.4648, + "step": 131950 + }, + { + "epoch": 1.32, + "grad_norm": 42.113609313964844, + "learning_rate": 4.72232e-07, + "loss": 0.3855, + "step": 132000 + }, + { + "epoch": 1.3205, + "grad_norm": 88.45838928222656, + "learning_rate": 4.72032e-07, + "loss": 0.3796, + "step": 132050 + }, + { + "epoch": 1.321, + "grad_norm": 60.03329086303711, + "learning_rate": 4.7183199999999996e-07, + "loss": 0.4678, + "step": 132100 + }, + { + "epoch": 1.3215, + "grad_norm": 92.38176727294922, + "learning_rate": 4.71632e-07, + "loss": 0.4438, + "step": 132150 + }, + { + "epoch": 1.322, + "grad_norm": 5.0574235916137695, + "learning_rate": 4.71432e-07, + "loss": 0.4548, + "step": 132200 + }, + { + "epoch": 1.3225, + "grad_norm": 45.51777648925781, + "learning_rate": 4.7123199999999997e-07, + "loss": 0.477, + "step": 132250 + }, + { + "epoch": 1.323, + "grad_norm": 58.61941146850586, + "learning_rate": 4.71032e-07, + "loss": 0.4616, + "step": 132300 + }, + { + "epoch": 1.3235000000000001, + "grad_norm": 9.93522834777832, + "learning_rate": 4.7083199999999994e-07, + "loss": 0.3566, + "step": 132350 + }, + { + "epoch": 1.324, + "grad_norm": 73.70626068115234, + "learning_rate": 4.70632e-07, + "loss": 0.5039, + "step": 132400 + }, + { + "epoch": 1.3245, + "grad_norm": 119.5567626953125, + "learning_rate": 4.70432e-07, + "loss": 0.5233, + "step": 132450 + }, + { + "epoch": 1.325, + "grad_norm": 93.0530014038086, + "learning_rate": 4.7023199999999995e-07, + "loss": 0.3723, + "step": 132500 + }, + { + "epoch": 1.3255, + "grad_norm": 94.75311279296875, + "learning_rate": 4.70032e-07, + "loss": 0.4238, + "step": 132550 + }, + { + "epoch": 1.326, + "grad_norm": 24.6379337310791, + "learning_rate": 4.69832e-07, + "loss": 0.5475, + "step": 132600 + }, + { + "epoch": 1.3265, + "grad_norm": 128.80043029785156, + "learning_rate": 4.6963199999999995e-07, + "loss": 0.5309, + "step": 132650 + }, + { + "epoch": 1.327, + "grad_norm": 11.744426727294922, + "learning_rate": 4.69432e-07, + "loss": 0.4684, + "step": 132700 + }, + { + "epoch": 1.3275000000000001, + "grad_norm": 68.0375747680664, + "learning_rate": 4.69232e-07, + "loss": 0.4713, + "step": 132750 + }, + { + "epoch": 1.328, + "grad_norm": 63.520179748535156, + "learning_rate": 4.6903199999999996e-07, + "loss": 0.4248, + "step": 132800 + }, + { + "epoch": 1.3285, + "grad_norm": 10.20051383972168, + "learning_rate": 4.68832e-07, + "loss": 0.3877, + "step": 132850 + }, + { + "epoch": 1.329, + "grad_norm": 94.5232925415039, + "learning_rate": 4.68632e-07, + "loss": 0.5337, + "step": 132900 + }, + { + "epoch": 1.3295, + "grad_norm": 118.54349517822266, + "learning_rate": 4.6843199999999997e-07, + "loss": 0.3081, + "step": 132950 + }, + { + "epoch": 1.33, + "grad_norm": 1.4136791229248047, + "learning_rate": 4.6823199999999995e-07, + "loss": 0.4322, + "step": 133000 + }, + { + "epoch": 1.3305, + "grad_norm": 13.606802940368652, + "learning_rate": 4.68032e-07, + "loss": 0.5477, + "step": 133050 + }, + { + "epoch": 1.331, + "grad_norm": 58.811222076416016, + "learning_rate": 4.6783200000000003e-07, + "loss": 0.5183, + "step": 133100 + }, + { + "epoch": 1.3315000000000001, + "grad_norm": 1.774475336074829, + "learning_rate": 4.6763199999999996e-07, + "loss": 0.3814, + "step": 133150 + }, + { + "epoch": 1.332, + "grad_norm": 3.9559104442596436, + "learning_rate": 4.67432e-07, + "loss": 0.4848, + "step": 133200 + }, + { + "epoch": 1.3325, + "grad_norm": 44.5838623046875, + "learning_rate": 4.67232e-07, + "loss": 0.4987, + "step": 133250 + }, + { + "epoch": 1.333, + "grad_norm": 45.46150207519531, + "learning_rate": 4.6703199999999997e-07, + "loss": 0.4701, + "step": 133300 + }, + { + "epoch": 1.3335, + "grad_norm": 4.442442417144775, + "learning_rate": 4.66832e-07, + "loss": 0.4129, + "step": 133350 + }, + { + "epoch": 1.334, + "grad_norm": 83.65077209472656, + "learning_rate": 4.66632e-07, + "loss": 0.4445, + "step": 133400 + }, + { + "epoch": 1.3345, + "grad_norm": 95.16340637207031, + "learning_rate": 4.66432e-07, + "loss": 0.4481, + "step": 133450 + }, + { + "epoch": 1.335, + "grad_norm": 12.23143482208252, + "learning_rate": 4.6623199999999996e-07, + "loss": 0.4512, + "step": 133500 + }, + { + "epoch": 1.3355000000000001, + "grad_norm": 38.25416564941406, + "learning_rate": 4.66032e-07, + "loss": 0.3672, + "step": 133550 + }, + { + "epoch": 1.336, + "grad_norm": 57.15376281738281, + "learning_rate": 4.65832e-07, + "loss": 0.5406, + "step": 133600 + }, + { + "epoch": 1.3365, + "grad_norm": 58.421695709228516, + "learning_rate": 4.6563199999999997e-07, + "loss": 0.5141, + "step": 133650 + }, + { + "epoch": 1.337, + "grad_norm": 87.27682495117188, + "learning_rate": 4.65432e-07, + "loss": 0.4306, + "step": 133700 + }, + { + "epoch": 1.3375, + "grad_norm": 71.72251892089844, + "learning_rate": 4.6523199999999994e-07, + "loss": 0.4374, + "step": 133750 + }, + { + "epoch": 1.338, + "grad_norm": 49.080322265625, + "learning_rate": 4.65032e-07, + "loss": 0.5072, + "step": 133800 + }, + { + "epoch": 1.3385, + "grad_norm": 22.312482833862305, + "learning_rate": 4.64832e-07, + "loss": 0.4124, + "step": 133850 + }, + { + "epoch": 1.339, + "grad_norm": 36.95961380004883, + "learning_rate": 4.6463199999999995e-07, + "loss": 0.4986, + "step": 133900 + }, + { + "epoch": 1.3395000000000001, + "grad_norm": 2.186569929122925, + "learning_rate": 4.64432e-07, + "loss": 0.3585, + "step": 133950 + }, + { + "epoch": 1.34, + "grad_norm": 38.58488464355469, + "learning_rate": 4.6423199999999997e-07, + "loss": 0.4046, + "step": 134000 + }, + { + "epoch": 1.3405, + "grad_norm": 83.42021942138672, + "learning_rate": 4.6403199999999996e-07, + "loss": 0.4514, + "step": 134050 + }, + { + "epoch": 1.341, + "grad_norm": 5.808516979217529, + "learning_rate": 4.63832e-07, + "loss": 0.4749, + "step": 134100 + }, + { + "epoch": 1.3415, + "grad_norm": 6.701382637023926, + "learning_rate": 4.63632e-07, + "loss": 0.4058, + "step": 134150 + }, + { + "epoch": 1.342, + "grad_norm": 54.70067596435547, + "learning_rate": 4.6343199999999996e-07, + "loss": 0.4774, + "step": 134200 + }, + { + "epoch": 1.3425, + "grad_norm": 0.1387663632631302, + "learning_rate": 4.6323199999999995e-07, + "loss": 0.3653, + "step": 134250 + }, + { + "epoch": 1.343, + "grad_norm": 32.30421829223633, + "learning_rate": 4.63032e-07, + "loss": 0.3759, + "step": 134300 + }, + { + "epoch": 1.3435000000000001, + "grad_norm": 60.436344146728516, + "learning_rate": 4.62832e-07, + "loss": 0.5095, + "step": 134350 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 83.42176818847656, + "learning_rate": 4.6263199999999996e-07, + "loss": 0.425, + "step": 134400 + }, + { + "epoch": 1.3445, + "grad_norm": 3.811023473739624, + "learning_rate": 4.62432e-07, + "loss": 0.407, + "step": 134450 + }, + { + "epoch": 1.345, + "grad_norm": 74.2744369506836, + "learning_rate": 4.62232e-07, + "loss": 0.4088, + "step": 134500 + }, + { + "epoch": 1.3455, + "grad_norm": 7.530820846557617, + "learning_rate": 4.6203199999999996e-07, + "loss": 0.401, + "step": 134550 + }, + { + "epoch": 1.346, + "grad_norm": 143.85891723632812, + "learning_rate": 4.61832e-07, + "loss": 0.5301, + "step": 134600 + }, + { + "epoch": 1.3465, + "grad_norm": 82.85350036621094, + "learning_rate": 4.61632e-07, + "loss": 0.4794, + "step": 134650 + }, + { + "epoch": 1.347, + "grad_norm": 94.02362823486328, + "learning_rate": 4.6143199999999997e-07, + "loss": 0.408, + "step": 134700 + }, + { + "epoch": 1.3475, + "grad_norm": 1.712831974029541, + "learning_rate": 4.6123199999999996e-07, + "loss": 0.3879, + "step": 134750 + }, + { + "epoch": 1.3479999999999999, + "grad_norm": 91.60545349121094, + "learning_rate": 4.61032e-07, + "loss": 0.4832, + "step": 134800 + }, + { + "epoch": 1.3485, + "grad_norm": 107.8651123046875, + "learning_rate": 4.60832e-07, + "loss": 0.4833, + "step": 134850 + }, + { + "epoch": 1.349, + "grad_norm": 70.89888000488281, + "learning_rate": 4.6063199999999997e-07, + "loss": 0.4173, + "step": 134900 + }, + { + "epoch": 1.3495, + "grad_norm": 100.18441772460938, + "learning_rate": 4.60432e-07, + "loss": 0.5035, + "step": 134950 + }, + { + "epoch": 1.35, + "grad_norm": 12.220166206359863, + "learning_rate": 4.6023599999999994e-07, + "loss": 0.3394, + "step": 135000 + }, + { + "epoch": 1.3505, + "grad_norm": 76.76039123535156, + "learning_rate": 4.60036e-07, + "loss": 0.4507, + "step": 135050 + }, + { + "epoch": 1.351, + "grad_norm": 94.28003692626953, + "learning_rate": 4.59836e-07, + "loss": 0.3727, + "step": 135100 + }, + { + "epoch": 1.3515, + "grad_norm": 109.33968353271484, + "learning_rate": 4.5963599999999995e-07, + "loss": 0.4583, + "step": 135150 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 50.13473892211914, + "learning_rate": 4.59436e-07, + "loss": 0.4678, + "step": 135200 + }, + { + "epoch": 1.3525, + "grad_norm": 30.904821395874023, + "learning_rate": 4.5923599999999997e-07, + "loss": 0.4636, + "step": 135250 + }, + { + "epoch": 1.353, + "grad_norm": 115.35250091552734, + "learning_rate": 4.5903599999999996e-07, + "loss": 0.3913, + "step": 135300 + }, + { + "epoch": 1.3535, + "grad_norm": 116.52228546142578, + "learning_rate": 4.58836e-07, + "loss": 0.553, + "step": 135350 + }, + { + "epoch": 1.354, + "grad_norm": 86.03593444824219, + "learning_rate": 4.58636e-07, + "loss": 0.5031, + "step": 135400 + }, + { + "epoch": 1.3545, + "grad_norm": 72.3880386352539, + "learning_rate": 4.5843599999999996e-07, + "loss": 0.4295, + "step": 135450 + }, + { + "epoch": 1.355, + "grad_norm": 45.91992950439453, + "learning_rate": 4.5823599999999995e-07, + "loss": 0.396, + "step": 135500 + }, + { + "epoch": 1.3555, + "grad_norm": 17.957754135131836, + "learning_rate": 4.58036e-07, + "loss": 0.3191, + "step": 135550 + }, + { + "epoch": 1.3559999999999999, + "grad_norm": 3.7718541622161865, + "learning_rate": 4.57836e-07, + "loss": 0.5687, + "step": 135600 + }, + { + "epoch": 1.3565, + "grad_norm": 16.24213409423828, + "learning_rate": 4.5763599999999996e-07, + "loss": 0.4716, + "step": 135650 + }, + { + "epoch": 1.357, + "grad_norm": 3.7620809078216553, + "learning_rate": 4.57436e-07, + "loss": 0.456, + "step": 135700 + }, + { + "epoch": 1.3575, + "grad_norm": 86.84635925292969, + "learning_rate": 4.57236e-07, + "loss": 0.3689, + "step": 135750 + }, + { + "epoch": 1.358, + "grad_norm": 84.04347229003906, + "learning_rate": 4.5703599999999996e-07, + "loss": 0.4508, + "step": 135800 + }, + { + "epoch": 1.3585, + "grad_norm": 29.693422317504883, + "learning_rate": 4.56836e-07, + "loss": 0.5614, + "step": 135850 + }, + { + "epoch": 1.359, + "grad_norm": 45.12217712402344, + "learning_rate": 4.56636e-07, + "loss": 0.3916, + "step": 135900 + }, + { + "epoch": 1.3595, + "grad_norm": 31.29984474182129, + "learning_rate": 4.5643599999999997e-07, + "loss": 0.4284, + "step": 135950 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 33.480751037597656, + "learning_rate": 4.5623599999999996e-07, + "loss": 0.3547, + "step": 136000 + }, + { + "epoch": 1.3605, + "grad_norm": 2.545854091644287, + "learning_rate": 4.56036e-07, + "loss": 0.418, + "step": 136050 + }, + { + "epoch": 1.361, + "grad_norm": 91.34917449951172, + "learning_rate": 4.55836e-07, + "loss": 0.4277, + "step": 136100 + }, + { + "epoch": 1.3615, + "grad_norm": 36.96833419799805, + "learning_rate": 4.5563599999999996e-07, + "loss": 0.4769, + "step": 136150 + }, + { + "epoch": 1.362, + "grad_norm": 53.53211975097656, + "learning_rate": 4.55436e-07, + "loss": 0.4037, + "step": 136200 + }, + { + "epoch": 1.3625, + "grad_norm": 16.555437088012695, + "learning_rate": 4.5523599999999993e-07, + "loss": 0.4622, + "step": 136250 + }, + { + "epoch": 1.363, + "grad_norm": 2.20904803276062, + "learning_rate": 4.5503599999999997e-07, + "loss": 0.4856, + "step": 136300 + }, + { + "epoch": 1.3635, + "grad_norm": 67.71829986572266, + "learning_rate": 4.54836e-07, + "loss": 0.4294, + "step": 136350 + }, + { + "epoch": 1.3639999999999999, + "grad_norm": 27.427248001098633, + "learning_rate": 4.5463599999999994e-07, + "loss": 0.5167, + "step": 136400 + }, + { + "epoch": 1.3645, + "grad_norm": 80.44205474853516, + "learning_rate": 4.54436e-07, + "loss": 0.4855, + "step": 136450 + }, + { + "epoch": 1.365, + "grad_norm": 15.163834571838379, + "learning_rate": 4.5423599999999997e-07, + "loss": 0.3511, + "step": 136500 + }, + { + "epoch": 1.3655, + "grad_norm": 12.879931449890137, + "learning_rate": 4.5403599999999995e-07, + "loss": 0.4367, + "step": 136550 + }, + { + "epoch": 1.366, + "grad_norm": 68.53999328613281, + "learning_rate": 4.53836e-07, + "loss": 0.5996, + "step": 136600 + }, + { + "epoch": 1.3665, + "grad_norm": 121.40241241455078, + "learning_rate": 4.5363599999999997e-07, + "loss": 0.5384, + "step": 136650 + }, + { + "epoch": 1.367, + "grad_norm": 0.31804296374320984, + "learning_rate": 4.53436e-07, + "loss": 0.6281, + "step": 136700 + }, + { + "epoch": 1.3675, + "grad_norm": 13.64835262298584, + "learning_rate": 4.53236e-07, + "loss": 0.4431, + "step": 136750 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 39.9966926574707, + "learning_rate": 4.53036e-07, + "loss": 0.4694, + "step": 136800 + }, + { + "epoch": 1.3685, + "grad_norm": 29.257156372070312, + "learning_rate": 4.52836e-07, + "loss": 0.5316, + "step": 136850 + }, + { + "epoch": 1.369, + "grad_norm": 68.866943359375, + "learning_rate": 4.5263599999999995e-07, + "loss": 0.4085, + "step": 136900 + }, + { + "epoch": 1.3695, + "grad_norm": 1.851599097251892, + "learning_rate": 4.52436e-07, + "loss": 0.4832, + "step": 136950 + }, + { + "epoch": 1.37, + "grad_norm": 66.2961654663086, + "learning_rate": 4.5223600000000003e-07, + "loss": 0.4054, + "step": 137000 + }, + { + "epoch": 1.3705, + "grad_norm": 12.196954727172852, + "learning_rate": 4.5203599999999996e-07, + "loss": 0.5535, + "step": 137050 + }, + { + "epoch": 1.371, + "grad_norm": 60.95892333984375, + "learning_rate": 4.51836e-07, + "loss": 0.3948, + "step": 137100 + }, + { + "epoch": 1.3715, + "grad_norm": 0.48031020164489746, + "learning_rate": 4.51636e-07, + "loss": 0.5336, + "step": 137150 + }, + { + "epoch": 1.3719999999999999, + "grad_norm": 88.17898559570312, + "learning_rate": 4.5143599999999997e-07, + "loss": 0.553, + "step": 137200 + }, + { + "epoch": 1.3725, + "grad_norm": 15.11093807220459, + "learning_rate": 4.51236e-07, + "loss": 0.4888, + "step": 137250 + }, + { + "epoch": 1.373, + "grad_norm": 27.76417350769043, + "learning_rate": 4.51036e-07, + "loss": 0.5925, + "step": 137300 + }, + { + "epoch": 1.3735, + "grad_norm": 2.448547840118408, + "learning_rate": 4.50836e-07, + "loss": 0.3783, + "step": 137350 + }, + { + "epoch": 1.374, + "grad_norm": 7.2902607917785645, + "learning_rate": 4.5063599999999996e-07, + "loss": 0.4221, + "step": 137400 + }, + { + "epoch": 1.3745, + "grad_norm": 15.440958976745605, + "learning_rate": 4.50436e-07, + "loss": 0.3223, + "step": 137450 + }, + { + "epoch": 1.375, + "grad_norm": 9.393592834472656, + "learning_rate": 4.50236e-07, + "loss": 0.393, + "step": 137500 + }, + { + "epoch": 1.3755, + "grad_norm": 22.066003799438477, + "learning_rate": 4.5003599999999997e-07, + "loss": 0.4808, + "step": 137550 + }, + { + "epoch": 1.376, + "grad_norm": 2.079468250274658, + "learning_rate": 4.49836e-07, + "loss": 0.4312, + "step": 137600 + }, + { + "epoch": 1.3765, + "grad_norm": 74.9188461303711, + "learning_rate": 4.4963599999999994e-07, + "loss": 0.4377, + "step": 137650 + }, + { + "epoch": 1.377, + "grad_norm": 0.9050947427749634, + "learning_rate": 4.49436e-07, + "loss": 0.4035, + "step": 137700 + }, + { + "epoch": 1.3775, + "grad_norm": 104.04845428466797, + "learning_rate": 4.49236e-07, + "loss": 0.3991, + "step": 137750 + }, + { + "epoch": 1.3780000000000001, + "grad_norm": 47.782188415527344, + "learning_rate": 4.4903599999999995e-07, + "loss": 0.4328, + "step": 137800 + }, + { + "epoch": 1.3785, + "grad_norm": 66.12909698486328, + "learning_rate": 4.48836e-07, + "loss": 0.4564, + "step": 137850 + }, + { + "epoch": 1.379, + "grad_norm": 99.37031555175781, + "learning_rate": 4.4863599999999997e-07, + "loss": 0.4904, + "step": 137900 + }, + { + "epoch": 1.3795, + "grad_norm": 110.17735290527344, + "learning_rate": 4.48436e-07, + "loss": 0.5182, + "step": 137950 + }, + { + "epoch": 1.38, + "grad_norm": 50.46833038330078, + "learning_rate": 4.48236e-07, + "loss": 0.5215, + "step": 138000 + }, + { + "epoch": 1.3805, + "grad_norm": 110.3285140991211, + "learning_rate": 4.48036e-07, + "loss": 0.5516, + "step": 138050 + }, + { + "epoch": 1.381, + "grad_norm": 1.3695465326309204, + "learning_rate": 4.47836e-07, + "loss": 0.4337, + "step": 138100 + }, + { + "epoch": 1.3815, + "grad_norm": 38.30186462402344, + "learning_rate": 4.4763599999999995e-07, + "loss": 0.4659, + "step": 138150 + }, + { + "epoch": 1.3820000000000001, + "grad_norm": 70.28520202636719, + "learning_rate": 4.47436e-07, + "loss": 0.48, + "step": 138200 + }, + { + "epoch": 1.3825, + "grad_norm": 9.3742036819458, + "learning_rate": 4.47236e-07, + "loss": 0.4788, + "step": 138250 + }, + { + "epoch": 1.383, + "grad_norm": 0.6705490350723267, + "learning_rate": 4.4703599999999995e-07, + "loss": 0.4825, + "step": 138300 + }, + { + "epoch": 1.3835, + "grad_norm": 30.913881301879883, + "learning_rate": 4.46836e-07, + "loss": 0.4678, + "step": 138350 + }, + { + "epoch": 1.384, + "grad_norm": 71.79946899414062, + "learning_rate": 4.46636e-07, + "loss": 0.5039, + "step": 138400 + }, + { + "epoch": 1.3845, + "grad_norm": 25.541101455688477, + "learning_rate": 4.4643599999999996e-07, + "loss": 0.4999, + "step": 138450 + }, + { + "epoch": 1.385, + "grad_norm": 88.34212493896484, + "learning_rate": 4.46236e-07, + "loss": 0.3772, + "step": 138500 + }, + { + "epoch": 1.3855, + "grad_norm": 28.59035873413086, + "learning_rate": 4.46036e-07, + "loss": 0.3627, + "step": 138550 + }, + { + "epoch": 1.3860000000000001, + "grad_norm": 60.05609130859375, + "learning_rate": 4.4583599999999997e-07, + "loss": 0.458, + "step": 138600 + }, + { + "epoch": 1.3865, + "grad_norm": 52.41706085205078, + "learning_rate": 4.4563599999999995e-07, + "loss": 0.3964, + "step": 138650 + }, + { + "epoch": 1.387, + "grad_norm": 121.22614288330078, + "learning_rate": 4.45436e-07, + "loss": 0.3593, + "step": 138700 + }, + { + "epoch": 1.3875, + "grad_norm": 9.27535629272461, + "learning_rate": 4.45236e-07, + "loss": 0.5142, + "step": 138750 + }, + { + "epoch": 1.388, + "grad_norm": 5.494357109069824, + "learning_rate": 4.4503599999999996e-07, + "loss": 0.4425, + "step": 138800 + }, + { + "epoch": 1.3885, + "grad_norm": 100.4722671508789, + "learning_rate": 4.44836e-07, + "loss": 0.4553, + "step": 138850 + }, + { + "epoch": 1.389, + "grad_norm": 6.942700386047363, + "learning_rate": 4.4463599999999993e-07, + "loss": 0.4525, + "step": 138900 + }, + { + "epoch": 1.3895, + "grad_norm": 38.53964614868164, + "learning_rate": 4.4443599999999997e-07, + "loss": 0.5523, + "step": 138950 + }, + { + "epoch": 1.3900000000000001, + "grad_norm": 38.89802169799805, + "learning_rate": 4.44236e-07, + "loss": 0.3422, + "step": 139000 + }, + { + "epoch": 1.3905, + "grad_norm": 123.1884765625, + "learning_rate": 4.4403599999999994e-07, + "loss": 0.6059, + "step": 139050 + }, + { + "epoch": 1.391, + "grad_norm": 3.0012309551239014, + "learning_rate": 4.43836e-07, + "loss": 0.3437, + "step": 139100 + }, + { + "epoch": 1.3915, + "grad_norm": 1.8544474840164185, + "learning_rate": 4.4363599999999996e-07, + "loss": 0.3483, + "step": 139150 + }, + { + "epoch": 1.392, + "grad_norm": 81.5112533569336, + "learning_rate": 4.43436e-07, + "loss": 0.4729, + "step": 139200 + }, + { + "epoch": 1.3925, + "grad_norm": 88.00301361083984, + "learning_rate": 4.43236e-07, + "loss": 0.4897, + "step": 139250 + }, + { + "epoch": 1.393, + "grad_norm": 7.775111675262451, + "learning_rate": 4.4303599999999997e-07, + "loss": 0.4474, + "step": 139300 + }, + { + "epoch": 1.3935, + "grad_norm": 60.3265266418457, + "learning_rate": 4.42836e-07, + "loss": 0.5121, + "step": 139350 + }, + { + "epoch": 1.3940000000000001, + "grad_norm": 79.7757568359375, + "learning_rate": 4.4263599999999994e-07, + "loss": 0.512, + "step": 139400 + }, + { + "epoch": 1.3945, + "grad_norm": 13.68470287322998, + "learning_rate": 4.42436e-07, + "loss": 0.3414, + "step": 139450 + }, + { + "epoch": 1.395, + "grad_norm": 60.95335388183594, + "learning_rate": 4.42236e-07, + "loss": 0.4045, + "step": 139500 + }, + { + "epoch": 1.3955, + "grad_norm": 47.811424255371094, + "learning_rate": 4.4203599999999995e-07, + "loss": 0.384, + "step": 139550 + }, + { + "epoch": 1.396, + "grad_norm": 17.77873420715332, + "learning_rate": 4.41836e-07, + "loss": 0.4314, + "step": 139600 + }, + { + "epoch": 1.3965, + "grad_norm": 0.8219741582870483, + "learning_rate": 4.4163599999999997e-07, + "loss": 0.4474, + "step": 139650 + }, + { + "epoch": 1.397, + "grad_norm": 44.76259231567383, + "learning_rate": 4.4143599999999996e-07, + "loss": 0.4728, + "step": 139700 + }, + { + "epoch": 1.3975, + "grad_norm": 32.0296745300293, + "learning_rate": 4.41236e-07, + "loss": 0.2824, + "step": 139750 + }, + { + "epoch": 1.3980000000000001, + "grad_norm": 6.995255947113037, + "learning_rate": 4.41036e-07, + "loss": 0.4395, + "step": 139800 + }, + { + "epoch": 1.3985, + "grad_norm": 3.6521756649017334, + "learning_rate": 4.4083599999999996e-07, + "loss": 0.4282, + "step": 139850 + }, + { + "epoch": 1.399, + "grad_norm": 0.5228773951530457, + "learning_rate": 4.4063999999999995e-07, + "loss": 0.5437, + "step": 139900 + }, + { + "epoch": 1.3995, + "grad_norm": 0.48555630445480347, + "learning_rate": 4.4044e-07, + "loss": 0.5651, + "step": 139950 + }, + { + "epoch": 1.4, + "grad_norm": 81.31806182861328, + "learning_rate": 4.4024e-07, + "loss": 0.4659, + "step": 140000 + }, + { + "epoch": 1.4005, + "grad_norm": 1.3937146663665771, + "learning_rate": 4.4003999999999996e-07, + "loss": 0.4486, + "step": 140050 + }, + { + "epoch": 1.401, + "grad_norm": 21.106035232543945, + "learning_rate": 4.3984e-07, + "loss": 0.4779, + "step": 140100 + }, + { + "epoch": 1.4015, + "grad_norm": 13.850019454956055, + "learning_rate": 4.3963999999999993e-07, + "loss": 0.3871, + "step": 140150 + }, + { + "epoch": 1.4020000000000001, + "grad_norm": 1.2949334383010864, + "learning_rate": 4.3943999999999997e-07, + "loss": 0.5905, + "step": 140200 + }, + { + "epoch": 1.4025, + "grad_norm": 85.25445556640625, + "learning_rate": 4.3924e-07, + "loss": 0.5164, + "step": 140250 + }, + { + "epoch": 1.403, + "grad_norm": 21.79319190979004, + "learning_rate": 4.3903999999999994e-07, + "loss": 0.4014, + "step": 140300 + }, + { + "epoch": 1.4035, + "grad_norm": 16.37718391418457, + "learning_rate": 4.3884e-07, + "loss": 0.4419, + "step": 140350 + }, + { + "epoch": 1.404, + "grad_norm": 3.0240814685821533, + "learning_rate": 4.3863999999999996e-07, + "loss": 0.5172, + "step": 140400 + }, + { + "epoch": 1.4045, + "grad_norm": 106.63111877441406, + "learning_rate": 4.3844e-07, + "loss": 0.4104, + "step": 140450 + }, + { + "epoch": 1.405, + "grad_norm": 17.3591251373291, + "learning_rate": 4.3824e-07, + "loss": 0.4768, + "step": 140500 + }, + { + "epoch": 1.4055, + "grad_norm": 76.1080322265625, + "learning_rate": 4.3803999999999997e-07, + "loss": 0.4747, + "step": 140550 + }, + { + "epoch": 1.4060000000000001, + "grad_norm": 3.34743595123291, + "learning_rate": 4.3784e-07, + "loss": 0.3191, + "step": 140600 + }, + { + "epoch": 1.4064999999999999, + "grad_norm": 38.74766159057617, + "learning_rate": 4.3763999999999994e-07, + "loss": 0.4569, + "step": 140650 + }, + { + "epoch": 1.407, + "grad_norm": 46.4010124206543, + "learning_rate": 4.3744e-07, + "loss": 0.3989, + "step": 140700 + }, + { + "epoch": 1.4075, + "grad_norm": 69.19367980957031, + "learning_rate": 4.3724e-07, + "loss": 0.4423, + "step": 140750 + }, + { + "epoch": 1.408, + "grad_norm": 86.27606964111328, + "learning_rate": 4.3703999999999995e-07, + "loss": 0.3281, + "step": 140800 + }, + { + "epoch": 1.4085, + "grad_norm": 0.331612229347229, + "learning_rate": 4.3684e-07, + "loss": 0.552, + "step": 140850 + }, + { + "epoch": 1.409, + "grad_norm": 17.078815460205078, + "learning_rate": 4.3663999999999997e-07, + "loss": 0.4963, + "step": 140900 + }, + { + "epoch": 1.4095, + "grad_norm": 17.343914031982422, + "learning_rate": 4.3643999999999996e-07, + "loss": 0.3181, + "step": 140950 + }, + { + "epoch": 1.41, + "grad_norm": 75.39768981933594, + "learning_rate": 4.3624e-07, + "loss": 0.3793, + "step": 141000 + }, + { + "epoch": 1.4104999999999999, + "grad_norm": 0.2830001711845398, + "learning_rate": 4.3604e-07, + "loss": 0.4037, + "step": 141050 + }, + { + "epoch": 1.411, + "grad_norm": 30.401926040649414, + "learning_rate": 4.3583999999999996e-07, + "loss": 0.427, + "step": 141100 + }, + { + "epoch": 1.4115, + "grad_norm": 10.625136375427246, + "learning_rate": 4.3564e-07, + "loss": 0.4488, + "step": 141150 + }, + { + "epoch": 1.412, + "grad_norm": 8.427471160888672, + "learning_rate": 4.3544e-07, + "loss": 0.3985, + "step": 141200 + }, + { + "epoch": 1.4125, + "grad_norm": 26.40860366821289, + "learning_rate": 4.3523999999999997e-07, + "loss": 0.3947, + "step": 141250 + }, + { + "epoch": 1.413, + "grad_norm": 87.355224609375, + "learning_rate": 4.3503999999999996e-07, + "loss": 0.3551, + "step": 141300 + }, + { + "epoch": 1.4135, + "grad_norm": 31.92610740661621, + "learning_rate": 4.3484e-07, + "loss": 0.5354, + "step": 141350 + }, + { + "epoch": 1.414, + "grad_norm": 24.853439331054688, + "learning_rate": 4.3464000000000003e-07, + "loss": 0.4651, + "step": 141400 + }, + { + "epoch": 1.4144999999999999, + "grad_norm": 73.6370849609375, + "learning_rate": 4.3443999999999996e-07, + "loss": 0.4862, + "step": 141450 + }, + { + "epoch": 1.415, + "grad_norm": 35.83401870727539, + "learning_rate": 4.3424e-07, + "loss": 0.4366, + "step": 141500 + }, + { + "epoch": 1.4155, + "grad_norm": 30.853010177612305, + "learning_rate": 4.3404e-07, + "loss": 0.5262, + "step": 141550 + }, + { + "epoch": 1.416, + "grad_norm": 118.53955078125, + "learning_rate": 4.3383999999999997e-07, + "loss": 0.5409, + "step": 141600 + }, + { + "epoch": 1.4165, + "grad_norm": 40.07814025878906, + "learning_rate": 4.3364399999999996e-07, + "loss": 0.4521, + "step": 141650 + }, + { + "epoch": 1.417, + "grad_norm": 9.961045265197754, + "learning_rate": 4.33444e-07, + "loss": 0.3655, + "step": 141700 + }, + { + "epoch": 1.4175, + "grad_norm": 141.18646240234375, + "learning_rate": 4.33244e-07, + "loss": 0.6208, + "step": 141750 + }, + { + "epoch": 1.418, + "grad_norm": 58.834197998046875, + "learning_rate": 4.3304399999999997e-07, + "loss": 0.3911, + "step": 141800 + }, + { + "epoch": 1.4184999999999999, + "grad_norm": 63.38532257080078, + "learning_rate": 4.32844e-07, + "loss": 0.3999, + "step": 141850 + }, + { + "epoch": 1.419, + "grad_norm": 77.08700561523438, + "learning_rate": 4.3264399999999994e-07, + "loss": 0.5151, + "step": 141900 + }, + { + "epoch": 1.4195, + "grad_norm": 0.9496738910675049, + "learning_rate": 4.32444e-07, + "loss": 0.4697, + "step": 141950 + }, + { + "epoch": 1.42, + "grad_norm": 35.80956268310547, + "learning_rate": 4.32244e-07, + "loss": 0.3883, + "step": 142000 + }, + { + "epoch": 1.4205, + "grad_norm": 82.25574493408203, + "learning_rate": 4.3204399999999995e-07, + "loss": 0.5685, + "step": 142050 + }, + { + "epoch": 1.421, + "grad_norm": 8.83198356628418, + "learning_rate": 4.31844e-07, + "loss": 0.4466, + "step": 142100 + }, + { + "epoch": 1.4215, + "grad_norm": 84.17395782470703, + "learning_rate": 4.31644e-07, + "loss": 0.4718, + "step": 142150 + }, + { + "epoch": 1.422, + "grad_norm": 94.86622619628906, + "learning_rate": 4.3144399999999996e-07, + "loss": 0.3978, + "step": 142200 + }, + { + "epoch": 1.4224999999999999, + "grad_norm": 77.70457458496094, + "learning_rate": 4.31244e-07, + "loss": 0.4577, + "step": 142250 + }, + { + "epoch": 1.423, + "grad_norm": 95.18637084960938, + "learning_rate": 4.31044e-07, + "loss": 0.4149, + "step": 142300 + }, + { + "epoch": 1.4235, + "grad_norm": 63.49544143676758, + "learning_rate": 4.3084399999999996e-07, + "loss": 0.5862, + "step": 142350 + }, + { + "epoch": 1.424, + "grad_norm": 16.329381942749023, + "learning_rate": 4.30644e-07, + "loss": 0.3519, + "step": 142400 + }, + { + "epoch": 1.4245, + "grad_norm": 8.99925708770752, + "learning_rate": 4.30444e-07, + "loss": 0.3929, + "step": 142450 + }, + { + "epoch": 1.425, + "grad_norm": 3.1051368713378906, + "learning_rate": 4.3024399999999997e-07, + "loss": 0.3946, + "step": 142500 + }, + { + "epoch": 1.4255, + "grad_norm": 116.58087921142578, + "learning_rate": 4.3004399999999996e-07, + "loss": 0.5392, + "step": 142550 + }, + { + "epoch": 1.426, + "grad_norm": 11.166736602783203, + "learning_rate": 4.29848e-07, + "loss": 0.4117, + "step": 142600 + }, + { + "epoch": 1.4264999999999999, + "grad_norm": 45.40515899658203, + "learning_rate": 4.2964799999999993e-07, + "loss": 0.4275, + "step": 142650 + }, + { + "epoch": 1.427, + "grad_norm": 49.694183349609375, + "learning_rate": 4.2944799999999997e-07, + "loss": 0.3936, + "step": 142700 + }, + { + "epoch": 1.4275, + "grad_norm": 52.98374557495117, + "learning_rate": 4.29248e-07, + "loss": 0.4006, + "step": 142750 + }, + { + "epoch": 1.428, + "grad_norm": 23.433656692504883, + "learning_rate": 4.29048e-07, + "loss": 0.4007, + "step": 142800 + }, + { + "epoch": 1.4285, + "grad_norm": 88.89615631103516, + "learning_rate": 4.28848e-07, + "loss": 0.442, + "step": 142850 + }, + { + "epoch": 1.429, + "grad_norm": 2.4820308685302734, + "learning_rate": 4.2864799999999996e-07, + "loss": 0.4532, + "step": 142900 + }, + { + "epoch": 1.4295, + "grad_norm": 26.804426193237305, + "learning_rate": 4.28448e-07, + "loss": 0.4837, + "step": 142950 + }, + { + "epoch": 1.43, + "grad_norm": 9.690469741821289, + "learning_rate": 4.28248e-07, + "loss": 0.4484, + "step": 143000 + }, + { + "epoch": 1.4304999999999999, + "grad_norm": 31.82974624633789, + "learning_rate": 4.2804799999999997e-07, + "loss": 0.4718, + "step": 143050 + }, + { + "epoch": 1.431, + "grad_norm": 76.13028717041016, + "learning_rate": 4.27848e-07, + "loss": 0.4575, + "step": 143100 + }, + { + "epoch": 1.4315, + "grad_norm": 79.27205657958984, + "learning_rate": 4.2764799999999994e-07, + "loss": 0.4532, + "step": 143150 + }, + { + "epoch": 1.432, + "grad_norm": 1.6054680347442627, + "learning_rate": 4.27448e-07, + "loss": 0.4233, + "step": 143200 + }, + { + "epoch": 1.4325, + "grad_norm": 5.973708629608154, + "learning_rate": 4.27248e-07, + "loss": 0.4445, + "step": 143250 + }, + { + "epoch": 1.433, + "grad_norm": 97.4744873046875, + "learning_rate": 4.2704799999999995e-07, + "loss": 0.5598, + "step": 143300 + }, + { + "epoch": 1.4335, + "grad_norm": 7.280982494354248, + "learning_rate": 4.26848e-07, + "loss": 0.4821, + "step": 143350 + }, + { + "epoch": 1.434, + "grad_norm": 30.4831485748291, + "learning_rate": 4.26648e-07, + "loss": 0.3772, + "step": 143400 + }, + { + "epoch": 1.4344999999999999, + "grad_norm": 6.623522758483887, + "learning_rate": 4.2644799999999996e-07, + "loss": 0.4908, + "step": 143450 + }, + { + "epoch": 1.435, + "grad_norm": 26.69012451171875, + "learning_rate": 4.26248e-07, + "loss": 0.4367, + "step": 143500 + }, + { + "epoch": 1.4355, + "grad_norm": 48.03159713745117, + "learning_rate": 4.26048e-07, + "loss": 0.57, + "step": 143550 + }, + { + "epoch": 1.436, + "grad_norm": 0.36997270584106445, + "learning_rate": 4.2584799999999996e-07, + "loss": 0.381, + "step": 143600 + }, + { + "epoch": 1.4365, + "grad_norm": 75.4528579711914, + "learning_rate": 4.25648e-07, + "loss": 0.3569, + "step": 143650 + }, + { + "epoch": 1.437, + "grad_norm": 23.47142219543457, + "learning_rate": 4.25448e-07, + "loss": 0.4182, + "step": 143700 + }, + { + "epoch": 1.4375, + "grad_norm": 40.28401184082031, + "learning_rate": 4.2524799999999997e-07, + "loss": 0.4932, + "step": 143750 + }, + { + "epoch": 1.438, + "grad_norm": 8.089696884155273, + "learning_rate": 4.2504799999999996e-07, + "loss": 0.5582, + "step": 143800 + }, + { + "epoch": 1.4385, + "grad_norm": 93.84317779541016, + "learning_rate": 4.24848e-07, + "loss": 0.3421, + "step": 143850 + }, + { + "epoch": 1.439, + "grad_norm": 78.79322052001953, + "learning_rate": 4.2464800000000003e-07, + "loss": 0.5463, + "step": 143900 + }, + { + "epoch": 1.4395, + "grad_norm": 153.0576171875, + "learning_rate": 4.2444799999999996e-07, + "loss": 0.4011, + "step": 143950 + }, + { + "epoch": 1.44, + "grad_norm": 4.934996604919434, + "learning_rate": 4.24248e-07, + "loss": 0.4774, + "step": 144000 + }, + { + "epoch": 1.4405000000000001, + "grad_norm": 86.63742065429688, + "learning_rate": 4.24048e-07, + "loss": 0.4094, + "step": 144050 + }, + { + "epoch": 1.441, + "grad_norm": 79.47491455078125, + "learning_rate": 4.2384799999999997e-07, + "loss": 0.3831, + "step": 144100 + }, + { + "epoch": 1.4415, + "grad_norm": 55.07907485961914, + "learning_rate": 4.23648e-07, + "loss": 0.4505, + "step": 144150 + }, + { + "epoch": 1.442, + "grad_norm": 7.138432025909424, + "learning_rate": 4.23448e-07, + "loss": 0.4995, + "step": 144200 + }, + { + "epoch": 1.4425, + "grad_norm": 31.88157844543457, + "learning_rate": 4.23248e-07, + "loss": 0.674, + "step": 144250 + }, + { + "epoch": 1.443, + "grad_norm": 24.490238189697266, + "learning_rate": 4.2304799999999996e-07, + "loss": 0.4163, + "step": 144300 + }, + { + "epoch": 1.4435, + "grad_norm": 67.5228042602539, + "learning_rate": 4.22848e-07, + "loss": 0.3403, + "step": 144350 + }, + { + "epoch": 1.444, + "grad_norm": 1.8344687223434448, + "learning_rate": 4.22648e-07, + "loss": 0.5146, + "step": 144400 + }, + { + "epoch": 1.4445000000000001, + "grad_norm": 52.78595733642578, + "learning_rate": 4.2244799999999997e-07, + "loss": 0.5185, + "step": 144450 + }, + { + "epoch": 1.445, + "grad_norm": 8.644986152648926, + "learning_rate": 4.22248e-07, + "loss": 0.5884, + "step": 144500 + }, + { + "epoch": 1.4455, + "grad_norm": 28.123138427734375, + "learning_rate": 4.2204799999999994e-07, + "loss": 0.4497, + "step": 144550 + }, + { + "epoch": 1.446, + "grad_norm": 67.64913177490234, + "learning_rate": 4.21848e-07, + "loss": 0.4375, + "step": 144600 + }, + { + "epoch": 1.4465, + "grad_norm": 30.597097396850586, + "learning_rate": 4.21648e-07, + "loss": 0.5279, + "step": 144650 + }, + { + "epoch": 1.447, + "grad_norm": 100.2088851928711, + "learning_rate": 4.2144799999999995e-07, + "loss": 0.5009, + "step": 144700 + }, + { + "epoch": 1.4475, + "grad_norm": 55.302974700927734, + "learning_rate": 4.21248e-07, + "loss": 0.455, + "step": 144750 + }, + { + "epoch": 1.448, + "grad_norm": 67.07560729980469, + "learning_rate": 4.2104799999999997e-07, + "loss": 0.4658, + "step": 144800 + }, + { + "epoch": 1.4485000000000001, + "grad_norm": 13.170785903930664, + "learning_rate": 4.2084799999999996e-07, + "loss": 0.3561, + "step": 144850 + }, + { + "epoch": 1.449, + "grad_norm": 7.08843994140625, + "learning_rate": 4.20648e-07, + "loss": 0.3366, + "step": 144900 + }, + { + "epoch": 1.4495, + "grad_norm": 3.2631988525390625, + "learning_rate": 4.20448e-07, + "loss": 0.435, + "step": 144950 + }, + { + "epoch": 1.45, + "grad_norm": 14.511624336242676, + "learning_rate": 4.2024799999999997e-07, + "loss": 0.2825, + "step": 145000 + }, + { + "epoch": 1.4505, + "grad_norm": 13.6480073928833, + "learning_rate": 4.2004799999999995e-07, + "loss": 0.4737, + "step": 145050 + }, + { + "epoch": 1.451, + "grad_norm": 84.39595794677734, + "learning_rate": 4.19848e-07, + "loss": 0.478, + "step": 145100 + }, + { + "epoch": 1.4515, + "grad_norm": 112.03206634521484, + "learning_rate": 4.1964800000000003e-07, + "loss": 0.3953, + "step": 145150 + }, + { + "epoch": 1.452, + "grad_norm": 13.859879493713379, + "learning_rate": 4.1944799999999996e-07, + "loss": 0.4724, + "step": 145200 + }, + { + "epoch": 1.4525000000000001, + "grad_norm": 51.57634735107422, + "learning_rate": 4.19248e-07, + "loss": 0.4693, + "step": 145250 + }, + { + "epoch": 1.453, + "grad_norm": 1.5918205976486206, + "learning_rate": 4.19048e-07, + "loss": 0.3019, + "step": 145300 + }, + { + "epoch": 1.4535, + "grad_norm": 86.82250213623047, + "learning_rate": 4.1884799999999997e-07, + "loss": 0.4294, + "step": 145350 + }, + { + "epoch": 1.454, + "grad_norm": 29.19518280029297, + "learning_rate": 4.18648e-07, + "loss": 0.4965, + "step": 145400 + }, + { + "epoch": 1.4545, + "grad_norm": 38.55145263671875, + "learning_rate": 4.18448e-07, + "loss": 0.3507, + "step": 145450 + }, + { + "epoch": 1.455, + "grad_norm": 96.74462127685547, + "learning_rate": 4.18248e-07, + "loss": 0.5309, + "step": 145500 + }, + { + "epoch": 1.4555, + "grad_norm": 71.57083129882812, + "learning_rate": 4.1804799999999996e-07, + "loss": 0.5173, + "step": 145550 + }, + { + "epoch": 1.456, + "grad_norm": 142.42713928222656, + "learning_rate": 4.17848e-07, + "loss": 0.4896, + "step": 145600 + }, + { + "epoch": 1.4565000000000001, + "grad_norm": 71.31710815429688, + "learning_rate": 4.17648e-07, + "loss": 0.4368, + "step": 145650 + }, + { + "epoch": 1.457, + "grad_norm": 40.85289001464844, + "learning_rate": 4.1744799999999997e-07, + "loss": 0.4845, + "step": 145700 + }, + { + "epoch": 1.4575, + "grad_norm": 92.76287841796875, + "learning_rate": 4.17248e-07, + "loss": 0.4203, + "step": 145750 + }, + { + "epoch": 1.458, + "grad_norm": 34.11021423339844, + "learning_rate": 4.1704799999999994e-07, + "loss": 0.3603, + "step": 145800 + }, + { + "epoch": 1.4585, + "grad_norm": 19.930692672729492, + "learning_rate": 4.16848e-07, + "loss": 0.4356, + "step": 145850 + }, + { + "epoch": 1.459, + "grad_norm": 9.085647583007812, + "learning_rate": 4.16648e-07, + "loss": 0.3679, + "step": 145900 + }, + { + "epoch": 1.4595, + "grad_norm": 127.162841796875, + "learning_rate": 4.1644799999999994e-07, + "loss": 0.4983, + "step": 145950 + }, + { + "epoch": 1.46, + "grad_norm": 26.87095069885254, + "learning_rate": 4.16248e-07, + "loss": 0.452, + "step": 146000 + }, + { + "epoch": 1.4605000000000001, + "grad_norm": 6.62654972076416, + "learning_rate": 4.1604799999999997e-07, + "loss": 0.4967, + "step": 146050 + }, + { + "epoch": 1.461, + "grad_norm": 0.27412235736846924, + "learning_rate": 4.1584799999999995e-07, + "loss": 0.35, + "step": 146100 + }, + { + "epoch": 1.4615, + "grad_norm": 0.39492130279541016, + "learning_rate": 4.15648e-07, + "loss": 0.5392, + "step": 146150 + }, + { + "epoch": 1.462, + "grad_norm": 0.3583661615848541, + "learning_rate": 4.15448e-07, + "loss": 0.4021, + "step": 146200 + }, + { + "epoch": 1.4625, + "grad_norm": 66.29902648925781, + "learning_rate": 4.15248e-07, + "loss": 0.4798, + "step": 146250 + }, + { + "epoch": 1.463, + "grad_norm": 14.90131664276123, + "learning_rate": 4.1504799999999995e-07, + "loss": 0.4671, + "step": 146300 + }, + { + "epoch": 1.4635, + "grad_norm": 8.83768367767334, + "learning_rate": 4.14848e-07, + "loss": 0.5414, + "step": 146350 + }, + { + "epoch": 1.464, + "grad_norm": 39.12753677368164, + "learning_rate": 4.14648e-07, + "loss": 0.5358, + "step": 146400 + }, + { + "epoch": 1.4645000000000001, + "grad_norm": 50.82841110229492, + "learning_rate": 4.1444799999999995e-07, + "loss": 0.4129, + "step": 146450 + }, + { + "epoch": 1.465, + "grad_norm": 39.67839431762695, + "learning_rate": 4.14248e-07, + "loss": 0.4745, + "step": 146500 + }, + { + "epoch": 1.4655, + "grad_norm": 89.63794708251953, + "learning_rate": 4.1404800000000003e-07, + "loss": 0.5237, + "step": 146550 + }, + { + "epoch": 1.466, + "grad_norm": 65.97050476074219, + "learning_rate": 4.1384799999999996e-07, + "loss": 0.4113, + "step": 146600 + }, + { + "epoch": 1.4665, + "grad_norm": 30.679176330566406, + "learning_rate": 4.13648e-07, + "loss": 0.5226, + "step": 146650 + }, + { + "epoch": 1.467, + "grad_norm": 75.3191909790039, + "learning_rate": 4.13448e-07, + "loss": 0.4449, + "step": 146700 + }, + { + "epoch": 1.4675, + "grad_norm": 63.889930725097656, + "learning_rate": 4.1324799999999997e-07, + "loss": 0.4736, + "step": 146750 + }, + { + "epoch": 1.468, + "grad_norm": 61.48563003540039, + "learning_rate": 4.13048e-07, + "loss": 0.3692, + "step": 146800 + }, + { + "epoch": 1.4685000000000001, + "grad_norm": 34.46501541137695, + "learning_rate": 4.12848e-07, + "loss": 0.3301, + "step": 146850 + }, + { + "epoch": 1.4689999999999999, + "grad_norm": 11.682842254638672, + "learning_rate": 4.12648e-07, + "loss": 0.4875, + "step": 146900 + }, + { + "epoch": 1.4695, + "grad_norm": 6.080808639526367, + "learning_rate": 4.1244799999999996e-07, + "loss": 0.5323, + "step": 146950 + }, + { + "epoch": 1.47, + "grad_norm": 29.79290008544922, + "learning_rate": 4.12248e-07, + "loss": 0.5893, + "step": 147000 + }, + { + "epoch": 1.4705, + "grad_norm": 41.09132766723633, + "learning_rate": 4.12048e-07, + "loss": 0.4248, + "step": 147050 + }, + { + "epoch": 1.471, + "grad_norm": 62.73756408691406, + "learning_rate": 4.1184799999999997e-07, + "loss": 0.4393, + "step": 147100 + }, + { + "epoch": 1.4715, + "grad_norm": 41.83452224731445, + "learning_rate": 4.11648e-07, + "loss": 0.396, + "step": 147150 + }, + { + "epoch": 1.472, + "grad_norm": 59.541011810302734, + "learning_rate": 4.1144799999999994e-07, + "loss": 0.4265, + "step": 147200 + }, + { + "epoch": 1.4725, + "grad_norm": 23.928707122802734, + "learning_rate": 4.11248e-07, + "loss": 0.3726, + "step": 147250 + }, + { + "epoch": 1.4729999999999999, + "grad_norm": 70.83897399902344, + "learning_rate": 4.11048e-07, + "loss": 0.3918, + "step": 147300 + }, + { + "epoch": 1.4735, + "grad_norm": 21.586318969726562, + "learning_rate": 4.1084799999999995e-07, + "loss": 0.3456, + "step": 147350 + }, + { + "epoch": 1.474, + "grad_norm": 11.594403266906738, + "learning_rate": 4.10648e-07, + "loss": 0.3408, + "step": 147400 + }, + { + "epoch": 1.4745, + "grad_norm": 47.519081115722656, + "learning_rate": 4.1044799999999997e-07, + "loss": 0.3752, + "step": 147450 + }, + { + "epoch": 1.475, + "grad_norm": 128.55673217773438, + "learning_rate": 4.10248e-07, + "loss": 0.4493, + "step": 147500 + }, + { + "epoch": 1.4755, + "grad_norm": 67.64390563964844, + "learning_rate": 4.10048e-07, + "loss": 0.5414, + "step": 147550 + }, + { + "epoch": 1.476, + "grad_norm": 35.612281799316406, + "learning_rate": 4.09848e-07, + "loss": 0.4347, + "step": 147600 + }, + { + "epoch": 1.4765, + "grad_norm": 71.22669219970703, + "learning_rate": 4.09648e-07, + "loss": 0.4211, + "step": 147650 + }, + { + "epoch": 1.4769999999999999, + "grad_norm": 79.02700805664062, + "learning_rate": 4.0944799999999995e-07, + "loss": 0.3902, + "step": 147700 + }, + { + "epoch": 1.4775, + "grad_norm": 118.43605041503906, + "learning_rate": 4.09248e-07, + "loss": 0.4709, + "step": 147750 + }, + { + "epoch": 1.478, + "grad_norm": 42.166351318359375, + "learning_rate": 4.09048e-07, + "loss": 0.2732, + "step": 147800 + }, + { + "epoch": 1.4785, + "grad_norm": 12.510908126831055, + "learning_rate": 4.0884799999999996e-07, + "loss": 0.3895, + "step": 147850 + }, + { + "epoch": 1.479, + "grad_norm": 91.17105102539062, + "learning_rate": 4.08648e-07, + "loss": 0.4198, + "step": 147900 + }, + { + "epoch": 1.4795, + "grad_norm": 69.4258041381836, + "learning_rate": 4.08448e-07, + "loss": 0.4962, + "step": 147950 + }, + { + "epoch": 1.48, + "grad_norm": 45.617069244384766, + "learning_rate": 4.0824799999999996e-07, + "loss": 0.3913, + "step": 148000 + }, + { + "epoch": 1.4805, + "grad_norm": 88.43484497070312, + "learning_rate": 4.08048e-07, + "loss": 0.3713, + "step": 148050 + }, + { + "epoch": 1.4809999999999999, + "grad_norm": 28.707489013671875, + "learning_rate": 4.07848e-07, + "loss": 0.377, + "step": 148100 + }, + { + "epoch": 1.4815, + "grad_norm": 29.1148681640625, + "learning_rate": 4.0764799999999997e-07, + "loss": 0.3606, + "step": 148150 + }, + { + "epoch": 1.482, + "grad_norm": 11.052894592285156, + "learning_rate": 4.0744799999999996e-07, + "loss": 0.4278, + "step": 148200 + }, + { + "epoch": 1.4825, + "grad_norm": 94.74927520751953, + "learning_rate": 4.07248e-07, + "loss": 0.3981, + "step": 148250 + }, + { + "epoch": 1.483, + "grad_norm": 28.74058723449707, + "learning_rate": 4.07048e-07, + "loss": 0.5204, + "step": 148300 + }, + { + "epoch": 1.4835, + "grad_norm": 44.29704284667969, + "learning_rate": 4.0684799999999996e-07, + "loss": 0.473, + "step": 148350 + }, + { + "epoch": 1.484, + "grad_norm": 3.45829439163208, + "learning_rate": 4.06648e-07, + "loss": 0.423, + "step": 148400 + }, + { + "epoch": 1.4845, + "grad_norm": 112.15509796142578, + "learning_rate": 4.0644799999999993e-07, + "loss": 0.4297, + "step": 148450 + }, + { + "epoch": 1.4849999999999999, + "grad_norm": 37.767704010009766, + "learning_rate": 4.0624799999999997e-07, + "loss": 0.407, + "step": 148500 + }, + { + "epoch": 1.4855, + "grad_norm": 114.10420227050781, + "learning_rate": 4.06048e-07, + "loss": 0.49, + "step": 148550 + }, + { + "epoch": 1.486, + "grad_norm": 81.723388671875, + "learning_rate": 4.0584799999999994e-07, + "loss": 0.3653, + "step": 148600 + }, + { + "epoch": 1.4865, + "grad_norm": 2.658217191696167, + "learning_rate": 4.05652e-07, + "loss": 0.4765, + "step": 148650 + }, + { + "epoch": 1.487, + "grad_norm": 0.6626757979393005, + "learning_rate": 4.0545199999999997e-07, + "loss": 0.4804, + "step": 148700 + }, + { + "epoch": 1.4875, + "grad_norm": 71.02579498291016, + "learning_rate": 4.05252e-07, + "loss": 0.4608, + "step": 148750 + }, + { + "epoch": 1.488, + "grad_norm": 7.002717018127441, + "learning_rate": 4.05052e-07, + "loss": 0.4567, + "step": 148800 + }, + { + "epoch": 1.4885, + "grad_norm": 159.69973754882812, + "learning_rate": 4.04852e-07, + "loss": 0.4012, + "step": 148850 + }, + { + "epoch": 1.4889999999999999, + "grad_norm": 33.81310272216797, + "learning_rate": 4.04652e-07, + "loss": 0.4608, + "step": 148900 + }, + { + "epoch": 1.4895, + "grad_norm": 79.68433380126953, + "learning_rate": 4.0445199999999995e-07, + "loss": 0.4923, + "step": 148950 + }, + { + "epoch": 1.49, + "grad_norm": 3.4748802185058594, + "learning_rate": 4.04252e-07, + "loss": 0.39, + "step": 149000 + }, + { + "epoch": 1.4905, + "grad_norm": 53.5771598815918, + "learning_rate": 4.04052e-07, + "loss": 0.5652, + "step": 149050 + }, + { + "epoch": 1.491, + "grad_norm": 46.88816452026367, + "learning_rate": 4.0385199999999996e-07, + "loss": 0.4258, + "step": 149100 + }, + { + "epoch": 1.4915, + "grad_norm": 32.95225524902344, + "learning_rate": 4.03652e-07, + "loss": 0.4477, + "step": 149150 + }, + { + "epoch": 1.492, + "grad_norm": 68.19593048095703, + "learning_rate": 4.03452e-07, + "loss": 0.4861, + "step": 149200 + }, + { + "epoch": 1.4925, + "grad_norm": 78.71890258789062, + "learning_rate": 4.0325199999999996e-07, + "loss": 0.5034, + "step": 149250 + }, + { + "epoch": 1.4929999999999999, + "grad_norm": 4.197335720062256, + "learning_rate": 4.03052e-07, + "loss": 0.5138, + "step": 149300 + }, + { + "epoch": 1.4935, + "grad_norm": 6.181484222412109, + "learning_rate": 4.02852e-07, + "loss": 0.4473, + "step": 149350 + }, + { + "epoch": 1.494, + "grad_norm": 5.397106170654297, + "learning_rate": 4.0265199999999997e-07, + "loss": 0.4057, + "step": 149400 + }, + { + "epoch": 1.4945, + "grad_norm": 6.172823905944824, + "learning_rate": 4.0245199999999996e-07, + "loss": 0.4547, + "step": 149450 + }, + { + "epoch": 1.495, + "grad_norm": 30.402366638183594, + "learning_rate": 4.02252e-07, + "loss": 0.4936, + "step": 149500 + }, + { + "epoch": 1.4955, + "grad_norm": 90.25249481201172, + "learning_rate": 4.02052e-07, + "loss": 0.4147, + "step": 149550 + }, + { + "epoch": 1.496, + "grad_norm": 5.896697521209717, + "learning_rate": 4.0185199999999996e-07, + "loss": 0.4432, + "step": 149600 + }, + { + "epoch": 1.4965, + "grad_norm": 66.12384796142578, + "learning_rate": 4.01652e-07, + "loss": 0.4179, + "step": 149650 + }, + { + "epoch": 1.4969999999999999, + "grad_norm": 26.684412002563477, + "learning_rate": 4.0145199999999993e-07, + "loss": 0.5063, + "step": 149700 + }, + { + "epoch": 1.4975, + "grad_norm": 0.9913628697395325, + "learning_rate": 4.0125199999999997e-07, + "loss": 0.3879, + "step": 149750 + }, + { + "epoch": 1.498, + "grad_norm": 98.07878875732422, + "learning_rate": 4.01052e-07, + "loss": 0.4664, + "step": 149800 + }, + { + "epoch": 1.4985, + "grad_norm": 78.66146850585938, + "learning_rate": 4.0085199999999994e-07, + "loss": 0.5213, + "step": 149850 + }, + { + "epoch": 1.499, + "grad_norm": 18.173500061035156, + "learning_rate": 4.00652e-07, + "loss": 0.3256, + "step": 149900 + }, + { + "epoch": 1.4995, + "grad_norm": 0.6916254758834839, + "learning_rate": 4.0045199999999997e-07, + "loss": 0.4523, + "step": 149950 + }, + { + "epoch": 1.5, + "grad_norm": 6.336428642272949, + "learning_rate": 4.00252e-07, + "loss": 0.3649, + "step": 150000 + }, + { + "epoch": 1.5005, + "grad_norm": 72.81371307373047, + "learning_rate": 4.00052e-07, + "loss": 0.4134, + "step": 150050 + }, + { + "epoch": 1.501, + "grad_norm": 53.876487731933594, + "learning_rate": 3.9985199999999997e-07, + "loss": 0.4047, + "step": 150100 + }, + { + "epoch": 1.5015, + "grad_norm": 146.40159606933594, + "learning_rate": 3.99652e-07, + "loss": 0.4675, + "step": 150150 + }, + { + "epoch": 1.502, + "grad_norm": 65.77076721191406, + "learning_rate": 3.9945199999999994e-07, + "loss": 0.5416, + "step": 150200 + }, + { + "epoch": 1.5025, + "grad_norm": 41.62353515625, + "learning_rate": 3.99252e-07, + "loss": 0.5011, + "step": 150250 + }, + { + "epoch": 1.5030000000000001, + "grad_norm": 75.19416809082031, + "learning_rate": 3.99052e-07, + "loss": 0.3859, + "step": 150300 + }, + { + "epoch": 1.5034999999999998, + "grad_norm": 26.303030014038086, + "learning_rate": 3.9885199999999995e-07, + "loss": 0.4655, + "step": 150350 + }, + { + "epoch": 1.504, + "grad_norm": 97.27670288085938, + "learning_rate": 3.98652e-07, + "loss": 0.4718, + "step": 150400 + }, + { + "epoch": 1.5045, + "grad_norm": 120.45020294189453, + "learning_rate": 3.98456e-07, + "loss": 0.402, + "step": 150450 + }, + { + "epoch": 1.505, + "grad_norm": 57.528079986572266, + "learning_rate": 3.9825599999999996e-07, + "loss": 0.4117, + "step": 150500 + }, + { + "epoch": 1.5055, + "grad_norm": 110.59357452392578, + "learning_rate": 3.98056e-07, + "loss": 0.4865, + "step": 150550 + }, + { + "epoch": 1.506, + "grad_norm": 20.42061996459961, + "learning_rate": 3.97856e-07, + "loss": 0.5312, + "step": 150600 + }, + { + "epoch": 1.5065, + "grad_norm": 19.152402877807617, + "learning_rate": 3.9765599999999997e-07, + "loss": 0.5577, + "step": 150650 + }, + { + "epoch": 1.5070000000000001, + "grad_norm": 144.61178588867188, + "learning_rate": 3.9745599999999996e-07, + "loss": 0.4414, + "step": 150700 + }, + { + "epoch": 1.5074999999999998, + "grad_norm": 29.284423828125, + "learning_rate": 3.97256e-07, + "loss": 0.4616, + "step": 150750 + }, + { + "epoch": 1.508, + "grad_norm": 5.666861057281494, + "learning_rate": 3.97056e-07, + "loss": 0.5011, + "step": 150800 + }, + { + "epoch": 1.5085, + "grad_norm": 68.09160614013672, + "learning_rate": 3.9685599999999996e-07, + "loss": 0.5303, + "step": 150850 + }, + { + "epoch": 1.509, + "grad_norm": 17.154300689697266, + "learning_rate": 3.96656e-07, + "loss": 0.5095, + "step": 150900 + }, + { + "epoch": 1.5095, + "grad_norm": 76.5134048461914, + "learning_rate": 3.9645599999999993e-07, + "loss": 0.446, + "step": 150950 + }, + { + "epoch": 1.51, + "grad_norm": 40.559085845947266, + "learning_rate": 3.9625599999999997e-07, + "loss": 0.5092, + "step": 151000 + }, + { + "epoch": 1.5105, + "grad_norm": 60.698848724365234, + "learning_rate": 3.96056e-07, + "loss": 0.5773, + "step": 151050 + }, + { + "epoch": 1.5110000000000001, + "grad_norm": 122.44525909423828, + "learning_rate": 3.9585599999999994e-07, + "loss": 0.4852, + "step": 151100 + }, + { + "epoch": 1.5114999999999998, + "grad_norm": 31.7929744720459, + "learning_rate": 3.95656e-07, + "loss": 0.3784, + "step": 151150 + }, + { + "epoch": 1.512, + "grad_norm": 73.54200744628906, + "learning_rate": 3.9545599999999996e-07, + "loss": 0.3659, + "step": 151200 + }, + { + "epoch": 1.5125, + "grad_norm": 45.57496643066406, + "learning_rate": 3.95256e-07, + "loss": 0.4095, + "step": 151250 + }, + { + "epoch": 1.513, + "grad_norm": 35.19536590576172, + "learning_rate": 3.95056e-07, + "loss": 0.3262, + "step": 151300 + }, + { + "epoch": 1.5135, + "grad_norm": 29.406625747680664, + "learning_rate": 3.9485599999999997e-07, + "loss": 0.2914, + "step": 151350 + }, + { + "epoch": 1.514, + "grad_norm": 11.98690128326416, + "learning_rate": 3.94656e-07, + "loss": 0.5087, + "step": 151400 + }, + { + "epoch": 1.5145, + "grad_norm": 404.19842529296875, + "learning_rate": 3.9445599999999994e-07, + "loss": 0.5389, + "step": 151450 + }, + { + "epoch": 1.5150000000000001, + "grad_norm": 71.88636016845703, + "learning_rate": 3.94256e-07, + "loss": 0.5309, + "step": 151500 + }, + { + "epoch": 1.5154999999999998, + "grad_norm": 130.74053955078125, + "learning_rate": 3.94056e-07, + "loss": 0.4026, + "step": 151550 + }, + { + "epoch": 1.516, + "grad_norm": 43.46409606933594, + "learning_rate": 3.9385599999999995e-07, + "loss": 0.5388, + "step": 151600 + }, + { + "epoch": 1.5165, + "grad_norm": 7.446013927459717, + "learning_rate": 3.93656e-07, + "loss": 0.5019, + "step": 151650 + }, + { + "epoch": 1.517, + "grad_norm": 99.18240356445312, + "learning_rate": 3.9345599999999997e-07, + "loss": 0.5287, + "step": 151700 + }, + { + "epoch": 1.5175, + "grad_norm": 16.316425323486328, + "learning_rate": 3.9325599999999996e-07, + "loss": 0.409, + "step": 151750 + }, + { + "epoch": 1.518, + "grad_norm": 3.0078415870666504, + "learning_rate": 3.93056e-07, + "loss": 0.4389, + "step": 151800 + }, + { + "epoch": 1.5185, + "grad_norm": 57.24897384643555, + "learning_rate": 3.92856e-07, + "loss": 0.4834, + "step": 151850 + }, + { + "epoch": 1.5190000000000001, + "grad_norm": 9.619451522827148, + "learning_rate": 3.9265599999999997e-07, + "loss": 0.4941, + "step": 151900 + }, + { + "epoch": 1.5194999999999999, + "grad_norm": 66.24259185791016, + "learning_rate": 3.92456e-07, + "loss": 0.4241, + "step": 151950 + }, + { + "epoch": 1.52, + "grad_norm": 64.54350280761719, + "learning_rate": 3.92256e-07, + "loss": 0.4539, + "step": 152000 + }, + { + "epoch": 1.5205, + "grad_norm": 3.8642959594726562, + "learning_rate": 3.92056e-07, + "loss": 0.4306, + "step": 152050 + }, + { + "epoch": 1.521, + "grad_norm": 88.06119537353516, + "learning_rate": 3.9185599999999996e-07, + "loss": 0.4, + "step": 152100 + }, + { + "epoch": 1.5215, + "grad_norm": 0.5030847191810608, + "learning_rate": 3.91656e-07, + "loss": 0.4478, + "step": 152150 + }, + { + "epoch": 1.522, + "grad_norm": 74.51904296875, + "learning_rate": 3.9145600000000003e-07, + "loss": 0.4082, + "step": 152200 + }, + { + "epoch": 1.5225, + "grad_norm": 7.973733425140381, + "learning_rate": 3.9125599999999997e-07, + "loss": 0.4605, + "step": 152250 + }, + { + "epoch": 1.5230000000000001, + "grad_norm": 52.102630615234375, + "learning_rate": 3.91056e-07, + "loss": 0.4287, + "step": 152300 + }, + { + "epoch": 1.5234999999999999, + "grad_norm": 10.355042457580566, + "learning_rate": 3.90856e-07, + "loss": 0.532, + "step": 152350 + }, + { + "epoch": 1.524, + "grad_norm": 13.536386489868164, + "learning_rate": 3.90656e-07, + "loss": 0.3487, + "step": 152400 + }, + { + "epoch": 1.5245, + "grad_norm": 17.59664535522461, + "learning_rate": 3.90456e-07, + "loss": 0.5859, + "step": 152450 + }, + { + "epoch": 1.525, + "grad_norm": 37.894744873046875, + "learning_rate": 3.90256e-07, + "loss": 0.3717, + "step": 152500 + }, + { + "epoch": 1.5255, + "grad_norm": 87.29824829101562, + "learning_rate": 3.90056e-07, + "loss": 0.3377, + "step": 152550 + }, + { + "epoch": 1.526, + "grad_norm": 150.49517822265625, + "learning_rate": 3.8985599999999997e-07, + "loss": 0.4282, + "step": 152600 + }, + { + "epoch": 1.5265, + "grad_norm": 22.18712615966797, + "learning_rate": 3.89656e-07, + "loss": 0.3628, + "step": 152650 + }, + { + "epoch": 1.5270000000000001, + "grad_norm": 7.895515441894531, + "learning_rate": 3.8945999999999994e-07, + "loss": 0.4914, + "step": 152700 + }, + { + "epoch": 1.5274999999999999, + "grad_norm": 98.30242919921875, + "learning_rate": 3.8926e-07, + "loss": 0.4066, + "step": 152750 + }, + { + "epoch": 1.528, + "grad_norm": 71.57797241210938, + "learning_rate": 3.8906e-07, + "loss": 0.4606, + "step": 152800 + }, + { + "epoch": 1.5285, + "grad_norm": 52.828861236572266, + "learning_rate": 3.8885999999999995e-07, + "loss": 0.4835, + "step": 152850 + }, + { + "epoch": 1.529, + "grad_norm": 1.3210022449493408, + "learning_rate": 3.8866e-07, + "loss": 0.6443, + "step": 152900 + }, + { + "epoch": 1.5295, + "grad_norm": 0.20396688580513, + "learning_rate": 3.8846e-07, + "loss": 0.3461, + "step": 152950 + }, + { + "epoch": 1.53, + "grad_norm": 5.065512180328369, + "learning_rate": 3.8825999999999996e-07, + "loss": 0.5639, + "step": 153000 + }, + { + "epoch": 1.5305, + "grad_norm": 0.27754420042037964, + "learning_rate": 3.8806e-07, + "loss": 0.357, + "step": 153050 + }, + { + "epoch": 1.5310000000000001, + "grad_norm": 105.62935638427734, + "learning_rate": 3.8786e-07, + "loss": 0.51, + "step": 153100 + }, + { + "epoch": 1.5314999999999999, + "grad_norm": 78.91966247558594, + "learning_rate": 3.8765999999999997e-07, + "loss": 0.3173, + "step": 153150 + }, + { + "epoch": 1.532, + "grad_norm": 22.052091598510742, + "learning_rate": 3.8746e-07, + "loss": 0.4488, + "step": 153200 + }, + { + "epoch": 1.5325, + "grad_norm": 105.67029571533203, + "learning_rate": 3.8726e-07, + "loss": 0.3544, + "step": 153250 + }, + { + "epoch": 1.533, + "grad_norm": 68.83394622802734, + "learning_rate": 3.8705999999999997e-07, + "loss": 0.4906, + "step": 153300 + }, + { + "epoch": 1.5335, + "grad_norm": 35.794700622558594, + "learning_rate": 3.8685999999999996e-07, + "loss": 0.3602, + "step": 153350 + }, + { + "epoch": 1.534, + "grad_norm": 8.180852890014648, + "learning_rate": 3.8666e-07, + "loss": 0.4035, + "step": 153400 + }, + { + "epoch": 1.5345, + "grad_norm": 18.921524047851562, + "learning_rate": 3.8646000000000003e-07, + "loss": 0.4177, + "step": 153450 + }, + { + "epoch": 1.5350000000000001, + "grad_norm": 28.274133682250977, + "learning_rate": 3.8625999999999997e-07, + "loss": 0.3924, + "step": 153500 + }, + { + "epoch": 1.5354999999999999, + "grad_norm": 60.59059524536133, + "learning_rate": 3.8606e-07, + "loss": 0.641, + "step": 153550 + }, + { + "epoch": 1.536, + "grad_norm": 13.755937576293945, + "learning_rate": 3.8586e-07, + "loss": 0.3756, + "step": 153600 + }, + { + "epoch": 1.5365, + "grad_norm": 28.67469596862793, + "learning_rate": 3.8566e-07, + "loss": 0.5329, + "step": 153650 + }, + { + "epoch": 1.537, + "grad_norm": 67.26752471923828, + "learning_rate": 3.8546e-07, + "loss": 0.4442, + "step": 153700 + }, + { + "epoch": 1.5375, + "grad_norm": 76.13204956054688, + "learning_rate": 3.8526e-07, + "loss": 0.3637, + "step": 153750 + }, + { + "epoch": 1.538, + "grad_norm": 108.6531753540039, + "learning_rate": 3.8506e-07, + "loss": 0.4456, + "step": 153800 + }, + { + "epoch": 1.5385, + "grad_norm": 82.92815399169922, + "learning_rate": 3.8485999999999997e-07, + "loss": 0.3671, + "step": 153850 + }, + { + "epoch": 1.5390000000000001, + "grad_norm": 3.92008376121521, + "learning_rate": 3.8466e-07, + "loss": 0.354, + "step": 153900 + }, + { + "epoch": 1.5394999999999999, + "grad_norm": 46.99858093261719, + "learning_rate": 3.8446e-07, + "loss": 0.5856, + "step": 153950 + }, + { + "epoch": 1.54, + "grad_norm": 16.050018310546875, + "learning_rate": 3.8426e-07, + "loss": 0.5328, + "step": 154000 + }, + { + "epoch": 1.5405, + "grad_norm": 59.63269805908203, + "learning_rate": 3.8406e-07, + "loss": 0.5117, + "step": 154050 + }, + { + "epoch": 1.541, + "grad_norm": 12.401506423950195, + "learning_rate": 3.8385999999999994e-07, + "loss": 0.4722, + "step": 154100 + }, + { + "epoch": 1.5415, + "grad_norm": 104.35944366455078, + "learning_rate": 3.8366e-07, + "loss": 0.5007, + "step": 154150 + }, + { + "epoch": 1.542, + "grad_norm": 32.52841567993164, + "learning_rate": 3.8346e-07, + "loss": 0.4704, + "step": 154200 + }, + { + "epoch": 1.5425, + "grad_norm": 101.7193374633789, + "learning_rate": 3.8325999999999995e-07, + "loss": 0.4382, + "step": 154250 + }, + { + "epoch": 1.5430000000000001, + "grad_norm": 89.42278289794922, + "learning_rate": 3.8306e-07, + "loss": 0.5037, + "step": 154300 + }, + { + "epoch": 1.5434999999999999, + "grad_norm": 35.02275466918945, + "learning_rate": 3.8286e-07, + "loss": 0.4409, + "step": 154350 + }, + { + "epoch": 1.544, + "grad_norm": 78.44393157958984, + "learning_rate": 3.8265999999999996e-07, + "loss": 0.4195, + "step": 154400 + }, + { + "epoch": 1.5445, + "grad_norm": 14.024569511413574, + "learning_rate": 3.8246e-07, + "loss": 0.3931, + "step": 154450 + }, + { + "epoch": 1.545, + "grad_norm": 29.970157623291016, + "learning_rate": 3.8226e-07, + "loss": 0.4982, + "step": 154500 + }, + { + "epoch": 1.5455, + "grad_norm": 92.17186737060547, + "learning_rate": 3.8205999999999997e-07, + "loss": 0.489, + "step": 154550 + }, + { + "epoch": 1.546, + "grad_norm": 103.02266693115234, + "learning_rate": 3.8185999999999995e-07, + "loss": 0.4268, + "step": 154600 + }, + { + "epoch": 1.5465, + "grad_norm": 2.3112690448760986, + "learning_rate": 3.8166e-07, + "loss": 0.4287, + "step": 154650 + }, + { + "epoch": 1.5470000000000002, + "grad_norm": 71.17611694335938, + "learning_rate": 3.8146000000000003e-07, + "loss": 0.356, + "step": 154700 + }, + { + "epoch": 1.5474999999999999, + "grad_norm": 21.637741088867188, + "learning_rate": 3.8125999999999996e-07, + "loss": 0.4844, + "step": 154750 + }, + { + "epoch": 1.548, + "grad_norm": 26.230649948120117, + "learning_rate": 3.8106e-07, + "loss": 0.5267, + "step": 154800 + }, + { + "epoch": 1.5485, + "grad_norm": 91.8410415649414, + "learning_rate": 3.8086e-07, + "loss": 0.4135, + "step": 154850 + }, + { + "epoch": 1.549, + "grad_norm": 12.015192985534668, + "learning_rate": 3.8065999999999997e-07, + "loss": 0.4938, + "step": 154900 + }, + { + "epoch": 1.5495, + "grad_norm": 5.9891533851623535, + "learning_rate": 3.8046e-07, + "loss": 0.3827, + "step": 154950 + }, + { + "epoch": 1.55, + "grad_norm": 0.673921525478363, + "learning_rate": 3.8026e-07, + "loss": 0.4642, + "step": 155000 + }, + { + "epoch": 1.5505, + "grad_norm": 48.262916564941406, + "learning_rate": 3.8006e-07, + "loss": 0.2987, + "step": 155050 + }, + { + "epoch": 1.5510000000000002, + "grad_norm": 15.329411506652832, + "learning_rate": 3.7985999999999996e-07, + "loss": 0.4899, + "step": 155100 + }, + { + "epoch": 1.5514999999999999, + "grad_norm": 54.15485763549805, + "learning_rate": 3.7966e-07, + "loss": 0.3895, + "step": 155150 + }, + { + "epoch": 1.552, + "grad_norm": 76.88223266601562, + "learning_rate": 3.7946e-07, + "loss": 0.3881, + "step": 155200 + }, + { + "epoch": 1.5525, + "grad_norm": 19.991403579711914, + "learning_rate": 3.7925999999999997e-07, + "loss": 0.398, + "step": 155250 + }, + { + "epoch": 1.553, + "grad_norm": 94.80240631103516, + "learning_rate": 3.7906e-07, + "loss": 0.4081, + "step": 155300 + }, + { + "epoch": 1.5535, + "grad_norm": 122.23736572265625, + "learning_rate": 3.7885999999999994e-07, + "loss": 0.3894, + "step": 155350 + }, + { + "epoch": 1.554, + "grad_norm": 57.91024398803711, + "learning_rate": 3.7866e-07, + "loss": 0.4474, + "step": 155400 + }, + { + "epoch": 1.5545, + "grad_norm": 8.4732666015625, + "learning_rate": 3.7846e-07, + "loss": 0.4155, + "step": 155450 + }, + { + "epoch": 1.5550000000000002, + "grad_norm": 12.340983390808105, + "learning_rate": 3.7825999999999995e-07, + "loss": 0.4288, + "step": 155500 + }, + { + "epoch": 1.5554999999999999, + "grad_norm": 13.673118591308594, + "learning_rate": 3.7806e-07, + "loss": 0.5154, + "step": 155550 + }, + { + "epoch": 1.556, + "grad_norm": 112.43163299560547, + "learning_rate": 3.7785999999999997e-07, + "loss": 0.3473, + "step": 155600 + }, + { + "epoch": 1.5565, + "grad_norm": 10.982392311096191, + "learning_rate": 3.7765999999999996e-07, + "loss": 0.3726, + "step": 155650 + }, + { + "epoch": 1.557, + "grad_norm": 105.76234436035156, + "learning_rate": 3.7746e-07, + "loss": 0.3479, + "step": 155700 + }, + { + "epoch": 1.5575, + "grad_norm": 76.24043273925781, + "learning_rate": 3.7726e-07, + "loss": 0.4958, + "step": 155750 + }, + { + "epoch": 1.558, + "grad_norm": 6.342814922332764, + "learning_rate": 3.7705999999999996e-07, + "loss": 0.3893, + "step": 155800 + }, + { + "epoch": 1.5585, + "grad_norm": 74.10182189941406, + "learning_rate": 3.7685999999999995e-07, + "loss": 0.4655, + "step": 155850 + }, + { + "epoch": 1.5590000000000002, + "grad_norm": 0.016593320295214653, + "learning_rate": 3.7666e-07, + "loss": 0.4724, + "step": 155900 + }, + { + "epoch": 1.5594999999999999, + "grad_norm": 3.463813304901123, + "learning_rate": 3.7646e-07, + "loss": 0.3882, + "step": 155950 + }, + { + "epoch": 1.56, + "grad_norm": 28.10302734375, + "learning_rate": 3.7625999999999996e-07, + "loss": 0.4819, + "step": 156000 + }, + { + "epoch": 1.5605, + "grad_norm": 90.9014892578125, + "learning_rate": 3.7606e-07, + "loss": 0.5103, + "step": 156050 + }, + { + "epoch": 1.561, + "grad_norm": 5.384343147277832, + "learning_rate": 3.7586000000000003e-07, + "loss": 0.3814, + "step": 156100 + }, + { + "epoch": 1.5615, + "grad_norm": 63.378196716308594, + "learning_rate": 3.7565999999999996e-07, + "loss": 0.3121, + "step": 156150 + }, + { + "epoch": 1.562, + "grad_norm": 67.78466796875, + "learning_rate": 3.7546e-07, + "loss": 0.3317, + "step": 156200 + }, + { + "epoch": 1.5625, + "grad_norm": 38.204132080078125, + "learning_rate": 3.7526e-07, + "loss": 0.6073, + "step": 156250 + }, + { + "epoch": 1.563, + "grad_norm": 29.50925636291504, + "learning_rate": 3.7505999999999997e-07, + "loss": 0.3818, + "step": 156300 + }, + { + "epoch": 1.5635, + "grad_norm": 55.05265426635742, + "learning_rate": 3.7486e-07, + "loss": 0.4596, + "step": 156350 + }, + { + "epoch": 1.564, + "grad_norm": 124.0997085571289, + "learning_rate": 3.7466e-07, + "loss": 0.4736, + "step": 156400 + }, + { + "epoch": 1.5645, + "grad_norm": 38.588619232177734, + "learning_rate": 3.7446e-07, + "loss": 0.4511, + "step": 156450 + }, + { + "epoch": 1.565, + "grad_norm": 80.09603118896484, + "learning_rate": 3.7425999999999996e-07, + "loss": 0.4403, + "step": 156500 + }, + { + "epoch": 1.5655000000000001, + "grad_norm": 1.9069684743881226, + "learning_rate": 3.7406e-07, + "loss": 0.5459, + "step": 156550 + }, + { + "epoch": 1.5659999999999998, + "grad_norm": 1.7578861713409424, + "learning_rate": 3.7386e-07, + "loss": 0.5356, + "step": 156600 + }, + { + "epoch": 1.5665, + "grad_norm": 13.54135513305664, + "learning_rate": 3.7365999999999997e-07, + "loss": 0.4722, + "step": 156650 + }, + { + "epoch": 1.567, + "grad_norm": 42.47798156738281, + "learning_rate": 3.7346e-07, + "loss": 0.4462, + "step": 156700 + }, + { + "epoch": 1.5675, + "grad_norm": 32.1161003112793, + "learning_rate": 3.7326399999999995e-07, + "loss": 0.4484, + "step": 156750 + }, + { + "epoch": 1.568, + "grad_norm": 44.59100341796875, + "learning_rate": 3.73064e-07, + "loss": 0.3855, + "step": 156800 + }, + { + "epoch": 1.5685, + "grad_norm": 15.064719200134277, + "learning_rate": 3.7286399999999997e-07, + "loss": 0.4979, + "step": 156850 + }, + { + "epoch": 1.569, + "grad_norm": 82.406982421875, + "learning_rate": 3.7266399999999995e-07, + "loss": 0.3934, + "step": 156900 + }, + { + "epoch": 1.5695000000000001, + "grad_norm": 1.5106115341186523, + "learning_rate": 3.72464e-07, + "loss": 0.4888, + "step": 156950 + }, + { + "epoch": 1.5699999999999998, + "grad_norm": 34.84396743774414, + "learning_rate": 3.72264e-07, + "loss": 0.3313, + "step": 157000 + }, + { + "epoch": 1.5705, + "grad_norm": 56.88155746459961, + "learning_rate": 3.7206399999999996e-07, + "loss": 0.6379, + "step": 157050 + }, + { + "epoch": 1.571, + "grad_norm": 47.09871292114258, + "learning_rate": 3.7186399999999995e-07, + "loss": 0.4519, + "step": 157100 + }, + { + "epoch": 1.5715, + "grad_norm": 44.833335876464844, + "learning_rate": 3.71664e-07, + "loss": 0.3936, + "step": 157150 + }, + { + "epoch": 1.572, + "grad_norm": 85.64391326904297, + "learning_rate": 3.71464e-07, + "loss": 0.3674, + "step": 157200 + }, + { + "epoch": 1.5725, + "grad_norm": 69.18328857421875, + "learning_rate": 3.7126399999999996e-07, + "loss": 0.5126, + "step": 157250 + }, + { + "epoch": 1.573, + "grad_norm": 98.84770202636719, + "learning_rate": 3.71064e-07, + "loss": 0.4173, + "step": 157300 + }, + { + "epoch": 1.5735000000000001, + "grad_norm": 22.579782485961914, + "learning_rate": 3.7086400000000003e-07, + "loss": 0.4732, + "step": 157350 + }, + { + "epoch": 1.5739999999999998, + "grad_norm": 27.351652145385742, + "learning_rate": 3.7066399999999996e-07, + "loss": 0.5637, + "step": 157400 + }, + { + "epoch": 1.5745, + "grad_norm": 66.01678466796875, + "learning_rate": 3.70464e-07, + "loss": 0.3713, + "step": 157450 + }, + { + "epoch": 1.575, + "grad_norm": 4.506946563720703, + "learning_rate": 3.70264e-07, + "loss": 0.2717, + "step": 157500 + }, + { + "epoch": 1.5755, + "grad_norm": 61.51058578491211, + "learning_rate": 3.7006399999999997e-07, + "loss": 0.4417, + "step": 157550 + }, + { + "epoch": 1.576, + "grad_norm": 3.467181921005249, + "learning_rate": 3.69864e-07, + "loss": 0.4313, + "step": 157600 + }, + { + "epoch": 1.5765, + "grad_norm": 14.573429107666016, + "learning_rate": 3.69664e-07, + "loss": 0.4639, + "step": 157650 + }, + { + "epoch": 1.577, + "grad_norm": 54.943565368652344, + "learning_rate": 3.69464e-07, + "loss": 0.428, + "step": 157700 + }, + { + "epoch": 1.5775000000000001, + "grad_norm": 8.155677795410156, + "learning_rate": 3.6926799999999997e-07, + "loss": 0.4534, + "step": 157750 + }, + { + "epoch": 1.5779999999999998, + "grad_norm": 67.54132080078125, + "learning_rate": 3.69068e-07, + "loss": 0.4839, + "step": 157800 + }, + { + "epoch": 1.5785, + "grad_norm": 10.213589668273926, + "learning_rate": 3.6886799999999994e-07, + "loss": 0.3556, + "step": 157850 + }, + { + "epoch": 1.579, + "grad_norm": 109.822021484375, + "learning_rate": 3.68668e-07, + "loss": 0.4593, + "step": 157900 + }, + { + "epoch": 1.5795, + "grad_norm": 82.01921081542969, + "learning_rate": 3.68468e-07, + "loss": 0.3996, + "step": 157950 + }, + { + "epoch": 1.58, + "grad_norm": 60.44963836669922, + "learning_rate": 3.6826799999999995e-07, + "loss": 0.5142, + "step": 158000 + }, + { + "epoch": 1.5805, + "grad_norm": 101.18560791015625, + "learning_rate": 3.68068e-07, + "loss": 0.39, + "step": 158050 + }, + { + "epoch": 1.581, + "grad_norm": 73.67401123046875, + "learning_rate": 3.6786799999999997e-07, + "loss": 0.4294, + "step": 158100 + }, + { + "epoch": 1.5815000000000001, + "grad_norm": 38.87802505493164, + "learning_rate": 3.6766799999999995e-07, + "loss": 0.4877, + "step": 158150 + }, + { + "epoch": 1.5819999999999999, + "grad_norm": 36.239723205566406, + "learning_rate": 3.67468e-07, + "loss": 0.3294, + "step": 158200 + }, + { + "epoch": 1.5825, + "grad_norm": 16.11030387878418, + "learning_rate": 3.67268e-07, + "loss": 0.5534, + "step": 158250 + }, + { + "epoch": 1.583, + "grad_norm": 43.37452697753906, + "learning_rate": 3.67068e-07, + "loss": 0.4388, + "step": 158300 + }, + { + "epoch": 1.5835, + "grad_norm": 15.790046691894531, + "learning_rate": 3.66868e-07, + "loss": 0.4595, + "step": 158350 + }, + { + "epoch": 1.584, + "grad_norm": 49.98896026611328, + "learning_rate": 3.66668e-07, + "loss": 0.598, + "step": 158400 + }, + { + "epoch": 1.5845, + "grad_norm": 151.6587371826172, + "learning_rate": 3.66468e-07, + "loss": 0.5676, + "step": 158450 + }, + { + "epoch": 1.585, + "grad_norm": 150.25958251953125, + "learning_rate": 3.6626799999999995e-07, + "loss": 0.4359, + "step": 158500 + }, + { + "epoch": 1.5855000000000001, + "grad_norm": 79.11650085449219, + "learning_rate": 3.66068e-07, + "loss": 0.4847, + "step": 158550 + }, + { + "epoch": 1.5859999999999999, + "grad_norm": 16.014951705932617, + "learning_rate": 3.6586800000000003e-07, + "loss": 0.4409, + "step": 158600 + }, + { + "epoch": 1.5865, + "grad_norm": 38.4268798828125, + "learning_rate": 3.6566799999999996e-07, + "loss": 0.4406, + "step": 158650 + }, + { + "epoch": 1.587, + "grad_norm": 75.6319808959961, + "learning_rate": 3.65468e-07, + "loss": 0.4073, + "step": 158700 + }, + { + "epoch": 1.5875, + "grad_norm": 83.09989166259766, + "learning_rate": 3.65268e-07, + "loss": 0.6108, + "step": 158750 + }, + { + "epoch": 1.588, + "grad_norm": 85.7934341430664, + "learning_rate": 3.6506799999999997e-07, + "loss": 0.4062, + "step": 158800 + }, + { + "epoch": 1.5885, + "grad_norm": 63.43593978881836, + "learning_rate": 3.64868e-07, + "loss": 0.4971, + "step": 158850 + }, + { + "epoch": 1.589, + "grad_norm": 0.3882288336753845, + "learning_rate": 3.64668e-07, + "loss": 0.4136, + "step": 158900 + }, + { + "epoch": 1.5895000000000001, + "grad_norm": 69.4052963256836, + "learning_rate": 3.64468e-07, + "loss": 0.5876, + "step": 158950 + }, + { + "epoch": 1.5899999999999999, + "grad_norm": 0.12768994271755219, + "learning_rate": 3.6426799999999996e-07, + "loss": 0.5236, + "step": 159000 + }, + { + "epoch": 1.5905, + "grad_norm": 123.2761459350586, + "learning_rate": 3.64068e-07, + "loss": 0.503, + "step": 159050 + }, + { + "epoch": 1.591, + "grad_norm": 0.08437743782997131, + "learning_rate": 3.63868e-07, + "loss": 0.5431, + "step": 159100 + }, + { + "epoch": 1.5915, + "grad_norm": 50.3804817199707, + "learning_rate": 3.6366799999999997e-07, + "loss": 0.3803, + "step": 159150 + }, + { + "epoch": 1.592, + "grad_norm": 66.29913330078125, + "learning_rate": 3.63468e-07, + "loss": 0.441, + "step": 159200 + }, + { + "epoch": 1.5925, + "grad_norm": 57.80009460449219, + "learning_rate": 3.6326799999999994e-07, + "loss": 0.5003, + "step": 159250 + }, + { + "epoch": 1.593, + "grad_norm": 38.19333267211914, + "learning_rate": 3.63068e-07, + "loss": 0.3942, + "step": 159300 + }, + { + "epoch": 1.5935000000000001, + "grad_norm": 108.0379638671875, + "learning_rate": 3.62868e-07, + "loss": 0.5143, + "step": 159350 + }, + { + "epoch": 1.5939999999999999, + "grad_norm": 37.051204681396484, + "learning_rate": 3.6266799999999995e-07, + "loss": 0.4615, + "step": 159400 + }, + { + "epoch": 1.5945, + "grad_norm": 32.642669677734375, + "learning_rate": 3.62468e-07, + "loss": 0.4396, + "step": 159450 + }, + { + "epoch": 1.595, + "grad_norm": 64.99166107177734, + "learning_rate": 3.6226799999999997e-07, + "loss": 0.4363, + "step": 159500 + }, + { + "epoch": 1.5955, + "grad_norm": 57.88900375366211, + "learning_rate": 3.62068e-07, + "loss": 0.4541, + "step": 159550 + }, + { + "epoch": 1.596, + "grad_norm": 0.2389134019613266, + "learning_rate": 3.61868e-07, + "loss": 0.5371, + "step": 159600 + }, + { + "epoch": 1.5965, + "grad_norm": 7.437679290771484, + "learning_rate": 3.61668e-07, + "loss": 0.4728, + "step": 159650 + }, + { + "epoch": 1.597, + "grad_norm": 78.19436645507812, + "learning_rate": 3.61468e-07, + "loss": 0.4397, + "step": 159700 + }, + { + "epoch": 1.5975000000000001, + "grad_norm": 110.06568908691406, + "learning_rate": 3.6127199999999995e-07, + "loss": 0.4691, + "step": 159750 + }, + { + "epoch": 1.5979999999999999, + "grad_norm": 67.42326354980469, + "learning_rate": 3.61072e-07, + "loss": 0.3647, + "step": 159800 + }, + { + "epoch": 1.5985, + "grad_norm": 82.26587677001953, + "learning_rate": 3.6087200000000003e-07, + "loss": 0.5464, + "step": 159850 + }, + { + "epoch": 1.599, + "grad_norm": 92.55001068115234, + "learning_rate": 3.6067199999999996e-07, + "loss": 0.3611, + "step": 159900 + }, + { + "epoch": 1.5995, + "grad_norm": 35.45196533203125, + "learning_rate": 3.60472e-07, + "loss": 0.3369, + "step": 159950 + }, + { + "epoch": 1.6, + "grad_norm": 38.73994445800781, + "learning_rate": 3.60272e-07, + "loss": 0.3748, + "step": 160000 + }, + { + "epoch": 1.6005, + "grad_norm": 2.0758419036865234, + "learning_rate": 3.6007199999999997e-07, + "loss": 0.4442, + "step": 160050 + }, + { + "epoch": 1.601, + "grad_norm": 62.947288513183594, + "learning_rate": 3.59872e-07, + "loss": 0.4279, + "step": 160100 + }, + { + "epoch": 1.6015000000000001, + "grad_norm": 0.5712276101112366, + "learning_rate": 3.59672e-07, + "loss": 0.4415, + "step": 160150 + }, + { + "epoch": 1.6019999999999999, + "grad_norm": 31.01521873474121, + "learning_rate": 3.59472e-07, + "loss": 0.4596, + "step": 160200 + }, + { + "epoch": 1.6025, + "grad_norm": 58.51720428466797, + "learning_rate": 3.5927199999999996e-07, + "loss": 0.4667, + "step": 160250 + }, + { + "epoch": 1.603, + "grad_norm": 23.069557189941406, + "learning_rate": 3.59072e-07, + "loss": 0.3335, + "step": 160300 + }, + { + "epoch": 1.6035, + "grad_norm": 93.3023910522461, + "learning_rate": 3.58872e-07, + "loss": 0.519, + "step": 160350 + }, + { + "epoch": 1.604, + "grad_norm": 5.520042896270752, + "learning_rate": 3.5867199999999997e-07, + "loss": 0.42, + "step": 160400 + }, + { + "epoch": 1.6045, + "grad_norm": 49.47138214111328, + "learning_rate": 3.58472e-07, + "loss": 0.5085, + "step": 160450 + }, + { + "epoch": 1.605, + "grad_norm": 93.62559509277344, + "learning_rate": 3.5827199999999994e-07, + "loss": 0.3626, + "step": 160500 + }, + { + "epoch": 1.6055000000000001, + "grad_norm": 28.985837936401367, + "learning_rate": 3.58072e-07, + "loss": 0.4725, + "step": 160550 + }, + { + "epoch": 1.6059999999999999, + "grad_norm": 0.5782918334007263, + "learning_rate": 3.57872e-07, + "loss": 0.344, + "step": 160600 + }, + { + "epoch": 1.6065, + "grad_norm": 62.849220275878906, + "learning_rate": 3.5767199999999995e-07, + "loss": 0.3763, + "step": 160650 + }, + { + "epoch": 1.607, + "grad_norm": 62.66322708129883, + "learning_rate": 3.57472e-07, + "loss": 0.4729, + "step": 160700 + }, + { + "epoch": 1.6075, + "grad_norm": 1.8875168561935425, + "learning_rate": 3.5727199999999997e-07, + "loss": 0.4565, + "step": 160750 + }, + { + "epoch": 1.608, + "grad_norm": 86.59226989746094, + "learning_rate": 3.57072e-07, + "loss": 0.3441, + "step": 160800 + }, + { + "epoch": 1.6085, + "grad_norm": 22.594993591308594, + "learning_rate": 3.56872e-07, + "loss": 0.5117, + "step": 160850 + }, + { + "epoch": 1.609, + "grad_norm": 31.01827049255371, + "learning_rate": 3.56672e-07, + "loss": 0.4471, + "step": 160900 + }, + { + "epoch": 1.6095000000000002, + "grad_norm": 85.61234283447266, + "learning_rate": 3.56472e-07, + "loss": 0.3757, + "step": 160950 + }, + { + "epoch": 1.6099999999999999, + "grad_norm": 87.01544189453125, + "learning_rate": 3.5627199999999995e-07, + "loss": 0.5461, + "step": 161000 + }, + { + "epoch": 1.6105, + "grad_norm": 4.279980659484863, + "learning_rate": 3.56072e-07, + "loss": 0.4856, + "step": 161050 + }, + { + "epoch": 1.611, + "grad_norm": 121.6936264038086, + "learning_rate": 3.55872e-07, + "loss": 0.5672, + "step": 161100 + }, + { + "epoch": 1.6115, + "grad_norm": 12.646116256713867, + "learning_rate": 3.5567199999999996e-07, + "loss": 0.4158, + "step": 161150 + }, + { + "epoch": 1.612, + "grad_norm": 166.2145233154297, + "learning_rate": 3.55472e-07, + "loss": 0.4199, + "step": 161200 + }, + { + "epoch": 1.6125, + "grad_norm": 37.84153366088867, + "learning_rate": 3.55272e-07, + "loss": 0.5097, + "step": 161250 + }, + { + "epoch": 1.613, + "grad_norm": 21.25974464416504, + "learning_rate": 3.5507199999999996e-07, + "loss": 0.4476, + "step": 161300 + }, + { + "epoch": 1.6135000000000002, + "grad_norm": 1.0741132497787476, + "learning_rate": 3.54872e-07, + "loss": 0.3204, + "step": 161350 + }, + { + "epoch": 1.6139999999999999, + "grad_norm": 76.07339477539062, + "learning_rate": 3.54672e-07, + "loss": 0.4677, + "step": 161400 + }, + { + "epoch": 1.6145, + "grad_norm": 37.11135482788086, + "learning_rate": 3.5447199999999997e-07, + "loss": 0.4673, + "step": 161450 + }, + { + "epoch": 1.615, + "grad_norm": 29.35950469970703, + "learning_rate": 3.5427199999999996e-07, + "loss": 0.4879, + "step": 161500 + }, + { + "epoch": 1.6155, + "grad_norm": 6.429986000061035, + "learning_rate": 3.54072e-07, + "loss": 0.4428, + "step": 161550 + }, + { + "epoch": 1.616, + "grad_norm": 4.6186933517456055, + "learning_rate": 3.53872e-07, + "loss": 0.3503, + "step": 161600 + }, + { + "epoch": 1.6165, + "grad_norm": 102.59295654296875, + "learning_rate": 3.5367199999999997e-07, + "loss": 0.3102, + "step": 161650 + }, + { + "epoch": 1.617, + "grad_norm": 0.6536028981208801, + "learning_rate": 3.53472e-07, + "loss": 0.4446, + "step": 161700 + }, + { + "epoch": 1.6175000000000002, + "grad_norm": 25.664196014404297, + "learning_rate": 3.5327199999999994e-07, + "loss": 0.3685, + "step": 161750 + }, + { + "epoch": 1.6179999999999999, + "grad_norm": 59.3905143737793, + "learning_rate": 3.53072e-07, + "loss": 0.5044, + "step": 161800 + }, + { + "epoch": 1.6185, + "grad_norm": 35.49124526977539, + "learning_rate": 3.52872e-07, + "loss": 0.3558, + "step": 161850 + }, + { + "epoch": 1.619, + "grad_norm": 36.11788558959961, + "learning_rate": 3.5267199999999994e-07, + "loss": 0.3282, + "step": 161900 + }, + { + "epoch": 1.6195, + "grad_norm": 36.98171615600586, + "learning_rate": 3.52472e-07, + "loss": 0.4198, + "step": 161950 + }, + { + "epoch": 1.62, + "grad_norm": 7.539791584014893, + "learning_rate": 3.5227199999999997e-07, + "loss": 0.4764, + "step": 162000 + }, + { + "epoch": 1.6205, + "grad_norm": 5.743781566619873, + "learning_rate": 3.52072e-07, + "loss": 0.3213, + "step": 162050 + }, + { + "epoch": 1.621, + "grad_norm": 1.3049453496932983, + "learning_rate": 3.51872e-07, + "loss": 0.2321, + "step": 162100 + }, + { + "epoch": 1.6215000000000002, + "grad_norm": 17.184179306030273, + "learning_rate": 3.51676e-07, + "loss": 0.4108, + "step": 162150 + }, + { + "epoch": 1.6219999999999999, + "grad_norm": 8.404489517211914, + "learning_rate": 3.51476e-07, + "loss": 0.4013, + "step": 162200 + }, + { + "epoch": 1.6225, + "grad_norm": 4.2238993644714355, + "learning_rate": 3.5127599999999995e-07, + "loss": 0.413, + "step": 162250 + }, + { + "epoch": 1.623, + "grad_norm": 53.541168212890625, + "learning_rate": 3.51076e-07, + "loss": 0.4472, + "step": 162300 + }, + { + "epoch": 1.6235, + "grad_norm": 40.160789489746094, + "learning_rate": 3.50876e-07, + "loss": 0.3242, + "step": 162350 + }, + { + "epoch": 1.624, + "grad_norm": 2.3333935737609863, + "learning_rate": 3.5067599999999996e-07, + "loss": 0.4453, + "step": 162400 + }, + { + "epoch": 1.6245, + "grad_norm": 28.548282623291016, + "learning_rate": 3.50476e-07, + "loss": 0.4291, + "step": 162450 + }, + { + "epoch": 1.625, + "grad_norm": 3.4190590381622314, + "learning_rate": 3.50276e-07, + "loss": 0.3052, + "step": 162500 + }, + { + "epoch": 1.6255, + "grad_norm": 50.14396286010742, + "learning_rate": 3.5007599999999996e-07, + "loss": 0.4178, + "step": 162550 + }, + { + "epoch": 1.626, + "grad_norm": 82.80094146728516, + "learning_rate": 3.49876e-07, + "loss": 0.4882, + "step": 162600 + }, + { + "epoch": 1.6265, + "grad_norm": 274.3426208496094, + "learning_rate": 3.49676e-07, + "loss": 0.4842, + "step": 162650 + }, + { + "epoch": 1.627, + "grad_norm": 40.102027893066406, + "learning_rate": 3.4947599999999997e-07, + "loss": 0.4315, + "step": 162700 + }, + { + "epoch": 1.6275, + "grad_norm": 10.764528274536133, + "learning_rate": 3.4927599999999996e-07, + "loss": 0.3907, + "step": 162750 + }, + { + "epoch": 1.6280000000000001, + "grad_norm": 58.64040756225586, + "learning_rate": 3.4908e-07, + "loss": 0.5452, + "step": 162800 + }, + { + "epoch": 1.6284999999999998, + "grad_norm": 80.29693603515625, + "learning_rate": 3.4888e-07, + "loss": 0.4522, + "step": 162850 + }, + { + "epoch": 1.629, + "grad_norm": 79.97079467773438, + "learning_rate": 3.4867999999999997e-07, + "loss": 0.5351, + "step": 162900 + }, + { + "epoch": 1.6295, + "grad_norm": 104.74796295166016, + "learning_rate": 3.4848e-07, + "loss": 0.5425, + "step": 162950 + }, + { + "epoch": 1.63, + "grad_norm": 5.748071193695068, + "learning_rate": 3.4827999999999994e-07, + "loss": 0.3334, + "step": 163000 + }, + { + "epoch": 1.6305, + "grad_norm": 167.40061950683594, + "learning_rate": 3.4808e-07, + "loss": 0.4874, + "step": 163050 + }, + { + "epoch": 1.631, + "grad_norm": 0.46852225065231323, + "learning_rate": 3.4788e-07, + "loss": 0.4224, + "step": 163100 + }, + { + "epoch": 1.6315, + "grad_norm": 86.6996078491211, + "learning_rate": 3.4767999999999995e-07, + "loss": 0.572, + "step": 163150 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 17.278841018676758, + "learning_rate": 3.4748e-07, + "loss": 0.521, + "step": 163200 + }, + { + "epoch": 1.6324999999999998, + "grad_norm": 26.78488540649414, + "learning_rate": 3.4727999999999997e-07, + "loss": 0.4149, + "step": 163250 + }, + { + "epoch": 1.633, + "grad_norm": 13.343032836914062, + "learning_rate": 3.4708e-07, + "loss": 0.3371, + "step": 163300 + }, + { + "epoch": 1.6335, + "grad_norm": 0.05451102554798126, + "learning_rate": 3.4688e-07, + "loss": 0.4209, + "step": 163350 + }, + { + "epoch": 1.634, + "grad_norm": 67.80818939208984, + "learning_rate": 3.4668e-07, + "loss": 0.3541, + "step": 163400 + }, + { + "epoch": 1.6345, + "grad_norm": 77.798828125, + "learning_rate": 3.4648e-07, + "loss": 0.4772, + "step": 163450 + }, + { + "epoch": 1.635, + "grad_norm": 70.70539855957031, + "learning_rate": 3.4627999999999995e-07, + "loss": 0.4017, + "step": 163500 + }, + { + "epoch": 1.6355, + "grad_norm": 89.26293182373047, + "learning_rate": 3.4608e-07, + "loss": 0.5131, + "step": 163550 + }, + { + "epoch": 1.6360000000000001, + "grad_norm": 68.28092956542969, + "learning_rate": 3.4588e-07, + "loss": 0.4411, + "step": 163600 + }, + { + "epoch": 1.6364999999999998, + "grad_norm": 14.089746475219727, + "learning_rate": 3.4567999999999996e-07, + "loss": 0.5162, + "step": 163650 + }, + { + "epoch": 1.637, + "grad_norm": 87.18695068359375, + "learning_rate": 3.4548e-07, + "loss": 0.4253, + "step": 163700 + }, + { + "epoch": 1.6375, + "grad_norm": 7.904430389404297, + "learning_rate": 3.4528e-07, + "loss": 0.4666, + "step": 163750 + }, + { + "epoch": 1.638, + "grad_norm": 1.7455227375030518, + "learning_rate": 3.4507999999999996e-07, + "loss": 0.286, + "step": 163800 + }, + { + "epoch": 1.6385, + "grad_norm": 26.539167404174805, + "learning_rate": 3.4488e-07, + "loss": 0.5327, + "step": 163850 + }, + { + "epoch": 1.639, + "grad_norm": 10.772929191589355, + "learning_rate": 3.4468e-07, + "loss": 0.3932, + "step": 163900 + }, + { + "epoch": 1.6395, + "grad_norm": 37.356571197509766, + "learning_rate": 3.4447999999999997e-07, + "loss": 0.5795, + "step": 163950 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 37.573543548583984, + "learning_rate": 3.4427999999999996e-07, + "loss": 0.5033, + "step": 164000 + }, + { + "epoch": 1.6404999999999998, + "grad_norm": 4.547316074371338, + "learning_rate": 3.4408e-07, + "loss": 0.4576, + "step": 164050 + }, + { + "epoch": 1.641, + "grad_norm": 70.57447814941406, + "learning_rate": 3.4388e-07, + "loss": 0.4713, + "step": 164100 + }, + { + "epoch": 1.6415, + "grad_norm": 5.259047508239746, + "learning_rate": 3.4367999999999996e-07, + "loss": 0.5499, + "step": 164150 + }, + { + "epoch": 1.642, + "grad_norm": 10.574108123779297, + "learning_rate": 3.4348e-07, + "loss": 0.2849, + "step": 164200 + }, + { + "epoch": 1.6425, + "grad_norm": 69.39708709716797, + "learning_rate": 3.4327999999999993e-07, + "loss": 0.4958, + "step": 164250 + }, + { + "epoch": 1.643, + "grad_norm": 21.05078887939453, + "learning_rate": 3.4307999999999997e-07, + "loss": 0.6435, + "step": 164300 + }, + { + "epoch": 1.6435, + "grad_norm": 41.12791442871094, + "learning_rate": 3.4288e-07, + "loss": 0.3685, + "step": 164350 + }, + { + "epoch": 1.6440000000000001, + "grad_norm": 118.48395538330078, + "learning_rate": 3.4268e-07, + "loss": 0.4217, + "step": 164400 + }, + { + "epoch": 1.6444999999999999, + "grad_norm": 5.526557922363281, + "learning_rate": 3.4248e-07, + "loss": 0.3102, + "step": 164450 + }, + { + "epoch": 1.645, + "grad_norm": 52.724700927734375, + "learning_rate": 3.4227999999999997e-07, + "loss": 0.4322, + "step": 164500 + }, + { + "epoch": 1.6455, + "grad_norm": 5.8467631340026855, + "learning_rate": 3.4208e-07, + "loss": 0.4841, + "step": 164550 + }, + { + "epoch": 1.646, + "grad_norm": 8.807022094726562, + "learning_rate": 3.4188e-07, + "loss": 0.3826, + "step": 164600 + }, + { + "epoch": 1.6465, + "grad_norm": 0.2374558299779892, + "learning_rate": 3.4167999999999997e-07, + "loss": 0.4602, + "step": 164650 + }, + { + "epoch": 1.647, + "grad_norm": 4.015384197235107, + "learning_rate": 3.4148e-07, + "loss": 0.4809, + "step": 164700 + }, + { + "epoch": 1.6475, + "grad_norm": 0.15211087465286255, + "learning_rate": 3.4127999999999994e-07, + "loss": 0.3981, + "step": 164750 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 62.6292839050293, + "learning_rate": 3.4108e-07, + "loss": 0.4082, + "step": 164800 + }, + { + "epoch": 1.6484999999999999, + "grad_norm": 46.557071685791016, + "learning_rate": 3.4088e-07, + "loss": 0.3833, + "step": 164850 + }, + { + "epoch": 1.649, + "grad_norm": 109.10408782958984, + "learning_rate": 3.4067999999999995e-07, + "loss": 0.3742, + "step": 164900 + }, + { + "epoch": 1.6495, + "grad_norm": 84.44685363769531, + "learning_rate": 3.4048e-07, + "loss": 0.4855, + "step": 164950 + }, + { + "epoch": 1.65, + "grad_norm": 3.448240041732788, + "learning_rate": 3.4028000000000003e-07, + "loss": 0.5069, + "step": 165000 + }, + { + "epoch": 1.6505, + "grad_norm": 25.00313377380371, + "learning_rate": 3.4007999999999996e-07, + "loss": 0.4354, + "step": 165050 + }, + { + "epoch": 1.651, + "grad_norm": 83.84122467041016, + "learning_rate": 3.3988e-07, + "loss": 0.5135, + "step": 165100 + }, + { + "epoch": 1.6515, + "grad_norm": 0.6992330551147461, + "learning_rate": 3.3968e-07, + "loss": 0.3979, + "step": 165150 + }, + { + "epoch": 1.6520000000000001, + "grad_norm": 13.750035285949707, + "learning_rate": 3.3947999999999997e-07, + "loss": 0.4341, + "step": 165200 + }, + { + "epoch": 1.6524999999999999, + "grad_norm": 7.680423736572266, + "learning_rate": 3.3928e-07, + "loss": 0.4273, + "step": 165250 + }, + { + "epoch": 1.653, + "grad_norm": 1.2927963733673096, + "learning_rate": 3.3908e-07, + "loss": 0.3914, + "step": 165300 + }, + { + "epoch": 1.6535, + "grad_norm": 92.42204284667969, + "learning_rate": 3.3888e-07, + "loss": 0.5286, + "step": 165350 + }, + { + "epoch": 1.654, + "grad_norm": 102.65548706054688, + "learning_rate": 3.3867999999999996e-07, + "loss": 0.485, + "step": 165400 + }, + { + "epoch": 1.6545, + "grad_norm": 7.027461051940918, + "learning_rate": 3.3848e-07, + "loss": 0.4433, + "step": 165450 + }, + { + "epoch": 1.655, + "grad_norm": 74.44886779785156, + "learning_rate": 3.3828000000000004e-07, + "loss": 0.3997, + "step": 165500 + }, + { + "epoch": 1.6555, + "grad_norm": 36.282901763916016, + "learning_rate": 3.3807999999999997e-07, + "loss": 0.389, + "step": 165550 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 23.829561233520508, + "learning_rate": 3.3788e-07, + "loss": 0.312, + "step": 165600 + }, + { + "epoch": 1.6564999999999999, + "grad_norm": 50.19546127319336, + "learning_rate": 3.3768e-07, + "loss": 0.4511, + "step": 165650 + }, + { + "epoch": 1.657, + "grad_norm": 37.749725341796875, + "learning_rate": 3.3748e-07, + "loss": 0.3837, + "step": 165700 + }, + { + "epoch": 1.6575, + "grad_norm": 8.897858619689941, + "learning_rate": 3.3728e-07, + "loss": 0.5425, + "step": 165750 + }, + { + "epoch": 1.658, + "grad_norm": 3.789098024368286, + "learning_rate": 3.3708e-07, + "loss": 0.4618, + "step": 165800 + }, + { + "epoch": 1.6585, + "grad_norm": 73.6874771118164, + "learning_rate": 3.3688e-07, + "loss": 0.6174, + "step": 165850 + }, + { + "epoch": 1.659, + "grad_norm": 28.941118240356445, + "learning_rate": 3.3667999999999997e-07, + "loss": 0.4578, + "step": 165900 + }, + { + "epoch": 1.6595, + "grad_norm": 104.42362213134766, + "learning_rate": 3.3648e-07, + "loss": 0.4737, + "step": 165950 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 6.5759477615356445, + "learning_rate": 3.3628e-07, + "loss": 0.4182, + "step": 166000 + }, + { + "epoch": 1.6604999999999999, + "grad_norm": 3.6129372119903564, + "learning_rate": 3.3608e-07, + "loss": 0.3286, + "step": 166050 + }, + { + "epoch": 1.661, + "grad_norm": 32.19172668457031, + "learning_rate": 3.3588e-07, + "loss": 0.3299, + "step": 166100 + }, + { + "epoch": 1.6615, + "grad_norm": 34.54439163208008, + "learning_rate": 3.3567999999999995e-07, + "loss": 0.2821, + "step": 166150 + }, + { + "epoch": 1.662, + "grad_norm": 7.3551411628723145, + "learning_rate": 3.3548e-07, + "loss": 0.6133, + "step": 166200 + }, + { + "epoch": 1.6625, + "grad_norm": 135.5836944580078, + "learning_rate": 3.3528e-07, + "loss": 0.5549, + "step": 166250 + }, + { + "epoch": 1.663, + "grad_norm": 65.05741882324219, + "learning_rate": 3.3507999999999995e-07, + "loss": 0.5788, + "step": 166300 + }, + { + "epoch": 1.6635, + "grad_norm": 46.17664337158203, + "learning_rate": 3.3488e-07, + "loss": 0.5116, + "step": 166350 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 55.95429611206055, + "learning_rate": 3.3468e-07, + "loss": 0.4453, + "step": 166400 + }, + { + "epoch": 1.6644999999999999, + "grad_norm": 1.1684380769729614, + "learning_rate": 3.3447999999999996e-07, + "loss": 0.4546, + "step": 166450 + }, + { + "epoch": 1.665, + "grad_norm": 4.0728230476379395, + "learning_rate": 3.3428e-07, + "loss": 0.4625, + "step": 166500 + }, + { + "epoch": 1.6655, + "grad_norm": 14.39780330657959, + "learning_rate": 3.3408e-07, + "loss": 0.3562, + "step": 166550 + }, + { + "epoch": 1.666, + "grad_norm": 0.6815798282623291, + "learning_rate": 3.3387999999999997e-07, + "loss": 0.4597, + "step": 166600 + }, + { + "epoch": 1.6665, + "grad_norm": 95.65667724609375, + "learning_rate": 3.3367999999999995e-07, + "loss": 0.5346, + "step": 166650 + }, + { + "epoch": 1.667, + "grad_norm": 13.627656936645508, + "learning_rate": 3.3348e-07, + "loss": 0.4855, + "step": 166700 + }, + { + "epoch": 1.6675, + "grad_norm": 5.321529388427734, + "learning_rate": 3.3328000000000003e-07, + "loss": 0.401, + "step": 166750 + }, + { + "epoch": 1.6680000000000001, + "grad_norm": 9.042032241821289, + "learning_rate": 3.3307999999999996e-07, + "loss": 0.3916, + "step": 166800 + }, + { + "epoch": 1.6684999999999999, + "grad_norm": 2.1465108394622803, + "learning_rate": 3.3288e-07, + "loss": 0.559, + "step": 166850 + }, + { + "epoch": 1.669, + "grad_norm": 60.72265625, + "learning_rate": 3.3268e-07, + "loss": 0.3849, + "step": 166900 + }, + { + "epoch": 1.6695, + "grad_norm": 68.50015258789062, + "learning_rate": 3.3247999999999997e-07, + "loss": 0.4911, + "step": 166950 + }, + { + "epoch": 1.67, + "grad_norm": 15.50178337097168, + "learning_rate": 3.3228e-07, + "loss": 0.4197, + "step": 167000 + }, + { + "epoch": 1.6705, + "grad_norm": 3.2465248107910156, + "learning_rate": 3.3208e-07, + "loss": 0.3693, + "step": 167050 + }, + { + "epoch": 1.671, + "grad_norm": 7.669544219970703, + "learning_rate": 3.3188e-07, + "loss": 0.4919, + "step": 167100 + }, + { + "epoch": 1.6715, + "grad_norm": 50.00425720214844, + "learning_rate": 3.3167999999999996e-07, + "loss": 0.4257, + "step": 167150 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 2.649852991104126, + "learning_rate": 3.3148e-07, + "loss": 0.43, + "step": 167200 + }, + { + "epoch": 1.6724999999999999, + "grad_norm": 70.55695343017578, + "learning_rate": 3.3128e-07, + "loss": 0.3321, + "step": 167250 + }, + { + "epoch": 1.673, + "grad_norm": 18.895328521728516, + "learning_rate": 3.3107999999999997e-07, + "loss": 0.5031, + "step": 167300 + }, + { + "epoch": 1.6735, + "grad_norm": 7.735249042510986, + "learning_rate": 3.3088e-07, + "loss": 0.501, + "step": 167350 + }, + { + "epoch": 1.674, + "grad_norm": 60.472511291503906, + "learning_rate": 3.3067999999999994e-07, + "loss": 0.3583, + "step": 167400 + }, + { + "epoch": 1.6745, + "grad_norm": 121.01581573486328, + "learning_rate": 3.3048e-07, + "loss": 0.4813, + "step": 167450 + }, + { + "epoch": 1.675, + "grad_norm": 31.92609405517578, + "learning_rate": 3.3028e-07, + "loss": 0.4163, + "step": 167500 + }, + { + "epoch": 1.6755, + "grad_norm": 20.554677963256836, + "learning_rate": 3.3007999999999995e-07, + "loss": 0.3398, + "step": 167550 + }, + { + "epoch": 1.6760000000000002, + "grad_norm": 50.82945251464844, + "learning_rate": 3.2988e-07, + "loss": 0.5493, + "step": 167600 + }, + { + "epoch": 1.6764999999999999, + "grad_norm": 13.953234672546387, + "learning_rate": 3.2967999999999997e-07, + "loss": 0.4574, + "step": 167650 + }, + { + "epoch": 1.677, + "grad_norm": 16.389623641967773, + "learning_rate": 3.2947999999999996e-07, + "loss": 0.3857, + "step": 167700 + }, + { + "epoch": 1.6775, + "grad_norm": 2.509197235107422, + "learning_rate": 3.2928e-07, + "loss": 0.3919, + "step": 167750 + }, + { + "epoch": 1.678, + "grad_norm": 4.2857666015625, + "learning_rate": 3.2908e-07, + "loss": 0.4109, + "step": 167800 + }, + { + "epoch": 1.6785, + "grad_norm": 35.18456268310547, + "learning_rate": 3.2887999999999996e-07, + "loss": 0.4017, + "step": 167850 + }, + { + "epoch": 1.679, + "grad_norm": 76.32749938964844, + "learning_rate": 3.2868e-07, + "loss": 0.3254, + "step": 167900 + }, + { + "epoch": 1.6795, + "grad_norm": 11.019416809082031, + "learning_rate": 3.2848e-07, + "loss": 0.4384, + "step": 167950 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 18.828954696655273, + "learning_rate": 3.2828e-07, + "loss": 0.3532, + "step": 168000 + }, + { + "epoch": 1.6804999999999999, + "grad_norm": 51.45600891113281, + "learning_rate": 3.2807999999999996e-07, + "loss": 0.4289, + "step": 168050 + }, + { + "epoch": 1.681, + "grad_norm": 38.062007904052734, + "learning_rate": 3.2788e-07, + "loss": 0.3752, + "step": 168100 + }, + { + "epoch": 1.6815, + "grad_norm": 135.77662658691406, + "learning_rate": 3.2768000000000003e-07, + "loss": 0.3286, + "step": 168150 + }, + { + "epoch": 1.682, + "grad_norm": 79.01766967773438, + "learning_rate": 3.2747999999999997e-07, + "loss": 0.4695, + "step": 168200 + }, + { + "epoch": 1.6825, + "grad_norm": 61.94942855834961, + "learning_rate": 3.2728e-07, + "loss": 0.2963, + "step": 168250 + }, + { + "epoch": 1.683, + "grad_norm": 101.182373046875, + "learning_rate": 3.2708e-07, + "loss": 0.4316, + "step": 168300 + }, + { + "epoch": 1.6835, + "grad_norm": 44.68824768066406, + "learning_rate": 3.2687999999999997e-07, + "loss": 0.2822, + "step": 168350 + }, + { + "epoch": 1.6840000000000002, + "grad_norm": 97.19388580322266, + "learning_rate": 3.2668e-07, + "loss": 0.4836, + "step": 168400 + }, + { + "epoch": 1.6844999999999999, + "grad_norm": 55.13529968261719, + "learning_rate": 3.2648e-07, + "loss": 0.2778, + "step": 168450 + }, + { + "epoch": 1.685, + "grad_norm": 7.870936870574951, + "learning_rate": 3.2628e-07, + "loss": 0.4664, + "step": 168500 + }, + { + "epoch": 1.6855, + "grad_norm": 2.3908069133758545, + "learning_rate": 3.2607999999999997e-07, + "loss": 0.5063, + "step": 168550 + }, + { + "epoch": 1.686, + "grad_norm": 4.8086466789245605, + "learning_rate": 3.2588e-07, + "loss": 0.3995, + "step": 168600 + }, + { + "epoch": 1.6865, + "grad_norm": 0.3409883677959442, + "learning_rate": 3.2568e-07, + "loss": 0.3984, + "step": 168650 + }, + { + "epoch": 1.687, + "grad_norm": 10.495270729064941, + "learning_rate": 3.2548e-07, + "loss": 0.4395, + "step": 168700 + }, + { + "epoch": 1.6875, + "grad_norm": 41.94511032104492, + "learning_rate": 3.2528e-07, + "loss": 0.4126, + "step": 168750 + }, + { + "epoch": 1.688, + "grad_norm": 57.92393493652344, + "learning_rate": 3.2507999999999994e-07, + "loss": 0.318, + "step": 168800 + }, + { + "epoch": 1.6885, + "grad_norm": 58.26197814941406, + "learning_rate": 3.2488e-07, + "loss": 0.4174, + "step": 168850 + }, + { + "epoch": 1.689, + "grad_norm": 7.934047222137451, + "learning_rate": 3.2468e-07, + "loss": 0.4684, + "step": 168900 + }, + { + "epoch": 1.6895, + "grad_norm": 76.46321105957031, + "learning_rate": 3.2447999999999995e-07, + "loss": 0.3206, + "step": 168950 + }, + { + "epoch": 1.69, + "grad_norm": 55.63337707519531, + "learning_rate": 3.2428e-07, + "loss": 0.5596, + "step": 169000 + }, + { + "epoch": 1.6905000000000001, + "grad_norm": 9.17141056060791, + "learning_rate": 3.2408e-07, + "loss": 0.3573, + "step": 169050 + }, + { + "epoch": 1.6909999999999998, + "grad_norm": 76.35599517822266, + "learning_rate": 3.2388e-07, + "loss": 0.6063, + "step": 169100 + }, + { + "epoch": 1.6915, + "grad_norm": 54.43082809448242, + "learning_rate": 3.2368e-07, + "loss": 0.4363, + "step": 169150 + }, + { + "epoch": 1.692, + "grad_norm": 25.601472854614258, + "learning_rate": 3.2348e-07, + "loss": 0.3161, + "step": 169200 + }, + { + "epoch": 1.6925, + "grad_norm": 61.37073516845703, + "learning_rate": 3.2328e-07, + "loss": 0.4411, + "step": 169250 + }, + { + "epoch": 1.693, + "grad_norm": 73.22949981689453, + "learning_rate": 3.2307999999999995e-07, + "loss": 0.4661, + "step": 169300 + }, + { + "epoch": 1.6935, + "grad_norm": 16.108476638793945, + "learning_rate": 3.2288e-07, + "loss": 0.4321, + "step": 169350 + }, + { + "epoch": 1.694, + "grad_norm": 83.31065368652344, + "learning_rate": 3.2268000000000003e-07, + "loss": 0.3823, + "step": 169400 + }, + { + "epoch": 1.6945000000000001, + "grad_norm": 1.437071681022644, + "learning_rate": 3.2248399999999996e-07, + "loss": 0.4882, + "step": 169450 + }, + { + "epoch": 1.6949999999999998, + "grad_norm": 60.666839599609375, + "learning_rate": 3.22284e-07, + "loss": 0.4287, + "step": 169500 + }, + { + "epoch": 1.6955, + "grad_norm": 52.73131561279297, + "learning_rate": 3.22084e-07, + "loss": 0.3638, + "step": 169550 + }, + { + "epoch": 1.696, + "grad_norm": 67.59291076660156, + "learning_rate": 3.2188399999999997e-07, + "loss": 0.5091, + "step": 169600 + }, + { + "epoch": 1.6965, + "grad_norm": 33.913936614990234, + "learning_rate": 3.21684e-07, + "loss": 0.3616, + "step": 169650 + }, + { + "epoch": 1.697, + "grad_norm": 6.20016622543335, + "learning_rate": 3.21484e-07, + "loss": 0.4313, + "step": 169700 + }, + { + "epoch": 1.6975, + "grad_norm": 78.34620666503906, + "learning_rate": 3.21288e-07, + "loss": 0.4994, + "step": 169750 + }, + { + "epoch": 1.698, + "grad_norm": 8.514922142028809, + "learning_rate": 3.2108799999999997e-07, + "loss": 0.4833, + "step": 169800 + }, + { + "epoch": 1.6985000000000001, + "grad_norm": 43.1236572265625, + "learning_rate": 3.20888e-07, + "loss": 0.4908, + "step": 169850 + }, + { + "epoch": 1.6989999999999998, + "grad_norm": 51.2786750793457, + "learning_rate": 3.2068799999999994e-07, + "loss": 0.6363, + "step": 169900 + }, + { + "epoch": 1.6995, + "grad_norm": 102.73619842529297, + "learning_rate": 3.20488e-07, + "loss": 0.442, + "step": 169950 + }, + { + "epoch": 1.7, + "grad_norm": 121.2522201538086, + "learning_rate": 3.20288e-07, + "loss": 0.5802, + "step": 170000 + }, + { + "epoch": 1.7005, + "grad_norm": 31.411584854125977, + "learning_rate": 3.2008799999999995e-07, + "loss": 0.3274, + "step": 170050 + }, + { + "epoch": 1.701, + "grad_norm": 83.65167999267578, + "learning_rate": 3.19888e-07, + "loss": 0.3685, + "step": 170100 + }, + { + "epoch": 1.7015, + "grad_norm": 11.413860321044922, + "learning_rate": 3.19688e-07, + "loss": 0.382, + "step": 170150 + }, + { + "epoch": 1.702, + "grad_norm": 18.51053237915039, + "learning_rate": 3.1948799999999996e-07, + "loss": 0.3183, + "step": 170200 + }, + { + "epoch": 1.7025000000000001, + "grad_norm": 4.223658084869385, + "learning_rate": 3.19288e-07, + "loss": 0.2734, + "step": 170250 + }, + { + "epoch": 1.7029999999999998, + "grad_norm": 4.411848545074463, + "learning_rate": 3.19088e-07, + "loss": 0.3668, + "step": 170300 + }, + { + "epoch": 1.7035, + "grad_norm": 29.715543746948242, + "learning_rate": 3.18888e-07, + "loss": 0.3997, + "step": 170350 + }, + { + "epoch": 1.704, + "grad_norm": 3.227698802947998, + "learning_rate": 3.18688e-07, + "loss": 0.5644, + "step": 170400 + }, + { + "epoch": 1.7045, + "grad_norm": 41.32566833496094, + "learning_rate": 3.18488e-07, + "loss": 0.5333, + "step": 170450 + }, + { + "epoch": 1.705, + "grad_norm": 11.538256645202637, + "learning_rate": 3.18288e-07, + "loss": 0.3157, + "step": 170500 + }, + { + "epoch": 1.7055, + "grad_norm": 0.46403923630714417, + "learning_rate": 3.1808799999999996e-07, + "loss": 0.534, + "step": 170550 + }, + { + "epoch": 1.706, + "grad_norm": 18.477012634277344, + "learning_rate": 3.17892e-07, + "loss": 0.4467, + "step": 170600 + }, + { + "epoch": 1.7065000000000001, + "grad_norm": 69.50625610351562, + "learning_rate": 3.17692e-07, + "loss": 0.4261, + "step": 170650 + }, + { + "epoch": 1.7069999999999999, + "grad_norm": 73.91793060302734, + "learning_rate": 3.1749199999999997e-07, + "loss": 0.4084, + "step": 170700 + }, + { + "epoch": 1.7075, + "grad_norm": 43.193870544433594, + "learning_rate": 3.17292e-07, + "loss": 0.3532, + "step": 170750 + }, + { + "epoch": 1.708, + "grad_norm": 29.659255981445312, + "learning_rate": 3.17092e-07, + "loss": 0.4348, + "step": 170800 + }, + { + "epoch": 1.7085, + "grad_norm": 3.251600980758667, + "learning_rate": 3.16892e-07, + "loss": 0.4886, + "step": 170850 + }, + { + "epoch": 1.709, + "grad_norm": 21.75658416748047, + "learning_rate": 3.1669199999999996e-07, + "loss": 0.515, + "step": 170900 + }, + { + "epoch": 1.7095, + "grad_norm": 6.0939106941223145, + "learning_rate": 3.16492e-07, + "loss": 0.5142, + "step": 170950 + }, + { + "epoch": 1.71, + "grad_norm": 94.46733856201172, + "learning_rate": 3.16292e-07, + "loss": 0.5497, + "step": 171000 + }, + { + "epoch": 1.7105000000000001, + "grad_norm": 148.20986938476562, + "learning_rate": 3.1609199999999997e-07, + "loss": 0.5326, + "step": 171050 + }, + { + "epoch": 1.7109999999999999, + "grad_norm": 25.23739242553711, + "learning_rate": 3.15892e-07, + "loss": 0.4548, + "step": 171100 + }, + { + "epoch": 1.7115, + "grad_norm": 45.993045806884766, + "learning_rate": 3.1569199999999994e-07, + "loss": 0.3861, + "step": 171150 + }, + { + "epoch": 1.712, + "grad_norm": 70.52889251708984, + "learning_rate": 3.15492e-07, + "loss": 0.4945, + "step": 171200 + }, + { + "epoch": 1.7125, + "grad_norm": 0.17195826768875122, + "learning_rate": 3.15292e-07, + "loss": 0.4047, + "step": 171250 + }, + { + "epoch": 1.713, + "grad_norm": 141.29371643066406, + "learning_rate": 3.1509199999999995e-07, + "loss": 0.4854, + "step": 171300 + }, + { + "epoch": 1.7135, + "grad_norm": 79.38813781738281, + "learning_rate": 3.14892e-07, + "loss": 0.4288, + "step": 171350 + }, + { + "epoch": 1.714, + "grad_norm": 93.10710144042969, + "learning_rate": 3.14692e-07, + "loss": 0.3964, + "step": 171400 + }, + { + "epoch": 1.7145000000000001, + "grad_norm": 120.59520721435547, + "learning_rate": 3.1449199999999995e-07, + "loss": 0.3408, + "step": 171450 + }, + { + "epoch": 1.7149999999999999, + "grad_norm": 33.953556060791016, + "learning_rate": 3.14292e-07, + "loss": 0.4579, + "step": 171500 + }, + { + "epoch": 1.7155, + "grad_norm": 32.431541442871094, + "learning_rate": 3.14092e-07, + "loss": 0.348, + "step": 171550 + }, + { + "epoch": 1.716, + "grad_norm": 36.8668327331543, + "learning_rate": 3.13892e-07, + "loss": 0.4879, + "step": 171600 + }, + { + "epoch": 1.7165, + "grad_norm": 2.457643747329712, + "learning_rate": 3.13692e-07, + "loss": 0.3101, + "step": 171650 + }, + { + "epoch": 1.717, + "grad_norm": 29.928564071655273, + "learning_rate": 3.13496e-07, + "loss": 0.433, + "step": 171700 + }, + { + "epoch": 1.7175, + "grad_norm": 46.240840911865234, + "learning_rate": 3.1329600000000003e-07, + "loss": 0.4581, + "step": 171750 + }, + { + "epoch": 1.718, + "grad_norm": 13.193766593933105, + "learning_rate": 3.1309599999999996e-07, + "loss": 0.5739, + "step": 171800 + }, + { + "epoch": 1.7185000000000001, + "grad_norm": 47.43181228637695, + "learning_rate": 3.12896e-07, + "loss": 0.4211, + "step": 171850 + }, + { + "epoch": 1.7189999999999999, + "grad_norm": 50.500213623046875, + "learning_rate": 3.12696e-07, + "loss": 0.3436, + "step": 171900 + }, + { + "epoch": 1.7195, + "grad_norm": 5.8976731300354, + "learning_rate": 3.1249599999999997e-07, + "loss": 0.3893, + "step": 171950 + }, + { + "epoch": 1.72, + "grad_norm": 71.6220474243164, + "learning_rate": 3.12296e-07, + "loss": 0.4529, + "step": 172000 + }, + { + "epoch": 1.7205, + "grad_norm": 32.8338508605957, + "learning_rate": 3.12096e-07, + "loss": 0.3568, + "step": 172050 + }, + { + "epoch": 1.721, + "grad_norm": 17.878786087036133, + "learning_rate": 3.11896e-07, + "loss": 0.2774, + "step": 172100 + }, + { + "epoch": 1.7215, + "grad_norm": 90.94465637207031, + "learning_rate": 3.1169599999999996e-07, + "loss": 0.3775, + "step": 172150 + }, + { + "epoch": 1.722, + "grad_norm": 101.07366180419922, + "learning_rate": 3.11496e-07, + "loss": 0.452, + "step": 172200 + }, + { + "epoch": 1.7225000000000001, + "grad_norm": 61.204246520996094, + "learning_rate": 3.11296e-07, + "loss": 0.4113, + "step": 172250 + }, + { + "epoch": 1.7229999999999999, + "grad_norm": 92.17945098876953, + "learning_rate": 3.1109599999999997e-07, + "loss": 0.4296, + "step": 172300 + }, + { + "epoch": 1.7235, + "grad_norm": 74.0315170288086, + "learning_rate": 3.10896e-07, + "loss": 0.5087, + "step": 172350 + }, + { + "epoch": 1.724, + "grad_norm": 91.9222640991211, + "learning_rate": 3.10696e-07, + "loss": 0.4429, + "step": 172400 + }, + { + "epoch": 1.7245, + "grad_norm": 33.95732498168945, + "learning_rate": 3.10496e-07, + "loss": 0.5813, + "step": 172450 + }, + { + "epoch": 1.725, + "grad_norm": 80.2164535522461, + "learning_rate": 3.10296e-07, + "loss": 0.4709, + "step": 172500 + }, + { + "epoch": 1.7255, + "grad_norm": 131.9748992919922, + "learning_rate": 3.1009599999999995e-07, + "loss": 0.4982, + "step": 172550 + }, + { + "epoch": 1.726, + "grad_norm": 1.9716410636901855, + "learning_rate": 3.09896e-07, + "loss": 0.3589, + "step": 172600 + }, + { + "epoch": 1.7265000000000001, + "grad_norm": 76.4041519165039, + "learning_rate": 3.09696e-07, + "loss": 0.4375, + "step": 172650 + }, + { + "epoch": 1.7269999999999999, + "grad_norm": 10.473490715026855, + "learning_rate": 3.0949599999999995e-07, + "loss": 0.3493, + "step": 172700 + }, + { + "epoch": 1.7275, + "grad_norm": 44.0904426574707, + "learning_rate": 3.09296e-07, + "loss": 0.4193, + "step": 172750 + }, + { + "epoch": 1.728, + "grad_norm": 37.536067962646484, + "learning_rate": 3.09096e-07, + "loss": 0.4956, + "step": 172800 + }, + { + "epoch": 1.7285, + "grad_norm": 141.21176147460938, + "learning_rate": 3.08896e-07, + "loss": 0.3959, + "step": 172850 + }, + { + "epoch": 1.729, + "grad_norm": 15.123921394348145, + "learning_rate": 3.08696e-07, + "loss": 0.4004, + "step": 172900 + }, + { + "epoch": 1.7295, + "grad_norm": 38.241249084472656, + "learning_rate": 3.08496e-07, + "loss": 0.5414, + "step": 172950 + }, + { + "epoch": 1.73, + "grad_norm": 87.40081787109375, + "learning_rate": 3.08296e-07, + "loss": 0.5123, + "step": 173000 + }, + { + "epoch": 1.7305000000000001, + "grad_norm": 17.00108528137207, + "learning_rate": 3.0809599999999996e-07, + "loss": 0.4696, + "step": 173050 + }, + { + "epoch": 1.7309999999999999, + "grad_norm": 5.777243614196777, + "learning_rate": 3.07896e-07, + "loss": 0.479, + "step": 173100 + }, + { + "epoch": 1.7315, + "grad_norm": 92.2618179321289, + "learning_rate": 3.0769600000000003e-07, + "loss": 0.3697, + "step": 173150 + }, + { + "epoch": 1.732, + "grad_norm": 115.76209259033203, + "learning_rate": 3.0749599999999996e-07, + "loss": 0.5014, + "step": 173200 + }, + { + "epoch": 1.7325, + "grad_norm": 10.42154598236084, + "learning_rate": 3.07296e-07, + "loss": 0.521, + "step": 173250 + }, + { + "epoch": 1.733, + "grad_norm": 48.72549819946289, + "learning_rate": 3.07096e-07, + "loss": 0.5236, + "step": 173300 + }, + { + "epoch": 1.7335, + "grad_norm": 43.980899810791016, + "learning_rate": 3.0689599999999997e-07, + "loss": 0.5085, + "step": 173350 + }, + { + "epoch": 1.734, + "grad_norm": 7.443145275115967, + "learning_rate": 3.06696e-07, + "loss": 0.4113, + "step": 173400 + }, + { + "epoch": 1.7345000000000002, + "grad_norm": 9.301448822021484, + "learning_rate": 3.06496e-07, + "loss": 0.4126, + "step": 173450 + }, + { + "epoch": 1.7349999999999999, + "grad_norm": 85.95564270019531, + "learning_rate": 3.06296e-07, + "loss": 0.6378, + "step": 173500 + }, + { + "epoch": 1.7355, + "grad_norm": 20.996845245361328, + "learning_rate": 3.0609599999999996e-07, + "loss": 0.6383, + "step": 173550 + }, + { + "epoch": 1.736, + "grad_norm": 50.388126373291016, + "learning_rate": 3.05896e-07, + "loss": 0.3279, + "step": 173600 + }, + { + "epoch": 1.7365, + "grad_norm": 0.6028560400009155, + "learning_rate": 3.05696e-07, + "loss": 0.4151, + "step": 173650 + }, + { + "epoch": 1.737, + "grad_norm": 31.13410186767578, + "learning_rate": 3.0549599999999997e-07, + "loss": 0.423, + "step": 173700 + }, + { + "epoch": 1.7375, + "grad_norm": 72.78054809570312, + "learning_rate": 3.05296e-07, + "loss": 0.344, + "step": 173750 + }, + { + "epoch": 1.738, + "grad_norm": 25.89346694946289, + "learning_rate": 3.0509599999999994e-07, + "loss": 0.6206, + "step": 173800 + }, + { + "epoch": 1.7385000000000002, + "grad_norm": 48.15966033935547, + "learning_rate": 3.04896e-07, + "loss": 0.3109, + "step": 173850 + }, + { + "epoch": 1.7389999999999999, + "grad_norm": 45.8206787109375, + "learning_rate": 3.04696e-07, + "loss": 0.5, + "step": 173900 + }, + { + "epoch": 1.7395, + "grad_norm": 9.931618690490723, + "learning_rate": 3.0449599999999995e-07, + "loss": 0.4324, + "step": 173950 + }, + { + "epoch": 1.74, + "grad_norm": 107.68231964111328, + "learning_rate": 3.04296e-07, + "loss": 0.4264, + "step": 174000 + }, + { + "epoch": 1.7405, + "grad_norm": 23.903282165527344, + "learning_rate": 3.0409599999999997e-07, + "loss": 0.4645, + "step": 174050 + }, + { + "epoch": 1.741, + "grad_norm": 60.52227783203125, + "learning_rate": 3.03896e-07, + "loss": 0.4347, + "step": 174100 + }, + { + "epoch": 1.7415, + "grad_norm": 3.167391300201416, + "learning_rate": 3.03696e-07, + "loss": 0.354, + "step": 174150 + }, + { + "epoch": 1.742, + "grad_norm": 72.62369537353516, + "learning_rate": 3.03496e-07, + "loss": 0.4243, + "step": 174200 + }, + { + "epoch": 1.7425000000000002, + "grad_norm": 51.68940734863281, + "learning_rate": 3.03296e-07, + "loss": 0.4351, + "step": 174250 + }, + { + "epoch": 1.7429999999999999, + "grad_norm": 93.96479797363281, + "learning_rate": 3.0309599999999995e-07, + "loss": 0.3287, + "step": 174300 + }, + { + "epoch": 1.7435, + "grad_norm": 151.80270385742188, + "learning_rate": 3.02896e-07, + "loss": 0.3704, + "step": 174350 + }, + { + "epoch": 1.744, + "grad_norm": 6.188260555267334, + "learning_rate": 3.02696e-07, + "loss": 0.5232, + "step": 174400 + }, + { + "epoch": 1.7445, + "grad_norm": 19.85384178161621, + "learning_rate": 3.0249599999999996e-07, + "loss": 0.4082, + "step": 174450 + }, + { + "epoch": 1.745, + "grad_norm": 2.640712022781372, + "learning_rate": 3.02296e-07, + "loss": 0.3756, + "step": 174500 + }, + { + "epoch": 1.7455, + "grad_norm": 20.137554168701172, + "learning_rate": 3.02096e-07, + "loss": 0.459, + "step": 174550 + }, + { + "epoch": 1.746, + "grad_norm": 60.7595100402832, + "learning_rate": 3.0189599999999997e-07, + "loss": 0.4374, + "step": 174600 + }, + { + "epoch": 1.7465000000000002, + "grad_norm": 51.217464447021484, + "learning_rate": 3.01696e-07, + "loss": 0.4513, + "step": 174650 + }, + { + "epoch": 1.7469999999999999, + "grad_norm": 7.342816352844238, + "learning_rate": 3.01496e-07, + "loss": 0.448, + "step": 174700 + }, + { + "epoch": 1.7475, + "grad_norm": 1.492277979850769, + "learning_rate": 3.01296e-07, + "loss": 0.4433, + "step": 174750 + }, + { + "epoch": 1.748, + "grad_norm": 75.10221099853516, + "learning_rate": 3.0109599999999996e-07, + "loss": 0.2593, + "step": 174800 + }, + { + "epoch": 1.7485, + "grad_norm": 6.8189616203308105, + "learning_rate": 3.00896e-07, + "loss": 0.4845, + "step": 174850 + }, + { + "epoch": 1.749, + "grad_norm": 33.18309783935547, + "learning_rate": 3.00696e-07, + "loss": 0.3589, + "step": 174900 + }, + { + "epoch": 1.7495, + "grad_norm": 149.52420043945312, + "learning_rate": 3.0049599999999997e-07, + "loss": 0.3238, + "step": 174950 + }, + { + "epoch": 1.75, + "grad_norm": 33.072242736816406, + "learning_rate": 3.00296e-07, + "loss": 0.5178, + "step": 175000 + }, + { + "epoch": 1.7505, + "grad_norm": 19.933521270751953, + "learning_rate": 3.0009599999999994e-07, + "loss": 0.4674, + "step": 175050 + }, + { + "epoch": 1.751, + "grad_norm": 14.122361183166504, + "learning_rate": 2.99896e-07, + "loss": 0.3734, + "step": 175100 + }, + { + "epoch": 1.7515, + "grad_norm": 0.9820132255554199, + "learning_rate": 2.99696e-07, + "loss": 0.3835, + "step": 175150 + }, + { + "epoch": 1.752, + "grad_norm": 0.1489812433719635, + "learning_rate": 2.99496e-07, + "loss": 0.6293, + "step": 175200 + }, + { + "epoch": 1.7525, + "grad_norm": 2.5810625553131104, + "learning_rate": 2.99296e-07, + "loss": 0.335, + "step": 175250 + }, + { + "epoch": 1.7530000000000001, + "grad_norm": 4.589435577392578, + "learning_rate": 2.9909599999999997e-07, + "loss": 0.5491, + "step": 175300 + }, + { + "epoch": 1.7534999999999998, + "grad_norm": 35.78123474121094, + "learning_rate": 2.98896e-07, + "loss": 0.3732, + "step": 175350 + }, + { + "epoch": 1.754, + "grad_norm": 7.164456844329834, + "learning_rate": 2.98696e-07, + "loss": 0.418, + "step": 175400 + }, + { + "epoch": 1.7545, + "grad_norm": 102.37982177734375, + "learning_rate": 2.98496e-07, + "loss": 0.5153, + "step": 175450 + }, + { + "epoch": 1.755, + "grad_norm": 1.655551552772522, + "learning_rate": 2.98296e-07, + "loss": 0.4579, + "step": 175500 + }, + { + "epoch": 1.7555, + "grad_norm": 11.854036331176758, + "learning_rate": 2.98096e-07, + "loss": 0.5165, + "step": 175550 + }, + { + "epoch": 1.756, + "grad_norm": 5.970990180969238, + "learning_rate": 2.97896e-07, + "loss": 0.3749, + "step": 175600 + }, + { + "epoch": 1.7565, + "grad_norm": 40.14838790893555, + "learning_rate": 2.97696e-07, + "loss": 0.4894, + "step": 175650 + }, + { + "epoch": 1.7570000000000001, + "grad_norm": 6.2947187423706055, + "learning_rate": 2.9749999999999996e-07, + "loss": 0.3435, + "step": 175700 + }, + { + "epoch": 1.7574999999999998, + "grad_norm": 72.81707000732422, + "learning_rate": 2.973e-07, + "loss": 0.5341, + "step": 175750 + }, + { + "epoch": 1.758, + "grad_norm": 23.278066635131836, + "learning_rate": 2.971e-07, + "loss": 0.3967, + "step": 175800 + }, + { + "epoch": 1.7585, + "grad_norm": 75.2026138305664, + "learning_rate": 2.9689999999999997e-07, + "loss": 0.4016, + "step": 175850 + }, + { + "epoch": 1.759, + "grad_norm": 25.339069366455078, + "learning_rate": 2.967e-07, + "loss": 0.2943, + "step": 175900 + }, + { + "epoch": 1.7595, + "grad_norm": 44.909751892089844, + "learning_rate": 2.965e-07, + "loss": 0.3533, + "step": 175950 + }, + { + "epoch": 1.76, + "grad_norm": 60.30315017700195, + "learning_rate": 2.9629999999999997e-07, + "loss": 0.5381, + "step": 176000 + }, + { + "epoch": 1.7605, + "grad_norm": 105.79608917236328, + "learning_rate": 2.9609999999999996e-07, + "loss": 0.6541, + "step": 176050 + }, + { + "epoch": 1.7610000000000001, + "grad_norm": 74.8968505859375, + "learning_rate": 2.959e-07, + "loss": 0.3239, + "step": 176100 + }, + { + "epoch": 1.7614999999999998, + "grad_norm": 2.4534757137298584, + "learning_rate": 2.957e-07, + "loss": 0.3273, + "step": 176150 + }, + { + "epoch": 1.762, + "grad_norm": 35.73273468017578, + "learning_rate": 2.9549999999999997e-07, + "loss": 0.4744, + "step": 176200 + }, + { + "epoch": 1.7625, + "grad_norm": 33.48553466796875, + "learning_rate": 2.953e-07, + "loss": 0.4624, + "step": 176250 + }, + { + "epoch": 1.763, + "grad_norm": 52.17403793334961, + "learning_rate": 2.9509999999999994e-07, + "loss": 0.3885, + "step": 176300 + }, + { + "epoch": 1.7635, + "grad_norm": 103.1676025390625, + "learning_rate": 2.949e-07, + "loss": 0.5388, + "step": 176350 + }, + { + "epoch": 1.764, + "grad_norm": 84.96452331542969, + "learning_rate": 2.947e-07, + "loss": 0.4278, + "step": 176400 + }, + { + "epoch": 1.7645, + "grad_norm": 1.080453634262085, + "learning_rate": 2.945e-07, + "loss": 0.5096, + "step": 176450 + }, + { + "epoch": 1.7650000000000001, + "grad_norm": 1.111628532409668, + "learning_rate": 2.943e-07, + "loss": 0.3848, + "step": 176500 + }, + { + "epoch": 1.7654999999999998, + "grad_norm": 31.323686599731445, + "learning_rate": 2.9409999999999997e-07, + "loss": 0.3379, + "step": 176550 + }, + { + "epoch": 1.766, + "grad_norm": 42.48777389526367, + "learning_rate": 2.939e-07, + "loss": 0.3058, + "step": 176600 + }, + { + "epoch": 1.7665, + "grad_norm": 7.140532970428467, + "learning_rate": 2.937e-07, + "loss": 0.3894, + "step": 176650 + }, + { + "epoch": 1.767, + "grad_norm": 18.32383918762207, + "learning_rate": 2.935e-07, + "loss": 0.2754, + "step": 176700 + }, + { + "epoch": 1.7675, + "grad_norm": 0.7897120118141174, + "learning_rate": 2.933e-07, + "loss": 0.5175, + "step": 176750 + }, + { + "epoch": 1.768, + "grad_norm": 121.51934051513672, + "learning_rate": 2.931e-07, + "loss": 0.4686, + "step": 176800 + }, + { + "epoch": 1.7685, + "grad_norm": 0.11282049119472504, + "learning_rate": 2.929e-07, + "loss": 0.4456, + "step": 176850 + }, + { + "epoch": 1.7690000000000001, + "grad_norm": 19.87791633605957, + "learning_rate": 2.927e-07, + "loss": 0.3768, + "step": 176900 + }, + { + "epoch": 1.7694999999999999, + "grad_norm": 0.5945930480957031, + "learning_rate": 2.9249999999999995e-07, + "loss": 0.5334, + "step": 176950 + }, + { + "epoch": 1.77, + "grad_norm": 5.010547161102295, + "learning_rate": 2.923e-07, + "loss": 0.4443, + "step": 177000 + }, + { + "epoch": 1.7705, + "grad_norm": 47.688934326171875, + "learning_rate": 2.9210000000000003e-07, + "loss": 0.4077, + "step": 177050 + }, + { + "epoch": 1.771, + "grad_norm": 60.86616516113281, + "learning_rate": 2.9189999999999996e-07, + "loss": 0.4187, + "step": 177100 + }, + { + "epoch": 1.7715, + "grad_norm": 19.05134391784668, + "learning_rate": 2.917e-07, + "loss": 0.4585, + "step": 177150 + }, + { + "epoch": 1.772, + "grad_norm": 41.00670623779297, + "learning_rate": 2.915e-07, + "loss": 0.3587, + "step": 177200 + }, + { + "epoch": 1.7725, + "grad_norm": 4.250216007232666, + "learning_rate": 2.9129999999999997e-07, + "loss": 0.4831, + "step": 177250 + }, + { + "epoch": 1.7730000000000001, + "grad_norm": 53.11735534667969, + "learning_rate": 2.911e-07, + "loss": 0.378, + "step": 177300 + }, + { + "epoch": 1.7734999999999999, + "grad_norm": 79.12381744384766, + "learning_rate": 2.909e-07, + "loss": 0.3421, + "step": 177350 + }, + { + "epoch": 1.774, + "grad_norm": 123.16691589355469, + "learning_rate": 2.907e-07, + "loss": 0.4126, + "step": 177400 + }, + { + "epoch": 1.7745, + "grad_norm": 83.73424530029297, + "learning_rate": 2.9049999999999996e-07, + "loss": 0.4596, + "step": 177450 + }, + { + "epoch": 1.775, + "grad_norm": 47.7365608215332, + "learning_rate": 2.903e-07, + "loss": 0.3569, + "step": 177500 + }, + { + "epoch": 1.7755, + "grad_norm": 18.262990951538086, + "learning_rate": 2.9010000000000004e-07, + "loss": 0.3546, + "step": 177550 + }, + { + "epoch": 1.776, + "grad_norm": 25.71767807006836, + "learning_rate": 2.8989999999999997e-07, + "loss": 0.5118, + "step": 177600 + }, + { + "epoch": 1.7765, + "grad_norm": 43.59213638305664, + "learning_rate": 2.897e-07, + "loss": 0.4437, + "step": 177650 + }, + { + "epoch": 1.7770000000000001, + "grad_norm": 3.3139374256134033, + "learning_rate": 2.895e-07, + "loss": 0.4414, + "step": 177700 + }, + { + "epoch": 1.7774999999999999, + "grad_norm": 21.859750747680664, + "learning_rate": 2.893e-07, + "loss": 0.5413, + "step": 177750 + }, + { + "epoch": 1.778, + "grad_norm": 63.0023307800293, + "learning_rate": 2.891e-07, + "loss": 0.3183, + "step": 177800 + }, + { + "epoch": 1.7785, + "grad_norm": 7.4691267013549805, + "learning_rate": 2.889e-07, + "loss": 0.2339, + "step": 177850 + }, + { + "epoch": 1.779, + "grad_norm": 51.13058090209961, + "learning_rate": 2.887e-07, + "loss": 0.5413, + "step": 177900 + }, + { + "epoch": 1.7795, + "grad_norm": 65.8671875, + "learning_rate": 2.8849999999999997e-07, + "loss": 0.4583, + "step": 177950 + }, + { + "epoch": 1.78, + "grad_norm": 10.142685890197754, + "learning_rate": 2.883e-07, + "loss": 0.4431, + "step": 178000 + }, + { + "epoch": 1.7805, + "grad_norm": 26.582061767578125, + "learning_rate": 2.881e-07, + "loss": 0.4004, + "step": 178050 + }, + { + "epoch": 1.7810000000000001, + "grad_norm": 73.92897033691406, + "learning_rate": 2.879e-07, + "loss": 0.5293, + "step": 178100 + }, + { + "epoch": 1.7814999999999999, + "grad_norm": 69.2878646850586, + "learning_rate": 2.877e-07, + "loss": 0.5733, + "step": 178150 + }, + { + "epoch": 1.782, + "grad_norm": 99.10835266113281, + "learning_rate": 2.8749999999999995e-07, + "loss": 0.5006, + "step": 178200 + }, + { + "epoch": 1.7825, + "grad_norm": 78.11844635009766, + "learning_rate": 2.873e-07, + "loss": 0.3794, + "step": 178250 + }, + { + "epoch": 1.783, + "grad_norm": 100.39323425292969, + "learning_rate": 2.871e-07, + "loss": 0.4381, + "step": 178300 + }, + { + "epoch": 1.7835, + "grad_norm": 19.325456619262695, + "learning_rate": 2.8689999999999996e-07, + "loss": 0.4126, + "step": 178350 + }, + { + "epoch": 1.784, + "grad_norm": 3.960777759552002, + "learning_rate": 2.867e-07, + "loss": 0.4078, + "step": 178400 + }, + { + "epoch": 1.7845, + "grad_norm": 59.112300872802734, + "learning_rate": 2.865e-07, + "loss": 0.3978, + "step": 178450 + }, + { + "epoch": 1.7850000000000001, + "grad_norm": 17.307994842529297, + "learning_rate": 2.8629999999999996e-07, + "loss": 0.4808, + "step": 178500 + }, + { + "epoch": 1.7854999999999999, + "grad_norm": 33.76523971557617, + "learning_rate": 2.861e-07, + "loss": 0.4348, + "step": 178550 + }, + { + "epoch": 1.786, + "grad_norm": 80.96756744384766, + "learning_rate": 2.859e-07, + "loss": 0.4833, + "step": 178600 + }, + { + "epoch": 1.7865, + "grad_norm": 48.94674301147461, + "learning_rate": 2.8569999999999997e-07, + "loss": 0.3485, + "step": 178650 + }, + { + "epoch": 1.787, + "grad_norm": 72.22645568847656, + "learning_rate": 2.8549999999999996e-07, + "loss": 0.4025, + "step": 178700 + }, + { + "epoch": 1.7875, + "grad_norm": 4.155689716339111, + "learning_rate": 2.853e-07, + "loss": 0.405, + "step": 178750 + }, + { + "epoch": 1.788, + "grad_norm": 0.55622398853302, + "learning_rate": 2.8510000000000003e-07, + "loss": 0.5012, + "step": 178800 + }, + { + "epoch": 1.7885, + "grad_norm": 67.4764404296875, + "learning_rate": 2.8489999999999996e-07, + "loss": 0.3643, + "step": 178850 + }, + { + "epoch": 1.7890000000000001, + "grad_norm": 85.40705108642578, + "learning_rate": 2.847e-07, + "loss": 0.5223, + "step": 178900 + }, + { + "epoch": 1.7894999999999999, + "grad_norm": 85.30449676513672, + "learning_rate": 2.845e-07, + "loss": 0.4813, + "step": 178950 + }, + { + "epoch": 1.79, + "grad_norm": 10.022859573364258, + "learning_rate": 2.8429999999999997e-07, + "loss": 0.4518, + "step": 179000 + }, + { + "epoch": 1.7905, + "grad_norm": 53.33081817626953, + "learning_rate": 2.841e-07, + "loss": 0.4076, + "step": 179050 + }, + { + "epoch": 1.791, + "grad_norm": 1.9716160297393799, + "learning_rate": 2.839e-07, + "loss": 0.4666, + "step": 179100 + }, + { + "epoch": 1.7915, + "grad_norm": 59.22763442993164, + "learning_rate": 2.837e-07, + "loss": 0.5778, + "step": 179150 + }, + { + "epoch": 1.792, + "grad_norm": 7.315980911254883, + "learning_rate": 2.8349999999999996e-07, + "loss": 0.4156, + "step": 179200 + }, + { + "epoch": 1.7925, + "grad_norm": 49.91332244873047, + "learning_rate": 2.833e-07, + "loss": 0.4988, + "step": 179250 + }, + { + "epoch": 1.7930000000000001, + "grad_norm": 65.8807144165039, + "learning_rate": 2.831e-07, + "loss": 0.5433, + "step": 179300 + }, + { + "epoch": 1.7934999999999999, + "grad_norm": 103.30567169189453, + "learning_rate": 2.8289999999999997e-07, + "loss": 0.5536, + "step": 179350 + }, + { + "epoch": 1.794, + "grad_norm": 86.77356719970703, + "learning_rate": 2.827e-07, + "loss": 0.5217, + "step": 179400 + }, + { + "epoch": 1.7945, + "grad_norm": 96.92404174804688, + "learning_rate": 2.8249999999999994e-07, + "loss": 0.4565, + "step": 179450 + }, + { + "epoch": 1.795, + "grad_norm": 14.877604484558105, + "learning_rate": 2.823e-07, + "loss": 0.4828, + "step": 179500 + }, + { + "epoch": 1.7955, + "grad_norm": 24.75067901611328, + "learning_rate": 2.821e-07, + "loss": 0.4715, + "step": 179550 + }, + { + "epoch": 1.796, + "grad_norm": 45.08053207397461, + "learning_rate": 2.8189999999999995e-07, + "loss": 0.4347, + "step": 179600 + }, + { + "epoch": 1.7965, + "grad_norm": 1.8589259386062622, + "learning_rate": 2.817e-07, + "loss": 0.4739, + "step": 179650 + }, + { + "epoch": 1.7970000000000002, + "grad_norm": 128.76307678222656, + "learning_rate": 2.8149999999999997e-07, + "loss": 0.4551, + "step": 179700 + }, + { + "epoch": 1.7974999999999999, + "grad_norm": 25.413040161132812, + "learning_rate": 2.8129999999999996e-07, + "loss": 0.335, + "step": 179750 + }, + { + "epoch": 1.798, + "grad_norm": 74.6392593383789, + "learning_rate": 2.811e-07, + "loss": 0.4635, + "step": 179800 + }, + { + "epoch": 1.7985, + "grad_norm": 69.69477844238281, + "learning_rate": 2.809e-07, + "loss": 0.4422, + "step": 179850 + }, + { + "epoch": 1.799, + "grad_norm": 12.26147747039795, + "learning_rate": 2.807e-07, + "loss": 0.4213, + "step": 179900 + }, + { + "epoch": 1.7995, + "grad_norm": 90.06591033935547, + "learning_rate": 2.805e-07, + "loss": 0.3711, + "step": 179950 + }, + { + "epoch": 1.8, + "grad_norm": 24.045799255371094, + "learning_rate": 2.803e-07, + "loss": 0.3808, + "step": 180000 + }, + { + "epoch": 1.8005, + "grad_norm": 42.867767333984375, + "learning_rate": 2.8010000000000003e-07, + "loss": 0.4572, + "step": 180050 + }, + { + "epoch": 1.8010000000000002, + "grad_norm": 63.34716033935547, + "learning_rate": 2.7989999999999996e-07, + "loss": 0.5186, + "step": 180100 + }, + { + "epoch": 1.8014999999999999, + "grad_norm": 56.33584976196289, + "learning_rate": 2.797e-07, + "loss": 0.5546, + "step": 180150 + }, + { + "epoch": 1.802, + "grad_norm": 81.60842895507812, + "learning_rate": 2.7950000000000003e-07, + "loss": 0.5097, + "step": 180200 + }, + { + "epoch": 1.8025, + "grad_norm": 12.385924339294434, + "learning_rate": 2.7929999999999997e-07, + "loss": 0.3856, + "step": 180250 + }, + { + "epoch": 1.803, + "grad_norm": 54.57160186767578, + "learning_rate": 2.791e-07, + "loss": 0.398, + "step": 180300 + }, + { + "epoch": 1.8035, + "grad_norm": 75.23123168945312, + "learning_rate": 2.789e-07, + "loss": 0.3756, + "step": 180350 + }, + { + "epoch": 1.804, + "grad_norm": 102.83467102050781, + "learning_rate": 2.787e-07, + "loss": 0.3975, + "step": 180400 + }, + { + "epoch": 1.8045, + "grad_norm": 150.63693237304688, + "learning_rate": 2.785e-07, + "loss": 0.2832, + "step": 180450 + }, + { + "epoch": 1.8050000000000002, + "grad_norm": 46.95927429199219, + "learning_rate": 2.783e-07, + "loss": 0.3984, + "step": 180500 + }, + { + "epoch": 1.8054999999999999, + "grad_norm": 103.29827117919922, + "learning_rate": 2.781e-07, + "loss": 0.3903, + "step": 180550 + }, + { + "epoch": 1.806, + "grad_norm": 6.826172351837158, + "learning_rate": 2.7789999999999997e-07, + "loss": 0.5537, + "step": 180600 + }, + { + "epoch": 1.8065, + "grad_norm": 71.7900161743164, + "learning_rate": 2.777e-07, + "loss": 0.421, + "step": 180650 + }, + { + "epoch": 1.807, + "grad_norm": 78.16989135742188, + "learning_rate": 2.775e-07, + "loss": 0.4313, + "step": 180700 + }, + { + "epoch": 1.8075, + "grad_norm": 64.42400360107422, + "learning_rate": 2.773e-07, + "loss": 0.4864, + "step": 180750 + }, + { + "epoch": 1.808, + "grad_norm": 27.222681045532227, + "learning_rate": 2.771e-07, + "loss": 0.3927, + "step": 180800 + }, + { + "epoch": 1.8085, + "grad_norm": 72.81607818603516, + "learning_rate": 2.7689999999999995e-07, + "loss": 0.4236, + "step": 180850 + }, + { + "epoch": 1.8090000000000002, + "grad_norm": 119.21035766601562, + "learning_rate": 2.767e-07, + "loss": 0.5258, + "step": 180900 + }, + { + "epoch": 1.8094999999999999, + "grad_norm": 93.99649810791016, + "learning_rate": 2.765e-07, + "loss": 0.425, + "step": 180950 + }, + { + "epoch": 1.81, + "grad_norm": 81.92257690429688, + "learning_rate": 2.7629999999999995e-07, + "loss": 0.4852, + "step": 181000 + }, + { + "epoch": 1.8105, + "grad_norm": 67.70683288574219, + "learning_rate": 2.761e-07, + "loss": 0.4518, + "step": 181050 + }, + { + "epoch": 1.811, + "grad_norm": 8.245046615600586, + "learning_rate": 2.759e-07, + "loss": 0.4221, + "step": 181100 + }, + { + "epoch": 1.8115, + "grad_norm": 0.5864673852920532, + "learning_rate": 2.757e-07, + "loss": 0.3862, + "step": 181150 + }, + { + "epoch": 1.812, + "grad_norm": 103.44923400878906, + "learning_rate": 2.755e-07, + "loss": 0.4913, + "step": 181200 + }, + { + "epoch": 1.8125, + "grad_norm": 0.6303176879882812, + "learning_rate": 2.753e-07, + "loss": 0.411, + "step": 181250 + }, + { + "epoch": 1.813, + "grad_norm": 50.171905517578125, + "learning_rate": 2.751e-07, + "loss": 0.3466, + "step": 181300 + }, + { + "epoch": 1.8135, + "grad_norm": 31.094560623168945, + "learning_rate": 2.7489999999999995e-07, + "loss": 0.4811, + "step": 181350 + }, + { + "epoch": 1.814, + "grad_norm": 3.8934226036071777, + "learning_rate": 2.747e-07, + "loss": 0.3938, + "step": 181400 + }, + { + "epoch": 1.8145, + "grad_norm": 37.404117584228516, + "learning_rate": 2.7450000000000003e-07, + "loss": 0.3969, + "step": 181450 + }, + { + "epoch": 1.815, + "grad_norm": 13.522012710571289, + "learning_rate": 2.7429999999999996e-07, + "loss": 0.4785, + "step": 181500 + }, + { + "epoch": 1.8155000000000001, + "grad_norm": 14.331076622009277, + "learning_rate": 2.741e-07, + "loss": 0.3068, + "step": 181550 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 26.77286148071289, + "learning_rate": 2.739e-07, + "loss": 0.337, + "step": 181600 + }, + { + "epoch": 1.8165, + "grad_norm": 48.4984245300293, + "learning_rate": 2.7369999999999997e-07, + "loss": 0.3276, + "step": 181650 + }, + { + "epoch": 1.817, + "grad_norm": 60.87279510498047, + "learning_rate": 2.735e-07, + "loss": 0.4779, + "step": 181700 + }, + { + "epoch": 1.8175, + "grad_norm": 51.6512336730957, + "learning_rate": 2.733e-07, + "loss": 0.3513, + "step": 181750 + }, + { + "epoch": 1.818, + "grad_norm": 6.495523929595947, + "learning_rate": 2.731e-07, + "loss": 0.5444, + "step": 181800 + }, + { + "epoch": 1.8185, + "grad_norm": 60.35252380371094, + "learning_rate": 2.7289999999999996e-07, + "loss": 0.4291, + "step": 181850 + }, + { + "epoch": 1.819, + "grad_norm": 28.022743225097656, + "learning_rate": 2.727e-07, + "loss": 0.3474, + "step": 181900 + }, + { + "epoch": 1.8195000000000001, + "grad_norm": 10.732136726379395, + "learning_rate": 2.725e-07, + "loss": 0.4097, + "step": 181950 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 85.77149200439453, + "learning_rate": 2.7229999999999997e-07, + "loss": 0.4671, + "step": 182000 + }, + { + "epoch": 1.8205, + "grad_norm": 69.69800567626953, + "learning_rate": 2.721e-07, + "loss": 0.4068, + "step": 182050 + }, + { + "epoch": 1.821, + "grad_norm": 67.32820129394531, + "learning_rate": 2.7189999999999994e-07, + "loss": 0.434, + "step": 182100 + }, + { + "epoch": 1.8215, + "grad_norm": 46.35198974609375, + "learning_rate": 2.717e-07, + "loss": 0.4207, + "step": 182150 + }, + { + "epoch": 1.822, + "grad_norm": 13.979206085205078, + "learning_rate": 2.715e-07, + "loss": 0.4041, + "step": 182200 + }, + { + "epoch": 1.8225, + "grad_norm": 3.441556453704834, + "learning_rate": 2.7129999999999995e-07, + "loss": 0.4985, + "step": 182250 + }, + { + "epoch": 1.823, + "grad_norm": 2.4120137691497803, + "learning_rate": 2.711e-07, + "loss": 0.5843, + "step": 182300 + }, + { + "epoch": 1.8235000000000001, + "grad_norm": 70.7459716796875, + "learning_rate": 2.7089999999999997e-07, + "loss": 0.3521, + "step": 182350 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 35.97295379638672, + "learning_rate": 2.707e-07, + "loss": 0.5621, + "step": 182400 + }, + { + "epoch": 1.8245, + "grad_norm": 89.31857299804688, + "learning_rate": 2.705e-07, + "loss": 0.3875, + "step": 182450 + }, + { + "epoch": 1.825, + "grad_norm": 91.55574035644531, + "learning_rate": 2.703e-07, + "loss": 0.4778, + "step": 182500 + }, + { + "epoch": 1.8255, + "grad_norm": 4.394865036010742, + "learning_rate": 2.701e-07, + "loss": 0.3469, + "step": 182550 + }, + { + "epoch": 1.826, + "grad_norm": 3.864041566848755, + "learning_rate": 2.6989999999999995e-07, + "loss": 0.3886, + "step": 182600 + }, + { + "epoch": 1.8265, + "grad_norm": 12.288315773010254, + "learning_rate": 2.697e-07, + "loss": 0.4518, + "step": 182650 + }, + { + "epoch": 1.827, + "grad_norm": 2.9489903450012207, + "learning_rate": 2.695e-07, + "loss": 0.3015, + "step": 182700 + }, + { + "epoch": 1.8275000000000001, + "grad_norm": 58.115970611572266, + "learning_rate": 2.6929999999999996e-07, + "loss": 0.3234, + "step": 182750 + }, + { + "epoch": 1.8279999999999998, + "grad_norm": 62.53365707397461, + "learning_rate": 2.691e-07, + "loss": 0.4949, + "step": 182800 + }, + { + "epoch": 1.8285, + "grad_norm": 38.055843353271484, + "learning_rate": 2.68904e-07, + "loss": 0.4362, + "step": 182850 + }, + { + "epoch": 1.829, + "grad_norm": 114.48504638671875, + "learning_rate": 2.6870399999999997e-07, + "loss": 0.4096, + "step": 182900 + }, + { + "epoch": 1.8295, + "grad_norm": 56.97817611694336, + "learning_rate": 2.68504e-07, + "loss": 0.4706, + "step": 182950 + }, + { + "epoch": 1.83, + "grad_norm": 100.63391876220703, + "learning_rate": 2.68304e-07, + "loss": 0.436, + "step": 183000 + }, + { + "epoch": 1.8305, + "grad_norm": 0.5884892344474792, + "learning_rate": 2.68104e-07, + "loss": 0.5211, + "step": 183050 + }, + { + "epoch": 1.831, + "grad_norm": 39.73606491088867, + "learning_rate": 2.6790399999999996e-07, + "loss": 0.3914, + "step": 183100 + }, + { + "epoch": 1.8315000000000001, + "grad_norm": 3.421617269515991, + "learning_rate": 2.67704e-07, + "loss": 0.3778, + "step": 183150 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 61.44535827636719, + "learning_rate": 2.67504e-07, + "loss": 0.5635, + "step": 183200 + }, + { + "epoch": 1.8325, + "grad_norm": 80.6475601196289, + "learning_rate": 2.6730399999999997e-07, + "loss": 0.491, + "step": 183250 + }, + { + "epoch": 1.833, + "grad_norm": 5.449416160583496, + "learning_rate": 2.67104e-07, + "loss": 0.3328, + "step": 183300 + }, + { + "epoch": 1.8335, + "grad_norm": 92.49075317382812, + "learning_rate": 2.6690399999999994e-07, + "loss": 0.4623, + "step": 183350 + }, + { + "epoch": 1.834, + "grad_norm": 7.8489990234375, + "learning_rate": 2.66704e-07, + "loss": 0.4183, + "step": 183400 + }, + { + "epoch": 1.8345, + "grad_norm": 10.370362281799316, + "learning_rate": 2.66504e-07, + "loss": 0.4563, + "step": 183450 + }, + { + "epoch": 1.835, + "grad_norm": 104.78883361816406, + "learning_rate": 2.6630399999999995e-07, + "loss": 0.4643, + "step": 183500 + }, + { + "epoch": 1.8355000000000001, + "grad_norm": 1.3594282865524292, + "learning_rate": 2.66104e-07, + "loss": 0.4734, + "step": 183550 + }, + { + "epoch": 1.8359999999999999, + "grad_norm": 1.5864472389221191, + "learning_rate": 2.6590399999999997e-07, + "loss": 0.3522, + "step": 183600 + }, + { + "epoch": 1.8365, + "grad_norm": 62.5477409362793, + "learning_rate": 2.65704e-07, + "loss": 0.3794, + "step": 183650 + }, + { + "epoch": 1.837, + "grad_norm": 53.555747985839844, + "learning_rate": 2.65504e-07, + "loss": 0.4603, + "step": 183700 + }, + { + "epoch": 1.8375, + "grad_norm": 81.1266098022461, + "learning_rate": 2.65304e-07, + "loss": 0.564, + "step": 183750 + }, + { + "epoch": 1.838, + "grad_norm": 40.866695404052734, + "learning_rate": 2.65104e-07, + "loss": 0.4407, + "step": 183800 + }, + { + "epoch": 1.8385, + "grad_norm": 78.56951904296875, + "learning_rate": 2.6490399999999995e-07, + "loss": 0.435, + "step": 183850 + }, + { + "epoch": 1.839, + "grad_norm": 6.286223888397217, + "learning_rate": 2.64704e-07, + "loss": 0.3209, + "step": 183900 + }, + { + "epoch": 1.8395000000000001, + "grad_norm": 21.405439376831055, + "learning_rate": 2.64504e-07, + "loss": 0.352, + "step": 183950 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 1.1134364604949951, + "learning_rate": 2.6430399999999996e-07, + "loss": 0.4044, + "step": 184000 + }, + { + "epoch": 1.8405, + "grad_norm": 0.49174389243125916, + "learning_rate": 2.64104e-07, + "loss": 0.2879, + "step": 184050 + }, + { + "epoch": 1.841, + "grad_norm": 7.193144798278809, + "learning_rate": 2.6390400000000003e-07, + "loss": 0.5735, + "step": 184100 + }, + { + "epoch": 1.8415, + "grad_norm": 109.04924774169922, + "learning_rate": 2.6370399999999996e-07, + "loss": 0.4442, + "step": 184150 + }, + { + "epoch": 1.842, + "grad_norm": 29.094871520996094, + "learning_rate": 2.63504e-07, + "loss": 0.5491, + "step": 184200 + }, + { + "epoch": 1.8425, + "grad_norm": 50.769309997558594, + "learning_rate": 2.63304e-07, + "loss": 0.3682, + "step": 184250 + }, + { + "epoch": 1.843, + "grad_norm": 93.74052429199219, + "learning_rate": 2.6310399999999997e-07, + "loss": 0.612, + "step": 184300 + }, + { + "epoch": 1.8435000000000001, + "grad_norm": 90.94390106201172, + "learning_rate": 2.62904e-07, + "loss": 0.5965, + "step": 184350 + }, + { + "epoch": 1.8439999999999999, + "grad_norm": 66.96981811523438, + "learning_rate": 2.62704e-07, + "loss": 0.4703, + "step": 184400 + }, + { + "epoch": 1.8445, + "grad_norm": 99.0244369506836, + "learning_rate": 2.62504e-07, + "loss": 0.4547, + "step": 184450 + }, + { + "epoch": 1.845, + "grad_norm": 75.41879272460938, + "learning_rate": 2.6230399999999996e-07, + "loss": 0.3646, + "step": 184500 + }, + { + "epoch": 1.8455, + "grad_norm": 15.815811157226562, + "learning_rate": 2.62104e-07, + "loss": 0.432, + "step": 184550 + }, + { + "epoch": 1.846, + "grad_norm": 86.7995376586914, + "learning_rate": 2.6190400000000004e-07, + "loss": 0.5396, + "step": 184600 + }, + { + "epoch": 1.8465, + "grad_norm": 43.598876953125, + "learning_rate": 2.6170399999999997e-07, + "loss": 0.4494, + "step": 184650 + }, + { + "epoch": 1.847, + "grad_norm": 55.9370231628418, + "learning_rate": 2.61504e-07, + "loss": 0.4301, + "step": 184700 + }, + { + "epoch": 1.8475000000000001, + "grad_norm": 91.31571197509766, + "learning_rate": 2.61304e-07, + "loss": 0.4079, + "step": 184750 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 2.9025886058807373, + "learning_rate": 2.61104e-07, + "loss": 0.4143, + "step": 184800 + }, + { + "epoch": 1.8485, + "grad_norm": 76.20918273925781, + "learning_rate": 2.60904e-07, + "loss": 0.3667, + "step": 184850 + }, + { + "epoch": 1.849, + "grad_norm": 65.22622680664062, + "learning_rate": 2.60704e-07, + "loss": 0.3588, + "step": 184900 + }, + { + "epoch": 1.8495, + "grad_norm": 100.45918273925781, + "learning_rate": 2.60504e-07, + "loss": 0.4632, + "step": 184950 + }, + { + "epoch": 1.85, + "grad_norm": 3.1237618923187256, + "learning_rate": 2.6030399999999997e-07, + "loss": 0.5477, + "step": 185000 + }, + { + "epoch": 1.8505, + "grad_norm": 64.92867279052734, + "learning_rate": 2.60104e-07, + "loss": 0.4146, + "step": 185050 + }, + { + "epoch": 1.851, + "grad_norm": 5.72701358795166, + "learning_rate": 2.59904e-07, + "loss": 0.3746, + "step": 185100 + }, + { + "epoch": 1.8515000000000001, + "grad_norm": 86.76318359375, + "learning_rate": 2.59704e-07, + "loss": 0.4661, + "step": 185150 + }, + { + "epoch": 1.8519999999999999, + "grad_norm": 68.64383697509766, + "learning_rate": 2.59504e-07, + "loss": 0.4119, + "step": 185200 + }, + { + "epoch": 1.8525, + "grad_norm": 10.763185501098633, + "learning_rate": 2.5930399999999995e-07, + "loss": 0.414, + "step": 185250 + }, + { + "epoch": 1.853, + "grad_norm": 1.534547209739685, + "learning_rate": 2.59104e-07, + "loss": 0.3791, + "step": 185300 + }, + { + "epoch": 1.8535, + "grad_norm": 6.461805820465088, + "learning_rate": 2.5890400000000003e-07, + "loss": 0.5048, + "step": 185350 + }, + { + "epoch": 1.854, + "grad_norm": 109.25067138671875, + "learning_rate": 2.5870399999999996e-07, + "loss": 0.5874, + "step": 185400 + }, + { + "epoch": 1.8545, + "grad_norm": 24.06308364868164, + "learning_rate": 2.58504e-07, + "loss": 0.3391, + "step": 185450 + }, + { + "epoch": 1.855, + "grad_norm": 0.7882880568504333, + "learning_rate": 2.58304e-07, + "loss": 0.4764, + "step": 185500 + }, + { + "epoch": 1.8555000000000001, + "grad_norm": 26.011205673217773, + "learning_rate": 2.5810399999999997e-07, + "loss": 0.4689, + "step": 185550 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 111.33613586425781, + "learning_rate": 2.57904e-07, + "loss": 0.4482, + "step": 185600 + }, + { + "epoch": 1.8565, + "grad_norm": 7.982728958129883, + "learning_rate": 2.57704e-07, + "loss": 0.4772, + "step": 185650 + }, + { + "epoch": 1.857, + "grad_norm": 68.1218032836914, + "learning_rate": 2.57504e-07, + "loss": 0.4142, + "step": 185700 + }, + { + "epoch": 1.8575, + "grad_norm": 22.570636749267578, + "learning_rate": 2.5730399999999996e-07, + "loss": 0.3669, + "step": 185750 + }, + { + "epoch": 1.858, + "grad_norm": 53.14181137084961, + "learning_rate": 2.57104e-07, + "loss": 0.3772, + "step": 185800 + }, + { + "epoch": 1.8585, + "grad_norm": 94.5253677368164, + "learning_rate": 2.5690400000000004e-07, + "loss": 0.5068, + "step": 185850 + }, + { + "epoch": 1.859, + "grad_norm": 57.95703887939453, + "learning_rate": 2.5670399999999997e-07, + "loss": 0.4562, + "step": 185900 + }, + { + "epoch": 1.8595000000000002, + "grad_norm": 125.20692443847656, + "learning_rate": 2.56504e-07, + "loss": 0.5206, + "step": 185950 + }, + { + "epoch": 1.8599999999999999, + "grad_norm": 76.74690246582031, + "learning_rate": 2.56304e-07, + "loss": 0.6174, + "step": 186000 + }, + { + "epoch": 1.8605, + "grad_norm": 36.07761764526367, + "learning_rate": 2.56104e-07, + "loss": 0.5131, + "step": 186050 + }, + { + "epoch": 1.861, + "grad_norm": 105.98657989501953, + "learning_rate": 2.55904e-07, + "loss": 0.4608, + "step": 186100 + }, + { + "epoch": 1.8615, + "grad_norm": 62.74296569824219, + "learning_rate": 2.55704e-07, + "loss": 0.3984, + "step": 186150 + }, + { + "epoch": 1.862, + "grad_norm": 7.824224948883057, + "learning_rate": 2.55504e-07, + "loss": 0.4741, + "step": 186200 + }, + { + "epoch": 1.8625, + "grad_norm": 0.2132396399974823, + "learning_rate": 2.5530399999999997e-07, + "loss": 0.4148, + "step": 186250 + }, + { + "epoch": 1.863, + "grad_norm": 63.773040771484375, + "learning_rate": 2.55104e-07, + "loss": 0.3911, + "step": 186300 + }, + { + "epoch": 1.8635000000000002, + "grad_norm": 35.12382888793945, + "learning_rate": 2.54904e-07, + "loss": 0.3641, + "step": 186350 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 13.937716484069824, + "learning_rate": 2.54704e-07, + "loss": 0.5185, + "step": 186400 + }, + { + "epoch": 1.8645, + "grad_norm": 139.38047790527344, + "learning_rate": 2.54504e-07, + "loss": 0.3617, + "step": 186450 + }, + { + "epoch": 1.865, + "grad_norm": 54.645015716552734, + "learning_rate": 2.5430399999999995e-07, + "loss": 0.4411, + "step": 186500 + }, + { + "epoch": 1.8655, + "grad_norm": 77.72416687011719, + "learning_rate": 2.54104e-07, + "loss": 0.4681, + "step": 186550 + }, + { + "epoch": 1.866, + "grad_norm": 75.3111801147461, + "learning_rate": 2.53904e-07, + "loss": 0.3701, + "step": 186600 + }, + { + "epoch": 1.8665, + "grad_norm": 90.5988540649414, + "learning_rate": 2.5370399999999995e-07, + "loss": 0.415, + "step": 186650 + }, + { + "epoch": 1.867, + "grad_norm": 7.769338130950928, + "learning_rate": 2.53504e-07, + "loss": 0.2835, + "step": 186700 + }, + { + "epoch": 1.8675000000000002, + "grad_norm": 38.08205032348633, + "learning_rate": 2.53304e-07, + "loss": 0.5307, + "step": 186750 + }, + { + "epoch": 1.8679999999999999, + "grad_norm": 17.543153762817383, + "learning_rate": 2.5310399999999996e-07, + "loss": 0.3548, + "step": 186800 + }, + { + "epoch": 1.8685, + "grad_norm": 10.238409996032715, + "learning_rate": 2.52904e-07, + "loss": 0.5478, + "step": 186850 + }, + { + "epoch": 1.869, + "grad_norm": 73.03883361816406, + "learning_rate": 2.52704e-07, + "loss": 0.4317, + "step": 186900 + }, + { + "epoch": 1.8695, + "grad_norm": 50.91599655151367, + "learning_rate": 2.5250399999999997e-07, + "loss": 0.4353, + "step": 186950 + }, + { + "epoch": 1.87, + "grad_norm": 1.1760220527648926, + "learning_rate": 2.5230399999999995e-07, + "loss": 0.4632, + "step": 187000 + }, + { + "epoch": 1.8705, + "grad_norm": 79.37187194824219, + "learning_rate": 2.52104e-07, + "loss": 0.4926, + "step": 187050 + }, + { + "epoch": 1.871, + "grad_norm": 60.9011344909668, + "learning_rate": 2.5190400000000003e-07, + "loss": 0.5786, + "step": 187100 + }, + { + "epoch": 1.8715000000000002, + "grad_norm": 108.72712707519531, + "learning_rate": 2.5170399999999996e-07, + "loss": 0.3917, + "step": 187150 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 34.94348907470703, + "learning_rate": 2.51504e-07, + "loss": 0.4065, + "step": 187200 + }, + { + "epoch": 1.8725, + "grad_norm": 18.728527069091797, + "learning_rate": 2.5130400000000004e-07, + "loss": 0.523, + "step": 187250 + }, + { + "epoch": 1.873, + "grad_norm": 68.10762023925781, + "learning_rate": 2.5110399999999997e-07, + "loss": 0.372, + "step": 187300 + }, + { + "epoch": 1.8735, + "grad_norm": 24.434104919433594, + "learning_rate": 2.50904e-07, + "loss": 0.3331, + "step": 187350 + }, + { + "epoch": 1.874, + "grad_norm": 27.29010772705078, + "learning_rate": 2.50708e-07, + "loss": 0.5148, + "step": 187400 + }, + { + "epoch": 1.8745, + "grad_norm": 19.589698791503906, + "learning_rate": 2.50508e-07, + "loss": 0.4057, + "step": 187450 + }, + { + "epoch": 1.875, + "grad_norm": 2.283752679824829, + "learning_rate": 2.5031199999999997e-07, + "loss": 0.3063, + "step": 187500 + }, + { + "epoch": 1.8755, + "grad_norm": 21.889986038208008, + "learning_rate": 2.50112e-07, + "loss": 0.3292, + "step": 187550 + }, + { + "epoch": 1.876, + "grad_norm": 2.8711414337158203, + "learning_rate": 2.49912e-07, + "loss": 0.4056, + "step": 187600 + }, + { + "epoch": 1.8765, + "grad_norm": 53.26054763793945, + "learning_rate": 2.49712e-07, + "loss": 0.4772, + "step": 187650 + }, + { + "epoch": 1.877, + "grad_norm": 10.286738395690918, + "learning_rate": 2.49512e-07, + "loss": 0.4867, + "step": 187700 + }, + { + "epoch": 1.8775, + "grad_norm": 93.31888580322266, + "learning_rate": 2.49312e-07, + "loss": 0.3756, + "step": 187750 + }, + { + "epoch": 1.8780000000000001, + "grad_norm": 86.73455810546875, + "learning_rate": 2.49112e-07, + "loss": 0.4222, + "step": 187800 + }, + { + "epoch": 1.8784999999999998, + "grad_norm": 7.344048023223877, + "learning_rate": 2.4891199999999997e-07, + "loss": 0.4104, + "step": 187850 + }, + { + "epoch": 1.879, + "grad_norm": 18.443588256835938, + "learning_rate": 2.4871199999999996e-07, + "loss": 0.5056, + "step": 187900 + }, + { + "epoch": 1.8795, + "grad_norm": 20.430662155151367, + "learning_rate": 2.48512e-07, + "loss": 0.7031, + "step": 187950 + }, + { + "epoch": 1.88, + "grad_norm": 98.71015930175781, + "learning_rate": 2.48312e-07, + "loss": 0.5261, + "step": 188000 + }, + { + "epoch": 1.8805, + "grad_norm": 103.14667510986328, + "learning_rate": 2.4811199999999997e-07, + "loss": 0.4343, + "step": 188050 + }, + { + "epoch": 1.881, + "grad_norm": 82.66529083251953, + "learning_rate": 2.47912e-07, + "loss": 0.4226, + "step": 188100 + }, + { + "epoch": 1.8815, + "grad_norm": 11.182500839233398, + "learning_rate": 2.47712e-07, + "loss": 0.5237, + "step": 188150 + }, + { + "epoch": 1.8820000000000001, + "grad_norm": 74.19300842285156, + "learning_rate": 2.4751199999999997e-07, + "loss": 0.5641, + "step": 188200 + }, + { + "epoch": 1.8824999999999998, + "grad_norm": 36.28932571411133, + "learning_rate": 2.47312e-07, + "loss": 0.4777, + "step": 188250 + }, + { + "epoch": 1.883, + "grad_norm": 80.10577392578125, + "learning_rate": 2.47112e-07, + "loss": 0.5638, + "step": 188300 + }, + { + "epoch": 1.8835, + "grad_norm": 48.6932373046875, + "learning_rate": 2.46912e-07, + "loss": 0.3765, + "step": 188350 + }, + { + "epoch": 1.884, + "grad_norm": 2.2516491413116455, + "learning_rate": 2.4671199999999997e-07, + "loss": 0.5383, + "step": 188400 + }, + { + "epoch": 1.8845, + "grad_norm": 51.581634521484375, + "learning_rate": 2.46512e-07, + "loss": 0.5575, + "step": 188450 + }, + { + "epoch": 1.885, + "grad_norm": 29.83396339416504, + "learning_rate": 2.46312e-07, + "loss": 0.3791, + "step": 188500 + }, + { + "epoch": 1.8855, + "grad_norm": 30.861713409423828, + "learning_rate": 2.46112e-07, + "loss": 0.3886, + "step": 188550 + }, + { + "epoch": 1.8860000000000001, + "grad_norm": 0.5609220862388611, + "learning_rate": 2.4591199999999996e-07, + "loss": 0.4782, + "step": 188600 + }, + { + "epoch": 1.8864999999999998, + "grad_norm": 99.23149108886719, + "learning_rate": 2.45712e-07, + "loss": 0.4121, + "step": 188650 + }, + { + "epoch": 1.887, + "grad_norm": 75.1487808227539, + "learning_rate": 2.45512e-07, + "loss": 0.3858, + "step": 188700 + }, + { + "epoch": 1.8875, + "grad_norm": 52.555572509765625, + "learning_rate": 2.4531199999999997e-07, + "loss": 0.4526, + "step": 188750 + }, + { + "epoch": 1.888, + "grad_norm": 3.6657705307006836, + "learning_rate": 2.45112e-07, + "loss": 0.3795, + "step": 188800 + }, + { + "epoch": 1.8885, + "grad_norm": 53.775360107421875, + "learning_rate": 2.44912e-07, + "loss": 0.4186, + "step": 188850 + }, + { + "epoch": 1.889, + "grad_norm": 82.37354278564453, + "learning_rate": 2.4471200000000003e-07, + "loss": 0.5195, + "step": 188900 + }, + { + "epoch": 1.8895, + "grad_norm": 71.7086181640625, + "learning_rate": 2.44512e-07, + "loss": 0.4569, + "step": 188950 + }, + { + "epoch": 1.8900000000000001, + "grad_norm": 76.88038635253906, + "learning_rate": 2.44312e-07, + "loss": 0.3721, + "step": 189000 + }, + { + "epoch": 1.8904999999999998, + "grad_norm": 16.016008377075195, + "learning_rate": 2.44112e-07, + "loss": 0.4164, + "step": 189050 + }, + { + "epoch": 1.891, + "grad_norm": 71.51254272460938, + "learning_rate": 2.4391199999999997e-07, + "loss": 0.4399, + "step": 189100 + }, + { + "epoch": 1.8915, + "grad_norm": 9.580039024353027, + "learning_rate": 2.43712e-07, + "loss": 0.3662, + "step": 189150 + }, + { + "epoch": 1.892, + "grad_norm": 75.79584503173828, + "learning_rate": 2.43512e-07, + "loss": 0.3707, + "step": 189200 + }, + { + "epoch": 1.8925, + "grad_norm": 86.73115539550781, + "learning_rate": 2.43312e-07, + "loss": 0.4684, + "step": 189250 + }, + { + "epoch": 1.893, + "grad_norm": 68.81775665283203, + "learning_rate": 2.4311199999999996e-07, + "loss": 0.3968, + "step": 189300 + }, + { + "epoch": 1.8935, + "grad_norm": 62.46356201171875, + "learning_rate": 2.42912e-07, + "loss": 0.3978, + "step": 189350 + }, + { + "epoch": 1.8940000000000001, + "grad_norm": 78.36255645751953, + "learning_rate": 2.42712e-07, + "loss": 0.4894, + "step": 189400 + }, + { + "epoch": 1.8944999999999999, + "grad_norm": 96.74028778076172, + "learning_rate": 2.4251199999999997e-07, + "loss": 0.3728, + "step": 189450 + }, + { + "epoch": 1.895, + "grad_norm": 92.6654052734375, + "learning_rate": 2.42312e-07, + "loss": 0.3868, + "step": 189500 + }, + { + "epoch": 1.8955, + "grad_norm": 38.866249084472656, + "learning_rate": 2.42112e-07, + "loss": 0.3451, + "step": 189550 + }, + { + "epoch": 1.896, + "grad_norm": 63.626548767089844, + "learning_rate": 2.41912e-07, + "loss": 0.4468, + "step": 189600 + }, + { + "epoch": 1.8965, + "grad_norm": 65.61091613769531, + "learning_rate": 2.41712e-07, + "loss": 0.3838, + "step": 189650 + }, + { + "epoch": 1.897, + "grad_norm": 7.437851428985596, + "learning_rate": 2.41512e-07, + "loss": 0.3055, + "step": 189700 + }, + { + "epoch": 1.8975, + "grad_norm": 115.52892303466797, + "learning_rate": 2.41312e-07, + "loss": 0.3372, + "step": 189750 + }, + { + "epoch": 1.8980000000000001, + "grad_norm": 18.40287971496582, + "learning_rate": 2.4111199999999997e-07, + "loss": 0.3989, + "step": 189800 + }, + { + "epoch": 1.8984999999999999, + "grad_norm": 9.118658065795898, + "learning_rate": 2.4091199999999995e-07, + "loss": 0.4081, + "step": 189850 + }, + { + "epoch": 1.899, + "grad_norm": 18.847917556762695, + "learning_rate": 2.40712e-07, + "loss": 0.395, + "step": 189900 + }, + { + "epoch": 1.8995, + "grad_norm": 36.21388626098633, + "learning_rate": 2.40512e-07, + "loss": 0.3841, + "step": 189950 + }, + { + "epoch": 1.9, + "grad_norm": 65.34298706054688, + "learning_rate": 2.4031199999999996e-07, + "loss": 0.4045, + "step": 190000 + }, + { + "epoch": 1.9005, + "grad_norm": 90.31636047363281, + "learning_rate": 2.40112e-07, + "loss": 0.3998, + "step": 190050 + }, + { + "epoch": 1.901, + "grad_norm": 80.00827026367188, + "learning_rate": 2.39912e-07, + "loss": 0.505, + "step": 190100 + }, + { + "epoch": 1.9015, + "grad_norm": 47.44596862792969, + "learning_rate": 2.39712e-07, + "loss": 0.3921, + "step": 190150 + }, + { + "epoch": 1.9020000000000001, + "grad_norm": 19.611328125, + "learning_rate": 2.39512e-07, + "loss": 0.3527, + "step": 190200 + }, + { + "epoch": 1.9024999999999999, + "grad_norm": 111.60746765136719, + "learning_rate": 2.39316e-07, + "loss": 0.4237, + "step": 190250 + }, + { + "epoch": 1.903, + "grad_norm": 0.5896288156509399, + "learning_rate": 2.39116e-07, + "loss": 0.6023, + "step": 190300 + }, + { + "epoch": 1.9035, + "grad_norm": 6.561922550201416, + "learning_rate": 2.3891599999999997e-07, + "loss": 0.4336, + "step": 190350 + }, + { + "epoch": 1.904, + "grad_norm": 72.0843734741211, + "learning_rate": 2.38716e-07, + "loss": 0.4817, + "step": 190400 + }, + { + "epoch": 1.9045, + "grad_norm": 82.44991302490234, + "learning_rate": 2.38516e-07, + "loss": 0.5264, + "step": 190450 + }, + { + "epoch": 1.905, + "grad_norm": 65.08079528808594, + "learning_rate": 2.38316e-07, + "loss": 0.4905, + "step": 190500 + }, + { + "epoch": 1.9055, + "grad_norm": 52.838294982910156, + "learning_rate": 2.3811599999999999e-07, + "loss": 0.5025, + "step": 190550 + }, + { + "epoch": 1.9060000000000001, + "grad_norm": 16.74088478088379, + "learning_rate": 2.3791599999999997e-07, + "loss": 0.3235, + "step": 190600 + }, + { + "epoch": 1.9064999999999999, + "grad_norm": 37.48681640625, + "learning_rate": 2.37716e-07, + "loss": 0.4151, + "step": 190650 + }, + { + "epoch": 1.907, + "grad_norm": 84.55889892578125, + "learning_rate": 2.37516e-07, + "loss": 0.3849, + "step": 190700 + }, + { + "epoch": 1.9075, + "grad_norm": 13.987595558166504, + "learning_rate": 2.3731599999999998e-07, + "loss": 0.3796, + "step": 190750 + }, + { + "epoch": 1.908, + "grad_norm": 63.06429672241211, + "learning_rate": 2.37116e-07, + "loss": 0.3547, + "step": 190800 + }, + { + "epoch": 1.9085, + "grad_norm": 41.80989456176758, + "learning_rate": 2.3691599999999998e-07, + "loss": 0.5286, + "step": 190850 + }, + { + "epoch": 1.909, + "grad_norm": 83.75984954833984, + "learning_rate": 2.36716e-07, + "loss": 0.3391, + "step": 190900 + }, + { + "epoch": 1.9095, + "grad_norm": 51.583290100097656, + "learning_rate": 2.36516e-07, + "loss": 0.3597, + "step": 190950 + }, + { + "epoch": 1.9100000000000001, + "grad_norm": 65.20770263671875, + "learning_rate": 2.3631599999999998e-07, + "loss": 0.5186, + "step": 191000 + }, + { + "epoch": 1.9104999999999999, + "grad_norm": 123.32764434814453, + "learning_rate": 2.36116e-07, + "loss": 0.4346, + "step": 191050 + }, + { + "epoch": 1.911, + "grad_norm": 41.057682037353516, + "learning_rate": 2.3591599999999998e-07, + "loss": 0.4873, + "step": 191100 + }, + { + "epoch": 1.9115, + "grad_norm": 40.01293182373047, + "learning_rate": 2.35716e-07, + "loss": 0.4443, + "step": 191150 + }, + { + "epoch": 1.912, + "grad_norm": 0.16099533438682556, + "learning_rate": 2.35516e-07, + "loss": 0.3952, + "step": 191200 + }, + { + "epoch": 1.9125, + "grad_norm": 48.290771484375, + "learning_rate": 2.35316e-07, + "loss": 0.3768, + "step": 191250 + }, + { + "epoch": 1.913, + "grad_norm": 0.869463324546814, + "learning_rate": 2.3511599999999997e-07, + "loss": 0.5064, + "step": 191300 + }, + { + "epoch": 1.9135, + "grad_norm": 174.40245056152344, + "learning_rate": 2.34916e-07, + "loss": 0.4593, + "step": 191350 + }, + { + "epoch": 1.9140000000000001, + "grad_norm": 64.9591293334961, + "learning_rate": 2.34716e-07, + "loss": 0.4874, + "step": 191400 + }, + { + "epoch": 1.9144999999999999, + "grad_norm": 11.59118366241455, + "learning_rate": 2.3451599999999998e-07, + "loss": 0.5005, + "step": 191450 + }, + { + "epoch": 1.915, + "grad_norm": 108.15178680419922, + "learning_rate": 2.34316e-07, + "loss": 0.5062, + "step": 191500 + }, + { + "epoch": 1.9155, + "grad_norm": 0.83865886926651, + "learning_rate": 2.3411599999999998e-07, + "loss": 0.3885, + "step": 191550 + }, + { + "epoch": 1.916, + "grad_norm": 85.95867919921875, + "learning_rate": 2.3391600000000001e-07, + "loss": 0.3974, + "step": 191600 + }, + { + "epoch": 1.9165, + "grad_norm": 31.9445858001709, + "learning_rate": 2.33716e-07, + "loss": 0.34, + "step": 191650 + }, + { + "epoch": 1.917, + "grad_norm": 2.4149677753448486, + "learning_rate": 2.3351599999999998e-07, + "loss": 0.6457, + "step": 191700 + }, + { + "epoch": 1.9175, + "grad_norm": 18.962900161743164, + "learning_rate": 2.33316e-07, + "loss": 0.5091, + "step": 191750 + }, + { + "epoch": 1.9180000000000001, + "grad_norm": 73.12571716308594, + "learning_rate": 2.3311599999999998e-07, + "loss": 0.4173, + "step": 191800 + }, + { + "epoch": 1.9184999999999999, + "grad_norm": 5.990804195404053, + "learning_rate": 2.32916e-07, + "loss": 0.4436, + "step": 191850 + }, + { + "epoch": 1.919, + "grad_norm": 70.40878295898438, + "learning_rate": 2.32716e-07, + "loss": 0.4218, + "step": 191900 + }, + { + "epoch": 1.9195, + "grad_norm": 24.912353515625, + "learning_rate": 2.32516e-07, + "loss": 0.5248, + "step": 191950 + }, + { + "epoch": 1.92, + "grad_norm": 24.189905166625977, + "learning_rate": 2.3231599999999997e-07, + "loss": 0.4552, + "step": 192000 + }, + { + "epoch": 1.9205, + "grad_norm": 56.80104064941406, + "learning_rate": 2.3211599999999999e-07, + "loss": 0.5258, + "step": 192050 + }, + { + "epoch": 1.921, + "grad_norm": 69.89954376220703, + "learning_rate": 2.31916e-07, + "loss": 0.4205, + "step": 192100 + }, + { + "epoch": 1.9215, + "grad_norm": 72.8506851196289, + "learning_rate": 2.3171599999999998e-07, + "loss": 0.4343, + "step": 192150 + }, + { + "epoch": 1.9220000000000002, + "grad_norm": 77.9870376586914, + "learning_rate": 2.31516e-07, + "loss": 0.3266, + "step": 192200 + }, + { + "epoch": 1.9224999999999999, + "grad_norm": 128.5535430908203, + "learning_rate": 2.3131599999999998e-07, + "loss": 0.5274, + "step": 192250 + }, + { + "epoch": 1.923, + "grad_norm": 34.5822868347168, + "learning_rate": 2.31116e-07, + "loss": 0.4293, + "step": 192300 + }, + { + "epoch": 1.9235, + "grad_norm": 1.127679467201233, + "learning_rate": 2.30916e-07, + "loss": 0.5053, + "step": 192350 + }, + { + "epoch": 1.924, + "grad_norm": 122.0609130859375, + "learning_rate": 2.3071599999999999e-07, + "loss": 0.4999, + "step": 192400 + }, + { + "epoch": 1.9245, + "grad_norm": 4.369318962097168, + "learning_rate": 2.30516e-07, + "loss": 0.375, + "step": 192450 + }, + { + "epoch": 1.925, + "grad_norm": 16.14723014831543, + "learning_rate": 2.3031599999999998e-07, + "loss": 0.4943, + "step": 192500 + }, + { + "epoch": 1.9255, + "grad_norm": 9.635993957519531, + "learning_rate": 2.3011599999999997e-07, + "loss": 0.3829, + "step": 192550 + }, + { + "epoch": 1.9260000000000002, + "grad_norm": 37.19770050048828, + "learning_rate": 2.29916e-07, + "loss": 0.441, + "step": 192600 + }, + { + "epoch": 1.9264999999999999, + "grad_norm": 100.09040069580078, + "learning_rate": 2.29716e-07, + "loss": 0.4527, + "step": 192650 + }, + { + "epoch": 1.927, + "grad_norm": 73.9195785522461, + "learning_rate": 2.2951599999999998e-07, + "loss": 0.4453, + "step": 192700 + }, + { + "epoch": 1.9275, + "grad_norm": 5.231517314910889, + "learning_rate": 2.29316e-07, + "loss": 0.4418, + "step": 192750 + }, + { + "epoch": 1.928, + "grad_norm": 86.26935577392578, + "learning_rate": 2.2911599999999997e-07, + "loss": 0.4282, + "step": 192800 + }, + { + "epoch": 1.9285, + "grad_norm": 20.850133895874023, + "learning_rate": 2.28916e-07, + "loss": 0.5655, + "step": 192850 + }, + { + "epoch": 1.929, + "grad_norm": 0.3520480692386627, + "learning_rate": 2.28716e-07, + "loss": 0.409, + "step": 192900 + }, + { + "epoch": 1.9295, + "grad_norm": 3.128180503845215, + "learning_rate": 2.2851599999999998e-07, + "loss": 0.4842, + "step": 192950 + }, + { + "epoch": 1.9300000000000002, + "grad_norm": 122.73941040039062, + "learning_rate": 2.28316e-07, + "loss": 0.4514, + "step": 193000 + }, + { + "epoch": 1.9304999999999999, + "grad_norm": 9.219717979431152, + "learning_rate": 2.28116e-07, + "loss": 0.3849, + "step": 193050 + }, + { + "epoch": 1.931, + "grad_norm": 12.25208854675293, + "learning_rate": 2.27916e-07, + "loss": 0.4238, + "step": 193100 + }, + { + "epoch": 1.9315, + "grad_norm": 92.9471435546875, + "learning_rate": 2.27716e-07, + "loss": 0.3785, + "step": 193150 + }, + { + "epoch": 1.932, + "grad_norm": 35.56284713745117, + "learning_rate": 2.2751599999999998e-07, + "loss": 0.4077, + "step": 193200 + }, + { + "epoch": 1.9325, + "grad_norm": 85.1362075805664, + "learning_rate": 2.2731599999999997e-07, + "loss": 0.4885, + "step": 193250 + }, + { + "epoch": 1.933, + "grad_norm": 17.76969337463379, + "learning_rate": 2.2711999999999998e-07, + "loss": 0.5319, + "step": 193300 + }, + { + "epoch": 1.9335, + "grad_norm": 77.77349853515625, + "learning_rate": 2.2692399999999997e-07, + "loss": 0.6296, + "step": 193350 + }, + { + "epoch": 1.9340000000000002, + "grad_norm": 1.1785967350006104, + "learning_rate": 2.2672399999999999e-07, + "loss": 0.4844, + "step": 193400 + }, + { + "epoch": 1.9344999999999999, + "grad_norm": 36.741268157958984, + "learning_rate": 2.26524e-07, + "loss": 0.3688, + "step": 193450 + }, + { + "epoch": 1.935, + "grad_norm": 83.8866195678711, + "learning_rate": 2.2632399999999998e-07, + "loss": 0.5385, + "step": 193500 + }, + { + "epoch": 1.9355, + "grad_norm": 67.07383728027344, + "learning_rate": 2.26124e-07, + "loss": 0.3809, + "step": 193550 + }, + { + "epoch": 1.936, + "grad_norm": 61.0508918762207, + "learning_rate": 2.25924e-07, + "loss": 0.362, + "step": 193600 + }, + { + "epoch": 1.9365, + "grad_norm": 95.52999877929688, + "learning_rate": 2.25724e-07, + "loss": 0.4692, + "step": 193650 + }, + { + "epoch": 1.937, + "grad_norm": 2.942073345184326, + "learning_rate": 2.25524e-07, + "loss": 0.4602, + "step": 193700 + }, + { + "epoch": 1.9375, + "grad_norm": 80.84626770019531, + "learning_rate": 2.2532399999999999e-07, + "loss": 0.6496, + "step": 193750 + }, + { + "epoch": 1.938, + "grad_norm": 15.654928207397461, + "learning_rate": 2.2512399999999997e-07, + "loss": 0.3803, + "step": 193800 + }, + { + "epoch": 1.9385, + "grad_norm": 61.12825393676758, + "learning_rate": 2.24924e-07, + "loss": 0.4215, + "step": 193850 + }, + { + "epoch": 1.939, + "grad_norm": 64.433837890625, + "learning_rate": 2.24724e-07, + "loss": 0.394, + "step": 193900 + }, + { + "epoch": 1.9395, + "grad_norm": 7.063285827636719, + "learning_rate": 2.2452399999999998e-07, + "loss": 0.4053, + "step": 193950 + }, + { + "epoch": 1.94, + "grad_norm": 67.11646270751953, + "learning_rate": 2.24324e-07, + "loss": 0.5138, + "step": 194000 + }, + { + "epoch": 1.9405000000000001, + "grad_norm": 28.027524948120117, + "learning_rate": 2.2412399999999998e-07, + "loss": 0.3441, + "step": 194050 + }, + { + "epoch": 1.9409999999999998, + "grad_norm": 124.71054077148438, + "learning_rate": 2.2392400000000001e-07, + "loss": 0.4387, + "step": 194100 + }, + { + "epoch": 1.9415, + "grad_norm": 53.5966911315918, + "learning_rate": 2.23724e-07, + "loss": 0.3617, + "step": 194150 + }, + { + "epoch": 1.942, + "grad_norm": 52.884151458740234, + "learning_rate": 2.2352399999999998e-07, + "loss": 0.5952, + "step": 194200 + }, + { + "epoch": 1.9425, + "grad_norm": 5.888815879821777, + "learning_rate": 2.23324e-07, + "loss": 0.4729, + "step": 194250 + }, + { + "epoch": 1.943, + "grad_norm": 50.12318420410156, + "learning_rate": 2.2312399999999998e-07, + "loss": 0.443, + "step": 194300 + }, + { + "epoch": 1.9435, + "grad_norm": 53.48051452636719, + "learning_rate": 2.22924e-07, + "loss": 0.4987, + "step": 194350 + }, + { + "epoch": 1.944, + "grad_norm": 40.23417663574219, + "learning_rate": 2.22724e-07, + "loss": 0.477, + "step": 194400 + }, + { + "epoch": 1.9445000000000001, + "grad_norm": 0.8471763730049133, + "learning_rate": 2.22524e-07, + "loss": 0.5448, + "step": 194450 + }, + { + "epoch": 1.9449999999999998, + "grad_norm": 3.5994434356689453, + "learning_rate": 2.2232399999999997e-07, + "loss": 0.3093, + "step": 194500 + }, + { + "epoch": 1.9455, + "grad_norm": 88.96204376220703, + "learning_rate": 2.2212399999999998e-07, + "loss": 0.447, + "step": 194550 + }, + { + "epoch": 1.946, + "grad_norm": 3.4328107833862305, + "learning_rate": 2.21924e-07, + "loss": 0.4485, + "step": 194600 + }, + { + "epoch": 1.9465, + "grad_norm": 0.81305330991745, + "learning_rate": 2.21724e-07, + "loss": 0.3417, + "step": 194650 + }, + { + "epoch": 1.947, + "grad_norm": 14.793620109558105, + "learning_rate": 2.21524e-07, + "loss": 0.2682, + "step": 194700 + }, + { + "epoch": 1.9475, + "grad_norm": 71.23736572265625, + "learning_rate": 2.2132399999999998e-07, + "loss": 0.5289, + "step": 194750 + }, + { + "epoch": 1.948, + "grad_norm": 89.07301330566406, + "learning_rate": 2.21124e-07, + "loss": 0.4109, + "step": 194800 + }, + { + "epoch": 1.9485000000000001, + "grad_norm": 169.87252807617188, + "learning_rate": 2.20924e-07, + "loss": 0.5863, + "step": 194850 + }, + { + "epoch": 1.9489999999999998, + "grad_norm": 107.96073150634766, + "learning_rate": 2.2072399999999999e-07, + "loss": 0.416, + "step": 194900 + }, + { + "epoch": 1.9495, + "grad_norm": 10.294443130493164, + "learning_rate": 2.20524e-07, + "loss": 0.4665, + "step": 194950 + }, + { + "epoch": 1.95, + "grad_norm": 62.89139175415039, + "learning_rate": 2.2032399999999998e-07, + "loss": 0.5773, + "step": 195000 + }, + { + "epoch": 1.9505, + "grad_norm": 146.08712768554688, + "learning_rate": 2.2012399999999997e-07, + "loss": 0.5096, + "step": 195050 + }, + { + "epoch": 1.951, + "grad_norm": 5.977169990539551, + "learning_rate": 2.19924e-07, + "loss": 0.4419, + "step": 195100 + }, + { + "epoch": 1.9515, + "grad_norm": 34.73382568359375, + "learning_rate": 2.19724e-07, + "loss": 0.4073, + "step": 195150 + }, + { + "epoch": 1.952, + "grad_norm": 111.31939697265625, + "learning_rate": 2.1952399999999997e-07, + "loss": 0.4008, + "step": 195200 + }, + { + "epoch": 1.9525000000000001, + "grad_norm": 28.672367095947266, + "learning_rate": 2.1932399999999999e-07, + "loss": 0.3195, + "step": 195250 + }, + { + "epoch": 1.9529999999999998, + "grad_norm": 51.09148025512695, + "learning_rate": 2.19124e-07, + "loss": 0.3417, + "step": 195300 + }, + { + "epoch": 1.9535, + "grad_norm": 13.32286262512207, + "learning_rate": 2.18924e-07, + "loss": 0.539, + "step": 195350 + }, + { + "epoch": 1.954, + "grad_norm": 89.55329895019531, + "learning_rate": 2.18724e-07, + "loss": 0.3611, + "step": 195400 + }, + { + "epoch": 1.9545, + "grad_norm": 112.88272857666016, + "learning_rate": 2.1852399999999998e-07, + "loss": 0.496, + "step": 195450 + }, + { + "epoch": 1.955, + "grad_norm": 34.98467254638672, + "learning_rate": 2.18324e-07, + "loss": 0.3647, + "step": 195500 + }, + { + "epoch": 1.9555, + "grad_norm": 25.51811981201172, + "learning_rate": 2.18124e-07, + "loss": 0.2741, + "step": 195550 + }, + { + "epoch": 1.956, + "grad_norm": 35.12691879272461, + "learning_rate": 2.1792399999999999e-07, + "loss": 0.5114, + "step": 195600 + }, + { + "epoch": 1.9565000000000001, + "grad_norm": 92.44993591308594, + "learning_rate": 2.17724e-07, + "loss": 0.4415, + "step": 195650 + }, + { + "epoch": 1.9569999999999999, + "grad_norm": 58.697418212890625, + "learning_rate": 2.1752399999999998e-07, + "loss": 0.5425, + "step": 195700 + }, + { + "epoch": 1.9575, + "grad_norm": 70.51270294189453, + "learning_rate": 2.1732399999999997e-07, + "loss": 0.4504, + "step": 195750 + }, + { + "epoch": 1.958, + "grad_norm": 100.30704498291016, + "learning_rate": 2.17124e-07, + "loss": 0.5119, + "step": 195800 + }, + { + "epoch": 1.9585, + "grad_norm": 74.18003845214844, + "learning_rate": 2.16924e-07, + "loss": 0.3975, + "step": 195850 + }, + { + "epoch": 1.959, + "grad_norm": 67.8548583984375, + "learning_rate": 2.16724e-07, + "loss": 0.3993, + "step": 195900 + }, + { + "epoch": 1.9595, + "grad_norm": 25.4310359954834, + "learning_rate": 2.16524e-07, + "loss": 0.4111, + "step": 195950 + }, + { + "epoch": 1.96, + "grad_norm": 3.258418321609497, + "learning_rate": 2.1632399999999997e-07, + "loss": 0.4154, + "step": 196000 + }, + { + "epoch": 1.9605000000000001, + "grad_norm": 20.04966163635254, + "learning_rate": 2.16124e-07, + "loss": 0.4066, + "step": 196050 + }, + { + "epoch": 1.9609999999999999, + "grad_norm": 45.66105651855469, + "learning_rate": 2.15924e-07, + "loss": 0.503, + "step": 196100 + }, + { + "epoch": 1.9615, + "grad_norm": 46.578895568847656, + "learning_rate": 2.1572399999999998e-07, + "loss": 0.4432, + "step": 196150 + }, + { + "epoch": 1.962, + "grad_norm": 85.224609375, + "learning_rate": 2.15524e-07, + "loss": 0.5397, + "step": 196200 + }, + { + "epoch": 1.9625, + "grad_norm": 2.3414793014526367, + "learning_rate": 2.1532399999999998e-07, + "loss": 0.3125, + "step": 196250 + }, + { + "epoch": 1.963, + "grad_norm": 4.292512893676758, + "learning_rate": 2.15124e-07, + "loss": 0.4941, + "step": 196300 + }, + { + "epoch": 1.9635, + "grad_norm": 67.12167358398438, + "learning_rate": 2.14924e-07, + "loss": 0.4186, + "step": 196350 + }, + { + "epoch": 1.964, + "grad_norm": 74.41678619384766, + "learning_rate": 2.1472399999999998e-07, + "loss": 0.4799, + "step": 196400 + }, + { + "epoch": 1.9645000000000001, + "grad_norm": 55.400428771972656, + "learning_rate": 2.14524e-07, + "loss": 0.3932, + "step": 196450 + }, + { + "epoch": 1.9649999999999999, + "grad_norm": 89.4749526977539, + "learning_rate": 2.1432399999999998e-07, + "loss": 0.409, + "step": 196500 + }, + { + "epoch": 1.9655, + "grad_norm": 41.83777618408203, + "learning_rate": 2.14124e-07, + "loss": 0.3961, + "step": 196550 + }, + { + "epoch": 1.966, + "grad_norm": 46.31528854370117, + "learning_rate": 2.13924e-07, + "loss": 0.5221, + "step": 196600 + }, + { + "epoch": 1.9665, + "grad_norm": 64.41300964355469, + "learning_rate": 2.13724e-07, + "loss": 0.3646, + "step": 196650 + }, + { + "epoch": 1.967, + "grad_norm": 73.85462951660156, + "learning_rate": 2.1352399999999997e-07, + "loss": 0.496, + "step": 196700 + }, + { + "epoch": 1.9675, + "grad_norm": 15.591062545776367, + "learning_rate": 2.13324e-07, + "loss": 0.447, + "step": 196750 + }, + { + "epoch": 1.968, + "grad_norm": 48.92272186279297, + "learning_rate": 2.13124e-07, + "loss": 0.4293, + "step": 196800 + }, + { + "epoch": 1.9685000000000001, + "grad_norm": 1.0661996603012085, + "learning_rate": 2.1292399999999998e-07, + "loss": 0.4461, + "step": 196850 + }, + { + "epoch": 1.9689999999999999, + "grad_norm": 3.930109739303589, + "learning_rate": 2.12724e-07, + "loss": 0.5336, + "step": 196900 + }, + { + "epoch": 1.9695, + "grad_norm": 59.66569519042969, + "learning_rate": 2.1252399999999998e-07, + "loss": 0.4064, + "step": 196950 + }, + { + "epoch": 1.97, + "grad_norm": 59.22267532348633, + "learning_rate": 2.1232400000000002e-07, + "loss": 0.4626, + "step": 197000 + }, + { + "epoch": 1.9705, + "grad_norm": 61.249202728271484, + "learning_rate": 2.12124e-07, + "loss": 0.3603, + "step": 197050 + }, + { + "epoch": 1.971, + "grad_norm": 8.615995407104492, + "learning_rate": 2.1192399999999999e-07, + "loss": 0.4088, + "step": 197100 + }, + { + "epoch": 1.9715, + "grad_norm": 74.69889831542969, + "learning_rate": 2.11724e-07, + "loss": 0.4601, + "step": 197150 + }, + { + "epoch": 1.972, + "grad_norm": 70.49463653564453, + "learning_rate": 2.1152399999999998e-07, + "loss": 0.4344, + "step": 197200 + }, + { + "epoch": 1.9725000000000001, + "grad_norm": 62.61591339111328, + "learning_rate": 2.11324e-07, + "loss": 0.5932, + "step": 197250 + }, + { + "epoch": 1.9729999999999999, + "grad_norm": 25.135278701782227, + "learning_rate": 2.11124e-07, + "loss": 0.4321, + "step": 197300 + }, + { + "epoch": 1.9735, + "grad_norm": 1.134199619293213, + "learning_rate": 2.10924e-07, + "loss": 0.2776, + "step": 197350 + }, + { + "epoch": 1.974, + "grad_norm": 5.215174198150635, + "learning_rate": 2.1072399999999998e-07, + "loss": 0.3703, + "step": 197400 + }, + { + "epoch": 1.9745, + "grad_norm": 73.48294830322266, + "learning_rate": 2.1052399999999999e-07, + "loss": 0.5626, + "step": 197450 + }, + { + "epoch": 1.975, + "grad_norm": 136.4009552001953, + "learning_rate": 2.10324e-07, + "loss": 0.3805, + "step": 197500 + }, + { + "epoch": 1.9755, + "grad_norm": 79.76919555664062, + "learning_rate": 2.1012399999999998e-07, + "loss": 0.4289, + "step": 197550 + }, + { + "epoch": 1.976, + "grad_norm": 70.65571594238281, + "learning_rate": 2.09924e-07, + "loss": 0.3265, + "step": 197600 + }, + { + "epoch": 1.9765000000000001, + "grad_norm": 83.61393737792969, + "learning_rate": 2.0972399999999998e-07, + "loss": 0.3778, + "step": 197650 + }, + { + "epoch": 1.9769999999999999, + "grad_norm": 0.6025605797767639, + "learning_rate": 2.09524e-07, + "loss": 0.4016, + "step": 197700 + }, + { + "epoch": 1.9775, + "grad_norm": 3.0272040367126465, + "learning_rate": 2.0932799999999998e-07, + "loss": 0.4813, + "step": 197750 + }, + { + "epoch": 1.978, + "grad_norm": 60.68215560913086, + "learning_rate": 2.09128e-07, + "loss": 0.353, + "step": 197800 + }, + { + "epoch": 1.9785, + "grad_norm": 140.40936279296875, + "learning_rate": 2.08928e-07, + "loss": 0.4967, + "step": 197850 + }, + { + "epoch": 1.979, + "grad_norm": 81.03306579589844, + "learning_rate": 2.08728e-07, + "loss": 0.6082, + "step": 197900 + }, + { + "epoch": 1.9795, + "grad_norm": 4.013239860534668, + "learning_rate": 2.0852799999999997e-07, + "loss": 0.4588, + "step": 197950 + }, + { + "epoch": 1.98, + "grad_norm": 95.69232940673828, + "learning_rate": 2.08328e-07, + "loss": 0.4396, + "step": 198000 + }, + { + "epoch": 1.9805000000000001, + "grad_norm": 91.35757446289062, + "learning_rate": 2.08128e-07, + "loss": 0.4297, + "step": 198050 + }, + { + "epoch": 1.9809999999999999, + "grad_norm": 61.077537536621094, + "learning_rate": 2.0792799999999998e-07, + "loss": 0.4121, + "step": 198100 + }, + { + "epoch": 1.9815, + "grad_norm": 91.7123031616211, + "learning_rate": 2.07728e-07, + "loss": 0.3933, + "step": 198150 + }, + { + "epoch": 1.982, + "grad_norm": 58.250064849853516, + "learning_rate": 2.0752799999999998e-07, + "loss": 0.5652, + "step": 198200 + }, + { + "epoch": 1.9825, + "grad_norm": 4.037406921386719, + "learning_rate": 2.0732800000000002e-07, + "loss": 0.4317, + "step": 198250 + }, + { + "epoch": 1.983, + "grad_norm": 14.497562408447266, + "learning_rate": 2.07128e-07, + "loss": 0.3503, + "step": 198300 + }, + { + "epoch": 1.9835, + "grad_norm": 15.300369262695312, + "learning_rate": 2.0692799999999999e-07, + "loss": 0.4021, + "step": 198350 + }, + { + "epoch": 1.984, + "grad_norm": 11.134997367858887, + "learning_rate": 2.06728e-07, + "loss": 0.45, + "step": 198400 + }, + { + "epoch": 1.9845000000000002, + "grad_norm": 17.25223731994629, + "learning_rate": 2.0652799999999998e-07, + "loss": 0.302, + "step": 198450 + }, + { + "epoch": 1.9849999999999999, + "grad_norm": 50.793540954589844, + "learning_rate": 2.06328e-07, + "loss": 0.4223, + "step": 198500 + }, + { + "epoch": 1.9855, + "grad_norm": 1.6654607057571411, + "learning_rate": 2.06128e-07, + "loss": 0.4057, + "step": 198550 + }, + { + "epoch": 1.986, + "grad_norm": 15.111828804016113, + "learning_rate": 2.05928e-07, + "loss": 0.2903, + "step": 198600 + }, + { + "epoch": 1.9865, + "grad_norm": 60.819488525390625, + "learning_rate": 2.0572799999999997e-07, + "loss": 0.3924, + "step": 198650 + }, + { + "epoch": 1.987, + "grad_norm": 58.338111877441406, + "learning_rate": 2.0552799999999999e-07, + "loss": 0.4706, + "step": 198700 + }, + { + "epoch": 1.9875, + "grad_norm": 4.499456882476807, + "learning_rate": 2.05328e-07, + "loss": 0.4668, + "step": 198750 + }, + { + "epoch": 1.988, + "grad_norm": 26.65963363647461, + "learning_rate": 2.0512799999999998e-07, + "loss": 0.464, + "step": 198800 + }, + { + "epoch": 1.9885000000000002, + "grad_norm": 33.08699035644531, + "learning_rate": 2.04928e-07, + "loss": 0.4391, + "step": 198850 + }, + { + "epoch": 1.9889999999999999, + "grad_norm": 25.592788696289062, + "learning_rate": 2.0472799999999998e-07, + "loss": 0.3808, + "step": 198900 + }, + { + "epoch": 1.9895, + "grad_norm": 58.27476501464844, + "learning_rate": 2.04528e-07, + "loss": 0.5133, + "step": 198950 + }, + { + "epoch": 1.99, + "grad_norm": 133.251220703125, + "learning_rate": 2.04328e-07, + "loss": 0.5155, + "step": 199000 + }, + { + "epoch": 1.9905, + "grad_norm": 115.85850524902344, + "learning_rate": 2.0412799999999999e-07, + "loss": 0.3483, + "step": 199050 + }, + { + "epoch": 1.991, + "grad_norm": 6.595485687255859, + "learning_rate": 2.03928e-07, + "loss": 0.449, + "step": 199100 + }, + { + "epoch": 1.9915, + "grad_norm": 19.415233612060547, + "learning_rate": 2.0372799999999998e-07, + "loss": 0.5, + "step": 199150 + }, + { + "epoch": 1.992, + "grad_norm": 36.67066192626953, + "learning_rate": 2.0352799999999997e-07, + "loss": 0.3246, + "step": 199200 + }, + { + "epoch": 1.9925000000000002, + "grad_norm": 82.13230895996094, + "learning_rate": 2.03328e-07, + "loss": 0.442, + "step": 199250 + }, + { + "epoch": 1.9929999999999999, + "grad_norm": 29.641822814941406, + "learning_rate": 2.03128e-07, + "loss": 0.3297, + "step": 199300 + }, + { + "epoch": 1.9935, + "grad_norm": 78.03038787841797, + "learning_rate": 2.0292799999999998e-07, + "loss": 0.5838, + "step": 199350 + }, + { + "epoch": 1.994, + "grad_norm": 14.143548011779785, + "learning_rate": 2.02728e-07, + "loss": 0.5141, + "step": 199400 + }, + { + "epoch": 1.9945, + "grad_norm": 124.76140594482422, + "learning_rate": 2.02528e-07, + "loss": 0.5651, + "step": 199450 + }, + { + "epoch": 1.995, + "grad_norm": 27.242807388305664, + "learning_rate": 2.02328e-07, + "loss": 0.4477, + "step": 199500 + }, + { + "epoch": 1.9955, + "grad_norm": 55.86709976196289, + "learning_rate": 2.02128e-07, + "loss": 0.5458, + "step": 199550 + }, + { + "epoch": 1.996, + "grad_norm": 61.482025146484375, + "learning_rate": 2.0192799999999998e-07, + "loss": 0.42, + "step": 199600 + }, + { + "epoch": 1.9965000000000002, + "grad_norm": 21.021808624267578, + "learning_rate": 2.01728e-07, + "loss": 0.4657, + "step": 199650 + }, + { + "epoch": 1.9969999999999999, + "grad_norm": 65.79966735839844, + "learning_rate": 2.01528e-07, + "loss": 0.4816, + "step": 199700 + }, + { + "epoch": 1.9975, + "grad_norm": 74.81526947021484, + "learning_rate": 2.01328e-07, + "loss": 0.5426, + "step": 199750 + }, + { + "epoch": 1.998, + "grad_norm": 78.884765625, + "learning_rate": 2.01128e-07, + "loss": 0.4315, + "step": 199800 + }, + { + "epoch": 1.9985, + "grad_norm": 10.851948738098145, + "learning_rate": 2.0092799999999998e-07, + "loss": 0.4302, + "step": 199850 + }, + { + "epoch": 1.999, + "grad_norm": 3.022129774093628, + "learning_rate": 2.0072799999999997e-07, + "loss": 0.3712, + "step": 199900 + }, + { + "epoch": 1.9995, + "grad_norm": 24.550790786743164, + "learning_rate": 2.00528e-07, + "loss": 0.4664, + "step": 199950 + }, + { + "epoch": 2.0, + "grad_norm": 14.006537437438965, + "learning_rate": 2.00328e-07, + "loss": 0.3405, + "step": 200000 + }, + { + "epoch": 2.0005, + "grad_norm": 3.087618827819824, + "learning_rate": 2.00128e-07, + "loss": 0.4074, + "step": 200050 + }, + { + "epoch": 2.001, + "grad_norm": 1.2099264860153198, + "learning_rate": 1.99928e-07, + "loss": 0.4809, + "step": 200100 + }, + { + "epoch": 2.0015, + "grad_norm": 3.2248778343200684, + "learning_rate": 1.9972799999999997e-07, + "loss": 0.3725, + "step": 200150 + }, + { + "epoch": 2.002, + "grad_norm": 39.72489929199219, + "learning_rate": 1.99528e-07, + "loss": 0.3364, + "step": 200200 + }, + { + "epoch": 2.0025, + "grad_norm": 79.86190032958984, + "learning_rate": 1.99328e-07, + "loss": 0.3408, + "step": 200250 + }, + { + "epoch": 2.003, + "grad_norm": 45.70669174194336, + "learning_rate": 1.9912799999999998e-07, + "loss": 0.4011, + "step": 200300 + }, + { + "epoch": 2.0035, + "grad_norm": 80.72676849365234, + "learning_rate": 1.98928e-07, + "loss": 0.5005, + "step": 200350 + }, + { + "epoch": 2.004, + "grad_norm": 54.08387756347656, + "learning_rate": 1.9872799999999998e-07, + "loss": 0.552, + "step": 200400 + }, + { + "epoch": 2.0045, + "grad_norm": 14.636670112609863, + "learning_rate": 1.98528e-07, + "loss": 0.3083, + "step": 200450 + }, + { + "epoch": 2.005, + "grad_norm": 65.19412994384766, + "learning_rate": 1.98328e-07, + "loss": 0.419, + "step": 200500 + }, + { + "epoch": 2.0055, + "grad_norm": 18.607166290283203, + "learning_rate": 1.9812799999999999e-07, + "loss": 0.3372, + "step": 200550 + }, + { + "epoch": 2.006, + "grad_norm": 5.587798595428467, + "learning_rate": 1.9792799999999997e-07, + "loss": 0.3789, + "step": 200600 + }, + { + "epoch": 2.0065, + "grad_norm": 9.782854080200195, + "learning_rate": 1.9772799999999998e-07, + "loss": 0.3832, + "step": 200650 + }, + { + "epoch": 2.007, + "grad_norm": 92.8857650756836, + "learning_rate": 1.97528e-07, + "loss": 0.3575, + "step": 200700 + }, + { + "epoch": 2.0075, + "grad_norm": 157.7519073486328, + "learning_rate": 1.97332e-07, + "loss": 0.5908, + "step": 200750 + }, + { + "epoch": 2.008, + "grad_norm": 103.3365478515625, + "learning_rate": 1.97132e-07, + "loss": 0.3179, + "step": 200800 + }, + { + "epoch": 2.0085, + "grad_norm": 42.684906005859375, + "learning_rate": 1.9693199999999998e-07, + "loss": 0.3399, + "step": 200850 + }, + { + "epoch": 2.009, + "grad_norm": 2.2768945693969727, + "learning_rate": 1.96732e-07, + "loss": 0.3791, + "step": 200900 + }, + { + "epoch": 2.0095, + "grad_norm": 8.21125316619873, + "learning_rate": 1.96532e-07, + "loss": 0.4884, + "step": 200950 + }, + { + "epoch": 2.01, + "grad_norm": 13.066143035888672, + "learning_rate": 1.96332e-07, + "loss": 0.5262, + "step": 201000 + }, + { + "epoch": 2.0105, + "grad_norm": 62.395328521728516, + "learning_rate": 1.96132e-07, + "loss": 0.3039, + "step": 201050 + }, + { + "epoch": 2.011, + "grad_norm": 1.7823199033737183, + "learning_rate": 1.9593199999999998e-07, + "loss": 0.5953, + "step": 201100 + }, + { + "epoch": 2.0115, + "grad_norm": 84.18726348876953, + "learning_rate": 1.9573199999999997e-07, + "loss": 0.4766, + "step": 201150 + }, + { + "epoch": 2.012, + "grad_norm": 119.35281372070312, + "learning_rate": 1.95532e-07, + "loss": 0.4257, + "step": 201200 + }, + { + "epoch": 2.0125, + "grad_norm": 24.274431228637695, + "learning_rate": 1.95332e-07, + "loss": 0.5276, + "step": 201250 + }, + { + "epoch": 2.013, + "grad_norm": 28.234947204589844, + "learning_rate": 1.95132e-07, + "loss": 0.4958, + "step": 201300 + }, + { + "epoch": 2.0135, + "grad_norm": 36.3571891784668, + "learning_rate": 1.94932e-07, + "loss": 0.3327, + "step": 201350 + }, + { + "epoch": 2.014, + "grad_norm": 70.78429412841797, + "learning_rate": 1.9473199999999997e-07, + "loss": 0.3493, + "step": 201400 + }, + { + "epoch": 2.0145, + "grad_norm": 67.98007202148438, + "learning_rate": 1.94532e-07, + "loss": 0.5261, + "step": 201450 + }, + { + "epoch": 2.015, + "grad_norm": 2.617002487182617, + "learning_rate": 1.94332e-07, + "loss": 0.3285, + "step": 201500 + }, + { + "epoch": 2.0155, + "grad_norm": 27.248516082763672, + "learning_rate": 1.9413199999999998e-07, + "loss": 0.4242, + "step": 201550 + }, + { + "epoch": 2.016, + "grad_norm": 85.0505142211914, + "learning_rate": 1.93932e-07, + "loss": 0.3836, + "step": 201600 + }, + { + "epoch": 2.0165, + "grad_norm": 14.559182167053223, + "learning_rate": 1.9373199999999998e-07, + "loss": 0.31, + "step": 201650 + }, + { + "epoch": 2.017, + "grad_norm": 82.49097442626953, + "learning_rate": 1.93532e-07, + "loss": 0.3404, + "step": 201700 + }, + { + "epoch": 2.0175, + "grad_norm": 46.396629333496094, + "learning_rate": 1.93332e-07, + "loss": 0.3666, + "step": 201750 + }, + { + "epoch": 2.018, + "grad_norm": 44.75120162963867, + "learning_rate": 1.9313199999999999e-07, + "loss": 0.4974, + "step": 201800 + }, + { + "epoch": 2.0185, + "grad_norm": 6.472919464111328, + "learning_rate": 1.92932e-07, + "loss": 0.2974, + "step": 201850 + }, + { + "epoch": 2.019, + "grad_norm": 41.595184326171875, + "learning_rate": 1.9273199999999998e-07, + "loss": 0.3399, + "step": 201900 + }, + { + "epoch": 2.0195, + "grad_norm": 119.87677001953125, + "learning_rate": 1.92532e-07, + "loss": 0.3771, + "step": 201950 + }, + { + "epoch": 2.02, + "grad_norm": 85.57170104980469, + "learning_rate": 1.92336e-07, + "loss": 0.4158, + "step": 202000 + }, + { + "epoch": 2.0205, + "grad_norm": 29.789093017578125, + "learning_rate": 1.92136e-07, + "loss": 0.4293, + "step": 202050 + }, + { + "epoch": 2.021, + "grad_norm": 83.55206298828125, + "learning_rate": 1.9193599999999998e-07, + "loss": 0.4268, + "step": 202100 + }, + { + "epoch": 2.0215, + "grad_norm": 11.1190824508667, + "learning_rate": 1.91736e-07, + "loss": 0.4764, + "step": 202150 + }, + { + "epoch": 2.022, + "grad_norm": 36.4733772277832, + "learning_rate": 1.91536e-07, + "loss": 0.4419, + "step": 202200 + }, + { + "epoch": 2.0225, + "grad_norm": 73.61580657958984, + "learning_rate": 1.91336e-07, + "loss": 0.4973, + "step": 202250 + }, + { + "epoch": 2.023, + "grad_norm": 46.41292190551758, + "learning_rate": 1.91136e-07, + "loss": 0.4368, + "step": 202300 + }, + { + "epoch": 2.0235, + "grad_norm": 85.7939224243164, + "learning_rate": 1.9093599999999998e-07, + "loss": 0.2774, + "step": 202350 + }, + { + "epoch": 2.024, + "grad_norm": 5.079582214355469, + "learning_rate": 1.9073599999999997e-07, + "loss": 0.3701, + "step": 202400 + }, + { + "epoch": 2.0245, + "grad_norm": 16.522323608398438, + "learning_rate": 1.90536e-07, + "loss": 0.4191, + "step": 202450 + }, + { + "epoch": 2.025, + "grad_norm": 44.48244857788086, + "learning_rate": 1.90336e-07, + "loss": 0.4953, + "step": 202500 + }, + { + "epoch": 2.0255, + "grad_norm": 46.651466369628906, + "learning_rate": 1.90136e-07, + "loss": 0.372, + "step": 202550 + }, + { + "epoch": 2.026, + "grad_norm": 8.931777000427246, + "learning_rate": 1.89936e-07, + "loss": 0.4437, + "step": 202600 + }, + { + "epoch": 2.0265, + "grad_norm": 11.248937606811523, + "learning_rate": 1.8973599999999997e-07, + "loss": 0.4143, + "step": 202650 + }, + { + "epoch": 2.027, + "grad_norm": 65.09107208251953, + "learning_rate": 1.89536e-07, + "loss": 0.3685, + "step": 202700 + }, + { + "epoch": 2.0275, + "grad_norm": 15.731978416442871, + "learning_rate": 1.89336e-07, + "loss": 0.5076, + "step": 202750 + }, + { + "epoch": 2.028, + "grad_norm": 117.5165023803711, + "learning_rate": 1.8913599999999998e-07, + "loss": 0.4142, + "step": 202800 + }, + { + "epoch": 2.0285, + "grad_norm": 8.751471519470215, + "learning_rate": 1.88936e-07, + "loss": 0.3654, + "step": 202850 + }, + { + "epoch": 2.029, + "grad_norm": 85.83335876464844, + "learning_rate": 1.8873599999999998e-07, + "loss": 0.4755, + "step": 202900 + }, + { + "epoch": 2.0295, + "grad_norm": 11.942030906677246, + "learning_rate": 1.88536e-07, + "loss": 0.4555, + "step": 202950 + }, + { + "epoch": 2.03, + "grad_norm": 2.0720551013946533, + "learning_rate": 1.88336e-07, + "loss": 0.3455, + "step": 203000 + }, + { + "epoch": 2.0305, + "grad_norm": 27.060739517211914, + "learning_rate": 1.8813599999999998e-07, + "loss": 0.4064, + "step": 203050 + }, + { + "epoch": 2.031, + "grad_norm": 12.625927925109863, + "learning_rate": 1.87936e-07, + "loss": 0.4275, + "step": 203100 + }, + { + "epoch": 2.0315, + "grad_norm": 16.742694854736328, + "learning_rate": 1.8773599999999998e-07, + "loss": 0.4582, + "step": 203150 + }, + { + "epoch": 2.032, + "grad_norm": 77.07002258300781, + "learning_rate": 1.87536e-07, + "loss": 0.3919, + "step": 203200 + }, + { + "epoch": 2.0325, + "grad_norm": 64.95145416259766, + "learning_rate": 1.87336e-07, + "loss": 0.4802, + "step": 203250 + }, + { + "epoch": 2.033, + "grad_norm": 124.8387680053711, + "learning_rate": 1.87136e-07, + "loss": 0.5289, + "step": 203300 + }, + { + "epoch": 2.0335, + "grad_norm": 38.79643630981445, + "learning_rate": 1.8693599999999997e-07, + "loss": 0.3996, + "step": 203350 + }, + { + "epoch": 2.034, + "grad_norm": 64.64935302734375, + "learning_rate": 1.86736e-07, + "loss": 0.3993, + "step": 203400 + }, + { + "epoch": 2.0345, + "grad_norm": 62.82065963745117, + "learning_rate": 1.86536e-07, + "loss": 0.5019, + "step": 203450 + }, + { + "epoch": 2.035, + "grad_norm": 1.8743690252304077, + "learning_rate": 1.8633599999999998e-07, + "loss": 0.3265, + "step": 203500 + }, + { + "epoch": 2.0355, + "grad_norm": 136.758056640625, + "learning_rate": 1.86136e-07, + "loss": 0.3709, + "step": 203550 + }, + { + "epoch": 2.036, + "grad_norm": 93.76640319824219, + "learning_rate": 1.8593599999999998e-07, + "loss": 0.4507, + "step": 203600 + }, + { + "epoch": 2.0365, + "grad_norm": 2.7972347736358643, + "learning_rate": 1.8573600000000002e-07, + "loss": 0.4523, + "step": 203650 + }, + { + "epoch": 2.037, + "grad_norm": 12.453752517700195, + "learning_rate": 1.85536e-07, + "loss": 0.2875, + "step": 203700 + }, + { + "epoch": 2.0375, + "grad_norm": 77.1436996459961, + "learning_rate": 1.8533599999999999e-07, + "loss": 0.3493, + "step": 203750 + }, + { + "epoch": 2.038, + "grad_norm": 5.663081645965576, + "learning_rate": 1.85136e-07, + "loss": 0.4532, + "step": 203800 + }, + { + "epoch": 2.0385, + "grad_norm": 74.7771224975586, + "learning_rate": 1.8493599999999998e-07, + "loss": 0.3986, + "step": 203850 + }, + { + "epoch": 2.039, + "grad_norm": 170.24021911621094, + "learning_rate": 1.84736e-07, + "loss": 0.5018, + "step": 203900 + }, + { + "epoch": 2.0395, + "grad_norm": 4.51198148727417, + "learning_rate": 1.84536e-07, + "loss": 0.3477, + "step": 203950 + }, + { + "epoch": 2.04, + "grad_norm": 4.396271228790283, + "learning_rate": 1.84336e-07, + "loss": 0.4144, + "step": 204000 + }, + { + "epoch": 2.0405, + "grad_norm": 38.89625549316406, + "learning_rate": 1.8413599999999998e-07, + "loss": 0.4672, + "step": 204050 + }, + { + "epoch": 2.041, + "grad_norm": 63.771202087402344, + "learning_rate": 1.83936e-07, + "loss": 0.4361, + "step": 204100 + }, + { + "epoch": 2.0415, + "grad_norm": 85.22208404541016, + "learning_rate": 1.83736e-07, + "loss": 0.3421, + "step": 204150 + }, + { + "epoch": 2.042, + "grad_norm": 16.77423095703125, + "learning_rate": 1.8353599999999998e-07, + "loss": 0.4003, + "step": 204200 + }, + { + "epoch": 2.0425, + "grad_norm": 6.526354789733887, + "learning_rate": 1.83336e-07, + "loss": 0.4882, + "step": 204250 + }, + { + "epoch": 2.043, + "grad_norm": 21.03365135192871, + "learning_rate": 1.8313599999999998e-07, + "loss": 0.4461, + "step": 204300 + }, + { + "epoch": 2.0435, + "grad_norm": 99.76431274414062, + "learning_rate": 1.82936e-07, + "loss": 0.3903, + "step": 204350 + }, + { + "epoch": 2.044, + "grad_norm": 40.31965637207031, + "learning_rate": 1.82736e-07, + "loss": 0.4469, + "step": 204400 + }, + { + "epoch": 2.0445, + "grad_norm": 32.08421325683594, + "learning_rate": 1.82536e-07, + "loss": 0.285, + "step": 204450 + }, + { + "epoch": 2.045, + "grad_norm": 106.2055892944336, + "learning_rate": 1.82336e-07, + "loss": 0.4367, + "step": 204500 + }, + { + "epoch": 2.0455, + "grad_norm": 81.09613037109375, + "learning_rate": 1.8213599999999998e-07, + "loss": 0.5206, + "step": 204550 + }, + { + "epoch": 2.046, + "grad_norm": 74.99372863769531, + "learning_rate": 1.8193599999999997e-07, + "loss": 0.4927, + "step": 204600 + }, + { + "epoch": 2.0465, + "grad_norm": 1.6157112121582031, + "learning_rate": 1.81736e-07, + "loss": 0.463, + "step": 204650 + }, + { + "epoch": 2.047, + "grad_norm": 95.24854278564453, + "learning_rate": 1.81536e-07, + "loss": 0.401, + "step": 204700 + }, + { + "epoch": 2.0475, + "grad_norm": 36.316490173339844, + "learning_rate": 1.8133599999999998e-07, + "loss": 0.4379, + "step": 204750 + }, + { + "epoch": 2.048, + "grad_norm": 134.88839721679688, + "learning_rate": 1.81136e-07, + "loss": 0.3477, + "step": 204800 + }, + { + "epoch": 2.0485, + "grad_norm": 0.8851105570793152, + "learning_rate": 1.80936e-07, + "loss": 0.4119, + "step": 204850 + }, + { + "epoch": 2.049, + "grad_norm": 0.5529405474662781, + "learning_rate": 1.80736e-07, + "loss": 0.3685, + "step": 204900 + }, + { + "epoch": 2.0495, + "grad_norm": 2.562713146209717, + "learning_rate": 1.80536e-07, + "loss": 0.4975, + "step": 204950 + }, + { + "epoch": 2.05, + "grad_norm": 90.19808197021484, + "learning_rate": 1.8033599999999998e-07, + "loss": 0.5049, + "step": 205000 + }, + { + "epoch": 2.0505, + "grad_norm": 39.76176071166992, + "learning_rate": 1.80136e-07, + "loss": 0.4066, + "step": 205050 + }, + { + "epoch": 2.051, + "grad_norm": 41.6057014465332, + "learning_rate": 1.79936e-07, + "loss": 0.5315, + "step": 205100 + }, + { + "epoch": 2.0515, + "grad_norm": 2.7808868885040283, + "learning_rate": 1.79736e-07, + "loss": 0.3613, + "step": 205150 + }, + { + "epoch": 2.052, + "grad_norm": 82.41030883789062, + "learning_rate": 1.79536e-07, + "loss": 0.407, + "step": 205200 + }, + { + "epoch": 2.0525, + "grad_norm": 60.45346450805664, + "learning_rate": 1.7933599999999999e-07, + "loss": 0.529, + "step": 205250 + }, + { + "epoch": 2.053, + "grad_norm": 41.536399841308594, + "learning_rate": 1.7913599999999997e-07, + "loss": 0.5233, + "step": 205300 + }, + { + "epoch": 2.0535, + "grad_norm": 18.10399055480957, + "learning_rate": 1.78936e-07, + "loss": 0.3914, + "step": 205350 + }, + { + "epoch": 2.054, + "grad_norm": 2.1177046298980713, + "learning_rate": 1.78736e-07, + "loss": 0.4353, + "step": 205400 + }, + { + "epoch": 2.0545, + "grad_norm": 2.495558500289917, + "learning_rate": 1.78536e-07, + "loss": 0.3869, + "step": 205450 + }, + { + "epoch": 2.055, + "grad_norm": 30.129528045654297, + "learning_rate": 1.78336e-07, + "loss": 0.4063, + "step": 205500 + }, + { + "epoch": 2.0555, + "grad_norm": 85.46407318115234, + "learning_rate": 1.7813599999999997e-07, + "loss": 0.548, + "step": 205550 + }, + { + "epoch": 2.056, + "grad_norm": 63.010189056396484, + "learning_rate": 1.77936e-07, + "loss": 0.4037, + "step": 205600 + }, + { + "epoch": 2.0565, + "grad_norm": 23.553878784179688, + "learning_rate": 1.77736e-07, + "loss": 0.4054, + "step": 205650 + }, + { + "epoch": 2.057, + "grad_norm": 61.06326675415039, + "learning_rate": 1.7753599999999998e-07, + "loss": 0.3369, + "step": 205700 + }, + { + "epoch": 2.0575, + "grad_norm": 72.79309844970703, + "learning_rate": 1.77336e-07, + "loss": 0.4785, + "step": 205750 + }, + { + "epoch": 2.058, + "grad_norm": 83.79118347167969, + "learning_rate": 1.7713599999999998e-07, + "loss": 0.4321, + "step": 205800 + }, + { + "epoch": 2.0585, + "grad_norm": 91.41462707519531, + "learning_rate": 1.76936e-07, + "loss": 0.3816, + "step": 205850 + }, + { + "epoch": 2.059, + "grad_norm": 0.9627030491828918, + "learning_rate": 1.76736e-07, + "loss": 0.3933, + "step": 205900 + }, + { + "epoch": 2.0595, + "grad_norm": 76.58192443847656, + "learning_rate": 1.76536e-07, + "loss": 0.4329, + "step": 205950 + }, + { + "epoch": 2.06, + "grad_norm": 92.036376953125, + "learning_rate": 1.7633599999999997e-07, + "loss": 0.4328, + "step": 206000 + }, + { + "epoch": 2.0605, + "grad_norm": 16.09295654296875, + "learning_rate": 1.7613599999999998e-07, + "loss": 0.4057, + "step": 206050 + }, + { + "epoch": 2.061, + "grad_norm": 20.907642364501953, + "learning_rate": 1.75936e-07, + "loss": 0.3151, + "step": 206100 + }, + { + "epoch": 2.0615, + "grad_norm": 125.13389587402344, + "learning_rate": 1.75736e-07, + "loss": 0.5345, + "step": 206150 + }, + { + "epoch": 2.062, + "grad_norm": 10.434032440185547, + "learning_rate": 1.75536e-07, + "loss": 0.4701, + "step": 206200 + }, + { + "epoch": 2.0625, + "grad_norm": 88.18071746826172, + "learning_rate": 1.7533599999999998e-07, + "loss": 0.6219, + "step": 206250 + }, + { + "epoch": 2.063, + "grad_norm": 107.26880645751953, + "learning_rate": 1.75136e-07, + "loss": 0.4287, + "step": 206300 + }, + { + "epoch": 2.0635, + "grad_norm": 64.41120147705078, + "learning_rate": 1.74936e-07, + "loss": 0.467, + "step": 206350 + }, + { + "epoch": 2.064, + "grad_norm": 71.87296295166016, + "learning_rate": 1.7473599999999998e-07, + "loss": 0.5102, + "step": 206400 + }, + { + "epoch": 2.0645, + "grad_norm": 4.165521621704102, + "learning_rate": 1.74536e-07, + "loss": 0.5447, + "step": 206450 + }, + { + "epoch": 2.065, + "grad_norm": 10.817039489746094, + "learning_rate": 1.7433599999999998e-07, + "loss": 0.415, + "step": 206500 + }, + { + "epoch": 2.0655, + "grad_norm": 132.01255798339844, + "learning_rate": 1.7413600000000002e-07, + "loss": 0.4636, + "step": 206550 + }, + { + "epoch": 2.066, + "grad_norm": 4.783523082733154, + "learning_rate": 1.73936e-07, + "loss": 0.3517, + "step": 206600 + }, + { + "epoch": 2.0665, + "grad_norm": 91.72089385986328, + "learning_rate": 1.73736e-07, + "loss": 0.3424, + "step": 206650 + }, + { + "epoch": 2.067, + "grad_norm": 1.3586903810501099, + "learning_rate": 1.73536e-07, + "loss": 0.3763, + "step": 206700 + }, + { + "epoch": 2.0675, + "grad_norm": 119.87760162353516, + "learning_rate": 1.7333599999999998e-07, + "loss": 0.4029, + "step": 206750 + }, + { + "epoch": 2.068, + "grad_norm": 74.30902099609375, + "learning_rate": 1.73136e-07, + "loss": 0.4613, + "step": 206800 + }, + { + "epoch": 2.0685000000000002, + "grad_norm": 88.31536865234375, + "learning_rate": 1.72936e-07, + "loss": 0.4587, + "step": 206850 + }, + { + "epoch": 2.069, + "grad_norm": 30.35822105407715, + "learning_rate": 1.72736e-07, + "loss": 0.3296, + "step": 206900 + }, + { + "epoch": 2.0695, + "grad_norm": 68.1887435913086, + "learning_rate": 1.7253599999999998e-07, + "loss": 0.3588, + "step": 206950 + }, + { + "epoch": 2.07, + "grad_norm": 24.027542114257812, + "learning_rate": 1.72336e-07, + "loss": 0.435, + "step": 207000 + }, + { + "epoch": 2.0705, + "grad_norm": 1.4884693622589111, + "learning_rate": 1.72136e-07, + "loss": 0.3958, + "step": 207050 + }, + { + "epoch": 2.071, + "grad_norm": 76.33940887451172, + "learning_rate": 1.7193599999999999e-07, + "loss": 0.4019, + "step": 207100 + }, + { + "epoch": 2.0715, + "grad_norm": 74.91709899902344, + "learning_rate": 1.71736e-07, + "loss": 0.4541, + "step": 207150 + }, + { + "epoch": 2.072, + "grad_norm": 91.8819808959961, + "learning_rate": 1.7153599999999998e-07, + "loss": 0.5439, + "step": 207200 + }, + { + "epoch": 2.0725, + "grad_norm": 6.123153209686279, + "learning_rate": 1.71336e-07, + "loss": 0.4287, + "step": 207250 + }, + { + "epoch": 2.073, + "grad_norm": 0.9759462475776672, + "learning_rate": 1.71136e-07, + "loss": 0.374, + "step": 207300 + }, + { + "epoch": 2.0735, + "grad_norm": 15.24844741821289, + "learning_rate": 1.70936e-07, + "loss": 0.3756, + "step": 207350 + }, + { + "epoch": 2.074, + "grad_norm": 6.204489707946777, + "learning_rate": 1.70736e-07, + "loss": 0.4405, + "step": 207400 + }, + { + "epoch": 2.0745, + "grad_norm": 69.0223388671875, + "learning_rate": 1.7053599999999999e-07, + "loss": 0.4419, + "step": 207450 + }, + { + "epoch": 2.075, + "grad_norm": 39.172096252441406, + "learning_rate": 1.7033599999999997e-07, + "loss": 0.3817, + "step": 207500 + }, + { + "epoch": 2.0755, + "grad_norm": 59.810447692871094, + "learning_rate": 1.70136e-07, + "loss": 0.4333, + "step": 207550 + }, + { + "epoch": 2.076, + "grad_norm": 43.26020050048828, + "learning_rate": 1.69936e-07, + "loss": 0.4731, + "step": 207600 + }, + { + "epoch": 2.0765, + "grad_norm": 81.62408447265625, + "learning_rate": 1.6973599999999998e-07, + "loss": 0.4608, + "step": 207650 + }, + { + "epoch": 2.077, + "grad_norm": 0.3745923638343811, + "learning_rate": 1.69536e-07, + "loss": 0.3853, + "step": 207700 + }, + { + "epoch": 2.0775, + "grad_norm": 90.62264251708984, + "learning_rate": 1.6933599999999998e-07, + "loss": 0.4131, + "step": 207750 + }, + { + "epoch": 2.078, + "grad_norm": 12.992121696472168, + "learning_rate": 1.69136e-07, + "loss": 0.4822, + "step": 207800 + }, + { + "epoch": 2.0785, + "grad_norm": 54.97868347167969, + "learning_rate": 1.68936e-07, + "loss": 0.4623, + "step": 207850 + }, + { + "epoch": 2.079, + "grad_norm": 35.057308197021484, + "learning_rate": 1.6873599999999998e-07, + "loss": 0.2986, + "step": 207900 + }, + { + "epoch": 2.0795, + "grad_norm": 98.75119018554688, + "learning_rate": 1.68536e-07, + "loss": 0.5695, + "step": 207950 + }, + { + "epoch": 2.08, + "grad_norm": 57.579368591308594, + "learning_rate": 1.68336e-07, + "loss": 0.5152, + "step": 208000 + }, + { + "epoch": 2.0805, + "grad_norm": 141.9813232421875, + "learning_rate": 1.68136e-07, + "loss": 0.4165, + "step": 208050 + }, + { + "epoch": 2.081, + "grad_norm": 114.23064422607422, + "learning_rate": 1.67936e-07, + "loss": 0.3704, + "step": 208100 + }, + { + "epoch": 2.0815, + "grad_norm": 160.40625, + "learning_rate": 1.67736e-07, + "loss": 0.3956, + "step": 208150 + }, + { + "epoch": 2.082, + "grad_norm": 47.36935043334961, + "learning_rate": 1.6753599999999997e-07, + "loss": 0.3628, + "step": 208200 + }, + { + "epoch": 2.0825, + "grad_norm": 114.87825012207031, + "learning_rate": 1.67336e-07, + "loss": 0.4358, + "step": 208250 + }, + { + "epoch": 2.083, + "grad_norm": 112.84601593017578, + "learning_rate": 1.67136e-07, + "loss": 0.4642, + "step": 208300 + }, + { + "epoch": 2.0835, + "grad_norm": 9.166009902954102, + "learning_rate": 1.66936e-07, + "loss": 0.3521, + "step": 208350 + }, + { + "epoch": 2.084, + "grad_norm": 114.99955749511719, + "learning_rate": 1.66736e-07, + "loss": 0.4923, + "step": 208400 + }, + { + "epoch": 2.0845, + "grad_norm": 3.3287408351898193, + "learning_rate": 1.6653599999999998e-07, + "loss": 0.3185, + "step": 208450 + }, + { + "epoch": 2.085, + "grad_norm": 6.908358097076416, + "learning_rate": 1.6634e-07, + "loss": 0.5219, + "step": 208500 + }, + { + "epoch": 2.0855, + "grad_norm": 24.365503311157227, + "learning_rate": 1.6614e-07, + "loss": 0.5059, + "step": 208550 + }, + { + "epoch": 2.086, + "grad_norm": 7.259128093719482, + "learning_rate": 1.6594e-07, + "loss": 0.4561, + "step": 208600 + }, + { + "epoch": 2.0865, + "grad_norm": 107.42555236816406, + "learning_rate": 1.6574e-07, + "loss": 0.3289, + "step": 208650 + }, + { + "epoch": 2.087, + "grad_norm": 7.124502182006836, + "learning_rate": 1.6553999999999999e-07, + "loss": 0.3946, + "step": 208700 + }, + { + "epoch": 2.0875, + "grad_norm": 71.81135559082031, + "learning_rate": 1.6533999999999997e-07, + "loss": 0.3836, + "step": 208750 + }, + { + "epoch": 2.088, + "grad_norm": 95.23866271972656, + "learning_rate": 1.6514e-07, + "loss": 0.3454, + "step": 208800 + }, + { + "epoch": 2.0885, + "grad_norm": 20.035490036010742, + "learning_rate": 1.6494e-07, + "loss": 0.4794, + "step": 208850 + }, + { + "epoch": 2.089, + "grad_norm": 75.28366088867188, + "learning_rate": 1.6473999999999998e-07, + "loss": 0.5533, + "step": 208900 + }, + { + "epoch": 2.0895, + "grad_norm": 72.16234588623047, + "learning_rate": 1.6454e-07, + "loss": 0.4135, + "step": 208950 + }, + { + "epoch": 2.09, + "grad_norm": 68.55174255371094, + "learning_rate": 1.6434e-07, + "loss": 0.3546, + "step": 209000 + }, + { + "epoch": 2.0905, + "grad_norm": 84.08557891845703, + "learning_rate": 1.6414e-07, + "loss": 0.4366, + "step": 209050 + }, + { + "epoch": 2.091, + "grad_norm": 119.96469116210938, + "learning_rate": 1.6394e-07, + "loss": 0.3998, + "step": 209100 + }, + { + "epoch": 2.0915, + "grad_norm": 15.461919784545898, + "learning_rate": 1.6373999999999998e-07, + "loss": 0.4224, + "step": 209150 + }, + { + "epoch": 2.092, + "grad_norm": 95.27957916259766, + "learning_rate": 1.6354e-07, + "loss": 0.4491, + "step": 209200 + }, + { + "epoch": 2.0925, + "grad_norm": 9.547892570495605, + "learning_rate": 1.6334e-07, + "loss": 0.4787, + "step": 209250 + }, + { + "epoch": 2.093, + "grad_norm": 56.198638916015625, + "learning_rate": 1.6314e-07, + "loss": 0.4518, + "step": 209300 + }, + { + "epoch": 2.0935, + "grad_norm": 45.14836502075195, + "learning_rate": 1.6294e-07, + "loss": 0.3812, + "step": 209350 + }, + { + "epoch": 2.094, + "grad_norm": 47.17598342895508, + "learning_rate": 1.6274e-07, + "loss": 0.3314, + "step": 209400 + }, + { + "epoch": 2.0945, + "grad_norm": 3.217527389526367, + "learning_rate": 1.6253999999999997e-07, + "loss": 0.4014, + "step": 209450 + }, + { + "epoch": 2.095, + "grad_norm": 59.638771057128906, + "learning_rate": 1.6234e-07, + "loss": 0.3964, + "step": 209500 + }, + { + "epoch": 2.0955, + "grad_norm": 83.21932220458984, + "learning_rate": 1.6214e-07, + "loss": 0.5206, + "step": 209550 + }, + { + "epoch": 2.096, + "grad_norm": 15.194162368774414, + "learning_rate": 1.6194e-07, + "loss": 0.4676, + "step": 209600 + }, + { + "epoch": 2.0965, + "grad_norm": 107.33583068847656, + "learning_rate": 1.6174e-07, + "loss": 0.4261, + "step": 209650 + }, + { + "epoch": 2.097, + "grad_norm": 13.499813079833984, + "learning_rate": 1.6153999999999998e-07, + "loss": 0.5161, + "step": 209700 + }, + { + "epoch": 2.0975, + "grad_norm": 82.69535064697266, + "learning_rate": 1.6134000000000001e-07, + "loss": 0.4069, + "step": 209750 + }, + { + "epoch": 2.098, + "grad_norm": 41.087249755859375, + "learning_rate": 1.6114e-07, + "loss": 0.3837, + "step": 209800 + }, + { + "epoch": 2.0985, + "grad_norm": 19.804964065551758, + "learning_rate": 1.6093999999999998e-07, + "loss": 0.427, + "step": 209850 + }, + { + "epoch": 2.099, + "grad_norm": 36.766902923583984, + "learning_rate": 1.6074e-07, + "loss": 0.3817, + "step": 209900 + }, + { + "epoch": 2.0995, + "grad_norm": 128.2410125732422, + "learning_rate": 1.6053999999999998e-07, + "loss": 0.4625, + "step": 209950 + }, + { + "epoch": 2.1, + "grad_norm": 0.3461100459098816, + "learning_rate": 1.6034e-07, + "loss": 0.4514, + "step": 210000 + }, + { + "epoch": 2.1005, + "grad_norm": 39.63062286376953, + "learning_rate": 1.6014e-07, + "loss": 0.4665, + "step": 210050 + }, + { + "epoch": 2.101, + "grad_norm": 29.11553382873535, + "learning_rate": 1.5994e-07, + "loss": 0.3503, + "step": 210100 + }, + { + "epoch": 2.1015, + "grad_norm": 18.823801040649414, + "learning_rate": 1.5973999999999997e-07, + "loss": 0.3922, + "step": 210150 + }, + { + "epoch": 2.102, + "grad_norm": 8.857904434204102, + "learning_rate": 1.5953999999999998e-07, + "loss": 0.4219, + "step": 210200 + }, + { + "epoch": 2.1025, + "grad_norm": 1.5444262027740479, + "learning_rate": 1.5934e-07, + "loss": 0.3796, + "step": 210250 + }, + { + "epoch": 2.103, + "grad_norm": 17.656465530395508, + "learning_rate": 1.59144e-07, + "loss": 0.5756, + "step": 210300 + }, + { + "epoch": 2.1035, + "grad_norm": 70.85519409179688, + "learning_rate": 1.58944e-07, + "loss": 0.3324, + "step": 210350 + }, + { + "epoch": 2.104, + "grad_norm": 139.22637939453125, + "learning_rate": 1.5874399999999998e-07, + "loss": 0.5752, + "step": 210400 + }, + { + "epoch": 2.1045, + "grad_norm": 56.294315338134766, + "learning_rate": 1.58544e-07, + "loss": 0.4574, + "step": 210450 + }, + { + "epoch": 2.105, + "grad_norm": 45.46672821044922, + "learning_rate": 1.58344e-07, + "loss": 0.3502, + "step": 210500 + }, + { + "epoch": 2.1055, + "grad_norm": 11.471989631652832, + "learning_rate": 1.58144e-07, + "loss": 0.3842, + "step": 210550 + }, + { + "epoch": 2.106, + "grad_norm": 23.504499435424805, + "learning_rate": 1.57944e-07, + "loss": 0.4154, + "step": 210600 + }, + { + "epoch": 2.1065, + "grad_norm": 17.586835861206055, + "learning_rate": 1.5774399999999999e-07, + "loss": 0.4408, + "step": 210650 + }, + { + "epoch": 2.107, + "grad_norm": 74.19413757324219, + "learning_rate": 1.5754399999999997e-07, + "loss": 0.5318, + "step": 210700 + }, + { + "epoch": 2.1075, + "grad_norm": 5.282220840454102, + "learning_rate": 1.57344e-07, + "loss": 0.5633, + "step": 210750 + }, + { + "epoch": 2.108, + "grad_norm": 27.199790954589844, + "learning_rate": 1.57144e-07, + "loss": 0.3807, + "step": 210800 + }, + { + "epoch": 2.1085, + "grad_norm": 9.653837203979492, + "learning_rate": 1.56944e-07, + "loss": 0.5071, + "step": 210850 + }, + { + "epoch": 2.109, + "grad_norm": 2.2887659072875977, + "learning_rate": 1.56744e-07, + "loss": 0.442, + "step": 210900 + }, + { + "epoch": 2.1095, + "grad_norm": 130.85214233398438, + "learning_rate": 1.5654399999999998e-07, + "loss": 0.5694, + "step": 210950 + }, + { + "epoch": 2.11, + "grad_norm": 71.07162475585938, + "learning_rate": 1.5634400000000001e-07, + "loss": 0.3873, + "step": 211000 + }, + { + "epoch": 2.1105, + "grad_norm": 0.9926930665969849, + "learning_rate": 1.56144e-07, + "loss": 0.4219, + "step": 211050 + }, + { + "epoch": 2.111, + "grad_norm": 31.29315948486328, + "learning_rate": 1.5594399999999998e-07, + "loss": 0.406, + "step": 211100 + }, + { + "epoch": 2.1115, + "grad_norm": 4.280275821685791, + "learning_rate": 1.55744e-07, + "loss": 0.4414, + "step": 211150 + }, + { + "epoch": 2.112, + "grad_norm": 22.806076049804688, + "learning_rate": 1.5554399999999998e-07, + "loss": 0.4081, + "step": 211200 + }, + { + "epoch": 2.1125, + "grad_norm": 48.29012680053711, + "learning_rate": 1.55344e-07, + "loss": 0.4448, + "step": 211250 + }, + { + "epoch": 2.113, + "grad_norm": 28.17279624938965, + "learning_rate": 1.55144e-07, + "loss": 0.3505, + "step": 211300 + }, + { + "epoch": 2.1135, + "grad_norm": 4.374676704406738, + "learning_rate": 1.54944e-07, + "loss": 0.374, + "step": 211350 + }, + { + "epoch": 2.114, + "grad_norm": 5.979548931121826, + "learning_rate": 1.5474399999999997e-07, + "loss": 0.304, + "step": 211400 + }, + { + "epoch": 2.1145, + "grad_norm": 5.3794426918029785, + "learning_rate": 1.5454399999999998e-07, + "loss": 0.4963, + "step": 211450 + }, + { + "epoch": 2.115, + "grad_norm": 71.64678955078125, + "learning_rate": 1.54344e-07, + "loss": 0.4826, + "step": 211500 + }, + { + "epoch": 2.1155, + "grad_norm": 8.600757598876953, + "learning_rate": 1.54144e-07, + "loss": 0.3473, + "step": 211550 + }, + { + "epoch": 2.116, + "grad_norm": 6.017736434936523, + "learning_rate": 1.53944e-07, + "loss": 0.4768, + "step": 211600 + }, + { + "epoch": 2.1165, + "grad_norm": 58.22939682006836, + "learning_rate": 1.5374399999999998e-07, + "loss": 0.4373, + "step": 211650 + }, + { + "epoch": 2.117, + "grad_norm": 68.43115234375, + "learning_rate": 1.5354400000000001e-07, + "loss": 0.4691, + "step": 211700 + }, + { + "epoch": 2.1175, + "grad_norm": 79.82152557373047, + "learning_rate": 1.53344e-07, + "loss": 0.4244, + "step": 211750 + }, + { + "epoch": 2.118, + "grad_norm": 31.79875946044922, + "learning_rate": 1.5314399999999998e-07, + "loss": 0.5947, + "step": 211800 + }, + { + "epoch": 2.1185, + "grad_norm": 62.042083740234375, + "learning_rate": 1.52944e-07, + "loss": 0.3467, + "step": 211850 + }, + { + "epoch": 2.1189999999999998, + "grad_norm": 2.5608060359954834, + "learning_rate": 1.5274399999999998e-07, + "loss": 0.4214, + "step": 211900 + }, + { + "epoch": 2.1195, + "grad_norm": 105.99209594726562, + "learning_rate": 1.5254400000000002e-07, + "loss": 0.5052, + "step": 211950 + }, + { + "epoch": 2.12, + "grad_norm": 24.472665786743164, + "learning_rate": 1.52344e-07, + "loss": 0.3036, + "step": 212000 + }, + { + "epoch": 2.1205, + "grad_norm": 97.40585327148438, + "learning_rate": 1.52144e-07, + "loss": 0.4331, + "step": 212050 + }, + { + "epoch": 2.121, + "grad_norm": 68.8331069946289, + "learning_rate": 1.51944e-07, + "loss": 0.4896, + "step": 212100 + }, + { + "epoch": 2.1215, + "grad_norm": 0.17462262511253357, + "learning_rate": 1.5174399999999999e-07, + "loss": 0.2989, + "step": 212150 + }, + { + "epoch": 2.122, + "grad_norm": 0.09120786935091019, + "learning_rate": 1.51544e-07, + "loss": 0.49, + "step": 212200 + }, + { + "epoch": 2.1225, + "grad_norm": 41.664825439453125, + "learning_rate": 1.51344e-07, + "loss": 0.4313, + "step": 212250 + }, + { + "epoch": 2.123, + "grad_norm": 99.65406036376953, + "learning_rate": 1.51144e-07, + "loss": 0.3627, + "step": 212300 + }, + { + "epoch": 2.1235, + "grad_norm": 71.01948547363281, + "learning_rate": 1.5094399999999998e-07, + "loss": 0.496, + "step": 212350 + }, + { + "epoch": 2.124, + "grad_norm": 90.17384338378906, + "learning_rate": 1.50744e-07, + "loss": 0.3975, + "step": 212400 + }, + { + "epoch": 2.1245, + "grad_norm": 16.80384635925293, + "learning_rate": 1.50544e-07, + "loss": 0.3572, + "step": 212450 + }, + { + "epoch": 2.125, + "grad_norm": 63.9257926940918, + "learning_rate": 1.5034399999999999e-07, + "loss": 0.5707, + "step": 212500 + }, + { + "epoch": 2.1255, + "grad_norm": 40.26178741455078, + "learning_rate": 1.50144e-07, + "loss": 0.4961, + "step": 212550 + }, + { + "epoch": 2.126, + "grad_norm": 75.82178497314453, + "learning_rate": 1.4994399999999998e-07, + "loss": 0.4571, + "step": 212600 + }, + { + "epoch": 2.1265, + "grad_norm": 10.5368070602417, + "learning_rate": 1.49744e-07, + "loss": 0.4173, + "step": 212650 + }, + { + "epoch": 2.127, + "grad_norm": 141.17538452148438, + "learning_rate": 1.49544e-07, + "loss": 0.4677, + "step": 212700 + }, + { + "epoch": 2.1275, + "grad_norm": 58.944175720214844, + "learning_rate": 1.49344e-07, + "loss": 0.3237, + "step": 212750 + }, + { + "epoch": 2.128, + "grad_norm": 43.46791458129883, + "learning_rate": 1.49144e-07, + "loss": 0.5176, + "step": 212800 + }, + { + "epoch": 2.1285, + "grad_norm": 1.668687105178833, + "learning_rate": 1.48944e-07, + "loss": 0.4732, + "step": 212850 + }, + { + "epoch": 2.129, + "grad_norm": 69.23137664794922, + "learning_rate": 1.4874399999999997e-07, + "loss": 0.4604, + "step": 212900 + }, + { + "epoch": 2.1295, + "grad_norm": 39.40574645996094, + "learning_rate": 1.48544e-07, + "loss": 0.4294, + "step": 212950 + }, + { + "epoch": 2.13, + "grad_norm": 48.64951705932617, + "learning_rate": 1.48344e-07, + "loss": 0.3854, + "step": 213000 + }, + { + "epoch": 2.1305, + "grad_norm": 7.141056537628174, + "learning_rate": 1.4814399999999998e-07, + "loss": 0.3979, + "step": 213050 + }, + { + "epoch": 2.1310000000000002, + "grad_norm": 61.7899284362793, + "learning_rate": 1.47944e-07, + "loss": 0.4668, + "step": 213100 + }, + { + "epoch": 2.1315, + "grad_norm": 43.82136154174805, + "learning_rate": 1.4774399999999998e-07, + "loss": 0.4552, + "step": 213150 + }, + { + "epoch": 2.132, + "grad_norm": 9.3849458694458, + "learning_rate": 1.4754400000000001e-07, + "loss": 0.4971, + "step": 213200 + }, + { + "epoch": 2.1325, + "grad_norm": 0.2815798819065094, + "learning_rate": 1.47344e-07, + "loss": 0.3141, + "step": 213250 + }, + { + "epoch": 2.133, + "grad_norm": 70.74395751953125, + "learning_rate": 1.4714399999999998e-07, + "loss": 0.4281, + "step": 213300 + }, + { + "epoch": 2.1335, + "grad_norm": 36.044673919677734, + "learning_rate": 1.46944e-07, + "loss": 0.3749, + "step": 213350 + }, + { + "epoch": 2.134, + "grad_norm": 22.537832260131836, + "learning_rate": 1.46744e-07, + "loss": 0.4091, + "step": 213400 + }, + { + "epoch": 2.1345, + "grad_norm": 64.7277603149414, + "learning_rate": 1.46544e-07, + "loss": 0.4648, + "step": 213450 + }, + { + "epoch": 2.135, + "grad_norm": 32.96176528930664, + "learning_rate": 1.46344e-07, + "loss": 0.5325, + "step": 213500 + }, + { + "epoch": 2.1355, + "grad_norm": 39.71519470214844, + "learning_rate": 1.46144e-07, + "loss": 0.4756, + "step": 213550 + }, + { + "epoch": 2.136, + "grad_norm": 27.864206314086914, + "learning_rate": 1.4594399999999997e-07, + "loss": 0.4653, + "step": 213600 + }, + { + "epoch": 2.1365, + "grad_norm": 59.47966766357422, + "learning_rate": 1.45744e-07, + "loss": 0.4488, + "step": 213650 + }, + { + "epoch": 2.137, + "grad_norm": 30.075504302978516, + "learning_rate": 1.45544e-07, + "loss": 0.5499, + "step": 213700 + }, + { + "epoch": 2.1375, + "grad_norm": 0.5983642339706421, + "learning_rate": 1.4534399999999998e-07, + "loss": 0.5613, + "step": 213750 + }, + { + "epoch": 2.138, + "grad_norm": 5.750205039978027, + "learning_rate": 1.45144e-07, + "loss": 0.5196, + "step": 213800 + }, + { + "epoch": 2.1385, + "grad_norm": 45.518558502197266, + "learning_rate": 1.4494399999999998e-07, + "loss": 0.2925, + "step": 213850 + }, + { + "epoch": 2.1390000000000002, + "grad_norm": 144.67080688476562, + "learning_rate": 1.4474400000000002e-07, + "loss": 0.4629, + "step": 213900 + }, + { + "epoch": 2.1395, + "grad_norm": 82.38689422607422, + "learning_rate": 1.44544e-07, + "loss": 0.5278, + "step": 213950 + }, + { + "epoch": 2.14, + "grad_norm": 1.598920226097107, + "learning_rate": 1.4434399999999999e-07, + "loss": 0.2595, + "step": 214000 + }, + { + "epoch": 2.1405, + "grad_norm": 22.938703536987305, + "learning_rate": 1.44144e-07, + "loss": 0.4807, + "step": 214050 + }, + { + "epoch": 2.141, + "grad_norm": 50.35197067260742, + "learning_rate": 1.4394399999999998e-07, + "loss": 0.3687, + "step": 214100 + }, + { + "epoch": 2.1415, + "grad_norm": 56.01286697387695, + "learning_rate": 1.43744e-07, + "loss": 0.4494, + "step": 214150 + }, + { + "epoch": 2.142, + "grad_norm": 117.62816619873047, + "learning_rate": 1.43544e-07, + "loss": 0.4037, + "step": 214200 + }, + { + "epoch": 2.1425, + "grad_norm": 26.653152465820312, + "learning_rate": 1.43344e-07, + "loss": 0.3627, + "step": 214250 + }, + { + "epoch": 2.143, + "grad_norm": 18.058246612548828, + "learning_rate": 1.4314399999999997e-07, + "loss": 0.6102, + "step": 214300 + }, + { + "epoch": 2.1435, + "grad_norm": 6.283153057098389, + "learning_rate": 1.4294399999999999e-07, + "loss": 0.318, + "step": 214350 + }, + { + "epoch": 2.144, + "grad_norm": 50.743133544921875, + "learning_rate": 1.42748e-07, + "loss": 0.3315, + "step": 214400 + }, + { + "epoch": 2.1445, + "grad_norm": 7.809654712677002, + "learning_rate": 1.4254800000000001e-07, + "loss": 0.3765, + "step": 214450 + }, + { + "epoch": 2.145, + "grad_norm": 58.41513442993164, + "learning_rate": 1.42348e-07, + "loss": 0.3844, + "step": 214500 + }, + { + "epoch": 2.1455, + "grad_norm": 117.92552185058594, + "learning_rate": 1.4214799999999998e-07, + "loss": 0.4366, + "step": 214550 + }, + { + "epoch": 2.146, + "grad_norm": 38.15401077270508, + "learning_rate": 1.41948e-07, + "loss": 0.4262, + "step": 214600 + }, + { + "epoch": 2.1465, + "grad_norm": 99.7507553100586, + "learning_rate": 1.41748e-07, + "loss": 0.4156, + "step": 214650 + }, + { + "epoch": 2.147, + "grad_norm": 120.47976684570312, + "learning_rate": 1.41548e-07, + "loss": 0.3704, + "step": 214700 + }, + { + "epoch": 2.1475, + "grad_norm": 0.7170859575271606, + "learning_rate": 1.41348e-07, + "loss": 0.5463, + "step": 214750 + }, + { + "epoch": 2.148, + "grad_norm": 73.54954528808594, + "learning_rate": 1.41148e-07, + "loss": 0.4351, + "step": 214800 + }, + { + "epoch": 2.1485, + "grad_norm": 1.4426411390304565, + "learning_rate": 1.4094799999999997e-07, + "loss": 0.4003, + "step": 214850 + }, + { + "epoch": 2.149, + "grad_norm": 16.82160758972168, + "learning_rate": 1.40748e-07, + "loss": 0.334, + "step": 214900 + }, + { + "epoch": 2.1495, + "grad_norm": 69.73733520507812, + "learning_rate": 1.40548e-07, + "loss": 0.427, + "step": 214950 + }, + { + "epoch": 2.15, + "grad_norm": 125.05155944824219, + "learning_rate": 1.40348e-07, + "loss": 0.3633, + "step": 215000 + }, + { + "epoch": 2.1505, + "grad_norm": 55.94770812988281, + "learning_rate": 1.40148e-07, + "loss": 0.5265, + "step": 215050 + }, + { + "epoch": 2.151, + "grad_norm": 4.276247978210449, + "learning_rate": 1.3994799999999998e-07, + "loss": 0.3524, + "step": 215100 + }, + { + "epoch": 2.1515, + "grad_norm": 67.32160949707031, + "learning_rate": 1.3974800000000001e-07, + "loss": 0.4974, + "step": 215150 + }, + { + "epoch": 2.152, + "grad_norm": 34.65168762207031, + "learning_rate": 1.39548e-07, + "loss": 0.3331, + "step": 215200 + }, + { + "epoch": 2.1525, + "grad_norm": 91.49262237548828, + "learning_rate": 1.3934799999999998e-07, + "loss": 0.4829, + "step": 215250 + }, + { + "epoch": 2.153, + "grad_norm": 14.011157989501953, + "learning_rate": 1.39148e-07, + "loss": 0.4288, + "step": 215300 + }, + { + "epoch": 2.1535, + "grad_norm": 5.634559631347656, + "learning_rate": 1.3894799999999998e-07, + "loss": 0.3349, + "step": 215350 + }, + { + "epoch": 2.154, + "grad_norm": 73.73646545410156, + "learning_rate": 1.38748e-07, + "loss": 0.4687, + "step": 215400 + }, + { + "epoch": 2.1545, + "grad_norm": 2.7727954387664795, + "learning_rate": 1.38548e-07, + "loss": 0.3353, + "step": 215450 + }, + { + "epoch": 2.155, + "grad_norm": 73.1183090209961, + "learning_rate": 1.38348e-07, + "loss": 0.5878, + "step": 215500 + }, + { + "epoch": 2.1555, + "grad_norm": 88.92938232421875, + "learning_rate": 1.3814799999999997e-07, + "loss": 0.3574, + "step": 215550 + }, + { + "epoch": 2.156, + "grad_norm": 9.608054161071777, + "learning_rate": 1.3794799999999999e-07, + "loss": 0.472, + "step": 215600 + }, + { + "epoch": 2.1565, + "grad_norm": 34.318458557128906, + "learning_rate": 1.37748e-07, + "loss": 0.3066, + "step": 215650 + }, + { + "epoch": 2.157, + "grad_norm": 146.12513732910156, + "learning_rate": 1.37548e-07, + "loss": 0.3393, + "step": 215700 + }, + { + "epoch": 2.1575, + "grad_norm": 7.567634582519531, + "learning_rate": 1.37348e-07, + "loss": 0.3595, + "step": 215750 + }, + { + "epoch": 2.158, + "grad_norm": 14.484745979309082, + "learning_rate": 1.3714799999999998e-07, + "loss": 0.3492, + "step": 215800 + }, + { + "epoch": 2.1585, + "grad_norm": 29.185087203979492, + "learning_rate": 1.36948e-07, + "loss": 0.4477, + "step": 215850 + }, + { + "epoch": 2.159, + "grad_norm": 3.4553093910217285, + "learning_rate": 1.36748e-07, + "loss": 0.6068, + "step": 215900 + }, + { + "epoch": 2.1595, + "grad_norm": 14.842061996459961, + "learning_rate": 1.3654799999999999e-07, + "loss": 0.4224, + "step": 215950 + }, + { + "epoch": 2.16, + "grad_norm": 94.66940307617188, + "learning_rate": 1.36348e-07, + "loss": 0.4909, + "step": 216000 + }, + { + "epoch": 2.1605, + "grad_norm": 12.455718994140625, + "learning_rate": 1.3614799999999998e-07, + "loss": 0.4814, + "step": 216050 + }, + { + "epoch": 2.161, + "grad_norm": 118.89146423339844, + "learning_rate": 1.35948e-07, + "loss": 0.5104, + "step": 216100 + }, + { + "epoch": 2.1615, + "grad_norm": 64.86026763916016, + "learning_rate": 1.35748e-07, + "loss": 0.4312, + "step": 216150 + }, + { + "epoch": 2.162, + "grad_norm": 8.784263610839844, + "learning_rate": 1.35548e-07, + "loss": 0.4906, + "step": 216200 + }, + { + "epoch": 2.1625, + "grad_norm": 90.33892822265625, + "learning_rate": 1.35348e-07, + "loss": 0.4301, + "step": 216250 + }, + { + "epoch": 2.163, + "grad_norm": 22.997182846069336, + "learning_rate": 1.35148e-07, + "loss": 0.2993, + "step": 216300 + }, + { + "epoch": 2.1635, + "grad_norm": 35.568603515625, + "learning_rate": 1.34948e-07, + "loss": 0.3766, + "step": 216350 + }, + { + "epoch": 2.164, + "grad_norm": 28.420759201049805, + "learning_rate": 1.34748e-07, + "loss": 0.5307, + "step": 216400 + }, + { + "epoch": 2.1645, + "grad_norm": 19.557161331176758, + "learning_rate": 1.34548e-07, + "loss": 0.5116, + "step": 216450 + }, + { + "epoch": 2.165, + "grad_norm": 23.2882022857666, + "learning_rate": 1.3434799999999998e-07, + "loss": 0.4442, + "step": 216500 + }, + { + "epoch": 2.1655, + "grad_norm": 22.49976348876953, + "learning_rate": 1.34148e-07, + "loss": 0.4319, + "step": 216550 + }, + { + "epoch": 2.166, + "grad_norm": 79.17230987548828, + "learning_rate": 1.33948e-07, + "loss": 0.4977, + "step": 216600 + }, + { + "epoch": 2.1665, + "grad_norm": 68.50972747802734, + "learning_rate": 1.33748e-07, + "loss": 0.3239, + "step": 216650 + }, + { + "epoch": 2.167, + "grad_norm": 57.847503662109375, + "learning_rate": 1.33548e-07, + "loss": 0.3964, + "step": 216700 + }, + { + "epoch": 2.1675, + "grad_norm": 90.66923522949219, + "learning_rate": 1.3334799999999998e-07, + "loss": 0.3266, + "step": 216750 + }, + { + "epoch": 2.168, + "grad_norm": 10.624639511108398, + "learning_rate": 1.33148e-07, + "loss": 0.4769, + "step": 216800 + }, + { + "epoch": 2.1685, + "grad_norm": 70.89537048339844, + "learning_rate": 1.32948e-07, + "loss": 0.4799, + "step": 216850 + }, + { + "epoch": 2.169, + "grad_norm": 52.987213134765625, + "learning_rate": 1.32748e-07, + "loss": 0.4412, + "step": 216900 + }, + { + "epoch": 2.1695, + "grad_norm": 69.99027252197266, + "learning_rate": 1.32552e-07, + "loss": 0.4945, + "step": 216950 + }, + { + "epoch": 2.17, + "grad_norm": 90.99432373046875, + "learning_rate": 1.32356e-07, + "loss": 0.4697, + "step": 217000 + }, + { + "epoch": 2.1705, + "grad_norm": 79.41846466064453, + "learning_rate": 1.3215599999999998e-07, + "loss": 0.4307, + "step": 217050 + }, + { + "epoch": 2.171, + "grad_norm": 86.42295837402344, + "learning_rate": 1.31956e-07, + "loss": 0.4369, + "step": 217100 + }, + { + "epoch": 2.1715, + "grad_norm": 64.85267639160156, + "learning_rate": 1.31756e-07, + "loss": 0.4694, + "step": 217150 + }, + { + "epoch": 2.172, + "grad_norm": 4.685230731964111, + "learning_rate": 1.31556e-07, + "loss": 0.3749, + "step": 217200 + }, + { + "epoch": 2.1725, + "grad_norm": 98.96492004394531, + "learning_rate": 1.31356e-07, + "loss": 0.4746, + "step": 217250 + }, + { + "epoch": 2.173, + "grad_norm": 58.136985778808594, + "learning_rate": 1.3115599999999999e-07, + "loss": 0.4622, + "step": 217300 + }, + { + "epoch": 2.1734999999999998, + "grad_norm": 33.713130950927734, + "learning_rate": 1.3095599999999997e-07, + "loss": 0.411, + "step": 217350 + }, + { + "epoch": 2.174, + "grad_norm": 5.95607328414917, + "learning_rate": 1.30756e-07, + "loss": 0.4671, + "step": 217400 + }, + { + "epoch": 2.1745, + "grad_norm": 44.79324722290039, + "learning_rate": 1.30556e-07, + "loss": 0.3069, + "step": 217450 + }, + { + "epoch": 2.175, + "grad_norm": 21.093141555786133, + "learning_rate": 1.30356e-07, + "loss": 0.3747, + "step": 217500 + }, + { + "epoch": 2.1755, + "grad_norm": 27.197893142700195, + "learning_rate": 1.30156e-07, + "loss": 0.4195, + "step": 217550 + }, + { + "epoch": 2.176, + "grad_norm": 77.7616195678711, + "learning_rate": 1.2995599999999998e-07, + "loss": 0.3689, + "step": 217600 + }, + { + "epoch": 2.1765, + "grad_norm": 0.3446306586265564, + "learning_rate": 1.2975600000000001e-07, + "loss": 0.4223, + "step": 217650 + }, + { + "epoch": 2.177, + "grad_norm": 95.07039642333984, + "learning_rate": 1.29556e-07, + "loss": 0.3375, + "step": 217700 + }, + { + "epoch": 2.1775, + "grad_norm": 80.28645324707031, + "learning_rate": 1.2935599999999998e-07, + "loss": 0.4666, + "step": 217750 + }, + { + "epoch": 2.178, + "grad_norm": 57.870567321777344, + "learning_rate": 1.29156e-07, + "loss": 0.5303, + "step": 217800 + }, + { + "epoch": 2.1785, + "grad_norm": 6.648061752319336, + "learning_rate": 1.2895599999999998e-07, + "loss": 0.3876, + "step": 217850 + }, + { + "epoch": 2.179, + "grad_norm": 80.23861694335938, + "learning_rate": 1.28756e-07, + "loss": 0.3716, + "step": 217900 + }, + { + "epoch": 2.1795, + "grad_norm": 44.08333206176758, + "learning_rate": 1.28556e-07, + "loss": 0.4331, + "step": 217950 + }, + { + "epoch": 2.18, + "grad_norm": 54.142154693603516, + "learning_rate": 1.28356e-07, + "loss": 0.4002, + "step": 218000 + }, + { + "epoch": 2.1805, + "grad_norm": 13.98410701751709, + "learning_rate": 1.28156e-07, + "loss": 0.4893, + "step": 218050 + }, + { + "epoch": 2.181, + "grad_norm": 67.61353302001953, + "learning_rate": 1.2795599999999998e-07, + "loss": 0.4332, + "step": 218100 + }, + { + "epoch": 2.1814999999999998, + "grad_norm": 3.3425474166870117, + "learning_rate": 1.27756e-07, + "loss": 0.4579, + "step": 218150 + }, + { + "epoch": 2.182, + "grad_norm": 25.27934455871582, + "learning_rate": 1.27556e-07, + "loss": 0.4718, + "step": 218200 + }, + { + "epoch": 2.1825, + "grad_norm": 74.17402648925781, + "learning_rate": 1.27356e-07, + "loss": 0.3044, + "step": 218250 + }, + { + "epoch": 2.183, + "grad_norm": 12.15510368347168, + "learning_rate": 1.2715599999999998e-07, + "loss": 0.3911, + "step": 218300 + }, + { + "epoch": 2.1835, + "grad_norm": 8.140380859375, + "learning_rate": 1.2695600000000002e-07, + "loss": 0.376, + "step": 218350 + }, + { + "epoch": 2.184, + "grad_norm": 102.70777130126953, + "learning_rate": 1.26756e-07, + "loss": 0.3637, + "step": 218400 + }, + { + "epoch": 2.1845, + "grad_norm": 19.61964225769043, + "learning_rate": 1.2655599999999999e-07, + "loss": 0.5274, + "step": 218450 + }, + { + "epoch": 2.185, + "grad_norm": 14.926944732666016, + "learning_rate": 1.26356e-07, + "loss": 0.5191, + "step": 218500 + }, + { + "epoch": 2.1855, + "grad_norm": 37.19790267944336, + "learning_rate": 1.2615599999999998e-07, + "loss": 0.4175, + "step": 218550 + }, + { + "epoch": 2.186, + "grad_norm": 107.50064849853516, + "learning_rate": 1.2595600000000002e-07, + "loss": 0.3893, + "step": 218600 + }, + { + "epoch": 2.1865, + "grad_norm": 125.32817077636719, + "learning_rate": 1.25756e-07, + "loss": 0.442, + "step": 218650 + }, + { + "epoch": 2.187, + "grad_norm": 5.219124794006348, + "learning_rate": 1.25556e-07, + "loss": 0.4723, + "step": 218700 + }, + { + "epoch": 2.1875, + "grad_norm": 69.8603286743164, + "learning_rate": 1.25356e-07, + "loss": 0.3469, + "step": 218750 + }, + { + "epoch": 2.188, + "grad_norm": 61.861080169677734, + "learning_rate": 1.2515599999999999e-07, + "loss": 0.4402, + "step": 218800 + }, + { + "epoch": 2.1885, + "grad_norm": 27.252426147460938, + "learning_rate": 1.24956e-07, + "loss": 0.5054, + "step": 218850 + }, + { + "epoch": 2.189, + "grad_norm": 90.06917572021484, + "learning_rate": 1.24756e-07, + "loss": 0.3561, + "step": 218900 + }, + { + "epoch": 2.1895, + "grad_norm": 0.39669129252433777, + "learning_rate": 1.24556e-07, + "loss": 0.4879, + "step": 218950 + }, + { + "epoch": 2.19, + "grad_norm": 15.000761032104492, + "learning_rate": 1.2435599999999998e-07, + "loss": 0.5528, + "step": 219000 + }, + { + "epoch": 2.1905, + "grad_norm": 77.33452606201172, + "learning_rate": 1.24156e-07, + "loss": 0.593, + "step": 219050 + }, + { + "epoch": 2.191, + "grad_norm": 67.30721282958984, + "learning_rate": 1.23956e-07, + "loss": 0.4106, + "step": 219100 + }, + { + "epoch": 2.1915, + "grad_norm": 4.53434944152832, + "learning_rate": 1.2375599999999999e-07, + "loss": 0.5223, + "step": 219150 + }, + { + "epoch": 2.192, + "grad_norm": 48.35958480834961, + "learning_rate": 1.23556e-07, + "loss": 0.362, + "step": 219200 + }, + { + "epoch": 2.1925, + "grad_norm": 82.18716430664062, + "learning_rate": 1.2335599999999998e-07, + "loss": 0.4678, + "step": 219250 + }, + { + "epoch": 2.193, + "grad_norm": 11.158646583557129, + "learning_rate": 1.23156e-07, + "loss": 0.3892, + "step": 219300 + }, + { + "epoch": 2.1935000000000002, + "grad_norm": 36.34510040283203, + "learning_rate": 1.2295599999999998e-07, + "loss": 0.3854, + "step": 219350 + }, + { + "epoch": 2.194, + "grad_norm": 24.300992965698242, + "learning_rate": 1.22756e-07, + "loss": 0.3248, + "step": 219400 + }, + { + "epoch": 2.1945, + "grad_norm": 56.57987976074219, + "learning_rate": 1.22556e-07, + "loss": 0.4331, + "step": 219450 + }, + { + "epoch": 2.195, + "grad_norm": 44.11698532104492, + "learning_rate": 1.2236e-07, + "loss": 0.3961, + "step": 219500 + }, + { + "epoch": 2.1955, + "grad_norm": 0.06106671318411827, + "learning_rate": 1.2216e-07, + "loss": 0.357, + "step": 219550 + }, + { + "epoch": 2.196, + "grad_norm": 22.567058563232422, + "learning_rate": 1.2196e-07, + "loss": 0.4296, + "step": 219600 + }, + { + "epoch": 2.1965, + "grad_norm": 95.61750030517578, + "learning_rate": 1.2176e-07, + "loss": 0.5135, + "step": 219650 + }, + { + "epoch": 2.197, + "grad_norm": 0.5937201976776123, + "learning_rate": 1.2155999999999998e-07, + "loss": 0.4198, + "step": 219700 + }, + { + "epoch": 2.1975, + "grad_norm": 9.500515937805176, + "learning_rate": 1.2136e-07, + "loss": 0.3876, + "step": 219750 + }, + { + "epoch": 2.198, + "grad_norm": 18.312498092651367, + "learning_rate": 1.2116e-07, + "loss": 0.5221, + "step": 219800 + }, + { + "epoch": 2.1985, + "grad_norm": 108.91061401367188, + "learning_rate": 1.2096e-07, + "loss": 0.4779, + "step": 219850 + }, + { + "epoch": 2.199, + "grad_norm": 108.51426696777344, + "learning_rate": 1.2076e-07, + "loss": 0.3632, + "step": 219900 + }, + { + "epoch": 2.1995, + "grad_norm": 54.85345458984375, + "learning_rate": 1.2056e-07, + "loss": 0.2912, + "step": 219950 + }, + { + "epoch": 2.2, + "grad_norm": 98.45470428466797, + "learning_rate": 1.2036e-07, + "loss": 0.3563, + "step": 220000 + }, + { + "epoch": 2.2005, + "grad_norm": 26.045886993408203, + "learning_rate": 1.2015999999999999e-07, + "loss": 0.5571, + "step": 220050 + }, + { + "epoch": 2.201, + "grad_norm": 70.3372802734375, + "learning_rate": 1.1996e-07, + "loss": 0.4871, + "step": 220100 + }, + { + "epoch": 2.2015000000000002, + "grad_norm": 3.0247647762298584, + "learning_rate": 1.1976e-07, + "loss": 0.3335, + "step": 220150 + }, + { + "epoch": 2.202, + "grad_norm": 24.912132263183594, + "learning_rate": 1.1956e-07, + "loss": 0.4888, + "step": 220200 + }, + { + "epoch": 2.2025, + "grad_norm": 79.61896514892578, + "learning_rate": 1.1935999999999998e-07, + "loss": 0.5456, + "step": 220250 + }, + { + "epoch": 2.203, + "grad_norm": 52.392642974853516, + "learning_rate": 1.1916e-07, + "loss": 0.4036, + "step": 220300 + }, + { + "epoch": 2.2035, + "grad_norm": 2.282226085662842, + "learning_rate": 1.1895999999999999e-07, + "loss": 0.3552, + "step": 220350 + }, + { + "epoch": 2.204, + "grad_norm": 26.8724365234375, + "learning_rate": 1.1876e-07, + "loss": 0.3787, + "step": 220400 + }, + { + "epoch": 2.2045, + "grad_norm": 84.30389404296875, + "learning_rate": 1.1856e-07, + "loss": 0.3487, + "step": 220450 + }, + { + "epoch": 2.205, + "grad_norm": 74.3729476928711, + "learning_rate": 1.1836e-07, + "loss": 0.432, + "step": 220500 + }, + { + "epoch": 2.2055, + "grad_norm": 25.35677146911621, + "learning_rate": 1.1816e-07, + "loss": 0.4825, + "step": 220550 + }, + { + "epoch": 2.206, + "grad_norm": 5.5437116622924805, + "learning_rate": 1.1795999999999999e-07, + "loss": 0.4293, + "step": 220600 + }, + { + "epoch": 2.2065, + "grad_norm": 51.029869079589844, + "learning_rate": 1.1776e-07, + "loss": 0.3243, + "step": 220650 + }, + { + "epoch": 2.207, + "grad_norm": 12.300873756408691, + "learning_rate": 1.1755999999999999e-07, + "loss": 0.3076, + "step": 220700 + }, + { + "epoch": 2.2075, + "grad_norm": 0.5374252796173096, + "learning_rate": 1.1736e-07, + "loss": 0.4596, + "step": 220750 + }, + { + "epoch": 2.208, + "grad_norm": 19.82467269897461, + "learning_rate": 1.1716e-07, + "loss": 0.4194, + "step": 220800 + }, + { + "epoch": 2.2085, + "grad_norm": 88.30901336669922, + "learning_rate": 1.1695999999999998e-07, + "loss": 0.4866, + "step": 220850 + }, + { + "epoch": 2.209, + "grad_norm": 53.6544303894043, + "learning_rate": 1.1676e-07, + "loss": 0.5025, + "step": 220900 + }, + { + "epoch": 2.2095, + "grad_norm": 16.769838333129883, + "learning_rate": 1.1655999999999999e-07, + "loss": 0.3527, + "step": 220950 + }, + { + "epoch": 2.21, + "grad_norm": 77.25714874267578, + "learning_rate": 1.1636e-07, + "loss": 0.4571, + "step": 221000 + }, + { + "epoch": 2.2105, + "grad_norm": 74.47918701171875, + "learning_rate": 1.1615999999999999e-07, + "loss": 0.4852, + "step": 221050 + }, + { + "epoch": 2.211, + "grad_norm": 1.7386324405670166, + "learning_rate": 1.1595999999999999e-07, + "loss": 0.4368, + "step": 221100 + }, + { + "epoch": 2.2115, + "grad_norm": 43.10595703125, + "learning_rate": 1.1576e-07, + "loss": 0.472, + "step": 221150 + }, + { + "epoch": 2.212, + "grad_norm": 195.77264404296875, + "learning_rate": 1.1556e-07, + "loss": 0.4499, + "step": 221200 + }, + { + "epoch": 2.2125, + "grad_norm": 41.63266372680664, + "learning_rate": 1.1536e-07, + "loss": 0.3944, + "step": 221250 + }, + { + "epoch": 2.213, + "grad_norm": 93.48374938964844, + "learning_rate": 1.1516e-07, + "loss": 0.4368, + "step": 221300 + }, + { + "epoch": 2.2135, + "grad_norm": 11.143013000488281, + "learning_rate": 1.1496e-07, + "loss": 0.4911, + "step": 221350 + }, + { + "epoch": 2.214, + "grad_norm": 0.3506889045238495, + "learning_rate": 1.1475999999999999e-07, + "loss": 0.4476, + "step": 221400 + }, + { + "epoch": 2.2145, + "grad_norm": 5.332769393920898, + "learning_rate": 1.1455999999999999e-07, + "loss": 0.3238, + "step": 221450 + }, + { + "epoch": 2.215, + "grad_norm": 69.08348083496094, + "learning_rate": 1.1436e-07, + "loss": 0.4466, + "step": 221500 + }, + { + "epoch": 2.2155, + "grad_norm": 62.9998664855957, + "learning_rate": 1.1416e-07, + "loss": 0.4017, + "step": 221550 + }, + { + "epoch": 2.216, + "grad_norm": 66.38433074951172, + "learning_rate": 1.1396e-07, + "loss": 0.4719, + "step": 221600 + }, + { + "epoch": 2.2165, + "grad_norm": 123.8575668334961, + "learning_rate": 1.1376e-07, + "loss": 0.3695, + "step": 221650 + }, + { + "epoch": 2.217, + "grad_norm": 9.564665794372559, + "learning_rate": 1.1355999999999999e-07, + "loss": 0.3205, + "step": 221700 + }, + { + "epoch": 2.2175, + "grad_norm": 28.491945266723633, + "learning_rate": 1.1335999999999999e-07, + "loss": 0.4048, + "step": 221750 + }, + { + "epoch": 2.218, + "grad_norm": 20.753578186035156, + "learning_rate": 1.1315999999999999e-07, + "loss": 0.4333, + "step": 221800 + }, + { + "epoch": 2.2185, + "grad_norm": 77.56809997558594, + "learning_rate": 1.1296e-07, + "loss": 0.5474, + "step": 221850 + }, + { + "epoch": 2.219, + "grad_norm": 6.686479091644287, + "learning_rate": 1.1276e-07, + "loss": 0.4709, + "step": 221900 + }, + { + "epoch": 2.2195, + "grad_norm": 1.914489507675171, + "learning_rate": 1.1255999999999998e-07, + "loss": 0.3998, + "step": 221950 + }, + { + "epoch": 2.22, + "grad_norm": 0.9213142991065979, + "learning_rate": 1.1236e-07, + "loss": 0.378, + "step": 222000 + }, + { + "epoch": 2.2205, + "grad_norm": 115.55097198486328, + "learning_rate": 1.1215999999999999e-07, + "loss": 0.4008, + "step": 222050 + }, + { + "epoch": 2.221, + "grad_norm": 72.356201171875, + "learning_rate": 1.1196e-07, + "loss": 0.3891, + "step": 222100 + }, + { + "epoch": 2.2215, + "grad_norm": 6.992410659790039, + "learning_rate": 1.1175999999999999e-07, + "loss": 0.3579, + "step": 222150 + }, + { + "epoch": 2.222, + "grad_norm": 72.04354858398438, + "learning_rate": 1.1156e-07, + "loss": 0.402, + "step": 222200 + }, + { + "epoch": 2.2225, + "grad_norm": 112.33955383300781, + "learning_rate": 1.1136e-07, + "loss": 0.4516, + "step": 222250 + }, + { + "epoch": 2.223, + "grad_norm": 45.285400390625, + "learning_rate": 1.1115999999999998e-07, + "loss": 0.3904, + "step": 222300 + }, + { + "epoch": 2.2235, + "grad_norm": 0.01933535747230053, + "learning_rate": 1.1096e-07, + "loss": 0.4005, + "step": 222350 + }, + { + "epoch": 2.224, + "grad_norm": 2.9697704315185547, + "learning_rate": 1.1076e-07, + "loss": 0.4953, + "step": 222400 + }, + { + "epoch": 2.2245, + "grad_norm": 143.373779296875, + "learning_rate": 1.1056e-07, + "loss": 0.3947, + "step": 222450 + }, + { + "epoch": 2.225, + "grad_norm": 2.298295259475708, + "learning_rate": 1.1035999999999999e-07, + "loss": 0.3572, + "step": 222500 + }, + { + "epoch": 2.2255, + "grad_norm": 4.240357875823975, + "learning_rate": 1.1015999999999999e-07, + "loss": 0.4634, + "step": 222550 + }, + { + "epoch": 2.226, + "grad_norm": 19.71664810180664, + "learning_rate": 1.0996e-07, + "loss": 0.4539, + "step": 222600 + }, + { + "epoch": 2.2265, + "grad_norm": 20.2246150970459, + "learning_rate": 1.0975999999999998e-07, + "loss": 0.4125, + "step": 222650 + }, + { + "epoch": 2.227, + "grad_norm": 9.409420013427734, + "learning_rate": 1.0956e-07, + "loss": 0.4532, + "step": 222700 + }, + { + "epoch": 2.2275, + "grad_norm": 2.0112721920013428, + "learning_rate": 1.0936e-07, + "loss": 0.4254, + "step": 222750 + }, + { + "epoch": 2.228, + "grad_norm": 107.7596664428711, + "learning_rate": 1.0915999999999999e-07, + "loss": 0.4481, + "step": 222800 + }, + { + "epoch": 2.2285, + "grad_norm": 64.18367767333984, + "learning_rate": 1.0895999999999999e-07, + "loss": 0.5081, + "step": 222850 + }, + { + "epoch": 2.229, + "grad_norm": 31.307912826538086, + "learning_rate": 1.0875999999999999e-07, + "loss": 0.411, + "step": 222900 + }, + { + "epoch": 2.2295, + "grad_norm": 62.03761291503906, + "learning_rate": 1.0856e-07, + "loss": 0.5153, + "step": 222950 + }, + { + "epoch": 2.23, + "grad_norm": 68.89613342285156, + "learning_rate": 1.0836e-07, + "loss": 0.3965, + "step": 223000 + }, + { + "epoch": 2.2305, + "grad_norm": 10.568138122558594, + "learning_rate": 1.0816e-07, + "loss": 0.3033, + "step": 223050 + }, + { + "epoch": 2.231, + "grad_norm": 17.958087921142578, + "learning_rate": 1.07964e-07, + "loss": 0.428, + "step": 223100 + }, + { + "epoch": 2.2315, + "grad_norm": 1.1294639110565186, + "learning_rate": 1.07764e-07, + "loss": 0.4191, + "step": 223150 + }, + { + "epoch": 2.232, + "grad_norm": 31.283260345458984, + "learning_rate": 1.07564e-07, + "loss": 0.3648, + "step": 223200 + }, + { + "epoch": 2.2325, + "grad_norm": 74.15252685546875, + "learning_rate": 1.07364e-07, + "loss": 0.3545, + "step": 223250 + }, + { + "epoch": 2.233, + "grad_norm": 56.10187911987305, + "learning_rate": 1.0716399999999999e-07, + "loss": 0.5046, + "step": 223300 + }, + { + "epoch": 2.2335, + "grad_norm": 19.8614444732666, + "learning_rate": 1.06964e-07, + "loss": 0.4121, + "step": 223350 + }, + { + "epoch": 2.234, + "grad_norm": 52.8697395324707, + "learning_rate": 1.0676399999999999e-07, + "loss": 0.4443, + "step": 223400 + }, + { + "epoch": 2.2345, + "grad_norm": 15.52030086517334, + "learning_rate": 1.06564e-07, + "loss": 0.4393, + "step": 223450 + }, + { + "epoch": 2.235, + "grad_norm": 13.279940605163574, + "learning_rate": 1.06364e-07, + "loss": 0.5055, + "step": 223500 + }, + { + "epoch": 2.2355, + "grad_norm": 35.77288818359375, + "learning_rate": 1.0616399999999998e-07, + "loss": 0.3935, + "step": 223550 + }, + { + "epoch": 2.2359999999999998, + "grad_norm": 41.148807525634766, + "learning_rate": 1.05964e-07, + "loss": 0.5071, + "step": 223600 + }, + { + "epoch": 2.2365, + "grad_norm": 119.54339599609375, + "learning_rate": 1.0576399999999999e-07, + "loss": 0.5073, + "step": 223650 + }, + { + "epoch": 2.237, + "grad_norm": 4.346285343170166, + "learning_rate": 1.05564e-07, + "loss": 0.3924, + "step": 223700 + }, + { + "epoch": 2.2375, + "grad_norm": 22.041217803955078, + "learning_rate": 1.0536399999999999e-07, + "loss": 0.4077, + "step": 223750 + }, + { + "epoch": 2.238, + "grad_norm": 23.932233810424805, + "learning_rate": 1.0516399999999999e-07, + "loss": 0.4078, + "step": 223800 + }, + { + "epoch": 2.2385, + "grad_norm": 92.13134765625, + "learning_rate": 1.04964e-07, + "loss": 0.3978, + "step": 223850 + }, + { + "epoch": 2.239, + "grad_norm": 5.41093635559082, + "learning_rate": 1.04764e-07, + "loss": 0.3965, + "step": 223900 + }, + { + "epoch": 2.2395, + "grad_norm": 32.597747802734375, + "learning_rate": 1.04564e-07, + "loss": 0.4309, + "step": 223950 + }, + { + "epoch": 2.24, + "grad_norm": 76.27898406982422, + "learning_rate": 1.04364e-07, + "loss": 0.4534, + "step": 224000 + }, + { + "epoch": 2.2405, + "grad_norm": 77.87999725341797, + "learning_rate": 1.04164e-07, + "loss": 0.364, + "step": 224050 + }, + { + "epoch": 2.241, + "grad_norm": 83.9505615234375, + "learning_rate": 1.0396399999999999e-07, + "loss": 0.4332, + "step": 224100 + }, + { + "epoch": 2.2415, + "grad_norm": 92.03340148925781, + "learning_rate": 1.0376399999999999e-07, + "loss": 0.4405, + "step": 224150 + }, + { + "epoch": 2.242, + "grad_norm": 2.194648027420044, + "learning_rate": 1.03564e-07, + "loss": 0.5773, + "step": 224200 + }, + { + "epoch": 2.2425, + "grad_norm": 11.85653018951416, + "learning_rate": 1.03364e-07, + "loss": 0.2945, + "step": 224250 + }, + { + "epoch": 2.243, + "grad_norm": 77.39554595947266, + "learning_rate": 1.03164e-07, + "loss": 0.382, + "step": 224300 + }, + { + "epoch": 2.2435, + "grad_norm": 85.84123229980469, + "learning_rate": 1.02964e-07, + "loss": 0.3054, + "step": 224350 + }, + { + "epoch": 2.2439999999999998, + "grad_norm": 95.42002868652344, + "learning_rate": 1.0276399999999999e-07, + "loss": 0.4038, + "step": 224400 + }, + { + "epoch": 2.2445, + "grad_norm": 1.0675559043884277, + "learning_rate": 1.0256399999999999e-07, + "loss": 0.3923, + "step": 224450 + }, + { + "epoch": 2.245, + "grad_norm": 68.109130859375, + "learning_rate": 1.0236399999999999e-07, + "loss": 0.5414, + "step": 224500 + }, + { + "epoch": 2.2455, + "grad_norm": 14.229341506958008, + "learning_rate": 1.02164e-07, + "loss": 0.4831, + "step": 224550 + }, + { + "epoch": 2.246, + "grad_norm": 53.71510314941406, + "learning_rate": 1.01964e-07, + "loss": 0.4338, + "step": 224600 + }, + { + "epoch": 2.2465, + "grad_norm": 44.91522216796875, + "learning_rate": 1.0176399999999998e-07, + "loss": 0.4844, + "step": 224650 + }, + { + "epoch": 2.247, + "grad_norm": 138.286865234375, + "learning_rate": 1.01564e-07, + "loss": 0.361, + "step": 224700 + }, + { + "epoch": 2.2475, + "grad_norm": 43.51007080078125, + "learning_rate": 1.01364e-07, + "loss": 0.4108, + "step": 224750 + }, + { + "epoch": 2.248, + "grad_norm": 7.015733242034912, + "learning_rate": 1.01164e-07, + "loss": 0.3845, + "step": 224800 + }, + { + "epoch": 2.2485, + "grad_norm": 125.538330078125, + "learning_rate": 1.0096399999999999e-07, + "loss": 0.463, + "step": 224850 + }, + { + "epoch": 2.249, + "grad_norm": 7.6888346672058105, + "learning_rate": 1.00764e-07, + "loss": 0.4631, + "step": 224900 + }, + { + "epoch": 2.2495, + "grad_norm": 6.268250465393066, + "learning_rate": 1.00564e-07, + "loss": 0.2814, + "step": 224950 + }, + { + "epoch": 2.25, + "grad_norm": 1.0146398544311523, + "learning_rate": 1.0036399999999998e-07, + "loss": 0.4237, + "step": 225000 + }, + { + "epoch": 2.2505, + "grad_norm": 33.02668380737305, + "learning_rate": 1.00164e-07, + "loss": 0.3576, + "step": 225050 + }, + { + "epoch": 2.251, + "grad_norm": 97.76544189453125, + "learning_rate": 9.9964e-08, + "loss": 0.3949, + "step": 225100 + }, + { + "epoch": 2.2515, + "grad_norm": 78.87450408935547, + "learning_rate": 9.9764e-08, + "loss": 0.2917, + "step": 225150 + }, + { + "epoch": 2.252, + "grad_norm": 108.28045654296875, + "learning_rate": 9.956399999999999e-08, + "loss": 0.5591, + "step": 225200 + }, + { + "epoch": 2.2525, + "grad_norm": 65.84611511230469, + "learning_rate": 9.936399999999999e-08, + "loss": 0.3784, + "step": 225250 + }, + { + "epoch": 2.253, + "grad_norm": 20.012842178344727, + "learning_rate": 9.9164e-08, + "loss": 0.4402, + "step": 225300 + }, + { + "epoch": 2.2535, + "grad_norm": 1.915622353553772, + "learning_rate": 9.896399999999999e-08, + "loss": 0.4123, + "step": 225350 + }, + { + "epoch": 2.254, + "grad_norm": 19.519975662231445, + "learning_rate": 9.8764e-08, + "loss": 0.3652, + "step": 225400 + }, + { + "epoch": 2.2545, + "grad_norm": 87.41107940673828, + "learning_rate": 9.8564e-08, + "loss": 0.6044, + "step": 225450 + }, + { + "epoch": 2.255, + "grad_norm": 29.000362396240234, + "learning_rate": 9.836399999999999e-08, + "loss": 0.4545, + "step": 225500 + }, + { + "epoch": 2.2555, + "grad_norm": 70.12849426269531, + "learning_rate": 9.816399999999999e-08, + "loss": 0.4011, + "step": 225550 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 0.5577579140663147, + "learning_rate": 9.796399999999999e-08, + "loss": 0.4326, + "step": 225600 + }, + { + "epoch": 2.2565, + "grad_norm": 2.2879648208618164, + "learning_rate": 9.7764e-08, + "loss": 0.3948, + "step": 225650 + }, + { + "epoch": 2.257, + "grad_norm": 0.7911091446876526, + "learning_rate": 9.7564e-08, + "loss": 0.5271, + "step": 225700 + }, + { + "epoch": 2.2575, + "grad_norm": 52.76624298095703, + "learning_rate": 9.7364e-08, + "loss": 0.5534, + "step": 225750 + }, + { + "epoch": 2.258, + "grad_norm": 53.3375244140625, + "learning_rate": 9.7164e-08, + "loss": 0.4334, + "step": 225800 + }, + { + "epoch": 2.2585, + "grad_norm": 27.924394607543945, + "learning_rate": 9.6964e-08, + "loss": 0.2814, + "step": 225850 + }, + { + "epoch": 2.259, + "grad_norm": 48.85221481323242, + "learning_rate": 9.676399999999999e-08, + "loss": 0.3435, + "step": 225900 + }, + { + "epoch": 2.2595, + "grad_norm": 7.6830549240112305, + "learning_rate": 9.656399999999999e-08, + "loss": 0.5507, + "step": 225950 + }, + { + "epoch": 2.26, + "grad_norm": 2.447178840637207, + "learning_rate": 9.6364e-08, + "loss": 0.329, + "step": 226000 + }, + { + "epoch": 2.2605, + "grad_norm": 39.04615783691406, + "learning_rate": 9.6164e-08, + "loss": 0.4663, + "step": 226050 + }, + { + "epoch": 2.261, + "grad_norm": 4.852475643157959, + "learning_rate": 9.596399999999999e-08, + "loss": 0.5203, + "step": 226100 + }, + { + "epoch": 2.2615, + "grad_norm": 89.7945556640625, + "learning_rate": 9.5764e-08, + "loss": 0.4516, + "step": 226150 + }, + { + "epoch": 2.262, + "grad_norm": 35.014644622802734, + "learning_rate": 9.5564e-08, + "loss": 0.3496, + "step": 226200 + }, + { + "epoch": 2.2625, + "grad_norm": 57.93213653564453, + "learning_rate": 9.5364e-08, + "loss": 0.4799, + "step": 226250 + }, + { + "epoch": 2.263, + "grad_norm": 39.42817306518555, + "learning_rate": 9.516399999999999e-08, + "loss": 0.4357, + "step": 226300 + }, + { + "epoch": 2.2635, + "grad_norm": 41.9278450012207, + "learning_rate": 9.4964e-08, + "loss": 0.437, + "step": 226350 + }, + { + "epoch": 2.2640000000000002, + "grad_norm": 41.00620651245117, + "learning_rate": 9.4764e-08, + "loss": 0.417, + "step": 226400 + }, + { + "epoch": 2.2645, + "grad_norm": 90.83309936523438, + "learning_rate": 9.456399999999999e-08, + "loss": 0.4885, + "step": 226450 + }, + { + "epoch": 2.265, + "grad_norm": 36.397216796875, + "learning_rate": 9.4364e-08, + "loss": 0.4193, + "step": 226500 + }, + { + "epoch": 2.2655, + "grad_norm": 43.22880554199219, + "learning_rate": 9.4164e-08, + "loss": 0.4173, + "step": 226550 + }, + { + "epoch": 2.266, + "grad_norm": 144.54409790039062, + "learning_rate": 9.396400000000001e-08, + "loss": 0.3163, + "step": 226600 + }, + { + "epoch": 2.2665, + "grad_norm": 0.3794969916343689, + "learning_rate": 9.376399999999999e-08, + "loss": 0.3984, + "step": 226650 + }, + { + "epoch": 2.267, + "grad_norm": 48.13538360595703, + "learning_rate": 9.356399999999999e-08, + "loss": 0.3953, + "step": 226700 + }, + { + "epoch": 2.2675, + "grad_norm": 89.7881088256836, + "learning_rate": 9.3364e-08, + "loss": 0.5402, + "step": 226750 + }, + { + "epoch": 2.268, + "grad_norm": 87.7471923828125, + "learning_rate": 9.316399999999999e-08, + "loss": 0.438, + "step": 226800 + }, + { + "epoch": 2.2685, + "grad_norm": 22.916074752807617, + "learning_rate": 9.2964e-08, + "loss": 0.3621, + "step": 226850 + }, + { + "epoch": 2.269, + "grad_norm": 23.842510223388672, + "learning_rate": 9.2764e-08, + "loss": 0.351, + "step": 226900 + }, + { + "epoch": 2.2695, + "grad_norm": 86.34049224853516, + "learning_rate": 9.2564e-08, + "loss": 0.4619, + "step": 226950 + }, + { + "epoch": 2.27, + "grad_norm": 34.528133392333984, + "learning_rate": 9.236399999999999e-08, + "loss": 0.367, + "step": 227000 + }, + { + "epoch": 2.2705, + "grad_norm": 42.39723205566406, + "learning_rate": 9.216399999999999e-08, + "loss": 0.4345, + "step": 227050 + }, + { + "epoch": 2.271, + "grad_norm": 4.247285842895508, + "learning_rate": 9.1964e-08, + "loss": 0.6515, + "step": 227100 + }, + { + "epoch": 2.2715, + "grad_norm": 34.562137603759766, + "learning_rate": 9.1764e-08, + "loss": 0.4751, + "step": 227150 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 0.823958158493042, + "learning_rate": 9.1564e-08, + "loss": 0.4003, + "step": 227200 + }, + { + "epoch": 2.2725, + "grad_norm": 86.37012481689453, + "learning_rate": 9.1364e-08, + "loss": 0.4309, + "step": 227250 + }, + { + "epoch": 2.273, + "grad_norm": 108.56483459472656, + "learning_rate": 9.1164e-08, + "loss": 0.5015, + "step": 227300 + }, + { + "epoch": 2.2735, + "grad_norm": 73.98970031738281, + "learning_rate": 9.096399999999999e-08, + "loss": 0.3684, + "step": 227350 + }, + { + "epoch": 2.274, + "grad_norm": 18.87233543395996, + "learning_rate": 9.076399999999999e-08, + "loss": 0.4713, + "step": 227400 + }, + { + "epoch": 2.2745, + "grad_norm": 88.28274536132812, + "learning_rate": 9.0564e-08, + "loss": 0.5386, + "step": 227450 + }, + { + "epoch": 2.275, + "grad_norm": 63.17116165161133, + "learning_rate": 9.0364e-08, + "loss": 0.4483, + "step": 227500 + }, + { + "epoch": 2.2755, + "grad_norm": 28.934703826904297, + "learning_rate": 9.016399999999999e-08, + "loss": 0.4424, + "step": 227550 + }, + { + "epoch": 2.276, + "grad_norm": 61.9217414855957, + "learning_rate": 8.9964e-08, + "loss": 0.4156, + "step": 227600 + }, + { + "epoch": 2.2765, + "grad_norm": 75.80464172363281, + "learning_rate": 8.9764e-08, + "loss": 0.4462, + "step": 227650 + }, + { + "epoch": 2.277, + "grad_norm": 9.887880325317383, + "learning_rate": 8.956400000000001e-08, + "loss": 0.4142, + "step": 227700 + }, + { + "epoch": 2.2775, + "grad_norm": 46.1187858581543, + "learning_rate": 8.936399999999999e-08, + "loss": 0.3884, + "step": 227750 + }, + { + "epoch": 2.278, + "grad_norm": 42.99738311767578, + "learning_rate": 8.916399999999999e-08, + "loss": 0.4486, + "step": 227800 + }, + { + "epoch": 2.2785, + "grad_norm": 5.804699420928955, + "learning_rate": 8.8968e-08, + "loss": 0.3833, + "step": 227850 + }, + { + "epoch": 2.279, + "grad_norm": 20.068557739257812, + "learning_rate": 8.876799999999999e-08, + "loss": 0.4945, + "step": 227900 + }, + { + "epoch": 2.2795, + "grad_norm": 0.5861179828643799, + "learning_rate": 8.856799999999999e-08, + "loss": 0.4478, + "step": 227950 + }, + { + "epoch": 2.2800000000000002, + "grad_norm": 0.990058958530426, + "learning_rate": 8.8368e-08, + "loss": 0.3093, + "step": 228000 + }, + { + "epoch": 2.2805, + "grad_norm": 129.85740661621094, + "learning_rate": 8.816799999999999e-08, + "loss": 0.429, + "step": 228050 + }, + { + "epoch": 2.281, + "grad_norm": 29.716796875, + "learning_rate": 8.7968e-08, + "loss": 0.3996, + "step": 228100 + }, + { + "epoch": 2.2815, + "grad_norm": 58.8203239440918, + "learning_rate": 8.7768e-08, + "loss": 0.3056, + "step": 228150 + }, + { + "epoch": 2.282, + "grad_norm": 41.287166595458984, + "learning_rate": 8.7568e-08, + "loss": 0.321, + "step": 228200 + }, + { + "epoch": 2.2824999999999998, + "grad_norm": 9.633661270141602, + "learning_rate": 8.736799999999999e-08, + "loss": 0.52, + "step": 228250 + }, + { + "epoch": 2.283, + "grad_norm": 35.10003662109375, + "learning_rate": 8.716799999999999e-08, + "loss": 0.4273, + "step": 228300 + }, + { + "epoch": 2.2835, + "grad_norm": 8.269124031066895, + "learning_rate": 8.6968e-08, + "loss": 0.3968, + "step": 228350 + }, + { + "epoch": 2.284, + "grad_norm": 47.246002197265625, + "learning_rate": 8.6768e-08, + "loss": 0.6276, + "step": 228400 + }, + { + "epoch": 2.2845, + "grad_norm": 106.63582611083984, + "learning_rate": 8.6568e-08, + "loss": 0.3303, + "step": 228450 + }, + { + "epoch": 2.285, + "grad_norm": 5.064298152923584, + "learning_rate": 8.6368e-08, + "loss": 0.46, + "step": 228500 + }, + { + "epoch": 2.2855, + "grad_norm": 0.2619529962539673, + "learning_rate": 8.6168e-08, + "loss": 0.3292, + "step": 228550 + }, + { + "epoch": 2.286, + "grad_norm": 7.505974292755127, + "learning_rate": 8.596799999999999e-08, + "loss": 0.4411, + "step": 228600 + }, + { + "epoch": 2.2865, + "grad_norm": 69.65245056152344, + "learning_rate": 8.576799999999999e-08, + "loss": 0.4029, + "step": 228650 + }, + { + "epoch": 2.287, + "grad_norm": 57.54338073730469, + "learning_rate": 8.5568e-08, + "loss": 0.4123, + "step": 228700 + }, + { + "epoch": 2.2875, + "grad_norm": 78.23273468017578, + "learning_rate": 8.5368e-08, + "loss": 0.3677, + "step": 228750 + }, + { + "epoch": 2.288, + "grad_norm": 43.50226593017578, + "learning_rate": 8.516799999999999e-08, + "loss": 0.4841, + "step": 228800 + }, + { + "epoch": 2.2885, + "grad_norm": 160.9630126953125, + "learning_rate": 8.4968e-08, + "loss": 0.3561, + "step": 228850 + }, + { + "epoch": 2.289, + "grad_norm": 106.46874237060547, + "learning_rate": 8.4768e-08, + "loss": 0.3973, + "step": 228900 + }, + { + "epoch": 2.2895, + "grad_norm": 67.45094299316406, + "learning_rate": 8.4568e-08, + "loss": 0.498, + "step": 228950 + }, + { + "epoch": 2.29, + "grad_norm": 100.94178771972656, + "learning_rate": 8.436799999999999e-08, + "loss": 0.542, + "step": 229000 + }, + { + "epoch": 2.2904999999999998, + "grad_norm": 3.8312125205993652, + "learning_rate": 8.4168e-08, + "loss": 0.7298, + "step": 229050 + }, + { + "epoch": 2.291, + "grad_norm": 100.0070571899414, + "learning_rate": 8.3968e-08, + "loss": 0.4864, + "step": 229100 + }, + { + "epoch": 2.2915, + "grad_norm": 88.91947937011719, + "learning_rate": 8.376799999999999e-08, + "loss": 0.4731, + "step": 229150 + }, + { + "epoch": 2.292, + "grad_norm": 11.505712509155273, + "learning_rate": 8.3568e-08, + "loss": 0.42, + "step": 229200 + }, + { + "epoch": 2.2925, + "grad_norm": 50.03477478027344, + "learning_rate": 8.3368e-08, + "loss": 0.433, + "step": 229250 + }, + { + "epoch": 2.293, + "grad_norm": 62.71595001220703, + "learning_rate": 8.316800000000001e-08, + "loss": 0.4322, + "step": 229300 + }, + { + "epoch": 2.2935, + "grad_norm": 81.32498931884766, + "learning_rate": 8.296799999999999e-08, + "loss": 0.4962, + "step": 229350 + }, + { + "epoch": 2.294, + "grad_norm": 80.8553466796875, + "learning_rate": 8.276799999999999e-08, + "loss": 0.3587, + "step": 229400 + }, + { + "epoch": 2.2945, + "grad_norm": 77.47965240478516, + "learning_rate": 8.2568e-08, + "loss": 0.3187, + "step": 229450 + }, + { + "epoch": 2.295, + "grad_norm": 77.37553405761719, + "learning_rate": 8.236799999999999e-08, + "loss": 0.5271, + "step": 229500 + }, + { + "epoch": 2.2955, + "grad_norm": 0.49823853373527527, + "learning_rate": 8.2168e-08, + "loss": 0.3428, + "step": 229550 + }, + { + "epoch": 2.296, + "grad_norm": 2.033798933029175, + "learning_rate": 8.1968e-08, + "loss": 0.32, + "step": 229600 + }, + { + "epoch": 2.2965, + "grad_norm": 2.363724946975708, + "learning_rate": 8.1768e-08, + "loss": 0.2763, + "step": 229650 + }, + { + "epoch": 2.297, + "grad_norm": 42.67679977416992, + "learning_rate": 8.156799999999999e-08, + "loss": 0.3813, + "step": 229700 + }, + { + "epoch": 2.2975, + "grad_norm": 93.03001403808594, + "learning_rate": 8.1372e-08, + "loss": 0.4763, + "step": 229750 + }, + { + "epoch": 2.298, + "grad_norm": 23.316856384277344, + "learning_rate": 8.1172e-08, + "loss": 0.3295, + "step": 229800 + }, + { + "epoch": 2.2984999999999998, + "grad_norm": 57.193267822265625, + "learning_rate": 8.097199999999999e-08, + "loss": 0.5061, + "step": 229850 + }, + { + "epoch": 2.299, + "grad_norm": 22.44532585144043, + "learning_rate": 8.077199999999999e-08, + "loss": 0.3112, + "step": 229900 + }, + { + "epoch": 2.2995, + "grad_norm": 101.59669494628906, + "learning_rate": 8.0572e-08, + "loss": 0.3241, + "step": 229950 + }, + { + "epoch": 2.3, + "grad_norm": 78.62955474853516, + "learning_rate": 8.0372e-08, + "loss": 0.4214, + "step": 230000 + }, + { + "epoch": 2.3005, + "grad_norm": 55.72152328491211, + "learning_rate": 8.017199999999998e-08, + "loss": 0.3366, + "step": 230050 + }, + { + "epoch": 2.301, + "grad_norm": 105.1781997680664, + "learning_rate": 7.9972e-08, + "loss": 0.5508, + "step": 230100 + }, + { + "epoch": 2.3015, + "grad_norm": 32.745849609375, + "learning_rate": 7.9772e-08, + "loss": 0.3962, + "step": 230150 + }, + { + "epoch": 2.302, + "grad_norm": 4.188356399536133, + "learning_rate": 7.9572e-08, + "loss": 0.3405, + "step": 230200 + }, + { + "epoch": 2.3025, + "grad_norm": 86.73455047607422, + "learning_rate": 7.937199999999999e-08, + "loss": 0.309, + "step": 230250 + }, + { + "epoch": 2.303, + "grad_norm": 96.70193481445312, + "learning_rate": 7.9172e-08, + "loss": 0.3716, + "step": 230300 + }, + { + "epoch": 2.3035, + "grad_norm": 2.2225139141082764, + "learning_rate": 7.8972e-08, + "loss": 0.3604, + "step": 230350 + }, + { + "epoch": 2.304, + "grad_norm": 105.9201431274414, + "learning_rate": 7.877199999999999e-08, + "loss": 0.4714, + "step": 230400 + }, + { + "epoch": 2.3045, + "grad_norm": 83.0341567993164, + "learning_rate": 7.8572e-08, + "loss": 0.5562, + "step": 230450 + }, + { + "epoch": 2.305, + "grad_norm": 2.381653070449829, + "learning_rate": 7.8372e-08, + "loss": 0.4385, + "step": 230500 + }, + { + "epoch": 2.3055, + "grad_norm": 18.078433990478516, + "learning_rate": 7.817200000000001e-08, + "loss": 0.379, + "step": 230550 + }, + { + "epoch": 2.306, + "grad_norm": 27.159372329711914, + "learning_rate": 7.797199999999999e-08, + "loss": 0.3722, + "step": 230600 + }, + { + "epoch": 2.3064999999999998, + "grad_norm": 67.36116027832031, + "learning_rate": 7.777199999999999e-08, + "loss": 0.4527, + "step": 230650 + }, + { + "epoch": 2.307, + "grad_norm": 54.01509475708008, + "learning_rate": 7.7572e-08, + "loss": 0.3647, + "step": 230700 + }, + { + "epoch": 2.3075, + "grad_norm": 3.8897550106048584, + "learning_rate": 7.737199999999999e-08, + "loss": 0.5743, + "step": 230750 + }, + { + "epoch": 2.308, + "grad_norm": 0.7189392447471619, + "learning_rate": 7.7172e-08, + "loss": 0.4053, + "step": 230800 + }, + { + "epoch": 2.3085, + "grad_norm": 53.79637908935547, + "learning_rate": 7.6972e-08, + "loss": 0.6112, + "step": 230850 + }, + { + "epoch": 2.309, + "grad_norm": 29.953990936279297, + "learning_rate": 7.677200000000001e-08, + "loss": 0.4461, + "step": 230900 + }, + { + "epoch": 2.3095, + "grad_norm": 0.23817132413387299, + "learning_rate": 7.657199999999999e-08, + "loss": 0.4433, + "step": 230950 + }, + { + "epoch": 2.31, + "grad_norm": 72.25556945800781, + "learning_rate": 7.637199999999999e-08, + "loss": 0.3299, + "step": 231000 + }, + { + "epoch": 2.3105, + "grad_norm": 56.55588150024414, + "learning_rate": 7.6172e-08, + "loss": 0.4597, + "step": 231050 + }, + { + "epoch": 2.311, + "grad_norm": 94.5429916381836, + "learning_rate": 7.5972e-08, + "loss": 0.3237, + "step": 231100 + }, + { + "epoch": 2.3115, + "grad_norm": 65.64086151123047, + "learning_rate": 7.5772e-08, + "loss": 0.4391, + "step": 231150 + }, + { + "epoch": 2.312, + "grad_norm": 57.61141586303711, + "learning_rate": 7.5572e-08, + "loss": 0.3361, + "step": 231200 + }, + { + "epoch": 2.3125, + "grad_norm": 73.47406768798828, + "learning_rate": 7.5372e-08, + "loss": 0.4478, + "step": 231250 + }, + { + "epoch": 2.313, + "grad_norm": 8.852334976196289, + "learning_rate": 7.517199999999999e-08, + "loss": 0.56, + "step": 231300 + }, + { + "epoch": 2.3135, + "grad_norm": 86.10330200195312, + "learning_rate": 7.497199999999999e-08, + "loss": 0.3869, + "step": 231350 + }, + { + "epoch": 2.314, + "grad_norm": 26.84288787841797, + "learning_rate": 7.4772e-08, + "loss": 0.4664, + "step": 231400 + }, + { + "epoch": 2.3145, + "grad_norm": 28.752120971679688, + "learning_rate": 7.4572e-08, + "loss": 0.4008, + "step": 231450 + }, + { + "epoch": 2.315, + "grad_norm": 12.051288604736328, + "learning_rate": 7.437199999999999e-08, + "loss": 0.4183, + "step": 231500 + }, + { + "epoch": 2.3155, + "grad_norm": 60.44932556152344, + "learning_rate": 7.4172e-08, + "loss": 0.3753, + "step": 231550 + }, + { + "epoch": 2.316, + "grad_norm": 105.76837158203125, + "learning_rate": 7.3972e-08, + "loss": 0.4456, + "step": 231600 + }, + { + "epoch": 2.3165, + "grad_norm": 20.609785079956055, + "learning_rate": 7.377200000000001e-08, + "loss": 0.3352, + "step": 231650 + }, + { + "epoch": 2.317, + "grad_norm": 23.5540828704834, + "learning_rate": 7.357199999999999e-08, + "loss": 0.4725, + "step": 231700 + }, + { + "epoch": 2.3175, + "grad_norm": 9.055368423461914, + "learning_rate": 7.3372e-08, + "loss": 0.3954, + "step": 231750 + }, + { + "epoch": 2.318, + "grad_norm": 0.28335410356521606, + "learning_rate": 7.3172e-08, + "loss": 0.3075, + "step": 231800 + }, + { + "epoch": 2.3185000000000002, + "grad_norm": 142.01795959472656, + "learning_rate": 7.297199999999999e-08, + "loss": 0.454, + "step": 231850 + }, + { + "epoch": 2.319, + "grad_norm": 69.494873046875, + "learning_rate": 7.2772e-08, + "loss": 0.3512, + "step": 231900 + }, + { + "epoch": 2.3195, + "grad_norm": 144.35302734375, + "learning_rate": 7.2572e-08, + "loss": 0.3771, + "step": 231950 + }, + { + "epoch": 2.32, + "grad_norm": 44.31450653076172, + "learning_rate": 7.237200000000001e-08, + "loss": 0.3828, + "step": 232000 + }, + { + "epoch": 2.3205, + "grad_norm": 43.578590393066406, + "learning_rate": 7.217199999999999e-08, + "loss": 0.4416, + "step": 232050 + }, + { + "epoch": 2.321, + "grad_norm": 0.5683667659759521, + "learning_rate": 7.197199999999999e-08, + "loss": 0.3824, + "step": 232100 + }, + { + "epoch": 2.3215, + "grad_norm": 13.191206932067871, + "learning_rate": 7.1772e-08, + "loss": 0.4271, + "step": 232150 + }, + { + "epoch": 2.322, + "grad_norm": 89.80245971679688, + "learning_rate": 7.157199999999999e-08, + "loss": 0.3694, + "step": 232200 + }, + { + "epoch": 2.3225, + "grad_norm": 100.60424041748047, + "learning_rate": 7.1372e-08, + "loss": 0.4274, + "step": 232250 + }, + { + "epoch": 2.323, + "grad_norm": 21.0079288482666, + "learning_rate": 7.1172e-08, + "loss": 0.4433, + "step": 232300 + }, + { + "epoch": 2.3235, + "grad_norm": 80.20172882080078, + "learning_rate": 7.0972e-08, + "loss": 0.2979, + "step": 232350 + }, + { + "epoch": 2.324, + "grad_norm": 22.903003692626953, + "learning_rate": 7.0772e-08, + "loss": 0.3579, + "step": 232400 + }, + { + "epoch": 2.3245, + "grad_norm": 69.52226257324219, + "learning_rate": 7.057199999999999e-08, + "loss": 0.524, + "step": 232450 + }, + { + "epoch": 2.325, + "grad_norm": 54.36795425415039, + "learning_rate": 7.0372e-08, + "loss": 0.5302, + "step": 232500 + }, + { + "epoch": 2.3255, + "grad_norm": 3.2600345611572266, + "learning_rate": 7.0172e-08, + "loss": 0.3502, + "step": 232550 + }, + { + "epoch": 2.326, + "grad_norm": 8.641088485717773, + "learning_rate": 6.9972e-08, + "loss": 0.4258, + "step": 232600 + }, + { + "epoch": 2.3265000000000002, + "grad_norm": 0.2351902574300766, + "learning_rate": 6.9772e-08, + "loss": 0.4205, + "step": 232650 + }, + { + "epoch": 2.327, + "grad_norm": 5.2298126220703125, + "learning_rate": 6.9572e-08, + "loss": 0.2792, + "step": 232700 + }, + { + "epoch": 2.3275, + "grad_norm": 120.54157257080078, + "learning_rate": 6.9372e-08, + "loss": 0.3501, + "step": 232750 + }, + { + "epoch": 2.328, + "grad_norm": 129.25265502929688, + "learning_rate": 6.917199999999999e-08, + "loss": 0.5156, + "step": 232800 + }, + { + "epoch": 2.3285, + "grad_norm": 66.50567626953125, + "learning_rate": 6.8972e-08, + "loss": 0.5007, + "step": 232850 + }, + { + "epoch": 2.329, + "grad_norm": 80.42890930175781, + "learning_rate": 6.8772e-08, + "loss": 0.4191, + "step": 232900 + }, + { + "epoch": 2.3295, + "grad_norm": 84.22823333740234, + "learning_rate": 6.857199999999999e-08, + "loss": 0.4093, + "step": 232950 + }, + { + "epoch": 2.33, + "grad_norm": 60.78300094604492, + "learning_rate": 6.8372e-08, + "loss": 0.3831, + "step": 233000 + }, + { + "epoch": 2.3305, + "grad_norm": 9.814701080322266, + "learning_rate": 6.8172e-08, + "loss": 0.3795, + "step": 233050 + }, + { + "epoch": 2.331, + "grad_norm": 89.87799072265625, + "learning_rate": 6.797200000000001e-08, + "loss": 0.4236, + "step": 233100 + }, + { + "epoch": 2.3315, + "grad_norm": 51.194068908691406, + "learning_rate": 6.777199999999999e-08, + "loss": 0.3, + "step": 233150 + }, + { + "epoch": 2.332, + "grad_norm": 40.99766159057617, + "learning_rate": 6.757199999999999e-08, + "loss": 0.5982, + "step": 233200 + }, + { + "epoch": 2.3325, + "grad_norm": 97.21105194091797, + "learning_rate": 6.7372e-08, + "loss": 0.4268, + "step": 233250 + }, + { + "epoch": 2.333, + "grad_norm": 77.22876739501953, + "learning_rate": 6.717199999999999e-08, + "loss": 0.3962, + "step": 233300 + }, + { + "epoch": 2.3335, + "grad_norm": 24.11876106262207, + "learning_rate": 6.6972e-08, + "loss": 0.3584, + "step": 233350 + }, + { + "epoch": 2.334, + "grad_norm": 49.48302459716797, + "learning_rate": 6.6772e-08, + "loss": 0.3827, + "step": 233400 + }, + { + "epoch": 2.3345000000000002, + "grad_norm": 0.11236397176980972, + "learning_rate": 6.657200000000001e-08, + "loss": 0.2988, + "step": 233450 + }, + { + "epoch": 2.335, + "grad_norm": 7.154104232788086, + "learning_rate": 6.6372e-08, + "loss": 0.4928, + "step": 233500 + }, + { + "epoch": 2.3355, + "grad_norm": 2.846604824066162, + "learning_rate": 6.617199999999999e-08, + "loss": 0.4661, + "step": 233550 + }, + { + "epoch": 2.336, + "grad_norm": 93.85885620117188, + "learning_rate": 6.5972e-08, + "loss": 0.5282, + "step": 233600 + }, + { + "epoch": 2.3365, + "grad_norm": 14.930121421813965, + "learning_rate": 6.577199999999999e-08, + "loss": 0.3968, + "step": 233650 + }, + { + "epoch": 2.337, + "grad_norm": 50.37242889404297, + "learning_rate": 6.5572e-08, + "loss": 0.4821, + "step": 233700 + }, + { + "epoch": 2.3375, + "grad_norm": 7.079569339752197, + "learning_rate": 6.5372e-08, + "loss": 0.4573, + "step": 233750 + }, + { + "epoch": 2.338, + "grad_norm": 59.11067581176758, + "learning_rate": 6.5172e-08, + "loss": 0.4407, + "step": 233800 + }, + { + "epoch": 2.3385, + "grad_norm": 106.90413665771484, + "learning_rate": 6.4972e-08, + "loss": 0.5134, + "step": 233850 + }, + { + "epoch": 2.339, + "grad_norm": 88.19550323486328, + "learning_rate": 6.477199999999999e-08, + "loss": 0.3665, + "step": 233900 + }, + { + "epoch": 2.3395, + "grad_norm": 3.8314621448516846, + "learning_rate": 6.4572e-08, + "loss": 0.3177, + "step": 233950 + }, + { + "epoch": 2.34, + "grad_norm": 78.19933319091797, + "learning_rate": 6.4372e-08, + "loss": 0.4219, + "step": 234000 + }, + { + "epoch": 2.3405, + "grad_norm": 62.778011322021484, + "learning_rate": 6.4172e-08, + "loss": 0.3997, + "step": 234050 + }, + { + "epoch": 2.341, + "grad_norm": 73.71574401855469, + "learning_rate": 6.3972e-08, + "loss": 0.3116, + "step": 234100 + }, + { + "epoch": 2.3415, + "grad_norm": 54.49970245361328, + "learning_rate": 6.3772e-08, + "loss": 0.4426, + "step": 234150 + }, + { + "epoch": 2.342, + "grad_norm": 63.628170013427734, + "learning_rate": 6.3572e-08, + "loss": 0.4592, + "step": 234200 + }, + { + "epoch": 2.3425000000000002, + "grad_norm": 69.60954284667969, + "learning_rate": 6.337199999999999e-08, + "loss": 0.3635, + "step": 234250 + }, + { + "epoch": 2.343, + "grad_norm": 52.40366744995117, + "learning_rate": 6.3172e-08, + "loss": 0.538, + "step": 234300 + }, + { + "epoch": 2.3435, + "grad_norm": 86.80823516845703, + "learning_rate": 6.2972e-08, + "loss": 0.3849, + "step": 234350 + }, + { + "epoch": 2.344, + "grad_norm": 67.82777404785156, + "learning_rate": 6.277199999999999e-08, + "loss": 0.5053, + "step": 234400 + }, + { + "epoch": 2.3445, + "grad_norm": 63.045921325683594, + "learning_rate": 6.2572e-08, + "loss": 0.3757, + "step": 234450 + }, + { + "epoch": 2.3449999999999998, + "grad_norm": 64.31387329101562, + "learning_rate": 6.2372e-08, + "loss": 0.4009, + "step": 234500 + }, + { + "epoch": 2.3455, + "grad_norm": 43.08381652832031, + "learning_rate": 6.2172e-08, + "loss": 0.4228, + "step": 234550 + }, + { + "epoch": 2.346, + "grad_norm": 25.41963768005371, + "learning_rate": 6.1972e-08, + "loss": 0.3963, + "step": 234600 + }, + { + "epoch": 2.3465, + "grad_norm": 45.17002487182617, + "learning_rate": 6.177199999999999e-08, + "loss": 0.4244, + "step": 234650 + }, + { + "epoch": 2.347, + "grad_norm": 64.86937713623047, + "learning_rate": 6.1572e-08, + "loss": 0.433, + "step": 234700 + }, + { + "epoch": 2.3475, + "grad_norm": 122.25826263427734, + "learning_rate": 6.1372e-08, + "loss": 0.5875, + "step": 234750 + }, + { + "epoch": 2.348, + "grad_norm": 1.6496562957763672, + "learning_rate": 6.1172e-08, + "loss": 0.3615, + "step": 234800 + }, + { + "epoch": 2.3485, + "grad_norm": 34.062599182128906, + "learning_rate": 6.0972e-08, + "loss": 0.4072, + "step": 234850 + }, + { + "epoch": 2.349, + "grad_norm": 1.5793612003326416, + "learning_rate": 6.0772e-08, + "loss": 0.4464, + "step": 234900 + }, + { + "epoch": 2.3495, + "grad_norm": 17.111398696899414, + "learning_rate": 6.0572e-08, + "loss": 0.3973, + "step": 234950 + }, + { + "epoch": 2.35, + "grad_norm": 17.124202728271484, + "learning_rate": 6.037199999999999e-08, + "loss": 0.4139, + "step": 235000 + }, + { + "epoch": 2.3505, + "grad_norm": 20.528778076171875, + "learning_rate": 6.0172e-08, + "loss": 0.4128, + "step": 235050 + }, + { + "epoch": 2.351, + "grad_norm": 9.12598991394043, + "learning_rate": 5.997199999999999e-08, + "loss": 0.429, + "step": 235100 + }, + { + "epoch": 2.3515, + "grad_norm": 89.71222686767578, + "learning_rate": 5.9772e-08, + "loss": 0.4243, + "step": 235150 + }, + { + "epoch": 2.352, + "grad_norm": 8.675117492675781, + "learning_rate": 5.9576e-08, + "loss": 0.5467, + "step": 235200 + }, + { + "epoch": 2.3525, + "grad_norm": 82.99100494384766, + "learning_rate": 5.9375999999999995e-08, + "loss": 0.3422, + "step": 235250 + }, + { + "epoch": 2.3529999999999998, + "grad_norm": 77.30717468261719, + "learning_rate": 5.9176e-08, + "loss": 0.4311, + "step": 235300 + }, + { + "epoch": 2.3535, + "grad_norm": 58.811668395996094, + "learning_rate": 5.8976e-08, + "loss": 0.2901, + "step": 235350 + }, + { + "epoch": 2.354, + "grad_norm": 18.311758041381836, + "learning_rate": 5.8775999999999996e-08, + "loss": 0.602, + "step": 235400 + }, + { + "epoch": 2.3545, + "grad_norm": 54.06341552734375, + "learning_rate": 5.8576e-08, + "loss": 0.3226, + "step": 235450 + }, + { + "epoch": 2.355, + "grad_norm": 3.558945655822754, + "learning_rate": 5.837599999999999e-08, + "loss": 0.5484, + "step": 235500 + }, + { + "epoch": 2.3555, + "grad_norm": 40.88227081298828, + "learning_rate": 5.8176e-08, + "loss": 0.2963, + "step": 235550 + }, + { + "epoch": 2.356, + "grad_norm": 4.487430572509766, + "learning_rate": 5.7979999999999994e-08, + "loss": 0.4298, + "step": 235600 + }, + { + "epoch": 2.3565, + "grad_norm": 83.76229858398438, + "learning_rate": 5.778e-08, + "loss": 0.4383, + "step": 235650 + }, + { + "epoch": 2.357, + "grad_norm": 25.388702392578125, + "learning_rate": 5.758e-08, + "loss": 0.4707, + "step": 235700 + }, + { + "epoch": 2.3575, + "grad_norm": 21.608808517456055, + "learning_rate": 5.7379999999999995e-08, + "loss": 0.3404, + "step": 235750 + }, + { + "epoch": 2.358, + "grad_norm": 95.14769744873047, + "learning_rate": 5.718e-08, + "loss": 0.4012, + "step": 235800 + }, + { + "epoch": 2.3585, + "grad_norm": 29.87248420715332, + "learning_rate": 5.698e-08, + "loss": 0.6228, + "step": 235850 + }, + { + "epoch": 2.359, + "grad_norm": 119.35411834716797, + "learning_rate": 5.6779999999999996e-08, + "loss": 0.4535, + "step": 235900 + }, + { + "epoch": 2.3595, + "grad_norm": 22.905563354492188, + "learning_rate": 5.6579999999999994e-08, + "loss": 0.2687, + "step": 235950 + }, + { + "epoch": 2.36, + "grad_norm": 112.05744934082031, + "learning_rate": 5.638e-08, + "loss": 0.3915, + "step": 236000 + }, + { + "epoch": 2.3605, + "grad_norm": 51.91557693481445, + "learning_rate": 5.618e-08, + "loss": 0.4123, + "step": 236050 + }, + { + "epoch": 2.3609999999999998, + "grad_norm": 35.96802520751953, + "learning_rate": 5.598e-08, + "loss": 0.3404, + "step": 236100 + }, + { + "epoch": 2.3615, + "grad_norm": 39.31023406982422, + "learning_rate": 5.578e-08, + "loss": 0.362, + "step": 236150 + }, + { + "epoch": 2.362, + "grad_norm": 100.35054016113281, + "learning_rate": 5.557999999999999e-08, + "loss": 0.5586, + "step": 236200 + }, + { + "epoch": 2.3625, + "grad_norm": 0.3581673502922058, + "learning_rate": 5.538e-08, + "loss": 0.3348, + "step": 236250 + }, + { + "epoch": 2.363, + "grad_norm": 7.026209831237793, + "learning_rate": 5.5179999999999995e-08, + "loss": 0.4421, + "step": 236300 + }, + { + "epoch": 2.3635, + "grad_norm": 1.0426843166351318, + "learning_rate": 5.498e-08, + "loss": 0.4958, + "step": 236350 + }, + { + "epoch": 2.364, + "grad_norm": 86.80310821533203, + "learning_rate": 5.478e-08, + "loss": 0.3704, + "step": 236400 + }, + { + "epoch": 2.3645, + "grad_norm": 99.3370590209961, + "learning_rate": 5.4579999999999996e-08, + "loss": 0.5521, + "step": 236450 + }, + { + "epoch": 2.365, + "grad_norm": 73.29484558105469, + "learning_rate": 5.4384e-08, + "loss": 0.4607, + "step": 236500 + }, + { + "epoch": 2.3655, + "grad_norm": 94.66316986083984, + "learning_rate": 5.4184e-08, + "loss": 0.4448, + "step": 236550 + }, + { + "epoch": 2.366, + "grad_norm": 4.878780364990234, + "learning_rate": 5.3983999999999996e-08, + "loss": 0.3542, + "step": 236600 + }, + { + "epoch": 2.3665, + "grad_norm": 38.552696228027344, + "learning_rate": 5.3783999999999994e-08, + "loss": 0.3363, + "step": 236650 + }, + { + "epoch": 2.367, + "grad_norm": 106.73291015625, + "learning_rate": 5.3584e-08, + "loss": 0.3413, + "step": 236700 + }, + { + "epoch": 2.3675, + "grad_norm": 42.5386848449707, + "learning_rate": 5.3384e-08, + "loss": 0.3877, + "step": 236750 + }, + { + "epoch": 2.368, + "grad_norm": 53.35752487182617, + "learning_rate": 5.3184e-08, + "loss": 0.4686, + "step": 236800 + }, + { + "epoch": 2.3685, + "grad_norm": 7.68386173248291, + "learning_rate": 5.2984e-08, + "loss": 0.4841, + "step": 236850 + }, + { + "epoch": 2.3689999999999998, + "grad_norm": 121.2600326538086, + "learning_rate": 5.2784e-08, + "loss": 0.4746, + "step": 236900 + }, + { + "epoch": 2.3695, + "grad_norm": 25.649150848388672, + "learning_rate": 5.2583999999999996e-08, + "loss": 0.365, + "step": 236950 + }, + { + "epoch": 2.37, + "grad_norm": 19.756162643432617, + "learning_rate": 5.2383999999999994e-08, + "loss": 0.3344, + "step": 237000 + }, + { + "epoch": 2.3705, + "grad_norm": 20.484275817871094, + "learning_rate": 5.2184e-08, + "loss": 0.5115, + "step": 237050 + }, + { + "epoch": 2.371, + "grad_norm": 111.54764556884766, + "learning_rate": 5.1984e-08, + "loss": 0.3719, + "step": 237100 + }, + { + "epoch": 2.3715, + "grad_norm": 2.243807315826416, + "learning_rate": 5.1783999999999996e-08, + "loss": 0.4457, + "step": 237150 + }, + { + "epoch": 2.372, + "grad_norm": 76.7511978149414, + "learning_rate": 5.1583999999999994e-08, + "loss": 0.3587, + "step": 237200 + }, + { + "epoch": 2.3725, + "grad_norm": 82.79983520507812, + "learning_rate": 5.1384e-08, + "loss": 0.4723, + "step": 237250 + }, + { + "epoch": 2.373, + "grad_norm": 76.18513488769531, + "learning_rate": 5.1184e-08, + "loss": 0.4799, + "step": 237300 + }, + { + "epoch": 2.3735, + "grad_norm": 94.5965805053711, + "learning_rate": 5.0984e-08, + "loss": 0.4241, + "step": 237350 + }, + { + "epoch": 2.374, + "grad_norm": 0.5150562524795532, + "learning_rate": 5.0784e-08, + "loss": 0.4691, + "step": 237400 + }, + { + "epoch": 2.3745, + "grad_norm": 3.979444742202759, + "learning_rate": 5.058399999999999e-08, + "loss": 0.3656, + "step": 237450 + }, + { + "epoch": 2.375, + "grad_norm": 49.238975524902344, + "learning_rate": 5.0383999999999996e-08, + "loss": 0.4712, + "step": 237500 + }, + { + "epoch": 2.3755, + "grad_norm": 82.24160766601562, + "learning_rate": 5.0183999999999995e-08, + "loss": 0.3209, + "step": 237550 + }, + { + "epoch": 2.376, + "grad_norm": 106.60427856445312, + "learning_rate": 4.9984e-08, + "loss": 0.5374, + "step": 237600 + }, + { + "epoch": 2.3765, + "grad_norm": 5.549964904785156, + "learning_rate": 4.9784e-08, + "loss": 0.3677, + "step": 237650 + }, + { + "epoch": 2.377, + "grad_norm": 106.1153564453125, + "learning_rate": 4.9584e-08, + "loss": 0.5558, + "step": 237700 + }, + { + "epoch": 2.3775, + "grad_norm": 6.455342769622803, + "learning_rate": 4.9383999999999994e-08, + "loss": 0.53, + "step": 237750 + }, + { + "epoch": 2.378, + "grad_norm": 28.48943519592285, + "learning_rate": 4.9184e-08, + "loss": 0.618, + "step": 237800 + }, + { + "epoch": 2.3785, + "grad_norm": 3.647404432296753, + "learning_rate": 4.8984e-08, + "loss": 0.5573, + "step": 237850 + }, + { + "epoch": 2.379, + "grad_norm": 3.0441486835479736, + "learning_rate": 4.8784e-08, + "loss": 0.3326, + "step": 237900 + }, + { + "epoch": 2.3795, + "grad_norm": 44.93407440185547, + "learning_rate": 4.8584e-08, + "loss": 0.3113, + "step": 237950 + }, + { + "epoch": 2.38, + "grad_norm": 73.76074981689453, + "learning_rate": 4.8384e-08, + "loss": 0.3362, + "step": 238000 + }, + { + "epoch": 2.3805, + "grad_norm": 7.347684383392334, + "learning_rate": 4.8183999999999997e-08, + "loss": 0.4527, + "step": 238050 + }, + { + "epoch": 2.3810000000000002, + "grad_norm": 61.60081481933594, + "learning_rate": 4.7983999999999995e-08, + "loss": 0.464, + "step": 238100 + }, + { + "epoch": 2.3815, + "grad_norm": 11.561792373657227, + "learning_rate": 4.7784e-08, + "loss": 0.5532, + "step": 238150 + }, + { + "epoch": 2.382, + "grad_norm": 1.321579933166504, + "learning_rate": 4.7584e-08, + "loss": 0.4313, + "step": 238200 + }, + { + "epoch": 2.3825, + "grad_norm": 35.636898040771484, + "learning_rate": 4.7384e-08, + "loss": 0.4452, + "step": 238250 + }, + { + "epoch": 2.383, + "grad_norm": 82.60275268554688, + "learning_rate": 4.7183999999999994e-08, + "loss": 0.4888, + "step": 238300 + }, + { + "epoch": 2.3835, + "grad_norm": 10.286293029785156, + "learning_rate": 4.6984e-08, + "loss": 0.328, + "step": 238350 + }, + { + "epoch": 2.384, + "grad_norm": 126.57600402832031, + "learning_rate": 4.6784e-08, + "loss": 0.4364, + "step": 238400 + }, + { + "epoch": 2.3845, + "grad_norm": 0.09401095658540726, + "learning_rate": 4.6583999999999995e-08, + "loss": 0.3141, + "step": 238450 + }, + { + "epoch": 2.385, + "grad_norm": 42.13533401489258, + "learning_rate": 4.6384e-08, + "loss": 0.5262, + "step": 238500 + }, + { + "epoch": 2.3855, + "grad_norm": 8.396677017211914, + "learning_rate": 4.6184e-08, + "loss": 0.3766, + "step": 238550 + }, + { + "epoch": 2.386, + "grad_norm": 12.288100242614746, + "learning_rate": 4.5984e-08, + "loss": 0.4144, + "step": 238600 + }, + { + "epoch": 2.3865, + "grad_norm": 81.89070892333984, + "learning_rate": 4.5783999999999995e-08, + "loss": 0.3493, + "step": 238650 + }, + { + "epoch": 2.387, + "grad_norm": 104.77812194824219, + "learning_rate": 4.5584e-08, + "loss": 0.329, + "step": 238700 + }, + { + "epoch": 2.3875, + "grad_norm": 111.44982147216797, + "learning_rate": 4.5384e-08, + "loss": 0.3988, + "step": 238750 + }, + { + "epoch": 2.388, + "grad_norm": 96.20793151855469, + "learning_rate": 4.5184e-08, + "loss": 0.2894, + "step": 238800 + }, + { + "epoch": 2.3885, + "grad_norm": 67.85469818115234, + "learning_rate": 4.4984e-08, + "loss": 0.3836, + "step": 238850 + }, + { + "epoch": 2.3890000000000002, + "grad_norm": 53.94377899169922, + "learning_rate": 4.478399999999999e-08, + "loss": 0.4408, + "step": 238900 + }, + { + "epoch": 2.3895, + "grad_norm": 174.99449157714844, + "learning_rate": 4.4584e-08, + "loss": 0.454, + "step": 238950 + }, + { + "epoch": 2.39, + "grad_norm": 9.08309555053711, + "learning_rate": 4.4383999999999996e-08, + "loss": 0.4012, + "step": 239000 + }, + { + "epoch": 2.3905, + "grad_norm": 7.337883949279785, + "learning_rate": 4.4184e-08, + "loss": 0.4567, + "step": 239050 + }, + { + "epoch": 2.391, + "grad_norm": 118.68801879882812, + "learning_rate": 4.3984e-08, + "loss": 0.4913, + "step": 239100 + }, + { + "epoch": 2.3915, + "grad_norm": 119.5619888305664, + "learning_rate": 4.3784e-08, + "loss": 0.3864, + "step": 239150 + }, + { + "epoch": 2.392, + "grad_norm": 63.690818786621094, + "learning_rate": 4.3583999999999995e-08, + "loss": 0.3754, + "step": 239200 + }, + { + "epoch": 2.3925, + "grad_norm": 15.803923606872559, + "learning_rate": 4.3384e-08, + "loss": 0.4492, + "step": 239250 + }, + { + "epoch": 2.393, + "grad_norm": 13.975971221923828, + "learning_rate": 4.3184e-08, + "loss": 0.3106, + "step": 239300 + }, + { + "epoch": 2.3935, + "grad_norm": 6.938758373260498, + "learning_rate": 4.2983999999999996e-08, + "loss": 0.3647, + "step": 239350 + }, + { + "epoch": 2.394, + "grad_norm": 80.98222351074219, + "learning_rate": 4.2784e-08, + "loss": 0.365, + "step": 239400 + }, + { + "epoch": 2.3945, + "grad_norm": 11.557998657226562, + "learning_rate": 4.258399999999999e-08, + "loss": 0.4769, + "step": 239450 + }, + { + "epoch": 2.395, + "grad_norm": 20.222549438476562, + "learning_rate": 4.2384e-08, + "loss": 0.5143, + "step": 239500 + }, + { + "epoch": 2.3955, + "grad_norm": 61.61232376098633, + "learning_rate": 4.2183999999999996e-08, + "loss": 0.4041, + "step": 239550 + }, + { + "epoch": 2.396, + "grad_norm": 40.21515655517578, + "learning_rate": 4.1988e-08, + "loss": 0.4189, + "step": 239600 + }, + { + "epoch": 2.3965, + "grad_norm": 39.18798065185547, + "learning_rate": 4.1788e-08, + "loss": 0.2413, + "step": 239650 + }, + { + "epoch": 2.3970000000000002, + "grad_norm": 0.4177449345588684, + "learning_rate": 4.1587999999999995e-08, + "loss": 0.3866, + "step": 239700 + }, + { + "epoch": 2.3975, + "grad_norm": 32.55744171142578, + "learning_rate": 4.1388e-08, + "loss": 0.3252, + "step": 239750 + }, + { + "epoch": 2.398, + "grad_norm": 93.50446319580078, + "learning_rate": 4.1188e-08, + "loss": 0.5187, + "step": 239800 + }, + { + "epoch": 2.3985, + "grad_norm": 21.071910858154297, + "learning_rate": 4.0987999999999996e-08, + "loss": 0.4274, + "step": 239850 + }, + { + "epoch": 2.399, + "grad_norm": 156.43629455566406, + "learning_rate": 4.0787999999999994e-08, + "loss": 0.4526, + "step": 239900 + }, + { + "epoch": 2.3995, + "grad_norm": 71.36151123046875, + "learning_rate": 4.0588e-08, + "loss": 0.4707, + "step": 239950 + }, + { + "epoch": 2.4, + "grad_norm": 17.978219985961914, + "learning_rate": 4.0388e-08, + "loss": 0.4547, + "step": 240000 + }, + { + "epoch": 2.4005, + "grad_norm": 17.83655548095703, + "learning_rate": 4.0188e-08, + "loss": 0.5657, + "step": 240050 + }, + { + "epoch": 2.401, + "grad_norm": 139.66326904296875, + "learning_rate": 3.9988e-08, + "loss": 0.356, + "step": 240100 + }, + { + "epoch": 2.4015, + "grad_norm": 20.223098754882812, + "learning_rate": 3.978799999999999e-08, + "loss": 0.382, + "step": 240150 + }, + { + "epoch": 2.402, + "grad_norm": 2.374896287918091, + "learning_rate": 3.9588e-08, + "loss": 0.4235, + "step": 240200 + }, + { + "epoch": 2.4025, + "grad_norm": 60.51106643676758, + "learning_rate": 3.9387999999999995e-08, + "loss": 0.6068, + "step": 240250 + }, + { + "epoch": 2.403, + "grad_norm": 72.06840515136719, + "learning_rate": 3.9188e-08, + "loss": 0.474, + "step": 240300 + }, + { + "epoch": 2.4035, + "grad_norm": 78.26203155517578, + "learning_rate": 3.8988e-08, + "loss": 0.4211, + "step": 240350 + }, + { + "epoch": 2.404, + "grad_norm": 30.761234283447266, + "learning_rate": 3.8788e-08, + "loss": 0.2871, + "step": 240400 + }, + { + "epoch": 2.4045, + "grad_norm": 20.948881149291992, + "learning_rate": 3.8587999999999995e-08, + "loss": 0.4948, + "step": 240450 + }, + { + "epoch": 2.4050000000000002, + "grad_norm": 2.7093822956085205, + "learning_rate": 3.8388e-08, + "loss": 0.256, + "step": 240500 + }, + { + "epoch": 2.4055, + "grad_norm": 8.741806983947754, + "learning_rate": 3.8188e-08, + "loss": 0.4368, + "step": 240550 + }, + { + "epoch": 2.406, + "grad_norm": 13.452826499938965, + "learning_rate": 3.7988e-08, + "loss": 0.3249, + "step": 240600 + }, + { + "epoch": 2.4065, + "grad_norm": 9.17270565032959, + "learning_rate": 3.7788e-08, + "loss": 0.434, + "step": 240650 + }, + { + "epoch": 2.407, + "grad_norm": 44.4083366394043, + "learning_rate": 3.7588e-08, + "loss": 0.37, + "step": 240700 + }, + { + "epoch": 2.4074999999999998, + "grad_norm": 5.229351997375488, + "learning_rate": 3.7388e-08, + "loss": 0.3604, + "step": 240750 + }, + { + "epoch": 2.408, + "grad_norm": 113.33466339111328, + "learning_rate": 3.7187999999999995e-08, + "loss": 0.3738, + "step": 240800 + }, + { + "epoch": 2.4085, + "grad_norm": 66.20381927490234, + "learning_rate": 3.6988e-08, + "loss": 0.3353, + "step": 240850 + }, + { + "epoch": 2.409, + "grad_norm": 15.30612850189209, + "learning_rate": 3.6788e-08, + "loss": 0.2977, + "step": 240900 + }, + { + "epoch": 2.4095, + "grad_norm": 0.48715662956237793, + "learning_rate": 3.6588e-08, + "loss": 0.3838, + "step": 240950 + }, + { + "epoch": 2.41, + "grad_norm": 21.8796329498291, + "learning_rate": 3.6387999999999995e-08, + "loss": 0.5768, + "step": 241000 + }, + { + "epoch": 2.4105, + "grad_norm": 58.25382995605469, + "learning_rate": 3.6188e-08, + "loss": 0.222, + "step": 241050 + }, + { + "epoch": 2.411, + "grad_norm": 75.87273406982422, + "learning_rate": 3.5988e-08, + "loss": 0.3897, + "step": 241100 + }, + { + "epoch": 2.4115, + "grad_norm": 132.18612670898438, + "learning_rate": 3.5787999999999996e-08, + "loss": 0.4991, + "step": 241150 + }, + { + "epoch": 2.412, + "grad_norm": 93.72396087646484, + "learning_rate": 3.5588e-08, + "loss": 0.5212, + "step": 241200 + }, + { + "epoch": 2.4125, + "grad_norm": 73.5746841430664, + "learning_rate": 3.5388e-08, + "loss": 0.4832, + "step": 241250 + }, + { + "epoch": 2.413, + "grad_norm": 0.44722920656204224, + "learning_rate": 3.5188e-08, + "loss": 0.3307, + "step": 241300 + }, + { + "epoch": 2.4135, + "grad_norm": 12.440936088562012, + "learning_rate": 3.4987999999999995e-08, + "loss": 0.4114, + "step": 241350 + }, + { + "epoch": 2.414, + "grad_norm": 5.505441665649414, + "learning_rate": 3.4788e-08, + "loss": 0.4136, + "step": 241400 + }, + { + "epoch": 2.4145, + "grad_norm": 54.741905212402344, + "learning_rate": 3.4588e-08, + "loss": 0.4845, + "step": 241450 + }, + { + "epoch": 2.415, + "grad_norm": 3.8523218631744385, + "learning_rate": 3.4388e-08, + "loss": 0.3512, + "step": 241500 + }, + { + "epoch": 2.4154999999999998, + "grad_norm": 60.24558639526367, + "learning_rate": 3.4188e-08, + "loss": 0.43, + "step": 241550 + }, + { + "epoch": 2.416, + "grad_norm": 103.20320129394531, + "learning_rate": 3.398799999999999e-08, + "loss": 0.5293, + "step": 241600 + }, + { + "epoch": 2.4165, + "grad_norm": 43.67532730102539, + "learning_rate": 3.3788e-08, + "loss": 0.4799, + "step": 241650 + }, + { + "epoch": 2.417, + "grad_norm": 12.198750495910645, + "learning_rate": 3.3587999999999996e-08, + "loss": 0.4764, + "step": 241700 + }, + { + "epoch": 2.4175, + "grad_norm": 44.76414489746094, + "learning_rate": 3.3388e-08, + "loss": 0.3694, + "step": 241750 + }, + { + "epoch": 2.418, + "grad_norm": 93.81781005859375, + "learning_rate": 3.3188e-08, + "loss": 0.4577, + "step": 241800 + }, + { + "epoch": 2.4185, + "grad_norm": 86.02700805664062, + "learning_rate": 3.2988000000000004e-08, + "loss": 0.337, + "step": 241850 + }, + { + "epoch": 2.419, + "grad_norm": 23.790729522705078, + "learning_rate": 3.2787999999999996e-08, + "loss": 0.3581, + "step": 241900 + }, + { + "epoch": 2.4195, + "grad_norm": 80.7553482055664, + "learning_rate": 3.2588e-08, + "loss": 0.4819, + "step": 241950 + }, + { + "epoch": 2.42, + "grad_norm": 100.54222106933594, + "learning_rate": 3.2388e-08, + "loss": 0.3009, + "step": 242000 + }, + { + "epoch": 2.4205, + "grad_norm": 69.31678009033203, + "learning_rate": 3.2188e-08, + "loss": 0.414, + "step": 242050 + }, + { + "epoch": 2.421, + "grad_norm": 8.001131057739258, + "learning_rate": 3.1988e-08, + "loss": 0.5114, + "step": 242100 + }, + { + "epoch": 2.4215, + "grad_norm": 96.0322036743164, + "learning_rate": 3.178799999999999e-08, + "loss": 0.4587, + "step": 242150 + }, + { + "epoch": 2.422, + "grad_norm": 8.513936042785645, + "learning_rate": 3.1588e-08, + "loss": 0.3816, + "step": 242200 + }, + { + "epoch": 2.4225, + "grad_norm": 53.89067077636719, + "learning_rate": 3.1387999999999996e-08, + "loss": 0.4928, + "step": 242250 + }, + { + "epoch": 2.423, + "grad_norm": 98.1548080444336, + "learning_rate": 3.1188e-08, + "loss": 0.4295, + "step": 242300 + }, + { + "epoch": 2.4234999999999998, + "grad_norm": 4.35904598236084, + "learning_rate": 3.0988e-08, + "loss": 0.4473, + "step": 242350 + }, + { + "epoch": 2.424, + "grad_norm": 72.66747283935547, + "learning_rate": 3.0788e-08, + "loss": 0.4041, + "step": 242400 + }, + { + "epoch": 2.4245, + "grad_norm": 275.8028259277344, + "learning_rate": 3.0588e-08, + "loss": 0.384, + "step": 242450 + }, + { + "epoch": 2.425, + "grad_norm": 74.81790924072266, + "learning_rate": 3.0387999999999994e-08, + "loss": 0.4163, + "step": 242500 + }, + { + "epoch": 2.4255, + "grad_norm": 2.4196512699127197, + "learning_rate": 3.0188e-08, + "loss": 0.4067, + "step": 242550 + }, + { + "epoch": 2.426, + "grad_norm": 60.739994049072266, + "learning_rate": 2.9988e-08, + "loss": 0.4061, + "step": 242600 + }, + { + "epoch": 2.4265, + "grad_norm": 11.086175918579102, + "learning_rate": 2.9788e-08, + "loss": 0.343, + "step": 242650 + }, + { + "epoch": 2.427, + "grad_norm": 9.039807319641113, + "learning_rate": 2.9588e-08, + "loss": 0.3752, + "step": 242700 + }, + { + "epoch": 2.4275, + "grad_norm": 1.2201632261276245, + "learning_rate": 2.9387999999999998e-08, + "loss": 0.3769, + "step": 242750 + }, + { + "epoch": 2.428, + "grad_norm": 46.109474182128906, + "learning_rate": 2.9187999999999996e-08, + "loss": 0.4977, + "step": 242800 + }, + { + "epoch": 2.4285, + "grad_norm": 4.451110363006592, + "learning_rate": 2.8987999999999998e-08, + "loss": 0.4644, + "step": 242850 + }, + { + "epoch": 2.429, + "grad_norm": 78.92597961425781, + "learning_rate": 2.8788e-08, + "loss": 0.3742, + "step": 242900 + }, + { + "epoch": 2.4295, + "grad_norm": 4.9124908447265625, + "learning_rate": 2.8587999999999998e-08, + "loss": 0.3717, + "step": 242950 + }, + { + "epoch": 2.43, + "grad_norm": 6.718141555786133, + "learning_rate": 2.8388e-08, + "loss": 0.4299, + "step": 243000 + }, + { + "epoch": 2.4305, + "grad_norm": 95.63214874267578, + "learning_rate": 2.8188e-08, + "loss": 0.3924, + "step": 243050 + }, + { + "epoch": 2.431, + "grad_norm": 7.528963088989258, + "learning_rate": 2.7988e-08, + "loss": 0.5847, + "step": 243100 + }, + { + "epoch": 2.4314999999999998, + "grad_norm": 2.7887654304504395, + "learning_rate": 2.7788e-08, + "loss": 0.3462, + "step": 243150 + }, + { + "epoch": 2.432, + "grad_norm": 17.53578758239746, + "learning_rate": 2.7588e-08, + "loss": 0.4067, + "step": 243200 + }, + { + "epoch": 2.4325, + "grad_norm": 88.1779556274414, + "learning_rate": 2.7387999999999997e-08, + "loss": 0.3385, + "step": 243250 + }, + { + "epoch": 2.433, + "grad_norm": 100.06237030029297, + "learning_rate": 2.7187999999999998e-08, + "loss": 0.4189, + "step": 243300 + }, + { + "epoch": 2.4335, + "grad_norm": 89.0728759765625, + "learning_rate": 2.6988e-08, + "loss": 0.3789, + "step": 243350 + }, + { + "epoch": 2.434, + "grad_norm": 0.17192931473255157, + "learning_rate": 2.6787999999999998e-08, + "loss": 0.3695, + "step": 243400 + }, + { + "epoch": 2.4345, + "grad_norm": 8.71643352508545, + "learning_rate": 2.6588e-08, + "loss": 0.5545, + "step": 243450 + }, + { + "epoch": 2.435, + "grad_norm": 14.25753116607666, + "learning_rate": 2.6388e-08, + "loss": 0.4481, + "step": 243500 + }, + { + "epoch": 2.4355, + "grad_norm": 29.616668701171875, + "learning_rate": 2.6188e-08, + "loss": 0.2906, + "step": 243550 + }, + { + "epoch": 2.436, + "grad_norm": 126.89151000976562, + "learning_rate": 2.5988e-08, + "loss": 0.5021, + "step": 243600 + }, + { + "epoch": 2.4365, + "grad_norm": 36.65034103393555, + "learning_rate": 2.5787999999999996e-08, + "loss": 0.4752, + "step": 243650 + }, + { + "epoch": 2.437, + "grad_norm": 52.342201232910156, + "learning_rate": 2.5587999999999997e-08, + "loss": 0.4993, + "step": 243700 + }, + { + "epoch": 2.4375, + "grad_norm": 14.278721809387207, + "learning_rate": 2.5388e-08, + "loss": 0.4432, + "step": 243750 + }, + { + "epoch": 2.438, + "grad_norm": 100.52146911621094, + "learning_rate": 2.5187999999999997e-08, + "loss": 0.4158, + "step": 243800 + }, + { + "epoch": 2.4385, + "grad_norm": 1.8151702880859375, + "learning_rate": 2.4988e-08, + "loss": 0.3256, + "step": 243850 + }, + { + "epoch": 2.439, + "grad_norm": 2.068545341491699, + "learning_rate": 2.4788e-08, + "loss": 0.3985, + "step": 243900 + }, + { + "epoch": 2.4395, + "grad_norm": 114.05219268798828, + "learning_rate": 2.4587999999999998e-08, + "loss": 0.5039, + "step": 243950 + }, + { + "epoch": 2.44, + "grad_norm": 22.46074104309082, + "learning_rate": 2.4388e-08, + "loss": 0.3068, + "step": 244000 + }, + { + "epoch": 2.4405, + "grad_norm": 49.89002990722656, + "learning_rate": 2.4188e-08, + "loss": 0.5444, + "step": 244050 + }, + { + "epoch": 2.441, + "grad_norm": 116.9295883178711, + "learning_rate": 2.3988e-08, + "loss": 0.401, + "step": 244100 + }, + { + "epoch": 2.4415, + "grad_norm": 159.10597229003906, + "learning_rate": 2.3787999999999998e-08, + "loss": 0.5854, + "step": 244150 + }, + { + "epoch": 2.442, + "grad_norm": 72.9452896118164, + "learning_rate": 2.3588e-08, + "loss": 0.5169, + "step": 244200 + }, + { + "epoch": 2.4425, + "grad_norm": 59.4945068359375, + "learning_rate": 2.3387999999999997e-08, + "loss": 0.5033, + "step": 244250 + }, + { + "epoch": 2.443, + "grad_norm": 110.84677124023438, + "learning_rate": 2.3188e-08, + "loss": 0.446, + "step": 244300 + }, + { + "epoch": 2.4435000000000002, + "grad_norm": 46.814674377441406, + "learning_rate": 2.2988e-08, + "loss": 0.4448, + "step": 244350 + }, + { + "epoch": 2.444, + "grad_norm": 42.03607940673828, + "learning_rate": 2.2788e-08, + "loss": 0.4175, + "step": 244400 + }, + { + "epoch": 2.4445, + "grad_norm": 4.896573543548584, + "learning_rate": 2.2588e-08, + "loss": 0.3549, + "step": 244450 + }, + { + "epoch": 2.445, + "grad_norm": 66.42455291748047, + "learning_rate": 2.2388000000000002e-08, + "loss": 0.5136, + "step": 244500 + }, + { + "epoch": 2.4455, + "grad_norm": 9.06808090209961, + "learning_rate": 2.2188e-08, + "loss": 0.4, + "step": 244550 + }, + { + "epoch": 2.446, + "grad_norm": 32.235755920410156, + "learning_rate": 2.1987999999999998e-08, + "loss": 0.3744, + "step": 244600 + }, + { + "epoch": 2.4465, + "grad_norm": 87.67384338378906, + "learning_rate": 2.1787999999999996e-08, + "loss": 0.4273, + "step": 244650 + }, + { + "epoch": 2.447, + "grad_norm": 72.4257583618164, + "learning_rate": 2.1587999999999998e-08, + "loss": 0.5571, + "step": 244700 + }, + { + "epoch": 2.4475, + "grad_norm": 0.4879385530948639, + "learning_rate": 2.1388e-08, + "loss": 0.5196, + "step": 244750 + }, + { + "epoch": 2.448, + "grad_norm": 80.70654296875, + "learning_rate": 2.1187999999999998e-08, + "loss": 0.5049, + "step": 244800 + }, + { + "epoch": 2.4485, + "grad_norm": 38.23407745361328, + "learning_rate": 2.0988e-08, + "loss": 0.3874, + "step": 244850 + }, + { + "epoch": 2.449, + "grad_norm": 31.929292678833008, + "learning_rate": 2.0788e-08, + "loss": 0.433, + "step": 244900 + }, + { + "epoch": 2.4495, + "grad_norm": 21.76283836364746, + "learning_rate": 2.0588e-08, + "loss": 0.41, + "step": 244950 + }, + { + "epoch": 2.45, + "grad_norm": 8.751602172851562, + "learning_rate": 2.0391999999999998e-08, + "loss": 0.3788, + "step": 245000 + }, + { + "epoch": 2.4505, + "grad_norm": 10.776887893676758, + "learning_rate": 2.0192e-08, + "loss": 0.4291, + "step": 245050 + }, + { + "epoch": 2.451, + "grad_norm": 0.36848610639572144, + "learning_rate": 1.9991999999999998e-08, + "loss": 0.4365, + "step": 245100 + }, + { + "epoch": 2.4515000000000002, + "grad_norm": 23.090919494628906, + "learning_rate": 1.9792e-08, + "loss": 0.406, + "step": 245150 + }, + { + "epoch": 2.452, + "grad_norm": 25.339353561401367, + "learning_rate": 1.9591999999999998e-08, + "loss": 0.4708, + "step": 245200 + }, + { + "epoch": 2.4525, + "grad_norm": 52.52699661254883, + "learning_rate": 1.9392e-08, + "loss": 0.4524, + "step": 245250 + }, + { + "epoch": 2.453, + "grad_norm": 1.461986780166626, + "learning_rate": 1.9192e-08, + "loss": 0.4664, + "step": 245300 + }, + { + "epoch": 2.4535, + "grad_norm": 95.0424575805664, + "learning_rate": 1.8992e-08, + "loss": 0.3294, + "step": 245350 + }, + { + "epoch": 2.454, + "grad_norm": 91.5193862915039, + "learning_rate": 1.8792e-08, + "loss": 0.3741, + "step": 245400 + }, + { + "epoch": 2.4545, + "grad_norm": 95.59048461914062, + "learning_rate": 1.8592e-08, + "loss": 0.5858, + "step": 245450 + }, + { + "epoch": 2.455, + "grad_norm": 1.0647406578063965, + "learning_rate": 1.8391999999999997e-08, + "loss": 0.3868, + "step": 245500 + }, + { + "epoch": 2.4555, + "grad_norm": 36.12507629394531, + "learning_rate": 1.8192e-08, + "loss": 0.269, + "step": 245550 + }, + { + "epoch": 2.456, + "grad_norm": 31.336393356323242, + "learning_rate": 1.7992e-08, + "loss": 0.388, + "step": 245600 + }, + { + "epoch": 2.4565, + "grad_norm": 103.9297866821289, + "learning_rate": 1.7791999999999998e-08, + "loss": 0.4473, + "step": 245650 + }, + { + "epoch": 2.457, + "grad_norm": 63.19395446777344, + "learning_rate": 1.7592e-08, + "loss": 0.4356, + "step": 245700 + }, + { + "epoch": 2.4575, + "grad_norm": 72.82255554199219, + "learning_rate": 1.7392e-08, + "loss": 0.434, + "step": 245750 + }, + { + "epoch": 2.458, + "grad_norm": 25.266742706298828, + "learning_rate": 1.7192e-08, + "loss": 0.4078, + "step": 245800 + }, + { + "epoch": 2.4585, + "grad_norm": 26.895408630371094, + "learning_rate": 1.6992e-08, + "loss": 0.5463, + "step": 245850 + }, + { + "epoch": 2.459, + "grad_norm": 18.793058395385742, + "learning_rate": 1.6792e-08, + "loss": 0.4784, + "step": 245900 + }, + { + "epoch": 2.4595000000000002, + "grad_norm": 1.1856598854064941, + "learning_rate": 1.6591999999999997e-08, + "loss": 0.3484, + "step": 245950 + }, + { + "epoch": 2.46, + "grad_norm": 1.1075769662857056, + "learning_rate": 1.6392e-08, + "loss": 0.4504, + "step": 246000 + }, + { + "epoch": 2.4605, + "grad_norm": 47.581871032714844, + "learning_rate": 1.6192e-08, + "loss": 0.4725, + "step": 246050 + }, + { + "epoch": 2.461, + "grad_norm": 132.03213500976562, + "learning_rate": 1.5992e-08, + "loss": 0.46, + "step": 246100 + }, + { + "epoch": 2.4615, + "grad_norm": 129.3053741455078, + "learning_rate": 1.5792e-08, + "loss": 0.4021, + "step": 246150 + }, + { + "epoch": 2.462, + "grad_norm": 94.52012634277344, + "learning_rate": 1.5591999999999998e-08, + "loss": 0.5046, + "step": 246200 + }, + { + "epoch": 2.4625, + "grad_norm": 2.470003128051758, + "learning_rate": 1.5392e-08, + "loss": 0.4378, + "step": 246250 + }, + { + "epoch": 2.463, + "grad_norm": 65.46109008789062, + "learning_rate": 1.5192e-08, + "loss": 0.3905, + "step": 246300 + }, + { + "epoch": 2.4635, + "grad_norm": 29.295551300048828, + "learning_rate": 1.4992e-08, + "loss": 0.4354, + "step": 246350 + }, + { + "epoch": 2.464, + "grad_norm": 46.023048400878906, + "learning_rate": 1.4792e-08, + "loss": 0.3686, + "step": 246400 + }, + { + "epoch": 2.4645, + "grad_norm": 2.7170820236206055, + "learning_rate": 1.4592000000000001e-08, + "loss": 0.4175, + "step": 246450 + }, + { + "epoch": 2.465, + "grad_norm": 46.63579559326172, + "learning_rate": 1.4391999999999999e-08, + "loss": 0.476, + "step": 246500 + }, + { + "epoch": 2.4655, + "grad_norm": 31.72504997253418, + "learning_rate": 1.4191999999999999e-08, + "loss": 0.4872, + "step": 246550 + }, + { + "epoch": 2.466, + "grad_norm": 2.155219793319702, + "learning_rate": 1.3991999999999999e-08, + "loss": 0.3661, + "step": 246600 + }, + { + "epoch": 2.4665, + "grad_norm": 100.10980987548828, + "learning_rate": 1.3792e-08, + "loss": 0.4025, + "step": 246650 + }, + { + "epoch": 2.467, + "grad_norm": 40.943275451660156, + "learning_rate": 1.3591999999999999e-08, + "loss": 0.4854, + "step": 246700 + }, + { + "epoch": 2.4675000000000002, + "grad_norm": 68.8076400756836, + "learning_rate": 1.3391999999999998e-08, + "loss": 0.3888, + "step": 246750 + }, + { + "epoch": 2.468, + "grad_norm": 53.90141296386719, + "learning_rate": 1.3192e-08, + "loss": 0.3383, + "step": 246800 + }, + { + "epoch": 2.4685, + "grad_norm": 65.13260650634766, + "learning_rate": 1.2992e-08, + "loss": 0.4943, + "step": 246850 + }, + { + "epoch": 2.469, + "grad_norm": 9.727241516113281, + "learning_rate": 1.2792e-08, + "loss": 0.5687, + "step": 246900 + }, + { + "epoch": 2.4695, + "grad_norm": 45.80133056640625, + "learning_rate": 1.2592e-08, + "loss": 0.2708, + "step": 246950 + }, + { + "epoch": 2.4699999999999998, + "grad_norm": 95.85955810546875, + "learning_rate": 1.2392e-08, + "loss": 0.4595, + "step": 247000 + }, + { + "epoch": 2.4705, + "grad_norm": 1.5538358688354492, + "learning_rate": 1.2192e-08, + "loss": 0.2399, + "step": 247050 + }, + { + "epoch": 2.471, + "grad_norm": 87.2671890258789, + "learning_rate": 1.1991999999999999e-08, + "loss": 0.4663, + "step": 247100 + }, + { + "epoch": 2.4715, + "grad_norm": 4.15252161026001, + "learning_rate": 1.1792e-08, + "loss": 0.4194, + "step": 247150 + }, + { + "epoch": 2.472, + "grad_norm": 51.74559783935547, + "learning_rate": 1.1591999999999999e-08, + "loss": 0.3372, + "step": 247200 + }, + { + "epoch": 2.4725, + "grad_norm": 66.37777709960938, + "learning_rate": 1.1391999999999999e-08, + "loss": 0.3389, + "step": 247250 + }, + { + "epoch": 2.473, + "grad_norm": 105.33214569091797, + "learning_rate": 1.1195999999999998e-08, + "loss": 0.4773, + "step": 247300 + }, + { + "epoch": 2.4735, + "grad_norm": 44.509883880615234, + "learning_rate": 1.0996e-08, + "loss": 0.3653, + "step": 247350 + }, + { + "epoch": 2.474, + "grad_norm": 85.23392486572266, + "learning_rate": 1.0796e-08, + "loss": 0.3496, + "step": 247400 + }, + { + "epoch": 2.4745, + "grad_norm": 91.08809661865234, + "learning_rate": 1.0596e-08, + "loss": 0.451, + "step": 247450 + }, + { + "epoch": 2.475, + "grad_norm": 51.84107208251953, + "learning_rate": 1.0396000000000001e-08, + "loss": 0.4562, + "step": 247500 + }, + { + "epoch": 2.4755, + "grad_norm": 89.71752166748047, + "learning_rate": 1.0195999999999999e-08, + "loss": 0.3289, + "step": 247550 + }, + { + "epoch": 2.476, + "grad_norm": 30.12227439880371, + "learning_rate": 9.995999999999999e-09, + "loss": 0.3847, + "step": 247600 + }, + { + "epoch": 2.4765, + "grad_norm": 138.02243041992188, + "learning_rate": 9.795999999999999e-09, + "loss": 0.7013, + "step": 247650 + }, + { + "epoch": 2.477, + "grad_norm": 9.041874885559082, + "learning_rate": 9.596e-09, + "loss": 0.5115, + "step": 247700 + }, + { + "epoch": 2.4775, + "grad_norm": 2.327457904815674, + "learning_rate": 9.396e-09, + "loss": 0.3211, + "step": 247750 + }, + { + "epoch": 2.4779999999999998, + "grad_norm": 6.0508317947387695, + "learning_rate": 9.195999999999998e-09, + "loss": 0.5396, + "step": 247800 + }, + { + "epoch": 2.4785, + "grad_norm": 26.49738883972168, + "learning_rate": 8.996e-09, + "loss": 0.2972, + "step": 247850 + }, + { + "epoch": 2.479, + "grad_norm": 70.40245819091797, + "learning_rate": 8.796e-09, + "loss": 0.526, + "step": 247900 + }, + { + "epoch": 2.4795, + "grad_norm": 70.888916015625, + "learning_rate": 8.596e-09, + "loss": 0.4678, + "step": 247950 + }, + { + "epoch": 2.48, + "grad_norm": 2.654665231704712, + "learning_rate": 8.396e-09, + "loss": 0.5367, + "step": 248000 + }, + { + "epoch": 2.4805, + "grad_norm": 16.878707885742188, + "learning_rate": 8.196e-09, + "loss": 0.2801, + "step": 248050 + }, + { + "epoch": 2.481, + "grad_norm": 47.633583068847656, + "learning_rate": 7.996e-09, + "loss": 0.3872, + "step": 248100 + }, + { + "epoch": 2.4815, + "grad_norm": 77.5364990234375, + "learning_rate": 7.795999999999999e-09, + "loss": 0.4372, + "step": 248150 + }, + { + "epoch": 2.482, + "grad_norm": 57.8331184387207, + "learning_rate": 7.596e-09, + "loss": 0.5912, + "step": 248200 + }, + { + "epoch": 2.4825, + "grad_norm": 64.51912689208984, + "learning_rate": 7.396e-09, + "loss": 0.4503, + "step": 248250 + }, + { + "epoch": 2.483, + "grad_norm": 10.555734634399414, + "learning_rate": 7.1959999999999996e-09, + "loss": 0.4095, + "step": 248300 + }, + { + "epoch": 2.4835, + "grad_norm": 58.91648483276367, + "learning_rate": 6.9959999999999994e-09, + "loss": 0.4893, + "step": 248350 + }, + { + "epoch": 2.484, + "grad_norm": 81.10791778564453, + "learning_rate": 6.795999999999999e-09, + "loss": 0.3603, + "step": 248400 + }, + { + "epoch": 2.4845, + "grad_norm": 69.9582748413086, + "learning_rate": 6.596e-09, + "loss": 0.3966, + "step": 248450 + }, + { + "epoch": 2.485, + "grad_norm": 2.605888843536377, + "learning_rate": 6.396e-09, + "loss": 0.4878, + "step": 248500 + }, + { + "epoch": 2.4855, + "grad_norm": 92.16374206542969, + "learning_rate": 6.196e-09, + "loss": 0.4327, + "step": 248550 + }, + { + "epoch": 2.4859999999999998, + "grad_norm": 90.9197998046875, + "learning_rate": 5.9959999999999996e-09, + "loss": 0.3959, + "step": 248600 + }, + { + "epoch": 2.4865, + "grad_norm": 76.15708923339844, + "learning_rate": 5.7959999999999994e-09, + "loss": 0.3849, + "step": 248650 + }, + { + "epoch": 2.487, + "grad_norm": 32.75033187866211, + "learning_rate": 5.596e-09, + "loss": 0.4641, + "step": 248700 + }, + { + "epoch": 2.4875, + "grad_norm": 103.26265716552734, + "learning_rate": 5.395999999999999e-09, + "loss": 0.3382, + "step": 248750 + }, + { + "epoch": 2.488, + "grad_norm": 68.11605072021484, + "learning_rate": 5.196e-09, + "loss": 0.3231, + "step": 248800 + }, + { + "epoch": 2.4885, + "grad_norm": 4.257381439208984, + "learning_rate": 4.996e-09, + "loss": 0.4907, + "step": 248850 + }, + { + "epoch": 2.489, + "grad_norm": 33.47376251220703, + "learning_rate": 4.7959999999999996e-09, + "loss": 0.3289, + "step": 248900 + }, + { + "epoch": 2.4895, + "grad_norm": 49.842369079589844, + "learning_rate": 4.596e-09, + "loss": 0.234, + "step": 248950 + }, + { + "epoch": 2.49, + "grad_norm": 3.662029504776001, + "learning_rate": 4.395999999999999e-09, + "loss": 0.3642, + "step": 249000 + }, + { + "epoch": 2.4905, + "grad_norm": 112.91320037841797, + "learning_rate": 4.196e-09, + "loss": 0.437, + "step": 249050 + }, + { + "epoch": 2.491, + "grad_norm": 2.2020812034606934, + "learning_rate": 3.996e-09, + "loss": 0.4311, + "step": 249100 + }, + { + "epoch": 2.4915, + "grad_norm": 20.80237579345703, + "learning_rate": 3.796e-09, + "loss": 0.4193, + "step": 249150 + }, + { + "epoch": 2.492, + "grad_norm": 47.812049865722656, + "learning_rate": 3.5959999999999996e-09, + "loss": 0.2837, + "step": 249200 + }, + { + "epoch": 2.4925, + "grad_norm": 14.877352714538574, + "learning_rate": 3.396e-09, + "loss": 0.4079, + "step": 249250 + }, + { + "epoch": 2.493, + "grad_norm": 31.925203323364258, + "learning_rate": 3.1959999999999997e-09, + "loss": 0.4154, + "step": 249300 + }, + { + "epoch": 2.4935, + "grad_norm": 7.781558513641357, + "learning_rate": 2.996e-09, + "loss": 0.3361, + "step": 249350 + }, + { + "epoch": 2.4939999999999998, + "grad_norm": 0.3294135630130768, + "learning_rate": 2.796e-09, + "loss": 0.4179, + "step": 249400 + }, + { + "epoch": 2.4945, + "grad_norm": 64.53218078613281, + "learning_rate": 2.5959999999999997e-09, + "loss": 0.4882, + "step": 249450 + }, + { + "epoch": 2.495, + "grad_norm": 44.542545318603516, + "learning_rate": 2.396e-09, + "loss": 0.3351, + "step": 249500 + }, + { + "epoch": 2.4955, + "grad_norm": 18.63250160217285, + "learning_rate": 2.196e-09, + "loss": 0.4181, + "step": 249550 + }, + { + "epoch": 2.496, + "grad_norm": 42.67049789428711, + "learning_rate": 1.9959999999999997e-09, + "loss": 0.4524, + "step": 249600 + }, + { + "epoch": 2.4965, + "grad_norm": 4.257511615753174, + "learning_rate": 1.796e-09, + "loss": 0.3714, + "step": 249650 + }, + { + "epoch": 2.497, + "grad_norm": 38.738487243652344, + "learning_rate": 1.5959999999999999e-09, + "loss": 0.3591, + "step": 249700 + }, + { + "epoch": 2.4975, + "grad_norm": 50.39963150024414, + "learning_rate": 1.3960000000000001e-09, + "loss": 0.3474, + "step": 249750 + }, + { + "epoch": 2.498, + "grad_norm": 69.15322875976562, + "learning_rate": 1.196e-09, + "loss": 0.5561, + "step": 249800 + }, + { + "epoch": 2.4985, + "grad_norm": 7.292196273803711, + "learning_rate": 9.959999999999999e-10, + "loss": 0.5342, + "step": 249850 + }, + { + "epoch": 2.499, + "grad_norm": 1.3633363246917725, + "learning_rate": 7.96e-10, + "loss": 0.4622, + "step": 249900 + }, + { + "epoch": 2.4995, + "grad_norm": 82.1470947265625, + "learning_rate": 5.959999999999999e-10, + "loss": 0.5702, + "step": 249950 + }, + { + "epoch": 2.5, + "grad_norm": 84.26116943359375, + "learning_rate": 3.96e-10, + "loss": 0.4723, + "step": 250000 + } + ], + "logging_steps": 50, + "max_steps": 250000, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}