diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,66551 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 9503, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 36.36130527170318, + "learning_rate": 3.4965034965034967e-08, + "loss": 1.8874, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 44.35509081795796, + "learning_rate": 6.993006993006993e-08, + "loss": 1.9623, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 113.15059681611625, + "learning_rate": 1.048951048951049e-07, + "loss": 1.8315, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 26.480686239957798, + "learning_rate": 1.3986013986013987e-07, + "loss": 1.8153, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 73.55538638754199, + "learning_rate": 1.7482517482517484e-07, + "loss": 1.8261, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 44.47995785747746, + "learning_rate": 2.097902097902098e-07, + "loss": 1.7684, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 41.19958226318581, + "learning_rate": 2.447552447552448e-07, + "loss": 1.7842, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 162.3686148569218, + "learning_rate": 2.7972027972027973e-07, + "loss": 1.8389, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 37.77584116684587, + "learning_rate": 3.1468531468531473e-07, + "loss": 1.739, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 33.567385322447045, + "learning_rate": 3.496503496503497e-07, + "loss": 1.823, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 79.25205422236616, + "learning_rate": 3.846153846153847e-07, + "loss": 1.8705, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 29.01030979777457, + "learning_rate": 4.195804195804196e-07, + "loss": 1.8595, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 32.29577345941253, + "learning_rate": 4.5454545454545457e-07, + "loss": 1.7776, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 75.68576026951285, + "learning_rate": 4.895104895104896e-07, + "loss": 1.7253, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 44.31708263717978, + "learning_rate": 5.244755244755246e-07, + "loss": 1.6779, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 219.5878152798679, + "learning_rate": 5.594405594405595e-07, + "loss": 1.702, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 33.33211308321619, + "learning_rate": 5.944055944055945e-07, + "loss": 1.5494, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 28.77943791137145, + "learning_rate": 6.293706293706295e-07, + "loss": 1.5089, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 60.96425095593503, + "learning_rate": 6.643356643356644e-07, + "loss": 1.4609, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 21.119045219567745, + "learning_rate": 6.993006993006994e-07, + "loss": 1.4632, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 30.087479668348955, + "learning_rate": 7.342657342657343e-07, + "loss": 1.5663, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 22.134522059806123, + "learning_rate": 7.692307692307694e-07, + "loss": 1.4837, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 29.13387995825167, + "learning_rate": 8.041958041958043e-07, + "loss": 1.4038, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 8.488083114385395, + "learning_rate": 8.391608391608393e-07, + "loss": 1.2151, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 10.410898426509407, + "learning_rate": 8.741258741258741e-07, + "loss": 1.2384, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 7.196734276367997, + "learning_rate": 9.090909090909091e-07, + "loss": 1.2946, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 5.855727873849648, + "learning_rate": 9.44055944055944e-07, + "loss": 1.3565, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 9.775814502806083, + "learning_rate": 9.790209790209791e-07, + "loss": 1.2246, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 4.850328552766668, + "learning_rate": 1.013986013986014e-06, + "loss": 0.8368, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 5.46226966648362, + "learning_rate": 1.0489510489510491e-06, + "loss": 0.8374, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 4.878572158964688, + "learning_rate": 1.083916083916084e-06, + "loss": 1.1998, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 7.823282919023532, + "learning_rate": 1.118881118881119e-06, + "loss": 1.3195, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 10.031172311822704, + "learning_rate": 1.153846153846154e-06, + "loss": 1.1547, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 5.192629192579879, + "learning_rate": 1.188811188811189e-06, + "loss": 1.1146, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 7.2298322921309905, + "learning_rate": 1.2237762237762238e-06, + "loss": 1.0787, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 6.136735379037229, + "learning_rate": 1.258741258741259e-06, + "loss": 1.1542, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 5.0826929997956345, + "learning_rate": 1.2937062937062938e-06, + "loss": 1.1515, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 9.642838223265395, + "learning_rate": 1.3286713286713287e-06, + "loss": 1.0195, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 21.361559216190493, + "learning_rate": 1.3636363636363636e-06, + "loss": 1.099, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 4.808653298973119, + "learning_rate": 1.3986013986013987e-06, + "loss": 1.0705, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 12.37717034792203, + "learning_rate": 1.4335664335664336e-06, + "loss": 1.0939, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 6.615327284739696, + "learning_rate": 1.4685314685314685e-06, + "loss": 1.0464, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 4.179391150292978, + "learning_rate": 1.5034965034965034e-06, + "loss": 1.0878, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 20.006567323936093, + "learning_rate": 1.5384615384615387e-06, + "loss": 1.1121, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 5.213316956884923, + "learning_rate": 1.5734265734265736e-06, + "loss": 1.0629, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 4.22220233163102, + "learning_rate": 1.6083916083916085e-06, + "loss": 1.1175, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 3.945556190990595, + "learning_rate": 1.6433566433566434e-06, + "loss": 1.0423, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 3.6470904938849054, + "learning_rate": 1.6783216783216785e-06, + "loss": 1.0511, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 4.446567502839809, + "learning_rate": 1.7132867132867134e-06, + "loss": 1.0117, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 6.4133295800026815, + "learning_rate": 1.7482517482517483e-06, + "loss": 1.0825, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 9.263781826142708, + "learning_rate": 1.7832167832167834e-06, + "loss": 1.0928, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 3.3410522050072218, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.9511, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 5.078099442521805, + "learning_rate": 1.8531468531468532e-06, + "loss": 1.0446, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 3.7783601770563657, + "learning_rate": 1.888111888111888e-06, + "loss": 0.9961, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 5.3245107534214275, + "learning_rate": 1.9230769230769234e-06, + "loss": 1.002, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 4.550873551909127, + "learning_rate": 1.9580419580419583e-06, + "loss": 1.0045, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 5.298300519654384, + "learning_rate": 1.993006993006993e-06, + "loss": 0.9899, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 5.636808401697184, + "learning_rate": 2.027972027972028e-06, + "loss": 1.0293, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 4.989845861785016, + "learning_rate": 2.0629370629370634e-06, + "loss": 0.9782, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 4.036151597648198, + "learning_rate": 2.0979020979020983e-06, + "loss": 0.9476, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 4.748491625435901, + "learning_rate": 2.132867132867133e-06, + "loss": 1.0607, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 4.981488543652494, + "learning_rate": 2.167832167832168e-06, + "loss": 1.0103, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 2.747396934973974, + "learning_rate": 2.202797202797203e-06, + "loss": 0.9394, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 6.959028225064991, + "learning_rate": 2.237762237762238e-06, + "loss": 1.0223, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 4.1572731448374745, + "learning_rate": 2.2727272727272728e-06, + "loss": 0.8474, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 4.264763279942374, + "learning_rate": 2.307692307692308e-06, + "loss": 1.0108, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 6.095523926263317, + "learning_rate": 2.342657342657343e-06, + "loss": 0.9966, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 2.0378141611911293, + "learning_rate": 2.377622377622378e-06, + "loss": 0.6418, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 12.350034886280204, + "learning_rate": 2.4125874125874128e-06, + "loss": 0.9833, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 4.400350695457104, + "learning_rate": 2.4475524475524477e-06, + "loss": 1.014, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 8.539091480211638, + "learning_rate": 2.4825174825174825e-06, + "loss": 0.9588, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 2.76197207482864, + "learning_rate": 2.517482517482518e-06, + "loss": 1.0505, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 4.105444561403831, + "learning_rate": 2.5524475524475528e-06, + "loss": 1.0268, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 4.808311827182721, + "learning_rate": 2.5874125874125877e-06, + "loss": 0.9266, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 3.3062924637115896, + "learning_rate": 2.6223776223776225e-06, + "loss": 0.9497, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 5.359465673010584, + "learning_rate": 2.6573426573426574e-06, + "loss": 1.0061, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 3.5894653414802122, + "learning_rate": 2.6923076923076923e-06, + "loss": 0.9365, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 5.6964431945565215, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.9945, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 3.554710415267673, + "learning_rate": 2.762237762237762e-06, + "loss": 0.909, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 3.6224966193912933, + "learning_rate": 2.7972027972027974e-06, + "loss": 0.9841, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 4.100484077566709, + "learning_rate": 2.8321678321678323e-06, + "loss": 0.9785, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 2.7434549809880306, + "learning_rate": 2.8671328671328672e-06, + "loss": 0.9091, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 2.910473094338652, + "learning_rate": 2.902097902097902e-06, + "loss": 0.9128, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 4.681590491267232, + "learning_rate": 2.937062937062937e-06, + "loss": 0.9723, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 3.8481519631092667, + "learning_rate": 2.972027972027972e-06, + "loss": 0.9126, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 3.061017336429524, + "learning_rate": 3.006993006993007e-06, + "loss": 0.949, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 3.1054284861421526, + "learning_rate": 3.0419580419580425e-06, + "loss": 0.954, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 5.216222090801437, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.999, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 9.344055127943243, + "learning_rate": 3.1118881118881123e-06, + "loss": 0.8865, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 3.916337109234207, + "learning_rate": 3.1468531468531472e-06, + "loss": 0.9985, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 5.646273915281262, + "learning_rate": 3.181818181818182e-06, + "loss": 0.9368, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 3.2528928171773566, + "learning_rate": 3.216783216783217e-06, + "loss": 0.8946, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 3.670080997940339, + "learning_rate": 3.251748251748252e-06, + "loss": 0.883, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 4.3498888254458885, + "learning_rate": 3.286713286713287e-06, + "loss": 0.8334, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 3.4044098902580067, + "learning_rate": 3.321678321678322e-06, + "loss": 0.8982, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 5.964267535512436, + "learning_rate": 3.356643356643357e-06, + "loss": 0.901, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 7.2547749034420175, + "learning_rate": 3.391608391608392e-06, + "loss": 0.888, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 3.6445555952684594, + "learning_rate": 3.426573426573427e-06, + "loss": 0.9357, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 3.6183641170544676, + "learning_rate": 3.4615384615384617e-06, + "loss": 0.8898, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 3.683704401531074, + "learning_rate": 3.4965034965034966e-06, + "loss": 0.9716, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 4.642503279279386, + "learning_rate": 3.5314685314685315e-06, + "loss": 0.8517, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 3.5861147598949064, + "learning_rate": 3.566433566433567e-06, + "loss": 0.8891, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 6.15820179982336, + "learning_rate": 3.6013986013986017e-06, + "loss": 0.9273, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 3.0966116013351526, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.8954, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 4.590992779608164, + "learning_rate": 3.6713286713286715e-06, + "loss": 0.8935, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 6.257062049913251, + "learning_rate": 3.7062937062937064e-06, + "loss": 0.7903, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 3.1160354318045598, + "learning_rate": 3.7412587412587413e-06, + "loss": 0.9888, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 3.557620314215135, + "learning_rate": 3.776223776223776e-06, + "loss": 0.8218, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 3.690586014565839, + "learning_rate": 3.811188811188811e-06, + "loss": 0.9652, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 4.649988211859253, + "learning_rate": 3.846153846153847e-06, + "loss": 0.874, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 3.4358159594855593, + "learning_rate": 3.881118881118881e-06, + "loss": 0.8646, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 4.275525174285379, + "learning_rate": 3.916083916083917e-06, + "loss": 0.8884, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 4.163330070858521, + "learning_rate": 3.951048951048951e-06, + "loss": 0.8856, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 2.0432246966048093, + "learning_rate": 3.986013986013986e-06, + "loss": 0.6153, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 3.6243328469121807, + "learning_rate": 4.020979020979021e-06, + "loss": 0.9266, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 3.7038828259874026, + "learning_rate": 4.055944055944056e-06, + "loss": 0.8613, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 6.587444425327127, + "learning_rate": 4.0909090909090915e-06, + "loss": 0.8677, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 2.8022237219414, + "learning_rate": 4.125874125874127e-06, + "loss": 0.9397, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 4.591322074020894, + "learning_rate": 4.160839160839161e-06, + "loss": 0.8731, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 4.273093982425321, + "learning_rate": 4.195804195804197e-06, + "loss": 0.8526, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 4.197128473907411, + "learning_rate": 4.230769230769231e-06, + "loss": 0.8944, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 3.223590603807003, + "learning_rate": 4.265734265734266e-06, + "loss": 0.8939, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 3.7474696753810433, + "learning_rate": 4.300699300699301e-06, + "loss": 0.893, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 7.509667764665236, + "learning_rate": 4.335664335664336e-06, + "loss": 0.8958, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 3.4746005923931715, + "learning_rate": 4.3706293706293715e-06, + "loss": 0.9511, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 3.8752998189935024, + "learning_rate": 4.405594405594406e-06, + "loss": 0.9831, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 2.6750347835184956, + "learning_rate": 4.440559440559441e-06, + "loss": 0.8389, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 3.059225334055095, + "learning_rate": 4.475524475524476e-06, + "loss": 0.9191, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 2.8492139079466123, + "learning_rate": 4.510489510489511e-06, + "loss": 0.9256, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 3.466866349881368, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.9199, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 2.5173267002299946, + "learning_rate": 4.580419580419581e-06, + "loss": 0.8514, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 3.6745126525258236, + "learning_rate": 4.615384615384616e-06, + "loss": 0.9873, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 2.9338742337892234, + "learning_rate": 4.650349650349651e-06, + "loss": 0.8569, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 9.843844549233959, + "learning_rate": 4.685314685314686e-06, + "loss": 0.9136, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 2.624661321804515, + "learning_rate": 4.72027972027972e-06, + "loss": 0.9501, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 3.175344406567936, + "learning_rate": 4.755244755244756e-06, + "loss": 0.7571, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 2.9193507761169832, + "learning_rate": 4.79020979020979e-06, + "loss": 0.8804, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 3.136679694610541, + "learning_rate": 4.8251748251748255e-06, + "loss": 0.8397, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 4.002722016275543, + "learning_rate": 4.86013986013986e-06, + "loss": 0.7947, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 4.059401974397323, + "learning_rate": 4.895104895104895e-06, + "loss": 0.9127, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 6.278913674680803, + "learning_rate": 4.930069930069931e-06, + "loss": 0.88, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 4.830877808380784, + "learning_rate": 4.965034965034965e-06, + "loss": 0.9, + "step": 142 + }, + { + "epoch": 0.02, + "grad_norm": 3.983823394564358, + "learning_rate": 5e-06, + "loss": 0.9034, + "step": 143 + }, + { + "epoch": 0.02, + "grad_norm": 5.804644998821451, + "learning_rate": 5.034965034965036e-06, + "loss": 0.7911, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 4.991083677118832, + "learning_rate": 5.06993006993007e-06, + "loss": 0.9021, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 5.151416376092996, + "learning_rate": 5.1048951048951055e-06, + "loss": 0.9136, + "step": 146 + }, + { + "epoch": 0.02, + "grad_norm": 3.2803307360013227, + "learning_rate": 5.13986013986014e-06, + "loss": 0.862, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 3.2255731360831343, + "learning_rate": 5.174825174825175e-06, + "loss": 0.8045, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 10.688893991857855, + "learning_rate": 5.20979020979021e-06, + "loss": 0.8606, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 3.0166064964847963, + "learning_rate": 5.244755244755245e-06, + "loss": 0.8728, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 5.697806716640856, + "learning_rate": 5.27972027972028e-06, + "loss": 0.8277, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 3.9797528895264196, + "learning_rate": 5.314685314685315e-06, + "loss": 0.8832, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 3.5741820214637494, + "learning_rate": 5.34965034965035e-06, + "loss": 0.7884, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 3.904019428236043, + "learning_rate": 5.384615384615385e-06, + "loss": 0.8818, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 5.113543577877246, + "learning_rate": 5.41958041958042e-06, + "loss": 0.8515, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 3.916916159534253, + "learning_rate": 5.4545454545454545e-06, + "loss": 0.883, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 3.026332966896275, + "learning_rate": 5.48951048951049e-06, + "loss": 0.8202, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 4.2747535576313895, + "learning_rate": 5.524475524475524e-06, + "loss": 0.8201, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 4.915673342738748, + "learning_rate": 5.5594405594405596e-06, + "loss": 0.8054, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 1.8868567094503492, + "learning_rate": 5.594405594405595e-06, + "loss": 0.5897, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 4.031567311252644, + "learning_rate": 5.629370629370629e-06, + "loss": 0.9305, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 5.750704520517035, + "learning_rate": 5.664335664335665e-06, + "loss": 0.8506, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 3.9833189282740107, + "learning_rate": 5.699300699300699e-06, + "loss": 0.7708, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 8.354822607813489, + "learning_rate": 5.7342657342657345e-06, + "loss": 0.8819, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 3.130090251045954, + "learning_rate": 5.769230769230769e-06, + "loss": 0.769, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 5.0268037848757725, + "learning_rate": 5.804195804195804e-06, + "loss": 0.8636, + "step": 166 + }, + { + "epoch": 0.02, + "grad_norm": 4.590445411100409, + "learning_rate": 5.83916083916084e-06, + "loss": 0.9014, + "step": 167 + }, + { + "epoch": 0.02, + "grad_norm": 3.9573676244626235, + "learning_rate": 5.874125874125874e-06, + "loss": 0.9767, + "step": 168 + }, + { + "epoch": 0.02, + "grad_norm": 3.7976970282517706, + "learning_rate": 5.90909090909091e-06, + "loss": 0.7677, + "step": 169 + }, + { + "epoch": 0.02, + "grad_norm": 3.20200552669586, + "learning_rate": 5.944055944055944e-06, + "loss": 0.8744, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 5.403103369719815, + "learning_rate": 5.97902097902098e-06, + "loss": 0.7684, + "step": 171 + }, + { + "epoch": 0.02, + "grad_norm": 5.688672125161367, + "learning_rate": 6.013986013986014e-06, + "loss": 0.828, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 3.300203165530322, + "learning_rate": 6.04895104895105e-06, + "loss": 0.8845, + "step": 173 + }, + { + "epoch": 0.02, + "grad_norm": 2.9549829165853074, + "learning_rate": 6.083916083916085e-06, + "loss": 0.8396, + "step": 174 + }, + { + "epoch": 0.02, + "grad_norm": 3.137866556517222, + "learning_rate": 6.1188811188811196e-06, + "loss": 0.8187, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 4.821237941276991, + "learning_rate": 6.153846153846155e-06, + "loss": 0.8752, + "step": 176 + }, + { + "epoch": 0.02, + "grad_norm": 3.1343880638693324, + "learning_rate": 6.188811188811189e-06, + "loss": 0.8322, + "step": 177 + }, + { + "epoch": 0.02, + "grad_norm": 3.187172327993686, + "learning_rate": 6.223776223776225e-06, + "loss": 0.7712, + "step": 178 + }, + { + "epoch": 0.02, + "grad_norm": 3.822050694898768, + "learning_rate": 6.258741258741259e-06, + "loss": 0.8855, + "step": 179 + }, + { + "epoch": 0.02, + "grad_norm": 4.567034848940679, + "learning_rate": 6.2937062937062944e-06, + "loss": 0.8808, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 3.878919913487739, + "learning_rate": 6.32867132867133e-06, + "loss": 0.816, + "step": 181 + }, + { + "epoch": 0.02, + "grad_norm": 3.235297169720441, + "learning_rate": 6.363636363636364e-06, + "loss": 0.7188, + "step": 182 + }, + { + "epoch": 0.02, + "grad_norm": 3.930819892851912, + "learning_rate": 6.3986013986013996e-06, + "loss": 0.9284, + "step": 183 + }, + { + "epoch": 0.02, + "grad_norm": 3.670121712591447, + "learning_rate": 6.433566433566434e-06, + "loss": 0.7796, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 2.6886770215888207, + "learning_rate": 6.468531468531469e-06, + "loss": 0.8803, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 3.5148986203471853, + "learning_rate": 6.503496503496504e-06, + "loss": 0.8603, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 8.671074065374444, + "learning_rate": 6.538461538461539e-06, + "loss": 0.8593, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 4.011761895974061, + "learning_rate": 6.573426573426574e-06, + "loss": 0.8902, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 2.48637995228731, + "learning_rate": 6.608391608391609e-06, + "loss": 0.7607, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 3.3132599184181504, + "learning_rate": 6.643356643356644e-06, + "loss": 0.8014, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 3.055806698354549, + "learning_rate": 6.678321678321679e-06, + "loss": 0.9812, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 3.2543072248631977, + "learning_rate": 6.713286713286714e-06, + "loss": 0.8687, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 3.040625426096151, + "learning_rate": 6.7482517482517485e-06, + "loss": 0.7797, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 3.215620104060743, + "learning_rate": 6.783216783216784e-06, + "loss": 0.8274, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 2.471180655628137, + "learning_rate": 6.818181818181818e-06, + "loss": 0.6187, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 4.359167344334515, + "learning_rate": 6.853146853146854e-06, + "loss": 0.8114, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 4.290199760313723, + "learning_rate": 6.888111888111889e-06, + "loss": 0.8188, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 5.885960883267336, + "learning_rate": 6.923076923076923e-06, + "loss": 0.8622, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 2.63144492234704, + "learning_rate": 6.958041958041959e-06, + "loss": 0.9063, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 3.3612366109316167, + "learning_rate": 6.993006993006993e-06, + "loss": 0.862, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 4.586306665535676, + "learning_rate": 7.0279720279720285e-06, + "loss": 0.8577, + "step": 201 + }, + { + "epoch": 0.02, + "grad_norm": 3.2176272960473553, + "learning_rate": 7.062937062937063e-06, + "loss": 0.8225, + "step": 202 + }, + { + "epoch": 0.02, + "grad_norm": 3.416044727353836, + "learning_rate": 7.097902097902098e-06, + "loss": 0.8681, + "step": 203 + }, + { + "epoch": 0.02, + "grad_norm": 5.4811630787160945, + "learning_rate": 7.132867132867134e-06, + "loss": 0.8767, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 3.567725865125878, + "learning_rate": 7.167832167832168e-06, + "loss": 0.8528, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 3.2885446735714443, + "learning_rate": 7.202797202797203e-06, + "loss": 0.756, + "step": 206 + }, + { + "epoch": 0.02, + "grad_norm": 4.884812749781261, + "learning_rate": 7.237762237762238e-06, + "loss": 0.8742, + "step": 207 + }, + { + "epoch": 0.02, + "grad_norm": 3.2827496038865385, + "learning_rate": 7.272727272727273e-06, + "loss": 0.854, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 3.8769620411554135, + "learning_rate": 7.307692307692308e-06, + "loss": 0.9437, + "step": 209 + }, + { + "epoch": 0.02, + "grad_norm": 3.152946273455516, + "learning_rate": 7.342657342657343e-06, + "loss": 0.7662, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 2.7643870516857074, + "learning_rate": 7.377622377622379e-06, + "loss": 0.9363, + "step": 211 + }, + { + "epoch": 0.02, + "grad_norm": 3.519504752467641, + "learning_rate": 7.412587412587413e-06, + "loss": 0.8056, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 1.7919893275590533, + "learning_rate": 7.447552447552449e-06, + "loss": 0.6353, + "step": 213 + }, + { + "epoch": 0.02, + "grad_norm": 2.9052671409624167, + "learning_rate": 7.4825174825174825e-06, + "loss": 0.8335, + "step": 214 + }, + { + "epoch": 0.02, + "grad_norm": 3.479591408911271, + "learning_rate": 7.517482517482519e-06, + "loss": 0.8493, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 4.377375722973853, + "learning_rate": 7.552447552447552e-06, + "loss": 0.7782, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 3.2542755743953102, + "learning_rate": 7.5874125874125885e-06, + "loss": 0.8053, + "step": 217 + }, + { + "epoch": 0.02, + "grad_norm": 3.2686387521722158, + "learning_rate": 7.622377622377622e-06, + "loss": 0.8326, + "step": 218 + }, + { + "epoch": 0.02, + "grad_norm": 8.023160327593423, + "learning_rate": 7.657342657342658e-06, + "loss": 0.8383, + "step": 219 + }, + { + "epoch": 0.02, + "grad_norm": 3.0319106359639614, + "learning_rate": 7.692307692307694e-06, + "loss": 0.873, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 3.857091847979987, + "learning_rate": 7.727272727272727e-06, + "loss": 0.8656, + "step": 221 + }, + { + "epoch": 0.02, + "grad_norm": 3.137600063788741, + "learning_rate": 7.762237762237763e-06, + "loss": 0.7638, + "step": 222 + }, + { + "epoch": 0.02, + "grad_norm": 4.911586562313934, + "learning_rate": 7.797202797202798e-06, + "loss": 0.7693, + "step": 223 + }, + { + "epoch": 0.02, + "grad_norm": 2.9977640714571807, + "learning_rate": 7.832167832167833e-06, + "loss": 0.8815, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 2.6947791098337848, + "learning_rate": 7.867132867132867e-06, + "loss": 0.8508, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 2.70865229471086, + "learning_rate": 7.902097902097902e-06, + "loss": 0.8039, + "step": 226 + }, + { + "epoch": 0.02, + "grad_norm": 3.9607344558352198, + "learning_rate": 7.937062937062937e-06, + "loss": 0.8913, + "step": 227 + }, + { + "epoch": 0.02, + "grad_norm": 2.7336519000248947, + "learning_rate": 7.972027972027973e-06, + "loss": 0.855, + "step": 228 + }, + { + "epoch": 0.02, + "grad_norm": 3.197027691020523, + "learning_rate": 8.006993006993008e-06, + "loss": 0.8883, + "step": 229 + }, + { + "epoch": 0.02, + "grad_norm": 2.8607529888322873, + "learning_rate": 8.041958041958042e-06, + "loss": 0.8003, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 6.811064736453866, + "learning_rate": 8.076923076923077e-06, + "loss": 0.9102, + "step": 231 + }, + { + "epoch": 0.02, + "grad_norm": 1.9400768388006893, + "learning_rate": 8.111888111888112e-06, + "loss": 0.6545, + "step": 232 + }, + { + "epoch": 0.02, + "grad_norm": 3.1601907658724624, + "learning_rate": 8.146853146853148e-06, + "loss": 0.8757, + "step": 233 + }, + { + "epoch": 0.02, + "grad_norm": 10.646638408245371, + "learning_rate": 8.181818181818183e-06, + "loss": 0.8677, + "step": 234 + }, + { + "epoch": 0.02, + "grad_norm": 5.665181271364294, + "learning_rate": 8.216783216783217e-06, + "loss": 0.7499, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 4.519763391278135, + "learning_rate": 8.251748251748254e-06, + "loss": 0.8092, + "step": 236 + }, + { + "epoch": 0.02, + "grad_norm": 4.799514568767844, + "learning_rate": 8.286713286713287e-06, + "loss": 0.8237, + "step": 237 + }, + { + "epoch": 0.03, + "grad_norm": 4.3167393880149625, + "learning_rate": 8.321678321678323e-06, + "loss": 0.7958, + "step": 238 + }, + { + "epoch": 0.03, + "grad_norm": 4.207750765393011, + "learning_rate": 8.356643356643356e-06, + "loss": 0.9301, + "step": 239 + }, + { + "epoch": 0.03, + "grad_norm": 3.0242485006730004, + "learning_rate": 8.391608391608393e-06, + "loss": 0.8689, + "step": 240 + }, + { + "epoch": 0.03, + "grad_norm": 4.061479269990459, + "learning_rate": 8.426573426573428e-06, + "loss": 0.8528, + "step": 241 + }, + { + "epoch": 0.03, + "grad_norm": 3.1543956915127973, + "learning_rate": 8.461538461538462e-06, + "loss": 0.8714, + "step": 242 + }, + { + "epoch": 0.03, + "grad_norm": 5.237885935376305, + "learning_rate": 8.496503496503497e-06, + "loss": 0.7606, + "step": 243 + }, + { + "epoch": 0.03, + "grad_norm": 1.950280179321886, + "learning_rate": 8.531468531468533e-06, + "loss": 0.6335, + "step": 244 + }, + { + "epoch": 0.03, + "grad_norm": 1.5896370335044245, + "learning_rate": 8.566433566433568e-06, + "loss": 0.6082, + "step": 245 + }, + { + "epoch": 0.03, + "grad_norm": 11.404835463536644, + "learning_rate": 8.601398601398602e-06, + "loss": 0.8319, + "step": 246 + }, + { + "epoch": 0.03, + "grad_norm": 3.238800843683949, + "learning_rate": 8.636363636363637e-06, + "loss": 0.7277, + "step": 247 + }, + { + "epoch": 0.03, + "grad_norm": 4.816682891374414, + "learning_rate": 8.671328671328672e-06, + "loss": 0.8987, + "step": 248 + }, + { + "epoch": 0.03, + "grad_norm": 2.937171146872307, + "learning_rate": 8.706293706293708e-06, + "loss": 0.8465, + "step": 249 + }, + { + "epoch": 0.03, + "grad_norm": 2.9152180387496465, + "learning_rate": 8.741258741258743e-06, + "loss": 0.8588, + "step": 250 + }, + { + "epoch": 0.03, + "grad_norm": 3.2233086259580013, + "learning_rate": 8.776223776223777e-06, + "loss": 0.7794, + "step": 251 + }, + { + "epoch": 0.03, + "grad_norm": 2.2240451951651212, + "learning_rate": 8.811188811188812e-06, + "loss": 0.6543, + "step": 252 + }, + { + "epoch": 0.03, + "grad_norm": 3.9198696085904783, + "learning_rate": 8.846153846153847e-06, + "loss": 0.8946, + "step": 253 + }, + { + "epoch": 0.03, + "grad_norm": 3.0548127678087447, + "learning_rate": 8.881118881118883e-06, + "loss": 0.8461, + "step": 254 + }, + { + "epoch": 0.03, + "grad_norm": 3.0034454684442564, + "learning_rate": 8.916083916083916e-06, + "loss": 0.836, + "step": 255 + }, + { + "epoch": 0.03, + "grad_norm": 4.356298512009719, + "learning_rate": 8.951048951048951e-06, + "loss": 0.8044, + "step": 256 + }, + { + "epoch": 0.03, + "grad_norm": 3.7719381130534395, + "learning_rate": 8.986013986013987e-06, + "loss": 0.8598, + "step": 257 + }, + { + "epoch": 0.03, + "grad_norm": 2.961248303513742, + "learning_rate": 9.020979020979022e-06, + "loss": 0.8625, + "step": 258 + }, + { + "epoch": 0.03, + "grad_norm": 2.494139526881577, + "learning_rate": 9.055944055944057e-06, + "loss": 0.8373, + "step": 259 + }, + { + "epoch": 0.03, + "grad_norm": 11.62343900060472, + "learning_rate": 9.090909090909091e-06, + "loss": 0.8142, + "step": 260 + }, + { + "epoch": 0.03, + "grad_norm": 2.976610214894441, + "learning_rate": 9.125874125874126e-06, + "loss": 0.8383, + "step": 261 + }, + { + "epoch": 0.03, + "grad_norm": 3.152334899926377, + "learning_rate": 9.160839160839162e-06, + "loss": 0.831, + "step": 262 + }, + { + "epoch": 0.03, + "grad_norm": 2.943305109872976, + "learning_rate": 9.195804195804197e-06, + "loss": 0.7919, + "step": 263 + }, + { + "epoch": 0.03, + "grad_norm": 1.6262854108938019, + "learning_rate": 9.230769230769232e-06, + "loss": 0.6016, + "step": 264 + }, + { + "epoch": 0.03, + "grad_norm": 2.9820347523442305, + "learning_rate": 9.265734265734266e-06, + "loss": 0.842, + "step": 265 + }, + { + "epoch": 0.03, + "grad_norm": 2.965150403672179, + "learning_rate": 9.300699300699301e-06, + "loss": 0.8197, + "step": 266 + }, + { + "epoch": 0.03, + "grad_norm": 4.015217382657136, + "learning_rate": 9.335664335664337e-06, + "loss": 0.7963, + "step": 267 + }, + { + "epoch": 0.03, + "grad_norm": 3.5798922752668783, + "learning_rate": 9.370629370629372e-06, + "loss": 0.8153, + "step": 268 + }, + { + "epoch": 0.03, + "grad_norm": 3.150189471491218, + "learning_rate": 9.405594405594406e-06, + "loss": 0.7447, + "step": 269 + }, + { + "epoch": 0.03, + "grad_norm": 2.8869071132238133, + "learning_rate": 9.44055944055944e-06, + "loss": 0.8291, + "step": 270 + }, + { + "epoch": 0.03, + "grad_norm": 5.031692940937883, + "learning_rate": 9.475524475524476e-06, + "loss": 0.7182, + "step": 271 + }, + { + "epoch": 0.03, + "grad_norm": 3.3293803718839365, + "learning_rate": 9.510489510489511e-06, + "loss": 0.7867, + "step": 272 + }, + { + "epoch": 0.03, + "grad_norm": 3.411178547420612, + "learning_rate": 9.545454545454547e-06, + "loss": 0.849, + "step": 273 + }, + { + "epoch": 0.03, + "grad_norm": 1.8859064509058623, + "learning_rate": 9.58041958041958e-06, + "loss": 0.5875, + "step": 274 + }, + { + "epoch": 0.03, + "grad_norm": 3.9199928735928666, + "learning_rate": 9.615384615384616e-06, + "loss": 0.8625, + "step": 275 + }, + { + "epoch": 0.03, + "grad_norm": 3.4204022429701837, + "learning_rate": 9.650349650349651e-06, + "loss": 0.787, + "step": 276 + }, + { + "epoch": 0.03, + "grad_norm": 3.280881830881753, + "learning_rate": 9.685314685314686e-06, + "loss": 0.8484, + "step": 277 + }, + { + "epoch": 0.03, + "grad_norm": 3.9903254168619435, + "learning_rate": 9.72027972027972e-06, + "loss": 0.8444, + "step": 278 + }, + { + "epoch": 0.03, + "grad_norm": 3.2907073267044566, + "learning_rate": 9.755244755244755e-06, + "loss": 0.7562, + "step": 279 + }, + { + "epoch": 0.03, + "grad_norm": 3.454956915827184, + "learning_rate": 9.79020979020979e-06, + "loss": 0.7811, + "step": 280 + }, + { + "epoch": 0.03, + "grad_norm": 4.564600921480734, + "learning_rate": 9.825174825174826e-06, + "loss": 0.7783, + "step": 281 + }, + { + "epoch": 0.03, + "grad_norm": 3.0241945476082597, + "learning_rate": 9.860139860139861e-06, + "loss": 0.8345, + "step": 282 + }, + { + "epoch": 0.03, + "grad_norm": 3.548190169203221, + "learning_rate": 9.895104895104895e-06, + "loss": 0.7788, + "step": 283 + }, + { + "epoch": 0.03, + "grad_norm": 4.732400299622023, + "learning_rate": 9.93006993006993e-06, + "loss": 0.7922, + "step": 284 + }, + { + "epoch": 0.03, + "grad_norm": 2.878974528064686, + "learning_rate": 9.965034965034966e-06, + "loss": 0.8064, + "step": 285 + }, + { + "epoch": 0.03, + "grad_norm": 4.264526689698259, + "learning_rate": 1e-05, + "loss": 0.8588, + "step": 286 + }, + { + "epoch": 0.03, + "grad_norm": 3.20198908359177, + "learning_rate": 9.999999709557228e-06, + "loss": 0.8367, + "step": 287 + }, + { + "epoch": 0.03, + "grad_norm": 20.715771128490395, + "learning_rate": 9.999998838228941e-06, + "loss": 0.7669, + "step": 288 + }, + { + "epoch": 0.03, + "grad_norm": 3.3540036271091456, + "learning_rate": 9.999997386015244e-06, + "loss": 0.863, + "step": 289 + }, + { + "epoch": 0.03, + "grad_norm": 3.1848307160272125, + "learning_rate": 9.999995352916303e-06, + "loss": 0.7439, + "step": 290 + }, + { + "epoch": 0.03, + "grad_norm": 3.152572478387918, + "learning_rate": 9.999992738932357e-06, + "loss": 0.822, + "step": 291 + }, + { + "epoch": 0.03, + "grad_norm": 3.5369549798522573, + "learning_rate": 9.999989544063708e-06, + "loss": 0.7687, + "step": 292 + }, + { + "epoch": 0.03, + "grad_norm": 1.5501867277358583, + "learning_rate": 9.999985768310726e-06, + "loss": 0.6133, + "step": 293 + }, + { + "epoch": 0.03, + "grad_norm": 3.319103455407901, + "learning_rate": 9.99998141167385e-06, + "loss": 0.7729, + "step": 294 + }, + { + "epoch": 0.03, + "grad_norm": 3.687148563168577, + "learning_rate": 9.999976474153589e-06, + "loss": 0.8048, + "step": 295 + }, + { + "epoch": 0.03, + "grad_norm": 2.984437946192857, + "learning_rate": 9.999970955750516e-06, + "loss": 0.7737, + "step": 296 + }, + { + "epoch": 0.03, + "grad_norm": 4.5653321300359915, + "learning_rate": 9.999964856465268e-06, + "loss": 0.7832, + "step": 297 + }, + { + "epoch": 0.03, + "grad_norm": 3.6017277018504, + "learning_rate": 9.999958176298559e-06, + "loss": 0.8802, + "step": 298 + }, + { + "epoch": 0.03, + "grad_norm": 3.4928074350110396, + "learning_rate": 9.99995091525116e-06, + "loss": 0.8067, + "step": 299 + }, + { + "epoch": 0.03, + "grad_norm": 2.7767296913858037, + "learning_rate": 9.999943073323919e-06, + "loss": 0.7946, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 3.560126265511432, + "learning_rate": 9.999934650517743e-06, + "loss": 0.7549, + "step": 301 + }, + { + "epoch": 0.03, + "grad_norm": 3.1779121129553207, + "learning_rate": 9.999925646833614e-06, + "loss": 0.8234, + "step": 302 + }, + { + "epoch": 0.03, + "grad_norm": 2.6269027998737364, + "learning_rate": 9.999916062272576e-06, + "loss": 0.747, + "step": 303 + }, + { + "epoch": 0.03, + "grad_norm": 3.0096315209518854, + "learning_rate": 9.999905896835745e-06, + "loss": 0.8391, + "step": 304 + }, + { + "epoch": 0.03, + "grad_norm": 3.6570640261588303, + "learning_rate": 9.999895150524297e-06, + "loss": 0.8514, + "step": 305 + }, + { + "epoch": 0.03, + "grad_norm": 2.7168969428743295, + "learning_rate": 9.999883823339487e-06, + "loss": 0.7491, + "step": 306 + }, + { + "epoch": 0.03, + "grad_norm": 2.929508346366845, + "learning_rate": 9.999871915282625e-06, + "loss": 0.8162, + "step": 307 + }, + { + "epoch": 0.03, + "grad_norm": 3.431569543759362, + "learning_rate": 9.999859426355098e-06, + "loss": 0.8174, + "step": 308 + }, + { + "epoch": 0.03, + "grad_norm": 3.6342519486169595, + "learning_rate": 9.999846356558356e-06, + "loss": 0.8412, + "step": 309 + }, + { + "epoch": 0.03, + "grad_norm": 2.773720032260141, + "learning_rate": 9.999832705893919e-06, + "loss": 0.8367, + "step": 310 + }, + { + "epoch": 0.03, + "grad_norm": 2.9812086033455607, + "learning_rate": 9.999818474363368e-06, + "loss": 0.8288, + "step": 311 + }, + { + "epoch": 0.03, + "grad_norm": 3.9790057425352723, + "learning_rate": 9.999803661968361e-06, + "loss": 0.8828, + "step": 312 + }, + { + "epoch": 0.03, + "grad_norm": 3.06724404084709, + "learning_rate": 9.999788268710619e-06, + "loss": 0.8277, + "step": 313 + }, + { + "epoch": 0.03, + "grad_norm": 3.020362580767787, + "learning_rate": 9.999772294591927e-06, + "loss": 0.7538, + "step": 314 + }, + { + "epoch": 0.03, + "grad_norm": 3.4612261352729305, + "learning_rate": 9.999755739614144e-06, + "loss": 0.7895, + "step": 315 + }, + { + "epoch": 0.03, + "grad_norm": 3.272614641342335, + "learning_rate": 9.999738603779192e-06, + "loss": 0.6866, + "step": 316 + }, + { + "epoch": 0.03, + "grad_norm": 1.7980881010106364, + "learning_rate": 9.999720887089062e-06, + "loss": 0.6429, + "step": 317 + }, + { + "epoch": 0.03, + "grad_norm": 3.3263471950749786, + "learning_rate": 9.99970258954581e-06, + "loss": 0.8188, + "step": 318 + }, + { + "epoch": 0.03, + "grad_norm": 3.0088862503780187, + "learning_rate": 9.999683711151565e-06, + "loss": 0.7195, + "step": 319 + }, + { + "epoch": 0.03, + "grad_norm": 2.87971268033177, + "learning_rate": 9.99966425190852e-06, + "loss": 0.8251, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 2.8366240584831233, + "learning_rate": 9.999644211818934e-06, + "loss": 0.8274, + "step": 321 + }, + { + "epoch": 0.03, + "grad_norm": 3.3752232102724986, + "learning_rate": 9.999623590885135e-06, + "loss": 0.7519, + "step": 322 + }, + { + "epoch": 0.03, + "grad_norm": 3.248073936944215, + "learning_rate": 9.999602389109521e-06, + "loss": 0.8161, + "step": 323 + }, + { + "epoch": 0.03, + "grad_norm": 2.7887278560705093, + "learning_rate": 9.999580606494554e-06, + "loss": 0.8526, + "step": 324 + }, + { + "epoch": 0.03, + "grad_norm": 2.6703840967870525, + "learning_rate": 9.999558243042763e-06, + "loss": 0.763, + "step": 325 + }, + { + "epoch": 0.03, + "grad_norm": 3.653691408190642, + "learning_rate": 9.999535298756749e-06, + "loss": 0.8728, + "step": 326 + }, + { + "epoch": 0.03, + "grad_norm": 2.6557511065861665, + "learning_rate": 9.999511773639177e-06, + "loss": 0.8372, + "step": 327 + }, + { + "epoch": 0.03, + "grad_norm": 2.9851651458699573, + "learning_rate": 9.999487667692778e-06, + "loss": 0.8749, + "step": 328 + }, + { + "epoch": 0.03, + "grad_norm": 3.556775744280704, + "learning_rate": 9.999462980920353e-06, + "loss": 0.9202, + "step": 329 + }, + { + "epoch": 0.03, + "grad_norm": 3.1780645043985722, + "learning_rate": 9.99943771332477e-06, + "loss": 0.7793, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 2.50054261428372, + "learning_rate": 9.999411864908967e-06, + "loss": 0.7445, + "step": 331 + }, + { + "epoch": 0.03, + "grad_norm": 8.416951605036898, + "learning_rate": 9.999385435675947e-06, + "loss": 0.7777, + "step": 332 + }, + { + "epoch": 0.04, + "grad_norm": 1.7297205360660906, + "learning_rate": 9.999358425628777e-06, + "loss": 0.6569, + "step": 333 + }, + { + "epoch": 0.04, + "grad_norm": 2.763919036592778, + "learning_rate": 9.999330834770598e-06, + "loss": 0.7999, + "step": 334 + }, + { + "epoch": 0.04, + "grad_norm": 3.3468910031579515, + "learning_rate": 9.999302663104611e-06, + "loss": 0.8996, + "step": 335 + }, + { + "epoch": 0.04, + "grad_norm": 2.52656655880427, + "learning_rate": 9.999273910634095e-06, + "loss": 0.8481, + "step": 336 + }, + { + "epoch": 0.04, + "grad_norm": 3.0210600507512853, + "learning_rate": 9.999244577362388e-06, + "loss": 0.7786, + "step": 337 + }, + { + "epoch": 0.04, + "grad_norm": 2.449552196852401, + "learning_rate": 9.999214663292896e-06, + "loss": 0.7566, + "step": 338 + }, + { + "epoch": 0.04, + "grad_norm": 3.5228157643635942, + "learning_rate": 9.999184168429095e-06, + "loss": 0.8273, + "step": 339 + }, + { + "epoch": 0.04, + "grad_norm": 2.8127747250358417, + "learning_rate": 9.99915309277453e-06, + "loss": 0.7628, + "step": 340 + }, + { + "epoch": 0.04, + "grad_norm": 2.9561302000674052, + "learning_rate": 9.999121436332809e-06, + "loss": 0.7372, + "step": 341 + }, + { + "epoch": 0.04, + "grad_norm": 2.635610731073348, + "learning_rate": 9.99908919910761e-06, + "loss": 0.8471, + "step": 342 + }, + { + "epoch": 0.04, + "grad_norm": 2.974938379443744, + "learning_rate": 9.99905638110268e-06, + "loss": 0.8192, + "step": 343 + }, + { + "epoch": 0.04, + "grad_norm": 3.7322849018695665, + "learning_rate": 9.99902298232183e-06, + "loss": 0.8051, + "step": 344 + }, + { + "epoch": 0.04, + "grad_norm": 2.989115971364437, + "learning_rate": 9.998989002768939e-06, + "loss": 0.7731, + "step": 345 + }, + { + "epoch": 0.04, + "grad_norm": 2.7573253684038, + "learning_rate": 9.99895444244796e-06, + "loss": 0.7253, + "step": 346 + }, + { + "epoch": 0.04, + "grad_norm": 3.0060806455923066, + "learning_rate": 9.998919301362902e-06, + "loss": 0.8549, + "step": 347 + }, + { + "epoch": 0.04, + "grad_norm": 2.813781196605346, + "learning_rate": 9.998883579517849e-06, + "loss": 0.8518, + "step": 348 + }, + { + "epoch": 0.04, + "grad_norm": 3.0126862479542558, + "learning_rate": 9.998847276916953e-06, + "loss": 0.7217, + "step": 349 + }, + { + "epoch": 0.04, + "grad_norm": 2.5842429131791644, + "learning_rate": 9.99881039356443e-06, + "loss": 0.8056, + "step": 350 + }, + { + "epoch": 0.04, + "grad_norm": 3.153516334642533, + "learning_rate": 9.998772929464567e-06, + "loss": 0.827, + "step": 351 + }, + { + "epoch": 0.04, + "grad_norm": 2.955277320770189, + "learning_rate": 9.998734884621714e-06, + "loss": 0.8264, + "step": 352 + }, + { + "epoch": 0.04, + "grad_norm": 2.3636272885829444, + "learning_rate": 9.998696259040292e-06, + "loss": 0.8436, + "step": 353 + }, + { + "epoch": 0.04, + "grad_norm": 2.6445985660522586, + "learning_rate": 9.99865705272479e-06, + "loss": 0.7755, + "step": 354 + }, + { + "epoch": 0.04, + "grad_norm": 3.237081162467692, + "learning_rate": 9.99861726567976e-06, + "loss": 0.7811, + "step": 355 + }, + { + "epoch": 0.04, + "grad_norm": 2.6981850184698, + "learning_rate": 9.998576897909826e-06, + "loss": 0.7442, + "step": 356 + }, + { + "epoch": 0.04, + "grad_norm": 3.1062173431704796, + "learning_rate": 9.998535949419676e-06, + "loss": 0.8203, + "step": 357 + }, + { + "epoch": 0.04, + "grad_norm": 3.337004723227758, + "learning_rate": 9.99849442021407e-06, + "loss": 0.8208, + "step": 358 + }, + { + "epoch": 0.04, + "grad_norm": 7.14701609390736, + "learning_rate": 9.99845231029783e-06, + "loss": 0.8006, + "step": 359 + }, + { + "epoch": 0.04, + "grad_norm": 2.19324654121165, + "learning_rate": 9.998409619675852e-06, + "loss": 0.8317, + "step": 360 + }, + { + "epoch": 0.04, + "grad_norm": 3.676166242806993, + "learning_rate": 9.998366348353092e-06, + "loss": 0.7146, + "step": 361 + }, + { + "epoch": 0.04, + "grad_norm": 1.7837495615392405, + "learning_rate": 9.998322496334579e-06, + "loss": 0.6307, + "step": 362 + }, + { + "epoch": 0.04, + "grad_norm": 2.6924764025176575, + "learning_rate": 9.998278063625407e-06, + "loss": 0.788, + "step": 363 + }, + { + "epoch": 0.04, + "grad_norm": 2.771669821577446, + "learning_rate": 9.998233050230737e-06, + "loss": 0.7719, + "step": 364 + }, + { + "epoch": 0.04, + "grad_norm": 2.7022314056519003, + "learning_rate": 9.9981874561558e-06, + "loss": 0.7276, + "step": 365 + }, + { + "epoch": 0.04, + "grad_norm": 3.4443745382241926, + "learning_rate": 9.998141281405892e-06, + "loss": 0.7159, + "step": 366 + }, + { + "epoch": 0.04, + "grad_norm": 2.638255448321009, + "learning_rate": 9.99809452598638e-06, + "loss": 0.8132, + "step": 367 + }, + { + "epoch": 0.04, + "grad_norm": 3.342150185068803, + "learning_rate": 9.998047189902693e-06, + "loss": 0.7607, + "step": 368 + }, + { + "epoch": 0.04, + "grad_norm": 2.415218070936139, + "learning_rate": 9.997999273160333e-06, + "loss": 0.7152, + "step": 369 + }, + { + "epoch": 0.04, + "grad_norm": 4.393978658986462, + "learning_rate": 9.997950775764862e-06, + "loss": 0.8176, + "step": 370 + }, + { + "epoch": 0.04, + "grad_norm": 2.590618917457282, + "learning_rate": 9.99790169772192e-06, + "loss": 0.7372, + "step": 371 + }, + { + "epoch": 0.04, + "grad_norm": 3.4930700657768323, + "learning_rate": 9.997852039037206e-06, + "loss": 0.823, + "step": 372 + }, + { + "epoch": 0.04, + "grad_norm": 2.487379499776839, + "learning_rate": 9.99780179971649e-06, + "loss": 0.6554, + "step": 373 + }, + { + "epoch": 0.04, + "grad_norm": 2.624919386906931, + "learning_rate": 9.997750979765606e-06, + "loss": 0.8252, + "step": 374 + }, + { + "epoch": 0.04, + "grad_norm": 2.6531319266118483, + "learning_rate": 9.997699579190462e-06, + "loss": 0.9001, + "step": 375 + }, + { + "epoch": 0.04, + "grad_norm": 2.769920692517521, + "learning_rate": 9.997647597997025e-06, + "loss": 0.8726, + "step": 376 + }, + { + "epoch": 0.04, + "grad_norm": 2.8611168209874145, + "learning_rate": 9.997595036191338e-06, + "loss": 0.8799, + "step": 377 + }, + { + "epoch": 0.04, + "grad_norm": 2.704800856083731, + "learning_rate": 9.997541893779507e-06, + "loss": 0.8497, + "step": 378 + }, + { + "epoch": 0.04, + "grad_norm": 2.4415710077044612, + "learning_rate": 9.997488170767706e-06, + "loss": 0.8527, + "step": 379 + }, + { + "epoch": 0.04, + "grad_norm": 3.050991550010565, + "learning_rate": 9.997433867162174e-06, + "loss": 0.7829, + "step": 380 + }, + { + "epoch": 0.04, + "grad_norm": 2.869557820847034, + "learning_rate": 9.997378982969223e-06, + "loss": 0.8413, + "step": 381 + }, + { + "epoch": 0.04, + "grad_norm": 2.5652433421128276, + "learning_rate": 9.997323518195227e-06, + "loss": 0.7852, + "step": 382 + }, + { + "epoch": 0.04, + "grad_norm": 3.0907376752843794, + "learning_rate": 9.99726747284663e-06, + "loss": 0.8476, + "step": 383 + }, + { + "epoch": 0.04, + "grad_norm": 2.576052254739924, + "learning_rate": 9.997210846929945e-06, + "loss": 0.7156, + "step": 384 + }, + { + "epoch": 0.04, + "grad_norm": 2.957239805878372, + "learning_rate": 9.997153640451748e-06, + "loss": 0.7981, + "step": 385 + }, + { + "epoch": 0.04, + "grad_norm": 2.406716857617946, + "learning_rate": 9.997095853418685e-06, + "loss": 0.7176, + "step": 386 + }, + { + "epoch": 0.04, + "grad_norm": 2.5783679058938773, + "learning_rate": 9.997037485837474e-06, + "loss": 0.8001, + "step": 387 + }, + { + "epoch": 0.04, + "grad_norm": 3.2799386939664985, + "learning_rate": 9.996978537714891e-06, + "loss": 0.7861, + "step": 388 + }, + { + "epoch": 0.04, + "grad_norm": 2.6320652922884773, + "learning_rate": 9.996919009057787e-06, + "loss": 0.844, + "step": 389 + }, + { + "epoch": 0.04, + "grad_norm": 3.079074473308074, + "learning_rate": 9.996858899873076e-06, + "loss": 0.8376, + "step": 390 + }, + { + "epoch": 0.04, + "grad_norm": 2.810909666983869, + "learning_rate": 9.996798210167745e-06, + "loss": 0.8102, + "step": 391 + }, + { + "epoch": 0.04, + "grad_norm": 2.822002229623917, + "learning_rate": 9.996736939948838e-06, + "loss": 0.7358, + "step": 392 + }, + { + "epoch": 0.04, + "grad_norm": 2.553150551330481, + "learning_rate": 9.996675089223481e-06, + "loss": 0.8613, + "step": 393 + }, + { + "epoch": 0.04, + "grad_norm": 2.6973816176709278, + "learning_rate": 9.996612657998856e-06, + "loss": 0.7688, + "step": 394 + }, + { + "epoch": 0.04, + "grad_norm": 3.017782068668808, + "learning_rate": 9.996549646282214e-06, + "loss": 0.8728, + "step": 395 + }, + { + "epoch": 0.04, + "grad_norm": 2.4233265237128307, + "learning_rate": 9.99648605408088e-06, + "loss": 0.7584, + "step": 396 + }, + { + "epoch": 0.04, + "grad_norm": 2.5786786252897844, + "learning_rate": 9.996421881402238e-06, + "loss": 0.8619, + "step": 397 + }, + { + "epoch": 0.04, + "grad_norm": 2.733216464863266, + "learning_rate": 9.996357128253747e-06, + "loss": 0.7994, + "step": 398 + }, + { + "epoch": 0.04, + "grad_norm": 2.1790269474955832, + "learning_rate": 9.996291794642924e-06, + "loss": 0.7871, + "step": 399 + }, + { + "epoch": 0.04, + "grad_norm": 2.506896691185903, + "learning_rate": 9.996225880577366e-06, + "loss": 0.7843, + "step": 400 + }, + { + "epoch": 0.04, + "grad_norm": 2.3733740763366336, + "learning_rate": 9.996159386064728e-06, + "loss": 0.743, + "step": 401 + }, + { + "epoch": 0.04, + "grad_norm": 2.3332932132080813, + "learning_rate": 9.996092311112734e-06, + "loss": 0.7422, + "step": 402 + }, + { + "epoch": 0.04, + "grad_norm": 2.521593140693123, + "learning_rate": 9.996024655729177e-06, + "loss": 0.8187, + "step": 403 + }, + { + "epoch": 0.04, + "grad_norm": 2.7935109625374253, + "learning_rate": 9.99595641992192e-06, + "loss": 0.8251, + "step": 404 + }, + { + "epoch": 0.04, + "grad_norm": 8.26279670060322, + "learning_rate": 9.995887603698886e-06, + "loss": 0.7706, + "step": 405 + }, + { + "epoch": 0.04, + "grad_norm": 2.41303802576548, + "learning_rate": 9.99581820706807e-06, + "loss": 0.7678, + "step": 406 + }, + { + "epoch": 0.04, + "grad_norm": 3.0303506168562255, + "learning_rate": 9.99574823003754e-06, + "loss": 0.7319, + "step": 407 + }, + { + "epoch": 0.04, + "grad_norm": 3.486164975875137, + "learning_rate": 9.99567767261542e-06, + "loss": 0.8546, + "step": 408 + }, + { + "epoch": 0.04, + "grad_norm": 2.9414654462050036, + "learning_rate": 9.995606534809909e-06, + "loss": 0.77, + "step": 409 + }, + { + "epoch": 0.04, + "grad_norm": 2.5654915500442366, + "learning_rate": 9.995534816629271e-06, + "loss": 0.7913, + "step": 410 + }, + { + "epoch": 0.04, + "grad_norm": 2.6295512034240995, + "learning_rate": 9.99546251808184e-06, + "loss": 0.8889, + "step": 411 + }, + { + "epoch": 0.04, + "grad_norm": 2.789997301197883, + "learning_rate": 9.995389639176013e-06, + "loss": 0.848, + "step": 412 + }, + { + "epoch": 0.04, + "grad_norm": 2.9172960774366037, + "learning_rate": 9.995316179920258e-06, + "loss": 0.729, + "step": 413 + }, + { + "epoch": 0.04, + "grad_norm": 2.5099680383958822, + "learning_rate": 9.99524214032311e-06, + "loss": 0.7368, + "step": 414 + }, + { + "epoch": 0.04, + "grad_norm": 2.516008292563799, + "learning_rate": 9.99516752039317e-06, + "loss": 0.8693, + "step": 415 + }, + { + "epoch": 0.04, + "grad_norm": 2.2755006768965584, + "learning_rate": 9.995092320139106e-06, + "loss": 0.7947, + "step": 416 + }, + { + "epoch": 0.04, + "grad_norm": 2.276675967191438, + "learning_rate": 9.995016539569656e-06, + "loss": 0.8244, + "step": 417 + }, + { + "epoch": 0.04, + "grad_norm": 2.4161022069453173, + "learning_rate": 9.994940178693624e-06, + "loss": 0.8016, + "step": 418 + }, + { + "epoch": 0.04, + "grad_norm": 3.8339702744181325, + "learning_rate": 9.99486323751988e-06, + "loss": 0.7548, + "step": 419 + }, + { + "epoch": 0.04, + "grad_norm": 4.975612430718834, + "learning_rate": 9.994785716057364e-06, + "loss": 0.7163, + "step": 420 + }, + { + "epoch": 0.04, + "grad_norm": 2.5503239423291677, + "learning_rate": 9.994707614315084e-06, + "loss": 0.7693, + "step": 421 + }, + { + "epoch": 0.04, + "grad_norm": 2.8040417675794083, + "learning_rate": 9.99462893230211e-06, + "loss": 0.7943, + "step": 422 + }, + { + "epoch": 0.04, + "grad_norm": 3.6155920470952188, + "learning_rate": 9.994549670027584e-06, + "loss": 0.8104, + "step": 423 + }, + { + "epoch": 0.04, + "grad_norm": 3.6688952806199437, + "learning_rate": 9.994469827500716e-06, + "loss": 0.7914, + "step": 424 + }, + { + "epoch": 0.04, + "grad_norm": 2.784372060150453, + "learning_rate": 9.99438940473078e-06, + "loss": 0.7975, + "step": 425 + }, + { + "epoch": 0.04, + "grad_norm": 2.5929689621810783, + "learning_rate": 9.994308401727122e-06, + "loss": 0.8029, + "step": 426 + }, + { + "epoch": 0.04, + "grad_norm": 2.827200496684302, + "learning_rate": 9.99422681849915e-06, + "loss": 0.8063, + "step": 427 + }, + { + "epoch": 0.05, + "grad_norm": 2.420731477742907, + "learning_rate": 9.994144655056343e-06, + "loss": 0.7089, + "step": 428 + }, + { + "epoch": 0.05, + "grad_norm": 3.1411562499918078, + "learning_rate": 9.994061911408245e-06, + "loss": 0.8219, + "step": 429 + }, + { + "epoch": 0.05, + "grad_norm": 5.296664953262252, + "learning_rate": 9.993978587564473e-06, + "loss": 0.748, + "step": 430 + }, + { + "epoch": 0.05, + "grad_norm": 3.786778377368977, + "learning_rate": 9.993894683534704e-06, + "loss": 0.79, + "step": 431 + }, + { + "epoch": 0.05, + "grad_norm": 4.9069867693590075, + "learning_rate": 9.993810199328687e-06, + "loss": 0.7826, + "step": 432 + }, + { + "epoch": 0.05, + "grad_norm": 3.7740885681631293, + "learning_rate": 9.993725134956235e-06, + "loss": 0.8472, + "step": 433 + }, + { + "epoch": 0.05, + "grad_norm": 3.0792423971075777, + "learning_rate": 9.993639490427235e-06, + "loss": 0.7272, + "step": 434 + }, + { + "epoch": 0.05, + "grad_norm": 2.505248944163071, + "learning_rate": 9.993553265751632e-06, + "loss": 0.8125, + "step": 435 + }, + { + "epoch": 0.05, + "grad_norm": 3.1705332082489175, + "learning_rate": 9.993466460939447e-06, + "loss": 0.7515, + "step": 436 + }, + { + "epoch": 0.05, + "grad_norm": 3.068737573080172, + "learning_rate": 9.993379076000762e-06, + "loss": 0.6471, + "step": 437 + }, + { + "epoch": 0.05, + "grad_norm": 3.0565141356797696, + "learning_rate": 9.99329111094573e-06, + "loss": 0.7869, + "step": 438 + }, + { + "epoch": 0.05, + "grad_norm": 2.418500158534013, + "learning_rate": 9.993202565784573e-06, + "loss": 0.8211, + "step": 439 + }, + { + "epoch": 0.05, + "grad_norm": 2.423639948102629, + "learning_rate": 9.993113440527573e-06, + "loss": 0.815, + "step": 440 + }, + { + "epoch": 0.05, + "grad_norm": 2.2704412010849007, + "learning_rate": 9.993023735185088e-06, + "loss": 0.7843, + "step": 441 + }, + { + "epoch": 0.05, + "grad_norm": 2.521200294230519, + "learning_rate": 9.992933449767538e-06, + "loss": 0.7483, + "step": 442 + }, + { + "epoch": 0.05, + "grad_norm": 2.706970229721735, + "learning_rate": 9.992842584285416e-06, + "loss": 0.7187, + "step": 443 + }, + { + "epoch": 0.05, + "grad_norm": 2.390640870602009, + "learning_rate": 9.992751138749273e-06, + "loss": 0.7721, + "step": 444 + }, + { + "epoch": 0.05, + "grad_norm": 2.5982385192863124, + "learning_rate": 9.992659113169736e-06, + "loss": 0.7153, + "step": 445 + }, + { + "epoch": 0.05, + "grad_norm": 2.4856303794193093, + "learning_rate": 9.992566507557495e-06, + "loss": 0.7573, + "step": 446 + }, + { + "epoch": 0.05, + "grad_norm": 2.5444768288105166, + "learning_rate": 9.99247332192331e-06, + "loss": 0.7668, + "step": 447 + }, + { + "epoch": 0.05, + "grad_norm": 2.040260161584014, + "learning_rate": 9.992379556278006e-06, + "loss": 0.7644, + "step": 448 + }, + { + "epoch": 0.05, + "grad_norm": 2.3952337813826174, + "learning_rate": 9.992285210632476e-06, + "loss": 0.7813, + "step": 449 + }, + { + "epoch": 0.05, + "grad_norm": 2.7875393710927003, + "learning_rate": 9.992190284997683e-06, + "loss": 0.8625, + "step": 450 + }, + { + "epoch": 0.05, + "grad_norm": 2.3456211144367436, + "learning_rate": 9.992094779384651e-06, + "loss": 0.7421, + "step": 451 + }, + { + "epoch": 0.05, + "grad_norm": 2.2941981241460914, + "learning_rate": 9.991998693804482e-06, + "loss": 0.786, + "step": 452 + }, + { + "epoch": 0.05, + "grad_norm": 2.7710253391308113, + "learning_rate": 9.991902028268333e-06, + "loss": 0.8221, + "step": 453 + }, + { + "epoch": 0.05, + "grad_norm": 2.8127605742355093, + "learning_rate": 9.991804782787435e-06, + "loss": 0.8029, + "step": 454 + }, + { + "epoch": 0.05, + "grad_norm": 2.21579763303864, + "learning_rate": 9.991706957373088e-06, + "loss": 0.8484, + "step": 455 + }, + { + "epoch": 0.05, + "grad_norm": 3.0567607175438702, + "learning_rate": 9.991608552036659e-06, + "loss": 0.7895, + "step": 456 + }, + { + "epoch": 0.05, + "grad_norm": 2.337491818683799, + "learning_rate": 9.991509566789575e-06, + "loss": 0.8293, + "step": 457 + }, + { + "epoch": 0.05, + "grad_norm": 2.0862821534316573, + "learning_rate": 9.991410001643338e-06, + "loss": 0.7856, + "step": 458 + }, + { + "epoch": 0.05, + "grad_norm": 2.278358994242391, + "learning_rate": 9.991309856609517e-06, + "loss": 0.7536, + "step": 459 + }, + { + "epoch": 0.05, + "grad_norm": 1.9648242121722526, + "learning_rate": 9.991209131699745e-06, + "loss": 0.7508, + "step": 460 + }, + { + "epoch": 0.05, + "grad_norm": 2.454545521761428, + "learning_rate": 9.991107826925724e-06, + "loss": 0.7682, + "step": 461 + }, + { + "epoch": 0.05, + "grad_norm": 2.3135550329147647, + "learning_rate": 9.991005942299224e-06, + "loss": 0.794, + "step": 462 + }, + { + "epoch": 0.05, + "grad_norm": 2.958497853786959, + "learning_rate": 9.990903477832081e-06, + "loss": 0.884, + "step": 463 + }, + { + "epoch": 0.05, + "grad_norm": 2.559497699524832, + "learning_rate": 9.990800433536198e-06, + "loss": 0.7475, + "step": 464 + }, + { + "epoch": 0.05, + "grad_norm": 2.2979423649720405, + "learning_rate": 9.990696809423551e-06, + "loss": 0.8139, + "step": 465 + }, + { + "epoch": 0.05, + "grad_norm": 3.477081620098315, + "learning_rate": 9.990592605506172e-06, + "loss": 0.7413, + "step": 466 + }, + { + "epoch": 0.05, + "grad_norm": 4.145253092459398, + "learning_rate": 9.990487821796171e-06, + "loss": 0.816, + "step": 467 + }, + { + "epoch": 0.05, + "grad_norm": 2.6671842619211605, + "learning_rate": 9.99038245830572e-06, + "loss": 0.7634, + "step": 468 + }, + { + "epoch": 0.05, + "grad_norm": 2.4700914843679307, + "learning_rate": 9.990276515047063e-06, + "loss": 0.7116, + "step": 469 + }, + { + "epoch": 0.05, + "grad_norm": 2.2107804543409255, + "learning_rate": 9.990169992032506e-06, + "loss": 0.8021, + "step": 470 + }, + { + "epoch": 0.05, + "grad_norm": 2.1867548251004902, + "learning_rate": 9.990062889274423e-06, + "loss": 0.8221, + "step": 471 + }, + { + "epoch": 0.05, + "grad_norm": 3.1546753580491793, + "learning_rate": 9.989955206785258e-06, + "loss": 0.8515, + "step": 472 + }, + { + "epoch": 0.05, + "grad_norm": 2.845940673194977, + "learning_rate": 9.989846944577524e-06, + "loss": 0.7823, + "step": 473 + }, + { + "epoch": 0.05, + "grad_norm": 2.782847174447964, + "learning_rate": 9.989738102663792e-06, + "loss": 0.8043, + "step": 474 + }, + { + "epoch": 0.05, + "grad_norm": 1.5304700280693413, + "learning_rate": 9.989628681056716e-06, + "loss": 0.6264, + "step": 475 + }, + { + "epoch": 0.05, + "grad_norm": 2.4549669242826684, + "learning_rate": 9.989518679769e-06, + "loss": 0.8646, + "step": 476 + }, + { + "epoch": 0.05, + "grad_norm": 2.704340279572477, + "learning_rate": 9.989408098813429e-06, + "loss": 0.8055, + "step": 477 + }, + { + "epoch": 0.05, + "grad_norm": 2.5055698906785566, + "learning_rate": 9.989296938202846e-06, + "loss": 0.7233, + "step": 478 + }, + { + "epoch": 0.05, + "grad_norm": 3.2218415084935477, + "learning_rate": 9.989185197950168e-06, + "loss": 0.8227, + "step": 479 + }, + { + "epoch": 0.05, + "grad_norm": 2.3617802926725933, + "learning_rate": 9.989072878068376e-06, + "loss": 0.8324, + "step": 480 + }, + { + "epoch": 0.05, + "grad_norm": 2.4468315450678713, + "learning_rate": 9.98895997857052e-06, + "loss": 0.7699, + "step": 481 + }, + { + "epoch": 0.05, + "grad_norm": 2.667742814508917, + "learning_rate": 9.988846499469714e-06, + "loss": 0.9051, + "step": 482 + }, + { + "epoch": 0.05, + "grad_norm": 2.7213442322657078, + "learning_rate": 9.988732440779145e-06, + "loss": 0.7052, + "step": 483 + }, + { + "epoch": 0.05, + "grad_norm": 2.326845379917123, + "learning_rate": 9.98861780251206e-06, + "loss": 0.7763, + "step": 484 + }, + { + "epoch": 0.05, + "grad_norm": 2.9136376815635368, + "learning_rate": 9.98850258468178e-06, + "loss": 0.7103, + "step": 485 + }, + { + "epoch": 0.05, + "grad_norm": 2.2455870076795366, + "learning_rate": 9.988386787301689e-06, + "loss": 0.8498, + "step": 486 + }, + { + "epoch": 0.05, + "grad_norm": 2.697658986196603, + "learning_rate": 9.988270410385242e-06, + "loss": 0.7247, + "step": 487 + }, + { + "epoch": 0.05, + "grad_norm": 2.6037250054010124, + "learning_rate": 9.98815345394596e-06, + "loss": 0.7998, + "step": 488 + }, + { + "epoch": 0.05, + "grad_norm": 2.2732857324114515, + "learning_rate": 9.988035917997426e-06, + "loss": 0.7466, + "step": 489 + }, + { + "epoch": 0.05, + "grad_norm": 2.447532792950365, + "learning_rate": 9.9879178025533e-06, + "loss": 0.7684, + "step": 490 + }, + { + "epoch": 0.05, + "grad_norm": 2.4996822315622005, + "learning_rate": 9.987799107627301e-06, + "loss": 0.7892, + "step": 491 + }, + { + "epoch": 0.05, + "grad_norm": 2.4392244915064354, + "learning_rate": 9.987679833233219e-06, + "loss": 0.7154, + "step": 492 + }, + { + "epoch": 0.05, + "grad_norm": 3.01944378814734, + "learning_rate": 9.987559979384913e-06, + "loss": 0.8231, + "step": 493 + }, + { + "epoch": 0.05, + "grad_norm": 2.3479953891689944, + "learning_rate": 9.987439546096309e-06, + "loss": 0.7696, + "step": 494 + }, + { + "epoch": 0.05, + "grad_norm": 2.095707445377812, + "learning_rate": 9.987318533381391e-06, + "loss": 0.7402, + "step": 495 + }, + { + "epoch": 0.05, + "grad_norm": 3.6125099373264056, + "learning_rate": 9.987196941254226e-06, + "loss": 0.8434, + "step": 496 + }, + { + "epoch": 0.05, + "grad_norm": 2.407463112602799, + "learning_rate": 9.987074769728936e-06, + "loss": 0.7014, + "step": 497 + }, + { + "epoch": 0.05, + "grad_norm": 2.2803041227883676, + "learning_rate": 9.986952018819715e-06, + "loss": 0.7015, + "step": 498 + }, + { + "epoch": 0.05, + "grad_norm": 2.850670147969541, + "learning_rate": 9.986828688540825e-06, + "loss": 0.8059, + "step": 499 + }, + { + "epoch": 0.05, + "grad_norm": 2.5894833396875727, + "learning_rate": 9.986704778906594e-06, + "loss": 0.8364, + "step": 500 + }, + { + "epoch": 0.05, + "grad_norm": 2.310359350175941, + "learning_rate": 9.986580289931416e-06, + "loss": 0.854, + "step": 501 + }, + { + "epoch": 0.05, + "grad_norm": 1.3368374036518098, + "learning_rate": 9.986455221629754e-06, + "loss": 0.5978, + "step": 502 + }, + { + "epoch": 0.05, + "grad_norm": 2.4931722034378425, + "learning_rate": 9.98632957401614e-06, + "loss": 0.8525, + "step": 503 + }, + { + "epoch": 0.05, + "grad_norm": 4.644787385474121, + "learning_rate": 9.986203347105168e-06, + "loss": 0.8037, + "step": 504 + }, + { + "epoch": 0.05, + "grad_norm": 2.563442064696163, + "learning_rate": 9.986076540911507e-06, + "loss": 0.8657, + "step": 505 + }, + { + "epoch": 0.05, + "grad_norm": 2.1165485781851836, + "learning_rate": 9.985949155449885e-06, + "loss": 0.7919, + "step": 506 + }, + { + "epoch": 0.05, + "grad_norm": 3.471404486523451, + "learning_rate": 9.985821190735104e-06, + "loss": 0.7542, + "step": 507 + }, + { + "epoch": 0.05, + "grad_norm": 2.3249602966837206, + "learning_rate": 9.98569264678203e-06, + "loss": 0.8402, + "step": 508 + }, + { + "epoch": 0.05, + "grad_norm": 2.48495778605686, + "learning_rate": 9.985563523605597e-06, + "loss": 0.721, + "step": 509 + }, + { + "epoch": 0.05, + "grad_norm": 2.4276988470343577, + "learning_rate": 9.985433821220805e-06, + "loss": 0.8305, + "step": 510 + }, + { + "epoch": 0.05, + "grad_norm": 2.266343385980247, + "learning_rate": 9.985303539642721e-06, + "loss": 0.8618, + "step": 511 + }, + { + "epoch": 0.05, + "grad_norm": 2.6368944571388737, + "learning_rate": 9.985172678886486e-06, + "loss": 0.7497, + "step": 512 + }, + { + "epoch": 0.05, + "grad_norm": 2.3362271184394703, + "learning_rate": 9.985041238967297e-06, + "loss": 0.7608, + "step": 513 + }, + { + "epoch": 0.05, + "grad_norm": 2.323494738235158, + "learning_rate": 9.984909219900429e-06, + "loss": 0.8141, + "step": 514 + }, + { + "epoch": 0.05, + "grad_norm": 2.2944704925260733, + "learning_rate": 9.984776621701218e-06, + "loss": 0.7668, + "step": 515 + }, + { + "epoch": 0.05, + "grad_norm": 2.539104026728401, + "learning_rate": 9.984643444385067e-06, + "loss": 0.6898, + "step": 516 + }, + { + "epoch": 0.05, + "grad_norm": 2.2795736059424563, + "learning_rate": 9.984509687967451e-06, + "loss": 0.8352, + "step": 517 + }, + { + "epoch": 0.05, + "grad_norm": 2.5063326762752127, + "learning_rate": 9.984375352463908e-06, + "loss": 0.7219, + "step": 518 + }, + { + "epoch": 0.05, + "grad_norm": 2.875219661862659, + "learning_rate": 9.984240437890045e-06, + "loss": 0.7054, + "step": 519 + }, + { + "epoch": 0.05, + "grad_norm": 2.8779919468667536, + "learning_rate": 9.984104944261536e-06, + "loss": 0.8438, + "step": 520 + }, + { + "epoch": 0.05, + "grad_norm": 2.222566644720778, + "learning_rate": 9.983968871594121e-06, + "loss": 0.7997, + "step": 521 + }, + { + "epoch": 0.05, + "grad_norm": 2.959982902693521, + "learning_rate": 9.98383221990361e-06, + "loss": 0.7926, + "step": 522 + }, + { + "epoch": 0.06, + "grad_norm": 3.1528886906910403, + "learning_rate": 9.983694989205882e-06, + "loss": 0.8242, + "step": 523 + }, + { + "epoch": 0.06, + "grad_norm": 2.18977821015223, + "learning_rate": 9.983557179516872e-06, + "loss": 0.757, + "step": 524 + }, + { + "epoch": 0.06, + "grad_norm": 1.5734084664110641, + "learning_rate": 9.983418790852597e-06, + "loss": 0.6629, + "step": 525 + }, + { + "epoch": 0.06, + "grad_norm": 2.324375311018105, + "learning_rate": 9.983279823229132e-06, + "loss": 0.8114, + "step": 526 + }, + { + "epoch": 0.06, + "grad_norm": 3.124013190729634, + "learning_rate": 9.983140276662621e-06, + "loss": 0.779, + "step": 527 + }, + { + "epoch": 0.06, + "grad_norm": 2.535001843346559, + "learning_rate": 9.98300015116928e-06, + "loss": 0.7235, + "step": 528 + }, + { + "epoch": 0.06, + "grad_norm": 2.1443674722870116, + "learning_rate": 9.982859446765385e-06, + "loss": 0.8205, + "step": 529 + }, + { + "epoch": 0.06, + "grad_norm": 3.3022865226413534, + "learning_rate": 9.982718163467282e-06, + "loss": 0.7919, + "step": 530 + }, + { + "epoch": 0.06, + "grad_norm": 2.2210575024061376, + "learning_rate": 9.982576301291387e-06, + "loss": 0.8224, + "step": 531 + }, + { + "epoch": 0.06, + "grad_norm": 2.266820994801646, + "learning_rate": 9.982433860254181e-06, + "loss": 0.8027, + "step": 532 + }, + { + "epoch": 0.06, + "grad_norm": 2.8366928780887593, + "learning_rate": 9.982290840372212e-06, + "loss": 0.7632, + "step": 533 + }, + { + "epoch": 0.06, + "grad_norm": 2.7429115320708926, + "learning_rate": 9.982147241662097e-06, + "loss": 0.8406, + "step": 534 + }, + { + "epoch": 0.06, + "grad_norm": 2.324430888530594, + "learning_rate": 9.982003064140515e-06, + "loss": 0.7881, + "step": 535 + }, + { + "epoch": 0.06, + "grad_norm": 2.519378774537875, + "learning_rate": 9.98185830782422e-06, + "loss": 0.7492, + "step": 536 + }, + { + "epoch": 0.06, + "grad_norm": 2.2247325308571977, + "learning_rate": 9.981712972730027e-06, + "loss": 0.7321, + "step": 537 + }, + { + "epoch": 0.06, + "grad_norm": 2.367419912672573, + "learning_rate": 9.981567058874822e-06, + "loss": 0.7559, + "step": 538 + }, + { + "epoch": 0.06, + "grad_norm": 2.442752211560955, + "learning_rate": 9.981420566275554e-06, + "loss": 0.7823, + "step": 539 + }, + { + "epoch": 0.06, + "grad_norm": 2.718090756388608, + "learning_rate": 9.981273494949247e-06, + "loss": 0.7689, + "step": 540 + }, + { + "epoch": 0.06, + "grad_norm": 1.3168535978993352, + "learning_rate": 9.981125844912985e-06, + "loss": 0.6097, + "step": 541 + }, + { + "epoch": 0.06, + "grad_norm": 2.39423124719434, + "learning_rate": 9.98097761618392e-06, + "loss": 0.8062, + "step": 542 + }, + { + "epoch": 0.06, + "grad_norm": 2.6461362569919227, + "learning_rate": 9.980828808779275e-06, + "loss": 0.8499, + "step": 543 + }, + { + "epoch": 0.06, + "grad_norm": 2.6944205009407094, + "learning_rate": 9.980679422716336e-06, + "loss": 0.8563, + "step": 544 + }, + { + "epoch": 0.06, + "grad_norm": 2.419123667806579, + "learning_rate": 9.98052945801246e-06, + "loss": 0.8623, + "step": 545 + }, + { + "epoch": 0.06, + "grad_norm": 2.7435396717539557, + "learning_rate": 9.980378914685069e-06, + "loss": 0.7444, + "step": 546 + }, + { + "epoch": 0.06, + "grad_norm": 2.662949276618667, + "learning_rate": 9.980227792751653e-06, + "loss": 0.7362, + "step": 547 + }, + { + "epoch": 0.06, + "grad_norm": 2.9210580658144902, + "learning_rate": 9.980076092229767e-06, + "loss": 0.7805, + "step": 548 + }, + { + "epoch": 0.06, + "grad_norm": 2.6271159104021105, + "learning_rate": 9.979923813137039e-06, + "loss": 0.7591, + "step": 549 + }, + { + "epoch": 0.06, + "grad_norm": 2.1688198714122695, + "learning_rate": 9.979770955491154e-06, + "loss": 0.7107, + "step": 550 + }, + { + "epoch": 0.06, + "grad_norm": 3.3806001298954946, + "learning_rate": 9.979617519309878e-06, + "loss": 0.7611, + "step": 551 + }, + { + "epoch": 0.06, + "grad_norm": 2.6186345183581548, + "learning_rate": 9.97946350461103e-06, + "loss": 0.8052, + "step": 552 + }, + { + "epoch": 0.06, + "grad_norm": 2.434402348311883, + "learning_rate": 9.979308911412508e-06, + "loss": 0.7546, + "step": 553 + }, + { + "epoch": 0.06, + "grad_norm": 2.1361701202500973, + "learning_rate": 9.979153739732273e-06, + "loss": 0.6987, + "step": 554 + }, + { + "epoch": 0.06, + "grad_norm": 2.638578499216826, + "learning_rate": 9.978997989588346e-06, + "loss": 0.8184, + "step": 555 + }, + { + "epoch": 0.06, + "grad_norm": 2.397169992403867, + "learning_rate": 9.978841660998827e-06, + "loss": 0.748, + "step": 556 + }, + { + "epoch": 0.06, + "grad_norm": 2.6871052832438886, + "learning_rate": 9.978684753981875e-06, + "loss": 0.8245, + "step": 557 + }, + { + "epoch": 0.06, + "grad_norm": 2.127649979618681, + "learning_rate": 9.978527268555723e-06, + "loss": 0.7135, + "step": 558 + }, + { + "epoch": 0.06, + "grad_norm": 2.500931002963333, + "learning_rate": 9.97836920473866e-06, + "loss": 0.7869, + "step": 559 + }, + { + "epoch": 0.06, + "grad_norm": 2.6065448044913357, + "learning_rate": 9.978210562549057e-06, + "loss": 0.7589, + "step": 560 + }, + { + "epoch": 0.06, + "grad_norm": 2.2848105655651274, + "learning_rate": 9.978051342005342e-06, + "loss": 0.7439, + "step": 561 + }, + { + "epoch": 0.06, + "grad_norm": 2.3300511317144, + "learning_rate": 9.97789154312601e-06, + "loss": 0.731, + "step": 562 + }, + { + "epoch": 0.06, + "grad_norm": 2.53982707617505, + "learning_rate": 9.97773116592963e-06, + "loss": 0.7874, + "step": 563 + }, + { + "epoch": 0.06, + "grad_norm": 3.0472971560396367, + "learning_rate": 9.977570210434831e-06, + "loss": 0.7261, + "step": 564 + }, + { + "epoch": 0.06, + "grad_norm": 2.883803485999733, + "learning_rate": 9.977408676660314e-06, + "loss": 0.8079, + "step": 565 + }, + { + "epoch": 0.06, + "grad_norm": 19.573290818215536, + "learning_rate": 9.977246564624845e-06, + "loss": 0.7604, + "step": 566 + }, + { + "epoch": 0.06, + "grad_norm": 2.6716091098325245, + "learning_rate": 9.977083874347258e-06, + "loss": 0.6414, + "step": 567 + }, + { + "epoch": 0.06, + "grad_norm": 3.6901090885051846, + "learning_rate": 9.976920605846452e-06, + "loss": 0.8105, + "step": 568 + }, + { + "epoch": 0.06, + "grad_norm": 2.7134792536923555, + "learning_rate": 9.976756759141399e-06, + "loss": 0.7635, + "step": 569 + }, + { + "epoch": 0.06, + "grad_norm": 2.7086458704405434, + "learning_rate": 9.976592334251132e-06, + "loss": 0.8302, + "step": 570 + }, + { + "epoch": 0.06, + "grad_norm": 2.1306874134793508, + "learning_rate": 9.976427331194753e-06, + "loss": 0.7362, + "step": 571 + }, + { + "epoch": 0.06, + "grad_norm": 2.45895557799812, + "learning_rate": 9.976261749991433e-06, + "loss": 0.7854, + "step": 572 + }, + { + "epoch": 0.06, + "grad_norm": 2.2266882437527475, + "learning_rate": 9.976095590660407e-06, + "loss": 0.8152, + "step": 573 + }, + { + "epoch": 0.06, + "grad_norm": 2.363404791350861, + "learning_rate": 9.975928853220979e-06, + "loss": 0.8157, + "step": 574 + }, + { + "epoch": 0.06, + "grad_norm": 2.4298149385499475, + "learning_rate": 9.975761537692522e-06, + "loss": 0.7796, + "step": 575 + }, + { + "epoch": 0.06, + "grad_norm": 2.2830133914169424, + "learning_rate": 9.975593644094472e-06, + "loss": 0.7881, + "step": 576 + }, + { + "epoch": 0.06, + "grad_norm": 5.164741876077585, + "learning_rate": 9.975425172446336e-06, + "loss": 0.6478, + "step": 577 + }, + { + "epoch": 0.06, + "grad_norm": 2.1615284561481283, + "learning_rate": 9.975256122767687e-06, + "loss": 0.7038, + "step": 578 + }, + { + "epoch": 0.06, + "grad_norm": 2.8554861521662316, + "learning_rate": 9.975086495078161e-06, + "loss": 0.8082, + "step": 579 + }, + { + "epoch": 0.06, + "grad_norm": 2.8793525267467426, + "learning_rate": 9.974916289397469e-06, + "loss": 0.7612, + "step": 580 + }, + { + "epoch": 0.06, + "grad_norm": 2.2817512397632775, + "learning_rate": 9.974745505745385e-06, + "loss": 0.7507, + "step": 581 + }, + { + "epoch": 0.06, + "grad_norm": 2.3293064382183752, + "learning_rate": 9.974574144141746e-06, + "loss": 0.782, + "step": 582 + }, + { + "epoch": 0.06, + "grad_norm": 2.420863828032633, + "learning_rate": 9.974402204606464e-06, + "loss": 0.7904, + "step": 583 + }, + { + "epoch": 0.06, + "grad_norm": 2.467148177657104, + "learning_rate": 9.974229687159515e-06, + "loss": 0.7148, + "step": 584 + }, + { + "epoch": 0.06, + "grad_norm": 2.3692761510748506, + "learning_rate": 9.974056591820937e-06, + "loss": 0.7572, + "step": 585 + }, + { + "epoch": 0.06, + "grad_norm": 2.410599414340472, + "learning_rate": 9.973882918610845e-06, + "loss": 0.7463, + "step": 586 + }, + { + "epoch": 0.06, + "grad_norm": 2.407706605193204, + "learning_rate": 9.973708667549413e-06, + "loss": 0.7719, + "step": 587 + }, + { + "epoch": 0.06, + "grad_norm": 2.1487892620296885, + "learning_rate": 9.973533838656886e-06, + "loss": 0.737, + "step": 588 + }, + { + "epoch": 0.06, + "grad_norm": 2.983088881371681, + "learning_rate": 9.973358431953574e-06, + "loss": 0.8082, + "step": 589 + }, + { + "epoch": 0.06, + "grad_norm": 4.84274621933108, + "learning_rate": 9.973182447459856e-06, + "loss": 0.8005, + "step": 590 + }, + { + "epoch": 0.06, + "grad_norm": 2.514268336749957, + "learning_rate": 9.973005885196177e-06, + "loss": 0.8384, + "step": 591 + }, + { + "epoch": 0.06, + "grad_norm": 2.9786767920760076, + "learning_rate": 9.97282874518305e-06, + "loss": 0.8312, + "step": 592 + }, + { + "epoch": 0.06, + "grad_norm": 2.5407053182524333, + "learning_rate": 9.972651027441053e-06, + "loss": 0.8417, + "step": 593 + }, + { + "epoch": 0.06, + "grad_norm": 2.2034669895851526, + "learning_rate": 9.972472731990836e-06, + "loss": 0.7066, + "step": 594 + }, + { + "epoch": 0.06, + "grad_norm": 2.3153560832834725, + "learning_rate": 9.972293858853111e-06, + "loss": 0.7032, + "step": 595 + }, + { + "epoch": 0.06, + "grad_norm": 1.1674873013648661, + "learning_rate": 9.972114408048658e-06, + "loss": 0.6294, + "step": 596 + }, + { + "epoch": 0.06, + "grad_norm": 2.4983825264182977, + "learning_rate": 9.971934379598327e-06, + "loss": 0.8138, + "step": 597 + }, + { + "epoch": 0.06, + "grad_norm": 2.5640167266728517, + "learning_rate": 9.971753773523032e-06, + "loss": 0.7516, + "step": 598 + }, + { + "epoch": 0.06, + "grad_norm": 2.819664317552869, + "learning_rate": 9.971572589843754e-06, + "loss": 0.7164, + "step": 599 + }, + { + "epoch": 0.06, + "grad_norm": 2.8700762251626, + "learning_rate": 9.971390828581546e-06, + "loss": 0.7128, + "step": 600 + }, + { + "epoch": 0.06, + "grad_norm": 3.2350947513381, + "learning_rate": 9.971208489757522e-06, + "loss": 0.7065, + "step": 601 + }, + { + "epoch": 0.06, + "grad_norm": 6.78868545134538, + "learning_rate": 9.971025573392863e-06, + "loss": 0.7491, + "step": 602 + }, + { + "epoch": 0.06, + "grad_norm": 2.620861295267675, + "learning_rate": 9.970842079508827e-06, + "loss": 0.8286, + "step": 603 + }, + { + "epoch": 0.06, + "grad_norm": 2.387001074929076, + "learning_rate": 9.970658008126725e-06, + "loss": 0.7537, + "step": 604 + }, + { + "epoch": 0.06, + "grad_norm": 2.356795460296981, + "learning_rate": 9.970473359267945e-06, + "loss": 0.7542, + "step": 605 + }, + { + "epoch": 0.06, + "grad_norm": 1.9886458699286347, + "learning_rate": 9.970288132953938e-06, + "loss": 0.6917, + "step": 606 + }, + { + "epoch": 0.06, + "grad_norm": 2.2751385664982706, + "learning_rate": 9.970102329206221e-06, + "loss": 0.7728, + "step": 607 + }, + { + "epoch": 0.06, + "grad_norm": 2.495226626436587, + "learning_rate": 9.969915948046387e-06, + "loss": 0.6747, + "step": 608 + }, + { + "epoch": 0.06, + "grad_norm": 1.4123037303465609, + "learning_rate": 9.969728989496081e-06, + "loss": 0.6243, + "step": 609 + }, + { + "epoch": 0.06, + "grad_norm": 2.510919227428744, + "learning_rate": 9.96954145357703e-06, + "loss": 0.7466, + "step": 610 + }, + { + "epoch": 0.06, + "grad_norm": 2.4441972726765404, + "learning_rate": 9.969353340311017e-06, + "loss": 0.7598, + "step": 611 + }, + { + "epoch": 0.06, + "grad_norm": 2.0740206980367337, + "learning_rate": 9.969164649719898e-06, + "loss": 0.7293, + "step": 612 + }, + { + "epoch": 0.06, + "grad_norm": 1.3874311549388647, + "learning_rate": 9.968975381825594e-06, + "loss": 0.6497, + "step": 613 + }, + { + "epoch": 0.06, + "grad_norm": 2.2328817638501284, + "learning_rate": 9.968785536650095e-06, + "loss": 0.7843, + "step": 614 + }, + { + "epoch": 0.06, + "grad_norm": 2.039098314636674, + "learning_rate": 9.968595114215453e-06, + "loss": 0.7148, + "step": 615 + }, + { + "epoch": 0.06, + "grad_norm": 1.8974999023720256, + "learning_rate": 9.968404114543796e-06, + "loss": 0.7553, + "step": 616 + }, + { + "epoch": 0.06, + "grad_norm": 2.363258301216911, + "learning_rate": 9.968212537657311e-06, + "loss": 0.7465, + "step": 617 + }, + { + "epoch": 0.07, + "grad_norm": 2.5506185481947714, + "learning_rate": 9.968020383578253e-06, + "loss": 0.8889, + "step": 618 + }, + { + "epoch": 0.07, + "grad_norm": 2.0478689904138068, + "learning_rate": 9.96782765232895e-06, + "loss": 0.7014, + "step": 619 + }, + { + "epoch": 0.07, + "grad_norm": 2.1272161891964703, + "learning_rate": 9.967634343931791e-06, + "loss": 0.8825, + "step": 620 + }, + { + "epoch": 0.07, + "grad_norm": 2.3598809615068794, + "learning_rate": 9.967440458409232e-06, + "loss": 0.7521, + "step": 621 + }, + { + "epoch": 0.07, + "grad_norm": 2.0759823166768627, + "learning_rate": 9.967245995783801e-06, + "loss": 0.7767, + "step": 622 + }, + { + "epoch": 0.07, + "grad_norm": 2.1835848414184875, + "learning_rate": 9.96705095607809e-06, + "loss": 0.8266, + "step": 623 + }, + { + "epoch": 0.07, + "grad_norm": 3.6137036631937254, + "learning_rate": 9.966855339314756e-06, + "loss": 0.857, + "step": 624 + }, + { + "epoch": 0.07, + "grad_norm": 2.516218552454981, + "learning_rate": 9.966659145516527e-06, + "loss": 0.8295, + "step": 625 + }, + { + "epoch": 0.07, + "grad_norm": 2.179275872622473, + "learning_rate": 9.966462374706196e-06, + "loss": 0.8447, + "step": 626 + }, + { + "epoch": 0.07, + "grad_norm": 2.0487084736863883, + "learning_rate": 9.966265026906622e-06, + "loss": 0.7527, + "step": 627 + }, + { + "epoch": 0.07, + "grad_norm": 2.394315996422968, + "learning_rate": 9.966067102140734e-06, + "loss": 0.7543, + "step": 628 + }, + { + "epoch": 0.07, + "grad_norm": 3.0716112519627363, + "learning_rate": 9.965868600431525e-06, + "loss": 0.7141, + "step": 629 + }, + { + "epoch": 0.07, + "grad_norm": 4.924340713473136, + "learning_rate": 9.965669521802057e-06, + "loss": 0.8291, + "step": 630 + }, + { + "epoch": 0.07, + "grad_norm": 2.8596202937468727, + "learning_rate": 9.965469866275457e-06, + "loss": 0.7771, + "step": 631 + }, + { + "epoch": 0.07, + "grad_norm": 2.7307465501368644, + "learning_rate": 9.965269633874924e-06, + "loss": 0.7157, + "step": 632 + }, + { + "epoch": 0.07, + "grad_norm": 2.407738691497479, + "learning_rate": 9.965068824623718e-06, + "loss": 0.6772, + "step": 633 + }, + { + "epoch": 0.07, + "grad_norm": 2.0971229549109176, + "learning_rate": 9.964867438545166e-06, + "loss": 0.7709, + "step": 634 + }, + { + "epoch": 0.07, + "grad_norm": 2.90404822468166, + "learning_rate": 9.964665475662668e-06, + "loss": 0.7457, + "step": 635 + }, + { + "epoch": 0.07, + "grad_norm": 2.213670720198414, + "learning_rate": 9.964462935999688e-06, + "loss": 0.8134, + "step": 636 + }, + { + "epoch": 0.07, + "grad_norm": 2.8026473036892554, + "learning_rate": 9.964259819579754e-06, + "loss": 0.7484, + "step": 637 + }, + { + "epoch": 0.07, + "grad_norm": 2.4715032303759554, + "learning_rate": 9.964056126426464e-06, + "loss": 0.851, + "step": 638 + }, + { + "epoch": 0.07, + "grad_norm": 1.9713105204657861, + "learning_rate": 9.963851856563483e-06, + "loss": 0.7706, + "step": 639 + }, + { + "epoch": 0.07, + "grad_norm": 2.0696765042050407, + "learning_rate": 9.963647010014541e-06, + "loss": 0.7526, + "step": 640 + }, + { + "epoch": 0.07, + "grad_norm": 2.61319517784048, + "learning_rate": 9.963441586803439e-06, + "loss": 0.7249, + "step": 641 + }, + { + "epoch": 0.07, + "grad_norm": 35.735839142292704, + "learning_rate": 9.963235586954043e-06, + "loss": 0.807, + "step": 642 + }, + { + "epoch": 0.07, + "grad_norm": 2.601165133281576, + "learning_rate": 9.963029010490281e-06, + "loss": 0.6441, + "step": 643 + }, + { + "epoch": 0.07, + "grad_norm": 2.700590007062543, + "learning_rate": 9.962821857436156e-06, + "loss": 0.699, + "step": 644 + }, + { + "epoch": 0.07, + "grad_norm": 2.83901661028331, + "learning_rate": 9.962614127815735e-06, + "loss": 0.715, + "step": 645 + }, + { + "epoch": 0.07, + "grad_norm": 3.2052448381433063, + "learning_rate": 9.96240582165315e-06, + "loss": 0.7545, + "step": 646 + }, + { + "epoch": 0.07, + "grad_norm": 3.50569734388418, + "learning_rate": 9.962196938972599e-06, + "loss": 0.7477, + "step": 647 + }, + { + "epoch": 0.07, + "grad_norm": 3.3913535336435077, + "learning_rate": 9.961987479798354e-06, + "loss": 0.6525, + "step": 648 + }, + { + "epoch": 0.07, + "grad_norm": 3.97776453246874, + "learning_rate": 9.961777444154747e-06, + "loss": 0.7575, + "step": 649 + }, + { + "epoch": 0.07, + "grad_norm": 2.6326354312277207, + "learning_rate": 9.96156683206618e-06, + "loss": 0.7891, + "step": 650 + }, + { + "epoch": 0.07, + "grad_norm": 2.590711433166387, + "learning_rate": 9.96135564355712e-06, + "loss": 0.8471, + "step": 651 + }, + { + "epoch": 0.07, + "grad_norm": 2.694706096312992, + "learning_rate": 9.961143878652104e-06, + "loss": 0.7534, + "step": 652 + }, + { + "epoch": 0.07, + "grad_norm": 2.1493792498317736, + "learning_rate": 9.960931537375731e-06, + "loss": 0.7545, + "step": 653 + }, + { + "epoch": 0.07, + "grad_norm": 2.547411269212289, + "learning_rate": 9.960718619752676e-06, + "loss": 0.7998, + "step": 654 + }, + { + "epoch": 0.07, + "grad_norm": 2.380301424223435, + "learning_rate": 9.96050512580767e-06, + "loss": 0.6799, + "step": 655 + }, + { + "epoch": 0.07, + "grad_norm": 2.317215606575152, + "learning_rate": 9.960291055565518e-06, + "loss": 0.8361, + "step": 656 + }, + { + "epoch": 0.07, + "grad_norm": 2.76463251057736, + "learning_rate": 9.96007640905109e-06, + "loss": 0.7474, + "step": 657 + }, + { + "epoch": 0.07, + "grad_norm": 2.337177345644392, + "learning_rate": 9.959861186289324e-06, + "loss": 0.7889, + "step": 658 + }, + { + "epoch": 0.07, + "grad_norm": 2.549090640868925, + "learning_rate": 9.95964538730522e-06, + "loss": 0.8025, + "step": 659 + }, + { + "epoch": 0.07, + "grad_norm": 2.530522238161703, + "learning_rate": 9.959429012123853e-06, + "loss": 0.715, + "step": 660 + }, + { + "epoch": 0.07, + "grad_norm": 2.515390648368771, + "learning_rate": 9.95921206077036e-06, + "loss": 0.7851, + "step": 661 + }, + { + "epoch": 0.07, + "grad_norm": 2.929755415997428, + "learning_rate": 9.958994533269947e-06, + "loss": 0.7982, + "step": 662 + }, + { + "epoch": 0.07, + "grad_norm": 2.450266556660888, + "learning_rate": 9.958776429647882e-06, + "loss": 0.6835, + "step": 663 + }, + { + "epoch": 0.07, + "grad_norm": 2.5343938278654954, + "learning_rate": 9.958557749929507e-06, + "loss": 0.7832, + "step": 664 + }, + { + "epoch": 0.07, + "grad_norm": 2.2961561118476643, + "learning_rate": 9.958338494140226e-06, + "loss": 0.7488, + "step": 665 + }, + { + "epoch": 0.07, + "grad_norm": 2.0185829134658593, + "learning_rate": 9.958118662305512e-06, + "loss": 0.6639, + "step": 666 + }, + { + "epoch": 0.07, + "grad_norm": 2.6097834515400935, + "learning_rate": 9.957898254450904e-06, + "loss": 0.8354, + "step": 667 + }, + { + "epoch": 0.07, + "grad_norm": 2.681329545781412, + "learning_rate": 9.957677270602009e-06, + "loss": 0.8243, + "step": 668 + }, + { + "epoch": 0.07, + "grad_norm": 3.0534672454472056, + "learning_rate": 9.957455710784499e-06, + "loss": 0.7536, + "step": 669 + }, + { + "epoch": 0.07, + "grad_norm": 2.306313918952685, + "learning_rate": 9.957233575024114e-06, + "loss": 0.831, + "step": 670 + }, + { + "epoch": 0.07, + "grad_norm": 2.188405047316281, + "learning_rate": 9.957010863346665e-06, + "loss": 0.7803, + "step": 671 + }, + { + "epoch": 0.07, + "grad_norm": 3.0129653737744073, + "learning_rate": 9.956787575778022e-06, + "loss": 0.721, + "step": 672 + }, + { + "epoch": 0.07, + "grad_norm": 2.3823769975743683, + "learning_rate": 9.956563712344127e-06, + "loss": 0.7449, + "step": 673 + }, + { + "epoch": 0.07, + "grad_norm": 2.5712155219456476, + "learning_rate": 9.956339273070988e-06, + "loss": 0.7693, + "step": 674 + }, + { + "epoch": 0.07, + "grad_norm": 2.917447740270616, + "learning_rate": 9.95611425798468e-06, + "loss": 0.7564, + "step": 675 + }, + { + "epoch": 0.07, + "grad_norm": 2.441012766615215, + "learning_rate": 9.955888667111341e-06, + "loss": 0.7311, + "step": 676 + }, + { + "epoch": 0.07, + "grad_norm": 2.3439122547341844, + "learning_rate": 9.955662500477185e-06, + "loss": 0.6111, + "step": 677 + }, + { + "epoch": 0.07, + "grad_norm": 2.3472763771803153, + "learning_rate": 9.955435758108488e-06, + "loss": 0.7722, + "step": 678 + }, + { + "epoch": 0.07, + "grad_norm": 3.0118308796961055, + "learning_rate": 9.955208440031586e-06, + "loss": 0.7479, + "step": 679 + }, + { + "epoch": 0.07, + "grad_norm": 2.5164721040624083, + "learning_rate": 9.954980546272892e-06, + "loss": 0.7528, + "step": 680 + }, + { + "epoch": 0.07, + "grad_norm": 2.349312862665128, + "learning_rate": 9.95475207685888e-06, + "loss": 0.7895, + "step": 681 + }, + { + "epoch": 0.07, + "grad_norm": 3.538231437183247, + "learning_rate": 9.954523031816096e-06, + "loss": 0.7881, + "step": 682 + }, + { + "epoch": 0.07, + "grad_norm": 2.2748356407545756, + "learning_rate": 9.95429341117115e-06, + "loss": 0.7373, + "step": 683 + }, + { + "epoch": 0.07, + "grad_norm": 2.690864131773263, + "learning_rate": 9.954063214950715e-06, + "loss": 0.7729, + "step": 684 + }, + { + "epoch": 0.07, + "grad_norm": 2.441689564432529, + "learning_rate": 9.953832443181536e-06, + "loss": 0.6846, + "step": 685 + }, + { + "epoch": 0.07, + "grad_norm": 2.7063998770038253, + "learning_rate": 9.953601095890425e-06, + "loss": 0.7764, + "step": 686 + }, + { + "epoch": 0.07, + "grad_norm": 2.3897375113199755, + "learning_rate": 9.953369173104256e-06, + "loss": 0.7468, + "step": 687 + }, + { + "epoch": 0.07, + "grad_norm": 5.623372216112194, + "learning_rate": 9.953136674849978e-06, + "loss": 0.7766, + "step": 688 + }, + { + "epoch": 0.07, + "grad_norm": 2.33678410607246, + "learning_rate": 9.952903601154598e-06, + "loss": 0.7899, + "step": 689 + }, + { + "epoch": 0.07, + "grad_norm": 5.7100874309999385, + "learning_rate": 9.952669952045196e-06, + "loss": 0.6976, + "step": 690 + }, + { + "epoch": 0.07, + "grad_norm": 2.4385704663154444, + "learning_rate": 9.952435727548915e-06, + "loss": 0.7834, + "step": 691 + }, + { + "epoch": 0.07, + "grad_norm": 2.466989152583984, + "learning_rate": 9.952200927692965e-06, + "loss": 0.7762, + "step": 692 + }, + { + "epoch": 0.07, + "grad_norm": 2.4093135917267245, + "learning_rate": 9.95196555250463e-06, + "loss": 0.6769, + "step": 693 + }, + { + "epoch": 0.07, + "grad_norm": 2.3338351134830537, + "learning_rate": 9.95172960201125e-06, + "loss": 0.7501, + "step": 694 + }, + { + "epoch": 0.07, + "grad_norm": 2.3101279059282334, + "learning_rate": 9.95149307624024e-06, + "loss": 0.7034, + "step": 695 + }, + { + "epoch": 0.07, + "grad_norm": 2.2681781457320707, + "learning_rate": 9.951255975219076e-06, + "loss": 0.8163, + "step": 696 + }, + { + "epoch": 0.07, + "grad_norm": 2.79211360891463, + "learning_rate": 9.951018298975306e-06, + "loss": 0.7878, + "step": 697 + }, + { + "epoch": 0.07, + "grad_norm": 1.9232352000611326, + "learning_rate": 9.950780047536543e-06, + "loss": 0.74, + "step": 698 + }, + { + "epoch": 0.07, + "grad_norm": 2.750774856582151, + "learning_rate": 9.950541220930463e-06, + "loss": 0.7653, + "step": 699 + }, + { + "epoch": 0.07, + "grad_norm": 2.046202452399702, + "learning_rate": 9.950301819184816e-06, + "loss": 0.6924, + "step": 700 + }, + { + "epoch": 0.07, + "grad_norm": 2.5862040137998457, + "learning_rate": 9.950061842327415e-06, + "loss": 0.7198, + "step": 701 + }, + { + "epoch": 0.07, + "grad_norm": 2.9170844730186487, + "learning_rate": 9.949821290386137e-06, + "loss": 0.7769, + "step": 702 + }, + { + "epoch": 0.07, + "grad_norm": 2.638513502534882, + "learning_rate": 9.94958016338893e-06, + "loss": 0.7465, + "step": 703 + }, + { + "epoch": 0.07, + "grad_norm": 3.154543573950718, + "learning_rate": 9.949338461363807e-06, + "loss": 0.7234, + "step": 704 + }, + { + "epoch": 0.07, + "grad_norm": 2.4170777296764943, + "learning_rate": 9.949096184338849e-06, + "loss": 0.7579, + "step": 705 + }, + { + "epoch": 0.07, + "grad_norm": 3.1649853433830772, + "learning_rate": 9.948853332342202e-06, + "loss": 0.6416, + "step": 706 + }, + { + "epoch": 0.07, + "grad_norm": 2.2732580001069485, + "learning_rate": 9.948609905402082e-06, + "loss": 0.7448, + "step": 707 + }, + { + "epoch": 0.07, + "grad_norm": 3.0109624448260335, + "learning_rate": 9.948365903546766e-06, + "loss": 0.8066, + "step": 708 + }, + { + "epoch": 0.07, + "grad_norm": 3.5719103836155117, + "learning_rate": 9.948121326804604e-06, + "loss": 0.6297, + "step": 709 + }, + { + "epoch": 0.07, + "grad_norm": 1.6740846319075147, + "learning_rate": 9.947876175204013e-06, + "loss": 0.6353, + "step": 710 + }, + { + "epoch": 0.07, + "grad_norm": 2.8270316407346163, + "learning_rate": 9.947630448773468e-06, + "loss": 0.7751, + "step": 711 + }, + { + "epoch": 0.07, + "grad_norm": 2.8283132421827895, + "learning_rate": 9.94738414754152e-06, + "loss": 0.8027, + "step": 712 + }, + { + "epoch": 0.08, + "grad_norm": 2.332076932637489, + "learning_rate": 9.947137271536784e-06, + "loss": 0.6781, + "step": 713 + }, + { + "epoch": 0.08, + "grad_norm": 2.3484004490599073, + "learning_rate": 9.94688982078794e-06, + "loss": 0.6904, + "step": 714 + }, + { + "epoch": 0.08, + "grad_norm": 2.1432128787457025, + "learning_rate": 9.946641795323737e-06, + "loss": 0.7361, + "step": 715 + }, + { + "epoch": 0.08, + "grad_norm": 1.6418533463624745, + "learning_rate": 9.946393195172987e-06, + "loss": 0.63, + "step": 716 + }, + { + "epoch": 0.08, + "grad_norm": 2.2838183539614234, + "learning_rate": 9.946144020364576e-06, + "loss": 0.7464, + "step": 717 + }, + { + "epoch": 0.08, + "grad_norm": 2.955408112768042, + "learning_rate": 9.945894270927452e-06, + "loss": 0.7333, + "step": 718 + }, + { + "epoch": 0.08, + "grad_norm": 2.565202439751648, + "learning_rate": 9.945643946890628e-06, + "loss": 0.6788, + "step": 719 + }, + { + "epoch": 0.08, + "grad_norm": 1.987883523633849, + "learning_rate": 9.945393048283186e-06, + "loss": 0.7369, + "step": 720 + }, + { + "epoch": 0.08, + "grad_norm": 2.305645743958634, + "learning_rate": 9.945141575134275e-06, + "loss": 0.8795, + "step": 721 + }, + { + "epoch": 0.08, + "grad_norm": 4.342801724950221, + "learning_rate": 9.944889527473112e-06, + "loss": 0.7749, + "step": 722 + }, + { + "epoch": 0.08, + "grad_norm": 2.6867592587630953, + "learning_rate": 9.944636905328977e-06, + "loss": 0.7122, + "step": 723 + }, + { + "epoch": 0.08, + "grad_norm": 2.7963278085386087, + "learning_rate": 9.94438370873122e-06, + "loss": 0.7286, + "step": 724 + }, + { + "epoch": 0.08, + "grad_norm": 3.0500060514327645, + "learning_rate": 9.944129937709255e-06, + "loss": 0.8238, + "step": 725 + }, + { + "epoch": 0.08, + "grad_norm": 2.518747853267898, + "learning_rate": 9.943875592292569e-06, + "loss": 0.7468, + "step": 726 + }, + { + "epoch": 0.08, + "grad_norm": 2.3307749408071023, + "learning_rate": 9.943620672510706e-06, + "loss": 0.784, + "step": 727 + }, + { + "epoch": 0.08, + "grad_norm": 2.2927298280547608, + "learning_rate": 9.943365178393283e-06, + "loss": 0.6643, + "step": 728 + }, + { + "epoch": 0.08, + "grad_norm": 5.413030343240209, + "learning_rate": 9.943109109969985e-06, + "loss": 0.7718, + "step": 729 + }, + { + "epoch": 0.08, + "grad_norm": 1.985911781478991, + "learning_rate": 9.94285246727056e-06, + "loss": 0.7437, + "step": 730 + }, + { + "epoch": 0.08, + "grad_norm": 2.8515953816737416, + "learning_rate": 9.942595250324823e-06, + "loss": 0.7, + "step": 731 + }, + { + "epoch": 0.08, + "grad_norm": 2.567107917615768, + "learning_rate": 9.942337459162657e-06, + "loss": 0.6874, + "step": 732 + }, + { + "epoch": 0.08, + "grad_norm": 2.389536459691384, + "learning_rate": 9.942079093814012e-06, + "loss": 0.7786, + "step": 733 + }, + { + "epoch": 0.08, + "grad_norm": 2.456329041193247, + "learning_rate": 9.941820154308905e-06, + "loss": 0.6675, + "step": 734 + }, + { + "epoch": 0.08, + "grad_norm": 2.2902527313975454, + "learning_rate": 9.941560640677417e-06, + "loss": 0.7431, + "step": 735 + }, + { + "epoch": 0.08, + "grad_norm": 1.9299279456459286, + "learning_rate": 9.941300552949697e-06, + "loss": 0.7444, + "step": 736 + }, + { + "epoch": 0.08, + "grad_norm": 2.894177677033512, + "learning_rate": 9.941039891155964e-06, + "loss": 0.6389, + "step": 737 + }, + { + "epoch": 0.08, + "grad_norm": 2.1178139348238716, + "learning_rate": 9.940778655326499e-06, + "loss": 0.7812, + "step": 738 + }, + { + "epoch": 0.08, + "grad_norm": 2.6341684151897957, + "learning_rate": 9.940516845491653e-06, + "loss": 0.7911, + "step": 739 + }, + { + "epoch": 0.08, + "grad_norm": 2.3572249944985764, + "learning_rate": 9.940254461681841e-06, + "loss": 0.7095, + "step": 740 + }, + { + "epoch": 0.08, + "grad_norm": 2.392416707308911, + "learning_rate": 9.939991503927548e-06, + "loss": 0.7532, + "step": 741 + }, + { + "epoch": 0.08, + "grad_norm": 2.892105469013813, + "learning_rate": 9.939727972259321e-06, + "loss": 0.8392, + "step": 742 + }, + { + "epoch": 0.08, + "grad_norm": 2.5792946557866454, + "learning_rate": 9.939463866707777e-06, + "loss": 0.7718, + "step": 743 + }, + { + "epoch": 0.08, + "grad_norm": 2.382428104409083, + "learning_rate": 9.939199187303598e-06, + "loss": 0.6834, + "step": 744 + }, + { + "epoch": 0.08, + "grad_norm": 2.963009035348591, + "learning_rate": 9.938933934077539e-06, + "loss": 0.749, + "step": 745 + }, + { + "epoch": 0.08, + "grad_norm": 2.3169431674281125, + "learning_rate": 9.93866810706041e-06, + "loss": 0.7598, + "step": 746 + }, + { + "epoch": 0.08, + "grad_norm": 2.269068621083456, + "learning_rate": 9.938401706283096e-06, + "loss": 0.6919, + "step": 747 + }, + { + "epoch": 0.08, + "grad_norm": 4.520110943561689, + "learning_rate": 9.93813473177655e-06, + "loss": 0.7574, + "step": 748 + }, + { + "epoch": 0.08, + "grad_norm": 2.6705782110520553, + "learning_rate": 9.937867183571784e-06, + "loss": 0.6852, + "step": 749 + }, + { + "epoch": 0.08, + "grad_norm": 2.394466270287015, + "learning_rate": 9.93759906169988e-06, + "loss": 0.8214, + "step": 750 + }, + { + "epoch": 0.08, + "grad_norm": 2.4103257753815956, + "learning_rate": 9.937330366191994e-06, + "loss": 0.7784, + "step": 751 + }, + { + "epoch": 0.08, + "grad_norm": 1.2399480859042666, + "learning_rate": 9.937061097079337e-06, + "loss": 0.6601, + "step": 752 + }, + { + "epoch": 0.08, + "grad_norm": 3.0481316936580254, + "learning_rate": 9.936791254393193e-06, + "loss": 0.7356, + "step": 753 + }, + { + "epoch": 0.08, + "grad_norm": 2.437583015333378, + "learning_rate": 9.936520838164912e-06, + "loss": 0.7158, + "step": 754 + }, + { + "epoch": 0.08, + "grad_norm": 2.707441716431831, + "learning_rate": 9.93624984842591e-06, + "loss": 0.7215, + "step": 755 + }, + { + "epoch": 0.08, + "grad_norm": 4.532030304474135, + "learning_rate": 9.93597828520767e-06, + "loss": 0.753, + "step": 756 + }, + { + "epoch": 0.08, + "grad_norm": 2.7461971545571786, + "learning_rate": 9.935706148541742e-06, + "loss": 0.7097, + "step": 757 + }, + { + "epoch": 0.08, + "grad_norm": 2.3627790545916216, + "learning_rate": 9.93543343845974e-06, + "loss": 0.8169, + "step": 758 + }, + { + "epoch": 0.08, + "grad_norm": 2.6693014549496814, + "learning_rate": 9.93516015499335e-06, + "loss": 0.8296, + "step": 759 + }, + { + "epoch": 0.08, + "grad_norm": 3.7105561050330222, + "learning_rate": 9.934886298174317e-06, + "loss": 0.7099, + "step": 760 + }, + { + "epoch": 0.08, + "grad_norm": 2.720683329507592, + "learning_rate": 9.93461186803446e-06, + "loss": 0.7632, + "step": 761 + }, + { + "epoch": 0.08, + "grad_norm": 2.588291193800891, + "learning_rate": 9.934336864605663e-06, + "loss": 0.7789, + "step": 762 + }, + { + "epoch": 0.08, + "grad_norm": 2.5585071173949863, + "learning_rate": 9.934061287919869e-06, + "loss": 0.6833, + "step": 763 + }, + { + "epoch": 0.08, + "grad_norm": 2.2895773779403195, + "learning_rate": 9.9337851380091e-06, + "loss": 0.7367, + "step": 764 + }, + { + "epoch": 0.08, + "grad_norm": 3.0219975702999697, + "learning_rate": 9.933508414905434e-06, + "loss": 0.7717, + "step": 765 + }, + { + "epoch": 0.08, + "grad_norm": 2.6270513907040094, + "learning_rate": 9.933231118641025e-06, + "loss": 0.6998, + "step": 766 + }, + { + "epoch": 0.08, + "grad_norm": 2.620673953552405, + "learning_rate": 9.932953249248082e-06, + "loss": 0.833, + "step": 767 + }, + { + "epoch": 0.08, + "grad_norm": 2.5153078631067154, + "learning_rate": 9.93267480675889e-06, + "loss": 0.7723, + "step": 768 + }, + { + "epoch": 0.08, + "grad_norm": 2.951354250369123, + "learning_rate": 9.9323957912058e-06, + "loss": 0.7502, + "step": 769 + }, + { + "epoch": 0.08, + "grad_norm": 1.4394149168178134, + "learning_rate": 9.932116202621224e-06, + "loss": 0.6457, + "step": 770 + }, + { + "epoch": 0.08, + "grad_norm": 2.1095938889246026, + "learning_rate": 9.931836041037644e-06, + "loss": 0.7061, + "step": 771 + }, + { + "epoch": 0.08, + "grad_norm": 2.6819242824234957, + "learning_rate": 9.931555306487612e-06, + "loss": 0.7691, + "step": 772 + }, + { + "epoch": 0.08, + "grad_norm": 2.2043376051832424, + "learning_rate": 9.931273999003738e-06, + "loss": 0.6946, + "step": 773 + }, + { + "epoch": 0.08, + "grad_norm": 2.6902924884956425, + "learning_rate": 9.930992118618706e-06, + "loss": 0.6695, + "step": 774 + }, + { + "epoch": 0.08, + "grad_norm": 2.8989924651843326, + "learning_rate": 9.930709665365264e-06, + "loss": 0.7694, + "step": 775 + }, + { + "epoch": 0.08, + "grad_norm": 2.224303330241575, + "learning_rate": 9.930426639276225e-06, + "loss": 0.7487, + "step": 776 + }, + { + "epoch": 0.08, + "grad_norm": 2.6983836303838826, + "learning_rate": 9.930143040384472e-06, + "loss": 0.7523, + "step": 777 + }, + { + "epoch": 0.08, + "grad_norm": 2.3780230591194065, + "learning_rate": 9.929858868722954e-06, + "loss": 0.7692, + "step": 778 + }, + { + "epoch": 0.08, + "grad_norm": 2.4634887523343942, + "learning_rate": 9.929574124324682e-06, + "loss": 0.8036, + "step": 779 + }, + { + "epoch": 0.08, + "grad_norm": 2.2694290012543052, + "learning_rate": 9.929288807222738e-06, + "loss": 0.7424, + "step": 780 + }, + { + "epoch": 0.08, + "grad_norm": 2.6592362586407394, + "learning_rate": 9.92900291745027e-06, + "loss": 0.6375, + "step": 781 + }, + { + "epoch": 0.08, + "grad_norm": 3.495897539805403, + "learning_rate": 9.92871645504049e-06, + "loss": 0.7854, + "step": 782 + }, + { + "epoch": 0.08, + "grad_norm": 2.0961963685516727, + "learning_rate": 9.928429420026682e-06, + "loss": 0.7634, + "step": 783 + }, + { + "epoch": 0.08, + "grad_norm": 2.8509364984526253, + "learning_rate": 9.92814181244219e-06, + "loss": 0.6458, + "step": 784 + }, + { + "epoch": 0.08, + "grad_norm": 2.8241576523218956, + "learning_rate": 9.927853632320427e-06, + "loss": 0.7804, + "step": 785 + }, + { + "epoch": 0.08, + "grad_norm": 3.0001225664364055, + "learning_rate": 9.927564879694874e-06, + "loss": 0.7229, + "step": 786 + }, + { + "epoch": 0.08, + "grad_norm": 3.4475528816764944, + "learning_rate": 9.927275554599078e-06, + "loss": 0.8362, + "step": 787 + }, + { + "epoch": 0.08, + "grad_norm": 2.2308232356508526, + "learning_rate": 9.926985657066653e-06, + "loss": 0.7114, + "step": 788 + }, + { + "epoch": 0.08, + "grad_norm": 2.341682432680137, + "learning_rate": 9.926695187131275e-06, + "loss": 0.7783, + "step": 789 + }, + { + "epoch": 0.08, + "grad_norm": 2.6052199283711874, + "learning_rate": 9.92640414482669e-06, + "loss": 0.8333, + "step": 790 + }, + { + "epoch": 0.08, + "grad_norm": 2.8370875665068858, + "learning_rate": 9.926112530186715e-06, + "loss": 0.7107, + "step": 791 + }, + { + "epoch": 0.08, + "grad_norm": 2.5275739897207865, + "learning_rate": 9.925820343245225e-06, + "loss": 0.7965, + "step": 792 + }, + { + "epoch": 0.08, + "grad_norm": 2.241333301690011, + "learning_rate": 9.925527584036167e-06, + "loss": 0.8081, + "step": 793 + }, + { + "epoch": 0.08, + "grad_norm": 2.1980959634447164, + "learning_rate": 9.925234252593554e-06, + "loss": 0.7231, + "step": 794 + }, + { + "epoch": 0.08, + "grad_norm": 2.5031300722496965, + "learning_rate": 9.92494034895146e-06, + "loss": 0.6994, + "step": 795 + }, + { + "epoch": 0.08, + "grad_norm": 2.1430334991183604, + "learning_rate": 9.924645873144035e-06, + "loss": 0.6908, + "step": 796 + }, + { + "epoch": 0.08, + "grad_norm": 2.350620970434163, + "learning_rate": 9.924350825205487e-06, + "loss": 0.8007, + "step": 797 + }, + { + "epoch": 0.08, + "grad_norm": 2.579066826686899, + "learning_rate": 9.924055205170095e-06, + "loss": 0.8572, + "step": 798 + }, + { + "epoch": 0.08, + "grad_norm": 2.9358520387744473, + "learning_rate": 9.923759013072205e-06, + "loss": 0.7289, + "step": 799 + }, + { + "epoch": 0.08, + "grad_norm": 1.4595413230957774, + "learning_rate": 9.923462248946224e-06, + "loss": 0.6404, + "step": 800 + }, + { + "epoch": 0.08, + "grad_norm": 2.25406590521839, + "learning_rate": 9.923164912826631e-06, + "loss": 0.7915, + "step": 801 + }, + { + "epoch": 0.08, + "grad_norm": 2.5320545260007727, + "learning_rate": 9.922867004747971e-06, + "loss": 0.7355, + "step": 802 + }, + { + "epoch": 0.08, + "grad_norm": 1.0795033707901633, + "learning_rate": 9.922568524744854e-06, + "loss": 0.6239, + "step": 803 + }, + { + "epoch": 0.08, + "grad_norm": 5.1305263819984, + "learning_rate": 9.922269472851953e-06, + "loss": 0.7992, + "step": 804 + }, + { + "epoch": 0.08, + "grad_norm": 2.4229395578571924, + "learning_rate": 9.921969849104015e-06, + "loss": 0.686, + "step": 805 + }, + { + "epoch": 0.08, + "grad_norm": 2.925961640710328, + "learning_rate": 9.921669653535848e-06, + "loss": 0.7174, + "step": 806 + }, + { + "epoch": 0.08, + "grad_norm": 2.77414781814395, + "learning_rate": 9.921368886182328e-06, + "loss": 0.7249, + "step": 807 + }, + { + "epoch": 0.09, + "grad_norm": 2.732166312681211, + "learning_rate": 9.921067547078396e-06, + "loss": 0.7682, + "step": 808 + }, + { + "epoch": 0.09, + "grad_norm": 2.6581746298192357, + "learning_rate": 9.920765636259062e-06, + "loss": 0.7427, + "step": 809 + }, + { + "epoch": 0.09, + "grad_norm": 3.420618373750652, + "learning_rate": 9.9204631537594e-06, + "loss": 0.8256, + "step": 810 + }, + { + "epoch": 0.09, + "grad_norm": 4.452433300784335, + "learning_rate": 9.920160099614553e-06, + "loss": 0.7742, + "step": 811 + }, + { + "epoch": 0.09, + "grad_norm": 2.347854862834246, + "learning_rate": 9.91985647385973e-06, + "loss": 0.6981, + "step": 812 + }, + { + "epoch": 0.09, + "grad_norm": 2.335973187712152, + "learning_rate": 9.919552276530202e-06, + "loss": 0.6822, + "step": 813 + }, + { + "epoch": 0.09, + "grad_norm": 3.0899513351336645, + "learning_rate": 9.919247507661313e-06, + "loss": 0.6554, + "step": 814 + }, + { + "epoch": 0.09, + "grad_norm": 2.4579977944848657, + "learning_rate": 9.918942167288467e-06, + "loss": 0.7347, + "step": 815 + }, + { + "epoch": 0.09, + "grad_norm": 2.2321378141123316, + "learning_rate": 9.918636255447141e-06, + "loss": 0.7083, + "step": 816 + }, + { + "epoch": 0.09, + "grad_norm": 2.337678644880633, + "learning_rate": 9.918329772172872e-06, + "loss": 0.7353, + "step": 817 + }, + { + "epoch": 0.09, + "grad_norm": 2.484233611654691, + "learning_rate": 9.918022717501268e-06, + "loss": 0.7626, + "step": 818 + }, + { + "epoch": 0.09, + "grad_norm": 2.6233513206248698, + "learning_rate": 9.917715091467999e-06, + "loss": 0.7422, + "step": 819 + }, + { + "epoch": 0.09, + "grad_norm": 2.2460279378058137, + "learning_rate": 9.91740689410881e-06, + "loss": 0.8034, + "step": 820 + }, + { + "epoch": 0.09, + "grad_norm": 3.014666827028448, + "learning_rate": 9.917098125459501e-06, + "loss": 0.7201, + "step": 821 + }, + { + "epoch": 0.09, + "grad_norm": 2.8852937872527904, + "learning_rate": 9.916788785555945e-06, + "loss": 0.702, + "step": 822 + }, + { + "epoch": 0.09, + "grad_norm": 2.468109581522842, + "learning_rate": 9.91647887443408e-06, + "loss": 0.7292, + "step": 823 + }, + { + "epoch": 0.09, + "grad_norm": 2.0137798678101095, + "learning_rate": 9.916168392129914e-06, + "loss": 0.7081, + "step": 824 + }, + { + "epoch": 0.09, + "grad_norm": 2.1356848172886997, + "learning_rate": 9.915857338679515e-06, + "loss": 0.7477, + "step": 825 + }, + { + "epoch": 0.09, + "grad_norm": 2.146941252431847, + "learning_rate": 9.91554571411902e-06, + "loss": 0.7231, + "step": 826 + }, + { + "epoch": 0.09, + "grad_norm": 1.967160075670586, + "learning_rate": 9.915233518484633e-06, + "loss": 0.7984, + "step": 827 + }, + { + "epoch": 0.09, + "grad_norm": 2.0751156185412385, + "learning_rate": 9.914920751812626e-06, + "loss": 0.8102, + "step": 828 + }, + { + "epoch": 0.09, + "grad_norm": 2.3472721595318586, + "learning_rate": 9.914607414139332e-06, + "loss": 0.7313, + "step": 829 + }, + { + "epoch": 0.09, + "grad_norm": 2.118986957021893, + "learning_rate": 9.914293505501155e-06, + "loss": 0.8668, + "step": 830 + }, + { + "epoch": 0.09, + "grad_norm": 2.394587519870367, + "learning_rate": 9.913979025934566e-06, + "loss": 0.7498, + "step": 831 + }, + { + "epoch": 0.09, + "grad_norm": 3.0206794969562405, + "learning_rate": 9.913663975476099e-06, + "loss": 0.7633, + "step": 832 + }, + { + "epoch": 0.09, + "grad_norm": 2.096598722449407, + "learning_rate": 9.913348354162353e-06, + "loss": 0.7422, + "step": 833 + }, + { + "epoch": 0.09, + "grad_norm": 2.4996308686533895, + "learning_rate": 9.913032162029999e-06, + "loss": 0.7345, + "step": 834 + }, + { + "epoch": 0.09, + "grad_norm": 2.692833005695984, + "learning_rate": 9.91271539911577e-06, + "loss": 0.783, + "step": 835 + }, + { + "epoch": 0.09, + "grad_norm": 3.1228836691300037, + "learning_rate": 9.91239806545647e-06, + "loss": 0.7425, + "step": 836 + }, + { + "epoch": 0.09, + "grad_norm": 2.3986505144559427, + "learning_rate": 9.91208016108896e-06, + "loss": 0.7267, + "step": 837 + }, + { + "epoch": 0.09, + "grad_norm": 2.7101167395338273, + "learning_rate": 9.911761686050177e-06, + "loss": 0.7581, + "step": 838 + }, + { + "epoch": 0.09, + "grad_norm": 2.5806365307450667, + "learning_rate": 9.91144264037712e-06, + "loss": 0.7847, + "step": 839 + }, + { + "epoch": 0.09, + "grad_norm": 1.5964088279350312, + "learning_rate": 9.911123024106854e-06, + "loss": 0.6742, + "step": 840 + }, + { + "epoch": 0.09, + "grad_norm": 2.0458536491050143, + "learning_rate": 9.910802837276514e-06, + "loss": 0.6898, + "step": 841 + }, + { + "epoch": 0.09, + "grad_norm": 1.9738131081216486, + "learning_rate": 9.910482079923293e-06, + "loss": 0.7097, + "step": 842 + }, + { + "epoch": 0.09, + "grad_norm": 4.033940638900864, + "learning_rate": 9.910160752084461e-06, + "loss": 0.6891, + "step": 843 + }, + { + "epoch": 0.09, + "grad_norm": 2.493418453706031, + "learning_rate": 9.909838853797347e-06, + "loss": 0.8386, + "step": 844 + }, + { + "epoch": 0.09, + "grad_norm": 2.3609786228834397, + "learning_rate": 9.909516385099346e-06, + "loss": 0.6447, + "step": 845 + }, + { + "epoch": 0.09, + "grad_norm": 1.8826827313822074, + "learning_rate": 9.909193346027923e-06, + "loss": 0.7634, + "step": 846 + }, + { + "epoch": 0.09, + "grad_norm": 2.192146232328817, + "learning_rate": 9.90886973662061e-06, + "loss": 0.6501, + "step": 847 + }, + { + "epoch": 0.09, + "grad_norm": 4.924566828839797, + "learning_rate": 9.908545556915e-06, + "loss": 0.7382, + "step": 848 + }, + { + "epoch": 0.09, + "grad_norm": 2.124722725213922, + "learning_rate": 9.908220806948755e-06, + "loss": 0.7342, + "step": 849 + }, + { + "epoch": 0.09, + "grad_norm": 3.149040160809967, + "learning_rate": 9.90789548675961e-06, + "loss": 0.769, + "step": 850 + }, + { + "epoch": 0.09, + "grad_norm": 2.016612347468723, + "learning_rate": 9.90756959638535e-06, + "loss": 0.7533, + "step": 851 + }, + { + "epoch": 0.09, + "grad_norm": 2.21890626690869, + "learning_rate": 9.90724313586384e-06, + "loss": 0.7183, + "step": 852 + }, + { + "epoch": 0.09, + "grad_norm": 2.1739377588424182, + "learning_rate": 9.90691610523301e-06, + "loss": 0.725, + "step": 853 + }, + { + "epoch": 0.09, + "grad_norm": 2.4308506352472565, + "learning_rate": 9.906588504530852e-06, + "loss": 0.7721, + "step": 854 + }, + { + "epoch": 0.09, + "grad_norm": 2.039428986286277, + "learning_rate": 9.906260333795423e-06, + "loss": 0.7862, + "step": 855 + }, + { + "epoch": 0.09, + "grad_norm": 2.072889277401122, + "learning_rate": 9.905931593064852e-06, + "loss": 0.7057, + "step": 856 + }, + { + "epoch": 0.09, + "grad_norm": 2.5603746952912583, + "learning_rate": 9.905602282377331e-06, + "loss": 0.7845, + "step": 857 + }, + { + "epoch": 0.09, + "grad_norm": 2.2094754050609255, + "learning_rate": 9.905272401771115e-06, + "loss": 0.6726, + "step": 858 + }, + { + "epoch": 0.09, + "grad_norm": 2.010211257078767, + "learning_rate": 9.904941951284535e-06, + "loss": 0.82, + "step": 859 + }, + { + "epoch": 0.09, + "grad_norm": 2.4294590301163907, + "learning_rate": 9.904610930955975e-06, + "loss": 0.7225, + "step": 860 + }, + { + "epoch": 0.09, + "grad_norm": 2.4545417076156157, + "learning_rate": 9.904279340823895e-06, + "loss": 0.7379, + "step": 861 + }, + { + "epoch": 0.09, + "grad_norm": 2.0489546331766704, + "learning_rate": 9.903947180926819e-06, + "loss": 0.7939, + "step": 862 + }, + { + "epoch": 0.09, + "grad_norm": 4.328322416011042, + "learning_rate": 9.903614451303335e-06, + "loss": 0.7423, + "step": 863 + }, + { + "epoch": 0.09, + "grad_norm": 2.673446796533317, + "learning_rate": 9.903281151992097e-06, + "loss": 0.684, + "step": 864 + }, + { + "epoch": 0.09, + "grad_norm": 2.6240021544906904, + "learning_rate": 9.902947283031833e-06, + "loss": 0.6573, + "step": 865 + }, + { + "epoch": 0.09, + "grad_norm": 2.4194535688154097, + "learning_rate": 9.902612844461322e-06, + "loss": 0.7671, + "step": 866 + }, + { + "epoch": 0.09, + "grad_norm": 2.600458794788045, + "learning_rate": 9.902277836319424e-06, + "loss": 0.686, + "step": 867 + }, + { + "epoch": 0.09, + "grad_norm": 2.7708332752714226, + "learning_rate": 9.90194225864506e-06, + "loss": 0.7713, + "step": 868 + }, + { + "epoch": 0.09, + "grad_norm": 2.345954440796902, + "learning_rate": 9.901606111477213e-06, + "loss": 0.7127, + "step": 869 + }, + { + "epoch": 0.09, + "grad_norm": 2.4380277637214958, + "learning_rate": 9.901269394854938e-06, + "loss": 0.7432, + "step": 870 + }, + { + "epoch": 0.09, + "grad_norm": 2.535510929005561, + "learning_rate": 9.900932108817352e-06, + "loss": 0.735, + "step": 871 + }, + { + "epoch": 0.09, + "grad_norm": 3.0039289042478337, + "learning_rate": 9.900594253403642e-06, + "loss": 0.76, + "step": 872 + }, + { + "epoch": 0.09, + "grad_norm": 2.350300158880335, + "learning_rate": 9.900255828653057e-06, + "loss": 0.6084, + "step": 873 + }, + { + "epoch": 0.09, + "grad_norm": 2.4958117500634938, + "learning_rate": 9.899916834604914e-06, + "loss": 0.7951, + "step": 874 + }, + { + "epoch": 0.09, + "grad_norm": 2.4483497475118843, + "learning_rate": 9.899577271298596e-06, + "loss": 0.7217, + "step": 875 + }, + { + "epoch": 0.09, + "grad_norm": 1.9481072329531595, + "learning_rate": 9.89923713877356e-06, + "loss": 0.6245, + "step": 876 + }, + { + "epoch": 0.09, + "grad_norm": 2.765051932549656, + "learning_rate": 9.89889643706931e-06, + "loss": 0.6723, + "step": 877 + }, + { + "epoch": 0.09, + "grad_norm": 2.739221117936455, + "learning_rate": 9.898555166225434e-06, + "loss": 0.7703, + "step": 878 + }, + { + "epoch": 0.09, + "grad_norm": 2.1418498868809768, + "learning_rate": 9.89821332628158e-06, + "loss": 0.7298, + "step": 879 + }, + { + "epoch": 0.09, + "grad_norm": 2.2036351653598367, + "learning_rate": 9.897870917277461e-06, + "loss": 0.8093, + "step": 880 + }, + { + "epoch": 0.09, + "grad_norm": 2.4323072705596505, + "learning_rate": 9.897527939252858e-06, + "loss": 0.7399, + "step": 881 + }, + { + "epoch": 0.09, + "grad_norm": 2.8657728288025477, + "learning_rate": 9.897184392247614e-06, + "loss": 0.7356, + "step": 882 + }, + { + "epoch": 0.09, + "grad_norm": 2.5414139234529642, + "learning_rate": 9.896840276301645e-06, + "loss": 0.6747, + "step": 883 + }, + { + "epoch": 0.09, + "grad_norm": 2.2994147096204327, + "learning_rate": 9.896495591454929e-06, + "loss": 0.7335, + "step": 884 + }, + { + "epoch": 0.09, + "grad_norm": 1.9851136351096752, + "learning_rate": 9.896150337747508e-06, + "loss": 0.7839, + "step": 885 + }, + { + "epoch": 0.09, + "grad_norm": 2.475304204640165, + "learning_rate": 9.895804515219495e-06, + "loss": 0.7442, + "step": 886 + }, + { + "epoch": 0.09, + "grad_norm": 3.0051133893066178, + "learning_rate": 9.895458123911066e-06, + "loss": 0.819, + "step": 887 + }, + { + "epoch": 0.09, + "grad_norm": 3.01882317139601, + "learning_rate": 9.895111163862464e-06, + "loss": 0.743, + "step": 888 + }, + { + "epoch": 0.09, + "grad_norm": 1.4869064278045154, + "learning_rate": 9.894763635113995e-06, + "loss": 0.673, + "step": 889 + }, + { + "epoch": 0.09, + "grad_norm": 2.0423193152498316, + "learning_rate": 9.894415537706036e-06, + "loss": 0.7938, + "step": 890 + }, + { + "epoch": 0.09, + "grad_norm": 2.3877929769561415, + "learning_rate": 9.89406687167903e-06, + "loss": 0.8399, + "step": 891 + }, + { + "epoch": 0.09, + "grad_norm": 2.3118433378041283, + "learning_rate": 9.893717637073483e-06, + "loss": 0.7365, + "step": 892 + }, + { + "epoch": 0.09, + "grad_norm": 3.1231159883183817, + "learning_rate": 9.893367833929965e-06, + "loss": 0.6975, + "step": 893 + }, + { + "epoch": 0.09, + "grad_norm": 2.946732175793273, + "learning_rate": 9.893017462289119e-06, + "loss": 0.7355, + "step": 894 + }, + { + "epoch": 0.09, + "grad_norm": 4.3717939300188275, + "learning_rate": 9.892666522191648e-06, + "loss": 0.6766, + "step": 895 + }, + { + "epoch": 0.09, + "grad_norm": 3.0279846422346446, + "learning_rate": 9.892315013678323e-06, + "loss": 0.7597, + "step": 896 + }, + { + "epoch": 0.09, + "grad_norm": 2.55744056624668, + "learning_rate": 9.891962936789983e-06, + "loss": 0.7977, + "step": 897 + }, + { + "epoch": 0.09, + "grad_norm": 2.569713950624268, + "learning_rate": 9.891610291567529e-06, + "loss": 0.7722, + "step": 898 + }, + { + "epoch": 0.09, + "grad_norm": 2.8251737732812003, + "learning_rate": 9.891257078051932e-06, + "loss": 0.8538, + "step": 899 + }, + { + "epoch": 0.09, + "grad_norm": 3.940064605870108, + "learning_rate": 9.890903296284228e-06, + "loss": 0.6861, + "step": 900 + }, + { + "epoch": 0.09, + "grad_norm": 2.3962938305889727, + "learning_rate": 9.890548946305516e-06, + "loss": 0.7367, + "step": 901 + }, + { + "epoch": 0.09, + "grad_norm": 2.6270790503951877, + "learning_rate": 9.890194028156965e-06, + "loss": 0.6927, + "step": 902 + }, + { + "epoch": 0.1, + "grad_norm": 2.2464932261338486, + "learning_rate": 9.889838541879808e-06, + "loss": 0.6671, + "step": 903 + }, + { + "epoch": 0.1, + "grad_norm": 2.2068048828956184, + "learning_rate": 9.889482487515344e-06, + "loss": 0.6574, + "step": 904 + }, + { + "epoch": 0.1, + "grad_norm": 4.131286705587521, + "learning_rate": 9.889125865104939e-06, + "loss": 0.6844, + "step": 905 + }, + { + "epoch": 0.1, + "grad_norm": 2.4854115277088384, + "learning_rate": 9.888768674690023e-06, + "loss": 0.7562, + "step": 906 + }, + { + "epoch": 0.1, + "grad_norm": 2.430105603029439, + "learning_rate": 9.888410916312096e-06, + "loss": 0.8826, + "step": 907 + }, + { + "epoch": 0.1, + "grad_norm": 2.3307764886099447, + "learning_rate": 9.888052590012719e-06, + "loss": 0.6994, + "step": 908 + }, + { + "epoch": 0.1, + "grad_norm": 2.399232411769506, + "learning_rate": 9.887693695833522e-06, + "loss": 0.7844, + "step": 909 + }, + { + "epoch": 0.1, + "grad_norm": 2.4841741299380127, + "learning_rate": 9.887334233816199e-06, + "loss": 0.7191, + "step": 910 + }, + { + "epoch": 0.1, + "grad_norm": 2.231644133692005, + "learning_rate": 9.886974204002514e-06, + "loss": 0.7236, + "step": 911 + }, + { + "epoch": 0.1, + "grad_norm": 2.589193939890633, + "learning_rate": 9.886613606434294e-06, + "loss": 0.8006, + "step": 912 + }, + { + "epoch": 0.1, + "grad_norm": 2.8815065090619294, + "learning_rate": 9.886252441153428e-06, + "loss": 0.762, + "step": 913 + }, + { + "epoch": 0.1, + "grad_norm": 2.458751138818673, + "learning_rate": 9.885890708201881e-06, + "loss": 0.748, + "step": 914 + }, + { + "epoch": 0.1, + "grad_norm": 2.729352069908257, + "learning_rate": 9.885528407621674e-06, + "loss": 0.7319, + "step": 915 + }, + { + "epoch": 0.1, + "grad_norm": 2.7456542817248035, + "learning_rate": 9.885165539454898e-06, + "loss": 0.7691, + "step": 916 + }, + { + "epoch": 0.1, + "grad_norm": 2.821954609815299, + "learning_rate": 9.884802103743712e-06, + "loss": 0.724, + "step": 917 + }, + { + "epoch": 0.1, + "grad_norm": 2.507495458671125, + "learning_rate": 9.88443810053034e-06, + "loss": 0.7557, + "step": 918 + }, + { + "epoch": 0.1, + "grad_norm": 2.813557690867528, + "learning_rate": 9.884073529857066e-06, + "loss": 0.7378, + "step": 919 + }, + { + "epoch": 0.1, + "grad_norm": 1.2109392193308994, + "learning_rate": 9.883708391766248e-06, + "loss": 0.655, + "step": 920 + }, + { + "epoch": 0.1, + "grad_norm": 2.7797266133676817, + "learning_rate": 9.88334268630031e-06, + "loss": 0.7568, + "step": 921 + }, + { + "epoch": 0.1, + "grad_norm": 2.291991455347209, + "learning_rate": 9.882976413501733e-06, + "loss": 0.6939, + "step": 922 + }, + { + "epoch": 0.1, + "grad_norm": 2.2553429818506694, + "learning_rate": 9.88260957341307e-06, + "loss": 0.8572, + "step": 923 + }, + { + "epoch": 0.1, + "grad_norm": 2.944662889076845, + "learning_rate": 9.882242166076942e-06, + "loss": 0.7171, + "step": 924 + }, + { + "epoch": 0.1, + "grad_norm": 2.6005068384270817, + "learning_rate": 9.881874191536032e-06, + "loss": 0.7304, + "step": 925 + }, + { + "epoch": 0.1, + "grad_norm": 2.466317488664055, + "learning_rate": 9.881505649833091e-06, + "loss": 0.7116, + "step": 926 + }, + { + "epoch": 0.1, + "grad_norm": 2.4452143875236243, + "learning_rate": 9.881136541010934e-06, + "loss": 0.7629, + "step": 927 + }, + { + "epoch": 0.1, + "grad_norm": 2.5102243570459595, + "learning_rate": 9.880766865112444e-06, + "loss": 0.7286, + "step": 928 + }, + { + "epoch": 0.1, + "grad_norm": 2.7791143392672875, + "learning_rate": 9.880396622180567e-06, + "loss": 0.7923, + "step": 929 + }, + { + "epoch": 0.1, + "grad_norm": 2.876441285426497, + "learning_rate": 9.880025812258322e-06, + "loss": 0.7381, + "step": 930 + }, + { + "epoch": 0.1, + "grad_norm": 3.2830140025930246, + "learning_rate": 9.879654435388781e-06, + "loss": 0.7657, + "step": 931 + }, + { + "epoch": 0.1, + "grad_norm": 2.1392167947242817, + "learning_rate": 9.879282491615096e-06, + "loss": 0.7584, + "step": 932 + }, + { + "epoch": 0.1, + "grad_norm": 2.5574554782173244, + "learning_rate": 9.878909980980475e-06, + "loss": 0.6736, + "step": 933 + }, + { + "epoch": 0.1, + "grad_norm": 2.482015182428965, + "learning_rate": 9.878536903528195e-06, + "loss": 0.7855, + "step": 934 + }, + { + "epoch": 0.1, + "grad_norm": 2.20969188023806, + "learning_rate": 9.8781632593016e-06, + "loss": 0.7142, + "step": 935 + }, + { + "epoch": 0.1, + "grad_norm": 2.6096755265839593, + "learning_rate": 9.8777890483441e-06, + "loss": 0.8059, + "step": 936 + }, + { + "epoch": 0.1, + "grad_norm": 2.6014399338081122, + "learning_rate": 9.877414270699168e-06, + "loss": 0.7378, + "step": 937 + }, + { + "epoch": 0.1, + "grad_norm": 2.1756243191451383, + "learning_rate": 9.877038926410346e-06, + "loss": 0.7268, + "step": 938 + }, + { + "epoch": 0.1, + "grad_norm": 2.196316926201555, + "learning_rate": 9.876663015521237e-06, + "loss": 0.8158, + "step": 939 + }, + { + "epoch": 0.1, + "grad_norm": 2.383983584002673, + "learning_rate": 9.876286538075519e-06, + "loss": 0.7215, + "step": 940 + }, + { + "epoch": 0.1, + "grad_norm": 2.7085992373351204, + "learning_rate": 9.875909494116925e-06, + "loss": 0.6575, + "step": 941 + }, + { + "epoch": 0.1, + "grad_norm": 2.5130773928374737, + "learning_rate": 9.875531883689262e-06, + "loss": 0.6911, + "step": 942 + }, + { + "epoch": 0.1, + "grad_norm": 3.318178727992876, + "learning_rate": 9.875153706836397e-06, + "loss": 0.788, + "step": 943 + }, + { + "epoch": 0.1, + "grad_norm": 4.286494331288682, + "learning_rate": 9.874774963602268e-06, + "loss": 0.7114, + "step": 944 + }, + { + "epoch": 0.1, + "grad_norm": 2.1591477947522186, + "learning_rate": 9.874395654030876e-06, + "loss": 0.7871, + "step": 945 + }, + { + "epoch": 0.1, + "grad_norm": 2.1547218552212177, + "learning_rate": 9.874015778166285e-06, + "loss": 0.8348, + "step": 946 + }, + { + "epoch": 0.1, + "grad_norm": 2.762373245266379, + "learning_rate": 9.873635336052633e-06, + "loss": 0.7688, + "step": 947 + }, + { + "epoch": 0.1, + "grad_norm": 2.309708677231106, + "learning_rate": 9.873254327734115e-06, + "loss": 0.747, + "step": 948 + }, + { + "epoch": 0.1, + "grad_norm": 2.053888930700921, + "learning_rate": 9.872872753254996e-06, + "loss": 0.7223, + "step": 949 + }, + { + "epoch": 0.1, + "grad_norm": 3.5008788542760363, + "learning_rate": 9.872490612659607e-06, + "loss": 0.6766, + "step": 950 + }, + { + "epoch": 0.1, + "grad_norm": 2.3863679851555673, + "learning_rate": 9.872107905992343e-06, + "loss": 0.6972, + "step": 951 + }, + { + "epoch": 0.1, + "grad_norm": 2.921615156601913, + "learning_rate": 9.871724633297666e-06, + "loss": 0.6999, + "step": 952 + }, + { + "epoch": 0.1, + "grad_norm": 3.2644813929698615, + "learning_rate": 9.871340794620103e-06, + "loss": 0.761, + "step": 953 + }, + { + "epoch": 0.1, + "grad_norm": 2.1191106722345814, + "learning_rate": 9.87095639000425e-06, + "loss": 0.7376, + "step": 954 + }, + { + "epoch": 0.1, + "grad_norm": 6.026837083096435, + "learning_rate": 9.870571419494764e-06, + "loss": 0.7222, + "step": 955 + }, + { + "epoch": 0.1, + "grad_norm": 2.336308324288583, + "learning_rate": 9.87018588313637e-06, + "loss": 0.7229, + "step": 956 + }, + { + "epoch": 0.1, + "grad_norm": 2.456262394309276, + "learning_rate": 9.869799780973856e-06, + "loss": 0.7182, + "step": 957 + }, + { + "epoch": 0.1, + "grad_norm": 2.5276315796370676, + "learning_rate": 9.869413113052084e-06, + "loss": 0.6835, + "step": 958 + }, + { + "epoch": 0.1, + "grad_norm": 3.1051858480552497, + "learning_rate": 9.86902587941597e-06, + "loss": 0.7101, + "step": 959 + }, + { + "epoch": 0.1, + "grad_norm": 3.3731070604108844, + "learning_rate": 9.868638080110507e-06, + "loss": 0.7221, + "step": 960 + }, + { + "epoch": 0.1, + "grad_norm": 2.0867518495145223, + "learning_rate": 9.868249715180741e-06, + "loss": 0.7046, + "step": 961 + }, + { + "epoch": 0.1, + "grad_norm": 2.0764983780128166, + "learning_rate": 9.8678607846718e-06, + "loss": 0.7498, + "step": 962 + }, + { + "epoch": 0.1, + "grad_norm": 2.1440243110852037, + "learning_rate": 9.867471288628863e-06, + "loss": 0.6442, + "step": 963 + }, + { + "epoch": 0.1, + "grad_norm": 2.559996501289641, + "learning_rate": 9.867081227097182e-06, + "loss": 0.7556, + "step": 964 + }, + { + "epoch": 0.1, + "grad_norm": 2.546829215817774, + "learning_rate": 9.866690600122075e-06, + "loss": 0.7098, + "step": 965 + }, + { + "epoch": 0.1, + "grad_norm": 2.692415588752108, + "learning_rate": 9.866299407748921e-06, + "loss": 0.694, + "step": 966 + }, + { + "epoch": 0.1, + "grad_norm": 2.336011704071477, + "learning_rate": 9.865907650023167e-06, + "loss": 0.7751, + "step": 967 + }, + { + "epoch": 0.1, + "grad_norm": 2.3030234604337436, + "learning_rate": 9.865515326990332e-06, + "loss": 0.68, + "step": 968 + }, + { + "epoch": 0.1, + "grad_norm": 2.3788631634376527, + "learning_rate": 9.865122438695988e-06, + "loss": 0.7517, + "step": 969 + }, + { + "epoch": 0.1, + "grad_norm": 2.594025015808721, + "learning_rate": 9.864728985185783e-06, + "loss": 0.7859, + "step": 970 + }, + { + "epoch": 0.1, + "grad_norm": 2.2839322712235326, + "learning_rate": 9.86433496650543e-06, + "loss": 0.7123, + "step": 971 + }, + { + "epoch": 0.1, + "grad_norm": 2.7189304806754397, + "learning_rate": 9.863940382700699e-06, + "loss": 0.7495, + "step": 972 + }, + { + "epoch": 0.1, + "grad_norm": 2.5542757749406033, + "learning_rate": 9.863545233817436e-06, + "loss": 0.7249, + "step": 973 + }, + { + "epoch": 0.1, + "grad_norm": 2.2818240056053565, + "learning_rate": 9.863149519901545e-06, + "loss": 0.7263, + "step": 974 + }, + { + "epoch": 0.1, + "grad_norm": 5.980498735075583, + "learning_rate": 9.862753240999001e-06, + "loss": 0.796, + "step": 975 + }, + { + "epoch": 0.1, + "grad_norm": 2.17696941282096, + "learning_rate": 9.862356397155843e-06, + "loss": 0.7528, + "step": 976 + }, + { + "epoch": 0.1, + "grad_norm": 2.7279007297671747, + "learning_rate": 9.861958988418174e-06, + "loss": 0.7417, + "step": 977 + }, + { + "epoch": 0.1, + "grad_norm": 2.7775418932594853, + "learning_rate": 9.861561014832166e-06, + "loss": 0.6685, + "step": 978 + }, + { + "epoch": 0.1, + "grad_norm": 3.0608876144824255, + "learning_rate": 9.86116247644405e-06, + "loss": 0.796, + "step": 979 + }, + { + "epoch": 0.1, + "grad_norm": 2.230200096432408, + "learning_rate": 9.860763373300133e-06, + "loss": 0.682, + "step": 980 + }, + { + "epoch": 0.1, + "grad_norm": 2.583892124389053, + "learning_rate": 9.860363705446776e-06, + "loss": 0.7154, + "step": 981 + }, + { + "epoch": 0.1, + "grad_norm": 3.420436801225999, + "learning_rate": 9.859963472930413e-06, + "loss": 0.6849, + "step": 982 + }, + { + "epoch": 0.1, + "grad_norm": 2.5498244013352545, + "learning_rate": 9.859562675797543e-06, + "loss": 0.8011, + "step": 983 + }, + { + "epoch": 0.1, + "grad_norm": 2.1659421053037153, + "learning_rate": 9.85916131409473e-06, + "loss": 0.7394, + "step": 984 + }, + { + "epoch": 0.1, + "grad_norm": 3.338196115649606, + "learning_rate": 9.858759387868601e-06, + "loss": 0.7245, + "step": 985 + }, + { + "epoch": 0.1, + "grad_norm": 2.7321452634255357, + "learning_rate": 9.858356897165853e-06, + "loss": 0.7919, + "step": 986 + }, + { + "epoch": 0.1, + "grad_norm": 2.6253699652801417, + "learning_rate": 9.857953842033243e-06, + "loss": 0.733, + "step": 987 + }, + { + "epoch": 0.1, + "grad_norm": 2.59635003190081, + "learning_rate": 9.857550222517598e-06, + "loss": 0.7092, + "step": 988 + }, + { + "epoch": 0.1, + "grad_norm": 2.621851871749562, + "learning_rate": 9.857146038665812e-06, + "loss": 0.7042, + "step": 989 + }, + { + "epoch": 0.1, + "grad_norm": 2.7774812468209973, + "learning_rate": 9.856741290524839e-06, + "loss": 0.7218, + "step": 990 + }, + { + "epoch": 0.1, + "grad_norm": 2.9702400566443012, + "learning_rate": 9.856335978141703e-06, + "loss": 0.7605, + "step": 991 + }, + { + "epoch": 0.1, + "grad_norm": 3.229378466858724, + "learning_rate": 9.85593010156349e-06, + "loss": 0.7731, + "step": 992 + }, + { + "epoch": 0.1, + "grad_norm": 2.985236389182805, + "learning_rate": 9.855523660837355e-06, + "loss": 0.7433, + "step": 993 + }, + { + "epoch": 0.1, + "grad_norm": 2.418368994051385, + "learning_rate": 9.855116656010518e-06, + "loss": 0.6799, + "step": 994 + }, + { + "epoch": 0.1, + "grad_norm": 2.6770469216630866, + "learning_rate": 9.854709087130261e-06, + "loss": 0.6967, + "step": 995 + }, + { + "epoch": 0.1, + "grad_norm": 1.2596414821764004, + "learning_rate": 9.854300954243937e-06, + "loss": 0.6321, + "step": 996 + }, + { + "epoch": 0.1, + "grad_norm": 1.1913642125753563, + "learning_rate": 9.853892257398961e-06, + "loss": 0.6239, + "step": 997 + }, + { + "epoch": 0.11, + "grad_norm": 5.038065448125383, + "learning_rate": 9.853482996642812e-06, + "loss": 0.8232, + "step": 998 + }, + { + "epoch": 0.11, + "grad_norm": 2.80199548153629, + "learning_rate": 9.85307317202304e-06, + "loss": 0.7232, + "step": 999 + }, + { + "epoch": 0.11, + "grad_norm": 3.3246781936353464, + "learning_rate": 9.852662783587255e-06, + "loss": 0.6723, + "step": 1000 + }, + { + "epoch": 0.11, + "grad_norm": 2.285393284696264, + "learning_rate": 9.852251831383136e-06, + "loss": 0.7249, + "step": 1001 + }, + { + "epoch": 0.11, + "grad_norm": 2.6840301004398026, + "learning_rate": 9.851840315458424e-06, + "loss": 0.8205, + "step": 1002 + }, + { + "epoch": 0.11, + "grad_norm": 2.635419321078597, + "learning_rate": 9.85142823586093e-06, + "loss": 0.7197, + "step": 1003 + }, + { + "epoch": 0.11, + "grad_norm": 3.832069075664214, + "learning_rate": 9.851015592638528e-06, + "loss": 0.7241, + "step": 1004 + }, + { + "epoch": 0.11, + "grad_norm": 2.533811572972949, + "learning_rate": 9.850602385839158e-06, + "loss": 0.7935, + "step": 1005 + }, + { + "epoch": 0.11, + "grad_norm": 3.225027430000192, + "learning_rate": 9.850188615510824e-06, + "loss": 0.6579, + "step": 1006 + }, + { + "epoch": 0.11, + "grad_norm": 2.3382405131753483, + "learning_rate": 9.849774281701597e-06, + "loss": 0.7659, + "step": 1007 + }, + { + "epoch": 0.11, + "grad_norm": 2.097379634081843, + "learning_rate": 9.849359384459614e-06, + "loss": 0.7244, + "step": 1008 + }, + { + "epoch": 0.11, + "grad_norm": 2.882248967395376, + "learning_rate": 9.848943923833075e-06, + "loss": 0.699, + "step": 1009 + }, + { + "epoch": 0.11, + "grad_norm": 3.1835190905227715, + "learning_rate": 9.848527899870249e-06, + "loss": 0.729, + "step": 1010 + }, + { + "epoch": 0.11, + "grad_norm": 3.3344865062688966, + "learning_rate": 9.848111312619464e-06, + "loss": 0.7155, + "step": 1011 + }, + { + "epoch": 0.11, + "grad_norm": 2.5563191668434757, + "learning_rate": 9.847694162129124e-06, + "loss": 0.718, + "step": 1012 + }, + { + "epoch": 0.11, + "grad_norm": 2.468563374343176, + "learning_rate": 9.84727644844769e-06, + "loss": 0.7124, + "step": 1013 + }, + { + "epoch": 0.11, + "grad_norm": 2.7246642917209405, + "learning_rate": 9.846858171623687e-06, + "loss": 0.6024, + "step": 1014 + }, + { + "epoch": 0.11, + "grad_norm": 2.0551445765951826, + "learning_rate": 9.846439331705715e-06, + "loss": 0.6882, + "step": 1015 + }, + { + "epoch": 0.11, + "grad_norm": 2.9450655022782932, + "learning_rate": 9.846019928742432e-06, + "loss": 0.7355, + "step": 1016 + }, + { + "epoch": 0.11, + "grad_norm": 2.1869980803278106, + "learning_rate": 9.84559996278256e-06, + "loss": 0.6998, + "step": 1017 + }, + { + "epoch": 0.11, + "grad_norm": 2.0957427212075115, + "learning_rate": 9.845179433874891e-06, + "loss": 0.7364, + "step": 1018 + }, + { + "epoch": 0.11, + "grad_norm": 2.2104276939981258, + "learning_rate": 9.844758342068284e-06, + "loss": 0.7528, + "step": 1019 + }, + { + "epoch": 0.11, + "grad_norm": 2.2088600503635325, + "learning_rate": 9.844336687411657e-06, + "loss": 0.637, + "step": 1020 + }, + { + "epoch": 0.11, + "grad_norm": 2.2733449333519444, + "learning_rate": 9.843914469953995e-06, + "loss": 0.7377, + "step": 1021 + }, + { + "epoch": 0.11, + "grad_norm": 6.872012422821911, + "learning_rate": 9.843491689744354e-06, + "loss": 0.7699, + "step": 1022 + }, + { + "epoch": 0.11, + "grad_norm": 2.471746142903541, + "learning_rate": 9.84306834683185e-06, + "loss": 0.7256, + "step": 1023 + }, + { + "epoch": 0.11, + "grad_norm": 2.5086959706339997, + "learning_rate": 9.842644441265664e-06, + "loss": 0.8149, + "step": 1024 + }, + { + "epoch": 0.11, + "grad_norm": 2.3150840456614925, + "learning_rate": 9.842219973095045e-06, + "loss": 0.7124, + "step": 1025 + }, + { + "epoch": 0.11, + "grad_norm": 2.3604359593997164, + "learning_rate": 9.841794942369309e-06, + "loss": 0.6731, + "step": 1026 + }, + { + "epoch": 0.11, + "grad_norm": 2.439817785760237, + "learning_rate": 9.841369349137832e-06, + "loss": 0.7622, + "step": 1027 + }, + { + "epoch": 0.11, + "grad_norm": 2.9509635034152497, + "learning_rate": 9.840943193450059e-06, + "loss": 0.7069, + "step": 1028 + }, + { + "epoch": 0.11, + "grad_norm": 2.9934027081797234, + "learning_rate": 9.840516475355499e-06, + "loss": 0.7784, + "step": 1029 + }, + { + "epoch": 0.11, + "grad_norm": 2.533628591878186, + "learning_rate": 9.840089194903729e-06, + "loss": 0.7079, + "step": 1030 + }, + { + "epoch": 0.11, + "grad_norm": 2.4056446800859272, + "learning_rate": 9.839661352144386e-06, + "loss": 0.761, + "step": 1031 + }, + { + "epoch": 0.11, + "grad_norm": 2.515210654037056, + "learning_rate": 9.839232947127178e-06, + "loss": 0.7748, + "step": 1032 + }, + { + "epoch": 0.11, + "grad_norm": 2.108290692947581, + "learning_rate": 9.838803979901874e-06, + "loss": 0.7569, + "step": 1033 + }, + { + "epoch": 0.11, + "grad_norm": 3.079443665645577, + "learning_rate": 9.838374450518311e-06, + "loss": 0.7073, + "step": 1034 + }, + { + "epoch": 0.11, + "grad_norm": 2.3720082565519367, + "learning_rate": 9.837944359026392e-06, + "loss": 0.735, + "step": 1035 + }, + { + "epoch": 0.11, + "grad_norm": 2.6597037634218994, + "learning_rate": 9.837513705476082e-06, + "loss": 0.745, + "step": 1036 + }, + { + "epoch": 0.11, + "grad_norm": 3.6631008749810534, + "learning_rate": 9.837082489917413e-06, + "loss": 0.7522, + "step": 1037 + }, + { + "epoch": 0.11, + "grad_norm": 2.7564725036319344, + "learning_rate": 9.836650712400484e-06, + "loss": 0.6971, + "step": 1038 + }, + { + "epoch": 0.11, + "grad_norm": 2.7150369300373387, + "learning_rate": 9.836218372975456e-06, + "loss": 0.7348, + "step": 1039 + }, + { + "epoch": 0.11, + "grad_norm": 2.456649521111631, + "learning_rate": 9.835785471692559e-06, + "loss": 0.7276, + "step": 1040 + }, + { + "epoch": 0.11, + "grad_norm": 2.4789195883853457, + "learning_rate": 9.835352008602081e-06, + "loss": 0.7406, + "step": 1041 + }, + { + "epoch": 0.11, + "grad_norm": 2.240016595350792, + "learning_rate": 9.834917983754388e-06, + "loss": 0.7213, + "step": 1042 + }, + { + "epoch": 0.11, + "grad_norm": 2.4292109109445925, + "learning_rate": 9.834483397199897e-06, + "loss": 0.7573, + "step": 1043 + }, + { + "epoch": 0.11, + "grad_norm": 2.7009207551734242, + "learning_rate": 9.834048248989101e-06, + "loss": 0.7654, + "step": 1044 + }, + { + "epoch": 0.11, + "grad_norm": 3.359205277225582, + "learning_rate": 9.833612539172554e-06, + "loss": 0.6987, + "step": 1045 + }, + { + "epoch": 0.11, + "grad_norm": 4.052083146121145, + "learning_rate": 9.833176267800874e-06, + "loss": 0.8056, + "step": 1046 + }, + { + "epoch": 0.11, + "grad_norm": 2.7708114222605413, + "learning_rate": 9.832739434924747e-06, + "loss": 0.7814, + "step": 1047 + }, + { + "epoch": 0.11, + "grad_norm": 2.4023504113769785, + "learning_rate": 9.832302040594923e-06, + "loss": 0.7072, + "step": 1048 + }, + { + "epoch": 0.11, + "grad_norm": 2.926390430878239, + "learning_rate": 9.831864084862216e-06, + "loss": 0.7338, + "step": 1049 + }, + { + "epoch": 0.11, + "grad_norm": 2.18607735411363, + "learning_rate": 9.831425567777506e-06, + "loss": 0.7444, + "step": 1050 + }, + { + "epoch": 0.11, + "grad_norm": 2.494103967751944, + "learning_rate": 9.830986489391743e-06, + "loss": 0.7925, + "step": 1051 + }, + { + "epoch": 0.11, + "grad_norm": 2.65375185592133, + "learning_rate": 9.830546849755932e-06, + "loss": 0.7176, + "step": 1052 + }, + { + "epoch": 0.11, + "grad_norm": 2.350034885411002, + "learning_rate": 9.830106648921152e-06, + "loss": 0.6827, + "step": 1053 + }, + { + "epoch": 0.11, + "grad_norm": 3.2408010200349135, + "learning_rate": 9.829665886938544e-06, + "loss": 0.7066, + "step": 1054 + }, + { + "epoch": 0.11, + "grad_norm": 3.0868922059592543, + "learning_rate": 9.829224563859314e-06, + "loss": 0.6116, + "step": 1055 + }, + { + "epoch": 0.11, + "grad_norm": 2.941803666500437, + "learning_rate": 9.828782679734737e-06, + "loss": 0.8022, + "step": 1056 + }, + { + "epoch": 0.11, + "grad_norm": 3.823063887767071, + "learning_rate": 9.828340234616142e-06, + "loss": 0.7427, + "step": 1057 + }, + { + "epoch": 0.11, + "grad_norm": 2.8844101888229514, + "learning_rate": 9.827897228554939e-06, + "loss": 0.7707, + "step": 1058 + }, + { + "epoch": 0.11, + "grad_norm": 3.0160506838501684, + "learning_rate": 9.827453661602592e-06, + "loss": 0.7326, + "step": 1059 + }, + { + "epoch": 0.11, + "grad_norm": 2.300778641620372, + "learning_rate": 9.827009533810632e-06, + "loss": 0.7175, + "step": 1060 + }, + { + "epoch": 0.11, + "grad_norm": 2.6516868846525616, + "learning_rate": 9.82656484523066e-06, + "loss": 0.7898, + "step": 1061 + }, + { + "epoch": 0.11, + "grad_norm": 2.4715363916836623, + "learning_rate": 9.826119595914334e-06, + "loss": 0.6921, + "step": 1062 + }, + { + "epoch": 0.11, + "grad_norm": 2.327340583098541, + "learning_rate": 9.825673785913385e-06, + "loss": 0.6928, + "step": 1063 + }, + { + "epoch": 0.11, + "grad_norm": 2.759469881660751, + "learning_rate": 9.825227415279606e-06, + "loss": 0.7855, + "step": 1064 + }, + { + "epoch": 0.11, + "grad_norm": 2.534088445808419, + "learning_rate": 9.824780484064853e-06, + "loss": 0.7307, + "step": 1065 + }, + { + "epoch": 0.11, + "grad_norm": 2.1112574816361924, + "learning_rate": 9.824332992321052e-06, + "loss": 0.6811, + "step": 1066 + }, + { + "epoch": 0.11, + "grad_norm": 2.373627167824405, + "learning_rate": 9.823884940100188e-06, + "loss": 0.7312, + "step": 1067 + }, + { + "epoch": 0.11, + "grad_norm": 2.7883580813619697, + "learning_rate": 9.823436327454318e-06, + "loss": 0.7676, + "step": 1068 + }, + { + "epoch": 0.11, + "grad_norm": 2.1899506229168537, + "learning_rate": 9.822987154435557e-06, + "loss": 0.7408, + "step": 1069 + }, + { + "epoch": 0.11, + "grad_norm": 2.0479852936546252, + "learning_rate": 9.82253742109609e-06, + "loss": 0.6622, + "step": 1070 + }, + { + "epoch": 0.11, + "grad_norm": 2.7416313050535015, + "learning_rate": 9.822087127488167e-06, + "loss": 0.7698, + "step": 1071 + }, + { + "epoch": 0.11, + "grad_norm": 2.105517261367347, + "learning_rate": 9.821636273664102e-06, + "loss": 0.6482, + "step": 1072 + }, + { + "epoch": 0.11, + "grad_norm": 3.280144562837707, + "learning_rate": 9.821184859676269e-06, + "loss": 0.6423, + "step": 1073 + }, + { + "epoch": 0.11, + "grad_norm": 4.134409942350642, + "learning_rate": 9.820732885577117e-06, + "loss": 0.6664, + "step": 1074 + }, + { + "epoch": 0.11, + "grad_norm": 2.912802423096457, + "learning_rate": 9.820280351419155e-06, + "loss": 0.826, + "step": 1075 + }, + { + "epoch": 0.11, + "grad_norm": 3.1928081836845386, + "learning_rate": 9.819827257254957e-06, + "loss": 0.7791, + "step": 1076 + }, + { + "epoch": 0.11, + "grad_norm": 2.779451497550286, + "learning_rate": 9.81937360313716e-06, + "loss": 0.8068, + "step": 1077 + }, + { + "epoch": 0.11, + "grad_norm": 3.32410870620609, + "learning_rate": 9.818919389118466e-06, + "loss": 0.7509, + "step": 1078 + }, + { + "epoch": 0.11, + "grad_norm": 2.5323788910344027, + "learning_rate": 9.81846461525165e-06, + "loss": 0.7576, + "step": 1079 + }, + { + "epoch": 0.11, + "grad_norm": 2.439458544255558, + "learning_rate": 9.818009281589545e-06, + "loss": 0.7202, + "step": 1080 + }, + { + "epoch": 0.11, + "grad_norm": 3.0195258140646195, + "learning_rate": 9.817553388185046e-06, + "loss": 0.6668, + "step": 1081 + }, + { + "epoch": 0.11, + "grad_norm": 1.5034092301568605, + "learning_rate": 9.817096935091123e-06, + "loss": 0.6359, + "step": 1082 + }, + { + "epoch": 0.11, + "grad_norm": 3.4584143921665347, + "learning_rate": 9.816639922360802e-06, + "loss": 0.8676, + "step": 1083 + }, + { + "epoch": 0.11, + "grad_norm": 3.1995319474627197, + "learning_rate": 9.816182350047179e-06, + "loss": 0.6817, + "step": 1084 + }, + { + "epoch": 0.11, + "grad_norm": 3.416376258425686, + "learning_rate": 9.815724218203411e-06, + "loss": 0.7575, + "step": 1085 + }, + { + "epoch": 0.11, + "grad_norm": 3.4149722424830817, + "learning_rate": 9.815265526882726e-06, + "loss": 0.7805, + "step": 1086 + }, + { + "epoch": 0.11, + "grad_norm": 2.3282208941621216, + "learning_rate": 9.814806276138412e-06, + "loss": 0.8052, + "step": 1087 + }, + { + "epoch": 0.11, + "grad_norm": 2.4514470562750907, + "learning_rate": 9.81434646602382e-06, + "loss": 0.7267, + "step": 1088 + }, + { + "epoch": 0.11, + "grad_norm": 2.976660270631451, + "learning_rate": 9.813886096592376e-06, + "loss": 0.6789, + "step": 1089 + }, + { + "epoch": 0.11, + "grad_norm": 2.0977217217797643, + "learning_rate": 9.81342516789756e-06, + "loss": 0.7351, + "step": 1090 + }, + { + "epoch": 0.11, + "grad_norm": 16.753908355958515, + "learning_rate": 9.81296367999292e-06, + "loss": 0.6394, + "step": 1091 + }, + { + "epoch": 0.11, + "grad_norm": 2.3815484353523138, + "learning_rate": 9.812501632932074e-06, + "loss": 0.7619, + "step": 1092 + }, + { + "epoch": 0.12, + "grad_norm": 4.14830234781379, + "learning_rate": 9.8120390267687e-06, + "loss": 0.7141, + "step": 1093 + }, + { + "epoch": 0.12, + "grad_norm": 2.4341062988650624, + "learning_rate": 9.811575861556541e-06, + "loss": 0.776, + "step": 1094 + }, + { + "epoch": 0.12, + "grad_norm": 2.290585984126302, + "learning_rate": 9.811112137349407e-06, + "loss": 0.8117, + "step": 1095 + }, + { + "epoch": 0.12, + "grad_norm": 2.555561103046378, + "learning_rate": 9.810647854201174e-06, + "loss": 0.748, + "step": 1096 + }, + { + "epoch": 0.12, + "grad_norm": 2.710466022236052, + "learning_rate": 9.81018301216578e-06, + "loss": 0.7782, + "step": 1097 + }, + { + "epoch": 0.12, + "grad_norm": 3.3963252605264445, + "learning_rate": 9.809717611297227e-06, + "loss": 0.6976, + "step": 1098 + }, + { + "epoch": 0.12, + "grad_norm": 2.6354731547094254, + "learning_rate": 9.809251651649586e-06, + "loss": 0.8057, + "step": 1099 + }, + { + "epoch": 0.12, + "grad_norm": 1.4944044637050682, + "learning_rate": 9.80878513327699e-06, + "loss": 0.6846, + "step": 1100 + }, + { + "epoch": 0.12, + "grad_norm": 2.2779646813088203, + "learning_rate": 9.80831805623364e-06, + "loss": 0.6475, + "step": 1101 + }, + { + "epoch": 0.12, + "grad_norm": 2.3609991882243317, + "learning_rate": 9.807850420573794e-06, + "loss": 0.6642, + "step": 1102 + }, + { + "epoch": 0.12, + "grad_norm": 2.2887645711932185, + "learning_rate": 9.807382226351786e-06, + "loss": 0.7091, + "step": 1103 + }, + { + "epoch": 0.12, + "grad_norm": 2.1955085410313404, + "learning_rate": 9.806913473622008e-06, + "loss": 0.6662, + "step": 1104 + }, + { + "epoch": 0.12, + "grad_norm": 1.2036596843300829, + "learning_rate": 9.806444162438917e-06, + "loss": 0.6465, + "step": 1105 + }, + { + "epoch": 0.12, + "grad_norm": 1.1349422385810124, + "learning_rate": 9.805974292857038e-06, + "loss": 0.6237, + "step": 1106 + }, + { + "epoch": 0.12, + "grad_norm": 2.5361645121575336, + "learning_rate": 9.805503864930958e-06, + "loss": 0.9142, + "step": 1107 + }, + { + "epoch": 0.12, + "grad_norm": 3.0395622006277123, + "learning_rate": 9.80503287871533e-06, + "loss": 0.6835, + "step": 1108 + }, + { + "epoch": 0.12, + "grad_norm": 2.422353959280125, + "learning_rate": 9.804561334264872e-06, + "loss": 0.7633, + "step": 1109 + }, + { + "epoch": 0.12, + "grad_norm": 2.5480040358296767, + "learning_rate": 9.804089231634368e-06, + "loss": 0.8203, + "step": 1110 + }, + { + "epoch": 0.12, + "grad_norm": 2.8729382273172983, + "learning_rate": 9.803616570878664e-06, + "loss": 0.6498, + "step": 1111 + }, + { + "epoch": 0.12, + "grad_norm": 2.7811166027244028, + "learning_rate": 9.803143352052674e-06, + "loss": 0.5825, + "step": 1112 + }, + { + "epoch": 0.12, + "grad_norm": 2.12202620735347, + "learning_rate": 9.802669575211369e-06, + "loss": 0.6428, + "step": 1113 + }, + { + "epoch": 0.12, + "grad_norm": 5.0006468325620315, + "learning_rate": 9.8021952404098e-06, + "loss": 0.7803, + "step": 1114 + }, + { + "epoch": 0.12, + "grad_norm": 3.1558917309253327, + "learning_rate": 9.80172034770307e-06, + "loss": 0.7181, + "step": 1115 + }, + { + "epoch": 0.12, + "grad_norm": 2.333799697036943, + "learning_rate": 9.801244897146348e-06, + "loss": 0.8098, + "step": 1116 + }, + { + "epoch": 0.12, + "grad_norm": 2.3764518557927974, + "learning_rate": 9.800768888794874e-06, + "loss": 0.6982, + "step": 1117 + }, + { + "epoch": 0.12, + "grad_norm": 2.553267042004277, + "learning_rate": 9.800292322703949e-06, + "loss": 0.7525, + "step": 1118 + }, + { + "epoch": 0.12, + "grad_norm": 2.418379261387043, + "learning_rate": 9.799815198928937e-06, + "loss": 0.7675, + "step": 1119 + }, + { + "epoch": 0.12, + "grad_norm": 2.8872564473960765, + "learning_rate": 9.79933751752527e-06, + "loss": 0.6478, + "step": 1120 + }, + { + "epoch": 0.12, + "grad_norm": 4.052357385896143, + "learning_rate": 9.798859278548443e-06, + "loss": 0.7681, + "step": 1121 + }, + { + "epoch": 0.12, + "grad_norm": 2.2984285503416286, + "learning_rate": 9.798380482054019e-06, + "loss": 0.6921, + "step": 1122 + }, + { + "epoch": 0.12, + "grad_norm": 2.422351898914631, + "learning_rate": 9.79790112809762e-06, + "loss": 0.7374, + "step": 1123 + }, + { + "epoch": 0.12, + "grad_norm": 2.2410791102035756, + "learning_rate": 9.797421216734938e-06, + "loss": 0.664, + "step": 1124 + }, + { + "epoch": 0.12, + "grad_norm": 3.470129472972923, + "learning_rate": 9.796940748021727e-06, + "loss": 0.7462, + "step": 1125 + }, + { + "epoch": 0.12, + "grad_norm": 2.414314965943791, + "learning_rate": 9.796459722013804e-06, + "loss": 0.7995, + "step": 1126 + }, + { + "epoch": 0.12, + "grad_norm": 3.270258241210417, + "learning_rate": 9.795978138767059e-06, + "loss": 0.7509, + "step": 1127 + }, + { + "epoch": 0.12, + "grad_norm": 2.353498649269498, + "learning_rate": 9.795495998337436e-06, + "loss": 0.8035, + "step": 1128 + }, + { + "epoch": 0.12, + "grad_norm": 2.8698790645980807, + "learning_rate": 9.795013300780951e-06, + "loss": 0.7457, + "step": 1129 + }, + { + "epoch": 0.12, + "grad_norm": 3.6109946770346673, + "learning_rate": 9.794530046153681e-06, + "loss": 0.7238, + "step": 1130 + }, + { + "epoch": 0.12, + "grad_norm": 2.488339655347247, + "learning_rate": 9.79404623451177e-06, + "loss": 0.7118, + "step": 1131 + }, + { + "epoch": 0.12, + "grad_norm": 2.9913223737937966, + "learning_rate": 9.793561865911425e-06, + "loss": 0.76, + "step": 1132 + }, + { + "epoch": 0.12, + "grad_norm": 5.4767066337380035, + "learning_rate": 9.793076940408921e-06, + "loss": 0.6867, + "step": 1133 + }, + { + "epoch": 0.12, + "grad_norm": 2.657626022399911, + "learning_rate": 9.792591458060592e-06, + "loss": 0.7424, + "step": 1134 + }, + { + "epoch": 0.12, + "grad_norm": 3.0031975051776887, + "learning_rate": 9.792105418922842e-06, + "loss": 0.6451, + "step": 1135 + }, + { + "epoch": 0.12, + "grad_norm": 2.515412387819786, + "learning_rate": 9.791618823052137e-06, + "loss": 0.654, + "step": 1136 + }, + { + "epoch": 0.12, + "grad_norm": 2.2988508182868506, + "learning_rate": 9.791131670505008e-06, + "loss": 0.7053, + "step": 1137 + }, + { + "epoch": 0.12, + "grad_norm": 3.392212351593364, + "learning_rate": 9.790643961338051e-06, + "loss": 0.6784, + "step": 1138 + }, + { + "epoch": 0.12, + "grad_norm": 2.3321628230545013, + "learning_rate": 9.790155695607927e-06, + "loss": 0.747, + "step": 1139 + }, + { + "epoch": 0.12, + "grad_norm": 2.5520902735537194, + "learning_rate": 9.789666873371361e-06, + "loss": 0.7747, + "step": 1140 + }, + { + "epoch": 0.12, + "grad_norm": 3.0561347446951537, + "learning_rate": 9.789177494685146e-06, + "loss": 0.7497, + "step": 1141 + }, + { + "epoch": 0.12, + "grad_norm": 2.42722295869959, + "learning_rate": 9.788687559606131e-06, + "loss": 0.7141, + "step": 1142 + }, + { + "epoch": 0.12, + "grad_norm": 3.087792011146286, + "learning_rate": 9.788197068191237e-06, + "loss": 0.78, + "step": 1143 + }, + { + "epoch": 0.12, + "grad_norm": 2.742833380122242, + "learning_rate": 9.787706020497451e-06, + "loss": 0.702, + "step": 1144 + }, + { + "epoch": 0.12, + "grad_norm": 2.2623934679268323, + "learning_rate": 9.787214416581818e-06, + "loss": 0.6702, + "step": 1145 + }, + { + "epoch": 0.12, + "grad_norm": 3.105793673298671, + "learning_rate": 9.786722256501454e-06, + "loss": 0.742, + "step": 1146 + }, + { + "epoch": 0.12, + "grad_norm": 2.771703607077318, + "learning_rate": 9.786229540313534e-06, + "loss": 0.6825, + "step": 1147 + }, + { + "epoch": 0.12, + "grad_norm": 2.833220908887274, + "learning_rate": 9.785736268075303e-06, + "loss": 0.6402, + "step": 1148 + }, + { + "epoch": 0.12, + "grad_norm": 3.252609192328139, + "learning_rate": 9.785242439844064e-06, + "loss": 0.7542, + "step": 1149 + }, + { + "epoch": 0.12, + "grad_norm": 2.6298483732855993, + "learning_rate": 9.784748055677193e-06, + "loss": 0.758, + "step": 1150 + }, + { + "epoch": 0.12, + "grad_norm": 2.1632996471894916, + "learning_rate": 9.784253115632125e-06, + "loss": 0.7049, + "step": 1151 + }, + { + "epoch": 0.12, + "grad_norm": 2.34611962892255, + "learning_rate": 9.783757619766359e-06, + "loss": 0.6605, + "step": 1152 + }, + { + "epoch": 0.12, + "grad_norm": 2.3000195548875086, + "learning_rate": 9.783261568137461e-06, + "loss": 0.6681, + "step": 1153 + }, + { + "epoch": 0.12, + "grad_norm": 2.644856326527659, + "learning_rate": 9.78276496080306e-06, + "loss": 0.7254, + "step": 1154 + }, + { + "epoch": 0.12, + "grad_norm": 3.363577339292242, + "learning_rate": 9.782267797820852e-06, + "loss": 0.6255, + "step": 1155 + }, + { + "epoch": 0.12, + "grad_norm": 2.178294432104343, + "learning_rate": 9.781770079248597e-06, + "loss": 0.6964, + "step": 1156 + }, + { + "epoch": 0.12, + "grad_norm": 2.351307497143435, + "learning_rate": 9.781271805144115e-06, + "loss": 0.672, + "step": 1157 + }, + { + "epoch": 0.12, + "grad_norm": 2.317111010522036, + "learning_rate": 9.780772975565297e-06, + "loss": 0.716, + "step": 1158 + }, + { + "epoch": 0.12, + "grad_norm": 8.74680266959224, + "learning_rate": 9.780273590570095e-06, + "loss": 0.7632, + "step": 1159 + }, + { + "epoch": 0.12, + "grad_norm": 3.8612186188840654, + "learning_rate": 9.779773650216524e-06, + "loss": 0.688, + "step": 1160 + }, + { + "epoch": 0.12, + "grad_norm": 3.73767312113061, + "learning_rate": 9.779273154562668e-06, + "loss": 0.732, + "step": 1161 + }, + { + "epoch": 0.12, + "grad_norm": 2.195963737502883, + "learning_rate": 9.778772103666672e-06, + "loss": 0.7151, + "step": 1162 + }, + { + "epoch": 0.12, + "grad_norm": 2.842525432869639, + "learning_rate": 9.778270497586747e-06, + "loss": 0.6845, + "step": 1163 + }, + { + "epoch": 0.12, + "grad_norm": 2.134809149072051, + "learning_rate": 9.77776833638117e-06, + "loss": 0.7597, + "step": 1164 + }, + { + "epoch": 0.12, + "grad_norm": 2.7009490835040846, + "learning_rate": 9.777265620108277e-06, + "loss": 0.7779, + "step": 1165 + }, + { + "epoch": 0.12, + "grad_norm": 2.317959306339774, + "learning_rate": 9.776762348826474e-06, + "loss": 0.7794, + "step": 1166 + }, + { + "epoch": 0.12, + "grad_norm": 2.109072732996193, + "learning_rate": 9.776258522594231e-06, + "loss": 0.7258, + "step": 1167 + }, + { + "epoch": 0.12, + "grad_norm": 2.1691103374743497, + "learning_rate": 9.775754141470077e-06, + "loss": 0.7235, + "step": 1168 + }, + { + "epoch": 0.12, + "grad_norm": 3.2658839932977033, + "learning_rate": 9.775249205512614e-06, + "loss": 0.7238, + "step": 1169 + }, + { + "epoch": 0.12, + "grad_norm": 2.4497606850003515, + "learning_rate": 9.774743714780502e-06, + "loss": 0.7825, + "step": 1170 + }, + { + "epoch": 0.12, + "grad_norm": 2.7201508522085223, + "learning_rate": 9.774237669332467e-06, + "loss": 0.7096, + "step": 1171 + }, + { + "epoch": 0.12, + "grad_norm": 3.194344873698593, + "learning_rate": 9.7737310692273e-06, + "loss": 0.7806, + "step": 1172 + }, + { + "epoch": 0.12, + "grad_norm": 2.4546802907254675, + "learning_rate": 9.77322391452386e-06, + "loss": 0.7075, + "step": 1173 + }, + { + "epoch": 0.12, + "grad_norm": 2.8217799648099753, + "learning_rate": 9.772716205281061e-06, + "loss": 0.6641, + "step": 1174 + }, + { + "epoch": 0.12, + "grad_norm": 3.3940574360026314, + "learning_rate": 9.772207941557889e-06, + "loss": 0.7329, + "step": 1175 + }, + { + "epoch": 0.12, + "grad_norm": 2.8617824561808356, + "learning_rate": 9.771699123413396e-06, + "loss": 0.6231, + "step": 1176 + }, + { + "epoch": 0.12, + "grad_norm": 1.4154376282640087, + "learning_rate": 9.77118975090669e-06, + "loss": 0.6116, + "step": 1177 + }, + { + "epoch": 0.12, + "grad_norm": 2.378924743383222, + "learning_rate": 9.770679824096952e-06, + "loss": 0.6989, + "step": 1178 + }, + { + "epoch": 0.12, + "grad_norm": 2.7001805333163587, + "learning_rate": 9.770169343043423e-06, + "loss": 0.7121, + "step": 1179 + }, + { + "epoch": 0.12, + "grad_norm": 2.5402821969530742, + "learning_rate": 9.769658307805408e-06, + "loss": 0.7407, + "step": 1180 + }, + { + "epoch": 0.12, + "grad_norm": 2.4849409843596, + "learning_rate": 9.769146718442279e-06, + "loss": 0.7695, + "step": 1181 + }, + { + "epoch": 0.12, + "grad_norm": 2.36462157041008, + "learning_rate": 9.76863457501347e-06, + "loss": 0.7366, + "step": 1182 + }, + { + "epoch": 0.12, + "grad_norm": 3.5569947068679664, + "learning_rate": 9.76812187757848e-06, + "loss": 0.7354, + "step": 1183 + }, + { + "epoch": 0.12, + "grad_norm": 2.230897787375336, + "learning_rate": 9.767608626196877e-06, + "loss": 0.6896, + "step": 1184 + }, + { + "epoch": 0.12, + "grad_norm": 2.322170991929863, + "learning_rate": 9.767094820928282e-06, + "loss": 0.7377, + "step": 1185 + }, + { + "epoch": 0.12, + "grad_norm": 4.766972909981728, + "learning_rate": 9.766580461832391e-06, + "loss": 0.7322, + "step": 1186 + }, + { + "epoch": 0.12, + "grad_norm": 1.9936899282121772, + "learning_rate": 9.766065548968962e-06, + "loss": 0.7133, + "step": 1187 + }, + { + "epoch": 0.13, + "grad_norm": 2.9245738454667074, + "learning_rate": 9.765550082397815e-06, + "loss": 0.7225, + "step": 1188 + }, + { + "epoch": 0.13, + "grad_norm": 2.9105025784741505, + "learning_rate": 9.765034062178836e-06, + "loss": 0.773, + "step": 1189 + }, + { + "epoch": 0.13, + "grad_norm": 2.3491105179808205, + "learning_rate": 9.764517488371971e-06, + "loss": 0.7345, + "step": 1190 + }, + { + "epoch": 0.13, + "grad_norm": 3.042045767945915, + "learning_rate": 9.76400036103724e-06, + "loss": 0.6984, + "step": 1191 + }, + { + "epoch": 0.13, + "grad_norm": 2.452386829371169, + "learning_rate": 9.763482680234718e-06, + "loss": 0.7039, + "step": 1192 + }, + { + "epoch": 0.13, + "grad_norm": 2.3004751526217735, + "learning_rate": 9.762964446024547e-06, + "loss": 0.7878, + "step": 1193 + }, + { + "epoch": 0.13, + "grad_norm": 1.3235197535321637, + "learning_rate": 9.762445658466935e-06, + "loss": 0.6694, + "step": 1194 + }, + { + "epoch": 0.13, + "grad_norm": 2.539843716603838, + "learning_rate": 9.761926317622154e-06, + "loss": 0.741, + "step": 1195 + }, + { + "epoch": 0.13, + "grad_norm": 2.849640472644429, + "learning_rate": 9.761406423550539e-06, + "loss": 0.7408, + "step": 1196 + }, + { + "epoch": 0.13, + "grad_norm": 2.5166258673564514, + "learning_rate": 9.760885976312488e-06, + "loss": 0.7223, + "step": 1197 + }, + { + "epoch": 0.13, + "grad_norm": 2.725239057841064, + "learning_rate": 9.760364975968469e-06, + "loss": 0.7007, + "step": 1198 + }, + { + "epoch": 0.13, + "grad_norm": 6.7294366804059464, + "learning_rate": 9.759843422579005e-06, + "loss": 0.6748, + "step": 1199 + }, + { + "epoch": 0.13, + "grad_norm": 2.7738891360490694, + "learning_rate": 9.759321316204693e-06, + "loss": 0.6751, + "step": 1200 + }, + { + "epoch": 0.13, + "grad_norm": 16.17668799017805, + "learning_rate": 9.758798656906187e-06, + "loss": 0.7039, + "step": 1201 + }, + { + "epoch": 0.13, + "grad_norm": 2.8538609592861137, + "learning_rate": 9.758275444744211e-06, + "loss": 0.7457, + "step": 1202 + }, + { + "epoch": 0.13, + "grad_norm": 2.8412177031201455, + "learning_rate": 9.757751679779549e-06, + "loss": 0.6996, + "step": 1203 + }, + { + "epoch": 0.13, + "grad_norm": 2.4794823846058103, + "learning_rate": 9.757227362073048e-06, + "loss": 0.7336, + "step": 1204 + }, + { + "epoch": 0.13, + "grad_norm": 1.9919272034069202, + "learning_rate": 9.756702491685626e-06, + "loss": 0.7235, + "step": 1205 + }, + { + "epoch": 0.13, + "grad_norm": 2.682936702377847, + "learning_rate": 9.756177068678258e-06, + "loss": 0.6468, + "step": 1206 + }, + { + "epoch": 0.13, + "grad_norm": 2.2979583262392227, + "learning_rate": 9.755651093111987e-06, + "loss": 0.7306, + "step": 1207 + }, + { + "epoch": 0.13, + "grad_norm": 5.631349641332311, + "learning_rate": 9.755124565047918e-06, + "loss": 0.7321, + "step": 1208 + }, + { + "epoch": 0.13, + "grad_norm": 2.09817214920078, + "learning_rate": 9.754597484547223e-06, + "loss": 0.6638, + "step": 1209 + }, + { + "epoch": 0.13, + "grad_norm": 2.909553694559456, + "learning_rate": 9.754069851671138e-06, + "loss": 0.7271, + "step": 1210 + }, + { + "epoch": 0.13, + "grad_norm": 3.0625344333106828, + "learning_rate": 9.753541666480959e-06, + "loss": 0.6552, + "step": 1211 + }, + { + "epoch": 0.13, + "grad_norm": 3.2516687148488987, + "learning_rate": 9.75301292903805e-06, + "loss": 0.7763, + "step": 1212 + }, + { + "epoch": 0.13, + "grad_norm": 3.0525856091150914, + "learning_rate": 9.752483639403839e-06, + "loss": 0.7237, + "step": 1213 + }, + { + "epoch": 0.13, + "grad_norm": 1.3517703119170414, + "learning_rate": 9.751953797639817e-06, + "loss": 0.6346, + "step": 1214 + }, + { + "epoch": 0.13, + "grad_norm": 2.5336050253735287, + "learning_rate": 9.751423403807539e-06, + "loss": 0.7032, + "step": 1215 + }, + { + "epoch": 0.13, + "grad_norm": 2.274623839914132, + "learning_rate": 9.750892457968626e-06, + "loss": 0.7113, + "step": 1216 + }, + { + "epoch": 0.13, + "grad_norm": 2.502789805441234, + "learning_rate": 9.75036096018476e-06, + "loss": 0.6948, + "step": 1217 + }, + { + "epoch": 0.13, + "grad_norm": 2.362716780752687, + "learning_rate": 9.749828910517688e-06, + "loss": 0.6797, + "step": 1218 + }, + { + "epoch": 0.13, + "grad_norm": 1.9013855117687954, + "learning_rate": 9.749296309029224e-06, + "loss": 0.7267, + "step": 1219 + }, + { + "epoch": 0.13, + "grad_norm": 2.5275074129127404, + "learning_rate": 9.748763155781244e-06, + "loss": 0.719, + "step": 1220 + }, + { + "epoch": 0.13, + "grad_norm": 3.7639603698409827, + "learning_rate": 9.748229450835689e-06, + "loss": 0.6755, + "step": 1221 + }, + { + "epoch": 0.13, + "grad_norm": 2.8030238737546624, + "learning_rate": 9.747695194254561e-06, + "loss": 0.7427, + "step": 1222 + }, + { + "epoch": 0.13, + "grad_norm": 2.820939766808103, + "learning_rate": 9.74716038609993e-06, + "loss": 0.7745, + "step": 1223 + }, + { + "epoch": 0.13, + "grad_norm": 2.48939064756324, + "learning_rate": 9.746625026433929e-06, + "loss": 0.6752, + "step": 1224 + }, + { + "epoch": 0.13, + "grad_norm": 2.7729898492722955, + "learning_rate": 9.746089115318751e-06, + "loss": 0.6633, + "step": 1225 + }, + { + "epoch": 0.13, + "grad_norm": 2.5642579652047615, + "learning_rate": 9.745552652816662e-06, + "loss": 0.6911, + "step": 1226 + }, + { + "epoch": 0.13, + "grad_norm": 2.479378462117706, + "learning_rate": 9.74501563898998e-06, + "loss": 0.6961, + "step": 1227 + }, + { + "epoch": 0.13, + "grad_norm": 3.517261027107456, + "learning_rate": 9.744478073901102e-06, + "loss": 0.6875, + "step": 1228 + }, + { + "epoch": 0.13, + "grad_norm": 4.143063814038163, + "learning_rate": 9.743939957612473e-06, + "loss": 0.7862, + "step": 1229 + }, + { + "epoch": 0.13, + "grad_norm": 2.186066134453419, + "learning_rate": 9.743401290186615e-06, + "loss": 0.6905, + "step": 1230 + }, + { + "epoch": 0.13, + "grad_norm": 2.4988157638795383, + "learning_rate": 9.742862071686105e-06, + "loss": 0.6065, + "step": 1231 + }, + { + "epoch": 0.13, + "grad_norm": 2.944980331009129, + "learning_rate": 9.742322302173591e-06, + "loss": 0.8424, + "step": 1232 + }, + { + "epoch": 0.13, + "grad_norm": 2.4840746884722997, + "learning_rate": 9.74178198171178e-06, + "loss": 0.7242, + "step": 1233 + }, + { + "epoch": 0.13, + "grad_norm": 2.3564467576176615, + "learning_rate": 9.741241110363446e-06, + "loss": 0.5671, + "step": 1234 + }, + { + "epoch": 0.13, + "grad_norm": 2.5428398872084914, + "learning_rate": 9.740699688191426e-06, + "loss": 0.7558, + "step": 1235 + }, + { + "epoch": 0.13, + "grad_norm": 2.987401358325348, + "learning_rate": 9.74015771525862e-06, + "loss": 0.7976, + "step": 1236 + }, + { + "epoch": 0.13, + "grad_norm": 2.288527677436737, + "learning_rate": 9.73961519162799e-06, + "loss": 0.7495, + "step": 1237 + }, + { + "epoch": 0.13, + "grad_norm": 3.203656023458977, + "learning_rate": 9.739072117362572e-06, + "loss": 0.6546, + "step": 1238 + }, + { + "epoch": 0.13, + "grad_norm": 2.893448106608211, + "learning_rate": 9.738528492525454e-06, + "loss": 0.7536, + "step": 1239 + }, + { + "epoch": 0.13, + "grad_norm": 2.562798821263755, + "learning_rate": 9.737984317179792e-06, + "loss": 0.6901, + "step": 1240 + }, + { + "epoch": 0.13, + "grad_norm": 2.525831897954803, + "learning_rate": 9.737439591388808e-06, + "loss": 0.7072, + "step": 1241 + }, + { + "epoch": 0.13, + "grad_norm": 2.390376616064533, + "learning_rate": 9.736894315215788e-06, + "loss": 0.6709, + "step": 1242 + }, + { + "epoch": 0.13, + "grad_norm": 2.290595026707291, + "learning_rate": 9.736348488724078e-06, + "loss": 0.7022, + "step": 1243 + }, + { + "epoch": 0.13, + "grad_norm": 3.0135726421842928, + "learning_rate": 9.735802111977093e-06, + "loss": 0.7398, + "step": 1244 + }, + { + "epoch": 0.13, + "grad_norm": 6.4970745285376745, + "learning_rate": 9.735255185038308e-06, + "loss": 0.7238, + "step": 1245 + }, + { + "epoch": 0.13, + "grad_norm": 2.5294324112992945, + "learning_rate": 9.734707707971265e-06, + "loss": 0.6654, + "step": 1246 + }, + { + "epoch": 0.13, + "grad_norm": 4.769086981727464, + "learning_rate": 9.734159680839566e-06, + "loss": 0.7625, + "step": 1247 + }, + { + "epoch": 0.13, + "grad_norm": 2.425192205003877, + "learning_rate": 9.733611103706882e-06, + "loss": 0.7288, + "step": 1248 + }, + { + "epoch": 0.13, + "grad_norm": 2.4432588819312016, + "learning_rate": 9.73306197663694e-06, + "loss": 0.7453, + "step": 1249 + }, + { + "epoch": 0.13, + "grad_norm": 2.4916017795323224, + "learning_rate": 9.732512299693542e-06, + "loss": 0.6687, + "step": 1250 + }, + { + "epoch": 0.13, + "grad_norm": 6.171083266500421, + "learning_rate": 9.731962072940545e-06, + "loss": 0.7657, + "step": 1251 + }, + { + "epoch": 0.13, + "grad_norm": 2.2828106585662606, + "learning_rate": 9.731411296441873e-06, + "loss": 0.7871, + "step": 1252 + }, + { + "epoch": 0.13, + "grad_norm": 6.2796867996521595, + "learning_rate": 9.730859970261514e-06, + "loss": 0.7616, + "step": 1253 + }, + { + "epoch": 0.13, + "grad_norm": 2.9365596197667454, + "learning_rate": 9.730308094463519e-06, + "loss": 0.7857, + "step": 1254 + }, + { + "epoch": 0.13, + "grad_norm": 2.571904631777549, + "learning_rate": 9.729755669112003e-06, + "loss": 0.7116, + "step": 1255 + }, + { + "epoch": 0.13, + "grad_norm": 2.5467525821131054, + "learning_rate": 9.729202694271145e-06, + "loss": 0.6199, + "step": 1256 + }, + { + "epoch": 0.13, + "grad_norm": 2.2962244446782196, + "learning_rate": 9.72864917000519e-06, + "loss": 0.758, + "step": 1257 + }, + { + "epoch": 0.13, + "grad_norm": 3.140094567957693, + "learning_rate": 9.728095096378443e-06, + "loss": 0.7457, + "step": 1258 + }, + { + "epoch": 0.13, + "grad_norm": 2.4307560819993066, + "learning_rate": 9.727540473455277e-06, + "loss": 0.7374, + "step": 1259 + }, + { + "epoch": 0.13, + "grad_norm": 3.4574501390436145, + "learning_rate": 9.726985301300122e-06, + "loss": 0.7201, + "step": 1260 + }, + { + "epoch": 0.13, + "grad_norm": 2.5770663455788885, + "learning_rate": 9.72642957997748e-06, + "loss": 0.6912, + "step": 1261 + }, + { + "epoch": 0.13, + "grad_norm": 2.6051783139637252, + "learning_rate": 9.725873309551915e-06, + "loss": 0.7906, + "step": 1262 + }, + { + "epoch": 0.13, + "grad_norm": 2.6247042237572757, + "learning_rate": 9.725316490088046e-06, + "loss": 0.7596, + "step": 1263 + }, + { + "epoch": 0.13, + "grad_norm": 2.208978552737289, + "learning_rate": 9.724759121650569e-06, + "loss": 0.6893, + "step": 1264 + }, + { + "epoch": 0.13, + "grad_norm": 2.818237537685943, + "learning_rate": 9.724201204304234e-06, + "loss": 0.6509, + "step": 1265 + }, + { + "epoch": 0.13, + "grad_norm": 3.516551806674273, + "learning_rate": 9.72364273811386e-06, + "loss": 0.8091, + "step": 1266 + }, + { + "epoch": 0.13, + "grad_norm": 2.8775423326241003, + "learning_rate": 9.723083723144326e-06, + "loss": 0.6664, + "step": 1267 + }, + { + "epoch": 0.13, + "grad_norm": 3.1104989369101292, + "learning_rate": 9.722524159460579e-06, + "loss": 0.7484, + "step": 1268 + }, + { + "epoch": 0.13, + "grad_norm": 2.284433621967399, + "learning_rate": 9.721964047127627e-06, + "loss": 0.7694, + "step": 1269 + }, + { + "epoch": 0.13, + "grad_norm": 3.528762803348098, + "learning_rate": 9.721403386210542e-06, + "loss": 0.7047, + "step": 1270 + }, + { + "epoch": 0.13, + "grad_norm": 2.3698967383567053, + "learning_rate": 9.720842176774458e-06, + "loss": 0.7303, + "step": 1271 + }, + { + "epoch": 0.13, + "grad_norm": 2.2862180267688195, + "learning_rate": 9.720280418884578e-06, + "loss": 0.6719, + "step": 1272 + }, + { + "epoch": 0.13, + "grad_norm": 2.585341957705507, + "learning_rate": 9.719718112606163e-06, + "loss": 0.6893, + "step": 1273 + }, + { + "epoch": 0.13, + "grad_norm": 2.931292145704906, + "learning_rate": 9.719155258004542e-06, + "loss": 0.7129, + "step": 1274 + }, + { + "epoch": 0.13, + "grad_norm": 2.930050695251463, + "learning_rate": 9.718591855145105e-06, + "loss": 0.7024, + "step": 1275 + }, + { + "epoch": 0.13, + "grad_norm": 2.3725890130265377, + "learning_rate": 9.718027904093306e-06, + "loss": 0.6779, + "step": 1276 + }, + { + "epoch": 0.13, + "grad_norm": 2.667713730201913, + "learning_rate": 9.717463404914661e-06, + "loss": 0.7128, + "step": 1277 + }, + { + "epoch": 0.13, + "grad_norm": 2.4476996596913376, + "learning_rate": 9.716898357674757e-06, + "loss": 0.725, + "step": 1278 + }, + { + "epoch": 0.13, + "grad_norm": 2.286403952573892, + "learning_rate": 9.716332762439238e-06, + "loss": 0.6882, + "step": 1279 + }, + { + "epoch": 0.13, + "grad_norm": 2.187379321806065, + "learning_rate": 9.71576661927381e-06, + "loss": 0.6514, + "step": 1280 + }, + { + "epoch": 0.13, + "grad_norm": 8.048461731248665, + "learning_rate": 9.71519992824425e-06, + "loss": 0.69, + "step": 1281 + }, + { + "epoch": 0.13, + "grad_norm": 2.4471121642879634, + "learning_rate": 9.714632689416392e-06, + "loss": 0.6853, + "step": 1282 + }, + { + "epoch": 0.14, + "grad_norm": 2.3522287088330436, + "learning_rate": 9.714064902856136e-06, + "loss": 0.6857, + "step": 1283 + }, + { + "epoch": 0.14, + "grad_norm": 2.8039331155505693, + "learning_rate": 9.713496568629447e-06, + "loss": 0.7953, + "step": 1284 + }, + { + "epoch": 0.14, + "grad_norm": 2.5431568191768257, + "learning_rate": 9.71292768680235e-06, + "loss": 0.6906, + "step": 1285 + }, + { + "epoch": 0.14, + "grad_norm": 2.734010132334214, + "learning_rate": 9.712358257440942e-06, + "loss": 0.7469, + "step": 1286 + }, + { + "epoch": 0.14, + "grad_norm": 2.766837316455564, + "learning_rate": 9.711788280611371e-06, + "loss": 0.7425, + "step": 1287 + }, + { + "epoch": 0.14, + "grad_norm": 2.2607094221443815, + "learning_rate": 9.711217756379859e-06, + "loss": 0.6745, + "step": 1288 + }, + { + "epoch": 0.14, + "grad_norm": 2.718072945572998, + "learning_rate": 9.710646684812686e-06, + "loss": 0.6632, + "step": 1289 + }, + { + "epoch": 0.14, + "grad_norm": 2.220340692493552, + "learning_rate": 9.7100750659762e-06, + "loss": 0.7289, + "step": 1290 + }, + { + "epoch": 0.14, + "grad_norm": 3.630360305784288, + "learning_rate": 9.709502899936805e-06, + "loss": 0.6569, + "step": 1291 + }, + { + "epoch": 0.14, + "grad_norm": 2.7311986179292194, + "learning_rate": 9.70893018676098e-06, + "loss": 0.7763, + "step": 1292 + }, + { + "epoch": 0.14, + "grad_norm": 2.292026620226449, + "learning_rate": 9.708356926515256e-06, + "loss": 0.7046, + "step": 1293 + }, + { + "epoch": 0.14, + "grad_norm": 3.099523665954287, + "learning_rate": 9.707783119266236e-06, + "loss": 0.7467, + "step": 1294 + }, + { + "epoch": 0.14, + "grad_norm": 2.9843246706602047, + "learning_rate": 9.707208765080583e-06, + "loss": 0.7871, + "step": 1295 + }, + { + "epoch": 0.14, + "grad_norm": 2.603495110130337, + "learning_rate": 9.706633864025021e-06, + "loss": 0.6149, + "step": 1296 + }, + { + "epoch": 0.14, + "grad_norm": 2.9957523727453244, + "learning_rate": 9.706058416166342e-06, + "loss": 0.6866, + "step": 1297 + }, + { + "epoch": 0.14, + "grad_norm": 3.4581527501195923, + "learning_rate": 9.705482421571401e-06, + "loss": 0.7306, + "step": 1298 + }, + { + "epoch": 0.14, + "grad_norm": 2.75978405172835, + "learning_rate": 9.704905880307113e-06, + "loss": 0.704, + "step": 1299 + }, + { + "epoch": 0.14, + "grad_norm": 2.6219298892401675, + "learning_rate": 9.704328792440462e-06, + "loss": 0.7072, + "step": 1300 + }, + { + "epoch": 0.14, + "grad_norm": 2.1583848744671967, + "learning_rate": 9.70375115803849e-06, + "loss": 0.6726, + "step": 1301 + }, + { + "epoch": 0.14, + "grad_norm": 2.7991267040996943, + "learning_rate": 9.703172977168307e-06, + "loss": 0.6575, + "step": 1302 + }, + { + "epoch": 0.14, + "grad_norm": 9.725583520285742, + "learning_rate": 9.702594249897082e-06, + "loss": 0.735, + "step": 1303 + }, + { + "epoch": 0.14, + "grad_norm": 3.2733331053327124, + "learning_rate": 9.70201497629205e-06, + "loss": 0.7678, + "step": 1304 + }, + { + "epoch": 0.14, + "grad_norm": 3.3225063544881346, + "learning_rate": 9.701435156420511e-06, + "loss": 0.6528, + "step": 1305 + }, + { + "epoch": 0.14, + "grad_norm": 2.5695618934438342, + "learning_rate": 9.700854790349826e-06, + "loss": 0.7385, + "step": 1306 + }, + { + "epoch": 0.14, + "grad_norm": 2.165748397571203, + "learning_rate": 9.700273878147419e-06, + "loss": 0.6923, + "step": 1307 + }, + { + "epoch": 0.14, + "grad_norm": 3.946583806133681, + "learning_rate": 9.699692419880782e-06, + "loss": 0.6818, + "step": 1308 + }, + { + "epoch": 0.14, + "grad_norm": 2.7377235660563417, + "learning_rate": 9.699110415617464e-06, + "loss": 0.6344, + "step": 1309 + }, + { + "epoch": 0.14, + "grad_norm": 4.216540668421752, + "learning_rate": 9.698527865425083e-06, + "loss": 0.7325, + "step": 1310 + }, + { + "epoch": 0.14, + "grad_norm": 2.5772415638304755, + "learning_rate": 9.697944769371315e-06, + "loss": 0.6738, + "step": 1311 + }, + { + "epoch": 0.14, + "grad_norm": 3.4296412096784854, + "learning_rate": 9.697361127523905e-06, + "loss": 0.7011, + "step": 1312 + }, + { + "epoch": 0.14, + "grad_norm": 3.0442601673343894, + "learning_rate": 9.696776939950657e-06, + "loss": 0.7084, + "step": 1313 + }, + { + "epoch": 0.14, + "grad_norm": 3.075407781881538, + "learning_rate": 9.696192206719441e-06, + "loss": 0.7175, + "step": 1314 + }, + { + "epoch": 0.14, + "grad_norm": 2.190682088347276, + "learning_rate": 9.69560692789819e-06, + "loss": 0.718, + "step": 1315 + }, + { + "epoch": 0.14, + "grad_norm": 3.203702468047881, + "learning_rate": 9.695021103554901e-06, + "loss": 0.78, + "step": 1316 + }, + { + "epoch": 0.14, + "grad_norm": 2.334461505397333, + "learning_rate": 9.694434733757632e-06, + "loss": 0.7062, + "step": 1317 + }, + { + "epoch": 0.14, + "grad_norm": 2.3787385280317968, + "learning_rate": 9.693847818574504e-06, + "loss": 0.6883, + "step": 1318 + }, + { + "epoch": 0.14, + "grad_norm": 1.7061551304241709, + "learning_rate": 9.693260358073707e-06, + "loss": 0.7412, + "step": 1319 + }, + { + "epoch": 0.14, + "grad_norm": 3.5189339975333427, + "learning_rate": 9.692672352323486e-06, + "loss": 0.7493, + "step": 1320 + }, + { + "epoch": 0.14, + "grad_norm": 2.718227304089354, + "learning_rate": 9.692083801392158e-06, + "loss": 0.7298, + "step": 1321 + }, + { + "epoch": 0.14, + "grad_norm": 5.863146583225161, + "learning_rate": 9.691494705348097e-06, + "loss": 0.685, + "step": 1322 + }, + { + "epoch": 0.14, + "grad_norm": 2.224062335592531, + "learning_rate": 9.690905064259744e-06, + "loss": 0.7131, + "step": 1323 + }, + { + "epoch": 0.14, + "grad_norm": 2.3482124963584043, + "learning_rate": 9.690314878195599e-06, + "loss": 0.7642, + "step": 1324 + }, + { + "epoch": 0.14, + "grad_norm": 2.802943387618265, + "learning_rate": 9.68972414722423e-06, + "loss": 0.7402, + "step": 1325 + }, + { + "epoch": 0.14, + "grad_norm": 6.439325155682455, + "learning_rate": 9.689132871414266e-06, + "loss": 0.6827, + "step": 1326 + }, + { + "epoch": 0.14, + "grad_norm": 2.283544545333155, + "learning_rate": 9.688541050834402e-06, + "loss": 0.7649, + "step": 1327 + }, + { + "epoch": 0.14, + "grad_norm": 2.8881726166539816, + "learning_rate": 9.68794868555339e-06, + "loss": 0.7581, + "step": 1328 + }, + { + "epoch": 0.14, + "grad_norm": 2.5386184576990725, + "learning_rate": 9.687355775640052e-06, + "loss": 0.7252, + "step": 1329 + }, + { + "epoch": 0.14, + "grad_norm": 2.6298695027488703, + "learning_rate": 9.68676232116327e-06, + "loss": 0.7742, + "step": 1330 + }, + { + "epoch": 0.14, + "grad_norm": 2.084061296828642, + "learning_rate": 9.686168322191988e-06, + "loss": 0.6, + "step": 1331 + }, + { + "epoch": 0.14, + "grad_norm": 2.3716239247862054, + "learning_rate": 9.685573778795218e-06, + "loss": 0.5694, + "step": 1332 + }, + { + "epoch": 0.14, + "grad_norm": 3.2053716555139404, + "learning_rate": 9.684978691042031e-06, + "loss": 0.7231, + "step": 1333 + }, + { + "epoch": 0.14, + "grad_norm": 2.844182707802076, + "learning_rate": 9.684383059001562e-06, + "loss": 0.6835, + "step": 1334 + }, + { + "epoch": 0.14, + "grad_norm": 3.390207972128154, + "learning_rate": 9.68378688274301e-06, + "loss": 0.7385, + "step": 1335 + }, + { + "epoch": 0.14, + "grad_norm": 2.4980529568022267, + "learning_rate": 9.683190162335638e-06, + "loss": 0.8148, + "step": 1336 + }, + { + "epoch": 0.14, + "grad_norm": 4.308242765317922, + "learning_rate": 9.68259289784877e-06, + "loss": 0.604, + "step": 1337 + }, + { + "epoch": 0.14, + "grad_norm": 2.5111104713428425, + "learning_rate": 9.681995089351797e-06, + "loss": 0.6423, + "step": 1338 + }, + { + "epoch": 0.14, + "grad_norm": 2.184046389884775, + "learning_rate": 9.681396736914169e-06, + "loss": 0.8212, + "step": 1339 + }, + { + "epoch": 0.14, + "grad_norm": 2.3349747833725316, + "learning_rate": 9.680797840605398e-06, + "loss": 0.724, + "step": 1340 + }, + { + "epoch": 0.14, + "grad_norm": 3.357395284413713, + "learning_rate": 9.680198400495067e-06, + "loss": 0.7505, + "step": 1341 + }, + { + "epoch": 0.14, + "grad_norm": 3.237004929584163, + "learning_rate": 9.679598416652814e-06, + "loss": 0.7294, + "step": 1342 + }, + { + "epoch": 0.14, + "grad_norm": 2.3140384711893978, + "learning_rate": 9.678997889148342e-06, + "loss": 0.7354, + "step": 1343 + }, + { + "epoch": 0.14, + "grad_norm": 2.396103673854701, + "learning_rate": 9.678396818051423e-06, + "loss": 0.7875, + "step": 1344 + }, + { + "epoch": 0.14, + "grad_norm": 2.4741313274912935, + "learning_rate": 9.677795203431886e-06, + "loss": 0.7377, + "step": 1345 + }, + { + "epoch": 0.14, + "grad_norm": 3.4119548772690003, + "learning_rate": 9.677193045359626e-06, + "loss": 0.6744, + "step": 1346 + }, + { + "epoch": 0.14, + "grad_norm": 3.0195566082707535, + "learning_rate": 9.676590343904595e-06, + "loss": 0.5983, + "step": 1347 + }, + { + "epoch": 0.14, + "grad_norm": 2.429934472959647, + "learning_rate": 9.675987099136817e-06, + "loss": 0.6282, + "step": 1348 + }, + { + "epoch": 0.14, + "grad_norm": 2.399876917459758, + "learning_rate": 9.675383311126376e-06, + "loss": 0.7758, + "step": 1349 + }, + { + "epoch": 0.14, + "grad_norm": 2.3597263333081977, + "learning_rate": 9.674778979943417e-06, + "loss": 0.7706, + "step": 1350 + }, + { + "epoch": 0.14, + "grad_norm": 2.582735032637409, + "learning_rate": 9.67417410565815e-06, + "loss": 0.6888, + "step": 1351 + }, + { + "epoch": 0.14, + "grad_norm": 1.2457596734399106, + "learning_rate": 9.673568688340846e-06, + "loss": 0.6083, + "step": 1352 + }, + { + "epoch": 0.14, + "grad_norm": 2.2475959881242735, + "learning_rate": 9.672962728061842e-06, + "loss": 0.7384, + "step": 1353 + }, + { + "epoch": 0.14, + "grad_norm": 2.7160143206592346, + "learning_rate": 9.672356224891536e-06, + "loss": 0.6937, + "step": 1354 + }, + { + "epoch": 0.14, + "grad_norm": 2.5502591319299843, + "learning_rate": 9.671749178900392e-06, + "loss": 0.7563, + "step": 1355 + }, + { + "epoch": 0.14, + "grad_norm": 2.6327155581660455, + "learning_rate": 9.67114159015893e-06, + "loss": 0.6819, + "step": 1356 + }, + { + "epoch": 0.14, + "grad_norm": 2.131038039139984, + "learning_rate": 9.670533458737744e-06, + "loss": 0.7373, + "step": 1357 + }, + { + "epoch": 0.14, + "grad_norm": 3.385584377854225, + "learning_rate": 9.66992478470748e-06, + "loss": 0.7657, + "step": 1358 + }, + { + "epoch": 0.14, + "grad_norm": 7.320376159534197, + "learning_rate": 9.669315568138854e-06, + "loss": 0.6898, + "step": 1359 + }, + { + "epoch": 0.14, + "grad_norm": 2.8074892746810742, + "learning_rate": 9.668705809102644e-06, + "loss": 0.6807, + "step": 1360 + }, + { + "epoch": 0.14, + "grad_norm": 2.8098437749259717, + "learning_rate": 9.668095507669688e-06, + "loss": 0.6959, + "step": 1361 + }, + { + "epoch": 0.14, + "grad_norm": 2.7943446077345855, + "learning_rate": 9.66748466391089e-06, + "loss": 0.7087, + "step": 1362 + }, + { + "epoch": 0.14, + "grad_norm": 6.059820137036976, + "learning_rate": 9.666873277897216e-06, + "loss": 0.6634, + "step": 1363 + }, + { + "epoch": 0.14, + "grad_norm": 2.575688586030959, + "learning_rate": 9.666261349699696e-06, + "loss": 0.6982, + "step": 1364 + }, + { + "epoch": 0.14, + "grad_norm": 2.262608946469177, + "learning_rate": 9.66564887938942e-06, + "loss": 0.8071, + "step": 1365 + }, + { + "epoch": 0.14, + "grad_norm": 2.311694063851022, + "learning_rate": 9.665035867037545e-06, + "loss": 0.7185, + "step": 1366 + }, + { + "epoch": 0.14, + "grad_norm": 2.1721646707712954, + "learning_rate": 9.66442231271529e-06, + "loss": 0.6839, + "step": 1367 + }, + { + "epoch": 0.14, + "grad_norm": 2.056959685399615, + "learning_rate": 9.663808216493931e-06, + "loss": 0.6246, + "step": 1368 + }, + { + "epoch": 0.14, + "grad_norm": 3.211417541737079, + "learning_rate": 9.663193578444815e-06, + "loss": 0.7382, + "step": 1369 + }, + { + "epoch": 0.14, + "grad_norm": 3.482753174472199, + "learning_rate": 9.662578398639353e-06, + "loss": 0.7261, + "step": 1370 + }, + { + "epoch": 0.14, + "grad_norm": 2.8419175901293308, + "learning_rate": 9.661962677149007e-06, + "loss": 0.7479, + "step": 1371 + }, + { + "epoch": 0.14, + "grad_norm": 1.562383209374972, + "learning_rate": 9.661346414045315e-06, + "loss": 0.6338, + "step": 1372 + }, + { + "epoch": 0.14, + "grad_norm": 2.1133024703227266, + "learning_rate": 9.66072960939987e-06, + "loss": 0.8398, + "step": 1373 + }, + { + "epoch": 0.14, + "grad_norm": 2.209697515295642, + "learning_rate": 9.660112263284334e-06, + "loss": 0.6979, + "step": 1374 + }, + { + "epoch": 0.14, + "grad_norm": 4.316723331737145, + "learning_rate": 9.659494375770424e-06, + "loss": 0.7773, + "step": 1375 + }, + { + "epoch": 0.14, + "grad_norm": 2.3531621544495867, + "learning_rate": 9.65887594692993e-06, + "loss": 0.7506, + "step": 1376 + }, + { + "epoch": 0.14, + "grad_norm": 5.6434780012213634, + "learning_rate": 9.658256976834692e-06, + "loss": 0.7223, + "step": 1377 + }, + { + "epoch": 0.15, + "grad_norm": 2.899949137691732, + "learning_rate": 9.657637465556626e-06, + "loss": 0.7059, + "step": 1378 + }, + { + "epoch": 0.15, + "grad_norm": 3.154298474199794, + "learning_rate": 9.657017413167702e-06, + "loss": 0.6951, + "step": 1379 + }, + { + "epoch": 0.15, + "grad_norm": 2.762468831498145, + "learning_rate": 9.656396819739959e-06, + "loss": 0.7888, + "step": 1380 + }, + { + "epoch": 0.15, + "grad_norm": 1.3312043862215792, + "learning_rate": 9.655775685345493e-06, + "loss": 0.6494, + "step": 1381 + }, + { + "epoch": 0.15, + "grad_norm": 3.5501629949975944, + "learning_rate": 9.655154010056464e-06, + "loss": 0.8004, + "step": 1382 + }, + { + "epoch": 0.15, + "grad_norm": 4.64326272620766, + "learning_rate": 9.654531793945102e-06, + "loss": 0.7567, + "step": 1383 + }, + { + "epoch": 0.15, + "grad_norm": 2.575524144576267, + "learning_rate": 9.653909037083689e-06, + "loss": 0.7945, + "step": 1384 + }, + { + "epoch": 0.15, + "grad_norm": 2.9295057714053, + "learning_rate": 9.653285739544578e-06, + "loss": 0.7318, + "step": 1385 + }, + { + "epoch": 0.15, + "grad_norm": 2.0997333286555233, + "learning_rate": 9.65266190140018e-06, + "loss": 0.6631, + "step": 1386 + }, + { + "epoch": 0.15, + "grad_norm": 2.2304269290377787, + "learning_rate": 9.652037522722974e-06, + "loss": 0.6277, + "step": 1387 + }, + { + "epoch": 0.15, + "grad_norm": 4.44783772978294, + "learning_rate": 9.651412603585495e-06, + "loss": 0.7528, + "step": 1388 + }, + { + "epoch": 0.15, + "grad_norm": 2.178234675495518, + "learning_rate": 9.650787144060345e-06, + "loss": 0.753, + "step": 1389 + }, + { + "epoch": 0.15, + "grad_norm": 2.2283048639192446, + "learning_rate": 9.65016114422019e-06, + "loss": 0.7198, + "step": 1390 + }, + { + "epoch": 0.15, + "grad_norm": 2.4993099361247664, + "learning_rate": 9.649534604137755e-06, + "loss": 0.7052, + "step": 1391 + }, + { + "epoch": 0.15, + "grad_norm": 3.1995602061556725, + "learning_rate": 9.64890752388583e-06, + "loss": 0.7257, + "step": 1392 + }, + { + "epoch": 0.15, + "grad_norm": 2.2307125057623227, + "learning_rate": 9.648279903537268e-06, + "loss": 0.7356, + "step": 1393 + }, + { + "epoch": 0.15, + "grad_norm": 2.1622062738792347, + "learning_rate": 9.647651743164983e-06, + "loss": 0.6852, + "step": 1394 + }, + { + "epoch": 0.15, + "grad_norm": 3.3927400290041, + "learning_rate": 9.647023042841953e-06, + "loss": 0.6911, + "step": 1395 + }, + { + "epoch": 0.15, + "grad_norm": 4.504113313434371, + "learning_rate": 9.64639380264122e-06, + "loss": 0.7242, + "step": 1396 + }, + { + "epoch": 0.15, + "grad_norm": 2.837839765380396, + "learning_rate": 9.645764022635886e-06, + "loss": 0.6788, + "step": 1397 + }, + { + "epoch": 0.15, + "grad_norm": 2.2209452355398236, + "learning_rate": 9.645133702899116e-06, + "loss": 0.6795, + "step": 1398 + }, + { + "epoch": 0.15, + "grad_norm": 3.3171450711888, + "learning_rate": 9.644502843504141e-06, + "loss": 0.7098, + "step": 1399 + }, + { + "epoch": 0.15, + "grad_norm": 2.433039777766387, + "learning_rate": 9.643871444524253e-06, + "loss": 0.7227, + "step": 1400 + }, + { + "epoch": 0.15, + "grad_norm": 2.572768002891731, + "learning_rate": 9.643239506032805e-06, + "loss": 0.7593, + "step": 1401 + }, + { + "epoch": 0.15, + "grad_norm": 2.125858662151184, + "learning_rate": 9.642607028103212e-06, + "loss": 0.7119, + "step": 1402 + }, + { + "epoch": 0.15, + "grad_norm": 2.685061632087422, + "learning_rate": 9.641974010808954e-06, + "loss": 0.6543, + "step": 1403 + }, + { + "epoch": 0.15, + "grad_norm": 2.22659599285853, + "learning_rate": 9.641340454223576e-06, + "loss": 0.7721, + "step": 1404 + }, + { + "epoch": 0.15, + "grad_norm": 3.4662300157631596, + "learning_rate": 9.64070635842068e-06, + "loss": 0.7117, + "step": 1405 + }, + { + "epoch": 0.15, + "grad_norm": 2.643822549271141, + "learning_rate": 9.640071723473934e-06, + "loss": 0.7299, + "step": 1406 + }, + { + "epoch": 0.15, + "grad_norm": 2.595323686997289, + "learning_rate": 9.639436549457069e-06, + "loss": 0.7492, + "step": 1407 + }, + { + "epoch": 0.15, + "grad_norm": 2.4646272228107784, + "learning_rate": 9.638800836443875e-06, + "loss": 0.7249, + "step": 1408 + }, + { + "epoch": 0.15, + "grad_norm": 2.889739658027975, + "learning_rate": 9.638164584508211e-06, + "loss": 0.7073, + "step": 1409 + }, + { + "epoch": 0.15, + "grad_norm": 2.731955437980805, + "learning_rate": 9.637527793723993e-06, + "loss": 0.6637, + "step": 1410 + }, + { + "epoch": 0.15, + "grad_norm": 7.330236462820327, + "learning_rate": 9.6368904641652e-06, + "loss": 0.6849, + "step": 1411 + }, + { + "epoch": 0.15, + "grad_norm": 2.09841411709079, + "learning_rate": 9.63625259590588e-06, + "loss": 0.7158, + "step": 1412 + }, + { + "epoch": 0.15, + "grad_norm": 2.629199349370734, + "learning_rate": 9.635614189020133e-06, + "loss": 0.7321, + "step": 1413 + }, + { + "epoch": 0.15, + "grad_norm": 2.344946491188394, + "learning_rate": 9.63497524358213e-06, + "loss": 0.746, + "step": 1414 + }, + { + "epoch": 0.15, + "grad_norm": 2.6910188223643856, + "learning_rate": 9.6343357596661e-06, + "loss": 0.6378, + "step": 1415 + }, + { + "epoch": 0.15, + "grad_norm": 3.2643901325386224, + "learning_rate": 9.633695737346341e-06, + "loss": 0.7012, + "step": 1416 + }, + { + "epoch": 0.15, + "grad_norm": 2.741008316052005, + "learning_rate": 9.633055176697205e-06, + "loss": 0.7338, + "step": 1417 + }, + { + "epoch": 0.15, + "grad_norm": 3.204256419036011, + "learning_rate": 9.632414077793111e-06, + "loss": 0.7241, + "step": 1418 + }, + { + "epoch": 0.15, + "grad_norm": 2.440995779151848, + "learning_rate": 9.63177244070854e-06, + "loss": 0.6625, + "step": 1419 + }, + { + "epoch": 0.15, + "grad_norm": 2.5374105809523626, + "learning_rate": 9.631130265518036e-06, + "loss": 0.7329, + "step": 1420 + }, + { + "epoch": 0.15, + "grad_norm": 2.46110371173565, + "learning_rate": 9.630487552296207e-06, + "loss": 0.7371, + "step": 1421 + }, + { + "epoch": 0.15, + "grad_norm": 3.3468353330936393, + "learning_rate": 9.629844301117717e-06, + "loss": 0.6405, + "step": 1422 + }, + { + "epoch": 0.15, + "grad_norm": 2.397830492051804, + "learning_rate": 9.6292005120573e-06, + "loss": 0.6774, + "step": 1423 + }, + { + "epoch": 0.15, + "grad_norm": 3.7532545714870977, + "learning_rate": 9.62855618518975e-06, + "loss": 0.7351, + "step": 1424 + }, + { + "epoch": 0.15, + "grad_norm": 3.1816654437320833, + "learning_rate": 9.627911320589922e-06, + "loss": 0.7497, + "step": 1425 + }, + { + "epoch": 0.15, + "grad_norm": 2.4023123862610016, + "learning_rate": 9.627265918332734e-06, + "loss": 0.7702, + "step": 1426 + }, + { + "epoch": 0.15, + "grad_norm": 2.6310701737673368, + "learning_rate": 9.626619978493168e-06, + "loss": 0.7071, + "step": 1427 + }, + { + "epoch": 0.15, + "grad_norm": 2.44463817108983, + "learning_rate": 9.62597350114627e-06, + "loss": 0.6781, + "step": 1428 + }, + { + "epoch": 0.15, + "grad_norm": 2.4602674966430977, + "learning_rate": 9.625326486367139e-06, + "loss": 0.7439, + "step": 1429 + }, + { + "epoch": 0.15, + "grad_norm": 2.4828542109646614, + "learning_rate": 9.624678934230948e-06, + "loss": 0.7377, + "step": 1430 + }, + { + "epoch": 0.15, + "grad_norm": 2.846123388535948, + "learning_rate": 9.624030844812926e-06, + "loss": 0.6388, + "step": 1431 + }, + { + "epoch": 0.15, + "grad_norm": 2.1757720444416084, + "learning_rate": 9.623382218188371e-06, + "loss": 0.6985, + "step": 1432 + }, + { + "epoch": 0.15, + "grad_norm": 3.259786537034406, + "learning_rate": 9.622733054432631e-06, + "loss": 0.7484, + "step": 1433 + }, + { + "epoch": 0.15, + "grad_norm": 2.212671951693864, + "learning_rate": 9.62208335362113e-06, + "loss": 0.6293, + "step": 1434 + }, + { + "epoch": 0.15, + "grad_norm": 2.818588711614401, + "learning_rate": 9.621433115829344e-06, + "loss": 0.7193, + "step": 1435 + }, + { + "epoch": 0.15, + "grad_norm": 4.029835287597727, + "learning_rate": 9.62078234113282e-06, + "loss": 0.6895, + "step": 1436 + }, + { + "epoch": 0.15, + "grad_norm": 2.1696773604444783, + "learning_rate": 9.62013102960716e-06, + "loss": 0.7032, + "step": 1437 + }, + { + "epoch": 0.15, + "grad_norm": 3.5749029914926895, + "learning_rate": 9.619479181328034e-06, + "loss": 0.6764, + "step": 1438 + }, + { + "epoch": 0.15, + "grad_norm": 2.8794399028585693, + "learning_rate": 9.618826796371168e-06, + "loss": 0.7734, + "step": 1439 + }, + { + "epoch": 0.15, + "grad_norm": 2.516706221446295, + "learning_rate": 9.618173874812357e-06, + "loss": 0.7291, + "step": 1440 + }, + { + "epoch": 0.15, + "grad_norm": 3.1628068758073637, + "learning_rate": 9.617520416727456e-06, + "loss": 0.7082, + "step": 1441 + }, + { + "epoch": 0.15, + "grad_norm": 2.8419799887337898, + "learning_rate": 9.61686642219238e-06, + "loss": 0.7383, + "step": 1442 + }, + { + "epoch": 0.15, + "grad_norm": 2.6343772160057055, + "learning_rate": 9.616211891283108e-06, + "loss": 0.6451, + "step": 1443 + }, + { + "epoch": 0.15, + "grad_norm": 3.1705848350056636, + "learning_rate": 9.615556824075684e-06, + "loss": 0.7876, + "step": 1444 + }, + { + "epoch": 0.15, + "grad_norm": 3.483339076402114, + "learning_rate": 9.61490122064621e-06, + "loss": 0.7049, + "step": 1445 + }, + { + "epoch": 0.15, + "grad_norm": 2.0799562658988946, + "learning_rate": 9.614245081070851e-06, + "loss": 0.7073, + "step": 1446 + }, + { + "epoch": 0.15, + "grad_norm": 2.6861424428613367, + "learning_rate": 9.61358840542584e-06, + "loss": 0.6411, + "step": 1447 + }, + { + "epoch": 0.15, + "grad_norm": 2.7191955155420042, + "learning_rate": 9.612931193787464e-06, + "loss": 0.6745, + "step": 1448 + }, + { + "epoch": 0.15, + "grad_norm": 6.472474388518654, + "learning_rate": 9.612273446232075e-06, + "loss": 0.7655, + "step": 1449 + }, + { + "epoch": 0.15, + "grad_norm": 2.9357438196561434, + "learning_rate": 9.61161516283609e-06, + "loss": 0.6609, + "step": 1450 + }, + { + "epoch": 0.15, + "grad_norm": 2.590924049130375, + "learning_rate": 9.610956343675988e-06, + "loss": 0.7238, + "step": 1451 + }, + { + "epoch": 0.15, + "grad_norm": 5.879538354937373, + "learning_rate": 9.610296988828305e-06, + "loss": 0.7834, + "step": 1452 + }, + { + "epoch": 0.15, + "grad_norm": 3.1041931126773212, + "learning_rate": 9.609637098369646e-06, + "loss": 0.6807, + "step": 1453 + }, + { + "epoch": 0.15, + "grad_norm": 2.261974070639065, + "learning_rate": 9.608976672376673e-06, + "loss": 0.7105, + "step": 1454 + }, + { + "epoch": 0.15, + "grad_norm": 3.1210663189380314, + "learning_rate": 9.608315710926113e-06, + "loss": 0.6593, + "step": 1455 + }, + { + "epoch": 0.15, + "grad_norm": 3.050620721243887, + "learning_rate": 9.607654214094757e-06, + "loss": 0.6594, + "step": 1456 + }, + { + "epoch": 0.15, + "grad_norm": 2.7905140014141727, + "learning_rate": 9.606992181959451e-06, + "loss": 0.7085, + "step": 1457 + }, + { + "epoch": 0.15, + "grad_norm": 2.7770890627009153, + "learning_rate": 9.606329614597114e-06, + "loss": 0.7159, + "step": 1458 + }, + { + "epoch": 0.15, + "grad_norm": 1.9828864391785557, + "learning_rate": 9.605666512084716e-06, + "loss": 0.6708, + "step": 1459 + }, + { + "epoch": 0.15, + "grad_norm": 2.266774311682029, + "learning_rate": 9.605002874499296e-06, + "loss": 0.6211, + "step": 1460 + }, + { + "epoch": 0.15, + "grad_norm": 11.988026431087897, + "learning_rate": 9.604338701917954e-06, + "loss": 0.7906, + "step": 1461 + }, + { + "epoch": 0.15, + "grad_norm": 5.5367891929196, + "learning_rate": 9.603673994417853e-06, + "loss": 0.6816, + "step": 1462 + }, + { + "epoch": 0.15, + "grad_norm": 2.3227720167759194, + "learning_rate": 9.603008752076213e-06, + "loss": 0.7428, + "step": 1463 + }, + { + "epoch": 0.15, + "grad_norm": 3.309441828787185, + "learning_rate": 9.602342974970323e-06, + "loss": 0.675, + "step": 1464 + }, + { + "epoch": 0.15, + "grad_norm": 2.523141442933751, + "learning_rate": 9.60167666317753e-06, + "loss": 0.6574, + "step": 1465 + }, + { + "epoch": 0.15, + "grad_norm": 2.9705719635611074, + "learning_rate": 9.601009816775244e-06, + "loss": 0.6318, + "step": 1466 + }, + { + "epoch": 0.15, + "grad_norm": 2.4004597591752566, + "learning_rate": 9.600342435840939e-06, + "loss": 0.7256, + "step": 1467 + }, + { + "epoch": 0.15, + "grad_norm": 2.9444494206781227, + "learning_rate": 9.599674520452148e-06, + "loss": 0.7762, + "step": 1468 + }, + { + "epoch": 0.15, + "grad_norm": 3.7455953837505005, + "learning_rate": 9.599006070686467e-06, + "loss": 0.7091, + "step": 1469 + }, + { + "epoch": 0.15, + "grad_norm": 3.6074677438299947, + "learning_rate": 9.598337086621555e-06, + "loss": 0.6716, + "step": 1470 + }, + { + "epoch": 0.15, + "grad_norm": 2.9274341045897745, + "learning_rate": 9.597667568335133e-06, + "loss": 0.6385, + "step": 1471 + }, + { + "epoch": 0.15, + "grad_norm": 3.7346926939077543, + "learning_rate": 9.596997515904983e-06, + "loss": 0.7569, + "step": 1472 + }, + { + "epoch": 0.16, + "grad_norm": 3.5131800878247748, + "learning_rate": 9.59632692940895e-06, + "loss": 0.8693, + "step": 1473 + }, + { + "epoch": 0.16, + "grad_norm": 3.097568621283518, + "learning_rate": 9.595655808924942e-06, + "loss": 0.7473, + "step": 1474 + }, + { + "epoch": 0.16, + "grad_norm": 3.067535905696128, + "learning_rate": 9.594984154530926e-06, + "loss": 0.7308, + "step": 1475 + }, + { + "epoch": 0.16, + "grad_norm": 2.508065359892658, + "learning_rate": 9.594311966304933e-06, + "loss": 0.7814, + "step": 1476 + }, + { + "epoch": 0.16, + "grad_norm": 2.2827015265359845, + "learning_rate": 9.593639244325057e-06, + "loss": 0.7721, + "step": 1477 + }, + { + "epoch": 0.16, + "grad_norm": 2.738236724185965, + "learning_rate": 9.592965988669454e-06, + "loss": 0.7184, + "step": 1478 + }, + { + "epoch": 0.16, + "grad_norm": 2.6198830644856206, + "learning_rate": 9.592292199416338e-06, + "loss": 0.9262, + "step": 1479 + }, + { + "epoch": 0.16, + "grad_norm": 2.552236548583453, + "learning_rate": 9.591617876643989e-06, + "loss": 0.7082, + "step": 1480 + }, + { + "epoch": 0.16, + "grad_norm": 2.826249720921075, + "learning_rate": 9.590943020430747e-06, + "loss": 0.6825, + "step": 1481 + }, + { + "epoch": 0.16, + "grad_norm": 2.860146961164607, + "learning_rate": 9.59026763085502e-06, + "loss": 0.6227, + "step": 1482 + }, + { + "epoch": 0.16, + "grad_norm": 1.2203871767796366, + "learning_rate": 9.589591707995265e-06, + "loss": 0.6181, + "step": 1483 + }, + { + "epoch": 0.16, + "grad_norm": 2.401274298711083, + "learning_rate": 9.588915251930013e-06, + "loss": 0.6813, + "step": 1484 + }, + { + "epoch": 0.16, + "grad_norm": 4.331113197722243, + "learning_rate": 9.588238262737853e-06, + "loss": 0.6847, + "step": 1485 + }, + { + "epoch": 0.16, + "grad_norm": 2.7893674560880104, + "learning_rate": 9.587560740497434e-06, + "loss": 0.6684, + "step": 1486 + }, + { + "epoch": 0.16, + "grad_norm": 3.812079227043104, + "learning_rate": 9.586882685287471e-06, + "loss": 0.6859, + "step": 1487 + }, + { + "epoch": 0.16, + "grad_norm": 2.690594004910145, + "learning_rate": 9.586204097186738e-06, + "loss": 0.6719, + "step": 1488 + }, + { + "epoch": 0.16, + "grad_norm": 2.67029996471702, + "learning_rate": 9.585524976274068e-06, + "loss": 0.6752, + "step": 1489 + }, + { + "epoch": 0.16, + "grad_norm": 2.783949918181014, + "learning_rate": 9.584845322628363e-06, + "loss": 0.6744, + "step": 1490 + }, + { + "epoch": 0.16, + "grad_norm": 2.768161593958554, + "learning_rate": 9.58416513632858e-06, + "loss": 0.6731, + "step": 1491 + }, + { + "epoch": 0.16, + "grad_norm": 2.7128750709002785, + "learning_rate": 9.583484417453744e-06, + "loss": 0.6595, + "step": 1492 + }, + { + "epoch": 0.16, + "grad_norm": 3.1967481849379173, + "learning_rate": 9.582803166082938e-06, + "loss": 0.7063, + "step": 1493 + }, + { + "epoch": 0.16, + "grad_norm": 2.5603570119876258, + "learning_rate": 9.582121382295309e-06, + "loss": 0.7163, + "step": 1494 + }, + { + "epoch": 0.16, + "grad_norm": 2.275503794973825, + "learning_rate": 9.58143906617006e-06, + "loss": 0.7148, + "step": 1495 + }, + { + "epoch": 0.16, + "grad_norm": 2.4819881710650984, + "learning_rate": 9.580756217786466e-06, + "loss": 0.6752, + "step": 1496 + }, + { + "epoch": 0.16, + "grad_norm": 1.274480650090562, + "learning_rate": 9.580072837223857e-06, + "loss": 0.6004, + "step": 1497 + }, + { + "epoch": 0.16, + "grad_norm": 2.4619991807752157, + "learning_rate": 9.579388924561625e-06, + "loss": 0.7528, + "step": 1498 + }, + { + "epoch": 0.16, + "grad_norm": 2.69869982453989, + "learning_rate": 9.578704479879225e-06, + "loss": 0.6724, + "step": 1499 + }, + { + "epoch": 0.16, + "grad_norm": 2.587551021620836, + "learning_rate": 9.578019503256175e-06, + "loss": 0.661, + "step": 1500 + }, + { + "epoch": 0.16, + "grad_norm": 2.5984643485005297, + "learning_rate": 9.577333994772052e-06, + "loss": 0.7543, + "step": 1501 + }, + { + "epoch": 0.16, + "grad_norm": 3.106286735099979, + "learning_rate": 9.576647954506498e-06, + "loss": 0.7503, + "step": 1502 + }, + { + "epoch": 0.16, + "grad_norm": 2.5619268778049284, + "learning_rate": 9.575961382539211e-06, + "loss": 0.7057, + "step": 1503 + }, + { + "epoch": 0.16, + "grad_norm": 2.6307425334372945, + "learning_rate": 9.575274278949962e-06, + "loss": 0.7321, + "step": 1504 + }, + { + "epoch": 0.16, + "grad_norm": 2.8345146127858487, + "learning_rate": 9.574586643818572e-06, + "loss": 0.7287, + "step": 1505 + }, + { + "epoch": 0.16, + "grad_norm": 3.26642048267166, + "learning_rate": 9.57389847722493e-06, + "loss": 0.7251, + "step": 1506 + }, + { + "epoch": 0.16, + "grad_norm": 2.5229552614331263, + "learning_rate": 9.573209779248985e-06, + "loss": 0.7519, + "step": 1507 + }, + { + "epoch": 0.16, + "grad_norm": 2.80455601855804, + "learning_rate": 9.572520549970746e-06, + "loss": 0.7422, + "step": 1508 + }, + { + "epoch": 0.16, + "grad_norm": 2.5636231475624394, + "learning_rate": 9.571830789470288e-06, + "loss": 0.6824, + "step": 1509 + }, + { + "epoch": 0.16, + "grad_norm": 2.455344120878093, + "learning_rate": 9.571140497827746e-06, + "loss": 0.7862, + "step": 1510 + }, + { + "epoch": 0.16, + "grad_norm": 2.9947982231220225, + "learning_rate": 9.570449675123313e-06, + "loss": 0.8056, + "step": 1511 + }, + { + "epoch": 0.16, + "grad_norm": 3.199158359526649, + "learning_rate": 9.56975832143725e-06, + "loss": 0.7388, + "step": 1512 + }, + { + "epoch": 0.16, + "grad_norm": 2.481563398604672, + "learning_rate": 9.569066436849875e-06, + "loss": 0.7722, + "step": 1513 + }, + { + "epoch": 0.16, + "grad_norm": 2.7913702575178583, + "learning_rate": 9.568374021441567e-06, + "loss": 0.6772, + "step": 1514 + }, + { + "epoch": 0.16, + "grad_norm": 2.3892593790051593, + "learning_rate": 9.567681075292774e-06, + "loss": 0.7162, + "step": 1515 + }, + { + "epoch": 0.16, + "grad_norm": 2.3903154524770183, + "learning_rate": 9.566987598483995e-06, + "loss": 0.6662, + "step": 1516 + }, + { + "epoch": 0.16, + "grad_norm": 2.6872816236265624, + "learning_rate": 9.5662935910958e-06, + "loss": 0.6746, + "step": 1517 + }, + { + "epoch": 0.16, + "grad_norm": 2.1845449131190704, + "learning_rate": 9.565599053208815e-06, + "loss": 0.713, + "step": 1518 + }, + { + "epoch": 0.16, + "grad_norm": 2.520639847963599, + "learning_rate": 9.564903984903731e-06, + "loss": 0.6745, + "step": 1519 + }, + { + "epoch": 0.16, + "grad_norm": 2.3349303925717697, + "learning_rate": 9.564208386261296e-06, + "loss": 0.6643, + "step": 1520 + }, + { + "epoch": 0.16, + "grad_norm": 2.883067515708215, + "learning_rate": 9.563512257362325e-06, + "loss": 0.7143, + "step": 1521 + }, + { + "epoch": 0.16, + "grad_norm": 2.403562641572306, + "learning_rate": 9.56281559828769e-06, + "loss": 0.5652, + "step": 1522 + }, + { + "epoch": 0.16, + "grad_norm": 6.692764536743657, + "learning_rate": 9.562118409118334e-06, + "loss": 0.7098, + "step": 1523 + }, + { + "epoch": 0.16, + "grad_norm": 2.24853739667993, + "learning_rate": 9.561420689935243e-06, + "loss": 0.7346, + "step": 1524 + }, + { + "epoch": 0.16, + "grad_norm": 4.847945930120676, + "learning_rate": 9.560722440819485e-06, + "loss": 0.6834, + "step": 1525 + }, + { + "epoch": 0.16, + "grad_norm": 2.9793899903106524, + "learning_rate": 9.560023661852178e-06, + "loss": 0.6984, + "step": 1526 + }, + { + "epoch": 0.16, + "grad_norm": 2.696356145427722, + "learning_rate": 9.559324353114503e-06, + "loss": 0.7272, + "step": 1527 + }, + { + "epoch": 0.16, + "grad_norm": 2.2812497097713216, + "learning_rate": 9.558624514687703e-06, + "loss": 0.761, + "step": 1528 + }, + { + "epoch": 0.16, + "grad_norm": 2.271576386159374, + "learning_rate": 9.557924146653087e-06, + "loss": 0.7657, + "step": 1529 + }, + { + "epoch": 0.16, + "grad_norm": 2.7171270620464365, + "learning_rate": 9.557223249092017e-06, + "loss": 0.7263, + "step": 1530 + }, + { + "epoch": 0.16, + "grad_norm": 2.8535792247554834, + "learning_rate": 9.556521822085924e-06, + "loss": 0.7297, + "step": 1531 + }, + { + "epoch": 0.16, + "grad_norm": 2.4140439514306706, + "learning_rate": 9.555819865716298e-06, + "loss": 0.6768, + "step": 1532 + }, + { + "epoch": 0.16, + "grad_norm": 2.9490982350238957, + "learning_rate": 9.555117380064689e-06, + "loss": 0.698, + "step": 1533 + }, + { + "epoch": 0.16, + "grad_norm": 2.49780482074063, + "learning_rate": 9.55441436521271e-06, + "loss": 0.6453, + "step": 1534 + }, + { + "epoch": 0.16, + "grad_norm": 3.075811387978172, + "learning_rate": 9.553710821242036e-06, + "loss": 0.6666, + "step": 1535 + }, + { + "epoch": 0.16, + "grad_norm": 3.0553855565019585, + "learning_rate": 9.553006748234402e-06, + "loss": 0.7491, + "step": 1536 + }, + { + "epoch": 0.16, + "grad_norm": 4.1401242046590845, + "learning_rate": 9.552302146271606e-06, + "loss": 0.7782, + "step": 1537 + }, + { + "epoch": 0.16, + "grad_norm": 2.42939226201379, + "learning_rate": 9.551597015435505e-06, + "loss": 0.7143, + "step": 1538 + }, + { + "epoch": 0.16, + "grad_norm": 2.82555463707085, + "learning_rate": 9.550891355808022e-06, + "loss": 0.7285, + "step": 1539 + }, + { + "epoch": 0.16, + "grad_norm": 2.6022856991586036, + "learning_rate": 9.550185167471134e-06, + "loss": 0.6323, + "step": 1540 + }, + { + "epoch": 0.16, + "grad_norm": 2.716216112330096, + "learning_rate": 9.549478450506888e-06, + "loss": 0.6955, + "step": 1541 + }, + { + "epoch": 0.16, + "grad_norm": 2.9288414453383163, + "learning_rate": 9.548771204997387e-06, + "loss": 0.6652, + "step": 1542 + }, + { + "epoch": 0.16, + "grad_norm": 2.9260455534051664, + "learning_rate": 9.548063431024797e-06, + "loss": 0.6677, + "step": 1543 + }, + { + "epoch": 0.16, + "grad_norm": 2.6013304107381066, + "learning_rate": 9.547355128671343e-06, + "loss": 0.6632, + "step": 1544 + }, + { + "epoch": 0.16, + "grad_norm": 2.7326280366062523, + "learning_rate": 9.546646298019315e-06, + "loss": 0.7204, + "step": 1545 + }, + { + "epoch": 0.16, + "grad_norm": 2.1529382962472567, + "learning_rate": 9.545936939151066e-06, + "loss": 0.7363, + "step": 1546 + }, + { + "epoch": 0.16, + "grad_norm": 3.48827212379924, + "learning_rate": 9.545227052149002e-06, + "loss": 0.761, + "step": 1547 + }, + { + "epoch": 0.16, + "grad_norm": 2.6924419534835637, + "learning_rate": 9.544516637095598e-06, + "loss": 0.7262, + "step": 1548 + }, + { + "epoch": 0.16, + "grad_norm": 2.4210226974371554, + "learning_rate": 9.54380569407339e-06, + "loss": 0.6652, + "step": 1549 + }, + { + "epoch": 0.16, + "grad_norm": 2.91586157827423, + "learning_rate": 9.543094223164967e-06, + "loss": 0.6746, + "step": 1550 + }, + { + "epoch": 0.16, + "grad_norm": 2.8878136097168228, + "learning_rate": 9.542382224452995e-06, + "loss": 0.6712, + "step": 1551 + }, + { + "epoch": 0.16, + "grad_norm": 2.485973778059242, + "learning_rate": 9.541669698020184e-06, + "loss": 0.7157, + "step": 1552 + }, + { + "epoch": 0.16, + "grad_norm": 2.6385514951897298, + "learning_rate": 9.540956643949317e-06, + "loss": 0.7383, + "step": 1553 + }, + { + "epoch": 0.16, + "grad_norm": 3.2286222175015378, + "learning_rate": 9.540243062323234e-06, + "loss": 0.6431, + "step": 1554 + }, + { + "epoch": 0.16, + "grad_norm": 2.6255728686862043, + "learning_rate": 9.539528953224835e-06, + "loss": 0.6488, + "step": 1555 + }, + { + "epoch": 0.16, + "grad_norm": 5.294558982143112, + "learning_rate": 9.538814316737085e-06, + "loss": 0.7291, + "step": 1556 + }, + { + "epoch": 0.16, + "grad_norm": 2.7649291179617075, + "learning_rate": 9.53809915294301e-06, + "loss": 0.7938, + "step": 1557 + }, + { + "epoch": 0.16, + "grad_norm": 2.6080959910423047, + "learning_rate": 9.53738346192569e-06, + "loss": 0.6932, + "step": 1558 + }, + { + "epoch": 0.16, + "grad_norm": 3.0121425622421936, + "learning_rate": 9.536667243768279e-06, + "loss": 0.689, + "step": 1559 + }, + { + "epoch": 0.16, + "grad_norm": 3.332705210852358, + "learning_rate": 9.53595049855398e-06, + "loss": 0.7032, + "step": 1560 + }, + { + "epoch": 0.16, + "grad_norm": 2.5824358159901704, + "learning_rate": 9.535233226366067e-06, + "loss": 0.6877, + "step": 1561 + }, + { + "epoch": 0.16, + "grad_norm": 2.7223244944225486, + "learning_rate": 9.534515427287865e-06, + "loss": 0.6517, + "step": 1562 + }, + { + "epoch": 0.16, + "grad_norm": 2.0686395234828074, + "learning_rate": 9.53379710140277e-06, + "loss": 0.7079, + "step": 1563 + }, + { + "epoch": 0.16, + "grad_norm": 3.0167637777660223, + "learning_rate": 9.533078248794232e-06, + "loss": 0.6687, + "step": 1564 + }, + { + "epoch": 0.16, + "grad_norm": 2.432973784171849, + "learning_rate": 9.532358869545767e-06, + "loss": 0.7802, + "step": 1565 + }, + { + "epoch": 0.16, + "grad_norm": 2.2926310921200885, + "learning_rate": 9.531638963740953e-06, + "loss": 0.7231, + "step": 1566 + }, + { + "epoch": 0.16, + "grad_norm": 2.6393048434940276, + "learning_rate": 9.530918531463423e-06, + "loss": 0.6623, + "step": 1567 + }, + { + "epoch": 0.17, + "grad_norm": 2.481069201927267, + "learning_rate": 9.530197572796873e-06, + "loss": 0.735, + "step": 1568 + }, + { + "epoch": 0.17, + "grad_norm": 2.6013622318011276, + "learning_rate": 9.529476087825067e-06, + "loss": 0.6855, + "step": 1569 + }, + { + "epoch": 0.17, + "grad_norm": 3.4249010091697434, + "learning_rate": 9.528754076631821e-06, + "loss": 0.7398, + "step": 1570 + }, + { + "epoch": 0.17, + "grad_norm": 2.2070633925039465, + "learning_rate": 9.528031539301016e-06, + "loss": 0.7574, + "step": 1571 + }, + { + "epoch": 0.17, + "grad_norm": 2.555841750294239, + "learning_rate": 9.5273084759166e-06, + "loss": 0.6432, + "step": 1572 + }, + { + "epoch": 0.17, + "grad_norm": 1.457477490220107, + "learning_rate": 9.526584886562571e-06, + "loss": 0.6151, + "step": 1573 + }, + { + "epoch": 0.17, + "grad_norm": 3.048705570425487, + "learning_rate": 9.525860771322995e-06, + "loss": 0.7126, + "step": 1574 + }, + { + "epoch": 0.17, + "grad_norm": 3.438910164173861, + "learning_rate": 9.525136130281995e-06, + "loss": 0.79, + "step": 1575 + }, + { + "epoch": 0.17, + "grad_norm": 4.28221697375792, + "learning_rate": 9.524410963523763e-06, + "loss": 0.7654, + "step": 1576 + }, + { + "epoch": 0.17, + "grad_norm": 3.0467966030740645, + "learning_rate": 9.523685271132543e-06, + "loss": 0.7272, + "step": 1577 + }, + { + "epoch": 0.17, + "grad_norm": 2.3181479200807398, + "learning_rate": 9.522959053192646e-06, + "loss": 0.7779, + "step": 1578 + }, + { + "epoch": 0.17, + "grad_norm": 2.571771113992117, + "learning_rate": 9.522232309788439e-06, + "loss": 0.7544, + "step": 1579 + }, + { + "epoch": 0.17, + "grad_norm": 3.1183269342095508, + "learning_rate": 9.521505041004356e-06, + "loss": 0.7422, + "step": 1580 + }, + { + "epoch": 0.17, + "grad_norm": 2.416853737352066, + "learning_rate": 9.520777246924887e-06, + "loss": 0.7424, + "step": 1581 + }, + { + "epoch": 0.17, + "grad_norm": 2.885932690234875, + "learning_rate": 9.520048927634587e-06, + "loss": 0.6748, + "step": 1582 + }, + { + "epoch": 0.17, + "grad_norm": 2.615546636686311, + "learning_rate": 9.519320083218067e-06, + "loss": 0.7059, + "step": 1583 + }, + { + "epoch": 0.17, + "grad_norm": 2.755154151212538, + "learning_rate": 9.518590713760004e-06, + "loss": 0.7826, + "step": 1584 + }, + { + "epoch": 0.17, + "grad_norm": 2.132180252839951, + "learning_rate": 9.517860819345136e-06, + "loss": 0.6665, + "step": 1585 + }, + { + "epoch": 0.17, + "grad_norm": 2.255957083552413, + "learning_rate": 9.517130400058255e-06, + "loss": 0.7323, + "step": 1586 + }, + { + "epoch": 0.17, + "grad_norm": 2.2483764789100205, + "learning_rate": 9.516399455984222e-06, + "loss": 0.7601, + "step": 1587 + }, + { + "epoch": 0.17, + "grad_norm": 3.352631648887012, + "learning_rate": 9.515667987207958e-06, + "loss": 0.7785, + "step": 1588 + }, + { + "epoch": 0.17, + "grad_norm": 2.0869685444798676, + "learning_rate": 9.514935993814438e-06, + "loss": 0.6793, + "step": 1589 + }, + { + "epoch": 0.17, + "grad_norm": 2.3425785545443816, + "learning_rate": 9.514203475888706e-06, + "loss": 0.781, + "step": 1590 + }, + { + "epoch": 0.17, + "grad_norm": 2.86315297570834, + "learning_rate": 9.513470433515866e-06, + "loss": 0.6503, + "step": 1591 + }, + { + "epoch": 0.17, + "grad_norm": 3.1652116829667896, + "learning_rate": 9.512736866781076e-06, + "loss": 0.681, + "step": 1592 + }, + { + "epoch": 0.17, + "grad_norm": 3.766273085621574, + "learning_rate": 9.512002775769562e-06, + "loss": 0.7644, + "step": 1593 + }, + { + "epoch": 0.17, + "grad_norm": 2.2909986490602074, + "learning_rate": 9.511268160566609e-06, + "loss": 0.6739, + "step": 1594 + }, + { + "epoch": 0.17, + "grad_norm": 2.075660637359557, + "learning_rate": 9.510533021257562e-06, + "loss": 0.6721, + "step": 1595 + }, + { + "epoch": 0.17, + "grad_norm": 2.21371811437727, + "learning_rate": 9.509797357927826e-06, + "loss": 0.6669, + "step": 1596 + }, + { + "epoch": 0.17, + "grad_norm": 2.2650689770495087, + "learning_rate": 9.50906117066287e-06, + "loss": 0.6818, + "step": 1597 + }, + { + "epoch": 0.17, + "grad_norm": 8.894316490655317, + "learning_rate": 9.508324459548221e-06, + "loss": 0.6128, + "step": 1598 + }, + { + "epoch": 0.17, + "grad_norm": 2.363686348557055, + "learning_rate": 9.50758722466947e-06, + "loss": 0.6766, + "step": 1599 + }, + { + "epoch": 0.17, + "grad_norm": 2.1204545096681504, + "learning_rate": 9.506849466112264e-06, + "loss": 0.7487, + "step": 1600 + }, + { + "epoch": 0.17, + "grad_norm": 2.445721487013043, + "learning_rate": 9.506111183962316e-06, + "loss": 0.66, + "step": 1601 + }, + { + "epoch": 0.17, + "grad_norm": 2.9704281101668144, + "learning_rate": 9.505372378305398e-06, + "loss": 0.6566, + "step": 1602 + }, + { + "epoch": 0.17, + "grad_norm": 4.032858460518241, + "learning_rate": 9.504633049227338e-06, + "loss": 0.7025, + "step": 1603 + }, + { + "epoch": 0.17, + "grad_norm": 2.3197467695242198, + "learning_rate": 9.503893196814034e-06, + "loss": 0.6593, + "step": 1604 + }, + { + "epoch": 0.17, + "grad_norm": 3.003637291105467, + "learning_rate": 9.503152821151435e-06, + "loss": 0.7282, + "step": 1605 + }, + { + "epoch": 0.17, + "grad_norm": 2.974823409792684, + "learning_rate": 9.502411922325561e-06, + "loss": 0.644, + "step": 1606 + }, + { + "epoch": 0.17, + "grad_norm": 2.3999354547850578, + "learning_rate": 9.501670500422483e-06, + "loss": 0.7695, + "step": 1607 + }, + { + "epoch": 0.17, + "grad_norm": 3.142048720747408, + "learning_rate": 9.500928555528341e-06, + "loss": 0.697, + "step": 1608 + }, + { + "epoch": 0.17, + "grad_norm": 2.1918483417994215, + "learning_rate": 9.500186087729331e-06, + "loss": 0.6865, + "step": 1609 + }, + { + "epoch": 0.17, + "grad_norm": 2.3130950683881673, + "learning_rate": 9.49944309711171e-06, + "loss": 0.6882, + "step": 1610 + }, + { + "epoch": 0.17, + "grad_norm": 2.2722152426733166, + "learning_rate": 9.498699583761795e-06, + "loss": 0.7057, + "step": 1611 + }, + { + "epoch": 0.17, + "grad_norm": 2.3067065149092674, + "learning_rate": 9.497955547765966e-06, + "loss": 0.6129, + "step": 1612 + }, + { + "epoch": 0.17, + "grad_norm": 3.000454383678098, + "learning_rate": 9.497210989210665e-06, + "loss": 0.709, + "step": 1613 + }, + { + "epoch": 0.17, + "grad_norm": 2.6049845885606016, + "learning_rate": 9.49646590818239e-06, + "loss": 0.7035, + "step": 1614 + }, + { + "epoch": 0.17, + "grad_norm": 4.5829176336530795, + "learning_rate": 9.495720304767705e-06, + "loss": 0.6982, + "step": 1615 + }, + { + "epoch": 0.17, + "grad_norm": 2.8362441150996003, + "learning_rate": 9.494974179053233e-06, + "loss": 0.7086, + "step": 1616 + }, + { + "epoch": 0.17, + "grad_norm": 7.6175372207495045, + "learning_rate": 9.494227531125652e-06, + "loss": 0.7028, + "step": 1617 + }, + { + "epoch": 0.17, + "grad_norm": 3.522650359801413, + "learning_rate": 9.493480361071707e-06, + "loss": 0.7281, + "step": 1618 + }, + { + "epoch": 0.17, + "grad_norm": 2.4815378042784713, + "learning_rate": 9.492732668978205e-06, + "loss": 0.6899, + "step": 1619 + }, + { + "epoch": 0.17, + "grad_norm": 3.439653241146134, + "learning_rate": 9.491984454932009e-06, + "loss": 0.6359, + "step": 1620 + }, + { + "epoch": 0.17, + "grad_norm": 2.322479399522989, + "learning_rate": 9.491235719020042e-06, + "loss": 0.6365, + "step": 1621 + }, + { + "epoch": 0.17, + "grad_norm": 2.8722680257735367, + "learning_rate": 9.490486461329293e-06, + "loss": 0.7157, + "step": 1622 + }, + { + "epoch": 0.17, + "grad_norm": 2.8325212931006036, + "learning_rate": 9.489736681946809e-06, + "loss": 0.7014, + "step": 1623 + }, + { + "epoch": 0.17, + "grad_norm": 2.8149300073276424, + "learning_rate": 9.488986380959694e-06, + "loss": 0.7507, + "step": 1624 + }, + { + "epoch": 0.17, + "grad_norm": 4.399930194308783, + "learning_rate": 9.488235558455118e-06, + "loss": 0.7731, + "step": 1625 + }, + { + "epoch": 0.17, + "grad_norm": 3.4240902602318832, + "learning_rate": 9.487484214520308e-06, + "loss": 0.6616, + "step": 1626 + }, + { + "epoch": 0.17, + "grad_norm": 3.3880823758905465, + "learning_rate": 9.486732349242556e-06, + "loss": 0.6865, + "step": 1627 + }, + { + "epoch": 0.17, + "grad_norm": 2.434462113134042, + "learning_rate": 9.485979962709209e-06, + "loss": 0.7546, + "step": 1628 + }, + { + "epoch": 0.17, + "grad_norm": 3.140235982386299, + "learning_rate": 9.485227055007676e-06, + "loss": 0.6478, + "step": 1629 + }, + { + "epoch": 0.17, + "grad_norm": 2.6979154962012895, + "learning_rate": 9.48447362622543e-06, + "loss": 0.7336, + "step": 1630 + }, + { + "epoch": 0.17, + "grad_norm": 3.564891032405616, + "learning_rate": 9.483719676450003e-06, + "loss": 0.6995, + "step": 1631 + }, + { + "epoch": 0.17, + "grad_norm": 5.186995520230965, + "learning_rate": 9.482965205768983e-06, + "loss": 0.7284, + "step": 1632 + }, + { + "epoch": 0.17, + "grad_norm": 2.93274944592198, + "learning_rate": 9.482210214270026e-06, + "loss": 0.7065, + "step": 1633 + }, + { + "epoch": 0.17, + "grad_norm": 2.3964166552732427, + "learning_rate": 9.481454702040842e-06, + "loss": 0.7386, + "step": 1634 + }, + { + "epoch": 0.17, + "grad_norm": 2.290346577796857, + "learning_rate": 9.480698669169207e-06, + "loss": 0.6791, + "step": 1635 + }, + { + "epoch": 0.17, + "grad_norm": 6.335965960819836, + "learning_rate": 9.479942115742951e-06, + "loss": 0.6365, + "step": 1636 + }, + { + "epoch": 0.17, + "grad_norm": 1.9605346112705178, + "learning_rate": 9.47918504184997e-06, + "loss": 0.6899, + "step": 1637 + }, + { + "epoch": 0.17, + "grad_norm": 2.7418810164216705, + "learning_rate": 9.47842744757822e-06, + "loss": 0.6344, + "step": 1638 + }, + { + "epoch": 0.17, + "grad_norm": 2.2563707246896096, + "learning_rate": 9.477669333015714e-06, + "loss": 0.5757, + "step": 1639 + }, + { + "epoch": 0.17, + "grad_norm": 3.013900658553373, + "learning_rate": 9.476910698250529e-06, + "loss": 0.7077, + "step": 1640 + }, + { + "epoch": 0.17, + "grad_norm": 2.396575665358366, + "learning_rate": 9.4761515433708e-06, + "loss": 0.7228, + "step": 1641 + }, + { + "epoch": 0.17, + "grad_norm": 2.40289964027477, + "learning_rate": 9.475391868464725e-06, + "loss": 0.7134, + "step": 1642 + }, + { + "epoch": 0.17, + "grad_norm": 2.387400572157838, + "learning_rate": 9.474631673620558e-06, + "loss": 0.7249, + "step": 1643 + }, + { + "epoch": 0.17, + "grad_norm": 2.470393309910611, + "learning_rate": 9.47387095892662e-06, + "loss": 0.7756, + "step": 1644 + }, + { + "epoch": 0.17, + "grad_norm": 2.318559436180576, + "learning_rate": 9.473109724471287e-06, + "loss": 0.6502, + "step": 1645 + }, + { + "epoch": 0.17, + "grad_norm": 2.5401989407597005, + "learning_rate": 9.472347970342995e-06, + "loss": 0.6817, + "step": 1646 + }, + { + "epoch": 0.17, + "grad_norm": 3.2594402322006175, + "learning_rate": 9.471585696630245e-06, + "loss": 0.7104, + "step": 1647 + }, + { + "epoch": 0.17, + "grad_norm": 1.1769351591530914, + "learning_rate": 9.470822903421595e-06, + "loss": 0.6472, + "step": 1648 + }, + { + "epoch": 0.17, + "grad_norm": 2.540121387003377, + "learning_rate": 9.470059590805663e-06, + "loss": 0.715, + "step": 1649 + }, + { + "epoch": 0.17, + "grad_norm": 2.3993005705838404, + "learning_rate": 9.46929575887113e-06, + "loss": 0.7426, + "step": 1650 + }, + { + "epoch": 0.17, + "grad_norm": 2.4076747230692574, + "learning_rate": 9.468531407706733e-06, + "loss": 0.6518, + "step": 1651 + }, + { + "epoch": 0.17, + "grad_norm": 2.3345495527645443, + "learning_rate": 9.467766537401278e-06, + "loss": 0.6538, + "step": 1652 + }, + { + "epoch": 0.17, + "grad_norm": 2.2658417056945064, + "learning_rate": 9.46700114804362e-06, + "loss": 0.6715, + "step": 1653 + }, + { + "epoch": 0.17, + "grad_norm": 3.2719118199932598, + "learning_rate": 9.46623523972268e-06, + "loss": 0.6762, + "step": 1654 + }, + { + "epoch": 0.17, + "grad_norm": 3.0582788277872037, + "learning_rate": 9.465468812527443e-06, + "loss": 0.629, + "step": 1655 + }, + { + "epoch": 0.17, + "grad_norm": 2.247235978920937, + "learning_rate": 9.464701866546945e-06, + "loss": 0.7408, + "step": 1656 + }, + { + "epoch": 0.17, + "grad_norm": 3.155468792797194, + "learning_rate": 9.463934401870292e-06, + "loss": 0.613, + "step": 1657 + }, + { + "epoch": 0.17, + "grad_norm": 2.544248700229081, + "learning_rate": 9.463166418586645e-06, + "loss": 0.7269, + "step": 1658 + }, + { + "epoch": 0.17, + "grad_norm": 2.3313482073212315, + "learning_rate": 9.462397916785222e-06, + "loss": 0.7243, + "step": 1659 + }, + { + "epoch": 0.17, + "grad_norm": 2.82456167117737, + "learning_rate": 9.461628896555312e-06, + "loss": 0.7121, + "step": 1660 + }, + { + "epoch": 0.17, + "grad_norm": 5.726685554503556, + "learning_rate": 9.460859357986251e-06, + "loss": 0.6316, + "step": 1661 + }, + { + "epoch": 0.17, + "grad_norm": 3.542875855778497, + "learning_rate": 9.460089301167448e-06, + "loss": 0.6625, + "step": 1662 + }, + { + "epoch": 0.17, + "grad_norm": 2.86229240642419, + "learning_rate": 9.45931872618836e-06, + "loss": 0.6818, + "step": 1663 + }, + { + "epoch": 0.18, + "grad_norm": 2.656197150207348, + "learning_rate": 9.458547633138515e-06, + "loss": 0.6801, + "step": 1664 + }, + { + "epoch": 0.18, + "grad_norm": 3.243631283233658, + "learning_rate": 9.457776022107494e-06, + "loss": 0.6679, + "step": 1665 + }, + { + "epoch": 0.18, + "grad_norm": 2.5293787058884636, + "learning_rate": 9.45700389318494e-06, + "loss": 0.7386, + "step": 1666 + }, + { + "epoch": 0.18, + "grad_norm": 2.310070540436612, + "learning_rate": 9.456231246460557e-06, + "loss": 0.774, + "step": 1667 + }, + { + "epoch": 0.18, + "grad_norm": 2.508606440342495, + "learning_rate": 9.455458082024112e-06, + "loss": 0.7392, + "step": 1668 + }, + { + "epoch": 0.18, + "grad_norm": 2.2920775313114405, + "learning_rate": 9.454684399965423e-06, + "loss": 0.758, + "step": 1669 + }, + { + "epoch": 0.18, + "grad_norm": 9.041893234843545, + "learning_rate": 9.453910200374382e-06, + "loss": 0.6369, + "step": 1670 + }, + { + "epoch": 0.18, + "grad_norm": 1.2931690055491833, + "learning_rate": 9.453135483340925e-06, + "loss": 0.6547, + "step": 1671 + }, + { + "epoch": 0.18, + "grad_norm": 1.3115301263653227, + "learning_rate": 9.452360248955062e-06, + "loss": 0.6449, + "step": 1672 + }, + { + "epoch": 0.18, + "grad_norm": 2.9637645722219323, + "learning_rate": 9.451584497306856e-06, + "loss": 0.6999, + "step": 1673 + }, + { + "epoch": 0.18, + "grad_norm": 2.3068845068680326, + "learning_rate": 9.45080822848643e-06, + "loss": 0.6948, + "step": 1674 + }, + { + "epoch": 0.18, + "grad_norm": 2.2231756130525673, + "learning_rate": 9.45003144258397e-06, + "loss": 0.6791, + "step": 1675 + }, + { + "epoch": 0.18, + "grad_norm": 3.320104844855623, + "learning_rate": 9.449254139689721e-06, + "loss": 0.6537, + "step": 1676 + }, + { + "epoch": 0.18, + "grad_norm": 2.471206074285933, + "learning_rate": 9.448476319893989e-06, + "loss": 0.7204, + "step": 1677 + }, + { + "epoch": 0.18, + "grad_norm": 2.4573874653738934, + "learning_rate": 9.447697983287136e-06, + "loss": 0.6883, + "step": 1678 + }, + { + "epoch": 0.18, + "grad_norm": 2.2768018435285424, + "learning_rate": 9.446919129959589e-06, + "loss": 0.6646, + "step": 1679 + }, + { + "epoch": 0.18, + "grad_norm": 2.5304009939179433, + "learning_rate": 9.44613976000183e-06, + "loss": 0.6923, + "step": 1680 + }, + { + "epoch": 0.18, + "grad_norm": 2.3183304739700534, + "learning_rate": 9.44535987350441e-06, + "loss": 0.82, + "step": 1681 + }, + { + "epoch": 0.18, + "grad_norm": 2.293573173274393, + "learning_rate": 9.44457947055793e-06, + "loss": 0.7597, + "step": 1682 + }, + { + "epoch": 0.18, + "grad_norm": 2.820715534237888, + "learning_rate": 9.443798551253052e-06, + "loss": 0.7216, + "step": 1683 + }, + { + "epoch": 0.18, + "grad_norm": 2.122577759545682, + "learning_rate": 9.443017115680503e-06, + "loss": 0.7303, + "step": 1684 + }, + { + "epoch": 0.18, + "grad_norm": 2.653511448510485, + "learning_rate": 9.442235163931072e-06, + "loss": 0.6314, + "step": 1685 + }, + { + "epoch": 0.18, + "grad_norm": 2.186415024401008, + "learning_rate": 9.441452696095601e-06, + "loss": 0.755, + "step": 1686 + }, + { + "epoch": 0.18, + "grad_norm": 2.3274053893757563, + "learning_rate": 9.440669712264994e-06, + "loss": 0.7157, + "step": 1687 + }, + { + "epoch": 0.18, + "grad_norm": 1.9206038169379611, + "learning_rate": 9.439886212530217e-06, + "loss": 0.7278, + "step": 1688 + }, + { + "epoch": 0.18, + "grad_norm": 2.974321572145424, + "learning_rate": 9.439102196982292e-06, + "loss": 0.6442, + "step": 1689 + }, + { + "epoch": 0.18, + "grad_norm": 1.9034939575257075, + "learning_rate": 9.438317665712308e-06, + "loss": 0.7244, + "step": 1690 + }, + { + "epoch": 0.18, + "grad_norm": 2.5630336170505945, + "learning_rate": 9.437532618811407e-06, + "loss": 0.7238, + "step": 1691 + }, + { + "epoch": 0.18, + "grad_norm": 2.7600588498607768, + "learning_rate": 9.436747056370794e-06, + "loss": 0.652, + "step": 1692 + }, + { + "epoch": 0.18, + "grad_norm": 2.4504076582596195, + "learning_rate": 9.435960978481734e-06, + "loss": 0.7314, + "step": 1693 + }, + { + "epoch": 0.18, + "grad_norm": 2.51073812241736, + "learning_rate": 9.435174385235548e-06, + "loss": 0.7227, + "step": 1694 + }, + { + "epoch": 0.18, + "grad_norm": 2.846356911208405, + "learning_rate": 9.434387276723624e-06, + "loss": 0.6857, + "step": 1695 + }, + { + "epoch": 0.18, + "grad_norm": 2.703615406650167, + "learning_rate": 9.433599653037406e-06, + "loss": 0.5352, + "step": 1696 + }, + { + "epoch": 0.18, + "grad_norm": 2.3457312870011955, + "learning_rate": 9.432811514268396e-06, + "loss": 0.7758, + "step": 1697 + }, + { + "epoch": 0.18, + "grad_norm": 2.271626563468226, + "learning_rate": 9.432022860508158e-06, + "loss": 0.7422, + "step": 1698 + }, + { + "epoch": 0.18, + "grad_norm": 2.3311293386557175, + "learning_rate": 9.431233691848316e-06, + "loss": 0.6922, + "step": 1699 + }, + { + "epoch": 0.18, + "grad_norm": 3.13275822233704, + "learning_rate": 9.430444008380553e-06, + "loss": 0.6746, + "step": 1700 + }, + { + "epoch": 0.18, + "grad_norm": 2.3910164531032594, + "learning_rate": 9.429653810196611e-06, + "loss": 0.7156, + "step": 1701 + }, + { + "epoch": 0.18, + "grad_norm": 2.25390169947036, + "learning_rate": 9.428863097388295e-06, + "loss": 0.7435, + "step": 1702 + }, + { + "epoch": 0.18, + "grad_norm": 1.9560405228811595, + "learning_rate": 9.428071870047469e-06, + "loss": 0.7, + "step": 1703 + }, + { + "epoch": 0.18, + "grad_norm": 1.8824075331234948, + "learning_rate": 9.427280128266049e-06, + "loss": 0.7596, + "step": 1704 + }, + { + "epoch": 0.18, + "grad_norm": 2.286053461352808, + "learning_rate": 9.426487872136025e-06, + "loss": 0.7239, + "step": 1705 + }, + { + "epoch": 0.18, + "grad_norm": 2.008603418544589, + "learning_rate": 9.425695101749435e-06, + "loss": 0.7413, + "step": 1706 + }, + { + "epoch": 0.18, + "grad_norm": 2.10028582452023, + "learning_rate": 9.424901817198381e-06, + "loss": 0.6829, + "step": 1707 + }, + { + "epoch": 0.18, + "grad_norm": 2.1535649860037007, + "learning_rate": 9.424108018575026e-06, + "loss": 0.7089, + "step": 1708 + }, + { + "epoch": 0.18, + "grad_norm": 2.8660284485190903, + "learning_rate": 9.42331370597159e-06, + "loss": 0.7544, + "step": 1709 + }, + { + "epoch": 0.18, + "grad_norm": 2.5095078891354508, + "learning_rate": 9.422518879480353e-06, + "loss": 0.6405, + "step": 1710 + }, + { + "epoch": 0.18, + "grad_norm": 2.1188014421663874, + "learning_rate": 9.421723539193657e-06, + "loss": 0.6818, + "step": 1711 + }, + { + "epoch": 0.18, + "grad_norm": 3.163280526338815, + "learning_rate": 9.420927685203901e-06, + "loss": 0.6727, + "step": 1712 + }, + { + "epoch": 0.18, + "grad_norm": 3.7994284590304366, + "learning_rate": 9.42013131760355e-06, + "loss": 0.6459, + "step": 1713 + }, + { + "epoch": 0.18, + "grad_norm": 2.098234987273422, + "learning_rate": 9.419334436485117e-06, + "loss": 0.7406, + "step": 1714 + }, + { + "epoch": 0.18, + "grad_norm": 2.204421750962229, + "learning_rate": 9.418537041941185e-06, + "loss": 0.75, + "step": 1715 + }, + { + "epoch": 0.18, + "grad_norm": 2.551866812050452, + "learning_rate": 9.417739134064392e-06, + "loss": 0.7352, + "step": 1716 + }, + { + "epoch": 0.18, + "grad_norm": 2.1052536649230147, + "learning_rate": 9.416940712947436e-06, + "loss": 0.7457, + "step": 1717 + }, + { + "epoch": 0.18, + "grad_norm": 2.552123073690243, + "learning_rate": 9.416141778683077e-06, + "loss": 0.7008, + "step": 1718 + }, + { + "epoch": 0.18, + "grad_norm": 2.386726881220311, + "learning_rate": 9.415342331364132e-06, + "loss": 0.6845, + "step": 1719 + }, + { + "epoch": 0.18, + "grad_norm": 2.0231356594407064, + "learning_rate": 9.414542371083477e-06, + "loss": 0.6269, + "step": 1720 + }, + { + "epoch": 0.18, + "grad_norm": 3.3944077369294376, + "learning_rate": 9.413741897934052e-06, + "loss": 0.7178, + "step": 1721 + }, + { + "epoch": 0.18, + "grad_norm": 2.4163679275832997, + "learning_rate": 9.412940912008852e-06, + "loss": 0.6554, + "step": 1722 + }, + { + "epoch": 0.18, + "grad_norm": 2.1677504987549554, + "learning_rate": 9.412139413400933e-06, + "loss": 0.7565, + "step": 1723 + }, + { + "epoch": 0.18, + "grad_norm": 8.628059217990637, + "learning_rate": 9.41133740220341e-06, + "loss": 0.6513, + "step": 1724 + }, + { + "epoch": 0.18, + "grad_norm": 2.2085858844991972, + "learning_rate": 9.410534878509461e-06, + "loss": 0.6636, + "step": 1725 + }, + { + "epoch": 0.18, + "grad_norm": 1.9079694945157286, + "learning_rate": 9.40973184241232e-06, + "loss": 0.6428, + "step": 1726 + }, + { + "epoch": 0.18, + "grad_norm": 2.2280940372738156, + "learning_rate": 9.408928294005279e-06, + "loss": 0.6991, + "step": 1727 + }, + { + "epoch": 0.18, + "grad_norm": 2.2464560200093553, + "learning_rate": 9.408124233381695e-06, + "loss": 0.7057, + "step": 1728 + }, + { + "epoch": 0.18, + "grad_norm": 2.41547778019205, + "learning_rate": 9.40731966063498e-06, + "loss": 0.7175, + "step": 1729 + }, + { + "epoch": 0.18, + "grad_norm": 1.9493683126801489, + "learning_rate": 9.406514575858606e-06, + "loss": 0.6848, + "step": 1730 + }, + { + "epoch": 0.18, + "grad_norm": 2.653450440842846, + "learning_rate": 9.405708979146106e-06, + "loss": 0.7155, + "step": 1731 + }, + { + "epoch": 0.18, + "grad_norm": 2.2614446312235077, + "learning_rate": 9.404902870591076e-06, + "loss": 0.7237, + "step": 1732 + }, + { + "epoch": 0.18, + "grad_norm": 2.236025418767225, + "learning_rate": 9.40409625028716e-06, + "loss": 0.7383, + "step": 1733 + }, + { + "epoch": 0.18, + "grad_norm": 2.0068155617046903, + "learning_rate": 9.403289118328074e-06, + "loss": 0.624, + "step": 1734 + }, + { + "epoch": 0.18, + "grad_norm": 2.586872590910335, + "learning_rate": 9.402481474807588e-06, + "loss": 0.6978, + "step": 1735 + }, + { + "epoch": 0.18, + "grad_norm": 2.05535830543558, + "learning_rate": 9.401673319819529e-06, + "loss": 0.6669, + "step": 1736 + }, + { + "epoch": 0.18, + "grad_norm": 1.8884455652664238, + "learning_rate": 9.400864653457789e-06, + "loss": 0.697, + "step": 1737 + }, + { + "epoch": 0.18, + "grad_norm": 2.1851392337369266, + "learning_rate": 9.400055475816313e-06, + "loss": 0.6524, + "step": 1738 + }, + { + "epoch": 0.18, + "grad_norm": 2.576173165092392, + "learning_rate": 9.399245786989112e-06, + "loss": 0.7119, + "step": 1739 + }, + { + "epoch": 0.18, + "grad_norm": 2.620809902632731, + "learning_rate": 9.398435587070254e-06, + "loss": 0.7611, + "step": 1740 + }, + { + "epoch": 0.18, + "grad_norm": 2.8033777586962114, + "learning_rate": 9.397624876153862e-06, + "loss": 0.6888, + "step": 1741 + }, + { + "epoch": 0.18, + "grad_norm": 2.1952968707428666, + "learning_rate": 9.396813654334124e-06, + "loss": 0.729, + "step": 1742 + }, + { + "epoch": 0.18, + "grad_norm": 2.184788154360516, + "learning_rate": 9.396001921705287e-06, + "loss": 0.7522, + "step": 1743 + }, + { + "epoch": 0.18, + "grad_norm": 2.5561173456941186, + "learning_rate": 9.395189678361655e-06, + "loss": 0.6819, + "step": 1744 + }, + { + "epoch": 0.18, + "grad_norm": 2.9560468612635016, + "learning_rate": 9.39437692439759e-06, + "loss": 0.7499, + "step": 1745 + }, + { + "epoch": 0.18, + "grad_norm": 3.170377346969926, + "learning_rate": 9.393563659907516e-06, + "loss": 0.7307, + "step": 1746 + }, + { + "epoch": 0.18, + "grad_norm": 2.0529367396092386, + "learning_rate": 9.392749884985918e-06, + "loss": 0.6618, + "step": 1747 + }, + { + "epoch": 0.18, + "grad_norm": 2.5770411994159867, + "learning_rate": 9.391935599727336e-06, + "loss": 0.702, + "step": 1748 + }, + { + "epoch": 0.18, + "grad_norm": 2.1159695596217305, + "learning_rate": 9.391120804226372e-06, + "loss": 0.7339, + "step": 1749 + }, + { + "epoch": 0.18, + "grad_norm": 2.9557470831906363, + "learning_rate": 9.390305498577685e-06, + "loss": 0.829, + "step": 1750 + }, + { + "epoch": 0.18, + "grad_norm": 2.066429118959667, + "learning_rate": 9.389489682875999e-06, + "loss": 0.7344, + "step": 1751 + }, + { + "epoch": 0.18, + "grad_norm": 2.3728866613105315, + "learning_rate": 9.388673357216088e-06, + "loss": 0.6821, + "step": 1752 + }, + { + "epoch": 0.18, + "grad_norm": 2.5738421703420835, + "learning_rate": 9.387856521692795e-06, + "loss": 0.6881, + "step": 1753 + }, + { + "epoch": 0.18, + "grad_norm": 2.2173777484723933, + "learning_rate": 9.387039176401013e-06, + "loss": 0.6497, + "step": 1754 + }, + { + "epoch": 0.18, + "grad_norm": 2.8326929223772916, + "learning_rate": 9.386221321435702e-06, + "loss": 0.7752, + "step": 1755 + }, + { + "epoch": 0.18, + "grad_norm": 2.342472475816738, + "learning_rate": 9.385402956891878e-06, + "loss": 0.6642, + "step": 1756 + }, + { + "epoch": 0.18, + "grad_norm": 1.8421908380492378, + "learning_rate": 9.384584082864614e-06, + "loss": 0.628, + "step": 1757 + }, + { + "epoch": 0.18, + "grad_norm": 2.293083670289089, + "learning_rate": 9.383764699449047e-06, + "loss": 0.7642, + "step": 1758 + }, + { + "epoch": 0.19, + "grad_norm": 3.1881257410006234, + "learning_rate": 9.382944806740369e-06, + "loss": 0.697, + "step": 1759 + }, + { + "epoch": 0.19, + "grad_norm": 2.0454383098040125, + "learning_rate": 9.382124404833832e-06, + "loss": 0.6683, + "step": 1760 + }, + { + "epoch": 0.19, + "grad_norm": 2.39746694551876, + "learning_rate": 9.38130349382475e-06, + "loss": 0.7014, + "step": 1761 + }, + { + "epoch": 0.19, + "grad_norm": 2.646324899559431, + "learning_rate": 9.380482073808493e-06, + "loss": 0.6275, + "step": 1762 + }, + { + "epoch": 0.19, + "grad_norm": 2.31148966058067, + "learning_rate": 9.379660144880491e-06, + "loss": 0.6855, + "step": 1763 + }, + { + "epoch": 0.19, + "grad_norm": 2.6610941993286072, + "learning_rate": 9.378837707136235e-06, + "loss": 0.721, + "step": 1764 + }, + { + "epoch": 0.19, + "grad_norm": 2.181626035362435, + "learning_rate": 9.37801476067127e-06, + "loss": 0.7382, + "step": 1765 + }, + { + "epoch": 0.19, + "grad_norm": 2.5121904009240774, + "learning_rate": 9.377191305581208e-06, + "loss": 0.6953, + "step": 1766 + }, + { + "epoch": 0.19, + "grad_norm": 2.324736532829736, + "learning_rate": 9.376367341961712e-06, + "loss": 0.6536, + "step": 1767 + }, + { + "epoch": 0.19, + "grad_norm": 2.1731590595672206, + "learning_rate": 9.375542869908509e-06, + "loss": 0.6795, + "step": 1768 + }, + { + "epoch": 0.19, + "grad_norm": 2.0441280442692396, + "learning_rate": 9.374717889517384e-06, + "loss": 0.6707, + "step": 1769 + }, + { + "epoch": 0.19, + "grad_norm": 2.188945519966826, + "learning_rate": 9.373892400884182e-06, + "loss": 0.6427, + "step": 1770 + }, + { + "epoch": 0.19, + "grad_norm": 1.8292074188975915, + "learning_rate": 9.373066404104803e-06, + "loss": 0.6416, + "step": 1771 + }, + { + "epoch": 0.19, + "grad_norm": 2.2353048574201884, + "learning_rate": 9.37223989927521e-06, + "loss": 0.7069, + "step": 1772 + }, + { + "epoch": 0.19, + "grad_norm": 3.313904910526145, + "learning_rate": 9.371412886491424e-06, + "loss": 0.7505, + "step": 1773 + }, + { + "epoch": 0.19, + "grad_norm": 4.177329261346893, + "learning_rate": 9.370585365849527e-06, + "loss": 0.6287, + "step": 1774 + }, + { + "epoch": 0.19, + "grad_norm": 2.509687395524509, + "learning_rate": 9.369757337445655e-06, + "loss": 0.6938, + "step": 1775 + }, + { + "epoch": 0.19, + "grad_norm": 2.50959271620824, + "learning_rate": 9.368928801376009e-06, + "loss": 0.7594, + "step": 1776 + }, + { + "epoch": 0.19, + "grad_norm": 2.3425929238257943, + "learning_rate": 9.368099757736843e-06, + "loss": 0.6348, + "step": 1777 + }, + { + "epoch": 0.19, + "grad_norm": 2.571462951149562, + "learning_rate": 9.367270206624474e-06, + "loss": 0.7839, + "step": 1778 + }, + { + "epoch": 0.19, + "grad_norm": 2.288592181156132, + "learning_rate": 9.366440148135276e-06, + "loss": 0.7086, + "step": 1779 + }, + { + "epoch": 0.19, + "grad_norm": 3.7452409081529328, + "learning_rate": 9.365609582365685e-06, + "loss": 0.7787, + "step": 1780 + }, + { + "epoch": 0.19, + "grad_norm": 2.188224299368682, + "learning_rate": 9.364778509412191e-06, + "loss": 0.7622, + "step": 1781 + }, + { + "epoch": 0.19, + "grad_norm": 2.7450685765521223, + "learning_rate": 9.363946929371349e-06, + "loss": 0.6863, + "step": 1782 + }, + { + "epoch": 0.19, + "grad_norm": 2.5045275357571284, + "learning_rate": 9.363114842339767e-06, + "loss": 0.7075, + "step": 1783 + }, + { + "epoch": 0.19, + "grad_norm": 3.0164414465737837, + "learning_rate": 9.362282248414114e-06, + "loss": 0.6598, + "step": 1784 + }, + { + "epoch": 0.19, + "grad_norm": 2.35492294691701, + "learning_rate": 9.361449147691122e-06, + "loss": 0.7235, + "step": 1785 + }, + { + "epoch": 0.19, + "grad_norm": 2.2619503151970335, + "learning_rate": 9.360615540267572e-06, + "loss": 0.6997, + "step": 1786 + }, + { + "epoch": 0.19, + "grad_norm": 2.5899599164267766, + "learning_rate": 9.359781426240316e-06, + "loss": 0.7257, + "step": 1787 + }, + { + "epoch": 0.19, + "grad_norm": 2.5634208857345366, + "learning_rate": 9.358946805706257e-06, + "loss": 0.664, + "step": 1788 + }, + { + "epoch": 0.19, + "grad_norm": 4.51798972826991, + "learning_rate": 9.358111678762359e-06, + "loss": 0.6728, + "step": 1789 + }, + { + "epoch": 0.19, + "grad_norm": 2.3326773791425466, + "learning_rate": 9.357276045505643e-06, + "loss": 0.7358, + "step": 1790 + }, + { + "epoch": 0.19, + "grad_norm": 2.2350420427602833, + "learning_rate": 9.35643990603319e-06, + "loss": 0.6856, + "step": 1791 + }, + { + "epoch": 0.19, + "grad_norm": 2.608868685684469, + "learning_rate": 9.355603260442145e-06, + "loss": 0.6177, + "step": 1792 + }, + { + "epoch": 0.19, + "grad_norm": 1.9713835504144617, + "learning_rate": 9.354766108829703e-06, + "loss": 0.7943, + "step": 1793 + }, + { + "epoch": 0.19, + "grad_norm": 2.8062689555008857, + "learning_rate": 9.353928451293122e-06, + "loss": 0.6725, + "step": 1794 + }, + { + "epoch": 0.19, + "grad_norm": 2.671272851573433, + "learning_rate": 9.35309028792972e-06, + "loss": 0.6848, + "step": 1795 + }, + { + "epoch": 0.19, + "grad_norm": 2.066156049708186, + "learning_rate": 9.352251618836872e-06, + "loss": 0.7521, + "step": 1796 + }, + { + "epoch": 0.19, + "grad_norm": 3.0657191943549273, + "learning_rate": 9.351412444112013e-06, + "loss": 0.6063, + "step": 1797 + }, + { + "epoch": 0.19, + "grad_norm": 2.647445442676486, + "learning_rate": 9.350572763852633e-06, + "loss": 0.6984, + "step": 1798 + }, + { + "epoch": 0.19, + "grad_norm": 2.2824264713651994, + "learning_rate": 9.349732578156286e-06, + "loss": 0.6747, + "step": 1799 + }, + { + "epoch": 0.19, + "grad_norm": 2.232278265988478, + "learning_rate": 9.348891887120582e-06, + "loss": 0.7481, + "step": 1800 + }, + { + "epoch": 0.19, + "grad_norm": 2.2260356050576937, + "learning_rate": 9.348050690843192e-06, + "loss": 0.7101, + "step": 1801 + }, + { + "epoch": 0.19, + "grad_norm": 2.6245692650813734, + "learning_rate": 9.347208989421838e-06, + "loss": 0.6776, + "step": 1802 + }, + { + "epoch": 0.19, + "grad_norm": 2.8932084963461766, + "learning_rate": 9.346366782954313e-06, + "loss": 0.7145, + "step": 1803 + }, + { + "epoch": 0.19, + "grad_norm": 2.6293013765675806, + "learning_rate": 9.345524071538457e-06, + "loss": 0.762, + "step": 1804 + }, + { + "epoch": 0.19, + "grad_norm": 2.4909478746794926, + "learning_rate": 9.344680855272178e-06, + "loss": 0.7797, + "step": 1805 + }, + { + "epoch": 0.19, + "grad_norm": 2.678911462141494, + "learning_rate": 9.343837134253434e-06, + "loss": 0.7017, + "step": 1806 + }, + { + "epoch": 0.19, + "grad_norm": 3.236697329924541, + "learning_rate": 9.342992908580252e-06, + "loss": 0.6629, + "step": 1807 + }, + { + "epoch": 0.19, + "grad_norm": 2.6302188588519484, + "learning_rate": 9.342148178350705e-06, + "loss": 0.7272, + "step": 1808 + }, + { + "epoch": 0.19, + "grad_norm": 3.3111476074297292, + "learning_rate": 9.341302943662937e-06, + "loss": 0.6723, + "step": 1809 + }, + { + "epoch": 0.19, + "grad_norm": 2.9282424193071446, + "learning_rate": 9.34045720461514e-06, + "loss": 0.8171, + "step": 1810 + }, + { + "epoch": 0.19, + "grad_norm": 2.9521986711914656, + "learning_rate": 9.339610961305575e-06, + "loss": 0.7266, + "step": 1811 + }, + { + "epoch": 0.19, + "grad_norm": 2.6008867763760595, + "learning_rate": 9.33876421383255e-06, + "loss": 0.6991, + "step": 1812 + }, + { + "epoch": 0.19, + "grad_norm": 2.4219540920578213, + "learning_rate": 9.337916962294443e-06, + "loss": 0.7751, + "step": 1813 + }, + { + "epoch": 0.19, + "grad_norm": 2.3994526596884787, + "learning_rate": 9.337069206789681e-06, + "loss": 0.7132, + "step": 1814 + }, + { + "epoch": 0.19, + "grad_norm": 2.141697670450548, + "learning_rate": 9.336220947416757e-06, + "loss": 0.6879, + "step": 1815 + }, + { + "epoch": 0.19, + "grad_norm": 2.442700420921565, + "learning_rate": 9.335372184274219e-06, + "loss": 0.725, + "step": 1816 + }, + { + "epoch": 0.19, + "grad_norm": 2.9922843786433275, + "learning_rate": 9.334522917460671e-06, + "loss": 0.7077, + "step": 1817 + }, + { + "epoch": 0.19, + "grad_norm": 1.9847132878411518, + "learning_rate": 9.33367314707478e-06, + "loss": 0.6625, + "step": 1818 + }, + { + "epoch": 0.19, + "grad_norm": 2.2682583137130137, + "learning_rate": 9.332822873215273e-06, + "loss": 0.6533, + "step": 1819 + }, + { + "epoch": 0.19, + "grad_norm": 2.1574351053442125, + "learning_rate": 9.331972095980927e-06, + "loss": 0.6908, + "step": 1820 + }, + { + "epoch": 0.19, + "grad_norm": 2.495175865873836, + "learning_rate": 9.331120815470586e-06, + "loss": 0.69, + "step": 1821 + }, + { + "epoch": 0.19, + "grad_norm": 2.5556685920907243, + "learning_rate": 9.330269031783147e-06, + "loss": 0.6904, + "step": 1822 + }, + { + "epoch": 0.19, + "grad_norm": 2.50380795052326, + "learning_rate": 9.329416745017573e-06, + "loss": 0.6958, + "step": 1823 + }, + { + "epoch": 0.19, + "grad_norm": 2.195310629239766, + "learning_rate": 9.328563955272873e-06, + "loss": 0.718, + "step": 1824 + }, + { + "epoch": 0.19, + "grad_norm": 2.596171146233937, + "learning_rate": 9.327710662648128e-06, + "loss": 0.6523, + "step": 1825 + }, + { + "epoch": 0.19, + "grad_norm": 2.1903089534866957, + "learning_rate": 9.326856867242467e-06, + "loss": 0.6914, + "step": 1826 + }, + { + "epoch": 0.19, + "grad_norm": 2.1949315431307785, + "learning_rate": 9.326002569155084e-06, + "loss": 0.6121, + "step": 1827 + }, + { + "epoch": 0.19, + "grad_norm": 2.1196053273242206, + "learning_rate": 9.325147768485226e-06, + "loss": 0.7247, + "step": 1828 + }, + { + "epoch": 0.19, + "grad_norm": 2.927602122432581, + "learning_rate": 9.324292465332205e-06, + "loss": 0.7023, + "step": 1829 + }, + { + "epoch": 0.19, + "grad_norm": 2.59333950514103, + "learning_rate": 9.323436659795384e-06, + "loss": 0.774, + "step": 1830 + }, + { + "epoch": 0.19, + "grad_norm": 2.1860597709229794, + "learning_rate": 9.32258035197419e-06, + "loss": 0.6294, + "step": 1831 + }, + { + "epoch": 0.19, + "grad_norm": 2.237668840439711, + "learning_rate": 9.321723541968106e-06, + "loss": 0.6915, + "step": 1832 + }, + { + "epoch": 0.19, + "grad_norm": 3.013676832988803, + "learning_rate": 9.320866229876674e-06, + "loss": 0.7642, + "step": 1833 + }, + { + "epoch": 0.19, + "grad_norm": 2.357450534390374, + "learning_rate": 9.320008415799496e-06, + "loss": 0.7363, + "step": 1834 + }, + { + "epoch": 0.19, + "grad_norm": 3.2990217042269863, + "learning_rate": 9.319150099836225e-06, + "loss": 0.6567, + "step": 1835 + }, + { + "epoch": 0.19, + "grad_norm": 2.4112889638082033, + "learning_rate": 9.318291282086582e-06, + "loss": 0.6427, + "step": 1836 + }, + { + "epoch": 0.19, + "grad_norm": 2.5245440646141706, + "learning_rate": 9.317431962650339e-06, + "loss": 0.6699, + "step": 1837 + }, + { + "epoch": 0.19, + "grad_norm": 2.1773949952495566, + "learning_rate": 9.316572141627334e-06, + "loss": 0.6629, + "step": 1838 + }, + { + "epoch": 0.19, + "grad_norm": 2.0104341618703967, + "learning_rate": 9.315711819117452e-06, + "loss": 0.6719, + "step": 1839 + }, + { + "epoch": 0.19, + "grad_norm": 2.5108935311841467, + "learning_rate": 9.31485099522065e-06, + "loss": 0.7134, + "step": 1840 + }, + { + "epoch": 0.19, + "grad_norm": 2.695150335447739, + "learning_rate": 9.31398967003693e-06, + "loss": 0.8007, + "step": 1841 + }, + { + "epoch": 0.19, + "grad_norm": 2.7187367089479895, + "learning_rate": 9.31312784366636e-06, + "loss": 0.6938, + "step": 1842 + }, + { + "epoch": 0.19, + "grad_norm": 2.266932399300048, + "learning_rate": 9.312265516209068e-06, + "loss": 0.7279, + "step": 1843 + }, + { + "epoch": 0.19, + "grad_norm": 2.064288579022452, + "learning_rate": 9.311402687765231e-06, + "loss": 0.6903, + "step": 1844 + }, + { + "epoch": 0.19, + "grad_norm": 2.3054354931895165, + "learning_rate": 9.310539358435095e-06, + "loss": 0.6779, + "step": 1845 + }, + { + "epoch": 0.19, + "grad_norm": 2.3704699001159244, + "learning_rate": 9.309675528318955e-06, + "loss": 0.7828, + "step": 1846 + }, + { + "epoch": 0.19, + "grad_norm": 2.5359614229510274, + "learning_rate": 9.308811197517172e-06, + "loss": 0.6946, + "step": 1847 + }, + { + "epoch": 0.19, + "grad_norm": 2.433416978342713, + "learning_rate": 9.307946366130158e-06, + "loss": 0.8163, + "step": 1848 + }, + { + "epoch": 0.19, + "grad_norm": 2.6463567241215493, + "learning_rate": 9.307081034258389e-06, + "loss": 0.7162, + "step": 1849 + }, + { + "epoch": 0.19, + "grad_norm": 3.2116023992699523, + "learning_rate": 9.306215202002396e-06, + "loss": 0.6796, + "step": 1850 + }, + { + "epoch": 0.19, + "grad_norm": 3.0445330680873828, + "learning_rate": 9.305348869462768e-06, + "loss": 0.7149, + "step": 1851 + }, + { + "epoch": 0.19, + "grad_norm": 1.881081284323706, + "learning_rate": 9.304482036740154e-06, + "loss": 0.6436, + "step": 1852 + }, + { + "epoch": 0.19, + "grad_norm": 2.4175242650695687, + "learning_rate": 9.30361470393526e-06, + "loss": 0.6238, + "step": 1853 + }, + { + "epoch": 0.2, + "grad_norm": 2.430329856197137, + "learning_rate": 9.302746871148852e-06, + "loss": 0.7318, + "step": 1854 + }, + { + "epoch": 0.2, + "grad_norm": 2.1962368489134803, + "learning_rate": 9.301878538481748e-06, + "loss": 0.7108, + "step": 1855 + }, + { + "epoch": 0.2, + "grad_norm": 2.169152259585836, + "learning_rate": 9.30100970603483e-06, + "loss": 0.7173, + "step": 1856 + }, + { + "epoch": 0.2, + "grad_norm": 2.463790448876148, + "learning_rate": 9.30014037390904e-06, + "loss": 0.689, + "step": 1857 + }, + { + "epoch": 0.2, + "grad_norm": 2.368852330455152, + "learning_rate": 9.299270542205372e-06, + "loss": 0.6933, + "step": 1858 + }, + { + "epoch": 0.2, + "grad_norm": 2.444593231468713, + "learning_rate": 9.298400211024878e-06, + "loss": 0.6799, + "step": 1859 + }, + { + "epoch": 0.2, + "grad_norm": 1.8680880820924584, + "learning_rate": 9.297529380468675e-06, + "loss": 0.6574, + "step": 1860 + }, + { + "epoch": 0.2, + "grad_norm": 2.948905283236975, + "learning_rate": 9.29665805063793e-06, + "loss": 0.6756, + "step": 1861 + }, + { + "epoch": 0.2, + "grad_norm": 2.1963367833574643, + "learning_rate": 9.295786221633874e-06, + "loss": 0.634, + "step": 1862 + }, + { + "epoch": 0.2, + "grad_norm": 2.139790541038441, + "learning_rate": 9.294913893557792e-06, + "loss": 0.6734, + "step": 1863 + }, + { + "epoch": 0.2, + "grad_norm": 2.619706209228136, + "learning_rate": 9.294041066511031e-06, + "loss": 0.6999, + "step": 1864 + }, + { + "epoch": 0.2, + "grad_norm": 2.409480108291939, + "learning_rate": 9.29316774059499e-06, + "loss": 0.6769, + "step": 1865 + }, + { + "epoch": 0.2, + "grad_norm": 2.410329086374503, + "learning_rate": 9.29229391591113e-06, + "loss": 0.7089, + "step": 1866 + }, + { + "epoch": 0.2, + "grad_norm": 5.690751097339351, + "learning_rate": 9.291419592560973e-06, + "loss": 0.7696, + "step": 1867 + }, + { + "epoch": 0.2, + "grad_norm": 2.69073549819373, + "learning_rate": 9.290544770646092e-06, + "loss": 0.6698, + "step": 1868 + }, + { + "epoch": 0.2, + "grad_norm": 3.9808783369984093, + "learning_rate": 9.289669450268122e-06, + "loss": 0.6039, + "step": 1869 + }, + { + "epoch": 0.2, + "grad_norm": 2.15890459589978, + "learning_rate": 9.288793631528757e-06, + "loss": 0.6616, + "step": 1870 + }, + { + "epoch": 0.2, + "grad_norm": 2.081034392032343, + "learning_rate": 9.287917314529743e-06, + "loss": 0.682, + "step": 1871 + }, + { + "epoch": 0.2, + "grad_norm": 2.1849941363664382, + "learning_rate": 9.287040499372893e-06, + "loss": 0.7173, + "step": 1872 + }, + { + "epoch": 0.2, + "grad_norm": 2.4741373441095202, + "learning_rate": 9.286163186160067e-06, + "loss": 0.6348, + "step": 1873 + }, + { + "epoch": 0.2, + "grad_norm": 2.4013819058393944, + "learning_rate": 9.285285374993195e-06, + "loss": 0.641, + "step": 1874 + }, + { + "epoch": 0.2, + "grad_norm": 2.3332689108954336, + "learning_rate": 9.284407065974254e-06, + "loss": 0.7153, + "step": 1875 + }, + { + "epoch": 0.2, + "grad_norm": 2.3415860761989378, + "learning_rate": 9.283528259205287e-06, + "loss": 0.7111, + "step": 1876 + }, + { + "epoch": 0.2, + "grad_norm": 2.5710052029402166, + "learning_rate": 9.282648954788387e-06, + "loss": 0.6825, + "step": 1877 + }, + { + "epoch": 0.2, + "grad_norm": 2.172242362761334, + "learning_rate": 9.281769152825713e-06, + "loss": 0.6001, + "step": 1878 + }, + { + "epoch": 0.2, + "grad_norm": 2.202637528829396, + "learning_rate": 9.280888853419476e-06, + "loss": 0.6635, + "step": 1879 + }, + { + "epoch": 0.2, + "grad_norm": 4.4006855761361345, + "learning_rate": 9.280008056671947e-06, + "loss": 0.694, + "step": 1880 + }, + { + "epoch": 0.2, + "grad_norm": 2.831327947150695, + "learning_rate": 9.279126762685454e-06, + "loss": 0.636, + "step": 1881 + }, + { + "epoch": 0.2, + "grad_norm": 2.6062353877854436, + "learning_rate": 9.278244971562382e-06, + "loss": 0.8014, + "step": 1882 + }, + { + "epoch": 0.2, + "grad_norm": 2.870705908895546, + "learning_rate": 9.277362683405177e-06, + "loss": 0.6577, + "step": 1883 + }, + { + "epoch": 0.2, + "grad_norm": 2.3116536973353057, + "learning_rate": 9.276479898316341e-06, + "loss": 0.6627, + "step": 1884 + }, + { + "epoch": 0.2, + "grad_norm": 2.325166740616653, + "learning_rate": 9.275596616398431e-06, + "loss": 0.7138, + "step": 1885 + }, + { + "epoch": 0.2, + "grad_norm": 3.764579252240071, + "learning_rate": 9.274712837754068e-06, + "loss": 0.6182, + "step": 1886 + }, + { + "epoch": 0.2, + "grad_norm": 2.610119518405415, + "learning_rate": 9.273828562485923e-06, + "loss": 0.6369, + "step": 1887 + }, + { + "epoch": 0.2, + "grad_norm": 2.3604350623510846, + "learning_rate": 9.272943790696728e-06, + "loss": 0.6644, + "step": 1888 + }, + { + "epoch": 0.2, + "grad_norm": 2.2652286769748495, + "learning_rate": 9.272058522489277e-06, + "loss": 0.7804, + "step": 1889 + }, + { + "epoch": 0.2, + "grad_norm": 2.9511210502310745, + "learning_rate": 9.271172757966418e-06, + "loss": 0.6344, + "step": 1890 + }, + { + "epoch": 0.2, + "grad_norm": 2.8169929100268867, + "learning_rate": 9.270286497231052e-06, + "loss": 0.697, + "step": 1891 + }, + { + "epoch": 0.2, + "grad_norm": 3.7252767248143597, + "learning_rate": 9.269399740386146e-06, + "loss": 0.7239, + "step": 1892 + }, + { + "epoch": 0.2, + "grad_norm": 2.507005417930868, + "learning_rate": 9.26851248753472e-06, + "loss": 0.6427, + "step": 1893 + }, + { + "epoch": 0.2, + "grad_norm": 2.8482802966189373, + "learning_rate": 9.267624738779853e-06, + "loss": 0.6975, + "step": 1894 + }, + { + "epoch": 0.2, + "grad_norm": 2.8213003925408153, + "learning_rate": 9.266736494224677e-06, + "loss": 0.6433, + "step": 1895 + }, + { + "epoch": 0.2, + "grad_norm": 2.436716166307937, + "learning_rate": 9.265847753972392e-06, + "loss": 0.6206, + "step": 1896 + }, + { + "epoch": 0.2, + "grad_norm": 3.0067406772274063, + "learning_rate": 9.264958518126246e-06, + "loss": 0.7178, + "step": 1897 + }, + { + "epoch": 0.2, + "grad_norm": 2.652398231372856, + "learning_rate": 9.264068786789546e-06, + "loss": 0.7469, + "step": 1898 + }, + { + "epoch": 0.2, + "grad_norm": 2.903751071631814, + "learning_rate": 9.263178560065664e-06, + "loss": 0.6284, + "step": 1899 + }, + { + "epoch": 0.2, + "grad_norm": 2.233324204587242, + "learning_rate": 9.262287838058017e-06, + "loss": 0.654, + "step": 1900 + }, + { + "epoch": 0.2, + "grad_norm": 5.426621816553852, + "learning_rate": 9.261396620870092e-06, + "loss": 0.6874, + "step": 1901 + }, + { + "epoch": 0.2, + "grad_norm": 2.9434733409299283, + "learning_rate": 9.260504908605425e-06, + "loss": 0.6296, + "step": 1902 + }, + { + "epoch": 0.2, + "grad_norm": 1.370239117918409, + "learning_rate": 9.259612701367615e-06, + "loss": 0.6566, + "step": 1903 + }, + { + "epoch": 0.2, + "grad_norm": 2.306513335576572, + "learning_rate": 9.258719999260315e-06, + "loss": 0.6967, + "step": 1904 + }, + { + "epoch": 0.2, + "grad_norm": 2.385612710261463, + "learning_rate": 9.257826802387234e-06, + "loss": 0.6995, + "step": 1905 + }, + { + "epoch": 0.2, + "grad_norm": 2.8909613530557707, + "learning_rate": 9.256933110852145e-06, + "loss": 0.6234, + "step": 1906 + }, + { + "epoch": 0.2, + "grad_norm": 2.956138783299861, + "learning_rate": 9.25603892475887e-06, + "loss": 0.7436, + "step": 1907 + }, + { + "epoch": 0.2, + "grad_norm": 3.9130492645144934, + "learning_rate": 9.255144244211299e-06, + "loss": 0.7194, + "step": 1908 + }, + { + "epoch": 0.2, + "grad_norm": 2.8433841114866922, + "learning_rate": 9.254249069313368e-06, + "loss": 0.6685, + "step": 1909 + }, + { + "epoch": 0.2, + "grad_norm": 3.0403922052487427, + "learning_rate": 9.253353400169078e-06, + "loss": 0.6988, + "step": 1910 + }, + { + "epoch": 0.2, + "grad_norm": 2.3083987036017835, + "learning_rate": 9.252457236882487e-06, + "loss": 0.659, + "step": 1911 + }, + { + "epoch": 0.2, + "grad_norm": 2.636186973720048, + "learning_rate": 9.251560579557705e-06, + "loss": 0.6196, + "step": 1912 + }, + { + "epoch": 0.2, + "grad_norm": 2.3354347387696337, + "learning_rate": 9.250663428298906e-06, + "loss": 0.7811, + "step": 1913 + }, + { + "epoch": 0.2, + "grad_norm": 3.3940926366504227, + "learning_rate": 9.249765783210316e-06, + "loss": 0.6968, + "step": 1914 + }, + { + "epoch": 0.2, + "grad_norm": 1.9781144300478661, + "learning_rate": 9.248867644396224e-06, + "loss": 0.7138, + "step": 1915 + }, + { + "epoch": 0.2, + "grad_norm": 6.221082262034515, + "learning_rate": 9.24796901196097e-06, + "loss": 0.7758, + "step": 1916 + }, + { + "epoch": 0.2, + "grad_norm": 2.281625973109882, + "learning_rate": 9.247069886008957e-06, + "loss": 0.6453, + "step": 1917 + }, + { + "epoch": 0.2, + "grad_norm": 2.0495291535897127, + "learning_rate": 9.24617026664464e-06, + "loss": 0.6724, + "step": 1918 + }, + { + "epoch": 0.2, + "grad_norm": 2.294555365241956, + "learning_rate": 9.245270153972537e-06, + "loss": 0.6348, + "step": 1919 + }, + { + "epoch": 0.2, + "grad_norm": 2.6583206062968143, + "learning_rate": 9.244369548097218e-06, + "loss": 0.7125, + "step": 1920 + }, + { + "epoch": 0.2, + "grad_norm": 1.2992496345079894, + "learning_rate": 9.243468449123316e-06, + "loss": 0.6501, + "step": 1921 + }, + { + "epoch": 0.2, + "grad_norm": 2.294260337838386, + "learning_rate": 9.242566857155515e-06, + "loss": 0.6783, + "step": 1922 + }, + { + "epoch": 0.2, + "grad_norm": 2.2677177768740107, + "learning_rate": 9.241664772298561e-06, + "loss": 0.7314, + "step": 1923 + }, + { + "epoch": 0.2, + "grad_norm": 2.5676248737182172, + "learning_rate": 9.240762194657254e-06, + "loss": 0.7354, + "step": 1924 + }, + { + "epoch": 0.2, + "grad_norm": 2.456584308459011, + "learning_rate": 9.239859124336457e-06, + "loss": 0.7148, + "step": 1925 + }, + { + "epoch": 0.2, + "grad_norm": 2.5953088609259134, + "learning_rate": 9.23895556144108e-06, + "loss": 0.7198, + "step": 1926 + }, + { + "epoch": 0.2, + "grad_norm": 2.144466625240076, + "learning_rate": 9.2380515060761e-06, + "loss": 0.6582, + "step": 1927 + }, + { + "epoch": 0.2, + "grad_norm": 2.821280176023632, + "learning_rate": 9.237146958346549e-06, + "loss": 0.6836, + "step": 1928 + }, + { + "epoch": 0.2, + "grad_norm": 2.012627315683036, + "learning_rate": 9.236241918357511e-06, + "loss": 0.7583, + "step": 1929 + }, + { + "epoch": 0.2, + "grad_norm": 2.3886139997345825, + "learning_rate": 9.235336386214133e-06, + "loss": 0.6987, + "step": 1930 + }, + { + "epoch": 0.2, + "grad_norm": 3.5518418067148287, + "learning_rate": 9.234430362021615e-06, + "loss": 0.7171, + "step": 1931 + }, + { + "epoch": 0.2, + "grad_norm": 2.0640715116734265, + "learning_rate": 9.233523845885221e-06, + "loss": 0.5934, + "step": 1932 + }, + { + "epoch": 0.2, + "grad_norm": 2.5462408850965303, + "learning_rate": 9.232616837910263e-06, + "loss": 0.6674, + "step": 1933 + }, + { + "epoch": 0.2, + "grad_norm": 2.1027514803586316, + "learning_rate": 9.231709338202117e-06, + "loss": 0.7143, + "step": 1934 + }, + { + "epoch": 0.2, + "grad_norm": 1.981837777190229, + "learning_rate": 9.230801346866212e-06, + "loss": 0.6824, + "step": 1935 + }, + { + "epoch": 0.2, + "grad_norm": 3.0072257637585995, + "learning_rate": 9.229892864008037e-06, + "loss": 0.6737, + "step": 1936 + }, + { + "epoch": 0.2, + "grad_norm": 2.404301234252323, + "learning_rate": 9.228983889733135e-06, + "loss": 0.72, + "step": 1937 + }, + { + "epoch": 0.2, + "grad_norm": 2.1099503574763405, + "learning_rate": 9.228074424147111e-06, + "loss": 0.7708, + "step": 1938 + }, + { + "epoch": 0.2, + "grad_norm": 2.1564706992929987, + "learning_rate": 9.227164467355621e-06, + "loss": 0.7047, + "step": 1939 + }, + { + "epoch": 0.2, + "grad_norm": 2.717145887514134, + "learning_rate": 9.226254019464384e-06, + "loss": 0.7386, + "step": 1940 + }, + { + "epoch": 0.2, + "grad_norm": 3.434512344610932, + "learning_rate": 9.225343080579171e-06, + "loss": 0.7586, + "step": 1941 + }, + { + "epoch": 0.2, + "grad_norm": 2.516975033266961, + "learning_rate": 9.224431650805814e-06, + "loss": 0.721, + "step": 1942 + }, + { + "epoch": 0.2, + "grad_norm": 2.0749482225336697, + "learning_rate": 9.223519730250198e-06, + "loss": 0.6007, + "step": 1943 + }, + { + "epoch": 0.2, + "grad_norm": 2.5096562394219775, + "learning_rate": 9.222607319018271e-06, + "loss": 0.7729, + "step": 1944 + }, + { + "epoch": 0.2, + "grad_norm": 1.9924376463945759, + "learning_rate": 9.221694417216031e-06, + "loss": 0.7065, + "step": 1945 + }, + { + "epoch": 0.2, + "grad_norm": 3.4082767933565594, + "learning_rate": 9.220781024949536e-06, + "loss": 0.7151, + "step": 1946 + }, + { + "epoch": 0.2, + "grad_norm": 2.157525030220561, + "learning_rate": 9.219867142324904e-06, + "loss": 0.6323, + "step": 1947 + }, + { + "epoch": 0.2, + "grad_norm": 2.809345903838342, + "learning_rate": 9.218952769448307e-06, + "loss": 0.6643, + "step": 1948 + }, + { + "epoch": 0.21, + "grad_norm": 2.1126315310721426, + "learning_rate": 9.218037906425971e-06, + "loss": 0.66, + "step": 1949 + }, + { + "epoch": 0.21, + "grad_norm": 2.106123096050007, + "learning_rate": 9.217122553364184e-06, + "loss": 0.779, + "step": 1950 + }, + { + "epoch": 0.21, + "grad_norm": 2.4264702932506927, + "learning_rate": 9.21620671036929e-06, + "loss": 0.7279, + "step": 1951 + }, + { + "epoch": 0.21, + "grad_norm": 8.387568989843238, + "learning_rate": 9.215290377547688e-06, + "loss": 0.576, + "step": 1952 + }, + { + "epoch": 0.21, + "grad_norm": 2.721569909570348, + "learning_rate": 9.214373555005834e-06, + "loss": 0.6169, + "step": 1953 + }, + { + "epoch": 0.21, + "grad_norm": 2.5751826747005553, + "learning_rate": 9.213456242850245e-06, + "loss": 0.7445, + "step": 1954 + }, + { + "epoch": 0.21, + "grad_norm": 5.238179252301112, + "learning_rate": 9.21253844118749e-06, + "loss": 0.6893, + "step": 1955 + }, + { + "epoch": 0.21, + "grad_norm": 2.230727560495232, + "learning_rate": 9.211620150124192e-06, + "loss": 0.7169, + "step": 1956 + }, + { + "epoch": 0.21, + "grad_norm": 2.5021330652535645, + "learning_rate": 9.210701369767043e-06, + "loss": 0.7869, + "step": 1957 + }, + { + "epoch": 0.21, + "grad_norm": 2.7468363192392427, + "learning_rate": 9.20978210022278e-06, + "loss": 0.7359, + "step": 1958 + }, + { + "epoch": 0.21, + "grad_norm": 2.0826919441428458, + "learning_rate": 9.208862341598201e-06, + "loss": 0.7324, + "step": 1959 + }, + { + "epoch": 0.21, + "grad_norm": 2.7555569094425993, + "learning_rate": 9.207942094000163e-06, + "loss": 0.7457, + "step": 1960 + }, + { + "epoch": 0.21, + "grad_norm": 3.1278899651708088, + "learning_rate": 9.207021357535576e-06, + "loss": 0.7093, + "step": 1961 + }, + { + "epoch": 0.21, + "grad_norm": 2.304923508024911, + "learning_rate": 9.206100132311408e-06, + "loss": 0.6779, + "step": 1962 + }, + { + "epoch": 0.21, + "grad_norm": 2.3962706909638007, + "learning_rate": 9.205178418434687e-06, + "loss": 0.7315, + "step": 1963 + }, + { + "epoch": 0.21, + "grad_norm": 2.3044315673666205, + "learning_rate": 9.204256216012493e-06, + "loss": 0.7382, + "step": 1964 + }, + { + "epoch": 0.21, + "grad_norm": 2.4732648879760863, + "learning_rate": 9.203333525151964e-06, + "loss": 0.7135, + "step": 1965 + }, + { + "epoch": 0.21, + "grad_norm": 2.206789161392687, + "learning_rate": 9.202410345960298e-06, + "loss": 0.7343, + "step": 1966 + }, + { + "epoch": 0.21, + "grad_norm": 2.4651985133702827, + "learning_rate": 9.201486678544745e-06, + "loss": 0.718, + "step": 1967 + }, + { + "epoch": 0.21, + "grad_norm": 2.2568110947549407, + "learning_rate": 9.200562523012615e-06, + "loss": 0.6678, + "step": 1968 + }, + { + "epoch": 0.21, + "grad_norm": 2.4466689065325182, + "learning_rate": 9.199637879471272e-06, + "loss": 0.7278, + "step": 1969 + }, + { + "epoch": 0.21, + "grad_norm": 2.7412446183420593, + "learning_rate": 9.198712748028142e-06, + "loss": 0.6746, + "step": 1970 + }, + { + "epoch": 0.21, + "grad_norm": 2.090127601569364, + "learning_rate": 9.197787128790702e-06, + "loss": 0.7454, + "step": 1971 + }, + { + "epoch": 0.21, + "grad_norm": 2.2493962206295457, + "learning_rate": 9.19686102186649e-06, + "loss": 0.6949, + "step": 1972 + }, + { + "epoch": 0.21, + "grad_norm": 2.4780855364247603, + "learning_rate": 9.195934427363093e-06, + "loss": 0.7695, + "step": 1973 + }, + { + "epoch": 0.21, + "grad_norm": 2.720134076492023, + "learning_rate": 9.195007345388165e-06, + "loss": 0.7939, + "step": 1974 + }, + { + "epoch": 0.21, + "grad_norm": 2.6619081847464705, + "learning_rate": 9.19407977604941e-06, + "loss": 0.7645, + "step": 1975 + }, + { + "epoch": 0.21, + "grad_norm": 2.1669764589008986, + "learning_rate": 9.193151719454591e-06, + "loss": 0.658, + "step": 1976 + }, + { + "epoch": 0.21, + "grad_norm": 2.7606276779843375, + "learning_rate": 9.192223175711526e-06, + "loss": 0.641, + "step": 1977 + }, + { + "epoch": 0.21, + "grad_norm": 2.5503781978393776, + "learning_rate": 9.191294144928091e-06, + "loss": 0.7459, + "step": 1978 + }, + { + "epoch": 0.21, + "grad_norm": 2.2113194458441345, + "learning_rate": 9.190364627212216e-06, + "loss": 0.6788, + "step": 1979 + }, + { + "epoch": 0.21, + "grad_norm": 3.178914025765916, + "learning_rate": 9.189434622671894e-06, + "loss": 0.6719, + "step": 1980 + }, + { + "epoch": 0.21, + "grad_norm": 2.684741530038339, + "learning_rate": 9.188504131415167e-06, + "loss": 0.6779, + "step": 1981 + }, + { + "epoch": 0.21, + "grad_norm": 2.245253383160478, + "learning_rate": 9.187573153550139e-06, + "loss": 0.6422, + "step": 1982 + }, + { + "epoch": 0.21, + "grad_norm": 2.5644319947882055, + "learning_rate": 9.186641689184966e-06, + "loss": 0.7558, + "step": 1983 + }, + { + "epoch": 0.21, + "grad_norm": 2.6626949713612134, + "learning_rate": 9.185709738427864e-06, + "loss": 0.731, + "step": 1984 + }, + { + "epoch": 0.21, + "grad_norm": 5.227738894627522, + "learning_rate": 9.184777301387104e-06, + "loss": 0.7663, + "step": 1985 + }, + { + "epoch": 0.21, + "grad_norm": 2.3446022774136024, + "learning_rate": 9.183844378171016e-06, + "loss": 0.6564, + "step": 1986 + }, + { + "epoch": 0.21, + "grad_norm": 2.432195543728687, + "learning_rate": 9.182910968887982e-06, + "loss": 0.7728, + "step": 1987 + }, + { + "epoch": 0.21, + "grad_norm": 2.4823577607004146, + "learning_rate": 9.181977073646442e-06, + "loss": 0.6328, + "step": 1988 + }, + { + "epoch": 0.21, + "grad_norm": 2.177220583837391, + "learning_rate": 9.181042692554894e-06, + "loss": 0.6911, + "step": 1989 + }, + { + "epoch": 0.21, + "grad_norm": 2.483575743653078, + "learning_rate": 9.180107825721891e-06, + "loss": 0.7007, + "step": 1990 + }, + { + "epoch": 0.21, + "grad_norm": 2.109985363397772, + "learning_rate": 9.179172473256046e-06, + "loss": 0.688, + "step": 1991 + }, + { + "epoch": 0.21, + "grad_norm": 3.0644562546243423, + "learning_rate": 9.178236635266025e-06, + "loss": 0.6994, + "step": 1992 + }, + { + "epoch": 0.21, + "grad_norm": 2.1056210922690526, + "learning_rate": 9.17730031186055e-06, + "loss": 0.7325, + "step": 1993 + }, + { + "epoch": 0.21, + "grad_norm": 2.249585098272686, + "learning_rate": 9.176363503148397e-06, + "loss": 0.698, + "step": 1994 + }, + { + "epoch": 0.21, + "grad_norm": 2.57223165623908, + "learning_rate": 9.175426209238407e-06, + "loss": 0.6476, + "step": 1995 + }, + { + "epoch": 0.21, + "grad_norm": 3.6183458052615665, + "learning_rate": 9.17448843023947e-06, + "loss": 0.8366, + "step": 1996 + }, + { + "epoch": 0.21, + "grad_norm": 2.072274750411721, + "learning_rate": 9.173550166260533e-06, + "loss": 0.7383, + "step": 1997 + }, + { + "epoch": 0.21, + "grad_norm": 2.3127023094424963, + "learning_rate": 9.172611417410604e-06, + "loss": 0.6212, + "step": 1998 + }, + { + "epoch": 0.21, + "grad_norm": 2.3387267206899556, + "learning_rate": 9.17167218379874e-06, + "loss": 0.7201, + "step": 1999 + }, + { + "epoch": 0.21, + "grad_norm": 2.311361411354398, + "learning_rate": 9.170732465534062e-06, + "loss": 0.7375, + "step": 2000 + }, + { + "epoch": 0.21, + "grad_norm": 2.2786653146755618, + "learning_rate": 9.169792262725744e-06, + "loss": 0.7097, + "step": 2001 + }, + { + "epoch": 0.21, + "grad_norm": 2.107809653312181, + "learning_rate": 9.168851575483013e-06, + "loss": 0.6939, + "step": 2002 + }, + { + "epoch": 0.21, + "grad_norm": 2.057099189874853, + "learning_rate": 9.167910403915157e-06, + "loss": 0.7254, + "step": 2003 + }, + { + "epoch": 0.21, + "grad_norm": 2.445940571938724, + "learning_rate": 9.16696874813152e-06, + "loss": 0.6684, + "step": 2004 + }, + { + "epoch": 0.21, + "grad_norm": 2.4846164924547747, + "learning_rate": 9.166026608241496e-06, + "loss": 0.6799, + "step": 2005 + }, + { + "epoch": 0.21, + "grad_norm": 2.2002524931768814, + "learning_rate": 9.165083984354545e-06, + "loss": 0.7163, + "step": 2006 + }, + { + "epoch": 0.21, + "grad_norm": 3.2128685263981924, + "learning_rate": 9.164140876580179e-06, + "loss": 0.6353, + "step": 2007 + }, + { + "epoch": 0.21, + "grad_norm": 2.487800260585636, + "learning_rate": 9.16319728502796e-06, + "loss": 0.7001, + "step": 2008 + }, + { + "epoch": 0.21, + "grad_norm": 2.3851629284461553, + "learning_rate": 9.162253209807517e-06, + "loss": 0.6288, + "step": 2009 + }, + { + "epoch": 0.21, + "grad_norm": 2.4232158597086744, + "learning_rate": 9.161308651028527e-06, + "loss": 0.7754, + "step": 2010 + }, + { + "epoch": 0.21, + "grad_norm": 2.154013851517835, + "learning_rate": 9.160363608800728e-06, + "loss": 0.7117, + "step": 2011 + }, + { + "epoch": 0.21, + "grad_norm": 3.896276107997591, + "learning_rate": 9.159418083233911e-06, + "loss": 0.6859, + "step": 2012 + }, + { + "epoch": 0.21, + "grad_norm": 2.44486977130411, + "learning_rate": 9.158472074437923e-06, + "loss": 0.6467, + "step": 2013 + }, + { + "epoch": 0.21, + "grad_norm": 2.0090988720613, + "learning_rate": 9.157525582522673e-06, + "loss": 0.6484, + "step": 2014 + }, + { + "epoch": 0.21, + "grad_norm": 2.34089707383189, + "learning_rate": 9.156578607598118e-06, + "loss": 0.7049, + "step": 2015 + }, + { + "epoch": 0.21, + "grad_norm": 1.2880641379260889, + "learning_rate": 9.155631149774276e-06, + "loss": 0.6175, + "step": 2016 + }, + { + "epoch": 0.21, + "grad_norm": 2.229526829371313, + "learning_rate": 9.15468320916122e-06, + "loss": 0.6662, + "step": 2017 + }, + { + "epoch": 0.21, + "grad_norm": 2.156990061837107, + "learning_rate": 9.153734785869077e-06, + "loss": 0.7708, + "step": 2018 + }, + { + "epoch": 0.21, + "grad_norm": 1.0523799529588602, + "learning_rate": 9.152785880008035e-06, + "loss": 0.6244, + "step": 2019 + }, + { + "epoch": 0.21, + "grad_norm": 7.993605252322678, + "learning_rate": 9.151836491688334e-06, + "loss": 0.6563, + "step": 2020 + }, + { + "epoch": 0.21, + "grad_norm": 2.3078838508848922, + "learning_rate": 9.15088662102027e-06, + "loss": 0.6904, + "step": 2021 + }, + { + "epoch": 0.21, + "grad_norm": 3.063801331956025, + "learning_rate": 9.149936268114199e-06, + "loss": 0.676, + "step": 2022 + }, + { + "epoch": 0.21, + "grad_norm": 2.295192468843143, + "learning_rate": 9.148985433080528e-06, + "loss": 0.6834, + "step": 2023 + }, + { + "epoch": 0.21, + "grad_norm": 2.0564302867231383, + "learning_rate": 9.148034116029723e-06, + "loss": 0.704, + "step": 2024 + }, + { + "epoch": 0.21, + "grad_norm": 2.5228740207896085, + "learning_rate": 9.147082317072305e-06, + "loss": 0.6683, + "step": 2025 + }, + { + "epoch": 0.21, + "grad_norm": 2.7052297386153015, + "learning_rate": 9.146130036318853e-06, + "loss": 0.7911, + "step": 2026 + }, + { + "epoch": 0.21, + "grad_norm": 2.601417655514352, + "learning_rate": 9.145177273879995e-06, + "loss": 0.6382, + "step": 2027 + }, + { + "epoch": 0.21, + "grad_norm": 1.9751425113181098, + "learning_rate": 9.144224029866426e-06, + "loss": 0.6693, + "step": 2028 + }, + { + "epoch": 0.21, + "grad_norm": 2.5895232014896266, + "learning_rate": 9.14327030438889e-06, + "loss": 0.6712, + "step": 2029 + }, + { + "epoch": 0.21, + "grad_norm": 3.747457510645605, + "learning_rate": 9.142316097558185e-06, + "loss": 0.6682, + "step": 2030 + }, + { + "epoch": 0.21, + "grad_norm": 2.220857892024144, + "learning_rate": 9.14136140948517e-06, + "loss": 0.7372, + "step": 2031 + }, + { + "epoch": 0.21, + "grad_norm": 2.3266528015138386, + "learning_rate": 9.14040624028076e-06, + "loss": 0.7786, + "step": 2032 + }, + { + "epoch": 0.21, + "grad_norm": 2.348078882997178, + "learning_rate": 9.13945059005592e-06, + "loss": 0.7961, + "step": 2033 + }, + { + "epoch": 0.21, + "grad_norm": 2.81669382916974, + "learning_rate": 9.138494458921676e-06, + "loss": 0.6599, + "step": 2034 + }, + { + "epoch": 0.21, + "grad_norm": 2.170306767141564, + "learning_rate": 9.137537846989111e-06, + "loss": 0.6299, + "step": 2035 + }, + { + "epoch": 0.21, + "grad_norm": 2.5696842899169385, + "learning_rate": 9.136580754369357e-06, + "loss": 0.6784, + "step": 2036 + }, + { + "epoch": 0.21, + "grad_norm": 2.3739145799623045, + "learning_rate": 9.135623181173609e-06, + "loss": 0.7959, + "step": 2037 + }, + { + "epoch": 0.21, + "grad_norm": 2.3501976834313174, + "learning_rate": 9.134665127513116e-06, + "loss": 0.6405, + "step": 2038 + }, + { + "epoch": 0.21, + "grad_norm": 2.7821580766766676, + "learning_rate": 9.133706593499181e-06, + "loss": 0.7582, + "step": 2039 + }, + { + "epoch": 0.21, + "grad_norm": 2.0217636722769323, + "learning_rate": 9.132747579243163e-06, + "loss": 0.7176, + "step": 2040 + }, + { + "epoch": 0.21, + "grad_norm": 2.149948219126222, + "learning_rate": 9.131788084856477e-06, + "loss": 0.7303, + "step": 2041 + }, + { + "epoch": 0.21, + "grad_norm": 2.279544855221473, + "learning_rate": 9.130828110450593e-06, + "loss": 0.6588, + "step": 2042 + }, + { + "epoch": 0.21, + "grad_norm": 2.49303969504543, + "learning_rate": 9.129867656137044e-06, + "loss": 0.7645, + "step": 2043 + }, + { + "epoch": 0.22, + "grad_norm": 2.143115712697355, + "learning_rate": 9.128906722027406e-06, + "loss": 0.6674, + "step": 2044 + }, + { + "epoch": 0.22, + "grad_norm": 2.0564042062634225, + "learning_rate": 9.127945308233322e-06, + "loss": 0.7074, + "step": 2045 + }, + { + "epoch": 0.22, + "grad_norm": 2.147223116012419, + "learning_rate": 9.126983414866486e-06, + "loss": 0.7134, + "step": 2046 + }, + { + "epoch": 0.22, + "grad_norm": 2.272635221500622, + "learning_rate": 9.126021042038644e-06, + "loss": 0.7515, + "step": 2047 + }, + { + "epoch": 0.22, + "grad_norm": 2.7298434180359954, + "learning_rate": 9.125058189861607e-06, + "loss": 0.7381, + "step": 2048 + }, + { + "epoch": 0.22, + "grad_norm": 3.0486731591766913, + "learning_rate": 9.124094858447233e-06, + "loss": 0.6821, + "step": 2049 + }, + { + "epoch": 0.22, + "grad_norm": 4.6588169832809205, + "learning_rate": 9.123131047907439e-06, + "loss": 0.8302, + "step": 2050 + }, + { + "epoch": 0.22, + "grad_norm": 2.07023150767486, + "learning_rate": 9.122166758354199e-06, + "loss": 0.6876, + "step": 2051 + }, + { + "epoch": 0.22, + "grad_norm": 2.7807238495464133, + "learning_rate": 9.12120198989954e-06, + "loss": 0.5919, + "step": 2052 + }, + { + "epoch": 0.22, + "grad_norm": 2.1579164615666837, + "learning_rate": 9.120236742655548e-06, + "loss": 0.7351, + "step": 2053 + }, + { + "epoch": 0.22, + "grad_norm": 2.5527053709438214, + "learning_rate": 9.11927101673436e-06, + "loss": 0.6993, + "step": 2054 + }, + { + "epoch": 0.22, + "grad_norm": 2.261502991172867, + "learning_rate": 9.118304812248177e-06, + "loss": 0.6502, + "step": 2055 + }, + { + "epoch": 0.22, + "grad_norm": 3.242461395330852, + "learning_rate": 9.117338129309243e-06, + "loss": 0.6605, + "step": 2056 + }, + { + "epoch": 0.22, + "grad_norm": 1.914414634765642, + "learning_rate": 9.116370968029867e-06, + "loss": 0.6981, + "step": 2057 + }, + { + "epoch": 0.22, + "grad_norm": 2.344979314174602, + "learning_rate": 9.115403328522412e-06, + "loss": 0.7227, + "step": 2058 + }, + { + "epoch": 0.22, + "grad_norm": 2.191238107674176, + "learning_rate": 9.114435210899296e-06, + "loss": 0.6631, + "step": 2059 + }, + { + "epoch": 0.22, + "grad_norm": 2.4602672291177528, + "learning_rate": 9.113466615272988e-06, + "loss": 0.7243, + "step": 2060 + }, + { + "epoch": 0.22, + "grad_norm": 1.44867772558428, + "learning_rate": 9.11249754175602e-06, + "loss": 0.6982, + "step": 2061 + }, + { + "epoch": 0.22, + "grad_norm": 2.651963613477152, + "learning_rate": 9.111527990460977e-06, + "loss": 0.7013, + "step": 2062 + }, + { + "epoch": 0.22, + "grad_norm": 2.5214633687617325, + "learning_rate": 9.110557961500496e-06, + "loss": 0.6273, + "step": 2063 + }, + { + "epoch": 0.22, + "grad_norm": 2.1745678771179233, + "learning_rate": 9.109587454987274e-06, + "loss": 0.6482, + "step": 2064 + }, + { + "epoch": 0.22, + "grad_norm": 2.489922591200943, + "learning_rate": 9.108616471034061e-06, + "loss": 0.6609, + "step": 2065 + }, + { + "epoch": 0.22, + "grad_norm": 2.0973955866357814, + "learning_rate": 9.107645009753663e-06, + "loss": 0.6722, + "step": 2066 + }, + { + "epoch": 0.22, + "grad_norm": 1.9996934192215, + "learning_rate": 9.106673071258942e-06, + "loss": 0.7301, + "step": 2067 + }, + { + "epoch": 0.22, + "grad_norm": 2.494129615110545, + "learning_rate": 9.105700655662815e-06, + "loss": 0.6926, + "step": 2068 + }, + { + "epoch": 0.22, + "grad_norm": 11.264357578268509, + "learning_rate": 9.104727763078253e-06, + "loss": 0.7061, + "step": 2069 + }, + { + "epoch": 0.22, + "grad_norm": 2.8039561431694544, + "learning_rate": 9.103754393618287e-06, + "loss": 0.6591, + "step": 2070 + }, + { + "epoch": 0.22, + "grad_norm": 2.2260708554024737, + "learning_rate": 9.102780547395997e-06, + "loss": 0.7733, + "step": 2071 + }, + { + "epoch": 0.22, + "grad_norm": 2.6838150314170997, + "learning_rate": 9.101806224524524e-06, + "loss": 0.7586, + "step": 2072 + }, + { + "epoch": 0.22, + "grad_norm": 3.5259424740449896, + "learning_rate": 9.10083142511706e-06, + "loss": 0.6863, + "step": 2073 + }, + { + "epoch": 0.22, + "grad_norm": 1.894372573153219, + "learning_rate": 9.099856149286857e-06, + "loss": 0.6572, + "step": 2074 + }, + { + "epoch": 0.22, + "grad_norm": 3.246841628686618, + "learning_rate": 9.098880397147215e-06, + "loss": 0.6908, + "step": 2075 + }, + { + "epoch": 0.22, + "grad_norm": 3.039572983541654, + "learning_rate": 9.0979041688115e-06, + "loss": 0.6255, + "step": 2076 + }, + { + "epoch": 0.22, + "grad_norm": 1.5644692341003028, + "learning_rate": 9.096927464393123e-06, + "loss": 0.6359, + "step": 2077 + }, + { + "epoch": 0.22, + "grad_norm": 2.305502204546247, + "learning_rate": 9.095950284005557e-06, + "loss": 0.7748, + "step": 2078 + }, + { + "epoch": 0.22, + "grad_norm": 2.2388140399919334, + "learning_rate": 9.094972627762326e-06, + "loss": 0.7147, + "step": 2079 + }, + { + "epoch": 0.22, + "grad_norm": 2.137643134632365, + "learning_rate": 9.093994495777014e-06, + "loss": 0.7239, + "step": 2080 + }, + { + "epoch": 0.22, + "grad_norm": 2.166788507712685, + "learning_rate": 9.093015888163255e-06, + "loss": 0.7105, + "step": 2081 + }, + { + "epoch": 0.22, + "grad_norm": 2.811849358016705, + "learning_rate": 9.09203680503474e-06, + "loss": 0.6821, + "step": 2082 + }, + { + "epoch": 0.22, + "grad_norm": 2.196127603607177, + "learning_rate": 9.091057246505221e-06, + "loss": 0.7968, + "step": 2083 + }, + { + "epoch": 0.22, + "grad_norm": 9.922784029473444, + "learning_rate": 9.090077212688496e-06, + "loss": 0.7587, + "step": 2084 + }, + { + "epoch": 0.22, + "grad_norm": 2.039930805628341, + "learning_rate": 9.089096703698423e-06, + "loss": 0.6852, + "step": 2085 + }, + { + "epoch": 0.22, + "grad_norm": 30.95824912540301, + "learning_rate": 9.088115719648917e-06, + "loss": 0.7022, + "step": 2086 + }, + { + "epoch": 0.22, + "grad_norm": 2.181623054651788, + "learning_rate": 9.087134260653943e-06, + "loss": 0.6246, + "step": 2087 + }, + { + "epoch": 0.22, + "grad_norm": 1.9997909852846925, + "learning_rate": 9.086152326827527e-06, + "loss": 0.6697, + "step": 2088 + }, + { + "epoch": 0.22, + "grad_norm": 2.395290205085553, + "learning_rate": 9.085169918283744e-06, + "loss": 0.6567, + "step": 2089 + }, + { + "epoch": 0.22, + "grad_norm": 2.836594783866522, + "learning_rate": 9.084187035136727e-06, + "loss": 0.7324, + "step": 2090 + }, + { + "epoch": 0.22, + "grad_norm": 2.3072720294313496, + "learning_rate": 9.08320367750067e-06, + "loss": 0.6529, + "step": 2091 + }, + { + "epoch": 0.22, + "grad_norm": 1.8931936091262302, + "learning_rate": 9.08221984548981e-06, + "loss": 0.7267, + "step": 2092 + }, + { + "epoch": 0.22, + "grad_norm": 2.4682881700848234, + "learning_rate": 9.081235539218451e-06, + "loss": 0.6981, + "step": 2093 + }, + { + "epoch": 0.22, + "grad_norm": 2.1040997554296026, + "learning_rate": 9.080250758800944e-06, + "loss": 0.7395, + "step": 2094 + }, + { + "epoch": 0.22, + "grad_norm": 2.148229839051349, + "learning_rate": 9.0792655043517e-06, + "loss": 0.6395, + "step": 2095 + }, + { + "epoch": 0.22, + "grad_norm": 2.008482369488449, + "learning_rate": 9.078279775985179e-06, + "loss": 0.665, + "step": 2096 + }, + { + "epoch": 0.22, + "grad_norm": 2.414670215539318, + "learning_rate": 9.077293573815905e-06, + "loss": 0.7037, + "step": 2097 + }, + { + "epoch": 0.22, + "grad_norm": 2.6045420628577474, + "learning_rate": 9.07630689795845e-06, + "loss": 0.6876, + "step": 2098 + }, + { + "epoch": 0.22, + "grad_norm": 3.089544444738824, + "learning_rate": 9.075319748527442e-06, + "loss": 0.6634, + "step": 2099 + }, + { + "epoch": 0.22, + "grad_norm": 7.068471813524876, + "learning_rate": 9.074332125637564e-06, + "loss": 0.7743, + "step": 2100 + }, + { + "epoch": 0.22, + "grad_norm": 10.390556215843803, + "learning_rate": 9.073344029403562e-06, + "loss": 0.6952, + "step": 2101 + }, + { + "epoch": 0.22, + "grad_norm": 2.5241030806585005, + "learning_rate": 9.072355459940222e-06, + "loss": 0.7437, + "step": 2102 + }, + { + "epoch": 0.22, + "grad_norm": 3.752003680927352, + "learning_rate": 9.071366417362398e-06, + "loss": 0.7585, + "step": 2103 + }, + { + "epoch": 0.22, + "grad_norm": 2.5306889032364492, + "learning_rate": 9.070376901784992e-06, + "loss": 0.7722, + "step": 2104 + }, + { + "epoch": 0.22, + "grad_norm": 2.5152078389519015, + "learning_rate": 9.069386913322964e-06, + "loss": 0.7526, + "step": 2105 + }, + { + "epoch": 0.22, + "grad_norm": 2.300437974231803, + "learning_rate": 9.068396452091328e-06, + "loss": 0.7412, + "step": 2106 + }, + { + "epoch": 0.22, + "grad_norm": 2.966239911106796, + "learning_rate": 9.067405518205153e-06, + "loss": 0.6955, + "step": 2107 + }, + { + "epoch": 0.22, + "grad_norm": 2.374383053660284, + "learning_rate": 9.066414111779562e-06, + "loss": 0.7065, + "step": 2108 + }, + { + "epoch": 0.22, + "grad_norm": 1.95341575255716, + "learning_rate": 9.065422232929735e-06, + "loss": 0.6911, + "step": 2109 + }, + { + "epoch": 0.22, + "grad_norm": 1.38959249885153, + "learning_rate": 9.064429881770905e-06, + "loss": 0.5927, + "step": 2110 + }, + { + "epoch": 0.22, + "grad_norm": 2.354874954642544, + "learning_rate": 9.063437058418361e-06, + "loss": 0.6709, + "step": 2111 + }, + { + "epoch": 0.22, + "grad_norm": 3.1182596136285308, + "learning_rate": 9.062443762987442e-06, + "loss": 0.7256, + "step": 2112 + }, + { + "epoch": 0.22, + "grad_norm": 2.3451511992294085, + "learning_rate": 9.061449995593554e-06, + "loss": 0.6163, + "step": 2113 + }, + { + "epoch": 0.22, + "grad_norm": 2.6148189214617816, + "learning_rate": 9.060455756352144e-06, + "loss": 0.7164, + "step": 2114 + }, + { + "epoch": 0.22, + "grad_norm": 2.984293021281382, + "learning_rate": 9.059461045378723e-06, + "loss": 0.5894, + "step": 2115 + }, + { + "epoch": 0.22, + "grad_norm": 2.254747474890875, + "learning_rate": 9.058465862788852e-06, + "loss": 0.6708, + "step": 2116 + }, + { + "epoch": 0.22, + "grad_norm": 2.363609404179215, + "learning_rate": 9.05747020869815e-06, + "loss": 0.6888, + "step": 2117 + }, + { + "epoch": 0.22, + "grad_norm": 2.37996717795513, + "learning_rate": 9.056474083222286e-06, + "loss": 0.5809, + "step": 2118 + }, + { + "epoch": 0.22, + "grad_norm": 2.7306453255932848, + "learning_rate": 9.055477486476992e-06, + "loss": 0.7184, + "step": 2119 + }, + { + "epoch": 0.22, + "grad_norm": 2.3108089867660713, + "learning_rate": 9.054480418578044e-06, + "loss": 0.6179, + "step": 2120 + }, + { + "epoch": 0.22, + "grad_norm": 2.5666873554391665, + "learning_rate": 9.053482879641283e-06, + "loss": 0.701, + "step": 2121 + }, + { + "epoch": 0.22, + "grad_norm": 2.492602226878482, + "learning_rate": 9.052484869782597e-06, + "loss": 0.6191, + "step": 2122 + }, + { + "epoch": 0.22, + "grad_norm": 9.660745248304112, + "learning_rate": 9.051486389117933e-06, + "loss": 0.6498, + "step": 2123 + }, + { + "epoch": 0.22, + "grad_norm": 2.593579048719051, + "learning_rate": 9.050487437763294e-06, + "loss": 0.7411, + "step": 2124 + }, + { + "epoch": 0.22, + "grad_norm": 2.193130270383422, + "learning_rate": 9.049488015834731e-06, + "loss": 0.6509, + "step": 2125 + }, + { + "epoch": 0.22, + "grad_norm": 2.286545999733999, + "learning_rate": 9.048488123448357e-06, + "loss": 0.7333, + "step": 2126 + }, + { + "epoch": 0.22, + "grad_norm": 2.7285901550002145, + "learning_rate": 9.047487760720338e-06, + "loss": 0.755, + "step": 2127 + }, + { + "epoch": 0.22, + "grad_norm": 2.5323414148927963, + "learning_rate": 9.046486927766889e-06, + "loss": 0.6686, + "step": 2128 + }, + { + "epoch": 0.22, + "grad_norm": 1.2528013077555502, + "learning_rate": 9.045485624704287e-06, + "loss": 0.6362, + "step": 2129 + }, + { + "epoch": 0.22, + "grad_norm": 2.170043281237849, + "learning_rate": 9.044483851648858e-06, + "loss": 0.6531, + "step": 2130 + }, + { + "epoch": 0.22, + "grad_norm": 2.537814352100077, + "learning_rate": 9.043481608716987e-06, + "loss": 0.7327, + "step": 2131 + }, + { + "epoch": 0.22, + "grad_norm": 2.1536774455658683, + "learning_rate": 9.042478896025113e-06, + "loss": 0.737, + "step": 2132 + }, + { + "epoch": 0.22, + "grad_norm": 2.4395488467277504, + "learning_rate": 9.041475713689725e-06, + "loss": 0.7198, + "step": 2133 + }, + { + "epoch": 0.22, + "grad_norm": 3.612160402864214, + "learning_rate": 9.04047206182737e-06, + "loss": 0.7047, + "step": 2134 + }, + { + "epoch": 0.22, + "grad_norm": 2.361860087108175, + "learning_rate": 9.039467940554651e-06, + "loss": 0.8051, + "step": 2135 + }, + { + "epoch": 0.22, + "grad_norm": 1.1118384611691083, + "learning_rate": 9.038463349988226e-06, + "loss": 0.6651, + "step": 2136 + }, + { + "epoch": 0.22, + "grad_norm": 2.1279878773488656, + "learning_rate": 9.0374582902448e-06, + "loss": 0.7212, + "step": 2137 + }, + { + "epoch": 0.22, + "grad_norm": 1.984425840534222, + "learning_rate": 9.036452761441143e-06, + "loss": 0.6806, + "step": 2138 + }, + { + "epoch": 0.23, + "grad_norm": 2.2564393133187943, + "learning_rate": 9.035446763694073e-06, + "loss": 0.6898, + "step": 2139 + }, + { + "epoch": 0.23, + "grad_norm": 3.2675730698127854, + "learning_rate": 9.034440297120461e-06, + "loss": 0.6907, + "step": 2140 + }, + { + "epoch": 0.23, + "grad_norm": 3.1457055260967333, + "learning_rate": 9.03343336183724e-06, + "loss": 0.6275, + "step": 2141 + }, + { + "epoch": 0.23, + "grad_norm": 2.699650466296311, + "learning_rate": 9.032425957961388e-06, + "loss": 0.7142, + "step": 2142 + }, + { + "epoch": 0.23, + "grad_norm": 2.361799060850231, + "learning_rate": 9.031418085609946e-06, + "loss": 0.7878, + "step": 2143 + }, + { + "epoch": 0.23, + "grad_norm": 3.0889387826729044, + "learning_rate": 9.030409744900005e-06, + "loss": 0.7566, + "step": 2144 + }, + { + "epoch": 0.23, + "grad_norm": 2.6149850709834492, + "learning_rate": 9.029400935948712e-06, + "loss": 0.6909, + "step": 2145 + }, + { + "epoch": 0.23, + "grad_norm": 2.102880043260286, + "learning_rate": 9.028391658873264e-06, + "loss": 0.6655, + "step": 2146 + }, + { + "epoch": 0.23, + "grad_norm": 2.6572624912551666, + "learning_rate": 9.027381913790916e-06, + "loss": 0.627, + "step": 2147 + }, + { + "epoch": 0.23, + "grad_norm": 2.6045372231550585, + "learning_rate": 9.026371700818982e-06, + "loss": 0.779, + "step": 2148 + }, + { + "epoch": 0.23, + "grad_norm": 2.662458429721062, + "learning_rate": 9.025361020074823e-06, + "loss": 0.7057, + "step": 2149 + }, + { + "epoch": 0.23, + "grad_norm": 1.8868287840968536, + "learning_rate": 9.024349871675855e-06, + "loss": 0.7235, + "step": 2150 + }, + { + "epoch": 0.23, + "grad_norm": 2.2117676794317105, + "learning_rate": 9.023338255739553e-06, + "loss": 0.7088, + "step": 2151 + }, + { + "epoch": 0.23, + "grad_norm": 2.2014960756353057, + "learning_rate": 9.022326172383444e-06, + "loss": 0.7346, + "step": 2152 + }, + { + "epoch": 0.23, + "grad_norm": 2.14684126625403, + "learning_rate": 9.021313621725106e-06, + "loss": 0.734, + "step": 2153 + }, + { + "epoch": 0.23, + "grad_norm": 2.558372722059645, + "learning_rate": 9.020300603882178e-06, + "loss": 0.6929, + "step": 2154 + }, + { + "epoch": 0.23, + "grad_norm": 2.442785125856375, + "learning_rate": 9.019287118972343e-06, + "loss": 0.7285, + "step": 2155 + }, + { + "epoch": 0.23, + "grad_norm": 2.5672631424976617, + "learning_rate": 9.018273167113354e-06, + "loss": 0.7115, + "step": 2156 + }, + { + "epoch": 0.23, + "grad_norm": 3.9219116603164372, + "learning_rate": 9.017258748423e-06, + "loss": 0.7292, + "step": 2157 + }, + { + "epoch": 0.23, + "grad_norm": 3.1655630933315155, + "learning_rate": 9.01624386301914e-06, + "loss": 0.6373, + "step": 2158 + }, + { + "epoch": 0.23, + "grad_norm": 2.9959938750864805, + "learning_rate": 9.015228511019678e-06, + "loss": 0.6972, + "step": 2159 + }, + { + "epoch": 0.23, + "grad_norm": 2.451365705730853, + "learning_rate": 9.014212692542573e-06, + "loss": 0.7195, + "step": 2160 + }, + { + "epoch": 0.23, + "grad_norm": 2.277660307422319, + "learning_rate": 9.013196407705842e-06, + "loss": 0.704, + "step": 2161 + }, + { + "epoch": 0.23, + "grad_norm": 3.015544240787718, + "learning_rate": 9.012179656627553e-06, + "loss": 0.6184, + "step": 2162 + }, + { + "epoch": 0.23, + "grad_norm": 2.046162331434754, + "learning_rate": 9.011162439425831e-06, + "loss": 0.7308, + "step": 2163 + }, + { + "epoch": 0.23, + "grad_norm": 2.62859526794707, + "learning_rate": 9.010144756218851e-06, + "loss": 0.6979, + "step": 2164 + }, + { + "epoch": 0.23, + "grad_norm": 2.026397922330441, + "learning_rate": 9.009126607124844e-06, + "loss": 0.6918, + "step": 2165 + }, + { + "epoch": 0.23, + "grad_norm": 2.3621280368628996, + "learning_rate": 9.008107992262098e-06, + "loss": 0.7819, + "step": 2166 + }, + { + "epoch": 0.23, + "grad_norm": 2.176552442201047, + "learning_rate": 9.00708891174895e-06, + "loss": 0.7668, + "step": 2167 + }, + { + "epoch": 0.23, + "grad_norm": 3.2070738864736765, + "learning_rate": 9.006069365703799e-06, + "loss": 0.6924, + "step": 2168 + }, + { + "epoch": 0.23, + "grad_norm": 2.168031678031707, + "learning_rate": 9.005049354245088e-06, + "loss": 0.6743, + "step": 2169 + }, + { + "epoch": 0.23, + "grad_norm": 2.8808305867742656, + "learning_rate": 9.004028877491319e-06, + "loss": 0.631, + "step": 2170 + }, + { + "epoch": 0.23, + "grad_norm": 2.295861751174668, + "learning_rate": 9.003007935561052e-06, + "loss": 0.6949, + "step": 2171 + }, + { + "epoch": 0.23, + "grad_norm": 2.49760284826448, + "learning_rate": 9.001986528572892e-06, + "loss": 0.743, + "step": 2172 + }, + { + "epoch": 0.23, + "grad_norm": 7.296535119642908, + "learning_rate": 9.000964656645508e-06, + "loss": 0.7973, + "step": 2173 + }, + { + "epoch": 0.23, + "grad_norm": 2.2214283799588825, + "learning_rate": 8.999942319897615e-06, + "loss": 0.6888, + "step": 2174 + }, + { + "epoch": 0.23, + "grad_norm": 3.1319580111833254, + "learning_rate": 8.998919518447986e-06, + "loss": 0.6926, + "step": 2175 + }, + { + "epoch": 0.23, + "grad_norm": 2.4770875145538556, + "learning_rate": 8.997896252415445e-06, + "loss": 0.7683, + "step": 2176 + }, + { + "epoch": 0.23, + "grad_norm": 2.646693120678769, + "learning_rate": 8.996872521918877e-06, + "loss": 0.8287, + "step": 2177 + }, + { + "epoch": 0.23, + "grad_norm": 2.343228561686917, + "learning_rate": 8.995848327077211e-06, + "loss": 0.7529, + "step": 2178 + }, + { + "epoch": 0.23, + "grad_norm": 5.810342504369392, + "learning_rate": 8.994823668009437e-06, + "loss": 0.7357, + "step": 2179 + }, + { + "epoch": 0.23, + "grad_norm": 2.1482922949969145, + "learning_rate": 8.9937985448346e-06, + "loss": 0.7304, + "step": 2180 + }, + { + "epoch": 0.23, + "grad_norm": 1.8551204580841676, + "learning_rate": 8.992772957671791e-06, + "loss": 0.7101, + "step": 2181 + }, + { + "epoch": 0.23, + "grad_norm": 2.2466577666192475, + "learning_rate": 8.991746906640162e-06, + "loss": 0.6895, + "step": 2182 + }, + { + "epoch": 0.23, + "grad_norm": 2.168650877854395, + "learning_rate": 8.990720391858915e-06, + "loss": 0.6479, + "step": 2183 + }, + { + "epoch": 0.23, + "grad_norm": 3.300343690084461, + "learning_rate": 8.98969341344731e-06, + "loss": 0.6546, + "step": 2184 + }, + { + "epoch": 0.23, + "grad_norm": 2.3637093422065734, + "learning_rate": 8.98866597152466e-06, + "loss": 0.6307, + "step": 2185 + }, + { + "epoch": 0.23, + "grad_norm": 2.754311126650472, + "learning_rate": 8.987638066210325e-06, + "loss": 0.6912, + "step": 2186 + }, + { + "epoch": 0.23, + "grad_norm": 2.1562502830328474, + "learning_rate": 8.986609697623724e-06, + "loss": 0.6785, + "step": 2187 + }, + { + "epoch": 0.23, + "grad_norm": 2.327649052593783, + "learning_rate": 8.985580865884336e-06, + "loss": 0.7294, + "step": 2188 + }, + { + "epoch": 0.23, + "grad_norm": 2.408598312141122, + "learning_rate": 8.984551571111683e-06, + "loss": 0.6296, + "step": 2189 + }, + { + "epoch": 0.23, + "grad_norm": 2.142678163175262, + "learning_rate": 8.983521813425348e-06, + "loss": 0.6944, + "step": 2190 + }, + { + "epoch": 0.23, + "grad_norm": 2.175847871801838, + "learning_rate": 8.982491592944962e-06, + "loss": 0.6382, + "step": 2191 + }, + { + "epoch": 0.23, + "grad_norm": 3.228095121751495, + "learning_rate": 8.981460909790216e-06, + "loss": 0.5974, + "step": 2192 + }, + { + "epoch": 0.23, + "grad_norm": 2.052783456234635, + "learning_rate": 8.98042976408085e-06, + "loss": 0.6731, + "step": 2193 + }, + { + "epoch": 0.23, + "grad_norm": 2.0986872293091468, + "learning_rate": 8.97939815593666e-06, + "loss": 0.6891, + "step": 2194 + }, + { + "epoch": 0.23, + "grad_norm": 2.154720097555765, + "learning_rate": 8.978366085477497e-06, + "loss": 0.6185, + "step": 2195 + }, + { + "epoch": 0.23, + "grad_norm": 2.1403969118588067, + "learning_rate": 8.977333552823261e-06, + "loss": 0.6918, + "step": 2196 + }, + { + "epoch": 0.23, + "grad_norm": 3.33214529211307, + "learning_rate": 8.976300558093911e-06, + "loss": 0.6466, + "step": 2197 + }, + { + "epoch": 0.23, + "grad_norm": 4.260333763778915, + "learning_rate": 8.975267101409458e-06, + "loss": 0.7482, + "step": 2198 + }, + { + "epoch": 0.23, + "grad_norm": 2.0914836565162496, + "learning_rate": 8.974233182889961e-06, + "loss": 0.6254, + "step": 2199 + }, + { + "epoch": 0.23, + "grad_norm": 2.3762255505538676, + "learning_rate": 8.973198802655543e-06, + "loss": 0.7553, + "step": 2200 + }, + { + "epoch": 0.23, + "grad_norm": 2.1282785961850506, + "learning_rate": 8.972163960826375e-06, + "loss": 0.7332, + "step": 2201 + }, + { + "epoch": 0.23, + "grad_norm": 1.9694503720386745, + "learning_rate": 8.971128657522677e-06, + "loss": 0.7163, + "step": 2202 + }, + { + "epoch": 0.23, + "grad_norm": 2.1801548709290546, + "learning_rate": 8.970092892864732e-06, + "loss": 0.7532, + "step": 2203 + }, + { + "epoch": 0.23, + "grad_norm": 2.2491318122078092, + "learning_rate": 8.969056666972874e-06, + "loss": 0.716, + "step": 2204 + }, + { + "epoch": 0.23, + "grad_norm": 2.0893548904362413, + "learning_rate": 8.968019979967482e-06, + "loss": 0.6952, + "step": 2205 + }, + { + "epoch": 0.23, + "grad_norm": 2.2759911727787907, + "learning_rate": 8.966982831969001e-06, + "loss": 0.7345, + "step": 2206 + }, + { + "epoch": 0.23, + "grad_norm": 3.33173403077009, + "learning_rate": 8.965945223097922e-06, + "loss": 0.7495, + "step": 2207 + }, + { + "epoch": 0.23, + "grad_norm": 1.9341550603493136, + "learning_rate": 8.964907153474791e-06, + "loss": 0.7013, + "step": 2208 + }, + { + "epoch": 0.23, + "grad_norm": 2.1147262266169173, + "learning_rate": 8.963868623220208e-06, + "loss": 0.6709, + "step": 2209 + }, + { + "epoch": 0.23, + "grad_norm": 2.8936564339718123, + "learning_rate": 8.962829632454829e-06, + "loss": 0.7317, + "step": 2210 + }, + { + "epoch": 0.23, + "grad_norm": 2.4903063833013555, + "learning_rate": 8.961790181299354e-06, + "loss": 0.8207, + "step": 2211 + }, + { + "epoch": 0.23, + "grad_norm": 2.2804678844589246, + "learning_rate": 8.960750269874552e-06, + "loss": 0.6467, + "step": 2212 + }, + { + "epoch": 0.23, + "grad_norm": 2.4641706147225713, + "learning_rate": 8.959709898301232e-06, + "loss": 0.7512, + "step": 2213 + }, + { + "epoch": 0.23, + "grad_norm": 2.6089124316964005, + "learning_rate": 8.958669066700261e-06, + "loss": 0.6522, + "step": 2214 + }, + { + "epoch": 0.23, + "grad_norm": 2.19069049516705, + "learning_rate": 8.957627775192564e-06, + "loss": 0.7337, + "step": 2215 + }, + { + "epoch": 0.23, + "grad_norm": 2.500190008138319, + "learning_rate": 8.956586023899109e-06, + "loss": 0.681, + "step": 2216 + }, + { + "epoch": 0.23, + "grad_norm": 2.159720426862354, + "learning_rate": 8.95554381294093e-06, + "loss": 0.7571, + "step": 2217 + }, + { + "epoch": 0.23, + "grad_norm": 2.0686321610309095, + "learning_rate": 8.954501142439105e-06, + "loss": 0.6577, + "step": 2218 + }, + { + "epoch": 0.23, + "grad_norm": 2.2475898480773995, + "learning_rate": 8.953458012514766e-06, + "loss": 0.7263, + "step": 2219 + }, + { + "epoch": 0.23, + "grad_norm": 3.4884995247883603, + "learning_rate": 8.952414423289107e-06, + "loss": 0.7057, + "step": 2220 + }, + { + "epoch": 0.23, + "grad_norm": 4.4448700774798064, + "learning_rate": 8.951370374883362e-06, + "loss": 0.7204, + "step": 2221 + }, + { + "epoch": 0.23, + "grad_norm": 2.441270491294434, + "learning_rate": 8.950325867418831e-06, + "loss": 0.7727, + "step": 2222 + }, + { + "epoch": 0.23, + "grad_norm": 2.310391536836991, + "learning_rate": 8.949280901016859e-06, + "loss": 0.7408, + "step": 2223 + }, + { + "epoch": 0.23, + "grad_norm": 1.894271235429747, + "learning_rate": 8.94823547579885e-06, + "loss": 0.6713, + "step": 2224 + }, + { + "epoch": 0.23, + "grad_norm": 2.731116389165696, + "learning_rate": 8.947189591886255e-06, + "loss": 0.7204, + "step": 2225 + }, + { + "epoch": 0.23, + "grad_norm": 2.3805815349526687, + "learning_rate": 8.946143249400582e-06, + "loss": 0.6696, + "step": 2226 + }, + { + "epoch": 0.23, + "grad_norm": 2.354183378868526, + "learning_rate": 8.945096448463397e-06, + "loss": 0.7481, + "step": 2227 + }, + { + "epoch": 0.23, + "grad_norm": 2.189115002118964, + "learning_rate": 8.944049189196308e-06, + "loss": 0.7049, + "step": 2228 + }, + { + "epoch": 0.23, + "grad_norm": 15.927462117356988, + "learning_rate": 8.943001471720987e-06, + "loss": 0.63, + "step": 2229 + }, + { + "epoch": 0.23, + "grad_norm": 7.588326102390013, + "learning_rate": 8.941953296159153e-06, + "loss": 0.7003, + "step": 2230 + }, + { + "epoch": 0.23, + "grad_norm": 3.0744521000216753, + "learning_rate": 8.940904662632579e-06, + "loss": 0.7521, + "step": 2231 + }, + { + "epoch": 0.23, + "grad_norm": 3.4628646582770877, + "learning_rate": 8.939855571263095e-06, + "loss": 0.6593, + "step": 2232 + }, + { + "epoch": 0.23, + "grad_norm": 7.273569477502985, + "learning_rate": 8.938806022172578e-06, + "loss": 0.6958, + "step": 2233 + }, + { + "epoch": 0.24, + "grad_norm": 2.3321350896927435, + "learning_rate": 8.937756015482962e-06, + "loss": 0.706, + "step": 2234 + }, + { + "epoch": 0.24, + "grad_norm": 1.249108717782656, + "learning_rate": 8.936705551316238e-06, + "loss": 0.5939, + "step": 2235 + }, + { + "epoch": 0.24, + "grad_norm": 2.4287842458282207, + "learning_rate": 8.935654629794442e-06, + "loss": 0.7158, + "step": 2236 + }, + { + "epoch": 0.24, + "grad_norm": 2.7772840916896127, + "learning_rate": 8.934603251039667e-06, + "loss": 0.6861, + "step": 2237 + }, + { + "epoch": 0.24, + "grad_norm": 2.350481813294236, + "learning_rate": 8.93355141517406e-06, + "loss": 0.7134, + "step": 2238 + }, + { + "epoch": 0.24, + "grad_norm": 1.9394185187015496, + "learning_rate": 8.932499122319821e-06, + "loss": 0.6629, + "step": 2239 + }, + { + "epoch": 0.24, + "grad_norm": 2.664123857241033, + "learning_rate": 8.931446372599202e-06, + "loss": 0.6152, + "step": 2240 + }, + { + "epoch": 0.24, + "grad_norm": 2.349915878479327, + "learning_rate": 8.930393166134507e-06, + "loss": 0.6989, + "step": 2241 + }, + { + "epoch": 0.24, + "grad_norm": 2.206930105187941, + "learning_rate": 8.929339503048096e-06, + "loss": 0.6799, + "step": 2242 + }, + { + "epoch": 0.24, + "grad_norm": 2.647423217564447, + "learning_rate": 8.92828538346238e-06, + "loss": 0.7513, + "step": 2243 + }, + { + "epoch": 0.24, + "grad_norm": 2.4210465633816582, + "learning_rate": 8.927230807499824e-06, + "loss": 0.6743, + "step": 2244 + }, + { + "epoch": 0.24, + "grad_norm": 2.5877919600311934, + "learning_rate": 8.926175775282946e-06, + "loss": 0.6994, + "step": 2245 + }, + { + "epoch": 0.24, + "grad_norm": 2.158209740707754, + "learning_rate": 8.925120286934315e-06, + "loss": 0.6448, + "step": 2246 + }, + { + "epoch": 0.24, + "grad_norm": 2.3832699071525774, + "learning_rate": 8.924064342576554e-06, + "loss": 0.6684, + "step": 2247 + }, + { + "epoch": 0.24, + "grad_norm": 2.400928597949794, + "learning_rate": 8.923007942332345e-06, + "loss": 0.73, + "step": 2248 + }, + { + "epoch": 0.24, + "grad_norm": 2.9437835667880945, + "learning_rate": 8.92195108632441e-06, + "loss": 0.7273, + "step": 2249 + }, + { + "epoch": 0.24, + "grad_norm": 4.870436670961615, + "learning_rate": 8.920893774675536e-06, + "loss": 0.705, + "step": 2250 + }, + { + "epoch": 0.24, + "grad_norm": 3.3747630286954555, + "learning_rate": 8.919836007508558e-06, + "loss": 0.7267, + "step": 2251 + }, + { + "epoch": 0.24, + "grad_norm": 3.341185883155674, + "learning_rate": 8.918777784946364e-06, + "loss": 0.712, + "step": 2252 + }, + { + "epoch": 0.24, + "grad_norm": 2.0954214888627747, + "learning_rate": 8.917719107111893e-06, + "loss": 0.6333, + "step": 2253 + }, + { + "epoch": 0.24, + "grad_norm": 1.2833413585564724, + "learning_rate": 8.916659974128144e-06, + "loss": 0.6779, + "step": 2254 + }, + { + "epoch": 0.24, + "grad_norm": 2.1835249116285995, + "learning_rate": 8.91560038611816e-06, + "loss": 0.705, + "step": 2255 + }, + { + "epoch": 0.24, + "grad_norm": 7.568232696610119, + "learning_rate": 8.91454034320504e-06, + "loss": 0.6866, + "step": 2256 + }, + { + "epoch": 0.24, + "grad_norm": 2.7059228367154904, + "learning_rate": 8.913479845511942e-06, + "loss": 0.6874, + "step": 2257 + }, + { + "epoch": 0.24, + "grad_norm": 2.1413067070108274, + "learning_rate": 8.912418893162066e-06, + "loss": 0.6511, + "step": 2258 + }, + { + "epoch": 0.24, + "grad_norm": 2.553360298084785, + "learning_rate": 8.91135748627867e-06, + "loss": 0.6896, + "step": 2259 + }, + { + "epoch": 0.24, + "grad_norm": 2.5267231876409033, + "learning_rate": 8.910295624985072e-06, + "loss": 0.7684, + "step": 2260 + }, + { + "epoch": 0.24, + "grad_norm": 2.267588533534899, + "learning_rate": 8.909233309404632e-06, + "loss": 0.7048, + "step": 2261 + }, + { + "epoch": 0.24, + "grad_norm": 6.118523741223378, + "learning_rate": 8.908170539660766e-06, + "loss": 0.7692, + "step": 2262 + }, + { + "epoch": 0.24, + "grad_norm": 3.7350137185652335, + "learning_rate": 8.907107315876942e-06, + "loss": 0.5702, + "step": 2263 + }, + { + "epoch": 0.24, + "grad_norm": 1.2334438560399243, + "learning_rate": 8.906043638176686e-06, + "loss": 0.6064, + "step": 2264 + }, + { + "epoch": 0.24, + "grad_norm": 2.408881941929093, + "learning_rate": 8.90497950668357e-06, + "loss": 0.7069, + "step": 2265 + }, + { + "epoch": 0.24, + "grad_norm": 0.985892002965994, + "learning_rate": 8.903914921521226e-06, + "loss": 0.6155, + "step": 2266 + }, + { + "epoch": 0.24, + "grad_norm": 3.0808888218270587, + "learning_rate": 8.90284988281333e-06, + "loss": 0.6486, + "step": 2267 + }, + { + "epoch": 0.24, + "grad_norm": 2.235696390043599, + "learning_rate": 8.901784390683616e-06, + "loss": 0.7458, + "step": 2268 + }, + { + "epoch": 0.24, + "grad_norm": 2.545711880315011, + "learning_rate": 8.90071844525587e-06, + "loss": 0.6872, + "step": 2269 + }, + { + "epoch": 0.24, + "grad_norm": 3.1494684836406015, + "learning_rate": 8.89965204665393e-06, + "loss": 0.6339, + "step": 2270 + }, + { + "epoch": 0.24, + "grad_norm": 1.2914981848806717, + "learning_rate": 8.898585195001691e-06, + "loss": 0.6513, + "step": 2271 + }, + { + "epoch": 0.24, + "grad_norm": 2.6252182724430613, + "learning_rate": 8.897517890423092e-06, + "loss": 0.6416, + "step": 2272 + }, + { + "epoch": 0.24, + "grad_norm": 2.3145266443168397, + "learning_rate": 8.896450133042132e-06, + "loss": 0.5625, + "step": 2273 + }, + { + "epoch": 0.24, + "grad_norm": 2.9425490217808346, + "learning_rate": 8.895381922982857e-06, + "loss": 0.7486, + "step": 2274 + }, + { + "epoch": 0.24, + "grad_norm": 2.4409922366780656, + "learning_rate": 8.894313260369372e-06, + "loss": 0.7289, + "step": 2275 + }, + { + "epoch": 0.24, + "grad_norm": 2.7296566582591337, + "learning_rate": 8.89324414532583e-06, + "loss": 0.652, + "step": 2276 + }, + { + "epoch": 0.24, + "grad_norm": 2.7073730009269488, + "learning_rate": 8.892174577976438e-06, + "loss": 0.6029, + "step": 2277 + }, + { + "epoch": 0.24, + "grad_norm": 2.984224980813416, + "learning_rate": 8.891104558445454e-06, + "loss": 0.6809, + "step": 2278 + }, + { + "epoch": 0.24, + "grad_norm": 2.5742027243121153, + "learning_rate": 8.890034086857189e-06, + "loss": 0.6906, + "step": 2279 + }, + { + "epoch": 0.24, + "grad_norm": 2.0483532696980427, + "learning_rate": 8.88896316333601e-06, + "loss": 0.7207, + "step": 2280 + }, + { + "epoch": 0.24, + "grad_norm": 2.7333128521952657, + "learning_rate": 8.887891788006334e-06, + "loss": 0.7079, + "step": 2281 + }, + { + "epoch": 0.24, + "grad_norm": 2.860179858149371, + "learning_rate": 8.886819960992626e-06, + "loss": 0.6179, + "step": 2282 + }, + { + "epoch": 0.24, + "grad_norm": 2.398502797457853, + "learning_rate": 8.885747682419413e-06, + "loss": 0.6413, + "step": 2283 + }, + { + "epoch": 0.24, + "grad_norm": 2.3814307011526807, + "learning_rate": 8.884674952411265e-06, + "loss": 0.815, + "step": 2284 + }, + { + "epoch": 0.24, + "grad_norm": 2.91263925062964, + "learning_rate": 8.883601771092812e-06, + "loss": 0.6907, + "step": 2285 + }, + { + "epoch": 0.24, + "grad_norm": 2.470495388049324, + "learning_rate": 8.882528138588729e-06, + "loss": 0.7293, + "step": 2286 + }, + { + "epoch": 0.24, + "grad_norm": 2.206151479410179, + "learning_rate": 8.881454055023752e-06, + "loss": 0.7459, + "step": 2287 + }, + { + "epoch": 0.24, + "grad_norm": 2.432016434550301, + "learning_rate": 8.880379520522664e-06, + "loss": 0.762, + "step": 2288 + }, + { + "epoch": 0.24, + "grad_norm": 2.8140952416127694, + "learning_rate": 8.879304535210298e-06, + "loss": 0.6734, + "step": 2289 + }, + { + "epoch": 0.24, + "grad_norm": 2.473443750742672, + "learning_rate": 8.878229099211548e-06, + "loss": 0.75, + "step": 2290 + }, + { + "epoch": 0.24, + "grad_norm": 4.536037475518659, + "learning_rate": 8.87715321265135e-06, + "loss": 0.6536, + "step": 2291 + }, + { + "epoch": 0.24, + "grad_norm": 2.268832339589105, + "learning_rate": 8.8760768756547e-06, + "loss": 0.7971, + "step": 2292 + }, + { + "epoch": 0.24, + "grad_norm": 2.784421921853551, + "learning_rate": 8.875000088346642e-06, + "loss": 0.7665, + "step": 2293 + }, + { + "epoch": 0.24, + "grad_norm": 2.494207647604574, + "learning_rate": 8.873922850852276e-06, + "loss": 0.5876, + "step": 2294 + }, + { + "epoch": 0.24, + "grad_norm": 4.963714518286977, + "learning_rate": 8.872845163296752e-06, + "loss": 0.7046, + "step": 2295 + }, + { + "epoch": 0.24, + "grad_norm": 3.1676492228416104, + "learning_rate": 8.87176702580527e-06, + "loss": 0.6656, + "step": 2296 + }, + { + "epoch": 0.24, + "grad_norm": 2.5119192830739734, + "learning_rate": 8.87068843850309e-06, + "loss": 0.7083, + "step": 2297 + }, + { + "epoch": 0.24, + "grad_norm": 2.943714911949743, + "learning_rate": 8.869609401515516e-06, + "loss": 0.6959, + "step": 2298 + }, + { + "epoch": 0.24, + "grad_norm": 2.6887677900783884, + "learning_rate": 8.868529914967908e-06, + "loss": 0.6394, + "step": 2299 + }, + { + "epoch": 0.24, + "grad_norm": 2.2402749666716884, + "learning_rate": 8.867449978985676e-06, + "loss": 0.7139, + "step": 2300 + }, + { + "epoch": 0.24, + "grad_norm": 2.8829424956949716, + "learning_rate": 8.866369593694285e-06, + "loss": 0.6697, + "step": 2301 + }, + { + "epoch": 0.24, + "grad_norm": 2.746440356252218, + "learning_rate": 8.865288759219251e-06, + "loss": 0.7202, + "step": 2302 + }, + { + "epoch": 0.24, + "grad_norm": 3.1527046705438075, + "learning_rate": 8.864207475686142e-06, + "loss": 0.724, + "step": 2303 + }, + { + "epoch": 0.24, + "grad_norm": 2.95970505604412, + "learning_rate": 8.86312574322058e-06, + "loss": 0.7037, + "step": 2304 + }, + { + "epoch": 0.24, + "grad_norm": 2.271026223729123, + "learning_rate": 8.862043561948237e-06, + "loss": 0.6285, + "step": 2305 + }, + { + "epoch": 0.24, + "grad_norm": 1.8402186327283752, + "learning_rate": 8.860960931994835e-06, + "loss": 0.6876, + "step": 2306 + }, + { + "epoch": 0.24, + "grad_norm": 3.098504551725227, + "learning_rate": 8.859877853486154e-06, + "loss": 0.7273, + "step": 2307 + }, + { + "epoch": 0.24, + "grad_norm": 2.2831168072415884, + "learning_rate": 8.85879432654802e-06, + "loss": 0.7357, + "step": 2308 + }, + { + "epoch": 0.24, + "grad_norm": 3.372416894210338, + "learning_rate": 8.85771035130632e-06, + "loss": 0.6096, + "step": 2309 + }, + { + "epoch": 0.24, + "grad_norm": 7.296320517752463, + "learning_rate": 8.85662592788698e-06, + "loss": 0.6875, + "step": 2310 + }, + { + "epoch": 0.24, + "grad_norm": 3.8593869238201894, + "learning_rate": 8.855541056415988e-06, + "loss": 0.7534, + "step": 2311 + }, + { + "epoch": 0.24, + "grad_norm": 2.688210282987243, + "learning_rate": 8.854455737019381e-06, + "loss": 0.7085, + "step": 2312 + }, + { + "epoch": 0.24, + "grad_norm": 2.2398910749922645, + "learning_rate": 8.853369969823249e-06, + "loss": 0.7067, + "step": 2313 + }, + { + "epoch": 0.24, + "grad_norm": 2.5141437787544123, + "learning_rate": 8.852283754953734e-06, + "loss": 0.6465, + "step": 2314 + }, + { + "epoch": 0.24, + "grad_norm": 3.3203384815403836, + "learning_rate": 8.851197092537027e-06, + "loss": 0.6464, + "step": 2315 + }, + { + "epoch": 0.24, + "grad_norm": 4.071240596546394, + "learning_rate": 8.850109982699375e-06, + "loss": 0.7057, + "step": 2316 + }, + { + "epoch": 0.24, + "grad_norm": 5.27586745399632, + "learning_rate": 8.849022425567074e-06, + "loss": 0.5833, + "step": 2317 + }, + { + "epoch": 0.24, + "grad_norm": 2.242964035717761, + "learning_rate": 8.847934421266475e-06, + "loss": 0.6137, + "step": 2318 + }, + { + "epoch": 0.24, + "grad_norm": 3.005217367036941, + "learning_rate": 8.846845969923977e-06, + "loss": 0.6083, + "step": 2319 + }, + { + "epoch": 0.24, + "grad_norm": 2.9145747544167695, + "learning_rate": 8.845757071666035e-06, + "loss": 0.6655, + "step": 2320 + }, + { + "epoch": 0.24, + "grad_norm": 3.0295015098864075, + "learning_rate": 8.844667726619153e-06, + "loss": 0.8086, + "step": 2321 + }, + { + "epoch": 0.24, + "grad_norm": 2.2677233953465716, + "learning_rate": 8.843577934909888e-06, + "loss": 0.7134, + "step": 2322 + }, + { + "epoch": 0.24, + "grad_norm": 3.0387979236232696, + "learning_rate": 8.84248769666485e-06, + "loss": 0.6452, + "step": 2323 + }, + { + "epoch": 0.24, + "grad_norm": 2.3092678024403184, + "learning_rate": 8.8413970120107e-06, + "loss": 0.6905, + "step": 2324 + }, + { + "epoch": 0.24, + "grad_norm": 2.441938437783016, + "learning_rate": 8.840305881074147e-06, + "loss": 0.7386, + "step": 2325 + }, + { + "epoch": 0.24, + "grad_norm": 2.299488064549423, + "learning_rate": 8.83921430398196e-06, + "loss": 0.5852, + "step": 2326 + }, + { + "epoch": 0.24, + "grad_norm": 2.346680432521479, + "learning_rate": 8.838122280860953e-06, + "loss": 0.6816, + "step": 2327 + }, + { + "epoch": 0.24, + "grad_norm": 1.1971116175516738, + "learning_rate": 8.837029811837991e-06, + "loss": 0.6475, + "step": 2328 + }, + { + "epoch": 0.25, + "grad_norm": 3.033820645161564, + "learning_rate": 8.83593689704e-06, + "loss": 0.6569, + "step": 2329 + }, + { + "epoch": 0.25, + "grad_norm": 2.2373130942751507, + "learning_rate": 8.834843536593949e-06, + "loss": 0.7147, + "step": 2330 + }, + { + "epoch": 0.25, + "grad_norm": 2.462343548899127, + "learning_rate": 8.833749730626862e-06, + "loss": 0.6969, + "step": 2331 + }, + { + "epoch": 0.25, + "grad_norm": 3.597236651843794, + "learning_rate": 8.832655479265812e-06, + "loss": 0.6882, + "step": 2332 + }, + { + "epoch": 0.25, + "grad_norm": 1.2374268550953933, + "learning_rate": 8.831560782637929e-06, + "loss": 0.562, + "step": 2333 + }, + { + "epoch": 0.25, + "grad_norm": 2.383491282270233, + "learning_rate": 8.830465640870388e-06, + "loss": 0.7145, + "step": 2334 + }, + { + "epoch": 0.25, + "grad_norm": 2.1925762438333, + "learning_rate": 8.829370054090423e-06, + "loss": 0.6832, + "step": 2335 + }, + { + "epoch": 0.25, + "grad_norm": 3.41126511819354, + "learning_rate": 8.828274022425316e-06, + "loss": 0.7105, + "step": 2336 + }, + { + "epoch": 0.25, + "grad_norm": 2.863081103637933, + "learning_rate": 8.827177546002398e-06, + "loss": 0.7167, + "step": 2337 + }, + { + "epoch": 0.25, + "grad_norm": 2.3220167881162324, + "learning_rate": 8.826080624949056e-06, + "loss": 0.6032, + "step": 2338 + }, + { + "epoch": 0.25, + "grad_norm": 2.4710434596501796, + "learning_rate": 8.824983259392727e-06, + "loss": 0.6848, + "step": 2339 + }, + { + "epoch": 0.25, + "grad_norm": 2.0728765590634004, + "learning_rate": 8.823885449460899e-06, + "loss": 0.6752, + "step": 2340 + }, + { + "epoch": 0.25, + "grad_norm": 2.173323985472954, + "learning_rate": 8.822787195281114e-06, + "loss": 0.6507, + "step": 2341 + }, + { + "epoch": 0.25, + "grad_norm": 3.022192983168258, + "learning_rate": 8.821688496980964e-06, + "loss": 0.5687, + "step": 2342 + }, + { + "epoch": 0.25, + "grad_norm": 2.321047814509788, + "learning_rate": 8.82058935468809e-06, + "loss": 0.7194, + "step": 2343 + }, + { + "epoch": 0.25, + "grad_norm": 2.4035818387743038, + "learning_rate": 8.819489768530192e-06, + "loss": 0.6671, + "step": 2344 + }, + { + "epoch": 0.25, + "grad_norm": 3.2287384454438013, + "learning_rate": 8.818389738635012e-06, + "loss": 0.6707, + "step": 2345 + }, + { + "epoch": 0.25, + "grad_norm": 2.1051049030090847, + "learning_rate": 8.817289265130348e-06, + "loss": 0.7092, + "step": 2346 + }, + { + "epoch": 0.25, + "grad_norm": 2.6891874499459885, + "learning_rate": 8.816188348144054e-06, + "loss": 0.7263, + "step": 2347 + }, + { + "epoch": 0.25, + "grad_norm": 1.8695048876833085, + "learning_rate": 8.815086987804029e-06, + "loss": 0.661, + "step": 2348 + }, + { + "epoch": 0.25, + "grad_norm": 2.307276114427358, + "learning_rate": 8.813985184238226e-06, + "loss": 0.7653, + "step": 2349 + }, + { + "epoch": 0.25, + "grad_norm": 2.273797735104588, + "learning_rate": 8.81288293757465e-06, + "loss": 0.6886, + "step": 2350 + }, + { + "epoch": 0.25, + "grad_norm": 2.8942145715992846, + "learning_rate": 8.811780247941354e-06, + "loss": 0.7078, + "step": 2351 + }, + { + "epoch": 0.25, + "grad_norm": 2.3063601691994826, + "learning_rate": 8.810677115466451e-06, + "loss": 0.677, + "step": 2352 + }, + { + "epoch": 0.25, + "grad_norm": 2.1245376854410516, + "learning_rate": 8.809573540278094e-06, + "loss": 0.6966, + "step": 2353 + }, + { + "epoch": 0.25, + "grad_norm": 2.8737504795386495, + "learning_rate": 8.808469522504495e-06, + "loss": 0.6428, + "step": 2354 + }, + { + "epoch": 0.25, + "grad_norm": 2.578969771967044, + "learning_rate": 8.807365062273917e-06, + "loss": 0.7379, + "step": 2355 + }, + { + "epoch": 0.25, + "grad_norm": 2.771616985464036, + "learning_rate": 8.806260159714672e-06, + "loss": 0.5943, + "step": 2356 + }, + { + "epoch": 0.25, + "grad_norm": 2.0923888169384046, + "learning_rate": 8.805154814955124e-06, + "loss": 0.6965, + "step": 2357 + }, + { + "epoch": 0.25, + "grad_norm": 2.4647946308004243, + "learning_rate": 8.80404902812369e-06, + "loss": 0.6628, + "step": 2358 + }, + { + "epoch": 0.25, + "grad_norm": 2.8740732083977183, + "learning_rate": 8.802942799348836e-06, + "loss": 0.7136, + "step": 2359 + }, + { + "epoch": 0.25, + "grad_norm": 2.023161383444029, + "learning_rate": 8.80183612875908e-06, + "loss": 0.6532, + "step": 2360 + }, + { + "epoch": 0.25, + "grad_norm": 2.2951972055338783, + "learning_rate": 8.800729016482993e-06, + "loss": 0.7056, + "step": 2361 + }, + { + "epoch": 0.25, + "grad_norm": 3.005412561447531, + "learning_rate": 8.799621462649198e-06, + "loss": 0.6511, + "step": 2362 + }, + { + "epoch": 0.25, + "grad_norm": 2.347273517470351, + "learning_rate": 8.798513467386361e-06, + "loss": 0.6972, + "step": 2363 + }, + { + "epoch": 0.25, + "grad_norm": 1.2061420190350576, + "learning_rate": 8.797405030823212e-06, + "loss": 0.6577, + "step": 2364 + }, + { + "epoch": 0.25, + "grad_norm": 2.209573303684799, + "learning_rate": 8.796296153088523e-06, + "loss": 0.6624, + "step": 2365 + }, + { + "epoch": 0.25, + "grad_norm": 1.0157084559249987, + "learning_rate": 8.79518683431112e-06, + "loss": 0.6218, + "step": 2366 + }, + { + "epoch": 0.25, + "grad_norm": 2.6605469423214507, + "learning_rate": 8.794077074619884e-06, + "loss": 0.6776, + "step": 2367 + }, + { + "epoch": 0.25, + "grad_norm": 2.329846735401453, + "learning_rate": 8.79296687414374e-06, + "loss": 0.6263, + "step": 2368 + }, + { + "epoch": 0.25, + "grad_norm": 2.16211910196471, + "learning_rate": 8.791856233011668e-06, + "loss": 0.7662, + "step": 2369 + }, + { + "epoch": 0.25, + "grad_norm": 2.5132087358973325, + "learning_rate": 8.7907451513527e-06, + "loss": 0.5662, + "step": 2370 + }, + { + "epoch": 0.25, + "grad_norm": 2.4691914644939135, + "learning_rate": 8.78963362929592e-06, + "loss": 0.6809, + "step": 2371 + }, + { + "epoch": 0.25, + "grad_norm": 2.517049664768459, + "learning_rate": 8.788521666970458e-06, + "loss": 0.6717, + "step": 2372 + }, + { + "epoch": 0.25, + "grad_norm": 2.083726661084472, + "learning_rate": 8.7874092645055e-06, + "loss": 0.6727, + "step": 2373 + }, + { + "epoch": 0.25, + "grad_norm": 3.259534102965764, + "learning_rate": 8.786296422030283e-06, + "loss": 0.7514, + "step": 2374 + }, + { + "epoch": 0.25, + "grad_norm": 5.0201738332244945, + "learning_rate": 8.785183139674093e-06, + "loss": 0.7253, + "step": 2375 + }, + { + "epoch": 0.25, + "grad_norm": 2.7540666834409913, + "learning_rate": 8.784069417566268e-06, + "loss": 0.6923, + "step": 2376 + }, + { + "epoch": 0.25, + "grad_norm": 2.4727794350012213, + "learning_rate": 8.782955255836194e-06, + "loss": 0.6761, + "step": 2377 + }, + { + "epoch": 0.25, + "grad_norm": 2.642090445706794, + "learning_rate": 8.781840654613317e-06, + "loss": 0.6928, + "step": 2378 + }, + { + "epoch": 0.25, + "grad_norm": 2.3378650663235514, + "learning_rate": 8.780725614027123e-06, + "loss": 0.6407, + "step": 2379 + }, + { + "epoch": 0.25, + "grad_norm": 5.5463507569750945, + "learning_rate": 8.779610134207157e-06, + "loss": 0.731, + "step": 2380 + }, + { + "epoch": 0.25, + "grad_norm": 2.476687518226198, + "learning_rate": 8.778494215283011e-06, + "loss": 0.6693, + "step": 2381 + }, + { + "epoch": 0.25, + "grad_norm": 2.6190587133475143, + "learning_rate": 8.777377857384329e-06, + "loss": 0.7043, + "step": 2382 + }, + { + "epoch": 0.25, + "grad_norm": 2.276304071392182, + "learning_rate": 8.776261060640807e-06, + "loss": 0.6189, + "step": 2383 + }, + { + "epoch": 0.25, + "grad_norm": 2.5067148512484305, + "learning_rate": 8.775143825182192e-06, + "loss": 0.6429, + "step": 2384 + }, + { + "epoch": 0.25, + "grad_norm": 2.533233559475545, + "learning_rate": 8.77402615113828e-06, + "loss": 0.7218, + "step": 2385 + }, + { + "epoch": 0.25, + "grad_norm": 2.3752543251935956, + "learning_rate": 8.77290803863892e-06, + "loss": 0.6615, + "step": 2386 + }, + { + "epoch": 0.25, + "grad_norm": 2.6509057573466417, + "learning_rate": 8.771789487814009e-06, + "loss": 0.6687, + "step": 2387 + }, + { + "epoch": 0.25, + "grad_norm": 2.322946052629726, + "learning_rate": 8.770670498793498e-06, + "loss": 0.71, + "step": 2388 + }, + { + "epoch": 0.25, + "grad_norm": 2.555317772836953, + "learning_rate": 8.76955107170739e-06, + "loss": 0.6629, + "step": 2389 + }, + { + "epoch": 0.25, + "grad_norm": 2.0462771901928227, + "learning_rate": 8.768431206685735e-06, + "loss": 0.6768, + "step": 2390 + }, + { + "epoch": 0.25, + "grad_norm": 2.4959674203980957, + "learning_rate": 8.767310903858635e-06, + "loss": 0.7051, + "step": 2391 + }, + { + "epoch": 0.25, + "grad_norm": 2.400493851416112, + "learning_rate": 8.766190163356243e-06, + "loss": 0.7141, + "step": 2392 + }, + { + "epoch": 0.25, + "grad_norm": 2.447160327467119, + "learning_rate": 8.765068985308768e-06, + "loss": 0.6782, + "step": 2393 + }, + { + "epoch": 0.25, + "grad_norm": 2.205789371123814, + "learning_rate": 8.76394736984646e-06, + "loss": 0.6606, + "step": 2394 + }, + { + "epoch": 0.25, + "grad_norm": 3.2524271530733375, + "learning_rate": 8.762825317099628e-06, + "loss": 0.623, + "step": 2395 + }, + { + "epoch": 0.25, + "grad_norm": 2.1224476313578493, + "learning_rate": 8.761702827198626e-06, + "loss": 0.6808, + "step": 2396 + }, + { + "epoch": 0.25, + "grad_norm": 1.2653362778423383, + "learning_rate": 8.760579900273865e-06, + "loss": 0.636, + "step": 2397 + }, + { + "epoch": 0.25, + "grad_norm": 3.2781604321793023, + "learning_rate": 8.759456536455802e-06, + "loss": 0.6624, + "step": 2398 + }, + { + "epoch": 0.25, + "grad_norm": 2.4477894582450905, + "learning_rate": 8.758332735874946e-06, + "loss": 0.6002, + "step": 2399 + }, + { + "epoch": 0.25, + "grad_norm": 3.197585400074303, + "learning_rate": 8.757208498661857e-06, + "loss": 0.6718, + "step": 2400 + }, + { + "epoch": 0.25, + "grad_norm": 2.278739347444417, + "learning_rate": 8.756083824947145e-06, + "loss": 0.7983, + "step": 2401 + }, + { + "epoch": 0.25, + "grad_norm": 2.585442200635953, + "learning_rate": 8.754958714861474e-06, + "loss": 0.7644, + "step": 2402 + }, + { + "epoch": 0.25, + "grad_norm": 3.0314353743944635, + "learning_rate": 8.753833168535551e-06, + "loss": 0.7351, + "step": 2403 + }, + { + "epoch": 0.25, + "grad_norm": 2.7580999310075356, + "learning_rate": 8.752707186100144e-06, + "loss": 0.6929, + "step": 2404 + }, + { + "epoch": 0.25, + "grad_norm": 2.720900436087719, + "learning_rate": 8.751580767686063e-06, + "loss": 0.6816, + "step": 2405 + }, + { + "epoch": 0.25, + "grad_norm": 2.7870764847387455, + "learning_rate": 8.750453913424172e-06, + "loss": 0.6466, + "step": 2406 + }, + { + "epoch": 0.25, + "grad_norm": 2.5240294889336234, + "learning_rate": 8.74932662344539e-06, + "loss": 0.6982, + "step": 2407 + }, + { + "epoch": 0.25, + "grad_norm": 12.532901910391917, + "learning_rate": 8.748198897880677e-06, + "loss": 0.68, + "step": 2408 + }, + { + "epoch": 0.25, + "grad_norm": 2.2585279019476845, + "learning_rate": 8.747070736861052e-06, + "loss": 0.6859, + "step": 2409 + }, + { + "epoch": 0.25, + "grad_norm": 2.950693720936932, + "learning_rate": 8.745942140517579e-06, + "loss": 0.6846, + "step": 2410 + }, + { + "epoch": 0.25, + "grad_norm": 2.8818540494898084, + "learning_rate": 8.744813108981377e-06, + "loss": 0.7454, + "step": 2411 + }, + { + "epoch": 0.25, + "grad_norm": 2.03077990529205, + "learning_rate": 8.743683642383613e-06, + "loss": 0.7515, + "step": 2412 + }, + { + "epoch": 0.25, + "grad_norm": 2.3199732724761954, + "learning_rate": 8.742553740855507e-06, + "loss": 0.6335, + "step": 2413 + }, + { + "epoch": 0.25, + "grad_norm": 2.5030676274429786, + "learning_rate": 8.741423404528325e-06, + "loss": 0.6373, + "step": 2414 + }, + { + "epoch": 0.25, + "grad_norm": 3.576788831727748, + "learning_rate": 8.740292633533387e-06, + "loss": 0.6511, + "step": 2415 + }, + { + "epoch": 0.25, + "grad_norm": 2.292929750954021, + "learning_rate": 8.739161428002061e-06, + "loss": 0.6665, + "step": 2416 + }, + { + "epoch": 0.25, + "grad_norm": 2.495581313970782, + "learning_rate": 8.738029788065772e-06, + "loss": 0.6986, + "step": 2417 + }, + { + "epoch": 0.25, + "grad_norm": 2.7267216304430106, + "learning_rate": 8.736897713855988e-06, + "loss": 0.6805, + "step": 2418 + }, + { + "epoch": 0.25, + "grad_norm": 2.5310007726717654, + "learning_rate": 8.735765205504228e-06, + "loss": 0.6876, + "step": 2419 + }, + { + "epoch": 0.25, + "grad_norm": 1.2473961744466824, + "learning_rate": 8.734632263142066e-06, + "loss": 0.626, + "step": 2420 + }, + { + "epoch": 0.25, + "grad_norm": 2.6025083204104194, + "learning_rate": 8.733498886901123e-06, + "loss": 0.6504, + "step": 2421 + }, + { + "epoch": 0.25, + "grad_norm": 2.885692680206437, + "learning_rate": 8.732365076913072e-06, + "loss": 0.6555, + "step": 2422 + }, + { + "epoch": 0.25, + "grad_norm": 4.032356607171435, + "learning_rate": 8.731230833309637e-06, + "loss": 0.6401, + "step": 2423 + }, + { + "epoch": 0.26, + "grad_norm": 2.1972012220479074, + "learning_rate": 8.730096156222586e-06, + "loss": 0.6757, + "step": 2424 + }, + { + "epoch": 0.26, + "grad_norm": 2.291089360356186, + "learning_rate": 8.728961045783751e-06, + "loss": 0.7047, + "step": 2425 + }, + { + "epoch": 0.26, + "grad_norm": 2.2487186243095816, + "learning_rate": 8.727825502124998e-06, + "loss": 0.7178, + "step": 2426 + }, + { + "epoch": 0.26, + "grad_norm": 2.417453807033552, + "learning_rate": 8.726689525378254e-06, + "loss": 0.6669, + "step": 2427 + }, + { + "epoch": 0.26, + "grad_norm": 2.3540762968676656, + "learning_rate": 8.725553115675496e-06, + "loss": 0.5727, + "step": 2428 + }, + { + "epoch": 0.26, + "grad_norm": 4.006417010869224, + "learning_rate": 8.724416273148745e-06, + "loss": 0.6692, + "step": 2429 + }, + { + "epoch": 0.26, + "grad_norm": 2.24813269883343, + "learning_rate": 8.723278997930078e-06, + "loss": 0.6947, + "step": 2430 + }, + { + "epoch": 0.26, + "grad_norm": 2.568072157207673, + "learning_rate": 8.722141290151618e-06, + "loss": 0.6721, + "step": 2431 + }, + { + "epoch": 0.26, + "grad_norm": 2.681084669318892, + "learning_rate": 8.721003149945545e-06, + "loss": 0.7217, + "step": 2432 + }, + { + "epoch": 0.26, + "grad_norm": 2.5296915450142516, + "learning_rate": 8.719864577444082e-06, + "loss": 0.6596, + "step": 2433 + }, + { + "epoch": 0.26, + "grad_norm": 2.750107096539297, + "learning_rate": 8.718725572779505e-06, + "loss": 0.835, + "step": 2434 + }, + { + "epoch": 0.26, + "grad_norm": 2.444233296643607, + "learning_rate": 8.71758613608414e-06, + "loss": 0.7186, + "step": 2435 + }, + { + "epoch": 0.26, + "grad_norm": 2.270574624183063, + "learning_rate": 8.716446267490365e-06, + "loss": 0.7578, + "step": 2436 + }, + { + "epoch": 0.26, + "grad_norm": 3.8389444303037235, + "learning_rate": 8.715305967130604e-06, + "loss": 0.734, + "step": 2437 + }, + { + "epoch": 0.26, + "grad_norm": 1.1979511502659714, + "learning_rate": 8.71416523513734e-06, + "loss": 0.6708, + "step": 2438 + }, + { + "epoch": 0.26, + "grad_norm": 2.8417444339853852, + "learning_rate": 8.713024071643092e-06, + "loss": 0.6287, + "step": 2439 + }, + { + "epoch": 0.26, + "grad_norm": 3.0558749007403, + "learning_rate": 8.71188247678044e-06, + "loss": 0.7621, + "step": 2440 + }, + { + "epoch": 0.26, + "grad_norm": 2.291175152328059, + "learning_rate": 8.710740450682013e-06, + "loss": 0.6667, + "step": 2441 + }, + { + "epoch": 0.26, + "grad_norm": 2.141308772601036, + "learning_rate": 8.709597993480489e-06, + "loss": 0.6674, + "step": 2442 + }, + { + "epoch": 0.26, + "grad_norm": 2.0735451596302137, + "learning_rate": 8.708455105308591e-06, + "loss": 0.6599, + "step": 2443 + }, + { + "epoch": 0.26, + "grad_norm": 2.481850558377454, + "learning_rate": 8.7073117862991e-06, + "loss": 0.783, + "step": 2444 + }, + { + "epoch": 0.26, + "grad_norm": 1.9268542381826397, + "learning_rate": 8.706168036584843e-06, + "loss": 0.6356, + "step": 2445 + }, + { + "epoch": 0.26, + "grad_norm": 4.349321958616555, + "learning_rate": 8.705023856298695e-06, + "loss": 0.611, + "step": 2446 + }, + { + "epoch": 0.26, + "grad_norm": 3.1941989903979624, + "learning_rate": 8.703879245573588e-06, + "loss": 0.6716, + "step": 2447 + }, + { + "epoch": 0.26, + "grad_norm": 2.1349012651457664, + "learning_rate": 8.702734204542494e-06, + "loss": 0.7377, + "step": 2448 + }, + { + "epoch": 0.26, + "grad_norm": 2.648744197677024, + "learning_rate": 8.701588733338446e-06, + "loss": 0.7413, + "step": 2449 + }, + { + "epoch": 0.26, + "grad_norm": 3.3362813946257623, + "learning_rate": 8.700442832094517e-06, + "loss": 0.5947, + "step": 2450 + }, + { + "epoch": 0.26, + "grad_norm": 2.0804046111905614, + "learning_rate": 8.699296500943839e-06, + "loss": 0.6786, + "step": 2451 + }, + { + "epoch": 0.26, + "grad_norm": 2.661217136166758, + "learning_rate": 8.698149740019587e-06, + "loss": 0.7303, + "step": 2452 + }, + { + "epoch": 0.26, + "grad_norm": 2.5190865956506547, + "learning_rate": 8.697002549454988e-06, + "loss": 0.7269, + "step": 2453 + }, + { + "epoch": 0.26, + "grad_norm": 3.070053475596138, + "learning_rate": 8.695854929383318e-06, + "loss": 0.7228, + "step": 2454 + }, + { + "epoch": 0.26, + "grad_norm": 2.441251249535768, + "learning_rate": 8.694706879937909e-06, + "loss": 0.6955, + "step": 2455 + }, + { + "epoch": 0.26, + "grad_norm": 2.2612646874942746, + "learning_rate": 8.693558401252132e-06, + "loss": 0.656, + "step": 2456 + }, + { + "epoch": 0.26, + "grad_norm": 2.4863160385294063, + "learning_rate": 8.69240949345942e-06, + "loss": 0.7101, + "step": 2457 + }, + { + "epoch": 0.26, + "grad_norm": 2.43116626808894, + "learning_rate": 8.691260156693245e-06, + "loss": 0.6744, + "step": 2458 + }, + { + "epoch": 0.26, + "grad_norm": 2.5200768749768963, + "learning_rate": 8.690110391087134e-06, + "loss": 0.6949, + "step": 2459 + }, + { + "epoch": 0.26, + "grad_norm": 2.525389539161813, + "learning_rate": 8.688960196774668e-06, + "loss": 0.6323, + "step": 2460 + }, + { + "epoch": 0.26, + "grad_norm": 2.102109783358811, + "learning_rate": 8.687809573889467e-06, + "loss": 0.6833, + "step": 2461 + }, + { + "epoch": 0.26, + "grad_norm": 2.222074030086465, + "learning_rate": 8.686658522565211e-06, + "loss": 0.7645, + "step": 2462 + }, + { + "epoch": 0.26, + "grad_norm": 2.6743137482807753, + "learning_rate": 8.685507042935627e-06, + "loss": 0.7335, + "step": 2463 + }, + { + "epoch": 0.26, + "grad_norm": 2.8113533406525595, + "learning_rate": 8.684355135134486e-06, + "loss": 0.6922, + "step": 2464 + }, + { + "epoch": 0.26, + "grad_norm": 1.9458115809108403, + "learning_rate": 8.683202799295616e-06, + "loss": 0.6556, + "step": 2465 + }, + { + "epoch": 0.26, + "grad_norm": 2.1842127186245333, + "learning_rate": 8.682050035552891e-06, + "loss": 0.6102, + "step": 2466 + }, + { + "epoch": 0.26, + "grad_norm": 2.1705972141925733, + "learning_rate": 8.680896844040238e-06, + "loss": 0.706, + "step": 2467 + }, + { + "epoch": 0.26, + "grad_norm": 3.2360348544184308, + "learning_rate": 8.67974322489163e-06, + "loss": 0.7269, + "step": 2468 + }, + { + "epoch": 0.26, + "grad_norm": 2.569034689616941, + "learning_rate": 8.678589178241092e-06, + "loss": 0.737, + "step": 2469 + }, + { + "epoch": 0.26, + "grad_norm": 2.729945172388795, + "learning_rate": 8.677434704222697e-06, + "loss": 0.7642, + "step": 2470 + }, + { + "epoch": 0.26, + "grad_norm": 2.577038874216306, + "learning_rate": 8.676279802970566e-06, + "loss": 0.6741, + "step": 2471 + }, + { + "epoch": 0.26, + "grad_norm": 3.477474813381419, + "learning_rate": 8.675124474618876e-06, + "loss": 0.6524, + "step": 2472 + }, + { + "epoch": 0.26, + "grad_norm": 2.281055787854727, + "learning_rate": 8.673968719301849e-06, + "loss": 0.6976, + "step": 2473 + }, + { + "epoch": 0.26, + "grad_norm": 2.506648427439492, + "learning_rate": 8.672812537153757e-06, + "loss": 0.6998, + "step": 2474 + }, + { + "epoch": 0.26, + "grad_norm": 4.568840294148294, + "learning_rate": 8.67165592830892e-06, + "loss": 0.692, + "step": 2475 + }, + { + "epoch": 0.26, + "grad_norm": 2.257378434912131, + "learning_rate": 8.670498892901712e-06, + "loss": 0.7227, + "step": 2476 + }, + { + "epoch": 0.26, + "grad_norm": 1.9597763138644833, + "learning_rate": 8.669341431066552e-06, + "loss": 0.6658, + "step": 2477 + }, + { + "epoch": 0.26, + "grad_norm": 2.4823038278036846, + "learning_rate": 8.668183542937912e-06, + "loss": 0.6355, + "step": 2478 + }, + { + "epoch": 0.26, + "grad_norm": 2.2040856463646663, + "learning_rate": 8.66702522865031e-06, + "loss": 0.6359, + "step": 2479 + }, + { + "epoch": 0.26, + "grad_norm": 2.7952172523154166, + "learning_rate": 8.66586648833832e-06, + "loss": 0.684, + "step": 2480 + }, + { + "epoch": 0.26, + "grad_norm": 2.400485734318114, + "learning_rate": 8.664707322136556e-06, + "loss": 0.6433, + "step": 2481 + }, + { + "epoch": 0.26, + "grad_norm": 2.306795903415676, + "learning_rate": 8.663547730179692e-06, + "loss": 0.651, + "step": 2482 + }, + { + "epoch": 0.26, + "grad_norm": 4.985126814934233, + "learning_rate": 8.662387712602438e-06, + "loss": 0.6954, + "step": 2483 + }, + { + "epoch": 0.26, + "grad_norm": 2.2505071915852075, + "learning_rate": 8.661227269539572e-06, + "loss": 0.6472, + "step": 2484 + }, + { + "epoch": 0.26, + "grad_norm": 1.8826833418351787, + "learning_rate": 8.660066401125902e-06, + "loss": 0.6311, + "step": 2485 + }, + { + "epoch": 0.26, + "grad_norm": 6.818214783238789, + "learning_rate": 8.658905107496299e-06, + "loss": 0.6617, + "step": 2486 + }, + { + "epoch": 0.26, + "grad_norm": 2.6246974001427716, + "learning_rate": 8.657743388785676e-06, + "loss": 0.7234, + "step": 2487 + }, + { + "epoch": 0.26, + "grad_norm": 2.5053395649894235, + "learning_rate": 8.656581245129e-06, + "loss": 0.7851, + "step": 2488 + }, + { + "epoch": 0.26, + "grad_norm": 2.5788879956613666, + "learning_rate": 8.655418676661285e-06, + "loss": 0.6737, + "step": 2489 + }, + { + "epoch": 0.26, + "grad_norm": 2.5695918451833752, + "learning_rate": 8.654255683517595e-06, + "loss": 0.66, + "step": 2490 + }, + { + "epoch": 0.26, + "grad_norm": 2.2259903673736456, + "learning_rate": 8.653092265833044e-06, + "loss": 0.7476, + "step": 2491 + }, + { + "epoch": 0.26, + "grad_norm": 3.0361177334211376, + "learning_rate": 8.651928423742793e-06, + "loss": 0.6448, + "step": 2492 + }, + { + "epoch": 0.26, + "grad_norm": 2.2871956578155253, + "learning_rate": 8.650764157382054e-06, + "loss": 0.6647, + "step": 2493 + }, + { + "epoch": 0.26, + "grad_norm": 4.278582756268501, + "learning_rate": 8.64959946688609e-06, + "loss": 0.663, + "step": 2494 + }, + { + "epoch": 0.26, + "grad_norm": 2.5143339131874147, + "learning_rate": 8.648434352390209e-06, + "loss": 0.6837, + "step": 2495 + }, + { + "epoch": 0.26, + "grad_norm": 2.4594476956792586, + "learning_rate": 8.64726881402977e-06, + "loss": 0.6644, + "step": 2496 + }, + { + "epoch": 0.26, + "grad_norm": 3.1170695576353, + "learning_rate": 8.646102851940184e-06, + "loss": 0.766, + "step": 2497 + }, + { + "epoch": 0.26, + "grad_norm": 2.2169468874210065, + "learning_rate": 8.64493646625691e-06, + "loss": 0.664, + "step": 2498 + }, + { + "epoch": 0.26, + "grad_norm": 3.0516103481287034, + "learning_rate": 8.643769657115452e-06, + "loss": 0.6429, + "step": 2499 + }, + { + "epoch": 0.26, + "grad_norm": 3.027841909715622, + "learning_rate": 8.642602424651369e-06, + "loss": 0.7947, + "step": 2500 + }, + { + "epoch": 0.26, + "grad_norm": 2.292377856177173, + "learning_rate": 8.641434769000267e-06, + "loss": 0.6661, + "step": 2501 + }, + { + "epoch": 0.26, + "grad_norm": 2.591736810360195, + "learning_rate": 8.640266690297797e-06, + "loss": 0.5915, + "step": 2502 + }, + { + "epoch": 0.26, + "grad_norm": 2.751465380246021, + "learning_rate": 8.639098188679668e-06, + "loss": 0.6897, + "step": 2503 + }, + { + "epoch": 0.26, + "grad_norm": 2.0621395224891077, + "learning_rate": 8.637929264281632e-06, + "loss": 0.6687, + "step": 2504 + }, + { + "epoch": 0.26, + "grad_norm": 1.16826930507742, + "learning_rate": 8.63675991723949e-06, + "loss": 0.6106, + "step": 2505 + }, + { + "epoch": 0.26, + "grad_norm": 3.7332955065147577, + "learning_rate": 8.635590147689092e-06, + "loss": 0.7138, + "step": 2506 + }, + { + "epoch": 0.26, + "grad_norm": 2.7397582619681597, + "learning_rate": 8.634419955766342e-06, + "loss": 0.6154, + "step": 2507 + }, + { + "epoch": 0.26, + "grad_norm": 2.6765754635699213, + "learning_rate": 8.633249341607186e-06, + "loss": 0.6402, + "step": 2508 + }, + { + "epoch": 0.26, + "grad_norm": 1.147939650717752, + "learning_rate": 8.632078305347623e-06, + "loss": 0.6187, + "step": 2509 + }, + { + "epoch": 0.26, + "grad_norm": 2.5410600160091876, + "learning_rate": 8.630906847123704e-06, + "loss": 0.7342, + "step": 2510 + }, + { + "epoch": 0.26, + "grad_norm": 2.2309952636873485, + "learning_rate": 8.629734967071522e-06, + "loss": 0.6337, + "step": 2511 + }, + { + "epoch": 0.26, + "grad_norm": 2.221441936274778, + "learning_rate": 8.628562665327224e-06, + "loss": 0.6374, + "step": 2512 + }, + { + "epoch": 0.26, + "grad_norm": 2.4825649248711654, + "learning_rate": 8.627389942027008e-06, + "loss": 0.7285, + "step": 2513 + }, + { + "epoch": 0.26, + "grad_norm": 3.3071050264366133, + "learning_rate": 8.62621679730711e-06, + "loss": 0.5995, + "step": 2514 + }, + { + "epoch": 0.26, + "grad_norm": 1.9177350717101802, + "learning_rate": 8.62504323130383e-06, + "loss": 0.7131, + "step": 2515 + }, + { + "epoch": 0.26, + "grad_norm": 2.510316012626707, + "learning_rate": 8.623869244153504e-06, + "loss": 0.7768, + "step": 2516 + }, + { + "epoch": 0.26, + "grad_norm": 3.497041444497354, + "learning_rate": 8.622694835992525e-06, + "loss": 0.6689, + "step": 2517 + }, + { + "epoch": 0.26, + "grad_norm": 2.564165380926844, + "learning_rate": 8.621520006957334e-06, + "loss": 0.7526, + "step": 2518 + }, + { + "epoch": 0.27, + "grad_norm": 2.6172118910010704, + "learning_rate": 8.620344757184415e-06, + "loss": 0.6911, + "step": 2519 + }, + { + "epoch": 0.27, + "grad_norm": 2.2241203927304367, + "learning_rate": 8.619169086810308e-06, + "loss": 0.7166, + "step": 2520 + }, + { + "epoch": 0.27, + "grad_norm": 3.3008760240604422, + "learning_rate": 8.6179929959716e-06, + "loss": 0.7909, + "step": 2521 + }, + { + "epoch": 0.27, + "grad_norm": 3.3505911458947626, + "learning_rate": 8.616816484804922e-06, + "loss": 0.7658, + "step": 2522 + }, + { + "epoch": 0.27, + "grad_norm": 4.268553301879235, + "learning_rate": 8.615639553446961e-06, + "loss": 0.642, + "step": 2523 + }, + { + "epoch": 0.27, + "grad_norm": 2.9320366440133188, + "learning_rate": 8.614462202034449e-06, + "loss": 0.6443, + "step": 2524 + }, + { + "epoch": 0.27, + "grad_norm": 2.5459832781821428, + "learning_rate": 8.613284430704165e-06, + "loss": 0.662, + "step": 2525 + }, + { + "epoch": 0.27, + "grad_norm": 2.4781573079223422, + "learning_rate": 8.612106239592944e-06, + "loss": 0.6541, + "step": 2526 + }, + { + "epoch": 0.27, + "grad_norm": 1.971517376818193, + "learning_rate": 8.610927628837658e-06, + "loss": 0.6708, + "step": 2527 + }, + { + "epoch": 0.27, + "grad_norm": 2.465844796112776, + "learning_rate": 8.60974859857524e-06, + "loss": 0.6641, + "step": 2528 + }, + { + "epoch": 0.27, + "grad_norm": 2.3666940157342347, + "learning_rate": 8.608569148942664e-06, + "loss": 0.6648, + "step": 2529 + }, + { + "epoch": 0.27, + "grad_norm": 3.3906416500624106, + "learning_rate": 8.607389280076956e-06, + "loss": 0.7074, + "step": 2530 + }, + { + "epoch": 0.27, + "grad_norm": 2.55985900963281, + "learning_rate": 8.606208992115191e-06, + "loss": 0.6741, + "step": 2531 + }, + { + "epoch": 0.27, + "grad_norm": 2.088312059952641, + "learning_rate": 8.605028285194487e-06, + "loss": 0.6705, + "step": 2532 + }, + { + "epoch": 0.27, + "grad_norm": 4.4291577802630435, + "learning_rate": 8.60384715945202e-06, + "loss": 0.7239, + "step": 2533 + }, + { + "epoch": 0.27, + "grad_norm": 5.431296481740759, + "learning_rate": 8.602665615025006e-06, + "loss": 0.6131, + "step": 2534 + }, + { + "epoch": 0.27, + "grad_norm": 1.9635644510309531, + "learning_rate": 8.601483652050717e-06, + "loss": 0.5776, + "step": 2535 + }, + { + "epoch": 0.27, + "grad_norm": 2.460305744619151, + "learning_rate": 8.600301270666467e-06, + "loss": 0.7602, + "step": 2536 + }, + { + "epoch": 0.27, + "grad_norm": 1.2925216721758674, + "learning_rate": 8.599118471009622e-06, + "loss": 0.6207, + "step": 2537 + }, + { + "epoch": 0.27, + "grad_norm": 2.4578397320289453, + "learning_rate": 8.597935253217598e-06, + "loss": 0.7284, + "step": 2538 + }, + { + "epoch": 0.27, + "grad_norm": 2.148544072195046, + "learning_rate": 8.596751617427856e-06, + "loss": 0.609, + "step": 2539 + }, + { + "epoch": 0.27, + "grad_norm": 3.5641003536407068, + "learning_rate": 8.595567563777909e-06, + "loss": 0.6304, + "step": 2540 + }, + { + "epoch": 0.27, + "grad_norm": 2.5993299217825125, + "learning_rate": 8.594383092405317e-06, + "loss": 0.773, + "step": 2541 + }, + { + "epoch": 0.27, + "grad_norm": 4.261057636960014, + "learning_rate": 8.593198203447685e-06, + "loss": 0.6816, + "step": 2542 + }, + { + "epoch": 0.27, + "grad_norm": 2.7270686627916243, + "learning_rate": 8.592012897042677e-06, + "loss": 0.6008, + "step": 2543 + }, + { + "epoch": 0.27, + "grad_norm": 2.593307670860837, + "learning_rate": 8.59082717332799e-06, + "loss": 0.7299, + "step": 2544 + }, + { + "epoch": 0.27, + "grad_norm": 2.701277864197115, + "learning_rate": 8.589641032441384e-06, + "loss": 0.6848, + "step": 2545 + }, + { + "epoch": 0.27, + "grad_norm": 2.085418911458651, + "learning_rate": 8.588454474520657e-06, + "loss": 0.6909, + "step": 2546 + }, + { + "epoch": 0.27, + "grad_norm": 2.8232006350273355, + "learning_rate": 8.587267499703667e-06, + "loss": 0.6507, + "step": 2547 + }, + { + "epoch": 0.27, + "grad_norm": 2.4387883280629845, + "learning_rate": 8.586080108128304e-06, + "loss": 0.7263, + "step": 2548 + }, + { + "epoch": 0.27, + "grad_norm": 3.2908278171042737, + "learning_rate": 8.584892299932523e-06, + "loss": 0.6689, + "step": 2549 + }, + { + "epoch": 0.27, + "grad_norm": 2.573968919899555, + "learning_rate": 8.583704075254315e-06, + "loss": 0.7485, + "step": 2550 + }, + { + "epoch": 0.27, + "grad_norm": 1.9765316518612366, + "learning_rate": 8.582515434231729e-06, + "loss": 0.6095, + "step": 2551 + }, + { + "epoch": 0.27, + "grad_norm": 1.1051643768949975, + "learning_rate": 8.581326377002857e-06, + "loss": 0.6412, + "step": 2552 + }, + { + "epoch": 0.27, + "grad_norm": 2.0858357891432138, + "learning_rate": 8.580136903705838e-06, + "loss": 0.7277, + "step": 2553 + }, + { + "epoch": 0.27, + "grad_norm": 2.3915626172098765, + "learning_rate": 8.578947014478861e-06, + "loss": 0.6705, + "step": 2554 + }, + { + "epoch": 0.27, + "grad_norm": 2.234544105188988, + "learning_rate": 8.577756709460167e-06, + "loss": 0.6751, + "step": 2555 + }, + { + "epoch": 0.27, + "grad_norm": 2.99895822558418, + "learning_rate": 8.576565988788042e-06, + "loss": 0.6858, + "step": 2556 + }, + { + "epoch": 0.27, + "grad_norm": 2.176379192047237, + "learning_rate": 8.575374852600816e-06, + "loss": 0.7022, + "step": 2557 + }, + { + "epoch": 0.27, + "grad_norm": 2.882909865354107, + "learning_rate": 8.574183301036877e-06, + "loss": 0.7089, + "step": 2558 + }, + { + "epoch": 0.27, + "grad_norm": 2.6446090141728487, + "learning_rate": 8.572991334234654e-06, + "loss": 0.7491, + "step": 2559 + }, + { + "epoch": 0.27, + "grad_norm": 2.631308560093212, + "learning_rate": 8.571798952332625e-06, + "loss": 0.7464, + "step": 2560 + }, + { + "epoch": 0.27, + "grad_norm": 2.3075799632747174, + "learning_rate": 8.57060615546932e-06, + "loss": 0.6408, + "step": 2561 + }, + { + "epoch": 0.27, + "grad_norm": 3.280083800016366, + "learning_rate": 8.569412943783313e-06, + "loss": 0.6251, + "step": 2562 + }, + { + "epoch": 0.27, + "grad_norm": 2.37287864006488, + "learning_rate": 8.56821931741323e-06, + "loss": 0.609, + "step": 2563 + }, + { + "epoch": 0.27, + "grad_norm": 2.5322472513374734, + "learning_rate": 8.567025276497739e-06, + "loss": 0.7268, + "step": 2564 + }, + { + "epoch": 0.27, + "grad_norm": 2.4974087371265776, + "learning_rate": 8.565830821175563e-06, + "loss": 0.7007, + "step": 2565 + }, + { + "epoch": 0.27, + "grad_norm": 2.042712640061327, + "learning_rate": 8.56463595158547e-06, + "loss": 0.6754, + "step": 2566 + }, + { + "epoch": 0.27, + "grad_norm": 3.414866039042617, + "learning_rate": 8.563440667866278e-06, + "loss": 0.5793, + "step": 2567 + }, + { + "epoch": 0.27, + "grad_norm": 1.186808534700377, + "learning_rate": 8.56224497015685e-06, + "loss": 0.6083, + "step": 2568 + }, + { + "epoch": 0.27, + "grad_norm": 2.7550982917448628, + "learning_rate": 8.561048858596097e-06, + "loss": 0.6927, + "step": 2569 + }, + { + "epoch": 0.27, + "grad_norm": 3.307539577711527, + "learning_rate": 8.559852333322982e-06, + "loss": 0.7618, + "step": 2570 + }, + { + "epoch": 0.27, + "grad_norm": 2.6597721789148605, + "learning_rate": 8.558655394476513e-06, + "loss": 0.7079, + "step": 2571 + }, + { + "epoch": 0.27, + "grad_norm": 2.345267249284243, + "learning_rate": 8.557458042195748e-06, + "loss": 0.723, + "step": 2572 + }, + { + "epoch": 0.27, + "grad_norm": 2.194152292832171, + "learning_rate": 8.556260276619792e-06, + "loss": 0.7222, + "step": 2573 + }, + { + "epoch": 0.27, + "grad_norm": 2.8628566104318725, + "learning_rate": 8.555062097887796e-06, + "loss": 0.6131, + "step": 2574 + }, + { + "epoch": 0.27, + "grad_norm": 2.2175177567930087, + "learning_rate": 8.553863506138962e-06, + "loss": 0.6847, + "step": 2575 + }, + { + "epoch": 0.27, + "grad_norm": 5.19010534016988, + "learning_rate": 8.55266450151254e-06, + "loss": 0.6466, + "step": 2576 + }, + { + "epoch": 0.27, + "grad_norm": 3.030066766241923, + "learning_rate": 8.551465084147826e-06, + "loss": 0.5981, + "step": 2577 + }, + { + "epoch": 0.27, + "grad_norm": 2.5819351901307446, + "learning_rate": 8.550265254184163e-06, + "loss": 0.6906, + "step": 2578 + }, + { + "epoch": 0.27, + "grad_norm": 2.837791982234303, + "learning_rate": 8.549065011760948e-06, + "loss": 0.6177, + "step": 2579 + }, + { + "epoch": 0.27, + "grad_norm": 4.767739222282302, + "learning_rate": 8.547864357017618e-06, + "loss": 0.6715, + "step": 2580 + }, + { + "epoch": 0.27, + "grad_norm": 2.6955870932338923, + "learning_rate": 8.546663290093663e-06, + "loss": 0.6986, + "step": 2581 + }, + { + "epoch": 0.27, + "grad_norm": 2.542337333218461, + "learning_rate": 8.545461811128618e-06, + "loss": 0.7228, + "step": 2582 + }, + { + "epoch": 0.27, + "grad_norm": 2.345404508684496, + "learning_rate": 8.54425992026207e-06, + "loss": 0.6988, + "step": 2583 + }, + { + "epoch": 0.27, + "grad_norm": 3.046812475477053, + "learning_rate": 8.54305761763365e-06, + "loss": 0.6544, + "step": 2584 + }, + { + "epoch": 0.27, + "grad_norm": 2.0768028881923986, + "learning_rate": 8.541854903383038e-06, + "loss": 0.6944, + "step": 2585 + }, + { + "epoch": 0.27, + "grad_norm": 2.4296065875213158, + "learning_rate": 8.54065177764996e-06, + "loss": 0.6207, + "step": 2586 + }, + { + "epoch": 0.27, + "grad_norm": 2.333406207555947, + "learning_rate": 8.539448240574196e-06, + "loss": 0.7403, + "step": 2587 + }, + { + "epoch": 0.27, + "grad_norm": 3.2395561665415036, + "learning_rate": 8.538244292295565e-06, + "loss": 0.6861, + "step": 2588 + }, + { + "epoch": 0.27, + "grad_norm": 1.1745677467987092, + "learning_rate": 8.537039932953941e-06, + "loss": 0.6158, + "step": 2589 + }, + { + "epoch": 0.27, + "grad_norm": 2.7633198958097256, + "learning_rate": 8.535835162689243e-06, + "loss": 0.6364, + "step": 2590 + }, + { + "epoch": 0.27, + "grad_norm": 2.5596406892643273, + "learning_rate": 8.534629981641435e-06, + "loss": 0.6835, + "step": 2591 + }, + { + "epoch": 0.27, + "grad_norm": 2.182600890353231, + "learning_rate": 8.533424389950534e-06, + "loss": 0.574, + "step": 2592 + }, + { + "epoch": 0.27, + "grad_norm": 2.6915694598896676, + "learning_rate": 8.532218387756603e-06, + "loss": 0.6927, + "step": 2593 + }, + { + "epoch": 0.27, + "grad_norm": 3.6283644209612125, + "learning_rate": 8.531011975199747e-06, + "loss": 0.6385, + "step": 2594 + }, + { + "epoch": 0.27, + "grad_norm": 2.9155113489734292, + "learning_rate": 8.52980515242013e-06, + "loss": 0.6983, + "step": 2595 + }, + { + "epoch": 0.27, + "grad_norm": 8.980349217997206, + "learning_rate": 8.528597919557953e-06, + "loss": 0.7198, + "step": 2596 + }, + { + "epoch": 0.27, + "grad_norm": 2.810695167253675, + "learning_rate": 8.52739027675347e-06, + "loss": 0.6109, + "step": 2597 + }, + { + "epoch": 0.27, + "grad_norm": 2.9757821780269134, + "learning_rate": 8.526182224146982e-06, + "loss": 0.6589, + "step": 2598 + }, + { + "epoch": 0.27, + "grad_norm": 3.5030227005870316, + "learning_rate": 8.524973761878834e-06, + "loss": 0.7574, + "step": 2599 + }, + { + "epoch": 0.27, + "grad_norm": 1.9534355234534457, + "learning_rate": 8.523764890089425e-06, + "loss": 0.6311, + "step": 2600 + }, + { + "epoch": 0.27, + "grad_norm": 2.2968695529272085, + "learning_rate": 8.522555608919198e-06, + "loss": 0.7159, + "step": 2601 + }, + { + "epoch": 0.27, + "grad_norm": 2.5862520567000935, + "learning_rate": 8.521345918508644e-06, + "loss": 0.6494, + "step": 2602 + }, + { + "epoch": 0.27, + "grad_norm": 2.2346246576533058, + "learning_rate": 8.520135818998299e-06, + "loss": 0.6798, + "step": 2603 + }, + { + "epoch": 0.27, + "grad_norm": 2.3638719754838653, + "learning_rate": 8.518925310528749e-06, + "loss": 0.728, + "step": 2604 + }, + { + "epoch": 0.27, + "grad_norm": 3.03918196825084, + "learning_rate": 8.51771439324063e-06, + "loss": 0.7459, + "step": 2605 + }, + { + "epoch": 0.27, + "grad_norm": 2.3225257145217433, + "learning_rate": 8.516503067274622e-06, + "loss": 0.7274, + "step": 2606 + }, + { + "epoch": 0.27, + "grad_norm": 2.186434199840289, + "learning_rate": 8.515291332771452e-06, + "loss": 0.6888, + "step": 2607 + }, + { + "epoch": 0.27, + "grad_norm": 1.1134392839374618, + "learning_rate": 8.514079189871898e-06, + "loss": 0.6312, + "step": 2608 + }, + { + "epoch": 0.27, + "grad_norm": 2.563735931453355, + "learning_rate": 8.51286663871678e-06, + "loss": 0.7038, + "step": 2609 + }, + { + "epoch": 0.27, + "grad_norm": 7.940504117749469, + "learning_rate": 8.511653679446972e-06, + "loss": 0.7079, + "step": 2610 + }, + { + "epoch": 0.27, + "grad_norm": 3.2533130654227143, + "learning_rate": 8.51044031220339e-06, + "loss": 0.6543, + "step": 2611 + }, + { + "epoch": 0.27, + "grad_norm": 3.8589646098695116, + "learning_rate": 8.509226537127e-06, + "loss": 0.6759, + "step": 2612 + }, + { + "epoch": 0.27, + "grad_norm": 3.5253019336589153, + "learning_rate": 8.508012354358815e-06, + "loss": 0.5805, + "step": 2613 + }, + { + "epoch": 0.28, + "grad_norm": 3.1367218095988973, + "learning_rate": 8.506797764039895e-06, + "loss": 0.6941, + "step": 2614 + }, + { + "epoch": 0.28, + "grad_norm": 3.0713095113351727, + "learning_rate": 8.505582766311349e-06, + "loss": 0.6673, + "step": 2615 + }, + { + "epoch": 0.28, + "grad_norm": 2.312239589857587, + "learning_rate": 8.504367361314329e-06, + "loss": 0.6638, + "step": 2616 + }, + { + "epoch": 0.28, + "grad_norm": 2.8300865651331475, + "learning_rate": 8.50315154919004e-06, + "loss": 0.7389, + "step": 2617 + }, + { + "epoch": 0.28, + "grad_norm": 2.5470160475934462, + "learning_rate": 8.501935330079732e-06, + "loss": 0.7534, + "step": 2618 + }, + { + "epoch": 0.28, + "grad_norm": 3.316557953808444, + "learning_rate": 8.5007187041247e-06, + "loss": 0.7406, + "step": 2619 + }, + { + "epoch": 0.28, + "grad_norm": 2.146381681148758, + "learning_rate": 8.499501671466287e-06, + "loss": 0.663, + "step": 2620 + }, + { + "epoch": 0.28, + "grad_norm": 2.7424691680667523, + "learning_rate": 8.498284232245888e-06, + "loss": 0.6684, + "step": 2621 + }, + { + "epoch": 0.28, + "grad_norm": 2.812793415389665, + "learning_rate": 8.497066386604937e-06, + "loss": 0.7148, + "step": 2622 + }, + { + "epoch": 0.28, + "grad_norm": 2.4179027330742247, + "learning_rate": 8.495848134684924e-06, + "loss": 0.5672, + "step": 2623 + }, + { + "epoch": 0.28, + "grad_norm": 2.9317748456252173, + "learning_rate": 8.494629476627378e-06, + "loss": 0.6196, + "step": 2624 + }, + { + "epoch": 0.28, + "grad_norm": 3.8552043750504303, + "learning_rate": 8.493410412573883e-06, + "loss": 0.76, + "step": 2625 + }, + { + "epoch": 0.28, + "grad_norm": 2.1671342068280426, + "learning_rate": 8.492190942666065e-06, + "loss": 0.7228, + "step": 2626 + }, + { + "epoch": 0.28, + "grad_norm": 3.4361650708105707, + "learning_rate": 8.490971067045596e-06, + "loss": 0.7221, + "step": 2627 + }, + { + "epoch": 0.28, + "grad_norm": 2.6358964795479687, + "learning_rate": 8.489750785854203e-06, + "loss": 0.6363, + "step": 2628 + }, + { + "epoch": 0.28, + "grad_norm": 3.1358464929580174, + "learning_rate": 8.48853009923365e-06, + "loss": 0.7216, + "step": 2629 + }, + { + "epoch": 0.28, + "grad_norm": 2.3541129682126383, + "learning_rate": 8.487309007325755e-06, + "loss": 0.5821, + "step": 2630 + }, + { + "epoch": 0.28, + "grad_norm": 2.420390390909207, + "learning_rate": 8.48608751027238e-06, + "loss": 0.6743, + "step": 2631 + }, + { + "epoch": 0.28, + "grad_norm": 2.4451760999988363, + "learning_rate": 8.484865608215435e-06, + "loss": 0.6701, + "step": 2632 + }, + { + "epoch": 0.28, + "grad_norm": 3.4307024526167944, + "learning_rate": 8.483643301296877e-06, + "loss": 0.7317, + "step": 2633 + }, + { + "epoch": 0.28, + "grad_norm": 2.2798188313827032, + "learning_rate": 8.482420589658712e-06, + "loss": 0.6603, + "step": 2634 + }, + { + "epoch": 0.28, + "grad_norm": 2.7920937123015785, + "learning_rate": 8.481197473442989e-06, + "loss": 0.7351, + "step": 2635 + }, + { + "epoch": 0.28, + "grad_norm": 3.1650536164648404, + "learning_rate": 8.479973952791805e-06, + "loss": 0.747, + "step": 2636 + }, + { + "epoch": 0.28, + "grad_norm": 2.4408149875447256, + "learning_rate": 8.478750027847308e-06, + "loss": 0.7117, + "step": 2637 + }, + { + "epoch": 0.28, + "grad_norm": 2.5845290741492817, + "learning_rate": 8.477525698751688e-06, + "loss": 0.6305, + "step": 2638 + }, + { + "epoch": 0.28, + "grad_norm": 2.311318523859558, + "learning_rate": 8.476300965647186e-06, + "loss": 0.6609, + "step": 2639 + }, + { + "epoch": 0.28, + "grad_norm": 3.0266589746186376, + "learning_rate": 8.475075828676086e-06, + "loss": 0.663, + "step": 2640 + }, + { + "epoch": 0.28, + "grad_norm": 2.8468358467655803, + "learning_rate": 8.473850287980721e-06, + "loss": 0.6316, + "step": 2641 + }, + { + "epoch": 0.28, + "grad_norm": 3.1150563470392996, + "learning_rate": 8.472624343703473e-06, + "loss": 0.713, + "step": 2642 + }, + { + "epoch": 0.28, + "grad_norm": 3.3018689988839918, + "learning_rate": 8.471397995986766e-06, + "loss": 0.5945, + "step": 2643 + }, + { + "epoch": 0.28, + "grad_norm": 2.299828029802301, + "learning_rate": 8.470171244973075e-06, + "loss": 0.6362, + "step": 2644 + }, + { + "epoch": 0.28, + "grad_norm": 2.400265553940191, + "learning_rate": 8.46894409080492e-06, + "loss": 0.7142, + "step": 2645 + }, + { + "epoch": 0.28, + "grad_norm": 2.3965587472793275, + "learning_rate": 8.467716533624869e-06, + "loss": 0.6693, + "step": 2646 + }, + { + "epoch": 0.28, + "grad_norm": 3.1424468064421527, + "learning_rate": 8.466488573575536e-06, + "loss": 0.6621, + "step": 2647 + }, + { + "epoch": 0.28, + "grad_norm": 4.058208255714223, + "learning_rate": 8.465260210799579e-06, + "loss": 0.5876, + "step": 2648 + }, + { + "epoch": 0.28, + "grad_norm": 5.051342205579871, + "learning_rate": 8.464031445439708e-06, + "loss": 0.6876, + "step": 2649 + }, + { + "epoch": 0.28, + "grad_norm": 2.435428283210832, + "learning_rate": 8.462802277638677e-06, + "loss": 0.7141, + "step": 2650 + }, + { + "epoch": 0.28, + "grad_norm": 2.2536304400179357, + "learning_rate": 8.461572707539288e-06, + "loss": 0.7239, + "step": 2651 + }, + { + "epoch": 0.28, + "grad_norm": 2.2033188343885404, + "learning_rate": 8.460342735284388e-06, + "loss": 0.5988, + "step": 2652 + }, + { + "epoch": 0.28, + "grad_norm": 2.6488901006220136, + "learning_rate": 8.459112361016873e-06, + "loss": 0.7123, + "step": 2653 + }, + { + "epoch": 0.28, + "grad_norm": 2.454166766453256, + "learning_rate": 8.457881584879681e-06, + "loss": 0.7274, + "step": 2654 + }, + { + "epoch": 0.28, + "grad_norm": 2.338529320134492, + "learning_rate": 8.456650407015804e-06, + "loss": 0.7723, + "step": 2655 + }, + { + "epoch": 0.28, + "grad_norm": 2.7241780151773933, + "learning_rate": 8.455418827568275e-06, + "loss": 0.619, + "step": 2656 + }, + { + "epoch": 0.28, + "grad_norm": 2.2430549835972053, + "learning_rate": 8.454186846680174e-06, + "loss": 0.6374, + "step": 2657 + }, + { + "epoch": 0.28, + "grad_norm": 3.090373805102784, + "learning_rate": 8.452954464494631e-06, + "loss": 0.7013, + "step": 2658 + }, + { + "epoch": 0.28, + "grad_norm": 2.50997857984063, + "learning_rate": 8.451721681154819e-06, + "loss": 0.6987, + "step": 2659 + }, + { + "epoch": 0.28, + "grad_norm": 2.318114958320992, + "learning_rate": 8.45048849680396e-06, + "loss": 0.7233, + "step": 2660 + }, + { + "epoch": 0.28, + "grad_norm": 1.2346055238002462, + "learning_rate": 8.449254911585323e-06, + "loss": 0.6527, + "step": 2661 + }, + { + "epoch": 0.28, + "grad_norm": 2.293606280499452, + "learning_rate": 8.44802092564222e-06, + "loss": 0.6759, + "step": 2662 + }, + { + "epoch": 0.28, + "grad_norm": 1.206662259361233, + "learning_rate": 8.446786539118014e-06, + "loss": 0.6139, + "step": 2663 + }, + { + "epoch": 0.28, + "grad_norm": 12.24962776644313, + "learning_rate": 8.445551752156111e-06, + "loss": 0.6434, + "step": 2664 + }, + { + "epoch": 0.28, + "grad_norm": 3.2212865543394447, + "learning_rate": 8.444316564899966e-06, + "loss": 0.6465, + "step": 2665 + }, + { + "epoch": 0.28, + "grad_norm": 2.714117520697914, + "learning_rate": 8.443080977493078e-06, + "loss": 0.5998, + "step": 2666 + }, + { + "epoch": 0.28, + "grad_norm": 3.336735248734921, + "learning_rate": 8.441844990078995e-06, + "loss": 0.7573, + "step": 2667 + }, + { + "epoch": 0.28, + "grad_norm": 2.403703415234515, + "learning_rate": 8.44060860280131e-06, + "loss": 0.6043, + "step": 2668 + }, + { + "epoch": 0.28, + "grad_norm": 2.382422434010672, + "learning_rate": 8.439371815803666e-06, + "loss": 0.7587, + "step": 2669 + }, + { + "epoch": 0.28, + "grad_norm": 3.0968077866837533, + "learning_rate": 8.438134629229746e-06, + "loss": 0.6986, + "step": 2670 + }, + { + "epoch": 0.28, + "grad_norm": 2.43034993732329, + "learning_rate": 8.436897043223282e-06, + "loss": 0.5956, + "step": 2671 + }, + { + "epoch": 0.28, + "grad_norm": 2.396150179364192, + "learning_rate": 8.435659057928054e-06, + "loss": 0.6747, + "step": 2672 + }, + { + "epoch": 0.28, + "grad_norm": 2.7277764541975973, + "learning_rate": 8.434420673487888e-06, + "loss": 0.6954, + "step": 2673 + }, + { + "epoch": 0.28, + "grad_norm": 2.217561657414955, + "learning_rate": 8.433181890046658e-06, + "loss": 0.6458, + "step": 2674 + }, + { + "epoch": 0.28, + "grad_norm": 9.236257143877165, + "learning_rate": 8.431942707748279e-06, + "loss": 0.6915, + "step": 2675 + }, + { + "epoch": 0.28, + "grad_norm": 3.0699718693696636, + "learning_rate": 8.430703126736717e-06, + "loss": 0.6652, + "step": 2676 + }, + { + "epoch": 0.28, + "grad_norm": 3.175699865000509, + "learning_rate": 8.429463147155984e-06, + "loss": 0.772, + "step": 2677 + }, + { + "epoch": 0.28, + "grad_norm": 2.7545634656991043, + "learning_rate": 8.428222769150137e-06, + "loss": 0.6886, + "step": 2678 + }, + { + "epoch": 0.28, + "grad_norm": 2.448219091164623, + "learning_rate": 8.426981992863276e-06, + "loss": 0.6719, + "step": 2679 + }, + { + "epoch": 0.28, + "grad_norm": 2.2382890240649553, + "learning_rate": 8.425740818439553e-06, + "loss": 0.7212, + "step": 2680 + }, + { + "epoch": 0.28, + "grad_norm": 2.483770641222447, + "learning_rate": 8.424499246023168e-06, + "loss": 0.7427, + "step": 2681 + }, + { + "epoch": 0.28, + "grad_norm": 3.0361221580953, + "learning_rate": 8.42325727575836e-06, + "loss": 0.7023, + "step": 2682 + }, + { + "epoch": 0.28, + "grad_norm": 2.1519597344947137, + "learning_rate": 8.422014907789413e-06, + "loss": 0.7392, + "step": 2683 + }, + { + "epoch": 0.28, + "grad_norm": 2.1345686803799038, + "learning_rate": 8.420772142260667e-06, + "loss": 0.6333, + "step": 2684 + }, + { + "epoch": 0.28, + "grad_norm": 2.287284430544546, + "learning_rate": 8.419528979316505e-06, + "loss": 0.7573, + "step": 2685 + }, + { + "epoch": 0.28, + "grad_norm": 2.4381162142757145, + "learning_rate": 8.41828541910135e-06, + "loss": 0.7724, + "step": 2686 + }, + { + "epoch": 0.28, + "grad_norm": 2.8332535794197633, + "learning_rate": 8.417041461759674e-06, + "loss": 0.6979, + "step": 2687 + }, + { + "epoch": 0.28, + "grad_norm": 1.1728298056179722, + "learning_rate": 8.415797107436e-06, + "loss": 0.627, + "step": 2688 + }, + { + "epoch": 0.28, + "grad_norm": 1.1601804909844005, + "learning_rate": 8.414552356274891e-06, + "loss": 0.6739, + "step": 2689 + }, + { + "epoch": 0.28, + "grad_norm": 2.679136900723749, + "learning_rate": 8.413307208420963e-06, + "loss": 0.7404, + "step": 2690 + }, + { + "epoch": 0.28, + "grad_norm": 2.5824680380348557, + "learning_rate": 8.412061664018869e-06, + "loss": 0.745, + "step": 2691 + }, + { + "epoch": 0.28, + "grad_norm": 3.209895645078567, + "learning_rate": 8.410815723213312e-06, + "loss": 0.6478, + "step": 2692 + }, + { + "epoch": 0.28, + "grad_norm": 2.1856854173402795, + "learning_rate": 8.409569386149046e-06, + "loss": 0.6822, + "step": 2693 + }, + { + "epoch": 0.28, + "grad_norm": 2.3084967091893653, + "learning_rate": 8.408322652970866e-06, + "loss": 0.7768, + "step": 2694 + }, + { + "epoch": 0.28, + "grad_norm": 3.9063292951795425, + "learning_rate": 8.40707552382361e-06, + "loss": 0.6608, + "step": 2695 + }, + { + "epoch": 0.28, + "grad_norm": 2.051324396490213, + "learning_rate": 8.40582799885217e-06, + "loss": 0.5923, + "step": 2696 + }, + { + "epoch": 0.28, + "grad_norm": 2.765515717742255, + "learning_rate": 8.404580078201476e-06, + "loss": 0.7872, + "step": 2697 + }, + { + "epoch": 0.28, + "grad_norm": 3.1789517487510066, + "learning_rate": 8.403331762016514e-06, + "loss": 0.6043, + "step": 2698 + }, + { + "epoch": 0.28, + "grad_norm": 2.2655735243519515, + "learning_rate": 8.402083050442302e-06, + "loss": 0.5652, + "step": 2699 + }, + { + "epoch": 0.28, + "grad_norm": 2.2602508493232687, + "learning_rate": 8.400833943623919e-06, + "loss": 0.7123, + "step": 2700 + }, + { + "epoch": 0.28, + "grad_norm": 2.5454383180770535, + "learning_rate": 8.399584441706477e-06, + "loss": 0.699, + "step": 2701 + }, + { + "epoch": 0.28, + "grad_norm": 2.2564015333342606, + "learning_rate": 8.398334544835143e-06, + "loss": 0.6204, + "step": 2702 + }, + { + "epoch": 0.28, + "grad_norm": 2.418647080107335, + "learning_rate": 8.397084253155125e-06, + "loss": 0.6295, + "step": 2703 + }, + { + "epoch": 0.28, + "grad_norm": 3.25654659367241, + "learning_rate": 8.395833566811677e-06, + "loss": 0.692, + "step": 2704 + }, + { + "epoch": 0.28, + "grad_norm": 2.666744974325964, + "learning_rate": 8.394582485950103e-06, + "loss": 0.6038, + "step": 2705 + }, + { + "epoch": 0.28, + "grad_norm": 2.4348825850755493, + "learning_rate": 8.393331010715749e-06, + "loss": 0.6997, + "step": 2706 + }, + { + "epoch": 0.28, + "grad_norm": 2.0592895390345376, + "learning_rate": 8.392079141254006e-06, + "loss": 0.6366, + "step": 2707 + }, + { + "epoch": 0.28, + "grad_norm": 2.0843302668304746, + "learning_rate": 8.390826877710314e-06, + "loss": 0.595, + "step": 2708 + }, + { + "epoch": 0.29, + "grad_norm": 2.693813971402732, + "learning_rate": 8.38957422023016e-06, + "loss": 0.6939, + "step": 2709 + }, + { + "epoch": 0.29, + "grad_norm": 2.4215728777192447, + "learning_rate": 8.388321168959068e-06, + "loss": 0.7119, + "step": 2710 + }, + { + "epoch": 0.29, + "grad_norm": 2.4275301307354837, + "learning_rate": 8.387067724042618e-06, + "loss": 0.6875, + "step": 2711 + }, + { + "epoch": 0.29, + "grad_norm": 6.524758976644784, + "learning_rate": 8.38581388562643e-06, + "loss": 0.7328, + "step": 2712 + }, + { + "epoch": 0.29, + "grad_norm": 2.529001789661384, + "learning_rate": 8.384559653856174e-06, + "loss": 0.6467, + "step": 2713 + }, + { + "epoch": 0.29, + "grad_norm": 2.339231251838686, + "learning_rate": 8.383305028877559e-06, + "loss": 0.6954, + "step": 2714 + }, + { + "epoch": 0.29, + "grad_norm": 2.422685898720981, + "learning_rate": 8.382050010836349e-06, + "loss": 0.7583, + "step": 2715 + }, + { + "epoch": 0.29, + "grad_norm": 5.610823185514183, + "learning_rate": 8.380794599878343e-06, + "loss": 0.67, + "step": 2716 + }, + { + "epoch": 0.29, + "grad_norm": 3.642245922603603, + "learning_rate": 8.379538796149395e-06, + "loss": 0.6711, + "step": 2717 + }, + { + "epoch": 0.29, + "grad_norm": 2.3135555055352635, + "learning_rate": 8.378282599795397e-06, + "loss": 0.6705, + "step": 2718 + }, + { + "epoch": 0.29, + "grad_norm": 2.447841830239155, + "learning_rate": 8.377026010962293e-06, + "loss": 0.6548, + "step": 2719 + }, + { + "epoch": 0.29, + "grad_norm": 2.349506243780416, + "learning_rate": 8.375769029796068e-06, + "loss": 0.6988, + "step": 2720 + }, + { + "epoch": 0.29, + "grad_norm": 3.4916280933140373, + "learning_rate": 8.374511656442756e-06, + "loss": 0.6912, + "step": 2721 + }, + { + "epoch": 0.29, + "grad_norm": 2.008729698463561, + "learning_rate": 8.373253891048436e-06, + "loss": 0.6498, + "step": 2722 + }, + { + "epoch": 0.29, + "grad_norm": 2.504995655860519, + "learning_rate": 8.371995733759228e-06, + "loss": 0.6303, + "step": 2723 + }, + { + "epoch": 0.29, + "grad_norm": 4.465351579775575, + "learning_rate": 8.370737184721305e-06, + "loss": 0.6931, + "step": 2724 + }, + { + "epoch": 0.29, + "grad_norm": 2.786999536883786, + "learning_rate": 8.369478244080878e-06, + "loss": 0.6444, + "step": 2725 + }, + { + "epoch": 0.29, + "grad_norm": 3.089894612238134, + "learning_rate": 8.368218911984211e-06, + "loss": 0.7627, + "step": 2726 + }, + { + "epoch": 0.29, + "grad_norm": 2.929159775372233, + "learning_rate": 8.366959188577606e-06, + "loss": 0.6889, + "step": 2727 + }, + { + "epoch": 0.29, + "grad_norm": 5.440489695426411, + "learning_rate": 8.365699074007416e-06, + "loss": 0.6793, + "step": 2728 + }, + { + "epoch": 0.29, + "grad_norm": 3.058496978919441, + "learning_rate": 8.364438568420034e-06, + "loss": 0.6912, + "step": 2729 + }, + { + "epoch": 0.29, + "grad_norm": 2.4524683201064064, + "learning_rate": 8.363177671961908e-06, + "loss": 0.6447, + "step": 2730 + }, + { + "epoch": 0.29, + "grad_norm": 2.0795075403169156, + "learning_rate": 8.36191638477952e-06, + "loss": 0.6589, + "step": 2731 + }, + { + "epoch": 0.29, + "grad_norm": 1.9902950430871291, + "learning_rate": 8.360654707019406e-06, + "loss": 0.6756, + "step": 2732 + }, + { + "epoch": 0.29, + "grad_norm": 2.568567286867176, + "learning_rate": 8.359392638828142e-06, + "loss": 0.7202, + "step": 2733 + }, + { + "epoch": 0.29, + "grad_norm": 2.7851953645089007, + "learning_rate": 8.358130180352353e-06, + "loss": 0.6707, + "step": 2734 + }, + { + "epoch": 0.29, + "grad_norm": 2.7057531159089776, + "learning_rate": 8.356867331738706e-06, + "loss": 0.6586, + "step": 2735 + }, + { + "epoch": 0.29, + "grad_norm": 3.2347571171882237, + "learning_rate": 8.355604093133916e-06, + "loss": 0.6705, + "step": 2736 + }, + { + "epoch": 0.29, + "grad_norm": 2.625192631188438, + "learning_rate": 8.354340464684745e-06, + "loss": 0.7096, + "step": 2737 + }, + { + "epoch": 0.29, + "grad_norm": 2.7137203886388295, + "learning_rate": 8.353076446537993e-06, + "loss": 0.6789, + "step": 2738 + }, + { + "epoch": 0.29, + "grad_norm": 3.1425928647546533, + "learning_rate": 8.351812038840513e-06, + "loss": 0.6174, + "step": 2739 + }, + { + "epoch": 0.29, + "grad_norm": 3.1199663116285286, + "learning_rate": 8.3505472417392e-06, + "loss": 0.6875, + "step": 2740 + }, + { + "epoch": 0.29, + "grad_norm": 2.3558771465349433, + "learning_rate": 8.349282055380992e-06, + "loss": 0.6542, + "step": 2741 + }, + { + "epoch": 0.29, + "grad_norm": 2.9753417802732893, + "learning_rate": 8.348016479912877e-06, + "loss": 0.6566, + "step": 2742 + }, + { + "epoch": 0.29, + "grad_norm": 2.5685130835209935, + "learning_rate": 8.346750515481888e-06, + "loss": 0.7154, + "step": 2743 + }, + { + "epoch": 0.29, + "grad_norm": 3.1983652825774427, + "learning_rate": 8.345484162235096e-06, + "loss": 0.6878, + "step": 2744 + }, + { + "epoch": 0.29, + "grad_norm": 8.279656071212713, + "learning_rate": 8.344217420319624e-06, + "loss": 0.6388, + "step": 2745 + }, + { + "epoch": 0.29, + "grad_norm": 2.6025787077693563, + "learning_rate": 8.342950289882641e-06, + "loss": 0.6941, + "step": 2746 + }, + { + "epoch": 0.29, + "grad_norm": 2.614210374161948, + "learning_rate": 8.341682771071357e-06, + "loss": 0.6266, + "step": 2747 + }, + { + "epoch": 0.29, + "grad_norm": 2.1048003157133306, + "learning_rate": 8.340414864033028e-06, + "loss": 0.6507, + "step": 2748 + }, + { + "epoch": 0.29, + "grad_norm": 3.1566816578883072, + "learning_rate": 8.339146568914958e-06, + "loss": 0.7176, + "step": 2749 + }, + { + "epoch": 0.29, + "grad_norm": 2.6545077340607643, + "learning_rate": 8.337877885864489e-06, + "loss": 0.7292, + "step": 2750 + }, + { + "epoch": 0.29, + "grad_norm": 3.9130923410553775, + "learning_rate": 8.336608815029018e-06, + "loss": 0.7339, + "step": 2751 + }, + { + "epoch": 0.29, + "grad_norm": 3.184098187059293, + "learning_rate": 8.335339356555981e-06, + "loss": 0.6965, + "step": 2752 + }, + { + "epoch": 0.29, + "grad_norm": 2.14723331071444, + "learning_rate": 8.334069510592857e-06, + "loss": 0.5925, + "step": 2753 + }, + { + "epoch": 0.29, + "grad_norm": 2.9405681252036464, + "learning_rate": 8.332799277287175e-06, + "loss": 0.6977, + "step": 2754 + }, + { + "epoch": 0.29, + "grad_norm": 2.3880455307462882, + "learning_rate": 8.331528656786508e-06, + "loss": 0.6346, + "step": 2755 + }, + { + "epoch": 0.29, + "grad_norm": 2.322406078061077, + "learning_rate": 8.330257649238472e-06, + "loss": 0.7288, + "step": 2756 + }, + { + "epoch": 0.29, + "grad_norm": 2.1014011587142933, + "learning_rate": 8.328986254790729e-06, + "loss": 0.6754, + "step": 2757 + }, + { + "epoch": 0.29, + "grad_norm": 1.3514936631244956, + "learning_rate": 8.327714473590986e-06, + "loss": 0.646, + "step": 2758 + }, + { + "epoch": 0.29, + "grad_norm": 2.299770815204609, + "learning_rate": 8.326442305786995e-06, + "loss": 0.6682, + "step": 2759 + }, + { + "epoch": 0.29, + "grad_norm": 2.4432174028048284, + "learning_rate": 8.325169751526552e-06, + "loss": 0.7419, + "step": 2760 + }, + { + "epoch": 0.29, + "grad_norm": 2.7511522431665765, + "learning_rate": 8.323896810957501e-06, + "loss": 0.5777, + "step": 2761 + }, + { + "epoch": 0.29, + "grad_norm": 2.7500144099538226, + "learning_rate": 8.322623484227725e-06, + "loss": 0.6688, + "step": 2762 + }, + { + "epoch": 0.29, + "grad_norm": 2.442208106975574, + "learning_rate": 8.321349771485159e-06, + "loss": 0.7288, + "step": 2763 + }, + { + "epoch": 0.29, + "grad_norm": 2.4073668296629323, + "learning_rate": 8.320075672877776e-06, + "loss": 0.6964, + "step": 2764 + }, + { + "epoch": 0.29, + "grad_norm": 2.973204687169632, + "learning_rate": 8.3188011885536e-06, + "loss": 0.665, + "step": 2765 + }, + { + "epoch": 0.29, + "grad_norm": 1.9341022002413333, + "learning_rate": 8.317526318660695e-06, + "loss": 0.6552, + "step": 2766 + }, + { + "epoch": 0.29, + "grad_norm": 2.41457288085297, + "learning_rate": 8.316251063347175e-06, + "loss": 0.7097, + "step": 2767 + }, + { + "epoch": 0.29, + "grad_norm": 2.5800526852961263, + "learning_rate": 8.314975422761187e-06, + "loss": 0.5778, + "step": 2768 + }, + { + "epoch": 0.29, + "grad_norm": 2.752826125090857, + "learning_rate": 8.313699397050941e-06, + "loss": 0.6198, + "step": 2769 + }, + { + "epoch": 0.29, + "grad_norm": 2.3181883296463717, + "learning_rate": 8.312422986364677e-06, + "loss": 0.546, + "step": 2770 + }, + { + "epoch": 0.29, + "grad_norm": 2.3923531457438534, + "learning_rate": 8.311146190850687e-06, + "loss": 0.7019, + "step": 2771 + }, + { + "epoch": 0.29, + "grad_norm": 2.57435657748528, + "learning_rate": 8.309869010657303e-06, + "loss": 0.6215, + "step": 2772 + }, + { + "epoch": 0.29, + "grad_norm": 1.9661259285016583, + "learning_rate": 8.308591445932905e-06, + "loss": 0.6052, + "step": 2773 + }, + { + "epoch": 0.29, + "grad_norm": 2.1300824586691043, + "learning_rate": 8.307313496825918e-06, + "loss": 0.6245, + "step": 2774 + }, + { + "epoch": 0.29, + "grad_norm": 2.7492441683809816, + "learning_rate": 8.306035163484806e-06, + "loss": 0.6852, + "step": 2775 + }, + { + "epoch": 0.29, + "grad_norm": 2.4951222437462484, + "learning_rate": 8.30475644605809e-06, + "loss": 0.6103, + "step": 2776 + }, + { + "epoch": 0.29, + "grad_norm": 2.2323509496392493, + "learning_rate": 8.30347734469432e-06, + "loss": 0.6149, + "step": 2777 + }, + { + "epoch": 0.29, + "grad_norm": 3.702733593456729, + "learning_rate": 8.302197859542104e-06, + "loss": 0.6772, + "step": 2778 + }, + { + "epoch": 0.29, + "grad_norm": 2.029812775065599, + "learning_rate": 8.300917990750085e-06, + "loss": 0.669, + "step": 2779 + }, + { + "epoch": 0.29, + "grad_norm": 2.436939042536463, + "learning_rate": 8.299637738466956e-06, + "loss": 0.6703, + "step": 2780 + }, + { + "epoch": 0.29, + "grad_norm": 7.064859457627149, + "learning_rate": 8.298357102841452e-06, + "loss": 0.6941, + "step": 2781 + }, + { + "epoch": 0.29, + "grad_norm": 2.290933554699684, + "learning_rate": 8.297076084022355e-06, + "loss": 0.585, + "step": 2782 + }, + { + "epoch": 0.29, + "grad_norm": 2.7292159300881327, + "learning_rate": 8.29579468215849e-06, + "loss": 0.6463, + "step": 2783 + }, + { + "epoch": 0.29, + "grad_norm": 2.4076744166382755, + "learning_rate": 8.294512897398725e-06, + "loss": 0.6891, + "step": 2784 + }, + { + "epoch": 0.29, + "grad_norm": 2.400183386823079, + "learning_rate": 8.293230729891976e-06, + "loss": 0.6797, + "step": 2785 + }, + { + "epoch": 0.29, + "grad_norm": 2.5300566156875215, + "learning_rate": 8.2919481797872e-06, + "loss": 0.6729, + "step": 2786 + }, + { + "epoch": 0.29, + "grad_norm": 2.504139135974389, + "learning_rate": 8.2906652472334e-06, + "loss": 0.7771, + "step": 2787 + }, + { + "epoch": 0.29, + "grad_norm": 2.4033755762522127, + "learning_rate": 8.289381932379625e-06, + "loss": 0.7255, + "step": 2788 + }, + { + "epoch": 0.29, + "grad_norm": 2.130391692034721, + "learning_rate": 8.288098235374966e-06, + "loss": 0.6504, + "step": 2789 + }, + { + "epoch": 0.29, + "grad_norm": 2.3119004788902524, + "learning_rate": 8.286814156368559e-06, + "loss": 0.7532, + "step": 2790 + }, + { + "epoch": 0.29, + "grad_norm": 12.473032165918204, + "learning_rate": 8.285529695509585e-06, + "loss": 0.7055, + "step": 2791 + }, + { + "epoch": 0.29, + "grad_norm": 2.874206287893155, + "learning_rate": 8.284244852947265e-06, + "loss": 0.7682, + "step": 2792 + }, + { + "epoch": 0.29, + "grad_norm": 2.3354999888656627, + "learning_rate": 8.282959628830875e-06, + "loss": 0.7107, + "step": 2793 + }, + { + "epoch": 0.29, + "grad_norm": 2.6099032292361617, + "learning_rate": 8.281674023309725e-06, + "loss": 0.6281, + "step": 2794 + }, + { + "epoch": 0.29, + "grad_norm": 2.232202918075071, + "learning_rate": 8.280388036533171e-06, + "loss": 0.6399, + "step": 2795 + }, + { + "epoch": 0.29, + "grad_norm": 2.491569330119017, + "learning_rate": 8.27910166865062e-06, + "loss": 0.626, + "step": 2796 + }, + { + "epoch": 0.29, + "grad_norm": 2.2273954138081073, + "learning_rate": 8.277814919811516e-06, + "loss": 0.6008, + "step": 2797 + }, + { + "epoch": 0.29, + "grad_norm": 2.2945544456627385, + "learning_rate": 8.276527790165349e-06, + "loss": 0.6461, + "step": 2798 + }, + { + "epoch": 0.29, + "grad_norm": 2.277058416467495, + "learning_rate": 8.275240279861655e-06, + "loss": 0.69, + "step": 2799 + }, + { + "epoch": 0.29, + "grad_norm": 2.2731253809981653, + "learning_rate": 8.273952389050015e-06, + "loss": 0.6764, + "step": 2800 + }, + { + "epoch": 0.29, + "grad_norm": 2.1917315114226636, + "learning_rate": 8.272664117880047e-06, + "loss": 0.6476, + "step": 2801 + }, + { + "epoch": 0.29, + "grad_norm": 2.25326375571558, + "learning_rate": 8.271375466501424e-06, + "loss": 0.7102, + "step": 2802 + }, + { + "epoch": 0.29, + "grad_norm": 2.302993394481045, + "learning_rate": 8.270086435063856e-06, + "loss": 0.6434, + "step": 2803 + }, + { + "epoch": 0.3, + "grad_norm": 2.7166993208874195, + "learning_rate": 8.268797023717098e-06, + "loss": 0.6369, + "step": 2804 + }, + { + "epoch": 0.3, + "grad_norm": 2.2576333540166673, + "learning_rate": 8.267507232610952e-06, + "loss": 0.7091, + "step": 2805 + }, + { + "epoch": 0.3, + "grad_norm": 2.834052961933635, + "learning_rate": 8.26621706189526e-06, + "loss": 0.6522, + "step": 2806 + }, + { + "epoch": 0.3, + "grad_norm": 2.4814364257476966, + "learning_rate": 8.264926511719912e-06, + "loss": 0.6399, + "step": 2807 + }, + { + "epoch": 0.3, + "grad_norm": 2.8842459517526953, + "learning_rate": 8.26363558223484e-06, + "loss": 0.7116, + "step": 2808 + }, + { + "epoch": 0.3, + "grad_norm": 1.2082080106969904, + "learning_rate": 8.26234427359002e-06, + "loss": 0.6688, + "step": 2809 + }, + { + "epoch": 0.3, + "grad_norm": 2.234644633018572, + "learning_rate": 8.261052585935471e-06, + "loss": 0.6346, + "step": 2810 + }, + { + "epoch": 0.3, + "grad_norm": 2.4322865695650178, + "learning_rate": 8.259760519421263e-06, + "loss": 0.7054, + "step": 2811 + }, + { + "epoch": 0.3, + "grad_norm": 2.6564893401792675, + "learning_rate": 8.258468074197499e-06, + "loss": 0.6955, + "step": 2812 + }, + { + "epoch": 0.3, + "grad_norm": 2.266036275854144, + "learning_rate": 8.257175250414333e-06, + "loss": 0.7159, + "step": 2813 + }, + { + "epoch": 0.3, + "grad_norm": 2.762553186968921, + "learning_rate": 8.255882048221961e-06, + "loss": 0.7056, + "step": 2814 + }, + { + "epoch": 0.3, + "grad_norm": 4.139967136595774, + "learning_rate": 8.254588467770628e-06, + "loss": 0.707, + "step": 2815 + }, + { + "epoch": 0.3, + "grad_norm": 3.9568494330619317, + "learning_rate": 8.253294509210612e-06, + "loss": 0.6702, + "step": 2816 + }, + { + "epoch": 0.3, + "grad_norm": 3.031085393262171, + "learning_rate": 8.252000172692244e-06, + "loss": 0.6967, + "step": 2817 + }, + { + "epoch": 0.3, + "grad_norm": 3.3355616466171094, + "learning_rate": 8.250705458365897e-06, + "loss": 0.7047, + "step": 2818 + }, + { + "epoch": 0.3, + "grad_norm": 2.3126709857674936, + "learning_rate": 8.249410366381987e-06, + "loss": 0.6727, + "step": 2819 + }, + { + "epoch": 0.3, + "grad_norm": 2.857054433557827, + "learning_rate": 8.248114896890975e-06, + "loss": 0.6336, + "step": 2820 + }, + { + "epoch": 0.3, + "grad_norm": 3.0926046214596106, + "learning_rate": 8.246819050043363e-06, + "loss": 0.6691, + "step": 2821 + }, + { + "epoch": 0.3, + "grad_norm": 2.618242351375955, + "learning_rate": 8.245522825989697e-06, + "loss": 0.6733, + "step": 2822 + }, + { + "epoch": 0.3, + "grad_norm": 2.7640385714230673, + "learning_rate": 8.244226224880574e-06, + "loss": 0.6313, + "step": 2823 + }, + { + "epoch": 0.3, + "grad_norm": 2.5354134106997948, + "learning_rate": 8.242929246866624e-06, + "loss": 0.7069, + "step": 2824 + }, + { + "epoch": 0.3, + "grad_norm": 2.933933284256052, + "learning_rate": 8.24163189209853e-06, + "loss": 0.749, + "step": 2825 + }, + { + "epoch": 0.3, + "grad_norm": 2.8633014033698627, + "learning_rate": 8.240334160727013e-06, + "loss": 0.6782, + "step": 2826 + }, + { + "epoch": 0.3, + "grad_norm": 2.7941442636390543, + "learning_rate": 8.23903605290284e-06, + "loss": 0.6433, + "step": 2827 + }, + { + "epoch": 0.3, + "grad_norm": 2.3734190039511978, + "learning_rate": 8.23773756877682e-06, + "loss": 0.6341, + "step": 2828 + }, + { + "epoch": 0.3, + "grad_norm": 4.306290887870174, + "learning_rate": 8.236438708499811e-06, + "loss": 0.7399, + "step": 2829 + }, + { + "epoch": 0.3, + "grad_norm": 3.327313393460941, + "learning_rate": 8.235139472222708e-06, + "loss": 0.6106, + "step": 2830 + }, + { + "epoch": 0.3, + "grad_norm": 2.842091527633625, + "learning_rate": 8.233839860096453e-06, + "loss": 0.6594, + "step": 2831 + }, + { + "epoch": 0.3, + "grad_norm": 2.427751709970699, + "learning_rate": 8.23253987227203e-06, + "loss": 0.757, + "step": 2832 + }, + { + "epoch": 0.3, + "grad_norm": 4.827636901802302, + "learning_rate": 8.23123950890047e-06, + "loss": 0.7158, + "step": 2833 + }, + { + "epoch": 0.3, + "grad_norm": 2.310647756260461, + "learning_rate": 8.229938770132843e-06, + "loss": 0.6142, + "step": 2834 + }, + { + "epoch": 0.3, + "grad_norm": 3.439737457921449, + "learning_rate": 8.228637656120268e-06, + "loss": 0.6859, + "step": 2835 + }, + { + "epoch": 0.3, + "grad_norm": 3.465299362478447, + "learning_rate": 8.227336167013901e-06, + "loss": 0.619, + "step": 2836 + }, + { + "epoch": 0.3, + "grad_norm": 5.161811987054101, + "learning_rate": 8.22603430296495e-06, + "loss": 0.6885, + "step": 2837 + }, + { + "epoch": 0.3, + "grad_norm": 13.016909893025721, + "learning_rate": 8.224732064124658e-06, + "loss": 0.7578, + "step": 2838 + }, + { + "epoch": 0.3, + "grad_norm": 3.29770830365415, + "learning_rate": 8.223429450644317e-06, + "loss": 0.5983, + "step": 2839 + }, + { + "epoch": 0.3, + "grad_norm": 2.74582961488932, + "learning_rate": 8.222126462675259e-06, + "loss": 0.703, + "step": 2840 + }, + { + "epoch": 0.3, + "grad_norm": 2.839825341685874, + "learning_rate": 8.220823100368865e-06, + "loss": 0.7577, + "step": 2841 + }, + { + "epoch": 0.3, + "grad_norm": 2.151612849557347, + "learning_rate": 8.219519363876552e-06, + "loss": 0.7437, + "step": 2842 + }, + { + "epoch": 0.3, + "grad_norm": 2.2813563053260157, + "learning_rate": 8.218215253349785e-06, + "loss": 0.7145, + "step": 2843 + }, + { + "epoch": 0.3, + "grad_norm": 2.8740576992118227, + "learning_rate": 8.216910768940075e-06, + "loss": 0.6663, + "step": 2844 + }, + { + "epoch": 0.3, + "grad_norm": 2.3832922463971293, + "learning_rate": 8.215605910798972e-06, + "loss": 0.6398, + "step": 2845 + }, + { + "epoch": 0.3, + "grad_norm": 2.087398052989275, + "learning_rate": 8.21430067907807e-06, + "loss": 0.6233, + "step": 2846 + }, + { + "epoch": 0.3, + "grad_norm": 3.1121220094877007, + "learning_rate": 8.212995073929002e-06, + "loss": 0.7069, + "step": 2847 + }, + { + "epoch": 0.3, + "grad_norm": 1.1733089591500396, + "learning_rate": 8.211689095503457e-06, + "loss": 0.6373, + "step": 2848 + }, + { + "epoch": 0.3, + "grad_norm": 2.53436764490174, + "learning_rate": 8.210382743953159e-06, + "loss": 0.6675, + "step": 2849 + }, + { + "epoch": 0.3, + "grad_norm": 2.9962280045699328, + "learning_rate": 8.20907601942987e-06, + "loss": 0.6339, + "step": 2850 + }, + { + "epoch": 0.3, + "grad_norm": 2.6687863772007625, + "learning_rate": 8.207768922085408e-06, + "loss": 0.6427, + "step": 2851 + }, + { + "epoch": 0.3, + "grad_norm": 2.4367057789328523, + "learning_rate": 8.206461452071625e-06, + "loss": 0.6382, + "step": 2852 + }, + { + "epoch": 0.3, + "grad_norm": 2.494084327088906, + "learning_rate": 8.20515360954042e-06, + "loss": 0.7028, + "step": 2853 + }, + { + "epoch": 0.3, + "grad_norm": 3.0800899409941738, + "learning_rate": 8.203845394643732e-06, + "loss": 0.6296, + "step": 2854 + }, + { + "epoch": 0.3, + "grad_norm": 10.158844023386168, + "learning_rate": 8.202536807533548e-06, + "loss": 0.7244, + "step": 2855 + }, + { + "epoch": 0.3, + "grad_norm": 2.667508619561635, + "learning_rate": 8.201227848361895e-06, + "loss": 0.7001, + "step": 2856 + }, + { + "epoch": 0.3, + "grad_norm": 2.53043885098298, + "learning_rate": 8.199918517280848e-06, + "loss": 0.6844, + "step": 2857 + }, + { + "epoch": 0.3, + "grad_norm": 3.114762138835378, + "learning_rate": 8.198608814442513e-06, + "loss": 0.6376, + "step": 2858 + }, + { + "epoch": 0.3, + "grad_norm": 2.9348916221619814, + "learning_rate": 8.197298739999055e-06, + "loss": 0.6631, + "step": 2859 + }, + { + "epoch": 0.3, + "grad_norm": 2.2713830464167466, + "learning_rate": 8.19598829410267e-06, + "loss": 0.5696, + "step": 2860 + }, + { + "epoch": 0.3, + "grad_norm": 2.640428459107109, + "learning_rate": 8.194677476905604e-06, + "loss": 0.6494, + "step": 2861 + }, + { + "epoch": 0.3, + "grad_norm": 2.458537500123125, + "learning_rate": 8.193366288560144e-06, + "loss": 0.7073, + "step": 2862 + }, + { + "epoch": 0.3, + "grad_norm": 2.789431886976531, + "learning_rate": 8.192054729218621e-06, + "loss": 0.6223, + "step": 2863 + }, + { + "epoch": 0.3, + "grad_norm": 2.0346142539870478, + "learning_rate": 8.190742799033404e-06, + "loss": 0.6502, + "step": 2864 + }, + { + "epoch": 0.3, + "grad_norm": 2.523541617507126, + "learning_rate": 8.189430498156914e-06, + "loss": 0.6346, + "step": 2865 + }, + { + "epoch": 0.3, + "grad_norm": 2.2087194523547145, + "learning_rate": 8.18811782674161e-06, + "loss": 0.6108, + "step": 2866 + }, + { + "epoch": 0.3, + "grad_norm": 2.491402481550859, + "learning_rate": 8.18680478493999e-06, + "loss": 0.6853, + "step": 2867 + }, + { + "epoch": 0.3, + "grad_norm": 3.026321341321937, + "learning_rate": 8.185491372904604e-06, + "loss": 0.6313, + "step": 2868 + }, + { + "epoch": 0.3, + "grad_norm": 2.6138591078805526, + "learning_rate": 8.184177590788038e-06, + "loss": 0.6652, + "step": 2869 + }, + { + "epoch": 0.3, + "grad_norm": 2.519595862218996, + "learning_rate": 8.182863438742922e-06, + "loss": 0.7254, + "step": 2870 + }, + { + "epoch": 0.3, + "grad_norm": 2.4026200308540617, + "learning_rate": 8.181548916921935e-06, + "loss": 0.5704, + "step": 2871 + }, + { + "epoch": 0.3, + "grad_norm": 2.580908432962159, + "learning_rate": 8.180234025477792e-06, + "loss": 0.6507, + "step": 2872 + }, + { + "epoch": 0.3, + "grad_norm": 2.268967709119169, + "learning_rate": 8.178918764563251e-06, + "loss": 0.718, + "step": 2873 + }, + { + "epoch": 0.3, + "grad_norm": 2.2396586103792795, + "learning_rate": 8.177603134331119e-06, + "loss": 0.59, + "step": 2874 + }, + { + "epoch": 0.3, + "grad_norm": 2.7093881022757165, + "learning_rate": 8.17628713493424e-06, + "loss": 0.6465, + "step": 2875 + }, + { + "epoch": 0.3, + "grad_norm": 2.2292433893864105, + "learning_rate": 8.174970766525503e-06, + "loss": 0.6516, + "step": 2876 + }, + { + "epoch": 0.3, + "grad_norm": 3.0744331784154943, + "learning_rate": 8.17365402925784e-06, + "loss": 0.6089, + "step": 2877 + }, + { + "epoch": 0.3, + "grad_norm": 2.563571554597488, + "learning_rate": 8.172336923284225e-06, + "loss": 0.6704, + "step": 2878 + }, + { + "epoch": 0.3, + "grad_norm": 2.8782161919357705, + "learning_rate": 8.17101944875768e-06, + "loss": 0.7115, + "step": 2879 + }, + { + "epoch": 0.3, + "grad_norm": 2.662681630857156, + "learning_rate": 8.16970160583126e-06, + "loss": 0.6443, + "step": 2880 + }, + { + "epoch": 0.3, + "grad_norm": 2.35955388779492, + "learning_rate": 8.16838339465807e-06, + "loss": 0.7116, + "step": 2881 + }, + { + "epoch": 0.3, + "grad_norm": 3.1676445753173974, + "learning_rate": 8.167064815391254e-06, + "loss": 0.6053, + "step": 2882 + }, + { + "epoch": 0.3, + "grad_norm": 2.466081183525901, + "learning_rate": 8.165745868184006e-06, + "loss": 0.6041, + "step": 2883 + }, + { + "epoch": 0.3, + "grad_norm": 2.0720974841936783, + "learning_rate": 8.164426553189553e-06, + "loss": 0.6562, + "step": 2884 + }, + { + "epoch": 0.3, + "grad_norm": 3.740189082503985, + "learning_rate": 8.16310687056117e-06, + "loss": 0.7227, + "step": 2885 + }, + { + "epoch": 0.3, + "grad_norm": 2.978902500808749, + "learning_rate": 8.161786820452176e-06, + "loss": 0.6807, + "step": 2886 + }, + { + "epoch": 0.3, + "grad_norm": 2.301320445626182, + "learning_rate": 8.160466403015928e-06, + "loss": 0.726, + "step": 2887 + }, + { + "epoch": 0.3, + "grad_norm": 2.267037663828458, + "learning_rate": 8.159145618405828e-06, + "loss": 0.6489, + "step": 2888 + }, + { + "epoch": 0.3, + "grad_norm": 2.015189747260077, + "learning_rate": 8.157824466775324e-06, + "loss": 0.6653, + "step": 2889 + }, + { + "epoch": 0.3, + "grad_norm": 2.521922180245439, + "learning_rate": 8.156502948277902e-06, + "loss": 0.6805, + "step": 2890 + }, + { + "epoch": 0.3, + "grad_norm": 2.12799521453685, + "learning_rate": 8.15518106306709e-06, + "loss": 0.6851, + "step": 2891 + }, + { + "epoch": 0.3, + "grad_norm": 2.0483331306231944, + "learning_rate": 8.153858811296465e-06, + "loss": 0.6434, + "step": 2892 + }, + { + "epoch": 0.3, + "grad_norm": 5.707423136219375, + "learning_rate": 8.152536193119638e-06, + "loss": 0.577, + "step": 2893 + }, + { + "epoch": 0.3, + "grad_norm": 3.092278245285496, + "learning_rate": 8.151213208690271e-06, + "loss": 0.6721, + "step": 2894 + }, + { + "epoch": 0.3, + "grad_norm": 2.561997031833164, + "learning_rate": 8.149889858162062e-06, + "loss": 0.6855, + "step": 2895 + }, + { + "epoch": 0.3, + "grad_norm": 2.0803284506968662, + "learning_rate": 8.148566141688755e-06, + "loss": 0.6528, + "step": 2896 + }, + { + "epoch": 0.3, + "grad_norm": 2.9962095481419624, + "learning_rate": 8.147242059424134e-06, + "loss": 0.7353, + "step": 2897 + }, + { + "epoch": 0.3, + "grad_norm": 2.5237602734129516, + "learning_rate": 8.145917611522029e-06, + "loss": 0.6057, + "step": 2898 + }, + { + "epoch": 0.31, + "grad_norm": 1.8369538414675164, + "learning_rate": 8.14459279813631e-06, + "loss": 0.7189, + "step": 2899 + }, + { + "epoch": 0.31, + "grad_norm": 1.2530174295755743, + "learning_rate": 8.143267619420892e-06, + "loss": 0.605, + "step": 2900 + }, + { + "epoch": 0.31, + "grad_norm": 2.6147582967437226, + "learning_rate": 8.141942075529725e-06, + "loss": 0.7003, + "step": 2901 + }, + { + "epoch": 0.31, + "grad_norm": 2.8824381178183907, + "learning_rate": 8.14061616661681e-06, + "loss": 0.7087, + "step": 2902 + }, + { + "epoch": 0.31, + "grad_norm": 3.993274788781763, + "learning_rate": 8.13928989283619e-06, + "loss": 0.5397, + "step": 2903 + }, + { + "epoch": 0.31, + "grad_norm": 2.5531708615561, + "learning_rate": 8.137963254341944e-06, + "loss": 0.6569, + "step": 2904 + }, + { + "epoch": 0.31, + "grad_norm": 2.666082677328026, + "learning_rate": 8.136636251288197e-06, + "loss": 0.7032, + "step": 2905 + }, + { + "epoch": 0.31, + "grad_norm": 2.483321235755457, + "learning_rate": 8.135308883829119e-06, + "loss": 0.6559, + "step": 2906 + }, + { + "epoch": 0.31, + "grad_norm": 2.2907791966791415, + "learning_rate": 8.133981152118916e-06, + "loss": 0.6794, + "step": 2907 + }, + { + "epoch": 0.31, + "grad_norm": 2.2809901014129657, + "learning_rate": 8.132653056311844e-06, + "loss": 0.6586, + "step": 2908 + }, + { + "epoch": 0.31, + "grad_norm": 3.5398481577715, + "learning_rate": 8.131324596562195e-06, + "loss": 0.6637, + "step": 2909 + }, + { + "epoch": 0.31, + "grad_norm": 2.7594732513859443, + "learning_rate": 8.129995773024306e-06, + "loss": 0.7316, + "step": 2910 + }, + { + "epoch": 0.31, + "grad_norm": 3.30105743088516, + "learning_rate": 8.128666585852556e-06, + "loss": 0.6668, + "step": 2911 + }, + { + "epoch": 0.31, + "grad_norm": 2.731674755400316, + "learning_rate": 8.127337035201365e-06, + "loss": 0.6782, + "step": 2912 + }, + { + "epoch": 0.31, + "grad_norm": 3.186947648727673, + "learning_rate": 8.1260071212252e-06, + "loss": 0.6693, + "step": 2913 + }, + { + "epoch": 0.31, + "grad_norm": 2.198681447100411, + "learning_rate": 8.12467684407856e-06, + "loss": 0.6396, + "step": 2914 + }, + { + "epoch": 0.31, + "grad_norm": 2.5485456110616918, + "learning_rate": 8.123346203916e-06, + "loss": 0.6218, + "step": 2915 + }, + { + "epoch": 0.31, + "grad_norm": 2.914548621339626, + "learning_rate": 8.122015200892106e-06, + "loss": 0.6717, + "step": 2916 + }, + { + "epoch": 0.31, + "grad_norm": 2.593755867982301, + "learning_rate": 8.120683835161511e-06, + "loss": 0.6373, + "step": 2917 + }, + { + "epoch": 0.31, + "grad_norm": 2.062797790828946, + "learning_rate": 8.11935210687889e-06, + "loss": 0.6702, + "step": 2918 + }, + { + "epoch": 0.31, + "grad_norm": 2.9875147078800683, + "learning_rate": 8.118020016198957e-06, + "loss": 0.6398, + "step": 2919 + }, + { + "epoch": 0.31, + "grad_norm": 2.022004557471706, + "learning_rate": 8.11668756327647e-06, + "loss": 0.7186, + "step": 2920 + }, + { + "epoch": 0.31, + "grad_norm": 1.2197328167155737, + "learning_rate": 8.115354748266233e-06, + "loss": 0.652, + "step": 2921 + }, + { + "epoch": 0.31, + "grad_norm": 2.0617705515592957, + "learning_rate": 8.114021571323089e-06, + "loss": 0.5757, + "step": 2922 + }, + { + "epoch": 0.31, + "grad_norm": 2.3145783721939996, + "learning_rate": 8.112688032601919e-06, + "loss": 0.6625, + "step": 2923 + }, + { + "epoch": 0.31, + "grad_norm": 2.531303219988851, + "learning_rate": 8.111354132257651e-06, + "loss": 0.6679, + "step": 2924 + }, + { + "epoch": 0.31, + "grad_norm": 2.6045952874222555, + "learning_rate": 8.110019870445254e-06, + "loss": 0.7008, + "step": 2925 + }, + { + "epoch": 0.31, + "grad_norm": 2.9426229731805704, + "learning_rate": 8.10868524731974e-06, + "loss": 0.6898, + "step": 2926 + }, + { + "epoch": 0.31, + "grad_norm": 2.261653088656059, + "learning_rate": 8.107350263036157e-06, + "loss": 0.6312, + "step": 2927 + }, + { + "epoch": 0.31, + "grad_norm": 4.259944992592717, + "learning_rate": 8.106014917749605e-06, + "loss": 0.6676, + "step": 2928 + }, + { + "epoch": 0.31, + "grad_norm": 2.6421624462915765, + "learning_rate": 8.104679211615218e-06, + "loss": 0.7101, + "step": 2929 + }, + { + "epoch": 0.31, + "grad_norm": 2.3871886404739504, + "learning_rate": 8.103343144788177e-06, + "loss": 0.6416, + "step": 2930 + }, + { + "epoch": 0.31, + "grad_norm": 2.1379036146977164, + "learning_rate": 8.102006717423695e-06, + "loss": 0.6405, + "step": 2931 + }, + { + "epoch": 0.31, + "grad_norm": 3.0829424463195303, + "learning_rate": 8.100669929677044e-06, + "loss": 0.6991, + "step": 2932 + }, + { + "epoch": 0.31, + "grad_norm": 2.45544537675686, + "learning_rate": 8.099332781703523e-06, + "loss": 0.6083, + "step": 2933 + }, + { + "epoch": 0.31, + "grad_norm": 2.5320119199003224, + "learning_rate": 8.097995273658479e-06, + "loss": 0.6114, + "step": 2934 + }, + { + "epoch": 0.31, + "grad_norm": 2.2301392753727916, + "learning_rate": 8.0966574056973e-06, + "loss": 0.7644, + "step": 2935 + }, + { + "epoch": 0.31, + "grad_norm": 2.397360362520422, + "learning_rate": 8.095319177975412e-06, + "loss": 0.5961, + "step": 2936 + }, + { + "epoch": 0.31, + "grad_norm": 3.5622955344160414, + "learning_rate": 8.093980590648291e-06, + "loss": 0.6874, + "step": 2937 + }, + { + "epoch": 0.31, + "grad_norm": 2.481733875250971, + "learning_rate": 8.092641643871451e-06, + "loss": 0.5639, + "step": 2938 + }, + { + "epoch": 0.31, + "grad_norm": 2.237192875379186, + "learning_rate": 8.091302337800441e-06, + "loss": 0.6794, + "step": 2939 + }, + { + "epoch": 0.31, + "grad_norm": 2.364977168624842, + "learning_rate": 8.089962672590865e-06, + "loss": 0.6851, + "step": 2940 + }, + { + "epoch": 0.31, + "grad_norm": 2.155447396944477, + "learning_rate": 8.088622648398357e-06, + "loss": 0.6662, + "step": 2941 + }, + { + "epoch": 0.31, + "grad_norm": 2.8646626316625636, + "learning_rate": 8.087282265378596e-06, + "loss": 0.6547, + "step": 2942 + }, + { + "epoch": 0.31, + "grad_norm": 2.999962898982827, + "learning_rate": 8.085941523687309e-06, + "loss": 0.6913, + "step": 2943 + }, + { + "epoch": 0.31, + "grad_norm": 2.793000783068167, + "learning_rate": 8.084600423480253e-06, + "loss": 0.7647, + "step": 2944 + }, + { + "epoch": 0.31, + "grad_norm": 1.294318321894251, + "learning_rate": 8.083258964913238e-06, + "loss": 0.6202, + "step": 2945 + }, + { + "epoch": 0.31, + "grad_norm": 2.3681448665076648, + "learning_rate": 8.08191714814211e-06, + "loss": 0.6897, + "step": 2946 + }, + { + "epoch": 0.31, + "grad_norm": 3.4035241766844258, + "learning_rate": 8.080574973322755e-06, + "loss": 0.6347, + "step": 2947 + }, + { + "epoch": 0.31, + "grad_norm": 10.697359357745482, + "learning_rate": 8.079232440611106e-06, + "loss": 0.6607, + "step": 2948 + }, + { + "epoch": 0.31, + "grad_norm": 11.08038730758078, + "learning_rate": 8.077889550163133e-06, + "loss": 0.6913, + "step": 2949 + }, + { + "epoch": 0.31, + "grad_norm": 2.289498594594031, + "learning_rate": 8.076546302134849e-06, + "loss": 0.5815, + "step": 2950 + }, + { + "epoch": 0.31, + "grad_norm": 3.8411656849036024, + "learning_rate": 8.07520269668231e-06, + "loss": 0.6598, + "step": 2951 + }, + { + "epoch": 0.31, + "grad_norm": 3.9514846702421127, + "learning_rate": 8.073858733961609e-06, + "loss": 0.7163, + "step": 2952 + }, + { + "epoch": 0.31, + "grad_norm": 3.287925053791738, + "learning_rate": 8.072514414128886e-06, + "loss": 0.6914, + "step": 2953 + }, + { + "epoch": 0.31, + "grad_norm": 2.850131167037012, + "learning_rate": 8.071169737340322e-06, + "loss": 0.6972, + "step": 2954 + }, + { + "epoch": 0.31, + "grad_norm": 3.3564428711936714, + "learning_rate": 8.069824703752136e-06, + "loss": 0.6299, + "step": 2955 + }, + { + "epoch": 0.31, + "grad_norm": 2.5337256748284793, + "learning_rate": 8.068479313520589e-06, + "loss": 0.7192, + "step": 2956 + }, + { + "epoch": 0.31, + "grad_norm": 2.7829671234993767, + "learning_rate": 8.067133566801986e-06, + "loss": 0.6942, + "step": 2957 + }, + { + "epoch": 0.31, + "grad_norm": 2.0780064728452814, + "learning_rate": 8.06578746375267e-06, + "loss": 0.699, + "step": 2958 + }, + { + "epoch": 0.31, + "grad_norm": 2.066654395430205, + "learning_rate": 8.06444100452903e-06, + "loss": 0.6052, + "step": 2959 + }, + { + "epoch": 0.31, + "grad_norm": 2.3241867147767636, + "learning_rate": 8.063094189287492e-06, + "loss": 0.6425, + "step": 2960 + }, + { + "epoch": 0.31, + "grad_norm": 2.936750856792946, + "learning_rate": 8.061747018184525e-06, + "loss": 0.6816, + "step": 2961 + }, + { + "epoch": 0.31, + "grad_norm": 2.564282733037237, + "learning_rate": 8.06039949137664e-06, + "loss": 0.7117, + "step": 2962 + }, + { + "epoch": 0.31, + "grad_norm": 35.37441915823146, + "learning_rate": 8.05905160902039e-06, + "loss": 0.7604, + "step": 2963 + }, + { + "epoch": 0.31, + "grad_norm": 3.470994439733572, + "learning_rate": 8.057703371272368e-06, + "loss": 0.6648, + "step": 2964 + }, + { + "epoch": 0.31, + "grad_norm": 2.653406363973704, + "learning_rate": 8.056354778289204e-06, + "loss": 0.6818, + "step": 2965 + }, + { + "epoch": 0.31, + "grad_norm": 2.7162242932530374, + "learning_rate": 8.055005830227578e-06, + "loss": 0.6176, + "step": 2966 + }, + { + "epoch": 0.31, + "grad_norm": 2.0336246489108563, + "learning_rate": 8.053656527244206e-06, + "loss": 0.6957, + "step": 2967 + }, + { + "epoch": 0.31, + "grad_norm": 2.404566189632472, + "learning_rate": 8.052306869495847e-06, + "loss": 0.6862, + "step": 2968 + }, + { + "epoch": 0.31, + "grad_norm": 2.662561717072185, + "learning_rate": 8.050956857139298e-06, + "loss": 0.7162, + "step": 2969 + }, + { + "epoch": 0.31, + "grad_norm": 2.5875784364886254, + "learning_rate": 8.049606490331403e-06, + "loss": 0.6401, + "step": 2970 + }, + { + "epoch": 0.31, + "grad_norm": 2.978839811791059, + "learning_rate": 8.048255769229038e-06, + "loss": 0.6423, + "step": 2971 + }, + { + "epoch": 0.31, + "grad_norm": 3.159696497187548, + "learning_rate": 8.046904693989132e-06, + "loss": 0.6541, + "step": 2972 + }, + { + "epoch": 0.31, + "grad_norm": 3.108106082999746, + "learning_rate": 8.045553264768645e-06, + "loss": 0.7498, + "step": 2973 + }, + { + "epoch": 0.31, + "grad_norm": 2.4365012913470747, + "learning_rate": 8.044201481724582e-06, + "loss": 0.6099, + "step": 2974 + }, + { + "epoch": 0.31, + "grad_norm": 2.9007518834879593, + "learning_rate": 8.042849345013995e-06, + "loss": 0.6726, + "step": 2975 + }, + { + "epoch": 0.31, + "grad_norm": 2.2970067282582476, + "learning_rate": 8.041496854793964e-06, + "loss": 0.5851, + "step": 2976 + }, + { + "epoch": 0.31, + "grad_norm": 3.006328818924341, + "learning_rate": 8.040144011221621e-06, + "loss": 0.6791, + "step": 2977 + }, + { + "epoch": 0.31, + "grad_norm": 2.6939058418287805, + "learning_rate": 8.038790814454137e-06, + "loss": 0.726, + "step": 2978 + }, + { + "epoch": 0.31, + "grad_norm": 2.2054039052694705, + "learning_rate": 8.037437264648717e-06, + "loss": 0.6719, + "step": 2979 + }, + { + "epoch": 0.31, + "grad_norm": 2.41222955568751, + "learning_rate": 8.036083361962616e-06, + "loss": 0.6879, + "step": 2980 + }, + { + "epoch": 0.31, + "grad_norm": 2.4774726350606158, + "learning_rate": 8.03472910655313e-06, + "loss": 0.7345, + "step": 2981 + }, + { + "epoch": 0.31, + "grad_norm": 2.3042323688271327, + "learning_rate": 8.033374498577586e-06, + "loss": 0.7063, + "step": 2982 + }, + { + "epoch": 0.31, + "grad_norm": 2.7730077023368938, + "learning_rate": 8.032019538193363e-06, + "loss": 0.7008, + "step": 2983 + }, + { + "epoch": 0.31, + "grad_norm": 2.7189215222404144, + "learning_rate": 8.030664225557873e-06, + "loss": 0.6169, + "step": 2984 + }, + { + "epoch": 0.31, + "grad_norm": 2.845427019172591, + "learning_rate": 8.029308560828574e-06, + "loss": 0.7042, + "step": 2985 + }, + { + "epoch": 0.31, + "grad_norm": 2.311773529022882, + "learning_rate": 8.027952544162965e-06, + "loss": 0.7276, + "step": 2986 + }, + { + "epoch": 0.31, + "grad_norm": 4.498629343600977, + "learning_rate": 8.026596175718582e-06, + "loss": 0.65, + "step": 2987 + }, + { + "epoch": 0.31, + "grad_norm": 2.527545636273197, + "learning_rate": 8.025239455653003e-06, + "loss": 0.6596, + "step": 2988 + }, + { + "epoch": 0.31, + "grad_norm": 2.3431814650204354, + "learning_rate": 8.023882384123851e-06, + "loss": 0.5784, + "step": 2989 + }, + { + "epoch": 0.31, + "grad_norm": 2.266244506572552, + "learning_rate": 8.022524961288783e-06, + "loss": 0.6536, + "step": 2990 + }, + { + "epoch": 0.31, + "grad_norm": 2.4413399629821395, + "learning_rate": 8.021167187305504e-06, + "loss": 0.648, + "step": 2991 + }, + { + "epoch": 0.31, + "grad_norm": 3.8044222712960485, + "learning_rate": 8.019809062331754e-06, + "loss": 0.6785, + "step": 2992 + }, + { + "epoch": 0.31, + "grad_norm": 2.8009795918380527, + "learning_rate": 8.018450586525314e-06, + "loss": 0.6825, + "step": 2993 + }, + { + "epoch": 0.32, + "grad_norm": 2.044733004957816, + "learning_rate": 8.017091760044014e-06, + "loss": 0.7218, + "step": 2994 + }, + { + "epoch": 0.32, + "grad_norm": 2.5348478649648625, + "learning_rate": 8.015732583045713e-06, + "loss": 0.6306, + "step": 2995 + }, + { + "epoch": 0.32, + "grad_norm": 2.820292578056801, + "learning_rate": 8.014373055688319e-06, + "loss": 0.6103, + "step": 2996 + }, + { + "epoch": 0.32, + "grad_norm": 2.83941151917699, + "learning_rate": 8.013013178129775e-06, + "loss": 0.739, + "step": 2997 + }, + { + "epoch": 0.32, + "grad_norm": 2.562461642222233, + "learning_rate": 8.01165295052807e-06, + "loss": 0.6528, + "step": 2998 + }, + { + "epoch": 0.32, + "grad_norm": 2.2991584011557946, + "learning_rate": 8.010292373041233e-06, + "loss": 0.6633, + "step": 2999 + }, + { + "epoch": 0.32, + "grad_norm": 3.0164089815235755, + "learning_rate": 8.008931445827329e-06, + "loss": 0.6508, + "step": 3000 + }, + { + "epoch": 0.32, + "grad_norm": 2.2488209086571036, + "learning_rate": 8.007570169044467e-06, + "loss": 0.6924, + "step": 3001 + }, + { + "epoch": 0.32, + "grad_norm": 4.067344347818427, + "learning_rate": 8.006208542850797e-06, + "loss": 0.716, + "step": 3002 + }, + { + "epoch": 0.32, + "grad_norm": 2.433534837689038, + "learning_rate": 8.004846567404509e-06, + "loss": 0.6682, + "step": 3003 + }, + { + "epoch": 0.32, + "grad_norm": 3.1675254720528176, + "learning_rate": 8.003484242863833e-06, + "loss": 0.744, + "step": 3004 + }, + { + "epoch": 0.32, + "grad_norm": 2.3134441373453916, + "learning_rate": 8.00212156938704e-06, + "loss": 0.5428, + "step": 3005 + }, + { + "epoch": 0.32, + "grad_norm": 2.8236083439191355, + "learning_rate": 8.000758547132441e-06, + "loss": 0.66, + "step": 3006 + }, + { + "epoch": 0.32, + "grad_norm": 2.158397208676714, + "learning_rate": 7.99939517625839e-06, + "loss": 0.7367, + "step": 3007 + }, + { + "epoch": 0.32, + "grad_norm": 2.2654650330105257, + "learning_rate": 7.998031456923274e-06, + "loss": 0.5894, + "step": 3008 + }, + { + "epoch": 0.32, + "grad_norm": 2.5662295257161833, + "learning_rate": 7.996667389285532e-06, + "loss": 0.6596, + "step": 3009 + }, + { + "epoch": 0.32, + "grad_norm": 2.677019268255404, + "learning_rate": 7.995302973503636e-06, + "loss": 0.6978, + "step": 3010 + }, + { + "epoch": 0.32, + "grad_norm": 2.371436079489015, + "learning_rate": 7.993938209736097e-06, + "loss": 0.69, + "step": 3011 + }, + { + "epoch": 0.32, + "grad_norm": 2.1346580337024186, + "learning_rate": 7.992573098141472e-06, + "loss": 0.644, + "step": 3012 + }, + { + "epoch": 0.32, + "grad_norm": 1.9295251025511633, + "learning_rate": 7.991207638878356e-06, + "loss": 0.6617, + "step": 3013 + }, + { + "epoch": 0.32, + "grad_norm": 2.558479337294168, + "learning_rate": 7.989841832105382e-06, + "loss": 0.7159, + "step": 3014 + }, + { + "epoch": 0.32, + "grad_norm": 2.147625038233415, + "learning_rate": 7.988475677981229e-06, + "loss": 0.6179, + "step": 3015 + }, + { + "epoch": 0.32, + "grad_norm": 2.1693505791045053, + "learning_rate": 7.98710917666461e-06, + "loss": 0.6942, + "step": 3016 + }, + { + "epoch": 0.32, + "grad_norm": 3.1568770622983475, + "learning_rate": 7.985742328314279e-06, + "loss": 0.7053, + "step": 3017 + }, + { + "epoch": 0.32, + "grad_norm": 2.1959613312211084, + "learning_rate": 7.984375133089038e-06, + "loss": 0.637, + "step": 3018 + }, + { + "epoch": 0.32, + "grad_norm": 9.895723607078637, + "learning_rate": 7.98300759114772e-06, + "loss": 0.7098, + "step": 3019 + }, + { + "epoch": 0.32, + "grad_norm": 2.234596285844836, + "learning_rate": 7.981639702649204e-06, + "loss": 0.6282, + "step": 3020 + }, + { + "epoch": 0.32, + "grad_norm": 2.2374506416442133, + "learning_rate": 7.980271467752405e-06, + "loss": 0.6773, + "step": 3021 + }, + { + "epoch": 0.32, + "grad_norm": 2.6097297477113326, + "learning_rate": 7.97890288661628e-06, + "loss": 0.7456, + "step": 3022 + }, + { + "epoch": 0.32, + "grad_norm": 2.8225474093932426, + "learning_rate": 7.977533959399833e-06, + "loss": 0.68, + "step": 3023 + }, + { + "epoch": 0.32, + "grad_norm": 2.6648211533685795, + "learning_rate": 7.976164686262096e-06, + "loss": 0.686, + "step": 3024 + }, + { + "epoch": 0.32, + "grad_norm": 2.3486556822537206, + "learning_rate": 7.974795067362148e-06, + "loss": 0.7185, + "step": 3025 + }, + { + "epoch": 0.32, + "grad_norm": 2.529824254487311, + "learning_rate": 7.97342510285911e-06, + "loss": 0.6447, + "step": 3026 + }, + { + "epoch": 0.32, + "grad_norm": 2.7391077394652763, + "learning_rate": 7.972054792912138e-06, + "loss": 0.6169, + "step": 3027 + }, + { + "epoch": 0.32, + "grad_norm": 2.237331937538726, + "learning_rate": 7.970684137680431e-06, + "loss": 0.6488, + "step": 3028 + }, + { + "epoch": 0.32, + "grad_norm": 2.2823597001812614, + "learning_rate": 7.969313137323228e-06, + "loss": 0.6577, + "step": 3029 + }, + { + "epoch": 0.32, + "grad_norm": 2.6046928602923143, + "learning_rate": 7.96794179199981e-06, + "loss": 0.682, + "step": 3030 + }, + { + "epoch": 0.32, + "grad_norm": 3.018838373919387, + "learning_rate": 7.966570101869494e-06, + "loss": 0.6314, + "step": 3031 + }, + { + "epoch": 0.32, + "grad_norm": 3.190243611941645, + "learning_rate": 7.965198067091637e-06, + "loss": 0.6224, + "step": 3032 + }, + { + "epoch": 0.32, + "grad_norm": 2.4033899287674183, + "learning_rate": 7.96382568782564e-06, + "loss": 0.6538, + "step": 3033 + }, + { + "epoch": 0.32, + "grad_norm": 2.57052475740044, + "learning_rate": 7.962452964230944e-06, + "loss": 0.6519, + "step": 3034 + }, + { + "epoch": 0.32, + "grad_norm": 3.0579037048889752, + "learning_rate": 7.961079896467025e-06, + "loss": 0.5823, + "step": 3035 + }, + { + "epoch": 0.32, + "grad_norm": 2.8012393029432645, + "learning_rate": 7.959706484693405e-06, + "loss": 0.6989, + "step": 3036 + }, + { + "epoch": 0.32, + "grad_norm": 2.7558779760997525, + "learning_rate": 7.95833272906964e-06, + "loss": 0.6581, + "step": 3037 + }, + { + "epoch": 0.32, + "grad_norm": 2.1874011393784665, + "learning_rate": 7.95695862975533e-06, + "loss": 0.6699, + "step": 3038 + }, + { + "epoch": 0.32, + "grad_norm": 2.6164914345320107, + "learning_rate": 7.955584186910115e-06, + "loss": 0.7054, + "step": 3039 + }, + { + "epoch": 0.32, + "grad_norm": 3.0292864485504594, + "learning_rate": 7.954209400693673e-06, + "loss": 0.6537, + "step": 3040 + }, + { + "epoch": 0.32, + "grad_norm": 2.905511291216553, + "learning_rate": 7.95283427126572e-06, + "loss": 0.6966, + "step": 3041 + }, + { + "epoch": 0.32, + "grad_norm": 1.208663400211376, + "learning_rate": 7.95145879878602e-06, + "loss": 0.5702, + "step": 3042 + }, + { + "epoch": 0.32, + "grad_norm": 2.559598093856362, + "learning_rate": 7.950082983414367e-06, + "loss": 0.6517, + "step": 3043 + }, + { + "epoch": 0.32, + "grad_norm": 2.389487342933906, + "learning_rate": 7.948706825310601e-06, + "loss": 0.6289, + "step": 3044 + }, + { + "epoch": 0.32, + "grad_norm": 2.675926284212583, + "learning_rate": 7.947330324634601e-06, + "loss": 0.6088, + "step": 3045 + }, + { + "epoch": 0.32, + "grad_norm": 2.648358995913935, + "learning_rate": 7.945953481546282e-06, + "loss": 0.6291, + "step": 3046 + }, + { + "epoch": 0.32, + "grad_norm": 1.2023556963692583, + "learning_rate": 7.944576296205603e-06, + "loss": 0.5934, + "step": 3047 + }, + { + "epoch": 0.32, + "grad_norm": 2.627906298885444, + "learning_rate": 7.943198768772565e-06, + "loss": 0.743, + "step": 3048 + }, + { + "epoch": 0.32, + "grad_norm": 3.399467853821857, + "learning_rate": 7.9418208994072e-06, + "loss": 0.6587, + "step": 3049 + }, + { + "epoch": 0.32, + "grad_norm": 2.6749330583941258, + "learning_rate": 7.940442688269587e-06, + "loss": 0.6825, + "step": 3050 + }, + { + "epoch": 0.32, + "grad_norm": 2.9142489850897517, + "learning_rate": 7.939064135519844e-06, + "loss": 0.6535, + "step": 3051 + }, + { + "epoch": 0.32, + "grad_norm": 2.3087341250267763, + "learning_rate": 7.937685241318122e-06, + "loss": 0.6461, + "step": 3052 + }, + { + "epoch": 0.32, + "grad_norm": 6.292432896926418, + "learning_rate": 7.936306005824624e-06, + "loss": 0.6497, + "step": 3053 + }, + { + "epoch": 0.32, + "grad_norm": 3.3219986590091435, + "learning_rate": 7.93492642919958e-06, + "loss": 0.6326, + "step": 3054 + }, + { + "epoch": 0.32, + "grad_norm": 3.119470589054828, + "learning_rate": 7.933546511603269e-06, + "loss": 0.6417, + "step": 3055 + }, + { + "epoch": 0.32, + "grad_norm": 6.774960844710111, + "learning_rate": 7.932166253196004e-06, + "loss": 0.5901, + "step": 3056 + }, + { + "epoch": 0.32, + "grad_norm": 2.4456975659415847, + "learning_rate": 7.93078565413814e-06, + "loss": 0.6138, + "step": 3057 + }, + { + "epoch": 0.32, + "grad_norm": 2.8996471024029513, + "learning_rate": 7.92940471459007e-06, + "loss": 0.6084, + "step": 3058 + }, + { + "epoch": 0.32, + "grad_norm": 2.1348588208140797, + "learning_rate": 7.928023434712227e-06, + "loss": 0.6235, + "step": 3059 + }, + { + "epoch": 0.32, + "grad_norm": 3.1903589219833037, + "learning_rate": 7.926641814665088e-06, + "loss": 0.6413, + "step": 3060 + }, + { + "epoch": 0.32, + "grad_norm": 2.6063098506456126, + "learning_rate": 7.925259854609162e-06, + "loss": 0.6501, + "step": 3061 + }, + { + "epoch": 0.32, + "grad_norm": 7.68937491835249, + "learning_rate": 7.923877554705002e-06, + "loss": 0.632, + "step": 3062 + }, + { + "epoch": 0.32, + "grad_norm": 3.2423734100409196, + "learning_rate": 7.9224949151132e-06, + "loss": 0.7238, + "step": 3063 + }, + { + "epoch": 0.32, + "grad_norm": 2.607658237678769, + "learning_rate": 7.921111935994388e-06, + "loss": 0.6373, + "step": 3064 + }, + { + "epoch": 0.32, + "grad_norm": 2.727385481962859, + "learning_rate": 7.919728617509233e-06, + "loss": 0.6826, + "step": 3065 + }, + { + "epoch": 0.32, + "grad_norm": 2.6366816990101145, + "learning_rate": 7.91834495981845e-06, + "loss": 0.6223, + "step": 3066 + }, + { + "epoch": 0.32, + "grad_norm": 2.388243265491952, + "learning_rate": 7.916960963082783e-06, + "loss": 0.6517, + "step": 3067 + }, + { + "epoch": 0.32, + "grad_norm": 2.384651407591471, + "learning_rate": 7.915576627463024e-06, + "loss": 0.6422, + "step": 3068 + }, + { + "epoch": 0.32, + "grad_norm": 2.315078446495304, + "learning_rate": 7.91419195312e-06, + "loss": 0.7492, + "step": 3069 + }, + { + "epoch": 0.32, + "grad_norm": 2.5016948568104214, + "learning_rate": 7.91280694021458e-06, + "loss": 0.5617, + "step": 3070 + }, + { + "epoch": 0.32, + "grad_norm": 2.267739297104988, + "learning_rate": 7.91142158890767e-06, + "loss": 0.6419, + "step": 3071 + }, + { + "epoch": 0.32, + "grad_norm": 2.9294081180768607, + "learning_rate": 7.910035899360215e-06, + "loss": 0.6419, + "step": 3072 + }, + { + "epoch": 0.32, + "grad_norm": 2.5533958607494083, + "learning_rate": 7.908649871733202e-06, + "loss": 0.6337, + "step": 3073 + }, + { + "epoch": 0.32, + "grad_norm": 3.016887449204389, + "learning_rate": 7.907263506187655e-06, + "loss": 0.577, + "step": 3074 + }, + { + "epoch": 0.32, + "grad_norm": 2.229569058131735, + "learning_rate": 7.905876802884639e-06, + "loss": 0.6829, + "step": 3075 + }, + { + "epoch": 0.32, + "grad_norm": 3.650463843334712, + "learning_rate": 7.904489761985254e-06, + "loss": 0.6764, + "step": 3076 + }, + { + "epoch": 0.32, + "grad_norm": 2.5592076303986047, + "learning_rate": 7.903102383650645e-06, + "loss": 0.623, + "step": 3077 + }, + { + "epoch": 0.32, + "grad_norm": 5.032748852875668, + "learning_rate": 7.901714668041993e-06, + "loss": 0.6891, + "step": 3078 + }, + { + "epoch": 0.32, + "grad_norm": 2.317465920800203, + "learning_rate": 7.90032661532052e-06, + "loss": 0.6487, + "step": 3079 + }, + { + "epoch": 0.32, + "grad_norm": 2.305967746812263, + "learning_rate": 7.898938225647484e-06, + "loss": 0.6823, + "step": 3080 + }, + { + "epoch": 0.32, + "grad_norm": 2.8884399615654206, + "learning_rate": 7.897549499184184e-06, + "loss": 0.6296, + "step": 3081 + }, + { + "epoch": 0.32, + "grad_norm": 3.0770385988445907, + "learning_rate": 7.896160436091961e-06, + "loss": 0.6943, + "step": 3082 + }, + { + "epoch": 0.32, + "grad_norm": 2.4056547215114055, + "learning_rate": 7.894771036532189e-06, + "loss": 0.5929, + "step": 3083 + }, + { + "epoch": 0.32, + "grad_norm": 2.4323552445732304, + "learning_rate": 7.893381300666287e-06, + "loss": 0.6887, + "step": 3084 + }, + { + "epoch": 0.32, + "grad_norm": 2.343998812827096, + "learning_rate": 7.89199122865571e-06, + "loss": 0.6709, + "step": 3085 + }, + { + "epoch": 0.32, + "grad_norm": 3.0146219044287164, + "learning_rate": 7.89060082066195e-06, + "loss": 0.6724, + "step": 3086 + }, + { + "epoch": 0.32, + "grad_norm": 2.6033290148752357, + "learning_rate": 7.889210076846544e-06, + "loss": 0.6815, + "step": 3087 + }, + { + "epoch": 0.32, + "grad_norm": 2.381411272217176, + "learning_rate": 7.887818997371062e-06, + "loss": 0.6404, + "step": 3088 + }, + { + "epoch": 0.33, + "grad_norm": 2.904456744874637, + "learning_rate": 7.886427582397117e-06, + "loss": 0.7417, + "step": 3089 + }, + { + "epoch": 0.33, + "grad_norm": 2.2786773692260547, + "learning_rate": 7.88503583208636e-06, + "loss": 0.5864, + "step": 3090 + }, + { + "epoch": 0.33, + "grad_norm": 2.81932535707139, + "learning_rate": 7.88364374660048e-06, + "loss": 0.7001, + "step": 3091 + }, + { + "epoch": 0.33, + "grad_norm": 2.6849729938128997, + "learning_rate": 7.882251326101205e-06, + "loss": 0.6875, + "step": 3092 + }, + { + "epoch": 0.33, + "grad_norm": 3.3310488217574488, + "learning_rate": 7.8808585707503e-06, + "loss": 0.698, + "step": 3093 + }, + { + "epoch": 0.33, + "grad_norm": 2.6002798711965975, + "learning_rate": 7.879465480709577e-06, + "loss": 0.6669, + "step": 3094 + }, + { + "epoch": 0.33, + "grad_norm": 2.8876056113874142, + "learning_rate": 7.878072056140878e-06, + "loss": 0.6834, + "step": 3095 + }, + { + "epoch": 0.33, + "grad_norm": 3.537815529224319, + "learning_rate": 7.876678297206086e-06, + "loss": 0.7348, + "step": 3096 + }, + { + "epoch": 0.33, + "grad_norm": 2.8620170280847104, + "learning_rate": 7.875284204067127e-06, + "loss": 0.6779, + "step": 3097 + }, + { + "epoch": 0.33, + "grad_norm": 3.283985994119008, + "learning_rate": 7.873889776885959e-06, + "loss": 0.6443, + "step": 3098 + }, + { + "epoch": 0.33, + "grad_norm": 3.3105599250764297, + "learning_rate": 7.872495015824586e-06, + "loss": 0.6744, + "step": 3099 + }, + { + "epoch": 0.33, + "grad_norm": 2.9406649266002853, + "learning_rate": 7.871099921045042e-06, + "loss": 0.6953, + "step": 3100 + }, + { + "epoch": 0.33, + "grad_norm": 2.574644681179438, + "learning_rate": 7.869704492709412e-06, + "loss": 0.6711, + "step": 3101 + }, + { + "epoch": 0.33, + "grad_norm": 2.449965941126261, + "learning_rate": 7.868308730979809e-06, + "loss": 0.628, + "step": 3102 + }, + { + "epoch": 0.33, + "grad_norm": 2.772397515441643, + "learning_rate": 7.866912636018389e-06, + "loss": 0.7371, + "step": 3103 + }, + { + "epoch": 0.33, + "grad_norm": 2.39442828453295, + "learning_rate": 7.865516207987344e-06, + "loss": 0.63, + "step": 3104 + }, + { + "epoch": 0.33, + "grad_norm": 3.3246795963005713, + "learning_rate": 7.864119447048912e-06, + "loss": 0.7073, + "step": 3105 + }, + { + "epoch": 0.33, + "grad_norm": 3.0064966422057773, + "learning_rate": 7.862722353365361e-06, + "loss": 0.7265, + "step": 3106 + }, + { + "epoch": 0.33, + "grad_norm": 3.0807358878776006, + "learning_rate": 7.861324927099004e-06, + "loss": 0.7145, + "step": 3107 + }, + { + "epoch": 0.33, + "grad_norm": 1.069135073025866, + "learning_rate": 7.859927168412186e-06, + "loss": 0.6135, + "step": 3108 + }, + { + "epoch": 0.33, + "grad_norm": 3.286312915224411, + "learning_rate": 7.858529077467298e-06, + "loss": 0.6363, + "step": 3109 + }, + { + "epoch": 0.33, + "grad_norm": 2.2389602662700385, + "learning_rate": 7.857130654426764e-06, + "loss": 0.6131, + "step": 3110 + }, + { + "epoch": 0.33, + "grad_norm": 3.36015735420768, + "learning_rate": 7.85573189945305e-06, + "loss": 0.5841, + "step": 3111 + }, + { + "epoch": 0.33, + "grad_norm": 2.7462815401691096, + "learning_rate": 7.854332812708661e-06, + "loss": 0.6193, + "step": 3112 + }, + { + "epoch": 0.33, + "grad_norm": 3.0603590845258815, + "learning_rate": 7.852933394356134e-06, + "loss": 0.7154, + "step": 3113 + }, + { + "epoch": 0.33, + "grad_norm": 2.6664238931594584, + "learning_rate": 7.851533644558054e-06, + "loss": 0.747, + "step": 3114 + }, + { + "epoch": 0.33, + "grad_norm": 2.859101742503454, + "learning_rate": 7.850133563477037e-06, + "loss": 0.6958, + "step": 3115 + }, + { + "epoch": 0.33, + "grad_norm": 2.669270847924662, + "learning_rate": 7.848733151275741e-06, + "loss": 0.5718, + "step": 3116 + }, + { + "epoch": 0.33, + "grad_norm": 3.7219654697356863, + "learning_rate": 7.847332408116863e-06, + "loss": 0.6531, + "step": 3117 + }, + { + "epoch": 0.33, + "grad_norm": 3.7912563805814945, + "learning_rate": 7.845931334163138e-06, + "loss": 0.7386, + "step": 3118 + }, + { + "epoch": 0.33, + "grad_norm": 2.8886193603297348, + "learning_rate": 7.844529929577336e-06, + "loss": 0.6827, + "step": 3119 + }, + { + "epoch": 0.33, + "grad_norm": 2.586772606669079, + "learning_rate": 7.84312819452227e-06, + "loss": 0.6084, + "step": 3120 + }, + { + "epoch": 0.33, + "grad_norm": 2.6184892406372193, + "learning_rate": 7.841726129160789e-06, + "loss": 0.6775, + "step": 3121 + }, + { + "epoch": 0.33, + "grad_norm": 2.3983415224825637, + "learning_rate": 7.84032373365578e-06, + "loss": 0.6357, + "step": 3122 + }, + { + "epoch": 0.33, + "grad_norm": 3.1697316223767333, + "learning_rate": 7.838921008170171e-06, + "loss": 0.7038, + "step": 3123 + }, + { + "epoch": 0.33, + "grad_norm": 2.538074643605018, + "learning_rate": 7.837517952866924e-06, + "loss": 0.6816, + "step": 3124 + }, + { + "epoch": 0.33, + "grad_norm": 2.723960615531358, + "learning_rate": 7.836114567909046e-06, + "loss": 0.6351, + "step": 3125 + }, + { + "epoch": 0.33, + "grad_norm": 4.3141201817543715, + "learning_rate": 7.834710853459575e-06, + "loss": 0.6886, + "step": 3126 + }, + { + "epoch": 0.33, + "grad_norm": 6.935659578134951, + "learning_rate": 7.833306809681593e-06, + "loss": 0.6418, + "step": 3127 + }, + { + "epoch": 0.33, + "grad_norm": 2.8856037758667292, + "learning_rate": 7.831902436738215e-06, + "loss": 0.6849, + "step": 3128 + }, + { + "epoch": 0.33, + "grad_norm": 8.563709509210556, + "learning_rate": 7.830497734792597e-06, + "loss": 0.6117, + "step": 3129 + }, + { + "epoch": 0.33, + "grad_norm": 2.927718266986384, + "learning_rate": 7.829092704007935e-06, + "loss": 0.6543, + "step": 3130 + }, + { + "epoch": 0.33, + "grad_norm": 2.675602881652366, + "learning_rate": 7.827687344547459e-06, + "loss": 0.6443, + "step": 3131 + }, + { + "epoch": 0.33, + "grad_norm": 2.732964806627827, + "learning_rate": 7.826281656574444e-06, + "loss": 0.6704, + "step": 3132 + }, + { + "epoch": 0.33, + "grad_norm": 2.9065789120991763, + "learning_rate": 7.824875640252195e-06, + "loss": 0.7202, + "step": 3133 + }, + { + "epoch": 0.33, + "grad_norm": 2.5054335560181444, + "learning_rate": 7.82346929574406e-06, + "loss": 0.7212, + "step": 3134 + }, + { + "epoch": 0.33, + "grad_norm": 2.5085507480711065, + "learning_rate": 7.822062623213424e-06, + "loss": 0.7014, + "step": 3135 + }, + { + "epoch": 0.33, + "grad_norm": 8.434812127737116, + "learning_rate": 7.820655622823712e-06, + "loss": 0.6936, + "step": 3136 + }, + { + "epoch": 0.33, + "grad_norm": 7.281720661318549, + "learning_rate": 7.819248294738381e-06, + "loss": 0.6299, + "step": 3137 + }, + { + "epoch": 0.33, + "grad_norm": 3.336679866157924, + "learning_rate": 7.817840639120932e-06, + "loss": 0.6982, + "step": 3138 + }, + { + "epoch": 0.33, + "grad_norm": 2.7369577275240657, + "learning_rate": 7.816432656134907e-06, + "loss": 0.6836, + "step": 3139 + }, + { + "epoch": 0.33, + "grad_norm": 3.408523586518461, + "learning_rate": 7.815024345943874e-06, + "loss": 0.6304, + "step": 3140 + }, + { + "epoch": 0.33, + "grad_norm": 2.7339532046846244, + "learning_rate": 7.81361570871145e-06, + "loss": 0.6901, + "step": 3141 + }, + { + "epoch": 0.33, + "grad_norm": 2.55643397634351, + "learning_rate": 7.812206744601288e-06, + "loss": 0.7034, + "step": 3142 + }, + { + "epoch": 0.33, + "grad_norm": 2.7516095557400058, + "learning_rate": 7.810797453777076e-06, + "loss": 0.5308, + "step": 3143 + }, + { + "epoch": 0.33, + "grad_norm": 2.5846496687334355, + "learning_rate": 7.80938783640254e-06, + "loss": 0.7117, + "step": 3144 + }, + { + "epoch": 0.33, + "grad_norm": 2.8518803010333, + "learning_rate": 7.807977892641446e-06, + "loss": 0.7384, + "step": 3145 + }, + { + "epoch": 0.33, + "grad_norm": 3.978801136142066, + "learning_rate": 7.806567622657598e-06, + "loss": 0.6169, + "step": 3146 + }, + { + "epoch": 0.33, + "grad_norm": 3.8381379717790853, + "learning_rate": 7.805157026614836e-06, + "loss": 0.6696, + "step": 3147 + }, + { + "epoch": 0.33, + "grad_norm": 2.6451501046738706, + "learning_rate": 7.80374610467704e-06, + "loss": 0.695, + "step": 3148 + }, + { + "epoch": 0.33, + "grad_norm": 2.6556926713393936, + "learning_rate": 7.802334857008127e-06, + "loss": 0.5896, + "step": 3149 + }, + { + "epoch": 0.33, + "grad_norm": 2.3511219203796054, + "learning_rate": 7.800923283772051e-06, + "loss": 0.6078, + "step": 3150 + }, + { + "epoch": 0.33, + "grad_norm": 2.3055445730449935, + "learning_rate": 7.799511385132803e-06, + "loss": 0.7246, + "step": 3151 + }, + { + "epoch": 0.33, + "grad_norm": 2.807571982019713, + "learning_rate": 7.798099161254415e-06, + "loss": 0.5812, + "step": 3152 + }, + { + "epoch": 0.33, + "grad_norm": 2.9073765971064183, + "learning_rate": 7.796686612300957e-06, + "loss": 0.5974, + "step": 3153 + }, + { + "epoch": 0.33, + "grad_norm": 2.580983486608408, + "learning_rate": 7.795273738436531e-06, + "loss": 0.6505, + "step": 3154 + }, + { + "epoch": 0.33, + "grad_norm": 2.27033419980578, + "learning_rate": 7.793860539825282e-06, + "loss": 0.6077, + "step": 3155 + }, + { + "epoch": 0.33, + "grad_norm": 2.736113101860133, + "learning_rate": 7.792447016631392e-06, + "loss": 0.5951, + "step": 3156 + }, + { + "epoch": 0.33, + "grad_norm": 2.1064869720748733, + "learning_rate": 7.79103316901908e-06, + "loss": 0.6632, + "step": 3157 + }, + { + "epoch": 0.33, + "grad_norm": 3.0989709229973887, + "learning_rate": 7.789618997152603e-06, + "loss": 0.7188, + "step": 3158 + }, + { + "epoch": 0.33, + "grad_norm": 3.06812401144027, + "learning_rate": 7.788204501196255e-06, + "loss": 0.6171, + "step": 3159 + }, + { + "epoch": 0.33, + "grad_norm": 2.691041503384547, + "learning_rate": 7.786789681314368e-06, + "loss": 0.604, + "step": 3160 + }, + { + "epoch": 0.33, + "grad_norm": 2.614511086385431, + "learning_rate": 7.785374537671311e-06, + "loss": 0.6663, + "step": 3161 + }, + { + "epoch": 0.33, + "grad_norm": 2.6590381539539054, + "learning_rate": 7.783959070431492e-06, + "loss": 0.7339, + "step": 3162 + }, + { + "epoch": 0.33, + "grad_norm": 1.1374038834717708, + "learning_rate": 7.782543279759356e-06, + "loss": 0.5921, + "step": 3163 + }, + { + "epoch": 0.33, + "grad_norm": 2.7699969206806414, + "learning_rate": 7.781127165819386e-06, + "loss": 0.767, + "step": 3164 + }, + { + "epoch": 0.33, + "grad_norm": 2.0646485786942144, + "learning_rate": 7.7797107287761e-06, + "loss": 0.657, + "step": 3165 + }, + { + "epoch": 0.33, + "grad_norm": 3.383226812212046, + "learning_rate": 7.778293968794056e-06, + "loss": 0.6619, + "step": 3166 + }, + { + "epoch": 0.33, + "grad_norm": 2.3946861066654246, + "learning_rate": 7.776876886037852e-06, + "loss": 0.7148, + "step": 3167 + }, + { + "epoch": 0.33, + "grad_norm": 4.2920273098631965, + "learning_rate": 7.775459480672117e-06, + "loss": 0.6964, + "step": 3168 + }, + { + "epoch": 0.33, + "grad_norm": 2.74757852495417, + "learning_rate": 7.774041752861524e-06, + "loss": 0.6485, + "step": 3169 + }, + { + "epoch": 0.33, + "grad_norm": 3.8429087492176057, + "learning_rate": 7.772623702770779e-06, + "loss": 0.7287, + "step": 3170 + }, + { + "epoch": 0.33, + "grad_norm": 2.6463677367731218, + "learning_rate": 7.771205330564626e-06, + "loss": 0.6197, + "step": 3171 + }, + { + "epoch": 0.33, + "grad_norm": 2.3146582536567784, + "learning_rate": 7.769786636407849e-06, + "loss": 0.673, + "step": 3172 + }, + { + "epoch": 0.33, + "grad_norm": 3.097668002633075, + "learning_rate": 7.768367620465267e-06, + "loss": 0.6628, + "step": 3173 + }, + { + "epoch": 0.33, + "grad_norm": 2.6251746945849934, + "learning_rate": 7.766948282901738e-06, + "loss": 0.6698, + "step": 3174 + }, + { + "epoch": 0.33, + "grad_norm": 2.7114682635233325, + "learning_rate": 7.765528623882155e-06, + "loss": 0.6594, + "step": 3175 + }, + { + "epoch": 0.33, + "grad_norm": 3.4612588856168838, + "learning_rate": 7.76410864357145e-06, + "loss": 0.6582, + "step": 3176 + }, + { + "epoch": 0.33, + "grad_norm": 2.874426831691072, + "learning_rate": 7.762688342134597e-06, + "loss": 0.6661, + "step": 3177 + }, + { + "epoch": 0.33, + "grad_norm": 2.9148574564549485, + "learning_rate": 7.761267719736593e-06, + "loss": 0.6976, + "step": 3178 + }, + { + "epoch": 0.33, + "grad_norm": 3.0616632712894885, + "learning_rate": 7.759846776542492e-06, + "loss": 0.6677, + "step": 3179 + }, + { + "epoch": 0.33, + "grad_norm": 3.561508539134428, + "learning_rate": 7.75842551271737e-06, + "loss": 0.6977, + "step": 3180 + }, + { + "epoch": 0.33, + "grad_norm": 3.298240212255653, + "learning_rate": 7.757003928426342e-06, + "loss": 0.6149, + "step": 3181 + }, + { + "epoch": 0.33, + "grad_norm": 2.4188530769511982, + "learning_rate": 7.755582023834572e-06, + "loss": 0.6496, + "step": 3182 + }, + { + "epoch": 0.33, + "grad_norm": 2.8398810101886345, + "learning_rate": 7.754159799107244e-06, + "loss": 0.6351, + "step": 3183 + }, + { + "epoch": 0.34, + "grad_norm": 2.2499424827555927, + "learning_rate": 7.752737254409594e-06, + "loss": 0.7088, + "step": 3184 + }, + { + "epoch": 0.34, + "grad_norm": 2.3814957039535374, + "learning_rate": 7.751314389906887e-06, + "loss": 0.6245, + "step": 3185 + }, + { + "epoch": 0.34, + "grad_norm": 2.666046394806816, + "learning_rate": 7.749891205764427e-06, + "loss": 0.6133, + "step": 3186 + }, + { + "epoch": 0.34, + "grad_norm": 2.7799564388267295, + "learning_rate": 7.748467702147555e-06, + "loss": 0.605, + "step": 3187 + }, + { + "epoch": 0.34, + "grad_norm": 5.281029160468379, + "learning_rate": 7.747043879221653e-06, + "loss": 0.6321, + "step": 3188 + }, + { + "epoch": 0.34, + "grad_norm": 2.934308558200218, + "learning_rate": 7.745619737152133e-06, + "loss": 0.7053, + "step": 3189 + }, + { + "epoch": 0.34, + "grad_norm": 3.1545251666263026, + "learning_rate": 7.744195276104447e-06, + "loss": 0.6998, + "step": 3190 + }, + { + "epoch": 0.34, + "grad_norm": 2.642275030044787, + "learning_rate": 7.742770496244087e-06, + "loss": 0.7263, + "step": 3191 + }, + { + "epoch": 0.34, + "grad_norm": 1.9956469346760177, + "learning_rate": 7.74134539773658e-06, + "loss": 0.6707, + "step": 3192 + }, + { + "epoch": 0.34, + "grad_norm": 4.099408512549327, + "learning_rate": 7.73991998074749e-06, + "loss": 0.6832, + "step": 3193 + }, + { + "epoch": 0.34, + "grad_norm": 5.473717647736508, + "learning_rate": 7.738494245442415e-06, + "loss": 0.5969, + "step": 3194 + }, + { + "epoch": 0.34, + "grad_norm": 2.67068434985646, + "learning_rate": 7.737068191986995e-06, + "loss": 0.7505, + "step": 3195 + }, + { + "epoch": 0.34, + "grad_norm": 2.413731979539875, + "learning_rate": 7.735641820546906e-06, + "loss": 0.7376, + "step": 3196 + }, + { + "epoch": 0.34, + "grad_norm": 2.619766771399215, + "learning_rate": 7.73421513128786e-06, + "loss": 0.712, + "step": 3197 + }, + { + "epoch": 0.34, + "grad_norm": 2.0705943862250664, + "learning_rate": 7.7327881243756e-06, + "loss": 0.701, + "step": 3198 + }, + { + "epoch": 0.34, + "grad_norm": 2.3139029563356512, + "learning_rate": 7.731360799975916e-06, + "loss": 0.6423, + "step": 3199 + }, + { + "epoch": 0.34, + "grad_norm": 2.4451859469193082, + "learning_rate": 7.72993315825463e-06, + "loss": 0.6318, + "step": 3200 + }, + { + "epoch": 0.34, + "grad_norm": 2.2475814172050104, + "learning_rate": 7.728505199377603e-06, + "loss": 0.6631, + "step": 3201 + }, + { + "epoch": 0.34, + "grad_norm": 3.3486986779317154, + "learning_rate": 7.727076923510727e-06, + "loss": 0.7424, + "step": 3202 + }, + { + "epoch": 0.34, + "grad_norm": 2.121154876974287, + "learning_rate": 7.72564833081994e-06, + "loss": 0.6947, + "step": 3203 + }, + { + "epoch": 0.34, + "grad_norm": 3.4236551024113093, + "learning_rate": 7.724219421471206e-06, + "loss": 0.727, + "step": 3204 + }, + { + "epoch": 0.34, + "grad_norm": 2.346428595824038, + "learning_rate": 7.722790195630536e-06, + "loss": 0.6948, + "step": 3205 + }, + { + "epoch": 0.34, + "grad_norm": 3.092820968756066, + "learning_rate": 7.721360653463971e-06, + "loss": 0.6849, + "step": 3206 + }, + { + "epoch": 0.34, + "grad_norm": 2.5447700966964466, + "learning_rate": 7.719930795137592e-06, + "loss": 0.7097, + "step": 3207 + }, + { + "epoch": 0.34, + "grad_norm": 3.2830897925687825, + "learning_rate": 7.718500620817517e-06, + "loss": 0.6177, + "step": 3208 + }, + { + "epoch": 0.34, + "grad_norm": 2.8208598121038104, + "learning_rate": 7.717070130669896e-06, + "loss": 0.6102, + "step": 3209 + }, + { + "epoch": 0.34, + "grad_norm": 2.9927032819623642, + "learning_rate": 7.715639324860925e-06, + "loss": 0.6379, + "step": 3210 + }, + { + "epoch": 0.34, + "grad_norm": 2.6289931425920803, + "learning_rate": 7.714208203556825e-06, + "loss": 0.7293, + "step": 3211 + }, + { + "epoch": 0.34, + "grad_norm": 3.899374378134415, + "learning_rate": 7.712776766923862e-06, + "loss": 0.6116, + "step": 3212 + }, + { + "epoch": 0.34, + "grad_norm": 3.2414892114848803, + "learning_rate": 7.711345015128335e-06, + "loss": 0.7262, + "step": 3213 + }, + { + "epoch": 0.34, + "grad_norm": 3.45926603241094, + "learning_rate": 7.709912948336583e-06, + "loss": 0.7132, + "step": 3214 + }, + { + "epoch": 0.34, + "grad_norm": 3.182829948766781, + "learning_rate": 7.70848056671498e-06, + "loss": 0.6255, + "step": 3215 + }, + { + "epoch": 0.34, + "grad_norm": 2.7748630647111376, + "learning_rate": 7.707047870429931e-06, + "loss": 0.6655, + "step": 3216 + }, + { + "epoch": 0.34, + "grad_norm": 3.9189572306348666, + "learning_rate": 7.705614859647888e-06, + "loss": 0.6888, + "step": 3217 + }, + { + "epoch": 0.34, + "grad_norm": 1.1326017663302763, + "learning_rate": 7.704181534535332e-06, + "loss": 0.6407, + "step": 3218 + }, + { + "epoch": 0.34, + "grad_norm": 4.352263696228539, + "learning_rate": 7.70274789525878e-06, + "loss": 0.6605, + "step": 3219 + }, + { + "epoch": 0.34, + "grad_norm": 8.035499997279176, + "learning_rate": 7.701313941984791e-06, + "loss": 0.6789, + "step": 3220 + }, + { + "epoch": 0.34, + "grad_norm": 2.629717191952034, + "learning_rate": 7.699879674879958e-06, + "loss": 0.5778, + "step": 3221 + }, + { + "epoch": 0.34, + "grad_norm": 6.130587806018568, + "learning_rate": 7.698445094110909e-06, + "loss": 0.7025, + "step": 3222 + }, + { + "epoch": 0.34, + "grad_norm": 2.8760063671669753, + "learning_rate": 7.697010199844308e-06, + "loss": 0.7081, + "step": 3223 + }, + { + "epoch": 0.34, + "grad_norm": 2.7331704381503226, + "learning_rate": 7.69557499224686e-06, + "loss": 0.7452, + "step": 3224 + }, + { + "epoch": 0.34, + "grad_norm": 2.850116734761613, + "learning_rate": 7.694139471485301e-06, + "loss": 0.6846, + "step": 3225 + }, + { + "epoch": 0.34, + "grad_norm": 3.489662975427095, + "learning_rate": 7.692703637726407e-06, + "loss": 0.7061, + "step": 3226 + }, + { + "epoch": 0.34, + "grad_norm": 2.903189368179508, + "learning_rate": 7.691267491136986e-06, + "loss": 0.5947, + "step": 3227 + }, + { + "epoch": 0.34, + "grad_norm": 4.316176181783629, + "learning_rate": 7.689831031883887e-06, + "loss": 0.6344, + "step": 3228 + }, + { + "epoch": 0.34, + "grad_norm": 2.4581575403347564, + "learning_rate": 7.688394260133997e-06, + "loss": 0.6042, + "step": 3229 + }, + { + "epoch": 0.34, + "grad_norm": 3.424492880509628, + "learning_rate": 7.686957176054231e-06, + "loss": 0.6886, + "step": 3230 + }, + { + "epoch": 0.34, + "grad_norm": 2.4711470411348553, + "learning_rate": 7.68551977981155e-06, + "loss": 0.5719, + "step": 3231 + }, + { + "epoch": 0.34, + "grad_norm": 2.4487165637771358, + "learning_rate": 7.684082071572943e-06, + "loss": 0.7407, + "step": 3232 + }, + { + "epoch": 0.34, + "grad_norm": 2.8173730592149235, + "learning_rate": 7.68264405150544e-06, + "loss": 0.6361, + "step": 3233 + }, + { + "epoch": 0.34, + "grad_norm": 3.721164285756093, + "learning_rate": 7.681205719776104e-06, + "loss": 0.7631, + "step": 3234 + }, + { + "epoch": 0.34, + "grad_norm": 3.9251106459657077, + "learning_rate": 7.679767076552038e-06, + "loss": 0.6352, + "step": 3235 + }, + { + "epoch": 0.34, + "grad_norm": 1.0737715770377105, + "learning_rate": 7.678328122000382e-06, + "loss": 0.6233, + "step": 3236 + }, + { + "epoch": 0.34, + "grad_norm": 2.5541768150933235, + "learning_rate": 7.676888856288307e-06, + "loss": 0.6348, + "step": 3237 + }, + { + "epoch": 0.34, + "grad_norm": 2.580142219433405, + "learning_rate": 7.67544927958302e-06, + "loss": 0.6439, + "step": 3238 + }, + { + "epoch": 0.34, + "grad_norm": 2.654991640432859, + "learning_rate": 7.67400939205177e-06, + "loss": 0.6801, + "step": 3239 + }, + { + "epoch": 0.34, + "grad_norm": 3.6174398276239623, + "learning_rate": 7.67256919386184e-06, + "loss": 0.7203, + "step": 3240 + }, + { + "epoch": 0.34, + "grad_norm": 3.5610803553484955, + "learning_rate": 7.671128685180547e-06, + "loss": 0.6511, + "step": 3241 + }, + { + "epoch": 0.34, + "grad_norm": 2.938858056226461, + "learning_rate": 7.669687866175245e-06, + "loss": 0.6039, + "step": 3242 + }, + { + "epoch": 0.34, + "grad_norm": 2.8043292999636313, + "learning_rate": 7.668246737013323e-06, + "loss": 0.6662, + "step": 3243 + }, + { + "epoch": 0.34, + "grad_norm": 2.3949517216338907, + "learning_rate": 7.666805297862208e-06, + "loss": 0.6496, + "step": 3244 + }, + { + "epoch": 0.34, + "grad_norm": 3.1462781898491157, + "learning_rate": 7.665363548889362e-06, + "loss": 0.6178, + "step": 3245 + }, + { + "epoch": 0.34, + "grad_norm": 2.5506311534896433, + "learning_rate": 7.663921490262286e-06, + "loss": 0.6798, + "step": 3246 + }, + { + "epoch": 0.34, + "grad_norm": 9.48469185368361, + "learning_rate": 7.66247912214851e-06, + "loss": 0.623, + "step": 3247 + }, + { + "epoch": 0.34, + "grad_norm": 2.841001179637787, + "learning_rate": 7.661036444715608e-06, + "loss": 0.6621, + "step": 3248 + }, + { + "epoch": 0.34, + "grad_norm": 1.1276649407769432, + "learning_rate": 7.659593458131181e-06, + "loss": 0.6079, + "step": 3249 + }, + { + "epoch": 0.34, + "grad_norm": 2.967235234356677, + "learning_rate": 7.658150162562875e-06, + "loss": 0.6655, + "step": 3250 + }, + { + "epoch": 0.34, + "grad_norm": 2.935135934795403, + "learning_rate": 7.656706558178368e-06, + "loss": 0.6983, + "step": 3251 + }, + { + "epoch": 0.34, + "grad_norm": 2.7507617496596493, + "learning_rate": 7.655262645145374e-06, + "loss": 0.6997, + "step": 3252 + }, + { + "epoch": 0.34, + "grad_norm": 3.6113941756706422, + "learning_rate": 7.65381842363164e-06, + "loss": 0.6512, + "step": 3253 + }, + { + "epoch": 0.34, + "grad_norm": 2.9939074711670517, + "learning_rate": 7.652373893804952e-06, + "loss": 0.6817, + "step": 3254 + }, + { + "epoch": 0.34, + "grad_norm": 3.5665404033863157, + "learning_rate": 7.650929055833135e-06, + "loss": 0.6852, + "step": 3255 + }, + { + "epoch": 0.34, + "grad_norm": 2.7571299007941965, + "learning_rate": 7.64948390988404e-06, + "loss": 0.6688, + "step": 3256 + }, + { + "epoch": 0.34, + "grad_norm": 3.93253267201708, + "learning_rate": 7.648038456125566e-06, + "loss": 0.6498, + "step": 3257 + }, + { + "epoch": 0.34, + "grad_norm": 1.2564745859826627, + "learning_rate": 7.646592694725638e-06, + "loss": 0.6059, + "step": 3258 + }, + { + "epoch": 0.34, + "grad_norm": 2.460398219897918, + "learning_rate": 7.64514662585222e-06, + "loss": 0.7199, + "step": 3259 + }, + { + "epoch": 0.34, + "grad_norm": 2.095756389781973, + "learning_rate": 7.643700249673315e-06, + "loss": 0.6769, + "step": 3260 + }, + { + "epoch": 0.34, + "grad_norm": 2.6122927168443058, + "learning_rate": 7.642253566356957e-06, + "loss": 0.6627, + "step": 3261 + }, + { + "epoch": 0.34, + "grad_norm": 4.825071935901371, + "learning_rate": 7.640806576071215e-06, + "loss": 0.6343, + "step": 3262 + }, + { + "epoch": 0.34, + "grad_norm": 2.968048064848337, + "learning_rate": 7.639359278984202e-06, + "loss": 0.7232, + "step": 3263 + }, + { + "epoch": 0.34, + "grad_norm": 2.636260757493031, + "learning_rate": 7.637911675264056e-06, + "loss": 0.6242, + "step": 3264 + }, + { + "epoch": 0.34, + "grad_norm": 6.452731556132531, + "learning_rate": 7.636463765078958e-06, + "loss": 0.6242, + "step": 3265 + }, + { + "epoch": 0.34, + "grad_norm": 1.0833107459100242, + "learning_rate": 7.63501554859712e-06, + "loss": 0.5713, + "step": 3266 + }, + { + "epoch": 0.34, + "grad_norm": 3.2859379838193097, + "learning_rate": 7.633567025986795e-06, + "loss": 0.7321, + "step": 3267 + }, + { + "epoch": 0.34, + "grad_norm": 2.9027700596440162, + "learning_rate": 7.632118197416263e-06, + "loss": 0.574, + "step": 3268 + }, + { + "epoch": 0.34, + "grad_norm": 2.7949610995170224, + "learning_rate": 7.630669063053849e-06, + "loss": 0.6283, + "step": 3269 + }, + { + "epoch": 0.34, + "grad_norm": 2.724700144475404, + "learning_rate": 7.629219623067907e-06, + "loss": 0.6921, + "step": 3270 + }, + { + "epoch": 0.34, + "grad_norm": 3.384530330225657, + "learning_rate": 7.62776987762683e-06, + "loss": 0.6618, + "step": 3271 + }, + { + "epoch": 0.34, + "grad_norm": 2.591079385525758, + "learning_rate": 7.626319826899045e-06, + "loss": 0.6482, + "step": 3272 + }, + { + "epoch": 0.34, + "grad_norm": 3.314517492095959, + "learning_rate": 7.624869471053014e-06, + "loss": 0.6968, + "step": 3273 + }, + { + "epoch": 0.34, + "grad_norm": 2.71453402590448, + "learning_rate": 7.623418810257234e-06, + "loss": 0.6831, + "step": 3274 + }, + { + "epoch": 0.34, + "grad_norm": 3.710436771293437, + "learning_rate": 7.621967844680241e-06, + "loss": 0.6333, + "step": 3275 + }, + { + "epoch": 0.34, + "grad_norm": 3.8259084062940265, + "learning_rate": 7.620516574490604e-06, + "loss": 0.6768, + "step": 3276 + }, + { + "epoch": 0.34, + "grad_norm": 2.291949490786709, + "learning_rate": 7.6190649998569265e-06, + "loss": 0.6547, + "step": 3277 + }, + { + "epoch": 0.34, + "grad_norm": 2.7125389974746223, + "learning_rate": 7.617613120947848e-06, + "loss": 0.5768, + "step": 3278 + }, + { + "epoch": 0.35, + "grad_norm": 9.167970886019283, + "learning_rate": 7.616160937932045e-06, + "loss": 0.7309, + "step": 3279 + }, + { + "epoch": 0.35, + "grad_norm": 4.549733398070985, + "learning_rate": 7.614708450978226e-06, + "loss": 0.7135, + "step": 3280 + }, + { + "epoch": 0.35, + "grad_norm": 2.9374280731445555, + "learning_rate": 7.613255660255137e-06, + "loss": 0.7211, + "step": 3281 + }, + { + "epoch": 0.35, + "grad_norm": 2.870278169530705, + "learning_rate": 7.611802565931559e-06, + "loss": 0.6897, + "step": 3282 + }, + { + "epoch": 0.35, + "grad_norm": 1.2543736203150375, + "learning_rate": 7.610349168176309e-06, + "loss": 0.6159, + "step": 3283 + }, + { + "epoch": 0.35, + "grad_norm": 2.738519475571267, + "learning_rate": 7.608895467158241e-06, + "loss": 0.7084, + "step": 3284 + }, + { + "epoch": 0.35, + "grad_norm": 4.479720977262225, + "learning_rate": 7.607441463046236e-06, + "loss": 0.7224, + "step": 3285 + }, + { + "epoch": 0.35, + "grad_norm": 3.4942767891910593, + "learning_rate": 7.60598715600922e-06, + "loss": 0.6773, + "step": 3286 + }, + { + "epoch": 0.35, + "grad_norm": 4.350906281284041, + "learning_rate": 7.60453254621615e-06, + "loss": 0.6071, + "step": 3287 + }, + { + "epoch": 0.35, + "grad_norm": 3.8683834683641827, + "learning_rate": 7.603077633836018e-06, + "loss": 0.6792, + "step": 3288 + }, + { + "epoch": 0.35, + "grad_norm": 2.7330042305762703, + "learning_rate": 7.601622419037851e-06, + "loss": 0.5959, + "step": 3289 + }, + { + "epoch": 0.35, + "grad_norm": 3.4753065005990313, + "learning_rate": 7.600166901990711e-06, + "loss": 0.6422, + "step": 3290 + }, + { + "epoch": 0.35, + "grad_norm": 2.6481317364356585, + "learning_rate": 7.5987110828636966e-06, + "loss": 0.7076, + "step": 3291 + }, + { + "epoch": 0.35, + "grad_norm": 4.978965610289502, + "learning_rate": 7.5972549618259415e-06, + "loss": 0.6281, + "step": 3292 + }, + { + "epoch": 0.35, + "grad_norm": 3.1836343422551043, + "learning_rate": 7.595798539046612e-06, + "loss": 0.6259, + "step": 3293 + }, + { + "epoch": 0.35, + "grad_norm": 2.6636282179606776, + "learning_rate": 7.594341814694914e-06, + "loss": 0.5633, + "step": 3294 + }, + { + "epoch": 0.35, + "grad_norm": 2.6492054525812354, + "learning_rate": 7.592884788940082e-06, + "loss": 0.7039, + "step": 3295 + }, + { + "epoch": 0.35, + "grad_norm": 2.6199101782391185, + "learning_rate": 7.59142746195139e-06, + "loss": 0.6398, + "step": 3296 + }, + { + "epoch": 0.35, + "grad_norm": 2.4865546985556777, + "learning_rate": 7.5899698338981475e-06, + "loss": 0.625, + "step": 3297 + }, + { + "epoch": 0.35, + "grad_norm": 2.3823056427702305, + "learning_rate": 7.588511904949696e-06, + "loss": 0.7082, + "step": 3298 + }, + { + "epoch": 0.35, + "grad_norm": 2.653522038656345, + "learning_rate": 7.587053675275413e-06, + "loss": 0.7159, + "step": 3299 + }, + { + "epoch": 0.35, + "grad_norm": 3.007493250962484, + "learning_rate": 7.585595145044714e-06, + "loss": 0.6847, + "step": 3300 + }, + { + "epoch": 0.35, + "grad_norm": 3.9762142271332412, + "learning_rate": 7.5841363144270445e-06, + "loss": 0.6299, + "step": 3301 + }, + { + "epoch": 0.35, + "grad_norm": 4.081453490480924, + "learning_rate": 7.582677183591889e-06, + "loss": 0.6112, + "step": 3302 + }, + { + "epoch": 0.35, + "grad_norm": 3.0011529464421827, + "learning_rate": 7.581217752708763e-06, + "loss": 0.6528, + "step": 3303 + }, + { + "epoch": 0.35, + "grad_norm": 3.8502588551289163, + "learning_rate": 7.579758021947221e-06, + "loss": 0.7399, + "step": 3304 + }, + { + "epoch": 0.35, + "grad_norm": 3.6979556515844454, + "learning_rate": 7.578297991476848e-06, + "loss": 0.7052, + "step": 3305 + }, + { + "epoch": 0.35, + "grad_norm": 2.409793154362905, + "learning_rate": 7.576837661467269e-06, + "loss": 0.5654, + "step": 3306 + }, + { + "epoch": 0.35, + "grad_norm": 3.0297576136517264, + "learning_rate": 7.575377032088138e-06, + "loss": 0.5874, + "step": 3307 + }, + { + "epoch": 0.35, + "grad_norm": 2.335566510834448, + "learning_rate": 7.573916103509149e-06, + "loss": 0.6567, + "step": 3308 + }, + { + "epoch": 0.35, + "grad_norm": 2.8306846989581675, + "learning_rate": 7.572454875900026e-06, + "loss": 0.6771, + "step": 3309 + }, + { + "epoch": 0.35, + "grad_norm": 2.914166159192387, + "learning_rate": 7.570993349430533e-06, + "loss": 0.6979, + "step": 3310 + }, + { + "epoch": 0.35, + "grad_norm": 3.172792917556958, + "learning_rate": 7.569531524270465e-06, + "loss": 0.6625, + "step": 3311 + }, + { + "epoch": 0.35, + "grad_norm": 5.322440067377819, + "learning_rate": 7.568069400589651e-06, + "loss": 0.6143, + "step": 3312 + }, + { + "epoch": 0.35, + "grad_norm": 2.7366214062810466, + "learning_rate": 7.566606978557959e-06, + "loss": 0.6719, + "step": 3313 + }, + { + "epoch": 0.35, + "grad_norm": 3.6661060876658205, + "learning_rate": 7.565144258345287e-06, + "loss": 0.7597, + "step": 3314 + }, + { + "epoch": 0.35, + "grad_norm": 2.841666028729839, + "learning_rate": 7.563681240121569e-06, + "loss": 0.6452, + "step": 3315 + }, + { + "epoch": 0.35, + "grad_norm": 2.9148308706374637, + "learning_rate": 7.562217924056777e-06, + "loss": 0.7037, + "step": 3316 + }, + { + "epoch": 0.35, + "grad_norm": 2.493732792998696, + "learning_rate": 7.560754310320912e-06, + "loss": 0.665, + "step": 3317 + }, + { + "epoch": 0.35, + "grad_norm": 2.3318417413774633, + "learning_rate": 7.559290399084016e-06, + "loss": 0.6403, + "step": 3318 + }, + { + "epoch": 0.35, + "grad_norm": 2.400731362221395, + "learning_rate": 7.5578261905161575e-06, + "loss": 0.6265, + "step": 3319 + }, + { + "epoch": 0.35, + "grad_norm": 4.040790993560575, + "learning_rate": 7.556361684787446e-06, + "loss": 0.6665, + "step": 3320 + }, + { + "epoch": 0.35, + "grad_norm": 2.2499211076771033, + "learning_rate": 7.554896882068025e-06, + "loss": 0.594, + "step": 3321 + }, + { + "epoch": 0.35, + "grad_norm": 2.357387837819163, + "learning_rate": 7.5534317825280664e-06, + "loss": 0.6814, + "step": 3322 + }, + { + "epoch": 0.35, + "grad_norm": 6.712351178001371, + "learning_rate": 7.551966386337788e-06, + "loss": 0.7282, + "step": 3323 + }, + { + "epoch": 0.35, + "grad_norm": 2.4161878209245478, + "learning_rate": 7.5505006936674304e-06, + "loss": 0.6134, + "step": 3324 + }, + { + "epoch": 0.35, + "grad_norm": 2.640259293020909, + "learning_rate": 7.5490347046872755e-06, + "loss": 0.7243, + "step": 3325 + }, + { + "epoch": 0.35, + "grad_norm": 3.240206873144137, + "learning_rate": 7.547568419567637e-06, + "loss": 0.6741, + "step": 3326 + }, + { + "epoch": 0.35, + "grad_norm": 3.394746942635679, + "learning_rate": 7.546101838478864e-06, + "loss": 0.6578, + "step": 3327 + }, + { + "epoch": 0.35, + "grad_norm": 3.144423846324425, + "learning_rate": 7.54463496159134e-06, + "loss": 0.6511, + "step": 3328 + }, + { + "epoch": 0.35, + "grad_norm": 2.6453748407725106, + "learning_rate": 7.543167789075481e-06, + "loss": 0.6275, + "step": 3329 + }, + { + "epoch": 0.35, + "grad_norm": 2.3080573199067187, + "learning_rate": 7.54170032110174e-06, + "loss": 0.6676, + "step": 3330 + }, + { + "epoch": 0.35, + "grad_norm": 2.714660296548131, + "learning_rate": 7.540232557840604e-06, + "loss": 0.611, + "step": 3331 + }, + { + "epoch": 0.35, + "grad_norm": 4.047279583079742, + "learning_rate": 7.53876449946259e-06, + "loss": 0.6867, + "step": 3332 + }, + { + "epoch": 0.35, + "grad_norm": 3.8795995067294924, + "learning_rate": 7.537296146138255e-06, + "loss": 0.5692, + "step": 3333 + }, + { + "epoch": 0.35, + "grad_norm": 2.8734422567926003, + "learning_rate": 7.535827498038192e-06, + "loss": 0.7021, + "step": 3334 + }, + { + "epoch": 0.35, + "grad_norm": 5.854617594622321, + "learning_rate": 7.534358555333018e-06, + "loss": 0.6198, + "step": 3335 + }, + { + "epoch": 0.35, + "grad_norm": 5.60018430557542, + "learning_rate": 7.532889318193393e-06, + "loss": 0.6089, + "step": 3336 + }, + { + "epoch": 0.35, + "grad_norm": 6.2343822540898515, + "learning_rate": 7.531419786790011e-06, + "loss": 0.7023, + "step": 3337 + }, + { + "epoch": 0.35, + "grad_norm": 3.095566888586303, + "learning_rate": 7.5299499612935934e-06, + "loss": 0.6577, + "step": 3338 + }, + { + "epoch": 0.35, + "grad_norm": 3.136010955006197, + "learning_rate": 7.528479841874904e-06, + "loss": 0.6715, + "step": 3339 + }, + { + "epoch": 0.35, + "grad_norm": 2.077841229423998, + "learning_rate": 7.527009428704735e-06, + "loss": 0.683, + "step": 3340 + }, + { + "epoch": 0.35, + "grad_norm": 2.840010075056747, + "learning_rate": 7.525538721953915e-06, + "loss": 0.7238, + "step": 3341 + }, + { + "epoch": 0.35, + "grad_norm": 1.1576704441730468, + "learning_rate": 7.524067721793309e-06, + "loss": 0.6221, + "step": 3342 + }, + { + "epoch": 0.35, + "grad_norm": 2.6260852715338414, + "learning_rate": 7.522596428393809e-06, + "loss": 0.6732, + "step": 3343 + }, + { + "epoch": 0.35, + "grad_norm": 3.35119357031428, + "learning_rate": 7.521124841926348e-06, + "loss": 0.6466, + "step": 3344 + }, + { + "epoch": 0.35, + "grad_norm": 2.410888681828178, + "learning_rate": 7.519652962561894e-06, + "loss": 0.7642, + "step": 3345 + }, + { + "epoch": 0.35, + "grad_norm": 3.7276855469070074, + "learning_rate": 7.5181807904714385e-06, + "loss": 0.6249, + "step": 3346 + }, + { + "epoch": 0.35, + "grad_norm": 3.415586524281074, + "learning_rate": 7.516708325826021e-06, + "loss": 0.5827, + "step": 3347 + }, + { + "epoch": 0.35, + "grad_norm": 2.8263139482195503, + "learning_rate": 7.515235568796704e-06, + "loss": 0.662, + "step": 3348 + }, + { + "epoch": 0.35, + "grad_norm": 2.664326847472684, + "learning_rate": 7.513762519554588e-06, + "loss": 0.5015, + "step": 3349 + }, + { + "epoch": 0.35, + "grad_norm": 2.618966692292694, + "learning_rate": 7.51228917827081e-06, + "loss": 0.6738, + "step": 3350 + }, + { + "epoch": 0.35, + "grad_norm": 3.633158245759139, + "learning_rate": 7.510815545116539e-06, + "loss": 0.6204, + "step": 3351 + }, + { + "epoch": 0.35, + "grad_norm": 2.4185537158238395, + "learning_rate": 7.509341620262976e-06, + "loss": 0.5918, + "step": 3352 + }, + { + "epoch": 0.35, + "grad_norm": 2.4156711101816497, + "learning_rate": 7.507867403881356e-06, + "loss": 0.7105, + "step": 3353 + }, + { + "epoch": 0.35, + "grad_norm": 8.148153998559598, + "learning_rate": 7.506392896142951e-06, + "loss": 0.6219, + "step": 3354 + }, + { + "epoch": 0.35, + "grad_norm": 3.6450108914306396, + "learning_rate": 7.5049180972190646e-06, + "loss": 0.6179, + "step": 3355 + }, + { + "epoch": 0.35, + "grad_norm": 2.794686070537919, + "learning_rate": 7.503443007281035e-06, + "loss": 0.662, + "step": 3356 + }, + { + "epoch": 0.35, + "grad_norm": 3.1106881667918755, + "learning_rate": 7.501967626500231e-06, + "loss": 0.695, + "step": 3357 + }, + { + "epoch": 0.35, + "grad_norm": 3.700458795600681, + "learning_rate": 7.500491955048063e-06, + "loss": 0.6794, + "step": 3358 + }, + { + "epoch": 0.35, + "grad_norm": 2.419751161900197, + "learning_rate": 7.499015993095968e-06, + "loss": 0.6671, + "step": 3359 + }, + { + "epoch": 0.35, + "grad_norm": 2.8449869838184343, + "learning_rate": 7.497539740815419e-06, + "loss": 0.7413, + "step": 3360 + }, + { + "epoch": 0.35, + "grad_norm": 3.4184394602863226, + "learning_rate": 7.4960631983779205e-06, + "loss": 0.6781, + "step": 3361 + }, + { + "epoch": 0.35, + "grad_norm": 2.796963292810003, + "learning_rate": 7.494586365955017e-06, + "loss": 0.667, + "step": 3362 + }, + { + "epoch": 0.35, + "grad_norm": 3.0483662745104665, + "learning_rate": 7.49310924371828e-06, + "loss": 0.715, + "step": 3363 + }, + { + "epoch": 0.35, + "grad_norm": 3.5907131005662767, + "learning_rate": 7.491631831839318e-06, + "loss": 0.7097, + "step": 3364 + }, + { + "epoch": 0.35, + "grad_norm": 3.175466104453125, + "learning_rate": 7.490154130489773e-06, + "loss": 0.6217, + "step": 3365 + }, + { + "epoch": 0.35, + "grad_norm": 3.423899220213644, + "learning_rate": 7.488676139841318e-06, + "loss": 0.6247, + "step": 3366 + }, + { + "epoch": 0.35, + "grad_norm": 2.8572229700965646, + "learning_rate": 7.487197860065664e-06, + "loss": 0.613, + "step": 3367 + }, + { + "epoch": 0.35, + "grad_norm": 2.8587163319916042, + "learning_rate": 7.485719291334551e-06, + "loss": 0.6576, + "step": 3368 + }, + { + "epoch": 0.35, + "grad_norm": 2.3991473483168626, + "learning_rate": 7.484240433819758e-06, + "loss": 0.66, + "step": 3369 + }, + { + "epoch": 0.35, + "grad_norm": 5.965531907514714, + "learning_rate": 7.482761287693092e-06, + "loss": 0.6466, + "step": 3370 + }, + { + "epoch": 0.35, + "grad_norm": 3.048642664328739, + "learning_rate": 7.481281853126397e-06, + "loss": 0.715, + "step": 3371 + }, + { + "epoch": 0.35, + "grad_norm": 2.6309407059230296, + "learning_rate": 7.479802130291548e-06, + "loss": 0.6181, + "step": 3372 + }, + { + "epoch": 0.35, + "grad_norm": 3.222323555003956, + "learning_rate": 7.478322119360457e-06, + "loss": 0.7331, + "step": 3373 + }, + { + "epoch": 0.36, + "grad_norm": 2.2573259131196384, + "learning_rate": 7.476841820505065e-06, + "loss": 0.6684, + "step": 3374 + }, + { + "epoch": 0.36, + "grad_norm": 3.072317633131639, + "learning_rate": 7.475361233897352e-06, + "loss": 0.6969, + "step": 3375 + }, + { + "epoch": 0.36, + "grad_norm": 3.046601351124791, + "learning_rate": 7.473880359709324e-06, + "loss": 0.6585, + "step": 3376 + }, + { + "epoch": 0.36, + "grad_norm": 2.4729492389743917, + "learning_rate": 7.472399198113029e-06, + "loss": 0.677, + "step": 3377 + }, + { + "epoch": 0.36, + "grad_norm": 3.2903365979880492, + "learning_rate": 7.4709177492805405e-06, + "loss": 0.6252, + "step": 3378 + }, + { + "epoch": 0.36, + "grad_norm": 2.9983558599680125, + "learning_rate": 7.46943601338397e-06, + "loss": 0.7068, + "step": 3379 + }, + { + "epoch": 0.36, + "grad_norm": 3.0004226327693235, + "learning_rate": 7.4679539905954655e-06, + "loss": 0.6208, + "step": 3380 + }, + { + "epoch": 0.36, + "grad_norm": 2.898934649269225, + "learning_rate": 7.4664716810871975e-06, + "loss": 0.6644, + "step": 3381 + }, + { + "epoch": 0.36, + "grad_norm": 3.194372324636005, + "learning_rate": 7.464989085031381e-06, + "loss": 0.6673, + "step": 3382 + }, + { + "epoch": 0.36, + "grad_norm": 1.1182459004852183, + "learning_rate": 7.463506202600257e-06, + "loss": 0.5741, + "step": 3383 + }, + { + "epoch": 0.36, + "grad_norm": 4.36761957421957, + "learning_rate": 7.462023033966104e-06, + "loss": 0.6101, + "step": 3384 + }, + { + "epoch": 0.36, + "grad_norm": 2.2861432544796156, + "learning_rate": 7.4605395793012325e-06, + "loss": 0.6921, + "step": 3385 + }, + { + "epoch": 0.36, + "grad_norm": 2.7603235128046846, + "learning_rate": 7.459055838777984e-06, + "loss": 0.6734, + "step": 3386 + }, + { + "epoch": 0.36, + "grad_norm": 3.9793695446983532, + "learning_rate": 7.457571812568738e-06, + "loss": 0.6595, + "step": 3387 + }, + { + "epoch": 0.36, + "grad_norm": 4.311706185746525, + "learning_rate": 7.4560875008459035e-06, + "loss": 0.7186, + "step": 3388 + }, + { + "epoch": 0.36, + "grad_norm": 2.597103883565191, + "learning_rate": 7.454602903781921e-06, + "loss": 0.7305, + "step": 3389 + }, + { + "epoch": 0.36, + "grad_norm": 2.501333529220921, + "learning_rate": 7.45311802154927e-06, + "loss": 0.6342, + "step": 3390 + }, + { + "epoch": 0.36, + "grad_norm": 2.5800190663717655, + "learning_rate": 7.451632854320459e-06, + "loss": 0.568, + "step": 3391 + }, + { + "epoch": 0.36, + "grad_norm": 2.3701799197834, + "learning_rate": 7.4501474022680265e-06, + "loss": 0.6836, + "step": 3392 + }, + { + "epoch": 0.36, + "grad_norm": 2.3772231724274557, + "learning_rate": 7.4486616655645565e-06, + "loss": 0.619, + "step": 3393 + }, + { + "epoch": 0.36, + "grad_norm": 2.8160484269096595, + "learning_rate": 7.447175644382648e-06, + "loss": 0.7025, + "step": 3394 + }, + { + "epoch": 0.36, + "grad_norm": 3.7832742183500967, + "learning_rate": 7.445689338894949e-06, + "loss": 0.7469, + "step": 3395 + }, + { + "epoch": 0.36, + "grad_norm": 5.0059621903562, + "learning_rate": 7.444202749274133e-06, + "loss": 0.7397, + "step": 3396 + }, + { + "epoch": 0.36, + "grad_norm": 3.892558568857166, + "learning_rate": 7.442715875692908e-06, + "loss": 0.6436, + "step": 3397 + }, + { + "epoch": 0.36, + "grad_norm": 2.8280194358791824, + "learning_rate": 7.4412287183240115e-06, + "loss": 0.6798, + "step": 3398 + }, + { + "epoch": 0.36, + "grad_norm": 2.837303017856792, + "learning_rate": 7.43974127734022e-06, + "loss": 0.6466, + "step": 3399 + }, + { + "epoch": 0.36, + "grad_norm": 2.734694065110121, + "learning_rate": 7.4382535529143395e-06, + "loss": 0.6031, + "step": 3400 + }, + { + "epoch": 0.36, + "grad_norm": 3.959288671850776, + "learning_rate": 7.43676554521921e-06, + "loss": 0.6906, + "step": 3401 + }, + { + "epoch": 0.36, + "grad_norm": 6.198201079133412, + "learning_rate": 7.435277254427704e-06, + "loss": 0.6888, + "step": 3402 + }, + { + "epoch": 0.36, + "grad_norm": 4.386419840965464, + "learning_rate": 7.4337886807127235e-06, + "loss": 0.6252, + "step": 3403 + }, + { + "epoch": 0.36, + "grad_norm": 4.1534089316760054, + "learning_rate": 7.4322998242472135e-06, + "loss": 0.6707, + "step": 3404 + }, + { + "epoch": 0.36, + "grad_norm": 2.7988368901968124, + "learning_rate": 7.430810685204137e-06, + "loss": 0.6584, + "step": 3405 + }, + { + "epoch": 0.36, + "grad_norm": 3.919583307019518, + "learning_rate": 7.4293212637565045e-06, + "loss": 0.6966, + "step": 3406 + }, + { + "epoch": 0.36, + "grad_norm": 2.508307648411482, + "learning_rate": 7.427831560077349e-06, + "loss": 0.6444, + "step": 3407 + }, + { + "epoch": 0.36, + "grad_norm": 1.3042696713402557, + "learning_rate": 7.426341574339741e-06, + "loss": 0.5891, + "step": 3408 + }, + { + "epoch": 0.36, + "grad_norm": 3.582107451003599, + "learning_rate": 7.424851306716783e-06, + "loss": 0.6359, + "step": 3409 + }, + { + "epoch": 0.36, + "grad_norm": 2.904925684194061, + "learning_rate": 7.42336075738161e-06, + "loss": 0.6671, + "step": 3410 + }, + { + "epoch": 0.36, + "grad_norm": 3.5167258077031267, + "learning_rate": 7.421869926507389e-06, + "loss": 0.7021, + "step": 3411 + }, + { + "epoch": 0.36, + "grad_norm": 4.358687817951523, + "learning_rate": 7.420378814267322e-06, + "loss": 0.7419, + "step": 3412 + }, + { + "epoch": 0.36, + "grad_norm": 3.152148471276924, + "learning_rate": 7.41888742083464e-06, + "loss": 0.6744, + "step": 3413 + }, + { + "epoch": 0.36, + "grad_norm": 2.3915603282771976, + "learning_rate": 7.417395746382608e-06, + "loss": 0.6771, + "step": 3414 + }, + { + "epoch": 0.36, + "grad_norm": 3.7148070409783536, + "learning_rate": 7.415903791084529e-06, + "loss": 0.7244, + "step": 3415 + }, + { + "epoch": 0.36, + "grad_norm": 3.752027193158189, + "learning_rate": 7.41441155511373e-06, + "loss": 0.6177, + "step": 3416 + }, + { + "epoch": 0.36, + "grad_norm": 2.62863853424837, + "learning_rate": 7.412919038643577e-06, + "loss": 0.6457, + "step": 3417 + }, + { + "epoch": 0.36, + "grad_norm": 2.5794611356979993, + "learning_rate": 7.411426241847463e-06, + "loss": 0.6303, + "step": 3418 + }, + { + "epoch": 0.36, + "grad_norm": 3.9676318300037527, + "learning_rate": 7.409933164898819e-06, + "loss": 0.6817, + "step": 3419 + }, + { + "epoch": 0.36, + "grad_norm": 2.8550149054743, + "learning_rate": 7.408439807971108e-06, + "loss": 0.654, + "step": 3420 + }, + { + "epoch": 0.36, + "grad_norm": 5.195120384669288, + "learning_rate": 7.406946171237822e-06, + "loss": 0.579, + "step": 3421 + }, + { + "epoch": 0.36, + "grad_norm": 3.0078016488240107, + "learning_rate": 7.4054522548724874e-06, + "loss": 0.6311, + "step": 3422 + }, + { + "epoch": 0.36, + "grad_norm": 2.4860091683873784, + "learning_rate": 7.403958059048662e-06, + "loss": 0.6168, + "step": 3423 + }, + { + "epoch": 0.36, + "grad_norm": 2.570809434838443, + "learning_rate": 7.40246358393994e-06, + "loss": 0.591, + "step": 3424 + }, + { + "epoch": 0.36, + "grad_norm": 3.7959371110166256, + "learning_rate": 7.4009688297199436e-06, + "loss": 0.7263, + "step": 3425 + }, + { + "epoch": 0.36, + "grad_norm": 2.774483241184963, + "learning_rate": 7.3994737965623285e-06, + "loss": 0.6566, + "step": 3426 + }, + { + "epoch": 0.36, + "grad_norm": 2.455385395465524, + "learning_rate": 7.397978484640783e-06, + "loss": 0.5688, + "step": 3427 + }, + { + "epoch": 0.36, + "grad_norm": 2.422389158395626, + "learning_rate": 7.396482894129031e-06, + "loss": 0.6501, + "step": 3428 + }, + { + "epoch": 0.36, + "grad_norm": 2.9579176252075983, + "learning_rate": 7.3949870252008215e-06, + "loss": 0.5639, + "step": 3429 + }, + { + "epoch": 0.36, + "grad_norm": 3.2151993534590684, + "learning_rate": 7.393490878029945e-06, + "loss": 0.6368, + "step": 3430 + }, + { + "epoch": 0.36, + "grad_norm": 2.4041060387118276, + "learning_rate": 7.391994452790217e-06, + "loss": 0.7303, + "step": 3431 + }, + { + "epoch": 0.36, + "grad_norm": 2.7465187882412563, + "learning_rate": 7.390497749655487e-06, + "loss": 0.6067, + "step": 3432 + }, + { + "epoch": 0.36, + "grad_norm": 2.714071334159705, + "learning_rate": 7.389000768799638e-06, + "loss": 0.6604, + "step": 3433 + }, + { + "epoch": 0.36, + "grad_norm": 2.9662799463771505, + "learning_rate": 7.387503510396586e-06, + "loss": 0.6275, + "step": 3434 + }, + { + "epoch": 0.36, + "grad_norm": 2.3746212916638507, + "learning_rate": 7.386005974620278e-06, + "loss": 0.7533, + "step": 3435 + }, + { + "epoch": 0.36, + "grad_norm": 3.6759513228513154, + "learning_rate": 7.384508161644694e-06, + "loss": 0.6726, + "step": 3436 + }, + { + "epoch": 0.36, + "grad_norm": 4.2759734265835565, + "learning_rate": 7.383010071643844e-06, + "loss": 0.6219, + "step": 3437 + }, + { + "epoch": 0.36, + "grad_norm": 1.0676737541014298, + "learning_rate": 7.381511704791771e-06, + "loss": 0.5884, + "step": 3438 + }, + { + "epoch": 0.36, + "grad_norm": 3.292189795768927, + "learning_rate": 7.380013061262557e-06, + "loss": 0.5909, + "step": 3439 + }, + { + "epoch": 0.36, + "grad_norm": 3.097298037082653, + "learning_rate": 7.3785141412303e-06, + "loss": 0.6163, + "step": 3440 + }, + { + "epoch": 0.36, + "grad_norm": 2.816854958679348, + "learning_rate": 7.37701494486915e-06, + "loss": 0.6845, + "step": 3441 + }, + { + "epoch": 0.36, + "grad_norm": 4.788505661813447, + "learning_rate": 7.375515472353272e-06, + "loss": 0.6201, + "step": 3442 + }, + { + "epoch": 0.36, + "grad_norm": 4.129127828825236, + "learning_rate": 7.374015723856873e-06, + "loss": 0.6793, + "step": 3443 + }, + { + "epoch": 0.36, + "grad_norm": 2.3574849469565544, + "learning_rate": 7.372515699554191e-06, + "loss": 0.6221, + "step": 3444 + }, + { + "epoch": 0.36, + "grad_norm": 3.3812374504332805, + "learning_rate": 7.371015399619494e-06, + "loss": 0.6603, + "step": 3445 + }, + { + "epoch": 0.36, + "grad_norm": 3.0367925120459303, + "learning_rate": 7.369514824227082e-06, + "loss": 0.6616, + "step": 3446 + }, + { + "epoch": 0.36, + "grad_norm": 2.6543688097654665, + "learning_rate": 7.368013973551286e-06, + "loss": 0.7013, + "step": 3447 + }, + { + "epoch": 0.36, + "grad_norm": 2.1635818274199434, + "learning_rate": 7.366512847766472e-06, + "loss": 0.6885, + "step": 3448 + }, + { + "epoch": 0.36, + "grad_norm": 3.085467390645828, + "learning_rate": 7.365011447047036e-06, + "loss": 0.7165, + "step": 3449 + }, + { + "epoch": 0.36, + "grad_norm": 2.283937568641037, + "learning_rate": 7.363509771567408e-06, + "loss": 0.6338, + "step": 3450 + }, + { + "epoch": 0.36, + "grad_norm": 3.1768009624928166, + "learning_rate": 7.362007821502045e-06, + "loss": 0.747, + "step": 3451 + }, + { + "epoch": 0.36, + "grad_norm": 3.0324905424749597, + "learning_rate": 7.360505597025442e-06, + "loss": 0.685, + "step": 3452 + }, + { + "epoch": 0.36, + "grad_norm": 2.983148422243415, + "learning_rate": 7.359003098312123e-06, + "loss": 0.6152, + "step": 3453 + }, + { + "epoch": 0.36, + "grad_norm": 2.9793497750002658, + "learning_rate": 7.357500325536644e-06, + "loss": 0.7456, + "step": 3454 + }, + { + "epoch": 0.36, + "grad_norm": 2.941874609510829, + "learning_rate": 7.355997278873589e-06, + "loss": 0.6748, + "step": 3455 + }, + { + "epoch": 0.36, + "grad_norm": 2.1486776506884153, + "learning_rate": 7.354493958497583e-06, + "loss": 0.661, + "step": 3456 + }, + { + "epoch": 0.36, + "grad_norm": 1.0170095119117828, + "learning_rate": 7.3529903645832744e-06, + "loss": 0.5884, + "step": 3457 + }, + { + "epoch": 0.36, + "grad_norm": 2.4036589735821483, + "learning_rate": 7.351486497305347e-06, + "loss": 0.5853, + "step": 3458 + }, + { + "epoch": 0.36, + "grad_norm": 3.310188596068514, + "learning_rate": 7.349982356838515e-06, + "loss": 0.7171, + "step": 3459 + }, + { + "epoch": 0.36, + "grad_norm": 2.942433337193868, + "learning_rate": 7.348477943357527e-06, + "loss": 0.6472, + "step": 3460 + }, + { + "epoch": 0.36, + "grad_norm": 4.969354980228407, + "learning_rate": 7.34697325703716e-06, + "loss": 0.6456, + "step": 3461 + }, + { + "epoch": 0.36, + "grad_norm": 3.8519505648735737, + "learning_rate": 7.345468298052224e-06, + "loss": 0.6262, + "step": 3462 + }, + { + "epoch": 0.36, + "grad_norm": 3.685187241926782, + "learning_rate": 7.343963066577563e-06, + "loss": 0.6571, + "step": 3463 + }, + { + "epoch": 0.36, + "grad_norm": 3.437533087132148, + "learning_rate": 7.342457562788046e-06, + "loss": 0.6799, + "step": 3464 + }, + { + "epoch": 0.36, + "grad_norm": 4.281370556976629, + "learning_rate": 7.340951786858583e-06, + "loss": 0.688, + "step": 3465 + }, + { + "epoch": 0.36, + "grad_norm": 3.1063629908728014, + "learning_rate": 7.339445738964106e-06, + "loss": 0.7737, + "step": 3466 + }, + { + "epoch": 0.36, + "grad_norm": 2.0889025130238092, + "learning_rate": 7.337939419279588e-06, + "loss": 0.6248, + "step": 3467 + }, + { + "epoch": 0.36, + "grad_norm": 2.815104069216375, + "learning_rate": 7.336432827980026e-06, + "loss": 0.6681, + "step": 3468 + }, + { + "epoch": 0.37, + "grad_norm": 4.1118170144076265, + "learning_rate": 7.334925965240451e-06, + "loss": 0.6273, + "step": 3469 + }, + { + "epoch": 0.37, + "grad_norm": 3.2170988071407147, + "learning_rate": 7.333418831235928e-06, + "loss": 0.6412, + "step": 3470 + }, + { + "epoch": 0.37, + "grad_norm": 3.4121748554575984, + "learning_rate": 7.33191142614155e-06, + "loss": 0.698, + "step": 3471 + }, + { + "epoch": 0.37, + "grad_norm": 3.0702451497078247, + "learning_rate": 7.330403750132443e-06, + "loss": 0.5974, + "step": 3472 + }, + { + "epoch": 0.37, + "grad_norm": 2.896556923363, + "learning_rate": 7.328895803383764e-06, + "loss": 0.6431, + "step": 3473 + }, + { + "epoch": 0.37, + "grad_norm": 2.849283645322886, + "learning_rate": 7.327387586070705e-06, + "loss": 0.6288, + "step": 3474 + }, + { + "epoch": 0.37, + "grad_norm": 3.6979247535807063, + "learning_rate": 7.325879098368483e-06, + "loss": 0.6332, + "step": 3475 + }, + { + "epoch": 0.37, + "grad_norm": 2.7475025526366377, + "learning_rate": 7.324370340452351e-06, + "loss": 0.6657, + "step": 3476 + }, + { + "epoch": 0.37, + "grad_norm": 2.9659631728372777, + "learning_rate": 7.322861312497591e-06, + "loss": 0.6951, + "step": 3477 + }, + { + "epoch": 0.37, + "grad_norm": 3.1844754621892837, + "learning_rate": 7.321352014679522e-06, + "loss": 0.5929, + "step": 3478 + }, + { + "epoch": 0.37, + "grad_norm": 2.974837282969043, + "learning_rate": 7.319842447173482e-06, + "loss": 0.5501, + "step": 3479 + }, + { + "epoch": 0.37, + "grad_norm": 2.892368554345106, + "learning_rate": 7.318332610154854e-06, + "loss": 0.7127, + "step": 3480 + }, + { + "epoch": 0.37, + "grad_norm": 2.715197258419502, + "learning_rate": 7.3168225037990434e-06, + "loss": 0.6408, + "step": 3481 + }, + { + "epoch": 0.37, + "grad_norm": 2.5710173525042554, + "learning_rate": 7.315312128281493e-06, + "loss": 0.6488, + "step": 3482 + }, + { + "epoch": 0.37, + "grad_norm": 2.0270385193056204, + "learning_rate": 7.313801483777674e-06, + "loss": 0.6552, + "step": 3483 + }, + { + "epoch": 0.37, + "grad_norm": 4.205749249065878, + "learning_rate": 7.3122905704630845e-06, + "loss": 0.6876, + "step": 3484 + }, + { + "epoch": 0.37, + "grad_norm": 3.688817976879616, + "learning_rate": 7.310779388513263e-06, + "loss": 0.6637, + "step": 3485 + }, + { + "epoch": 0.37, + "grad_norm": 2.401643485510619, + "learning_rate": 7.309267938103769e-06, + "loss": 0.6614, + "step": 3486 + }, + { + "epoch": 0.37, + "grad_norm": 2.4560485153621237, + "learning_rate": 7.307756219410205e-06, + "loss": 0.6329, + "step": 3487 + }, + { + "epoch": 0.37, + "grad_norm": 2.5820481308855046, + "learning_rate": 7.306244232608191e-06, + "loss": 0.7006, + "step": 3488 + }, + { + "epoch": 0.37, + "grad_norm": 1.1020552524421927, + "learning_rate": 7.304731977873392e-06, + "loss": 0.5822, + "step": 3489 + }, + { + "epoch": 0.37, + "grad_norm": 2.584430521631613, + "learning_rate": 7.303219455381491e-06, + "loss": 0.6619, + "step": 3490 + }, + { + "epoch": 0.37, + "grad_norm": 2.298444021464788, + "learning_rate": 7.301706665308212e-06, + "loss": 0.7223, + "step": 3491 + }, + { + "epoch": 0.37, + "grad_norm": 6.985437156669851, + "learning_rate": 7.300193607829308e-06, + "loss": 0.6624, + "step": 3492 + }, + { + "epoch": 0.37, + "grad_norm": 2.590700474555624, + "learning_rate": 7.298680283120558e-06, + "loss": 0.6532, + "step": 3493 + }, + { + "epoch": 0.37, + "grad_norm": 2.3977866755703614, + "learning_rate": 7.29716669135778e-06, + "loss": 0.7138, + "step": 3494 + }, + { + "epoch": 0.37, + "grad_norm": 2.7237583049566525, + "learning_rate": 7.295652832716814e-06, + "loss": 0.6091, + "step": 3495 + }, + { + "epoch": 0.37, + "grad_norm": 2.789323251837038, + "learning_rate": 7.294138707373539e-06, + "loss": 0.6027, + "step": 3496 + }, + { + "epoch": 0.37, + "grad_norm": 2.1445610603977494, + "learning_rate": 7.29262431550386e-06, + "loss": 0.6358, + "step": 3497 + }, + { + "epoch": 0.37, + "grad_norm": 2.7828156452167434, + "learning_rate": 7.2911096572837155e-06, + "loss": 0.7049, + "step": 3498 + }, + { + "epoch": 0.37, + "grad_norm": 3.062889194963662, + "learning_rate": 7.289594732889073e-06, + "loss": 0.6906, + "step": 3499 + }, + { + "epoch": 0.37, + "grad_norm": 3.187017863959395, + "learning_rate": 7.288079542495936e-06, + "loss": 0.6046, + "step": 3500 + }, + { + "epoch": 0.37, + "grad_norm": 2.80365868383857, + "learning_rate": 7.286564086280329e-06, + "loss": 0.6339, + "step": 3501 + }, + { + "epoch": 0.37, + "grad_norm": 2.978700832429271, + "learning_rate": 7.285048364418319e-06, + "loss": 0.6488, + "step": 3502 + }, + { + "epoch": 0.37, + "grad_norm": 2.9173805232519245, + "learning_rate": 7.283532377085992e-06, + "loss": 0.6956, + "step": 3503 + }, + { + "epoch": 0.37, + "grad_norm": 3.020587219160722, + "learning_rate": 7.282016124459477e-06, + "loss": 0.6508, + "step": 3504 + }, + { + "epoch": 0.37, + "grad_norm": 2.3101783919909056, + "learning_rate": 7.280499606714923e-06, + "loss": 0.6795, + "step": 3505 + }, + { + "epoch": 0.37, + "grad_norm": 3.9706549307214556, + "learning_rate": 7.27898282402852e-06, + "loss": 0.7841, + "step": 3506 + }, + { + "epoch": 0.37, + "grad_norm": 3.2123361694164423, + "learning_rate": 7.277465776576478e-06, + "loss": 0.6485, + "step": 3507 + }, + { + "epoch": 0.37, + "grad_norm": 2.833849508292802, + "learning_rate": 7.275948464535045e-06, + "loss": 0.6073, + "step": 3508 + }, + { + "epoch": 0.37, + "grad_norm": 2.190248056297931, + "learning_rate": 7.274430888080502e-06, + "loss": 0.6136, + "step": 3509 + }, + { + "epoch": 0.37, + "grad_norm": 3.801011293418992, + "learning_rate": 7.27291304738915e-06, + "loss": 0.6847, + "step": 3510 + }, + { + "epoch": 0.37, + "grad_norm": 6.725920278243489, + "learning_rate": 7.271394942637332e-06, + "loss": 0.6099, + "step": 3511 + }, + { + "epoch": 0.37, + "grad_norm": 2.3603811567927466, + "learning_rate": 7.269876574001414e-06, + "loss": 0.6546, + "step": 3512 + }, + { + "epoch": 0.37, + "grad_norm": 2.775937881236346, + "learning_rate": 7.2683579416578e-06, + "loss": 0.6915, + "step": 3513 + }, + { + "epoch": 0.37, + "grad_norm": 2.542726679655711, + "learning_rate": 7.266839045782914e-06, + "loss": 0.6684, + "step": 3514 + }, + { + "epoch": 0.37, + "grad_norm": 2.7964307299094453, + "learning_rate": 7.265319886553223e-06, + "loss": 0.594, + "step": 3515 + }, + { + "epoch": 0.37, + "grad_norm": 2.37740356376484, + "learning_rate": 7.263800464145214e-06, + "loss": 0.7419, + "step": 3516 + }, + { + "epoch": 0.37, + "grad_norm": 3.485584641652323, + "learning_rate": 7.262280778735412e-06, + "loss": 0.6703, + "step": 3517 + }, + { + "epoch": 0.37, + "grad_norm": 3.4839449059905907, + "learning_rate": 7.26076083050037e-06, + "loss": 0.675, + "step": 3518 + }, + { + "epoch": 0.37, + "grad_norm": 2.12300936003989, + "learning_rate": 7.259240619616668e-06, + "loss": 0.6224, + "step": 3519 + }, + { + "epoch": 0.37, + "grad_norm": 2.6719730804814863, + "learning_rate": 7.257720146260923e-06, + "loss": 0.659, + "step": 3520 + }, + { + "epoch": 0.37, + "grad_norm": 3.73292327896022, + "learning_rate": 7.256199410609776e-06, + "loss": 0.6476, + "step": 3521 + }, + { + "epoch": 0.37, + "grad_norm": 4.636014633612924, + "learning_rate": 7.254678412839905e-06, + "loss": 0.6879, + "step": 3522 + }, + { + "epoch": 0.37, + "grad_norm": 2.5306894259549075, + "learning_rate": 7.253157153128012e-06, + "loss": 0.6785, + "step": 3523 + }, + { + "epoch": 0.37, + "grad_norm": 3.976569034282962, + "learning_rate": 7.251635631650838e-06, + "loss": 0.7064, + "step": 3524 + }, + { + "epoch": 0.37, + "grad_norm": 2.45250759237423, + "learning_rate": 7.250113848585141e-06, + "loss": 0.6499, + "step": 3525 + }, + { + "epoch": 0.37, + "grad_norm": 2.060166639945123, + "learning_rate": 7.248591804107724e-06, + "loss": 0.6757, + "step": 3526 + }, + { + "epoch": 0.37, + "grad_norm": 5.716487897796609, + "learning_rate": 7.247069498395409e-06, + "loss": 0.6512, + "step": 3527 + }, + { + "epoch": 0.37, + "grad_norm": 3.4915603318297346, + "learning_rate": 7.245546931625057e-06, + "loss": 0.6518, + "step": 3528 + }, + { + "epoch": 0.37, + "grad_norm": 2.657417413138783, + "learning_rate": 7.244024103973553e-06, + "loss": 0.6662, + "step": 3529 + }, + { + "epoch": 0.37, + "grad_norm": 4.6985543615408805, + "learning_rate": 7.242501015617815e-06, + "loss": 0.5643, + "step": 3530 + }, + { + "epoch": 0.37, + "grad_norm": 2.7512005261618566, + "learning_rate": 7.240977666734793e-06, + "loss": 0.6004, + "step": 3531 + }, + { + "epoch": 0.37, + "grad_norm": 2.3062901989306015, + "learning_rate": 7.239454057501462e-06, + "loss": 0.6528, + "step": 3532 + }, + { + "epoch": 0.37, + "grad_norm": 3.1690879827237803, + "learning_rate": 7.237930188094834e-06, + "loss": 0.6433, + "step": 3533 + }, + { + "epoch": 0.37, + "grad_norm": 2.189006985940273, + "learning_rate": 7.236406058691944e-06, + "loss": 0.6005, + "step": 3534 + }, + { + "epoch": 0.37, + "grad_norm": 2.5436760311571067, + "learning_rate": 7.234881669469864e-06, + "loss": 0.6815, + "step": 3535 + }, + { + "epoch": 0.37, + "grad_norm": 2.5440441226370254, + "learning_rate": 7.233357020605692e-06, + "loss": 0.5973, + "step": 3536 + }, + { + "epoch": 0.37, + "grad_norm": 2.8717435554951423, + "learning_rate": 7.2318321122765575e-06, + "loss": 0.7263, + "step": 3537 + }, + { + "epoch": 0.37, + "grad_norm": 2.5676017996288674, + "learning_rate": 7.230306944659618e-06, + "loss": 0.6429, + "step": 3538 + }, + { + "epoch": 0.37, + "grad_norm": 3.320166184610786, + "learning_rate": 7.2287815179320665e-06, + "loss": 0.7412, + "step": 3539 + }, + { + "epoch": 0.37, + "grad_norm": 4.887680699145069, + "learning_rate": 7.227255832271122e-06, + "loss": 0.7, + "step": 3540 + }, + { + "epoch": 0.37, + "grad_norm": 2.2913081303644938, + "learning_rate": 7.225729887854032e-06, + "loss": 0.6098, + "step": 3541 + }, + { + "epoch": 0.37, + "grad_norm": 4.5364276369948415, + "learning_rate": 7.224203684858078e-06, + "loss": 0.639, + "step": 3542 + }, + { + "epoch": 0.37, + "grad_norm": 2.9611456139801065, + "learning_rate": 7.222677223460567e-06, + "loss": 0.626, + "step": 3543 + }, + { + "epoch": 0.37, + "grad_norm": 4.607908981424635, + "learning_rate": 7.221150503838844e-06, + "loss": 0.6371, + "step": 3544 + }, + { + "epoch": 0.37, + "grad_norm": 2.6691678732909114, + "learning_rate": 7.219623526170275e-06, + "loss": 0.6519, + "step": 3545 + }, + { + "epoch": 0.37, + "grad_norm": 2.914049840880877, + "learning_rate": 7.218096290632263e-06, + "loss": 0.6737, + "step": 3546 + }, + { + "epoch": 0.37, + "grad_norm": 4.551071513521762, + "learning_rate": 7.216568797402232e-06, + "loss": 0.616, + "step": 3547 + }, + { + "epoch": 0.37, + "grad_norm": 2.5813083708314255, + "learning_rate": 7.2150410466576495e-06, + "loss": 0.7268, + "step": 3548 + }, + { + "epoch": 0.37, + "grad_norm": 3.0795987810833583, + "learning_rate": 7.213513038575999e-06, + "loss": 0.6526, + "step": 3549 + }, + { + "epoch": 0.37, + "grad_norm": 2.2721784087350216, + "learning_rate": 7.211984773334803e-06, + "loss": 0.6302, + "step": 3550 + }, + { + "epoch": 0.37, + "grad_norm": 1.928510250413681, + "learning_rate": 7.210456251111611e-06, + "loss": 0.6411, + "step": 3551 + }, + { + "epoch": 0.37, + "grad_norm": 4.445810874079125, + "learning_rate": 7.208927472084e-06, + "loss": 0.6141, + "step": 3552 + }, + { + "epoch": 0.37, + "grad_norm": 2.3202477177734657, + "learning_rate": 7.207398436429581e-06, + "loss": 0.6578, + "step": 3553 + }, + { + "epoch": 0.37, + "grad_norm": 4.305688026429538, + "learning_rate": 7.205869144325992e-06, + "loss": 0.6092, + "step": 3554 + }, + { + "epoch": 0.37, + "grad_norm": 2.346775916923627, + "learning_rate": 7.204339595950904e-06, + "loss": 0.5894, + "step": 3555 + }, + { + "epoch": 0.37, + "grad_norm": 3.8477133741441674, + "learning_rate": 7.202809791482013e-06, + "loss": 0.6427, + "step": 3556 + }, + { + "epoch": 0.37, + "grad_norm": 3.235974425228947, + "learning_rate": 7.201279731097048e-06, + "loss": 0.6674, + "step": 3557 + }, + { + "epoch": 0.37, + "grad_norm": 2.2493046168393445, + "learning_rate": 7.199749414973767e-06, + "loss": 0.6528, + "step": 3558 + }, + { + "epoch": 0.37, + "grad_norm": 1.107170893614456, + "learning_rate": 7.1982188432899595e-06, + "loss": 0.5877, + "step": 3559 + }, + { + "epoch": 0.37, + "grad_norm": 3.3599103332329485, + "learning_rate": 7.196688016223439e-06, + "loss": 0.7369, + "step": 3560 + }, + { + "epoch": 0.37, + "grad_norm": 2.4363610701892657, + "learning_rate": 7.195156933952055e-06, + "loss": 0.6762, + "step": 3561 + }, + { + "epoch": 0.37, + "grad_norm": 2.7535120357249556, + "learning_rate": 7.193625596653684e-06, + "loss": 0.6812, + "step": 3562 + }, + { + "epoch": 0.37, + "grad_norm": 2.099885019771948, + "learning_rate": 7.1920940045062335e-06, + "loss": 0.6133, + "step": 3563 + }, + { + "epoch": 0.38, + "grad_norm": 3.93938976849278, + "learning_rate": 7.1905621576876375e-06, + "loss": 0.6401, + "step": 3564 + }, + { + "epoch": 0.38, + "grad_norm": 2.4121806622384656, + "learning_rate": 7.189030056375862e-06, + "loss": 0.629, + "step": 3565 + }, + { + "epoch": 0.38, + "grad_norm": 2.832137503000308, + "learning_rate": 7.187497700748903e-06, + "loss": 0.7839, + "step": 3566 + }, + { + "epoch": 0.38, + "grad_norm": 2.892305433301476, + "learning_rate": 7.185965090984783e-06, + "loss": 0.6009, + "step": 3567 + }, + { + "epoch": 0.38, + "grad_norm": 2.701267405207593, + "learning_rate": 7.184432227261561e-06, + "loss": 0.7288, + "step": 3568 + }, + { + "epoch": 0.38, + "grad_norm": 3.065332041666608, + "learning_rate": 7.182899109757314e-06, + "loss": 0.6107, + "step": 3569 + }, + { + "epoch": 0.38, + "grad_norm": 2.3541441748774177, + "learning_rate": 7.181365738650161e-06, + "loss": 0.6027, + "step": 3570 + }, + { + "epoch": 0.38, + "grad_norm": 2.271716533928777, + "learning_rate": 7.17983211411824e-06, + "loss": 0.6606, + "step": 3571 + }, + { + "epoch": 0.38, + "grad_norm": 3.88759903004812, + "learning_rate": 7.178298236339727e-06, + "loss": 0.5967, + "step": 3572 + }, + { + "epoch": 0.38, + "grad_norm": 5.01835950623007, + "learning_rate": 7.176764105492821e-06, + "loss": 0.6285, + "step": 3573 + }, + { + "epoch": 0.38, + "grad_norm": 2.7434215149338397, + "learning_rate": 7.175229721755753e-06, + "loss": 0.6626, + "step": 3574 + }, + { + "epoch": 0.38, + "grad_norm": 3.346997092680918, + "learning_rate": 7.173695085306785e-06, + "loss": 0.7167, + "step": 3575 + }, + { + "epoch": 0.38, + "grad_norm": 3.140529020266596, + "learning_rate": 7.172160196324205e-06, + "loss": 0.658, + "step": 3576 + }, + { + "epoch": 0.38, + "grad_norm": 4.120165751814182, + "learning_rate": 7.1706250549863335e-06, + "loss": 0.69, + "step": 3577 + }, + { + "epoch": 0.38, + "grad_norm": 2.2247914008766796, + "learning_rate": 7.1690896614715155e-06, + "loss": 0.6288, + "step": 3578 + }, + { + "epoch": 0.38, + "grad_norm": 2.605629864739022, + "learning_rate": 7.167554015958133e-06, + "loss": 0.6357, + "step": 3579 + }, + { + "epoch": 0.38, + "grad_norm": 2.3904906015835947, + "learning_rate": 7.166018118624588e-06, + "loss": 0.5556, + "step": 3580 + }, + { + "epoch": 0.38, + "grad_norm": 2.6192217487542586, + "learning_rate": 7.164481969649323e-06, + "loss": 0.6576, + "step": 3581 + }, + { + "epoch": 0.38, + "grad_norm": 5.076216402779878, + "learning_rate": 7.162945569210796e-06, + "loss": 0.6749, + "step": 3582 + }, + { + "epoch": 0.38, + "grad_norm": 4.889072644554949, + "learning_rate": 7.161408917487509e-06, + "loss": 0.6673, + "step": 3583 + }, + { + "epoch": 0.38, + "grad_norm": 4.537856847059156, + "learning_rate": 7.159872014657978e-06, + "loss": 0.6752, + "step": 3584 + }, + { + "epoch": 0.38, + "grad_norm": 2.221005642218842, + "learning_rate": 7.158334860900762e-06, + "loss": 0.5711, + "step": 3585 + }, + { + "epoch": 0.38, + "grad_norm": 2.4533316517178387, + "learning_rate": 7.156797456394441e-06, + "loss": 0.7351, + "step": 3586 + }, + { + "epoch": 0.38, + "grad_norm": 2.886040834314495, + "learning_rate": 7.1552598013176264e-06, + "loss": 0.6767, + "step": 3587 + }, + { + "epoch": 0.38, + "grad_norm": 2.874147523804201, + "learning_rate": 7.1537218958489575e-06, + "loss": 0.6642, + "step": 3588 + }, + { + "epoch": 0.38, + "grad_norm": 2.722531139220571, + "learning_rate": 7.152183740167105e-06, + "loss": 0.6678, + "step": 3589 + }, + { + "epoch": 0.38, + "grad_norm": 3.187200392218844, + "learning_rate": 7.150645334450767e-06, + "loss": 0.7528, + "step": 3590 + }, + { + "epoch": 0.38, + "grad_norm": 2.3536496010454147, + "learning_rate": 7.14910667887867e-06, + "loss": 0.6902, + "step": 3591 + }, + { + "epoch": 0.38, + "grad_norm": 2.0650748967886057, + "learning_rate": 7.147567773629573e-06, + "loss": 0.6505, + "step": 3592 + }, + { + "epoch": 0.38, + "grad_norm": 2.86825048250956, + "learning_rate": 7.146028618882258e-06, + "loss": 0.7501, + "step": 3593 + }, + { + "epoch": 0.38, + "grad_norm": 2.5434125470032756, + "learning_rate": 7.1444892148155445e-06, + "loss": 0.7357, + "step": 3594 + }, + { + "epoch": 0.38, + "grad_norm": 2.4145813864328374, + "learning_rate": 7.14294956160827e-06, + "loss": 0.613, + "step": 3595 + }, + { + "epoch": 0.38, + "grad_norm": 2.141187484264333, + "learning_rate": 7.141409659439313e-06, + "loss": 0.6496, + "step": 3596 + }, + { + "epoch": 0.38, + "grad_norm": 2.9374788957358136, + "learning_rate": 7.139869508487569e-06, + "loss": 0.6382, + "step": 3597 + }, + { + "epoch": 0.38, + "grad_norm": 2.4608281779178722, + "learning_rate": 7.138329108931974e-06, + "loss": 0.6155, + "step": 3598 + }, + { + "epoch": 0.38, + "grad_norm": 2.9810869246330727, + "learning_rate": 7.136788460951482e-06, + "loss": 0.6844, + "step": 3599 + }, + { + "epoch": 0.38, + "grad_norm": 2.261119179477242, + "learning_rate": 7.135247564725085e-06, + "loss": 0.6355, + "step": 3600 + }, + { + "epoch": 0.38, + "grad_norm": 2.655641062191612, + "learning_rate": 7.133706420431799e-06, + "loss": 0.578, + "step": 3601 + }, + { + "epoch": 0.38, + "grad_norm": 1.0452192913458778, + "learning_rate": 7.132165028250666e-06, + "loss": 0.6081, + "step": 3602 + }, + { + "epoch": 0.38, + "grad_norm": 2.5869892334643882, + "learning_rate": 7.130623388360767e-06, + "loss": 0.7272, + "step": 3603 + }, + { + "epoch": 0.38, + "grad_norm": 2.7640836198719785, + "learning_rate": 7.129081500941199e-06, + "loss": 0.6978, + "step": 3604 + }, + { + "epoch": 0.38, + "grad_norm": 2.768985249973699, + "learning_rate": 7.127539366171099e-06, + "loss": 0.5961, + "step": 3605 + }, + { + "epoch": 0.38, + "grad_norm": 3.9567725278063506, + "learning_rate": 7.125996984229623e-06, + "loss": 0.685, + "step": 3606 + }, + { + "epoch": 0.38, + "grad_norm": 2.879031446261374, + "learning_rate": 7.124454355295966e-06, + "loss": 0.6856, + "step": 3607 + }, + { + "epoch": 0.38, + "grad_norm": 2.4194323937650752, + "learning_rate": 7.1229114795493405e-06, + "loss": 0.6894, + "step": 3608 + }, + { + "epoch": 0.38, + "grad_norm": 0.9796362738749906, + "learning_rate": 7.121368357168997e-06, + "loss": 0.5798, + "step": 3609 + }, + { + "epoch": 0.38, + "grad_norm": 2.51120233981379, + "learning_rate": 7.11982498833421e-06, + "loss": 0.7307, + "step": 3610 + }, + { + "epoch": 0.38, + "grad_norm": 0.9959995491929295, + "learning_rate": 7.1182813732242835e-06, + "loss": 0.5706, + "step": 3611 + }, + { + "epoch": 0.38, + "grad_norm": 2.693687958001663, + "learning_rate": 7.116737512018551e-06, + "loss": 0.598, + "step": 3612 + }, + { + "epoch": 0.38, + "grad_norm": 2.298264703609383, + "learning_rate": 7.115193404896372e-06, + "loss": 0.6928, + "step": 3613 + }, + { + "epoch": 0.38, + "grad_norm": 4.7510712526228716, + "learning_rate": 7.1136490520371394e-06, + "loss": 0.6345, + "step": 3614 + }, + { + "epoch": 0.38, + "grad_norm": 2.545812637717511, + "learning_rate": 7.112104453620269e-06, + "loss": 0.7136, + "step": 3615 + }, + { + "epoch": 0.38, + "grad_norm": 2.4425094561499003, + "learning_rate": 7.11055960982521e-06, + "loss": 0.6611, + "step": 3616 + }, + { + "epoch": 0.38, + "grad_norm": 3.2838270941484837, + "learning_rate": 7.109014520831433e-06, + "loss": 0.6004, + "step": 3617 + }, + { + "epoch": 0.38, + "grad_norm": 3.6464684533156175, + "learning_rate": 7.10746918681845e-06, + "loss": 0.7205, + "step": 3618 + }, + { + "epoch": 0.38, + "grad_norm": 2.494247980704131, + "learning_rate": 7.105923607965786e-06, + "loss": 0.6274, + "step": 3619 + }, + { + "epoch": 0.38, + "grad_norm": 2.4050078671251107, + "learning_rate": 7.104377784453005e-06, + "loss": 0.6664, + "step": 3620 + }, + { + "epoch": 0.38, + "grad_norm": 2.404246434896617, + "learning_rate": 7.102831716459696e-06, + "loss": 0.5954, + "step": 3621 + }, + { + "epoch": 0.38, + "grad_norm": 2.623044062785884, + "learning_rate": 7.101285404165478e-06, + "loss": 0.6359, + "step": 3622 + }, + { + "epoch": 0.38, + "grad_norm": 2.5546602923700013, + "learning_rate": 7.099738847749995e-06, + "loss": 0.6142, + "step": 3623 + }, + { + "epoch": 0.38, + "grad_norm": 2.5511546021541984, + "learning_rate": 7.098192047392923e-06, + "loss": 0.6326, + "step": 3624 + }, + { + "epoch": 0.38, + "grad_norm": 2.863354029549815, + "learning_rate": 7.096645003273964e-06, + "loss": 0.7335, + "step": 3625 + }, + { + "epoch": 0.38, + "grad_norm": 2.6001421531585835, + "learning_rate": 7.095097715572849e-06, + "loss": 0.6356, + "step": 3626 + }, + { + "epoch": 0.38, + "grad_norm": 2.6189887318128875, + "learning_rate": 7.093550184469339e-06, + "loss": 0.6734, + "step": 3627 + }, + { + "epoch": 0.38, + "grad_norm": 2.9686532488381214, + "learning_rate": 7.092002410143218e-06, + "loss": 0.6598, + "step": 3628 + }, + { + "epoch": 0.38, + "grad_norm": 2.885198141114095, + "learning_rate": 7.0904543927743066e-06, + "loss": 0.7185, + "step": 3629 + }, + { + "epoch": 0.38, + "grad_norm": 2.655688880025789, + "learning_rate": 7.088906132542446e-06, + "loss": 0.6541, + "step": 3630 + }, + { + "epoch": 0.38, + "grad_norm": 2.7287819033410154, + "learning_rate": 7.0873576296275096e-06, + "loss": 0.6549, + "step": 3631 + }, + { + "epoch": 0.38, + "grad_norm": 2.6674782827937946, + "learning_rate": 7.085808884209396e-06, + "loss": 0.5593, + "step": 3632 + }, + { + "epoch": 0.38, + "grad_norm": 2.3031490250344473, + "learning_rate": 7.084259896468038e-06, + "loss": 0.6337, + "step": 3633 + }, + { + "epoch": 0.38, + "grad_norm": 1.243801593410752, + "learning_rate": 7.082710666583389e-06, + "loss": 0.5897, + "step": 3634 + }, + { + "epoch": 0.38, + "grad_norm": 3.3382949550635908, + "learning_rate": 7.081161194735435e-06, + "loss": 0.7044, + "step": 3635 + }, + { + "epoch": 0.38, + "grad_norm": 3.2631769374260293, + "learning_rate": 7.0796114811041905e-06, + "loss": 0.6655, + "step": 3636 + }, + { + "epoch": 0.38, + "grad_norm": 1.1036281150291933, + "learning_rate": 7.078061525869695e-06, + "loss": 0.6465, + "step": 3637 + }, + { + "epoch": 0.38, + "grad_norm": 2.7686792433647867, + "learning_rate": 7.076511329212019e-06, + "loss": 0.674, + "step": 3638 + }, + { + "epoch": 0.38, + "grad_norm": 3.7550739067072625, + "learning_rate": 7.074960891311258e-06, + "loss": 0.6734, + "step": 3639 + }, + { + "epoch": 0.38, + "grad_norm": 2.177433134571054, + "learning_rate": 7.073410212347541e-06, + "loss": 0.6277, + "step": 3640 + }, + { + "epoch": 0.38, + "grad_norm": 2.7882879054667487, + "learning_rate": 7.071859292501018e-06, + "loss": 0.6493, + "step": 3641 + }, + { + "epoch": 0.38, + "grad_norm": 3.434366748004624, + "learning_rate": 7.070308131951872e-06, + "loss": 0.6723, + "step": 3642 + }, + { + "epoch": 0.38, + "grad_norm": 2.7276152477544935, + "learning_rate": 7.068756730880311e-06, + "loss": 0.622, + "step": 3643 + }, + { + "epoch": 0.38, + "grad_norm": 3.0417680257370083, + "learning_rate": 7.067205089466574e-06, + "loss": 0.6209, + "step": 3644 + }, + { + "epoch": 0.38, + "grad_norm": 2.321254791228394, + "learning_rate": 7.065653207890924e-06, + "loss": 0.6661, + "step": 3645 + }, + { + "epoch": 0.38, + "grad_norm": 2.8612826916827196, + "learning_rate": 7.064101086333657e-06, + "loss": 0.6949, + "step": 3646 + }, + { + "epoch": 0.38, + "grad_norm": 3.1520069324380366, + "learning_rate": 7.0625487249750915e-06, + "loss": 0.6501, + "step": 3647 + }, + { + "epoch": 0.38, + "grad_norm": 3.964198431485837, + "learning_rate": 7.060996123995576e-06, + "loss": 0.6407, + "step": 3648 + }, + { + "epoch": 0.38, + "grad_norm": 2.93236095260122, + "learning_rate": 7.059443283575492e-06, + "loss": 0.5939, + "step": 3649 + }, + { + "epoch": 0.38, + "grad_norm": 2.6552920819656887, + "learning_rate": 7.0578902038952375e-06, + "loss": 0.6932, + "step": 3650 + }, + { + "epoch": 0.38, + "grad_norm": 2.9297734442881906, + "learning_rate": 7.056336885135251e-06, + "loss": 0.6419, + "step": 3651 + }, + { + "epoch": 0.38, + "grad_norm": 2.6224581309833943, + "learning_rate": 7.054783327475987e-06, + "loss": 0.675, + "step": 3652 + }, + { + "epoch": 0.38, + "grad_norm": 2.7160780202997263, + "learning_rate": 7.053229531097937e-06, + "loss": 0.656, + "step": 3653 + }, + { + "epoch": 0.38, + "grad_norm": 3.3774431250824373, + "learning_rate": 7.051675496181614e-06, + "loss": 0.5999, + "step": 3654 + }, + { + "epoch": 0.38, + "grad_norm": 1.2783376063724854, + "learning_rate": 7.050121222907564e-06, + "loss": 0.5744, + "step": 3655 + }, + { + "epoch": 0.38, + "grad_norm": 2.3589367318753514, + "learning_rate": 7.048566711456355e-06, + "loss": 0.6371, + "step": 3656 + }, + { + "epoch": 0.38, + "grad_norm": 2.6774468417048696, + "learning_rate": 7.047011962008589e-06, + "loss": 0.6447, + "step": 3657 + }, + { + "epoch": 0.38, + "grad_norm": 2.4342842413375734, + "learning_rate": 7.04545697474489e-06, + "loss": 0.6903, + "step": 3658 + }, + { + "epoch": 0.39, + "grad_norm": 2.252872706576466, + "learning_rate": 7.043901749845913e-06, + "loss": 0.6952, + "step": 3659 + }, + { + "epoch": 0.39, + "grad_norm": 2.3175374076123303, + "learning_rate": 7.042346287492339e-06, + "loss": 0.708, + "step": 3660 + }, + { + "epoch": 0.39, + "grad_norm": 2.634929578368015, + "learning_rate": 7.040790587864875e-06, + "loss": 0.6424, + "step": 3661 + }, + { + "epoch": 0.39, + "grad_norm": 2.618636727946655, + "learning_rate": 7.039234651144262e-06, + "loss": 0.6591, + "step": 3662 + }, + { + "epoch": 0.39, + "grad_norm": 2.904861681615746, + "learning_rate": 7.037678477511261e-06, + "loss": 0.7491, + "step": 3663 + }, + { + "epoch": 0.39, + "grad_norm": 2.5649713303543047, + "learning_rate": 7.036122067146667e-06, + "loss": 0.703, + "step": 3664 + }, + { + "epoch": 0.39, + "grad_norm": 3.220337768555499, + "learning_rate": 7.034565420231294e-06, + "loss": 0.7034, + "step": 3665 + }, + { + "epoch": 0.39, + "grad_norm": 2.390572089955541, + "learning_rate": 7.033008536945994e-06, + "loss": 0.6504, + "step": 3666 + }, + { + "epoch": 0.39, + "grad_norm": 3.7954821263514296, + "learning_rate": 7.031451417471638e-06, + "loss": 0.6958, + "step": 3667 + }, + { + "epoch": 0.39, + "grad_norm": 2.8545469996532997, + "learning_rate": 7.029894061989128e-06, + "loss": 0.7058, + "step": 3668 + }, + { + "epoch": 0.39, + "grad_norm": 2.741678996189431, + "learning_rate": 7.0283364706793954e-06, + "loss": 0.6367, + "step": 3669 + }, + { + "epoch": 0.39, + "grad_norm": 2.677654834887371, + "learning_rate": 7.026778643723393e-06, + "loss": 0.6911, + "step": 3670 + }, + { + "epoch": 0.39, + "grad_norm": 2.061593434569085, + "learning_rate": 7.025220581302108e-06, + "loss": 0.6083, + "step": 3671 + }, + { + "epoch": 0.39, + "grad_norm": 4.4271014763245535, + "learning_rate": 7.02366228359655e-06, + "loss": 0.632, + "step": 3672 + }, + { + "epoch": 0.39, + "grad_norm": 8.875367995763348, + "learning_rate": 7.022103750787759e-06, + "loss": 0.7067, + "step": 3673 + }, + { + "epoch": 0.39, + "grad_norm": 2.7873572210988407, + "learning_rate": 7.020544983056796e-06, + "loss": 0.7194, + "step": 3674 + }, + { + "epoch": 0.39, + "grad_norm": 2.662020767735063, + "learning_rate": 7.0189859805847615e-06, + "loss": 0.6952, + "step": 3675 + }, + { + "epoch": 0.39, + "grad_norm": 3.8783628852915077, + "learning_rate": 7.017426743552769e-06, + "loss": 0.6343, + "step": 3676 + }, + { + "epoch": 0.39, + "grad_norm": 3.5280735743445337, + "learning_rate": 7.015867272141972e-06, + "loss": 0.6424, + "step": 3677 + }, + { + "epoch": 0.39, + "grad_norm": 2.457030504295436, + "learning_rate": 7.014307566533541e-06, + "loss": 0.6985, + "step": 3678 + }, + { + "epoch": 0.39, + "grad_norm": 2.102622932285392, + "learning_rate": 7.0127476269086796e-06, + "loss": 0.6618, + "step": 3679 + }, + { + "epoch": 0.39, + "grad_norm": 2.953298825950946, + "learning_rate": 7.011187453448617e-06, + "loss": 0.6986, + "step": 3680 + }, + { + "epoch": 0.39, + "grad_norm": 2.2057102088439478, + "learning_rate": 7.009627046334611e-06, + "loss": 0.7022, + "step": 3681 + }, + { + "epoch": 0.39, + "grad_norm": 2.414332975549559, + "learning_rate": 7.008066405747943e-06, + "loss": 0.6429, + "step": 3682 + }, + { + "epoch": 0.39, + "grad_norm": 2.484038423521051, + "learning_rate": 7.006505531869925e-06, + "loss": 0.7395, + "step": 3683 + }, + { + "epoch": 0.39, + "grad_norm": 2.441293974717065, + "learning_rate": 7.004944424881894e-06, + "loss": 0.6179, + "step": 3684 + }, + { + "epoch": 0.39, + "grad_norm": 2.12476887533077, + "learning_rate": 7.003383084965215e-06, + "loss": 0.5975, + "step": 3685 + }, + { + "epoch": 0.39, + "grad_norm": 2.4875957020716335, + "learning_rate": 7.001821512301283e-06, + "loss": 0.701, + "step": 3686 + }, + { + "epoch": 0.39, + "grad_norm": 2.2622990175164963, + "learning_rate": 7.000259707071512e-06, + "loss": 0.7091, + "step": 3687 + }, + { + "epoch": 0.39, + "grad_norm": 3.4195893855491506, + "learning_rate": 6.9986976694573515e-06, + "loss": 0.6841, + "step": 3688 + }, + { + "epoch": 0.39, + "grad_norm": 3.6453885611711816, + "learning_rate": 6.997135399640273e-06, + "loss": 0.6114, + "step": 3689 + }, + { + "epoch": 0.39, + "grad_norm": 2.129611856788084, + "learning_rate": 6.9955728978017775e-06, + "loss": 0.6429, + "step": 3690 + }, + { + "epoch": 0.39, + "grad_norm": 2.2384526562197875, + "learning_rate": 6.99401016412339e-06, + "loss": 0.7059, + "step": 3691 + }, + { + "epoch": 0.39, + "grad_norm": 2.206543127801006, + "learning_rate": 6.992447198786666e-06, + "loss": 0.6889, + "step": 3692 + }, + { + "epoch": 0.39, + "grad_norm": 2.5820156224344046, + "learning_rate": 6.990884001973187e-06, + "loss": 0.5918, + "step": 3693 + }, + { + "epoch": 0.39, + "grad_norm": 2.705800904753045, + "learning_rate": 6.98932057386456e-06, + "loss": 0.6739, + "step": 3694 + }, + { + "epoch": 0.39, + "grad_norm": 3.2287782223395998, + "learning_rate": 6.987756914642418e-06, + "loss": 0.5849, + "step": 3695 + }, + { + "epoch": 0.39, + "grad_norm": 3.4820384692081654, + "learning_rate": 6.986193024488423e-06, + "loss": 0.6841, + "step": 3696 + }, + { + "epoch": 0.39, + "grad_norm": 2.90798808346997, + "learning_rate": 6.984628903584266e-06, + "loss": 0.6208, + "step": 3697 + }, + { + "epoch": 0.39, + "grad_norm": 5.045138928005131, + "learning_rate": 6.983064552111658e-06, + "loss": 0.6761, + "step": 3698 + }, + { + "epoch": 0.39, + "grad_norm": 2.763765878384353, + "learning_rate": 6.981499970252345e-06, + "loss": 0.6429, + "step": 3699 + }, + { + "epoch": 0.39, + "grad_norm": 3.8969396881884393, + "learning_rate": 6.979935158188091e-06, + "loss": 0.6543, + "step": 3700 + }, + { + "epoch": 0.39, + "grad_norm": 2.144538256745873, + "learning_rate": 6.9783701161006965e-06, + "loss": 0.6848, + "step": 3701 + }, + { + "epoch": 0.39, + "grad_norm": 2.2691334424929632, + "learning_rate": 6.976804844171978e-06, + "loss": 0.682, + "step": 3702 + }, + { + "epoch": 0.39, + "grad_norm": 2.853203277677684, + "learning_rate": 6.975239342583789e-06, + "loss": 0.5798, + "step": 3703 + }, + { + "epoch": 0.39, + "grad_norm": 0.984156618342594, + "learning_rate": 6.973673611518003e-06, + "loss": 0.6253, + "step": 3704 + }, + { + "epoch": 0.39, + "grad_norm": 3.6403380840996964, + "learning_rate": 6.972107651156521e-06, + "loss": 0.7099, + "step": 3705 + }, + { + "epoch": 0.39, + "grad_norm": 3.0398511893391715, + "learning_rate": 6.970541461681274e-06, + "loss": 0.6499, + "step": 3706 + }, + { + "epoch": 0.39, + "grad_norm": 2.5614405274937453, + "learning_rate": 6.968975043274215e-06, + "loss": 0.5853, + "step": 3707 + }, + { + "epoch": 0.39, + "grad_norm": 2.34052750735692, + "learning_rate": 6.9674083961173276e-06, + "loss": 0.7104, + "step": 3708 + }, + { + "epoch": 0.39, + "grad_norm": 2.5089454252540015, + "learning_rate": 6.96584152039262e-06, + "loss": 0.6719, + "step": 3709 + }, + { + "epoch": 0.39, + "grad_norm": 2.9541879261553783, + "learning_rate": 6.964274416282129e-06, + "loss": 0.6482, + "step": 3710 + }, + { + "epoch": 0.39, + "grad_norm": 2.3885655104600265, + "learning_rate": 6.962707083967911e-06, + "loss": 0.5981, + "step": 3711 + }, + { + "epoch": 0.39, + "grad_norm": 2.4321001454354527, + "learning_rate": 6.9611395236320615e-06, + "loss": 0.6967, + "step": 3712 + }, + { + "epoch": 0.39, + "grad_norm": 1.1448344326564757, + "learning_rate": 6.959571735456687e-06, + "loss": 0.6432, + "step": 3713 + }, + { + "epoch": 0.39, + "grad_norm": 2.652298799399858, + "learning_rate": 6.958003719623936e-06, + "loss": 0.7021, + "step": 3714 + }, + { + "epoch": 0.39, + "grad_norm": 2.2558034126837194, + "learning_rate": 6.956435476315972e-06, + "loss": 0.7334, + "step": 3715 + }, + { + "epoch": 0.39, + "grad_norm": 2.089987806720731, + "learning_rate": 6.9548670057149896e-06, + "loss": 0.6904, + "step": 3716 + }, + { + "epoch": 0.39, + "grad_norm": 3.0275655323101884, + "learning_rate": 6.953298308003209e-06, + "loss": 0.6388, + "step": 3717 + }, + { + "epoch": 0.39, + "grad_norm": 2.6883335061425595, + "learning_rate": 6.9517293833628785e-06, + "loss": 0.666, + "step": 3718 + }, + { + "epoch": 0.39, + "grad_norm": 4.102080658442691, + "learning_rate": 6.950160231976269e-06, + "loss": 0.6471, + "step": 3719 + }, + { + "epoch": 0.39, + "grad_norm": 2.547523830477747, + "learning_rate": 6.948590854025681e-06, + "loss": 0.6384, + "step": 3720 + }, + { + "epoch": 0.39, + "grad_norm": 2.9940684741992807, + "learning_rate": 6.947021249693442e-06, + "loss": 0.6313, + "step": 3721 + }, + { + "epoch": 0.39, + "grad_norm": 3.388166051557115, + "learning_rate": 6.9454514191619e-06, + "loss": 0.6452, + "step": 3722 + }, + { + "epoch": 0.39, + "grad_norm": 2.5275961581874453, + "learning_rate": 6.9438813626134395e-06, + "loss": 0.5957, + "step": 3723 + }, + { + "epoch": 0.39, + "grad_norm": 2.9456477007058033, + "learning_rate": 6.942311080230458e-06, + "loss": 0.6231, + "step": 3724 + }, + { + "epoch": 0.39, + "grad_norm": 2.085258818767328, + "learning_rate": 6.940740572195392e-06, + "loss": 0.6739, + "step": 3725 + }, + { + "epoch": 0.39, + "grad_norm": 2.189417657638878, + "learning_rate": 6.939169838690695e-06, + "loss": 0.6297, + "step": 3726 + }, + { + "epoch": 0.39, + "grad_norm": 2.502191462085905, + "learning_rate": 6.937598879898853e-06, + "loss": 0.6238, + "step": 3727 + }, + { + "epoch": 0.39, + "grad_norm": 2.4771620597343773, + "learning_rate": 6.936027696002373e-06, + "loss": 0.7406, + "step": 3728 + }, + { + "epoch": 0.39, + "grad_norm": 2.3821967150760526, + "learning_rate": 6.934456287183793e-06, + "loss": 0.7131, + "step": 3729 + }, + { + "epoch": 0.39, + "grad_norm": 2.537080782846546, + "learning_rate": 6.932884653625672e-06, + "loss": 0.587, + "step": 3730 + }, + { + "epoch": 0.39, + "grad_norm": 4.044830093724147, + "learning_rate": 6.931312795510601e-06, + "loss": 0.6819, + "step": 3731 + }, + { + "epoch": 0.39, + "grad_norm": 2.060390451396102, + "learning_rate": 6.929740713021192e-06, + "loss": 0.6625, + "step": 3732 + }, + { + "epoch": 0.39, + "grad_norm": 2.1547981802320297, + "learning_rate": 6.928168406340082e-06, + "loss": 0.7245, + "step": 3733 + }, + { + "epoch": 0.39, + "grad_norm": 2.526368800735357, + "learning_rate": 6.926595875649944e-06, + "loss": 0.6918, + "step": 3734 + }, + { + "epoch": 0.39, + "grad_norm": 2.751346484300469, + "learning_rate": 6.925023121133465e-06, + "loss": 0.5705, + "step": 3735 + }, + { + "epoch": 0.39, + "grad_norm": 3.5756950930937323, + "learning_rate": 6.923450142973366e-06, + "loss": 0.7081, + "step": 3736 + }, + { + "epoch": 0.39, + "grad_norm": 2.3081997699324925, + "learning_rate": 6.921876941352388e-06, + "loss": 0.6519, + "step": 3737 + }, + { + "epoch": 0.39, + "grad_norm": 3.6782336966893183, + "learning_rate": 6.920303516453302e-06, + "loss": 0.6644, + "step": 3738 + }, + { + "epoch": 0.39, + "grad_norm": 2.1786958735173174, + "learning_rate": 6.918729868458905e-06, + "loss": 0.6086, + "step": 3739 + }, + { + "epoch": 0.39, + "grad_norm": 1.1080710822626574, + "learning_rate": 6.91715599755202e-06, + "loss": 0.5619, + "step": 3740 + }, + { + "epoch": 0.39, + "grad_norm": 3.3882194917466837, + "learning_rate": 6.9155819039154914e-06, + "loss": 0.6884, + "step": 3741 + }, + { + "epoch": 0.39, + "grad_norm": 2.2474420615290263, + "learning_rate": 6.9140075877321955e-06, + "loss": 0.7152, + "step": 3742 + }, + { + "epoch": 0.39, + "grad_norm": 2.3178490077460467, + "learning_rate": 6.91243304918503e-06, + "loss": 0.7279, + "step": 3743 + }, + { + "epoch": 0.39, + "grad_norm": 2.915704362648865, + "learning_rate": 6.9108582884569206e-06, + "loss": 0.6947, + "step": 3744 + }, + { + "epoch": 0.39, + "grad_norm": 2.4243198642597314, + "learning_rate": 6.909283305730822e-06, + "loss": 0.7129, + "step": 3745 + }, + { + "epoch": 0.39, + "grad_norm": 1.961387262313486, + "learning_rate": 6.907708101189705e-06, + "loss": 0.7032, + "step": 3746 + }, + { + "epoch": 0.39, + "grad_norm": 2.7889776855961554, + "learning_rate": 6.906132675016577e-06, + "loss": 0.5773, + "step": 3747 + }, + { + "epoch": 0.39, + "grad_norm": 4.106825781647384, + "learning_rate": 6.904557027394464e-06, + "loss": 0.5643, + "step": 3748 + }, + { + "epoch": 0.39, + "grad_norm": 4.128759223475058, + "learning_rate": 6.902981158506421e-06, + "loss": 0.6652, + "step": 3749 + }, + { + "epoch": 0.39, + "grad_norm": 1.0212995877258428, + "learning_rate": 6.90140506853553e-06, + "loss": 0.5981, + "step": 3750 + }, + { + "epoch": 0.39, + "grad_norm": 2.410233497856306, + "learning_rate": 6.899828757664892e-06, + "loss": 0.6367, + "step": 3751 + }, + { + "epoch": 0.39, + "grad_norm": 2.5548357809398037, + "learning_rate": 6.898252226077642e-06, + "loss": 0.6518, + "step": 3752 + }, + { + "epoch": 0.39, + "grad_norm": 1.8928951894823056, + "learning_rate": 6.896675473956935e-06, + "loss": 0.6251, + "step": 3753 + }, + { + "epoch": 0.4, + "grad_norm": 2.8686199304183004, + "learning_rate": 6.895098501485955e-06, + "loss": 0.6949, + "step": 3754 + }, + { + "epoch": 0.4, + "grad_norm": 2.4005163798318754, + "learning_rate": 6.8935213088479096e-06, + "loss": 0.6532, + "step": 3755 + }, + { + "epoch": 0.4, + "grad_norm": 1.083088075895648, + "learning_rate": 6.891943896226031e-06, + "loss": 0.6361, + "step": 3756 + }, + { + "epoch": 0.4, + "grad_norm": 2.3207947203497694, + "learning_rate": 6.890366263803579e-06, + "loss": 0.6775, + "step": 3757 + }, + { + "epoch": 0.4, + "grad_norm": 2.8029690208205515, + "learning_rate": 6.88878841176384e-06, + "loss": 0.614, + "step": 3758 + }, + { + "epoch": 0.4, + "grad_norm": 1.913955180413663, + "learning_rate": 6.887210340290124e-06, + "loss": 0.6718, + "step": 3759 + }, + { + "epoch": 0.4, + "grad_norm": 2.4336755807961237, + "learning_rate": 6.885632049565766e-06, + "loss": 0.6733, + "step": 3760 + }, + { + "epoch": 0.4, + "grad_norm": 3.0818816161132827, + "learning_rate": 6.884053539774125e-06, + "loss": 0.6907, + "step": 3761 + }, + { + "epoch": 0.4, + "grad_norm": 4.6742613246704785, + "learning_rate": 6.882474811098592e-06, + "loss": 0.6829, + "step": 3762 + }, + { + "epoch": 0.4, + "grad_norm": 1.9849851340007725, + "learning_rate": 6.880895863722576e-06, + "loss": 0.7172, + "step": 3763 + }, + { + "epoch": 0.4, + "grad_norm": 3.0447314238097247, + "learning_rate": 6.8793166978295166e-06, + "loss": 0.631, + "step": 3764 + }, + { + "epoch": 0.4, + "grad_norm": 3.91470175770067, + "learning_rate": 6.877737313602876e-06, + "loss": 0.6654, + "step": 3765 + }, + { + "epoch": 0.4, + "grad_norm": 2.1785559232413867, + "learning_rate": 6.8761577112261425e-06, + "loss": 0.6318, + "step": 3766 + }, + { + "epoch": 0.4, + "grad_norm": 2.669250198673815, + "learning_rate": 6.874577890882829e-06, + "loss": 0.6375, + "step": 3767 + }, + { + "epoch": 0.4, + "grad_norm": 2.9415389760168593, + "learning_rate": 6.872997852756474e-06, + "loss": 0.6013, + "step": 3768 + }, + { + "epoch": 0.4, + "grad_norm": 2.4526688080708596, + "learning_rate": 6.871417597030644e-06, + "loss": 0.6684, + "step": 3769 + }, + { + "epoch": 0.4, + "grad_norm": 2.788823894332462, + "learning_rate": 6.869837123888926e-06, + "loss": 0.6253, + "step": 3770 + }, + { + "epoch": 0.4, + "grad_norm": 4.477851029463746, + "learning_rate": 6.868256433514938e-06, + "loss": 0.7383, + "step": 3771 + }, + { + "epoch": 0.4, + "grad_norm": 2.269796038141395, + "learning_rate": 6.866675526092317e-06, + "loss": 0.7487, + "step": 3772 + }, + { + "epoch": 0.4, + "grad_norm": 2.648425239153188, + "learning_rate": 6.86509440180473e-06, + "loss": 0.7038, + "step": 3773 + }, + { + "epoch": 0.4, + "grad_norm": 2.375374473672006, + "learning_rate": 6.863513060835866e-06, + "loss": 0.6327, + "step": 3774 + }, + { + "epoch": 0.4, + "grad_norm": 1.1344879105982528, + "learning_rate": 6.861931503369441e-06, + "loss": 0.6503, + "step": 3775 + }, + { + "epoch": 0.4, + "grad_norm": 3.840442440534222, + "learning_rate": 6.8603497295891975e-06, + "loss": 0.5909, + "step": 3776 + }, + { + "epoch": 0.4, + "grad_norm": 3.9504848651490403, + "learning_rate": 6.8587677396789e-06, + "loss": 0.6947, + "step": 3777 + }, + { + "epoch": 0.4, + "grad_norm": 2.764177497995866, + "learning_rate": 6.8571855338223395e-06, + "loss": 0.6062, + "step": 3778 + }, + { + "epoch": 0.4, + "grad_norm": 3.440160369843259, + "learning_rate": 6.855603112203333e-06, + "loss": 0.6853, + "step": 3779 + }, + { + "epoch": 0.4, + "grad_norm": 2.2362406157836525, + "learning_rate": 6.854020475005719e-06, + "loss": 0.6329, + "step": 3780 + }, + { + "epoch": 0.4, + "grad_norm": 1.1361521836950608, + "learning_rate": 6.852437622413366e-06, + "loss": 0.5591, + "step": 3781 + }, + { + "epoch": 0.4, + "grad_norm": 2.2958928068872213, + "learning_rate": 6.850854554610167e-06, + "loss": 0.7093, + "step": 3782 + }, + { + "epoch": 0.4, + "grad_norm": 2.311428807633486, + "learning_rate": 6.849271271780034e-06, + "loss": 0.6284, + "step": 3783 + }, + { + "epoch": 0.4, + "grad_norm": 2.8763588330667194, + "learning_rate": 6.847687774106911e-06, + "loss": 0.6576, + "step": 3784 + }, + { + "epoch": 0.4, + "grad_norm": 2.382119861964838, + "learning_rate": 6.846104061774763e-06, + "loss": 0.6577, + "step": 3785 + }, + { + "epoch": 0.4, + "grad_norm": 2.0655614717842052, + "learning_rate": 6.8445201349675825e-06, + "loss": 0.6832, + "step": 3786 + }, + { + "epoch": 0.4, + "grad_norm": 2.8485737316689894, + "learning_rate": 6.842935993869385e-06, + "loss": 0.6554, + "step": 3787 + }, + { + "epoch": 0.4, + "grad_norm": 2.6641033052353102, + "learning_rate": 6.841351638664211e-06, + "loss": 0.7066, + "step": 3788 + }, + { + "epoch": 0.4, + "grad_norm": 2.636994092546543, + "learning_rate": 6.839767069536126e-06, + "loss": 0.5704, + "step": 3789 + }, + { + "epoch": 0.4, + "grad_norm": 2.1059693268960524, + "learning_rate": 6.838182286669222e-06, + "loss": 0.6862, + "step": 3790 + }, + { + "epoch": 0.4, + "grad_norm": 2.389714565065946, + "learning_rate": 6.8365972902476115e-06, + "loss": 0.6674, + "step": 3791 + }, + { + "epoch": 0.4, + "grad_norm": 2.3867920333404187, + "learning_rate": 6.835012080455439e-06, + "loss": 0.6521, + "step": 3792 + }, + { + "epoch": 0.4, + "grad_norm": 2.6159918503535615, + "learning_rate": 6.833426657476866e-06, + "loss": 0.6519, + "step": 3793 + }, + { + "epoch": 0.4, + "grad_norm": 2.226258850434491, + "learning_rate": 6.831841021496084e-06, + "loss": 0.7133, + "step": 3794 + }, + { + "epoch": 0.4, + "grad_norm": 2.899461642291162, + "learning_rate": 6.830255172697309e-06, + "loss": 0.6671, + "step": 3795 + }, + { + "epoch": 0.4, + "grad_norm": 2.3305187723569447, + "learning_rate": 6.828669111264776e-06, + "loss": 0.6724, + "step": 3796 + }, + { + "epoch": 0.4, + "grad_norm": 2.4055301005112946, + "learning_rate": 6.8270828373827536e-06, + "loss": 0.8054, + "step": 3797 + }, + { + "epoch": 0.4, + "grad_norm": 2.2070086468105874, + "learning_rate": 6.825496351235528e-06, + "loss": 0.6663, + "step": 3798 + }, + { + "epoch": 0.4, + "grad_norm": 2.465861609693202, + "learning_rate": 6.823909653007414e-06, + "loss": 0.6668, + "step": 3799 + }, + { + "epoch": 0.4, + "grad_norm": 2.7446295867469264, + "learning_rate": 6.8223227428827485e-06, + "loss": 0.6663, + "step": 3800 + }, + { + "epoch": 0.4, + "grad_norm": 2.2768346355361744, + "learning_rate": 6.820735621045895e-06, + "loss": 0.6675, + "step": 3801 + }, + { + "epoch": 0.4, + "grad_norm": 2.3820614356193843, + "learning_rate": 6.81914828768124e-06, + "loss": 0.689, + "step": 3802 + }, + { + "epoch": 0.4, + "grad_norm": 2.993505071374713, + "learning_rate": 6.817560742973196e-06, + "loss": 0.7702, + "step": 3803 + }, + { + "epoch": 0.4, + "grad_norm": 2.25574301607733, + "learning_rate": 6.8159729871061984e-06, + "loss": 0.6731, + "step": 3804 + }, + { + "epoch": 0.4, + "grad_norm": 3.880274384584933, + "learning_rate": 6.814385020264708e-06, + "loss": 0.7064, + "step": 3805 + }, + { + "epoch": 0.4, + "grad_norm": 2.6969745237833136, + "learning_rate": 6.812796842633213e-06, + "loss": 0.7117, + "step": 3806 + }, + { + "epoch": 0.4, + "grad_norm": 2.10614106669541, + "learning_rate": 6.811208454396218e-06, + "loss": 0.7491, + "step": 3807 + }, + { + "epoch": 0.4, + "grad_norm": 4.469718689532213, + "learning_rate": 6.809619855738262e-06, + "loss": 0.6633, + "step": 3808 + }, + { + "epoch": 0.4, + "grad_norm": 2.6641844592932244, + "learning_rate": 6.8080310468439015e-06, + "loss": 0.6784, + "step": 3809 + }, + { + "epoch": 0.4, + "grad_norm": 6.960823525262656, + "learning_rate": 6.806442027897722e-06, + "loss": 0.6114, + "step": 3810 + }, + { + "epoch": 0.4, + "grad_norm": 1.963680773700924, + "learning_rate": 6.804852799084329e-06, + "loss": 0.6952, + "step": 3811 + }, + { + "epoch": 0.4, + "grad_norm": 2.327360252561889, + "learning_rate": 6.803263360588355e-06, + "loss": 0.6871, + "step": 3812 + }, + { + "epoch": 0.4, + "grad_norm": 2.6946911855419358, + "learning_rate": 6.801673712594456e-06, + "loss": 0.7192, + "step": 3813 + }, + { + "epoch": 0.4, + "grad_norm": 2.7849793322013197, + "learning_rate": 6.8000838552873135e-06, + "loss": 0.6403, + "step": 3814 + }, + { + "epoch": 0.4, + "grad_norm": 2.3535336364449804, + "learning_rate": 6.7984937888516325e-06, + "loss": 0.708, + "step": 3815 + }, + { + "epoch": 0.4, + "grad_norm": 4.483684306418722, + "learning_rate": 6.796903513472142e-06, + "loss": 0.6605, + "step": 3816 + }, + { + "epoch": 0.4, + "grad_norm": 3.157785406165776, + "learning_rate": 6.795313029333596e-06, + "loss": 0.576, + "step": 3817 + }, + { + "epoch": 0.4, + "grad_norm": 2.243032193761467, + "learning_rate": 6.79372233662077e-06, + "loss": 0.5717, + "step": 3818 + }, + { + "epoch": 0.4, + "grad_norm": 3.0101289906232367, + "learning_rate": 6.79213143551847e-06, + "loss": 0.5542, + "step": 3819 + }, + { + "epoch": 0.4, + "grad_norm": 3.8401966485891643, + "learning_rate": 6.790540326211519e-06, + "loss": 0.6712, + "step": 3820 + }, + { + "epoch": 0.4, + "grad_norm": 2.757074235173517, + "learning_rate": 6.78894900888477e-06, + "loss": 0.6846, + "step": 3821 + }, + { + "epoch": 0.4, + "grad_norm": 4.674263473701696, + "learning_rate": 6.787357483723096e-06, + "loss": 0.7419, + "step": 3822 + }, + { + "epoch": 0.4, + "grad_norm": 2.606110641324874, + "learning_rate": 6.785765750911396e-06, + "loss": 0.6317, + "step": 3823 + }, + { + "epoch": 0.4, + "grad_norm": 2.1675322487659723, + "learning_rate": 6.7841738106345935e-06, + "loss": 0.6373, + "step": 3824 + }, + { + "epoch": 0.4, + "grad_norm": 2.2312923273279153, + "learning_rate": 6.782581663077637e-06, + "loss": 0.6132, + "step": 3825 + }, + { + "epoch": 0.4, + "grad_norm": 2.6581842644039164, + "learning_rate": 6.780989308425493e-06, + "loss": 0.6928, + "step": 3826 + }, + { + "epoch": 0.4, + "grad_norm": 2.246504818054555, + "learning_rate": 6.77939674686316e-06, + "loss": 0.6127, + "step": 3827 + }, + { + "epoch": 0.4, + "grad_norm": 2.7698626969798905, + "learning_rate": 6.7778039785756575e-06, + "loss": 0.6525, + "step": 3828 + }, + { + "epoch": 0.4, + "grad_norm": 2.4450311084573038, + "learning_rate": 6.7762110037480265e-06, + "loss": 0.6343, + "step": 3829 + }, + { + "epoch": 0.4, + "grad_norm": 4.123523847157009, + "learning_rate": 6.774617822565338e-06, + "loss": 0.6966, + "step": 3830 + }, + { + "epoch": 0.4, + "grad_norm": 2.7410973711990247, + "learning_rate": 6.773024435212678e-06, + "loss": 0.6834, + "step": 3831 + }, + { + "epoch": 0.4, + "grad_norm": 10.729971927019776, + "learning_rate": 6.771430841875166e-06, + "loss": 0.7335, + "step": 3832 + }, + { + "epoch": 0.4, + "grad_norm": 2.461919317112508, + "learning_rate": 6.7698370427379405e-06, + "loss": 0.7462, + "step": 3833 + }, + { + "epoch": 0.4, + "grad_norm": 2.764361831143325, + "learning_rate": 6.7682430379861615e-06, + "loss": 0.6316, + "step": 3834 + }, + { + "epoch": 0.4, + "grad_norm": 2.09261926247929, + "learning_rate": 6.766648827805019e-06, + "loss": 0.6006, + "step": 3835 + }, + { + "epoch": 0.4, + "grad_norm": 3.804234407107111, + "learning_rate": 6.765054412379722e-06, + "loss": 0.6852, + "step": 3836 + }, + { + "epoch": 0.4, + "grad_norm": 2.798217238649048, + "learning_rate": 6.763459791895506e-06, + "loss": 0.6397, + "step": 3837 + }, + { + "epoch": 0.4, + "grad_norm": 2.6552060440599377, + "learning_rate": 6.761864966537629e-06, + "loss": 0.5773, + "step": 3838 + }, + { + "epoch": 0.4, + "grad_norm": 2.0551205343106513, + "learning_rate": 6.760269936491373e-06, + "loss": 0.7127, + "step": 3839 + }, + { + "epoch": 0.4, + "grad_norm": 2.2555093947538323, + "learning_rate": 6.7586747019420444e-06, + "loss": 0.6463, + "step": 3840 + }, + { + "epoch": 0.4, + "grad_norm": 2.508518018543992, + "learning_rate": 6.7570792630749725e-06, + "loss": 0.6151, + "step": 3841 + }, + { + "epoch": 0.4, + "grad_norm": 2.2115480509362024, + "learning_rate": 6.755483620075509e-06, + "loss": 0.6153, + "step": 3842 + }, + { + "epoch": 0.4, + "grad_norm": 2.5571778252409416, + "learning_rate": 6.753887773129036e-06, + "loss": 0.6822, + "step": 3843 + }, + { + "epoch": 0.4, + "grad_norm": 3.3827224492687704, + "learning_rate": 6.752291722420951e-06, + "loss": 0.6651, + "step": 3844 + }, + { + "epoch": 0.4, + "grad_norm": 5.068263890882636, + "learning_rate": 6.750695468136679e-06, + "loss": 0.7441, + "step": 3845 + }, + { + "epoch": 0.4, + "grad_norm": 2.599488472042048, + "learning_rate": 6.7490990104616684e-06, + "loss": 0.6053, + "step": 3846 + }, + { + "epoch": 0.4, + "grad_norm": 5.068319078405129, + "learning_rate": 6.74750234958139e-06, + "loss": 0.6563, + "step": 3847 + }, + { + "epoch": 0.4, + "grad_norm": 3.4661100791938293, + "learning_rate": 6.745905485681341e-06, + "loss": 0.605, + "step": 3848 + }, + { + "epoch": 0.41, + "grad_norm": 2.8783604735605857, + "learning_rate": 6.7443084189470385e-06, + "loss": 0.6415, + "step": 3849 + }, + { + "epoch": 0.41, + "grad_norm": 4.159785400047253, + "learning_rate": 6.742711149564028e-06, + "loss": 0.7075, + "step": 3850 + }, + { + "epoch": 0.41, + "grad_norm": 2.5896401663997524, + "learning_rate": 6.741113677717872e-06, + "loss": 0.7611, + "step": 3851 + }, + { + "epoch": 0.41, + "grad_norm": 3.423614667213638, + "learning_rate": 6.7395160035941624e-06, + "loss": 0.7156, + "step": 3852 + }, + { + "epoch": 0.41, + "grad_norm": 2.874352439060714, + "learning_rate": 6.737918127378511e-06, + "loss": 0.5727, + "step": 3853 + }, + { + "epoch": 0.41, + "grad_norm": 2.2310646192841106, + "learning_rate": 6.736320049256557e-06, + "loss": 0.6234, + "step": 3854 + }, + { + "epoch": 0.41, + "grad_norm": 3.078116761718466, + "learning_rate": 6.734721769413959e-06, + "loss": 0.6838, + "step": 3855 + }, + { + "epoch": 0.41, + "grad_norm": 3.047167191192513, + "learning_rate": 6.733123288036399e-06, + "loss": 0.739, + "step": 3856 + }, + { + "epoch": 0.41, + "grad_norm": 2.98595396084378, + "learning_rate": 6.731524605309587e-06, + "loss": 0.6049, + "step": 3857 + }, + { + "epoch": 0.41, + "grad_norm": 2.152132778242366, + "learning_rate": 6.72992572141925e-06, + "loss": 0.6779, + "step": 3858 + }, + { + "epoch": 0.41, + "grad_norm": 3.411218764669821, + "learning_rate": 6.728326636551145e-06, + "loss": 0.6093, + "step": 3859 + }, + { + "epoch": 0.41, + "grad_norm": 2.1157066121453587, + "learning_rate": 6.726727350891047e-06, + "loss": 0.6165, + "step": 3860 + }, + { + "epoch": 0.41, + "grad_norm": 2.068433705431225, + "learning_rate": 6.725127864624757e-06, + "loss": 0.5947, + "step": 3861 + }, + { + "epoch": 0.41, + "grad_norm": 2.8708815594354373, + "learning_rate": 6.723528177938097e-06, + "loss": 0.6354, + "step": 3862 + }, + { + "epoch": 0.41, + "grad_norm": 3.33467858012365, + "learning_rate": 6.721928291016917e-06, + "loss": 0.7318, + "step": 3863 + }, + { + "epoch": 0.41, + "grad_norm": 2.803975371742299, + "learning_rate": 6.720328204047085e-06, + "loss": 0.6814, + "step": 3864 + }, + { + "epoch": 0.41, + "grad_norm": 1.1642857039266927, + "learning_rate": 6.718727917214496e-06, + "loss": 0.6067, + "step": 3865 + }, + { + "epoch": 0.41, + "grad_norm": 2.80771219731655, + "learning_rate": 6.7171274307050645e-06, + "loss": 0.6535, + "step": 3866 + }, + { + "epoch": 0.41, + "grad_norm": 2.0379961191421874, + "learning_rate": 6.715526744704732e-06, + "loss": 0.6385, + "step": 3867 + }, + { + "epoch": 0.41, + "grad_norm": 2.2538626104452626, + "learning_rate": 6.7139258593994625e-06, + "loss": 0.6347, + "step": 3868 + }, + { + "epoch": 0.41, + "grad_norm": 2.5560569234327355, + "learning_rate": 6.712324774975241e-06, + "loss": 0.6909, + "step": 3869 + }, + { + "epoch": 0.41, + "grad_norm": 2.4724982923209953, + "learning_rate": 6.710723491618077e-06, + "loss": 0.676, + "step": 3870 + }, + { + "epoch": 0.41, + "grad_norm": 2.692509799497406, + "learning_rate": 6.709122009514003e-06, + "loss": 0.673, + "step": 3871 + }, + { + "epoch": 0.41, + "grad_norm": 2.2919297459345502, + "learning_rate": 6.707520328849074e-06, + "loss": 0.6929, + "step": 3872 + }, + { + "epoch": 0.41, + "grad_norm": 2.882493860740183, + "learning_rate": 6.7059184498093696e-06, + "loss": 0.6791, + "step": 3873 + }, + { + "epoch": 0.41, + "grad_norm": 3.6357067311902735, + "learning_rate": 6.70431637258099e-06, + "loss": 0.7293, + "step": 3874 + }, + { + "epoch": 0.41, + "grad_norm": 8.322268264406318, + "learning_rate": 6.702714097350063e-06, + "loss": 0.6628, + "step": 3875 + }, + { + "epoch": 0.41, + "grad_norm": 2.4783117896301388, + "learning_rate": 6.701111624302732e-06, + "loss": 0.6576, + "step": 3876 + }, + { + "epoch": 0.41, + "grad_norm": 2.5768699541037674, + "learning_rate": 6.69950895362517e-06, + "loss": 0.7493, + "step": 3877 + }, + { + "epoch": 0.41, + "grad_norm": 5.682147247687123, + "learning_rate": 6.697906085503572e-06, + "loss": 0.6796, + "step": 3878 + }, + { + "epoch": 0.41, + "grad_norm": 2.2802199978055913, + "learning_rate": 6.696303020124152e-06, + "loss": 0.6168, + "step": 3879 + }, + { + "epoch": 0.41, + "grad_norm": 2.264182484401708, + "learning_rate": 6.694699757673151e-06, + "loss": 0.6551, + "step": 3880 + }, + { + "epoch": 0.41, + "grad_norm": 2.5897290859559083, + "learning_rate": 6.693096298336832e-06, + "loss": 0.6541, + "step": 3881 + }, + { + "epoch": 0.41, + "grad_norm": 2.05265338675788, + "learning_rate": 6.691492642301478e-06, + "loss": 0.6983, + "step": 3882 + }, + { + "epoch": 0.41, + "grad_norm": 2.55619801462537, + "learning_rate": 6.6898887897533985e-06, + "loss": 0.7378, + "step": 3883 + }, + { + "epoch": 0.41, + "grad_norm": 3.7122207089430885, + "learning_rate": 6.6882847408789255e-06, + "loss": 0.6772, + "step": 3884 + }, + { + "epoch": 0.41, + "grad_norm": 2.8904289019733236, + "learning_rate": 6.686680495864411e-06, + "loss": 0.6001, + "step": 3885 + }, + { + "epoch": 0.41, + "grad_norm": 2.1642154485237306, + "learning_rate": 6.6850760548962315e-06, + "loss": 0.7032, + "step": 3886 + }, + { + "epoch": 0.41, + "grad_norm": 9.085180115876389, + "learning_rate": 6.683471418160787e-06, + "loss": 0.6206, + "step": 3887 + }, + { + "epoch": 0.41, + "grad_norm": 2.33182419264645, + "learning_rate": 6.681866585844501e-06, + "loss": 0.5902, + "step": 3888 + }, + { + "epoch": 0.41, + "grad_norm": 2.158864307849004, + "learning_rate": 6.680261558133816e-06, + "loss": 0.5937, + "step": 3889 + }, + { + "epoch": 0.41, + "grad_norm": 2.7965197556825463, + "learning_rate": 6.678656335215199e-06, + "loss": 0.6601, + "step": 3890 + }, + { + "epoch": 0.41, + "grad_norm": 2.8204118744480993, + "learning_rate": 6.677050917275143e-06, + "loss": 0.6472, + "step": 3891 + }, + { + "epoch": 0.41, + "grad_norm": 3.4806015074983594, + "learning_rate": 6.675445304500159e-06, + "loss": 0.6548, + "step": 3892 + }, + { + "epoch": 0.41, + "grad_norm": 1.1229091161769098, + "learning_rate": 6.673839497076783e-06, + "loss": 0.5833, + "step": 3893 + }, + { + "epoch": 0.41, + "grad_norm": 3.1338450581533546, + "learning_rate": 6.672233495191572e-06, + "loss": 0.6849, + "step": 3894 + }, + { + "epoch": 0.41, + "grad_norm": 2.3209497422936622, + "learning_rate": 6.670627299031109e-06, + "loss": 0.7225, + "step": 3895 + }, + { + "epoch": 0.41, + "grad_norm": 2.76307749364276, + "learning_rate": 6.669020908781994e-06, + "loss": 0.6026, + "step": 3896 + }, + { + "epoch": 0.41, + "grad_norm": 2.4047612077616933, + "learning_rate": 6.667414324630856e-06, + "loss": 0.6428, + "step": 3897 + }, + { + "epoch": 0.41, + "grad_norm": 8.962203572754492, + "learning_rate": 6.665807546764341e-06, + "loss": 0.6385, + "step": 3898 + }, + { + "epoch": 0.41, + "grad_norm": 2.3629283595989983, + "learning_rate": 6.664200575369121e-06, + "loss": 0.6119, + "step": 3899 + }, + { + "epoch": 0.41, + "grad_norm": 3.1049318573672307, + "learning_rate": 6.662593410631888e-06, + "loss": 0.6005, + "step": 3900 + }, + { + "epoch": 0.41, + "grad_norm": 2.1219922103429094, + "learning_rate": 6.660986052739357e-06, + "loss": 0.6908, + "step": 3901 + }, + { + "epoch": 0.41, + "grad_norm": 5.5048628027957625, + "learning_rate": 6.659378501878271e-06, + "loss": 0.5462, + "step": 3902 + }, + { + "epoch": 0.41, + "grad_norm": 2.081524730634284, + "learning_rate": 6.657770758235387e-06, + "loss": 0.6065, + "step": 3903 + }, + { + "epoch": 0.41, + "grad_norm": 2.286231378737835, + "learning_rate": 6.656162821997487e-06, + "loss": 0.6641, + "step": 3904 + }, + { + "epoch": 0.41, + "grad_norm": 3.018088065090707, + "learning_rate": 6.654554693351379e-06, + "loss": 0.5698, + "step": 3905 + }, + { + "epoch": 0.41, + "grad_norm": 2.0133508154340487, + "learning_rate": 6.652946372483889e-06, + "loss": 0.6064, + "step": 3906 + }, + { + "epoch": 0.41, + "grad_norm": 2.681912809689596, + "learning_rate": 6.651337859581868e-06, + "loss": 0.6913, + "step": 3907 + }, + { + "epoch": 0.41, + "grad_norm": 3.0310073690024075, + "learning_rate": 6.6497291548321876e-06, + "loss": 0.6523, + "step": 3908 + }, + { + "epoch": 0.41, + "grad_norm": 2.3565442897512345, + "learning_rate": 6.648120258421744e-06, + "loss": 0.6395, + "step": 3909 + }, + { + "epoch": 0.41, + "grad_norm": 2.52692979561079, + "learning_rate": 6.646511170537452e-06, + "loss": 0.6792, + "step": 3910 + }, + { + "epoch": 0.41, + "grad_norm": 1.21140083501598, + "learning_rate": 6.644901891366251e-06, + "loss": 0.5643, + "step": 3911 + }, + { + "epoch": 0.41, + "grad_norm": 1.0497136827512572, + "learning_rate": 6.643292421095105e-06, + "loss": 0.5934, + "step": 3912 + }, + { + "epoch": 0.41, + "grad_norm": 3.4574095439368064, + "learning_rate": 6.641682759910993e-06, + "loss": 0.6165, + "step": 3913 + }, + { + "epoch": 0.41, + "grad_norm": 3.1072803681357684, + "learning_rate": 6.640072908000926e-06, + "loss": 0.7101, + "step": 3914 + }, + { + "epoch": 0.41, + "grad_norm": 2.449280579698595, + "learning_rate": 6.638462865551929e-06, + "loss": 0.631, + "step": 3915 + }, + { + "epoch": 0.41, + "grad_norm": 2.300143552458361, + "learning_rate": 6.6368526327510515e-06, + "loss": 0.6856, + "step": 3916 + }, + { + "epoch": 0.41, + "grad_norm": 2.1842280323695746, + "learning_rate": 6.635242209785369e-06, + "loss": 0.7335, + "step": 3917 + }, + { + "epoch": 0.41, + "grad_norm": 4.49088278011825, + "learning_rate": 6.633631596841972e-06, + "loss": 0.7089, + "step": 3918 + }, + { + "epoch": 0.41, + "grad_norm": 2.9551946728374507, + "learning_rate": 6.63202079410798e-06, + "loss": 0.7198, + "step": 3919 + }, + { + "epoch": 0.41, + "grad_norm": 2.5169889269326773, + "learning_rate": 6.630409801770528e-06, + "loss": 0.6483, + "step": 3920 + }, + { + "epoch": 0.41, + "grad_norm": 2.5244649430685646, + "learning_rate": 6.6287986200167785e-06, + "loss": 0.6174, + "step": 3921 + }, + { + "epoch": 0.41, + "grad_norm": 2.6269573809565423, + "learning_rate": 6.627187249033915e-06, + "loss": 0.6274, + "step": 3922 + }, + { + "epoch": 0.41, + "grad_norm": 2.855773301785785, + "learning_rate": 6.6255756890091394e-06, + "loss": 0.6258, + "step": 3923 + }, + { + "epoch": 0.41, + "grad_norm": 2.4211612959153817, + "learning_rate": 6.6239639401296796e-06, + "loss": 0.6101, + "step": 3924 + }, + { + "epoch": 0.41, + "grad_norm": 2.259452031681648, + "learning_rate": 6.622352002582783e-06, + "loss": 0.6934, + "step": 3925 + }, + { + "epoch": 0.41, + "grad_norm": 3.0056254388720265, + "learning_rate": 6.620739876555721e-06, + "loss": 0.7024, + "step": 3926 + }, + { + "epoch": 0.41, + "grad_norm": 2.950640775072797, + "learning_rate": 6.619127562235786e-06, + "loss": 0.7432, + "step": 3927 + }, + { + "epoch": 0.41, + "grad_norm": 2.111457506555049, + "learning_rate": 6.61751505981029e-06, + "loss": 0.6785, + "step": 3928 + }, + { + "epoch": 0.41, + "grad_norm": 5.205473900545925, + "learning_rate": 6.615902369466571e-06, + "loss": 0.6155, + "step": 3929 + }, + { + "epoch": 0.41, + "grad_norm": 1.1442082817667982, + "learning_rate": 6.614289491391985e-06, + "loss": 0.6085, + "step": 3930 + }, + { + "epoch": 0.41, + "grad_norm": 3.097758491032023, + "learning_rate": 6.612676425773914e-06, + "loss": 0.6214, + "step": 3931 + }, + { + "epoch": 0.41, + "grad_norm": 4.102972042758122, + "learning_rate": 6.6110631727997566e-06, + "loss": 0.6056, + "step": 3932 + }, + { + "epoch": 0.41, + "grad_norm": 1.9937462472618483, + "learning_rate": 6.609449732656936e-06, + "loss": 0.6597, + "step": 3933 + }, + { + "epoch": 0.41, + "grad_norm": 2.5897301973961366, + "learning_rate": 6.6078361055328986e-06, + "loss": 0.6214, + "step": 3934 + }, + { + "epoch": 0.41, + "grad_norm": 1.0149618917681156, + "learning_rate": 6.606222291615112e-06, + "loss": 0.6023, + "step": 3935 + }, + { + "epoch": 0.41, + "grad_norm": 2.253201954300004, + "learning_rate": 6.60460829109106e-06, + "loss": 0.6611, + "step": 3936 + }, + { + "epoch": 0.41, + "grad_norm": 2.9592197675312657, + "learning_rate": 6.602994104148256e-06, + "loss": 0.7145, + "step": 3937 + }, + { + "epoch": 0.41, + "grad_norm": 2.2284657591030883, + "learning_rate": 6.601379730974231e-06, + "loss": 0.618, + "step": 3938 + }, + { + "epoch": 0.41, + "grad_norm": 4.557356204052841, + "learning_rate": 6.599765171756538e-06, + "loss": 0.6578, + "step": 3939 + }, + { + "epoch": 0.41, + "grad_norm": 6.091674295085208, + "learning_rate": 6.598150426682752e-06, + "loss": 0.6179, + "step": 3940 + }, + { + "epoch": 0.41, + "grad_norm": 2.0618356021886943, + "learning_rate": 6.596535495940468e-06, + "loss": 0.6168, + "step": 3941 + }, + { + "epoch": 0.41, + "grad_norm": 2.4424041643505827, + "learning_rate": 6.594920379717307e-06, + "loss": 0.6277, + "step": 3942 + }, + { + "epoch": 0.41, + "grad_norm": 2.6813309064292485, + "learning_rate": 6.593305078200907e-06, + "loss": 0.6051, + "step": 3943 + }, + { + "epoch": 0.42, + "grad_norm": 2.067586995607296, + "learning_rate": 6.591689591578927e-06, + "loss": 0.631, + "step": 3944 + }, + { + "epoch": 0.42, + "grad_norm": 2.1568079707086967, + "learning_rate": 6.590073920039052e-06, + "loss": 0.6446, + "step": 3945 + }, + { + "epoch": 0.42, + "grad_norm": 2.7031629573317324, + "learning_rate": 6.588458063768985e-06, + "loss": 0.6328, + "step": 3946 + }, + { + "epoch": 0.42, + "grad_norm": 2.406212144524127, + "learning_rate": 6.586842022956453e-06, + "loss": 0.6518, + "step": 3947 + }, + { + "epoch": 0.42, + "grad_norm": 4.66916693541006, + "learning_rate": 6.585225797789201e-06, + "loss": 0.6624, + "step": 3948 + }, + { + "epoch": 0.42, + "grad_norm": 2.2582234615243375, + "learning_rate": 6.583609388454998e-06, + "loss": 0.555, + "step": 3949 + }, + { + "epoch": 0.42, + "grad_norm": 2.7199146063706134, + "learning_rate": 6.581992795141634e-06, + "loss": 0.6526, + "step": 3950 + }, + { + "epoch": 0.42, + "grad_norm": 2.5390407294947397, + "learning_rate": 6.580376018036921e-06, + "loss": 0.6671, + "step": 3951 + }, + { + "epoch": 0.42, + "grad_norm": 4.7476904706174, + "learning_rate": 6.578759057328691e-06, + "loss": 0.6829, + "step": 3952 + }, + { + "epoch": 0.42, + "grad_norm": 2.5382661667053528, + "learning_rate": 6.5771419132047965e-06, + "loss": 0.6645, + "step": 3953 + }, + { + "epoch": 0.42, + "grad_norm": 2.4043076498085805, + "learning_rate": 6.5755245858531135e-06, + "loss": 0.5974, + "step": 3954 + }, + { + "epoch": 0.42, + "grad_norm": 3.0677814480436205, + "learning_rate": 6.573907075461538e-06, + "loss": 0.6752, + "step": 3955 + }, + { + "epoch": 0.42, + "grad_norm": 2.3299465309321636, + "learning_rate": 6.57228938221799e-06, + "loss": 0.7099, + "step": 3956 + }, + { + "epoch": 0.42, + "grad_norm": 3.5992623283864615, + "learning_rate": 6.5706715063104065e-06, + "loss": 0.7091, + "step": 3957 + }, + { + "epoch": 0.42, + "grad_norm": 3.1185187454894776, + "learning_rate": 6.569053447926746e-06, + "loss": 0.5476, + "step": 3958 + }, + { + "epoch": 0.42, + "grad_norm": 3.4670327552541123, + "learning_rate": 6.567435207254993e-06, + "loss": 0.679, + "step": 3959 + }, + { + "epoch": 0.42, + "grad_norm": 4.602615705445102, + "learning_rate": 6.565816784483147e-06, + "loss": 0.6774, + "step": 3960 + }, + { + "epoch": 0.42, + "grad_norm": 5.735033391286663, + "learning_rate": 6.564198179799234e-06, + "loss": 0.6466, + "step": 3961 + }, + { + "epoch": 0.42, + "grad_norm": 3.8877061902133954, + "learning_rate": 6.5625793933912985e-06, + "loss": 0.594, + "step": 3962 + }, + { + "epoch": 0.42, + "grad_norm": 2.814956124692221, + "learning_rate": 6.5609604254474065e-06, + "loss": 0.6185, + "step": 3963 + }, + { + "epoch": 0.42, + "grad_norm": 1.1484699815057058, + "learning_rate": 6.559341276155644e-06, + "loss": 0.6048, + "step": 3964 + }, + { + "epoch": 0.42, + "grad_norm": 2.872534078697945, + "learning_rate": 6.55772194570412e-06, + "loss": 0.6446, + "step": 3965 + }, + { + "epoch": 0.42, + "grad_norm": 5.229090156228293, + "learning_rate": 6.5561024342809625e-06, + "loss": 0.6325, + "step": 3966 + }, + { + "epoch": 0.42, + "grad_norm": 2.5077624702277195, + "learning_rate": 6.554482742074323e-06, + "loss": 0.633, + "step": 3967 + }, + { + "epoch": 0.42, + "grad_norm": 2.6547530593774695, + "learning_rate": 6.552862869272371e-06, + "loss": 0.6035, + "step": 3968 + }, + { + "epoch": 0.42, + "grad_norm": 3.367826287135605, + "learning_rate": 6.551242816063302e-06, + "loss": 0.6763, + "step": 3969 + }, + { + "epoch": 0.42, + "grad_norm": 2.4066120293512174, + "learning_rate": 6.5496225826353254e-06, + "loss": 0.6515, + "step": 3970 + }, + { + "epoch": 0.42, + "grad_norm": 2.5260789922901496, + "learning_rate": 6.548002169176677e-06, + "loss": 0.651, + "step": 3971 + }, + { + "epoch": 0.42, + "grad_norm": 2.3066437486350457, + "learning_rate": 6.54638157587561e-06, + "loss": 0.5797, + "step": 3972 + }, + { + "epoch": 0.42, + "grad_norm": 2.181813637695597, + "learning_rate": 6.544760802920402e-06, + "loss": 0.6741, + "step": 3973 + }, + { + "epoch": 0.42, + "grad_norm": 3.3250608385498466, + "learning_rate": 6.543139850499352e-06, + "loss": 0.6409, + "step": 3974 + }, + { + "epoch": 0.42, + "grad_norm": 2.3644447611901835, + "learning_rate": 6.5415187188007726e-06, + "loss": 0.6754, + "step": 3975 + }, + { + "epoch": 0.42, + "grad_norm": 2.405053664018577, + "learning_rate": 6.539897408013005e-06, + "loss": 0.6084, + "step": 3976 + }, + { + "epoch": 0.42, + "grad_norm": 2.1188551440111305, + "learning_rate": 6.538275918324408e-06, + "loss": 0.7079, + "step": 3977 + }, + { + "epoch": 0.42, + "grad_norm": 2.439917462899966, + "learning_rate": 6.536654249923361e-06, + "loss": 0.6137, + "step": 3978 + }, + { + "epoch": 0.42, + "grad_norm": 2.6239270579211404, + "learning_rate": 6.535032402998266e-06, + "loss": 0.5237, + "step": 3979 + }, + { + "epoch": 0.42, + "grad_norm": 3.4664442974636303, + "learning_rate": 6.533410377737544e-06, + "loss": 0.6955, + "step": 3980 + }, + { + "epoch": 0.42, + "grad_norm": 2.3044985882207807, + "learning_rate": 6.531788174329636e-06, + "loss": 0.5246, + "step": 3981 + }, + { + "epoch": 0.42, + "grad_norm": 2.007003515117586, + "learning_rate": 6.5301657929630055e-06, + "loss": 0.6039, + "step": 3982 + }, + { + "epoch": 0.42, + "grad_norm": 1.2164380112159745, + "learning_rate": 6.5285432338261365e-06, + "loss": 0.616, + "step": 3983 + }, + { + "epoch": 0.42, + "grad_norm": 2.257824172908572, + "learning_rate": 6.526920497107535e-06, + "loss": 0.7186, + "step": 3984 + }, + { + "epoch": 0.42, + "grad_norm": 1.093528093998501, + "learning_rate": 6.525297582995722e-06, + "loss": 0.5979, + "step": 3985 + }, + { + "epoch": 0.42, + "grad_norm": 2.3370494508343467, + "learning_rate": 6.523674491679246e-06, + "loss": 0.639, + "step": 3986 + }, + { + "epoch": 0.42, + "grad_norm": 3.1729179222230477, + "learning_rate": 6.522051223346672e-06, + "loss": 0.6788, + "step": 3987 + }, + { + "epoch": 0.42, + "grad_norm": 4.462532611308276, + "learning_rate": 6.520427778186586e-06, + "loss": 0.7091, + "step": 3988 + }, + { + "epoch": 0.42, + "grad_norm": 2.673478312971706, + "learning_rate": 6.518804156387597e-06, + "loss": 0.6831, + "step": 3989 + }, + { + "epoch": 0.42, + "grad_norm": 2.9158479320514985, + "learning_rate": 6.517180358138332e-06, + "loss": 0.6926, + "step": 3990 + }, + { + "epoch": 0.42, + "grad_norm": 2.635411987857093, + "learning_rate": 6.515556383627437e-06, + "loss": 0.666, + "step": 3991 + }, + { + "epoch": 0.42, + "grad_norm": 2.2626676614651906, + "learning_rate": 6.513932233043584e-06, + "loss": 0.7135, + "step": 3992 + }, + { + "epoch": 0.42, + "grad_norm": 2.83263520238375, + "learning_rate": 6.512307906575459e-06, + "loss": 0.5915, + "step": 3993 + }, + { + "epoch": 0.42, + "grad_norm": 2.204046953504845, + "learning_rate": 6.510683404411774e-06, + "loss": 0.6271, + "step": 3994 + }, + { + "epoch": 0.42, + "grad_norm": 2.593299117358978, + "learning_rate": 6.509058726741258e-06, + "loss": 0.6616, + "step": 3995 + }, + { + "epoch": 0.42, + "grad_norm": 1.996690826929998, + "learning_rate": 6.50743387375266e-06, + "loss": 0.6885, + "step": 3996 + }, + { + "epoch": 0.42, + "grad_norm": 12.933011472212376, + "learning_rate": 6.505808845634753e-06, + "loss": 0.6603, + "step": 3997 + }, + { + "epoch": 0.42, + "grad_norm": 2.453187947445511, + "learning_rate": 6.504183642576327e-06, + "loss": 0.7046, + "step": 3998 + }, + { + "epoch": 0.42, + "grad_norm": 2.4465722034806157, + "learning_rate": 6.502558264766194e-06, + "loss": 0.5633, + "step": 3999 + }, + { + "epoch": 0.42, + "grad_norm": 2.341833452210207, + "learning_rate": 6.500932712393185e-06, + "loss": 0.6048, + "step": 4000 + }, + { + "epoch": 0.42, + "grad_norm": 2.5314039211160955, + "learning_rate": 6.499306985646152e-06, + "loss": 0.6565, + "step": 4001 + }, + { + "epoch": 0.42, + "grad_norm": 2.307652014877147, + "learning_rate": 6.497681084713969e-06, + "loss": 0.6175, + "step": 4002 + }, + { + "epoch": 0.42, + "grad_norm": 3.7936798318747624, + "learning_rate": 6.496055009785526e-06, + "loss": 0.6992, + "step": 4003 + }, + { + "epoch": 0.42, + "grad_norm": 2.6272219714248983, + "learning_rate": 6.494428761049736e-06, + "loss": 0.6342, + "step": 4004 + }, + { + "epoch": 0.42, + "grad_norm": 2.1079646961470906, + "learning_rate": 6.492802338695533e-06, + "loss": 0.6606, + "step": 4005 + }, + { + "epoch": 0.42, + "grad_norm": 5.297324004495899, + "learning_rate": 6.491175742911869e-06, + "loss": 0.6246, + "step": 4006 + }, + { + "epoch": 0.42, + "grad_norm": 2.6530141636498663, + "learning_rate": 6.489548973887717e-06, + "loss": 0.6609, + "step": 4007 + }, + { + "epoch": 0.42, + "grad_norm": 3.1063440835651397, + "learning_rate": 6.4879220318120735e-06, + "loss": 0.6974, + "step": 4008 + }, + { + "epoch": 0.42, + "grad_norm": 2.3068250586790517, + "learning_rate": 6.486294916873947e-06, + "loss": 0.6624, + "step": 4009 + }, + { + "epoch": 0.42, + "grad_norm": 2.459942839783369, + "learning_rate": 6.484667629262375e-06, + "loss": 0.6707, + "step": 4010 + }, + { + "epoch": 0.42, + "grad_norm": 2.1027394322760666, + "learning_rate": 6.4830401691664106e-06, + "loss": 0.6924, + "step": 4011 + }, + { + "epoch": 0.42, + "grad_norm": 1.1296970727448212, + "learning_rate": 6.481412536775125e-06, + "loss": 0.5682, + "step": 4012 + }, + { + "epoch": 0.42, + "grad_norm": 2.62301953143833, + "learning_rate": 6.479784732277612e-06, + "loss": 0.6738, + "step": 4013 + }, + { + "epoch": 0.42, + "grad_norm": 2.8179450664566996, + "learning_rate": 6.478156755862988e-06, + "loss": 0.6378, + "step": 4014 + }, + { + "epoch": 0.42, + "grad_norm": 2.7307924595408544, + "learning_rate": 6.4765286077203844e-06, + "loss": 0.5877, + "step": 4015 + }, + { + "epoch": 0.42, + "grad_norm": 4.571571986675738, + "learning_rate": 6.4749002880389566e-06, + "loss": 0.6638, + "step": 4016 + }, + { + "epoch": 0.42, + "grad_norm": 2.603346420805311, + "learning_rate": 6.473271797007876e-06, + "loss": 0.6619, + "step": 4017 + }, + { + "epoch": 0.42, + "grad_norm": 2.174453465113902, + "learning_rate": 6.471643134816336e-06, + "loss": 0.7298, + "step": 4018 + }, + { + "epoch": 0.42, + "grad_norm": 2.5043155473625514, + "learning_rate": 6.470014301653552e-06, + "loss": 0.6076, + "step": 4019 + }, + { + "epoch": 0.42, + "grad_norm": 2.8733996645916116, + "learning_rate": 6.468385297708755e-06, + "loss": 0.6422, + "step": 4020 + }, + { + "epoch": 0.42, + "grad_norm": 1.0053641792809447, + "learning_rate": 6.466756123171199e-06, + "loss": 0.6017, + "step": 4021 + }, + { + "epoch": 0.42, + "grad_norm": 2.170505948062594, + "learning_rate": 6.465126778230156e-06, + "loss": 0.6564, + "step": 4022 + }, + { + "epoch": 0.42, + "grad_norm": 2.2632578787097453, + "learning_rate": 6.46349726307492e-06, + "loss": 0.6804, + "step": 4023 + }, + { + "epoch": 0.42, + "grad_norm": 2.936556398536724, + "learning_rate": 6.4618675778948026e-06, + "loss": 0.6739, + "step": 4024 + }, + { + "epoch": 0.42, + "grad_norm": 2.1638172540039973, + "learning_rate": 6.4602377228791345e-06, + "loss": 0.6198, + "step": 4025 + }, + { + "epoch": 0.42, + "grad_norm": 2.599114630574275, + "learning_rate": 6.458607698217271e-06, + "loss": 0.5974, + "step": 4026 + }, + { + "epoch": 0.42, + "grad_norm": 3.359851858598546, + "learning_rate": 6.45697750409858e-06, + "loss": 0.6207, + "step": 4027 + }, + { + "epoch": 0.42, + "grad_norm": 1.0414819211011148, + "learning_rate": 6.455347140712455e-06, + "loss": 0.5482, + "step": 4028 + }, + { + "epoch": 0.42, + "grad_norm": 1.9073262379332099, + "learning_rate": 6.453716608248306e-06, + "loss": 0.7179, + "step": 4029 + }, + { + "epoch": 0.42, + "grad_norm": 2.894734987817326, + "learning_rate": 6.452085906895564e-06, + "loss": 0.6275, + "step": 4030 + }, + { + "epoch": 0.42, + "grad_norm": 6.409131230764491, + "learning_rate": 6.450455036843677e-06, + "loss": 0.5708, + "step": 4031 + }, + { + "epoch": 0.42, + "grad_norm": 2.626357432074768, + "learning_rate": 6.448823998282119e-06, + "loss": 0.6892, + "step": 4032 + }, + { + "epoch": 0.42, + "grad_norm": 2.3318787973963424, + "learning_rate": 6.447192791400375e-06, + "loss": 0.6684, + "step": 4033 + }, + { + "epoch": 0.42, + "grad_norm": 2.341295423290045, + "learning_rate": 6.4455614163879576e-06, + "loss": 0.7042, + "step": 4034 + }, + { + "epoch": 0.42, + "grad_norm": 2.51877291354241, + "learning_rate": 6.443929873434392e-06, + "loss": 0.6783, + "step": 4035 + }, + { + "epoch": 0.42, + "grad_norm": 2.530723558434043, + "learning_rate": 6.442298162729229e-06, + "loss": 0.652, + "step": 4036 + }, + { + "epoch": 0.42, + "grad_norm": 2.544615833177302, + "learning_rate": 6.4406662844620346e-06, + "loss": 0.6204, + "step": 4037 + }, + { + "epoch": 0.42, + "grad_norm": 2.6992866068672643, + "learning_rate": 6.439034238822396e-06, + "loss": 0.6114, + "step": 4038 + }, + { + "epoch": 0.43, + "grad_norm": 2.361537811446634, + "learning_rate": 6.4374020259999194e-06, + "loss": 0.6084, + "step": 4039 + }, + { + "epoch": 0.43, + "grad_norm": 3.3932419299570737, + "learning_rate": 6.43576964618423e-06, + "loss": 0.5435, + "step": 4040 + }, + { + "epoch": 0.43, + "grad_norm": 2.2247947497001896, + "learning_rate": 6.4341370995649735e-06, + "loss": 0.6274, + "step": 4041 + }, + { + "epoch": 0.43, + "grad_norm": 4.203698930496953, + "learning_rate": 6.4325043863318136e-06, + "loss": 0.6563, + "step": 4042 + }, + { + "epoch": 0.43, + "grad_norm": 2.6533644667178184, + "learning_rate": 6.430871506674437e-06, + "loss": 0.5898, + "step": 4043 + }, + { + "epoch": 0.43, + "grad_norm": 3.265730806748852, + "learning_rate": 6.429238460782543e-06, + "loss": 0.5698, + "step": 4044 + }, + { + "epoch": 0.43, + "grad_norm": 2.1642899955947548, + "learning_rate": 6.427605248845859e-06, + "loss": 0.6199, + "step": 4045 + }, + { + "epoch": 0.43, + "grad_norm": 2.6003134035460604, + "learning_rate": 6.42597187105412e-06, + "loss": 0.6055, + "step": 4046 + }, + { + "epoch": 0.43, + "grad_norm": 2.2193001480951926, + "learning_rate": 6.4243383275970924e-06, + "loss": 0.671, + "step": 4047 + }, + { + "epoch": 0.43, + "grad_norm": 3.5572778898263016, + "learning_rate": 6.422704618664557e-06, + "loss": 0.6435, + "step": 4048 + }, + { + "epoch": 0.43, + "grad_norm": 4.222157891641835, + "learning_rate": 6.42107074444631e-06, + "loss": 0.6501, + "step": 4049 + }, + { + "epoch": 0.43, + "grad_norm": 2.725601175114304, + "learning_rate": 6.419436705132172e-06, + "loss": 0.6936, + "step": 4050 + }, + { + "epoch": 0.43, + "grad_norm": 2.4112413353819484, + "learning_rate": 6.4178025009119825e-06, + "loss": 0.6389, + "step": 4051 + }, + { + "epoch": 0.43, + "grad_norm": 2.589327647378624, + "learning_rate": 6.416168131975595e-06, + "loss": 0.6985, + "step": 4052 + }, + { + "epoch": 0.43, + "grad_norm": 2.4983806609239703, + "learning_rate": 6.414533598512887e-06, + "loss": 0.6664, + "step": 4053 + }, + { + "epoch": 0.43, + "grad_norm": 2.240075636586892, + "learning_rate": 6.412898900713757e-06, + "loss": 0.6559, + "step": 4054 + }, + { + "epoch": 0.43, + "grad_norm": 2.8847812663112897, + "learning_rate": 6.411264038768115e-06, + "loss": 0.6703, + "step": 4055 + }, + { + "epoch": 0.43, + "grad_norm": 2.367558885670087, + "learning_rate": 6.409629012865899e-06, + "loss": 0.5792, + "step": 4056 + }, + { + "epoch": 0.43, + "grad_norm": 3.3855942924416382, + "learning_rate": 6.407993823197056e-06, + "loss": 0.7401, + "step": 4057 + }, + { + "epoch": 0.43, + "grad_norm": 3.876331931348521, + "learning_rate": 6.406358469951562e-06, + "loss": 0.6485, + "step": 4058 + }, + { + "epoch": 0.43, + "grad_norm": 2.206685527616314, + "learning_rate": 6.404722953319406e-06, + "loss": 0.6424, + "step": 4059 + }, + { + "epoch": 0.43, + "grad_norm": 2.075266887505169, + "learning_rate": 6.403087273490599e-06, + "loss": 0.6713, + "step": 4060 + }, + { + "epoch": 0.43, + "grad_norm": 2.856229721355921, + "learning_rate": 6.401451430655168e-06, + "loss": 0.628, + "step": 4061 + }, + { + "epoch": 0.43, + "grad_norm": 2.0646056376603608, + "learning_rate": 6.399815425003161e-06, + "loss": 0.6839, + "step": 4062 + }, + { + "epoch": 0.43, + "grad_norm": 2.0561582164002683, + "learning_rate": 6.398179256724644e-06, + "loss": 0.6327, + "step": 4063 + }, + { + "epoch": 0.43, + "grad_norm": 2.1686914862568094, + "learning_rate": 6.396542926009703e-06, + "loss": 0.5924, + "step": 4064 + }, + { + "epoch": 0.43, + "grad_norm": 2.452702288596918, + "learning_rate": 6.394906433048442e-06, + "loss": 0.6387, + "step": 4065 + }, + { + "epoch": 0.43, + "grad_norm": 3.400913216633786, + "learning_rate": 6.3932697780309825e-06, + "loss": 0.6913, + "step": 4066 + }, + { + "epoch": 0.43, + "grad_norm": 2.108636102788443, + "learning_rate": 6.3916329611474705e-06, + "loss": 0.6406, + "step": 4067 + }, + { + "epoch": 0.43, + "grad_norm": 2.411671185444273, + "learning_rate": 6.389995982588061e-06, + "loss": 0.6552, + "step": 4068 + }, + { + "epoch": 0.43, + "grad_norm": 2.186150326143783, + "learning_rate": 6.388358842542939e-06, + "loss": 0.6234, + "step": 4069 + }, + { + "epoch": 0.43, + "grad_norm": 4.859739939616308, + "learning_rate": 6.386721541202296e-06, + "loss": 0.6838, + "step": 4070 + }, + { + "epoch": 0.43, + "grad_norm": 2.460365538292539, + "learning_rate": 6.3850840787563565e-06, + "loss": 0.5673, + "step": 4071 + }, + { + "epoch": 0.43, + "grad_norm": 7.402069347480076, + "learning_rate": 6.383446455395352e-06, + "loss": 0.6837, + "step": 4072 + }, + { + "epoch": 0.43, + "grad_norm": 3.2533740088151664, + "learning_rate": 6.3818086713095374e-06, + "loss": 0.6788, + "step": 4073 + }, + { + "epoch": 0.43, + "grad_norm": 2.2858335624830324, + "learning_rate": 6.380170726689185e-06, + "loss": 0.7238, + "step": 4074 + }, + { + "epoch": 0.43, + "grad_norm": 2.484411238766058, + "learning_rate": 6.378532621724588e-06, + "loss": 0.6292, + "step": 4075 + }, + { + "epoch": 0.43, + "grad_norm": 8.867064268787102, + "learning_rate": 6.376894356606056e-06, + "loss": 0.5916, + "step": 4076 + }, + { + "epoch": 0.43, + "grad_norm": 2.291422789683661, + "learning_rate": 6.375255931523916e-06, + "loss": 0.6287, + "step": 4077 + }, + { + "epoch": 0.43, + "grad_norm": 2.1675402432820405, + "learning_rate": 6.373617346668519e-06, + "loss": 0.5687, + "step": 4078 + }, + { + "epoch": 0.43, + "grad_norm": 2.2952408596599088, + "learning_rate": 6.371978602230229e-06, + "loss": 0.6288, + "step": 4079 + }, + { + "epoch": 0.43, + "grad_norm": 4.141696812562484, + "learning_rate": 6.370339698399432e-06, + "loss": 0.6161, + "step": 4080 + }, + { + "epoch": 0.43, + "grad_norm": 3.8668906189282164, + "learning_rate": 6.3687006353665285e-06, + "loss": 0.6, + "step": 4081 + }, + { + "epoch": 0.43, + "grad_norm": 3.230532760665731, + "learning_rate": 6.367061413321942e-06, + "loss": 0.6102, + "step": 4082 + }, + { + "epoch": 0.43, + "grad_norm": 2.68067571548342, + "learning_rate": 6.365422032456113e-06, + "loss": 0.64, + "step": 4083 + }, + { + "epoch": 0.43, + "grad_norm": 2.574913062898836, + "learning_rate": 6.363782492959499e-06, + "loss": 0.6528, + "step": 4084 + }, + { + "epoch": 0.43, + "grad_norm": 4.168916407705001, + "learning_rate": 6.362142795022578e-06, + "loss": 0.6213, + "step": 4085 + }, + { + "epoch": 0.43, + "grad_norm": 2.7508588377575767, + "learning_rate": 6.360502938835844e-06, + "loss": 0.6605, + "step": 4086 + }, + { + "epoch": 0.43, + "grad_norm": 3.4135918342912577, + "learning_rate": 6.35886292458981e-06, + "loss": 0.6797, + "step": 4087 + }, + { + "epoch": 0.43, + "grad_norm": 2.555329036173374, + "learning_rate": 6.35722275247501e-06, + "loss": 0.691, + "step": 4088 + }, + { + "epoch": 0.43, + "grad_norm": 6.3576452705524025, + "learning_rate": 6.355582422681996e-06, + "loss": 0.6184, + "step": 4089 + }, + { + "epoch": 0.43, + "grad_norm": 3.01931185233685, + "learning_rate": 6.353941935401333e-06, + "loss": 0.6206, + "step": 4090 + }, + { + "epoch": 0.43, + "grad_norm": 2.5007928234352677, + "learning_rate": 6.352301290823611e-06, + "loss": 0.6169, + "step": 4091 + }, + { + "epoch": 0.43, + "grad_norm": 3.556883601705964, + "learning_rate": 6.350660489139433e-06, + "loss": 0.6329, + "step": 4092 + }, + { + "epoch": 0.43, + "grad_norm": 2.5252161752585427, + "learning_rate": 6.349019530539425e-06, + "loss": 0.6555, + "step": 4093 + }, + { + "epoch": 0.43, + "grad_norm": 2.5247540508658526, + "learning_rate": 6.347378415214226e-06, + "loss": 0.6158, + "step": 4094 + }, + { + "epoch": 0.43, + "grad_norm": 2.182461495472191, + "learning_rate": 6.3457371433544975e-06, + "loss": 0.7378, + "step": 4095 + }, + { + "epoch": 0.43, + "grad_norm": 2.6257667849308226, + "learning_rate": 6.34409571515092e-06, + "loss": 0.6825, + "step": 4096 + }, + { + "epoch": 0.43, + "grad_norm": 2.4163772839213733, + "learning_rate": 6.342454130794186e-06, + "loss": 0.5942, + "step": 4097 + }, + { + "epoch": 0.43, + "grad_norm": 2.548472388600027, + "learning_rate": 6.340812390475012e-06, + "loss": 0.6612, + "step": 4098 + }, + { + "epoch": 0.43, + "grad_norm": 2.6494112784435204, + "learning_rate": 6.33917049438413e-06, + "loss": 0.7117, + "step": 4099 + }, + { + "epoch": 0.43, + "grad_norm": 4.975703696322628, + "learning_rate": 6.3375284427122915e-06, + "loss": 0.726, + "step": 4100 + }, + { + "epoch": 0.43, + "grad_norm": 2.124929550258763, + "learning_rate": 6.335886235650264e-06, + "loss": 0.599, + "step": 4101 + }, + { + "epoch": 0.43, + "grad_norm": 2.5941633006024762, + "learning_rate": 6.334243873388838e-06, + "loss": 0.7624, + "step": 4102 + }, + { + "epoch": 0.43, + "grad_norm": 7.256377886619029, + "learning_rate": 6.332601356118813e-06, + "loss": 0.5449, + "step": 4103 + }, + { + "epoch": 0.43, + "grad_norm": 3.9556721482690977, + "learning_rate": 6.330958684031016e-06, + "loss": 0.6911, + "step": 4104 + }, + { + "epoch": 0.43, + "grad_norm": 2.733540693074302, + "learning_rate": 6.329315857316285e-06, + "loss": 0.6602, + "step": 4105 + }, + { + "epoch": 0.43, + "grad_norm": 2.101209748842294, + "learning_rate": 6.327672876165481e-06, + "loss": 0.6334, + "step": 4106 + }, + { + "epoch": 0.43, + "grad_norm": 2.230682903167034, + "learning_rate": 6.326029740769481e-06, + "loss": 0.6723, + "step": 4107 + }, + { + "epoch": 0.43, + "grad_norm": 3.8797719596760216, + "learning_rate": 6.324386451319179e-06, + "loss": 0.6211, + "step": 4108 + }, + { + "epoch": 0.43, + "grad_norm": 3.1406370239153274, + "learning_rate": 6.322743008005488e-06, + "loss": 0.6444, + "step": 4109 + }, + { + "epoch": 0.43, + "grad_norm": 4.712312589801154, + "learning_rate": 6.321099411019336e-06, + "loss": 0.6464, + "step": 4110 + }, + { + "epoch": 0.43, + "grad_norm": 2.233736620239088, + "learning_rate": 6.319455660551674e-06, + "loss": 0.6325, + "step": 4111 + }, + { + "epoch": 0.43, + "grad_norm": 2.038358257077351, + "learning_rate": 6.317811756793467e-06, + "loss": 0.6742, + "step": 4112 + }, + { + "epoch": 0.43, + "grad_norm": 2.143450702921909, + "learning_rate": 6.316167699935702e-06, + "loss": 0.6708, + "step": 4113 + }, + { + "epoch": 0.43, + "grad_norm": 3.8705182545419534, + "learning_rate": 6.314523490169375e-06, + "loss": 0.5526, + "step": 4114 + }, + { + "epoch": 0.43, + "grad_norm": 2.073511807722659, + "learning_rate": 6.312879127685512e-06, + "loss": 0.6384, + "step": 4115 + }, + { + "epoch": 0.43, + "grad_norm": 2.653135511720908, + "learning_rate": 6.311234612675143e-06, + "loss": 0.709, + "step": 4116 + }, + { + "epoch": 0.43, + "grad_norm": 2.0677937210863715, + "learning_rate": 6.309589945329332e-06, + "loss": 0.6508, + "step": 4117 + }, + { + "epoch": 0.43, + "grad_norm": 3.338910592281395, + "learning_rate": 6.307945125839143e-06, + "loss": 0.678, + "step": 4118 + }, + { + "epoch": 0.43, + "grad_norm": 2.715799303343155, + "learning_rate": 6.3063001543956715e-06, + "loss": 0.6564, + "step": 4119 + }, + { + "epoch": 0.43, + "grad_norm": 2.4933350769017357, + "learning_rate": 6.304655031190024e-06, + "loss": 0.5819, + "step": 4120 + }, + { + "epoch": 0.43, + "grad_norm": 2.044652665239333, + "learning_rate": 6.303009756413327e-06, + "loss": 0.6325, + "step": 4121 + }, + { + "epoch": 0.43, + "grad_norm": 6.095867866117808, + "learning_rate": 6.3013643302567225e-06, + "loss": 0.6967, + "step": 4122 + }, + { + "epoch": 0.43, + "grad_norm": 2.849872100475443, + "learning_rate": 6.299718752911371e-06, + "loss": 0.7464, + "step": 4123 + }, + { + "epoch": 0.43, + "grad_norm": 1.9748453249208904, + "learning_rate": 6.298073024568454e-06, + "loss": 0.6542, + "step": 4124 + }, + { + "epoch": 0.43, + "grad_norm": 2.2600324826958365, + "learning_rate": 6.296427145419164e-06, + "loss": 0.5836, + "step": 4125 + }, + { + "epoch": 0.43, + "grad_norm": 2.802818402196517, + "learning_rate": 6.294781115654718e-06, + "loss": 0.6668, + "step": 4126 + }, + { + "epoch": 0.43, + "grad_norm": 2.676232174418196, + "learning_rate": 6.293134935466342e-06, + "loss": 0.6596, + "step": 4127 + }, + { + "epoch": 0.43, + "grad_norm": 2.228739627526291, + "learning_rate": 6.291488605045288e-06, + "loss": 0.5731, + "step": 4128 + }, + { + "epoch": 0.43, + "grad_norm": 2.6049506326721703, + "learning_rate": 6.289842124582822e-06, + "loss": 0.5658, + "step": 4129 + }, + { + "epoch": 0.43, + "grad_norm": 2.6541990876497694, + "learning_rate": 6.2881954942702265e-06, + "loss": 0.6145, + "step": 4130 + }, + { + "epoch": 0.43, + "grad_norm": 1.0595704649971625, + "learning_rate": 6.286548714298801e-06, + "loss": 0.6131, + "step": 4131 + }, + { + "epoch": 0.43, + "grad_norm": 2.420647946200874, + "learning_rate": 6.284901784859866e-06, + "loss": 0.6737, + "step": 4132 + }, + { + "epoch": 0.43, + "grad_norm": 3.041851672678566, + "learning_rate": 6.283254706144756e-06, + "loss": 0.6566, + "step": 4133 + }, + { + "epoch": 0.44, + "grad_norm": 2.712373901252542, + "learning_rate": 6.281607478344823e-06, + "loss": 0.6394, + "step": 4134 + }, + { + "epoch": 0.44, + "grad_norm": 2.4741338459449853, + "learning_rate": 6.279960101651439e-06, + "loss": 0.7035, + "step": 4135 + }, + { + "epoch": 0.44, + "grad_norm": 1.9857951374629823, + "learning_rate": 6.278312576255988e-06, + "loss": 0.6226, + "step": 4136 + }, + { + "epoch": 0.44, + "grad_norm": 3.3494664192954384, + "learning_rate": 6.276664902349881e-06, + "loss": 0.6609, + "step": 4137 + }, + { + "epoch": 0.44, + "grad_norm": 2.2270633620793228, + "learning_rate": 6.275017080124533e-06, + "loss": 0.6487, + "step": 4138 + }, + { + "epoch": 0.44, + "grad_norm": 3.2352633292562363, + "learning_rate": 6.273369109771387e-06, + "loss": 0.7118, + "step": 4139 + }, + { + "epoch": 0.44, + "grad_norm": 2.268087699809955, + "learning_rate": 6.271720991481897e-06, + "loss": 0.6689, + "step": 4140 + }, + { + "epoch": 0.44, + "grad_norm": 5.099756078374374, + "learning_rate": 6.270072725447542e-06, + "loss": 0.7536, + "step": 4141 + }, + { + "epoch": 0.44, + "grad_norm": 2.4160875739712644, + "learning_rate": 6.268424311859808e-06, + "loss": 0.6797, + "step": 4142 + }, + { + "epoch": 0.44, + "grad_norm": 3.1608916128751066, + "learning_rate": 6.266775750910203e-06, + "loss": 0.6239, + "step": 4143 + }, + { + "epoch": 0.44, + "grad_norm": 2.642655322409538, + "learning_rate": 6.265127042790253e-06, + "loss": 0.7106, + "step": 4144 + }, + { + "epoch": 0.44, + "grad_norm": 8.48267619018257, + "learning_rate": 6.263478187691502e-06, + "loss": 0.6842, + "step": 4145 + }, + { + "epoch": 0.44, + "grad_norm": 2.2073464735347583, + "learning_rate": 6.2618291858055065e-06, + "loss": 0.716, + "step": 4146 + }, + { + "epoch": 0.44, + "grad_norm": 3.2192677534009446, + "learning_rate": 6.260180037323843e-06, + "loss": 0.6008, + "step": 4147 + }, + { + "epoch": 0.44, + "grad_norm": 2.8880065557661028, + "learning_rate": 6.258530742438107e-06, + "loss": 0.6526, + "step": 4148 + }, + { + "epoch": 0.44, + "grad_norm": 1.923108042812208, + "learning_rate": 6.256881301339907e-06, + "loss": 0.5658, + "step": 4149 + }, + { + "epoch": 0.44, + "grad_norm": 2.3215056242914405, + "learning_rate": 6.255231714220871e-06, + "loss": 0.664, + "step": 4150 + }, + { + "epoch": 0.44, + "grad_norm": 2.598137114476907, + "learning_rate": 6.253581981272641e-06, + "loss": 0.6033, + "step": 4151 + }, + { + "epoch": 0.44, + "grad_norm": 3.292031250462775, + "learning_rate": 6.251932102686883e-06, + "loss": 0.6393, + "step": 4152 + }, + { + "epoch": 0.44, + "grad_norm": 2.9711599193058196, + "learning_rate": 6.2502820786552695e-06, + "loss": 0.7229, + "step": 4153 + }, + { + "epoch": 0.44, + "grad_norm": 3.4776790489556118, + "learning_rate": 6.2486319093695006e-06, + "loss": 0.6444, + "step": 4154 + }, + { + "epoch": 0.44, + "grad_norm": 2.574952252391362, + "learning_rate": 6.246981595021284e-06, + "loss": 0.6376, + "step": 4155 + }, + { + "epoch": 0.44, + "grad_norm": 2.485791306054375, + "learning_rate": 6.245331135802351e-06, + "loss": 0.6843, + "step": 4156 + }, + { + "epoch": 0.44, + "grad_norm": 2.581681311574103, + "learning_rate": 6.243680531904448e-06, + "loss": 0.6037, + "step": 4157 + }, + { + "epoch": 0.44, + "grad_norm": 3.6015305354626954, + "learning_rate": 6.242029783519334e-06, + "loss": 0.6377, + "step": 4158 + }, + { + "epoch": 0.44, + "grad_norm": 2.514525749517879, + "learning_rate": 6.240378890838792e-06, + "loss": 0.688, + "step": 4159 + }, + { + "epoch": 0.44, + "grad_norm": 2.1131329820828673, + "learning_rate": 6.238727854054614e-06, + "loss": 0.6627, + "step": 4160 + }, + { + "epoch": 0.44, + "grad_norm": 2.3327635807486757, + "learning_rate": 6.237076673358616e-06, + "loss": 0.6625, + "step": 4161 + }, + { + "epoch": 0.44, + "grad_norm": 2.564116461115227, + "learning_rate": 6.235425348942625e-06, + "loss": 0.6025, + "step": 4162 + }, + { + "epoch": 0.44, + "grad_norm": 25.51131291171202, + "learning_rate": 6.2337738809984905e-06, + "loss": 0.6809, + "step": 4163 + }, + { + "epoch": 0.44, + "grad_norm": 2.404343930899044, + "learning_rate": 6.23212226971807e-06, + "loss": 0.6425, + "step": 4164 + }, + { + "epoch": 0.44, + "grad_norm": 2.986695766080571, + "learning_rate": 6.230470515293248e-06, + "loss": 0.674, + "step": 4165 + }, + { + "epoch": 0.44, + "grad_norm": 2.5976631521397846, + "learning_rate": 6.2288186179159175e-06, + "loss": 0.6709, + "step": 4166 + }, + { + "epoch": 0.44, + "grad_norm": 12.729513809408251, + "learning_rate": 6.227166577777992e-06, + "loss": 0.6686, + "step": 4167 + }, + { + "epoch": 0.44, + "grad_norm": 2.51466132483747, + "learning_rate": 6.225514395071401e-06, + "loss": 0.7087, + "step": 4168 + }, + { + "epoch": 0.44, + "grad_norm": 2.7437128880343717, + "learning_rate": 6.223862069988091e-06, + "loss": 0.6417, + "step": 4169 + }, + { + "epoch": 0.44, + "grad_norm": 3.7709270732185365, + "learning_rate": 6.222209602720023e-06, + "loss": 0.6492, + "step": 4170 + }, + { + "epoch": 0.44, + "grad_norm": 2.6653325226917635, + "learning_rate": 6.220556993459174e-06, + "loss": 0.6882, + "step": 4171 + }, + { + "epoch": 0.44, + "grad_norm": 2.290790865423674, + "learning_rate": 6.218904242397546e-06, + "loss": 0.6277, + "step": 4172 + }, + { + "epoch": 0.44, + "grad_norm": 3.0699405954785104, + "learning_rate": 6.217251349727145e-06, + "loss": 0.7121, + "step": 4173 + }, + { + "epoch": 0.44, + "grad_norm": 2.734217356286444, + "learning_rate": 6.215598315640001e-06, + "loss": 0.7427, + "step": 4174 + }, + { + "epoch": 0.44, + "grad_norm": 2.713834982597247, + "learning_rate": 6.213945140328157e-06, + "loss": 0.6955, + "step": 4175 + }, + { + "epoch": 0.44, + "grad_norm": 5.311432316870064, + "learning_rate": 6.212291823983678e-06, + "loss": 0.6851, + "step": 4176 + }, + { + "epoch": 0.44, + "grad_norm": 2.399335810386324, + "learning_rate": 6.2106383667986385e-06, + "loss": 0.6589, + "step": 4177 + }, + { + "epoch": 0.44, + "grad_norm": 2.1179610916249185, + "learning_rate": 6.208984768965133e-06, + "loss": 0.585, + "step": 4178 + }, + { + "epoch": 0.44, + "grad_norm": 3.492230402979365, + "learning_rate": 6.207331030675272e-06, + "loss": 0.6906, + "step": 4179 + }, + { + "epoch": 0.44, + "grad_norm": 10.05084961096072, + "learning_rate": 6.2056771521211815e-06, + "loss": 0.6719, + "step": 4180 + }, + { + "epoch": 0.44, + "grad_norm": 2.525761336265004, + "learning_rate": 6.204023133495005e-06, + "loss": 0.6352, + "step": 4181 + }, + { + "epoch": 0.44, + "grad_norm": 2.2251077581550414, + "learning_rate": 6.2023689749889e-06, + "loss": 0.6294, + "step": 4182 + }, + { + "epoch": 0.44, + "grad_norm": 2.3917017204363615, + "learning_rate": 6.2007146767950455e-06, + "loss": 0.6562, + "step": 4183 + }, + { + "epoch": 0.44, + "grad_norm": 5.7924703663410675, + "learning_rate": 6.199060239105628e-06, + "loss": 0.6758, + "step": 4184 + }, + { + "epoch": 0.44, + "grad_norm": 2.3713543077213854, + "learning_rate": 6.197405662112862e-06, + "loss": 0.6653, + "step": 4185 + }, + { + "epoch": 0.44, + "grad_norm": 2.439148199597049, + "learning_rate": 6.195750946008965e-06, + "loss": 0.6896, + "step": 4186 + }, + { + "epoch": 0.44, + "grad_norm": 3.7616942745004667, + "learning_rate": 6.19409609098618e-06, + "loss": 0.6445, + "step": 4187 + }, + { + "epoch": 0.44, + "grad_norm": 2.610494475761507, + "learning_rate": 6.192441097236762e-06, + "loss": 0.7334, + "step": 4188 + }, + { + "epoch": 0.44, + "grad_norm": 2.098962128255649, + "learning_rate": 6.190785964952985e-06, + "loss": 0.634, + "step": 4189 + }, + { + "epoch": 0.44, + "grad_norm": 1.11012265793801, + "learning_rate": 6.189130694327138e-06, + "loss": 0.6304, + "step": 4190 + }, + { + "epoch": 0.44, + "grad_norm": 2.6761648104534377, + "learning_rate": 6.187475285551523e-06, + "loss": 0.6746, + "step": 4191 + }, + { + "epoch": 0.44, + "grad_norm": 2.6521807011331933, + "learning_rate": 6.185819738818463e-06, + "loss": 0.6235, + "step": 4192 + }, + { + "epoch": 0.44, + "grad_norm": 3.263291536626014, + "learning_rate": 6.184164054320293e-06, + "loss": 0.6194, + "step": 4193 + }, + { + "epoch": 0.44, + "grad_norm": 1.0093262515523898, + "learning_rate": 6.1825082322493655e-06, + "loss": 0.6021, + "step": 4194 + }, + { + "epoch": 0.44, + "grad_norm": 6.4984369533755295, + "learning_rate": 6.18085227279805e-06, + "loss": 0.6466, + "step": 4195 + }, + { + "epoch": 0.44, + "grad_norm": 2.873242485567296, + "learning_rate": 6.179196176158733e-06, + "loss": 0.6112, + "step": 4196 + }, + { + "epoch": 0.44, + "grad_norm": 2.2153603654893232, + "learning_rate": 6.17753994252381e-06, + "loss": 0.677, + "step": 4197 + }, + { + "epoch": 0.44, + "grad_norm": 4.885706072220139, + "learning_rate": 6.175883572085703e-06, + "loss": 0.6734, + "step": 4198 + }, + { + "epoch": 0.44, + "grad_norm": 2.306773750236914, + "learning_rate": 6.1742270650368395e-06, + "loss": 0.6614, + "step": 4199 + }, + { + "epoch": 0.44, + "grad_norm": 3.0757107827698014, + "learning_rate": 6.172570421569672e-06, + "loss": 0.6607, + "step": 4200 + }, + { + "epoch": 0.44, + "grad_norm": 2.6497916669129955, + "learning_rate": 6.170913641876662e-06, + "loss": 0.6356, + "step": 4201 + }, + { + "epoch": 0.44, + "grad_norm": 2.4729495304399536, + "learning_rate": 6.1692567261502885e-06, + "loss": 0.5511, + "step": 4202 + }, + { + "epoch": 0.44, + "grad_norm": 2.4085363284193635, + "learning_rate": 6.167599674583049e-06, + "loss": 0.6771, + "step": 4203 + }, + { + "epoch": 0.44, + "grad_norm": 2.5904214468463556, + "learning_rate": 6.165942487367456e-06, + "loss": 0.6186, + "step": 4204 + }, + { + "epoch": 0.44, + "grad_norm": 11.74561826819688, + "learning_rate": 6.164285164696034e-06, + "loss": 0.698, + "step": 4205 + }, + { + "epoch": 0.44, + "grad_norm": 2.4895776594825447, + "learning_rate": 6.162627706761326e-06, + "loss": 0.6161, + "step": 4206 + }, + { + "epoch": 0.44, + "grad_norm": 3.9883010766926974, + "learning_rate": 6.160970113755894e-06, + "loss": 0.6723, + "step": 4207 + }, + { + "epoch": 0.44, + "grad_norm": 3.412743390251808, + "learning_rate": 6.159312385872309e-06, + "loss": 0.6407, + "step": 4208 + }, + { + "epoch": 0.44, + "grad_norm": 1.988545368414137, + "learning_rate": 6.157654523303164e-06, + "loss": 0.6851, + "step": 4209 + }, + { + "epoch": 0.44, + "grad_norm": 2.4530378165050104, + "learning_rate": 6.15599652624106e-06, + "loss": 0.6844, + "step": 4210 + }, + { + "epoch": 0.44, + "grad_norm": 4.747802711725356, + "learning_rate": 6.154338394878624e-06, + "loss": 0.6251, + "step": 4211 + }, + { + "epoch": 0.44, + "grad_norm": 3.6279563635896923, + "learning_rate": 6.152680129408488e-06, + "loss": 0.6315, + "step": 4212 + }, + { + "epoch": 0.44, + "grad_norm": 2.6101031201569262, + "learning_rate": 6.151021730023308e-06, + "loss": 0.6899, + "step": 4213 + }, + { + "epoch": 0.44, + "grad_norm": 2.8097748740269637, + "learning_rate": 6.14936319691575e-06, + "loss": 0.6828, + "step": 4214 + }, + { + "epoch": 0.44, + "grad_norm": 2.5163859758968785, + "learning_rate": 6.147704530278497e-06, + "loss": 0.7089, + "step": 4215 + }, + { + "epoch": 0.44, + "grad_norm": 2.8122326932337884, + "learning_rate": 6.146045730304252e-06, + "loss": 0.7728, + "step": 4216 + }, + { + "epoch": 0.44, + "grad_norm": 2.2991108660602873, + "learning_rate": 6.144386797185724e-06, + "loss": 0.6771, + "step": 4217 + }, + { + "epoch": 0.44, + "grad_norm": 2.82067248565608, + "learning_rate": 6.14272773111565e-06, + "loss": 0.6825, + "step": 4218 + }, + { + "epoch": 0.44, + "grad_norm": 4.292155901876321, + "learning_rate": 6.141068532286768e-06, + "loss": 0.7232, + "step": 4219 + }, + { + "epoch": 0.44, + "grad_norm": 3.2138852203020436, + "learning_rate": 6.139409200891845e-06, + "loss": 0.6331, + "step": 4220 + }, + { + "epoch": 0.44, + "grad_norm": 2.733361759557919, + "learning_rate": 6.137749737123652e-06, + "loss": 0.5726, + "step": 4221 + }, + { + "epoch": 0.44, + "grad_norm": 2.045897121812701, + "learning_rate": 6.136090141174986e-06, + "loss": 0.6303, + "step": 4222 + }, + { + "epoch": 0.44, + "grad_norm": 6.215595577195097, + "learning_rate": 6.134430413238649e-06, + "loss": 0.7165, + "step": 4223 + }, + { + "epoch": 0.44, + "grad_norm": 2.4720795018859287, + "learning_rate": 6.132770553507468e-06, + "loss": 0.5848, + "step": 4224 + }, + { + "epoch": 0.44, + "grad_norm": 2.469345000674683, + "learning_rate": 6.1311105621742775e-06, + "loss": 0.6669, + "step": 4225 + }, + { + "epoch": 0.44, + "grad_norm": 2.625205929898641, + "learning_rate": 6.129450439431932e-06, + "loss": 0.6472, + "step": 4226 + }, + { + "epoch": 0.44, + "grad_norm": 3.5548855414175695, + "learning_rate": 6.1277901854732994e-06, + "loss": 0.6817, + "step": 4227 + }, + { + "epoch": 0.44, + "grad_norm": 2.1027568650104005, + "learning_rate": 6.126129800491263e-06, + "loss": 0.6514, + "step": 4228 + }, + { + "epoch": 0.45, + "grad_norm": 2.045046875891458, + "learning_rate": 6.124469284678721e-06, + "loss": 0.5825, + "step": 4229 + }, + { + "epoch": 0.45, + "grad_norm": 1.1138142631995698, + "learning_rate": 6.122808638228588e-06, + "loss": 0.6002, + "step": 4230 + }, + { + "epoch": 0.45, + "grad_norm": 3.3093105459556447, + "learning_rate": 6.121147861333795e-06, + "loss": 0.7347, + "step": 4231 + }, + { + "epoch": 0.45, + "grad_norm": 2.637992867101944, + "learning_rate": 6.119486954187283e-06, + "loss": 0.6293, + "step": 4232 + }, + { + "epoch": 0.45, + "grad_norm": 4.2443163383582645, + "learning_rate": 6.117825916982013e-06, + "loss": 0.6475, + "step": 4233 + }, + { + "epoch": 0.45, + "grad_norm": 2.3645005998517554, + "learning_rate": 6.116164749910959e-06, + "loss": 0.7004, + "step": 4234 + }, + { + "epoch": 0.45, + "grad_norm": 12.488986762043982, + "learning_rate": 6.114503453167112e-06, + "loss": 0.6375, + "step": 4235 + }, + { + "epoch": 0.45, + "grad_norm": 2.456714677650742, + "learning_rate": 6.112842026943473e-06, + "loss": 0.7201, + "step": 4236 + }, + { + "epoch": 0.45, + "grad_norm": 2.792896979064036, + "learning_rate": 6.111180471433067e-06, + "loss": 0.7046, + "step": 4237 + }, + { + "epoch": 0.45, + "grad_norm": 2.311085039466233, + "learning_rate": 6.109518786828924e-06, + "loss": 0.6333, + "step": 4238 + }, + { + "epoch": 0.45, + "grad_norm": 4.082195260565265, + "learning_rate": 6.107856973324097e-06, + "loss": 0.7398, + "step": 4239 + }, + { + "epoch": 0.45, + "grad_norm": 3.422658221464742, + "learning_rate": 6.106195031111648e-06, + "loss": 0.626, + "step": 4240 + }, + { + "epoch": 0.45, + "grad_norm": 4.357834278269825, + "learning_rate": 6.104532960384658e-06, + "loss": 0.6259, + "step": 4241 + }, + { + "epoch": 0.45, + "grad_norm": 2.0886646213733924, + "learning_rate": 6.1028707613362236e-06, + "loss": 0.7135, + "step": 4242 + }, + { + "epoch": 0.45, + "grad_norm": 2.8932851903398698, + "learning_rate": 6.101208434159451e-06, + "loss": 0.6996, + "step": 4243 + }, + { + "epoch": 0.45, + "grad_norm": 2.3623621800314547, + "learning_rate": 6.099545979047465e-06, + "loss": 0.6463, + "step": 4244 + }, + { + "epoch": 0.45, + "grad_norm": 2.3794475175322924, + "learning_rate": 6.097883396193406e-06, + "loss": 0.6134, + "step": 4245 + }, + { + "epoch": 0.45, + "grad_norm": 2.2159715788742234, + "learning_rate": 6.09622068579043e-06, + "loss": 0.6379, + "step": 4246 + }, + { + "epoch": 0.45, + "grad_norm": 3.65089220209315, + "learning_rate": 6.094557848031699e-06, + "loss": 0.6913, + "step": 4247 + }, + { + "epoch": 0.45, + "grad_norm": 2.238068508379566, + "learning_rate": 6.092894883110405e-06, + "loss": 0.6267, + "step": 4248 + }, + { + "epoch": 0.45, + "grad_norm": 2.62840874216548, + "learning_rate": 6.0912317912197416e-06, + "loss": 0.6864, + "step": 4249 + }, + { + "epoch": 0.45, + "grad_norm": 2.7152502107110603, + "learning_rate": 6.089568572552923e-06, + "loss": 0.6414, + "step": 4250 + }, + { + "epoch": 0.45, + "grad_norm": 1.0634647134438302, + "learning_rate": 6.087905227303177e-06, + "loss": 0.5915, + "step": 4251 + }, + { + "epoch": 0.45, + "grad_norm": 2.6867407708114026, + "learning_rate": 6.086241755663746e-06, + "loss": 0.668, + "step": 4252 + }, + { + "epoch": 0.45, + "grad_norm": 1.0245261285927338, + "learning_rate": 6.08457815782789e-06, + "loss": 0.5945, + "step": 4253 + }, + { + "epoch": 0.45, + "grad_norm": 2.184617873138533, + "learning_rate": 6.082914433988875e-06, + "loss": 0.566, + "step": 4254 + }, + { + "epoch": 0.45, + "grad_norm": 2.2773216580867945, + "learning_rate": 6.081250584339996e-06, + "loss": 0.5985, + "step": 4255 + }, + { + "epoch": 0.45, + "grad_norm": 2.2623346319662763, + "learning_rate": 6.079586609074547e-06, + "loss": 0.6648, + "step": 4256 + }, + { + "epoch": 0.45, + "grad_norm": 3.594546200291953, + "learning_rate": 6.077922508385849e-06, + "loss": 0.6895, + "step": 4257 + }, + { + "epoch": 0.45, + "grad_norm": 2.2014493515781806, + "learning_rate": 6.076258282467227e-06, + "loss": 0.6097, + "step": 4258 + }, + { + "epoch": 0.45, + "grad_norm": 2.6694207927575233, + "learning_rate": 6.074593931512031e-06, + "loss": 0.6801, + "step": 4259 + }, + { + "epoch": 0.45, + "grad_norm": 3.451259021434374, + "learning_rate": 6.072929455713616e-06, + "loss": 0.6692, + "step": 4260 + }, + { + "epoch": 0.45, + "grad_norm": 1.9337354819470554, + "learning_rate": 6.07126485526536e-06, + "loss": 0.5854, + "step": 4261 + }, + { + "epoch": 0.45, + "grad_norm": 9.767008015068956, + "learning_rate": 6.0696001303606486e-06, + "loss": 0.6037, + "step": 4262 + }, + { + "epoch": 0.45, + "grad_norm": 3.0086485367597264, + "learning_rate": 6.067935281192887e-06, + "loss": 0.6432, + "step": 4263 + }, + { + "epoch": 0.45, + "grad_norm": 2.153528393185996, + "learning_rate": 6.066270307955492e-06, + "loss": 0.643, + "step": 4264 + }, + { + "epoch": 0.45, + "grad_norm": 3.1645611909710882, + "learning_rate": 6.064605210841893e-06, + "loss": 0.6053, + "step": 4265 + }, + { + "epoch": 0.45, + "grad_norm": 2.309866791740079, + "learning_rate": 6.062939990045541e-06, + "loss": 0.6321, + "step": 4266 + }, + { + "epoch": 0.45, + "grad_norm": 2.156210421450635, + "learning_rate": 6.06127464575989e-06, + "loss": 0.5757, + "step": 4267 + }, + { + "epoch": 0.45, + "grad_norm": 3.244885997343262, + "learning_rate": 6.059609178178423e-06, + "loss": 0.6942, + "step": 4268 + }, + { + "epoch": 0.45, + "grad_norm": 3.152053051101937, + "learning_rate": 6.0579435874946205e-06, + "loss": 0.6633, + "step": 4269 + }, + { + "epoch": 0.45, + "grad_norm": 4.478964275396135, + "learning_rate": 6.056277873901993e-06, + "loss": 0.6697, + "step": 4270 + }, + { + "epoch": 0.45, + "grad_norm": 2.9729373074740866, + "learning_rate": 6.054612037594053e-06, + "loss": 0.7223, + "step": 4271 + }, + { + "epoch": 0.45, + "grad_norm": 2.0783619133176625, + "learning_rate": 6.052946078764337e-06, + "loss": 0.6165, + "step": 4272 + }, + { + "epoch": 0.45, + "grad_norm": 1.2263809793723803, + "learning_rate": 6.0512799976063885e-06, + "loss": 0.586, + "step": 4273 + }, + { + "epoch": 0.45, + "grad_norm": 2.84734542991169, + "learning_rate": 6.049613794313769e-06, + "loss": 0.651, + "step": 4274 + }, + { + "epoch": 0.45, + "grad_norm": 2.3793929939042915, + "learning_rate": 6.047947469080053e-06, + "loss": 0.7264, + "step": 4275 + }, + { + "epoch": 0.45, + "grad_norm": 3.7894502963984307, + "learning_rate": 6.0462810220988284e-06, + "loss": 0.5848, + "step": 4276 + }, + { + "epoch": 0.45, + "grad_norm": 2.385912284914175, + "learning_rate": 6.044614453563702e-06, + "loss": 0.6738, + "step": 4277 + }, + { + "epoch": 0.45, + "grad_norm": 3.0450272266233553, + "learning_rate": 6.042947763668285e-06, + "loss": 0.746, + "step": 4278 + }, + { + "epoch": 0.45, + "grad_norm": 2.0469407502606782, + "learning_rate": 6.041280952606214e-06, + "loss": 0.5847, + "step": 4279 + }, + { + "epoch": 0.45, + "grad_norm": 2.531872581619871, + "learning_rate": 6.03961402057113e-06, + "loss": 0.5961, + "step": 4280 + }, + { + "epoch": 0.45, + "grad_norm": 0.9848807465163134, + "learning_rate": 6.037946967756696e-06, + "loss": 0.6215, + "step": 4281 + }, + { + "epoch": 0.45, + "grad_norm": 2.090185818907428, + "learning_rate": 6.036279794356582e-06, + "loss": 0.6028, + "step": 4282 + }, + { + "epoch": 0.45, + "grad_norm": 1.995346619697124, + "learning_rate": 6.034612500564479e-06, + "loss": 0.5208, + "step": 4283 + }, + { + "epoch": 0.45, + "grad_norm": 2.7529881198497277, + "learning_rate": 6.032945086574085e-06, + "loss": 0.6228, + "step": 4284 + }, + { + "epoch": 0.45, + "grad_norm": 3.3379083789144857, + "learning_rate": 6.0312775525791165e-06, + "loss": 0.6813, + "step": 4285 + }, + { + "epoch": 0.45, + "grad_norm": 2.119809893657813, + "learning_rate": 6.029609898773305e-06, + "loss": 0.6076, + "step": 4286 + }, + { + "epoch": 0.45, + "grad_norm": 2.5052511733476788, + "learning_rate": 6.027942125350389e-06, + "loss": 0.6299, + "step": 4287 + }, + { + "epoch": 0.45, + "grad_norm": 2.655210831114237, + "learning_rate": 6.02627423250413e-06, + "loss": 0.6856, + "step": 4288 + }, + { + "epoch": 0.45, + "grad_norm": 2.625661240338276, + "learning_rate": 6.024606220428297e-06, + "loss": 0.647, + "step": 4289 + }, + { + "epoch": 0.45, + "grad_norm": 3.2243027452718698, + "learning_rate": 6.022938089316677e-06, + "loss": 0.7011, + "step": 4290 + }, + { + "epoch": 0.45, + "grad_norm": 2.3340700591649557, + "learning_rate": 6.021269839363063e-06, + "loss": 0.6817, + "step": 4291 + }, + { + "epoch": 0.45, + "grad_norm": 3.0637410458294068, + "learning_rate": 6.019601470761275e-06, + "loss": 0.668, + "step": 4292 + }, + { + "epoch": 0.45, + "grad_norm": 2.865208891942265, + "learning_rate": 6.017932983705132e-06, + "loss": 0.6142, + "step": 4293 + }, + { + "epoch": 0.45, + "grad_norm": 2.9027349130767828, + "learning_rate": 6.016264378388481e-06, + "loss": 0.6485, + "step": 4294 + }, + { + "epoch": 0.45, + "grad_norm": 2.9168453504696266, + "learning_rate": 6.0145956550051694e-06, + "loss": 0.6187, + "step": 4295 + }, + { + "epoch": 0.45, + "grad_norm": 2.7032692104534073, + "learning_rate": 6.01292681374907e-06, + "loss": 0.6664, + "step": 4296 + }, + { + "epoch": 0.45, + "grad_norm": 2.223566307096264, + "learning_rate": 6.01125785481406e-06, + "loss": 0.5642, + "step": 4297 + }, + { + "epoch": 0.45, + "grad_norm": 3.504081111094006, + "learning_rate": 6.009588778394035e-06, + "loss": 0.6222, + "step": 4298 + }, + { + "epoch": 0.45, + "grad_norm": 1.8721849635784116, + "learning_rate": 6.0079195846829055e-06, + "loss": 0.629, + "step": 4299 + }, + { + "epoch": 0.45, + "grad_norm": 1.9713414450562043, + "learning_rate": 6.006250273874591e-06, + "loss": 0.6071, + "step": 4300 + }, + { + "epoch": 0.45, + "grad_norm": 2.0636910457431736, + "learning_rate": 6.0045808461630295e-06, + "loss": 0.606, + "step": 4301 + }, + { + "epoch": 0.45, + "grad_norm": 2.4935316363007756, + "learning_rate": 6.002911301742168e-06, + "loss": 0.6487, + "step": 4302 + }, + { + "epoch": 0.45, + "grad_norm": 2.0612655080904037, + "learning_rate": 6.001241640805973e-06, + "loss": 0.6267, + "step": 4303 + }, + { + "epoch": 0.45, + "grad_norm": 5.5302669242733815, + "learning_rate": 5.999571863548416e-06, + "loss": 0.6001, + "step": 4304 + }, + { + "epoch": 0.45, + "grad_norm": 2.4210531577615226, + "learning_rate": 5.997901970163491e-06, + "loss": 0.6732, + "step": 4305 + }, + { + "epoch": 0.45, + "grad_norm": 2.4958888183227415, + "learning_rate": 5.996231960845198e-06, + "loss": 0.6682, + "step": 4306 + }, + { + "epoch": 0.45, + "grad_norm": 2.5503369133292892, + "learning_rate": 5.994561835787558e-06, + "loss": 0.5959, + "step": 4307 + }, + { + "epoch": 0.45, + "grad_norm": 2.176063899739418, + "learning_rate": 5.992891595184596e-06, + "loss": 0.6718, + "step": 4308 + }, + { + "epoch": 0.45, + "grad_norm": 2.4856814525301485, + "learning_rate": 5.991221239230362e-06, + "loss": 0.6848, + "step": 4309 + }, + { + "epoch": 0.45, + "grad_norm": 2.966837942768863, + "learning_rate": 5.989550768118908e-06, + "loss": 0.5977, + "step": 4310 + }, + { + "epoch": 0.45, + "grad_norm": 2.4789769096013554, + "learning_rate": 5.987880182044304e-06, + "loss": 0.5662, + "step": 4311 + }, + { + "epoch": 0.45, + "grad_norm": 2.0922816492045966, + "learning_rate": 5.98620948120064e-06, + "loss": 0.7099, + "step": 4312 + }, + { + "epoch": 0.45, + "grad_norm": 2.144043338021814, + "learning_rate": 5.984538665782007e-06, + "loss": 0.6219, + "step": 4313 + }, + { + "epoch": 0.45, + "grad_norm": 1.9684756423653305, + "learning_rate": 5.9828677359825196e-06, + "loss": 0.6493, + "step": 4314 + }, + { + "epoch": 0.45, + "grad_norm": 2.3925762779344515, + "learning_rate": 5.981196691996298e-06, + "loss": 0.5853, + "step": 4315 + }, + { + "epoch": 0.45, + "grad_norm": 2.9740384297980214, + "learning_rate": 5.9795255340174825e-06, + "loss": 0.656, + "step": 4316 + }, + { + "epoch": 0.45, + "grad_norm": 2.1922968760567727, + "learning_rate": 5.9778542622402205e-06, + "loss": 0.5823, + "step": 4317 + }, + { + "epoch": 0.45, + "grad_norm": 2.08742766373127, + "learning_rate": 5.976182876858679e-06, + "loss": 0.6744, + "step": 4318 + }, + { + "epoch": 0.45, + "grad_norm": 2.4973148138070638, + "learning_rate": 5.9745113780670305e-06, + "loss": 0.6447, + "step": 4319 + }, + { + "epoch": 0.45, + "grad_norm": 2.6221490595037755, + "learning_rate": 5.972839766059469e-06, + "loss": 0.6632, + "step": 4320 + }, + { + "epoch": 0.45, + "grad_norm": 2.182690583898645, + "learning_rate": 5.971168041030194e-06, + "loss": 0.7078, + "step": 4321 + }, + { + "epoch": 0.45, + "grad_norm": 1.984007580074134, + "learning_rate": 5.969496203173424e-06, + "loss": 0.5901, + "step": 4322 + }, + { + "epoch": 0.45, + "grad_norm": 3.2119980194558324, + "learning_rate": 5.967824252683389e-06, + "loss": 0.6961, + "step": 4323 + }, + { + "epoch": 0.46, + "grad_norm": 1.9424736990101, + "learning_rate": 5.9661521897543276e-06, + "loss": 0.5905, + "step": 4324 + }, + { + "epoch": 0.46, + "grad_norm": 2.3950357257174666, + "learning_rate": 5.9644800145805e-06, + "loss": 0.6498, + "step": 4325 + }, + { + "epoch": 0.46, + "grad_norm": 2.9016254383942606, + "learning_rate": 5.962807727356169e-06, + "loss": 0.7386, + "step": 4326 + }, + { + "epoch": 0.46, + "grad_norm": 2.187079791651539, + "learning_rate": 5.9611353282756235e-06, + "loss": 0.6256, + "step": 4327 + }, + { + "epoch": 0.46, + "grad_norm": 3.203908956516435, + "learning_rate": 5.95946281753315e-06, + "loss": 0.5533, + "step": 4328 + }, + { + "epoch": 0.46, + "grad_norm": 2.3206237982379285, + "learning_rate": 5.957790195323064e-06, + "loss": 0.5879, + "step": 4329 + }, + { + "epoch": 0.46, + "grad_norm": 2.0758013751273796, + "learning_rate": 5.956117461839679e-06, + "loss": 0.7067, + "step": 4330 + }, + { + "epoch": 0.46, + "grad_norm": 2.9307153953571254, + "learning_rate": 5.954444617277332e-06, + "loss": 0.6043, + "step": 4331 + }, + { + "epoch": 0.46, + "grad_norm": 3.1653578493636227, + "learning_rate": 5.952771661830368e-06, + "loss": 0.6076, + "step": 4332 + }, + { + "epoch": 0.46, + "grad_norm": 2.835128332613066, + "learning_rate": 5.951098595693146e-06, + "loss": 0.5411, + "step": 4333 + }, + { + "epoch": 0.46, + "grad_norm": 2.314934556444207, + "learning_rate": 5.9494254190600395e-06, + "loss": 0.7049, + "step": 4334 + }, + { + "epoch": 0.46, + "grad_norm": 2.742652891296002, + "learning_rate": 5.947752132125432e-06, + "loss": 0.6817, + "step": 4335 + }, + { + "epoch": 0.46, + "grad_norm": 2.3350065294888402, + "learning_rate": 5.946078735083723e-06, + "loss": 0.6664, + "step": 4336 + }, + { + "epoch": 0.46, + "grad_norm": 2.878910527625717, + "learning_rate": 5.944405228129318e-06, + "loss": 0.6108, + "step": 4337 + }, + { + "epoch": 0.46, + "grad_norm": 2.8697645174477455, + "learning_rate": 5.942731611456647e-06, + "loss": 0.6308, + "step": 4338 + }, + { + "epoch": 0.46, + "grad_norm": 3.8193425849051312, + "learning_rate": 5.941057885260141e-06, + "loss": 0.6536, + "step": 4339 + }, + { + "epoch": 0.46, + "grad_norm": 2.3962102257159876, + "learning_rate": 5.939384049734252e-06, + "loss": 0.6428, + "step": 4340 + }, + { + "epoch": 0.46, + "grad_norm": 1.897199349105444, + "learning_rate": 5.937710105073436e-06, + "loss": 0.6204, + "step": 4341 + }, + { + "epoch": 0.46, + "grad_norm": 4.592219177725684, + "learning_rate": 5.936036051472173e-06, + "loss": 0.6533, + "step": 4342 + }, + { + "epoch": 0.46, + "grad_norm": 2.856840556341079, + "learning_rate": 5.934361889124946e-06, + "loss": 0.6482, + "step": 4343 + }, + { + "epoch": 0.46, + "grad_norm": 3.8920368478735567, + "learning_rate": 5.9326876182262575e-06, + "loss": 0.6338, + "step": 4344 + }, + { + "epoch": 0.46, + "grad_norm": 2.3189541995763316, + "learning_rate": 5.931013238970616e-06, + "loss": 0.6674, + "step": 4345 + }, + { + "epoch": 0.46, + "grad_norm": 3.1842357570851374, + "learning_rate": 5.929338751552549e-06, + "loss": 0.6539, + "step": 4346 + }, + { + "epoch": 0.46, + "grad_norm": 2.412864761506081, + "learning_rate": 5.927664156166592e-06, + "loss": 0.6452, + "step": 4347 + }, + { + "epoch": 0.46, + "grad_norm": 3.450085408288252, + "learning_rate": 5.925989453007294e-06, + "loss": 0.5489, + "step": 4348 + }, + { + "epoch": 0.46, + "grad_norm": 2.413183425668527, + "learning_rate": 5.924314642269219e-06, + "loss": 0.5971, + "step": 4349 + }, + { + "epoch": 0.46, + "grad_norm": 2.077588646518163, + "learning_rate": 5.922639724146939e-06, + "loss": 0.6276, + "step": 4350 + }, + { + "epoch": 0.46, + "grad_norm": 2.440785017604606, + "learning_rate": 5.920964698835047e-06, + "loss": 0.6674, + "step": 4351 + }, + { + "epoch": 0.46, + "grad_norm": 3.242550745644832, + "learning_rate": 5.919289566528135e-06, + "loss": 0.5893, + "step": 4352 + }, + { + "epoch": 0.46, + "grad_norm": 2.571180994234153, + "learning_rate": 5.9176143274208185e-06, + "loss": 0.7091, + "step": 4353 + }, + { + "epoch": 0.46, + "grad_norm": 2.4118573325477386, + "learning_rate": 5.915938981707724e-06, + "loss": 0.6622, + "step": 4354 + }, + { + "epoch": 0.46, + "grad_norm": 2.5558862540573424, + "learning_rate": 5.914263529583485e-06, + "loss": 0.7812, + "step": 4355 + }, + { + "epoch": 0.46, + "grad_norm": 2.2737963351658887, + "learning_rate": 5.9125879712427525e-06, + "loss": 0.5881, + "step": 4356 + }, + { + "epoch": 0.46, + "grad_norm": 2.4431483934766702, + "learning_rate": 5.9109123068801875e-06, + "loss": 0.7217, + "step": 4357 + }, + { + "epoch": 0.46, + "grad_norm": 2.3370688862721347, + "learning_rate": 5.909236536690464e-06, + "loss": 0.6663, + "step": 4358 + }, + { + "epoch": 0.46, + "grad_norm": 2.3073085136490574, + "learning_rate": 5.907560660868266e-06, + "loss": 0.6542, + "step": 4359 + }, + { + "epoch": 0.46, + "grad_norm": 3.2027134863893343, + "learning_rate": 5.905884679608297e-06, + "loss": 0.6156, + "step": 4360 + }, + { + "epoch": 0.46, + "grad_norm": 2.5426901682583063, + "learning_rate": 5.904208593105263e-06, + "loss": 0.7358, + "step": 4361 + }, + { + "epoch": 0.46, + "grad_norm": 3.4513894931625066, + "learning_rate": 5.902532401553888e-06, + "loss": 0.6191, + "step": 4362 + }, + { + "epoch": 0.46, + "grad_norm": 2.4589931992350498, + "learning_rate": 5.900856105148908e-06, + "loss": 0.6734, + "step": 4363 + }, + { + "epoch": 0.46, + "grad_norm": 2.33735601801206, + "learning_rate": 5.899179704085072e-06, + "loss": 0.6383, + "step": 4364 + }, + { + "epoch": 0.46, + "grad_norm": 2.153407332381854, + "learning_rate": 5.897503198557134e-06, + "loss": 0.6444, + "step": 4365 + }, + { + "epoch": 0.46, + "grad_norm": 2.8332048431773433, + "learning_rate": 5.89582658875987e-06, + "loss": 0.5908, + "step": 4366 + }, + { + "epoch": 0.46, + "grad_norm": 2.8707757219917687, + "learning_rate": 5.8941498748880635e-06, + "loss": 0.6854, + "step": 4367 + }, + { + "epoch": 0.46, + "grad_norm": 3.80342904637755, + "learning_rate": 5.892473057136508e-06, + "loss": 0.5482, + "step": 4368 + }, + { + "epoch": 0.46, + "grad_norm": 1.0613535298316306, + "learning_rate": 5.890796135700013e-06, + "loss": 0.607, + "step": 4369 + }, + { + "epoch": 0.46, + "grad_norm": 4.544050244229915, + "learning_rate": 5.889119110773398e-06, + "loss": 0.6613, + "step": 4370 + }, + { + "epoch": 0.46, + "grad_norm": 2.225190268668775, + "learning_rate": 5.887441982551495e-06, + "loss": 0.684, + "step": 4371 + }, + { + "epoch": 0.46, + "grad_norm": 2.406353835293022, + "learning_rate": 5.885764751229146e-06, + "loss": 0.5922, + "step": 4372 + }, + { + "epoch": 0.46, + "grad_norm": 2.660963513951452, + "learning_rate": 5.884087417001212e-06, + "loss": 0.6724, + "step": 4373 + }, + { + "epoch": 0.46, + "grad_norm": 2.151920515191103, + "learning_rate": 5.882409980062554e-06, + "loss": 0.6619, + "step": 4374 + }, + { + "epoch": 0.46, + "grad_norm": 4.056944032442784, + "learning_rate": 5.880732440608059e-06, + "loss": 0.689, + "step": 4375 + }, + { + "epoch": 0.46, + "grad_norm": 2.8201326385968266, + "learning_rate": 5.879054798832612e-06, + "loss": 0.6328, + "step": 4376 + }, + { + "epoch": 0.46, + "grad_norm": 1.9966584556095535, + "learning_rate": 5.877377054931122e-06, + "loss": 0.6494, + "step": 4377 + }, + { + "epoch": 0.46, + "grad_norm": 2.3102019347667095, + "learning_rate": 5.8756992090985e-06, + "loss": 0.6443, + "step": 4378 + }, + { + "epoch": 0.46, + "grad_norm": 2.7187452579507503, + "learning_rate": 5.874021261529675e-06, + "loss": 0.778, + "step": 4379 + }, + { + "epoch": 0.46, + "grad_norm": 3.9391200420774966, + "learning_rate": 5.872343212419589e-06, + "loss": 0.7402, + "step": 4380 + }, + { + "epoch": 0.46, + "grad_norm": 3.342650913183981, + "learning_rate": 5.870665061963188e-06, + "loss": 0.6401, + "step": 4381 + }, + { + "epoch": 0.46, + "grad_norm": 3.10923752534272, + "learning_rate": 5.868986810355437e-06, + "loss": 0.6497, + "step": 4382 + }, + { + "epoch": 0.46, + "grad_norm": 2.9439328630070154, + "learning_rate": 5.867308457791311e-06, + "loss": 0.6287, + "step": 4383 + }, + { + "epoch": 0.46, + "grad_norm": 2.1724951171650644, + "learning_rate": 5.865630004465796e-06, + "loss": 0.6773, + "step": 4384 + }, + { + "epoch": 0.46, + "grad_norm": 2.2545948900225095, + "learning_rate": 5.8639514505738885e-06, + "loss": 0.7132, + "step": 4385 + }, + { + "epoch": 0.46, + "grad_norm": 2.339764871054603, + "learning_rate": 5.8622727963106e-06, + "loss": 0.6558, + "step": 4386 + }, + { + "epoch": 0.46, + "grad_norm": 6.264396932081793, + "learning_rate": 5.860594041870948e-06, + "loss": 0.564, + "step": 4387 + }, + { + "epoch": 0.46, + "grad_norm": 3.0726383532297126, + "learning_rate": 5.85891518744997e-06, + "loss": 0.6943, + "step": 4388 + }, + { + "epoch": 0.46, + "grad_norm": 3.819599638111385, + "learning_rate": 5.857236233242709e-06, + "loss": 0.6919, + "step": 4389 + }, + { + "epoch": 0.46, + "grad_norm": 2.55942614466353, + "learning_rate": 5.855557179444219e-06, + "loss": 0.6243, + "step": 4390 + }, + { + "epoch": 0.46, + "grad_norm": 2.1630742426007186, + "learning_rate": 5.8538780262495695e-06, + "loss": 0.59, + "step": 4391 + }, + { + "epoch": 0.46, + "grad_norm": 2.6158551871983122, + "learning_rate": 5.85219877385384e-06, + "loss": 0.7309, + "step": 4392 + }, + { + "epoch": 0.46, + "grad_norm": 2.212409520666976, + "learning_rate": 5.8505194224521204e-06, + "loss": 0.5875, + "step": 4393 + }, + { + "epoch": 0.46, + "grad_norm": 2.3813643477613318, + "learning_rate": 5.848839972239512e-06, + "loss": 0.6635, + "step": 4394 + }, + { + "epoch": 0.46, + "grad_norm": 2.4275155873597285, + "learning_rate": 5.847160423411129e-06, + "loss": 0.6709, + "step": 4395 + }, + { + "epoch": 0.46, + "grad_norm": 4.826967263615847, + "learning_rate": 5.845480776162097e-06, + "loss": 0.6407, + "step": 4396 + }, + { + "epoch": 0.46, + "grad_norm": 2.306596082817607, + "learning_rate": 5.843801030687555e-06, + "loss": 0.6385, + "step": 4397 + }, + { + "epoch": 0.46, + "grad_norm": 1.9985283159296727, + "learning_rate": 5.842121187182644e-06, + "loss": 0.6064, + "step": 4398 + }, + { + "epoch": 0.46, + "grad_norm": 3.2091727302195725, + "learning_rate": 5.840441245842532e-06, + "loss": 0.74, + "step": 4399 + }, + { + "epoch": 0.46, + "grad_norm": 2.8692803940553713, + "learning_rate": 5.838761206862382e-06, + "loss": 0.6428, + "step": 4400 + }, + { + "epoch": 0.46, + "grad_norm": 2.125632738179534, + "learning_rate": 5.837081070437383e-06, + "loss": 0.7399, + "step": 4401 + }, + { + "epoch": 0.46, + "grad_norm": 3.397361281528623, + "learning_rate": 5.835400836762723e-06, + "loss": 0.6768, + "step": 4402 + }, + { + "epoch": 0.46, + "grad_norm": 4.962196122812087, + "learning_rate": 5.833720506033609e-06, + "loss": 0.5604, + "step": 4403 + }, + { + "epoch": 0.46, + "grad_norm": 2.4524501870949416, + "learning_rate": 5.8320400784452566e-06, + "loss": 0.6212, + "step": 4404 + }, + { + "epoch": 0.46, + "grad_norm": 2.242962121068071, + "learning_rate": 5.830359554192894e-06, + "loss": 0.6391, + "step": 4405 + }, + { + "epoch": 0.46, + "grad_norm": 2.4880079804921844, + "learning_rate": 5.828678933471758e-06, + "loss": 0.6894, + "step": 4406 + }, + { + "epoch": 0.46, + "grad_norm": 0.9982078926059621, + "learning_rate": 5.826998216477097e-06, + "loss": 0.5715, + "step": 4407 + }, + { + "epoch": 0.46, + "grad_norm": 3.39200422891883, + "learning_rate": 5.825317403404177e-06, + "loss": 0.7124, + "step": 4408 + }, + { + "epoch": 0.46, + "grad_norm": 2.5758571917643183, + "learning_rate": 5.823636494448265e-06, + "loss": 0.6916, + "step": 4409 + }, + { + "epoch": 0.46, + "grad_norm": 2.0155720379239783, + "learning_rate": 5.821955489804647e-06, + "loss": 0.5974, + "step": 4410 + }, + { + "epoch": 0.46, + "grad_norm": 2.4690764237144096, + "learning_rate": 5.820274389668614e-06, + "loss": 0.6291, + "step": 4411 + }, + { + "epoch": 0.46, + "grad_norm": 2.4569169908883968, + "learning_rate": 5.818593194235475e-06, + "loss": 0.675, + "step": 4412 + }, + { + "epoch": 0.46, + "grad_norm": 2.0672128706646213, + "learning_rate": 5.816911903700546e-06, + "loss": 0.6804, + "step": 4413 + }, + { + "epoch": 0.46, + "grad_norm": 1.9976719347095646, + "learning_rate": 5.815230518259153e-06, + "loss": 0.6844, + "step": 4414 + }, + { + "epoch": 0.46, + "grad_norm": 2.564923614303906, + "learning_rate": 5.813549038106635e-06, + "loss": 0.6927, + "step": 4415 + }, + { + "epoch": 0.46, + "grad_norm": 2.3811759370101084, + "learning_rate": 5.811867463438341e-06, + "loss": 0.628, + "step": 4416 + }, + { + "epoch": 0.46, + "grad_norm": 2.0628079283250282, + "learning_rate": 5.810185794449633e-06, + "loss": 0.5905, + "step": 4417 + }, + { + "epoch": 0.46, + "grad_norm": 2.582538159728672, + "learning_rate": 5.80850403133588e-06, + "loss": 0.7048, + "step": 4418 + }, + { + "epoch": 0.47, + "grad_norm": 2.4157594950729466, + "learning_rate": 5.806822174292467e-06, + "loss": 0.5557, + "step": 4419 + }, + { + "epoch": 0.47, + "grad_norm": 0.990310949015315, + "learning_rate": 5.805140223514785e-06, + "loss": 0.5572, + "step": 4420 + }, + { + "epoch": 0.47, + "grad_norm": 3.0489038974046543, + "learning_rate": 5.80345817919824e-06, + "loss": 0.6249, + "step": 4421 + }, + { + "epoch": 0.47, + "grad_norm": 2.060292665650145, + "learning_rate": 5.801776041538245e-06, + "loss": 0.6035, + "step": 4422 + }, + { + "epoch": 0.47, + "grad_norm": 2.091882121485412, + "learning_rate": 5.800093810730229e-06, + "loss": 0.6245, + "step": 4423 + }, + { + "epoch": 0.47, + "grad_norm": 3.179974949588236, + "learning_rate": 5.798411486969626e-06, + "loss": 0.6589, + "step": 4424 + }, + { + "epoch": 0.47, + "grad_norm": 2.138657402068139, + "learning_rate": 5.796729070451884e-06, + "loss": 0.6545, + "step": 4425 + }, + { + "epoch": 0.47, + "grad_norm": 2.233621588999477, + "learning_rate": 5.795046561372463e-06, + "loss": 0.681, + "step": 4426 + }, + { + "epoch": 0.47, + "grad_norm": 3.004996496095365, + "learning_rate": 5.79336395992683e-06, + "loss": 0.6399, + "step": 4427 + }, + { + "epoch": 0.47, + "grad_norm": 2.069729876929192, + "learning_rate": 5.791681266310465e-06, + "loss": 0.6049, + "step": 4428 + }, + { + "epoch": 0.47, + "grad_norm": 2.4465891459726623, + "learning_rate": 5.78999848071886e-06, + "loss": 0.6035, + "step": 4429 + }, + { + "epoch": 0.47, + "grad_norm": 2.2899552504847716, + "learning_rate": 5.788315603347515e-06, + "loss": 0.6646, + "step": 4430 + }, + { + "epoch": 0.47, + "grad_norm": 2.750752115170631, + "learning_rate": 5.78663263439194e-06, + "loss": 0.6953, + "step": 4431 + }, + { + "epoch": 0.47, + "grad_norm": 3.2813221898068594, + "learning_rate": 5.7849495740476625e-06, + "loss": 0.7107, + "step": 4432 + }, + { + "epoch": 0.47, + "grad_norm": 4.412245921640548, + "learning_rate": 5.783266422510211e-06, + "loss": 0.6862, + "step": 4433 + }, + { + "epoch": 0.47, + "grad_norm": 2.1713157933842573, + "learning_rate": 5.781583179975132e-06, + "loss": 0.7024, + "step": 4434 + }, + { + "epoch": 0.47, + "grad_norm": 2.5292807136122497, + "learning_rate": 5.779899846637976e-06, + "loss": 0.6344, + "step": 4435 + }, + { + "epoch": 0.47, + "grad_norm": 2.638992479723519, + "learning_rate": 5.778216422694312e-06, + "loss": 0.6147, + "step": 4436 + }, + { + "epoch": 0.47, + "grad_norm": 2.253511934867257, + "learning_rate": 5.776532908339713e-06, + "loss": 0.6775, + "step": 4437 + }, + { + "epoch": 0.47, + "grad_norm": 2.5189103997047404, + "learning_rate": 5.774849303769767e-06, + "loss": 0.6079, + "step": 4438 + }, + { + "epoch": 0.47, + "grad_norm": 1.9551608372390805, + "learning_rate": 5.773165609180067e-06, + "loss": 0.6591, + "step": 4439 + }, + { + "epoch": 0.47, + "grad_norm": 2.0567111687412636, + "learning_rate": 5.771481824766222e-06, + "loss": 0.6149, + "step": 4440 + }, + { + "epoch": 0.47, + "grad_norm": 2.063349581374693, + "learning_rate": 5.769797950723848e-06, + "loss": 0.6967, + "step": 4441 + }, + { + "epoch": 0.47, + "grad_norm": 2.1228306731836386, + "learning_rate": 5.7681139872485744e-06, + "loss": 0.6664, + "step": 4442 + }, + { + "epoch": 0.47, + "grad_norm": 2.793644978792627, + "learning_rate": 5.766429934536037e-06, + "loss": 0.5862, + "step": 4443 + }, + { + "epoch": 0.47, + "grad_norm": 1.0595037162948158, + "learning_rate": 5.764745792781886e-06, + "loss": 0.5796, + "step": 4444 + }, + { + "epoch": 0.47, + "grad_norm": 2.6360229263832173, + "learning_rate": 5.763061562181781e-06, + "loss": 0.5961, + "step": 4445 + }, + { + "epoch": 0.47, + "grad_norm": 2.317300668949924, + "learning_rate": 5.761377242931386e-06, + "loss": 0.6888, + "step": 4446 + }, + { + "epoch": 0.47, + "grad_norm": 2.3873429958437904, + "learning_rate": 5.759692835226387e-06, + "loss": 0.6889, + "step": 4447 + }, + { + "epoch": 0.47, + "grad_norm": 2.2136534312552785, + "learning_rate": 5.75800833926247e-06, + "loss": 0.6915, + "step": 4448 + }, + { + "epoch": 0.47, + "grad_norm": 3.0974828764619353, + "learning_rate": 5.756323755235334e-06, + "loss": 0.5588, + "step": 4449 + }, + { + "epoch": 0.47, + "grad_norm": 2.4333697468641398, + "learning_rate": 5.754639083340691e-06, + "loss": 0.7895, + "step": 4450 + }, + { + "epoch": 0.47, + "grad_norm": 3.480667649075754, + "learning_rate": 5.752954323774261e-06, + "loss": 0.6759, + "step": 4451 + }, + { + "epoch": 0.47, + "grad_norm": 2.26250369737515, + "learning_rate": 5.751269476731775e-06, + "loss": 0.5934, + "step": 4452 + }, + { + "epoch": 0.47, + "grad_norm": 1.1003678544450188, + "learning_rate": 5.749584542408971e-06, + "loss": 0.5815, + "step": 4453 + }, + { + "epoch": 0.47, + "grad_norm": 2.3078642309343578, + "learning_rate": 5.747899521001603e-06, + "loss": 0.6309, + "step": 4454 + }, + { + "epoch": 0.47, + "grad_norm": 0.9538174909326671, + "learning_rate": 5.74621441270543e-06, + "loss": 0.5846, + "step": 4455 + }, + { + "epoch": 0.47, + "grad_norm": 2.3481363612900386, + "learning_rate": 5.744529217716225e-06, + "loss": 0.6627, + "step": 4456 + }, + { + "epoch": 0.47, + "grad_norm": 2.4288667378870494, + "learning_rate": 5.742843936229765e-06, + "loss": 0.6473, + "step": 4457 + }, + { + "epoch": 0.47, + "grad_norm": 12.117953706010812, + "learning_rate": 5.741158568441846e-06, + "loss": 0.6602, + "step": 4458 + }, + { + "epoch": 0.47, + "grad_norm": 2.9795673957075657, + "learning_rate": 5.739473114548266e-06, + "loss": 0.6127, + "step": 4459 + }, + { + "epoch": 0.47, + "grad_norm": 2.184917637480841, + "learning_rate": 5.737787574744837e-06, + "loss": 0.7399, + "step": 4460 + }, + { + "epoch": 0.47, + "grad_norm": 3.0453232292377943, + "learning_rate": 5.736101949227382e-06, + "loss": 0.6086, + "step": 4461 + }, + { + "epoch": 0.47, + "grad_norm": 2.146930741168219, + "learning_rate": 5.734416238191729e-06, + "loss": 0.6511, + "step": 4462 + }, + { + "epoch": 0.47, + "grad_norm": 2.0161096829790184, + "learning_rate": 5.73273044183372e-06, + "loss": 0.5465, + "step": 4463 + }, + { + "epoch": 0.47, + "grad_norm": 3.4064385189492246, + "learning_rate": 5.7310445603492085e-06, + "loss": 0.6235, + "step": 4464 + }, + { + "epoch": 0.47, + "grad_norm": 2.1516478687948544, + "learning_rate": 5.729358593934051e-06, + "loss": 0.6553, + "step": 4465 + }, + { + "epoch": 0.47, + "grad_norm": 2.013275127090678, + "learning_rate": 5.727672542784122e-06, + "loss": 0.631, + "step": 4466 + }, + { + "epoch": 0.47, + "grad_norm": 2.21296220940741, + "learning_rate": 5.7259864070953e-06, + "loss": 0.6722, + "step": 4467 + }, + { + "epoch": 0.47, + "grad_norm": 2.0788929579185234, + "learning_rate": 5.724300187063474e-06, + "loss": 0.6972, + "step": 4468 + }, + { + "epoch": 0.47, + "grad_norm": 2.2057902914842415, + "learning_rate": 5.722613882884549e-06, + "loss": 0.6598, + "step": 4469 + }, + { + "epoch": 0.47, + "grad_norm": 2.3957705076731886, + "learning_rate": 5.720927494754429e-06, + "loss": 0.6991, + "step": 4470 + }, + { + "epoch": 0.47, + "grad_norm": 10.12211879589646, + "learning_rate": 5.719241022869039e-06, + "loss": 0.6594, + "step": 4471 + }, + { + "epoch": 0.47, + "grad_norm": 3.490904530002642, + "learning_rate": 5.7175544674243044e-06, + "loss": 0.5995, + "step": 4472 + }, + { + "epoch": 0.47, + "grad_norm": 5.934704124433464, + "learning_rate": 5.715867828616167e-06, + "loss": 0.6718, + "step": 4473 + }, + { + "epoch": 0.47, + "grad_norm": 2.518726432040352, + "learning_rate": 5.714181106640575e-06, + "loss": 0.7023, + "step": 4474 + }, + { + "epoch": 0.47, + "grad_norm": 2.2024550220706205, + "learning_rate": 5.712494301693486e-06, + "loss": 0.6435, + "step": 4475 + }, + { + "epoch": 0.47, + "grad_norm": 4.867057001543077, + "learning_rate": 5.710807413970868e-06, + "loss": 0.6585, + "step": 4476 + }, + { + "epoch": 0.47, + "grad_norm": 2.3892252500285447, + "learning_rate": 5.709120443668701e-06, + "loss": 0.6105, + "step": 4477 + }, + { + "epoch": 0.47, + "grad_norm": 2.751702391769238, + "learning_rate": 5.707433390982969e-06, + "loss": 0.6801, + "step": 4478 + }, + { + "epoch": 0.47, + "grad_norm": 2.2281246093897633, + "learning_rate": 5.705746256109671e-06, + "loss": 0.6254, + "step": 4479 + }, + { + "epoch": 0.47, + "grad_norm": 3.2065787831492654, + "learning_rate": 5.704059039244814e-06, + "loss": 0.6247, + "step": 4480 + }, + { + "epoch": 0.47, + "grad_norm": 3.4211374534691665, + "learning_rate": 5.7023717405844114e-06, + "loss": 0.6601, + "step": 4481 + }, + { + "epoch": 0.47, + "grad_norm": 8.97147154570612, + "learning_rate": 5.700684360324492e-06, + "loss": 0.6468, + "step": 4482 + }, + { + "epoch": 0.47, + "grad_norm": 5.36936656327254, + "learning_rate": 5.6989968986610876e-06, + "loss": 0.6256, + "step": 4483 + }, + { + "epoch": 0.47, + "grad_norm": 2.6783669889814394, + "learning_rate": 5.697309355790246e-06, + "loss": 0.7214, + "step": 4484 + }, + { + "epoch": 0.47, + "grad_norm": 2.3024950794881716, + "learning_rate": 5.695621731908018e-06, + "loss": 0.6098, + "step": 4485 + }, + { + "epoch": 0.47, + "grad_norm": 2.2375276123528454, + "learning_rate": 5.693934027210468e-06, + "loss": 0.6077, + "step": 4486 + }, + { + "epoch": 0.47, + "grad_norm": 2.3754390849560383, + "learning_rate": 5.692246241893669e-06, + "loss": 0.6369, + "step": 4487 + }, + { + "epoch": 0.47, + "grad_norm": 2.1211748838319964, + "learning_rate": 5.6905583761537034e-06, + "loss": 0.6388, + "step": 4488 + }, + { + "epoch": 0.47, + "grad_norm": 3.868019959818687, + "learning_rate": 5.68887043018666e-06, + "loss": 0.6607, + "step": 4489 + }, + { + "epoch": 0.47, + "grad_norm": 2.33570658417535, + "learning_rate": 5.687182404188642e-06, + "loss": 0.6159, + "step": 4490 + }, + { + "epoch": 0.47, + "grad_norm": 2.8823599155269486, + "learning_rate": 5.68549429835576e-06, + "loss": 0.535, + "step": 4491 + }, + { + "epoch": 0.47, + "grad_norm": 2.743721547670784, + "learning_rate": 5.6838061128841294e-06, + "loss": 0.6538, + "step": 4492 + }, + { + "epoch": 0.47, + "grad_norm": 3.8002300637615645, + "learning_rate": 5.682117847969884e-06, + "loss": 0.6659, + "step": 4493 + }, + { + "epoch": 0.47, + "grad_norm": 1.0584871886426965, + "learning_rate": 5.680429503809157e-06, + "loss": 0.5985, + "step": 4494 + }, + { + "epoch": 0.47, + "grad_norm": 3.525503447996165, + "learning_rate": 5.678741080598098e-06, + "loss": 0.5908, + "step": 4495 + }, + { + "epoch": 0.47, + "grad_norm": 2.687907042607034, + "learning_rate": 5.6770525785328625e-06, + "loss": 0.6845, + "step": 4496 + }, + { + "epoch": 0.47, + "grad_norm": 2.304902889828247, + "learning_rate": 5.675363997809616e-06, + "loss": 0.6827, + "step": 4497 + }, + { + "epoch": 0.47, + "grad_norm": 2.4406252620899282, + "learning_rate": 5.6736753386245315e-06, + "loss": 0.5224, + "step": 4498 + }, + { + "epoch": 0.47, + "grad_norm": 2.5325199094461035, + "learning_rate": 5.6719866011737934e-06, + "loss": 0.6689, + "step": 4499 + }, + { + "epoch": 0.47, + "grad_norm": 2.375682344749398, + "learning_rate": 5.670297785653596e-06, + "loss": 0.6651, + "step": 4500 + }, + { + "epoch": 0.47, + "grad_norm": 2.425658142737432, + "learning_rate": 5.668608892260138e-06, + "loss": 0.7422, + "step": 4501 + }, + { + "epoch": 0.47, + "grad_norm": 2.0617808401615054, + "learning_rate": 5.666919921189632e-06, + "loss": 0.6425, + "step": 4502 + }, + { + "epoch": 0.47, + "grad_norm": 2.272673910971418, + "learning_rate": 5.665230872638297e-06, + "loss": 0.648, + "step": 4503 + }, + { + "epoch": 0.47, + "grad_norm": 2.572982484112035, + "learning_rate": 5.6635417468023635e-06, + "loss": 0.6951, + "step": 4504 + }, + { + "epoch": 0.47, + "grad_norm": 3.0084581361854075, + "learning_rate": 5.661852543878067e-06, + "loss": 0.6481, + "step": 4505 + }, + { + "epoch": 0.47, + "grad_norm": 2.0157127811635567, + "learning_rate": 5.660163264061656e-06, + "loss": 0.585, + "step": 4506 + }, + { + "epoch": 0.47, + "grad_norm": 2.5432077638654644, + "learning_rate": 5.6584739075493835e-06, + "loss": 0.6619, + "step": 4507 + }, + { + "epoch": 0.47, + "grad_norm": 2.5133640279691036, + "learning_rate": 5.656784474537518e-06, + "loss": 0.612, + "step": 4508 + }, + { + "epoch": 0.47, + "grad_norm": 2.9315990055007775, + "learning_rate": 5.65509496522233e-06, + "loss": 0.6136, + "step": 4509 + }, + { + "epoch": 0.47, + "grad_norm": 3.2810358608137027, + "learning_rate": 5.653405379800102e-06, + "loss": 0.6993, + "step": 4510 + }, + { + "epoch": 0.47, + "grad_norm": 2.493229989865747, + "learning_rate": 5.651715718467127e-06, + "loss": 0.657, + "step": 4511 + }, + { + "epoch": 0.47, + "grad_norm": 2.6802549573204453, + "learning_rate": 5.6500259814197025e-06, + "loss": 0.6624, + "step": 4512 + }, + { + "epoch": 0.47, + "grad_norm": 2.625253414667082, + "learning_rate": 5.648336168854139e-06, + "loss": 0.6646, + "step": 4513 + }, + { + "epoch": 0.48, + "grad_norm": 2.3868204390514567, + "learning_rate": 5.646646280966755e-06, + "loss": 0.6926, + "step": 4514 + }, + { + "epoch": 0.48, + "grad_norm": 2.4290428676146343, + "learning_rate": 5.6449563179538734e-06, + "loss": 0.5732, + "step": 4515 + }, + { + "epoch": 0.48, + "grad_norm": 2.6813004334794983, + "learning_rate": 5.64326628001183e-06, + "loss": 0.6179, + "step": 4516 + }, + { + "epoch": 0.48, + "grad_norm": 2.2908656332176767, + "learning_rate": 5.641576167336972e-06, + "loss": 0.7049, + "step": 4517 + }, + { + "epoch": 0.48, + "grad_norm": 2.5593906258069583, + "learning_rate": 5.639885980125649e-06, + "loss": 0.5926, + "step": 4518 + }, + { + "epoch": 0.48, + "grad_norm": 2.7153221598085593, + "learning_rate": 5.638195718574222e-06, + "loss": 0.7009, + "step": 4519 + }, + { + "epoch": 0.48, + "grad_norm": 2.570454238663235, + "learning_rate": 5.636505382879061e-06, + "loss": 0.6327, + "step": 4520 + }, + { + "epoch": 0.48, + "grad_norm": 2.8531174704597646, + "learning_rate": 5.6348149732365465e-06, + "loss": 0.6287, + "step": 4521 + }, + { + "epoch": 0.48, + "grad_norm": 6.518867643695685, + "learning_rate": 5.633124489843063e-06, + "loss": 0.6584, + "step": 4522 + }, + { + "epoch": 0.48, + "grad_norm": 3.940669081934954, + "learning_rate": 5.631433932895005e-06, + "loss": 0.7226, + "step": 4523 + }, + { + "epoch": 0.48, + "grad_norm": 2.262500330185508, + "learning_rate": 5.62974330258878e-06, + "loss": 0.5943, + "step": 4524 + }, + { + "epoch": 0.48, + "grad_norm": 2.668293506438283, + "learning_rate": 5.6280525991207954e-06, + "loss": 0.6074, + "step": 4525 + }, + { + "epoch": 0.48, + "grad_norm": 2.5241175283242465, + "learning_rate": 5.626361822687478e-06, + "loss": 0.6424, + "step": 4526 + }, + { + "epoch": 0.48, + "grad_norm": 2.3985700650396047, + "learning_rate": 5.6246709734852535e-06, + "loss": 0.6714, + "step": 4527 + }, + { + "epoch": 0.48, + "grad_norm": 2.762272787782166, + "learning_rate": 5.6229800517105615e-06, + "loss": 0.7199, + "step": 4528 + }, + { + "epoch": 0.48, + "grad_norm": 3.74146694769342, + "learning_rate": 5.621289057559847e-06, + "loss": 0.6983, + "step": 4529 + }, + { + "epoch": 0.48, + "grad_norm": 2.5748292058521938, + "learning_rate": 5.619597991229566e-06, + "loss": 0.6199, + "step": 4530 + }, + { + "epoch": 0.48, + "grad_norm": 2.9437147149445138, + "learning_rate": 5.617906852916183e-06, + "loss": 0.6011, + "step": 4531 + }, + { + "epoch": 0.48, + "grad_norm": 23.92235581251259, + "learning_rate": 5.6162156428161665e-06, + "loss": 0.6263, + "step": 4532 + }, + { + "epoch": 0.48, + "grad_norm": 3.087366386179771, + "learning_rate": 5.614524361125998e-06, + "loss": 0.5655, + "step": 4533 + }, + { + "epoch": 0.48, + "grad_norm": 3.503816305177784, + "learning_rate": 5.612833008042166e-06, + "loss": 0.6989, + "step": 4534 + }, + { + "epoch": 0.48, + "grad_norm": 2.0399171517394388, + "learning_rate": 5.611141583761167e-06, + "loss": 0.5931, + "step": 4535 + }, + { + "epoch": 0.48, + "grad_norm": 2.4370559310970306, + "learning_rate": 5.609450088479506e-06, + "loss": 0.6178, + "step": 4536 + }, + { + "epoch": 0.48, + "grad_norm": 2.033431572385822, + "learning_rate": 5.607758522393693e-06, + "loss": 0.5607, + "step": 4537 + }, + { + "epoch": 0.48, + "grad_norm": 6.41604755439263, + "learning_rate": 5.6060668857002545e-06, + "loss": 0.6314, + "step": 4538 + }, + { + "epoch": 0.48, + "grad_norm": 2.3686567490270187, + "learning_rate": 5.604375178595715e-06, + "loss": 0.6557, + "step": 4539 + }, + { + "epoch": 0.48, + "grad_norm": 2.1562680126957847, + "learning_rate": 5.6026834012766155e-06, + "loss": 0.6312, + "step": 4540 + }, + { + "epoch": 0.48, + "grad_norm": 2.498322304260171, + "learning_rate": 5.600991553939501e-06, + "loss": 0.7011, + "step": 4541 + }, + { + "epoch": 0.48, + "grad_norm": 2.7681759119595353, + "learning_rate": 5.5992996367809236e-06, + "loss": 0.6749, + "step": 4542 + }, + { + "epoch": 0.48, + "grad_norm": 3.1869566753671816, + "learning_rate": 5.597607649997449e-06, + "loss": 0.6855, + "step": 4543 + }, + { + "epoch": 0.48, + "grad_norm": 2.3926645915251172, + "learning_rate": 5.595915593785644e-06, + "loss": 0.676, + "step": 4544 + }, + { + "epoch": 0.48, + "grad_norm": 2.7435380969095537, + "learning_rate": 5.594223468342087e-06, + "loss": 0.5655, + "step": 4545 + }, + { + "epoch": 0.48, + "grad_norm": 2.0698113787263352, + "learning_rate": 5.592531273863367e-06, + "loss": 0.5551, + "step": 4546 + }, + { + "epoch": 0.48, + "grad_norm": 2.66821684730331, + "learning_rate": 5.590839010546074e-06, + "loss": 0.6269, + "step": 4547 + }, + { + "epoch": 0.48, + "grad_norm": 2.6735786158961927, + "learning_rate": 5.589146678586814e-06, + "loss": 0.5896, + "step": 4548 + }, + { + "epoch": 0.48, + "grad_norm": 2.556035989864172, + "learning_rate": 5.587454278182196e-06, + "loss": 0.7271, + "step": 4549 + }, + { + "epoch": 0.48, + "grad_norm": 2.5030779108925056, + "learning_rate": 5.585761809528839e-06, + "loss": 0.5887, + "step": 4550 + }, + { + "epoch": 0.48, + "grad_norm": 2.6285299123478363, + "learning_rate": 5.584069272823367e-06, + "loss": 0.6228, + "step": 4551 + }, + { + "epoch": 0.48, + "grad_norm": 4.742738953366397, + "learning_rate": 5.582376668262415e-06, + "loss": 0.5916, + "step": 4552 + }, + { + "epoch": 0.48, + "grad_norm": 2.051227649046525, + "learning_rate": 5.580683996042625e-06, + "loss": 0.5793, + "step": 4553 + }, + { + "epoch": 0.48, + "grad_norm": 2.683953068169987, + "learning_rate": 5.578991256360649e-06, + "loss": 0.6996, + "step": 4554 + }, + { + "epoch": 0.48, + "grad_norm": 2.4168596403111824, + "learning_rate": 5.577298449413141e-06, + "loss": 0.59, + "step": 4555 + }, + { + "epoch": 0.48, + "grad_norm": 2.5891727673576144, + "learning_rate": 5.575605575396767e-06, + "loss": 0.635, + "step": 4556 + }, + { + "epoch": 0.48, + "grad_norm": 3.164368406974795, + "learning_rate": 5.573912634508203e-06, + "loss": 0.6719, + "step": 4557 + }, + { + "epoch": 0.48, + "grad_norm": 2.1467160524829803, + "learning_rate": 5.572219626944128e-06, + "loss": 0.6302, + "step": 4558 + }, + { + "epoch": 0.48, + "grad_norm": 2.9545578011130686, + "learning_rate": 5.5705265529012295e-06, + "loss": 0.6129, + "step": 4559 + }, + { + "epoch": 0.48, + "grad_norm": 2.397337716611551, + "learning_rate": 5.5688334125762065e-06, + "loss": 0.577, + "step": 4560 + }, + { + "epoch": 0.48, + "grad_norm": 2.5141297243805707, + "learning_rate": 5.567140206165762e-06, + "loss": 0.6222, + "step": 4561 + }, + { + "epoch": 0.48, + "grad_norm": 2.5018217036974577, + "learning_rate": 5.565446933866607e-06, + "loss": 0.6108, + "step": 4562 + }, + { + "epoch": 0.48, + "grad_norm": 2.280655074209046, + "learning_rate": 5.563753595875463e-06, + "loss": 0.6282, + "step": 4563 + }, + { + "epoch": 0.48, + "grad_norm": 6.359831147248012, + "learning_rate": 5.562060192389054e-06, + "loss": 0.6622, + "step": 4564 + }, + { + "epoch": 0.48, + "grad_norm": 2.2107889624685333, + "learning_rate": 5.560366723604117e-06, + "loss": 0.6035, + "step": 4565 + }, + { + "epoch": 0.48, + "grad_norm": 2.5994180305445815, + "learning_rate": 5.558673189717395e-06, + "loss": 0.6777, + "step": 4566 + }, + { + "epoch": 0.48, + "grad_norm": 2.7545314951405593, + "learning_rate": 5.556979590925636e-06, + "loss": 0.5888, + "step": 4567 + }, + { + "epoch": 0.48, + "grad_norm": 2.364332000424429, + "learning_rate": 5.555285927425599e-06, + "loss": 0.6485, + "step": 4568 + }, + { + "epoch": 0.48, + "grad_norm": 2.5857119479401773, + "learning_rate": 5.553592199414047e-06, + "loss": 0.6616, + "step": 4569 + }, + { + "epoch": 0.48, + "grad_norm": 3.15710865319317, + "learning_rate": 5.551898407087754e-06, + "loss": 0.6485, + "step": 4570 + }, + { + "epoch": 0.48, + "grad_norm": 2.8154040551745236, + "learning_rate": 5.550204550643501e-06, + "loss": 0.6448, + "step": 4571 + }, + { + "epoch": 0.48, + "grad_norm": 2.8762614479514133, + "learning_rate": 5.548510630278073e-06, + "loss": 0.5749, + "step": 4572 + }, + { + "epoch": 0.48, + "grad_norm": 3.430020443992809, + "learning_rate": 5.5468166461882645e-06, + "loss": 0.694, + "step": 4573 + }, + { + "epoch": 0.48, + "grad_norm": 2.010935891713146, + "learning_rate": 5.545122598570879e-06, + "loss": 0.5862, + "step": 4574 + }, + { + "epoch": 0.48, + "grad_norm": 2.755230997400163, + "learning_rate": 5.543428487622727e-06, + "loss": 0.6561, + "step": 4575 + }, + { + "epoch": 0.48, + "grad_norm": 4.059923185258385, + "learning_rate": 5.5417343135406206e-06, + "loss": 0.6963, + "step": 4576 + }, + { + "epoch": 0.48, + "grad_norm": 4.790230956470769, + "learning_rate": 5.54004007652139e-06, + "loss": 0.5903, + "step": 4577 + }, + { + "epoch": 0.48, + "grad_norm": 2.829573606090716, + "learning_rate": 5.5383457767618655e-06, + "loss": 0.6433, + "step": 4578 + }, + { + "epoch": 0.48, + "grad_norm": 3.3802992751931424, + "learning_rate": 5.5366514144588835e-06, + "loss": 0.6504, + "step": 4579 + }, + { + "epoch": 0.48, + "grad_norm": 2.87227842810873, + "learning_rate": 5.534956989809293e-06, + "loss": 0.6567, + "step": 4580 + }, + { + "epoch": 0.48, + "grad_norm": 3.680652735847989, + "learning_rate": 5.533262503009944e-06, + "loss": 0.6889, + "step": 4581 + }, + { + "epoch": 0.48, + "grad_norm": 3.381512231517992, + "learning_rate": 5.5315679542577e-06, + "loss": 0.6544, + "step": 4582 + }, + { + "epoch": 0.48, + "grad_norm": 2.546877424221789, + "learning_rate": 5.529873343749428e-06, + "loss": 0.6369, + "step": 4583 + }, + { + "epoch": 0.48, + "grad_norm": 3.319093075019239, + "learning_rate": 5.528178671682002e-06, + "loss": 0.5969, + "step": 4584 + }, + { + "epoch": 0.48, + "grad_norm": 2.453302660315127, + "learning_rate": 5.5264839382523035e-06, + "loss": 0.5925, + "step": 4585 + }, + { + "epoch": 0.48, + "grad_norm": 2.3352208256933125, + "learning_rate": 5.524789143657226e-06, + "loss": 0.6313, + "step": 4586 + }, + { + "epoch": 0.48, + "grad_norm": 3.6803200585589213, + "learning_rate": 5.523094288093659e-06, + "loss": 0.6757, + "step": 4587 + }, + { + "epoch": 0.48, + "grad_norm": 3.4253926993814092, + "learning_rate": 5.521399371758511e-06, + "loss": 0.6969, + "step": 4588 + }, + { + "epoch": 0.48, + "grad_norm": 3.223973921567959, + "learning_rate": 5.519704394848693e-06, + "loss": 0.6356, + "step": 4589 + }, + { + "epoch": 0.48, + "grad_norm": 3.6909496271121074, + "learning_rate": 5.518009357561119e-06, + "loss": 0.6763, + "step": 4590 + }, + { + "epoch": 0.48, + "grad_norm": 2.315901845822254, + "learning_rate": 5.516314260092717e-06, + "loss": 0.6356, + "step": 4591 + }, + { + "epoch": 0.48, + "grad_norm": 2.885139970332389, + "learning_rate": 5.514619102640415e-06, + "loss": 0.6069, + "step": 4592 + }, + { + "epoch": 0.48, + "grad_norm": 2.566257463657484, + "learning_rate": 5.512923885401154e-06, + "loss": 0.6844, + "step": 4593 + }, + { + "epoch": 0.48, + "grad_norm": 1.1239709742116073, + "learning_rate": 5.511228608571879e-06, + "loss": 0.5754, + "step": 4594 + }, + { + "epoch": 0.48, + "grad_norm": 3.1944023152638357, + "learning_rate": 5.5095332723495425e-06, + "loss": 0.6047, + "step": 4595 + }, + { + "epoch": 0.48, + "grad_norm": 3.0417150262269077, + "learning_rate": 5.507837876931102e-06, + "loss": 0.6709, + "step": 4596 + }, + { + "epoch": 0.48, + "grad_norm": 3.1835449500447512, + "learning_rate": 5.506142422513525e-06, + "loss": 0.6766, + "step": 4597 + }, + { + "epoch": 0.48, + "grad_norm": 2.4531359904321675, + "learning_rate": 5.504446909293786e-06, + "loss": 0.6872, + "step": 4598 + }, + { + "epoch": 0.48, + "grad_norm": 2.9575039883577126, + "learning_rate": 5.502751337468862e-06, + "loss": 0.6198, + "step": 4599 + }, + { + "epoch": 0.48, + "grad_norm": 3.254352953269388, + "learning_rate": 5.5010557072357395e-06, + "loss": 0.7175, + "step": 4600 + }, + { + "epoch": 0.48, + "grad_norm": 2.5317638836045275, + "learning_rate": 5.499360018791416e-06, + "loss": 0.626, + "step": 4601 + }, + { + "epoch": 0.48, + "grad_norm": 3.080250323596298, + "learning_rate": 5.497664272332888e-06, + "loss": 0.6564, + "step": 4602 + }, + { + "epoch": 0.48, + "grad_norm": 4.791033529732766, + "learning_rate": 5.495968468057164e-06, + "loss": 0.6691, + "step": 4603 + }, + { + "epoch": 0.48, + "grad_norm": 3.576002951182326, + "learning_rate": 5.4942726061612564e-06, + "loss": 0.7081, + "step": 4604 + }, + { + "epoch": 0.48, + "grad_norm": 6.002512038489308, + "learning_rate": 5.492576686842186e-06, + "loss": 0.5352, + "step": 4605 + }, + { + "epoch": 0.48, + "grad_norm": 3.3897993264643347, + "learning_rate": 5.49088071029698e-06, + "loss": 0.6308, + "step": 4606 + }, + { + "epoch": 0.48, + "grad_norm": 2.409849318201643, + "learning_rate": 5.489184676722673e-06, + "loss": 0.6008, + "step": 4607 + }, + { + "epoch": 0.48, + "grad_norm": 2.5209078919290677, + "learning_rate": 5.487488586316304e-06, + "loss": 0.609, + "step": 4608 + }, + { + "epoch": 0.49, + "grad_norm": 2.3412970675636466, + "learning_rate": 5.485792439274919e-06, + "loss": 0.644, + "step": 4609 + }, + { + "epoch": 0.49, + "grad_norm": 0.9448249147025708, + "learning_rate": 5.484096235795574e-06, + "loss": 0.5841, + "step": 4610 + }, + { + "epoch": 0.49, + "grad_norm": 5.4569880167790625, + "learning_rate": 5.482399976075327e-06, + "loss": 0.6169, + "step": 4611 + }, + { + "epoch": 0.49, + "grad_norm": 4.038821439830503, + "learning_rate": 5.4807036603112465e-06, + "loss": 0.6924, + "step": 4612 + }, + { + "epoch": 0.49, + "grad_norm": 2.1488134768098157, + "learning_rate": 5.479007288700403e-06, + "loss": 0.6386, + "step": 4613 + }, + { + "epoch": 0.49, + "grad_norm": 3.0696890955921132, + "learning_rate": 5.477310861439877e-06, + "loss": 0.6493, + "step": 4614 + }, + { + "epoch": 0.49, + "grad_norm": 2.5185747574560793, + "learning_rate": 5.475614378726757e-06, + "loss": 0.6438, + "step": 4615 + }, + { + "epoch": 0.49, + "grad_norm": 2.0897327507354406, + "learning_rate": 5.4739178407581315e-06, + "loss": 0.5929, + "step": 4616 + }, + { + "epoch": 0.49, + "grad_norm": 2.6311884639260477, + "learning_rate": 5.4722212477311025e-06, + "loss": 0.6394, + "step": 4617 + }, + { + "epoch": 0.49, + "grad_norm": 2.269733417286331, + "learning_rate": 5.470524599842773e-06, + "loss": 0.5718, + "step": 4618 + }, + { + "epoch": 0.49, + "grad_norm": 1.021071958636412, + "learning_rate": 5.468827897290256e-06, + "loss": 0.5784, + "step": 4619 + }, + { + "epoch": 0.49, + "grad_norm": 3.5006735943403235, + "learning_rate": 5.46713114027067e-06, + "loss": 0.6056, + "step": 4620 + }, + { + "epoch": 0.49, + "grad_norm": 1.0266506883030095, + "learning_rate": 5.465434328981136e-06, + "loss": 0.5808, + "step": 4621 + }, + { + "epoch": 0.49, + "grad_norm": 3.8304477124966176, + "learning_rate": 5.463737463618788e-06, + "loss": 0.7006, + "step": 4622 + }, + { + "epoch": 0.49, + "grad_norm": 2.515291735666162, + "learning_rate": 5.462040544380764e-06, + "loss": 0.7003, + "step": 4623 + }, + { + "epoch": 0.49, + "grad_norm": 3.387353672942185, + "learning_rate": 5.460343571464203e-06, + "loss": 0.6504, + "step": 4624 + }, + { + "epoch": 0.49, + "grad_norm": 4.141288708078109, + "learning_rate": 5.458646545066258e-06, + "loss": 0.5682, + "step": 4625 + }, + { + "epoch": 0.49, + "grad_norm": 2.5542956306423674, + "learning_rate": 5.456949465384082e-06, + "loss": 0.6874, + "step": 4626 + }, + { + "epoch": 0.49, + "grad_norm": 2.9922167702054705, + "learning_rate": 5.455252332614839e-06, + "loss": 0.6838, + "step": 4627 + }, + { + "epoch": 0.49, + "grad_norm": 2.914946760609991, + "learning_rate": 5.453555146955696e-06, + "loss": 0.6331, + "step": 4628 + }, + { + "epoch": 0.49, + "grad_norm": 3.352573285600877, + "learning_rate": 5.451857908603826e-06, + "loss": 0.635, + "step": 4629 + }, + { + "epoch": 0.49, + "grad_norm": 2.6733703812603755, + "learning_rate": 5.450160617756411e-06, + "loss": 0.6508, + "step": 4630 + }, + { + "epoch": 0.49, + "grad_norm": 2.349826396279672, + "learning_rate": 5.448463274610637e-06, + "loss": 0.6848, + "step": 4631 + }, + { + "epoch": 0.49, + "grad_norm": 5.81980906330448, + "learning_rate": 5.446765879363697e-06, + "loss": 0.6457, + "step": 4632 + }, + { + "epoch": 0.49, + "grad_norm": 2.651707782689453, + "learning_rate": 5.445068432212787e-06, + "loss": 0.6972, + "step": 4633 + }, + { + "epoch": 0.49, + "grad_norm": 2.3655792841646512, + "learning_rate": 5.443370933355114e-06, + "loss": 0.5924, + "step": 4634 + }, + { + "epoch": 0.49, + "grad_norm": 2.7313353670254203, + "learning_rate": 5.441673382987886e-06, + "loss": 0.616, + "step": 4635 + }, + { + "epoch": 0.49, + "grad_norm": 2.4130469073654335, + "learning_rate": 5.439975781308322e-06, + "loss": 0.6315, + "step": 4636 + }, + { + "epoch": 0.49, + "grad_norm": 3.597140459042695, + "learning_rate": 5.4382781285136445e-06, + "loss": 0.6355, + "step": 4637 + }, + { + "epoch": 0.49, + "grad_norm": 2.2052663360438993, + "learning_rate": 5.436580424801081e-06, + "loss": 0.6403, + "step": 4638 + }, + { + "epoch": 0.49, + "grad_norm": 2.0955992547403572, + "learning_rate": 5.434882670367865e-06, + "loss": 0.5755, + "step": 4639 + }, + { + "epoch": 0.49, + "grad_norm": 2.7478098709325316, + "learning_rate": 5.4331848654112374e-06, + "loss": 0.713, + "step": 4640 + }, + { + "epoch": 0.49, + "grad_norm": 3.099989483770273, + "learning_rate": 5.431487010128445e-06, + "loss": 0.6117, + "step": 4641 + }, + { + "epoch": 0.49, + "grad_norm": 2.7349941290390514, + "learning_rate": 5.4297891047167385e-06, + "loss": 0.6223, + "step": 4642 + }, + { + "epoch": 0.49, + "grad_norm": 1.0659832601686883, + "learning_rate": 5.428091149373377e-06, + "loss": 0.5199, + "step": 4643 + }, + { + "epoch": 0.49, + "grad_norm": 9.61686418980415, + "learning_rate": 5.426393144295623e-06, + "loss": 0.6323, + "step": 4644 + }, + { + "epoch": 0.49, + "grad_norm": 3.013180159122992, + "learning_rate": 5.4246950896807445e-06, + "loss": 0.6634, + "step": 4645 + }, + { + "epoch": 0.49, + "grad_norm": 3.7524905717817365, + "learning_rate": 5.422996985726019e-06, + "loss": 0.5842, + "step": 4646 + }, + { + "epoch": 0.49, + "grad_norm": 2.2515177137365296, + "learning_rate": 5.421298832628729e-06, + "loss": 0.6197, + "step": 4647 + }, + { + "epoch": 0.49, + "grad_norm": 3.427073007294642, + "learning_rate": 5.419600630586155e-06, + "loss": 0.6538, + "step": 4648 + }, + { + "epoch": 0.49, + "grad_norm": 2.743461241178892, + "learning_rate": 5.417902379795593e-06, + "loss": 0.6647, + "step": 4649 + }, + { + "epoch": 0.49, + "grad_norm": 2.3931710581662897, + "learning_rate": 5.416204080454343e-06, + "loss": 0.6355, + "step": 4650 + }, + { + "epoch": 0.49, + "grad_norm": 3.8194753476800356, + "learning_rate": 5.414505732759704e-06, + "loss": 0.6645, + "step": 4651 + }, + { + "epoch": 0.49, + "grad_norm": 2.6326764418481194, + "learning_rate": 5.412807336908987e-06, + "loss": 0.6086, + "step": 4652 + }, + { + "epoch": 0.49, + "grad_norm": 2.284399679999572, + "learning_rate": 5.411108893099508e-06, + "loss": 0.5637, + "step": 4653 + }, + { + "epoch": 0.49, + "grad_norm": 6.7384502492836775, + "learning_rate": 5.409410401528586e-06, + "loss": 0.622, + "step": 4654 + }, + { + "epoch": 0.49, + "grad_norm": 3.61822403374306, + "learning_rate": 5.4077118623935476e-06, + "loss": 0.6361, + "step": 4655 + }, + { + "epoch": 0.49, + "grad_norm": 4.2720764272007425, + "learning_rate": 5.406013275891723e-06, + "loss": 0.7198, + "step": 4656 + }, + { + "epoch": 0.49, + "grad_norm": 3.533871073074416, + "learning_rate": 5.404314642220448e-06, + "loss": 0.6521, + "step": 4657 + }, + { + "epoch": 0.49, + "grad_norm": 2.5014645838243834, + "learning_rate": 5.40261596157707e-06, + "loss": 0.623, + "step": 4658 + }, + { + "epoch": 0.49, + "grad_norm": 5.187282413928454, + "learning_rate": 5.40091723415893e-06, + "loss": 0.6687, + "step": 4659 + }, + { + "epoch": 0.49, + "grad_norm": 2.210081592011959, + "learning_rate": 5.399218460163387e-06, + "loss": 0.5783, + "step": 4660 + }, + { + "epoch": 0.49, + "grad_norm": 2.9686826745928863, + "learning_rate": 5.397519639787796e-06, + "loss": 0.6148, + "step": 4661 + }, + { + "epoch": 0.49, + "grad_norm": 2.8313621442070436, + "learning_rate": 5.395820773229523e-06, + "loss": 0.6224, + "step": 4662 + }, + { + "epoch": 0.49, + "grad_norm": 3.837090983146463, + "learning_rate": 5.394121860685937e-06, + "loss": 0.5873, + "step": 4663 + }, + { + "epoch": 0.49, + "grad_norm": 2.7825956148863606, + "learning_rate": 5.392422902354413e-06, + "loss": 0.6691, + "step": 4664 + }, + { + "epoch": 0.49, + "grad_norm": 3.3329282867648136, + "learning_rate": 5.39072389843233e-06, + "loss": 0.6057, + "step": 4665 + }, + { + "epoch": 0.49, + "grad_norm": 3.5608187127868387, + "learning_rate": 5.389024849117074e-06, + "loss": 0.705, + "step": 4666 + }, + { + "epoch": 0.49, + "grad_norm": 2.706689822341293, + "learning_rate": 5.387325754606035e-06, + "loss": 0.7137, + "step": 4667 + }, + { + "epoch": 0.49, + "grad_norm": 2.5224351370813674, + "learning_rate": 5.3856266150966094e-06, + "loss": 0.6446, + "step": 4668 + }, + { + "epoch": 0.49, + "grad_norm": 4.120747037478334, + "learning_rate": 5.3839274307862e-06, + "loss": 0.6706, + "step": 4669 + }, + { + "epoch": 0.49, + "grad_norm": 3.6521412088636325, + "learning_rate": 5.3822282018722085e-06, + "loss": 0.7024, + "step": 4670 + }, + { + "epoch": 0.49, + "grad_norm": 2.364628947418871, + "learning_rate": 5.380528928552052e-06, + "loss": 0.7261, + "step": 4671 + }, + { + "epoch": 0.49, + "grad_norm": 2.7339482051464534, + "learning_rate": 5.378829611023144e-06, + "loss": 0.6781, + "step": 4672 + }, + { + "epoch": 0.49, + "grad_norm": 2.478305759343481, + "learning_rate": 5.377130249482907e-06, + "loss": 0.6225, + "step": 4673 + }, + { + "epoch": 0.49, + "grad_norm": 2.411215799916555, + "learning_rate": 5.3754308441287675e-06, + "loss": 0.6103, + "step": 4674 + }, + { + "epoch": 0.49, + "grad_norm": 2.42557822909133, + "learning_rate": 5.3737313951581575e-06, + "loss": 0.5748, + "step": 4675 + }, + { + "epoch": 0.49, + "grad_norm": 2.6013955730359046, + "learning_rate": 5.372031902768514e-06, + "loss": 0.5715, + "step": 4676 + }, + { + "epoch": 0.49, + "grad_norm": 3.1819025201627946, + "learning_rate": 5.370332367157281e-06, + "loss": 0.7339, + "step": 4677 + }, + { + "epoch": 0.49, + "grad_norm": 2.588545731682473, + "learning_rate": 5.368632788521903e-06, + "loss": 0.6315, + "step": 4678 + }, + { + "epoch": 0.49, + "grad_norm": 3.008669398229263, + "learning_rate": 5.3669331670598335e-06, + "loss": 0.7626, + "step": 4679 + }, + { + "epoch": 0.49, + "grad_norm": 3.43609159650542, + "learning_rate": 5.36523350296853e-06, + "loss": 0.7089, + "step": 4680 + }, + { + "epoch": 0.49, + "grad_norm": 3.121151492979661, + "learning_rate": 5.363533796445452e-06, + "loss": 0.6616, + "step": 4681 + }, + { + "epoch": 0.49, + "grad_norm": 2.7034182260790494, + "learning_rate": 5.361834047688071e-06, + "loss": 0.6528, + "step": 4682 + }, + { + "epoch": 0.49, + "grad_norm": 5.478076643546118, + "learning_rate": 5.360134256893854e-06, + "loss": 0.6114, + "step": 4683 + }, + { + "epoch": 0.49, + "grad_norm": 2.6079287424630264, + "learning_rate": 5.35843442426028e-06, + "loss": 0.6421, + "step": 4684 + }, + { + "epoch": 0.49, + "grad_norm": 2.685462278835273, + "learning_rate": 5.356734549984832e-06, + "loss": 0.5111, + "step": 4685 + }, + { + "epoch": 0.49, + "grad_norm": 2.979817349950144, + "learning_rate": 5.355034634264996e-06, + "loss": 0.6508, + "step": 4686 + }, + { + "epoch": 0.49, + "grad_norm": 3.0200409785087854, + "learning_rate": 5.353334677298261e-06, + "loss": 0.6656, + "step": 4687 + }, + { + "epoch": 0.49, + "grad_norm": 2.646554726463927, + "learning_rate": 5.351634679282125e-06, + "loss": 0.5935, + "step": 4688 + }, + { + "epoch": 0.49, + "grad_norm": 2.6235037363889915, + "learning_rate": 5.349934640414089e-06, + "loss": 0.6812, + "step": 4689 + }, + { + "epoch": 0.49, + "grad_norm": 9.07421728360287, + "learning_rate": 5.348234560891657e-06, + "loss": 0.6569, + "step": 4690 + }, + { + "epoch": 0.49, + "grad_norm": 36.97843704858725, + "learning_rate": 5.346534440912341e-06, + "loss": 0.6354, + "step": 4691 + }, + { + "epoch": 0.49, + "grad_norm": 2.4594375427225232, + "learning_rate": 5.3448342806736545e-06, + "loss": 0.6174, + "step": 4692 + }, + { + "epoch": 0.49, + "grad_norm": 2.8018238508168425, + "learning_rate": 5.34313408037312e-06, + "loss": 0.5667, + "step": 4693 + }, + { + "epoch": 0.49, + "grad_norm": 3.009916868253691, + "learning_rate": 5.341433840208258e-06, + "loss": 0.7038, + "step": 4694 + }, + { + "epoch": 0.49, + "grad_norm": 4.258548590941279, + "learning_rate": 5.339733560376601e-06, + "loss": 0.719, + "step": 4695 + }, + { + "epoch": 0.49, + "grad_norm": 2.61623774709776, + "learning_rate": 5.33803324107568e-06, + "loss": 0.7362, + "step": 4696 + }, + { + "epoch": 0.49, + "grad_norm": 2.7165699079262935, + "learning_rate": 5.336332882503034e-06, + "loss": 0.6397, + "step": 4697 + }, + { + "epoch": 0.49, + "grad_norm": 2.479128754751253, + "learning_rate": 5.334632484856206e-06, + "loss": 0.6062, + "step": 4698 + }, + { + "epoch": 0.49, + "grad_norm": 3.0822937031597597, + "learning_rate": 5.332932048332744e-06, + "loss": 0.6856, + "step": 4699 + }, + { + "epoch": 0.49, + "grad_norm": 2.670085477402618, + "learning_rate": 5.331231573130199e-06, + "loss": 0.6451, + "step": 4700 + }, + { + "epoch": 0.49, + "grad_norm": 2.710855683571757, + "learning_rate": 5.329531059446127e-06, + "loss": 0.5651, + "step": 4701 + }, + { + "epoch": 0.49, + "grad_norm": 4.256008895457131, + "learning_rate": 5.327830507478089e-06, + "loss": 0.6252, + "step": 4702 + }, + { + "epoch": 0.49, + "grad_norm": 2.1096958021617995, + "learning_rate": 5.32612991742365e-06, + "loss": 0.5849, + "step": 4703 + }, + { + "epoch": 0.5, + "grad_norm": 3.5793139887216263, + "learning_rate": 5.32442928948038e-06, + "loss": 0.6785, + "step": 4704 + }, + { + "epoch": 0.5, + "grad_norm": 2.7963244411549644, + "learning_rate": 5.322728623845853e-06, + "loss": 0.6408, + "step": 4705 + }, + { + "epoch": 0.5, + "grad_norm": 2.182317971739211, + "learning_rate": 5.321027920717649e-06, + "loss": 0.6363, + "step": 4706 + }, + { + "epoch": 0.5, + "grad_norm": 3.7507644271214393, + "learning_rate": 5.319327180293347e-06, + "loss": 0.635, + "step": 4707 + }, + { + "epoch": 0.5, + "grad_norm": 2.075170533407163, + "learning_rate": 5.317626402770537e-06, + "loss": 0.6228, + "step": 4708 + }, + { + "epoch": 0.5, + "grad_norm": 2.841933692244469, + "learning_rate": 5.3159255883468095e-06, + "loss": 0.6796, + "step": 4709 + }, + { + "epoch": 0.5, + "grad_norm": 3.6806157165268347, + "learning_rate": 5.314224737219761e-06, + "loss": 0.6646, + "step": 4710 + }, + { + "epoch": 0.5, + "grad_norm": 2.533486664828576, + "learning_rate": 5.31252384958699e-06, + "loss": 0.711, + "step": 4711 + }, + { + "epoch": 0.5, + "grad_norm": 2.7604461850517406, + "learning_rate": 5.310822925646103e-06, + "loss": 0.604, + "step": 4712 + }, + { + "epoch": 0.5, + "grad_norm": 2.578358994492397, + "learning_rate": 5.309121965594706e-06, + "loss": 0.6351, + "step": 4713 + }, + { + "epoch": 0.5, + "grad_norm": 2.579513564368866, + "learning_rate": 5.307420969630412e-06, + "loss": 0.7123, + "step": 4714 + }, + { + "epoch": 0.5, + "grad_norm": 3.582532356341639, + "learning_rate": 5.30571993795084e-06, + "loss": 0.6549, + "step": 4715 + }, + { + "epoch": 0.5, + "grad_norm": 2.5183375541671986, + "learning_rate": 5.304018870753608e-06, + "loss": 0.6463, + "step": 4716 + }, + { + "epoch": 0.5, + "grad_norm": 1.0420447788319127, + "learning_rate": 5.3023177682363435e-06, + "loss": 0.5367, + "step": 4717 + }, + { + "epoch": 0.5, + "grad_norm": 2.294424245337193, + "learning_rate": 5.300616630596673e-06, + "loss": 0.6515, + "step": 4718 + }, + { + "epoch": 0.5, + "grad_norm": 1.0335405308223342, + "learning_rate": 5.298915458032233e-06, + "loss": 0.5989, + "step": 4719 + }, + { + "epoch": 0.5, + "grad_norm": 1.0427016248786705, + "learning_rate": 5.297214250740658e-06, + "loss": 0.5918, + "step": 4720 + }, + { + "epoch": 0.5, + "grad_norm": 3.004325778129872, + "learning_rate": 5.295513008919592e-06, + "loss": 0.6773, + "step": 4721 + }, + { + "epoch": 0.5, + "grad_norm": 2.3701521965706323, + "learning_rate": 5.293811732766677e-06, + "loss": 0.6755, + "step": 4722 + }, + { + "epoch": 0.5, + "grad_norm": 3.111246958207434, + "learning_rate": 5.292110422479565e-06, + "loss": 0.6284, + "step": 4723 + }, + { + "epoch": 0.5, + "grad_norm": 3.018425742396122, + "learning_rate": 5.290409078255909e-06, + "loss": 0.6031, + "step": 4724 + }, + { + "epoch": 0.5, + "grad_norm": 0.9869690779073568, + "learning_rate": 5.288707700293365e-06, + "loss": 0.5695, + "step": 4725 + }, + { + "epoch": 0.5, + "grad_norm": 1.1072157586028053, + "learning_rate": 5.287006288789596e-06, + "loss": 0.5593, + "step": 4726 + }, + { + "epoch": 0.5, + "grad_norm": 2.3663187157291676, + "learning_rate": 5.285304843942265e-06, + "loss": 0.702, + "step": 4727 + }, + { + "epoch": 0.5, + "grad_norm": 9.046850596094636, + "learning_rate": 5.283603365949043e-06, + "loss": 0.7287, + "step": 4728 + }, + { + "epoch": 0.5, + "grad_norm": 0.9462619367998221, + "learning_rate": 5.2819018550076e-06, + "loss": 0.5409, + "step": 4729 + }, + { + "epoch": 0.5, + "grad_norm": 2.1605371729694633, + "learning_rate": 5.280200311315616e-06, + "loss": 0.6896, + "step": 4730 + }, + { + "epoch": 0.5, + "grad_norm": 2.9185781197431124, + "learning_rate": 5.278498735070769e-06, + "loss": 0.6377, + "step": 4731 + }, + { + "epoch": 0.5, + "grad_norm": 3.5719498173680098, + "learning_rate": 5.2767971264707445e-06, + "loss": 0.6695, + "step": 4732 + }, + { + "epoch": 0.5, + "grad_norm": 2.8324365081609333, + "learning_rate": 5.27509548571323e-06, + "loss": 0.5886, + "step": 4733 + }, + { + "epoch": 0.5, + "grad_norm": 2.705346452985015, + "learning_rate": 5.273393812995917e-06, + "loss": 0.6509, + "step": 4734 + }, + { + "epoch": 0.5, + "grad_norm": 3.9814173861586815, + "learning_rate": 5.271692108516501e-06, + "loss": 0.7016, + "step": 4735 + }, + { + "epoch": 0.5, + "grad_norm": 6.424011098877338, + "learning_rate": 5.269990372472682e-06, + "loss": 0.6616, + "step": 4736 + }, + { + "epoch": 0.5, + "grad_norm": 2.1493045527463837, + "learning_rate": 5.2682886050621604e-06, + "loss": 0.6617, + "step": 4737 + }, + { + "epoch": 0.5, + "grad_norm": 3.716615071908531, + "learning_rate": 5.266586806482646e-06, + "loss": 0.6362, + "step": 4738 + }, + { + "epoch": 0.5, + "grad_norm": 3.118626444685795, + "learning_rate": 5.264884976931845e-06, + "loss": 0.6424, + "step": 4739 + }, + { + "epoch": 0.5, + "grad_norm": 2.541880011041433, + "learning_rate": 5.263183116607474e-06, + "loss": 0.5951, + "step": 4740 + }, + { + "epoch": 0.5, + "grad_norm": 1.1285558503464246, + "learning_rate": 5.261481225707251e-06, + "loss": 0.588, + "step": 4741 + }, + { + "epoch": 0.5, + "grad_norm": 2.916993130135385, + "learning_rate": 5.259779304428893e-06, + "loss": 0.6042, + "step": 4742 + }, + { + "epoch": 0.5, + "grad_norm": 2.7254833052810423, + "learning_rate": 5.258077352970128e-06, + "loss": 0.6621, + "step": 4743 + }, + { + "epoch": 0.5, + "grad_norm": 3.462602034967814, + "learning_rate": 5.256375371528681e-06, + "loss": 0.5998, + "step": 4744 + }, + { + "epoch": 0.5, + "grad_norm": 2.520686792805083, + "learning_rate": 5.254673360302284e-06, + "loss": 0.6472, + "step": 4745 + }, + { + "epoch": 0.5, + "grad_norm": 3.459934646842888, + "learning_rate": 5.252971319488672e-06, + "loss": 0.6504, + "step": 4746 + }, + { + "epoch": 0.5, + "grad_norm": 1.0506864779125666, + "learning_rate": 5.2512692492855845e-06, + "loss": 0.5606, + "step": 4747 + }, + { + "epoch": 0.5, + "grad_norm": 2.191202095280542, + "learning_rate": 5.249567149890762e-06, + "loss": 0.6614, + "step": 4748 + }, + { + "epoch": 0.5, + "grad_norm": 10.032018653356111, + "learning_rate": 5.247865021501949e-06, + "loss": 0.608, + "step": 4749 + }, + { + "epoch": 0.5, + "grad_norm": 3.5143563476752124, + "learning_rate": 5.2461628643168935e-06, + "loss": 0.5829, + "step": 4750 + }, + { + "epoch": 0.5, + "grad_norm": 3.066214123433812, + "learning_rate": 5.244460678533349e-06, + "loss": 0.6338, + "step": 4751 + }, + { + "epoch": 0.5, + "grad_norm": 3.7234112282653116, + "learning_rate": 5.24275846434907e-06, + "loss": 0.6476, + "step": 4752 + }, + { + "epoch": 0.5, + "grad_norm": 2.5886795596777463, + "learning_rate": 5.2410562219618135e-06, + "loss": 0.652, + "step": 4753 + }, + { + "epoch": 0.5, + "grad_norm": 2.5480441407074443, + "learning_rate": 5.239353951569342e-06, + "loss": 0.5687, + "step": 4754 + }, + { + "epoch": 0.5, + "grad_norm": 2.4463031639209403, + "learning_rate": 5.2376516533694196e-06, + "loss": 0.5338, + "step": 4755 + }, + { + "epoch": 0.5, + "grad_norm": 2.257125530689949, + "learning_rate": 5.235949327559817e-06, + "loss": 0.6191, + "step": 4756 + }, + { + "epoch": 0.5, + "grad_norm": 2.1011431880055143, + "learning_rate": 5.2342469743383026e-06, + "loss": 0.6559, + "step": 4757 + }, + { + "epoch": 0.5, + "grad_norm": 3.487907737324848, + "learning_rate": 5.232544593902652e-06, + "loss": 0.6932, + "step": 4758 + }, + { + "epoch": 0.5, + "grad_norm": 2.402578163767159, + "learning_rate": 5.230842186450642e-06, + "loss": 0.6343, + "step": 4759 + }, + { + "epoch": 0.5, + "grad_norm": 3.077478548938603, + "learning_rate": 5.2291397521800545e-06, + "loss": 0.6685, + "step": 4760 + }, + { + "epoch": 0.5, + "grad_norm": 2.2841356416265044, + "learning_rate": 5.227437291288674e-06, + "loss": 0.6646, + "step": 4761 + }, + { + "epoch": 0.5, + "grad_norm": 2.7800487949493458, + "learning_rate": 5.225734803974285e-06, + "loss": 0.6371, + "step": 4762 + }, + { + "epoch": 0.5, + "grad_norm": 2.2976515988461337, + "learning_rate": 5.22403229043468e-06, + "loss": 0.6387, + "step": 4763 + }, + { + "epoch": 0.5, + "grad_norm": 2.6430943743263358, + "learning_rate": 5.222329750867649e-06, + "loss": 0.6613, + "step": 4764 + }, + { + "epoch": 0.5, + "grad_norm": 2.659848728533461, + "learning_rate": 5.220627185470993e-06, + "loss": 0.614, + "step": 4765 + }, + { + "epoch": 0.5, + "grad_norm": 3.5083748901897884, + "learning_rate": 5.218924594442507e-06, + "loss": 0.5716, + "step": 4766 + }, + { + "epoch": 0.5, + "grad_norm": 2.8298517158269325, + "learning_rate": 5.217221977979996e-06, + "loss": 0.6882, + "step": 4767 + }, + { + "epoch": 0.5, + "grad_norm": 3.7161705013278534, + "learning_rate": 5.215519336281261e-06, + "loss": 0.7084, + "step": 4768 + }, + { + "epoch": 0.5, + "grad_norm": 2.4592185775923303, + "learning_rate": 5.213816669544114e-06, + "loss": 0.5649, + "step": 4769 + }, + { + "epoch": 0.5, + "grad_norm": 2.7485436454433247, + "learning_rate": 5.2121139779663645e-06, + "loss": 0.6664, + "step": 4770 + }, + { + "epoch": 0.5, + "grad_norm": 3.0981595313863677, + "learning_rate": 5.2104112617458254e-06, + "loss": 0.6766, + "step": 4771 + }, + { + "epoch": 0.5, + "grad_norm": 4.0052501193362255, + "learning_rate": 5.2087085210803145e-06, + "loss": 0.7007, + "step": 4772 + }, + { + "epoch": 0.5, + "grad_norm": 3.1047565880215737, + "learning_rate": 5.207005756167651e-06, + "loss": 0.6448, + "step": 4773 + }, + { + "epoch": 0.5, + "grad_norm": 2.792375318373496, + "learning_rate": 5.205302967205657e-06, + "loss": 0.6724, + "step": 4774 + }, + { + "epoch": 0.5, + "grad_norm": 2.51567986610389, + "learning_rate": 5.203600154392158e-06, + "loss": 0.606, + "step": 4775 + }, + { + "epoch": 0.5, + "grad_norm": 2.5125662323316287, + "learning_rate": 5.2018973179249824e-06, + "loss": 0.6156, + "step": 4776 + }, + { + "epoch": 0.5, + "grad_norm": 2.2159857718006224, + "learning_rate": 5.200194458001958e-06, + "loss": 0.5654, + "step": 4777 + }, + { + "epoch": 0.5, + "grad_norm": 2.9452532347483427, + "learning_rate": 5.198491574820923e-06, + "loss": 0.603, + "step": 4778 + }, + { + "epoch": 0.5, + "grad_norm": 6.142355113115631, + "learning_rate": 5.196788668579708e-06, + "loss": 0.6565, + "step": 4779 + }, + { + "epoch": 0.5, + "grad_norm": 2.2887002234975764, + "learning_rate": 5.195085739476156e-06, + "loss": 0.6851, + "step": 4780 + }, + { + "epoch": 0.5, + "grad_norm": 3.085541461688833, + "learning_rate": 5.193382787708106e-06, + "loss": 0.5886, + "step": 4781 + }, + { + "epoch": 0.5, + "grad_norm": 7.500928456155343, + "learning_rate": 5.191679813473402e-06, + "loss": 0.5474, + "step": 4782 + }, + { + "epoch": 0.5, + "grad_norm": 6.985875727924238, + "learning_rate": 5.189976816969892e-06, + "loss": 0.6326, + "step": 4783 + }, + { + "epoch": 0.5, + "grad_norm": 3.1970563840062383, + "learning_rate": 5.188273798395425e-06, + "loss": 0.7014, + "step": 4784 + }, + { + "epoch": 0.5, + "grad_norm": 3.241455526944972, + "learning_rate": 5.186570757947852e-06, + "loss": 0.731, + "step": 4785 + }, + { + "epoch": 0.5, + "grad_norm": 3.576374543674953, + "learning_rate": 5.1848676958250265e-06, + "loss": 0.6363, + "step": 4786 + }, + { + "epoch": 0.5, + "grad_norm": 2.335548183754556, + "learning_rate": 5.183164612224809e-06, + "loss": 0.583, + "step": 4787 + }, + { + "epoch": 0.5, + "grad_norm": 2.4154405135354136, + "learning_rate": 5.181461507345054e-06, + "loss": 0.5799, + "step": 4788 + }, + { + "epoch": 0.5, + "grad_norm": 3.5423104288104916, + "learning_rate": 5.1797583813836285e-06, + "loss": 0.6326, + "step": 4789 + }, + { + "epoch": 0.5, + "grad_norm": 2.5045963993199747, + "learning_rate": 5.178055234538391e-06, + "loss": 0.6283, + "step": 4790 + }, + { + "epoch": 0.5, + "grad_norm": 2.0244708876329987, + "learning_rate": 5.176352067007213e-06, + "loss": 0.6263, + "step": 4791 + }, + { + "epoch": 0.5, + "grad_norm": 2.255397053263638, + "learning_rate": 5.174648878987959e-06, + "loss": 0.6359, + "step": 4792 + }, + { + "epoch": 0.5, + "grad_norm": 3.722706053854962, + "learning_rate": 5.1729456706785055e-06, + "loss": 0.6435, + "step": 4793 + }, + { + "epoch": 0.5, + "grad_norm": 2.075448749546548, + "learning_rate": 5.1712424422767224e-06, + "loss": 0.6811, + "step": 4794 + }, + { + "epoch": 0.5, + "grad_norm": 2.688883878063154, + "learning_rate": 5.169539193980489e-06, + "loss": 0.6266, + "step": 4795 + }, + { + "epoch": 0.5, + "grad_norm": 2.9204150200499615, + "learning_rate": 5.1678359259876824e-06, + "loss": 0.6795, + "step": 4796 + }, + { + "epoch": 0.5, + "grad_norm": 1.1559522338055792, + "learning_rate": 5.1661326384961805e-06, + "loss": 0.5607, + "step": 4797 + }, + { + "epoch": 0.5, + "grad_norm": 1.9541032407056105, + "learning_rate": 5.164429331703871e-06, + "loss": 0.6785, + "step": 4798 + }, + { + "epoch": 0.5, + "grad_norm": 0.8942781057490377, + "learning_rate": 5.162726005808636e-06, + "loss": 0.5714, + "step": 4799 + }, + { + "epoch": 0.51, + "grad_norm": 1.079312812042359, + "learning_rate": 5.1610226610083655e-06, + "loss": 0.5694, + "step": 4800 + }, + { + "epoch": 0.51, + "grad_norm": 3.277923548372197, + "learning_rate": 5.159319297500945e-06, + "loss": 0.6173, + "step": 4801 + }, + { + "epoch": 0.51, + "grad_norm": 2.986097736979872, + "learning_rate": 5.157615915484273e-06, + "loss": 0.6071, + "step": 4802 + }, + { + "epoch": 0.51, + "grad_norm": 3.2600607771581642, + "learning_rate": 5.155912515156236e-06, + "loss": 0.6338, + "step": 4803 + }, + { + "epoch": 0.51, + "grad_norm": 4.444448387829765, + "learning_rate": 5.154209096714736e-06, + "loss": 0.635, + "step": 4804 + }, + { + "epoch": 0.51, + "grad_norm": 2.2143014150947318, + "learning_rate": 5.152505660357667e-06, + "loss": 0.6542, + "step": 4805 + }, + { + "epoch": 0.51, + "grad_norm": 2.685218090583335, + "learning_rate": 5.150802206282932e-06, + "loss": 0.5947, + "step": 4806 + }, + { + "epoch": 0.51, + "grad_norm": 2.593209127978214, + "learning_rate": 5.149098734688434e-06, + "loss": 0.6186, + "step": 4807 + }, + { + "epoch": 0.51, + "grad_norm": 4.2086494702555886, + "learning_rate": 5.147395245772074e-06, + "loss": 0.737, + "step": 4808 + }, + { + "epoch": 0.51, + "grad_norm": 10.188890598572003, + "learning_rate": 5.145691739731761e-06, + "loss": 0.6247, + "step": 4809 + }, + { + "epoch": 0.51, + "grad_norm": 9.44137252928556, + "learning_rate": 5.143988216765402e-06, + "loss": 0.6595, + "step": 4810 + }, + { + "epoch": 0.51, + "grad_norm": 3.172419643746679, + "learning_rate": 5.142284677070911e-06, + "loss": 0.6992, + "step": 4811 + }, + { + "epoch": 0.51, + "grad_norm": 2.952102588392533, + "learning_rate": 5.140581120846194e-06, + "loss": 0.6163, + "step": 4812 + }, + { + "epoch": 0.51, + "grad_norm": 2.6735180292043665, + "learning_rate": 5.138877548289173e-06, + "loss": 0.6745, + "step": 4813 + }, + { + "epoch": 0.51, + "grad_norm": 3.3271844953089964, + "learning_rate": 5.137173959597755e-06, + "loss": 0.7083, + "step": 4814 + }, + { + "epoch": 0.51, + "grad_norm": 5.779885373526377, + "learning_rate": 5.135470354969867e-06, + "loss": 0.5758, + "step": 4815 + }, + { + "epoch": 0.51, + "grad_norm": 3.880291332502046, + "learning_rate": 5.1337667346034226e-06, + "loss": 0.6426, + "step": 4816 + }, + { + "epoch": 0.51, + "grad_norm": 2.789682189366147, + "learning_rate": 5.132063098696346e-06, + "loss": 0.6936, + "step": 4817 + }, + { + "epoch": 0.51, + "grad_norm": 2.5838279711205714, + "learning_rate": 5.130359447446561e-06, + "loss": 0.6659, + "step": 4818 + }, + { + "epoch": 0.51, + "grad_norm": 3.4258511041007518, + "learning_rate": 5.128655781051991e-06, + "loss": 0.6169, + "step": 4819 + }, + { + "epoch": 0.51, + "grad_norm": 3.6136439580060657, + "learning_rate": 5.126952099710566e-06, + "loss": 0.6806, + "step": 4820 + }, + { + "epoch": 0.51, + "grad_norm": 3.209126945224101, + "learning_rate": 5.125248403620211e-06, + "loss": 0.6683, + "step": 4821 + }, + { + "epoch": 0.51, + "grad_norm": 2.809317990932967, + "learning_rate": 5.12354469297886e-06, + "loss": 0.6826, + "step": 4822 + }, + { + "epoch": 0.51, + "grad_norm": 3.2384938127477425, + "learning_rate": 5.121840967984443e-06, + "loss": 0.6434, + "step": 4823 + }, + { + "epoch": 0.51, + "grad_norm": 2.2368138450617656, + "learning_rate": 5.120137228834896e-06, + "loss": 0.6217, + "step": 4824 + }, + { + "epoch": 0.51, + "grad_norm": 2.452259843593572, + "learning_rate": 5.1184334757281506e-06, + "loss": 0.6101, + "step": 4825 + }, + { + "epoch": 0.51, + "grad_norm": 2.585727076856822, + "learning_rate": 5.1167297088621485e-06, + "loss": 0.6537, + "step": 4826 + }, + { + "epoch": 0.51, + "grad_norm": 4.454616726505247, + "learning_rate": 5.1150259284348246e-06, + "loss": 0.571, + "step": 4827 + }, + { + "epoch": 0.51, + "grad_norm": 4.132538433226866, + "learning_rate": 5.113322134644122e-06, + "loss": 0.6571, + "step": 4828 + }, + { + "epoch": 0.51, + "grad_norm": 2.143250401873857, + "learning_rate": 5.111618327687981e-06, + "loss": 0.5883, + "step": 4829 + }, + { + "epoch": 0.51, + "grad_norm": 6.356716637447877, + "learning_rate": 5.109914507764345e-06, + "loss": 0.6736, + "step": 4830 + }, + { + "epoch": 0.51, + "grad_norm": 2.7253830188411547, + "learning_rate": 5.108210675071159e-06, + "loss": 0.6881, + "step": 4831 + }, + { + "epoch": 0.51, + "grad_norm": 3.4516235016563788, + "learning_rate": 5.1065068298063705e-06, + "loss": 0.6512, + "step": 4832 + }, + { + "epoch": 0.51, + "grad_norm": 2.7073002106917237, + "learning_rate": 5.104802972167926e-06, + "loss": 0.6957, + "step": 4833 + }, + { + "epoch": 0.51, + "grad_norm": 2.7841704207983984, + "learning_rate": 5.103099102353775e-06, + "loss": 0.5775, + "step": 4834 + }, + { + "epoch": 0.51, + "grad_norm": 7.9292838099679965, + "learning_rate": 5.101395220561869e-06, + "loss": 0.7005, + "step": 4835 + }, + { + "epoch": 0.51, + "grad_norm": 2.7773812390741077, + "learning_rate": 5.099691326990158e-06, + "loss": 0.5243, + "step": 4836 + }, + { + "epoch": 0.51, + "grad_norm": 3.26055135706108, + "learning_rate": 5.0979874218365985e-06, + "loss": 0.6554, + "step": 4837 + }, + { + "epoch": 0.51, + "grad_norm": 2.93701844263028, + "learning_rate": 5.096283505299142e-06, + "loss": 0.5827, + "step": 4838 + }, + { + "epoch": 0.51, + "grad_norm": 2.661195675756214, + "learning_rate": 5.094579577575748e-06, + "loss": 0.7464, + "step": 4839 + }, + { + "epoch": 0.51, + "grad_norm": 4.4434201960292805, + "learning_rate": 5.09287563886437e-06, + "loss": 0.6926, + "step": 4840 + }, + { + "epoch": 0.51, + "grad_norm": 2.8619375312083424, + "learning_rate": 5.0911716893629695e-06, + "loss": 0.7326, + "step": 4841 + }, + { + "epoch": 0.51, + "grad_norm": 4.179918987044076, + "learning_rate": 5.089467729269506e-06, + "loss": 0.6202, + "step": 4842 + }, + { + "epoch": 0.51, + "grad_norm": 2.3674261810438875, + "learning_rate": 5.087763758781941e-06, + "loss": 0.5401, + "step": 4843 + }, + { + "epoch": 0.51, + "grad_norm": 2.954478033527312, + "learning_rate": 5.0860597780982345e-06, + "loss": 0.6457, + "step": 4844 + }, + { + "epoch": 0.51, + "grad_norm": 6.1055773678841705, + "learning_rate": 5.084355787416352e-06, + "loss": 0.7077, + "step": 4845 + }, + { + "epoch": 0.51, + "grad_norm": 3.104870089140017, + "learning_rate": 5.08265178693426e-06, + "loss": 0.5571, + "step": 4846 + }, + { + "epoch": 0.51, + "grad_norm": 3.282087698903521, + "learning_rate": 5.08094777684992e-06, + "loss": 0.7171, + "step": 4847 + }, + { + "epoch": 0.51, + "grad_norm": 5.212353373051497, + "learning_rate": 5.079243757361304e-06, + "loss": 0.6634, + "step": 4848 + }, + { + "epoch": 0.51, + "grad_norm": 3.151447480509918, + "learning_rate": 5.077539728666374e-06, + "loss": 0.6715, + "step": 4849 + }, + { + "epoch": 0.51, + "grad_norm": 2.7192044541711002, + "learning_rate": 5.0758356909631055e-06, + "loss": 0.6798, + "step": 4850 + }, + { + "epoch": 0.51, + "grad_norm": 2.7229617539344644, + "learning_rate": 5.074131644449462e-06, + "loss": 0.6132, + "step": 4851 + }, + { + "epoch": 0.51, + "grad_norm": 6.5465212384001665, + "learning_rate": 5.072427589323422e-06, + "loss": 0.6457, + "step": 4852 + }, + { + "epoch": 0.51, + "grad_norm": 2.539686113044392, + "learning_rate": 5.0707235257829525e-06, + "loss": 0.6064, + "step": 4853 + }, + { + "epoch": 0.51, + "grad_norm": 2.8359956675113813, + "learning_rate": 5.069019454026028e-06, + "loss": 0.6889, + "step": 4854 + }, + { + "epoch": 0.51, + "grad_norm": 3.075569786140454, + "learning_rate": 5.067315374250623e-06, + "loss": 0.6786, + "step": 4855 + }, + { + "epoch": 0.51, + "grad_norm": 2.936345810031271, + "learning_rate": 5.065611286654712e-06, + "loss": 0.6724, + "step": 4856 + }, + { + "epoch": 0.51, + "grad_norm": 2.7223753328820366, + "learning_rate": 5.063907191436274e-06, + "loss": 0.6038, + "step": 4857 + }, + { + "epoch": 0.51, + "grad_norm": 2.747137280849216, + "learning_rate": 5.062203088793279e-06, + "loss": 0.6451, + "step": 4858 + }, + { + "epoch": 0.51, + "grad_norm": 2.534090883953568, + "learning_rate": 5.060498978923713e-06, + "loss": 0.7221, + "step": 4859 + }, + { + "epoch": 0.51, + "grad_norm": 2.8222282010733877, + "learning_rate": 5.058794862025548e-06, + "loss": 0.6084, + "step": 4860 + }, + { + "epoch": 0.51, + "grad_norm": 3.0443913355409364, + "learning_rate": 5.057090738296767e-06, + "loss": 0.6554, + "step": 4861 + }, + { + "epoch": 0.51, + "grad_norm": 3.2515144608074653, + "learning_rate": 5.055386607935347e-06, + "loss": 0.5725, + "step": 4862 + }, + { + "epoch": 0.51, + "grad_norm": 1.1037680116930089, + "learning_rate": 5.053682471139275e-06, + "loss": 0.5484, + "step": 4863 + }, + { + "epoch": 0.51, + "grad_norm": 3.5575549338006436, + "learning_rate": 5.051978328106525e-06, + "loss": 0.6632, + "step": 4864 + }, + { + "epoch": 0.51, + "grad_norm": 2.962207242548026, + "learning_rate": 5.050274179035084e-06, + "loss": 0.6792, + "step": 4865 + }, + { + "epoch": 0.51, + "grad_norm": 3.5877705132080306, + "learning_rate": 5.048570024122935e-06, + "loss": 0.6326, + "step": 4866 + }, + { + "epoch": 0.51, + "grad_norm": 6.322301722453874, + "learning_rate": 5.046865863568061e-06, + "loss": 0.7552, + "step": 4867 + }, + { + "epoch": 0.51, + "grad_norm": 6.682490269010231, + "learning_rate": 5.045161697568446e-06, + "loss": 0.6046, + "step": 4868 + }, + { + "epoch": 0.51, + "grad_norm": 3.7731626743807523, + "learning_rate": 5.0434575263220745e-06, + "loss": 0.6787, + "step": 4869 + }, + { + "epoch": 0.51, + "grad_norm": 2.898042881785744, + "learning_rate": 5.041753350026936e-06, + "loss": 0.5913, + "step": 4870 + }, + { + "epoch": 0.51, + "grad_norm": 5.927842102069006, + "learning_rate": 5.0400491688810105e-06, + "loss": 0.6178, + "step": 4871 + }, + { + "epoch": 0.51, + "grad_norm": 1.1051954463354348, + "learning_rate": 5.038344983082292e-06, + "loss": 0.5853, + "step": 4872 + }, + { + "epoch": 0.51, + "grad_norm": 2.71794682870089, + "learning_rate": 5.036640792828761e-06, + "loss": 0.6183, + "step": 4873 + }, + { + "epoch": 0.51, + "grad_norm": 3.746461374115521, + "learning_rate": 5.0349365983184105e-06, + "loss": 0.6803, + "step": 4874 + }, + { + "epoch": 0.51, + "grad_norm": 2.2340308167373597, + "learning_rate": 5.033232399749226e-06, + "loss": 0.6801, + "step": 4875 + }, + { + "epoch": 0.51, + "grad_norm": 2.9189306232219194, + "learning_rate": 5.031528197319197e-06, + "loss": 0.6212, + "step": 4876 + }, + { + "epoch": 0.51, + "grad_norm": 2.3120947610822853, + "learning_rate": 5.0298239912263145e-06, + "loss": 0.6743, + "step": 4877 + }, + { + "epoch": 0.51, + "grad_norm": 2.838555185528894, + "learning_rate": 5.028119781668566e-06, + "loss": 0.6603, + "step": 4878 + }, + { + "epoch": 0.51, + "grad_norm": 1.8865667092436296, + "learning_rate": 5.026415568843943e-06, + "loss": 0.4771, + "step": 4879 + }, + { + "epoch": 0.51, + "grad_norm": 2.7027338628136923, + "learning_rate": 5.024711352950435e-06, + "loss": 0.6176, + "step": 4880 + }, + { + "epoch": 0.51, + "grad_norm": 3.3089641577867184, + "learning_rate": 5.023007134186035e-06, + "loss": 0.5705, + "step": 4881 + }, + { + "epoch": 0.51, + "grad_norm": 2.1851775810718976, + "learning_rate": 5.0213029127487315e-06, + "loss": 0.5628, + "step": 4882 + }, + { + "epoch": 0.51, + "grad_norm": 3.5359776818740016, + "learning_rate": 5.0195986888365175e-06, + "loss": 0.6628, + "step": 4883 + }, + { + "epoch": 0.51, + "grad_norm": 2.6067456420321014, + "learning_rate": 5.017894462647383e-06, + "loss": 0.7029, + "step": 4884 + }, + { + "epoch": 0.51, + "grad_norm": 2.532052200950741, + "learning_rate": 5.0161902343793245e-06, + "loss": 0.6756, + "step": 4885 + }, + { + "epoch": 0.51, + "grad_norm": 2.4514973771189186, + "learning_rate": 5.014486004230329e-06, + "loss": 0.6228, + "step": 4886 + }, + { + "epoch": 0.51, + "grad_norm": 2.9144203054617583, + "learning_rate": 5.012781772398392e-06, + "loss": 0.5954, + "step": 4887 + }, + { + "epoch": 0.51, + "grad_norm": 3.5304442748367006, + "learning_rate": 5.011077539081506e-06, + "loss": 0.6309, + "step": 4888 + }, + { + "epoch": 0.51, + "grad_norm": 2.5115992891771706, + "learning_rate": 5.009373304477663e-06, + "loss": 0.65, + "step": 4889 + }, + { + "epoch": 0.51, + "grad_norm": 6.842298359852735, + "learning_rate": 5.007669068784857e-06, + "loss": 0.5684, + "step": 4890 + }, + { + "epoch": 0.51, + "grad_norm": 2.293124571221643, + "learning_rate": 5.005964832201079e-06, + "loss": 0.6347, + "step": 4891 + }, + { + "epoch": 0.51, + "grad_norm": 12.289205214511725, + "learning_rate": 5.004260594924327e-06, + "loss": 0.649, + "step": 4892 + }, + { + "epoch": 0.51, + "grad_norm": 2.6036414189539125, + "learning_rate": 5.002556357152589e-06, + "loss": 0.6041, + "step": 4893 + }, + { + "epoch": 0.51, + "grad_norm": 2.7202200557498992, + "learning_rate": 5.000852119083863e-06, + "loss": 0.5758, + "step": 4894 + }, + { + "epoch": 0.52, + "grad_norm": 2.769388945590125, + "learning_rate": 4.999147880916139e-06, + "loss": 0.6113, + "step": 4895 + }, + { + "epoch": 0.52, + "grad_norm": 2.9093269736434353, + "learning_rate": 4.997443642847412e-06, + "loss": 0.737, + "step": 4896 + }, + { + "epoch": 0.52, + "grad_norm": 2.6169699031806175, + "learning_rate": 4.995739405075674e-06, + "loss": 0.6174, + "step": 4897 + }, + { + "epoch": 0.52, + "grad_norm": 4.102025420126629, + "learning_rate": 4.994035167798921e-06, + "loss": 0.601, + "step": 4898 + }, + { + "epoch": 0.52, + "grad_norm": 2.5096200600239045, + "learning_rate": 4.992330931215146e-06, + "loss": 0.7385, + "step": 4899 + }, + { + "epoch": 0.52, + "grad_norm": 2.5045722346398853, + "learning_rate": 4.990626695522339e-06, + "loss": 0.7197, + "step": 4900 + }, + { + "epoch": 0.52, + "grad_norm": 2.487306202546889, + "learning_rate": 4.988922460918496e-06, + "loss": 0.5743, + "step": 4901 + }, + { + "epoch": 0.52, + "grad_norm": 5.319440294752822, + "learning_rate": 4.98721822760161e-06, + "loss": 0.7375, + "step": 4902 + }, + { + "epoch": 0.52, + "grad_norm": 2.1289416212364505, + "learning_rate": 4.985513995769672e-06, + "loss": 0.575, + "step": 4903 + }, + { + "epoch": 0.52, + "grad_norm": 2.504486312284673, + "learning_rate": 4.983809765620678e-06, + "loss": 0.689, + "step": 4904 + }, + { + "epoch": 0.52, + "grad_norm": 3.312174671775115, + "learning_rate": 4.9821055373526175e-06, + "loss": 0.5768, + "step": 4905 + }, + { + "epoch": 0.52, + "grad_norm": 3.0889029546172004, + "learning_rate": 4.980401311163483e-06, + "loss": 0.6912, + "step": 4906 + }, + { + "epoch": 0.52, + "grad_norm": 2.355632254622633, + "learning_rate": 4.978697087251269e-06, + "loss": 0.6775, + "step": 4907 + }, + { + "epoch": 0.52, + "grad_norm": 2.2880860706153845, + "learning_rate": 4.976992865813968e-06, + "loss": 0.6585, + "step": 4908 + }, + { + "epoch": 0.52, + "grad_norm": 1.9846108300900807, + "learning_rate": 4.975288647049566e-06, + "loss": 0.5791, + "step": 4909 + }, + { + "epoch": 0.52, + "grad_norm": 2.2342087894997813, + "learning_rate": 4.9735844311560574e-06, + "loss": 0.6124, + "step": 4910 + }, + { + "epoch": 0.52, + "grad_norm": 1.9947124356007078, + "learning_rate": 4.9718802183314345e-06, + "loss": 0.6461, + "step": 4911 + }, + { + "epoch": 0.52, + "grad_norm": 2.197647296374431, + "learning_rate": 4.970176008773688e-06, + "loss": 0.6122, + "step": 4912 + }, + { + "epoch": 0.52, + "grad_norm": 0.9932317458658896, + "learning_rate": 4.9684718026808035e-06, + "loss": 0.5758, + "step": 4913 + }, + { + "epoch": 0.52, + "grad_norm": 2.709681927841805, + "learning_rate": 4.966767600250776e-06, + "loss": 0.6107, + "step": 4914 + }, + { + "epoch": 0.52, + "grad_norm": 2.1508024576423437, + "learning_rate": 4.965063401681591e-06, + "loss": 0.598, + "step": 4915 + }, + { + "epoch": 0.52, + "grad_norm": 2.146135541337258, + "learning_rate": 4.963359207171239e-06, + "loss": 0.5641, + "step": 4916 + }, + { + "epoch": 0.52, + "grad_norm": 3.3285165686253713, + "learning_rate": 4.961655016917712e-06, + "loss": 0.7023, + "step": 4917 + }, + { + "epoch": 0.52, + "grad_norm": 3.943768718695757, + "learning_rate": 4.959950831118991e-06, + "loss": 0.6485, + "step": 4918 + }, + { + "epoch": 0.52, + "grad_norm": 4.023540049447175, + "learning_rate": 4.958246649973066e-06, + "loss": 0.6233, + "step": 4919 + }, + { + "epoch": 0.52, + "grad_norm": 5.000434315098372, + "learning_rate": 4.956542473677926e-06, + "loss": 0.709, + "step": 4920 + }, + { + "epoch": 0.52, + "grad_norm": 2.067727072792758, + "learning_rate": 4.954838302431556e-06, + "loss": 0.5751, + "step": 4921 + }, + { + "epoch": 0.52, + "grad_norm": 2.3191980917060753, + "learning_rate": 4.9531341364319404e-06, + "loss": 0.6596, + "step": 4922 + }, + { + "epoch": 0.52, + "grad_norm": 2.3586105417389245, + "learning_rate": 4.951429975877066e-06, + "loss": 0.6919, + "step": 4923 + }, + { + "epoch": 0.52, + "grad_norm": 2.1526033976465793, + "learning_rate": 4.9497258209649165e-06, + "loss": 0.6568, + "step": 4924 + }, + { + "epoch": 0.52, + "grad_norm": 2.133729675007697, + "learning_rate": 4.948021671893475e-06, + "loss": 0.6752, + "step": 4925 + }, + { + "epoch": 0.52, + "grad_norm": 3.159426842721098, + "learning_rate": 4.946317528860728e-06, + "loss": 0.7144, + "step": 4926 + }, + { + "epoch": 0.52, + "grad_norm": 2.115435574837891, + "learning_rate": 4.9446133920646535e-06, + "loss": 0.5658, + "step": 4927 + }, + { + "epoch": 0.52, + "grad_norm": 2.7390237750782194, + "learning_rate": 4.942909261703234e-06, + "loss": 0.606, + "step": 4928 + }, + { + "epoch": 0.52, + "grad_norm": 2.8886081183240573, + "learning_rate": 4.941205137974453e-06, + "loss": 0.6459, + "step": 4929 + }, + { + "epoch": 0.52, + "grad_norm": 2.1116737366663343, + "learning_rate": 4.93950102107629e-06, + "loss": 0.5802, + "step": 4930 + }, + { + "epoch": 0.52, + "grad_norm": 2.26047601506244, + "learning_rate": 4.937796911206722e-06, + "loss": 0.6255, + "step": 4931 + }, + { + "epoch": 0.52, + "grad_norm": 2.1539115192013214, + "learning_rate": 4.936092808563729e-06, + "loss": 0.6115, + "step": 4932 + }, + { + "epoch": 0.52, + "grad_norm": 3.0145846571869437, + "learning_rate": 4.9343887133452885e-06, + "loss": 0.6485, + "step": 4933 + }, + { + "epoch": 0.52, + "grad_norm": 3.099479126502059, + "learning_rate": 4.932684625749379e-06, + "loss": 0.6624, + "step": 4934 + }, + { + "epoch": 0.52, + "grad_norm": 2.5769731100724083, + "learning_rate": 4.930980545973973e-06, + "loss": 0.6308, + "step": 4935 + }, + { + "epoch": 0.52, + "grad_norm": 2.6783179427641115, + "learning_rate": 4.929276474217049e-06, + "loss": 0.6243, + "step": 4936 + }, + { + "epoch": 0.52, + "grad_norm": 2.943547559066303, + "learning_rate": 4.92757241067658e-06, + "loss": 0.6204, + "step": 4937 + }, + { + "epoch": 0.52, + "grad_norm": 2.3266756319552204, + "learning_rate": 4.925868355550537e-06, + "loss": 0.5994, + "step": 4938 + }, + { + "epoch": 0.52, + "grad_norm": 2.310739073972056, + "learning_rate": 4.924164309036897e-06, + "loss": 0.6684, + "step": 4939 + }, + { + "epoch": 0.52, + "grad_norm": 2.3395607545275916, + "learning_rate": 4.922460271333627e-06, + "loss": 0.5414, + "step": 4940 + }, + { + "epoch": 0.52, + "grad_norm": 2.4676331611691746, + "learning_rate": 4.920756242638698e-06, + "loss": 0.6746, + "step": 4941 + }, + { + "epoch": 0.52, + "grad_norm": 2.2941161951403686, + "learning_rate": 4.919052223150081e-06, + "loss": 0.7604, + "step": 4942 + }, + { + "epoch": 0.52, + "grad_norm": 2.929369540693781, + "learning_rate": 4.917348213065742e-06, + "loss": 0.6284, + "step": 4943 + }, + { + "epoch": 0.52, + "grad_norm": 2.196953658021791, + "learning_rate": 4.915644212583649e-06, + "loss": 0.5309, + "step": 4944 + }, + { + "epoch": 0.52, + "grad_norm": 2.277862980217032, + "learning_rate": 4.913940221901766e-06, + "loss": 0.6333, + "step": 4945 + }, + { + "epoch": 0.52, + "grad_norm": 2.1353131160650287, + "learning_rate": 4.912236241218061e-06, + "loss": 0.6933, + "step": 4946 + }, + { + "epoch": 0.52, + "grad_norm": 2.1309552775913865, + "learning_rate": 4.910532270730497e-06, + "loss": 0.6451, + "step": 4947 + }, + { + "epoch": 0.52, + "grad_norm": 2.226377878125629, + "learning_rate": 4.908828310637031e-06, + "loss": 0.6428, + "step": 4948 + }, + { + "epoch": 0.52, + "grad_norm": 2.218293921242868, + "learning_rate": 4.907124361135632e-06, + "loss": 0.6537, + "step": 4949 + }, + { + "epoch": 0.52, + "grad_norm": 2.4059507731751615, + "learning_rate": 4.905420422424254e-06, + "loss": 0.5964, + "step": 4950 + }, + { + "epoch": 0.52, + "grad_norm": 2.4481730880265014, + "learning_rate": 4.903716494700859e-06, + "loss": 0.6094, + "step": 4951 + }, + { + "epoch": 0.52, + "grad_norm": 2.684768787067311, + "learning_rate": 4.902012578163404e-06, + "loss": 0.6083, + "step": 4952 + }, + { + "epoch": 0.52, + "grad_norm": 1.9937131948780513, + "learning_rate": 4.900308673009843e-06, + "loss": 0.5961, + "step": 4953 + }, + { + "epoch": 0.52, + "grad_norm": 3.274862105439873, + "learning_rate": 4.8986047794381325e-06, + "loss": 0.6355, + "step": 4954 + }, + { + "epoch": 0.52, + "grad_norm": 2.081137560086168, + "learning_rate": 4.896900897646226e-06, + "loss": 0.6181, + "step": 4955 + }, + { + "epoch": 0.52, + "grad_norm": 2.1349551752747886, + "learning_rate": 4.8951970278320765e-06, + "loss": 0.6046, + "step": 4956 + }, + { + "epoch": 0.52, + "grad_norm": 2.541248755489936, + "learning_rate": 4.89349317019363e-06, + "loss": 0.6012, + "step": 4957 + }, + { + "epoch": 0.52, + "grad_norm": 2.565627078399468, + "learning_rate": 4.891789324928842e-06, + "loss": 0.6558, + "step": 4958 + }, + { + "epoch": 0.52, + "grad_norm": 3.109453742078581, + "learning_rate": 4.890085492235657e-06, + "loss": 0.5778, + "step": 4959 + }, + { + "epoch": 0.52, + "grad_norm": 2.0050613585910755, + "learning_rate": 4.888381672312022e-06, + "loss": 0.6251, + "step": 4960 + }, + { + "epoch": 0.52, + "grad_norm": 2.339550540816934, + "learning_rate": 4.88667786535588e-06, + "loss": 0.6687, + "step": 4961 + }, + { + "epoch": 0.52, + "grad_norm": 2.290758913793059, + "learning_rate": 4.884974071565177e-06, + "loss": 0.6846, + "step": 4962 + }, + { + "epoch": 0.52, + "grad_norm": 1.077911583636364, + "learning_rate": 4.883270291137852e-06, + "loss": 0.6473, + "step": 4963 + }, + { + "epoch": 0.52, + "grad_norm": 1.985960627648404, + "learning_rate": 4.88156652427185e-06, + "loss": 0.6044, + "step": 4964 + }, + { + "epoch": 0.52, + "grad_norm": 2.305367885235591, + "learning_rate": 4.879862771165107e-06, + "loss": 0.5774, + "step": 4965 + }, + { + "epoch": 0.52, + "grad_norm": 2.4768244502117036, + "learning_rate": 4.878159032015559e-06, + "loss": 0.6634, + "step": 4966 + }, + { + "epoch": 0.52, + "grad_norm": 2.4276873504006815, + "learning_rate": 4.8764553070211415e-06, + "loss": 0.6828, + "step": 4967 + }, + { + "epoch": 0.52, + "grad_norm": 2.4060474857332013, + "learning_rate": 4.87475159637979e-06, + "loss": 0.6065, + "step": 4968 + }, + { + "epoch": 0.52, + "grad_norm": 1.7772890860805246, + "learning_rate": 4.873047900289437e-06, + "loss": 0.6808, + "step": 4969 + }, + { + "epoch": 0.52, + "grad_norm": 3.0625237060008588, + "learning_rate": 4.87134421894801e-06, + "loss": 0.6523, + "step": 4970 + }, + { + "epoch": 0.52, + "grad_norm": 2.2524142418639466, + "learning_rate": 4.869640552553441e-06, + "loss": 0.6733, + "step": 4971 + }, + { + "epoch": 0.52, + "grad_norm": 2.2414743848382006, + "learning_rate": 4.867936901303656e-06, + "loss": 0.6021, + "step": 4972 + }, + { + "epoch": 0.52, + "grad_norm": 2.338392581877551, + "learning_rate": 4.866233265396577e-06, + "loss": 0.7099, + "step": 4973 + }, + { + "epoch": 0.52, + "grad_norm": 2.0935049485955495, + "learning_rate": 4.8645296450301345e-06, + "loss": 0.6684, + "step": 4974 + }, + { + "epoch": 0.52, + "grad_norm": 2.1007971339772427, + "learning_rate": 4.862826040402246e-06, + "loss": 0.6992, + "step": 4975 + }, + { + "epoch": 0.52, + "grad_norm": 0.9696344105471323, + "learning_rate": 4.861122451710829e-06, + "loss": 0.5749, + "step": 4976 + }, + { + "epoch": 0.52, + "grad_norm": 2.553225906237905, + "learning_rate": 4.859418879153805e-06, + "loss": 0.7279, + "step": 4977 + }, + { + "epoch": 0.52, + "grad_norm": 2.067442914319545, + "learning_rate": 4.857715322929091e-06, + "loss": 0.6576, + "step": 4978 + }, + { + "epoch": 0.52, + "grad_norm": 2.149325406618078, + "learning_rate": 4.856011783234599e-06, + "loss": 0.7101, + "step": 4979 + }, + { + "epoch": 0.52, + "grad_norm": 1.9706548407978104, + "learning_rate": 4.85430826026824e-06, + "loss": 0.6659, + "step": 4980 + }, + { + "epoch": 0.52, + "grad_norm": 2.6696818093641395, + "learning_rate": 4.852604754227927e-06, + "loss": 0.6047, + "step": 4981 + }, + { + "epoch": 0.52, + "grad_norm": 3.3802463897178785, + "learning_rate": 4.8509012653115695e-06, + "loss": 0.7413, + "step": 4982 + }, + { + "epoch": 0.52, + "grad_norm": 2.1818945622881833, + "learning_rate": 4.849197793717069e-06, + "loss": 0.5983, + "step": 4983 + }, + { + "epoch": 0.52, + "grad_norm": 2.14901122734723, + "learning_rate": 4.847494339642334e-06, + "loss": 0.7177, + "step": 4984 + }, + { + "epoch": 0.52, + "grad_norm": 2.918569152656593, + "learning_rate": 4.8457909032852654e-06, + "loss": 0.6908, + "step": 4985 + }, + { + "epoch": 0.52, + "grad_norm": 0.9406184537970192, + "learning_rate": 4.844087484843764e-06, + "loss": 0.6133, + "step": 4986 + }, + { + "epoch": 0.52, + "grad_norm": 2.0469495720368256, + "learning_rate": 4.84238408451573e-06, + "loss": 0.6769, + "step": 4987 + }, + { + "epoch": 0.52, + "grad_norm": 2.7726823392613116, + "learning_rate": 4.840680702499056e-06, + "loss": 0.6546, + "step": 4988 + }, + { + "epoch": 0.52, + "grad_norm": 2.272990604413833, + "learning_rate": 4.838977338991636e-06, + "loss": 0.647, + "step": 4989 + }, + { + "epoch": 0.53, + "grad_norm": 2.9330631724535494, + "learning_rate": 4.837273994191364e-06, + "loss": 0.6251, + "step": 4990 + }, + { + "epoch": 0.53, + "grad_norm": 2.2631226425982933, + "learning_rate": 4.835570668296131e-06, + "loss": 0.676, + "step": 4991 + }, + { + "epoch": 0.53, + "grad_norm": 2.13674093049108, + "learning_rate": 4.83386736150382e-06, + "loss": 0.6693, + "step": 4992 + }, + { + "epoch": 0.53, + "grad_norm": 2.133284920671136, + "learning_rate": 4.83216407401232e-06, + "loss": 0.5922, + "step": 4993 + }, + { + "epoch": 0.53, + "grad_norm": 2.520394477013768, + "learning_rate": 4.830460806019512e-06, + "loss": 0.6195, + "step": 4994 + }, + { + "epoch": 0.53, + "grad_norm": 2.693745272656559, + "learning_rate": 4.828757557723279e-06, + "loss": 0.6932, + "step": 4995 + }, + { + "epoch": 0.53, + "grad_norm": 2.4450291798500845, + "learning_rate": 4.827054329321496e-06, + "loss": 0.537, + "step": 4996 + }, + { + "epoch": 0.53, + "grad_norm": 2.0458948423518097, + "learning_rate": 4.825351121012042e-06, + "loss": 0.6444, + "step": 4997 + }, + { + "epoch": 0.53, + "grad_norm": 2.6712241240805357, + "learning_rate": 4.823647932992788e-06, + "loss": 0.6205, + "step": 4998 + }, + { + "epoch": 0.53, + "grad_norm": 2.207121252476882, + "learning_rate": 4.82194476546161e-06, + "loss": 0.5541, + "step": 4999 + }, + { + "epoch": 0.53, + "grad_norm": 2.289069632699829, + "learning_rate": 4.820241618616375e-06, + "loss": 0.544, + "step": 5000 + }, + { + "epoch": 0.53, + "grad_norm": 2.2675898245567687, + "learning_rate": 4.818538492654947e-06, + "loss": 0.6852, + "step": 5001 + }, + { + "epoch": 0.53, + "grad_norm": 2.343735840653348, + "learning_rate": 4.816835387775193e-06, + "loss": 0.6876, + "step": 5002 + }, + { + "epoch": 0.53, + "grad_norm": 2.246807379369579, + "learning_rate": 4.8151323041749734e-06, + "loss": 0.6657, + "step": 5003 + }, + { + "epoch": 0.53, + "grad_norm": 2.4237118515960447, + "learning_rate": 4.8134292420521505e-06, + "loss": 0.6224, + "step": 5004 + }, + { + "epoch": 0.53, + "grad_norm": 2.289453411888955, + "learning_rate": 4.811726201604576e-06, + "loss": 0.655, + "step": 5005 + }, + { + "epoch": 0.53, + "grad_norm": 2.0026507977297503, + "learning_rate": 4.810023183030109e-06, + "loss": 0.6169, + "step": 5006 + }, + { + "epoch": 0.53, + "grad_norm": 3.0079499431943195, + "learning_rate": 4.808320186526599e-06, + "loss": 0.6452, + "step": 5007 + }, + { + "epoch": 0.53, + "grad_norm": 2.2628812402524896, + "learning_rate": 4.806617212291898e-06, + "loss": 0.6522, + "step": 5008 + }, + { + "epoch": 0.53, + "grad_norm": 2.2505910057014997, + "learning_rate": 4.804914260523847e-06, + "loss": 0.5728, + "step": 5009 + }, + { + "epoch": 0.53, + "grad_norm": 2.2225142986972495, + "learning_rate": 4.803211331420294e-06, + "loss": 0.636, + "step": 5010 + }, + { + "epoch": 0.53, + "grad_norm": 2.5157421199340506, + "learning_rate": 4.801508425179079e-06, + "loss": 0.6838, + "step": 5011 + }, + { + "epoch": 0.53, + "grad_norm": 3.362812838563486, + "learning_rate": 4.799805541998042e-06, + "loss": 0.6514, + "step": 5012 + }, + { + "epoch": 0.53, + "grad_norm": 2.460937036790075, + "learning_rate": 4.79810268207502e-06, + "loss": 0.63, + "step": 5013 + }, + { + "epoch": 0.53, + "grad_norm": 3.3836688933373873, + "learning_rate": 4.796399845607844e-06, + "loss": 0.6927, + "step": 5014 + }, + { + "epoch": 0.53, + "grad_norm": 2.1163838752069926, + "learning_rate": 4.7946970327943435e-06, + "loss": 0.6223, + "step": 5015 + }, + { + "epoch": 0.53, + "grad_norm": 2.4368026381336314, + "learning_rate": 4.79299424383235e-06, + "loss": 0.5904, + "step": 5016 + }, + { + "epoch": 0.53, + "grad_norm": 2.0227052397175225, + "learning_rate": 4.791291478919688e-06, + "loss": 0.6301, + "step": 5017 + }, + { + "epoch": 0.53, + "grad_norm": 3.4149521326579464, + "learning_rate": 4.789588738254176e-06, + "loss": 0.6325, + "step": 5018 + }, + { + "epoch": 0.53, + "grad_norm": 2.03492009399757, + "learning_rate": 4.787886022033637e-06, + "loss": 0.6345, + "step": 5019 + }, + { + "epoch": 0.53, + "grad_norm": 2.045894564408748, + "learning_rate": 4.786183330455886e-06, + "loss": 0.6108, + "step": 5020 + }, + { + "epoch": 0.53, + "grad_norm": 2.0734976435360384, + "learning_rate": 4.784480663718742e-06, + "loss": 0.5606, + "step": 5021 + }, + { + "epoch": 0.53, + "grad_norm": 4.08406849984304, + "learning_rate": 4.782778022020006e-06, + "loss": 0.6312, + "step": 5022 + }, + { + "epoch": 0.53, + "grad_norm": 2.8678914982078667, + "learning_rate": 4.7810754055574945e-06, + "loss": 0.6431, + "step": 5023 + }, + { + "epoch": 0.53, + "grad_norm": 2.010459684826099, + "learning_rate": 4.779372814529008e-06, + "loss": 0.6737, + "step": 5024 + }, + { + "epoch": 0.53, + "grad_norm": 2.274385411512584, + "learning_rate": 4.7776702491323506e-06, + "loss": 0.6473, + "step": 5025 + }, + { + "epoch": 0.53, + "grad_norm": 2.4893768635552216, + "learning_rate": 4.775967709565323e-06, + "loss": 0.5868, + "step": 5026 + }, + { + "epoch": 0.53, + "grad_norm": 2.8594743184322855, + "learning_rate": 4.774265196025716e-06, + "loss": 0.6205, + "step": 5027 + }, + { + "epoch": 0.53, + "grad_norm": 3.0896156585418804, + "learning_rate": 4.772562708711328e-06, + "loss": 0.5436, + "step": 5028 + }, + { + "epoch": 0.53, + "grad_norm": 2.1024636760857023, + "learning_rate": 4.770860247819946e-06, + "loss": 0.6145, + "step": 5029 + }, + { + "epoch": 0.53, + "grad_norm": 2.7483433705268467, + "learning_rate": 4.7691578135493595e-06, + "loss": 0.6789, + "step": 5030 + }, + { + "epoch": 0.53, + "grad_norm": 3.7267937561387456, + "learning_rate": 4.76745540609735e-06, + "loss": 0.651, + "step": 5031 + }, + { + "epoch": 0.53, + "grad_norm": 0.930921651098154, + "learning_rate": 4.765753025661699e-06, + "loss": 0.5867, + "step": 5032 + }, + { + "epoch": 0.53, + "grad_norm": 2.0301984146112435, + "learning_rate": 4.764050672440184e-06, + "loss": 0.633, + "step": 5033 + }, + { + "epoch": 0.53, + "grad_norm": 3.3826775767799564, + "learning_rate": 4.76234834663058e-06, + "loss": 0.6399, + "step": 5034 + }, + { + "epoch": 0.53, + "grad_norm": 2.648287363125826, + "learning_rate": 4.76064604843066e-06, + "loss": 0.6734, + "step": 5035 + }, + { + "epoch": 0.53, + "grad_norm": 2.6257884700520076, + "learning_rate": 4.758943778038189e-06, + "loss": 0.5742, + "step": 5036 + }, + { + "epoch": 0.53, + "grad_norm": 2.790385480712019, + "learning_rate": 4.757241535650931e-06, + "loss": 0.5854, + "step": 5037 + }, + { + "epoch": 0.53, + "grad_norm": 2.3321216810115755, + "learning_rate": 4.755539321466652e-06, + "loss": 0.7305, + "step": 5038 + }, + { + "epoch": 0.53, + "grad_norm": 2.470106783507069, + "learning_rate": 4.753837135683108e-06, + "loss": 0.7199, + "step": 5039 + }, + { + "epoch": 0.53, + "grad_norm": 2.519684963476052, + "learning_rate": 4.752134978498052e-06, + "loss": 0.6208, + "step": 5040 + }, + { + "epoch": 0.53, + "grad_norm": 1.9933322283576163, + "learning_rate": 4.750432850109239e-06, + "loss": 0.5654, + "step": 5041 + }, + { + "epoch": 0.53, + "grad_norm": 2.1628910176613396, + "learning_rate": 4.748730750714417e-06, + "loss": 0.5597, + "step": 5042 + }, + { + "epoch": 0.53, + "grad_norm": 3.2601189066847938, + "learning_rate": 4.74702868051133e-06, + "loss": 0.5868, + "step": 5043 + }, + { + "epoch": 0.53, + "grad_norm": 2.4445761523300917, + "learning_rate": 4.745326639697718e-06, + "loss": 0.6006, + "step": 5044 + }, + { + "epoch": 0.53, + "grad_norm": 2.157421144865131, + "learning_rate": 4.743624628471322e-06, + "loss": 0.5829, + "step": 5045 + }, + { + "epoch": 0.53, + "grad_norm": 10.608165987972704, + "learning_rate": 4.741922647029873e-06, + "loss": 0.5994, + "step": 5046 + }, + { + "epoch": 0.53, + "grad_norm": 2.6959202608616892, + "learning_rate": 4.740220695571108e-06, + "loss": 0.7195, + "step": 5047 + }, + { + "epoch": 0.53, + "grad_norm": 2.374505085575942, + "learning_rate": 4.738518774292752e-06, + "loss": 0.5836, + "step": 5048 + }, + { + "epoch": 0.53, + "grad_norm": 2.4217957922423077, + "learning_rate": 4.736816883392527e-06, + "loss": 0.5942, + "step": 5049 + }, + { + "epoch": 0.53, + "grad_norm": 3.6160178947490085, + "learning_rate": 4.735115023068155e-06, + "loss": 0.6114, + "step": 5050 + }, + { + "epoch": 0.53, + "grad_norm": 1.9430364282301278, + "learning_rate": 4.733413193517355e-06, + "loss": 0.6125, + "step": 5051 + }, + { + "epoch": 0.53, + "grad_norm": 2.886628277780551, + "learning_rate": 4.731711394937842e-06, + "loss": 0.6733, + "step": 5052 + }, + { + "epoch": 0.53, + "grad_norm": 2.7819669075459643, + "learning_rate": 4.73000962752732e-06, + "loss": 0.607, + "step": 5053 + }, + { + "epoch": 0.53, + "grad_norm": 3.3962133589136707, + "learning_rate": 4.7283078914835e-06, + "loss": 0.6421, + "step": 5054 + }, + { + "epoch": 0.53, + "grad_norm": 2.8324268366863623, + "learning_rate": 4.726606187004084e-06, + "loss": 0.6786, + "step": 5055 + }, + { + "epoch": 0.53, + "grad_norm": 3.009696361121063, + "learning_rate": 4.724904514286773e-06, + "loss": 0.6692, + "step": 5056 + }, + { + "epoch": 0.53, + "grad_norm": 2.6870058183715755, + "learning_rate": 4.723202873529256e-06, + "loss": 0.6067, + "step": 5057 + }, + { + "epoch": 0.53, + "grad_norm": 2.358795360202302, + "learning_rate": 4.721501264929232e-06, + "loss": 0.6416, + "step": 5058 + }, + { + "epoch": 0.53, + "grad_norm": 2.2459402545172984, + "learning_rate": 4.719799688684385e-06, + "loss": 0.7759, + "step": 5059 + }, + { + "epoch": 0.53, + "grad_norm": 2.1705987722229483, + "learning_rate": 4.7180981449924006e-06, + "loss": 0.5818, + "step": 5060 + }, + { + "epoch": 0.53, + "grad_norm": 2.764535131155558, + "learning_rate": 4.716396634050959e-06, + "loss": 0.6444, + "step": 5061 + }, + { + "epoch": 0.53, + "grad_norm": 2.7065511765293717, + "learning_rate": 4.714695156057737e-06, + "loss": 0.7239, + "step": 5062 + }, + { + "epoch": 0.53, + "grad_norm": 2.614545604700766, + "learning_rate": 4.712993711210405e-06, + "loss": 0.5962, + "step": 5063 + }, + { + "epoch": 0.53, + "grad_norm": 3.2589815632805568, + "learning_rate": 4.711292299706636e-06, + "loss": 0.6851, + "step": 5064 + }, + { + "epoch": 0.53, + "grad_norm": 2.695743902716769, + "learning_rate": 4.709590921744093e-06, + "loss": 0.5757, + "step": 5065 + }, + { + "epoch": 0.53, + "grad_norm": 3.0665293362792982, + "learning_rate": 4.707889577520436e-06, + "loss": 0.6153, + "step": 5066 + }, + { + "epoch": 0.53, + "grad_norm": 2.6924889301708195, + "learning_rate": 4.706188267233324e-06, + "loss": 0.6596, + "step": 5067 + }, + { + "epoch": 0.53, + "grad_norm": 2.3595683992153256, + "learning_rate": 4.704486991080409e-06, + "loss": 0.6294, + "step": 5068 + }, + { + "epoch": 0.53, + "grad_norm": 2.669021302249065, + "learning_rate": 4.7027857492593445e-06, + "loss": 0.6792, + "step": 5069 + }, + { + "epoch": 0.53, + "grad_norm": 1.0161808520148305, + "learning_rate": 4.701084541967769e-06, + "loss": 0.5579, + "step": 5070 + }, + { + "epoch": 0.53, + "grad_norm": 2.2988515242704346, + "learning_rate": 4.699383369403329e-06, + "loss": 0.6003, + "step": 5071 + }, + { + "epoch": 0.53, + "grad_norm": 2.252022330411591, + "learning_rate": 4.697682231763658e-06, + "loss": 0.6515, + "step": 5072 + }, + { + "epoch": 0.53, + "grad_norm": 2.0233341986752094, + "learning_rate": 4.695981129246393e-06, + "loss": 0.5974, + "step": 5073 + }, + { + "epoch": 0.53, + "grad_norm": 2.2983932146999533, + "learning_rate": 4.694280062049163e-06, + "loss": 0.5946, + "step": 5074 + }, + { + "epoch": 0.53, + "grad_norm": 3.894694889398997, + "learning_rate": 4.6925790303695886e-06, + "loss": 0.5984, + "step": 5075 + }, + { + "epoch": 0.53, + "grad_norm": 2.374417964315601, + "learning_rate": 4.690878034405296e-06, + "loss": 0.5768, + "step": 5076 + }, + { + "epoch": 0.53, + "grad_norm": 2.310052580071245, + "learning_rate": 4.689177074353899e-06, + "loss": 0.559, + "step": 5077 + }, + { + "epoch": 0.53, + "grad_norm": 2.4835325819520286, + "learning_rate": 4.687476150413012e-06, + "loss": 0.6051, + "step": 5078 + }, + { + "epoch": 0.53, + "grad_norm": 2.57984204212142, + "learning_rate": 4.6857752627802405e-06, + "loss": 0.5899, + "step": 5079 + }, + { + "epoch": 0.53, + "grad_norm": 2.6143636376389408, + "learning_rate": 4.684074411653192e-06, + "loss": 0.6276, + "step": 5080 + }, + { + "epoch": 0.53, + "grad_norm": 2.065161043806273, + "learning_rate": 4.682373597229464e-06, + "loss": 0.6279, + "step": 5081 + }, + { + "epoch": 0.53, + "grad_norm": 4.501763492850309, + "learning_rate": 4.680672819706654e-06, + "loss": 0.681, + "step": 5082 + }, + { + "epoch": 0.53, + "grad_norm": 2.563624459964035, + "learning_rate": 4.678972079282354e-06, + "loss": 0.64, + "step": 5083 + }, + { + "epoch": 0.53, + "grad_norm": 2.0259420193680757, + "learning_rate": 4.677271376154149e-06, + "loss": 0.7082, + "step": 5084 + }, + { + "epoch": 0.54, + "grad_norm": 3.20008658827479, + "learning_rate": 4.6755707105196204e-06, + "loss": 0.7601, + "step": 5085 + }, + { + "epoch": 0.54, + "grad_norm": 2.5221950538299738, + "learning_rate": 4.673870082576351e-06, + "loss": 0.5972, + "step": 5086 + }, + { + "epoch": 0.54, + "grad_norm": 2.312808929084742, + "learning_rate": 4.672169492521914e-06, + "loss": 0.6986, + "step": 5087 + }, + { + "epoch": 0.54, + "grad_norm": 3.5730283833471397, + "learning_rate": 4.670468940553875e-06, + "loss": 0.6108, + "step": 5088 + }, + { + "epoch": 0.54, + "grad_norm": 5.6696729783798485, + "learning_rate": 4.6687684268698034e-06, + "loss": 0.6344, + "step": 5089 + }, + { + "epoch": 0.54, + "grad_norm": 3.465514843046939, + "learning_rate": 4.667067951667256e-06, + "loss": 0.622, + "step": 5090 + }, + { + "epoch": 0.54, + "grad_norm": 1.13181319546967, + "learning_rate": 4.665367515143797e-06, + "loss": 0.5712, + "step": 5091 + }, + { + "epoch": 0.54, + "grad_norm": 2.425783344218305, + "learning_rate": 4.663667117496968e-06, + "loss": 0.6551, + "step": 5092 + }, + { + "epoch": 0.54, + "grad_norm": 2.5727074812927193, + "learning_rate": 4.6619667589243225e-06, + "loss": 0.6365, + "step": 5093 + }, + { + "epoch": 0.54, + "grad_norm": 2.3970092435076134, + "learning_rate": 4.6602664396234e-06, + "loss": 0.5804, + "step": 5094 + }, + { + "epoch": 0.54, + "grad_norm": 2.186207872425851, + "learning_rate": 4.658566159791742e-06, + "loss": 0.6546, + "step": 5095 + }, + { + "epoch": 0.54, + "grad_norm": 2.655332565894822, + "learning_rate": 4.656865919626883e-06, + "loss": 0.5582, + "step": 5096 + }, + { + "epoch": 0.54, + "grad_norm": 3.041070063381227, + "learning_rate": 4.655165719326347e-06, + "loss": 0.7031, + "step": 5097 + }, + { + "epoch": 0.54, + "grad_norm": 1.9879284762088225, + "learning_rate": 4.653465559087661e-06, + "loss": 0.6188, + "step": 5098 + }, + { + "epoch": 0.54, + "grad_norm": 2.362708201617922, + "learning_rate": 4.651765439108344e-06, + "loss": 0.5938, + "step": 5099 + }, + { + "epoch": 0.54, + "grad_norm": 2.3036918192067968, + "learning_rate": 4.650065359585914e-06, + "loss": 0.6185, + "step": 5100 + }, + { + "epoch": 0.54, + "grad_norm": 3.862595099667224, + "learning_rate": 4.648365320717876e-06, + "loss": 0.6754, + "step": 5101 + }, + { + "epoch": 0.54, + "grad_norm": 2.3230616471795256, + "learning_rate": 4.64666532270174e-06, + "loss": 0.6647, + "step": 5102 + }, + { + "epoch": 0.54, + "grad_norm": 2.0507051961729577, + "learning_rate": 4.644965365735004e-06, + "loss": 0.5953, + "step": 5103 + }, + { + "epoch": 0.54, + "grad_norm": 2.422141046773203, + "learning_rate": 4.643265450015169e-06, + "loss": 0.7227, + "step": 5104 + }, + { + "epoch": 0.54, + "grad_norm": 2.2097746173914743, + "learning_rate": 4.6415655757397206e-06, + "loss": 0.6461, + "step": 5105 + }, + { + "epoch": 0.54, + "grad_norm": 2.0454571953490075, + "learning_rate": 4.639865743106148e-06, + "loss": 0.5875, + "step": 5106 + }, + { + "epoch": 0.54, + "grad_norm": 3.145981246348931, + "learning_rate": 4.63816595231193e-06, + "loss": 0.6611, + "step": 5107 + }, + { + "epoch": 0.54, + "grad_norm": 1.9222457727504096, + "learning_rate": 4.636466203554548e-06, + "loss": 0.5957, + "step": 5108 + }, + { + "epoch": 0.54, + "grad_norm": 2.509150332110932, + "learning_rate": 4.634766497031472e-06, + "loss": 0.6091, + "step": 5109 + }, + { + "epoch": 0.54, + "grad_norm": 2.8618331161141985, + "learning_rate": 4.633066832940167e-06, + "loss": 0.6478, + "step": 5110 + }, + { + "epoch": 0.54, + "grad_norm": 2.3088562094725518, + "learning_rate": 4.631367211478098e-06, + "loss": 0.5967, + "step": 5111 + }, + { + "epoch": 0.54, + "grad_norm": 2.2252841546630666, + "learning_rate": 4.62966763284272e-06, + "loss": 0.703, + "step": 5112 + }, + { + "epoch": 0.54, + "grad_norm": 0.9848922788689293, + "learning_rate": 4.6279680972314875e-06, + "loss": 0.5806, + "step": 5113 + }, + { + "epoch": 0.54, + "grad_norm": 2.421707800583379, + "learning_rate": 4.626268604841844e-06, + "loss": 0.655, + "step": 5114 + }, + { + "epoch": 0.54, + "grad_norm": 3.060075796638142, + "learning_rate": 4.624569155871235e-06, + "loss": 0.6389, + "step": 5115 + }, + { + "epoch": 0.54, + "grad_norm": 2.708021895892409, + "learning_rate": 4.622869750517094e-06, + "loss": 0.7085, + "step": 5116 + }, + { + "epoch": 0.54, + "grad_norm": 2.198154906455453, + "learning_rate": 4.621170388976858e-06, + "loss": 0.5702, + "step": 5117 + }, + { + "epoch": 0.54, + "grad_norm": 2.074916489895744, + "learning_rate": 4.619471071447949e-06, + "loss": 0.514, + "step": 5118 + }, + { + "epoch": 0.54, + "grad_norm": 2.89007197368037, + "learning_rate": 4.617771798127792e-06, + "loss": 0.6663, + "step": 5119 + }, + { + "epoch": 0.54, + "grad_norm": 2.186100478989604, + "learning_rate": 4.616072569213802e-06, + "loss": 0.5598, + "step": 5120 + }, + { + "epoch": 0.54, + "grad_norm": 4.737181403903594, + "learning_rate": 4.614373384903391e-06, + "loss": 0.6478, + "step": 5121 + }, + { + "epoch": 0.54, + "grad_norm": 2.744234393991175, + "learning_rate": 4.612674245393967e-06, + "loss": 0.6454, + "step": 5122 + }, + { + "epoch": 0.54, + "grad_norm": 2.4651854631448478, + "learning_rate": 4.610975150882928e-06, + "loss": 0.6289, + "step": 5123 + }, + { + "epoch": 0.54, + "grad_norm": 5.106591381662164, + "learning_rate": 4.609276101567672e-06, + "loss": 0.6305, + "step": 5124 + }, + { + "epoch": 0.54, + "grad_norm": 2.8645757518422106, + "learning_rate": 4.607577097645587e-06, + "loss": 0.5743, + "step": 5125 + }, + { + "epoch": 0.54, + "grad_norm": 2.8329814893534113, + "learning_rate": 4.605878139314065e-06, + "loss": 0.6463, + "step": 5126 + }, + { + "epoch": 0.54, + "grad_norm": 2.3889289397117075, + "learning_rate": 4.604179226770478e-06, + "loss": 0.7184, + "step": 5127 + }, + { + "epoch": 0.54, + "grad_norm": 2.124562346527691, + "learning_rate": 4.602480360212205e-06, + "loss": 0.6488, + "step": 5128 + }, + { + "epoch": 0.54, + "grad_norm": 2.7833987833164624, + "learning_rate": 4.600781539836614e-06, + "loss": 0.6184, + "step": 5129 + }, + { + "epoch": 0.54, + "grad_norm": 2.554887684552683, + "learning_rate": 4.5990827658410705e-06, + "loss": 0.6443, + "step": 5130 + }, + { + "epoch": 0.54, + "grad_norm": 2.83397128521315, + "learning_rate": 4.597384038422933e-06, + "loss": 0.6566, + "step": 5131 + }, + { + "epoch": 0.54, + "grad_norm": 2.322027615656651, + "learning_rate": 4.595685357779553e-06, + "loss": 0.6787, + "step": 5132 + }, + { + "epoch": 0.54, + "grad_norm": 4.634011797281909, + "learning_rate": 4.593986724108279e-06, + "loss": 0.5787, + "step": 5133 + }, + { + "epoch": 0.54, + "grad_norm": 1.0666278509393818, + "learning_rate": 4.592288137606454e-06, + "loss": 0.5666, + "step": 5134 + }, + { + "epoch": 0.54, + "grad_norm": 3.207516428553423, + "learning_rate": 4.590589598471416e-06, + "loss": 0.6368, + "step": 5135 + }, + { + "epoch": 0.54, + "grad_norm": 3.49123455675996, + "learning_rate": 4.588891106900493e-06, + "loss": 0.7018, + "step": 5136 + }, + { + "epoch": 0.54, + "grad_norm": 0.9847912096139533, + "learning_rate": 4.587192663091014e-06, + "loss": 0.5966, + "step": 5137 + }, + { + "epoch": 0.54, + "grad_norm": 3.22624018717781, + "learning_rate": 4.5854942672402965e-06, + "loss": 0.582, + "step": 5138 + }, + { + "epoch": 0.54, + "grad_norm": 1.9077695725913995, + "learning_rate": 4.5837959195456605e-06, + "loss": 0.6262, + "step": 5139 + }, + { + "epoch": 0.54, + "grad_norm": 2.5077328392848472, + "learning_rate": 4.5820976202044085e-06, + "loss": 0.678, + "step": 5140 + }, + { + "epoch": 0.54, + "grad_norm": 2.107242753594195, + "learning_rate": 4.580399369413847e-06, + "loss": 0.5716, + "step": 5141 + }, + { + "epoch": 0.54, + "grad_norm": 2.2886530499565723, + "learning_rate": 4.578701167371274e-06, + "loss": 0.6729, + "step": 5142 + }, + { + "epoch": 0.54, + "grad_norm": 1.806852048189923, + "learning_rate": 4.577003014273981e-06, + "loss": 0.6226, + "step": 5143 + }, + { + "epoch": 0.54, + "grad_norm": 2.0468082920924573, + "learning_rate": 4.575304910319257e-06, + "loss": 0.7215, + "step": 5144 + }, + { + "epoch": 0.54, + "grad_norm": 2.3451100058055863, + "learning_rate": 4.573606855704379e-06, + "loss": 0.5165, + "step": 5145 + }, + { + "epoch": 0.54, + "grad_norm": 2.122583359383461, + "learning_rate": 4.571908850626625e-06, + "loss": 0.558, + "step": 5146 + }, + { + "epoch": 0.54, + "grad_norm": 2.1021680512947545, + "learning_rate": 4.570210895283262e-06, + "loss": 0.5882, + "step": 5147 + }, + { + "epoch": 0.54, + "grad_norm": 2.613254013725175, + "learning_rate": 4.568512989871557e-06, + "loss": 0.627, + "step": 5148 + }, + { + "epoch": 0.54, + "grad_norm": 1.063740812597954, + "learning_rate": 4.566815134588763e-06, + "loss": 0.5857, + "step": 5149 + }, + { + "epoch": 0.54, + "grad_norm": 2.3292145678287235, + "learning_rate": 4.565117329632137e-06, + "loss": 0.5244, + "step": 5150 + }, + { + "epoch": 0.54, + "grad_norm": 2.2621172262271285, + "learning_rate": 4.5634195751989195e-06, + "loss": 0.6286, + "step": 5151 + }, + { + "epoch": 0.54, + "grad_norm": 2.305527091957668, + "learning_rate": 4.561721871486357e-06, + "loss": 0.6073, + "step": 5152 + }, + { + "epoch": 0.54, + "grad_norm": 2.664097973486397, + "learning_rate": 4.5600242186916786e-06, + "loss": 0.7036, + "step": 5153 + }, + { + "epoch": 0.54, + "grad_norm": 3.907247212421276, + "learning_rate": 4.5583266170121155e-06, + "loss": 0.6086, + "step": 5154 + }, + { + "epoch": 0.54, + "grad_norm": 3.356425324401827, + "learning_rate": 4.556629066644888e-06, + "loss": 0.5769, + "step": 5155 + }, + { + "epoch": 0.54, + "grad_norm": 3.328669215805127, + "learning_rate": 4.554931567787214e-06, + "loss": 0.6026, + "step": 5156 + }, + { + "epoch": 0.54, + "grad_norm": 2.2570183445717817, + "learning_rate": 4.553234120636306e-06, + "loss": 0.6124, + "step": 5157 + }, + { + "epoch": 0.54, + "grad_norm": 3.081027926632191, + "learning_rate": 4.551536725389364e-06, + "loss": 0.6321, + "step": 5158 + }, + { + "epoch": 0.54, + "grad_norm": 2.273359671518665, + "learning_rate": 4.54983938224359e-06, + "loss": 0.6413, + "step": 5159 + }, + { + "epoch": 0.54, + "grad_norm": 2.9426125776938763, + "learning_rate": 4.5481420913961734e-06, + "loss": 0.6623, + "step": 5160 + }, + { + "epoch": 0.54, + "grad_norm": 2.272805905164049, + "learning_rate": 4.546444853044308e-06, + "loss": 0.6284, + "step": 5161 + }, + { + "epoch": 0.54, + "grad_norm": 4.032007123280218, + "learning_rate": 4.544747667385163e-06, + "loss": 0.671, + "step": 5162 + }, + { + "epoch": 0.54, + "grad_norm": 2.705544234308645, + "learning_rate": 4.543050534615919e-06, + "loss": 0.6616, + "step": 5163 + }, + { + "epoch": 0.54, + "grad_norm": 2.3270348164118886, + "learning_rate": 4.541353454933743e-06, + "loss": 0.6169, + "step": 5164 + }, + { + "epoch": 0.54, + "grad_norm": 3.1781706730214743, + "learning_rate": 4.539656428535799e-06, + "loss": 0.6087, + "step": 5165 + }, + { + "epoch": 0.54, + "grad_norm": 2.157315390560079, + "learning_rate": 4.537959455619238e-06, + "loss": 0.5943, + "step": 5166 + }, + { + "epoch": 0.54, + "grad_norm": 2.909391157704517, + "learning_rate": 4.536262536381213e-06, + "loss": 0.6192, + "step": 5167 + }, + { + "epoch": 0.54, + "grad_norm": 2.3877838658782027, + "learning_rate": 4.5345656710188645e-06, + "loss": 0.6268, + "step": 5168 + }, + { + "epoch": 0.54, + "grad_norm": 2.8840280487338967, + "learning_rate": 4.532868859729333e-06, + "loss": 0.6004, + "step": 5169 + }, + { + "epoch": 0.54, + "grad_norm": 2.113177902066499, + "learning_rate": 4.531172102709746e-06, + "loss": 0.6332, + "step": 5170 + }, + { + "epoch": 0.54, + "grad_norm": 2.0427183759210084, + "learning_rate": 4.529475400157228e-06, + "loss": 0.6404, + "step": 5171 + }, + { + "epoch": 0.54, + "grad_norm": 2.552191146094726, + "learning_rate": 4.527778752268899e-06, + "loss": 0.6292, + "step": 5172 + }, + { + "epoch": 0.54, + "grad_norm": 2.3830504963095254, + "learning_rate": 4.5260821592418685e-06, + "loss": 0.6213, + "step": 5173 + }, + { + "epoch": 0.54, + "grad_norm": 5.240778096163022, + "learning_rate": 4.5243856212732466e-06, + "loss": 0.5673, + "step": 5174 + }, + { + "epoch": 0.54, + "grad_norm": 1.0053178425031761, + "learning_rate": 4.5226891385601235e-06, + "loss": 0.5822, + "step": 5175 + }, + { + "epoch": 0.54, + "grad_norm": 2.4335630650323252, + "learning_rate": 4.520992711299599e-06, + "loss": 0.6692, + "step": 5176 + }, + { + "epoch": 0.54, + "grad_norm": 2.058047240249313, + "learning_rate": 4.519296339688754e-06, + "loss": 0.5852, + "step": 5177 + }, + { + "epoch": 0.54, + "grad_norm": 3.063974457211975, + "learning_rate": 4.517600023924673e-06, + "loss": 0.6141, + "step": 5178 + }, + { + "epoch": 0.54, + "grad_norm": 2.1386266960462503, + "learning_rate": 4.515903764204428e-06, + "loss": 0.6138, + "step": 5179 + }, + { + "epoch": 0.55, + "grad_norm": 2.5711607873760527, + "learning_rate": 4.514207560725082e-06, + "loss": 0.6987, + "step": 5180 + }, + { + "epoch": 0.55, + "grad_norm": 1.979892923073177, + "learning_rate": 4.512511413683698e-06, + "loss": 0.6426, + "step": 5181 + }, + { + "epoch": 0.55, + "grad_norm": 2.1211499569858487, + "learning_rate": 4.510815323277329e-06, + "loss": 0.6801, + "step": 5182 + }, + { + "epoch": 0.55, + "grad_norm": 2.415322447097737, + "learning_rate": 4.509119289703023e-06, + "loss": 0.5961, + "step": 5183 + }, + { + "epoch": 0.55, + "grad_norm": 2.4841757246588174, + "learning_rate": 4.507423313157815e-06, + "loss": 0.6813, + "step": 5184 + }, + { + "epoch": 0.55, + "grad_norm": 2.409140496949704, + "learning_rate": 4.505727393838746e-06, + "loss": 0.6362, + "step": 5185 + }, + { + "epoch": 0.55, + "grad_norm": 2.3420213528636924, + "learning_rate": 4.504031531942837e-06, + "loss": 0.6293, + "step": 5186 + }, + { + "epoch": 0.55, + "grad_norm": 2.454227338659623, + "learning_rate": 4.502335727667114e-06, + "loss": 0.6149, + "step": 5187 + }, + { + "epoch": 0.55, + "grad_norm": 2.7418078304273394, + "learning_rate": 4.500639981208586e-06, + "loss": 0.6847, + "step": 5188 + }, + { + "epoch": 0.55, + "grad_norm": 1.9687799092010179, + "learning_rate": 4.498944292764261e-06, + "loss": 0.6455, + "step": 5189 + }, + { + "epoch": 0.55, + "grad_norm": 2.3321151600204844, + "learning_rate": 4.497248662531139e-06, + "loss": 0.605, + "step": 5190 + }, + { + "epoch": 0.55, + "grad_norm": 3.4606611035859682, + "learning_rate": 4.495553090706216e-06, + "loss": 0.6776, + "step": 5191 + }, + { + "epoch": 0.55, + "grad_norm": 2.3731291130732086, + "learning_rate": 4.493857577486477e-06, + "loss": 0.6971, + "step": 5192 + }, + { + "epoch": 0.55, + "grad_norm": 2.041114475320568, + "learning_rate": 4.492162123068899e-06, + "loss": 0.5753, + "step": 5193 + }, + { + "epoch": 0.55, + "grad_norm": 2.5283267101737827, + "learning_rate": 4.49046672765046e-06, + "loss": 0.6632, + "step": 5194 + }, + { + "epoch": 0.55, + "grad_norm": 2.535327625572618, + "learning_rate": 4.488771391428122e-06, + "loss": 0.7297, + "step": 5195 + }, + { + "epoch": 0.55, + "grad_norm": 2.3383531443300956, + "learning_rate": 4.487076114598848e-06, + "loss": 0.5403, + "step": 5196 + }, + { + "epoch": 0.55, + "grad_norm": 2.1712529025180665, + "learning_rate": 4.485380897359587e-06, + "loss": 0.658, + "step": 5197 + }, + { + "epoch": 0.55, + "grad_norm": 2.2880236706230876, + "learning_rate": 4.483685739907285e-06, + "loss": 0.6671, + "step": 5198 + }, + { + "epoch": 0.55, + "grad_norm": 2.923715631742324, + "learning_rate": 4.481990642438881e-06, + "loss": 0.6322, + "step": 5199 + }, + { + "epoch": 0.55, + "grad_norm": 2.5156565772915966, + "learning_rate": 4.480295605151308e-06, + "loss": 0.779, + "step": 5200 + }, + { + "epoch": 0.55, + "grad_norm": 2.279437155096648, + "learning_rate": 4.47860062824149e-06, + "loss": 0.5941, + "step": 5201 + }, + { + "epoch": 0.55, + "grad_norm": 2.8210671148826783, + "learning_rate": 4.4769057119063425e-06, + "loss": 0.6129, + "step": 5202 + }, + { + "epoch": 0.55, + "grad_norm": 2.1034516568430606, + "learning_rate": 4.475210856342777e-06, + "loss": 0.655, + "step": 5203 + }, + { + "epoch": 0.55, + "grad_norm": 2.1311538275721205, + "learning_rate": 4.473516061747697e-06, + "loss": 0.537, + "step": 5204 + }, + { + "epoch": 0.55, + "grad_norm": 0.9788330045930997, + "learning_rate": 4.471821328318001e-06, + "loss": 0.5896, + "step": 5205 + }, + { + "epoch": 0.55, + "grad_norm": 3.8737593347024135, + "learning_rate": 4.470126656250574e-06, + "loss": 0.5861, + "step": 5206 + }, + { + "epoch": 0.55, + "grad_norm": 2.5511717033127996, + "learning_rate": 4.468432045742301e-06, + "loss": 0.5941, + "step": 5207 + }, + { + "epoch": 0.55, + "grad_norm": 2.9074504611235072, + "learning_rate": 4.466737496990057e-06, + "loss": 0.6367, + "step": 5208 + }, + { + "epoch": 0.55, + "grad_norm": 3.069907121072369, + "learning_rate": 4.46504301019071e-06, + "loss": 0.6646, + "step": 5209 + }, + { + "epoch": 0.55, + "grad_norm": 2.6033731154873, + "learning_rate": 4.463348585541117e-06, + "loss": 0.6088, + "step": 5210 + }, + { + "epoch": 0.55, + "grad_norm": 2.188979272302636, + "learning_rate": 4.461654223238136e-06, + "loss": 0.5968, + "step": 5211 + }, + { + "epoch": 0.55, + "grad_norm": 2.743951094727751, + "learning_rate": 4.459959923478609e-06, + "loss": 0.6687, + "step": 5212 + }, + { + "epoch": 0.55, + "grad_norm": 2.379573949144014, + "learning_rate": 4.45826568645938e-06, + "loss": 0.6304, + "step": 5213 + }, + { + "epoch": 0.55, + "grad_norm": 2.454626913642241, + "learning_rate": 4.456571512377277e-06, + "loss": 0.601, + "step": 5214 + }, + { + "epoch": 0.55, + "grad_norm": 2.822548767237185, + "learning_rate": 4.454877401429123e-06, + "loss": 0.5249, + "step": 5215 + }, + { + "epoch": 0.55, + "grad_norm": 2.6657172122549992, + "learning_rate": 4.453183353811737e-06, + "loss": 0.5839, + "step": 5216 + }, + { + "epoch": 0.55, + "grad_norm": 1.3013033806129486, + "learning_rate": 4.45148936972193e-06, + "loss": 0.5733, + "step": 5217 + }, + { + "epoch": 0.55, + "grad_norm": 2.2066422534371846, + "learning_rate": 4.449795449356502e-06, + "loss": 0.6463, + "step": 5218 + }, + { + "epoch": 0.55, + "grad_norm": 2.360497161476003, + "learning_rate": 4.4481015929122465e-06, + "loss": 0.6128, + "step": 5219 + }, + { + "epoch": 0.55, + "grad_norm": 2.5466902571725396, + "learning_rate": 4.446407800585954e-06, + "loss": 0.6788, + "step": 5220 + }, + { + "epoch": 0.55, + "grad_norm": 7.202579197824117, + "learning_rate": 4.444714072574401e-06, + "loss": 0.6787, + "step": 5221 + }, + { + "epoch": 0.55, + "grad_norm": 3.010123101753715, + "learning_rate": 4.443020409074365e-06, + "loss": 0.7269, + "step": 5222 + }, + { + "epoch": 0.55, + "grad_norm": 2.8939895157613513, + "learning_rate": 4.441326810282606e-06, + "loss": 0.7057, + "step": 5223 + }, + { + "epoch": 0.55, + "grad_norm": 2.7291258805819063, + "learning_rate": 4.4396332763958835e-06, + "loss": 0.6379, + "step": 5224 + }, + { + "epoch": 0.55, + "grad_norm": 2.8152570916653485, + "learning_rate": 4.437939807610947e-06, + "loss": 0.6389, + "step": 5225 + }, + { + "epoch": 0.55, + "grad_norm": 4.863829933630771, + "learning_rate": 4.436246404124539e-06, + "loss": 0.592, + "step": 5226 + }, + { + "epoch": 0.55, + "grad_norm": 1.9228368391092252, + "learning_rate": 4.4345530661333955e-06, + "loss": 0.6296, + "step": 5227 + }, + { + "epoch": 0.55, + "grad_norm": 2.4158924275798084, + "learning_rate": 4.432859793834239e-06, + "loss": 0.582, + "step": 5228 + }, + { + "epoch": 0.55, + "grad_norm": 2.572611497694934, + "learning_rate": 4.431166587423794e-06, + "loss": 0.6169, + "step": 5229 + }, + { + "epoch": 0.55, + "grad_norm": 2.268729295515817, + "learning_rate": 4.42947344709877e-06, + "loss": 0.6666, + "step": 5230 + }, + { + "epoch": 0.55, + "grad_norm": 2.0952795027263966, + "learning_rate": 4.4277803730558746e-06, + "loss": 0.6486, + "step": 5231 + }, + { + "epoch": 0.55, + "grad_norm": 1.0968523158920367, + "learning_rate": 4.426087365491798e-06, + "loss": 0.5293, + "step": 5232 + }, + { + "epoch": 0.55, + "grad_norm": 2.088438078890414, + "learning_rate": 4.424394424603234e-06, + "loss": 0.5566, + "step": 5233 + }, + { + "epoch": 0.55, + "grad_norm": 2.1632948290431497, + "learning_rate": 4.42270155058686e-06, + "loss": 0.6262, + "step": 5234 + }, + { + "epoch": 0.55, + "grad_norm": 0.9284998520052347, + "learning_rate": 4.421008743639353e-06, + "loss": 0.5875, + "step": 5235 + }, + { + "epoch": 0.55, + "grad_norm": 2.0523984063768803, + "learning_rate": 4.419316003957376e-06, + "loss": 0.7087, + "step": 5236 + }, + { + "epoch": 0.55, + "grad_norm": 2.443819742443756, + "learning_rate": 4.417623331737587e-06, + "loss": 0.6464, + "step": 5237 + }, + { + "epoch": 0.55, + "grad_norm": 2.0896384959029604, + "learning_rate": 4.415930727176634e-06, + "loss": 0.5857, + "step": 5238 + }, + { + "epoch": 0.55, + "grad_norm": 2.230344099608812, + "learning_rate": 4.414238190471163e-06, + "loss": 0.6313, + "step": 5239 + }, + { + "epoch": 0.55, + "grad_norm": 3.3364275587007395, + "learning_rate": 4.412545721817806e-06, + "loss": 0.7127, + "step": 5240 + }, + { + "epoch": 0.55, + "grad_norm": 1.9806871576287524, + "learning_rate": 4.410853321413187e-06, + "loss": 0.6476, + "step": 5241 + }, + { + "epoch": 0.55, + "grad_norm": 2.5890707975835805, + "learning_rate": 4.409160989453927e-06, + "loss": 0.6181, + "step": 5242 + }, + { + "epoch": 0.55, + "grad_norm": 3.0789313597402534, + "learning_rate": 4.407468726136634e-06, + "loss": 0.637, + "step": 5243 + }, + { + "epoch": 0.55, + "grad_norm": 2.1476761639567976, + "learning_rate": 4.405776531657916e-06, + "loss": 0.6924, + "step": 5244 + }, + { + "epoch": 0.55, + "grad_norm": 3.4858241564055628, + "learning_rate": 4.404084406214358e-06, + "loss": 0.7234, + "step": 5245 + }, + { + "epoch": 0.55, + "grad_norm": 2.3970949139274835, + "learning_rate": 4.402392350002554e-06, + "loss": 0.5235, + "step": 5246 + }, + { + "epoch": 0.55, + "grad_norm": 2.235687197461085, + "learning_rate": 4.400700363219076e-06, + "loss": 0.6731, + "step": 5247 + }, + { + "epoch": 0.55, + "grad_norm": 8.656344736133104, + "learning_rate": 4.399008446060501e-06, + "loss": 0.6499, + "step": 5248 + }, + { + "epoch": 0.55, + "grad_norm": 3.0639313280738216, + "learning_rate": 4.397316598723385e-06, + "loss": 0.5468, + "step": 5249 + }, + { + "epoch": 0.55, + "grad_norm": 2.0867057122263857, + "learning_rate": 4.3956248214042855e-06, + "loss": 0.6578, + "step": 5250 + }, + { + "epoch": 0.55, + "grad_norm": 2.599040114763554, + "learning_rate": 4.393933114299746e-06, + "loss": 0.6618, + "step": 5251 + }, + { + "epoch": 0.55, + "grad_norm": 2.6001557142240945, + "learning_rate": 4.3922414776063075e-06, + "loss": 0.6799, + "step": 5252 + }, + { + "epoch": 0.55, + "grad_norm": 2.1507305611894223, + "learning_rate": 4.390549911520497e-06, + "loss": 0.663, + "step": 5253 + }, + { + "epoch": 0.55, + "grad_norm": 2.7080296933337946, + "learning_rate": 4.388858416238834e-06, + "loss": 0.5737, + "step": 5254 + }, + { + "epoch": 0.55, + "grad_norm": 5.185440108618935, + "learning_rate": 4.3871669919578345e-06, + "loss": 0.6253, + "step": 5255 + }, + { + "epoch": 0.55, + "grad_norm": 2.41165829284016, + "learning_rate": 4.385475638874001e-06, + "loss": 0.6524, + "step": 5256 + }, + { + "epoch": 0.55, + "grad_norm": 2.0980213739464113, + "learning_rate": 4.383784357183835e-06, + "loss": 0.5656, + "step": 5257 + }, + { + "epoch": 0.55, + "grad_norm": 6.656438895282767, + "learning_rate": 4.382093147083819e-06, + "loss": 0.6449, + "step": 5258 + }, + { + "epoch": 0.55, + "grad_norm": 2.4496335492831403, + "learning_rate": 4.380402008770435e-06, + "loss": 0.6254, + "step": 5259 + }, + { + "epoch": 0.55, + "grad_norm": 2.959663295531009, + "learning_rate": 4.378710942440153e-06, + "loss": 0.6069, + "step": 5260 + }, + { + "epoch": 0.55, + "grad_norm": 2.7471051867065492, + "learning_rate": 4.377019948289441e-06, + "loss": 0.6661, + "step": 5261 + }, + { + "epoch": 0.55, + "grad_norm": 3.8039135221418805, + "learning_rate": 4.375329026514749e-06, + "loss": 0.6398, + "step": 5262 + }, + { + "epoch": 0.55, + "grad_norm": 3.677966696492936, + "learning_rate": 4.373638177312524e-06, + "loss": 0.5647, + "step": 5263 + }, + { + "epoch": 0.55, + "grad_norm": 2.6177835119924455, + "learning_rate": 4.371947400879205e-06, + "loss": 0.6016, + "step": 5264 + }, + { + "epoch": 0.55, + "grad_norm": 2.233055028774205, + "learning_rate": 4.370256697411221e-06, + "loss": 0.5827, + "step": 5265 + }, + { + "epoch": 0.55, + "grad_norm": 2.2362734819525723, + "learning_rate": 4.368566067104998e-06, + "loss": 0.6496, + "step": 5266 + }, + { + "epoch": 0.55, + "grad_norm": 2.211364135754513, + "learning_rate": 4.366875510156939e-06, + "loss": 0.557, + "step": 5267 + }, + { + "epoch": 0.55, + "grad_norm": 2.8665096255402696, + "learning_rate": 4.365185026763455e-06, + "loss": 0.5965, + "step": 5268 + }, + { + "epoch": 0.55, + "grad_norm": 3.0564195544942065, + "learning_rate": 4.363494617120938e-06, + "loss": 0.7145, + "step": 5269 + }, + { + "epoch": 0.55, + "grad_norm": 2.64369421900083, + "learning_rate": 4.361804281425779e-06, + "loss": 0.725, + "step": 5270 + }, + { + "epoch": 0.55, + "grad_norm": 3.12214434238347, + "learning_rate": 4.360114019874353e-06, + "loss": 0.6916, + "step": 5271 + }, + { + "epoch": 0.55, + "grad_norm": 6.609092742818454, + "learning_rate": 4.35842383266303e-06, + "loss": 0.6325, + "step": 5272 + }, + { + "epoch": 0.55, + "grad_norm": 2.300634353062687, + "learning_rate": 4.35673371998817e-06, + "loss": 0.6284, + "step": 5273 + }, + { + "epoch": 0.55, + "grad_norm": 3.7838481612576356, + "learning_rate": 4.355043682046129e-06, + "loss": 0.6038, + "step": 5274 + }, + { + "epoch": 0.56, + "grad_norm": 2.7986771832900788, + "learning_rate": 4.353353719033249e-06, + "loss": 0.654, + "step": 5275 + }, + { + "epoch": 0.56, + "grad_norm": 4.87051694324319, + "learning_rate": 4.3516638311458624e-06, + "loss": 0.5146, + "step": 5276 + }, + { + "epoch": 0.56, + "grad_norm": 2.349730181771263, + "learning_rate": 4.349974018580298e-06, + "loss": 0.6656, + "step": 5277 + }, + { + "epoch": 0.56, + "grad_norm": 2.335395790852798, + "learning_rate": 4.348284281532874e-06, + "loss": 0.5557, + "step": 5278 + }, + { + "epoch": 0.56, + "grad_norm": 2.7325757799709605, + "learning_rate": 4.3465946201999e-06, + "loss": 0.7216, + "step": 5279 + }, + { + "epoch": 0.56, + "grad_norm": 2.104588371726466, + "learning_rate": 4.344905034777672e-06, + "loss": 0.6013, + "step": 5280 + }, + { + "epoch": 0.56, + "grad_norm": 2.3109494580443304, + "learning_rate": 4.343215525462484e-06, + "loss": 0.6356, + "step": 5281 + }, + { + "epoch": 0.56, + "grad_norm": 3.017653085191442, + "learning_rate": 4.3415260924506165e-06, + "loss": 0.6454, + "step": 5282 + }, + { + "epoch": 0.56, + "grad_norm": 2.571020375717761, + "learning_rate": 4.339836735938347e-06, + "loss": 0.5784, + "step": 5283 + }, + { + "epoch": 0.56, + "grad_norm": 2.4571187576232276, + "learning_rate": 4.338147456121935e-06, + "loss": 0.6483, + "step": 5284 + }, + { + "epoch": 0.56, + "grad_norm": 2.5644236337830937, + "learning_rate": 4.336458253197637e-06, + "loss": 0.5715, + "step": 5285 + }, + { + "epoch": 0.56, + "grad_norm": 2.511149918383041, + "learning_rate": 4.334769127361703e-06, + "loss": 0.6012, + "step": 5286 + }, + { + "epoch": 0.56, + "grad_norm": 4.883950372753124, + "learning_rate": 4.333080078810369e-06, + "loss": 0.6156, + "step": 5287 + }, + { + "epoch": 0.56, + "grad_norm": 3.4555634315635984, + "learning_rate": 4.331391107739864e-06, + "loss": 0.6276, + "step": 5288 + }, + { + "epoch": 0.56, + "grad_norm": 2.8118042237460923, + "learning_rate": 4.329702214346406e-06, + "loss": 0.6439, + "step": 5289 + }, + { + "epoch": 0.56, + "grad_norm": 2.141667067861921, + "learning_rate": 4.328013398826207e-06, + "loss": 0.6292, + "step": 5290 + }, + { + "epoch": 0.56, + "grad_norm": 2.1279882409294806, + "learning_rate": 4.3263246613754685e-06, + "loss": 0.6465, + "step": 5291 + }, + { + "epoch": 0.56, + "grad_norm": 5.1797440641409, + "learning_rate": 4.324636002190386e-06, + "loss": 0.6019, + "step": 5292 + }, + { + "epoch": 0.56, + "grad_norm": 2.5518905433521217, + "learning_rate": 4.322947421467138e-06, + "loss": 0.6755, + "step": 5293 + }, + { + "epoch": 0.56, + "grad_norm": 4.707501918054458, + "learning_rate": 4.321258919401903e-06, + "loss": 0.7213, + "step": 5294 + }, + { + "epoch": 0.56, + "grad_norm": 3.090219508336725, + "learning_rate": 4.319570496190843e-06, + "loss": 0.5684, + "step": 5295 + }, + { + "epoch": 0.56, + "grad_norm": 2.5586775104390838, + "learning_rate": 4.317882152030118e-06, + "loss": 0.6409, + "step": 5296 + }, + { + "epoch": 0.56, + "grad_norm": 2.7425290806450615, + "learning_rate": 4.316193887115871e-06, + "loss": 0.6583, + "step": 5297 + }, + { + "epoch": 0.56, + "grad_norm": 2.929685634618751, + "learning_rate": 4.314505701644242e-06, + "loss": 0.6087, + "step": 5298 + }, + { + "epoch": 0.56, + "grad_norm": 1.037591260391563, + "learning_rate": 4.3128175958113585e-06, + "loss": 0.5755, + "step": 5299 + }, + { + "epoch": 0.56, + "grad_norm": 2.459322238903088, + "learning_rate": 4.311129569813341e-06, + "loss": 0.6523, + "step": 5300 + }, + { + "epoch": 0.56, + "grad_norm": 3.3425180981358253, + "learning_rate": 4.3094416238463e-06, + "loss": 0.6665, + "step": 5301 + }, + { + "epoch": 0.56, + "grad_norm": 2.2917138959022267, + "learning_rate": 4.307753758106332e-06, + "loss": 0.6679, + "step": 5302 + }, + { + "epoch": 0.56, + "grad_norm": 1.9682044085217711, + "learning_rate": 4.306065972789533e-06, + "loss": 0.6181, + "step": 5303 + }, + { + "epoch": 0.56, + "grad_norm": 2.339833756953796, + "learning_rate": 4.304378268091982e-06, + "loss": 0.6613, + "step": 5304 + }, + { + "epoch": 0.56, + "grad_norm": 3.445630959329979, + "learning_rate": 4.302690644209756e-06, + "loss": 0.6045, + "step": 5305 + }, + { + "epoch": 0.56, + "grad_norm": 2.5016790643491187, + "learning_rate": 4.301003101338913e-06, + "loss": 0.5814, + "step": 5306 + }, + { + "epoch": 0.56, + "grad_norm": 5.6755679527300735, + "learning_rate": 4.29931563967551e-06, + "loss": 0.6186, + "step": 5307 + }, + { + "epoch": 0.56, + "grad_norm": 2.4930015287099927, + "learning_rate": 4.2976282594155885e-06, + "loss": 0.6656, + "step": 5308 + }, + { + "epoch": 0.56, + "grad_norm": 2.3871048273612114, + "learning_rate": 4.2959409607551885e-06, + "loss": 0.6458, + "step": 5309 + }, + { + "epoch": 0.56, + "grad_norm": 2.6870418365333726, + "learning_rate": 4.294253743890331e-06, + "loss": 0.5847, + "step": 5310 + }, + { + "epoch": 0.56, + "grad_norm": 2.453388094507715, + "learning_rate": 4.292566609017032e-06, + "loss": 0.5881, + "step": 5311 + }, + { + "epoch": 0.56, + "grad_norm": 3.1874846967139456, + "learning_rate": 4.290879556331301e-06, + "loss": 0.5902, + "step": 5312 + }, + { + "epoch": 0.56, + "grad_norm": 5.5444861113536374, + "learning_rate": 4.2891925860291315e-06, + "loss": 0.4916, + "step": 5313 + }, + { + "epoch": 0.56, + "grad_norm": 2.2307192839807906, + "learning_rate": 4.287505698306517e-06, + "loss": 0.6353, + "step": 5314 + }, + { + "epoch": 0.56, + "grad_norm": 2.316081716986372, + "learning_rate": 4.285818893359427e-06, + "loss": 0.6589, + "step": 5315 + }, + { + "epoch": 0.56, + "grad_norm": 2.455941628341801, + "learning_rate": 4.284132171383834e-06, + "loss": 0.6596, + "step": 5316 + }, + { + "epoch": 0.56, + "grad_norm": 3.220993513664449, + "learning_rate": 4.2824455325756955e-06, + "loss": 0.6346, + "step": 5317 + }, + { + "epoch": 0.56, + "grad_norm": 2.19089056244659, + "learning_rate": 4.2807589771309635e-06, + "loss": 0.5772, + "step": 5318 + }, + { + "epoch": 0.56, + "grad_norm": 2.731529565019871, + "learning_rate": 4.2790725052455726e-06, + "loss": 0.6435, + "step": 5319 + }, + { + "epoch": 0.56, + "grad_norm": 3.5785634546615195, + "learning_rate": 4.2773861171154525e-06, + "loss": 0.7559, + "step": 5320 + }, + { + "epoch": 0.56, + "grad_norm": 3.480519667349258, + "learning_rate": 4.275699812936526e-06, + "loss": 0.6401, + "step": 5321 + }, + { + "epoch": 0.56, + "grad_norm": 2.4318321347360388, + "learning_rate": 4.2740135929047034e-06, + "loss": 0.6463, + "step": 5322 + }, + { + "epoch": 0.56, + "grad_norm": 2.5507047618208354, + "learning_rate": 4.2723274572158805e-06, + "loss": 0.6721, + "step": 5323 + }, + { + "epoch": 0.56, + "grad_norm": 2.4756784180683753, + "learning_rate": 4.27064140606595e-06, + "loss": 0.6507, + "step": 5324 + }, + { + "epoch": 0.56, + "grad_norm": 2.5608707563373527, + "learning_rate": 4.268955439650793e-06, + "loss": 0.5792, + "step": 5325 + }, + { + "epoch": 0.56, + "grad_norm": 2.551538680904513, + "learning_rate": 4.267269558166279e-06, + "loss": 0.6255, + "step": 5326 + }, + { + "epoch": 0.56, + "grad_norm": 2.2584831084156374, + "learning_rate": 4.265583761808272e-06, + "loss": 0.7075, + "step": 5327 + }, + { + "epoch": 0.56, + "grad_norm": 2.4024189459356524, + "learning_rate": 4.26389805077262e-06, + "loss": 0.6714, + "step": 5328 + }, + { + "epoch": 0.56, + "grad_norm": 2.5974293268410182, + "learning_rate": 4.262212425255164e-06, + "loss": 0.6049, + "step": 5329 + }, + { + "epoch": 0.56, + "grad_norm": 2.222020544696383, + "learning_rate": 4.260526885451734e-06, + "loss": 0.6188, + "step": 5330 + }, + { + "epoch": 0.56, + "grad_norm": 2.637638372678464, + "learning_rate": 4.258841431558156e-06, + "loss": 0.6554, + "step": 5331 + }, + { + "epoch": 0.56, + "grad_norm": 2.0803098388886165, + "learning_rate": 4.257156063770237e-06, + "loss": 0.6647, + "step": 5332 + }, + { + "epoch": 0.56, + "grad_norm": 2.038548102890104, + "learning_rate": 4.2554707822837775e-06, + "loss": 0.576, + "step": 5333 + }, + { + "epoch": 0.56, + "grad_norm": 2.4381809992111414, + "learning_rate": 4.253785587294571e-06, + "loss": 0.6267, + "step": 5334 + }, + { + "epoch": 0.56, + "grad_norm": 2.9095370655137764, + "learning_rate": 4.252100478998398e-06, + "loss": 0.5394, + "step": 5335 + }, + { + "epoch": 0.56, + "grad_norm": 2.208800982681307, + "learning_rate": 4.250415457591031e-06, + "loss": 0.5668, + "step": 5336 + }, + { + "epoch": 0.56, + "grad_norm": 2.237278462901377, + "learning_rate": 4.248730523268227e-06, + "loss": 0.554, + "step": 5337 + }, + { + "epoch": 0.56, + "grad_norm": 5.159594855549469, + "learning_rate": 4.24704567622574e-06, + "loss": 0.5775, + "step": 5338 + }, + { + "epoch": 0.56, + "grad_norm": 2.0495262092053586, + "learning_rate": 4.245360916659309e-06, + "loss": 0.6269, + "step": 5339 + }, + { + "epoch": 0.56, + "grad_norm": 2.7328601805196513, + "learning_rate": 4.243676244764667e-06, + "loss": 0.5666, + "step": 5340 + }, + { + "epoch": 0.56, + "grad_norm": 2.599232977470765, + "learning_rate": 4.241991660737532e-06, + "loss": 0.5742, + "step": 5341 + }, + { + "epoch": 0.56, + "grad_norm": 2.6689500818787217, + "learning_rate": 4.240307164773615e-06, + "loss": 0.6964, + "step": 5342 + }, + { + "epoch": 0.56, + "grad_norm": 2.0051800373166295, + "learning_rate": 4.238622757068614e-06, + "loss": 0.6334, + "step": 5343 + }, + { + "epoch": 0.56, + "grad_norm": 3.296191095173538, + "learning_rate": 4.2369384378182216e-06, + "loss": 0.5819, + "step": 5344 + }, + { + "epoch": 0.56, + "grad_norm": 3.7526757822737973, + "learning_rate": 4.2352542072181156e-06, + "loss": 0.5592, + "step": 5345 + }, + { + "epoch": 0.56, + "grad_norm": 4.769568113242851, + "learning_rate": 4.233570065463964e-06, + "loss": 0.6135, + "step": 5346 + }, + { + "epoch": 0.56, + "grad_norm": 2.2793506199943447, + "learning_rate": 4.231886012751427e-06, + "loss": 0.5955, + "step": 5347 + }, + { + "epoch": 0.56, + "grad_norm": 2.4668991593558562, + "learning_rate": 4.230202049276152e-06, + "loss": 0.6746, + "step": 5348 + }, + { + "epoch": 0.56, + "grad_norm": 2.273428983253158, + "learning_rate": 4.228518175233781e-06, + "loss": 0.6367, + "step": 5349 + }, + { + "epoch": 0.56, + "grad_norm": 3.012893755691541, + "learning_rate": 4.226834390819935e-06, + "loss": 0.6371, + "step": 5350 + }, + { + "epoch": 0.56, + "grad_norm": 2.892193534980026, + "learning_rate": 4.225150696230236e-06, + "loss": 0.6217, + "step": 5351 + }, + { + "epoch": 0.56, + "grad_norm": 2.4872260622117532, + "learning_rate": 4.223467091660287e-06, + "loss": 0.6131, + "step": 5352 + }, + { + "epoch": 0.56, + "grad_norm": 2.493780184626296, + "learning_rate": 4.22178357730569e-06, + "loss": 0.6648, + "step": 5353 + }, + { + "epoch": 0.56, + "grad_norm": 2.1706205845561226, + "learning_rate": 4.220100153362026e-06, + "loss": 0.6315, + "step": 5354 + }, + { + "epoch": 0.56, + "grad_norm": 2.2135535393045607, + "learning_rate": 4.2184168200248695e-06, + "loss": 0.6127, + "step": 5355 + }, + { + "epoch": 0.56, + "grad_norm": 3.6077079918211687, + "learning_rate": 4.21673357748979e-06, + "loss": 0.5827, + "step": 5356 + }, + { + "epoch": 0.56, + "grad_norm": 2.2105823742172683, + "learning_rate": 4.215050425952339e-06, + "loss": 0.6933, + "step": 5357 + }, + { + "epoch": 0.56, + "grad_norm": 2.2055870618699362, + "learning_rate": 4.213367365608061e-06, + "loss": 0.638, + "step": 5358 + }, + { + "epoch": 0.56, + "grad_norm": 2.588655796581996, + "learning_rate": 4.211684396652487e-06, + "loss": 0.6038, + "step": 5359 + }, + { + "epoch": 0.56, + "grad_norm": 2.0448283559826437, + "learning_rate": 4.210001519281142e-06, + "loss": 0.7124, + "step": 5360 + }, + { + "epoch": 0.56, + "grad_norm": 2.062696584888601, + "learning_rate": 4.208318733689535e-06, + "loss": 0.626, + "step": 5361 + }, + { + "epoch": 0.56, + "grad_norm": 2.604045649910468, + "learning_rate": 4.206636040073172e-06, + "loss": 0.6666, + "step": 5362 + }, + { + "epoch": 0.56, + "grad_norm": 2.508672230990463, + "learning_rate": 4.204953438627539e-06, + "loss": 0.6892, + "step": 5363 + }, + { + "epoch": 0.56, + "grad_norm": 4.199729614432598, + "learning_rate": 4.203270929548117e-06, + "loss": 0.6088, + "step": 5364 + }, + { + "epoch": 0.56, + "grad_norm": 2.315338636083697, + "learning_rate": 4.2015885130303745e-06, + "loss": 0.5451, + "step": 5365 + }, + { + "epoch": 0.56, + "grad_norm": 2.8563637400295274, + "learning_rate": 4.199906189269773e-06, + "loss": 0.6707, + "step": 5366 + }, + { + "epoch": 0.56, + "grad_norm": 3.3092435696741203, + "learning_rate": 4.198223958461756e-06, + "loss": 0.5999, + "step": 5367 + }, + { + "epoch": 0.56, + "grad_norm": 2.284714130272184, + "learning_rate": 4.196541820801761e-06, + "loss": 0.5714, + "step": 5368 + }, + { + "epoch": 0.56, + "grad_norm": 2.208365754435486, + "learning_rate": 4.194859776485216e-06, + "loss": 0.6477, + "step": 5369 + }, + { + "epoch": 0.57, + "grad_norm": 2.9161689800766633, + "learning_rate": 4.193177825707535e-06, + "loss": 0.605, + "step": 5370 + }, + { + "epoch": 0.57, + "grad_norm": 2.5071516041978534, + "learning_rate": 4.191495968664122e-06, + "loss": 0.6405, + "step": 5371 + }, + { + "epoch": 0.57, + "grad_norm": 2.374183355215098, + "learning_rate": 4.189814205550369e-06, + "loss": 0.6325, + "step": 5372 + }, + { + "epoch": 0.57, + "grad_norm": 2.9716578098089905, + "learning_rate": 4.18813253656166e-06, + "loss": 0.5657, + "step": 5373 + }, + { + "epoch": 0.57, + "grad_norm": 2.5849556565957026, + "learning_rate": 4.186450961893366e-06, + "loss": 0.5766, + "step": 5374 + }, + { + "epoch": 0.57, + "grad_norm": 2.6139215621325977, + "learning_rate": 4.184769481740848e-06, + "loss": 0.5995, + "step": 5375 + }, + { + "epoch": 0.57, + "grad_norm": 2.035112537909609, + "learning_rate": 4.183088096299455e-06, + "loss": 0.6741, + "step": 5376 + }, + { + "epoch": 0.57, + "grad_norm": 4.3310970193321285, + "learning_rate": 4.1814068057645255e-06, + "loss": 0.6492, + "step": 5377 + }, + { + "epoch": 0.57, + "grad_norm": 2.581325181469474, + "learning_rate": 4.179725610331385e-06, + "loss": 0.5451, + "step": 5378 + }, + { + "epoch": 0.57, + "grad_norm": 4.498160977405595, + "learning_rate": 4.178044510195356e-06, + "loss": 0.657, + "step": 5379 + }, + { + "epoch": 0.57, + "grad_norm": 2.631101141420637, + "learning_rate": 4.176363505551737e-06, + "loss": 0.7063, + "step": 5380 + }, + { + "epoch": 0.57, + "grad_norm": 2.528098787527815, + "learning_rate": 4.174682596595824e-06, + "loss": 0.6972, + "step": 5381 + }, + { + "epoch": 0.57, + "grad_norm": 2.3803147400781186, + "learning_rate": 4.173001783522903e-06, + "loss": 0.6797, + "step": 5382 + }, + { + "epoch": 0.57, + "grad_norm": 2.483697902714156, + "learning_rate": 4.171321066528243e-06, + "loss": 0.659, + "step": 5383 + }, + { + "epoch": 0.57, + "grad_norm": 3.1064216692361284, + "learning_rate": 4.16964044580711e-06, + "loss": 0.6183, + "step": 5384 + }, + { + "epoch": 0.57, + "grad_norm": 2.750461435944443, + "learning_rate": 4.167959921554745e-06, + "loss": 0.6222, + "step": 5385 + }, + { + "epoch": 0.57, + "grad_norm": 2.0857966955698677, + "learning_rate": 4.166279493966393e-06, + "loss": 0.6282, + "step": 5386 + }, + { + "epoch": 0.57, + "grad_norm": 3.2748873110006333, + "learning_rate": 4.164599163237277e-06, + "loss": 0.627, + "step": 5387 + }, + { + "epoch": 0.57, + "grad_norm": 2.7282550442214744, + "learning_rate": 4.1629189295626195e-06, + "loss": 0.6795, + "step": 5388 + }, + { + "epoch": 0.57, + "grad_norm": 4.174622024247829, + "learning_rate": 4.161238793137619e-06, + "loss": 0.5899, + "step": 5389 + }, + { + "epoch": 0.57, + "grad_norm": 2.068250197507869, + "learning_rate": 4.159558754157469e-06, + "loss": 0.5851, + "step": 5390 + }, + { + "epoch": 0.57, + "grad_norm": 2.1242958834633563, + "learning_rate": 4.157878812817356e-06, + "loss": 0.6659, + "step": 5391 + }, + { + "epoch": 0.57, + "grad_norm": 2.7032218490370274, + "learning_rate": 4.1561989693124485e-06, + "loss": 0.6705, + "step": 5392 + }, + { + "epoch": 0.57, + "grad_norm": 2.753703066203034, + "learning_rate": 4.154519223837904e-06, + "loss": 0.6477, + "step": 5393 + }, + { + "epoch": 0.57, + "grad_norm": 2.5189720424400934, + "learning_rate": 4.1528395765888716e-06, + "loss": 0.6326, + "step": 5394 + }, + { + "epoch": 0.57, + "grad_norm": 2.1872662432303587, + "learning_rate": 4.15116002776049e-06, + "loss": 0.6172, + "step": 5395 + }, + { + "epoch": 0.57, + "grad_norm": 2.8914596193035806, + "learning_rate": 4.14948057754788e-06, + "loss": 0.6174, + "step": 5396 + }, + { + "epoch": 0.57, + "grad_norm": 4.0372536675287805, + "learning_rate": 4.147801226146163e-06, + "loss": 0.5389, + "step": 5397 + }, + { + "epoch": 0.57, + "grad_norm": 3.7867058905780673, + "learning_rate": 4.146121973750431e-06, + "loss": 0.6527, + "step": 5398 + }, + { + "epoch": 0.57, + "grad_norm": 2.3654720115387655, + "learning_rate": 4.144442820555782e-06, + "loss": 0.6418, + "step": 5399 + }, + { + "epoch": 0.57, + "grad_norm": 2.179581313117845, + "learning_rate": 4.142763766757292e-06, + "loss": 0.635, + "step": 5400 + }, + { + "epoch": 0.57, + "grad_norm": 2.2479561580681238, + "learning_rate": 4.141084812550031e-06, + "loss": 0.6461, + "step": 5401 + }, + { + "epoch": 0.57, + "grad_norm": 2.841191478849272, + "learning_rate": 4.139405958129053e-06, + "loss": 0.6007, + "step": 5402 + }, + { + "epoch": 0.57, + "grad_norm": 2.0859232520906836, + "learning_rate": 4.137727203689402e-06, + "loss": 0.5766, + "step": 5403 + }, + { + "epoch": 0.57, + "grad_norm": 3.3485825067338117, + "learning_rate": 4.136048549426112e-06, + "loss": 0.6512, + "step": 5404 + }, + { + "epoch": 0.57, + "grad_norm": 2.49496623222408, + "learning_rate": 4.134369995534206e-06, + "loss": 0.6146, + "step": 5405 + }, + { + "epoch": 0.57, + "grad_norm": 2.324007129830475, + "learning_rate": 4.132691542208691e-06, + "loss": 0.6011, + "step": 5406 + }, + { + "epoch": 0.57, + "grad_norm": 1.0793244375392315, + "learning_rate": 4.1310131896445635e-06, + "loss": 0.5288, + "step": 5407 + }, + { + "epoch": 0.57, + "grad_norm": 3.2583878078923725, + "learning_rate": 4.129334938036813e-06, + "loss": 0.6705, + "step": 5408 + }, + { + "epoch": 0.57, + "grad_norm": 2.307137719361831, + "learning_rate": 4.127656787580412e-06, + "loss": 0.6923, + "step": 5409 + }, + { + "epoch": 0.57, + "grad_norm": 2.578677072913258, + "learning_rate": 4.125978738470326e-06, + "loss": 0.6375, + "step": 5410 + }, + { + "epoch": 0.57, + "grad_norm": 2.4643009262796918, + "learning_rate": 4.1243007909015016e-06, + "loss": 0.5542, + "step": 5411 + }, + { + "epoch": 0.57, + "grad_norm": 2.2146931372972523, + "learning_rate": 4.1226229450688805e-06, + "loss": 0.6535, + "step": 5412 + }, + { + "epoch": 0.57, + "grad_norm": 2.119950795155613, + "learning_rate": 4.120945201167388e-06, + "loss": 0.6085, + "step": 5413 + }, + { + "epoch": 0.57, + "grad_norm": 3.5062101551407485, + "learning_rate": 4.119267559391944e-06, + "loss": 0.6207, + "step": 5414 + }, + { + "epoch": 0.57, + "grad_norm": 1.8128176772340931, + "learning_rate": 4.117590019937447e-06, + "loss": 0.5534, + "step": 5415 + }, + { + "epoch": 0.57, + "grad_norm": 2.4544616665994945, + "learning_rate": 4.11591258299879e-06, + "loss": 0.664, + "step": 5416 + }, + { + "epoch": 0.57, + "grad_norm": 3.5069164956287646, + "learning_rate": 4.114235248770854e-06, + "loss": 0.6393, + "step": 5417 + }, + { + "epoch": 0.57, + "grad_norm": 2.7344010000318297, + "learning_rate": 4.112558017448508e-06, + "loss": 0.6276, + "step": 5418 + }, + { + "epoch": 0.57, + "grad_norm": 3.5983760316820343, + "learning_rate": 4.1108808892266045e-06, + "loss": 0.6458, + "step": 5419 + }, + { + "epoch": 0.57, + "grad_norm": 2.8082114352695724, + "learning_rate": 4.109203864299989e-06, + "loss": 0.5609, + "step": 5420 + }, + { + "epoch": 0.57, + "grad_norm": 2.364880898558155, + "learning_rate": 4.107526942863493e-06, + "loss": 0.7066, + "step": 5421 + }, + { + "epoch": 0.57, + "grad_norm": 1.9084674442912224, + "learning_rate": 4.105850125111937e-06, + "loss": 0.613, + "step": 5422 + }, + { + "epoch": 0.57, + "grad_norm": 1.039149108932299, + "learning_rate": 4.104173411240131e-06, + "loss": 0.5501, + "step": 5423 + }, + { + "epoch": 0.57, + "grad_norm": 2.760756741136062, + "learning_rate": 4.102496801442868e-06, + "loss": 0.7026, + "step": 5424 + }, + { + "epoch": 0.57, + "grad_norm": 2.2973399753902295, + "learning_rate": 4.10082029591493e-06, + "loss": 0.643, + "step": 5425 + }, + { + "epoch": 0.57, + "grad_norm": 2.5509810297461826, + "learning_rate": 4.099143894851092e-06, + "loss": 0.5958, + "step": 5426 + }, + { + "epoch": 0.57, + "grad_norm": 1.0570994896608783, + "learning_rate": 4.097467598446113e-06, + "loss": 0.5708, + "step": 5427 + }, + { + "epoch": 0.57, + "grad_norm": 2.0781495093655433, + "learning_rate": 4.095791406894739e-06, + "loss": 0.5697, + "step": 5428 + }, + { + "epoch": 0.57, + "grad_norm": 2.46415369130286, + "learning_rate": 4.094115320391704e-06, + "loss": 0.6917, + "step": 5429 + }, + { + "epoch": 0.57, + "grad_norm": 2.1679299121769535, + "learning_rate": 4.0924393391317344e-06, + "loss": 0.6069, + "step": 5430 + }, + { + "epoch": 0.57, + "grad_norm": 1.0173189356350105, + "learning_rate": 4.090763463309536e-06, + "loss": 0.5557, + "step": 5431 + }, + { + "epoch": 0.57, + "grad_norm": 2.570745562650635, + "learning_rate": 4.089087693119815e-06, + "loss": 0.6819, + "step": 5432 + }, + { + "epoch": 0.57, + "grad_norm": 2.7474195051489185, + "learning_rate": 4.087412028757249e-06, + "loss": 0.621, + "step": 5433 + }, + { + "epoch": 0.57, + "grad_norm": 1.9523833561555928, + "learning_rate": 4.085736470416517e-06, + "loss": 0.6772, + "step": 5434 + }, + { + "epoch": 0.57, + "grad_norm": 2.4968268241742986, + "learning_rate": 4.084061018292277e-06, + "loss": 0.6174, + "step": 5435 + }, + { + "epoch": 0.57, + "grad_norm": 3.53792252658528, + "learning_rate": 4.082385672579182e-06, + "loss": 0.637, + "step": 5436 + }, + { + "epoch": 0.57, + "grad_norm": 2.964810455872569, + "learning_rate": 4.0807104334718674e-06, + "loss": 0.6321, + "step": 5437 + }, + { + "epoch": 0.57, + "grad_norm": 2.2794226187051545, + "learning_rate": 4.079035301164955e-06, + "loss": 0.5753, + "step": 5438 + }, + { + "epoch": 0.57, + "grad_norm": 2.488542771422994, + "learning_rate": 4.0773602758530606e-06, + "loss": 0.6145, + "step": 5439 + }, + { + "epoch": 0.57, + "grad_norm": 2.149185123490166, + "learning_rate": 4.0756853577307835e-06, + "loss": 0.7298, + "step": 5440 + }, + { + "epoch": 0.57, + "grad_norm": 19.446457701330363, + "learning_rate": 4.0740105469927084e-06, + "loss": 0.6299, + "step": 5441 + }, + { + "epoch": 0.57, + "grad_norm": 2.921362940815699, + "learning_rate": 4.07233584383341e-06, + "loss": 0.5932, + "step": 5442 + }, + { + "epoch": 0.57, + "grad_norm": 2.436933837657758, + "learning_rate": 4.070661248447453e-06, + "loss": 0.6107, + "step": 5443 + }, + { + "epoch": 0.57, + "grad_norm": 2.4936425865614624, + "learning_rate": 4.0689867610293845e-06, + "loss": 0.6755, + "step": 5444 + }, + { + "epoch": 0.57, + "grad_norm": 2.3563232675472374, + "learning_rate": 4.067312381773744e-06, + "loss": 0.5669, + "step": 5445 + }, + { + "epoch": 0.57, + "grad_norm": 5.811216819647008, + "learning_rate": 4.065638110875055e-06, + "loss": 0.6642, + "step": 5446 + }, + { + "epoch": 0.57, + "grad_norm": 2.9029408699267543, + "learning_rate": 4.063963948527829e-06, + "loss": 0.6854, + "step": 5447 + }, + { + "epoch": 0.57, + "grad_norm": 2.3885007406079173, + "learning_rate": 4.062289894926564e-06, + "loss": 0.6045, + "step": 5448 + }, + { + "epoch": 0.57, + "grad_norm": 2.5249267659429764, + "learning_rate": 4.060615950265752e-06, + "loss": 0.7814, + "step": 5449 + }, + { + "epoch": 0.57, + "grad_norm": 2.5576000799315537, + "learning_rate": 4.058942114739861e-06, + "loss": 0.6392, + "step": 5450 + }, + { + "epoch": 0.57, + "grad_norm": 2.340626814388993, + "learning_rate": 4.057268388543354e-06, + "loss": 0.6515, + "step": 5451 + }, + { + "epoch": 0.57, + "grad_norm": 2.366975049960535, + "learning_rate": 4.055594771870682e-06, + "loss": 0.5461, + "step": 5452 + }, + { + "epoch": 0.57, + "grad_norm": 3.5250602391277406, + "learning_rate": 4.05392126491628e-06, + "loss": 0.6166, + "step": 5453 + }, + { + "epoch": 0.57, + "grad_norm": 2.7710266206375826, + "learning_rate": 4.052247867874569e-06, + "loss": 0.5996, + "step": 5454 + }, + { + "epoch": 0.57, + "grad_norm": 2.5532961974178727, + "learning_rate": 4.050574580939961e-06, + "loss": 0.6165, + "step": 5455 + }, + { + "epoch": 0.57, + "grad_norm": 4.2749550989913345, + "learning_rate": 4.0489014043068545e-06, + "loss": 0.6004, + "step": 5456 + }, + { + "epoch": 0.57, + "grad_norm": 2.131553043318907, + "learning_rate": 4.047228338169632e-06, + "loss": 0.6498, + "step": 5457 + }, + { + "epoch": 0.57, + "grad_norm": 2.574736501197967, + "learning_rate": 4.04555538272267e-06, + "loss": 0.5856, + "step": 5458 + }, + { + "epoch": 0.57, + "grad_norm": 2.1788021789713112, + "learning_rate": 4.0438825381603225e-06, + "loss": 0.6355, + "step": 5459 + }, + { + "epoch": 0.57, + "grad_norm": 3.4984416537675296, + "learning_rate": 4.042209804676937e-06, + "loss": 0.6518, + "step": 5460 + }, + { + "epoch": 0.57, + "grad_norm": 2.5902715960904383, + "learning_rate": 4.040537182466849e-06, + "loss": 0.538, + "step": 5461 + }, + { + "epoch": 0.57, + "grad_norm": 3.1931566795393427, + "learning_rate": 4.038864671724379e-06, + "loss": 0.6478, + "step": 5462 + }, + { + "epoch": 0.57, + "grad_norm": 2.817321693238749, + "learning_rate": 4.0371922726438314e-06, + "loss": 0.6962, + "step": 5463 + }, + { + "epoch": 0.57, + "grad_norm": 2.1447873012073355, + "learning_rate": 4.035519985419502e-06, + "loss": 0.5852, + "step": 5464 + }, + { + "epoch": 0.58, + "grad_norm": 2.884660635376069, + "learning_rate": 4.033847810245673e-06, + "loss": 0.6715, + "step": 5465 + }, + { + "epoch": 0.58, + "grad_norm": 3.4056554597314035, + "learning_rate": 4.0321757473166145e-06, + "loss": 0.6417, + "step": 5466 + }, + { + "epoch": 0.58, + "grad_norm": 2.4034378137851213, + "learning_rate": 4.030503796826578e-06, + "loss": 0.6369, + "step": 5467 + }, + { + "epoch": 0.58, + "grad_norm": 3.279509860886487, + "learning_rate": 4.028831958969807e-06, + "loss": 0.6686, + "step": 5468 + }, + { + "epoch": 0.58, + "grad_norm": 2.6768593400635177, + "learning_rate": 4.027160233940534e-06, + "loss": 0.6936, + "step": 5469 + }, + { + "epoch": 0.58, + "grad_norm": 2.696418739353531, + "learning_rate": 4.02548862193297e-06, + "loss": 0.5815, + "step": 5470 + }, + { + "epoch": 0.58, + "grad_norm": 2.8470170762266642, + "learning_rate": 4.023817123141324e-06, + "loss": 0.6268, + "step": 5471 + }, + { + "epoch": 0.58, + "grad_norm": 2.3205700797742703, + "learning_rate": 4.022145737759781e-06, + "loss": 0.7149, + "step": 5472 + }, + { + "epoch": 0.58, + "grad_norm": 2.4124617955340772, + "learning_rate": 4.020474465982519e-06, + "loss": 0.5885, + "step": 5473 + }, + { + "epoch": 0.58, + "grad_norm": 2.4297597298078606, + "learning_rate": 4.0188033080037025e-06, + "loss": 0.632, + "step": 5474 + }, + { + "epoch": 0.58, + "grad_norm": 2.7294892840221427, + "learning_rate": 4.017132264017483e-06, + "loss": 0.6413, + "step": 5475 + }, + { + "epoch": 0.58, + "grad_norm": 1.0863421441237322, + "learning_rate": 4.015461334217995e-06, + "loss": 0.5971, + "step": 5476 + }, + { + "epoch": 0.58, + "grad_norm": 2.5430832765858735, + "learning_rate": 4.013790518799361e-06, + "loss": 0.6309, + "step": 5477 + }, + { + "epoch": 0.58, + "grad_norm": 2.900637212310984, + "learning_rate": 4.012119817955696e-06, + "loss": 0.5844, + "step": 5478 + }, + { + "epoch": 0.58, + "grad_norm": 2.1161956263445076, + "learning_rate": 4.010449231881093e-06, + "loss": 0.6315, + "step": 5479 + }, + { + "epoch": 0.58, + "grad_norm": 2.31955950639972, + "learning_rate": 4.00877876076964e-06, + "loss": 0.64, + "step": 5480 + }, + { + "epoch": 0.58, + "grad_norm": 2.1236471054278256, + "learning_rate": 4.0071084048154044e-06, + "loss": 0.6076, + "step": 5481 + }, + { + "epoch": 0.58, + "grad_norm": 4.246468971263171, + "learning_rate": 4.005438164212444e-06, + "loss": 0.6235, + "step": 5482 + }, + { + "epoch": 0.58, + "grad_norm": 2.0097201604995623, + "learning_rate": 4.0037680391548015e-06, + "loss": 0.5754, + "step": 5483 + }, + { + "epoch": 0.58, + "grad_norm": 2.2344373050878206, + "learning_rate": 4.002098029836511e-06, + "loss": 0.6543, + "step": 5484 + }, + { + "epoch": 0.58, + "grad_norm": 2.9506649394699247, + "learning_rate": 4.000428136451585e-06, + "loss": 0.5946, + "step": 5485 + }, + { + "epoch": 0.58, + "grad_norm": 5.350295409726678, + "learning_rate": 3.998758359194028e-06, + "loss": 0.5822, + "step": 5486 + }, + { + "epoch": 0.58, + "grad_norm": 2.4139769523797048, + "learning_rate": 3.9970886982578314e-06, + "loss": 0.6689, + "step": 5487 + }, + { + "epoch": 0.58, + "grad_norm": 2.322280956259487, + "learning_rate": 3.995419153836972e-06, + "loss": 0.5373, + "step": 5488 + }, + { + "epoch": 0.58, + "grad_norm": 2.334612127720499, + "learning_rate": 3.9937497261254114e-06, + "loss": 0.741, + "step": 5489 + }, + { + "epoch": 0.58, + "grad_norm": 2.6565331932796057, + "learning_rate": 3.992080415317096e-06, + "loss": 0.6195, + "step": 5490 + }, + { + "epoch": 0.58, + "grad_norm": 2.485017788078939, + "learning_rate": 3.9904112216059656e-06, + "loss": 0.5669, + "step": 5491 + }, + { + "epoch": 0.58, + "grad_norm": 3.189719358830894, + "learning_rate": 3.988742145185941e-06, + "loss": 0.6253, + "step": 5492 + }, + { + "epoch": 0.58, + "grad_norm": 2.8776383714391742, + "learning_rate": 3.987073186250932e-06, + "loss": 0.6488, + "step": 5493 + }, + { + "epoch": 0.58, + "grad_norm": 3.0757770946653458, + "learning_rate": 3.985404344994831e-06, + "loss": 0.7339, + "step": 5494 + }, + { + "epoch": 0.58, + "grad_norm": 2.9151134884480556, + "learning_rate": 3.9837356216115205e-06, + "loss": 0.6524, + "step": 5495 + }, + { + "epoch": 0.58, + "grad_norm": 2.5361472839977455, + "learning_rate": 3.982067016294868e-06, + "loss": 0.6586, + "step": 5496 + }, + { + "epoch": 0.58, + "grad_norm": 2.1819398494243964, + "learning_rate": 3.980398529238728e-06, + "loss": 0.6144, + "step": 5497 + }, + { + "epoch": 0.58, + "grad_norm": 9.277473751575629, + "learning_rate": 3.978730160636938e-06, + "loss": 0.5899, + "step": 5498 + }, + { + "epoch": 0.58, + "grad_norm": 4.865134875134504, + "learning_rate": 3.977061910683325e-06, + "loss": 0.6446, + "step": 5499 + }, + { + "epoch": 0.58, + "grad_norm": 2.0322930621616004, + "learning_rate": 3.975393779571704e-06, + "loss": 0.6317, + "step": 5500 + }, + { + "epoch": 0.58, + "grad_norm": 2.853800658322435, + "learning_rate": 3.9737257674958714e-06, + "loss": 0.5481, + "step": 5501 + }, + { + "epoch": 0.58, + "grad_norm": 0.9879336943874001, + "learning_rate": 3.972057874649613e-06, + "loss": 0.5534, + "step": 5502 + }, + { + "epoch": 0.58, + "grad_norm": 2.222678097797317, + "learning_rate": 3.970390101226697e-06, + "loss": 0.5955, + "step": 5503 + }, + { + "epoch": 0.58, + "grad_norm": 2.269757412226817, + "learning_rate": 3.968722447420884e-06, + "loss": 0.6627, + "step": 5504 + }, + { + "epoch": 0.58, + "grad_norm": 4.009770633477226, + "learning_rate": 3.967054913425916e-06, + "loss": 0.6221, + "step": 5505 + }, + { + "epoch": 0.58, + "grad_norm": 2.8457569192736396, + "learning_rate": 3.965387499435524e-06, + "loss": 0.6292, + "step": 5506 + }, + { + "epoch": 0.58, + "grad_norm": 3.254292979448844, + "learning_rate": 3.963720205643419e-06, + "loss": 0.6363, + "step": 5507 + }, + { + "epoch": 0.58, + "grad_norm": 2.300083482731035, + "learning_rate": 3.962053032243305e-06, + "loss": 0.6737, + "step": 5508 + }, + { + "epoch": 0.58, + "grad_norm": 3.126068404526892, + "learning_rate": 3.96038597942887e-06, + "loss": 0.689, + "step": 5509 + }, + { + "epoch": 0.58, + "grad_norm": 2.441809460166265, + "learning_rate": 3.958719047393789e-06, + "loss": 0.6798, + "step": 5510 + }, + { + "epoch": 0.58, + "grad_norm": 2.8015689631584326, + "learning_rate": 3.9570522363317165e-06, + "loss": 0.6766, + "step": 5511 + }, + { + "epoch": 0.58, + "grad_norm": 1.8277968259678123, + "learning_rate": 3.955385546436299e-06, + "loss": 0.5986, + "step": 5512 + }, + { + "epoch": 0.58, + "grad_norm": 2.335643392259737, + "learning_rate": 3.9537189779011715e-06, + "loss": 0.5452, + "step": 5513 + }, + { + "epoch": 0.58, + "grad_norm": 2.3708189781396936, + "learning_rate": 3.952052530919948e-06, + "loss": 0.6386, + "step": 5514 + }, + { + "epoch": 0.58, + "grad_norm": 2.838879644566167, + "learning_rate": 3.9503862056862315e-06, + "loss": 0.7274, + "step": 5515 + }, + { + "epoch": 0.58, + "grad_norm": 2.4970962843979057, + "learning_rate": 3.948720002393613e-06, + "loss": 0.6018, + "step": 5516 + }, + { + "epoch": 0.58, + "grad_norm": 2.1528209154994675, + "learning_rate": 3.947053921235665e-06, + "loss": 0.6035, + "step": 5517 + }, + { + "epoch": 0.58, + "grad_norm": 3.223370620526591, + "learning_rate": 3.945387962405946e-06, + "loss": 0.6433, + "step": 5518 + }, + { + "epoch": 0.58, + "grad_norm": 3.583378068602465, + "learning_rate": 3.943722126098009e-06, + "loss": 0.5599, + "step": 5519 + }, + { + "epoch": 0.58, + "grad_norm": 2.934321798163941, + "learning_rate": 3.94205641250538e-06, + "loss": 0.7043, + "step": 5520 + }, + { + "epoch": 0.58, + "grad_norm": 2.7272967348881823, + "learning_rate": 3.940390821821579e-06, + "loss": 0.6763, + "step": 5521 + }, + { + "epoch": 0.58, + "grad_norm": 2.4185559104349137, + "learning_rate": 3.93872535424011e-06, + "loss": 0.6508, + "step": 5522 + }, + { + "epoch": 0.58, + "grad_norm": 2.4863515025722562, + "learning_rate": 3.937060009954462e-06, + "loss": 0.6582, + "step": 5523 + }, + { + "epoch": 0.58, + "grad_norm": 2.89940109019646, + "learning_rate": 3.935394789158108e-06, + "loss": 0.6821, + "step": 5524 + }, + { + "epoch": 0.58, + "grad_norm": 2.850326326013693, + "learning_rate": 3.93372969204451e-06, + "loss": 0.5794, + "step": 5525 + }, + { + "epoch": 0.58, + "grad_norm": 3.1372923087151254, + "learning_rate": 3.932064718807114e-06, + "loss": 0.6803, + "step": 5526 + }, + { + "epoch": 0.58, + "grad_norm": 2.0327932638867807, + "learning_rate": 3.930399869639353e-06, + "loss": 0.6513, + "step": 5527 + }, + { + "epoch": 0.58, + "grad_norm": 3.519770785227865, + "learning_rate": 3.9287351447346424e-06, + "loss": 0.6086, + "step": 5528 + }, + { + "epoch": 0.58, + "grad_norm": 1.0115479234131517, + "learning_rate": 3.927070544286385e-06, + "loss": 0.5529, + "step": 5529 + }, + { + "epoch": 0.58, + "grad_norm": 1.0358399237096076, + "learning_rate": 3.925406068487972e-06, + "loss": 0.5233, + "step": 5530 + }, + { + "epoch": 0.58, + "grad_norm": 2.7457465246262487, + "learning_rate": 3.923741717532774e-06, + "loss": 0.6148, + "step": 5531 + }, + { + "epoch": 0.58, + "grad_norm": 2.7191030649642522, + "learning_rate": 3.922077491614155e-06, + "loss": 0.653, + "step": 5532 + }, + { + "epoch": 0.58, + "grad_norm": 2.914553835743117, + "learning_rate": 3.920413390925454e-06, + "loss": 0.6825, + "step": 5533 + }, + { + "epoch": 0.58, + "grad_norm": 2.2209209916064294, + "learning_rate": 3.918749415660005e-06, + "loss": 0.6485, + "step": 5534 + }, + { + "epoch": 0.58, + "grad_norm": 2.6768274712210633, + "learning_rate": 3.917085566011124e-06, + "loss": 0.6383, + "step": 5535 + }, + { + "epoch": 0.58, + "grad_norm": 2.07522859990918, + "learning_rate": 3.915421842172113e-06, + "loss": 0.6638, + "step": 5536 + }, + { + "epoch": 0.58, + "grad_norm": 2.4434649901286885, + "learning_rate": 3.913758244336255e-06, + "loss": 0.6297, + "step": 5537 + }, + { + "epoch": 0.58, + "grad_norm": 3.82081715284726, + "learning_rate": 3.912094772696825e-06, + "loss": 0.6508, + "step": 5538 + }, + { + "epoch": 0.58, + "grad_norm": 2.2583424490630546, + "learning_rate": 3.910431427447079e-06, + "loss": 0.5839, + "step": 5539 + }, + { + "epoch": 0.58, + "grad_norm": 4.663703287673878, + "learning_rate": 3.908768208780259e-06, + "loss": 0.5589, + "step": 5540 + }, + { + "epoch": 0.58, + "grad_norm": 2.274839976109476, + "learning_rate": 3.907105116889597e-06, + "loss": 0.6395, + "step": 5541 + }, + { + "epoch": 0.58, + "grad_norm": 2.4342153540549925, + "learning_rate": 3.905442151968302e-06, + "loss": 0.6024, + "step": 5542 + }, + { + "epoch": 0.58, + "grad_norm": 3.4254973540309996, + "learning_rate": 3.903779314209573e-06, + "loss": 0.612, + "step": 5543 + }, + { + "epoch": 0.58, + "grad_norm": 2.9209988262515627, + "learning_rate": 3.902116603806594e-06, + "loss": 0.635, + "step": 5544 + }, + { + "epoch": 0.58, + "grad_norm": 2.5257899785531785, + "learning_rate": 3.900454020952537e-06, + "loss": 0.6775, + "step": 5545 + }, + { + "epoch": 0.58, + "grad_norm": 2.231536697694151, + "learning_rate": 3.898791565840552e-06, + "loss": 0.6288, + "step": 5546 + }, + { + "epoch": 0.58, + "grad_norm": 2.7797880961916452, + "learning_rate": 3.897129238663777e-06, + "loss": 0.5785, + "step": 5547 + }, + { + "epoch": 0.58, + "grad_norm": 2.1833271590239662, + "learning_rate": 3.895467039615342e-06, + "loss": 0.6688, + "step": 5548 + }, + { + "epoch": 0.58, + "grad_norm": 3.5023552603785757, + "learning_rate": 3.893804968888354e-06, + "loss": 0.5951, + "step": 5549 + }, + { + "epoch": 0.58, + "grad_norm": 2.1779919766332645, + "learning_rate": 3.892143026675905e-06, + "loss": 0.6629, + "step": 5550 + }, + { + "epoch": 0.58, + "grad_norm": 2.937739628950341, + "learning_rate": 3.8904812131710776e-06, + "loss": 0.6831, + "step": 5551 + }, + { + "epoch": 0.58, + "grad_norm": 2.7094582537792893, + "learning_rate": 3.888819528566935e-06, + "loss": 0.6519, + "step": 5552 + }, + { + "epoch": 0.58, + "grad_norm": 2.507846503527637, + "learning_rate": 3.8871579730565265e-06, + "loss": 0.6874, + "step": 5553 + }, + { + "epoch": 0.58, + "grad_norm": 2.3128367049716334, + "learning_rate": 3.885496546832891e-06, + "loss": 0.6468, + "step": 5554 + }, + { + "epoch": 0.58, + "grad_norm": 2.18788284410742, + "learning_rate": 3.883835250089043e-06, + "loss": 0.608, + "step": 5555 + }, + { + "epoch": 0.58, + "grad_norm": 3.0367412519509323, + "learning_rate": 3.8821740830179876e-06, + "loss": 0.6377, + "step": 5556 + }, + { + "epoch": 0.58, + "grad_norm": 2.587808272946125, + "learning_rate": 3.880513045812718e-06, + "loss": 0.6709, + "step": 5557 + }, + { + "epoch": 0.58, + "grad_norm": 2.8752893486354014, + "learning_rate": 3.8788521386662076e-06, + "loss": 0.6109, + "step": 5558 + }, + { + "epoch": 0.58, + "grad_norm": 5.853722055103752, + "learning_rate": 3.8771913617714135e-06, + "loss": 0.5964, + "step": 5559 + }, + { + "epoch": 0.59, + "grad_norm": 1.0139273382638472, + "learning_rate": 3.87553071532128e-06, + "loss": 0.57, + "step": 5560 + }, + { + "epoch": 0.59, + "grad_norm": 9.753533947661753, + "learning_rate": 3.873870199508739e-06, + "loss": 0.6884, + "step": 5561 + }, + { + "epoch": 0.59, + "grad_norm": 2.59431286674271, + "learning_rate": 3.872209814526703e-06, + "loss": 0.6489, + "step": 5562 + }, + { + "epoch": 0.59, + "grad_norm": 2.118810061041824, + "learning_rate": 3.87054956056807e-06, + "loss": 0.6611, + "step": 5563 + }, + { + "epoch": 0.59, + "grad_norm": 3.2188855308440183, + "learning_rate": 3.868889437825724e-06, + "loss": 0.6555, + "step": 5564 + }, + { + "epoch": 0.59, + "grad_norm": 2.4614632914433274, + "learning_rate": 3.867229446492533e-06, + "loss": 0.6215, + "step": 5565 + }, + { + "epoch": 0.59, + "grad_norm": 2.6768898443750375, + "learning_rate": 3.865569586761352e-06, + "loss": 0.6264, + "step": 5566 + }, + { + "epoch": 0.59, + "grad_norm": 2.4319918129394216, + "learning_rate": 3.863909858825016e-06, + "loss": 0.6583, + "step": 5567 + }, + { + "epoch": 0.59, + "grad_norm": 3.70131547101183, + "learning_rate": 3.86225026287635e-06, + "loss": 0.5622, + "step": 5568 + }, + { + "epoch": 0.59, + "grad_norm": 2.5863671241991466, + "learning_rate": 3.860590799108157e-06, + "loss": 0.6492, + "step": 5569 + }, + { + "epoch": 0.59, + "grad_norm": 2.603038839665001, + "learning_rate": 3.858931467713233e-06, + "loss": 0.7282, + "step": 5570 + }, + { + "epoch": 0.59, + "grad_norm": 3.2162140609274386, + "learning_rate": 3.857272268884353e-06, + "loss": 0.5941, + "step": 5571 + }, + { + "epoch": 0.59, + "grad_norm": 2.2617877144746505, + "learning_rate": 3.855613202814277e-06, + "loss": 0.6509, + "step": 5572 + }, + { + "epoch": 0.59, + "grad_norm": 2.560892583789322, + "learning_rate": 3.853954269695749e-06, + "loss": 0.6604, + "step": 5573 + }, + { + "epoch": 0.59, + "grad_norm": 2.487420247946007, + "learning_rate": 3.8522954697215034e-06, + "loss": 0.6302, + "step": 5574 + }, + { + "epoch": 0.59, + "grad_norm": 2.821258946431581, + "learning_rate": 3.8506368030842525e-06, + "loss": 0.6311, + "step": 5575 + }, + { + "epoch": 0.59, + "grad_norm": 2.3313444270659462, + "learning_rate": 3.848978269976694e-06, + "loss": 0.5945, + "step": 5576 + }, + { + "epoch": 0.59, + "grad_norm": 2.469073388544526, + "learning_rate": 3.8473198705915135e-06, + "loss": 0.7422, + "step": 5577 + }, + { + "epoch": 0.59, + "grad_norm": 5.193626538455637, + "learning_rate": 3.845661605121377e-06, + "loss": 0.6453, + "step": 5578 + }, + { + "epoch": 0.59, + "grad_norm": 2.5193329581868897, + "learning_rate": 3.844003473758941e-06, + "loss": 0.6308, + "step": 5579 + }, + { + "epoch": 0.59, + "grad_norm": 2.157883246431803, + "learning_rate": 3.8423454766968394e-06, + "loss": 0.6685, + "step": 5580 + }, + { + "epoch": 0.59, + "grad_norm": 2.718500163349186, + "learning_rate": 3.8406876141276924e-06, + "loss": 0.5817, + "step": 5581 + }, + { + "epoch": 0.59, + "grad_norm": 2.3243517458977667, + "learning_rate": 3.8390298862441075e-06, + "loss": 0.7218, + "step": 5582 + }, + { + "epoch": 0.59, + "grad_norm": 2.612911018329449, + "learning_rate": 3.8373722932386745e-06, + "loss": 0.6655, + "step": 5583 + }, + { + "epoch": 0.59, + "grad_norm": 3.2278773536072167, + "learning_rate": 3.835714835303969e-06, + "loss": 0.6834, + "step": 5584 + }, + { + "epoch": 0.59, + "grad_norm": 2.6365356341700603, + "learning_rate": 3.834057512632546e-06, + "loss": 0.6425, + "step": 5585 + }, + { + "epoch": 0.59, + "grad_norm": 2.4664103802063475, + "learning_rate": 3.832400325416952e-06, + "loss": 0.5572, + "step": 5586 + }, + { + "epoch": 0.59, + "grad_norm": 2.0773321625767425, + "learning_rate": 3.830743273849713e-06, + "loss": 0.6554, + "step": 5587 + }, + { + "epoch": 0.59, + "grad_norm": 4.630335903459665, + "learning_rate": 3.829086358123339e-06, + "loss": 0.5597, + "step": 5588 + }, + { + "epoch": 0.59, + "grad_norm": 2.038054933842016, + "learning_rate": 3.82742957843033e-06, + "loss": 0.5566, + "step": 5589 + }, + { + "epoch": 0.59, + "grad_norm": 7.81111225996943, + "learning_rate": 3.825772934963161e-06, + "loss": 0.7147, + "step": 5590 + }, + { + "epoch": 0.59, + "grad_norm": 2.130623296542242, + "learning_rate": 3.824116427914298e-06, + "loss": 0.5917, + "step": 5591 + }, + { + "epoch": 0.59, + "grad_norm": 2.22553668110589, + "learning_rate": 3.82246005747619e-06, + "loss": 0.5317, + "step": 5592 + }, + { + "epoch": 0.59, + "grad_norm": 2.470012398893969, + "learning_rate": 3.82080382384127e-06, + "loss": 0.5783, + "step": 5593 + }, + { + "epoch": 0.59, + "grad_norm": 2.112233161530673, + "learning_rate": 3.819147727201951e-06, + "loss": 0.625, + "step": 5594 + }, + { + "epoch": 0.59, + "grad_norm": 3.4003860684934244, + "learning_rate": 3.817491767750635e-06, + "loss": 0.656, + "step": 5595 + }, + { + "epoch": 0.59, + "grad_norm": 2.6648181797893433, + "learning_rate": 3.815835945679709e-06, + "loss": 0.6308, + "step": 5596 + }, + { + "epoch": 0.59, + "grad_norm": 4.488479873290116, + "learning_rate": 3.81418026118154e-06, + "loss": 0.5743, + "step": 5597 + }, + { + "epoch": 0.59, + "grad_norm": 2.520362514666106, + "learning_rate": 3.8125247144484777e-06, + "loss": 0.5858, + "step": 5598 + }, + { + "epoch": 0.59, + "grad_norm": 2.938146052846746, + "learning_rate": 3.8108693056728636e-06, + "loss": 0.6713, + "step": 5599 + }, + { + "epoch": 0.59, + "grad_norm": 3.143207176889715, + "learning_rate": 3.809214035047016e-06, + "loss": 0.6833, + "step": 5600 + }, + { + "epoch": 0.59, + "grad_norm": 2.205971327946951, + "learning_rate": 3.8075589027632376e-06, + "loss": 0.6633, + "step": 5601 + }, + { + "epoch": 0.59, + "grad_norm": 2.2680778570668756, + "learning_rate": 3.805903909013822e-06, + "loss": 0.5748, + "step": 5602 + }, + { + "epoch": 0.59, + "grad_norm": 2.501171349333083, + "learning_rate": 3.804249053991037e-06, + "loss": 0.687, + "step": 5603 + }, + { + "epoch": 0.59, + "grad_norm": 4.102343060157223, + "learning_rate": 3.8025943378871394e-06, + "loss": 0.6342, + "step": 5604 + }, + { + "epoch": 0.59, + "grad_norm": 2.894149017249174, + "learning_rate": 3.800939760894371e-06, + "loss": 0.5931, + "step": 5605 + }, + { + "epoch": 0.59, + "grad_norm": 2.1805773899796685, + "learning_rate": 3.7992853232049566e-06, + "loss": 0.5769, + "step": 5606 + }, + { + "epoch": 0.59, + "grad_norm": 2.4362136260737426, + "learning_rate": 3.7976310250111013e-06, + "loss": 0.6543, + "step": 5607 + }, + { + "epoch": 0.59, + "grad_norm": 2.9225984516059653, + "learning_rate": 3.7959768665049967e-06, + "loss": 0.6982, + "step": 5608 + }, + { + "epoch": 0.59, + "grad_norm": 2.86310538278994, + "learning_rate": 3.7943228478788198e-06, + "loss": 0.5438, + "step": 5609 + }, + { + "epoch": 0.59, + "grad_norm": 2.2140763934345613, + "learning_rate": 3.792668969324731e-06, + "loss": 0.5698, + "step": 5610 + }, + { + "epoch": 0.59, + "grad_norm": 2.5016813794002126, + "learning_rate": 3.7910152310348686e-06, + "loss": 0.7066, + "step": 5611 + }, + { + "epoch": 0.59, + "grad_norm": 2.1455037865154742, + "learning_rate": 3.789361633201363e-06, + "loss": 0.6227, + "step": 5612 + }, + { + "epoch": 0.59, + "grad_norm": 2.0200967442200817, + "learning_rate": 3.7877081760163225e-06, + "loss": 0.6153, + "step": 5613 + }, + { + "epoch": 0.59, + "grad_norm": 2.386311762553055, + "learning_rate": 3.7860548596718427e-06, + "loss": 0.6632, + "step": 5614 + }, + { + "epoch": 0.59, + "grad_norm": 2.264330176961888, + "learning_rate": 3.784401684360001e-06, + "loss": 0.6568, + "step": 5615 + }, + { + "epoch": 0.59, + "grad_norm": 2.761814617242924, + "learning_rate": 3.782748650272857e-06, + "loss": 0.6565, + "step": 5616 + }, + { + "epoch": 0.59, + "grad_norm": 2.076122600652747, + "learning_rate": 3.781095757602455e-06, + "loss": 0.6303, + "step": 5617 + }, + { + "epoch": 0.59, + "grad_norm": 2.3150873597883947, + "learning_rate": 3.779443006540825e-06, + "loss": 0.6462, + "step": 5618 + }, + { + "epoch": 0.59, + "grad_norm": 2.940963699254998, + "learning_rate": 3.7777903972799794e-06, + "loss": 0.564, + "step": 5619 + }, + { + "epoch": 0.59, + "grad_norm": 2.4027103997644392, + "learning_rate": 3.7761379300119104e-06, + "loss": 0.61, + "step": 5620 + }, + { + "epoch": 0.59, + "grad_norm": 2.2952639954909526, + "learning_rate": 3.7744856049286e-06, + "loss": 0.5822, + "step": 5621 + }, + { + "epoch": 0.59, + "grad_norm": 3.0792638125829637, + "learning_rate": 3.77283342222201e-06, + "loss": 0.6502, + "step": 5622 + }, + { + "epoch": 0.59, + "grad_norm": 0.9784622364316484, + "learning_rate": 3.7711813820840854e-06, + "loss": 0.61, + "step": 5623 + }, + { + "epoch": 0.59, + "grad_norm": 3.0398501089056658, + "learning_rate": 3.7695294847067544e-06, + "loss": 0.641, + "step": 5624 + }, + { + "epoch": 0.59, + "grad_norm": 3.801664578402942, + "learning_rate": 3.7678777302819314e-06, + "loss": 0.5783, + "step": 5625 + }, + { + "epoch": 0.59, + "grad_norm": 2.4911044008283074, + "learning_rate": 3.7662261190015116e-06, + "loss": 0.7241, + "step": 5626 + }, + { + "epoch": 0.59, + "grad_norm": 2.3853448216389594, + "learning_rate": 3.7645746510573754e-06, + "loss": 0.6626, + "step": 5627 + }, + { + "epoch": 0.59, + "grad_norm": 2.131535326971549, + "learning_rate": 3.7629233266413866e-06, + "loss": 0.64, + "step": 5628 + }, + { + "epoch": 0.59, + "grad_norm": 1.9389020030909176, + "learning_rate": 3.7612721459453883e-06, + "loss": 0.5731, + "step": 5629 + }, + { + "epoch": 0.59, + "grad_norm": 2.7087185491547947, + "learning_rate": 3.75962110916121e-06, + "loss": 0.582, + "step": 5630 + }, + { + "epoch": 0.59, + "grad_norm": 2.475217811511566, + "learning_rate": 3.757970216480667e-06, + "loss": 0.6372, + "step": 5631 + }, + { + "epoch": 0.59, + "grad_norm": 2.4037718481514765, + "learning_rate": 3.756319468095555e-06, + "loss": 0.6331, + "step": 5632 + }, + { + "epoch": 0.59, + "grad_norm": 3.020609700875131, + "learning_rate": 3.7546688641976496e-06, + "loss": 0.6931, + "step": 5633 + }, + { + "epoch": 0.59, + "grad_norm": 4.592025998510074, + "learning_rate": 3.753018404978717e-06, + "loss": 0.5209, + "step": 5634 + }, + { + "epoch": 0.59, + "grad_norm": 2.501978181917583, + "learning_rate": 3.7513680906305015e-06, + "loss": 0.5572, + "step": 5635 + }, + { + "epoch": 0.59, + "grad_norm": 2.965457135634429, + "learning_rate": 3.7497179213447305e-06, + "loss": 0.6254, + "step": 5636 + }, + { + "epoch": 0.59, + "grad_norm": 2.294791076548485, + "learning_rate": 3.7480678973131198e-06, + "loss": 0.6553, + "step": 5637 + }, + { + "epoch": 0.59, + "grad_norm": 2.4361429134308956, + "learning_rate": 3.74641801872736e-06, + "loss": 0.645, + "step": 5638 + }, + { + "epoch": 0.59, + "grad_norm": 2.229172173470131, + "learning_rate": 3.7447682857791307e-06, + "loss": 0.6655, + "step": 5639 + }, + { + "epoch": 0.59, + "grad_norm": 2.5927694332695186, + "learning_rate": 3.743118698660094e-06, + "loss": 0.5953, + "step": 5640 + }, + { + "epoch": 0.59, + "grad_norm": 2.419854503906456, + "learning_rate": 3.741469257561895e-06, + "loss": 0.6415, + "step": 5641 + }, + { + "epoch": 0.59, + "grad_norm": 2.454076430219235, + "learning_rate": 3.739819962676159e-06, + "loss": 0.677, + "step": 5642 + }, + { + "epoch": 0.59, + "grad_norm": 2.333031240171135, + "learning_rate": 3.738170814194495e-06, + "loss": 0.5559, + "step": 5643 + }, + { + "epoch": 0.59, + "grad_norm": 2.427282486291471, + "learning_rate": 3.7365218123084996e-06, + "loss": 0.6336, + "step": 5644 + }, + { + "epoch": 0.59, + "grad_norm": 2.692927524150578, + "learning_rate": 3.7348729572097487e-06, + "loss": 0.5909, + "step": 5645 + }, + { + "epoch": 0.59, + "grad_norm": 2.406995986734407, + "learning_rate": 3.7332242490897985e-06, + "loss": 0.6221, + "step": 5646 + }, + { + "epoch": 0.59, + "grad_norm": 2.735312603120759, + "learning_rate": 3.7315756881401944e-06, + "loss": 0.6724, + "step": 5647 + }, + { + "epoch": 0.59, + "grad_norm": 3.4788360617314944, + "learning_rate": 3.7299272745524583e-06, + "loss": 0.634, + "step": 5648 + }, + { + "epoch": 0.59, + "grad_norm": 2.280769795287397, + "learning_rate": 3.728279008518102e-06, + "loss": 0.5864, + "step": 5649 + }, + { + "epoch": 0.59, + "grad_norm": 2.7064070384851253, + "learning_rate": 3.726630890228615e-06, + "loss": 0.6402, + "step": 5650 + }, + { + "epoch": 0.59, + "grad_norm": 3.801131241287733, + "learning_rate": 3.7249829198754694e-06, + "loss": 0.6281, + "step": 5651 + }, + { + "epoch": 0.59, + "grad_norm": 2.172639276357326, + "learning_rate": 3.7233350976501217e-06, + "loss": 0.6297, + "step": 5652 + }, + { + "epoch": 0.59, + "grad_norm": 1.9375269933216765, + "learning_rate": 3.7216874237440127e-06, + "loss": 0.5696, + "step": 5653 + }, + { + "epoch": 0.59, + "grad_norm": 2.3229567583419217, + "learning_rate": 3.7200398983485643e-06, + "loss": 0.6104, + "step": 5654 + }, + { + "epoch": 0.6, + "grad_norm": 2.523904866991665, + "learning_rate": 3.7183925216551784e-06, + "loss": 0.6879, + "step": 5655 + }, + { + "epoch": 0.6, + "grad_norm": 2.1913564898938622, + "learning_rate": 3.716745293855246e-06, + "loss": 0.5874, + "step": 5656 + }, + { + "epoch": 0.6, + "grad_norm": 2.2827810373182724, + "learning_rate": 3.715098215140136e-06, + "loss": 0.6078, + "step": 5657 + }, + { + "epoch": 0.6, + "grad_norm": 5.128545296474594, + "learning_rate": 3.7134512857012017e-06, + "loss": 0.6683, + "step": 5658 + }, + { + "epoch": 0.6, + "grad_norm": 2.9857064043435133, + "learning_rate": 3.711804505729776e-06, + "loss": 0.5813, + "step": 5659 + }, + { + "epoch": 0.6, + "grad_norm": 2.632396638894393, + "learning_rate": 3.7101578754171797e-06, + "loss": 0.6517, + "step": 5660 + }, + { + "epoch": 0.6, + "grad_norm": 2.2170898683792815, + "learning_rate": 3.7085113949547126e-06, + "loss": 0.6747, + "step": 5661 + }, + { + "epoch": 0.6, + "grad_norm": 2.1974382543005357, + "learning_rate": 3.706865064533659e-06, + "loss": 0.6875, + "step": 5662 + }, + { + "epoch": 0.6, + "grad_norm": 3.3819380364992115, + "learning_rate": 3.7052188843452854e-06, + "loss": 0.622, + "step": 5663 + }, + { + "epoch": 0.6, + "grad_norm": 2.682752126582236, + "learning_rate": 3.7035728545808367e-06, + "loss": 0.6522, + "step": 5664 + }, + { + "epoch": 0.6, + "grad_norm": 2.042183074281712, + "learning_rate": 3.701926975431547e-06, + "loss": 0.6193, + "step": 5665 + }, + { + "epoch": 0.6, + "grad_norm": 1.0167480891393266, + "learning_rate": 3.700281247088629e-06, + "loss": 0.5478, + "step": 5666 + }, + { + "epoch": 0.6, + "grad_norm": 3.190095939887631, + "learning_rate": 3.6986356697432796e-06, + "loss": 0.6033, + "step": 5667 + }, + { + "epoch": 0.6, + "grad_norm": 2.3881960936207682, + "learning_rate": 3.6969902435866743e-06, + "loss": 0.5799, + "step": 5668 + }, + { + "epoch": 0.6, + "grad_norm": 2.4289590644832786, + "learning_rate": 3.6953449688099774e-06, + "loss": 0.7183, + "step": 5669 + }, + { + "epoch": 0.6, + "grad_norm": 2.15648426463186, + "learning_rate": 3.69369984560433e-06, + "loss": 0.6219, + "step": 5670 + }, + { + "epoch": 0.6, + "grad_norm": 2.679139642427304, + "learning_rate": 3.69205487416086e-06, + "loss": 0.6284, + "step": 5671 + }, + { + "epoch": 0.6, + "grad_norm": 3.199487911029984, + "learning_rate": 3.690410054670671e-06, + "loss": 0.6225, + "step": 5672 + }, + { + "epoch": 0.6, + "grad_norm": 2.927509209876508, + "learning_rate": 3.6887653873248575e-06, + "loss": 0.6316, + "step": 5673 + }, + { + "epoch": 0.6, + "grad_norm": 2.9002460059852595, + "learning_rate": 3.6871208723144903e-06, + "loss": 0.6826, + "step": 5674 + }, + { + "epoch": 0.6, + "grad_norm": 2.2930525319126507, + "learning_rate": 3.6854765098306254e-06, + "loss": 0.5643, + "step": 5675 + }, + { + "epoch": 0.6, + "grad_norm": 2.809686719399776, + "learning_rate": 3.6838323000643013e-06, + "loss": 0.6805, + "step": 5676 + }, + { + "epoch": 0.6, + "grad_norm": 2.6784212261701166, + "learning_rate": 3.682188243206535e-06, + "loss": 0.6072, + "step": 5677 + }, + { + "epoch": 0.6, + "grad_norm": 1.1335859555703096, + "learning_rate": 3.6805443394483275e-06, + "loss": 0.5365, + "step": 5678 + }, + { + "epoch": 0.6, + "grad_norm": 2.477277761969307, + "learning_rate": 3.6789005889806656e-06, + "loss": 0.7198, + "step": 5679 + }, + { + "epoch": 0.6, + "grad_norm": 2.1807624837158626, + "learning_rate": 3.6772569919945157e-06, + "loss": 0.6629, + "step": 5680 + }, + { + "epoch": 0.6, + "grad_norm": 2.278161138807236, + "learning_rate": 3.6756135486808227e-06, + "loss": 0.5702, + "step": 5681 + }, + { + "epoch": 0.6, + "grad_norm": 2.433937775862514, + "learning_rate": 3.6739702592305205e-06, + "loss": 0.6464, + "step": 5682 + }, + { + "epoch": 0.6, + "grad_norm": 3.0575854854067104, + "learning_rate": 3.6723271238345187e-06, + "loss": 0.674, + "step": 5683 + }, + { + "epoch": 0.6, + "grad_norm": 3.607667973961037, + "learning_rate": 3.6706841426837145e-06, + "loss": 0.6982, + "step": 5684 + }, + { + "epoch": 0.6, + "grad_norm": 2.3855563143332543, + "learning_rate": 3.669041315968986e-06, + "loss": 0.6425, + "step": 5685 + }, + { + "epoch": 0.6, + "grad_norm": 2.551518686262234, + "learning_rate": 3.667398643881189e-06, + "loss": 0.6379, + "step": 5686 + }, + { + "epoch": 0.6, + "grad_norm": 2.2344906686707855, + "learning_rate": 3.665756126611164e-06, + "loss": 0.6542, + "step": 5687 + }, + { + "epoch": 0.6, + "grad_norm": 2.892933147579963, + "learning_rate": 3.664113764349736e-06, + "loss": 0.6753, + "step": 5688 + }, + { + "epoch": 0.6, + "grad_norm": 2.899716923671387, + "learning_rate": 3.6624715572877106e-06, + "loss": 0.5615, + "step": 5689 + }, + { + "epoch": 0.6, + "grad_norm": 2.52564711595407, + "learning_rate": 3.6608295056158717e-06, + "loss": 0.6426, + "step": 5690 + }, + { + "epoch": 0.6, + "grad_norm": 3.2313531578252412, + "learning_rate": 3.65918760952499e-06, + "loss": 0.5662, + "step": 5691 + }, + { + "epoch": 0.6, + "grad_norm": 2.5379514454376904, + "learning_rate": 3.657545869205816e-06, + "loss": 0.5762, + "step": 5692 + }, + { + "epoch": 0.6, + "grad_norm": 2.9538894091444807, + "learning_rate": 3.6559042848490835e-06, + "loss": 0.5906, + "step": 5693 + }, + { + "epoch": 0.6, + "grad_norm": 2.6382651155593604, + "learning_rate": 3.654262856645503e-06, + "loss": 0.6264, + "step": 5694 + }, + { + "epoch": 0.6, + "grad_norm": 3.482444707821686, + "learning_rate": 3.652621584785776e-06, + "loss": 0.6596, + "step": 5695 + }, + { + "epoch": 0.6, + "grad_norm": 2.2566669898698146, + "learning_rate": 3.6509804694605768e-06, + "loss": 0.7118, + "step": 5696 + }, + { + "epoch": 0.6, + "grad_norm": 3.5669855228140213, + "learning_rate": 3.649339510860568e-06, + "loss": 0.5655, + "step": 5697 + }, + { + "epoch": 0.6, + "grad_norm": 2.3661369567618866, + "learning_rate": 3.647698709176391e-06, + "loss": 0.6338, + "step": 5698 + }, + { + "epoch": 0.6, + "grad_norm": 2.3484815033901807, + "learning_rate": 3.6460580645986685e-06, + "loss": 0.6255, + "step": 5699 + }, + { + "epoch": 0.6, + "grad_norm": 2.53755595996656, + "learning_rate": 3.6444175773180045e-06, + "loss": 0.5636, + "step": 5700 + }, + { + "epoch": 0.6, + "grad_norm": 2.5222433345809763, + "learning_rate": 3.6427772475249896e-06, + "loss": 0.6518, + "step": 5701 + }, + { + "epoch": 0.6, + "grad_norm": 2.586007819868345, + "learning_rate": 3.6411370754101915e-06, + "loss": 0.6029, + "step": 5702 + }, + { + "epoch": 0.6, + "grad_norm": 5.428940429081987, + "learning_rate": 3.639497061164158e-06, + "loss": 0.5528, + "step": 5703 + }, + { + "epoch": 0.6, + "grad_norm": 2.60801073186904, + "learning_rate": 3.637857204977424e-06, + "loss": 0.5935, + "step": 5704 + }, + { + "epoch": 0.6, + "grad_norm": 3.8916382563951037, + "learning_rate": 3.636217507040502e-06, + "loss": 0.6435, + "step": 5705 + }, + { + "epoch": 0.6, + "grad_norm": 0.9863942243849351, + "learning_rate": 3.6345779675438897e-06, + "loss": 0.5717, + "step": 5706 + }, + { + "epoch": 0.6, + "grad_norm": 7.47039659299787, + "learning_rate": 3.6329385866780587e-06, + "loss": 0.5755, + "step": 5707 + }, + { + "epoch": 0.6, + "grad_norm": 2.7898417007114333, + "learning_rate": 3.6312993646334727e-06, + "loss": 0.5874, + "step": 5708 + }, + { + "epoch": 0.6, + "grad_norm": 2.273917598473412, + "learning_rate": 3.6296603016005693e-06, + "loss": 0.6866, + "step": 5709 + }, + { + "epoch": 0.6, + "grad_norm": 2.6107303675720432, + "learning_rate": 3.6280213977697715e-06, + "loss": 0.6771, + "step": 5710 + }, + { + "epoch": 0.6, + "grad_norm": 2.2956736561042357, + "learning_rate": 3.6263826533314827e-06, + "loss": 0.6167, + "step": 5711 + }, + { + "epoch": 0.6, + "grad_norm": 2.384801633397614, + "learning_rate": 3.624744068476086e-06, + "loss": 0.6664, + "step": 5712 + }, + { + "epoch": 0.6, + "grad_norm": 2.0974504782665337, + "learning_rate": 3.623105643393946e-06, + "loss": 0.6318, + "step": 5713 + }, + { + "epoch": 0.6, + "grad_norm": 3.3889022639258015, + "learning_rate": 3.621467378275414e-06, + "loss": 0.6735, + "step": 5714 + }, + { + "epoch": 0.6, + "grad_norm": 4.18791054241475, + "learning_rate": 3.6198292733108177e-06, + "loss": 0.6781, + "step": 5715 + }, + { + "epoch": 0.6, + "grad_norm": 2.7589589062878197, + "learning_rate": 3.6181913286904647e-06, + "loss": 0.5676, + "step": 5716 + }, + { + "epoch": 0.6, + "grad_norm": 1.9737675399727652, + "learning_rate": 3.6165535446046497e-06, + "loss": 0.6197, + "step": 5717 + }, + { + "epoch": 0.6, + "grad_norm": 2.520077841376911, + "learning_rate": 3.6149159212436435e-06, + "loss": 0.5949, + "step": 5718 + }, + { + "epoch": 0.6, + "grad_norm": 2.2678271084992514, + "learning_rate": 3.6132784587977053e-06, + "loss": 0.5782, + "step": 5719 + }, + { + "epoch": 0.6, + "grad_norm": 2.736740775923461, + "learning_rate": 3.611641157457064e-06, + "loss": 0.5897, + "step": 5720 + }, + { + "epoch": 0.6, + "grad_norm": 2.503689546593751, + "learning_rate": 3.6100040174119403e-06, + "loss": 0.6358, + "step": 5721 + }, + { + "epoch": 0.6, + "grad_norm": 0.9542516668761488, + "learning_rate": 3.6083670388525316e-06, + "loss": 0.6144, + "step": 5722 + }, + { + "epoch": 0.6, + "grad_norm": 3.151965071313513, + "learning_rate": 3.6067302219690175e-06, + "loss": 0.6571, + "step": 5723 + }, + { + "epoch": 0.6, + "grad_norm": 2.083294772175323, + "learning_rate": 3.6050935669515604e-06, + "loss": 0.6405, + "step": 5724 + }, + { + "epoch": 0.6, + "grad_norm": 2.308319919656163, + "learning_rate": 3.603457073990298e-06, + "loss": 0.5927, + "step": 5725 + }, + { + "epoch": 0.6, + "grad_norm": 2.5585087791382075, + "learning_rate": 3.6018207432753572e-06, + "loss": 0.5503, + "step": 5726 + }, + { + "epoch": 0.6, + "grad_norm": 14.380449615576731, + "learning_rate": 3.60018457499684e-06, + "loss": 0.6287, + "step": 5727 + }, + { + "epoch": 0.6, + "grad_norm": 3.450787029376905, + "learning_rate": 3.598548569344834e-06, + "loss": 0.6453, + "step": 5728 + }, + { + "epoch": 0.6, + "grad_norm": 3.1421747068388366, + "learning_rate": 3.596912726509402e-06, + "loss": 0.7465, + "step": 5729 + }, + { + "epoch": 0.6, + "grad_norm": 2.304462145687152, + "learning_rate": 3.595277046680594e-06, + "loss": 0.6277, + "step": 5730 + }, + { + "epoch": 0.6, + "grad_norm": 5.179794005325396, + "learning_rate": 3.5936415300484383e-06, + "loss": 0.6019, + "step": 5731 + }, + { + "epoch": 0.6, + "grad_norm": 1.0233482875110236, + "learning_rate": 3.592006176802944e-06, + "loss": 0.5521, + "step": 5732 + }, + { + "epoch": 0.6, + "grad_norm": 2.2805038648763887, + "learning_rate": 3.5903709871341034e-06, + "loss": 0.6422, + "step": 5733 + }, + { + "epoch": 0.6, + "grad_norm": 2.7182569922936426, + "learning_rate": 3.5887359612318862e-06, + "loss": 0.5947, + "step": 5734 + }, + { + "epoch": 0.6, + "grad_norm": 2.992203373180557, + "learning_rate": 3.5871010992862436e-06, + "loss": 0.6566, + "step": 5735 + }, + { + "epoch": 0.6, + "grad_norm": 3.1660591900577493, + "learning_rate": 3.5854664014871128e-06, + "loss": 0.6317, + "step": 5736 + }, + { + "epoch": 0.6, + "grad_norm": 2.645200532572237, + "learning_rate": 3.5838318680244067e-06, + "loss": 0.6629, + "step": 5737 + }, + { + "epoch": 0.6, + "grad_norm": 2.2006735106420225, + "learning_rate": 3.582197499088019e-06, + "loss": 0.7078, + "step": 5738 + }, + { + "epoch": 0.6, + "grad_norm": 2.7870416290233466, + "learning_rate": 3.580563294867828e-06, + "loss": 0.6616, + "step": 5739 + }, + { + "epoch": 0.6, + "grad_norm": 2.2284086143558057, + "learning_rate": 3.5789292555536907e-06, + "loss": 0.644, + "step": 5740 + }, + { + "epoch": 0.6, + "grad_norm": 2.0680596768407415, + "learning_rate": 3.5772953813354455e-06, + "loss": 0.5917, + "step": 5741 + }, + { + "epoch": 0.6, + "grad_norm": 2.6380941942532816, + "learning_rate": 3.575661672402908e-06, + "loss": 0.5598, + "step": 5742 + }, + { + "epoch": 0.6, + "grad_norm": 2.1088238905647647, + "learning_rate": 3.5740281289458812e-06, + "loss": 0.6548, + "step": 5743 + }, + { + "epoch": 0.6, + "grad_norm": 2.364337376015603, + "learning_rate": 3.5723947511541435e-06, + "loss": 0.6301, + "step": 5744 + }, + { + "epoch": 0.6, + "grad_norm": 2.155417858599172, + "learning_rate": 3.5707615392174576e-06, + "loss": 0.6011, + "step": 5745 + }, + { + "epoch": 0.6, + "grad_norm": 2.789662978157353, + "learning_rate": 3.5691284933255653e-06, + "loss": 0.6223, + "step": 5746 + }, + { + "epoch": 0.6, + "grad_norm": 2.436792685729838, + "learning_rate": 3.567495613668188e-06, + "loss": 0.6495, + "step": 5747 + }, + { + "epoch": 0.6, + "grad_norm": 2.2959050511929493, + "learning_rate": 3.565862900435028e-06, + "loss": 0.6366, + "step": 5748 + }, + { + "epoch": 0.6, + "grad_norm": 3.0678303289148188, + "learning_rate": 3.564230353815772e-06, + "loss": 0.5961, + "step": 5749 + }, + { + "epoch": 0.61, + "grad_norm": 2.2062343340152553, + "learning_rate": 3.562597974000084e-06, + "loss": 0.5445, + "step": 5750 + }, + { + "epoch": 0.61, + "grad_norm": 1.1459202020440926, + "learning_rate": 3.5609657611776055e-06, + "loss": 0.5367, + "step": 5751 + }, + { + "epoch": 0.61, + "grad_norm": 2.892508323233743, + "learning_rate": 3.5593337155379663e-06, + "loss": 0.5296, + "step": 5752 + }, + { + "epoch": 0.61, + "grad_norm": 13.502367015161873, + "learning_rate": 3.5577018372707706e-06, + "loss": 0.5599, + "step": 5753 + }, + { + "epoch": 0.61, + "grad_norm": 4.60449934352309, + "learning_rate": 3.5560701265656096e-06, + "loss": 0.5771, + "step": 5754 + }, + { + "epoch": 0.61, + "grad_norm": 2.0330026690570517, + "learning_rate": 3.5544385836120445e-06, + "loss": 0.6543, + "step": 5755 + }, + { + "epoch": 0.61, + "grad_norm": 2.246716338488602, + "learning_rate": 3.552807208599626e-06, + "loss": 0.6199, + "step": 5756 + }, + { + "epoch": 0.61, + "grad_norm": 2.880693301049812, + "learning_rate": 3.551176001717882e-06, + "loss": 0.5812, + "step": 5757 + }, + { + "epoch": 0.61, + "grad_norm": 2.374133819647502, + "learning_rate": 3.549544963156324e-06, + "loss": 0.6039, + "step": 5758 + }, + { + "epoch": 0.61, + "grad_norm": 3.651695196080499, + "learning_rate": 3.5479140931044393e-06, + "loss": 0.6026, + "step": 5759 + }, + { + "epoch": 0.61, + "grad_norm": 2.256349484082301, + "learning_rate": 3.546283391751696e-06, + "loss": 0.5949, + "step": 5760 + }, + { + "epoch": 0.61, + "grad_norm": 2.2426448275819504, + "learning_rate": 3.5446528592875464e-06, + "loss": 0.6444, + "step": 5761 + }, + { + "epoch": 0.61, + "grad_norm": 2.0973705329694723, + "learning_rate": 3.5430224959014215e-06, + "loss": 0.7379, + "step": 5762 + }, + { + "epoch": 0.61, + "grad_norm": 2.2641582085188, + "learning_rate": 3.5413923017827317e-06, + "loss": 0.5749, + "step": 5763 + }, + { + "epoch": 0.61, + "grad_norm": 3.058982375661702, + "learning_rate": 3.5397622771208663e-06, + "loss": 0.5927, + "step": 5764 + }, + { + "epoch": 0.61, + "grad_norm": 2.998975467492036, + "learning_rate": 3.5381324221051995e-06, + "loss": 0.611, + "step": 5765 + }, + { + "epoch": 0.61, + "grad_norm": 2.436460113249485, + "learning_rate": 3.5365027369250804e-06, + "loss": 0.5845, + "step": 5766 + }, + { + "epoch": 0.61, + "grad_norm": 2.336543180347027, + "learning_rate": 3.5348732217698466e-06, + "loss": 0.6229, + "step": 5767 + }, + { + "epoch": 0.61, + "grad_norm": 3.005942928681562, + "learning_rate": 3.533243876828803e-06, + "loss": 0.6442, + "step": 5768 + }, + { + "epoch": 0.61, + "grad_norm": 2.8740853995553226, + "learning_rate": 3.531614702291247e-06, + "loss": 0.5655, + "step": 5769 + }, + { + "epoch": 0.61, + "grad_norm": 25.902005172145653, + "learning_rate": 3.5299856983464497e-06, + "loss": 0.6208, + "step": 5770 + }, + { + "epoch": 0.61, + "grad_norm": 2.499670510652518, + "learning_rate": 3.528356865183665e-06, + "loss": 0.5878, + "step": 5771 + }, + { + "epoch": 0.61, + "grad_norm": 2.5307387633144414, + "learning_rate": 3.526728202992127e-06, + "loss": 0.7085, + "step": 5772 + }, + { + "epoch": 0.61, + "grad_norm": 2.510774660981912, + "learning_rate": 3.525099711961045e-06, + "loss": 0.6365, + "step": 5773 + }, + { + "epoch": 0.61, + "grad_norm": 2.6261622750991034, + "learning_rate": 3.523471392279616e-06, + "loss": 0.6566, + "step": 5774 + }, + { + "epoch": 0.61, + "grad_norm": 2.445814884829749, + "learning_rate": 3.521843244137013e-06, + "loss": 0.7112, + "step": 5775 + }, + { + "epoch": 0.61, + "grad_norm": 3.355458478806883, + "learning_rate": 3.52021526772239e-06, + "loss": 0.6402, + "step": 5776 + }, + { + "epoch": 0.61, + "grad_norm": 2.3733230146293263, + "learning_rate": 3.5185874632248775e-06, + "loss": 0.6198, + "step": 5777 + }, + { + "epoch": 0.61, + "grad_norm": 2.8989292888055886, + "learning_rate": 3.5169598308335915e-06, + "loss": 0.592, + "step": 5778 + }, + { + "epoch": 0.61, + "grad_norm": 2.1665097816063295, + "learning_rate": 3.515332370737625e-06, + "loss": 0.669, + "step": 5779 + }, + { + "epoch": 0.61, + "grad_norm": 1.005926002720759, + "learning_rate": 3.513705083126054e-06, + "loss": 0.6319, + "step": 5780 + }, + { + "epoch": 0.61, + "grad_norm": 2.336459256200082, + "learning_rate": 3.5120779681879286e-06, + "loss": 0.6206, + "step": 5781 + }, + { + "epoch": 0.61, + "grad_norm": 2.298134036866341, + "learning_rate": 3.5104510261122836e-06, + "loss": 0.6818, + "step": 5782 + }, + { + "epoch": 0.61, + "grad_norm": 2.8034284235986724, + "learning_rate": 3.508824257088132e-06, + "loss": 0.653, + "step": 5783 + }, + { + "epoch": 0.61, + "grad_norm": 3.123317385052922, + "learning_rate": 3.507197661304469e-06, + "loss": 0.6499, + "step": 5784 + }, + { + "epoch": 0.61, + "grad_norm": 2.655862388540164, + "learning_rate": 3.505571238950267e-06, + "loss": 0.5429, + "step": 5785 + }, + { + "epoch": 0.61, + "grad_norm": 2.490391865079292, + "learning_rate": 3.5039449902144763e-06, + "loss": 0.6385, + "step": 5786 + }, + { + "epoch": 0.61, + "grad_norm": 4.384431381049101, + "learning_rate": 3.5023189152860325e-06, + "loss": 0.7115, + "step": 5787 + }, + { + "epoch": 0.61, + "grad_norm": 2.6398628266384607, + "learning_rate": 3.5006930143538477e-06, + "loss": 0.6195, + "step": 5788 + }, + { + "epoch": 0.61, + "grad_norm": 5.043162425725876, + "learning_rate": 3.499067287606817e-06, + "loss": 0.5859, + "step": 5789 + }, + { + "epoch": 0.61, + "grad_norm": 2.925176048693361, + "learning_rate": 3.4974417352338074e-06, + "loss": 0.6185, + "step": 5790 + }, + { + "epoch": 0.61, + "grad_norm": 2.3132796544844303, + "learning_rate": 3.495816357423674e-06, + "loss": 0.6343, + "step": 5791 + }, + { + "epoch": 0.61, + "grad_norm": 2.2778784069626004, + "learning_rate": 3.494191154365247e-06, + "loss": 0.6054, + "step": 5792 + }, + { + "epoch": 0.61, + "grad_norm": 2.7460899113803503, + "learning_rate": 3.492566126247341e-06, + "loss": 0.6063, + "step": 5793 + }, + { + "epoch": 0.61, + "grad_norm": 3.201639636764578, + "learning_rate": 3.4909412732587444e-06, + "loss": 0.6561, + "step": 5794 + }, + { + "epoch": 0.61, + "grad_norm": 2.547089477361622, + "learning_rate": 3.4893165955882275e-06, + "loss": 0.5957, + "step": 5795 + }, + { + "epoch": 0.61, + "grad_norm": 2.598764658378845, + "learning_rate": 3.4876920934245423e-06, + "loss": 0.6941, + "step": 5796 + }, + { + "epoch": 0.61, + "grad_norm": 2.3890690610014795, + "learning_rate": 3.486067766956418e-06, + "loss": 0.6066, + "step": 5797 + }, + { + "epoch": 0.61, + "grad_norm": 2.9128726002118785, + "learning_rate": 3.4844436163725648e-06, + "loss": 0.5642, + "step": 5798 + }, + { + "epoch": 0.61, + "grad_norm": 3.8378397404411326, + "learning_rate": 3.48281964186167e-06, + "loss": 0.6248, + "step": 5799 + }, + { + "epoch": 0.61, + "grad_norm": 2.3946114293745913, + "learning_rate": 3.4811958436124036e-06, + "loss": 0.6376, + "step": 5800 + }, + { + "epoch": 0.61, + "grad_norm": 2.903630763714226, + "learning_rate": 3.479572221813413e-06, + "loss": 0.5643, + "step": 5801 + }, + { + "epoch": 0.61, + "grad_norm": 5.522387352098266, + "learning_rate": 3.4779487766533306e-06, + "loss": 0.6095, + "step": 5802 + }, + { + "epoch": 0.61, + "grad_norm": 2.539618765581498, + "learning_rate": 3.4763255083207547e-06, + "loss": 0.5834, + "step": 5803 + }, + { + "epoch": 0.61, + "grad_norm": 3.498863654137628, + "learning_rate": 3.4747024170042785e-06, + "loss": 0.6581, + "step": 5804 + }, + { + "epoch": 0.61, + "grad_norm": 2.0699886181482836, + "learning_rate": 3.473079502892466e-06, + "loss": 0.686, + "step": 5805 + }, + { + "epoch": 0.61, + "grad_norm": 2.56411416397428, + "learning_rate": 3.4714567661738635e-06, + "loss": 0.6205, + "step": 5806 + }, + { + "epoch": 0.61, + "grad_norm": 3.6804375443854056, + "learning_rate": 3.469834207036996e-06, + "loss": 0.5698, + "step": 5807 + }, + { + "epoch": 0.61, + "grad_norm": 2.3140301754849704, + "learning_rate": 3.4682118256703657e-06, + "loss": 0.6665, + "step": 5808 + }, + { + "epoch": 0.61, + "grad_norm": 5.132346843122038, + "learning_rate": 3.4665896222624585e-06, + "loss": 0.6473, + "step": 5809 + }, + { + "epoch": 0.61, + "grad_norm": 2.1626805670201685, + "learning_rate": 3.4649675970017355e-06, + "loss": 0.5748, + "step": 5810 + }, + { + "epoch": 0.61, + "grad_norm": 2.6486782602012684, + "learning_rate": 3.4633457500766413e-06, + "loss": 0.6031, + "step": 5811 + }, + { + "epoch": 0.61, + "grad_norm": 2.5370141596453624, + "learning_rate": 3.4617240816755937e-06, + "loss": 0.5877, + "step": 5812 + }, + { + "epoch": 0.61, + "grad_norm": 2.5534598070476, + "learning_rate": 3.460102591986997e-06, + "loss": 0.6301, + "step": 5813 + }, + { + "epoch": 0.61, + "grad_norm": 3.3454701642307962, + "learning_rate": 3.4584812811992287e-06, + "loss": 0.6339, + "step": 5814 + }, + { + "epoch": 0.61, + "grad_norm": 3.0095813678630763, + "learning_rate": 3.4568601495006503e-06, + "loss": 0.6619, + "step": 5815 + }, + { + "epoch": 0.61, + "grad_norm": 2.4805455518296835, + "learning_rate": 3.4552391970795984e-06, + "loss": 0.6543, + "step": 5816 + }, + { + "epoch": 0.61, + "grad_norm": 2.491050633881681, + "learning_rate": 3.453618424124392e-06, + "loss": 0.6431, + "step": 5817 + }, + { + "epoch": 0.61, + "grad_norm": 5.733199748643421, + "learning_rate": 3.451997830823325e-06, + "loss": 0.5999, + "step": 5818 + }, + { + "epoch": 0.61, + "grad_norm": 3.0026297623529605, + "learning_rate": 3.4503774173646767e-06, + "loss": 0.5972, + "step": 5819 + }, + { + "epoch": 0.61, + "grad_norm": 3.1106086481537787, + "learning_rate": 3.448757183936701e-06, + "loss": 0.6202, + "step": 5820 + }, + { + "epoch": 0.61, + "grad_norm": 5.983654597238341, + "learning_rate": 3.447137130727629e-06, + "loss": 0.6625, + "step": 5821 + }, + { + "epoch": 0.61, + "grad_norm": 3.81371149233971, + "learning_rate": 3.4455172579256784e-06, + "loss": 0.6177, + "step": 5822 + }, + { + "epoch": 0.61, + "grad_norm": 2.458016439088697, + "learning_rate": 3.4438975657190375e-06, + "loss": 0.7018, + "step": 5823 + }, + { + "epoch": 0.61, + "grad_norm": 1.9033971357544983, + "learning_rate": 3.442278054295883e-06, + "loss": 0.5718, + "step": 5824 + }, + { + "epoch": 0.61, + "grad_norm": 2.9518489778855823, + "learning_rate": 3.440658723844358e-06, + "loss": 0.6359, + "step": 5825 + }, + { + "epoch": 0.61, + "grad_norm": 2.6141808916194367, + "learning_rate": 3.439039574552595e-06, + "loss": 0.5865, + "step": 5826 + }, + { + "epoch": 0.61, + "grad_norm": 2.2429938266365808, + "learning_rate": 3.437420606608701e-06, + "loss": 0.6993, + "step": 5827 + }, + { + "epoch": 0.61, + "grad_norm": 2.1988445058564614, + "learning_rate": 3.435801820200767e-06, + "loss": 0.6538, + "step": 5828 + }, + { + "epoch": 0.61, + "grad_norm": 2.093978386990422, + "learning_rate": 3.4341832155168547e-06, + "loss": 0.6003, + "step": 5829 + }, + { + "epoch": 0.61, + "grad_norm": 2.311995229831535, + "learning_rate": 3.432564792745009e-06, + "loss": 0.6298, + "step": 5830 + }, + { + "epoch": 0.61, + "grad_norm": 2.324588374264805, + "learning_rate": 3.4309465520732556e-06, + "loss": 0.5572, + "step": 5831 + }, + { + "epoch": 0.61, + "grad_norm": 1.0044411358111915, + "learning_rate": 3.4293284936895956e-06, + "loss": 0.548, + "step": 5832 + }, + { + "epoch": 0.61, + "grad_norm": 4.285667049229145, + "learning_rate": 3.4277106177820123e-06, + "loss": 0.6496, + "step": 5833 + }, + { + "epoch": 0.61, + "grad_norm": 2.389684858763753, + "learning_rate": 3.426092924538462e-06, + "loss": 0.654, + "step": 5834 + }, + { + "epoch": 0.61, + "grad_norm": 2.2261660392871327, + "learning_rate": 3.4244754141468878e-06, + "loss": 0.7062, + "step": 5835 + }, + { + "epoch": 0.61, + "grad_norm": 2.237433451917938, + "learning_rate": 3.4228580867952044e-06, + "loss": 0.6249, + "step": 5836 + }, + { + "epoch": 0.61, + "grad_norm": 3.0863620341722386, + "learning_rate": 3.421240942671312e-06, + "loss": 0.6019, + "step": 5837 + }, + { + "epoch": 0.61, + "grad_norm": 2.5102635909640636, + "learning_rate": 3.4196239819630806e-06, + "loss": 0.5542, + "step": 5838 + }, + { + "epoch": 0.61, + "grad_norm": 2.377771418582857, + "learning_rate": 3.4180072048583667e-06, + "loss": 0.7179, + "step": 5839 + }, + { + "epoch": 0.61, + "grad_norm": 2.8495071063526494, + "learning_rate": 3.4163906115450025e-06, + "loss": 0.6351, + "step": 5840 + }, + { + "epoch": 0.61, + "grad_norm": 4.504564584687121, + "learning_rate": 3.4147742022108e-06, + "loss": 0.5398, + "step": 5841 + }, + { + "epoch": 0.61, + "grad_norm": 2.7848784093251355, + "learning_rate": 3.4131579770435495e-06, + "loss": 0.67, + "step": 5842 + }, + { + "epoch": 0.61, + "grad_norm": 2.7193060597973844, + "learning_rate": 3.411541936231016e-06, + "loss": 0.6779, + "step": 5843 + }, + { + "epoch": 0.61, + "grad_norm": 2.9893759030534155, + "learning_rate": 3.409926079960949e-06, + "loss": 0.5734, + "step": 5844 + }, + { + "epoch": 0.62, + "grad_norm": 2.402119046084388, + "learning_rate": 3.4083104084210746e-06, + "loss": 0.7261, + "step": 5845 + }, + { + "epoch": 0.62, + "grad_norm": 3.2792238487813976, + "learning_rate": 3.4066949217990964e-06, + "loss": 0.6428, + "step": 5846 + }, + { + "epoch": 0.62, + "grad_norm": 2.754234915442587, + "learning_rate": 3.4050796202826943e-06, + "loss": 0.6201, + "step": 5847 + }, + { + "epoch": 0.62, + "grad_norm": 2.5663658021698423, + "learning_rate": 3.4034645040595325e-06, + "loss": 0.6867, + "step": 5848 + }, + { + "epoch": 0.62, + "grad_norm": 2.808416018507515, + "learning_rate": 3.4018495733172485e-06, + "loss": 0.7043, + "step": 5849 + }, + { + "epoch": 0.62, + "grad_norm": 2.945233847925162, + "learning_rate": 3.4002348282434637e-06, + "loss": 0.5994, + "step": 5850 + }, + { + "epoch": 0.62, + "grad_norm": 2.242388743778958, + "learning_rate": 3.3986202690257707e-06, + "loss": 0.689, + "step": 5851 + }, + { + "epoch": 0.62, + "grad_norm": 2.706078895924368, + "learning_rate": 3.397005895851746e-06, + "loss": 0.6305, + "step": 5852 + }, + { + "epoch": 0.62, + "grad_norm": 2.4900554279730427, + "learning_rate": 3.395391708908941e-06, + "loss": 0.6242, + "step": 5853 + }, + { + "epoch": 0.62, + "grad_norm": 3.2235848293555365, + "learning_rate": 3.39377770838489e-06, + "loss": 0.6205, + "step": 5854 + }, + { + "epoch": 0.62, + "grad_norm": 2.1780532875830025, + "learning_rate": 3.392163894467103e-06, + "loss": 0.6479, + "step": 5855 + }, + { + "epoch": 0.62, + "grad_norm": 2.180928205646365, + "learning_rate": 3.3905502673430648e-06, + "loss": 0.6474, + "step": 5856 + }, + { + "epoch": 0.62, + "grad_norm": 2.0354934950603507, + "learning_rate": 3.3889368272002455e-06, + "loss": 0.6783, + "step": 5857 + }, + { + "epoch": 0.62, + "grad_norm": 13.984789819926613, + "learning_rate": 3.387323574226087e-06, + "loss": 0.6263, + "step": 5858 + }, + { + "epoch": 0.62, + "grad_norm": 2.841089761628964, + "learning_rate": 3.385710508608017e-06, + "loss": 0.6473, + "step": 5859 + }, + { + "epoch": 0.62, + "grad_norm": 4.189754673822482, + "learning_rate": 3.3840976305334304e-06, + "loss": 0.5959, + "step": 5860 + }, + { + "epoch": 0.62, + "grad_norm": 2.5889741809528988, + "learning_rate": 3.382484940189711e-06, + "loss": 0.6292, + "step": 5861 + }, + { + "epoch": 0.62, + "grad_norm": 2.289269598614149, + "learning_rate": 3.380872437764215e-06, + "loss": 0.6444, + "step": 5862 + }, + { + "epoch": 0.62, + "grad_norm": 2.422036993894899, + "learning_rate": 3.37926012344428e-06, + "loss": 0.5488, + "step": 5863 + }, + { + "epoch": 0.62, + "grad_norm": 2.456124041276871, + "learning_rate": 3.3776479974172184e-06, + "loss": 0.6358, + "step": 5864 + }, + { + "epoch": 0.62, + "grad_norm": 2.4906368084019817, + "learning_rate": 3.3760360598703217e-06, + "loss": 0.5991, + "step": 5865 + }, + { + "epoch": 0.62, + "grad_norm": 2.6266749863791805, + "learning_rate": 3.374424310990862e-06, + "loss": 0.6148, + "step": 5866 + }, + { + "epoch": 0.62, + "grad_norm": 2.513373641942084, + "learning_rate": 3.372812750966087e-06, + "loss": 0.6433, + "step": 5867 + }, + { + "epoch": 0.62, + "grad_norm": 2.884160237208583, + "learning_rate": 3.371201379983223e-06, + "loss": 0.5771, + "step": 5868 + }, + { + "epoch": 0.62, + "grad_norm": 3.2671886681135485, + "learning_rate": 3.369590198229473e-06, + "loss": 0.5949, + "step": 5869 + }, + { + "epoch": 0.62, + "grad_norm": 2.2010932339105223, + "learning_rate": 3.3679792058920223e-06, + "loss": 0.677, + "step": 5870 + }, + { + "epoch": 0.62, + "grad_norm": 2.8453784039703374, + "learning_rate": 3.366368403158028e-06, + "loss": 0.604, + "step": 5871 + }, + { + "epoch": 0.62, + "grad_norm": 2.641273584480205, + "learning_rate": 3.3647577902146334e-06, + "loss": 0.6553, + "step": 5872 + }, + { + "epoch": 0.62, + "grad_norm": 2.3540207447949997, + "learning_rate": 3.363147367248949e-06, + "loss": 0.6874, + "step": 5873 + }, + { + "epoch": 0.62, + "grad_norm": 2.1061808611226644, + "learning_rate": 3.3615371344480725e-06, + "loss": 0.5198, + "step": 5874 + }, + { + "epoch": 0.62, + "grad_norm": 3.610784500535765, + "learning_rate": 3.3599270919990744e-06, + "loss": 0.6982, + "step": 5875 + }, + { + "epoch": 0.62, + "grad_norm": 17.35217476906217, + "learning_rate": 3.358317240089008e-06, + "loss": 0.6382, + "step": 5876 + }, + { + "epoch": 0.62, + "grad_norm": 2.3499225456097874, + "learning_rate": 3.3567075789048973e-06, + "loss": 0.6571, + "step": 5877 + }, + { + "epoch": 0.62, + "grad_norm": 4.846781393555933, + "learning_rate": 3.35509810863375e-06, + "loss": 0.7104, + "step": 5878 + }, + { + "epoch": 0.62, + "grad_norm": 3.0399172317674497, + "learning_rate": 3.35348882946255e-06, + "loss": 0.6536, + "step": 5879 + }, + { + "epoch": 0.62, + "grad_norm": 3.2181809417118203, + "learning_rate": 3.3518797415782577e-06, + "loss": 0.5946, + "step": 5880 + }, + { + "epoch": 0.62, + "grad_norm": 3.7651214632668175, + "learning_rate": 3.3502708451678145e-06, + "loss": 0.6062, + "step": 5881 + }, + { + "epoch": 0.62, + "grad_norm": 2.5978846672711, + "learning_rate": 3.348662140418133e-06, + "loss": 0.6363, + "step": 5882 + }, + { + "epoch": 0.62, + "grad_norm": 7.165618488483955, + "learning_rate": 3.3470536275161126e-06, + "loss": 0.5837, + "step": 5883 + }, + { + "epoch": 0.62, + "grad_norm": 3.6883277185410073, + "learning_rate": 3.3454453066486214e-06, + "loss": 0.6308, + "step": 5884 + }, + { + "epoch": 0.62, + "grad_norm": 2.1630804016878504, + "learning_rate": 3.3438371780025136e-06, + "loss": 0.6886, + "step": 5885 + }, + { + "epoch": 0.62, + "grad_norm": 2.269499258419974, + "learning_rate": 3.342229241764615e-06, + "loss": 0.6706, + "step": 5886 + }, + { + "epoch": 0.62, + "grad_norm": 2.3398677700347497, + "learning_rate": 3.34062149812173e-06, + "loss": 0.6235, + "step": 5887 + }, + { + "epoch": 0.62, + "grad_norm": 2.507757222319039, + "learning_rate": 3.339013947260642e-06, + "loss": 0.5739, + "step": 5888 + }, + { + "epoch": 0.62, + "grad_norm": 2.9578807377909313, + "learning_rate": 3.337406589368113e-06, + "loss": 0.6063, + "step": 5889 + }, + { + "epoch": 0.62, + "grad_norm": 2.2486291156021263, + "learning_rate": 3.3357994246308815e-06, + "loss": 0.6258, + "step": 5890 + }, + { + "epoch": 0.62, + "grad_norm": 2.4560317757961596, + "learning_rate": 3.3341924532356605e-06, + "loss": 0.6704, + "step": 5891 + }, + { + "epoch": 0.62, + "grad_norm": 2.4000882896893856, + "learning_rate": 3.3325856753691453e-06, + "loss": 0.6653, + "step": 5892 + }, + { + "epoch": 0.62, + "grad_norm": 2.309148094791573, + "learning_rate": 3.3309790912180056e-06, + "loss": 0.5464, + "step": 5893 + }, + { + "epoch": 0.62, + "grad_norm": 2.9793468104494654, + "learning_rate": 3.329372700968894e-06, + "loss": 0.6522, + "step": 5894 + }, + { + "epoch": 0.62, + "grad_norm": 3.234552857190179, + "learning_rate": 3.3277665048084283e-06, + "loss": 0.6763, + "step": 5895 + }, + { + "epoch": 0.62, + "grad_norm": 2.305618418961012, + "learning_rate": 3.326160502923218e-06, + "loss": 0.5466, + "step": 5896 + }, + { + "epoch": 0.62, + "grad_norm": 2.4393905813677113, + "learning_rate": 3.324554695499841e-06, + "loss": 0.6869, + "step": 5897 + }, + { + "epoch": 0.62, + "grad_norm": 3.1678663773433686, + "learning_rate": 3.3229490827248585e-06, + "loss": 0.6572, + "step": 5898 + }, + { + "epoch": 0.62, + "grad_norm": 2.810314446746764, + "learning_rate": 3.3213436647848017e-06, + "loss": 0.6039, + "step": 5899 + }, + { + "epoch": 0.62, + "grad_norm": 2.616553362514333, + "learning_rate": 3.3197384418661854e-06, + "loss": 0.6404, + "step": 5900 + }, + { + "epoch": 0.62, + "grad_norm": 2.606456810157447, + "learning_rate": 3.3181334141555003e-06, + "loss": 0.6813, + "step": 5901 + }, + { + "epoch": 0.62, + "grad_norm": 3.352922919680765, + "learning_rate": 3.3165285818392133e-06, + "loss": 0.5888, + "step": 5902 + }, + { + "epoch": 0.62, + "grad_norm": 3.765095323296954, + "learning_rate": 3.3149239451037706e-06, + "loss": 0.6115, + "step": 5903 + }, + { + "epoch": 0.62, + "grad_norm": 2.0882597233379716, + "learning_rate": 3.313319504135591e-06, + "loss": 0.6357, + "step": 5904 + }, + { + "epoch": 0.62, + "grad_norm": 2.3840323224262576, + "learning_rate": 3.3117152591210765e-06, + "loss": 0.6076, + "step": 5905 + }, + { + "epoch": 0.62, + "grad_norm": 12.879550786415358, + "learning_rate": 3.3101112102466014e-06, + "loss": 0.6847, + "step": 5906 + }, + { + "epoch": 0.62, + "grad_norm": 0.9258667628138425, + "learning_rate": 3.308507357698525e-06, + "loss": 0.5716, + "step": 5907 + }, + { + "epoch": 0.62, + "grad_norm": 2.468162429286832, + "learning_rate": 3.30690370166317e-06, + "loss": 0.6944, + "step": 5908 + }, + { + "epoch": 0.62, + "grad_norm": 3.4587947376198205, + "learning_rate": 3.30530024232685e-06, + "loss": 0.6611, + "step": 5909 + }, + { + "epoch": 0.62, + "grad_norm": 2.7167118186662447, + "learning_rate": 3.3036969798758486e-06, + "loss": 0.6919, + "step": 5910 + }, + { + "epoch": 0.62, + "grad_norm": 2.8540713326378624, + "learning_rate": 3.3020939144964298e-06, + "loss": 0.5524, + "step": 5911 + }, + { + "epoch": 0.62, + "grad_norm": 2.633945852372088, + "learning_rate": 3.3004910463748315e-06, + "loss": 0.6379, + "step": 5912 + }, + { + "epoch": 0.62, + "grad_norm": 2.6471695256154977, + "learning_rate": 3.298888375697269e-06, + "loss": 0.6573, + "step": 5913 + }, + { + "epoch": 0.62, + "grad_norm": 2.7889126114993785, + "learning_rate": 3.2972859026499395e-06, + "loss": 0.5756, + "step": 5914 + }, + { + "epoch": 0.62, + "grad_norm": 2.4597559748985436, + "learning_rate": 3.2956836274190107e-06, + "loss": 0.6519, + "step": 5915 + }, + { + "epoch": 0.62, + "grad_norm": 6.22590279694477, + "learning_rate": 3.294081550190633e-06, + "loss": 0.5535, + "step": 5916 + }, + { + "epoch": 0.62, + "grad_norm": 3.1209221601644255, + "learning_rate": 3.292479671150927e-06, + "loss": 0.6076, + "step": 5917 + }, + { + "epoch": 0.62, + "grad_norm": 2.6511690459462853, + "learning_rate": 3.290877990485999e-06, + "loss": 0.5224, + "step": 5918 + }, + { + "epoch": 0.62, + "grad_norm": 1.017349317050653, + "learning_rate": 3.2892765083819233e-06, + "loss": 0.515, + "step": 5919 + }, + { + "epoch": 0.62, + "grad_norm": 2.7093325400474417, + "learning_rate": 3.28767522502476e-06, + "loss": 0.6475, + "step": 5920 + }, + { + "epoch": 0.62, + "grad_norm": 2.471371183047004, + "learning_rate": 3.2860741406005383e-06, + "loss": 0.5238, + "step": 5921 + }, + { + "epoch": 0.62, + "grad_norm": 2.6312257555514824, + "learning_rate": 3.2844732552952686e-06, + "loss": 0.6347, + "step": 5922 + }, + { + "epoch": 0.62, + "grad_norm": 2.411758053136448, + "learning_rate": 3.282872569294936e-06, + "loss": 0.6182, + "step": 5923 + }, + { + "epoch": 0.62, + "grad_norm": 2.5478750822742935, + "learning_rate": 3.281272082785506e-06, + "loss": 0.481, + "step": 5924 + }, + { + "epoch": 0.62, + "grad_norm": 2.3428946425529644, + "learning_rate": 3.2796717959529167e-06, + "loss": 0.6574, + "step": 5925 + }, + { + "epoch": 0.62, + "grad_norm": 2.5726580568876902, + "learning_rate": 3.2780717089830845e-06, + "loss": 0.6016, + "step": 5926 + }, + { + "epoch": 0.62, + "grad_norm": 2.284631597557582, + "learning_rate": 3.276471822061904e-06, + "loss": 0.5906, + "step": 5927 + }, + { + "epoch": 0.62, + "grad_norm": 3.003461067354797, + "learning_rate": 3.2748721353752445e-06, + "loss": 0.683, + "step": 5928 + }, + { + "epoch": 0.62, + "grad_norm": 2.570345614518788, + "learning_rate": 3.2732726491089562e-06, + "loss": 0.6233, + "step": 5929 + }, + { + "epoch": 0.62, + "grad_norm": 2.7306210262884796, + "learning_rate": 3.2716733634488563e-06, + "loss": 0.6855, + "step": 5930 + }, + { + "epoch": 0.62, + "grad_norm": 2.4961398274059046, + "learning_rate": 3.2700742785807503e-06, + "loss": 0.6488, + "step": 5931 + }, + { + "epoch": 0.62, + "grad_norm": 2.674949765424381, + "learning_rate": 3.2684753946904136e-06, + "loss": 0.5726, + "step": 5932 + }, + { + "epoch": 0.62, + "grad_norm": 3.1722015237208283, + "learning_rate": 3.266876711963602e-06, + "loss": 0.5482, + "step": 5933 + }, + { + "epoch": 0.62, + "grad_norm": 2.4532631137041836, + "learning_rate": 3.265278230586043e-06, + "loss": 0.6033, + "step": 5934 + }, + { + "epoch": 0.62, + "grad_norm": 5.84912133996752, + "learning_rate": 3.2636799507434447e-06, + "loss": 0.6797, + "step": 5935 + }, + { + "epoch": 0.62, + "grad_norm": 2.687307510260449, + "learning_rate": 3.2620818726214888e-06, + "loss": 0.6185, + "step": 5936 + }, + { + "epoch": 0.62, + "grad_norm": 2.3580307891356673, + "learning_rate": 3.260483996405839e-06, + "loss": 0.6161, + "step": 5937 + }, + { + "epoch": 0.62, + "grad_norm": 2.074986511593617, + "learning_rate": 3.2588863222821306e-06, + "loss": 0.6061, + "step": 5938 + }, + { + "epoch": 0.62, + "grad_norm": 2.6795167053076145, + "learning_rate": 3.2572888504359743e-06, + "loss": 0.648, + "step": 5939 + }, + { + "epoch": 0.63, + "grad_norm": 0.9485630736671, + "learning_rate": 3.2556915810529627e-06, + "loss": 0.5871, + "step": 5940 + }, + { + "epoch": 0.63, + "grad_norm": 3.589804098570901, + "learning_rate": 3.25409451431866e-06, + "loss": 0.5836, + "step": 5941 + }, + { + "epoch": 0.63, + "grad_norm": 2.029028547694551, + "learning_rate": 3.2524976504186125e-06, + "loss": 0.6208, + "step": 5942 + }, + { + "epoch": 0.63, + "grad_norm": 2.457015002957502, + "learning_rate": 3.2509009895383337e-06, + "loss": 0.5133, + "step": 5943 + }, + { + "epoch": 0.63, + "grad_norm": 2.7782124039908958, + "learning_rate": 3.249304531863323e-06, + "loss": 0.6363, + "step": 5944 + }, + { + "epoch": 0.63, + "grad_norm": 2.7887314599627198, + "learning_rate": 3.247708277579049e-06, + "loss": 0.693, + "step": 5945 + }, + { + "epoch": 0.63, + "grad_norm": 5.059722447279897, + "learning_rate": 3.2461122268709657e-06, + "loss": 0.6958, + "step": 5946 + }, + { + "epoch": 0.63, + "grad_norm": 3.609162372409086, + "learning_rate": 3.244516379924492e-06, + "loss": 0.5227, + "step": 5947 + }, + { + "epoch": 0.63, + "grad_norm": 2.5711727147391903, + "learning_rate": 3.242920736925029e-06, + "loss": 0.5873, + "step": 5948 + }, + { + "epoch": 0.63, + "grad_norm": 2.965512354865726, + "learning_rate": 3.2413252980579572e-06, + "loss": 0.5924, + "step": 5949 + }, + { + "epoch": 0.63, + "grad_norm": 4.480684034499855, + "learning_rate": 3.239730063508629e-06, + "loss": 0.6539, + "step": 5950 + }, + { + "epoch": 0.63, + "grad_norm": 3.0554230163758693, + "learning_rate": 3.238135033462374e-06, + "loss": 0.6008, + "step": 5951 + }, + { + "epoch": 0.63, + "grad_norm": 2.264871265257945, + "learning_rate": 3.2365402081044955e-06, + "loss": 0.7099, + "step": 5952 + }, + { + "epoch": 0.63, + "grad_norm": 2.4974903193187594, + "learning_rate": 3.2349455876202797e-06, + "loss": 0.6184, + "step": 5953 + }, + { + "epoch": 0.63, + "grad_norm": 2.18292964645188, + "learning_rate": 3.2333511721949816e-06, + "loss": 0.576, + "step": 5954 + }, + { + "epoch": 0.63, + "grad_norm": 2.187323524231644, + "learning_rate": 3.23175696201384e-06, + "loss": 0.6029, + "step": 5955 + }, + { + "epoch": 0.63, + "grad_norm": 2.42051305833246, + "learning_rate": 3.230162957262062e-06, + "loss": 0.5949, + "step": 5956 + }, + { + "epoch": 0.63, + "grad_norm": 2.437281602803996, + "learning_rate": 3.2285691581248345e-06, + "loss": 0.6308, + "step": 5957 + }, + { + "epoch": 0.63, + "grad_norm": 2.188705793501174, + "learning_rate": 3.226975564787322e-06, + "loss": 0.6577, + "step": 5958 + }, + { + "epoch": 0.63, + "grad_norm": 2.095497000196284, + "learning_rate": 3.2253821774346644e-06, + "loss": 0.613, + "step": 5959 + }, + { + "epoch": 0.63, + "grad_norm": 2.217153208860546, + "learning_rate": 3.2237889962519748e-06, + "loss": 0.5494, + "step": 5960 + }, + { + "epoch": 0.63, + "grad_norm": 3.0997362892532534, + "learning_rate": 3.2221960214243437e-06, + "loss": 0.6854, + "step": 5961 + }, + { + "epoch": 0.63, + "grad_norm": 2.176771504376497, + "learning_rate": 3.2206032531368407e-06, + "loss": 0.625, + "step": 5962 + }, + { + "epoch": 0.63, + "grad_norm": 3.3368855193919957, + "learning_rate": 3.2190106915745077e-06, + "loss": 0.6591, + "step": 5963 + }, + { + "epoch": 0.63, + "grad_norm": 2.221750102788297, + "learning_rate": 3.2174183369223667e-06, + "loss": 0.5971, + "step": 5964 + }, + { + "epoch": 0.63, + "grad_norm": 2.3884878579108473, + "learning_rate": 3.2158261893654073e-06, + "loss": 0.7223, + "step": 5965 + }, + { + "epoch": 0.63, + "grad_norm": 2.8667030280760954, + "learning_rate": 3.214234249088605e-06, + "loss": 0.6087, + "step": 5966 + }, + { + "epoch": 0.63, + "grad_norm": 2.172551788871772, + "learning_rate": 3.212642516276905e-06, + "loss": 0.5243, + "step": 5967 + }, + { + "epoch": 0.63, + "grad_norm": 2.7291377150296534, + "learning_rate": 3.2110509911152315e-06, + "loss": 0.5997, + "step": 5968 + }, + { + "epoch": 0.63, + "grad_norm": 1.9384746293091961, + "learning_rate": 3.209459673788482e-06, + "loss": 0.5464, + "step": 5969 + }, + { + "epoch": 0.63, + "grad_norm": 2.058472029705662, + "learning_rate": 3.207868564481532e-06, + "loss": 0.5822, + "step": 5970 + }, + { + "epoch": 0.63, + "grad_norm": 2.567086993673489, + "learning_rate": 3.2062776633792303e-06, + "loss": 0.6797, + "step": 5971 + }, + { + "epoch": 0.63, + "grad_norm": 3.1555293622681844, + "learning_rate": 3.2046869706664074e-06, + "loss": 0.5843, + "step": 5972 + }, + { + "epoch": 0.63, + "grad_norm": 2.3596960035354035, + "learning_rate": 3.2030964865278604e-06, + "loss": 0.5639, + "step": 5973 + }, + { + "epoch": 0.63, + "grad_norm": 2.0640908869935912, + "learning_rate": 3.2015062111483688e-06, + "loss": 0.6841, + "step": 5974 + }, + { + "epoch": 0.63, + "grad_norm": 2.689198480543773, + "learning_rate": 3.199916144712688e-06, + "loss": 0.6605, + "step": 5975 + }, + { + "epoch": 0.63, + "grad_norm": 2.6877494467316985, + "learning_rate": 3.1983262874055442e-06, + "loss": 0.5771, + "step": 5976 + }, + { + "epoch": 0.63, + "grad_norm": 2.772844595362914, + "learning_rate": 3.1967366394116477e-06, + "loss": 0.6137, + "step": 5977 + }, + { + "epoch": 0.63, + "grad_norm": 1.0203362646742848, + "learning_rate": 3.1951472009156725e-06, + "loss": 0.5498, + "step": 5978 + }, + { + "epoch": 0.63, + "grad_norm": 2.242251234684735, + "learning_rate": 3.193557972102279e-06, + "loss": 0.6582, + "step": 5979 + }, + { + "epoch": 0.63, + "grad_norm": 3.8382421017907684, + "learning_rate": 3.191968953156098e-06, + "loss": 0.6473, + "step": 5980 + }, + { + "epoch": 0.63, + "grad_norm": 2.1209413221002222, + "learning_rate": 3.19038014426174e-06, + "loss": 0.6004, + "step": 5981 + }, + { + "epoch": 0.63, + "grad_norm": 2.2255883161813963, + "learning_rate": 3.1887915456037833e-06, + "loss": 0.6218, + "step": 5982 + }, + { + "epoch": 0.63, + "grad_norm": 2.8783014772228532, + "learning_rate": 3.1872031573667895e-06, + "loss": 0.7142, + "step": 5983 + }, + { + "epoch": 0.63, + "grad_norm": 2.5531410132078363, + "learning_rate": 3.185614979735293e-06, + "loss": 0.5936, + "step": 5984 + }, + { + "epoch": 0.63, + "grad_norm": 2.565229169837344, + "learning_rate": 3.1840270128938032e-06, + "loss": 0.6143, + "step": 5985 + }, + { + "epoch": 0.63, + "grad_norm": 2.045126577655692, + "learning_rate": 3.182439257026807e-06, + "loss": 0.5457, + "step": 5986 + }, + { + "epoch": 0.63, + "grad_norm": 2.0857258123928033, + "learning_rate": 3.180851712318761e-06, + "loss": 0.6753, + "step": 5987 + }, + { + "epoch": 0.63, + "grad_norm": 2.631692022599449, + "learning_rate": 3.179264378954106e-06, + "loss": 0.5913, + "step": 5988 + }, + { + "epoch": 0.63, + "grad_norm": 3.8525652346655925, + "learning_rate": 3.1776772571172514e-06, + "loss": 0.7053, + "step": 5989 + }, + { + "epoch": 0.63, + "grad_norm": 2.2544931285363856, + "learning_rate": 3.1760903469925874e-06, + "loss": 0.6392, + "step": 5990 + }, + { + "epoch": 0.63, + "grad_norm": 2.2143334987948213, + "learning_rate": 3.174503648764473e-06, + "loss": 0.6196, + "step": 5991 + }, + { + "epoch": 0.63, + "grad_norm": 2.7607820463483015, + "learning_rate": 3.1729171626172485e-06, + "loss": 0.6129, + "step": 5992 + }, + { + "epoch": 0.63, + "grad_norm": 2.431241441047194, + "learning_rate": 3.1713308887352244e-06, + "loss": 0.5663, + "step": 5993 + }, + { + "epoch": 0.63, + "grad_norm": 0.9957317360641763, + "learning_rate": 3.1697448273026944e-06, + "loss": 0.5867, + "step": 5994 + }, + { + "epoch": 0.63, + "grad_norm": 4.718815023192582, + "learning_rate": 3.1681589785039178e-06, + "loss": 0.5582, + "step": 5995 + }, + { + "epoch": 0.63, + "grad_norm": 2.3173917102419392, + "learning_rate": 3.1665733425231356e-06, + "loss": 0.5831, + "step": 5996 + }, + { + "epoch": 0.63, + "grad_norm": 2.9747101713599116, + "learning_rate": 3.164987919544563e-06, + "loss": 0.6145, + "step": 5997 + }, + { + "epoch": 0.63, + "grad_norm": 3.0576887431965263, + "learning_rate": 3.163402709752389e-06, + "loss": 0.6303, + "step": 5998 + }, + { + "epoch": 0.63, + "grad_norm": 2.2892471503762697, + "learning_rate": 3.1618177133307813e-06, + "loss": 0.5384, + "step": 5999 + }, + { + "epoch": 0.63, + "grad_norm": 2.0697756117981876, + "learning_rate": 3.1602329304638755e-06, + "loss": 0.6807, + "step": 6000 + }, + { + "epoch": 0.63, + "grad_norm": 2.1548228132567897, + "learning_rate": 3.158648361335791e-06, + "loss": 0.6769, + "step": 6001 + }, + { + "epoch": 0.63, + "grad_norm": 2.171892545453622, + "learning_rate": 3.1570640061306153e-06, + "loss": 0.5813, + "step": 6002 + }, + { + "epoch": 0.63, + "grad_norm": 3.5558082679769543, + "learning_rate": 3.1554798650324183e-06, + "loss": 0.5725, + "step": 6003 + }, + { + "epoch": 0.63, + "grad_norm": 2.1783862920998778, + "learning_rate": 3.1538959382252378e-06, + "loss": 0.6095, + "step": 6004 + }, + { + "epoch": 0.63, + "grad_norm": 2.439670545080028, + "learning_rate": 3.1523122258930904e-06, + "loss": 0.6402, + "step": 6005 + }, + { + "epoch": 0.63, + "grad_norm": 2.1135984445220846, + "learning_rate": 3.150728728219966e-06, + "loss": 0.5736, + "step": 6006 + }, + { + "epoch": 0.63, + "grad_norm": 2.2613893512798544, + "learning_rate": 3.149145445389835e-06, + "loss": 0.6038, + "step": 6007 + }, + { + "epoch": 0.63, + "grad_norm": 2.7430614334073287, + "learning_rate": 3.147562377586635e-06, + "loss": 0.6266, + "step": 6008 + }, + { + "epoch": 0.63, + "grad_norm": 2.8377286942815885, + "learning_rate": 3.1459795249942815e-06, + "loss": 0.6455, + "step": 6009 + }, + { + "epoch": 0.63, + "grad_norm": 0.9591836036301431, + "learning_rate": 3.144396887796669e-06, + "loss": 0.5318, + "step": 6010 + }, + { + "epoch": 0.63, + "grad_norm": 2.0086517588320363, + "learning_rate": 3.1428144661776605e-06, + "loss": 0.5734, + "step": 6011 + }, + { + "epoch": 0.63, + "grad_norm": 2.218164590981601, + "learning_rate": 3.141232260321102e-06, + "loss": 0.683, + "step": 6012 + }, + { + "epoch": 0.63, + "grad_norm": 2.503329798201055, + "learning_rate": 3.1396502704108034e-06, + "loss": 0.6242, + "step": 6013 + }, + { + "epoch": 0.63, + "grad_norm": 3.221827577676699, + "learning_rate": 3.13806849663056e-06, + "loss": 0.5933, + "step": 6014 + }, + { + "epoch": 0.63, + "grad_norm": 3.6189027947412518, + "learning_rate": 3.1364869391641343e-06, + "loss": 0.5451, + "step": 6015 + }, + { + "epoch": 0.63, + "grad_norm": 2.2116203062161435, + "learning_rate": 3.1349055981952725e-06, + "loss": 0.5863, + "step": 6016 + }, + { + "epoch": 0.63, + "grad_norm": 2.4603820875117735, + "learning_rate": 3.133324473907685e-06, + "loss": 0.6867, + "step": 6017 + }, + { + "epoch": 0.63, + "grad_norm": 4.688134030426931, + "learning_rate": 3.1317435664850626e-06, + "loss": 0.6107, + "step": 6018 + }, + { + "epoch": 0.63, + "grad_norm": 2.2474502089494233, + "learning_rate": 3.130162876111074e-06, + "loss": 0.665, + "step": 6019 + }, + { + "epoch": 0.63, + "grad_norm": 2.235530948111977, + "learning_rate": 3.128582402969358e-06, + "loss": 0.6928, + "step": 6020 + }, + { + "epoch": 0.63, + "grad_norm": 2.5821548582113008, + "learning_rate": 3.1270021472435276e-06, + "loss": 0.6203, + "step": 6021 + }, + { + "epoch": 0.63, + "grad_norm": 2.637774136880771, + "learning_rate": 3.125422109117173e-06, + "loss": 0.66, + "step": 6022 + }, + { + "epoch": 0.63, + "grad_norm": 2.1103955615847716, + "learning_rate": 3.1238422887738596e-06, + "loss": 0.5965, + "step": 6023 + }, + { + "epoch": 0.63, + "grad_norm": 2.2341447165344492, + "learning_rate": 3.122262686397124e-06, + "loss": 0.741, + "step": 6024 + }, + { + "epoch": 0.63, + "grad_norm": 2.2231015638638967, + "learning_rate": 3.1206833021704843e-06, + "loss": 0.6181, + "step": 6025 + }, + { + "epoch": 0.63, + "grad_norm": 2.8157201183565475, + "learning_rate": 3.1191041362774246e-06, + "loss": 0.688, + "step": 6026 + }, + { + "epoch": 0.63, + "grad_norm": 2.6112171825573403, + "learning_rate": 3.117525188901409e-06, + "loss": 0.6337, + "step": 6027 + }, + { + "epoch": 0.63, + "grad_norm": 2.591086899022109, + "learning_rate": 3.115946460225875e-06, + "loss": 0.6666, + "step": 6028 + }, + { + "epoch": 0.63, + "grad_norm": 1.990144411097681, + "learning_rate": 3.1143679504342367e-06, + "loss": 0.607, + "step": 6029 + }, + { + "epoch": 0.63, + "grad_norm": 1.097695113019066, + "learning_rate": 3.1127896597098784e-06, + "loss": 0.5598, + "step": 6030 + }, + { + "epoch": 0.63, + "grad_norm": 4.043305343302304, + "learning_rate": 3.1112115882361605e-06, + "loss": 0.6536, + "step": 6031 + }, + { + "epoch": 0.63, + "grad_norm": 2.1224861128520143, + "learning_rate": 3.1096337361964213e-06, + "loss": 0.6197, + "step": 6032 + }, + { + "epoch": 0.63, + "grad_norm": 2.2716747161287523, + "learning_rate": 3.108056103773972e-06, + "loss": 0.6368, + "step": 6033 + }, + { + "epoch": 0.63, + "grad_norm": 2.3153801155163998, + "learning_rate": 3.106478691152094e-06, + "loss": 0.6325, + "step": 6034 + }, + { + "epoch": 0.64, + "grad_norm": 2.5772248457210116, + "learning_rate": 3.1049014985140468e-06, + "loss": 0.6982, + "step": 6035 + }, + { + "epoch": 0.64, + "grad_norm": 3.92355644648877, + "learning_rate": 3.103324526043066e-06, + "loss": 0.6235, + "step": 6036 + }, + { + "epoch": 0.64, + "grad_norm": 2.394493973433464, + "learning_rate": 3.101747773922359e-06, + "loss": 0.5862, + "step": 6037 + }, + { + "epoch": 0.64, + "grad_norm": 2.840825066417566, + "learning_rate": 3.100171242335109e-06, + "loss": 0.6773, + "step": 6038 + }, + { + "epoch": 0.64, + "grad_norm": 2.3142902400992083, + "learning_rate": 3.0985949314644724e-06, + "loss": 0.6347, + "step": 6039 + }, + { + "epoch": 0.64, + "grad_norm": 3.1693698090325952, + "learning_rate": 3.09701884149358e-06, + "loss": 0.632, + "step": 6040 + }, + { + "epoch": 0.64, + "grad_norm": 2.174190785161442, + "learning_rate": 3.0954429726055367e-06, + "loss": 0.5664, + "step": 6041 + }, + { + "epoch": 0.64, + "grad_norm": 2.427368074809071, + "learning_rate": 3.093867324983425e-06, + "loss": 0.6002, + "step": 6042 + }, + { + "epoch": 0.64, + "grad_norm": 0.98991358532084, + "learning_rate": 3.0922918988102968e-06, + "loss": 0.5889, + "step": 6043 + }, + { + "epoch": 0.64, + "grad_norm": 3.16305476049039, + "learning_rate": 3.0907166942691804e-06, + "loss": 0.5656, + "step": 6044 + }, + { + "epoch": 0.64, + "grad_norm": 2.3426174204082426, + "learning_rate": 3.0891417115430794e-06, + "loss": 0.5778, + "step": 6045 + }, + { + "epoch": 0.64, + "grad_norm": 2.695431267278061, + "learning_rate": 3.08756695081497e-06, + "loss": 0.662, + "step": 6046 + }, + { + "epoch": 0.64, + "grad_norm": 2.4120245938951017, + "learning_rate": 3.085992412267807e-06, + "loss": 0.5995, + "step": 6047 + }, + { + "epoch": 0.64, + "grad_norm": 2.0993135251738906, + "learning_rate": 3.08441809608451e-06, + "loss": 0.6046, + "step": 6048 + }, + { + "epoch": 0.64, + "grad_norm": 2.120694022813388, + "learning_rate": 3.0828440024479823e-06, + "loss": 0.597, + "step": 6049 + }, + { + "epoch": 0.64, + "grad_norm": 2.164550468698215, + "learning_rate": 3.081270131541094e-06, + "loss": 0.6243, + "step": 6050 + }, + { + "epoch": 0.64, + "grad_norm": 2.499720888315672, + "learning_rate": 3.079696483546699e-06, + "loss": 0.5662, + "step": 6051 + }, + { + "epoch": 0.64, + "grad_norm": 2.19241543023071, + "learning_rate": 3.078123058647614e-06, + "loss": 0.645, + "step": 6052 + }, + { + "epoch": 0.64, + "grad_norm": 2.1759008939791316, + "learning_rate": 3.0765498570266354e-06, + "loss": 0.5636, + "step": 6053 + }, + { + "epoch": 0.64, + "grad_norm": 2.481766219102531, + "learning_rate": 3.074976878866536e-06, + "loss": 0.5773, + "step": 6054 + }, + { + "epoch": 0.64, + "grad_norm": 2.6119108357768677, + "learning_rate": 3.0734041243500578e-06, + "loss": 0.6492, + "step": 6055 + }, + { + "epoch": 0.64, + "grad_norm": 2.1005618832159336, + "learning_rate": 3.0718315936599184e-06, + "loss": 0.6326, + "step": 6056 + }, + { + "epoch": 0.64, + "grad_norm": 2.1970627941200704, + "learning_rate": 3.0702592869788105e-06, + "loss": 0.5606, + "step": 6057 + }, + { + "epoch": 0.64, + "grad_norm": 1.0059803541239813, + "learning_rate": 3.0686872044894014e-06, + "loss": 0.5722, + "step": 6058 + }, + { + "epoch": 0.64, + "grad_norm": 3.2015112152252043, + "learning_rate": 3.0671153463743282e-06, + "loss": 0.641, + "step": 6059 + }, + { + "epoch": 0.64, + "grad_norm": 1.0191280750132719, + "learning_rate": 3.0655437128162093e-06, + "loss": 0.5936, + "step": 6060 + }, + { + "epoch": 0.64, + "grad_norm": 6.045650809744414, + "learning_rate": 3.0639723039976284e-06, + "loss": 0.6325, + "step": 6061 + }, + { + "epoch": 0.64, + "grad_norm": 2.2246240058605222, + "learning_rate": 3.062401120101149e-06, + "loss": 0.6262, + "step": 6062 + }, + { + "epoch": 0.64, + "grad_norm": 3.146655954063514, + "learning_rate": 3.060830161309305e-06, + "loss": 0.699, + "step": 6063 + }, + { + "epoch": 0.64, + "grad_norm": 2.0936615072845663, + "learning_rate": 3.05925942780461e-06, + "loss": 0.5818, + "step": 6064 + }, + { + "epoch": 0.64, + "grad_norm": 4.660109768200281, + "learning_rate": 3.0576889197695435e-06, + "loss": 0.5617, + "step": 6065 + }, + { + "epoch": 0.64, + "grad_norm": 2.2782490312138446, + "learning_rate": 3.0561186373865625e-06, + "loss": 0.5962, + "step": 6066 + }, + { + "epoch": 0.64, + "grad_norm": 2.5339115449409535, + "learning_rate": 3.054548580838099e-06, + "loss": 0.6213, + "step": 6067 + }, + { + "epoch": 0.64, + "grad_norm": 2.145966200390791, + "learning_rate": 3.05297875030656e-06, + "loss": 0.5802, + "step": 6068 + }, + { + "epoch": 0.64, + "grad_norm": 3.023290228663878, + "learning_rate": 3.05140914597432e-06, + "loss": 0.6221, + "step": 6069 + }, + { + "epoch": 0.64, + "grad_norm": 2.7573240624259157, + "learning_rate": 3.049839768023732e-06, + "loss": 0.5638, + "step": 6070 + }, + { + "epoch": 0.64, + "grad_norm": 10.360362698449237, + "learning_rate": 3.0482706166371236e-06, + "loss": 0.6423, + "step": 6071 + }, + { + "epoch": 0.64, + "grad_norm": 2.7058889588734343, + "learning_rate": 3.0467016919967908e-06, + "loss": 0.6319, + "step": 6072 + }, + { + "epoch": 0.64, + "grad_norm": 2.38161674800116, + "learning_rate": 3.0451329942850117e-06, + "loss": 0.6777, + "step": 6073 + }, + { + "epoch": 0.64, + "grad_norm": 2.4754236044435, + "learning_rate": 3.0435645236840296e-06, + "loss": 0.6553, + "step": 6074 + }, + { + "epoch": 0.64, + "grad_norm": 2.505835959851159, + "learning_rate": 3.041996280376066e-06, + "loss": 0.5921, + "step": 6075 + }, + { + "epoch": 0.64, + "grad_norm": 2.2325223472969085, + "learning_rate": 3.0404282645433125e-06, + "loss": 0.6708, + "step": 6076 + }, + { + "epoch": 0.64, + "grad_norm": 2.4229637130342065, + "learning_rate": 3.038860476367942e-06, + "loss": 0.5817, + "step": 6077 + }, + { + "epoch": 0.64, + "grad_norm": 2.2424616876949837, + "learning_rate": 3.03729291603209e-06, + "loss": 0.6665, + "step": 6078 + }, + { + "epoch": 0.64, + "grad_norm": 2.196951713754301, + "learning_rate": 3.0357255837178733e-06, + "loss": 0.6736, + "step": 6079 + }, + { + "epoch": 0.64, + "grad_norm": 3.412747433968496, + "learning_rate": 3.034158479607381e-06, + "loss": 0.6189, + "step": 6080 + }, + { + "epoch": 0.64, + "grad_norm": 2.3144405798445638, + "learning_rate": 3.032591603882674e-06, + "loss": 0.6844, + "step": 6081 + }, + { + "epoch": 0.64, + "grad_norm": 2.7027103937151957, + "learning_rate": 3.031024956725787e-06, + "loss": 0.6343, + "step": 6082 + }, + { + "epoch": 0.64, + "grad_norm": 2.612524722737757, + "learning_rate": 3.029458538318728e-06, + "loss": 0.6779, + "step": 6083 + }, + { + "epoch": 0.64, + "grad_norm": 2.5457464342733767, + "learning_rate": 3.02789234884348e-06, + "loss": 0.6377, + "step": 6084 + }, + { + "epoch": 0.64, + "grad_norm": 1.9720571167689611, + "learning_rate": 3.0263263884819975e-06, + "loss": 0.6417, + "step": 6085 + }, + { + "epoch": 0.64, + "grad_norm": 2.847776105778556, + "learning_rate": 3.0247606574162127e-06, + "loss": 0.6391, + "step": 6086 + }, + { + "epoch": 0.64, + "grad_norm": 2.33929101232662, + "learning_rate": 3.0231951558280226e-06, + "loss": 0.6156, + "step": 6087 + }, + { + "epoch": 0.64, + "grad_norm": 2.5013360523106787, + "learning_rate": 3.0216298838993043e-06, + "loss": 0.7201, + "step": 6088 + }, + { + "epoch": 0.64, + "grad_norm": 3.089321590479491, + "learning_rate": 3.020064841811908e-06, + "loss": 0.6186, + "step": 6089 + }, + { + "epoch": 0.64, + "grad_norm": 2.7046583661780823, + "learning_rate": 3.018500029747657e-06, + "loss": 0.6985, + "step": 6090 + }, + { + "epoch": 0.64, + "grad_norm": 2.5221299383717657, + "learning_rate": 3.016935447888343e-06, + "loss": 0.6104, + "step": 6091 + }, + { + "epoch": 0.64, + "grad_norm": 2.1978338732670597, + "learning_rate": 3.015371096415735e-06, + "loss": 0.5294, + "step": 6092 + }, + { + "epoch": 0.64, + "grad_norm": 2.685524749270053, + "learning_rate": 3.0138069755115772e-06, + "loss": 0.672, + "step": 6093 + }, + { + "epoch": 0.64, + "grad_norm": 2.1195488615414546, + "learning_rate": 3.012243085357582e-06, + "loss": 0.6829, + "step": 6094 + }, + { + "epoch": 0.64, + "grad_norm": 2.577287209734468, + "learning_rate": 3.010679426135442e-06, + "loss": 0.7072, + "step": 6095 + }, + { + "epoch": 0.64, + "grad_norm": 6.13324035955566, + "learning_rate": 3.009115998026815e-06, + "loss": 0.5589, + "step": 6096 + }, + { + "epoch": 0.64, + "grad_norm": 2.274593312395244, + "learning_rate": 3.007552801213335e-06, + "loss": 0.6797, + "step": 6097 + }, + { + "epoch": 0.64, + "grad_norm": 0.8495761189029405, + "learning_rate": 3.0059898358766102e-06, + "loss": 0.5548, + "step": 6098 + }, + { + "epoch": 0.64, + "grad_norm": 2.263736591389369, + "learning_rate": 3.004427102198225e-06, + "loss": 0.6734, + "step": 6099 + }, + { + "epoch": 0.64, + "grad_norm": 2.428586454310511, + "learning_rate": 3.002864600359729e-06, + "loss": 0.6665, + "step": 6100 + }, + { + "epoch": 0.64, + "grad_norm": 2.6142648776527855, + "learning_rate": 3.0013023305426493e-06, + "loss": 0.665, + "step": 6101 + }, + { + "epoch": 0.64, + "grad_norm": 4.093099723594316, + "learning_rate": 2.9997402929284886e-06, + "loss": 0.5937, + "step": 6102 + }, + { + "epoch": 0.64, + "grad_norm": 0.981769329689394, + "learning_rate": 2.9981784876987195e-06, + "loss": 0.5496, + "step": 6103 + }, + { + "epoch": 0.64, + "grad_norm": 2.089324365056355, + "learning_rate": 2.996616915034786e-06, + "loss": 0.582, + "step": 6104 + }, + { + "epoch": 0.64, + "grad_norm": 3.283444978516847, + "learning_rate": 2.9950555751181067e-06, + "loss": 0.6011, + "step": 6105 + }, + { + "epoch": 0.64, + "grad_norm": 2.1322926764233148, + "learning_rate": 2.9934944681300764e-06, + "loss": 0.595, + "step": 6106 + }, + { + "epoch": 0.64, + "grad_norm": 2.6785042803171994, + "learning_rate": 2.9919335942520577e-06, + "loss": 0.604, + "step": 6107 + }, + { + "epoch": 0.64, + "grad_norm": 2.525026434174256, + "learning_rate": 2.9903729536653908e-06, + "loss": 0.5419, + "step": 6108 + }, + { + "epoch": 0.64, + "grad_norm": 4.332421592393996, + "learning_rate": 2.9888125465513838e-06, + "loss": 0.6079, + "step": 6109 + }, + { + "epoch": 0.64, + "grad_norm": 2.7390363662038246, + "learning_rate": 2.987252373091322e-06, + "loss": 0.5825, + "step": 6110 + }, + { + "epoch": 0.64, + "grad_norm": 2.6929057930774722, + "learning_rate": 2.9856924334664607e-06, + "loss": 0.5882, + "step": 6111 + }, + { + "epoch": 0.64, + "grad_norm": 2.8780256095081334, + "learning_rate": 2.9841327278580306e-06, + "loss": 0.5991, + "step": 6112 + }, + { + "epoch": 0.64, + "grad_norm": 3.211030034668102, + "learning_rate": 2.982573256447232e-06, + "loss": 0.6217, + "step": 6113 + }, + { + "epoch": 0.64, + "grad_norm": 2.076849408004536, + "learning_rate": 2.98101401941524e-06, + "loss": 0.6127, + "step": 6114 + }, + { + "epoch": 0.64, + "grad_norm": 2.8399499017577328, + "learning_rate": 2.979455016943204e-06, + "loss": 0.6985, + "step": 6115 + }, + { + "epoch": 0.64, + "grad_norm": 2.5380298634074285, + "learning_rate": 2.977896249212244e-06, + "loss": 0.6568, + "step": 6116 + }, + { + "epoch": 0.64, + "grad_norm": 2.1139041232082096, + "learning_rate": 2.976337716403452e-06, + "loss": 0.6043, + "step": 6117 + }, + { + "epoch": 0.64, + "grad_norm": 2.386270978789105, + "learning_rate": 2.974779418697893e-06, + "loss": 0.58, + "step": 6118 + }, + { + "epoch": 0.64, + "grad_norm": 2.249981645922609, + "learning_rate": 2.9732213562766076e-06, + "loss": 0.7059, + "step": 6119 + }, + { + "epoch": 0.64, + "grad_norm": 2.7953475276968995, + "learning_rate": 2.9716635293206054e-06, + "loss": 0.5731, + "step": 6120 + }, + { + "epoch": 0.64, + "grad_norm": 2.1595949678722386, + "learning_rate": 2.9701059380108732e-06, + "loss": 0.6399, + "step": 6121 + }, + { + "epoch": 0.64, + "grad_norm": 4.250031089313065, + "learning_rate": 2.9685485825283646e-06, + "loss": 0.6693, + "step": 6122 + }, + { + "epoch": 0.64, + "grad_norm": 2.716682335751081, + "learning_rate": 2.9669914630540074e-06, + "loss": 0.5684, + "step": 6123 + }, + { + "epoch": 0.64, + "grad_norm": 2.11893087796511, + "learning_rate": 2.9654345797687067e-06, + "loss": 0.6003, + "step": 6124 + }, + { + "epoch": 0.64, + "grad_norm": 3.1494558741601413, + "learning_rate": 2.9638779328533363e-06, + "loss": 0.5924, + "step": 6125 + }, + { + "epoch": 0.64, + "grad_norm": 2.6969375293675673, + "learning_rate": 2.9623215224887405e-06, + "loss": 0.6277, + "step": 6126 + }, + { + "epoch": 0.64, + "grad_norm": 5.129836907789918, + "learning_rate": 2.9607653488557385e-06, + "loss": 0.6265, + "step": 6127 + }, + { + "epoch": 0.64, + "grad_norm": 2.24499842881353, + "learning_rate": 2.9592094121351257e-06, + "loss": 0.5141, + "step": 6128 + }, + { + "epoch": 0.64, + "grad_norm": 1.1727226381138738, + "learning_rate": 2.9576537125076644e-06, + "loss": 0.5722, + "step": 6129 + }, + { + "epoch": 0.65, + "grad_norm": 2.4525235655286766, + "learning_rate": 2.956098250154089e-06, + "loss": 0.628, + "step": 6130 + }, + { + "epoch": 0.65, + "grad_norm": 3.0032073192917212, + "learning_rate": 2.954543025255111e-06, + "loss": 0.6489, + "step": 6131 + }, + { + "epoch": 0.65, + "grad_norm": 2.2967514557097366, + "learning_rate": 2.9529880379914123e-06, + "loss": 0.6079, + "step": 6132 + }, + { + "epoch": 0.65, + "grad_norm": 3.9813396803758367, + "learning_rate": 2.9514332885436447e-06, + "loss": 0.6043, + "step": 6133 + }, + { + "epoch": 0.65, + "grad_norm": 2.374827795762819, + "learning_rate": 2.9498787770924375e-06, + "loss": 0.6733, + "step": 6134 + }, + { + "epoch": 0.65, + "grad_norm": 2.8305018905675694, + "learning_rate": 2.9483245038183874e-06, + "loss": 0.6773, + "step": 6135 + }, + { + "epoch": 0.65, + "grad_norm": 2.729421853905016, + "learning_rate": 2.946770468902064e-06, + "loss": 0.6898, + "step": 6136 + }, + { + "epoch": 0.65, + "grad_norm": 0.9984646956788515, + "learning_rate": 2.945216672524014e-06, + "loss": 0.6088, + "step": 6137 + }, + { + "epoch": 0.65, + "grad_norm": 2.667763450610834, + "learning_rate": 2.943663114864752e-06, + "loss": 0.577, + "step": 6138 + }, + { + "epoch": 0.65, + "grad_norm": 2.802310695654871, + "learning_rate": 2.9421097961047633e-06, + "loss": 0.5924, + "step": 6139 + }, + { + "epoch": 0.65, + "grad_norm": 2.2866904253496174, + "learning_rate": 2.9405567164245096e-06, + "loss": 0.6054, + "step": 6140 + }, + { + "epoch": 0.65, + "grad_norm": 3.2953941016991246, + "learning_rate": 2.939003876004424e-06, + "loss": 0.6667, + "step": 6141 + }, + { + "epoch": 0.65, + "grad_norm": 4.230139296054894, + "learning_rate": 2.9374512750249098e-06, + "loss": 0.594, + "step": 6142 + }, + { + "epoch": 0.65, + "grad_norm": 2.2675620169968975, + "learning_rate": 2.935898913666345e-06, + "loss": 0.6655, + "step": 6143 + }, + { + "epoch": 0.65, + "grad_norm": 2.459430858178251, + "learning_rate": 2.9343467921090774e-06, + "loss": 0.655, + "step": 6144 + }, + { + "epoch": 0.65, + "grad_norm": 2.267669278148576, + "learning_rate": 2.9327949105334284e-06, + "loss": 0.633, + "step": 6145 + }, + { + "epoch": 0.65, + "grad_norm": 2.4328862273124567, + "learning_rate": 2.93124326911969e-06, + "loss": 0.6138, + "step": 6146 + }, + { + "epoch": 0.65, + "grad_norm": 0.8997361338805473, + "learning_rate": 2.9296918680481308e-06, + "loss": 0.5905, + "step": 6147 + }, + { + "epoch": 0.65, + "grad_norm": 2.7491447988759568, + "learning_rate": 2.928140707498984e-06, + "loss": 0.6503, + "step": 6148 + }, + { + "epoch": 0.65, + "grad_norm": 3.6968575644202506, + "learning_rate": 2.92658978765246e-06, + "loss": 0.5916, + "step": 6149 + }, + { + "epoch": 0.65, + "grad_norm": 3.5697760256627373, + "learning_rate": 2.925039108688742e-06, + "loss": 0.638, + "step": 6150 + }, + { + "epoch": 0.65, + "grad_norm": 2.049482671759394, + "learning_rate": 2.9234886707879827e-06, + "loss": 0.6333, + "step": 6151 + }, + { + "epoch": 0.65, + "grad_norm": 2.446291237259445, + "learning_rate": 2.921938474130307e-06, + "loss": 0.652, + "step": 6152 + }, + { + "epoch": 0.65, + "grad_norm": 2.200019946516808, + "learning_rate": 2.9203885188958103e-06, + "loss": 0.5058, + "step": 6153 + }, + { + "epoch": 0.65, + "grad_norm": 3.0764287200010685, + "learning_rate": 2.9188388052645656e-06, + "loss": 0.6562, + "step": 6154 + }, + { + "epoch": 0.65, + "grad_norm": 3.3263889642902593, + "learning_rate": 2.9172893334166108e-06, + "loss": 0.5888, + "step": 6155 + }, + { + "epoch": 0.65, + "grad_norm": 2.164471451455957, + "learning_rate": 2.915740103531963e-06, + "loss": 0.5748, + "step": 6156 + }, + { + "epoch": 0.65, + "grad_norm": 3.0166454764249417, + "learning_rate": 2.9141911157906032e-06, + "loss": 0.6495, + "step": 6157 + }, + { + "epoch": 0.65, + "grad_norm": 2.0796493091839743, + "learning_rate": 2.9126423703724925e-06, + "loss": 0.5941, + "step": 6158 + }, + { + "epoch": 0.65, + "grad_norm": 2.472853403516785, + "learning_rate": 2.911093867457555e-06, + "loss": 0.6391, + "step": 6159 + }, + { + "epoch": 0.65, + "grad_norm": 2.2590642055205663, + "learning_rate": 2.9095456072256955e-06, + "loss": 0.6039, + "step": 6160 + }, + { + "epoch": 0.65, + "grad_norm": 2.1770319394327617, + "learning_rate": 2.9079975898567823e-06, + "loss": 0.6178, + "step": 6161 + }, + { + "epoch": 0.65, + "grad_norm": 2.4623318761495, + "learning_rate": 2.906449815530664e-06, + "loss": 0.6449, + "step": 6162 + }, + { + "epoch": 0.65, + "grad_norm": 2.4317346877522916, + "learning_rate": 2.9049022844271517e-06, + "loss": 0.688, + "step": 6163 + }, + { + "epoch": 0.65, + "grad_norm": 2.427285093977721, + "learning_rate": 2.9033549967260383e-06, + "loss": 0.6544, + "step": 6164 + }, + { + "epoch": 0.65, + "grad_norm": 2.4834448464193644, + "learning_rate": 2.9018079526070786e-06, + "loss": 0.5716, + "step": 6165 + }, + { + "epoch": 0.65, + "grad_norm": 3.7395209019829614, + "learning_rate": 2.900261152250007e-06, + "loss": 0.542, + "step": 6166 + }, + { + "epoch": 0.65, + "grad_norm": 3.0529654367037993, + "learning_rate": 2.8987145958345235e-06, + "loss": 0.5837, + "step": 6167 + }, + { + "epoch": 0.65, + "grad_norm": 3.1062690132576405, + "learning_rate": 2.8971682835403043e-06, + "loss": 0.6021, + "step": 6168 + }, + { + "epoch": 0.65, + "grad_norm": 2.8007780098866983, + "learning_rate": 2.895622215546997e-06, + "loss": 0.5921, + "step": 6169 + }, + { + "epoch": 0.65, + "grad_norm": 3.200411944525784, + "learning_rate": 2.8940763920342153e-06, + "loss": 0.6037, + "step": 6170 + }, + { + "epoch": 0.65, + "grad_norm": 4.858054420142283, + "learning_rate": 2.892530813181553e-06, + "loss": 0.6641, + "step": 6171 + }, + { + "epoch": 0.65, + "grad_norm": 2.5625816954495666, + "learning_rate": 2.8909854791685666e-06, + "loss": 0.5688, + "step": 6172 + }, + { + "epoch": 0.65, + "grad_norm": 2.715416045079819, + "learning_rate": 2.889440390174793e-06, + "loss": 0.5802, + "step": 6173 + }, + { + "epoch": 0.65, + "grad_norm": 2.6641433870808893, + "learning_rate": 2.887895546379732e-06, + "loss": 0.5821, + "step": 6174 + }, + { + "epoch": 0.65, + "grad_norm": 2.416515122562737, + "learning_rate": 2.8863509479628626e-06, + "loss": 0.6671, + "step": 6175 + }, + { + "epoch": 0.65, + "grad_norm": 2.836952564545351, + "learning_rate": 2.884806595103628e-06, + "loss": 0.6201, + "step": 6176 + }, + { + "epoch": 0.65, + "grad_norm": 2.588917907113869, + "learning_rate": 2.8832624879814507e-06, + "loss": 0.5886, + "step": 6177 + }, + { + "epoch": 0.65, + "grad_norm": 6.571817028335935, + "learning_rate": 2.8817186267757173e-06, + "loss": 0.5718, + "step": 6178 + }, + { + "epoch": 0.65, + "grad_norm": 2.812640707900028, + "learning_rate": 2.88017501166579e-06, + "loss": 0.6379, + "step": 6179 + }, + { + "epoch": 0.65, + "grad_norm": 2.626920559291416, + "learning_rate": 2.8786316428310046e-06, + "loss": 0.5918, + "step": 6180 + }, + { + "epoch": 0.65, + "grad_norm": 2.798186376430685, + "learning_rate": 2.8770885204506603e-06, + "loss": 0.6388, + "step": 6181 + }, + { + "epoch": 0.65, + "grad_norm": 2.751589330763615, + "learning_rate": 2.8755456447040362e-06, + "loss": 0.573, + "step": 6182 + }, + { + "epoch": 0.65, + "grad_norm": 2.28473507954531, + "learning_rate": 2.874003015770377e-06, + "loss": 0.6162, + "step": 6183 + }, + { + "epoch": 0.65, + "grad_norm": 2.557119980309637, + "learning_rate": 2.872460633828904e-06, + "loss": 0.5865, + "step": 6184 + }, + { + "epoch": 0.65, + "grad_norm": 3.1804128048003784, + "learning_rate": 2.8709184990588012e-06, + "loss": 0.6432, + "step": 6185 + }, + { + "epoch": 0.65, + "grad_norm": 2.6384781468932905, + "learning_rate": 2.869376611639236e-06, + "loss": 0.6571, + "step": 6186 + }, + { + "epoch": 0.65, + "grad_norm": 2.595212904243066, + "learning_rate": 2.8678349717493343e-06, + "loss": 0.6985, + "step": 6187 + }, + { + "epoch": 0.65, + "grad_norm": 3.0950307332286857, + "learning_rate": 2.8662935795682046e-06, + "loss": 0.6029, + "step": 6188 + }, + { + "epoch": 0.65, + "grad_norm": 3.345799472219859, + "learning_rate": 2.864752435274916e-06, + "loss": 0.6059, + "step": 6189 + }, + { + "epoch": 0.65, + "grad_norm": 2.25918445844699, + "learning_rate": 2.8632115390485176e-06, + "loss": 0.676, + "step": 6190 + }, + { + "epoch": 0.65, + "grad_norm": 2.322802206550409, + "learning_rate": 2.8616708910680278e-06, + "loss": 0.6211, + "step": 6191 + }, + { + "epoch": 0.65, + "grad_norm": 2.523140689849376, + "learning_rate": 2.8601304915124305e-06, + "loss": 0.6485, + "step": 6192 + }, + { + "epoch": 0.65, + "grad_norm": 3.013530763468245, + "learning_rate": 2.85859034056069e-06, + "loss": 0.6619, + "step": 6193 + }, + { + "epoch": 0.65, + "grad_norm": 3.291612700420486, + "learning_rate": 2.8570504383917296e-06, + "loss": 0.5981, + "step": 6194 + }, + { + "epoch": 0.65, + "grad_norm": 2.4140328386952463, + "learning_rate": 2.8555107851844576e-06, + "loss": 0.5461, + "step": 6195 + }, + { + "epoch": 0.65, + "grad_norm": 2.687609928584908, + "learning_rate": 2.8539713811177418e-06, + "loss": 0.6283, + "step": 6196 + }, + { + "epoch": 0.65, + "grad_norm": 2.3244100962522163, + "learning_rate": 2.8524322263704297e-06, + "loss": 0.5716, + "step": 6197 + }, + { + "epoch": 0.65, + "grad_norm": 2.1973658455412997, + "learning_rate": 2.8508933211213306e-06, + "loss": 0.531, + "step": 6198 + }, + { + "epoch": 0.65, + "grad_norm": 2.0935858472905258, + "learning_rate": 2.8493546655492356e-06, + "loss": 0.598, + "step": 6199 + }, + { + "epoch": 0.65, + "grad_norm": 2.77092033346428, + "learning_rate": 2.8478162598328963e-06, + "loss": 0.552, + "step": 6200 + }, + { + "epoch": 0.65, + "grad_norm": 2.624275390466366, + "learning_rate": 2.8462781041510446e-06, + "loss": 0.666, + "step": 6201 + }, + { + "epoch": 0.65, + "grad_norm": 2.1540146591782405, + "learning_rate": 2.8447401986823752e-06, + "loss": 0.6687, + "step": 6202 + }, + { + "epoch": 0.65, + "grad_norm": 2.263646788490095, + "learning_rate": 2.8432025436055593e-06, + "loss": 0.6402, + "step": 6203 + }, + { + "epoch": 0.65, + "grad_norm": 2.537185698965246, + "learning_rate": 2.841665139099239e-06, + "loss": 0.6249, + "step": 6204 + }, + { + "epoch": 0.65, + "grad_norm": 2.4385362216870554, + "learning_rate": 2.8401279853420216e-06, + "loss": 0.6824, + "step": 6205 + }, + { + "epoch": 0.65, + "grad_norm": 1.1466991106289137, + "learning_rate": 2.838591082512494e-06, + "loss": 0.5709, + "step": 6206 + }, + { + "epoch": 0.65, + "grad_norm": 2.076550784919779, + "learning_rate": 2.837054430789204e-06, + "loss": 0.6901, + "step": 6207 + }, + { + "epoch": 0.65, + "grad_norm": 2.7605170118341413, + "learning_rate": 2.8355180303506803e-06, + "loss": 0.6498, + "step": 6208 + }, + { + "epoch": 0.65, + "grad_norm": 2.224223567454739, + "learning_rate": 2.8339818813754115e-06, + "loss": 0.6497, + "step": 6209 + }, + { + "epoch": 0.65, + "grad_norm": 2.348664612330819, + "learning_rate": 2.8324459840418694e-06, + "loss": 0.6267, + "step": 6210 + }, + { + "epoch": 0.65, + "grad_norm": 2.9347639572736144, + "learning_rate": 2.8309103385284853e-06, + "loss": 0.6423, + "step": 6211 + }, + { + "epoch": 0.65, + "grad_norm": 2.408795609146555, + "learning_rate": 2.82937494501367e-06, + "loss": 0.6295, + "step": 6212 + }, + { + "epoch": 0.65, + "grad_norm": 3.8368693625535277, + "learning_rate": 2.8278398036757963e-06, + "loss": 0.5594, + "step": 6213 + }, + { + "epoch": 0.65, + "grad_norm": 2.51892710402293, + "learning_rate": 2.8263049146932153e-06, + "loss": 0.5737, + "step": 6214 + }, + { + "epoch": 0.65, + "grad_norm": 3.302046825233197, + "learning_rate": 2.8247702782442483e-06, + "loss": 0.6769, + "step": 6215 + }, + { + "epoch": 0.65, + "grad_norm": 3.85067453672738, + "learning_rate": 2.8232358945071804e-06, + "loss": 0.5899, + "step": 6216 + }, + { + "epoch": 0.65, + "grad_norm": 2.9563736592529963, + "learning_rate": 2.821701763660275e-06, + "loss": 0.6199, + "step": 6217 + }, + { + "epoch": 0.65, + "grad_norm": 2.8979485285850846, + "learning_rate": 2.8201678858817604e-06, + "loss": 0.5747, + "step": 6218 + }, + { + "epoch": 0.65, + "grad_norm": 6.242630295137544, + "learning_rate": 2.818634261349842e-06, + "loss": 0.5828, + "step": 6219 + }, + { + "epoch": 0.65, + "grad_norm": 2.8275662714905065, + "learning_rate": 2.8171008902426865e-06, + "loss": 0.702, + "step": 6220 + }, + { + "epoch": 0.65, + "grad_norm": 2.2586239782876385, + "learning_rate": 2.8155677727384422e-06, + "loss": 0.5648, + "step": 6221 + }, + { + "epoch": 0.65, + "grad_norm": 3.5846393384463773, + "learning_rate": 2.814034909015217e-06, + "loss": 0.6001, + "step": 6222 + }, + { + "epoch": 0.65, + "grad_norm": 2.736699486370083, + "learning_rate": 2.8125022992510997e-06, + "loss": 0.5504, + "step": 6223 + }, + { + "epoch": 0.65, + "grad_norm": 2.982453703063961, + "learning_rate": 2.810969943624139e-06, + "loss": 0.6271, + "step": 6224 + }, + { + "epoch": 0.66, + "grad_norm": 1.0378158485658642, + "learning_rate": 2.8094378423123646e-06, + "loss": 0.5252, + "step": 6225 + }, + { + "epoch": 0.66, + "grad_norm": 2.368853983880845, + "learning_rate": 2.807905995493768e-06, + "loss": 0.5657, + "step": 6226 + }, + { + "epoch": 0.66, + "grad_norm": 2.6453989305193906, + "learning_rate": 2.8063744033463157e-06, + "loss": 0.6209, + "step": 6227 + }, + { + "epoch": 0.66, + "grad_norm": 3.5318565272857363, + "learning_rate": 2.8048430660479463e-06, + "loss": 0.5866, + "step": 6228 + }, + { + "epoch": 0.66, + "grad_norm": 3.8249719217174523, + "learning_rate": 2.803311983776562e-06, + "loss": 0.6002, + "step": 6229 + }, + { + "epoch": 0.66, + "grad_norm": 2.4969861662636537, + "learning_rate": 2.8017811567100434e-06, + "loss": 0.6462, + "step": 6230 + }, + { + "epoch": 0.66, + "grad_norm": 2.3785984897135353, + "learning_rate": 2.8002505850262334e-06, + "loss": 0.5927, + "step": 6231 + }, + { + "epoch": 0.66, + "grad_norm": 2.141017713761464, + "learning_rate": 2.7987202689029535e-06, + "loss": 0.6416, + "step": 6232 + }, + { + "epoch": 0.66, + "grad_norm": 2.9090874878260973, + "learning_rate": 2.797190208517988e-06, + "loss": 0.6156, + "step": 6233 + }, + { + "epoch": 0.66, + "grad_norm": 2.0212371235951685, + "learning_rate": 2.795660404049098e-06, + "loss": 0.6247, + "step": 6234 + }, + { + "epoch": 0.66, + "grad_norm": 2.2096556030865004, + "learning_rate": 2.794130855674009e-06, + "loss": 0.6012, + "step": 6235 + }, + { + "epoch": 0.66, + "grad_norm": 2.2361232833661924, + "learning_rate": 2.7926015635704216e-06, + "loss": 0.6237, + "step": 6236 + }, + { + "epoch": 0.66, + "grad_norm": 2.605730880983019, + "learning_rate": 2.7910725279160016e-06, + "loss": 0.6243, + "step": 6237 + }, + { + "epoch": 0.66, + "grad_norm": 2.33390242289277, + "learning_rate": 2.78954374888839e-06, + "loss": 0.6674, + "step": 6238 + }, + { + "epoch": 0.66, + "grad_norm": 3.329689514740396, + "learning_rate": 2.7880152266651985e-06, + "loss": 0.6576, + "step": 6239 + }, + { + "epoch": 0.66, + "grad_norm": 4.400052397965454, + "learning_rate": 2.7864869614240013e-06, + "loss": 0.5916, + "step": 6240 + }, + { + "epoch": 0.66, + "grad_norm": 2.437094841587654, + "learning_rate": 2.7849589533423526e-06, + "loss": 0.6769, + "step": 6241 + }, + { + "epoch": 0.66, + "grad_norm": 2.158441047716093, + "learning_rate": 2.783431202597767e-06, + "loss": 0.5817, + "step": 6242 + }, + { + "epoch": 0.66, + "grad_norm": 3.202778092080676, + "learning_rate": 2.78190370936774e-06, + "loss": 0.6905, + "step": 6243 + }, + { + "epoch": 0.66, + "grad_norm": 2.181636718933403, + "learning_rate": 2.7803764738297257e-06, + "loss": 0.6753, + "step": 6244 + }, + { + "epoch": 0.66, + "grad_norm": 2.169908589296973, + "learning_rate": 2.7788494961611577e-06, + "loss": 0.6062, + "step": 6245 + }, + { + "epoch": 0.66, + "grad_norm": 2.1889402639310336, + "learning_rate": 2.7773227765394335e-06, + "loss": 0.6787, + "step": 6246 + }, + { + "epoch": 0.66, + "grad_norm": 2.6886102911065244, + "learning_rate": 2.7757963151419255e-06, + "loss": 0.6211, + "step": 6247 + }, + { + "epoch": 0.66, + "grad_norm": 3.620084649426985, + "learning_rate": 2.7742701121459703e-06, + "loss": 0.5493, + "step": 6248 + }, + { + "epoch": 0.66, + "grad_norm": 2.0808691207938845, + "learning_rate": 2.772744167728879e-06, + "loss": 0.6245, + "step": 6249 + }, + { + "epoch": 0.66, + "grad_norm": 0.94937587598107, + "learning_rate": 2.7712184820679343e-06, + "loss": 0.5034, + "step": 6250 + }, + { + "epoch": 0.66, + "grad_norm": 2.750670337900543, + "learning_rate": 2.7696930553403817e-06, + "loss": 0.6075, + "step": 6251 + }, + { + "epoch": 0.66, + "grad_norm": 1.8167968242924053, + "learning_rate": 2.7681678877234446e-06, + "loss": 0.567, + "step": 6252 + }, + { + "epoch": 0.66, + "grad_norm": 2.3048720790451163, + "learning_rate": 2.7666429793943087e-06, + "loss": 0.6424, + "step": 6253 + }, + { + "epoch": 0.66, + "grad_norm": 2.978504924873048, + "learning_rate": 2.765118330530138e-06, + "loss": 0.6175, + "step": 6254 + }, + { + "epoch": 0.66, + "grad_norm": 2.9075301291638174, + "learning_rate": 2.763593941308057e-06, + "loss": 0.6043, + "step": 6255 + }, + { + "epoch": 0.66, + "grad_norm": 3.3429392341032482, + "learning_rate": 2.7620698119051687e-06, + "loss": 0.6713, + "step": 6256 + }, + { + "epoch": 0.66, + "grad_norm": 2.077361667669363, + "learning_rate": 2.7605459424985387e-06, + "loss": 0.7026, + "step": 6257 + }, + { + "epoch": 0.66, + "grad_norm": 2.5927187596499923, + "learning_rate": 2.7590223332652096e-06, + "loss": 0.6464, + "step": 6258 + }, + { + "epoch": 0.66, + "grad_norm": 2.588598586482582, + "learning_rate": 2.7574989843821855e-06, + "loss": 0.7161, + "step": 6259 + }, + { + "epoch": 0.66, + "grad_norm": 2.9831247521107374, + "learning_rate": 2.7559758960264492e-06, + "loss": 0.6043, + "step": 6260 + }, + { + "epoch": 0.66, + "grad_norm": 13.839156626103529, + "learning_rate": 2.7544530683749447e-06, + "loss": 0.6524, + "step": 6261 + }, + { + "epoch": 0.66, + "grad_norm": 2.302955982405863, + "learning_rate": 2.7529305016045917e-06, + "loss": 0.6315, + "step": 6262 + }, + { + "epoch": 0.66, + "grad_norm": 4.745223080871031, + "learning_rate": 2.751408195892279e-06, + "loss": 0.6543, + "step": 6263 + }, + { + "epoch": 0.66, + "grad_norm": 3.2451920481026892, + "learning_rate": 2.74988615141486e-06, + "loss": 0.5986, + "step": 6264 + }, + { + "epoch": 0.66, + "grad_norm": 3.1406872121844263, + "learning_rate": 2.7483643683491658e-06, + "loss": 0.5726, + "step": 6265 + }, + { + "epoch": 0.66, + "grad_norm": 3.484418230228393, + "learning_rate": 2.7468428468719877e-06, + "loss": 0.7162, + "step": 6266 + }, + { + "epoch": 0.66, + "grad_norm": 2.329591736950578, + "learning_rate": 2.7453215871600967e-06, + "loss": 0.5947, + "step": 6267 + }, + { + "epoch": 0.66, + "grad_norm": 2.4045111597868307, + "learning_rate": 2.743800589390225e-06, + "loss": 0.6726, + "step": 6268 + }, + { + "epoch": 0.66, + "grad_norm": 2.3627978273372885, + "learning_rate": 2.74227985373908e-06, + "loss": 0.5535, + "step": 6269 + }, + { + "epoch": 0.66, + "grad_norm": 3.6457501096909266, + "learning_rate": 2.7407593803833333e-06, + "loss": 0.6602, + "step": 6270 + }, + { + "epoch": 0.66, + "grad_norm": 2.926138821288341, + "learning_rate": 2.7392391694996335e-06, + "loss": 0.7381, + "step": 6271 + }, + { + "epoch": 0.66, + "grad_norm": 1.1259023466885845, + "learning_rate": 2.7377192212645888e-06, + "loss": 0.5446, + "step": 6272 + }, + { + "epoch": 0.66, + "grad_norm": 2.231315217630402, + "learning_rate": 2.736199535854788e-06, + "loss": 0.566, + "step": 6273 + }, + { + "epoch": 0.66, + "grad_norm": 2.7856102084315983, + "learning_rate": 2.7346801134467794e-06, + "loss": 0.606, + "step": 6274 + }, + { + "epoch": 0.66, + "grad_norm": 2.6026849229772218, + "learning_rate": 2.733160954217086e-06, + "loss": 0.6111, + "step": 6275 + }, + { + "epoch": 0.66, + "grad_norm": 2.5134836440676724, + "learning_rate": 2.731642058342203e-06, + "loss": 0.5827, + "step": 6276 + }, + { + "epoch": 0.66, + "grad_norm": 2.2996578580915323, + "learning_rate": 2.7301234259985863e-06, + "loss": 0.5993, + "step": 6277 + }, + { + "epoch": 0.66, + "grad_norm": 2.937914077737516, + "learning_rate": 2.72860505736267e-06, + "loss": 0.6798, + "step": 6278 + }, + { + "epoch": 0.66, + "grad_norm": 2.9983367460524004, + "learning_rate": 2.7270869526108507e-06, + "loss": 0.6152, + "step": 6279 + }, + { + "epoch": 0.66, + "grad_norm": 2.6496495044995605, + "learning_rate": 2.7255691119195005e-06, + "loss": 0.6753, + "step": 6280 + }, + { + "epoch": 0.66, + "grad_norm": 2.138925305786583, + "learning_rate": 2.7240515354649545e-06, + "loss": 0.6051, + "step": 6281 + }, + { + "epoch": 0.66, + "grad_norm": 2.490293141221245, + "learning_rate": 2.722534223423524e-06, + "loss": 0.6335, + "step": 6282 + }, + { + "epoch": 0.66, + "grad_norm": 2.322910264522336, + "learning_rate": 2.721017175971482e-06, + "loss": 0.6405, + "step": 6283 + }, + { + "epoch": 0.66, + "grad_norm": 2.643925276634918, + "learning_rate": 2.719500393285076e-06, + "loss": 0.6404, + "step": 6284 + }, + { + "epoch": 0.66, + "grad_norm": 2.0080720283021827, + "learning_rate": 2.7179838755405253e-06, + "loss": 0.5701, + "step": 6285 + }, + { + "epoch": 0.66, + "grad_norm": 2.5525745426515476, + "learning_rate": 2.7164676229140098e-06, + "loss": 0.6459, + "step": 6286 + }, + { + "epoch": 0.66, + "grad_norm": 2.512640406594001, + "learning_rate": 2.714951635581684e-06, + "loss": 0.6733, + "step": 6287 + }, + { + "epoch": 0.66, + "grad_norm": 0.9599284170567002, + "learning_rate": 2.713435913719671e-06, + "loss": 0.5056, + "step": 6288 + }, + { + "epoch": 0.66, + "grad_norm": 2.2612027984032435, + "learning_rate": 2.7119204575040666e-06, + "loss": 0.6296, + "step": 6289 + }, + { + "epoch": 0.66, + "grad_norm": 2.856455315447502, + "learning_rate": 2.7104052671109267e-06, + "loss": 0.6801, + "step": 6290 + }, + { + "epoch": 0.66, + "grad_norm": 2.527783580967457, + "learning_rate": 2.708890342716286e-06, + "loss": 0.5643, + "step": 6291 + }, + { + "epoch": 0.66, + "grad_norm": 3.314451080373654, + "learning_rate": 2.7073756844961407e-06, + "loss": 0.6189, + "step": 6292 + }, + { + "epoch": 0.66, + "grad_norm": 2.232485359666232, + "learning_rate": 2.7058612926264634e-06, + "loss": 0.6106, + "step": 6293 + }, + { + "epoch": 0.66, + "grad_norm": 1.9749775537277983, + "learning_rate": 2.7043471672831866e-06, + "loss": 0.597, + "step": 6294 + }, + { + "epoch": 0.66, + "grad_norm": 2.570926833062, + "learning_rate": 2.7028333086422232e-06, + "loss": 0.5325, + "step": 6295 + }, + { + "epoch": 0.66, + "grad_norm": 2.58096398136124, + "learning_rate": 2.7013197168794424e-06, + "loss": 0.6335, + "step": 6296 + }, + { + "epoch": 0.66, + "grad_norm": 2.1566887676933293, + "learning_rate": 2.699806392170693e-06, + "loss": 0.6744, + "step": 6297 + }, + { + "epoch": 0.66, + "grad_norm": 2.2365763878000284, + "learning_rate": 2.698293334691789e-06, + "loss": 0.6178, + "step": 6298 + }, + { + "epoch": 0.66, + "grad_norm": 5.206253358431679, + "learning_rate": 2.69678054461851e-06, + "loss": 0.6438, + "step": 6299 + }, + { + "epoch": 0.66, + "grad_norm": 2.4308841962891514, + "learning_rate": 2.6952680221266116e-06, + "loss": 0.6907, + "step": 6300 + }, + { + "epoch": 0.66, + "grad_norm": 2.1155846785270302, + "learning_rate": 2.6937557673918096e-06, + "loss": 0.5513, + "step": 6301 + }, + { + "epoch": 0.66, + "grad_norm": 2.102074075556992, + "learning_rate": 2.692243780589798e-06, + "loss": 0.6146, + "step": 6302 + }, + { + "epoch": 0.66, + "grad_norm": 5.119358235355663, + "learning_rate": 2.6907320618962312e-06, + "loss": 0.6069, + "step": 6303 + }, + { + "epoch": 0.66, + "grad_norm": 2.52908697678268, + "learning_rate": 2.6892206114867402e-06, + "loss": 0.5663, + "step": 6304 + }, + { + "epoch": 0.66, + "grad_norm": 2.6044059425378716, + "learning_rate": 2.6877094295369167e-06, + "loss": 0.5864, + "step": 6305 + }, + { + "epoch": 0.66, + "grad_norm": 2.3129237260574933, + "learning_rate": 2.686198516222329e-06, + "loss": 0.6366, + "step": 6306 + }, + { + "epoch": 0.66, + "grad_norm": 2.699401934868816, + "learning_rate": 2.6846878717185076e-06, + "loss": 0.5805, + "step": 6307 + }, + { + "epoch": 0.66, + "grad_norm": 2.5503693982830806, + "learning_rate": 2.6831774962009582e-06, + "loss": 0.5873, + "step": 6308 + }, + { + "epoch": 0.66, + "grad_norm": 2.3528557272139152, + "learning_rate": 2.6816673898451486e-06, + "loss": 0.6644, + "step": 6309 + }, + { + "epoch": 0.66, + "grad_norm": 2.178316075395149, + "learning_rate": 2.680157552826519e-06, + "loss": 0.5794, + "step": 6310 + }, + { + "epoch": 0.66, + "grad_norm": 2.640485918878129, + "learning_rate": 2.6786479853204817e-06, + "loss": 0.7067, + "step": 6311 + }, + { + "epoch": 0.66, + "grad_norm": 2.296387374049739, + "learning_rate": 2.6771386875024087e-06, + "loss": 0.6266, + "step": 6312 + }, + { + "epoch": 0.66, + "grad_norm": 2.1896654863553193, + "learning_rate": 2.6756296595476504e-06, + "loss": 0.5763, + "step": 6313 + }, + { + "epoch": 0.66, + "grad_norm": 2.110961387612748, + "learning_rate": 2.674120901631517e-06, + "loss": 0.5546, + "step": 6314 + }, + { + "epoch": 0.66, + "grad_norm": 3.028470459580835, + "learning_rate": 2.6726124139292964e-06, + "loss": 0.59, + "step": 6315 + }, + { + "epoch": 0.66, + "grad_norm": 2.3573162744903935, + "learning_rate": 2.6711041966162356e-06, + "loss": 0.638, + "step": 6316 + }, + { + "epoch": 0.66, + "grad_norm": 2.738834086837961, + "learning_rate": 2.6695962498675588e-06, + "loss": 0.6098, + "step": 6317 + }, + { + "epoch": 0.66, + "grad_norm": 2.3705691571110727, + "learning_rate": 2.6680885738584512e-06, + "loss": 0.6687, + "step": 6318 + }, + { + "epoch": 0.66, + "grad_norm": 5.548261604744699, + "learning_rate": 2.6665811687640723e-06, + "loss": 0.6152, + "step": 6319 + }, + { + "epoch": 0.67, + "grad_norm": 2.3601953378527485, + "learning_rate": 2.66507403475955e-06, + "loss": 0.5781, + "step": 6320 + }, + { + "epoch": 0.67, + "grad_norm": 2.7282332847963287, + "learning_rate": 2.663567172019977e-06, + "loss": 0.6588, + "step": 6321 + }, + { + "epoch": 0.67, + "grad_norm": 3.647581964093641, + "learning_rate": 2.6620605807204134e-06, + "loss": 0.5947, + "step": 6322 + }, + { + "epoch": 0.67, + "grad_norm": 2.55204790802501, + "learning_rate": 2.660554261035894e-06, + "loss": 0.6317, + "step": 6323 + }, + { + "epoch": 0.67, + "grad_norm": 2.794476567030957, + "learning_rate": 2.659048213141419e-06, + "loss": 0.5403, + "step": 6324 + }, + { + "epoch": 0.67, + "grad_norm": 2.5186567163194873, + "learning_rate": 2.657542437211954e-06, + "loss": 0.6475, + "step": 6325 + }, + { + "epoch": 0.67, + "grad_norm": 2.715952090453819, + "learning_rate": 2.6560369334224396e-06, + "loss": 0.565, + "step": 6326 + }, + { + "epoch": 0.67, + "grad_norm": 2.700955615524671, + "learning_rate": 2.6545317019477764e-06, + "loss": 0.6937, + "step": 6327 + }, + { + "epoch": 0.67, + "grad_norm": 3.7950586784557565, + "learning_rate": 2.653026742962842e-06, + "loss": 0.6282, + "step": 6328 + }, + { + "epoch": 0.67, + "grad_norm": 1.932271397052837, + "learning_rate": 2.6515220566424735e-06, + "loss": 0.5145, + "step": 6329 + }, + { + "epoch": 0.67, + "grad_norm": 2.426433667015017, + "learning_rate": 2.6500176431614866e-06, + "loss": 0.6701, + "step": 6330 + }, + { + "epoch": 0.67, + "grad_norm": 2.267463500168662, + "learning_rate": 2.6485135026946545e-06, + "loss": 0.62, + "step": 6331 + }, + { + "epoch": 0.67, + "grad_norm": 2.4952915986161748, + "learning_rate": 2.6470096354167264e-06, + "loss": 0.6529, + "step": 6332 + }, + { + "epoch": 0.67, + "grad_norm": 4.28610014397335, + "learning_rate": 2.645506041502419e-06, + "loss": 0.5839, + "step": 6333 + }, + { + "epoch": 0.67, + "grad_norm": 2.8188123888897585, + "learning_rate": 2.644002721126413e-06, + "loss": 0.6255, + "step": 6334 + }, + { + "epoch": 0.67, + "grad_norm": 3.831571559662532, + "learning_rate": 2.642499674463359e-06, + "loss": 0.536, + "step": 6335 + }, + { + "epoch": 0.67, + "grad_norm": 2.58837692606335, + "learning_rate": 2.640996901687878e-06, + "loss": 0.6061, + "step": 6336 + }, + { + "epoch": 0.67, + "grad_norm": 2.3253447670569956, + "learning_rate": 2.6394944029745594e-06, + "loss": 0.6083, + "step": 6337 + }, + { + "epoch": 0.67, + "grad_norm": 3.1161500571551355, + "learning_rate": 2.6379921784979556e-06, + "loss": 0.6626, + "step": 6338 + }, + { + "epoch": 0.67, + "grad_norm": 2.3957780596036167, + "learning_rate": 2.6364902284325943e-06, + "loss": 0.5869, + "step": 6339 + }, + { + "epoch": 0.67, + "grad_norm": 3.14685635706374, + "learning_rate": 2.634988552952965e-06, + "loss": 0.6182, + "step": 6340 + }, + { + "epoch": 0.67, + "grad_norm": 6.175501966326075, + "learning_rate": 2.63348715223353e-06, + "loss": 0.5976, + "step": 6341 + }, + { + "epoch": 0.67, + "grad_norm": 3.2865872103188165, + "learning_rate": 2.6319860264487156e-06, + "loss": 0.6499, + "step": 6342 + }, + { + "epoch": 0.67, + "grad_norm": 4.282251344333547, + "learning_rate": 2.630485175772921e-06, + "loss": 0.6299, + "step": 6343 + }, + { + "epoch": 0.67, + "grad_norm": 2.959066483609487, + "learning_rate": 2.6289846003805073e-06, + "loss": 0.7163, + "step": 6344 + }, + { + "epoch": 0.67, + "grad_norm": 2.6533855270391533, + "learning_rate": 2.6274843004458083e-06, + "loss": 0.6091, + "step": 6345 + }, + { + "epoch": 0.67, + "grad_norm": 2.092500503413888, + "learning_rate": 2.6259842761431275e-06, + "loss": 0.6512, + "step": 6346 + }, + { + "epoch": 0.67, + "grad_norm": 11.63033721500452, + "learning_rate": 2.624484527646729e-06, + "loss": 0.5766, + "step": 6347 + }, + { + "epoch": 0.67, + "grad_norm": 2.344316205856234, + "learning_rate": 2.6229850551308533e-06, + "loss": 0.6893, + "step": 6348 + }, + { + "epoch": 0.67, + "grad_norm": 2.7670480341648984, + "learning_rate": 2.6214858587697e-06, + "loss": 0.6076, + "step": 6349 + }, + { + "epoch": 0.67, + "grad_norm": 2.7552634595866894, + "learning_rate": 2.6199869387374465e-06, + "loss": 0.626, + "step": 6350 + }, + { + "epoch": 0.67, + "grad_norm": 2.087916955904863, + "learning_rate": 2.6184882952082286e-06, + "loss": 0.6331, + "step": 6351 + }, + { + "epoch": 0.67, + "grad_norm": 2.2553742098034797, + "learning_rate": 2.616989928356158e-06, + "loss": 0.6029, + "step": 6352 + }, + { + "epoch": 0.67, + "grad_norm": 2.4877624793792266, + "learning_rate": 2.6154918383553075e-06, + "loss": 0.5829, + "step": 6353 + }, + { + "epoch": 0.67, + "grad_norm": 3.164663373664098, + "learning_rate": 2.6139940253797237e-06, + "loss": 0.5857, + "step": 6354 + }, + { + "epoch": 0.67, + "grad_norm": 2.2967240401034346, + "learning_rate": 2.6124964896034143e-06, + "loss": 0.5886, + "step": 6355 + }, + { + "epoch": 0.67, + "grad_norm": 4.099408237564359, + "learning_rate": 2.610999231200364e-06, + "loss": 0.6851, + "step": 6356 + }, + { + "epoch": 0.67, + "grad_norm": 2.3689665901694243, + "learning_rate": 2.6095022503445155e-06, + "loss": 0.6386, + "step": 6357 + }, + { + "epoch": 0.67, + "grad_norm": 2.2222788180650475, + "learning_rate": 2.6080055472097844e-06, + "loss": 0.638, + "step": 6358 + }, + { + "epoch": 0.67, + "grad_norm": 2.4258550761659854, + "learning_rate": 2.6065091219700568e-06, + "loss": 0.5755, + "step": 6359 + }, + { + "epoch": 0.67, + "grad_norm": 2.5822158507581823, + "learning_rate": 2.605012974799178e-06, + "loss": 0.6109, + "step": 6360 + }, + { + "epoch": 0.67, + "grad_norm": 2.344258642918491, + "learning_rate": 2.603517105870971e-06, + "loss": 0.5318, + "step": 6361 + }, + { + "epoch": 0.67, + "grad_norm": 3.661742078883693, + "learning_rate": 2.602021515359218e-06, + "loss": 0.5598, + "step": 6362 + }, + { + "epoch": 0.67, + "grad_norm": 2.2545924179160295, + "learning_rate": 2.600526203437674e-06, + "loss": 0.5684, + "step": 6363 + }, + { + "epoch": 0.67, + "grad_norm": 2.7470556591551687, + "learning_rate": 2.5990311702800573e-06, + "loss": 0.6562, + "step": 6364 + }, + { + "epoch": 0.67, + "grad_norm": 2.130529569635912, + "learning_rate": 2.597536416060062e-06, + "loss": 0.6661, + "step": 6365 + }, + { + "epoch": 0.67, + "grad_norm": 2.848914985108963, + "learning_rate": 2.5960419409513386e-06, + "loss": 0.4984, + "step": 6366 + }, + { + "epoch": 0.67, + "grad_norm": 5.47331013855302, + "learning_rate": 2.594547745127514e-06, + "loss": 0.6535, + "step": 6367 + }, + { + "epoch": 0.67, + "grad_norm": 2.1487059851875383, + "learning_rate": 2.5930538287621797e-06, + "loss": 0.6188, + "step": 6368 + }, + { + "epoch": 0.67, + "grad_norm": 2.2329154737791517, + "learning_rate": 2.591560192028894e-06, + "loss": 0.5626, + "step": 6369 + }, + { + "epoch": 0.67, + "grad_norm": 2.2654061918388013, + "learning_rate": 2.5900668351011815e-06, + "loss": 0.6215, + "step": 6370 + }, + { + "epoch": 0.67, + "grad_norm": 2.256650550756588, + "learning_rate": 2.588573758152538e-06, + "loss": 0.631, + "step": 6371 + }, + { + "epoch": 0.67, + "grad_norm": 2.001940811368443, + "learning_rate": 2.5870809613564264e-06, + "loss": 0.615, + "step": 6372 + }, + { + "epoch": 0.67, + "grad_norm": 5.097482873027538, + "learning_rate": 2.585588444886271e-06, + "loss": 0.6168, + "step": 6373 + }, + { + "epoch": 0.67, + "grad_norm": 2.401718053393286, + "learning_rate": 2.584096208915473e-06, + "loss": 0.6036, + "step": 6374 + }, + { + "epoch": 0.67, + "grad_norm": 2.7170902456231816, + "learning_rate": 2.5826042536173923e-06, + "loss": 0.6435, + "step": 6375 + }, + { + "epoch": 0.67, + "grad_norm": 2.0656142552031445, + "learning_rate": 2.581112579165363e-06, + "loss": 0.6798, + "step": 6376 + }, + { + "epoch": 0.67, + "grad_norm": 2.885892388490017, + "learning_rate": 2.5796211857326805e-06, + "loss": 0.5807, + "step": 6377 + }, + { + "epoch": 0.67, + "grad_norm": 2.357443586015753, + "learning_rate": 2.578130073492613e-06, + "loss": 0.5938, + "step": 6378 + }, + { + "epoch": 0.67, + "grad_norm": 2.3457746446640377, + "learning_rate": 2.576639242618391e-06, + "loss": 0.6046, + "step": 6379 + }, + { + "epoch": 0.67, + "grad_norm": 1.0130372996055184, + "learning_rate": 2.575148693283217e-06, + "loss": 0.5469, + "step": 6380 + }, + { + "epoch": 0.67, + "grad_norm": 2.2471983613829685, + "learning_rate": 2.5736584256602604e-06, + "loss": 0.6914, + "step": 6381 + }, + { + "epoch": 0.67, + "grad_norm": 2.8049842929766076, + "learning_rate": 2.572168439922653e-06, + "loss": 0.6207, + "step": 6382 + }, + { + "epoch": 0.67, + "grad_norm": 2.5242050580331536, + "learning_rate": 2.570678736243497e-06, + "loss": 0.5769, + "step": 6383 + }, + { + "epoch": 0.67, + "grad_norm": 2.7371668921754995, + "learning_rate": 2.569189314795863e-06, + "loss": 0.6394, + "step": 6384 + }, + { + "epoch": 0.67, + "grad_norm": 2.541518894090096, + "learning_rate": 2.56770017575279e-06, + "loss": 0.5863, + "step": 6385 + }, + { + "epoch": 0.67, + "grad_norm": 2.104105510648952, + "learning_rate": 2.566211319287276e-06, + "loss": 0.5289, + "step": 6386 + }, + { + "epoch": 0.67, + "grad_norm": 2.5996518622471667, + "learning_rate": 2.564722745572299e-06, + "loss": 0.6135, + "step": 6387 + }, + { + "epoch": 0.67, + "grad_norm": 2.2459414237155215, + "learning_rate": 2.563234454780791e-06, + "loss": 0.5344, + "step": 6388 + }, + { + "epoch": 0.67, + "grad_norm": 2.6467790790692405, + "learning_rate": 2.561746447085662e-06, + "loss": 0.5519, + "step": 6389 + }, + { + "epoch": 0.67, + "grad_norm": 2.9589463310028505, + "learning_rate": 2.5602587226597813e-06, + "loss": 0.61, + "step": 6390 + }, + { + "epoch": 0.67, + "grad_norm": 2.81740329183419, + "learning_rate": 2.5587712816759914e-06, + "loss": 0.6797, + "step": 6391 + }, + { + "epoch": 0.67, + "grad_norm": 2.4235942299105417, + "learning_rate": 2.5572841243070944e-06, + "loss": 0.6292, + "step": 6392 + }, + { + "epoch": 0.67, + "grad_norm": 2.7257768033845178, + "learning_rate": 2.5557972507258676e-06, + "loss": 0.5951, + "step": 6393 + }, + { + "epoch": 0.67, + "grad_norm": 2.9980654800137065, + "learning_rate": 2.554310661105052e-06, + "loss": 0.5879, + "step": 6394 + }, + { + "epoch": 0.67, + "grad_norm": 2.45714129335474, + "learning_rate": 2.5528243556173526e-06, + "loss": 0.5924, + "step": 6395 + }, + { + "epoch": 0.67, + "grad_norm": 2.524381334524273, + "learning_rate": 2.551338334435447e-06, + "loss": 0.5851, + "step": 6396 + }, + { + "epoch": 0.67, + "grad_norm": 2.2895599260578168, + "learning_rate": 2.549852597731973e-06, + "loss": 0.6139, + "step": 6397 + }, + { + "epoch": 0.67, + "grad_norm": 2.2734502037433173, + "learning_rate": 2.5483671456795446e-06, + "loss": 0.5641, + "step": 6398 + }, + { + "epoch": 0.67, + "grad_norm": 4.10732024204948, + "learning_rate": 2.5468819784507314e-06, + "loss": 0.6338, + "step": 6399 + }, + { + "epoch": 0.67, + "grad_norm": 2.831440417203488, + "learning_rate": 2.545397096218081e-06, + "loss": 0.632, + "step": 6400 + }, + { + "epoch": 0.67, + "grad_norm": 2.2175477294639503, + "learning_rate": 2.5439124991540986e-06, + "loss": 0.5228, + "step": 6401 + }, + { + "epoch": 0.67, + "grad_norm": 11.26293371493835, + "learning_rate": 2.5424281874312616e-06, + "loss": 0.638, + "step": 6402 + }, + { + "epoch": 0.67, + "grad_norm": 3.070512413502359, + "learning_rate": 2.5409441612220163e-06, + "loss": 0.5331, + "step": 6403 + }, + { + "epoch": 0.67, + "grad_norm": 2.43982362945709, + "learning_rate": 2.53946042069877e-06, + "loss": 0.682, + "step": 6404 + }, + { + "epoch": 0.67, + "grad_norm": 2.5481585409694536, + "learning_rate": 2.537976966033897e-06, + "loss": 0.6913, + "step": 6405 + }, + { + "epoch": 0.67, + "grad_norm": 2.780707259096246, + "learning_rate": 2.5364937973997433e-06, + "loss": 0.637, + "step": 6406 + }, + { + "epoch": 0.67, + "grad_norm": 3.059638272022161, + "learning_rate": 2.535010914968621e-06, + "loss": 0.6197, + "step": 6407 + }, + { + "epoch": 0.67, + "grad_norm": 4.198201640329154, + "learning_rate": 2.533528318912803e-06, + "loss": 0.544, + "step": 6408 + }, + { + "epoch": 0.67, + "grad_norm": 2.3114583069479973, + "learning_rate": 2.532046009404537e-06, + "loss": 0.5643, + "step": 6409 + }, + { + "epoch": 0.67, + "grad_norm": 4.0785166014115655, + "learning_rate": 2.5305639866160293e-06, + "loss": 0.6458, + "step": 6410 + }, + { + "epoch": 0.67, + "grad_norm": 2.4149803287414717, + "learning_rate": 2.529082250719461e-06, + "loss": 0.7269, + "step": 6411 + }, + { + "epoch": 0.67, + "grad_norm": 2.160446913236817, + "learning_rate": 2.5276008018869722e-06, + "loss": 0.5832, + "step": 6412 + }, + { + "epoch": 0.67, + "grad_norm": 2.6820344561724, + "learning_rate": 2.526119640290678e-06, + "loss": 0.5942, + "step": 6413 + }, + { + "epoch": 0.67, + "grad_norm": 2.559166061869424, + "learning_rate": 2.5246387661026504e-06, + "loss": 0.6239, + "step": 6414 + }, + { + "epoch": 0.68, + "grad_norm": 2.35834972171533, + "learning_rate": 2.5231581794949356e-06, + "loss": 0.535, + "step": 6415 + }, + { + "epoch": 0.68, + "grad_norm": 2.964214903325439, + "learning_rate": 2.5216778806395448e-06, + "loss": 0.6452, + "step": 6416 + }, + { + "epoch": 0.68, + "grad_norm": 2.200465378179132, + "learning_rate": 2.520197869708454e-06, + "loss": 0.6412, + "step": 6417 + }, + { + "epoch": 0.68, + "grad_norm": 2.5628414197927607, + "learning_rate": 2.518718146873605e-06, + "loss": 0.6189, + "step": 6418 + }, + { + "epoch": 0.68, + "grad_norm": 2.3739952466153516, + "learning_rate": 2.5172387123069085e-06, + "loss": 0.5864, + "step": 6419 + }, + { + "epoch": 0.68, + "grad_norm": 2.2088475058648496, + "learning_rate": 2.5157595661802437e-06, + "loss": 0.6126, + "step": 6420 + }, + { + "epoch": 0.68, + "grad_norm": 2.8096193612985947, + "learning_rate": 2.514280708665449e-06, + "loss": 0.48, + "step": 6421 + }, + { + "epoch": 0.68, + "grad_norm": 1.8758991836794494, + "learning_rate": 2.5128021399343385e-06, + "loss": 0.601, + "step": 6422 + }, + { + "epoch": 0.68, + "grad_norm": 4.354943435805703, + "learning_rate": 2.5113238601586833e-06, + "loss": 0.5763, + "step": 6423 + }, + { + "epoch": 0.68, + "grad_norm": 0.8991492339114561, + "learning_rate": 2.50984586951023e-06, + "loss": 0.571, + "step": 6424 + }, + { + "epoch": 0.68, + "grad_norm": 2.988868169199592, + "learning_rate": 2.508368168160683e-06, + "loss": 0.6109, + "step": 6425 + }, + { + "epoch": 0.68, + "grad_norm": 2.0181480966879133, + "learning_rate": 2.5068907562817223e-06, + "loss": 0.6475, + "step": 6426 + }, + { + "epoch": 0.68, + "grad_norm": 2.607017873285922, + "learning_rate": 2.505413634044984e-06, + "loss": 0.6466, + "step": 6427 + }, + { + "epoch": 0.68, + "grad_norm": 2.5325358752326745, + "learning_rate": 2.5039368016220795e-06, + "loss": 0.707, + "step": 6428 + }, + { + "epoch": 0.68, + "grad_norm": 5.495898484741088, + "learning_rate": 2.502460259184584e-06, + "loss": 0.6143, + "step": 6429 + }, + { + "epoch": 0.68, + "grad_norm": 2.27497515485443, + "learning_rate": 2.500984006904035e-06, + "loss": 0.6036, + "step": 6430 + }, + { + "epoch": 0.68, + "grad_norm": 5.384554479372134, + "learning_rate": 2.4995080449519383e-06, + "loss": 0.632, + "step": 6431 + }, + { + "epoch": 0.68, + "grad_norm": 2.136269004392376, + "learning_rate": 2.498032373499769e-06, + "loss": 0.5979, + "step": 6432 + }, + { + "epoch": 0.68, + "grad_norm": 0.9591439151746047, + "learning_rate": 2.496556992718968e-06, + "loss": 0.5752, + "step": 6433 + }, + { + "epoch": 0.68, + "grad_norm": 2.625755986560923, + "learning_rate": 2.495081902780937e-06, + "loss": 0.6251, + "step": 6434 + }, + { + "epoch": 0.68, + "grad_norm": 2.6993322682560046, + "learning_rate": 2.4936071038570514e-06, + "loss": 0.6101, + "step": 6435 + }, + { + "epoch": 0.68, + "grad_norm": 3.606064994507977, + "learning_rate": 2.4921325961186455e-06, + "loss": 0.6113, + "step": 6436 + }, + { + "epoch": 0.68, + "grad_norm": 2.8817794595876305, + "learning_rate": 2.490658379737025e-06, + "loss": 0.6152, + "step": 6437 + }, + { + "epoch": 0.68, + "grad_norm": 2.592817222457228, + "learning_rate": 2.489184454883462e-06, + "loss": 0.6595, + "step": 6438 + }, + { + "epoch": 0.68, + "grad_norm": 3.112454316014022, + "learning_rate": 2.4877108217291913e-06, + "loss": 0.6833, + "step": 6439 + }, + { + "epoch": 0.68, + "grad_norm": 2.089441060610151, + "learning_rate": 2.4862374804454127e-06, + "loss": 0.6656, + "step": 6440 + }, + { + "epoch": 0.68, + "grad_norm": 2.1810446892787994, + "learning_rate": 2.484764431203297e-06, + "loss": 0.655, + "step": 6441 + }, + { + "epoch": 0.68, + "grad_norm": 2.2182929541942453, + "learning_rate": 2.483291674173981e-06, + "loss": 0.6376, + "step": 6442 + }, + { + "epoch": 0.68, + "grad_norm": 2.620256297362581, + "learning_rate": 2.4818192095285615e-06, + "loss": 0.6077, + "step": 6443 + }, + { + "epoch": 0.68, + "grad_norm": 0.959409963615247, + "learning_rate": 2.4803470374381084e-06, + "loss": 0.5402, + "step": 6444 + }, + { + "epoch": 0.68, + "grad_norm": 2.690295499658671, + "learning_rate": 2.4788751580736516e-06, + "loss": 0.637, + "step": 6445 + }, + { + "epoch": 0.68, + "grad_norm": 2.2246955758634246, + "learning_rate": 2.4774035716061924e-06, + "loss": 0.6423, + "step": 6446 + }, + { + "epoch": 0.68, + "grad_norm": 2.4753333122148784, + "learning_rate": 2.4759322782066924e-06, + "loss": 0.6151, + "step": 6447 + }, + { + "epoch": 0.68, + "grad_norm": 2.3261687913389704, + "learning_rate": 2.4744612780460863e-06, + "loss": 0.6234, + "step": 6448 + }, + { + "epoch": 0.68, + "grad_norm": 2.050091521356472, + "learning_rate": 2.472990571295266e-06, + "loss": 0.528, + "step": 6449 + }, + { + "epoch": 0.68, + "grad_norm": 2.266129287841651, + "learning_rate": 2.4715201581250962e-06, + "loss": 0.6456, + "step": 6450 + }, + { + "epoch": 0.68, + "grad_norm": 2.5040000923789902, + "learning_rate": 2.4700500387064074e-06, + "loss": 0.6231, + "step": 6451 + }, + { + "epoch": 0.68, + "grad_norm": 2.405010632193548, + "learning_rate": 2.4685802132099923e-06, + "loss": 0.5844, + "step": 6452 + }, + { + "epoch": 0.68, + "grad_norm": 2.6378566394399967, + "learning_rate": 2.4671106818066076e-06, + "loss": 0.6313, + "step": 6453 + }, + { + "epoch": 0.68, + "grad_norm": 2.429331136222701, + "learning_rate": 2.465641444666983e-06, + "loss": 0.6776, + "step": 6454 + }, + { + "epoch": 0.68, + "grad_norm": 3.3515232887180457, + "learning_rate": 2.4641725019618107e-06, + "loss": 0.5311, + "step": 6455 + }, + { + "epoch": 0.68, + "grad_norm": 2.516974554330432, + "learning_rate": 2.4627038538617447e-06, + "loss": 0.5724, + "step": 6456 + }, + { + "epoch": 0.68, + "grad_norm": 2.1215303565599735, + "learning_rate": 2.461235500537412e-06, + "loss": 0.5501, + "step": 6457 + }, + { + "epoch": 0.68, + "grad_norm": 2.0665664683899, + "learning_rate": 2.4597674421593985e-06, + "loss": 0.6118, + "step": 6458 + }, + { + "epoch": 0.68, + "grad_norm": 2.723504300822161, + "learning_rate": 2.458299678898263e-06, + "loss": 0.6764, + "step": 6459 + }, + { + "epoch": 0.68, + "grad_norm": 2.4054502421315638, + "learning_rate": 2.456832210924521e-06, + "loss": 0.6773, + "step": 6460 + }, + { + "epoch": 0.68, + "grad_norm": 2.408884797009261, + "learning_rate": 2.455365038408663e-06, + "loss": 0.6166, + "step": 6461 + }, + { + "epoch": 0.68, + "grad_norm": 2.446288286832739, + "learning_rate": 2.453898161521137e-06, + "loss": 0.703, + "step": 6462 + }, + { + "epoch": 0.68, + "grad_norm": 2.2957817402286795, + "learning_rate": 2.4524315804323627e-06, + "loss": 0.6984, + "step": 6463 + }, + { + "epoch": 0.68, + "grad_norm": 2.401524156010693, + "learning_rate": 2.4509652953127257e-06, + "loss": 0.6117, + "step": 6464 + }, + { + "epoch": 0.68, + "grad_norm": 3.328086050605523, + "learning_rate": 2.4494993063325716e-06, + "loss": 0.5302, + "step": 6465 + }, + { + "epoch": 0.68, + "grad_norm": 2.358380771233323, + "learning_rate": 2.4480336136622133e-06, + "loss": 0.6143, + "step": 6466 + }, + { + "epoch": 0.68, + "grad_norm": 2.6392348923503626, + "learning_rate": 2.446568217471933e-06, + "loss": 0.5916, + "step": 6467 + }, + { + "epoch": 0.68, + "grad_norm": 2.7595861356010456, + "learning_rate": 2.445103117931978e-06, + "loss": 0.5833, + "step": 6468 + }, + { + "epoch": 0.68, + "grad_norm": 2.1230177292364067, + "learning_rate": 2.443638315212555e-06, + "loss": 0.5654, + "step": 6469 + }, + { + "epoch": 0.68, + "grad_norm": 2.8062776463140517, + "learning_rate": 2.442173809483845e-06, + "loss": 0.7762, + "step": 6470 + }, + { + "epoch": 0.68, + "grad_norm": 2.716571799479343, + "learning_rate": 2.440709600915986e-06, + "loss": 0.6991, + "step": 6471 + }, + { + "epoch": 0.68, + "grad_norm": 2.7930103373086728, + "learning_rate": 2.4392456896790874e-06, + "loss": 0.5541, + "step": 6472 + }, + { + "epoch": 0.68, + "grad_norm": 3.224499151376592, + "learning_rate": 2.437782075943224e-06, + "loss": 0.6459, + "step": 6473 + }, + { + "epoch": 0.68, + "grad_norm": 3.395183867623381, + "learning_rate": 2.4363187598784323e-06, + "loss": 0.6556, + "step": 6474 + }, + { + "epoch": 0.68, + "grad_norm": 2.1451834322017946, + "learning_rate": 2.4348557416547146e-06, + "loss": 0.6845, + "step": 6475 + }, + { + "epoch": 0.68, + "grad_norm": 2.6454970175141983, + "learning_rate": 2.4333930214420414e-06, + "loss": 0.6793, + "step": 6476 + }, + { + "epoch": 0.68, + "grad_norm": 2.1614550552826097, + "learning_rate": 2.43193059941035e-06, + "loss": 0.6409, + "step": 6477 + }, + { + "epoch": 0.68, + "grad_norm": 2.9560127452631026, + "learning_rate": 2.4304684757295376e-06, + "loss": 0.6685, + "step": 6478 + }, + { + "epoch": 0.68, + "grad_norm": 2.385228890681212, + "learning_rate": 2.429006650569468e-06, + "loss": 0.6054, + "step": 6479 + }, + { + "epoch": 0.68, + "grad_norm": 2.7023174068498594, + "learning_rate": 2.4275451240999743e-06, + "loss": 0.6734, + "step": 6480 + }, + { + "epoch": 0.68, + "grad_norm": 2.8800257059035994, + "learning_rate": 2.4260838964908534e-06, + "loss": 0.6168, + "step": 6481 + }, + { + "epoch": 0.68, + "grad_norm": 2.3173592174188915, + "learning_rate": 2.424622967911863e-06, + "loss": 0.6082, + "step": 6482 + }, + { + "epoch": 0.68, + "grad_norm": 0.9950821121430008, + "learning_rate": 2.4231623385327337e-06, + "loss": 0.5374, + "step": 6483 + }, + { + "epoch": 0.68, + "grad_norm": 2.9226988485243646, + "learning_rate": 2.421702008523153e-06, + "loss": 0.5921, + "step": 6484 + }, + { + "epoch": 0.68, + "grad_norm": 2.2019190672941673, + "learning_rate": 2.4202419780527796e-06, + "loss": 0.6282, + "step": 6485 + }, + { + "epoch": 0.68, + "grad_norm": 18.87914111722867, + "learning_rate": 2.418782247291238e-06, + "loss": 0.5984, + "step": 6486 + }, + { + "epoch": 0.68, + "grad_norm": 0.9756191834087302, + "learning_rate": 2.4173228164081135e-06, + "loss": 0.5355, + "step": 6487 + }, + { + "epoch": 0.68, + "grad_norm": 2.229047102096132, + "learning_rate": 2.4158636855729563e-06, + "loss": 0.6097, + "step": 6488 + }, + { + "epoch": 0.68, + "grad_norm": 2.160615604668489, + "learning_rate": 2.414404854955286e-06, + "loss": 0.5794, + "step": 6489 + }, + { + "epoch": 0.68, + "grad_norm": 2.5744631310901793, + "learning_rate": 2.4129463247245877e-06, + "loss": 0.6365, + "step": 6490 + }, + { + "epoch": 0.68, + "grad_norm": 2.4664632502489976, + "learning_rate": 2.411488095050305e-06, + "loss": 0.6087, + "step": 6491 + }, + { + "epoch": 0.68, + "grad_norm": 2.2462459640267336, + "learning_rate": 2.410030166101855e-06, + "loss": 0.6314, + "step": 6492 + }, + { + "epoch": 0.68, + "grad_norm": 3.7690797181208953, + "learning_rate": 2.4085725380486106e-06, + "loss": 0.6211, + "step": 6493 + }, + { + "epoch": 0.68, + "grad_norm": 2.3187693057872476, + "learning_rate": 2.4071152110599204e-06, + "loss": 0.5885, + "step": 6494 + }, + { + "epoch": 0.68, + "grad_norm": 2.4839016053608605, + "learning_rate": 2.4056581853050877e-06, + "loss": 0.6435, + "step": 6495 + }, + { + "epoch": 0.68, + "grad_norm": 3.5230696548262035, + "learning_rate": 2.4042014609533894e-06, + "loss": 0.5636, + "step": 6496 + }, + { + "epoch": 0.68, + "grad_norm": 2.54975560910205, + "learning_rate": 2.4027450381740598e-06, + "loss": 0.6318, + "step": 6497 + }, + { + "epoch": 0.68, + "grad_norm": 2.1179176495059173, + "learning_rate": 2.4012889171363034e-06, + "loss": 0.6436, + "step": 6498 + }, + { + "epoch": 0.68, + "grad_norm": 2.219085902942063, + "learning_rate": 2.3998330980092906e-06, + "loss": 0.5653, + "step": 6499 + }, + { + "epoch": 0.68, + "grad_norm": 2.4057964800651495, + "learning_rate": 2.3983775809621525e-06, + "loss": 0.6256, + "step": 6500 + }, + { + "epoch": 0.68, + "grad_norm": 2.9344149719041113, + "learning_rate": 2.3969223661639838e-06, + "loss": 0.6745, + "step": 6501 + }, + { + "epoch": 0.68, + "grad_norm": 3.7761690986099326, + "learning_rate": 2.395467453783851e-06, + "loss": 0.5358, + "step": 6502 + }, + { + "epoch": 0.68, + "grad_norm": 2.3942022138130055, + "learning_rate": 2.394012843990781e-06, + "loss": 0.664, + "step": 6503 + }, + { + "epoch": 0.68, + "grad_norm": 2.3279945476576263, + "learning_rate": 2.3925585369537647e-06, + "loss": 0.6307, + "step": 6504 + }, + { + "epoch": 0.68, + "grad_norm": 4.007225583568133, + "learning_rate": 2.391104532841762e-06, + "loss": 0.6179, + "step": 6505 + }, + { + "epoch": 0.68, + "grad_norm": 3.6218819183172104, + "learning_rate": 2.389650831823691e-06, + "loss": 0.6501, + "step": 6506 + }, + { + "epoch": 0.68, + "grad_norm": 2.3080723378936723, + "learning_rate": 2.388197434068441e-06, + "loss": 0.6166, + "step": 6507 + }, + { + "epoch": 0.68, + "grad_norm": 2.2881275417992613, + "learning_rate": 2.3867443397448646e-06, + "loss": 0.622, + "step": 6508 + }, + { + "epoch": 0.68, + "grad_norm": 2.241683983187079, + "learning_rate": 2.3852915490217772e-06, + "loss": 0.6207, + "step": 6509 + }, + { + "epoch": 0.69, + "grad_norm": 2.556095245479891, + "learning_rate": 2.383839062067957e-06, + "loss": 0.5998, + "step": 6510 + }, + { + "epoch": 0.69, + "grad_norm": 1.9799784118007415, + "learning_rate": 2.382386879052152e-06, + "loss": 0.6074, + "step": 6511 + }, + { + "epoch": 0.69, + "grad_norm": 2.3182312355329513, + "learning_rate": 2.3809350001430743e-06, + "loss": 0.5951, + "step": 6512 + }, + { + "epoch": 0.69, + "grad_norm": 2.3694748756782027, + "learning_rate": 2.3794834255093977e-06, + "loss": 0.6304, + "step": 6513 + }, + { + "epoch": 0.69, + "grad_norm": 2.2001457523740147, + "learning_rate": 2.37803215531976e-06, + "loss": 0.6331, + "step": 6514 + }, + { + "epoch": 0.69, + "grad_norm": 2.3212863403370796, + "learning_rate": 2.3765811897427667e-06, + "loss": 0.53, + "step": 6515 + }, + { + "epoch": 0.69, + "grad_norm": 3.160468095861783, + "learning_rate": 2.375130528946989e-06, + "loss": 0.7405, + "step": 6516 + }, + { + "epoch": 0.69, + "grad_norm": 2.6855144317046697, + "learning_rate": 2.373680173100957e-06, + "loss": 0.5849, + "step": 6517 + }, + { + "epoch": 0.69, + "grad_norm": 2.341892568924303, + "learning_rate": 2.3722301223731724e-06, + "loss": 0.5603, + "step": 6518 + }, + { + "epoch": 0.69, + "grad_norm": 2.283085350966203, + "learning_rate": 2.3707803769320943e-06, + "loss": 0.6285, + "step": 6519 + }, + { + "epoch": 0.69, + "grad_norm": 3.545450875140134, + "learning_rate": 2.3693309369461514e-06, + "loss": 0.643, + "step": 6520 + }, + { + "epoch": 0.69, + "grad_norm": 2.34738226126757, + "learning_rate": 2.367881802583738e-06, + "loss": 0.6825, + "step": 6521 + }, + { + "epoch": 0.69, + "grad_norm": 2.6575846961053333, + "learning_rate": 2.366432974013208e-06, + "loss": 0.5756, + "step": 6522 + }, + { + "epoch": 0.69, + "grad_norm": 2.6040764810860146, + "learning_rate": 2.36498445140288e-06, + "loss": 0.6637, + "step": 6523 + }, + { + "epoch": 0.69, + "grad_norm": 2.575034774458675, + "learning_rate": 2.3635362349210423e-06, + "loss": 0.6506, + "step": 6524 + }, + { + "epoch": 0.69, + "grad_norm": 2.561154528891339, + "learning_rate": 2.362088324735945e-06, + "loss": 0.585, + "step": 6525 + }, + { + "epoch": 0.69, + "grad_norm": 2.557511457700825, + "learning_rate": 2.3606407210158007e-06, + "loss": 0.7341, + "step": 6526 + }, + { + "epoch": 0.69, + "grad_norm": 5.151485466378373, + "learning_rate": 2.3591934239287858e-06, + "loss": 0.5364, + "step": 6527 + }, + { + "epoch": 0.69, + "grad_norm": 3.2369051385886, + "learning_rate": 2.3577464336430446e-06, + "loss": 0.5904, + "step": 6528 + }, + { + "epoch": 0.69, + "grad_norm": 2.4237630485347683, + "learning_rate": 2.356299750326687e-06, + "loss": 0.6413, + "step": 6529 + }, + { + "epoch": 0.69, + "grad_norm": 2.791157720967069, + "learning_rate": 2.3548533741477807e-06, + "loss": 0.6735, + "step": 6530 + }, + { + "epoch": 0.69, + "grad_norm": 2.3572226180240285, + "learning_rate": 2.353407305274365e-06, + "loss": 0.6745, + "step": 6531 + }, + { + "epoch": 0.69, + "grad_norm": 2.756415728746884, + "learning_rate": 2.3519615438744358e-06, + "loss": 0.6098, + "step": 6532 + }, + { + "epoch": 0.69, + "grad_norm": 3.9211036631117326, + "learning_rate": 2.3505160901159596e-06, + "loss": 0.6391, + "step": 6533 + }, + { + "epoch": 0.69, + "grad_norm": 10.804798236915875, + "learning_rate": 2.3490709441668673e-06, + "loss": 0.6013, + "step": 6534 + }, + { + "epoch": 0.69, + "grad_norm": 2.3231398228966333, + "learning_rate": 2.34762610619505e-06, + "loss": 0.6146, + "step": 6535 + }, + { + "epoch": 0.69, + "grad_norm": 2.496092282256762, + "learning_rate": 2.346181576368362e-06, + "loss": 0.7055, + "step": 6536 + }, + { + "epoch": 0.69, + "grad_norm": 2.487885892384019, + "learning_rate": 2.344737354854627e-06, + "loss": 0.7101, + "step": 6537 + }, + { + "epoch": 0.69, + "grad_norm": 3.2049421861180685, + "learning_rate": 2.343293441821633e-06, + "loss": 0.672, + "step": 6538 + }, + { + "epoch": 0.69, + "grad_norm": 1.9953645112584768, + "learning_rate": 2.3418498374371266e-06, + "loss": 0.6358, + "step": 6539 + }, + { + "epoch": 0.69, + "grad_norm": 2.7737257283166983, + "learning_rate": 2.3404065418688203e-06, + "loss": 0.6623, + "step": 6540 + }, + { + "epoch": 0.69, + "grad_norm": 2.2828290607769466, + "learning_rate": 2.3389635552843943e-06, + "loss": 0.6073, + "step": 6541 + }, + { + "epoch": 0.69, + "grad_norm": 2.7921110029438876, + "learning_rate": 2.3375208778514903e-06, + "loss": 0.6668, + "step": 6542 + }, + { + "epoch": 0.69, + "grad_norm": 6.963063546111307, + "learning_rate": 2.336078509737715e-06, + "loss": 0.7043, + "step": 6543 + }, + { + "epoch": 0.69, + "grad_norm": 1.9709336196500105, + "learning_rate": 2.334636451110639e-06, + "loss": 0.5716, + "step": 6544 + }, + { + "epoch": 0.69, + "grad_norm": 2.052297383518725, + "learning_rate": 2.333194702137793e-06, + "loss": 0.6293, + "step": 6545 + }, + { + "epoch": 0.69, + "grad_norm": 2.2880945024427444, + "learning_rate": 2.331753262986678e-06, + "loss": 0.5453, + "step": 6546 + }, + { + "epoch": 0.69, + "grad_norm": 3.578393672808325, + "learning_rate": 2.330312133824757e-06, + "loss": 0.7206, + "step": 6547 + }, + { + "epoch": 0.69, + "grad_norm": 9.010515317246663, + "learning_rate": 2.3288713148194554e-06, + "loss": 0.6199, + "step": 6548 + }, + { + "epoch": 0.69, + "grad_norm": 1.9927985217423352, + "learning_rate": 2.3274308061381605e-06, + "loss": 0.6139, + "step": 6549 + }, + { + "epoch": 0.69, + "grad_norm": 1.9806575010532403, + "learning_rate": 2.32599060794823e-06, + "loss": 0.6179, + "step": 6550 + }, + { + "epoch": 0.69, + "grad_norm": 2.541089321186657, + "learning_rate": 2.324550720416982e-06, + "loss": 0.5371, + "step": 6551 + }, + { + "epoch": 0.69, + "grad_norm": 2.060488905830918, + "learning_rate": 2.3231111437116954e-06, + "loss": 0.6226, + "step": 6552 + }, + { + "epoch": 0.69, + "grad_norm": 2.944439269515085, + "learning_rate": 2.3216718779996205e-06, + "loss": 0.6055, + "step": 6553 + }, + { + "epoch": 0.69, + "grad_norm": 2.8098524227724737, + "learning_rate": 2.320232923447962e-06, + "loss": 0.6007, + "step": 6554 + }, + { + "epoch": 0.69, + "grad_norm": 3.017792625313086, + "learning_rate": 2.318794280223897e-06, + "loss": 0.6693, + "step": 6555 + }, + { + "epoch": 0.69, + "grad_norm": 2.2085939033581856, + "learning_rate": 2.317355948494563e-06, + "loss": 0.5985, + "step": 6556 + }, + { + "epoch": 0.69, + "grad_norm": 2.4413921495254867, + "learning_rate": 2.31591792842706e-06, + "loss": 0.6141, + "step": 6557 + }, + { + "epoch": 0.69, + "grad_norm": 2.270526009720749, + "learning_rate": 2.314480220188452e-06, + "loss": 0.6078, + "step": 6558 + }, + { + "epoch": 0.69, + "grad_norm": 2.3574965924340434, + "learning_rate": 2.3130428239457688e-06, + "loss": 0.6009, + "step": 6559 + }, + { + "epoch": 0.69, + "grad_norm": 7.673316161467211, + "learning_rate": 2.3116057398660046e-06, + "loss": 0.5492, + "step": 6560 + }, + { + "epoch": 0.69, + "grad_norm": 2.7207174172029065, + "learning_rate": 2.3101689681161142e-06, + "loss": 0.6563, + "step": 6561 + }, + { + "epoch": 0.69, + "grad_norm": 2.307943954138673, + "learning_rate": 2.308732508863016e-06, + "loss": 0.5498, + "step": 6562 + }, + { + "epoch": 0.69, + "grad_norm": 4.299414059442358, + "learning_rate": 2.307296362273595e-06, + "loss": 0.6527, + "step": 6563 + }, + { + "epoch": 0.69, + "grad_norm": 3.0267194850070918, + "learning_rate": 2.305860528514701e-06, + "loss": 0.6284, + "step": 6564 + }, + { + "epoch": 0.69, + "grad_norm": 2.6543376447272022, + "learning_rate": 2.304425007753141e-06, + "loss": 0.6456, + "step": 6565 + }, + { + "epoch": 0.69, + "grad_norm": 2.2467729720442384, + "learning_rate": 2.3029898001556928e-06, + "loss": 0.5627, + "step": 6566 + }, + { + "epoch": 0.69, + "grad_norm": 4.086285543773585, + "learning_rate": 2.301554905889092e-06, + "loss": 0.5767, + "step": 6567 + }, + { + "epoch": 0.69, + "grad_norm": 3.212741108195205, + "learning_rate": 2.3001203251200417e-06, + "loss": 0.5448, + "step": 6568 + }, + { + "epoch": 0.69, + "grad_norm": 3.394587869638318, + "learning_rate": 2.2986860580152095e-06, + "loss": 0.6423, + "step": 6569 + }, + { + "epoch": 0.69, + "grad_norm": 3.4031910872820763, + "learning_rate": 2.2972521047412223e-06, + "loss": 0.6362, + "step": 6570 + }, + { + "epoch": 0.69, + "grad_norm": 4.483025314036182, + "learning_rate": 2.2958184654646705e-06, + "loss": 0.4954, + "step": 6571 + }, + { + "epoch": 0.69, + "grad_norm": 2.548631805057781, + "learning_rate": 2.2943851403521123e-06, + "loss": 0.7172, + "step": 6572 + }, + { + "epoch": 0.69, + "grad_norm": 4.61360591907676, + "learning_rate": 2.2929521295700695e-06, + "loss": 0.6463, + "step": 6573 + }, + { + "epoch": 0.69, + "grad_norm": 2.8748275398323564, + "learning_rate": 2.2915194332850233e-06, + "loss": 0.6434, + "step": 6574 + }, + { + "epoch": 0.69, + "grad_norm": 2.9897551268506732, + "learning_rate": 2.290087051663418e-06, + "loss": 0.5195, + "step": 6575 + }, + { + "epoch": 0.69, + "grad_norm": 2.3753623260621426, + "learning_rate": 2.288654984871665e-06, + "loss": 0.485, + "step": 6576 + }, + { + "epoch": 0.69, + "grad_norm": 2.635343007038432, + "learning_rate": 2.2872232330761383e-06, + "loss": 0.6737, + "step": 6577 + }, + { + "epoch": 0.69, + "grad_norm": 2.5493524141073918, + "learning_rate": 2.285791796443176e-06, + "loss": 0.6043, + "step": 6578 + }, + { + "epoch": 0.69, + "grad_norm": 2.9622930982127853, + "learning_rate": 2.284360675139078e-06, + "loss": 0.6278, + "step": 6579 + }, + { + "epoch": 0.69, + "grad_norm": 4.097813348559686, + "learning_rate": 2.282929869330104e-06, + "loss": 0.6418, + "step": 6580 + }, + { + "epoch": 0.69, + "grad_norm": 2.6376842210758853, + "learning_rate": 2.2814993791824836e-06, + "loss": 0.6048, + "step": 6581 + }, + { + "epoch": 0.69, + "grad_norm": 3.058425958808465, + "learning_rate": 2.2800692048624092e-06, + "loss": 0.6329, + "step": 6582 + }, + { + "epoch": 0.69, + "grad_norm": 0.9682730563914436, + "learning_rate": 2.278639346536031e-06, + "loss": 0.5591, + "step": 6583 + }, + { + "epoch": 0.69, + "grad_norm": 3.6023672656105106, + "learning_rate": 2.2772098043694656e-06, + "loss": 0.6068, + "step": 6584 + }, + { + "epoch": 0.69, + "grad_norm": 2.5349956031101852, + "learning_rate": 2.2757805785287946e-06, + "loss": 0.5998, + "step": 6585 + }, + { + "epoch": 0.69, + "grad_norm": 1.9259914387049755, + "learning_rate": 2.274351669180063e-06, + "loss": 0.5596, + "step": 6586 + }, + { + "epoch": 0.69, + "grad_norm": 2.1838133733921383, + "learning_rate": 2.272923076489275e-06, + "loss": 0.6713, + "step": 6587 + }, + { + "epoch": 0.69, + "grad_norm": 2.354737589610576, + "learning_rate": 2.271494800622399e-06, + "loss": 0.5937, + "step": 6588 + }, + { + "epoch": 0.69, + "grad_norm": 3.3360105669346956, + "learning_rate": 2.2700668417453703e-06, + "loss": 0.6004, + "step": 6589 + }, + { + "epoch": 0.69, + "grad_norm": 2.521181364398551, + "learning_rate": 2.2686392000240838e-06, + "loss": 0.6411, + "step": 6590 + }, + { + "epoch": 0.69, + "grad_norm": 2.3567654473778523, + "learning_rate": 2.2672118756244014e-06, + "loss": 0.6683, + "step": 6591 + }, + { + "epoch": 0.69, + "grad_norm": 2.2629509475772647, + "learning_rate": 2.2657848687121444e-06, + "loss": 0.5745, + "step": 6592 + }, + { + "epoch": 0.69, + "grad_norm": 2.1762686821367487, + "learning_rate": 2.2643581794530943e-06, + "loss": 0.6356, + "step": 6593 + }, + { + "epoch": 0.69, + "grad_norm": 2.4123508723145535, + "learning_rate": 2.2629318080130042e-06, + "loss": 0.638, + "step": 6594 + }, + { + "epoch": 0.69, + "grad_norm": 8.803556073787599, + "learning_rate": 2.261505754557586e-06, + "loss": 0.6717, + "step": 6595 + }, + { + "epoch": 0.69, + "grad_norm": 2.8959725718418525, + "learning_rate": 2.260080019252513e-06, + "loss": 0.5497, + "step": 6596 + }, + { + "epoch": 0.69, + "grad_norm": 2.3343782073258676, + "learning_rate": 2.258654602263421e-06, + "loss": 0.6782, + "step": 6597 + }, + { + "epoch": 0.69, + "grad_norm": 2.0752300588139856, + "learning_rate": 2.2572295037559135e-06, + "loss": 0.6159, + "step": 6598 + }, + { + "epoch": 0.69, + "grad_norm": 2.4484059409198364, + "learning_rate": 2.2558047238955547e-06, + "loss": 0.6265, + "step": 6599 + }, + { + "epoch": 0.69, + "grad_norm": 3.0167746887631335, + "learning_rate": 2.2543802628478695e-06, + "loss": 0.6153, + "step": 6600 + }, + { + "epoch": 0.69, + "grad_norm": 3.7309284321227167, + "learning_rate": 2.2529561207783495e-06, + "loss": 0.5535, + "step": 6601 + }, + { + "epoch": 0.69, + "grad_norm": 2.5843645635460657, + "learning_rate": 2.251532297852445e-06, + "loss": 0.5995, + "step": 6602 + }, + { + "epoch": 0.69, + "grad_norm": 2.1738162447033127, + "learning_rate": 2.2501087942355736e-06, + "loss": 0.7412, + "step": 6603 + }, + { + "epoch": 0.69, + "grad_norm": 2.4229284055465707, + "learning_rate": 2.2486856100931146e-06, + "loss": 0.6312, + "step": 6604 + }, + { + "epoch": 0.7, + "grad_norm": 2.8696482572875674, + "learning_rate": 2.2472627455904086e-06, + "loss": 0.5519, + "step": 6605 + }, + { + "epoch": 0.7, + "grad_norm": 2.844232232179864, + "learning_rate": 2.2458402008927578e-06, + "loss": 0.6271, + "step": 6606 + }, + { + "epoch": 0.7, + "grad_norm": 2.404083137621484, + "learning_rate": 2.24441797616543e-06, + "loss": 0.5466, + "step": 6607 + }, + { + "epoch": 0.7, + "grad_norm": 2.567530863025546, + "learning_rate": 2.2429960715736588e-06, + "loss": 0.5551, + "step": 6608 + }, + { + "epoch": 0.7, + "grad_norm": 2.2882668446276555, + "learning_rate": 2.241574487282634e-06, + "loss": 0.6007, + "step": 6609 + }, + { + "epoch": 0.7, + "grad_norm": 2.110230304278789, + "learning_rate": 2.24015322345751e-06, + "loss": 0.6316, + "step": 6610 + }, + { + "epoch": 0.7, + "grad_norm": 56.41508535977514, + "learning_rate": 2.2387322802634065e-06, + "loss": 0.6579, + "step": 6611 + }, + { + "epoch": 0.7, + "grad_norm": 2.2521890735234713, + "learning_rate": 2.2373116578654042e-06, + "loss": 0.6235, + "step": 6612 + }, + { + "epoch": 0.7, + "grad_norm": 3.234967262280706, + "learning_rate": 2.2358913564285496e-06, + "loss": 0.5794, + "step": 6613 + }, + { + "epoch": 0.7, + "grad_norm": 2.976223938280554, + "learning_rate": 2.234471376117847e-06, + "loss": 0.6562, + "step": 6614 + }, + { + "epoch": 0.7, + "grad_norm": 2.5700256793799388, + "learning_rate": 2.2330517170982634e-06, + "loss": 0.5574, + "step": 6615 + }, + { + "epoch": 0.7, + "grad_norm": 3.3477002796845468, + "learning_rate": 2.2316323795347334e-06, + "loss": 0.6504, + "step": 6616 + }, + { + "epoch": 0.7, + "grad_norm": 2.155545624420643, + "learning_rate": 2.2302133635921524e-06, + "loss": 0.6096, + "step": 6617 + }, + { + "epoch": 0.7, + "grad_norm": 2.271504833154178, + "learning_rate": 2.2287946694353764e-06, + "loss": 0.6292, + "step": 6618 + }, + { + "epoch": 0.7, + "grad_norm": 1.9615477986067251, + "learning_rate": 2.2273762972292227e-06, + "loss": 0.5254, + "step": 6619 + }, + { + "epoch": 0.7, + "grad_norm": 2.347528126787625, + "learning_rate": 2.2259582471384765e-06, + "loss": 0.5771, + "step": 6620 + }, + { + "epoch": 0.7, + "grad_norm": 2.0790518781036624, + "learning_rate": 2.224540519327884e-06, + "loss": 0.5152, + "step": 6621 + }, + { + "epoch": 0.7, + "grad_norm": 2.4932893443972515, + "learning_rate": 2.2231231139621505e-06, + "loss": 0.6342, + "step": 6622 + }, + { + "epoch": 0.7, + "grad_norm": 2.40013914291326, + "learning_rate": 2.2217060312059453e-06, + "loss": 0.6919, + "step": 6623 + }, + { + "epoch": 0.7, + "grad_norm": 3.7257495545768244, + "learning_rate": 2.2202892712239016e-06, + "loss": 0.6643, + "step": 6624 + }, + { + "epoch": 0.7, + "grad_norm": 3.029296071523241, + "learning_rate": 2.2188728341806153e-06, + "loss": 0.5941, + "step": 6625 + }, + { + "epoch": 0.7, + "grad_norm": 2.587522142486169, + "learning_rate": 2.2174567202406455e-06, + "loss": 0.689, + "step": 6626 + }, + { + "epoch": 0.7, + "grad_norm": 3.102337698108396, + "learning_rate": 2.2160409295685105e-06, + "loss": 0.6263, + "step": 6627 + }, + { + "epoch": 0.7, + "grad_norm": 2.442746996525038, + "learning_rate": 2.2146254623286905e-06, + "loss": 0.6833, + "step": 6628 + }, + { + "epoch": 0.7, + "grad_norm": 3.678411820711548, + "learning_rate": 2.213210318685633e-06, + "loss": 0.6105, + "step": 6629 + }, + { + "epoch": 0.7, + "grad_norm": 2.314390976374499, + "learning_rate": 2.2117954988037467e-06, + "loss": 0.6826, + "step": 6630 + }, + { + "epoch": 0.7, + "grad_norm": 2.3034289358297007, + "learning_rate": 2.210381002847399e-06, + "loss": 0.6343, + "step": 6631 + }, + { + "epoch": 0.7, + "grad_norm": 2.9230039460865647, + "learning_rate": 2.208966830980921e-06, + "loss": 0.629, + "step": 6632 + }, + { + "epoch": 0.7, + "grad_norm": 2.215059832781754, + "learning_rate": 2.207552983368608e-06, + "loss": 0.5855, + "step": 6633 + }, + { + "epoch": 0.7, + "grad_norm": 3.733877578923876, + "learning_rate": 2.20613946017472e-06, + "loss": 0.561, + "step": 6634 + }, + { + "epoch": 0.7, + "grad_norm": 5.184821592844948, + "learning_rate": 2.2047262615634723e-06, + "loss": 0.5693, + "step": 6635 + }, + { + "epoch": 0.7, + "grad_norm": 2.3808532135782983, + "learning_rate": 2.203313387699046e-06, + "loss": 0.61, + "step": 6636 + }, + { + "epoch": 0.7, + "grad_norm": 9.185810849417573, + "learning_rate": 2.201900838745586e-06, + "loss": 0.6877, + "step": 6637 + }, + { + "epoch": 0.7, + "grad_norm": 2.901166357224412, + "learning_rate": 2.2004886148671978e-06, + "loss": 0.6226, + "step": 6638 + }, + { + "epoch": 0.7, + "grad_norm": 2.8487030064529053, + "learning_rate": 2.1990767162279515e-06, + "loss": 0.5769, + "step": 6639 + }, + { + "epoch": 0.7, + "grad_norm": 0.9023289850683952, + "learning_rate": 2.197665142991876e-06, + "loss": 0.4856, + "step": 6640 + }, + { + "epoch": 0.7, + "grad_norm": 2.343960901159718, + "learning_rate": 2.196253895322961e-06, + "loss": 0.5961, + "step": 6641 + }, + { + "epoch": 0.7, + "grad_norm": 4.467203125378956, + "learning_rate": 2.1948429733851646e-06, + "loss": 0.6625, + "step": 6642 + }, + { + "epoch": 0.7, + "grad_norm": 2.702936295244413, + "learning_rate": 2.193432377342404e-06, + "loss": 0.6413, + "step": 6643 + }, + { + "epoch": 0.7, + "grad_norm": 2.8056618447753277, + "learning_rate": 2.1920221073585564e-06, + "loss": 0.5931, + "step": 6644 + }, + { + "epoch": 0.7, + "grad_norm": 2.79010389004188, + "learning_rate": 2.190612163597462e-06, + "loss": 0.6772, + "step": 6645 + }, + { + "epoch": 0.7, + "grad_norm": 4.5825220168109295, + "learning_rate": 2.189202546222925e-06, + "loss": 0.6462, + "step": 6646 + }, + { + "epoch": 0.7, + "grad_norm": 2.6022305800555974, + "learning_rate": 2.1877932553987114e-06, + "loss": 0.6701, + "step": 6647 + }, + { + "epoch": 0.7, + "grad_norm": 2.1361674026822683, + "learning_rate": 2.1863842912885496e-06, + "loss": 0.569, + "step": 6648 + }, + { + "epoch": 0.7, + "grad_norm": 3.1167050044456364, + "learning_rate": 2.184975654056128e-06, + "loss": 0.745, + "step": 6649 + }, + { + "epoch": 0.7, + "grad_norm": 2.735863377596502, + "learning_rate": 2.183567343865095e-06, + "loss": 0.607, + "step": 6650 + }, + { + "epoch": 0.7, + "grad_norm": 2.4462949583121554, + "learning_rate": 2.182159360879067e-06, + "loss": 0.587, + "step": 6651 + }, + { + "epoch": 0.7, + "grad_norm": 1.9840779320355102, + "learning_rate": 2.1807517052616205e-06, + "loss": 0.5347, + "step": 6652 + }, + { + "epoch": 0.7, + "grad_norm": 16.6142045279403, + "learning_rate": 2.1793443771762912e-06, + "loss": 0.5878, + "step": 6653 + }, + { + "epoch": 0.7, + "grad_norm": 2.1547249179603583, + "learning_rate": 2.177937376786577e-06, + "loss": 0.5679, + "step": 6654 + }, + { + "epoch": 0.7, + "grad_norm": 3.7649378439066905, + "learning_rate": 2.17653070425594e-06, + "loss": 0.6205, + "step": 6655 + }, + { + "epoch": 0.7, + "grad_norm": 3.1627346207064804, + "learning_rate": 2.175124359747806e-06, + "loss": 0.653, + "step": 6656 + }, + { + "epoch": 0.7, + "grad_norm": 3.3916267839635585, + "learning_rate": 2.173718343425558e-06, + "loss": 0.5896, + "step": 6657 + }, + { + "epoch": 0.7, + "grad_norm": 3.269804794003583, + "learning_rate": 2.1723126554525415e-06, + "loss": 0.6527, + "step": 6658 + }, + { + "epoch": 0.7, + "grad_norm": 1.0320795601481907, + "learning_rate": 2.1709072959920667e-06, + "loss": 0.525, + "step": 6659 + }, + { + "epoch": 0.7, + "grad_norm": 2.3228826509557523, + "learning_rate": 2.169502265207404e-06, + "loss": 0.5739, + "step": 6660 + }, + { + "epoch": 0.7, + "grad_norm": 2.952927663361331, + "learning_rate": 2.168097563261787e-06, + "loss": 0.6047, + "step": 6661 + }, + { + "epoch": 0.7, + "grad_norm": 2.120080095383413, + "learning_rate": 2.1666931903184103e-06, + "loss": 0.5871, + "step": 6662 + }, + { + "epoch": 0.7, + "grad_norm": 2.141689567051116, + "learning_rate": 2.1652891465404257e-06, + "loss": 0.6272, + "step": 6663 + }, + { + "epoch": 0.7, + "grad_norm": 2.7913975017146035, + "learning_rate": 2.1638854320909542e-06, + "loss": 0.7013, + "step": 6664 + }, + { + "epoch": 0.7, + "grad_norm": 2.2400371485167927, + "learning_rate": 2.162482047133076e-06, + "loss": 0.56, + "step": 6665 + }, + { + "epoch": 0.7, + "grad_norm": 3.3342661126733666, + "learning_rate": 2.161078991829832e-06, + "loss": 0.635, + "step": 6666 + }, + { + "epoch": 0.7, + "grad_norm": 3.536358295411925, + "learning_rate": 2.159676266344222e-06, + "loss": 0.6228, + "step": 6667 + }, + { + "epoch": 0.7, + "grad_norm": 2.503851980349559, + "learning_rate": 2.1582738708392127e-06, + "loss": 0.5741, + "step": 6668 + }, + { + "epoch": 0.7, + "grad_norm": 3.0980965236347804, + "learning_rate": 2.1568718054777322e-06, + "loss": 0.6151, + "step": 6669 + }, + { + "epoch": 0.7, + "grad_norm": 3.792431692180165, + "learning_rate": 2.1554700704226673e-06, + "loss": 0.6017, + "step": 6670 + }, + { + "epoch": 0.7, + "grad_norm": 2.0346739479145444, + "learning_rate": 2.1540686658368643e-06, + "loss": 0.5778, + "step": 6671 + }, + { + "epoch": 0.7, + "grad_norm": 2.666257894353589, + "learning_rate": 2.1526675918831373e-06, + "loss": 0.6578, + "step": 6672 + }, + { + "epoch": 0.7, + "grad_norm": 2.726228194370597, + "learning_rate": 2.151266848724259e-06, + "loss": 0.6432, + "step": 6673 + }, + { + "epoch": 0.7, + "grad_norm": 2.5487776785030176, + "learning_rate": 2.149866436522965e-06, + "loss": 0.6072, + "step": 6674 + }, + { + "epoch": 0.7, + "grad_norm": 2.8073196930271074, + "learning_rate": 2.1484663554419495e-06, + "loss": 0.6205, + "step": 6675 + }, + { + "epoch": 0.7, + "grad_norm": 2.2783150428762, + "learning_rate": 2.147066605643868e-06, + "loss": 0.6129, + "step": 6676 + }, + { + "epoch": 0.7, + "grad_norm": 6.203415894181792, + "learning_rate": 2.145667187291341e-06, + "loss": 0.6235, + "step": 6677 + }, + { + "epoch": 0.7, + "grad_norm": 2.0439147089340683, + "learning_rate": 2.144268100546951e-06, + "loss": 0.5993, + "step": 6678 + }, + { + "epoch": 0.7, + "grad_norm": 2.351141372515001, + "learning_rate": 2.1428693455732384e-06, + "loss": 0.5968, + "step": 6679 + }, + { + "epoch": 0.7, + "grad_norm": 2.2933943463025726, + "learning_rate": 2.141470922532704e-06, + "loss": 0.6489, + "step": 6680 + }, + { + "epoch": 0.7, + "grad_norm": 2.3273415621641216, + "learning_rate": 2.140072831587815e-06, + "loss": 0.635, + "step": 6681 + }, + { + "epoch": 0.7, + "grad_norm": 2.2821768908749736, + "learning_rate": 2.138675072900997e-06, + "loss": 0.5997, + "step": 6682 + }, + { + "epoch": 0.7, + "grad_norm": 3.089940278859609, + "learning_rate": 2.1372776466346414e-06, + "loss": 0.5724, + "step": 6683 + }, + { + "epoch": 0.7, + "grad_norm": 2.251126225010119, + "learning_rate": 2.1358805529510896e-06, + "loss": 0.617, + "step": 6684 + }, + { + "epoch": 0.7, + "grad_norm": 2.760541069813522, + "learning_rate": 2.134483792012656e-06, + "loss": 0.6546, + "step": 6685 + }, + { + "epoch": 0.7, + "grad_norm": 2.2370760853157, + "learning_rate": 2.1330873639816125e-06, + "loss": 0.5493, + "step": 6686 + }, + { + "epoch": 0.7, + "grad_norm": 2.9935072593068037, + "learning_rate": 2.131691269020193e-06, + "loss": 0.556, + "step": 6687 + }, + { + "epoch": 0.7, + "grad_norm": 2.0100434248241155, + "learning_rate": 2.130295507290591e-06, + "loss": 0.6344, + "step": 6688 + }, + { + "epoch": 0.7, + "grad_norm": 2.244649889387862, + "learning_rate": 2.1289000789549586e-06, + "loss": 0.5579, + "step": 6689 + }, + { + "epoch": 0.7, + "grad_norm": 2.704981131283807, + "learning_rate": 2.1275049841754165e-06, + "loss": 0.5746, + "step": 6690 + }, + { + "epoch": 0.7, + "grad_norm": 2.29539621168737, + "learning_rate": 2.126110223114043e-06, + "loss": 0.6104, + "step": 6691 + }, + { + "epoch": 0.7, + "grad_norm": 2.205856297808691, + "learning_rate": 2.1247157959328763e-06, + "loss": 0.5662, + "step": 6692 + }, + { + "epoch": 0.7, + "grad_norm": 0.8836298348060035, + "learning_rate": 2.1233217027939153e-06, + "loss": 0.562, + "step": 6693 + }, + { + "epoch": 0.7, + "grad_norm": 2.3976000029984372, + "learning_rate": 2.121927943859123e-06, + "loss": 0.6278, + "step": 6694 + }, + { + "epoch": 0.7, + "grad_norm": 2.331546227933603, + "learning_rate": 2.1205345192904224e-06, + "loss": 0.6523, + "step": 6695 + }, + { + "epoch": 0.7, + "grad_norm": 2.3605376267410083, + "learning_rate": 2.1191414292497e-06, + "loss": 0.6496, + "step": 6696 + }, + { + "epoch": 0.7, + "grad_norm": 2.102974401678194, + "learning_rate": 2.1177486738987984e-06, + "loss": 0.6491, + "step": 6697 + }, + { + "epoch": 0.7, + "grad_norm": 2.26075427680184, + "learning_rate": 2.116356253399522e-06, + "loss": 0.5821, + "step": 6698 + }, + { + "epoch": 0.7, + "grad_norm": 0.8908330043379147, + "learning_rate": 2.114964167913641e-06, + "loss": 0.537, + "step": 6699 + }, + { + "epoch": 0.71, + "grad_norm": 2.579961013538668, + "learning_rate": 2.1135724176028844e-06, + "loss": 0.5184, + "step": 6700 + }, + { + "epoch": 0.71, + "grad_norm": 1.08377361893251, + "learning_rate": 2.1121810026289404e-06, + "loss": 0.544, + "step": 6701 + }, + { + "epoch": 0.71, + "grad_norm": 6.241375532358382, + "learning_rate": 2.110789923153458e-06, + "loss": 0.6789, + "step": 6702 + }, + { + "epoch": 0.71, + "grad_norm": 2.9160714063650266, + "learning_rate": 2.109399179338051e-06, + "loss": 0.5792, + "step": 6703 + }, + { + "epoch": 0.71, + "grad_norm": 2.1537715668446755, + "learning_rate": 2.1080087713442928e-06, + "loss": 0.5489, + "step": 6704 + }, + { + "epoch": 0.71, + "grad_norm": 2.053631859586391, + "learning_rate": 2.1066186993337158e-06, + "loss": 0.5573, + "step": 6705 + }, + { + "epoch": 0.71, + "grad_norm": 5.425494797474714, + "learning_rate": 2.105228963467812e-06, + "loss": 0.5586, + "step": 6706 + }, + { + "epoch": 0.71, + "grad_norm": 2.748837747672611, + "learning_rate": 2.10383956390804e-06, + "loss": 0.5483, + "step": 6707 + }, + { + "epoch": 0.71, + "grad_norm": 2.2002541684184727, + "learning_rate": 2.1024505008158153e-06, + "loss": 0.6831, + "step": 6708 + }, + { + "epoch": 0.71, + "grad_norm": 2.309736548355432, + "learning_rate": 2.101061774352517e-06, + "loss": 0.5418, + "step": 6709 + }, + { + "epoch": 0.71, + "grad_norm": 2.4178982369090267, + "learning_rate": 2.099673384679482e-06, + "loss": 0.5158, + "step": 6710 + }, + { + "epoch": 0.71, + "grad_norm": 2.18817854947126, + "learning_rate": 2.0982853319580075e-06, + "loss": 0.5532, + "step": 6711 + }, + { + "epoch": 0.71, + "grad_norm": 15.098923040125959, + "learning_rate": 2.096897616349355e-06, + "loss": 0.6052, + "step": 6712 + }, + { + "epoch": 0.71, + "grad_norm": 4.248992003023729, + "learning_rate": 2.0955102380147474e-06, + "loss": 0.6149, + "step": 6713 + }, + { + "epoch": 0.71, + "grad_norm": 5.785961960691239, + "learning_rate": 2.0941231971153644e-06, + "loss": 0.6334, + "step": 6714 + }, + { + "epoch": 0.71, + "grad_norm": 2.878088474717053, + "learning_rate": 2.0927364938123457e-06, + "loss": 0.6535, + "step": 6715 + }, + { + "epoch": 0.71, + "grad_norm": 2.4454378764029507, + "learning_rate": 2.0913501282667975e-06, + "loss": 0.6169, + "step": 6716 + }, + { + "epoch": 0.71, + "grad_norm": 3.1028098404336815, + "learning_rate": 2.0899641006397836e-06, + "loss": 0.6849, + "step": 6717 + }, + { + "epoch": 0.71, + "grad_norm": 2.5675573992295457, + "learning_rate": 2.0885784110923325e-06, + "loss": 0.6658, + "step": 6718 + }, + { + "epoch": 0.71, + "grad_norm": 3.0159340806610255, + "learning_rate": 2.087193059785421e-06, + "loss": 0.6471, + "step": 6719 + }, + { + "epoch": 0.71, + "grad_norm": 2.879084843294698, + "learning_rate": 2.08580804688e-06, + "loss": 0.5826, + "step": 6720 + }, + { + "epoch": 0.71, + "grad_norm": 2.7524142612645415, + "learning_rate": 2.084423372536976e-06, + "loss": 0.6057, + "step": 6721 + }, + { + "epoch": 0.71, + "grad_norm": 3.2595597192294785, + "learning_rate": 2.083039036917219e-06, + "loss": 0.6349, + "step": 6722 + }, + { + "epoch": 0.71, + "grad_norm": 3.6944542500831257, + "learning_rate": 2.0816550401815538e-06, + "loss": 0.5963, + "step": 6723 + }, + { + "epoch": 0.71, + "grad_norm": 2.1565697221035833, + "learning_rate": 2.0802713824907683e-06, + "loss": 0.5843, + "step": 6724 + }, + { + "epoch": 0.71, + "grad_norm": 2.4041247947594564, + "learning_rate": 2.0788880640056137e-06, + "loss": 0.5929, + "step": 6725 + }, + { + "epoch": 0.71, + "grad_norm": 2.6733679379720794, + "learning_rate": 2.077505084886802e-06, + "loss": 0.6643, + "step": 6726 + }, + { + "epoch": 0.71, + "grad_norm": 2.2387112348073126, + "learning_rate": 2.0761224452950003e-06, + "loss": 0.6349, + "step": 6727 + }, + { + "epoch": 0.71, + "grad_norm": 2.8590997656724713, + "learning_rate": 2.07474014539084e-06, + "loss": 0.6274, + "step": 6728 + }, + { + "epoch": 0.71, + "grad_norm": 2.6204018448125233, + "learning_rate": 2.0733581853349128e-06, + "loss": 0.699, + "step": 6729 + }, + { + "epoch": 0.71, + "grad_norm": 3.7073322849788988, + "learning_rate": 2.071976565287772e-06, + "loss": 0.553, + "step": 6730 + }, + { + "epoch": 0.71, + "grad_norm": 2.0723138136947097, + "learning_rate": 2.0705952854099337e-06, + "loss": 0.5749, + "step": 6731 + }, + { + "epoch": 0.71, + "grad_norm": 2.9024005003511815, + "learning_rate": 2.069214345861863e-06, + "loss": 0.5859, + "step": 6732 + }, + { + "epoch": 0.71, + "grad_norm": 3.007051839789599, + "learning_rate": 2.067833746803998e-06, + "loss": 0.5448, + "step": 6733 + }, + { + "epoch": 0.71, + "grad_norm": 2.16193794853113, + "learning_rate": 2.0664534883967315e-06, + "loss": 0.6046, + "step": 6734 + }, + { + "epoch": 0.71, + "grad_norm": 2.0022011347301887, + "learning_rate": 2.065073570800421e-06, + "loss": 0.6004, + "step": 6735 + }, + { + "epoch": 0.71, + "grad_norm": 2.5866230804925356, + "learning_rate": 2.0636939941753793e-06, + "loss": 0.6142, + "step": 6736 + }, + { + "epoch": 0.71, + "grad_norm": 0.9455346482011558, + "learning_rate": 2.0623147586818786e-06, + "loss": 0.5795, + "step": 6737 + }, + { + "epoch": 0.71, + "grad_norm": 2.8544304813633348, + "learning_rate": 2.060935864480158e-06, + "loss": 0.6272, + "step": 6738 + }, + { + "epoch": 0.71, + "grad_norm": 3.139954944371839, + "learning_rate": 2.0595573117304147e-06, + "loss": 0.5931, + "step": 6739 + }, + { + "epoch": 0.71, + "grad_norm": 2.141121600735202, + "learning_rate": 2.0581791005928024e-06, + "loss": 0.5734, + "step": 6740 + }, + { + "epoch": 0.71, + "grad_norm": 2.1663356934725844, + "learning_rate": 2.0568012312274367e-06, + "loss": 0.4842, + "step": 6741 + }, + { + "epoch": 0.71, + "grad_norm": 2.1593137328818948, + "learning_rate": 2.0554237037943966e-06, + "loss": 0.5317, + "step": 6742 + }, + { + "epoch": 0.71, + "grad_norm": 4.225299678738348, + "learning_rate": 2.054046518453718e-06, + "loss": 0.6648, + "step": 6743 + }, + { + "epoch": 0.71, + "grad_norm": 2.46890585145027, + "learning_rate": 2.0526696753654008e-06, + "loss": 0.648, + "step": 6744 + }, + { + "epoch": 0.71, + "grad_norm": 8.384393534696619, + "learning_rate": 2.051293174689401e-06, + "loss": 0.6517, + "step": 6745 + }, + { + "epoch": 0.71, + "grad_norm": 2.3553993298810063, + "learning_rate": 2.0499170165856343e-06, + "loss": 0.5796, + "step": 6746 + }, + { + "epoch": 0.71, + "grad_norm": 3.2778199669618373, + "learning_rate": 2.048541201213981e-06, + "loss": 0.6517, + "step": 6747 + }, + { + "epoch": 0.71, + "grad_norm": 2.1601593601154647, + "learning_rate": 2.0471657287342813e-06, + "loss": 0.598, + "step": 6748 + }, + { + "epoch": 0.71, + "grad_norm": 2.7950345405844166, + "learning_rate": 2.0457905993063306e-06, + "loss": 0.6188, + "step": 6749 + }, + { + "epoch": 0.71, + "grad_norm": 2.3229400348434717, + "learning_rate": 2.044415813089887e-06, + "loss": 0.5894, + "step": 6750 + }, + { + "epoch": 0.71, + "grad_norm": 2.5323121960930086, + "learning_rate": 2.0430413702446707e-06, + "loss": 0.6715, + "step": 6751 + }, + { + "epoch": 0.71, + "grad_norm": 2.1152568643865597, + "learning_rate": 2.0416672709303597e-06, + "loss": 0.591, + "step": 6752 + }, + { + "epoch": 0.71, + "grad_norm": 3.241190616242863, + "learning_rate": 2.0402935153065976e-06, + "loss": 0.5456, + "step": 6753 + }, + { + "epoch": 0.71, + "grad_norm": 2.8952230145025495, + "learning_rate": 2.0389201035329754e-06, + "loss": 0.6587, + "step": 6754 + }, + { + "epoch": 0.71, + "grad_norm": 4.913524657839511, + "learning_rate": 2.0375470357690564e-06, + "loss": 0.6103, + "step": 6755 + }, + { + "epoch": 0.71, + "grad_norm": 3.357411676416585, + "learning_rate": 2.036174312174359e-06, + "loss": 0.6013, + "step": 6756 + }, + { + "epoch": 0.71, + "grad_norm": 2.164348116661981, + "learning_rate": 2.034801932908364e-06, + "loss": 0.641, + "step": 6757 + }, + { + "epoch": 0.71, + "grad_norm": 2.285714519634651, + "learning_rate": 2.033429898130509e-06, + "loss": 0.5707, + "step": 6758 + }, + { + "epoch": 0.71, + "grad_norm": 2.314588495421655, + "learning_rate": 2.032058208000191e-06, + "loss": 0.6353, + "step": 6759 + }, + { + "epoch": 0.71, + "grad_norm": 3.1963968095024904, + "learning_rate": 2.030686862676771e-06, + "loss": 0.6579, + "step": 6760 + }, + { + "epoch": 0.71, + "grad_norm": 2.234190092875219, + "learning_rate": 2.0293158623195702e-06, + "loss": 0.6601, + "step": 6761 + }, + { + "epoch": 0.71, + "grad_norm": 2.350692178674305, + "learning_rate": 2.0279452070878647e-06, + "loss": 0.6188, + "step": 6762 + }, + { + "epoch": 0.71, + "grad_norm": 2.564102840538574, + "learning_rate": 2.026574897140892e-06, + "loss": 0.6149, + "step": 6763 + }, + { + "epoch": 0.71, + "grad_norm": 2.174466257356336, + "learning_rate": 2.0252049326378524e-06, + "loss": 0.605, + "step": 6764 + }, + { + "epoch": 0.71, + "grad_norm": 2.149315755348158, + "learning_rate": 2.0238353137379047e-06, + "loss": 0.6217, + "step": 6765 + }, + { + "epoch": 0.71, + "grad_norm": 2.22159691006164, + "learning_rate": 2.02246604060017e-06, + "loss": 0.4916, + "step": 6766 + }, + { + "epoch": 0.71, + "grad_norm": 2.0623059295189003, + "learning_rate": 2.0210971133837208e-06, + "loss": 0.6159, + "step": 6767 + }, + { + "epoch": 0.71, + "grad_norm": 5.244551913598113, + "learning_rate": 2.0197285322475975e-06, + "loss": 0.5866, + "step": 6768 + }, + { + "epoch": 0.71, + "grad_norm": 3.429308309812128, + "learning_rate": 2.0183602973507977e-06, + "loss": 0.6469, + "step": 6769 + }, + { + "epoch": 0.71, + "grad_norm": 2.2426310766438093, + "learning_rate": 2.016992408852282e-06, + "loss": 0.6122, + "step": 6770 + }, + { + "epoch": 0.71, + "grad_norm": 3.147899821152736, + "learning_rate": 2.0156248669109645e-06, + "loss": 0.6084, + "step": 6771 + }, + { + "epoch": 0.71, + "grad_norm": 2.5207952773245057, + "learning_rate": 2.014257671685722e-06, + "loss": 0.6181, + "step": 6772 + }, + { + "epoch": 0.71, + "grad_norm": 2.290492088782644, + "learning_rate": 2.012890823335392e-06, + "loss": 0.6861, + "step": 6773 + }, + { + "epoch": 0.71, + "grad_norm": 0.9788946060137566, + "learning_rate": 2.011524322018773e-06, + "loss": 0.5617, + "step": 6774 + }, + { + "epoch": 0.71, + "grad_norm": 2.02202729427753, + "learning_rate": 2.01015816789462e-06, + "loss": 0.5491, + "step": 6775 + }, + { + "epoch": 0.71, + "grad_norm": 2.644055938189567, + "learning_rate": 2.0087923611216452e-06, + "loss": 0.6326, + "step": 6776 + }, + { + "epoch": 0.71, + "grad_norm": 2.6229423897634088, + "learning_rate": 2.0074269018585286e-06, + "loss": 0.6593, + "step": 6777 + }, + { + "epoch": 0.71, + "grad_norm": 2.657797931574321, + "learning_rate": 2.006061790263903e-06, + "loss": 0.6879, + "step": 6778 + }, + { + "epoch": 0.71, + "grad_norm": 3.731539046733333, + "learning_rate": 2.004697026496366e-06, + "loss": 0.6025, + "step": 6779 + }, + { + "epoch": 0.71, + "grad_norm": 2.3300238922256162, + "learning_rate": 2.00333261071447e-06, + "loss": 0.5737, + "step": 6780 + }, + { + "epoch": 0.71, + "grad_norm": 2.5105007437944, + "learning_rate": 2.001968543076727e-06, + "loss": 0.5984, + "step": 6781 + }, + { + "epoch": 0.71, + "grad_norm": 2.9355723240439238, + "learning_rate": 2.0006048237416127e-06, + "loss": 0.6853, + "step": 6782 + }, + { + "epoch": 0.71, + "grad_norm": 2.7838842692244414, + "learning_rate": 1.9992414528675607e-06, + "loss": 0.5972, + "step": 6783 + }, + { + "epoch": 0.71, + "grad_norm": 2.409380507657977, + "learning_rate": 1.997878430612963e-06, + "loss": 0.6305, + "step": 6784 + }, + { + "epoch": 0.71, + "grad_norm": 2.610091326027608, + "learning_rate": 1.9965157571361688e-06, + "loss": 0.6465, + "step": 6785 + }, + { + "epoch": 0.71, + "grad_norm": 2.0465580485685355, + "learning_rate": 1.9951534325954913e-06, + "loss": 0.5599, + "step": 6786 + }, + { + "epoch": 0.71, + "grad_norm": 2.7696890221497825, + "learning_rate": 1.9937914571492024e-06, + "loss": 0.5447, + "step": 6787 + }, + { + "epoch": 0.71, + "grad_norm": 2.854812905102868, + "learning_rate": 1.9924298309555355e-06, + "loss": 0.5373, + "step": 6788 + }, + { + "epoch": 0.71, + "grad_norm": 4.144295935862878, + "learning_rate": 1.991068554172673e-06, + "loss": 0.5129, + "step": 6789 + }, + { + "epoch": 0.71, + "grad_norm": 2.235492207734396, + "learning_rate": 1.9897076269587686e-06, + "loss": 0.5211, + "step": 6790 + }, + { + "epoch": 0.71, + "grad_norm": 1.0026420194820682, + "learning_rate": 1.98834704947193e-06, + "loss": 0.5368, + "step": 6791 + }, + { + "epoch": 0.71, + "grad_norm": 4.2741376437674825, + "learning_rate": 1.9869868218702266e-06, + "loss": 0.6746, + "step": 6792 + }, + { + "epoch": 0.71, + "grad_norm": 2.4315638104365025, + "learning_rate": 1.985626944311685e-06, + "loss": 0.4932, + "step": 6793 + }, + { + "epoch": 0.71, + "grad_norm": 3.005978231078567, + "learning_rate": 1.984267416954289e-06, + "loss": 0.694, + "step": 6794 + }, + { + "epoch": 0.72, + "grad_norm": 0.9172999980041083, + "learning_rate": 1.9829082399559872e-06, + "loss": 0.5798, + "step": 6795 + }, + { + "epoch": 0.72, + "grad_norm": 2.2051339131239667, + "learning_rate": 1.9815494134746866e-06, + "loss": 0.6441, + "step": 6796 + }, + { + "epoch": 0.72, + "grad_norm": 2.4405798502942733, + "learning_rate": 1.98019093766825e-06, + "loss": 0.6649, + "step": 6797 + }, + { + "epoch": 0.72, + "grad_norm": 2.505836213580318, + "learning_rate": 1.9788328126944984e-06, + "loss": 0.5017, + "step": 6798 + }, + { + "epoch": 0.72, + "grad_norm": 2.5383152998963965, + "learning_rate": 1.9774750387112176e-06, + "loss": 0.6945, + "step": 6799 + }, + { + "epoch": 0.72, + "grad_norm": 2.295043766361787, + "learning_rate": 1.976117615876149e-06, + "loss": 0.5951, + "step": 6800 + }, + { + "epoch": 0.72, + "grad_norm": 3.0818322840049546, + "learning_rate": 1.974760544346999e-06, + "loss": 0.6048, + "step": 6801 + }, + { + "epoch": 0.72, + "grad_norm": 2.1718689507776063, + "learning_rate": 1.9734038242814203e-06, + "loss": 0.5392, + "step": 6802 + }, + { + "epoch": 0.72, + "grad_norm": 4.5743079582613, + "learning_rate": 1.9720474558370356e-06, + "loss": 0.6438, + "step": 6803 + }, + { + "epoch": 0.72, + "grad_norm": 2.4649750346849424, + "learning_rate": 1.970691439171425e-06, + "loss": 0.5826, + "step": 6804 + }, + { + "epoch": 0.72, + "grad_norm": 2.251453377955877, + "learning_rate": 1.9693357744421282e-06, + "loss": 0.6129, + "step": 6805 + }, + { + "epoch": 0.72, + "grad_norm": 2.7067846238307616, + "learning_rate": 1.96798046180664e-06, + "loss": 0.5928, + "step": 6806 + }, + { + "epoch": 0.72, + "grad_norm": 2.9657749101823887, + "learning_rate": 1.966625501422415e-06, + "loss": 0.6502, + "step": 6807 + }, + { + "epoch": 0.72, + "grad_norm": 2.885763329620718, + "learning_rate": 1.965270893446871e-06, + "loss": 0.6607, + "step": 6808 + }, + { + "epoch": 0.72, + "grad_norm": 2.516314689530151, + "learning_rate": 1.963916638037384e-06, + "loss": 0.6097, + "step": 6809 + }, + { + "epoch": 0.72, + "grad_norm": 2.819130855149682, + "learning_rate": 1.9625627353512854e-06, + "loss": 0.647, + "step": 6810 + }, + { + "epoch": 0.72, + "grad_norm": 3.1510683550167125, + "learning_rate": 1.9612091855458663e-06, + "loss": 0.5885, + "step": 6811 + }, + { + "epoch": 0.72, + "grad_norm": 2.8987176112644457, + "learning_rate": 1.9598559887783797e-06, + "loss": 0.5884, + "step": 6812 + }, + { + "epoch": 0.72, + "grad_norm": 2.9904600312949907, + "learning_rate": 1.958503145206036e-06, + "loss": 0.5806, + "step": 6813 + }, + { + "epoch": 0.72, + "grad_norm": 2.5080351376008374, + "learning_rate": 1.9571506549860065e-06, + "loss": 0.6038, + "step": 6814 + }, + { + "epoch": 0.72, + "grad_norm": 2.939203442577235, + "learning_rate": 1.955798518275418e-06, + "loss": 0.5762, + "step": 6815 + }, + { + "epoch": 0.72, + "grad_norm": 2.228977059412211, + "learning_rate": 1.954446735231356e-06, + "loss": 0.5609, + "step": 6816 + }, + { + "epoch": 0.72, + "grad_norm": 2.63612086695577, + "learning_rate": 1.953095306010869e-06, + "loss": 0.5973, + "step": 6817 + }, + { + "epoch": 0.72, + "grad_norm": 2.463307921855391, + "learning_rate": 1.9517442307709626e-06, + "loss": 0.5971, + "step": 6818 + }, + { + "epoch": 0.72, + "grad_norm": 1.9678254750031574, + "learning_rate": 1.9503935096686004e-06, + "loss": 0.5483, + "step": 6819 + }, + { + "epoch": 0.72, + "grad_norm": 2.4727076802540133, + "learning_rate": 1.9490431428607027e-06, + "loss": 0.626, + "step": 6820 + }, + { + "epoch": 0.72, + "grad_norm": 2.0978475149269795, + "learning_rate": 1.947693130504153e-06, + "loss": 0.5705, + "step": 6821 + }, + { + "epoch": 0.72, + "grad_norm": 2.10936758831189, + "learning_rate": 1.9463434727557927e-06, + "loss": 0.6207, + "step": 6822 + }, + { + "epoch": 0.72, + "grad_norm": 2.4647814076539816, + "learning_rate": 1.9449941697724233e-06, + "loss": 0.5815, + "step": 6823 + }, + { + "epoch": 0.72, + "grad_norm": 2.2186953732210632, + "learning_rate": 1.943645221710797e-06, + "loss": 0.5623, + "step": 6824 + }, + { + "epoch": 0.72, + "grad_norm": 2.3686123006295414, + "learning_rate": 1.942296628727634e-06, + "loss": 0.6202, + "step": 6825 + }, + { + "epoch": 0.72, + "grad_norm": 2.4056025600068973, + "learning_rate": 1.9409483909796096e-06, + "loss": 0.5397, + "step": 6826 + }, + { + "epoch": 0.72, + "grad_norm": 2.843753987586021, + "learning_rate": 1.93960050862336e-06, + "loss": 0.6794, + "step": 6827 + }, + { + "epoch": 0.72, + "grad_norm": 3.0253380845711915, + "learning_rate": 1.9382529818154765e-06, + "loss": 0.6079, + "step": 6828 + }, + { + "epoch": 0.72, + "grad_norm": 2.226590014372156, + "learning_rate": 1.9369058107125094e-06, + "loss": 0.5139, + "step": 6829 + }, + { + "epoch": 0.72, + "grad_norm": 2.6079029240171265, + "learning_rate": 1.935558995470971e-06, + "loss": 0.5842, + "step": 6830 + }, + { + "epoch": 0.72, + "grad_norm": 2.5961478591051663, + "learning_rate": 1.9342125362473313e-06, + "loss": 0.7159, + "step": 6831 + }, + { + "epoch": 0.72, + "grad_norm": 4.508735121085522, + "learning_rate": 1.9328664331980175e-06, + "loss": 0.6256, + "step": 6832 + }, + { + "epoch": 0.72, + "grad_norm": 3.0787485825721332, + "learning_rate": 1.931520686479413e-06, + "loss": 0.7014, + "step": 6833 + }, + { + "epoch": 0.72, + "grad_norm": 2.2749815114493286, + "learning_rate": 1.9301752962478646e-06, + "loss": 0.6009, + "step": 6834 + }, + { + "epoch": 0.72, + "grad_norm": 1.9794025506603408, + "learning_rate": 1.9288302626596772e-06, + "loss": 0.6428, + "step": 6835 + }, + { + "epoch": 0.72, + "grad_norm": 5.039751673052599, + "learning_rate": 1.9274855858711157e-06, + "loss": 0.6417, + "step": 6836 + }, + { + "epoch": 0.72, + "grad_norm": 2.8877557291619853, + "learning_rate": 1.9261412660383927e-06, + "loss": 0.6086, + "step": 6837 + }, + { + "epoch": 0.72, + "grad_norm": 2.692848677613801, + "learning_rate": 1.924797303317692e-06, + "loss": 0.6517, + "step": 6838 + }, + { + "epoch": 0.72, + "grad_norm": 2.4817682883772334, + "learning_rate": 1.9234536978651514e-06, + "loss": 0.6697, + "step": 6839 + }, + { + "epoch": 0.72, + "grad_norm": 3.126381340774734, + "learning_rate": 1.922110449836869e-06, + "loss": 0.6342, + "step": 6840 + }, + { + "epoch": 0.72, + "grad_norm": 2.565024160486094, + "learning_rate": 1.920767559388896e-06, + "loss": 0.629, + "step": 6841 + }, + { + "epoch": 0.72, + "grad_norm": 2.208434714049697, + "learning_rate": 1.919425026677246e-06, + "loss": 0.6768, + "step": 6842 + }, + { + "epoch": 0.72, + "grad_norm": 2.7771674907930843, + "learning_rate": 1.9180828518578907e-06, + "loss": 0.6414, + "step": 6843 + }, + { + "epoch": 0.72, + "grad_norm": 2.972741578017247, + "learning_rate": 1.9167410350867634e-06, + "loss": 0.5638, + "step": 6844 + }, + { + "epoch": 0.72, + "grad_norm": 2.408080059628293, + "learning_rate": 1.9153995765197492e-06, + "loss": 0.5538, + "step": 6845 + }, + { + "epoch": 0.72, + "grad_norm": 2.463378183778841, + "learning_rate": 1.9140584763126942e-06, + "loss": 0.6613, + "step": 6846 + }, + { + "epoch": 0.72, + "grad_norm": 3.6438629469274177, + "learning_rate": 1.912717734621404e-06, + "loss": 0.6311, + "step": 6847 + }, + { + "epoch": 0.72, + "grad_norm": 2.4515690885324446, + "learning_rate": 1.911377351601644e-06, + "loss": 0.587, + "step": 6848 + }, + { + "epoch": 0.72, + "grad_norm": 2.9571991246007077, + "learning_rate": 1.910037327409136e-06, + "loss": 0.5853, + "step": 6849 + }, + { + "epoch": 0.72, + "grad_norm": 1.0579456833316514, + "learning_rate": 1.9086976621995595e-06, + "loss": 0.5346, + "step": 6850 + }, + { + "epoch": 0.72, + "grad_norm": 2.360162048127551, + "learning_rate": 1.9073583561285507e-06, + "loss": 0.5493, + "step": 6851 + }, + { + "epoch": 0.72, + "grad_norm": 2.9782767148226186, + "learning_rate": 1.9060194093517082e-06, + "loss": 0.6485, + "step": 6852 + }, + { + "epoch": 0.72, + "grad_norm": 3.172949041709697, + "learning_rate": 1.9046808220245888e-06, + "loss": 0.6456, + "step": 6853 + }, + { + "epoch": 0.72, + "grad_norm": 2.4126025762580485, + "learning_rate": 1.903342594302704e-06, + "loss": 0.6233, + "step": 6854 + }, + { + "epoch": 0.72, + "grad_norm": 3.111218794053248, + "learning_rate": 1.9020047263415226e-06, + "loss": 0.564, + "step": 6855 + }, + { + "epoch": 0.72, + "grad_norm": 2.112267233952676, + "learning_rate": 1.9006672182964776e-06, + "loss": 0.5294, + "step": 6856 + }, + { + "epoch": 0.72, + "grad_norm": 2.1191102063033305, + "learning_rate": 1.899330070322955e-06, + "loss": 0.5833, + "step": 6857 + }, + { + "epoch": 0.72, + "grad_norm": 2.8084622563861643, + "learning_rate": 1.8979932825763058e-06, + "loss": 0.6078, + "step": 6858 + }, + { + "epoch": 0.72, + "grad_norm": 2.731001054966648, + "learning_rate": 1.8966568552118265e-06, + "loss": 0.6116, + "step": 6859 + }, + { + "epoch": 0.72, + "grad_norm": 2.4118252942652885, + "learning_rate": 1.895320788384783e-06, + "loss": 0.6477, + "step": 6860 + }, + { + "epoch": 0.72, + "grad_norm": 2.4880736901897076, + "learning_rate": 1.8939850822503953e-06, + "loss": 0.6904, + "step": 6861 + }, + { + "epoch": 0.72, + "grad_norm": 2.2508815431544282, + "learning_rate": 1.8926497369638435e-06, + "loss": 0.5529, + "step": 6862 + }, + { + "epoch": 0.72, + "grad_norm": 2.475826447334696, + "learning_rate": 1.8913147526802633e-06, + "loss": 0.5821, + "step": 6863 + }, + { + "epoch": 0.72, + "grad_norm": 2.106833308302653, + "learning_rate": 1.8899801295547476e-06, + "loss": 0.6153, + "step": 6864 + }, + { + "epoch": 0.72, + "grad_norm": 2.202917393863976, + "learning_rate": 1.8886458677423497e-06, + "loss": 0.5526, + "step": 6865 + }, + { + "epoch": 0.72, + "grad_norm": 2.4715070549141216, + "learning_rate": 1.8873119673980828e-06, + "loss": 0.6657, + "step": 6866 + }, + { + "epoch": 0.72, + "grad_norm": 2.967796086555414, + "learning_rate": 1.8859784286769133e-06, + "loss": 0.5738, + "step": 6867 + }, + { + "epoch": 0.72, + "grad_norm": 2.265639351961703, + "learning_rate": 1.8846452517337665e-06, + "loss": 0.676, + "step": 6868 + }, + { + "epoch": 0.72, + "grad_norm": 2.2231883474524, + "learning_rate": 1.8833124367235294e-06, + "loss": 0.6419, + "step": 6869 + }, + { + "epoch": 0.72, + "grad_norm": 3.0688124326659016, + "learning_rate": 1.8819799838010434e-06, + "loss": 0.617, + "step": 6870 + }, + { + "epoch": 0.72, + "grad_norm": 2.5790235530152508, + "learning_rate": 1.8806478931211137e-06, + "loss": 0.6266, + "step": 6871 + }, + { + "epoch": 0.72, + "grad_norm": 2.248003195138114, + "learning_rate": 1.8793161648384905e-06, + "loss": 0.6454, + "step": 6872 + }, + { + "epoch": 0.72, + "grad_norm": 2.4974510096976377, + "learning_rate": 1.8779847991078943e-06, + "loss": 0.6192, + "step": 6873 + }, + { + "epoch": 0.72, + "grad_norm": 2.7581329611825183, + "learning_rate": 1.8766537960839997e-06, + "loss": 0.6176, + "step": 6874 + }, + { + "epoch": 0.72, + "grad_norm": 3.5694350624696978, + "learning_rate": 1.8753231559214402e-06, + "loss": 0.4594, + "step": 6875 + }, + { + "epoch": 0.72, + "grad_norm": 4.608950453790209, + "learning_rate": 1.8739928787748035e-06, + "loss": 0.5928, + "step": 6876 + }, + { + "epoch": 0.72, + "grad_norm": 2.277145432538584, + "learning_rate": 1.872662964798636e-06, + "loss": 0.6286, + "step": 6877 + }, + { + "epoch": 0.72, + "grad_norm": 2.794994081731649, + "learning_rate": 1.8713334141474454e-06, + "loss": 0.6851, + "step": 6878 + }, + { + "epoch": 0.72, + "grad_norm": 2.249886037254742, + "learning_rate": 1.8700042269756964e-06, + "loss": 0.633, + "step": 6879 + }, + { + "epoch": 0.72, + "grad_norm": 2.2734533778197794, + "learning_rate": 1.8686754034378085e-06, + "loss": 0.6386, + "step": 6880 + }, + { + "epoch": 0.72, + "grad_norm": 2.128369477606803, + "learning_rate": 1.867346943688158e-06, + "loss": 0.6784, + "step": 6881 + }, + { + "epoch": 0.72, + "grad_norm": 2.304439961783062, + "learning_rate": 1.8660188478810848e-06, + "loss": 0.5845, + "step": 6882 + }, + { + "epoch": 0.72, + "grad_norm": 2.995582630490625, + "learning_rate": 1.8646911161708824e-06, + "loss": 0.6216, + "step": 6883 + }, + { + "epoch": 0.72, + "grad_norm": 2.3675530017194064, + "learning_rate": 1.8633637487118046e-06, + "loss": 0.5987, + "step": 6884 + }, + { + "epoch": 0.72, + "grad_norm": 2.9813523750802173, + "learning_rate": 1.862036745658059e-06, + "loss": 0.6377, + "step": 6885 + }, + { + "epoch": 0.72, + "grad_norm": 2.6903692135653428, + "learning_rate": 1.8607101071638117e-06, + "loss": 0.6237, + "step": 6886 + }, + { + "epoch": 0.72, + "grad_norm": 2.329812249626914, + "learning_rate": 1.8593838333831893e-06, + "loss": 0.6296, + "step": 6887 + }, + { + "epoch": 0.72, + "grad_norm": 3.501685681461769, + "learning_rate": 1.8580579244702762e-06, + "loss": 0.6258, + "step": 6888 + }, + { + "epoch": 0.72, + "grad_norm": 1.055180231542555, + "learning_rate": 1.8567323805791116e-06, + "loss": 0.539, + "step": 6889 + }, + { + "epoch": 0.73, + "grad_norm": 2.6544723103678916, + "learning_rate": 1.8554072018636903e-06, + "loss": 0.5934, + "step": 6890 + }, + { + "epoch": 0.73, + "grad_norm": 2.4845123889284735, + "learning_rate": 1.8540823884779708e-06, + "loss": 0.5886, + "step": 6891 + }, + { + "epoch": 0.73, + "grad_norm": 3.1419531063459485, + "learning_rate": 1.8527579405758672e-06, + "loss": 0.5611, + "step": 6892 + }, + { + "epoch": 0.73, + "grad_norm": 2.4840964920780957, + "learning_rate": 1.851433858311248e-06, + "loss": 0.5448, + "step": 6893 + }, + { + "epoch": 0.73, + "grad_norm": 2.7386535448522418, + "learning_rate": 1.8501101418379398e-06, + "loss": 0.6005, + "step": 6894 + }, + { + "epoch": 0.73, + "grad_norm": 2.879316470041651, + "learning_rate": 1.8487867913097301e-06, + "loss": 0.5817, + "step": 6895 + }, + { + "epoch": 0.73, + "grad_norm": 1.0545439469322904, + "learning_rate": 1.8474638068803612e-06, + "loss": 0.5266, + "step": 6896 + }, + { + "epoch": 0.73, + "grad_norm": 3.215830096903211, + "learning_rate": 1.8461411887035368e-06, + "loss": 0.5681, + "step": 6897 + }, + { + "epoch": 0.73, + "grad_norm": 2.3679216916484243, + "learning_rate": 1.8448189369329117e-06, + "loss": 0.6838, + "step": 6898 + }, + { + "epoch": 0.73, + "grad_norm": 2.423382212663329, + "learning_rate": 1.8434970517221e-06, + "loss": 0.6508, + "step": 6899 + }, + { + "epoch": 0.73, + "grad_norm": 1.0003477907669218, + "learning_rate": 1.8421755332246765e-06, + "loss": 0.5632, + "step": 6900 + }, + { + "epoch": 0.73, + "grad_norm": 2.3039328325659962, + "learning_rate": 1.840854381594173e-06, + "loss": 0.5946, + "step": 6901 + }, + { + "epoch": 0.73, + "grad_norm": 2.9075536999442937, + "learning_rate": 1.8395335969840749e-06, + "loss": 0.6977, + "step": 6902 + }, + { + "epoch": 0.73, + "grad_norm": 2.653021560915067, + "learning_rate": 1.8382131795478265e-06, + "loss": 0.6104, + "step": 6903 + }, + { + "epoch": 0.73, + "grad_norm": 2.459736159668085, + "learning_rate": 1.8368931294388303e-06, + "loss": 0.6558, + "step": 6904 + }, + { + "epoch": 0.73, + "grad_norm": 2.145122811581977, + "learning_rate": 1.8355734468104476e-06, + "loss": 0.6376, + "step": 6905 + }, + { + "epoch": 0.73, + "grad_norm": 2.3194721091248622, + "learning_rate": 1.8342541318159967e-06, + "loss": 0.6556, + "step": 6906 + }, + { + "epoch": 0.73, + "grad_norm": 2.380316773328887, + "learning_rate": 1.8329351846087467e-06, + "loss": 0.4877, + "step": 6907 + }, + { + "epoch": 0.73, + "grad_norm": 2.059952338938169, + "learning_rate": 1.8316166053419321e-06, + "loss": 0.648, + "step": 6908 + }, + { + "epoch": 0.73, + "grad_norm": 2.3302793061982854, + "learning_rate": 1.8302983941687414e-06, + "loss": 0.665, + "step": 6909 + }, + { + "epoch": 0.73, + "grad_norm": 2.196962381204859, + "learning_rate": 1.828980551242322e-06, + "loss": 0.6074, + "step": 6910 + }, + { + "epoch": 0.73, + "grad_norm": 2.3653436980498133, + "learning_rate": 1.827663076715776e-06, + "loss": 0.5736, + "step": 6911 + }, + { + "epoch": 0.73, + "grad_norm": 2.3272350514303515, + "learning_rate": 1.8263459707421617e-06, + "loss": 0.5789, + "step": 6912 + }, + { + "epoch": 0.73, + "grad_norm": 2.843158913797961, + "learning_rate": 1.8250292334744979e-06, + "loss": 0.6096, + "step": 6913 + }, + { + "epoch": 0.73, + "grad_norm": 2.3445312077529357, + "learning_rate": 1.8237128650657621e-06, + "loss": 0.6748, + "step": 6914 + }, + { + "epoch": 0.73, + "grad_norm": 2.7392105169159904, + "learning_rate": 1.8223968656688834e-06, + "loss": 0.5316, + "step": 6915 + }, + { + "epoch": 0.73, + "grad_norm": 2.381012515570709, + "learning_rate": 1.8210812354367501e-06, + "loss": 0.6245, + "step": 6916 + }, + { + "epoch": 0.73, + "grad_norm": 2.0098706319158968, + "learning_rate": 1.8197659745222095e-06, + "loss": 0.5341, + "step": 6917 + }, + { + "epoch": 0.73, + "grad_norm": 2.3320907079732485, + "learning_rate": 1.818451083078065e-06, + "loss": 0.6104, + "step": 6918 + }, + { + "epoch": 0.73, + "grad_norm": 3.243267962515471, + "learning_rate": 1.817136561257078e-06, + "loss": 0.5681, + "step": 6919 + }, + { + "epoch": 0.73, + "grad_norm": 3.244715320509986, + "learning_rate": 1.8158224092119648e-06, + "loss": 0.5684, + "step": 6920 + }, + { + "epoch": 0.73, + "grad_norm": 2.9394817301430924, + "learning_rate": 1.8145086270953977e-06, + "loss": 0.576, + "step": 6921 + }, + { + "epoch": 0.73, + "grad_norm": 2.421731912227469, + "learning_rate": 1.8131952150600101e-06, + "loss": 0.573, + "step": 6922 + }, + { + "epoch": 0.73, + "grad_norm": 2.7932028278457115, + "learning_rate": 1.8118821732583918e-06, + "loss": 0.5806, + "step": 6923 + }, + { + "epoch": 0.73, + "grad_norm": 2.9079436194870394, + "learning_rate": 1.8105695018430873e-06, + "loss": 0.6072, + "step": 6924 + }, + { + "epoch": 0.73, + "grad_norm": 2.583809450277425, + "learning_rate": 1.8092572009665965e-06, + "loss": 0.628, + "step": 6925 + }, + { + "epoch": 0.73, + "grad_norm": 2.1674044267667516, + "learning_rate": 1.80794527078138e-06, + "loss": 0.5968, + "step": 6926 + }, + { + "epoch": 0.73, + "grad_norm": 2.6185013262869354, + "learning_rate": 1.8066337114398568e-06, + "loss": 0.5478, + "step": 6927 + }, + { + "epoch": 0.73, + "grad_norm": 3.158397579484625, + "learning_rate": 1.8053225230943982e-06, + "loss": 0.6001, + "step": 6928 + }, + { + "epoch": 0.73, + "grad_norm": 3.264667477188976, + "learning_rate": 1.8040117058973317e-06, + "loss": 0.6074, + "step": 6929 + }, + { + "epoch": 0.73, + "grad_norm": 2.7602307379116064, + "learning_rate": 1.802701260000947e-06, + "loss": 0.6428, + "step": 6930 + }, + { + "epoch": 0.73, + "grad_norm": 2.986327201725227, + "learning_rate": 1.8013911855574874e-06, + "loss": 0.5278, + "step": 6931 + }, + { + "epoch": 0.73, + "grad_norm": 2.0385916995999906, + "learning_rate": 1.8000814827191548e-06, + "loss": 0.6268, + "step": 6932 + }, + { + "epoch": 0.73, + "grad_norm": 2.1047064832640023, + "learning_rate": 1.7987721516381056e-06, + "loss": 0.6004, + "step": 6933 + }, + { + "epoch": 0.73, + "grad_norm": 2.4994668971667364, + "learning_rate": 1.7974631924664533e-06, + "loss": 0.6405, + "step": 6934 + }, + { + "epoch": 0.73, + "grad_norm": 2.388551949225777, + "learning_rate": 1.7961546053562684e-06, + "loss": 0.647, + "step": 6935 + }, + { + "epoch": 0.73, + "grad_norm": 2.348701907039752, + "learning_rate": 1.7948463904595826e-06, + "loss": 0.5663, + "step": 6936 + }, + { + "epoch": 0.73, + "grad_norm": 2.3266351864310137, + "learning_rate": 1.793538547928378e-06, + "loss": 0.604, + "step": 6937 + }, + { + "epoch": 0.73, + "grad_norm": 2.150907502807278, + "learning_rate": 1.7922310779145941e-06, + "loss": 0.5916, + "step": 6938 + }, + { + "epoch": 0.73, + "grad_norm": 3.2575410386870502, + "learning_rate": 1.7909239805701307e-06, + "loss": 0.6358, + "step": 6939 + }, + { + "epoch": 0.73, + "grad_norm": 1.0203290093238349, + "learning_rate": 1.7896172560468427e-06, + "loss": 0.5392, + "step": 6940 + }, + { + "epoch": 0.73, + "grad_norm": 2.455935523599585, + "learning_rate": 1.7883109044965452e-06, + "loss": 0.6663, + "step": 6941 + }, + { + "epoch": 0.73, + "grad_norm": 2.292636831216748, + "learning_rate": 1.7870049260709992e-06, + "loss": 0.6031, + "step": 6942 + }, + { + "epoch": 0.73, + "grad_norm": 2.740746619986824, + "learning_rate": 1.785699320921933e-06, + "loss": 0.5899, + "step": 6943 + }, + { + "epoch": 0.73, + "grad_norm": 4.438088048263127, + "learning_rate": 1.784394089201028e-06, + "loss": 0.609, + "step": 6944 + }, + { + "epoch": 0.73, + "grad_norm": 3.2919583772455385, + "learning_rate": 1.7830892310599245e-06, + "loss": 0.5254, + "step": 6945 + }, + { + "epoch": 0.73, + "grad_norm": 3.0375360511034284, + "learning_rate": 1.7817847466502146e-06, + "loss": 0.5886, + "step": 6946 + }, + { + "epoch": 0.73, + "grad_norm": 2.278371165535222, + "learning_rate": 1.780480636123449e-06, + "loss": 0.6285, + "step": 6947 + }, + { + "epoch": 0.73, + "grad_norm": 2.6474986848344844, + "learning_rate": 1.7791768996311355e-06, + "loss": 0.6188, + "step": 6948 + }, + { + "epoch": 0.73, + "grad_norm": 4.053270599492303, + "learning_rate": 1.7778735373247414e-06, + "loss": 0.6231, + "step": 6949 + }, + { + "epoch": 0.73, + "grad_norm": 2.6023055630386622, + "learning_rate": 1.7765705493556857e-06, + "loss": 0.5683, + "step": 6950 + }, + { + "epoch": 0.73, + "grad_norm": 2.180205593322918, + "learning_rate": 1.7752679358753433e-06, + "loss": 0.6944, + "step": 6951 + }, + { + "epoch": 0.73, + "grad_norm": 3.3913259162961666, + "learning_rate": 1.7739656970350505e-06, + "loss": 0.6438, + "step": 6952 + }, + { + "epoch": 0.73, + "grad_norm": 2.504035889002391, + "learning_rate": 1.7726638329860978e-06, + "loss": 0.6856, + "step": 6953 + }, + { + "epoch": 0.73, + "grad_norm": 2.764765266914022, + "learning_rate": 1.7713623438797335e-06, + "loss": 0.5888, + "step": 6954 + }, + { + "epoch": 0.73, + "grad_norm": 3.2754835937489934, + "learning_rate": 1.7700612298671587e-06, + "loss": 0.6877, + "step": 6955 + }, + { + "epoch": 0.73, + "grad_norm": 2.449529265194134, + "learning_rate": 1.7687604910995321e-06, + "loss": 0.6649, + "step": 6956 + }, + { + "epoch": 0.73, + "grad_norm": 2.4212481065052907, + "learning_rate": 1.7674601277279707e-06, + "loss": 0.618, + "step": 6957 + }, + { + "epoch": 0.73, + "grad_norm": 5.736474279576478, + "learning_rate": 1.7661601399035494e-06, + "loss": 0.5901, + "step": 6958 + }, + { + "epoch": 0.73, + "grad_norm": 2.691986553144693, + "learning_rate": 1.7648605277772945e-06, + "loss": 0.6313, + "step": 6959 + }, + { + "epoch": 0.73, + "grad_norm": 2.6214863033679943, + "learning_rate": 1.7635612915001903e-06, + "loss": 0.7215, + "step": 6960 + }, + { + "epoch": 0.73, + "grad_norm": 2.9600672604153533, + "learning_rate": 1.7622624312231795e-06, + "loss": 0.6593, + "step": 6961 + }, + { + "epoch": 0.73, + "grad_norm": 2.8520335479613363, + "learning_rate": 1.7609639470971618e-06, + "loss": 0.5712, + "step": 6962 + }, + { + "epoch": 0.73, + "grad_norm": 2.2886996974373814, + "learning_rate": 1.7596658392729897e-06, + "loss": 0.5834, + "step": 6963 + }, + { + "epoch": 0.73, + "grad_norm": 2.8110720875652033, + "learning_rate": 1.7583681079014713e-06, + "loss": 0.6194, + "step": 6964 + }, + { + "epoch": 0.73, + "grad_norm": 2.229998449327696, + "learning_rate": 1.7570707531333763e-06, + "loss": 0.5953, + "step": 6965 + }, + { + "epoch": 0.73, + "grad_norm": 2.6339048670728284, + "learning_rate": 1.7557737751194264e-06, + "loss": 0.6276, + "step": 6966 + }, + { + "epoch": 0.73, + "grad_norm": 3.126572883969871, + "learning_rate": 1.7544771740103034e-06, + "loss": 0.6219, + "step": 6967 + }, + { + "epoch": 0.73, + "grad_norm": 3.532068840298241, + "learning_rate": 1.7531809499566399e-06, + "loss": 0.6469, + "step": 6968 + }, + { + "epoch": 0.73, + "grad_norm": 3.1935164634899, + "learning_rate": 1.7518851031090267e-06, + "loss": 0.612, + "step": 6969 + }, + { + "epoch": 0.73, + "grad_norm": 2.541841612435397, + "learning_rate": 1.7505896336180128e-06, + "loss": 0.5458, + "step": 6970 + }, + { + "epoch": 0.73, + "grad_norm": 2.3737229858880187, + "learning_rate": 1.7492945416341034e-06, + "loss": 0.6253, + "step": 6971 + }, + { + "epoch": 0.73, + "grad_norm": 3.2573282697164827, + "learning_rate": 1.7479998273077581e-06, + "loss": 0.633, + "step": 6972 + }, + { + "epoch": 0.73, + "grad_norm": 2.7028931132859264, + "learning_rate": 1.74670549078939e-06, + "loss": 0.6755, + "step": 6973 + }, + { + "epoch": 0.73, + "grad_norm": 2.702801368704138, + "learning_rate": 1.7454115322293735e-06, + "loss": 0.5617, + "step": 6974 + }, + { + "epoch": 0.73, + "grad_norm": 2.3197807420820613, + "learning_rate": 1.7441179517780376e-06, + "loss": 0.6572, + "step": 6975 + }, + { + "epoch": 0.73, + "grad_norm": 2.674381378155758, + "learning_rate": 1.7428247495856699e-06, + "loss": 0.6158, + "step": 6976 + }, + { + "epoch": 0.73, + "grad_norm": 2.8755462439432047, + "learning_rate": 1.7415319258025032e-06, + "loss": 0.697, + "step": 6977 + }, + { + "epoch": 0.73, + "grad_norm": 3.0812031503014983, + "learning_rate": 1.7402394805787388e-06, + "loss": 0.6277, + "step": 6978 + }, + { + "epoch": 0.73, + "grad_norm": 11.5464107755622, + "learning_rate": 1.7389474140645279e-06, + "loss": 0.5804, + "step": 6979 + }, + { + "epoch": 0.73, + "grad_norm": 2.338776528858335, + "learning_rate": 1.7376557264099813e-06, + "loss": 0.5742, + "step": 6980 + }, + { + "epoch": 0.73, + "grad_norm": 2.5226545710029087, + "learning_rate": 1.7363644177651623e-06, + "loss": 0.5308, + "step": 6981 + }, + { + "epoch": 0.73, + "grad_norm": 3.538706474138878, + "learning_rate": 1.7350734882800891e-06, + "loss": 0.6669, + "step": 6982 + }, + { + "epoch": 0.73, + "grad_norm": 2.553740639868528, + "learning_rate": 1.7337829381047405e-06, + "loss": 0.6483, + "step": 6983 + }, + { + "epoch": 0.73, + "grad_norm": 2.4491443310423726, + "learning_rate": 1.7324927673890495e-06, + "loss": 0.5876, + "step": 6984 + }, + { + "epoch": 0.74, + "grad_norm": 2.3165156688725244, + "learning_rate": 1.7312029762829042e-06, + "loss": 0.5532, + "step": 6985 + }, + { + "epoch": 0.74, + "grad_norm": 2.3703050030038884, + "learning_rate": 1.729913564936146e-06, + "loss": 0.646, + "step": 6986 + }, + { + "epoch": 0.74, + "grad_norm": 2.457421486560212, + "learning_rate": 1.728624533498577e-06, + "loss": 0.6851, + "step": 6987 + }, + { + "epoch": 0.74, + "grad_norm": 3.7152345384682492, + "learning_rate": 1.7273358821199527e-06, + "loss": 0.6291, + "step": 6988 + }, + { + "epoch": 0.74, + "grad_norm": 2.928945773179569, + "learning_rate": 1.7260476109499885e-06, + "loss": 0.6238, + "step": 6989 + }, + { + "epoch": 0.74, + "grad_norm": 2.0786975252518975, + "learning_rate": 1.7247597201383459e-06, + "loss": 0.6212, + "step": 6990 + }, + { + "epoch": 0.74, + "grad_norm": 3.2322246793230462, + "learning_rate": 1.7234722098346512e-06, + "loss": 0.5755, + "step": 6991 + }, + { + "epoch": 0.74, + "grad_norm": 2.385974734862681, + "learning_rate": 1.7221850801884838e-06, + "loss": 0.6729, + "step": 6992 + }, + { + "epoch": 0.74, + "grad_norm": 2.2783917987848175, + "learning_rate": 1.7208983313493804e-06, + "loss": 0.5833, + "step": 6993 + }, + { + "epoch": 0.74, + "grad_norm": 3.1121321239656368, + "learning_rate": 1.7196119634668296e-06, + "loss": 0.6074, + "step": 6994 + }, + { + "epoch": 0.74, + "grad_norm": 2.4956465433518087, + "learning_rate": 1.7183259766902765e-06, + "loss": 0.5979, + "step": 6995 + }, + { + "epoch": 0.74, + "grad_norm": 2.2925119083577363, + "learning_rate": 1.7170403711691252e-06, + "loss": 0.5927, + "step": 6996 + }, + { + "epoch": 0.74, + "grad_norm": 4.1901529302786145, + "learning_rate": 1.7157551470527356e-06, + "loss": 0.5835, + "step": 6997 + }, + { + "epoch": 0.74, + "grad_norm": 2.1103141114003794, + "learning_rate": 1.7144703044904186e-06, + "loss": 0.647, + "step": 6998 + }, + { + "epoch": 0.74, + "grad_norm": 2.2812632686596874, + "learning_rate": 1.7131858436314431e-06, + "loss": 0.5589, + "step": 6999 + }, + { + "epoch": 0.74, + "grad_norm": 3.893898766295147, + "learning_rate": 1.7119017646250346e-06, + "loss": 0.5332, + "step": 7000 + }, + { + "epoch": 0.74, + "grad_norm": 2.333401918284876, + "learning_rate": 1.7106180676203743e-06, + "loss": 0.644, + "step": 7001 + }, + { + "epoch": 0.74, + "grad_norm": 2.2776944296611905, + "learning_rate": 1.7093347527666e-06, + "loss": 0.6315, + "step": 7002 + }, + { + "epoch": 0.74, + "grad_norm": 2.4878470361517535, + "learning_rate": 1.708051820212801e-06, + "loss": 0.5557, + "step": 7003 + }, + { + "epoch": 0.74, + "grad_norm": 0.9538072931011493, + "learning_rate": 1.7067692701080247e-06, + "loss": 0.556, + "step": 7004 + }, + { + "epoch": 0.74, + "grad_norm": 3.936334663090163, + "learning_rate": 1.7054871026012748e-06, + "loss": 0.6332, + "step": 7005 + }, + { + "epoch": 0.74, + "grad_norm": 2.569749185859605, + "learning_rate": 1.7042053178415114e-06, + "loss": 0.597, + "step": 7006 + }, + { + "epoch": 0.74, + "grad_norm": 2.2112215878359844, + "learning_rate": 1.7029239159776468e-06, + "loss": 0.6083, + "step": 7007 + }, + { + "epoch": 0.74, + "grad_norm": 2.9065918760961362, + "learning_rate": 1.7016428971585491e-06, + "loss": 0.6281, + "step": 7008 + }, + { + "epoch": 0.74, + "grad_norm": 2.0826011656455226, + "learning_rate": 1.700362261533045e-06, + "loss": 0.5959, + "step": 7009 + }, + { + "epoch": 0.74, + "grad_norm": 2.3233212826323664, + "learning_rate": 1.699082009249915e-06, + "loss": 0.6065, + "step": 7010 + }, + { + "epoch": 0.74, + "grad_norm": 0.9508372060278683, + "learning_rate": 1.6978021404578986e-06, + "loss": 0.5127, + "step": 7011 + }, + { + "epoch": 0.74, + "grad_norm": 2.3456337360777826, + "learning_rate": 1.6965226553056807e-06, + "loss": 0.6536, + "step": 7012 + }, + { + "epoch": 0.74, + "grad_norm": 2.740110007869177, + "learning_rate": 1.6952435539419114e-06, + "loss": 0.6519, + "step": 7013 + }, + { + "epoch": 0.74, + "grad_norm": 2.285157779061375, + "learning_rate": 1.6939648365151929e-06, + "loss": 0.6668, + "step": 7014 + }, + { + "epoch": 0.74, + "grad_norm": 2.286468423276322, + "learning_rate": 1.692686503174084e-06, + "loss": 0.5346, + "step": 7015 + }, + { + "epoch": 0.74, + "grad_norm": 3.544160329777678, + "learning_rate": 1.6914085540670972e-06, + "loss": 0.6256, + "step": 7016 + }, + { + "epoch": 0.74, + "grad_norm": 2.6105527595546683, + "learning_rate": 1.6901309893426987e-06, + "loss": 0.5112, + "step": 7017 + }, + { + "epoch": 0.74, + "grad_norm": 2.824511640017726, + "learning_rate": 1.688853809149314e-06, + "loss": 0.6006, + "step": 7018 + }, + { + "epoch": 0.74, + "grad_norm": 2.421375917201986, + "learning_rate": 1.6875770136353237e-06, + "loss": 0.5986, + "step": 7019 + }, + { + "epoch": 0.74, + "grad_norm": 2.775614276256507, + "learning_rate": 1.686300602949061e-06, + "loss": 0.641, + "step": 7020 + }, + { + "epoch": 0.74, + "grad_norm": 2.8464697028833026, + "learning_rate": 1.6850245772388136e-06, + "loss": 0.6148, + "step": 7021 + }, + { + "epoch": 0.74, + "grad_norm": 2.6951000358839816, + "learning_rate": 1.6837489366528275e-06, + "loss": 0.5996, + "step": 7022 + }, + { + "epoch": 0.74, + "grad_norm": 2.2776028181000036, + "learning_rate": 1.6824736813393044e-06, + "loss": 0.5767, + "step": 7023 + }, + { + "epoch": 0.74, + "grad_norm": 3.049024430013574, + "learning_rate": 1.6811988114464024e-06, + "loss": 0.5973, + "step": 7024 + }, + { + "epoch": 0.74, + "grad_norm": 2.259489650346381, + "learning_rate": 1.6799243271222248e-06, + "loss": 0.6279, + "step": 7025 + }, + { + "epoch": 0.74, + "grad_norm": 2.7237784454005047, + "learning_rate": 1.678650228514842e-06, + "loss": 0.6341, + "step": 7026 + }, + { + "epoch": 0.74, + "grad_norm": 2.492400316742001, + "learning_rate": 1.677376515772275e-06, + "loss": 0.5853, + "step": 7027 + }, + { + "epoch": 0.74, + "grad_norm": 7.370329142260077, + "learning_rate": 1.6761031890425007e-06, + "loss": 0.6075, + "step": 7028 + }, + { + "epoch": 0.74, + "grad_norm": 2.1958100915780823, + "learning_rate": 1.6748302484734496e-06, + "loss": 0.5913, + "step": 7029 + }, + { + "epoch": 0.74, + "grad_norm": 7.685029763578393, + "learning_rate": 1.6735576942130066e-06, + "loss": 0.5991, + "step": 7030 + }, + { + "epoch": 0.74, + "grad_norm": 2.0884424565510606, + "learning_rate": 1.672285526409015e-06, + "loss": 0.5875, + "step": 7031 + }, + { + "epoch": 0.74, + "grad_norm": 4.244543977179305, + "learning_rate": 1.6710137452092728e-06, + "loss": 0.647, + "step": 7032 + }, + { + "epoch": 0.74, + "grad_norm": 2.8555347187741154, + "learning_rate": 1.6697423507615307e-06, + "loss": 0.6764, + "step": 7033 + }, + { + "epoch": 0.74, + "grad_norm": 2.943847721173569, + "learning_rate": 1.6684713432134935e-06, + "loss": 0.637, + "step": 7034 + }, + { + "epoch": 0.74, + "grad_norm": 2.9179340893616694, + "learning_rate": 1.6672007227128256e-06, + "loss": 0.6128, + "step": 7035 + }, + { + "epoch": 0.74, + "grad_norm": 2.3128758180540188, + "learning_rate": 1.6659304894071437e-06, + "loss": 0.5963, + "step": 7036 + }, + { + "epoch": 0.74, + "grad_norm": 2.279530033918136, + "learning_rate": 1.6646606434440216e-06, + "loss": 0.5943, + "step": 7037 + }, + { + "epoch": 0.74, + "grad_norm": 2.4218365961921924, + "learning_rate": 1.6633911849709838e-06, + "loss": 0.5705, + "step": 7038 + }, + { + "epoch": 0.74, + "grad_norm": 2.4252321627282565, + "learning_rate": 1.6621221141355114e-06, + "loss": 0.6823, + "step": 7039 + }, + { + "epoch": 0.74, + "grad_norm": 2.815209527008716, + "learning_rate": 1.6608534310850432e-06, + "loss": 0.6023, + "step": 7040 + }, + { + "epoch": 0.74, + "grad_norm": 3.2696793670858457, + "learning_rate": 1.6595851359669723e-06, + "loss": 0.6344, + "step": 7041 + }, + { + "epoch": 0.74, + "grad_norm": 3.048168501203653, + "learning_rate": 1.6583172289286447e-06, + "loss": 0.6739, + "step": 7042 + }, + { + "epoch": 0.74, + "grad_norm": 2.345623717215056, + "learning_rate": 1.6570497101173595e-06, + "loss": 0.5041, + "step": 7043 + }, + { + "epoch": 0.74, + "grad_norm": 2.656636229228406, + "learning_rate": 1.6557825796803755e-06, + "loss": 0.5783, + "step": 7044 + }, + { + "epoch": 0.74, + "grad_norm": 2.047758363020559, + "learning_rate": 1.6545158377649063e-06, + "loss": 0.6101, + "step": 7045 + }, + { + "epoch": 0.74, + "grad_norm": 2.4301367175325446, + "learning_rate": 1.6532494845181157e-06, + "loss": 0.6579, + "step": 7046 + }, + { + "epoch": 0.74, + "grad_norm": 2.4798435707976627, + "learning_rate": 1.6519835200871243e-06, + "loss": 0.6268, + "step": 7047 + }, + { + "epoch": 0.74, + "grad_norm": 2.2104318045494717, + "learning_rate": 1.6507179446190091e-06, + "loss": 0.5432, + "step": 7048 + }, + { + "epoch": 0.74, + "grad_norm": 4.720179561419772, + "learning_rate": 1.649452758260801e-06, + "loss": 0.5763, + "step": 7049 + }, + { + "epoch": 0.74, + "grad_norm": 2.8768611462675713, + "learning_rate": 1.648187961159488e-06, + "loss": 0.5999, + "step": 7050 + }, + { + "epoch": 0.74, + "grad_norm": 2.9574492500850096, + "learning_rate": 1.6469235534620087e-06, + "loss": 0.6756, + "step": 7051 + }, + { + "epoch": 0.74, + "grad_norm": 2.4478650282919654, + "learning_rate": 1.6456595353152566e-06, + "loss": 0.6178, + "step": 7052 + }, + { + "epoch": 0.74, + "grad_norm": 3.4521440344822647, + "learning_rate": 1.644395906866083e-06, + "loss": 0.633, + "step": 7053 + }, + { + "epoch": 0.74, + "grad_norm": 2.3845856771271645, + "learning_rate": 1.6431326682612947e-06, + "loss": 0.6037, + "step": 7054 + }, + { + "epoch": 0.74, + "grad_norm": 1.9893993557633762, + "learning_rate": 1.6418698196476497e-06, + "loss": 0.5822, + "step": 7055 + }, + { + "epoch": 0.74, + "grad_norm": 2.502449539506666, + "learning_rate": 1.6406073611718593e-06, + "loss": 0.6611, + "step": 7056 + }, + { + "epoch": 0.74, + "grad_norm": 2.886805930275283, + "learning_rate": 1.6393452929805947e-06, + "loss": 0.6163, + "step": 7057 + }, + { + "epoch": 0.74, + "grad_norm": 2.324217731618782, + "learning_rate": 1.63808361522048e-06, + "loss": 0.6323, + "step": 7058 + }, + { + "epoch": 0.74, + "grad_norm": 2.4841967196947907, + "learning_rate": 1.6368223280380951e-06, + "loss": 0.6249, + "step": 7059 + }, + { + "epoch": 0.74, + "grad_norm": 2.491514087882754, + "learning_rate": 1.6355614315799673e-06, + "loss": 0.5953, + "step": 7060 + }, + { + "epoch": 0.74, + "grad_norm": 4.09150523009862, + "learning_rate": 1.6343009259925863e-06, + "loss": 0.663, + "step": 7061 + }, + { + "epoch": 0.74, + "grad_norm": 2.642962343711459, + "learning_rate": 1.633040811422395e-06, + "loss": 0.6604, + "step": 7062 + }, + { + "epoch": 0.74, + "grad_norm": 2.222557968930457, + "learning_rate": 1.6317810880157908e-06, + "loss": 0.5329, + "step": 7063 + }, + { + "epoch": 0.74, + "grad_norm": 2.7336550978700083, + "learning_rate": 1.6305217559191232e-06, + "loss": 0.5325, + "step": 7064 + }, + { + "epoch": 0.74, + "grad_norm": 2.1083210565732484, + "learning_rate": 1.629262815278696e-06, + "loss": 0.5392, + "step": 7065 + }, + { + "epoch": 0.74, + "grad_norm": 2.720575901544298, + "learning_rate": 1.6280042662407714e-06, + "loss": 0.6113, + "step": 7066 + }, + { + "epoch": 0.74, + "grad_norm": 2.16404863181636, + "learning_rate": 1.626746108951565e-06, + "loss": 0.5831, + "step": 7067 + }, + { + "epoch": 0.74, + "grad_norm": 2.4115471190550117, + "learning_rate": 1.6254883435572449e-06, + "loss": 0.59, + "step": 7068 + }, + { + "epoch": 0.74, + "grad_norm": 3.0583452239282183, + "learning_rate": 1.6242309702039327e-06, + "loss": 0.6492, + "step": 7069 + }, + { + "epoch": 0.74, + "grad_norm": 3.2718246037744843, + "learning_rate": 1.6229739890377084e-06, + "loss": 0.5989, + "step": 7070 + }, + { + "epoch": 0.74, + "grad_norm": 2.7419306661734812, + "learning_rate": 1.6217174002046032e-06, + "loss": 0.5111, + "step": 7071 + }, + { + "epoch": 0.74, + "grad_norm": 3.0816295246885725, + "learning_rate": 1.6204612038506068e-06, + "loss": 0.639, + "step": 7072 + }, + { + "epoch": 0.74, + "grad_norm": 2.2970090731001003, + "learning_rate": 1.6192054001216585e-06, + "loss": 0.6099, + "step": 7073 + }, + { + "epoch": 0.74, + "grad_norm": 2.7790697280686962, + "learning_rate": 1.6179499891636524e-06, + "loss": 0.6486, + "step": 7074 + }, + { + "epoch": 0.74, + "grad_norm": 2.6588858988791624, + "learning_rate": 1.61669497112244e-06, + "loss": 0.6991, + "step": 7075 + }, + { + "epoch": 0.74, + "grad_norm": 2.4764296283414584, + "learning_rate": 1.6154403461438273e-06, + "loss": 0.6273, + "step": 7076 + }, + { + "epoch": 0.74, + "grad_norm": 2.658819160311856, + "learning_rate": 1.6141861143735716e-06, + "loss": 0.5699, + "step": 7077 + }, + { + "epoch": 0.74, + "grad_norm": 2.315234814191497, + "learning_rate": 1.612932275957384e-06, + "loss": 0.6531, + "step": 7078 + }, + { + "epoch": 0.74, + "grad_norm": 4.409181763534464, + "learning_rate": 1.6116788310409332e-06, + "loss": 0.5654, + "step": 7079 + }, + { + "epoch": 0.75, + "grad_norm": 2.573872111742514, + "learning_rate": 1.6104257797698431e-06, + "loss": 0.6174, + "step": 7080 + }, + { + "epoch": 0.75, + "grad_norm": 2.1867723143982847, + "learning_rate": 1.6091731222896877e-06, + "loss": 0.6744, + "step": 7081 + }, + { + "epoch": 0.75, + "grad_norm": 2.436773932857918, + "learning_rate": 1.6079208587459954e-06, + "loss": 0.6701, + "step": 7082 + }, + { + "epoch": 0.75, + "grad_norm": 2.6539143787518165, + "learning_rate": 1.6066689892842525e-06, + "loss": 0.5861, + "step": 7083 + }, + { + "epoch": 0.75, + "grad_norm": 2.9851718926572968, + "learning_rate": 1.6054175140498967e-06, + "loss": 0.6667, + "step": 7084 + }, + { + "epoch": 0.75, + "grad_norm": 2.130011313816518, + "learning_rate": 1.6041664331883233e-06, + "loss": 0.6442, + "step": 7085 + }, + { + "epoch": 0.75, + "grad_norm": 2.8378492504021393, + "learning_rate": 1.6029157468448775e-06, + "loss": 0.6244, + "step": 7086 + }, + { + "epoch": 0.75, + "grad_norm": 2.2783353748879542, + "learning_rate": 1.601665455164858e-06, + "loss": 0.6466, + "step": 7087 + }, + { + "epoch": 0.75, + "grad_norm": 3.5318917740950577, + "learning_rate": 1.6004155582935232e-06, + "loss": 0.7144, + "step": 7088 + }, + { + "epoch": 0.75, + "grad_norm": 2.405042680383137, + "learning_rate": 1.599166056376083e-06, + "loss": 0.554, + "step": 7089 + }, + { + "epoch": 0.75, + "grad_norm": 2.537663239731998, + "learning_rate": 1.5979169495576991e-06, + "loss": 0.6213, + "step": 7090 + }, + { + "epoch": 0.75, + "grad_norm": 2.34724684013334, + "learning_rate": 1.5966682379834887e-06, + "loss": 0.5951, + "step": 7091 + }, + { + "epoch": 0.75, + "grad_norm": 2.683042293367126, + "learning_rate": 1.5954199217985233e-06, + "loss": 0.6249, + "step": 7092 + }, + { + "epoch": 0.75, + "grad_norm": 2.962353374507347, + "learning_rate": 1.5941720011478323e-06, + "loss": 0.5715, + "step": 7093 + }, + { + "epoch": 0.75, + "grad_norm": 2.6239346592286377, + "learning_rate": 1.5929244761763924e-06, + "loss": 0.5831, + "step": 7094 + }, + { + "epoch": 0.75, + "grad_norm": 2.338124980340621, + "learning_rate": 1.591677347029137e-06, + "loss": 0.576, + "step": 7095 + }, + { + "epoch": 0.75, + "grad_norm": 3.8481926089677714, + "learning_rate": 1.5904306138509545e-06, + "loss": 0.6755, + "step": 7096 + }, + { + "epoch": 0.75, + "grad_norm": 2.540089155983873, + "learning_rate": 1.5891842767866872e-06, + "loss": 0.5622, + "step": 7097 + }, + { + "epoch": 0.75, + "grad_norm": 2.4998455761237133, + "learning_rate": 1.587938335981133e-06, + "loss": 0.5839, + "step": 7098 + }, + { + "epoch": 0.75, + "grad_norm": 3.281273941072614, + "learning_rate": 1.5866927915790391e-06, + "loss": 0.6369, + "step": 7099 + }, + { + "epoch": 0.75, + "grad_norm": 2.401257413792379, + "learning_rate": 1.585447643725108e-06, + "loss": 0.6927, + "step": 7100 + }, + { + "epoch": 0.75, + "grad_norm": 2.885694164271074, + "learning_rate": 1.5842028925640002e-06, + "loss": 0.5983, + "step": 7101 + }, + { + "epoch": 0.75, + "grad_norm": 2.249464206134762, + "learning_rate": 1.5829585382403273e-06, + "loss": 0.552, + "step": 7102 + }, + { + "epoch": 0.75, + "grad_norm": 2.2266693760673335, + "learning_rate": 1.5817145808986534e-06, + "loss": 0.6486, + "step": 7103 + }, + { + "epoch": 0.75, + "grad_norm": 2.6048570897327545, + "learning_rate": 1.5804710206834972e-06, + "loss": 0.6163, + "step": 7104 + }, + { + "epoch": 0.75, + "grad_norm": 2.52361431641286, + "learning_rate": 1.5792278577393327e-06, + "loss": 0.5478, + "step": 7105 + }, + { + "epoch": 0.75, + "grad_norm": 3.0440511092050317, + "learning_rate": 1.577985092210587e-06, + "loss": 0.648, + "step": 7106 + }, + { + "epoch": 0.75, + "grad_norm": 2.5613872240202467, + "learning_rate": 1.5767427242416433e-06, + "loss": 0.6007, + "step": 7107 + }, + { + "epoch": 0.75, + "grad_norm": 2.4150239405907303, + "learning_rate": 1.575500753976834e-06, + "loss": 0.5187, + "step": 7108 + }, + { + "epoch": 0.75, + "grad_norm": 3.236040550601962, + "learning_rate": 1.5742591815604463e-06, + "loss": 0.6156, + "step": 7109 + }, + { + "epoch": 0.75, + "grad_norm": 2.8872460453894053, + "learning_rate": 1.5730180071367247e-06, + "loss": 0.6328, + "step": 7110 + }, + { + "epoch": 0.75, + "grad_norm": 2.329877740414494, + "learning_rate": 1.5717772308498651e-06, + "loss": 0.6179, + "step": 7111 + }, + { + "epoch": 0.75, + "grad_norm": 2.635293780550099, + "learning_rate": 1.5705368528440178e-06, + "loss": 0.6769, + "step": 7112 + }, + { + "epoch": 0.75, + "grad_norm": 2.5346783855495905, + "learning_rate": 1.569296873263283e-06, + "loss": 0.4746, + "step": 7113 + }, + { + "epoch": 0.75, + "grad_norm": 2.1425884362151524, + "learning_rate": 1.5680572922517206e-06, + "loss": 0.5694, + "step": 7114 + }, + { + "epoch": 0.75, + "grad_norm": 2.1871638635512443, + "learning_rate": 1.5668181099533431e-06, + "loss": 0.5599, + "step": 7115 + }, + { + "epoch": 0.75, + "grad_norm": 2.6126133591306533, + "learning_rate": 1.5655793265121132e-06, + "loss": 0.6576, + "step": 7116 + }, + { + "epoch": 0.75, + "grad_norm": 3.6620967020652584, + "learning_rate": 1.5643409420719475e-06, + "loss": 0.7142, + "step": 7117 + }, + { + "epoch": 0.75, + "grad_norm": 2.539598366314506, + "learning_rate": 1.5631029567767197e-06, + "loss": 0.6241, + "step": 7118 + }, + { + "epoch": 0.75, + "grad_norm": 3.873095508009267, + "learning_rate": 1.5618653707702553e-06, + "loss": 0.6067, + "step": 7119 + }, + { + "epoch": 0.75, + "grad_norm": 3.2962610827522454, + "learning_rate": 1.560628184196335e-06, + "loss": 0.5866, + "step": 7120 + }, + { + "epoch": 0.75, + "grad_norm": 2.418665449353965, + "learning_rate": 1.55939139719869e-06, + "loss": 0.5084, + "step": 7121 + }, + { + "epoch": 0.75, + "grad_norm": 54.21175226317471, + "learning_rate": 1.5581550099210053e-06, + "loss": 0.5915, + "step": 7122 + }, + { + "epoch": 0.75, + "grad_norm": 7.5068547209902965, + "learning_rate": 1.5569190225069226e-06, + "loss": 0.6405, + "step": 7123 + }, + { + "epoch": 0.75, + "grad_norm": 2.7309517743093017, + "learning_rate": 1.5556834351000356e-06, + "loss": 0.6173, + "step": 7124 + }, + { + "epoch": 0.75, + "grad_norm": 1.0711077498083168, + "learning_rate": 1.554448247843891e-06, + "loss": 0.5749, + "step": 7125 + }, + { + "epoch": 0.75, + "grad_norm": 2.4429328414022162, + "learning_rate": 1.5532134608819876e-06, + "loss": 0.5991, + "step": 7126 + }, + { + "epoch": 0.75, + "grad_norm": 4.816964217476705, + "learning_rate": 1.55197907435778e-06, + "loss": 0.6666, + "step": 7127 + }, + { + "epoch": 0.75, + "grad_norm": 2.535197145555227, + "learning_rate": 1.5507450884146784e-06, + "loss": 0.5548, + "step": 7128 + }, + { + "epoch": 0.75, + "grad_norm": 2.319066286953146, + "learning_rate": 1.5495115031960418e-06, + "loss": 0.6441, + "step": 7129 + }, + { + "epoch": 0.75, + "grad_norm": 2.8047667310373727, + "learning_rate": 1.5482783188451822e-06, + "loss": 0.6417, + "step": 7130 + }, + { + "epoch": 0.75, + "grad_norm": 3.9106777141475417, + "learning_rate": 1.5470455355053704e-06, + "loss": 0.6389, + "step": 7131 + }, + { + "epoch": 0.75, + "grad_norm": 2.6056116741953828, + "learning_rate": 1.5458131533198261e-06, + "loss": 0.6165, + "step": 7132 + }, + { + "epoch": 0.75, + "grad_norm": 2.9784665393315755, + "learning_rate": 1.5445811724317267e-06, + "loss": 0.584, + "step": 7133 + }, + { + "epoch": 0.75, + "grad_norm": 2.057328186390915, + "learning_rate": 1.5433495929841979e-06, + "loss": 0.538, + "step": 7134 + }, + { + "epoch": 0.75, + "grad_norm": 3.899774453962374, + "learning_rate": 1.5421184151203194e-06, + "loss": 0.6381, + "step": 7135 + }, + { + "epoch": 0.75, + "grad_norm": 2.302231817534724, + "learning_rate": 1.5408876389831278e-06, + "loss": 0.5251, + "step": 7136 + }, + { + "epoch": 0.75, + "grad_norm": 2.779434622195697, + "learning_rate": 1.5396572647156126e-06, + "loss": 0.6052, + "step": 7137 + }, + { + "epoch": 0.75, + "grad_norm": 2.6994213766514155, + "learning_rate": 1.538427292460714e-06, + "loss": 0.6085, + "step": 7138 + }, + { + "epoch": 0.75, + "grad_norm": 2.504683013737572, + "learning_rate": 1.5371977223613238e-06, + "loss": 0.6257, + "step": 7139 + }, + { + "epoch": 0.75, + "grad_norm": 2.2855288271378664, + "learning_rate": 1.535968554560293e-06, + "loss": 0.6142, + "step": 7140 + }, + { + "epoch": 0.75, + "grad_norm": 2.4654873006023608, + "learning_rate": 1.5347397892004234e-06, + "loss": 0.6748, + "step": 7141 + }, + { + "epoch": 0.75, + "grad_norm": 2.2166955270138478, + "learning_rate": 1.533511426424466e-06, + "loss": 0.5971, + "step": 7142 + }, + { + "epoch": 0.75, + "grad_norm": 2.5878170941910934, + "learning_rate": 1.532283466375133e-06, + "loss": 0.5781, + "step": 7143 + }, + { + "epoch": 0.75, + "grad_norm": 3.088438430805986, + "learning_rate": 1.5310559091950805e-06, + "loss": 0.5924, + "step": 7144 + }, + { + "epoch": 0.75, + "grad_norm": 3.455905487244043, + "learning_rate": 1.5298287550269248e-06, + "loss": 0.6324, + "step": 7145 + }, + { + "epoch": 0.75, + "grad_norm": 2.296776703964561, + "learning_rate": 1.5286020040132344e-06, + "loss": 0.6318, + "step": 7146 + }, + { + "epoch": 0.75, + "grad_norm": 3.502122104084021, + "learning_rate": 1.5273756562965286e-06, + "loss": 0.5917, + "step": 7147 + }, + { + "epoch": 0.75, + "grad_norm": 2.813679350238381, + "learning_rate": 1.5261497120192791e-06, + "loss": 0.7005, + "step": 7148 + }, + { + "epoch": 0.75, + "grad_norm": 2.386900010136105, + "learning_rate": 1.5249241713239148e-06, + "loss": 0.6208, + "step": 7149 + }, + { + "epoch": 0.75, + "grad_norm": 2.097816570449662, + "learning_rate": 1.5236990343528157e-06, + "loss": 0.5714, + "step": 7150 + }, + { + "epoch": 0.75, + "grad_norm": 2.8641415092943023, + "learning_rate": 1.522474301248314e-06, + "loss": 0.6228, + "step": 7151 + }, + { + "epoch": 0.75, + "grad_norm": 2.953458669418631, + "learning_rate": 1.5212499721526942e-06, + "loss": 0.635, + "step": 7152 + }, + { + "epoch": 0.75, + "grad_norm": 2.293638293253605, + "learning_rate": 1.5200260472081962e-06, + "loss": 0.6122, + "step": 7153 + }, + { + "epoch": 0.75, + "grad_norm": 2.165393072045299, + "learning_rate": 1.5188025265570127e-06, + "loss": 0.5967, + "step": 7154 + }, + { + "epoch": 0.75, + "grad_norm": 2.6653448901727215, + "learning_rate": 1.51757941034129e-06, + "loss": 0.5992, + "step": 7155 + }, + { + "epoch": 0.75, + "grad_norm": 2.590367390269278, + "learning_rate": 1.5163566987031246e-06, + "loss": 0.5784, + "step": 7156 + }, + { + "epoch": 0.75, + "grad_norm": 2.274011024448419, + "learning_rate": 1.5151343917845662e-06, + "loss": 0.5486, + "step": 7157 + }, + { + "epoch": 0.75, + "grad_norm": 2.640547755917543, + "learning_rate": 1.513912489727621e-06, + "loss": 0.6622, + "step": 7158 + }, + { + "epoch": 0.75, + "grad_norm": 3.435856702185333, + "learning_rate": 1.5126909926742461e-06, + "loss": 0.5952, + "step": 7159 + }, + { + "epoch": 0.75, + "grad_norm": 2.327306617131621, + "learning_rate": 1.511469900766352e-06, + "loss": 0.5737, + "step": 7160 + }, + { + "epoch": 0.75, + "grad_norm": 2.954132593391894, + "learning_rate": 1.510249214145798e-06, + "loss": 0.6354, + "step": 7161 + }, + { + "epoch": 0.75, + "grad_norm": 2.435417507636399, + "learning_rate": 1.5090289329544028e-06, + "loss": 0.658, + "step": 7162 + }, + { + "epoch": 0.75, + "grad_norm": 2.9854856200768136, + "learning_rate": 1.5078090573339365e-06, + "loss": 0.6282, + "step": 7163 + }, + { + "epoch": 0.75, + "grad_norm": 3.106673158220274, + "learning_rate": 1.506589587426119e-06, + "loss": 0.5409, + "step": 7164 + }, + { + "epoch": 0.75, + "grad_norm": 2.4494465926620497, + "learning_rate": 1.5053705233726228e-06, + "loss": 0.6402, + "step": 7165 + }, + { + "epoch": 0.75, + "grad_norm": 2.1863048410990804, + "learning_rate": 1.5041518653150777e-06, + "loss": 0.6386, + "step": 7166 + }, + { + "epoch": 0.75, + "grad_norm": 2.2078949677416864, + "learning_rate": 1.5029336133950635e-06, + "loss": 0.5456, + "step": 7167 + }, + { + "epoch": 0.75, + "grad_norm": 3.738221773697226, + "learning_rate": 1.5017157677541144e-06, + "loss": 0.6146, + "step": 7168 + }, + { + "epoch": 0.75, + "grad_norm": 2.250673304439142, + "learning_rate": 1.500498328533715e-06, + "loss": 0.5573, + "step": 7169 + }, + { + "epoch": 0.75, + "grad_norm": 2.5150783835660797, + "learning_rate": 1.4992812958753023e-06, + "loss": 0.5967, + "step": 7170 + }, + { + "epoch": 0.75, + "grad_norm": 0.9829915284477175, + "learning_rate": 1.4980646699202684e-06, + "loss": 0.5587, + "step": 7171 + }, + { + "epoch": 0.75, + "grad_norm": 2.1533475910931696, + "learning_rate": 1.4968484508099606e-06, + "loss": 0.6385, + "step": 7172 + }, + { + "epoch": 0.75, + "grad_norm": 5.851990435700211, + "learning_rate": 1.4956326386856723e-06, + "loss": 0.5869, + "step": 7173 + }, + { + "epoch": 0.75, + "grad_norm": 2.7719426441646275, + "learning_rate": 1.494417233688653e-06, + "loss": 0.5874, + "step": 7174 + }, + { + "epoch": 0.76, + "grad_norm": 3.102882961445121, + "learning_rate": 1.4932022359601056e-06, + "loss": 0.6421, + "step": 7175 + }, + { + "epoch": 0.76, + "grad_norm": 2.6384157210222536, + "learning_rate": 1.4919876456411875e-06, + "loss": 0.5523, + "step": 7176 + }, + { + "epoch": 0.76, + "grad_norm": 2.378591334938083, + "learning_rate": 1.490773462873002e-06, + "loss": 0.544, + "step": 7177 + }, + { + "epoch": 0.76, + "grad_norm": 20.641360117403863, + "learning_rate": 1.4895596877966128e-06, + "loss": 0.636, + "step": 7178 + }, + { + "epoch": 0.76, + "grad_norm": 2.4236725068487304, + "learning_rate": 1.4883463205530302e-06, + "loss": 0.6411, + "step": 7179 + }, + { + "epoch": 0.76, + "grad_norm": 2.2183036139750714, + "learning_rate": 1.4871333612832206e-06, + "loss": 0.5712, + "step": 7180 + }, + { + "epoch": 0.76, + "grad_norm": 3.050796166901124, + "learning_rate": 1.4859208101281041e-06, + "loss": 0.6366, + "step": 7181 + }, + { + "epoch": 0.76, + "grad_norm": 2.4152995939569166, + "learning_rate": 1.48470866722855e-06, + "loss": 0.5498, + "step": 7182 + }, + { + "epoch": 0.76, + "grad_norm": 2.362918250869478, + "learning_rate": 1.4834969327253795e-06, + "loss": 0.5972, + "step": 7183 + }, + { + "epoch": 0.76, + "grad_norm": 2.0541545853510184, + "learning_rate": 1.48228560675937e-06, + "loss": 0.5475, + "step": 7184 + }, + { + "epoch": 0.76, + "grad_norm": 4.0605689354251995, + "learning_rate": 1.481074689471252e-06, + "loss": 0.5439, + "step": 7185 + }, + { + "epoch": 0.76, + "grad_norm": 2.1804727796434964, + "learning_rate": 1.479864181001704e-06, + "loss": 0.6155, + "step": 7186 + }, + { + "epoch": 0.76, + "grad_norm": 4.353882130124197, + "learning_rate": 1.4786540814913586e-06, + "loss": 0.6792, + "step": 7187 + }, + { + "epoch": 0.76, + "grad_norm": 2.4609607660129162, + "learning_rate": 1.4774443910808023e-06, + "loss": 0.6776, + "step": 7188 + }, + { + "epoch": 0.76, + "grad_norm": 2.6666982763784812, + "learning_rate": 1.476235109910576e-06, + "loss": 0.6191, + "step": 7189 + }, + { + "epoch": 0.76, + "grad_norm": 3.56167731467775, + "learning_rate": 1.4750262381211665e-06, + "loss": 0.5887, + "step": 7190 + }, + { + "epoch": 0.76, + "grad_norm": 2.44278557698414, + "learning_rate": 1.4738177758530208e-06, + "loss": 0.6259, + "step": 7191 + }, + { + "epoch": 0.76, + "grad_norm": 2.5250241812484906, + "learning_rate": 1.4726097232465314e-06, + "loss": 0.6089, + "step": 7192 + }, + { + "epoch": 0.76, + "grad_norm": 3.2400336695589744, + "learning_rate": 1.471402080442047e-06, + "loss": 0.5666, + "step": 7193 + }, + { + "epoch": 0.76, + "grad_norm": 2.7151462386306657, + "learning_rate": 1.470194847579871e-06, + "loss": 0.6301, + "step": 7194 + }, + { + "epoch": 0.76, + "grad_norm": 2.698877450197644, + "learning_rate": 1.4689880248002537e-06, + "loss": 0.7084, + "step": 7195 + }, + { + "epoch": 0.76, + "grad_norm": 2.6410464096856665, + "learning_rate": 1.467781612243399e-06, + "loss": 0.6245, + "step": 7196 + }, + { + "epoch": 0.76, + "grad_norm": 2.612323546310781, + "learning_rate": 1.466575610049466e-06, + "loss": 0.6783, + "step": 7197 + }, + { + "epoch": 0.76, + "grad_norm": 2.7178474296193564, + "learning_rate": 1.4653700183585663e-06, + "loss": 0.6122, + "step": 7198 + }, + { + "epoch": 0.76, + "grad_norm": 3.512024045937146, + "learning_rate": 1.4641648373107598e-06, + "loss": 0.6147, + "step": 7199 + }, + { + "epoch": 0.76, + "grad_norm": 2.0834058077279685, + "learning_rate": 1.4629600670460603e-06, + "loss": 0.5514, + "step": 7200 + }, + { + "epoch": 0.76, + "grad_norm": 2.216935664488474, + "learning_rate": 1.4617557077044352e-06, + "loss": 0.6227, + "step": 7201 + }, + { + "epoch": 0.76, + "grad_norm": 2.46166081455518, + "learning_rate": 1.4605517594258046e-06, + "loss": 0.5871, + "step": 7202 + }, + { + "epoch": 0.76, + "grad_norm": 2.1948644668696375, + "learning_rate": 1.4593482223500406e-06, + "loss": 0.6353, + "step": 7203 + }, + { + "epoch": 0.76, + "grad_norm": 4.33193407924704, + "learning_rate": 1.4581450966169648e-06, + "loss": 0.6246, + "step": 7204 + }, + { + "epoch": 0.76, + "grad_norm": 2.624255396763174, + "learning_rate": 1.4569423823663515e-06, + "loss": 0.6982, + "step": 7205 + }, + { + "epoch": 0.76, + "grad_norm": 2.3813097303259028, + "learning_rate": 1.4557400797379306e-06, + "loss": 0.5593, + "step": 7206 + }, + { + "epoch": 0.76, + "grad_norm": 2.8170072633637067, + "learning_rate": 1.4545381888713833e-06, + "loss": 0.622, + "step": 7207 + }, + { + "epoch": 0.76, + "grad_norm": 1.0227331919960505, + "learning_rate": 1.45333670990634e-06, + "loss": 0.4883, + "step": 7208 + }, + { + "epoch": 0.76, + "grad_norm": 2.9590588104321056, + "learning_rate": 1.452135642982384e-06, + "loss": 0.6183, + "step": 7209 + }, + { + "epoch": 0.76, + "grad_norm": 2.33117855582943, + "learning_rate": 1.450934988239054e-06, + "loss": 0.6134, + "step": 7210 + }, + { + "epoch": 0.76, + "grad_norm": 2.4557426055942786, + "learning_rate": 1.4497347458158384e-06, + "loss": 0.62, + "step": 7211 + }, + { + "epoch": 0.76, + "grad_norm": 2.926982164970039, + "learning_rate": 1.4485349158521756e-06, + "loss": 0.6036, + "step": 7212 + }, + { + "epoch": 0.76, + "grad_norm": 2.2092246121938164, + "learning_rate": 1.447335498487462e-06, + "loss": 0.6053, + "step": 7213 + }, + { + "epoch": 0.76, + "grad_norm": 2.4017310344462994, + "learning_rate": 1.446136493861039e-06, + "loss": 0.6659, + "step": 7214 + }, + { + "epoch": 0.76, + "grad_norm": 2.1199053957284346, + "learning_rate": 1.4449379021122045e-06, + "loss": 0.6202, + "step": 7215 + }, + { + "epoch": 0.76, + "grad_norm": 2.3367095552631816, + "learning_rate": 1.4437397233802098e-06, + "loss": 0.629, + "step": 7216 + }, + { + "epoch": 0.76, + "grad_norm": 3.0011471861240357, + "learning_rate": 1.4425419578042538e-06, + "loss": 0.6302, + "step": 7217 + }, + { + "epoch": 0.76, + "grad_norm": 2.595981973547213, + "learning_rate": 1.4413446055234882e-06, + "loss": 0.6064, + "step": 7218 + }, + { + "epoch": 0.76, + "grad_norm": 3.0100131890890243, + "learning_rate": 1.4401476666770191e-06, + "loss": 0.6153, + "step": 7219 + }, + { + "epoch": 0.76, + "grad_norm": 2.068773777984486, + "learning_rate": 1.4389511414039053e-06, + "loss": 0.5228, + "step": 7220 + }, + { + "epoch": 0.76, + "grad_norm": 2.183378387130152, + "learning_rate": 1.4377550298431536e-06, + "loss": 0.6687, + "step": 7221 + }, + { + "epoch": 0.76, + "grad_norm": 2.8674171438993237, + "learning_rate": 1.436559332133724e-06, + "loss": 0.6141, + "step": 7222 + }, + { + "epoch": 0.76, + "grad_norm": 2.8843731419145637, + "learning_rate": 1.4353640484145304e-06, + "loss": 0.5749, + "step": 7223 + }, + { + "epoch": 0.76, + "grad_norm": 2.27701809442643, + "learning_rate": 1.4341691788244383e-06, + "loss": 0.6422, + "step": 7224 + }, + { + "epoch": 0.76, + "grad_norm": 2.5322604425389112, + "learning_rate": 1.4329747235022624e-06, + "loss": 0.5397, + "step": 7225 + }, + { + "epoch": 0.76, + "grad_norm": 2.431501916595479, + "learning_rate": 1.431780682586773e-06, + "loss": 0.6515, + "step": 7226 + }, + { + "epoch": 0.76, + "grad_norm": 2.3394101864352983, + "learning_rate": 1.4305870562166873e-06, + "loss": 0.6069, + "step": 7227 + }, + { + "epoch": 0.76, + "grad_norm": 2.2242293964368156, + "learning_rate": 1.4293938445306798e-06, + "loss": 0.5442, + "step": 7228 + }, + { + "epoch": 0.76, + "grad_norm": 2.5833298123991604, + "learning_rate": 1.4282010476673758e-06, + "loss": 0.6004, + "step": 7229 + }, + { + "epoch": 0.76, + "grad_norm": 2.6359655886818913, + "learning_rate": 1.427008665765348e-06, + "loss": 0.6834, + "step": 7230 + }, + { + "epoch": 0.76, + "grad_norm": 2.585971432337667, + "learning_rate": 1.4258166989631245e-06, + "loss": 0.6375, + "step": 7231 + }, + { + "epoch": 0.76, + "grad_norm": 3.257480641444835, + "learning_rate": 1.4246251473991845e-06, + "loss": 0.6148, + "step": 7232 + }, + { + "epoch": 0.76, + "grad_norm": 2.6802149842143397, + "learning_rate": 1.423434011211961e-06, + "loss": 0.6633, + "step": 7233 + }, + { + "epoch": 0.76, + "grad_norm": 2.460163635030121, + "learning_rate": 1.4222432905398353e-06, + "loss": 0.5807, + "step": 7234 + }, + { + "epoch": 0.76, + "grad_norm": 2.22647375488727, + "learning_rate": 1.4210529855211403e-06, + "loss": 0.4943, + "step": 7235 + }, + { + "epoch": 0.76, + "grad_norm": 4.121150139420541, + "learning_rate": 1.4198630962941639e-06, + "loss": 0.6249, + "step": 7236 + }, + { + "epoch": 0.76, + "grad_norm": 2.946918461716052, + "learning_rate": 1.4186736229971455e-06, + "loss": 0.6211, + "step": 7237 + }, + { + "epoch": 0.76, + "grad_norm": 4.528975031658922, + "learning_rate": 1.4174845657682712e-06, + "loss": 0.6419, + "step": 7238 + }, + { + "epoch": 0.76, + "grad_norm": 2.5787193861812496, + "learning_rate": 1.4162959247456854e-06, + "loss": 0.6206, + "step": 7239 + }, + { + "epoch": 0.76, + "grad_norm": 3.406544835370247, + "learning_rate": 1.4151077000674784e-06, + "loss": 0.624, + "step": 7240 + }, + { + "epoch": 0.76, + "grad_norm": 4.207698162326682, + "learning_rate": 1.4139198918716956e-06, + "loss": 0.5709, + "step": 7241 + }, + { + "epoch": 0.76, + "grad_norm": 2.2463252191495924, + "learning_rate": 1.4127325002963355e-06, + "loss": 0.5574, + "step": 7242 + }, + { + "epoch": 0.76, + "grad_norm": 2.3509763553384597, + "learning_rate": 1.4115455254793436e-06, + "loss": 0.5806, + "step": 7243 + }, + { + "epoch": 0.76, + "grad_norm": 2.52195463891043, + "learning_rate": 1.4103589675586176e-06, + "loss": 0.6281, + "step": 7244 + }, + { + "epoch": 0.76, + "grad_norm": 2.6057177351757734, + "learning_rate": 1.4091728266720106e-06, + "loss": 0.5374, + "step": 7245 + }, + { + "epoch": 0.76, + "grad_norm": 2.378426724174707, + "learning_rate": 1.4079871029573254e-06, + "loss": 0.5312, + "step": 7246 + }, + { + "epoch": 0.76, + "grad_norm": 2.4658224720117308, + "learning_rate": 1.4068017965523146e-06, + "loss": 0.6043, + "step": 7247 + }, + { + "epoch": 0.76, + "grad_norm": 2.367796137825628, + "learning_rate": 1.4056169075946846e-06, + "loss": 0.5532, + "step": 7248 + }, + { + "epoch": 0.76, + "grad_norm": 2.4057618387618436, + "learning_rate": 1.4044324362220912e-06, + "loss": 0.608, + "step": 7249 + }, + { + "epoch": 0.76, + "grad_norm": 3.7601186688014003, + "learning_rate": 1.4032483825721432e-06, + "loss": 0.6427, + "step": 7250 + }, + { + "epoch": 0.76, + "grad_norm": 2.5498571521744045, + "learning_rate": 1.4020647467824028e-06, + "loss": 0.6887, + "step": 7251 + }, + { + "epoch": 0.76, + "grad_norm": 0.9344224790730087, + "learning_rate": 1.4008815289903798e-06, + "loss": 0.5365, + "step": 7252 + }, + { + "epoch": 0.76, + "grad_norm": 2.219760527227203, + "learning_rate": 1.3996987293335345e-06, + "loss": 0.6433, + "step": 7253 + }, + { + "epoch": 0.76, + "grad_norm": 8.310538446761164, + "learning_rate": 1.3985163479492842e-06, + "loss": 0.6225, + "step": 7254 + }, + { + "epoch": 0.76, + "grad_norm": 2.8517117856482517, + "learning_rate": 1.3973343849749948e-06, + "loss": 0.6371, + "step": 7255 + }, + { + "epoch": 0.76, + "grad_norm": 2.661140852133978, + "learning_rate": 1.3961528405479824e-06, + "loss": 0.6281, + "step": 7256 + }, + { + "epoch": 0.76, + "grad_norm": 2.2805811042491286, + "learning_rate": 1.3949717148055136e-06, + "loss": 0.5957, + "step": 7257 + }, + { + "epoch": 0.76, + "grad_norm": 3.300483005629338, + "learning_rate": 1.39379100788481e-06, + "loss": 0.6551, + "step": 7258 + }, + { + "epoch": 0.76, + "grad_norm": 2.485309336234544, + "learning_rate": 1.3926107199230442e-06, + "loss": 0.6081, + "step": 7259 + }, + { + "epoch": 0.76, + "grad_norm": 8.341174494875231, + "learning_rate": 1.3914308510573354e-06, + "loss": 0.6515, + "step": 7260 + }, + { + "epoch": 0.76, + "grad_norm": 2.8732963511207283, + "learning_rate": 1.3902514014247608e-06, + "loss": 0.5541, + "step": 7261 + }, + { + "epoch": 0.76, + "grad_norm": 2.2043168478650896, + "learning_rate": 1.3890723711623421e-06, + "loss": 0.588, + "step": 7262 + }, + { + "epoch": 0.76, + "grad_norm": 2.853557533816863, + "learning_rate": 1.3878937604070568e-06, + "loss": 0.5689, + "step": 7263 + }, + { + "epoch": 0.76, + "grad_norm": 2.8253444135108845, + "learning_rate": 1.386715569295835e-06, + "loss": 0.5745, + "step": 7264 + }, + { + "epoch": 0.76, + "grad_norm": 0.9796152518485929, + "learning_rate": 1.3855377979655533e-06, + "loss": 0.5749, + "step": 7265 + }, + { + "epoch": 0.76, + "grad_norm": 3.6160882171024125, + "learning_rate": 1.3843604465530398e-06, + "loss": 0.5622, + "step": 7266 + }, + { + "epoch": 0.76, + "grad_norm": 2.0041115107011476, + "learning_rate": 1.3831835151950784e-06, + "loss": 0.5528, + "step": 7267 + }, + { + "epoch": 0.76, + "grad_norm": 2.16044720788834, + "learning_rate": 1.3820070040284023e-06, + "loss": 0.6886, + "step": 7268 + }, + { + "epoch": 0.76, + "grad_norm": 2.8645110276055927, + "learning_rate": 1.380830913189694e-06, + "loss": 0.6323, + "step": 7269 + }, + { + "epoch": 0.77, + "grad_norm": 2.0345813564664548, + "learning_rate": 1.3796552428155868e-06, + "loss": 0.5463, + "step": 7270 + }, + { + "epoch": 0.77, + "grad_norm": 2.4861411344231317, + "learning_rate": 1.378479993042668e-06, + "loss": 0.6813, + "step": 7271 + }, + { + "epoch": 0.77, + "grad_norm": 2.5548781593021594, + "learning_rate": 1.3773051640074764e-06, + "loss": 0.6686, + "step": 7272 + }, + { + "epoch": 0.77, + "grad_norm": 2.7242847584411165, + "learning_rate": 1.3761307558464975e-06, + "loss": 0.635, + "step": 7273 + }, + { + "epoch": 0.77, + "grad_norm": 4.079673277666802, + "learning_rate": 1.3749567686961728e-06, + "loss": 0.6109, + "step": 7274 + }, + { + "epoch": 0.77, + "grad_norm": 0.9119319275996398, + "learning_rate": 1.3737832026928905e-06, + "loss": 0.5083, + "step": 7275 + }, + { + "epoch": 0.77, + "grad_norm": 2.645571565188374, + "learning_rate": 1.3726100579729935e-06, + "loss": 0.6791, + "step": 7276 + }, + { + "epoch": 0.77, + "grad_norm": 2.280058640830768, + "learning_rate": 1.3714373346727754e-06, + "loss": 0.6213, + "step": 7277 + }, + { + "epoch": 0.77, + "grad_norm": 2.3684177527062755, + "learning_rate": 1.3702650329284794e-06, + "loss": 0.6239, + "step": 7278 + }, + { + "epoch": 0.77, + "grad_norm": 3.5713000588210955, + "learning_rate": 1.3690931528762974e-06, + "loss": 0.6513, + "step": 7279 + }, + { + "epoch": 0.77, + "grad_norm": 2.191606619177731, + "learning_rate": 1.3679216946523771e-06, + "loss": 0.6179, + "step": 7280 + }, + { + "epoch": 0.77, + "grad_norm": 2.520087458009411, + "learning_rate": 1.3667506583928163e-06, + "loss": 0.6273, + "step": 7281 + }, + { + "epoch": 0.77, + "grad_norm": 2.8211254894412083, + "learning_rate": 1.3655800442336597e-06, + "loss": 0.6629, + "step": 7282 + }, + { + "epoch": 0.77, + "grad_norm": 2.5218498030614986, + "learning_rate": 1.3644098523109096e-06, + "loss": 0.5631, + "step": 7283 + }, + { + "epoch": 0.77, + "grad_norm": 2.187549795477403, + "learning_rate": 1.3632400827605113e-06, + "loss": 0.552, + "step": 7284 + }, + { + "epoch": 0.77, + "grad_norm": 4.067246657340486, + "learning_rate": 1.3620707357183694e-06, + "loss": 0.6166, + "step": 7285 + }, + { + "epoch": 0.77, + "grad_norm": 15.55712958329288, + "learning_rate": 1.3609018113203314e-06, + "loss": 0.6316, + "step": 7286 + }, + { + "epoch": 0.77, + "grad_norm": 2.5699486322531566, + "learning_rate": 1.3597333097022031e-06, + "loss": 0.6579, + "step": 7287 + }, + { + "epoch": 0.77, + "grad_norm": 2.606631331064434, + "learning_rate": 1.3585652309997344e-06, + "loss": 0.6924, + "step": 7288 + }, + { + "epoch": 0.77, + "grad_norm": 2.4639655618590646, + "learning_rate": 1.3573975753486313e-06, + "loss": 0.6486, + "step": 7289 + }, + { + "epoch": 0.77, + "grad_norm": 3.494505254711063, + "learning_rate": 1.356230342884549e-06, + "loss": 0.6138, + "step": 7290 + }, + { + "epoch": 0.77, + "grad_norm": 2.6073670766020154, + "learning_rate": 1.3550635337430928e-06, + "loss": 0.5951, + "step": 7291 + }, + { + "epoch": 0.77, + "grad_norm": 3.2580430689462734, + "learning_rate": 1.3538971480598167e-06, + "loss": 0.6027, + "step": 7292 + }, + { + "epoch": 0.77, + "grad_norm": 0.9517406444625436, + "learning_rate": 1.3527311859702308e-06, + "loss": 0.5322, + "step": 7293 + }, + { + "epoch": 0.77, + "grad_norm": 2.860206476602887, + "learning_rate": 1.3515656476097937e-06, + "loss": 0.5832, + "step": 7294 + }, + { + "epoch": 0.77, + "grad_norm": 4.9771138110934405, + "learning_rate": 1.350400533113912e-06, + "loss": 0.543, + "step": 7295 + }, + { + "epoch": 0.77, + "grad_norm": 2.582876236962234, + "learning_rate": 1.3492358426179475e-06, + "loss": 0.6746, + "step": 7296 + }, + { + "epoch": 0.77, + "grad_norm": 2.0636423377041937, + "learning_rate": 1.3480715762572078e-06, + "loss": 0.657, + "step": 7297 + }, + { + "epoch": 0.77, + "grad_norm": 2.220396192129797, + "learning_rate": 1.3469077341669579e-06, + "loss": 0.6404, + "step": 7298 + }, + { + "epoch": 0.77, + "grad_norm": 2.8263887939999264, + "learning_rate": 1.3457443164824053e-06, + "loss": 0.6004, + "step": 7299 + }, + { + "epoch": 0.77, + "grad_norm": 2.70882407444034, + "learning_rate": 1.3445813233387167e-06, + "loss": 0.7198, + "step": 7300 + }, + { + "epoch": 0.77, + "grad_norm": 3.3517602576307355, + "learning_rate": 1.3434187548710014e-06, + "loss": 0.6133, + "step": 7301 + }, + { + "epoch": 0.77, + "grad_norm": 2.8385464767154356, + "learning_rate": 1.3422566112143248e-06, + "loss": 0.527, + "step": 7302 + }, + { + "epoch": 0.77, + "grad_norm": 2.7137520222257563, + "learning_rate": 1.3410948925037037e-06, + "loss": 0.7291, + "step": 7303 + }, + { + "epoch": 0.77, + "grad_norm": 2.238965834669248, + "learning_rate": 1.3399335988741007e-06, + "loss": 0.6578, + "step": 7304 + }, + { + "epoch": 0.77, + "grad_norm": 0.9827149232414178, + "learning_rate": 1.338772730460431e-06, + "loss": 0.5438, + "step": 7305 + }, + { + "epoch": 0.77, + "grad_norm": 4.982687189717289, + "learning_rate": 1.3376122873975616e-06, + "loss": 0.5661, + "step": 7306 + }, + { + "epoch": 0.77, + "grad_norm": 2.394229778749516, + "learning_rate": 1.3364522698203114e-06, + "loss": 0.5627, + "step": 7307 + }, + { + "epoch": 0.77, + "grad_norm": 2.5676561438034358, + "learning_rate": 1.3352926778634446e-06, + "loss": 0.5664, + "step": 7308 + }, + { + "epoch": 0.77, + "grad_norm": 4.69224823273842, + "learning_rate": 1.3341335116616822e-06, + "loss": 0.5727, + "step": 7309 + }, + { + "epoch": 0.77, + "grad_norm": 2.8984898812420727, + "learning_rate": 1.3329747713496904e-06, + "loss": 0.6014, + "step": 7310 + }, + { + "epoch": 0.77, + "grad_norm": 2.5133785385178657, + "learning_rate": 1.331816457062089e-06, + "loss": 0.6487, + "step": 7311 + }, + { + "epoch": 0.77, + "grad_norm": 2.550922385813062, + "learning_rate": 1.3306585689334494e-06, + "loss": 0.605, + "step": 7312 + }, + { + "epoch": 0.77, + "grad_norm": 2.6330342158074487, + "learning_rate": 1.3295011070982906e-06, + "loss": 0.5916, + "step": 7313 + }, + { + "epoch": 0.77, + "grad_norm": 2.906335153096584, + "learning_rate": 1.3283440716910812e-06, + "loss": 0.6149, + "step": 7314 + }, + { + "epoch": 0.77, + "grad_norm": 4.095159349488047, + "learning_rate": 1.327187462846244e-06, + "loss": 0.5979, + "step": 7315 + }, + { + "epoch": 0.77, + "grad_norm": 2.8365869518598976, + "learning_rate": 1.3260312806981517e-06, + "loss": 0.4815, + "step": 7316 + }, + { + "epoch": 0.77, + "grad_norm": 2.072771877951047, + "learning_rate": 1.3248755253811236e-06, + "loss": 0.5718, + "step": 7317 + }, + { + "epoch": 0.77, + "grad_norm": 3.5814898012658896, + "learning_rate": 1.3237201970294344e-06, + "loss": 0.5606, + "step": 7318 + }, + { + "epoch": 0.77, + "grad_norm": 2.290088791951938, + "learning_rate": 1.3225652957773044e-06, + "loss": 0.6147, + "step": 7319 + }, + { + "epoch": 0.77, + "grad_norm": 2.485404940783879, + "learning_rate": 1.3214108217589095e-06, + "loss": 0.5559, + "step": 7320 + }, + { + "epoch": 0.77, + "grad_norm": 2.890814837354043, + "learning_rate": 1.3202567751083701e-06, + "loss": 0.5864, + "step": 7321 + }, + { + "epoch": 0.77, + "grad_norm": 2.3304011547001933, + "learning_rate": 1.3191031559597628e-06, + "loss": 0.686, + "step": 7322 + }, + { + "epoch": 0.77, + "grad_norm": 2.939817245282438, + "learning_rate": 1.3179499644471088e-06, + "loss": 0.7083, + "step": 7323 + }, + { + "epoch": 0.77, + "grad_norm": 1.9322448250042434, + "learning_rate": 1.3167972007043844e-06, + "loss": 0.617, + "step": 7324 + }, + { + "epoch": 0.77, + "grad_norm": 2.2730836852993885, + "learning_rate": 1.3156448648655163e-06, + "loss": 0.6175, + "step": 7325 + }, + { + "epoch": 0.77, + "grad_norm": 2.216691599440806, + "learning_rate": 1.3144929570643767e-06, + "loss": 0.5699, + "step": 7326 + }, + { + "epoch": 0.77, + "grad_norm": 2.2709102711271103, + "learning_rate": 1.3133414774347903e-06, + "loss": 0.6783, + "step": 7327 + }, + { + "epoch": 0.77, + "grad_norm": 3.7008961073983926, + "learning_rate": 1.3121904261105339e-06, + "loss": 0.5866, + "step": 7328 + }, + { + "epoch": 0.77, + "grad_norm": 1.0182432463584763, + "learning_rate": 1.3110398032253346e-06, + "loss": 0.5087, + "step": 7329 + }, + { + "epoch": 0.77, + "grad_norm": 1.0497274511645123, + "learning_rate": 1.3098896089128666e-06, + "loss": 0.5744, + "step": 7330 + }, + { + "epoch": 0.77, + "grad_norm": 2.2533986618656225, + "learning_rate": 1.3087398433067577e-06, + "loss": 0.5694, + "step": 7331 + }, + { + "epoch": 0.77, + "grad_norm": 2.4413558422766193, + "learning_rate": 1.307590506540582e-06, + "loss": 0.616, + "step": 7332 + }, + { + "epoch": 0.77, + "grad_norm": 2.3644581867426897, + "learning_rate": 1.3064415987478691e-06, + "loss": 0.6208, + "step": 7333 + }, + { + "epoch": 0.77, + "grad_norm": 0.9694715957645057, + "learning_rate": 1.3052931200620926e-06, + "loss": 0.5448, + "step": 7334 + }, + { + "epoch": 0.77, + "grad_norm": 2.989251780651652, + "learning_rate": 1.3041450706166831e-06, + "loss": 0.5686, + "step": 7335 + }, + { + "epoch": 0.77, + "grad_norm": 2.568697403594937, + "learning_rate": 1.3029974505450137e-06, + "loss": 0.6258, + "step": 7336 + }, + { + "epoch": 0.77, + "grad_norm": 2.4006514658503697, + "learning_rate": 1.301850259980414e-06, + "loss": 0.587, + "step": 7337 + }, + { + "epoch": 0.77, + "grad_norm": 2.4391008775614704, + "learning_rate": 1.3007034990561619e-06, + "loss": 0.6487, + "step": 7338 + }, + { + "epoch": 0.77, + "grad_norm": 2.27176792056776, + "learning_rate": 1.2995571679054835e-06, + "loss": 0.6879, + "step": 7339 + }, + { + "epoch": 0.77, + "grad_norm": 2.332459416723728, + "learning_rate": 1.2984112666615555e-06, + "loss": 0.5647, + "step": 7340 + }, + { + "epoch": 0.77, + "grad_norm": 1.0350097122071702, + "learning_rate": 1.2972657954575064e-06, + "loss": 0.541, + "step": 7341 + }, + { + "epoch": 0.77, + "grad_norm": 2.3128014660034273, + "learning_rate": 1.2961207544264149e-06, + "loss": 0.6494, + "step": 7342 + }, + { + "epoch": 0.77, + "grad_norm": 2.281808135510517, + "learning_rate": 1.2949761437013059e-06, + "loss": 0.6048, + "step": 7343 + }, + { + "epoch": 0.77, + "grad_norm": 2.495322499540691, + "learning_rate": 1.2938319634151597e-06, + "loss": 0.5942, + "step": 7344 + }, + { + "epoch": 0.77, + "grad_norm": 3.914464189765621, + "learning_rate": 1.2926882137009012e-06, + "loss": 0.66, + "step": 7345 + }, + { + "epoch": 0.77, + "grad_norm": 3.098291543647439, + "learning_rate": 1.2915448946914106e-06, + "loss": 0.5978, + "step": 7346 + }, + { + "epoch": 0.77, + "grad_norm": 2.435837683057105, + "learning_rate": 1.2904020065195127e-06, + "loss": 0.5946, + "step": 7347 + }, + { + "epoch": 0.77, + "grad_norm": 2.4918968126223633, + "learning_rate": 1.2892595493179876e-06, + "loss": 0.6562, + "step": 7348 + }, + { + "epoch": 0.77, + "grad_norm": 2.2303147723923877, + "learning_rate": 1.2881175232195604e-06, + "loss": 0.6037, + "step": 7349 + }, + { + "epoch": 0.77, + "grad_norm": 2.4085569825882223, + "learning_rate": 1.2869759283569088e-06, + "loss": 0.5633, + "step": 7350 + }, + { + "epoch": 0.77, + "grad_norm": 2.9401916376736774, + "learning_rate": 1.2858347648626623e-06, + "loss": 0.6793, + "step": 7351 + }, + { + "epoch": 0.77, + "grad_norm": 2.620281409590404, + "learning_rate": 1.2846940328693952e-06, + "loss": 0.5221, + "step": 7352 + }, + { + "epoch": 0.77, + "grad_norm": 2.460645653228923, + "learning_rate": 1.2835537325096364e-06, + "loss": 0.6427, + "step": 7353 + }, + { + "epoch": 0.77, + "grad_norm": 2.1645773174699605, + "learning_rate": 1.2824138639158607e-06, + "loss": 0.5951, + "step": 7354 + }, + { + "epoch": 0.77, + "grad_norm": 1.940681938546237, + "learning_rate": 1.2812744272204969e-06, + "loss": 0.6173, + "step": 7355 + }, + { + "epoch": 0.77, + "grad_norm": 1.9852641803573394, + "learning_rate": 1.2801354225559194e-06, + "loss": 0.634, + "step": 7356 + }, + { + "epoch": 0.77, + "grad_norm": 2.408422665342694, + "learning_rate": 1.2789968500544563e-06, + "loss": 0.5832, + "step": 7357 + }, + { + "epoch": 0.77, + "grad_norm": 2.54563268747217, + "learning_rate": 1.277858709848382e-06, + "loss": 0.5927, + "step": 7358 + }, + { + "epoch": 0.77, + "grad_norm": 3.892292877622211, + "learning_rate": 1.2767210020699234e-06, + "loss": 0.6332, + "step": 7359 + }, + { + "epoch": 0.77, + "grad_norm": 0.9690186601279351, + "learning_rate": 1.2755837268512566e-06, + "loss": 0.5541, + "step": 7360 + }, + { + "epoch": 0.77, + "grad_norm": 2.3973108957180673, + "learning_rate": 1.2744468843245066e-06, + "loss": 0.6973, + "step": 7361 + }, + { + "epoch": 0.77, + "grad_norm": 9.445703072300839, + "learning_rate": 1.2733104746217468e-06, + "loss": 0.5102, + "step": 7362 + }, + { + "epoch": 0.77, + "grad_norm": 2.4460880394351956, + "learning_rate": 1.2721744978750028e-06, + "loss": 0.6275, + "step": 7363 + }, + { + "epoch": 0.77, + "grad_norm": 2.5731172106909193, + "learning_rate": 1.271038954216251e-06, + "loss": 0.567, + "step": 7364 + }, + { + "epoch": 0.78, + "grad_norm": 3.312904188551475, + "learning_rate": 1.269903843777413e-06, + "loss": 0.6004, + "step": 7365 + }, + { + "epoch": 0.78, + "grad_norm": 2.3261426877154916, + "learning_rate": 1.2687691666903657e-06, + "loss": 0.5648, + "step": 7366 + }, + { + "epoch": 0.78, + "grad_norm": 4.634024092091831, + "learning_rate": 1.2676349230869283e-06, + "loss": 0.6585, + "step": 7367 + }, + { + "epoch": 0.78, + "grad_norm": 2.5573244615411075, + "learning_rate": 1.2665011130988786e-06, + "loss": 0.5732, + "step": 7368 + }, + { + "epoch": 0.78, + "grad_norm": 2.2354267402034234, + "learning_rate": 1.2653677368579354e-06, + "loss": 0.5843, + "step": 7369 + }, + { + "epoch": 0.78, + "grad_norm": 2.2289656447356867, + "learning_rate": 1.2642347944957744e-06, + "loss": 0.7049, + "step": 7370 + }, + { + "epoch": 0.78, + "grad_norm": 3.1454417786703353, + "learning_rate": 1.2631022861440145e-06, + "loss": 0.5499, + "step": 7371 + }, + { + "epoch": 0.78, + "grad_norm": 3.367273641722864, + "learning_rate": 1.2619702119342286e-06, + "loss": 0.5656, + "step": 7372 + }, + { + "epoch": 0.78, + "grad_norm": 2.156216839264307, + "learning_rate": 1.2608385719979394e-06, + "loss": 0.6229, + "step": 7373 + }, + { + "epoch": 0.78, + "grad_norm": 3.707241733582545, + "learning_rate": 1.2597073664666159e-06, + "loss": 0.5974, + "step": 7374 + }, + { + "epoch": 0.78, + "grad_norm": 2.589860918286414, + "learning_rate": 1.2585765954716773e-06, + "loss": 0.6479, + "step": 7375 + }, + { + "epoch": 0.78, + "grad_norm": 2.1972502473528763, + "learning_rate": 1.257446259144494e-06, + "loss": 0.6352, + "step": 7376 + }, + { + "epoch": 0.78, + "grad_norm": 2.8739108933647244, + "learning_rate": 1.2563163576163879e-06, + "loss": 0.6169, + "step": 7377 + }, + { + "epoch": 0.78, + "grad_norm": 2.599760260919812, + "learning_rate": 1.2551868910186238e-06, + "loss": 0.5686, + "step": 7378 + }, + { + "epoch": 0.78, + "grad_norm": 1.0036011302793741, + "learning_rate": 1.2540578594824226e-06, + "loss": 0.5261, + "step": 7379 + }, + { + "epoch": 0.78, + "grad_norm": 3.1369364911691604, + "learning_rate": 1.25292926313895e-06, + "loss": 0.5366, + "step": 7380 + }, + { + "epoch": 0.78, + "grad_norm": 2.733863227114797, + "learning_rate": 1.251801102119325e-06, + "loss": 0.5542, + "step": 7381 + }, + { + "epoch": 0.78, + "grad_norm": 4.392211563561417, + "learning_rate": 1.2506733765546115e-06, + "loss": 0.5557, + "step": 7382 + }, + { + "epoch": 0.78, + "grad_norm": 7.2289899376136315, + "learning_rate": 1.2495460865758286e-06, + "loss": 0.6201, + "step": 7383 + }, + { + "epoch": 0.78, + "grad_norm": 2.389324115301021, + "learning_rate": 1.2484192323139382e-06, + "loss": 0.5614, + "step": 7384 + }, + { + "epoch": 0.78, + "grad_norm": 2.4415323240042524, + "learning_rate": 1.2472928138998569e-06, + "loss": 0.5961, + "step": 7385 + }, + { + "epoch": 0.78, + "grad_norm": 3.376325421905973, + "learning_rate": 1.2461668314644499e-06, + "loss": 0.551, + "step": 7386 + }, + { + "epoch": 0.78, + "grad_norm": 2.6372801647944795, + "learning_rate": 1.2450412851385275e-06, + "loss": 0.6422, + "step": 7387 + }, + { + "epoch": 0.78, + "grad_norm": 1.0428132195188833, + "learning_rate": 1.2439161750528555e-06, + "loss": 0.5497, + "step": 7388 + }, + { + "epoch": 0.78, + "grad_norm": 4.5630278208472985, + "learning_rate": 1.2427915013381436e-06, + "loss": 0.5869, + "step": 7389 + }, + { + "epoch": 0.78, + "grad_norm": 2.0559100223708637, + "learning_rate": 1.2416672641250548e-06, + "loss": 0.5977, + "step": 7390 + }, + { + "epoch": 0.78, + "grad_norm": 2.4409035745001746, + "learning_rate": 1.2405434635441982e-06, + "loss": 0.559, + "step": 7391 + }, + { + "epoch": 0.78, + "grad_norm": 4.831933762565664, + "learning_rate": 1.2394200997261358e-06, + "loss": 0.6682, + "step": 7392 + }, + { + "epoch": 0.78, + "grad_norm": 0.8825046392899807, + "learning_rate": 1.2382971728013742e-06, + "loss": 0.5093, + "step": 7393 + }, + { + "epoch": 0.78, + "grad_norm": 2.5797992352654386, + "learning_rate": 1.2371746829003745e-06, + "loss": 0.6392, + "step": 7394 + }, + { + "epoch": 0.78, + "grad_norm": 2.9414008384348906, + "learning_rate": 1.2360526301535408e-06, + "loss": 0.5538, + "step": 7395 + }, + { + "epoch": 0.78, + "grad_norm": 2.37560198530972, + "learning_rate": 1.234931014691234e-06, + "loss": 0.6542, + "step": 7396 + }, + { + "epoch": 0.78, + "grad_norm": 2.760017872274412, + "learning_rate": 1.2338098366437574e-06, + "loss": 0.6122, + "step": 7397 + }, + { + "epoch": 0.78, + "grad_norm": 2.8422640961336607, + "learning_rate": 1.2326890961413663e-06, + "loss": 0.5873, + "step": 7398 + }, + { + "epoch": 0.78, + "grad_norm": 3.1453980578084426, + "learning_rate": 1.2315687933142672e-06, + "loss": 0.6239, + "step": 7399 + }, + { + "epoch": 0.78, + "grad_norm": 2.749851183004622, + "learning_rate": 1.2304489282926109e-06, + "loss": 0.5232, + "step": 7400 + }, + { + "epoch": 0.78, + "grad_norm": 2.472329629666702, + "learning_rate": 1.2293295012065032e-06, + "loss": 0.6043, + "step": 7401 + }, + { + "epoch": 0.78, + "grad_norm": 2.7136693979225757, + "learning_rate": 1.228210512185992e-06, + "loss": 0.572, + "step": 7402 + }, + { + "epoch": 0.78, + "grad_norm": 2.3539836790068907, + "learning_rate": 1.2270919613610828e-06, + "loss": 0.5805, + "step": 7403 + }, + { + "epoch": 0.78, + "grad_norm": 2.4768244757191487, + "learning_rate": 1.2259738488617211e-06, + "loss": 0.6012, + "step": 7404 + }, + { + "epoch": 0.78, + "grad_norm": 2.6680180975475234, + "learning_rate": 1.2248561748178094e-06, + "loss": 0.647, + "step": 7405 + }, + { + "epoch": 0.78, + "grad_norm": 2.2801324593756784, + "learning_rate": 1.2237389393591931e-06, + "loss": 0.688, + "step": 7406 + }, + { + "epoch": 0.78, + "grad_norm": 3.0717420409196436, + "learning_rate": 1.222622142615671e-06, + "loss": 0.6413, + "step": 7407 + }, + { + "epoch": 0.78, + "grad_norm": 2.3216939444158515, + "learning_rate": 1.2215057847169904e-06, + "loss": 0.5048, + "step": 7408 + }, + { + "epoch": 0.78, + "grad_norm": 2.7933790970927896, + "learning_rate": 1.2203898657928453e-06, + "loss": 0.6127, + "step": 7409 + }, + { + "epoch": 0.78, + "grad_norm": 2.6973298552671556, + "learning_rate": 1.2192743859728784e-06, + "loss": 0.5957, + "step": 7410 + }, + { + "epoch": 0.78, + "grad_norm": 3.010078673364939, + "learning_rate": 1.2181593453866841e-06, + "loss": 0.5948, + "step": 7411 + }, + { + "epoch": 0.78, + "grad_norm": 1.0077226242761792, + "learning_rate": 1.2170447441638067e-06, + "loss": 0.5568, + "step": 7412 + }, + { + "epoch": 0.78, + "grad_norm": 2.3706862467692345, + "learning_rate": 1.2159305824337337e-06, + "loss": 0.5863, + "step": 7413 + }, + { + "epoch": 0.78, + "grad_norm": 3.0863837485718935, + "learning_rate": 1.2148168603259086e-06, + "loss": 0.6177, + "step": 7414 + }, + { + "epoch": 0.78, + "grad_norm": 3.246542506000711, + "learning_rate": 1.213703577969717e-06, + "loss": 0.5643, + "step": 7415 + }, + { + "epoch": 0.78, + "grad_norm": 2.2507630179043505, + "learning_rate": 1.2125907354945004e-06, + "loss": 0.5588, + "step": 7416 + }, + { + "epoch": 0.78, + "grad_norm": 3.077759944294835, + "learning_rate": 1.2114783330295426e-06, + "loss": 0.6429, + "step": 7417 + }, + { + "epoch": 0.78, + "grad_norm": 2.1001739169623956, + "learning_rate": 1.210366370704082e-06, + "loss": 0.6044, + "step": 7418 + }, + { + "epoch": 0.78, + "grad_norm": 2.0635599866105245, + "learning_rate": 1.2092548486473e-06, + "loss": 0.5829, + "step": 7419 + }, + { + "epoch": 0.78, + "grad_norm": 2.7552402862941308, + "learning_rate": 1.2081437669883323e-06, + "loss": 0.611, + "step": 7420 + }, + { + "epoch": 0.78, + "grad_norm": 2.6069981960534463, + "learning_rate": 1.2070331258562612e-06, + "loss": 0.6104, + "step": 7421 + }, + { + "epoch": 0.78, + "grad_norm": 2.173648068395166, + "learning_rate": 1.2059229253801164e-06, + "loss": 0.6458, + "step": 7422 + }, + { + "epoch": 0.78, + "grad_norm": 2.606347301566492, + "learning_rate": 1.2048131656888801e-06, + "loss": 0.6129, + "step": 7423 + }, + { + "epoch": 0.78, + "grad_norm": 4.002157820915137, + "learning_rate": 1.2037038469114775e-06, + "loss": 0.6574, + "step": 7424 + }, + { + "epoch": 0.78, + "grad_norm": 2.1433237595069006, + "learning_rate": 1.2025949691767895e-06, + "loss": 0.6796, + "step": 7425 + }, + { + "epoch": 0.78, + "grad_norm": 2.6482892199703247, + "learning_rate": 1.2014865326136393e-06, + "loss": 0.6542, + "step": 7426 + }, + { + "epoch": 0.78, + "grad_norm": 2.8554690646087244, + "learning_rate": 1.2003785373508054e-06, + "loss": 0.5909, + "step": 7427 + }, + { + "epoch": 0.78, + "grad_norm": 2.1151193571586755, + "learning_rate": 1.1992709835170075e-06, + "loss": 0.6143, + "step": 7428 + }, + { + "epoch": 0.78, + "grad_norm": 2.3281163582456483, + "learning_rate": 1.198163871240921e-06, + "loss": 0.4967, + "step": 7429 + }, + { + "epoch": 0.78, + "grad_norm": 2.3181484526166045, + "learning_rate": 1.197057200651165e-06, + "loss": 0.6032, + "step": 7430 + }, + { + "epoch": 0.78, + "grad_norm": 4.4330393599119065, + "learning_rate": 1.195950971876312e-06, + "loss": 0.5914, + "step": 7431 + }, + { + "epoch": 0.78, + "grad_norm": 2.740984634407769, + "learning_rate": 1.1948451850448767e-06, + "loss": 0.7403, + "step": 7432 + }, + { + "epoch": 0.78, + "grad_norm": 2.3670282194811905, + "learning_rate": 1.1937398402853283e-06, + "loss": 0.5251, + "step": 7433 + }, + { + "epoch": 0.78, + "grad_norm": 1.0276430189430175, + "learning_rate": 1.1926349377260843e-06, + "loss": 0.538, + "step": 7434 + }, + { + "epoch": 0.78, + "grad_norm": 2.4380874996957984, + "learning_rate": 1.1915304774955054e-06, + "loss": 0.6245, + "step": 7435 + }, + { + "epoch": 0.78, + "grad_norm": 3.242941226659693, + "learning_rate": 1.1904264597219078e-06, + "loss": 0.5773, + "step": 7436 + }, + { + "epoch": 0.78, + "grad_norm": 2.993175401734412, + "learning_rate": 1.189322884533551e-06, + "loss": 0.6, + "step": 7437 + }, + { + "epoch": 0.78, + "grad_norm": 0.9803046786575148, + "learning_rate": 1.1882197520586464e-06, + "loss": 0.5364, + "step": 7438 + }, + { + "epoch": 0.78, + "grad_norm": 3.157862409696275, + "learning_rate": 1.1871170624253515e-06, + "loss": 0.5523, + "step": 7439 + }, + { + "epoch": 0.78, + "grad_norm": 2.5580589814696273, + "learning_rate": 1.1860148157617757e-06, + "loss": 0.6233, + "step": 7440 + }, + { + "epoch": 0.78, + "grad_norm": 2.4718486982211854, + "learning_rate": 1.1849130121959717e-06, + "loss": 0.5901, + "step": 7441 + }, + { + "epoch": 0.78, + "grad_norm": 2.4594217860273098, + "learning_rate": 1.1838116518559474e-06, + "loss": 0.6861, + "step": 7442 + }, + { + "epoch": 0.78, + "grad_norm": 0.9338351636324551, + "learning_rate": 1.1827107348696526e-06, + "loss": 0.5445, + "step": 7443 + }, + { + "epoch": 0.78, + "grad_norm": 2.3059234955489814, + "learning_rate": 1.181610261364991e-06, + "loss": 0.5499, + "step": 7444 + }, + { + "epoch": 0.78, + "grad_norm": 2.581067104733737, + "learning_rate": 1.1805102314698103e-06, + "loss": 0.6374, + "step": 7445 + }, + { + "epoch": 0.78, + "grad_norm": 2.9256095254675536, + "learning_rate": 1.1794106453119098e-06, + "loss": 0.5952, + "step": 7446 + }, + { + "epoch": 0.78, + "grad_norm": 2.5727809288583776, + "learning_rate": 1.1783115030190378e-06, + "loss": 0.6924, + "step": 7447 + }, + { + "epoch": 0.78, + "grad_norm": 4.922966924396616, + "learning_rate": 1.1772128047188864e-06, + "loss": 0.6865, + "step": 7448 + }, + { + "epoch": 0.78, + "grad_norm": 2.39429256222939, + "learning_rate": 1.1761145505391025e-06, + "loss": 0.6303, + "step": 7449 + }, + { + "epoch": 0.78, + "grad_norm": 2.7885079254569667, + "learning_rate": 1.1750167406072743e-06, + "loss": 0.5501, + "step": 7450 + }, + { + "epoch": 0.78, + "grad_norm": 2.4497210304886603, + "learning_rate": 1.1739193750509465e-06, + "loss": 0.5928, + "step": 7451 + }, + { + "epoch": 0.78, + "grad_norm": 2.4756214907759198, + "learning_rate": 1.1728224539976035e-06, + "loss": 0.5829, + "step": 7452 + }, + { + "epoch": 0.78, + "grad_norm": 2.503009767146594, + "learning_rate": 1.1717259775746865e-06, + "loss": 0.5743, + "step": 7453 + }, + { + "epoch": 0.78, + "grad_norm": 2.4617513372135003, + "learning_rate": 1.1706299459095776e-06, + "loss": 0.6251, + "step": 7454 + }, + { + "epoch": 0.78, + "grad_norm": 2.2715662368230274, + "learning_rate": 1.1695343591296115e-06, + "loss": 0.6545, + "step": 7455 + }, + { + "epoch": 0.78, + "grad_norm": 3.2007462619830287, + "learning_rate": 1.1684392173620729e-06, + "loss": 0.5534, + "step": 7456 + }, + { + "epoch": 0.78, + "grad_norm": 0.9869030111837869, + "learning_rate": 1.1673445207341882e-06, + "loss": 0.534, + "step": 7457 + }, + { + "epoch": 0.78, + "grad_norm": 0.9843494999894192, + "learning_rate": 1.1662502693731393e-06, + "loss": 0.5375, + "step": 7458 + }, + { + "epoch": 0.78, + "grad_norm": 2.964320138816884, + "learning_rate": 1.1651564634060509e-06, + "loss": 0.5889, + "step": 7459 + }, + { + "epoch": 0.79, + "grad_norm": 2.3924454080709854, + "learning_rate": 1.1640631029600002e-06, + "loss": 0.5073, + "step": 7460 + }, + { + "epoch": 0.79, + "grad_norm": 2.8236225380832143, + "learning_rate": 1.1629701881620086e-06, + "loss": 0.6282, + "step": 7461 + }, + { + "epoch": 0.79, + "grad_norm": 3.172480547369297, + "learning_rate": 1.1618777191390502e-06, + "loss": 0.5964, + "step": 7462 + }, + { + "epoch": 0.79, + "grad_norm": 2.810127828045121, + "learning_rate": 1.1607856960180413e-06, + "loss": 0.6193, + "step": 7463 + }, + { + "epoch": 0.79, + "grad_norm": 2.6027058379471204, + "learning_rate": 1.1596941189258542e-06, + "loss": 0.5957, + "step": 7464 + }, + { + "epoch": 0.79, + "grad_norm": 2.5799914326081077, + "learning_rate": 1.1586029879893018e-06, + "loss": 0.5961, + "step": 7465 + }, + { + "epoch": 0.79, + "grad_norm": 2.272129728569525, + "learning_rate": 1.1575123033351514e-06, + "loss": 0.574, + "step": 7466 + }, + { + "epoch": 0.79, + "grad_norm": 3.353854643372349, + "learning_rate": 1.1564220650901126e-06, + "loss": 0.548, + "step": 7467 + }, + { + "epoch": 0.79, + "grad_norm": 2.284598981438445, + "learning_rate": 1.1553322733808474e-06, + "loss": 0.6057, + "step": 7468 + }, + { + "epoch": 0.79, + "grad_norm": 2.5638147489723737, + "learning_rate": 1.1542429283339669e-06, + "loss": 0.625, + "step": 7469 + }, + { + "epoch": 0.79, + "grad_norm": 2.859248521334551, + "learning_rate": 1.153154030076024e-06, + "loss": 0.5896, + "step": 7470 + }, + { + "epoch": 0.79, + "grad_norm": 2.1507209034830352, + "learning_rate": 1.1520655787335272e-06, + "loss": 0.5909, + "step": 7471 + }, + { + "epoch": 0.79, + "grad_norm": 2.2804744847150293, + "learning_rate": 1.150977574432927e-06, + "loss": 0.6216, + "step": 7472 + }, + { + "epoch": 0.79, + "grad_norm": 2.2755713818694026, + "learning_rate": 1.1498900173006271e-06, + "loss": 0.5953, + "step": 7473 + }, + { + "epoch": 0.79, + "grad_norm": 2.4375823375991676, + "learning_rate": 1.1488029074629742e-06, + "loss": 0.5324, + "step": 7474 + }, + { + "epoch": 0.79, + "grad_norm": 2.6695360497471374, + "learning_rate": 1.1477162450462681e-06, + "loss": 0.5473, + "step": 7475 + }, + { + "epoch": 0.79, + "grad_norm": 3.055784414516847, + "learning_rate": 1.1466300301767513e-06, + "loss": 0.6207, + "step": 7476 + }, + { + "epoch": 0.79, + "grad_norm": 2.3379616900640507, + "learning_rate": 1.1455442629806208e-06, + "loss": 0.57, + "step": 7477 + }, + { + "epoch": 0.79, + "grad_norm": 2.3050676485829817, + "learning_rate": 1.1444589435840136e-06, + "loss": 0.5941, + "step": 7478 + }, + { + "epoch": 0.79, + "grad_norm": 2.2327610274880056, + "learning_rate": 1.1433740721130227e-06, + "loss": 0.6705, + "step": 7479 + }, + { + "epoch": 0.79, + "grad_norm": 2.318030102730509, + "learning_rate": 1.1422896486936819e-06, + "loss": 0.5924, + "step": 7480 + }, + { + "epoch": 0.79, + "grad_norm": 2.3743209801016847, + "learning_rate": 1.1412056734519788e-06, + "loss": 0.6173, + "step": 7481 + }, + { + "epoch": 0.79, + "grad_norm": 2.34093180724465, + "learning_rate": 1.1401221465138468e-06, + "loss": 0.6462, + "step": 7482 + }, + { + "epoch": 0.79, + "grad_norm": 2.6932639467851116, + "learning_rate": 1.1390390680051649e-06, + "loss": 0.6229, + "step": 7483 + }, + { + "epoch": 0.79, + "grad_norm": 2.346396013718988, + "learning_rate": 1.1379564380517648e-06, + "loss": 0.6471, + "step": 7484 + }, + { + "epoch": 0.79, + "grad_norm": 2.5191557971368876, + "learning_rate": 1.1368742567794199e-06, + "loss": 0.5747, + "step": 7485 + }, + { + "epoch": 0.79, + "grad_norm": 2.187304261864126, + "learning_rate": 1.1357925243138585e-06, + "loss": 0.6561, + "step": 7486 + }, + { + "epoch": 0.79, + "grad_norm": 11.808798126077034, + "learning_rate": 1.1347112407807499e-06, + "loss": 0.61, + "step": 7487 + }, + { + "epoch": 0.79, + "grad_norm": 2.5589240903441346, + "learning_rate": 1.1336304063057169e-06, + "loss": 0.5923, + "step": 7488 + }, + { + "epoch": 0.79, + "grad_norm": 2.2691294777604085, + "learning_rate": 1.1325500210143253e-06, + "loss": 0.6155, + "step": 7489 + }, + { + "epoch": 0.79, + "grad_norm": 2.901817891190817, + "learning_rate": 1.1314700850320948e-06, + "loss": 0.6281, + "step": 7490 + }, + { + "epoch": 0.79, + "grad_norm": 2.711266040481578, + "learning_rate": 1.1303905984844848e-06, + "loss": 0.6484, + "step": 7491 + }, + { + "epoch": 0.79, + "grad_norm": 2.2630100169064113, + "learning_rate": 1.1293115614969109e-06, + "loss": 0.5749, + "step": 7492 + }, + { + "epoch": 0.79, + "grad_norm": 2.207926785870351, + "learning_rate": 1.1282329741947295e-06, + "loss": 0.584, + "step": 7493 + }, + { + "epoch": 0.79, + "grad_norm": 2.3024880371784513, + "learning_rate": 1.1271548367032487e-06, + "loss": 0.6285, + "step": 7494 + }, + { + "epoch": 0.79, + "grad_norm": 2.259781922595136, + "learning_rate": 1.1260771491477252e-06, + "loss": 0.534, + "step": 7495 + }, + { + "epoch": 0.79, + "grad_norm": 2.3229570086330815, + "learning_rate": 1.1249999116533589e-06, + "loss": 0.5503, + "step": 7496 + }, + { + "epoch": 0.79, + "grad_norm": 2.553935192730057, + "learning_rate": 1.1239231243453025e-06, + "loss": 0.664, + "step": 7497 + }, + { + "epoch": 0.79, + "grad_norm": 0.92848847705716, + "learning_rate": 1.122846787348652e-06, + "loss": 0.5382, + "step": 7498 + }, + { + "epoch": 0.79, + "grad_norm": 2.25087708292023, + "learning_rate": 1.1217709007884548e-06, + "loss": 0.6478, + "step": 7499 + }, + { + "epoch": 0.79, + "grad_norm": 2.887135255747007, + "learning_rate": 1.1206954647897023e-06, + "loss": 0.6988, + "step": 7500 + }, + { + "epoch": 0.79, + "grad_norm": 2.881071494956094, + "learning_rate": 1.1196204794773385e-06, + "loss": 0.6754, + "step": 7501 + }, + { + "epoch": 0.79, + "grad_norm": 3.8124395050447877, + "learning_rate": 1.1185459449762486e-06, + "loss": 0.6218, + "step": 7502 + }, + { + "epoch": 0.79, + "grad_norm": 5.5635427523252625, + "learning_rate": 1.1174718614112711e-06, + "loss": 0.6018, + "step": 7503 + }, + { + "epoch": 0.79, + "grad_norm": 2.1176840263880585, + "learning_rate": 1.1163982289071907e-06, + "loss": 0.5724, + "step": 7504 + }, + { + "epoch": 0.79, + "grad_norm": 2.389675591616161, + "learning_rate": 1.1153250475887362e-06, + "loss": 0.658, + "step": 7505 + }, + { + "epoch": 0.79, + "grad_norm": 3.2712660526620176, + "learning_rate": 1.1142523175805896e-06, + "loss": 0.6494, + "step": 7506 + }, + { + "epoch": 0.79, + "grad_norm": 2.3498472808911988, + "learning_rate": 1.113180039007375e-06, + "loss": 0.6334, + "step": 7507 + }, + { + "epoch": 0.79, + "grad_norm": 4.183640363375537, + "learning_rate": 1.112108211993669e-06, + "loss": 0.6067, + "step": 7508 + }, + { + "epoch": 0.79, + "grad_norm": 4.308029778683292, + "learning_rate": 1.1110368366639906e-06, + "loss": 0.6066, + "step": 7509 + }, + { + "epoch": 0.79, + "grad_norm": 8.528977099429406, + "learning_rate": 1.109965913142812e-06, + "loss": 0.6271, + "step": 7510 + }, + { + "epoch": 0.79, + "grad_norm": 2.7928225891739076, + "learning_rate": 1.1088954415545478e-06, + "loss": 0.5326, + "step": 7511 + }, + { + "epoch": 0.79, + "grad_norm": 2.4456570967777926, + "learning_rate": 1.107825422023564e-06, + "loss": 0.6375, + "step": 7512 + }, + { + "epoch": 0.79, + "grad_norm": 3.0183023237081814, + "learning_rate": 1.1067558546741708e-06, + "loss": 0.6086, + "step": 7513 + }, + { + "epoch": 0.79, + "grad_norm": 2.757754038229962, + "learning_rate": 1.1056867396306293e-06, + "loss": 0.5815, + "step": 7514 + }, + { + "epoch": 0.79, + "grad_norm": 2.5086536878121533, + "learning_rate": 1.1046180770171433e-06, + "loss": 0.5206, + "step": 7515 + }, + { + "epoch": 0.79, + "grad_norm": 1.022738679492978, + "learning_rate": 1.1035498669578693e-06, + "loss": 0.5553, + "step": 7516 + }, + { + "epoch": 0.79, + "grad_norm": 2.9716402092398586, + "learning_rate": 1.1024821095769089e-06, + "loss": 0.5598, + "step": 7517 + }, + { + "epoch": 0.79, + "grad_norm": 2.7359639039165096, + "learning_rate": 1.1014148049983097e-06, + "loss": 0.6813, + "step": 7518 + }, + { + "epoch": 0.79, + "grad_norm": 2.3089260079854244, + "learning_rate": 1.1003479533460698e-06, + "loss": 0.5112, + "step": 7519 + }, + { + "epoch": 0.79, + "grad_norm": 3.623336530654308, + "learning_rate": 1.099281554744131e-06, + "loss": 0.6542, + "step": 7520 + }, + { + "epoch": 0.79, + "grad_norm": 2.247425868920514, + "learning_rate": 1.0982156093163864e-06, + "loss": 0.571, + "step": 7521 + }, + { + "epoch": 0.79, + "grad_norm": 4.231031748647357, + "learning_rate": 1.0971501171866717e-06, + "loss": 0.5311, + "step": 7522 + }, + { + "epoch": 0.79, + "grad_norm": 2.718419525290093, + "learning_rate": 1.0960850784787763e-06, + "loss": 0.5205, + "step": 7523 + }, + { + "epoch": 0.79, + "grad_norm": 3.0286666822872843, + "learning_rate": 1.09502049331643e-06, + "loss": 0.611, + "step": 7524 + }, + { + "epoch": 0.79, + "grad_norm": 2.9672246988207207, + "learning_rate": 1.0939563618233156e-06, + "loss": 0.5913, + "step": 7525 + }, + { + "epoch": 0.79, + "grad_norm": 2.6418538967198884, + "learning_rate": 1.0928926841230585e-06, + "loss": 0.59, + "step": 7526 + }, + { + "epoch": 0.79, + "grad_norm": 2.2332248750501242, + "learning_rate": 1.0918294603392371e-06, + "loss": 0.5947, + "step": 7527 + }, + { + "epoch": 0.79, + "grad_norm": 2.5309030658613185, + "learning_rate": 1.0907666905953696e-06, + "loss": 0.5941, + "step": 7528 + }, + { + "epoch": 0.79, + "grad_norm": 2.3540324184287456, + "learning_rate": 1.0897043750149277e-06, + "loss": 0.6044, + "step": 7529 + }, + { + "epoch": 0.79, + "grad_norm": 2.770490477689187, + "learning_rate": 1.0886425137213297e-06, + "loss": 0.594, + "step": 7530 + }, + { + "epoch": 0.79, + "grad_norm": 2.6612418290795157, + "learning_rate": 1.087581106837936e-06, + "loss": 0.5736, + "step": 7531 + }, + { + "epoch": 0.79, + "grad_norm": 2.6742845066681014, + "learning_rate": 1.086520154488061e-06, + "loss": 0.6691, + "step": 7532 + }, + { + "epoch": 0.79, + "grad_norm": 2.9951587604725614, + "learning_rate": 1.0854596567949605e-06, + "loss": 0.5694, + "step": 7533 + }, + { + "epoch": 0.79, + "grad_norm": 2.5789976556897116, + "learning_rate": 1.084399613881843e-06, + "loss": 0.6256, + "step": 7534 + }, + { + "epoch": 0.79, + "grad_norm": 3.7119280648765347, + "learning_rate": 1.0833400258718579e-06, + "loss": 0.5858, + "step": 7535 + }, + { + "epoch": 0.79, + "grad_norm": 3.597761699649087, + "learning_rate": 1.0822808928881078e-06, + "loss": 0.5949, + "step": 7536 + }, + { + "epoch": 0.79, + "grad_norm": 2.7367475679821016, + "learning_rate": 1.0812222150536379e-06, + "loss": 0.4645, + "step": 7537 + }, + { + "epoch": 0.79, + "grad_norm": 2.775373037805519, + "learning_rate": 1.0801639924914437e-06, + "loss": 0.6371, + "step": 7538 + }, + { + "epoch": 0.79, + "grad_norm": 2.673800251379294, + "learning_rate": 1.0791062253244644e-06, + "loss": 0.562, + "step": 7539 + }, + { + "epoch": 0.79, + "grad_norm": 4.733905978311876, + "learning_rate": 1.07804891367559e-06, + "loss": 0.6194, + "step": 7540 + }, + { + "epoch": 0.79, + "grad_norm": 2.189114404251597, + "learning_rate": 1.0769920576676569e-06, + "loss": 0.5761, + "step": 7541 + }, + { + "epoch": 0.79, + "grad_norm": 2.283883510957929, + "learning_rate": 1.0759356574234447e-06, + "loss": 0.6039, + "step": 7542 + }, + { + "epoch": 0.79, + "grad_norm": 3.347107417686752, + "learning_rate": 1.0748797130656862e-06, + "loss": 0.6732, + "step": 7543 + }, + { + "epoch": 0.79, + "grad_norm": 3.3757009779681755, + "learning_rate": 1.0738242247170549e-06, + "loss": 0.5663, + "step": 7544 + }, + { + "epoch": 0.79, + "grad_norm": 2.1237347595673874, + "learning_rate": 1.0727691925001765e-06, + "loss": 0.6165, + "step": 7545 + }, + { + "epoch": 0.79, + "grad_norm": 3.0239588731460882, + "learning_rate": 1.0717146165376202e-06, + "loss": 0.6566, + "step": 7546 + }, + { + "epoch": 0.79, + "grad_norm": 2.560705932085416, + "learning_rate": 1.0706604969519052e-06, + "loss": 0.6015, + "step": 7547 + }, + { + "epoch": 0.79, + "grad_norm": 2.331047433053311, + "learning_rate": 1.069606833865494e-06, + "loss": 0.6184, + "step": 7548 + }, + { + "epoch": 0.79, + "grad_norm": 2.347877312775048, + "learning_rate": 1.0685536274008002e-06, + "loss": 0.5979, + "step": 7549 + }, + { + "epoch": 0.79, + "grad_norm": 2.246678913685663, + "learning_rate": 1.0675008776801804e-06, + "loss": 0.5749, + "step": 7550 + }, + { + "epoch": 0.79, + "grad_norm": 7.0685585261266795, + "learning_rate": 1.066448584825942e-06, + "loss": 0.6043, + "step": 7551 + }, + { + "epoch": 0.79, + "grad_norm": 2.204865595293493, + "learning_rate": 1.065396748960335e-06, + "loss": 0.6199, + "step": 7552 + }, + { + "epoch": 0.79, + "grad_norm": 2.0913674075484185, + "learning_rate": 1.06434537020556e-06, + "loss": 0.5869, + "step": 7553 + }, + { + "epoch": 0.79, + "grad_norm": 2.7478603902570047, + "learning_rate": 1.0632944486837642e-06, + "loss": 0.5817, + "step": 7554 + }, + { + "epoch": 0.8, + "grad_norm": 2.327839999993541, + "learning_rate": 1.0622439845170385e-06, + "loss": 0.6111, + "step": 7555 + }, + { + "epoch": 0.8, + "grad_norm": 3.0045359616694594, + "learning_rate": 1.061193977827425e-06, + "loss": 0.609, + "step": 7556 + }, + { + "epoch": 0.8, + "grad_norm": 2.062618698945312, + "learning_rate": 1.0601444287369073e-06, + "loss": 0.6086, + "step": 7557 + }, + { + "epoch": 0.8, + "grad_norm": 2.4008049526313604, + "learning_rate": 1.0590953373674229e-06, + "loss": 0.6595, + "step": 7558 + }, + { + "epoch": 0.8, + "grad_norm": 4.1811579275109, + "learning_rate": 1.0580467038408487e-06, + "loss": 0.5707, + "step": 7559 + }, + { + "epoch": 0.8, + "grad_norm": 2.3155227686288713, + "learning_rate": 1.0569985282790145e-06, + "loss": 0.6299, + "step": 7560 + }, + { + "epoch": 0.8, + "grad_norm": 3.5347215291469682, + "learning_rate": 1.0559508108036926e-06, + "loss": 0.5662, + "step": 7561 + }, + { + "epoch": 0.8, + "grad_norm": 2.007580421779567, + "learning_rate": 1.0549035515366052e-06, + "loss": 0.5217, + "step": 7562 + }, + { + "epoch": 0.8, + "grad_norm": 3.308048869247993, + "learning_rate": 1.0538567505994175e-06, + "loss": 0.6871, + "step": 7563 + }, + { + "epoch": 0.8, + "grad_norm": 2.425142794535516, + "learning_rate": 1.052810408113746e-06, + "loss": 0.643, + "step": 7564 + }, + { + "epoch": 0.8, + "grad_norm": 5.639780916922588, + "learning_rate": 1.051764524201152e-06, + "loss": 0.5547, + "step": 7565 + }, + { + "epoch": 0.8, + "grad_norm": 2.474431995538235, + "learning_rate": 1.0507190989831412e-06, + "loss": 0.632, + "step": 7566 + }, + { + "epoch": 0.8, + "grad_norm": 3.2149333641540214, + "learning_rate": 1.0496741325811705e-06, + "loss": 0.6219, + "step": 7567 + }, + { + "epoch": 0.8, + "grad_norm": 2.0743039032781865, + "learning_rate": 1.0486296251166383e-06, + "loss": 0.5175, + "step": 7568 + }, + { + "epoch": 0.8, + "grad_norm": 2.633639411660315, + "learning_rate": 1.0475855767108956e-06, + "loss": 0.6497, + "step": 7569 + }, + { + "epoch": 0.8, + "grad_norm": 2.457721991255091, + "learning_rate": 1.0465419874852338e-06, + "loss": 0.5951, + "step": 7570 + }, + { + "epoch": 0.8, + "grad_norm": 2.239650791365524, + "learning_rate": 1.0454988575608976e-06, + "loss": 0.5779, + "step": 7571 + }, + { + "epoch": 0.8, + "grad_norm": 3.0137305763833675, + "learning_rate": 1.0444561870590707e-06, + "loss": 0.6572, + "step": 7572 + }, + { + "epoch": 0.8, + "grad_norm": 2.832898876677455, + "learning_rate": 1.0434139761008915e-06, + "loss": 0.592, + "step": 7573 + }, + { + "epoch": 0.8, + "grad_norm": 2.8742909520681805, + "learning_rate": 1.042372224807438e-06, + "loss": 0.6899, + "step": 7574 + }, + { + "epoch": 0.8, + "grad_norm": 2.538021201102594, + "learning_rate": 1.0413309332997385e-06, + "loss": 0.533, + "step": 7575 + }, + { + "epoch": 0.8, + "grad_norm": 2.9699203259048996, + "learning_rate": 1.0402901016987694e-06, + "loss": 0.6227, + "step": 7576 + }, + { + "epoch": 0.8, + "grad_norm": 2.0629324280519654, + "learning_rate": 1.0392497301254489e-06, + "loss": 0.5473, + "step": 7577 + }, + { + "epoch": 0.8, + "grad_norm": 2.150602467486783, + "learning_rate": 1.0382098187006463e-06, + "loss": 0.6386, + "step": 7578 + }, + { + "epoch": 0.8, + "grad_norm": 1.012171485780843, + "learning_rate": 1.0371703675451732e-06, + "loss": 0.5258, + "step": 7579 + }, + { + "epoch": 0.8, + "grad_norm": 2.144392113247383, + "learning_rate": 1.0361313767797932e-06, + "loss": 0.5706, + "step": 7580 + }, + { + "epoch": 0.8, + "grad_norm": 2.4348220328637216, + "learning_rate": 1.0350928465252103e-06, + "loss": 0.6039, + "step": 7581 + }, + { + "epoch": 0.8, + "grad_norm": 2.6562323332446294, + "learning_rate": 1.0340547769020798e-06, + "loss": 0.5633, + "step": 7582 + }, + { + "epoch": 0.8, + "grad_norm": 0.9865343111730549, + "learning_rate": 1.0330171680309996e-06, + "loss": 0.5352, + "step": 7583 + }, + { + "epoch": 0.8, + "grad_norm": 3.3099068356431776, + "learning_rate": 1.0319800200325193e-06, + "loss": 0.5597, + "step": 7584 + }, + { + "epoch": 0.8, + "grad_norm": 2.519735515375185, + "learning_rate": 1.0309433330271285e-06, + "loss": 0.5921, + "step": 7585 + }, + { + "epoch": 0.8, + "grad_norm": 2.187601261956032, + "learning_rate": 1.029907107135269e-06, + "loss": 0.5567, + "step": 7586 + }, + { + "epoch": 0.8, + "grad_norm": 2.348573049735444, + "learning_rate": 1.0288713424773238e-06, + "loss": 0.6187, + "step": 7587 + }, + { + "epoch": 0.8, + "grad_norm": 2.701801842597667, + "learning_rate": 1.027836039173627e-06, + "loss": 0.5421, + "step": 7588 + }, + { + "epoch": 0.8, + "grad_norm": 2.717130793307138, + "learning_rate": 1.026801197344458e-06, + "loss": 0.6378, + "step": 7589 + }, + { + "epoch": 0.8, + "grad_norm": 2.3303124612250543, + "learning_rate": 1.0257668171100393e-06, + "loss": 0.6418, + "step": 7590 + }, + { + "epoch": 0.8, + "grad_norm": 2.7662913464027623, + "learning_rate": 1.0247328985905446e-06, + "loss": 0.5441, + "step": 7591 + }, + { + "epoch": 0.8, + "grad_norm": 2.544677485406088, + "learning_rate": 1.0236994419060892e-06, + "loss": 0.568, + "step": 7592 + }, + { + "epoch": 0.8, + "grad_norm": 2.899566327372865, + "learning_rate": 1.0226664471767401e-06, + "loss": 0.6502, + "step": 7593 + }, + { + "epoch": 0.8, + "grad_norm": 2.506201535469169, + "learning_rate": 1.021633914522504e-06, + "loss": 0.6079, + "step": 7594 + }, + { + "epoch": 0.8, + "grad_norm": 2.518907467485238, + "learning_rate": 1.0206018440633408e-06, + "loss": 0.5106, + "step": 7595 + }, + { + "epoch": 0.8, + "grad_norm": 2.3943093369268227, + "learning_rate": 1.0195702359191507e-06, + "loss": 0.579, + "step": 7596 + }, + { + "epoch": 0.8, + "grad_norm": 2.4145954273830927, + "learning_rate": 1.0185390902097857e-06, + "loss": 0.6422, + "step": 7597 + }, + { + "epoch": 0.8, + "grad_norm": 1.921201826914954, + "learning_rate": 1.017508407055039e-06, + "loss": 0.6023, + "step": 7598 + }, + { + "epoch": 0.8, + "grad_norm": 2.937076600981269, + "learning_rate": 1.0164781865746542e-06, + "loss": 0.5655, + "step": 7599 + }, + { + "epoch": 0.8, + "grad_norm": 2.760440801599633, + "learning_rate": 1.0154484288883177e-06, + "loss": 0.6897, + "step": 7600 + }, + { + "epoch": 0.8, + "grad_norm": 2.4696937629349898, + "learning_rate": 1.014419134115664e-06, + "loss": 0.6095, + "step": 7601 + }, + { + "epoch": 0.8, + "grad_norm": 4.811722355164242, + "learning_rate": 1.0133903023762758e-06, + "loss": 0.6984, + "step": 7602 + }, + { + "epoch": 0.8, + "grad_norm": 4.115947584954783, + "learning_rate": 1.0123619337896767e-06, + "loss": 0.5696, + "step": 7603 + }, + { + "epoch": 0.8, + "grad_norm": 2.564549686867802, + "learning_rate": 1.0113340284753425e-06, + "loss": 0.655, + "step": 7604 + }, + { + "epoch": 0.8, + "grad_norm": 12.618329274026946, + "learning_rate": 1.0103065865526895e-06, + "loss": 0.5778, + "step": 7605 + }, + { + "epoch": 0.8, + "grad_norm": 2.8969821170985206, + "learning_rate": 1.0092796081410856e-06, + "loss": 0.5542, + "step": 7606 + }, + { + "epoch": 0.8, + "grad_norm": 2.5199683999249087, + "learning_rate": 1.0082530933598388e-06, + "loss": 0.6197, + "step": 7607 + }, + { + "epoch": 0.8, + "grad_norm": 3.435215141719612, + "learning_rate": 1.0072270423282104e-06, + "loss": 0.5886, + "step": 7608 + }, + { + "epoch": 0.8, + "grad_norm": 2.1869076971904104, + "learning_rate": 1.0062014551654015e-06, + "loss": 0.5902, + "step": 7609 + }, + { + "epoch": 0.8, + "grad_norm": 2.5157709264478054, + "learning_rate": 1.0051763319905622e-06, + "loss": 0.5485, + "step": 7610 + }, + { + "epoch": 0.8, + "grad_norm": 2.3517546543055747, + "learning_rate": 1.0041516729227902e-06, + "loss": 0.5734, + "step": 7611 + }, + { + "epoch": 0.8, + "grad_norm": 2.447965128681716, + "learning_rate": 1.0031274780811245e-06, + "loss": 0.6666, + "step": 7612 + }, + { + "epoch": 0.8, + "grad_norm": 2.976761782472005, + "learning_rate": 1.0021037475845557e-06, + "loss": 0.5641, + "step": 7613 + }, + { + "epoch": 0.8, + "grad_norm": 2.698568711787718, + "learning_rate": 1.0010804815520159e-06, + "loss": 0.6378, + "step": 7614 + }, + { + "epoch": 0.8, + "grad_norm": 3.3363776215524092, + "learning_rate": 1.0000576801023876e-06, + "loss": 0.6686, + "step": 7615 + }, + { + "epoch": 0.8, + "grad_norm": 3.7415611338554813, + "learning_rate": 9.990353433544935e-07, + "loss": 0.6204, + "step": 7616 + }, + { + "epoch": 0.8, + "grad_norm": 3.136268007867056, + "learning_rate": 9.980134714271088e-07, + "loss": 0.7083, + "step": 7617 + }, + { + "epoch": 0.8, + "grad_norm": 2.186125465023069, + "learning_rate": 9.969920644389498e-07, + "loss": 0.583, + "step": 7618 + }, + { + "epoch": 0.8, + "grad_norm": 2.6754021896396454, + "learning_rate": 9.959711225086822e-07, + "loss": 0.6414, + "step": 7619 + }, + { + "epoch": 0.8, + "grad_norm": 3.3008931024230703, + "learning_rate": 9.949506457549135e-07, + "loss": 0.5452, + "step": 7620 + }, + { + "epoch": 0.8, + "grad_norm": 2.6881102400726142, + "learning_rate": 9.93930634296203e-07, + "loss": 0.5804, + "step": 7621 + }, + { + "epoch": 0.8, + "grad_norm": 2.510590511858801, + "learning_rate": 9.929110882510496e-07, + "loss": 0.5921, + "step": 7622 + }, + { + "epoch": 0.8, + "grad_norm": 2.218316814824091, + "learning_rate": 9.91892007737903e-07, + "loss": 0.6482, + "step": 7623 + }, + { + "epoch": 0.8, + "grad_norm": 2.3200140091226107, + "learning_rate": 9.908733928751574e-07, + "loss": 0.5587, + "step": 7624 + }, + { + "epoch": 0.8, + "grad_norm": 2.7818896360301317, + "learning_rate": 9.89855243781151e-07, + "loss": 0.6569, + "step": 7625 + }, + { + "epoch": 0.8, + "grad_norm": 2.972728123297002, + "learning_rate": 9.888375605741713e-07, + "loss": 0.587, + "step": 7626 + }, + { + "epoch": 0.8, + "grad_norm": 2.467181443995698, + "learning_rate": 9.87820343372447e-07, + "loss": 0.6413, + "step": 7627 + }, + { + "epoch": 0.8, + "grad_norm": 2.771873420545783, + "learning_rate": 9.868035922941594e-07, + "loss": 0.6314, + "step": 7628 + }, + { + "epoch": 0.8, + "grad_norm": 8.073309761721186, + "learning_rate": 9.857873074574276e-07, + "loss": 0.6316, + "step": 7629 + }, + { + "epoch": 0.8, + "grad_norm": 2.3110538134158447, + "learning_rate": 9.847714889803233e-07, + "loss": 0.6234, + "step": 7630 + }, + { + "epoch": 0.8, + "grad_norm": 2.3475214347720086, + "learning_rate": 9.8375613698086e-07, + "loss": 0.5873, + "step": 7631 + }, + { + "epoch": 0.8, + "grad_norm": 3.338962974057601, + "learning_rate": 9.827412515770003e-07, + "loss": 0.6285, + "step": 7632 + }, + { + "epoch": 0.8, + "grad_norm": 2.4511140443396053, + "learning_rate": 9.817268328866474e-07, + "loss": 0.5206, + "step": 7633 + }, + { + "epoch": 0.8, + "grad_norm": 2.6141500186568076, + "learning_rate": 9.807128810276573e-07, + "loss": 0.6173, + "step": 7634 + }, + { + "epoch": 0.8, + "grad_norm": 2.6588577517347343, + "learning_rate": 9.796993961178247e-07, + "loss": 0.5779, + "step": 7635 + }, + { + "epoch": 0.8, + "grad_norm": 2.2544484813883354, + "learning_rate": 9.786863782748946e-07, + "loss": 0.6226, + "step": 7636 + }, + { + "epoch": 0.8, + "grad_norm": 2.7154960137258373, + "learning_rate": 9.776738276165576e-07, + "loss": 0.5301, + "step": 7637 + }, + { + "epoch": 0.8, + "grad_norm": 2.8331620353922093, + "learning_rate": 9.76661744260447e-07, + "loss": 0.6592, + "step": 7638 + }, + { + "epoch": 0.8, + "grad_norm": 2.8596209122623604, + "learning_rate": 9.75650128324146e-07, + "loss": 0.6592, + "step": 7639 + }, + { + "epoch": 0.8, + "grad_norm": 0.9617449759052116, + "learning_rate": 9.746389799251783e-07, + "loss": 0.5056, + "step": 7640 + }, + { + "epoch": 0.8, + "grad_norm": 2.6375463130140555, + "learning_rate": 9.736282991810191e-07, + "loss": 0.649, + "step": 7641 + }, + { + "epoch": 0.8, + "grad_norm": 3.170328479172141, + "learning_rate": 9.72618086209084e-07, + "loss": 0.6085, + "step": 7642 + }, + { + "epoch": 0.8, + "grad_norm": 2.0826307503241073, + "learning_rate": 9.71608341126739e-07, + "loss": 0.5132, + "step": 7643 + }, + { + "epoch": 0.8, + "grad_norm": 5.426019291056326, + "learning_rate": 9.705990640512909e-07, + "loss": 0.5804, + "step": 7644 + }, + { + "epoch": 0.8, + "grad_norm": 2.8657766408804495, + "learning_rate": 9.695902550999953e-07, + "loss": 0.6634, + "step": 7645 + }, + { + "epoch": 0.8, + "grad_norm": 2.1192272871905335, + "learning_rate": 9.685819143900544e-07, + "loss": 0.575, + "step": 7646 + }, + { + "epoch": 0.8, + "grad_norm": 2.9730510980964873, + "learning_rate": 9.675740420386132e-07, + "loss": 0.6602, + "step": 7647 + }, + { + "epoch": 0.8, + "grad_norm": 2.8204604472554924, + "learning_rate": 9.66566638162762e-07, + "loss": 0.6495, + "step": 7648 + }, + { + "epoch": 0.8, + "grad_norm": 2.539741388640035, + "learning_rate": 9.655597028795394e-07, + "loss": 0.6173, + "step": 7649 + }, + { + "epoch": 0.81, + "grad_norm": 2.5656949751155484, + "learning_rate": 9.64553236305929e-07, + "loss": 0.6095, + "step": 7650 + }, + { + "epoch": 0.81, + "grad_norm": 2.450376476628125, + "learning_rate": 9.635472385588573e-07, + "loss": 0.5739, + "step": 7651 + }, + { + "epoch": 0.81, + "grad_norm": 2.6482970996607156, + "learning_rate": 9.625417097552003e-07, + "loss": 0.6081, + "step": 7652 + }, + { + "epoch": 0.81, + "grad_norm": 2.6667414141134174, + "learning_rate": 9.615366500117757e-07, + "loss": 0.6578, + "step": 7653 + }, + { + "epoch": 0.81, + "grad_norm": 3.74349389755696, + "learning_rate": 9.6053205944535e-07, + "loss": 0.5548, + "step": 7654 + }, + { + "epoch": 0.81, + "grad_norm": 3.010896829422771, + "learning_rate": 9.595279381726308e-07, + "loss": 0.5475, + "step": 7655 + }, + { + "epoch": 0.81, + "grad_norm": 2.4405802174170725, + "learning_rate": 9.58524286310278e-07, + "loss": 0.6354, + "step": 7656 + }, + { + "epoch": 0.81, + "grad_norm": 4.919810726968023, + "learning_rate": 9.575211039748893e-07, + "loss": 0.5448, + "step": 7657 + }, + { + "epoch": 0.81, + "grad_norm": 2.363605925097881, + "learning_rate": 9.565183912830134e-07, + "loss": 0.6191, + "step": 7658 + }, + { + "epoch": 0.81, + "grad_norm": 2.4814246714387385, + "learning_rate": 9.555161483511434e-07, + "loss": 0.6056, + "step": 7659 + }, + { + "epoch": 0.81, + "grad_norm": 2.3442144571403394, + "learning_rate": 9.545143752957143e-07, + "loss": 0.5799, + "step": 7660 + }, + { + "epoch": 0.81, + "grad_norm": 0.9844963618684487, + "learning_rate": 9.535130722331121e-07, + "loss": 0.5501, + "step": 7661 + }, + { + "epoch": 0.81, + "grad_norm": 3.841148309195625, + "learning_rate": 9.525122392796632e-07, + "loss": 0.5798, + "step": 7662 + }, + { + "epoch": 0.81, + "grad_norm": 2.6946034136889114, + "learning_rate": 9.515118765516429e-07, + "loss": 0.624, + "step": 7663 + }, + { + "epoch": 0.81, + "grad_norm": 2.614312745110826, + "learning_rate": 9.505119841652688e-07, + "loss": 0.6012, + "step": 7664 + }, + { + "epoch": 0.81, + "grad_norm": 2.2341716999178733, + "learning_rate": 9.495125622367079e-07, + "loss": 0.699, + "step": 7665 + }, + { + "epoch": 0.81, + "grad_norm": 2.454492464172891, + "learning_rate": 9.485136108820675e-07, + "loss": 0.5752, + "step": 7666 + }, + { + "epoch": 0.81, + "grad_norm": 2.7299044786206266, + "learning_rate": 9.475151302174052e-07, + "loss": 0.6267, + "step": 7667 + }, + { + "epoch": 0.81, + "grad_norm": 4.46712121179473, + "learning_rate": 9.465171203587192e-07, + "loss": 0.69, + "step": 7668 + }, + { + "epoch": 0.81, + "grad_norm": 2.8581757438763984, + "learning_rate": 9.455195814219581e-07, + "loss": 0.6329, + "step": 7669 + }, + { + "epoch": 0.81, + "grad_norm": 2.7749337082880636, + "learning_rate": 9.445225135230102e-07, + "loss": 0.5863, + "step": 7670 + }, + { + "epoch": 0.81, + "grad_norm": 2.9200575440191554, + "learning_rate": 9.435259167777139e-07, + "loss": 0.6594, + "step": 7671 + }, + { + "epoch": 0.81, + "grad_norm": 3.1961551877877343, + "learning_rate": 9.425297913018517e-07, + "loss": 0.6262, + "step": 7672 + }, + { + "epoch": 0.81, + "grad_norm": 5.621987087951769, + "learning_rate": 9.41534137211148e-07, + "loss": 0.6806, + "step": 7673 + }, + { + "epoch": 0.81, + "grad_norm": 2.37550460624206, + "learning_rate": 9.405389546212779e-07, + "loss": 0.4877, + "step": 7674 + }, + { + "epoch": 0.81, + "grad_norm": 2.1856374832479073, + "learning_rate": 9.395442436478558e-07, + "loss": 0.6366, + "step": 7675 + }, + { + "epoch": 0.81, + "grad_norm": 2.3120929072875986, + "learning_rate": 9.385500044064472e-07, + "loss": 0.5728, + "step": 7676 + }, + { + "epoch": 0.81, + "grad_norm": 11.267589648078419, + "learning_rate": 9.375562370125574e-07, + "loss": 0.6151, + "step": 7677 + }, + { + "epoch": 0.81, + "grad_norm": 3.3911562826517248, + "learning_rate": 9.365629415816418e-07, + "loss": 0.5745, + "step": 7678 + }, + { + "epoch": 0.81, + "grad_norm": 2.0931728619715138, + "learning_rate": 9.355701182290961e-07, + "loss": 0.5957, + "step": 7679 + }, + { + "epoch": 0.81, + "grad_norm": 2.4056655342238416, + "learning_rate": 9.345777670702649e-07, + "loss": 0.6034, + "step": 7680 + }, + { + "epoch": 0.81, + "grad_norm": 3.31101973582715, + "learning_rate": 9.335858882204385e-07, + "loss": 0.5717, + "step": 7681 + }, + { + "epoch": 0.81, + "grad_norm": 3.2633888627628336, + "learning_rate": 9.325944817948485e-07, + "loss": 0.6403, + "step": 7682 + }, + { + "epoch": 0.81, + "grad_norm": 2.507966888011093, + "learning_rate": 9.316035479086727e-07, + "loss": 0.6023, + "step": 7683 + }, + { + "epoch": 0.81, + "grad_norm": 2.6707445301357104, + "learning_rate": 9.306130866770364e-07, + "loss": 0.694, + "step": 7684 + }, + { + "epoch": 0.81, + "grad_norm": 2.3698584587195777, + "learning_rate": 9.296230982150095e-07, + "loss": 0.499, + "step": 7685 + }, + { + "epoch": 0.81, + "grad_norm": 2.646352948765854, + "learning_rate": 9.28633582637603e-07, + "loss": 0.5998, + "step": 7686 + }, + { + "epoch": 0.81, + "grad_norm": 2.3308088558522724, + "learning_rate": 9.276445400597795e-07, + "loss": 0.6442, + "step": 7687 + }, + { + "epoch": 0.81, + "grad_norm": 2.7184765380374314, + "learning_rate": 9.266559705964401e-07, + "loss": 0.7093, + "step": 7688 + }, + { + "epoch": 0.81, + "grad_norm": 2.623044957668708, + "learning_rate": 9.256678743624364e-07, + "loss": 0.6417, + "step": 7689 + }, + { + "epoch": 0.81, + "grad_norm": 5.079400937660165, + "learning_rate": 9.246802514725601e-07, + "loss": 0.6116, + "step": 7690 + }, + { + "epoch": 0.81, + "grad_norm": 2.2731708710443503, + "learning_rate": 9.236931020415529e-07, + "loss": 0.5756, + "step": 7691 + }, + { + "epoch": 0.81, + "grad_norm": 3.3136227736096924, + "learning_rate": 9.227064261840962e-07, + "loss": 0.6093, + "step": 7692 + }, + { + "epoch": 0.81, + "grad_norm": 2.1475285005337974, + "learning_rate": 9.21720224014821e-07, + "loss": 0.6814, + "step": 7693 + }, + { + "epoch": 0.81, + "grad_norm": 2.9968897146706133, + "learning_rate": 9.207344956483022e-07, + "loss": 0.6033, + "step": 7694 + }, + { + "epoch": 0.81, + "grad_norm": 2.6350679079149866, + "learning_rate": 9.197492411990571e-07, + "loss": 0.626, + "step": 7695 + }, + { + "epoch": 0.81, + "grad_norm": 2.72376089543527, + "learning_rate": 9.187644607815499e-07, + "loss": 0.6423, + "step": 7696 + }, + { + "epoch": 0.81, + "grad_norm": 2.8530684492710217, + "learning_rate": 9.1778015451019e-07, + "loss": 0.6458, + "step": 7697 + }, + { + "epoch": 0.81, + "grad_norm": 2.0539321212137107, + "learning_rate": 9.16796322499332e-07, + "loss": 0.6108, + "step": 7698 + }, + { + "epoch": 0.81, + "grad_norm": 2.7055060163420066, + "learning_rate": 9.15812964863273e-07, + "loss": 0.5557, + "step": 7699 + }, + { + "epoch": 0.81, + "grad_norm": 2.334028049190322, + "learning_rate": 9.148300817162587e-07, + "loss": 0.5686, + "step": 7700 + }, + { + "epoch": 0.81, + "grad_norm": 0.8483172248492421, + "learning_rate": 9.138476731724749e-07, + "loss": 0.5497, + "step": 7701 + }, + { + "epoch": 0.81, + "grad_norm": 2.586655116930661, + "learning_rate": 9.128657393460583e-07, + "loss": 0.5979, + "step": 7702 + }, + { + "epoch": 0.81, + "grad_norm": 3.314047969656968, + "learning_rate": 9.118842803510841e-07, + "loss": 0.6292, + "step": 7703 + }, + { + "epoch": 0.81, + "grad_norm": 2.410214810373217, + "learning_rate": 9.10903296301578e-07, + "loss": 0.6381, + "step": 7704 + }, + { + "epoch": 0.81, + "grad_norm": 3.104473643611443, + "learning_rate": 9.099227873115047e-07, + "loss": 0.6241, + "step": 7705 + }, + { + "epoch": 0.81, + "grad_norm": 3.4899505415847676, + "learning_rate": 9.089427534947792e-07, + "loss": 0.6409, + "step": 7706 + }, + { + "epoch": 0.81, + "grad_norm": 2.1792155269408617, + "learning_rate": 9.0796319496526e-07, + "loss": 0.6243, + "step": 7707 + }, + { + "epoch": 0.81, + "grad_norm": 2.648707594215365, + "learning_rate": 9.069841118367462e-07, + "loss": 0.6185, + "step": 7708 + }, + { + "epoch": 0.81, + "grad_norm": 2.801759382084992, + "learning_rate": 9.060055042229881e-07, + "loss": 0.6844, + "step": 7709 + }, + { + "epoch": 0.81, + "grad_norm": 2.8450393356650685, + "learning_rate": 9.050273722376746e-07, + "loss": 0.535, + "step": 7710 + }, + { + "epoch": 0.81, + "grad_norm": 3.237580480646928, + "learning_rate": 9.04049715994445e-07, + "loss": 0.6118, + "step": 7711 + }, + { + "epoch": 0.81, + "grad_norm": 4.029884410505219, + "learning_rate": 9.030725356068781e-07, + "loss": 0.642, + "step": 7712 + }, + { + "epoch": 0.81, + "grad_norm": 2.5945698031312086, + "learning_rate": 9.020958311885019e-07, + "loss": 0.6349, + "step": 7713 + }, + { + "epoch": 0.81, + "grad_norm": 2.455581536260283, + "learning_rate": 9.011196028527853e-07, + "loss": 0.6013, + "step": 7714 + }, + { + "epoch": 0.81, + "grad_norm": 0.9942779258867724, + "learning_rate": 9.001438507131444e-07, + "loss": 0.527, + "step": 7715 + }, + { + "epoch": 0.81, + "grad_norm": 2.700359868586655, + "learning_rate": 8.991685748829404e-07, + "loss": 0.6477, + "step": 7716 + }, + { + "epoch": 0.81, + "grad_norm": 2.104403745631631, + "learning_rate": 8.981937754754777e-07, + "loss": 0.6163, + "step": 7717 + }, + { + "epoch": 0.81, + "grad_norm": 2.5972794158820482, + "learning_rate": 8.972194526040034e-07, + "loss": 0.5271, + "step": 7718 + }, + { + "epoch": 0.81, + "grad_norm": 2.4719633270189583, + "learning_rate": 8.962456063817132e-07, + "loss": 0.6623, + "step": 7719 + }, + { + "epoch": 0.81, + "grad_norm": 0.9667484827218678, + "learning_rate": 8.95272236921747e-07, + "loss": 0.5675, + "step": 7720 + }, + { + "epoch": 0.81, + "grad_norm": 2.425564331369107, + "learning_rate": 8.942993443371856e-07, + "loss": 0.609, + "step": 7721 + }, + { + "epoch": 0.81, + "grad_norm": 2.947734739772372, + "learning_rate": 8.93326928741059e-07, + "loss": 0.6371, + "step": 7722 + }, + { + "epoch": 0.81, + "grad_norm": 2.9080706900479045, + "learning_rate": 8.923549902463374e-07, + "loss": 0.6628, + "step": 7723 + }, + { + "epoch": 0.81, + "grad_norm": 3.8252938892041635, + "learning_rate": 8.913835289659406e-07, + "loss": 0.5489, + "step": 7724 + }, + { + "epoch": 0.81, + "grad_norm": 2.5458974669966805, + "learning_rate": 8.904125450127272e-07, + "loss": 0.6616, + "step": 7725 + }, + { + "epoch": 0.81, + "grad_norm": 2.184591112903663, + "learning_rate": 8.894420384995056e-07, + "loss": 0.6075, + "step": 7726 + }, + { + "epoch": 0.81, + "grad_norm": 2.9011418508765647, + "learning_rate": 8.884720095390248e-07, + "loss": 0.6888, + "step": 7727 + }, + { + "epoch": 0.81, + "grad_norm": 2.454534274847728, + "learning_rate": 8.875024582439801e-07, + "loss": 0.6525, + "step": 7728 + }, + { + "epoch": 0.81, + "grad_norm": 2.3930270904791335, + "learning_rate": 8.865333847270135e-07, + "loss": 0.6685, + "step": 7729 + }, + { + "epoch": 0.81, + "grad_norm": 2.5600293308303783, + "learning_rate": 8.855647891007075e-07, + "loss": 0.5978, + "step": 7730 + }, + { + "epoch": 0.81, + "grad_norm": 2.437056971277586, + "learning_rate": 8.845966714775894e-07, + "loss": 0.6665, + "step": 7731 + }, + { + "epoch": 0.81, + "grad_norm": 21.97578049995487, + "learning_rate": 8.836290319701335e-07, + "loss": 0.5903, + "step": 7732 + }, + { + "epoch": 0.81, + "grad_norm": 2.878994513782418, + "learning_rate": 8.826618706907585e-07, + "loss": 0.6547, + "step": 7733 + }, + { + "epoch": 0.81, + "grad_norm": 2.6089834334746893, + "learning_rate": 8.816951877518243e-07, + "loss": 0.6462, + "step": 7734 + }, + { + "epoch": 0.81, + "grad_norm": 3.5891885718164533, + "learning_rate": 8.807289832656396e-07, + "loss": 0.6704, + "step": 7735 + }, + { + "epoch": 0.81, + "grad_norm": 2.53450443321136, + "learning_rate": 8.797632573444526e-07, + "loss": 0.5234, + "step": 7736 + }, + { + "epoch": 0.81, + "grad_norm": 2.4367976647015843, + "learning_rate": 8.787980101004612e-07, + "loss": 0.5515, + "step": 7737 + }, + { + "epoch": 0.81, + "grad_norm": 3.393038737753617, + "learning_rate": 8.77833241645803e-07, + "loss": 0.6213, + "step": 7738 + }, + { + "epoch": 0.81, + "grad_norm": 2.942620580800737, + "learning_rate": 8.768689520925638e-07, + "loss": 0.5855, + "step": 7739 + }, + { + "epoch": 0.81, + "grad_norm": 2.2895413377993816, + "learning_rate": 8.759051415527697e-07, + "loss": 0.6244, + "step": 7740 + }, + { + "epoch": 0.81, + "grad_norm": 3.1158010305913275, + "learning_rate": 8.749418101383944e-07, + "loss": 0.5865, + "step": 7741 + }, + { + "epoch": 0.81, + "grad_norm": 2.6974129522038877, + "learning_rate": 8.739789579613572e-07, + "loss": 0.5408, + "step": 7742 + }, + { + "epoch": 0.81, + "grad_norm": 2.895780450687752, + "learning_rate": 8.73016585133517e-07, + "loss": 0.6004, + "step": 7743 + }, + { + "epoch": 0.81, + "grad_norm": 3.0182236513579874, + "learning_rate": 8.720546917666789e-07, + "loss": 0.6118, + "step": 7744 + }, + { + "epoch": 0.82, + "grad_norm": 2.370869512524687, + "learning_rate": 8.710932779725939e-07, + "loss": 0.6114, + "step": 7745 + }, + { + "epoch": 0.82, + "grad_norm": 2.3023012937961846, + "learning_rate": 8.701323438629577e-07, + "loss": 0.5891, + "step": 7746 + }, + { + "epoch": 0.82, + "grad_norm": 2.7894697251608, + "learning_rate": 8.691718895494067e-07, + "loss": 0.5835, + "step": 7747 + }, + { + "epoch": 0.82, + "grad_norm": 2.2253869804344077, + "learning_rate": 8.682119151435258e-07, + "loss": 0.657, + "step": 7748 + }, + { + "epoch": 0.82, + "grad_norm": 2.1968761019330536, + "learning_rate": 8.672524207568389e-07, + "loss": 0.5731, + "step": 7749 + }, + { + "epoch": 0.82, + "grad_norm": 2.0451951033529774, + "learning_rate": 8.6629340650082e-07, + "loss": 0.6249, + "step": 7750 + }, + { + "epoch": 0.82, + "grad_norm": 5.290396646074176, + "learning_rate": 8.653348724868843e-07, + "loss": 0.6095, + "step": 7751 + }, + { + "epoch": 0.82, + "grad_norm": 2.4998002085053828, + "learning_rate": 8.643768188263918e-07, + "loss": 0.5901, + "step": 7752 + }, + { + "epoch": 0.82, + "grad_norm": 2.927706275807229, + "learning_rate": 8.63419245630644e-07, + "loss": 0.6667, + "step": 7753 + }, + { + "epoch": 0.82, + "grad_norm": 2.514050334470586, + "learning_rate": 8.624621530108901e-07, + "loss": 0.5498, + "step": 7754 + }, + { + "epoch": 0.82, + "grad_norm": 2.00269176738163, + "learning_rate": 8.615055410783246e-07, + "loss": 0.5119, + "step": 7755 + }, + { + "epoch": 0.82, + "grad_norm": 2.2181227482346895, + "learning_rate": 8.605494099440808e-07, + "loss": 0.6086, + "step": 7756 + }, + { + "epoch": 0.82, + "grad_norm": 0.9630978359237016, + "learning_rate": 8.595937597192422e-07, + "loss": 0.5444, + "step": 7757 + }, + { + "epoch": 0.82, + "grad_norm": 4.040120236257973, + "learning_rate": 8.586385905148304e-07, + "loss": 0.5648, + "step": 7758 + }, + { + "epoch": 0.82, + "grad_norm": 2.7334403291410077, + "learning_rate": 8.576839024418165e-07, + "loss": 0.6385, + "step": 7759 + }, + { + "epoch": 0.82, + "grad_norm": 2.2473246156394096, + "learning_rate": 8.567296956111121e-07, + "loss": 0.5395, + "step": 7760 + }, + { + "epoch": 0.82, + "grad_norm": 2.5453764969411057, + "learning_rate": 8.557759701335755e-07, + "loss": 0.6596, + "step": 7761 + }, + { + "epoch": 0.82, + "grad_norm": 3.755171473082308, + "learning_rate": 8.54822726120006e-07, + "loss": 0.5935, + "step": 7762 + }, + { + "epoch": 0.82, + "grad_norm": 2.3621820404124114, + "learning_rate": 8.538699636811493e-07, + "loss": 0.6247, + "step": 7763 + }, + { + "epoch": 0.82, + "grad_norm": 2.5573182046892384, + "learning_rate": 8.529176829276964e-07, + "loss": 0.5769, + "step": 7764 + }, + { + "epoch": 0.82, + "grad_norm": 2.452128633867755, + "learning_rate": 8.519658839702787e-07, + "loss": 0.5634, + "step": 7765 + }, + { + "epoch": 0.82, + "grad_norm": 3.1439091682655964, + "learning_rate": 8.51014566919473e-07, + "loss": 0.6145, + "step": 7766 + }, + { + "epoch": 0.82, + "grad_norm": 7.479175976072859, + "learning_rate": 8.500637318858018e-07, + "loss": 0.6538, + "step": 7767 + }, + { + "epoch": 0.82, + "grad_norm": 2.430856695338035, + "learning_rate": 8.491133789797307e-07, + "loss": 0.6279, + "step": 7768 + }, + { + "epoch": 0.82, + "grad_norm": 2.770347480171204, + "learning_rate": 8.481635083116668e-07, + "loss": 0.6057, + "step": 7769 + }, + { + "epoch": 0.82, + "grad_norm": 3.069965846489253, + "learning_rate": 8.472141199919664e-07, + "loss": 0.5681, + "step": 7770 + }, + { + "epoch": 0.82, + "grad_norm": 2.9992990174535383, + "learning_rate": 8.462652141309242e-07, + "loss": 0.6433, + "step": 7771 + }, + { + "epoch": 0.82, + "grad_norm": 2.79447885278482, + "learning_rate": 8.453167908387827e-07, + "loss": 0.6942, + "step": 7772 + }, + { + "epoch": 0.82, + "grad_norm": 2.3694158265158682, + "learning_rate": 8.443688502257253e-07, + "loss": 0.5695, + "step": 7773 + }, + { + "epoch": 0.82, + "grad_norm": 2.6521967702602134, + "learning_rate": 8.434213924018836e-07, + "loss": 0.6175, + "step": 7774 + }, + { + "epoch": 0.82, + "grad_norm": 6.450142386712109, + "learning_rate": 8.424744174773281e-07, + "loss": 0.5917, + "step": 7775 + }, + { + "epoch": 0.82, + "grad_norm": 2.60990893147203, + "learning_rate": 8.415279255620762e-07, + "loss": 0.7291, + "step": 7776 + }, + { + "epoch": 0.82, + "grad_norm": 2.3162030741562702, + "learning_rate": 8.405819167660906e-07, + "loss": 0.5496, + "step": 7777 + }, + { + "epoch": 0.82, + "grad_norm": 4.454400577973904, + "learning_rate": 8.396363911992739e-07, + "loss": 0.6948, + "step": 7778 + }, + { + "epoch": 0.82, + "grad_norm": 2.833872698534407, + "learning_rate": 8.386913489714737e-07, + "loss": 0.5477, + "step": 7779 + }, + { + "epoch": 0.82, + "grad_norm": 2.445942443952145, + "learning_rate": 8.377467901924835e-07, + "loss": 0.5796, + "step": 7780 + }, + { + "epoch": 0.82, + "grad_norm": 2.537918156805052, + "learning_rate": 8.368027149720404e-07, + "loss": 0.6614, + "step": 7781 + }, + { + "epoch": 0.82, + "grad_norm": 2.100483149802904, + "learning_rate": 8.358591234198221e-07, + "loss": 0.6177, + "step": 7782 + }, + { + "epoch": 0.82, + "grad_norm": 2.7198326768529277, + "learning_rate": 8.34916015645455e-07, + "loss": 0.625, + "step": 7783 + }, + { + "epoch": 0.82, + "grad_norm": 3.8763253745346042, + "learning_rate": 8.339733917585041e-07, + "loss": 0.504, + "step": 7784 + }, + { + "epoch": 0.82, + "grad_norm": 4.0339142550402896, + "learning_rate": 8.330312518684813e-07, + "loss": 0.6544, + "step": 7785 + }, + { + "epoch": 0.82, + "grad_norm": 3.372358625907128, + "learning_rate": 8.320895960848435e-07, + "loss": 0.662, + "step": 7786 + }, + { + "epoch": 0.82, + "grad_norm": 11.12693531528549, + "learning_rate": 8.311484245169888e-07, + "loss": 0.5841, + "step": 7787 + }, + { + "epoch": 0.82, + "grad_norm": 3.0258617844542615, + "learning_rate": 8.302077372742573e-07, + "loss": 0.5996, + "step": 7788 + }, + { + "epoch": 0.82, + "grad_norm": 2.149848595276636, + "learning_rate": 8.292675344659374e-07, + "loss": 0.5624, + "step": 7789 + }, + { + "epoch": 0.82, + "grad_norm": 9.921586434700963, + "learning_rate": 8.283278162012604e-07, + "loss": 0.6565, + "step": 7790 + }, + { + "epoch": 0.82, + "grad_norm": 2.223681029796389, + "learning_rate": 8.273885825893984e-07, + "loss": 0.572, + "step": 7791 + }, + { + "epoch": 0.82, + "grad_norm": 12.307302243450632, + "learning_rate": 8.264498337394683e-07, + "loss": 0.7158, + "step": 7792 + }, + { + "epoch": 0.82, + "grad_norm": 3.185259212276455, + "learning_rate": 8.255115697605315e-07, + "loss": 0.6205, + "step": 7793 + }, + { + "epoch": 0.82, + "grad_norm": 2.3742771690682867, + "learning_rate": 8.245737907615948e-07, + "loss": 0.571, + "step": 7794 + }, + { + "epoch": 0.82, + "grad_norm": 2.0306399507949875, + "learning_rate": 8.236364968516036e-07, + "loss": 0.574, + "step": 7795 + }, + { + "epoch": 0.82, + "grad_norm": 2.95185086622032, + "learning_rate": 8.226996881394533e-07, + "loss": 0.5969, + "step": 7796 + }, + { + "epoch": 0.82, + "grad_norm": 2.83828313253317, + "learning_rate": 8.217633647339762e-07, + "loss": 0.6308, + "step": 7797 + }, + { + "epoch": 0.82, + "grad_norm": 2.740058288843735, + "learning_rate": 8.208275267439536e-07, + "loss": 0.6409, + "step": 7798 + }, + { + "epoch": 0.82, + "grad_norm": 3.205986939831031, + "learning_rate": 8.19892174278109e-07, + "loss": 0.638, + "step": 7799 + }, + { + "epoch": 0.82, + "grad_norm": 3.133784418145159, + "learning_rate": 8.189573074451084e-07, + "loss": 0.5896, + "step": 7800 + }, + { + "epoch": 0.82, + "grad_norm": 4.651287297043312, + "learning_rate": 8.180229263535605e-07, + "loss": 0.6102, + "step": 7801 + }, + { + "epoch": 0.82, + "grad_norm": 2.8083107733624537, + "learning_rate": 8.1708903111202e-07, + "loss": 0.5911, + "step": 7802 + }, + { + "epoch": 0.82, + "grad_norm": 2.697503873505146, + "learning_rate": 8.161556218289857e-07, + "loss": 0.5624, + "step": 7803 + }, + { + "epoch": 0.82, + "grad_norm": 2.3094841871089065, + "learning_rate": 8.15222698612897e-07, + "loss": 0.5943, + "step": 7804 + }, + { + "epoch": 0.82, + "grad_norm": 2.791054703314953, + "learning_rate": 8.142902615721371e-07, + "loss": 0.5893, + "step": 7805 + }, + { + "epoch": 0.82, + "grad_norm": 2.2449207941075198, + "learning_rate": 8.133583108150345e-07, + "loss": 0.5883, + "step": 7806 + }, + { + "epoch": 0.82, + "grad_norm": 6.264533076641818, + "learning_rate": 8.124268464498625e-07, + "loss": 0.5802, + "step": 7807 + }, + { + "epoch": 0.82, + "grad_norm": 2.397265245054088, + "learning_rate": 8.114958685848334e-07, + "loss": 0.7004, + "step": 7808 + }, + { + "epoch": 0.82, + "grad_norm": 2.687783198325523, + "learning_rate": 8.105653773281074e-07, + "loss": 0.633, + "step": 7809 + }, + { + "epoch": 0.82, + "grad_norm": 4.572173865987845, + "learning_rate": 8.096353727877843e-07, + "loss": 0.6158, + "step": 7810 + }, + { + "epoch": 0.82, + "grad_norm": 2.084933298498584, + "learning_rate": 8.087058550719107e-07, + "loss": 0.6373, + "step": 7811 + }, + { + "epoch": 0.82, + "grad_norm": 4.117019095691375, + "learning_rate": 8.077768242884759e-07, + "loss": 0.5337, + "step": 7812 + }, + { + "epoch": 0.82, + "grad_norm": 2.526067117482907, + "learning_rate": 8.068482805454115e-07, + "loss": 0.5935, + "step": 7813 + }, + { + "epoch": 0.82, + "grad_norm": 2.770744913286671, + "learning_rate": 8.059202239505915e-07, + "loss": 0.6824, + "step": 7814 + }, + { + "epoch": 0.82, + "grad_norm": 2.3876150794603417, + "learning_rate": 8.049926546118359e-07, + "loss": 0.6197, + "step": 7815 + }, + { + "epoch": 0.82, + "grad_norm": 2.301040153613851, + "learning_rate": 8.040655726369079e-07, + "loss": 0.6231, + "step": 7816 + }, + { + "epoch": 0.82, + "grad_norm": 2.1640120127578, + "learning_rate": 8.031389781335119e-07, + "loss": 0.5915, + "step": 7817 + }, + { + "epoch": 0.82, + "grad_norm": 1.0216851076508051, + "learning_rate": 8.022128712092986e-07, + "loss": 0.5629, + "step": 7818 + }, + { + "epoch": 0.82, + "grad_norm": 2.3407168223787758, + "learning_rate": 8.012872519718578e-07, + "loss": 0.5779, + "step": 7819 + }, + { + "epoch": 0.82, + "grad_norm": 3.240628782070062, + "learning_rate": 8.003621205287271e-07, + "loss": 0.6247, + "step": 7820 + }, + { + "epoch": 0.82, + "grad_norm": 2.5968226751951202, + "learning_rate": 7.994374769873864e-07, + "loss": 0.5631, + "step": 7821 + }, + { + "epoch": 0.82, + "grad_norm": 2.9225493166351844, + "learning_rate": 7.98513321455257e-07, + "loss": 0.5619, + "step": 7822 + }, + { + "epoch": 0.82, + "grad_norm": 2.5510204032518633, + "learning_rate": 7.975896540397038e-07, + "loss": 0.5647, + "step": 7823 + }, + { + "epoch": 0.82, + "grad_norm": 3.2618088874821436, + "learning_rate": 7.966664748480362e-07, + "loss": 0.681, + "step": 7824 + }, + { + "epoch": 0.82, + "grad_norm": 2.432627062148813, + "learning_rate": 7.957437839875088e-07, + "loss": 0.619, + "step": 7825 + }, + { + "epoch": 0.82, + "grad_norm": 2.881899570004556, + "learning_rate": 7.948215815653149e-07, + "loss": 0.5353, + "step": 7826 + }, + { + "epoch": 0.82, + "grad_norm": 2.211030969729491, + "learning_rate": 7.938998676885922e-07, + "loss": 0.5992, + "step": 7827 + }, + { + "epoch": 0.82, + "grad_norm": 2.7189882982651374, + "learning_rate": 7.929786424644248e-07, + "loss": 0.5802, + "step": 7828 + }, + { + "epoch": 0.82, + "grad_norm": 2.6023452660932165, + "learning_rate": 7.920579059998384e-07, + "loss": 0.575, + "step": 7829 + }, + { + "epoch": 0.82, + "grad_norm": 2.417297217989513, + "learning_rate": 7.911376584017993e-07, + "loss": 0.5887, + "step": 7830 + }, + { + "epoch": 0.82, + "grad_norm": 2.636618821203334, + "learning_rate": 7.90217899777222e-07, + "loss": 0.5957, + "step": 7831 + }, + { + "epoch": 0.82, + "grad_norm": 2.783745505915478, + "learning_rate": 7.89298630232958e-07, + "loss": 0.6066, + "step": 7832 + }, + { + "epoch": 0.82, + "grad_norm": 2.211994261833092, + "learning_rate": 7.883798498758077e-07, + "loss": 0.5615, + "step": 7833 + }, + { + "epoch": 0.82, + "grad_norm": 2.316992802552742, + "learning_rate": 7.874615588125128e-07, + "loss": 0.5865, + "step": 7834 + }, + { + "epoch": 0.82, + "grad_norm": 2.540384973156056, + "learning_rate": 7.865437571497569e-07, + "loss": 0.5444, + "step": 7835 + }, + { + "epoch": 0.82, + "grad_norm": 2.2642580877781904, + "learning_rate": 7.856264449941664e-07, + "loss": 0.5456, + "step": 7836 + }, + { + "epoch": 0.82, + "grad_norm": 2.5525408933985214, + "learning_rate": 7.847096224523132e-07, + "loss": 0.607, + "step": 7837 + }, + { + "epoch": 0.82, + "grad_norm": 2.3245711806217613, + "learning_rate": 7.837932896307116e-07, + "loss": 0.5573, + "step": 7838 + }, + { + "epoch": 0.82, + "grad_norm": 3.05579417550974, + "learning_rate": 7.82877446635818e-07, + "loss": 0.5761, + "step": 7839 + }, + { + "epoch": 0.83, + "grad_norm": 2.5025364694067136, + "learning_rate": 7.819620935740313e-07, + "loss": 0.5728, + "step": 7840 + }, + { + "epoch": 0.83, + "grad_norm": 2.541642302021767, + "learning_rate": 7.810472305516947e-07, + "loss": 0.6021, + "step": 7841 + }, + { + "epoch": 0.83, + "grad_norm": 2.6527868881900747, + "learning_rate": 7.801328576750971e-07, + "loss": 0.5955, + "step": 7842 + }, + { + "epoch": 0.83, + "grad_norm": 3.035882397492675, + "learning_rate": 7.792189750504642e-07, + "loss": 0.559, + "step": 7843 + }, + { + "epoch": 0.83, + "grad_norm": 3.4225984060139196, + "learning_rate": 7.783055827839709e-07, + "loss": 0.594, + "step": 7844 + }, + { + "epoch": 0.83, + "grad_norm": 2.435724626217545, + "learning_rate": 7.7739268098173e-07, + "loss": 0.593, + "step": 7845 + }, + { + "epoch": 0.83, + "grad_norm": 2.4904962113373412, + "learning_rate": 7.764802697498009e-07, + "loss": 0.615, + "step": 7846 + }, + { + "epoch": 0.83, + "grad_norm": 2.464037403138296, + "learning_rate": 7.755683491941867e-07, + "loss": 0.7109, + "step": 7847 + }, + { + "epoch": 0.83, + "grad_norm": 2.9748316199128824, + "learning_rate": 7.746569194208298e-07, + "loss": 0.6551, + "step": 7848 + }, + { + "epoch": 0.83, + "grad_norm": 2.2358191510721923, + "learning_rate": 7.737459805356168e-07, + "loss": 0.5977, + "step": 7849 + }, + { + "epoch": 0.83, + "grad_norm": 2.419490109419338, + "learning_rate": 7.728355326443792e-07, + "loss": 0.5004, + "step": 7850 + }, + { + "epoch": 0.83, + "grad_norm": 2.911759699027262, + "learning_rate": 7.719255758528904e-07, + "loss": 0.604, + "step": 7851 + }, + { + "epoch": 0.83, + "grad_norm": 2.859440106403186, + "learning_rate": 7.710161102668667e-07, + "loss": 0.5881, + "step": 7852 + }, + { + "epoch": 0.83, + "grad_norm": 3.133615283383733, + "learning_rate": 7.701071359919654e-07, + "loss": 0.5945, + "step": 7853 + }, + { + "epoch": 0.83, + "grad_norm": 2.754614469699022, + "learning_rate": 7.691986531337891e-07, + "loss": 0.5677, + "step": 7854 + }, + { + "epoch": 0.83, + "grad_norm": 2.7443708202298573, + "learning_rate": 7.682906617978836e-07, + "loss": 0.6404, + "step": 7855 + }, + { + "epoch": 0.83, + "grad_norm": 3.087205618369263, + "learning_rate": 7.673831620897376e-07, + "loss": 0.6038, + "step": 7856 + }, + { + "epoch": 0.83, + "grad_norm": 5.5096389980291365, + "learning_rate": 7.664761541147803e-07, + "loss": 0.6013, + "step": 7857 + }, + { + "epoch": 0.83, + "grad_norm": 2.4275211367126226, + "learning_rate": 7.65569637978385e-07, + "loss": 0.6306, + "step": 7858 + }, + { + "epoch": 0.83, + "grad_norm": 3.0724412487922104, + "learning_rate": 7.646636137858682e-07, + "loss": 0.5641, + "step": 7859 + }, + { + "epoch": 0.83, + "grad_norm": 2.6908765435507656, + "learning_rate": 7.637580816424906e-07, + "loss": 0.5556, + "step": 7860 + }, + { + "epoch": 0.83, + "grad_norm": 2.2336816700244158, + "learning_rate": 7.628530416534536e-07, + "loss": 0.6658, + "step": 7861 + }, + { + "epoch": 0.83, + "grad_norm": 5.534474436786471, + "learning_rate": 7.619484939239008e-07, + "loss": 0.5065, + "step": 7862 + }, + { + "epoch": 0.83, + "grad_norm": 2.25517760727134, + "learning_rate": 7.610444385589206e-07, + "loss": 0.5669, + "step": 7863 + }, + { + "epoch": 0.83, + "grad_norm": 2.4664200875331077, + "learning_rate": 7.601408756635454e-07, + "loss": 0.5449, + "step": 7864 + }, + { + "epoch": 0.83, + "grad_norm": 2.3983166993158402, + "learning_rate": 7.592378053427463e-07, + "loss": 0.631, + "step": 7865 + }, + { + "epoch": 0.83, + "grad_norm": 2.44965513205932, + "learning_rate": 7.583352277014405e-07, + "loss": 0.6007, + "step": 7866 + }, + { + "epoch": 0.83, + "grad_norm": 2.783794296236303, + "learning_rate": 7.574331428444859e-07, + "loss": 0.6112, + "step": 7867 + }, + { + "epoch": 0.83, + "grad_norm": 7.858796558481055, + "learning_rate": 7.565315508766846e-07, + "loss": 0.6879, + "step": 7868 + }, + { + "epoch": 0.83, + "grad_norm": 2.3392675349173246, + "learning_rate": 7.556304519027824e-07, + "loss": 0.535, + "step": 7869 + }, + { + "epoch": 0.83, + "grad_norm": 2.345381147919508, + "learning_rate": 7.547298460274655e-07, + "loss": 0.5671, + "step": 7870 + }, + { + "epoch": 0.83, + "grad_norm": 1.960829913288072, + "learning_rate": 7.538297333553613e-07, + "loss": 0.5243, + "step": 7871 + }, + { + "epoch": 0.83, + "grad_norm": 2.4386836992879974, + "learning_rate": 7.529301139910444e-07, + "loss": 0.5946, + "step": 7872 + }, + { + "epoch": 0.83, + "grad_norm": 2.1914556637515674, + "learning_rate": 7.520309880390314e-07, + "loss": 0.5832, + "step": 7873 + }, + { + "epoch": 0.83, + "grad_norm": 4.241007701198296, + "learning_rate": 7.51132355603778e-07, + "loss": 0.5416, + "step": 7874 + }, + { + "epoch": 0.83, + "grad_norm": 2.0724515989906513, + "learning_rate": 7.502342167896847e-07, + "loss": 0.5964, + "step": 7875 + }, + { + "epoch": 0.83, + "grad_norm": 2.5100584726157886, + "learning_rate": 7.493365717010947e-07, + "loss": 0.631, + "step": 7876 + }, + { + "epoch": 0.83, + "grad_norm": 2.7356056595469536, + "learning_rate": 7.484394204422962e-07, + "loss": 0.5424, + "step": 7877 + }, + { + "epoch": 0.83, + "grad_norm": 2.0885569649811786, + "learning_rate": 7.475427631175141e-07, + "loss": 0.5181, + "step": 7878 + }, + { + "epoch": 0.83, + "grad_norm": 2.304547742686197, + "learning_rate": 7.466465998309225e-07, + "loss": 0.6151, + "step": 7879 + }, + { + "epoch": 0.83, + "grad_norm": 2.2979784331851913, + "learning_rate": 7.457509306866329e-07, + "loss": 0.569, + "step": 7880 + }, + { + "epoch": 0.83, + "grad_norm": 3.0618039183537324, + "learning_rate": 7.448557557887021e-07, + "loss": 0.586, + "step": 7881 + }, + { + "epoch": 0.83, + "grad_norm": 3.5231994277364835, + "learning_rate": 7.439610752411303e-07, + "loss": 0.6385, + "step": 7882 + }, + { + "epoch": 0.83, + "grad_norm": 2.5269036735363617, + "learning_rate": 7.430668891478576e-07, + "loss": 0.5901, + "step": 7883 + }, + { + "epoch": 0.83, + "grad_norm": 2.3674099325580866, + "learning_rate": 7.421731976127672e-07, + "loss": 0.7775, + "step": 7884 + }, + { + "epoch": 0.83, + "grad_norm": 2.671376901489743, + "learning_rate": 7.41280000739687e-07, + "loss": 0.5261, + "step": 7885 + }, + { + "epoch": 0.83, + "grad_norm": 2.820411629914039, + "learning_rate": 7.403872986323862e-07, + "loss": 0.569, + "step": 7886 + }, + { + "epoch": 0.83, + "grad_norm": 2.577175781646432, + "learning_rate": 7.394950913945759e-07, + "loss": 0.5512, + "step": 7887 + }, + { + "epoch": 0.83, + "grad_norm": 2.4642031464494174, + "learning_rate": 7.386033791299091e-07, + "loss": 0.6127, + "step": 7888 + }, + { + "epoch": 0.83, + "grad_norm": 2.434672247809207, + "learning_rate": 7.37712161941983e-07, + "loss": 0.6554, + "step": 7889 + }, + { + "epoch": 0.83, + "grad_norm": 2.5103511043710465, + "learning_rate": 7.368214399343371e-07, + "loss": 0.5331, + "step": 7890 + }, + { + "epoch": 0.83, + "grad_norm": 2.7655999882645057, + "learning_rate": 7.35931213210454e-07, + "loss": 0.7108, + "step": 7891 + }, + { + "epoch": 0.83, + "grad_norm": 3.2183982961814324, + "learning_rate": 7.350414818737562e-07, + "loss": 0.5502, + "step": 7892 + }, + { + "epoch": 0.83, + "grad_norm": 3.586664475078197, + "learning_rate": 7.34152246027609e-07, + "loss": 0.5924, + "step": 7893 + }, + { + "epoch": 0.83, + "grad_norm": 2.366201926305009, + "learning_rate": 7.332635057753224e-07, + "loss": 0.6236, + "step": 7894 + }, + { + "epoch": 0.83, + "grad_norm": 2.3175961493215635, + "learning_rate": 7.323752612201491e-07, + "loss": 0.5758, + "step": 7895 + }, + { + "epoch": 0.83, + "grad_norm": 16.41576198449689, + "learning_rate": 7.314875124652815e-07, + "loss": 0.6657, + "step": 7896 + }, + { + "epoch": 0.83, + "grad_norm": 3.516520516533135, + "learning_rate": 7.306002596138551e-07, + "loss": 0.6438, + "step": 7897 + }, + { + "epoch": 0.83, + "grad_norm": 2.9227517959801372, + "learning_rate": 7.297135027689484e-07, + "loss": 0.5942, + "step": 7898 + }, + { + "epoch": 0.83, + "grad_norm": 2.6069484447619486, + "learning_rate": 7.288272420335841e-07, + "loss": 0.5786, + "step": 7899 + }, + { + "epoch": 0.83, + "grad_norm": 2.5621275414127007, + "learning_rate": 7.279414775107241e-07, + "loss": 0.6239, + "step": 7900 + }, + { + "epoch": 0.83, + "grad_norm": 2.5478933874139176, + "learning_rate": 7.270562093032724e-07, + "loss": 0.6154, + "step": 7901 + }, + { + "epoch": 0.83, + "grad_norm": 2.5146219884970686, + "learning_rate": 7.261714375140788e-07, + "loss": 0.6525, + "step": 7902 + }, + { + "epoch": 0.83, + "grad_norm": 4.26095835769191, + "learning_rate": 7.252871622459335e-07, + "loss": 0.6109, + "step": 7903 + }, + { + "epoch": 0.83, + "grad_norm": 2.7915685895483926, + "learning_rate": 7.244033836015696e-07, + "loss": 0.7238, + "step": 7904 + }, + { + "epoch": 0.83, + "grad_norm": 2.44562542266984, + "learning_rate": 7.235201016836613e-07, + "loss": 0.5962, + "step": 7905 + }, + { + "epoch": 0.83, + "grad_norm": 4.6869459471637755, + "learning_rate": 7.226373165948241e-07, + "loss": 0.5836, + "step": 7906 + }, + { + "epoch": 0.83, + "grad_norm": 4.282445234624132, + "learning_rate": 7.21755028437619e-07, + "loss": 0.6254, + "step": 7907 + }, + { + "epoch": 0.83, + "grad_norm": 4.521493455628609, + "learning_rate": 7.208732373145483e-07, + "loss": 0.6589, + "step": 7908 + }, + { + "epoch": 0.83, + "grad_norm": 2.232695529398515, + "learning_rate": 7.199919433280555e-07, + "loss": 0.6094, + "step": 7909 + }, + { + "epoch": 0.83, + "grad_norm": 3.962644679645622, + "learning_rate": 7.191111465805256e-07, + "loss": 0.5981, + "step": 7910 + }, + { + "epoch": 0.83, + "grad_norm": 2.4471859782636165, + "learning_rate": 7.182308471742877e-07, + "loss": 0.5816, + "step": 7911 + }, + { + "epoch": 0.83, + "grad_norm": 2.702379481712408, + "learning_rate": 7.173510452116139e-07, + "loss": 0.68, + "step": 7912 + }, + { + "epoch": 0.83, + "grad_norm": 1.0632897742865417, + "learning_rate": 7.164717407947142e-07, + "loss": 0.5708, + "step": 7913 + }, + { + "epoch": 0.83, + "grad_norm": 2.423696780817971, + "learning_rate": 7.155929340257467e-07, + "loss": 0.5654, + "step": 7914 + }, + { + "epoch": 0.83, + "grad_norm": 2.0545652099506952, + "learning_rate": 7.14714625006806e-07, + "loss": 0.549, + "step": 7915 + }, + { + "epoch": 0.83, + "grad_norm": 4.240068457011979, + "learning_rate": 7.138368138399327e-07, + "loss": 0.575, + "step": 7916 + }, + { + "epoch": 0.83, + "grad_norm": 2.354272324791377, + "learning_rate": 7.129595006271095e-07, + "loss": 0.6664, + "step": 7917 + }, + { + "epoch": 0.83, + "grad_norm": 3.1167965022837523, + "learning_rate": 7.120826854702589e-07, + "loss": 0.5909, + "step": 7918 + }, + { + "epoch": 0.83, + "grad_norm": 2.8860947312539493, + "learning_rate": 7.112063684712456e-07, + "loss": 0.5764, + "step": 7919 + }, + { + "epoch": 0.83, + "grad_norm": 2.5639149644592307, + "learning_rate": 7.103305497318786e-07, + "loss": 0.5729, + "step": 7920 + }, + { + "epoch": 0.83, + "grad_norm": 4.157567164040736, + "learning_rate": 7.094552293539098e-07, + "loss": 0.6691, + "step": 7921 + }, + { + "epoch": 0.83, + "grad_norm": 3.017279987323207, + "learning_rate": 7.08580407439029e-07, + "loss": 0.7188, + "step": 7922 + }, + { + "epoch": 0.83, + "grad_norm": 3.180206068529602, + "learning_rate": 7.077060840888705e-07, + "loss": 0.5835, + "step": 7923 + }, + { + "epoch": 0.83, + "grad_norm": 3.2183678021512128, + "learning_rate": 7.068322594050114e-07, + "loss": 0.5853, + "step": 7924 + }, + { + "epoch": 0.83, + "grad_norm": 2.730991122722091, + "learning_rate": 7.059589334889705e-07, + "loss": 0.5394, + "step": 7925 + }, + { + "epoch": 0.83, + "grad_norm": 2.7372841955338085, + "learning_rate": 7.050861064422087e-07, + "loss": 0.6588, + "step": 7926 + }, + { + "epoch": 0.83, + "grad_norm": 2.8242014928415333, + "learning_rate": 7.042137783661273e-07, + "loss": 0.6495, + "step": 7927 + }, + { + "epoch": 0.83, + "grad_norm": 3.3717042988457577, + "learning_rate": 7.033419493620708e-07, + "loss": 0.6167, + "step": 7928 + }, + { + "epoch": 0.83, + "grad_norm": 0.9434034854373674, + "learning_rate": 7.024706195313258e-07, + "loss": 0.5459, + "step": 7929 + }, + { + "epoch": 0.83, + "grad_norm": 0.948265264022775, + "learning_rate": 7.015997889751225e-07, + "loss": 0.5389, + "step": 7930 + }, + { + "epoch": 0.83, + "grad_norm": 2.464153456017488, + "learning_rate": 7.007294577946306e-07, + "loss": 0.5651, + "step": 7931 + }, + { + "epoch": 0.83, + "grad_norm": 3.0196234432409446, + "learning_rate": 6.998596260909607e-07, + "loss": 0.6672, + "step": 7932 + }, + { + "epoch": 0.83, + "grad_norm": 2.2187036742506003, + "learning_rate": 6.989902939651694e-07, + "loss": 0.6165, + "step": 7933 + }, + { + "epoch": 0.83, + "grad_norm": 2.4128874884249774, + "learning_rate": 6.981214615182541e-07, + "loss": 0.5248, + "step": 7934 + }, + { + "epoch": 0.83, + "grad_norm": 2.714711543409674, + "learning_rate": 6.972531288511514e-07, + "loss": 0.6234, + "step": 7935 + }, + { + "epoch": 0.84, + "grad_norm": 2.566568280987358, + "learning_rate": 6.963852960647416e-07, + "loss": 0.6142, + "step": 7936 + }, + { + "epoch": 0.84, + "grad_norm": 2.5912361052965105, + "learning_rate": 6.955179632598475e-07, + "loss": 0.6019, + "step": 7937 + }, + { + "epoch": 0.84, + "grad_norm": 3.0046865938581657, + "learning_rate": 6.946511305372327e-07, + "loss": 0.6551, + "step": 7938 + }, + { + "epoch": 0.84, + "grad_norm": 2.62750432152481, + "learning_rate": 6.937847979976059e-07, + "loss": 0.7307, + "step": 7939 + }, + { + "epoch": 0.84, + "grad_norm": 4.463425594035302, + "learning_rate": 6.929189657416136e-07, + "loss": 0.6263, + "step": 7940 + }, + { + "epoch": 0.84, + "grad_norm": 2.293961976197948, + "learning_rate": 6.920536338698436e-07, + "loss": 0.7305, + "step": 7941 + }, + { + "epoch": 0.84, + "grad_norm": 3.245455408558447, + "learning_rate": 6.911888024828295e-07, + "loss": 0.6554, + "step": 7942 + }, + { + "epoch": 0.84, + "grad_norm": 2.0647694096526474, + "learning_rate": 6.903244716810459e-07, + "loss": 0.5978, + "step": 7943 + }, + { + "epoch": 0.84, + "grad_norm": 2.2770572793711135, + "learning_rate": 6.894606415649074e-07, + "loss": 0.6284, + "step": 7944 + }, + { + "epoch": 0.84, + "grad_norm": 2.590717718772522, + "learning_rate": 6.8859731223477e-07, + "loss": 0.6585, + "step": 7945 + }, + { + "epoch": 0.84, + "grad_norm": 2.1415058085079095, + "learning_rate": 6.877344837909334e-07, + "loss": 0.6856, + "step": 7946 + }, + { + "epoch": 0.84, + "grad_norm": 2.5987535704327875, + "learning_rate": 6.868721563336406e-07, + "loss": 0.6328, + "step": 7947 + }, + { + "epoch": 0.84, + "grad_norm": 2.5233039406914495, + "learning_rate": 6.860103299630722e-07, + "loss": 0.6124, + "step": 7948 + }, + { + "epoch": 0.84, + "grad_norm": 2.524754669248327, + "learning_rate": 6.851490047793524e-07, + "loss": 0.6072, + "step": 7949 + }, + { + "epoch": 0.84, + "grad_norm": 2.540427405947607, + "learning_rate": 6.84288180882548e-07, + "loss": 0.6113, + "step": 7950 + }, + { + "epoch": 0.84, + "grad_norm": 0.9908048969702398, + "learning_rate": 6.834278583726677e-07, + "loss": 0.532, + "step": 7951 + }, + { + "epoch": 0.84, + "grad_norm": 3.2038346124338073, + "learning_rate": 6.825680373496618e-07, + "loss": 0.5439, + "step": 7952 + }, + { + "epoch": 0.84, + "grad_norm": 0.9377925721646436, + "learning_rate": 6.817087179134208e-07, + "loss": 0.5117, + "step": 7953 + }, + { + "epoch": 0.84, + "grad_norm": 3.407003844113903, + "learning_rate": 6.80849900163777e-07, + "loss": 0.5797, + "step": 7954 + }, + { + "epoch": 0.84, + "grad_norm": 2.2839557052539923, + "learning_rate": 6.799915842005062e-07, + "loss": 0.5703, + "step": 7955 + }, + { + "epoch": 0.84, + "grad_norm": 2.3947698210437935, + "learning_rate": 6.791337701233269e-07, + "loss": 0.5937, + "step": 7956 + }, + { + "epoch": 0.84, + "grad_norm": 2.6739236325422904, + "learning_rate": 6.782764580318951e-07, + "loss": 0.7199, + "step": 7957 + }, + { + "epoch": 0.84, + "grad_norm": 2.1116863967686004, + "learning_rate": 6.774196480258111e-07, + "loss": 0.6494, + "step": 7958 + }, + { + "epoch": 0.84, + "grad_norm": 2.1683016397788997, + "learning_rate": 6.765633402046168e-07, + "loss": 0.5955, + "step": 7959 + }, + { + "epoch": 0.84, + "grad_norm": 3.032305057994704, + "learning_rate": 6.757075346677961e-07, + "loss": 0.603, + "step": 7960 + }, + { + "epoch": 0.84, + "grad_norm": 2.2604613175721164, + "learning_rate": 6.748522315147744e-07, + "loss": 0.6187, + "step": 7961 + }, + { + "epoch": 0.84, + "grad_norm": 4.969636492342722, + "learning_rate": 6.739974308449176e-07, + "loss": 0.6081, + "step": 7962 + }, + { + "epoch": 0.84, + "grad_norm": 3.0339603599056373, + "learning_rate": 6.731431327575339e-07, + "loss": 0.6901, + "step": 7963 + }, + { + "epoch": 0.84, + "grad_norm": 3.2703468846615533, + "learning_rate": 6.722893373518724e-07, + "loss": 0.5622, + "step": 7964 + }, + { + "epoch": 0.84, + "grad_norm": 2.4646080011429485, + "learning_rate": 6.714360447271273e-07, + "loss": 0.522, + "step": 7965 + }, + { + "epoch": 0.84, + "grad_norm": 2.364242035976487, + "learning_rate": 6.705832549824293e-07, + "loss": 0.6273, + "step": 7966 + }, + { + "epoch": 0.84, + "grad_norm": 2.7352098181866817, + "learning_rate": 6.69730968216853e-07, + "loss": 0.6764, + "step": 7967 + }, + { + "epoch": 0.84, + "grad_norm": 4.616311936586499, + "learning_rate": 6.688791845294151e-07, + "loss": 0.6525, + "step": 7968 + }, + { + "epoch": 0.84, + "grad_norm": 2.4255233825081826, + "learning_rate": 6.680279040190745e-07, + "loss": 0.6588, + "step": 7969 + }, + { + "epoch": 0.84, + "grad_norm": 2.5378364716116764, + "learning_rate": 6.671771267847299e-07, + "loss": 0.5982, + "step": 7970 + }, + { + "epoch": 0.84, + "grad_norm": 2.323293588729379, + "learning_rate": 6.663268529252209e-07, + "loss": 0.6346, + "step": 7971 + }, + { + "epoch": 0.84, + "grad_norm": 2.164418560400295, + "learning_rate": 6.654770825393303e-07, + "loss": 0.6584, + "step": 7972 + }, + { + "epoch": 0.84, + "grad_norm": 2.9789326506768754, + "learning_rate": 6.646278157257824e-07, + "loss": 0.5408, + "step": 7973 + }, + { + "epoch": 0.84, + "grad_norm": 2.4473343292404257, + "learning_rate": 6.637790525832438e-07, + "loss": 0.6776, + "step": 7974 + }, + { + "epoch": 0.84, + "grad_norm": 4.333751171962327, + "learning_rate": 6.629307932103201e-07, + "loss": 0.6228, + "step": 7975 + }, + { + "epoch": 0.84, + "grad_norm": 4.194575988760013, + "learning_rate": 6.620830377055587e-07, + "loss": 0.5451, + "step": 7976 + }, + { + "epoch": 0.84, + "grad_norm": 2.1276202217427196, + "learning_rate": 6.612357861674501e-07, + "loss": 0.5791, + "step": 7977 + }, + { + "epoch": 0.84, + "grad_norm": 2.5647125903291776, + "learning_rate": 6.603890386944273e-07, + "loss": 0.5679, + "step": 7978 + }, + { + "epoch": 0.84, + "grad_norm": 2.3644218883143338, + "learning_rate": 6.59542795384861e-07, + "loss": 0.5646, + "step": 7979 + }, + { + "epoch": 0.84, + "grad_norm": 3.3454428465301946, + "learning_rate": 6.586970563370649e-07, + "loss": 0.6593, + "step": 7980 + }, + { + "epoch": 0.84, + "grad_norm": 3.2957829519224, + "learning_rate": 6.578518216492951e-07, + "loss": 0.6074, + "step": 7981 + }, + { + "epoch": 0.84, + "grad_norm": 2.989238602602019, + "learning_rate": 6.570070914197496e-07, + "loss": 0.6612, + "step": 7982 + }, + { + "epoch": 0.84, + "grad_norm": 2.6361300624581046, + "learning_rate": 6.561628657465663e-07, + "loss": 0.6422, + "step": 7983 + }, + { + "epoch": 0.84, + "grad_norm": 2.3784169240024085, + "learning_rate": 6.553191447278234e-07, + "loss": 0.6598, + "step": 7984 + }, + { + "epoch": 0.84, + "grad_norm": 4.116665012500234, + "learning_rate": 6.544759284615431e-07, + "loss": 0.5628, + "step": 7985 + }, + { + "epoch": 0.84, + "grad_norm": 4.866416810480354, + "learning_rate": 6.536332170456877e-07, + "loss": 0.5985, + "step": 7986 + }, + { + "epoch": 0.84, + "grad_norm": 2.122808635192976, + "learning_rate": 6.527910105781626e-07, + "loss": 0.5883, + "step": 7987 + }, + { + "epoch": 0.84, + "grad_norm": 2.9397977174918566, + "learning_rate": 6.519493091568108e-07, + "loss": 0.6253, + "step": 7988 + }, + { + "epoch": 0.84, + "grad_norm": 5.2026307795457765, + "learning_rate": 6.511081128794183e-07, + "loss": 0.6072, + "step": 7989 + }, + { + "epoch": 0.84, + "grad_norm": 2.391514496115847, + "learning_rate": 6.502674218437144e-07, + "loss": 0.5603, + "step": 7990 + }, + { + "epoch": 0.84, + "grad_norm": 3.156056886121653, + "learning_rate": 6.494272361473681e-07, + "loss": 0.6524, + "step": 7991 + }, + { + "epoch": 0.84, + "grad_norm": 3.732431840720779, + "learning_rate": 6.485875558879895e-07, + "loss": 0.6481, + "step": 7992 + }, + { + "epoch": 0.84, + "grad_norm": 1.9873437823454094, + "learning_rate": 6.477483811631291e-07, + "loss": 0.5987, + "step": 7993 + }, + { + "epoch": 0.84, + "grad_norm": 2.613588104761939, + "learning_rate": 6.469097120702805e-07, + "loss": 0.6542, + "step": 7994 + }, + { + "epoch": 0.84, + "grad_norm": 2.288884381877685, + "learning_rate": 6.460715487068781e-07, + "loss": 0.5694, + "step": 7995 + }, + { + "epoch": 0.84, + "grad_norm": 3.0335170876836783, + "learning_rate": 6.452338911702994e-07, + "loss": 0.7001, + "step": 7996 + }, + { + "epoch": 0.84, + "grad_norm": 2.7744780023995363, + "learning_rate": 6.443967395578565e-07, + "loss": 0.6253, + "step": 7997 + }, + { + "epoch": 0.84, + "grad_norm": 2.6347984361148904, + "learning_rate": 6.435600939668096e-07, + "loss": 0.592, + "step": 7998 + }, + { + "epoch": 0.84, + "grad_norm": 3.011072253449798, + "learning_rate": 6.42723954494358e-07, + "loss": 0.6553, + "step": 7999 + }, + { + "epoch": 0.84, + "grad_norm": 3.0584864481962364, + "learning_rate": 6.418883212376431e-07, + "loss": 0.5839, + "step": 8000 + }, + { + "epoch": 0.84, + "grad_norm": 2.296221820496963, + "learning_rate": 6.410531942937448e-07, + "loss": 0.5928, + "step": 8001 + }, + { + "epoch": 0.84, + "grad_norm": 2.406097985573007, + "learning_rate": 6.402185737596844e-07, + "loss": 0.6986, + "step": 8002 + }, + { + "epoch": 0.84, + "grad_norm": 2.2589114977154625, + "learning_rate": 6.393844597324278e-07, + "loss": 0.5709, + "step": 8003 + }, + { + "epoch": 0.84, + "grad_norm": 2.7836938958633284, + "learning_rate": 6.385508523088801e-07, + "loss": 0.5831, + "step": 8004 + }, + { + "epoch": 0.84, + "grad_norm": 2.6551153221400194, + "learning_rate": 6.377177515858874e-07, + "loss": 0.6167, + "step": 8005 + }, + { + "epoch": 0.84, + "grad_norm": 2.6555647926872816, + "learning_rate": 6.368851576602347e-07, + "loss": 0.5478, + "step": 8006 + }, + { + "epoch": 0.84, + "grad_norm": 3.0795265737705124, + "learning_rate": 6.360530706286516e-07, + "loss": 0.6234, + "step": 8007 + }, + { + "epoch": 0.84, + "grad_norm": 5.01122414638164, + "learning_rate": 6.352214905878085e-07, + "loss": 0.5999, + "step": 8008 + }, + { + "epoch": 0.84, + "grad_norm": 2.8498907965644706, + "learning_rate": 6.343904176343169e-07, + "loss": 0.6705, + "step": 8009 + }, + { + "epoch": 0.84, + "grad_norm": 2.4166442373218096, + "learning_rate": 6.335598518647251e-07, + "loss": 0.5388, + "step": 8010 + }, + { + "epoch": 0.84, + "grad_norm": 2.6880105299523067, + "learning_rate": 6.327297933755272e-07, + "loss": 0.5941, + "step": 8011 + }, + { + "epoch": 0.84, + "grad_norm": 3.846835922064916, + "learning_rate": 6.319002422631582e-07, + "loss": 0.6664, + "step": 8012 + }, + { + "epoch": 0.84, + "grad_norm": 3.031689173749022, + "learning_rate": 6.310711986239926e-07, + "loss": 0.5696, + "step": 8013 + }, + { + "epoch": 0.84, + "grad_norm": 2.73606226733396, + "learning_rate": 6.302426625543457e-07, + "loss": 0.5136, + "step": 8014 + }, + { + "epoch": 0.84, + "grad_norm": 3.9278610126869204, + "learning_rate": 6.294146341504742e-07, + "loss": 0.6988, + "step": 8015 + }, + { + "epoch": 0.84, + "grad_norm": 2.6587658045083002, + "learning_rate": 6.285871135085758e-07, + "loss": 0.586, + "step": 8016 + }, + { + "epoch": 0.84, + "grad_norm": 2.5101848696418028, + "learning_rate": 6.277601007247913e-07, + "loss": 0.6072, + "step": 8017 + }, + { + "epoch": 0.84, + "grad_norm": 3.094591388241789, + "learning_rate": 6.269335958951995e-07, + "loss": 0.6101, + "step": 8018 + }, + { + "epoch": 0.84, + "grad_norm": 2.1190721327022017, + "learning_rate": 6.2610759911582e-07, + "loss": 0.5325, + "step": 8019 + }, + { + "epoch": 0.84, + "grad_norm": 2.394049016837729, + "learning_rate": 6.252821104826163e-07, + "loss": 0.6149, + "step": 8020 + }, + { + "epoch": 0.84, + "grad_norm": 2.185048534616482, + "learning_rate": 6.244571300914909e-07, + "loss": 0.5355, + "step": 8021 + }, + { + "epoch": 0.84, + "grad_norm": 0.9946377288465473, + "learning_rate": 6.23632658038289e-07, + "loss": 0.5187, + "step": 8022 + }, + { + "epoch": 0.84, + "grad_norm": 3.8433532638524928, + "learning_rate": 6.228086944187939e-07, + "loss": 0.6301, + "step": 8023 + }, + { + "epoch": 0.84, + "grad_norm": 3.239562343118617, + "learning_rate": 6.219852393287302e-07, + "loss": 0.6108, + "step": 8024 + }, + { + "epoch": 0.84, + "grad_norm": 3.19755867950319, + "learning_rate": 6.211622928637662e-07, + "loss": 0.5238, + "step": 8025 + }, + { + "epoch": 0.84, + "grad_norm": 2.1376360439152657, + "learning_rate": 6.2033985511951e-07, + "loss": 0.6205, + "step": 8026 + }, + { + "epoch": 0.84, + "grad_norm": 2.563081310031987, + "learning_rate": 6.19517926191509e-07, + "loss": 0.6733, + "step": 8027 + }, + { + "epoch": 0.84, + "grad_norm": 3.266713807274647, + "learning_rate": 6.186965061752515e-07, + "loss": 0.5803, + "step": 8028 + }, + { + "epoch": 0.84, + "grad_norm": 2.33107338075643, + "learning_rate": 6.178755951661692e-07, + "loss": 0.542, + "step": 8029 + }, + { + "epoch": 0.84, + "grad_norm": 2.1249280857293966, + "learning_rate": 6.170551932596336e-07, + "loss": 0.6044, + "step": 8030 + }, + { + "epoch": 0.85, + "grad_norm": 3.222877055640638, + "learning_rate": 6.162353005509558e-07, + "loss": 0.5909, + "step": 8031 + }, + { + "epoch": 0.85, + "grad_norm": 2.466145340026216, + "learning_rate": 6.154159171353879e-07, + "loss": 0.6029, + "step": 8032 + }, + { + "epoch": 0.85, + "grad_norm": 2.5315597654614805, + "learning_rate": 6.145970431081238e-07, + "loss": 0.5896, + "step": 8033 + }, + { + "epoch": 0.85, + "grad_norm": 2.769252012832659, + "learning_rate": 6.137786785642985e-07, + "loss": 0.6102, + "step": 8034 + }, + { + "epoch": 0.85, + "grad_norm": 2.4509615775141977, + "learning_rate": 6.129608235989881e-07, + "loss": 0.5448, + "step": 8035 + }, + { + "epoch": 0.85, + "grad_norm": 2.141998734885848, + "learning_rate": 6.121434783072077e-07, + "loss": 0.6219, + "step": 8036 + }, + { + "epoch": 0.85, + "grad_norm": 2.090304082790666, + "learning_rate": 6.113266427839126e-07, + "loss": 0.5533, + "step": 8037 + }, + { + "epoch": 0.85, + "grad_norm": 2.290451018341766, + "learning_rate": 6.105103171240018e-07, + "loss": 0.6181, + "step": 8038 + }, + { + "epoch": 0.85, + "grad_norm": 5.270211096142163, + "learning_rate": 6.096945014223149e-07, + "loss": 0.614, + "step": 8039 + }, + { + "epoch": 0.85, + "grad_norm": 2.9037913388153433, + "learning_rate": 6.088791957736301e-07, + "loss": 0.6391, + "step": 8040 + }, + { + "epoch": 0.85, + "grad_norm": 2.3997821521809874, + "learning_rate": 6.080644002726655e-07, + "loss": 0.6159, + "step": 8041 + }, + { + "epoch": 0.85, + "grad_norm": 2.436406794174699, + "learning_rate": 6.072501150140824e-07, + "loss": 0.6379, + "step": 8042 + }, + { + "epoch": 0.85, + "grad_norm": 3.1984050878329087, + "learning_rate": 6.064363400924839e-07, + "loss": 0.6351, + "step": 8043 + }, + { + "epoch": 0.85, + "grad_norm": 2.9565521370159096, + "learning_rate": 6.056230756024123e-07, + "loss": 0.5821, + "step": 8044 + }, + { + "epoch": 0.85, + "grad_norm": 3.4666008560066732, + "learning_rate": 6.048103216383472e-07, + "loss": 0.5522, + "step": 8045 + }, + { + "epoch": 0.85, + "grad_norm": 2.3667556756148342, + "learning_rate": 6.03998078294713e-07, + "loss": 0.5512, + "step": 8046 + }, + { + "epoch": 0.85, + "grad_norm": 2.6465130816201254, + "learning_rate": 6.031863456658754e-07, + "loss": 0.5849, + "step": 8047 + }, + { + "epoch": 0.85, + "grad_norm": 3.036938049317216, + "learning_rate": 6.023751238461389e-07, + "loss": 0.5304, + "step": 8048 + }, + { + "epoch": 0.85, + "grad_norm": 2.075249746973687, + "learning_rate": 6.015644129297482e-07, + "loss": 0.6031, + "step": 8049 + }, + { + "epoch": 0.85, + "grad_norm": 2.8124802810968377, + "learning_rate": 6.007542130108885e-07, + "loss": 0.6553, + "step": 8050 + }, + { + "epoch": 0.85, + "grad_norm": 6.474131988219833, + "learning_rate": 5.999445241836877e-07, + "loss": 0.5124, + "step": 8051 + }, + { + "epoch": 0.85, + "grad_norm": 2.668817739499102, + "learning_rate": 5.991353465422134e-07, + "loss": 0.5076, + "step": 8052 + }, + { + "epoch": 0.85, + "grad_norm": 2.411857351900381, + "learning_rate": 5.983266801804732e-07, + "loss": 0.6072, + "step": 8053 + }, + { + "epoch": 0.85, + "grad_norm": 2.4264765584589165, + "learning_rate": 5.975185251924143e-07, + "loss": 0.6211, + "step": 8054 + }, + { + "epoch": 0.85, + "grad_norm": 3.102878662708729, + "learning_rate": 5.967108816719264e-07, + "loss": 0.6705, + "step": 8055 + }, + { + "epoch": 0.85, + "grad_norm": 3.401655319504964, + "learning_rate": 5.959037497128401e-07, + "loss": 0.6786, + "step": 8056 + }, + { + "epoch": 0.85, + "grad_norm": 3.152666465422462, + "learning_rate": 5.950971294089258e-07, + "loss": 0.6148, + "step": 8057 + }, + { + "epoch": 0.85, + "grad_norm": 2.675848160084293, + "learning_rate": 5.942910208538943e-07, + "loss": 0.595, + "step": 8058 + }, + { + "epoch": 0.85, + "grad_norm": 3.2338755878945475, + "learning_rate": 5.934854241413951e-07, + "loss": 0.62, + "step": 8059 + }, + { + "epoch": 0.85, + "grad_norm": 2.7539432304617, + "learning_rate": 5.926803393650215e-07, + "loss": 0.563, + "step": 8060 + }, + { + "epoch": 0.85, + "grad_norm": 2.418072027591026, + "learning_rate": 5.918757666183067e-07, + "loss": 0.5881, + "step": 8061 + }, + { + "epoch": 0.85, + "grad_norm": 2.4209012407142225, + "learning_rate": 5.91071705994723e-07, + "loss": 0.5809, + "step": 8062 + }, + { + "epoch": 0.85, + "grad_norm": 4.302902584346649, + "learning_rate": 5.902681575876822e-07, + "loss": 0.5901, + "step": 8063 + }, + { + "epoch": 0.85, + "grad_norm": 2.3794113470801763, + "learning_rate": 5.894651214905395e-07, + "loss": 0.5332, + "step": 8064 + }, + { + "epoch": 0.85, + "grad_norm": 2.4361869128303573, + "learning_rate": 5.88662597796591e-07, + "loss": 0.5329, + "step": 8065 + }, + { + "epoch": 0.85, + "grad_norm": 5.815110075655275, + "learning_rate": 5.878605865990694e-07, + "loss": 0.6081, + "step": 8066 + }, + { + "epoch": 0.85, + "grad_norm": 2.448042678880235, + "learning_rate": 5.870590879911498e-07, + "loss": 0.5438, + "step": 8067 + }, + { + "epoch": 0.85, + "grad_norm": 2.6312265037825413, + "learning_rate": 5.862581020659491e-07, + "loss": 0.6526, + "step": 8068 + }, + { + "epoch": 0.85, + "grad_norm": 3.162596770198441, + "learning_rate": 5.854576289165232e-07, + "loss": 0.6465, + "step": 8069 + }, + { + "epoch": 0.85, + "grad_norm": 2.5578744017745305, + "learning_rate": 5.846576686358696e-07, + "loss": 0.6271, + "step": 8070 + }, + { + "epoch": 0.85, + "grad_norm": 2.8100954598723407, + "learning_rate": 5.838582213169247e-07, + "loss": 0.6043, + "step": 8071 + }, + { + "epoch": 0.85, + "grad_norm": 3.2170401537564057, + "learning_rate": 5.830592870525647e-07, + "loss": 0.6683, + "step": 8072 + }, + { + "epoch": 0.85, + "grad_norm": 2.379729669844667, + "learning_rate": 5.822608659356093e-07, + "loss": 0.5669, + "step": 8073 + }, + { + "epoch": 0.85, + "grad_norm": 2.759833858853163, + "learning_rate": 5.814629580588165e-07, + "loss": 0.679, + "step": 8074 + }, + { + "epoch": 0.85, + "grad_norm": 2.379428037812773, + "learning_rate": 5.80665563514885e-07, + "loss": 0.5477, + "step": 8075 + }, + { + "epoch": 0.85, + "grad_norm": 0.9686713188136623, + "learning_rate": 5.798686823964517e-07, + "loss": 0.5656, + "step": 8076 + }, + { + "epoch": 0.85, + "grad_norm": 2.208901084034373, + "learning_rate": 5.79072314796098e-07, + "loss": 0.6539, + "step": 8077 + }, + { + "epoch": 0.85, + "grad_norm": 2.319119693164557, + "learning_rate": 5.78276460806343e-07, + "loss": 0.5599, + "step": 8078 + }, + { + "epoch": 0.85, + "grad_norm": 3.486494110324032, + "learning_rate": 5.77481120519649e-07, + "loss": 0.6557, + "step": 8079 + }, + { + "epoch": 0.85, + "grad_norm": 2.356169785828175, + "learning_rate": 5.766862940284124e-07, + "loss": 0.5505, + "step": 8080 + }, + { + "epoch": 0.85, + "grad_norm": 3.308551249984606, + "learning_rate": 5.758919814249753e-07, + "loss": 0.6307, + "step": 8081 + }, + { + "epoch": 0.85, + "grad_norm": 3.1735857620440453, + "learning_rate": 5.750981828016189e-07, + "loss": 0.5625, + "step": 8082 + }, + { + "epoch": 0.85, + "grad_norm": 3.0910390532362695, + "learning_rate": 5.743048982505656e-07, + "loss": 0.5263, + "step": 8083 + }, + { + "epoch": 0.85, + "grad_norm": 3.0386446705294796, + "learning_rate": 5.73512127863976e-07, + "loss": 0.5617, + "step": 8084 + }, + { + "epoch": 0.85, + "grad_norm": 2.7279151957638255, + "learning_rate": 5.727198717339511e-07, + "loss": 0.6422, + "step": 8085 + }, + { + "epoch": 0.85, + "grad_norm": 2.677171114462176, + "learning_rate": 5.719281299525331e-07, + "loss": 0.5548, + "step": 8086 + }, + { + "epoch": 0.85, + "grad_norm": 3.209112258602768, + "learning_rate": 5.711369026117053e-07, + "loss": 0.6056, + "step": 8087 + }, + { + "epoch": 0.85, + "grad_norm": 2.9912493406236726, + "learning_rate": 5.703461898033902e-07, + "loss": 0.6632, + "step": 8088 + }, + { + "epoch": 0.85, + "grad_norm": 3.8418625332520318, + "learning_rate": 5.695559916194488e-07, + "loss": 0.6912, + "step": 8089 + }, + { + "epoch": 0.85, + "grad_norm": 2.516446973072753, + "learning_rate": 5.687663081516853e-07, + "loss": 0.6293, + "step": 8090 + }, + { + "epoch": 0.85, + "grad_norm": 3.7840898044343776, + "learning_rate": 5.679771394918427e-07, + "loss": 0.5642, + "step": 8091 + }, + { + "epoch": 0.85, + "grad_norm": 2.6457572435732244, + "learning_rate": 5.671884857316051e-07, + "loss": 0.559, + "step": 8092 + }, + { + "epoch": 0.85, + "grad_norm": 2.3271517036024933, + "learning_rate": 5.66400346962595e-07, + "loss": 0.7182, + "step": 8093 + }, + { + "epoch": 0.85, + "grad_norm": 2.7706715409747082, + "learning_rate": 5.656127232763759e-07, + "loss": 0.6649, + "step": 8094 + }, + { + "epoch": 0.85, + "grad_norm": 2.34603048762334, + "learning_rate": 5.64825614764452e-07, + "loss": 0.6284, + "step": 8095 + }, + { + "epoch": 0.85, + "grad_norm": 2.0215689336292253, + "learning_rate": 5.640390215182683e-07, + "loss": 0.6201, + "step": 8096 + }, + { + "epoch": 0.85, + "grad_norm": 9.53100525886137, + "learning_rate": 5.632529436292083e-07, + "loss": 0.5778, + "step": 8097 + }, + { + "epoch": 0.85, + "grad_norm": 2.126583426550784, + "learning_rate": 5.624673811885945e-07, + "loss": 0.61, + "step": 8098 + }, + { + "epoch": 0.85, + "grad_norm": 2.4029139519399867, + "learning_rate": 5.616823342876932e-07, + "loss": 0.5994, + "step": 8099 + }, + { + "epoch": 0.85, + "grad_norm": 2.3048343587136615, + "learning_rate": 5.608978030177087e-07, + "loss": 0.5854, + "step": 8100 + }, + { + "epoch": 0.85, + "grad_norm": 2.276243991774919, + "learning_rate": 5.601137874697859e-07, + "loss": 0.6745, + "step": 8101 + }, + { + "epoch": 0.85, + "grad_norm": 2.5942109454123052, + "learning_rate": 5.593302877350076e-07, + "loss": 0.6314, + "step": 8102 + }, + { + "epoch": 0.85, + "grad_norm": 2.9318122332058856, + "learning_rate": 5.585473039044004e-07, + "loss": 0.5933, + "step": 8103 + }, + { + "epoch": 0.85, + "grad_norm": 2.4441412822757806, + "learning_rate": 5.577648360689281e-07, + "loss": 0.6418, + "step": 8104 + }, + { + "epoch": 0.85, + "grad_norm": 4.2034597908955575, + "learning_rate": 5.569828843194969e-07, + "loss": 0.5708, + "step": 8105 + }, + { + "epoch": 0.85, + "grad_norm": 2.6115667573813166, + "learning_rate": 5.562014487469502e-07, + "loss": 0.6829, + "step": 8106 + }, + { + "epoch": 0.85, + "grad_norm": 2.6541755315222266, + "learning_rate": 5.554205294420733e-07, + "loss": 0.5536, + "step": 8107 + }, + { + "epoch": 0.85, + "grad_norm": 2.9752658226736735, + "learning_rate": 5.546401264955909e-07, + "loss": 0.5814, + "step": 8108 + }, + { + "epoch": 0.85, + "grad_norm": 3.8346292608939656, + "learning_rate": 5.538602399981696e-07, + "loss": 0.593, + "step": 8109 + }, + { + "epoch": 0.85, + "grad_norm": 2.279904270343523, + "learning_rate": 5.530808700404128e-07, + "loss": 0.6239, + "step": 8110 + }, + { + "epoch": 0.85, + "grad_norm": 2.4794073490515975, + "learning_rate": 5.523020167128651e-07, + "loss": 0.5705, + "step": 8111 + }, + { + "epoch": 0.85, + "grad_norm": 6.4668680618890395, + "learning_rate": 5.51523680106012e-07, + "loss": 0.6668, + "step": 8112 + }, + { + "epoch": 0.85, + "grad_norm": 3.484931444733449, + "learning_rate": 5.507458603102783e-07, + "loss": 0.6922, + "step": 8113 + }, + { + "epoch": 0.85, + "grad_norm": 4.674438977252659, + "learning_rate": 5.499685574160312e-07, + "loss": 0.5529, + "step": 8114 + }, + { + "epoch": 0.85, + "grad_norm": 2.284875736493241, + "learning_rate": 5.491917715135719e-07, + "loss": 0.5868, + "step": 8115 + }, + { + "epoch": 0.85, + "grad_norm": 2.901045185802944, + "learning_rate": 5.484155026931459e-07, + "loss": 0.6206, + "step": 8116 + }, + { + "epoch": 0.85, + "grad_norm": 2.196587431457302, + "learning_rate": 5.476397510449389e-07, + "loss": 0.6526, + "step": 8117 + }, + { + "epoch": 0.85, + "grad_norm": 3.1539318616200513, + "learning_rate": 5.468645166590758e-07, + "loss": 0.6617, + "step": 8118 + }, + { + "epoch": 0.85, + "grad_norm": 2.4155197419898746, + "learning_rate": 5.46089799625621e-07, + "loss": 0.5818, + "step": 8119 + }, + { + "epoch": 0.85, + "grad_norm": 2.6201210125396237, + "learning_rate": 5.453156000345772e-07, + "loss": 0.5792, + "step": 8120 + }, + { + "epoch": 0.85, + "grad_norm": 2.457503317771027, + "learning_rate": 5.445419179758893e-07, + "loss": 0.5982, + "step": 8121 + }, + { + "epoch": 0.85, + "grad_norm": 2.551730796116664, + "learning_rate": 5.437687535394431e-07, + "loss": 0.5919, + "step": 8122 + }, + { + "epoch": 0.85, + "grad_norm": 2.726936420412654, + "learning_rate": 5.429961068150619e-07, + "loss": 0.5739, + "step": 8123 + }, + { + "epoch": 0.85, + "grad_norm": 2.536441146532936, + "learning_rate": 5.422239778925076e-07, + "loss": 0.6811, + "step": 8124 + }, + { + "epoch": 0.85, + "grad_norm": 2.9590736750871485, + "learning_rate": 5.414523668614857e-07, + "loss": 0.5417, + "step": 8125 + }, + { + "epoch": 0.86, + "grad_norm": 3.7180436990498595, + "learning_rate": 5.406812738116396e-07, + "loss": 0.6244, + "step": 8126 + }, + { + "epoch": 0.86, + "grad_norm": 2.2060907410932096, + "learning_rate": 5.399106988325543e-07, + "loss": 0.5769, + "step": 8127 + }, + { + "epoch": 0.86, + "grad_norm": 2.9386958343089704, + "learning_rate": 5.39140642013749e-07, + "loss": 0.5423, + "step": 8128 + }, + { + "epoch": 0.86, + "grad_norm": 2.5672726543123683, + "learning_rate": 5.383711034446892e-07, + "loss": 0.5791, + "step": 8129 + }, + { + "epoch": 0.86, + "grad_norm": 2.6077162998901873, + "learning_rate": 5.376020832147777e-07, + "loss": 0.6193, + "step": 8130 + }, + { + "epoch": 0.86, + "grad_norm": 3.6993050614435203, + "learning_rate": 5.368335814133569e-07, + "loss": 0.5434, + "step": 8131 + }, + { + "epoch": 0.86, + "grad_norm": 2.2676443993494813, + "learning_rate": 5.360655981297097e-07, + "loss": 0.5061, + "step": 8132 + }, + { + "epoch": 0.86, + "grad_norm": 3.204912156598437, + "learning_rate": 5.352981334530555e-07, + "loss": 0.5548, + "step": 8133 + }, + { + "epoch": 0.86, + "grad_norm": 3.3583543994309633, + "learning_rate": 5.345311874725584e-07, + "loss": 0.5469, + "step": 8134 + }, + { + "epoch": 0.86, + "grad_norm": 2.73031391634573, + "learning_rate": 5.337647602773211e-07, + "loss": 0.6302, + "step": 8135 + }, + { + "epoch": 0.86, + "grad_norm": 2.6740573030486434, + "learning_rate": 5.329988519563828e-07, + "loss": 0.5692, + "step": 8136 + }, + { + "epoch": 0.86, + "grad_norm": 1.9923350695353295, + "learning_rate": 5.322334625987241e-07, + "loss": 0.5938, + "step": 8137 + }, + { + "epoch": 0.86, + "grad_norm": 2.8706603816339067, + "learning_rate": 5.314685922932666e-07, + "loss": 0.5283, + "step": 8138 + }, + { + "epoch": 0.86, + "grad_norm": 2.4156359250535178, + "learning_rate": 5.30704241128871e-07, + "loss": 0.6129, + "step": 8139 + }, + { + "epoch": 0.86, + "grad_norm": 2.789191292597911, + "learning_rate": 5.299404091943383e-07, + "loss": 0.6423, + "step": 8140 + }, + { + "epoch": 0.86, + "grad_norm": 2.3542836512039425, + "learning_rate": 5.291770965784076e-07, + "loss": 0.6534, + "step": 8141 + }, + { + "epoch": 0.86, + "grad_norm": 2.6693420950157534, + "learning_rate": 5.284143033697565e-07, + "loss": 0.5847, + "step": 8142 + }, + { + "epoch": 0.86, + "grad_norm": 2.4801931250434497, + "learning_rate": 5.276520296570053e-07, + "loss": 0.6162, + "step": 8143 + }, + { + "epoch": 0.86, + "grad_norm": 2.6330298094643676, + "learning_rate": 5.268902755287148e-07, + "loss": 0.5991, + "step": 8144 + }, + { + "epoch": 0.86, + "grad_norm": 2.514352693873068, + "learning_rate": 5.26129041073381e-07, + "loss": 0.5545, + "step": 8145 + }, + { + "epoch": 0.86, + "grad_norm": 2.334792126153137, + "learning_rate": 5.253683263794418e-07, + "loss": 0.5464, + "step": 8146 + }, + { + "epoch": 0.86, + "grad_norm": 2.269857305345398, + "learning_rate": 5.246081315352758e-07, + "loss": 0.6648, + "step": 8147 + }, + { + "epoch": 0.86, + "grad_norm": 2.5865095430846656, + "learning_rate": 5.238484566292002e-07, + "loss": 0.7269, + "step": 8148 + }, + { + "epoch": 0.86, + "grad_norm": 2.4282223527075573, + "learning_rate": 5.230893017494731e-07, + "loss": 0.5743, + "step": 8149 + }, + { + "epoch": 0.86, + "grad_norm": 1.0285336151160749, + "learning_rate": 5.223306669842876e-07, + "loss": 0.5332, + "step": 8150 + }, + { + "epoch": 0.86, + "grad_norm": 2.5241337376310526, + "learning_rate": 5.215725524217818e-07, + "loss": 0.555, + "step": 8151 + }, + { + "epoch": 0.86, + "grad_norm": 3.159756791738227, + "learning_rate": 5.20814958150031e-07, + "loss": 0.6031, + "step": 8152 + }, + { + "epoch": 0.86, + "grad_norm": 2.682488901407614, + "learning_rate": 5.200578842570508e-07, + "loss": 0.5975, + "step": 8153 + }, + { + "epoch": 0.86, + "grad_norm": 0.9504110630710011, + "learning_rate": 5.19301330830796e-07, + "loss": 0.559, + "step": 8154 + }, + { + "epoch": 0.86, + "grad_norm": 2.283295404157167, + "learning_rate": 5.185452979591593e-07, + "loss": 0.647, + "step": 8155 + }, + { + "epoch": 0.86, + "grad_norm": 2.895291731557172, + "learning_rate": 5.177897857299752e-07, + "loss": 0.5969, + "step": 8156 + }, + { + "epoch": 0.86, + "grad_norm": 2.0204477401700256, + "learning_rate": 5.170347942310177e-07, + "loss": 0.4857, + "step": 8157 + }, + { + "epoch": 0.86, + "grad_norm": 3.281207098958877, + "learning_rate": 5.162803235499992e-07, + "loss": 0.5998, + "step": 8158 + }, + { + "epoch": 0.86, + "grad_norm": 2.9244566758863595, + "learning_rate": 5.155263737745703e-07, + "loss": 0.5635, + "step": 8159 + }, + { + "epoch": 0.86, + "grad_norm": 2.0701940095232954, + "learning_rate": 5.147729449923244e-07, + "loss": 0.5843, + "step": 8160 + }, + { + "epoch": 0.86, + "grad_norm": 2.5067250985588987, + "learning_rate": 5.140200372907921e-07, + "loss": 0.6793, + "step": 8161 + }, + { + "epoch": 0.86, + "grad_norm": 3.105940578606609, + "learning_rate": 5.132676507574463e-07, + "loss": 0.6469, + "step": 8162 + }, + { + "epoch": 0.86, + "grad_norm": 3.241058072116501, + "learning_rate": 5.125157854796925e-07, + "loss": 0.4969, + "step": 8163 + }, + { + "epoch": 0.86, + "grad_norm": 2.792486187126245, + "learning_rate": 5.11764441544883e-07, + "loss": 0.6391, + "step": 8164 + }, + { + "epoch": 0.86, + "grad_norm": 2.829466955623266, + "learning_rate": 5.11013619040307e-07, + "loss": 0.7384, + "step": 8165 + }, + { + "epoch": 0.86, + "grad_norm": 2.4147912110071825, + "learning_rate": 5.10263318053193e-07, + "loss": 0.6223, + "step": 8166 + }, + { + "epoch": 0.86, + "grad_norm": 2.362586700546461, + "learning_rate": 5.095135386707084e-07, + "loss": 0.588, + "step": 8167 + }, + { + "epoch": 0.86, + "grad_norm": 4.445979909568467, + "learning_rate": 5.087642809799587e-07, + "loss": 0.6189, + "step": 8168 + }, + { + "epoch": 0.86, + "grad_norm": 2.732920831150975, + "learning_rate": 5.080155450679924e-07, + "loss": 0.5372, + "step": 8169 + }, + { + "epoch": 0.86, + "grad_norm": 2.705560498251637, + "learning_rate": 5.072673310217957e-07, + "loss": 0.6203, + "step": 8170 + }, + { + "epoch": 0.86, + "grad_norm": 2.31980030969563, + "learning_rate": 5.065196389282939e-07, + "loss": 0.5589, + "step": 8171 + }, + { + "epoch": 0.86, + "grad_norm": 2.5656538295462887, + "learning_rate": 5.057724688743498e-07, + "loss": 0.6339, + "step": 8172 + }, + { + "epoch": 0.86, + "grad_norm": 2.6519050302952736, + "learning_rate": 5.050258209467684e-07, + "loss": 0.7224, + "step": 8173 + }, + { + "epoch": 0.86, + "grad_norm": 2.591412532091134, + "learning_rate": 5.042796952322943e-07, + "loss": 0.5866, + "step": 8174 + }, + { + "epoch": 0.86, + "grad_norm": 3.212827161480501, + "learning_rate": 5.035340918176096e-07, + "loss": 0.7674, + "step": 8175 + }, + { + "epoch": 0.86, + "grad_norm": 2.487115162626594, + "learning_rate": 5.027890107893368e-07, + "loss": 0.6352, + "step": 8176 + }, + { + "epoch": 0.86, + "grad_norm": 2.360221538239416, + "learning_rate": 5.020444522340351e-07, + "loss": 0.5827, + "step": 8177 + }, + { + "epoch": 0.86, + "grad_norm": 7.6272603981979445, + "learning_rate": 5.013004162382068e-07, + "loss": 0.5703, + "step": 8178 + }, + { + "epoch": 0.86, + "grad_norm": 2.297902416786526, + "learning_rate": 5.005569028882928e-07, + "loss": 0.6296, + "step": 8179 + }, + { + "epoch": 0.86, + "grad_norm": 2.272681457850102, + "learning_rate": 4.998139122706713e-07, + "loss": 0.6528, + "step": 8180 + }, + { + "epoch": 0.86, + "grad_norm": 2.8422597029287604, + "learning_rate": 4.990714444716594e-07, + "loss": 0.6148, + "step": 8181 + }, + { + "epoch": 0.86, + "grad_norm": 2.645227694382434, + "learning_rate": 4.983294995775167e-07, + "loss": 0.6103, + "step": 8182 + }, + { + "epoch": 0.86, + "grad_norm": 2.283592322689147, + "learning_rate": 4.975880776744397e-07, + "loss": 0.5401, + "step": 8183 + }, + { + "epoch": 0.86, + "grad_norm": 2.1952627320177975, + "learning_rate": 4.968471788485663e-07, + "loss": 0.5755, + "step": 8184 + }, + { + "epoch": 0.86, + "grad_norm": 2.4356967067289714, + "learning_rate": 4.961068031859684e-07, + "loss": 0.6096, + "step": 8185 + }, + { + "epoch": 0.86, + "grad_norm": 2.8567686330899567, + "learning_rate": 4.953669507726633e-07, + "loss": 0.6889, + "step": 8186 + }, + { + "epoch": 0.86, + "grad_norm": 3.5528236481312305, + "learning_rate": 4.946276216946034e-07, + "loss": 0.6052, + "step": 8187 + }, + { + "epoch": 0.86, + "grad_norm": 3.0251800181776676, + "learning_rate": 4.938888160376842e-07, + "loss": 0.6357, + "step": 8188 + }, + { + "epoch": 0.86, + "grad_norm": 3.306752775047198, + "learning_rate": 4.931505338877363e-07, + "loss": 0.4966, + "step": 8189 + }, + { + "epoch": 0.86, + "grad_norm": 3.467733910306402, + "learning_rate": 4.924127753305308e-07, + "loss": 0.5819, + "step": 8190 + }, + { + "epoch": 0.86, + "grad_norm": 2.5766576901577807, + "learning_rate": 4.916755404517787e-07, + "loss": 0.6238, + "step": 8191 + }, + { + "epoch": 0.86, + "grad_norm": 6.477146410617243, + "learning_rate": 4.909388293371309e-07, + "loss": 0.5792, + "step": 8192 + }, + { + "epoch": 0.86, + "grad_norm": 2.725439865413354, + "learning_rate": 4.902026420721756e-07, + "loss": 0.589, + "step": 8193 + }, + { + "epoch": 0.86, + "grad_norm": 3.1642328795374106, + "learning_rate": 4.894669787424399e-07, + "loss": 0.5731, + "step": 8194 + }, + { + "epoch": 0.86, + "grad_norm": 2.7228885313758933, + "learning_rate": 4.887318394333923e-07, + "loss": 0.6069, + "step": 8195 + }, + { + "epoch": 0.86, + "grad_norm": 4.187726267604148, + "learning_rate": 4.879972242304382e-07, + "loss": 0.5925, + "step": 8196 + }, + { + "epoch": 0.86, + "grad_norm": 2.4064432279219896, + "learning_rate": 4.872631332189259e-07, + "loss": 0.6251, + "step": 8197 + }, + { + "epoch": 0.86, + "grad_norm": 2.0589344787984993, + "learning_rate": 4.865295664841363e-07, + "loss": 0.5597, + "step": 8198 + }, + { + "epoch": 0.86, + "grad_norm": 2.3406278362981645, + "learning_rate": 4.857965241112938e-07, + "loss": 0.5722, + "step": 8199 + }, + { + "epoch": 0.86, + "grad_norm": 3.988444067366749, + "learning_rate": 4.850640061855627e-07, + "loss": 0.6008, + "step": 8200 + }, + { + "epoch": 0.86, + "grad_norm": 2.7834751537926765, + "learning_rate": 4.843320127920442e-07, + "loss": 0.5944, + "step": 8201 + }, + { + "epoch": 0.86, + "grad_norm": 2.5609186141419173, + "learning_rate": 4.836005440157798e-07, + "loss": 0.6557, + "step": 8202 + }, + { + "epoch": 0.86, + "grad_norm": 2.3038313600115945, + "learning_rate": 4.828695999417471e-07, + "loss": 0.6159, + "step": 8203 + }, + { + "epoch": 0.86, + "grad_norm": 2.3290658242492674, + "learning_rate": 4.821391806548664e-07, + "loss": 0.6091, + "step": 8204 + }, + { + "epoch": 0.86, + "grad_norm": 2.7246422374040185, + "learning_rate": 4.814092862399971e-07, + "loss": 0.6781, + "step": 8205 + }, + { + "epoch": 0.86, + "grad_norm": 2.543683504573766, + "learning_rate": 4.806799167819354e-07, + "loss": 0.5268, + "step": 8206 + }, + { + "epoch": 0.86, + "grad_norm": 2.248828186134925, + "learning_rate": 4.799510723654154e-07, + "loss": 0.5292, + "step": 8207 + }, + { + "epoch": 0.86, + "grad_norm": 3.487968354722907, + "learning_rate": 4.792227530751137e-07, + "loss": 0.618, + "step": 8208 + }, + { + "epoch": 0.86, + "grad_norm": 3.112110087489705, + "learning_rate": 4.784949589956444e-07, + "loss": 0.6297, + "step": 8209 + }, + { + "epoch": 0.86, + "grad_norm": 3.553846046938456, + "learning_rate": 4.777676902115613e-07, + "loss": 0.6159, + "step": 8210 + }, + { + "epoch": 0.86, + "grad_norm": 2.8837009978624635, + "learning_rate": 4.770409468073562e-07, + "loss": 0.6905, + "step": 8211 + }, + { + "epoch": 0.86, + "grad_norm": 3.1032584634222546, + "learning_rate": 4.7631472886745746e-07, + "loss": 0.6244, + "step": 8212 + }, + { + "epoch": 0.86, + "grad_norm": 2.367855236507304, + "learning_rate": 4.755890364762372e-07, + "loss": 0.5847, + "step": 8213 + }, + { + "epoch": 0.86, + "grad_norm": 0.9490535768407505, + "learning_rate": 4.748638697180052e-07, + "loss": 0.6026, + "step": 8214 + }, + { + "epoch": 0.86, + "grad_norm": 3.6375758139751895, + "learning_rate": 4.741392286770075e-07, + "loss": 0.5925, + "step": 8215 + }, + { + "epoch": 0.86, + "grad_norm": 2.8159200769119366, + "learning_rate": 4.734151134374304e-07, + "loss": 0.5985, + "step": 8216 + }, + { + "epoch": 0.86, + "grad_norm": 3.1013992053905923, + "learning_rate": 4.7269152408340067e-07, + "loss": 0.718, + "step": 8217 + }, + { + "epoch": 0.86, + "grad_norm": 2.2854215939813325, + "learning_rate": 4.7196846069898216e-07, + "loss": 0.5956, + "step": 8218 + }, + { + "epoch": 0.86, + "grad_norm": 2.3465869398994483, + "learning_rate": 4.71245923368181e-07, + "loss": 0.535, + "step": 8219 + }, + { + "epoch": 0.86, + "grad_norm": 2.521050340647012, + "learning_rate": 4.7052391217493497e-07, + "loss": 0.596, + "step": 8220 + }, + { + "epoch": 0.87, + "grad_norm": 2.4595843214175392, + "learning_rate": 4.698024272031276e-07, + "loss": 0.5647, + "step": 8221 + }, + { + "epoch": 0.87, + "grad_norm": 3.029712654205638, + "learning_rate": 4.690814685365791e-07, + "loss": 0.6448, + "step": 8222 + }, + { + "epoch": 0.87, + "grad_norm": 2.341790414077542, + "learning_rate": 4.683610362590485e-07, + "loss": 0.5764, + "step": 8223 + }, + { + "epoch": 0.87, + "grad_norm": 4.5428852112483105, + "learning_rate": 4.6764113045423274e-07, + "loss": 0.6887, + "step": 8224 + }, + { + "epoch": 0.87, + "grad_norm": 2.1809278528567937, + "learning_rate": 4.6692175120576834e-07, + "loss": 0.6309, + "step": 8225 + }, + { + "epoch": 0.87, + "grad_norm": 2.412349436289653, + "learning_rate": 4.6620289859723114e-07, + "loss": 0.583, + "step": 8226 + }, + { + "epoch": 0.87, + "grad_norm": 2.6851844120083275, + "learning_rate": 4.65484572712136e-07, + "loss": 0.5324, + "step": 8227 + }, + { + "epoch": 0.87, + "grad_norm": 2.350365679203484, + "learning_rate": 4.6476677363393507e-07, + "loss": 0.6305, + "step": 8228 + }, + { + "epoch": 0.87, + "grad_norm": 2.74850582630496, + "learning_rate": 4.6404950144602e-07, + "loss": 0.6442, + "step": 8229 + }, + { + "epoch": 0.87, + "grad_norm": 2.48312204654485, + "learning_rate": 4.6333275623172137e-07, + "loss": 0.6308, + "step": 8230 + }, + { + "epoch": 0.87, + "grad_norm": 3.447477016671054, + "learning_rate": 4.626165380743086e-07, + "loss": 0.5315, + "step": 8231 + }, + { + "epoch": 0.87, + "grad_norm": 2.6758503396323916, + "learning_rate": 4.6190084705699243e-07, + "loss": 0.6075, + "step": 8232 + }, + { + "epoch": 0.87, + "grad_norm": 2.7690378845704013, + "learning_rate": 4.6118568326291577e-07, + "loss": 0.5348, + "step": 8233 + }, + { + "epoch": 0.87, + "grad_norm": 3.288950693928659, + "learning_rate": 4.604710467751661e-07, + "loss": 0.5712, + "step": 8234 + }, + { + "epoch": 0.87, + "grad_norm": 2.882335822482473, + "learning_rate": 4.5975693767676746e-07, + "loss": 0.5979, + "step": 8235 + }, + { + "epoch": 0.87, + "grad_norm": 3.0747373794610313, + "learning_rate": 4.590433560506841e-07, + "loss": 0.6, + "step": 8236 + }, + { + "epoch": 0.87, + "grad_norm": 2.6607131094292322, + "learning_rate": 4.583303019798174e-07, + "loss": 0.6083, + "step": 8237 + }, + { + "epoch": 0.87, + "grad_norm": 3.6597203389257147, + "learning_rate": 4.576177755470068e-07, + "loss": 0.6753, + "step": 8238 + }, + { + "epoch": 0.87, + "grad_norm": 3.4666690631206585, + "learning_rate": 4.5690577683503214e-07, + "loss": 0.5823, + "step": 8239 + }, + { + "epoch": 0.87, + "grad_norm": 2.426851608730652, + "learning_rate": 4.561943059266122e-07, + "loss": 0.6137, + "step": 8240 + }, + { + "epoch": 0.87, + "grad_norm": 3.5313147718873426, + "learning_rate": 4.554833629044031e-07, + "loss": 0.5988, + "step": 8241 + }, + { + "epoch": 0.87, + "grad_norm": 2.4878565766187477, + "learning_rate": 4.547729478509993e-07, + "loss": 0.6371, + "step": 8242 + }, + { + "epoch": 0.87, + "grad_norm": 12.485806184499575, + "learning_rate": 4.540630608489355e-07, + "loss": 0.5358, + "step": 8243 + }, + { + "epoch": 0.87, + "grad_norm": 2.51891715467682, + "learning_rate": 4.533537019806844e-07, + "loss": 0.5765, + "step": 8244 + }, + { + "epoch": 0.87, + "grad_norm": 2.680356743825882, + "learning_rate": 4.52644871328658e-07, + "loss": 0.6421, + "step": 8245 + }, + { + "epoch": 0.87, + "grad_norm": 1.0197526172225977, + "learning_rate": 4.5193656897520534e-07, + "loss": 0.5282, + "step": 8246 + }, + { + "epoch": 0.87, + "grad_norm": 2.456701500945603, + "learning_rate": 4.5122879500261396e-07, + "loss": 0.5945, + "step": 8247 + }, + { + "epoch": 0.87, + "grad_norm": 5.368043410381294, + "learning_rate": 4.50521549493112e-07, + "loss": 0.6154, + "step": 8248 + }, + { + "epoch": 0.87, + "grad_norm": 4.207890875247668, + "learning_rate": 4.498148325288665e-07, + "loss": 0.6075, + "step": 8249 + }, + { + "epoch": 0.87, + "grad_norm": 2.2518464473290987, + "learning_rate": 4.491086441919801e-07, + "loss": 0.6067, + "step": 8250 + }, + { + "epoch": 0.87, + "grad_norm": 2.2804388518048135, + "learning_rate": 4.484029845644955e-07, + "loss": 0.6258, + "step": 8251 + }, + { + "epoch": 0.87, + "grad_norm": 2.0120465903949816, + "learning_rate": 4.4769785372839493e-07, + "loss": 0.533, + "step": 8252 + }, + { + "epoch": 0.87, + "grad_norm": 2.7135204524309904, + "learning_rate": 4.469932517655978e-07, + "loss": 0.5617, + "step": 8253 + }, + { + "epoch": 0.87, + "grad_norm": 2.9167866927413715, + "learning_rate": 4.462891787579654e-07, + "loss": 0.6386, + "step": 8254 + }, + { + "epoch": 0.87, + "grad_norm": 2.0215478723122295, + "learning_rate": 4.4558563478729113e-07, + "loss": 0.5989, + "step": 8255 + }, + { + "epoch": 0.87, + "grad_norm": 0.910071063555147, + "learning_rate": 4.4488261993531233e-07, + "loss": 0.5422, + "step": 8256 + }, + { + "epoch": 0.87, + "grad_norm": 11.796143039706289, + "learning_rate": 4.441801342837027e-07, + "loss": 0.502, + "step": 8257 + }, + { + "epoch": 0.87, + "grad_norm": 2.342733719631572, + "learning_rate": 4.4347817791407677e-07, + "loss": 0.567, + "step": 8258 + }, + { + "epoch": 0.87, + "grad_norm": 4.3267678983826565, + "learning_rate": 4.4277675090798445e-07, + "loss": 0.5816, + "step": 8259 + }, + { + "epoch": 0.87, + "grad_norm": 3.157334595406685, + "learning_rate": 4.4207585334691493e-07, + "loss": 0.5783, + "step": 8260 + }, + { + "epoch": 0.87, + "grad_norm": 4.343720857313059, + "learning_rate": 4.41375485312297e-07, + "loss": 0.5595, + "step": 8261 + }, + { + "epoch": 0.87, + "grad_norm": 2.109175745260641, + "learning_rate": 4.406756468854989e-07, + "loss": 0.5158, + "step": 8262 + }, + { + "epoch": 0.87, + "grad_norm": 2.3196868345115775, + "learning_rate": 4.3997633814782393e-07, + "loss": 0.5602, + "step": 8263 + }, + { + "epoch": 0.87, + "grad_norm": 2.7303164750460387, + "learning_rate": 4.392775591805154e-07, + "loss": 0.5038, + "step": 8264 + }, + { + "epoch": 0.87, + "grad_norm": 2.74241779032744, + "learning_rate": 4.385793100647567e-07, + "loss": 0.6497, + "step": 8265 + }, + { + "epoch": 0.87, + "grad_norm": 2.430609319723213, + "learning_rate": 4.378815908816675e-07, + "loss": 0.5677, + "step": 8266 + }, + { + "epoch": 0.87, + "grad_norm": 2.7793948165517435, + "learning_rate": 4.371844017123095e-07, + "loss": 0.5426, + "step": 8267 + }, + { + "epoch": 0.87, + "grad_norm": 3.4548513277509145, + "learning_rate": 4.3648774263767624e-07, + "loss": 0.5881, + "step": 8268 + }, + { + "epoch": 0.87, + "grad_norm": 2.2805067121865363, + "learning_rate": 4.3579161373870526e-07, + "loss": 0.6534, + "step": 8269 + }, + { + "epoch": 0.87, + "grad_norm": 2.521977095171458, + "learning_rate": 4.350960150962702e-07, + "loss": 0.5821, + "step": 8270 + }, + { + "epoch": 0.87, + "grad_norm": 3.045332855637481, + "learning_rate": 4.344009467911858e-07, + "loss": 0.5343, + "step": 8271 + }, + { + "epoch": 0.87, + "grad_norm": 3.071925089409981, + "learning_rate": 4.3370640890420145e-07, + "loss": 0.5714, + "step": 8272 + }, + { + "epoch": 0.87, + "grad_norm": 2.3770918532643552, + "learning_rate": 4.3301240151600587e-07, + "loss": 0.6368, + "step": 8273 + }, + { + "epoch": 0.87, + "grad_norm": 2.0377462716813706, + "learning_rate": 4.3231892470722736e-07, + "loss": 0.5848, + "step": 8274 + }, + { + "epoch": 0.87, + "grad_norm": 2.3436477279860513, + "learning_rate": 4.316259785584337e-07, + "loss": 0.524, + "step": 8275 + }, + { + "epoch": 0.87, + "grad_norm": 2.7364690734810475, + "learning_rate": 4.309335631501277e-07, + "loss": 0.635, + "step": 8276 + }, + { + "epoch": 0.87, + "grad_norm": 2.5489141180850075, + "learning_rate": 4.3024167856275166e-07, + "loss": 0.5364, + "step": 8277 + }, + { + "epoch": 0.87, + "grad_norm": 3.3087010035902105, + "learning_rate": 4.2955032487668745e-07, + "loss": 0.5727, + "step": 8278 + }, + { + "epoch": 0.87, + "grad_norm": 2.3875544612935014, + "learning_rate": 4.2885950217225525e-07, + "loss": 0.573, + "step": 8279 + }, + { + "epoch": 0.87, + "grad_norm": 0.997641790782197, + "learning_rate": 4.281692105297125e-07, + "loss": 0.5424, + "step": 8280 + }, + { + "epoch": 0.87, + "grad_norm": 3.5825231781643305, + "learning_rate": 4.2747945002925507e-07, + "loss": 0.604, + "step": 8281 + }, + { + "epoch": 0.87, + "grad_norm": 2.751380772733463, + "learning_rate": 4.267902207510166e-07, + "loss": 0.6387, + "step": 8282 + }, + { + "epoch": 0.87, + "grad_norm": 2.7858717540005844, + "learning_rate": 4.261015227750709e-07, + "loss": 0.5677, + "step": 8283 + }, + { + "epoch": 0.87, + "grad_norm": 1.001147795122434, + "learning_rate": 4.254133561814289e-07, + "loss": 0.5422, + "step": 8284 + }, + { + "epoch": 0.87, + "grad_norm": 2.4257602264136264, + "learning_rate": 4.247257210500394e-07, + "loss": 0.5679, + "step": 8285 + }, + { + "epoch": 0.87, + "grad_norm": 0.98287549752436, + "learning_rate": 4.240386174607891e-07, + "loss": 0.5719, + "step": 8286 + }, + { + "epoch": 0.87, + "grad_norm": 2.0556012748341397, + "learning_rate": 4.2335204549350415e-07, + "loss": 0.5993, + "step": 8287 + }, + { + "epoch": 0.87, + "grad_norm": 2.1184763996018456, + "learning_rate": 4.226660052279491e-07, + "loss": 0.5858, + "step": 8288 + }, + { + "epoch": 0.87, + "grad_norm": 2.4200580318072165, + "learning_rate": 4.219804967438279e-07, + "loss": 0.5608, + "step": 8289 + }, + { + "epoch": 0.87, + "grad_norm": 2.2140173131580885, + "learning_rate": 4.2129552012077636e-07, + "loss": 0.577, + "step": 8290 + }, + { + "epoch": 0.87, + "grad_norm": 2.8022432950471288, + "learning_rate": 4.2061107543837633e-07, + "loss": 0.5129, + "step": 8291 + }, + { + "epoch": 0.87, + "grad_norm": 2.3001424430333746, + "learning_rate": 4.1992716277614365e-07, + "loss": 0.5429, + "step": 8292 + }, + { + "epoch": 0.87, + "grad_norm": 2.0628372093439378, + "learning_rate": 4.1924378221353425e-07, + "loss": 0.5649, + "step": 8293 + }, + { + "epoch": 0.87, + "grad_norm": 3.0938803545878013, + "learning_rate": 4.185609338299407e-07, + "loss": 0.6533, + "step": 8294 + }, + { + "epoch": 0.87, + "grad_norm": 3.2890771263913123, + "learning_rate": 4.178786177046934e-07, + "loss": 0.633, + "step": 8295 + }, + { + "epoch": 0.87, + "grad_norm": 2.1087154492100217, + "learning_rate": 4.1719683391706235e-07, + "loss": 0.5208, + "step": 8296 + }, + { + "epoch": 0.87, + "grad_norm": 3.20130117340025, + "learning_rate": 4.165155825462569e-07, + "loss": 0.66, + "step": 8297 + }, + { + "epoch": 0.87, + "grad_norm": 2.38259009410489, + "learning_rate": 4.158348636714216e-07, + "loss": 0.698, + "step": 8298 + }, + { + "epoch": 0.87, + "grad_norm": 3.955775034898537, + "learning_rate": 4.151546773716392e-07, + "loss": 0.6124, + "step": 8299 + }, + { + "epoch": 0.87, + "grad_norm": 2.598068006883137, + "learning_rate": 4.1447502372593316e-07, + "loss": 0.5196, + "step": 8300 + }, + { + "epoch": 0.87, + "grad_norm": 2.1646914007025493, + "learning_rate": 4.137959028132632e-07, + "loss": 0.631, + "step": 8301 + }, + { + "epoch": 0.87, + "grad_norm": 2.3381901988980673, + "learning_rate": 4.1311731471253e-07, + "loss": 0.5472, + "step": 8302 + }, + { + "epoch": 0.87, + "grad_norm": 4.41287496079573, + "learning_rate": 4.1243925950256616e-07, + "loss": 0.6113, + "step": 8303 + }, + { + "epoch": 0.87, + "grad_norm": 2.709280007140201, + "learning_rate": 4.117617372621474e-07, + "loss": 0.5998, + "step": 8304 + }, + { + "epoch": 0.87, + "grad_norm": 2.2689051954483386, + "learning_rate": 4.11084748069987e-07, + "loss": 0.6169, + "step": 8305 + }, + { + "epoch": 0.87, + "grad_norm": 2.3500449701909503, + "learning_rate": 4.1040829200473643e-07, + "loss": 0.6554, + "step": 8306 + }, + { + "epoch": 0.87, + "grad_norm": 2.4784819446542996, + "learning_rate": 4.0973236914498284e-07, + "loss": 0.6293, + "step": 8307 + }, + { + "epoch": 0.87, + "grad_norm": 2.513599880145634, + "learning_rate": 4.090569795692528e-07, + "loss": 0.5934, + "step": 8308 + }, + { + "epoch": 0.87, + "grad_norm": 2.8994935140490363, + "learning_rate": 4.08382123356012e-07, + "loss": 0.6651, + "step": 8309 + }, + { + "epoch": 0.87, + "grad_norm": 2.715503190088068, + "learning_rate": 4.077078005836638e-07, + "loss": 0.5931, + "step": 8310 + }, + { + "epoch": 0.87, + "grad_norm": 2.8979413858053675, + "learning_rate": 4.070340113305482e-07, + "loss": 0.6289, + "step": 8311 + }, + { + "epoch": 0.87, + "grad_norm": 2.316723946930344, + "learning_rate": 4.0636075567494384e-07, + "loss": 0.6424, + "step": 8312 + }, + { + "epoch": 0.87, + "grad_norm": 2.6873395112200575, + "learning_rate": 4.056880336950675e-07, + "loss": 0.6267, + "step": 8313 + }, + { + "epoch": 0.87, + "grad_norm": 3.2114274473336453, + "learning_rate": 4.05015845469075e-07, + "loss": 0.6438, + "step": 8314 + }, + { + "epoch": 0.87, + "grad_norm": 2.1479506466214247, + "learning_rate": 4.043441910750595e-07, + "loss": 0.6171, + "step": 8315 + }, + { + "epoch": 0.88, + "grad_norm": 7.941279814979407, + "learning_rate": 4.036730705910513e-07, + "loss": 0.6145, + "step": 8316 + }, + { + "epoch": 0.88, + "grad_norm": 2.467418133816867, + "learning_rate": 4.030024840950181e-07, + "loss": 0.592, + "step": 8317 + }, + { + "epoch": 0.88, + "grad_norm": 2.429232092167091, + "learning_rate": 4.0233243166486804e-07, + "loss": 0.5985, + "step": 8318 + }, + { + "epoch": 0.88, + "grad_norm": 2.965427954109061, + "learning_rate": 4.016629133784461e-07, + "loss": 0.6265, + "step": 8319 + }, + { + "epoch": 0.88, + "grad_norm": 2.148171593670399, + "learning_rate": 4.0099392931353454e-07, + "loss": 0.5878, + "step": 8320 + }, + { + "epoch": 0.88, + "grad_norm": 2.435832787059743, + "learning_rate": 4.0032547954785286e-07, + "loss": 0.6319, + "step": 8321 + }, + { + "epoch": 0.88, + "grad_norm": 2.6890143714422567, + "learning_rate": 3.996575641590611e-07, + "loss": 0.5508, + "step": 8322 + }, + { + "epoch": 0.88, + "grad_norm": 2.2991123215639337, + "learning_rate": 3.9899018322475503e-07, + "loss": 0.5276, + "step": 8323 + }, + { + "epoch": 0.88, + "grad_norm": 5.492027408747583, + "learning_rate": 3.983233368224709e-07, + "loss": 0.639, + "step": 8324 + }, + { + "epoch": 0.88, + "grad_norm": 2.436217435669819, + "learning_rate": 3.9765702502967795e-07, + "loss": 0.5844, + "step": 8325 + }, + { + "epoch": 0.88, + "grad_norm": 2.2735941889345, + "learning_rate": 3.969912479237875e-07, + "loss": 0.5295, + "step": 8326 + }, + { + "epoch": 0.88, + "grad_norm": 2.4581911213333942, + "learning_rate": 3.963260055821477e-07, + "loss": 0.5177, + "step": 8327 + }, + { + "epoch": 0.88, + "grad_norm": 2.4753926504162496, + "learning_rate": 3.9566129808204624e-07, + "loss": 0.6626, + "step": 8328 + }, + { + "epoch": 0.88, + "grad_norm": 2.324542564189604, + "learning_rate": 3.9499712550070513e-07, + "loss": 0.6273, + "step": 8329 + }, + { + "epoch": 0.88, + "grad_norm": 3.14806628673995, + "learning_rate": 3.943334879152849e-07, + "loss": 0.6657, + "step": 8330 + }, + { + "epoch": 0.88, + "grad_norm": 2.2230261640469764, + "learning_rate": 3.936703854028873e-07, + "loss": 0.5797, + "step": 8331 + }, + { + "epoch": 0.88, + "grad_norm": 2.3480123532810095, + "learning_rate": 3.9300781804054887e-07, + "loss": 0.5624, + "step": 8332 + }, + { + "epoch": 0.88, + "grad_norm": 3.982281191390795, + "learning_rate": 3.9234578590524486e-07, + "loss": 0.4974, + "step": 8333 + }, + { + "epoch": 0.88, + "grad_norm": 2.5837995450453755, + "learning_rate": 3.9168428907388755e-07, + "loss": 0.6192, + "step": 8334 + }, + { + "epoch": 0.88, + "grad_norm": 2.346120506226917, + "learning_rate": 3.9102332762332775e-07, + "loss": 0.5529, + "step": 8335 + }, + { + "epoch": 0.88, + "grad_norm": 2.9988359051893787, + "learning_rate": 3.903629016303551e-07, + "loss": 0.6231, + "step": 8336 + }, + { + "epoch": 0.88, + "grad_norm": 2.929097624377366, + "learning_rate": 3.897030111716971e-07, + "loss": 0.5565, + "step": 8337 + }, + { + "epoch": 0.88, + "grad_norm": 4.490972492353745, + "learning_rate": 3.890436563240141e-07, + "loss": 0.5473, + "step": 8338 + }, + { + "epoch": 0.88, + "grad_norm": 6.123463476780392, + "learning_rate": 3.883848371639104e-07, + "loss": 0.6492, + "step": 8339 + }, + { + "epoch": 0.88, + "grad_norm": 2.1499837297094007, + "learning_rate": 3.8772655376792535e-07, + "loss": 0.5906, + "step": 8340 + }, + { + "epoch": 0.88, + "grad_norm": 2.667246941203882, + "learning_rate": 3.870688062125377e-07, + "loss": 0.6599, + "step": 8341 + }, + { + "epoch": 0.88, + "grad_norm": 2.3272993650955405, + "learning_rate": 3.864115945741609e-07, + "loss": 0.601, + "step": 8342 + }, + { + "epoch": 0.88, + "grad_norm": 2.81091418829116, + "learning_rate": 3.8575491892914816e-07, + "loss": 0.6082, + "step": 8343 + }, + { + "epoch": 0.88, + "grad_norm": 1.9611755295529303, + "learning_rate": 3.8509877935379083e-07, + "loss": 0.5979, + "step": 8344 + }, + { + "epoch": 0.88, + "grad_norm": 2.61950855812886, + "learning_rate": 3.8444317592431724e-07, + "loss": 0.6358, + "step": 8345 + }, + { + "epoch": 0.88, + "grad_norm": 0.9406115494374441, + "learning_rate": 3.837881087168932e-07, + "loss": 0.4899, + "step": 8346 + }, + { + "epoch": 0.88, + "grad_norm": 2.38460628767397, + "learning_rate": 3.8313357780762227e-07, + "loss": 0.6332, + "step": 8347 + }, + { + "epoch": 0.88, + "grad_norm": 2.7558226853136016, + "learning_rate": 3.8247958327254586e-07, + "loss": 0.5917, + "step": 8348 + }, + { + "epoch": 0.88, + "grad_norm": 2.5358732790727547, + "learning_rate": 3.8182612518764374e-07, + "loss": 0.545, + "step": 8349 + }, + { + "epoch": 0.88, + "grad_norm": 2.3595312762742418, + "learning_rate": 3.811732036288335e-07, + "loss": 0.608, + "step": 8350 + }, + { + "epoch": 0.88, + "grad_norm": 2.7712731901406173, + "learning_rate": 3.805208186719689e-07, + "loss": 0.6207, + "step": 8351 + }, + { + "epoch": 0.88, + "grad_norm": 0.9142414884157021, + "learning_rate": 3.7986897039284043e-07, + "loss": 0.5117, + "step": 8352 + }, + { + "epoch": 0.88, + "grad_norm": 2.316330933057498, + "learning_rate": 3.792176588671803e-07, + "loss": 0.6112, + "step": 8353 + }, + { + "epoch": 0.88, + "grad_norm": 2.9741114811382747, + "learning_rate": 3.785668841706558e-07, + "loss": 0.6341, + "step": 8354 + }, + { + "epoch": 0.88, + "grad_norm": 2.0963466583939874, + "learning_rate": 3.7791664637887137e-07, + "loss": 0.5715, + "step": 8355 + }, + { + "epoch": 0.88, + "grad_norm": 3.1371953655685396, + "learning_rate": 3.7726694556736943e-07, + "loss": 0.4608, + "step": 8356 + }, + { + "epoch": 0.88, + "grad_norm": 2.2052960552688363, + "learning_rate": 3.7661778181163067e-07, + "loss": 0.5932, + "step": 8357 + }, + { + "epoch": 0.88, + "grad_norm": 2.7273903186451576, + "learning_rate": 3.759691551870737e-07, + "loss": 0.6266, + "step": 8358 + }, + { + "epoch": 0.88, + "grad_norm": 2.5969037148869796, + "learning_rate": 3.753210657690537e-07, + "loss": 0.5756, + "step": 8359 + }, + { + "epoch": 0.88, + "grad_norm": 2.5347492281922293, + "learning_rate": 3.746735136328633e-07, + "loss": 0.6715, + "step": 8360 + }, + { + "epoch": 0.88, + "grad_norm": 2.8106795258984225, + "learning_rate": 3.740264988537329e-07, + "loss": 0.5104, + "step": 8361 + }, + { + "epoch": 0.88, + "grad_norm": 4.00381965510081, + "learning_rate": 3.7338002150683174e-07, + "loss": 0.6463, + "step": 8362 + }, + { + "epoch": 0.88, + "grad_norm": 3.173783144313059, + "learning_rate": 3.727340816672664e-07, + "loss": 0.6332, + "step": 8363 + }, + { + "epoch": 0.88, + "grad_norm": 2.9002556770934373, + "learning_rate": 3.7208867941007974e-07, + "loss": 0.6431, + "step": 8364 + }, + { + "epoch": 0.88, + "grad_norm": 2.7468354426920008, + "learning_rate": 3.7144381481025114e-07, + "loss": 0.5672, + "step": 8365 + }, + { + "epoch": 0.88, + "grad_norm": 2.4532097840078, + "learning_rate": 3.707994879427007e-07, + "loss": 0.6065, + "step": 8366 + }, + { + "epoch": 0.88, + "grad_norm": 2.664239854523862, + "learning_rate": 3.7015569888228464e-07, + "loss": 0.5785, + "step": 8367 + }, + { + "epoch": 0.88, + "grad_norm": 3.1037887455483544, + "learning_rate": 3.6951244770379593e-07, + "loss": 0.5904, + "step": 8368 + }, + { + "epoch": 0.88, + "grad_norm": 2.4768409598228636, + "learning_rate": 3.6886973448196475e-07, + "loss": 0.5523, + "step": 8369 + }, + { + "epoch": 0.88, + "grad_norm": 2.7756714612946563, + "learning_rate": 3.682275592914608e-07, + "loss": 0.6883, + "step": 8370 + }, + { + "epoch": 0.88, + "grad_norm": 0.9591826331531919, + "learning_rate": 3.675859222068895e-07, + "loss": 0.5301, + "step": 8371 + }, + { + "epoch": 0.88, + "grad_norm": 0.9503987597108043, + "learning_rate": 3.669448233027967e-07, + "loss": 0.5432, + "step": 8372 + }, + { + "epoch": 0.88, + "grad_norm": 2.2208818742361625, + "learning_rate": 3.6630426265366003e-07, + "loss": 0.5948, + "step": 8373 + }, + { + "epoch": 0.88, + "grad_norm": 0.9397750897472155, + "learning_rate": 3.6566424033389947e-07, + "loss": 0.5438, + "step": 8374 + }, + { + "epoch": 0.88, + "grad_norm": 2.765113073654002, + "learning_rate": 3.6502475641787107e-07, + "loss": 0.6899, + "step": 8375 + }, + { + "epoch": 0.88, + "grad_norm": 2.389118275586379, + "learning_rate": 3.6438581097986867e-07, + "loss": 0.7196, + "step": 8376 + }, + { + "epoch": 0.88, + "grad_norm": 2.1693703595965332, + "learning_rate": 3.637474040941225e-07, + "loss": 0.5825, + "step": 8377 + }, + { + "epoch": 0.88, + "grad_norm": 3.064136002731287, + "learning_rate": 3.6310953583480024e-07, + "loss": 0.6088, + "step": 8378 + }, + { + "epoch": 0.88, + "grad_norm": 2.293347710145248, + "learning_rate": 3.6247220627600833e-07, + "loss": 0.6058, + "step": 8379 + }, + { + "epoch": 0.88, + "grad_norm": 2.6363547175045237, + "learning_rate": 3.6183541549179025e-07, + "loss": 0.5608, + "step": 8380 + }, + { + "epoch": 0.88, + "grad_norm": 2.2724907939257672, + "learning_rate": 3.6119916355612627e-07, + "loss": 0.5671, + "step": 8381 + }, + { + "epoch": 0.88, + "grad_norm": 3.166605602109891, + "learning_rate": 3.6056345054293283e-07, + "loss": 0.6426, + "step": 8382 + }, + { + "epoch": 0.88, + "grad_norm": 2.242029102578855, + "learning_rate": 3.59928276526067e-07, + "loss": 0.5707, + "step": 8383 + }, + { + "epoch": 0.88, + "grad_norm": 2.7693849737510656, + "learning_rate": 3.592936415793208e-07, + "loss": 0.6228, + "step": 8384 + }, + { + "epoch": 0.88, + "grad_norm": 2.104105928630141, + "learning_rate": 3.586595457764247e-07, + "loss": 0.5438, + "step": 8385 + }, + { + "epoch": 0.88, + "grad_norm": 2.368575714906391, + "learning_rate": 3.580259891910465e-07, + "loss": 0.6578, + "step": 8386 + }, + { + "epoch": 0.88, + "grad_norm": 2.8815306378727747, + "learning_rate": 3.573929718967889e-07, + "loss": 0.6015, + "step": 8387 + }, + { + "epoch": 0.88, + "grad_norm": 2.2975996264972705, + "learning_rate": 3.567604939671959e-07, + "loss": 0.6111, + "step": 8388 + }, + { + "epoch": 0.88, + "grad_norm": 3.8232534667191658, + "learning_rate": 3.561285554757471e-07, + "loss": 0.567, + "step": 8389 + }, + { + "epoch": 0.88, + "grad_norm": 4.616180660257726, + "learning_rate": 3.554971564958587e-07, + "loss": 0.5183, + "step": 8390 + }, + { + "epoch": 0.88, + "grad_norm": 5.4538629426171275, + "learning_rate": 3.548662971008837e-07, + "loss": 0.6621, + "step": 8391 + }, + { + "epoch": 0.88, + "grad_norm": 2.688169563692595, + "learning_rate": 3.5423597736411463e-07, + "loss": 0.6208, + "step": 8392 + }, + { + "epoch": 0.88, + "grad_norm": 3.685828822376977, + "learning_rate": 3.536061973587812e-07, + "loss": 0.7017, + "step": 8393 + }, + { + "epoch": 0.88, + "grad_norm": 3.36466383731313, + "learning_rate": 3.5297695715804825e-07, + "loss": 0.6273, + "step": 8394 + }, + { + "epoch": 0.88, + "grad_norm": 7.105461376785732, + "learning_rate": 3.523482568350184e-07, + "loss": 0.5799, + "step": 8395 + }, + { + "epoch": 0.88, + "grad_norm": 3.2543324276401413, + "learning_rate": 3.517200964627332e-07, + "loss": 0.6232, + "step": 8396 + }, + { + "epoch": 0.88, + "grad_norm": 6.139641724312683, + "learning_rate": 3.510924761141704e-07, + "loss": 0.6816, + "step": 8397 + }, + { + "epoch": 0.88, + "grad_norm": 3.1532567391885493, + "learning_rate": 3.504653958622456e-07, + "loss": 0.5811, + "step": 8398 + }, + { + "epoch": 0.88, + "grad_norm": 2.7533862384085492, + "learning_rate": 3.49838855779811e-07, + "loss": 0.5851, + "step": 8399 + }, + { + "epoch": 0.88, + "grad_norm": 2.2806278164148184, + "learning_rate": 3.492128559396552e-07, + "loss": 0.5411, + "step": 8400 + }, + { + "epoch": 0.88, + "grad_norm": 5.786894398505118, + "learning_rate": 3.485873964145053e-07, + "loss": 0.5874, + "step": 8401 + }, + { + "epoch": 0.88, + "grad_norm": 2.223189773086896, + "learning_rate": 3.479624772770268e-07, + "loss": 0.6098, + "step": 8402 + }, + { + "epoch": 0.88, + "grad_norm": 2.559630254288751, + "learning_rate": 3.4733809859982037e-07, + "loss": 0.5536, + "step": 8403 + }, + { + "epoch": 0.88, + "grad_norm": 3.389108579984836, + "learning_rate": 3.46714260455423e-07, + "loss": 0.517, + "step": 8404 + }, + { + "epoch": 0.88, + "grad_norm": 2.5393552430851085, + "learning_rate": 3.460909629163117e-07, + "loss": 0.6811, + "step": 8405 + }, + { + "epoch": 0.88, + "grad_norm": 2.7176090256610452, + "learning_rate": 3.4546820605489974e-07, + "loss": 0.5495, + "step": 8406 + }, + { + "epoch": 0.88, + "grad_norm": 2.11043021583628, + "learning_rate": 3.448459899435369e-07, + "loss": 0.6564, + "step": 8407 + }, + { + "epoch": 0.88, + "grad_norm": 2.5053404923404297, + "learning_rate": 3.442243146545093e-07, + "loss": 0.5676, + "step": 8408 + }, + { + "epoch": 0.88, + "grad_norm": 2.246167043873439, + "learning_rate": 3.436031802600426e-07, + "loss": 0.6413, + "step": 8409 + }, + { + "epoch": 0.88, + "grad_norm": 2.9518371356414472, + "learning_rate": 3.4298258683229836e-07, + "loss": 0.6431, + "step": 8410 + }, + { + "epoch": 0.89, + "grad_norm": 2.5332175625722124, + "learning_rate": 3.423625344433756e-07, + "loss": 0.6081, + "step": 8411 + }, + { + "epoch": 0.89, + "grad_norm": 0.9693277831137703, + "learning_rate": 3.417430231653096e-07, + "loss": 0.5281, + "step": 8412 + }, + { + "epoch": 0.89, + "grad_norm": 2.1510911382680074, + "learning_rate": 3.4112405307007266e-07, + "loss": 0.6343, + "step": 8413 + }, + { + "epoch": 0.89, + "grad_norm": 2.465031585748283, + "learning_rate": 3.4050562422957624e-07, + "loss": 0.5676, + "step": 8414 + }, + { + "epoch": 0.89, + "grad_norm": 2.664072247681827, + "learning_rate": 3.3988773671566777e-07, + "loss": 0.6398, + "step": 8415 + }, + { + "epoch": 0.89, + "grad_norm": 3.75888208434279, + "learning_rate": 3.3927039060013045e-07, + "loss": 0.6638, + "step": 8416 + }, + { + "epoch": 0.89, + "grad_norm": 2.388691306338053, + "learning_rate": 3.3865358595468635e-07, + "loss": 0.5692, + "step": 8417 + }, + { + "epoch": 0.89, + "grad_norm": 2.0916358970739397, + "learning_rate": 3.380373228509937e-07, + "loss": 0.5645, + "step": 8418 + }, + { + "epoch": 0.89, + "grad_norm": 2.745762404593912, + "learning_rate": 3.374216013606485e-07, + "loss": 0.5756, + "step": 8419 + }, + { + "epoch": 0.89, + "grad_norm": 2.1892760583248885, + "learning_rate": 3.368064215551842e-07, + "loss": 0.6245, + "step": 8420 + }, + { + "epoch": 0.89, + "grad_norm": 2.857345470679988, + "learning_rate": 3.3619178350607016e-07, + "loss": 0.6549, + "step": 8421 + }, + { + "epoch": 0.89, + "grad_norm": 2.379512536999841, + "learning_rate": 3.355776872847122e-07, + "loss": 0.5277, + "step": 8422 + }, + { + "epoch": 0.89, + "grad_norm": 2.321362797949176, + "learning_rate": 3.3496413296245536e-07, + "loss": 0.6036, + "step": 8423 + }, + { + "epoch": 0.89, + "grad_norm": 2.245567667052969, + "learning_rate": 3.343511206105804e-07, + "loss": 0.5352, + "step": 8424 + }, + { + "epoch": 0.89, + "grad_norm": 2.045437582667061, + "learning_rate": 3.3373865030030536e-07, + "loss": 0.5313, + "step": 8425 + }, + { + "epoch": 0.89, + "grad_norm": 2.440444633467484, + "learning_rate": 3.331267221027845e-07, + "loss": 0.602, + "step": 8426 + }, + { + "epoch": 0.89, + "grad_norm": 0.9346701755245185, + "learning_rate": 3.325153360891109e-07, + "loss": 0.5728, + "step": 8427 + }, + { + "epoch": 0.89, + "grad_norm": 0.964299003618511, + "learning_rate": 3.319044923303133e-07, + "loss": 0.5423, + "step": 8428 + }, + { + "epoch": 0.89, + "grad_norm": 2.7005210221802867, + "learning_rate": 3.3129419089735825e-07, + "loss": 0.6404, + "step": 8429 + }, + { + "epoch": 0.89, + "grad_norm": 2.6961944982240307, + "learning_rate": 3.306844318611474e-07, + "loss": 0.6103, + "step": 8430 + }, + { + "epoch": 0.89, + "grad_norm": 2.9859468059962073, + "learning_rate": 3.300752152925213e-07, + "loss": 0.616, + "step": 8431 + }, + { + "epoch": 0.89, + "grad_norm": 2.782849535399388, + "learning_rate": 3.2946654126225776e-07, + "loss": 0.6134, + "step": 8432 + }, + { + "epoch": 0.89, + "grad_norm": 2.3605109038050074, + "learning_rate": 3.288584098410708e-07, + "loss": 0.578, + "step": 8433 + }, + { + "epoch": 0.89, + "grad_norm": 3.0653052962653273, + "learning_rate": 3.282508210996105e-07, + "loss": 0.5938, + "step": 8434 + }, + { + "epoch": 0.89, + "grad_norm": 5.320847704691404, + "learning_rate": 3.276437751084649e-07, + "loss": 0.6857, + "step": 8435 + }, + { + "epoch": 0.89, + "grad_norm": 4.171177977533934, + "learning_rate": 3.270372719381587e-07, + "loss": 0.5834, + "step": 8436 + }, + { + "epoch": 0.89, + "grad_norm": 7.886349099099324, + "learning_rate": 3.264313116591555e-07, + "loss": 0.6321, + "step": 8437 + }, + { + "epoch": 0.89, + "grad_norm": 2.8574070263501765, + "learning_rate": 3.2582589434185184e-07, + "loss": 0.593, + "step": 8438 + }, + { + "epoch": 0.89, + "grad_norm": 0.9494414704404982, + "learning_rate": 3.252210200565842e-07, + "loss": 0.5359, + "step": 8439 + }, + { + "epoch": 0.89, + "grad_norm": 3.1811163106053844, + "learning_rate": 3.2461668887362407e-07, + "loss": 0.6684, + "step": 8440 + }, + { + "epoch": 0.89, + "grad_norm": 2.381250101023003, + "learning_rate": 3.2401290086318315e-07, + "loss": 0.5573, + "step": 8441 + }, + { + "epoch": 0.89, + "grad_norm": 2.4346013172983683, + "learning_rate": 3.2340965609540643e-07, + "loss": 0.6883, + "step": 8442 + }, + { + "epoch": 0.89, + "grad_norm": 2.3133127483274003, + "learning_rate": 3.228069546403767e-07, + "loss": 0.5346, + "step": 8443 + }, + { + "epoch": 0.89, + "grad_norm": 4.184411233315387, + "learning_rate": 3.222047965681141e-07, + "loss": 0.5935, + "step": 8444 + }, + { + "epoch": 0.89, + "grad_norm": 3.0278851604452073, + "learning_rate": 3.2160318194857655e-07, + "loss": 0.57, + "step": 8445 + }, + { + "epoch": 0.89, + "grad_norm": 2.526714549043835, + "learning_rate": 3.210021108516581e-07, + "loss": 0.5528, + "step": 8446 + }, + { + "epoch": 0.89, + "grad_norm": 7.168643812080329, + "learning_rate": 3.204015833471885e-07, + "loss": 0.5877, + "step": 8447 + }, + { + "epoch": 0.89, + "grad_norm": 2.058474171768321, + "learning_rate": 3.1980159950493526e-07, + "loss": 0.6024, + "step": 8448 + }, + { + "epoch": 0.89, + "grad_norm": 2.4688281862366876, + "learning_rate": 3.1920215939460263e-07, + "loss": 0.5965, + "step": 8449 + }, + { + "epoch": 0.89, + "grad_norm": 5.718194185512959, + "learning_rate": 3.186032630858332e-07, + "loss": 0.5933, + "step": 8450 + }, + { + "epoch": 0.89, + "grad_norm": 2.4889377896135856, + "learning_rate": 3.180049106482047e-07, + "loss": 0.5516, + "step": 8451 + }, + { + "epoch": 0.89, + "grad_norm": 12.775983633424683, + "learning_rate": 3.1740710215122985e-07, + "loss": 0.5611, + "step": 8452 + }, + { + "epoch": 0.89, + "grad_norm": 2.2721038696367137, + "learning_rate": 3.1680983766436244e-07, + "loss": 0.6155, + "step": 8453 + }, + { + "epoch": 0.89, + "grad_norm": 3.1139767872960693, + "learning_rate": 3.16213117256991e-07, + "loss": 0.5942, + "step": 8454 + }, + { + "epoch": 0.89, + "grad_norm": 3.630434059537754, + "learning_rate": 3.1561694099843885e-07, + "loss": 0.6442, + "step": 8455 + }, + { + "epoch": 0.89, + "grad_norm": 2.577929607649873, + "learning_rate": 3.1502130895797066e-07, + "loss": 0.6169, + "step": 8456 + }, + { + "epoch": 0.89, + "grad_norm": 2.8393452527215897, + "learning_rate": 3.144262212047833e-07, + "loss": 0.6314, + "step": 8457 + }, + { + "epoch": 0.89, + "grad_norm": 3.113366300161247, + "learning_rate": 3.138316778080125e-07, + "loss": 0.6043, + "step": 8458 + }, + { + "epoch": 0.89, + "grad_norm": 2.6875673979733223, + "learning_rate": 3.1323767883673193e-07, + "loss": 0.5829, + "step": 8459 + }, + { + "epoch": 0.89, + "grad_norm": 2.46249726044324, + "learning_rate": 3.1264422435994977e-07, + "loss": 0.6048, + "step": 8460 + }, + { + "epoch": 0.89, + "grad_norm": 2.3009561213241656, + "learning_rate": 3.120513144466109e-07, + "loss": 0.6559, + "step": 8461 + }, + { + "epoch": 0.89, + "grad_norm": 2.97192574369102, + "learning_rate": 3.114589491655989e-07, + "loss": 0.5194, + "step": 8462 + }, + { + "epoch": 0.89, + "grad_norm": 0.9106568447644855, + "learning_rate": 3.1086712858573396e-07, + "loss": 0.546, + "step": 8463 + }, + { + "epoch": 0.89, + "grad_norm": 2.8679076537404593, + "learning_rate": 3.10275852775771e-07, + "loss": 0.6296, + "step": 8464 + }, + { + "epoch": 0.89, + "grad_norm": 2.538688766180127, + "learning_rate": 3.0968512180440225e-07, + "loss": 0.5939, + "step": 8465 + }, + { + "epoch": 0.89, + "grad_norm": 2.185856983077277, + "learning_rate": 3.090949357402573e-07, + "loss": 0.5949, + "step": 8466 + }, + { + "epoch": 0.89, + "grad_norm": 2.1121250006231236, + "learning_rate": 3.0850529465190295e-07, + "loss": 0.6447, + "step": 8467 + }, + { + "epoch": 0.89, + "grad_norm": 3.842408960010343, + "learning_rate": 3.079161986078427e-07, + "loss": 0.7068, + "step": 8468 + }, + { + "epoch": 0.89, + "grad_norm": 2.2491385619430364, + "learning_rate": 3.073276476765147e-07, + "loss": 0.6113, + "step": 8469 + }, + { + "epoch": 0.89, + "grad_norm": 2.786879233199517, + "learning_rate": 3.0673964192629466e-07, + "loss": 0.5902, + "step": 8470 + }, + { + "epoch": 0.89, + "grad_norm": 3.7520700236563482, + "learning_rate": 3.061521814254964e-07, + "loss": 0.6111, + "step": 8471 + }, + { + "epoch": 0.89, + "grad_norm": 3.037239255860453, + "learning_rate": 3.0556526624237025e-07, + "loss": 0.5664, + "step": 8472 + }, + { + "epoch": 0.89, + "grad_norm": 2.6847363520434486, + "learning_rate": 3.049788964451006e-07, + "loss": 0.6382, + "step": 8473 + }, + { + "epoch": 0.89, + "grad_norm": 2.446565844450911, + "learning_rate": 3.043930721018107e-07, + "loss": 0.5745, + "step": 8474 + }, + { + "epoch": 0.89, + "grad_norm": 2.727557086897107, + "learning_rate": 3.0380779328055945e-07, + "loss": 0.5672, + "step": 8475 + }, + { + "epoch": 0.89, + "grad_norm": 2.677310040784338, + "learning_rate": 3.0322306004934467e-07, + "loss": 0.6836, + "step": 8476 + }, + { + "epoch": 0.89, + "grad_norm": 12.712228333925863, + "learning_rate": 3.026388724760976e-07, + "loss": 0.5878, + "step": 8477 + }, + { + "epoch": 0.89, + "grad_norm": 2.7959771086323078, + "learning_rate": 3.020552306286867e-07, + "loss": 0.5857, + "step": 8478 + }, + { + "epoch": 0.89, + "grad_norm": 2.3823323020029257, + "learning_rate": 3.0147213457491887e-07, + "loss": 0.6022, + "step": 8479 + }, + { + "epoch": 0.89, + "grad_norm": 2.488707780332215, + "learning_rate": 3.0088958438253656e-07, + "loss": 0.5982, + "step": 8480 + }, + { + "epoch": 0.89, + "grad_norm": 2.3382438488043396, + "learning_rate": 3.00307580119219e-07, + "loss": 0.543, + "step": 8481 + }, + { + "epoch": 0.89, + "grad_norm": 1.034614711972979, + "learning_rate": 2.9972612185258155e-07, + "loss": 0.5225, + "step": 8482 + }, + { + "epoch": 0.89, + "grad_norm": 3.30925472166408, + "learning_rate": 2.9914520965017515e-07, + "loss": 0.6756, + "step": 8483 + }, + { + "epoch": 0.89, + "grad_norm": 1.0845217256514865, + "learning_rate": 2.985648435794897e-07, + "loss": 0.5391, + "step": 8484 + }, + { + "epoch": 0.89, + "grad_norm": 2.0947266192143617, + "learning_rate": 2.9798502370795123e-07, + "loss": 0.6038, + "step": 8485 + }, + { + "epoch": 0.89, + "grad_norm": 2.143425501436348, + "learning_rate": 2.974057501029204e-07, + "loss": 0.6399, + "step": 8486 + }, + { + "epoch": 0.89, + "grad_norm": 2.9656804898570805, + "learning_rate": 2.968270228316944e-07, + "loss": 0.5797, + "step": 8487 + }, + { + "epoch": 0.89, + "grad_norm": 3.1462792820258367, + "learning_rate": 2.9624884196151003e-07, + "loss": 0.689, + "step": 8488 + }, + { + "epoch": 0.89, + "grad_norm": 2.3954574670266324, + "learning_rate": 2.956712075595386e-07, + "loss": 0.6849, + "step": 8489 + }, + { + "epoch": 0.89, + "grad_norm": 2.4138440964653873, + "learning_rate": 2.950941196928869e-07, + "loss": 0.6271, + "step": 8490 + }, + { + "epoch": 0.89, + "grad_norm": 2.0756281900786377, + "learning_rate": 2.945175784286003e-07, + "loss": 0.4975, + "step": 8491 + }, + { + "epoch": 0.89, + "grad_norm": 2.6184403922706543, + "learning_rate": 2.93941583833659e-07, + "loss": 0.6096, + "step": 8492 + }, + { + "epoch": 0.89, + "grad_norm": 3.0901832655249137, + "learning_rate": 2.933661359749801e-07, + "loss": 0.6738, + "step": 8493 + }, + { + "epoch": 0.89, + "grad_norm": 3.0034179637882055, + "learning_rate": 2.9279123491941895e-07, + "loss": 0.5927, + "step": 8494 + }, + { + "epoch": 0.89, + "grad_norm": 2.4572881737070493, + "learning_rate": 2.9221688073376497e-07, + "loss": 0.5141, + "step": 8495 + }, + { + "epoch": 0.89, + "grad_norm": 1.101626346769178, + "learning_rate": 2.916430734847442e-07, + "loss": 0.5403, + "step": 8496 + }, + { + "epoch": 0.89, + "grad_norm": 2.641326366393812, + "learning_rate": 2.910698132390211e-07, + "loss": 0.5564, + "step": 8497 + }, + { + "epoch": 0.89, + "grad_norm": 2.471920861357954, + "learning_rate": 2.904971000631951e-07, + "loss": 0.6577, + "step": 8498 + }, + { + "epoch": 0.89, + "grad_norm": 2.4266070572352922, + "learning_rate": 2.899249340238025e-07, + "loss": 0.5911, + "step": 8499 + }, + { + "epoch": 0.89, + "grad_norm": 2.536560963334578, + "learning_rate": 2.893533151873146e-07, + "loss": 0.5689, + "step": 8500 + }, + { + "epoch": 0.89, + "grad_norm": 2.6887191325980644, + "learning_rate": 2.887822436201415e-07, + "loss": 0.634, + "step": 8501 + }, + { + "epoch": 0.89, + "grad_norm": 3.3999765388204106, + "learning_rate": 2.882117193886297e-07, + "loss": 0.5534, + "step": 8502 + }, + { + "epoch": 0.89, + "grad_norm": 2.2917807653579576, + "learning_rate": 2.8764174255905886e-07, + "loss": 0.6365, + "step": 8503 + }, + { + "epoch": 0.89, + "grad_norm": 2.318274584316322, + "learning_rate": 2.870723131976494e-07, + "loss": 0.5574, + "step": 8504 + }, + { + "epoch": 0.89, + "grad_norm": 4.890909130662085, + "learning_rate": 2.865034313705539e-07, + "loss": 0.5967, + "step": 8505 + }, + { + "epoch": 0.9, + "grad_norm": 2.0997713123306756, + "learning_rate": 2.8593509714386456e-07, + "loss": 0.5588, + "step": 8506 + }, + { + "epoch": 0.9, + "grad_norm": 2.617809589685534, + "learning_rate": 2.853673105836091e-07, + "loss": 0.5374, + "step": 8507 + }, + { + "epoch": 0.9, + "grad_norm": 2.68605129954623, + "learning_rate": 2.8480007175575144e-07, + "loss": 0.5403, + "step": 8508 + }, + { + "epoch": 0.9, + "grad_norm": 2.2348488550547074, + "learning_rate": 2.842333807261899e-07, + "loss": 0.575, + "step": 8509 + }, + { + "epoch": 0.9, + "grad_norm": 2.701909082687133, + "learning_rate": 2.836672375607624e-07, + "loss": 0.5603, + "step": 8510 + }, + { + "epoch": 0.9, + "grad_norm": 2.982475752321344, + "learning_rate": 2.831016423252425e-07, + "loss": 0.5673, + "step": 8511 + }, + { + "epoch": 0.9, + "grad_norm": 2.9635176976288626, + "learning_rate": 2.825365950853387e-07, + "loss": 0.6621, + "step": 8512 + }, + { + "epoch": 0.9, + "grad_norm": 12.14753234673202, + "learning_rate": 2.8197209590669573e-07, + "loss": 0.5816, + "step": 8513 + }, + { + "epoch": 0.9, + "grad_norm": 2.4088842765153546, + "learning_rate": 2.814081448548961e-07, + "loss": 0.5515, + "step": 8514 + }, + { + "epoch": 0.9, + "grad_norm": 2.5924726034885115, + "learning_rate": 2.8084474199545907e-07, + "loss": 0.6995, + "step": 8515 + }, + { + "epoch": 0.9, + "grad_norm": 2.319648207905272, + "learning_rate": 2.802818873938373e-07, + "loss": 0.5947, + "step": 8516 + }, + { + "epoch": 0.9, + "grad_norm": 2.9901729743477024, + "learning_rate": 2.79719581115423e-07, + "loss": 0.6025, + "step": 8517 + }, + { + "epoch": 0.9, + "grad_norm": 2.1532085821221703, + "learning_rate": 2.7915782322554265e-07, + "loss": 0.574, + "step": 8518 + }, + { + "epoch": 0.9, + "grad_norm": 2.386635632359815, + "learning_rate": 2.7859661378945966e-07, + "loss": 0.5769, + "step": 8519 + }, + { + "epoch": 0.9, + "grad_norm": 2.495915367797323, + "learning_rate": 2.7803595287237416e-07, + "loss": 0.6033, + "step": 8520 + }, + { + "epoch": 0.9, + "grad_norm": 2.2480787603907597, + "learning_rate": 2.7747584053942236e-07, + "loss": 0.5373, + "step": 8521 + }, + { + "epoch": 0.9, + "grad_norm": 2.755749524388031, + "learning_rate": 2.7691627685567545e-07, + "loss": 0.6319, + "step": 8522 + }, + { + "epoch": 0.9, + "grad_norm": 3.8563418049060942, + "learning_rate": 2.763572618861421e-07, + "loss": 0.6336, + "step": 8523 + }, + { + "epoch": 0.9, + "grad_norm": 0.9803488628787727, + "learning_rate": 2.7579879569576805e-07, + "loss": 0.5288, + "step": 8524 + }, + { + "epoch": 0.9, + "grad_norm": 2.158913363300771, + "learning_rate": 2.7524087834943257e-07, + "loss": 0.5508, + "step": 8525 + }, + { + "epoch": 0.9, + "grad_norm": 2.2990257664705704, + "learning_rate": 2.746835099119555e-07, + "loss": 0.5405, + "step": 8526 + }, + { + "epoch": 0.9, + "grad_norm": 2.722742879522637, + "learning_rate": 2.7412669044808714e-07, + "loss": 0.59, + "step": 8527 + }, + { + "epoch": 0.9, + "grad_norm": 2.7098085005125347, + "learning_rate": 2.7357042002251977e-07, + "loss": 0.6599, + "step": 8528 + }, + { + "epoch": 0.9, + "grad_norm": 2.414604589485784, + "learning_rate": 2.730146986998783e-07, + "loss": 0.5914, + "step": 8529 + }, + { + "epoch": 0.9, + "grad_norm": 2.9999875277016037, + "learning_rate": 2.7245952654472495e-07, + "loss": 0.6089, + "step": 8530 + }, + { + "epoch": 0.9, + "grad_norm": 2.4698756738709617, + "learning_rate": 2.7190490362155706e-07, + "loss": 0.7083, + "step": 8531 + }, + { + "epoch": 0.9, + "grad_norm": 4.0071257148376604, + "learning_rate": 2.7135082999481033e-07, + "loss": 0.6586, + "step": 8532 + }, + { + "epoch": 0.9, + "grad_norm": 2.4350296312759276, + "learning_rate": 2.707973057288554e-07, + "loss": 0.5523, + "step": 8533 + }, + { + "epoch": 0.9, + "grad_norm": 0.9785381511638156, + "learning_rate": 2.7024433088799874e-07, + "loss": 0.5391, + "step": 8534 + }, + { + "epoch": 0.9, + "grad_norm": 3.7213298482636703, + "learning_rate": 2.696919055364827e-07, + "loss": 0.6217, + "step": 8535 + }, + { + "epoch": 0.9, + "grad_norm": 2.7426731105858804, + "learning_rate": 2.691400297384872e-07, + "loss": 0.622, + "step": 8536 + }, + { + "epoch": 0.9, + "grad_norm": 2.9682834230681583, + "learning_rate": 2.6858870355812807e-07, + "loss": 0.6019, + "step": 8537 + }, + { + "epoch": 0.9, + "grad_norm": 2.5959681300857804, + "learning_rate": 2.6803792705945574e-07, + "loss": 0.5257, + "step": 8538 + }, + { + "epoch": 0.9, + "grad_norm": 3.117284087962713, + "learning_rate": 2.674877003064591e-07, + "loss": 0.6142, + "step": 8539 + }, + { + "epoch": 0.9, + "grad_norm": 2.1014683352430614, + "learning_rate": 2.669380233630603e-07, + "loss": 0.6515, + "step": 8540 + }, + { + "epoch": 0.9, + "grad_norm": 2.665551926914057, + "learning_rate": 2.6638889629311994e-07, + "loss": 0.5391, + "step": 8541 + }, + { + "epoch": 0.9, + "grad_norm": 3.222214668746664, + "learning_rate": 2.6584031916043476e-07, + "loss": 0.6326, + "step": 8542 + }, + { + "epoch": 0.9, + "grad_norm": 2.6557840721881734, + "learning_rate": 2.652922920287365e-07, + "loss": 0.557, + "step": 8543 + }, + { + "epoch": 0.9, + "grad_norm": 2.4193798372931403, + "learning_rate": 2.647448149616921e-07, + "loss": 0.5985, + "step": 8544 + }, + { + "epoch": 0.9, + "grad_norm": 4.689574105729332, + "learning_rate": 2.641978880229074e-07, + "loss": 0.608, + "step": 8545 + }, + { + "epoch": 0.9, + "grad_norm": 2.743675378873042, + "learning_rate": 2.636515112759225e-07, + "loss": 0.5973, + "step": 8546 + }, + { + "epoch": 0.9, + "grad_norm": 2.5656395322683725, + "learning_rate": 2.631056847842134e-07, + "loss": 0.5468, + "step": 8547 + }, + { + "epoch": 0.9, + "grad_norm": 3.1810462861004862, + "learning_rate": 2.625604086111927e-07, + "loss": 0.5399, + "step": 8548 + }, + { + "epoch": 0.9, + "grad_norm": 2.5217278501998113, + "learning_rate": 2.620156828202092e-07, + "loss": 0.6972, + "step": 8549 + }, + { + "epoch": 0.9, + "grad_norm": 2.4534326162271425, + "learning_rate": 2.6147150747454776e-07, + "loss": 0.6776, + "step": 8550 + }, + { + "epoch": 0.9, + "grad_norm": 2.7071917479562653, + "learning_rate": 2.609278826374284e-07, + "loss": 0.6411, + "step": 8551 + }, + { + "epoch": 0.9, + "grad_norm": 2.581422750056675, + "learning_rate": 2.6038480837200896e-07, + "loss": 0.6482, + "step": 8552 + }, + { + "epoch": 0.9, + "grad_norm": 2.335476941637362, + "learning_rate": 2.5984228474138115e-07, + "loss": 0.535, + "step": 8553 + }, + { + "epoch": 0.9, + "grad_norm": 2.4914563871633013, + "learning_rate": 2.593003118085746e-07, + "loss": 0.539, + "step": 8554 + }, + { + "epoch": 0.9, + "grad_norm": 2.477999239774418, + "learning_rate": 2.5875888963655396e-07, + "loss": 0.5921, + "step": 8555 + }, + { + "epoch": 0.9, + "grad_norm": 3.0840194634488522, + "learning_rate": 2.582180182882205e-07, + "loss": 0.579, + "step": 8556 + }, + { + "epoch": 0.9, + "grad_norm": 2.206238807030622, + "learning_rate": 2.576776978264095e-07, + "loss": 0.6189, + "step": 8557 + }, + { + "epoch": 0.9, + "grad_norm": 2.937734436218125, + "learning_rate": 2.5713792831389473e-07, + "loss": 0.6302, + "step": 8558 + }, + { + "epoch": 0.9, + "grad_norm": 2.714921870739376, + "learning_rate": 2.565987098133865e-07, + "loss": 0.597, + "step": 8559 + }, + { + "epoch": 0.9, + "grad_norm": 3.5143184938458027, + "learning_rate": 2.56060042387527e-07, + "loss": 0.6705, + "step": 8560 + }, + { + "epoch": 0.9, + "grad_norm": 2.617749300044612, + "learning_rate": 2.5552192609890004e-07, + "loss": 0.6626, + "step": 8561 + }, + { + "epoch": 0.9, + "grad_norm": 2.422359435875593, + "learning_rate": 2.5498436101001946e-07, + "loss": 0.5862, + "step": 8562 + }, + { + "epoch": 0.9, + "grad_norm": 2.87556986453871, + "learning_rate": 2.544473471833403e-07, + "loss": 0.5421, + "step": 8563 + }, + { + "epoch": 0.9, + "grad_norm": 2.5521477189588757, + "learning_rate": 2.5391088468124934e-07, + "loss": 0.6127, + "step": 8564 + }, + { + "epoch": 0.9, + "grad_norm": 2.626589882214039, + "learning_rate": 2.533749735660729e-07, + "loss": 0.6057, + "step": 8565 + }, + { + "epoch": 0.9, + "grad_norm": 2.272089311491973, + "learning_rate": 2.528396139000705e-07, + "loss": 0.5462, + "step": 8566 + }, + { + "epoch": 0.9, + "grad_norm": 2.497924231522892, + "learning_rate": 2.5230480574543914e-07, + "loss": 0.6173, + "step": 8567 + }, + { + "epoch": 0.9, + "grad_norm": 2.5946958140611014, + "learning_rate": 2.5177054916431186e-07, + "loss": 0.5676, + "step": 8568 + }, + { + "epoch": 0.9, + "grad_norm": 4.622437188040537, + "learning_rate": 2.5123684421875627e-07, + "loss": 0.611, + "step": 8569 + }, + { + "epoch": 0.9, + "grad_norm": 2.4619075515236317, + "learning_rate": 2.507036909707766e-07, + "loss": 0.597, + "step": 8570 + }, + { + "epoch": 0.9, + "grad_norm": 2.130051490960946, + "learning_rate": 2.5017108948231284e-07, + "loss": 0.571, + "step": 8571 + }, + { + "epoch": 0.9, + "grad_norm": 2.3145413732517084, + "learning_rate": 2.4963903981524265e-07, + "loss": 0.6216, + "step": 8572 + }, + { + "epoch": 0.9, + "grad_norm": 2.6281633995236575, + "learning_rate": 2.4910754203137597e-07, + "loss": 0.5933, + "step": 8573 + }, + { + "epoch": 0.9, + "grad_norm": 0.9962463423913068, + "learning_rate": 2.4857659619246246e-07, + "loss": 0.4788, + "step": 8574 + }, + { + "epoch": 0.9, + "grad_norm": 3.509203947561383, + "learning_rate": 2.4804620236018376e-07, + "loss": 0.6344, + "step": 8575 + }, + { + "epoch": 0.9, + "grad_norm": 2.3901552696802058, + "learning_rate": 2.475163605961617e-07, + "loss": 0.6128, + "step": 8576 + }, + { + "epoch": 0.9, + "grad_norm": 2.851401165089648, + "learning_rate": 2.4698707096195094e-07, + "loss": 0.5852, + "step": 8577 + }, + { + "epoch": 0.9, + "grad_norm": 3.7359676704634306, + "learning_rate": 2.4645833351904235e-07, + "loss": 0.5583, + "step": 8578 + }, + { + "epoch": 0.9, + "grad_norm": 2.636845767665651, + "learning_rate": 2.4593014832886344e-07, + "loss": 0.6295, + "step": 8579 + }, + { + "epoch": 0.9, + "grad_norm": 4.772427246864486, + "learning_rate": 2.4540251545277726e-07, + "loss": 0.6169, + "step": 8580 + }, + { + "epoch": 0.9, + "grad_norm": 2.3697955879151564, + "learning_rate": 2.448754349520832e-07, + "loss": 0.5962, + "step": 8581 + }, + { + "epoch": 0.9, + "grad_norm": 2.7219924370672866, + "learning_rate": 2.4434890688801504e-07, + "loss": 0.6225, + "step": 8582 + }, + { + "epoch": 0.9, + "grad_norm": 2.566350444650246, + "learning_rate": 2.4382293132174384e-07, + "loss": 0.5551, + "step": 8583 + }, + { + "epoch": 0.9, + "grad_norm": 2.5372826204659056, + "learning_rate": 2.4329750831437514e-07, + "loss": 0.6557, + "step": 8584 + }, + { + "epoch": 0.9, + "grad_norm": 0.8353964126790068, + "learning_rate": 2.427726379269524e-07, + "loss": 0.5587, + "step": 8585 + }, + { + "epoch": 0.9, + "grad_norm": 2.1888874583305187, + "learning_rate": 2.422483202204523e-07, + "loss": 0.692, + "step": 8586 + }, + { + "epoch": 0.9, + "grad_norm": 2.4217812609046585, + "learning_rate": 2.417245552557901e-07, + "loss": 0.5887, + "step": 8587 + }, + { + "epoch": 0.9, + "grad_norm": 3.31796627980692, + "learning_rate": 2.4120134309381315e-07, + "loss": 0.7178, + "step": 8588 + }, + { + "epoch": 0.9, + "grad_norm": 2.213415629562026, + "learning_rate": 2.406786837953079e-07, + "loss": 0.5923, + "step": 8589 + }, + { + "epoch": 0.9, + "grad_norm": 0.941272484421341, + "learning_rate": 2.401565774209963e-07, + "loss": 0.5444, + "step": 8590 + }, + { + "epoch": 0.9, + "grad_norm": 3.0359025963461446, + "learning_rate": 2.396350240315337e-07, + "loss": 0.5498, + "step": 8591 + }, + { + "epoch": 0.9, + "grad_norm": 2.6306647881917153, + "learning_rate": 2.391140236875128e-07, + "loss": 0.6251, + "step": 8592 + }, + { + "epoch": 0.9, + "grad_norm": 2.6487766622684172, + "learning_rate": 2.3859357644946233e-07, + "loss": 0.6471, + "step": 8593 + }, + { + "epoch": 0.9, + "grad_norm": 2.645864611230044, + "learning_rate": 2.3807368237784735e-07, + "loss": 0.5693, + "step": 8594 + }, + { + "epoch": 0.9, + "grad_norm": 2.212215219692308, + "learning_rate": 2.3755434153306555e-07, + "loss": 0.5178, + "step": 8595 + }, + { + "epoch": 0.9, + "grad_norm": 2.5525173270216794, + "learning_rate": 2.370355539754543e-07, + "loss": 0.5043, + "step": 8596 + }, + { + "epoch": 0.9, + "grad_norm": 2.973557160116561, + "learning_rate": 2.3651731976528314e-07, + "loss": 0.5824, + "step": 8597 + }, + { + "epoch": 0.9, + "grad_norm": 2.368358688574046, + "learning_rate": 2.3599963896276113e-07, + "loss": 0.6403, + "step": 8598 + }, + { + "epoch": 0.9, + "grad_norm": 3.541437487586155, + "learning_rate": 2.354825116280285e-07, + "loss": 0.6646, + "step": 8599 + }, + { + "epoch": 0.9, + "grad_norm": 2.6360131144611247, + "learning_rate": 2.3496593782116607e-07, + "loss": 0.5936, + "step": 8600 + }, + { + "epoch": 0.91, + "grad_norm": 2.4729013605037418, + "learning_rate": 2.3444991760218526e-07, + "loss": 0.6654, + "step": 8601 + }, + { + "epoch": 0.91, + "grad_norm": 2.4321075434936485, + "learning_rate": 2.3393445103103762e-07, + "loss": 0.6181, + "step": 8602 + }, + { + "epoch": 0.91, + "grad_norm": 2.557292183237137, + "learning_rate": 2.334195381676091e-07, + "loss": 0.6118, + "step": 8603 + }, + { + "epoch": 0.91, + "grad_norm": 2.7334084120232323, + "learning_rate": 2.3290517907171962e-07, + "loss": 0.5789, + "step": 8604 + }, + { + "epoch": 0.91, + "grad_norm": 3.2292498265081755, + "learning_rate": 2.3239137380312526e-07, + "loss": 0.6254, + "step": 8605 + }, + { + "epoch": 0.91, + "grad_norm": 2.2190579271215136, + "learning_rate": 2.3187812242151996e-07, + "loss": 0.6202, + "step": 8606 + }, + { + "epoch": 0.91, + "grad_norm": 2.6556837855127435, + "learning_rate": 2.3136542498653103e-07, + "loss": 0.5697, + "step": 8607 + }, + { + "epoch": 0.91, + "grad_norm": 2.923481136322459, + "learning_rate": 2.308532815577219e-07, + "loss": 0.5777, + "step": 8608 + }, + { + "epoch": 0.91, + "grad_norm": 2.1026727157977447, + "learning_rate": 2.3034169219459336e-07, + "loss": 0.5759, + "step": 8609 + }, + { + "epoch": 0.91, + "grad_norm": 2.2566630684707336, + "learning_rate": 2.2983065695657835e-07, + "loss": 0.605, + "step": 8610 + }, + { + "epoch": 0.91, + "grad_norm": 3.0101849840783994, + "learning_rate": 2.2932017590304945e-07, + "loss": 0.6228, + "step": 8611 + }, + { + "epoch": 0.91, + "grad_norm": 2.9371051066532257, + "learning_rate": 2.2881024909331084e-07, + "loss": 0.5492, + "step": 8612 + }, + { + "epoch": 0.91, + "grad_norm": 3.594191038842626, + "learning_rate": 2.2830087658660626e-07, + "loss": 0.6286, + "step": 8613 + }, + { + "epoch": 0.91, + "grad_norm": 2.2603072217979463, + "learning_rate": 2.2779205844211115e-07, + "loss": 0.704, + "step": 8614 + }, + { + "epoch": 0.91, + "grad_norm": 0.8989627424235347, + "learning_rate": 2.2728379471893992e-07, + "loss": 0.4864, + "step": 8615 + }, + { + "epoch": 0.91, + "grad_norm": 16.729220556087142, + "learning_rate": 2.2677608547614195e-07, + "loss": 0.5918, + "step": 8616 + }, + { + "epoch": 0.91, + "grad_norm": 2.2488176924383954, + "learning_rate": 2.2626893077269952e-07, + "loss": 0.582, + "step": 8617 + }, + { + "epoch": 0.91, + "grad_norm": 2.4475963095140343, + "learning_rate": 2.2576233066753328e-07, + "loss": 0.6073, + "step": 8618 + }, + { + "epoch": 0.91, + "grad_norm": 2.50017796036233, + "learning_rate": 2.252562852194984e-07, + "loss": 0.6705, + "step": 8619 + }, + { + "epoch": 0.91, + "grad_norm": 1.0355653284721114, + "learning_rate": 2.2475079448738667e-07, + "loss": 0.5797, + "step": 8620 + }, + { + "epoch": 0.91, + "grad_norm": 5.740952195511575, + "learning_rate": 2.2424585852992287e-07, + "loss": 0.6276, + "step": 8621 + }, + { + "epoch": 0.91, + "grad_norm": 2.624621427504004, + "learning_rate": 2.2374147740577058e-07, + "loss": 0.5743, + "step": 8622 + }, + { + "epoch": 0.91, + "grad_norm": 2.4340730079804325, + "learning_rate": 2.2323765117352625e-07, + "loss": 0.5585, + "step": 8623 + }, + { + "epoch": 0.91, + "grad_norm": 2.1286673679109693, + "learning_rate": 2.2273437989172308e-07, + "loss": 0.6019, + "step": 8624 + }, + { + "epoch": 0.91, + "grad_norm": 4.029911691462066, + "learning_rate": 2.2223166361883096e-07, + "loss": 0.5091, + "step": 8625 + }, + { + "epoch": 0.91, + "grad_norm": 3.1602463814705746, + "learning_rate": 2.217295024132532e-07, + "loss": 0.627, + "step": 8626 + }, + { + "epoch": 0.91, + "grad_norm": 3.55085504048545, + "learning_rate": 2.2122789633332808e-07, + "loss": 0.6616, + "step": 8627 + }, + { + "epoch": 0.91, + "grad_norm": 3.3530586363040547, + "learning_rate": 2.2072684543733236e-07, + "loss": 0.6542, + "step": 8628 + }, + { + "epoch": 0.91, + "grad_norm": 2.8871688456843403, + "learning_rate": 2.2022634978347668e-07, + "loss": 0.7099, + "step": 8629 + }, + { + "epoch": 0.91, + "grad_norm": 2.5020918168833344, + "learning_rate": 2.197264094299062e-07, + "loss": 0.6461, + "step": 8630 + }, + { + "epoch": 0.91, + "grad_norm": 2.9739866771027983, + "learning_rate": 2.192270244347039e-07, + "loss": 0.6444, + "step": 8631 + }, + { + "epoch": 0.91, + "grad_norm": 2.453684270588019, + "learning_rate": 2.1872819485588504e-07, + "loss": 0.5701, + "step": 8632 + }, + { + "epoch": 0.91, + "grad_norm": 2.9298917878750905, + "learning_rate": 2.1822992075140382e-07, + "loss": 0.6017, + "step": 8633 + }, + { + "epoch": 0.91, + "grad_norm": 4.21769668421678, + "learning_rate": 2.177322021791478e-07, + "loss": 0.552, + "step": 8634 + }, + { + "epoch": 0.91, + "grad_norm": 2.333398707424515, + "learning_rate": 2.1723503919694022e-07, + "loss": 0.5818, + "step": 8635 + }, + { + "epoch": 0.91, + "grad_norm": 0.9600492185451742, + "learning_rate": 2.167384318625404e-07, + "loss": 0.5513, + "step": 8636 + }, + { + "epoch": 0.91, + "grad_norm": 2.1619655902215342, + "learning_rate": 2.1624238023364164e-07, + "loss": 0.6071, + "step": 8637 + }, + { + "epoch": 0.91, + "grad_norm": 2.30009030782602, + "learning_rate": 2.1574688436787616e-07, + "loss": 0.5676, + "step": 8638 + }, + { + "epoch": 0.91, + "grad_norm": 2.261626619968899, + "learning_rate": 2.152519443228074e-07, + "loss": 0.6164, + "step": 8639 + }, + { + "epoch": 0.91, + "grad_norm": 2.5969295591844355, + "learning_rate": 2.1475756015593597e-07, + "loss": 0.577, + "step": 8640 + }, + { + "epoch": 0.91, + "grad_norm": 2.8285213912360896, + "learning_rate": 2.142637319246982e-07, + "loss": 0.5747, + "step": 8641 + }, + { + "epoch": 0.91, + "grad_norm": 2.6663798524618434, + "learning_rate": 2.1377045968646648e-07, + "loss": 0.6167, + "step": 8642 + }, + { + "epoch": 0.91, + "grad_norm": 3.875625734126703, + "learning_rate": 2.1327774349854669e-07, + "loss": 0.642, + "step": 8643 + }, + { + "epoch": 0.91, + "grad_norm": 1.0023913927338757, + "learning_rate": 2.1278558341818245e-07, + "loss": 0.549, + "step": 8644 + }, + { + "epoch": 0.91, + "grad_norm": 2.492821927184673, + "learning_rate": 2.1229397950254971e-07, + "loss": 0.6158, + "step": 8645 + }, + { + "epoch": 0.91, + "grad_norm": 3.954417059877932, + "learning_rate": 2.1180293180876333e-07, + "loss": 0.5902, + "step": 8646 + }, + { + "epoch": 0.91, + "grad_norm": 2.4328009320824684, + "learning_rate": 2.11312440393871e-07, + "loss": 0.6403, + "step": 8647 + }, + { + "epoch": 0.91, + "grad_norm": 2.6647018614968547, + "learning_rate": 2.1082250531485658e-07, + "loss": 0.5715, + "step": 8648 + }, + { + "epoch": 0.91, + "grad_norm": 1.0060574812970433, + "learning_rate": 2.1033312662863902e-07, + "loss": 0.5627, + "step": 8649 + }, + { + "epoch": 0.91, + "grad_norm": 2.641443035192098, + "learning_rate": 2.0984430439207337e-07, + "loss": 0.5118, + "step": 8650 + }, + { + "epoch": 0.91, + "grad_norm": 3.9361538566284255, + "learning_rate": 2.0935603866194975e-07, + "loss": 0.6679, + "step": 8651 + }, + { + "epoch": 0.91, + "grad_norm": 2.39373289597829, + "learning_rate": 2.0886832949499337e-07, + "loss": 0.5668, + "step": 8652 + }, + { + "epoch": 0.91, + "grad_norm": 3.3861532699457686, + "learning_rate": 2.083811769478644e-07, + "loss": 0.6525, + "step": 8653 + }, + { + "epoch": 0.91, + "grad_norm": 3.936995441518683, + "learning_rate": 2.0789458107715876e-07, + "loss": 0.7025, + "step": 8654 + }, + { + "epoch": 0.91, + "grad_norm": 2.349046763306768, + "learning_rate": 2.0740854193940896e-07, + "loss": 0.6309, + "step": 8655 + }, + { + "epoch": 0.91, + "grad_norm": 2.2596999928277253, + "learning_rate": 2.0692305959107982e-07, + "loss": 0.6011, + "step": 8656 + }, + { + "epoch": 0.91, + "grad_norm": 0.9228352521725202, + "learning_rate": 2.0643813408857516e-07, + "loss": 0.4917, + "step": 8657 + }, + { + "epoch": 0.91, + "grad_norm": 2.6385919262613746, + "learning_rate": 2.05953765488231e-07, + "loss": 0.5755, + "step": 8658 + }, + { + "epoch": 0.91, + "grad_norm": 2.2137952255821753, + "learning_rate": 2.0546995384632008e-07, + "loss": 0.6005, + "step": 8659 + }, + { + "epoch": 0.91, + "grad_norm": 2.5803538966845636, + "learning_rate": 2.0498669921905024e-07, + "loss": 0.7106, + "step": 8660 + }, + { + "epoch": 0.91, + "grad_norm": 2.9792364157241105, + "learning_rate": 2.045040016625649e-07, + "loss": 0.5886, + "step": 8661 + }, + { + "epoch": 0.91, + "grad_norm": 2.3666959842671464, + "learning_rate": 2.04021861232942e-07, + "loss": 0.5773, + "step": 8662 + }, + { + "epoch": 0.91, + "grad_norm": 3.6824064705468786, + "learning_rate": 2.0354027798619557e-07, + "loss": 0.6144, + "step": 8663 + }, + { + "epoch": 0.91, + "grad_norm": 2.435875779491401, + "learning_rate": 2.030592519782748e-07, + "loss": 0.6403, + "step": 8664 + }, + { + "epoch": 0.91, + "grad_norm": 2.6128038565557805, + "learning_rate": 2.0257878326506386e-07, + "loss": 0.6337, + "step": 8665 + }, + { + "epoch": 0.91, + "grad_norm": 2.282841908523947, + "learning_rate": 2.020988719023814e-07, + "loss": 0.5942, + "step": 8666 + }, + { + "epoch": 0.91, + "grad_norm": 4.370234830147251, + "learning_rate": 2.0161951794598233e-07, + "loss": 0.5383, + "step": 8667 + }, + { + "epoch": 0.91, + "grad_norm": 2.1849633361416223, + "learning_rate": 2.011407214515576e-07, + "loss": 0.6604, + "step": 8668 + }, + { + "epoch": 0.91, + "grad_norm": 1.9975177789546006, + "learning_rate": 2.0066248247473108e-07, + "loss": 0.6303, + "step": 8669 + }, + { + "epoch": 0.91, + "grad_norm": 2.522737370998732, + "learning_rate": 2.0018480107106496e-07, + "loss": 0.5688, + "step": 8670 + }, + { + "epoch": 0.91, + "grad_norm": 2.4993421173578714, + "learning_rate": 1.9970767729605268e-07, + "loss": 0.5487, + "step": 8671 + }, + { + "epoch": 0.91, + "grad_norm": 3.527990417897828, + "learning_rate": 1.992311112051265e-07, + "loss": 0.5992, + "step": 8672 + }, + { + "epoch": 0.91, + "grad_norm": 6.350490216687445, + "learning_rate": 1.9875510285365273e-07, + "loss": 0.5894, + "step": 8673 + }, + { + "epoch": 0.91, + "grad_norm": 2.5923844574515673, + "learning_rate": 1.9827965229693215e-07, + "loss": 0.6643, + "step": 8674 + }, + { + "epoch": 0.91, + "grad_norm": 2.8259634397156805, + "learning_rate": 1.978047595902005e-07, + "loss": 0.5955, + "step": 8675 + }, + { + "epoch": 0.91, + "grad_norm": 3.233787620175592, + "learning_rate": 1.973304247886304e-07, + "loss": 0.5663, + "step": 8676 + }, + { + "epoch": 0.91, + "grad_norm": 2.9852555325806147, + "learning_rate": 1.9685664794732884e-07, + "loss": 0.6144, + "step": 8677 + }, + { + "epoch": 0.91, + "grad_norm": 4.705513555230461, + "learning_rate": 1.963834291213368e-07, + "loss": 0.5355, + "step": 8678 + }, + { + "epoch": 0.91, + "grad_norm": 5.025851027664062, + "learning_rate": 1.959107683656325e-07, + "loss": 0.5568, + "step": 8679 + }, + { + "epoch": 0.91, + "grad_norm": 2.5826049637506676, + "learning_rate": 1.954386657351276e-07, + "loss": 0.6396, + "step": 8680 + }, + { + "epoch": 0.91, + "grad_norm": 3.650551552519513, + "learning_rate": 1.9496712128467043e-07, + "loss": 0.6257, + "step": 8681 + }, + { + "epoch": 0.91, + "grad_norm": 3.075805993953122, + "learning_rate": 1.9449613506904275e-07, + "loss": 0.6136, + "step": 8682 + }, + { + "epoch": 0.91, + "grad_norm": 2.6518292689904186, + "learning_rate": 1.9402570714296353e-07, + "loss": 0.5471, + "step": 8683 + }, + { + "epoch": 0.91, + "grad_norm": 2.6510281697671734, + "learning_rate": 1.9355583756108408e-07, + "loss": 0.5416, + "step": 8684 + }, + { + "epoch": 0.91, + "grad_norm": 2.7279535357511, + "learning_rate": 1.9308652637799352e-07, + "loss": 0.5669, + "step": 8685 + }, + { + "epoch": 0.91, + "grad_norm": 2.359794373937808, + "learning_rate": 1.9261777364821542e-07, + "loss": 0.6428, + "step": 8686 + }, + { + "epoch": 0.91, + "grad_norm": 0.9251054541397337, + "learning_rate": 1.9214957942620738e-07, + "loss": 0.5481, + "step": 8687 + }, + { + "epoch": 0.91, + "grad_norm": 2.718945841396238, + "learning_rate": 1.9168194376636308e-07, + "loss": 0.6115, + "step": 8688 + }, + { + "epoch": 0.91, + "grad_norm": 2.0986880391335343, + "learning_rate": 1.912148667230107e-07, + "loss": 0.5174, + "step": 8689 + }, + { + "epoch": 0.91, + "grad_norm": 2.4561034728267597, + "learning_rate": 1.9074834835041523e-07, + "loss": 0.6184, + "step": 8690 + }, + { + "epoch": 0.91, + "grad_norm": 2.3130570678521942, + "learning_rate": 1.9028238870277383e-07, + "loss": 0.5663, + "step": 8691 + }, + { + "epoch": 0.91, + "grad_norm": 2.233247914946738, + "learning_rate": 1.8981698783422154e-07, + "loss": 0.5151, + "step": 8692 + }, + { + "epoch": 0.91, + "grad_norm": 2.142511538639313, + "learning_rate": 1.8935214579882622e-07, + "loss": 0.6681, + "step": 8693 + }, + { + "epoch": 0.91, + "grad_norm": 2.3532758793115693, + "learning_rate": 1.88887862650593e-07, + "loss": 0.5731, + "step": 8694 + }, + { + "epoch": 0.91, + "grad_norm": 2.9857413005512314, + "learning_rate": 1.8842413844345986e-07, + "loss": 0.5648, + "step": 8695 + }, + { + "epoch": 0.92, + "grad_norm": 3.9566011801313214, + "learning_rate": 1.8796097323130202e-07, + "loss": 0.5285, + "step": 8696 + }, + { + "epoch": 0.92, + "grad_norm": 2.808602535252724, + "learning_rate": 1.8749836706792758e-07, + "loss": 0.6272, + "step": 8697 + }, + { + "epoch": 0.92, + "grad_norm": 2.524622740136097, + "learning_rate": 1.8703632000708128e-07, + "loss": 0.5818, + "step": 8698 + }, + { + "epoch": 0.92, + "grad_norm": 12.660611398706083, + "learning_rate": 1.8657483210244298e-07, + "loss": 0.6365, + "step": 8699 + }, + { + "epoch": 0.92, + "grad_norm": 2.387834624346501, + "learning_rate": 1.8611390340762647e-07, + "loss": 0.6027, + "step": 8700 + }, + { + "epoch": 0.92, + "grad_norm": 3.142755060247217, + "learning_rate": 1.8565353397618057e-07, + "loss": 0.6635, + "step": 8701 + }, + { + "epoch": 0.92, + "grad_norm": 3.134961455919104, + "learning_rate": 1.8519372386159028e-07, + "loss": 0.5493, + "step": 8702 + }, + { + "epoch": 0.92, + "grad_norm": 2.8210863113852738, + "learning_rate": 1.8473447311727567e-07, + "loss": 0.6306, + "step": 8703 + }, + { + "epoch": 0.92, + "grad_norm": 2.286001395669524, + "learning_rate": 1.8427578179658957e-07, + "loss": 0.5625, + "step": 8704 + }, + { + "epoch": 0.92, + "grad_norm": 2.1320659323325755, + "learning_rate": 1.8381764995282269e-07, + "loss": 0.6272, + "step": 8705 + }, + { + "epoch": 0.92, + "grad_norm": 2.786721797593951, + "learning_rate": 1.8336007763919916e-07, + "loss": 0.5459, + "step": 8706 + }, + { + "epoch": 0.92, + "grad_norm": 2.9741697703618373, + "learning_rate": 1.8290306490887866e-07, + "loss": 0.6213, + "step": 8707 + }, + { + "epoch": 0.92, + "grad_norm": 7.392970288943625, + "learning_rate": 1.8244661181495426e-07, + "loss": 0.5805, + "step": 8708 + }, + { + "epoch": 0.92, + "grad_norm": 1.9781416987342864, + "learning_rate": 1.8199071841045746e-07, + "loss": 0.575, + "step": 8709 + }, + { + "epoch": 0.92, + "grad_norm": 1.9655678631701408, + "learning_rate": 1.8153538474835086e-07, + "loss": 0.6157, + "step": 8710 + }, + { + "epoch": 0.92, + "grad_norm": 2.1680563446589924, + "learning_rate": 1.810806108815344e-07, + "loss": 0.5988, + "step": 8711 + }, + { + "epoch": 0.92, + "grad_norm": 2.4250575483212824, + "learning_rate": 1.80626396862843e-07, + "loss": 0.5865, + "step": 8712 + }, + { + "epoch": 0.92, + "grad_norm": 2.1845565551153676, + "learning_rate": 1.801727427450445e-07, + "loss": 0.6155, + "step": 8713 + }, + { + "epoch": 0.92, + "grad_norm": 2.311124360025313, + "learning_rate": 1.79719648580845e-07, + "loss": 0.626, + "step": 8714 + }, + { + "epoch": 0.92, + "grad_norm": 0.9616951924685692, + "learning_rate": 1.7926711442288247e-07, + "loss": 0.5346, + "step": 8715 + }, + { + "epoch": 0.92, + "grad_norm": 3.549218920031044, + "learning_rate": 1.7881514032373147e-07, + "loss": 0.6247, + "step": 8716 + }, + { + "epoch": 0.92, + "grad_norm": 2.8740535822228344, + "learning_rate": 1.7836372633590005e-07, + "loss": 0.5935, + "step": 8717 + }, + { + "epoch": 0.92, + "grad_norm": 5.583700320203049, + "learning_rate": 1.7791287251183398e-07, + "loss": 0.6242, + "step": 8718 + }, + { + "epoch": 0.92, + "grad_norm": 2.7565126848470003, + "learning_rate": 1.7746257890391027e-07, + "loss": 0.5967, + "step": 8719 + }, + { + "epoch": 0.92, + "grad_norm": 3.684123664351955, + "learning_rate": 1.7701284556444377e-07, + "loss": 0.5584, + "step": 8720 + }, + { + "epoch": 0.92, + "grad_norm": 2.7777890746410563, + "learning_rate": 1.7656367254568374e-07, + "loss": 0.6293, + "step": 8721 + }, + { + "epoch": 0.92, + "grad_norm": 2.4453226597465716, + "learning_rate": 1.7611505989981293e-07, + "loss": 0.6327, + "step": 8722 + }, + { + "epoch": 0.92, + "grad_norm": 3.0727605055674445, + "learning_rate": 1.7566700767894906e-07, + "loss": 0.6328, + "step": 8723 + }, + { + "epoch": 0.92, + "grad_norm": 3.726441601765811, + "learning_rate": 1.7521951593514718e-07, + "loss": 0.5472, + "step": 8724 + }, + { + "epoch": 0.92, + "grad_norm": 3.130050707112883, + "learning_rate": 1.7477258472039517e-07, + "loss": 0.5057, + "step": 8725 + }, + { + "epoch": 0.92, + "grad_norm": 2.2032936513711974, + "learning_rate": 1.7432621408661532e-07, + "loss": 0.5802, + "step": 8726 + }, + { + "epoch": 0.92, + "grad_norm": 2.577694971816813, + "learning_rate": 1.7388040408566674e-07, + "loss": 0.6737, + "step": 8727 + }, + { + "epoch": 0.92, + "grad_norm": 2.571554032274808, + "learning_rate": 1.7343515476934136e-07, + "loss": 0.5762, + "step": 8728 + }, + { + "epoch": 0.92, + "grad_norm": 2.2675681775342817, + "learning_rate": 1.729904661893683e-07, + "loss": 0.5929, + "step": 8729 + }, + { + "epoch": 0.92, + "grad_norm": 2.4825069172397907, + "learning_rate": 1.725463383974091e-07, + "loss": 0.5898, + "step": 8730 + }, + { + "epoch": 0.92, + "grad_norm": 0.9735035948128292, + "learning_rate": 1.7210277144506182e-07, + "loss": 0.5335, + "step": 8731 + }, + { + "epoch": 0.92, + "grad_norm": 2.631721683378093, + "learning_rate": 1.7165976538385753e-07, + "loss": 0.5517, + "step": 8732 + }, + { + "epoch": 0.92, + "grad_norm": 1.0076500063785254, + "learning_rate": 1.7121732026526506e-07, + "loss": 0.5407, + "step": 8733 + }, + { + "epoch": 0.92, + "grad_norm": 3.581508418879825, + "learning_rate": 1.7077543614068604e-07, + "loss": 0.6803, + "step": 8734 + }, + { + "epoch": 0.92, + "grad_norm": 2.4861781722789957, + "learning_rate": 1.703341130614572e-07, + "loss": 0.6355, + "step": 8735 + }, + { + "epoch": 0.92, + "grad_norm": 3.515627205530569, + "learning_rate": 1.6989335107884863e-07, + "loss": 0.6033, + "step": 8736 + }, + { + "epoch": 0.92, + "grad_norm": 2.3777058580194343, + "learning_rate": 1.6945315024406883e-07, + "loss": 0.513, + "step": 8737 + }, + { + "epoch": 0.92, + "grad_norm": 0.8876260556728341, + "learning_rate": 1.6901351060825854e-07, + "loss": 0.5056, + "step": 8738 + }, + { + "epoch": 0.92, + "grad_norm": 2.7599871753116862, + "learning_rate": 1.685744322224936e-07, + "loss": 0.5685, + "step": 8739 + }, + { + "epoch": 0.92, + "grad_norm": 3.2326213858288804, + "learning_rate": 1.681359151377848e-07, + "loss": 0.5538, + "step": 8740 + }, + { + "epoch": 0.92, + "grad_norm": 2.901673864970551, + "learning_rate": 1.676979594050776e-07, + "loss": 0.6152, + "step": 8741 + }, + { + "epoch": 0.92, + "grad_norm": 0.8826659096790682, + "learning_rate": 1.6726056507525347e-07, + "loss": 0.5152, + "step": 8742 + }, + { + "epoch": 0.92, + "grad_norm": 2.665982233334478, + "learning_rate": 1.668237321991262e-07, + "loss": 0.5803, + "step": 8743 + }, + { + "epoch": 0.92, + "grad_norm": 2.5198209538212146, + "learning_rate": 1.6638746082744684e-07, + "loss": 0.6382, + "step": 8744 + }, + { + "epoch": 0.92, + "grad_norm": 2.568498917912977, + "learning_rate": 1.6595175101089877e-07, + "loss": 0.6567, + "step": 8745 + }, + { + "epoch": 0.92, + "grad_norm": 3.409410318358604, + "learning_rate": 1.6551660280010316e-07, + "loss": 0.6195, + "step": 8746 + }, + { + "epoch": 0.92, + "grad_norm": 2.719189581656458, + "learning_rate": 1.6508201624561404e-07, + "loss": 0.6318, + "step": 8747 + }, + { + "epoch": 0.92, + "grad_norm": 9.214568272301674, + "learning_rate": 1.6464799139791877e-07, + "loss": 0.5042, + "step": 8748 + }, + { + "epoch": 0.92, + "grad_norm": 4.086249960723928, + "learning_rate": 1.6421452830744366e-07, + "loss": 0.6093, + "step": 8749 + }, + { + "epoch": 0.92, + "grad_norm": 3.992792950152544, + "learning_rate": 1.6378162702454458e-07, + "loss": 0.5714, + "step": 8750 + }, + { + "epoch": 0.92, + "grad_norm": 4.417350418321379, + "learning_rate": 1.6334928759951684e-07, + "loss": 0.6329, + "step": 8751 + }, + { + "epoch": 0.92, + "grad_norm": 2.3306711526974686, + "learning_rate": 1.6291751008258693e-07, + "loss": 0.5744, + "step": 8752 + }, + { + "epoch": 0.92, + "grad_norm": 2.8025552214715774, + "learning_rate": 1.6248629452391862e-07, + "loss": 0.5275, + "step": 8753 + }, + { + "epoch": 0.92, + "grad_norm": 2.15974372047649, + "learning_rate": 1.620556409736085e-07, + "loss": 0.6049, + "step": 8754 + }, + { + "epoch": 0.92, + "grad_norm": 3.655812836017327, + "learning_rate": 1.616255494816893e-07, + "loss": 0.5968, + "step": 8755 + }, + { + "epoch": 0.92, + "grad_norm": 2.7698264187396346, + "learning_rate": 1.6119602009812663e-07, + "loss": 0.5531, + "step": 8756 + }, + { + "epoch": 0.92, + "grad_norm": 3.424978511207146, + "learning_rate": 1.6076705287282336e-07, + "loss": 0.6074, + "step": 8757 + }, + { + "epoch": 0.92, + "grad_norm": 2.5086511137119807, + "learning_rate": 1.6033864785561515e-07, + "loss": 0.567, + "step": 8758 + }, + { + "epoch": 0.92, + "grad_norm": 2.707797137818906, + "learning_rate": 1.5991080509627222e-07, + "loss": 0.6671, + "step": 8759 + }, + { + "epoch": 0.92, + "grad_norm": 2.3155332443393775, + "learning_rate": 1.5948352464450146e-07, + "loss": 0.5503, + "step": 8760 + }, + { + "epoch": 0.92, + "grad_norm": 3.2510331858276005, + "learning_rate": 1.590568065499415e-07, + "loss": 0.6343, + "step": 8761 + }, + { + "epoch": 0.92, + "grad_norm": 2.6772676116951213, + "learning_rate": 1.5863065086216878e-07, + "loss": 0.5982, + "step": 8762 + }, + { + "epoch": 0.92, + "grad_norm": 2.394546302211017, + "learning_rate": 1.582050576306915e-07, + "loss": 0.6014, + "step": 8763 + }, + { + "epoch": 0.92, + "grad_norm": 2.0972459059941415, + "learning_rate": 1.5778002690495453e-07, + "loss": 0.5364, + "step": 8764 + }, + { + "epoch": 0.92, + "grad_norm": 2.9104609457881114, + "learning_rate": 1.5735555873433673e-07, + "loss": 0.5498, + "step": 8765 + }, + { + "epoch": 0.92, + "grad_norm": 3.1530370674231816, + "learning_rate": 1.569316531681514e-07, + "loss": 0.6073, + "step": 8766 + }, + { + "epoch": 0.92, + "grad_norm": 2.348747898326347, + "learning_rate": 1.565083102556464e-07, + "loss": 0.5991, + "step": 8767 + }, + { + "epoch": 0.92, + "grad_norm": 2.64211735957492, + "learning_rate": 1.560855300460057e-07, + "loss": 0.5962, + "step": 8768 + }, + { + "epoch": 0.92, + "grad_norm": 2.5983785487904294, + "learning_rate": 1.5566331258834498e-07, + "loss": 0.6021, + "step": 8769 + }, + { + "epoch": 0.92, + "grad_norm": 3.48505144340559, + "learning_rate": 1.552416579317173e-07, + "loss": 0.5457, + "step": 8770 + }, + { + "epoch": 0.92, + "grad_norm": 4.118622729954412, + "learning_rate": 1.5482056612510898e-07, + "loss": 0.5525, + "step": 8771 + }, + { + "epoch": 0.92, + "grad_norm": 2.453328716580977, + "learning_rate": 1.544000372174409e-07, + "loss": 0.5662, + "step": 8772 + }, + { + "epoch": 0.92, + "grad_norm": 2.715493793911031, + "learning_rate": 1.539800712575701e-07, + "loss": 0.6442, + "step": 8773 + }, + { + "epoch": 0.92, + "grad_norm": 2.534362203603708, + "learning_rate": 1.5356066829428529e-07, + "loss": 0.5944, + "step": 8774 + }, + { + "epoch": 0.92, + "grad_norm": 2.7191866220592633, + "learning_rate": 1.531418283763131e-07, + "loss": 0.605, + "step": 8775 + }, + { + "epoch": 0.92, + "grad_norm": 3.6751125356427465, + "learning_rate": 1.5272355155231233e-07, + "loss": 0.5287, + "step": 8776 + }, + { + "epoch": 0.92, + "grad_norm": 2.6076140742413605, + "learning_rate": 1.5230583787087693e-07, + "loss": 0.6233, + "step": 8777 + }, + { + "epoch": 0.92, + "grad_norm": 3.2526568139523055, + "learning_rate": 1.5188868738053643e-07, + "loss": 0.6007, + "step": 8778 + }, + { + "epoch": 0.92, + "grad_norm": 2.556172706540122, + "learning_rate": 1.5147210012975366e-07, + "loss": 0.5975, + "step": 8779 + }, + { + "epoch": 0.92, + "grad_norm": 2.317968435237044, + "learning_rate": 1.5105607616692665e-07, + "loss": 0.5119, + "step": 8780 + }, + { + "epoch": 0.92, + "grad_norm": 2.364058422346575, + "learning_rate": 1.5064061554038723e-07, + "loss": 0.6071, + "step": 8781 + }, + { + "epoch": 0.92, + "grad_norm": 2.123856213413403, + "learning_rate": 1.5022571829840404e-07, + "loss": 0.5481, + "step": 8782 + }, + { + "epoch": 0.92, + "grad_norm": 2.5488265654803324, + "learning_rate": 1.4981138448917686e-07, + "loss": 0.5901, + "step": 8783 + }, + { + "epoch": 0.92, + "grad_norm": 3.9357992577530965, + "learning_rate": 1.4939761416084274e-07, + "loss": 0.5701, + "step": 8784 + }, + { + "epoch": 0.92, + "grad_norm": 2.0256367627019287, + "learning_rate": 1.4898440736147213e-07, + "loss": 0.6026, + "step": 8785 + }, + { + "epoch": 0.92, + "grad_norm": 4.257081804924682, + "learning_rate": 1.4857176413907048e-07, + "loss": 0.5129, + "step": 8786 + }, + { + "epoch": 0.92, + "grad_norm": 2.7139299347714076, + "learning_rate": 1.481596845415767e-07, + "loss": 0.5782, + "step": 8787 + }, + { + "epoch": 0.92, + "grad_norm": 2.4990484620871873, + "learning_rate": 1.4774816861686636e-07, + "loss": 0.6671, + "step": 8788 + }, + { + "epoch": 0.92, + "grad_norm": 2.9970898191679884, + "learning_rate": 1.4733721641274677e-07, + "loss": 0.653, + "step": 8789 + }, + { + "epoch": 0.92, + "grad_norm": 3.3568839354768514, + "learning_rate": 1.4692682797696201e-07, + "loss": 0.5761, + "step": 8790 + }, + { + "epoch": 0.93, + "grad_norm": 2.61930351344336, + "learning_rate": 1.4651700335718887e-07, + "loss": 0.6616, + "step": 8791 + }, + { + "epoch": 0.93, + "grad_norm": 0.9479674201636344, + "learning_rate": 1.4610774260104155e-07, + "loss": 0.5103, + "step": 8792 + }, + { + "epoch": 0.93, + "grad_norm": 2.5330814701077635, + "learning_rate": 1.456990457560642e-07, + "loss": 0.5497, + "step": 8793 + }, + { + "epoch": 0.93, + "grad_norm": 2.309748805294827, + "learning_rate": 1.4529091286973994e-07, + "loss": 0.6107, + "step": 8794 + }, + { + "epoch": 0.93, + "grad_norm": 2.547398975416799, + "learning_rate": 1.4488334398948424e-07, + "loss": 0.6341, + "step": 8795 + }, + { + "epoch": 0.93, + "grad_norm": 2.0790737085851827, + "learning_rate": 1.444763391626458e-07, + "loss": 0.5955, + "step": 8796 + }, + { + "epoch": 0.93, + "grad_norm": 2.7131868065415845, + "learning_rate": 1.4406989843651186e-07, + "loss": 0.6069, + "step": 8797 + }, + { + "epoch": 0.93, + "grad_norm": 2.1351827973252804, + "learning_rate": 1.4366402185829852e-07, + "loss": 0.5892, + "step": 8798 + }, + { + "epoch": 0.93, + "grad_norm": 2.532366496256515, + "learning_rate": 1.4325870947516195e-07, + "loss": 0.6275, + "step": 8799 + }, + { + "epoch": 0.93, + "grad_norm": 3.5352178877025415, + "learning_rate": 1.4285396133418894e-07, + "loss": 0.6247, + "step": 8800 + }, + { + "epoch": 0.93, + "grad_norm": 2.3725771799289634, + "learning_rate": 1.424497774824024e-07, + "loss": 0.5422, + "step": 8801 + }, + { + "epoch": 0.93, + "grad_norm": 2.453635363952836, + "learning_rate": 1.4204615796675813e-07, + "loss": 0.6249, + "step": 8802 + }, + { + "epoch": 0.93, + "grad_norm": 3.6864136459930834, + "learning_rate": 1.4164310283414917e-07, + "loss": 0.6058, + "step": 8803 + }, + { + "epoch": 0.93, + "grad_norm": 2.5101928312119233, + "learning_rate": 1.4124061213139973e-07, + "loss": 0.674, + "step": 8804 + }, + { + "epoch": 0.93, + "grad_norm": 2.5816393554893198, + "learning_rate": 1.4083868590527128e-07, + "loss": 0.56, + "step": 8805 + }, + { + "epoch": 0.93, + "grad_norm": 1.008204586126537, + "learning_rate": 1.4043732420245703e-07, + "loss": 0.5047, + "step": 8806 + }, + { + "epoch": 0.93, + "grad_norm": 2.6435612166868547, + "learning_rate": 1.400365270695875e-07, + "loss": 0.6431, + "step": 8807 + }, + { + "epoch": 0.93, + "grad_norm": 2.3837590071708403, + "learning_rate": 1.3963629455322536e-07, + "loss": 0.6289, + "step": 8808 + }, + { + "epoch": 0.93, + "grad_norm": 2.2165814241637776, + "learning_rate": 1.3923662669986847e-07, + "loss": 0.6364, + "step": 8809 + }, + { + "epoch": 0.93, + "grad_norm": 3.8103240249394665, + "learning_rate": 1.388375235559497e-07, + "loss": 0.6198, + "step": 8810 + }, + { + "epoch": 0.93, + "grad_norm": 2.745817288882361, + "learning_rate": 1.3843898516783528e-07, + "loss": 0.6645, + "step": 8811 + }, + { + "epoch": 0.93, + "grad_norm": 0.9439152142723546, + "learning_rate": 1.3804101158182592e-07, + "loss": 0.5582, + "step": 8812 + }, + { + "epoch": 0.93, + "grad_norm": 2.2299912321829636, + "learning_rate": 1.3764360284415745e-07, + "loss": 0.5961, + "step": 8813 + }, + { + "epoch": 0.93, + "grad_norm": 3.274213469206632, + "learning_rate": 1.372467590009996e-07, + "loss": 0.6478, + "step": 8814 + }, + { + "epoch": 0.93, + "grad_norm": 2.736445144043836, + "learning_rate": 1.3685048009845602e-07, + "loss": 0.5455, + "step": 8815 + }, + { + "epoch": 0.93, + "grad_norm": 2.3511706665739376, + "learning_rate": 1.3645476618256658e-07, + "loss": 0.6518, + "step": 8816 + }, + { + "epoch": 0.93, + "grad_norm": 2.9759636145549075, + "learning_rate": 1.3605961729930283e-07, + "loss": 0.6208, + "step": 8817 + }, + { + "epoch": 0.93, + "grad_norm": 2.6395425552448972, + "learning_rate": 1.3566503349457193e-07, + "loss": 0.5986, + "step": 8818 + }, + { + "epoch": 0.93, + "grad_norm": 2.5256137921850588, + "learning_rate": 1.3527101481421722e-07, + "loss": 0.5586, + "step": 8819 + }, + { + "epoch": 0.93, + "grad_norm": 2.365812363582891, + "learning_rate": 1.3487756130401264e-07, + "loss": 0.5482, + "step": 8820 + }, + { + "epoch": 0.93, + "grad_norm": 2.41157315584283, + "learning_rate": 1.3448467300966995e-07, + "loss": 0.5167, + "step": 8821 + }, + { + "epoch": 0.93, + "grad_norm": 2.350934489365909, + "learning_rate": 1.3409234997683262e-07, + "loss": 0.5623, + "step": 8822 + }, + { + "epoch": 0.93, + "grad_norm": 2.8590303256325225, + "learning_rate": 1.3370059225108088e-07, + "loss": 0.615, + "step": 8823 + }, + { + "epoch": 0.93, + "grad_norm": 2.477586205368679, + "learning_rate": 1.3330939987792668e-07, + "loss": 0.6072, + "step": 8824 + }, + { + "epoch": 0.93, + "grad_norm": 2.401994252587328, + "learning_rate": 1.3291877290281864e-07, + "loss": 0.574, + "step": 8825 + }, + { + "epoch": 0.93, + "grad_norm": 3.7781870354990517, + "learning_rate": 1.3252871137113764e-07, + "loss": 0.56, + "step": 8826 + }, + { + "epoch": 0.93, + "grad_norm": 2.367412583097113, + "learning_rate": 1.3213921532820084e-07, + "loss": 0.678, + "step": 8827 + }, + { + "epoch": 0.93, + "grad_norm": 1.9230400310337836, + "learning_rate": 1.3175028481925865e-07, + "loss": 0.6137, + "step": 8828 + }, + { + "epoch": 0.93, + "grad_norm": 2.6899938815586157, + "learning_rate": 1.3136191988949498e-07, + "loss": 0.5719, + "step": 8829 + }, + { + "epoch": 0.93, + "grad_norm": 2.313782561671406, + "learning_rate": 1.3097412058403036e-07, + "loss": 0.6413, + "step": 8830 + }, + { + "epoch": 0.93, + "grad_norm": 2.3968960300093514, + "learning_rate": 1.305868869479171e-07, + "loss": 0.6002, + "step": 8831 + }, + { + "epoch": 0.93, + "grad_norm": 0.9061105754688217, + "learning_rate": 1.3020021902614366e-07, + "loss": 0.5138, + "step": 8832 + }, + { + "epoch": 0.93, + "grad_norm": 3.361235492021561, + "learning_rate": 1.2981411686363132e-07, + "loss": 0.6635, + "step": 8833 + }, + { + "epoch": 0.93, + "grad_norm": 2.299548642666442, + "learning_rate": 1.29428580505237e-07, + "loss": 0.574, + "step": 8834 + }, + { + "epoch": 0.93, + "grad_norm": 2.8233240276159557, + "learning_rate": 1.290436099957504e-07, + "loss": 0.6384, + "step": 8835 + }, + { + "epoch": 0.93, + "grad_norm": 2.4935098889156753, + "learning_rate": 1.2865920537989683e-07, + "loss": 0.5546, + "step": 8836 + }, + { + "epoch": 0.93, + "grad_norm": 2.7213656000819872, + "learning_rate": 1.2827536670233508e-07, + "loss": 0.6305, + "step": 8837 + }, + { + "epoch": 0.93, + "grad_norm": 2.741318595224126, + "learning_rate": 1.2789209400765889e-07, + "loss": 0.6613, + "step": 8838 + }, + { + "epoch": 0.93, + "grad_norm": 3.396722910728486, + "learning_rate": 1.2750938734039486e-07, + "loss": 0.5485, + "step": 8839 + }, + { + "epoch": 0.93, + "grad_norm": 2.597262875682383, + "learning_rate": 1.2712724674500575e-07, + "loss": 0.6165, + "step": 8840 + }, + { + "epoch": 0.93, + "grad_norm": 2.5915988746408636, + "learning_rate": 1.2674567226588662e-07, + "loss": 0.658, + "step": 8841 + }, + { + "epoch": 0.93, + "grad_norm": 12.953891578992364, + "learning_rate": 1.2636466394736758e-07, + "loss": 0.5931, + "step": 8842 + }, + { + "epoch": 0.93, + "grad_norm": 4.102318058166209, + "learning_rate": 1.2598422183371484e-07, + "loss": 0.5928, + "step": 8843 + }, + { + "epoch": 0.93, + "grad_norm": 3.2543978164700937, + "learning_rate": 1.256043459691253e-07, + "loss": 0.6385, + "step": 8844 + }, + { + "epoch": 0.93, + "grad_norm": 0.9749011154837868, + "learning_rate": 1.2522503639773254e-07, + "loss": 0.5538, + "step": 8845 + }, + { + "epoch": 0.93, + "grad_norm": 2.8982951468251628, + "learning_rate": 1.2484629316360297e-07, + "loss": 0.5987, + "step": 8846 + }, + { + "epoch": 0.93, + "grad_norm": 4.042274517432498, + "learning_rate": 1.244681163107392e-07, + "loss": 0.5936, + "step": 8847 + }, + { + "epoch": 0.93, + "grad_norm": 2.543924226290593, + "learning_rate": 1.2409050588307547e-07, + "loss": 0.6316, + "step": 8848 + }, + { + "epoch": 0.93, + "grad_norm": 2.275882847499862, + "learning_rate": 1.237134619244823e-07, + "loss": 0.6165, + "step": 8849 + }, + { + "epoch": 0.93, + "grad_norm": 2.5971023387508803, + "learning_rate": 1.2333698447876296e-07, + "loss": 0.6973, + "step": 8850 + }, + { + "epoch": 0.93, + "grad_norm": 2.306532467495735, + "learning_rate": 1.229610735896558e-07, + "loss": 0.5684, + "step": 8851 + }, + { + "epoch": 0.93, + "grad_norm": 2.5310472966419377, + "learning_rate": 1.2258572930083313e-07, + "loss": 0.7, + "step": 8852 + }, + { + "epoch": 0.93, + "grad_norm": 2.187034442784481, + "learning_rate": 1.222109516559006e-07, + "loss": 0.5667, + "step": 8853 + }, + { + "epoch": 0.93, + "grad_norm": 2.2888506566786493, + "learning_rate": 1.2183674069840057e-07, + "loss": 0.6023, + "step": 8854 + }, + { + "epoch": 0.93, + "grad_norm": 1.9527027435099331, + "learning_rate": 1.2146309647180554e-07, + "loss": 0.4499, + "step": 8855 + }, + { + "epoch": 0.93, + "grad_norm": 2.730891050434837, + "learning_rate": 1.2109001901952633e-07, + "loss": 0.6274, + "step": 8856 + }, + { + "epoch": 0.93, + "grad_norm": 0.916048809399537, + "learning_rate": 1.2071750838490492e-07, + "loss": 0.5672, + "step": 8857 + }, + { + "epoch": 0.93, + "grad_norm": 2.2808207217906893, + "learning_rate": 1.2034556461121894e-07, + "loss": 0.6706, + "step": 8858 + }, + { + "epoch": 0.93, + "grad_norm": 2.48748766003941, + "learning_rate": 1.1997418774167934e-07, + "loss": 0.6039, + "step": 8859 + }, + { + "epoch": 0.93, + "grad_norm": 2.37457209874063, + "learning_rate": 1.196033778194322e-07, + "loss": 0.6669, + "step": 8860 + }, + { + "epoch": 0.93, + "grad_norm": 2.1767809049061233, + "learning_rate": 1.1923313488755638e-07, + "loss": 0.5488, + "step": 8861 + }, + { + "epoch": 0.93, + "grad_norm": 2.2318763739789023, + "learning_rate": 1.1886345898906693e-07, + "loss": 0.6166, + "step": 8862 + }, + { + "epoch": 0.93, + "grad_norm": 2.8605686514104973, + "learning_rate": 1.1849435016691003e-07, + "loss": 0.606, + "step": 8863 + }, + { + "epoch": 0.93, + "grad_norm": 2.908693940117577, + "learning_rate": 1.1812580846396915e-07, + "loss": 0.5751, + "step": 8864 + }, + { + "epoch": 0.93, + "grad_norm": 3.700097097894084, + "learning_rate": 1.1775783392305895e-07, + "loss": 0.6018, + "step": 8865 + }, + { + "epoch": 0.93, + "grad_norm": 2.782107805001388, + "learning_rate": 1.1739042658693079e-07, + "loss": 0.5801, + "step": 8866 + }, + { + "epoch": 0.93, + "grad_norm": 2.6177299400948395, + "learning_rate": 1.1702358649826939e-07, + "loss": 0.6479, + "step": 8867 + }, + { + "epoch": 0.93, + "grad_norm": 2.350114941297819, + "learning_rate": 1.166573136996918e-07, + "loss": 0.5391, + "step": 8868 + }, + { + "epoch": 0.93, + "grad_norm": 2.5754262051694763, + "learning_rate": 1.1629160823375118e-07, + "loss": 0.6068, + "step": 8869 + }, + { + "epoch": 0.93, + "grad_norm": 3.382289379949096, + "learning_rate": 1.1592647014293412e-07, + "loss": 0.6544, + "step": 8870 + }, + { + "epoch": 0.93, + "grad_norm": 2.3588925482663523, + "learning_rate": 1.1556189946966168e-07, + "loss": 0.5807, + "step": 8871 + }, + { + "epoch": 0.93, + "grad_norm": 2.582600577577335, + "learning_rate": 1.151978962562883e-07, + "loss": 0.6165, + "step": 8872 + }, + { + "epoch": 0.93, + "grad_norm": 2.245726951542925, + "learning_rate": 1.1483446054510294e-07, + "loss": 0.5569, + "step": 8873 + }, + { + "epoch": 0.93, + "grad_norm": 2.638961684798639, + "learning_rate": 1.144715923783274e-07, + "loss": 0.6425, + "step": 8874 + }, + { + "epoch": 0.93, + "grad_norm": 4.337475255641479, + "learning_rate": 1.1410929179812069e-07, + "loss": 0.6021, + "step": 8875 + }, + { + "epoch": 0.93, + "grad_norm": 2.6499250104828334, + "learning_rate": 1.1374755884657195e-07, + "loss": 0.589, + "step": 8876 + }, + { + "epoch": 0.93, + "grad_norm": 3.7819394045598353, + "learning_rate": 1.1338639356570758e-07, + "loss": 0.6157, + "step": 8877 + }, + { + "epoch": 0.93, + "grad_norm": 3.9903219653791866, + "learning_rate": 1.130257959974862e-07, + "loss": 0.566, + "step": 8878 + }, + { + "epoch": 0.93, + "grad_norm": 0.9474747256626247, + "learning_rate": 1.1266576618380098e-07, + "loss": 0.5611, + "step": 8879 + }, + { + "epoch": 0.93, + "grad_norm": 2.8711918810126424, + "learning_rate": 1.1230630416647958e-07, + "loss": 0.5494, + "step": 8880 + }, + { + "epoch": 0.93, + "grad_norm": 2.991121349490518, + "learning_rate": 1.1194740998728193e-07, + "loss": 0.6804, + "step": 8881 + }, + { + "epoch": 0.93, + "grad_norm": 3.1006023896574195, + "learning_rate": 1.1158908368790523e-07, + "loss": 0.7024, + "step": 8882 + }, + { + "epoch": 0.93, + "grad_norm": 2.1579700060562166, + "learning_rate": 1.1123132530997727e-07, + "loss": 0.6184, + "step": 8883 + }, + { + "epoch": 0.93, + "grad_norm": 2.466363894718886, + "learning_rate": 1.1087413489506205e-07, + "loss": 0.6905, + "step": 8884 + }, + { + "epoch": 0.93, + "grad_norm": 3.6365001531373107, + "learning_rate": 1.1051751248465691e-07, + "loss": 0.5644, + "step": 8885 + }, + { + "epoch": 0.94, + "grad_norm": 3.007115123145037, + "learning_rate": 1.1016145812019319e-07, + "loss": 0.6451, + "step": 8886 + }, + { + "epoch": 0.94, + "grad_norm": 2.339803693912085, + "learning_rate": 1.098059718430361e-07, + "loss": 0.6171, + "step": 8887 + }, + { + "epoch": 0.94, + "grad_norm": 4.514752409454256, + "learning_rate": 1.0945105369448483e-07, + "loss": 0.6357, + "step": 8888 + }, + { + "epoch": 0.94, + "grad_norm": 3.1616055453161453, + "learning_rate": 1.0909670371577308e-07, + "loss": 0.6318, + "step": 8889 + }, + { + "epoch": 0.94, + "grad_norm": 2.3220638444142474, + "learning_rate": 1.087429219480679e-07, + "loss": 0.6093, + "step": 8890 + }, + { + "epoch": 0.94, + "grad_norm": 2.3976532258477015, + "learning_rate": 1.0838970843247143e-07, + "loss": 0.6613, + "step": 8891 + }, + { + "epoch": 0.94, + "grad_norm": 2.661253383224113, + "learning_rate": 1.0803706321001805e-07, + "loss": 0.6697, + "step": 8892 + }, + { + "epoch": 0.94, + "grad_norm": 2.6683239924659907, + "learning_rate": 1.0768498632167779e-07, + "loss": 0.6107, + "step": 8893 + }, + { + "epoch": 0.94, + "grad_norm": 2.5718421574140513, + "learning_rate": 1.0733347780835346e-07, + "loss": 0.5767, + "step": 8894 + }, + { + "epoch": 0.94, + "grad_norm": 2.146294517128654, + "learning_rate": 1.0698253771088241e-07, + "loss": 0.591, + "step": 8895 + }, + { + "epoch": 0.94, + "grad_norm": 2.2452315585617613, + "learning_rate": 1.0663216607003535e-07, + "loss": 0.5671, + "step": 8896 + }, + { + "epoch": 0.94, + "grad_norm": 2.3863559337974825, + "learning_rate": 1.0628236292651861e-07, + "loss": 0.5705, + "step": 8897 + }, + { + "epoch": 0.94, + "grad_norm": 2.6440910292337767, + "learning_rate": 1.0593312832097025e-07, + "loss": 0.5802, + "step": 8898 + }, + { + "epoch": 0.94, + "grad_norm": 0.9645714063530803, + "learning_rate": 1.055844622939639e-07, + "loss": 0.5883, + "step": 8899 + }, + { + "epoch": 0.94, + "grad_norm": 2.58708264588437, + "learning_rate": 1.0523636488600664e-07, + "loss": 0.5565, + "step": 8900 + }, + { + "epoch": 0.94, + "grad_norm": 2.265603984151188, + "learning_rate": 1.048888361375383e-07, + "loss": 0.6022, + "step": 8901 + }, + { + "epoch": 0.94, + "grad_norm": 2.357562605848145, + "learning_rate": 1.045418760889355e-07, + "loss": 0.5128, + "step": 8902 + }, + { + "epoch": 0.94, + "grad_norm": 3.4292025931619565, + "learning_rate": 1.0419548478050601e-07, + "loss": 0.5251, + "step": 8903 + }, + { + "epoch": 0.94, + "grad_norm": 2.0323838830808834, + "learning_rate": 1.038496622524926e-07, + "loss": 0.5835, + "step": 8904 + }, + { + "epoch": 0.94, + "grad_norm": 4.6168280543265166, + "learning_rate": 1.0350440854507205e-07, + "loss": 0.636, + "step": 8905 + }, + { + "epoch": 0.94, + "grad_norm": 2.052977096577669, + "learning_rate": 1.0315972369835559e-07, + "loss": 0.6365, + "step": 8906 + }, + { + "epoch": 0.94, + "grad_norm": 2.2130917241762083, + "learning_rate": 1.0281560775238619e-07, + "loss": 0.6581, + "step": 8907 + }, + { + "epoch": 0.94, + "grad_norm": 3.7050724972147653, + "learning_rate": 1.0247206074714411e-07, + "loss": 0.6105, + "step": 8908 + }, + { + "epoch": 0.94, + "grad_norm": 5.090324313422908, + "learning_rate": 1.0212908272253963e-07, + "loss": 0.6013, + "step": 8909 + }, + { + "epoch": 0.94, + "grad_norm": 3.5235691542628236, + "learning_rate": 1.0178667371842088e-07, + "loss": 0.6415, + "step": 8910 + }, + { + "epoch": 0.94, + "grad_norm": 2.4523022912907857, + "learning_rate": 1.014448337745666e-07, + "loss": 0.6337, + "step": 8911 + }, + { + "epoch": 0.94, + "grad_norm": 2.318885308337774, + "learning_rate": 1.0110356293069168e-07, + "loss": 0.6198, + "step": 8912 + }, + { + "epoch": 0.94, + "grad_norm": 2.0757919001418195, + "learning_rate": 1.0076286122644274e-07, + "loss": 0.6183, + "step": 8913 + }, + { + "epoch": 0.94, + "grad_norm": 2.5362575196035597, + "learning_rate": 1.0042272870140258e-07, + "loss": 0.6237, + "step": 8914 + }, + { + "epoch": 0.94, + "grad_norm": 2.31713074091154, + "learning_rate": 1.0008316539508733e-07, + "loss": 0.6719, + "step": 8915 + }, + { + "epoch": 0.94, + "grad_norm": 2.767274831301876, + "learning_rate": 9.974417134694491e-08, + "loss": 0.5058, + "step": 8916 + }, + { + "epoch": 0.94, + "grad_norm": 0.9939390220324551, + "learning_rate": 9.940574659635993e-08, + "loss": 0.5415, + "step": 8917 + }, + { + "epoch": 0.94, + "grad_norm": 3.3485962773421636, + "learning_rate": 9.90678911826487e-08, + "loss": 0.5924, + "step": 8918 + }, + { + "epoch": 0.94, + "grad_norm": 2.8816111251299037, + "learning_rate": 9.873060514506316e-08, + "loss": 0.6027, + "step": 8919 + }, + { + "epoch": 0.94, + "grad_norm": 1.0275962573135506, + "learning_rate": 9.839388852278752e-08, + "loss": 0.5166, + "step": 8920 + }, + { + "epoch": 0.94, + "grad_norm": 2.5666272152464558, + "learning_rate": 9.805774135494106e-08, + "loss": 0.6445, + "step": 8921 + }, + { + "epoch": 0.94, + "grad_norm": 2.4748907614724827, + "learning_rate": 9.772216368057586e-08, + "loss": 0.6334, + "step": 8922 + }, + { + "epoch": 0.94, + "grad_norm": 2.942060956680936, + "learning_rate": 9.738715553867851e-08, + "loss": 0.6156, + "step": 8923 + }, + { + "epoch": 0.94, + "grad_norm": 3.9951301127561973, + "learning_rate": 9.705271696816954e-08, + "loss": 0.5977, + "step": 8924 + }, + { + "epoch": 0.94, + "grad_norm": 2.777453454552441, + "learning_rate": 9.671884800790288e-08, + "loss": 0.6252, + "step": 8925 + }, + { + "epoch": 0.94, + "grad_norm": 2.348283390971774, + "learning_rate": 9.638554869666695e-08, + "loss": 0.6118, + "step": 8926 + }, + { + "epoch": 0.94, + "grad_norm": 2.4067385375615493, + "learning_rate": 9.605281907318243e-08, + "loss": 0.6271, + "step": 8927 + }, + { + "epoch": 0.94, + "grad_norm": 0.9552504322728489, + "learning_rate": 9.572065917610618e-08, + "loss": 0.5533, + "step": 8928 + }, + { + "epoch": 0.94, + "grad_norm": 2.271928583094003, + "learning_rate": 9.538906904402623e-08, + "loss": 0.6035, + "step": 8929 + }, + { + "epoch": 0.94, + "grad_norm": 3.3223263454773297, + "learning_rate": 9.505804871546731e-08, + "loss": 0.6097, + "step": 8930 + }, + { + "epoch": 0.94, + "grad_norm": 4.923685715911361, + "learning_rate": 9.472759822888478e-08, + "loss": 0.576, + "step": 8931 + }, + { + "epoch": 0.94, + "grad_norm": 2.249066974617624, + "learning_rate": 9.439771762267069e-08, + "loss": 0.6664, + "step": 8932 + }, + { + "epoch": 0.94, + "grad_norm": 2.2985171963927464, + "learning_rate": 9.40684069351483e-08, + "loss": 0.6363, + "step": 8933 + }, + { + "epoch": 0.94, + "grad_norm": 2.1586975547776857, + "learning_rate": 9.373966620457753e-08, + "loss": 0.5921, + "step": 8934 + }, + { + "epoch": 0.94, + "grad_norm": 2.699332277329513, + "learning_rate": 9.341149546914951e-08, + "loss": 0.6785, + "step": 8935 + }, + { + "epoch": 0.94, + "grad_norm": 2.705681753410133, + "learning_rate": 9.308389476699043e-08, + "loss": 0.6113, + "step": 8936 + }, + { + "epoch": 0.94, + "grad_norm": 2.299817850591371, + "learning_rate": 9.27568641361598e-08, + "loss": 0.6317, + "step": 8937 + }, + { + "epoch": 0.94, + "grad_norm": 2.6726846572437575, + "learning_rate": 9.243040361465172e-08, + "loss": 0.5638, + "step": 8938 + }, + { + "epoch": 0.94, + "grad_norm": 2.0645519294317953, + "learning_rate": 9.210451324039304e-08, + "loss": 0.6308, + "step": 8939 + }, + { + "epoch": 0.94, + "grad_norm": 2.4527040374452262, + "learning_rate": 9.177919305124405e-08, + "loss": 0.6753, + "step": 8940 + }, + { + "epoch": 0.94, + "grad_norm": 0.9497404981411139, + "learning_rate": 9.145444308500117e-08, + "loss": 0.5437, + "step": 8941 + }, + { + "epoch": 0.94, + "grad_norm": 5.86280566237663, + "learning_rate": 9.11302633793909e-08, + "loss": 0.5582, + "step": 8942 + }, + { + "epoch": 0.94, + "grad_norm": 2.231382224332527, + "learning_rate": 9.080665397207755e-08, + "loss": 0.5663, + "step": 8943 + }, + { + "epoch": 0.94, + "grad_norm": 2.383310389908959, + "learning_rate": 9.048361490065549e-08, + "loss": 0.5818, + "step": 8944 + }, + { + "epoch": 0.94, + "grad_norm": 2.365476329509127, + "learning_rate": 9.016114620265526e-08, + "loss": 0.6667, + "step": 8945 + }, + { + "epoch": 0.94, + "grad_norm": 2.6118574377559023, + "learning_rate": 8.983924791553966e-08, + "loss": 0.7023, + "step": 8946 + }, + { + "epoch": 0.94, + "grad_norm": 3.2390121343870013, + "learning_rate": 8.951792007670713e-08, + "loss": 0.5439, + "step": 8947 + }, + { + "epoch": 0.94, + "grad_norm": 2.824711199639862, + "learning_rate": 8.919716272348722e-08, + "loss": 0.6532, + "step": 8948 + }, + { + "epoch": 0.94, + "grad_norm": 2.7642587541543717, + "learning_rate": 8.88769758931457e-08, + "loss": 0.5559, + "step": 8949 + }, + { + "epoch": 0.94, + "grad_norm": 2.7721849377139645, + "learning_rate": 8.855735962288059e-08, + "loss": 0.6408, + "step": 8950 + }, + { + "epoch": 0.94, + "grad_norm": 12.581276150709563, + "learning_rate": 8.823831394982329e-08, + "loss": 0.5789, + "step": 8951 + }, + { + "epoch": 0.94, + "grad_norm": 2.6512425220755578, + "learning_rate": 8.791983891104084e-08, + "loss": 0.557, + "step": 8952 + }, + { + "epoch": 0.94, + "grad_norm": 2.4054321801371494, + "learning_rate": 8.760193454353194e-08, + "loss": 0.6467, + "step": 8953 + }, + { + "epoch": 0.94, + "grad_norm": 2.9064106195783275, + "learning_rate": 8.728460088422985e-08, + "loss": 0.5413, + "step": 8954 + }, + { + "epoch": 0.94, + "grad_norm": 2.5100181773055414, + "learning_rate": 8.696783797000174e-08, + "loss": 0.6023, + "step": 8955 + }, + { + "epoch": 0.94, + "grad_norm": 0.9356433470805314, + "learning_rate": 8.665164583764818e-08, + "loss": 0.5343, + "step": 8956 + }, + { + "epoch": 0.94, + "grad_norm": 2.9024523813801144, + "learning_rate": 8.633602452390311e-08, + "loss": 0.6028, + "step": 8957 + }, + { + "epoch": 0.94, + "grad_norm": 2.9298897593318474, + "learning_rate": 8.602097406543442e-08, + "loss": 0.6587, + "step": 8958 + }, + { + "epoch": 0.94, + "grad_norm": 2.6001966117983955, + "learning_rate": 8.570649449884505e-08, + "loss": 0.6481, + "step": 8959 + }, + { + "epoch": 0.94, + "grad_norm": 2.45160147679471, + "learning_rate": 8.539258586066912e-08, + "loss": 0.5524, + "step": 8960 + }, + { + "epoch": 0.94, + "grad_norm": 4.5237192425119686, + "learning_rate": 8.507924818737523e-08, + "loss": 0.7146, + "step": 8961 + }, + { + "epoch": 0.94, + "grad_norm": 2.5557536371206893, + "learning_rate": 8.476648151536704e-08, + "loss": 0.6013, + "step": 8962 + }, + { + "epoch": 0.94, + "grad_norm": 3.195273209399613, + "learning_rate": 8.445428588098048e-08, + "loss": 0.6442, + "step": 8963 + }, + { + "epoch": 0.94, + "grad_norm": 2.174929174236636, + "learning_rate": 8.414266132048543e-08, + "loss": 0.5563, + "step": 8964 + }, + { + "epoch": 0.94, + "grad_norm": 4.334437374273213, + "learning_rate": 8.383160787008627e-08, + "loss": 0.6087, + "step": 8965 + }, + { + "epoch": 0.94, + "grad_norm": 2.2982581898665733, + "learning_rate": 8.352112556591907e-08, + "loss": 0.5159, + "step": 8966 + }, + { + "epoch": 0.94, + "grad_norm": 2.485884461295592, + "learning_rate": 8.321121444405611e-08, + "loss": 0.6474, + "step": 8967 + }, + { + "epoch": 0.94, + "grad_norm": 0.9202366397635555, + "learning_rate": 8.29018745405008e-08, + "loss": 0.4943, + "step": 8968 + }, + { + "epoch": 0.94, + "grad_norm": 2.242007414517413, + "learning_rate": 8.259310589119162e-08, + "loss": 0.5891, + "step": 8969 + }, + { + "epoch": 0.94, + "grad_norm": 2.159210998912373, + "learning_rate": 8.2284908532001e-08, + "loss": 0.6245, + "step": 8970 + }, + { + "epoch": 0.94, + "grad_norm": 2.5880306782993467, + "learning_rate": 8.19772824987336e-08, + "loss": 0.5807, + "step": 8971 + }, + { + "epoch": 0.94, + "grad_norm": 0.9458064039539235, + "learning_rate": 8.167022782712919e-08, + "loss": 0.5854, + "step": 8972 + }, + { + "epoch": 0.94, + "grad_norm": 2.2865336633479743, + "learning_rate": 8.136374455286033e-08, + "loss": 0.6299, + "step": 8973 + }, + { + "epoch": 0.94, + "grad_norm": 0.9344831427818824, + "learning_rate": 8.105783271153356e-08, + "loss": 0.5415, + "step": 8974 + }, + { + "epoch": 0.94, + "grad_norm": 3.63891046584668, + "learning_rate": 8.075249233868821e-08, + "loss": 0.6369, + "step": 8975 + }, + { + "epoch": 0.94, + "grad_norm": 1.0932455433990051, + "learning_rate": 8.044772346979812e-08, + "loss": 0.5487, + "step": 8976 + }, + { + "epoch": 0.94, + "grad_norm": 0.8253603754509732, + "learning_rate": 8.014352614027054e-08, + "loss": 0.531, + "step": 8977 + }, + { + "epoch": 0.94, + "grad_norm": 2.3887697326667325, + "learning_rate": 7.983990038544664e-08, + "loss": 0.5317, + "step": 8978 + }, + { + "epoch": 0.94, + "grad_norm": 0.954945688802483, + "learning_rate": 7.953684624059987e-08, + "loss": 0.5603, + "step": 8979 + }, + { + "epoch": 0.94, + "grad_norm": 2.949484917320116, + "learning_rate": 7.923436374093929e-08, + "loss": 0.5994, + "step": 8980 + }, + { + "epoch": 0.95, + "grad_norm": 2.027629114424706, + "learning_rate": 7.893245292160511e-08, + "loss": 0.601, + "step": 8981 + }, + { + "epoch": 0.95, + "grad_norm": 2.3171055100942057, + "learning_rate": 7.863111381767374e-08, + "loss": 0.544, + "step": 8982 + }, + { + "epoch": 0.95, + "grad_norm": 2.5697202512374178, + "learning_rate": 7.833034646415272e-08, + "loss": 0.6016, + "step": 8983 + }, + { + "epoch": 0.95, + "grad_norm": 2.502909881767718, + "learning_rate": 7.80301508959852e-08, + "loss": 0.6382, + "step": 8984 + }, + { + "epoch": 0.95, + "grad_norm": 2.4985875211152675, + "learning_rate": 7.773052714804719e-08, + "loss": 0.5886, + "step": 8985 + }, + { + "epoch": 0.95, + "grad_norm": 2.9604096707667176, + "learning_rate": 7.743147525514749e-08, + "loss": 0.6592, + "step": 8986 + }, + { + "epoch": 0.95, + "grad_norm": 2.074366972373142, + "learning_rate": 7.713299525202944e-08, + "loss": 0.5226, + "step": 8987 + }, + { + "epoch": 0.95, + "grad_norm": 4.844749218296161, + "learning_rate": 7.683508717336918e-08, + "loss": 0.6071, + "step": 8988 + }, + { + "epoch": 0.95, + "grad_norm": 2.591141612749998, + "learning_rate": 7.653775105377737e-08, + "loss": 0.6346, + "step": 8989 + }, + { + "epoch": 0.95, + "grad_norm": 5.626446005912948, + "learning_rate": 7.62409869277969e-08, + "loss": 0.6454, + "step": 8990 + }, + { + "epoch": 0.95, + "grad_norm": 2.5111955019235888, + "learning_rate": 7.59447948299058e-08, + "loss": 0.6752, + "step": 8991 + }, + { + "epoch": 0.95, + "grad_norm": 3.0568359259106237, + "learning_rate": 7.564917479451373e-08, + "loss": 0.617, + "step": 8992 + }, + { + "epoch": 0.95, + "grad_norm": 2.22905556576431, + "learning_rate": 7.535412685596599e-08, + "loss": 0.658, + "step": 8993 + }, + { + "epoch": 0.95, + "grad_norm": 3.687646814338487, + "learning_rate": 7.505965104854073e-08, + "loss": 0.7067, + "step": 8994 + }, + { + "epoch": 0.95, + "grad_norm": 2.1712557178132945, + "learning_rate": 7.476574740644838e-08, + "loss": 0.5687, + "step": 8995 + }, + { + "epoch": 0.95, + "grad_norm": 3.390681967447077, + "learning_rate": 7.44724159638338e-08, + "loss": 0.6088, + "step": 8996 + }, + { + "epoch": 0.95, + "grad_norm": 1.0375781519569047, + "learning_rate": 7.417965675477534e-08, + "loss": 0.541, + "step": 8997 + }, + { + "epoch": 0.95, + "grad_norm": 2.695143332239307, + "learning_rate": 7.388746981328632e-08, + "loss": 0.5657, + "step": 8998 + }, + { + "epoch": 0.95, + "grad_norm": 3.5000885655908025, + "learning_rate": 7.359585517331014e-08, + "loss": 0.643, + "step": 8999 + }, + { + "epoch": 0.95, + "grad_norm": 2.556164468290286, + "learning_rate": 7.330481286872749e-08, + "loss": 0.5475, + "step": 9000 + }, + { + "epoch": 0.95, + "grad_norm": 0.8838512097484564, + "learning_rate": 7.301434293334908e-08, + "loss": 0.5353, + "step": 9001 + }, + { + "epoch": 0.95, + "grad_norm": 2.274766804514225, + "learning_rate": 7.272444540092294e-08, + "loss": 0.5828, + "step": 9002 + }, + { + "epoch": 0.95, + "grad_norm": 2.1596373340527326, + "learning_rate": 7.243512030512656e-08, + "loss": 0.6407, + "step": 9003 + }, + { + "epoch": 0.95, + "grad_norm": 2.3934408334922597, + "learning_rate": 7.214636767957417e-08, + "loss": 0.5951, + "step": 9004 + }, + { + "epoch": 0.95, + "grad_norm": 2.1077351249376233, + "learning_rate": 7.18581875578117e-08, + "loss": 0.5951, + "step": 9005 + }, + { + "epoch": 0.95, + "grad_norm": 2.7426426194662987, + "learning_rate": 7.157057997331907e-08, + "loss": 0.5838, + "step": 9006 + }, + { + "epoch": 0.95, + "grad_norm": 3.3577955817656235, + "learning_rate": 7.128354495951006e-08, + "loss": 0.6553, + "step": 9007 + }, + { + "epoch": 0.95, + "grad_norm": 3.089592673711738, + "learning_rate": 7.099708254973136e-08, + "loss": 0.542, + "step": 9008 + }, + { + "epoch": 0.95, + "grad_norm": 2.6306257005655937, + "learning_rate": 7.071119277726301e-08, + "loss": 0.5938, + "step": 9009 + }, + { + "epoch": 0.95, + "grad_norm": 4.773569073948341, + "learning_rate": 7.0425875675319e-08, + "loss": 0.6349, + "step": 9010 + }, + { + "epoch": 0.95, + "grad_norm": 2.1860135695945595, + "learning_rate": 7.014113127704725e-08, + "loss": 0.5982, + "step": 9011 + }, + { + "epoch": 0.95, + "grad_norm": 3.076079644508135, + "learning_rate": 6.985695961552796e-08, + "loss": 0.6265, + "step": 9012 + }, + { + "epoch": 0.95, + "grad_norm": 0.9415297800104788, + "learning_rate": 6.957336072377586e-08, + "loss": 0.5714, + "step": 9013 + }, + { + "epoch": 0.95, + "grad_norm": 2.7068897773748826, + "learning_rate": 6.929033463473789e-08, + "loss": 0.5468, + "step": 9014 + }, + { + "epoch": 0.95, + "grad_norm": 2.2176189144911542, + "learning_rate": 6.900788138129554e-08, + "loss": 0.6505, + "step": 9015 + }, + { + "epoch": 0.95, + "grad_norm": 2.3975049310120897, + "learning_rate": 6.872600099626369e-08, + "loss": 0.5567, + "step": 9016 + }, + { + "epoch": 0.95, + "grad_norm": 0.9140350212678829, + "learning_rate": 6.844469351239003e-08, + "loss": 0.5619, + "step": 9017 + }, + { + "epoch": 0.95, + "grad_norm": 2.5388047256734287, + "learning_rate": 6.816395896235617e-08, + "loss": 0.6404, + "step": 9018 + }, + { + "epoch": 0.95, + "grad_norm": 2.61129763755357, + "learning_rate": 6.78837973787766e-08, + "loss": 0.6912, + "step": 9019 + }, + { + "epoch": 0.95, + "grad_norm": 2.7678872176224463, + "learning_rate": 6.760420879420082e-08, + "loss": 0.5791, + "step": 9020 + }, + { + "epoch": 0.95, + "grad_norm": 5.085235281927168, + "learning_rate": 6.732519324111009e-08, + "loss": 0.6828, + "step": 9021 + }, + { + "epoch": 0.95, + "grad_norm": 2.045301446382808, + "learning_rate": 6.704675075191902e-08, + "loss": 0.6762, + "step": 9022 + }, + { + "epoch": 0.95, + "grad_norm": 3.2248231907993987, + "learning_rate": 6.676888135897674e-08, + "loss": 0.649, + "step": 9023 + }, + { + "epoch": 0.95, + "grad_norm": 2.704494653374823, + "learning_rate": 6.649158509456576e-08, + "loss": 0.5956, + "step": 9024 + }, + { + "epoch": 0.95, + "grad_norm": 0.9722338123614437, + "learning_rate": 6.621486199090088e-08, + "loss": 0.5335, + "step": 9025 + }, + { + "epoch": 0.95, + "grad_norm": 2.591910857960895, + "learning_rate": 6.593871208013136e-08, + "loss": 0.6333, + "step": 9026 + }, + { + "epoch": 0.95, + "grad_norm": 2.8111496930068176, + "learning_rate": 6.566313539433877e-08, + "loss": 0.6471, + "step": 9027 + }, + { + "epoch": 0.95, + "grad_norm": 2.432987898988874, + "learning_rate": 6.538813196553973e-08, + "loss": 0.5657, + "step": 9028 + }, + { + "epoch": 0.95, + "grad_norm": 2.439770278419752, + "learning_rate": 6.511370182568311e-08, + "loss": 0.6174, + "step": 9029 + }, + { + "epoch": 0.95, + "grad_norm": 2.3437842495883623, + "learning_rate": 6.483984500665119e-08, + "loss": 0.5372, + "step": 9030 + }, + { + "epoch": 0.95, + "grad_norm": 2.638108993601756, + "learning_rate": 6.456656154025964e-08, + "loss": 0.6398, + "step": 9031 + }, + { + "epoch": 0.95, + "grad_norm": 2.5417156476529437, + "learning_rate": 6.429385145825861e-08, + "loss": 0.5727, + "step": 9032 + }, + { + "epoch": 0.95, + "grad_norm": 2.6367489893136846, + "learning_rate": 6.402171479233e-08, + "loss": 0.6402, + "step": 9033 + }, + { + "epoch": 0.95, + "grad_norm": 2.001287522657275, + "learning_rate": 6.375015157409015e-08, + "loss": 0.6013, + "step": 9034 + }, + { + "epoch": 0.95, + "grad_norm": 2.4098090763309523, + "learning_rate": 6.347916183508828e-08, + "loss": 0.5185, + "step": 9035 + }, + { + "epoch": 0.95, + "grad_norm": 2.6963976959673808, + "learning_rate": 6.320874560680757e-08, + "loss": 0.7315, + "step": 9036 + }, + { + "epoch": 0.95, + "grad_norm": 0.8986435359967851, + "learning_rate": 6.293890292066395e-08, + "loss": 0.5098, + "step": 9037 + }, + { + "epoch": 0.95, + "grad_norm": 3.122976565906962, + "learning_rate": 6.266963380800684e-08, + "loss": 0.6318, + "step": 9038 + }, + { + "epoch": 0.95, + "grad_norm": 2.4548075643836396, + "learning_rate": 6.24009383001195e-08, + "loss": 0.6208, + "step": 9039 + }, + { + "epoch": 0.95, + "grad_norm": 3.0915406479900294, + "learning_rate": 6.213281642821811e-08, + "loss": 0.562, + "step": 9040 + }, + { + "epoch": 0.95, + "grad_norm": 2.875966474710966, + "learning_rate": 6.186526822345163e-08, + "loss": 0.6213, + "step": 9041 + }, + { + "epoch": 0.95, + "grad_norm": 3.426133515510632, + "learning_rate": 6.159829371690407e-08, + "loss": 0.5456, + "step": 9042 + }, + { + "epoch": 0.95, + "grad_norm": 2.21552519795364, + "learning_rate": 6.133189293959175e-08, + "loss": 0.5443, + "step": 9043 + }, + { + "epoch": 0.95, + "grad_norm": 3.182803894427842, + "learning_rate": 6.106606592246267e-08, + "loss": 0.7024, + "step": 9044 + }, + { + "epoch": 0.95, + "grad_norm": 2.838797575657555, + "learning_rate": 6.08008126964016e-08, + "loss": 0.5106, + "step": 9045 + }, + { + "epoch": 0.95, + "grad_norm": 2.464352882434948, + "learning_rate": 6.053613329222441e-08, + "loss": 0.6797, + "step": 9046 + }, + { + "epoch": 0.95, + "grad_norm": 2.5403063737877813, + "learning_rate": 6.027202774068042e-08, + "loss": 0.6741, + "step": 9047 + }, + { + "epoch": 0.95, + "grad_norm": 2.275405839493515, + "learning_rate": 6.00084960724534e-08, + "loss": 0.5259, + "step": 9048 + }, + { + "epoch": 0.95, + "grad_norm": 2.4141326847728526, + "learning_rate": 5.974553831815888e-08, + "loss": 0.6703, + "step": 9049 + }, + { + "epoch": 0.95, + "grad_norm": 2.7803880388763815, + "learning_rate": 5.9483154508347406e-08, + "loss": 0.6808, + "step": 9050 + }, + { + "epoch": 0.95, + "grad_norm": 2.32854036522692, + "learning_rate": 5.9221344673500714e-08, + "loss": 0.6318, + "step": 9051 + }, + { + "epoch": 0.95, + "grad_norm": 2.7007162566736373, + "learning_rate": 5.896010884403669e-08, + "loss": 0.6192, + "step": 9052 + }, + { + "epoch": 0.95, + "grad_norm": 2.6424830019293544, + "learning_rate": 5.8699447050303284e-08, + "loss": 0.6002, + "step": 9053 + }, + { + "epoch": 0.95, + "grad_norm": 1.9699800972674935, + "learning_rate": 5.84393593225846e-08, + "loss": 0.5143, + "step": 9054 + }, + { + "epoch": 0.95, + "grad_norm": 2.1870238921593677, + "learning_rate": 5.817984569109702e-08, + "loss": 0.6391, + "step": 9055 + }, + { + "epoch": 0.95, + "grad_norm": 2.3894652143809623, + "learning_rate": 5.792090618598922e-08, + "loss": 0.5925, + "step": 9056 + }, + { + "epoch": 0.95, + "grad_norm": 2.0806139077721775, + "learning_rate": 5.766254083734435e-08, + "loss": 0.6226, + "step": 9057 + }, + { + "epoch": 0.95, + "grad_norm": 2.0742443711293004, + "learning_rate": 5.740474967517839e-08, + "loss": 0.7206, + "step": 9058 + }, + { + "epoch": 0.95, + "grad_norm": 4.751056686871944, + "learning_rate": 5.714753272944129e-08, + "loss": 0.63, + "step": 9059 + }, + { + "epoch": 0.95, + "grad_norm": 2.9924140950273572, + "learning_rate": 5.68908900300158e-08, + "loss": 0.5352, + "step": 9060 + }, + { + "epoch": 0.95, + "grad_norm": 2.401100392375077, + "learning_rate": 5.6634821606717514e-08, + "loss": 0.6337, + "step": 9061 + }, + { + "epoch": 0.95, + "grad_norm": 3.3084059799813343, + "learning_rate": 5.6379327489295424e-08, + "loss": 0.5915, + "step": 9062 + }, + { + "epoch": 0.95, + "grad_norm": 2.4362237430967926, + "learning_rate": 5.6124407707432436e-08, + "loss": 0.6301, + "step": 9063 + }, + { + "epoch": 0.95, + "grad_norm": 2.974258772040746, + "learning_rate": 5.5870062290744876e-08, + "loss": 0.6755, + "step": 9064 + }, + { + "epoch": 0.95, + "grad_norm": 2.330894208256594, + "learning_rate": 5.5616291268781875e-08, + "loss": 0.6224, + "step": 9065 + }, + { + "epoch": 0.95, + "grad_norm": 2.1719011136515, + "learning_rate": 5.53630946710243e-08, + "loss": 0.6096, + "step": 9066 + }, + { + "epoch": 0.95, + "grad_norm": 2.645578213445915, + "learning_rate": 5.5110472526889725e-08, + "loss": 0.5871, + "step": 9067 + }, + { + "epoch": 0.95, + "grad_norm": 2.176097941547101, + "learning_rate": 5.485842486572579e-08, + "loss": 0.6087, + "step": 9068 + }, + { + "epoch": 0.95, + "grad_norm": 2.84982524682506, + "learning_rate": 5.4606951716815735e-08, + "loss": 0.6064, + "step": 9069 + }, + { + "epoch": 0.95, + "grad_norm": 2.4460753590123403, + "learning_rate": 5.435605310937342e-08, + "loss": 0.6023, + "step": 9070 + }, + { + "epoch": 0.95, + "grad_norm": 3.2859655207695355, + "learning_rate": 5.410572907254885e-08, + "loss": 0.6332, + "step": 9071 + }, + { + "epoch": 0.95, + "grad_norm": 2.7744254402541415, + "learning_rate": 5.3855979635423774e-08, + "loss": 0.6184, + "step": 9072 + }, + { + "epoch": 0.95, + "grad_norm": 2.2908907666964726, + "learning_rate": 5.360680482701275e-08, + "loss": 0.6022, + "step": 9073 + }, + { + "epoch": 0.95, + "grad_norm": 2.659397892050746, + "learning_rate": 5.3358204676264844e-08, + "loss": 0.5827, + "step": 9074 + }, + { + "epoch": 0.95, + "grad_norm": 2.568905156288951, + "learning_rate": 5.3110179212061406e-08, + "loss": 0.6192, + "step": 9075 + }, + { + "epoch": 0.96, + "grad_norm": 2.292649322204462, + "learning_rate": 5.286272846321716e-08, + "loss": 0.5548, + "step": 9076 + }, + { + "epoch": 0.96, + "grad_norm": 3.0132313937949267, + "learning_rate": 5.2615852458480775e-08, + "loss": 0.6589, + "step": 9077 + }, + { + "epoch": 0.96, + "grad_norm": 4.892679189485966, + "learning_rate": 5.23695512265332e-08, + "loss": 0.5454, + "step": 9078 + }, + { + "epoch": 0.96, + "grad_norm": 2.2508615371235727, + "learning_rate": 5.2123824795988764e-08, + "loss": 0.6351, + "step": 9079 + }, + { + "epoch": 0.96, + "grad_norm": 4.9551130391975455, + "learning_rate": 5.187867319539519e-08, + "loss": 0.6406, + "step": 9080 + }, + { + "epoch": 0.96, + "grad_norm": 10.096299527370705, + "learning_rate": 5.163409645323414e-08, + "loss": 0.6185, + "step": 9081 + }, + { + "epoch": 0.96, + "grad_norm": 3.0533996853064957, + "learning_rate": 5.139009459791955e-08, + "loss": 0.5818, + "step": 9082 + }, + { + "epoch": 0.96, + "grad_norm": 2.9947071352082357, + "learning_rate": 5.1146667657798744e-08, + "loss": 0.5878, + "step": 9083 + }, + { + "epoch": 0.96, + "grad_norm": 2.6515995229170928, + "learning_rate": 5.0903815661152435e-08, + "loss": 0.6047, + "step": 9084 + }, + { + "epoch": 0.96, + "grad_norm": 2.216360216245191, + "learning_rate": 5.0661538636194164e-08, + "loss": 0.6247, + "step": 9085 + }, + { + "epoch": 0.96, + "grad_norm": 3.268655449300178, + "learning_rate": 5.041983661107142e-08, + "loss": 0.622, + "step": 9086 + }, + { + "epoch": 0.96, + "grad_norm": 21.215101180819993, + "learning_rate": 5.017870961386451e-08, + "loss": 0.568, + "step": 9087 + }, + { + "epoch": 0.96, + "grad_norm": 2.415400161597065, + "learning_rate": 4.9938157672586585e-08, + "loss": 0.664, + "step": 9088 + }, + { + "epoch": 0.96, + "grad_norm": 2.498930091348122, + "learning_rate": 4.9698180815183626e-08, + "loss": 0.6175, + "step": 9089 + }, + { + "epoch": 0.96, + "grad_norm": 2.2002817993561457, + "learning_rate": 4.945877906953722e-08, + "loss": 0.5586, + "step": 9090 + }, + { + "epoch": 0.96, + "grad_norm": 2.936573265428427, + "learning_rate": 4.921995246345901e-08, + "loss": 0.5832, + "step": 9091 + }, + { + "epoch": 0.96, + "grad_norm": 2.2660732599551445, + "learning_rate": 4.898170102469513e-08, + "loss": 0.6299, + "step": 9092 + }, + { + "epoch": 0.96, + "grad_norm": 2.3604212255644623, + "learning_rate": 4.87440247809251e-08, + "loss": 0.6544, + "step": 9093 + }, + { + "epoch": 0.96, + "grad_norm": 2.441174650138323, + "learning_rate": 4.850692375976185e-08, + "loss": 0.6546, + "step": 9094 + }, + { + "epoch": 0.96, + "grad_norm": 4.559330779198589, + "learning_rate": 4.827039798875111e-08, + "loss": 0.5868, + "step": 9095 + }, + { + "epoch": 0.96, + "grad_norm": 2.6264220920489487, + "learning_rate": 4.803444749537145e-08, + "loss": 0.592, + "step": 9096 + }, + { + "epoch": 0.96, + "grad_norm": 2.346201879455138, + "learning_rate": 4.7799072307034845e-08, + "loss": 0.5913, + "step": 9097 + }, + { + "epoch": 0.96, + "grad_norm": 2.2187162896332735, + "learning_rate": 4.756427245108664e-08, + "loss": 0.5922, + "step": 9098 + }, + { + "epoch": 0.96, + "grad_norm": 2.665266994162754, + "learning_rate": 4.733004795480556e-08, + "loss": 0.5226, + "step": 9099 + }, + { + "epoch": 0.96, + "grad_norm": 2.309703067456407, + "learning_rate": 4.709639884540262e-08, + "loss": 0.6754, + "step": 9100 + }, + { + "epoch": 0.96, + "grad_norm": 3.519489921324912, + "learning_rate": 4.686332515002223e-08, + "loss": 0.5979, + "step": 9101 + }, + { + "epoch": 0.96, + "grad_norm": 3.4734030677173644, + "learning_rate": 4.663082689574328e-08, + "loss": 0.5856, + "step": 9102 + }, + { + "epoch": 0.96, + "grad_norm": 0.9763445326065605, + "learning_rate": 4.6398904109575815e-08, + "loss": 0.5201, + "step": 9103 + }, + { + "epoch": 0.96, + "grad_norm": 2.2503233196738193, + "learning_rate": 4.616755681846441e-08, + "loss": 0.543, + "step": 9104 + }, + { + "epoch": 0.96, + "grad_norm": 2.2179469381251273, + "learning_rate": 4.593678504928589e-08, + "loss": 0.6158, + "step": 9105 + }, + { + "epoch": 0.96, + "grad_norm": 2.623231471830531, + "learning_rate": 4.570658882885104e-08, + "loss": 0.6553, + "step": 9106 + }, + { + "epoch": 0.96, + "grad_norm": 2.50946392413075, + "learning_rate": 4.547696818390346e-08, + "loss": 0.5586, + "step": 9107 + }, + { + "epoch": 0.96, + "grad_norm": 2.2550240930054586, + "learning_rate": 4.524792314111959e-08, + "loss": 0.6628, + "step": 9108 + }, + { + "epoch": 0.96, + "grad_norm": 2.249364536937027, + "learning_rate": 4.501945372710925e-08, + "loss": 0.5307, + "step": 9109 + }, + { + "epoch": 0.96, + "grad_norm": 2.641533439490308, + "learning_rate": 4.4791559968415664e-08, + "loss": 0.581, + "step": 9110 + }, + { + "epoch": 0.96, + "grad_norm": 2.400735266374335, + "learning_rate": 4.456424189151376e-08, + "loss": 0.5803, + "step": 9111 + }, + { + "epoch": 0.96, + "grad_norm": 2.013311376301837, + "learning_rate": 4.433749952281463e-08, + "loss": 0.5211, + "step": 9112 + }, + { + "epoch": 0.96, + "grad_norm": 2.6401002509833007, + "learning_rate": 4.4111332888658876e-08, + "loss": 0.6004, + "step": 9113 + }, + { + "epoch": 0.96, + "grad_norm": 0.9931452731017555, + "learning_rate": 4.388574201532214e-08, + "loss": 0.5143, + "step": 9114 + }, + { + "epoch": 0.96, + "grad_norm": 2.6549315124291772, + "learning_rate": 4.366072692901346e-08, + "loss": 0.6261, + "step": 9115 + }, + { + "epoch": 0.96, + "grad_norm": 3.012204151927376, + "learning_rate": 4.343628765587471e-08, + "loss": 0.6443, + "step": 9116 + }, + { + "epoch": 0.96, + "grad_norm": 1.0512576287150874, + "learning_rate": 4.321242422197946e-08, + "loss": 0.5194, + "step": 9117 + }, + { + "epoch": 0.96, + "grad_norm": 2.609812624572654, + "learning_rate": 4.298913665333637e-08, + "loss": 0.5724, + "step": 9118 + }, + { + "epoch": 0.96, + "grad_norm": 3.067384788809318, + "learning_rate": 4.276642497588579e-08, + "loss": 0.5614, + "step": 9119 + }, + { + "epoch": 0.96, + "grad_norm": 3.4934409570440055, + "learning_rate": 4.2544289215502576e-08, + "loss": 0.5847, + "step": 9120 + }, + { + "epoch": 0.96, + "grad_norm": 2.470305736923208, + "learning_rate": 4.2322729397992755e-08, + "loss": 0.6099, + "step": 9121 + }, + { + "epoch": 0.96, + "grad_norm": 4.735639182147766, + "learning_rate": 4.210174554909796e-08, + "loss": 0.6468, + "step": 9122 + }, + { + "epoch": 0.96, + "grad_norm": 2.4294443877199807, + "learning_rate": 4.188133769448932e-08, + "loss": 0.5507, + "step": 9123 + }, + { + "epoch": 0.96, + "grad_norm": 3.268281292601878, + "learning_rate": 4.1661505859775245e-08, + "loss": 0.6073, + "step": 9124 + }, + { + "epoch": 0.96, + "grad_norm": 3.292116636642608, + "learning_rate": 4.1442250070494186e-08, + "loss": 0.625, + "step": 9125 + }, + { + "epoch": 0.96, + "grad_norm": 3.0214437747145553, + "learning_rate": 4.1223570352118545e-08, + "loss": 0.5964, + "step": 9126 + }, + { + "epoch": 0.96, + "grad_norm": 3.237561901276038, + "learning_rate": 4.100546673005412e-08, + "loss": 0.6468, + "step": 9127 + }, + { + "epoch": 0.96, + "grad_norm": 3.1688911358412177, + "learning_rate": 4.078793922963953e-08, + "loss": 0.6302, + "step": 9128 + }, + { + "epoch": 0.96, + "grad_norm": 3.447810752311471, + "learning_rate": 4.057098787614677e-08, + "loss": 0.5529, + "step": 9129 + }, + { + "epoch": 0.96, + "grad_norm": 2.636676069560152, + "learning_rate": 4.035461269478014e-08, + "loss": 0.653, + "step": 9130 + }, + { + "epoch": 0.96, + "grad_norm": 1.8829981767535657, + "learning_rate": 4.013881371067841e-08, + "loss": 0.617, + "step": 9131 + }, + { + "epoch": 0.96, + "grad_norm": 3.756084256939002, + "learning_rate": 3.992359094891096e-08, + "loss": 0.5825, + "step": 9132 + }, + { + "epoch": 0.96, + "grad_norm": 2.2300017538671466, + "learning_rate": 3.970894443448281e-08, + "loss": 0.5501, + "step": 9133 + }, + { + "epoch": 0.96, + "grad_norm": 3.0651156169351963, + "learning_rate": 3.949487419233122e-08, + "loss": 0.5455, + "step": 9134 + }, + { + "epoch": 0.96, + "grad_norm": 2.4150439079741504, + "learning_rate": 3.92813802473252e-08, + "loss": 0.6961, + "step": 9135 + }, + { + "epoch": 0.96, + "grad_norm": 3.197514010373594, + "learning_rate": 3.906846262426878e-08, + "loss": 0.6243, + "step": 9136 + }, + { + "epoch": 0.96, + "grad_norm": 2.122714920113005, + "learning_rate": 3.885612134789718e-08, + "loss": 0.6401, + "step": 9137 + }, + { + "epoch": 0.96, + "grad_norm": 0.9331466918924591, + "learning_rate": 3.864435644288123e-08, + "loss": 0.5596, + "step": 9138 + }, + { + "epoch": 0.96, + "grad_norm": 3.5512512202448434, + "learning_rate": 3.8433167933821234e-08, + "loss": 0.6279, + "step": 9139 + }, + { + "epoch": 0.96, + "grad_norm": 2.3384806493446684, + "learning_rate": 3.822255584525369e-08, + "loss": 0.6029, + "step": 9140 + }, + { + "epoch": 0.96, + "grad_norm": 2.7384044773724057, + "learning_rate": 3.8012520201646255e-08, + "loss": 0.5762, + "step": 9141 + }, + { + "epoch": 0.96, + "grad_norm": 2.537480825998291, + "learning_rate": 3.780306102740105e-08, + "loss": 0.6246, + "step": 9142 + }, + { + "epoch": 0.96, + "grad_norm": 2.535534515356477, + "learning_rate": 3.7594178346851974e-08, + "loss": 0.6051, + "step": 9143 + }, + { + "epoch": 0.96, + "grad_norm": 2.6217629251128436, + "learning_rate": 3.738587218426626e-08, + "loss": 0.5192, + "step": 9144 + }, + { + "epoch": 0.96, + "grad_norm": 3.930611934484491, + "learning_rate": 3.7178142563844e-08, + "loss": 0.554, + "step": 9145 + }, + { + "epoch": 0.96, + "grad_norm": 2.4744564645686906, + "learning_rate": 3.697098950971922e-08, + "loss": 0.6043, + "step": 9146 + }, + { + "epoch": 0.96, + "grad_norm": 2.800360974943865, + "learning_rate": 3.676441304595879e-08, + "loss": 0.5262, + "step": 9147 + }, + { + "epoch": 0.96, + "grad_norm": 2.2752239967517283, + "learning_rate": 3.655841319656128e-08, + "loss": 0.641, + "step": 9148 + }, + { + "epoch": 0.96, + "grad_norm": 2.9374913960050826, + "learning_rate": 3.635298998545922e-08, + "loss": 0.62, + "step": 9149 + }, + { + "epoch": 0.96, + "grad_norm": 2.780995594486591, + "learning_rate": 3.614814343651851e-08, + "loss": 0.6157, + "step": 9150 + }, + { + "epoch": 0.96, + "grad_norm": 2.194471742530128, + "learning_rate": 3.5943873573537903e-08, + "loss": 0.5811, + "step": 9151 + }, + { + "epoch": 0.96, + "grad_norm": 2.3614260129521387, + "learning_rate": 3.574018042024785e-08, + "loss": 0.5898, + "step": 9152 + }, + { + "epoch": 0.96, + "grad_norm": 3.281118604888936, + "learning_rate": 3.553706400031331e-08, + "loss": 0.624, + "step": 9153 + }, + { + "epoch": 0.96, + "grad_norm": 3.4511839932038293, + "learning_rate": 3.533452433733209e-08, + "loss": 0.6539, + "step": 9154 + }, + { + "epoch": 0.96, + "grad_norm": 3.0189891199871286, + "learning_rate": 3.513256145483479e-08, + "loss": 0.5865, + "step": 9155 + }, + { + "epoch": 0.96, + "grad_norm": 5.371611401435088, + "learning_rate": 3.493117537628432e-08, + "loss": 0.6437, + "step": 9156 + }, + { + "epoch": 0.96, + "grad_norm": 0.8768550617990599, + "learning_rate": 3.4730366125076966e-08, + "loss": 0.5259, + "step": 9157 + }, + { + "epoch": 0.96, + "grad_norm": 4.802661184621188, + "learning_rate": 3.453013372454295e-08, + "loss": 0.5858, + "step": 9158 + }, + { + "epoch": 0.96, + "grad_norm": 2.4916523907020736, + "learning_rate": 3.433047819794366e-08, + "loss": 0.6361, + "step": 9159 + }, + { + "epoch": 0.96, + "grad_norm": 2.3170062905496733, + "learning_rate": 3.413139956847611e-08, + "loss": 0.6925, + "step": 9160 + }, + { + "epoch": 0.96, + "grad_norm": 3.102122288265513, + "learning_rate": 3.3932897859267346e-08, + "loss": 0.6089, + "step": 9161 + }, + { + "epoch": 0.96, + "grad_norm": 2.212048736855744, + "learning_rate": 3.3734973093378367e-08, + "loss": 0.678, + "step": 9162 + }, + { + "epoch": 0.96, + "grad_norm": 3.5526846645578467, + "learning_rate": 3.353762529380466e-08, + "loss": 0.6322, + "step": 9163 + }, + { + "epoch": 0.96, + "grad_norm": 2.6953009571329827, + "learning_rate": 3.334085448347346e-08, + "loss": 0.6071, + "step": 9164 + }, + { + "epoch": 0.96, + "grad_norm": 2.09081032023049, + "learning_rate": 3.314466068524425e-08, + "loss": 0.5649, + "step": 9165 + }, + { + "epoch": 0.96, + "grad_norm": 2.8019153487339916, + "learning_rate": 3.294904392191045e-08, + "loss": 0.6405, + "step": 9166 + }, + { + "epoch": 0.96, + "grad_norm": 2.2938098097617368, + "learning_rate": 3.27540042161989e-08, + "loss": 0.6646, + "step": 9167 + }, + { + "epoch": 0.96, + "grad_norm": 2.475187200092886, + "learning_rate": 3.255954159076813e-08, + "loss": 0.5879, + "step": 9168 + }, + { + "epoch": 0.96, + "grad_norm": 3.1691062969062194, + "learning_rate": 3.236565606821007e-08, + "loss": 0.5371, + "step": 9169 + }, + { + "epoch": 0.96, + "grad_norm": 2.4161686022837277, + "learning_rate": 3.2172347671050596e-08, + "loss": 0.6002, + "step": 9170 + }, + { + "epoch": 0.97, + "grad_norm": 2.853074919404103, + "learning_rate": 3.197961642174674e-08, + "loss": 0.5644, + "step": 9171 + }, + { + "epoch": 0.97, + "grad_norm": 4.210820383910226, + "learning_rate": 3.1787462342690036e-08, + "loss": 0.6206, + "step": 9172 + }, + { + "epoch": 0.97, + "grad_norm": 2.628828526440614, + "learning_rate": 3.1595885456204845e-08, + "loss": 0.6029, + "step": 9173 + }, + { + "epoch": 0.97, + "grad_norm": 2.5487802779308995, + "learning_rate": 3.1404885784547256e-08, + "loss": 0.645, + "step": 9174 + }, + { + "epoch": 0.97, + "grad_norm": 0.9690656931951284, + "learning_rate": 3.1214463349907295e-08, + "loss": 0.5252, + "step": 9175 + }, + { + "epoch": 0.97, + "grad_norm": 1.965446772712665, + "learning_rate": 3.102461817440727e-08, + "loss": 0.6123, + "step": 9176 + }, + { + "epoch": 0.97, + "grad_norm": 2.6886931228239357, + "learning_rate": 3.083535028010343e-08, + "loss": 0.6453, + "step": 9177 + }, + { + "epoch": 0.97, + "grad_norm": 5.884853895703779, + "learning_rate": 3.064665968898428e-08, + "loss": 0.7053, + "step": 9178 + }, + { + "epoch": 0.97, + "grad_norm": 3.1113257956144733, + "learning_rate": 3.045854642297175e-08, + "loss": 0.5508, + "step": 9179 + }, + { + "epoch": 0.97, + "grad_norm": 2.493207760394977, + "learning_rate": 3.0271010503918896e-08, + "loss": 0.5488, + "step": 9180 + }, + { + "epoch": 0.97, + "grad_norm": 2.3766479319028724, + "learning_rate": 3.0084051953614414e-08, + "loss": 0.6282, + "step": 9181 + }, + { + "epoch": 0.97, + "grad_norm": 2.390922910780254, + "learning_rate": 2.98976707937787e-08, + "loss": 0.5949, + "step": 9182 + }, + { + "epoch": 0.97, + "grad_norm": 4.637454787285773, + "learning_rate": 2.971186704606388e-08, + "loss": 0.6665, + "step": 9183 + }, + { + "epoch": 0.97, + "grad_norm": 3.1213890226656327, + "learning_rate": 2.9526640732056577e-08, + "loss": 0.51, + "step": 9184 + }, + { + "epoch": 0.97, + "grad_norm": 2.490219535861415, + "learning_rate": 2.9341991873276244e-08, + "loss": 0.6355, + "step": 9185 + }, + { + "epoch": 0.97, + "grad_norm": 2.218227767506622, + "learning_rate": 2.9157920491174606e-08, + "loss": 0.683, + "step": 9186 + }, + { + "epoch": 0.97, + "grad_norm": 2.2005059135053537, + "learning_rate": 2.8974426607136784e-08, + "loss": 0.5855, + "step": 9187 + }, + { + "epoch": 0.97, + "grad_norm": 2.519798208810904, + "learning_rate": 2.8791510242480168e-08, + "loss": 0.6008, + "step": 9188 + }, + { + "epoch": 0.97, + "grad_norm": 1.0120446604343019, + "learning_rate": 2.8609171418454985e-08, + "loss": 0.5326, + "step": 9189 + }, + { + "epoch": 0.97, + "grad_norm": 2.450219497355183, + "learning_rate": 2.842741015624595e-08, + "loss": 0.6363, + "step": 9190 + }, + { + "epoch": 0.97, + "grad_norm": 3.052648319059015, + "learning_rate": 2.824622647696895e-08, + "loss": 0.5779, + "step": 9191 + }, + { + "epoch": 0.97, + "grad_norm": 3.335643356301621, + "learning_rate": 2.8065620401673823e-08, + "loss": 0.5543, + "step": 9192 + }, + { + "epoch": 0.97, + "grad_norm": 2.8344656986626906, + "learning_rate": 2.7885591951342104e-08, + "loss": 0.549, + "step": 9193 + }, + { + "epoch": 0.97, + "grad_norm": 3.599346650297371, + "learning_rate": 2.77061411468893e-08, + "loss": 0.6037, + "step": 9194 + }, + { + "epoch": 0.97, + "grad_norm": 3.0866091554980106, + "learning_rate": 2.752726800916372e-08, + "loss": 0.5579, + "step": 9195 + }, + { + "epoch": 0.97, + "grad_norm": 2.5707910612614655, + "learning_rate": 2.734897255894653e-08, + "loss": 0.6654, + "step": 9196 + }, + { + "epoch": 0.97, + "grad_norm": 2.5872290746708435, + "learning_rate": 2.71712548169506e-08, + "loss": 0.6281, + "step": 9197 + }, + { + "epoch": 0.97, + "grad_norm": 3.2731949836335774, + "learning_rate": 2.6994114803823858e-08, + "loss": 0.6685, + "step": 9198 + }, + { + "epoch": 0.97, + "grad_norm": 2.899660903128125, + "learning_rate": 2.6817552540144842e-08, + "loss": 0.6137, + "step": 9199 + }, + { + "epoch": 0.97, + "grad_norm": 2.320763158061146, + "learning_rate": 2.6641568046427146e-08, + "loss": 0.6083, + "step": 9200 + }, + { + "epoch": 0.97, + "grad_norm": 2.363696210436481, + "learning_rate": 2.646616134311497e-08, + "loss": 0.6219, + "step": 9201 + }, + { + "epoch": 0.97, + "grad_norm": 2.2956123393115724, + "learning_rate": 2.629133245058757e-08, + "loss": 0.6388, + "step": 9202 + }, + { + "epoch": 0.97, + "grad_norm": 3.8842450507583464, + "learning_rate": 2.6117081389155362e-08, + "loss": 0.6312, + "step": 9203 + }, + { + "epoch": 0.97, + "grad_norm": 2.3338099016770926, + "learning_rate": 2.5943408179062713e-08, + "loss": 0.5896, + "step": 9204 + }, + { + "epoch": 0.97, + "grad_norm": 2.3320839464277805, + "learning_rate": 2.5770312840486255e-08, + "loss": 0.6366, + "step": 9205 + }, + { + "epoch": 0.97, + "grad_norm": 2.984624737573807, + "learning_rate": 2.5597795393536017e-08, + "loss": 0.6062, + "step": 9206 + }, + { + "epoch": 0.97, + "grad_norm": 0.9651826359687896, + "learning_rate": 2.5425855858253744e-08, + "loss": 0.5126, + "step": 9207 + }, + { + "epoch": 0.97, + "grad_norm": 2.288749577144304, + "learning_rate": 2.5254494254616236e-08, + "loss": 0.6235, + "step": 9208 + }, + { + "epoch": 0.97, + "grad_norm": 2.813589985884465, + "learning_rate": 2.5083710602530897e-08, + "loss": 0.6208, + "step": 9209 + }, + { + "epoch": 0.97, + "grad_norm": 2.407381015638563, + "learning_rate": 2.4913504921839084e-08, + "loss": 0.5145, + "step": 9210 + }, + { + "epoch": 0.97, + "grad_norm": 2.2924984215742583, + "learning_rate": 2.4743877232314416e-08, + "loss": 0.5947, + "step": 9211 + }, + { + "epoch": 0.97, + "grad_norm": 3.1338707036223505, + "learning_rate": 2.457482755366447e-08, + "loss": 0.5781, + "step": 9212 + }, + { + "epoch": 0.97, + "grad_norm": 2.277076857886843, + "learning_rate": 2.4406355905528534e-08, + "loss": 0.6377, + "step": 9213 + }, + { + "epoch": 0.97, + "grad_norm": 2.5947856777870917, + "learning_rate": 2.4238462307478727e-08, + "loss": 0.6077, + "step": 9214 + }, + { + "epoch": 0.97, + "grad_norm": 2.869549468783701, + "learning_rate": 2.4071146779021116e-08, + "loss": 0.5939, + "step": 9215 + }, + { + "epoch": 0.97, + "grad_norm": 2.5062122517145817, + "learning_rate": 2.3904409339594036e-08, + "loss": 0.5345, + "step": 9216 + }, + { + "epoch": 0.97, + "grad_norm": 2.9256223760450024, + "learning_rate": 2.373825000856811e-08, + "loss": 0.5867, + "step": 9217 + }, + { + "epoch": 0.97, + "grad_norm": 2.919362951808869, + "learning_rate": 2.3572668805247335e-08, + "loss": 0.6491, + "step": 9218 + }, + { + "epoch": 0.97, + "grad_norm": 2.6991155767439214, + "learning_rate": 2.3407665748868548e-08, + "loss": 0.5601, + "step": 9219 + }, + { + "epoch": 0.97, + "grad_norm": 2.246298603916671, + "learning_rate": 2.324324085860086e-08, + "loss": 0.5792, + "step": 9220 + }, + { + "epoch": 0.97, + "grad_norm": 2.4031521640333504, + "learning_rate": 2.3079394153547874e-08, + "loss": 0.6562, + "step": 9221 + }, + { + "epoch": 0.97, + "grad_norm": 2.4774885599982843, + "learning_rate": 2.2916125652743814e-08, + "loss": 0.5821, + "step": 9222 + }, + { + "epoch": 0.97, + "grad_norm": 2.9008654534964395, + "learning_rate": 2.2753435375156284e-08, + "loss": 0.6582, + "step": 9223 + }, + { + "epoch": 0.97, + "grad_norm": 6.023778464208766, + "learning_rate": 2.2591323339687387e-08, + "loss": 0.6167, + "step": 9224 + }, + { + "epoch": 0.97, + "grad_norm": 2.3713659550667208, + "learning_rate": 2.242978956517039e-08, + "loss": 0.6183, + "step": 9225 + }, + { + "epoch": 0.97, + "grad_norm": 2.306655048988526, + "learning_rate": 2.2268834070371946e-08, + "loss": 0.6889, + "step": 9226 + }, + { + "epoch": 0.97, + "grad_norm": 3.0952640364000117, + "learning_rate": 2.210845687399099e-08, + "loss": 0.5602, + "step": 9227 + }, + { + "epoch": 0.97, + "grad_norm": 2.3413653802222494, + "learning_rate": 2.1948657994659838e-08, + "loss": 0.5923, + "step": 9228 + }, + { + "epoch": 0.97, + "grad_norm": 2.77564140602996, + "learning_rate": 2.1789437450943084e-08, + "loss": 0.5727, + "step": 9229 + }, + { + "epoch": 0.97, + "grad_norm": 2.421552954655445, + "learning_rate": 2.163079526133982e-08, + "loss": 0.611, + "step": 9230 + }, + { + "epoch": 0.97, + "grad_norm": 2.169302617388929, + "learning_rate": 2.1472731444279193e-08, + "loss": 0.5952, + "step": 9231 + }, + { + "epoch": 0.97, + "grad_norm": 2.101895600351323, + "learning_rate": 2.13152460181254e-08, + "loss": 0.6372, + "step": 9232 + }, + { + "epoch": 0.97, + "grad_norm": 3.657299806141878, + "learning_rate": 2.115833900117381e-08, + "loss": 0.6147, + "step": 9233 + }, + { + "epoch": 0.97, + "grad_norm": 2.239460891177711, + "learning_rate": 2.1002010411654838e-08, + "loss": 0.591, + "step": 9234 + }, + { + "epoch": 0.97, + "grad_norm": 3.488691890857634, + "learning_rate": 2.0846260267728957e-08, + "loss": 0.6899, + "step": 9235 + }, + { + "epoch": 0.97, + "grad_norm": 2.1238152317575727, + "learning_rate": 2.069108858749169e-08, + "loss": 0.6646, + "step": 9236 + }, + { + "epoch": 0.97, + "grad_norm": 2.487320361580777, + "learning_rate": 2.0536495388969734e-08, + "loss": 0.6297, + "step": 9237 + }, + { + "epoch": 0.97, + "grad_norm": 2.341768489528419, + "learning_rate": 2.0382480690123718e-08, + "loss": 0.472, + "step": 9238 + }, + { + "epoch": 0.97, + "grad_norm": 3.233011462575182, + "learning_rate": 2.0229044508845997e-08, + "loss": 0.6539, + "step": 9239 + }, + { + "epoch": 0.97, + "grad_norm": 2.121493412352339, + "learning_rate": 2.007618686296342e-08, + "loss": 0.5738, + "step": 9240 + }, + { + "epoch": 0.97, + "grad_norm": 0.9581343879954742, + "learning_rate": 1.9923907770233453e-08, + "loss": 0.5091, + "step": 9241 + }, + { + "epoch": 0.97, + "grad_norm": 2.403223954181605, + "learning_rate": 1.9772207248348607e-08, + "loss": 0.6106, + "step": 9242 + }, + { + "epoch": 0.97, + "grad_norm": 0.8777448291544866, + "learning_rate": 1.962108531493201e-08, + "loss": 0.5469, + "step": 9243 + }, + { + "epoch": 0.97, + "grad_norm": 2.9765736631382067, + "learning_rate": 1.9470541987540727e-08, + "loss": 0.5947, + "step": 9244 + }, + { + "epoch": 0.97, + "grad_norm": 3.5869579127381717, + "learning_rate": 1.9320577283664656e-08, + "loss": 0.6733, + "step": 9245 + }, + { + "epoch": 0.97, + "grad_norm": 2.7935438970542363, + "learning_rate": 1.9171191220726527e-08, + "loss": 0.5995, + "step": 9246 + }, + { + "epoch": 0.97, + "grad_norm": 2.928327785902899, + "learning_rate": 1.902238381608079e-08, + "loss": 0.5998, + "step": 9247 + }, + { + "epoch": 0.97, + "grad_norm": 3.4791238402015456, + "learning_rate": 1.88741550870164e-08, + "loss": 0.5935, + "step": 9248 + }, + { + "epoch": 0.97, + "grad_norm": 2.863133326433092, + "learning_rate": 1.8726505050753464e-08, + "loss": 0.5795, + "step": 9249 + }, + { + "epoch": 0.97, + "grad_norm": 4.0631417802895955, + "learning_rate": 1.8579433724446037e-08, + "loss": 0.5772, + "step": 9250 + }, + { + "epoch": 0.97, + "grad_norm": 2.2432282328723803, + "learning_rate": 1.8432941125179904e-08, + "loss": 0.5114, + "step": 9251 + }, + { + "epoch": 0.97, + "grad_norm": 2.5097002070968535, + "learning_rate": 1.8287027269974777e-08, + "loss": 0.6283, + "step": 9252 + }, + { + "epoch": 0.97, + "grad_norm": 2.415419955262919, + "learning_rate": 1.81416921757821e-08, + "loss": 0.6065, + "step": 9253 + }, + { + "epoch": 0.97, + "grad_norm": 2.2888006379470407, + "learning_rate": 1.7996935859486143e-08, + "loss": 0.5377, + "step": 9254 + }, + { + "epoch": 0.97, + "grad_norm": 0.95190793167679, + "learning_rate": 1.7852758337904564e-08, + "loss": 0.5234, + "step": 9255 + }, + { + "epoch": 0.97, + "grad_norm": 3.928375135216635, + "learning_rate": 1.7709159627787853e-08, + "loss": 0.5789, + "step": 9256 + }, + { + "epoch": 0.97, + "grad_norm": 2.772545157709395, + "learning_rate": 1.7566139745818778e-08, + "loss": 0.6105, + "step": 9257 + }, + { + "epoch": 0.97, + "grad_norm": 0.9706682161569412, + "learning_rate": 1.7423698708612935e-08, + "loss": 0.5673, + "step": 9258 + }, + { + "epoch": 0.97, + "grad_norm": 2.5143022423925516, + "learning_rate": 1.72818365327182e-08, + "loss": 0.5667, + "step": 9259 + }, + { + "epoch": 0.97, + "grad_norm": 3.5152224135253496, + "learning_rate": 1.7140553234616385e-08, + "loss": 0.5852, + "step": 9260 + }, + { + "epoch": 0.97, + "grad_norm": 2.6502532483318486, + "learning_rate": 1.6999848830721033e-08, + "loss": 0.585, + "step": 9261 + }, + { + "epoch": 0.97, + "grad_norm": 2.643359676277189, + "learning_rate": 1.6859723337379064e-08, + "loss": 0.5682, + "step": 9262 + }, + { + "epoch": 0.97, + "grad_norm": 2.073844785345583, + "learning_rate": 1.6720176770869124e-08, + "loss": 0.6263, + "step": 9263 + }, + { + "epoch": 0.97, + "grad_norm": 4.2055735703872275, + "learning_rate": 1.6581209147404355e-08, + "loss": 0.6097, + "step": 9264 + }, + { + "epoch": 0.97, + "grad_norm": 2.6043559102032106, + "learning_rate": 1.6442820483128508e-08, + "loss": 0.6575, + "step": 9265 + }, + { + "epoch": 0.98, + "grad_norm": 2.1311347692762466, + "learning_rate": 1.630501079412039e-08, + "loss": 0.5999, + "step": 9266 + }, + { + "epoch": 0.98, + "grad_norm": 3.1904074031842913, + "learning_rate": 1.6167780096389417e-08, + "loss": 0.6478, + "step": 9267 + }, + { + "epoch": 0.98, + "grad_norm": 2.1605938131759217, + "learning_rate": 1.603112840587895e-08, + "loss": 0.5674, + "step": 9268 + }, + { + "epoch": 0.98, + "grad_norm": 2.7661197207313495, + "learning_rate": 1.589505573846517e-08, + "loss": 0.6057, + "step": 9269 + }, + { + "epoch": 0.98, + "grad_norm": 2.6447415105460883, + "learning_rate": 1.5759562109955993e-08, + "loss": 0.6398, + "step": 9270 + }, + { + "epoch": 0.98, + "grad_norm": 3.2687696784623625, + "learning_rate": 1.562464753609272e-08, + "loss": 0.6534, + "step": 9271 + }, + { + "epoch": 0.98, + "grad_norm": 2.6228867844890065, + "learning_rate": 1.549031203254947e-08, + "loss": 0.627, + "step": 9272 + }, + { + "epoch": 0.98, + "grad_norm": 3.4312244311880393, + "learning_rate": 1.535655561493321e-08, + "loss": 0.6083, + "step": 9273 + }, + { + "epoch": 0.98, + "grad_norm": 2.4986020292339344, + "learning_rate": 1.5223378298783174e-08, + "loss": 0.5576, + "step": 9274 + }, + { + "epoch": 0.98, + "grad_norm": 2.519385627634563, + "learning_rate": 1.5090780099571435e-08, + "loss": 0.5539, + "step": 9275 + }, + { + "epoch": 0.98, + "grad_norm": 9.883104133354655, + "learning_rate": 1.4958761032702885e-08, + "loss": 0.5969, + "step": 9276 + }, + { + "epoch": 0.98, + "grad_norm": 2.536493973434565, + "learning_rate": 1.4827321113515259e-08, + "loss": 0.6409, + "step": 9277 + }, + { + "epoch": 0.98, + "grad_norm": 2.276130559735301, + "learning_rate": 1.4696460357279118e-08, + "loss": 0.5695, + "step": 9278 + }, + { + "epoch": 0.98, + "grad_norm": 2.7168915784835974, + "learning_rate": 1.4566178779197305e-08, + "loss": 0.5578, + "step": 9279 + }, + { + "epoch": 0.98, + "grad_norm": 2.303485469832306, + "learning_rate": 1.443647639440493e-08, + "loss": 0.619, + "step": 9280 + }, + { + "epoch": 0.98, + "grad_norm": 2.7280656636349336, + "learning_rate": 1.43073532179705e-08, + "loss": 0.7012, + "step": 9281 + }, + { + "epoch": 0.98, + "grad_norm": 2.803606301743524, + "learning_rate": 1.4178809264896454e-08, + "loss": 0.5857, + "step": 9282 + }, + { + "epoch": 0.98, + "grad_norm": 5.551430987761346, + "learning_rate": 1.4050844550115295e-08, + "loss": 0.6231, + "step": 9283 + }, + { + "epoch": 0.98, + "grad_norm": 3.3909002445584813, + "learning_rate": 1.3923459088494574e-08, + "loss": 0.5716, + "step": 9284 + }, + { + "epoch": 0.98, + "grad_norm": 2.489847512775473, + "learning_rate": 1.3796652894832452e-08, + "loss": 0.6399, + "step": 9285 + }, + { + "epoch": 0.98, + "grad_norm": 1.8687603509839055, + "learning_rate": 1.367042598386159e-08, + "loss": 0.5983, + "step": 9286 + }, + { + "epoch": 0.98, + "grad_norm": 2.603423975622383, + "learning_rate": 1.3544778370246924e-08, + "loss": 0.6098, + "step": 9287 + }, + { + "epoch": 0.98, + "grad_norm": 2.520998350761481, + "learning_rate": 1.3419710068585668e-08, + "loss": 0.5934, + "step": 9288 + }, + { + "epoch": 0.98, + "grad_norm": 4.649199879398089, + "learning_rate": 1.3295221093407862e-08, + "loss": 0.6294, + "step": 9289 + }, + { + "epoch": 0.98, + "grad_norm": 3.3152207409028662, + "learning_rate": 1.3171311459175829e-08, + "loss": 0.6102, + "step": 9290 + }, + { + "epoch": 0.98, + "grad_norm": 8.91183070336439, + "learning_rate": 1.3047981180285274e-08, + "loss": 0.5617, + "step": 9291 + }, + { + "epoch": 0.98, + "grad_norm": 2.536923939716204, + "learning_rate": 1.2925230271064736e-08, + "loss": 0.6005, + "step": 9292 + }, + { + "epoch": 0.98, + "grad_norm": 2.5935899938613978, + "learning_rate": 1.2803058745774477e-08, + "loss": 0.5446, + "step": 9293 + }, + { + "epoch": 0.98, + "grad_norm": 2.67588488131424, + "learning_rate": 1.2681466618608696e-08, + "loss": 0.6637, + "step": 9294 + }, + { + "epoch": 0.98, + "grad_norm": 2.298948783588412, + "learning_rate": 1.256045390369276e-08, + "loss": 0.5459, + "step": 9295 + }, + { + "epoch": 0.98, + "grad_norm": 3.32146484587026, + "learning_rate": 1.2440020615086534e-08, + "loss": 0.6503, + "step": 9296 + }, + { + "epoch": 0.98, + "grad_norm": 2.382979182133977, + "learning_rate": 1.232016676678105e-08, + "loss": 0.5351, + "step": 9297 + }, + { + "epoch": 0.98, + "grad_norm": 2.8108107342818927, + "learning_rate": 1.2200892372700168e-08, + "loss": 0.5628, + "step": 9298 + }, + { + "epoch": 0.98, + "grad_norm": 2.5429167783954405, + "learning_rate": 1.2082197446701693e-08, + "loss": 0.5136, + "step": 9299 + }, + { + "epoch": 0.98, + "grad_norm": 2.446935675654442, + "learning_rate": 1.196408200257515e-08, + "loss": 0.5591, + "step": 9300 + }, + { + "epoch": 0.98, + "grad_norm": 2.4390888642246606, + "learning_rate": 1.1846546054042341e-08, + "loss": 0.63, + "step": 9301 + }, + { + "epoch": 0.98, + "grad_norm": 3.182765263784984, + "learning_rate": 1.1729589614758452e-08, + "loss": 0.6281, + "step": 9302 + }, + { + "epoch": 0.98, + "grad_norm": 2.6515402589211248, + "learning_rate": 1.1613212698311504e-08, + "loss": 0.5422, + "step": 9303 + }, + { + "epoch": 0.98, + "grad_norm": 3.006167180494317, + "learning_rate": 1.1497415318221239e-08, + "loss": 0.5991, + "step": 9304 + }, + { + "epoch": 0.98, + "grad_norm": 2.6423248229678973, + "learning_rate": 1.1382197487941337e-08, + "loss": 0.5174, + "step": 9305 + }, + { + "epoch": 0.98, + "grad_norm": 2.713564676817919, + "learning_rate": 1.1267559220857204e-08, + "loss": 0.6037, + "step": 9306 + }, + { + "epoch": 0.98, + "grad_norm": 2.653891032493835, + "learning_rate": 1.1153500530286521e-08, + "loss": 0.5838, + "step": 9307 + }, + { + "epoch": 0.98, + "grad_norm": 3.3097916904344333, + "learning_rate": 1.1040021429480907e-08, + "loss": 0.5577, + "step": 9308 + }, + { + "epoch": 0.98, + "grad_norm": 2.68913204932153, + "learning_rate": 1.0927121931624263e-08, + "loss": 0.6034, + "step": 9309 + }, + { + "epoch": 0.98, + "grad_norm": 3.6960977207308368, + "learning_rate": 1.0814802049832762e-08, + "loss": 0.5788, + "step": 9310 + }, + { + "epoch": 0.98, + "grad_norm": 2.5549598391454404, + "learning_rate": 1.0703061797154857e-08, + "loss": 0.6632, + "step": 9311 + }, + { + "epoch": 0.98, + "grad_norm": 3.4964467940181576, + "learning_rate": 1.059190118657294e-08, + "loss": 0.5849, + "step": 9312 + }, + { + "epoch": 0.98, + "grad_norm": 3.4191869666444825, + "learning_rate": 1.0481320231001124e-08, + "loss": 0.6446, + "step": 9313 + }, + { + "epoch": 0.98, + "grad_norm": 2.233757773597287, + "learning_rate": 1.0371318943285802e-08, + "loss": 0.5923, + "step": 9314 + }, + { + "epoch": 0.98, + "grad_norm": 3.207322331157825, + "learning_rate": 1.0261897336207305e-08, + "loss": 0.617, + "step": 9315 + }, + { + "epoch": 0.98, + "grad_norm": 3.9505737873093274, + "learning_rate": 1.0153055422477686e-08, + "loss": 0.6109, + "step": 9316 + }, + { + "epoch": 0.98, + "grad_norm": 3.9526141318436947, + "learning_rate": 1.0044793214742387e-08, + "loss": 0.6762, + "step": 9317 + }, + { + "epoch": 0.98, + "grad_norm": 3.42531517384415, + "learning_rate": 9.937110725578015e-09, + "loss": 0.6146, + "step": 9318 + }, + { + "epoch": 0.98, + "grad_norm": 2.143828176250109, + "learning_rate": 9.83000796749567e-09, + "loss": 0.5419, + "step": 9319 + }, + { + "epoch": 0.98, + "grad_norm": 2.7349892039380914, + "learning_rate": 9.723484952937623e-09, + "loss": 0.6015, + "step": 9320 + }, + { + "epoch": 0.98, + "grad_norm": 2.880357843332127, + "learning_rate": 9.617541694279532e-09, + "loss": 0.5888, + "step": 9321 + }, + { + "epoch": 0.98, + "grad_norm": 3.046184759076013, + "learning_rate": 9.512178203829881e-09, + "loss": 0.5733, + "step": 9322 + }, + { + "epoch": 0.98, + "grad_norm": 2.0691863659915253, + "learning_rate": 9.407394493829436e-09, + "loss": 0.5912, + "step": 9323 + }, + { + "epoch": 0.98, + "grad_norm": 3.048172174777919, + "learning_rate": 9.303190576451237e-09, + "loss": 0.641, + "step": 9324 + }, + { + "epoch": 0.98, + "grad_norm": 6.232433634613707, + "learning_rate": 9.199566463801712e-09, + "loss": 0.583, + "step": 9325 + }, + { + "epoch": 0.98, + "grad_norm": 2.7792858367309243, + "learning_rate": 9.09652216792012e-09, + "loss": 0.6, + "step": 9326 + }, + { + "epoch": 0.98, + "grad_norm": 3.136354810393291, + "learning_rate": 8.994057700776881e-09, + "loss": 0.5886, + "step": 9327 + }, + { + "epoch": 0.98, + "grad_norm": 7.469179826762709, + "learning_rate": 8.892173074276921e-09, + "loss": 0.6292, + "step": 9328 + }, + { + "epoch": 0.98, + "grad_norm": 2.404099135191639, + "learning_rate": 8.790868300255773e-09, + "loss": 0.6475, + "step": 9329 + }, + { + "epoch": 0.98, + "grad_norm": 3.296160070077247, + "learning_rate": 8.690143390484018e-09, + "loss": 0.6183, + "step": 9330 + }, + { + "epoch": 0.98, + "grad_norm": 2.6296150848981825, + "learning_rate": 8.589998356662854e-09, + "loss": 0.6401, + "step": 9331 + }, + { + "epoch": 0.98, + "grad_norm": 3.253263381499738, + "learning_rate": 8.490433210426862e-09, + "loss": 0.6144, + "step": 9332 + }, + { + "epoch": 0.98, + "grad_norm": 3.110522658122041, + "learning_rate": 8.391447963343457e-09, + "loss": 0.5804, + "step": 9333 + }, + { + "epoch": 0.98, + "grad_norm": 6.067607575219891, + "learning_rate": 8.293042626912328e-09, + "loss": 0.5936, + "step": 9334 + }, + { + "epoch": 0.98, + "grad_norm": 2.5019568835203203, + "learning_rate": 8.195217212565998e-09, + "loss": 0.6324, + "step": 9335 + }, + { + "epoch": 0.98, + "grad_norm": 2.925396010634107, + "learning_rate": 8.097971731669263e-09, + "loss": 0.583, + "step": 9336 + }, + { + "epoch": 0.98, + "grad_norm": 3.319061858545403, + "learning_rate": 8.001306195520309e-09, + "loss": 0.6931, + "step": 9337 + }, + { + "epoch": 0.98, + "grad_norm": 2.2308896693198204, + "learning_rate": 7.90522061534904e-09, + "loss": 0.5912, + "step": 9338 + }, + { + "epoch": 0.98, + "grad_norm": 2.557196137943084, + "learning_rate": 7.809715002318751e-09, + "loss": 0.6286, + "step": 9339 + }, + { + "epoch": 0.98, + "grad_norm": 2.8420399596277703, + "learning_rate": 7.714789367524456e-09, + "loss": 0.5345, + "step": 9340 + }, + { + "epoch": 0.98, + "grad_norm": 2.9739842730635013, + "learning_rate": 7.620443721995107e-09, + "loss": 0.6429, + "step": 9341 + }, + { + "epoch": 0.98, + "grad_norm": 2.265940840873313, + "learning_rate": 7.52667807669083e-09, + "loss": 0.5545, + "step": 9342 + }, + { + "epoch": 0.98, + "grad_norm": 2.287899726527495, + "learning_rate": 7.43349244250513e-09, + "loss": 0.6421, + "step": 9343 + }, + { + "epoch": 0.98, + "grad_norm": 2.5574409267349916, + "learning_rate": 7.340886830264904e-09, + "loss": 0.6416, + "step": 9344 + }, + { + "epoch": 0.98, + "grad_norm": 2.8843132936950244, + "learning_rate": 7.2488612507276564e-09, + "loss": 0.6486, + "step": 9345 + }, + { + "epoch": 0.98, + "grad_norm": 3.5891319217005697, + "learning_rate": 7.157415714584836e-09, + "loss": 0.6042, + "step": 9346 + }, + { + "epoch": 0.98, + "grad_norm": 3.5665853233741407, + "learning_rate": 7.066550232461278e-09, + "loss": 0.5836, + "step": 9347 + }, + { + "epoch": 0.98, + "grad_norm": 2.156939647962542, + "learning_rate": 6.976264814912426e-09, + "loss": 0.6158, + "step": 9348 + }, + { + "epoch": 0.98, + "grad_norm": 2.318404717092306, + "learning_rate": 6.886559472427667e-09, + "loss": 0.5961, + "step": 9349 + }, + { + "epoch": 0.98, + "grad_norm": 0.9679186604158867, + "learning_rate": 6.797434215429222e-09, + "loss": 0.5583, + "step": 9350 + }, + { + "epoch": 0.98, + "grad_norm": 2.882857695190735, + "learning_rate": 6.708889054270473e-09, + "loss": 0.6257, + "step": 9351 + }, + { + "epoch": 0.98, + "grad_norm": 2.9855183026376255, + "learning_rate": 6.620923999239304e-09, + "loss": 0.5986, + "step": 9352 + }, + { + "epoch": 0.98, + "grad_norm": 2.965482349734234, + "learning_rate": 6.533539060554761e-09, + "loss": 0.5749, + "step": 9353 + }, + { + "epoch": 0.98, + "grad_norm": 2.4913808973827263, + "learning_rate": 6.446734248368725e-09, + "loss": 0.6426, + "step": 9354 + }, + { + "epoch": 0.98, + "grad_norm": 2.7932074858752802, + "learning_rate": 6.360509572765905e-09, + "loss": 0.5945, + "step": 9355 + }, + { + "epoch": 0.98, + "grad_norm": 2.200283514862744, + "learning_rate": 6.2748650437644e-09, + "loss": 0.6131, + "step": 9356 + }, + { + "epoch": 0.98, + "grad_norm": 4.443831408522394, + "learning_rate": 6.189800671314028e-09, + "loss": 0.6324, + "step": 9357 + }, + { + "epoch": 0.98, + "grad_norm": 2.722697074400931, + "learning_rate": 6.10531646529633e-09, + "loss": 0.5599, + "step": 9358 + }, + { + "epoch": 0.98, + "grad_norm": 2.8796778714377784, + "learning_rate": 6.021412435527341e-09, + "loss": 0.5826, + "step": 9359 + }, + { + "epoch": 0.98, + "grad_norm": 2.1980873977664355, + "learning_rate": 5.938088591754265e-09, + "loss": 0.5682, + "step": 9360 + }, + { + "epoch": 0.99, + "grad_norm": 2.506953681588526, + "learning_rate": 5.855344943658248e-09, + "loss": 0.6596, + "step": 9361 + }, + { + "epoch": 0.99, + "grad_norm": 2.4320110623826006, + "learning_rate": 5.773181500851044e-09, + "loss": 0.5329, + "step": 9362 + }, + { + "epoch": 0.99, + "grad_norm": 2.673093388240049, + "learning_rate": 5.691598272878907e-09, + "loss": 0.6882, + "step": 9363 + }, + { + "epoch": 0.99, + "grad_norm": 5.141915304940063, + "learning_rate": 5.610595269220364e-09, + "loss": 0.6534, + "step": 9364 + }, + { + "epoch": 0.99, + "grad_norm": 2.2742761378178487, + "learning_rate": 5.530172499285113e-09, + "loss": 0.5644, + "step": 9365 + }, + { + "epoch": 0.99, + "grad_norm": 3.7078245584599014, + "learning_rate": 5.45032997241679e-09, + "loss": 0.6237, + "step": 9366 + }, + { + "epoch": 0.99, + "grad_norm": 2.1910604685165675, + "learning_rate": 5.371067697891308e-09, + "loss": 0.6524, + "step": 9367 + }, + { + "epoch": 0.99, + "grad_norm": 2.263472973076441, + "learning_rate": 5.292385684917411e-09, + "loss": 0.5387, + "step": 9368 + }, + { + "epoch": 0.99, + "grad_norm": 2.6018067358751327, + "learning_rate": 5.214283942635567e-09, + "loss": 0.5654, + "step": 9369 + }, + { + "epoch": 0.99, + "grad_norm": 3.20137961246332, + "learning_rate": 5.136762480120183e-09, + "loss": 0.5999, + "step": 9370 + }, + { + "epoch": 0.99, + "grad_norm": 2.5099314930574357, + "learning_rate": 5.059821306376833e-09, + "loss": 0.5936, + "step": 9371 + }, + { + "epoch": 0.99, + "grad_norm": 2.70471458958764, + "learning_rate": 4.9834604303444774e-09, + "loss": 0.631, + "step": 9372 + }, + { + "epoch": 0.99, + "grad_norm": 3.1223721533213684, + "learning_rate": 4.907679860894355e-09, + "loss": 0.6049, + "step": 9373 + }, + { + "epoch": 0.99, + "grad_norm": 2.5657179600814275, + "learning_rate": 4.832479606831086e-09, + "loss": 0.5107, + "step": 9374 + }, + { + "epoch": 0.99, + "grad_norm": 5.080680659842182, + "learning_rate": 4.757859676891019e-09, + "loss": 0.5991, + "step": 9375 + }, + { + "epoch": 0.99, + "grad_norm": 2.8969187425122196, + "learning_rate": 4.683820079742218e-09, + "loss": 0.5855, + "step": 9376 + }, + { + "epoch": 0.99, + "grad_norm": 2.407260977526988, + "learning_rate": 4.610360823987803e-09, + "loss": 0.652, + "step": 9377 + }, + { + "epoch": 0.99, + "grad_norm": 2.990129810345131, + "learning_rate": 4.5374819181615015e-09, + "loss": 0.5611, + "step": 9378 + }, + { + "epoch": 0.99, + "grad_norm": 3.437506450033328, + "learning_rate": 4.465183370729875e-09, + "loss": 0.5923, + "step": 9379 + }, + { + "epoch": 0.99, + "grad_norm": 3.476756229416724, + "learning_rate": 4.393465190092316e-09, + "loss": 0.6029, + "step": 9380 + }, + { + "epoch": 0.99, + "grad_norm": 2.580270764805957, + "learning_rate": 4.322327384581604e-09, + "loss": 0.6063, + "step": 9381 + }, + { + "epoch": 0.99, + "grad_norm": 2.062993744196928, + "learning_rate": 4.251769962461683e-09, + "loss": 0.6294, + "step": 9382 + }, + { + "epoch": 0.99, + "grad_norm": 3.1334282612359785, + "learning_rate": 4.181792931929885e-09, + "loss": 0.6693, + "step": 9383 + }, + { + "epoch": 0.99, + "grad_norm": 1.989553082617997, + "learning_rate": 4.1123963011158175e-09, + "loss": 0.5787, + "step": 9384 + }, + { + "epoch": 0.99, + "grad_norm": 2.2591858616247964, + "learning_rate": 4.043580078081921e-09, + "loss": 0.5738, + "step": 9385 + }, + { + "epoch": 0.99, + "grad_norm": 2.6256761692714603, + "learning_rate": 3.975344270823467e-09, + "loss": 0.5571, + "step": 9386 + }, + { + "epoch": 0.99, + "grad_norm": 2.9188998285720706, + "learning_rate": 3.9076888872668914e-09, + "loss": 0.6569, + "step": 9387 + }, + { + "epoch": 0.99, + "grad_norm": 2.4406062966062727, + "learning_rate": 3.84061393527313e-09, + "loss": 0.5535, + "step": 9388 + }, + { + "epoch": 0.99, + "grad_norm": 2.499796926984259, + "learning_rate": 3.774119422634282e-09, + "loss": 0.6678, + "step": 9389 + }, + { + "epoch": 0.99, + "grad_norm": 2.50651659058804, + "learning_rate": 3.7082053570758338e-09, + "loss": 0.6311, + "step": 9390 + }, + { + "epoch": 0.99, + "grad_norm": 2.9483823752948752, + "learning_rate": 3.6428717462549944e-09, + "loss": 0.5463, + "step": 9391 + }, + { + "epoch": 0.99, + "grad_norm": 3.723110555766217, + "learning_rate": 3.578118597762914e-09, + "loss": 0.6034, + "step": 9392 + }, + { + "epoch": 0.99, + "grad_norm": 2.5286683838105293, + "learning_rate": 3.5139459191213533e-09, + "loss": 0.6347, + "step": 9393 + }, + { + "epoch": 0.99, + "grad_norm": 3.4531428030517914, + "learning_rate": 3.4503537177860145e-09, + "loss": 0.6048, + "step": 9394 + }, + { + "epoch": 0.99, + "grad_norm": 3.04607450890029, + "learning_rate": 3.3873420011448778e-09, + "loss": 0.5767, + "step": 9395 + }, + { + "epoch": 0.99, + "grad_norm": 2.4392946355168275, + "learning_rate": 3.324910776519308e-09, + "loss": 0.596, + "step": 9396 + }, + { + "epoch": 0.99, + "grad_norm": 3.422625198663431, + "learning_rate": 3.263060051161282e-09, + "loss": 0.5568, + "step": 9397 + }, + { + "epoch": 0.99, + "grad_norm": 2.418789715138034, + "learning_rate": 3.2017898322567185e-09, + "loss": 0.6036, + "step": 9398 + }, + { + "epoch": 0.99, + "grad_norm": 4.929592023564839, + "learning_rate": 3.1411001269238127e-09, + "loss": 0.6467, + "step": 9399 + }, + { + "epoch": 0.99, + "grad_norm": 2.2696725064713115, + "learning_rate": 3.080990942213591e-09, + "loss": 0.5624, + "step": 9400 + }, + { + "epoch": 0.99, + "grad_norm": 2.887084023089943, + "learning_rate": 3.0214622851093555e-09, + "loss": 0.6042, + "step": 9401 + }, + { + "epoch": 0.99, + "grad_norm": 2.2400748396131984, + "learning_rate": 2.9625141625266863e-09, + "loss": 0.6244, + "step": 9402 + }, + { + "epoch": 0.99, + "grad_norm": 2.4355210959399383, + "learning_rate": 2.9041465813145486e-09, + "loss": 0.5651, + "step": 9403 + }, + { + "epoch": 0.99, + "grad_norm": 2.590028657934874, + "learning_rate": 2.8463595482530747e-09, + "loss": 0.6597, + "step": 9404 + }, + { + "epoch": 0.99, + "grad_norm": 4.014180884320301, + "learning_rate": 2.7891530700563387e-09, + "loss": 0.6173, + "step": 9405 + }, + { + "epoch": 0.99, + "grad_norm": 3.4571650309323445, + "learning_rate": 2.73252715337069e-09, + "loss": 0.6504, + "step": 9406 + }, + { + "epoch": 0.99, + "grad_norm": 2.9475010958623757, + "learning_rate": 2.6764818047736453e-09, + "loss": 0.5891, + "step": 9407 + }, + { + "epoch": 0.99, + "grad_norm": 3.3576520108967984, + "learning_rate": 2.6210170307777726e-09, + "loss": 0.6288, + "step": 9408 + }, + { + "epoch": 0.99, + "grad_norm": 2.4213655278930717, + "learning_rate": 2.5661328378262516e-09, + "loss": 0.5432, + "step": 9409 + }, + { + "epoch": 0.99, + "grad_norm": 3.796645905475894, + "learning_rate": 2.5118292322950933e-09, + "loss": 0.5317, + "step": 9410 + }, + { + "epoch": 0.99, + "grad_norm": 2.7845772144730496, + "learning_rate": 2.4581062204931395e-09, + "loss": 0.5843, + "step": 9411 + }, + { + "epoch": 0.99, + "grad_norm": 3.8648265905534904, + "learning_rate": 2.404963808662064e-09, + "loss": 0.6527, + "step": 9412 + }, + { + "epoch": 0.99, + "grad_norm": 2.7630651447270407, + "learning_rate": 2.3524020029758175e-09, + "loss": 0.687, + "step": 9413 + }, + { + "epoch": 0.99, + "grad_norm": 8.873063326108902, + "learning_rate": 2.3004208095406268e-09, + "loss": 0.6214, + "step": 9414 + }, + { + "epoch": 0.99, + "grad_norm": 3.581099597260234, + "learning_rate": 2.249020234395549e-09, + "loss": 0.6127, + "step": 9415 + }, + { + "epoch": 0.99, + "grad_norm": 0.8980029935743684, + "learning_rate": 2.198200283512475e-09, + "loss": 0.5578, + "step": 9416 + }, + { + "epoch": 0.99, + "grad_norm": 2.843841317169783, + "learning_rate": 2.14796096279557e-09, + "loss": 0.6437, + "step": 9417 + }, + { + "epoch": 0.99, + "grad_norm": 2.669266532272996, + "learning_rate": 2.0983022780807217e-09, + "loss": 0.5516, + "step": 9418 + }, + { + "epoch": 0.99, + "grad_norm": 2.9826082921006214, + "learning_rate": 2.049224235138314e-09, + "loss": 0.6909, + "step": 9419 + }, + { + "epoch": 0.99, + "grad_norm": 2.724117248063, + "learning_rate": 2.0007268396687873e-09, + "loss": 0.6637, + "step": 9420 + }, + { + "epoch": 0.99, + "grad_norm": 2.8185604270090447, + "learning_rate": 1.9528100973070784e-09, + "loss": 0.5829, + "step": 9421 + }, + { + "epoch": 0.99, + "grad_norm": 3.095495170435446, + "learning_rate": 1.9054740136204007e-09, + "loss": 0.5911, + "step": 9422 + }, + { + "epoch": 0.99, + "grad_norm": 3.3241775905169297, + "learning_rate": 1.858718594107689e-09, + "loss": 0.6762, + "step": 9423 + }, + { + "epoch": 0.99, + "grad_norm": 2.856463977964485, + "learning_rate": 1.8125438442007093e-09, + "loss": 0.5795, + "step": 9424 + }, + { + "epoch": 0.99, + "grad_norm": 1.1071167853953825, + "learning_rate": 1.766949769264059e-09, + "loss": 0.5487, + "step": 9425 + }, + { + "epoch": 0.99, + "grad_norm": 0.898970735503558, + "learning_rate": 1.7219363745946127e-09, + "loss": 0.5282, + "step": 9426 + }, + { + "epoch": 0.99, + "grad_norm": 8.223703639655533, + "learning_rate": 1.6775036654226307e-09, + "loss": 0.5858, + "step": 9427 + }, + { + "epoch": 0.99, + "grad_norm": 2.6009487272996834, + "learning_rate": 1.6336516469089846e-09, + "loss": 0.6202, + "step": 9428 + }, + { + "epoch": 0.99, + "grad_norm": 3.470594389195533, + "learning_rate": 1.5903803241490435e-09, + "loss": 0.611, + "step": 9429 + }, + { + "epoch": 0.99, + "grad_norm": 2.4607686425198088, + "learning_rate": 1.5476897021698968e-09, + "loss": 0.6361, + "step": 9430 + }, + { + "epoch": 0.99, + "grad_norm": 3.015171435256967, + "learning_rate": 1.5055797859309108e-09, + "loss": 0.6213, + "step": 9431 + }, + { + "epoch": 0.99, + "grad_norm": 2.3552305276446104, + "learning_rate": 1.4640505803248384e-09, + "loss": 0.5813, + "step": 9432 + }, + { + "epoch": 0.99, + "grad_norm": 3.0756788209013686, + "learning_rate": 1.4231020901755988e-09, + "loss": 0.5959, + "step": 9433 + }, + { + "epoch": 0.99, + "grad_norm": 2.8917362317695483, + "learning_rate": 1.3827343202410527e-09, + "loss": 0.6495, + "step": 9434 + }, + { + "epoch": 0.99, + "grad_norm": 2.4911357145002206, + "learning_rate": 1.342947275211337e-09, + "loss": 0.6533, + "step": 9435 + }, + { + "epoch": 0.99, + "grad_norm": 4.243456265519813, + "learning_rate": 1.3037409597077555e-09, + "loss": 0.5686, + "step": 9436 + }, + { + "epoch": 0.99, + "grad_norm": 2.4091325818193186, + "learning_rate": 1.265115378286108e-09, + "loss": 0.5949, + "step": 9437 + }, + { + "epoch": 0.99, + "grad_norm": 2.6371286497158675, + "learning_rate": 1.2270705354333612e-09, + "loss": 0.6817, + "step": 9438 + }, + { + "epoch": 0.99, + "grad_norm": 2.8038456517621673, + "learning_rate": 1.1896064355698678e-09, + "loss": 0.6074, + "step": 9439 + }, + { + "epoch": 0.99, + "grad_norm": 2.2758737769300783, + "learning_rate": 1.152723083047702e-09, + "loss": 0.5709, + "step": 9440 + }, + { + "epoch": 0.99, + "grad_norm": 2.5994595563728633, + "learning_rate": 1.11642048215177e-09, + "loss": 0.6447, + "step": 9441 + }, + { + "epoch": 0.99, + "grad_norm": 2.5885198513951937, + "learning_rate": 1.0806986370998086e-09, + "loss": 0.6026, + "step": 9442 + }, + { + "epoch": 0.99, + "grad_norm": 2.56828110164626, + "learning_rate": 1.0455575520418315e-09, + "loss": 0.6416, + "step": 9443 + }, + { + "epoch": 0.99, + "grad_norm": 2.5864916103094, + "learning_rate": 1.0109972310606842e-09, + "loss": 0.5943, + "step": 9444 + }, + { + "epoch": 0.99, + "grad_norm": 3.1320803492939415, + "learning_rate": 9.770176781709329e-10, + "loss": 0.5664, + "step": 9445 + }, + { + "epoch": 0.99, + "grad_norm": 2.401636357615803, + "learning_rate": 9.436188973210858e-10, + "loss": 0.5257, + "step": 9446 + }, + { + "epoch": 0.99, + "grad_norm": 2.253966978691805, + "learning_rate": 9.108008923902623e-10, + "loss": 0.5868, + "step": 9447 + }, + { + "epoch": 0.99, + "grad_norm": 2.506566563109066, + "learning_rate": 8.785636671920783e-10, + "loss": 0.6921, + "step": 9448 + }, + { + "epoch": 0.99, + "grad_norm": 2.107145184612608, + "learning_rate": 8.469072254713162e-10, + "loss": 0.5729, + "step": 9449 + }, + { + "epoch": 0.99, + "grad_norm": 0.906976575314038, + "learning_rate": 8.158315709055897e-10, + "loss": 0.4864, + "step": 9450 + }, + { + "epoch": 0.99, + "grad_norm": 1.135047161405744, + "learning_rate": 7.853367071053441e-10, + "loss": 0.5252, + "step": 9451 + }, + { + "epoch": 0.99, + "grad_norm": 2.453216748538149, + "learning_rate": 7.554226376133012e-10, + "loss": 0.6918, + "step": 9452 + }, + { + "epoch": 0.99, + "grad_norm": 3.3255864656684557, + "learning_rate": 7.26089365905569e-10, + "loss": 0.5933, + "step": 9453 + }, + { + "epoch": 0.99, + "grad_norm": 3.4761913980531283, + "learning_rate": 6.97336895388867e-10, + "loss": 0.5683, + "step": 9454 + }, + { + "epoch": 0.99, + "grad_norm": 2.9519019016205963, + "learning_rate": 6.691652294038564e-10, + "loss": 0.6096, + "step": 9455 + }, + { + "epoch": 1.0, + "grad_norm": 3.0498036153708985, + "learning_rate": 6.415743712240296e-10, + "loss": 0.5917, + "step": 9456 + }, + { + "epoch": 1.0, + "grad_norm": 2.0448500807091055, + "learning_rate": 6.145643240540456e-10, + "loss": 0.5391, + "step": 9457 + }, + { + "epoch": 1.0, + "grad_norm": 2.6179190080516817, + "learning_rate": 5.881350910325046e-10, + "loss": 0.5956, + "step": 9458 + }, + { + "epoch": 1.0, + "grad_norm": 2.3232050763453804, + "learning_rate": 5.622866752291734e-10, + "loss": 0.6394, + "step": 9459 + }, + { + "epoch": 1.0, + "grad_norm": 2.7020619835422437, + "learning_rate": 5.370190796483155e-10, + "loss": 0.6787, + "step": 9460 + }, + { + "epoch": 1.0, + "grad_norm": 2.2927217439792016, + "learning_rate": 5.123323072236952e-10, + "loss": 0.6097, + "step": 9461 + }, + { + "epoch": 1.0, + "grad_norm": 2.10178763835819, + "learning_rate": 4.88226360824684e-10, + "loss": 0.64, + "step": 9462 + }, + { + "epoch": 1.0, + "grad_norm": 2.5684498862280543, + "learning_rate": 4.647012432512643e-10, + "loss": 0.4799, + "step": 9463 + }, + { + "epoch": 1.0, + "grad_norm": 2.2494270384655715, + "learning_rate": 4.417569572368052e-10, + "loss": 0.6441, + "step": 9464 + }, + { + "epoch": 1.0, + "grad_norm": 2.678614389562663, + "learning_rate": 4.1939350544695224e-10, + "loss": 0.5611, + "step": 9465 + }, + { + "epoch": 1.0, + "grad_norm": 2.949351619951574, + "learning_rate": 3.9761089047907206e-10, + "loss": 0.5365, + "step": 9466 + }, + { + "epoch": 1.0, + "grad_norm": 2.470566883786651, + "learning_rate": 3.764091148650284e-10, + "loss": 0.5893, + "step": 9467 + }, + { + "epoch": 1.0, + "grad_norm": 2.925739305236972, + "learning_rate": 3.5578818106674073e-10, + "loss": 0.4812, + "step": 9468 + }, + { + "epoch": 1.0, + "grad_norm": 2.416620880372823, + "learning_rate": 3.3574809148062546e-10, + "loss": 0.5753, + "step": 9469 + }, + { + "epoch": 1.0, + "grad_norm": 2.779623019239646, + "learning_rate": 3.1628884843537546e-10, + "loss": 0.6176, + "step": 9470 + }, + { + "epoch": 1.0, + "grad_norm": 2.564403061347666, + "learning_rate": 2.974104541902945e-10, + "loss": 0.539, + "step": 9471 + }, + { + "epoch": 1.0, + "grad_norm": 2.6305758356659785, + "learning_rate": 2.7911291093973835e-10, + "loss": 0.6094, + "step": 9472 + }, + { + "epoch": 1.0, + "grad_norm": 2.6682590701908198, + "learning_rate": 2.61396220808674e-10, + "loss": 0.6554, + "step": 9473 + }, + { + "epoch": 1.0, + "grad_norm": 2.690023661401972, + "learning_rate": 2.4426038585656507e-10, + "loss": 0.5741, + "step": 9474 + }, + { + "epoch": 1.0, + "grad_norm": 6.037525938734319, + "learning_rate": 2.277054080729313e-10, + "loss": 0.6665, + "step": 9475 + }, + { + "epoch": 1.0, + "grad_norm": 3.649606612868773, + "learning_rate": 2.117312893817891e-10, + "loss": 0.6428, + "step": 9476 + }, + { + "epoch": 1.0, + "grad_norm": 2.1735139647394646, + "learning_rate": 1.9633803163887633e-10, + "loss": 0.5668, + "step": 9477 + }, + { + "epoch": 1.0, + "grad_norm": 2.423065344674815, + "learning_rate": 1.8152563663220712e-10, + "loss": 0.5754, + "step": 9478 + }, + { + "epoch": 1.0, + "grad_norm": 2.4791765768968714, + "learning_rate": 1.672941060826272e-10, + "loss": 0.6408, + "step": 9479 + }, + { + "epoch": 1.0, + "grad_norm": 2.3385528810246217, + "learning_rate": 1.5364344164436885e-10, + "loss": 0.557, + "step": 9480 + }, + { + "epoch": 1.0, + "grad_norm": 2.724307971466079, + "learning_rate": 1.4057364490227542e-10, + "loss": 0.5809, + "step": 9481 + }, + { + "epoch": 1.0, + "grad_norm": 2.66578465438902, + "learning_rate": 1.2808471737568717e-10, + "loss": 0.5427, + "step": 9482 + }, + { + "epoch": 1.0, + "grad_norm": 2.2921319289193, + "learning_rate": 1.1617666051455534e-10, + "loss": 0.6357, + "step": 9483 + }, + { + "epoch": 1.0, + "grad_norm": 2.6535598474354454, + "learning_rate": 1.0484947570277293e-10, + "loss": 0.5376, + "step": 9484 + }, + { + "epoch": 1.0, + "grad_norm": 2.6190379885009096, + "learning_rate": 9.410316425706445e-11, + "loss": 0.6051, + "step": 9485 + }, + { + "epoch": 1.0, + "grad_norm": 2.5315721986313395, + "learning_rate": 8.393772742421036e-11, + "loss": 0.6413, + "step": 9486 + }, + { + "epoch": 1.0, + "grad_norm": 2.7299924264341318, + "learning_rate": 7.435316638715329e-11, + "loss": 0.5898, + "step": 9487 + }, + { + "epoch": 1.0, + "grad_norm": 0.9023624234891299, + "learning_rate": 6.53494822577816e-11, + "loss": 0.4633, + "step": 9488 + }, + { + "epoch": 1.0, + "grad_norm": 2.4516895016558786, + "learning_rate": 5.6926676083035593e-11, + "loss": 0.6467, + "step": 9489 + }, + { + "epoch": 1.0, + "grad_norm": 2.440327397988931, + "learning_rate": 4.908474884102177e-11, + "loss": 0.558, + "step": 9490 + }, + { + "epoch": 1.0, + "grad_norm": 3.9948194582261793, + "learning_rate": 4.1823701442678114e-11, + "loss": 0.6246, + "step": 9491 + }, + { + "epoch": 1.0, + "grad_norm": 2.751705490521763, + "learning_rate": 3.514353473232923e-11, + "loss": 0.5296, + "step": 9492 + }, + { + "epoch": 1.0, + "grad_norm": 3.4070953890206592, + "learning_rate": 2.9044249485465914e-11, + "loss": 0.5484, + "step": 9493 + }, + { + "epoch": 1.0, + "grad_norm": 3.8452438404500078, + "learning_rate": 2.3525846410965557e-11, + "loss": 0.6063, + "step": 9494 + }, + { + "epoch": 1.0, + "grad_norm": 2.5585166775524413, + "learning_rate": 1.858832614942685e-11, + "loss": 0.568, + "step": 9495 + }, + { + "epoch": 1.0, + "grad_norm": 3.292945274005528, + "learning_rate": 1.4231689274835093e-11, + "loss": 0.6767, + "step": 9496 + }, + { + "epoch": 1.0, + "grad_norm": 2.339794191169871, + "learning_rate": 1.0455936293451985e-11, + "loss": 0.5886, + "step": 9497 + }, + { + "epoch": 1.0, + "grad_norm": 2.5328995861443517, + "learning_rate": 7.261067643815622e-12, + "loss": 0.5278, + "step": 9498 + }, + { + "epoch": 1.0, + "grad_norm": 2.1976850234192837, + "learning_rate": 4.647083696740495e-12, + "loss": 0.5848, + "step": 9499 + }, + { + "epoch": 1.0, + "grad_norm": 2.9136231286554226, + "learning_rate": 2.613984756427712e-12, + "loss": 0.6258, + "step": 9500 + }, + { + "epoch": 1.0, + "grad_norm": 2.1228901797117725, + "learning_rate": 1.161771059354777e-12, + "loss": 0.6679, + "step": 9501 + }, + { + "epoch": 1.0, + "grad_norm": 2.8882038893522686, + "learning_rate": 2.90442773165367e-13, + "loss": 0.5436, + "step": 9502 + }, + { + "epoch": 1.0, + "grad_norm": 3.3614569935196172, + "learning_rate": 0.0, + "loss": 0.5138, + "step": 9503 + }, + { + "epoch": 1.0, + "step": 9503, + "total_flos": 4.083480469386035e+16, + "train_loss": 0.6581587371017812, + "train_runtime": 215857.4487, + "train_samples_per_second": 8.453, + "train_steps_per_second": 0.044 + } + ], + "logging_steps": 1.0, + "max_steps": 9503, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 30000, + "total_flos": 4.083480469386035e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}