diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,32524 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999461468038128, + "eval_steps": 500, + "global_step": 4642, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.3680746555328369, + "learning_rate": 7.142857142857144e-08, + "loss": 0.6038, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.5339280366897583, + "learning_rate": 1.4285714285714287e-07, + "loss": 0.6378, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 0.68046635389328, + "learning_rate": 2.142857142857143e-07, + "loss": 0.6492, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 0.4668383002281189, + "learning_rate": 2.8571428571428575e-07, + "loss": 0.5478, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 0.7353125810623169, + "learning_rate": 3.5714285714285716e-07, + "loss": 0.5802, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.4221579134464264, + "learning_rate": 4.285714285714286e-07, + "loss": 0.5863, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 0.35981830954551697, + "learning_rate": 5.000000000000001e-07, + "loss": 0.5881, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 0.49171018600463867, + "learning_rate": 5.714285714285715e-07, + "loss": 0.6196, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 0.26764529943466187, + "learning_rate": 6.428571428571428e-07, + "loss": 0.6026, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 0.31956347823143005, + "learning_rate": 7.142857142857143e-07, + "loss": 0.6077, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.46464401483535767, + "learning_rate": 7.857142857142857e-07, + "loss": 0.6144, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 0.41506412625312805, + "learning_rate": 8.571428571428572e-07, + "loss": 0.5684, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 0.448373407125473, + "learning_rate": 9.285714285714287e-07, + "loss": 0.5785, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 0.3355347514152527, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.5688, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 0.4548545181751251, + "learning_rate": 1.0714285714285714e-06, + "loss": 0.574, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 0.4471697211265564, + "learning_rate": 1.142857142857143e-06, + "loss": 0.5526, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 0.5379983186721802, + "learning_rate": 1.2142857142857144e-06, + "loss": 0.5503, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 0.3886550962924957, + "learning_rate": 1.2857142857142856e-06, + "loss": 0.6074, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 0.4560534656047821, + "learning_rate": 1.3571428571428572e-06, + "loss": 0.6262, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 0.3122997283935547, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.6103, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 0.35939767956733704, + "learning_rate": 1.5e-06, + "loss": 0.6164, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 0.3751821517944336, + "learning_rate": 1.5714285714285714e-06, + "loss": 0.5534, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 0.5395365953445435, + "learning_rate": 1.642857142857143e-06, + "loss": 0.5911, + "step": 23 + }, + { + "epoch": 0.01, + "grad_norm": 0.46072208881378174, + "learning_rate": 1.7142857142857145e-06, + "loss": 0.5986, + "step": 24 + }, + { + "epoch": 0.01, + "grad_norm": 0.35585564374923706, + "learning_rate": 1.7857142857142859e-06, + "loss": 0.5726, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5622196197509766, + "learning_rate": 1.8571428571428573e-06, + "loss": 0.6092, + "step": 26 + }, + { + "epoch": 0.01, + "grad_norm": 0.4780106246471405, + "learning_rate": 1.928571428571429e-06, + "loss": 0.6318, + "step": 27 + }, + { + "epoch": 0.01, + "grad_norm": 0.4055005609989166, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.5731, + "step": 28 + }, + { + "epoch": 0.01, + "grad_norm": 0.29330089688301086, + "learning_rate": 2.0714285714285717e-06, + "loss": 0.577, + "step": 29 + }, + { + "epoch": 0.01, + "grad_norm": 0.4011281132698059, + "learning_rate": 2.1428571428571427e-06, + "loss": 0.5771, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.5358087420463562, + "learning_rate": 2.2142857142857146e-06, + "loss": 0.5324, + "step": 31 + }, + { + "epoch": 0.01, + "grad_norm": 0.39781442284584045, + "learning_rate": 2.285714285714286e-06, + "loss": 0.5668, + "step": 32 + }, + { + "epoch": 0.01, + "grad_norm": 0.44512811303138733, + "learning_rate": 2.3571428571428574e-06, + "loss": 0.6232, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 0.3510986566543579, + "learning_rate": 2.428571428571429e-06, + "loss": 0.5816, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 0.39098355174064636, + "learning_rate": 2.5e-06, + "loss": 0.5486, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.4460495412349701, + "learning_rate": 2.571428571428571e-06, + "loss": 0.6256, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 0.5601059794425964, + "learning_rate": 2.642857142857143e-06, + "loss": 0.5408, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 0.543770432472229, + "learning_rate": 2.7142857142857144e-06, + "loss": 0.5843, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 0.45234617590904236, + "learning_rate": 2.785714285714286e-06, + "loss": 0.6491, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 0.35524260997772217, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.5765, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.3411543071269989, + "learning_rate": 2.928571428571429e-06, + "loss": 0.5184, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 0.2239224910736084, + "learning_rate": 3e-06, + "loss": 0.5921, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 0.4779617190361023, + "learning_rate": 3.071428571428572e-06, + "loss": 0.5981, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 0.43023017048835754, + "learning_rate": 3.142857142857143e-06, + "loss": 0.5822, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 0.5614896416664124, + "learning_rate": 3.2142857142857147e-06, + "loss": 0.5585, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 0.35685351490974426, + "learning_rate": 3.285714285714286e-06, + "loss": 0.5623, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 0.2451944649219513, + "learning_rate": 3.357142857142857e-06, + "loss": 0.4998, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 0.24035461246967316, + "learning_rate": 3.428571428571429e-06, + "loss": 0.5864, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 0.29300373792648315, + "learning_rate": 3.5e-06, + "loss": 0.5861, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 0.3160554766654968, + "learning_rate": 3.5714285714285718e-06, + "loss": 0.5769, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 0.20753253996372223, + "learning_rate": 3.642857142857143e-06, + "loss": 0.5592, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 0.29364365339279175, + "learning_rate": 3.7142857142857146e-06, + "loss": 0.5855, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 0.25966310501098633, + "learning_rate": 3.785714285714286e-06, + "loss": 0.5714, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 0.5970475673675537, + "learning_rate": 3.857142857142858e-06, + "loss": 0.5955, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 0.2742187976837158, + "learning_rate": 3.928571428571429e-06, + "loss": 0.5982, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 0.21851637959480286, + "learning_rate": 4.000000000000001e-06, + "loss": 0.5086, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 0.31623247265815735, + "learning_rate": 4.071428571428572e-06, + "loss": 0.5507, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 0.21017701923847198, + "learning_rate": 4.1428571428571435e-06, + "loss": 0.5681, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 0.23857641220092773, + "learning_rate": 4.2142857142857145e-06, + "loss": 0.5873, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 0.330123633146286, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.5507, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 0.2590518295764923, + "learning_rate": 4.357142857142857e-06, + "loss": 0.5563, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 0.22396299242973328, + "learning_rate": 4.428571428571429e-06, + "loss": 0.5208, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 0.20799732208251953, + "learning_rate": 4.5e-06, + "loss": 0.5636, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 0.28667014837265015, + "learning_rate": 4.571428571428572e-06, + "loss": 0.5126, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 0.3183256983757019, + "learning_rate": 4.642857142857144e-06, + "loss": 0.4917, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 0.34055784344673157, + "learning_rate": 4.714285714285715e-06, + "loss": 0.5869, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 0.18996137380599976, + "learning_rate": 4.785714285714287e-06, + "loss": 0.5898, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 0.19642199575901031, + "learning_rate": 4.857142857142858e-06, + "loss": 0.5024, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 0.2719160318374634, + "learning_rate": 4.928571428571429e-06, + "loss": 0.5239, + "step": 69 + }, + { + "epoch": 0.02, + "grad_norm": 0.2251090109348297, + "learning_rate": 5e-06, + "loss": 0.5327, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.15708671510219574, + "learning_rate": 5.071428571428571e-06, + "loss": 0.5446, + "step": 71 + }, + { + "epoch": 0.02, + "grad_norm": 0.2416897416114807, + "learning_rate": 5.142857142857142e-06, + "loss": 0.5972, + "step": 72 + }, + { + "epoch": 0.02, + "grad_norm": 0.30218374729156494, + "learning_rate": 5.214285714285715e-06, + "loss": 0.5723, + "step": 73 + }, + { + "epoch": 0.02, + "grad_norm": 0.23136214911937714, + "learning_rate": 5.285714285714286e-06, + "loss": 0.5919, + "step": 74 + }, + { + "epoch": 0.02, + "grad_norm": 0.29520007967948914, + "learning_rate": 5.357142857142857e-06, + "loss": 0.5443, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 0.2675969898700714, + "learning_rate": 5.428571428571429e-06, + "loss": 0.6078, + "step": 76 + }, + { + "epoch": 0.02, + "grad_norm": 0.21040533483028412, + "learning_rate": 5.500000000000001e-06, + "loss": 0.5131, + "step": 77 + }, + { + "epoch": 0.02, + "grad_norm": 0.21507872641086578, + "learning_rate": 5.571428571428572e-06, + "loss": 0.5537, + "step": 78 + }, + { + "epoch": 0.02, + "grad_norm": 0.3713940680027008, + "learning_rate": 5.6428571428571435e-06, + "loss": 0.5677, + "step": 79 + }, + { + "epoch": 0.02, + "grad_norm": 0.2338705062866211, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.5583, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 0.24568618834018707, + "learning_rate": 5.785714285714286e-06, + "loss": 0.591, + "step": 81 + }, + { + "epoch": 0.02, + "grad_norm": 0.2607351541519165, + "learning_rate": 5.857142857142858e-06, + "loss": 0.5854, + "step": 82 + }, + { + "epoch": 0.02, + "grad_norm": 0.25233450531959534, + "learning_rate": 5.928571428571429e-06, + "loss": 0.5435, + "step": 83 + }, + { + "epoch": 0.02, + "grad_norm": 0.1901504397392273, + "learning_rate": 6e-06, + "loss": 0.5189, + "step": 84 + }, + { + "epoch": 0.02, + "grad_norm": 0.20455455780029297, + "learning_rate": 6.071428571428571e-06, + "loss": 0.5372, + "step": 85 + }, + { + "epoch": 0.02, + "grad_norm": 0.2110891193151474, + "learning_rate": 6.142857142857144e-06, + "loss": 0.5386, + "step": 86 + }, + { + "epoch": 0.02, + "grad_norm": 0.18980112671852112, + "learning_rate": 6.214285714285715e-06, + "loss": 0.5834, + "step": 87 + }, + { + "epoch": 0.02, + "grad_norm": 0.24649843573570251, + "learning_rate": 6.285714285714286e-06, + "loss": 0.5769, + "step": 88 + }, + { + "epoch": 0.02, + "grad_norm": 0.20015066862106323, + "learning_rate": 6.357142857142858e-06, + "loss": 0.5213, + "step": 89 + }, + { + "epoch": 0.02, + "grad_norm": 0.23394432663917542, + "learning_rate": 6.4285714285714295e-06, + "loss": 0.5351, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 0.23542846739292145, + "learning_rate": 6.5000000000000004e-06, + "loss": 0.5527, + "step": 91 + }, + { + "epoch": 0.02, + "grad_norm": 0.2840578258037567, + "learning_rate": 6.571428571428572e-06, + "loss": 0.5917, + "step": 92 + }, + { + "epoch": 0.02, + "grad_norm": 0.5794204473495483, + "learning_rate": 6.642857142857143e-06, + "loss": 0.5693, + "step": 93 + }, + { + "epoch": 0.02, + "grad_norm": 0.2579974830150604, + "learning_rate": 6.714285714285714e-06, + "loss": 0.577, + "step": 94 + }, + { + "epoch": 0.02, + "grad_norm": 0.23646292090415955, + "learning_rate": 6.785714285714287e-06, + "loss": 0.5327, + "step": 95 + }, + { + "epoch": 0.02, + "grad_norm": 0.3467201888561249, + "learning_rate": 6.857142857142858e-06, + "loss": 0.5163, + "step": 96 + }, + { + "epoch": 0.02, + "grad_norm": 0.1848195195198059, + "learning_rate": 6.928571428571429e-06, + "loss": 0.5614, + "step": 97 + }, + { + "epoch": 0.02, + "grad_norm": 0.20233601331710815, + "learning_rate": 7e-06, + "loss": 0.5993, + "step": 98 + }, + { + "epoch": 0.02, + "grad_norm": 0.2592422366142273, + "learning_rate": 7.0714285714285726e-06, + "loss": 0.5568, + "step": 99 + }, + { + "epoch": 0.02, + "grad_norm": 0.19915878772735596, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.5331, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 0.24805685877799988, + "learning_rate": 7.2142857142857145e-06, + "loss": 0.5241, + "step": 101 + }, + { + "epoch": 0.02, + "grad_norm": 0.2214994579553604, + "learning_rate": 7.285714285714286e-06, + "loss": 0.5219, + "step": 102 + }, + { + "epoch": 0.02, + "grad_norm": 0.2977463901042938, + "learning_rate": 7.357142857142858e-06, + "loss": 0.5836, + "step": 103 + }, + { + "epoch": 0.02, + "grad_norm": 0.23825155198574066, + "learning_rate": 7.428571428571429e-06, + "loss": 0.5452, + "step": 104 + }, + { + "epoch": 0.02, + "grad_norm": 0.22910349071025848, + "learning_rate": 7.500000000000001e-06, + "loss": 0.5525, + "step": 105 + }, + { + "epoch": 0.02, + "grad_norm": 0.24861909449100494, + "learning_rate": 7.571428571428572e-06, + "loss": 0.5602, + "step": 106 + }, + { + "epoch": 0.02, + "grad_norm": 0.220360666513443, + "learning_rate": 7.642857142857143e-06, + "loss": 0.4968, + "step": 107 + }, + { + "epoch": 0.02, + "grad_norm": 0.29663270711898804, + "learning_rate": 7.714285714285716e-06, + "loss": 0.547, + "step": 108 + }, + { + "epoch": 0.02, + "grad_norm": 0.15902388095855713, + "learning_rate": 7.785714285714287e-06, + "loss": 0.5593, + "step": 109 + }, + { + "epoch": 0.02, + "grad_norm": 0.25408726930618286, + "learning_rate": 7.857142857142858e-06, + "loss": 0.5129, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 0.25450989603996277, + "learning_rate": 7.928571428571429e-06, + "loss": 0.5568, + "step": 111 + }, + { + "epoch": 0.02, + "grad_norm": 0.2113712877035141, + "learning_rate": 8.000000000000001e-06, + "loss": 0.5311, + "step": 112 + }, + { + "epoch": 0.02, + "grad_norm": 0.2673487663269043, + "learning_rate": 8.071428571428572e-06, + "loss": 0.5063, + "step": 113 + }, + { + "epoch": 0.02, + "grad_norm": 0.17971846461296082, + "learning_rate": 8.142857142857143e-06, + "loss": 0.4959, + "step": 114 + }, + { + "epoch": 0.02, + "grad_norm": 0.27486327290534973, + "learning_rate": 8.214285714285714e-06, + "loss": 0.504, + "step": 115 + }, + { + "epoch": 0.02, + "grad_norm": 0.3731400966644287, + "learning_rate": 8.285714285714287e-06, + "loss": 0.5262, + "step": 116 + }, + { + "epoch": 0.03, + "grad_norm": 0.1998678743839264, + "learning_rate": 8.357142857142858e-06, + "loss": 0.6066, + "step": 117 + }, + { + "epoch": 0.03, + "grad_norm": 0.18095743656158447, + "learning_rate": 8.428571428571429e-06, + "loss": 0.581, + "step": 118 + }, + { + "epoch": 0.03, + "grad_norm": 0.20576633512973785, + "learning_rate": 8.5e-06, + "loss": 0.5646, + "step": 119 + }, + { + "epoch": 0.03, + "grad_norm": 0.21952424943447113, + "learning_rate": 8.571428571428571e-06, + "loss": 0.5274, + "step": 120 + }, + { + "epoch": 0.03, + "grad_norm": 0.22617046535015106, + "learning_rate": 8.642857142857144e-06, + "loss": 0.4918, + "step": 121 + }, + { + "epoch": 0.03, + "grad_norm": 0.22353151440620422, + "learning_rate": 8.714285714285715e-06, + "loss": 0.5383, + "step": 122 + }, + { + "epoch": 0.03, + "grad_norm": 0.24257732927799225, + "learning_rate": 8.785714285714286e-06, + "loss": 0.4734, + "step": 123 + }, + { + "epoch": 0.03, + "grad_norm": 0.16320379078388214, + "learning_rate": 8.857142857142858e-06, + "loss": 0.5033, + "step": 124 + }, + { + "epoch": 0.03, + "grad_norm": 0.21186141669750214, + "learning_rate": 8.92857142857143e-06, + "loss": 0.5116, + "step": 125 + }, + { + "epoch": 0.03, + "grad_norm": 0.1727321892976761, + "learning_rate": 9e-06, + "loss": 0.4887, + "step": 126 + }, + { + "epoch": 0.03, + "grad_norm": 0.17333361506462097, + "learning_rate": 9.071428571428573e-06, + "loss": 0.5629, + "step": 127 + }, + { + "epoch": 0.03, + "grad_norm": 0.20159348845481873, + "learning_rate": 9.142857142857144e-06, + "loss": 0.5855, + "step": 128 + }, + { + "epoch": 0.03, + "grad_norm": 0.25432631373405457, + "learning_rate": 9.214285714285715e-06, + "loss": 0.5565, + "step": 129 + }, + { + "epoch": 0.03, + "grad_norm": 0.18436311185359955, + "learning_rate": 9.285714285714288e-06, + "loss": 0.4746, + "step": 130 + }, + { + "epoch": 0.03, + "grad_norm": 0.22167499363422394, + "learning_rate": 9.357142857142859e-06, + "loss": 0.5437, + "step": 131 + }, + { + "epoch": 0.03, + "grad_norm": 0.29192057251930237, + "learning_rate": 9.42857142857143e-06, + "loss": 0.5443, + "step": 132 + }, + { + "epoch": 0.03, + "grad_norm": 0.1628040224313736, + "learning_rate": 9.5e-06, + "loss": 0.5563, + "step": 133 + }, + { + "epoch": 0.03, + "grad_norm": 0.32334551215171814, + "learning_rate": 9.571428571428573e-06, + "loss": 0.575, + "step": 134 + }, + { + "epoch": 0.03, + "grad_norm": 0.272955983877182, + "learning_rate": 9.642857142857144e-06, + "loss": 0.5363, + "step": 135 + }, + { + "epoch": 0.03, + "grad_norm": 0.2314363420009613, + "learning_rate": 9.714285714285715e-06, + "loss": 0.5356, + "step": 136 + }, + { + "epoch": 0.03, + "grad_norm": 0.18768808245658875, + "learning_rate": 9.785714285714286e-06, + "loss": 0.5053, + "step": 137 + }, + { + "epoch": 0.03, + "grad_norm": 0.22900734841823578, + "learning_rate": 9.857142857142859e-06, + "loss": 0.5594, + "step": 138 + }, + { + "epoch": 0.03, + "grad_norm": 0.1723155379295349, + "learning_rate": 9.92857142857143e-06, + "loss": 0.5104, + "step": 139 + }, + { + "epoch": 0.03, + "grad_norm": 0.2596263885498047, + "learning_rate": 1e-05, + "loss": 0.5271, + "step": 140 + }, + { + "epoch": 0.03, + "grad_norm": 0.15986420214176178, + "learning_rate": 9.999998782612734e-06, + "loss": 0.5382, + "step": 141 + }, + { + "epoch": 0.03, + "grad_norm": 0.33412984013557434, + "learning_rate": 9.999995130451526e-06, + "loss": 0.4807, + "step": 142 + }, + { + "epoch": 0.03, + "grad_norm": 0.20340685546398163, + "learning_rate": 9.999989043518153e-06, + "loss": 0.519, + "step": 143 + }, + { + "epoch": 0.03, + "grad_norm": 0.18798081576824188, + "learning_rate": 9.999980521815582e-06, + "loss": 0.5347, + "step": 144 + }, + { + "epoch": 0.03, + "grad_norm": 0.20350557565689087, + "learning_rate": 9.99996956534796e-06, + "loss": 0.4913, + "step": 145 + }, + { + "epoch": 0.03, + "grad_norm": 0.2547079026699066, + "learning_rate": 9.999956174120626e-06, + "loss": 0.5284, + "step": 146 + }, + { + "epoch": 0.03, + "grad_norm": 0.26818037033081055, + "learning_rate": 9.999940348140098e-06, + "loss": 0.5597, + "step": 147 + }, + { + "epoch": 0.03, + "grad_norm": 0.1871444284915924, + "learning_rate": 9.999922087414084e-06, + "loss": 0.4857, + "step": 148 + }, + { + "epoch": 0.03, + "grad_norm": 0.24267414212226868, + "learning_rate": 9.999901391951474e-06, + "loss": 0.5243, + "step": 149 + }, + { + "epoch": 0.03, + "grad_norm": 0.22753533720970154, + "learning_rate": 9.99987826176235e-06, + "loss": 0.4868, + "step": 150 + }, + { + "epoch": 0.03, + "grad_norm": 0.17949774861335754, + "learning_rate": 9.99985269685797e-06, + "loss": 0.5477, + "step": 151 + }, + { + "epoch": 0.03, + "grad_norm": 0.24117450416088104, + "learning_rate": 9.999824697250786e-06, + "loss": 0.5583, + "step": 152 + }, + { + "epoch": 0.03, + "grad_norm": 0.2758869230747223, + "learning_rate": 9.999794262954432e-06, + "loss": 0.6049, + "step": 153 + }, + { + "epoch": 0.03, + "grad_norm": 0.22826828062534332, + "learning_rate": 9.999761393983728e-06, + "loss": 0.5437, + "step": 154 + }, + { + "epoch": 0.03, + "grad_norm": 0.24656014144420624, + "learning_rate": 9.999726090354683e-06, + "loss": 0.5417, + "step": 155 + }, + { + "epoch": 0.03, + "grad_norm": 0.1714806854724884, + "learning_rate": 9.999688352084482e-06, + "loss": 0.5189, + "step": 156 + }, + { + "epoch": 0.03, + "grad_norm": 0.17295528948307037, + "learning_rate": 9.999648179191505e-06, + "loss": 0.5478, + "step": 157 + }, + { + "epoch": 0.03, + "grad_norm": 0.19324244558811188, + "learning_rate": 9.999605571695317e-06, + "loss": 0.5664, + "step": 158 + }, + { + "epoch": 0.03, + "grad_norm": 0.20624053478240967, + "learning_rate": 9.999560529616661e-06, + "loss": 0.5087, + "step": 159 + }, + { + "epoch": 0.03, + "grad_norm": 0.26294004917144775, + "learning_rate": 9.999513052977473e-06, + "loss": 0.6106, + "step": 160 + }, + { + "epoch": 0.03, + "grad_norm": 0.19021935760974884, + "learning_rate": 9.999463141800873e-06, + "loss": 0.4975, + "step": 161 + }, + { + "epoch": 0.03, + "grad_norm": 0.17529787123203278, + "learning_rate": 9.999410796111163e-06, + "loss": 0.5204, + "step": 162 + }, + { + "epoch": 0.04, + "grad_norm": 0.19823302328586578, + "learning_rate": 9.999356015933834e-06, + "loss": 0.5312, + "step": 163 + }, + { + "epoch": 0.04, + "grad_norm": 0.2933864891529083, + "learning_rate": 9.999298801295564e-06, + "loss": 0.5123, + "step": 164 + }, + { + "epoch": 0.04, + "grad_norm": 0.19900120794773102, + "learning_rate": 9.99923915222421e-06, + "loss": 0.5824, + "step": 165 + }, + { + "epoch": 0.04, + "grad_norm": 0.18184617161750793, + "learning_rate": 9.99917706874882e-06, + "loss": 0.5623, + "step": 166 + }, + { + "epoch": 0.04, + "grad_norm": 0.1705755591392517, + "learning_rate": 9.999112550899627e-06, + "loss": 0.5458, + "step": 167 + }, + { + "epoch": 0.04, + "grad_norm": 0.3029863238334656, + "learning_rate": 9.999045598708047e-06, + "loss": 0.54, + "step": 168 + }, + { + "epoch": 0.04, + "grad_norm": 0.1956048458814621, + "learning_rate": 9.998976212206683e-06, + "loss": 0.5387, + "step": 169 + }, + { + "epoch": 0.04, + "grad_norm": 0.1850360929965973, + "learning_rate": 9.998904391429323e-06, + "loss": 0.5085, + "step": 170 + }, + { + "epoch": 0.04, + "grad_norm": 0.21800455451011658, + "learning_rate": 9.99883013641094e-06, + "loss": 0.5358, + "step": 171 + }, + { + "epoch": 0.04, + "grad_norm": 0.2191917896270752, + "learning_rate": 9.998753447187693e-06, + "loss": 0.5668, + "step": 172 + }, + { + "epoch": 0.04, + "grad_norm": 0.22353680431842804, + "learning_rate": 9.998674323796928e-06, + "loss": 0.5358, + "step": 173 + }, + { + "epoch": 0.04, + "grad_norm": 0.2535366117954254, + "learning_rate": 9.998592766277173e-06, + "loss": 0.5041, + "step": 174 + }, + { + "epoch": 0.04, + "grad_norm": 0.20851938426494598, + "learning_rate": 9.998508774668142e-06, + "loss": 0.4944, + "step": 175 + }, + { + "epoch": 0.04, + "grad_norm": 0.1755622774362564, + "learning_rate": 9.998422349010736e-06, + "loss": 0.5156, + "step": 176 + }, + { + "epoch": 0.04, + "grad_norm": 0.15284371376037598, + "learning_rate": 9.998333489347042e-06, + "loss": 0.5233, + "step": 177 + }, + { + "epoch": 0.04, + "grad_norm": 0.2053551822900772, + "learning_rate": 9.998242195720327e-06, + "loss": 0.5414, + "step": 178 + }, + { + "epoch": 0.04, + "grad_norm": 0.18832677602767944, + "learning_rate": 9.99814846817505e-06, + "loss": 0.5724, + "step": 179 + }, + { + "epoch": 0.04, + "grad_norm": 0.19767887890338898, + "learning_rate": 9.998052306756852e-06, + "loss": 0.5258, + "step": 180 + }, + { + "epoch": 0.04, + "grad_norm": 0.17682293057441711, + "learning_rate": 9.997953711512556e-06, + "loss": 0.5718, + "step": 181 + }, + { + "epoch": 0.04, + "grad_norm": 0.23476260900497437, + "learning_rate": 9.997852682490179e-06, + "loss": 0.5566, + "step": 182 + }, + { + "epoch": 0.04, + "grad_norm": 0.18441180884838104, + "learning_rate": 9.997749219738912e-06, + "loss": 0.583, + "step": 183 + }, + { + "epoch": 0.04, + "grad_norm": 0.2411859780550003, + "learning_rate": 9.997643323309139e-06, + "loss": 0.4423, + "step": 184 + }, + { + "epoch": 0.04, + "grad_norm": 0.1665976196527481, + "learning_rate": 9.997534993252427e-06, + "loss": 0.5353, + "step": 185 + }, + { + "epoch": 0.04, + "grad_norm": 0.1685672253370285, + "learning_rate": 9.997424229621529e-06, + "loss": 0.5073, + "step": 186 + }, + { + "epoch": 0.04, + "grad_norm": 0.277639776468277, + "learning_rate": 9.99731103247038e-06, + "loss": 0.4813, + "step": 187 + }, + { + "epoch": 0.04, + "grad_norm": 0.17908422648906708, + "learning_rate": 9.997195401854102e-06, + "loss": 0.5088, + "step": 188 + }, + { + "epoch": 0.04, + "grad_norm": 0.1873718649148941, + "learning_rate": 9.997077337829003e-06, + "loss": 0.5072, + "step": 189 + }, + { + "epoch": 0.04, + "grad_norm": 0.256670743227005, + "learning_rate": 9.996956840452573e-06, + "loss": 0.4865, + "step": 190 + }, + { + "epoch": 0.04, + "grad_norm": 0.27443787455558777, + "learning_rate": 9.996833909783492e-06, + "loss": 0.5466, + "step": 191 + }, + { + "epoch": 0.04, + "grad_norm": 0.19919687509536743, + "learning_rate": 9.996708545881617e-06, + "loss": 0.5387, + "step": 192 + }, + { + "epoch": 0.04, + "grad_norm": 0.16513916850090027, + "learning_rate": 9.996580748808e-06, + "loss": 0.5223, + "step": 193 + }, + { + "epoch": 0.04, + "grad_norm": 0.20502988994121552, + "learning_rate": 9.996450518624868e-06, + "loss": 0.5194, + "step": 194 + }, + { + "epoch": 0.04, + "grad_norm": 0.18695437908172607, + "learning_rate": 9.99631785539564e-06, + "loss": 0.4778, + "step": 195 + }, + { + "epoch": 0.04, + "grad_norm": 0.16061006486415863, + "learning_rate": 9.996182759184916e-06, + "loss": 0.5192, + "step": 196 + }, + { + "epoch": 0.04, + "grad_norm": 0.18725766241550446, + "learning_rate": 9.99604523005848e-06, + "loss": 0.5199, + "step": 197 + }, + { + "epoch": 0.04, + "grad_norm": 0.1948050707578659, + "learning_rate": 9.995905268083306e-06, + "loss": 0.5511, + "step": 198 + }, + { + "epoch": 0.04, + "grad_norm": 0.1752336025238037, + "learning_rate": 9.995762873327548e-06, + "loss": 0.5705, + "step": 199 + }, + { + "epoch": 0.04, + "grad_norm": 0.2874692678451538, + "learning_rate": 9.995618045860545e-06, + "loss": 0.5504, + "step": 200 + }, + { + "epoch": 0.04, + "grad_norm": 0.18488091230392456, + "learning_rate": 9.99547078575282e-06, + "loss": 0.5219, + "step": 201 + }, + { + "epoch": 0.04, + "grad_norm": 0.21118810772895813, + "learning_rate": 9.995321093076085e-06, + "loss": 0.6084, + "step": 202 + }, + { + "epoch": 0.04, + "grad_norm": 0.17937391996383667, + "learning_rate": 9.99516896790323e-06, + "loss": 0.4974, + "step": 203 + }, + { + "epoch": 0.04, + "grad_norm": 0.24880222976207733, + "learning_rate": 9.995014410308336e-06, + "loss": 0.5524, + "step": 204 + }, + { + "epoch": 0.04, + "grad_norm": 0.2270919531583786, + "learning_rate": 9.994857420366669e-06, + "loss": 0.5298, + "step": 205 + }, + { + "epoch": 0.04, + "grad_norm": 0.2064422369003296, + "learning_rate": 9.994697998154668e-06, + "loss": 0.5442, + "step": 206 + }, + { + "epoch": 0.04, + "grad_norm": 0.18932758271694183, + "learning_rate": 9.994536143749969e-06, + "loss": 0.4992, + "step": 207 + }, + { + "epoch": 0.04, + "grad_norm": 0.18627791106700897, + "learning_rate": 9.994371857231388e-06, + "loss": 0.5652, + "step": 208 + }, + { + "epoch": 0.05, + "grad_norm": 0.18181046843528748, + "learning_rate": 9.994205138678923e-06, + "loss": 0.4876, + "step": 209 + }, + { + "epoch": 0.05, + "grad_norm": 0.19523365795612335, + "learning_rate": 9.99403598817376e-06, + "loss": 0.5355, + "step": 210 + }, + { + "epoch": 0.05, + "grad_norm": 0.202137753367424, + "learning_rate": 9.993864405798268e-06, + "loss": 0.5474, + "step": 211 + }, + { + "epoch": 0.05, + "grad_norm": 0.1764814555644989, + "learning_rate": 9.993690391636e-06, + "loss": 0.511, + "step": 212 + }, + { + "epoch": 0.05, + "grad_norm": 0.36145585775375366, + "learning_rate": 9.99351394577169e-06, + "loss": 0.5303, + "step": 213 + }, + { + "epoch": 0.05, + "grad_norm": 0.16018763184547424, + "learning_rate": 9.993335068291264e-06, + "loss": 0.5363, + "step": 214 + }, + { + "epoch": 0.05, + "grad_norm": 0.14477033913135529, + "learning_rate": 9.993153759281824e-06, + "loss": 0.5394, + "step": 215 + }, + { + "epoch": 0.05, + "grad_norm": 0.20048737525939941, + "learning_rate": 9.99297001883166e-06, + "loss": 0.5348, + "step": 216 + }, + { + "epoch": 0.05, + "grad_norm": 0.1712445169687271, + "learning_rate": 9.992783847030246e-06, + "loss": 0.5438, + "step": 217 + }, + { + "epoch": 0.05, + "grad_norm": 0.15328474342823029, + "learning_rate": 9.992595243968238e-06, + "loss": 0.5454, + "step": 218 + }, + { + "epoch": 0.05, + "grad_norm": 0.22686372697353363, + "learning_rate": 9.992404209737476e-06, + "loss": 0.5648, + "step": 219 + }, + { + "epoch": 0.05, + "grad_norm": 0.21831001341342926, + "learning_rate": 9.99221074443099e-06, + "loss": 0.5224, + "step": 220 + }, + { + "epoch": 0.05, + "grad_norm": 0.15410234034061432, + "learning_rate": 9.992014848142984e-06, + "loss": 0.5185, + "step": 221 + }, + { + "epoch": 0.05, + "grad_norm": 0.20523761212825775, + "learning_rate": 9.991816520968853e-06, + "loss": 0.5687, + "step": 222 + }, + { + "epoch": 0.05, + "grad_norm": 0.15560153126716614, + "learning_rate": 9.991615763005172e-06, + "loss": 0.5229, + "step": 223 + }, + { + "epoch": 0.05, + "grad_norm": 0.19702470302581787, + "learning_rate": 9.991412574349704e-06, + "loss": 0.5337, + "step": 224 + }, + { + "epoch": 0.05, + "grad_norm": 0.24464666843414307, + "learning_rate": 9.991206955101388e-06, + "loss": 0.5367, + "step": 225 + }, + { + "epoch": 0.05, + "grad_norm": 0.1894879937171936, + "learning_rate": 9.990998905360357e-06, + "loss": 0.5228, + "step": 226 + }, + { + "epoch": 0.05, + "grad_norm": 0.14452479779720306, + "learning_rate": 9.990788425227915e-06, + "loss": 0.5354, + "step": 227 + }, + { + "epoch": 0.05, + "grad_norm": 0.23448392748832703, + "learning_rate": 9.990575514806563e-06, + "loss": 0.545, + "step": 228 + }, + { + "epoch": 0.05, + "grad_norm": 0.20030318200588226, + "learning_rate": 9.990360174199975e-06, + "loss": 0.5239, + "step": 229 + }, + { + "epoch": 0.05, + "grad_norm": 0.1632775366306305, + "learning_rate": 9.990142403513012e-06, + "loss": 0.5507, + "step": 230 + }, + { + "epoch": 0.05, + "grad_norm": 0.17613913118839264, + "learning_rate": 9.989922202851722e-06, + "loss": 0.5077, + "step": 231 + }, + { + "epoch": 0.05, + "grad_norm": 0.1816764920949936, + "learning_rate": 9.989699572323328e-06, + "loss": 0.5121, + "step": 232 + }, + { + "epoch": 0.05, + "grad_norm": 0.18507419526576996, + "learning_rate": 9.989474512036245e-06, + "loss": 0.5335, + "step": 233 + }, + { + "epoch": 0.05, + "grad_norm": 0.22486189007759094, + "learning_rate": 9.989247022100065e-06, + "loss": 0.5223, + "step": 234 + }, + { + "epoch": 0.05, + "grad_norm": 0.19390611350536346, + "learning_rate": 9.989017102625565e-06, + "loss": 0.564, + "step": 235 + }, + { + "epoch": 0.05, + "grad_norm": 0.22769489884376526, + "learning_rate": 9.988784753724707e-06, + "loss": 0.4891, + "step": 236 + }, + { + "epoch": 0.05, + "grad_norm": 0.18696601688861847, + "learning_rate": 9.988549975510635e-06, + "loss": 0.5424, + "step": 237 + }, + { + "epoch": 0.05, + "grad_norm": 0.1786351501941681, + "learning_rate": 9.988312768097673e-06, + "loss": 0.5279, + "step": 238 + }, + { + "epoch": 0.05, + "grad_norm": 0.19431112706661224, + "learning_rate": 9.988073131601332e-06, + "loss": 0.5463, + "step": 239 + }, + { + "epoch": 0.05, + "grad_norm": 0.171942800283432, + "learning_rate": 9.987831066138302e-06, + "loss": 0.5208, + "step": 240 + }, + { + "epoch": 0.05, + "grad_norm": 0.15704870223999023, + "learning_rate": 9.987586571826461e-06, + "loss": 0.5413, + "step": 241 + }, + { + "epoch": 0.05, + "grad_norm": 0.16401955485343933, + "learning_rate": 9.987339648784866e-06, + "loss": 0.562, + "step": 242 + }, + { + "epoch": 0.05, + "grad_norm": 0.2467910647392273, + "learning_rate": 9.987090297133756e-06, + "loss": 0.559, + "step": 243 + }, + { + "epoch": 0.05, + "grad_norm": 0.1753203123807907, + "learning_rate": 9.986838516994555e-06, + "loss": 0.6251, + "step": 244 + }, + { + "epoch": 0.05, + "grad_norm": 0.21456435322761536, + "learning_rate": 9.986584308489867e-06, + "loss": 0.5495, + "step": 245 + }, + { + "epoch": 0.05, + "grad_norm": 0.2137192189693451, + "learning_rate": 9.986327671743484e-06, + "loss": 0.5475, + "step": 246 + }, + { + "epoch": 0.05, + "grad_norm": 0.16317638754844666, + "learning_rate": 9.98606860688037e-06, + "loss": 0.565, + "step": 247 + }, + { + "epoch": 0.05, + "grad_norm": 0.15220917761325836, + "learning_rate": 9.985807114026684e-06, + "loss": 0.5185, + "step": 248 + }, + { + "epoch": 0.05, + "grad_norm": 0.2033926397562027, + "learning_rate": 9.98554319330976e-06, + "loss": 0.5255, + "step": 249 + }, + { + "epoch": 0.05, + "grad_norm": 0.17678335309028625, + "learning_rate": 9.985276844858114e-06, + "loss": 0.5371, + "step": 250 + }, + { + "epoch": 0.05, + "grad_norm": 0.1734929084777832, + "learning_rate": 9.985008068801446e-06, + "loss": 0.5148, + "step": 251 + }, + { + "epoch": 0.05, + "grad_norm": 0.19290420413017273, + "learning_rate": 9.984736865270637e-06, + "loss": 0.5077, + "step": 252 + }, + { + "epoch": 0.05, + "grad_norm": 0.19296742975711823, + "learning_rate": 9.984463234397752e-06, + "loss": 0.5376, + "step": 253 + }, + { + "epoch": 0.05, + "grad_norm": 0.21208150684833527, + "learning_rate": 9.984187176316038e-06, + "loss": 0.5431, + "step": 254 + }, + { + "epoch": 0.05, + "grad_norm": 0.16331081092357635, + "learning_rate": 9.983908691159921e-06, + "loss": 0.5494, + "step": 255 + }, + { + "epoch": 0.06, + "grad_norm": 0.2127610296010971, + "learning_rate": 9.983627779065012e-06, + "loss": 0.5196, + "step": 256 + }, + { + "epoch": 0.06, + "grad_norm": 0.27896058559417725, + "learning_rate": 9.983344440168101e-06, + "loss": 0.5004, + "step": 257 + }, + { + "epoch": 0.06, + "grad_norm": 0.2308092564344406, + "learning_rate": 9.983058674607164e-06, + "loss": 0.4996, + "step": 258 + }, + { + "epoch": 0.06, + "grad_norm": 0.1815384477376938, + "learning_rate": 9.982770482521353e-06, + "loss": 0.5484, + "step": 259 + }, + { + "epoch": 0.06, + "grad_norm": 0.19610373675823212, + "learning_rate": 9.982479864051005e-06, + "loss": 0.5465, + "step": 260 + }, + { + "epoch": 0.06, + "grad_norm": 0.21339653432369232, + "learning_rate": 9.982186819337639e-06, + "loss": 0.5318, + "step": 261 + }, + { + "epoch": 0.06, + "grad_norm": 0.16528427600860596, + "learning_rate": 9.981891348523955e-06, + "loss": 0.5164, + "step": 262 + }, + { + "epoch": 0.06, + "grad_norm": 0.19075849652290344, + "learning_rate": 9.981593451753833e-06, + "loss": 0.482, + "step": 263 + }, + { + "epoch": 0.06, + "grad_norm": 0.18467120826244354, + "learning_rate": 9.981293129172334e-06, + "loss": 0.4893, + "step": 264 + }, + { + "epoch": 0.06, + "grad_norm": 0.2131132185459137, + "learning_rate": 9.980990380925705e-06, + "loss": 0.5839, + "step": 265 + }, + { + "epoch": 0.06, + "grad_norm": 0.214164599776268, + "learning_rate": 9.980685207161368e-06, + "loss": 0.5351, + "step": 266 + }, + { + "epoch": 0.06, + "grad_norm": 0.20279422402381897, + "learning_rate": 9.98037760802793e-06, + "loss": 0.5339, + "step": 267 + }, + { + "epoch": 0.06, + "grad_norm": 0.16691498458385468, + "learning_rate": 9.980067583675177e-06, + "loss": 0.5257, + "step": 268 + }, + { + "epoch": 0.06, + "grad_norm": 0.19010309875011444, + "learning_rate": 9.97975513425408e-06, + "loss": 0.4422, + "step": 269 + }, + { + "epoch": 0.06, + "grad_norm": 0.17077746987342834, + "learning_rate": 9.979440259916782e-06, + "loss": 0.5756, + "step": 270 + }, + { + "epoch": 0.06, + "grad_norm": 0.15563777089118958, + "learning_rate": 9.979122960816617e-06, + "loss": 0.5803, + "step": 271 + }, + { + "epoch": 0.06, + "grad_norm": 0.1896345168352127, + "learning_rate": 9.978803237108095e-06, + "loss": 0.5307, + "step": 272 + }, + { + "epoch": 0.06, + "grad_norm": 0.20084036886692047, + "learning_rate": 9.978481088946905e-06, + "loss": 0.4988, + "step": 273 + }, + { + "epoch": 0.06, + "grad_norm": 0.18005971610546112, + "learning_rate": 9.97815651648992e-06, + "loss": 0.494, + "step": 274 + }, + { + "epoch": 0.06, + "grad_norm": 0.14255790412425995, + "learning_rate": 9.977829519895193e-06, + "loss": 0.5534, + "step": 275 + }, + { + "epoch": 0.06, + "grad_norm": 0.1580318808555603, + "learning_rate": 9.977500099321956e-06, + "loss": 0.5083, + "step": 276 + }, + { + "epoch": 0.06, + "grad_norm": 0.20587489008903503, + "learning_rate": 9.977168254930621e-06, + "loss": 0.5438, + "step": 277 + }, + { + "epoch": 0.06, + "grad_norm": 0.18426474928855896, + "learning_rate": 9.97683398688278e-06, + "loss": 0.5399, + "step": 278 + }, + { + "epoch": 0.06, + "grad_norm": 0.17722034454345703, + "learning_rate": 9.976497295341212e-06, + "loss": 0.4957, + "step": 279 + }, + { + "epoch": 0.06, + "grad_norm": 0.216731995344162, + "learning_rate": 9.976158180469866e-06, + "loss": 0.5127, + "step": 280 + }, + { + "epoch": 0.06, + "grad_norm": 0.20815429091453552, + "learning_rate": 9.975816642433876e-06, + "loss": 0.5859, + "step": 281 + }, + { + "epoch": 0.06, + "grad_norm": 0.15470731258392334, + "learning_rate": 9.975472681399556e-06, + "loss": 0.5417, + "step": 282 + }, + { + "epoch": 0.06, + "grad_norm": 0.17505955696105957, + "learning_rate": 9.975126297534399e-06, + "loss": 0.5197, + "step": 283 + }, + { + "epoch": 0.06, + "grad_norm": 0.15607048571109772, + "learning_rate": 9.97477749100708e-06, + "loss": 0.5202, + "step": 284 + }, + { + "epoch": 0.06, + "grad_norm": 0.25984108448028564, + "learning_rate": 9.97442626198745e-06, + "loss": 0.5113, + "step": 285 + }, + { + "epoch": 0.06, + "grad_norm": 0.17469698190689087, + "learning_rate": 9.974072610646543e-06, + "loss": 0.5274, + "step": 286 + }, + { + "epoch": 0.06, + "grad_norm": 0.1947067826986313, + "learning_rate": 9.973716537156573e-06, + "loss": 0.5743, + "step": 287 + }, + { + "epoch": 0.06, + "grad_norm": 0.16918258368968964, + "learning_rate": 9.973358041690926e-06, + "loss": 0.5623, + "step": 288 + }, + { + "epoch": 0.06, + "grad_norm": 0.1726803183555603, + "learning_rate": 9.972997124424179e-06, + "loss": 0.5577, + "step": 289 + }, + { + "epoch": 0.06, + "grad_norm": 0.22636979818344116, + "learning_rate": 9.972633785532082e-06, + "loss": 0.4822, + "step": 290 + }, + { + "epoch": 0.06, + "grad_norm": 0.1924733966588974, + "learning_rate": 9.972268025191561e-06, + "loss": 0.5294, + "step": 291 + }, + { + "epoch": 0.06, + "grad_norm": 0.16325077414512634, + "learning_rate": 9.971899843580728e-06, + "loss": 0.5588, + "step": 292 + }, + { + "epoch": 0.06, + "grad_norm": 0.17010986804962158, + "learning_rate": 9.971529240878869e-06, + "loss": 0.5254, + "step": 293 + }, + { + "epoch": 0.06, + "grad_norm": 0.22150567173957825, + "learning_rate": 9.971156217266451e-06, + "loss": 0.545, + "step": 294 + }, + { + "epoch": 0.06, + "grad_norm": 0.24462495744228363, + "learning_rate": 9.97078077292512e-06, + "loss": 0.5738, + "step": 295 + }, + { + "epoch": 0.06, + "grad_norm": 0.21568679809570312, + "learning_rate": 9.970402908037703e-06, + "loss": 0.5129, + "step": 296 + }, + { + "epoch": 0.06, + "grad_norm": 0.22609004378318787, + "learning_rate": 9.970022622788198e-06, + "loss": 0.535, + "step": 297 + }, + { + "epoch": 0.06, + "grad_norm": 0.20871202647686005, + "learning_rate": 9.96963991736179e-06, + "loss": 0.5405, + "step": 298 + }, + { + "epoch": 0.06, + "grad_norm": 0.20957247912883759, + "learning_rate": 9.969254791944839e-06, + "loss": 0.4701, + "step": 299 + }, + { + "epoch": 0.06, + "grad_norm": 0.26258644461631775, + "learning_rate": 9.968867246724882e-06, + "loss": 0.5575, + "step": 300 + }, + { + "epoch": 0.06, + "grad_norm": 0.17494796216487885, + "learning_rate": 9.96847728189064e-06, + "loss": 0.5414, + "step": 301 + }, + { + "epoch": 0.07, + "grad_norm": 0.21521888673305511, + "learning_rate": 9.968084897632004e-06, + "loss": 0.5152, + "step": 302 + }, + { + "epoch": 0.07, + "grad_norm": 0.16436263918876648, + "learning_rate": 9.967690094140052e-06, + "loss": 0.5144, + "step": 303 + }, + { + "epoch": 0.07, + "grad_norm": 0.17806245386600494, + "learning_rate": 9.96729287160703e-06, + "loss": 0.59, + "step": 304 + }, + { + "epoch": 0.07, + "grad_norm": 0.13292109966278076, + "learning_rate": 9.966893230226371e-06, + "loss": 0.5804, + "step": 305 + }, + { + "epoch": 0.07, + "grad_norm": 0.18737316131591797, + "learning_rate": 9.966491170192682e-06, + "loss": 0.5104, + "step": 306 + }, + { + "epoch": 0.07, + "grad_norm": 0.25303030014038086, + "learning_rate": 9.966086691701748e-06, + "loss": 0.5501, + "step": 307 + }, + { + "epoch": 0.07, + "grad_norm": 0.17302893102169037, + "learning_rate": 9.96567979495053e-06, + "loss": 0.5236, + "step": 308 + }, + { + "epoch": 0.07, + "grad_norm": 0.16693797707557678, + "learning_rate": 9.96527048013717e-06, + "loss": 0.5157, + "step": 309 + }, + { + "epoch": 0.07, + "grad_norm": 0.22584576904773712, + "learning_rate": 9.964858747460989e-06, + "loss": 0.5828, + "step": 310 + }, + { + "epoch": 0.07, + "grad_norm": 0.21684272587299347, + "learning_rate": 9.964444597122476e-06, + "loss": 0.5082, + "step": 311 + }, + { + "epoch": 0.07, + "grad_norm": 0.16628780961036682, + "learning_rate": 9.964028029323305e-06, + "loss": 0.5581, + "step": 312 + }, + { + "epoch": 0.07, + "grad_norm": 0.15919576585292816, + "learning_rate": 9.963609044266328e-06, + "loss": 0.5713, + "step": 313 + }, + { + "epoch": 0.07, + "grad_norm": 0.17761071026325226, + "learning_rate": 9.963187642155573e-06, + "loss": 0.5417, + "step": 314 + }, + { + "epoch": 0.07, + "grad_norm": 0.24367289245128632, + "learning_rate": 9.962763823196242e-06, + "loss": 0.5147, + "step": 315 + }, + { + "epoch": 0.07, + "grad_norm": 0.15818822383880615, + "learning_rate": 9.962337587594713e-06, + "loss": 0.4555, + "step": 316 + }, + { + "epoch": 0.07, + "grad_norm": 0.1815144419670105, + "learning_rate": 9.961908935558548e-06, + "loss": 0.5394, + "step": 317 + }, + { + "epoch": 0.07, + "grad_norm": 0.18169505894184113, + "learning_rate": 9.961477867296479e-06, + "loss": 0.5654, + "step": 318 + }, + { + "epoch": 0.07, + "grad_norm": 0.15763549506664276, + "learning_rate": 9.961044383018416e-06, + "loss": 0.5565, + "step": 319 + }, + { + "epoch": 0.07, + "grad_norm": 0.1651836484670639, + "learning_rate": 9.96060848293545e-06, + "loss": 0.5522, + "step": 320 + }, + { + "epoch": 0.07, + "grad_norm": 0.1605907380580902, + "learning_rate": 9.96017016725984e-06, + "loss": 0.521, + "step": 321 + }, + { + "epoch": 0.07, + "grad_norm": 0.1753988415002823, + "learning_rate": 9.959729436205027e-06, + "loss": 0.5217, + "step": 322 + }, + { + "epoch": 0.07, + "grad_norm": 0.15843777358531952, + "learning_rate": 9.95928628998563e-06, + "loss": 0.5384, + "step": 323 + }, + { + "epoch": 0.07, + "grad_norm": 0.15907292068004608, + "learning_rate": 9.95884072881744e-06, + "loss": 0.4972, + "step": 324 + }, + { + "epoch": 0.07, + "grad_norm": 0.14925910532474518, + "learning_rate": 9.958392752917425e-06, + "loss": 0.5313, + "step": 325 + }, + { + "epoch": 0.07, + "grad_norm": 0.1515308916568756, + "learning_rate": 9.957942362503728e-06, + "loss": 0.5329, + "step": 326 + }, + { + "epoch": 0.07, + "grad_norm": 0.1642216593027115, + "learning_rate": 9.957489557795667e-06, + "loss": 0.516, + "step": 327 + }, + { + "epoch": 0.07, + "grad_norm": 0.1622897833585739, + "learning_rate": 9.957034339013742e-06, + "loss": 0.5641, + "step": 328 + }, + { + "epoch": 0.07, + "grad_norm": 0.19205595552921295, + "learning_rate": 9.956576706379623e-06, + "loss": 0.5109, + "step": 329 + }, + { + "epoch": 0.07, + "grad_norm": 0.16615568101406097, + "learning_rate": 9.956116660116155e-06, + "loss": 0.5208, + "step": 330 + }, + { + "epoch": 0.07, + "grad_norm": 0.24837036430835724, + "learning_rate": 9.95565420044736e-06, + "loss": 0.5572, + "step": 331 + }, + { + "epoch": 0.07, + "grad_norm": 0.15492476522922516, + "learning_rate": 9.955189327598435e-06, + "loss": 0.5439, + "step": 332 + }, + { + "epoch": 0.07, + "grad_norm": 0.21482093632221222, + "learning_rate": 9.954722041795753e-06, + "loss": 0.5498, + "step": 333 + }, + { + "epoch": 0.07, + "grad_norm": 0.2388935685157776, + "learning_rate": 9.954252343266859e-06, + "loss": 0.4783, + "step": 334 + }, + { + "epoch": 0.07, + "grad_norm": 0.16139458119869232, + "learning_rate": 9.953780232240477e-06, + "loss": 0.5553, + "step": 335 + }, + { + "epoch": 0.07, + "grad_norm": 0.20065993070602417, + "learning_rate": 9.953305708946504e-06, + "loss": 0.5273, + "step": 336 + }, + { + "epoch": 0.07, + "grad_norm": 0.19159255921840668, + "learning_rate": 9.95282877361601e-06, + "loss": 0.5237, + "step": 337 + }, + { + "epoch": 0.07, + "grad_norm": 0.21565450727939606, + "learning_rate": 9.952349426481243e-06, + "loss": 0.5408, + "step": 338 + }, + { + "epoch": 0.07, + "grad_norm": 0.16351784765720367, + "learning_rate": 9.95186766777562e-06, + "loss": 0.516, + "step": 339 + }, + { + "epoch": 0.07, + "grad_norm": 0.18081237375736237, + "learning_rate": 9.95138349773374e-06, + "loss": 0.5319, + "step": 340 + }, + { + "epoch": 0.07, + "grad_norm": 0.17618371546268463, + "learning_rate": 9.950896916591368e-06, + "loss": 0.4894, + "step": 341 + }, + { + "epoch": 0.07, + "grad_norm": 0.21193227171897888, + "learning_rate": 9.95040792458545e-06, + "loss": 0.5017, + "step": 342 + }, + { + "epoch": 0.07, + "grad_norm": 0.15902818739414215, + "learning_rate": 9.949916521954104e-06, + "loss": 0.5468, + "step": 343 + }, + { + "epoch": 0.07, + "grad_norm": 0.15740692615509033, + "learning_rate": 9.949422708936616e-06, + "loss": 0.5108, + "step": 344 + }, + { + "epoch": 0.07, + "grad_norm": 0.2678494453430176, + "learning_rate": 9.948926485773455e-06, + "loss": 0.5588, + "step": 345 + }, + { + "epoch": 0.07, + "grad_norm": 0.11566779017448425, + "learning_rate": 9.948427852706257e-06, + "loss": 0.5603, + "step": 346 + }, + { + "epoch": 0.07, + "grad_norm": 0.19342099130153656, + "learning_rate": 9.947926809977835e-06, + "loss": 0.5577, + "step": 347 + }, + { + "epoch": 0.07, + "grad_norm": 0.20827247202396393, + "learning_rate": 9.947423357832176e-06, + "loss": 0.5401, + "step": 348 + }, + { + "epoch": 0.08, + "grad_norm": 0.21266911923885345, + "learning_rate": 9.946917496514435e-06, + "loss": 0.5555, + "step": 349 + }, + { + "epoch": 0.08, + "grad_norm": 0.2901332378387451, + "learning_rate": 9.946409226270945e-06, + "loss": 0.5615, + "step": 350 + }, + { + "epoch": 0.08, + "grad_norm": 0.18509002029895782, + "learning_rate": 9.94589854734921e-06, + "loss": 0.5187, + "step": 351 + }, + { + "epoch": 0.08, + "grad_norm": 0.1265445202589035, + "learning_rate": 9.945385459997909e-06, + "loss": 0.5356, + "step": 352 + }, + { + "epoch": 0.08, + "grad_norm": 0.1595221310853958, + "learning_rate": 9.944869964466892e-06, + "loss": 0.5861, + "step": 353 + }, + { + "epoch": 0.08, + "grad_norm": 0.24293914437294006, + "learning_rate": 9.944352061007182e-06, + "loss": 0.5336, + "step": 354 + }, + { + "epoch": 0.08, + "grad_norm": 0.17409418523311615, + "learning_rate": 9.943831749870973e-06, + "loss": 0.5194, + "step": 355 + }, + { + "epoch": 0.08, + "grad_norm": 0.2043304294347763, + "learning_rate": 9.943309031311637e-06, + "loss": 0.5477, + "step": 356 + }, + { + "epoch": 0.08, + "grad_norm": 0.16727691888809204, + "learning_rate": 9.942783905583711e-06, + "loss": 0.5276, + "step": 357 + }, + { + "epoch": 0.08, + "grad_norm": 0.19069473445415497, + "learning_rate": 9.942256372942909e-06, + "loss": 0.5096, + "step": 358 + }, + { + "epoch": 0.08, + "grad_norm": 0.1717585176229477, + "learning_rate": 9.941726433646115e-06, + "loss": 0.5186, + "step": 359 + }, + { + "epoch": 0.08, + "grad_norm": 0.15714509785175323, + "learning_rate": 9.941194087951384e-06, + "loss": 0.5358, + "step": 360 + }, + { + "epoch": 0.08, + "grad_norm": 0.1681104153394699, + "learning_rate": 9.940659336117948e-06, + "loss": 0.5832, + "step": 361 + }, + { + "epoch": 0.08, + "grad_norm": 0.21985803544521332, + "learning_rate": 9.940122178406205e-06, + "loss": 0.5477, + "step": 362 + }, + { + "epoch": 0.08, + "grad_norm": 0.19286422431468964, + "learning_rate": 9.939582615077724e-06, + "loss": 0.5428, + "step": 363 + }, + { + "epoch": 0.08, + "grad_norm": 0.198882594704628, + "learning_rate": 9.939040646395252e-06, + "loss": 0.5572, + "step": 364 + }, + { + "epoch": 0.08, + "grad_norm": 0.18456579744815826, + "learning_rate": 9.938496272622703e-06, + "loss": 0.5168, + "step": 365 + }, + { + "epoch": 0.08, + "grad_norm": 0.15817482769489288, + "learning_rate": 9.93794949402516e-06, + "loss": 0.4895, + "step": 366 + }, + { + "epoch": 0.08, + "grad_norm": 0.17915575206279755, + "learning_rate": 9.937400310868883e-06, + "loss": 0.6069, + "step": 367 + }, + { + "epoch": 0.08, + "grad_norm": 0.24024631083011627, + "learning_rate": 9.936848723421295e-06, + "loss": 0.5585, + "step": 368 + }, + { + "epoch": 0.08, + "grad_norm": 0.20291557908058167, + "learning_rate": 9.936294731950999e-06, + "loss": 0.5197, + "step": 369 + }, + { + "epoch": 0.08, + "grad_norm": 0.17010553181171417, + "learning_rate": 9.93573833672776e-06, + "loss": 0.4911, + "step": 370 + }, + { + "epoch": 0.08, + "grad_norm": 0.21095992624759674, + "learning_rate": 9.935179538022518e-06, + "loss": 0.5152, + "step": 371 + }, + { + "epoch": 0.08, + "grad_norm": 0.14240865409374237, + "learning_rate": 9.934618336107385e-06, + "loss": 0.5663, + "step": 372 + }, + { + "epoch": 0.08, + "grad_norm": 0.13972750306129456, + "learning_rate": 9.934054731255638e-06, + "loss": 0.5214, + "step": 373 + }, + { + "epoch": 0.08, + "grad_norm": 0.2143140733242035, + "learning_rate": 9.933488723741731e-06, + "loss": 0.5213, + "step": 374 + }, + { + "epoch": 0.08, + "grad_norm": 0.16753612458705902, + "learning_rate": 9.932920313841281e-06, + "loss": 0.5654, + "step": 375 + }, + { + "epoch": 0.08, + "grad_norm": 0.17069406807422638, + "learning_rate": 9.932349501831077e-06, + "loss": 0.5813, + "step": 376 + }, + { + "epoch": 0.08, + "grad_norm": 0.16417956352233887, + "learning_rate": 9.931776287989084e-06, + "loss": 0.5091, + "step": 377 + }, + { + "epoch": 0.08, + "grad_norm": 0.14209146797657013, + "learning_rate": 9.931200672594425e-06, + "loss": 0.5498, + "step": 378 + }, + { + "epoch": 0.08, + "grad_norm": 0.15816187858581543, + "learning_rate": 9.930622655927403e-06, + "loss": 0.5175, + "step": 379 + }, + { + "epoch": 0.08, + "grad_norm": 0.1323387622833252, + "learning_rate": 9.930042238269485e-06, + "loss": 0.5217, + "step": 380 + }, + { + "epoch": 0.08, + "grad_norm": 0.14928382635116577, + "learning_rate": 9.929459419903307e-06, + "loss": 0.5655, + "step": 381 + }, + { + "epoch": 0.08, + "grad_norm": 0.1628965139389038, + "learning_rate": 9.928874201112677e-06, + "loss": 0.5221, + "step": 382 + }, + { + "epoch": 0.08, + "grad_norm": 0.16259951889514923, + "learning_rate": 9.92828658218257e-06, + "loss": 0.496, + "step": 383 + }, + { + "epoch": 0.08, + "grad_norm": 0.15290167927742004, + "learning_rate": 9.927696563399127e-06, + "loss": 0.5241, + "step": 384 + }, + { + "epoch": 0.08, + "grad_norm": 0.1801231950521469, + "learning_rate": 9.927104145049664e-06, + "loss": 0.5671, + "step": 385 + }, + { + "epoch": 0.08, + "grad_norm": 0.1866559088230133, + "learning_rate": 9.926509327422661e-06, + "loss": 0.5476, + "step": 386 + }, + { + "epoch": 0.08, + "grad_norm": 0.1865171194076538, + "learning_rate": 9.925912110807766e-06, + "loss": 0.5352, + "step": 387 + }, + { + "epoch": 0.08, + "grad_norm": 0.1893538534641266, + "learning_rate": 9.9253124954958e-06, + "loss": 0.5537, + "step": 388 + }, + { + "epoch": 0.08, + "grad_norm": 0.18070872128009796, + "learning_rate": 9.924710481778746e-06, + "loss": 0.5292, + "step": 389 + }, + { + "epoch": 0.08, + "grad_norm": 0.17466960847377777, + "learning_rate": 9.924106069949756e-06, + "loss": 0.5212, + "step": 390 + }, + { + "epoch": 0.08, + "grad_norm": 0.18615244328975677, + "learning_rate": 9.923499260303155e-06, + "loss": 0.5116, + "step": 391 + }, + { + "epoch": 0.08, + "grad_norm": 0.17259790003299713, + "learning_rate": 9.922890053134428e-06, + "loss": 0.5159, + "step": 392 + }, + { + "epoch": 0.08, + "grad_norm": 0.16246852278709412, + "learning_rate": 9.922278448740235e-06, + "loss": 0.5268, + "step": 393 + }, + { + "epoch": 0.08, + "grad_norm": 0.15586566925048828, + "learning_rate": 9.9216644474184e-06, + "loss": 0.4965, + "step": 394 + }, + { + "epoch": 0.09, + "grad_norm": 0.1761687695980072, + "learning_rate": 9.92104804946791e-06, + "loss": 0.5504, + "step": 395 + }, + { + "epoch": 0.09, + "grad_norm": 0.19697882235050201, + "learning_rate": 9.920429255188926e-06, + "loss": 0.5055, + "step": 396 + }, + { + "epoch": 0.09, + "grad_norm": 0.1910158395767212, + "learning_rate": 9.919808064882773e-06, + "loss": 0.4947, + "step": 397 + }, + { + "epoch": 0.09, + "grad_norm": 0.18492764234542847, + "learning_rate": 9.91918447885194e-06, + "loss": 0.569, + "step": 398 + }, + { + "epoch": 0.09, + "grad_norm": 0.17928937077522278, + "learning_rate": 9.918558497400088e-06, + "loss": 0.4933, + "step": 399 + }, + { + "epoch": 0.09, + "grad_norm": 0.18491177260875702, + "learning_rate": 9.91793012083204e-06, + "loss": 0.5489, + "step": 400 + }, + { + "epoch": 0.09, + "grad_norm": 0.1796533763408661, + "learning_rate": 9.917299349453791e-06, + "loss": 0.5575, + "step": 401 + }, + { + "epoch": 0.09, + "grad_norm": 0.14460118114948273, + "learning_rate": 9.916666183572492e-06, + "loss": 0.4632, + "step": 402 + }, + { + "epoch": 0.09, + "grad_norm": 0.13730689883232117, + "learning_rate": 9.916030623496472e-06, + "loss": 0.5634, + "step": 403 + }, + { + "epoch": 0.09, + "grad_norm": 0.18971490859985352, + "learning_rate": 9.915392669535214e-06, + "loss": 0.5193, + "step": 404 + }, + { + "epoch": 0.09, + "grad_norm": 0.12481328845024109, + "learning_rate": 9.914752321999379e-06, + "loss": 0.5389, + "step": 405 + }, + { + "epoch": 0.09, + "grad_norm": 0.17612747848033905, + "learning_rate": 9.914109581200785e-06, + "loss": 0.5129, + "step": 406 + }, + { + "epoch": 0.09, + "grad_norm": 0.1852181851863861, + "learning_rate": 9.913464447452414e-06, + "loss": 0.5124, + "step": 407 + }, + { + "epoch": 0.09, + "grad_norm": 0.23606260120868683, + "learning_rate": 9.912816921068424e-06, + "loss": 0.4736, + "step": 408 + }, + { + "epoch": 0.09, + "grad_norm": 0.23079735040664673, + "learning_rate": 9.912167002364126e-06, + "loss": 0.5612, + "step": 409 + }, + { + "epoch": 0.09, + "grad_norm": 0.22326047718524933, + "learning_rate": 9.911514691656003e-06, + "loss": 0.5367, + "step": 410 + }, + { + "epoch": 0.09, + "grad_norm": 0.1975882351398468, + "learning_rate": 9.910859989261702e-06, + "loss": 0.5575, + "step": 411 + }, + { + "epoch": 0.09, + "grad_norm": 0.16411826014518738, + "learning_rate": 9.910202895500031e-06, + "loss": 0.5506, + "step": 412 + }, + { + "epoch": 0.09, + "grad_norm": 0.1982284039258957, + "learning_rate": 9.909543410690967e-06, + "loss": 0.5443, + "step": 413 + }, + { + "epoch": 0.09, + "grad_norm": 0.1679336577653885, + "learning_rate": 9.908881535155647e-06, + "loss": 0.4876, + "step": 414 + }, + { + "epoch": 0.09, + "grad_norm": 0.17549291253089905, + "learning_rate": 9.908217269216377e-06, + "loss": 0.558, + "step": 415 + }, + { + "epoch": 0.09, + "grad_norm": 0.13716278970241547, + "learning_rate": 9.907550613196624e-06, + "loss": 0.5527, + "step": 416 + }, + { + "epoch": 0.09, + "grad_norm": 0.171469584107399, + "learning_rate": 9.90688156742102e-06, + "loss": 0.5066, + "step": 417 + }, + { + "epoch": 0.09, + "grad_norm": 0.19487224519252777, + "learning_rate": 9.906210132215357e-06, + "loss": 0.5211, + "step": 418 + }, + { + "epoch": 0.09, + "grad_norm": 0.16895724833011627, + "learning_rate": 9.905536307906599e-06, + "loss": 0.4936, + "step": 419 + }, + { + "epoch": 0.09, + "grad_norm": 0.2055499255657196, + "learning_rate": 9.904860094822861e-06, + "loss": 0.4719, + "step": 420 + }, + { + "epoch": 0.09, + "grad_norm": 0.30334606766700745, + "learning_rate": 9.904181493293434e-06, + "loss": 0.5743, + "step": 421 + }, + { + "epoch": 0.09, + "grad_norm": 0.15841247141361237, + "learning_rate": 9.903500503648766e-06, + "loss": 0.5722, + "step": 422 + }, + { + "epoch": 0.09, + "grad_norm": 0.15331457555294037, + "learning_rate": 9.902817126220465e-06, + "loss": 0.4636, + "step": 423 + }, + { + "epoch": 0.09, + "grad_norm": 0.22973452508449554, + "learning_rate": 9.902131361341307e-06, + "loss": 0.5427, + "step": 424 + }, + { + "epoch": 0.09, + "grad_norm": 0.21478115022182465, + "learning_rate": 9.901443209345229e-06, + "loss": 0.5324, + "step": 425 + }, + { + "epoch": 0.09, + "grad_norm": 0.2344510704278946, + "learning_rate": 9.900752670567331e-06, + "loss": 0.5439, + "step": 426 + }, + { + "epoch": 0.09, + "grad_norm": 0.17472712695598602, + "learning_rate": 9.90005974534387e-06, + "loss": 0.4745, + "step": 427 + }, + { + "epoch": 0.09, + "grad_norm": 0.1476239264011383, + "learning_rate": 9.899364434012273e-06, + "loss": 0.4726, + "step": 428 + }, + { + "epoch": 0.09, + "grad_norm": 0.174478217959404, + "learning_rate": 9.898666736911125e-06, + "loss": 0.5485, + "step": 429 + }, + { + "epoch": 0.09, + "grad_norm": 0.20660632848739624, + "learning_rate": 9.897966654380172e-06, + "loss": 0.5274, + "step": 430 + }, + { + "epoch": 0.09, + "grad_norm": 0.1528811752796173, + "learning_rate": 9.89726418676032e-06, + "loss": 0.5305, + "step": 431 + }, + { + "epoch": 0.09, + "grad_norm": 0.23785395920276642, + "learning_rate": 9.896559334393644e-06, + "loss": 0.5553, + "step": 432 + }, + { + "epoch": 0.09, + "grad_norm": 0.19750644266605377, + "learning_rate": 9.895852097623374e-06, + "loss": 0.5441, + "step": 433 + }, + { + "epoch": 0.09, + "grad_norm": 0.16664327681064606, + "learning_rate": 9.895142476793902e-06, + "loss": 0.4756, + "step": 434 + }, + { + "epoch": 0.09, + "grad_norm": 0.18724434077739716, + "learning_rate": 9.89443047225078e-06, + "loss": 0.5046, + "step": 435 + }, + { + "epoch": 0.09, + "grad_norm": 0.20234829187393188, + "learning_rate": 9.893716084340723e-06, + "loss": 0.5276, + "step": 436 + }, + { + "epoch": 0.09, + "grad_norm": 0.17969612777233124, + "learning_rate": 9.892999313411607e-06, + "loss": 0.5428, + "step": 437 + }, + { + "epoch": 0.09, + "grad_norm": 0.19304272532463074, + "learning_rate": 9.892280159812465e-06, + "loss": 0.5281, + "step": 438 + }, + { + "epoch": 0.09, + "grad_norm": 0.15909235179424286, + "learning_rate": 9.891558623893492e-06, + "loss": 0.5393, + "step": 439 + }, + { + "epoch": 0.09, + "grad_norm": 0.24154618382453918, + "learning_rate": 9.890834706006048e-06, + "loss": 0.5446, + "step": 440 + }, + { + "epoch": 0.09, + "grad_norm": 0.1484946757555008, + "learning_rate": 9.890108406502642e-06, + "loss": 0.5034, + "step": 441 + }, + { + "epoch": 0.1, + "grad_norm": 0.20041412115097046, + "learning_rate": 9.889379725736953e-06, + "loss": 0.5569, + "step": 442 + }, + { + "epoch": 0.1, + "grad_norm": 0.1696542501449585, + "learning_rate": 9.888648664063815e-06, + "loss": 0.5521, + "step": 443 + }, + { + "epoch": 0.1, + "grad_norm": 0.2253563106060028, + "learning_rate": 9.887915221839223e-06, + "loss": 0.5881, + "step": 444 + }, + { + "epoch": 0.1, + "grad_norm": 0.16398414969444275, + "learning_rate": 9.88717939942033e-06, + "loss": 0.5276, + "step": 445 + }, + { + "epoch": 0.1, + "grad_norm": 0.19543707370758057, + "learning_rate": 9.886441197165446e-06, + "loss": 0.5172, + "step": 446 + }, + { + "epoch": 0.1, + "grad_norm": 0.19510690867900848, + "learning_rate": 9.885700615434044e-06, + "loss": 0.5489, + "step": 447 + }, + { + "epoch": 0.1, + "grad_norm": 0.20647871494293213, + "learning_rate": 9.884957654586753e-06, + "loss": 0.5691, + "step": 448 + }, + { + "epoch": 0.1, + "grad_norm": 0.1428651362657547, + "learning_rate": 9.884212314985363e-06, + "loss": 0.5415, + "step": 449 + }, + { + "epoch": 0.1, + "grad_norm": 0.20169362425804138, + "learning_rate": 9.88346459699282e-06, + "loss": 0.5035, + "step": 450 + }, + { + "epoch": 0.1, + "grad_norm": 0.1399114578962326, + "learning_rate": 9.88271450097323e-06, + "loss": 0.4997, + "step": 451 + }, + { + "epoch": 0.1, + "grad_norm": 0.13809053599834442, + "learning_rate": 9.881962027291855e-06, + "loss": 0.5106, + "step": 452 + }, + { + "epoch": 0.1, + "grad_norm": 0.15126360952854156, + "learning_rate": 9.881207176315112e-06, + "loss": 0.4804, + "step": 453 + }, + { + "epoch": 0.1, + "grad_norm": 0.17541149258613586, + "learning_rate": 9.880449948410587e-06, + "loss": 0.5529, + "step": 454 + }, + { + "epoch": 0.1, + "grad_norm": 0.21182189881801605, + "learning_rate": 9.879690343947009e-06, + "loss": 0.5671, + "step": 455 + }, + { + "epoch": 0.1, + "grad_norm": 0.16285210847854614, + "learning_rate": 9.878928363294275e-06, + "loss": 0.5288, + "step": 456 + }, + { + "epoch": 0.1, + "grad_norm": 0.17910024523735046, + "learning_rate": 9.878164006823434e-06, + "loss": 0.4876, + "step": 457 + }, + { + "epoch": 0.1, + "grad_norm": 0.18861602246761322, + "learning_rate": 9.877397274906694e-06, + "loss": 0.5403, + "step": 458 + }, + { + "epoch": 0.1, + "grad_norm": 0.18446660041809082, + "learning_rate": 9.876628167917417e-06, + "loss": 0.5558, + "step": 459 + }, + { + "epoch": 0.1, + "grad_norm": 0.23668085038661957, + "learning_rate": 9.875856686230125e-06, + "loss": 0.5781, + "step": 460 + }, + { + "epoch": 0.1, + "grad_norm": 0.1459677368402481, + "learning_rate": 9.875082830220496e-06, + "loss": 0.5102, + "step": 461 + }, + { + "epoch": 0.1, + "grad_norm": 0.19470389187335968, + "learning_rate": 9.87430660026536e-06, + "loss": 0.4579, + "step": 462 + }, + { + "epoch": 0.1, + "grad_norm": 0.13941837847232819, + "learning_rate": 9.873527996742707e-06, + "loss": 0.5971, + "step": 463 + }, + { + "epoch": 0.1, + "grad_norm": 0.1971631497144699, + "learning_rate": 9.872747020031682e-06, + "loss": 0.5637, + "step": 464 + }, + { + "epoch": 0.1, + "grad_norm": 0.1430051177740097, + "learning_rate": 9.871963670512586e-06, + "loss": 0.4621, + "step": 465 + }, + { + "epoch": 0.1, + "grad_norm": 0.1990920901298523, + "learning_rate": 9.871177948566875e-06, + "loss": 0.508, + "step": 466 + }, + { + "epoch": 0.1, + "grad_norm": 0.1744355857372284, + "learning_rate": 9.870389854577157e-06, + "loss": 0.5115, + "step": 467 + }, + { + "epoch": 0.1, + "grad_norm": 0.1770336627960205, + "learning_rate": 9.869599388927204e-06, + "loss": 0.5535, + "step": 468 + }, + { + "epoch": 0.1, + "grad_norm": 0.1405770629644394, + "learning_rate": 9.868806552001933e-06, + "loss": 0.5188, + "step": 469 + }, + { + "epoch": 0.1, + "grad_norm": 0.164622500538826, + "learning_rate": 9.868011344187421e-06, + "loss": 0.543, + "step": 470 + }, + { + "epoch": 0.1, + "grad_norm": 0.34272515773773193, + "learning_rate": 9.867213765870897e-06, + "loss": 0.444, + "step": 471 + }, + { + "epoch": 0.1, + "grad_norm": 0.29466864466667175, + "learning_rate": 9.866413817440748e-06, + "loss": 0.5177, + "step": 472 + }, + { + "epoch": 0.1, + "grad_norm": 0.1591256856918335, + "learning_rate": 9.865611499286511e-06, + "loss": 0.543, + "step": 473 + }, + { + "epoch": 0.1, + "grad_norm": 0.17290642857551575, + "learning_rate": 9.864806811798881e-06, + "loss": 0.5571, + "step": 474 + }, + { + "epoch": 0.1, + "grad_norm": 0.14120014011859894, + "learning_rate": 9.863999755369703e-06, + "loss": 0.5366, + "step": 475 + }, + { + "epoch": 0.1, + "grad_norm": 0.1678173840045929, + "learning_rate": 9.863190330391974e-06, + "loss": 0.5301, + "step": 476 + }, + { + "epoch": 0.1, + "grad_norm": 0.17042382061481476, + "learning_rate": 9.862378537259853e-06, + "loss": 0.5669, + "step": 477 + }, + { + "epoch": 0.1, + "grad_norm": 0.1473226100206375, + "learning_rate": 9.861564376368645e-06, + "loss": 0.5113, + "step": 478 + }, + { + "epoch": 0.1, + "grad_norm": 0.1683841496706009, + "learning_rate": 9.860747848114805e-06, + "loss": 0.542, + "step": 479 + }, + { + "epoch": 0.1, + "grad_norm": 0.17106866836547852, + "learning_rate": 9.859928952895952e-06, + "loss": 0.5023, + "step": 480 + }, + { + "epoch": 0.1, + "grad_norm": 0.16280145943164825, + "learning_rate": 9.859107691110847e-06, + "loss": 0.5605, + "step": 481 + }, + { + "epoch": 0.1, + "grad_norm": 0.14175820350646973, + "learning_rate": 9.858284063159411e-06, + "loss": 0.5716, + "step": 482 + }, + { + "epoch": 0.1, + "grad_norm": 0.21412678062915802, + "learning_rate": 9.857458069442709e-06, + "loss": 0.515, + "step": 483 + }, + { + "epoch": 0.1, + "grad_norm": 0.19349491596221924, + "learning_rate": 9.856629710362966e-06, + "loss": 0.5198, + "step": 484 + }, + { + "epoch": 0.1, + "grad_norm": 0.1516617089509964, + "learning_rate": 9.855798986323556e-06, + "loss": 0.4953, + "step": 485 + }, + { + "epoch": 0.1, + "grad_norm": 0.2074221968650818, + "learning_rate": 9.854965897729001e-06, + "loss": 0.5118, + "step": 486 + }, + { + "epoch": 0.1, + "grad_norm": 0.14066927134990692, + "learning_rate": 9.85413044498498e-06, + "loss": 0.5228, + "step": 487 + }, + { + "epoch": 0.11, + "grad_norm": 0.2228998988866806, + "learning_rate": 9.853292628498319e-06, + "loss": 0.6139, + "step": 488 + }, + { + "epoch": 0.11, + "grad_norm": 0.31960368156433105, + "learning_rate": 9.852452448676999e-06, + "loss": 0.5553, + "step": 489 + }, + { + "epoch": 0.11, + "grad_norm": 0.17156168818473816, + "learning_rate": 9.851609905930149e-06, + "loss": 0.5373, + "step": 490 + }, + { + "epoch": 0.11, + "grad_norm": 0.12861701846122742, + "learning_rate": 9.850765000668048e-06, + "loss": 0.5126, + "step": 491 + }, + { + "epoch": 0.11, + "grad_norm": 0.17264096438884735, + "learning_rate": 9.849917733302128e-06, + "loss": 0.5141, + "step": 492 + }, + { + "epoch": 0.11, + "grad_norm": 0.16210493445396423, + "learning_rate": 9.84906810424497e-06, + "loss": 0.524, + "step": 493 + }, + { + "epoch": 0.11, + "grad_norm": 0.12152129411697388, + "learning_rate": 9.848216113910306e-06, + "loss": 0.5405, + "step": 494 + }, + { + "epoch": 0.11, + "grad_norm": 0.17325671017169952, + "learning_rate": 9.847361762713013e-06, + "loss": 0.5062, + "step": 495 + }, + { + "epoch": 0.11, + "grad_norm": 0.14273308217525482, + "learning_rate": 9.846505051069126e-06, + "loss": 0.5302, + "step": 496 + }, + { + "epoch": 0.11, + "grad_norm": 0.2055240273475647, + "learning_rate": 9.845645979395824e-06, + "loss": 0.5018, + "step": 497 + }, + { + "epoch": 0.11, + "grad_norm": 0.1372431516647339, + "learning_rate": 9.844784548111433e-06, + "loss": 0.5665, + "step": 498 + }, + { + "epoch": 0.11, + "grad_norm": 0.1912691444158554, + "learning_rate": 9.843920757635435e-06, + "loss": 0.5267, + "step": 499 + }, + { + "epoch": 0.11, + "grad_norm": 0.14471903443336487, + "learning_rate": 9.843054608388455e-06, + "loss": 0.5087, + "step": 500 + }, + { + "epoch": 0.11, + "grad_norm": 0.17829883098602295, + "learning_rate": 9.84218610079227e-06, + "loss": 0.5029, + "step": 501 + }, + { + "epoch": 0.11, + "grad_norm": 0.16071033477783203, + "learning_rate": 9.8413152352698e-06, + "loss": 0.5259, + "step": 502 + }, + { + "epoch": 0.11, + "grad_norm": 0.21240952610969543, + "learning_rate": 9.840442012245125e-06, + "loss": 0.5266, + "step": 503 + }, + { + "epoch": 0.11, + "grad_norm": 0.1682009994983673, + "learning_rate": 9.839566432143459e-06, + "loss": 0.5132, + "step": 504 + }, + { + "epoch": 0.11, + "grad_norm": 0.14732059836387634, + "learning_rate": 9.838688495391171e-06, + "loss": 0.5745, + "step": 505 + }, + { + "epoch": 0.11, + "grad_norm": 0.15087178349494934, + "learning_rate": 9.837808202415778e-06, + "loss": 0.5017, + "step": 506 + }, + { + "epoch": 0.11, + "grad_norm": 0.16476622223854065, + "learning_rate": 9.836925553645941e-06, + "loss": 0.5044, + "step": 507 + }, + { + "epoch": 0.11, + "grad_norm": 0.23170307278633118, + "learning_rate": 9.836040549511472e-06, + "loss": 0.574, + "step": 508 + }, + { + "epoch": 0.11, + "grad_norm": 0.18723872303962708, + "learning_rate": 9.835153190443327e-06, + "loss": 0.4981, + "step": 509 + }, + { + "epoch": 0.11, + "grad_norm": 0.18692149221897125, + "learning_rate": 9.83426347687361e-06, + "loss": 0.554, + "step": 510 + }, + { + "epoch": 0.11, + "grad_norm": 0.16876354813575745, + "learning_rate": 9.833371409235575e-06, + "loss": 0.5535, + "step": 511 + }, + { + "epoch": 0.11, + "grad_norm": 0.1443847119808197, + "learning_rate": 9.832476987963613e-06, + "loss": 0.4957, + "step": 512 + }, + { + "epoch": 0.11, + "grad_norm": 0.17338885366916656, + "learning_rate": 9.83158021349327e-06, + "loss": 0.5019, + "step": 513 + }, + { + "epoch": 0.11, + "grad_norm": 0.19001881778240204, + "learning_rate": 9.830681086261234e-06, + "loss": 0.5165, + "step": 514 + }, + { + "epoch": 0.11, + "grad_norm": 0.24521715939044952, + "learning_rate": 9.829779606705337e-06, + "loss": 0.579, + "step": 515 + }, + { + "epoch": 0.11, + "grad_norm": 0.16400645673274994, + "learning_rate": 9.828875775264564e-06, + "loss": 0.5429, + "step": 516 + }, + { + "epoch": 0.11, + "grad_norm": 0.2782368063926697, + "learning_rate": 9.827969592379036e-06, + "loss": 0.4832, + "step": 517 + }, + { + "epoch": 0.11, + "grad_norm": 0.15196365118026733, + "learning_rate": 9.827061058490027e-06, + "loss": 0.4643, + "step": 518 + }, + { + "epoch": 0.11, + "grad_norm": 0.17149809002876282, + "learning_rate": 9.826150174039949e-06, + "loss": 0.5388, + "step": 519 + }, + { + "epoch": 0.11, + "grad_norm": 0.152251735329628, + "learning_rate": 9.82523693947236e-06, + "loss": 0.5147, + "step": 520 + }, + { + "epoch": 0.11, + "grad_norm": 0.1551138162612915, + "learning_rate": 9.824321355231968e-06, + "loss": 0.4826, + "step": 521 + }, + { + "epoch": 0.11, + "grad_norm": 0.15354926884174347, + "learning_rate": 9.82340342176462e-06, + "loss": 0.482, + "step": 522 + }, + { + "epoch": 0.11, + "grad_norm": 0.15569601953029633, + "learning_rate": 9.822483139517307e-06, + "loss": 0.4989, + "step": 523 + }, + { + "epoch": 0.11, + "grad_norm": 0.17023304104804993, + "learning_rate": 9.821560508938167e-06, + "loss": 0.4974, + "step": 524 + }, + { + "epoch": 0.11, + "grad_norm": 0.17514115571975708, + "learning_rate": 9.820635530476478e-06, + "loss": 0.4923, + "step": 525 + }, + { + "epoch": 0.11, + "grad_norm": 0.2187749445438385, + "learning_rate": 9.819708204582664e-06, + "loss": 0.5623, + "step": 526 + }, + { + "epoch": 0.11, + "grad_norm": 0.1382010579109192, + "learning_rate": 9.818778531708288e-06, + "loss": 0.4999, + "step": 527 + }, + { + "epoch": 0.11, + "grad_norm": 0.15643168985843658, + "learning_rate": 9.817846512306062e-06, + "loss": 0.4885, + "step": 528 + }, + { + "epoch": 0.11, + "grad_norm": 0.16030187904834747, + "learning_rate": 9.816912146829836e-06, + "loss": 0.5217, + "step": 529 + }, + { + "epoch": 0.11, + "grad_norm": 0.2057688981294632, + "learning_rate": 9.815975435734604e-06, + "loss": 0.5254, + "step": 530 + }, + { + "epoch": 0.11, + "grad_norm": 0.20325696468353271, + "learning_rate": 9.815036379476502e-06, + "loss": 0.5831, + "step": 531 + }, + { + "epoch": 0.11, + "grad_norm": 0.19320160150527954, + "learning_rate": 9.814094978512808e-06, + "loss": 0.4558, + "step": 532 + }, + { + "epoch": 0.11, + "grad_norm": 0.20372559130191803, + "learning_rate": 9.813151233301943e-06, + "loss": 0.5431, + "step": 533 + }, + { + "epoch": 0.12, + "grad_norm": 0.18915057182312012, + "learning_rate": 9.812205144303466e-06, + "loss": 0.5243, + "step": 534 + }, + { + "epoch": 0.12, + "grad_norm": 0.25529226660728455, + "learning_rate": 9.811256711978082e-06, + "loss": 0.5403, + "step": 535 + }, + { + "epoch": 0.12, + "grad_norm": 0.2046184092760086, + "learning_rate": 9.810305936787633e-06, + "loss": 0.5275, + "step": 536 + }, + { + "epoch": 0.12, + "grad_norm": 0.15673565864562988, + "learning_rate": 9.809352819195106e-06, + "loss": 0.57, + "step": 537 + }, + { + "epoch": 0.12, + "grad_norm": 0.13071295619010925, + "learning_rate": 9.808397359664624e-06, + "loss": 0.5232, + "step": 538 + }, + { + "epoch": 0.12, + "grad_norm": 0.17526838183403015, + "learning_rate": 9.807439558661453e-06, + "loss": 0.498, + "step": 539 + }, + { + "epoch": 0.12, + "grad_norm": 0.1860094964504242, + "learning_rate": 9.806479416652e-06, + "loss": 0.5327, + "step": 540 + }, + { + "epoch": 0.12, + "grad_norm": 0.18813055753707886, + "learning_rate": 9.80551693410381e-06, + "loss": 0.5199, + "step": 541 + }, + { + "epoch": 0.12, + "grad_norm": 0.1620221734046936, + "learning_rate": 9.804552111485568e-06, + "loss": 0.4961, + "step": 542 + }, + { + "epoch": 0.12, + "grad_norm": 0.2016637921333313, + "learning_rate": 9.8035849492671e-06, + "loss": 0.5042, + "step": 543 + }, + { + "epoch": 0.12, + "grad_norm": 0.15297263860702515, + "learning_rate": 9.80261544791937e-06, + "loss": 0.5561, + "step": 544 + }, + { + "epoch": 0.12, + "grad_norm": 0.16937948763370514, + "learning_rate": 9.801643607914485e-06, + "loss": 0.5356, + "step": 545 + }, + { + "epoch": 0.12, + "grad_norm": 0.1809961050748825, + "learning_rate": 9.80066942972568e-06, + "loss": 0.5516, + "step": 546 + }, + { + "epoch": 0.12, + "grad_norm": 0.14697958528995514, + "learning_rate": 9.799692913827342e-06, + "loss": 0.5072, + "step": 547 + }, + { + "epoch": 0.12, + "grad_norm": 0.18541914224624634, + "learning_rate": 9.798714060694988e-06, + "loss": 0.4925, + "step": 548 + }, + { + "epoch": 0.12, + "grad_norm": 0.19727596640586853, + "learning_rate": 9.797732870805273e-06, + "loss": 0.5206, + "step": 549 + }, + { + "epoch": 0.12, + "grad_norm": 0.15478399395942688, + "learning_rate": 9.796749344635996e-06, + "loss": 0.5122, + "step": 550 + }, + { + "epoch": 0.12, + "grad_norm": 0.1174599900841713, + "learning_rate": 9.79576348266609e-06, + "loss": 0.5003, + "step": 551 + }, + { + "epoch": 0.12, + "grad_norm": 0.16585126519203186, + "learning_rate": 9.794775285375623e-06, + "loss": 0.5029, + "step": 552 + }, + { + "epoch": 0.12, + "grad_norm": 0.17350535094738007, + "learning_rate": 9.793784753245802e-06, + "loss": 0.548, + "step": 553 + }, + { + "epoch": 0.12, + "grad_norm": 0.18735840916633606, + "learning_rate": 9.792791886758976e-06, + "loss": 0.5455, + "step": 554 + }, + { + "epoch": 0.12, + "grad_norm": 0.22835886478424072, + "learning_rate": 9.79179668639862e-06, + "loss": 0.4881, + "step": 555 + }, + { + "epoch": 0.12, + "grad_norm": 0.16034086048603058, + "learning_rate": 9.790799152649356e-06, + "loss": 0.5222, + "step": 556 + }, + { + "epoch": 0.12, + "grad_norm": 0.20814630389213562, + "learning_rate": 9.789799285996937e-06, + "loss": 0.5489, + "step": 557 + }, + { + "epoch": 0.12, + "grad_norm": 0.17920532822608948, + "learning_rate": 9.788797086928252e-06, + "loss": 0.493, + "step": 558 + }, + { + "epoch": 0.12, + "grad_norm": 0.21627596020698547, + "learning_rate": 9.787792555931328e-06, + "loss": 0.5491, + "step": 559 + }, + { + "epoch": 0.12, + "grad_norm": 0.14485371112823486, + "learning_rate": 9.786785693495327e-06, + "loss": 0.5144, + "step": 560 + }, + { + "epoch": 0.12, + "grad_norm": 0.1546938121318817, + "learning_rate": 9.785776500110542e-06, + "loss": 0.4812, + "step": 561 + }, + { + "epoch": 0.12, + "grad_norm": 0.1761971116065979, + "learning_rate": 9.784764976268408e-06, + "loss": 0.5788, + "step": 562 + }, + { + "epoch": 0.12, + "grad_norm": 0.18302011489868164, + "learning_rate": 9.78375112246149e-06, + "loss": 0.5186, + "step": 563 + }, + { + "epoch": 0.12, + "grad_norm": 0.17190533876419067, + "learning_rate": 9.78273493918349e-06, + "loss": 0.5252, + "step": 564 + }, + { + "epoch": 0.12, + "grad_norm": 0.1821742206811905, + "learning_rate": 9.781716426929243e-06, + "loss": 0.5174, + "step": 565 + }, + { + "epoch": 0.12, + "grad_norm": 0.19061587750911713, + "learning_rate": 9.780695586194719e-06, + "loss": 0.5662, + "step": 566 + }, + { + "epoch": 0.12, + "grad_norm": 0.15308646857738495, + "learning_rate": 9.77967241747702e-06, + "loss": 0.5297, + "step": 567 + }, + { + "epoch": 0.12, + "grad_norm": 0.16192299127578735, + "learning_rate": 9.778646921274385e-06, + "loss": 0.5846, + "step": 568 + }, + { + "epoch": 0.12, + "grad_norm": 0.1472279578447342, + "learning_rate": 9.777619098086181e-06, + "loss": 0.5596, + "step": 569 + }, + { + "epoch": 0.12, + "grad_norm": 0.20292969048023224, + "learning_rate": 9.776588948412917e-06, + "loss": 0.5179, + "step": 570 + }, + { + "epoch": 0.12, + "grad_norm": 0.16102533042430878, + "learning_rate": 9.775556472756226e-06, + "loss": 0.4919, + "step": 571 + }, + { + "epoch": 0.12, + "grad_norm": 0.18485024571418762, + "learning_rate": 9.774521671618877e-06, + "loss": 0.5455, + "step": 572 + }, + { + "epoch": 0.12, + "grad_norm": 0.1821717470884323, + "learning_rate": 9.773484545504771e-06, + "loss": 0.5091, + "step": 573 + }, + { + "epoch": 0.12, + "grad_norm": 0.16444329917430878, + "learning_rate": 9.772445094918944e-06, + "loss": 0.5218, + "step": 574 + }, + { + "epoch": 0.12, + "grad_norm": 0.17274467647075653, + "learning_rate": 9.771403320367558e-06, + "loss": 0.5823, + "step": 575 + }, + { + "epoch": 0.12, + "grad_norm": 0.15213851630687714, + "learning_rate": 9.770359222357914e-06, + "loss": 0.4696, + "step": 576 + }, + { + "epoch": 0.12, + "grad_norm": 0.13404731452465057, + "learning_rate": 9.76931280139844e-06, + "loss": 0.5365, + "step": 577 + }, + { + "epoch": 0.12, + "grad_norm": 0.1744057685136795, + "learning_rate": 9.768264057998693e-06, + "loss": 0.5559, + "step": 578 + }, + { + "epoch": 0.12, + "grad_norm": 0.17314410209655762, + "learning_rate": 9.767212992669368e-06, + "loss": 0.5614, + "step": 579 + }, + { + "epoch": 0.12, + "grad_norm": 0.1846940666437149, + "learning_rate": 9.766159605922282e-06, + "loss": 0.5122, + "step": 580 + }, + { + "epoch": 0.13, + "grad_norm": 0.15393884479999542, + "learning_rate": 9.76510389827039e-06, + "loss": 0.5734, + "step": 581 + }, + { + "epoch": 0.13, + "grad_norm": 0.1504923403263092, + "learning_rate": 9.764045870227772e-06, + "loss": 0.5111, + "step": 582 + }, + { + "epoch": 0.13, + "grad_norm": 0.16151262819766998, + "learning_rate": 9.762985522309642e-06, + "loss": 0.4965, + "step": 583 + }, + { + "epoch": 0.13, + "grad_norm": 0.15211625397205353, + "learning_rate": 9.761922855032339e-06, + "loss": 0.5263, + "step": 584 + }, + { + "epoch": 0.13, + "grad_norm": 0.17688104510307312, + "learning_rate": 9.760857868913335e-06, + "loss": 0.4846, + "step": 585 + }, + { + "epoch": 0.13, + "grad_norm": 0.1481778621673584, + "learning_rate": 9.759790564471233e-06, + "loss": 0.5189, + "step": 586 + }, + { + "epoch": 0.13, + "grad_norm": 0.1728227287530899, + "learning_rate": 9.758720942225759e-06, + "loss": 0.4878, + "step": 587 + }, + { + "epoch": 0.13, + "grad_norm": 0.15571308135986328, + "learning_rate": 9.757649002697771e-06, + "loss": 0.5456, + "step": 588 + }, + { + "epoch": 0.13, + "grad_norm": 0.15774881839752197, + "learning_rate": 9.756574746409258e-06, + "loss": 0.522, + "step": 589 + }, + { + "epoch": 0.13, + "grad_norm": 0.18842703104019165, + "learning_rate": 9.755498173883331e-06, + "loss": 0.442, + "step": 590 + }, + { + "epoch": 0.13, + "grad_norm": 0.1557362824678421, + "learning_rate": 9.754419285644233e-06, + "loss": 0.5149, + "step": 591 + }, + { + "epoch": 0.13, + "grad_norm": 0.15488624572753906, + "learning_rate": 9.753338082217334e-06, + "loss": 0.5567, + "step": 592 + }, + { + "epoch": 0.13, + "grad_norm": 0.14816376566886902, + "learning_rate": 9.752254564129134e-06, + "loss": 0.5244, + "step": 593 + }, + { + "epoch": 0.13, + "grad_norm": 0.144754558801651, + "learning_rate": 9.751168731907253e-06, + "loss": 0.4777, + "step": 594 + }, + { + "epoch": 0.13, + "grad_norm": 0.2727169096469879, + "learning_rate": 9.750080586080445e-06, + "loss": 0.5165, + "step": 595 + }, + { + "epoch": 0.13, + "grad_norm": 0.21706987917423248, + "learning_rate": 9.748990127178589e-06, + "loss": 0.5346, + "step": 596 + }, + { + "epoch": 0.13, + "grad_norm": 0.20381216704845428, + "learning_rate": 9.747897355732684e-06, + "loss": 0.5546, + "step": 597 + }, + { + "epoch": 0.13, + "grad_norm": 0.1514424830675125, + "learning_rate": 9.746802272274868e-06, + "loss": 0.5593, + "step": 598 + }, + { + "epoch": 0.13, + "grad_norm": 0.16018468141555786, + "learning_rate": 9.745704877338393e-06, + "loss": 0.5303, + "step": 599 + }, + { + "epoch": 0.13, + "grad_norm": 0.1491565853357315, + "learning_rate": 9.74460517145764e-06, + "loss": 0.5265, + "step": 600 + }, + { + "epoch": 0.13, + "grad_norm": 0.1546594202518463, + "learning_rate": 9.743503155168119e-06, + "loss": 0.5193, + "step": 601 + }, + { + "epoch": 0.13, + "grad_norm": 0.18948593735694885, + "learning_rate": 9.74239882900646e-06, + "loss": 0.5794, + "step": 602 + }, + { + "epoch": 0.13, + "grad_norm": 0.16691826283931732, + "learning_rate": 9.74129219351042e-06, + "loss": 0.5327, + "step": 603 + }, + { + "epoch": 0.13, + "grad_norm": 0.15864987671375275, + "learning_rate": 9.740183249218883e-06, + "loss": 0.5189, + "step": 604 + }, + { + "epoch": 0.13, + "grad_norm": 0.17005395889282227, + "learning_rate": 9.739071996671851e-06, + "loss": 0.5345, + "step": 605 + }, + { + "epoch": 0.13, + "grad_norm": 0.16413751244544983, + "learning_rate": 9.737958436410459e-06, + "loss": 0.5135, + "step": 606 + }, + { + "epoch": 0.13, + "grad_norm": 0.16234463453292847, + "learning_rate": 9.736842568976957e-06, + "loss": 0.523, + "step": 607 + }, + { + "epoch": 0.13, + "grad_norm": 0.24228431284427643, + "learning_rate": 9.73572439491472e-06, + "loss": 0.5346, + "step": 608 + }, + { + "epoch": 0.13, + "grad_norm": 0.16661712527275085, + "learning_rate": 9.734603914768254e-06, + "loss": 0.5846, + "step": 609 + }, + { + "epoch": 0.13, + "grad_norm": 0.1640872210264206, + "learning_rate": 9.73348112908318e-06, + "loss": 0.5245, + "step": 610 + }, + { + "epoch": 0.13, + "grad_norm": 0.15542039275169373, + "learning_rate": 9.732356038406242e-06, + "loss": 0.5418, + "step": 611 + }, + { + "epoch": 0.13, + "grad_norm": 0.16618283092975616, + "learning_rate": 9.73122864328531e-06, + "loss": 0.5491, + "step": 612 + }, + { + "epoch": 0.13, + "grad_norm": 0.15610603988170624, + "learning_rate": 9.730098944269377e-06, + "loss": 0.5672, + "step": 613 + }, + { + "epoch": 0.13, + "grad_norm": 0.16698190569877625, + "learning_rate": 9.72896694190855e-06, + "loss": 0.5467, + "step": 614 + }, + { + "epoch": 0.13, + "grad_norm": 0.16580110788345337, + "learning_rate": 9.727832636754066e-06, + "loss": 0.5943, + "step": 615 + }, + { + "epoch": 0.13, + "grad_norm": 0.17117217183113098, + "learning_rate": 9.726696029358283e-06, + "loss": 0.5022, + "step": 616 + }, + { + "epoch": 0.13, + "grad_norm": 0.18671591579914093, + "learning_rate": 9.725557120274673e-06, + "loss": 0.544, + "step": 617 + }, + { + "epoch": 0.13, + "grad_norm": 0.1708557903766632, + "learning_rate": 9.724415910057839e-06, + "loss": 0.5172, + "step": 618 + }, + { + "epoch": 0.13, + "grad_norm": 0.17382651567459106, + "learning_rate": 9.723272399263492e-06, + "loss": 0.5278, + "step": 619 + }, + { + "epoch": 0.13, + "grad_norm": 0.20800824463367462, + "learning_rate": 9.722126588448473e-06, + "loss": 0.5484, + "step": 620 + }, + { + "epoch": 0.13, + "grad_norm": 0.1626998484134674, + "learning_rate": 9.720978478170745e-06, + "loss": 0.5248, + "step": 621 + }, + { + "epoch": 0.13, + "grad_norm": 0.20097678899765015, + "learning_rate": 9.719828068989378e-06, + "loss": 0.4871, + "step": 622 + }, + { + "epoch": 0.13, + "grad_norm": 0.17374666035175323, + "learning_rate": 9.718675361464574e-06, + "loss": 0.5118, + "step": 623 + }, + { + "epoch": 0.13, + "grad_norm": 0.16872042417526245, + "learning_rate": 9.717520356157648e-06, + "loss": 0.5554, + "step": 624 + }, + { + "epoch": 0.13, + "grad_norm": 0.1880810260772705, + "learning_rate": 9.716363053631039e-06, + "loss": 0.4936, + "step": 625 + }, + { + "epoch": 0.13, + "grad_norm": 0.20248694717884064, + "learning_rate": 9.715203454448297e-06, + "loss": 0.5005, + "step": 626 + }, + { + "epoch": 0.14, + "grad_norm": 0.20392434298992157, + "learning_rate": 9.714041559174095e-06, + "loss": 0.5389, + "step": 627 + }, + { + "epoch": 0.14, + "grad_norm": 0.16137023270130157, + "learning_rate": 9.712877368374226e-06, + "loss": 0.5599, + "step": 628 + }, + { + "epoch": 0.14, + "grad_norm": 0.17454127967357635, + "learning_rate": 9.711710882615595e-06, + "loss": 0.5127, + "step": 629 + }, + { + "epoch": 0.14, + "grad_norm": 0.13746435940265656, + "learning_rate": 9.710542102466229e-06, + "loss": 0.5617, + "step": 630 + }, + { + "epoch": 0.14, + "grad_norm": 0.16572695970535278, + "learning_rate": 9.709371028495276e-06, + "loss": 0.5421, + "step": 631 + }, + { + "epoch": 0.14, + "grad_norm": 0.21156027913093567, + "learning_rate": 9.708197661272989e-06, + "loss": 0.5373, + "step": 632 + }, + { + "epoch": 0.14, + "grad_norm": 0.1574900895357132, + "learning_rate": 9.707022001370749e-06, + "loss": 0.526, + "step": 633 + }, + { + "epoch": 0.14, + "grad_norm": 0.14249767363071442, + "learning_rate": 9.70584404936105e-06, + "loss": 0.5104, + "step": 634 + }, + { + "epoch": 0.14, + "grad_norm": 0.18953874707221985, + "learning_rate": 9.704663805817499e-06, + "loss": 0.54, + "step": 635 + }, + { + "epoch": 0.14, + "grad_norm": 0.17047786712646484, + "learning_rate": 9.703481271314823e-06, + "loss": 0.5185, + "step": 636 + }, + { + "epoch": 0.14, + "grad_norm": 0.19651903212070465, + "learning_rate": 9.702296446428863e-06, + "loss": 0.5147, + "step": 637 + }, + { + "epoch": 0.14, + "grad_norm": 0.13926255702972412, + "learning_rate": 9.701109331736573e-06, + "loss": 0.5381, + "step": 638 + }, + { + "epoch": 0.14, + "grad_norm": 0.16642118990421295, + "learning_rate": 9.699919927816027e-06, + "loss": 0.5114, + "step": 639 + }, + { + "epoch": 0.14, + "grad_norm": 0.19338329136371613, + "learning_rate": 9.69872823524641e-06, + "loss": 0.5317, + "step": 640 + }, + { + "epoch": 0.14, + "grad_norm": 0.26251357793807983, + "learning_rate": 9.697534254608024e-06, + "loss": 0.5122, + "step": 641 + }, + { + "epoch": 0.14, + "grad_norm": 0.1683933585882187, + "learning_rate": 9.69633798648228e-06, + "loss": 0.5429, + "step": 642 + }, + { + "epoch": 0.14, + "grad_norm": 0.20809942483901978, + "learning_rate": 9.695139431451712e-06, + "loss": 0.537, + "step": 643 + }, + { + "epoch": 0.14, + "grad_norm": 0.1489880532026291, + "learning_rate": 9.693938590099958e-06, + "loss": 0.5049, + "step": 644 + }, + { + "epoch": 0.14, + "grad_norm": 0.14825065433979034, + "learning_rate": 9.692735463011774e-06, + "loss": 0.496, + "step": 645 + }, + { + "epoch": 0.14, + "grad_norm": 0.17281201481819153, + "learning_rate": 9.691530050773031e-06, + "loss": 0.524, + "step": 646 + }, + { + "epoch": 0.14, + "grad_norm": 0.227024644613266, + "learning_rate": 9.690322353970708e-06, + "loss": 0.5191, + "step": 647 + }, + { + "epoch": 0.14, + "grad_norm": 0.16183902323246002, + "learning_rate": 9.689112373192899e-06, + "loss": 0.5557, + "step": 648 + }, + { + "epoch": 0.14, + "grad_norm": 0.19648505747318268, + "learning_rate": 9.687900109028813e-06, + "loss": 0.4963, + "step": 649 + }, + { + "epoch": 0.14, + "grad_norm": 0.1439117044210434, + "learning_rate": 9.686685562068765e-06, + "loss": 0.5512, + "step": 650 + }, + { + "epoch": 0.14, + "grad_norm": 0.15333153307437897, + "learning_rate": 9.685468732904187e-06, + "loss": 0.4566, + "step": 651 + }, + { + "epoch": 0.14, + "grad_norm": 0.16783007979393005, + "learning_rate": 9.684249622127616e-06, + "loss": 0.5197, + "step": 652 + }, + { + "epoch": 0.14, + "grad_norm": 0.1759708672761917, + "learning_rate": 9.683028230332707e-06, + "loss": 0.5086, + "step": 653 + }, + { + "epoch": 0.14, + "grad_norm": 0.18851730227470398, + "learning_rate": 9.681804558114222e-06, + "loss": 0.5563, + "step": 654 + }, + { + "epoch": 0.14, + "grad_norm": 0.18417790532112122, + "learning_rate": 9.680578606068037e-06, + "loss": 0.5028, + "step": 655 + }, + { + "epoch": 0.14, + "grad_norm": 0.1564049869775772, + "learning_rate": 9.67935037479113e-06, + "loss": 0.5071, + "step": 656 + }, + { + "epoch": 0.14, + "grad_norm": 0.18582746386528015, + "learning_rate": 9.678119864881597e-06, + "loss": 0.4922, + "step": 657 + }, + { + "epoch": 0.14, + "grad_norm": 0.16415338218212128, + "learning_rate": 9.676887076938642e-06, + "loss": 0.5226, + "step": 658 + }, + { + "epoch": 0.14, + "grad_norm": 0.19364799559116364, + "learning_rate": 9.675652011562576e-06, + "loss": 0.5294, + "step": 659 + }, + { + "epoch": 0.14, + "grad_norm": 0.19111143052577972, + "learning_rate": 9.674414669354819e-06, + "loss": 0.5486, + "step": 660 + }, + { + "epoch": 0.14, + "grad_norm": 0.1607770472764969, + "learning_rate": 9.673175050917902e-06, + "loss": 0.5674, + "step": 661 + }, + { + "epoch": 0.14, + "grad_norm": 0.16375142335891724, + "learning_rate": 9.671933156855464e-06, + "loss": 0.5305, + "step": 662 + }, + { + "epoch": 0.14, + "grad_norm": 0.2223857343196869, + "learning_rate": 9.67068898777225e-06, + "loss": 0.5073, + "step": 663 + }, + { + "epoch": 0.14, + "grad_norm": 0.15344950556755066, + "learning_rate": 9.669442544274115e-06, + "loss": 0.5176, + "step": 664 + }, + { + "epoch": 0.14, + "grad_norm": 0.1769074946641922, + "learning_rate": 9.66819382696802e-06, + "loss": 0.4888, + "step": 665 + }, + { + "epoch": 0.14, + "grad_norm": 0.14724504947662354, + "learning_rate": 9.666942836462036e-06, + "loss": 0.5251, + "step": 666 + }, + { + "epoch": 0.14, + "grad_norm": 0.14657099545001984, + "learning_rate": 9.665689573365336e-06, + "loss": 0.5271, + "step": 667 + }, + { + "epoch": 0.14, + "grad_norm": 0.16149552166461945, + "learning_rate": 9.664434038288207e-06, + "loss": 0.521, + "step": 668 + }, + { + "epoch": 0.14, + "grad_norm": 0.17453457415103912, + "learning_rate": 9.663176231842034e-06, + "loss": 0.5071, + "step": 669 + }, + { + "epoch": 0.14, + "grad_norm": 0.1480516791343689, + "learning_rate": 9.661916154639312e-06, + "loss": 0.598, + "step": 670 + }, + { + "epoch": 0.14, + "grad_norm": 0.17226625978946686, + "learning_rate": 9.660653807293643e-06, + "loss": 0.534, + "step": 671 + }, + { + "epoch": 0.14, + "grad_norm": 0.13685333728790283, + "learning_rate": 9.659389190419735e-06, + "loss": 0.5049, + "step": 672 + }, + { + "epoch": 0.14, + "grad_norm": 0.1628991812467575, + "learning_rate": 9.658122304633395e-06, + "loss": 0.5246, + "step": 673 + }, + { + "epoch": 0.15, + "grad_norm": 0.1640816479921341, + "learning_rate": 9.656853150551543e-06, + "loss": 0.5104, + "step": 674 + }, + { + "epoch": 0.15, + "grad_norm": 0.16424889862537384, + "learning_rate": 9.6555817287922e-06, + "loss": 0.5173, + "step": 675 + }, + { + "epoch": 0.15, + "grad_norm": 0.26323530077934265, + "learning_rate": 9.654308039974489e-06, + "loss": 0.5144, + "step": 676 + }, + { + "epoch": 0.15, + "grad_norm": 0.13345208764076233, + "learning_rate": 9.65303208471864e-06, + "loss": 0.5294, + "step": 677 + }, + { + "epoch": 0.15, + "grad_norm": 0.179282546043396, + "learning_rate": 9.651753863645985e-06, + "loss": 0.5211, + "step": 678 + }, + { + "epoch": 0.15, + "grad_norm": 0.1983976811170578, + "learning_rate": 9.650473377378961e-06, + "loss": 0.5435, + "step": 679 + }, + { + "epoch": 0.15, + "grad_norm": 0.18049369752407074, + "learning_rate": 9.649190626541105e-06, + "loss": 0.533, + "step": 680 + }, + { + "epoch": 0.15, + "grad_norm": 0.16596846282482147, + "learning_rate": 9.647905611757062e-06, + "loss": 0.5274, + "step": 681 + }, + { + "epoch": 0.15, + "grad_norm": 0.17268408834934235, + "learning_rate": 9.646618333652574e-06, + "loss": 0.5481, + "step": 682 + }, + { + "epoch": 0.15, + "grad_norm": 0.168728306889534, + "learning_rate": 9.64532879285449e-06, + "loss": 0.5201, + "step": 683 + }, + { + "epoch": 0.15, + "grad_norm": 0.2116057574748993, + "learning_rate": 9.644036989990753e-06, + "loss": 0.5107, + "step": 684 + }, + { + "epoch": 0.15, + "grad_norm": 0.14726531505584717, + "learning_rate": 9.642742925690417e-06, + "loss": 0.5546, + "step": 685 + }, + { + "epoch": 0.15, + "grad_norm": 0.17111736536026, + "learning_rate": 9.641446600583632e-06, + "loss": 0.5123, + "step": 686 + }, + { + "epoch": 0.15, + "grad_norm": 0.17838339507579803, + "learning_rate": 9.640148015301651e-06, + "loss": 0.4966, + "step": 687 + }, + { + "epoch": 0.15, + "grad_norm": 0.17207923531532288, + "learning_rate": 9.638847170476824e-06, + "loss": 0.5189, + "step": 688 + }, + { + "epoch": 0.15, + "grad_norm": 0.15716849267482758, + "learning_rate": 9.637544066742606e-06, + "loss": 0.5553, + "step": 689 + }, + { + "epoch": 0.15, + "grad_norm": 0.19608205556869507, + "learning_rate": 9.636238704733547e-06, + "loss": 0.5691, + "step": 690 + }, + { + "epoch": 0.15, + "grad_norm": 0.15424737334251404, + "learning_rate": 9.634931085085301e-06, + "loss": 0.5419, + "step": 691 + }, + { + "epoch": 0.15, + "grad_norm": 0.24781200289726257, + "learning_rate": 9.633621208434623e-06, + "loss": 0.5374, + "step": 692 + }, + { + "epoch": 0.15, + "grad_norm": 0.1594979614019394, + "learning_rate": 9.63230907541936e-06, + "loss": 0.5093, + "step": 693 + }, + { + "epoch": 0.15, + "grad_norm": 0.1622641682624817, + "learning_rate": 9.630994686678462e-06, + "loss": 0.5247, + "step": 694 + }, + { + "epoch": 0.15, + "grad_norm": 0.19124239683151245, + "learning_rate": 9.629678042851976e-06, + "loss": 0.5241, + "step": 695 + }, + { + "epoch": 0.15, + "grad_norm": 0.1495082974433899, + "learning_rate": 9.628359144581052e-06, + "loss": 0.5295, + "step": 696 + }, + { + "epoch": 0.15, + "grad_norm": 0.1647813469171524, + "learning_rate": 9.627037992507931e-06, + "loss": 0.494, + "step": 697 + }, + { + "epoch": 0.15, + "grad_norm": 0.16081197559833527, + "learning_rate": 9.625714587275954e-06, + "loss": 0.5414, + "step": 698 + }, + { + "epoch": 0.15, + "grad_norm": 0.14257070422172546, + "learning_rate": 9.624388929529563e-06, + "loss": 0.5634, + "step": 699 + }, + { + "epoch": 0.15, + "grad_norm": 0.1383073329925537, + "learning_rate": 9.623061019914291e-06, + "loss": 0.4961, + "step": 700 + }, + { + "epoch": 0.15, + "grad_norm": 0.1932617723941803, + "learning_rate": 9.621730859076768e-06, + "loss": 0.522, + "step": 701 + }, + { + "epoch": 0.15, + "grad_norm": 0.20005308091640472, + "learning_rate": 9.620398447664727e-06, + "loss": 0.522, + "step": 702 + }, + { + "epoch": 0.15, + "grad_norm": 0.17601189017295837, + "learning_rate": 9.61906378632699e-06, + "loss": 0.5707, + "step": 703 + }, + { + "epoch": 0.15, + "grad_norm": 0.14197023212909698, + "learning_rate": 9.617726875713477e-06, + "loss": 0.5194, + "step": 704 + }, + { + "epoch": 0.15, + "grad_norm": 0.17921584844589233, + "learning_rate": 9.616387716475203e-06, + "loss": 0.5067, + "step": 705 + }, + { + "epoch": 0.15, + "grad_norm": 0.1330891251564026, + "learning_rate": 9.615046309264278e-06, + "loss": 0.4925, + "step": 706 + }, + { + "epoch": 0.15, + "grad_norm": 0.19038861989974976, + "learning_rate": 9.613702654733908e-06, + "loss": 0.5745, + "step": 707 + }, + { + "epoch": 0.15, + "grad_norm": 0.2451518177986145, + "learning_rate": 9.612356753538392e-06, + "loss": 0.5799, + "step": 708 + }, + { + "epoch": 0.15, + "grad_norm": 0.20882856845855713, + "learning_rate": 9.611008606333121e-06, + "loss": 0.4886, + "step": 709 + }, + { + "epoch": 0.15, + "grad_norm": 0.170186385512352, + "learning_rate": 9.609658213774584e-06, + "loss": 0.5118, + "step": 710 + }, + { + "epoch": 0.15, + "grad_norm": 0.15260860323905945, + "learning_rate": 9.608305576520361e-06, + "loss": 0.5166, + "step": 711 + }, + { + "epoch": 0.15, + "grad_norm": 0.16833122074604034, + "learning_rate": 9.606950695229125e-06, + "loss": 0.5003, + "step": 712 + }, + { + "epoch": 0.15, + "grad_norm": 0.17692722380161285, + "learning_rate": 9.605593570560642e-06, + "loss": 0.5378, + "step": 713 + }, + { + "epoch": 0.15, + "grad_norm": 0.2011829912662506, + "learning_rate": 9.60423420317577e-06, + "loss": 0.531, + "step": 714 + }, + { + "epoch": 0.15, + "grad_norm": 0.1459263414144516, + "learning_rate": 9.602872593736461e-06, + "loss": 0.5278, + "step": 715 + }, + { + "epoch": 0.15, + "grad_norm": 0.15884311497211456, + "learning_rate": 9.601508742905757e-06, + "loss": 0.5615, + "step": 716 + }, + { + "epoch": 0.15, + "grad_norm": 0.2560180127620697, + "learning_rate": 9.600142651347792e-06, + "loss": 0.5295, + "step": 717 + }, + { + "epoch": 0.15, + "grad_norm": 0.15647375583648682, + "learning_rate": 9.59877431972779e-06, + "loss": 0.5028, + "step": 718 + }, + { + "epoch": 0.15, + "grad_norm": 0.21782688796520233, + "learning_rate": 9.597403748712067e-06, + "loss": 0.4902, + "step": 719 + }, + { + "epoch": 0.16, + "grad_norm": 0.16878049075603485, + "learning_rate": 9.596030938968028e-06, + "loss": 0.5524, + "step": 720 + }, + { + "epoch": 0.16, + "grad_norm": 0.1529654562473297, + "learning_rate": 9.594655891164174e-06, + "loss": 0.4946, + "step": 721 + }, + { + "epoch": 0.16, + "grad_norm": 0.2102820873260498, + "learning_rate": 9.593278605970086e-06, + "loss": 0.5093, + "step": 722 + }, + { + "epoch": 0.16, + "grad_norm": 0.13754625618457794, + "learning_rate": 9.591899084056444e-06, + "loss": 0.55, + "step": 723 + }, + { + "epoch": 0.16, + "grad_norm": 0.20235078036785126, + "learning_rate": 9.590517326095012e-06, + "loss": 0.5277, + "step": 724 + }, + { + "epoch": 0.16, + "grad_norm": 0.20487360656261444, + "learning_rate": 9.58913333275864e-06, + "loss": 0.5274, + "step": 725 + }, + { + "epoch": 0.16, + "grad_norm": 0.15242727100849152, + "learning_rate": 9.587747104721275e-06, + "loss": 0.5361, + "step": 726 + }, + { + "epoch": 0.16, + "grad_norm": 0.16651783883571625, + "learning_rate": 9.586358642657946e-06, + "loss": 0.5422, + "step": 727 + }, + { + "epoch": 0.16, + "grad_norm": 0.20768210291862488, + "learning_rate": 9.58496794724477e-06, + "loss": 0.5204, + "step": 728 + }, + { + "epoch": 0.16, + "grad_norm": 0.13769538700580597, + "learning_rate": 9.583575019158954e-06, + "loss": 0.5485, + "step": 729 + }, + { + "epoch": 0.16, + "grad_norm": 0.2392173558473587, + "learning_rate": 9.582179859078793e-06, + "loss": 0.5178, + "step": 730 + }, + { + "epoch": 0.16, + "grad_norm": 0.17117203772068024, + "learning_rate": 9.580782467683666e-06, + "loss": 0.4959, + "step": 731 + }, + { + "epoch": 0.16, + "grad_norm": 0.14463159441947937, + "learning_rate": 9.579382845654038e-06, + "loss": 0.5405, + "step": 732 + }, + { + "epoch": 0.16, + "grad_norm": 0.15378107130527496, + "learning_rate": 9.577980993671461e-06, + "loss": 0.5239, + "step": 733 + }, + { + "epoch": 0.16, + "grad_norm": 0.18154248595237732, + "learning_rate": 9.576576912418577e-06, + "loss": 0.5138, + "step": 734 + }, + { + "epoch": 0.16, + "grad_norm": 0.17718815803527832, + "learning_rate": 9.575170602579109e-06, + "loss": 0.5281, + "step": 735 + }, + { + "epoch": 0.16, + "grad_norm": 0.18913020193576813, + "learning_rate": 9.573762064837866e-06, + "loss": 0.4653, + "step": 736 + }, + { + "epoch": 0.16, + "grad_norm": 0.16615386307239532, + "learning_rate": 9.572351299880742e-06, + "loss": 0.4993, + "step": 737 + }, + { + "epoch": 0.16, + "grad_norm": 0.1711035966873169, + "learning_rate": 9.570938308394717e-06, + "loss": 0.5527, + "step": 738 + }, + { + "epoch": 0.16, + "grad_norm": 0.1759718656539917, + "learning_rate": 9.569523091067855e-06, + "loss": 0.4892, + "step": 739 + }, + { + "epoch": 0.16, + "grad_norm": 0.16556698083877563, + "learning_rate": 9.568105648589299e-06, + "loss": 0.512, + "step": 740 + }, + { + "epoch": 0.16, + "grad_norm": 0.16739937663078308, + "learning_rate": 9.566685981649283e-06, + "loss": 0.5167, + "step": 741 + }, + { + "epoch": 0.16, + "grad_norm": 0.16000035405158997, + "learning_rate": 9.565264090939122e-06, + "loss": 0.5528, + "step": 742 + }, + { + "epoch": 0.16, + "grad_norm": 0.2087719887495041, + "learning_rate": 9.563839977151208e-06, + "loss": 0.5447, + "step": 743 + }, + { + "epoch": 0.16, + "grad_norm": 0.17800335586071014, + "learning_rate": 9.562413640979024e-06, + "loss": 0.5615, + "step": 744 + }, + { + "epoch": 0.16, + "grad_norm": 0.13852566480636597, + "learning_rate": 9.56098508311713e-06, + "loss": 0.5196, + "step": 745 + }, + { + "epoch": 0.16, + "grad_norm": 0.15705984830856323, + "learning_rate": 9.55955430426117e-06, + "loss": 0.5286, + "step": 746 + }, + { + "epoch": 0.16, + "grad_norm": 0.13705521821975708, + "learning_rate": 9.558121305107868e-06, + "loss": 0.4874, + "step": 747 + }, + { + "epoch": 0.16, + "grad_norm": 0.1593395620584488, + "learning_rate": 9.556686086355032e-06, + "loss": 0.508, + "step": 748 + }, + { + "epoch": 0.16, + "grad_norm": 0.1956239640712738, + "learning_rate": 9.555248648701546e-06, + "loss": 0.5165, + "step": 749 + }, + { + "epoch": 0.16, + "grad_norm": 0.15301111340522766, + "learning_rate": 9.553808992847377e-06, + "loss": 0.5279, + "step": 750 + }, + { + "epoch": 0.16, + "grad_norm": 0.1944187432527542, + "learning_rate": 9.552367119493575e-06, + "loss": 0.5328, + "step": 751 + }, + { + "epoch": 0.16, + "grad_norm": 0.16133981943130493, + "learning_rate": 9.550923029342266e-06, + "loss": 0.5258, + "step": 752 + }, + { + "epoch": 0.16, + "grad_norm": 0.1575002670288086, + "learning_rate": 9.549476723096658e-06, + "loss": 0.4785, + "step": 753 + }, + { + "epoch": 0.16, + "grad_norm": 0.2158762514591217, + "learning_rate": 9.548028201461034e-06, + "loss": 0.5069, + "step": 754 + }, + { + "epoch": 0.16, + "grad_norm": 0.1875433325767517, + "learning_rate": 9.546577465140763e-06, + "loss": 0.5165, + "step": 755 + }, + { + "epoch": 0.16, + "grad_norm": 0.15603913366794586, + "learning_rate": 9.545124514842284e-06, + "loss": 0.523, + "step": 756 + }, + { + "epoch": 0.16, + "grad_norm": 0.15902650356292725, + "learning_rate": 9.543669351273122e-06, + "loss": 0.5527, + "step": 757 + }, + { + "epoch": 0.16, + "grad_norm": 0.14115546643733978, + "learning_rate": 9.542211975141871e-06, + "loss": 0.515, + "step": 758 + }, + { + "epoch": 0.16, + "grad_norm": 0.12460020929574966, + "learning_rate": 9.540752387158213e-06, + "loss": 0.5186, + "step": 759 + }, + { + "epoch": 0.16, + "grad_norm": 0.2988269627094269, + "learning_rate": 9.5392905880329e-06, + "loss": 0.5062, + "step": 760 + }, + { + "epoch": 0.16, + "grad_norm": 0.1358107179403305, + "learning_rate": 9.537826578477758e-06, + "loss": 0.5129, + "step": 761 + }, + { + "epoch": 0.16, + "grad_norm": 0.1808885931968689, + "learning_rate": 9.5363603592057e-06, + "loss": 0.5442, + "step": 762 + }, + { + "epoch": 0.16, + "grad_norm": 0.16095423698425293, + "learning_rate": 9.534891930930705e-06, + "loss": 0.5632, + "step": 763 + }, + { + "epoch": 0.16, + "grad_norm": 0.14927184581756592, + "learning_rate": 9.53342129436783e-06, + "loss": 0.5345, + "step": 764 + }, + { + "epoch": 0.16, + "grad_norm": 0.17672008275985718, + "learning_rate": 9.531948450233213e-06, + "loss": 0.5667, + "step": 765 + }, + { + "epoch": 0.17, + "grad_norm": 0.17709845304489136, + "learning_rate": 9.530473399244061e-06, + "loss": 0.5354, + "step": 766 + }, + { + "epoch": 0.17, + "grad_norm": 0.16679351031780243, + "learning_rate": 9.528996142118654e-06, + "loss": 0.5584, + "step": 767 + }, + { + "epoch": 0.17, + "grad_norm": 0.21075226366519928, + "learning_rate": 9.527516679576353e-06, + "loss": 0.4759, + "step": 768 + }, + { + "epoch": 0.17, + "grad_norm": 0.15864352881908417, + "learning_rate": 9.526035012337591e-06, + "loss": 0.5861, + "step": 769 + }, + { + "epoch": 0.17, + "grad_norm": 0.18424198031425476, + "learning_rate": 9.52455114112387e-06, + "loss": 0.5176, + "step": 770 + }, + { + "epoch": 0.17, + "grad_norm": 0.14614816009998322, + "learning_rate": 9.523065066657769e-06, + "loss": 0.5267, + "step": 771 + }, + { + "epoch": 0.17, + "grad_norm": 0.18655577301979065, + "learning_rate": 9.52157678966294e-06, + "loss": 0.5034, + "step": 772 + }, + { + "epoch": 0.17, + "grad_norm": 0.1492408663034439, + "learning_rate": 9.520086310864104e-06, + "loss": 0.5242, + "step": 773 + }, + { + "epoch": 0.17, + "grad_norm": 0.18119966983795166, + "learning_rate": 9.518593630987063e-06, + "loss": 0.503, + "step": 774 + }, + { + "epoch": 0.17, + "grad_norm": 0.2733058035373688, + "learning_rate": 9.51709875075868e-06, + "loss": 0.5293, + "step": 775 + }, + { + "epoch": 0.17, + "grad_norm": 0.15847504138946533, + "learning_rate": 9.515601670906895e-06, + "loss": 0.5012, + "step": 776 + }, + { + "epoch": 0.17, + "grad_norm": 0.17875181138515472, + "learning_rate": 9.51410239216072e-06, + "loss": 0.4895, + "step": 777 + }, + { + "epoch": 0.17, + "grad_norm": 0.19667181372642517, + "learning_rate": 9.512600915250232e-06, + "loss": 0.5493, + "step": 778 + }, + { + "epoch": 0.17, + "grad_norm": 0.1711205095052719, + "learning_rate": 9.511097240906588e-06, + "loss": 0.4674, + "step": 779 + }, + { + "epoch": 0.17, + "grad_norm": 0.18481481075286865, + "learning_rate": 9.509591369862007e-06, + "loss": 0.5166, + "step": 780 + }, + { + "epoch": 0.17, + "grad_norm": 0.15598368644714355, + "learning_rate": 9.50808330284978e-06, + "loss": 0.5697, + "step": 781 + }, + { + "epoch": 0.17, + "grad_norm": 0.19259214401245117, + "learning_rate": 9.506573040604268e-06, + "loss": 0.5114, + "step": 782 + }, + { + "epoch": 0.17, + "grad_norm": 0.14538073539733887, + "learning_rate": 9.5050605838609e-06, + "loss": 0.5485, + "step": 783 + }, + { + "epoch": 0.17, + "grad_norm": 0.18423911929130554, + "learning_rate": 9.503545933356175e-06, + "loss": 0.5254, + "step": 784 + }, + { + "epoch": 0.17, + "grad_norm": 0.1563284546136856, + "learning_rate": 9.50202908982766e-06, + "loss": 0.5266, + "step": 785 + }, + { + "epoch": 0.17, + "grad_norm": 0.16368651390075684, + "learning_rate": 9.500510054013989e-06, + "loss": 0.5289, + "step": 786 + }, + { + "epoch": 0.17, + "grad_norm": 0.16315564513206482, + "learning_rate": 9.498988826654863e-06, + "loss": 0.4904, + "step": 787 + }, + { + "epoch": 0.17, + "grad_norm": 0.15771108865737915, + "learning_rate": 9.49746540849105e-06, + "loss": 0.5132, + "step": 788 + }, + { + "epoch": 0.17, + "grad_norm": 0.19994409382343292, + "learning_rate": 9.49593980026439e-06, + "loss": 0.5498, + "step": 789 + }, + { + "epoch": 0.17, + "grad_norm": 0.13863793015480042, + "learning_rate": 9.494412002717784e-06, + "loss": 0.5206, + "step": 790 + }, + { + "epoch": 0.17, + "grad_norm": 0.17389997839927673, + "learning_rate": 9.4928820165952e-06, + "loss": 0.4742, + "step": 791 + }, + { + "epoch": 0.17, + "grad_norm": 0.15407484769821167, + "learning_rate": 9.49134984264167e-06, + "loss": 0.4783, + "step": 792 + }, + { + "epoch": 0.17, + "grad_norm": 0.15034940838813782, + "learning_rate": 9.489815481603297e-06, + "loss": 0.5066, + "step": 793 + }, + { + "epoch": 0.17, + "grad_norm": 0.14711235463619232, + "learning_rate": 9.488278934227242e-06, + "loss": 0.5068, + "step": 794 + }, + { + "epoch": 0.17, + "grad_norm": 0.17346839606761932, + "learning_rate": 9.48674020126174e-06, + "loss": 0.536, + "step": 795 + }, + { + "epoch": 0.17, + "grad_norm": 0.14369408786296844, + "learning_rate": 9.485199283456078e-06, + "loss": 0.4971, + "step": 796 + }, + { + "epoch": 0.17, + "grad_norm": 0.1965474933385849, + "learning_rate": 9.483656181560618e-06, + "loss": 0.5791, + "step": 797 + }, + { + "epoch": 0.17, + "grad_norm": 0.17605896294116974, + "learning_rate": 9.48211089632678e-06, + "loss": 0.5551, + "step": 798 + }, + { + "epoch": 0.17, + "grad_norm": 0.1731802225112915, + "learning_rate": 9.480563428507045e-06, + "loss": 0.4776, + "step": 799 + }, + { + "epoch": 0.17, + "grad_norm": 0.17883409559726715, + "learning_rate": 9.479013778854966e-06, + "loss": 0.5357, + "step": 800 + }, + { + "epoch": 0.17, + "grad_norm": 0.1549665927886963, + "learning_rate": 9.477461948125149e-06, + "loss": 0.4987, + "step": 801 + }, + { + "epoch": 0.17, + "grad_norm": 0.23310746252536774, + "learning_rate": 9.475907937073265e-06, + "loss": 0.5242, + "step": 802 + }, + { + "epoch": 0.17, + "grad_norm": 0.21235214173793793, + "learning_rate": 9.474351746456048e-06, + "loss": 0.4909, + "step": 803 + }, + { + "epoch": 0.17, + "grad_norm": 0.16170482337474823, + "learning_rate": 9.472793377031293e-06, + "loss": 0.4607, + "step": 804 + }, + { + "epoch": 0.17, + "grad_norm": 0.21534408628940582, + "learning_rate": 9.471232829557857e-06, + "loss": 0.5182, + "step": 805 + }, + { + "epoch": 0.17, + "grad_norm": 0.155525341629982, + "learning_rate": 9.469670104795655e-06, + "loss": 0.5337, + "step": 806 + }, + { + "epoch": 0.17, + "grad_norm": 0.1875993311405182, + "learning_rate": 9.468105203505661e-06, + "loss": 0.4955, + "step": 807 + }, + { + "epoch": 0.17, + "grad_norm": 0.1549602895975113, + "learning_rate": 9.466538126449915e-06, + "loss": 0.5879, + "step": 808 + }, + { + "epoch": 0.17, + "grad_norm": 0.22798140347003937, + "learning_rate": 9.464968874391511e-06, + "loss": 0.539, + "step": 809 + }, + { + "epoch": 0.17, + "grad_norm": 0.1601991057395935, + "learning_rate": 9.463397448094605e-06, + "loss": 0.4695, + "step": 810 + }, + { + "epoch": 0.17, + "grad_norm": 0.16516649723052979, + "learning_rate": 9.46182384832441e-06, + "loss": 0.5621, + "step": 811 + }, + { + "epoch": 0.17, + "grad_norm": 0.14943736791610718, + "learning_rate": 9.460248075847199e-06, + "loss": 0.5337, + "step": 812 + }, + { + "epoch": 0.18, + "grad_norm": 0.1822364181280136, + "learning_rate": 9.4586701314303e-06, + "loss": 0.5071, + "step": 813 + }, + { + "epoch": 0.18, + "grad_norm": 0.16500526666641235, + "learning_rate": 9.457090015842104e-06, + "loss": 0.483, + "step": 814 + }, + { + "epoch": 0.18, + "grad_norm": 0.1568198800086975, + "learning_rate": 9.455507729852053e-06, + "loss": 0.496, + "step": 815 + }, + { + "epoch": 0.18, + "grad_norm": 0.17206601798534393, + "learning_rate": 9.453923274230653e-06, + "loss": 0.5544, + "step": 816 + }, + { + "epoch": 0.18, + "grad_norm": 0.15982304513454437, + "learning_rate": 9.452336649749458e-06, + "loss": 0.5124, + "step": 817 + }, + { + "epoch": 0.18, + "grad_norm": 0.19488324224948883, + "learning_rate": 9.450747857181084e-06, + "loss": 0.4981, + "step": 818 + }, + { + "epoch": 0.18, + "grad_norm": 0.23650221526622772, + "learning_rate": 9.449156897299202e-06, + "loss": 0.5373, + "step": 819 + }, + { + "epoch": 0.18, + "grad_norm": 0.15237529575824738, + "learning_rate": 9.447563770878535e-06, + "loss": 0.5248, + "step": 820 + }, + { + "epoch": 0.18, + "grad_norm": 0.15700353682041168, + "learning_rate": 9.44596847869487e-06, + "loss": 0.5289, + "step": 821 + }, + { + "epoch": 0.18, + "grad_norm": 0.17049898207187653, + "learning_rate": 9.444371021525036e-06, + "loss": 0.5195, + "step": 822 + }, + { + "epoch": 0.18, + "grad_norm": 0.18980465829372406, + "learning_rate": 9.442771400146926e-06, + "loss": 0.5191, + "step": 823 + }, + { + "epoch": 0.18, + "grad_norm": 0.14770746231079102, + "learning_rate": 9.441169615339482e-06, + "loss": 0.4799, + "step": 824 + }, + { + "epoch": 0.18, + "grad_norm": 0.1894197016954422, + "learning_rate": 9.439565667882702e-06, + "loss": 0.5771, + "step": 825 + }, + { + "epoch": 0.18, + "grad_norm": 0.17405198514461517, + "learning_rate": 9.437959558557635e-06, + "loss": 0.5276, + "step": 826 + }, + { + "epoch": 0.18, + "grad_norm": 0.2038612961769104, + "learning_rate": 9.436351288146383e-06, + "loss": 0.4888, + "step": 827 + }, + { + "epoch": 0.18, + "grad_norm": 0.18169601261615753, + "learning_rate": 9.434740857432105e-06, + "loss": 0.5273, + "step": 828 + }, + { + "epoch": 0.18, + "grad_norm": 0.19223563373088837, + "learning_rate": 9.433128267199006e-06, + "loss": 0.534, + "step": 829 + }, + { + "epoch": 0.18, + "grad_norm": 0.20077872276306152, + "learning_rate": 9.431513518232343e-06, + "loss": 0.5153, + "step": 830 + }, + { + "epoch": 0.18, + "grad_norm": 0.1688869744539261, + "learning_rate": 9.429896611318428e-06, + "loss": 0.5408, + "step": 831 + }, + { + "epoch": 0.18, + "grad_norm": 0.24384887516498566, + "learning_rate": 9.42827754724462e-06, + "loss": 0.5771, + "step": 832 + }, + { + "epoch": 0.18, + "grad_norm": 0.15766644477844238, + "learning_rate": 9.426656326799333e-06, + "loss": 0.4948, + "step": 833 + }, + { + "epoch": 0.18, + "grad_norm": 0.1572624146938324, + "learning_rate": 9.425032950772025e-06, + "loss": 0.5612, + "step": 834 + }, + { + "epoch": 0.18, + "grad_norm": 0.15511459112167358, + "learning_rate": 9.42340741995321e-06, + "loss": 0.544, + "step": 835 + }, + { + "epoch": 0.18, + "grad_norm": 0.1777951866388321, + "learning_rate": 9.421779735134446e-06, + "loss": 0.5394, + "step": 836 + }, + { + "epoch": 0.18, + "grad_norm": 0.2677023410797119, + "learning_rate": 9.420149897108341e-06, + "loss": 0.484, + "step": 837 + }, + { + "epoch": 0.18, + "grad_norm": 0.1472686529159546, + "learning_rate": 9.418517906668556e-06, + "loss": 0.4913, + "step": 838 + }, + { + "epoch": 0.18, + "grad_norm": 0.15383826196193695, + "learning_rate": 9.416883764609797e-06, + "loss": 0.4718, + "step": 839 + }, + { + "epoch": 0.18, + "grad_norm": 0.19486670196056366, + "learning_rate": 9.415247471727813e-06, + "loss": 0.527, + "step": 840 + }, + { + "epoch": 0.18, + "grad_norm": 0.16585233807563782, + "learning_rate": 9.413609028819409e-06, + "loss": 0.5039, + "step": 841 + }, + { + "epoch": 0.18, + "grad_norm": 0.18775971233844757, + "learning_rate": 9.41196843668243e-06, + "loss": 0.4744, + "step": 842 + }, + { + "epoch": 0.18, + "grad_norm": 0.16499534249305725, + "learning_rate": 9.410325696115775e-06, + "loss": 0.5376, + "step": 843 + }, + { + "epoch": 0.18, + "grad_norm": 0.1950598657131195, + "learning_rate": 9.408680807919377e-06, + "loss": 0.5213, + "step": 844 + }, + { + "epoch": 0.18, + "grad_norm": 0.14264388382434845, + "learning_rate": 9.407033772894229e-06, + "loss": 0.566, + "step": 845 + }, + { + "epoch": 0.18, + "grad_norm": 0.16956187784671783, + "learning_rate": 9.405384591842358e-06, + "loss": 0.5058, + "step": 846 + }, + { + "epoch": 0.18, + "grad_norm": 0.13649915158748627, + "learning_rate": 9.403733265566848e-06, + "loss": 0.4948, + "step": 847 + }, + { + "epoch": 0.18, + "grad_norm": 0.1546815037727356, + "learning_rate": 9.402079794871812e-06, + "loss": 0.5087, + "step": 848 + }, + { + "epoch": 0.18, + "grad_norm": 0.17630915343761444, + "learning_rate": 9.400424180562421e-06, + "loss": 0.5477, + "step": 849 + }, + { + "epoch": 0.18, + "grad_norm": 0.19923992455005646, + "learning_rate": 9.398766423444883e-06, + "loss": 0.5332, + "step": 850 + }, + { + "epoch": 0.18, + "grad_norm": 0.1514226794242859, + "learning_rate": 9.397106524326449e-06, + "loss": 0.5278, + "step": 851 + }, + { + "epoch": 0.18, + "grad_norm": 0.17602422833442688, + "learning_rate": 9.39544448401542e-06, + "loss": 0.4708, + "step": 852 + }, + { + "epoch": 0.18, + "grad_norm": 0.17394909262657166, + "learning_rate": 9.393780303321128e-06, + "loss": 0.5128, + "step": 853 + }, + { + "epoch": 0.18, + "grad_norm": 0.14890971779823303, + "learning_rate": 9.392113983053958e-06, + "loss": 0.4967, + "step": 854 + }, + { + "epoch": 0.18, + "grad_norm": 0.18306109309196472, + "learning_rate": 9.390445524025336e-06, + "loss": 0.4917, + "step": 855 + }, + { + "epoch": 0.18, + "grad_norm": 0.16756963729858398, + "learning_rate": 9.38877492704772e-06, + "loss": 0.5143, + "step": 856 + }, + { + "epoch": 0.18, + "grad_norm": 0.15101511776447296, + "learning_rate": 9.387102192934618e-06, + "loss": 0.5214, + "step": 857 + }, + { + "epoch": 0.18, + "grad_norm": 0.21072083711624146, + "learning_rate": 9.385427322500575e-06, + "loss": 0.5188, + "step": 858 + }, + { + "epoch": 0.19, + "grad_norm": 0.3193773627281189, + "learning_rate": 9.38375031656118e-06, + "loss": 0.5248, + "step": 859 + }, + { + "epoch": 0.19, + "grad_norm": 0.17284849286079407, + "learning_rate": 9.382071175933058e-06, + "loss": 0.5331, + "step": 860 + }, + { + "epoch": 0.19, + "grad_norm": 0.16421107947826385, + "learning_rate": 9.380389901433875e-06, + "loss": 0.5512, + "step": 861 + }, + { + "epoch": 0.19, + "grad_norm": 0.19052369892597198, + "learning_rate": 9.378706493882335e-06, + "loss": 0.5485, + "step": 862 + }, + { + "epoch": 0.19, + "grad_norm": 0.20467452704906464, + "learning_rate": 9.377020954098181e-06, + "loss": 0.5334, + "step": 863 + }, + { + "epoch": 0.19, + "grad_norm": 0.14375852048397064, + "learning_rate": 9.375333282902198e-06, + "loss": 0.5574, + "step": 864 + }, + { + "epoch": 0.19, + "grad_norm": 0.16476349532604218, + "learning_rate": 9.3736434811162e-06, + "loss": 0.542, + "step": 865 + }, + { + "epoch": 0.19, + "grad_norm": 0.18003122508525848, + "learning_rate": 9.37195154956305e-06, + "loss": 0.5179, + "step": 866 + }, + { + "epoch": 0.19, + "grad_norm": 0.17590996623039246, + "learning_rate": 9.37025748906664e-06, + "loss": 0.5528, + "step": 867 + }, + { + "epoch": 0.19, + "grad_norm": 0.20183418691158295, + "learning_rate": 9.368561300451902e-06, + "loss": 0.544, + "step": 868 + }, + { + "epoch": 0.19, + "grad_norm": 0.1688835769891739, + "learning_rate": 9.366862984544802e-06, + "loss": 0.4812, + "step": 869 + }, + { + "epoch": 0.19, + "grad_norm": 0.21423234045505524, + "learning_rate": 9.365162542172346e-06, + "loss": 0.5428, + "step": 870 + }, + { + "epoch": 0.19, + "grad_norm": 0.15897509455680847, + "learning_rate": 9.363459974162568e-06, + "loss": 0.5227, + "step": 871 + }, + { + "epoch": 0.19, + "grad_norm": 0.16136378049850464, + "learning_rate": 9.361755281344547e-06, + "loss": 0.555, + "step": 872 + }, + { + "epoch": 0.19, + "grad_norm": 0.1796402931213379, + "learning_rate": 9.360048464548386e-06, + "loss": 0.4782, + "step": 873 + }, + { + "epoch": 0.19, + "grad_norm": 0.531697690486908, + "learning_rate": 9.358339524605233e-06, + "loss": 0.5207, + "step": 874 + }, + { + "epoch": 0.19, + "grad_norm": 0.15121889114379883, + "learning_rate": 9.356628462347264e-06, + "loss": 0.4837, + "step": 875 + }, + { + "epoch": 0.19, + "grad_norm": 0.17013150453567505, + "learning_rate": 9.354915278607685e-06, + "loss": 0.4911, + "step": 876 + }, + { + "epoch": 0.19, + "grad_norm": 0.18947632610797882, + "learning_rate": 9.353199974220744e-06, + "loss": 0.5029, + "step": 877 + }, + { + "epoch": 0.19, + "grad_norm": 0.13245789706707, + "learning_rate": 9.351482550021713e-06, + "loss": 0.4782, + "step": 878 + }, + { + "epoch": 0.19, + "grad_norm": 0.21231511235237122, + "learning_rate": 9.349763006846903e-06, + "loss": 0.5535, + "step": 879 + }, + { + "epoch": 0.19, + "grad_norm": 0.18766769766807556, + "learning_rate": 9.348041345533653e-06, + "loss": 0.5222, + "step": 880 + }, + { + "epoch": 0.19, + "grad_norm": 0.16258256137371063, + "learning_rate": 9.346317566920335e-06, + "loss": 0.4873, + "step": 881 + }, + { + "epoch": 0.19, + "grad_norm": 0.14111053943634033, + "learning_rate": 9.34459167184635e-06, + "loss": 0.4795, + "step": 882 + }, + { + "epoch": 0.19, + "grad_norm": 0.20663069188594818, + "learning_rate": 9.342863661152133e-06, + "loss": 0.5221, + "step": 883 + }, + { + "epoch": 0.19, + "grad_norm": 0.1376432627439499, + "learning_rate": 9.341133535679145e-06, + "loss": 0.464, + "step": 884 + }, + { + "epoch": 0.19, + "grad_norm": 0.15190206468105316, + "learning_rate": 9.33940129626988e-06, + "loss": 0.5118, + "step": 885 + }, + { + "epoch": 0.19, + "grad_norm": 0.16546842455863953, + "learning_rate": 9.337666943767863e-06, + "loss": 0.5256, + "step": 886 + }, + { + "epoch": 0.19, + "grad_norm": 0.1859419345855713, + "learning_rate": 9.335930479017642e-06, + "loss": 0.562, + "step": 887 + }, + { + "epoch": 0.19, + "grad_norm": 0.2912534475326538, + "learning_rate": 9.334191902864799e-06, + "loss": 0.5298, + "step": 888 + }, + { + "epoch": 0.19, + "grad_norm": 0.16982656717300415, + "learning_rate": 9.33245121615594e-06, + "loss": 0.4953, + "step": 889 + }, + { + "epoch": 0.19, + "grad_norm": 0.21246029436588287, + "learning_rate": 9.330708419738704e-06, + "loss": 0.5222, + "step": 890 + }, + { + "epoch": 0.19, + "grad_norm": 0.3462158739566803, + "learning_rate": 9.328963514461753e-06, + "loss": 0.5451, + "step": 891 + }, + { + "epoch": 0.19, + "grad_norm": 0.14150933921337128, + "learning_rate": 9.327216501174775e-06, + "loss": 0.5529, + "step": 892 + }, + { + "epoch": 0.19, + "grad_norm": 0.15398851037025452, + "learning_rate": 9.32546738072849e-06, + "loss": 0.5258, + "step": 893 + }, + { + "epoch": 0.19, + "grad_norm": 0.14066843688488007, + "learning_rate": 9.323716153974639e-06, + "loss": 0.5097, + "step": 894 + }, + { + "epoch": 0.19, + "grad_norm": 0.1923949271440506, + "learning_rate": 9.321962821765991e-06, + "loss": 0.5511, + "step": 895 + }, + { + "epoch": 0.19, + "grad_norm": 0.2550576627254486, + "learning_rate": 9.320207384956339e-06, + "loss": 0.5541, + "step": 896 + }, + { + "epoch": 0.19, + "grad_norm": 0.178908571600914, + "learning_rate": 9.318449844400504e-06, + "loss": 0.5135, + "step": 897 + }, + { + "epoch": 0.19, + "grad_norm": 0.19086679816246033, + "learning_rate": 9.316690200954324e-06, + "loss": 0.5143, + "step": 898 + }, + { + "epoch": 0.19, + "grad_norm": 0.1290796995162964, + "learning_rate": 9.31492845547467e-06, + "loss": 0.502, + "step": 899 + }, + { + "epoch": 0.19, + "grad_norm": 0.14759333431720734, + "learning_rate": 9.313164608819434e-06, + "loss": 0.5287, + "step": 900 + }, + { + "epoch": 0.19, + "grad_norm": 0.158295676112175, + "learning_rate": 9.311398661847526e-06, + "loss": 0.56, + "step": 901 + }, + { + "epoch": 0.19, + "grad_norm": 0.15422774851322174, + "learning_rate": 9.309630615418884e-06, + "loss": 0.5334, + "step": 902 + }, + { + "epoch": 0.19, + "grad_norm": 0.16427233815193176, + "learning_rate": 9.307860470394467e-06, + "loss": 0.5364, + "step": 903 + }, + { + "epoch": 0.19, + "grad_norm": 0.21173103153705597, + "learning_rate": 9.306088227636257e-06, + "loss": 0.5094, + "step": 904 + }, + { + "epoch": 0.19, + "grad_norm": 0.21965515613555908, + "learning_rate": 9.304313888007254e-06, + "loss": 0.5219, + "step": 905 + }, + { + "epoch": 0.2, + "grad_norm": 0.15674197673797607, + "learning_rate": 9.302537452371482e-06, + "loss": 0.5188, + "step": 906 + }, + { + "epoch": 0.2, + "grad_norm": 0.19992782175540924, + "learning_rate": 9.300758921593986e-06, + "loss": 0.499, + "step": 907 + }, + { + "epoch": 0.2, + "grad_norm": 0.17217914760112762, + "learning_rate": 9.298978296540829e-06, + "loss": 0.5364, + "step": 908 + }, + { + "epoch": 0.2, + "grad_norm": 0.17133580148220062, + "learning_rate": 9.297195578079096e-06, + "loss": 0.4968, + "step": 909 + }, + { + "epoch": 0.2, + "grad_norm": 0.23099388182163239, + "learning_rate": 9.295410767076891e-06, + "loss": 0.5252, + "step": 910 + }, + { + "epoch": 0.2, + "grad_norm": 0.16104039549827576, + "learning_rate": 9.293623864403336e-06, + "loss": 0.4742, + "step": 911 + }, + { + "epoch": 0.2, + "grad_norm": 0.12712013721466064, + "learning_rate": 9.291834870928573e-06, + "loss": 0.559, + "step": 912 + }, + { + "epoch": 0.2, + "grad_norm": 0.17714501917362213, + "learning_rate": 9.29004378752376e-06, + "loss": 0.6085, + "step": 913 + }, + { + "epoch": 0.2, + "grad_norm": 0.16740843653678894, + "learning_rate": 9.288250615061073e-06, + "loss": 0.5035, + "step": 914 + }, + { + "epoch": 0.2, + "grad_norm": 0.19126859307289124, + "learning_rate": 9.286455354413707e-06, + "loss": 0.5777, + "step": 915 + }, + { + "epoch": 0.2, + "grad_norm": 0.14088517427444458, + "learning_rate": 9.284658006455871e-06, + "loss": 0.5092, + "step": 916 + }, + { + "epoch": 0.2, + "grad_norm": 0.14722870290279388, + "learning_rate": 9.282858572062795e-06, + "loss": 0.5206, + "step": 917 + }, + { + "epoch": 0.2, + "grad_norm": 0.1408064216375351, + "learning_rate": 9.281057052110725e-06, + "loss": 0.5287, + "step": 918 + }, + { + "epoch": 0.2, + "grad_norm": 0.1396157294511795, + "learning_rate": 9.279253447476914e-06, + "loss": 0.5116, + "step": 919 + }, + { + "epoch": 0.2, + "grad_norm": 0.14657460153102875, + "learning_rate": 9.27744775903964e-06, + "loss": 0.5108, + "step": 920 + }, + { + "epoch": 0.2, + "grad_norm": 0.17514435946941376, + "learning_rate": 9.27563998767819e-06, + "loss": 0.5112, + "step": 921 + }, + { + "epoch": 0.2, + "grad_norm": 0.17996759712696075, + "learning_rate": 9.27383013427287e-06, + "loss": 0.495, + "step": 922 + }, + { + "epoch": 0.2, + "grad_norm": 0.18228891491889954, + "learning_rate": 9.272018199704993e-06, + "loss": 0.4843, + "step": 923 + }, + { + "epoch": 0.2, + "grad_norm": 0.18271513283252716, + "learning_rate": 9.270204184856893e-06, + "loss": 0.5625, + "step": 924 + }, + { + "epoch": 0.2, + "grad_norm": 0.18662293255329132, + "learning_rate": 9.26838809061191e-06, + "loss": 0.5065, + "step": 925 + }, + { + "epoch": 0.2, + "grad_norm": 0.15625204145908356, + "learning_rate": 9.266569917854403e-06, + "loss": 0.5557, + "step": 926 + }, + { + "epoch": 0.2, + "grad_norm": 0.16261446475982666, + "learning_rate": 9.264749667469737e-06, + "loss": 0.5583, + "step": 927 + }, + { + "epoch": 0.2, + "grad_norm": 0.14734329283237457, + "learning_rate": 9.262927340344296e-06, + "loss": 0.567, + "step": 928 + }, + { + "epoch": 0.2, + "grad_norm": 0.18826404213905334, + "learning_rate": 9.261102937365468e-06, + "loss": 0.5309, + "step": 929 + }, + { + "epoch": 0.2, + "grad_norm": 0.18732258677482605, + "learning_rate": 9.259276459421655e-06, + "loss": 0.525, + "step": 930 + }, + { + "epoch": 0.2, + "grad_norm": 0.176020547747612, + "learning_rate": 9.257447907402272e-06, + "loss": 0.5187, + "step": 931 + }, + { + "epoch": 0.2, + "grad_norm": 0.15038305521011353, + "learning_rate": 9.255617282197739e-06, + "loss": 0.5049, + "step": 932 + }, + { + "epoch": 0.2, + "grad_norm": 0.15459555387496948, + "learning_rate": 9.253784584699488e-06, + "loss": 0.5021, + "step": 933 + }, + { + "epoch": 0.2, + "grad_norm": 0.16818863153457642, + "learning_rate": 9.25194981579996e-06, + "loss": 0.5109, + "step": 934 + }, + { + "epoch": 0.2, + "grad_norm": 0.158711776137352, + "learning_rate": 9.250112976392608e-06, + "loss": 0.5235, + "step": 935 + }, + { + "epoch": 0.2, + "grad_norm": 0.13350459933280945, + "learning_rate": 9.248274067371886e-06, + "loss": 0.5624, + "step": 936 + }, + { + "epoch": 0.2, + "grad_norm": 0.16148029267787933, + "learning_rate": 9.24643308963326e-06, + "loss": 0.5562, + "step": 937 + }, + { + "epoch": 0.2, + "grad_norm": 0.17886267602443695, + "learning_rate": 9.244590044073205e-06, + "loss": 0.5252, + "step": 938 + }, + { + "epoch": 0.2, + "grad_norm": 0.18493221700191498, + "learning_rate": 9.2427449315892e-06, + "loss": 0.5195, + "step": 939 + }, + { + "epoch": 0.2, + "grad_norm": 0.1529918760061264, + "learning_rate": 9.240897753079734e-06, + "loss": 0.517, + "step": 940 + }, + { + "epoch": 0.2, + "grad_norm": 0.18862253427505493, + "learning_rate": 9.239048509444296e-06, + "loss": 0.5214, + "step": 941 + }, + { + "epoch": 0.2, + "grad_norm": 0.1629784107208252, + "learning_rate": 9.237197201583386e-06, + "loss": 0.5421, + "step": 942 + }, + { + "epoch": 0.2, + "grad_norm": 0.2280578315258026, + "learning_rate": 9.235343830398506e-06, + "loss": 0.5033, + "step": 943 + }, + { + "epoch": 0.2, + "grad_norm": 0.15682753920555115, + "learning_rate": 9.233488396792167e-06, + "loss": 0.562, + "step": 944 + }, + { + "epoch": 0.2, + "grad_norm": 0.16542381048202515, + "learning_rate": 9.231630901667879e-06, + "loss": 0.5448, + "step": 945 + }, + { + "epoch": 0.2, + "grad_norm": 0.1738227754831314, + "learning_rate": 9.22977134593016e-06, + "loss": 0.5662, + "step": 946 + }, + { + "epoch": 0.2, + "grad_norm": 0.13837432861328125, + "learning_rate": 9.227909730484527e-06, + "loss": 0.5259, + "step": 947 + }, + { + "epoch": 0.2, + "grad_norm": 0.1606243997812271, + "learning_rate": 9.226046056237508e-06, + "loss": 0.5666, + "step": 948 + }, + { + "epoch": 0.2, + "grad_norm": 0.14131122827529907, + "learning_rate": 9.224180324096623e-06, + "loss": 0.5486, + "step": 949 + }, + { + "epoch": 0.2, + "grad_norm": 0.13392627239227295, + "learning_rate": 9.222312534970403e-06, + "loss": 0.4792, + "step": 950 + }, + { + "epoch": 0.2, + "grad_norm": 0.1538127064704895, + "learning_rate": 9.220442689768376e-06, + "loss": 0.484, + "step": 951 + }, + { + "epoch": 0.21, + "grad_norm": 0.13414621353149414, + "learning_rate": 9.218570789401071e-06, + "loss": 0.5123, + "step": 952 + }, + { + "epoch": 0.21, + "grad_norm": 0.17911511659622192, + "learning_rate": 9.21669683478002e-06, + "loss": 0.5549, + "step": 953 + }, + { + "epoch": 0.21, + "grad_norm": 0.19138379395008087, + "learning_rate": 9.214820826817754e-06, + "loss": 0.4892, + "step": 954 + }, + { + "epoch": 0.21, + "grad_norm": 0.20988555252552032, + "learning_rate": 9.212942766427806e-06, + "loss": 0.498, + "step": 955 + }, + { + "epoch": 0.21, + "grad_norm": 0.13097749650478363, + "learning_rate": 9.211062654524705e-06, + "loss": 0.4603, + "step": 956 + }, + { + "epoch": 0.21, + "grad_norm": 0.1466490477323532, + "learning_rate": 9.20918049202398e-06, + "loss": 0.4924, + "step": 957 + }, + { + "epoch": 0.21, + "grad_norm": 0.23887225985527039, + "learning_rate": 9.207296279842162e-06, + "loss": 0.5725, + "step": 958 + }, + { + "epoch": 0.21, + "grad_norm": 0.13960181176662445, + "learning_rate": 9.205410018896775e-06, + "loss": 0.5444, + "step": 959 + }, + { + "epoch": 0.21, + "grad_norm": 0.21269811689853668, + "learning_rate": 9.203521710106344e-06, + "loss": 0.5672, + "step": 960 + }, + { + "epoch": 0.21, + "grad_norm": 0.1967892199754715, + "learning_rate": 9.201631354390391e-06, + "loss": 0.5674, + "step": 961 + }, + { + "epoch": 0.21, + "grad_norm": 0.20892930030822754, + "learning_rate": 9.199738952669431e-06, + "loss": 0.4915, + "step": 962 + }, + { + "epoch": 0.21, + "grad_norm": 0.15803126990795135, + "learning_rate": 9.197844505864982e-06, + "loss": 0.4839, + "step": 963 + }, + { + "epoch": 0.21, + "grad_norm": 0.17779715359210968, + "learning_rate": 9.195948014899551e-06, + "loss": 0.5204, + "step": 964 + }, + { + "epoch": 0.21, + "grad_norm": 0.1472802758216858, + "learning_rate": 9.194049480696647e-06, + "loss": 0.5691, + "step": 965 + }, + { + "epoch": 0.21, + "grad_norm": 0.19076858460903168, + "learning_rate": 9.192148904180769e-06, + "loss": 0.555, + "step": 966 + }, + { + "epoch": 0.21, + "grad_norm": 0.15820792317390442, + "learning_rate": 9.19024628627741e-06, + "loss": 0.5462, + "step": 967 + }, + { + "epoch": 0.21, + "grad_norm": 0.1319994479417801, + "learning_rate": 9.188341627913061e-06, + "loss": 0.5487, + "step": 968 + }, + { + "epoch": 0.21, + "grad_norm": 0.24205906689167023, + "learning_rate": 9.186434930015205e-06, + "loss": 0.518, + "step": 969 + }, + { + "epoch": 0.21, + "grad_norm": 0.15955299139022827, + "learning_rate": 9.184526193512318e-06, + "loss": 0.5596, + "step": 970 + }, + { + "epoch": 0.21, + "grad_norm": 0.16520148515701294, + "learning_rate": 9.182615419333867e-06, + "loss": 0.5647, + "step": 971 + }, + { + "epoch": 0.21, + "grad_norm": 0.2001345306634903, + "learning_rate": 9.180702608410314e-06, + "loss": 0.544, + "step": 972 + }, + { + "epoch": 0.21, + "grad_norm": 0.17887279391288757, + "learning_rate": 9.178787761673111e-06, + "loss": 0.5225, + "step": 973 + }, + { + "epoch": 0.21, + "grad_norm": 0.15997150540351868, + "learning_rate": 9.176870880054704e-06, + "loss": 0.5674, + "step": 974 + }, + { + "epoch": 0.21, + "grad_norm": 0.14125515520572662, + "learning_rate": 9.174951964488528e-06, + "loss": 0.5542, + "step": 975 + }, + { + "epoch": 0.21, + "grad_norm": 0.1298058182001114, + "learning_rate": 9.173031015909005e-06, + "loss": 0.5015, + "step": 976 + }, + { + "epoch": 0.21, + "grad_norm": 0.17486491799354553, + "learning_rate": 9.17110803525155e-06, + "loss": 0.569, + "step": 977 + }, + { + "epoch": 0.21, + "grad_norm": 0.18652723729610443, + "learning_rate": 9.169183023452574e-06, + "loss": 0.5062, + "step": 978 + }, + { + "epoch": 0.21, + "grad_norm": 0.1338779628276825, + "learning_rate": 9.167255981449466e-06, + "loss": 0.5122, + "step": 979 + }, + { + "epoch": 0.21, + "grad_norm": 0.13061174750328064, + "learning_rate": 9.165326910180608e-06, + "loss": 0.4903, + "step": 980 + }, + { + "epoch": 0.21, + "grad_norm": 0.13457538187503815, + "learning_rate": 9.163395810585374e-06, + "loss": 0.5316, + "step": 981 + }, + { + "epoch": 0.21, + "grad_norm": 0.14567075669765472, + "learning_rate": 9.161462683604118e-06, + "loss": 0.5241, + "step": 982 + }, + { + "epoch": 0.21, + "grad_norm": 0.2161962240934372, + "learning_rate": 9.159527530178191e-06, + "loss": 0.513, + "step": 983 + }, + { + "epoch": 0.21, + "grad_norm": 0.14327682554721832, + "learning_rate": 9.157590351249923e-06, + "loss": 0.5493, + "step": 984 + }, + { + "epoch": 0.21, + "grad_norm": 0.14309169352054596, + "learning_rate": 9.155651147762631e-06, + "loss": 0.514, + "step": 985 + }, + { + "epoch": 0.21, + "grad_norm": 0.16566166281700134, + "learning_rate": 9.153709920660624e-06, + "loss": 0.4916, + "step": 986 + }, + { + "epoch": 0.21, + "grad_norm": 0.18244121968746185, + "learning_rate": 9.151766670889186e-06, + "loss": 0.5397, + "step": 987 + }, + { + "epoch": 0.21, + "grad_norm": 0.1684887707233429, + "learning_rate": 9.149821399394597e-06, + "loss": 0.5094, + "step": 988 + }, + { + "epoch": 0.21, + "grad_norm": 0.15885643661022186, + "learning_rate": 9.147874107124114e-06, + "loss": 0.5258, + "step": 989 + }, + { + "epoch": 0.21, + "grad_norm": 0.2527085542678833, + "learning_rate": 9.145924795025984e-06, + "loss": 0.5456, + "step": 990 + }, + { + "epoch": 0.21, + "grad_norm": 0.20791400969028473, + "learning_rate": 9.14397346404943e-06, + "loss": 0.5137, + "step": 991 + }, + { + "epoch": 0.21, + "grad_norm": 0.18550600111484528, + "learning_rate": 9.142020115144662e-06, + "loss": 0.4834, + "step": 992 + }, + { + "epoch": 0.21, + "grad_norm": 0.15677522122859955, + "learning_rate": 9.140064749262876e-06, + "loss": 0.5201, + "step": 993 + }, + { + "epoch": 0.21, + "grad_norm": 0.15685126185417175, + "learning_rate": 9.138107367356247e-06, + "loss": 0.4838, + "step": 994 + }, + { + "epoch": 0.21, + "grad_norm": 0.13539238274097443, + "learning_rate": 9.136147970377926e-06, + "loss": 0.5323, + "step": 995 + }, + { + "epoch": 0.21, + "grad_norm": 0.18492737412452698, + "learning_rate": 9.134186559282058e-06, + "loss": 0.5457, + "step": 996 + }, + { + "epoch": 0.21, + "grad_norm": 0.14817145466804504, + "learning_rate": 9.132223135023759e-06, + "loss": 0.5151, + "step": 997 + }, + { + "epoch": 0.21, + "grad_norm": 0.17167848348617554, + "learning_rate": 9.130257698559129e-06, + "loss": 0.5397, + "step": 998 + }, + { + "epoch": 0.22, + "grad_norm": 0.15762774646282196, + "learning_rate": 9.128290250845244e-06, + "loss": 0.527, + "step": 999 + }, + { + "epoch": 0.22, + "grad_norm": 0.20650818943977356, + "learning_rate": 9.126320792840165e-06, + "loss": 0.5657, + "step": 1000 + }, + { + "epoch": 0.22, + "grad_norm": 0.192567840218544, + "learning_rate": 9.124349325502928e-06, + "loss": 0.5291, + "step": 1001 + }, + { + "epoch": 0.22, + "grad_norm": 0.13800346851348877, + "learning_rate": 9.12237584979355e-06, + "loss": 0.526, + "step": 1002 + }, + { + "epoch": 0.22, + "grad_norm": 0.12781374156475067, + "learning_rate": 9.120400366673024e-06, + "loss": 0.5068, + "step": 1003 + }, + { + "epoch": 0.22, + "grad_norm": 0.1455235779285431, + "learning_rate": 9.11842287710332e-06, + "loss": 0.4949, + "step": 1004 + }, + { + "epoch": 0.22, + "grad_norm": 0.16621056199073792, + "learning_rate": 9.116443382047391e-06, + "loss": 0.5166, + "step": 1005 + }, + { + "epoch": 0.22, + "grad_norm": 0.19221191108226776, + "learning_rate": 9.114461882469154e-06, + "loss": 0.5088, + "step": 1006 + }, + { + "epoch": 0.22, + "grad_norm": 0.15902382135391235, + "learning_rate": 9.112478379333517e-06, + "loss": 0.5388, + "step": 1007 + }, + { + "epoch": 0.22, + "grad_norm": 0.13084392249584198, + "learning_rate": 9.110492873606351e-06, + "loss": 0.4672, + "step": 1008 + }, + { + "epoch": 0.22, + "grad_norm": 0.15393121540546417, + "learning_rate": 9.108505366254512e-06, + "loss": 0.5063, + "step": 1009 + }, + { + "epoch": 0.22, + "grad_norm": 0.16303934156894684, + "learning_rate": 9.106515858245825e-06, + "loss": 0.545, + "step": 1010 + }, + { + "epoch": 0.22, + "grad_norm": 0.16543173789978027, + "learning_rate": 9.10452435054909e-06, + "loss": 0.5345, + "step": 1011 + }, + { + "epoch": 0.22, + "grad_norm": 0.16311848163604736, + "learning_rate": 9.102530844134084e-06, + "loss": 0.4611, + "step": 1012 + }, + { + "epoch": 0.22, + "grad_norm": 0.16494883596897125, + "learning_rate": 9.10053533997155e-06, + "loss": 0.4955, + "step": 1013 + }, + { + "epoch": 0.22, + "grad_norm": 0.14451864361763, + "learning_rate": 9.098537839033213e-06, + "loss": 0.4997, + "step": 1014 + }, + { + "epoch": 0.22, + "grad_norm": 0.20046649873256683, + "learning_rate": 9.096538342291763e-06, + "loss": 0.5718, + "step": 1015 + }, + { + "epoch": 0.22, + "grad_norm": 0.1361169070005417, + "learning_rate": 9.094536850720867e-06, + "loss": 0.4561, + "step": 1016 + }, + { + "epoch": 0.22, + "grad_norm": 0.1675615757703781, + "learning_rate": 9.09253336529516e-06, + "loss": 0.5372, + "step": 1017 + }, + { + "epoch": 0.22, + "grad_norm": 0.22339864075183868, + "learning_rate": 9.090527886990249e-06, + "loss": 0.5611, + "step": 1018 + }, + { + "epoch": 0.22, + "grad_norm": 0.17522381246089935, + "learning_rate": 9.088520416782712e-06, + "loss": 0.5352, + "step": 1019 + }, + { + "epoch": 0.22, + "grad_norm": 0.13996882736682892, + "learning_rate": 9.086510955650095e-06, + "loss": 0.4947, + "step": 1020 + }, + { + "epoch": 0.22, + "grad_norm": 0.15913517773151398, + "learning_rate": 9.084499504570918e-06, + "loss": 0.4947, + "step": 1021 + }, + { + "epoch": 0.22, + "grad_norm": 0.17235067486763, + "learning_rate": 9.082486064524663e-06, + "loss": 0.53, + "step": 1022 + }, + { + "epoch": 0.22, + "grad_norm": 0.2162034660577774, + "learning_rate": 9.080470636491787e-06, + "loss": 0.4904, + "step": 1023 + }, + { + "epoch": 0.22, + "grad_norm": 0.21353678405284882, + "learning_rate": 9.078453221453714e-06, + "loss": 0.5088, + "step": 1024 + }, + { + "epoch": 0.22, + "grad_norm": 0.1277047097682953, + "learning_rate": 9.076433820392831e-06, + "loss": 0.5207, + "step": 1025 + }, + { + "epoch": 0.22, + "grad_norm": 0.15845198929309845, + "learning_rate": 9.074412434292496e-06, + "loss": 0.5951, + "step": 1026 + }, + { + "epoch": 0.22, + "grad_norm": 0.17977949976921082, + "learning_rate": 9.072389064137035e-06, + "loss": 0.5098, + "step": 1027 + }, + { + "epoch": 0.22, + "grad_norm": 0.15521718561649323, + "learning_rate": 9.070363710911736e-06, + "loss": 0.5513, + "step": 1028 + }, + { + "epoch": 0.22, + "grad_norm": 0.14528630673885345, + "learning_rate": 9.068336375602853e-06, + "loss": 0.4895, + "step": 1029 + }, + { + "epoch": 0.22, + "grad_norm": 0.16791880130767822, + "learning_rate": 9.066307059197612e-06, + "loss": 0.528, + "step": 1030 + }, + { + "epoch": 0.22, + "grad_norm": 0.1570877581834793, + "learning_rate": 9.064275762684194e-06, + "loss": 0.4957, + "step": 1031 + }, + { + "epoch": 0.22, + "grad_norm": 0.130596324801445, + "learning_rate": 9.062242487051752e-06, + "loss": 0.5338, + "step": 1032 + }, + { + "epoch": 0.22, + "grad_norm": 0.14908380806446075, + "learning_rate": 9.060207233290396e-06, + "loss": 0.5295, + "step": 1033 + }, + { + "epoch": 0.22, + "grad_norm": 0.18400724232196808, + "learning_rate": 9.058170002391205e-06, + "loss": 0.5265, + "step": 1034 + }, + { + "epoch": 0.22, + "grad_norm": 0.1491273045539856, + "learning_rate": 9.05613079534622e-06, + "loss": 0.4974, + "step": 1035 + }, + { + "epoch": 0.22, + "grad_norm": 0.1835760623216629, + "learning_rate": 9.05408961314844e-06, + "loss": 0.5317, + "step": 1036 + }, + { + "epoch": 0.22, + "grad_norm": 0.14263573288917542, + "learning_rate": 9.052046456791829e-06, + "loss": 0.4928, + "step": 1037 + }, + { + "epoch": 0.22, + "grad_norm": 0.3876129686832428, + "learning_rate": 9.050001327271314e-06, + "loss": 0.5149, + "step": 1038 + }, + { + "epoch": 0.22, + "grad_norm": 0.16249504685401917, + "learning_rate": 9.04795422558278e-06, + "loss": 0.5251, + "step": 1039 + }, + { + "epoch": 0.22, + "grad_norm": 0.16931766271591187, + "learning_rate": 9.045905152723074e-06, + "loss": 0.5532, + "step": 1040 + }, + { + "epoch": 0.22, + "grad_norm": 0.1582767814397812, + "learning_rate": 9.043854109689998e-06, + "loss": 0.4976, + "step": 1041 + }, + { + "epoch": 0.22, + "grad_norm": 0.15859778225421906, + "learning_rate": 9.041801097482323e-06, + "loss": 0.4995, + "step": 1042 + }, + { + "epoch": 0.22, + "grad_norm": 0.18055035173892975, + "learning_rate": 9.03974611709977e-06, + "loss": 0.493, + "step": 1043 + }, + { + "epoch": 0.22, + "grad_norm": 0.16349811851978302, + "learning_rate": 9.037689169543024e-06, + "loss": 0.5102, + "step": 1044 + }, + { + "epoch": 0.23, + "grad_norm": 0.19477395713329315, + "learning_rate": 9.035630255813724e-06, + "loss": 0.5361, + "step": 1045 + }, + { + "epoch": 0.23, + "grad_norm": 0.2538851499557495, + "learning_rate": 9.033569376914467e-06, + "loss": 0.5118, + "step": 1046 + }, + { + "epoch": 0.23, + "grad_norm": 0.16743601858615875, + "learning_rate": 9.031506533848811e-06, + "loss": 0.5127, + "step": 1047 + }, + { + "epoch": 0.23, + "grad_norm": 0.1517488956451416, + "learning_rate": 9.029441727621267e-06, + "loss": 0.4791, + "step": 1048 + }, + { + "epoch": 0.23, + "grad_norm": 0.17050126194953918, + "learning_rate": 9.0273749592373e-06, + "loss": 0.5652, + "step": 1049 + }, + { + "epoch": 0.23, + "grad_norm": 0.20682963728904724, + "learning_rate": 9.025306229703334e-06, + "loss": 0.5183, + "step": 1050 + }, + { + "epoch": 0.23, + "grad_norm": 0.16146351397037506, + "learning_rate": 9.02323554002675e-06, + "loss": 0.5112, + "step": 1051 + }, + { + "epoch": 0.23, + "grad_norm": 0.23130019009113312, + "learning_rate": 9.021162891215879e-06, + "loss": 0.5573, + "step": 1052 + }, + { + "epoch": 0.23, + "grad_norm": 0.15757335722446442, + "learning_rate": 9.019088284280004e-06, + "loss": 0.5232, + "step": 1053 + }, + { + "epoch": 0.23, + "grad_norm": 0.14029166102409363, + "learning_rate": 9.017011720229368e-06, + "loss": 0.5329, + "step": 1054 + }, + { + "epoch": 0.23, + "grad_norm": 0.14857496321201324, + "learning_rate": 9.014933200075165e-06, + "loss": 0.514, + "step": 1055 + }, + { + "epoch": 0.23, + "grad_norm": 0.17802828550338745, + "learning_rate": 9.012852724829539e-06, + "loss": 0.5324, + "step": 1056 + }, + { + "epoch": 0.23, + "grad_norm": 0.18392032384872437, + "learning_rate": 9.010770295505587e-06, + "loss": 0.603, + "step": 1057 + }, + { + "epoch": 0.23, + "grad_norm": 0.13357198238372803, + "learning_rate": 9.008685913117361e-06, + "loss": 0.4848, + "step": 1058 + }, + { + "epoch": 0.23, + "grad_norm": 0.2151726484298706, + "learning_rate": 9.006599578679859e-06, + "loss": 0.4963, + "step": 1059 + }, + { + "epoch": 0.23, + "grad_norm": 0.1715989112854004, + "learning_rate": 9.00451129320903e-06, + "loss": 0.5639, + "step": 1060 + }, + { + "epoch": 0.23, + "grad_norm": 0.19878040254116058, + "learning_rate": 9.002421057721781e-06, + "loss": 0.5452, + "step": 1061 + }, + { + "epoch": 0.23, + "grad_norm": 0.16640903055667877, + "learning_rate": 9.000328873235955e-06, + "loss": 0.5471, + "step": 1062 + }, + { + "epoch": 0.23, + "grad_norm": 0.15267455577850342, + "learning_rate": 8.998234740770358e-06, + "loss": 0.545, + "step": 1063 + }, + { + "epoch": 0.23, + "grad_norm": 0.1756962686777115, + "learning_rate": 8.996138661344734e-06, + "loss": 0.5793, + "step": 1064 + }, + { + "epoch": 0.23, + "grad_norm": 0.1579316258430481, + "learning_rate": 8.994040635979779e-06, + "loss": 0.466, + "step": 1065 + }, + { + "epoch": 0.23, + "grad_norm": 0.14408744871616364, + "learning_rate": 8.99194066569714e-06, + "loss": 0.5637, + "step": 1066 + }, + { + "epoch": 0.23, + "grad_norm": 0.20260116457939148, + "learning_rate": 8.989838751519404e-06, + "loss": 0.5361, + "step": 1067 + }, + { + "epoch": 0.23, + "grad_norm": 0.17308081686496735, + "learning_rate": 8.987734894470111e-06, + "loss": 0.5083, + "step": 1068 + }, + { + "epoch": 0.23, + "grad_norm": 0.21290896832942963, + "learning_rate": 8.985629095573743e-06, + "loss": 0.5312, + "step": 1069 + }, + { + "epoch": 0.23, + "grad_norm": 0.15569837391376495, + "learning_rate": 8.983521355855731e-06, + "loss": 0.5513, + "step": 1070 + }, + { + "epoch": 0.23, + "grad_norm": 0.169041246175766, + "learning_rate": 8.98141167634245e-06, + "loss": 0.5262, + "step": 1071 + }, + { + "epoch": 0.23, + "grad_norm": 0.15449997782707214, + "learning_rate": 8.979300058061214e-06, + "loss": 0.5301, + "step": 1072 + }, + { + "epoch": 0.23, + "grad_norm": 0.15848426520824432, + "learning_rate": 8.977186502040288e-06, + "loss": 0.556, + "step": 1073 + }, + { + "epoch": 0.23, + "grad_norm": 0.1425653100013733, + "learning_rate": 8.97507100930888e-06, + "loss": 0.489, + "step": 1074 + }, + { + "epoch": 0.23, + "grad_norm": 0.1488298773765564, + "learning_rate": 8.97295358089714e-06, + "loss": 0.5091, + "step": 1075 + }, + { + "epoch": 0.23, + "grad_norm": 0.2116803079843521, + "learning_rate": 8.97083421783616e-06, + "loss": 0.5654, + "step": 1076 + }, + { + "epoch": 0.23, + "grad_norm": 0.17678038775920868, + "learning_rate": 8.96871292115797e-06, + "loss": 0.5485, + "step": 1077 + }, + { + "epoch": 0.23, + "grad_norm": 0.2219185084104538, + "learning_rate": 8.96658969189555e-06, + "loss": 0.5414, + "step": 1078 + }, + { + "epoch": 0.23, + "grad_norm": 0.18654341995716095, + "learning_rate": 8.964464531082817e-06, + "loss": 0.4603, + "step": 1079 + }, + { + "epoch": 0.23, + "grad_norm": 0.29177331924438477, + "learning_rate": 8.962337439754627e-06, + "loss": 0.5267, + "step": 1080 + }, + { + "epoch": 0.23, + "grad_norm": 0.15607115626335144, + "learning_rate": 8.960208418946778e-06, + "loss": 0.5295, + "step": 1081 + }, + { + "epoch": 0.23, + "grad_norm": 0.161067396402359, + "learning_rate": 8.958077469696007e-06, + "loss": 0.5795, + "step": 1082 + }, + { + "epoch": 0.23, + "grad_norm": 0.1314525008201599, + "learning_rate": 8.955944593039991e-06, + "loss": 0.5274, + "step": 1083 + }, + { + "epoch": 0.23, + "grad_norm": 0.1945776492357254, + "learning_rate": 8.953809790017342e-06, + "loss": 0.4744, + "step": 1084 + }, + { + "epoch": 0.23, + "grad_norm": 0.1876978725194931, + "learning_rate": 8.951673061667616e-06, + "loss": 0.5036, + "step": 1085 + }, + { + "epoch": 0.23, + "grad_norm": 0.1536783128976822, + "learning_rate": 8.949534409031305e-06, + "loss": 0.5387, + "step": 1086 + }, + { + "epoch": 0.23, + "grad_norm": 0.15228869020938873, + "learning_rate": 8.94739383314983e-06, + "loss": 0.4566, + "step": 1087 + }, + { + "epoch": 0.23, + "grad_norm": 0.17565909028053284, + "learning_rate": 8.94525133506556e-06, + "loss": 0.4965, + "step": 1088 + }, + { + "epoch": 0.23, + "grad_norm": 0.17287708818912506, + "learning_rate": 8.943106915821793e-06, + "loss": 0.505, + "step": 1089 + }, + { + "epoch": 0.23, + "grad_norm": 0.13172249495983124, + "learning_rate": 8.940960576462763e-06, + "loss": 0.522, + "step": 1090 + }, + { + "epoch": 0.24, + "grad_norm": 0.14747697114944458, + "learning_rate": 8.938812318033646e-06, + "loss": 0.5058, + "step": 1091 + }, + { + "epoch": 0.24, + "grad_norm": 0.16435351967811584, + "learning_rate": 8.93666214158054e-06, + "loss": 0.5572, + "step": 1092 + }, + { + "epoch": 0.24, + "grad_norm": 0.12998394668102264, + "learning_rate": 8.93451004815049e-06, + "loss": 0.4825, + "step": 1093 + }, + { + "epoch": 0.24, + "grad_norm": 0.2101740837097168, + "learning_rate": 8.932356038791465e-06, + "loss": 0.5399, + "step": 1094 + }, + { + "epoch": 0.24, + "grad_norm": 0.14743265509605408, + "learning_rate": 8.930200114552371e-06, + "loss": 0.4891, + "step": 1095 + }, + { + "epoch": 0.24, + "grad_norm": 0.19330647587776184, + "learning_rate": 8.928042276483048e-06, + "loss": 0.5756, + "step": 1096 + }, + { + "epoch": 0.24, + "grad_norm": 0.14885154366493225, + "learning_rate": 8.925882525634262e-06, + "loss": 0.4704, + "step": 1097 + }, + { + "epoch": 0.24, + "grad_norm": 0.17634066939353943, + "learning_rate": 8.923720863057718e-06, + "loss": 0.4969, + "step": 1098 + }, + { + "epoch": 0.24, + "grad_norm": 0.16363896429538727, + "learning_rate": 8.921557289806045e-06, + "loss": 0.5074, + "step": 1099 + }, + { + "epoch": 0.24, + "grad_norm": 0.20823244750499725, + "learning_rate": 8.919391806932807e-06, + "loss": 0.5217, + "step": 1100 + }, + { + "epoch": 0.24, + "grad_norm": 0.16124127805233002, + "learning_rate": 8.917224415492497e-06, + "loss": 0.4827, + "step": 1101 + }, + { + "epoch": 0.24, + "grad_norm": 0.16462095081806183, + "learning_rate": 8.915055116540538e-06, + "loss": 0.5878, + "step": 1102 + }, + { + "epoch": 0.24, + "grad_norm": 0.1553676277399063, + "learning_rate": 8.912883911133276e-06, + "loss": 0.4883, + "step": 1103 + }, + { + "epoch": 0.24, + "grad_norm": 0.17461282014846802, + "learning_rate": 8.910710800327996e-06, + "loss": 0.4893, + "step": 1104 + }, + { + "epoch": 0.24, + "grad_norm": 0.179164856672287, + "learning_rate": 8.908535785182902e-06, + "loss": 0.4993, + "step": 1105 + }, + { + "epoch": 0.24, + "grad_norm": 0.16661059856414795, + "learning_rate": 8.906358866757128e-06, + "loss": 0.4797, + "step": 1106 + }, + { + "epoch": 0.24, + "grad_norm": 0.15980976819992065, + "learning_rate": 8.904180046110736e-06, + "loss": 0.5167, + "step": 1107 + }, + { + "epoch": 0.24, + "grad_norm": 0.15015141665935516, + "learning_rate": 8.901999324304713e-06, + "loss": 0.4971, + "step": 1108 + }, + { + "epoch": 0.24, + "grad_norm": 0.15872696042060852, + "learning_rate": 8.899816702400973e-06, + "loss": 0.5469, + "step": 1109 + }, + { + "epoch": 0.24, + "grad_norm": 0.13243776559829712, + "learning_rate": 8.897632181462354e-06, + "loss": 0.5135, + "step": 1110 + }, + { + "epoch": 0.24, + "grad_norm": 0.1544090360403061, + "learning_rate": 8.895445762552618e-06, + "loss": 0.4792, + "step": 1111 + }, + { + "epoch": 0.24, + "grad_norm": 0.15280136466026306, + "learning_rate": 8.893257446736455e-06, + "loss": 0.4888, + "step": 1112 + }, + { + "epoch": 0.24, + "grad_norm": 0.14897377789020538, + "learning_rate": 8.891067235079473e-06, + "loss": 0.4846, + "step": 1113 + }, + { + "epoch": 0.24, + "grad_norm": 0.21805572509765625, + "learning_rate": 8.888875128648208e-06, + "loss": 0.5184, + "step": 1114 + }, + { + "epoch": 0.24, + "grad_norm": 0.15725421905517578, + "learning_rate": 8.886681128510118e-06, + "loss": 0.5857, + "step": 1115 + }, + { + "epoch": 0.24, + "grad_norm": 0.1463284194469452, + "learning_rate": 8.884485235733579e-06, + "loss": 0.4969, + "step": 1116 + }, + { + "epoch": 0.24, + "grad_norm": 0.1490708589553833, + "learning_rate": 8.882287451387894e-06, + "loss": 0.5814, + "step": 1117 + }, + { + "epoch": 0.24, + "grad_norm": 0.20178869366645813, + "learning_rate": 8.880087776543287e-06, + "loss": 0.5091, + "step": 1118 + }, + { + "epoch": 0.24, + "grad_norm": 0.1965067982673645, + "learning_rate": 8.877886212270897e-06, + "loss": 0.4933, + "step": 1119 + }, + { + "epoch": 0.24, + "grad_norm": 0.16523069143295288, + "learning_rate": 8.875682759642786e-06, + "loss": 0.5445, + "step": 1120 + }, + { + "epoch": 0.24, + "grad_norm": 0.1690714955329895, + "learning_rate": 8.873477419731938e-06, + "loss": 0.5567, + "step": 1121 + }, + { + "epoch": 0.24, + "grad_norm": 0.18909381330013275, + "learning_rate": 8.871270193612254e-06, + "loss": 0.5133, + "step": 1122 + }, + { + "epoch": 0.24, + "grad_norm": 0.1338469237089157, + "learning_rate": 8.869061082358555e-06, + "loss": 0.4958, + "step": 1123 + }, + { + "epoch": 0.24, + "grad_norm": 0.1580471694469452, + "learning_rate": 8.866850087046574e-06, + "loss": 0.5595, + "step": 1124 + }, + { + "epoch": 0.24, + "grad_norm": 0.1788654625415802, + "learning_rate": 8.864637208752972e-06, + "loss": 0.5481, + "step": 1125 + }, + { + "epoch": 0.24, + "grad_norm": 0.20803380012512207, + "learning_rate": 8.862422448555317e-06, + "loss": 0.5478, + "step": 1126 + }, + { + "epoch": 0.24, + "grad_norm": 0.19867488741874695, + "learning_rate": 8.860205807532097e-06, + "loss": 0.4927, + "step": 1127 + }, + { + "epoch": 0.24, + "grad_norm": 0.13807149231433868, + "learning_rate": 8.857987286762718e-06, + "loss": 0.5021, + "step": 1128 + }, + { + "epoch": 0.24, + "grad_norm": 0.15068547427654266, + "learning_rate": 8.8557668873275e-06, + "loss": 0.4993, + "step": 1129 + }, + { + "epoch": 0.24, + "grad_norm": 0.14488062262535095, + "learning_rate": 8.853544610307675e-06, + "loss": 0.4815, + "step": 1130 + }, + { + "epoch": 0.24, + "grad_norm": 0.15107618272304535, + "learning_rate": 8.851320456785394e-06, + "loss": 0.5086, + "step": 1131 + }, + { + "epoch": 0.24, + "grad_norm": 0.16421128809452057, + "learning_rate": 8.84909442784372e-06, + "loss": 0.4844, + "step": 1132 + }, + { + "epoch": 0.24, + "grad_norm": 0.17027032375335693, + "learning_rate": 8.846866524566624e-06, + "loss": 0.4721, + "step": 1133 + }, + { + "epoch": 0.24, + "grad_norm": 0.2614370584487915, + "learning_rate": 8.844636748038999e-06, + "loss": 0.5745, + "step": 1134 + }, + { + "epoch": 0.24, + "grad_norm": 0.15496228635311127, + "learning_rate": 8.842405099346645e-06, + "loss": 0.5499, + "step": 1135 + }, + { + "epoch": 0.24, + "grad_norm": 0.1893419474363327, + "learning_rate": 8.840171579576273e-06, + "loss": 0.4691, + "step": 1136 + }, + { + "epoch": 0.24, + "grad_norm": 0.13554450869560242, + "learning_rate": 8.837936189815507e-06, + "loss": 0.54, + "step": 1137 + }, + { + "epoch": 0.25, + "grad_norm": 0.12900054454803467, + "learning_rate": 8.83569893115288e-06, + "loss": 0.479, + "step": 1138 + }, + { + "epoch": 0.25, + "grad_norm": 0.1491711586713791, + "learning_rate": 8.83345980467784e-06, + "loss": 0.5322, + "step": 1139 + }, + { + "epoch": 0.25, + "grad_norm": 0.16243106126785278, + "learning_rate": 8.831218811480735e-06, + "loss": 0.4434, + "step": 1140 + }, + { + "epoch": 0.25, + "grad_norm": 0.16812367737293243, + "learning_rate": 8.828975952652833e-06, + "loss": 0.5024, + "step": 1141 + }, + { + "epoch": 0.25, + "grad_norm": 0.1857740879058838, + "learning_rate": 8.8267312292863e-06, + "loss": 0.5696, + "step": 1142 + }, + { + "epoch": 0.25, + "grad_norm": 0.13055641949176788, + "learning_rate": 8.824484642474217e-06, + "loss": 0.4787, + "step": 1143 + }, + { + "epoch": 0.25, + "grad_norm": 0.17672252655029297, + "learning_rate": 8.822236193310574e-06, + "loss": 0.5788, + "step": 1144 + }, + { + "epoch": 0.25, + "grad_norm": 0.15305279195308685, + "learning_rate": 8.81998588289026e-06, + "loss": 0.503, + "step": 1145 + }, + { + "epoch": 0.25, + "grad_norm": 0.15624657273292542, + "learning_rate": 8.817733712309078e-06, + "loss": 0.5346, + "step": 1146 + }, + { + "epoch": 0.25, + "grad_norm": 0.14786425232887268, + "learning_rate": 8.815479682663729e-06, + "loss": 0.5083, + "step": 1147 + }, + { + "epoch": 0.25, + "grad_norm": 0.19573761522769928, + "learning_rate": 8.813223795051828e-06, + "loss": 0.5298, + "step": 1148 + }, + { + "epoch": 0.25, + "grad_norm": 0.1662847250699997, + "learning_rate": 8.810966050571888e-06, + "loss": 0.533, + "step": 1149 + }, + { + "epoch": 0.25, + "grad_norm": 0.1873636543750763, + "learning_rate": 8.80870645032333e-06, + "loss": 0.4825, + "step": 1150 + }, + { + "epoch": 0.25, + "grad_norm": 0.1731029748916626, + "learning_rate": 8.806444995406475e-06, + "loss": 0.488, + "step": 1151 + }, + { + "epoch": 0.25, + "grad_norm": 0.18040412664413452, + "learning_rate": 8.804181686922555e-06, + "loss": 0.5282, + "step": 1152 + }, + { + "epoch": 0.25, + "grad_norm": 0.15593977272510529, + "learning_rate": 8.801916525973696e-06, + "loss": 0.5124, + "step": 1153 + }, + { + "epoch": 0.25, + "grad_norm": 0.15248659253120422, + "learning_rate": 8.799649513662926e-06, + "loss": 0.513, + "step": 1154 + }, + { + "epoch": 0.25, + "grad_norm": 0.14471983909606934, + "learning_rate": 8.797380651094182e-06, + "loss": 0.504, + "step": 1155 + }, + { + "epoch": 0.25, + "grad_norm": 0.1660238355398178, + "learning_rate": 8.795109939372298e-06, + "loss": 0.5266, + "step": 1156 + }, + { + "epoch": 0.25, + "grad_norm": 0.15838298201560974, + "learning_rate": 8.792837379603005e-06, + "loss": 0.5438, + "step": 1157 + }, + { + "epoch": 0.25, + "grad_norm": 0.17816348373889923, + "learning_rate": 8.79056297289294e-06, + "loss": 0.5428, + "step": 1158 + }, + { + "epoch": 0.25, + "grad_norm": 0.1319669485092163, + "learning_rate": 8.788286720349638e-06, + "loss": 0.5487, + "step": 1159 + }, + { + "epoch": 0.25, + "grad_norm": 0.14675050973892212, + "learning_rate": 8.786008623081526e-06, + "loss": 0.5409, + "step": 1160 + }, + { + "epoch": 0.25, + "grad_norm": 0.16564631462097168, + "learning_rate": 8.783728682197935e-06, + "loss": 0.5405, + "step": 1161 + }, + { + "epoch": 0.25, + "grad_norm": 0.1422412395477295, + "learning_rate": 8.781446898809101e-06, + "loss": 0.5069, + "step": 1162 + }, + { + "epoch": 0.25, + "grad_norm": 0.16499634087085724, + "learning_rate": 8.77916327402614e-06, + "loss": 0.5038, + "step": 1163 + }, + { + "epoch": 0.25, + "grad_norm": 0.19437891244888306, + "learning_rate": 8.776877808961082e-06, + "loss": 0.5249, + "step": 1164 + }, + { + "epoch": 0.25, + "grad_norm": 0.16480234265327454, + "learning_rate": 8.774590504726842e-06, + "loss": 0.5104, + "step": 1165 + }, + { + "epoch": 0.25, + "grad_norm": 0.12336334586143494, + "learning_rate": 8.772301362437233e-06, + "loss": 0.497, + "step": 1166 + }, + { + "epoch": 0.25, + "grad_norm": 0.19107873737812042, + "learning_rate": 8.770010383206967e-06, + "loss": 0.5441, + "step": 1167 + }, + { + "epoch": 0.25, + "grad_norm": 0.16102471947669983, + "learning_rate": 8.767717568151643e-06, + "loss": 0.4736, + "step": 1168 + }, + { + "epoch": 0.25, + "grad_norm": 0.14254657924175262, + "learning_rate": 8.765422918387764e-06, + "loss": 0.5339, + "step": 1169 + }, + { + "epoch": 0.25, + "grad_norm": 0.1567242443561554, + "learning_rate": 8.763126435032717e-06, + "loss": 0.5516, + "step": 1170 + }, + { + "epoch": 0.25, + "grad_norm": 0.16098615527153015, + "learning_rate": 8.760828119204787e-06, + "loss": 0.5642, + "step": 1171 + }, + { + "epoch": 0.25, + "grad_norm": 0.16631126403808594, + "learning_rate": 8.758527972023151e-06, + "loss": 0.4856, + "step": 1172 + }, + { + "epoch": 0.25, + "grad_norm": 0.15367335081100464, + "learning_rate": 8.756225994607877e-06, + "loss": 0.5066, + "step": 1173 + }, + { + "epoch": 0.25, + "grad_norm": 0.14037656784057617, + "learning_rate": 8.753922188079923e-06, + "loss": 0.5029, + "step": 1174 + }, + { + "epoch": 0.25, + "grad_norm": 0.15949761867523193, + "learning_rate": 8.75161655356114e-06, + "loss": 0.4636, + "step": 1175 + }, + { + "epoch": 0.25, + "grad_norm": 0.1654081493616104, + "learning_rate": 8.749309092174267e-06, + "loss": 0.5005, + "step": 1176 + }, + { + "epoch": 0.25, + "grad_norm": 0.2345263659954071, + "learning_rate": 8.746999805042932e-06, + "loss": 0.5147, + "step": 1177 + }, + { + "epoch": 0.25, + "grad_norm": 0.13465999066829681, + "learning_rate": 8.744688693291658e-06, + "loss": 0.4982, + "step": 1178 + }, + { + "epoch": 0.25, + "grad_norm": 0.1473112851381302, + "learning_rate": 8.74237575804585e-06, + "loss": 0.4857, + "step": 1179 + }, + { + "epoch": 0.25, + "grad_norm": 0.18562085926532745, + "learning_rate": 8.740061000431805e-06, + "loss": 0.505, + "step": 1180 + }, + { + "epoch": 0.25, + "grad_norm": 0.15015870332717896, + "learning_rate": 8.737744421576702e-06, + "loss": 0.5246, + "step": 1181 + }, + { + "epoch": 0.25, + "grad_norm": 0.16794438660144806, + "learning_rate": 8.735426022608611e-06, + "loss": 0.5393, + "step": 1182 + }, + { + "epoch": 0.25, + "grad_norm": 0.15591543912887573, + "learning_rate": 8.73310580465649e-06, + "loss": 0.4964, + "step": 1183 + }, + { + "epoch": 0.26, + "grad_norm": 0.2005312144756317, + "learning_rate": 8.73078376885018e-06, + "loss": 0.5, + "step": 1184 + }, + { + "epoch": 0.26, + "grad_norm": 0.15269523859024048, + "learning_rate": 8.728459916320406e-06, + "loss": 0.509, + "step": 1185 + }, + { + "epoch": 0.26, + "grad_norm": 0.14824025332927704, + "learning_rate": 8.726134248198782e-06, + "loss": 0.5186, + "step": 1186 + }, + { + "epoch": 0.26, + "grad_norm": 0.15085245668888092, + "learning_rate": 8.723806765617801e-06, + "loss": 0.4852, + "step": 1187 + }, + { + "epoch": 0.26, + "grad_norm": 0.1564967930316925, + "learning_rate": 8.721477469710845e-06, + "loss": 0.5095, + "step": 1188 + }, + { + "epoch": 0.26, + "grad_norm": 0.1731698215007782, + "learning_rate": 8.719146361612172e-06, + "loss": 0.5231, + "step": 1189 + }, + { + "epoch": 0.26, + "grad_norm": 0.18087385594844818, + "learning_rate": 8.71681344245693e-06, + "loss": 0.5556, + "step": 1190 + }, + { + "epoch": 0.26, + "grad_norm": 0.1844499558210373, + "learning_rate": 8.714478713381144e-06, + "loss": 0.5893, + "step": 1191 + }, + { + "epoch": 0.26, + "grad_norm": 0.12835489213466644, + "learning_rate": 8.712142175521723e-06, + "loss": 0.4653, + "step": 1192 + }, + { + "epoch": 0.26, + "grad_norm": 0.1417992115020752, + "learning_rate": 8.709803830016454e-06, + "loss": 0.5421, + "step": 1193 + }, + { + "epoch": 0.26, + "grad_norm": 0.13503408432006836, + "learning_rate": 8.707463678004004e-06, + "loss": 0.5036, + "step": 1194 + }, + { + "epoch": 0.26, + "grad_norm": 0.1597423255443573, + "learning_rate": 8.705121720623927e-06, + "loss": 0.5046, + "step": 1195 + }, + { + "epoch": 0.26, + "grad_norm": 0.1646643579006195, + "learning_rate": 8.702777959016647e-06, + "loss": 0.5126, + "step": 1196 + }, + { + "epoch": 0.26, + "grad_norm": 0.18008291721343994, + "learning_rate": 8.700432394323471e-06, + "loss": 0.5419, + "step": 1197 + }, + { + "epoch": 0.26, + "grad_norm": 0.14976496994495392, + "learning_rate": 8.698085027686581e-06, + "loss": 0.5095, + "step": 1198 + }, + { + "epoch": 0.26, + "grad_norm": 0.16157154738903046, + "learning_rate": 8.695735860249041e-06, + "loss": 0.5152, + "step": 1199 + }, + { + "epoch": 0.26, + "grad_norm": 0.16819888353347778, + "learning_rate": 8.69338489315479e-06, + "loss": 0.5401, + "step": 1200 + }, + { + "epoch": 0.26, + "grad_norm": 0.16953587532043457, + "learning_rate": 8.691032127548643e-06, + "loss": 0.5177, + "step": 1201 + }, + { + "epoch": 0.26, + "grad_norm": 0.15358132123947144, + "learning_rate": 8.68867756457629e-06, + "loss": 0.547, + "step": 1202 + }, + { + "epoch": 0.26, + "grad_norm": 0.13902026414871216, + "learning_rate": 8.686321205384296e-06, + "loss": 0.5487, + "step": 1203 + }, + { + "epoch": 0.26, + "grad_norm": 0.1606639176607132, + "learning_rate": 8.683963051120103e-06, + "loss": 0.4611, + "step": 1204 + }, + { + "epoch": 0.26, + "grad_norm": 0.14703510701656342, + "learning_rate": 8.681603102932026e-06, + "loss": 0.4999, + "step": 1205 + }, + { + "epoch": 0.26, + "grad_norm": 0.19730383157730103, + "learning_rate": 8.679241361969252e-06, + "loss": 0.4937, + "step": 1206 + }, + { + "epoch": 0.26, + "grad_norm": 0.1710227131843567, + "learning_rate": 8.676877829381843e-06, + "loss": 0.5255, + "step": 1207 + }, + { + "epoch": 0.26, + "grad_norm": 0.18335406482219696, + "learning_rate": 8.674512506320733e-06, + "loss": 0.603, + "step": 1208 + }, + { + "epoch": 0.26, + "grad_norm": 0.14981816709041595, + "learning_rate": 8.67214539393773e-06, + "loss": 0.4541, + "step": 1209 + }, + { + "epoch": 0.26, + "grad_norm": 0.2136390507221222, + "learning_rate": 8.669776493385506e-06, + "loss": 0.5327, + "step": 1210 + }, + { + "epoch": 0.26, + "grad_norm": 0.1298462301492691, + "learning_rate": 8.667405805817613e-06, + "loss": 0.5373, + "step": 1211 + }, + { + "epoch": 0.26, + "grad_norm": 0.1850888431072235, + "learning_rate": 8.665033332388466e-06, + "loss": 0.5459, + "step": 1212 + }, + { + "epoch": 0.26, + "grad_norm": 0.19591952860355377, + "learning_rate": 8.662659074253355e-06, + "loss": 0.5137, + "step": 1213 + }, + { + "epoch": 0.26, + "grad_norm": 0.13489966094493866, + "learning_rate": 8.660283032568435e-06, + "loss": 0.5468, + "step": 1214 + }, + { + "epoch": 0.26, + "grad_norm": 0.15992878377437592, + "learning_rate": 8.657905208490732e-06, + "loss": 0.5045, + "step": 1215 + }, + { + "epoch": 0.26, + "grad_norm": 0.16097012162208557, + "learning_rate": 8.655525603178137e-06, + "loss": 0.5239, + "step": 1216 + }, + { + "epoch": 0.26, + "grad_norm": 0.17989301681518555, + "learning_rate": 8.653144217789414e-06, + "loss": 0.5239, + "step": 1217 + }, + { + "epoch": 0.26, + "grad_norm": 0.1628495454788208, + "learning_rate": 8.650761053484188e-06, + "loss": 0.5315, + "step": 1218 + }, + { + "epoch": 0.26, + "grad_norm": 0.13146936893463135, + "learning_rate": 8.648376111422954e-06, + "loss": 0.5351, + "step": 1219 + }, + { + "epoch": 0.26, + "grad_norm": 0.212355837225914, + "learning_rate": 8.645989392767068e-06, + "loss": 0.5092, + "step": 1220 + }, + { + "epoch": 0.26, + "grad_norm": 0.11173799633979797, + "learning_rate": 8.643600898678758e-06, + "loss": 0.5176, + "step": 1221 + }, + { + "epoch": 0.26, + "grad_norm": 0.1470513939857483, + "learning_rate": 8.641210630321115e-06, + "loss": 0.5159, + "step": 1222 + }, + { + "epoch": 0.26, + "grad_norm": 0.1474858522415161, + "learning_rate": 8.638818588858084e-06, + "loss": 0.5103, + "step": 1223 + }, + { + "epoch": 0.26, + "grad_norm": 0.13153494894504547, + "learning_rate": 8.636424775454489e-06, + "loss": 0.5596, + "step": 1224 + }, + { + "epoch": 0.26, + "grad_norm": 0.1469038426876068, + "learning_rate": 8.634029191276003e-06, + "loss": 0.5363, + "step": 1225 + }, + { + "epoch": 0.26, + "grad_norm": 0.15724244713783264, + "learning_rate": 8.631631837489173e-06, + "loss": 0.5318, + "step": 1226 + }, + { + "epoch": 0.26, + "grad_norm": 0.16701483726501465, + "learning_rate": 8.6292327152614e-06, + "loss": 0.5219, + "step": 1227 + }, + { + "epoch": 0.26, + "grad_norm": 0.1822412610054016, + "learning_rate": 8.626831825760946e-06, + "loss": 0.5067, + "step": 1228 + }, + { + "epoch": 0.26, + "grad_norm": 0.14838603138923645, + "learning_rate": 8.62442917015694e-06, + "loss": 0.5298, + "step": 1229 + }, + { + "epoch": 0.26, + "grad_norm": 0.13148529827594757, + "learning_rate": 8.622024749619363e-06, + "loss": 0.4947, + "step": 1230 + }, + { + "epoch": 0.27, + "grad_norm": 0.1674978882074356, + "learning_rate": 8.619618565319063e-06, + "loss": 0.5674, + "step": 1231 + }, + { + "epoch": 0.27, + "grad_norm": 0.2056237906217575, + "learning_rate": 8.61721061842774e-06, + "loss": 0.4931, + "step": 1232 + }, + { + "epoch": 0.27, + "grad_norm": 0.1400204300880432, + "learning_rate": 8.614800910117958e-06, + "loss": 0.543, + "step": 1233 + }, + { + "epoch": 0.27, + "grad_norm": 0.1407189816236496, + "learning_rate": 8.612389441563136e-06, + "loss": 0.5108, + "step": 1234 + }, + { + "epoch": 0.27, + "grad_norm": 0.1611378788948059, + "learning_rate": 8.60997621393755e-06, + "loss": 0.4961, + "step": 1235 + }, + { + "epoch": 0.27, + "grad_norm": 0.1521531641483307, + "learning_rate": 8.60756122841633e-06, + "loss": 0.4755, + "step": 1236 + }, + { + "epoch": 0.27, + "grad_norm": 0.14714032411575317, + "learning_rate": 8.60514448617547e-06, + "loss": 0.5365, + "step": 1237 + }, + { + "epoch": 0.27, + "grad_norm": 0.17980900406837463, + "learning_rate": 8.602725988391814e-06, + "loss": 0.5424, + "step": 1238 + }, + { + "epoch": 0.27, + "grad_norm": 0.16438312828540802, + "learning_rate": 8.600305736243057e-06, + "loss": 0.5523, + "step": 1239 + }, + { + "epoch": 0.27, + "grad_norm": 0.1427246630191803, + "learning_rate": 8.597883730907757e-06, + "loss": 0.5091, + "step": 1240 + }, + { + "epoch": 0.27, + "grad_norm": 0.1325269341468811, + "learning_rate": 8.59545997356532e-06, + "loss": 0.481, + "step": 1241 + }, + { + "epoch": 0.27, + "grad_norm": 0.17241443693637848, + "learning_rate": 8.593034465396007e-06, + "loss": 0.5071, + "step": 1242 + }, + { + "epoch": 0.27, + "grad_norm": 0.14038234949111938, + "learning_rate": 8.590607207580927e-06, + "loss": 0.5394, + "step": 1243 + }, + { + "epoch": 0.27, + "grad_norm": 0.20857305824756622, + "learning_rate": 8.588178201302052e-06, + "loss": 0.4944, + "step": 1244 + }, + { + "epoch": 0.27, + "grad_norm": 0.1448458433151245, + "learning_rate": 8.585747447742194e-06, + "loss": 0.52, + "step": 1245 + }, + { + "epoch": 0.27, + "grad_norm": 0.17979028820991516, + "learning_rate": 8.583314948085023e-06, + "loss": 0.5241, + "step": 1246 + }, + { + "epoch": 0.27, + "grad_norm": 0.16653123497962952, + "learning_rate": 8.580880703515052e-06, + "loss": 0.5061, + "step": 1247 + }, + { + "epoch": 0.27, + "grad_norm": 0.2052346169948578, + "learning_rate": 8.578444715217652e-06, + "loss": 0.471, + "step": 1248 + }, + { + "epoch": 0.27, + "grad_norm": 0.1382577270269394, + "learning_rate": 8.576006984379042e-06, + "loss": 0.4621, + "step": 1249 + }, + { + "epoch": 0.27, + "grad_norm": 0.17501065135002136, + "learning_rate": 8.57356751218628e-06, + "loss": 0.5761, + "step": 1250 + }, + { + "epoch": 0.27, + "grad_norm": 0.14629067480564117, + "learning_rate": 8.571126299827284e-06, + "loss": 0.511, + "step": 1251 + }, + { + "epoch": 0.27, + "grad_norm": 0.16205544769763947, + "learning_rate": 8.568683348490817e-06, + "loss": 0.5259, + "step": 1252 + }, + { + "epoch": 0.27, + "grad_norm": 0.14176106452941895, + "learning_rate": 8.566238659366477e-06, + "loss": 0.5333, + "step": 1253 + }, + { + "epoch": 0.27, + "grad_norm": 0.27345001697540283, + "learning_rate": 8.563792233644725e-06, + "loss": 0.5117, + "step": 1254 + }, + { + "epoch": 0.27, + "grad_norm": 0.16053150594234467, + "learning_rate": 8.561344072516858e-06, + "loss": 0.5015, + "step": 1255 + }, + { + "epoch": 0.27, + "grad_norm": 0.19150519371032715, + "learning_rate": 8.558894177175019e-06, + "loss": 0.5326, + "step": 1256 + }, + { + "epoch": 0.27, + "grad_norm": 0.14895778894424438, + "learning_rate": 8.556442548812198e-06, + "loss": 0.5247, + "step": 1257 + }, + { + "epoch": 0.27, + "grad_norm": 0.16230621933937073, + "learning_rate": 8.553989188622228e-06, + "loss": 0.5634, + "step": 1258 + }, + { + "epoch": 0.27, + "grad_norm": 0.15796539187431335, + "learning_rate": 8.55153409779978e-06, + "loss": 0.5686, + "step": 1259 + }, + { + "epoch": 0.27, + "grad_norm": 0.15374596416950226, + "learning_rate": 8.549077277540379e-06, + "loss": 0.5287, + "step": 1260 + }, + { + "epoch": 0.27, + "grad_norm": 0.14890524744987488, + "learning_rate": 8.546618729040382e-06, + "loss": 0.5112, + "step": 1261 + }, + { + "epoch": 0.27, + "grad_norm": 0.1993798166513443, + "learning_rate": 8.544158453496992e-06, + "loss": 0.5229, + "step": 1262 + }, + { + "epoch": 0.27, + "grad_norm": 0.16211991012096405, + "learning_rate": 8.541696452108253e-06, + "loss": 0.5332, + "step": 1263 + }, + { + "epoch": 0.27, + "grad_norm": 0.2108837217092514, + "learning_rate": 8.539232726073046e-06, + "loss": 0.5223, + "step": 1264 + }, + { + "epoch": 0.27, + "grad_norm": 0.14320197701454163, + "learning_rate": 8.536767276591098e-06, + "loss": 0.4906, + "step": 1265 + }, + { + "epoch": 0.27, + "grad_norm": 0.14289528131484985, + "learning_rate": 8.53430010486297e-06, + "loss": 0.5253, + "step": 1266 + }, + { + "epoch": 0.27, + "grad_norm": 0.1269850730895996, + "learning_rate": 8.531831212090062e-06, + "loss": 0.5145, + "step": 1267 + }, + { + "epoch": 0.27, + "grad_norm": 0.18504297733306885, + "learning_rate": 8.529360599474616e-06, + "loss": 0.4976, + "step": 1268 + }, + { + "epoch": 0.27, + "grad_norm": 0.13720788061618805, + "learning_rate": 8.52688826821971e-06, + "loss": 0.4952, + "step": 1269 + }, + { + "epoch": 0.27, + "grad_norm": 0.2334408462047577, + "learning_rate": 8.524414219529253e-06, + "loss": 0.5416, + "step": 1270 + }, + { + "epoch": 0.27, + "grad_norm": 0.21838751435279846, + "learning_rate": 8.521938454608e-06, + "loss": 0.5012, + "step": 1271 + }, + { + "epoch": 0.27, + "grad_norm": 0.143874391913414, + "learning_rate": 8.519460974661533e-06, + "loss": 0.5323, + "step": 1272 + }, + { + "epoch": 0.27, + "grad_norm": 0.14506854116916656, + "learning_rate": 8.516981780896276e-06, + "loss": 0.5148, + "step": 1273 + }, + { + "epoch": 0.27, + "grad_norm": 0.1657627373933792, + "learning_rate": 8.514500874519483e-06, + "loss": 0.5507, + "step": 1274 + }, + { + "epoch": 0.27, + "grad_norm": 0.15067879855632782, + "learning_rate": 8.512018256739242e-06, + "loss": 0.4994, + "step": 1275 + }, + { + "epoch": 0.27, + "grad_norm": 0.1645599901676178, + "learning_rate": 8.509533928764482e-06, + "loss": 0.5025, + "step": 1276 + }, + { + "epoch": 0.28, + "grad_norm": 0.14725331962108612, + "learning_rate": 8.507047891804951e-06, + "loss": 0.5635, + "step": 1277 + }, + { + "epoch": 0.28, + "grad_norm": 0.16245393455028534, + "learning_rate": 8.50456014707124e-06, + "loss": 0.4446, + "step": 1278 + }, + { + "epoch": 0.28, + "grad_norm": 0.14229734241962433, + "learning_rate": 8.502070695774771e-06, + "loss": 0.5043, + "step": 1279 + }, + { + "epoch": 0.28, + "grad_norm": 0.20700879395008087, + "learning_rate": 8.499579539127794e-06, + "loss": 0.487, + "step": 1280 + }, + { + "epoch": 0.28, + "grad_norm": 0.1793096512556076, + "learning_rate": 8.497086678343385e-06, + "loss": 0.5082, + "step": 1281 + }, + { + "epoch": 0.28, + "grad_norm": 0.14241085946559906, + "learning_rate": 8.494592114635458e-06, + "loss": 0.5334, + "step": 1282 + }, + { + "epoch": 0.28, + "grad_norm": 0.1370537132024765, + "learning_rate": 8.492095849218756e-06, + "loss": 0.5242, + "step": 1283 + }, + { + "epoch": 0.28, + "grad_norm": 0.1460958868265152, + "learning_rate": 8.489597883308844e-06, + "loss": 0.5325, + "step": 1284 + }, + { + "epoch": 0.28, + "grad_norm": 0.18947859108448029, + "learning_rate": 8.487098218122119e-06, + "loss": 0.5344, + "step": 1285 + }, + { + "epoch": 0.28, + "grad_norm": 0.2026044875383377, + "learning_rate": 8.484596854875806e-06, + "loss": 0.5627, + "step": 1286 + }, + { + "epoch": 0.28, + "grad_norm": 0.13377788662910461, + "learning_rate": 8.482093794787956e-06, + "loss": 0.5525, + "step": 1287 + }, + { + "epoch": 0.28, + "grad_norm": 0.22986631095409393, + "learning_rate": 8.479589039077446e-06, + "loss": 0.5288, + "step": 1288 + }, + { + "epoch": 0.28, + "grad_norm": 0.17068606615066528, + "learning_rate": 8.47708258896398e-06, + "loss": 0.5352, + "step": 1289 + }, + { + "epoch": 0.28, + "grad_norm": 0.15582841634750366, + "learning_rate": 8.474574445668085e-06, + "loss": 0.5475, + "step": 1290 + }, + { + "epoch": 0.28, + "grad_norm": 0.19104814529418945, + "learning_rate": 8.472064610411115e-06, + "loss": 0.5225, + "step": 1291 + }, + { + "epoch": 0.28, + "grad_norm": 0.12952920794487, + "learning_rate": 8.469553084415247e-06, + "loss": 0.4927, + "step": 1292 + }, + { + "epoch": 0.28, + "grad_norm": 0.32774683833122253, + "learning_rate": 8.467039868903477e-06, + "loss": 0.5286, + "step": 1293 + }, + { + "epoch": 0.28, + "grad_norm": 0.16002535820007324, + "learning_rate": 8.464524965099632e-06, + "loss": 0.5124, + "step": 1294 + }, + { + "epoch": 0.28, + "grad_norm": 0.15826278924942017, + "learning_rate": 8.462008374228356e-06, + "loss": 0.5502, + "step": 1295 + }, + { + "epoch": 0.28, + "grad_norm": 0.1503647416830063, + "learning_rate": 8.459490097515114e-06, + "loss": 0.5833, + "step": 1296 + }, + { + "epoch": 0.28, + "grad_norm": 0.18131448328495026, + "learning_rate": 8.456970136186193e-06, + "loss": 0.4606, + "step": 1297 + }, + { + "epoch": 0.28, + "grad_norm": 0.16622257232666016, + "learning_rate": 8.454448491468702e-06, + "loss": 0.5207, + "step": 1298 + }, + { + "epoch": 0.28, + "grad_norm": 0.16979950666427612, + "learning_rate": 8.451925164590568e-06, + "loss": 0.5655, + "step": 1299 + }, + { + "epoch": 0.28, + "grad_norm": 0.19531172513961792, + "learning_rate": 8.449400156780536e-06, + "loss": 0.4779, + "step": 1300 + }, + { + "epoch": 0.28, + "grad_norm": 0.17314670979976654, + "learning_rate": 8.44687346926817e-06, + "loss": 0.5046, + "step": 1301 + }, + { + "epoch": 0.28, + "grad_norm": 0.1429021954536438, + "learning_rate": 8.444345103283858e-06, + "loss": 0.527, + "step": 1302 + }, + { + "epoch": 0.28, + "grad_norm": 0.19530290365219116, + "learning_rate": 8.441815060058795e-06, + "loss": 0.518, + "step": 1303 + }, + { + "epoch": 0.28, + "grad_norm": 0.1742294281721115, + "learning_rate": 8.439283340825002e-06, + "loss": 0.5443, + "step": 1304 + }, + { + "epoch": 0.28, + "grad_norm": 0.18429934978485107, + "learning_rate": 8.436749946815308e-06, + "loss": 0.5474, + "step": 1305 + }, + { + "epoch": 0.28, + "grad_norm": 0.1543246954679489, + "learning_rate": 8.434214879263365e-06, + "loss": 0.5142, + "step": 1306 + }, + { + "epoch": 0.28, + "grad_norm": 0.16444545984268188, + "learning_rate": 8.431678139403635e-06, + "loss": 0.5534, + "step": 1307 + }, + { + "epoch": 0.28, + "grad_norm": 0.19701968133449554, + "learning_rate": 8.429139728471395e-06, + "loss": 0.5156, + "step": 1308 + }, + { + "epoch": 0.28, + "grad_norm": 0.14688943326473236, + "learning_rate": 8.426599647702738e-06, + "loss": 0.5208, + "step": 1309 + }, + { + "epoch": 0.28, + "grad_norm": 0.19136419892311096, + "learning_rate": 8.424057898334569e-06, + "loss": 0.6148, + "step": 1310 + }, + { + "epoch": 0.28, + "grad_norm": 0.17055533826351166, + "learning_rate": 8.421514481604605e-06, + "loss": 0.5107, + "step": 1311 + }, + { + "epoch": 0.28, + "grad_norm": 0.16385668516159058, + "learning_rate": 8.418969398751375e-06, + "loss": 0.502, + "step": 1312 + }, + { + "epoch": 0.28, + "grad_norm": 0.17869453132152557, + "learning_rate": 8.41642265101422e-06, + "loss": 0.5464, + "step": 1313 + }, + { + "epoch": 0.28, + "grad_norm": 0.14309388399124146, + "learning_rate": 8.413874239633291e-06, + "loss": 0.5585, + "step": 1314 + }, + { + "epoch": 0.28, + "grad_norm": 0.16163702309131622, + "learning_rate": 8.41132416584955e-06, + "loss": 0.553, + "step": 1315 + }, + { + "epoch": 0.28, + "grad_norm": 0.15878815948963165, + "learning_rate": 8.408772430904768e-06, + "loss": 0.5359, + "step": 1316 + }, + { + "epoch": 0.28, + "grad_norm": 0.14803734421730042, + "learning_rate": 8.406219036041523e-06, + "loss": 0.5177, + "step": 1317 + }, + { + "epoch": 0.28, + "grad_norm": 0.16167186200618744, + "learning_rate": 8.403663982503205e-06, + "loss": 0.5106, + "step": 1318 + }, + { + "epoch": 0.28, + "grad_norm": 0.14223089814186096, + "learning_rate": 8.40110727153401e-06, + "loss": 0.4768, + "step": 1319 + }, + { + "epoch": 0.28, + "grad_norm": 0.1392257660627365, + "learning_rate": 8.398548904378938e-06, + "loss": 0.4928, + "step": 1320 + }, + { + "epoch": 0.28, + "grad_norm": 0.1703733652830124, + "learning_rate": 8.395988882283803e-06, + "loss": 0.462, + "step": 1321 + }, + { + "epoch": 0.28, + "grad_norm": 0.14999133348464966, + "learning_rate": 8.393427206495217e-06, + "loss": 0.5035, + "step": 1322 + }, + { + "epoch": 0.28, + "grad_norm": 0.18849503993988037, + "learning_rate": 8.390863878260602e-06, + "loss": 0.5025, + "step": 1323 + }, + { + "epoch": 0.29, + "grad_norm": 0.2667754888534546, + "learning_rate": 8.388298898828182e-06, + "loss": 0.517, + "step": 1324 + }, + { + "epoch": 0.29, + "grad_norm": 0.1366441398859024, + "learning_rate": 8.385732269446987e-06, + "loss": 0.4938, + "step": 1325 + }, + { + "epoch": 0.29, + "grad_norm": 0.16878017783164978, + "learning_rate": 8.383163991366852e-06, + "loss": 0.5057, + "step": 1326 + }, + { + "epoch": 0.29, + "grad_norm": 0.14408189058303833, + "learning_rate": 8.38059406583841e-06, + "loss": 0.5197, + "step": 1327 + }, + { + "epoch": 0.29, + "grad_norm": 0.14448203146457672, + "learning_rate": 8.378022494113099e-06, + "loss": 0.5289, + "step": 1328 + }, + { + "epoch": 0.29, + "grad_norm": 0.1776053011417389, + "learning_rate": 8.37544927744316e-06, + "loss": 0.529, + "step": 1329 + }, + { + "epoch": 0.29, + "grad_norm": 0.1904003769159317, + "learning_rate": 8.372874417081632e-06, + "loss": 0.5253, + "step": 1330 + }, + { + "epoch": 0.29, + "grad_norm": 0.15336477756500244, + "learning_rate": 8.370297914282354e-06, + "loss": 0.5307, + "step": 1331 + }, + { + "epoch": 0.29, + "grad_norm": 0.1891254037618637, + "learning_rate": 8.367719770299972e-06, + "loss": 0.5089, + "step": 1332 + }, + { + "epoch": 0.29, + "grad_norm": 0.22274090349674225, + "learning_rate": 8.36513998638992e-06, + "loss": 0.5328, + "step": 1333 + }, + { + "epoch": 0.29, + "grad_norm": 0.1466333568096161, + "learning_rate": 8.36255856380844e-06, + "loss": 0.5408, + "step": 1334 + }, + { + "epoch": 0.29, + "grad_norm": 0.15075673162937164, + "learning_rate": 8.359975503812569e-06, + "loss": 0.5402, + "step": 1335 + }, + { + "epoch": 0.29, + "grad_norm": 0.1457648128271103, + "learning_rate": 8.35739080766014e-06, + "loss": 0.5256, + "step": 1336 + }, + { + "epoch": 0.29, + "grad_norm": 0.1647207885980606, + "learning_rate": 8.35480447660978e-06, + "loss": 0.5204, + "step": 1337 + }, + { + "epoch": 0.29, + "grad_norm": 0.16034474968910217, + "learning_rate": 8.352216511920921e-06, + "loss": 0.5282, + "step": 1338 + }, + { + "epoch": 0.29, + "grad_norm": 0.1303335428237915, + "learning_rate": 8.349626914853781e-06, + "loss": 0.4993, + "step": 1339 + }, + { + "epoch": 0.29, + "grad_norm": 0.17350099980831146, + "learning_rate": 8.34703568666938e-06, + "loss": 0.6363, + "step": 1340 + }, + { + "epoch": 0.29, + "grad_norm": 0.16359736025333405, + "learning_rate": 8.344442828629526e-06, + "loss": 0.5418, + "step": 1341 + }, + { + "epoch": 0.29, + "grad_norm": 0.1771382838487625, + "learning_rate": 8.341848341996828e-06, + "loss": 0.5243, + "step": 1342 + }, + { + "epoch": 0.29, + "grad_norm": 0.14461980760097504, + "learning_rate": 8.33925222803468e-06, + "loss": 0.5308, + "step": 1343 + }, + { + "epoch": 0.29, + "grad_norm": 0.19642101228237152, + "learning_rate": 8.336654488007277e-06, + "loss": 0.5189, + "step": 1344 + }, + { + "epoch": 0.29, + "grad_norm": 0.18800689280033112, + "learning_rate": 8.334055123179596e-06, + "loss": 0.5177, + "step": 1345 + }, + { + "epoch": 0.29, + "grad_norm": 0.20820565521717072, + "learning_rate": 8.331454134817414e-06, + "loss": 0.5033, + "step": 1346 + }, + { + "epoch": 0.29, + "grad_norm": 0.15935355424880981, + "learning_rate": 8.328851524187292e-06, + "loss": 0.4901, + "step": 1347 + }, + { + "epoch": 0.29, + "grad_norm": 0.15410637855529785, + "learning_rate": 8.326247292556588e-06, + "loss": 0.5402, + "step": 1348 + }, + { + "epoch": 0.29, + "grad_norm": 0.21510785818099976, + "learning_rate": 8.323641441193441e-06, + "loss": 0.5414, + "step": 1349 + }, + { + "epoch": 0.29, + "grad_norm": 0.20484770834445953, + "learning_rate": 8.321033971366788e-06, + "loss": 0.4995, + "step": 1350 + }, + { + "epoch": 0.29, + "grad_norm": 0.15138699114322662, + "learning_rate": 8.318424884346347e-06, + "loss": 0.5191, + "step": 1351 + }, + { + "epoch": 0.29, + "grad_norm": 0.1576775163412094, + "learning_rate": 8.315814181402623e-06, + "loss": 0.5358, + "step": 1352 + }, + { + "epoch": 0.29, + "grad_norm": 0.15024110674858093, + "learning_rate": 8.313201863806915e-06, + "loss": 0.4613, + "step": 1353 + }, + { + "epoch": 0.29, + "grad_norm": 0.15514235198497772, + "learning_rate": 8.310587932831302e-06, + "loss": 0.4951, + "step": 1354 + }, + { + "epoch": 0.29, + "grad_norm": 0.20852284133434296, + "learning_rate": 8.30797238974865e-06, + "loss": 0.5085, + "step": 1355 + }, + { + "epoch": 0.29, + "grad_norm": 0.15601487457752228, + "learning_rate": 8.305355235832611e-06, + "loss": 0.5467, + "step": 1356 + }, + { + "epoch": 0.29, + "grad_norm": 0.22823049128055573, + "learning_rate": 8.30273647235762e-06, + "loss": 0.5444, + "step": 1357 + }, + { + "epoch": 0.29, + "grad_norm": 0.17297740280628204, + "learning_rate": 8.300116100598899e-06, + "loss": 0.4745, + "step": 1358 + }, + { + "epoch": 0.29, + "grad_norm": 0.16721418499946594, + "learning_rate": 8.297494121832449e-06, + "loss": 0.5331, + "step": 1359 + }, + { + "epoch": 0.29, + "grad_norm": 0.20764422416687012, + "learning_rate": 8.294870537335054e-06, + "loss": 0.5123, + "step": 1360 + }, + { + "epoch": 0.29, + "grad_norm": 0.12124624103307724, + "learning_rate": 8.292245348384285e-06, + "loss": 0.4942, + "step": 1361 + }, + { + "epoch": 0.29, + "grad_norm": 0.18373292684555054, + "learning_rate": 8.28961855625849e-06, + "loss": 0.6003, + "step": 1362 + }, + { + "epoch": 0.29, + "grad_norm": 0.15665894746780396, + "learning_rate": 8.286990162236796e-06, + "loss": 0.5199, + "step": 1363 + }, + { + "epoch": 0.29, + "grad_norm": 0.18932463228702545, + "learning_rate": 8.284360167599113e-06, + "loss": 0.5577, + "step": 1364 + }, + { + "epoch": 0.29, + "grad_norm": 0.14339394867420197, + "learning_rate": 8.28172857362613e-06, + "loss": 0.5319, + "step": 1365 + }, + { + "epoch": 0.29, + "grad_norm": 0.16630741953849792, + "learning_rate": 8.279095381599318e-06, + "loss": 0.506, + "step": 1366 + }, + { + "epoch": 0.29, + "grad_norm": 0.15607817471027374, + "learning_rate": 8.27646059280092e-06, + "loss": 0.5348, + "step": 1367 + }, + { + "epoch": 0.29, + "grad_norm": 0.1827673465013504, + "learning_rate": 8.273824208513956e-06, + "loss": 0.5234, + "step": 1368 + }, + { + "epoch": 0.29, + "grad_norm": 0.18514670431613922, + "learning_rate": 8.27118623002223e-06, + "loss": 0.4667, + "step": 1369 + }, + { + "epoch": 0.3, + "grad_norm": 0.14588609337806702, + "learning_rate": 8.268546658610319e-06, + "loss": 0.4641, + "step": 1370 + }, + { + "epoch": 0.3, + "grad_norm": 0.14752966165542603, + "learning_rate": 8.265905495563573e-06, + "loss": 0.4737, + "step": 1371 + }, + { + "epoch": 0.3, + "grad_norm": 0.18035411834716797, + "learning_rate": 8.26326274216812e-06, + "loss": 0.5087, + "step": 1372 + }, + { + "epoch": 0.3, + "grad_norm": 0.14755289256572723, + "learning_rate": 8.260618399710864e-06, + "loss": 0.5454, + "step": 1373 + }, + { + "epoch": 0.3, + "grad_norm": 0.18107686936855316, + "learning_rate": 8.257972469479478e-06, + "loss": 0.469, + "step": 1374 + }, + { + "epoch": 0.3, + "grad_norm": 0.13992854952812195, + "learning_rate": 8.255324952762413e-06, + "loss": 0.4561, + "step": 1375 + }, + { + "epoch": 0.3, + "grad_norm": 0.18599078059196472, + "learning_rate": 8.252675850848886e-06, + "loss": 0.4449, + "step": 1376 + }, + { + "epoch": 0.3, + "grad_norm": 0.14460837841033936, + "learning_rate": 8.250025165028897e-06, + "loss": 0.5144, + "step": 1377 + }, + { + "epoch": 0.3, + "grad_norm": 0.15791229903697968, + "learning_rate": 8.247372896593203e-06, + "loss": 0.5268, + "step": 1378 + }, + { + "epoch": 0.3, + "grad_norm": 0.15533843636512756, + "learning_rate": 8.244719046833342e-06, + "loss": 0.5176, + "step": 1379 + }, + { + "epoch": 0.3, + "grad_norm": 0.16106192767620087, + "learning_rate": 8.24206361704162e-06, + "loss": 0.5609, + "step": 1380 + }, + { + "epoch": 0.3, + "grad_norm": 0.1757259964942932, + "learning_rate": 8.239406608511113e-06, + "loss": 0.5459, + "step": 1381 + }, + { + "epoch": 0.3, + "grad_norm": 0.14974632859230042, + "learning_rate": 8.236748022535662e-06, + "loss": 0.5193, + "step": 1382 + }, + { + "epoch": 0.3, + "grad_norm": 0.16588665544986725, + "learning_rate": 8.23408786040988e-06, + "loss": 0.5399, + "step": 1383 + }, + { + "epoch": 0.3, + "grad_norm": 0.18392562866210938, + "learning_rate": 8.231426123429143e-06, + "loss": 0.5266, + "step": 1384 + }, + { + "epoch": 0.3, + "grad_norm": 0.15321050584316254, + "learning_rate": 8.2287628128896e-06, + "loss": 0.5206, + "step": 1385 + }, + { + "epoch": 0.3, + "grad_norm": 0.25465235114097595, + "learning_rate": 8.226097930088162e-06, + "loss": 0.5679, + "step": 1386 + }, + { + "epoch": 0.3, + "grad_norm": 0.16098381578922272, + "learning_rate": 8.223431476322508e-06, + "loss": 0.501, + "step": 1387 + }, + { + "epoch": 0.3, + "grad_norm": 0.18890248239040375, + "learning_rate": 8.220763452891078e-06, + "loss": 0.5524, + "step": 1388 + }, + { + "epoch": 0.3, + "grad_norm": 0.19365254044532776, + "learning_rate": 8.218093861093082e-06, + "loss": 0.4858, + "step": 1389 + }, + { + "epoch": 0.3, + "grad_norm": 0.13747772574424744, + "learning_rate": 8.215422702228487e-06, + "loss": 0.5109, + "step": 1390 + }, + { + "epoch": 0.3, + "grad_norm": 0.1644936501979828, + "learning_rate": 8.212749977598032e-06, + "loss": 0.4996, + "step": 1391 + }, + { + "epoch": 0.3, + "grad_norm": 0.17819000780582428, + "learning_rate": 8.210075688503209e-06, + "loss": 0.5312, + "step": 1392 + }, + { + "epoch": 0.3, + "grad_norm": 0.15765920281410217, + "learning_rate": 8.207399836246278e-06, + "loss": 0.5171, + "step": 1393 + }, + { + "epoch": 0.3, + "grad_norm": 0.20357385277748108, + "learning_rate": 8.20472242213026e-06, + "loss": 0.5364, + "step": 1394 + }, + { + "epoch": 0.3, + "grad_norm": 0.15080830454826355, + "learning_rate": 8.202043447458934e-06, + "loss": 0.5169, + "step": 1395 + }, + { + "epoch": 0.3, + "grad_norm": 0.15993140637874603, + "learning_rate": 8.199362913536837e-06, + "loss": 0.6155, + "step": 1396 + }, + { + "epoch": 0.3, + "grad_norm": 0.18161435425281525, + "learning_rate": 8.19668082166927e-06, + "loss": 0.5493, + "step": 1397 + }, + { + "epoch": 0.3, + "grad_norm": 0.1412186175584793, + "learning_rate": 8.193997173162293e-06, + "loss": 0.5242, + "step": 1398 + }, + { + "epoch": 0.3, + "grad_norm": 0.15259157121181488, + "learning_rate": 8.19131196932272e-06, + "loss": 0.5644, + "step": 1399 + }, + { + "epoch": 0.3, + "grad_norm": 0.2190113365650177, + "learning_rate": 8.188625211458123e-06, + "loss": 0.541, + "step": 1400 + }, + { + "epoch": 0.3, + "grad_norm": 0.17318737506866455, + "learning_rate": 8.185936900876834e-06, + "loss": 0.5085, + "step": 1401 + }, + { + "epoch": 0.3, + "grad_norm": 0.16196967661380768, + "learning_rate": 8.183247038887937e-06, + "loss": 0.485, + "step": 1402 + }, + { + "epoch": 0.3, + "grad_norm": 0.19770100712776184, + "learning_rate": 8.180555626801274e-06, + "loss": 0.5142, + "step": 1403 + }, + { + "epoch": 0.3, + "grad_norm": 0.1743081659078598, + "learning_rate": 8.177862665927445e-06, + "loss": 0.565, + "step": 1404 + }, + { + "epoch": 0.3, + "grad_norm": 0.18734456598758698, + "learning_rate": 8.175168157577795e-06, + "loss": 0.5631, + "step": 1405 + }, + { + "epoch": 0.3, + "grad_norm": 0.15591241419315338, + "learning_rate": 8.17247210306443e-06, + "loss": 0.4886, + "step": 1406 + }, + { + "epoch": 0.3, + "grad_norm": 0.20416924357414246, + "learning_rate": 8.169774503700209e-06, + "loss": 0.5232, + "step": 1407 + }, + { + "epoch": 0.3, + "grad_norm": 0.1668728142976761, + "learning_rate": 8.167075360798739e-06, + "loss": 0.5058, + "step": 1408 + }, + { + "epoch": 0.3, + "grad_norm": 0.1554676592350006, + "learning_rate": 8.164374675674382e-06, + "loss": 0.5154, + "step": 1409 + }, + { + "epoch": 0.3, + "grad_norm": 0.2015198916196823, + "learning_rate": 8.161672449642248e-06, + "loss": 0.482, + "step": 1410 + }, + { + "epoch": 0.3, + "grad_norm": 0.13508014380931854, + "learning_rate": 8.158968684018202e-06, + "loss": 0.5501, + "step": 1411 + }, + { + "epoch": 0.3, + "grad_norm": 0.18742331862449646, + "learning_rate": 8.156263380118855e-06, + "loss": 0.5439, + "step": 1412 + }, + { + "epoch": 0.3, + "grad_norm": 0.13899442553520203, + "learning_rate": 8.153556539261566e-06, + "loss": 0.4965, + "step": 1413 + }, + { + "epoch": 0.3, + "grad_norm": 0.15461724996566772, + "learning_rate": 8.150848162764448e-06, + "loss": 0.5158, + "step": 1414 + }, + { + "epoch": 0.3, + "grad_norm": 0.1699683964252472, + "learning_rate": 8.148138251946355e-06, + "loss": 0.5345, + "step": 1415 + }, + { + "epoch": 0.31, + "grad_norm": 0.1647995263338089, + "learning_rate": 8.145426808126894e-06, + "loss": 0.5417, + "step": 1416 + }, + { + "epoch": 0.31, + "grad_norm": 0.15304109454154968, + "learning_rate": 8.142713832626412e-06, + "loss": 0.5546, + "step": 1417 + }, + { + "epoch": 0.31, + "grad_norm": 0.12711341679096222, + "learning_rate": 8.139999326766011e-06, + "loss": 0.5176, + "step": 1418 + }, + { + "epoch": 0.31, + "grad_norm": 0.15692314505577087, + "learning_rate": 8.137283291867527e-06, + "loss": 0.4648, + "step": 1419 + }, + { + "epoch": 0.31, + "grad_norm": 0.16730400919914246, + "learning_rate": 8.134565729253554e-06, + "loss": 0.5099, + "step": 1420 + }, + { + "epoch": 0.31, + "grad_norm": 0.15150144696235657, + "learning_rate": 8.131846640247415e-06, + "loss": 0.5261, + "step": 1421 + }, + { + "epoch": 0.31, + "grad_norm": 0.25064417719841003, + "learning_rate": 8.129126026173189e-06, + "loss": 0.5097, + "step": 1422 + }, + { + "epoch": 0.31, + "grad_norm": 0.1557064801454544, + "learning_rate": 8.126403888355689e-06, + "loss": 0.4951, + "step": 1423 + }, + { + "epoch": 0.31, + "grad_norm": 0.17393703758716583, + "learning_rate": 8.123680228120474e-06, + "loss": 0.5257, + "step": 1424 + }, + { + "epoch": 0.31, + "grad_norm": 0.1844862401485443, + "learning_rate": 8.120955046793847e-06, + "loss": 0.5361, + "step": 1425 + }, + { + "epoch": 0.31, + "grad_norm": 0.17331448197364807, + "learning_rate": 8.118228345702843e-06, + "loss": 0.5718, + "step": 1426 + }, + { + "epoch": 0.31, + "grad_norm": 0.19549396634101868, + "learning_rate": 8.115500126175246e-06, + "loss": 0.5322, + "step": 1427 + }, + { + "epoch": 0.31, + "grad_norm": 0.16723619401454926, + "learning_rate": 8.112770389539574e-06, + "loss": 0.5048, + "step": 1428 + }, + { + "epoch": 0.31, + "grad_norm": 0.15985050797462463, + "learning_rate": 8.11003913712509e-06, + "loss": 0.4759, + "step": 1429 + }, + { + "epoch": 0.31, + "grad_norm": 0.16711269319057465, + "learning_rate": 8.107306370261785e-06, + "loss": 0.5433, + "step": 1430 + }, + { + "epoch": 0.31, + "grad_norm": 0.15856465697288513, + "learning_rate": 8.104572090280397e-06, + "loss": 0.5132, + "step": 1431 + }, + { + "epoch": 0.31, + "grad_norm": 0.14167572557926178, + "learning_rate": 8.101836298512396e-06, + "loss": 0.4879, + "step": 1432 + }, + { + "epoch": 0.31, + "grad_norm": 0.17282311618328094, + "learning_rate": 8.099098996289986e-06, + "loss": 0.5943, + "step": 1433 + }, + { + "epoch": 0.31, + "grad_norm": 0.1634991616010666, + "learning_rate": 8.096360184946117e-06, + "loss": 0.5256, + "step": 1434 + }, + { + "epoch": 0.31, + "grad_norm": 0.17868229746818542, + "learning_rate": 8.093619865814461e-06, + "loss": 0.5314, + "step": 1435 + }, + { + "epoch": 0.31, + "grad_norm": 0.17916221916675568, + "learning_rate": 8.09087804022943e-06, + "loss": 0.5192, + "step": 1436 + }, + { + "epoch": 0.31, + "grad_norm": 0.15131542086601257, + "learning_rate": 8.088134709526174e-06, + "loss": 0.4965, + "step": 1437 + }, + { + "epoch": 0.31, + "grad_norm": 0.15476344525814056, + "learning_rate": 8.085389875040566e-06, + "loss": 0.547, + "step": 1438 + }, + { + "epoch": 0.31, + "grad_norm": 0.18421463668346405, + "learning_rate": 8.082643538109217e-06, + "loss": 0.5478, + "step": 1439 + }, + { + "epoch": 0.31, + "grad_norm": 0.1662701666355133, + "learning_rate": 8.079895700069473e-06, + "loss": 0.5092, + "step": 1440 + }, + { + "epoch": 0.31, + "grad_norm": 0.18112128973007202, + "learning_rate": 8.077146362259405e-06, + "loss": 0.5242, + "step": 1441 + }, + { + "epoch": 0.31, + "grad_norm": 0.13690048456192017, + "learning_rate": 8.074395526017816e-06, + "loss": 0.5172, + "step": 1442 + }, + { + "epoch": 0.31, + "grad_norm": 0.16095203161239624, + "learning_rate": 8.07164319268424e-06, + "loss": 0.5465, + "step": 1443 + }, + { + "epoch": 0.31, + "grad_norm": 0.13967949151992798, + "learning_rate": 8.06888936359894e-06, + "loss": 0.5786, + "step": 1444 + }, + { + "epoch": 0.31, + "grad_norm": 0.23251961171627045, + "learning_rate": 8.066134040102904e-06, + "loss": 0.5086, + "step": 1445 + }, + { + "epoch": 0.31, + "grad_norm": 0.20811443030834198, + "learning_rate": 8.063377223537853e-06, + "loss": 0.5101, + "step": 1446 + }, + { + "epoch": 0.31, + "grad_norm": 0.1625215709209442, + "learning_rate": 8.060618915246233e-06, + "loss": 0.5268, + "step": 1447 + }, + { + "epoch": 0.31, + "grad_norm": 0.1501462310552597, + "learning_rate": 8.057859116571213e-06, + "loss": 0.547, + "step": 1448 + }, + { + "epoch": 0.31, + "grad_norm": 0.16021014750003815, + "learning_rate": 8.055097828856691e-06, + "loss": 0.5311, + "step": 1449 + }, + { + "epoch": 0.31, + "grad_norm": 0.20781485736370087, + "learning_rate": 8.05233505344729e-06, + "loss": 0.5188, + "step": 1450 + }, + { + "epoch": 0.31, + "grad_norm": 0.3020351231098175, + "learning_rate": 8.049570791688356e-06, + "loss": 0.5023, + "step": 1451 + }, + { + "epoch": 0.31, + "grad_norm": 0.1566857397556305, + "learning_rate": 8.046805044925964e-06, + "loss": 0.48, + "step": 1452 + }, + { + "epoch": 0.31, + "grad_norm": 0.1672096997499466, + "learning_rate": 8.044037814506905e-06, + "loss": 0.5301, + "step": 1453 + }, + { + "epoch": 0.31, + "grad_norm": 0.19419468939304352, + "learning_rate": 8.041269101778694e-06, + "loss": 0.5226, + "step": 1454 + }, + { + "epoch": 0.31, + "grad_norm": 0.16195285320281982, + "learning_rate": 8.03849890808957e-06, + "loss": 0.5223, + "step": 1455 + }, + { + "epoch": 0.31, + "grad_norm": 0.14367403090000153, + "learning_rate": 8.035727234788496e-06, + "loss": 0.5274, + "step": 1456 + }, + { + "epoch": 0.31, + "grad_norm": 0.1967507302761078, + "learning_rate": 8.032954083225146e-06, + "loss": 0.4899, + "step": 1457 + }, + { + "epoch": 0.31, + "grad_norm": 0.23297229409217834, + "learning_rate": 8.030179454749925e-06, + "loss": 0.5186, + "step": 1458 + }, + { + "epoch": 0.31, + "grad_norm": 0.16745884716510773, + "learning_rate": 8.027403350713948e-06, + "loss": 0.492, + "step": 1459 + }, + { + "epoch": 0.31, + "grad_norm": 0.13999496400356293, + "learning_rate": 8.024625772469055e-06, + "loss": 0.5221, + "step": 1460 + }, + { + "epoch": 0.31, + "grad_norm": 0.140817791223526, + "learning_rate": 8.0218467213678e-06, + "loss": 0.5128, + "step": 1461 + }, + { + "epoch": 0.31, + "grad_norm": 0.15968118607997894, + "learning_rate": 8.019066198763458e-06, + "loss": 0.525, + "step": 1462 + }, + { + "epoch": 0.32, + "grad_norm": 0.13812531530857086, + "learning_rate": 8.016284206010015e-06, + "loss": 0.4477, + "step": 1463 + }, + { + "epoch": 0.32, + "grad_norm": 0.16426512598991394, + "learning_rate": 8.013500744462177e-06, + "loss": 0.4974, + "step": 1464 + }, + { + "epoch": 0.32, + "grad_norm": 0.15231406688690186, + "learning_rate": 8.010715815475365e-06, + "loss": 0.5289, + "step": 1465 + }, + { + "epoch": 0.32, + "grad_norm": 0.1844695508480072, + "learning_rate": 8.007929420405714e-06, + "loss": 0.5201, + "step": 1466 + }, + { + "epoch": 0.32, + "grad_norm": 0.17498986423015594, + "learning_rate": 8.005141560610072e-06, + "loss": 0.5619, + "step": 1467 + }, + { + "epoch": 0.32, + "grad_norm": 0.16564463078975677, + "learning_rate": 8.002352237446e-06, + "loss": 0.5398, + "step": 1468 + }, + { + "epoch": 0.32, + "grad_norm": 0.15143102407455444, + "learning_rate": 7.999561452271776e-06, + "loss": 0.5038, + "step": 1469 + }, + { + "epoch": 0.32, + "grad_norm": 0.17521046102046967, + "learning_rate": 7.996769206446383e-06, + "loss": 0.4634, + "step": 1470 + }, + { + "epoch": 0.32, + "grad_norm": 0.16226552426815033, + "learning_rate": 7.993975501329518e-06, + "loss": 0.5735, + "step": 1471 + }, + { + "epoch": 0.32, + "grad_norm": 0.2068720906972885, + "learning_rate": 7.991180338281594e-06, + "loss": 0.5329, + "step": 1472 + }, + { + "epoch": 0.32, + "grad_norm": 0.2290961742401123, + "learning_rate": 7.988383718663727e-06, + "loss": 0.5203, + "step": 1473 + }, + { + "epoch": 0.32, + "grad_norm": 0.14001663029193878, + "learning_rate": 7.985585643837743e-06, + "loss": 0.4844, + "step": 1474 + }, + { + "epoch": 0.32, + "grad_norm": 0.15565429627895355, + "learning_rate": 7.982786115166182e-06, + "loss": 0.5158, + "step": 1475 + }, + { + "epoch": 0.32, + "grad_norm": 0.12718220055103302, + "learning_rate": 7.979985134012285e-06, + "loss": 0.5256, + "step": 1476 + }, + { + "epoch": 0.32, + "grad_norm": 0.1732247918844223, + "learning_rate": 7.977182701740003e-06, + "loss": 0.5447, + "step": 1477 + }, + { + "epoch": 0.32, + "grad_norm": 0.16792930662631989, + "learning_rate": 7.974378819713998e-06, + "loss": 0.5415, + "step": 1478 + }, + { + "epoch": 0.32, + "grad_norm": 0.1823003590106964, + "learning_rate": 7.97157348929963e-06, + "loss": 0.5089, + "step": 1479 + }, + { + "epoch": 0.32, + "grad_norm": 0.1478123515844345, + "learning_rate": 7.968766711862971e-06, + "loss": 0.5763, + "step": 1480 + }, + { + "epoch": 0.32, + "grad_norm": 0.16354763507843018, + "learning_rate": 7.965958488770796e-06, + "loss": 0.5476, + "step": 1481 + }, + { + "epoch": 0.32, + "grad_norm": 0.13449835777282715, + "learning_rate": 7.963148821390578e-06, + "loss": 0.5205, + "step": 1482 + }, + { + "epoch": 0.32, + "grad_norm": 0.17802083492279053, + "learning_rate": 7.960337711090504e-06, + "loss": 0.5239, + "step": 1483 + }, + { + "epoch": 0.32, + "grad_norm": 0.20004011690616608, + "learning_rate": 7.957525159239454e-06, + "loss": 0.5291, + "step": 1484 + }, + { + "epoch": 0.32, + "grad_norm": 0.17748400568962097, + "learning_rate": 7.954711167207016e-06, + "loss": 0.4913, + "step": 1485 + }, + { + "epoch": 0.32, + "grad_norm": 0.22476144134998322, + "learning_rate": 7.951895736363477e-06, + "loss": 0.4939, + "step": 1486 + }, + { + "epoch": 0.32, + "grad_norm": 0.16127091646194458, + "learning_rate": 7.949078868079825e-06, + "loss": 0.5272, + "step": 1487 + }, + { + "epoch": 0.32, + "grad_norm": 0.18299731612205505, + "learning_rate": 7.946260563727746e-06, + "loss": 0.5951, + "step": 1488 + }, + { + "epoch": 0.32, + "grad_norm": 0.13896289467811584, + "learning_rate": 7.94344082467963e-06, + "loss": 0.5591, + "step": 1489 + }, + { + "epoch": 0.32, + "grad_norm": 0.1735697239637375, + "learning_rate": 7.940619652308562e-06, + "loss": 0.5432, + "step": 1490 + }, + { + "epoch": 0.32, + "grad_norm": 0.16972100734710693, + "learning_rate": 7.937797047988322e-06, + "loss": 0.4821, + "step": 1491 + }, + { + "epoch": 0.32, + "grad_norm": 0.1734873354434967, + "learning_rate": 7.934973013093397e-06, + "loss": 0.4922, + "step": 1492 + }, + { + "epoch": 0.32, + "grad_norm": 0.16801413893699646, + "learning_rate": 7.932147548998958e-06, + "loss": 0.5599, + "step": 1493 + }, + { + "epoch": 0.32, + "grad_norm": 0.12655183672904968, + "learning_rate": 7.929320657080886e-06, + "loss": 0.5432, + "step": 1494 + }, + { + "epoch": 0.32, + "grad_norm": 0.2155943512916565, + "learning_rate": 7.926492338715746e-06, + "loss": 0.5351, + "step": 1495 + }, + { + "epoch": 0.32, + "grad_norm": 0.1321111023426056, + "learning_rate": 7.923662595280799e-06, + "loss": 0.5267, + "step": 1496 + }, + { + "epoch": 0.32, + "grad_norm": 0.19633205235004425, + "learning_rate": 7.920831428154008e-06, + "loss": 0.5296, + "step": 1497 + }, + { + "epoch": 0.32, + "grad_norm": 0.19406452775001526, + "learning_rate": 7.917998838714019e-06, + "loss": 0.569, + "step": 1498 + }, + { + "epoch": 0.32, + "grad_norm": 0.17301122844219208, + "learning_rate": 7.915164828340179e-06, + "loss": 0.5303, + "step": 1499 + }, + { + "epoch": 0.32, + "grad_norm": 0.14050279557704926, + "learning_rate": 7.91232939841252e-06, + "loss": 0.5045, + "step": 1500 + }, + { + "epoch": 0.32, + "grad_norm": 0.13988257944583893, + "learning_rate": 7.909492550311769e-06, + "loss": 0.4965, + "step": 1501 + }, + { + "epoch": 0.32, + "grad_norm": 0.13999608159065247, + "learning_rate": 7.906654285419347e-06, + "loss": 0.5337, + "step": 1502 + }, + { + "epoch": 0.32, + "grad_norm": 0.18495085835456848, + "learning_rate": 7.903814605117355e-06, + "loss": 0.5266, + "step": 1503 + }, + { + "epoch": 0.32, + "grad_norm": 0.131727397441864, + "learning_rate": 7.900973510788595e-06, + "loss": 0.5131, + "step": 1504 + }, + { + "epoch": 0.32, + "grad_norm": 0.13659153878688812, + "learning_rate": 7.898131003816547e-06, + "loss": 0.4934, + "step": 1505 + }, + { + "epoch": 0.32, + "grad_norm": 0.22903259098529816, + "learning_rate": 7.895287085585386e-06, + "loss": 0.5258, + "step": 1506 + }, + { + "epoch": 0.32, + "grad_norm": 0.23151510953903198, + "learning_rate": 7.892441757479974e-06, + "loss": 0.5321, + "step": 1507 + }, + { + "epoch": 0.32, + "grad_norm": 0.18955311179161072, + "learning_rate": 7.889595020885853e-06, + "loss": 0.4939, + "step": 1508 + }, + { + "epoch": 0.33, + "grad_norm": 0.14848068356513977, + "learning_rate": 7.88674687718926e-06, + "loss": 0.4916, + "step": 1509 + }, + { + "epoch": 0.33, + "grad_norm": 0.13812664151191711, + "learning_rate": 7.883897327777108e-06, + "loss": 0.51, + "step": 1510 + }, + { + "epoch": 0.33, + "grad_norm": 0.14594610035419464, + "learning_rate": 7.881046374037002e-06, + "loss": 0.497, + "step": 1511 + }, + { + "epoch": 0.33, + "grad_norm": 0.18314702808856964, + "learning_rate": 7.878194017357229e-06, + "loss": 0.4968, + "step": 1512 + }, + { + "epoch": 0.33, + "grad_norm": 0.15771466493606567, + "learning_rate": 7.875340259126754e-06, + "loss": 0.5373, + "step": 1513 + }, + { + "epoch": 0.33, + "grad_norm": 0.15456095337867737, + "learning_rate": 7.87248510073523e-06, + "loss": 0.4797, + "step": 1514 + }, + { + "epoch": 0.33, + "grad_norm": 0.14819829165935516, + "learning_rate": 7.869628543572994e-06, + "loss": 0.4645, + "step": 1515 + }, + { + "epoch": 0.33, + "grad_norm": 0.16360363364219666, + "learning_rate": 7.866770589031057e-06, + "loss": 0.4941, + "step": 1516 + }, + { + "epoch": 0.33, + "grad_norm": 0.1475502848625183, + "learning_rate": 7.863911238501113e-06, + "loss": 0.5693, + "step": 1517 + }, + { + "epoch": 0.33, + "grad_norm": 0.17970135807991028, + "learning_rate": 7.86105049337554e-06, + "loss": 0.6145, + "step": 1518 + }, + { + "epoch": 0.33, + "grad_norm": 0.16100694239139557, + "learning_rate": 7.85818835504739e-06, + "loss": 0.4806, + "step": 1519 + }, + { + "epoch": 0.33, + "grad_norm": 0.18620309233665466, + "learning_rate": 7.855324824910395e-06, + "loss": 0.5659, + "step": 1520 + }, + { + "epoch": 0.33, + "grad_norm": 0.1660996675491333, + "learning_rate": 7.852459904358968e-06, + "loss": 0.5211, + "step": 1521 + }, + { + "epoch": 0.33, + "grad_norm": 0.18867598474025726, + "learning_rate": 7.849593594788192e-06, + "loss": 0.4975, + "step": 1522 + }, + { + "epoch": 0.33, + "grad_norm": 0.17060688138008118, + "learning_rate": 7.846725897593834e-06, + "loss": 0.527, + "step": 1523 + }, + { + "epoch": 0.33, + "grad_norm": 0.14144161343574524, + "learning_rate": 7.843856814172329e-06, + "loss": 0.478, + "step": 1524 + }, + { + "epoch": 0.33, + "grad_norm": 0.15240880846977234, + "learning_rate": 7.840986345920795e-06, + "loss": 0.4896, + "step": 1525 + }, + { + "epoch": 0.33, + "grad_norm": 0.1528806835412979, + "learning_rate": 7.83811449423702e-06, + "loss": 0.4968, + "step": 1526 + }, + { + "epoch": 0.33, + "grad_norm": 0.1606244146823883, + "learning_rate": 7.835241260519467e-06, + "loss": 0.4879, + "step": 1527 + }, + { + "epoch": 0.33, + "grad_norm": 0.14756283164024353, + "learning_rate": 7.832366646167268e-06, + "loss": 0.5135, + "step": 1528 + }, + { + "epoch": 0.33, + "grad_norm": 0.16397136449813843, + "learning_rate": 7.829490652580233e-06, + "loss": 0.5549, + "step": 1529 + }, + { + "epoch": 0.33, + "grad_norm": 0.1577044427394867, + "learning_rate": 7.82661328115884e-06, + "loss": 0.5037, + "step": 1530 + }, + { + "epoch": 0.33, + "grad_norm": 0.16425062716007233, + "learning_rate": 7.823734533304241e-06, + "loss": 0.5245, + "step": 1531 + }, + { + "epoch": 0.33, + "grad_norm": 0.18981023132801056, + "learning_rate": 7.820854410418255e-06, + "loss": 0.5009, + "step": 1532 + }, + { + "epoch": 0.33, + "grad_norm": 0.14500872790813446, + "learning_rate": 7.817972913903373e-06, + "loss": 0.4711, + "step": 1533 + }, + { + "epoch": 0.33, + "grad_norm": 0.2270984947681427, + "learning_rate": 7.815090045162752e-06, + "loss": 0.5454, + "step": 1534 + }, + { + "epoch": 0.33, + "grad_norm": 0.1595790833234787, + "learning_rate": 7.81220580560022e-06, + "loss": 0.5159, + "step": 1535 + }, + { + "epoch": 0.33, + "grad_norm": 0.18246832489967346, + "learning_rate": 7.809320196620272e-06, + "loss": 0.5324, + "step": 1536 + }, + { + "epoch": 0.33, + "grad_norm": 0.15763631463050842, + "learning_rate": 7.80643321962807e-06, + "loss": 0.5348, + "step": 1537 + }, + { + "epoch": 0.33, + "grad_norm": 0.1331566572189331, + "learning_rate": 7.80354487602944e-06, + "loss": 0.4746, + "step": 1538 + }, + { + "epoch": 0.33, + "grad_norm": 0.17700472474098206, + "learning_rate": 7.800655167230877e-06, + "loss": 0.5652, + "step": 1539 + }, + { + "epoch": 0.33, + "grad_norm": 0.15402348339557648, + "learning_rate": 7.797764094639537e-06, + "loss": 0.557, + "step": 1540 + }, + { + "epoch": 0.33, + "grad_norm": 0.17362762987613678, + "learning_rate": 7.794871659663242e-06, + "loss": 0.491, + "step": 1541 + }, + { + "epoch": 0.33, + "grad_norm": 0.14665651321411133, + "learning_rate": 7.79197786371048e-06, + "loss": 0.5373, + "step": 1542 + }, + { + "epoch": 0.33, + "grad_norm": 0.17219582200050354, + "learning_rate": 7.789082708190397e-06, + "loss": 0.4852, + "step": 1543 + }, + { + "epoch": 0.33, + "grad_norm": 0.15352313220500946, + "learning_rate": 7.786186194512802e-06, + "loss": 0.4926, + "step": 1544 + }, + { + "epoch": 0.33, + "grad_norm": 0.17823894321918488, + "learning_rate": 7.78328832408817e-06, + "loss": 0.5275, + "step": 1545 + }, + { + "epoch": 0.33, + "grad_norm": 0.20020678639411926, + "learning_rate": 7.780389098327629e-06, + "loss": 0.4786, + "step": 1546 + }, + { + "epoch": 0.33, + "grad_norm": 0.13879740238189697, + "learning_rate": 7.777488518642975e-06, + "loss": 0.5054, + "step": 1547 + }, + { + "epoch": 0.33, + "grad_norm": 0.1314191222190857, + "learning_rate": 7.774586586446658e-06, + "loss": 0.4901, + "step": 1548 + }, + { + "epoch": 0.33, + "grad_norm": 0.26172900199890137, + "learning_rate": 7.77168330315179e-06, + "loss": 0.5073, + "step": 1549 + }, + { + "epoch": 0.33, + "grad_norm": 0.15131932497024536, + "learning_rate": 7.768778670172135e-06, + "loss": 0.532, + "step": 1550 + }, + { + "epoch": 0.33, + "grad_norm": 0.14957192540168762, + "learning_rate": 7.76587268892212e-06, + "loss": 0.489, + "step": 1551 + }, + { + "epoch": 0.33, + "grad_norm": 0.15338850021362305, + "learning_rate": 7.762965360816828e-06, + "loss": 0.5161, + "step": 1552 + }, + { + "epoch": 0.33, + "grad_norm": 0.14951498806476593, + "learning_rate": 7.760056687271996e-06, + "loss": 0.545, + "step": 1553 + }, + { + "epoch": 0.33, + "grad_norm": 0.32918447256088257, + "learning_rate": 7.757146669704016e-06, + "loss": 0.5144, + "step": 1554 + }, + { + "epoch": 0.33, + "grad_norm": 0.1633896380662918, + "learning_rate": 7.754235309529939e-06, + "loss": 0.5305, + "step": 1555 + }, + { + "epoch": 0.34, + "grad_norm": 0.15538008511066437, + "learning_rate": 7.75132260816746e-06, + "loss": 0.5787, + "step": 1556 + }, + { + "epoch": 0.34, + "grad_norm": 0.16210249066352844, + "learning_rate": 7.748408567034938e-06, + "loss": 0.516, + "step": 1557 + }, + { + "epoch": 0.34, + "grad_norm": 0.140504851937294, + "learning_rate": 7.745493187551378e-06, + "loss": 0.5344, + "step": 1558 + }, + { + "epoch": 0.34, + "grad_norm": 0.1350797414779663, + "learning_rate": 7.74257647113644e-06, + "loss": 0.5773, + "step": 1559 + }, + { + "epoch": 0.34, + "grad_norm": 0.16812683641910553, + "learning_rate": 7.739658419210429e-06, + "loss": 0.4808, + "step": 1560 + }, + { + "epoch": 0.34, + "grad_norm": 0.15915554761886597, + "learning_rate": 7.73673903319431e-06, + "loss": 0.51, + "step": 1561 + }, + { + "epoch": 0.34, + "grad_norm": 0.14357538521289825, + "learning_rate": 7.733818314509689e-06, + "loss": 0.4821, + "step": 1562 + }, + { + "epoch": 0.34, + "grad_norm": 0.1362561285495758, + "learning_rate": 7.730896264578825e-06, + "loss": 0.5051, + "step": 1563 + }, + { + "epoch": 0.34, + "grad_norm": 0.29245832562446594, + "learning_rate": 7.727972884824625e-06, + "loss": 0.5387, + "step": 1564 + }, + { + "epoch": 0.34, + "grad_norm": 0.1896662563085556, + "learning_rate": 7.725048176670643e-06, + "loss": 0.5269, + "step": 1565 + }, + { + "epoch": 0.34, + "grad_norm": 0.16521599888801575, + "learning_rate": 7.72212214154108e-06, + "loss": 0.5207, + "step": 1566 + }, + { + "epoch": 0.34, + "grad_norm": 0.1532319337129593, + "learning_rate": 7.719194780860783e-06, + "loss": 0.4951, + "step": 1567 + }, + { + "epoch": 0.34, + "grad_norm": 0.15770648419857025, + "learning_rate": 7.716266096055243e-06, + "loss": 0.5328, + "step": 1568 + }, + { + "epoch": 0.34, + "grad_norm": 0.13383062183856964, + "learning_rate": 7.713336088550601e-06, + "loss": 0.5463, + "step": 1569 + }, + { + "epoch": 0.34, + "grad_norm": 0.2122948169708252, + "learning_rate": 7.710404759773637e-06, + "loss": 0.5193, + "step": 1570 + }, + { + "epoch": 0.34, + "grad_norm": 0.1524578481912613, + "learning_rate": 7.707472111151775e-06, + "loss": 0.5058, + "step": 1571 + }, + { + "epoch": 0.34, + "grad_norm": 0.1887030303478241, + "learning_rate": 7.704538144113082e-06, + "loss": 0.515, + "step": 1572 + }, + { + "epoch": 0.34, + "grad_norm": 0.18387439846992493, + "learning_rate": 7.70160286008627e-06, + "loss": 0.523, + "step": 1573 + }, + { + "epoch": 0.34, + "grad_norm": 0.1244322806596756, + "learning_rate": 7.698666260500688e-06, + "loss": 0.4878, + "step": 1574 + }, + { + "epoch": 0.34, + "grad_norm": 0.13694074749946594, + "learning_rate": 7.69572834678633e-06, + "loss": 0.4722, + "step": 1575 + }, + { + "epoch": 0.34, + "grad_norm": 0.17935697734355927, + "learning_rate": 7.692789120373824e-06, + "loss": 0.4532, + "step": 1576 + }, + { + "epoch": 0.34, + "grad_norm": 0.1903911679983139, + "learning_rate": 7.689848582694444e-06, + "loss": 0.5128, + "step": 1577 + }, + { + "epoch": 0.34, + "grad_norm": 0.15431609749794006, + "learning_rate": 7.686906735180099e-06, + "loss": 0.4882, + "step": 1578 + }, + { + "epoch": 0.34, + "grad_norm": 0.17097975313663483, + "learning_rate": 7.683963579263332e-06, + "loss": 0.5729, + "step": 1579 + }, + { + "epoch": 0.34, + "grad_norm": 0.14723485708236694, + "learning_rate": 7.681019116377331e-06, + "loss": 0.494, + "step": 1580 + }, + { + "epoch": 0.34, + "grad_norm": 0.17691069841384888, + "learning_rate": 7.678073347955918e-06, + "loss": 0.5062, + "step": 1581 + }, + { + "epoch": 0.34, + "grad_norm": 0.161320298910141, + "learning_rate": 7.675126275433545e-06, + "loss": 0.5685, + "step": 1582 + }, + { + "epoch": 0.34, + "grad_norm": 0.18011566996574402, + "learning_rate": 7.672177900245307e-06, + "loss": 0.5103, + "step": 1583 + }, + { + "epoch": 0.34, + "grad_norm": 0.16380946338176727, + "learning_rate": 7.669228223826926e-06, + "loss": 0.4897, + "step": 1584 + }, + { + "epoch": 0.34, + "grad_norm": 0.15541784465312958, + "learning_rate": 7.666277247614766e-06, + "loss": 0.4562, + "step": 1585 + }, + { + "epoch": 0.34, + "grad_norm": 0.21574871242046356, + "learning_rate": 7.663324973045818e-06, + "loss": 0.5683, + "step": 1586 + }, + { + "epoch": 0.34, + "grad_norm": 0.18054868280887604, + "learning_rate": 7.660371401557703e-06, + "loss": 0.5149, + "step": 1587 + }, + { + "epoch": 0.34, + "grad_norm": 0.1341419368982315, + "learning_rate": 7.657416534588683e-06, + "loss": 0.4946, + "step": 1588 + }, + { + "epoch": 0.34, + "grad_norm": 0.1958109736442566, + "learning_rate": 7.654460373577639e-06, + "loss": 0.5204, + "step": 1589 + }, + { + "epoch": 0.34, + "grad_norm": 0.13961777091026306, + "learning_rate": 7.651502919964092e-06, + "loss": 0.4753, + "step": 1590 + }, + { + "epoch": 0.34, + "grad_norm": 0.16249793767929077, + "learning_rate": 7.648544175188189e-06, + "loss": 0.5392, + "step": 1591 + }, + { + "epoch": 0.34, + "grad_norm": 0.17830121517181396, + "learning_rate": 7.645584140690702e-06, + "loss": 0.5414, + "step": 1592 + }, + { + "epoch": 0.34, + "grad_norm": 0.164913147687912, + "learning_rate": 7.642622817913036e-06, + "loss": 0.5127, + "step": 1593 + }, + { + "epoch": 0.34, + "grad_norm": 0.13776592910289764, + "learning_rate": 7.639660208297221e-06, + "loss": 0.4568, + "step": 1594 + }, + { + "epoch": 0.34, + "grad_norm": 0.4830784499645233, + "learning_rate": 7.636696313285917e-06, + "loss": 0.5153, + "step": 1595 + }, + { + "epoch": 0.34, + "grad_norm": 0.14156107604503632, + "learning_rate": 7.633731134322404e-06, + "loss": 0.5142, + "step": 1596 + }, + { + "epoch": 0.34, + "grad_norm": 0.1518123894929886, + "learning_rate": 7.630764672850593e-06, + "loss": 0.51, + "step": 1597 + }, + { + "epoch": 0.34, + "grad_norm": 0.17625145614147186, + "learning_rate": 7.6277969303150155e-06, + "loss": 0.495, + "step": 1598 + }, + { + "epoch": 0.34, + "grad_norm": 0.17110183835029602, + "learning_rate": 7.624827908160828e-06, + "loss": 0.5465, + "step": 1599 + }, + { + "epoch": 0.34, + "grad_norm": 0.18074309825897217, + "learning_rate": 7.6218576078338115e-06, + "loss": 0.519, + "step": 1600 + }, + { + "epoch": 0.34, + "grad_norm": 0.176472008228302, + "learning_rate": 7.618886030780366e-06, + "loss": 0.5301, + "step": 1601 + }, + { + "epoch": 0.35, + "grad_norm": 0.23984403908252716, + "learning_rate": 7.615913178447518e-06, + "loss": 0.5679, + "step": 1602 + }, + { + "epoch": 0.35, + "grad_norm": 0.16570177674293518, + "learning_rate": 7.612939052282913e-06, + "loss": 0.5353, + "step": 1603 + }, + { + "epoch": 0.35, + "grad_norm": 0.15504352748394012, + "learning_rate": 7.609963653734814e-06, + "loss": 0.4889, + "step": 1604 + }, + { + "epoch": 0.35, + "grad_norm": 0.12483610212802887, + "learning_rate": 7.606986984252107e-06, + "loss": 0.4901, + "step": 1605 + }, + { + "epoch": 0.35, + "grad_norm": 0.1474786102771759, + "learning_rate": 7.604009045284295e-06, + "loss": 0.5106, + "step": 1606 + }, + { + "epoch": 0.35, + "grad_norm": 0.1935417652130127, + "learning_rate": 7.601029838281503e-06, + "loss": 0.54, + "step": 1607 + }, + { + "epoch": 0.35, + "grad_norm": 0.15936410427093506, + "learning_rate": 7.598049364694466e-06, + "loss": 0.5259, + "step": 1608 + }, + { + "epoch": 0.35, + "grad_norm": 0.23374778032302856, + "learning_rate": 7.595067625974544e-06, + "loss": 0.4745, + "step": 1609 + }, + { + "epoch": 0.35, + "grad_norm": 0.1541801393032074, + "learning_rate": 7.592084623573708e-06, + "loss": 0.5009, + "step": 1610 + }, + { + "epoch": 0.35, + "grad_norm": 0.1573501080274582, + "learning_rate": 7.589100358944546e-06, + "loss": 0.5054, + "step": 1611 + }, + { + "epoch": 0.35, + "grad_norm": 0.14179089665412903, + "learning_rate": 7.586114833540257e-06, + "loss": 0.4971, + "step": 1612 + }, + { + "epoch": 0.35, + "grad_norm": 0.12740643322467804, + "learning_rate": 7.583128048814663e-06, + "loss": 0.5311, + "step": 1613 + }, + { + "epoch": 0.35, + "grad_norm": 0.18302515149116516, + "learning_rate": 7.58014000622219e-06, + "loss": 0.5443, + "step": 1614 + }, + { + "epoch": 0.35, + "grad_norm": 0.22869239747524261, + "learning_rate": 7.577150707217878e-06, + "loss": 0.5488, + "step": 1615 + }, + { + "epoch": 0.35, + "grad_norm": 0.11746443063020706, + "learning_rate": 7.574160153257386e-06, + "loss": 0.5052, + "step": 1616 + }, + { + "epoch": 0.35, + "grad_norm": 0.15382401645183563, + "learning_rate": 7.571168345796975e-06, + "loss": 0.5468, + "step": 1617 + }, + { + "epoch": 0.35, + "grad_norm": 0.18465621769428253, + "learning_rate": 7.568175286293522e-06, + "loss": 0.557, + "step": 1618 + }, + { + "epoch": 0.35, + "grad_norm": 0.14507010579109192, + "learning_rate": 7.5651809762045115e-06, + "loss": 0.4686, + "step": 1619 + }, + { + "epoch": 0.35, + "grad_norm": 0.17526701092720032, + "learning_rate": 7.562185416988039e-06, + "loss": 0.5065, + "step": 1620 + }, + { + "epoch": 0.35, + "grad_norm": 0.16445392370224, + "learning_rate": 7.559188610102803e-06, + "loss": 0.4226, + "step": 1621 + }, + { + "epoch": 0.35, + "grad_norm": 0.13059720396995544, + "learning_rate": 7.556190557008116e-06, + "loss": 0.4899, + "step": 1622 + }, + { + "epoch": 0.35, + "grad_norm": 0.19847136735916138, + "learning_rate": 7.553191259163896e-06, + "loss": 0.5169, + "step": 1623 + }, + { + "epoch": 0.35, + "grad_norm": 0.1679173707962036, + "learning_rate": 7.550190718030663e-06, + "loss": 0.5012, + "step": 1624 + }, + { + "epoch": 0.35, + "grad_norm": 0.15986262261867523, + "learning_rate": 7.547188935069547e-06, + "loss": 0.5436, + "step": 1625 + }, + { + "epoch": 0.35, + "grad_norm": 0.13230155408382416, + "learning_rate": 7.54418591174228e-06, + "loss": 0.5307, + "step": 1626 + }, + { + "epoch": 0.35, + "grad_norm": 0.13571912050247192, + "learning_rate": 7.5411816495111985e-06, + "loss": 0.5169, + "step": 1627 + }, + { + "epoch": 0.35, + "grad_norm": 0.17367611825466156, + "learning_rate": 7.5381761498392435e-06, + "loss": 0.5677, + "step": 1628 + }, + { + "epoch": 0.35, + "grad_norm": 0.1747978776693344, + "learning_rate": 7.535169414189959e-06, + "loss": 0.5706, + "step": 1629 + }, + { + "epoch": 0.35, + "grad_norm": 0.11080675572156906, + "learning_rate": 7.532161444027488e-06, + "loss": 0.4933, + "step": 1630 + }, + { + "epoch": 0.35, + "grad_norm": 0.1479070633649826, + "learning_rate": 7.529152240816577e-06, + "loss": 0.4794, + "step": 1631 + }, + { + "epoch": 0.35, + "grad_norm": 0.12181144952774048, + "learning_rate": 7.526141806022571e-06, + "loss": 0.5346, + "step": 1632 + }, + { + "epoch": 0.35, + "grad_norm": 0.18355728685855865, + "learning_rate": 7.523130141111419e-06, + "loss": 0.5696, + "step": 1633 + }, + { + "epoch": 0.35, + "grad_norm": 0.12792839109897614, + "learning_rate": 7.520117247549661e-06, + "loss": 0.5148, + "step": 1634 + }, + { + "epoch": 0.35, + "grad_norm": 0.17084498703479767, + "learning_rate": 7.517103126804446e-06, + "loss": 0.5362, + "step": 1635 + }, + { + "epoch": 0.35, + "grad_norm": 0.1391141563653946, + "learning_rate": 7.514087780343511e-06, + "loss": 0.4839, + "step": 1636 + }, + { + "epoch": 0.35, + "grad_norm": 0.13675713539123535, + "learning_rate": 7.511071209635197e-06, + "loss": 0.5153, + "step": 1637 + }, + { + "epoch": 0.35, + "grad_norm": 0.13880731165409088, + "learning_rate": 7.508053416148433e-06, + "loss": 0.5117, + "step": 1638 + }, + { + "epoch": 0.35, + "grad_norm": 0.11620379984378815, + "learning_rate": 7.5050344013527535e-06, + "loss": 0.5146, + "step": 1639 + }, + { + "epoch": 0.35, + "grad_norm": 0.1520024538040161, + "learning_rate": 7.502014166718279e-06, + "loss": 0.5332, + "step": 1640 + }, + { + "epoch": 0.35, + "grad_norm": 0.16113972663879395, + "learning_rate": 7.49899271371573e-06, + "loss": 0.4881, + "step": 1641 + }, + { + "epoch": 0.35, + "grad_norm": 0.177647203207016, + "learning_rate": 7.495970043816416e-06, + "loss": 0.506, + "step": 1642 + }, + { + "epoch": 0.35, + "grad_norm": 0.20048052072525024, + "learning_rate": 7.492946158492243e-06, + "loss": 0.5128, + "step": 1643 + }, + { + "epoch": 0.35, + "grad_norm": 0.18544965982437134, + "learning_rate": 7.489921059215703e-06, + "loss": 0.4755, + "step": 1644 + }, + { + "epoch": 0.35, + "grad_norm": 0.15983660519123077, + "learning_rate": 7.486894747459887e-06, + "loss": 0.5021, + "step": 1645 + }, + { + "epoch": 0.35, + "grad_norm": 0.13609494268894196, + "learning_rate": 7.483867224698471e-06, + "loss": 0.5392, + "step": 1646 + }, + { + "epoch": 0.35, + "grad_norm": 0.15707872807979584, + "learning_rate": 7.480838492405722e-06, + "loss": 0.5503, + "step": 1647 + }, + { + "epoch": 0.36, + "grad_norm": 0.14846757054328918, + "learning_rate": 7.477808552056496e-06, + "loss": 0.5162, + "step": 1648 + }, + { + "epoch": 0.36, + "grad_norm": 0.20370322465896606, + "learning_rate": 7.474777405126236e-06, + "loss": 0.5291, + "step": 1649 + }, + { + "epoch": 0.36, + "grad_norm": 0.19087088108062744, + "learning_rate": 7.471745053090976e-06, + "loss": 0.5647, + "step": 1650 + }, + { + "epoch": 0.36, + "grad_norm": 0.1674560159444809, + "learning_rate": 7.468711497427335e-06, + "loss": 0.502, + "step": 1651 + }, + { + "epoch": 0.36, + "grad_norm": 0.1854984611272812, + "learning_rate": 7.465676739612514e-06, + "loss": 0.5304, + "step": 1652 + }, + { + "epoch": 0.36, + "grad_norm": 0.17334036529064178, + "learning_rate": 7.462640781124309e-06, + "loss": 0.5476, + "step": 1653 + }, + { + "epoch": 0.36, + "grad_norm": 0.1636764258146286, + "learning_rate": 7.45960362344109e-06, + "loss": 0.5359, + "step": 1654 + }, + { + "epoch": 0.36, + "grad_norm": 0.16120000183582306, + "learning_rate": 7.456565268041815e-06, + "loss": 0.5591, + "step": 1655 + }, + { + "epoch": 0.36, + "grad_norm": 0.16681008040905, + "learning_rate": 7.4535257164060324e-06, + "loss": 0.4933, + "step": 1656 + }, + { + "epoch": 0.36, + "grad_norm": 0.15936830639839172, + "learning_rate": 7.450484970013863e-06, + "loss": 0.4903, + "step": 1657 + }, + { + "epoch": 0.36, + "grad_norm": 0.1579248011112213, + "learning_rate": 7.447443030346011e-06, + "loss": 0.5368, + "step": 1658 + }, + { + "epoch": 0.36, + "grad_norm": 0.17494046688079834, + "learning_rate": 7.444399898883768e-06, + "loss": 0.4972, + "step": 1659 + }, + { + "epoch": 0.36, + "grad_norm": 0.15343308448791504, + "learning_rate": 7.441355577108998e-06, + "loss": 0.485, + "step": 1660 + }, + { + "epoch": 0.36, + "grad_norm": 0.24387070536613464, + "learning_rate": 7.438310066504152e-06, + "loss": 0.5527, + "step": 1661 + }, + { + "epoch": 0.36, + "grad_norm": 0.27083417773246765, + "learning_rate": 7.4352633685522535e-06, + "loss": 0.4657, + "step": 1662 + }, + { + "epoch": 0.36, + "grad_norm": 0.20291651785373688, + "learning_rate": 7.432215484736909e-06, + "loss": 0.4805, + "step": 1663 + }, + { + "epoch": 0.36, + "grad_norm": 0.17441540956497192, + "learning_rate": 7.4291664165422985e-06, + "loss": 0.5157, + "step": 1664 + }, + { + "epoch": 0.36, + "grad_norm": 0.21364037692546844, + "learning_rate": 7.426116165453181e-06, + "loss": 0.5072, + "step": 1665 + }, + { + "epoch": 0.36, + "grad_norm": 0.16811180114746094, + "learning_rate": 7.423064732954895e-06, + "loss": 0.4577, + "step": 1666 + }, + { + "epoch": 0.36, + "grad_norm": 0.2634996473789215, + "learning_rate": 7.420012120533346e-06, + "loss": 0.5387, + "step": 1667 + }, + { + "epoch": 0.36, + "grad_norm": 0.15785469114780426, + "learning_rate": 7.4169583296750194e-06, + "loss": 0.5052, + "step": 1668 + }, + { + "epoch": 0.36, + "grad_norm": 0.18810074031352997, + "learning_rate": 7.4139033618669764e-06, + "loss": 0.5234, + "step": 1669 + }, + { + "epoch": 0.36, + "grad_norm": 0.14630138874053955, + "learning_rate": 7.410847218596846e-06, + "loss": 0.5155, + "step": 1670 + }, + { + "epoch": 0.36, + "grad_norm": 0.18249250948429108, + "learning_rate": 7.407789901352831e-06, + "loss": 0.5351, + "step": 1671 + }, + { + "epoch": 0.36, + "grad_norm": 0.13652457296848297, + "learning_rate": 7.40473141162371e-06, + "loss": 0.4474, + "step": 1672 + }, + { + "epoch": 0.36, + "grad_norm": 0.18352244794368744, + "learning_rate": 7.401671750898829e-06, + "loss": 0.4628, + "step": 1673 + }, + { + "epoch": 0.36, + "grad_norm": 0.16410337388515472, + "learning_rate": 7.398610920668102e-06, + "loss": 0.5673, + "step": 1674 + }, + { + "epoch": 0.36, + "grad_norm": 0.14850519597530365, + "learning_rate": 7.39554892242202e-06, + "loss": 0.48, + "step": 1675 + }, + { + "epoch": 0.36, + "grad_norm": 0.1457439661026001, + "learning_rate": 7.392485757651634e-06, + "loss": 0.5061, + "step": 1676 + }, + { + "epoch": 0.36, + "grad_norm": 0.15839837491512299, + "learning_rate": 7.3894214278485685e-06, + "loss": 0.5482, + "step": 1677 + }, + { + "epoch": 0.36, + "grad_norm": 0.1379930078983307, + "learning_rate": 7.386355934505015e-06, + "loss": 0.5207, + "step": 1678 + }, + { + "epoch": 0.36, + "grad_norm": 0.2140752226114273, + "learning_rate": 7.38328927911373e-06, + "loss": 0.5709, + "step": 1679 + }, + { + "epoch": 0.36, + "grad_norm": 0.16319052875041962, + "learning_rate": 7.380221463168036e-06, + "loss": 0.5182, + "step": 1680 + }, + { + "epoch": 0.36, + "grad_norm": 0.12774449586868286, + "learning_rate": 7.3771524881618204e-06, + "loss": 0.5274, + "step": 1681 + }, + { + "epoch": 0.36, + "grad_norm": 0.13371047377586365, + "learning_rate": 7.374082355589536e-06, + "loss": 0.4983, + "step": 1682 + }, + { + "epoch": 0.36, + "grad_norm": 0.13684460520744324, + "learning_rate": 7.371011066946199e-06, + "loss": 0.5395, + "step": 1683 + }, + { + "epoch": 0.36, + "grad_norm": 0.16260729730129242, + "learning_rate": 7.367938623727389e-06, + "loss": 0.4927, + "step": 1684 + }, + { + "epoch": 0.36, + "grad_norm": 0.1580437868833542, + "learning_rate": 7.364865027429247e-06, + "loss": 0.5391, + "step": 1685 + }, + { + "epoch": 0.36, + "grad_norm": 0.41100969910621643, + "learning_rate": 7.361790279548476e-06, + "loss": 0.4922, + "step": 1686 + }, + { + "epoch": 0.36, + "grad_norm": 0.16328592598438263, + "learning_rate": 7.358714381582339e-06, + "loss": 0.5809, + "step": 1687 + }, + { + "epoch": 0.36, + "grad_norm": 0.16407454013824463, + "learning_rate": 7.35563733502866e-06, + "loss": 0.5317, + "step": 1688 + }, + { + "epoch": 0.36, + "grad_norm": 0.16385860741138458, + "learning_rate": 7.352559141385823e-06, + "loss": 0.5182, + "step": 1689 + }, + { + "epoch": 0.36, + "grad_norm": 0.1773800253868103, + "learning_rate": 7.3494798021527665e-06, + "loss": 0.4972, + "step": 1690 + }, + { + "epoch": 0.36, + "grad_norm": 0.14111146330833435, + "learning_rate": 7.346399318828994e-06, + "loss": 0.485, + "step": 1691 + }, + { + "epoch": 0.36, + "grad_norm": 0.18736319243907928, + "learning_rate": 7.3433176929145574e-06, + "loss": 0.532, + "step": 1692 + }, + { + "epoch": 0.36, + "grad_norm": 0.1659240871667862, + "learning_rate": 7.3402349259100725e-06, + "loss": 0.4878, + "step": 1693 + }, + { + "epoch": 0.36, + "grad_norm": 0.13603948056697845, + "learning_rate": 7.337151019316708e-06, + "loss": 0.5024, + "step": 1694 + }, + { + "epoch": 0.37, + "grad_norm": 0.14938659965991974, + "learning_rate": 7.334065974636186e-06, + "loss": 0.4882, + "step": 1695 + }, + { + "epoch": 0.37, + "grad_norm": 0.15664424002170563, + "learning_rate": 7.330979793370784e-06, + "loss": 0.4855, + "step": 1696 + }, + { + "epoch": 0.37, + "grad_norm": 0.15226437151432037, + "learning_rate": 7.327892477023335e-06, + "loss": 0.5258, + "step": 1697 + }, + { + "epoch": 0.37, + "grad_norm": 0.20304326713085175, + "learning_rate": 7.324804027097221e-06, + "loss": 0.5325, + "step": 1698 + }, + { + "epoch": 0.37, + "grad_norm": 0.14442868530750275, + "learning_rate": 7.3217144450963774e-06, + "loss": 0.4676, + "step": 1699 + }, + { + "epoch": 0.37, + "grad_norm": 0.14504297077655792, + "learning_rate": 7.318623732525294e-06, + "loss": 0.523, + "step": 1700 + }, + { + "epoch": 0.37, + "grad_norm": 0.13879434764385223, + "learning_rate": 7.315531890889007e-06, + "loss": 0.5121, + "step": 1701 + }, + { + "epoch": 0.37, + "grad_norm": 0.16492860019207, + "learning_rate": 7.312438921693101e-06, + "loss": 0.508, + "step": 1702 + }, + { + "epoch": 0.37, + "grad_norm": 0.13094115257263184, + "learning_rate": 7.309344826443718e-06, + "loss": 0.5123, + "step": 1703 + }, + { + "epoch": 0.37, + "grad_norm": 0.16071003675460815, + "learning_rate": 7.30624960664754e-06, + "loss": 0.5077, + "step": 1704 + }, + { + "epoch": 0.37, + "grad_norm": 0.1596524864435196, + "learning_rate": 7.3031532638117974e-06, + "loss": 0.5193, + "step": 1705 + }, + { + "epoch": 0.37, + "grad_norm": 0.15532274544239044, + "learning_rate": 7.300055799444273e-06, + "loss": 0.5651, + "step": 1706 + }, + { + "epoch": 0.37, + "grad_norm": 0.1956198513507843, + "learning_rate": 7.296957215053292e-06, + "loss": 0.5238, + "step": 1707 + }, + { + "epoch": 0.37, + "grad_norm": 0.17350712418556213, + "learning_rate": 7.293857512147723e-06, + "loss": 0.5064, + "step": 1708 + }, + { + "epoch": 0.37, + "grad_norm": 0.1837831437587738, + "learning_rate": 7.290756692236982e-06, + "loss": 0.5456, + "step": 1709 + }, + { + "epoch": 0.37, + "grad_norm": 0.20104587078094482, + "learning_rate": 7.287654756831031e-06, + "loss": 0.5701, + "step": 1710 + }, + { + "epoch": 0.37, + "grad_norm": 0.22067013382911682, + "learning_rate": 7.284551707440369e-06, + "loss": 0.4858, + "step": 1711 + }, + { + "epoch": 0.37, + "grad_norm": 0.17873504757881165, + "learning_rate": 7.2814475455760445e-06, + "loss": 0.5027, + "step": 1712 + }, + { + "epoch": 0.37, + "grad_norm": 0.16447962820529938, + "learning_rate": 7.278342272749643e-06, + "loss": 0.4854, + "step": 1713 + }, + { + "epoch": 0.37, + "grad_norm": 0.18496006727218628, + "learning_rate": 7.275235890473291e-06, + "loss": 0.5098, + "step": 1714 + }, + { + "epoch": 0.37, + "grad_norm": 0.20452427864074707, + "learning_rate": 7.272128400259658e-06, + "loss": 0.4419, + "step": 1715 + }, + { + "epoch": 0.37, + "grad_norm": 0.16275016963481903, + "learning_rate": 7.269019803621953e-06, + "loss": 0.535, + "step": 1716 + }, + { + "epoch": 0.37, + "grad_norm": 0.15786287188529968, + "learning_rate": 7.2659101020739195e-06, + "loss": 0.4883, + "step": 1717 + }, + { + "epoch": 0.37, + "grad_norm": 0.1765165776014328, + "learning_rate": 7.262799297129843e-06, + "loss": 0.5827, + "step": 1718 + }, + { + "epoch": 0.37, + "grad_norm": 0.12849071621894836, + "learning_rate": 7.259687390304546e-06, + "loss": 0.4739, + "step": 1719 + }, + { + "epoch": 0.37, + "grad_norm": 0.18336515128612518, + "learning_rate": 7.256574383113386e-06, + "loss": 0.5344, + "step": 1720 + }, + { + "epoch": 0.37, + "grad_norm": 0.14962013065814972, + "learning_rate": 7.253460277072258e-06, + "loss": 0.4984, + "step": 1721 + }, + { + "epoch": 0.37, + "grad_norm": 0.14270378649234772, + "learning_rate": 7.25034507369759e-06, + "loss": 0.491, + "step": 1722 + }, + { + "epoch": 0.37, + "grad_norm": 0.18622830510139465, + "learning_rate": 7.247228774506347e-06, + "loss": 0.5553, + "step": 1723 + }, + { + "epoch": 0.37, + "grad_norm": 0.16195961833000183, + "learning_rate": 7.244111381016024e-06, + "loss": 0.5497, + "step": 1724 + }, + { + "epoch": 0.37, + "grad_norm": 0.1802990436553955, + "learning_rate": 7.2409928947446526e-06, + "loss": 0.5371, + "step": 1725 + }, + { + "epoch": 0.37, + "grad_norm": 0.1768779754638672, + "learning_rate": 7.237873317210796e-06, + "loss": 0.5328, + "step": 1726 + }, + { + "epoch": 0.37, + "grad_norm": 0.15915416181087494, + "learning_rate": 7.234752649933545e-06, + "loss": 0.5206, + "step": 1727 + }, + { + "epoch": 0.37, + "grad_norm": 0.22865630686283112, + "learning_rate": 7.231630894432527e-06, + "loss": 0.5433, + "step": 1728 + }, + { + "epoch": 0.37, + "grad_norm": 0.13628236949443817, + "learning_rate": 7.228508052227895e-06, + "loss": 0.4809, + "step": 1729 + }, + { + "epoch": 0.37, + "grad_norm": 0.1925947070121765, + "learning_rate": 7.22538412484033e-06, + "loss": 0.5716, + "step": 1730 + }, + { + "epoch": 0.37, + "grad_norm": 0.14507855474948883, + "learning_rate": 7.2222591137910454e-06, + "loss": 0.5409, + "step": 1731 + }, + { + "epoch": 0.37, + "grad_norm": 0.1448884755373001, + "learning_rate": 7.219133020601783e-06, + "loss": 0.5184, + "step": 1732 + }, + { + "epoch": 0.37, + "grad_norm": 0.24185587465763092, + "learning_rate": 7.216005846794807e-06, + "loss": 0.5093, + "step": 1733 + }, + { + "epoch": 0.37, + "grad_norm": 0.14733339846134186, + "learning_rate": 7.2128775938929095e-06, + "loss": 0.5361, + "step": 1734 + }, + { + "epoch": 0.37, + "grad_norm": 0.1741349697113037, + "learning_rate": 7.209748263419409e-06, + "loss": 0.5405, + "step": 1735 + }, + { + "epoch": 0.37, + "grad_norm": 0.16004079580307007, + "learning_rate": 7.206617856898149e-06, + "loss": 0.5217, + "step": 1736 + }, + { + "epoch": 0.37, + "grad_norm": 0.16466408967971802, + "learning_rate": 7.203486375853496e-06, + "loss": 0.4928, + "step": 1737 + }, + { + "epoch": 0.37, + "grad_norm": 0.17737893760204315, + "learning_rate": 7.20035382181034e-06, + "loss": 0.5084, + "step": 1738 + }, + { + "epoch": 0.37, + "grad_norm": 0.33183491230010986, + "learning_rate": 7.197220196294094e-06, + "loss": 0.5574, + "step": 1739 + }, + { + "epoch": 0.37, + "grad_norm": 0.14042764902114868, + "learning_rate": 7.194085500830691e-06, + "loss": 0.5856, + "step": 1740 + }, + { + "epoch": 0.38, + "grad_norm": 0.17238366603851318, + "learning_rate": 7.190949736946587e-06, + "loss": 0.5456, + "step": 1741 + }, + { + "epoch": 0.38, + "grad_norm": 0.17922283709049225, + "learning_rate": 7.1878129061687595e-06, + "loss": 0.5223, + "step": 1742 + }, + { + "epoch": 0.38, + "grad_norm": 0.14631612598896027, + "learning_rate": 7.184675010024701e-06, + "loss": 0.5193, + "step": 1743 + }, + { + "epoch": 0.38, + "grad_norm": 0.1614404171705246, + "learning_rate": 7.181536050042427e-06, + "loss": 0.5372, + "step": 1744 + }, + { + "epoch": 0.38, + "grad_norm": 0.14466199278831482, + "learning_rate": 7.1783960277504685e-06, + "loss": 0.4811, + "step": 1745 + }, + { + "epoch": 0.38, + "grad_norm": 0.14429622888565063, + "learning_rate": 7.175254944677874e-06, + "loss": 0.4989, + "step": 1746 + }, + { + "epoch": 0.38, + "grad_norm": 0.1409209966659546, + "learning_rate": 7.172112802354212e-06, + "loss": 0.5104, + "step": 1747 + }, + { + "epoch": 0.38, + "grad_norm": 0.19490914046764374, + "learning_rate": 7.1689696023095625e-06, + "loss": 0.5189, + "step": 1748 + }, + { + "epoch": 0.38, + "grad_norm": 0.20314301550388336, + "learning_rate": 7.165825346074521e-06, + "loss": 0.5169, + "step": 1749 + }, + { + "epoch": 0.38, + "grad_norm": 0.1676884889602661, + "learning_rate": 7.162680035180201e-06, + "loss": 0.5543, + "step": 1750 + }, + { + "epoch": 0.38, + "grad_norm": 0.17340156435966492, + "learning_rate": 7.159533671158225e-06, + "loss": 0.5374, + "step": 1751 + }, + { + "epoch": 0.38, + "grad_norm": 0.1684662252664566, + "learning_rate": 7.156386255540732e-06, + "loss": 0.5167, + "step": 1752 + }, + { + "epoch": 0.38, + "grad_norm": 0.1722518354654312, + "learning_rate": 7.15323778986037e-06, + "loss": 0.5236, + "step": 1753 + }, + { + "epoch": 0.38, + "grad_norm": 0.1535075604915619, + "learning_rate": 7.150088275650302e-06, + "loss": 0.5676, + "step": 1754 + }, + { + "epoch": 0.38, + "grad_norm": 0.2000323235988617, + "learning_rate": 7.1469377144441954e-06, + "loss": 0.5039, + "step": 1755 + }, + { + "epoch": 0.38, + "grad_norm": 0.1701248437166214, + "learning_rate": 7.143786107776236e-06, + "loss": 0.5528, + "step": 1756 + }, + { + "epoch": 0.38, + "grad_norm": 0.15805946290493011, + "learning_rate": 7.140633457181112e-06, + "loss": 0.4744, + "step": 1757 + }, + { + "epoch": 0.38, + "grad_norm": 0.1715155392885208, + "learning_rate": 7.137479764194022e-06, + "loss": 0.5385, + "step": 1758 + }, + { + "epoch": 0.38, + "grad_norm": 0.20759384334087372, + "learning_rate": 7.134325030350672e-06, + "loss": 0.4994, + "step": 1759 + }, + { + "epoch": 0.38, + "grad_norm": 0.1527446210384369, + "learning_rate": 7.131169257187276e-06, + "loss": 0.5411, + "step": 1760 + }, + { + "epoch": 0.38, + "grad_norm": 0.15912318229675293, + "learning_rate": 7.128012446240552e-06, + "loss": 0.5674, + "step": 1761 + }, + { + "epoch": 0.38, + "grad_norm": 0.1656845211982727, + "learning_rate": 7.1248545990477256e-06, + "loss": 0.4999, + "step": 1762 + }, + { + "epoch": 0.38, + "grad_norm": 0.14019495248794556, + "learning_rate": 7.121695717146526e-06, + "loss": 0.5353, + "step": 1763 + }, + { + "epoch": 0.38, + "grad_norm": 0.17298150062561035, + "learning_rate": 7.1185358020751875e-06, + "loss": 0.5064, + "step": 1764 + }, + { + "epoch": 0.38, + "grad_norm": 0.14910168945789337, + "learning_rate": 7.1153748553724425e-06, + "loss": 0.5262, + "step": 1765 + }, + { + "epoch": 0.38, + "grad_norm": 0.20957139134407043, + "learning_rate": 7.112212878577533e-06, + "loss": 0.5084, + "step": 1766 + }, + { + "epoch": 0.38, + "grad_norm": 0.17487388849258423, + "learning_rate": 7.109049873230198e-06, + "loss": 0.5578, + "step": 1767 + }, + { + "epoch": 0.38, + "grad_norm": 0.20940136909484863, + "learning_rate": 7.1058858408706765e-06, + "loss": 0.5895, + "step": 1768 + }, + { + "epoch": 0.38, + "grad_norm": 0.23022903501987457, + "learning_rate": 7.1027207830397134e-06, + "loss": 0.5334, + "step": 1769 + }, + { + "epoch": 0.38, + "grad_norm": 0.15674887597560883, + "learning_rate": 7.099554701278547e-06, + "loss": 0.5144, + "step": 1770 + }, + { + "epoch": 0.38, + "grad_norm": 0.15679983794689178, + "learning_rate": 7.096387597128916e-06, + "loss": 0.5139, + "step": 1771 + }, + { + "epoch": 0.38, + "grad_norm": 0.19758965075016022, + "learning_rate": 7.093219472133059e-06, + "loss": 0.5184, + "step": 1772 + }, + { + "epoch": 0.38, + "grad_norm": 0.17212289571762085, + "learning_rate": 7.0900503278337074e-06, + "loss": 0.5164, + "step": 1773 + }, + { + "epoch": 0.38, + "grad_norm": 0.18704959750175476, + "learning_rate": 7.086880165774093e-06, + "loss": 0.5332, + "step": 1774 + }, + { + "epoch": 0.38, + "grad_norm": 0.1653163731098175, + "learning_rate": 7.083708987497943e-06, + "loss": 0.536, + "step": 1775 + }, + { + "epoch": 0.38, + "grad_norm": 0.1986512988805771, + "learning_rate": 7.080536794549477e-06, + "loss": 0.5382, + "step": 1776 + }, + { + "epoch": 0.38, + "grad_norm": 0.15724928677082062, + "learning_rate": 7.077363588473408e-06, + "loss": 0.5549, + "step": 1777 + }, + { + "epoch": 0.38, + "grad_norm": 0.14671437442302704, + "learning_rate": 7.0741893708149475e-06, + "loss": 0.5662, + "step": 1778 + }, + { + "epoch": 0.38, + "grad_norm": 0.15560339391231537, + "learning_rate": 7.071014143119796e-06, + "loss": 0.5198, + "step": 1779 + }, + { + "epoch": 0.38, + "grad_norm": 0.14752082526683807, + "learning_rate": 7.067837906934143e-06, + "loss": 0.5337, + "step": 1780 + }, + { + "epoch": 0.38, + "grad_norm": 0.13522642850875854, + "learning_rate": 7.064660663804677e-06, + "loss": 0.5066, + "step": 1781 + }, + { + "epoch": 0.38, + "grad_norm": 0.1374634951353073, + "learning_rate": 7.061482415278569e-06, + "loss": 0.4911, + "step": 1782 + }, + { + "epoch": 0.38, + "grad_norm": 0.18049356341362, + "learning_rate": 7.058303162903483e-06, + "loss": 0.5261, + "step": 1783 + }, + { + "epoch": 0.38, + "grad_norm": 0.17125682532787323, + "learning_rate": 7.055122908227571e-06, + "loss": 0.5311, + "step": 1784 + }, + { + "epoch": 0.38, + "grad_norm": 0.16370706260204315, + "learning_rate": 7.051941652799476e-06, + "loss": 0.4968, + "step": 1785 + }, + { + "epoch": 0.38, + "grad_norm": 0.1682046800851822, + "learning_rate": 7.0487593981683246e-06, + "loss": 0.4958, + "step": 1786 + }, + { + "epoch": 0.38, + "grad_norm": 0.1765281856060028, + "learning_rate": 7.04557614588373e-06, + "loss": 0.5139, + "step": 1787 + }, + { + "epoch": 0.39, + "grad_norm": 0.33266332745552063, + "learning_rate": 7.042391897495795e-06, + "loss": 0.5654, + "step": 1788 + }, + { + "epoch": 0.39, + "grad_norm": 0.1499028503894806, + "learning_rate": 7.039206654555103e-06, + "loss": 0.4745, + "step": 1789 + }, + { + "epoch": 0.39, + "grad_norm": 0.1392756998538971, + "learning_rate": 7.036020418612724e-06, + "loss": 0.5564, + "step": 1790 + }, + { + "epoch": 0.39, + "grad_norm": 0.1803901195526123, + "learning_rate": 7.032833191220213e-06, + "loss": 0.4915, + "step": 1791 + }, + { + "epoch": 0.39, + "grad_norm": 0.17533114552497864, + "learning_rate": 7.029644973929604e-06, + "loss": 0.4861, + "step": 1792 + }, + { + "epoch": 0.39, + "grad_norm": 0.1752566695213318, + "learning_rate": 7.026455768293416e-06, + "loss": 0.508, + "step": 1793 + }, + { + "epoch": 0.39, + "grad_norm": 0.14547456800937653, + "learning_rate": 7.023265575864648e-06, + "loss": 0.5137, + "step": 1794 + }, + { + "epoch": 0.39, + "grad_norm": 0.19993162155151367, + "learning_rate": 7.020074398196779e-06, + "loss": 0.5089, + "step": 1795 + }, + { + "epoch": 0.39, + "grad_norm": 0.28430238366127014, + "learning_rate": 7.016882236843769e-06, + "loss": 0.536, + "step": 1796 + }, + { + "epoch": 0.39, + "grad_norm": 0.16877298057079315, + "learning_rate": 7.013689093360059e-06, + "loss": 0.5131, + "step": 1797 + }, + { + "epoch": 0.39, + "grad_norm": 0.12015072256326675, + "learning_rate": 7.0104949693005645e-06, + "loss": 0.4872, + "step": 1798 + }, + { + "epoch": 0.39, + "grad_norm": 0.154635950922966, + "learning_rate": 7.0072998662206775e-06, + "loss": 0.5255, + "step": 1799 + }, + { + "epoch": 0.39, + "grad_norm": 0.1528724581003189, + "learning_rate": 7.00410378567627e-06, + "loss": 0.5689, + "step": 1800 + }, + { + "epoch": 0.39, + "grad_norm": 0.1700393408536911, + "learning_rate": 7.000906729223693e-06, + "loss": 0.4934, + "step": 1801 + }, + { + "epoch": 0.39, + "grad_norm": 0.1635403037071228, + "learning_rate": 6.997708698419765e-06, + "loss": 0.4775, + "step": 1802 + }, + { + "epoch": 0.39, + "grad_norm": 0.14558027684688568, + "learning_rate": 6.994509694821784e-06, + "loss": 0.5529, + "step": 1803 + }, + { + "epoch": 0.39, + "grad_norm": 0.1189364641904831, + "learning_rate": 6.99130971998752e-06, + "loss": 0.5022, + "step": 1804 + }, + { + "epoch": 0.39, + "grad_norm": 0.17554467916488647, + "learning_rate": 6.988108775475218e-06, + "loss": 0.5326, + "step": 1805 + }, + { + "epoch": 0.39, + "grad_norm": 0.15480519831180573, + "learning_rate": 6.98490686284359e-06, + "loss": 0.4882, + "step": 1806 + }, + { + "epoch": 0.39, + "grad_norm": 0.1570086032152176, + "learning_rate": 6.981703983651827e-06, + "loss": 0.4771, + "step": 1807 + }, + { + "epoch": 0.39, + "grad_norm": 0.14414653182029724, + "learning_rate": 6.978500139459583e-06, + "loss": 0.4844, + "step": 1808 + }, + { + "epoch": 0.39, + "grad_norm": 0.181270033121109, + "learning_rate": 6.97529533182699e-06, + "loss": 0.6205, + "step": 1809 + }, + { + "epoch": 0.39, + "grad_norm": 0.13571658730506897, + "learning_rate": 6.972089562314644e-06, + "loss": 0.5364, + "step": 1810 + }, + { + "epoch": 0.39, + "grad_norm": 0.12950097024440765, + "learning_rate": 6.968882832483606e-06, + "loss": 0.5254, + "step": 1811 + }, + { + "epoch": 0.39, + "grad_norm": 0.15108050405979156, + "learning_rate": 6.9656751438954115e-06, + "loss": 0.5432, + "step": 1812 + }, + { + "epoch": 0.39, + "grad_norm": 0.1494326889514923, + "learning_rate": 6.962466498112062e-06, + "loss": 0.5615, + "step": 1813 + }, + { + "epoch": 0.39, + "grad_norm": 0.17007635533809662, + "learning_rate": 6.959256896696021e-06, + "loss": 0.5191, + "step": 1814 + }, + { + "epoch": 0.39, + "grad_norm": 0.16112545132637024, + "learning_rate": 6.956046341210221e-06, + "loss": 0.5374, + "step": 1815 + }, + { + "epoch": 0.39, + "grad_norm": 0.1815643608570099, + "learning_rate": 6.952834833218056e-06, + "loss": 0.5312, + "step": 1816 + }, + { + "epoch": 0.39, + "grad_norm": 0.14015376567840576, + "learning_rate": 6.949622374283387e-06, + "loss": 0.5012, + "step": 1817 + }, + { + "epoch": 0.39, + "grad_norm": 0.14989694952964783, + "learning_rate": 6.946408965970536e-06, + "loss": 0.5075, + "step": 1818 + }, + { + "epoch": 0.39, + "grad_norm": 0.1673702746629715, + "learning_rate": 6.943194609844288e-06, + "loss": 0.5485, + "step": 1819 + }, + { + "epoch": 0.39, + "grad_norm": 0.1309339702129364, + "learning_rate": 6.939979307469892e-06, + "loss": 0.5218, + "step": 1820 + }, + { + "epoch": 0.39, + "grad_norm": 0.1157936230301857, + "learning_rate": 6.93676306041305e-06, + "loss": 0.502, + "step": 1821 + }, + { + "epoch": 0.39, + "grad_norm": 0.1451912224292755, + "learning_rate": 6.933545870239933e-06, + "loss": 0.5339, + "step": 1822 + }, + { + "epoch": 0.39, + "grad_norm": 0.18552608788013458, + "learning_rate": 6.930327738517168e-06, + "loss": 0.4766, + "step": 1823 + }, + { + "epoch": 0.39, + "grad_norm": 0.1459437906742096, + "learning_rate": 6.927108666811837e-06, + "loss": 0.5381, + "step": 1824 + }, + { + "epoch": 0.39, + "grad_norm": 0.14324288070201874, + "learning_rate": 6.923888656691487e-06, + "loss": 0.4846, + "step": 1825 + }, + { + "epoch": 0.39, + "grad_norm": 0.14252141118049622, + "learning_rate": 6.920667709724113e-06, + "loss": 0.4756, + "step": 1826 + }, + { + "epoch": 0.39, + "grad_norm": 0.1347956657409668, + "learning_rate": 6.917445827478175e-06, + "loss": 0.5006, + "step": 1827 + }, + { + "epoch": 0.39, + "grad_norm": 0.17314203083515167, + "learning_rate": 6.914223011522581e-06, + "loss": 0.5711, + "step": 1828 + }, + { + "epoch": 0.39, + "grad_norm": 0.13734053075313568, + "learning_rate": 6.9109992634267e-06, + "loss": 0.4959, + "step": 1829 + }, + { + "epoch": 0.39, + "grad_norm": 0.15517868101596832, + "learning_rate": 6.90777458476035e-06, + "loss": 0.5151, + "step": 1830 + }, + { + "epoch": 0.39, + "grad_norm": 0.17450636625289917, + "learning_rate": 6.9045489770938045e-06, + "loss": 0.4883, + "step": 1831 + }, + { + "epoch": 0.39, + "grad_norm": 0.202430859208107, + "learning_rate": 6.901322441997791e-06, + "loss": 0.4894, + "step": 1832 + }, + { + "epoch": 0.39, + "grad_norm": 0.27107375860214233, + "learning_rate": 6.898094981043482e-06, + "loss": 0.5584, + "step": 1833 + }, + { + "epoch": 0.4, + "grad_norm": 0.15221843123435974, + "learning_rate": 6.894866595802509e-06, + "loss": 0.5003, + "step": 1834 + }, + { + "epoch": 0.4, + "grad_norm": 0.17178794741630554, + "learning_rate": 6.89163728784695e-06, + "loss": 0.548, + "step": 1835 + }, + { + "epoch": 0.4, + "grad_norm": 0.16640210151672363, + "learning_rate": 6.888407058749331e-06, + "loss": 0.5008, + "step": 1836 + }, + { + "epoch": 0.4, + "grad_norm": 0.19455331563949585, + "learning_rate": 6.885175910082631e-06, + "loss": 0.5069, + "step": 1837 + }, + { + "epoch": 0.4, + "grad_norm": 0.1528869867324829, + "learning_rate": 6.881943843420268e-06, + "loss": 0.5051, + "step": 1838 + }, + { + "epoch": 0.4, + "grad_norm": 0.16115941107273102, + "learning_rate": 6.878710860336118e-06, + "loss": 0.4924, + "step": 1839 + }, + { + "epoch": 0.4, + "grad_norm": 0.12841373682022095, + "learning_rate": 6.875476962404495e-06, + "loss": 0.4966, + "step": 1840 + }, + { + "epoch": 0.4, + "grad_norm": 0.1625949740409851, + "learning_rate": 6.8722421512001625e-06, + "loss": 0.5575, + "step": 1841 + }, + { + "epoch": 0.4, + "grad_norm": 0.18129919469356537, + "learning_rate": 6.869006428298328e-06, + "loss": 0.5509, + "step": 1842 + }, + { + "epoch": 0.4, + "grad_norm": 0.14833548665046692, + "learning_rate": 6.865769795274641e-06, + "loss": 0.5444, + "step": 1843 + }, + { + "epoch": 0.4, + "grad_norm": 0.14769743382930756, + "learning_rate": 6.862532253705199e-06, + "loss": 0.4723, + "step": 1844 + }, + { + "epoch": 0.4, + "grad_norm": 0.13029511272907257, + "learning_rate": 6.859293805166536e-06, + "loss": 0.4908, + "step": 1845 + }, + { + "epoch": 0.4, + "grad_norm": 0.19006066024303436, + "learning_rate": 6.85605445123563e-06, + "loss": 0.4983, + "step": 1846 + }, + { + "epoch": 0.4, + "grad_norm": 0.13327574729919434, + "learning_rate": 6.852814193489903e-06, + "loss": 0.5046, + "step": 1847 + }, + { + "epoch": 0.4, + "grad_norm": 0.16421039402484894, + "learning_rate": 6.849573033507213e-06, + "loss": 0.4845, + "step": 1848 + }, + { + "epoch": 0.4, + "grad_norm": 0.14652986824512482, + "learning_rate": 6.846330972865857e-06, + "loss": 0.5351, + "step": 1849 + }, + { + "epoch": 0.4, + "grad_norm": 0.1581708788871765, + "learning_rate": 6.843088013144575e-06, + "loss": 0.5125, + "step": 1850 + }, + { + "epoch": 0.4, + "grad_norm": 0.13055890798568726, + "learning_rate": 6.839844155922543e-06, + "loss": 0.4872, + "step": 1851 + }, + { + "epoch": 0.4, + "grad_norm": 0.17920167744159698, + "learning_rate": 6.8365994027793695e-06, + "loss": 0.5181, + "step": 1852 + }, + { + "epoch": 0.4, + "grad_norm": 0.16211476922035217, + "learning_rate": 6.833353755295104e-06, + "loss": 0.4617, + "step": 1853 + }, + { + "epoch": 0.4, + "grad_norm": 0.15161064267158508, + "learning_rate": 6.830107215050232e-06, + "loss": 0.4736, + "step": 1854 + }, + { + "epoch": 0.4, + "grad_norm": 0.15771758556365967, + "learning_rate": 6.826859783625674e-06, + "loss": 0.5481, + "step": 1855 + }, + { + "epoch": 0.4, + "grad_norm": 0.17663753032684326, + "learning_rate": 6.823611462602777e-06, + "loss": 0.562, + "step": 1856 + }, + { + "epoch": 0.4, + "grad_norm": 0.16153866052627563, + "learning_rate": 6.82036225356333e-06, + "loss": 0.4947, + "step": 1857 + }, + { + "epoch": 0.4, + "grad_norm": 0.20720727741718292, + "learning_rate": 6.817112158089554e-06, + "loss": 0.5606, + "step": 1858 + }, + { + "epoch": 0.4, + "grad_norm": 0.21727946400642395, + "learning_rate": 6.813861177764094e-06, + "loss": 0.5017, + "step": 1859 + }, + { + "epoch": 0.4, + "grad_norm": 0.2113008350133896, + "learning_rate": 6.8106093141700336e-06, + "loss": 0.5526, + "step": 1860 + }, + { + "epoch": 0.4, + "grad_norm": 0.16218236088752747, + "learning_rate": 6.807356568890884e-06, + "loss": 0.4807, + "step": 1861 + }, + { + "epoch": 0.4, + "grad_norm": 0.18519651889801025, + "learning_rate": 6.804102943510583e-06, + "loss": 0.5168, + "step": 1862 + }, + { + "epoch": 0.4, + "grad_norm": 0.18724150955677032, + "learning_rate": 6.800848439613504e-06, + "loss": 0.4815, + "step": 1863 + }, + { + "epoch": 0.4, + "grad_norm": 0.14294007420539856, + "learning_rate": 6.797593058784437e-06, + "loss": 0.5586, + "step": 1864 + }, + { + "epoch": 0.4, + "grad_norm": 0.159059077501297, + "learning_rate": 6.7943368026086124e-06, + "loss": 0.5098, + "step": 1865 + }, + { + "epoch": 0.4, + "grad_norm": 0.16052033007144928, + "learning_rate": 6.791079672671677e-06, + "loss": 0.5117, + "step": 1866 + }, + { + "epoch": 0.4, + "grad_norm": 0.1647024303674698, + "learning_rate": 6.787821670559705e-06, + "loss": 0.5381, + "step": 1867 + }, + { + "epoch": 0.4, + "grad_norm": 0.18817616999149323, + "learning_rate": 6.784562797859198e-06, + "loss": 0.4719, + "step": 1868 + }, + { + "epoch": 0.4, + "grad_norm": 0.18448995053768158, + "learning_rate": 6.78130305615708e-06, + "loss": 0.5259, + "step": 1869 + }, + { + "epoch": 0.4, + "grad_norm": 0.1643984615802765, + "learning_rate": 6.7780424470407004e-06, + "loss": 0.5437, + "step": 1870 + }, + { + "epoch": 0.4, + "grad_norm": 0.14963030815124512, + "learning_rate": 6.774780972097823e-06, + "loss": 0.4785, + "step": 1871 + }, + { + "epoch": 0.4, + "grad_norm": 0.18385331332683563, + "learning_rate": 6.771518632916645e-06, + "loss": 0.4909, + "step": 1872 + }, + { + "epoch": 0.4, + "grad_norm": 0.1393522322177887, + "learning_rate": 6.7682554310857755e-06, + "loss": 0.4809, + "step": 1873 + }, + { + "epoch": 0.4, + "grad_norm": 0.16635462641716003, + "learning_rate": 6.7649913681942455e-06, + "loss": 0.5425, + "step": 1874 + }, + { + "epoch": 0.4, + "grad_norm": 0.15144184231758118, + "learning_rate": 6.761726445831511e-06, + "loss": 0.5033, + "step": 1875 + }, + { + "epoch": 0.4, + "grad_norm": 0.17777347564697266, + "learning_rate": 6.758460665587437e-06, + "loss": 0.5561, + "step": 1876 + }, + { + "epoch": 0.4, + "grad_norm": 0.2699100375175476, + "learning_rate": 6.755194029052313e-06, + "loss": 0.5314, + "step": 1877 + }, + { + "epoch": 0.4, + "grad_norm": 0.17995339632034302, + "learning_rate": 6.751926537816846e-06, + "loss": 0.5097, + "step": 1878 + }, + { + "epoch": 0.4, + "grad_norm": 0.14517782628536224, + "learning_rate": 6.748658193472155e-06, + "loss": 0.524, + "step": 1879 + }, + { + "epoch": 0.4, + "grad_norm": 0.14715701341629028, + "learning_rate": 6.745388997609774e-06, + "loss": 0.5633, + "step": 1880 + }, + { + "epoch": 0.41, + "grad_norm": 0.16807079315185547, + "learning_rate": 6.7421189518216576e-06, + "loss": 0.5106, + "step": 1881 + }, + { + "epoch": 0.41, + "grad_norm": 0.1584351658821106, + "learning_rate": 6.738848057700169e-06, + "loss": 0.5602, + "step": 1882 + }, + { + "epoch": 0.41, + "grad_norm": 0.16300451755523682, + "learning_rate": 6.735576316838087e-06, + "loss": 0.5455, + "step": 1883 + }, + { + "epoch": 0.41, + "grad_norm": 0.16324667632579803, + "learning_rate": 6.732303730828601e-06, + "loss": 0.5247, + "step": 1884 + }, + { + "epoch": 0.41, + "grad_norm": 0.16887761652469635, + "learning_rate": 6.7290303012653136e-06, + "loss": 0.4953, + "step": 1885 + }, + { + "epoch": 0.41, + "grad_norm": 0.1934385746717453, + "learning_rate": 6.725756029742234e-06, + "loss": 0.4727, + "step": 1886 + }, + { + "epoch": 0.41, + "grad_norm": 0.17485982179641724, + "learning_rate": 6.7224809178537894e-06, + "loss": 0.5003, + "step": 1887 + }, + { + "epoch": 0.41, + "grad_norm": 0.14065895974636078, + "learning_rate": 6.7192049671948115e-06, + "loss": 0.4841, + "step": 1888 + }, + { + "epoch": 0.41, + "grad_norm": 0.12996014952659607, + "learning_rate": 6.715928179360538e-06, + "loss": 0.4906, + "step": 1889 + }, + { + "epoch": 0.41, + "grad_norm": 0.14599494636058807, + "learning_rate": 6.712650555946616e-06, + "loss": 0.5114, + "step": 1890 + }, + { + "epoch": 0.41, + "grad_norm": 0.1689714789390564, + "learning_rate": 6.709372098549104e-06, + "loss": 0.5318, + "step": 1891 + }, + { + "epoch": 0.41, + "grad_norm": 0.14123961329460144, + "learning_rate": 6.706092808764459e-06, + "loss": 0.5013, + "step": 1892 + }, + { + "epoch": 0.41, + "grad_norm": 0.14629031717777252, + "learning_rate": 6.702812688189551e-06, + "loss": 0.5524, + "step": 1893 + }, + { + "epoch": 0.41, + "grad_norm": 0.1583494246006012, + "learning_rate": 6.699531738421648e-06, + "loss": 0.5285, + "step": 1894 + }, + { + "epoch": 0.41, + "grad_norm": 0.17046624422073364, + "learning_rate": 6.696249961058426e-06, + "loss": 0.5125, + "step": 1895 + }, + { + "epoch": 0.41, + "grad_norm": 0.1436389535665512, + "learning_rate": 6.692967357697961e-06, + "loss": 0.5045, + "step": 1896 + }, + { + "epoch": 0.41, + "grad_norm": 0.18508578836917877, + "learning_rate": 6.689683929938736e-06, + "loss": 0.5401, + "step": 1897 + }, + { + "epoch": 0.41, + "grad_norm": 0.1609339416027069, + "learning_rate": 6.6863996793796286e-06, + "loss": 0.5026, + "step": 1898 + }, + { + "epoch": 0.41, + "grad_norm": 0.17639221251010895, + "learning_rate": 6.683114607619923e-06, + "loss": 0.5563, + "step": 1899 + }, + { + "epoch": 0.41, + "grad_norm": 0.15782758593559265, + "learning_rate": 6.6798287162593e-06, + "loss": 0.5344, + "step": 1900 + }, + { + "epoch": 0.41, + "grad_norm": 0.14880798757076263, + "learning_rate": 6.676542006897842e-06, + "loss": 0.4987, + "step": 1901 + }, + { + "epoch": 0.41, + "grad_norm": 0.18628853559494019, + "learning_rate": 6.6732544811360255e-06, + "loss": 0.4961, + "step": 1902 + }, + { + "epoch": 0.41, + "grad_norm": 0.18380938470363617, + "learning_rate": 6.669966140574729e-06, + "loss": 0.5529, + "step": 1903 + }, + { + "epoch": 0.41, + "grad_norm": 0.18866044282913208, + "learning_rate": 6.666676986815227e-06, + "loss": 0.5462, + "step": 1904 + }, + { + "epoch": 0.41, + "grad_norm": 0.16578936576843262, + "learning_rate": 6.663387021459187e-06, + "loss": 0.513, + "step": 1905 + }, + { + "epoch": 0.41, + "grad_norm": 0.18033047020435333, + "learning_rate": 6.660096246108677e-06, + "loss": 0.4892, + "step": 1906 + }, + { + "epoch": 0.41, + "grad_norm": 0.16443459689617157, + "learning_rate": 6.656804662366153e-06, + "loss": 0.5372, + "step": 1907 + }, + { + "epoch": 0.41, + "grad_norm": 0.14939545094966888, + "learning_rate": 6.653512271834468e-06, + "loss": 0.5273, + "step": 1908 + }, + { + "epoch": 0.41, + "grad_norm": 0.17759068310260773, + "learning_rate": 6.650219076116868e-06, + "loss": 0.4714, + "step": 1909 + }, + { + "epoch": 0.41, + "grad_norm": 0.1866803765296936, + "learning_rate": 6.646925076816994e-06, + "loss": 0.5261, + "step": 1910 + }, + { + "epoch": 0.41, + "grad_norm": 0.15621764957904816, + "learning_rate": 6.643630275538871e-06, + "loss": 0.521, + "step": 1911 + }, + { + "epoch": 0.41, + "grad_norm": 0.20561483502388, + "learning_rate": 6.640334673886921e-06, + "loss": 0.531, + "step": 1912 + }, + { + "epoch": 0.41, + "grad_norm": 0.1349986344575882, + "learning_rate": 6.637038273465952e-06, + "loss": 0.5328, + "step": 1913 + }, + { + "epoch": 0.41, + "grad_norm": 0.1595732718706131, + "learning_rate": 6.633741075881163e-06, + "loss": 0.5151, + "step": 1914 + }, + { + "epoch": 0.41, + "grad_norm": 0.15593409538269043, + "learning_rate": 6.63044308273814e-06, + "loss": 0.5507, + "step": 1915 + }, + { + "epoch": 0.41, + "grad_norm": 0.1654960662126541, + "learning_rate": 6.627144295642859e-06, + "loss": 0.5172, + "step": 1916 + }, + { + "epoch": 0.41, + "grad_norm": 0.13034138083457947, + "learning_rate": 6.6238447162016786e-06, + "loss": 0.561, + "step": 1917 + }, + { + "epoch": 0.41, + "grad_norm": 0.14604593813419342, + "learning_rate": 6.6205443460213445e-06, + "loss": 0.5173, + "step": 1918 + }, + { + "epoch": 0.41, + "grad_norm": 0.18159790337085724, + "learning_rate": 6.617243186708989e-06, + "loss": 0.5295, + "step": 1919 + }, + { + "epoch": 0.41, + "grad_norm": 0.1321515291929245, + "learning_rate": 6.613941239872129e-06, + "loss": 0.4762, + "step": 1920 + }, + { + "epoch": 0.41, + "grad_norm": 0.13790853321552277, + "learning_rate": 6.610638507118663e-06, + "loss": 0.5172, + "step": 1921 + }, + { + "epoch": 0.41, + "grad_norm": 0.15198110044002533, + "learning_rate": 6.607334990056873e-06, + "loss": 0.5019, + "step": 1922 + }, + { + "epoch": 0.41, + "grad_norm": 0.1440410614013672, + "learning_rate": 6.604030690295422e-06, + "loss": 0.481, + "step": 1923 + }, + { + "epoch": 0.41, + "grad_norm": 0.23520071804523468, + "learning_rate": 6.600725609443356e-06, + "loss": 0.4935, + "step": 1924 + }, + { + "epoch": 0.41, + "grad_norm": 0.12442398816347122, + "learning_rate": 6.597419749110099e-06, + "loss": 0.5067, + "step": 1925 + }, + { + "epoch": 0.41, + "grad_norm": 0.19941824674606323, + "learning_rate": 6.594113110905458e-06, + "loss": 0.5489, + "step": 1926 + }, + { + "epoch": 0.42, + "grad_norm": 0.16936185956001282, + "learning_rate": 6.5908056964396135e-06, + "loss": 0.5173, + "step": 1927 + }, + { + "epoch": 0.42, + "grad_norm": 0.1414109170436859, + "learning_rate": 6.587497507323132e-06, + "loss": 0.4946, + "step": 1928 + }, + { + "epoch": 0.42, + "grad_norm": 0.1461210548877716, + "learning_rate": 6.584188545166948e-06, + "loss": 0.5585, + "step": 1929 + }, + { + "epoch": 0.42, + "grad_norm": 0.14086653292179108, + "learning_rate": 6.580878811582379e-06, + "loss": 0.5138, + "step": 1930 + }, + { + "epoch": 0.42, + "grad_norm": 0.13712497055530548, + "learning_rate": 6.5775683081811144e-06, + "loss": 0.5223, + "step": 1931 + }, + { + "epoch": 0.42, + "grad_norm": 0.18051303923130035, + "learning_rate": 6.574257036575224e-06, + "loss": 0.5229, + "step": 1932 + }, + { + "epoch": 0.42, + "grad_norm": 0.18365350365638733, + "learning_rate": 6.5709449983771414e-06, + "loss": 0.5357, + "step": 1933 + }, + { + "epoch": 0.42, + "grad_norm": 0.1633131057024002, + "learning_rate": 6.567632195199686e-06, + "loss": 0.5919, + "step": 1934 + }, + { + "epoch": 0.42, + "grad_norm": 0.18704870343208313, + "learning_rate": 6.564318628656039e-06, + "loss": 0.5212, + "step": 1935 + }, + { + "epoch": 0.42, + "grad_norm": 0.15724125504493713, + "learning_rate": 6.5610043003597615e-06, + "loss": 0.5219, + "step": 1936 + }, + { + "epoch": 0.42, + "grad_norm": 0.14116469025611877, + "learning_rate": 6.557689211924779e-06, + "loss": 0.5133, + "step": 1937 + }, + { + "epoch": 0.42, + "grad_norm": 0.20150695741176605, + "learning_rate": 6.554373364965392e-06, + "loss": 0.5256, + "step": 1938 + }, + { + "epoch": 0.42, + "grad_norm": 0.18280090391635895, + "learning_rate": 6.551056761096269e-06, + "loss": 0.5481, + "step": 1939 + }, + { + "epoch": 0.42, + "grad_norm": 0.18789951503276825, + "learning_rate": 6.547739401932443e-06, + "loss": 0.4974, + "step": 1940 + }, + { + "epoch": 0.42, + "grad_norm": 0.15406067669391632, + "learning_rate": 6.544421289089321e-06, + "loss": 0.543, + "step": 1941 + }, + { + "epoch": 0.42, + "grad_norm": 0.16543880105018616, + "learning_rate": 6.541102424182676e-06, + "loss": 0.5503, + "step": 1942 + }, + { + "epoch": 0.42, + "grad_norm": 0.17979435622692108, + "learning_rate": 6.537782808828641e-06, + "loss": 0.5514, + "step": 1943 + }, + { + "epoch": 0.42, + "grad_norm": 0.19799616932868958, + "learning_rate": 6.5344624446437234e-06, + "loss": 0.499, + "step": 1944 + }, + { + "epoch": 0.42, + "grad_norm": 0.16152727603912354, + "learning_rate": 6.531141333244789e-06, + "loss": 0.5483, + "step": 1945 + }, + { + "epoch": 0.42, + "grad_norm": 0.16674454510211945, + "learning_rate": 6.527819476249066e-06, + "loss": 0.5127, + "step": 1946 + }, + { + "epoch": 0.42, + "grad_norm": 0.16409684717655182, + "learning_rate": 6.5244968752741555e-06, + "loss": 0.5407, + "step": 1947 + }, + { + "epoch": 0.42, + "grad_norm": 0.1826597899198532, + "learning_rate": 6.521173531938011e-06, + "loss": 0.446, + "step": 1948 + }, + { + "epoch": 0.42, + "grad_norm": 0.17517463862895966, + "learning_rate": 6.517849447858951e-06, + "loss": 0.5539, + "step": 1949 + }, + { + "epoch": 0.42, + "grad_norm": 0.14857599139213562, + "learning_rate": 6.514524624655654e-06, + "loss": 0.5278, + "step": 1950 + }, + { + "epoch": 0.42, + "grad_norm": 0.13251933455467224, + "learning_rate": 6.511199063947159e-06, + "loss": 0.4874, + "step": 1951 + }, + { + "epoch": 0.42, + "grad_norm": 0.138553187251091, + "learning_rate": 6.507872767352863e-06, + "loss": 0.5654, + "step": 1952 + }, + { + "epoch": 0.42, + "grad_norm": 0.13305741548538208, + "learning_rate": 6.504545736492526e-06, + "loss": 0.5318, + "step": 1953 + }, + { + "epoch": 0.42, + "grad_norm": 0.14779391884803772, + "learning_rate": 6.50121797298626e-06, + "loss": 0.5017, + "step": 1954 + }, + { + "epoch": 0.42, + "grad_norm": 0.1407061219215393, + "learning_rate": 6.497889478454534e-06, + "loss": 0.4967, + "step": 1955 + }, + { + "epoch": 0.42, + "grad_norm": 0.14632262289524078, + "learning_rate": 6.494560254518179e-06, + "loss": 0.4989, + "step": 1956 + }, + { + "epoch": 0.42, + "grad_norm": 0.2105487734079361, + "learning_rate": 6.491230302798372e-06, + "loss": 0.5095, + "step": 1957 + }, + { + "epoch": 0.42, + "grad_norm": 0.15186044573783875, + "learning_rate": 6.487899624916654e-06, + "loss": 0.5069, + "step": 1958 + }, + { + "epoch": 0.42, + "grad_norm": 0.15018121898174286, + "learning_rate": 6.484568222494911e-06, + "loss": 0.5031, + "step": 1959 + }, + { + "epoch": 0.42, + "grad_norm": 0.15453185141086578, + "learning_rate": 6.481236097155389e-06, + "loss": 0.513, + "step": 1960 + }, + { + "epoch": 0.42, + "grad_norm": 0.14290063083171844, + "learning_rate": 6.47790325052068e-06, + "loss": 0.4524, + "step": 1961 + }, + { + "epoch": 0.42, + "grad_norm": 0.17694444954395294, + "learning_rate": 6.4745696842137305e-06, + "loss": 0.5628, + "step": 1962 + }, + { + "epoch": 0.42, + "grad_norm": 0.15745702385902405, + "learning_rate": 6.4712353998578396e-06, + "loss": 0.5302, + "step": 1963 + }, + { + "epoch": 0.42, + "grad_norm": 0.1511646956205368, + "learning_rate": 6.467900399076651e-06, + "loss": 0.5041, + "step": 1964 + }, + { + "epoch": 0.42, + "grad_norm": 0.15910549461841583, + "learning_rate": 6.46456468349416e-06, + "loss": 0.5193, + "step": 1965 + }, + { + "epoch": 0.42, + "grad_norm": 0.15061886608600616, + "learning_rate": 6.461228254734711e-06, + "loss": 0.48, + "step": 1966 + }, + { + "epoch": 0.42, + "grad_norm": 0.1490405946969986, + "learning_rate": 6.4578911144229915e-06, + "loss": 0.4894, + "step": 1967 + }, + { + "epoch": 0.42, + "grad_norm": 0.13372862339019775, + "learning_rate": 6.454553264184041e-06, + "loss": 0.5259, + "step": 1968 + }, + { + "epoch": 0.42, + "grad_norm": 0.15754102170467377, + "learning_rate": 6.451214705643241e-06, + "loss": 0.5001, + "step": 1969 + }, + { + "epoch": 0.42, + "grad_norm": 0.17153845727443695, + "learning_rate": 6.447875440426319e-06, + "loss": 0.5492, + "step": 1970 + }, + { + "epoch": 0.42, + "grad_norm": 0.1674170047044754, + "learning_rate": 6.444535470159346e-06, + "loss": 0.5032, + "step": 1971 + }, + { + "epoch": 0.42, + "grad_norm": 0.12836651504039764, + "learning_rate": 6.441194796468739e-06, + "loss": 0.4732, + "step": 1972 + }, + { + "epoch": 0.43, + "grad_norm": 0.1573239266872406, + "learning_rate": 6.437853420981254e-06, + "loss": 0.4972, + "step": 1973 + }, + { + "epoch": 0.43, + "grad_norm": 0.18357399106025696, + "learning_rate": 6.434511345323988e-06, + "loss": 0.5351, + "step": 1974 + }, + { + "epoch": 0.43, + "grad_norm": 0.14846058189868927, + "learning_rate": 6.431168571124387e-06, + "loss": 0.4689, + "step": 1975 + }, + { + "epoch": 0.43, + "grad_norm": 0.14961528778076172, + "learning_rate": 6.427825100010225e-06, + "loss": 0.5394, + "step": 1976 + }, + { + "epoch": 0.43, + "grad_norm": 0.2099412977695465, + "learning_rate": 6.424480933609626e-06, + "loss": 0.5802, + "step": 1977 + }, + { + "epoch": 0.43, + "grad_norm": 0.1339603066444397, + "learning_rate": 6.421136073551047e-06, + "loss": 0.499, + "step": 1978 + }, + { + "epoch": 0.43, + "grad_norm": 0.1474086493253708, + "learning_rate": 6.417790521463282e-06, + "loss": 0.511, + "step": 1979 + }, + { + "epoch": 0.43, + "grad_norm": 0.13013140857219696, + "learning_rate": 6.414444278975465e-06, + "loss": 0.5215, + "step": 1980 + }, + { + "epoch": 0.43, + "grad_norm": 0.14738723635673523, + "learning_rate": 6.411097347717068e-06, + "loss": 0.5079, + "step": 1981 + }, + { + "epoch": 0.43, + "grad_norm": 0.18411760032176971, + "learning_rate": 6.407749729317892e-06, + "loss": 0.51, + "step": 1982 + }, + { + "epoch": 0.43, + "grad_norm": 0.15733294188976288, + "learning_rate": 6.404401425408079e-06, + "loss": 0.5247, + "step": 1983 + }, + { + "epoch": 0.43, + "grad_norm": 0.1328936368227005, + "learning_rate": 6.401052437618098e-06, + "loss": 0.5223, + "step": 1984 + }, + { + "epoch": 0.43, + "grad_norm": 0.15146395564079285, + "learning_rate": 6.397702767578761e-06, + "loss": 0.5676, + "step": 1985 + }, + { + "epoch": 0.43, + "grad_norm": 0.1269007921218872, + "learning_rate": 6.394352416921201e-06, + "loss": 0.47, + "step": 1986 + }, + { + "epoch": 0.43, + "grad_norm": 0.15181781351566315, + "learning_rate": 6.39100138727689e-06, + "loss": 0.56, + "step": 1987 + }, + { + "epoch": 0.43, + "grad_norm": 0.1406852751970291, + "learning_rate": 6.387649680277629e-06, + "loss": 0.5753, + "step": 1988 + }, + { + "epoch": 0.43, + "grad_norm": 0.2074470818042755, + "learning_rate": 6.384297297555546e-06, + "loss": 0.528, + "step": 1989 + }, + { + "epoch": 0.43, + "grad_norm": 0.15589666366577148, + "learning_rate": 6.380944240743101e-06, + "loss": 0.5103, + "step": 1990 + }, + { + "epoch": 0.43, + "grad_norm": 0.156142920255661, + "learning_rate": 6.377590511473083e-06, + "loss": 0.5082, + "step": 1991 + }, + { + "epoch": 0.43, + "grad_norm": 0.18364138901233673, + "learning_rate": 6.374236111378605e-06, + "loss": 0.5319, + "step": 1992 + }, + { + "epoch": 0.43, + "grad_norm": 0.13717058300971985, + "learning_rate": 6.37088104209311e-06, + "loss": 0.5207, + "step": 1993 + }, + { + "epoch": 0.43, + "grad_norm": 0.1605088859796524, + "learning_rate": 6.3675253052503645e-06, + "loss": 0.4823, + "step": 1994 + }, + { + "epoch": 0.43, + "grad_norm": 0.13547933101654053, + "learning_rate": 6.364168902484461e-06, + "loss": 0.5081, + "step": 1995 + }, + { + "epoch": 0.43, + "grad_norm": 0.1631360799074173, + "learning_rate": 6.360811835429817e-06, + "loss": 0.5494, + "step": 1996 + }, + { + "epoch": 0.43, + "grad_norm": 0.15566737949848175, + "learning_rate": 6.357454105721171e-06, + "loss": 0.5708, + "step": 1997 + }, + { + "epoch": 0.43, + "grad_norm": 0.18726012110710144, + "learning_rate": 6.35409571499359e-06, + "loss": 0.524, + "step": 1998 + }, + { + "epoch": 0.43, + "grad_norm": 0.18683874607086182, + "learning_rate": 6.350736664882454e-06, + "loss": 0.477, + "step": 1999 + }, + { + "epoch": 0.43, + "grad_norm": 0.15933635830879211, + "learning_rate": 6.347376957023471e-06, + "loss": 0.5524, + "step": 2000 + }, + { + "epoch": 0.43, + "grad_norm": 0.16675737500190735, + "learning_rate": 6.344016593052669e-06, + "loss": 0.5126, + "step": 2001 + }, + { + "epoch": 0.43, + "grad_norm": 0.22275328636169434, + "learning_rate": 6.340655574606391e-06, + "loss": 0.5203, + "step": 2002 + }, + { + "epoch": 0.43, + "grad_norm": 0.1311800628900528, + "learning_rate": 6.337293903321303e-06, + "loss": 0.5132, + "step": 2003 + }, + { + "epoch": 0.43, + "grad_norm": 0.12225490063428879, + "learning_rate": 6.333931580834387e-06, + "loss": 0.5529, + "step": 2004 + }, + { + "epoch": 0.43, + "grad_norm": 0.14834477007389069, + "learning_rate": 6.330568608782941e-06, + "loss": 0.5045, + "step": 2005 + }, + { + "epoch": 0.43, + "grad_norm": 0.13984233140945435, + "learning_rate": 6.327204988804583e-06, + "loss": 0.5398, + "step": 2006 + }, + { + "epoch": 0.43, + "grad_norm": 0.13225583732128143, + "learning_rate": 6.323840722537243e-06, + "loss": 0.5065, + "step": 2007 + }, + { + "epoch": 0.43, + "grad_norm": 0.16569088399410248, + "learning_rate": 6.320475811619167e-06, + "loss": 0.529, + "step": 2008 + }, + { + "epoch": 0.43, + "grad_norm": 0.20376458764076233, + "learning_rate": 6.317110257688917e-06, + "loss": 0.47, + "step": 2009 + }, + { + "epoch": 0.43, + "grad_norm": 0.20211917161941528, + "learning_rate": 6.313744062385363e-06, + "loss": 0.5044, + "step": 2010 + }, + { + "epoch": 0.43, + "grad_norm": 0.1894192099571228, + "learning_rate": 6.31037722734769e-06, + "loss": 0.574, + "step": 2011 + }, + { + "epoch": 0.43, + "grad_norm": 0.14667464792728424, + "learning_rate": 6.307009754215397e-06, + "loss": 0.5502, + "step": 2012 + }, + { + "epoch": 0.43, + "grad_norm": 0.17428962886333466, + "learning_rate": 6.303641644628291e-06, + "loss": 0.5423, + "step": 2013 + }, + { + "epoch": 0.43, + "grad_norm": 0.1584947109222412, + "learning_rate": 6.300272900226491e-06, + "loss": 0.4784, + "step": 2014 + }, + { + "epoch": 0.43, + "grad_norm": 0.14651672542095184, + "learning_rate": 6.296903522650419e-06, + "loss": 0.4896, + "step": 2015 + }, + { + "epoch": 0.43, + "grad_norm": 0.13722088932991028, + "learning_rate": 6.2935335135408135e-06, + "loss": 0.4324, + "step": 2016 + }, + { + "epoch": 0.43, + "grad_norm": 0.16364432871341705, + "learning_rate": 6.290162874538718e-06, + "loss": 0.5051, + "step": 2017 + }, + { + "epoch": 0.43, + "grad_norm": 0.17197778820991516, + "learning_rate": 6.286791607285478e-06, + "loss": 0.4707, + "step": 2018 + }, + { + "epoch": 0.43, + "grad_norm": 0.19821661710739136, + "learning_rate": 6.283419713422754e-06, + "loss": 0.5365, + "step": 2019 + }, + { + "epoch": 0.44, + "grad_norm": 0.18750454485416412, + "learning_rate": 6.2800471945925e-06, + "loss": 0.5813, + "step": 2020 + }, + { + "epoch": 0.44, + "grad_norm": 0.15011686086654663, + "learning_rate": 6.276674052436984e-06, + "loss": 0.4686, + "step": 2021 + }, + { + "epoch": 0.44, + "grad_norm": 0.14810575544834137, + "learning_rate": 6.2733002885987734e-06, + "loss": 0.5666, + "step": 2022 + }, + { + "epoch": 0.44, + "grad_norm": 0.15707622468471527, + "learning_rate": 6.26992590472074e-06, + "loss": 0.4939, + "step": 2023 + }, + { + "epoch": 0.44, + "grad_norm": 0.16260173916816711, + "learning_rate": 6.2665509024460554e-06, + "loss": 0.5063, + "step": 2024 + }, + { + "epoch": 0.44, + "grad_norm": 0.14994855225086212, + "learning_rate": 6.263175283418196e-06, + "loss": 0.4813, + "step": 2025 + }, + { + "epoch": 0.44, + "grad_norm": 0.16885532438755035, + "learning_rate": 6.259799049280932e-06, + "loss": 0.5239, + "step": 2026 + }, + { + "epoch": 0.44, + "grad_norm": 0.1944415271282196, + "learning_rate": 6.256422201678341e-06, + "loss": 0.4999, + "step": 2027 + }, + { + "epoch": 0.44, + "grad_norm": 0.16358405351638794, + "learning_rate": 6.253044742254791e-06, + "loss": 0.532, + "step": 2028 + }, + { + "epoch": 0.44, + "grad_norm": 0.184137225151062, + "learning_rate": 6.249666672654958e-06, + "loss": 0.4797, + "step": 2029 + }, + { + "epoch": 0.44, + "grad_norm": 0.18166375160217285, + "learning_rate": 6.246287994523805e-06, + "loss": 0.5129, + "step": 2030 + }, + { + "epoch": 0.44, + "grad_norm": 0.13478122651576996, + "learning_rate": 6.242908709506599e-06, + "loss": 0.4996, + "step": 2031 + }, + { + "epoch": 0.44, + "grad_norm": 0.1508375108242035, + "learning_rate": 6.239528819248898e-06, + "loss": 0.4822, + "step": 2032 + }, + { + "epoch": 0.44, + "grad_norm": 0.14239796996116638, + "learning_rate": 6.236148325396555e-06, + "loss": 0.5381, + "step": 2033 + }, + { + "epoch": 0.44, + "grad_norm": 0.13590578734874725, + "learning_rate": 6.232767229595719e-06, + "loss": 0.5076, + "step": 2034 + }, + { + "epoch": 0.44, + "grad_norm": 0.1495681256055832, + "learning_rate": 6.229385533492833e-06, + "loss": 0.5012, + "step": 2035 + }, + { + "epoch": 0.44, + "grad_norm": 0.11667856574058533, + "learning_rate": 6.226003238734628e-06, + "loss": 0.5408, + "step": 2036 + }, + { + "epoch": 0.44, + "grad_norm": 0.12598071992397308, + "learning_rate": 6.222620346968131e-06, + "loss": 0.4822, + "step": 2037 + }, + { + "epoch": 0.44, + "grad_norm": 0.18622703850269318, + "learning_rate": 6.219236859840656e-06, + "loss": 0.5583, + "step": 2038 + }, + { + "epoch": 0.44, + "grad_norm": 0.15623895823955536, + "learning_rate": 6.21585277899981e-06, + "loss": 0.5272, + "step": 2039 + }, + { + "epoch": 0.44, + "grad_norm": 0.15245303511619568, + "learning_rate": 6.2124681060934866e-06, + "loss": 0.5504, + "step": 2040 + }, + { + "epoch": 0.44, + "grad_norm": 0.2059142142534256, + "learning_rate": 6.2090828427698706e-06, + "loss": 0.5196, + "step": 2041 + }, + { + "epoch": 0.44, + "grad_norm": 0.14754840731620789, + "learning_rate": 6.205696990677431e-06, + "loss": 0.5198, + "step": 2042 + }, + { + "epoch": 0.44, + "grad_norm": 0.14195892214775085, + "learning_rate": 6.202310551464924e-06, + "loss": 0.523, + "step": 2043 + }, + { + "epoch": 0.44, + "grad_norm": 0.17063148319721222, + "learning_rate": 6.1989235267813964e-06, + "loss": 0.5115, + "step": 2044 + }, + { + "epoch": 0.44, + "grad_norm": 0.1315128356218338, + "learning_rate": 6.1955359182761745e-06, + "loss": 0.5535, + "step": 2045 + }, + { + "epoch": 0.44, + "grad_norm": 0.26819273829460144, + "learning_rate": 6.192147727598869e-06, + "loss": 0.4942, + "step": 2046 + }, + { + "epoch": 0.44, + "grad_norm": 0.15203434228897095, + "learning_rate": 6.188758956399379e-06, + "loss": 0.5349, + "step": 2047 + }, + { + "epoch": 0.44, + "grad_norm": 0.17396771907806396, + "learning_rate": 6.185369606327882e-06, + "loss": 0.5134, + "step": 2048 + }, + { + "epoch": 0.44, + "grad_norm": 0.14054559171199799, + "learning_rate": 6.1819796790348376e-06, + "loss": 0.5346, + "step": 2049 + }, + { + "epoch": 0.44, + "grad_norm": 0.13480958342552185, + "learning_rate": 6.178589176170991e-06, + "loss": 0.4995, + "step": 2050 + }, + { + "epoch": 0.44, + "grad_norm": 0.15606021881103516, + "learning_rate": 6.175198099387361e-06, + "loss": 0.5519, + "step": 2051 + }, + { + "epoch": 0.44, + "grad_norm": 0.14711807668209076, + "learning_rate": 6.171806450335248e-06, + "loss": 0.5303, + "step": 2052 + }, + { + "epoch": 0.44, + "grad_norm": 0.18359160423278809, + "learning_rate": 6.1684142306662366e-06, + "loss": 0.5784, + "step": 2053 + }, + { + "epoch": 0.44, + "grad_norm": 0.15108604729175568, + "learning_rate": 6.16502144203218e-06, + "loss": 0.5499, + "step": 2054 + }, + { + "epoch": 0.44, + "grad_norm": 0.12765131890773773, + "learning_rate": 6.161628086085218e-06, + "loss": 0.5531, + "step": 2055 + }, + { + "epoch": 0.44, + "grad_norm": 0.18855132162570953, + "learning_rate": 6.1582341644777575e-06, + "loss": 0.5236, + "step": 2056 + }, + { + "epoch": 0.44, + "grad_norm": 0.14612235128879547, + "learning_rate": 6.15483967886249e-06, + "loss": 0.5035, + "step": 2057 + }, + { + "epoch": 0.44, + "grad_norm": 0.1928872913122177, + "learning_rate": 6.151444630892372e-06, + "loss": 0.541, + "step": 2058 + }, + { + "epoch": 0.44, + "grad_norm": 0.16574794054031372, + "learning_rate": 6.1480490222206415e-06, + "loss": 0.5139, + "step": 2059 + }, + { + "epoch": 0.44, + "grad_norm": 0.17566706240177155, + "learning_rate": 6.144652854500806e-06, + "loss": 0.4495, + "step": 2060 + }, + { + "epoch": 0.44, + "grad_norm": 0.17141076922416687, + "learning_rate": 6.1412561293866455e-06, + "loss": 0.5434, + "step": 2061 + }, + { + "epoch": 0.44, + "grad_norm": 0.16970355808734894, + "learning_rate": 6.1378588485322145e-06, + "loss": 0.5635, + "step": 2062 + }, + { + "epoch": 0.44, + "grad_norm": 0.20742008090019226, + "learning_rate": 6.134461013591832e-06, + "loss": 0.5435, + "step": 2063 + }, + { + "epoch": 0.44, + "grad_norm": 0.1773451417684555, + "learning_rate": 6.131062626220094e-06, + "loss": 0.5539, + "step": 2064 + }, + { + "epoch": 0.44, + "grad_norm": 0.18251217901706696, + "learning_rate": 6.127663688071859e-06, + "loss": 0.5046, + "step": 2065 + }, + { + "epoch": 0.45, + "grad_norm": 0.19838100671768188, + "learning_rate": 6.124264200802259e-06, + "loss": 0.4714, + "step": 2066 + }, + { + "epoch": 0.45, + "grad_norm": 0.154763326048851, + "learning_rate": 6.120864166066689e-06, + "loss": 0.528, + "step": 2067 + }, + { + "epoch": 0.45, + "grad_norm": 0.19701255857944489, + "learning_rate": 6.117463585520813e-06, + "loss": 0.5295, + "step": 2068 + }, + { + "epoch": 0.45, + "grad_norm": 0.17150332033634186, + "learning_rate": 6.1140624608205626e-06, + "loss": 0.4896, + "step": 2069 + }, + { + "epoch": 0.45, + "grad_norm": 0.1474120020866394, + "learning_rate": 6.110660793622127e-06, + "loss": 0.5046, + "step": 2070 + }, + { + "epoch": 0.45, + "grad_norm": 0.18776945769786835, + "learning_rate": 6.10725858558197e-06, + "loss": 0.5263, + "step": 2071 + }, + { + "epoch": 0.45, + "grad_norm": 0.14684580266475677, + "learning_rate": 6.103855838356813e-06, + "loss": 0.5539, + "step": 2072 + }, + { + "epoch": 0.45, + "grad_norm": 0.12644240260124207, + "learning_rate": 6.100452553603638e-06, + "loss": 0.5047, + "step": 2073 + }, + { + "epoch": 0.45, + "grad_norm": 0.18356040120124817, + "learning_rate": 6.097048732979691e-06, + "loss": 0.5408, + "step": 2074 + }, + { + "epoch": 0.45, + "grad_norm": 0.13573047518730164, + "learning_rate": 6.093644378142481e-06, + "loss": 0.5369, + "step": 2075 + }, + { + "epoch": 0.45, + "grad_norm": 0.1704436093568802, + "learning_rate": 6.090239490749775e-06, + "loss": 0.4905, + "step": 2076 + }, + { + "epoch": 0.45, + "grad_norm": 0.1508268564939499, + "learning_rate": 6.086834072459599e-06, + "loss": 0.5288, + "step": 2077 + }, + { + "epoch": 0.45, + "grad_norm": 0.17939120531082153, + "learning_rate": 6.083428124930239e-06, + "loss": 0.5089, + "step": 2078 + }, + { + "epoch": 0.45, + "grad_norm": 0.1567559689283371, + "learning_rate": 6.080021649820238e-06, + "loss": 0.4933, + "step": 2079 + }, + { + "epoch": 0.45, + "grad_norm": 0.1430431753396988, + "learning_rate": 6.076614648788392e-06, + "loss": 0.5396, + "step": 2080 + }, + { + "epoch": 0.45, + "grad_norm": 0.15456099808216095, + "learning_rate": 6.073207123493763e-06, + "loss": 0.4786, + "step": 2081 + }, + { + "epoch": 0.45, + "grad_norm": 0.17080536484718323, + "learning_rate": 6.069799075595658e-06, + "loss": 0.5233, + "step": 2082 + }, + { + "epoch": 0.45, + "grad_norm": 0.13564909994602203, + "learning_rate": 6.066390506753644e-06, + "loss": 0.5682, + "step": 2083 + }, + { + "epoch": 0.45, + "grad_norm": 0.15913358330726624, + "learning_rate": 6.062981418627539e-06, + "loss": 0.5222, + "step": 2084 + }, + { + "epoch": 0.45, + "grad_norm": 0.16424204409122467, + "learning_rate": 6.059571812877419e-06, + "loss": 0.5062, + "step": 2085 + }, + { + "epoch": 0.45, + "grad_norm": 0.16678033769130707, + "learning_rate": 6.0561616911636025e-06, + "loss": 0.5138, + "step": 2086 + }, + { + "epoch": 0.45, + "grad_norm": 0.15992575883865356, + "learning_rate": 6.052751055146669e-06, + "loss": 0.5199, + "step": 2087 + }, + { + "epoch": 0.45, + "grad_norm": 0.18692535161972046, + "learning_rate": 6.049339906487443e-06, + "loss": 0.5434, + "step": 2088 + }, + { + "epoch": 0.45, + "grad_norm": 0.13587631285190582, + "learning_rate": 6.045928246847003e-06, + "loss": 0.5013, + "step": 2089 + }, + { + "epoch": 0.45, + "grad_norm": 0.20116516947746277, + "learning_rate": 6.042516077886669e-06, + "loss": 0.5329, + "step": 2090 + }, + { + "epoch": 0.45, + "grad_norm": 0.13471555709838867, + "learning_rate": 6.039103401268016e-06, + "loss": 0.4862, + "step": 2091 + }, + { + "epoch": 0.45, + "grad_norm": 0.15407685935497284, + "learning_rate": 6.035690218652861e-06, + "loss": 0.6036, + "step": 2092 + }, + { + "epoch": 0.45, + "grad_norm": 0.14876054227352142, + "learning_rate": 6.032276531703274e-06, + "loss": 0.4963, + "step": 2093 + }, + { + "epoch": 0.45, + "grad_norm": 0.16624298691749573, + "learning_rate": 6.028862342081564e-06, + "loss": 0.5164, + "step": 2094 + }, + { + "epoch": 0.45, + "grad_norm": 0.15883252024650574, + "learning_rate": 6.025447651450289e-06, + "loss": 0.5082, + "step": 2095 + }, + { + "epoch": 0.45, + "grad_norm": 0.1502091884613037, + "learning_rate": 6.022032461472247e-06, + "loss": 0.5722, + "step": 2096 + }, + { + "epoch": 0.45, + "grad_norm": 0.1553240269422531, + "learning_rate": 6.018616773810483e-06, + "loss": 0.5173, + "step": 2097 + }, + { + "epoch": 0.45, + "grad_norm": 0.15653330087661743, + "learning_rate": 6.015200590128284e-06, + "loss": 0.5355, + "step": 2098 + }, + { + "epoch": 0.45, + "grad_norm": 0.1457417756319046, + "learning_rate": 6.011783912089174e-06, + "loss": 0.5205, + "step": 2099 + }, + { + "epoch": 0.45, + "grad_norm": 0.13138940930366516, + "learning_rate": 6.008366741356926e-06, + "loss": 0.5424, + "step": 2100 + }, + { + "epoch": 0.45, + "grad_norm": 0.15823757648468018, + "learning_rate": 6.004949079595544e-06, + "loss": 0.5272, + "step": 2101 + }, + { + "epoch": 0.45, + "grad_norm": 0.17084883153438568, + "learning_rate": 6.001530928469277e-06, + "loss": 0.5291, + "step": 2102 + }, + { + "epoch": 0.45, + "grad_norm": 0.14622004330158234, + "learning_rate": 5.998112289642608e-06, + "loss": 0.498, + "step": 2103 + }, + { + "epoch": 0.45, + "grad_norm": 0.1439567655324936, + "learning_rate": 5.9946931647802645e-06, + "loss": 0.5381, + "step": 2104 + }, + { + "epoch": 0.45, + "grad_norm": 0.23978291451931, + "learning_rate": 5.9912735555472015e-06, + "loss": 0.5141, + "step": 2105 + }, + { + "epoch": 0.45, + "grad_norm": 0.14025025069713593, + "learning_rate": 5.987853463608618e-06, + "loss": 0.4712, + "step": 2106 + }, + { + "epoch": 0.45, + "grad_norm": 0.16210734844207764, + "learning_rate": 5.984432890629943e-06, + "loss": 0.5103, + "step": 2107 + }, + { + "epoch": 0.45, + "grad_norm": 0.17586356401443481, + "learning_rate": 5.981011838276841e-06, + "loss": 0.5507, + "step": 2108 + }, + { + "epoch": 0.45, + "grad_norm": 0.1554114818572998, + "learning_rate": 5.977590308215211e-06, + "loss": 0.5375, + "step": 2109 + }, + { + "epoch": 0.45, + "grad_norm": 0.14625568687915802, + "learning_rate": 5.974168302111183e-06, + "loss": 0.5195, + "step": 2110 + }, + { + "epoch": 0.45, + "grad_norm": 0.1564107984304428, + "learning_rate": 5.970745821631121e-06, + "loss": 0.5006, + "step": 2111 + }, + { + "epoch": 0.45, + "grad_norm": 0.1529904454946518, + "learning_rate": 5.967322868441616e-06, + "loss": 0.5455, + "step": 2112 + }, + { + "epoch": 0.46, + "grad_norm": 0.16919173300266266, + "learning_rate": 5.963899444209496e-06, + "loss": 0.5323, + "step": 2113 + }, + { + "epoch": 0.46, + "grad_norm": 0.2237899899482727, + "learning_rate": 5.9604755506018105e-06, + "loss": 0.5153, + "step": 2114 + }, + { + "epoch": 0.46, + "grad_norm": 0.17237022519111633, + "learning_rate": 5.957051189285843e-06, + "loss": 0.5237, + "step": 2115 + }, + { + "epoch": 0.46, + "grad_norm": 0.18111760914325714, + "learning_rate": 5.953626361929102e-06, + "loss": 0.492, + "step": 2116 + }, + { + "epoch": 0.46, + "grad_norm": 0.13480786979198456, + "learning_rate": 5.950201070199326e-06, + "loss": 0.4827, + "step": 2117 + }, + { + "epoch": 0.46, + "grad_norm": 0.17693190276622772, + "learning_rate": 5.946775315764476e-06, + "loss": 0.5592, + "step": 2118 + }, + { + "epoch": 0.46, + "grad_norm": 0.13885067403316498, + "learning_rate": 5.943349100292739e-06, + "loss": 0.497, + "step": 2119 + }, + { + "epoch": 0.46, + "grad_norm": 0.1679374873638153, + "learning_rate": 5.939922425452531e-06, + "loss": 0.5045, + "step": 2120 + }, + { + "epoch": 0.46, + "grad_norm": 0.1675940304994583, + "learning_rate": 5.936495292912483e-06, + "loss": 0.5518, + "step": 2121 + }, + { + "epoch": 0.46, + "grad_norm": 0.16924212872982025, + "learning_rate": 5.93306770434146e-06, + "loss": 0.5481, + "step": 2122 + }, + { + "epoch": 0.46, + "grad_norm": 0.21032755076885223, + "learning_rate": 5.929639661408538e-06, + "loss": 0.4816, + "step": 2123 + }, + { + "epoch": 0.46, + "grad_norm": 0.11854084581136703, + "learning_rate": 5.926211165783021e-06, + "loss": 0.5009, + "step": 2124 + }, + { + "epoch": 0.46, + "grad_norm": 0.13082769513130188, + "learning_rate": 5.922782219134433e-06, + "loss": 0.4822, + "step": 2125 + }, + { + "epoch": 0.46, + "grad_norm": 0.1662750393152237, + "learning_rate": 5.919352823132515e-06, + "loss": 0.5262, + "step": 2126 + }, + { + "epoch": 0.46, + "grad_norm": 0.1488747000694275, + "learning_rate": 5.915922979447228e-06, + "loss": 0.5553, + "step": 2127 + }, + { + "epoch": 0.46, + "grad_norm": 0.1871393918991089, + "learning_rate": 5.912492689748753e-06, + "loss": 0.4965, + "step": 2128 + }, + { + "epoch": 0.46, + "grad_norm": 0.18025460839271545, + "learning_rate": 5.909061955707486e-06, + "loss": 0.531, + "step": 2129 + }, + { + "epoch": 0.46, + "grad_norm": 0.1580578088760376, + "learning_rate": 5.905630778994036e-06, + "loss": 0.5089, + "step": 2130 + }, + { + "epoch": 0.46, + "grad_norm": 0.16995598375797272, + "learning_rate": 5.902199161279236e-06, + "loss": 0.5137, + "step": 2131 + }, + { + "epoch": 0.46, + "grad_norm": 0.14344586431980133, + "learning_rate": 5.898767104234128e-06, + "loss": 0.5051, + "step": 2132 + }, + { + "epoch": 0.46, + "grad_norm": 0.1728695183992386, + "learning_rate": 5.895334609529967e-06, + "loss": 0.509, + "step": 2133 + }, + { + "epoch": 0.46, + "grad_norm": 0.13887768983840942, + "learning_rate": 5.891901678838227e-06, + "loss": 0.4838, + "step": 2134 + }, + { + "epoch": 0.46, + "grad_norm": 0.18018049001693726, + "learning_rate": 5.8884683138305854e-06, + "loss": 0.5273, + "step": 2135 + }, + { + "epoch": 0.46, + "grad_norm": 0.15605993568897247, + "learning_rate": 5.88503451617894e-06, + "loss": 0.4847, + "step": 2136 + }, + { + "epoch": 0.46, + "grad_norm": 0.14139895141124725, + "learning_rate": 5.881600287555393e-06, + "loss": 0.4769, + "step": 2137 + }, + { + "epoch": 0.46, + "grad_norm": 0.15375615656375885, + "learning_rate": 5.878165629632262e-06, + "loss": 0.5479, + "step": 2138 + }, + { + "epoch": 0.46, + "grad_norm": 0.16424569487571716, + "learning_rate": 5.874730544082069e-06, + "loss": 0.5337, + "step": 2139 + }, + { + "epoch": 0.46, + "grad_norm": 0.20334842801094055, + "learning_rate": 5.8712950325775416e-06, + "loss": 0.5627, + "step": 2140 + }, + { + "epoch": 0.46, + "grad_norm": 0.13510531187057495, + "learning_rate": 5.867859096791626e-06, + "loss": 0.4906, + "step": 2141 + }, + { + "epoch": 0.46, + "grad_norm": 0.158345028758049, + "learning_rate": 5.864422738397465e-06, + "loss": 0.5136, + "step": 2142 + }, + { + "epoch": 0.46, + "grad_norm": 0.1618645340204239, + "learning_rate": 5.860985959068408e-06, + "loss": 0.4867, + "step": 2143 + }, + { + "epoch": 0.46, + "grad_norm": 0.1342993676662445, + "learning_rate": 5.857548760478015e-06, + "loss": 0.5011, + "step": 2144 + }, + { + "epoch": 0.46, + "grad_norm": 0.14608271420001984, + "learning_rate": 5.8541111443000455e-06, + "loss": 0.4916, + "step": 2145 + }, + { + "epoch": 0.46, + "grad_norm": 0.1352057158946991, + "learning_rate": 5.85067311220846e-06, + "loss": 0.5195, + "step": 2146 + }, + { + "epoch": 0.46, + "grad_norm": 0.1447547972202301, + "learning_rate": 5.847234665877432e-06, + "loss": 0.4918, + "step": 2147 + }, + { + "epoch": 0.46, + "grad_norm": 0.17679902911186218, + "learning_rate": 5.843795806981325e-06, + "loss": 0.5345, + "step": 2148 + }, + { + "epoch": 0.46, + "grad_norm": 0.1902516484260559, + "learning_rate": 5.840356537194708e-06, + "loss": 0.5343, + "step": 2149 + }, + { + "epoch": 0.46, + "grad_norm": 0.19225680828094482, + "learning_rate": 5.836916858192353e-06, + "loss": 0.4972, + "step": 2150 + }, + { + "epoch": 0.46, + "grad_norm": 0.17341876029968262, + "learning_rate": 5.833476771649227e-06, + "loss": 0.5002, + "step": 2151 + }, + { + "epoch": 0.46, + "grad_norm": 0.149870827794075, + "learning_rate": 5.830036279240497e-06, + "loss": 0.5484, + "step": 2152 + }, + { + "epoch": 0.46, + "grad_norm": 0.1548566222190857, + "learning_rate": 5.826595382641529e-06, + "loss": 0.5553, + "step": 2153 + }, + { + "epoch": 0.46, + "grad_norm": 0.16744022071361542, + "learning_rate": 5.823154083527884e-06, + "loss": 0.5416, + "step": 2154 + }, + { + "epoch": 0.46, + "grad_norm": 0.18139050900936127, + "learning_rate": 5.819712383575316e-06, + "loss": 0.5225, + "step": 2155 + }, + { + "epoch": 0.46, + "grad_norm": 0.16486258804798126, + "learning_rate": 5.816270284459783e-06, + "loss": 0.4938, + "step": 2156 + }, + { + "epoch": 0.46, + "grad_norm": 0.15385212004184723, + "learning_rate": 5.812827787857428e-06, + "loss": 0.562, + "step": 2157 + }, + { + "epoch": 0.46, + "grad_norm": 0.17840281128883362, + "learning_rate": 5.809384895444594e-06, + "loss": 0.487, + "step": 2158 + }, + { + "epoch": 0.47, + "grad_norm": 0.16368557512760162, + "learning_rate": 5.805941608897814e-06, + "loss": 0.4991, + "step": 2159 + }, + { + "epoch": 0.47, + "grad_norm": 0.22969526052474976, + "learning_rate": 5.802497929893813e-06, + "loss": 0.4751, + "step": 2160 + }, + { + "epoch": 0.47, + "grad_norm": 0.21182815730571747, + "learning_rate": 5.799053860109506e-06, + "loss": 0.5603, + "step": 2161 + }, + { + "epoch": 0.47, + "grad_norm": 0.16508375108242035, + "learning_rate": 5.795609401222001e-06, + "loss": 0.5308, + "step": 2162 + }, + { + "epoch": 0.47, + "grad_norm": 0.3659750521183014, + "learning_rate": 5.7921645549085955e-06, + "loss": 0.5229, + "step": 2163 + }, + { + "epoch": 0.47, + "grad_norm": 0.15634752810001373, + "learning_rate": 5.7887193228467735e-06, + "loss": 0.5594, + "step": 2164 + }, + { + "epoch": 0.47, + "grad_norm": 0.15100319683551788, + "learning_rate": 5.785273706714205e-06, + "loss": 0.5619, + "step": 2165 + }, + { + "epoch": 0.47, + "grad_norm": 0.13537266850471497, + "learning_rate": 5.781827708188753e-06, + "loss": 0.5224, + "step": 2166 + }, + { + "epoch": 0.47, + "grad_norm": 0.16945107281208038, + "learning_rate": 5.778381328948461e-06, + "loss": 0.513, + "step": 2167 + }, + { + "epoch": 0.47, + "grad_norm": 0.1476183384656906, + "learning_rate": 5.774934570671562e-06, + "loss": 0.5124, + "step": 2168 + }, + { + "epoch": 0.47, + "grad_norm": 0.156847283244133, + "learning_rate": 5.771487435036472e-06, + "loss": 0.5185, + "step": 2169 + }, + { + "epoch": 0.47, + "grad_norm": 0.24519124627113342, + "learning_rate": 5.768039923721791e-06, + "loss": 0.5001, + "step": 2170 + }, + { + "epoch": 0.47, + "grad_norm": 0.19340813159942627, + "learning_rate": 5.764592038406298e-06, + "loss": 0.528, + "step": 2171 + }, + { + "epoch": 0.47, + "grad_norm": 0.16022874414920807, + "learning_rate": 5.761143780768962e-06, + "loss": 0.4961, + "step": 2172 + }, + { + "epoch": 0.47, + "grad_norm": 0.18600255250930786, + "learning_rate": 5.7576951524889245e-06, + "loss": 0.4908, + "step": 2173 + }, + { + "epoch": 0.47, + "grad_norm": 0.11501923948526382, + "learning_rate": 5.7542461552455165e-06, + "loss": 0.5403, + "step": 2174 + }, + { + "epoch": 0.47, + "grad_norm": 0.14986415207386017, + "learning_rate": 5.750796790718243e-06, + "loss": 0.5027, + "step": 2175 + }, + { + "epoch": 0.47, + "grad_norm": 0.13095037639141083, + "learning_rate": 5.747347060586787e-06, + "loss": 0.5339, + "step": 2176 + }, + { + "epoch": 0.47, + "grad_norm": 0.12488746643066406, + "learning_rate": 5.743896966531012e-06, + "loss": 0.5256, + "step": 2177 + }, + { + "epoch": 0.47, + "grad_norm": 0.1328728049993515, + "learning_rate": 5.740446510230959e-06, + "loss": 0.429, + "step": 2178 + }, + { + "epoch": 0.47, + "grad_norm": 0.13304339349269867, + "learning_rate": 5.736995693366847e-06, + "loss": 0.4621, + "step": 2179 + }, + { + "epoch": 0.47, + "grad_norm": 0.22455641627311707, + "learning_rate": 5.733544517619064e-06, + "loss": 0.5157, + "step": 2180 + }, + { + "epoch": 0.47, + "grad_norm": 0.13997776806354523, + "learning_rate": 5.730092984668179e-06, + "loss": 0.4909, + "step": 2181 + }, + { + "epoch": 0.47, + "grad_norm": 0.1835583746433258, + "learning_rate": 5.726641096194932e-06, + "loss": 0.4697, + "step": 2182 + }, + { + "epoch": 0.47, + "grad_norm": 0.1669677495956421, + "learning_rate": 5.723188853880238e-06, + "loss": 0.5484, + "step": 2183 + }, + { + "epoch": 0.47, + "grad_norm": 0.1625543087720871, + "learning_rate": 5.719736259405182e-06, + "loss": 0.4743, + "step": 2184 + }, + { + "epoch": 0.47, + "grad_norm": 0.15123441815376282, + "learning_rate": 5.716283314451026e-06, + "loss": 0.482, + "step": 2185 + }, + { + "epoch": 0.47, + "grad_norm": 0.16270317137241364, + "learning_rate": 5.7128300206991925e-06, + "loss": 0.4675, + "step": 2186 + }, + { + "epoch": 0.47, + "grad_norm": 0.1661555916070938, + "learning_rate": 5.709376379831283e-06, + "loss": 0.5076, + "step": 2187 + }, + { + "epoch": 0.47, + "grad_norm": 0.16409648954868317, + "learning_rate": 5.705922393529065e-06, + "loss": 0.5271, + "step": 2188 + }, + { + "epoch": 0.47, + "grad_norm": 0.14545123279094696, + "learning_rate": 5.702468063474473e-06, + "loss": 0.4966, + "step": 2189 + }, + { + "epoch": 0.47, + "grad_norm": 0.22827212512493134, + "learning_rate": 5.69901339134961e-06, + "loss": 0.4808, + "step": 2190 + }, + { + "epoch": 0.47, + "grad_norm": 0.1843656748533249, + "learning_rate": 5.695558378836749e-06, + "loss": 0.505, + "step": 2191 + }, + { + "epoch": 0.47, + "grad_norm": 0.19031104445457458, + "learning_rate": 5.692103027618321e-06, + "loss": 0.5571, + "step": 2192 + }, + { + "epoch": 0.47, + "grad_norm": 0.16894584894180298, + "learning_rate": 5.688647339376926e-06, + "loss": 0.5266, + "step": 2193 + }, + { + "epoch": 0.47, + "grad_norm": 0.14823244512081146, + "learning_rate": 5.685191315795331e-06, + "loss": 0.5572, + "step": 2194 + }, + { + "epoch": 0.47, + "grad_norm": 0.13419359922409058, + "learning_rate": 5.681734958556463e-06, + "loss": 0.5163, + "step": 2195 + }, + { + "epoch": 0.47, + "grad_norm": 0.18760497868061066, + "learning_rate": 5.678278269343411e-06, + "loss": 0.5218, + "step": 2196 + }, + { + "epoch": 0.47, + "grad_norm": 0.1401587277650833, + "learning_rate": 5.674821249839428e-06, + "loss": 0.4821, + "step": 2197 + }, + { + "epoch": 0.47, + "grad_norm": 0.15496966242790222, + "learning_rate": 5.671363901727927e-06, + "loss": 0.504, + "step": 2198 + }, + { + "epoch": 0.47, + "grad_norm": 0.17478565871715546, + "learning_rate": 5.667906226692479e-06, + "loss": 0.5252, + "step": 2199 + }, + { + "epoch": 0.47, + "grad_norm": 0.14033323526382446, + "learning_rate": 5.664448226416815e-06, + "loss": 0.5534, + "step": 2200 + }, + { + "epoch": 0.47, + "grad_norm": 0.23815791308879852, + "learning_rate": 5.660989902584829e-06, + "loss": 0.5357, + "step": 2201 + }, + { + "epoch": 0.47, + "grad_norm": 0.16176384687423706, + "learning_rate": 5.657531256880565e-06, + "loss": 0.5378, + "step": 2202 + }, + { + "epoch": 0.47, + "grad_norm": 0.20444779098033905, + "learning_rate": 5.654072290988231e-06, + "loss": 0.5905, + "step": 2203 + }, + { + "epoch": 0.47, + "grad_norm": 0.14830709993839264, + "learning_rate": 5.650613006592185e-06, + "loss": 0.5192, + "step": 2204 + }, + { + "epoch": 0.47, + "grad_norm": 0.2211901992559433, + "learning_rate": 5.647153405376942e-06, + "loss": 0.564, + "step": 2205 + }, + { + "epoch": 0.48, + "grad_norm": 0.15610624849796295, + "learning_rate": 5.643693489027172e-06, + "loss": 0.49, + "step": 2206 + }, + { + "epoch": 0.48, + "grad_norm": 0.13824397325515747, + "learning_rate": 5.6402332592277e-06, + "loss": 0.519, + "step": 2207 + }, + { + "epoch": 0.48, + "grad_norm": 0.18318380415439606, + "learning_rate": 5.636772717663501e-06, + "loss": 0.5294, + "step": 2208 + }, + { + "epoch": 0.48, + "grad_norm": 0.20423349738121033, + "learning_rate": 5.633311866019703e-06, + "loss": 0.5128, + "step": 2209 + }, + { + "epoch": 0.48, + "grad_norm": 0.14289386570453644, + "learning_rate": 5.629850705981584e-06, + "loss": 0.5008, + "step": 2210 + }, + { + "epoch": 0.48, + "grad_norm": 0.17370502650737762, + "learning_rate": 5.626389239234572e-06, + "loss": 0.5657, + "step": 2211 + }, + { + "epoch": 0.48, + "grad_norm": 0.1700432002544403, + "learning_rate": 5.622927467464247e-06, + "loss": 0.5137, + "step": 2212 + }, + { + "epoch": 0.48, + "grad_norm": 0.1566981077194214, + "learning_rate": 5.619465392356335e-06, + "loss": 0.5698, + "step": 2213 + }, + { + "epoch": 0.48, + "grad_norm": 0.166670560836792, + "learning_rate": 5.6160030155967116e-06, + "loss": 0.5272, + "step": 2214 + }, + { + "epoch": 0.48, + "grad_norm": 0.14587420225143433, + "learning_rate": 5.612540338871395e-06, + "loss": 0.5049, + "step": 2215 + }, + { + "epoch": 0.48, + "grad_norm": 0.14537444710731506, + "learning_rate": 5.609077363866555e-06, + "loss": 0.523, + "step": 2216 + }, + { + "epoch": 0.48, + "grad_norm": 0.15122370421886444, + "learning_rate": 5.605614092268506e-06, + "loss": 0.5304, + "step": 2217 + }, + { + "epoch": 0.48, + "grad_norm": 0.11322161555290222, + "learning_rate": 5.602150525763701e-06, + "loss": 0.5269, + "step": 2218 + }, + { + "epoch": 0.48, + "grad_norm": 0.1510639786720276, + "learning_rate": 5.598686666038745e-06, + "loss": 0.5668, + "step": 2219 + }, + { + "epoch": 0.48, + "grad_norm": 0.16219152510166168, + "learning_rate": 5.595222514780379e-06, + "loss": 0.5016, + "step": 2220 + }, + { + "epoch": 0.48, + "grad_norm": 0.14243803918361664, + "learning_rate": 5.591758073675485e-06, + "loss": 0.5398, + "step": 2221 + }, + { + "epoch": 0.48, + "grad_norm": 0.16937606036663055, + "learning_rate": 5.588293344411097e-06, + "loss": 0.5621, + "step": 2222 + }, + { + "epoch": 0.48, + "grad_norm": 0.15524210035800934, + "learning_rate": 5.5848283286743786e-06, + "loss": 0.5695, + "step": 2223 + }, + { + "epoch": 0.48, + "grad_norm": 0.1765149086713791, + "learning_rate": 5.581363028152633e-06, + "loss": 0.5126, + "step": 2224 + }, + { + "epoch": 0.48, + "grad_norm": 0.2328573763370514, + "learning_rate": 5.5778974445333115e-06, + "loss": 0.5701, + "step": 2225 + }, + { + "epoch": 0.48, + "grad_norm": 0.1344151794910431, + "learning_rate": 5.574431579503991e-06, + "loss": 0.5512, + "step": 2226 + }, + { + "epoch": 0.48, + "grad_norm": 0.14871002733707428, + "learning_rate": 5.570965434752396e-06, + "loss": 0.5196, + "step": 2227 + }, + { + "epoch": 0.48, + "grad_norm": 0.19491346180438995, + "learning_rate": 5.5674990119663794e-06, + "loss": 0.5809, + "step": 2228 + }, + { + "epoch": 0.48, + "grad_norm": 0.15575414896011353, + "learning_rate": 5.564032312833936e-06, + "loss": 0.5395, + "step": 2229 + }, + { + "epoch": 0.48, + "grad_norm": 0.25920212268829346, + "learning_rate": 5.560565339043188e-06, + "loss": 0.4677, + "step": 2230 + }, + { + "epoch": 0.48, + "grad_norm": 0.1457945555448532, + "learning_rate": 5.557098092282399e-06, + "loss": 0.5326, + "step": 2231 + }, + { + "epoch": 0.48, + "grad_norm": 0.13234636187553406, + "learning_rate": 5.55363057423996e-06, + "loss": 0.4859, + "step": 2232 + }, + { + "epoch": 0.48, + "grad_norm": 0.146928608417511, + "learning_rate": 5.550162786604397e-06, + "loss": 0.5834, + "step": 2233 + }, + { + "epoch": 0.48, + "grad_norm": 0.13184037804603577, + "learning_rate": 5.546694731064364e-06, + "loss": 0.5236, + "step": 2234 + }, + { + "epoch": 0.48, + "grad_norm": 0.2852530777454376, + "learning_rate": 5.5432264093086505e-06, + "loss": 0.5034, + "step": 2235 + }, + { + "epoch": 0.48, + "grad_norm": 0.15083038806915283, + "learning_rate": 5.5397578230261715e-06, + "loss": 0.5118, + "step": 2236 + }, + { + "epoch": 0.48, + "grad_norm": 0.1430756151676178, + "learning_rate": 5.536288973905971e-06, + "loss": 0.5202, + "step": 2237 + }, + { + "epoch": 0.48, + "grad_norm": 0.16797691583633423, + "learning_rate": 5.532819863637223e-06, + "loss": 0.5105, + "step": 2238 + }, + { + "epoch": 0.48, + "grad_norm": 0.15367530286312103, + "learning_rate": 5.529350493909229e-06, + "loss": 0.5178, + "step": 2239 + }, + { + "epoch": 0.48, + "grad_norm": 0.13238172233104706, + "learning_rate": 5.525880866411414e-06, + "loss": 0.5376, + "step": 2240 + }, + { + "epoch": 0.48, + "grad_norm": 0.17009180784225464, + "learning_rate": 5.522410982833331e-06, + "loss": 0.5508, + "step": 2241 + }, + { + "epoch": 0.48, + "grad_norm": 0.1846666783094406, + "learning_rate": 5.5189408448646565e-06, + "loss": 0.5625, + "step": 2242 + }, + { + "epoch": 0.48, + "grad_norm": 0.18193793296813965, + "learning_rate": 5.515470454195188e-06, + "loss": 0.4663, + "step": 2243 + }, + { + "epoch": 0.48, + "grad_norm": 0.15874691307544708, + "learning_rate": 5.511999812514857e-06, + "loss": 0.5035, + "step": 2244 + }, + { + "epoch": 0.48, + "grad_norm": 0.17099712789058685, + "learning_rate": 5.5085289215137035e-06, + "loss": 0.5301, + "step": 2245 + }, + { + "epoch": 0.48, + "grad_norm": 0.14446376264095306, + "learning_rate": 5.505057782881896e-06, + "loss": 0.4915, + "step": 2246 + }, + { + "epoch": 0.48, + "grad_norm": 0.3006593883037567, + "learning_rate": 5.501586398309724e-06, + "loss": 0.5032, + "step": 2247 + }, + { + "epoch": 0.48, + "grad_norm": 0.175115704536438, + "learning_rate": 5.4981147694875924e-06, + "loss": 0.5242, + "step": 2248 + }, + { + "epoch": 0.48, + "grad_norm": 0.14558811485767365, + "learning_rate": 5.494642898106029e-06, + "loss": 0.4991, + "step": 2249 + }, + { + "epoch": 0.48, + "grad_norm": 0.1611151546239853, + "learning_rate": 5.491170785855681e-06, + "loss": 0.5272, + "step": 2250 + }, + { + "epoch": 0.48, + "grad_norm": 0.15863467752933502, + "learning_rate": 5.4876984344273095e-06, + "loss": 0.5034, + "step": 2251 + }, + { + "epoch": 0.49, + "grad_norm": 0.1683708131313324, + "learning_rate": 5.484225845511791e-06, + "loss": 0.4884, + "step": 2252 + }, + { + "epoch": 0.49, + "grad_norm": 0.1344245821237564, + "learning_rate": 5.480753020800121e-06, + "loss": 0.5165, + "step": 2253 + }, + { + "epoch": 0.49, + "grad_norm": 0.1735605001449585, + "learning_rate": 5.477279961983408e-06, + "loss": 0.5519, + "step": 2254 + }, + { + "epoch": 0.49, + "grad_norm": 0.14727462828159332, + "learning_rate": 5.473806670752877e-06, + "loss": 0.4778, + "step": 2255 + }, + { + "epoch": 0.49, + "grad_norm": 0.1414579451084137, + "learning_rate": 5.470333148799862e-06, + "loss": 0.4707, + "step": 2256 + }, + { + "epoch": 0.49, + "grad_norm": 0.1338963657617569, + "learning_rate": 5.466859397815812e-06, + "loss": 0.5236, + "step": 2257 + }, + { + "epoch": 0.49, + "grad_norm": 0.1523580551147461, + "learning_rate": 5.463385419492288e-06, + "loss": 0.516, + "step": 2258 + }, + { + "epoch": 0.49, + "grad_norm": 0.17260035872459412, + "learning_rate": 5.459911215520959e-06, + "loss": 0.5188, + "step": 2259 + }, + { + "epoch": 0.49, + "grad_norm": 0.19136221706867218, + "learning_rate": 5.456436787593609e-06, + "loss": 0.4909, + "step": 2260 + }, + { + "epoch": 0.49, + "grad_norm": 0.17576466500759125, + "learning_rate": 5.452962137402125e-06, + "loss": 0.5374, + "step": 2261 + }, + { + "epoch": 0.49, + "grad_norm": 0.18410582840442657, + "learning_rate": 5.449487266638504e-06, + "loss": 0.5541, + "step": 2262 + }, + { + "epoch": 0.49, + "grad_norm": 0.15502192080020905, + "learning_rate": 5.446012176994854e-06, + "loss": 0.5411, + "step": 2263 + }, + { + "epoch": 0.49, + "grad_norm": 0.21357733011245728, + "learning_rate": 5.442536870163386e-06, + "loss": 0.5284, + "step": 2264 + }, + { + "epoch": 0.49, + "grad_norm": 0.15364959836006165, + "learning_rate": 5.439061347836416e-06, + "loss": 0.4631, + "step": 2265 + }, + { + "epoch": 0.49, + "grad_norm": 0.14856620132923126, + "learning_rate": 5.43558561170637e-06, + "loss": 0.5164, + "step": 2266 + }, + { + "epoch": 0.49, + "grad_norm": 0.13780789077281952, + "learning_rate": 5.432109663465773e-06, + "loss": 0.5108, + "step": 2267 + }, + { + "epoch": 0.49, + "grad_norm": 0.13712283968925476, + "learning_rate": 5.428633504807253e-06, + "loss": 0.4914, + "step": 2268 + }, + { + "epoch": 0.49, + "grad_norm": 0.1509259045124054, + "learning_rate": 5.425157137423548e-06, + "loss": 0.5178, + "step": 2269 + }, + { + "epoch": 0.49, + "grad_norm": 0.16157595813274384, + "learning_rate": 5.421680563007486e-06, + "loss": 0.5337, + "step": 2270 + }, + { + "epoch": 0.49, + "grad_norm": 0.17313942313194275, + "learning_rate": 5.418203783252005e-06, + "loss": 0.512, + "step": 2271 + }, + { + "epoch": 0.49, + "grad_norm": 0.1417136937379837, + "learning_rate": 5.414726799850141e-06, + "loss": 0.5123, + "step": 2272 + }, + { + "epoch": 0.49, + "grad_norm": 0.15452702343463898, + "learning_rate": 5.411249614495027e-06, + "loss": 0.5249, + "step": 2273 + }, + { + "epoch": 0.49, + "grad_norm": 0.17498227953910828, + "learning_rate": 5.407772228879894e-06, + "loss": 0.5008, + "step": 2274 + }, + { + "epoch": 0.49, + "grad_norm": 0.2232121229171753, + "learning_rate": 5.404294644698073e-06, + "loss": 0.5113, + "step": 2275 + }, + { + "epoch": 0.49, + "grad_norm": 0.11952576041221619, + "learning_rate": 5.400816863642991e-06, + "loss": 0.5147, + "step": 2276 + }, + { + "epoch": 0.49, + "grad_norm": 0.15340656042099, + "learning_rate": 5.397338887408171e-06, + "loss": 0.478, + "step": 2277 + }, + { + "epoch": 0.49, + "grad_norm": 0.1494847536087036, + "learning_rate": 5.393860717687231e-06, + "loss": 0.5173, + "step": 2278 + }, + { + "epoch": 0.49, + "grad_norm": 0.16914784908294678, + "learning_rate": 5.390382356173881e-06, + "loss": 0.4979, + "step": 2279 + }, + { + "epoch": 0.49, + "grad_norm": 0.10972032696008682, + "learning_rate": 5.3869038045619275e-06, + "loss": 0.5214, + "step": 2280 + }, + { + "epoch": 0.49, + "grad_norm": 0.1643581986427307, + "learning_rate": 5.383425064545267e-06, + "loss": 0.535, + "step": 2281 + }, + { + "epoch": 0.49, + "grad_norm": 0.1384391486644745, + "learning_rate": 5.379946137817891e-06, + "loss": 0.5034, + "step": 2282 + }, + { + "epoch": 0.49, + "grad_norm": 0.1642947793006897, + "learning_rate": 5.376467026073878e-06, + "loss": 0.5549, + "step": 2283 + }, + { + "epoch": 0.49, + "grad_norm": 0.15689925849437714, + "learning_rate": 5.3729877310073985e-06, + "loss": 0.5086, + "step": 2284 + }, + { + "epoch": 0.49, + "grad_norm": 0.17627274990081787, + "learning_rate": 5.369508254312715e-06, + "loss": 0.5223, + "step": 2285 + }, + { + "epoch": 0.49, + "grad_norm": 0.12727420032024384, + "learning_rate": 5.366028597684173e-06, + "loss": 0.5149, + "step": 2286 + }, + { + "epoch": 0.49, + "grad_norm": 0.15203452110290527, + "learning_rate": 5.362548762816209e-06, + "loss": 0.5713, + "step": 2287 + }, + { + "epoch": 0.49, + "grad_norm": 0.13790200650691986, + "learning_rate": 5.359068751403347e-06, + "loss": 0.545, + "step": 2288 + }, + { + "epoch": 0.49, + "grad_norm": 0.13259437680244446, + "learning_rate": 5.355588565140195e-06, + "loss": 0.4586, + "step": 2289 + }, + { + "epoch": 0.49, + "grad_norm": 0.1421840488910675, + "learning_rate": 5.352108205721445e-06, + "loss": 0.4915, + "step": 2290 + }, + { + "epoch": 0.49, + "grad_norm": 0.14462217688560486, + "learning_rate": 5.348627674841876e-06, + "loss": 0.4412, + "step": 2291 + }, + { + "epoch": 0.49, + "grad_norm": 0.15902197360992432, + "learning_rate": 5.345146974196351e-06, + "loss": 0.5418, + "step": 2292 + }, + { + "epoch": 0.49, + "grad_norm": 0.1560838520526886, + "learning_rate": 5.341666105479812e-06, + "loss": 0.4639, + "step": 2293 + }, + { + "epoch": 0.49, + "grad_norm": 0.15082865953445435, + "learning_rate": 5.338185070387289e-06, + "loss": 0.501, + "step": 2294 + }, + { + "epoch": 0.49, + "grad_norm": 0.1447245180606842, + "learning_rate": 5.334703870613887e-06, + "loss": 0.4603, + "step": 2295 + }, + { + "epoch": 0.49, + "grad_norm": 0.23148810863494873, + "learning_rate": 5.3312225078547895e-06, + "loss": 0.5145, + "step": 2296 + }, + { + "epoch": 0.49, + "grad_norm": 0.1934991329908371, + "learning_rate": 5.327740983805267e-06, + "loss": 0.5137, + "step": 2297 + }, + { + "epoch": 0.5, + "grad_norm": 0.18782839179039001, + "learning_rate": 5.324259300160667e-06, + "loss": 0.5348, + "step": 2298 + }, + { + "epoch": 0.5, + "grad_norm": 0.17964793741703033, + "learning_rate": 5.320777458616407e-06, + "loss": 0.4938, + "step": 2299 + }, + { + "epoch": 0.5, + "grad_norm": 0.1606227159500122, + "learning_rate": 5.31729546086799e-06, + "loss": 0.5483, + "step": 2300 + }, + { + "epoch": 0.5, + "grad_norm": 0.16519147157669067, + "learning_rate": 5.313813308610993e-06, + "loss": 0.5018, + "step": 2301 + }, + { + "epoch": 0.5, + "grad_norm": 0.1705171763896942, + "learning_rate": 5.310331003541065e-06, + "loss": 0.4838, + "step": 2302 + }, + { + "epoch": 0.5, + "grad_norm": 0.22581948339939117, + "learning_rate": 5.30684854735393e-06, + "loss": 0.5207, + "step": 2303 + }, + { + "epoch": 0.5, + "grad_norm": 0.16089698672294617, + "learning_rate": 5.303365941745392e-06, + "loss": 0.5237, + "step": 2304 + }, + { + "epoch": 0.5, + "grad_norm": 0.15881328284740448, + "learning_rate": 5.299883188411318e-06, + "loss": 0.477, + "step": 2305 + }, + { + "epoch": 0.5, + "grad_norm": 0.21279747784137726, + "learning_rate": 5.296400289047655e-06, + "loss": 0.5183, + "step": 2306 + }, + { + "epoch": 0.5, + "grad_norm": 0.16910669207572937, + "learning_rate": 5.292917245350417e-06, + "loss": 0.4759, + "step": 2307 + }, + { + "epoch": 0.5, + "grad_norm": 0.16905193030834198, + "learning_rate": 5.289434059015689e-06, + "loss": 0.5334, + "step": 2308 + }, + { + "epoch": 0.5, + "grad_norm": 0.11337817460298538, + "learning_rate": 5.285950731739624e-06, + "loss": 0.4597, + "step": 2309 + }, + { + "epoch": 0.5, + "grad_norm": 0.20089925825595856, + "learning_rate": 5.28246726521845e-06, + "loss": 0.5143, + "step": 2310 + }, + { + "epoch": 0.5, + "grad_norm": 0.152847558259964, + "learning_rate": 5.278983661148453e-06, + "loss": 0.5067, + "step": 2311 + }, + { + "epoch": 0.5, + "grad_norm": 0.16231143474578857, + "learning_rate": 5.275499921225994e-06, + "loss": 0.4883, + "step": 2312 + }, + { + "epoch": 0.5, + "grad_norm": 0.13849905133247375, + "learning_rate": 5.2720160471474955e-06, + "loss": 0.5279, + "step": 2313 + }, + { + "epoch": 0.5, + "grad_norm": 0.2002251148223877, + "learning_rate": 5.26853204060945e-06, + "loss": 0.5652, + "step": 2314 + }, + { + "epoch": 0.5, + "grad_norm": 0.14642587304115295, + "learning_rate": 5.2650479033084075e-06, + "loss": 0.4926, + "step": 2315 + }, + { + "epoch": 0.5, + "grad_norm": 0.19536569714546204, + "learning_rate": 5.26156363694099e-06, + "loss": 0.5673, + "step": 2316 + }, + { + "epoch": 0.5, + "grad_norm": 0.16617797315120697, + "learning_rate": 5.258079243203875e-06, + "loss": 0.5427, + "step": 2317 + }, + { + "epoch": 0.5, + "grad_norm": 0.11626624315977097, + "learning_rate": 5.2545947237938055e-06, + "loss": 0.5398, + "step": 2318 + }, + { + "epoch": 0.5, + "grad_norm": 0.17686258256435394, + "learning_rate": 5.251110080407587e-06, + "loss": 0.5253, + "step": 2319 + }, + { + "epoch": 0.5, + "grad_norm": 0.1972484439611435, + "learning_rate": 5.247625314742083e-06, + "loss": 0.4815, + "step": 2320 + }, + { + "epoch": 0.5, + "grad_norm": 0.14836078882217407, + "learning_rate": 5.244140428494216e-06, + "loss": 0.5806, + "step": 2321 + }, + { + "epoch": 0.5, + "grad_norm": 0.22560933232307434, + "learning_rate": 5.240655423360969e-06, + "loss": 0.5267, + "step": 2322 + }, + { + "epoch": 0.5, + "grad_norm": 0.19489476084709167, + "learning_rate": 5.237170301039385e-06, + "loss": 0.5376, + "step": 2323 + }, + { + "epoch": 0.5, + "grad_norm": 0.1505575180053711, + "learning_rate": 5.233685063226557e-06, + "loss": 0.5049, + "step": 2324 + }, + { + "epoch": 0.5, + "grad_norm": 0.1474577635526657, + "learning_rate": 5.23019971161964e-06, + "loss": 0.5244, + "step": 2325 + }, + { + "epoch": 0.5, + "grad_norm": 0.15484069287776947, + "learning_rate": 5.226714247915846e-06, + "loss": 0.5052, + "step": 2326 + }, + { + "epoch": 0.5, + "grad_norm": 0.1554277092218399, + "learning_rate": 5.2232286738124346e-06, + "loss": 0.557, + "step": 2327 + }, + { + "epoch": 0.5, + "grad_norm": 0.16746380925178528, + "learning_rate": 5.219742991006728e-06, + "loss": 0.5164, + "step": 2328 + }, + { + "epoch": 0.5, + "grad_norm": 0.19356447458267212, + "learning_rate": 5.216257201196091e-06, + "loss": 0.5051, + "step": 2329 + }, + { + "epoch": 0.5, + "grad_norm": 0.19989141821861267, + "learning_rate": 5.212771306077951e-06, + "loss": 0.545, + "step": 2330 + }, + { + "epoch": 0.5, + "grad_norm": 0.14954493939876556, + "learning_rate": 5.209285307349776e-06, + "loss": 0.4857, + "step": 2331 + }, + { + "epoch": 0.5, + "grad_norm": 0.1772209256887436, + "learning_rate": 5.205799206709097e-06, + "loss": 0.4962, + "step": 2332 + }, + { + "epoch": 0.5, + "grad_norm": 0.18169115483760834, + "learning_rate": 5.202313005853483e-06, + "loss": 0.5147, + "step": 2333 + }, + { + "epoch": 0.5, + "grad_norm": 0.1574869155883789, + "learning_rate": 5.198826706480558e-06, + "loss": 0.5343, + "step": 2334 + }, + { + "epoch": 0.5, + "grad_norm": 0.1543438583612442, + "learning_rate": 5.195340310287993e-06, + "loss": 0.4861, + "step": 2335 + }, + { + "epoch": 0.5, + "grad_norm": 0.16991272568702698, + "learning_rate": 5.191853818973505e-06, + "loss": 0.5657, + "step": 2336 + }, + { + "epoch": 0.5, + "grad_norm": 0.198355033993721, + "learning_rate": 5.188367234234859e-06, + "loss": 0.551, + "step": 2337 + }, + { + "epoch": 0.5, + "grad_norm": 0.1566164791584015, + "learning_rate": 5.184880557769865e-06, + "loss": 0.5248, + "step": 2338 + }, + { + "epoch": 0.5, + "grad_norm": 0.1619618833065033, + "learning_rate": 5.181393791276374e-06, + "loss": 0.4884, + "step": 2339 + }, + { + "epoch": 0.5, + "grad_norm": 0.1328553855419159, + "learning_rate": 5.177906936452287e-06, + "loss": 0.5129, + "step": 2340 + }, + { + "epoch": 0.5, + "grad_norm": 0.1531621217727661, + "learning_rate": 5.174419994995545e-06, + "loss": 0.4932, + "step": 2341 + }, + { + "epoch": 0.5, + "grad_norm": 0.20409497618675232, + "learning_rate": 5.170932968604131e-06, + "loss": 0.5065, + "step": 2342 + }, + { + "epoch": 0.5, + "grad_norm": 0.14799822866916656, + "learning_rate": 5.167445858976068e-06, + "loss": 0.5578, + "step": 2343 + }, + { + "epoch": 0.5, + "grad_norm": 0.1554175615310669, + "learning_rate": 5.163958667809422e-06, + "loss": 0.514, + "step": 2344 + }, + { + "epoch": 0.51, + "grad_norm": 0.19117942452430725, + "learning_rate": 5.1604713968023e-06, + "loss": 0.5341, + "step": 2345 + }, + { + "epoch": 0.51, + "grad_norm": 0.15868812799453735, + "learning_rate": 5.156984047652841e-06, + "loss": 0.5528, + "step": 2346 + }, + { + "epoch": 0.51, + "grad_norm": 0.13103894889354706, + "learning_rate": 5.153496622059232e-06, + "loss": 0.4764, + "step": 2347 + }, + { + "epoch": 0.51, + "grad_norm": 0.1614736169576645, + "learning_rate": 5.15000912171969e-06, + "loss": 0.5218, + "step": 2348 + }, + { + "epoch": 0.51, + "grad_norm": 0.1403590589761734, + "learning_rate": 5.1465215483324685e-06, + "loss": 0.493, + "step": 2349 + }, + { + "epoch": 0.51, + "grad_norm": 0.13807451725006104, + "learning_rate": 5.143033903595862e-06, + "loss": 0.502, + "step": 2350 + }, + { + "epoch": 0.51, + "grad_norm": 0.1550104022026062, + "learning_rate": 5.1395461892081925e-06, + "loss": 0.541, + "step": 2351 + }, + { + "epoch": 0.51, + "grad_norm": 0.18088415265083313, + "learning_rate": 5.1360584068678225e-06, + "loss": 0.4898, + "step": 2352 + }, + { + "epoch": 0.51, + "grad_norm": 0.1560092568397522, + "learning_rate": 5.132570558273143e-06, + "loss": 0.4938, + "step": 2353 + }, + { + "epoch": 0.51, + "grad_norm": 0.21202325820922852, + "learning_rate": 5.129082645122579e-06, + "loss": 0.5163, + "step": 2354 + }, + { + "epoch": 0.51, + "grad_norm": 0.1860700100660324, + "learning_rate": 5.125594669114589e-06, + "loss": 0.528, + "step": 2355 + }, + { + "epoch": 0.51, + "grad_norm": 0.17803077399730682, + "learning_rate": 5.1221066319476576e-06, + "loss": 0.5005, + "step": 2356 + }, + { + "epoch": 0.51, + "grad_norm": 0.13310760259628296, + "learning_rate": 5.118618535320303e-06, + "loss": 0.5061, + "step": 2357 + }, + { + "epoch": 0.51, + "grad_norm": 0.14596043527126312, + "learning_rate": 5.115130380931071e-06, + "loss": 0.5381, + "step": 2358 + }, + { + "epoch": 0.51, + "grad_norm": 0.1787167489528656, + "learning_rate": 5.111642170478534e-06, + "loss": 0.4973, + "step": 2359 + }, + { + "epoch": 0.51, + "grad_norm": 0.1591702401638031, + "learning_rate": 5.108153905661296e-06, + "loss": 0.5501, + "step": 2360 + }, + { + "epoch": 0.51, + "grad_norm": 0.15234871208667755, + "learning_rate": 5.1046655881779825e-06, + "loss": 0.5135, + "step": 2361 + }, + { + "epoch": 0.51, + "grad_norm": 0.19040155410766602, + "learning_rate": 5.101177219727245e-06, + "loss": 0.5693, + "step": 2362 + }, + { + "epoch": 0.51, + "grad_norm": 0.15070025622844696, + "learning_rate": 5.097688802007767e-06, + "loss": 0.5232, + "step": 2363 + }, + { + "epoch": 0.51, + "grad_norm": 0.15969093143939972, + "learning_rate": 5.094200336718246e-06, + "loss": 0.5405, + "step": 2364 + }, + { + "epoch": 0.51, + "grad_norm": 0.12944184243679047, + "learning_rate": 5.090711825557408e-06, + "loss": 0.491, + "step": 2365 + }, + { + "epoch": 0.51, + "grad_norm": 0.1388048529624939, + "learning_rate": 5.087223270224003e-06, + "loss": 0.5004, + "step": 2366 + }, + { + "epoch": 0.51, + "grad_norm": 0.18210247159004211, + "learning_rate": 5.083734672416797e-06, + "loss": 0.4767, + "step": 2367 + }, + { + "epoch": 0.51, + "grad_norm": 0.1709405779838562, + "learning_rate": 5.080246033834581e-06, + "loss": 0.5355, + "step": 2368 + }, + { + "epoch": 0.51, + "grad_norm": 0.16608983278274536, + "learning_rate": 5.076757356176168e-06, + "loss": 0.5589, + "step": 2369 + }, + { + "epoch": 0.51, + "grad_norm": 0.18925471603870392, + "learning_rate": 5.0732686411403816e-06, + "loss": 0.5443, + "step": 2370 + }, + { + "epoch": 0.51, + "grad_norm": 0.17456351220607758, + "learning_rate": 5.069779890426072e-06, + "loss": 0.4903, + "step": 2371 + }, + { + "epoch": 0.51, + "grad_norm": 0.14656615257263184, + "learning_rate": 5.066291105732102e-06, + "loss": 0.4646, + "step": 2372 + }, + { + "epoch": 0.51, + "grad_norm": 0.14051038026809692, + "learning_rate": 5.0628022887573515e-06, + "loss": 0.5032, + "step": 2373 + }, + { + "epoch": 0.51, + "grad_norm": 0.15590442717075348, + "learning_rate": 5.05931344120072e-06, + "loss": 0.5255, + "step": 2374 + }, + { + "epoch": 0.51, + "grad_norm": 0.15599004924297333, + "learning_rate": 5.0558245647611155e-06, + "loss": 0.5418, + "step": 2375 + }, + { + "epoch": 0.51, + "grad_norm": 0.1530722975730896, + "learning_rate": 5.052335661137467e-06, + "loss": 0.469, + "step": 2376 + }, + { + "epoch": 0.51, + "grad_norm": 0.16184838116168976, + "learning_rate": 5.0488467320287106e-06, + "loss": 0.4887, + "step": 2377 + }, + { + "epoch": 0.51, + "grad_norm": 0.12114948034286499, + "learning_rate": 5.0453577791337984e-06, + "loss": 0.4982, + "step": 2378 + }, + { + "epoch": 0.51, + "grad_norm": 0.14550864696502686, + "learning_rate": 5.041868804151694e-06, + "loss": 0.4555, + "step": 2379 + }, + { + "epoch": 0.51, + "grad_norm": 0.1462576687335968, + "learning_rate": 5.03837980878137e-06, + "loss": 0.5135, + "step": 2380 + }, + { + "epoch": 0.51, + "grad_norm": 0.1352759301662445, + "learning_rate": 5.0348907947218086e-06, + "loss": 0.5367, + "step": 2381 + }, + { + "epoch": 0.51, + "grad_norm": 0.18618960678577423, + "learning_rate": 5.031401763672003e-06, + "loss": 0.4918, + "step": 2382 + }, + { + "epoch": 0.51, + "grad_norm": 0.1655811071395874, + "learning_rate": 5.027912717330956e-06, + "loss": 0.5077, + "step": 2383 + }, + { + "epoch": 0.51, + "grad_norm": 0.14371387660503387, + "learning_rate": 5.024423657397674e-06, + "loss": 0.5463, + "step": 2384 + }, + { + "epoch": 0.51, + "grad_norm": 0.1331823766231537, + "learning_rate": 5.020934585571171e-06, + "loss": 0.5586, + "step": 2385 + }, + { + "epoch": 0.51, + "grad_norm": 0.16544833779335022, + "learning_rate": 5.017445503550471e-06, + "loss": 0.5493, + "step": 2386 + }, + { + "epoch": 0.51, + "grad_norm": 0.16902866959571838, + "learning_rate": 5.013956413034595e-06, + "loss": 0.5215, + "step": 2387 + }, + { + "epoch": 0.51, + "grad_norm": 0.19423706829547882, + "learning_rate": 5.010467315722578e-06, + "loss": 0.5343, + "step": 2388 + }, + { + "epoch": 0.51, + "grad_norm": 0.1521768569946289, + "learning_rate": 5.006978213313448e-06, + "loss": 0.5021, + "step": 2389 + }, + { + "epoch": 0.51, + "grad_norm": 0.12153864651918411, + "learning_rate": 5.003489107506243e-06, + "loss": 0.4893, + "step": 2390 + }, + { + "epoch": 0.52, + "grad_norm": 0.1757657527923584, + "learning_rate": 5e-06, + "loss": 0.535, + "step": 2391 + }, + { + "epoch": 0.52, + "grad_norm": 0.17673848569393158, + "learning_rate": 4.996510892493758e-06, + "loss": 0.5201, + "step": 2392 + }, + { + "epoch": 0.52, + "grad_norm": 0.17887622117996216, + "learning_rate": 4.993021786686554e-06, + "loss": 0.5413, + "step": 2393 + }, + { + "epoch": 0.52, + "grad_norm": 0.1362655609846115, + "learning_rate": 4.989532684277424e-06, + "loss": 0.4757, + "step": 2394 + }, + { + "epoch": 0.52, + "grad_norm": 0.21385332942008972, + "learning_rate": 4.986043586965406e-06, + "loss": 0.5233, + "step": 2395 + }, + { + "epoch": 0.52, + "grad_norm": 0.16764004528522491, + "learning_rate": 4.98255449644953e-06, + "loss": 0.5193, + "step": 2396 + }, + { + "epoch": 0.52, + "grad_norm": 0.12933380901813507, + "learning_rate": 4.979065414428829e-06, + "loss": 0.4681, + "step": 2397 + }, + { + "epoch": 0.52, + "grad_norm": 0.17438261210918427, + "learning_rate": 4.975576342602329e-06, + "loss": 0.5437, + "step": 2398 + }, + { + "epoch": 0.52, + "grad_norm": 0.1581277847290039, + "learning_rate": 4.9720872826690455e-06, + "loss": 0.5147, + "step": 2399 + }, + { + "epoch": 0.52, + "grad_norm": 0.15126928687095642, + "learning_rate": 4.968598236327998e-06, + "loss": 0.5033, + "step": 2400 + }, + { + "epoch": 0.52, + "grad_norm": 0.144017294049263, + "learning_rate": 4.965109205278193e-06, + "loss": 0.4557, + "step": 2401 + }, + { + "epoch": 0.52, + "grad_norm": 0.158042773604393, + "learning_rate": 4.961620191218632e-06, + "loss": 0.5118, + "step": 2402 + }, + { + "epoch": 0.52, + "grad_norm": 0.21210241317749023, + "learning_rate": 4.9581311958483075e-06, + "loss": 0.556, + "step": 2403 + }, + { + "epoch": 0.52, + "grad_norm": 0.25010186433792114, + "learning_rate": 4.954642220866202e-06, + "loss": 0.542, + "step": 2404 + }, + { + "epoch": 0.52, + "grad_norm": 0.21155287325382233, + "learning_rate": 4.95115326797129e-06, + "loss": 0.4882, + "step": 2405 + }, + { + "epoch": 0.52, + "grad_norm": 0.17543160915374756, + "learning_rate": 4.947664338862534e-06, + "loss": 0.5883, + "step": 2406 + }, + { + "epoch": 0.52, + "grad_norm": 0.1969243884086609, + "learning_rate": 4.944175435238886e-06, + "loss": 0.5051, + "step": 2407 + }, + { + "epoch": 0.52, + "grad_norm": 0.1602378487586975, + "learning_rate": 4.940686558799283e-06, + "loss": 0.5139, + "step": 2408 + }, + { + "epoch": 0.52, + "grad_norm": 0.1955273449420929, + "learning_rate": 4.9371977112426485e-06, + "loss": 0.5096, + "step": 2409 + }, + { + "epoch": 0.52, + "grad_norm": 0.15544743835926056, + "learning_rate": 4.933708894267901e-06, + "loss": 0.5081, + "step": 2410 + }, + { + "epoch": 0.52, + "grad_norm": 0.177435964345932, + "learning_rate": 4.93022010957393e-06, + "loss": 0.5476, + "step": 2411 + }, + { + "epoch": 0.52, + "grad_norm": 0.14814496040344238, + "learning_rate": 4.92673135885962e-06, + "loss": 0.5653, + "step": 2412 + }, + { + "epoch": 0.52, + "grad_norm": 0.144011989235878, + "learning_rate": 4.923242643823834e-06, + "loss": 0.6024, + "step": 2413 + }, + { + "epoch": 0.52, + "grad_norm": 0.16023319959640503, + "learning_rate": 4.919753966165419e-06, + "loss": 0.5927, + "step": 2414 + }, + { + "epoch": 0.52, + "grad_norm": 0.13633988797664642, + "learning_rate": 4.916265327583204e-06, + "loss": 0.5548, + "step": 2415 + }, + { + "epoch": 0.52, + "grad_norm": 0.20109489560127258, + "learning_rate": 4.912776729775999e-06, + "loss": 0.5668, + "step": 2416 + }, + { + "epoch": 0.52, + "grad_norm": 0.3419434428215027, + "learning_rate": 4.9092881744425944e-06, + "loss": 0.4842, + "step": 2417 + }, + { + "epoch": 0.52, + "grad_norm": 0.1448926031589508, + "learning_rate": 4.905799663281756e-06, + "loss": 0.4886, + "step": 2418 + }, + { + "epoch": 0.52, + "grad_norm": 0.2309703230857849, + "learning_rate": 4.902311197992234e-06, + "loss": 0.5237, + "step": 2419 + }, + { + "epoch": 0.52, + "grad_norm": 0.14190784096717834, + "learning_rate": 4.898822780272757e-06, + "loss": 0.5218, + "step": 2420 + }, + { + "epoch": 0.52, + "grad_norm": 0.18205609917640686, + "learning_rate": 4.895334411822019e-06, + "loss": 0.5251, + "step": 2421 + }, + { + "epoch": 0.52, + "grad_norm": 0.13188670575618744, + "learning_rate": 4.8918460943387065e-06, + "loss": 0.4971, + "step": 2422 + }, + { + "epoch": 0.52, + "grad_norm": 0.16982056200504303, + "learning_rate": 4.888357829521466e-06, + "loss": 0.4846, + "step": 2423 + }, + { + "epoch": 0.52, + "grad_norm": 0.12666776776313782, + "learning_rate": 4.8848696190689295e-06, + "loss": 0.4853, + "step": 2424 + }, + { + "epoch": 0.52, + "grad_norm": 0.1458110213279724, + "learning_rate": 4.881381464679698e-06, + "loss": 0.4871, + "step": 2425 + }, + { + "epoch": 0.52, + "grad_norm": 0.18324485421180725, + "learning_rate": 4.877893368052343e-06, + "loss": 0.545, + "step": 2426 + }, + { + "epoch": 0.52, + "grad_norm": 0.18099121749401093, + "learning_rate": 4.874405330885413e-06, + "loss": 0.5002, + "step": 2427 + }, + { + "epoch": 0.52, + "grad_norm": 0.13774168491363525, + "learning_rate": 4.870917354877421e-06, + "loss": 0.4789, + "step": 2428 + }, + { + "epoch": 0.52, + "grad_norm": 0.16247624158859253, + "learning_rate": 4.867429441726858e-06, + "loss": 0.5491, + "step": 2429 + }, + { + "epoch": 0.52, + "grad_norm": 0.2958735525608063, + "learning_rate": 4.863941593132179e-06, + "loss": 0.5158, + "step": 2430 + }, + { + "epoch": 0.52, + "grad_norm": 0.1791061908006668, + "learning_rate": 4.860453810791808e-06, + "loss": 0.5083, + "step": 2431 + }, + { + "epoch": 0.52, + "grad_norm": 0.15824836492538452, + "learning_rate": 4.856966096404141e-06, + "loss": 0.5177, + "step": 2432 + }, + { + "epoch": 0.52, + "grad_norm": 0.15134254097938538, + "learning_rate": 4.853478451667532e-06, + "loss": 0.4666, + "step": 2433 + }, + { + "epoch": 0.52, + "grad_norm": 0.14412038028240204, + "learning_rate": 4.849990878280313e-06, + "loss": 0.5838, + "step": 2434 + }, + { + "epoch": 0.52, + "grad_norm": 0.1476101279258728, + "learning_rate": 4.84650337794077e-06, + "loss": 0.5155, + "step": 2435 + }, + { + "epoch": 0.52, + "grad_norm": 0.13752271234989166, + "learning_rate": 4.843015952347159e-06, + "loss": 0.5225, + "step": 2436 + }, + { + "epoch": 0.52, + "grad_norm": 0.1495019495487213, + "learning_rate": 4.839528603197702e-06, + "loss": 0.5148, + "step": 2437 + }, + { + "epoch": 0.53, + "grad_norm": 0.18052443861961365, + "learning_rate": 4.8360413321905786e-06, + "loss": 0.5321, + "step": 2438 + }, + { + "epoch": 0.53, + "grad_norm": 0.16000008583068848, + "learning_rate": 4.832554141023934e-06, + "loss": 0.5374, + "step": 2439 + }, + { + "epoch": 0.53, + "grad_norm": 0.14435116946697235, + "learning_rate": 4.829067031395871e-06, + "loss": 0.4763, + "step": 2440 + }, + { + "epoch": 0.53, + "grad_norm": 0.1648446023464203, + "learning_rate": 4.825580005004456e-06, + "loss": 0.5029, + "step": 2441 + }, + { + "epoch": 0.53, + "grad_norm": 0.1948603093624115, + "learning_rate": 4.822093063547715e-06, + "loss": 0.517, + "step": 2442 + }, + { + "epoch": 0.53, + "grad_norm": 0.1540631800889969, + "learning_rate": 4.818606208723627e-06, + "loss": 0.5113, + "step": 2443 + }, + { + "epoch": 0.53, + "grad_norm": 0.14734607934951782, + "learning_rate": 4.815119442230138e-06, + "loss": 0.5323, + "step": 2444 + }, + { + "epoch": 0.53, + "grad_norm": 0.13016067445278168, + "learning_rate": 4.811632765765143e-06, + "loss": 0.4841, + "step": 2445 + }, + { + "epoch": 0.53, + "grad_norm": 0.1437351554632187, + "learning_rate": 4.8081461810264955e-06, + "loss": 0.4775, + "step": 2446 + }, + { + "epoch": 0.53, + "grad_norm": 0.16473154723644257, + "learning_rate": 4.804659689712009e-06, + "loss": 0.5019, + "step": 2447 + }, + { + "epoch": 0.53, + "grad_norm": 0.13416069746017456, + "learning_rate": 4.801173293519442e-06, + "loss": 0.5193, + "step": 2448 + }, + { + "epoch": 0.53, + "grad_norm": 0.12704534828662872, + "learning_rate": 4.797686994146519e-06, + "loss": 0.499, + "step": 2449 + }, + { + "epoch": 0.53, + "grad_norm": 0.15111473202705383, + "learning_rate": 4.7942007932909046e-06, + "loss": 0.5168, + "step": 2450 + }, + { + "epoch": 0.53, + "grad_norm": 0.13630732893943787, + "learning_rate": 4.790714692650223e-06, + "loss": 0.4938, + "step": 2451 + }, + { + "epoch": 0.53, + "grad_norm": 0.13137710094451904, + "learning_rate": 4.7872286939220516e-06, + "loss": 0.4544, + "step": 2452 + }, + { + "epoch": 0.53, + "grad_norm": 0.15518240630626678, + "learning_rate": 4.783742798803909e-06, + "loss": 0.5013, + "step": 2453 + }, + { + "epoch": 0.53, + "grad_norm": 0.13857389986515045, + "learning_rate": 4.7802570089932746e-06, + "loss": 0.5551, + "step": 2454 + }, + { + "epoch": 0.53, + "grad_norm": 0.1502048522233963, + "learning_rate": 4.776771326187566e-06, + "loss": 0.4341, + "step": 2455 + }, + { + "epoch": 0.53, + "grad_norm": 0.16226448118686676, + "learning_rate": 4.773285752084154e-06, + "loss": 0.5555, + "step": 2456 + }, + { + "epoch": 0.53, + "grad_norm": 0.15207113325595856, + "learning_rate": 4.769800288380361e-06, + "loss": 0.4934, + "step": 2457 + }, + { + "epoch": 0.53, + "grad_norm": 0.16286228597164154, + "learning_rate": 4.766314936773445e-06, + "loss": 0.5066, + "step": 2458 + }, + { + "epoch": 0.53, + "grad_norm": 0.15286804735660553, + "learning_rate": 4.762829698960618e-06, + "loss": 0.5425, + "step": 2459 + }, + { + "epoch": 0.53, + "grad_norm": 0.149344801902771, + "learning_rate": 4.7593445766390315e-06, + "loss": 0.5626, + "step": 2460 + }, + { + "epoch": 0.53, + "grad_norm": 0.1389455944299698, + "learning_rate": 4.755859571505786e-06, + "loss": 0.4964, + "step": 2461 + }, + { + "epoch": 0.53, + "grad_norm": 0.14913085103034973, + "learning_rate": 4.752374685257919e-06, + "loss": 0.524, + "step": 2462 + }, + { + "epoch": 0.53, + "grad_norm": 0.14657723903656006, + "learning_rate": 4.748889919592414e-06, + "loss": 0.5059, + "step": 2463 + }, + { + "epoch": 0.53, + "grad_norm": 0.12738269567489624, + "learning_rate": 4.745405276206196e-06, + "loss": 0.5039, + "step": 2464 + }, + { + "epoch": 0.53, + "grad_norm": 0.2088775783777237, + "learning_rate": 4.741920756796126e-06, + "loss": 0.5238, + "step": 2465 + }, + { + "epoch": 0.53, + "grad_norm": 0.1429111510515213, + "learning_rate": 4.738436363059013e-06, + "loss": 0.4606, + "step": 2466 + }, + { + "epoch": 0.53, + "grad_norm": 0.1563674658536911, + "learning_rate": 4.734952096691594e-06, + "loss": 0.5982, + "step": 2467 + }, + { + "epoch": 0.53, + "grad_norm": 0.15420180559158325, + "learning_rate": 4.731467959390552e-06, + "loss": 0.515, + "step": 2468 + }, + { + "epoch": 0.53, + "grad_norm": 0.21299295127391815, + "learning_rate": 4.727983952852505e-06, + "loss": 0.5306, + "step": 2469 + }, + { + "epoch": 0.53, + "grad_norm": 0.15745538473129272, + "learning_rate": 4.724500078774008e-06, + "loss": 0.5118, + "step": 2470 + }, + { + "epoch": 0.53, + "grad_norm": 0.1578780859708786, + "learning_rate": 4.721016338851549e-06, + "loss": 0.5061, + "step": 2471 + }, + { + "epoch": 0.53, + "grad_norm": 0.1522160917520523, + "learning_rate": 4.717532734781552e-06, + "loss": 0.5417, + "step": 2472 + }, + { + "epoch": 0.53, + "grad_norm": 0.12511098384857178, + "learning_rate": 4.714049268260376e-06, + "loss": 0.4981, + "step": 2473 + }, + { + "epoch": 0.53, + "grad_norm": 0.1434258371591568, + "learning_rate": 4.710565940984313e-06, + "loss": 0.5178, + "step": 2474 + }, + { + "epoch": 0.53, + "grad_norm": 0.13308405876159668, + "learning_rate": 4.707082754649584e-06, + "loss": 0.4986, + "step": 2475 + }, + { + "epoch": 0.53, + "grad_norm": 0.15585026144981384, + "learning_rate": 4.703599710952347e-06, + "loss": 0.5179, + "step": 2476 + }, + { + "epoch": 0.53, + "grad_norm": 0.1660911738872528, + "learning_rate": 4.700116811588684e-06, + "loss": 0.4997, + "step": 2477 + }, + { + "epoch": 0.53, + "grad_norm": 0.1638256311416626, + "learning_rate": 4.6966340582546085e-06, + "loss": 0.4711, + "step": 2478 + }, + { + "epoch": 0.53, + "grad_norm": 0.14776884019374847, + "learning_rate": 4.693151452646071e-06, + "loss": 0.47, + "step": 2479 + }, + { + "epoch": 0.53, + "grad_norm": 0.15483321249485016, + "learning_rate": 4.689668996458937e-06, + "loss": 0.5476, + "step": 2480 + }, + { + "epoch": 0.53, + "grad_norm": 0.18256203830242157, + "learning_rate": 4.6861866913890094e-06, + "loss": 0.5002, + "step": 2481 + }, + { + "epoch": 0.53, + "grad_norm": 0.1372958868741989, + "learning_rate": 4.682704539132011e-06, + "loss": 0.5201, + "step": 2482 + }, + { + "epoch": 0.53, + "grad_norm": 0.17966903746128082, + "learning_rate": 4.679222541383594e-06, + "loss": 0.4896, + "step": 2483 + }, + { + "epoch": 0.54, + "grad_norm": 0.16434355080127716, + "learning_rate": 4.6757406998393354e-06, + "loss": 0.5577, + "step": 2484 + }, + { + "epoch": 0.54, + "grad_norm": 0.12742279469966888, + "learning_rate": 4.672259016194733e-06, + "loss": 0.5662, + "step": 2485 + }, + { + "epoch": 0.54, + "grad_norm": 0.14353856444358826, + "learning_rate": 4.668777492145212e-06, + "loss": 0.5476, + "step": 2486 + }, + { + "epoch": 0.54, + "grad_norm": 0.17390471696853638, + "learning_rate": 4.665296129386116e-06, + "loss": 0.5625, + "step": 2487 + }, + { + "epoch": 0.54, + "grad_norm": 0.16890183091163635, + "learning_rate": 4.661814929612713e-06, + "loss": 0.5211, + "step": 2488 + }, + { + "epoch": 0.54, + "grad_norm": 0.16704991459846497, + "learning_rate": 4.658333894520189e-06, + "loss": 0.4941, + "step": 2489 + }, + { + "epoch": 0.54, + "grad_norm": 0.15086905658245087, + "learning_rate": 4.654853025803649e-06, + "loss": 0.5065, + "step": 2490 + }, + { + "epoch": 0.54, + "grad_norm": 0.13586142659187317, + "learning_rate": 4.651372325158125e-06, + "loss": 0.5415, + "step": 2491 + }, + { + "epoch": 0.54, + "grad_norm": 0.17813622951507568, + "learning_rate": 4.6478917942785575e-06, + "loss": 0.5101, + "step": 2492 + }, + { + "epoch": 0.54, + "grad_norm": 0.16348902881145477, + "learning_rate": 4.644411434859808e-06, + "loss": 0.4916, + "step": 2493 + }, + { + "epoch": 0.54, + "grad_norm": 0.17885281145572662, + "learning_rate": 4.640931248596655e-06, + "loss": 0.4749, + "step": 2494 + }, + { + "epoch": 0.54, + "grad_norm": 0.15020768344402313, + "learning_rate": 4.637451237183792e-06, + "loss": 0.5273, + "step": 2495 + }, + { + "epoch": 0.54, + "grad_norm": 0.15204519033432007, + "learning_rate": 4.633971402315828e-06, + "loss": 0.5244, + "step": 2496 + }, + { + "epoch": 0.54, + "grad_norm": 0.15182174742221832, + "learning_rate": 4.630491745687286e-06, + "loss": 0.4928, + "step": 2497 + }, + { + "epoch": 0.54, + "grad_norm": 0.163527712225914, + "learning_rate": 4.627012268992603e-06, + "loss": 0.5102, + "step": 2498 + }, + { + "epoch": 0.54, + "grad_norm": 0.1811029613018036, + "learning_rate": 4.623532973926124e-06, + "loss": 0.5091, + "step": 2499 + }, + { + "epoch": 0.54, + "grad_norm": 0.17676551640033722, + "learning_rate": 4.62005386218211e-06, + "loss": 0.5543, + "step": 2500 + }, + { + "epoch": 0.54, + "grad_norm": 0.14058449864387512, + "learning_rate": 4.616574935454735e-06, + "loss": 0.4906, + "step": 2501 + }, + { + "epoch": 0.54, + "grad_norm": 0.14341934025287628, + "learning_rate": 4.613096195438074e-06, + "loss": 0.5007, + "step": 2502 + }, + { + "epoch": 0.54, + "grad_norm": 0.17558392882347107, + "learning_rate": 4.609617643826121e-06, + "loss": 0.4882, + "step": 2503 + }, + { + "epoch": 0.54, + "grad_norm": 0.13475576043128967, + "learning_rate": 4.60613928231277e-06, + "loss": 0.5144, + "step": 2504 + }, + { + "epoch": 0.54, + "grad_norm": 0.158226877450943, + "learning_rate": 4.602661112591829e-06, + "loss": 0.5136, + "step": 2505 + }, + { + "epoch": 0.54, + "grad_norm": 0.1458200067281723, + "learning_rate": 4.59918313635701e-06, + "loss": 0.4688, + "step": 2506 + }, + { + "epoch": 0.54, + "grad_norm": 0.19686149060726166, + "learning_rate": 4.595705355301927e-06, + "loss": 0.5185, + "step": 2507 + }, + { + "epoch": 0.54, + "grad_norm": 0.1282099336385727, + "learning_rate": 4.592227771120108e-06, + "loss": 0.5569, + "step": 2508 + }, + { + "epoch": 0.54, + "grad_norm": 0.18009676039218903, + "learning_rate": 4.588750385504975e-06, + "loss": 0.4903, + "step": 2509 + }, + { + "epoch": 0.54, + "grad_norm": 0.21005766093730927, + "learning_rate": 4.585273200149859e-06, + "loss": 0.5475, + "step": 2510 + }, + { + "epoch": 0.54, + "grad_norm": 0.12568634748458862, + "learning_rate": 4.581796216747996e-06, + "loss": 0.5061, + "step": 2511 + }, + { + "epoch": 0.54, + "grad_norm": 0.14265067875385284, + "learning_rate": 4.578319436992515e-06, + "loss": 0.4862, + "step": 2512 + }, + { + "epoch": 0.54, + "grad_norm": 0.14382900297641754, + "learning_rate": 4.574842862576455e-06, + "loss": 0.5384, + "step": 2513 + }, + { + "epoch": 0.54, + "grad_norm": 0.20735333859920502, + "learning_rate": 4.5713664951927475e-06, + "loss": 0.4868, + "step": 2514 + }, + { + "epoch": 0.54, + "grad_norm": 0.14771424233913422, + "learning_rate": 4.56789033653423e-06, + "loss": 0.4837, + "step": 2515 + }, + { + "epoch": 0.54, + "grad_norm": 0.1805439293384552, + "learning_rate": 4.5644143882936316e-06, + "loss": 0.5152, + "step": 2516 + }, + { + "epoch": 0.54, + "grad_norm": 0.16125157475471497, + "learning_rate": 4.560938652163585e-06, + "loss": 0.514, + "step": 2517 + }, + { + "epoch": 0.54, + "grad_norm": 0.16599629819393158, + "learning_rate": 4.5574631298366165e-06, + "loss": 0.4994, + "step": 2518 + }, + { + "epoch": 0.54, + "grad_norm": 0.15320484340190887, + "learning_rate": 4.553987823005148e-06, + "loss": 0.4958, + "step": 2519 + }, + { + "epoch": 0.54, + "grad_norm": 0.1553090214729309, + "learning_rate": 4.550512733361499e-06, + "loss": 0.5354, + "step": 2520 + }, + { + "epoch": 0.54, + "grad_norm": 0.14517144858837128, + "learning_rate": 4.5470378625978775e-06, + "loss": 0.5354, + "step": 2521 + }, + { + "epoch": 0.54, + "grad_norm": 0.2318032830953598, + "learning_rate": 4.543563212406392e-06, + "loss": 0.5206, + "step": 2522 + }, + { + "epoch": 0.54, + "grad_norm": 0.15330053865909576, + "learning_rate": 4.540088784479043e-06, + "loss": 0.4928, + "step": 2523 + }, + { + "epoch": 0.54, + "grad_norm": 0.16507619619369507, + "learning_rate": 4.536614580507714e-06, + "loss": 0.5271, + "step": 2524 + }, + { + "epoch": 0.54, + "grad_norm": 0.2185535430908203, + "learning_rate": 4.53314060218419e-06, + "loss": 0.4909, + "step": 2525 + }, + { + "epoch": 0.54, + "grad_norm": 0.1298246681690216, + "learning_rate": 4.52966685120014e-06, + "loss": 0.4953, + "step": 2526 + }, + { + "epoch": 0.54, + "grad_norm": 0.16641001403331757, + "learning_rate": 4.526193329247124e-06, + "loss": 0.5287, + "step": 2527 + }, + { + "epoch": 0.54, + "grad_norm": 0.1725553274154663, + "learning_rate": 4.5227200380165925e-06, + "loss": 0.5028, + "step": 2528 + }, + { + "epoch": 0.54, + "grad_norm": 0.14473970234394073, + "learning_rate": 4.51924697919988e-06, + "loss": 0.5306, + "step": 2529 + }, + { + "epoch": 0.54, + "grad_norm": 0.15652526915073395, + "learning_rate": 4.51577415448821e-06, + "loss": 0.5716, + "step": 2530 + }, + { + "epoch": 0.55, + "grad_norm": 0.127789705991745, + "learning_rate": 4.512301565572691e-06, + "loss": 0.501, + "step": 2531 + }, + { + "epoch": 0.55, + "grad_norm": 0.1573222577571869, + "learning_rate": 4.508829214144318e-06, + "loss": 0.5025, + "step": 2532 + }, + { + "epoch": 0.55, + "grad_norm": 0.14756949245929718, + "learning_rate": 4.5053571018939715e-06, + "loss": 0.5278, + "step": 2533 + }, + { + "epoch": 0.55, + "grad_norm": 0.16021455824375153, + "learning_rate": 4.5018852305124075e-06, + "loss": 0.4744, + "step": 2534 + }, + { + "epoch": 0.55, + "grad_norm": 0.16380415856838226, + "learning_rate": 4.498413601690278e-06, + "loss": 0.5568, + "step": 2535 + }, + { + "epoch": 0.55, + "grad_norm": 0.18932676315307617, + "learning_rate": 4.494942217118105e-06, + "loss": 0.4957, + "step": 2536 + }, + { + "epoch": 0.55, + "grad_norm": 0.18895933032035828, + "learning_rate": 4.491471078486297e-06, + "loss": 0.5109, + "step": 2537 + }, + { + "epoch": 0.55, + "grad_norm": 0.14104638993740082, + "learning_rate": 4.488000187485144e-06, + "loss": 0.5168, + "step": 2538 + }, + { + "epoch": 0.55, + "grad_norm": 0.13941968977451324, + "learning_rate": 4.484529545804811e-06, + "loss": 0.6087, + "step": 2539 + }, + { + "epoch": 0.55, + "grad_norm": 0.1779303252696991, + "learning_rate": 4.481059155135346e-06, + "loss": 0.5274, + "step": 2540 + }, + { + "epoch": 0.55, + "grad_norm": 0.16781829297542572, + "learning_rate": 4.477589017166671e-06, + "loss": 0.5129, + "step": 2541 + }, + { + "epoch": 0.55, + "grad_norm": 0.149240642786026, + "learning_rate": 4.474119133588588e-06, + "loss": 0.5388, + "step": 2542 + }, + { + "epoch": 0.55, + "grad_norm": 0.22637441754341125, + "learning_rate": 4.470649506090772e-06, + "loss": 0.5118, + "step": 2543 + }, + { + "epoch": 0.55, + "grad_norm": 0.1592012643814087, + "learning_rate": 4.4671801363627776e-06, + "loss": 0.5295, + "step": 2544 + }, + { + "epoch": 0.55, + "grad_norm": 0.18590115010738373, + "learning_rate": 4.463711026094032e-06, + "loss": 0.4976, + "step": 2545 + }, + { + "epoch": 0.55, + "grad_norm": 0.2293752282857895, + "learning_rate": 4.460242176973829e-06, + "loss": 0.54, + "step": 2546 + }, + { + "epoch": 0.55, + "grad_norm": 0.13310395181179047, + "learning_rate": 4.456773590691352e-06, + "loss": 0.5073, + "step": 2547 + }, + { + "epoch": 0.55, + "grad_norm": 0.1677958369255066, + "learning_rate": 4.453305268935637e-06, + "loss": 0.5132, + "step": 2548 + }, + { + "epoch": 0.55, + "grad_norm": 0.1587441861629486, + "learning_rate": 4.4498372133956046e-06, + "loss": 0.4854, + "step": 2549 + }, + { + "epoch": 0.55, + "grad_norm": 0.16079290211200714, + "learning_rate": 4.446369425760042e-06, + "loss": 0.4615, + "step": 2550 + }, + { + "epoch": 0.55, + "grad_norm": 0.1533065140247345, + "learning_rate": 4.442901907717603e-06, + "loss": 0.487, + "step": 2551 + }, + { + "epoch": 0.55, + "grad_norm": 0.17068631947040558, + "learning_rate": 4.439434660956814e-06, + "loss": 0.5596, + "step": 2552 + }, + { + "epoch": 0.55, + "grad_norm": 0.15625819563865662, + "learning_rate": 4.4359676871660665e-06, + "loss": 0.5136, + "step": 2553 + }, + { + "epoch": 0.55, + "grad_norm": 0.16105841100215912, + "learning_rate": 4.432500988033621e-06, + "loss": 0.5351, + "step": 2554 + }, + { + "epoch": 0.55, + "grad_norm": 0.16557753086090088, + "learning_rate": 4.429034565247606e-06, + "loss": 0.491, + "step": 2555 + }, + { + "epoch": 0.55, + "grad_norm": 0.151271790266037, + "learning_rate": 4.42556842049601e-06, + "loss": 0.5274, + "step": 2556 + }, + { + "epoch": 0.55, + "grad_norm": 0.1744944155216217, + "learning_rate": 4.422102555466691e-06, + "loss": 0.5155, + "step": 2557 + }, + { + "epoch": 0.55, + "grad_norm": 0.14737199246883392, + "learning_rate": 4.418636971847367e-06, + "loss": 0.5281, + "step": 2558 + }, + { + "epoch": 0.55, + "grad_norm": 0.16449463367462158, + "learning_rate": 4.415171671325622e-06, + "loss": 0.5183, + "step": 2559 + }, + { + "epoch": 0.55, + "grad_norm": 0.12860900163650513, + "learning_rate": 4.4117066555889045e-06, + "loss": 0.4566, + "step": 2560 + }, + { + "epoch": 0.55, + "grad_norm": 0.1798088699579239, + "learning_rate": 4.408241926324515e-06, + "loss": 0.5072, + "step": 2561 + }, + { + "epoch": 0.55, + "grad_norm": 0.1573198288679123, + "learning_rate": 4.404777485219624e-06, + "loss": 0.5375, + "step": 2562 + }, + { + "epoch": 0.55, + "grad_norm": 0.12489344924688339, + "learning_rate": 4.401313333961257e-06, + "loss": 0.4767, + "step": 2563 + }, + { + "epoch": 0.55, + "grad_norm": 0.1495581567287445, + "learning_rate": 4.397849474236299e-06, + "loss": 0.5327, + "step": 2564 + }, + { + "epoch": 0.55, + "grad_norm": 0.21391817927360535, + "learning_rate": 4.3943859077314956e-06, + "loss": 0.536, + "step": 2565 + }, + { + "epoch": 0.55, + "grad_norm": 0.1484507918357849, + "learning_rate": 4.390922636133444e-06, + "loss": 0.4943, + "step": 2566 + }, + { + "epoch": 0.55, + "grad_norm": 0.16837376356124878, + "learning_rate": 4.3874596611286076e-06, + "loss": 0.544, + "step": 2567 + }, + { + "epoch": 0.55, + "grad_norm": 0.1593373417854309, + "learning_rate": 4.38399698440329e-06, + "loss": 0.5119, + "step": 2568 + }, + { + "epoch": 0.55, + "grad_norm": 0.18334493041038513, + "learning_rate": 4.380534607643668e-06, + "loss": 0.5283, + "step": 2569 + }, + { + "epoch": 0.55, + "grad_norm": 0.1494050920009613, + "learning_rate": 4.377072532535756e-06, + "loss": 0.5343, + "step": 2570 + }, + { + "epoch": 0.55, + "grad_norm": 0.1535796821117401, + "learning_rate": 4.37361076076543e-06, + "loss": 0.5707, + "step": 2571 + }, + { + "epoch": 0.55, + "grad_norm": 0.22966866195201874, + "learning_rate": 4.370149294018419e-06, + "loss": 0.5478, + "step": 2572 + }, + { + "epoch": 0.55, + "grad_norm": 0.166322723031044, + "learning_rate": 4.366688133980299e-06, + "loss": 0.5321, + "step": 2573 + }, + { + "epoch": 0.55, + "grad_norm": 0.1559196412563324, + "learning_rate": 4.3632272823365004e-06, + "loss": 0.4929, + "step": 2574 + }, + { + "epoch": 0.55, + "grad_norm": 0.13609760999679565, + "learning_rate": 4.359766740772301e-06, + "loss": 0.5255, + "step": 2575 + }, + { + "epoch": 0.55, + "grad_norm": 0.17465785145759583, + "learning_rate": 4.356306510972829e-06, + "loss": 0.4871, + "step": 2576 + }, + { + "epoch": 0.56, + "grad_norm": 0.16919800639152527, + "learning_rate": 4.35284659462306e-06, + "loss": 0.5335, + "step": 2577 + }, + { + "epoch": 0.56, + "grad_norm": 0.20923703908920288, + "learning_rate": 4.349386993407817e-06, + "loss": 0.5549, + "step": 2578 + }, + { + "epoch": 0.56, + "grad_norm": 0.14986181259155273, + "learning_rate": 4.345927709011771e-06, + "loss": 0.5111, + "step": 2579 + }, + { + "epoch": 0.56, + "grad_norm": 0.151445209980011, + "learning_rate": 4.342468743119436e-06, + "loss": 0.5129, + "step": 2580 + }, + { + "epoch": 0.56, + "grad_norm": 0.14397896826267242, + "learning_rate": 4.3390100974151715e-06, + "loss": 0.4842, + "step": 2581 + }, + { + "epoch": 0.56, + "grad_norm": 0.16390201449394226, + "learning_rate": 4.335551773583186e-06, + "loss": 0.4678, + "step": 2582 + }, + { + "epoch": 0.56, + "grad_norm": 0.16361331939697266, + "learning_rate": 4.332093773307523e-06, + "loss": 0.5084, + "step": 2583 + }, + { + "epoch": 0.56, + "grad_norm": 0.17257274687290192, + "learning_rate": 4.328636098272075e-06, + "loss": 0.5223, + "step": 2584 + }, + { + "epoch": 0.56, + "grad_norm": 0.14300677180290222, + "learning_rate": 4.325178750160573e-06, + "loss": 0.4712, + "step": 2585 + }, + { + "epoch": 0.56, + "grad_norm": 0.18836241960525513, + "learning_rate": 4.32172173065659e-06, + "loss": 0.5017, + "step": 2586 + }, + { + "epoch": 0.56, + "grad_norm": 0.1701335459947586, + "learning_rate": 4.318265041443538e-06, + "loss": 0.4977, + "step": 2587 + }, + { + "epoch": 0.56, + "grad_norm": 0.18600672483444214, + "learning_rate": 4.31480868420467e-06, + "loss": 0.5121, + "step": 2588 + }, + { + "epoch": 0.56, + "grad_norm": 0.13420304656028748, + "learning_rate": 4.311352660623076e-06, + "loss": 0.4936, + "step": 2589 + }, + { + "epoch": 0.56, + "grad_norm": 0.19042494893074036, + "learning_rate": 4.307896972381681e-06, + "loss": 0.5553, + "step": 2590 + }, + { + "epoch": 0.56, + "grad_norm": 0.14162546396255493, + "learning_rate": 4.304441621163252e-06, + "loss": 0.547, + "step": 2591 + }, + { + "epoch": 0.56, + "grad_norm": 0.17262719571590424, + "learning_rate": 4.3009866086503905e-06, + "loss": 0.5414, + "step": 2592 + }, + { + "epoch": 0.56, + "grad_norm": 0.1511780321598053, + "learning_rate": 4.297531936525528e-06, + "loss": 0.4973, + "step": 2593 + }, + { + "epoch": 0.56, + "grad_norm": 0.15445083379745483, + "learning_rate": 4.294077606470937e-06, + "loss": 0.5506, + "step": 2594 + }, + { + "epoch": 0.56, + "grad_norm": 0.22606535255908966, + "learning_rate": 4.2906236201687186e-06, + "loss": 0.5627, + "step": 2595 + }, + { + "epoch": 0.56, + "grad_norm": 0.2494857758283615, + "learning_rate": 4.28716997930081e-06, + "loss": 0.5328, + "step": 2596 + }, + { + "epoch": 0.56, + "grad_norm": 0.1547478884458542, + "learning_rate": 4.283716685548976e-06, + "loss": 0.5037, + "step": 2597 + }, + { + "epoch": 0.56, + "grad_norm": 0.17305047810077667, + "learning_rate": 4.2802637405948175e-06, + "loss": 0.4701, + "step": 2598 + }, + { + "epoch": 0.56, + "grad_norm": 0.20879824459552765, + "learning_rate": 4.2768111461197635e-06, + "loss": 0.5721, + "step": 2599 + }, + { + "epoch": 0.56, + "grad_norm": 0.20338691771030426, + "learning_rate": 4.273358903805069e-06, + "loss": 0.4916, + "step": 2600 + }, + { + "epoch": 0.56, + "grad_norm": 0.1474212110042572, + "learning_rate": 4.2699070153318244e-06, + "loss": 0.5426, + "step": 2601 + }, + { + "epoch": 0.56, + "grad_norm": 0.1909620314836502, + "learning_rate": 4.266455482380938e-06, + "loss": 0.4591, + "step": 2602 + }, + { + "epoch": 0.56, + "grad_norm": 0.16102322936058044, + "learning_rate": 4.2630043066331536e-06, + "loss": 0.4825, + "step": 2603 + }, + { + "epoch": 0.56, + "grad_norm": 0.174557164311409, + "learning_rate": 4.2595534897690415e-06, + "loss": 0.5141, + "step": 2604 + }, + { + "epoch": 0.56, + "grad_norm": 0.17708678543567657, + "learning_rate": 4.256103033468989e-06, + "loss": 0.5301, + "step": 2605 + }, + { + "epoch": 0.56, + "grad_norm": 0.13558730483055115, + "learning_rate": 4.252652939413215e-06, + "loss": 0.4784, + "step": 2606 + }, + { + "epoch": 0.56, + "grad_norm": 0.188698410987854, + "learning_rate": 4.24920320928176e-06, + "loss": 0.5073, + "step": 2607 + }, + { + "epoch": 0.56, + "grad_norm": 0.181773841381073, + "learning_rate": 4.245753844754484e-06, + "loss": 0.5205, + "step": 2608 + }, + { + "epoch": 0.56, + "grad_norm": 0.15207915008068085, + "learning_rate": 4.242304847511076e-06, + "loss": 0.5098, + "step": 2609 + }, + { + "epoch": 0.56, + "grad_norm": 0.17972496151924133, + "learning_rate": 4.23885621923104e-06, + "loss": 0.5511, + "step": 2610 + }, + { + "epoch": 0.56, + "grad_norm": 0.14959251880645752, + "learning_rate": 4.235407961593704e-06, + "loss": 0.49, + "step": 2611 + }, + { + "epoch": 0.56, + "grad_norm": 0.1577451229095459, + "learning_rate": 4.231960076278211e-06, + "loss": 0.4616, + "step": 2612 + }, + { + "epoch": 0.56, + "grad_norm": 0.1617031991481781, + "learning_rate": 4.228512564963528e-06, + "loss": 0.5371, + "step": 2613 + }, + { + "epoch": 0.56, + "grad_norm": 0.15706071257591248, + "learning_rate": 4.225065429328439e-06, + "loss": 0.4847, + "step": 2614 + }, + { + "epoch": 0.56, + "grad_norm": 0.14980901777744293, + "learning_rate": 4.221618671051539e-06, + "loss": 0.5232, + "step": 2615 + }, + { + "epoch": 0.56, + "grad_norm": 0.15324559807777405, + "learning_rate": 4.218172291811249e-06, + "loss": 0.5333, + "step": 2616 + }, + { + "epoch": 0.56, + "grad_norm": 0.13998126983642578, + "learning_rate": 4.214726293285797e-06, + "loss": 0.5366, + "step": 2617 + }, + { + "epoch": 0.56, + "grad_norm": 0.16418395936489105, + "learning_rate": 4.211280677153228e-06, + "loss": 0.5233, + "step": 2618 + }, + { + "epoch": 0.56, + "grad_norm": 0.16183216869831085, + "learning_rate": 4.207835445091405e-06, + "loss": 0.4953, + "step": 2619 + }, + { + "epoch": 0.56, + "grad_norm": 0.15545772016048431, + "learning_rate": 4.204390598777999e-06, + "loss": 0.5336, + "step": 2620 + }, + { + "epoch": 0.56, + "grad_norm": 0.1599649339914322, + "learning_rate": 4.2009461398904955e-06, + "loss": 0.5047, + "step": 2621 + }, + { + "epoch": 0.56, + "grad_norm": 0.178667351603508, + "learning_rate": 4.1975020701061884e-06, + "loss": 0.5114, + "step": 2622 + }, + { + "epoch": 0.57, + "grad_norm": 0.16403385996818542, + "learning_rate": 4.194058391102188e-06, + "loss": 0.5077, + "step": 2623 + }, + { + "epoch": 0.57, + "grad_norm": 0.16363531351089478, + "learning_rate": 4.190615104555407e-06, + "loss": 0.5107, + "step": 2624 + }, + { + "epoch": 0.57, + "grad_norm": 0.1554226130247116, + "learning_rate": 4.1871722121425725e-06, + "loss": 0.489, + "step": 2625 + }, + { + "epoch": 0.57, + "grad_norm": 0.14770759642124176, + "learning_rate": 4.1837297155402204e-06, + "loss": 0.5776, + "step": 2626 + }, + { + "epoch": 0.57, + "grad_norm": 0.17107781767845154, + "learning_rate": 4.180287616424685e-06, + "loss": 0.4841, + "step": 2627 + }, + { + "epoch": 0.57, + "grad_norm": 0.17729692161083221, + "learning_rate": 4.17684591647212e-06, + "loss": 0.5217, + "step": 2628 + }, + { + "epoch": 0.57, + "grad_norm": 0.12309854477643967, + "learning_rate": 4.173404617358473e-06, + "loss": 0.5291, + "step": 2629 + }, + { + "epoch": 0.57, + "grad_norm": 0.1765958070755005, + "learning_rate": 4.1699637207595035e-06, + "loss": 0.5339, + "step": 2630 + }, + { + "epoch": 0.57, + "grad_norm": 0.13170979917049408, + "learning_rate": 4.166523228350775e-06, + "loss": 0.4824, + "step": 2631 + }, + { + "epoch": 0.57, + "grad_norm": 0.16714021563529968, + "learning_rate": 4.163083141807648e-06, + "loss": 0.5273, + "step": 2632 + }, + { + "epoch": 0.57, + "grad_norm": 0.14370590448379517, + "learning_rate": 4.159643462805293e-06, + "loss": 0.5099, + "step": 2633 + }, + { + "epoch": 0.57, + "grad_norm": 0.16657981276512146, + "learning_rate": 4.156204193018677e-06, + "loss": 0.5525, + "step": 2634 + }, + { + "epoch": 0.57, + "grad_norm": 0.16202954947948456, + "learning_rate": 4.152765334122569e-06, + "loss": 0.514, + "step": 2635 + }, + { + "epoch": 0.57, + "grad_norm": 0.16040794551372528, + "learning_rate": 4.149326887791541e-06, + "loss": 0.506, + "step": 2636 + }, + { + "epoch": 0.57, + "grad_norm": 0.20684373378753662, + "learning_rate": 4.145888855699957e-06, + "loss": 0.4962, + "step": 2637 + }, + { + "epoch": 0.57, + "grad_norm": 0.1377829760313034, + "learning_rate": 4.142451239521988e-06, + "loss": 0.5331, + "step": 2638 + }, + { + "epoch": 0.57, + "grad_norm": 0.1686798632144928, + "learning_rate": 4.139014040931594e-06, + "loss": 0.4454, + "step": 2639 + }, + { + "epoch": 0.57, + "grad_norm": 0.14603053033351898, + "learning_rate": 4.135577261602537e-06, + "loss": 0.4832, + "step": 2640 + }, + { + "epoch": 0.57, + "grad_norm": 0.14471474289894104, + "learning_rate": 4.132140903208376e-06, + "loss": 0.5147, + "step": 2641 + }, + { + "epoch": 0.57, + "grad_norm": 0.17779991030693054, + "learning_rate": 4.128704967422458e-06, + "loss": 0.5427, + "step": 2642 + }, + { + "epoch": 0.57, + "grad_norm": 0.15965478122234344, + "learning_rate": 4.125269455917934e-06, + "loss": 0.5276, + "step": 2643 + }, + { + "epoch": 0.57, + "grad_norm": 0.13071952760219574, + "learning_rate": 4.1218343703677385e-06, + "loss": 0.5247, + "step": 2644 + }, + { + "epoch": 0.57, + "grad_norm": 0.1529112011194229, + "learning_rate": 4.118399712444607e-06, + "loss": 0.4814, + "step": 2645 + }, + { + "epoch": 0.57, + "grad_norm": 0.1327049285173416, + "learning_rate": 4.114965483821061e-06, + "loss": 0.5298, + "step": 2646 + }, + { + "epoch": 0.57, + "grad_norm": 0.1503492295742035, + "learning_rate": 4.111531686169415e-06, + "loss": 0.4757, + "step": 2647 + }, + { + "epoch": 0.57, + "grad_norm": 0.15045446157455444, + "learning_rate": 4.108098321161776e-06, + "loss": 0.5147, + "step": 2648 + }, + { + "epoch": 0.57, + "grad_norm": 0.14457084238529205, + "learning_rate": 4.104665390470034e-06, + "loss": 0.4722, + "step": 2649 + }, + { + "epoch": 0.57, + "grad_norm": 0.16404461860656738, + "learning_rate": 4.101232895765875e-06, + "loss": 0.5217, + "step": 2650 + }, + { + "epoch": 0.57, + "grad_norm": 0.15563920140266418, + "learning_rate": 4.0978008387207656e-06, + "loss": 0.4825, + "step": 2651 + }, + { + "epoch": 0.57, + "grad_norm": 0.1561812460422516, + "learning_rate": 4.094369221005965e-06, + "loss": 0.461, + "step": 2652 + }, + { + "epoch": 0.57, + "grad_norm": 0.1469080001115799, + "learning_rate": 4.090938044292517e-06, + "loss": 0.5018, + "step": 2653 + }, + { + "epoch": 0.57, + "grad_norm": 0.14523988962173462, + "learning_rate": 4.0875073102512485e-06, + "loss": 0.5539, + "step": 2654 + }, + { + "epoch": 0.57, + "grad_norm": 0.13977643847465515, + "learning_rate": 4.084077020552773e-06, + "loss": 0.5137, + "step": 2655 + }, + { + "epoch": 0.57, + "grad_norm": 0.1671827733516693, + "learning_rate": 4.080647176867486e-06, + "loss": 0.4837, + "step": 2656 + }, + { + "epoch": 0.57, + "grad_norm": 0.17201554775238037, + "learning_rate": 4.077217780865568e-06, + "loss": 0.5317, + "step": 2657 + }, + { + "epoch": 0.57, + "grad_norm": 0.16702772676944733, + "learning_rate": 4.07378883421698e-06, + "loss": 0.5107, + "step": 2658 + }, + { + "epoch": 0.57, + "grad_norm": 0.15763095021247864, + "learning_rate": 4.070360338591463e-06, + "loss": 0.4985, + "step": 2659 + }, + { + "epoch": 0.57, + "grad_norm": 0.19694013893604279, + "learning_rate": 4.066932295658543e-06, + "loss": 0.5392, + "step": 2660 + }, + { + "epoch": 0.57, + "grad_norm": 0.16363875567913055, + "learning_rate": 4.0635047070875175e-06, + "loss": 0.5371, + "step": 2661 + }, + { + "epoch": 0.57, + "grad_norm": 0.14083370566368103, + "learning_rate": 4.06007757454747e-06, + "loss": 0.5127, + "step": 2662 + }, + { + "epoch": 0.57, + "grad_norm": 0.20537738502025604, + "learning_rate": 4.056650899707262e-06, + "loss": 0.5337, + "step": 2663 + }, + { + "epoch": 0.57, + "grad_norm": 0.18272843956947327, + "learning_rate": 4.053224684235526e-06, + "loss": 0.4706, + "step": 2664 + }, + { + "epoch": 0.57, + "grad_norm": 0.1590733379125595, + "learning_rate": 4.049798929800676e-06, + "loss": 0.5598, + "step": 2665 + }, + { + "epoch": 0.57, + "grad_norm": 0.1680363267660141, + "learning_rate": 4.0463736380708986e-06, + "loss": 0.5321, + "step": 2666 + }, + { + "epoch": 0.57, + "grad_norm": 0.14845013618469238, + "learning_rate": 4.042948810714158e-06, + "loss": 0.508, + "step": 2667 + }, + { + "epoch": 0.57, + "grad_norm": 0.18387869000434875, + "learning_rate": 4.039524449398191e-06, + "loss": 0.5134, + "step": 2668 + }, + { + "epoch": 0.57, + "grad_norm": 0.14832563698291779, + "learning_rate": 4.036100555790505e-06, + "loss": 0.5149, + "step": 2669 + }, + { + "epoch": 0.58, + "grad_norm": 0.16505834460258484, + "learning_rate": 4.032677131558386e-06, + "loss": 0.5326, + "step": 2670 + }, + { + "epoch": 0.58, + "grad_norm": 0.16617228090763092, + "learning_rate": 4.0292541783688804e-06, + "loss": 0.5246, + "step": 2671 + }, + { + "epoch": 0.58, + "grad_norm": 0.14918413758277893, + "learning_rate": 4.025831697888817e-06, + "loss": 0.4876, + "step": 2672 + }, + { + "epoch": 0.58, + "grad_norm": 0.14089904725551605, + "learning_rate": 4.022409691784791e-06, + "loss": 0.4799, + "step": 2673 + }, + { + "epoch": 0.58, + "grad_norm": 0.20409740507602692, + "learning_rate": 4.01898816172316e-06, + "loss": 0.4963, + "step": 2674 + }, + { + "epoch": 0.58, + "grad_norm": 0.180314838886261, + "learning_rate": 4.015567109370059e-06, + "loss": 0.4895, + "step": 2675 + }, + { + "epoch": 0.58, + "grad_norm": 0.1631566435098648, + "learning_rate": 4.012146536391383e-06, + "loss": 0.4868, + "step": 2676 + }, + { + "epoch": 0.58, + "grad_norm": 0.1476244032382965, + "learning_rate": 4.008726444452799e-06, + "loss": 0.4909, + "step": 2677 + }, + { + "epoch": 0.58, + "grad_norm": 0.14692912995815277, + "learning_rate": 4.005306835219737e-06, + "loss": 0.48, + "step": 2678 + }, + { + "epoch": 0.58, + "grad_norm": 0.15567363798618317, + "learning_rate": 4.001887710357392e-06, + "loss": 0.5127, + "step": 2679 + }, + { + "epoch": 0.58, + "grad_norm": 0.14364320039749146, + "learning_rate": 3.998469071530725e-06, + "loss": 0.5628, + "step": 2680 + }, + { + "epoch": 0.58, + "grad_norm": 0.132903054356575, + "learning_rate": 3.995050920404457e-06, + "loss": 0.5542, + "step": 2681 + }, + { + "epoch": 0.58, + "grad_norm": 0.18202251195907593, + "learning_rate": 3.991633258643077e-06, + "loss": 0.5721, + "step": 2682 + }, + { + "epoch": 0.58, + "grad_norm": 0.16773462295532227, + "learning_rate": 3.988216087910827e-06, + "loss": 0.5039, + "step": 2683 + }, + { + "epoch": 0.58, + "grad_norm": 0.14804890751838684, + "learning_rate": 3.9847994098717166e-06, + "loss": 0.5011, + "step": 2684 + }, + { + "epoch": 0.58, + "grad_norm": 0.14443790912628174, + "learning_rate": 3.981383226189518e-06, + "loss": 0.5187, + "step": 2685 + }, + { + "epoch": 0.58, + "grad_norm": 0.1854676902294159, + "learning_rate": 3.9779675385277545e-06, + "loss": 0.564, + "step": 2686 + }, + { + "epoch": 0.58, + "grad_norm": 0.14072245359420776, + "learning_rate": 3.974552348549714e-06, + "loss": 0.4614, + "step": 2687 + }, + { + "epoch": 0.58, + "grad_norm": 0.1532825082540512, + "learning_rate": 3.971137657918437e-06, + "loss": 0.5517, + "step": 2688 + }, + { + "epoch": 0.58, + "grad_norm": 0.16841329634189606, + "learning_rate": 3.967723468296727e-06, + "loss": 0.4833, + "step": 2689 + }, + { + "epoch": 0.58, + "grad_norm": 0.16667723655700684, + "learning_rate": 3.96430978134714e-06, + "loss": 0.4646, + "step": 2690 + }, + { + "epoch": 0.58, + "grad_norm": 0.15399962663650513, + "learning_rate": 3.960896598731986e-06, + "loss": 0.5664, + "step": 2691 + }, + { + "epoch": 0.58, + "grad_norm": 0.13954004645347595, + "learning_rate": 3.957483922113334e-06, + "loss": 0.4877, + "step": 2692 + }, + { + "epoch": 0.58, + "grad_norm": 0.1900685578584671, + "learning_rate": 3.954071753152999e-06, + "loss": 0.5557, + "step": 2693 + }, + { + "epoch": 0.58, + "grad_norm": 0.16399481892585754, + "learning_rate": 3.950660093512556e-06, + "loss": 0.5266, + "step": 2694 + }, + { + "epoch": 0.58, + "grad_norm": 0.1279776692390442, + "learning_rate": 3.947248944853332e-06, + "loss": 0.4697, + "step": 2695 + }, + { + "epoch": 0.58, + "grad_norm": 0.14985980093479156, + "learning_rate": 3.943838308836398e-06, + "loss": 0.5437, + "step": 2696 + }, + { + "epoch": 0.58, + "grad_norm": 0.2673838138580322, + "learning_rate": 3.940428187122584e-06, + "loss": 0.5087, + "step": 2697 + }, + { + "epoch": 0.58, + "grad_norm": 0.14051130414009094, + "learning_rate": 3.937018581372462e-06, + "loss": 0.5061, + "step": 2698 + }, + { + "epoch": 0.58, + "grad_norm": 0.16947080194950104, + "learning_rate": 3.933609493246357e-06, + "loss": 0.5193, + "step": 2699 + }, + { + "epoch": 0.58, + "grad_norm": 0.18120256066322327, + "learning_rate": 3.9302009244043435e-06, + "loss": 0.576, + "step": 2700 + }, + { + "epoch": 0.58, + "grad_norm": 0.1373000591993332, + "learning_rate": 3.926792876506238e-06, + "loss": 0.5132, + "step": 2701 + }, + { + "epoch": 0.58, + "grad_norm": 0.13234984874725342, + "learning_rate": 3.923385351211609e-06, + "loss": 0.5311, + "step": 2702 + }, + { + "epoch": 0.58, + "grad_norm": 0.1670055389404297, + "learning_rate": 3.919978350179764e-06, + "loss": 0.5461, + "step": 2703 + }, + { + "epoch": 0.58, + "grad_norm": 0.13284355401992798, + "learning_rate": 3.916571875069764e-06, + "loss": 0.4916, + "step": 2704 + }, + { + "epoch": 0.58, + "grad_norm": 0.16038647294044495, + "learning_rate": 3.913165927540403e-06, + "loss": 0.5024, + "step": 2705 + }, + { + "epoch": 0.58, + "grad_norm": 0.15044786036014557, + "learning_rate": 3.909760509250225e-06, + "loss": 0.5306, + "step": 2706 + }, + { + "epoch": 0.58, + "grad_norm": 0.1481400430202484, + "learning_rate": 3.90635562185752e-06, + "loss": 0.4572, + "step": 2707 + }, + { + "epoch": 0.58, + "grad_norm": 0.11982067674398422, + "learning_rate": 3.902951267020311e-06, + "loss": 0.4793, + "step": 2708 + }, + { + "epoch": 0.58, + "grad_norm": 0.1427111029624939, + "learning_rate": 3.899547446396365e-06, + "loss": 0.5488, + "step": 2709 + }, + { + "epoch": 0.58, + "grad_norm": 0.18075178563594818, + "learning_rate": 3.896144161643189e-06, + "loss": 0.5251, + "step": 2710 + }, + { + "epoch": 0.58, + "grad_norm": 0.17146192491054535, + "learning_rate": 3.89274141441803e-06, + "loss": 0.5115, + "step": 2711 + }, + { + "epoch": 0.58, + "grad_norm": 0.19387201964855194, + "learning_rate": 3.8893392063778736e-06, + "loss": 0.5017, + "step": 2712 + }, + { + "epoch": 0.58, + "grad_norm": 0.12743881344795227, + "learning_rate": 3.88593753917944e-06, + "loss": 0.4378, + "step": 2713 + }, + { + "epoch": 0.58, + "grad_norm": 0.20177234709262848, + "learning_rate": 3.882536414479189e-06, + "loss": 0.5104, + "step": 2714 + }, + { + "epoch": 0.58, + "grad_norm": 0.15477432310581207, + "learning_rate": 3.879135833933311e-06, + "loss": 0.4847, + "step": 2715 + }, + { + "epoch": 0.59, + "grad_norm": 0.18448805809020996, + "learning_rate": 3.8757357991977415e-06, + "loss": 0.4854, + "step": 2716 + }, + { + "epoch": 0.59, + "grad_norm": 0.1681356281042099, + "learning_rate": 3.8723363119281426e-06, + "loss": 0.5493, + "step": 2717 + }, + { + "epoch": 0.59, + "grad_norm": 0.1758078634738922, + "learning_rate": 3.868937373779907e-06, + "loss": 0.5012, + "step": 2718 + }, + { + "epoch": 0.59, + "grad_norm": 0.18355970084667206, + "learning_rate": 3.865538986408169e-06, + "loss": 0.5385, + "step": 2719 + }, + { + "epoch": 0.59, + "grad_norm": 0.15605668723583221, + "learning_rate": 3.862141151467787e-06, + "loss": 0.547, + "step": 2720 + }, + { + "epoch": 0.59, + "grad_norm": 0.17370112240314484, + "learning_rate": 3.858743870613355e-06, + "loss": 0.5308, + "step": 2721 + }, + { + "epoch": 0.59, + "grad_norm": 0.1348809152841568, + "learning_rate": 3.855347145499197e-06, + "loss": 0.5194, + "step": 2722 + }, + { + "epoch": 0.59, + "grad_norm": 0.17120207846164703, + "learning_rate": 3.851950977779361e-06, + "loss": 0.5159, + "step": 2723 + }, + { + "epoch": 0.59, + "grad_norm": 0.15506203472614288, + "learning_rate": 3.848555369107631e-06, + "loss": 0.5213, + "step": 2724 + }, + { + "epoch": 0.59, + "grad_norm": 0.13186971843242645, + "learning_rate": 3.845160321137512e-06, + "loss": 0.4798, + "step": 2725 + }, + { + "epoch": 0.59, + "grad_norm": 0.15033838152885437, + "learning_rate": 3.841765835522242e-06, + "loss": 0.5573, + "step": 2726 + }, + { + "epoch": 0.59, + "grad_norm": 0.18248233199119568, + "learning_rate": 3.838371913914783e-06, + "loss": 0.4529, + "step": 2727 + }, + { + "epoch": 0.59, + "grad_norm": 0.1604524403810501, + "learning_rate": 3.83497855796782e-06, + "loss": 0.506, + "step": 2728 + }, + { + "epoch": 0.59, + "grad_norm": 0.1608039289712906, + "learning_rate": 3.831585769333766e-06, + "loss": 0.5207, + "step": 2729 + }, + { + "epoch": 0.59, + "grad_norm": 0.14408713579177856, + "learning_rate": 3.8281935496647526e-06, + "loss": 0.5487, + "step": 2730 + }, + { + "epoch": 0.59, + "grad_norm": 0.15173058211803436, + "learning_rate": 3.824801900612642e-06, + "loss": 0.5054, + "step": 2731 + }, + { + "epoch": 0.59, + "grad_norm": 0.20556017756462097, + "learning_rate": 3.821410823829011e-06, + "loss": 0.5244, + "step": 2732 + }, + { + "epoch": 0.59, + "grad_norm": 0.1307820975780487, + "learning_rate": 3.818020320965162e-06, + "loss": 0.5035, + "step": 2733 + }, + { + "epoch": 0.59, + "grad_norm": 0.18517783284187317, + "learning_rate": 3.8146303936721197e-06, + "loss": 0.4838, + "step": 2734 + }, + { + "epoch": 0.59, + "grad_norm": 0.13619892299175262, + "learning_rate": 3.811241043600622e-06, + "loss": 0.5416, + "step": 2735 + }, + { + "epoch": 0.59, + "grad_norm": 0.19370396435260773, + "learning_rate": 3.8078522724011324e-06, + "loss": 0.5622, + "step": 2736 + }, + { + "epoch": 0.59, + "grad_norm": 0.1536007523536682, + "learning_rate": 3.8044640817238276e-06, + "loss": 0.5121, + "step": 2737 + }, + { + "epoch": 0.59, + "grad_norm": 0.15463578701019287, + "learning_rate": 3.8010764732186044e-06, + "loss": 0.5102, + "step": 2738 + }, + { + "epoch": 0.59, + "grad_norm": 0.17096665501594543, + "learning_rate": 3.797689448535078e-06, + "loss": 0.4799, + "step": 2739 + }, + { + "epoch": 0.59, + "grad_norm": 0.3133319616317749, + "learning_rate": 3.79430300932257e-06, + "loss": 0.5698, + "step": 2740 + }, + { + "epoch": 0.59, + "grad_norm": 0.23030497133731842, + "learning_rate": 3.790917157230132e-06, + "loss": 0.5706, + "step": 2741 + }, + { + "epoch": 0.59, + "grad_norm": 0.15342505276203156, + "learning_rate": 3.7875318939065147e-06, + "loss": 0.4826, + "step": 2742 + }, + { + "epoch": 0.59, + "grad_norm": 0.2005234658718109, + "learning_rate": 3.784147221000191e-06, + "loss": 0.5415, + "step": 2743 + }, + { + "epoch": 0.59, + "grad_norm": 0.13762331008911133, + "learning_rate": 3.7807631401593455e-06, + "loss": 0.5106, + "step": 2744 + }, + { + "epoch": 0.59, + "grad_norm": 0.2076551467180252, + "learning_rate": 3.7773796530318703e-06, + "loss": 0.503, + "step": 2745 + }, + { + "epoch": 0.59, + "grad_norm": 0.1570519208908081, + "learning_rate": 3.773996761265373e-06, + "loss": 0.5074, + "step": 2746 + }, + { + "epoch": 0.59, + "grad_norm": 0.1342182457447052, + "learning_rate": 3.7706144665071683e-06, + "loss": 0.4931, + "step": 2747 + }, + { + "epoch": 0.59, + "grad_norm": 0.17213162779808044, + "learning_rate": 3.767232770404281e-06, + "loss": 0.4552, + "step": 2748 + }, + { + "epoch": 0.59, + "grad_norm": 0.13107101619243622, + "learning_rate": 3.7638516746034465e-06, + "loss": 0.4909, + "step": 2749 + }, + { + "epoch": 0.59, + "grad_norm": 0.16508126258850098, + "learning_rate": 3.7604711807511034e-06, + "loss": 0.523, + "step": 2750 + }, + { + "epoch": 0.59, + "grad_norm": 0.15281084179878235, + "learning_rate": 3.757091290493404e-06, + "loss": 0.5309, + "step": 2751 + }, + { + "epoch": 0.59, + "grad_norm": 0.20402151346206665, + "learning_rate": 3.753712005476197e-06, + "loss": 0.5493, + "step": 2752 + }, + { + "epoch": 0.59, + "grad_norm": 0.15612109005451202, + "learning_rate": 3.7503333273450425e-06, + "loss": 0.5259, + "step": 2753 + }, + { + "epoch": 0.59, + "grad_norm": 0.1936381310224533, + "learning_rate": 3.74695525774521e-06, + "loss": 0.5087, + "step": 2754 + }, + { + "epoch": 0.59, + "grad_norm": 0.1426432728767395, + "learning_rate": 3.7435777983216614e-06, + "loss": 0.5044, + "step": 2755 + }, + { + "epoch": 0.59, + "grad_norm": 0.14533087611198425, + "learning_rate": 3.7402009507190696e-06, + "loss": 0.5529, + "step": 2756 + }, + { + "epoch": 0.59, + "grad_norm": 0.15488633513450623, + "learning_rate": 3.7368247165818056e-06, + "loss": 0.4872, + "step": 2757 + }, + { + "epoch": 0.59, + "grad_norm": 0.14580923318862915, + "learning_rate": 3.733449097553945e-06, + "loss": 0.551, + "step": 2758 + }, + { + "epoch": 0.59, + "grad_norm": 0.17380273342132568, + "learning_rate": 3.7300740952792602e-06, + "loss": 0.5494, + "step": 2759 + }, + { + "epoch": 0.59, + "grad_norm": 0.171724334359169, + "learning_rate": 3.7266997114012265e-06, + "loss": 0.5556, + "step": 2760 + }, + { + "epoch": 0.59, + "grad_norm": 0.15848620235919952, + "learning_rate": 3.723325947563018e-06, + "loss": 0.5165, + "step": 2761 + }, + { + "epoch": 0.59, + "grad_norm": 0.15606124699115753, + "learning_rate": 3.7199528054075005e-06, + "loss": 0.5302, + "step": 2762 + }, + { + "epoch": 0.6, + "grad_norm": 0.16441625356674194, + "learning_rate": 3.7165802865772495e-06, + "loss": 0.5862, + "step": 2763 + }, + { + "epoch": 0.6, + "grad_norm": 0.13233539462089539, + "learning_rate": 3.713208392714523e-06, + "loss": 0.5144, + "step": 2764 + }, + { + "epoch": 0.6, + "grad_norm": 0.16361810266971588, + "learning_rate": 3.709837125461283e-06, + "loss": 0.4873, + "step": 2765 + }, + { + "epoch": 0.6, + "grad_norm": 0.9805002808570862, + "learning_rate": 3.7064664864591878e-06, + "loss": 0.5081, + "step": 2766 + }, + { + "epoch": 0.6, + "grad_norm": 0.15291385352611542, + "learning_rate": 3.7030964773495823e-06, + "loss": 0.4899, + "step": 2767 + }, + { + "epoch": 0.6, + "grad_norm": 0.19646501541137695, + "learning_rate": 3.6997270997735122e-06, + "loss": 0.5642, + "step": 2768 + }, + { + "epoch": 0.6, + "grad_norm": 0.15896441042423248, + "learning_rate": 3.6963583553717104e-06, + "loss": 0.5153, + "step": 2769 + }, + { + "epoch": 0.6, + "grad_norm": 0.15843161940574646, + "learning_rate": 3.6929902457846034e-06, + "loss": 0.497, + "step": 2770 + }, + { + "epoch": 0.6, + "grad_norm": 0.19402094185352325, + "learning_rate": 3.6896227726523113e-06, + "loss": 0.5438, + "step": 2771 + }, + { + "epoch": 0.6, + "grad_norm": 0.1643831878900528, + "learning_rate": 3.6862559376146388e-06, + "loss": 0.5383, + "step": 2772 + }, + { + "epoch": 0.6, + "grad_norm": 0.15504218637943268, + "learning_rate": 3.6828897423110866e-06, + "loss": 0.505, + "step": 2773 + }, + { + "epoch": 0.6, + "grad_norm": 0.1874060332775116, + "learning_rate": 3.6795241883808342e-06, + "loss": 0.5366, + "step": 2774 + }, + { + "epoch": 0.6, + "grad_norm": 0.16982296109199524, + "learning_rate": 3.676159277462757e-06, + "loss": 0.5237, + "step": 2775 + }, + { + "epoch": 0.6, + "grad_norm": 0.16953998804092407, + "learning_rate": 3.6727950111954186e-06, + "loss": 0.498, + "step": 2776 + }, + { + "epoch": 0.6, + "grad_norm": 0.1400230973958969, + "learning_rate": 3.66943139121706e-06, + "loss": 0.4611, + "step": 2777 + }, + { + "epoch": 0.6, + "grad_norm": 0.15184669196605682, + "learning_rate": 3.6660684191656155e-06, + "loss": 0.5214, + "step": 2778 + }, + { + "epoch": 0.6, + "grad_norm": 0.14015498757362366, + "learning_rate": 3.662706096678699e-06, + "loss": 0.4915, + "step": 2779 + }, + { + "epoch": 0.6, + "grad_norm": 0.17873437702655792, + "learning_rate": 3.6593444253936094e-06, + "loss": 0.4492, + "step": 2780 + }, + { + "epoch": 0.6, + "grad_norm": 0.1276986002922058, + "learning_rate": 3.655983406947332e-06, + "loss": 0.4904, + "step": 2781 + }, + { + "epoch": 0.6, + "grad_norm": 0.1345810890197754, + "learning_rate": 3.652623042976529e-06, + "loss": 0.5068, + "step": 2782 + }, + { + "epoch": 0.6, + "grad_norm": 0.17123238742351532, + "learning_rate": 3.649263335117548e-06, + "loss": 0.5292, + "step": 2783 + }, + { + "epoch": 0.6, + "grad_norm": 0.22209994494915009, + "learning_rate": 3.645904285006412e-06, + "loss": 0.5488, + "step": 2784 + }, + { + "epoch": 0.6, + "grad_norm": 0.29981812834739685, + "learning_rate": 3.6425458942788306e-06, + "loss": 0.4935, + "step": 2785 + }, + { + "epoch": 0.6, + "grad_norm": 0.17638364434242249, + "learning_rate": 3.6391881645701854e-06, + "loss": 0.5535, + "step": 2786 + }, + { + "epoch": 0.6, + "grad_norm": 0.1817181259393692, + "learning_rate": 3.63583109751554e-06, + "loss": 0.5224, + "step": 2787 + }, + { + "epoch": 0.6, + "grad_norm": 0.16286495327949524, + "learning_rate": 3.632474694749638e-06, + "loss": 0.5397, + "step": 2788 + }, + { + "epoch": 0.6, + "grad_norm": 0.13048282265663147, + "learning_rate": 3.629118957906892e-06, + "loss": 0.5172, + "step": 2789 + }, + { + "epoch": 0.6, + "grad_norm": 0.1269851177930832, + "learning_rate": 3.625763888621397e-06, + "loss": 0.4823, + "step": 2790 + }, + { + "epoch": 0.6, + "grad_norm": 0.15424852073192596, + "learning_rate": 3.6224094885269184e-06, + "loss": 0.5374, + "step": 2791 + }, + { + "epoch": 0.6, + "grad_norm": 0.1900346428155899, + "learning_rate": 3.6190557592569e-06, + "loss": 0.4719, + "step": 2792 + }, + { + "epoch": 0.6, + "grad_norm": 0.1395425945520401, + "learning_rate": 3.6157027024444558e-06, + "loss": 0.5218, + "step": 2793 + }, + { + "epoch": 0.6, + "grad_norm": 0.1748196929693222, + "learning_rate": 3.612350319722372e-06, + "loss": 0.5003, + "step": 2794 + }, + { + "epoch": 0.6, + "grad_norm": 0.15849445760250092, + "learning_rate": 3.6089986127231117e-06, + "loss": 0.5239, + "step": 2795 + }, + { + "epoch": 0.6, + "grad_norm": 0.16830691695213318, + "learning_rate": 3.6056475830787997e-06, + "loss": 0.5213, + "step": 2796 + }, + { + "epoch": 0.6, + "grad_norm": 0.13852837681770325, + "learning_rate": 3.6022972324212396e-06, + "loss": 0.4697, + "step": 2797 + }, + { + "epoch": 0.6, + "grad_norm": 0.12535005807876587, + "learning_rate": 3.5989475623819025e-06, + "loss": 0.5444, + "step": 2798 + }, + { + "epoch": 0.6, + "grad_norm": 0.1402188241481781, + "learning_rate": 3.595598574591923e-06, + "loss": 0.5238, + "step": 2799 + }, + { + "epoch": 0.6, + "grad_norm": 0.14916275441646576, + "learning_rate": 3.5922502706821094e-06, + "loss": 0.4976, + "step": 2800 + }, + { + "epoch": 0.6, + "grad_norm": 0.1618949919939041, + "learning_rate": 3.588902652282934e-06, + "loss": 0.5345, + "step": 2801 + }, + { + "epoch": 0.6, + "grad_norm": 0.14491844177246094, + "learning_rate": 3.585555721024535e-06, + "loss": 0.515, + "step": 2802 + }, + { + "epoch": 0.6, + "grad_norm": 0.15220017731189728, + "learning_rate": 3.58220947853672e-06, + "loss": 0.5332, + "step": 2803 + }, + { + "epoch": 0.6, + "grad_norm": 0.152902290225029, + "learning_rate": 3.578863926448955e-06, + "loss": 0.5592, + "step": 2804 + }, + { + "epoch": 0.6, + "grad_norm": 0.11480627208948135, + "learning_rate": 3.5755190663903753e-06, + "loss": 0.4952, + "step": 2805 + }, + { + "epoch": 0.6, + "grad_norm": 0.14540837705135345, + "learning_rate": 3.5721748999897753e-06, + "loss": 0.5294, + "step": 2806 + }, + { + "epoch": 0.6, + "grad_norm": 0.1490909457206726, + "learning_rate": 3.5688314288756136e-06, + "loss": 0.5052, + "step": 2807 + }, + { + "epoch": 0.6, + "grad_norm": 0.18195994198322296, + "learning_rate": 3.5654886546760125e-06, + "loss": 0.5326, + "step": 2808 + }, + { + "epoch": 0.61, + "grad_norm": 0.2022872418165207, + "learning_rate": 3.562146579018747e-06, + "loss": 0.5723, + "step": 2809 + }, + { + "epoch": 0.61, + "grad_norm": 0.15741689503192902, + "learning_rate": 3.558805203531263e-06, + "loss": 0.5499, + "step": 2810 + }, + { + "epoch": 0.61, + "grad_norm": 0.1889500916004181, + "learning_rate": 3.5554645298406553e-06, + "loss": 0.5991, + "step": 2811 + }, + { + "epoch": 0.61, + "grad_norm": 0.1986282765865326, + "learning_rate": 3.5521245595736837e-06, + "loss": 0.4946, + "step": 2812 + }, + { + "epoch": 0.61, + "grad_norm": 0.17025060951709747, + "learning_rate": 3.5487852943567614e-06, + "loss": 0.567, + "step": 2813 + }, + { + "epoch": 0.61, + "grad_norm": 0.14447635412216187, + "learning_rate": 3.5454467358159606e-06, + "loss": 0.4781, + "step": 2814 + }, + { + "epoch": 0.61, + "grad_norm": 0.12846069037914276, + "learning_rate": 3.54210888557701e-06, + "loss": 0.511, + "step": 2815 + }, + { + "epoch": 0.61, + "grad_norm": 0.180193692445755, + "learning_rate": 3.5387717452652914e-06, + "loss": 0.4993, + "step": 2816 + }, + { + "epoch": 0.61, + "grad_norm": 0.13410285115242004, + "learning_rate": 3.535435316505843e-06, + "loss": 0.4746, + "step": 2817 + }, + { + "epoch": 0.61, + "grad_norm": 0.16177906095981598, + "learning_rate": 3.53209960092335e-06, + "loss": 0.5347, + "step": 2818 + }, + { + "epoch": 0.61, + "grad_norm": 0.15283246338367462, + "learning_rate": 3.5287646001421604e-06, + "loss": 0.5191, + "step": 2819 + }, + { + "epoch": 0.61, + "grad_norm": 0.15224431455135345, + "learning_rate": 3.5254303157862707e-06, + "loss": 0.5055, + "step": 2820 + }, + { + "epoch": 0.61, + "grad_norm": 0.18944452702999115, + "learning_rate": 3.5220967494793216e-06, + "loss": 0.463, + "step": 2821 + }, + { + "epoch": 0.61, + "grad_norm": 0.15556566417217255, + "learning_rate": 3.5187639028446136e-06, + "loss": 0.5134, + "step": 2822 + }, + { + "epoch": 0.61, + "grad_norm": 0.155210942029953, + "learning_rate": 3.5154317775050906e-06, + "loss": 0.4888, + "step": 2823 + }, + { + "epoch": 0.61, + "grad_norm": 0.16802415251731873, + "learning_rate": 3.512100375083347e-06, + "loss": 0.5124, + "step": 2824 + }, + { + "epoch": 0.61, + "grad_norm": 0.23786631226539612, + "learning_rate": 3.508769697201629e-06, + "loss": 0.5722, + "step": 2825 + }, + { + "epoch": 0.61, + "grad_norm": 0.15338438749313354, + "learning_rate": 3.5054397454818224e-06, + "loss": 0.5459, + "step": 2826 + }, + { + "epoch": 0.61, + "grad_norm": 0.1475946456193924, + "learning_rate": 3.5021105215454666e-06, + "loss": 0.5012, + "step": 2827 + }, + { + "epoch": 0.61, + "grad_norm": 0.15379135310649872, + "learning_rate": 3.498782027013742e-06, + "loss": 0.5131, + "step": 2828 + }, + { + "epoch": 0.61, + "grad_norm": 0.20665378868579865, + "learning_rate": 3.4954542635074744e-06, + "loss": 0.5291, + "step": 2829 + }, + { + "epoch": 0.61, + "grad_norm": 0.1389567106962204, + "learning_rate": 3.4921272326471388e-06, + "loss": 0.5211, + "step": 2830 + }, + { + "epoch": 0.61, + "grad_norm": 0.1549108624458313, + "learning_rate": 3.488800936052843e-06, + "loss": 0.4565, + "step": 2831 + }, + { + "epoch": 0.61, + "grad_norm": 0.16236495971679688, + "learning_rate": 3.4854753753443494e-06, + "loss": 0.4741, + "step": 2832 + }, + { + "epoch": 0.61, + "grad_norm": 0.13626092672348022, + "learning_rate": 3.4821505521410514e-06, + "loss": 0.4822, + "step": 2833 + }, + { + "epoch": 0.61, + "grad_norm": 0.13619300723075867, + "learning_rate": 3.47882646806199e-06, + "loss": 0.4672, + "step": 2834 + }, + { + "epoch": 0.61, + "grad_norm": 0.17099611461162567, + "learning_rate": 3.4755031247258453e-06, + "loss": 0.5018, + "step": 2835 + }, + { + "epoch": 0.61, + "grad_norm": 0.2704041600227356, + "learning_rate": 3.472180523750933e-06, + "loss": 0.4887, + "step": 2836 + }, + { + "epoch": 0.61, + "grad_norm": 0.1702050119638443, + "learning_rate": 3.468858666755214e-06, + "loss": 0.4735, + "step": 2837 + }, + { + "epoch": 0.61, + "grad_norm": 0.13018356263637543, + "learning_rate": 3.4655375553562774e-06, + "loss": 0.5054, + "step": 2838 + }, + { + "epoch": 0.61, + "grad_norm": 0.15961863100528717, + "learning_rate": 3.4622171911713597e-06, + "loss": 0.4903, + "step": 2839 + }, + { + "epoch": 0.61, + "grad_norm": 0.20230530202388763, + "learning_rate": 3.458897575817326e-06, + "loss": 0.4923, + "step": 2840 + }, + { + "epoch": 0.61, + "grad_norm": 0.1560392677783966, + "learning_rate": 3.4555787109106786e-06, + "loss": 0.4996, + "step": 2841 + }, + { + "epoch": 0.61, + "grad_norm": 0.17162789404392242, + "learning_rate": 3.4522605980675593e-06, + "loss": 0.5324, + "step": 2842 + }, + { + "epoch": 0.61, + "grad_norm": 0.14241425693035126, + "learning_rate": 3.4489432389037326e-06, + "loss": 0.5093, + "step": 2843 + }, + { + "epoch": 0.61, + "grad_norm": 0.17781661450862885, + "learning_rate": 3.44562663503461e-06, + "loss": 0.545, + "step": 2844 + }, + { + "epoch": 0.61, + "grad_norm": 0.26344063878059387, + "learning_rate": 3.4423107880752227e-06, + "loss": 0.5451, + "step": 2845 + }, + { + "epoch": 0.61, + "grad_norm": 0.1670253723859787, + "learning_rate": 3.43899569964024e-06, + "loss": 0.4649, + "step": 2846 + }, + { + "epoch": 0.61, + "grad_norm": 0.17507214844226837, + "learning_rate": 3.4356813713439626e-06, + "loss": 0.5291, + "step": 2847 + }, + { + "epoch": 0.61, + "grad_norm": 0.1973615288734436, + "learning_rate": 3.432367804800316e-06, + "loss": 0.5424, + "step": 2848 + }, + { + "epoch": 0.61, + "grad_norm": 0.13851170241832733, + "learning_rate": 3.42905500162286e-06, + "loss": 0.4921, + "step": 2849 + }, + { + "epoch": 0.61, + "grad_norm": 0.15649986267089844, + "learning_rate": 3.4257429634247783e-06, + "loss": 0.5102, + "step": 2850 + }, + { + "epoch": 0.61, + "grad_norm": 0.1704344004392624, + "learning_rate": 3.4224316918188855e-06, + "loss": 0.5317, + "step": 2851 + }, + { + "epoch": 0.61, + "grad_norm": 0.19456495344638824, + "learning_rate": 3.419121188417622e-06, + "loss": 0.4987, + "step": 2852 + }, + { + "epoch": 0.61, + "grad_norm": 0.14243166148662567, + "learning_rate": 3.4158114548330525e-06, + "loss": 0.5126, + "step": 2853 + }, + { + "epoch": 0.61, + "grad_norm": 0.1448044627904892, + "learning_rate": 3.41250249267687e-06, + "loss": 0.5183, + "step": 2854 + }, + { + "epoch": 0.62, + "grad_norm": 0.17978918552398682, + "learning_rate": 3.409194303560387e-06, + "loss": 0.5421, + "step": 2855 + }, + { + "epoch": 0.62, + "grad_norm": 0.14264936745166779, + "learning_rate": 3.4058868890945425e-06, + "loss": 0.4958, + "step": 2856 + }, + { + "epoch": 0.62, + "grad_norm": 0.15832003951072693, + "learning_rate": 3.4025802508899025e-06, + "loss": 0.4939, + "step": 2857 + }, + { + "epoch": 0.62, + "grad_norm": 0.1486930102109909, + "learning_rate": 3.3992743905566453e-06, + "loss": 0.5264, + "step": 2858 + }, + { + "epoch": 0.62, + "grad_norm": 0.19173184037208557, + "learning_rate": 3.39596930970458e-06, + "loss": 0.5165, + "step": 2859 + }, + { + "epoch": 0.62, + "grad_norm": 0.17818816006183624, + "learning_rate": 3.3926650099431286e-06, + "loss": 0.5617, + "step": 2860 + }, + { + "epoch": 0.62, + "grad_norm": 0.15651050209999084, + "learning_rate": 3.389361492881337e-06, + "loss": 0.4856, + "step": 2861 + }, + { + "epoch": 0.62, + "grad_norm": 0.1457422971725464, + "learning_rate": 3.3860587601278715e-06, + "loss": 0.5187, + "step": 2862 + }, + { + "epoch": 0.62, + "grad_norm": 0.13978311419487, + "learning_rate": 3.3827568132910117e-06, + "loss": 0.493, + "step": 2863 + }, + { + "epoch": 0.62, + "grad_norm": 0.14989745616912842, + "learning_rate": 3.3794556539786584e-06, + "loss": 0.5355, + "step": 2864 + }, + { + "epoch": 0.62, + "grad_norm": 0.16385847330093384, + "learning_rate": 3.376155283798323e-06, + "loss": 0.5402, + "step": 2865 + }, + { + "epoch": 0.62, + "grad_norm": 0.1365756392478943, + "learning_rate": 3.372855704357144e-06, + "loss": 0.5018, + "step": 2866 + }, + { + "epoch": 0.62, + "grad_norm": 0.14765289425849915, + "learning_rate": 3.3695569172618613e-06, + "loss": 0.5786, + "step": 2867 + }, + { + "epoch": 0.62, + "grad_norm": 0.14326290786266327, + "learning_rate": 3.3662589241188382e-06, + "loss": 0.4799, + "step": 2868 + }, + { + "epoch": 0.62, + "grad_norm": 0.1515820473432541, + "learning_rate": 3.3629617265340497e-06, + "loss": 0.4875, + "step": 2869 + }, + { + "epoch": 0.62, + "grad_norm": 0.14540225267410278, + "learning_rate": 3.3596653261130806e-06, + "loss": 0.5127, + "step": 2870 + }, + { + "epoch": 0.62, + "grad_norm": 0.162192702293396, + "learning_rate": 3.3563697244611303e-06, + "loss": 0.4825, + "step": 2871 + }, + { + "epoch": 0.62, + "grad_norm": 0.1744917333126068, + "learning_rate": 3.3530749231830073e-06, + "loss": 0.4677, + "step": 2872 + }, + { + "epoch": 0.62, + "grad_norm": 0.15274450182914734, + "learning_rate": 3.3497809238831314e-06, + "loss": 0.498, + "step": 2873 + }, + { + "epoch": 0.62, + "grad_norm": 0.15344925224781036, + "learning_rate": 3.3464877281655335e-06, + "loss": 0.461, + "step": 2874 + }, + { + "epoch": 0.62, + "grad_norm": 0.14903058111667633, + "learning_rate": 3.3431953376338487e-06, + "loss": 0.5207, + "step": 2875 + }, + { + "epoch": 0.62, + "grad_norm": 0.15112550556659698, + "learning_rate": 3.339903753891326e-06, + "loss": 0.5271, + "step": 2876 + }, + { + "epoch": 0.62, + "grad_norm": 0.13480481505393982, + "learning_rate": 3.3366129785408143e-06, + "loss": 0.4761, + "step": 2877 + }, + { + "epoch": 0.62, + "grad_norm": 0.17278815805912018, + "learning_rate": 3.333323013184773e-06, + "loss": 0.494, + "step": 2878 + }, + { + "epoch": 0.62, + "grad_norm": 0.16020460426807404, + "learning_rate": 3.3300338594252724e-06, + "loss": 0.5306, + "step": 2879 + }, + { + "epoch": 0.62, + "grad_norm": 0.19360634684562683, + "learning_rate": 3.326745518863976e-06, + "loss": 0.5292, + "step": 2880 + }, + { + "epoch": 0.62, + "grad_norm": 0.15092292428016663, + "learning_rate": 3.323457993102161e-06, + "loss": 0.5234, + "step": 2881 + }, + { + "epoch": 0.62, + "grad_norm": 0.13326002657413483, + "learning_rate": 3.320171283740702e-06, + "loss": 0.4962, + "step": 2882 + }, + { + "epoch": 0.62, + "grad_norm": 0.13950808346271515, + "learning_rate": 3.316885392380078e-06, + "loss": 0.5058, + "step": 2883 + }, + { + "epoch": 0.62, + "grad_norm": 0.15002663433551788, + "learning_rate": 3.3136003206203727e-06, + "loss": 0.5212, + "step": 2884 + }, + { + "epoch": 0.62, + "grad_norm": 0.14820055663585663, + "learning_rate": 3.310316070061266e-06, + "loss": 0.5309, + "step": 2885 + }, + { + "epoch": 0.62, + "grad_norm": 0.15101811289787292, + "learning_rate": 3.307032642302041e-06, + "loss": 0.5228, + "step": 2886 + }, + { + "epoch": 0.62, + "grad_norm": 0.15565958619117737, + "learning_rate": 3.3037500389415756e-06, + "loss": 0.4449, + "step": 2887 + }, + { + "epoch": 0.62, + "grad_norm": 0.12206115573644638, + "learning_rate": 3.3004682615783524e-06, + "loss": 0.469, + "step": 2888 + }, + { + "epoch": 0.62, + "grad_norm": 0.15403220057487488, + "learning_rate": 3.2971873118104515e-06, + "loss": 0.4853, + "step": 2889 + }, + { + "epoch": 0.62, + "grad_norm": 0.15070055425167084, + "learning_rate": 3.2939071912355424e-06, + "loss": 0.5003, + "step": 2890 + }, + { + "epoch": 0.62, + "grad_norm": 0.14524191617965698, + "learning_rate": 3.290627901450899e-06, + "loss": 0.5121, + "step": 2891 + }, + { + "epoch": 0.62, + "grad_norm": 0.13863269984722137, + "learning_rate": 3.2873494440533856e-06, + "loss": 0.483, + "step": 2892 + }, + { + "epoch": 0.62, + "grad_norm": 0.162959486246109, + "learning_rate": 3.284071820639465e-06, + "loss": 0.4901, + "step": 2893 + }, + { + "epoch": 0.62, + "grad_norm": 0.1397026926279068, + "learning_rate": 3.2807950328051906e-06, + "loss": 0.4907, + "step": 2894 + }, + { + "epoch": 0.62, + "grad_norm": 0.17842566967010498, + "learning_rate": 3.2775190821462105e-06, + "loss": 0.5001, + "step": 2895 + }, + { + "epoch": 0.62, + "grad_norm": 0.25389254093170166, + "learning_rate": 3.2742439702577665e-06, + "loss": 0.5028, + "step": 2896 + }, + { + "epoch": 0.62, + "grad_norm": 0.13854780793190002, + "learning_rate": 3.2709696987346885e-06, + "loss": 0.5351, + "step": 2897 + }, + { + "epoch": 0.62, + "grad_norm": 0.14294210076332092, + "learning_rate": 3.267696269171402e-06, + "loss": 0.4752, + "step": 2898 + }, + { + "epoch": 0.62, + "grad_norm": 0.12487441301345825, + "learning_rate": 3.264423683161914e-06, + "loss": 0.4884, + "step": 2899 + }, + { + "epoch": 0.62, + "grad_norm": 0.1544751673936844, + "learning_rate": 3.2611519422998308e-06, + "loss": 0.5406, + "step": 2900 + }, + { + "epoch": 0.62, + "grad_norm": 0.16319073736667633, + "learning_rate": 3.257881048178344e-06, + "loss": 0.4985, + "step": 2901 + }, + { + "epoch": 0.63, + "grad_norm": 0.19490410387516022, + "learning_rate": 3.254611002390227e-06, + "loss": 0.5006, + "step": 2902 + }, + { + "epoch": 0.63, + "grad_norm": 0.14253075420856476, + "learning_rate": 3.251341806527848e-06, + "loss": 0.4988, + "step": 2903 + }, + { + "epoch": 0.63, + "grad_norm": 0.14755187928676605, + "learning_rate": 3.248073462183155e-06, + "loss": 0.5083, + "step": 2904 + }, + { + "epoch": 0.63, + "grad_norm": 0.1382237672805786, + "learning_rate": 3.2448059709476864e-06, + "loss": 0.4941, + "step": 2905 + }, + { + "epoch": 0.63, + "grad_norm": 0.13519005477428436, + "learning_rate": 3.2415393344125647e-06, + "loss": 0.4855, + "step": 2906 + }, + { + "epoch": 0.63, + "grad_norm": 0.2366933822631836, + "learning_rate": 3.2382735541684905e-06, + "loss": 0.4875, + "step": 2907 + }, + { + "epoch": 0.63, + "grad_norm": 0.15798290073871613, + "learning_rate": 3.235008631805755e-06, + "loss": 0.5288, + "step": 2908 + }, + { + "epoch": 0.63, + "grad_norm": 0.16785183548927307, + "learning_rate": 3.231744568914226e-06, + "loss": 0.5308, + "step": 2909 + }, + { + "epoch": 0.63, + "grad_norm": 0.19100995361804962, + "learning_rate": 3.228481367083356e-06, + "loss": 0.4923, + "step": 2910 + }, + { + "epoch": 0.63, + "grad_norm": 0.131486177444458, + "learning_rate": 3.2252190279021788e-06, + "loss": 0.4967, + "step": 2911 + }, + { + "epoch": 0.63, + "grad_norm": 0.15485283732414246, + "learning_rate": 3.2219575529593017e-06, + "loss": 0.465, + "step": 2912 + }, + { + "epoch": 0.63, + "grad_norm": 0.1736060082912445, + "learning_rate": 3.2186969438429217e-06, + "loss": 0.5094, + "step": 2913 + }, + { + "epoch": 0.63, + "grad_norm": 0.17122332751750946, + "learning_rate": 3.215437202140803e-06, + "loss": 0.4891, + "step": 2914 + }, + { + "epoch": 0.63, + "grad_norm": 0.15651971101760864, + "learning_rate": 3.2121783294402966e-06, + "loss": 0.4704, + "step": 2915 + }, + { + "epoch": 0.63, + "grad_norm": 0.16835874319076538, + "learning_rate": 3.2089203273283253e-06, + "loss": 0.4694, + "step": 2916 + }, + { + "epoch": 0.63, + "grad_norm": 0.15919756889343262, + "learning_rate": 3.205663197391389e-06, + "loss": 0.5043, + "step": 2917 + }, + { + "epoch": 0.63, + "grad_norm": 0.17332980036735535, + "learning_rate": 3.2024069412155632e-06, + "loss": 0.5494, + "step": 2918 + }, + { + "epoch": 0.63, + "grad_norm": 0.15382111072540283, + "learning_rate": 3.199151560386498e-06, + "loss": 0.4838, + "step": 2919 + }, + { + "epoch": 0.63, + "grad_norm": 0.19345510005950928, + "learning_rate": 3.1958970564894187e-06, + "loss": 0.4929, + "step": 2920 + }, + { + "epoch": 0.63, + "grad_norm": 0.18597455322742462, + "learning_rate": 3.192643431109117e-06, + "loss": 0.5576, + "step": 2921 + }, + { + "epoch": 0.63, + "grad_norm": 0.16669237613677979, + "learning_rate": 3.189390685829967e-06, + "loss": 0.4878, + "step": 2922 + }, + { + "epoch": 0.63, + "grad_norm": 0.13570186495780945, + "learning_rate": 3.186138822235908e-06, + "loss": 0.4852, + "step": 2923 + }, + { + "epoch": 0.63, + "grad_norm": 0.1756938099861145, + "learning_rate": 3.182887841910448e-06, + "loss": 0.5295, + "step": 2924 + }, + { + "epoch": 0.63, + "grad_norm": 0.1592927873134613, + "learning_rate": 3.1796377464366713e-06, + "loss": 0.5879, + "step": 2925 + }, + { + "epoch": 0.63, + "grad_norm": 0.13915982842445374, + "learning_rate": 3.1763885373972246e-06, + "loss": 0.498, + "step": 2926 + }, + { + "epoch": 0.63, + "grad_norm": 0.18962885439395905, + "learning_rate": 3.1731402163743284e-06, + "loss": 0.4949, + "step": 2927 + }, + { + "epoch": 0.63, + "grad_norm": 0.17103898525238037, + "learning_rate": 3.1698927849497683e-06, + "loss": 0.5678, + "step": 2928 + }, + { + "epoch": 0.63, + "grad_norm": 0.19355489313602448, + "learning_rate": 3.166646244704896e-06, + "loss": 0.4849, + "step": 2929 + }, + { + "epoch": 0.63, + "grad_norm": 0.14212578535079956, + "learning_rate": 3.1634005972206326e-06, + "loss": 0.4616, + "step": 2930 + }, + { + "epoch": 0.63, + "grad_norm": 0.13874362409114838, + "learning_rate": 3.160155844077459e-06, + "loss": 0.5322, + "step": 2931 + }, + { + "epoch": 0.63, + "grad_norm": 0.1573115438222885, + "learning_rate": 3.156911986855425e-06, + "loss": 0.555, + "step": 2932 + }, + { + "epoch": 0.63, + "grad_norm": 0.1475786417722702, + "learning_rate": 3.153669027134144e-06, + "loss": 0.5179, + "step": 2933 + }, + { + "epoch": 0.63, + "grad_norm": 0.13680386543273926, + "learning_rate": 3.150426966492788e-06, + "loss": 0.521, + "step": 2934 + }, + { + "epoch": 0.63, + "grad_norm": 0.1602596789598465, + "learning_rate": 3.147185806510099e-06, + "loss": 0.5499, + "step": 2935 + }, + { + "epoch": 0.63, + "grad_norm": 0.14966510236263275, + "learning_rate": 3.143945548764371e-06, + "loss": 0.4922, + "step": 2936 + }, + { + "epoch": 0.63, + "grad_norm": 0.14178875088691711, + "learning_rate": 3.140706194833466e-06, + "loss": 0.4547, + "step": 2937 + }, + { + "epoch": 0.63, + "grad_norm": 0.16615799069404602, + "learning_rate": 3.137467746294803e-06, + "loss": 0.5192, + "step": 2938 + }, + { + "epoch": 0.63, + "grad_norm": 0.19471901655197144, + "learning_rate": 3.13423020472536e-06, + "loss": 0.5068, + "step": 2939 + }, + { + "epoch": 0.63, + "grad_norm": 0.1289563924074173, + "learning_rate": 3.130993571701674e-06, + "loss": 0.483, + "step": 2940 + }, + { + "epoch": 0.63, + "grad_norm": 0.1688213050365448, + "learning_rate": 3.1277578487998387e-06, + "loss": 0.5033, + "step": 2941 + }, + { + "epoch": 0.63, + "grad_norm": 0.14173230528831482, + "learning_rate": 3.124523037595506e-06, + "loss": 0.4745, + "step": 2942 + }, + { + "epoch": 0.63, + "grad_norm": 0.1439976543188095, + "learning_rate": 3.1212891396638834e-06, + "loss": 0.4909, + "step": 2943 + }, + { + "epoch": 0.63, + "grad_norm": 0.13524580001831055, + "learning_rate": 3.1180561565797323e-06, + "loss": 0.5079, + "step": 2944 + }, + { + "epoch": 0.63, + "grad_norm": 0.1610611528158188, + "learning_rate": 3.114824089917372e-06, + "loss": 0.5046, + "step": 2945 + }, + { + "epoch": 0.63, + "grad_norm": 0.1482682079076767, + "learning_rate": 3.1115929412506698e-06, + "loss": 0.4762, + "step": 2946 + }, + { + "epoch": 0.63, + "grad_norm": 0.1553899049758911, + "learning_rate": 3.1083627121530512e-06, + "loss": 0.5337, + "step": 2947 + }, + { + "epoch": 0.64, + "grad_norm": 0.14075995981693268, + "learning_rate": 3.1051334041974923e-06, + "loss": 0.5239, + "step": 2948 + }, + { + "epoch": 0.64, + "grad_norm": 0.14739052951335907, + "learning_rate": 3.1019050189565193e-06, + "loss": 0.5304, + "step": 2949 + }, + { + "epoch": 0.64, + "grad_norm": 0.16444166004657745, + "learning_rate": 3.0986775580022122e-06, + "loss": 0.5106, + "step": 2950 + }, + { + "epoch": 0.64, + "grad_norm": 0.2006131410598755, + "learning_rate": 3.0954510229061963e-06, + "loss": 0.5723, + "step": 2951 + }, + { + "epoch": 0.64, + "grad_norm": 0.16884103417396545, + "learning_rate": 3.092225415239652e-06, + "loss": 0.5637, + "step": 2952 + }, + { + "epoch": 0.64, + "grad_norm": 0.13112773001194, + "learning_rate": 3.089000736573301e-06, + "loss": 0.5007, + "step": 2953 + }, + { + "epoch": 0.64, + "grad_norm": 0.14087074995040894, + "learning_rate": 3.0857769884774192e-06, + "loss": 0.5106, + "step": 2954 + }, + { + "epoch": 0.64, + "grad_norm": 0.17167288064956665, + "learning_rate": 3.0825541725218266e-06, + "loss": 0.5006, + "step": 2955 + }, + { + "epoch": 0.64, + "grad_norm": 0.16773132979869843, + "learning_rate": 3.079332290275887e-06, + "loss": 0.4808, + "step": 2956 + }, + { + "epoch": 0.64, + "grad_norm": 0.15428221225738525, + "learning_rate": 3.076111343308516e-06, + "loss": 0.531, + "step": 2957 + }, + { + "epoch": 0.64, + "grad_norm": 0.2029823362827301, + "learning_rate": 3.0728913331881638e-06, + "loss": 0.5106, + "step": 2958 + }, + { + "epoch": 0.64, + "grad_norm": 0.13769736886024475, + "learning_rate": 3.069672261482832e-06, + "loss": 0.5005, + "step": 2959 + }, + { + "epoch": 0.64, + "grad_norm": 0.17260031402111053, + "learning_rate": 3.0664541297600682e-06, + "loss": 0.5118, + "step": 2960 + }, + { + "epoch": 0.64, + "grad_norm": 0.1693435162305832, + "learning_rate": 3.063236939586951e-06, + "loss": 0.5139, + "step": 2961 + }, + { + "epoch": 0.64, + "grad_norm": 0.12653128802776337, + "learning_rate": 3.0600206925301114e-06, + "loss": 0.5241, + "step": 2962 + }, + { + "epoch": 0.64, + "grad_norm": 0.1622675359249115, + "learning_rate": 3.0568053901557126e-06, + "loss": 0.5418, + "step": 2963 + }, + { + "epoch": 0.64, + "grad_norm": 0.12737122178077698, + "learning_rate": 3.053591034029465e-06, + "loss": 0.4476, + "step": 2964 + }, + { + "epoch": 0.64, + "grad_norm": 0.17606867849826813, + "learning_rate": 3.0503776257166145e-06, + "loss": 0.5201, + "step": 2965 + }, + { + "epoch": 0.64, + "grad_norm": 0.21557646989822388, + "learning_rate": 3.0471651667819447e-06, + "loss": 0.4985, + "step": 2966 + }, + { + "epoch": 0.64, + "grad_norm": 0.20406164228916168, + "learning_rate": 3.0439536587897822e-06, + "loss": 0.4886, + "step": 2967 + }, + { + "epoch": 0.64, + "grad_norm": 0.147229865193367, + "learning_rate": 3.0407431033039795e-06, + "loss": 0.5053, + "step": 2968 + }, + { + "epoch": 0.64, + "grad_norm": 0.20733335614204407, + "learning_rate": 3.0375335018879383e-06, + "loss": 0.4798, + "step": 2969 + }, + { + "epoch": 0.64, + "grad_norm": 0.17706511914730072, + "learning_rate": 3.03432485610459e-06, + "loss": 0.4957, + "step": 2970 + }, + { + "epoch": 0.64, + "grad_norm": 0.18925561010837555, + "learning_rate": 3.031117167516395e-06, + "loss": 0.4832, + "step": 2971 + }, + { + "epoch": 0.64, + "grad_norm": 0.14262109994888306, + "learning_rate": 3.0279104376853592e-06, + "loss": 0.5004, + "step": 2972 + }, + { + "epoch": 0.64, + "grad_norm": 0.21173708140850067, + "learning_rate": 3.0247046681730107e-06, + "loss": 0.534, + "step": 2973 + }, + { + "epoch": 0.64, + "grad_norm": 0.1742897927761078, + "learning_rate": 3.0214998605404165e-06, + "loss": 0.539, + "step": 2974 + }, + { + "epoch": 0.64, + "grad_norm": 0.13318294286727905, + "learning_rate": 3.0182960163481745e-06, + "loss": 0.4896, + "step": 2975 + }, + { + "epoch": 0.64, + "grad_norm": 0.14285793900489807, + "learning_rate": 3.0150931371564107e-06, + "loss": 0.5225, + "step": 2976 + }, + { + "epoch": 0.64, + "grad_norm": 0.14382816851139069, + "learning_rate": 3.0118912245247846e-06, + "loss": 0.5033, + "step": 2977 + }, + { + "epoch": 0.64, + "grad_norm": 0.1816745102405548, + "learning_rate": 3.0086902800124806e-06, + "loss": 0.5737, + "step": 2978 + }, + { + "epoch": 0.64, + "grad_norm": 0.1659248024225235, + "learning_rate": 3.005490305178218e-06, + "loss": 0.513, + "step": 2979 + }, + { + "epoch": 0.64, + "grad_norm": 0.16415072977542877, + "learning_rate": 3.0022913015802363e-06, + "loss": 0.5032, + "step": 2980 + }, + { + "epoch": 0.64, + "grad_norm": 0.12613564729690552, + "learning_rate": 2.9990932707763067e-06, + "loss": 0.5208, + "step": 2981 + }, + { + "epoch": 0.64, + "grad_norm": 0.15900714695453644, + "learning_rate": 2.99589621432373e-06, + "loss": 0.517, + "step": 2982 + }, + { + "epoch": 0.64, + "grad_norm": 0.15835516154766083, + "learning_rate": 2.992700133779324e-06, + "loss": 0.5217, + "step": 2983 + }, + { + "epoch": 0.64, + "grad_norm": 0.15380804240703583, + "learning_rate": 2.9895050306994385e-06, + "loss": 0.5457, + "step": 2984 + }, + { + "epoch": 0.64, + "grad_norm": 0.138858824968338, + "learning_rate": 2.986310906639942e-06, + "loss": 0.5249, + "step": 2985 + }, + { + "epoch": 0.64, + "grad_norm": 0.13095752894878387, + "learning_rate": 2.9831177631562306e-06, + "loss": 0.4808, + "step": 2986 + }, + { + "epoch": 0.64, + "grad_norm": 0.12830592691898346, + "learning_rate": 2.9799256018032223e-06, + "loss": 0.54, + "step": 2987 + }, + { + "epoch": 0.64, + "grad_norm": 0.1949312835931778, + "learning_rate": 2.9767344241353535e-06, + "loss": 0.5108, + "step": 2988 + }, + { + "epoch": 0.64, + "grad_norm": 0.1589624434709549, + "learning_rate": 2.9735442317065864e-06, + "loss": 0.5641, + "step": 2989 + }, + { + "epoch": 0.64, + "grad_norm": 0.14621149003505707, + "learning_rate": 2.9703550260703974e-06, + "loss": 0.5448, + "step": 2990 + }, + { + "epoch": 0.64, + "grad_norm": 0.16770517826080322, + "learning_rate": 2.967166808779788e-06, + "loss": 0.5617, + "step": 2991 + }, + { + "epoch": 0.64, + "grad_norm": 0.1380135864019394, + "learning_rate": 2.9639795813872773e-06, + "loss": 0.5228, + "step": 2992 + }, + { + "epoch": 0.64, + "grad_norm": 0.13159281015396118, + "learning_rate": 2.9607933454448985e-06, + "loss": 0.5122, + "step": 2993 + }, + { + "epoch": 0.64, + "grad_norm": 0.15131685137748718, + "learning_rate": 2.9576081025042068e-06, + "loss": 0.481, + "step": 2994 + }, + { + "epoch": 0.65, + "grad_norm": 0.13696128129959106, + "learning_rate": 2.9544238541162713e-06, + "loss": 0.4559, + "step": 2995 + }, + { + "epoch": 0.65, + "grad_norm": 0.17516811192035675, + "learning_rate": 2.9512406018316763e-06, + "loss": 0.5363, + "step": 2996 + }, + { + "epoch": 0.65, + "grad_norm": 0.17963650822639465, + "learning_rate": 2.9480583472005253e-06, + "loss": 0.4986, + "step": 2997 + }, + { + "epoch": 0.65, + "grad_norm": 0.1492321640253067, + "learning_rate": 2.9448770917724296e-06, + "loss": 0.5725, + "step": 2998 + }, + { + "epoch": 0.65, + "grad_norm": 0.15479613840579987, + "learning_rate": 2.9416968370965194e-06, + "loss": 0.4926, + "step": 2999 + }, + { + "epoch": 0.65, + "grad_norm": 0.1259550005197525, + "learning_rate": 2.9385175847214325e-06, + "loss": 0.5108, + "step": 3000 + }, + { + "epoch": 0.65, + "grad_norm": 0.1810281127691269, + "learning_rate": 2.9353393361953237e-06, + "loss": 0.5176, + "step": 3001 + }, + { + "epoch": 0.65, + "grad_norm": 0.20367856323719025, + "learning_rate": 2.9321620930658578e-06, + "loss": 0.5562, + "step": 3002 + }, + { + "epoch": 0.65, + "grad_norm": 0.1935432255268097, + "learning_rate": 2.928985856880205e-06, + "loss": 0.4959, + "step": 3003 + }, + { + "epoch": 0.65, + "grad_norm": 0.17958539724349976, + "learning_rate": 2.925810629185054e-06, + "loss": 0.5234, + "step": 3004 + }, + { + "epoch": 0.65, + "grad_norm": 0.15984192490577698, + "learning_rate": 2.922636411526593e-06, + "loss": 0.5221, + "step": 3005 + }, + { + "epoch": 0.65, + "grad_norm": 0.13086757063865662, + "learning_rate": 2.919463205450526e-06, + "loss": 0.5034, + "step": 3006 + }, + { + "epoch": 0.65, + "grad_norm": 0.16409295797348022, + "learning_rate": 2.9162910125020575e-06, + "loss": 0.499, + "step": 3007 + }, + { + "epoch": 0.65, + "grad_norm": 0.1658695936203003, + "learning_rate": 2.9131198342259065e-06, + "loss": 0.5489, + "step": 3008 + }, + { + "epoch": 0.65, + "grad_norm": 0.2198559045791626, + "learning_rate": 2.9099496721662947e-06, + "loss": 0.5026, + "step": 3009 + }, + { + "epoch": 0.65, + "grad_norm": 0.1836353838443756, + "learning_rate": 2.9067805278669425e-06, + "loss": 0.5644, + "step": 3010 + }, + { + "epoch": 0.65, + "grad_norm": 0.20136743783950806, + "learning_rate": 2.9036124028710865e-06, + "loss": 0.5142, + "step": 3011 + }, + { + "epoch": 0.65, + "grad_norm": 0.2073100060224533, + "learning_rate": 2.900445298721455e-06, + "loss": 0.5486, + "step": 3012 + }, + { + "epoch": 0.65, + "grad_norm": 0.19056002795696259, + "learning_rate": 2.8972792169602882e-06, + "loss": 0.5525, + "step": 3013 + }, + { + "epoch": 0.65, + "grad_norm": 0.16226232051849365, + "learning_rate": 2.894114159129324e-06, + "loss": 0.5438, + "step": 3014 + }, + { + "epoch": 0.65, + "grad_norm": 0.15393410623073578, + "learning_rate": 2.890950126769803e-06, + "loss": 0.519, + "step": 3015 + }, + { + "epoch": 0.65, + "grad_norm": 0.13310056924819946, + "learning_rate": 2.8877871214224694e-06, + "loss": 0.5414, + "step": 3016 + }, + { + "epoch": 0.65, + "grad_norm": 0.15130481123924255, + "learning_rate": 2.8846251446275587e-06, + "loss": 0.5139, + "step": 3017 + }, + { + "epoch": 0.65, + "grad_norm": 0.14056378602981567, + "learning_rate": 2.881464197924814e-06, + "loss": 0.5016, + "step": 3018 + }, + { + "epoch": 0.65, + "grad_norm": 0.16934460401535034, + "learning_rate": 2.8783042828534756e-06, + "loss": 0.5251, + "step": 3019 + }, + { + "epoch": 0.65, + "grad_norm": 0.172510027885437, + "learning_rate": 2.875145400952274e-06, + "loss": 0.4938, + "step": 3020 + }, + { + "epoch": 0.65, + "grad_norm": 0.18168850243091583, + "learning_rate": 2.87198755375945e-06, + "loss": 0.557, + "step": 3021 + }, + { + "epoch": 0.65, + "grad_norm": 0.18108013272285461, + "learning_rate": 2.868830742812726e-06, + "loss": 0.5058, + "step": 3022 + }, + { + "epoch": 0.65, + "grad_norm": 0.20254182815551758, + "learning_rate": 2.865674969649329e-06, + "loss": 0.5228, + "step": 3023 + }, + { + "epoch": 0.65, + "grad_norm": 0.1535319983959198, + "learning_rate": 2.8625202358059806e-06, + "loss": 0.5533, + "step": 3024 + }, + { + "epoch": 0.65, + "grad_norm": 0.17317281663417816, + "learning_rate": 2.85936654281889e-06, + "loss": 0.5433, + "step": 3025 + }, + { + "epoch": 0.65, + "grad_norm": 0.12184549868106842, + "learning_rate": 2.8562138922237648e-06, + "loss": 0.5126, + "step": 3026 + }, + { + "epoch": 0.65, + "grad_norm": 0.15135183930397034, + "learning_rate": 2.8530622855558045e-06, + "loss": 0.4813, + "step": 3027 + }, + { + "epoch": 0.65, + "grad_norm": 0.23094992339611053, + "learning_rate": 2.8499117243496986e-06, + "loss": 0.4868, + "step": 3028 + }, + { + "epoch": 0.65, + "grad_norm": 0.13720989227294922, + "learning_rate": 2.846762210139631e-06, + "loss": 0.4968, + "step": 3029 + }, + { + "epoch": 0.65, + "grad_norm": 0.1362716108560562, + "learning_rate": 2.8436137444592694e-06, + "loss": 0.5245, + "step": 3030 + }, + { + "epoch": 0.65, + "grad_norm": 0.14415206015110016, + "learning_rate": 2.840466328841778e-06, + "loss": 0.5186, + "step": 3031 + }, + { + "epoch": 0.65, + "grad_norm": 0.18695032596588135, + "learning_rate": 2.837319964819801e-06, + "loss": 0.5611, + "step": 3032 + }, + { + "epoch": 0.65, + "grad_norm": 0.1513887345790863, + "learning_rate": 2.8341746539254807e-06, + "loss": 0.5893, + "step": 3033 + }, + { + "epoch": 0.65, + "grad_norm": 0.17001493275165558, + "learning_rate": 2.8310303976904396e-06, + "loss": 0.4993, + "step": 3034 + }, + { + "epoch": 0.65, + "grad_norm": 0.19183696806430817, + "learning_rate": 2.827887197645789e-06, + "loss": 0.5087, + "step": 3035 + }, + { + "epoch": 0.65, + "grad_norm": 0.151499405503273, + "learning_rate": 2.824745055322128e-06, + "loss": 0.557, + "step": 3036 + }, + { + "epoch": 0.65, + "grad_norm": 0.15552127361297607, + "learning_rate": 2.8216039722495336e-06, + "loss": 0.5215, + "step": 3037 + }, + { + "epoch": 0.65, + "grad_norm": 0.12379120290279388, + "learning_rate": 2.818463949957575e-06, + "loss": 0.5217, + "step": 3038 + }, + { + "epoch": 0.65, + "grad_norm": 0.13502056896686554, + "learning_rate": 2.8153249899753e-06, + "loss": 0.5244, + "step": 3039 + }, + { + "epoch": 0.65, + "grad_norm": 0.15221551060676575, + "learning_rate": 2.8121870938312413e-06, + "loss": 0.5248, + "step": 3040 + }, + { + "epoch": 0.66, + "grad_norm": 0.16277168691158295, + "learning_rate": 2.809050263053414e-06, + "loss": 0.4598, + "step": 3041 + }, + { + "epoch": 0.66, + "grad_norm": 0.1595809906721115, + "learning_rate": 2.80591449916931e-06, + "loss": 0.5505, + "step": 3042 + }, + { + "epoch": 0.66, + "grad_norm": 0.1773127317428589, + "learning_rate": 2.8027798037059094e-06, + "loss": 0.5169, + "step": 3043 + }, + { + "epoch": 0.66, + "grad_norm": 0.1667371243238449, + "learning_rate": 2.7996461781896624e-06, + "loss": 0.4966, + "step": 3044 + }, + { + "epoch": 0.66, + "grad_norm": 0.13818593323230743, + "learning_rate": 2.796513624146504e-06, + "loss": 0.5132, + "step": 3045 + }, + { + "epoch": 0.66, + "grad_norm": 0.13870275020599365, + "learning_rate": 2.7933821431018523e-06, + "loss": 0.528, + "step": 3046 + }, + { + "epoch": 0.66, + "grad_norm": 0.1374882310628891, + "learning_rate": 2.7902517365805916e-06, + "loss": 0.5159, + "step": 3047 + }, + { + "epoch": 0.66, + "grad_norm": 0.1938783973455429, + "learning_rate": 2.7871224061070935e-06, + "loss": 0.5242, + "step": 3048 + }, + { + "epoch": 0.66, + "grad_norm": 0.13137510418891907, + "learning_rate": 2.7839941532051952e-06, + "loss": 0.5338, + "step": 3049 + }, + { + "epoch": 0.66, + "grad_norm": 0.1456771343946457, + "learning_rate": 2.780866979398218e-06, + "loss": 0.5029, + "step": 3050 + }, + { + "epoch": 0.66, + "grad_norm": 0.16268415749073029, + "learning_rate": 2.7777408862089537e-06, + "loss": 0.5301, + "step": 3051 + }, + { + "epoch": 0.66, + "grad_norm": 0.21177208423614502, + "learning_rate": 2.77461587515967e-06, + "loss": 0.5032, + "step": 3052 + }, + { + "epoch": 0.66, + "grad_norm": 0.19144344329833984, + "learning_rate": 2.771491947772108e-06, + "loss": 0.5062, + "step": 3053 + }, + { + "epoch": 0.66, + "grad_norm": 0.13552603125572205, + "learning_rate": 2.7683691055674745e-06, + "loss": 0.5184, + "step": 3054 + }, + { + "epoch": 0.66, + "grad_norm": 0.2080407440662384, + "learning_rate": 2.765247350066455e-06, + "loss": 0.5691, + "step": 3055 + }, + { + "epoch": 0.66, + "grad_norm": 0.1384773850440979, + "learning_rate": 2.7621266827892062e-06, + "loss": 0.4668, + "step": 3056 + }, + { + "epoch": 0.66, + "grad_norm": 0.1618855744600296, + "learning_rate": 2.7590071052553487e-06, + "loss": 0.5399, + "step": 3057 + }, + { + "epoch": 0.66, + "grad_norm": 0.14525936543941498, + "learning_rate": 2.755888618983977e-06, + "loss": 0.5207, + "step": 3058 + }, + { + "epoch": 0.66, + "grad_norm": 0.15105114877223969, + "learning_rate": 2.7527712254936545e-06, + "loss": 0.5042, + "step": 3059 + }, + { + "epoch": 0.66, + "grad_norm": 0.1427949219942093, + "learning_rate": 2.749654926302412e-06, + "loss": 0.5236, + "step": 3060 + }, + { + "epoch": 0.66, + "grad_norm": 0.16231150925159454, + "learning_rate": 2.7465397229277435e-06, + "loss": 0.5481, + "step": 3061 + }, + { + "epoch": 0.66, + "grad_norm": 0.2165137529373169, + "learning_rate": 2.743425616886615e-06, + "loss": 0.5748, + "step": 3062 + }, + { + "epoch": 0.66, + "grad_norm": 0.2217060923576355, + "learning_rate": 2.740312609695455e-06, + "loss": 0.537, + "step": 3063 + }, + { + "epoch": 0.66, + "grad_norm": 0.1639140248298645, + "learning_rate": 2.737200702870157e-06, + "loss": 0.5766, + "step": 3064 + }, + { + "epoch": 0.66, + "grad_norm": 0.16004133224487305, + "learning_rate": 2.734089897926082e-06, + "loss": 0.5546, + "step": 3065 + }, + { + "epoch": 0.66, + "grad_norm": 0.1548355221748352, + "learning_rate": 2.7309801963780485e-06, + "loss": 0.5479, + "step": 3066 + }, + { + "epoch": 0.66, + "grad_norm": 0.13668109476566315, + "learning_rate": 2.727871599740342e-06, + "loss": 0.4974, + "step": 3067 + }, + { + "epoch": 0.66, + "grad_norm": 0.24507245421409607, + "learning_rate": 2.724764109526711e-06, + "loss": 0.5418, + "step": 3068 + }, + { + "epoch": 0.66, + "grad_norm": 0.1891452968120575, + "learning_rate": 2.721657727250359e-06, + "loss": 0.4869, + "step": 3069 + }, + { + "epoch": 0.66, + "grad_norm": 0.16605839133262634, + "learning_rate": 2.7185524544239567e-06, + "loss": 0.5408, + "step": 3070 + }, + { + "epoch": 0.66, + "grad_norm": 0.1509867161512375, + "learning_rate": 2.7154482925596314e-06, + "loss": 0.4962, + "step": 3071 + }, + { + "epoch": 0.66, + "grad_norm": 0.13401636481285095, + "learning_rate": 2.71234524316897e-06, + "loss": 0.4739, + "step": 3072 + }, + { + "epoch": 0.66, + "grad_norm": 0.15112657845020294, + "learning_rate": 2.709243307763019e-06, + "loss": 0.5719, + "step": 3073 + }, + { + "epoch": 0.66, + "grad_norm": 0.1450798213481903, + "learning_rate": 2.706142487852279e-06, + "loss": 0.5104, + "step": 3074 + }, + { + "epoch": 0.66, + "grad_norm": 0.17470777034759521, + "learning_rate": 2.7030427849467113e-06, + "loss": 0.5122, + "step": 3075 + }, + { + "epoch": 0.66, + "grad_norm": 0.173739492893219, + "learning_rate": 2.699944200555727e-06, + "loss": 0.4591, + "step": 3076 + }, + { + "epoch": 0.66, + "grad_norm": 0.1204950362443924, + "learning_rate": 2.696846736188202e-06, + "loss": 0.536, + "step": 3077 + }, + { + "epoch": 0.66, + "grad_norm": 0.1670408993959427, + "learning_rate": 2.693750393352462e-06, + "loss": 0.5477, + "step": 3078 + }, + { + "epoch": 0.66, + "grad_norm": 0.1568535566329956, + "learning_rate": 2.6906551735562824e-06, + "loss": 0.5682, + "step": 3079 + }, + { + "epoch": 0.66, + "grad_norm": 0.18247413635253906, + "learning_rate": 2.6875610783069007e-06, + "loss": 0.4769, + "step": 3080 + }, + { + "epoch": 0.66, + "grad_norm": 0.14836347103118896, + "learning_rate": 2.6844681091109958e-06, + "loss": 0.479, + "step": 3081 + }, + { + "epoch": 0.66, + "grad_norm": 0.15542642772197723, + "learning_rate": 2.681376267474707e-06, + "loss": 0.5113, + "step": 3082 + }, + { + "epoch": 0.66, + "grad_norm": 0.15311211347579956, + "learning_rate": 2.678285554903623e-06, + "loss": 0.5267, + "step": 3083 + }, + { + "epoch": 0.66, + "grad_norm": 0.23527516424655914, + "learning_rate": 2.67519597290278e-06, + "loss": 0.5006, + "step": 3084 + }, + { + "epoch": 0.66, + "grad_norm": 0.13628728687763214, + "learning_rate": 2.6721075229766673e-06, + "loss": 0.5323, + "step": 3085 + }, + { + "epoch": 0.66, + "grad_norm": 0.16927917301654816, + "learning_rate": 2.669020206629217e-06, + "loss": 0.5134, + "step": 3086 + }, + { + "epoch": 0.66, + "grad_norm": 0.17380353808403015, + "learning_rate": 2.665934025363817e-06, + "loss": 0.4888, + "step": 3087 + }, + { + "epoch": 0.67, + "grad_norm": 0.1672961264848709, + "learning_rate": 2.6628489806832947e-06, + "loss": 0.4992, + "step": 3088 + }, + { + "epoch": 0.67, + "grad_norm": 0.15757709741592407, + "learning_rate": 2.659765074089927e-06, + "loss": 0.5237, + "step": 3089 + }, + { + "epoch": 0.67, + "grad_norm": 0.18813352286815643, + "learning_rate": 2.6566823070854442e-06, + "loss": 0.5696, + "step": 3090 + }, + { + "epoch": 0.67, + "grad_norm": 0.17737697064876556, + "learning_rate": 2.653600681171008e-06, + "loss": 0.5657, + "step": 3091 + }, + { + "epoch": 0.67, + "grad_norm": 0.1634911447763443, + "learning_rate": 2.650520197847235e-06, + "loss": 0.4947, + "step": 3092 + }, + { + "epoch": 0.67, + "grad_norm": 0.17239625751972198, + "learning_rate": 2.6474408586141794e-06, + "loss": 0.4936, + "step": 3093 + }, + { + "epoch": 0.67, + "grad_norm": 0.15810348093509674, + "learning_rate": 2.6443626649713407e-06, + "loss": 0.5008, + "step": 3094 + }, + { + "epoch": 0.67, + "grad_norm": 0.13702960312366486, + "learning_rate": 2.6412856184176615e-06, + "loss": 0.5653, + "step": 3095 + }, + { + "epoch": 0.67, + "grad_norm": 0.16318099200725555, + "learning_rate": 2.6382097204515246e-06, + "loss": 0.4573, + "step": 3096 + }, + { + "epoch": 0.67, + "grad_norm": 0.14889857172966003, + "learning_rate": 2.6351349725707543e-06, + "loss": 0.5022, + "step": 3097 + }, + { + "epoch": 0.67, + "grad_norm": 0.18676966428756714, + "learning_rate": 2.6320613762726123e-06, + "loss": 0.5089, + "step": 3098 + }, + { + "epoch": 0.67, + "grad_norm": 0.20256297290325165, + "learning_rate": 2.628988933053802e-06, + "loss": 0.4871, + "step": 3099 + }, + { + "epoch": 0.67, + "grad_norm": 0.18140171468257904, + "learning_rate": 2.625917644410467e-06, + "loss": 0.5102, + "step": 3100 + }, + { + "epoch": 0.67, + "grad_norm": 0.12691918015480042, + "learning_rate": 2.6228475118381825e-06, + "loss": 0.4831, + "step": 3101 + }, + { + "epoch": 0.67, + "grad_norm": 0.16980133950710297, + "learning_rate": 2.6197785368319663e-06, + "loss": 0.4974, + "step": 3102 + }, + { + "epoch": 0.67, + "grad_norm": 0.13890565931797028, + "learning_rate": 2.6167107208862707e-06, + "loss": 0.5288, + "step": 3103 + }, + { + "epoch": 0.67, + "grad_norm": 0.17979633808135986, + "learning_rate": 2.613644065494985e-06, + "loss": 0.5096, + "step": 3104 + }, + { + "epoch": 0.67, + "grad_norm": 0.17529235780239105, + "learning_rate": 2.610578572151433e-06, + "loss": 0.5083, + "step": 3105 + }, + { + "epoch": 0.67, + "grad_norm": 0.13134679198265076, + "learning_rate": 2.6075142423483675e-06, + "loss": 0.5128, + "step": 3106 + }, + { + "epoch": 0.67, + "grad_norm": 0.12923552095890045, + "learning_rate": 2.6044510775779815e-06, + "loss": 0.4858, + "step": 3107 + }, + { + "epoch": 0.67, + "grad_norm": 0.17722611129283905, + "learning_rate": 2.6013890793318972e-06, + "loss": 0.5177, + "step": 3108 + }, + { + "epoch": 0.67, + "grad_norm": 0.17613767087459564, + "learning_rate": 2.5983282491011718e-06, + "loss": 0.5113, + "step": 3109 + }, + { + "epoch": 0.67, + "grad_norm": 0.15595421195030212, + "learning_rate": 2.5952685883762918e-06, + "loss": 0.4972, + "step": 3110 + }, + { + "epoch": 0.67, + "grad_norm": 0.23644490540027618, + "learning_rate": 2.59221009864717e-06, + "loss": 0.533, + "step": 3111 + }, + { + "epoch": 0.67, + "grad_norm": 0.14558325707912445, + "learning_rate": 2.589152781403158e-06, + "loss": 0.4991, + "step": 3112 + }, + { + "epoch": 0.67, + "grad_norm": 0.15675747394561768, + "learning_rate": 2.5860966381330265e-06, + "loss": 0.4931, + "step": 3113 + }, + { + "epoch": 0.67, + "grad_norm": 0.14677970111370087, + "learning_rate": 2.583041670324982e-06, + "loss": 0.4964, + "step": 3114 + }, + { + "epoch": 0.67, + "grad_norm": 0.1544618010520935, + "learning_rate": 2.5799878794666555e-06, + "loss": 0.5627, + "step": 3115 + }, + { + "epoch": 0.67, + "grad_norm": 0.15437090396881104, + "learning_rate": 2.5769352670451058e-06, + "loss": 0.5382, + "step": 3116 + }, + { + "epoch": 0.67, + "grad_norm": 0.16935445368289948, + "learning_rate": 2.57388383454682e-06, + "loss": 0.4881, + "step": 3117 + }, + { + "epoch": 0.67, + "grad_norm": 0.145218625664711, + "learning_rate": 2.5708335834577035e-06, + "loss": 0.492, + "step": 3118 + }, + { + "epoch": 0.67, + "grad_norm": 0.15230430662631989, + "learning_rate": 2.567784515263093e-06, + "loss": 0.5286, + "step": 3119 + }, + { + "epoch": 0.67, + "grad_norm": 0.1474408656358719, + "learning_rate": 2.5647366314477473e-06, + "loss": 0.5342, + "step": 3120 + }, + { + "epoch": 0.67, + "grad_norm": 0.14141744375228882, + "learning_rate": 2.561689933495849e-06, + "loss": 0.4877, + "step": 3121 + }, + { + "epoch": 0.67, + "grad_norm": 0.14578036963939667, + "learning_rate": 2.5586444228910036e-06, + "loss": 0.5148, + "step": 3122 + }, + { + "epoch": 0.67, + "grad_norm": 0.15471605956554413, + "learning_rate": 2.5556001011162337e-06, + "loss": 0.5346, + "step": 3123 + }, + { + "epoch": 0.67, + "grad_norm": 0.15913046896457672, + "learning_rate": 2.5525569696539916e-06, + "loss": 0.5056, + "step": 3124 + }, + { + "epoch": 0.67, + "grad_norm": 0.17138166725635529, + "learning_rate": 2.54951502998614e-06, + "loss": 0.5096, + "step": 3125 + }, + { + "epoch": 0.67, + "grad_norm": 0.18976645171642303, + "learning_rate": 2.546474283593969e-06, + "loss": 0.484, + "step": 3126 + }, + { + "epoch": 0.67, + "grad_norm": 0.14352168142795563, + "learning_rate": 2.5434347319581844e-06, + "loss": 0.4984, + "step": 3127 + }, + { + "epoch": 0.67, + "grad_norm": 0.16046349704265594, + "learning_rate": 2.540396376558912e-06, + "loss": 0.5463, + "step": 3128 + }, + { + "epoch": 0.67, + "grad_norm": 0.1726856231689453, + "learning_rate": 2.5373592188756946e-06, + "loss": 0.5361, + "step": 3129 + }, + { + "epoch": 0.67, + "grad_norm": 0.16727623343467712, + "learning_rate": 2.5343232603874868e-06, + "loss": 0.4938, + "step": 3130 + }, + { + "epoch": 0.67, + "grad_norm": 0.14829504489898682, + "learning_rate": 2.531288502572667e-06, + "loss": 0.5588, + "step": 3131 + }, + { + "epoch": 0.67, + "grad_norm": 0.1359606683254242, + "learning_rate": 2.5282549469090246e-06, + "loss": 0.5151, + "step": 3132 + }, + { + "epoch": 0.67, + "grad_norm": 0.17951302230358124, + "learning_rate": 2.525222594873764e-06, + "loss": 0.552, + "step": 3133 + }, + { + "epoch": 0.68, + "grad_norm": 0.1504855751991272, + "learning_rate": 2.522191447943506e-06, + "loss": 0.5304, + "step": 3134 + }, + { + "epoch": 0.68, + "grad_norm": 0.17266714572906494, + "learning_rate": 2.519161507594279e-06, + "loss": 0.513, + "step": 3135 + }, + { + "epoch": 0.68, + "grad_norm": 0.164722740650177, + "learning_rate": 2.5161327753015297e-06, + "loss": 0.5392, + "step": 3136 + }, + { + "epoch": 0.68, + "grad_norm": 0.22426824271678925, + "learning_rate": 2.5131052525401145e-06, + "loss": 0.5337, + "step": 3137 + }, + { + "epoch": 0.68, + "grad_norm": 0.15474985539913177, + "learning_rate": 2.5100789407842985e-06, + "loss": 0.5149, + "step": 3138 + }, + { + "epoch": 0.68, + "grad_norm": 0.155501589179039, + "learning_rate": 2.5070538415077593e-06, + "loss": 0.5177, + "step": 3139 + }, + { + "epoch": 0.68, + "grad_norm": 0.15586499869823456, + "learning_rate": 2.5040299561835846e-06, + "loss": 0.4912, + "step": 3140 + }, + { + "epoch": 0.68, + "grad_norm": 0.15372590720653534, + "learning_rate": 2.5010072862842725e-06, + "loss": 0.4981, + "step": 3141 + }, + { + "epoch": 0.68, + "grad_norm": 0.1472439020872116, + "learning_rate": 2.4979858332817225e-06, + "loss": 0.5167, + "step": 3142 + }, + { + "epoch": 0.68, + "grad_norm": 0.3344082534313202, + "learning_rate": 2.494965598647248e-06, + "loss": 0.5456, + "step": 3143 + }, + { + "epoch": 0.68, + "grad_norm": 0.13474471867084503, + "learning_rate": 2.4919465838515687e-06, + "loss": 0.5113, + "step": 3144 + }, + { + "epoch": 0.68, + "grad_norm": 0.15115109086036682, + "learning_rate": 2.488928790364804e-06, + "loss": 0.4906, + "step": 3145 + }, + { + "epoch": 0.68, + "grad_norm": 0.13708892464637756, + "learning_rate": 2.48591221965649e-06, + "loss": 0.484, + "step": 3146 + }, + { + "epoch": 0.68, + "grad_norm": 0.14575795829296112, + "learning_rate": 2.482896873195555e-06, + "loss": 0.5477, + "step": 3147 + }, + { + "epoch": 0.68, + "grad_norm": 0.14525777101516724, + "learning_rate": 2.479882752450339e-06, + "loss": 0.5041, + "step": 3148 + }, + { + "epoch": 0.68, + "grad_norm": 0.12615111470222473, + "learning_rate": 2.4768698588885842e-06, + "loss": 0.4841, + "step": 3149 + }, + { + "epoch": 0.68, + "grad_norm": 0.1459239274263382, + "learning_rate": 2.4738581939774303e-06, + "loss": 0.5168, + "step": 3150 + }, + { + "epoch": 0.68, + "grad_norm": 0.16422203183174133, + "learning_rate": 2.4708477591834244e-06, + "loss": 0.5476, + "step": 3151 + }, + { + "epoch": 0.68, + "grad_norm": 0.12386015802621841, + "learning_rate": 2.4678385559725125e-06, + "loss": 0.4401, + "step": 3152 + }, + { + "epoch": 0.68, + "grad_norm": 0.16602352261543274, + "learning_rate": 2.4648305858100413e-06, + "loss": 0.5279, + "step": 3153 + }, + { + "epoch": 0.68, + "grad_norm": 0.195119708776474, + "learning_rate": 2.4618238501607577e-06, + "loss": 0.4794, + "step": 3154 + }, + { + "epoch": 0.68, + "grad_norm": 0.14807678759098053, + "learning_rate": 2.4588183504888023e-06, + "loss": 0.4964, + "step": 3155 + }, + { + "epoch": 0.68, + "grad_norm": 0.1313076764345169, + "learning_rate": 2.455814088257723e-06, + "loss": 0.5141, + "step": 3156 + }, + { + "epoch": 0.68, + "grad_norm": 0.17979438602924347, + "learning_rate": 2.4528110649304555e-06, + "loss": 0.5335, + "step": 3157 + }, + { + "epoch": 0.68, + "grad_norm": 0.17119114100933075, + "learning_rate": 2.4498092819693364e-06, + "loss": 0.4784, + "step": 3158 + }, + { + "epoch": 0.68, + "grad_norm": 0.1639591008424759, + "learning_rate": 2.4468087408361053e-06, + "loss": 0.5275, + "step": 3159 + }, + { + "epoch": 0.68, + "grad_norm": 0.16429801285266876, + "learning_rate": 2.443809442991884e-06, + "loss": 0.4829, + "step": 3160 + }, + { + "epoch": 0.68, + "grad_norm": 0.1692316085100174, + "learning_rate": 2.440811389897199e-06, + "loss": 0.5242, + "step": 3161 + }, + { + "epoch": 0.68, + "grad_norm": 0.16549012064933777, + "learning_rate": 2.4378145830119637e-06, + "loss": 0.5217, + "step": 3162 + }, + { + "epoch": 0.68, + "grad_norm": 0.16364479064941406, + "learning_rate": 2.4348190237954893e-06, + "loss": 0.556, + "step": 3163 + }, + { + "epoch": 0.68, + "grad_norm": 0.14696238934993744, + "learning_rate": 2.4318247137064788e-06, + "loss": 0.5393, + "step": 3164 + }, + { + "epoch": 0.68, + "grad_norm": 0.16105543076992035, + "learning_rate": 2.428831654203025e-06, + "loss": 0.5169, + "step": 3165 + }, + { + "epoch": 0.68, + "grad_norm": 0.14257711172103882, + "learning_rate": 2.425839846742616e-06, + "loss": 0.5376, + "step": 3166 + }, + { + "epoch": 0.68, + "grad_norm": 0.1712980419397354, + "learning_rate": 2.4228492927821227e-06, + "loss": 0.4776, + "step": 3167 + }, + { + "epoch": 0.68, + "grad_norm": 0.15367096662521362, + "learning_rate": 2.4198599937778138e-06, + "loss": 0.4887, + "step": 3168 + }, + { + "epoch": 0.68, + "grad_norm": 0.17562294006347656, + "learning_rate": 2.41687195118534e-06, + "loss": 0.4691, + "step": 3169 + }, + { + "epoch": 0.68, + "grad_norm": 0.1532362997531891, + "learning_rate": 2.4138851664597424e-06, + "loss": 0.5247, + "step": 3170 + }, + { + "epoch": 0.68, + "grad_norm": 0.14801405370235443, + "learning_rate": 2.4108996410554565e-06, + "loss": 0.499, + "step": 3171 + }, + { + "epoch": 0.68, + "grad_norm": 0.24842116236686707, + "learning_rate": 2.407915376426293e-06, + "loss": 0.5365, + "step": 3172 + }, + { + "epoch": 0.68, + "grad_norm": 0.15684857964515686, + "learning_rate": 2.4049323740254575e-06, + "loss": 0.5435, + "step": 3173 + }, + { + "epoch": 0.68, + "grad_norm": 0.15908139944076538, + "learning_rate": 2.401950635305535e-06, + "loss": 0.5011, + "step": 3174 + }, + { + "epoch": 0.68, + "grad_norm": 0.1335798054933548, + "learning_rate": 2.3989701617184986e-06, + "loss": 0.5187, + "step": 3175 + }, + { + "epoch": 0.68, + "grad_norm": 0.14431187510490417, + "learning_rate": 2.395990954715705e-06, + "loss": 0.5294, + "step": 3176 + }, + { + "epoch": 0.68, + "grad_norm": 0.22326305508613586, + "learning_rate": 2.3930130157478938e-06, + "loss": 0.5639, + "step": 3177 + }, + { + "epoch": 0.68, + "grad_norm": 0.11609046161174774, + "learning_rate": 2.390036346265188e-06, + "loss": 0.5045, + "step": 3178 + }, + { + "epoch": 0.68, + "grad_norm": 0.1769070327281952, + "learning_rate": 2.387060947717089e-06, + "loss": 0.4945, + "step": 3179 + }, + { + "epoch": 0.69, + "grad_norm": 0.20527726411819458, + "learning_rate": 2.3840868215524824e-06, + "loss": 0.5375, + "step": 3180 + }, + { + "epoch": 0.69, + "grad_norm": 0.1666276752948761, + "learning_rate": 2.381113969219636e-06, + "loss": 0.5197, + "step": 3181 + }, + { + "epoch": 0.69, + "grad_norm": 0.18321175873279572, + "learning_rate": 2.378142392166191e-06, + "loss": 0.5291, + "step": 3182 + }, + { + "epoch": 0.69, + "grad_norm": 0.14883701503276825, + "learning_rate": 2.375172091839174e-06, + "loss": 0.5554, + "step": 3183 + }, + { + "epoch": 0.69, + "grad_norm": 0.13777758181095123, + "learning_rate": 2.3722030696849857e-06, + "loss": 0.5396, + "step": 3184 + }, + { + "epoch": 0.69, + "grad_norm": 0.14664918184280396, + "learning_rate": 2.3692353271494073e-06, + "loss": 0.4809, + "step": 3185 + }, + { + "epoch": 0.69, + "grad_norm": 0.2592916190624237, + "learning_rate": 2.3662688656775973e-06, + "loss": 0.4879, + "step": 3186 + }, + { + "epoch": 0.69, + "grad_norm": 0.22084404528141022, + "learning_rate": 2.3633036867140843e-06, + "loss": 0.5349, + "step": 3187 + }, + { + "epoch": 0.69, + "grad_norm": 0.17190620303153992, + "learning_rate": 2.3603397917027787e-06, + "loss": 0.518, + "step": 3188 + }, + { + "epoch": 0.69, + "grad_norm": 0.16831335425376892, + "learning_rate": 2.3573771820869646e-06, + "loss": 0.4805, + "step": 3189 + }, + { + "epoch": 0.69, + "grad_norm": 0.1860908567905426, + "learning_rate": 2.3544158593092986e-06, + "loss": 0.5356, + "step": 3190 + }, + { + "epoch": 0.69, + "grad_norm": 0.1522263139486313, + "learning_rate": 2.3514558248118134e-06, + "loss": 0.532, + "step": 3191 + }, + { + "epoch": 0.69, + "grad_norm": 0.1509491503238678, + "learning_rate": 2.3484970800359087e-06, + "loss": 0.5388, + "step": 3192 + }, + { + "epoch": 0.69, + "grad_norm": 0.1586175262928009, + "learning_rate": 2.345539626422363e-06, + "loss": 0.5192, + "step": 3193 + }, + { + "epoch": 0.69, + "grad_norm": 0.13638122379779816, + "learning_rate": 2.34258346541132e-06, + "loss": 0.4945, + "step": 3194 + }, + { + "epoch": 0.69, + "grad_norm": 0.1359930783510208, + "learning_rate": 2.339628598442298e-06, + "loss": 0.5176, + "step": 3195 + }, + { + "epoch": 0.69, + "grad_norm": 0.12893450260162354, + "learning_rate": 2.3366750269541833e-06, + "loss": 0.5562, + "step": 3196 + }, + { + "epoch": 0.69, + "grad_norm": 0.13871856033802032, + "learning_rate": 2.3337227523852337e-06, + "loss": 0.5027, + "step": 3197 + }, + { + "epoch": 0.69, + "grad_norm": 0.3347227871417999, + "learning_rate": 2.3307717761730745e-06, + "loss": 0.4677, + "step": 3198 + }, + { + "epoch": 0.69, + "grad_norm": 0.14517731964588165, + "learning_rate": 2.3278220997546947e-06, + "loss": 0.477, + "step": 3199 + }, + { + "epoch": 0.69, + "grad_norm": 0.15198582410812378, + "learning_rate": 2.3248737245664575e-06, + "loss": 0.4762, + "step": 3200 + }, + { + "epoch": 0.69, + "grad_norm": 0.14821645617485046, + "learning_rate": 2.3219266520440833e-06, + "loss": 0.5042, + "step": 3201 + }, + { + "epoch": 0.69, + "grad_norm": 0.14736728370189667, + "learning_rate": 2.318980883622668e-06, + "loss": 0.542, + "step": 3202 + }, + { + "epoch": 0.69, + "grad_norm": 0.1347406953573227, + "learning_rate": 2.3160364207366687e-06, + "loss": 0.5291, + "step": 3203 + }, + { + "epoch": 0.69, + "grad_norm": 0.1243584007024765, + "learning_rate": 2.313093264819903e-06, + "loss": 0.5376, + "step": 3204 + }, + { + "epoch": 0.69, + "grad_norm": 0.1613331139087677, + "learning_rate": 2.310151417305558e-06, + "loss": 0.517, + "step": 3205 + }, + { + "epoch": 0.69, + "grad_norm": 0.16766297817230225, + "learning_rate": 2.3072108796261766e-06, + "loss": 0.4946, + "step": 3206 + }, + { + "epoch": 0.69, + "grad_norm": 0.161861851811409, + "learning_rate": 2.3042716532136718e-06, + "loss": 0.4984, + "step": 3207 + }, + { + "epoch": 0.69, + "grad_norm": 0.13287605345249176, + "learning_rate": 2.301333739499312e-06, + "loss": 0.4903, + "step": 3208 + }, + { + "epoch": 0.69, + "grad_norm": 0.1508372724056244, + "learning_rate": 2.2983971399137302e-06, + "loss": 0.5094, + "step": 3209 + }, + { + "epoch": 0.69, + "grad_norm": 0.11505939811468124, + "learning_rate": 2.2954618558869194e-06, + "loss": 0.4829, + "step": 3210 + }, + { + "epoch": 0.69, + "grad_norm": 0.19699527323246002, + "learning_rate": 2.2925278888482273e-06, + "loss": 0.508, + "step": 3211 + }, + { + "epoch": 0.69, + "grad_norm": 0.14467853307724, + "learning_rate": 2.2895952402263642e-06, + "loss": 0.508, + "step": 3212 + }, + { + "epoch": 0.69, + "grad_norm": 0.13654360175132751, + "learning_rate": 2.286663911449401e-06, + "loss": 0.4768, + "step": 3213 + }, + { + "epoch": 0.69, + "grad_norm": 0.18661533296108246, + "learning_rate": 2.283733903944756e-06, + "loss": 0.5364, + "step": 3214 + }, + { + "epoch": 0.69, + "grad_norm": 0.1598883867263794, + "learning_rate": 2.280805219139219e-06, + "loss": 0.5225, + "step": 3215 + }, + { + "epoch": 0.69, + "grad_norm": 0.174927219748497, + "learning_rate": 2.2778778584589214e-06, + "loss": 0.468, + "step": 3216 + }, + { + "epoch": 0.69, + "grad_norm": 0.13632676005363464, + "learning_rate": 2.274951823329358e-06, + "loss": 0.5381, + "step": 3217 + }, + { + "epoch": 0.69, + "grad_norm": 0.1499364972114563, + "learning_rate": 2.272027115175377e-06, + "loss": 0.5137, + "step": 3218 + }, + { + "epoch": 0.69, + "grad_norm": 0.1480303555727005, + "learning_rate": 2.2691037354211767e-06, + "loss": 0.4381, + "step": 3219 + }, + { + "epoch": 0.69, + "grad_norm": 0.13622407615184784, + "learning_rate": 2.2661816854903117e-06, + "loss": 0.556, + "step": 3220 + }, + { + "epoch": 0.69, + "grad_norm": 0.15215899050235748, + "learning_rate": 2.2632609668056906e-06, + "loss": 0.5208, + "step": 3221 + }, + { + "epoch": 0.69, + "grad_norm": 0.15400218963623047, + "learning_rate": 2.2603415807895718e-06, + "loss": 0.5073, + "step": 3222 + }, + { + "epoch": 0.69, + "grad_norm": 0.1677476465702057, + "learning_rate": 2.257423528863562e-06, + "loss": 0.4861, + "step": 3223 + }, + { + "epoch": 0.69, + "grad_norm": 0.1435810774564743, + "learning_rate": 2.254506812448622e-06, + "loss": 0.5095, + "step": 3224 + }, + { + "epoch": 0.69, + "grad_norm": 0.18489789962768555, + "learning_rate": 2.2515914329650636e-06, + "loss": 0.5538, + "step": 3225 + }, + { + "epoch": 0.69, + "grad_norm": 0.17401094734668732, + "learning_rate": 2.2486773918325394e-06, + "loss": 0.503, + "step": 3226 + }, + { + "epoch": 0.7, + "grad_norm": 0.13293685019016266, + "learning_rate": 2.2457646904700632e-06, + "loss": 0.524, + "step": 3227 + }, + { + "epoch": 0.7, + "grad_norm": 0.1453235000371933, + "learning_rate": 2.242853330295984e-06, + "loss": 0.5121, + "step": 3228 + }, + { + "epoch": 0.7, + "grad_norm": 0.1937217265367508, + "learning_rate": 2.239943312728004e-06, + "loss": 0.5165, + "step": 3229 + }, + { + "epoch": 0.7, + "grad_norm": 0.1602119654417038, + "learning_rate": 2.2370346391831737e-06, + "loss": 0.5076, + "step": 3230 + }, + { + "epoch": 0.7, + "grad_norm": 0.12913572788238525, + "learning_rate": 2.2341273110778817e-06, + "loss": 0.5258, + "step": 3231 + }, + { + "epoch": 0.7, + "grad_norm": 0.16789868474006653, + "learning_rate": 2.231221329827867e-06, + "loss": 0.5211, + "step": 3232 + }, + { + "epoch": 0.7, + "grad_norm": 0.2248123735189438, + "learning_rate": 2.228316696848212e-06, + "loss": 0.5348, + "step": 3233 + }, + { + "epoch": 0.7, + "grad_norm": 0.15316419303417206, + "learning_rate": 2.225413413553341e-06, + "loss": 0.5302, + "step": 3234 + }, + { + "epoch": 0.7, + "grad_norm": 0.13006356358528137, + "learning_rate": 2.222511481357026e-06, + "loss": 0.5373, + "step": 3235 + }, + { + "epoch": 0.7, + "grad_norm": 0.14037051796913147, + "learning_rate": 2.219610901672371e-06, + "loss": 0.5641, + "step": 3236 + }, + { + "epoch": 0.7, + "grad_norm": 0.17289666831493378, + "learning_rate": 2.216711675911833e-06, + "loss": 0.5221, + "step": 3237 + }, + { + "epoch": 0.7, + "grad_norm": 0.1685284972190857, + "learning_rate": 2.2138138054871993e-06, + "loss": 0.5341, + "step": 3238 + }, + { + "epoch": 0.7, + "grad_norm": 0.14729037880897522, + "learning_rate": 2.2109172918096034e-06, + "loss": 0.5447, + "step": 3239 + }, + { + "epoch": 0.7, + "grad_norm": 0.17818932235240936, + "learning_rate": 2.208022136289521e-06, + "loss": 0.5437, + "step": 3240 + }, + { + "epoch": 0.7, + "grad_norm": 0.14237408339977264, + "learning_rate": 2.205128340336758e-06, + "loss": 0.5103, + "step": 3241 + }, + { + "epoch": 0.7, + "grad_norm": 0.17500755190849304, + "learning_rate": 2.2022359053604654e-06, + "loss": 0.5444, + "step": 3242 + }, + { + "epoch": 0.7, + "grad_norm": 0.16034772992134094, + "learning_rate": 2.199344832769125e-06, + "loss": 0.4989, + "step": 3243 + }, + { + "epoch": 0.7, + "grad_norm": 0.18941017985343933, + "learning_rate": 2.1964551239705604e-06, + "loss": 0.5812, + "step": 3244 + }, + { + "epoch": 0.7, + "grad_norm": 0.19451190531253815, + "learning_rate": 2.1935667803719307e-06, + "loss": 0.4824, + "step": 3245 + }, + { + "epoch": 0.7, + "grad_norm": 0.15695609152317047, + "learning_rate": 2.1906798033797276e-06, + "loss": 0.6157, + "step": 3246 + }, + { + "epoch": 0.7, + "grad_norm": 0.1591734141111374, + "learning_rate": 2.1877941943997817e-06, + "loss": 0.5586, + "step": 3247 + }, + { + "epoch": 0.7, + "grad_norm": 0.15630222856998444, + "learning_rate": 2.1849099548372492e-06, + "loss": 0.5194, + "step": 3248 + }, + { + "epoch": 0.7, + "grad_norm": 0.16930875182151794, + "learning_rate": 2.18202708609663e-06, + "loss": 0.4618, + "step": 3249 + }, + { + "epoch": 0.7, + "grad_norm": 0.12013290822505951, + "learning_rate": 2.179145589581747e-06, + "loss": 0.5097, + "step": 3250 + }, + { + "epoch": 0.7, + "grad_norm": 0.125696063041687, + "learning_rate": 2.1762654666957606e-06, + "loss": 0.5076, + "step": 3251 + }, + { + "epoch": 0.7, + "grad_norm": 0.1357826292514801, + "learning_rate": 2.1733867188411606e-06, + "loss": 0.5084, + "step": 3252 + }, + { + "epoch": 0.7, + "grad_norm": 0.14690843224525452, + "learning_rate": 2.170509347419768e-06, + "loss": 0.5231, + "step": 3253 + }, + { + "epoch": 0.7, + "grad_norm": 0.14461292326450348, + "learning_rate": 2.167633353832734e-06, + "loss": 0.5063, + "step": 3254 + }, + { + "epoch": 0.7, + "grad_norm": 0.12568648159503937, + "learning_rate": 2.1647587394805353e-06, + "loss": 0.4546, + "step": 3255 + }, + { + "epoch": 0.7, + "grad_norm": 0.15997205674648285, + "learning_rate": 2.1618855057629804e-06, + "loss": 0.5882, + "step": 3256 + }, + { + "epoch": 0.7, + "grad_norm": 0.18054303526878357, + "learning_rate": 2.159013654079205e-06, + "loss": 0.4966, + "step": 3257 + }, + { + "epoch": 0.7, + "grad_norm": 0.15831291675567627, + "learning_rate": 2.156143185827671e-06, + "loss": 0.5021, + "step": 3258 + }, + { + "epoch": 0.7, + "grad_norm": 0.2396220564842224, + "learning_rate": 2.153274102406169e-06, + "loss": 0.5147, + "step": 3259 + }, + { + "epoch": 0.7, + "grad_norm": 0.13623584806919098, + "learning_rate": 2.1504064052118095e-06, + "loss": 0.4903, + "step": 3260 + }, + { + "epoch": 0.7, + "grad_norm": 0.18768242001533508, + "learning_rate": 2.1475400956410337e-06, + "loss": 0.5416, + "step": 3261 + }, + { + "epoch": 0.7, + "grad_norm": 0.15971659123897552, + "learning_rate": 2.144675175089606e-06, + "loss": 0.5072, + "step": 3262 + }, + { + "epoch": 0.7, + "grad_norm": 0.20064754784107208, + "learning_rate": 2.1418116449526117e-06, + "loss": 0.4895, + "step": 3263 + }, + { + "epoch": 0.7, + "grad_norm": 0.1600414365530014, + "learning_rate": 2.1389495066244613e-06, + "loss": 0.4434, + "step": 3264 + }, + { + "epoch": 0.7, + "grad_norm": 0.17641644179821014, + "learning_rate": 2.136088761498888e-06, + "loss": 0.5195, + "step": 3265 + }, + { + "epoch": 0.7, + "grad_norm": 0.16606129705905914, + "learning_rate": 2.1332294109689446e-06, + "loss": 0.4901, + "step": 3266 + }, + { + "epoch": 0.7, + "grad_norm": 0.16686317324638367, + "learning_rate": 2.1303714564270086e-06, + "loss": 0.5201, + "step": 3267 + }, + { + "epoch": 0.7, + "grad_norm": 0.14948517084121704, + "learning_rate": 2.127514899264771e-06, + "loss": 0.5022, + "step": 3268 + }, + { + "epoch": 0.7, + "grad_norm": 0.21814821660518646, + "learning_rate": 2.1246597408732493e-06, + "loss": 0.5412, + "step": 3269 + }, + { + "epoch": 0.7, + "grad_norm": 0.16125313937664032, + "learning_rate": 2.1218059826427727e-06, + "loss": 0.4777, + "step": 3270 + }, + { + "epoch": 0.7, + "grad_norm": 0.15708723664283752, + "learning_rate": 2.118953625962998e-06, + "loss": 0.4976, + "step": 3271 + }, + { + "epoch": 0.7, + "grad_norm": 0.14696088433265686, + "learning_rate": 2.1161026722228932e-06, + "loss": 0.4869, + "step": 3272 + }, + { + "epoch": 0.71, + "grad_norm": 0.173833429813385, + "learning_rate": 2.1132531228107416e-06, + "loss": 0.5291, + "step": 3273 + }, + { + "epoch": 0.71, + "grad_norm": 0.17601168155670166, + "learning_rate": 2.110404979114149e-06, + "loss": 0.53, + "step": 3274 + }, + { + "epoch": 0.71, + "grad_norm": 0.1788867861032486, + "learning_rate": 2.1075582425200286e-06, + "loss": 0.5061, + "step": 3275 + }, + { + "epoch": 0.71, + "grad_norm": 0.15546058118343353, + "learning_rate": 2.104712914414615e-06, + "loss": 0.5317, + "step": 3276 + }, + { + "epoch": 0.71, + "grad_norm": 0.1524634212255478, + "learning_rate": 2.101868996183454e-06, + "loss": 0.4956, + "step": 3277 + }, + { + "epoch": 0.71, + "grad_norm": 0.123813197016716, + "learning_rate": 2.0990264892114067e-06, + "loss": 0.5138, + "step": 3278 + }, + { + "epoch": 0.71, + "grad_norm": 0.16139627993106842, + "learning_rate": 2.0961853948826466e-06, + "loss": 0.5739, + "step": 3279 + }, + { + "epoch": 0.71, + "grad_norm": 0.17389854788780212, + "learning_rate": 2.093345714580656e-06, + "loss": 0.5025, + "step": 3280 + }, + { + "epoch": 0.71, + "grad_norm": 0.12925244867801666, + "learning_rate": 2.0905074496882333e-06, + "loss": 0.495, + "step": 3281 + }, + { + "epoch": 0.71, + "grad_norm": 0.16095152497291565, + "learning_rate": 2.0876706015874816e-06, + "loss": 0.5108, + "step": 3282 + }, + { + "epoch": 0.71, + "grad_norm": 0.1847510039806366, + "learning_rate": 2.0848351716598227e-06, + "loss": 0.5201, + "step": 3283 + }, + { + "epoch": 0.71, + "grad_norm": 0.20535904169082642, + "learning_rate": 2.0820011612859825e-06, + "loss": 0.4862, + "step": 3284 + }, + { + "epoch": 0.71, + "grad_norm": 0.14182107150554657, + "learning_rate": 2.0791685718459936e-06, + "loss": 0.5464, + "step": 3285 + }, + { + "epoch": 0.71, + "grad_norm": 0.12854298949241638, + "learning_rate": 2.076337404719203e-06, + "loss": 0.47, + "step": 3286 + }, + { + "epoch": 0.71, + "grad_norm": 0.1679810881614685, + "learning_rate": 2.073507661284257e-06, + "loss": 0.5748, + "step": 3287 + }, + { + "epoch": 0.71, + "grad_norm": 0.13592375814914703, + "learning_rate": 2.0706793429191156e-06, + "loss": 0.5212, + "step": 3288 + }, + { + "epoch": 0.71, + "grad_norm": 0.18078750371932983, + "learning_rate": 2.0678524510010416e-06, + "loss": 0.5825, + "step": 3289 + }, + { + "epoch": 0.71, + "grad_norm": 0.15121810138225555, + "learning_rate": 2.0650269869066048e-06, + "loss": 0.5514, + "step": 3290 + }, + { + "epoch": 0.71, + "grad_norm": 0.1346854269504547, + "learning_rate": 2.0622029520116798e-06, + "loss": 0.5185, + "step": 3291 + }, + { + "epoch": 0.71, + "grad_norm": 0.17876240611076355, + "learning_rate": 2.0593803476914407e-06, + "loss": 0.5203, + "step": 3292 + }, + { + "epoch": 0.71, + "grad_norm": 0.13062328100204468, + "learning_rate": 2.0565591753203713e-06, + "loss": 0.4912, + "step": 3293 + }, + { + "epoch": 0.71, + "grad_norm": 0.14382170140743256, + "learning_rate": 2.053739436272256e-06, + "loss": 0.5206, + "step": 3294 + }, + { + "epoch": 0.71, + "grad_norm": 0.1761350929737091, + "learning_rate": 2.0509211319201753e-06, + "loss": 0.5529, + "step": 3295 + }, + { + "epoch": 0.71, + "grad_norm": 0.14372633397579193, + "learning_rate": 2.0481042636365243e-06, + "loss": 0.5006, + "step": 3296 + }, + { + "epoch": 0.71, + "grad_norm": 0.17832966148853302, + "learning_rate": 2.045288832792985e-06, + "loss": 0.5626, + "step": 3297 + }, + { + "epoch": 0.71, + "grad_norm": 0.1652042418718338, + "learning_rate": 2.0424748407605468e-06, + "loss": 0.4809, + "step": 3298 + }, + { + "epoch": 0.71, + "grad_norm": 0.16764576733112335, + "learning_rate": 2.0396622889094984e-06, + "loss": 0.5232, + "step": 3299 + }, + { + "epoch": 0.71, + "grad_norm": 0.16923469305038452, + "learning_rate": 2.036851178609423e-06, + "loss": 0.5091, + "step": 3300 + }, + { + "epoch": 0.71, + "grad_norm": 0.18054278194904327, + "learning_rate": 2.0340415112292065e-06, + "loss": 0.5236, + "step": 3301 + }, + { + "epoch": 0.71, + "grad_norm": 0.18616452813148499, + "learning_rate": 2.0312332881370294e-06, + "loss": 0.486, + "step": 3302 + }, + { + "epoch": 0.71, + "grad_norm": 0.13819892704486847, + "learning_rate": 2.0284265107003715e-06, + "loss": 0.5546, + "step": 3303 + }, + { + "epoch": 0.71, + "grad_norm": 0.17926816642284393, + "learning_rate": 2.0256211802860044e-06, + "loss": 0.5188, + "step": 3304 + }, + { + "epoch": 0.71, + "grad_norm": 0.17235776782035828, + "learning_rate": 2.0228172982599974e-06, + "loss": 0.5168, + "step": 3305 + }, + { + "epoch": 0.71, + "grad_norm": 0.1339789479970932, + "learning_rate": 2.0200148659877185e-06, + "loss": 0.5189, + "step": 3306 + }, + { + "epoch": 0.71, + "grad_norm": 0.1831931173801422, + "learning_rate": 2.017213884833821e-06, + "loss": 0.5357, + "step": 3307 + }, + { + "epoch": 0.71, + "grad_norm": 0.1411074548959732, + "learning_rate": 2.014414356162258e-06, + "loss": 0.508, + "step": 3308 + }, + { + "epoch": 0.71, + "grad_norm": 0.17658570408821106, + "learning_rate": 2.0116162813362742e-06, + "loss": 0.4947, + "step": 3309 + }, + { + "epoch": 0.71, + "grad_norm": 0.15399529039859772, + "learning_rate": 2.0088196617184065e-06, + "loss": 0.4912, + "step": 3310 + }, + { + "epoch": 0.71, + "grad_norm": 0.14932404458522797, + "learning_rate": 2.0060244986704834e-06, + "loss": 0.5249, + "step": 3311 + }, + { + "epoch": 0.71, + "grad_norm": 0.1162419468164444, + "learning_rate": 2.00323079355362e-06, + "loss": 0.4978, + "step": 3312 + }, + { + "epoch": 0.71, + "grad_norm": 0.16213734447956085, + "learning_rate": 2.000438547728226e-06, + "loss": 0.5094, + "step": 3313 + }, + { + "epoch": 0.71, + "grad_norm": 0.1504596322774887, + "learning_rate": 1.997647762554e-06, + "loss": 0.4843, + "step": 3314 + }, + { + "epoch": 0.71, + "grad_norm": 0.14448657631874084, + "learning_rate": 1.994858439389929e-06, + "loss": 0.5749, + "step": 3315 + }, + { + "epoch": 0.71, + "grad_norm": 0.16685092449188232, + "learning_rate": 1.992070579594288e-06, + "loss": 0.4815, + "step": 3316 + }, + { + "epoch": 0.71, + "grad_norm": 0.17471322417259216, + "learning_rate": 1.9892841845246357e-06, + "loss": 0.5098, + "step": 3317 + }, + { + "epoch": 0.71, + "grad_norm": 0.16297274827957153, + "learning_rate": 1.9864992555378256e-06, + "loss": 0.5032, + "step": 3318 + }, + { + "epoch": 0.71, + "grad_norm": 0.15113520622253418, + "learning_rate": 1.983715793989987e-06, + "loss": 0.5265, + "step": 3319 + }, + { + "epoch": 0.72, + "grad_norm": 0.12620225548744202, + "learning_rate": 1.9809338012365438e-06, + "loss": 0.4534, + "step": 3320 + }, + { + "epoch": 0.72, + "grad_norm": 0.16422003507614136, + "learning_rate": 1.9781532786322005e-06, + "loss": 0.5404, + "step": 3321 + }, + { + "epoch": 0.72, + "grad_norm": 0.19491100311279297, + "learning_rate": 1.9753742275309456e-06, + "loss": 0.5811, + "step": 3322 + }, + { + "epoch": 0.72, + "grad_norm": 0.21426242589950562, + "learning_rate": 1.9725966492860536e-06, + "loss": 0.5046, + "step": 3323 + }, + { + "epoch": 0.72, + "grad_norm": 0.17637012898921967, + "learning_rate": 1.9698205452500772e-06, + "loss": 0.5513, + "step": 3324 + }, + { + "epoch": 0.72, + "grad_norm": 0.14277128875255585, + "learning_rate": 1.9670459167748552e-06, + "loss": 0.5306, + "step": 3325 + }, + { + "epoch": 0.72, + "grad_norm": 0.1377527266740799, + "learning_rate": 1.9642727652115056e-06, + "loss": 0.4497, + "step": 3326 + }, + { + "epoch": 0.72, + "grad_norm": 0.1276976764202118, + "learning_rate": 1.9615010919104296e-06, + "loss": 0.5035, + "step": 3327 + }, + { + "epoch": 0.72, + "grad_norm": 0.13921838998794556, + "learning_rate": 1.9587308982213077e-06, + "loss": 0.5545, + "step": 3328 + }, + { + "epoch": 0.72, + "grad_norm": 0.16659273207187653, + "learning_rate": 1.9559621854930968e-06, + "loss": 0.541, + "step": 3329 + }, + { + "epoch": 0.72, + "grad_norm": 0.17996352910995483, + "learning_rate": 1.953194955074038e-06, + "loss": 0.5388, + "step": 3330 + }, + { + "epoch": 0.72, + "grad_norm": 0.15658895671367645, + "learning_rate": 1.9504292083116442e-06, + "loss": 0.4891, + "step": 3331 + }, + { + "epoch": 0.72, + "grad_norm": 0.25563114881515503, + "learning_rate": 1.9476649465527116e-06, + "loss": 0.4923, + "step": 3332 + }, + { + "epoch": 0.72, + "grad_norm": 0.12000484019517899, + "learning_rate": 1.94490217114331e-06, + "loss": 0.464, + "step": 3333 + }, + { + "epoch": 0.72, + "grad_norm": 0.1276124268770218, + "learning_rate": 1.942140883428788e-06, + "loss": 0.4568, + "step": 3334 + }, + { + "epoch": 0.72, + "grad_norm": 0.1600504219532013, + "learning_rate": 1.939381084753769e-06, + "loss": 0.5439, + "step": 3335 + }, + { + "epoch": 0.72, + "grad_norm": 0.135470911860466, + "learning_rate": 1.936622776462147e-06, + "loss": 0.518, + "step": 3336 + }, + { + "epoch": 0.72, + "grad_norm": 0.15857979655265808, + "learning_rate": 1.933865959897096e-06, + "loss": 0.5453, + "step": 3337 + }, + { + "epoch": 0.72, + "grad_norm": 0.17830796539783478, + "learning_rate": 1.931110636401062e-06, + "loss": 0.5632, + "step": 3338 + }, + { + "epoch": 0.72, + "grad_norm": 0.1584271937608719, + "learning_rate": 1.9283568073157592e-06, + "loss": 0.5207, + "step": 3339 + }, + { + "epoch": 0.72, + "grad_norm": 0.1628987342119217, + "learning_rate": 1.925604473982185e-06, + "loss": 0.5068, + "step": 3340 + }, + { + "epoch": 0.72, + "grad_norm": 0.18597134947776794, + "learning_rate": 1.922853637740596e-06, + "loss": 0.532, + "step": 3341 + }, + { + "epoch": 0.72, + "grad_norm": 0.18993283808231354, + "learning_rate": 1.9201042999305276e-06, + "loss": 0.5386, + "step": 3342 + }, + { + "epoch": 0.72, + "grad_norm": 0.13225135207176208, + "learning_rate": 1.9173564618907843e-06, + "loss": 0.5157, + "step": 3343 + }, + { + "epoch": 0.72, + "grad_norm": 0.1662367582321167, + "learning_rate": 1.914610124959437e-06, + "loss": 0.5148, + "step": 3344 + }, + { + "epoch": 0.72, + "grad_norm": 0.1778230369091034, + "learning_rate": 1.9118652904738276e-06, + "loss": 0.4604, + "step": 3345 + }, + { + "epoch": 0.72, + "grad_norm": 0.17423133552074432, + "learning_rate": 1.9091219597705694e-06, + "loss": 0.5607, + "step": 3346 + }, + { + "epoch": 0.72, + "grad_norm": 0.15368233621120453, + "learning_rate": 1.9063801341855392e-06, + "loss": 0.511, + "step": 3347 + }, + { + "epoch": 0.72, + "grad_norm": 0.18018236756324768, + "learning_rate": 1.9036398150538842e-06, + "loss": 0.5437, + "step": 3348 + }, + { + "epoch": 0.72, + "grad_norm": 0.15523254871368408, + "learning_rate": 1.9009010037100133e-06, + "loss": 0.5262, + "step": 3349 + }, + { + "epoch": 0.72, + "grad_norm": 0.1918708235025406, + "learning_rate": 1.898163701487607e-06, + "loss": 0.5454, + "step": 3350 + }, + { + "epoch": 0.72, + "grad_norm": 0.16382472217082977, + "learning_rate": 1.8954279097196032e-06, + "loss": 0.5145, + "step": 3351 + }, + { + "epoch": 0.72, + "grad_norm": 0.1837274134159088, + "learning_rate": 1.8926936297382148e-06, + "loss": 0.5496, + "step": 3352 + }, + { + "epoch": 0.72, + "grad_norm": 0.15243403613567352, + "learning_rate": 1.8899608628749116e-06, + "loss": 0.499, + "step": 3353 + }, + { + "epoch": 0.72, + "grad_norm": 0.22877193987369537, + "learning_rate": 1.8872296104604255e-06, + "loss": 0.5414, + "step": 3354 + }, + { + "epoch": 0.72, + "grad_norm": 0.14050383865833282, + "learning_rate": 1.8844998738247562e-06, + "loss": 0.5615, + "step": 3355 + }, + { + "epoch": 0.72, + "grad_norm": 0.18556596338748932, + "learning_rate": 1.8817716542971593e-06, + "loss": 0.5232, + "step": 3356 + }, + { + "epoch": 0.72, + "grad_norm": 0.16367103159427643, + "learning_rate": 1.8790449532061556e-06, + "loss": 0.5193, + "step": 3357 + }, + { + "epoch": 0.72, + "grad_norm": 0.18238645792007446, + "learning_rate": 1.8763197718795262e-06, + "loss": 0.5631, + "step": 3358 + }, + { + "epoch": 0.72, + "grad_norm": 0.16355136036872864, + "learning_rate": 1.8735961116443118e-06, + "loss": 0.5209, + "step": 3359 + }, + { + "epoch": 0.72, + "grad_norm": 0.13203002512454987, + "learning_rate": 1.8708739738268133e-06, + "loss": 0.5164, + "step": 3360 + }, + { + "epoch": 0.72, + "grad_norm": 0.14900876581668854, + "learning_rate": 1.8681533597525859e-06, + "loss": 0.5436, + "step": 3361 + }, + { + "epoch": 0.72, + "grad_norm": 0.16481418907642365, + "learning_rate": 1.865434270746449e-06, + "loss": 0.4923, + "step": 3362 + }, + { + "epoch": 0.72, + "grad_norm": 0.1458396166563034, + "learning_rate": 1.8627167081324732e-06, + "loss": 0.4808, + "step": 3363 + }, + { + "epoch": 0.72, + "grad_norm": 0.15053486824035645, + "learning_rate": 1.8600006732339892e-06, + "loss": 0.4947, + "step": 3364 + }, + { + "epoch": 0.72, + "grad_norm": 0.1663227528333664, + "learning_rate": 1.8572861673735886e-06, + "loss": 0.5284, + "step": 3365 + }, + { + "epoch": 0.73, + "grad_norm": 0.17765146493911743, + "learning_rate": 1.8545731918731074e-06, + "loss": 0.5479, + "step": 3366 + }, + { + "epoch": 0.73, + "grad_norm": 0.17067447304725647, + "learning_rate": 1.8518617480536472e-06, + "loss": 0.5336, + "step": 3367 + }, + { + "epoch": 0.73, + "grad_norm": 0.15725915133953094, + "learning_rate": 1.8491518372355538e-06, + "loss": 0.5474, + "step": 3368 + }, + { + "epoch": 0.73, + "grad_norm": 0.17181500792503357, + "learning_rate": 1.8464434607384345e-06, + "loss": 0.541, + "step": 3369 + }, + { + "epoch": 0.73, + "grad_norm": 0.15348359942436218, + "learning_rate": 1.8437366198811463e-06, + "loss": 0.4621, + "step": 3370 + }, + { + "epoch": 0.73, + "grad_norm": 0.1679931879043579, + "learning_rate": 1.8410313159817982e-06, + "loss": 0.4791, + "step": 3371 + }, + { + "epoch": 0.73, + "grad_norm": 0.14566868543624878, + "learning_rate": 1.838327550357753e-06, + "loss": 0.4671, + "step": 3372 + }, + { + "epoch": 0.73, + "grad_norm": 0.12078259885311127, + "learning_rate": 1.83562532432562e-06, + "loss": 0.5072, + "step": 3373 + }, + { + "epoch": 0.73, + "grad_norm": 0.1410682201385498, + "learning_rate": 1.8329246392012622e-06, + "loss": 0.5099, + "step": 3374 + }, + { + "epoch": 0.73, + "grad_norm": 0.1579367220401764, + "learning_rate": 1.8302254962997934e-06, + "loss": 0.5076, + "step": 3375 + }, + { + "epoch": 0.73, + "grad_norm": 0.13609954714775085, + "learning_rate": 1.8275278969355714e-06, + "loss": 0.4894, + "step": 3376 + }, + { + "epoch": 0.73, + "grad_norm": 0.14277243614196777, + "learning_rate": 1.8248318424222071e-06, + "loss": 0.5272, + "step": 3377 + }, + { + "epoch": 0.73, + "grad_norm": 0.20254714787006378, + "learning_rate": 1.8221373340725568e-06, + "loss": 0.5519, + "step": 3378 + }, + { + "epoch": 0.73, + "grad_norm": 0.14397798478603363, + "learning_rate": 1.8194443731987254e-06, + "loss": 0.56, + "step": 3379 + }, + { + "epoch": 0.73, + "grad_norm": 0.1646426022052765, + "learning_rate": 1.8167529611120648e-06, + "loss": 0.5418, + "step": 3380 + }, + { + "epoch": 0.73, + "grad_norm": 0.15328004956245422, + "learning_rate": 1.8140630991231683e-06, + "loss": 0.515, + "step": 3381 + }, + { + "epoch": 0.73, + "grad_norm": 0.14614619314670563, + "learning_rate": 1.811374788541878e-06, + "loss": 0.461, + "step": 3382 + }, + { + "epoch": 0.73, + "grad_norm": 0.14326471090316772, + "learning_rate": 1.808688030677281e-06, + "loss": 0.5391, + "step": 3383 + }, + { + "epoch": 0.73, + "grad_norm": 0.16172616183757782, + "learning_rate": 1.8060028268377088e-06, + "loss": 0.4593, + "step": 3384 + }, + { + "epoch": 0.73, + "grad_norm": 0.16830769181251526, + "learning_rate": 1.8033191783307309e-06, + "loss": 0.5275, + "step": 3385 + }, + { + "epoch": 0.73, + "grad_norm": 0.16044098138809204, + "learning_rate": 1.8006370864631644e-06, + "loss": 0.4947, + "step": 3386 + }, + { + "epoch": 0.73, + "grad_norm": 0.16456788778305054, + "learning_rate": 1.7979565525410691e-06, + "loss": 0.5459, + "step": 3387 + }, + { + "epoch": 0.73, + "grad_norm": 0.18591398000717163, + "learning_rate": 1.7952775778697418e-06, + "loss": 0.5412, + "step": 3388 + }, + { + "epoch": 0.73, + "grad_norm": 0.16051456332206726, + "learning_rate": 1.7926001637537222e-06, + "loss": 0.5081, + "step": 3389 + }, + { + "epoch": 0.73, + "grad_norm": 0.1629684865474701, + "learning_rate": 1.7899243114967918e-06, + "loss": 0.5292, + "step": 3390 + }, + { + "epoch": 0.73, + "grad_norm": 0.171781525015831, + "learning_rate": 1.7872500224019696e-06, + "loss": 0.5031, + "step": 3391 + }, + { + "epoch": 0.73, + "grad_norm": 0.14067816734313965, + "learning_rate": 1.7845772977715148e-06, + "loss": 0.5218, + "step": 3392 + }, + { + "epoch": 0.73, + "grad_norm": 0.1485803723335266, + "learning_rate": 1.7819061389069208e-06, + "loss": 0.5542, + "step": 3393 + }, + { + "epoch": 0.73, + "grad_norm": 0.13566631078720093, + "learning_rate": 1.7792365471089252e-06, + "loss": 0.4824, + "step": 3394 + }, + { + "epoch": 0.73, + "grad_norm": 0.17010356485843658, + "learning_rate": 1.7765685236774937e-06, + "loss": 0.4645, + "step": 3395 + }, + { + "epoch": 0.73, + "grad_norm": 0.16269443929195404, + "learning_rate": 1.773902069911838e-06, + "loss": 0.4747, + "step": 3396 + }, + { + "epoch": 0.73, + "grad_norm": 0.20955312252044678, + "learning_rate": 1.7712371871104012e-06, + "loss": 0.4571, + "step": 3397 + }, + { + "epoch": 0.73, + "grad_norm": 0.13663814961910248, + "learning_rate": 1.7685738765708576e-06, + "loss": 0.4574, + "step": 3398 + }, + { + "epoch": 0.73, + "grad_norm": 0.15181677043437958, + "learning_rate": 1.765912139590123e-06, + "loss": 0.5594, + "step": 3399 + }, + { + "epoch": 0.73, + "grad_norm": 0.15845806896686554, + "learning_rate": 1.7632519774643391e-06, + "loss": 0.4867, + "step": 3400 + }, + { + "epoch": 0.73, + "grad_norm": 0.18000967800617218, + "learning_rate": 1.760593391488888e-06, + "loss": 0.486, + "step": 3401 + }, + { + "epoch": 0.73, + "grad_norm": 0.2162676602602005, + "learning_rate": 1.7579363829583794e-06, + "loss": 0.49, + "step": 3402 + }, + { + "epoch": 0.73, + "grad_norm": 0.1404118835926056, + "learning_rate": 1.7552809531666582e-06, + "loss": 0.4929, + "step": 3403 + }, + { + "epoch": 0.73, + "grad_norm": 0.1907334327697754, + "learning_rate": 1.7526271034067993e-06, + "loss": 0.5793, + "step": 3404 + }, + { + "epoch": 0.73, + "grad_norm": 0.16548538208007812, + "learning_rate": 1.749974834971106e-06, + "loss": 0.5448, + "step": 3405 + }, + { + "epoch": 0.73, + "grad_norm": 0.12368584424257278, + "learning_rate": 1.7473241491511139e-06, + "loss": 0.4833, + "step": 3406 + }, + { + "epoch": 0.73, + "grad_norm": 0.1966637670993805, + "learning_rate": 1.7446750472375879e-06, + "loss": 0.5532, + "step": 3407 + }, + { + "epoch": 0.73, + "grad_norm": 0.13835598528385162, + "learning_rate": 1.7420275305205214e-06, + "loss": 0.5279, + "step": 3408 + }, + { + "epoch": 0.73, + "grad_norm": 0.15932750701904297, + "learning_rate": 1.7393816002891368e-06, + "loss": 0.5535, + "step": 3409 + }, + { + "epoch": 0.73, + "grad_norm": 0.18594208359718323, + "learning_rate": 1.7367372578318797e-06, + "loss": 0.4495, + "step": 3410 + }, + { + "epoch": 0.73, + "grad_norm": 0.17896361649036407, + "learning_rate": 1.7340945044364293e-06, + "loss": 0.5242, + "step": 3411 + }, + { + "epoch": 0.73, + "grad_norm": 0.13597969710826874, + "learning_rate": 1.7314533413896833e-06, + "loss": 0.443, + "step": 3412 + }, + { + "epoch": 0.74, + "grad_norm": 0.15737301111221313, + "learning_rate": 1.7288137699777714e-06, + "loss": 0.5596, + "step": 3413 + }, + { + "epoch": 0.74, + "grad_norm": 0.14145611226558685, + "learning_rate": 1.7261757914860456e-06, + "loss": 0.5109, + "step": 3414 + }, + { + "epoch": 0.74, + "grad_norm": 0.17835848033428192, + "learning_rate": 1.7235394071990824e-06, + "loss": 0.5036, + "step": 3415 + }, + { + "epoch": 0.74, + "grad_norm": 0.21317198872566223, + "learning_rate": 1.720904618400684e-06, + "loss": 0.473, + "step": 3416 + }, + { + "epoch": 0.74, + "grad_norm": 0.13540047407150269, + "learning_rate": 1.7182714263738692e-06, + "loss": 0.538, + "step": 3417 + }, + { + "epoch": 0.74, + "grad_norm": 0.21162466704845428, + "learning_rate": 1.7156398324008871e-06, + "loss": 0.5771, + "step": 3418 + }, + { + "epoch": 0.74, + "grad_norm": 0.16375568509101868, + "learning_rate": 1.7130098377632065e-06, + "loss": 0.5353, + "step": 3419 + }, + { + "epoch": 0.74, + "grad_norm": 0.15535147488117218, + "learning_rate": 1.7103814437415105e-06, + "loss": 0.4993, + "step": 3420 + }, + { + "epoch": 0.74, + "grad_norm": 0.17422151565551758, + "learning_rate": 1.7077546516157156e-06, + "loss": 0.5527, + "step": 3421 + }, + { + "epoch": 0.74, + "grad_norm": 0.15542204678058624, + "learning_rate": 1.7051294626649462e-06, + "loss": 0.5521, + "step": 3422 + }, + { + "epoch": 0.74, + "grad_norm": 0.16863486170768738, + "learning_rate": 1.702505878167553e-06, + "loss": 0.5245, + "step": 3423 + }, + { + "epoch": 0.74, + "grad_norm": 0.17205984890460968, + "learning_rate": 1.6998838994011041e-06, + "loss": 0.5189, + "step": 3424 + }, + { + "epoch": 0.74, + "grad_norm": 0.15427836775779724, + "learning_rate": 1.6972635276423815e-06, + "loss": 0.5309, + "step": 3425 + }, + { + "epoch": 0.74, + "grad_norm": 0.12143069505691528, + "learning_rate": 1.6946447641673907e-06, + "loss": 0.4993, + "step": 3426 + }, + { + "epoch": 0.74, + "grad_norm": 0.15743811428546906, + "learning_rate": 1.6920276102513512e-06, + "loss": 0.4693, + "step": 3427 + }, + { + "epoch": 0.74, + "grad_norm": 0.15306471288204193, + "learning_rate": 1.6894120671686986e-06, + "loss": 0.5164, + "step": 3428 + }, + { + "epoch": 0.74, + "grad_norm": 0.21616849303245544, + "learning_rate": 1.6867981361930864e-06, + "loss": 0.5525, + "step": 3429 + }, + { + "epoch": 0.74, + "grad_norm": 0.1986081451177597, + "learning_rate": 1.6841858185973775e-06, + "loss": 0.5335, + "step": 3430 + }, + { + "epoch": 0.74, + "grad_norm": 0.1559258997440338, + "learning_rate": 1.681575115653656e-06, + "loss": 0.4944, + "step": 3431 + }, + { + "epoch": 0.74, + "grad_norm": 0.12250448018312454, + "learning_rate": 1.6789660286332132e-06, + "loss": 0.5096, + "step": 3432 + }, + { + "epoch": 0.74, + "grad_norm": 0.3296593129634857, + "learning_rate": 1.6763585588065579e-06, + "loss": 0.5291, + "step": 3433 + }, + { + "epoch": 0.74, + "grad_norm": 0.15359769761562347, + "learning_rate": 1.6737527074434135e-06, + "loss": 0.4591, + "step": 3434 + }, + { + "epoch": 0.74, + "grad_norm": 0.15591566264629364, + "learning_rate": 1.6711484758127088e-06, + "loss": 0.524, + "step": 3435 + }, + { + "epoch": 0.74, + "grad_norm": 0.16285593807697296, + "learning_rate": 1.6685458651825892e-06, + "loss": 0.4952, + "step": 3436 + }, + { + "epoch": 0.74, + "grad_norm": 0.1407860666513443, + "learning_rate": 1.6659448768204062e-06, + "loss": 0.436, + "step": 3437 + }, + { + "epoch": 0.74, + "grad_norm": 0.15365861356258392, + "learning_rate": 1.6633455119927256e-06, + "loss": 0.5039, + "step": 3438 + }, + { + "epoch": 0.74, + "grad_norm": 0.18231172859668732, + "learning_rate": 1.6607477719653198e-06, + "loss": 0.5312, + "step": 3439 + }, + { + "epoch": 0.74, + "grad_norm": 0.1712518185377121, + "learning_rate": 1.658151658003172e-06, + "loss": 0.5649, + "step": 3440 + }, + { + "epoch": 0.74, + "grad_norm": 0.19198983907699585, + "learning_rate": 1.6555571713704743e-06, + "loss": 0.5381, + "step": 3441 + }, + { + "epoch": 0.74, + "grad_norm": 0.14253763854503632, + "learning_rate": 1.6529643133306212e-06, + "loss": 0.545, + "step": 3442 + }, + { + "epoch": 0.74, + "grad_norm": 0.16237348318099976, + "learning_rate": 1.6503730851462208e-06, + "loss": 0.5184, + "step": 3443 + }, + { + "epoch": 0.74, + "grad_norm": 0.1761130690574646, + "learning_rate": 1.647783488079081e-06, + "loss": 0.547, + "step": 3444 + }, + { + "epoch": 0.74, + "grad_norm": 0.17866788804531097, + "learning_rate": 1.6451955233902206e-06, + "loss": 0.5351, + "step": 3445 + }, + { + "epoch": 0.74, + "grad_norm": 0.176165372133255, + "learning_rate": 1.6426091923398619e-06, + "loss": 0.4789, + "step": 3446 + }, + { + "epoch": 0.74, + "grad_norm": 0.21256986260414124, + "learning_rate": 1.6400244961874311e-06, + "loss": 0.5431, + "step": 3447 + }, + { + "epoch": 0.74, + "grad_norm": 0.17698679864406586, + "learning_rate": 1.6374414361915613e-06, + "loss": 0.515, + "step": 3448 + }, + { + "epoch": 0.74, + "grad_norm": 0.13463236391544342, + "learning_rate": 1.6348600136100817e-06, + "loss": 0.5694, + "step": 3449 + }, + { + "epoch": 0.74, + "grad_norm": 0.15754647552967072, + "learning_rate": 1.6322802297000306e-06, + "loss": 0.5126, + "step": 3450 + }, + { + "epoch": 0.74, + "grad_norm": 0.19439859688282013, + "learning_rate": 1.6297020857176466e-06, + "loss": 0.5368, + "step": 3451 + }, + { + "epoch": 0.74, + "grad_norm": 0.1266242265701294, + "learning_rate": 1.6271255829183702e-06, + "loss": 0.502, + "step": 3452 + }, + { + "epoch": 0.74, + "grad_norm": 0.18297268450260162, + "learning_rate": 1.6245507225568425e-06, + "loss": 0.4904, + "step": 3453 + }, + { + "epoch": 0.74, + "grad_norm": 0.1342281848192215, + "learning_rate": 1.6219775058869019e-06, + "loss": 0.4823, + "step": 3454 + }, + { + "epoch": 0.74, + "grad_norm": 0.17030228674411774, + "learning_rate": 1.6194059341615908e-06, + "loss": 0.5196, + "step": 3455 + }, + { + "epoch": 0.74, + "grad_norm": 0.1725773811340332, + "learning_rate": 1.6168360086331498e-06, + "loss": 0.4785, + "step": 3456 + }, + { + "epoch": 0.74, + "grad_norm": 0.16261161863803864, + "learning_rate": 1.614267730553013e-06, + "loss": 0.481, + "step": 3457 + }, + { + "epoch": 0.74, + "grad_norm": 0.16986827552318573, + "learning_rate": 1.6117011011718188e-06, + "loss": 0.4874, + "step": 3458 + }, + { + "epoch": 0.75, + "grad_norm": 0.148764505982399, + "learning_rate": 1.6091361217393992e-06, + "loss": 0.5044, + "step": 3459 + }, + { + "epoch": 0.75, + "grad_norm": 0.1262829303741455, + "learning_rate": 1.6065727935047837e-06, + "loss": 0.5185, + "step": 3460 + }, + { + "epoch": 0.75, + "grad_norm": 0.1936028003692627, + "learning_rate": 1.6040111177161994e-06, + "loss": 0.5645, + "step": 3461 + }, + { + "epoch": 0.75, + "grad_norm": 0.1676550656557083, + "learning_rate": 1.6014510956210632e-06, + "loss": 0.5394, + "step": 3462 + }, + { + "epoch": 0.75, + "grad_norm": 0.19447743892669678, + "learning_rate": 1.5988927284659921e-06, + "loss": 0.5471, + "step": 3463 + }, + { + "epoch": 0.75, + "grad_norm": 0.14620442688465118, + "learning_rate": 1.5963360174967956e-06, + "loss": 0.493, + "step": 3464 + }, + { + "epoch": 0.75, + "grad_norm": 0.1667238175868988, + "learning_rate": 1.593780963958479e-06, + "loss": 0.5172, + "step": 3465 + }, + { + "epoch": 0.75, + "grad_norm": 0.14564719796180725, + "learning_rate": 1.5912275690952339e-06, + "loss": 0.5031, + "step": 3466 + }, + { + "epoch": 0.75, + "grad_norm": 0.17484711110591888, + "learning_rate": 1.5886758341504506e-06, + "loss": 0.4841, + "step": 3467 + }, + { + "epoch": 0.75, + "grad_norm": 0.17965911328792572, + "learning_rate": 1.5861257603667106e-06, + "loss": 0.5354, + "step": 3468 + }, + { + "epoch": 0.75, + "grad_norm": 0.16746492683887482, + "learning_rate": 1.5835773489857813e-06, + "loss": 0.5087, + "step": 3469 + }, + { + "epoch": 0.75, + "grad_norm": 0.15599916875362396, + "learning_rate": 1.581030601248626e-06, + "loss": 0.5562, + "step": 3470 + }, + { + "epoch": 0.75, + "grad_norm": 0.13977332413196564, + "learning_rate": 1.5784855183953956e-06, + "loss": 0.5003, + "step": 3471 + }, + { + "epoch": 0.75, + "grad_norm": 0.19692641496658325, + "learning_rate": 1.5759421016654314e-06, + "loss": 0.4618, + "step": 3472 + }, + { + "epoch": 0.75, + "grad_norm": 0.19811460375785828, + "learning_rate": 1.5734003522972635e-06, + "loss": 0.4771, + "step": 3473 + }, + { + "epoch": 0.75, + "grad_norm": 0.15085352957248688, + "learning_rate": 1.570860271528607e-06, + "loss": 0.5023, + "step": 3474 + }, + { + "epoch": 0.75, + "grad_norm": 0.16862472891807556, + "learning_rate": 1.5683218605963686e-06, + "loss": 0.5323, + "step": 3475 + }, + { + "epoch": 0.75, + "grad_norm": 0.14913085103034973, + "learning_rate": 1.5657851207366359e-06, + "loss": 0.5062, + "step": 3476 + }, + { + "epoch": 0.75, + "grad_norm": 0.17679521441459656, + "learning_rate": 1.5632500531846916e-06, + "loss": 0.4542, + "step": 3477 + }, + { + "epoch": 0.75, + "grad_norm": 0.14363497495651245, + "learning_rate": 1.5607166591749995e-06, + "loss": 0.5322, + "step": 3478 + }, + { + "epoch": 0.75, + "grad_norm": 0.1939202845096588, + "learning_rate": 1.5581849399412047e-06, + "loss": 0.5045, + "step": 3479 + }, + { + "epoch": 0.75, + "grad_norm": 0.15432246029376984, + "learning_rate": 1.555654896716144e-06, + "loss": 0.5493, + "step": 3480 + }, + { + "epoch": 0.75, + "grad_norm": 0.14659973978996277, + "learning_rate": 1.55312653073183e-06, + "loss": 0.5053, + "step": 3481 + }, + { + "epoch": 0.75, + "grad_norm": 0.14200441539287567, + "learning_rate": 1.5505998432194658e-06, + "loss": 0.4921, + "step": 3482 + }, + { + "epoch": 0.75, + "grad_norm": 0.16654643416404724, + "learning_rate": 1.5480748354094332e-06, + "loss": 0.4844, + "step": 3483 + }, + { + "epoch": 0.75, + "grad_norm": 0.21012644469738007, + "learning_rate": 1.5455515085312984e-06, + "loss": 0.5075, + "step": 3484 + }, + { + "epoch": 0.75, + "grad_norm": 0.20673668384552002, + "learning_rate": 1.543029863813808e-06, + "loss": 0.5155, + "step": 3485 + }, + { + "epoch": 0.75, + "grad_norm": 0.16482393443584442, + "learning_rate": 1.5405099024848874e-06, + "loss": 0.4767, + "step": 3486 + }, + { + "epoch": 0.75, + "grad_norm": 0.14894434809684753, + "learning_rate": 1.5379916257716448e-06, + "loss": 0.5139, + "step": 3487 + }, + { + "epoch": 0.75, + "grad_norm": 0.14717522263526917, + "learning_rate": 1.5354750349003694e-06, + "loss": 0.5422, + "step": 3488 + }, + { + "epoch": 0.75, + "grad_norm": 0.1304522007703781, + "learning_rate": 1.5329601310965225e-06, + "loss": 0.5312, + "step": 3489 + }, + { + "epoch": 0.75, + "grad_norm": 0.20385202765464783, + "learning_rate": 1.5304469155847556e-06, + "loss": 0.5567, + "step": 3490 + }, + { + "epoch": 0.75, + "grad_norm": 0.13958947360515594, + "learning_rate": 1.527935389588886e-06, + "loss": 0.5514, + "step": 3491 + }, + { + "epoch": 0.75, + "grad_norm": 0.17612220346927643, + "learning_rate": 1.5254255543319168e-06, + "loss": 0.4965, + "step": 3492 + }, + { + "epoch": 0.75, + "grad_norm": 0.1380666047334671, + "learning_rate": 1.5229174110360222e-06, + "loss": 0.5664, + "step": 3493 + }, + { + "epoch": 0.75, + "grad_norm": 0.14796659350395203, + "learning_rate": 1.5204109609225553e-06, + "loss": 0.4855, + "step": 3494 + }, + { + "epoch": 0.75, + "grad_norm": 0.17148888111114502, + "learning_rate": 1.5179062052120459e-06, + "loss": 0.4734, + "step": 3495 + }, + { + "epoch": 0.75, + "grad_norm": 0.15438951551914215, + "learning_rate": 1.5154031451241952e-06, + "loss": 0.5619, + "step": 3496 + }, + { + "epoch": 0.75, + "grad_norm": 0.18706923723220825, + "learning_rate": 1.5129017818778835e-06, + "loss": 0.5614, + "step": 3497 + }, + { + "epoch": 0.75, + "grad_norm": 0.15529923141002655, + "learning_rate": 1.5104021166911582e-06, + "loss": 0.5682, + "step": 3498 + }, + { + "epoch": 0.75, + "grad_norm": 0.21242637932300568, + "learning_rate": 1.5079041507812454e-06, + "loss": 0.5401, + "step": 3499 + }, + { + "epoch": 0.75, + "grad_norm": 0.18680711090564728, + "learning_rate": 1.5054078853645432e-06, + "loss": 0.5004, + "step": 3500 + }, + { + "epoch": 0.75, + "grad_norm": 0.1526576429605484, + "learning_rate": 1.5029133216566172e-06, + "loss": 0.4787, + "step": 3501 + }, + { + "epoch": 0.75, + "grad_norm": 0.16828754544258118, + "learning_rate": 1.5004204608722088e-06, + "loss": 0.5431, + "step": 3502 + }, + { + "epoch": 0.75, + "grad_norm": 0.12067432701587677, + "learning_rate": 1.4979293042252291e-06, + "loss": 0.475, + "step": 3503 + }, + { + "epoch": 0.75, + "grad_norm": 0.2365456521511078, + "learning_rate": 1.495439852928759e-06, + "loss": 0.5043, + "step": 3504 + }, + { + "epoch": 0.76, + "grad_norm": 0.15215608477592468, + "learning_rate": 1.492952108195051e-06, + "loss": 0.5572, + "step": 3505 + }, + { + "epoch": 0.76, + "grad_norm": 0.12552374601364136, + "learning_rate": 1.4904660712355207e-06, + "loss": 0.4765, + "step": 3506 + }, + { + "epoch": 0.76, + "grad_norm": 0.14761190116405487, + "learning_rate": 1.4879817432607573e-06, + "loss": 0.5246, + "step": 3507 + }, + { + "epoch": 0.76, + "grad_norm": 0.18243740499019623, + "learning_rate": 1.4854991254805179e-06, + "loss": 0.5586, + "step": 3508 + }, + { + "epoch": 0.76, + "grad_norm": 0.14932124316692352, + "learning_rate": 1.4830182191037246e-06, + "loss": 0.5113, + "step": 3509 + }, + { + "epoch": 0.76, + "grad_norm": 0.14290283620357513, + "learning_rate": 1.4805390253384683e-06, + "loss": 0.5141, + "step": 3510 + }, + { + "epoch": 0.76, + "grad_norm": 0.1591915637254715, + "learning_rate": 1.4780615453920016e-06, + "loss": 0.5043, + "step": 3511 + }, + { + "epoch": 0.76, + "grad_norm": 0.15741455554962158, + "learning_rate": 1.4755857804707485e-06, + "loss": 0.5195, + "step": 3512 + }, + { + "epoch": 0.76, + "grad_norm": 0.16539999842643738, + "learning_rate": 1.4731117317802923e-06, + "loss": 0.5353, + "step": 3513 + }, + { + "epoch": 0.76, + "grad_norm": 0.13506761193275452, + "learning_rate": 1.4706394005253838e-06, + "loss": 0.5446, + "step": 3514 + }, + { + "epoch": 0.76, + "grad_norm": 0.1319400519132614, + "learning_rate": 1.4681687879099376e-06, + "loss": 0.5075, + "step": 3515 + }, + { + "epoch": 0.76, + "grad_norm": 0.1385585367679596, + "learning_rate": 1.465699895137031e-06, + "loss": 0.5429, + "step": 3516 + }, + { + "epoch": 0.76, + "grad_norm": 0.12562334537506104, + "learning_rate": 1.463232723408904e-06, + "loss": 0.5114, + "step": 3517 + }, + { + "epoch": 0.76, + "grad_norm": 0.1627231389284134, + "learning_rate": 1.4607672739269552e-06, + "loss": 0.4937, + "step": 3518 + }, + { + "epoch": 0.76, + "grad_norm": 0.18348899483680725, + "learning_rate": 1.458303547891749e-06, + "loss": 0.5292, + "step": 3519 + }, + { + "epoch": 0.76, + "grad_norm": 0.17182128131389618, + "learning_rate": 1.455841546503009e-06, + "loss": 0.5041, + "step": 3520 + }, + { + "epoch": 0.76, + "grad_norm": 0.1899326741695404, + "learning_rate": 1.4533812709596184e-06, + "loss": 0.5299, + "step": 3521 + }, + { + "epoch": 0.76, + "grad_norm": 0.2223159819841385, + "learning_rate": 1.450922722459623e-06, + "loss": 0.535, + "step": 3522 + }, + { + "epoch": 0.76, + "grad_norm": 0.16189001500606537, + "learning_rate": 1.4484659022002208e-06, + "loss": 0.5021, + "step": 3523 + }, + { + "epoch": 0.76, + "grad_norm": 0.19098880887031555, + "learning_rate": 1.446010811377776e-06, + "loss": 0.5376, + "step": 3524 + }, + { + "epoch": 0.76, + "grad_norm": 0.19344013929367065, + "learning_rate": 1.4435574511878037e-06, + "loss": 0.5651, + "step": 3525 + }, + { + "epoch": 0.76, + "grad_norm": 0.13735520839691162, + "learning_rate": 1.4411058228249824e-06, + "loss": 0.5185, + "step": 3526 + }, + { + "epoch": 0.76, + "grad_norm": 0.15287934243679047, + "learning_rate": 1.438655927483143e-06, + "loss": 0.538, + "step": 3527 + }, + { + "epoch": 0.76, + "grad_norm": 0.14649856090545654, + "learning_rate": 1.4362077663552754e-06, + "loss": 0.4895, + "step": 3528 + }, + { + "epoch": 0.76, + "grad_norm": 0.171720489859581, + "learning_rate": 1.4337613406335244e-06, + "loss": 0.5108, + "step": 3529 + }, + { + "epoch": 0.76, + "grad_norm": 0.15773873031139374, + "learning_rate": 1.4313166515091863e-06, + "loss": 0.5321, + "step": 3530 + }, + { + "epoch": 0.76, + "grad_norm": 0.1555633246898651, + "learning_rate": 1.428873700172716e-06, + "loss": 0.533, + "step": 3531 + }, + { + "epoch": 0.76, + "grad_norm": 0.15259774029254913, + "learning_rate": 1.4264324878137204e-06, + "loss": 0.5034, + "step": 3532 + }, + { + "epoch": 0.76, + "grad_norm": 0.13174332678318024, + "learning_rate": 1.4239930156209597e-06, + "loss": 0.5052, + "step": 3533 + }, + { + "epoch": 0.76, + "grad_norm": 0.15307480096817017, + "learning_rate": 1.421555284782349e-06, + "loss": 0.5494, + "step": 3534 + }, + { + "epoch": 0.76, + "grad_norm": 0.14553479850292206, + "learning_rate": 1.4191192964849492e-06, + "loss": 0.5103, + "step": 3535 + }, + { + "epoch": 0.76, + "grad_norm": 0.1426243633031845, + "learning_rate": 1.4166850519149794e-06, + "loss": 0.4749, + "step": 3536 + }, + { + "epoch": 0.76, + "grad_norm": 0.1951218992471695, + "learning_rate": 1.4142525522578082e-06, + "loss": 0.4723, + "step": 3537 + }, + { + "epoch": 0.76, + "grad_norm": 0.15164697170257568, + "learning_rate": 1.41182179869795e-06, + "loss": 0.5262, + "step": 3538 + }, + { + "epoch": 0.76, + "grad_norm": 0.15860851109027863, + "learning_rate": 1.409392792419073e-06, + "loss": 0.5282, + "step": 3539 + }, + { + "epoch": 0.76, + "grad_norm": 0.1473582684993744, + "learning_rate": 1.406965534603995e-06, + "loss": 0.5385, + "step": 3540 + }, + { + "epoch": 0.76, + "grad_norm": 0.15201956033706665, + "learning_rate": 1.404540026434681e-06, + "loss": 0.5183, + "step": 3541 + }, + { + "epoch": 0.76, + "grad_norm": 0.18102842569351196, + "learning_rate": 1.4021162690922441e-06, + "loss": 0.5474, + "step": 3542 + }, + { + "epoch": 0.76, + "grad_norm": 0.13703973591327667, + "learning_rate": 1.3996942637569438e-06, + "loss": 0.5333, + "step": 3543 + }, + { + "epoch": 0.76, + "grad_norm": 0.14135409891605377, + "learning_rate": 1.397274011608189e-06, + "loss": 0.5164, + "step": 3544 + }, + { + "epoch": 0.76, + "grad_norm": 0.14163129031658173, + "learning_rate": 1.3948555138245295e-06, + "loss": 0.5044, + "step": 3545 + }, + { + "epoch": 0.76, + "grad_norm": 0.17711517214775085, + "learning_rate": 1.3924387715836706e-06, + "loss": 0.5235, + "step": 3546 + }, + { + "epoch": 0.76, + "grad_norm": 0.14216595888137817, + "learning_rate": 1.390023786062452e-06, + "loss": 0.4795, + "step": 3547 + }, + { + "epoch": 0.76, + "grad_norm": 0.16309070587158203, + "learning_rate": 1.3876105584368653e-06, + "loss": 0.5377, + "step": 3548 + }, + { + "epoch": 0.76, + "grad_norm": 0.1565747708082199, + "learning_rate": 1.3851990898820439e-06, + "loss": 0.5447, + "step": 3549 + }, + { + "epoch": 0.76, + "grad_norm": 0.16009236872196198, + "learning_rate": 1.3827893815722614e-06, + "loss": 0.5253, + "step": 3550 + }, + { + "epoch": 0.76, + "grad_norm": 0.14546410739421844, + "learning_rate": 1.3803814346809386e-06, + "loss": 0.5234, + "step": 3551 + }, + { + "epoch": 0.77, + "grad_norm": 0.16906693577766418, + "learning_rate": 1.3779752503806375e-06, + "loss": 0.5151, + "step": 3552 + }, + { + "epoch": 0.77, + "grad_norm": 0.16307243704795837, + "learning_rate": 1.3755708298430614e-06, + "loss": 0.4965, + "step": 3553 + }, + { + "epoch": 0.77, + "grad_norm": 0.17134273052215576, + "learning_rate": 1.3731681742390558e-06, + "loss": 0.4913, + "step": 3554 + }, + { + "epoch": 0.77, + "grad_norm": 0.1275198608636856, + "learning_rate": 1.3707672847386021e-06, + "loss": 0.4962, + "step": 3555 + }, + { + "epoch": 0.77, + "grad_norm": 0.14432884752750397, + "learning_rate": 1.368368162510829e-06, + "loss": 0.4881, + "step": 3556 + }, + { + "epoch": 0.77, + "grad_norm": 0.17812266945838928, + "learning_rate": 1.3659708087239981e-06, + "loss": 0.5165, + "step": 3557 + }, + { + "epoch": 0.77, + "grad_norm": 0.13962845504283905, + "learning_rate": 1.363575224545512e-06, + "loss": 0.5171, + "step": 3558 + }, + { + "epoch": 0.77, + "grad_norm": 0.15105730295181274, + "learning_rate": 1.3611814111419163e-06, + "loss": 0.5498, + "step": 3559 + }, + { + "epoch": 0.77, + "grad_norm": 0.12979727983474731, + "learning_rate": 1.3587893696788868e-06, + "loss": 0.4992, + "step": 3560 + }, + { + "epoch": 0.77, + "grad_norm": 0.13221748173236847, + "learning_rate": 1.3563991013212424e-06, + "loss": 0.5035, + "step": 3561 + }, + { + "epoch": 0.77, + "grad_norm": 0.19586306810379028, + "learning_rate": 1.3540106072329323e-06, + "loss": 0.5264, + "step": 3562 + }, + { + "epoch": 0.77, + "grad_norm": 0.17884129285812378, + "learning_rate": 1.3516238885770477e-06, + "loss": 0.5184, + "step": 3563 + }, + { + "epoch": 0.77, + "grad_norm": 0.16461104154586792, + "learning_rate": 1.349238946515813e-06, + "loss": 0.5141, + "step": 3564 + }, + { + "epoch": 0.77, + "grad_norm": 0.14698609709739685, + "learning_rate": 1.3468557822105864e-06, + "loss": 0.5084, + "step": 3565 + }, + { + "epoch": 0.77, + "grad_norm": 0.1535801738500595, + "learning_rate": 1.344474396821865e-06, + "loss": 0.5352, + "step": 3566 + }, + { + "epoch": 0.77, + "grad_norm": 0.20536081492900848, + "learning_rate": 1.3420947915092708e-06, + "loss": 0.5344, + "step": 3567 + }, + { + "epoch": 0.77, + "grad_norm": 0.1282234936952591, + "learning_rate": 1.3397169674315668e-06, + "loss": 0.5156, + "step": 3568 + }, + { + "epoch": 0.77, + "grad_norm": 0.16148659586906433, + "learning_rate": 1.337340925746648e-06, + "loss": 0.5496, + "step": 3569 + }, + { + "epoch": 0.77, + "grad_norm": 0.13853482902050018, + "learning_rate": 1.3349666676115358e-06, + "loss": 0.5359, + "step": 3570 + }, + { + "epoch": 0.77, + "grad_norm": 0.1656581163406372, + "learning_rate": 1.332594194182389e-06, + "loss": 0.4921, + "step": 3571 + }, + { + "epoch": 0.77, + "grad_norm": 0.20992939174175262, + "learning_rate": 1.3302235066144948e-06, + "loss": 0.5248, + "step": 3572 + }, + { + "epoch": 0.77, + "grad_norm": 0.15347984433174133, + "learning_rate": 1.3278546060622727e-06, + "loss": 0.5024, + "step": 3573 + }, + { + "epoch": 0.77, + "grad_norm": 0.15400084853172302, + "learning_rate": 1.3254874936792672e-06, + "loss": 0.5103, + "step": 3574 + }, + { + "epoch": 0.77, + "grad_norm": 0.23529481887817383, + "learning_rate": 1.3231221706181575e-06, + "loss": 0.4866, + "step": 3575 + }, + { + "epoch": 0.77, + "grad_norm": 0.18368123471736908, + "learning_rate": 1.3207586380307486e-06, + "loss": 0.4999, + "step": 3576 + }, + { + "epoch": 0.77, + "grad_norm": 0.1438288390636444, + "learning_rate": 1.318396897067975e-06, + "loss": 0.6058, + "step": 3577 + }, + { + "epoch": 0.77, + "grad_norm": 0.13895906507968903, + "learning_rate": 1.3160369488798984e-06, + "loss": 0.4721, + "step": 3578 + }, + { + "epoch": 0.77, + "grad_norm": 0.1586867719888687, + "learning_rate": 1.3136787946157055e-06, + "loss": 0.5271, + "step": 3579 + }, + { + "epoch": 0.77, + "grad_norm": 0.15690717101097107, + "learning_rate": 1.3113224354237113e-06, + "loss": 0.5475, + "step": 3580 + }, + { + "epoch": 0.77, + "grad_norm": 0.15662527084350586, + "learning_rate": 1.3089678724513589e-06, + "loss": 0.5388, + "step": 3581 + }, + { + "epoch": 0.77, + "grad_norm": 0.12812086939811707, + "learning_rate": 1.306615106845211e-06, + "loss": 0.4969, + "step": 3582 + }, + { + "epoch": 0.77, + "grad_norm": 0.14390747249126434, + "learning_rate": 1.3042641397509597e-06, + "loss": 0.4779, + "step": 3583 + }, + { + "epoch": 0.77, + "grad_norm": 0.1376083791255951, + "learning_rate": 1.30191497231342e-06, + "loss": 0.5654, + "step": 3584 + }, + { + "epoch": 0.77, + "grad_norm": 0.20095455646514893, + "learning_rate": 1.299567605676531e-06, + "loss": 0.518, + "step": 3585 + }, + { + "epoch": 0.77, + "grad_norm": 0.16395071148872375, + "learning_rate": 1.2972220409833552e-06, + "loss": 0.5361, + "step": 3586 + }, + { + "epoch": 0.77, + "grad_norm": 0.12365875393152237, + "learning_rate": 1.2948782793760745e-06, + "loss": 0.5278, + "step": 3587 + }, + { + "epoch": 0.77, + "grad_norm": 0.1766958236694336, + "learning_rate": 1.2925363219959958e-06, + "loss": 0.569, + "step": 3588 + }, + { + "epoch": 0.77, + "grad_norm": 0.15711119771003723, + "learning_rate": 1.2901961699835475e-06, + "loss": 0.542, + "step": 3589 + }, + { + "epoch": 0.77, + "grad_norm": 0.16060031950473785, + "learning_rate": 1.2878578244782775e-06, + "loss": 0.5658, + "step": 3590 + }, + { + "epoch": 0.77, + "grad_norm": 0.15911975502967834, + "learning_rate": 1.2855212866188566e-06, + "loss": 0.5181, + "step": 3591 + }, + { + "epoch": 0.77, + "grad_norm": 0.16181927919387817, + "learning_rate": 1.2831865575430702e-06, + "loss": 0.5686, + "step": 3592 + }, + { + "epoch": 0.77, + "grad_norm": 0.1953277289867401, + "learning_rate": 1.2808536383878295e-06, + "loss": 0.5062, + "step": 3593 + }, + { + "epoch": 0.77, + "grad_norm": 0.17257684469223022, + "learning_rate": 1.2785225302891568e-06, + "loss": 0.4755, + "step": 3594 + }, + { + "epoch": 0.77, + "grad_norm": 0.17315979301929474, + "learning_rate": 1.2761932343821992e-06, + "loss": 0.5166, + "step": 3595 + }, + { + "epoch": 0.77, + "grad_norm": 0.19010502099990845, + "learning_rate": 1.2738657518012188e-06, + "loss": 0.5653, + "step": 3596 + }, + { + "epoch": 0.77, + "grad_norm": 0.1522989571094513, + "learning_rate": 1.2715400836795939e-06, + "loss": 0.5601, + "step": 3597 + }, + { + "epoch": 0.78, + "grad_norm": 0.15732337534427643, + "learning_rate": 1.2692162311498219e-06, + "loss": 0.5467, + "step": 3598 + }, + { + "epoch": 0.78, + "grad_norm": 0.16109420359134674, + "learning_rate": 1.266894195343511e-06, + "loss": 0.5328, + "step": 3599 + }, + { + "epoch": 0.78, + "grad_norm": 0.16374173760414124, + "learning_rate": 1.2645739773913911e-06, + "loss": 0.5324, + "step": 3600 + }, + { + "epoch": 0.78, + "grad_norm": 0.18841396272182465, + "learning_rate": 1.2622555784232992e-06, + "loss": 0.4905, + "step": 3601 + }, + { + "epoch": 0.78, + "grad_norm": 0.17438913881778717, + "learning_rate": 1.259938999568196e-06, + "loss": 0.4836, + "step": 3602 + }, + { + "epoch": 0.78, + "grad_norm": 0.1530585139989853, + "learning_rate": 1.2576242419541502e-06, + "loss": 0.4937, + "step": 3603 + }, + { + "epoch": 0.78, + "grad_norm": 0.16211232542991638, + "learning_rate": 1.2553113067083417e-06, + "loss": 0.5307, + "step": 3604 + }, + { + "epoch": 0.78, + "grad_norm": 0.18766599893569946, + "learning_rate": 1.2530001949570686e-06, + "loss": 0.4523, + "step": 3605 + }, + { + "epoch": 0.78, + "grad_norm": 0.14365822076797485, + "learning_rate": 1.2506909078257357e-06, + "loss": 0.5097, + "step": 3606 + }, + { + "epoch": 0.78, + "grad_norm": 0.13116493821144104, + "learning_rate": 1.2483834464388622e-06, + "loss": 0.5036, + "step": 3607 + }, + { + "epoch": 0.78, + "grad_norm": 0.15349233150482178, + "learning_rate": 1.2460778119200778e-06, + "loss": 0.4983, + "step": 3608 + }, + { + "epoch": 0.78, + "grad_norm": 0.15856203436851501, + "learning_rate": 1.2437740053921238e-06, + "loss": 0.4921, + "step": 3609 + }, + { + "epoch": 0.78, + "grad_norm": 0.13519662618637085, + "learning_rate": 1.24147202797685e-06, + "loss": 0.5291, + "step": 3610 + }, + { + "epoch": 0.78, + "grad_norm": 0.14394241571426392, + "learning_rate": 1.2391718807952142e-06, + "loss": 0.5235, + "step": 3611 + }, + { + "epoch": 0.78, + "grad_norm": 0.12606112658977509, + "learning_rate": 1.236873564967284e-06, + "loss": 0.4571, + "step": 3612 + }, + { + "epoch": 0.78, + "grad_norm": 0.1665707379579544, + "learning_rate": 1.2345770816122388e-06, + "loss": 0.5432, + "step": 3613 + }, + { + "epoch": 0.78, + "grad_norm": 0.14036637544631958, + "learning_rate": 1.2322824318483568e-06, + "loss": 0.4873, + "step": 3614 + }, + { + "epoch": 0.78, + "grad_norm": 0.1713072657585144, + "learning_rate": 1.2299896167930358e-06, + "loss": 0.5134, + "step": 3615 + }, + { + "epoch": 0.78, + "grad_norm": 0.15142671763896942, + "learning_rate": 1.227698637562768e-06, + "loss": 0.5193, + "step": 3616 + }, + { + "epoch": 0.78, + "grad_norm": 0.148328959941864, + "learning_rate": 1.2254094952731594e-06, + "loss": 0.5107, + "step": 3617 + }, + { + "epoch": 0.78, + "grad_norm": 0.15323348343372345, + "learning_rate": 1.2231221910389196e-06, + "loss": 0.5187, + "step": 3618 + }, + { + "epoch": 0.78, + "grad_norm": 0.13149654865264893, + "learning_rate": 1.2208367259738602e-06, + "loss": 0.5422, + "step": 3619 + }, + { + "epoch": 0.78, + "grad_norm": 0.15822039544582367, + "learning_rate": 1.2185531011909008e-06, + "loss": 0.493, + "step": 3620 + }, + { + "epoch": 0.78, + "grad_norm": 0.1450645625591278, + "learning_rate": 1.2162713178020641e-06, + "loss": 0.4954, + "step": 3621 + }, + { + "epoch": 0.78, + "grad_norm": 0.14330001175403595, + "learning_rate": 1.2139913769184757e-06, + "loss": 0.4457, + "step": 3622 + }, + { + "epoch": 0.78, + "grad_norm": 0.1793079823255539, + "learning_rate": 1.211713279650365e-06, + "loss": 0.5186, + "step": 3623 + }, + { + "epoch": 0.78, + "grad_norm": 0.1488538533449173, + "learning_rate": 1.2094370271070599e-06, + "loss": 0.5479, + "step": 3624 + }, + { + "epoch": 0.78, + "grad_norm": 0.18175008893013, + "learning_rate": 1.207162620396996e-06, + "loss": 0.5202, + "step": 3625 + }, + { + "epoch": 0.78, + "grad_norm": 0.15374873578548431, + "learning_rate": 1.2048900606277036e-06, + "loss": 0.5404, + "step": 3626 + }, + { + "epoch": 0.78, + "grad_norm": 0.1646018624305725, + "learning_rate": 1.2026193489058185e-06, + "loss": 0.5023, + "step": 3627 + }, + { + "epoch": 0.78, + "grad_norm": 0.13878245651721954, + "learning_rate": 1.2003504863370746e-06, + "loss": 0.4892, + "step": 3628 + }, + { + "epoch": 0.78, + "grad_norm": 0.1426292210817337, + "learning_rate": 1.1980834740263065e-06, + "loss": 0.5052, + "step": 3629 + }, + { + "epoch": 0.78, + "grad_norm": 0.15884622931480408, + "learning_rate": 1.195818313077447e-06, + "loss": 0.5517, + "step": 3630 + }, + { + "epoch": 0.78, + "grad_norm": 0.15096734464168549, + "learning_rate": 1.1935550045935252e-06, + "loss": 0.4624, + "step": 3631 + }, + { + "epoch": 0.78, + "grad_norm": 0.1437617689371109, + "learning_rate": 1.1912935496766719e-06, + "loss": 0.4659, + "step": 3632 + }, + { + "epoch": 0.78, + "grad_norm": 0.22621825337409973, + "learning_rate": 1.1890339494281133e-06, + "loss": 0.4706, + "step": 3633 + }, + { + "epoch": 0.78, + "grad_norm": 0.17736200988292694, + "learning_rate": 1.186776204948173e-06, + "loss": 0.5366, + "step": 3634 + }, + { + "epoch": 0.78, + "grad_norm": 0.1408076137304306, + "learning_rate": 1.1845203173362725e-06, + "loss": 0.5448, + "step": 3635 + }, + { + "epoch": 0.78, + "grad_norm": 0.15870949625968933, + "learning_rate": 1.182266287690924e-06, + "loss": 0.4851, + "step": 3636 + }, + { + "epoch": 0.78, + "grad_norm": 0.12970934808254242, + "learning_rate": 1.1800141171097412e-06, + "loss": 0.5177, + "step": 3637 + }, + { + "epoch": 0.78, + "grad_norm": 0.15456534922122955, + "learning_rate": 1.177763806689427e-06, + "loss": 0.4997, + "step": 3638 + }, + { + "epoch": 0.78, + "grad_norm": 0.14675313234329224, + "learning_rate": 1.175515357525781e-06, + "loss": 0.5198, + "step": 3639 + }, + { + "epoch": 0.78, + "grad_norm": 0.1615784913301468, + "learning_rate": 1.173268770713701e-06, + "loss": 0.5251, + "step": 3640 + }, + { + "epoch": 0.78, + "grad_norm": 0.19272805750370026, + "learning_rate": 1.1710240473471685e-06, + "loss": 0.5331, + "step": 3641 + }, + { + "epoch": 0.78, + "grad_norm": 0.15185219049453735, + "learning_rate": 1.1687811885192662e-06, + "loss": 0.5372, + "step": 3642 + }, + { + "epoch": 0.78, + "grad_norm": 0.1498749703168869, + "learning_rate": 1.1665401953221622e-06, + "loss": 0.4953, + "step": 3643 + }, + { + "epoch": 0.78, + "grad_norm": 0.12573790550231934, + "learning_rate": 1.16430106884712e-06, + "loss": 0.556, + "step": 3644 + }, + { + "epoch": 0.79, + "grad_norm": 0.15969648957252502, + "learning_rate": 1.1620638101844938e-06, + "loss": 0.4978, + "step": 3645 + }, + { + "epoch": 0.79, + "grad_norm": 0.14067567884922028, + "learning_rate": 1.159828420423728e-06, + "loss": 0.5057, + "step": 3646 + }, + { + "epoch": 0.79, + "grad_norm": 0.15133407711982727, + "learning_rate": 1.157594900653357e-06, + "loss": 0.5406, + "step": 3647 + }, + { + "epoch": 0.79, + "grad_norm": 0.1627969741821289, + "learning_rate": 1.1553632519610025e-06, + "loss": 0.5282, + "step": 3648 + }, + { + "epoch": 0.79, + "grad_norm": 0.163666769862175, + "learning_rate": 1.1531334754333772e-06, + "loss": 0.5655, + "step": 3649 + }, + { + "epoch": 0.79, + "grad_norm": 0.1481485515832901, + "learning_rate": 1.1509055721562839e-06, + "loss": 0.5012, + "step": 3650 + }, + { + "epoch": 0.79, + "grad_norm": 0.1437186896800995, + "learning_rate": 1.148679543214608e-06, + "loss": 0.4814, + "step": 3651 + }, + { + "epoch": 0.79, + "grad_norm": 0.1677170991897583, + "learning_rate": 1.1464553896923264e-06, + "loss": 0.5308, + "step": 3652 + }, + { + "epoch": 0.79, + "grad_norm": 0.15452422201633453, + "learning_rate": 1.1442331126725014e-06, + "loss": 0.4929, + "step": 3653 + }, + { + "epoch": 0.79, + "grad_norm": 0.1445283144712448, + "learning_rate": 1.1420127132372839e-06, + "loss": 0.5104, + "step": 3654 + }, + { + "epoch": 0.79, + "grad_norm": 0.13753926753997803, + "learning_rate": 1.1397941924679046e-06, + "loss": 0.4942, + "step": 3655 + }, + { + "epoch": 0.79, + "grad_norm": 0.1484784483909607, + "learning_rate": 1.1375775514446846e-06, + "loss": 0.5266, + "step": 3656 + }, + { + "epoch": 0.79, + "grad_norm": 0.21680472791194916, + "learning_rate": 1.1353627912470289e-06, + "loss": 0.5809, + "step": 3657 + }, + { + "epoch": 0.79, + "grad_norm": 0.14361554384231567, + "learning_rate": 1.1331499129534252e-06, + "loss": 0.5438, + "step": 3658 + }, + { + "epoch": 0.79, + "grad_norm": 0.14105179905891418, + "learning_rate": 1.1309389176414471e-06, + "loss": 0.5111, + "step": 3659 + }, + { + "epoch": 0.79, + "grad_norm": 0.1748628318309784, + "learning_rate": 1.128729806387746e-06, + "loss": 0.537, + "step": 3660 + }, + { + "epoch": 0.79, + "grad_norm": 0.162385493516922, + "learning_rate": 1.1265225802680623e-06, + "loss": 0.5325, + "step": 3661 + }, + { + "epoch": 0.79, + "grad_norm": 0.20901146531105042, + "learning_rate": 1.124317240357216e-06, + "loss": 0.5093, + "step": 3662 + }, + { + "epoch": 0.79, + "grad_norm": 0.14778484404087067, + "learning_rate": 1.122113787729106e-06, + "loss": 0.5484, + "step": 3663 + }, + { + "epoch": 0.79, + "grad_norm": 0.1553722620010376, + "learning_rate": 1.119912223456715e-06, + "loss": 0.5044, + "step": 3664 + }, + { + "epoch": 0.79, + "grad_norm": 0.15933193266391754, + "learning_rate": 1.117712548612106e-06, + "loss": 0.5267, + "step": 3665 + }, + { + "epoch": 0.79, + "grad_norm": 0.14769431948661804, + "learning_rate": 1.1155147642664217e-06, + "loss": 0.5027, + "step": 3666 + }, + { + "epoch": 0.79, + "grad_norm": 0.15224431455135345, + "learning_rate": 1.1133188714898846e-06, + "loss": 0.5068, + "step": 3667 + }, + { + "epoch": 0.79, + "grad_norm": 0.25812146067619324, + "learning_rate": 1.1111248713517935e-06, + "loss": 0.516, + "step": 3668 + }, + { + "epoch": 0.79, + "grad_norm": 0.1287028193473816, + "learning_rate": 1.1089327649205301e-06, + "loss": 0.497, + "step": 3669 + }, + { + "epoch": 0.79, + "grad_norm": 0.17315033078193665, + "learning_rate": 1.1067425532635463e-06, + "loss": 0.5764, + "step": 3670 + }, + { + "epoch": 0.79, + "grad_norm": 0.17579080164432526, + "learning_rate": 1.1045542374473821e-06, + "loss": 0.508, + "step": 3671 + }, + { + "epoch": 0.79, + "grad_norm": 0.1450372189283371, + "learning_rate": 1.1023678185376474e-06, + "loss": 0.5104, + "step": 3672 + }, + { + "epoch": 0.79, + "grad_norm": 0.2039722353219986, + "learning_rate": 1.1001832975990274e-06, + "loss": 0.5159, + "step": 3673 + }, + { + "epoch": 0.79, + "grad_norm": 0.15762609243392944, + "learning_rate": 1.0980006756952882e-06, + "loss": 0.5387, + "step": 3674 + }, + { + "epoch": 0.79, + "grad_norm": 0.14661352336406708, + "learning_rate": 1.095819953889265e-06, + "loss": 0.4672, + "step": 3675 + }, + { + "epoch": 0.79, + "grad_norm": 0.18748416006565094, + "learning_rate": 1.0936411332428732e-06, + "loss": 0.4949, + "step": 3676 + }, + { + "epoch": 0.79, + "grad_norm": 0.18095709383487701, + "learning_rate": 1.091464214817099e-06, + "loss": 0.5316, + "step": 3677 + }, + { + "epoch": 0.79, + "grad_norm": 0.13158752024173737, + "learning_rate": 1.089289199672004e-06, + "loss": 0.4978, + "step": 3678 + }, + { + "epoch": 0.79, + "grad_norm": 0.15241560339927673, + "learning_rate": 1.0871160888667242e-06, + "loss": 0.5195, + "step": 3679 + }, + { + "epoch": 0.79, + "grad_norm": 0.13921800255775452, + "learning_rate": 1.084944883459464e-06, + "loss": 0.5269, + "step": 3680 + }, + { + "epoch": 0.79, + "grad_norm": 0.16038571298122406, + "learning_rate": 1.0827755845075044e-06, + "loss": 0.5714, + "step": 3681 + }, + { + "epoch": 0.79, + "grad_norm": 0.1517871618270874, + "learning_rate": 1.0806081930671947e-06, + "loss": 0.4976, + "step": 3682 + }, + { + "epoch": 0.79, + "grad_norm": 0.1371169090270996, + "learning_rate": 1.0784427101939553e-06, + "loss": 0.5421, + "step": 3683 + }, + { + "epoch": 0.79, + "grad_norm": 0.13309676945209503, + "learning_rate": 1.0762791369422838e-06, + "loss": 0.4903, + "step": 3684 + }, + { + "epoch": 0.79, + "grad_norm": 0.15115486085414886, + "learning_rate": 1.0741174743657385e-06, + "loss": 0.5011, + "step": 3685 + }, + { + "epoch": 0.79, + "grad_norm": 0.1870022863149643, + "learning_rate": 1.0719577235169537e-06, + "loss": 0.5292, + "step": 3686 + }, + { + "epoch": 0.79, + "grad_norm": 0.37398022413253784, + "learning_rate": 1.0697998854476294e-06, + "loss": 0.5336, + "step": 3687 + }, + { + "epoch": 0.79, + "grad_norm": 0.1611040085554123, + "learning_rate": 1.0676439612085353e-06, + "loss": 0.5077, + "step": 3688 + }, + { + "epoch": 0.79, + "grad_norm": 0.14922092854976654, + "learning_rate": 1.0654899518495104e-06, + "loss": 0.5461, + "step": 3689 + }, + { + "epoch": 0.79, + "grad_norm": 0.1831667125225067, + "learning_rate": 1.0633378584194593e-06, + "loss": 0.4868, + "step": 3690 + }, + { + "epoch": 0.8, + "grad_norm": 0.15320684015750885, + "learning_rate": 1.0611876819663557e-06, + "loss": 0.5232, + "step": 3691 + }, + { + "epoch": 0.8, + "grad_norm": 0.14661262929439545, + "learning_rate": 1.059039423537237e-06, + "loss": 0.485, + "step": 3692 + }, + { + "epoch": 0.8, + "grad_norm": 0.13266430795192719, + "learning_rate": 1.0568930841782088e-06, + "loss": 0.5187, + "step": 3693 + }, + { + "epoch": 0.8, + "grad_norm": 0.22684414684772491, + "learning_rate": 1.054748664934443e-06, + "loss": 0.5477, + "step": 3694 + }, + { + "epoch": 0.8, + "grad_norm": 0.1602558046579361, + "learning_rate": 1.0526061668501708e-06, + "loss": 0.4672, + "step": 3695 + }, + { + "epoch": 0.8, + "grad_norm": 0.1579863578081131, + "learning_rate": 1.0504655909686978e-06, + "loss": 0.5119, + "step": 3696 + }, + { + "epoch": 0.8, + "grad_norm": 0.1422806680202484, + "learning_rate": 1.048326938332384e-06, + "loss": 0.549, + "step": 3697 + }, + { + "epoch": 0.8, + "grad_norm": 0.13555404543876648, + "learning_rate": 1.0461902099826577e-06, + "loss": 0.5839, + "step": 3698 + }, + { + "epoch": 0.8, + "grad_norm": 0.16685040295124054, + "learning_rate": 1.0440554069600112e-06, + "loss": 0.523, + "step": 3699 + }, + { + "epoch": 0.8, + "grad_norm": 0.15756739675998688, + "learning_rate": 1.0419225303039943e-06, + "loss": 0.4513, + "step": 3700 + }, + { + "epoch": 0.8, + "grad_norm": 0.15640777349472046, + "learning_rate": 1.0397915810532227e-06, + "loss": 0.525, + "step": 3701 + }, + { + "epoch": 0.8, + "grad_norm": 0.15122468769550323, + "learning_rate": 1.0376625602453733e-06, + "loss": 0.5116, + "step": 3702 + }, + { + "epoch": 0.8, + "grad_norm": 0.15925420820713043, + "learning_rate": 1.0355354689171831e-06, + "loss": 0.5259, + "step": 3703 + }, + { + "epoch": 0.8, + "grad_norm": 0.14913156628608704, + "learning_rate": 1.0334103081044504e-06, + "loss": 0.5148, + "step": 3704 + }, + { + "epoch": 0.8, + "grad_norm": 0.15156058967113495, + "learning_rate": 1.031287078842031e-06, + "loss": 0.5239, + "step": 3705 + }, + { + "epoch": 0.8, + "grad_norm": 0.1923210173845291, + "learning_rate": 1.0291657821638435e-06, + "loss": 0.5351, + "step": 3706 + }, + { + "epoch": 0.8, + "grad_norm": 0.16889818012714386, + "learning_rate": 1.0270464191028618e-06, + "loss": 0.5231, + "step": 3707 + }, + { + "epoch": 0.8, + "grad_norm": 0.13356614112854004, + "learning_rate": 1.024928990691121e-06, + "loss": 0.506, + "step": 3708 + }, + { + "epoch": 0.8, + "grad_norm": 0.1991608887910843, + "learning_rate": 1.0228134979597126e-06, + "loss": 0.5501, + "step": 3709 + }, + { + "epoch": 0.8, + "grad_norm": 0.15821781754493713, + "learning_rate": 1.0206999419387881e-06, + "loss": 0.5371, + "step": 3710 + }, + { + "epoch": 0.8, + "grad_norm": 0.1407308578491211, + "learning_rate": 1.0185883236575533e-06, + "loss": 0.5072, + "step": 3711 + }, + { + "epoch": 0.8, + "grad_norm": 0.14752401411533356, + "learning_rate": 1.0164786441442698e-06, + "loss": 0.5163, + "step": 3712 + }, + { + "epoch": 0.8, + "grad_norm": 0.14390012621879578, + "learning_rate": 1.0143709044262574e-06, + "loss": 0.4969, + "step": 3713 + }, + { + "epoch": 0.8, + "grad_norm": 0.1694592982530594, + "learning_rate": 1.0122651055298898e-06, + "loss": 0.4924, + "step": 3714 + }, + { + "epoch": 0.8, + "grad_norm": 0.15931564569473267, + "learning_rate": 1.0101612484805967e-06, + "loss": 0.4842, + "step": 3715 + }, + { + "epoch": 0.8, + "grad_norm": 0.16370849311351776, + "learning_rate": 1.0080593343028621e-06, + "loss": 0.497, + "step": 3716 + }, + { + "epoch": 0.8, + "grad_norm": 0.16331344842910767, + "learning_rate": 1.005959364020222e-06, + "loss": 0.4919, + "step": 3717 + }, + { + "epoch": 0.8, + "grad_norm": 0.1296970099210739, + "learning_rate": 1.0038613386552687e-06, + "loss": 0.5674, + "step": 3718 + }, + { + "epoch": 0.8, + "grad_norm": 0.15003569424152374, + "learning_rate": 1.001765259229644e-06, + "loss": 0.5164, + "step": 3719 + }, + { + "epoch": 0.8, + "grad_norm": 0.14973247051239014, + "learning_rate": 9.996711267640451e-07, + "loss": 0.4997, + "step": 3720 + }, + { + "epoch": 0.8, + "grad_norm": 0.14696918427944183, + "learning_rate": 9.975789422782205e-07, + "loss": 0.4806, + "step": 3721 + }, + { + "epoch": 0.8, + "grad_norm": 0.14564906060695648, + "learning_rate": 9.95488706790969e-07, + "loss": 0.5491, + "step": 3722 + }, + { + "epoch": 0.8, + "grad_norm": 0.18390415608882904, + "learning_rate": 9.934004213201431e-07, + "loss": 0.5264, + "step": 3723 + }, + { + "epoch": 0.8, + "grad_norm": 0.1590055525302887, + "learning_rate": 9.913140868826405e-07, + "loss": 0.497, + "step": 3724 + }, + { + "epoch": 0.8, + "grad_norm": 0.1445043832063675, + "learning_rate": 9.892297044944133e-07, + "loss": 0.5089, + "step": 3725 + }, + { + "epoch": 0.8, + "grad_norm": 0.15211768448352814, + "learning_rate": 9.871472751704625e-07, + "loss": 0.5093, + "step": 3726 + }, + { + "epoch": 0.8, + "grad_norm": 0.1753348559141159, + "learning_rate": 9.85066799924836e-07, + "loss": 0.5038, + "step": 3727 + }, + { + "epoch": 0.8, + "grad_norm": 0.14900852739810944, + "learning_rate": 9.829882797706336e-07, + "loss": 0.4721, + "step": 3728 + }, + { + "epoch": 0.8, + "grad_norm": 0.1514863520860672, + "learning_rate": 9.809117157199982e-07, + "loss": 0.5869, + "step": 3729 + }, + { + "epoch": 0.8, + "grad_norm": 0.15811176598072052, + "learning_rate": 9.788371087841236e-07, + "loss": 0.5396, + "step": 3730 + }, + { + "epoch": 0.8, + "grad_norm": 0.1752791553735733, + "learning_rate": 9.767644599732517e-07, + "loss": 0.4918, + "step": 3731 + }, + { + "epoch": 0.8, + "grad_norm": 0.17730747163295746, + "learning_rate": 9.74693770296667e-07, + "loss": 0.5003, + "step": 3732 + }, + { + "epoch": 0.8, + "grad_norm": 0.1746446192264557, + "learning_rate": 9.72625040762702e-07, + "loss": 0.4698, + "step": 3733 + }, + { + "epoch": 0.8, + "grad_norm": 0.14243842661380768, + "learning_rate": 9.705582723787348e-07, + "loss": 0.5296, + "step": 3734 + }, + { + "epoch": 0.8, + "grad_norm": 0.17734676599502563, + "learning_rate": 9.684934661511909e-07, + "loss": 0.5386, + "step": 3735 + }, + { + "epoch": 0.8, + "grad_norm": 0.12994273006916046, + "learning_rate": 9.664306230855342e-07, + "loss": 0.5133, + "step": 3736 + }, + { + "epoch": 0.8, + "grad_norm": 0.14365623891353607, + "learning_rate": 9.643697441862782e-07, + "loss": 0.4759, + "step": 3737 + }, + { + "epoch": 0.81, + "grad_norm": 0.12920841574668884, + "learning_rate": 9.623108304569783e-07, + "loss": 0.4998, + "step": 3738 + }, + { + "epoch": 0.81, + "grad_norm": 0.14855559170246124, + "learning_rate": 9.6025388290023e-07, + "loss": 0.5141, + "step": 3739 + }, + { + "epoch": 0.81, + "grad_norm": 0.16959311068058014, + "learning_rate": 9.58198902517678e-07, + "loss": 0.5376, + "step": 3740 + }, + { + "epoch": 0.81, + "grad_norm": 0.14248433709144592, + "learning_rate": 9.561458903100025e-07, + "loss": 0.5684, + "step": 3741 + }, + { + "epoch": 0.81, + "grad_norm": 0.1944878101348877, + "learning_rate": 9.540948472769278e-07, + "loss": 0.4685, + "step": 3742 + }, + { + "epoch": 0.81, + "grad_norm": 0.16937778890132904, + "learning_rate": 9.520457744172218e-07, + "loss": 0.5127, + "step": 3743 + }, + { + "epoch": 0.81, + "grad_norm": 0.15535053610801697, + "learning_rate": 9.499986727286869e-07, + "loss": 0.509, + "step": 3744 + }, + { + "epoch": 0.81, + "grad_norm": 0.13496124744415283, + "learning_rate": 9.479535432081716e-07, + "loss": 0.4883, + "step": 3745 + }, + { + "epoch": 0.81, + "grad_norm": 0.15980157256126404, + "learning_rate": 9.459103868515618e-07, + "loss": 0.5115, + "step": 3746 + }, + { + "epoch": 0.81, + "grad_norm": 0.13277289271354675, + "learning_rate": 9.438692046537812e-07, + "loss": 0.5383, + "step": 3747 + }, + { + "epoch": 0.81, + "grad_norm": 0.15120829641819, + "learning_rate": 9.418299976087964e-07, + "loss": 0.4822, + "step": 3748 + }, + { + "epoch": 0.81, + "grad_norm": 0.18064884841442108, + "learning_rate": 9.397927667096058e-07, + "loss": 0.4813, + "step": 3749 + }, + { + "epoch": 0.81, + "grad_norm": 0.15214307606220245, + "learning_rate": 9.377575129482513e-07, + "loss": 0.538, + "step": 3750 + }, + { + "epoch": 0.81, + "grad_norm": 0.13899169862270355, + "learning_rate": 9.357242373158076e-07, + "loss": 0.5259, + "step": 3751 + }, + { + "epoch": 0.81, + "grad_norm": 0.17859694361686707, + "learning_rate": 9.336929408023887e-07, + "loss": 0.5298, + "step": 3752 + }, + { + "epoch": 0.81, + "grad_norm": 0.16139504313468933, + "learning_rate": 9.316636243971472e-07, + "loss": 0.47, + "step": 3753 + }, + { + "epoch": 0.81, + "grad_norm": 0.16516685485839844, + "learning_rate": 9.29636289088266e-07, + "loss": 0.4834, + "step": 3754 + }, + { + "epoch": 0.81, + "grad_norm": 0.1459476500749588, + "learning_rate": 9.27610935862967e-07, + "loss": 0.505, + "step": 3755 + }, + { + "epoch": 0.81, + "grad_norm": 0.14627854526042938, + "learning_rate": 9.255875657075053e-07, + "loss": 0.5443, + "step": 3756 + }, + { + "epoch": 0.81, + "grad_norm": 0.17981037497520447, + "learning_rate": 9.235661796071704e-07, + "loss": 0.5165, + "step": 3757 + }, + { + "epoch": 0.81, + "grad_norm": 0.14158649742603302, + "learning_rate": 9.215467785462873e-07, + "loss": 0.5373, + "step": 3758 + }, + { + "epoch": 0.81, + "grad_norm": 0.1404084414243698, + "learning_rate": 9.195293635082125e-07, + "loss": 0.5071, + "step": 3759 + }, + { + "epoch": 0.81, + "grad_norm": 0.18543866276741028, + "learning_rate": 9.175139354753382e-07, + "loss": 0.4776, + "step": 3760 + }, + { + "epoch": 0.81, + "grad_norm": 0.1633271872997284, + "learning_rate": 9.155004954290842e-07, + "loss": 0.5757, + "step": 3761 + }, + { + "epoch": 0.81, + "grad_norm": 0.13727520406246185, + "learning_rate": 9.134890443499068e-07, + "loss": 0.489, + "step": 3762 + }, + { + "epoch": 0.81, + "grad_norm": 0.14105379581451416, + "learning_rate": 9.114795832172907e-07, + "loss": 0.4545, + "step": 3763 + }, + { + "epoch": 0.81, + "grad_norm": 0.15787868201732635, + "learning_rate": 9.094721130097517e-07, + "loss": 0.5232, + "step": 3764 + }, + { + "epoch": 0.81, + "grad_norm": 0.12892010807991028, + "learning_rate": 9.074666347048416e-07, + "loss": 0.5527, + "step": 3765 + }, + { + "epoch": 0.81, + "grad_norm": 0.1309516578912735, + "learning_rate": 9.054631492791344e-07, + "loss": 0.5209, + "step": 3766 + }, + { + "epoch": 0.81, + "grad_norm": 0.14891640841960907, + "learning_rate": 9.034616577082389e-07, + "loss": 0.4782, + "step": 3767 + }, + { + "epoch": 0.81, + "grad_norm": 0.17316175997257233, + "learning_rate": 9.014621609667896e-07, + "loss": 0.5075, + "step": 3768 + }, + { + "epoch": 0.81, + "grad_norm": 0.17712554335594177, + "learning_rate": 8.994646600284518e-07, + "loss": 0.5551, + "step": 3769 + }, + { + "epoch": 0.81, + "grad_norm": 0.17951372265815735, + "learning_rate": 8.974691558659187e-07, + "loss": 0.4612, + "step": 3770 + }, + { + "epoch": 0.81, + "grad_norm": 0.18492546677589417, + "learning_rate": 8.954756494509104e-07, + "loss": 0.498, + "step": 3771 + }, + { + "epoch": 0.81, + "grad_norm": 0.15967923402786255, + "learning_rate": 8.934841417541767e-07, + "loss": 0.5152, + "step": 3772 + }, + { + "epoch": 0.81, + "grad_norm": 0.1444973647594452, + "learning_rate": 8.914946337454894e-07, + "loss": 0.4852, + "step": 3773 + }, + { + "epoch": 0.81, + "grad_norm": 0.13344036042690277, + "learning_rate": 8.8950712639365e-07, + "loss": 0.5396, + "step": 3774 + }, + { + "epoch": 0.81, + "grad_norm": 0.14624960720539093, + "learning_rate": 8.87521620666486e-07, + "loss": 0.4983, + "step": 3775 + }, + { + "epoch": 0.81, + "grad_norm": 0.15818633139133453, + "learning_rate": 8.855381175308475e-07, + "loss": 0.4791, + "step": 3776 + }, + { + "epoch": 0.81, + "grad_norm": 0.17238670587539673, + "learning_rate": 8.835566179526118e-07, + "loss": 0.475, + "step": 3777 + }, + { + "epoch": 0.81, + "grad_norm": 0.16176079213619232, + "learning_rate": 8.815771228966796e-07, + "loss": 0.5353, + "step": 3778 + }, + { + "epoch": 0.81, + "grad_norm": 0.17096221446990967, + "learning_rate": 8.795996333269763e-07, + "loss": 0.483, + "step": 3779 + }, + { + "epoch": 0.81, + "grad_norm": 0.1631225347518921, + "learning_rate": 8.776241502064508e-07, + "loss": 0.5166, + "step": 3780 + }, + { + "epoch": 0.81, + "grad_norm": 0.19986139237880707, + "learning_rate": 8.756506744970722e-07, + "loss": 0.529, + "step": 3781 + }, + { + "epoch": 0.81, + "grad_norm": 0.1557171493768692, + "learning_rate": 8.736792071598355e-07, + "loss": 0.5267, + "step": 3782 + }, + { + "epoch": 0.81, + "grad_norm": 0.15632902085781097, + "learning_rate": 8.717097491547566e-07, + "loss": 0.5189, + "step": 3783 + }, + { + "epoch": 0.82, + "grad_norm": 0.16741085052490234, + "learning_rate": 8.697423014408718e-07, + "loss": 0.5474, + "step": 3784 + }, + { + "epoch": 0.82, + "grad_norm": 0.15666568279266357, + "learning_rate": 8.677768649762419e-07, + "loss": 0.5306, + "step": 3785 + }, + { + "epoch": 0.82, + "grad_norm": 0.20284314453601837, + "learning_rate": 8.658134407179419e-07, + "loss": 0.5003, + "step": 3786 + }, + { + "epoch": 0.82, + "grad_norm": 0.1518256664276123, + "learning_rate": 8.638520296220748e-07, + "loss": 0.5322, + "step": 3787 + }, + { + "epoch": 0.82, + "grad_norm": 0.18800678849220276, + "learning_rate": 8.61892632643756e-07, + "loss": 0.4763, + "step": 3788 + }, + { + "epoch": 0.82, + "grad_norm": 0.1399422585964203, + "learning_rate": 8.59935250737125e-07, + "loss": 0.5293, + "step": 3789 + }, + { + "epoch": 0.82, + "grad_norm": 0.14804938435554504, + "learning_rate": 8.579798848553389e-07, + "loss": 0.4703, + "step": 3790 + }, + { + "epoch": 0.82, + "grad_norm": 0.15333673357963562, + "learning_rate": 8.560265359505716e-07, + "loss": 0.4947, + "step": 3791 + }, + { + "epoch": 0.82, + "grad_norm": 0.1545214056968689, + "learning_rate": 8.540752049740181e-07, + "loss": 0.5079, + "step": 3792 + }, + { + "epoch": 0.82, + "grad_norm": 0.137412428855896, + "learning_rate": 8.521258928758864e-07, + "loss": 0.4973, + "step": 3793 + }, + { + "epoch": 0.82, + "grad_norm": 0.1467263251543045, + "learning_rate": 8.501786006054047e-07, + "loss": 0.5318, + "step": 3794 + }, + { + "epoch": 0.82, + "grad_norm": 0.16581839323043823, + "learning_rate": 8.482333291108141e-07, + "loss": 0.5226, + "step": 3795 + }, + { + "epoch": 0.82, + "grad_norm": 0.1452476680278778, + "learning_rate": 8.462900793393775e-07, + "loss": 0.5012, + "step": 3796 + }, + { + "epoch": 0.82, + "grad_norm": 0.1618988811969757, + "learning_rate": 8.443488522373694e-07, + "loss": 0.501, + "step": 3797 + }, + { + "epoch": 0.82, + "grad_norm": 0.1634100079536438, + "learning_rate": 8.424096487500777e-07, + "loss": 0.5288, + "step": 3798 + }, + { + "epoch": 0.82, + "grad_norm": 0.17481021583080292, + "learning_rate": 8.404724698218103e-07, + "loss": 0.5575, + "step": 3799 + }, + { + "epoch": 0.82, + "grad_norm": 0.13058480620384216, + "learning_rate": 8.385373163958821e-07, + "loss": 0.4976, + "step": 3800 + }, + { + "epoch": 0.82, + "grad_norm": 0.1389196217060089, + "learning_rate": 8.366041894146276e-07, + "loss": 0.4854, + "step": 3801 + }, + { + "epoch": 0.82, + "grad_norm": 0.15564516186714172, + "learning_rate": 8.346730898193928e-07, + "loss": 0.4984, + "step": 3802 + }, + { + "epoch": 0.82, + "grad_norm": 0.1349528729915619, + "learning_rate": 8.327440185505353e-07, + "loss": 0.5138, + "step": 3803 + }, + { + "epoch": 0.82, + "grad_norm": 0.1407652646303177, + "learning_rate": 8.308169765474278e-07, + "loss": 0.4912, + "step": 3804 + }, + { + "epoch": 0.82, + "grad_norm": 0.14387796819210052, + "learning_rate": 8.2889196474845e-07, + "loss": 0.5048, + "step": 3805 + }, + { + "epoch": 0.82, + "grad_norm": 0.15386423468589783, + "learning_rate": 8.269689840909967e-07, + "loss": 0.5339, + "step": 3806 + }, + { + "epoch": 0.82, + "grad_norm": 0.16335895657539368, + "learning_rate": 8.250480355114748e-07, + "loss": 0.5343, + "step": 3807 + }, + { + "epoch": 0.82, + "grad_norm": 0.16175401210784912, + "learning_rate": 8.231291199452956e-07, + "loss": 0.52, + "step": 3808 + }, + { + "epoch": 0.82, + "grad_norm": 0.15114691853523254, + "learning_rate": 8.212122383268889e-07, + "loss": 0.5034, + "step": 3809 + }, + { + "epoch": 0.82, + "grad_norm": 0.13014435768127441, + "learning_rate": 8.192973915896868e-07, + "loss": 0.5266, + "step": 3810 + }, + { + "epoch": 0.82, + "grad_norm": 0.1377837210893631, + "learning_rate": 8.17384580666134e-07, + "loss": 0.5326, + "step": 3811 + }, + { + "epoch": 0.82, + "grad_norm": 0.17275045812129974, + "learning_rate": 8.154738064876843e-07, + "loss": 0.5156, + "step": 3812 + }, + { + "epoch": 0.82, + "grad_norm": 0.1639435589313507, + "learning_rate": 8.135650699847963e-07, + "loss": 0.504, + "step": 3813 + }, + { + "epoch": 0.82, + "grad_norm": 0.18835903704166412, + "learning_rate": 8.116583720869398e-07, + "loss": 0.5377, + "step": 3814 + }, + { + "epoch": 0.82, + "grad_norm": 0.1467577964067459, + "learning_rate": 8.097537137225909e-07, + "loss": 0.5437, + "step": 3815 + }, + { + "epoch": 0.82, + "grad_norm": 0.1413908451795578, + "learning_rate": 8.078510958192337e-07, + "loss": 0.5246, + "step": 3816 + }, + { + "epoch": 0.82, + "grad_norm": 0.21813301742076874, + "learning_rate": 8.05950519303354e-07, + "loss": 0.48, + "step": 3817 + }, + { + "epoch": 0.82, + "grad_norm": 0.14480313658714294, + "learning_rate": 8.040519851004492e-07, + "loss": 0.5298, + "step": 3818 + }, + { + "epoch": 0.82, + "grad_norm": 0.16793721914291382, + "learning_rate": 8.021554941350202e-07, + "loss": 0.4885, + "step": 3819 + }, + { + "epoch": 0.82, + "grad_norm": 0.15354284644126892, + "learning_rate": 8.002610473305688e-07, + "loss": 0.4743, + "step": 3820 + }, + { + "epoch": 0.82, + "grad_norm": 0.15883216261863708, + "learning_rate": 7.983686456096112e-07, + "loss": 0.5344, + "step": 3821 + }, + { + "epoch": 0.82, + "grad_norm": 0.16302940249443054, + "learning_rate": 7.964782898936569e-07, + "loss": 0.5251, + "step": 3822 + }, + { + "epoch": 0.82, + "grad_norm": 0.1534924954175949, + "learning_rate": 7.945899811032254e-07, + "loss": 0.5438, + "step": 3823 + }, + { + "epoch": 0.82, + "grad_norm": 0.1581207513809204, + "learning_rate": 7.927037201578397e-07, + "loss": 0.4707, + "step": 3824 + }, + { + "epoch": 0.82, + "grad_norm": 0.16421711444854736, + "learning_rate": 7.908195079760205e-07, + "loss": 0.485, + "step": 3825 + }, + { + "epoch": 0.82, + "grad_norm": 0.16686981916427612, + "learning_rate": 7.889373454752964e-07, + "loss": 0.5225, + "step": 3826 + }, + { + "epoch": 0.82, + "grad_norm": 0.1350572556257248, + "learning_rate": 7.870572335721949e-07, + "loss": 0.5018, + "step": 3827 + }, + { + "epoch": 0.82, + "grad_norm": 0.1447533518075943, + "learning_rate": 7.851791731822461e-07, + "loss": 0.5149, + "step": 3828 + }, + { + "epoch": 0.82, + "grad_norm": 0.13988631963729858, + "learning_rate": 7.833031652199819e-07, + "loss": 0.5441, + "step": 3829 + }, + { + "epoch": 0.83, + "grad_norm": 0.17618080973625183, + "learning_rate": 7.814292105989308e-07, + "loss": 0.5189, + "step": 3830 + }, + { + "epoch": 0.83, + "grad_norm": 0.1565089076757431, + "learning_rate": 7.795573102316267e-07, + "loss": 0.5091, + "step": 3831 + }, + { + "epoch": 0.83, + "grad_norm": 0.15142589807510376, + "learning_rate": 7.776874650295984e-07, + "loss": 0.4814, + "step": 3832 + }, + { + "epoch": 0.83, + "grad_norm": 0.1675831824541092, + "learning_rate": 7.758196759033765e-07, + "loss": 0.4961, + "step": 3833 + }, + { + "epoch": 0.83, + "grad_norm": 0.15488111972808838, + "learning_rate": 7.739539437624933e-07, + "loss": 0.552, + "step": 3834 + }, + { + "epoch": 0.83, + "grad_norm": 0.133047953248024, + "learning_rate": 7.720902695154725e-07, + "loss": 0.506, + "step": 3835 + }, + { + "epoch": 0.83, + "grad_norm": 0.13741527497768402, + "learning_rate": 7.702286540698417e-07, + "loss": 0.4968, + "step": 3836 + }, + { + "epoch": 0.83, + "grad_norm": 0.13112328946590424, + "learning_rate": 7.683690983321224e-07, + "loss": 0.4906, + "step": 3837 + }, + { + "epoch": 0.83, + "grad_norm": 0.17950129508972168, + "learning_rate": 7.665116032078346e-07, + "loss": 0.5324, + "step": 3838 + }, + { + "epoch": 0.83, + "grad_norm": 0.21670496463775635, + "learning_rate": 7.646561696014948e-07, + "loss": 0.5378, + "step": 3839 + }, + { + "epoch": 0.83, + "grad_norm": 0.14193449914455414, + "learning_rate": 7.628027984166153e-07, + "loss": 0.5395, + "step": 3840 + }, + { + "epoch": 0.83, + "grad_norm": 0.14640313386917114, + "learning_rate": 7.609514905557058e-07, + "loss": 0.4765, + "step": 3841 + }, + { + "epoch": 0.83, + "grad_norm": 0.15662898123264313, + "learning_rate": 7.591022469202675e-07, + "loss": 0.5274, + "step": 3842 + }, + { + "epoch": 0.83, + "grad_norm": 0.15614978969097137, + "learning_rate": 7.57255068410801e-07, + "loss": 0.4858, + "step": 3843 + }, + { + "epoch": 0.83, + "grad_norm": 0.13435639441013336, + "learning_rate": 7.554099559267964e-07, + "loss": 0.4774, + "step": 3844 + }, + { + "epoch": 0.83, + "grad_norm": 0.1398366242647171, + "learning_rate": 7.535669103667409e-07, + "loss": 0.5893, + "step": 3845 + }, + { + "epoch": 0.83, + "grad_norm": 0.14986996352672577, + "learning_rate": 7.517259326281157e-07, + "loss": 0.5105, + "step": 3846 + }, + { + "epoch": 0.83, + "grad_norm": 0.15778091549873352, + "learning_rate": 7.49887023607393e-07, + "loss": 0.4488, + "step": 3847 + }, + { + "epoch": 0.83, + "grad_norm": 0.16323697566986084, + "learning_rate": 7.480501842000404e-07, + "loss": 0.5533, + "step": 3848 + }, + { + "epoch": 0.83, + "grad_norm": 0.14002352952957153, + "learning_rate": 7.462154153005136e-07, + "loss": 0.5196, + "step": 3849 + }, + { + "epoch": 0.83, + "grad_norm": 0.1188010647892952, + "learning_rate": 7.443827178022628e-07, + "loss": 0.4912, + "step": 3850 + }, + { + "epoch": 0.83, + "grad_norm": 0.14760838449001312, + "learning_rate": 7.425520925977292e-07, + "loss": 0.5157, + "step": 3851 + }, + { + "epoch": 0.83, + "grad_norm": 0.19391202926635742, + "learning_rate": 7.407235405783453e-07, + "loss": 0.4939, + "step": 3852 + }, + { + "epoch": 0.83, + "grad_norm": 0.1490384191274643, + "learning_rate": 7.388970626345343e-07, + "loss": 0.494, + "step": 3853 + }, + { + "epoch": 0.83, + "grad_norm": 0.16639220714569092, + "learning_rate": 7.370726596557059e-07, + "loss": 0.488, + "step": 3854 + }, + { + "epoch": 0.83, + "grad_norm": 0.16223375499248505, + "learning_rate": 7.352503325302635e-07, + "loss": 0.4825, + "step": 3855 + }, + { + "epoch": 0.83, + "grad_norm": 0.16969801485538483, + "learning_rate": 7.334300821455998e-07, + "loss": 0.5288, + "step": 3856 + }, + { + "epoch": 0.83, + "grad_norm": 0.1843784898519516, + "learning_rate": 7.316119093880919e-07, + "loss": 0.4818, + "step": 3857 + }, + { + "epoch": 0.83, + "grad_norm": 0.139174684882164, + "learning_rate": 7.297958151431094e-07, + "loss": 0.5019, + "step": 3858 + }, + { + "epoch": 0.83, + "grad_norm": 0.18277384340763092, + "learning_rate": 7.279818002950079e-07, + "loss": 0.5432, + "step": 3859 + }, + { + "epoch": 0.83, + "grad_norm": 0.1524992436170578, + "learning_rate": 7.26169865727131e-07, + "loss": 0.5223, + "step": 3860 + }, + { + "epoch": 0.83, + "grad_norm": 0.16654187440872192, + "learning_rate": 7.243600123218109e-07, + "loss": 0.4757, + "step": 3861 + }, + { + "epoch": 0.83, + "grad_norm": 0.24710118770599365, + "learning_rate": 7.225522409603608e-07, + "loss": 0.5699, + "step": 3862 + }, + { + "epoch": 0.83, + "grad_norm": 0.15701556205749512, + "learning_rate": 7.207465525230878e-07, + "loss": 0.5001, + "step": 3863 + }, + { + "epoch": 0.83, + "grad_norm": 0.17674629390239716, + "learning_rate": 7.189429478892762e-07, + "loss": 0.4661, + "step": 3864 + }, + { + "epoch": 0.83, + "grad_norm": 0.15791045129299164, + "learning_rate": 7.171414279372041e-07, + "loss": 0.4895, + "step": 3865 + }, + { + "epoch": 0.83, + "grad_norm": 0.10984218865633011, + "learning_rate": 7.153419935441303e-07, + "loss": 0.4908, + "step": 3866 + }, + { + "epoch": 0.83, + "grad_norm": 0.1386931836605072, + "learning_rate": 7.135446455862954e-07, + "loss": 0.452, + "step": 3867 + }, + { + "epoch": 0.83, + "grad_norm": 0.17676003277301788, + "learning_rate": 7.117493849389306e-07, + "loss": 0.5278, + "step": 3868 + }, + { + "epoch": 0.83, + "grad_norm": 0.1930963546037674, + "learning_rate": 7.099562124762426e-07, + "loss": 0.4919, + "step": 3869 + }, + { + "epoch": 0.83, + "grad_norm": 0.13967633247375488, + "learning_rate": 7.081651290714287e-07, + "loss": 0.5333, + "step": 3870 + }, + { + "epoch": 0.83, + "grad_norm": 0.18139459192752838, + "learning_rate": 7.063761355966642e-07, + "loss": 0.4855, + "step": 3871 + }, + { + "epoch": 0.83, + "grad_norm": 0.13663552701473236, + "learning_rate": 7.045892329231086e-07, + "loss": 0.5479, + "step": 3872 + }, + { + "epoch": 0.83, + "grad_norm": 0.1746217906475067, + "learning_rate": 7.028044219209046e-07, + "loss": 0.4923, + "step": 3873 + }, + { + "epoch": 0.83, + "grad_norm": 0.14870743453502655, + "learning_rate": 7.010217034591721e-07, + "loss": 0.5018, + "step": 3874 + }, + { + "epoch": 0.83, + "grad_norm": 0.1460588276386261, + "learning_rate": 6.992410784060166e-07, + "loss": 0.46, + "step": 3875 + }, + { + "epoch": 0.83, + "grad_norm": 0.1792103797197342, + "learning_rate": 6.974625476285191e-07, + "loss": 0.524, + "step": 3876 + }, + { + "epoch": 0.84, + "grad_norm": 0.18173110485076904, + "learning_rate": 6.956861119927472e-07, + "loss": 0.4626, + "step": 3877 + }, + { + "epoch": 0.84, + "grad_norm": 0.1377502679824829, + "learning_rate": 6.93911772363745e-07, + "loss": 0.5192, + "step": 3878 + }, + { + "epoch": 0.84, + "grad_norm": 0.17006491124629974, + "learning_rate": 6.921395296055333e-07, + "loss": 0.5051, + "step": 3879 + }, + { + "epoch": 0.84, + "grad_norm": 0.13877364993095398, + "learning_rate": 6.903693845811176e-07, + "loss": 0.5102, + "step": 3880 + }, + { + "epoch": 0.84, + "grad_norm": 0.17840033769607544, + "learning_rate": 6.886013381524753e-07, + "loss": 0.4961, + "step": 3881 + }, + { + "epoch": 0.84, + "grad_norm": 0.1865067183971405, + "learning_rate": 6.86835391180567e-07, + "loss": 0.5206, + "step": 3882 + }, + { + "epoch": 0.84, + "grad_norm": 0.20453877747058868, + "learning_rate": 6.850715445253297e-07, + "loss": 0.5632, + "step": 3883 + }, + { + "epoch": 0.84, + "grad_norm": 0.15611490607261658, + "learning_rate": 6.833097990456761e-07, + "loss": 0.5682, + "step": 3884 + }, + { + "epoch": 0.84, + "grad_norm": 0.15531837940216064, + "learning_rate": 6.815501555994986e-07, + "loss": 0.5113, + "step": 3885 + }, + { + "epoch": 0.84, + "grad_norm": 0.13591976463794708, + "learning_rate": 6.797926150436618e-07, + "loss": 0.5462, + "step": 3886 + }, + { + "epoch": 0.84, + "grad_norm": 0.1687079221010208, + "learning_rate": 6.780371782340101e-07, + "loss": 0.5001, + "step": 3887 + }, + { + "epoch": 0.84, + "grad_norm": 0.16073539853096008, + "learning_rate": 6.762838460253629e-07, + "loss": 0.4732, + "step": 3888 + }, + { + "epoch": 0.84, + "grad_norm": 0.14205871522426605, + "learning_rate": 6.745326192715107e-07, + "loss": 0.5361, + "step": 3889 + }, + { + "epoch": 0.84, + "grad_norm": 0.152149498462677, + "learning_rate": 6.727834988252258e-07, + "loss": 0.4968, + "step": 3890 + }, + { + "epoch": 0.84, + "grad_norm": 0.1546226292848587, + "learning_rate": 6.71036485538249e-07, + "loss": 0.5439, + "step": 3891 + }, + { + "epoch": 0.84, + "grad_norm": 0.1516941487789154, + "learning_rate": 6.692915802612965e-07, + "loss": 0.5259, + "step": 3892 + }, + { + "epoch": 0.84, + "grad_norm": 0.15468570590019226, + "learning_rate": 6.675487838440608e-07, + "loss": 0.4867, + "step": 3893 + }, + { + "epoch": 0.84, + "grad_norm": 0.15356989204883575, + "learning_rate": 6.658080971352026e-07, + "loss": 0.4858, + "step": 3894 + }, + { + "epoch": 0.84, + "grad_norm": 0.16798479855060577, + "learning_rate": 6.640695209823588e-07, + "loss": 0.5147, + "step": 3895 + }, + { + "epoch": 0.84, + "grad_norm": 0.15462863445281982, + "learning_rate": 6.623330562321378e-07, + "loss": 0.517, + "step": 3896 + }, + { + "epoch": 0.84, + "grad_norm": 0.13679371774196625, + "learning_rate": 6.605987037301204e-07, + "loss": 0.535, + "step": 3897 + }, + { + "epoch": 0.84, + "grad_norm": 0.12895052134990692, + "learning_rate": 6.588664643208559e-07, + "loss": 0.5082, + "step": 3898 + }, + { + "epoch": 0.84, + "grad_norm": 0.16611763834953308, + "learning_rate": 6.571363388478686e-07, + "loss": 0.495, + "step": 3899 + }, + { + "epoch": 0.84, + "grad_norm": 0.12473352998495102, + "learning_rate": 6.554083281536516e-07, + "loss": 0.5251, + "step": 3900 + }, + { + "epoch": 0.84, + "grad_norm": 0.15122053027153015, + "learning_rate": 6.53682433079667e-07, + "loss": 0.4709, + "step": 3901 + }, + { + "epoch": 0.84, + "grad_norm": 0.15693899989128113, + "learning_rate": 6.519586544663481e-07, + "loss": 0.4572, + "step": 3902 + }, + { + "epoch": 0.84, + "grad_norm": 0.18545718491077423, + "learning_rate": 6.502369931530977e-07, + "loss": 0.5047, + "step": 3903 + }, + { + "epoch": 0.84, + "grad_norm": 0.20683102309703827, + "learning_rate": 6.485174499782876e-07, + "loss": 0.476, + "step": 3904 + }, + { + "epoch": 0.84, + "grad_norm": 0.12381558865308762, + "learning_rate": 6.468000257792583e-07, + "loss": 0.5589, + "step": 3905 + }, + { + "epoch": 0.84, + "grad_norm": 0.22837506234645844, + "learning_rate": 6.450847213923162e-07, + "loss": 0.512, + "step": 3906 + }, + { + "epoch": 0.84, + "grad_norm": 0.15001285076141357, + "learning_rate": 6.433715376527383e-07, + "loss": 0.4689, + "step": 3907 + }, + { + "epoch": 0.84, + "grad_norm": 0.1989048719406128, + "learning_rate": 6.416604753947675e-07, + "loss": 0.4834, + "step": 3908 + }, + { + "epoch": 0.84, + "grad_norm": 0.23922041058540344, + "learning_rate": 6.399515354516139e-07, + "loss": 0.5496, + "step": 3909 + }, + { + "epoch": 0.84, + "grad_norm": 0.15358422696590424, + "learning_rate": 6.382447186554553e-07, + "loss": 0.5441, + "step": 3910 + }, + { + "epoch": 0.84, + "grad_norm": 0.19341875612735748, + "learning_rate": 6.365400258374327e-07, + "loss": 0.5052, + "step": 3911 + }, + { + "epoch": 0.84, + "grad_norm": 0.1362185925245285, + "learning_rate": 6.348374578276567e-07, + "loss": 0.5318, + "step": 3912 + }, + { + "epoch": 0.84, + "grad_norm": 0.172585591673851, + "learning_rate": 6.331370154551986e-07, + "loss": 0.5385, + "step": 3913 + }, + { + "epoch": 0.84, + "grad_norm": 0.16115382313728333, + "learning_rate": 6.314386995480987e-07, + "loss": 0.5018, + "step": 3914 + }, + { + "epoch": 0.84, + "grad_norm": 0.14296384155750275, + "learning_rate": 6.297425109333605e-07, + "loss": 0.5275, + "step": 3915 + }, + { + "epoch": 0.84, + "grad_norm": 0.16052164137363434, + "learning_rate": 6.280484504369505e-07, + "loss": 0.5066, + "step": 3916 + }, + { + "epoch": 0.84, + "grad_norm": 0.1424168050289154, + "learning_rate": 6.263565188838011e-07, + "loss": 0.4944, + "step": 3917 + }, + { + "epoch": 0.84, + "grad_norm": 0.1381656974554062, + "learning_rate": 6.246667170978049e-07, + "loss": 0.5041, + "step": 3918 + }, + { + "epoch": 0.84, + "grad_norm": 0.1506141573190689, + "learning_rate": 6.229790459018203e-07, + "loss": 0.5599, + "step": 3919 + }, + { + "epoch": 0.84, + "grad_norm": 0.142376109957695, + "learning_rate": 6.212935061176667e-07, + "loss": 0.5435, + "step": 3920 + }, + { + "epoch": 0.84, + "grad_norm": 0.1417161524295807, + "learning_rate": 6.196100985661258e-07, + "loss": 0.5334, + "step": 3921 + }, + { + "epoch": 0.84, + "grad_norm": 0.16186197102069855, + "learning_rate": 6.179288240669429e-07, + "loss": 0.5081, + "step": 3922 + }, + { + "epoch": 0.85, + "grad_norm": 0.18143245577812195, + "learning_rate": 6.162496834388204e-07, + "loss": 0.5346, + "step": 3923 + }, + { + "epoch": 0.85, + "grad_norm": 0.16008998453617096, + "learning_rate": 6.14572677499426e-07, + "loss": 0.5284, + "step": 3924 + }, + { + "epoch": 0.85, + "grad_norm": 0.1312318742275238, + "learning_rate": 6.12897807065384e-07, + "loss": 0.5083, + "step": 3925 + }, + { + "epoch": 0.85, + "grad_norm": 0.15271489322185516, + "learning_rate": 6.112250729522823e-07, + "loss": 0.5676, + "step": 3926 + }, + { + "epoch": 0.85, + "grad_norm": 0.1383863240480423, + "learning_rate": 6.095544759746663e-07, + "loss": 0.472, + "step": 3927 + }, + { + "epoch": 0.85, + "grad_norm": 0.14017315208911896, + "learning_rate": 6.078860169460416e-07, + "loss": 0.4941, + "step": 3928 + }, + { + "epoch": 0.85, + "grad_norm": 0.1404963880777359, + "learning_rate": 6.062196966788736e-07, + "loss": 0.5128, + "step": 3929 + }, + { + "epoch": 0.85, + "grad_norm": 0.1775158941745758, + "learning_rate": 6.045555159845828e-07, + "loss": 0.5326, + "step": 3930 + }, + { + "epoch": 0.85, + "grad_norm": 0.13232095539569855, + "learning_rate": 6.028934756735516e-07, + "loss": 0.4828, + "step": 3931 + }, + { + "epoch": 0.85, + "grad_norm": 0.14679361879825592, + "learning_rate": 6.012335765551186e-07, + "loss": 0.5059, + "step": 3932 + }, + { + "epoch": 0.85, + "grad_norm": 0.16096676886081696, + "learning_rate": 5.995758194375794e-07, + "loss": 0.4844, + "step": 3933 + }, + { + "epoch": 0.85, + "grad_norm": 0.17318318784236908, + "learning_rate": 5.979202051281891e-07, + "loss": 0.506, + "step": 3934 + }, + { + "epoch": 0.85, + "grad_norm": 0.1557616889476776, + "learning_rate": 5.962667344331535e-07, + "loss": 0.545, + "step": 3935 + }, + { + "epoch": 0.85, + "grad_norm": 0.1732773780822754, + "learning_rate": 5.946154081576411e-07, + "loss": 0.5198, + "step": 3936 + }, + { + "epoch": 0.85, + "grad_norm": 0.14775021374225616, + "learning_rate": 5.929662271057729e-07, + "loss": 0.5117, + "step": 3937 + }, + { + "epoch": 0.85, + "grad_norm": 0.2609883248806, + "learning_rate": 5.913191920806244e-07, + "loss": 0.495, + "step": 3938 + }, + { + "epoch": 0.85, + "grad_norm": 0.20081481337547302, + "learning_rate": 5.896743038842279e-07, + "loss": 0.51, + "step": 3939 + }, + { + "epoch": 0.85, + "grad_norm": 0.17543698847293854, + "learning_rate": 5.880315633175704e-07, + "loss": 0.5292, + "step": 3940 + }, + { + "epoch": 0.85, + "grad_norm": 0.15874987840652466, + "learning_rate": 5.863909711805915e-07, + "loss": 0.4689, + "step": 3941 + }, + { + "epoch": 0.85, + "grad_norm": 0.12618225812911987, + "learning_rate": 5.847525282721883e-07, + "loss": 0.4914, + "step": 3942 + }, + { + "epoch": 0.85, + "grad_norm": 0.12914496660232544, + "learning_rate": 5.831162353902048e-07, + "loss": 0.5027, + "step": 3943 + }, + { + "epoch": 0.85, + "grad_norm": 0.13037589192390442, + "learning_rate": 5.814820933314446e-07, + "loss": 0.5111, + "step": 3944 + }, + { + "epoch": 0.85, + "grad_norm": 0.14492201805114746, + "learning_rate": 5.798501028916587e-07, + "loss": 0.5404, + "step": 3945 + }, + { + "epoch": 0.85, + "grad_norm": 0.17597924172878265, + "learning_rate": 5.78220264865555e-07, + "loss": 0.4962, + "step": 3946 + }, + { + "epoch": 0.85, + "grad_norm": 0.1367659866809845, + "learning_rate": 5.76592580046792e-07, + "loss": 0.4624, + "step": 3947 + }, + { + "epoch": 0.85, + "grad_norm": 0.16057761013507843, + "learning_rate": 5.749670492279757e-07, + "loss": 0.4985, + "step": 3948 + }, + { + "epoch": 0.85, + "grad_norm": 0.1518482267856598, + "learning_rate": 5.733436732006692e-07, + "loss": 0.512, + "step": 3949 + }, + { + "epoch": 0.85, + "grad_norm": 0.14091022312641144, + "learning_rate": 5.717224527553811e-07, + "loss": 0.5218, + "step": 3950 + }, + { + "epoch": 0.85, + "grad_norm": 0.13686485588550568, + "learning_rate": 5.701033886815738e-07, + "loss": 0.4916, + "step": 3951 + }, + { + "epoch": 0.85, + "grad_norm": 0.13204288482666016, + "learning_rate": 5.684864817676583e-07, + "loss": 0.495, + "step": 3952 + }, + { + "epoch": 0.85, + "grad_norm": 0.2482197880744934, + "learning_rate": 5.668717328009954e-07, + "loss": 0.5075, + "step": 3953 + }, + { + "epoch": 0.85, + "grad_norm": 0.14586390554904938, + "learning_rate": 5.65259142567896e-07, + "loss": 0.504, + "step": 3954 + }, + { + "epoch": 0.85, + "grad_norm": 0.15525588393211365, + "learning_rate": 5.636487118536171e-07, + "loss": 0.5429, + "step": 3955 + }, + { + "epoch": 0.85, + "grad_norm": 0.1539800614118576, + "learning_rate": 5.620404414423674e-07, + "loss": 0.5228, + "step": 3956 + }, + { + "epoch": 0.85, + "grad_norm": 0.16243867576122284, + "learning_rate": 5.604343321173006e-07, + "loss": 0.5141, + "step": 3957 + }, + { + "epoch": 0.85, + "grad_norm": 0.17166343331336975, + "learning_rate": 5.588303846605187e-07, + "loss": 0.5474, + "step": 3958 + }, + { + "epoch": 0.85, + "grad_norm": 0.1559562087059021, + "learning_rate": 5.572285998530758e-07, + "loss": 0.4877, + "step": 3959 + }, + { + "epoch": 0.85, + "grad_norm": 0.12732228636741638, + "learning_rate": 5.556289784749653e-07, + "loss": 0.4967, + "step": 3960 + }, + { + "epoch": 0.85, + "grad_norm": 0.15208043158054352, + "learning_rate": 5.540315213051323e-07, + "loss": 0.5032, + "step": 3961 + }, + { + "epoch": 0.85, + "grad_norm": 0.14678843319416046, + "learning_rate": 5.524362291214652e-07, + "loss": 0.5706, + "step": 3962 + }, + { + "epoch": 0.85, + "grad_norm": 0.1634489744901657, + "learning_rate": 5.508431027008004e-07, + "loss": 0.4835, + "step": 3963 + }, + { + "epoch": 0.85, + "grad_norm": 0.13023607432842255, + "learning_rate": 5.492521428189179e-07, + "loss": 0.491, + "step": 3964 + }, + { + "epoch": 0.85, + "grad_norm": 0.13974343240261078, + "learning_rate": 5.476633502505436e-07, + "loss": 0.5619, + "step": 3965 + }, + { + "epoch": 0.85, + "grad_norm": 0.17008721828460693, + "learning_rate": 5.460767257693489e-07, + "loss": 0.4755, + "step": 3966 + }, + { + "epoch": 0.85, + "grad_norm": 0.20171983540058136, + "learning_rate": 5.444922701479465e-07, + "loss": 0.5274, + "step": 3967 + }, + { + "epoch": 0.85, + "grad_norm": 0.24001158773899078, + "learning_rate": 5.429099841578966e-07, + "loss": 0.5145, + "step": 3968 + }, + { + "epoch": 0.85, + "grad_norm": 0.15838083624839783, + "learning_rate": 5.413298685697005e-07, + "loss": 0.4835, + "step": 3969 + }, + { + "epoch": 0.86, + "grad_norm": 0.13740849494934082, + "learning_rate": 5.397519241528026e-07, + "loss": 0.4933, + "step": 3970 + }, + { + "epoch": 0.86, + "grad_norm": 0.15910400450229645, + "learning_rate": 5.381761516755907e-07, + "loss": 0.5559, + "step": 3971 + }, + { + "epoch": 0.86, + "grad_norm": 0.1526496410369873, + "learning_rate": 5.366025519053958e-07, + "loss": 0.5526, + "step": 3972 + }, + { + "epoch": 0.86, + "grad_norm": 0.14740879833698273, + "learning_rate": 5.350311256084895e-07, + "loss": 0.5, + "step": 3973 + }, + { + "epoch": 0.86, + "grad_norm": 0.12307767570018768, + "learning_rate": 5.334618735500868e-07, + "loss": 0.544, + "step": 3974 + }, + { + "epoch": 0.86, + "grad_norm": 0.1413116753101349, + "learning_rate": 5.3189479649434e-07, + "loss": 0.5074, + "step": 3975 + }, + { + "epoch": 0.86, + "grad_norm": 0.1412649154663086, + "learning_rate": 5.303298952043473e-07, + "loss": 0.5446, + "step": 3976 + }, + { + "epoch": 0.86, + "grad_norm": 0.17468446493148804, + "learning_rate": 5.287671704421437e-07, + "loss": 0.5217, + "step": 3977 + }, + { + "epoch": 0.86, + "grad_norm": 0.15514497458934784, + "learning_rate": 5.272066229687078e-07, + "loss": 0.542, + "step": 3978 + }, + { + "epoch": 0.86, + "grad_norm": 0.16648997366428375, + "learning_rate": 5.256482535439528e-07, + "loss": 0.4755, + "step": 3979 + }, + { + "epoch": 0.86, + "grad_norm": 0.14344756305217743, + "learning_rate": 5.24092062926736e-07, + "loss": 0.5393, + "step": 3980 + }, + { + "epoch": 0.86, + "grad_norm": 0.14399173855781555, + "learning_rate": 5.225380518748529e-07, + "loss": 0.4944, + "step": 3981 + }, + { + "epoch": 0.86, + "grad_norm": 0.16422103345394135, + "learning_rate": 5.209862211450351e-07, + "loss": 0.5151, + "step": 3982 + }, + { + "epoch": 0.86, + "grad_norm": 0.20136775076389313, + "learning_rate": 5.19436571492955e-07, + "loss": 0.4696, + "step": 3983 + }, + { + "epoch": 0.86, + "grad_norm": 0.16965395212173462, + "learning_rate": 5.17889103673222e-07, + "loss": 0.5225, + "step": 3984 + }, + { + "epoch": 0.86, + "grad_norm": 0.13450326025485992, + "learning_rate": 5.163438184393826e-07, + "loss": 0.5, + "step": 3985 + }, + { + "epoch": 0.86, + "grad_norm": 0.16451282799243927, + "learning_rate": 5.148007165439234e-07, + "loss": 0.4973, + "step": 3986 + }, + { + "epoch": 0.86, + "grad_norm": 0.13875506818294525, + "learning_rate": 5.13259798738262e-07, + "loss": 0.4976, + "step": 3987 + }, + { + "epoch": 0.86, + "grad_norm": 0.13691715896129608, + "learning_rate": 5.117210657727589e-07, + "loss": 0.5844, + "step": 3988 + }, + { + "epoch": 0.86, + "grad_norm": 0.16079024970531464, + "learning_rate": 5.101845183967041e-07, + "loss": 0.5084, + "step": 3989 + }, + { + "epoch": 0.86, + "grad_norm": 0.1576671302318573, + "learning_rate": 5.086501573583302e-07, + "loss": 0.5307, + "step": 3990 + }, + { + "epoch": 0.86, + "grad_norm": 0.14902909100055695, + "learning_rate": 5.071179834048018e-07, + "loss": 0.5562, + "step": 3991 + }, + { + "epoch": 0.86, + "grad_norm": 0.17067904770374298, + "learning_rate": 5.055879972822164e-07, + "loss": 0.5427, + "step": 3992 + }, + { + "epoch": 0.86, + "grad_norm": 0.23107197880744934, + "learning_rate": 5.040601997356098e-07, + "loss": 0.5028, + "step": 3993 + }, + { + "epoch": 0.86, + "grad_norm": 0.15796354413032532, + "learning_rate": 5.025345915089497e-07, + "loss": 0.5006, + "step": 3994 + }, + { + "epoch": 0.86, + "grad_norm": 0.15521222352981567, + "learning_rate": 5.010111733451384e-07, + "loss": 0.5438, + "step": 3995 + }, + { + "epoch": 0.86, + "grad_norm": 0.1400623768568039, + "learning_rate": 4.994899459860125e-07, + "loss": 0.5441, + "step": 3996 + }, + { + "epoch": 0.86, + "grad_norm": 0.15729603171348572, + "learning_rate": 4.979709101723407e-07, + "loss": 0.5244, + "step": 3997 + }, + { + "epoch": 0.86, + "grad_norm": 0.17316539585590363, + "learning_rate": 4.964540666438261e-07, + "loss": 0.5038, + "step": 3998 + }, + { + "epoch": 0.86, + "grad_norm": 0.16760565340518951, + "learning_rate": 4.949394161391013e-07, + "loss": 0.5128, + "step": 3999 + }, + { + "epoch": 0.86, + "grad_norm": 0.13866716623306274, + "learning_rate": 4.934269593957336e-07, + "loss": 0.5033, + "step": 4000 + }, + { + "epoch": 0.86, + "grad_norm": 0.17374561727046967, + "learning_rate": 4.919166971502215e-07, + "loss": 0.4985, + "step": 4001 + }, + { + "epoch": 0.86, + "grad_norm": 0.16311132907867432, + "learning_rate": 4.90408630137994e-07, + "loss": 0.5016, + "step": 4002 + }, + { + "epoch": 0.86, + "grad_norm": 0.15572021901607513, + "learning_rate": 4.889027590934131e-07, + "loss": 0.5121, + "step": 4003 + }, + { + "epoch": 0.86, + "grad_norm": 0.20856572687625885, + "learning_rate": 4.873990847497684e-07, + "loss": 0.5021, + "step": 4004 + }, + { + "epoch": 0.86, + "grad_norm": 0.15273533761501312, + "learning_rate": 4.85897607839283e-07, + "loss": 0.5781, + "step": 4005 + }, + { + "epoch": 0.86, + "grad_norm": 0.14332985877990723, + "learning_rate": 4.843983290931064e-07, + "loss": 0.4704, + "step": 4006 + }, + { + "epoch": 0.86, + "grad_norm": 0.17221957445144653, + "learning_rate": 4.829012492413215e-07, + "loss": 0.4858, + "step": 4007 + }, + { + "epoch": 0.86, + "grad_norm": 0.14145652949810028, + "learning_rate": 4.814063690129378e-07, + "loss": 0.5182, + "step": 4008 + }, + { + "epoch": 0.86, + "grad_norm": 0.15986113250255585, + "learning_rate": 4.799136891358952e-07, + "loss": 0.5424, + "step": 4009 + }, + { + "epoch": 0.86, + "grad_norm": 0.1356787085533142, + "learning_rate": 4.784232103370617e-07, + "loss": 0.494, + "step": 4010 + }, + { + "epoch": 0.86, + "grad_norm": 0.19140973687171936, + "learning_rate": 4.769349333422324e-07, + "loss": 0.4956, + "step": 4011 + }, + { + "epoch": 0.86, + "grad_norm": 0.14601151645183563, + "learning_rate": 4.7544885887613136e-07, + "loss": 0.5142, + "step": 4012 + }, + { + "epoch": 0.86, + "grad_norm": 0.16945038735866547, + "learning_rate": 4.739649876624108e-07, + "loss": 0.5068, + "step": 4013 + }, + { + "epoch": 0.86, + "grad_norm": 0.1639741212129593, + "learning_rate": 4.724833204236462e-07, + "loss": 0.4829, + "step": 4014 + }, + { + "epoch": 0.86, + "grad_norm": 0.21183674037456512, + "learning_rate": 4.710038578813469e-07, + "loss": 0.4902, + "step": 4015 + }, + { + "epoch": 0.87, + "grad_norm": 0.167417973279953, + "learning_rate": 4.695266007559407e-07, + "loss": 0.504, + "step": 4016 + }, + { + "epoch": 0.87, + "grad_norm": 0.18118150532245636, + "learning_rate": 4.6805154976678755e-07, + "loss": 0.5233, + "step": 4017 + }, + { + "epoch": 0.87, + "grad_norm": 0.16984857618808746, + "learning_rate": 4.6657870563217076e-07, + "loss": 0.5051, + "step": 4018 + }, + { + "epoch": 0.87, + "grad_norm": 0.17123106122016907, + "learning_rate": 4.651080690692972e-07, + "loss": 0.5429, + "step": 4019 + }, + { + "epoch": 0.87, + "grad_norm": 0.15946775674819946, + "learning_rate": 4.6363964079430166e-07, + "loss": 0.5523, + "step": 4020 + }, + { + "epoch": 0.87, + "grad_norm": 0.13110215961933136, + "learning_rate": 4.6217342152224233e-07, + "loss": 0.5525, + "step": 4021 + }, + { + "epoch": 0.87, + "grad_norm": 0.1535872519016266, + "learning_rate": 4.6070941196710186e-07, + "loss": 0.5344, + "step": 4022 + }, + { + "epoch": 0.87, + "grad_norm": 0.15114997327327728, + "learning_rate": 4.5924761284178834e-07, + "loss": 0.4776, + "step": 4023 + }, + { + "epoch": 0.87, + "grad_norm": 0.15840767323970795, + "learning_rate": 4.5778802485812956e-07, + "loss": 0.506, + "step": 4024 + }, + { + "epoch": 0.87, + "grad_norm": 0.1840353012084961, + "learning_rate": 4.5633064872688093e-07, + "loss": 0.5216, + "step": 4025 + }, + { + "epoch": 0.87, + "grad_norm": 0.18357300758361816, + "learning_rate": 4.548754851577175e-07, + "loss": 0.5406, + "step": 4026 + }, + { + "epoch": 0.87, + "grad_norm": 0.16347016394138336, + "learning_rate": 4.5342253485923803e-07, + "loss": 0.5085, + "step": 4027 + }, + { + "epoch": 0.87, + "grad_norm": 0.2056354433298111, + "learning_rate": 4.5197179853896654e-07, + "loss": 0.518, + "step": 4028 + }, + { + "epoch": 0.87, + "grad_norm": 0.1330898255109787, + "learning_rate": 4.505232769033435e-07, + "loss": 0.5138, + "step": 4029 + }, + { + "epoch": 0.87, + "grad_norm": 0.16567635536193848, + "learning_rate": 4.4907697065773523e-07, + "loss": 0.5258, + "step": 4030 + }, + { + "epoch": 0.87, + "grad_norm": 0.1845930814743042, + "learning_rate": 4.476328805064262e-07, + "loss": 0.5277, + "step": 4031 + }, + { + "epoch": 0.87, + "grad_norm": 0.1463019847869873, + "learning_rate": 4.4619100715262374e-07, + "loss": 0.4919, + "step": 4032 + }, + { + "epoch": 0.87, + "grad_norm": 0.12273728102445602, + "learning_rate": 4.447513512984558e-07, + "loss": 0.4665, + "step": 4033 + }, + { + "epoch": 0.87, + "grad_norm": 0.1603401154279709, + "learning_rate": 4.4331391364496934e-07, + "loss": 0.517, + "step": 4034 + }, + { + "epoch": 0.87, + "grad_norm": 0.15330933034420013, + "learning_rate": 4.4187869489213275e-07, + "loss": 0.5976, + "step": 4035 + }, + { + "epoch": 0.87, + "grad_norm": 0.21303099393844604, + "learning_rate": 4.404456957388309e-07, + "loss": 0.5608, + "step": 4036 + }, + { + "epoch": 0.87, + "grad_norm": 0.15875820815563202, + "learning_rate": 4.3901491688287113e-07, + "loss": 0.5394, + "step": 4037 + }, + { + "epoch": 0.87, + "grad_norm": 0.18736515939235687, + "learning_rate": 4.375863590209778e-07, + "loss": 0.4804, + "step": 4038 + }, + { + "epoch": 0.87, + "grad_norm": 0.21394529938697815, + "learning_rate": 4.3616002284879333e-07, + "loss": 0.5041, + "step": 4039 + }, + { + "epoch": 0.87, + "grad_norm": 0.18619798123836517, + "learning_rate": 4.3473590906088046e-07, + "loss": 0.5027, + "step": 4040 + }, + { + "epoch": 0.87, + "grad_norm": 0.16709107160568237, + "learning_rate": 4.3331401835071783e-07, + "loss": 0.4971, + "step": 4041 + }, + { + "epoch": 0.87, + "grad_norm": 0.1601034700870514, + "learning_rate": 4.3189435141070324e-07, + "loss": 0.5241, + "step": 4042 + }, + { + "epoch": 0.87, + "grad_norm": 0.15669238567352295, + "learning_rate": 4.304769089321481e-07, + "loss": 0.5291, + "step": 4043 + }, + { + "epoch": 0.87, + "grad_norm": 0.14634265005588531, + "learning_rate": 4.2906169160528424e-07, + "loss": 0.5253, + "step": 4044 + }, + { + "epoch": 0.87, + "grad_norm": 0.1932663768529892, + "learning_rate": 4.276487001192592e-07, + "loss": 0.5096, + "step": 4045 + }, + { + "epoch": 0.87, + "grad_norm": 0.12650729715824127, + "learning_rate": 4.262379351621354e-07, + "loss": 0.5037, + "step": 4046 + }, + { + "epoch": 0.87, + "grad_norm": 0.1783479005098343, + "learning_rate": 4.248293974208928e-07, + "loss": 0.5197, + "step": 4047 + }, + { + "epoch": 0.87, + "grad_norm": 0.17106756567955017, + "learning_rate": 4.2342308758142437e-07, + "loss": 0.4908, + "step": 4048 + }, + { + "epoch": 0.87, + "grad_norm": 0.1578291952610016, + "learning_rate": 4.220190063285401e-07, + "loss": 0.5028, + "step": 4049 + }, + { + "epoch": 0.87, + "grad_norm": 0.17856548726558685, + "learning_rate": 4.2061715434596475e-07, + "loss": 0.4998, + "step": 4050 + }, + { + "epoch": 0.87, + "grad_norm": 0.19411097466945648, + "learning_rate": 4.192175323163361e-07, + "loss": 0.5383, + "step": 4051 + }, + { + "epoch": 0.87, + "grad_norm": 0.1397572010755539, + "learning_rate": 4.1782014092120735e-07, + "loss": 0.4779, + "step": 4052 + }, + { + "epoch": 0.87, + "grad_norm": 0.12479076534509659, + "learning_rate": 4.164249808410459e-07, + "loss": 0.498, + "step": 4053 + }, + { + "epoch": 0.87, + "grad_norm": 0.13633649051189423, + "learning_rate": 4.150320527552304e-07, + "loss": 0.5257, + "step": 4054 + }, + { + "epoch": 0.87, + "grad_norm": 0.16726909577846527, + "learning_rate": 4.1364135734205556e-07, + "loss": 0.4955, + "step": 4055 + }, + { + "epoch": 0.87, + "grad_norm": 0.1693604290485382, + "learning_rate": 4.122528952787258e-07, + "loss": 0.5903, + "step": 4056 + }, + { + "epoch": 0.87, + "grad_norm": 0.13616541028022766, + "learning_rate": 4.1086666724136024e-07, + "loss": 0.4837, + "step": 4057 + }, + { + "epoch": 0.87, + "grad_norm": 0.14842045307159424, + "learning_rate": 4.0948267390498953e-07, + "loss": 0.4777, + "step": 4058 + }, + { + "epoch": 0.87, + "grad_norm": 0.15691286325454712, + "learning_rate": 4.0810091594355674e-07, + "loss": 0.4684, + "step": 4059 + }, + { + "epoch": 0.87, + "grad_norm": 0.20302332937717438, + "learning_rate": 4.067213940299136e-07, + "loss": 0.5461, + "step": 4060 + }, + { + "epoch": 0.87, + "grad_norm": 0.1701618880033493, + "learning_rate": 4.0534410883582673e-07, + "loss": 0.5253, + "step": 4061 + }, + { + "epoch": 0.88, + "grad_norm": 0.16087806224822998, + "learning_rate": 4.0396906103197244e-07, + "loss": 0.5728, + "step": 4062 + }, + { + "epoch": 0.88, + "grad_norm": 0.1731209009885788, + "learning_rate": 4.02596251287935e-07, + "loss": 0.4793, + "step": 4063 + }, + { + "epoch": 0.88, + "grad_norm": 0.15619364380836487, + "learning_rate": 4.01225680272212e-07, + "loss": 0.5675, + "step": 4064 + }, + { + "epoch": 0.88, + "grad_norm": 0.14686357975006104, + "learning_rate": 3.998573486522095e-07, + "loss": 0.5241, + "step": 4065 + }, + { + "epoch": 0.88, + "grad_norm": 0.14786110818386078, + "learning_rate": 3.984912570942434e-07, + "loss": 0.5098, + "step": 4066 + }, + { + "epoch": 0.88, + "grad_norm": 0.1765190064907074, + "learning_rate": 3.9712740626354e-07, + "loss": 0.6106, + "step": 4067 + }, + { + "epoch": 0.88, + "grad_norm": 0.14524182677268982, + "learning_rate": 3.9576579682423066e-07, + "loss": 0.5239, + "step": 4068 + }, + { + "epoch": 0.88, + "grad_norm": 0.18796804547309875, + "learning_rate": 3.9440642943936013e-07, + "loss": 0.4934, + "step": 4069 + }, + { + "epoch": 0.88, + "grad_norm": 0.13147200644016266, + "learning_rate": 3.930493047708761e-07, + "loss": 0.5417, + "step": 4070 + }, + { + "epoch": 0.88, + "grad_norm": 0.137882798910141, + "learning_rate": 3.916944234796399e-07, + "loss": 0.4724, + "step": 4071 + }, + { + "epoch": 0.88, + "grad_norm": 0.1864192932844162, + "learning_rate": 3.903417862254172e-07, + "loss": 0.4951, + "step": 4072 + }, + { + "epoch": 0.88, + "grad_norm": 0.14649604260921478, + "learning_rate": 3.8899139366687985e-07, + "loss": 0.5297, + "step": 4073 + }, + { + "epoch": 0.88, + "grad_norm": 0.19774487614631653, + "learning_rate": 3.876432464616103e-07, + "loss": 0.5174, + "step": 4074 + }, + { + "epoch": 0.88, + "grad_norm": 0.12834720313549042, + "learning_rate": 3.862973452660929e-07, + "loss": 0.523, + "step": 4075 + }, + { + "epoch": 0.88, + "grad_norm": 0.1609206348657608, + "learning_rate": 3.8495369073572266e-07, + "loss": 0.5635, + "step": 4076 + }, + { + "epoch": 0.88, + "grad_norm": 0.1672678291797638, + "learning_rate": 3.8361228352479795e-07, + "loss": 0.478, + "step": 4077 + }, + { + "epoch": 0.88, + "grad_norm": 0.13725100457668304, + "learning_rate": 3.822731242865235e-07, + "loss": 0.5276, + "step": 4078 + }, + { + "epoch": 0.88, + "grad_norm": 0.1619109809398651, + "learning_rate": 3.8093621367301103e-07, + "loss": 0.5497, + "step": 4079 + }, + { + "epoch": 0.88, + "grad_norm": 0.18122999370098114, + "learning_rate": 3.7960155233527364e-07, + "loss": 0.5882, + "step": 4080 + }, + { + "epoch": 0.88, + "grad_norm": 0.15119299292564392, + "learning_rate": 3.782691409232325e-07, + "loss": 0.4459, + "step": 4081 + }, + { + "epoch": 0.88, + "grad_norm": 0.164114847779274, + "learning_rate": 3.7693898008571205e-07, + "loss": 0.525, + "step": 4082 + }, + { + "epoch": 0.88, + "grad_norm": 0.1447734236717224, + "learning_rate": 3.75611070470438e-07, + "loss": 0.542, + "step": 4083 + }, + { + "epoch": 0.88, + "grad_norm": 0.16693483293056488, + "learning_rate": 3.742854127240464e-07, + "loss": 0.5254, + "step": 4084 + }, + { + "epoch": 0.88, + "grad_norm": 0.1440124213695526, + "learning_rate": 3.7296200749207034e-07, + "loss": 0.4841, + "step": 4085 + }, + { + "epoch": 0.88, + "grad_norm": 0.14543622732162476, + "learning_rate": 3.7164085541894937e-07, + "loss": 0.5613, + "step": 4086 + }, + { + "epoch": 0.88, + "grad_norm": 0.16933149099349976, + "learning_rate": 3.703219571480249e-07, + "loss": 0.5304, + "step": 4087 + }, + { + "epoch": 0.88, + "grad_norm": 0.14789710938930511, + "learning_rate": 3.690053133215399e-07, + "loss": 0.5256, + "step": 4088 + }, + { + "epoch": 0.88, + "grad_norm": 0.16581717133522034, + "learning_rate": 3.676909245806415e-07, + "loss": 0.5014, + "step": 4089 + }, + { + "epoch": 0.88, + "grad_norm": 0.13003475964069366, + "learning_rate": 3.663787915653777e-07, + "loss": 0.5366, + "step": 4090 + }, + { + "epoch": 0.88, + "grad_norm": 0.14335590600967407, + "learning_rate": 3.650689149146991e-07, + "loss": 0.5642, + "step": 4091 + }, + { + "epoch": 0.88, + "grad_norm": 0.20606780052185059, + "learning_rate": 3.6376129526645376e-07, + "loss": 0.5484, + "step": 4092 + }, + { + "epoch": 0.88, + "grad_norm": 0.1340981125831604, + "learning_rate": 3.624559332573957e-07, + "loss": 0.4645, + "step": 4093 + }, + { + "epoch": 0.88, + "grad_norm": 0.14133349061012268, + "learning_rate": 3.6115282952317807e-07, + "loss": 0.4575, + "step": 4094 + }, + { + "epoch": 0.88, + "grad_norm": 0.16861362755298615, + "learning_rate": 3.598519846983511e-07, + "loss": 0.4783, + "step": 4095 + }, + { + "epoch": 0.88, + "grad_norm": 0.126511812210083, + "learning_rate": 3.5855339941636867e-07, + "loss": 0.4925, + "step": 4096 + }, + { + "epoch": 0.88, + "grad_norm": 0.17741841077804565, + "learning_rate": 3.572570743095838e-07, + "loss": 0.4844, + "step": 4097 + }, + { + "epoch": 0.88, + "grad_norm": 0.13794460892677307, + "learning_rate": 3.5596301000924815e-07, + "loss": 0.5503, + "step": 4098 + }, + { + "epoch": 0.88, + "grad_norm": 0.14488175511360168, + "learning_rate": 3.546712071455127e-07, + "loss": 0.4982, + "step": 4099 + }, + { + "epoch": 0.88, + "grad_norm": 0.2083345204591751, + "learning_rate": 3.533816663474271e-07, + "loss": 0.4913, + "step": 4100 + }, + { + "epoch": 0.88, + "grad_norm": 0.15425090491771698, + "learning_rate": 3.5209438824293896e-07, + "loss": 0.5406, + "step": 4101 + }, + { + "epoch": 0.88, + "grad_norm": 0.19911810755729675, + "learning_rate": 3.508093734588952e-07, + "loss": 0.4975, + "step": 4102 + }, + { + "epoch": 0.88, + "grad_norm": 0.14716565608978271, + "learning_rate": 3.4952662262104033e-07, + "loss": 0.4834, + "step": 4103 + }, + { + "epoch": 0.88, + "grad_norm": 0.18182729184627533, + "learning_rate": 3.482461363540163e-07, + "loss": 0.5785, + "step": 4104 + }, + { + "epoch": 0.88, + "grad_norm": 0.16187834739685059, + "learning_rate": 3.46967915281361e-07, + "loss": 0.5109, + "step": 4105 + }, + { + "epoch": 0.88, + "grad_norm": 0.13375143706798553, + "learning_rate": 3.456919600255126e-07, + "loss": 0.4686, + "step": 4106 + }, + { + "epoch": 0.88, + "grad_norm": 0.16278614103794098, + "learning_rate": 3.4441827120780147e-07, + "loss": 0.5005, + "step": 4107 + }, + { + "epoch": 0.88, + "grad_norm": 0.2082134485244751, + "learning_rate": 3.4314684944845747e-07, + "loss": 0.4992, + "step": 4108 + }, + { + "epoch": 0.89, + "grad_norm": 0.134648859500885, + "learning_rate": 3.4187769536660533e-07, + "loss": 0.4795, + "step": 4109 + }, + { + "epoch": 0.89, + "grad_norm": 0.1557423323392868, + "learning_rate": 3.406108095802668e-07, + "loss": 0.5266, + "step": 4110 + }, + { + "epoch": 0.89, + "grad_norm": 0.15993043780326843, + "learning_rate": 3.393461927063585e-07, + "loss": 0.4967, + "step": 4111 + }, + { + "epoch": 0.89, + "grad_norm": 0.12335589528083801, + "learning_rate": 3.3808384536068997e-07, + "loss": 0.5044, + "step": 4112 + }, + { + "epoch": 0.89, + "grad_norm": 0.1581617146730423, + "learning_rate": 3.3682376815796834e-07, + "loss": 0.5087, + "step": 4113 + }, + { + "epoch": 0.89, + "grad_norm": 0.1484224498271942, + "learning_rate": 3.3556596171179455e-07, + "loss": 0.4655, + "step": 4114 + }, + { + "epoch": 0.89, + "grad_norm": 0.16247932612895966, + "learning_rate": 3.343104266346636e-07, + "loss": 0.4964, + "step": 4115 + }, + { + "epoch": 0.89, + "grad_norm": 0.18105369806289673, + "learning_rate": 3.3305716353796537e-07, + "loss": 0.5231, + "step": 4116 + }, + { + "epoch": 0.89, + "grad_norm": 0.16214075684547424, + "learning_rate": 3.3180617303198046e-07, + "loss": 0.546, + "step": 4117 + }, + { + "epoch": 0.89, + "grad_norm": 0.16236190497875214, + "learning_rate": 3.305574557258867e-07, + "loss": 0.4751, + "step": 4118 + }, + { + "epoch": 0.89, + "grad_norm": 0.1777781993150711, + "learning_rate": 3.2931101222775154e-07, + "loss": 0.5563, + "step": 4119 + }, + { + "epoch": 0.89, + "grad_norm": 0.17919768393039703, + "learning_rate": 3.2806684314453774e-07, + "loss": 0.4581, + "step": 4120 + }, + { + "epoch": 0.89, + "grad_norm": 0.16179294884204865, + "learning_rate": 3.2682494908209906e-07, + "loss": 0.5197, + "step": 4121 + }, + { + "epoch": 0.89, + "grad_norm": 0.14991389214992523, + "learning_rate": 3.255853306451823e-07, + "loss": 0.5574, + "step": 4122 + }, + { + "epoch": 0.89, + "grad_norm": 0.22062784433364868, + "learning_rate": 3.243479884374262e-07, + "loss": 0.5563, + "step": 4123 + }, + { + "epoch": 0.89, + "grad_norm": 0.15176159143447876, + "learning_rate": 3.2311292306135944e-07, + "loss": 0.4785, + "step": 4124 + }, + { + "epoch": 0.89, + "grad_norm": 0.1553657054901123, + "learning_rate": 3.2188013511840365e-07, + "loss": 0.5524, + "step": 4125 + }, + { + "epoch": 0.89, + "grad_norm": 0.2217596471309662, + "learning_rate": 3.2064962520887146e-07, + "loss": 0.4976, + "step": 4126 + }, + { + "epoch": 0.89, + "grad_norm": 0.1558333784341812, + "learning_rate": 3.194213939319646e-07, + "loss": 0.5259, + "step": 4127 + }, + { + "epoch": 0.89, + "grad_norm": 0.1418876200914383, + "learning_rate": 3.18195441885778e-07, + "loss": 0.5342, + "step": 4128 + }, + { + "epoch": 0.89, + "grad_norm": 0.12174227088689804, + "learning_rate": 3.169717696672936e-07, + "loss": 0.5027, + "step": 4129 + }, + { + "epoch": 0.89, + "grad_norm": 0.12317800521850586, + "learning_rate": 3.157503778723847e-07, + "loss": 0.5245, + "step": 4130 + }, + { + "epoch": 0.89, + "grad_norm": 0.14522142708301544, + "learning_rate": 3.145312670958156e-07, + "loss": 0.4962, + "step": 4131 + }, + { + "epoch": 0.89, + "grad_norm": 0.1550437957048416, + "learning_rate": 3.1331443793123585e-07, + "loss": 0.4934, + "step": 4132 + }, + { + "epoch": 0.89, + "grad_norm": 0.15259462594985962, + "learning_rate": 3.120998909711881e-07, + "loss": 0.4907, + "step": 4133 + }, + { + "epoch": 0.89, + "grad_norm": 0.12828658521175385, + "learning_rate": 3.108876268071009e-07, + "loss": 0.4977, + "step": 4134 + }, + { + "epoch": 0.89, + "grad_norm": 0.18181227147579193, + "learning_rate": 3.096776460292927e-07, + "loss": 0.4883, + "step": 4135 + }, + { + "epoch": 0.89, + "grad_norm": 0.16961906850337982, + "learning_rate": 3.0846994922697104e-07, + "loss": 0.5045, + "step": 4136 + }, + { + "epoch": 0.89, + "grad_norm": 0.14249150454998016, + "learning_rate": 3.072645369882271e-07, + "loss": 0.5097, + "step": 4137 + }, + { + "epoch": 0.89, + "grad_norm": 0.1759713590145111, + "learning_rate": 3.060614099000442e-07, + "loss": 0.4703, + "step": 4138 + }, + { + "epoch": 0.89, + "grad_norm": 0.1403285712003708, + "learning_rate": 3.048605685482892e-07, + "loss": 0.4779, + "step": 4139 + }, + { + "epoch": 0.89, + "grad_norm": 0.15652833878993988, + "learning_rate": 3.0366201351771983e-07, + "loss": 0.4843, + "step": 4140 + }, + { + "epoch": 0.89, + "grad_norm": 0.1837598830461502, + "learning_rate": 3.024657453919777e-07, + "loss": 0.5272, + "step": 4141 + }, + { + "epoch": 0.89, + "grad_norm": 0.1516779363155365, + "learning_rate": 3.0127176475359065e-07, + "loss": 0.5174, + "step": 4142 + }, + { + "epoch": 0.89, + "grad_norm": 0.15141400694847107, + "learning_rate": 3.0008007218397415e-07, + "loss": 0.5148, + "step": 4143 + }, + { + "epoch": 0.89, + "grad_norm": 0.1702878326177597, + "learning_rate": 2.988906682634285e-07, + "loss": 0.5279, + "step": 4144 + }, + { + "epoch": 0.89, + "grad_norm": 0.17491145431995392, + "learning_rate": 2.977035535711392e-07, + "loss": 0.5319, + "step": 4145 + }, + { + "epoch": 0.89, + "grad_norm": 0.16550621390342712, + "learning_rate": 2.965187286851784e-07, + "loss": 0.5071, + "step": 4146 + }, + { + "epoch": 0.89, + "grad_norm": 0.19162628054618835, + "learning_rate": 2.953361941825017e-07, + "loss": 0.5123, + "step": 4147 + }, + { + "epoch": 0.89, + "grad_norm": 0.15393443405628204, + "learning_rate": 2.941559506389513e-07, + "loss": 0.5035, + "step": 4148 + }, + { + "epoch": 0.89, + "grad_norm": 0.20023614168167114, + "learning_rate": 2.9297799862925136e-07, + "loss": 0.5585, + "step": 4149 + }, + { + "epoch": 0.89, + "grad_norm": 0.13044221699237823, + "learning_rate": 2.9180233872701247e-07, + "loss": 0.4811, + "step": 4150 + }, + { + "epoch": 0.89, + "grad_norm": 0.13851873576641083, + "learning_rate": 2.906289715047267e-07, + "loss": 0.5445, + "step": 4151 + }, + { + "epoch": 0.89, + "grad_norm": 0.15511353313922882, + "learning_rate": 2.894578975337703e-07, + "loss": 0.5014, + "step": 4152 + }, + { + "epoch": 0.89, + "grad_norm": 0.12846341729164124, + "learning_rate": 2.8828911738440713e-07, + "loss": 0.4618, + "step": 4153 + }, + { + "epoch": 0.89, + "grad_norm": 0.19906170666217804, + "learning_rate": 2.8712263162577636e-07, + "loss": 0.5116, + "step": 4154 + }, + { + "epoch": 0.9, + "grad_norm": 0.16838042438030243, + "learning_rate": 2.8595844082590695e-07, + "loss": 0.4673, + "step": 4155 + }, + { + "epoch": 0.9, + "grad_norm": 0.14445045590400696, + "learning_rate": 2.8479654555170546e-07, + "loss": 0.5171, + "step": 4156 + }, + { + "epoch": 0.9, + "grad_norm": 0.14654265344142914, + "learning_rate": 2.836369463689631e-07, + "loss": 0.5053, + "step": 4157 + }, + { + "epoch": 0.9, + "grad_norm": 0.14703992009162903, + "learning_rate": 2.8247964384235214e-07, + "loss": 0.5053, + "step": 4158 + }, + { + "epoch": 0.9, + "grad_norm": 0.15812741219997406, + "learning_rate": 2.813246385354268e-07, + "loss": 0.5083, + "step": 4159 + }, + { + "epoch": 0.9, + "grad_norm": 0.18599557876586914, + "learning_rate": 2.8017193101062377e-07, + "loss": 0.5569, + "step": 4160 + }, + { + "epoch": 0.9, + "grad_norm": 0.16005754470825195, + "learning_rate": 2.7902152182925746e-07, + "loss": 0.5608, + "step": 4161 + }, + { + "epoch": 0.9, + "grad_norm": 0.1266726851463318, + "learning_rate": 2.778734115515269e-07, + "loss": 0.536, + "step": 4162 + }, + { + "epoch": 0.9, + "grad_norm": 0.15541070699691772, + "learning_rate": 2.7672760073650996e-07, + "loss": 0.5001, + "step": 4163 + }, + { + "epoch": 0.9, + "grad_norm": 0.12981733679771423, + "learning_rate": 2.755840899421636e-07, + "loss": 0.5071, + "step": 4164 + }, + { + "epoch": 0.9, + "grad_norm": 0.14456294476985931, + "learning_rate": 2.744428797253268e-07, + "loss": 0.4775, + "step": 4165 + }, + { + "epoch": 0.9, + "grad_norm": 0.16282424330711365, + "learning_rate": 2.7330397064171787e-07, + "loss": 0.5036, + "step": 4166 + }, + { + "epoch": 0.9, + "grad_norm": 0.13770410418510437, + "learning_rate": 2.7216736324593316e-07, + "loss": 0.4779, + "step": 4167 + }, + { + "epoch": 0.9, + "grad_norm": 0.1373995691537857, + "learning_rate": 2.7103305809145106e-07, + "loss": 0.5212, + "step": 4168 + }, + { + "epoch": 0.9, + "grad_norm": 0.13789519667625427, + "learning_rate": 2.699010557306253e-07, + "loss": 0.5334, + "step": 4169 + }, + { + "epoch": 0.9, + "grad_norm": 0.15480732917785645, + "learning_rate": 2.687713567146899e-07, + "loss": 0.5061, + "step": 4170 + }, + { + "epoch": 0.9, + "grad_norm": 0.1765686720609665, + "learning_rate": 2.676439615937582e-07, + "loss": 0.4803, + "step": 4171 + }, + { + "epoch": 0.9, + "grad_norm": 0.13667207956314087, + "learning_rate": 2.665188709168215e-07, + "loss": 0.529, + "step": 4172 + }, + { + "epoch": 0.9, + "grad_norm": 0.1658671498298645, + "learning_rate": 2.6539608523174665e-07, + "loss": 0.5231, + "step": 4173 + }, + { + "epoch": 0.9, + "grad_norm": 0.1876867711544037, + "learning_rate": 2.642756050852796e-07, + "loss": 0.5373, + "step": 4174 + }, + { + "epoch": 0.9, + "grad_norm": 0.1558872014284134, + "learning_rate": 2.631574310230456e-07, + "loss": 0.5224, + "step": 4175 + }, + { + "epoch": 0.9, + "grad_norm": 0.15014420449733734, + "learning_rate": 2.620415635895429e-07, + "loss": 0.5415, + "step": 4176 + }, + { + "epoch": 0.9, + "grad_norm": 0.14560414850711823, + "learning_rate": 2.6092800332814914e-07, + "loss": 0.4483, + "step": 4177 + }, + { + "epoch": 0.9, + "grad_norm": 0.19036678969860077, + "learning_rate": 2.5981675078111835e-07, + "loss": 0.545, + "step": 4178 + }, + { + "epoch": 0.9, + "grad_norm": 0.3053010404109955, + "learning_rate": 2.587078064895804e-07, + "loss": 0.5174, + "step": 4179 + }, + { + "epoch": 0.9, + "grad_norm": 0.14357438683509827, + "learning_rate": 2.5760117099354163e-07, + "loss": 0.484, + "step": 4180 + }, + { + "epoch": 0.9, + "grad_norm": 0.1921653300523758, + "learning_rate": 2.5649684483188274e-07, + "loss": 0.5016, + "step": 4181 + }, + { + "epoch": 0.9, + "grad_norm": 0.15328127145767212, + "learning_rate": 2.5539482854236076e-07, + "loss": 0.4675, + "step": 4182 + }, + { + "epoch": 0.9, + "grad_norm": 0.16819103062152863, + "learning_rate": 2.5429512266160805e-07, + "loss": 0.4982, + "step": 4183 + }, + { + "epoch": 0.9, + "grad_norm": 0.16778436303138733, + "learning_rate": 2.531977277251324e-07, + "loss": 0.5475, + "step": 4184 + }, + { + "epoch": 0.9, + "grad_norm": 0.1751207560300827, + "learning_rate": 2.521026442673158e-07, + "loss": 0.5167, + "step": 4185 + }, + { + "epoch": 0.9, + "grad_norm": 0.16310207545757294, + "learning_rate": 2.510098728214133e-07, + "loss": 0.5168, + "step": 4186 + }, + { + "epoch": 0.9, + "grad_norm": 0.17500171065330505, + "learning_rate": 2.4991941391955654e-07, + "loss": 0.5662, + "step": 4187 + }, + { + "epoch": 0.9, + "grad_norm": 0.15705259144306183, + "learning_rate": 2.488312680927485e-07, + "loss": 0.5006, + "step": 4188 + }, + { + "epoch": 0.9, + "grad_norm": 0.18603339791297913, + "learning_rate": 2.4774543587086807e-07, + "loss": 0.5122, + "step": 4189 + }, + { + "epoch": 0.9, + "grad_norm": 0.18599247932434082, + "learning_rate": 2.466619177826668e-07, + "loss": 0.5189, + "step": 4190 + }, + { + "epoch": 0.9, + "grad_norm": 0.1684497594833374, + "learning_rate": 2.4558071435576813e-07, + "loss": 0.5395, + "step": 4191 + }, + { + "epoch": 0.9, + "grad_norm": 0.15068615972995758, + "learning_rate": 2.4450182611667096e-07, + "loss": 0.4888, + "step": 4192 + }, + { + "epoch": 0.9, + "grad_norm": 0.1370285153388977, + "learning_rate": 2.4342525359074385e-07, + "loss": 0.5118, + "step": 4193 + }, + { + "epoch": 0.9, + "grad_norm": 0.16400447487831116, + "learning_rate": 2.423509973022292e-07, + "loss": 0.4971, + "step": 4194 + }, + { + "epoch": 0.9, + "grad_norm": 0.22781065106391907, + "learning_rate": 2.4127905777424134e-07, + "loss": 0.5495, + "step": 4195 + }, + { + "epoch": 0.9, + "grad_norm": 0.1345801055431366, + "learning_rate": 2.4020943552876706e-07, + "loss": 0.509, + "step": 4196 + }, + { + "epoch": 0.9, + "grad_norm": 0.15502989292144775, + "learning_rate": 2.391421310866648e-07, + "loss": 0.5564, + "step": 4197 + }, + { + "epoch": 0.9, + "grad_norm": 0.13414627313613892, + "learning_rate": 2.3807714496766165e-07, + "loss": 0.5253, + "step": 4198 + }, + { + "epoch": 0.9, + "grad_norm": 0.1705794483423233, + "learning_rate": 2.370144776903599e-07, + "loss": 0.4849, + "step": 4199 + }, + { + "epoch": 0.9, + "grad_norm": 0.16182225942611694, + "learning_rate": 2.3595412977222897e-07, + "loss": 0.5487, + "step": 4200 + }, + { + "epoch": 0.9, + "grad_norm": 0.18094182014465332, + "learning_rate": 2.3489610172961143e-07, + "loss": 0.4966, + "step": 4201 + }, + { + "epoch": 0.91, + "grad_norm": 0.134856179356575, + "learning_rate": 2.3384039407771896e-07, + "loss": 0.5284, + "step": 4202 + }, + { + "epoch": 0.91, + "grad_norm": 0.12742473185062408, + "learning_rate": 2.327870073306332e-07, + "loss": 0.5371, + "step": 4203 + }, + { + "epoch": 0.91, + "grad_norm": 0.16482314467430115, + "learning_rate": 2.317359420013071e-07, + "loss": 0.5241, + "step": 4204 + }, + { + "epoch": 0.91, + "grad_norm": 0.17178313434123993, + "learning_rate": 2.306871986015613e-07, + "loss": 0.5146, + "step": 4205 + }, + { + "epoch": 0.91, + "grad_norm": 0.16056092083454132, + "learning_rate": 2.2964077764208615e-07, + "loss": 0.5389, + "step": 4206 + }, + { + "epoch": 0.91, + "grad_norm": 0.18820203840732574, + "learning_rate": 2.2859667963244236e-07, + "loss": 0.4964, + "step": 4207 + }, + { + "epoch": 0.91, + "grad_norm": 0.18173396587371826, + "learning_rate": 2.2755490508105716e-07, + "loss": 0.5323, + "step": 4208 + }, + { + "epoch": 0.91, + "grad_norm": 0.15220309793949127, + "learning_rate": 2.2651545449522972e-07, + "loss": 0.477, + "step": 4209 + }, + { + "epoch": 0.91, + "grad_norm": 0.17373429238796234, + "learning_rate": 2.254783283811246e-07, + "loss": 0.5649, + "step": 4210 + }, + { + "epoch": 0.91, + "grad_norm": 0.1504889577627182, + "learning_rate": 2.2444352724377505e-07, + "loss": 0.5183, + "step": 4211 + }, + { + "epoch": 0.91, + "grad_norm": 0.1400587409734726, + "learning_rate": 2.2341105158708408e-07, + "loss": 0.5101, + "step": 4212 + }, + { + "epoch": 0.91, + "grad_norm": 0.18521972000598907, + "learning_rate": 2.22380901913819e-07, + "loss": 0.4801, + "step": 4213 + }, + { + "epoch": 0.91, + "grad_norm": 0.1672522872686386, + "learning_rate": 2.2135307872561628e-07, + "loss": 0.4725, + "step": 4214 + }, + { + "epoch": 0.91, + "grad_norm": 0.15692496299743652, + "learning_rate": 2.2032758252298115e-07, + "loss": 0.5603, + "step": 4215 + }, + { + "epoch": 0.91, + "grad_norm": 0.18013040721416473, + "learning_rate": 2.1930441380528243e-07, + "loss": 0.5292, + "step": 4216 + }, + { + "epoch": 0.91, + "grad_norm": 0.14542804658412933, + "learning_rate": 2.182835730707583e-07, + "loss": 0.5056, + "step": 4217 + }, + { + "epoch": 0.91, + "grad_norm": 0.16739703714847565, + "learning_rate": 2.172650608165111e-07, + "loss": 0.4897, + "step": 4218 + }, + { + "epoch": 0.91, + "grad_norm": 0.16817772388458252, + "learning_rate": 2.1624887753851186e-07, + "loss": 0.515, + "step": 4219 + }, + { + "epoch": 0.91, + "grad_norm": 0.1342426985502243, + "learning_rate": 2.1523502373159367e-07, + "loss": 0.4996, + "step": 4220 + }, + { + "epoch": 0.91, + "grad_norm": 0.3892795741558075, + "learning_rate": 2.142234998894588e-07, + "loss": 0.4838, + "step": 4221 + }, + { + "epoch": 0.91, + "grad_norm": 0.16268621385097504, + "learning_rate": 2.1321430650467546e-07, + "loss": 0.5302, + "step": 4222 + }, + { + "epoch": 0.91, + "grad_norm": 0.12644894421100616, + "learning_rate": 2.1220744406867278e-07, + "loss": 0.5567, + "step": 4223 + }, + { + "epoch": 0.91, + "grad_norm": 0.1844691038131714, + "learning_rate": 2.112029130717491e-07, + "loss": 0.6264, + "step": 4224 + }, + { + "epoch": 0.91, + "grad_norm": 0.18971168994903564, + "learning_rate": 2.1020071400306429e-07, + "loss": 0.5327, + "step": 4225 + }, + { + "epoch": 0.91, + "grad_norm": 0.11999719589948654, + "learning_rate": 2.092008473506446e-07, + "loss": 0.5153, + "step": 4226 + }, + { + "epoch": 0.91, + "grad_norm": 0.12612876296043396, + "learning_rate": 2.0820331360138058e-07, + "loss": 0.4838, + "step": 4227 + }, + { + "epoch": 0.91, + "grad_norm": 0.15707595646381378, + "learning_rate": 2.072081132410253e-07, + "loss": 0.5158, + "step": 4228 + }, + { + "epoch": 0.91, + "grad_norm": 0.14865291118621826, + "learning_rate": 2.062152467541978e-07, + "loss": 0.512, + "step": 4229 + }, + { + "epoch": 0.91, + "grad_norm": 0.17846401035785675, + "learning_rate": 2.0522471462437798e-07, + "loss": 0.4865, + "step": 4230 + }, + { + "epoch": 0.91, + "grad_norm": 0.15374383330345154, + "learning_rate": 2.042365173339117e-07, + "loss": 0.5007, + "step": 4231 + }, + { + "epoch": 0.91, + "grad_norm": 0.14291195571422577, + "learning_rate": 2.0325065536400456e-07, + "loss": 0.5102, + "step": 4232 + }, + { + "epoch": 0.91, + "grad_norm": 0.12746839225292206, + "learning_rate": 2.02267129194727e-07, + "loss": 0.51, + "step": 4233 + }, + { + "epoch": 0.91, + "grad_norm": 0.16647961735725403, + "learning_rate": 2.0128593930501427e-07, + "loss": 0.5033, + "step": 4234 + }, + { + "epoch": 0.91, + "grad_norm": 0.16872666776180267, + "learning_rate": 2.0030708617265971e-07, + "loss": 0.4992, + "step": 4235 + }, + { + "epoch": 0.91, + "grad_norm": 0.13757802546024323, + "learning_rate": 1.9933057027432147e-07, + "loss": 0.5519, + "step": 4236 + }, + { + "epoch": 0.91, + "grad_norm": 0.15467625856399536, + "learning_rate": 1.9835639208551803e-07, + "loss": 0.5208, + "step": 4237 + }, + { + "epoch": 0.91, + "grad_norm": 0.14848686754703522, + "learning_rate": 1.9738455208063055e-07, + "loss": 0.5348, + "step": 4238 + }, + { + "epoch": 0.91, + "grad_norm": 0.18028004467487335, + "learning_rate": 1.9641505073290103e-07, + "loss": 0.5313, + "step": 4239 + }, + { + "epoch": 0.91, + "grad_norm": 0.19385437667369843, + "learning_rate": 1.9544788851443342e-07, + "loss": 0.5109, + "step": 4240 + }, + { + "epoch": 0.91, + "grad_norm": 0.1776755303144455, + "learning_rate": 1.944830658961927e-07, + "loss": 0.4881, + "step": 4241 + }, + { + "epoch": 0.91, + "grad_norm": 0.15795911848545074, + "learning_rate": 1.9352058334800195e-07, + "loss": 0.5299, + "step": 4242 + }, + { + "epoch": 0.91, + "grad_norm": 0.1372847557067871, + "learning_rate": 1.9256044133854846e-07, + "loss": 0.5026, + "step": 4243 + }, + { + "epoch": 0.91, + "grad_norm": 0.1478043794631958, + "learning_rate": 1.9160264033537824e-07, + "loss": 0.4663, + "step": 4244 + }, + { + "epoch": 0.91, + "grad_norm": 0.16185085475444794, + "learning_rate": 1.9064718080489596e-07, + "loss": 0.4501, + "step": 4245 + }, + { + "epoch": 0.91, + "grad_norm": 0.15890911221504211, + "learning_rate": 1.8969406321236727e-07, + "loss": 0.5688, + "step": 4246 + }, + { + "epoch": 0.91, + "grad_norm": 0.1747117042541504, + "learning_rate": 1.8874328802191867e-07, + "loss": 0.5213, + "step": 4247 + }, + { + "epoch": 0.92, + "grad_norm": 0.15179674327373505, + "learning_rate": 1.8779485569653422e-07, + "loss": 0.5252, + "step": 4248 + }, + { + "epoch": 0.92, + "grad_norm": 0.1634942591190338, + "learning_rate": 1.868487666980584e-07, + "loss": 0.4914, + "step": 4249 + }, + { + "epoch": 0.92, + "grad_norm": 0.13174600899219513, + "learning_rate": 1.859050214871927e-07, + "loss": 0.5337, + "step": 4250 + }, + { + "epoch": 0.92, + "grad_norm": 0.1955437809228897, + "learning_rate": 1.8496362052349893e-07, + "loss": 0.4623, + "step": 4251 + }, + { + "epoch": 0.92, + "grad_norm": 0.14583423733711243, + "learning_rate": 1.8402456426539706e-07, + "loss": 0.5413, + "step": 4252 + }, + { + "epoch": 0.92, + "grad_norm": 0.15666338801383972, + "learning_rate": 1.830878531701652e-07, + "loss": 0.4953, + "step": 4253 + }, + { + "epoch": 0.92, + "grad_norm": 0.15777826309204102, + "learning_rate": 1.8215348769393904e-07, + "loss": 0.5767, + "step": 4254 + }, + { + "epoch": 0.92, + "grad_norm": 0.1892169713973999, + "learning_rate": 1.8122146829171294e-07, + "loss": 0.5119, + "step": 4255 + }, + { + "epoch": 0.92, + "grad_norm": 0.13005930185317993, + "learning_rate": 1.8029179541733833e-07, + "loss": 0.5126, + "step": 4256 + }, + { + "epoch": 0.92, + "grad_norm": 0.15139774978160858, + "learning_rate": 1.7936446952352303e-07, + "loss": 0.5505, + "step": 4257 + }, + { + "epoch": 0.92, + "grad_norm": 0.18456581234931946, + "learning_rate": 1.7843949106183368e-07, + "loss": 0.4961, + "step": 4258 + }, + { + "epoch": 0.92, + "grad_norm": 0.1589013636112213, + "learning_rate": 1.7751686048269322e-07, + "loss": 0.5622, + "step": 4259 + }, + { + "epoch": 0.92, + "grad_norm": 0.16003479063510895, + "learning_rate": 1.7659657823538067e-07, + "loss": 0.5282, + "step": 4260 + }, + { + "epoch": 0.92, + "grad_norm": 0.15125080943107605, + "learning_rate": 1.7567864476803254e-07, + "loss": 0.4712, + "step": 4261 + }, + { + "epoch": 0.92, + "grad_norm": 0.14712797105312347, + "learning_rate": 1.747630605276407e-07, + "loss": 0.5371, + "step": 4262 + }, + { + "epoch": 0.92, + "grad_norm": 0.16056658327579498, + "learning_rate": 1.7384982596005352e-07, + "loss": 0.5107, + "step": 4263 + }, + { + "epoch": 0.92, + "grad_norm": 0.14780429005622864, + "learning_rate": 1.7293894150997414e-07, + "loss": 0.5311, + "step": 4264 + }, + { + "epoch": 0.92, + "grad_norm": 0.14676974713802338, + "learning_rate": 1.720304076209639e-07, + "loss": 0.4981, + "step": 4265 + }, + { + "epoch": 0.92, + "grad_norm": 0.14568917453289032, + "learning_rate": 1.711242247354372e-07, + "loss": 0.5237, + "step": 4266 + }, + { + "epoch": 0.92, + "grad_norm": 0.13407346606254578, + "learning_rate": 1.7022039329466333e-07, + "loss": 0.5197, + "step": 4267 + }, + { + "epoch": 0.92, + "grad_norm": 0.14667077362537384, + "learning_rate": 1.6931891373876852e-07, + "loss": 0.5155, + "step": 4268 + }, + { + "epoch": 0.92, + "grad_norm": 0.14774075150489807, + "learning_rate": 1.6841978650673218e-07, + "loss": 0.5261, + "step": 4269 + }, + { + "epoch": 0.92, + "grad_norm": 0.13004808127880096, + "learning_rate": 1.6752301203638854e-07, + "loss": 0.4793, + "step": 4270 + }, + { + "epoch": 0.92, + "grad_norm": 0.1556776911020279, + "learning_rate": 1.666285907644266e-07, + "loss": 0.5356, + "step": 4271 + }, + { + "epoch": 0.92, + "grad_norm": 0.1795538365840912, + "learning_rate": 1.657365231263891e-07, + "loss": 0.5119, + "step": 4272 + }, + { + "epoch": 0.92, + "grad_norm": 0.15904632210731506, + "learning_rate": 1.6484680955667354e-07, + "loss": 0.5485, + "step": 4273 + }, + { + "epoch": 0.92, + "grad_norm": 0.1445087045431137, + "learning_rate": 1.6395945048852947e-07, + "loss": 0.4932, + "step": 4274 + }, + { + "epoch": 0.92, + "grad_norm": 0.13816164433956146, + "learning_rate": 1.6307444635406011e-07, + "loss": 0.5038, + "step": 4275 + }, + { + "epoch": 0.92, + "grad_norm": 0.15279729664325714, + "learning_rate": 1.6219179758422465e-07, + "loss": 0.5235, + "step": 4276 + }, + { + "epoch": 0.92, + "grad_norm": 0.15122798085212708, + "learning_rate": 1.6131150460883038e-07, + "loss": 0.4975, + "step": 4277 + }, + { + "epoch": 0.92, + "grad_norm": 0.19103887677192688, + "learning_rate": 1.6043356785654273e-07, + "loss": 0.5026, + "step": 4278 + }, + { + "epoch": 0.92, + "grad_norm": 0.1535024344921112, + "learning_rate": 1.595579877548764e-07, + "loss": 0.5348, + "step": 4279 + }, + { + "epoch": 0.92, + "grad_norm": 0.17013922333717346, + "learning_rate": 1.5868476473019922e-07, + "loss": 0.528, + "step": 4280 + }, + { + "epoch": 0.92, + "grad_norm": 0.13540351390838623, + "learning_rate": 1.578138992077316e-07, + "loss": 0.5253, + "step": 4281 + }, + { + "epoch": 0.92, + "grad_norm": 0.14699843525886536, + "learning_rate": 1.5694539161154598e-07, + "loss": 0.4991, + "step": 4282 + }, + { + "epoch": 0.92, + "grad_norm": 0.1623685657978058, + "learning_rate": 1.560792423645663e-07, + "loss": 0.5254, + "step": 4283 + }, + { + "epoch": 0.92, + "grad_norm": 0.17117798328399658, + "learning_rate": 1.5521545188856734e-07, + "loss": 0.557, + "step": 4284 + }, + { + "epoch": 0.92, + "grad_norm": 0.16229890286922455, + "learning_rate": 1.5435402060417825e-07, + "loss": 0.5552, + "step": 4285 + }, + { + "epoch": 0.92, + "grad_norm": 0.28365910053253174, + "learning_rate": 1.5349494893087514e-07, + "loss": 0.5357, + "step": 4286 + }, + { + "epoch": 0.92, + "grad_norm": 0.1524672657251358, + "learning_rate": 1.526382372869878e-07, + "loss": 0.5343, + "step": 4287 + }, + { + "epoch": 0.92, + "grad_norm": 0.18612819910049438, + "learning_rate": 1.517838860896964e-07, + "loss": 0.4767, + "step": 4288 + }, + { + "epoch": 0.92, + "grad_norm": 0.15579423308372498, + "learning_rate": 1.50931895755031e-07, + "loss": 0.5174, + "step": 4289 + }, + { + "epoch": 0.92, + "grad_norm": 0.1574939489364624, + "learning_rate": 1.500822666978735e-07, + "loss": 0.4945, + "step": 4290 + }, + { + "epoch": 0.92, + "grad_norm": 0.13923248648643494, + "learning_rate": 1.492349993319536e-07, + "loss": 0.5056, + "step": 4291 + }, + { + "epoch": 0.92, + "grad_norm": 0.1429956555366516, + "learning_rate": 1.4839009406985295e-07, + "loss": 0.4775, + "step": 4292 + }, + { + "epoch": 0.92, + "grad_norm": 0.1344211846590042, + "learning_rate": 1.4754755132300292e-07, + "loss": 0.5308, + "step": 4293 + }, + { + "epoch": 0.92, + "grad_norm": 0.17861835658550262, + "learning_rate": 1.4670737150168257e-07, + "loss": 0.4766, + "step": 4294 + }, + { + "epoch": 0.93, + "grad_norm": 0.1777002215385437, + "learning_rate": 1.4586955501502186e-07, + "loss": 0.5361, + "step": 4295 + }, + { + "epoch": 0.93, + "grad_norm": 0.14904451370239258, + "learning_rate": 1.4503410227100057e-07, + "loss": 0.4903, + "step": 4296 + }, + { + "epoch": 0.93, + "grad_norm": 0.19658173620700836, + "learning_rate": 1.4420101367644602e-07, + "loss": 0.5013, + "step": 4297 + }, + { + "epoch": 0.93, + "grad_norm": 0.12814252078533173, + "learning_rate": 1.433702896370348e-07, + "loss": 0.5173, + "step": 4298 + }, + { + "epoch": 0.93, + "grad_norm": 0.1587502658367157, + "learning_rate": 1.4254193055729171e-07, + "loss": 0.5192, + "step": 4299 + }, + { + "epoch": 0.93, + "grad_norm": 0.16808383166790009, + "learning_rate": 1.417159368405907e-07, + "loss": 0.54, + "step": 4300 + }, + { + "epoch": 0.93, + "grad_norm": 0.14128008484840393, + "learning_rate": 1.408923088891534e-07, + "loss": 0.5069, + "step": 4301 + }, + { + "epoch": 0.93, + "grad_norm": 0.1637185662984848, + "learning_rate": 1.4007104710404838e-07, + "loss": 0.514, + "step": 4302 + }, + { + "epoch": 0.93, + "grad_norm": 0.1476011574268341, + "learning_rate": 1.3925215188519525e-07, + "loss": 0.5337, + "step": 4303 + }, + { + "epoch": 0.93, + "grad_norm": 0.1344112902879715, + "learning_rate": 1.384356236313572e-07, + "loss": 0.4939, + "step": 4304 + }, + { + "epoch": 0.93, + "grad_norm": 0.15150727331638336, + "learning_rate": 1.3762146274014842e-07, + "loss": 0.4818, + "step": 4305 + }, + { + "epoch": 0.93, + "grad_norm": 0.14989051222801208, + "learning_rate": 1.3680966960802623e-07, + "loss": 0.4746, + "step": 4306 + }, + { + "epoch": 0.93, + "grad_norm": 0.14494554698467255, + "learning_rate": 1.3600024463029938e-07, + "loss": 0.5037, + "step": 4307 + }, + { + "epoch": 0.93, + "grad_norm": 0.17142927646636963, + "learning_rate": 1.3519318820111983e-07, + "loss": 0.5133, + "step": 4308 + }, + { + "epoch": 0.93, + "grad_norm": 0.16990455985069275, + "learning_rate": 1.3438850071348874e-07, + "loss": 0.5251, + "step": 4309 + }, + { + "epoch": 0.93, + "grad_norm": 0.1605384796857834, + "learning_rate": 1.3358618255925214e-07, + "loss": 0.5038, + "step": 4310 + }, + { + "epoch": 0.93, + "grad_norm": 0.13191020488739014, + "learning_rate": 1.3278623412910308e-07, + "loss": 0.5257, + "step": 4311 + }, + { + "epoch": 0.93, + "grad_norm": 0.1355755478143692, + "learning_rate": 1.3198865581258046e-07, + "loss": 0.5244, + "step": 4312 + }, + { + "epoch": 0.93, + "grad_norm": 0.1625167280435562, + "learning_rate": 1.311934479980681e-07, + "loss": 0.4965, + "step": 4313 + }, + { + "epoch": 0.93, + "grad_norm": 0.18114399909973145, + "learning_rate": 1.3040061107279679e-07, + "loss": 0.5235, + "step": 4314 + }, + { + "epoch": 0.93, + "grad_norm": 0.15504209697246552, + "learning_rate": 1.2961014542284266e-07, + "loss": 0.5038, + "step": 4315 + }, + { + "epoch": 0.93, + "grad_norm": 0.20267391204833984, + "learning_rate": 1.2882205143312676e-07, + "loss": 0.4623, + "step": 4316 + }, + { + "epoch": 0.93, + "grad_norm": 0.1550229787826538, + "learning_rate": 1.280363294874154e-07, + "loss": 0.4784, + "step": 4317 + }, + { + "epoch": 0.93, + "grad_norm": 0.1660616248846054, + "learning_rate": 1.272529799683192e-07, + "loss": 0.4987, + "step": 4318 + }, + { + "epoch": 0.93, + "grad_norm": 0.15414029359817505, + "learning_rate": 1.264720032572947e-07, + "loss": 0.56, + "step": 4319 + }, + { + "epoch": 0.93, + "grad_norm": 0.18424440920352936, + "learning_rate": 1.2569339973464155e-07, + "loss": 0.4993, + "step": 4320 + }, + { + "epoch": 0.93, + "grad_norm": 0.1249246671795845, + "learning_rate": 1.249171697795054e-07, + "loss": 0.486, + "step": 4321 + }, + { + "epoch": 0.93, + "grad_norm": 0.15937843918800354, + "learning_rate": 1.2414331376987555e-07, + "loss": 0.5439, + "step": 4322 + }, + { + "epoch": 0.93, + "grad_norm": 0.16112691164016724, + "learning_rate": 1.233718320825833e-07, + "loss": 0.4971, + "step": 4323 + }, + { + "epoch": 0.93, + "grad_norm": 0.13961079716682434, + "learning_rate": 1.2260272509330707e-07, + "loss": 0.5513, + "step": 4324 + }, + { + "epoch": 0.93, + "grad_norm": 0.1391015648841858, + "learning_rate": 1.218359931765667e-07, + "loss": 0.5472, + "step": 4325 + }, + { + "epoch": 0.93, + "grad_norm": 0.1630607694387436, + "learning_rate": 1.2107163670572574e-07, + "loss": 0.5002, + "step": 4326 + }, + { + "epoch": 0.93, + "grad_norm": 0.16287516057491302, + "learning_rate": 1.2030965605299204e-07, + "loss": 0.4701, + "step": 4327 + }, + { + "epoch": 0.93, + "grad_norm": 0.12734615802764893, + "learning_rate": 1.195500515894149e-07, + "loss": 0.5591, + "step": 4328 + }, + { + "epoch": 0.93, + "grad_norm": 0.16435910761356354, + "learning_rate": 1.1879282368488787e-07, + "loss": 0.5503, + "step": 4329 + }, + { + "epoch": 0.93, + "grad_norm": 0.16866935789585114, + "learning_rate": 1.1803797270814765e-07, + "loss": 0.518, + "step": 4330 + }, + { + "epoch": 0.93, + "grad_norm": 0.17033065855503082, + "learning_rate": 1.1728549902677133e-07, + "loss": 0.4658, + "step": 4331 + }, + { + "epoch": 0.93, + "grad_norm": 0.18168850243091583, + "learning_rate": 1.165354030071808e-07, + "loss": 0.5154, + "step": 4332 + }, + { + "epoch": 0.93, + "grad_norm": 0.15495000779628754, + "learning_rate": 1.1578768501463722e-07, + "loss": 0.5399, + "step": 4333 + }, + { + "epoch": 0.93, + "grad_norm": 0.14426656067371368, + "learning_rate": 1.1504234541324765e-07, + "loss": 0.4739, + "step": 4334 + }, + { + "epoch": 0.93, + "grad_norm": 0.1701272577047348, + "learning_rate": 1.1429938456595735e-07, + "loss": 0.5633, + "step": 4335 + }, + { + "epoch": 0.93, + "grad_norm": 0.16904759407043457, + "learning_rate": 1.1355880283455523e-07, + "loss": 0.528, + "step": 4336 + }, + { + "epoch": 0.93, + "grad_norm": 0.1610129028558731, + "learning_rate": 1.1282060057967226e-07, + "loss": 0.5077, + "step": 4337 + }, + { + "epoch": 0.93, + "grad_norm": 0.1449388712644577, + "learning_rate": 1.1208477816077756e-07, + "loss": 0.5261, + "step": 4338 + }, + { + "epoch": 0.93, + "grad_norm": 0.17686261236667633, + "learning_rate": 1.1135133593618508e-07, + "loss": 0.5136, + "step": 4339 + }, + { + "epoch": 0.93, + "grad_norm": 0.13631290197372437, + "learning_rate": 1.1062027426304744e-07, + "loss": 0.5105, + "step": 4340 + }, + { + "epoch": 0.94, + "grad_norm": 0.15161027014255524, + "learning_rate": 1.0989159349735879e-07, + "loss": 0.5221, + "step": 4341 + }, + { + "epoch": 0.94, + "grad_norm": 0.15384641289710999, + "learning_rate": 1.091652939939547e-07, + "loss": 0.5192, + "step": 4342 + }, + { + "epoch": 0.94, + "grad_norm": 0.166702538728714, + "learning_rate": 1.084413761065084e-07, + "loss": 0.5481, + "step": 4343 + }, + { + "epoch": 0.94, + "grad_norm": 0.15912270545959473, + "learning_rate": 1.0771984018753733e-07, + "loss": 0.6039, + "step": 4344 + }, + { + "epoch": 0.94, + "grad_norm": 0.15669448673725128, + "learning_rate": 1.0700068658839491e-07, + "loss": 0.5047, + "step": 4345 + }, + { + "epoch": 0.94, + "grad_norm": 0.16294890642166138, + "learning_rate": 1.0628391565927765e-07, + "loss": 0.5736, + "step": 4346 + }, + { + "epoch": 0.94, + "grad_norm": 0.18943636119365692, + "learning_rate": 1.0556952774922136e-07, + "loss": 0.5612, + "step": 4347 + }, + { + "epoch": 0.94, + "grad_norm": 0.1898173987865448, + "learning_rate": 1.0485752320609944e-07, + "loss": 0.5456, + "step": 4348 + }, + { + "epoch": 0.94, + "grad_norm": 0.13543102145195007, + "learning_rate": 1.0414790237662676e-07, + "loss": 0.5888, + "step": 4349 + }, + { + "epoch": 0.94, + "grad_norm": 0.1901504397392273, + "learning_rate": 1.0344066560635635e-07, + "loss": 0.5364, + "step": 4350 + }, + { + "epoch": 0.94, + "grad_norm": 0.16581448912620544, + "learning_rate": 1.0273581323968052e-07, + "loss": 0.4779, + "step": 4351 + }, + { + "epoch": 0.94, + "grad_norm": 0.16107046604156494, + "learning_rate": 1.0203334561983025e-07, + "loss": 0.5074, + "step": 4352 + }, + { + "epoch": 0.94, + "grad_norm": 0.15327927470207214, + "learning_rate": 1.0133326308887692e-07, + "loss": 0.5471, + "step": 4353 + }, + { + "epoch": 0.94, + "grad_norm": 0.1985284835100174, + "learning_rate": 1.0063556598772839e-07, + "loss": 0.5462, + "step": 4354 + }, + { + "epoch": 0.94, + "grad_norm": 0.13533158600330353, + "learning_rate": 9.994025465613122e-08, + "loss": 0.5763, + "step": 4355 + }, + { + "epoch": 0.94, + "grad_norm": 0.19730281829833984, + "learning_rate": 9.924732943267068e-08, + "loss": 0.535, + "step": 4356 + }, + { + "epoch": 0.94, + "grad_norm": 0.18454429507255554, + "learning_rate": 9.855679065477131e-08, + "loss": 0.5222, + "step": 4357 + }, + { + "epoch": 0.94, + "grad_norm": 0.15890662372112274, + "learning_rate": 9.7868638658693e-08, + "loss": 0.4811, + "step": 4358 + }, + { + "epoch": 0.94, + "grad_norm": 0.181091770529747, + "learning_rate": 9.71828737795355e-08, + "loss": 0.5643, + "step": 4359 + }, + { + "epoch": 0.94, + "grad_norm": 0.13532613217830658, + "learning_rate": 9.6499496351235e-08, + "loss": 0.5115, + "step": 4360 + }, + { + "epoch": 0.94, + "grad_norm": 0.15786287188529968, + "learning_rate": 9.581850670656644e-08, + "loss": 0.5078, + "step": 4361 + }, + { + "epoch": 0.94, + "grad_norm": 0.1745007038116455, + "learning_rate": 9.513990517713955e-08, + "loss": 0.5805, + "step": 4362 + }, + { + "epoch": 0.94, + "grad_norm": 0.15297739207744598, + "learning_rate": 9.446369209340334e-08, + "loss": 0.4882, + "step": 4363 + }, + { + "epoch": 0.94, + "grad_norm": 0.1355600208044052, + "learning_rate": 9.378986778464327e-08, + "loss": 0.4854, + "step": 4364 + }, + { + "epoch": 0.94, + "grad_norm": 0.1561882495880127, + "learning_rate": 9.311843257898134e-08, + "loss": 0.491, + "step": 4365 + }, + { + "epoch": 0.94, + "grad_norm": 0.17752040922641754, + "learning_rate": 9.244938680337656e-08, + "loss": 0.5178, + "step": 4366 + }, + { + "epoch": 0.94, + "grad_norm": 0.12778738141059875, + "learning_rate": 9.178273078362332e-08, + "loss": 0.5, + "step": 4367 + }, + { + "epoch": 0.94, + "grad_norm": 0.1494607776403427, + "learning_rate": 9.111846484435361e-08, + "loss": 0.5469, + "step": 4368 + }, + { + "epoch": 0.94, + "grad_norm": 0.1332845240831375, + "learning_rate": 9.045658930903477e-08, + "loss": 0.5386, + "step": 4369 + }, + { + "epoch": 0.94, + "grad_norm": 0.18359340727329254, + "learning_rate": 8.979710449997014e-08, + "loss": 0.5668, + "step": 4370 + }, + { + "epoch": 0.94, + "grad_norm": 0.16064810752868652, + "learning_rate": 8.914001073829892e-08, + "loss": 0.5341, + "step": 4371 + }, + { + "epoch": 0.94, + "grad_norm": 0.14224553108215332, + "learning_rate": 8.848530834399683e-08, + "loss": 0.5512, + "step": 4372 + }, + { + "epoch": 0.94, + "grad_norm": 0.14381971955299377, + "learning_rate": 8.783299763587439e-08, + "loss": 0.5154, + "step": 4373 + }, + { + "epoch": 0.94, + "grad_norm": 0.1366354078054428, + "learning_rate": 8.718307893157696e-08, + "loss": 0.5354, + "step": 4374 + }, + { + "epoch": 0.94, + "grad_norm": 0.21582616865634918, + "learning_rate": 8.653555254758583e-08, + "loss": 0.5755, + "step": 4375 + }, + { + "epoch": 0.94, + "grad_norm": 0.18118129670619965, + "learning_rate": 8.589041879921711e-08, + "loss": 0.5604, + "step": 4376 + }, + { + "epoch": 0.94, + "grad_norm": 0.18753331899642944, + "learning_rate": 8.524767800062228e-08, + "loss": 0.5141, + "step": 4377 + }, + { + "epoch": 0.94, + "grad_norm": 0.15496698021888733, + "learning_rate": 8.460733046478653e-08, + "loss": 0.5408, + "step": 4378 + }, + { + "epoch": 0.94, + "grad_norm": 0.19295796751976013, + "learning_rate": 8.396937650353042e-08, + "loss": 0.5633, + "step": 4379 + }, + { + "epoch": 0.94, + "grad_norm": 0.16296663880348206, + "learning_rate": 8.333381642750882e-08, + "loss": 0.4816, + "step": 4380 + }, + { + "epoch": 0.94, + "grad_norm": 0.18352928757667542, + "learning_rate": 8.270065054621135e-08, + "loss": 0.521, + "step": 4381 + }, + { + "epoch": 0.94, + "grad_norm": 0.1636262685060501, + "learning_rate": 8.206987916796027e-08, + "loss": 0.492, + "step": 4382 + }, + { + "epoch": 0.94, + "grad_norm": 0.1417970508337021, + "learning_rate": 8.144150259991323e-08, + "loss": 0.4883, + "step": 4383 + }, + { + "epoch": 0.94, + "grad_norm": 0.15278513729572296, + "learning_rate": 8.081552114806101e-08, + "loss": 0.5371, + "step": 4384 + }, + { + "epoch": 0.94, + "grad_norm": 0.1443348526954651, + "learning_rate": 8.019193511722922e-08, + "loss": 0.4936, + "step": 4385 + }, + { + "epoch": 0.94, + "grad_norm": 0.17426589131355286, + "learning_rate": 7.957074481107551e-08, + "loss": 0.5743, + "step": 4386 + }, + { + "epoch": 0.95, + "grad_norm": 0.1521102637052536, + "learning_rate": 7.895195053209126e-08, + "loss": 0.5066, + "step": 4387 + }, + { + "epoch": 0.95, + "grad_norm": 0.1313631236553192, + "learning_rate": 7.833555258160208e-08, + "loss": 0.4878, + "step": 4388 + }, + { + "epoch": 0.95, + "grad_norm": 0.1430417150259018, + "learning_rate": 7.77215512597651e-08, + "loss": 0.5264, + "step": 4389 + }, + { + "epoch": 0.95, + "grad_norm": 0.1771220713853836, + "learning_rate": 7.710994686557172e-08, + "loss": 0.5333, + "step": 4390 + }, + { + "epoch": 0.95, + "grad_norm": 0.13800616562366486, + "learning_rate": 7.650073969684646e-08, + "loss": 0.5203, + "step": 4391 + }, + { + "epoch": 0.95, + "grad_norm": 0.1415596306324005, + "learning_rate": 7.589393005024482e-08, + "loss": 0.5199, + "step": 4392 + }, + { + "epoch": 0.95, + "grad_norm": 0.1424768567085266, + "learning_rate": 7.528951822125596e-08, + "loss": 0.4892, + "step": 4393 + }, + { + "epoch": 0.95, + "grad_norm": 0.15463979542255402, + "learning_rate": 7.468750450420114e-08, + "loss": 0.4966, + "step": 4394 + }, + { + "epoch": 0.95, + "grad_norm": 0.12930360436439514, + "learning_rate": 7.40878891922342e-08, + "loss": 0.525, + "step": 4395 + }, + { + "epoch": 0.95, + "grad_norm": 0.12379728257656097, + "learning_rate": 7.349067257733989e-08, + "loss": 0.532, + "step": 4396 + }, + { + "epoch": 0.95, + "grad_norm": 0.15126173198223114, + "learning_rate": 7.289585495033668e-08, + "loss": 0.5074, + "step": 4397 + }, + { + "epoch": 0.95, + "grad_norm": 0.1340315043926239, + "learning_rate": 7.230343660087402e-08, + "loss": 0.5003, + "step": 4398 + }, + { + "epoch": 0.95, + "grad_norm": 0.14905254542827606, + "learning_rate": 7.171341781743224e-08, + "loss": 0.5331, + "step": 4399 + }, + { + "epoch": 0.95, + "grad_norm": 0.13680437207221985, + "learning_rate": 7.11257988873243e-08, + "loss": 0.5186, + "step": 4400 + }, + { + "epoch": 0.95, + "grad_norm": 0.1248023733496666, + "learning_rate": 7.054058009669407e-08, + "loss": 0.4576, + "step": 4401 + }, + { + "epoch": 0.95, + "grad_norm": 0.14953729510307312, + "learning_rate": 6.995776173051583e-08, + "loss": 0.4709, + "step": 4402 + }, + { + "epoch": 0.95, + "grad_norm": 0.139199897646904, + "learning_rate": 6.937734407259756e-08, + "loss": 0.5412, + "step": 4403 + }, + { + "epoch": 0.95, + "grad_norm": 0.1763693392276764, + "learning_rate": 6.879932740557538e-08, + "loss": 0.5147, + "step": 4404 + }, + { + "epoch": 0.95, + "grad_norm": 0.17772704362869263, + "learning_rate": 6.822371201091749e-08, + "loss": 0.5658, + "step": 4405 + }, + { + "epoch": 0.95, + "grad_norm": 0.16532278060913086, + "learning_rate": 6.7650498168923e-08, + "loss": 0.4743, + "step": 4406 + }, + { + "epoch": 0.95, + "grad_norm": 0.17449362576007843, + "learning_rate": 6.707968615872085e-08, + "loss": 0.5396, + "step": 4407 + }, + { + "epoch": 0.95, + "grad_norm": 0.18282443284988403, + "learning_rate": 6.651127625827037e-08, + "loss": 0.5423, + "step": 4408 + }, + { + "epoch": 0.95, + "grad_norm": 0.13201217353343964, + "learning_rate": 6.594526874436236e-08, + "loss": 0.5364, + "step": 4409 + }, + { + "epoch": 0.95, + "grad_norm": 0.1461392194032669, + "learning_rate": 6.538166389261635e-08, + "loss": 0.5235, + "step": 4410 + }, + { + "epoch": 0.95, + "grad_norm": 0.15727302432060242, + "learning_rate": 6.482046197748282e-08, + "loss": 0.4949, + "step": 4411 + }, + { + "epoch": 0.95, + "grad_norm": 0.1405402272939682, + "learning_rate": 6.426166327224148e-08, + "loss": 0.5097, + "step": 4412 + }, + { + "epoch": 0.95, + "grad_norm": 0.14864054322242737, + "learning_rate": 6.3705268049003e-08, + "loss": 0.5318, + "step": 4413 + }, + { + "epoch": 0.95, + "grad_norm": 0.13717585802078247, + "learning_rate": 6.315127657870513e-08, + "loss": 0.4753, + "step": 4414 + }, + { + "epoch": 0.95, + "grad_norm": 0.15826748311519623, + "learning_rate": 6.259968913111869e-08, + "loss": 0.522, + "step": 4415 + }, + { + "epoch": 0.95, + "grad_norm": 0.1390410214662552, + "learning_rate": 6.205050597483997e-08, + "loss": 0.5485, + "step": 4416 + }, + { + "epoch": 0.95, + "grad_norm": 0.13676656782627106, + "learning_rate": 6.150372737729781e-08, + "loss": 0.5234, + "step": 4417 + }, + { + "epoch": 0.95, + "grad_norm": 0.20203281939029694, + "learning_rate": 6.095935360474814e-08, + "loss": 0.5139, + "step": 4418 + }, + { + "epoch": 0.95, + "grad_norm": 0.16294489800930023, + "learning_rate": 6.041738492227666e-08, + "loss": 0.5323, + "step": 4419 + }, + { + "epoch": 0.95, + "grad_norm": 0.19889448583126068, + "learning_rate": 5.98778215937973e-08, + "loss": 0.5655, + "step": 4420 + }, + { + "epoch": 0.95, + "grad_norm": 0.14263413846492767, + "learning_rate": 5.9340663882053727e-08, + "loss": 0.5585, + "step": 4421 + }, + { + "epoch": 0.95, + "grad_norm": 0.14396370947360992, + "learning_rate": 5.880591204861674e-08, + "loss": 0.5063, + "step": 4422 + }, + { + "epoch": 0.95, + "grad_norm": 0.1644524484872818, + "learning_rate": 5.827356635388692e-08, + "loss": 0.5243, + "step": 4423 + }, + { + "epoch": 0.95, + "grad_norm": 0.16655051708221436, + "learning_rate": 5.7743627057092463e-08, + "loss": 0.5033, + "step": 4424 + }, + { + "epoch": 0.95, + "grad_norm": 0.18211567401885986, + "learning_rate": 5.721609441629028e-08, + "loss": 0.5083, + "step": 4425 + }, + { + "epoch": 0.95, + "grad_norm": 0.14441342651844025, + "learning_rate": 5.669096868836377e-08, + "loss": 0.4764, + "step": 4426 + }, + { + "epoch": 0.95, + "grad_norm": 0.14789772033691406, + "learning_rate": 5.616825012902616e-08, + "loss": 0.4914, + "step": 4427 + }, + { + "epoch": 0.95, + "grad_norm": 0.1412544995546341, + "learning_rate": 5.564793899281884e-08, + "loss": 0.4267, + "step": 4428 + }, + { + "epoch": 0.95, + "grad_norm": 0.15629424154758453, + "learning_rate": 5.5130035533108587e-08, + "loss": 0.4807, + "step": 4429 + }, + { + "epoch": 0.95, + "grad_norm": 0.1984405219554901, + "learning_rate": 5.461454000209199e-08, + "loss": 0.5074, + "step": 4430 + }, + { + "epoch": 0.95, + "grad_norm": 0.15565001964569092, + "learning_rate": 5.410145265079103e-08, + "loss": 0.4852, + "step": 4431 + }, + { + "epoch": 0.95, + "grad_norm": 0.16649481654167175, + "learning_rate": 5.3590773729056965e-08, + "loss": 0.5007, + "step": 4432 + }, + { + "epoch": 0.95, + "grad_norm": 0.14113038778305054, + "learning_rate": 5.3082503485566425e-08, + "loss": 0.4873, + "step": 4433 + }, + { + "epoch": 0.96, + "grad_norm": 0.14539320766925812, + "learning_rate": 5.257664216782532e-08, + "loss": 0.4856, + "step": 4434 + }, + { + "epoch": 0.96, + "grad_norm": 0.1761976182460785, + "learning_rate": 5.2073190022164933e-08, + "loss": 0.4845, + "step": 4435 + }, + { + "epoch": 0.96, + "grad_norm": 0.1391577571630478, + "learning_rate": 5.157214729374305e-08, + "loss": 0.4913, + "step": 4436 + }, + { + "epoch": 0.96, + "grad_norm": 0.16997891664505005, + "learning_rate": 5.107351422654561e-08, + "loss": 0.5419, + "step": 4437 + }, + { + "epoch": 0.96, + "grad_norm": 0.15367954969406128, + "learning_rate": 5.057729106338505e-08, + "loss": 0.4658, + "step": 4438 + }, + { + "epoch": 0.96, + "grad_norm": 0.15846063196659088, + "learning_rate": 5.008347804589808e-08, + "loss": 0.5814, + "step": 4439 + }, + { + "epoch": 0.96, + "grad_norm": 0.18725064396858215, + "learning_rate": 4.959207541455013e-08, + "loss": 0.5488, + "step": 4440 + }, + { + "epoch": 0.96, + "grad_norm": 0.14484313130378723, + "learning_rate": 4.910308340863201e-08, + "loss": 0.4471, + "step": 4441 + }, + { + "epoch": 0.96, + "grad_norm": 0.1446012258529663, + "learning_rate": 4.8616502266261026e-08, + "loss": 0.5428, + "step": 4442 + }, + { + "epoch": 0.96, + "grad_norm": 0.17468306422233582, + "learning_rate": 4.813233222438041e-08, + "loss": 0.5287, + "step": 4443 + }, + { + "epoch": 0.96, + "grad_norm": 0.14374323189258575, + "learning_rate": 4.765057351875879e-08, + "loss": 0.5374, + "step": 4444 + }, + { + "epoch": 0.96, + "grad_norm": 0.14365346729755402, + "learning_rate": 4.7171226383990745e-08, + "loss": 0.5042, + "step": 4445 + }, + { + "epoch": 0.96, + "grad_norm": 0.16741974651813507, + "learning_rate": 4.6694291053496766e-08, + "loss": 0.5172, + "step": 4446 + }, + { + "epoch": 0.96, + "grad_norm": 0.15114641189575195, + "learning_rate": 4.621976775952386e-08, + "loss": 0.4949, + "step": 4447 + }, + { + "epoch": 0.96, + "grad_norm": 0.13638369739055634, + "learning_rate": 4.5747656733142184e-08, + "loss": 0.5654, + "step": 4448 + }, + { + "epoch": 0.96, + "grad_norm": 0.15733817219734192, + "learning_rate": 4.527795820424896e-08, + "loss": 0.5382, + "step": 4449 + }, + { + "epoch": 0.96, + "grad_norm": 0.18564561009407043, + "learning_rate": 4.481067240156678e-08, + "loss": 0.5269, + "step": 4450 + }, + { + "epoch": 0.96, + "grad_norm": 0.2260461002588272, + "learning_rate": 4.43457995526414e-08, + "loss": 0.546, + "step": 4451 + }, + { + "epoch": 0.96, + "grad_norm": 0.15831370651721954, + "learning_rate": 4.3883339883846186e-08, + "loss": 0.4982, + "step": 4452 + }, + { + "epoch": 0.96, + "grad_norm": 0.16516351699829102, + "learning_rate": 4.342329362037767e-08, + "loss": 0.5072, + "step": 4453 + }, + { + "epoch": 0.96, + "grad_norm": 0.16760680079460144, + "learning_rate": 4.296566098625776e-08, + "loss": 0.4515, + "step": 4454 + }, + { + "epoch": 0.96, + "grad_norm": 0.12296677380800247, + "learning_rate": 4.25104422043332e-08, + "loss": 0.4929, + "step": 4455 + }, + { + "epoch": 0.96, + "grad_norm": 0.16518919169902802, + "learning_rate": 4.2057637496273896e-08, + "loss": 0.524, + "step": 4456 + }, + { + "epoch": 0.96, + "grad_norm": 0.17474794387817383, + "learning_rate": 4.16072470825768e-08, + "loss": 0.5345, + "step": 4457 + }, + { + "epoch": 0.96, + "grad_norm": 0.13814187049865723, + "learning_rate": 4.115927118256036e-08, + "loss": 0.5051, + "step": 4458 + }, + { + "epoch": 0.96, + "grad_norm": 0.1405845582485199, + "learning_rate": 4.071371001436952e-08, + "loss": 0.4459, + "step": 4459 + }, + { + "epoch": 0.96, + "grad_norm": 0.13282142579555511, + "learning_rate": 4.02705637949724e-08, + "loss": 0.4892, + "step": 4460 + }, + { + "epoch": 0.96, + "grad_norm": 0.17903049290180206, + "learning_rate": 3.9829832740160834e-08, + "loss": 0.5045, + "step": 4461 + }, + { + "epoch": 0.96, + "grad_norm": 0.1339827924966812, + "learning_rate": 3.939151706455146e-08, + "loss": 0.5043, + "step": 4462 + }, + { + "epoch": 0.96, + "grad_norm": 0.1924246847629547, + "learning_rate": 3.895561698158357e-08, + "loss": 0.4559, + "step": 4463 + }, + { + "epoch": 0.96, + "grad_norm": 0.1578565388917923, + "learning_rate": 3.8522132703521784e-08, + "loss": 0.5406, + "step": 4464 + }, + { + "epoch": 0.96, + "grad_norm": 0.17296722531318665, + "learning_rate": 3.809106444145228e-08, + "loss": 0.5006, + "step": 4465 + }, + { + "epoch": 0.96, + "grad_norm": 0.15273821353912354, + "learning_rate": 3.7662412405286567e-08, + "loss": 0.5106, + "step": 4466 + }, + { + "epoch": 0.96, + "grad_norm": 0.13537849485874176, + "learning_rate": 3.723617680375935e-08, + "loss": 0.5051, + "step": 4467 + }, + { + "epoch": 0.96, + "grad_norm": 0.14120222628116608, + "learning_rate": 3.6812357844427385e-08, + "loss": 0.5358, + "step": 4468 + }, + { + "epoch": 0.96, + "grad_norm": 0.1762859970331192, + "learning_rate": 3.639095573367168e-08, + "loss": 0.5097, + "step": 4469 + }, + { + "epoch": 0.96, + "grad_norm": 0.13967743515968323, + "learning_rate": 3.597197067669533e-08, + "loss": 0.5434, + "step": 4470 + }, + { + "epoch": 0.96, + "grad_norm": 0.18413011729717255, + "learning_rate": 3.555540287752568e-08, + "loss": 0.5334, + "step": 4471 + }, + { + "epoch": 0.96, + "grad_norm": 0.2260027378797531, + "learning_rate": 3.514125253901324e-08, + "loss": 0.5245, + "step": 4472 + }, + { + "epoch": 0.96, + "grad_norm": 0.14992649853229523, + "learning_rate": 3.4729519862829466e-08, + "loss": 0.4908, + "step": 4473 + }, + { + "epoch": 0.96, + "grad_norm": 0.14780190587043762, + "learning_rate": 3.432020504947064e-08, + "loss": 0.5405, + "step": 4474 + }, + { + "epoch": 0.96, + "grad_norm": 0.1981695592403412, + "learning_rate": 3.3913308298253456e-08, + "loss": 0.5227, + "step": 4475 + }, + { + "epoch": 0.96, + "grad_norm": 0.23844297230243683, + "learning_rate": 3.350882980731884e-08, + "loss": 0.578, + "step": 4476 + }, + { + "epoch": 0.96, + "grad_norm": 0.17352676391601562, + "learning_rate": 3.310676977362925e-08, + "loss": 0.5591, + "step": 4477 + }, + { + "epoch": 0.96, + "grad_norm": 0.15774790942668915, + "learning_rate": 3.27071283929703e-08, + "loss": 0.5255, + "step": 4478 + }, + { + "epoch": 0.96, + "grad_norm": 0.1580473780632019, + "learning_rate": 3.230990585994964e-08, + "loss": 0.5217, + "step": 4479 + }, + { + "epoch": 0.97, + "grad_norm": 0.16399511694908142, + "learning_rate": 3.191510236799589e-08, + "loss": 0.4837, + "step": 4480 + }, + { + "epoch": 0.97, + "grad_norm": 0.14228610694408417, + "learning_rate": 3.152271810936081e-08, + "loss": 0.5381, + "step": 4481 + }, + { + "epoch": 0.97, + "grad_norm": 0.16083598136901855, + "learning_rate": 3.113275327511767e-08, + "loss": 0.4836, + "step": 4482 + }, + { + "epoch": 0.97, + "grad_norm": 0.11880119889974594, + "learning_rate": 3.074520805516235e-08, + "loss": 0.497, + "step": 4483 + }, + { + "epoch": 0.97, + "grad_norm": 0.1469106525182724, + "learning_rate": 3.0360082638211666e-08, + "loss": 0.5033, + "step": 4484 + }, + { + "epoch": 0.97, + "grad_norm": 0.1515989750623703, + "learning_rate": 2.997737721180338e-08, + "loss": 0.5238, + "step": 4485 + }, + { + "epoch": 0.97, + "grad_norm": 0.16306115686893463, + "learning_rate": 2.959709196229954e-08, + "loss": 0.5491, + "step": 4486 + }, + { + "epoch": 0.97, + "grad_norm": 0.14227931201457977, + "learning_rate": 2.921922707488034e-08, + "loss": 0.5258, + "step": 4487 + }, + { + "epoch": 0.97, + "grad_norm": 0.13914653658866882, + "learning_rate": 2.8843782733549706e-08, + "loss": 0.525, + "step": 4488 + }, + { + "epoch": 0.97, + "grad_norm": 0.16191960871219635, + "learning_rate": 2.847075912113195e-08, + "loss": 0.491, + "step": 4489 + }, + { + "epoch": 0.97, + "grad_norm": 0.2092602699995041, + "learning_rate": 2.8100156419272885e-08, + "loss": 0.5296, + "step": 4490 + }, + { + "epoch": 0.97, + "grad_norm": 0.1607397496700287, + "learning_rate": 2.7731974808439256e-08, + "loss": 0.5445, + "step": 4491 + }, + { + "epoch": 0.97, + "grad_norm": 0.15962082147598267, + "learning_rate": 2.7366214467919318e-08, + "loss": 0.5054, + "step": 4492 + }, + { + "epoch": 0.97, + "grad_norm": 0.1509018987417221, + "learning_rate": 2.7002875575820598e-08, + "loss": 0.5188, + "step": 4493 + }, + { + "epoch": 0.97, + "grad_norm": 0.1463000327348709, + "learning_rate": 2.664195830907379e-08, + "loss": 0.5455, + "step": 4494 + }, + { + "epoch": 0.97, + "grad_norm": 0.15445829927921295, + "learning_rate": 2.628346284342942e-08, + "loss": 0.4282, + "step": 4495 + }, + { + "epoch": 0.97, + "grad_norm": 0.1485372930765152, + "learning_rate": 2.5927389353457842e-08, + "loss": 0.5097, + "step": 4496 + }, + { + "epoch": 0.97, + "grad_norm": 0.1508139967918396, + "learning_rate": 2.5573738012550918e-08, + "loss": 0.5222, + "step": 4497 + }, + { + "epoch": 0.97, + "grad_norm": 0.1392061859369278, + "learning_rate": 2.5222508992922e-08, + "loss": 0.4993, + "step": 4498 + }, + { + "epoch": 0.97, + "grad_norm": 0.18485471606254578, + "learning_rate": 2.4873702465602612e-08, + "loss": 0.556, + "step": 4499 + }, + { + "epoch": 0.97, + "grad_norm": 0.14879798889160156, + "learning_rate": 2.4527318600446324e-08, + "loss": 0.53, + "step": 4500 + }, + { + "epoch": 0.97, + "grad_norm": 0.1910664439201355, + "learning_rate": 2.4183357566125998e-08, + "loss": 0.5285, + "step": 4501 + }, + { + "epoch": 0.97, + "grad_norm": 0.13035574555397034, + "learning_rate": 2.3841819530135424e-08, + "loss": 0.4912, + "step": 4502 + }, + { + "epoch": 0.97, + "grad_norm": 0.15490761399269104, + "learning_rate": 2.350270465878879e-08, + "loss": 0.4842, + "step": 4503 + }, + { + "epoch": 0.97, + "grad_norm": 0.16533678770065308, + "learning_rate": 2.3166013117218998e-08, + "loss": 0.5493, + "step": 4504 + }, + { + "epoch": 0.97, + "grad_norm": 0.15088367462158203, + "learning_rate": 2.2831745069379907e-08, + "loss": 0.5552, + "step": 4505 + }, + { + "epoch": 0.97, + "grad_norm": 0.17368783056735992, + "learning_rate": 2.249990067804464e-08, + "loss": 0.5297, + "step": 4506 + }, + { + "epoch": 0.97, + "grad_norm": 0.20666424930095673, + "learning_rate": 2.2170480104807268e-08, + "loss": 0.4992, + "step": 4507 + }, + { + "epoch": 0.97, + "grad_norm": 0.17731893062591553, + "learning_rate": 2.1843483510080032e-08, + "loss": 0.4926, + "step": 4508 + }, + { + "epoch": 0.97, + "grad_norm": 0.15629667043685913, + "learning_rate": 2.151891105309556e-08, + "loss": 0.4927, + "step": 4509 + }, + { + "epoch": 0.97, + "grad_norm": 0.13626927137374878, + "learning_rate": 2.119676289190631e-08, + "loss": 0.4622, + "step": 4510 + }, + { + "epoch": 0.97, + "grad_norm": 0.14321957528591156, + "learning_rate": 2.0877039183384018e-08, + "loss": 0.4869, + "step": 4511 + }, + { + "epoch": 0.97, + "grad_norm": 0.15675011277198792, + "learning_rate": 2.0559740083219147e-08, + "loss": 0.4736, + "step": 4512 + }, + { + "epoch": 0.97, + "grad_norm": 0.14412085711956024, + "learning_rate": 2.024486574592255e-08, + "loss": 0.457, + "step": 4513 + }, + { + "epoch": 0.97, + "grad_norm": 0.15253007411956787, + "learning_rate": 1.9932416324823235e-08, + "loss": 0.5338, + "step": 4514 + }, + { + "epoch": 0.97, + "grad_norm": 0.23936396837234497, + "learning_rate": 1.9622391972071164e-08, + "loss": 0.5333, + "step": 4515 + }, + { + "epoch": 0.97, + "grad_norm": 0.16216091811656952, + "learning_rate": 1.93147928386328e-08, + "loss": 0.445, + "step": 4516 + }, + { + "epoch": 0.97, + "grad_norm": 0.15972132980823517, + "learning_rate": 1.9009619074296102e-08, + "loss": 0.5372, + "step": 4517 + }, + { + "epoch": 0.97, + "grad_norm": 0.13417501747608185, + "learning_rate": 1.8706870827666646e-08, + "loss": 0.4829, + "step": 4518 + }, + { + "epoch": 0.97, + "grad_norm": 0.184243842959404, + "learning_rate": 1.840654824616872e-08, + "loss": 0.5087, + "step": 4519 + }, + { + "epoch": 0.97, + "grad_norm": 0.14154835045337677, + "learning_rate": 1.8108651476046457e-08, + "loss": 0.5314, + "step": 4520 + }, + { + "epoch": 0.97, + "grad_norm": 0.14898306131362915, + "learning_rate": 1.781318066236215e-08, + "loss": 0.536, + "step": 4521 + }, + { + "epoch": 0.97, + "grad_norm": 0.14588691294193268, + "learning_rate": 1.7520135948996263e-08, + "loss": 0.5138, + "step": 4522 + }, + { + "epoch": 0.97, + "grad_norm": 0.19737721979618073, + "learning_rate": 1.722951747864854e-08, + "loss": 0.5161, + "step": 4523 + }, + { + "epoch": 0.97, + "grad_norm": 0.16627101600170135, + "learning_rate": 1.6941325392837437e-08, + "loss": 0.573, + "step": 4524 + }, + { + "epoch": 0.97, + "grad_norm": 0.16760136187076569, + "learning_rate": 1.6655559831899038e-08, + "loss": 0.5447, + "step": 4525 + }, + { + "epoch": 0.97, + "grad_norm": 0.16920168697834015, + "learning_rate": 1.6372220934988693e-08, + "loss": 0.5242, + "step": 4526 + }, + { + "epoch": 0.98, + "grad_norm": 0.1605527251958847, + "learning_rate": 1.609130884007881e-08, + "loss": 0.5065, + "step": 4527 + }, + { + "epoch": 0.98, + "grad_norm": 0.16007407009601593, + "learning_rate": 1.5812823683962198e-08, + "loss": 0.4823, + "step": 4528 + }, + { + "epoch": 0.98, + "grad_norm": 0.15912877023220062, + "learning_rate": 1.5536765602248148e-08, + "loss": 0.4905, + "step": 4529 + }, + { + "epoch": 0.98, + "grad_norm": 0.1717677265405655, + "learning_rate": 1.5263134729363582e-08, + "loss": 0.5239, + "step": 4530 + }, + { + "epoch": 0.98, + "grad_norm": 0.16128626465797424, + "learning_rate": 1.49919311985558e-08, + "loss": 0.5036, + "step": 4531 + }, + { + "epoch": 0.98, + "grad_norm": 0.15915502607822418, + "learning_rate": 1.472315514188749e-08, + "loss": 0.4892, + "step": 4532 + }, + { + "epoch": 0.98, + "grad_norm": 0.18323585391044617, + "learning_rate": 1.4456806690241187e-08, + "loss": 0.487, + "step": 4533 + }, + { + "epoch": 0.98, + "grad_norm": 0.16480879485607147, + "learning_rate": 1.4192885973315918e-08, + "loss": 0.5064, + "step": 4534 + }, + { + "epoch": 0.98, + "grad_norm": 0.15716604888439178, + "learning_rate": 1.3931393119629987e-08, + "loss": 0.4886, + "step": 4535 + }, + { + "epoch": 0.98, + "grad_norm": 0.1497613936662674, + "learning_rate": 1.3672328256518208e-08, + "loss": 0.5372, + "step": 4536 + }, + { + "epoch": 0.98, + "grad_norm": 0.14100497961044312, + "learning_rate": 1.3415691510133555e-08, + "loss": 0.4763, + "step": 4537 + }, + { + "epoch": 0.98, + "grad_norm": 0.16514605283737183, + "learning_rate": 1.3161483005446618e-08, + "loss": 0.5506, + "step": 4538 + }, + { + "epoch": 0.98, + "grad_norm": 0.17945894598960876, + "learning_rate": 1.2909702866245045e-08, + "loss": 0.5278, + "step": 4539 + }, + { + "epoch": 0.98, + "grad_norm": 0.13528050482273102, + "learning_rate": 1.2660351215135203e-08, + "loss": 0.4926, + "step": 4540 + }, + { + "epoch": 0.98, + "grad_norm": 0.1794215887784958, + "learning_rate": 1.241342817353941e-08, + "loss": 0.4929, + "step": 4541 + }, + { + "epoch": 0.98, + "grad_norm": 0.14071007072925568, + "learning_rate": 1.2168933861698151e-08, + "loss": 0.502, + "step": 4542 + }, + { + "epoch": 0.98, + "grad_norm": 0.16157187521457672, + "learning_rate": 1.1926868398669522e-08, + "loss": 0.5321, + "step": 4543 + }, + { + "epoch": 0.98, + "grad_norm": 0.15778125822544098, + "learning_rate": 1.1687231902328122e-08, + "loss": 0.4767, + "step": 4544 + }, + { + "epoch": 0.98, + "grad_norm": 0.16504241526126862, + "learning_rate": 1.1450024489366163e-08, + "loss": 0.5074, + "step": 4545 + }, + { + "epoch": 0.98, + "grad_norm": 0.16821053624153137, + "learning_rate": 1.1215246275292913e-08, + "loss": 0.5654, + "step": 4546 + }, + { + "epoch": 0.98, + "grad_norm": 0.18481959402561188, + "learning_rate": 1.0982897374435252e-08, + "loss": 0.5467, + "step": 4547 + }, + { + "epoch": 0.98, + "grad_norm": 0.18275012075901031, + "learning_rate": 1.0752977899936013e-08, + "loss": 0.5656, + "step": 4548 + }, + { + "epoch": 0.98, + "grad_norm": 0.15175531804561615, + "learning_rate": 1.0525487963756186e-08, + "loss": 0.5455, + "step": 4549 + }, + { + "epoch": 0.98, + "grad_norm": 0.14397265017032623, + "learning_rate": 1.0300427676672164e-08, + "loss": 0.5424, + "step": 4550 + }, + { + "epoch": 0.98, + "grad_norm": 0.14812220633029938, + "learning_rate": 1.0077797148279056e-08, + "loss": 0.5515, + "step": 4551 + }, + { + "epoch": 0.98, + "grad_norm": 0.15202391147613525, + "learning_rate": 9.85759648698792e-09, + "loss": 0.5168, + "step": 4552 + }, + { + "epoch": 0.98, + "grad_norm": 0.221909299492836, + "learning_rate": 9.63982580002576e-09, + "loss": 0.5585, + "step": 4553 + }, + { + "epoch": 0.98, + "grad_norm": 0.1755637228488922, + "learning_rate": 9.42448519343775e-09, + "loss": 0.4478, + "step": 4554 + }, + { + "epoch": 0.98, + "grad_norm": 0.2023898959159851, + "learning_rate": 9.211574772085009e-09, + "loss": 0.5461, + "step": 4555 + }, + { + "epoch": 0.98, + "grad_norm": 0.26978346705436707, + "learning_rate": 9.001094639645158e-09, + "loss": 0.5124, + "step": 4556 + }, + { + "epoch": 0.98, + "grad_norm": 0.42439812421798706, + "learning_rate": 8.793044898612324e-09, + "loss": 0.4763, + "step": 4557 + }, + { + "epoch": 0.98, + "grad_norm": 0.1549844741821289, + "learning_rate": 8.587425650297688e-09, + "loss": 0.5193, + "step": 4558 + }, + { + "epoch": 0.98, + "grad_norm": 0.16406425833702087, + "learning_rate": 8.384236994828376e-09, + "loss": 0.5499, + "step": 4559 + }, + { + "epoch": 0.98, + "grad_norm": 0.15720784664154053, + "learning_rate": 8.183479031148022e-09, + "loss": 0.5331, + "step": 4560 + }, + { + "epoch": 0.98, + "grad_norm": 0.13800294697284698, + "learning_rate": 7.98515185701676e-09, + "loss": 0.493, + "step": 4561 + }, + { + "epoch": 0.98, + "grad_norm": 0.17194029688835144, + "learning_rate": 7.789255569011223e-09, + "loss": 0.5432, + "step": 4562 + }, + { + "epoch": 0.98, + "grad_norm": 0.13786643743515015, + "learning_rate": 7.595790262523995e-09, + "loss": 0.5465, + "step": 4563 + }, + { + "epoch": 0.98, + "grad_norm": 0.13633519411087036, + "learning_rate": 7.40475603176416e-09, + "loss": 0.4532, + "step": 4564 + }, + { + "epoch": 0.98, + "grad_norm": 0.1479789763689041, + "learning_rate": 7.216152969755641e-09, + "loss": 0.459, + "step": 4565 + }, + { + "epoch": 0.98, + "grad_norm": 0.15188181400299072, + "learning_rate": 7.029981168341082e-09, + "loss": 0.5184, + "step": 4566 + }, + { + "epoch": 0.98, + "grad_norm": 0.19617387652397156, + "learning_rate": 6.846240718176855e-09, + "loss": 0.4477, + "step": 4567 + }, + { + "epoch": 0.98, + "grad_norm": 0.1465783268213272, + "learning_rate": 6.664931708736943e-09, + "loss": 0.5164, + "step": 4568 + }, + { + "epoch": 0.98, + "grad_norm": 0.17284545302391052, + "learning_rate": 6.486054228309613e-09, + "loss": 0.5033, + "step": 4569 + }, + { + "epoch": 0.98, + "grad_norm": 0.1475616693496704, + "learning_rate": 6.309608364001296e-09, + "loss": 0.5787, + "step": 4570 + }, + { + "epoch": 0.98, + "grad_norm": 0.21322426199913025, + "learning_rate": 6.1355942017321534e-09, + "loss": 0.5463, + "step": 4571 + }, + { + "epoch": 0.98, + "grad_norm": 0.13938239216804504, + "learning_rate": 5.9640118262399575e-09, + "loss": 0.4885, + "step": 4572 + }, + { + "epoch": 0.99, + "grad_norm": 0.14598731696605682, + "learning_rate": 5.794861321077872e-09, + "loss": 0.5078, + "step": 4573 + }, + { + "epoch": 0.99, + "grad_norm": 0.13491961359977722, + "learning_rate": 5.628142768613343e-09, + "loss": 0.5065, + "step": 4574 + }, + { + "epoch": 0.99, + "grad_norm": 0.1279793381690979, + "learning_rate": 5.4638562500319844e-09, + "loss": 0.5256, + "step": 4575 + }, + { + "epoch": 0.99, + "grad_norm": 0.12754946947097778, + "learning_rate": 5.302001845333138e-09, + "loss": 0.5323, + "step": 4576 + }, + { + "epoch": 0.99, + "grad_norm": 0.1419224590063095, + "learning_rate": 5.1425796333332e-09, + "loss": 0.5048, + "step": 4577 + }, + { + "epoch": 0.99, + "grad_norm": 0.14779268205165863, + "learning_rate": 4.9855896916634065e-09, + "loss": 0.5018, + "step": 4578 + }, + { + "epoch": 0.99, + "grad_norm": 0.15771281719207764, + "learning_rate": 4.831032096770383e-09, + "loss": 0.54, + "step": 4579 + }, + { + "epoch": 0.99, + "grad_norm": 0.14732089638710022, + "learning_rate": 4.678906923916704e-09, + "loss": 0.5594, + "step": 4580 + }, + { + "epoch": 0.99, + "grad_norm": 0.13935159146785736, + "learning_rate": 4.529214247181446e-09, + "loss": 0.4833, + "step": 4581 + }, + { + "epoch": 0.99, + "grad_norm": 0.1624104082584381, + "learning_rate": 4.381954139457411e-09, + "loss": 0.5112, + "step": 4582 + }, + { + "epoch": 0.99, + "grad_norm": 0.14294303953647614, + "learning_rate": 4.237126672453351e-09, + "loss": 0.5499, + "step": 4583 + }, + { + "epoch": 0.99, + "grad_norm": 0.16533686220645905, + "learning_rate": 4.094731916693962e-09, + "loss": 0.5159, + "step": 4584 + }, + { + "epoch": 0.99, + "grad_norm": 0.14635160565376282, + "learning_rate": 3.9547699415198874e-09, + "loss": 0.5057, + "step": 4585 + }, + { + "epoch": 0.99, + "grad_norm": 0.149323508143425, + "learning_rate": 3.817240815084944e-09, + "loss": 0.4638, + "step": 4586 + }, + { + "epoch": 0.99, + "grad_norm": 0.19318562746047974, + "learning_rate": 3.68214460436056e-09, + "loss": 0.5073, + "step": 4587 + }, + { + "epoch": 0.99, + "grad_norm": 0.139207124710083, + "learning_rate": 3.5494813751324466e-09, + "loss": 0.5214, + "step": 4588 + }, + { + "epoch": 0.99, + "grad_norm": 0.16351597011089325, + "learning_rate": 3.4192511920011495e-09, + "loss": 0.548, + "step": 4589 + }, + { + "epoch": 0.99, + "grad_norm": 0.21700578927993774, + "learning_rate": 3.291454118383164e-09, + "loss": 0.5148, + "step": 4590 + }, + { + "epoch": 0.99, + "grad_norm": 0.1478326916694641, + "learning_rate": 3.1660902165098205e-09, + "loss": 0.5089, + "step": 4591 + }, + { + "epoch": 0.99, + "grad_norm": 0.13109348714351654, + "learning_rate": 3.043159547427843e-09, + "loss": 0.5693, + "step": 4592 + }, + { + "epoch": 0.99, + "grad_norm": 0.18486939370632172, + "learning_rate": 2.922662170998791e-09, + "loss": 0.5101, + "step": 4593 + }, + { + "epoch": 0.99, + "grad_norm": 0.23270238935947418, + "learning_rate": 2.804598145899062e-09, + "loss": 0.5173, + "step": 4594 + }, + { + "epoch": 0.99, + "grad_norm": 0.13426940143108368, + "learning_rate": 2.688967529621556e-09, + "loss": 0.5174, + "step": 4595 + }, + { + "epoch": 0.99, + "grad_norm": 0.1617153435945511, + "learning_rate": 2.575770378472342e-09, + "loss": 0.509, + "step": 4596 + }, + { + "epoch": 0.99, + "grad_norm": 0.17806501686573029, + "learning_rate": 2.4650067475734398e-09, + "loss": 0.5115, + "step": 4597 + }, + { + "epoch": 0.99, + "grad_norm": 0.16140803694725037, + "learning_rate": 2.3566766908622586e-09, + "loss": 0.5718, + "step": 4598 + }, + { + "epoch": 0.99, + "grad_norm": 0.1904280185699463, + "learning_rate": 2.25078026108938e-09, + "loss": 0.4756, + "step": 4599 + }, + { + "epoch": 0.99, + "grad_norm": 0.154715433716774, + "learning_rate": 2.1473175098229993e-09, + "loss": 0.5476, + "step": 4600 + }, + { + "epoch": 0.99, + "grad_norm": 0.13388168811798096, + "learning_rate": 2.046288487444481e-09, + "loss": 0.5101, + "step": 4601 + }, + { + "epoch": 0.99, + "grad_norm": 0.15827472507953644, + "learning_rate": 1.9476932431500286e-09, + "loss": 0.5506, + "step": 4602 + }, + { + "epoch": 0.99, + "grad_norm": 0.1361783891916275, + "learning_rate": 1.8515318249506809e-09, + "loss": 0.4876, + "step": 4603 + }, + { + "epoch": 0.99, + "grad_norm": 0.13444480299949646, + "learning_rate": 1.7578042796739803e-09, + "loss": 0.5557, + "step": 4604 + }, + { + "epoch": 0.99, + "grad_norm": 0.14045651257038116, + "learning_rate": 1.666510652960085e-09, + "loss": 0.533, + "step": 4605 + }, + { + "epoch": 0.99, + "grad_norm": 0.16582362353801727, + "learning_rate": 1.5776509892645453e-09, + "loss": 0.4909, + "step": 4606 + }, + { + "epoch": 0.99, + "grad_norm": 0.18087778985500336, + "learning_rate": 1.4912253318594138e-09, + "loss": 0.5907, + "step": 4607 + }, + { + "epoch": 0.99, + "grad_norm": 0.1612243801355362, + "learning_rate": 1.4072337228282496e-09, + "loss": 0.489, + "step": 4608 + }, + { + "epoch": 0.99, + "grad_norm": 0.1329774111509323, + "learning_rate": 1.3256762030727788e-09, + "loss": 0.4982, + "step": 4609 + }, + { + "epoch": 0.99, + "grad_norm": 0.15983089804649353, + "learning_rate": 1.2465528123073445e-09, + "loss": 0.554, + "step": 4610 + }, + { + "epoch": 0.99, + "grad_norm": 0.14717040956020355, + "learning_rate": 1.1698635890611264e-09, + "loss": 0.5216, + "step": 4611 + }, + { + "epoch": 0.99, + "grad_norm": 0.1585836112499237, + "learning_rate": 1.0956085706781416e-09, + "loss": 0.5291, + "step": 4612 + }, + { + "epoch": 0.99, + "grad_norm": 0.1522228866815567, + "learning_rate": 1.0237877933183538e-09, + "loss": 0.4878, + "step": 4613 + }, + { + "epoch": 0.99, + "grad_norm": 0.1510564535856247, + "learning_rate": 9.54401291953788e-10, + "loss": 0.454, + "step": 4614 + }, + { + "epoch": 0.99, + "grad_norm": 0.15678934752941132, + "learning_rate": 8.874491003735275e-10, + "loss": 0.4875, + "step": 4615 + }, + { + "epoch": 0.99, + "grad_norm": 0.14470191299915314, + "learning_rate": 8.229312511803811e-10, + "loss": 0.5576, + "step": 4616 + }, + { + "epoch": 0.99, + "grad_norm": 0.14706255495548248, + "learning_rate": 7.60847775790885e-10, + "loss": 0.5316, + "step": 4617 + }, + { + "epoch": 0.99, + "grad_norm": 0.18483339250087738, + "learning_rate": 7.011987044369673e-10, + "loss": 0.5227, + "step": 4618 + }, + { + "epoch": 0.99, + "grad_norm": 0.15726877748966217, + "learning_rate": 6.43984066165948e-10, + "loss": 0.4667, + "step": 4619 + }, + { + "epoch": 1.0, + "grad_norm": 0.17411313951015472, + "learning_rate": 5.892038888377638e-10, + "loss": 0.5399, + "step": 4620 + }, + { + "epoch": 1.0, + "grad_norm": 0.12193353474140167, + "learning_rate": 5.368581991282983e-10, + "loss": 0.5359, + "step": 4621 + }, + { + "epoch": 1.0, + "grad_norm": 0.1844024807214737, + "learning_rate": 4.869470225277174e-10, + "loss": 0.5551, + "step": 4622 + }, + { + "epoch": 1.0, + "grad_norm": 0.144753098487854, + "learning_rate": 4.3947038334046785e-10, + "loss": 0.5273, + "step": 4623 + }, + { + "epoch": 1.0, + "grad_norm": 0.18550604581832886, + "learning_rate": 3.9442830468472414e-10, + "loss": 0.5496, + "step": 4624 + }, + { + "epoch": 1.0, + "grad_norm": 0.1372082531452179, + "learning_rate": 3.5182080849516245e-10, + "loss": 0.4912, + "step": 4625 + }, + { + "epoch": 1.0, + "grad_norm": 0.16977030038833618, + "learning_rate": 3.1164791551907545e-10, + "loss": 0.4574, + "step": 4626 + }, + { + "epoch": 1.0, + "grad_norm": 0.15275530517101288, + "learning_rate": 2.739096453191481e-10, + "loss": 0.5382, + "step": 4627 + }, + { + "epoch": 1.0, + "grad_norm": 0.1399284303188324, + "learning_rate": 2.386060162717918e-10, + "loss": 0.5144, + "step": 4628 + }, + { + "epoch": 1.0, + "grad_norm": 0.18410004675388336, + "learning_rate": 2.05737045568255e-10, + "loss": 0.5252, + "step": 4629 + }, + { + "epoch": 1.0, + "grad_norm": 0.16072387993335724, + "learning_rate": 1.7530274921462308e-10, + "loss": 0.5003, + "step": 4630 + }, + { + "epoch": 1.0, + "grad_norm": 0.18865381181240082, + "learning_rate": 1.4730314203126318e-10, + "loss": 0.5048, + "step": 4631 + }, + { + "epoch": 1.0, + "grad_norm": 0.1395450234413147, + "learning_rate": 1.21738237651714e-10, + "loss": 0.4849, + "step": 4632 + }, + { + "epoch": 1.0, + "grad_norm": 0.13772456347942352, + "learning_rate": 9.860804852601658e-11, + "loss": 0.5158, + "step": 4633 + }, + { + "epoch": 1.0, + "grad_norm": 0.13184866309165955, + "learning_rate": 7.791258591682837e-11, + "loss": 0.526, + "step": 4634 + }, + { + "epoch": 1.0, + "grad_norm": 0.15045422315597534, + "learning_rate": 5.96518599021989e-11, + "loss": 0.5058, + "step": 4635 + }, + { + "epoch": 1.0, + "grad_norm": 0.15769906342029572, + "learning_rate": 4.382587937445948e-11, + "loss": 0.5534, + "step": 4636 + }, + { + "epoch": 1.0, + "grad_norm": 0.172768235206604, + "learning_rate": 3.0434652039668114e-11, + "loss": 0.5297, + "step": 4637 + }, + { + "epoch": 1.0, + "grad_norm": 0.13954681158065796, + "learning_rate": 1.947818441927485e-11, + "loss": 0.5442, + "step": 4638 + }, + { + "epoch": 1.0, + "grad_norm": 0.12631604075431824, + "learning_rate": 1.0956481847901323e-11, + "loss": 0.5348, + "step": 4639 + }, + { + "epoch": 1.0, + "grad_norm": 0.14625275135040283, + "learning_rate": 4.8695484761163145e-12, + "loss": 0.5268, + "step": 4640 + }, + { + "epoch": 1.0, + "grad_norm": 0.16151678562164307, + "learning_rate": 1.2173872671050746e-12, + "loss": 0.5801, + "step": 4641 + }, + { + "epoch": 1.0, + "grad_norm": 0.16961205005645752, + "learning_rate": 0.0, + "loss": 0.5504, + "step": 4642 + }, + { + "epoch": 1.0, + "step": 4642, + "total_flos": 1.0981284140833309e+19, + "train_loss": 0.5198679990626676, + "train_runtime": 67897.289, + "train_samples_per_second": 17.504, + "train_steps_per_second": 0.068 + } + ], + "logging_steps": 1.0, + "max_steps": 4642, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 8000, + "total_flos": 1.0981284140833309e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}