diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,115080 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.999878308487983, + "eval_steps": 500, + "global_step": 16434, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00012169151201703681, + "grad_norm": 103.34007263183594, + "learning_rate": 6.079027355623101e-08, + "loss": 2.0025, + "step": 1 + }, + { + "epoch": 0.00024338302403407362, + "grad_norm": 103.11289978027344, + "learning_rate": 1.2158054711246203e-07, + "loss": 2.0521, + "step": 2 + }, + { + "epoch": 0.0003650745360511104, + "grad_norm": 95.9359130859375, + "learning_rate": 1.8237082066869301e-07, + "loss": 1.8462, + "step": 3 + }, + { + "epoch": 0.00048676604806814725, + "grad_norm": 75.22039031982422, + "learning_rate": 2.4316109422492405e-07, + "loss": 1.7121, + "step": 4 + }, + { + "epoch": 0.000608457560085184, + "grad_norm": 69.626708984375, + "learning_rate": 3.0395136778115507e-07, + "loss": 1.4793, + "step": 5 + }, + { + "epoch": 0.0007301490721022208, + "grad_norm": 34.15979766845703, + "learning_rate": 3.6474164133738603e-07, + "loss": 1.3097, + "step": 6 + }, + { + "epoch": 0.0008518405841192577, + "grad_norm": 19.58428192138672, + "learning_rate": 4.2553191489361704e-07, + "loss": 1.0655, + "step": 7 + }, + { + "epoch": 0.0009735320961362945, + "grad_norm": 16.660463333129883, + "learning_rate": 4.863221884498481e-07, + "loss": 1.0143, + "step": 8 + }, + { + "epoch": 0.0010952236081533313, + "grad_norm": 28.945844650268555, + "learning_rate": 5.471124620060791e-07, + "loss": 0.99, + "step": 9 + }, + { + "epoch": 0.001216915120170368, + "grad_norm": 28.549259185791016, + "learning_rate": 6.079027355623101e-07, + "loss": 1.0168, + "step": 10 + }, + { + "epoch": 0.0013386066321874049, + "grad_norm": 32.18401336669922, + "learning_rate": 6.686930091185411e-07, + "loss": 0.9256, + "step": 11 + }, + { + "epoch": 0.0014602981442044416, + "grad_norm": 31.586956024169922, + "learning_rate": 7.294832826747721e-07, + "loss": 0.9215, + "step": 12 + }, + { + "epoch": 0.0015819896562214786, + "grad_norm": 28.776288986206055, + "learning_rate": 7.902735562310031e-07, + "loss": 0.9203, + "step": 13 + }, + { + "epoch": 0.0017036811682385154, + "grad_norm": 16.66475486755371, + "learning_rate": 8.510638297872341e-07, + "loss": 0.7229, + "step": 14 + }, + { + "epoch": 0.0018253726802555522, + "grad_norm": 12.193924903869629, + "learning_rate": 9.118541033434651e-07, + "loss": 0.7789, + "step": 15 + }, + { + "epoch": 0.001947064192272589, + "grad_norm": 25.486886978149414, + "learning_rate": 9.726443768996962e-07, + "loss": 0.8058, + "step": 16 + }, + { + "epoch": 0.002068755704289626, + "grad_norm": 47.33616256713867, + "learning_rate": 1.0334346504559272e-06, + "loss": 0.8348, + "step": 17 + }, + { + "epoch": 0.0021904472163066626, + "grad_norm": 51.361446380615234, + "learning_rate": 1.0942249240121581e-06, + "loss": 0.8964, + "step": 18 + }, + { + "epoch": 0.0023121387283236996, + "grad_norm": 50.1767578125, + "learning_rate": 1.155015197568389e-06, + "loss": 0.8765, + "step": 19 + }, + { + "epoch": 0.002433830240340736, + "grad_norm": 28.88109016418457, + "learning_rate": 1.2158054711246203e-06, + "loss": 0.6874, + "step": 20 + }, + { + "epoch": 0.002555521752357773, + "grad_norm": 10.250425338745117, + "learning_rate": 1.276595744680851e-06, + "loss": 0.7434, + "step": 21 + }, + { + "epoch": 0.0026772132643748097, + "grad_norm": 12.3607759475708, + "learning_rate": 1.3373860182370822e-06, + "loss": 0.7158, + "step": 22 + }, + { + "epoch": 0.0027989047763918467, + "grad_norm": 14.11135196685791, + "learning_rate": 1.3981762917933131e-06, + "loss": 0.7237, + "step": 23 + }, + { + "epoch": 0.0029205962884088833, + "grad_norm": 11.794168472290039, + "learning_rate": 1.4589665653495441e-06, + "loss": 0.7236, + "step": 24 + }, + { + "epoch": 0.0030422878004259203, + "grad_norm": 5.893219470977783, + "learning_rate": 1.519756838905775e-06, + "loss": 0.5845, + "step": 25 + }, + { + "epoch": 0.0031639793124429573, + "grad_norm": 5.568309783935547, + "learning_rate": 1.5805471124620062e-06, + "loss": 0.5906, + "step": 26 + }, + { + "epoch": 0.003285670824459994, + "grad_norm": 14.479249954223633, + "learning_rate": 1.6413373860182372e-06, + "loss": 0.5553, + "step": 27 + }, + { + "epoch": 0.003407362336477031, + "grad_norm": 5.768650531768799, + "learning_rate": 1.7021276595744682e-06, + "loss": 0.561, + "step": 28 + }, + { + "epoch": 0.0035290538484940674, + "grad_norm": 12.296796798706055, + "learning_rate": 1.7629179331306991e-06, + "loss": 0.6391, + "step": 29 + }, + { + "epoch": 0.0036507453605111044, + "grad_norm": 6.892004013061523, + "learning_rate": 1.8237082066869303e-06, + "loss": 0.5507, + "step": 30 + }, + { + "epoch": 0.003772436872528141, + "grad_norm": 4.593900680541992, + "learning_rate": 1.8844984802431613e-06, + "loss": 0.5387, + "step": 31 + }, + { + "epoch": 0.003894128384545178, + "grad_norm": 11.900116920471191, + "learning_rate": 1.9452887537993924e-06, + "loss": 0.6643, + "step": 32 + }, + { + "epoch": 0.0040158198965622146, + "grad_norm": 8.636366844177246, + "learning_rate": 2.0060790273556234e-06, + "loss": 0.6494, + "step": 33 + }, + { + "epoch": 0.004137511408579252, + "grad_norm": 11.49934196472168, + "learning_rate": 2.0668693009118543e-06, + "loss": 0.5801, + "step": 34 + }, + { + "epoch": 0.0042592029205962886, + "grad_norm": 12.536081314086914, + "learning_rate": 2.1276595744680853e-06, + "loss": 0.5851, + "step": 35 + }, + { + "epoch": 0.004380894432613325, + "grad_norm": 3.741614818572998, + "learning_rate": 2.1884498480243163e-06, + "loss": 0.5611, + "step": 36 + }, + { + "epoch": 0.004502585944630362, + "grad_norm": 4.851962089538574, + "learning_rate": 2.2492401215805472e-06, + "loss": 0.5617, + "step": 37 + }, + { + "epoch": 0.004624277456647399, + "grad_norm": 3.1746225357055664, + "learning_rate": 2.310030395136778e-06, + "loss": 0.4807, + "step": 38 + }, + { + "epoch": 0.004745968968664436, + "grad_norm": 3.161885976791382, + "learning_rate": 2.3708206686930096e-06, + "loss": 0.5301, + "step": 39 + }, + { + "epoch": 0.004867660480681472, + "grad_norm": 4.953426837921143, + "learning_rate": 2.4316109422492405e-06, + "loss": 0.5889, + "step": 40 + }, + { + "epoch": 0.00498935199269851, + "grad_norm": 5.341402530670166, + "learning_rate": 2.4924012158054715e-06, + "loss": 0.6252, + "step": 41 + }, + { + "epoch": 0.005111043504715546, + "grad_norm": 9.844038963317871, + "learning_rate": 2.553191489361702e-06, + "loss": 0.6178, + "step": 42 + }, + { + "epoch": 0.005232735016732583, + "grad_norm": 8.744702339172363, + "learning_rate": 2.613981762917933e-06, + "loss": 0.5627, + "step": 43 + }, + { + "epoch": 0.005354426528749619, + "grad_norm": 2.649850606918335, + "learning_rate": 2.6747720364741644e-06, + "loss": 0.4677, + "step": 44 + }, + { + "epoch": 0.005476118040766657, + "grad_norm": 13.687397956848145, + "learning_rate": 2.7355623100303953e-06, + "loss": 0.6101, + "step": 45 + }, + { + "epoch": 0.005597809552783693, + "grad_norm": 13.606250762939453, + "learning_rate": 2.7963525835866263e-06, + "loss": 0.6129, + "step": 46 + }, + { + "epoch": 0.00571950106480073, + "grad_norm": 10.961346626281738, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.6148, + "step": 47 + }, + { + "epoch": 0.0058411925768177666, + "grad_norm": 3.9125142097473145, + "learning_rate": 2.9179331306990882e-06, + "loss": 0.5812, + "step": 48 + }, + { + "epoch": 0.005962884088834804, + "grad_norm": 6.282345294952393, + "learning_rate": 2.978723404255319e-06, + "loss": 0.4988, + "step": 49 + }, + { + "epoch": 0.0060845756008518406, + "grad_norm": 6.259521484375, + "learning_rate": 3.03951367781155e-06, + "loss": 0.5785, + "step": 50 + }, + { + "epoch": 0.006206267112868877, + "grad_norm": 3.5301406383514404, + "learning_rate": 3.100303951367781e-06, + "loss": 0.5786, + "step": 51 + }, + { + "epoch": 0.0063279586248859146, + "grad_norm": 7.533758640289307, + "learning_rate": 3.1610942249240125e-06, + "loss": 0.5956, + "step": 52 + }, + { + "epoch": 0.006449650136902951, + "grad_norm": 4.951503753662109, + "learning_rate": 3.2218844984802434e-06, + "loss": 0.5074, + "step": 53 + }, + { + "epoch": 0.006571341648919988, + "grad_norm": 5.272761821746826, + "learning_rate": 3.2826747720364744e-06, + "loss": 0.5353, + "step": 54 + }, + { + "epoch": 0.006693033160937024, + "grad_norm": 9.48100471496582, + "learning_rate": 3.3434650455927054e-06, + "loss": 0.5857, + "step": 55 + }, + { + "epoch": 0.006814724672954062, + "grad_norm": 2.2276763916015625, + "learning_rate": 3.4042553191489363e-06, + "loss": 0.4564, + "step": 56 + }, + { + "epoch": 0.006936416184971098, + "grad_norm": 2.5077621936798096, + "learning_rate": 3.4650455927051673e-06, + "loss": 0.5416, + "step": 57 + }, + { + "epoch": 0.007058107696988135, + "grad_norm": 2.5162456035614014, + "learning_rate": 3.5258358662613982e-06, + "loss": 0.4984, + "step": 58 + }, + { + "epoch": 0.007179799209005172, + "grad_norm": 3.7858893871307373, + "learning_rate": 3.586626139817629e-06, + "loss": 0.444, + "step": 59 + }, + { + "epoch": 0.007301490721022209, + "grad_norm": 7.784074306488037, + "learning_rate": 3.6474164133738606e-06, + "loss": 0.5784, + "step": 60 + }, + { + "epoch": 0.007423182233039245, + "grad_norm": 6.502420902252197, + "learning_rate": 3.7082066869300915e-06, + "loss": 0.5378, + "step": 61 + }, + { + "epoch": 0.007544873745056282, + "grad_norm": 3.627375602722168, + "learning_rate": 3.7689969604863225e-06, + "loss": 0.531, + "step": 62 + }, + { + "epoch": 0.007666565257073319, + "grad_norm": 6.836421012878418, + "learning_rate": 3.8297872340425535e-06, + "loss": 0.5663, + "step": 63 + }, + { + "epoch": 0.007788256769090356, + "grad_norm": 7.055088520050049, + "learning_rate": 3.890577507598785e-06, + "loss": 0.5491, + "step": 64 + }, + { + "epoch": 0.007909948281107393, + "grad_norm": 8.892171859741211, + "learning_rate": 3.951367781155015e-06, + "loss": 0.4521, + "step": 65 + }, + { + "epoch": 0.008031639793124429, + "grad_norm": 3.930820941925049, + "learning_rate": 4.012158054711247e-06, + "loss": 0.52, + "step": 66 + }, + { + "epoch": 0.008153331305141466, + "grad_norm": 12.123941421508789, + "learning_rate": 4.072948328267477e-06, + "loss": 0.6482, + "step": 67 + }, + { + "epoch": 0.008275022817158504, + "grad_norm": 2.9747817516326904, + "learning_rate": 4.133738601823709e-06, + "loss": 0.5113, + "step": 68 + }, + { + "epoch": 0.00839671432917554, + "grad_norm": 3.337928533554077, + "learning_rate": 4.194528875379939e-06, + "loss": 0.5408, + "step": 69 + }, + { + "epoch": 0.008518405841192577, + "grad_norm": 9.0592622756958, + "learning_rate": 4.255319148936171e-06, + "loss": 0.5251, + "step": 70 + }, + { + "epoch": 0.008640097353209614, + "grad_norm": 3.707595109939575, + "learning_rate": 4.316109422492402e-06, + "loss": 0.5173, + "step": 71 + }, + { + "epoch": 0.00876178886522665, + "grad_norm": 1.4675965309143066, + "learning_rate": 4.3768996960486325e-06, + "loss": 0.4422, + "step": 72 + }, + { + "epoch": 0.008883480377243687, + "grad_norm": 10.405287742614746, + "learning_rate": 4.437689969604864e-06, + "loss": 0.6371, + "step": 73 + }, + { + "epoch": 0.009005171889260723, + "grad_norm": 7.697737693786621, + "learning_rate": 4.4984802431610945e-06, + "loss": 0.5918, + "step": 74 + }, + { + "epoch": 0.009126863401277762, + "grad_norm": 1.7287604808807373, + "learning_rate": 4.559270516717326e-06, + "loss": 0.4604, + "step": 75 + }, + { + "epoch": 0.009248554913294798, + "grad_norm": 2.8088109493255615, + "learning_rate": 4.620060790273556e-06, + "loss": 0.4477, + "step": 76 + }, + { + "epoch": 0.009370246425311835, + "grad_norm": 1.390653133392334, + "learning_rate": 4.680851063829788e-06, + "loss": 0.5009, + "step": 77 + }, + { + "epoch": 0.009491937937328871, + "grad_norm": 8.15897274017334, + "learning_rate": 4.741641337386019e-06, + "loss": 0.6593, + "step": 78 + }, + { + "epoch": 0.009613629449345908, + "grad_norm": 6.279831409454346, + "learning_rate": 4.80243161094225e-06, + "loss": 0.4464, + "step": 79 + }, + { + "epoch": 0.009735320961362945, + "grad_norm": 4.1383514404296875, + "learning_rate": 4.863221884498481e-06, + "loss": 0.6184, + "step": 80 + }, + { + "epoch": 0.009857012473379981, + "grad_norm": 8.388766288757324, + "learning_rate": 4.924012158054712e-06, + "loss": 0.4524, + "step": 81 + }, + { + "epoch": 0.00997870398539702, + "grad_norm": 1.4439729452133179, + "learning_rate": 4.984802431610943e-06, + "loss": 0.5034, + "step": 82 + }, + { + "epoch": 0.010100395497414056, + "grad_norm": 2.35052227973938, + "learning_rate": 5.0455927051671735e-06, + "loss": 0.5005, + "step": 83 + }, + { + "epoch": 0.010222087009431093, + "grad_norm": 1.400492548942566, + "learning_rate": 5.106382978723404e-06, + "loss": 0.4697, + "step": 84 + }, + { + "epoch": 0.010343778521448129, + "grad_norm": 2.054168939590454, + "learning_rate": 5.1671732522796354e-06, + "loss": 0.5697, + "step": 85 + }, + { + "epoch": 0.010465470033465166, + "grad_norm": 2.0980570316314697, + "learning_rate": 5.227963525835866e-06, + "loss": 0.5479, + "step": 86 + }, + { + "epoch": 0.010587161545482202, + "grad_norm": 10.143542289733887, + "learning_rate": 5.288753799392098e-06, + "loss": 0.5854, + "step": 87 + }, + { + "epoch": 0.010708853057499239, + "grad_norm": 10.95671558380127, + "learning_rate": 5.349544072948329e-06, + "loss": 0.5479, + "step": 88 + }, + { + "epoch": 0.010830544569516275, + "grad_norm": 4.231632709503174, + "learning_rate": 5.41033434650456e-06, + "loss": 0.514, + "step": 89 + }, + { + "epoch": 0.010952236081533314, + "grad_norm": 6.10847282409668, + "learning_rate": 5.471124620060791e-06, + "loss": 0.5568, + "step": 90 + }, + { + "epoch": 0.01107392759355035, + "grad_norm": 7.639449596405029, + "learning_rate": 5.531914893617022e-06, + "loss": 0.5347, + "step": 91 + }, + { + "epoch": 0.011195619105567387, + "grad_norm": 8.659672737121582, + "learning_rate": 5.592705167173253e-06, + "loss": 0.6023, + "step": 92 + }, + { + "epoch": 0.011317310617584423, + "grad_norm": 2.5167593955993652, + "learning_rate": 5.653495440729484e-06, + "loss": 0.5298, + "step": 93 + }, + { + "epoch": 0.01143900212960146, + "grad_norm": 9.971253395080566, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.5734, + "step": 94 + }, + { + "epoch": 0.011560693641618497, + "grad_norm": 16.309722900390625, + "learning_rate": 5.775075987841946e-06, + "loss": 0.6351, + "step": 95 + }, + { + "epoch": 0.011682385153635533, + "grad_norm": 12.970685005187988, + "learning_rate": 5.8358662613981764e-06, + "loss": 0.5801, + "step": 96 + }, + { + "epoch": 0.011804076665652571, + "grad_norm": 2.6465160846710205, + "learning_rate": 5.896656534954408e-06, + "loss": 0.5355, + "step": 97 + }, + { + "epoch": 0.011925768177669608, + "grad_norm": 5.874882698059082, + "learning_rate": 5.957446808510638e-06, + "loss": 0.5155, + "step": 98 + }, + { + "epoch": 0.012047459689686645, + "grad_norm": 6.874263763427734, + "learning_rate": 6.01823708206687e-06, + "loss": 0.5755, + "step": 99 + }, + { + "epoch": 0.012169151201703681, + "grad_norm": 8.646947860717773, + "learning_rate": 6.0790273556231e-06, + "loss": 0.5908, + "step": 100 + }, + { + "epoch": 0.012290842713720718, + "grad_norm": 1.429110050201416, + "learning_rate": 6.1398176291793325e-06, + "loss": 0.4829, + "step": 101 + }, + { + "epoch": 0.012412534225737754, + "grad_norm": 3.2724087238311768, + "learning_rate": 6.200607902735562e-06, + "loss": 0.5257, + "step": 102 + }, + { + "epoch": 0.01253422573775479, + "grad_norm": 6.1953511238098145, + "learning_rate": 6.2613981762917944e-06, + "loss": 0.5356, + "step": 103 + }, + { + "epoch": 0.012655917249771829, + "grad_norm": 1.6497021913528442, + "learning_rate": 6.322188449848025e-06, + "loss": 0.5142, + "step": 104 + }, + { + "epoch": 0.012777608761788866, + "grad_norm": 1.1163548231124878, + "learning_rate": 6.382978723404256e-06, + "loss": 0.4192, + "step": 105 + }, + { + "epoch": 0.012899300273805902, + "grad_norm": 5.324417591094971, + "learning_rate": 6.443768996960487e-06, + "loss": 0.4452, + "step": 106 + }, + { + "epoch": 0.013020991785822939, + "grad_norm": 9.527470588684082, + "learning_rate": 6.504559270516718e-06, + "loss": 0.5937, + "step": 107 + }, + { + "epoch": 0.013142683297839975, + "grad_norm": 4.845725059509277, + "learning_rate": 6.565349544072949e-06, + "loss": 0.4486, + "step": 108 + }, + { + "epoch": 0.013264374809857012, + "grad_norm": 5.2774271965026855, + "learning_rate": 6.62613981762918e-06, + "loss": 0.5525, + "step": 109 + }, + { + "epoch": 0.013386066321874049, + "grad_norm": 5.132938861846924, + "learning_rate": 6.686930091185411e-06, + "loss": 0.527, + "step": 110 + }, + { + "epoch": 0.013507757833891087, + "grad_norm": 11.405508995056152, + "learning_rate": 6.747720364741642e-06, + "loss": 0.6392, + "step": 111 + }, + { + "epoch": 0.013629449345908123, + "grad_norm": 9.988105773925781, + "learning_rate": 6.808510638297873e-06, + "loss": 0.6323, + "step": 112 + }, + { + "epoch": 0.01375114085792516, + "grad_norm": 6.9306769371032715, + "learning_rate": 6.869300911854104e-06, + "loss": 0.5208, + "step": 113 + }, + { + "epoch": 0.013872832369942197, + "grad_norm": 5.900634765625, + "learning_rate": 6.9300911854103346e-06, + "loss": 0.6097, + "step": 114 + }, + { + "epoch": 0.013994523881959233, + "grad_norm": 2.1774234771728516, + "learning_rate": 6.990881458966566e-06, + "loss": 0.4525, + "step": 115 + }, + { + "epoch": 0.01411621539397627, + "grad_norm": 2.4822614192962646, + "learning_rate": 7.0516717325227965e-06, + "loss": 0.4894, + "step": 116 + }, + { + "epoch": 0.014237906905993306, + "grad_norm": 2.224492311477661, + "learning_rate": 7.112462006079029e-06, + "loss": 0.5191, + "step": 117 + }, + { + "epoch": 0.014359598418010345, + "grad_norm": 4.142158031463623, + "learning_rate": 7.173252279635258e-06, + "loss": 0.4941, + "step": 118 + }, + { + "epoch": 0.014481289930027381, + "grad_norm": 1.3689216375350952, + "learning_rate": 7.234042553191491e-06, + "loss": 0.5031, + "step": 119 + }, + { + "epoch": 0.014602981442044418, + "grad_norm": 1.7017595767974854, + "learning_rate": 7.294832826747721e-06, + "loss": 0.4759, + "step": 120 + }, + { + "epoch": 0.014724672954061454, + "grad_norm": 2.922427177429199, + "learning_rate": 7.3556231003039526e-06, + "loss": 0.5302, + "step": 121 + }, + { + "epoch": 0.01484636446607849, + "grad_norm": 2.68149471282959, + "learning_rate": 7.416413373860183e-06, + "loss": 0.5008, + "step": 122 + }, + { + "epoch": 0.014968055978095527, + "grad_norm": 2.105281114578247, + "learning_rate": 7.4772036474164145e-06, + "loss": 0.5114, + "step": 123 + }, + { + "epoch": 0.015089747490112564, + "grad_norm": 1.4559125900268555, + "learning_rate": 7.537993920972645e-06, + "loss": 0.4699, + "step": 124 + }, + { + "epoch": 0.015211439002129602, + "grad_norm": 3.2693777084350586, + "learning_rate": 7.598784194528876e-06, + "loss": 0.5062, + "step": 125 + }, + { + "epoch": 0.015333130514146639, + "grad_norm": 2.183715343475342, + "learning_rate": 7.659574468085107e-06, + "loss": 0.5046, + "step": 126 + }, + { + "epoch": 0.015454822026163675, + "grad_norm": 1.0585291385650635, + "learning_rate": 7.720364741641338e-06, + "loss": 0.5224, + "step": 127 + }, + { + "epoch": 0.015576513538180712, + "grad_norm": 2.833522081375122, + "learning_rate": 7.78115501519757e-06, + "loss": 0.5289, + "step": 128 + }, + { + "epoch": 0.01569820505019775, + "grad_norm": 1.0448874235153198, + "learning_rate": 7.841945288753801e-06, + "loss": 0.499, + "step": 129 + }, + { + "epoch": 0.015819896562214785, + "grad_norm": 0.856838047504425, + "learning_rate": 7.90273556231003e-06, + "loss": 0.4805, + "step": 130 + }, + { + "epoch": 0.01594158807423182, + "grad_norm": 5.1833391189575195, + "learning_rate": 7.963525835866262e-06, + "loss": 0.5597, + "step": 131 + }, + { + "epoch": 0.016063279586248858, + "grad_norm": 6.778291702270508, + "learning_rate": 8.024316109422494e-06, + "loss": 0.5685, + "step": 132 + }, + { + "epoch": 0.016184971098265895, + "grad_norm": 3.2497646808624268, + "learning_rate": 8.085106382978723e-06, + "loss": 0.4972, + "step": 133 + }, + { + "epoch": 0.01630666261028293, + "grad_norm": 1.978299617767334, + "learning_rate": 8.145896656534955e-06, + "loss": 0.5389, + "step": 134 + }, + { + "epoch": 0.016428354122299968, + "grad_norm": 8.466510772705078, + "learning_rate": 8.206686930091186e-06, + "loss": 0.5174, + "step": 135 + }, + { + "epoch": 0.016550045634317008, + "grad_norm": 6.085693836212158, + "learning_rate": 8.267477203647417e-06, + "loss": 0.5267, + "step": 136 + }, + { + "epoch": 0.016671737146334045, + "grad_norm": 1.6802982091903687, + "learning_rate": 8.328267477203647e-06, + "loss": 0.5324, + "step": 137 + }, + { + "epoch": 0.01679342865835108, + "grad_norm": 3.9072301387786865, + "learning_rate": 8.389057750759878e-06, + "loss": 0.4342, + "step": 138 + }, + { + "epoch": 0.016915120170368118, + "grad_norm": 1.671865701675415, + "learning_rate": 8.44984802431611e-06, + "loss": 0.4282, + "step": 139 + }, + { + "epoch": 0.017036811682385154, + "grad_norm": 3.5130422115325928, + "learning_rate": 8.510638297872341e-06, + "loss": 0.4797, + "step": 140 + }, + { + "epoch": 0.01715850319440219, + "grad_norm": 1.251298189163208, + "learning_rate": 8.571428571428571e-06, + "loss": 0.4266, + "step": 141 + }, + { + "epoch": 0.017280194706419227, + "grad_norm": 1.784014105796814, + "learning_rate": 8.632218844984804e-06, + "loss": 0.5515, + "step": 142 + }, + { + "epoch": 0.017401886218436264, + "grad_norm": 8.78653621673584, + "learning_rate": 8.693009118541034e-06, + "loss": 0.6032, + "step": 143 + }, + { + "epoch": 0.0175235777304533, + "grad_norm": 11.122417449951172, + "learning_rate": 8.753799392097265e-06, + "loss": 0.6042, + "step": 144 + }, + { + "epoch": 0.017645269242470337, + "grad_norm": 13.862433433532715, + "learning_rate": 8.814589665653496e-06, + "loss": 0.6152, + "step": 145 + }, + { + "epoch": 0.017766960754487374, + "grad_norm": 9.457635879516602, + "learning_rate": 8.875379939209728e-06, + "loss": 0.5355, + "step": 146 + }, + { + "epoch": 0.01788865226650441, + "grad_norm": 1.336855173110962, + "learning_rate": 8.936170212765958e-06, + "loss": 0.5491, + "step": 147 + }, + { + "epoch": 0.018010343778521447, + "grad_norm": 2.6724729537963867, + "learning_rate": 8.996960486322189e-06, + "loss": 0.5289, + "step": 148 + }, + { + "epoch": 0.018132035290538483, + "grad_norm": 3.028740167617798, + "learning_rate": 9.05775075987842e-06, + "loss": 0.5178, + "step": 149 + }, + { + "epoch": 0.018253726802555523, + "grad_norm": 2.821035861968994, + "learning_rate": 9.118541033434652e-06, + "loss": 0.52, + "step": 150 + }, + { + "epoch": 0.01837541831457256, + "grad_norm": 2.4448201656341553, + "learning_rate": 9.179331306990881e-06, + "loss": 0.5336, + "step": 151 + }, + { + "epoch": 0.018497109826589597, + "grad_norm": 10.468660354614258, + "learning_rate": 9.240121580547113e-06, + "loss": 0.5113, + "step": 152 + }, + { + "epoch": 0.018618801338606633, + "grad_norm": 4.343624114990234, + "learning_rate": 9.300911854103344e-06, + "loss": 0.5518, + "step": 153 + }, + { + "epoch": 0.01874049285062367, + "grad_norm": 0.9401276707649231, + "learning_rate": 9.361702127659576e-06, + "loss": 0.4987, + "step": 154 + }, + { + "epoch": 0.018862184362640706, + "grad_norm": 2.693747043609619, + "learning_rate": 9.422492401215805e-06, + "loss": 0.4839, + "step": 155 + }, + { + "epoch": 0.018983875874657743, + "grad_norm": 3.352670669555664, + "learning_rate": 9.483282674772038e-06, + "loss": 0.5325, + "step": 156 + }, + { + "epoch": 0.01910556738667478, + "grad_norm": 3.489257574081421, + "learning_rate": 9.544072948328268e-06, + "loss": 0.4427, + "step": 157 + }, + { + "epoch": 0.019227258898691816, + "grad_norm": 1.562726378440857, + "learning_rate": 9.6048632218845e-06, + "loss": 0.4979, + "step": 158 + }, + { + "epoch": 0.019348950410708853, + "grad_norm": 1.2442615032196045, + "learning_rate": 9.66565349544073e-06, + "loss": 0.503, + "step": 159 + }, + { + "epoch": 0.01947064192272589, + "grad_norm": 2.0917882919311523, + "learning_rate": 9.726443768996962e-06, + "loss": 0.5225, + "step": 160 + }, + { + "epoch": 0.019592333434742926, + "grad_norm": 3.4695098400115967, + "learning_rate": 9.787234042553192e-06, + "loss": 0.4602, + "step": 161 + }, + { + "epoch": 0.019714024946759962, + "grad_norm": 4.359494686126709, + "learning_rate": 9.848024316109423e-06, + "loss": 0.4752, + "step": 162 + }, + { + "epoch": 0.019835716458777, + "grad_norm": 4.106424808502197, + "learning_rate": 9.908814589665655e-06, + "loss": 0.5438, + "step": 163 + }, + { + "epoch": 0.01995740797079404, + "grad_norm": 2.0894875526428223, + "learning_rate": 9.969604863221886e-06, + "loss": 0.4366, + "step": 164 + }, + { + "epoch": 0.020079099482811075, + "grad_norm": 0.964451253414154, + "learning_rate": 1.0030395136778117e-05, + "loss": 0.4383, + "step": 165 + }, + { + "epoch": 0.020200790994828112, + "grad_norm": 4.342470169067383, + "learning_rate": 1.0091185410334347e-05, + "loss": 0.5336, + "step": 166 + }, + { + "epoch": 0.02032248250684515, + "grad_norm": 1.3713736534118652, + "learning_rate": 1.0151975683890578e-05, + "loss": 0.4795, + "step": 167 + }, + { + "epoch": 0.020444174018862185, + "grad_norm": 4.365769863128662, + "learning_rate": 1.0212765957446808e-05, + "loss": 0.4266, + "step": 168 + }, + { + "epoch": 0.02056586553087922, + "grad_norm": 1.003467321395874, + "learning_rate": 1.0273556231003041e-05, + "loss": 0.5158, + "step": 169 + }, + { + "epoch": 0.020687557042896258, + "grad_norm": 3.7893731594085693, + "learning_rate": 1.0334346504559271e-05, + "loss": 0.4936, + "step": 170 + }, + { + "epoch": 0.020809248554913295, + "grad_norm": 0.8778900504112244, + "learning_rate": 1.0395136778115502e-05, + "loss": 0.5322, + "step": 171 + }, + { + "epoch": 0.02093094006693033, + "grad_norm": 3.3810131549835205, + "learning_rate": 1.0455927051671732e-05, + "loss": 0.5526, + "step": 172 + }, + { + "epoch": 0.021052631578947368, + "grad_norm": 1.4848077297210693, + "learning_rate": 1.0516717325227965e-05, + "loss": 0.4736, + "step": 173 + }, + { + "epoch": 0.021174323090964405, + "grad_norm": 0.8066727519035339, + "learning_rate": 1.0577507598784196e-05, + "loss": 0.4718, + "step": 174 + }, + { + "epoch": 0.02129601460298144, + "grad_norm": 0.8787925839424133, + "learning_rate": 1.0638297872340426e-05, + "loss": 0.4784, + "step": 175 + }, + { + "epoch": 0.021417706114998478, + "grad_norm": 4.162363529205322, + "learning_rate": 1.0699088145896657e-05, + "loss": 0.549, + "step": 176 + }, + { + "epoch": 0.021539397627015514, + "grad_norm": 0.9096184372901917, + "learning_rate": 1.0759878419452889e-05, + "loss": 0.4436, + "step": 177 + }, + { + "epoch": 0.02166108913903255, + "grad_norm": 2.608381748199463, + "learning_rate": 1.082066869300912e-05, + "loss": 0.5248, + "step": 178 + }, + { + "epoch": 0.02178278065104959, + "grad_norm": 2.3067309856414795, + "learning_rate": 1.088145896656535e-05, + "loss": 0.4617, + "step": 179 + }, + { + "epoch": 0.021904472163066627, + "grad_norm": 1.1305943727493286, + "learning_rate": 1.0942249240121581e-05, + "loss": 0.4767, + "step": 180 + }, + { + "epoch": 0.022026163675083664, + "grad_norm": 2.6685497760772705, + "learning_rate": 1.1003039513677813e-05, + "loss": 0.5104, + "step": 181 + }, + { + "epoch": 0.0221478551871007, + "grad_norm": 1.002787709236145, + "learning_rate": 1.1063829787234044e-05, + "loss": 0.4866, + "step": 182 + }, + { + "epoch": 0.022269546699117737, + "grad_norm": 2.944110870361328, + "learning_rate": 1.1124620060790274e-05, + "loss": 0.5557, + "step": 183 + }, + { + "epoch": 0.022391238211134774, + "grad_norm": 3.287485122680664, + "learning_rate": 1.1185410334346505e-05, + "loss": 0.4684, + "step": 184 + }, + { + "epoch": 0.02251292972315181, + "grad_norm": 2.4028401374816895, + "learning_rate": 1.1246200607902738e-05, + "loss": 0.5386, + "step": 185 + }, + { + "epoch": 0.022634621235168847, + "grad_norm": 2.2685742378234863, + "learning_rate": 1.1306990881458968e-05, + "loss": 0.4718, + "step": 186 + }, + { + "epoch": 0.022756312747185883, + "grad_norm": 0.9333683252334595, + "learning_rate": 1.1367781155015198e-05, + "loss": 0.4368, + "step": 187 + }, + { + "epoch": 0.02287800425920292, + "grad_norm": 5.021225929260254, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.5321, + "step": 188 + }, + { + "epoch": 0.022999695771219957, + "grad_norm": 4.723958969116211, + "learning_rate": 1.1489361702127662e-05, + "loss": 0.5198, + "step": 189 + }, + { + "epoch": 0.023121387283236993, + "grad_norm": 2.935609817504883, + "learning_rate": 1.1550151975683892e-05, + "loss": 0.5107, + "step": 190 + }, + { + "epoch": 0.02324307879525403, + "grad_norm": 3.1390504837036133, + "learning_rate": 1.1610942249240123e-05, + "loss": 0.49, + "step": 191 + }, + { + "epoch": 0.023364770307271066, + "grad_norm": 4.994821071624756, + "learning_rate": 1.1671732522796353e-05, + "loss": 0.5153, + "step": 192 + }, + { + "epoch": 0.023486461819288106, + "grad_norm": 5.175562381744385, + "learning_rate": 1.1732522796352586e-05, + "loss": 0.501, + "step": 193 + }, + { + "epoch": 0.023608153331305143, + "grad_norm": 1.231567621231079, + "learning_rate": 1.1793313069908816e-05, + "loss": 0.5349, + "step": 194 + }, + { + "epoch": 0.02372984484332218, + "grad_norm": 2.930424928665161, + "learning_rate": 1.1854103343465047e-05, + "loss": 0.4459, + "step": 195 + }, + { + "epoch": 0.023851536355339216, + "grad_norm": 4.068364143371582, + "learning_rate": 1.1914893617021277e-05, + "loss": 0.5316, + "step": 196 + }, + { + "epoch": 0.023973227867356253, + "grad_norm": 6.025232791900635, + "learning_rate": 1.197568389057751e-05, + "loss": 0.5814, + "step": 197 + }, + { + "epoch": 0.02409491937937329, + "grad_norm": 3.0085935592651367, + "learning_rate": 1.203647416413374e-05, + "loss": 0.5008, + "step": 198 + }, + { + "epoch": 0.024216610891390326, + "grad_norm": 0.9371998906135559, + "learning_rate": 1.2097264437689971e-05, + "loss": 0.4974, + "step": 199 + }, + { + "epoch": 0.024338302403407362, + "grad_norm": 6.098117351531982, + "learning_rate": 1.21580547112462e-05, + "loss": 0.4823, + "step": 200 + }, + { + "epoch": 0.0244599939154244, + "grad_norm": 3.9042601585388184, + "learning_rate": 1.2218844984802432e-05, + "loss": 0.5012, + "step": 201 + }, + { + "epoch": 0.024581685427441435, + "grad_norm": 1.7924104928970337, + "learning_rate": 1.2279635258358665e-05, + "loss": 0.4639, + "step": 202 + }, + { + "epoch": 0.024703376939458472, + "grad_norm": 1.657804250717163, + "learning_rate": 1.2340425531914895e-05, + "loss": 0.4892, + "step": 203 + }, + { + "epoch": 0.02482506845147551, + "grad_norm": 1.11095130443573, + "learning_rate": 1.2401215805471124e-05, + "loss": 0.4308, + "step": 204 + }, + { + "epoch": 0.024946759963492545, + "grad_norm": 4.05391788482666, + "learning_rate": 1.2462006079027356e-05, + "loss": 0.5384, + "step": 205 + }, + { + "epoch": 0.02506845147550958, + "grad_norm": 1.0696470737457275, + "learning_rate": 1.2522796352583589e-05, + "loss": 0.4884, + "step": 206 + }, + { + "epoch": 0.02519014298752662, + "grad_norm": 1.3560253381729126, + "learning_rate": 1.2583586626139819e-05, + "loss": 0.5223, + "step": 207 + }, + { + "epoch": 0.025311834499543658, + "grad_norm": 1.0009101629257202, + "learning_rate": 1.264437689969605e-05, + "loss": 0.5241, + "step": 208 + }, + { + "epoch": 0.025433526011560695, + "grad_norm": 4.8241071701049805, + "learning_rate": 1.270516717325228e-05, + "loss": 0.4817, + "step": 209 + }, + { + "epoch": 0.02555521752357773, + "grad_norm": 2.7588982582092285, + "learning_rate": 1.2765957446808513e-05, + "loss": 0.4969, + "step": 210 + }, + { + "epoch": 0.025676909035594768, + "grad_norm": 1.2233370542526245, + "learning_rate": 1.2826747720364742e-05, + "loss": 0.4564, + "step": 211 + }, + { + "epoch": 0.025798600547611805, + "grad_norm": 3.9769198894500732, + "learning_rate": 1.2887537993920974e-05, + "loss": 0.525, + "step": 212 + }, + { + "epoch": 0.02592029205962884, + "grad_norm": 1.5460400581359863, + "learning_rate": 1.2948328267477203e-05, + "loss": 0.4579, + "step": 213 + }, + { + "epoch": 0.026041983571645878, + "grad_norm": 1.1946918964385986, + "learning_rate": 1.3009118541033437e-05, + "loss": 0.4664, + "step": 214 + }, + { + "epoch": 0.026163675083662914, + "grad_norm": 2.718722105026245, + "learning_rate": 1.3069908814589666e-05, + "loss": 0.5612, + "step": 215 + }, + { + "epoch": 0.02628536659567995, + "grad_norm": 4.307705402374268, + "learning_rate": 1.3130699088145898e-05, + "loss": 0.5013, + "step": 216 + }, + { + "epoch": 0.026407058107696987, + "grad_norm": 3.6850602626800537, + "learning_rate": 1.3191489361702127e-05, + "loss": 0.5158, + "step": 217 + }, + { + "epoch": 0.026528749619714024, + "grad_norm": 3.4409823417663574, + "learning_rate": 1.325227963525836e-05, + "loss": 0.4689, + "step": 218 + }, + { + "epoch": 0.02665044113173106, + "grad_norm": 1.849242091178894, + "learning_rate": 1.3313069908814592e-05, + "loss": 0.5321, + "step": 219 + }, + { + "epoch": 0.026772132643748097, + "grad_norm": 1.3650225400924683, + "learning_rate": 1.3373860182370821e-05, + "loss": 0.4366, + "step": 220 + }, + { + "epoch": 0.026893824155765137, + "grad_norm": 1.60374116897583, + "learning_rate": 1.3434650455927051e-05, + "loss": 0.4558, + "step": 221 + }, + { + "epoch": 0.027015515667782174, + "grad_norm": 1.3378283977508545, + "learning_rate": 1.3495440729483284e-05, + "loss": 0.5394, + "step": 222 + }, + { + "epoch": 0.02713720717979921, + "grad_norm": 4.942467212677002, + "learning_rate": 1.3556231003039516e-05, + "loss": 0.4617, + "step": 223 + }, + { + "epoch": 0.027258898691816247, + "grad_norm": 4.84968900680542, + "learning_rate": 1.3617021276595745e-05, + "loss": 0.4769, + "step": 224 + }, + { + "epoch": 0.027380590203833283, + "grad_norm": 4.440089702606201, + "learning_rate": 1.3677811550151977e-05, + "loss": 0.4477, + "step": 225 + }, + { + "epoch": 0.02750228171585032, + "grad_norm": 3.5566582679748535, + "learning_rate": 1.3738601823708208e-05, + "loss": 0.5177, + "step": 226 + }, + { + "epoch": 0.027623973227867357, + "grad_norm": 3.283655881881714, + "learning_rate": 1.379939209726444e-05, + "loss": 0.5039, + "step": 227 + }, + { + "epoch": 0.027745664739884393, + "grad_norm": 3.0711073875427246, + "learning_rate": 1.3860182370820669e-05, + "loss": 0.4983, + "step": 228 + }, + { + "epoch": 0.02786735625190143, + "grad_norm": 3.589137077331543, + "learning_rate": 1.39209726443769e-05, + "loss": 0.4378, + "step": 229 + }, + { + "epoch": 0.027989047763918466, + "grad_norm": 2.9477665424346924, + "learning_rate": 1.3981762917933132e-05, + "loss": 0.4478, + "step": 230 + }, + { + "epoch": 0.028110739275935503, + "grad_norm": 1.009657382965088, + "learning_rate": 1.4042553191489363e-05, + "loss": 0.4792, + "step": 231 + }, + { + "epoch": 0.02823243078795254, + "grad_norm": 0.7988582253456116, + "learning_rate": 1.4103343465045593e-05, + "loss": 0.4839, + "step": 232 + }, + { + "epoch": 0.028354122299969576, + "grad_norm": 1.4024368524551392, + "learning_rate": 1.4164133738601824e-05, + "loss": 0.5062, + "step": 233 + }, + { + "epoch": 0.028475813811986612, + "grad_norm": 1.2348082065582275, + "learning_rate": 1.4224924012158057e-05, + "loss": 0.5137, + "step": 234 + }, + { + "epoch": 0.02859750532400365, + "grad_norm": 5.30483865737915, + "learning_rate": 1.4285714285714287e-05, + "loss": 0.4835, + "step": 235 + }, + { + "epoch": 0.02871919683602069, + "grad_norm": 6.46610689163208, + "learning_rate": 1.4346504559270517e-05, + "loss": 0.4385, + "step": 236 + }, + { + "epoch": 0.028840888348037726, + "grad_norm": 3.176692485809326, + "learning_rate": 1.4407294832826748e-05, + "loss": 0.4695, + "step": 237 + }, + { + "epoch": 0.028962579860054762, + "grad_norm": 1.7878280878067017, + "learning_rate": 1.4468085106382981e-05, + "loss": 0.4521, + "step": 238 + }, + { + "epoch": 0.0290842713720718, + "grad_norm": 3.6849300861358643, + "learning_rate": 1.4528875379939211e-05, + "loss": 0.4959, + "step": 239 + }, + { + "epoch": 0.029205962884088835, + "grad_norm": 2.631035089492798, + "learning_rate": 1.4589665653495442e-05, + "loss": 0.4523, + "step": 240 + }, + { + "epoch": 0.029327654396105872, + "grad_norm": 2.448408842086792, + "learning_rate": 1.4650455927051672e-05, + "loss": 0.5762, + "step": 241 + }, + { + "epoch": 0.02944934590812291, + "grad_norm": 6.383382320404053, + "learning_rate": 1.4711246200607905e-05, + "loss": 0.534, + "step": 242 + }, + { + "epoch": 0.029571037420139945, + "grad_norm": 6.500555992126465, + "learning_rate": 1.4772036474164135e-05, + "loss": 0.522, + "step": 243 + }, + { + "epoch": 0.02969272893215698, + "grad_norm": 5.931544303894043, + "learning_rate": 1.4832826747720366e-05, + "loss": 0.5868, + "step": 244 + }, + { + "epoch": 0.029814420444174018, + "grad_norm": 2.1462721824645996, + "learning_rate": 1.4893617021276596e-05, + "loss": 0.5073, + "step": 245 + }, + { + "epoch": 0.029936111956191055, + "grad_norm": 1.5611602067947388, + "learning_rate": 1.4954407294832829e-05, + "loss": 0.5167, + "step": 246 + }, + { + "epoch": 0.03005780346820809, + "grad_norm": 1.691525936126709, + "learning_rate": 1.5015197568389059e-05, + "loss": 0.4363, + "step": 247 + }, + { + "epoch": 0.030179494980225128, + "grad_norm": 3.175032138824463, + "learning_rate": 1.507598784194529e-05, + "loss": 0.5074, + "step": 248 + }, + { + "epoch": 0.030301186492242164, + "grad_norm": 3.3713743686676025, + "learning_rate": 1.513677811550152e-05, + "loss": 0.582, + "step": 249 + }, + { + "epoch": 0.030422878004259205, + "grad_norm": 3.4759769439697266, + "learning_rate": 1.5197568389057753e-05, + "loss": 0.5001, + "step": 250 + }, + { + "epoch": 0.03054456951627624, + "grad_norm": 5.467329978942871, + "learning_rate": 1.5258358662613984e-05, + "loss": 0.5104, + "step": 251 + }, + { + "epoch": 0.030666261028293278, + "grad_norm": 4.782961845397949, + "learning_rate": 1.5319148936170214e-05, + "loss": 0.5096, + "step": 252 + }, + { + "epoch": 0.030787952540310314, + "grad_norm": 4.240315914154053, + "learning_rate": 1.5379939209726444e-05, + "loss": 0.4531, + "step": 253 + }, + { + "epoch": 0.03090964405232735, + "grad_norm": 2.355752944946289, + "learning_rate": 1.5440729483282677e-05, + "loss": 0.518, + "step": 254 + }, + { + "epoch": 0.031031335564344387, + "grad_norm": 3.694533109664917, + "learning_rate": 1.5501519756838906e-05, + "loss": 0.4638, + "step": 255 + }, + { + "epoch": 0.031153027076361424, + "grad_norm": 4.192401885986328, + "learning_rate": 1.556231003039514e-05, + "loss": 0.5219, + "step": 256 + }, + { + "epoch": 0.031274718588378464, + "grad_norm": 0.9859777688980103, + "learning_rate": 1.562310030395137e-05, + "loss": 0.432, + "step": 257 + }, + { + "epoch": 0.0313964101003955, + "grad_norm": 2.2214391231536865, + "learning_rate": 1.5683890577507602e-05, + "loss": 0.4391, + "step": 258 + }, + { + "epoch": 0.03151810161241254, + "grad_norm": 2.3708407878875732, + "learning_rate": 1.5744680851063832e-05, + "loss": 0.4693, + "step": 259 + }, + { + "epoch": 0.03163979312442957, + "grad_norm": 3.287144660949707, + "learning_rate": 1.580547112462006e-05, + "loss": 0.4559, + "step": 260 + }, + { + "epoch": 0.03176148463644661, + "grad_norm": 3.5540168285369873, + "learning_rate": 1.586626139817629e-05, + "loss": 0.5076, + "step": 261 + }, + { + "epoch": 0.03188317614846364, + "grad_norm": 2.0204150676727295, + "learning_rate": 1.5927051671732524e-05, + "loss": 0.4701, + "step": 262 + }, + { + "epoch": 0.03200486766048068, + "grad_norm": 1.7141085863113403, + "learning_rate": 1.5987841945288754e-05, + "loss": 0.4284, + "step": 263 + }, + { + "epoch": 0.032126559172497716, + "grad_norm": 1.6268690824508667, + "learning_rate": 1.6048632218844987e-05, + "loss": 0.4692, + "step": 264 + }, + { + "epoch": 0.032248250684514757, + "grad_norm": 0.8782216310501099, + "learning_rate": 1.6109422492401217e-05, + "loss": 0.4461, + "step": 265 + }, + { + "epoch": 0.03236994219653179, + "grad_norm": 2.026752233505249, + "learning_rate": 1.6170212765957446e-05, + "loss": 0.5242, + "step": 266 + }, + { + "epoch": 0.03249163370854883, + "grad_norm": 6.13303279876709, + "learning_rate": 1.623100303951368e-05, + "loss": 0.454, + "step": 267 + }, + { + "epoch": 0.03261332522056586, + "grad_norm": 4.696516990661621, + "learning_rate": 1.629179331306991e-05, + "loss": 0.5818, + "step": 268 + }, + { + "epoch": 0.0327350167325829, + "grad_norm": 6.200691223144531, + "learning_rate": 1.6352583586626142e-05, + "loss": 0.518, + "step": 269 + }, + { + "epoch": 0.032856708244599936, + "grad_norm": 0.9025941491127014, + "learning_rate": 1.6413373860182372e-05, + "loss": 0.558, + "step": 270 + }, + { + "epoch": 0.032978399756616976, + "grad_norm": 1.4122920036315918, + "learning_rate": 1.6474164133738605e-05, + "loss": 0.4987, + "step": 271 + }, + { + "epoch": 0.033100091268634016, + "grad_norm": 0.9242690205574036, + "learning_rate": 1.6534954407294835e-05, + "loss": 0.4018, + "step": 272 + }, + { + "epoch": 0.03322178278065105, + "grad_norm": 1.8618580102920532, + "learning_rate": 1.6595744680851064e-05, + "loss": 0.4336, + "step": 273 + }, + { + "epoch": 0.03334347429266809, + "grad_norm": 3.876899003982544, + "learning_rate": 1.6656534954407294e-05, + "loss": 0.4944, + "step": 274 + }, + { + "epoch": 0.03346516580468512, + "grad_norm": 2.4337871074676514, + "learning_rate": 1.6717325227963527e-05, + "loss": 0.4392, + "step": 275 + }, + { + "epoch": 0.03358685731670216, + "grad_norm": 1.483424425125122, + "learning_rate": 1.6778115501519757e-05, + "loss": 0.5294, + "step": 276 + }, + { + "epoch": 0.033708548828719195, + "grad_norm": 3.8621246814727783, + "learning_rate": 1.683890577507599e-05, + "loss": 0.5008, + "step": 277 + }, + { + "epoch": 0.033830240340736235, + "grad_norm": 7.214571475982666, + "learning_rate": 1.689969604863222e-05, + "loss": 0.5577, + "step": 278 + }, + { + "epoch": 0.03395193185275327, + "grad_norm": 8.26302433013916, + "learning_rate": 1.6960486322188453e-05, + "loss": 0.5369, + "step": 279 + }, + { + "epoch": 0.03407362336477031, + "grad_norm": 3.8138177394866943, + "learning_rate": 1.7021276595744682e-05, + "loss": 0.531, + "step": 280 + }, + { + "epoch": 0.03419531487678734, + "grad_norm": 2.2180545330047607, + "learning_rate": 1.7082066869300912e-05, + "loss": 0.4654, + "step": 281 + }, + { + "epoch": 0.03431700638880438, + "grad_norm": 2.205357789993286, + "learning_rate": 1.7142857142857142e-05, + "loss": 0.511, + "step": 282 + }, + { + "epoch": 0.034438697900821415, + "grad_norm": 3.5590744018554688, + "learning_rate": 1.7203647416413375e-05, + "loss": 0.4326, + "step": 283 + }, + { + "epoch": 0.034560389412838455, + "grad_norm": 5.377081394195557, + "learning_rate": 1.7264437689969608e-05, + "loss": 0.5293, + "step": 284 + }, + { + "epoch": 0.03468208092485549, + "grad_norm": 4.490129470825195, + "learning_rate": 1.7325227963525838e-05, + "loss": 0.4954, + "step": 285 + }, + { + "epoch": 0.03480377243687253, + "grad_norm": 2.799752950668335, + "learning_rate": 1.7386018237082067e-05, + "loss": 0.5176, + "step": 286 + }, + { + "epoch": 0.03492546394888957, + "grad_norm": 3.117070198059082, + "learning_rate": 1.74468085106383e-05, + "loss": 0.4903, + "step": 287 + }, + { + "epoch": 0.0350471554609066, + "grad_norm": 7.865053653717041, + "learning_rate": 1.750759878419453e-05, + "loss": 0.5106, + "step": 288 + }, + { + "epoch": 0.03516884697292364, + "grad_norm": 6.971349716186523, + "learning_rate": 1.756838905775076e-05, + "loss": 0.5398, + "step": 289 + }, + { + "epoch": 0.035290538484940674, + "grad_norm": 3.857039213180542, + "learning_rate": 1.7629179331306993e-05, + "loss": 0.5261, + "step": 290 + }, + { + "epoch": 0.035412229996957714, + "grad_norm": 2.1724472045898438, + "learning_rate": 1.7689969604863223e-05, + "loss": 0.4705, + "step": 291 + }, + { + "epoch": 0.03553392150897475, + "grad_norm": 2.0216357707977295, + "learning_rate": 1.7750759878419456e-05, + "loss": 0.4393, + "step": 292 + }, + { + "epoch": 0.03565561302099179, + "grad_norm": 3.1432712078094482, + "learning_rate": 1.7811550151975685e-05, + "loss": 0.4266, + "step": 293 + }, + { + "epoch": 0.03577730453300882, + "grad_norm": 2.9277265071868896, + "learning_rate": 1.7872340425531915e-05, + "loss": 0.4499, + "step": 294 + }, + { + "epoch": 0.03589899604502586, + "grad_norm": 1.1710922718048096, + "learning_rate": 1.7933130699088148e-05, + "loss": 0.4201, + "step": 295 + }, + { + "epoch": 0.036020687557042894, + "grad_norm": 0.8354945182800293, + "learning_rate": 1.7993920972644378e-05, + "loss": 0.448, + "step": 296 + }, + { + "epoch": 0.036142379069059934, + "grad_norm": 1.3958899974822998, + "learning_rate": 1.8054711246200608e-05, + "loss": 0.4885, + "step": 297 + }, + { + "epoch": 0.03626407058107697, + "grad_norm": 3.9022505283355713, + "learning_rate": 1.811550151975684e-05, + "loss": 0.3886, + "step": 298 + }, + { + "epoch": 0.03638576209309401, + "grad_norm": 0.839918851852417, + "learning_rate": 1.8176291793313074e-05, + "loss": 0.4849, + "step": 299 + }, + { + "epoch": 0.03650745360511105, + "grad_norm": 3.41076397895813, + "learning_rate": 1.8237082066869303e-05, + "loss": 0.5644, + "step": 300 + }, + { + "epoch": 0.03662914511712808, + "grad_norm": 0.78065425157547, + "learning_rate": 1.8297872340425533e-05, + "loss": 0.4809, + "step": 301 + }, + { + "epoch": 0.03675083662914512, + "grad_norm": 0.8052918314933777, + "learning_rate": 1.8358662613981763e-05, + "loss": 0.4642, + "step": 302 + }, + { + "epoch": 0.03687252814116215, + "grad_norm": 0.6569188237190247, + "learning_rate": 1.8419452887537996e-05, + "loss": 0.4968, + "step": 303 + }, + { + "epoch": 0.03699421965317919, + "grad_norm": 0.7363128662109375, + "learning_rate": 1.8480243161094226e-05, + "loss": 0.4562, + "step": 304 + }, + { + "epoch": 0.037115911165196226, + "grad_norm": 1.6311641931533813, + "learning_rate": 1.854103343465046e-05, + "loss": 0.4832, + "step": 305 + }, + { + "epoch": 0.037237602677213266, + "grad_norm": 2.1871397495269775, + "learning_rate": 1.8601823708206688e-05, + "loss": 0.5213, + "step": 306 + }, + { + "epoch": 0.0373592941892303, + "grad_norm": 2.021156072616577, + "learning_rate": 1.866261398176292e-05, + "loss": 0.4394, + "step": 307 + }, + { + "epoch": 0.03748098570124734, + "grad_norm": 0.7310835123062134, + "learning_rate": 1.872340425531915e-05, + "loss": 0.4983, + "step": 308 + }, + { + "epoch": 0.03760267721326437, + "grad_norm": 1.3981071710586548, + "learning_rate": 1.878419452887538e-05, + "loss": 0.4777, + "step": 309 + }, + { + "epoch": 0.03772436872528141, + "grad_norm": 1.1701666116714478, + "learning_rate": 1.884498480243161e-05, + "loss": 0.4418, + "step": 310 + }, + { + "epoch": 0.037846060237298446, + "grad_norm": 2.2293436527252197, + "learning_rate": 1.8905775075987844e-05, + "loss": 0.4731, + "step": 311 + }, + { + "epoch": 0.037967751749315486, + "grad_norm": 3.6277523040771484, + "learning_rate": 1.8966565349544077e-05, + "loss": 0.4534, + "step": 312 + }, + { + "epoch": 0.03808944326133252, + "grad_norm": 1.6841593980789185, + "learning_rate": 1.9027355623100306e-05, + "loss": 0.4092, + "step": 313 + }, + { + "epoch": 0.03821113477334956, + "grad_norm": 1.074906826019287, + "learning_rate": 1.9088145896656536e-05, + "loss": 0.4645, + "step": 314 + }, + { + "epoch": 0.0383328262853666, + "grad_norm": 2.8434669971466064, + "learning_rate": 1.914893617021277e-05, + "loss": 0.4917, + "step": 315 + }, + { + "epoch": 0.03845451779738363, + "grad_norm": 2.05016827583313, + "learning_rate": 1.9209726443769e-05, + "loss": 0.503, + "step": 316 + }, + { + "epoch": 0.03857620930940067, + "grad_norm": 1.4294145107269287, + "learning_rate": 1.927051671732523e-05, + "loss": 0.5082, + "step": 317 + }, + { + "epoch": 0.038697900821417705, + "grad_norm": 1.0127038955688477, + "learning_rate": 1.933130699088146e-05, + "loss": 0.4814, + "step": 318 + }, + { + "epoch": 0.038819592333434745, + "grad_norm": 1.9038892984390259, + "learning_rate": 1.939209726443769e-05, + "loss": 0.4856, + "step": 319 + }, + { + "epoch": 0.03894128384545178, + "grad_norm": 1.7029699087142944, + "learning_rate": 1.9452887537993924e-05, + "loss": 0.5233, + "step": 320 + }, + { + "epoch": 0.03906297535746882, + "grad_norm": 2.3036530017852783, + "learning_rate": 1.9513677811550154e-05, + "loss": 0.4595, + "step": 321 + }, + { + "epoch": 0.03918466686948585, + "grad_norm": 3.268527030944824, + "learning_rate": 1.9574468085106384e-05, + "loss": 0.439, + "step": 322 + }, + { + "epoch": 0.03930635838150289, + "grad_norm": 1.4382330179214478, + "learning_rate": 1.9635258358662617e-05, + "loss": 0.5159, + "step": 323 + }, + { + "epoch": 0.039428049893519924, + "grad_norm": 1.197644591331482, + "learning_rate": 1.9696048632218846e-05, + "loss": 0.4827, + "step": 324 + }, + { + "epoch": 0.039549741405536964, + "grad_norm": 0.81956946849823, + "learning_rate": 1.9756838905775076e-05, + "loss": 0.4557, + "step": 325 + }, + { + "epoch": 0.039671432917554, + "grad_norm": 0.9710156321525574, + "learning_rate": 1.981762917933131e-05, + "loss": 0.4497, + "step": 326 + }, + { + "epoch": 0.03979312442957104, + "grad_norm": 1.3333054780960083, + "learning_rate": 1.9878419452887542e-05, + "loss": 0.4709, + "step": 327 + }, + { + "epoch": 0.03991481594158808, + "grad_norm": 2.5868492126464844, + "learning_rate": 1.9939209726443772e-05, + "loss": 0.5669, + "step": 328 + }, + { + "epoch": 0.04003650745360511, + "grad_norm": 1.8039932250976562, + "learning_rate": 2e-05, + "loss": 0.5401, + "step": 329 + }, + { + "epoch": 0.04015819896562215, + "grad_norm": 2.1921143531799316, + "learning_rate": 1.999999980973965e-05, + "loss": 0.5032, + "step": 330 + }, + { + "epoch": 0.040279890477639184, + "grad_norm": 2.1752829551696777, + "learning_rate": 1.999999923895861e-05, + "loss": 0.5376, + "step": 331 + }, + { + "epoch": 0.040401581989656224, + "grad_norm": 0.7578356862068176, + "learning_rate": 1.9999998287656898e-05, + "loss": 0.4951, + "step": 332 + }, + { + "epoch": 0.04052327350167326, + "grad_norm": 2.8342578411102295, + "learning_rate": 1.9999996955834554e-05, + "loss": 0.4677, + "step": 333 + }, + { + "epoch": 0.0406449650136903, + "grad_norm": 0.7389137744903564, + "learning_rate": 1.9999995243491626e-05, + "loss": 0.4923, + "step": 334 + }, + { + "epoch": 0.04076665652570733, + "grad_norm": 1.022040843963623, + "learning_rate": 1.999999315062818e-05, + "loss": 0.4689, + "step": 335 + }, + { + "epoch": 0.04088834803772437, + "grad_norm": 0.8664236664772034, + "learning_rate": 1.99999906772443e-05, + "loss": 0.483, + "step": 336 + }, + { + "epoch": 0.0410100395497414, + "grad_norm": 3.2425484657287598, + "learning_rate": 1.999998782334007e-05, + "loss": 0.5375, + "step": 337 + }, + { + "epoch": 0.04113173106175844, + "grad_norm": 0.7734496593475342, + "learning_rate": 1.9999984588915606e-05, + "loss": 0.5087, + "step": 338 + }, + { + "epoch": 0.041253422573775476, + "grad_norm": 1.4330683946609497, + "learning_rate": 1.9999980973971033e-05, + "loss": 0.5776, + "step": 339 + }, + { + "epoch": 0.041375114085792516, + "grad_norm": 5.349335193634033, + "learning_rate": 1.999997697850648e-05, + "loss": 0.5142, + "step": 340 + }, + { + "epoch": 0.04149680559780955, + "grad_norm": 5.650288105010986, + "learning_rate": 1.9999972602522106e-05, + "loss": 0.5403, + "step": 341 + }, + { + "epoch": 0.04161849710982659, + "grad_norm": 6.786245346069336, + "learning_rate": 1.9999967846018074e-05, + "loss": 0.5502, + "step": 342 + }, + { + "epoch": 0.04174018862184363, + "grad_norm": 2.413320302963257, + "learning_rate": 1.999996270899457e-05, + "loss": 0.5304, + "step": 343 + }, + { + "epoch": 0.04186188013386066, + "grad_norm": 1.1327245235443115, + "learning_rate": 1.9999957191451788e-05, + "loss": 0.4999, + "step": 344 + }, + { + "epoch": 0.0419835716458777, + "grad_norm": 2.179643154144287, + "learning_rate": 1.9999951293389933e-05, + "loss": 0.4844, + "step": 345 + }, + { + "epoch": 0.042105263157894736, + "grad_norm": 2.8902554512023926, + "learning_rate": 1.9999945014809234e-05, + "loss": 0.4757, + "step": 346 + }, + { + "epoch": 0.042226954669911776, + "grad_norm": 2.484220027923584, + "learning_rate": 1.9999938355709926e-05, + "loss": 0.5243, + "step": 347 + }, + { + "epoch": 0.04234864618192881, + "grad_norm": 1.1086695194244385, + "learning_rate": 1.999993131609226e-05, + "loss": 0.49, + "step": 348 + }, + { + "epoch": 0.04247033769394585, + "grad_norm": 2.491640329360962, + "learning_rate": 1.9999923895956518e-05, + "loss": 0.4581, + "step": 349 + }, + { + "epoch": 0.04259202920596288, + "grad_norm": 0.9489032626152039, + "learning_rate": 1.9999916095302972e-05, + "loss": 0.4852, + "step": 350 + }, + { + "epoch": 0.04271372071797992, + "grad_norm": 0.7250332236289978, + "learning_rate": 1.9999907914131916e-05, + "loss": 0.4906, + "step": 351 + }, + { + "epoch": 0.042835412229996955, + "grad_norm": 3.0588483810424805, + "learning_rate": 1.9999899352443666e-05, + "loss": 0.5091, + "step": 352 + }, + { + "epoch": 0.042957103742013995, + "grad_norm": 0.782995343208313, + "learning_rate": 1.999989041023855e-05, + "loss": 0.4513, + "step": 353 + }, + { + "epoch": 0.04307879525403103, + "grad_norm": 1.0188297033309937, + "learning_rate": 1.99998810875169e-05, + "loss": 0.497, + "step": 354 + }, + { + "epoch": 0.04320048676604807, + "grad_norm": 1.5506155490875244, + "learning_rate": 1.999987138427908e-05, + "loss": 0.4425, + "step": 355 + }, + { + "epoch": 0.0433221782780651, + "grad_norm": 2.8500266075134277, + "learning_rate": 1.999986130052545e-05, + "loss": 0.5447, + "step": 356 + }, + { + "epoch": 0.04344386979008214, + "grad_norm": 2.310084581375122, + "learning_rate": 1.9999850836256406e-05, + "loss": 0.4612, + "step": 357 + }, + { + "epoch": 0.04356556130209918, + "grad_norm": 3.462616443634033, + "learning_rate": 1.9999839991472337e-05, + "loss": 0.5422, + "step": 358 + }, + { + "epoch": 0.043687252814116215, + "grad_norm": 1.3153492212295532, + "learning_rate": 1.9999828766173655e-05, + "loss": 0.5231, + "step": 359 + }, + { + "epoch": 0.043808944326133255, + "grad_norm": 1.3355218172073364, + "learning_rate": 1.9999817160360794e-05, + "loss": 0.5127, + "step": 360 + }, + { + "epoch": 0.04393063583815029, + "grad_norm": 1.3839240074157715, + "learning_rate": 1.999980517403419e-05, + "loss": 0.5186, + "step": 361 + }, + { + "epoch": 0.04405232735016733, + "grad_norm": 2.0387721061706543, + "learning_rate": 1.9999792807194297e-05, + "loss": 0.4131, + "step": 362 + }, + { + "epoch": 0.04417401886218436, + "grad_norm": 3.998417377471924, + "learning_rate": 1.9999780059841593e-05, + "loss": 0.5405, + "step": 363 + }, + { + "epoch": 0.0442957103742014, + "grad_norm": 1.6171361207962036, + "learning_rate": 1.999976693197656e-05, + "loss": 0.5098, + "step": 364 + }, + { + "epoch": 0.044417401886218434, + "grad_norm": 0.6380708813667297, + "learning_rate": 1.9999753423599696e-05, + "loss": 0.4769, + "step": 365 + }, + { + "epoch": 0.044539093398235474, + "grad_norm": 1.0627521276474, + "learning_rate": 1.9999739534711514e-05, + "loss": 0.5338, + "step": 366 + }, + { + "epoch": 0.04466078491025251, + "grad_norm": 2.228125810623169, + "learning_rate": 1.9999725265312545e-05, + "loss": 0.524, + "step": 367 + }, + { + "epoch": 0.04478247642226955, + "grad_norm": 4.595504283905029, + "learning_rate": 1.9999710615403333e-05, + "loss": 0.5322, + "step": 368 + }, + { + "epoch": 0.04490416793428658, + "grad_norm": 3.422703504562378, + "learning_rate": 1.9999695584984434e-05, + "loss": 0.5009, + "step": 369 + }, + { + "epoch": 0.04502585944630362, + "grad_norm": 0.6527664065361023, + "learning_rate": 1.999968017405642e-05, + "loss": 0.498, + "step": 370 + }, + { + "epoch": 0.04514755095832066, + "grad_norm": 1.485638976097107, + "learning_rate": 1.9999664382619876e-05, + "loss": 0.4686, + "step": 371 + }, + { + "epoch": 0.045269242470337694, + "grad_norm": 2.8725483417510986, + "learning_rate": 1.9999648210675402e-05, + "loss": 0.4766, + "step": 372 + }, + { + "epoch": 0.045390933982354734, + "grad_norm": 2.0454182624816895, + "learning_rate": 1.9999631658223617e-05, + "loss": 0.4705, + "step": 373 + }, + { + "epoch": 0.04551262549437177, + "grad_norm": 1.9818562269210815, + "learning_rate": 1.999961472526515e-05, + "loss": 0.4322, + "step": 374 + }, + { + "epoch": 0.04563431700638881, + "grad_norm": 1.107570767402649, + "learning_rate": 1.9999597411800645e-05, + "loss": 0.5237, + "step": 375 + }, + { + "epoch": 0.04575600851840584, + "grad_norm": 2.3792004585266113, + "learning_rate": 1.999957971783076e-05, + "loss": 0.4749, + "step": 376 + }, + { + "epoch": 0.04587770003042288, + "grad_norm": 6.037851810455322, + "learning_rate": 1.9999561643356168e-05, + "loss": 0.4582, + "step": 377 + }, + { + "epoch": 0.04599939154243991, + "grad_norm": 1.0004757642745972, + "learning_rate": 1.9999543188377557e-05, + "loss": 0.5358, + "step": 378 + }, + { + "epoch": 0.04612108305445695, + "grad_norm": 2.3799378871917725, + "learning_rate": 1.9999524352895633e-05, + "loss": 0.4951, + "step": 379 + }, + { + "epoch": 0.046242774566473986, + "grad_norm": 1.0956833362579346, + "learning_rate": 1.9999505136911106e-05, + "loss": 0.498, + "step": 380 + }, + { + "epoch": 0.046364466078491026, + "grad_norm": 1.9067175388336182, + "learning_rate": 1.999948554042471e-05, + "loss": 0.4516, + "step": 381 + }, + { + "epoch": 0.04648615759050806, + "grad_norm": 2.6854097843170166, + "learning_rate": 1.9999465563437194e-05, + "loss": 0.5042, + "step": 382 + }, + { + "epoch": 0.0466078491025251, + "grad_norm": 2.899186849594116, + "learning_rate": 1.9999445205949315e-05, + "loss": 0.536, + "step": 383 + }, + { + "epoch": 0.04672954061454213, + "grad_norm": 0.5587601661682129, + "learning_rate": 1.999942446796185e-05, + "loss": 0.4765, + "step": 384 + }, + { + "epoch": 0.04685123212655917, + "grad_norm": 2.5319371223449707, + "learning_rate": 1.9999403349475584e-05, + "loss": 0.4526, + "step": 385 + }, + { + "epoch": 0.04697292363857621, + "grad_norm": 2.785966396331787, + "learning_rate": 1.9999381850491323e-05, + "loss": 0.5165, + "step": 386 + }, + { + "epoch": 0.047094615150593246, + "grad_norm": 1.0049335956573486, + "learning_rate": 1.9999359971009885e-05, + "loss": 0.5142, + "step": 387 + }, + { + "epoch": 0.047216306662610286, + "grad_norm": 2.116328477859497, + "learning_rate": 1.9999337711032102e-05, + "loss": 0.513, + "step": 388 + }, + { + "epoch": 0.04733799817462732, + "grad_norm": 0.9692975282669067, + "learning_rate": 1.9999315070558825e-05, + "loss": 0.4365, + "step": 389 + }, + { + "epoch": 0.04745968968664436, + "grad_norm": 0.9520614147186279, + "learning_rate": 1.9999292049590908e-05, + "loss": 0.4283, + "step": 390 + }, + { + "epoch": 0.04758138119866139, + "grad_norm": 1.2361197471618652, + "learning_rate": 1.9999268648129234e-05, + "loss": 0.479, + "step": 391 + }, + { + "epoch": 0.04770307271067843, + "grad_norm": 0.915768563747406, + "learning_rate": 1.9999244866174686e-05, + "loss": 0.4832, + "step": 392 + }, + { + "epoch": 0.047824764222695465, + "grad_norm": 0.9080315232276917, + "learning_rate": 1.999922070372818e-05, + "loss": 0.4433, + "step": 393 + }, + { + "epoch": 0.047946455734712505, + "grad_norm": 1.0250380039215088, + "learning_rate": 1.9999196160790627e-05, + "loss": 0.4607, + "step": 394 + }, + { + "epoch": 0.04806814724672954, + "grad_norm": 1.125348687171936, + "learning_rate": 1.999917123736296e-05, + "loss": 0.5341, + "step": 395 + }, + { + "epoch": 0.04818983875874658, + "grad_norm": 1.0356920957565308, + "learning_rate": 1.999914593344613e-05, + "loss": 0.5035, + "step": 396 + }, + { + "epoch": 0.04831153027076361, + "grad_norm": 1.220845341682434, + "learning_rate": 1.9999120249041108e-05, + "loss": 0.4952, + "step": 397 + }, + { + "epoch": 0.04843322178278065, + "grad_norm": 0.7832775712013245, + "learning_rate": 1.9999094184148852e-05, + "loss": 0.4883, + "step": 398 + }, + { + "epoch": 0.048554913294797684, + "grad_norm": 2.6819679737091064, + "learning_rate": 1.9999067738770376e-05, + "loss": 0.5663, + "step": 399 + }, + { + "epoch": 0.048676604806814724, + "grad_norm": 3.7089293003082275, + "learning_rate": 1.9999040912906668e-05, + "loss": 0.4004, + "step": 400 + }, + { + "epoch": 0.048798296318831764, + "grad_norm": 2.1037650108337402, + "learning_rate": 1.999901370655876e-05, + "loss": 0.5194, + "step": 401 + }, + { + "epoch": 0.0489199878308488, + "grad_norm": 0.7117876410484314, + "learning_rate": 1.999898611972768e-05, + "loss": 0.5096, + "step": 402 + }, + { + "epoch": 0.04904167934286584, + "grad_norm": 1.2061089277267456, + "learning_rate": 1.9998958152414486e-05, + "loss": 0.5562, + "step": 403 + }, + { + "epoch": 0.04916337085488287, + "grad_norm": 1.141405701637268, + "learning_rate": 1.9998929804620234e-05, + "loss": 0.558, + "step": 404 + }, + { + "epoch": 0.04928506236689991, + "grad_norm": 4.762540817260742, + "learning_rate": 1.999890107634601e-05, + "loss": 0.5011, + "step": 405 + }, + { + "epoch": 0.049406753878916944, + "grad_norm": 2.3913159370422363, + "learning_rate": 1.99988719675929e-05, + "loss": 0.5089, + "step": 406 + }, + { + "epoch": 0.049528445390933984, + "grad_norm": 0.647175133228302, + "learning_rate": 1.9998842478362017e-05, + "loss": 0.5114, + "step": 407 + }, + { + "epoch": 0.04965013690295102, + "grad_norm": 0.9456661343574524, + "learning_rate": 1.999881260865448e-05, + "loss": 0.4777, + "step": 408 + }, + { + "epoch": 0.04977182841496806, + "grad_norm": 2.7678298950195312, + "learning_rate": 1.9998782358471428e-05, + "loss": 0.5321, + "step": 409 + }, + { + "epoch": 0.04989351992698509, + "grad_norm": 3.9966847896575928, + "learning_rate": 1.999875172781401e-05, + "loss": 0.5802, + "step": 410 + }, + { + "epoch": 0.05001521143900213, + "grad_norm": 2.654068946838379, + "learning_rate": 1.9998720716683393e-05, + "loss": 0.5437, + "step": 411 + }, + { + "epoch": 0.05013690295101916, + "grad_norm": 2.712520122528076, + "learning_rate": 1.9998689325080754e-05, + "loss": 0.4692, + "step": 412 + }, + { + "epoch": 0.0502585944630362, + "grad_norm": 5.590470314025879, + "learning_rate": 1.9998657553007294e-05, + "loss": 0.4677, + "step": 413 + }, + { + "epoch": 0.05038028597505324, + "grad_norm": 3.3917086124420166, + "learning_rate": 1.9998625400464218e-05, + "loss": 0.5052, + "step": 414 + }, + { + "epoch": 0.050501977487070276, + "grad_norm": 0.8707383275032043, + "learning_rate": 1.9998592867452747e-05, + "loss": 0.5402, + "step": 415 + }, + { + "epoch": 0.050623668999087316, + "grad_norm": 1.3056360483169556, + "learning_rate": 1.9998559953974123e-05, + "loss": 0.4541, + "step": 416 + }, + { + "epoch": 0.05074536051110435, + "grad_norm": 2.2689220905303955, + "learning_rate": 1.9998526660029597e-05, + "loss": 0.4737, + "step": 417 + }, + { + "epoch": 0.05086705202312139, + "grad_norm": 5.5445427894592285, + "learning_rate": 1.9998492985620436e-05, + "loss": 0.5552, + "step": 418 + }, + { + "epoch": 0.05098874353513842, + "grad_norm": 5.148685455322266, + "learning_rate": 1.9998458930747917e-05, + "loss": 0.5547, + "step": 419 + }, + { + "epoch": 0.05111043504715546, + "grad_norm": 2.8303542137145996, + "learning_rate": 1.9998424495413346e-05, + "loss": 0.4769, + "step": 420 + }, + { + "epoch": 0.051232126559172496, + "grad_norm": 1.7225395441055298, + "learning_rate": 1.9998389679618025e-05, + "loss": 0.4278, + "step": 421 + }, + { + "epoch": 0.051353818071189536, + "grad_norm": 1.1106629371643066, + "learning_rate": 1.9998354483363277e-05, + "loss": 0.5342, + "step": 422 + }, + { + "epoch": 0.05147550958320657, + "grad_norm": 1.0146124362945557, + "learning_rate": 1.999831890665045e-05, + "loss": 0.4857, + "step": 423 + }, + { + "epoch": 0.05159720109522361, + "grad_norm": 1.1379413604736328, + "learning_rate": 1.9998282949480893e-05, + "loss": 0.4965, + "step": 424 + }, + { + "epoch": 0.05171889260724064, + "grad_norm": 2.018253803253174, + "learning_rate": 1.9998246611855974e-05, + "loss": 0.4512, + "step": 425 + }, + { + "epoch": 0.05184058411925768, + "grad_norm": 0.7502725124359131, + "learning_rate": 1.9998209893777076e-05, + "loss": 0.4766, + "step": 426 + }, + { + "epoch": 0.051962275631274715, + "grad_norm": 0.723865807056427, + "learning_rate": 1.9998172795245598e-05, + "loss": 0.4393, + "step": 427 + }, + { + "epoch": 0.052083967143291755, + "grad_norm": 3.620739698410034, + "learning_rate": 1.999813531626295e-05, + "loss": 0.5174, + "step": 428 + }, + { + "epoch": 0.052205658655308795, + "grad_norm": 4.749644756317139, + "learning_rate": 1.9998097456830553e-05, + "loss": 0.5757, + "step": 429 + }, + { + "epoch": 0.05232735016732583, + "grad_norm": 0.6498982906341553, + "learning_rate": 1.9998059216949856e-05, + "loss": 0.4976, + "step": 430 + }, + { + "epoch": 0.05244904167934287, + "grad_norm": 1.9424355030059814, + "learning_rate": 1.9998020596622312e-05, + "loss": 0.4655, + "step": 431 + }, + { + "epoch": 0.0525707331913599, + "grad_norm": 1.9933793544769287, + "learning_rate": 1.999798159584939e-05, + "loss": 0.4856, + "step": 432 + }, + { + "epoch": 0.05269242470337694, + "grad_norm": 3.7161357402801514, + "learning_rate": 1.9997942214632574e-05, + "loss": 0.4689, + "step": 433 + }, + { + "epoch": 0.052814116215393975, + "grad_norm": 1.2382320165634155, + "learning_rate": 1.9997902452973358e-05, + "loss": 0.476, + "step": 434 + }, + { + "epoch": 0.052935807727411015, + "grad_norm": 0.6176339387893677, + "learning_rate": 1.999786231087326e-05, + "loss": 0.3707, + "step": 435 + }, + { + "epoch": 0.05305749923942805, + "grad_norm": 6.290051460266113, + "learning_rate": 1.9997821788333812e-05, + "loss": 0.5724, + "step": 436 + }, + { + "epoch": 0.05317919075144509, + "grad_norm": 5.688040733337402, + "learning_rate": 1.9997780885356545e-05, + "loss": 0.5273, + "step": 437 + }, + { + "epoch": 0.05330088226346212, + "grad_norm": 5.579360008239746, + "learning_rate": 1.9997739601943025e-05, + "loss": 0.5466, + "step": 438 + }, + { + "epoch": 0.05342257377547916, + "grad_norm": 6.472201347351074, + "learning_rate": 1.9997697938094815e-05, + "loss": 0.6465, + "step": 439 + }, + { + "epoch": 0.053544265287496194, + "grad_norm": 0.6766566038131714, + "learning_rate": 1.999765589381351e-05, + "loss": 0.4383, + "step": 440 + }, + { + "epoch": 0.053665956799513234, + "grad_norm": 1.6292884349822998, + "learning_rate": 1.9997613469100702e-05, + "loss": 0.4656, + "step": 441 + }, + { + "epoch": 0.053787648311530274, + "grad_norm": 3.057184934616089, + "learning_rate": 1.9997570663958005e-05, + "loss": 0.4397, + "step": 442 + }, + { + "epoch": 0.05390933982354731, + "grad_norm": 0.6015315651893616, + "learning_rate": 1.9997527478387048e-05, + "loss": 0.4972, + "step": 443 + }, + { + "epoch": 0.05403103133556435, + "grad_norm": 0.6792243123054504, + "learning_rate": 1.9997483912389484e-05, + "loss": 0.4833, + "step": 444 + }, + { + "epoch": 0.05415272284758138, + "grad_norm": 1.7965141534805298, + "learning_rate": 1.9997439965966957e-05, + "loss": 0.5224, + "step": 445 + }, + { + "epoch": 0.05427441435959842, + "grad_norm": 2.2070422172546387, + "learning_rate": 1.999739563912115e-05, + "loss": 0.5126, + "step": 446 + }, + { + "epoch": 0.054396105871615454, + "grad_norm": 2.6806740760803223, + "learning_rate": 1.9997350931853745e-05, + "loss": 0.4878, + "step": 447 + }, + { + "epoch": 0.054517797383632494, + "grad_norm": 1.3019781112670898, + "learning_rate": 1.999730584416644e-05, + "loss": 0.5178, + "step": 448 + }, + { + "epoch": 0.05463948889564953, + "grad_norm": 1.6776936054229736, + "learning_rate": 1.9997260376060958e-05, + "loss": 0.5151, + "step": 449 + }, + { + "epoch": 0.05476118040766657, + "grad_norm": 0.9040634036064148, + "learning_rate": 1.9997214527539025e-05, + "loss": 0.5035, + "step": 450 + }, + { + "epoch": 0.0548828719196836, + "grad_norm": 0.9000717997550964, + "learning_rate": 1.999716829860239e-05, + "loss": 0.4203, + "step": 451 + }, + { + "epoch": 0.05500456343170064, + "grad_norm": 4.662450790405273, + "learning_rate": 1.99971216892528e-05, + "loss": 0.5618, + "step": 452 + }, + { + "epoch": 0.05512625494371767, + "grad_norm": 5.027231693267822, + "learning_rate": 1.999707469949204e-05, + "loss": 0.5614, + "step": 453 + }, + { + "epoch": 0.05524794645573471, + "grad_norm": 1.7503198385238647, + "learning_rate": 1.9997027329321896e-05, + "loss": 0.4443, + "step": 454 + }, + { + "epoch": 0.055369637967751746, + "grad_norm": 1.9756243228912354, + "learning_rate": 1.999697957874417e-05, + "loss": 0.5215, + "step": 455 + }, + { + "epoch": 0.055491329479768786, + "grad_norm": 2.088327407836914, + "learning_rate": 1.9996931447760677e-05, + "loss": 0.4871, + "step": 456 + }, + { + "epoch": 0.055613020991785826, + "grad_norm": 4.555768966674805, + "learning_rate": 1.999688293637325e-05, + "loss": 0.5058, + "step": 457 + }, + { + "epoch": 0.05573471250380286, + "grad_norm": 6.4992570877075195, + "learning_rate": 1.9996834044583736e-05, + "loss": 0.5105, + "step": 458 + }, + { + "epoch": 0.0558564040158199, + "grad_norm": 3.1522233486175537, + "learning_rate": 1.999678477239399e-05, + "loss": 0.5332, + "step": 459 + }, + { + "epoch": 0.05597809552783693, + "grad_norm": 1.4650074243545532, + "learning_rate": 1.9996735119805895e-05, + "loss": 0.5653, + "step": 460 + }, + { + "epoch": 0.05609978703985397, + "grad_norm": 0.9694099426269531, + "learning_rate": 1.9996685086821338e-05, + "loss": 0.4524, + "step": 461 + }, + { + "epoch": 0.056221478551871006, + "grad_norm": 1.3509925603866577, + "learning_rate": 1.9996634673442222e-05, + "loss": 0.4799, + "step": 462 + }, + { + "epoch": 0.056343170063888046, + "grad_norm": 1.8713821172714233, + "learning_rate": 1.9996583879670463e-05, + "loss": 0.3705, + "step": 463 + }, + { + "epoch": 0.05646486157590508, + "grad_norm": 3.057955503463745, + "learning_rate": 1.999653270550799e-05, + "loss": 0.4877, + "step": 464 + }, + { + "epoch": 0.05658655308792212, + "grad_norm": 3.9810750484466553, + "learning_rate": 1.9996481150956764e-05, + "loss": 0.4692, + "step": 465 + }, + { + "epoch": 0.05670824459993915, + "grad_norm": 3.1951699256896973, + "learning_rate": 1.9996429216018734e-05, + "loss": 0.49, + "step": 466 + }, + { + "epoch": 0.05682993611195619, + "grad_norm": 0.9979059100151062, + "learning_rate": 1.9996376900695884e-05, + "loss": 0.4406, + "step": 467 + }, + { + "epoch": 0.056951627623973225, + "grad_norm": 1.417478084564209, + "learning_rate": 1.99963242049902e-05, + "loss": 0.465, + "step": 468 + }, + { + "epoch": 0.057073319135990265, + "grad_norm": 1.4806361198425293, + "learning_rate": 1.999627112890369e-05, + "loss": 0.5039, + "step": 469 + }, + { + "epoch": 0.0571950106480073, + "grad_norm": 3.3733386993408203, + "learning_rate": 1.999621767243837e-05, + "loss": 0.4892, + "step": 470 + }, + { + "epoch": 0.05731670216002434, + "grad_norm": 0.9595805406570435, + "learning_rate": 1.9996163835596277e-05, + "loss": 0.5136, + "step": 471 + }, + { + "epoch": 0.05743839367204138, + "grad_norm": 0.77589350938797, + "learning_rate": 1.999610961837946e-05, + "loss": 0.4569, + "step": 472 + }, + { + "epoch": 0.05756008518405841, + "grad_norm": 4.537379264831543, + "learning_rate": 1.9996055020789983e-05, + "loss": 0.5497, + "step": 473 + }, + { + "epoch": 0.05768177669607545, + "grad_norm": 3.5111641883850098, + "learning_rate": 1.999600004282992e-05, + "loss": 0.4577, + "step": 474 + }, + { + "epoch": 0.057803468208092484, + "grad_norm": 3.577052116394043, + "learning_rate": 1.999594468450136e-05, + "loss": 0.5132, + "step": 475 + }, + { + "epoch": 0.057925159720109524, + "grad_norm": 1.4253040552139282, + "learning_rate": 1.9995888945806425e-05, + "loss": 0.4952, + "step": 476 + }, + { + "epoch": 0.05804685123212656, + "grad_norm": 1.8965922594070435, + "learning_rate": 1.9995832826747213e-05, + "loss": 0.4613, + "step": 477 + }, + { + "epoch": 0.0581685427441436, + "grad_norm": 1.0503103733062744, + "learning_rate": 1.999577632732588e-05, + "loss": 0.5271, + "step": 478 + }, + { + "epoch": 0.05829023425616063, + "grad_norm": 1.200190782546997, + "learning_rate": 1.9995719447544567e-05, + "loss": 0.535, + "step": 479 + }, + { + "epoch": 0.05841192576817767, + "grad_norm": 1.953628420829773, + "learning_rate": 1.9995662187405438e-05, + "loss": 0.4963, + "step": 480 + }, + { + "epoch": 0.058533617280194704, + "grad_norm": 1.271244764328003, + "learning_rate": 1.999560454691067e-05, + "loss": 0.4463, + "step": 481 + }, + { + "epoch": 0.058655308792211744, + "grad_norm": 1.5221885442733765, + "learning_rate": 1.9995546526062464e-05, + "loss": 0.4485, + "step": 482 + }, + { + "epoch": 0.05877700030422878, + "grad_norm": 4.1907572746276855, + "learning_rate": 1.999548812486302e-05, + "loss": 0.5146, + "step": 483 + }, + { + "epoch": 0.05889869181624582, + "grad_norm": 1.2595373392105103, + "learning_rate": 1.9995429343314564e-05, + "loss": 0.4258, + "step": 484 + }, + { + "epoch": 0.05902038332826286, + "grad_norm": 2.8437564373016357, + "learning_rate": 1.9995370181419332e-05, + "loss": 0.4847, + "step": 485 + }, + { + "epoch": 0.05914207484027989, + "grad_norm": 2.1599769592285156, + "learning_rate": 1.9995310639179575e-05, + "loss": 0.523, + "step": 486 + }, + { + "epoch": 0.05926376635229693, + "grad_norm": 1.2120065689086914, + "learning_rate": 1.999525071659756e-05, + "loss": 0.5032, + "step": 487 + }, + { + "epoch": 0.05938545786431396, + "grad_norm": 2.0114965438842773, + "learning_rate": 1.9995190413675564e-05, + "loss": 0.5021, + "step": 488 + }, + { + "epoch": 0.059507149376331, + "grad_norm": 2.2819175720214844, + "learning_rate": 1.9995129730415884e-05, + "loss": 0.4898, + "step": 489 + }, + { + "epoch": 0.059628840888348036, + "grad_norm": 2.7193124294281006, + "learning_rate": 1.9995068666820833e-05, + "loss": 0.5523, + "step": 490 + }, + { + "epoch": 0.059750532400365076, + "grad_norm": 1.8687043190002441, + "learning_rate": 1.999500722289273e-05, + "loss": 0.4748, + "step": 491 + }, + { + "epoch": 0.05987222391238211, + "grad_norm": 0.7520543336868286, + "learning_rate": 1.999494539863391e-05, + "loss": 0.4602, + "step": 492 + }, + { + "epoch": 0.05999391542439915, + "grad_norm": 4.535391807556152, + "learning_rate": 1.999488319404673e-05, + "loss": 0.5382, + "step": 493 + }, + { + "epoch": 0.06011560693641618, + "grad_norm": 4.206397533416748, + "learning_rate": 1.9994820609133558e-05, + "loss": 0.5677, + "step": 494 + }, + { + "epoch": 0.06023729844843322, + "grad_norm": 1.6907819509506226, + "learning_rate": 1.999475764389677e-05, + "loss": 0.4298, + "step": 495 + }, + { + "epoch": 0.060358989960450256, + "grad_norm": 4.499179363250732, + "learning_rate": 1.999469429833877e-05, + "loss": 0.5314, + "step": 496 + }, + { + "epoch": 0.060480681472467296, + "grad_norm": 1.034327745437622, + "learning_rate": 1.999463057246196e-05, + "loss": 0.505, + "step": 497 + }, + { + "epoch": 0.06060237298448433, + "grad_norm": 2.9028069972991943, + "learning_rate": 1.999456646626877e-05, + "loss": 0.4879, + "step": 498 + }, + { + "epoch": 0.06072406449650137, + "grad_norm": 2.4188835620880127, + "learning_rate": 1.9994501979761644e-05, + "loss": 0.5334, + "step": 499 + }, + { + "epoch": 0.06084575600851841, + "grad_norm": 6.573634624481201, + "learning_rate": 1.9994437112943025e-05, + "loss": 0.5274, + "step": 500 + }, + { + "epoch": 0.06096744752053544, + "grad_norm": 3.9176599979400635, + "learning_rate": 1.9994371865815388e-05, + "loss": 0.5444, + "step": 501 + }, + { + "epoch": 0.06108913903255248, + "grad_norm": 3.5634405612945557, + "learning_rate": 1.999430623838121e-05, + "loss": 0.4507, + "step": 502 + }, + { + "epoch": 0.061210830544569515, + "grad_norm": 2.5009710788726807, + "learning_rate": 1.9994240230642997e-05, + "loss": 0.3973, + "step": 503 + }, + { + "epoch": 0.061332522056586555, + "grad_norm": 5.271429538726807, + "learning_rate": 1.9994173842603258e-05, + "loss": 0.5523, + "step": 504 + }, + { + "epoch": 0.06145421356860359, + "grad_norm": 3.050410747528076, + "learning_rate": 1.9994107074264516e-05, + "loss": 0.4536, + "step": 505 + }, + { + "epoch": 0.06157590508062063, + "grad_norm": 5.5924177169799805, + "learning_rate": 1.9994039925629313e-05, + "loss": 0.5737, + "step": 506 + }, + { + "epoch": 0.06169759659263766, + "grad_norm": 7.040491104125977, + "learning_rate": 1.9993972396700202e-05, + "loss": 0.6121, + "step": 507 + }, + { + "epoch": 0.0618192881046547, + "grad_norm": 2.6737732887268066, + "learning_rate": 1.9993904487479753e-05, + "loss": 0.4417, + "step": 508 + }, + { + "epoch": 0.061940979616671735, + "grad_norm": 1.7591124773025513, + "learning_rate": 1.9993836197970556e-05, + "loss": 0.4893, + "step": 509 + }, + { + "epoch": 0.062062671128688775, + "grad_norm": 0.7876875400543213, + "learning_rate": 1.999376752817521e-05, + "loss": 0.4941, + "step": 510 + }, + { + "epoch": 0.06218436264070581, + "grad_norm": 2.649890422821045, + "learning_rate": 1.9993698478096315e-05, + "loss": 0.5487, + "step": 511 + }, + { + "epoch": 0.06230605415272285, + "grad_norm": 3.669792413711548, + "learning_rate": 1.999362904773651e-05, + "loss": 0.5138, + "step": 512 + }, + { + "epoch": 0.06242774566473988, + "grad_norm": 4.561140060424805, + "learning_rate": 1.9993559237098436e-05, + "loss": 0.5344, + "step": 513 + }, + { + "epoch": 0.06254943717675693, + "grad_norm": 4.805019855499268, + "learning_rate": 1.9993489046184743e-05, + "loss": 0.4836, + "step": 514 + }, + { + "epoch": 0.06267112868877396, + "grad_norm": 1.5226490497589111, + "learning_rate": 1.9993418474998113e-05, + "loss": 0.5059, + "step": 515 + }, + { + "epoch": 0.062792820200791, + "grad_norm": 0.5332703590393066, + "learning_rate": 1.999334752354122e-05, + "loss": 0.4763, + "step": 516 + }, + { + "epoch": 0.06291451171280803, + "grad_norm": 2.0380637645721436, + "learning_rate": 1.999327619181677e-05, + "loss": 0.48, + "step": 517 + }, + { + "epoch": 0.06303620322482507, + "grad_norm": 3.6975255012512207, + "learning_rate": 1.9993204479827476e-05, + "loss": 0.4883, + "step": 518 + }, + { + "epoch": 0.06315789473684211, + "grad_norm": 5.069300174713135, + "learning_rate": 1.9993132387576067e-05, + "loss": 0.5322, + "step": 519 + }, + { + "epoch": 0.06327958624885914, + "grad_norm": 5.06345272064209, + "learning_rate": 1.9993059915065286e-05, + "loss": 0.5796, + "step": 520 + }, + { + "epoch": 0.06340127776087617, + "grad_norm": 0.6309605240821838, + "learning_rate": 1.9992987062297892e-05, + "loss": 0.4647, + "step": 521 + }, + { + "epoch": 0.06352296927289322, + "grad_norm": 1.317283272743225, + "learning_rate": 1.999291382927665e-05, + "loss": 0.4892, + "step": 522 + }, + { + "epoch": 0.06364466078491025, + "grad_norm": 2.677121639251709, + "learning_rate": 1.9992840216004358e-05, + "loss": 0.5097, + "step": 523 + }, + { + "epoch": 0.06376635229692729, + "grad_norm": 5.121943950653076, + "learning_rate": 1.999276622248381e-05, + "loss": 0.4887, + "step": 524 + }, + { + "epoch": 0.06388804380894432, + "grad_norm": 5.404939651489258, + "learning_rate": 1.9992691848717826e-05, + "loss": 0.4744, + "step": 525 + }, + { + "epoch": 0.06400973532096137, + "grad_norm": 2.4270565509796143, + "learning_rate": 1.9992617094709233e-05, + "loss": 0.5326, + "step": 526 + }, + { + "epoch": 0.0641314268329784, + "grad_norm": 2.0383589267730713, + "learning_rate": 1.999254196046087e-05, + "loss": 0.4906, + "step": 527 + }, + { + "epoch": 0.06425311834499543, + "grad_norm": 2.442333459854126, + "learning_rate": 1.999246644597561e-05, + "loss": 0.5293, + "step": 528 + }, + { + "epoch": 0.06437480985701248, + "grad_norm": 3.750847816467285, + "learning_rate": 1.9992390551256314e-05, + "loss": 0.5635, + "step": 529 + }, + { + "epoch": 0.06449650136902951, + "grad_norm": 1.1939170360565186, + "learning_rate": 1.9992314276305874e-05, + "loss": 0.4542, + "step": 530 + }, + { + "epoch": 0.06461819288104655, + "grad_norm": 0.9305385947227478, + "learning_rate": 1.9992237621127196e-05, + "loss": 0.4437, + "step": 531 + }, + { + "epoch": 0.06473988439306358, + "grad_norm": 1.619693398475647, + "learning_rate": 1.999216058572319e-05, + "loss": 0.4907, + "step": 532 + }, + { + "epoch": 0.06486157590508063, + "grad_norm": 1.302485704421997, + "learning_rate": 1.9992083170096794e-05, + "loss": 0.4165, + "step": 533 + }, + { + "epoch": 0.06498326741709766, + "grad_norm": 1.965978741645813, + "learning_rate": 1.999200537425095e-05, + "loss": 0.5254, + "step": 534 + }, + { + "epoch": 0.06510495892911469, + "grad_norm": 0.8912264704704285, + "learning_rate": 1.9991927198188618e-05, + "loss": 0.514, + "step": 535 + }, + { + "epoch": 0.06522665044113173, + "grad_norm": 1.8724647760391235, + "learning_rate": 1.9991848641912774e-05, + "loss": 0.4674, + "step": 536 + }, + { + "epoch": 0.06534834195314877, + "grad_norm": 2.7350761890411377, + "learning_rate": 1.999176970542641e-05, + "loss": 0.4574, + "step": 537 + }, + { + "epoch": 0.0654700334651658, + "grad_norm": 0.8851805925369263, + "learning_rate": 1.9991690388732527e-05, + "loss": 0.5187, + "step": 538 + }, + { + "epoch": 0.06559172497718284, + "grad_norm": 0.7318903803825378, + "learning_rate": 1.9991610691834137e-05, + "loss": 0.493, + "step": 539 + }, + { + "epoch": 0.06571341648919987, + "grad_norm": 3.4889729022979736, + "learning_rate": 1.9991530614734285e-05, + "loss": 0.4216, + "step": 540 + }, + { + "epoch": 0.06583510800121692, + "grad_norm": 2.57153582572937, + "learning_rate": 1.9991450157436008e-05, + "loss": 0.4032, + "step": 541 + }, + { + "epoch": 0.06595679951323395, + "grad_norm": 1.5060590505599976, + "learning_rate": 1.9991369319942374e-05, + "loss": 0.4741, + "step": 542 + }, + { + "epoch": 0.06607849102525098, + "grad_norm": 2.0602190494537354, + "learning_rate": 1.9991288102256453e-05, + "loss": 0.4927, + "step": 543 + }, + { + "epoch": 0.06620018253726803, + "grad_norm": 2.9015443325042725, + "learning_rate": 1.9991206504381343e-05, + "loss": 0.5037, + "step": 544 + }, + { + "epoch": 0.06632187404928507, + "grad_norm": 0.7738814353942871, + "learning_rate": 1.9991124526320142e-05, + "loss": 0.4696, + "step": 545 + }, + { + "epoch": 0.0664435655613021, + "grad_norm": 3.829948902130127, + "learning_rate": 1.9991042168075972e-05, + "loss": 0.3952, + "step": 546 + }, + { + "epoch": 0.06656525707331913, + "grad_norm": 0.9256926774978638, + "learning_rate": 1.999095942965197e-05, + "loss": 0.5364, + "step": 547 + }, + { + "epoch": 0.06668694858533618, + "grad_norm": 2.401303768157959, + "learning_rate": 1.999087631105128e-05, + "loss": 0.433, + "step": 548 + }, + { + "epoch": 0.06680864009735321, + "grad_norm": 1.4442120790481567, + "learning_rate": 1.9990792812277068e-05, + "loss": 0.4271, + "step": 549 + }, + { + "epoch": 0.06693033160937024, + "grad_norm": 3.9047372341156006, + "learning_rate": 1.9990708933332506e-05, + "loss": 0.542, + "step": 550 + }, + { + "epoch": 0.06705202312138728, + "grad_norm": 4.522757530212402, + "learning_rate": 1.9990624674220794e-05, + "loss": 0.5684, + "step": 551 + }, + { + "epoch": 0.06717371463340432, + "grad_norm": 5.236936092376709, + "learning_rate": 1.999054003494513e-05, + "loss": 0.5813, + "step": 552 + }, + { + "epoch": 0.06729540614542136, + "grad_norm": 3.983177423477173, + "learning_rate": 1.9990455015508738e-05, + "loss": 0.5217, + "step": 553 + }, + { + "epoch": 0.06741709765743839, + "grad_norm": 0.9311882257461548, + "learning_rate": 1.9990369615914854e-05, + "loss": 0.5054, + "step": 554 + }, + { + "epoch": 0.06753878916945542, + "grad_norm": 2.2126944065093994, + "learning_rate": 1.9990283836166732e-05, + "loss": 0.4941, + "step": 555 + }, + { + "epoch": 0.06766048068147247, + "grad_norm": 3.741231679916382, + "learning_rate": 1.9990197676267624e-05, + "loss": 0.5134, + "step": 556 + }, + { + "epoch": 0.0677821721934895, + "grad_norm": 4.3026628494262695, + "learning_rate": 1.999011113622082e-05, + "loss": 0.498, + "step": 557 + }, + { + "epoch": 0.06790386370550654, + "grad_norm": 1.644929051399231, + "learning_rate": 1.999002421602961e-05, + "loss": 0.5626, + "step": 558 + }, + { + "epoch": 0.06802555521752358, + "grad_norm": 0.7731117010116577, + "learning_rate": 1.9989936915697295e-05, + "loss": 0.5289, + "step": 559 + }, + { + "epoch": 0.06814724672954062, + "grad_norm": 1.6183112859725952, + "learning_rate": 1.998984923522721e-05, + "loss": 0.4713, + "step": 560 + }, + { + "epoch": 0.06826893824155765, + "grad_norm": 1.2894877195358276, + "learning_rate": 1.998976117462268e-05, + "loss": 0.428, + "step": 561 + }, + { + "epoch": 0.06839062975357468, + "grad_norm": 1.8582295179367065, + "learning_rate": 1.9989672733887058e-05, + "loss": 0.4538, + "step": 562 + }, + { + "epoch": 0.06851232126559173, + "grad_norm": 3.921905517578125, + "learning_rate": 1.998958391302371e-05, + "loss": 0.4928, + "step": 563 + }, + { + "epoch": 0.06863401277760876, + "grad_norm": 3.756831169128418, + "learning_rate": 1.998949471203602e-05, + "loss": 0.4769, + "step": 564 + }, + { + "epoch": 0.0687557042896258, + "grad_norm": 1.5508170127868652, + "learning_rate": 1.998940513092738e-05, + "loss": 0.4209, + "step": 565 + }, + { + "epoch": 0.06887739580164283, + "grad_norm": 2.5746004581451416, + "learning_rate": 1.9989315169701197e-05, + "loss": 0.4748, + "step": 566 + }, + { + "epoch": 0.06899908731365988, + "grad_norm": 3.5863168239593506, + "learning_rate": 1.9989224828360893e-05, + "loss": 0.4375, + "step": 567 + }, + { + "epoch": 0.06912077882567691, + "grad_norm": 4.102794647216797, + "learning_rate": 1.9989134106909908e-05, + "loss": 0.4298, + "step": 568 + }, + { + "epoch": 0.06924247033769394, + "grad_norm": 4.051459789276123, + "learning_rate": 1.9989043005351695e-05, + "loss": 0.4173, + "step": 569 + }, + { + "epoch": 0.06936416184971098, + "grad_norm": 2.05891489982605, + "learning_rate": 1.9988951523689718e-05, + "loss": 0.4662, + "step": 570 + }, + { + "epoch": 0.06948585336172802, + "grad_norm": 1.4780614376068115, + "learning_rate": 1.9988859661927465e-05, + "loss": 0.4957, + "step": 571 + }, + { + "epoch": 0.06960754487374506, + "grad_norm": 2.591366767883301, + "learning_rate": 1.998876742006842e-05, + "loss": 0.5035, + "step": 572 + }, + { + "epoch": 0.06972923638576209, + "grad_norm": 0.8730366826057434, + "learning_rate": 1.99886747981161e-05, + "loss": 0.3829, + "step": 573 + }, + { + "epoch": 0.06985092789777914, + "grad_norm": 2.2657363414764404, + "learning_rate": 1.998858179607403e-05, + "loss": 0.4586, + "step": 574 + }, + { + "epoch": 0.06997261940979617, + "grad_norm": 3.80833101272583, + "learning_rate": 1.9988488413945747e-05, + "loss": 0.4622, + "step": 575 + }, + { + "epoch": 0.0700943109218132, + "grad_norm": 1.078434944152832, + "learning_rate": 1.9988394651734804e-05, + "loss": 0.4044, + "step": 576 + }, + { + "epoch": 0.07021600243383024, + "grad_norm": 2.571518659591675, + "learning_rate": 1.998830050944477e-05, + "loss": 0.5679, + "step": 577 + }, + { + "epoch": 0.07033769394584728, + "grad_norm": 3.9330830574035645, + "learning_rate": 1.9988205987079227e-05, + "loss": 0.4345, + "step": 578 + }, + { + "epoch": 0.07045938545786432, + "grad_norm": 5.81560754776001, + "learning_rate": 1.9988111084641772e-05, + "loss": 0.4732, + "step": 579 + }, + { + "epoch": 0.07058107696988135, + "grad_norm": 6.2755351066589355, + "learning_rate": 1.9988015802136017e-05, + "loss": 0.4984, + "step": 580 + }, + { + "epoch": 0.07070276848189838, + "grad_norm": 2.3427951335906982, + "learning_rate": 1.9987920139565585e-05, + "loss": 0.5199, + "step": 581 + }, + { + "epoch": 0.07082445999391543, + "grad_norm": 1.0140855312347412, + "learning_rate": 1.998782409693412e-05, + "loss": 0.5637, + "step": 582 + }, + { + "epoch": 0.07094615150593246, + "grad_norm": 0.869575560092926, + "learning_rate": 1.998772767424527e-05, + "loss": 0.5076, + "step": 583 + }, + { + "epoch": 0.0710678430179495, + "grad_norm": 3.894951105117798, + "learning_rate": 1.9987630871502713e-05, + "loss": 0.5494, + "step": 584 + }, + { + "epoch": 0.07118953452996654, + "grad_norm": 0.6733858585357666, + "learning_rate": 1.9987533688710124e-05, + "loss": 0.506, + "step": 585 + }, + { + "epoch": 0.07131122604198357, + "grad_norm": 3.8077239990234375, + "learning_rate": 1.998743612587121e-05, + "loss": 0.5401, + "step": 586 + }, + { + "epoch": 0.07143291755400061, + "grad_norm": 1.1248000860214233, + "learning_rate": 1.9987338182989676e-05, + "loss": 0.5047, + "step": 587 + }, + { + "epoch": 0.07155460906601764, + "grad_norm": 2.5931150913238525, + "learning_rate": 1.9987239860069253e-05, + "loss": 0.4814, + "step": 588 + }, + { + "epoch": 0.07167630057803469, + "grad_norm": 2.2741830348968506, + "learning_rate": 1.9987141157113677e-05, + "loss": 0.4806, + "step": 589 + }, + { + "epoch": 0.07179799209005172, + "grad_norm": 1.1695295572280884, + "learning_rate": 1.998704207412671e-05, + "loss": 0.4837, + "step": 590 + }, + { + "epoch": 0.07191968360206875, + "grad_norm": 0.9373350739479065, + "learning_rate": 1.998694261111212e-05, + "loss": 0.5446, + "step": 591 + }, + { + "epoch": 0.07204137511408579, + "grad_norm": 1.958163857460022, + "learning_rate": 1.9986842768073693e-05, + "loss": 0.426, + "step": 592 + }, + { + "epoch": 0.07216306662610283, + "grad_norm": 2.700521945953369, + "learning_rate": 1.9986742545015226e-05, + "loss": 0.4952, + "step": 593 + }, + { + "epoch": 0.07228475813811987, + "grad_norm": 1.9372899532318115, + "learning_rate": 1.9986641941940534e-05, + "loss": 0.4693, + "step": 594 + }, + { + "epoch": 0.0724064496501369, + "grad_norm": 2.725580930709839, + "learning_rate": 1.9986540958853445e-05, + "loss": 0.5572, + "step": 595 + }, + { + "epoch": 0.07252814116215393, + "grad_norm": 1.3791035413742065, + "learning_rate": 1.9986439595757803e-05, + "loss": 0.5472, + "step": 596 + }, + { + "epoch": 0.07264983267417098, + "grad_norm": 5.115930557250977, + "learning_rate": 1.998633785265746e-05, + "loss": 0.4827, + "step": 597 + }, + { + "epoch": 0.07277152418618801, + "grad_norm": 1.7975986003875732, + "learning_rate": 1.9986235729556292e-05, + "loss": 0.5776, + "step": 598 + }, + { + "epoch": 0.07289321569820505, + "grad_norm": 4.306365013122559, + "learning_rate": 1.9986133226458187e-05, + "loss": 0.5265, + "step": 599 + }, + { + "epoch": 0.0730149072102221, + "grad_norm": 4.6414713859558105, + "learning_rate": 1.998603034336704e-05, + "loss": 0.4969, + "step": 600 + }, + { + "epoch": 0.07313659872223913, + "grad_norm": 2.895811080932617, + "learning_rate": 1.998592708028677e-05, + "loss": 0.4529, + "step": 601 + }, + { + "epoch": 0.07325829023425616, + "grad_norm": 0.918133020401001, + "learning_rate": 1.9985823437221305e-05, + "loss": 0.4501, + "step": 602 + }, + { + "epoch": 0.07337998174627319, + "grad_norm": 4.656303405761719, + "learning_rate": 1.9985719414174584e-05, + "loss": 0.5325, + "step": 603 + }, + { + "epoch": 0.07350167325829024, + "grad_norm": 5.414013862609863, + "learning_rate": 1.9985615011150577e-05, + "loss": 0.5515, + "step": 604 + }, + { + "epoch": 0.07362336477030727, + "grad_norm": 3.049126148223877, + "learning_rate": 1.9985510228153245e-05, + "loss": 0.4249, + "step": 605 + }, + { + "epoch": 0.0737450562823243, + "grad_norm": 2.161055564880371, + "learning_rate": 1.9985405065186582e-05, + "loss": 0.4104, + "step": 606 + }, + { + "epoch": 0.07386674779434134, + "grad_norm": 5.049049377441406, + "learning_rate": 1.9985299522254586e-05, + "loss": 0.5592, + "step": 607 + }, + { + "epoch": 0.07398843930635839, + "grad_norm": 2.2185564041137695, + "learning_rate": 1.9985193599361276e-05, + "loss": 0.5075, + "step": 608 + }, + { + "epoch": 0.07411013081837542, + "grad_norm": 0.7173896431922913, + "learning_rate": 1.998508729651068e-05, + "loss": 0.5072, + "step": 609 + }, + { + "epoch": 0.07423182233039245, + "grad_norm": 5.2295403480529785, + "learning_rate": 1.9984980613706847e-05, + "loss": 0.467, + "step": 610 + }, + { + "epoch": 0.07435351384240949, + "grad_norm": 4.912859916687012, + "learning_rate": 1.9984873550953833e-05, + "loss": 0.5044, + "step": 611 + }, + { + "epoch": 0.07447520535442653, + "grad_norm": 6.189967155456543, + "learning_rate": 1.9984766108255712e-05, + "loss": 0.5105, + "step": 612 + }, + { + "epoch": 0.07459689686644357, + "grad_norm": 6.655056476593018, + "learning_rate": 1.9984658285616573e-05, + "loss": 0.5368, + "step": 613 + }, + { + "epoch": 0.0747185883784606, + "grad_norm": 2.5998427867889404, + "learning_rate": 1.9984550083040516e-05, + "loss": 0.4911, + "step": 614 + }, + { + "epoch": 0.07484027989047765, + "grad_norm": 3.477908134460449, + "learning_rate": 1.9984441500531667e-05, + "loss": 0.438, + "step": 615 + }, + { + "epoch": 0.07496197140249468, + "grad_norm": 0.9623250961303711, + "learning_rate": 1.9984332538094148e-05, + "loss": 0.4336, + "step": 616 + }, + { + "epoch": 0.07508366291451171, + "grad_norm": 3.307692050933838, + "learning_rate": 1.9984223195732113e-05, + "loss": 0.4819, + "step": 617 + }, + { + "epoch": 0.07520535442652874, + "grad_norm": 3.5812041759490967, + "learning_rate": 1.998411347344972e-05, + "loss": 0.4711, + "step": 618 + }, + { + "epoch": 0.07532704593854579, + "grad_norm": 4.81799840927124, + "learning_rate": 1.998400337125114e-05, + "loss": 0.5442, + "step": 619 + }, + { + "epoch": 0.07544873745056282, + "grad_norm": 4.450498104095459, + "learning_rate": 1.998389288914057e-05, + "loss": 0.5491, + "step": 620 + }, + { + "epoch": 0.07557042896257986, + "grad_norm": 0.8603129982948303, + "learning_rate": 1.9983782027122206e-05, + "loss": 0.509, + "step": 621 + }, + { + "epoch": 0.07569212047459689, + "grad_norm": 2.767587900161743, + "learning_rate": 1.998367078520027e-05, + "loss": 0.484, + "step": 622 + }, + { + "epoch": 0.07581381198661394, + "grad_norm": 1.7696137428283691, + "learning_rate": 1.9983559163379e-05, + "loss": 0.5173, + "step": 623 + }, + { + "epoch": 0.07593550349863097, + "grad_norm": 2.1982898712158203, + "learning_rate": 1.9983447161662636e-05, + "loss": 0.5144, + "step": 624 + }, + { + "epoch": 0.076057195010648, + "grad_norm": 0.9459839463233948, + "learning_rate": 1.9983334780055442e-05, + "loss": 0.4982, + "step": 625 + }, + { + "epoch": 0.07617888652266504, + "grad_norm": 1.8358972072601318, + "learning_rate": 1.9983222018561696e-05, + "loss": 0.4644, + "step": 626 + }, + { + "epoch": 0.07630057803468208, + "grad_norm": 2.825239896774292, + "learning_rate": 1.9983108877185687e-05, + "loss": 0.4953, + "step": 627 + }, + { + "epoch": 0.07642226954669912, + "grad_norm": 2.4302449226379395, + "learning_rate": 1.9982995355931726e-05, + "loss": 0.4941, + "step": 628 + }, + { + "epoch": 0.07654396105871615, + "grad_norm": 2.003037691116333, + "learning_rate": 1.9982881454804125e-05, + "loss": 0.4765, + "step": 629 + }, + { + "epoch": 0.0766656525707332, + "grad_norm": 1.2302556037902832, + "learning_rate": 1.9982767173807218e-05, + "loss": 0.3842, + "step": 630 + }, + { + "epoch": 0.07678734408275023, + "grad_norm": 1.5501840114593506, + "learning_rate": 1.998265251294536e-05, + "loss": 0.4991, + "step": 631 + }, + { + "epoch": 0.07690903559476726, + "grad_norm": 1.1216791868209839, + "learning_rate": 1.9982537472222913e-05, + "loss": 0.4981, + "step": 632 + }, + { + "epoch": 0.0770307271067843, + "grad_norm": 0.7789998650550842, + "learning_rate": 1.998242205164425e-05, + "loss": 0.5166, + "step": 633 + }, + { + "epoch": 0.07715241861880134, + "grad_norm": 1.4345245361328125, + "learning_rate": 1.9982306251213767e-05, + "loss": 0.5435, + "step": 634 + }, + { + "epoch": 0.07727411013081838, + "grad_norm": 4.756383419036865, + "learning_rate": 1.9982190070935864e-05, + "loss": 0.461, + "step": 635 + }, + { + "epoch": 0.07739580164283541, + "grad_norm": 2.975379228591919, + "learning_rate": 1.998207351081497e-05, + "loss": 0.4201, + "step": 636 + }, + { + "epoch": 0.07751749315485244, + "grad_norm": 1.7346969842910767, + "learning_rate": 1.998195657085552e-05, + "loss": 0.436, + "step": 637 + }, + { + "epoch": 0.07763918466686949, + "grad_norm": 2.7414207458496094, + "learning_rate": 1.9981839251061957e-05, + "loss": 0.5508, + "step": 638 + }, + { + "epoch": 0.07776087617888652, + "grad_norm": 2.9115066528320312, + "learning_rate": 1.9981721551438757e-05, + "loss": 0.4698, + "step": 639 + }, + { + "epoch": 0.07788256769090356, + "grad_norm": 0.6065772175788879, + "learning_rate": 1.998160347199038e-05, + "loss": 0.4355, + "step": 640 + }, + { + "epoch": 0.07800425920292059, + "grad_norm": 3.2246828079223633, + "learning_rate": 1.9981485012721338e-05, + "loss": 0.5056, + "step": 641 + }, + { + "epoch": 0.07812595071493764, + "grad_norm": 1.8916140794754028, + "learning_rate": 1.9981366173636125e-05, + "loss": 0.4789, + "step": 642 + }, + { + "epoch": 0.07824764222695467, + "grad_norm": 1.258357048034668, + "learning_rate": 1.9981246954739272e-05, + "loss": 0.5108, + "step": 643 + }, + { + "epoch": 0.0783693337389717, + "grad_norm": 5.186991214752197, + "learning_rate": 1.998112735603531e-05, + "loss": 0.4405, + "step": 644 + }, + { + "epoch": 0.07849102525098875, + "grad_norm": 4.149471282958984, + "learning_rate": 1.9981007377528795e-05, + "loss": 0.4755, + "step": 645 + }, + { + "epoch": 0.07861271676300578, + "grad_norm": 1.185378074645996, + "learning_rate": 1.998088701922429e-05, + "loss": 0.5405, + "step": 646 + }, + { + "epoch": 0.07873440827502282, + "grad_norm": 4.87866735458374, + "learning_rate": 1.9980766281126373e-05, + "loss": 0.4716, + "step": 647 + }, + { + "epoch": 0.07885609978703985, + "grad_norm": 2.766418933868408, + "learning_rate": 1.998064516323964e-05, + "loss": 0.4536, + "step": 648 + }, + { + "epoch": 0.0789777912990569, + "grad_norm": 3.7572224140167236, + "learning_rate": 1.9980523665568704e-05, + "loss": 0.5514, + "step": 649 + }, + { + "epoch": 0.07909948281107393, + "grad_norm": 3.2041800022125244, + "learning_rate": 1.998040178811818e-05, + "loss": 0.5101, + "step": 650 + }, + { + "epoch": 0.07922117432309096, + "grad_norm": 0.7075926661491394, + "learning_rate": 1.9980279530892708e-05, + "loss": 0.4541, + "step": 651 + }, + { + "epoch": 0.079342865835108, + "grad_norm": 2.305121898651123, + "learning_rate": 1.998015689389694e-05, + "loss": 0.4913, + "step": 652 + }, + { + "epoch": 0.07946455734712504, + "grad_norm": 3.071054697036743, + "learning_rate": 1.9980033877135548e-05, + "loss": 0.5009, + "step": 653 + }, + { + "epoch": 0.07958624885914208, + "grad_norm": 1.4548903703689575, + "learning_rate": 1.9979910480613212e-05, + "loss": 0.4932, + "step": 654 + }, + { + "epoch": 0.07970794037115911, + "grad_norm": 1.6935406923294067, + "learning_rate": 1.9979786704334624e-05, + "loss": 0.4554, + "step": 655 + }, + { + "epoch": 0.07982963188317616, + "grad_norm": 0.7648898363113403, + "learning_rate": 1.9979662548304492e-05, + "loss": 0.5108, + "step": 656 + }, + { + "epoch": 0.07995132339519319, + "grad_norm": 4.780690670013428, + "learning_rate": 1.9979538012527544e-05, + "loss": 0.4358, + "step": 657 + }, + { + "epoch": 0.08007301490721022, + "grad_norm": 2.169375419616699, + "learning_rate": 1.9979413097008518e-05, + "loss": 0.4821, + "step": 658 + }, + { + "epoch": 0.08019470641922725, + "grad_norm": 3.0978429317474365, + "learning_rate": 1.997928780175217e-05, + "loss": 0.487, + "step": 659 + }, + { + "epoch": 0.0803163979312443, + "grad_norm": 1.2815518379211426, + "learning_rate": 1.997916212676326e-05, + "loss": 0.5157, + "step": 660 + }, + { + "epoch": 0.08043808944326133, + "grad_norm": 0.915164053440094, + "learning_rate": 1.997903607204658e-05, + "loss": 0.472, + "step": 661 + }, + { + "epoch": 0.08055978095527837, + "grad_norm": 2.605832099914551, + "learning_rate": 1.997890963760692e-05, + "loss": 0.521, + "step": 662 + }, + { + "epoch": 0.0806814724672954, + "grad_norm": 1.9893995523452759, + "learning_rate": 1.9978782823449094e-05, + "loss": 0.481, + "step": 663 + }, + { + "epoch": 0.08080316397931245, + "grad_norm": 2.0475409030914307, + "learning_rate": 1.997865562957793e-05, + "loss": 0.4323, + "step": 664 + }, + { + "epoch": 0.08092485549132948, + "grad_norm": 1.4584271907806396, + "learning_rate": 1.9978528055998258e-05, + "loss": 0.4443, + "step": 665 + }, + { + "epoch": 0.08104654700334651, + "grad_norm": 1.6210416555404663, + "learning_rate": 1.997840010271494e-05, + "loss": 0.4888, + "step": 666 + }, + { + "epoch": 0.08116823851536355, + "grad_norm": 1.6450836658477783, + "learning_rate": 1.9978271769732848e-05, + "loss": 0.4643, + "step": 667 + }, + { + "epoch": 0.0812899300273806, + "grad_norm": 1.25798499584198, + "learning_rate": 1.997814305705686e-05, + "loss": 0.4872, + "step": 668 + }, + { + "epoch": 0.08141162153939763, + "grad_norm": 1.6350147724151611, + "learning_rate": 1.9978013964691875e-05, + "loss": 0.4669, + "step": 669 + }, + { + "epoch": 0.08153331305141466, + "grad_norm": 1.0698541402816772, + "learning_rate": 1.9977884492642802e-05, + "loss": 0.4882, + "step": 670 + }, + { + "epoch": 0.08165500456343171, + "grad_norm": 3.3107619285583496, + "learning_rate": 1.9977754640914572e-05, + "loss": 0.4375, + "step": 671 + }, + { + "epoch": 0.08177669607544874, + "grad_norm": 4.443649768829346, + "learning_rate": 1.9977624409512128e-05, + "loss": 0.4511, + "step": 672 + }, + { + "epoch": 0.08189838758746577, + "grad_norm": 4.450760364532471, + "learning_rate": 1.9977493798440423e-05, + "loss": 0.4579, + "step": 673 + }, + { + "epoch": 0.0820200790994828, + "grad_norm": 0.8068259358406067, + "learning_rate": 1.9977362807704424e-05, + "loss": 0.4998, + "step": 674 + }, + { + "epoch": 0.08214177061149985, + "grad_norm": 1.5836198329925537, + "learning_rate": 1.9977231437309118e-05, + "loss": 0.526, + "step": 675 + }, + { + "epoch": 0.08226346212351689, + "grad_norm": 3.4682936668395996, + "learning_rate": 1.9977099687259507e-05, + "loss": 0.5666, + "step": 676 + }, + { + "epoch": 0.08238515363553392, + "grad_norm": 4.291449069976807, + "learning_rate": 1.9976967557560598e-05, + "loss": 0.5683, + "step": 677 + }, + { + "epoch": 0.08250684514755095, + "grad_norm": 1.4395569562911987, + "learning_rate": 1.9976835048217422e-05, + "loss": 0.4415, + "step": 678 + }, + { + "epoch": 0.082628536659568, + "grad_norm": 0.5459081530570984, + "learning_rate": 1.9976702159235025e-05, + "loss": 0.4887, + "step": 679 + }, + { + "epoch": 0.08275022817158503, + "grad_norm": 0.7991795539855957, + "learning_rate": 1.9976568890618458e-05, + "loss": 0.5233, + "step": 680 + }, + { + "epoch": 0.08287191968360207, + "grad_norm": 2.7139601707458496, + "learning_rate": 1.9976435242372796e-05, + "loss": 0.4678, + "step": 681 + }, + { + "epoch": 0.0829936111956191, + "grad_norm": 1.2524338960647583, + "learning_rate": 1.997630121450312e-05, + "loss": 0.4884, + "step": 682 + }, + { + "epoch": 0.08311530270763615, + "grad_norm": 0.6839807033538818, + "learning_rate": 1.9976166807014534e-05, + "loss": 0.4555, + "step": 683 + }, + { + "epoch": 0.08323699421965318, + "grad_norm": 2.2753326892852783, + "learning_rate": 1.9976032019912153e-05, + "loss": 0.5509, + "step": 684 + }, + { + "epoch": 0.08335868573167021, + "grad_norm": 0.8514805436134338, + "learning_rate": 1.9975896853201104e-05, + "loss": 0.4656, + "step": 685 + }, + { + "epoch": 0.08348037724368726, + "grad_norm": 0.7678331732749939, + "learning_rate": 1.997576130688653e-05, + "loss": 0.4715, + "step": 686 + }, + { + "epoch": 0.08360206875570429, + "grad_norm": 0.6587056517601013, + "learning_rate": 1.997562538097359e-05, + "loss": 0.4708, + "step": 687 + }, + { + "epoch": 0.08372376026772133, + "grad_norm": 1.8988431692123413, + "learning_rate": 1.9975489075467456e-05, + "loss": 0.5027, + "step": 688 + }, + { + "epoch": 0.08384545177973836, + "grad_norm": 1.1294777393341064, + "learning_rate": 1.9975352390373315e-05, + "loss": 0.4647, + "step": 689 + }, + { + "epoch": 0.0839671432917554, + "grad_norm": 2.2918758392333984, + "learning_rate": 1.9975215325696368e-05, + "loss": 0.4432, + "step": 690 + }, + { + "epoch": 0.08408883480377244, + "grad_norm": 0.8133344650268555, + "learning_rate": 1.997507788144183e-05, + "loss": 0.4883, + "step": 691 + }, + { + "epoch": 0.08421052631578947, + "grad_norm": 0.8164210319519043, + "learning_rate": 1.9974940057614927e-05, + "loss": 0.4511, + "step": 692 + }, + { + "epoch": 0.0843322178278065, + "grad_norm": 2.617898464202881, + "learning_rate": 1.9974801854220913e-05, + "loss": 0.56, + "step": 693 + }, + { + "epoch": 0.08445390933982355, + "grad_norm": 0.9751055240631104, + "learning_rate": 1.997466327126504e-05, + "loss": 0.4568, + "step": 694 + }, + { + "epoch": 0.08457560085184058, + "grad_norm": 0.8269926905632019, + "learning_rate": 1.997452430875258e-05, + "loss": 0.4883, + "step": 695 + }, + { + "epoch": 0.08469729236385762, + "grad_norm": 2.613527774810791, + "learning_rate": 1.9974384966688832e-05, + "loss": 0.5167, + "step": 696 + }, + { + "epoch": 0.08481898387587465, + "grad_norm": 1.7332885265350342, + "learning_rate": 1.9974245245079086e-05, + "loss": 0.5151, + "step": 697 + }, + { + "epoch": 0.0849406753878917, + "grad_norm": 0.8025236129760742, + "learning_rate": 1.997410514392866e-05, + "loss": 0.4848, + "step": 698 + }, + { + "epoch": 0.08506236689990873, + "grad_norm": 2.6296651363372803, + "learning_rate": 1.9973964663242892e-05, + "loss": 0.4481, + "step": 699 + }, + { + "epoch": 0.08518405841192576, + "grad_norm": 2.230381727218628, + "learning_rate": 1.997382380302712e-05, + "loss": 0.4668, + "step": 700 + }, + { + "epoch": 0.08530574992394281, + "grad_norm": 2.0779402256011963, + "learning_rate": 1.997368256328671e-05, + "loss": 0.5133, + "step": 701 + }, + { + "epoch": 0.08542744143595984, + "grad_norm": 2.086312770843506, + "learning_rate": 1.9973540944027033e-05, + "loss": 0.4751, + "step": 702 + }, + { + "epoch": 0.08554913294797688, + "grad_norm": 2.4139175415039062, + "learning_rate": 1.997339894525348e-05, + "loss": 0.5354, + "step": 703 + }, + { + "epoch": 0.08567082445999391, + "grad_norm": 1.2465251684188843, + "learning_rate": 1.9973256566971455e-05, + "loss": 0.4577, + "step": 704 + }, + { + "epoch": 0.08579251597201096, + "grad_norm": 1.7190598249435425, + "learning_rate": 1.997311380918637e-05, + "loss": 0.4723, + "step": 705 + }, + { + "epoch": 0.08591420748402799, + "grad_norm": 3.6874632835388184, + "learning_rate": 1.9972970671903666e-05, + "loss": 0.5263, + "step": 706 + }, + { + "epoch": 0.08603589899604502, + "grad_norm": 3.3435325622558594, + "learning_rate": 1.9972827155128782e-05, + "loss": 0.5053, + "step": 707 + }, + { + "epoch": 0.08615759050806206, + "grad_norm": 2.0431723594665527, + "learning_rate": 1.9972683258867183e-05, + "loss": 0.4962, + "step": 708 + }, + { + "epoch": 0.0862792820200791, + "grad_norm": 0.8176119923591614, + "learning_rate": 1.997253898312434e-05, + "loss": 0.5145, + "step": 709 + }, + { + "epoch": 0.08640097353209614, + "grad_norm": 2.744959831237793, + "learning_rate": 1.997239432790575e-05, + "loss": 0.4921, + "step": 710 + }, + { + "epoch": 0.08652266504411317, + "grad_norm": 1.4425395727157593, + "learning_rate": 1.9972249293216913e-05, + "loss": 0.5073, + "step": 711 + }, + { + "epoch": 0.0866443565561302, + "grad_norm": 1.0155556201934814, + "learning_rate": 1.9972103879063352e-05, + "loss": 0.5211, + "step": 712 + }, + { + "epoch": 0.08676604806814725, + "grad_norm": 1.4708799123764038, + "learning_rate": 1.9971958085450594e-05, + "loss": 0.5522, + "step": 713 + }, + { + "epoch": 0.08688773958016428, + "grad_norm": 2.139232873916626, + "learning_rate": 1.9971811912384193e-05, + "loss": 0.4502, + "step": 714 + }, + { + "epoch": 0.08700943109218132, + "grad_norm": 2.236619710922241, + "learning_rate": 1.9971665359869705e-05, + "loss": 0.4659, + "step": 715 + }, + { + "epoch": 0.08713112260419836, + "grad_norm": 0.6716448068618774, + "learning_rate": 1.9971518427912713e-05, + "loss": 0.4488, + "step": 716 + }, + { + "epoch": 0.0872528141162154, + "grad_norm": 5.232616424560547, + "learning_rate": 1.99713711165188e-05, + "loss": 0.6131, + "step": 717 + }, + { + "epoch": 0.08737450562823243, + "grad_norm": 4.66196346282959, + "learning_rate": 1.997122342569358e-05, + "loss": 0.591, + "step": 718 + }, + { + "epoch": 0.08749619714024946, + "grad_norm": 2.9173707962036133, + "learning_rate": 1.997107535544267e-05, + "loss": 0.5139, + "step": 719 + }, + { + "epoch": 0.08761788865226651, + "grad_norm": 2.5711612701416016, + "learning_rate": 1.9970926905771704e-05, + "loss": 0.502, + "step": 720 + }, + { + "epoch": 0.08773958016428354, + "grad_norm": 3.4926297664642334, + "learning_rate": 1.997077807668633e-05, + "loss": 0.4308, + "step": 721 + }, + { + "epoch": 0.08786127167630058, + "grad_norm": 2.6729512214660645, + "learning_rate": 1.997062886819221e-05, + "loss": 0.4532, + "step": 722 + }, + { + "epoch": 0.08798296318831761, + "grad_norm": 0.8609975576400757, + "learning_rate": 1.9970479280295024e-05, + "loss": 0.5278, + "step": 723 + }, + { + "epoch": 0.08810465470033466, + "grad_norm": 4.873531341552734, + "learning_rate": 1.9970329313000463e-05, + "loss": 0.4265, + "step": 724 + }, + { + "epoch": 0.08822634621235169, + "grad_norm": 2.0352935791015625, + "learning_rate": 1.9970178966314238e-05, + "loss": 0.5317, + "step": 725 + }, + { + "epoch": 0.08834803772436872, + "grad_norm": 0.8737489581108093, + "learning_rate": 1.9970028240242064e-05, + "loss": 0.5306, + "step": 726 + }, + { + "epoch": 0.08846972923638577, + "grad_norm": 1.6529440879821777, + "learning_rate": 1.9969877134789678e-05, + "loss": 0.4342, + "step": 727 + }, + { + "epoch": 0.0885914207484028, + "grad_norm": 1.2789009809494019, + "learning_rate": 1.996972564996283e-05, + "loss": 0.5364, + "step": 728 + }, + { + "epoch": 0.08871311226041984, + "grad_norm": 1.3129600286483765, + "learning_rate": 1.9969573785767283e-05, + "loss": 0.4785, + "step": 729 + }, + { + "epoch": 0.08883480377243687, + "grad_norm": 0.9026879072189331, + "learning_rate": 1.9969421542208822e-05, + "loss": 0.467, + "step": 730 + }, + { + "epoch": 0.08895649528445392, + "grad_norm": 1.4935601949691772, + "learning_rate": 1.9969268919293234e-05, + "loss": 0.4896, + "step": 731 + }, + { + "epoch": 0.08907818679647095, + "grad_norm": 0.9021433591842651, + "learning_rate": 1.996911591702633e-05, + "loss": 0.4782, + "step": 732 + }, + { + "epoch": 0.08919987830848798, + "grad_norm": 0.6788372993469238, + "learning_rate": 1.9968962535413928e-05, + "loss": 0.4376, + "step": 733 + }, + { + "epoch": 0.08932156982050501, + "grad_norm": 0.7806774377822876, + "learning_rate": 1.9968808774461864e-05, + "loss": 0.4691, + "step": 734 + }, + { + "epoch": 0.08944326133252206, + "grad_norm": 1.081540822982788, + "learning_rate": 1.9968654634175995e-05, + "loss": 0.5036, + "step": 735 + }, + { + "epoch": 0.0895649528445391, + "grad_norm": 2.1174347400665283, + "learning_rate": 1.9968500114562186e-05, + "loss": 0.4296, + "step": 736 + }, + { + "epoch": 0.08968664435655613, + "grad_norm": 0.7901777625083923, + "learning_rate": 1.9968345215626314e-05, + "loss": 0.5028, + "step": 737 + }, + { + "epoch": 0.08980833586857316, + "grad_norm": 1.5048437118530273, + "learning_rate": 1.996818993737427e-05, + "loss": 0.5052, + "step": 738 + }, + { + "epoch": 0.08993002738059021, + "grad_norm": 3.451389789581299, + "learning_rate": 1.9968034279811966e-05, + "loss": 0.4719, + "step": 739 + }, + { + "epoch": 0.09005171889260724, + "grad_norm": 0.8119122385978699, + "learning_rate": 1.9967878242945328e-05, + "loss": 0.5408, + "step": 740 + }, + { + "epoch": 0.09017341040462427, + "grad_norm": 1.7198354005813599, + "learning_rate": 1.9967721826780284e-05, + "loss": 0.4764, + "step": 741 + }, + { + "epoch": 0.09029510191664132, + "grad_norm": 0.8779088258743286, + "learning_rate": 1.9967565031322797e-05, + "loss": 0.5024, + "step": 742 + }, + { + "epoch": 0.09041679342865835, + "grad_norm": 3.365600109100342, + "learning_rate": 1.996740785657883e-05, + "loss": 0.5576, + "step": 743 + }, + { + "epoch": 0.09053848494067539, + "grad_norm": 0.9623547196388245, + "learning_rate": 1.996725030255436e-05, + "loss": 0.4835, + "step": 744 + }, + { + "epoch": 0.09066017645269242, + "grad_norm": 0.7516545653343201, + "learning_rate": 1.9967092369255386e-05, + "loss": 0.4944, + "step": 745 + }, + { + "epoch": 0.09078186796470947, + "grad_norm": 1.2452806234359741, + "learning_rate": 1.9966934056687917e-05, + "loss": 0.5219, + "step": 746 + }, + { + "epoch": 0.0909035594767265, + "grad_norm": 3.2794156074523926, + "learning_rate": 1.9966775364857977e-05, + "loss": 0.413, + "step": 747 + }, + { + "epoch": 0.09102525098874353, + "grad_norm": 1.883211612701416, + "learning_rate": 1.9966616293771602e-05, + "loss": 0.523, + "step": 748 + }, + { + "epoch": 0.09114694250076057, + "grad_norm": 1.2692965269088745, + "learning_rate": 1.996645684343485e-05, + "loss": 0.4636, + "step": 749 + }, + { + "epoch": 0.09126863401277761, + "grad_norm": 2.2186050415039062, + "learning_rate": 1.9966297013853784e-05, + "loss": 0.4492, + "step": 750 + }, + { + "epoch": 0.09139032552479465, + "grad_norm": 0.8209418058395386, + "learning_rate": 1.996613680503449e-05, + "loss": 0.4841, + "step": 751 + }, + { + "epoch": 0.09151201703681168, + "grad_norm": 0.8032373785972595, + "learning_rate": 1.9965976216983057e-05, + "loss": 0.4351, + "step": 752 + }, + { + "epoch": 0.09163370854882871, + "grad_norm": 1.9146921634674072, + "learning_rate": 1.9965815249705603e-05, + "loss": 0.4913, + "step": 753 + }, + { + "epoch": 0.09175540006084576, + "grad_norm": 2.451493978500366, + "learning_rate": 1.996565390320825e-05, + "loss": 0.393, + "step": 754 + }, + { + "epoch": 0.09187709157286279, + "grad_norm": 1.9551799297332764, + "learning_rate": 1.996549217749714e-05, + "loss": 0.5184, + "step": 755 + }, + { + "epoch": 0.09199878308487983, + "grad_norm": 2.2005109786987305, + "learning_rate": 1.9965330072578423e-05, + "loss": 0.5057, + "step": 756 + }, + { + "epoch": 0.09212047459689687, + "grad_norm": 1.9353986978530884, + "learning_rate": 1.996516758845827e-05, + "loss": 0.4057, + "step": 757 + }, + { + "epoch": 0.0922421661089139, + "grad_norm": 0.7365932464599609, + "learning_rate": 1.9965004725142867e-05, + "loss": 0.492, + "step": 758 + }, + { + "epoch": 0.09236385762093094, + "grad_norm": 2.247995376586914, + "learning_rate": 1.9964841482638406e-05, + "loss": 0.4588, + "step": 759 + }, + { + "epoch": 0.09248554913294797, + "grad_norm": 0.7459166049957275, + "learning_rate": 1.9964677860951097e-05, + "loss": 0.43, + "step": 760 + }, + { + "epoch": 0.09260724064496502, + "grad_norm": 1.3299825191497803, + "learning_rate": 1.9964513860087172e-05, + "loss": 0.4365, + "step": 761 + }, + { + "epoch": 0.09272893215698205, + "grad_norm": 1.968151569366455, + "learning_rate": 1.9964349480052872e-05, + "loss": 0.5189, + "step": 762 + }, + { + "epoch": 0.09285062366899909, + "grad_norm": 1.3241199254989624, + "learning_rate": 1.9964184720854444e-05, + "loss": 0.4101, + "step": 763 + }, + { + "epoch": 0.09297231518101612, + "grad_norm": 1.8080377578735352, + "learning_rate": 1.9964019582498167e-05, + "loss": 0.5639, + "step": 764 + }, + { + "epoch": 0.09309400669303317, + "grad_norm": 0.9407429099082947, + "learning_rate": 1.996385406499032e-05, + "loss": 0.4957, + "step": 765 + }, + { + "epoch": 0.0932156982050502, + "grad_norm": 2.313314914703369, + "learning_rate": 1.99636881683372e-05, + "loss": 0.4636, + "step": 766 + }, + { + "epoch": 0.09333738971706723, + "grad_norm": 2.7045557498931885, + "learning_rate": 1.9963521892545122e-05, + "loss": 0.5549, + "step": 767 + }, + { + "epoch": 0.09345908122908426, + "grad_norm": 2.3331668376922607, + "learning_rate": 1.9963355237620412e-05, + "loss": 0.4176, + "step": 768 + }, + { + "epoch": 0.09358077274110131, + "grad_norm": 1.3269882202148438, + "learning_rate": 1.9963188203569413e-05, + "loss": 0.4579, + "step": 769 + }, + { + "epoch": 0.09370246425311834, + "grad_norm": 2.986143112182617, + "learning_rate": 1.996302079039848e-05, + "loss": 0.5186, + "step": 770 + }, + { + "epoch": 0.09382415576513538, + "grad_norm": 1.0925177335739136, + "learning_rate": 1.9962852998113983e-05, + "loss": 0.4653, + "step": 771 + }, + { + "epoch": 0.09394584727715243, + "grad_norm": 0.6733190417289734, + "learning_rate": 1.996268482672231e-05, + "loss": 0.4671, + "step": 772 + }, + { + "epoch": 0.09406753878916946, + "grad_norm": 4.965397834777832, + "learning_rate": 1.9962516276229856e-05, + "loss": 0.5807, + "step": 773 + }, + { + "epoch": 0.09418923030118649, + "grad_norm": 1.6570329666137695, + "learning_rate": 1.9962347346643035e-05, + "loss": 0.4375, + "step": 774 + }, + { + "epoch": 0.09431092181320352, + "grad_norm": 0.9413464069366455, + "learning_rate": 1.9962178037968282e-05, + "loss": 0.458, + "step": 775 + }, + { + "epoch": 0.09443261332522057, + "grad_norm": 1.3105307817459106, + "learning_rate": 1.9962008350212028e-05, + "loss": 0.5169, + "step": 776 + }, + { + "epoch": 0.0945543048372376, + "grad_norm": 1.5985509157180786, + "learning_rate": 1.9961838283380737e-05, + "loss": 0.4437, + "step": 777 + }, + { + "epoch": 0.09467599634925464, + "grad_norm": 1.004827618598938, + "learning_rate": 1.996166783748088e-05, + "loss": 0.4798, + "step": 778 + }, + { + "epoch": 0.09479768786127167, + "grad_norm": 2.7168939113616943, + "learning_rate": 1.9961497012518944e-05, + "loss": 0.4753, + "step": 779 + }, + { + "epoch": 0.09491937937328872, + "grad_norm": 1.3701131343841553, + "learning_rate": 1.9961325808501425e-05, + "loss": 0.5003, + "step": 780 + }, + { + "epoch": 0.09504107088530575, + "grad_norm": 2.065967559814453, + "learning_rate": 1.9961154225434843e-05, + "loss": 0.5236, + "step": 781 + }, + { + "epoch": 0.09516276239732278, + "grad_norm": 0.8278501629829407, + "learning_rate": 1.9960982263325723e-05, + "loss": 0.507, + "step": 782 + }, + { + "epoch": 0.09528445390933982, + "grad_norm": 1.4648507833480835, + "learning_rate": 1.9960809922180607e-05, + "loss": 0.4769, + "step": 783 + }, + { + "epoch": 0.09540614542135686, + "grad_norm": 4.1555399894714355, + "learning_rate": 1.996063720200606e-05, + "loss": 0.4459, + "step": 784 + }, + { + "epoch": 0.0955278369333739, + "grad_norm": 1.7836990356445312, + "learning_rate": 1.996046410280865e-05, + "loss": 0.5033, + "step": 785 + }, + { + "epoch": 0.09564952844539093, + "grad_norm": 1.0879727602005005, + "learning_rate": 1.996029062459496e-05, + "loss": 0.4188, + "step": 786 + }, + { + "epoch": 0.09577121995740798, + "grad_norm": 3.2963836193084717, + "learning_rate": 1.99601167673716e-05, + "loss": 0.5207, + "step": 787 + }, + { + "epoch": 0.09589291146942501, + "grad_norm": 1.8777716159820557, + "learning_rate": 1.9959942531145176e-05, + "loss": 0.4218, + "step": 788 + }, + { + "epoch": 0.09601460298144204, + "grad_norm": 2.411846160888672, + "learning_rate": 1.9959767915922325e-05, + "loss": 0.4521, + "step": 789 + }, + { + "epoch": 0.09613629449345908, + "grad_norm": 4.264321804046631, + "learning_rate": 1.995959292170969e-05, + "loss": 0.5327, + "step": 790 + }, + { + "epoch": 0.09625798600547612, + "grad_norm": 1.3433444499969482, + "learning_rate": 1.9959417548513926e-05, + "loss": 0.4393, + "step": 791 + }, + { + "epoch": 0.09637967751749316, + "grad_norm": 0.7187265157699585, + "learning_rate": 1.9959241796341714e-05, + "loss": 0.4836, + "step": 792 + }, + { + "epoch": 0.09650136902951019, + "grad_norm": 1.6666771173477173, + "learning_rate": 1.995906566519973e-05, + "loss": 0.4286, + "step": 793 + }, + { + "epoch": 0.09662306054152722, + "grad_norm": 1.0978599786758423, + "learning_rate": 1.9958889155094693e-05, + "loss": 0.4652, + "step": 794 + }, + { + "epoch": 0.09674475205354427, + "grad_norm": 2.4084653854370117, + "learning_rate": 1.99587122660333e-05, + "loss": 0.4832, + "step": 795 + }, + { + "epoch": 0.0968664435655613, + "grad_norm": 1.3513977527618408, + "learning_rate": 1.9958534998022298e-05, + "loss": 0.5276, + "step": 796 + }, + { + "epoch": 0.09698813507757834, + "grad_norm": 0.6939101219177246, + "learning_rate": 1.9958357351068425e-05, + "loss": 0.4665, + "step": 797 + }, + { + "epoch": 0.09710982658959537, + "grad_norm": 2.0230472087860107, + "learning_rate": 1.9958179325178443e-05, + "loss": 0.4881, + "step": 798 + }, + { + "epoch": 0.09723151810161242, + "grad_norm": 1.1054073572158813, + "learning_rate": 1.9958000920359123e-05, + "loss": 0.4991, + "step": 799 + }, + { + "epoch": 0.09735320961362945, + "grad_norm": 2.6655113697052, + "learning_rate": 1.9957822136617257e-05, + "loss": 0.5364, + "step": 800 + }, + { + "epoch": 0.09747490112564648, + "grad_norm": 1.5029679536819458, + "learning_rate": 1.9957642973959647e-05, + "loss": 0.4314, + "step": 801 + }, + { + "epoch": 0.09759659263766353, + "grad_norm": 0.5799332857131958, + "learning_rate": 1.9957463432393113e-05, + "loss": 0.4507, + "step": 802 + }, + { + "epoch": 0.09771828414968056, + "grad_norm": 3.1802175045013428, + "learning_rate": 1.9957283511924483e-05, + "loss": 0.5046, + "step": 803 + }, + { + "epoch": 0.0978399756616976, + "grad_norm": 1.7593673467636108, + "learning_rate": 1.9957103212560605e-05, + "loss": 0.4751, + "step": 804 + }, + { + "epoch": 0.09796166717371463, + "grad_norm": 1.0430735349655151, + "learning_rate": 1.9956922534308338e-05, + "loss": 0.518, + "step": 805 + }, + { + "epoch": 0.09808335868573168, + "grad_norm": 3.0028467178344727, + "learning_rate": 1.995674147717456e-05, + "loss": 0.4686, + "step": 806 + }, + { + "epoch": 0.09820505019774871, + "grad_norm": 0.6665245294570923, + "learning_rate": 1.995656004116616e-05, + "loss": 0.5079, + "step": 807 + }, + { + "epoch": 0.09832674170976574, + "grad_norm": 3.278702974319458, + "learning_rate": 1.995637822629004e-05, + "loss": 0.4364, + "step": 808 + }, + { + "epoch": 0.09844843322178277, + "grad_norm": 2.833733081817627, + "learning_rate": 1.9956196032553122e-05, + "loss": 0.4419, + "step": 809 + }, + { + "epoch": 0.09857012473379982, + "grad_norm": 0.650507390499115, + "learning_rate": 1.995601345996234e-05, + "loss": 0.4976, + "step": 810 + }, + { + "epoch": 0.09869181624581685, + "grad_norm": 1.0216940641403198, + "learning_rate": 1.995583050852463e-05, + "loss": 0.4909, + "step": 811 + }, + { + "epoch": 0.09881350775783389, + "grad_norm": 1.4588136672973633, + "learning_rate": 1.9955647178246965e-05, + "loss": 0.4329, + "step": 812 + }, + { + "epoch": 0.09893519926985093, + "grad_norm": 1.6294156312942505, + "learning_rate": 1.995546346913632e-05, + "loss": 0.4915, + "step": 813 + }, + { + "epoch": 0.09905689078186797, + "grad_norm": 2.5995635986328125, + "learning_rate": 1.995527938119968e-05, + "loss": 0.4922, + "step": 814 + }, + { + "epoch": 0.099178582293885, + "grad_norm": 2.078435182571411, + "learning_rate": 1.9955094914444056e-05, + "loss": 0.432, + "step": 815 + }, + { + "epoch": 0.09930027380590203, + "grad_norm": 2.96297550201416, + "learning_rate": 1.9954910068876465e-05, + "loss": 0.5354, + "step": 816 + }, + { + "epoch": 0.09942196531791908, + "grad_norm": 2.1711313724517822, + "learning_rate": 1.995472484450394e-05, + "loss": 0.435, + "step": 817 + }, + { + "epoch": 0.09954365682993611, + "grad_norm": 3.4678258895874023, + "learning_rate": 1.9954539241333527e-05, + "loss": 0.3969, + "step": 818 + }, + { + "epoch": 0.09966534834195315, + "grad_norm": 0.750209391117096, + "learning_rate": 1.9954353259372295e-05, + "loss": 0.4628, + "step": 819 + }, + { + "epoch": 0.09978703985397018, + "grad_norm": 2.4872379302978516, + "learning_rate": 1.9954166898627313e-05, + "loss": 0.505, + "step": 820 + }, + { + "epoch": 0.09990873136598723, + "grad_norm": 0.9021669626235962, + "learning_rate": 1.995398015910568e-05, + "loss": 0.487, + "step": 821 + }, + { + "epoch": 0.10003042287800426, + "grad_norm": 1.3522422313690186, + "learning_rate": 1.9953793040814492e-05, + "loss": 0.4986, + "step": 822 + }, + { + "epoch": 0.1001521143900213, + "grad_norm": 2.327824592590332, + "learning_rate": 1.995360554376088e-05, + "loss": 0.5162, + "step": 823 + }, + { + "epoch": 0.10027380590203833, + "grad_norm": 2.900378942489624, + "learning_rate": 1.9953417667951975e-05, + "loss": 0.4366, + "step": 824 + }, + { + "epoch": 0.10039549741405537, + "grad_norm": 1.3112826347351074, + "learning_rate": 1.9953229413394925e-05, + "loss": 0.4836, + "step": 825 + }, + { + "epoch": 0.1005171889260724, + "grad_norm": 1.839365005493164, + "learning_rate": 1.9953040780096897e-05, + "loss": 0.4911, + "step": 826 + }, + { + "epoch": 0.10063888043808944, + "grad_norm": 2.9116950035095215, + "learning_rate": 1.995285176806506e-05, + "loss": 0.4629, + "step": 827 + }, + { + "epoch": 0.10076057195010649, + "grad_norm": 0.6931069493293762, + "learning_rate": 1.9952662377306614e-05, + "loss": 0.4987, + "step": 828 + }, + { + "epoch": 0.10088226346212352, + "grad_norm": 1.930259108543396, + "learning_rate": 1.9952472607828762e-05, + "loss": 0.4959, + "step": 829 + }, + { + "epoch": 0.10100395497414055, + "grad_norm": 2.0763046741485596, + "learning_rate": 1.995228245963873e-05, + "loss": 0.4727, + "step": 830 + }, + { + "epoch": 0.10112564648615759, + "grad_norm": 0.8251340389251709, + "learning_rate": 1.9952091932743747e-05, + "loss": 0.4737, + "step": 831 + }, + { + "epoch": 0.10124733799817463, + "grad_norm": 1.5569857358932495, + "learning_rate": 1.995190102715107e-05, + "loss": 0.4774, + "step": 832 + }, + { + "epoch": 0.10136902951019167, + "grad_norm": 0.6953150033950806, + "learning_rate": 1.9951709742867958e-05, + "loss": 0.474, + "step": 833 + }, + { + "epoch": 0.1014907210222087, + "grad_norm": 2.0638668537139893, + "learning_rate": 1.995151807990169e-05, + "loss": 0.4485, + "step": 834 + }, + { + "epoch": 0.10161241253422573, + "grad_norm": 0.864733874797821, + "learning_rate": 1.995132603825956e-05, + "loss": 0.5076, + "step": 835 + }, + { + "epoch": 0.10173410404624278, + "grad_norm": 1.438027024269104, + "learning_rate": 1.9951133617948878e-05, + "loss": 0.4838, + "step": 836 + }, + { + "epoch": 0.10185579555825981, + "grad_norm": 1.6404670476913452, + "learning_rate": 1.9950940818976968e-05, + "loss": 0.4596, + "step": 837 + }, + { + "epoch": 0.10197748707027685, + "grad_norm": 1.4606159925460815, + "learning_rate": 1.9950747641351156e-05, + "loss": 0.4691, + "step": 838 + }, + { + "epoch": 0.10209917858229388, + "grad_norm": 2.0456371307373047, + "learning_rate": 1.99505540850788e-05, + "loss": 0.4973, + "step": 839 + }, + { + "epoch": 0.10222087009431093, + "grad_norm": 1.6921428442001343, + "learning_rate": 1.9950360150167268e-05, + "loss": 0.4706, + "step": 840 + }, + { + "epoch": 0.10234256160632796, + "grad_norm": 1.1583380699157715, + "learning_rate": 1.9950165836623934e-05, + "loss": 0.4314, + "step": 841 + }, + { + "epoch": 0.10246425311834499, + "grad_norm": 0.7111881375312805, + "learning_rate": 1.9949971144456195e-05, + "loss": 0.4348, + "step": 842 + }, + { + "epoch": 0.10258594463036204, + "grad_norm": 1.1225740909576416, + "learning_rate": 1.9949776073671457e-05, + "loss": 0.4831, + "step": 843 + }, + { + "epoch": 0.10270763614237907, + "grad_norm": 0.5911033749580383, + "learning_rate": 1.9949580624277148e-05, + "loss": 0.4614, + "step": 844 + }, + { + "epoch": 0.1028293276543961, + "grad_norm": 4.8914642333984375, + "learning_rate": 1.9949384796280697e-05, + "loss": 0.5864, + "step": 845 + }, + { + "epoch": 0.10295101916641314, + "grad_norm": 0.8192464113235474, + "learning_rate": 1.9949188589689565e-05, + "loss": 0.4838, + "step": 846 + }, + { + "epoch": 0.10307271067843018, + "grad_norm": 0.6399006843566895, + "learning_rate": 1.994899200451121e-05, + "loss": 0.4895, + "step": 847 + }, + { + "epoch": 0.10319440219044722, + "grad_norm": 2.1333060264587402, + "learning_rate": 1.994879504075312e-05, + "loss": 0.4945, + "step": 848 + }, + { + "epoch": 0.10331609370246425, + "grad_norm": 2.7365994453430176, + "learning_rate": 1.994859769842278e-05, + "loss": 0.4895, + "step": 849 + }, + { + "epoch": 0.10343778521448128, + "grad_norm": 5.075249671936035, + "learning_rate": 1.994839997752771e-05, + "loss": 0.4565, + "step": 850 + }, + { + "epoch": 0.10355947672649833, + "grad_norm": 2.683525562286377, + "learning_rate": 1.9948201878075427e-05, + "loss": 0.4617, + "step": 851 + }, + { + "epoch": 0.10368116823851536, + "grad_norm": 1.4198042154312134, + "learning_rate": 1.994800340007347e-05, + "loss": 0.427, + "step": 852 + }, + { + "epoch": 0.1038028597505324, + "grad_norm": 1.2674226760864258, + "learning_rate": 1.9947804543529394e-05, + "loss": 0.4575, + "step": 853 + }, + { + "epoch": 0.10392455126254943, + "grad_norm": 3.470630168914795, + "learning_rate": 1.9947605308450763e-05, + "loss": 0.5183, + "step": 854 + }, + { + "epoch": 0.10404624277456648, + "grad_norm": 6.524876594543457, + "learning_rate": 1.994740569484516e-05, + "loss": 0.6254, + "step": 855 + }, + { + "epoch": 0.10416793428658351, + "grad_norm": 6.869317054748535, + "learning_rate": 1.994720570272018e-05, + "loss": 0.6211, + "step": 856 + }, + { + "epoch": 0.10428962579860054, + "grad_norm": 1.907633900642395, + "learning_rate": 1.9947005332083435e-05, + "loss": 0.4342, + "step": 857 + }, + { + "epoch": 0.10441131731061759, + "grad_norm": 4.02207088470459, + "learning_rate": 1.9946804582942546e-05, + "loss": 0.5247, + "step": 858 + }, + { + "epoch": 0.10453300882263462, + "grad_norm": 0.6411753296852112, + "learning_rate": 1.9946603455305157e-05, + "loss": 0.4765, + "step": 859 + }, + { + "epoch": 0.10465470033465166, + "grad_norm": 2.5639162063598633, + "learning_rate": 1.9946401949178916e-05, + "loss": 0.4834, + "step": 860 + }, + { + "epoch": 0.10477639184666869, + "grad_norm": 3.0237371921539307, + "learning_rate": 1.994620006457149e-05, + "loss": 0.4756, + "step": 861 + }, + { + "epoch": 0.10489808335868574, + "grad_norm": 5.799304485321045, + "learning_rate": 1.9945997801490568e-05, + "loss": 0.4627, + "step": 862 + }, + { + "epoch": 0.10501977487070277, + "grad_norm": 3.4371719360351562, + "learning_rate": 1.994579515994384e-05, + "loss": 0.528, + "step": 863 + }, + { + "epoch": 0.1051414663827198, + "grad_norm": 1.3571069240570068, + "learning_rate": 1.9945592139939024e-05, + "loss": 0.4961, + "step": 864 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 1.2791073322296143, + "learning_rate": 1.9945388741483837e-05, + "loss": 0.4875, + "step": 865 + }, + { + "epoch": 0.10538484940675388, + "grad_norm": 2.836144208908081, + "learning_rate": 1.9945184964586022e-05, + "loss": 0.3998, + "step": 866 + }, + { + "epoch": 0.10550654091877092, + "grad_norm": 3.1967105865478516, + "learning_rate": 1.9944980809253338e-05, + "loss": 0.4914, + "step": 867 + }, + { + "epoch": 0.10562823243078795, + "grad_norm": 5.369935989379883, + "learning_rate": 1.9944776275493546e-05, + "loss": 0.5375, + "step": 868 + }, + { + "epoch": 0.10574992394280498, + "grad_norm": 0.90535569190979, + "learning_rate": 1.994457136331443e-05, + "loss": 0.3825, + "step": 869 + }, + { + "epoch": 0.10587161545482203, + "grad_norm": 5.089012622833252, + "learning_rate": 1.9944366072723794e-05, + "loss": 0.5606, + "step": 870 + }, + { + "epoch": 0.10599330696683906, + "grad_norm": 3.612671136856079, + "learning_rate": 1.9944160403729444e-05, + "loss": 0.5026, + "step": 871 + }, + { + "epoch": 0.1061149984788561, + "grad_norm": 1.5098363161087036, + "learning_rate": 1.9943954356339207e-05, + "loss": 0.3872, + "step": 872 + }, + { + "epoch": 0.10623668999087314, + "grad_norm": 3.0710408687591553, + "learning_rate": 1.994374793056092e-05, + "loss": 0.5371, + "step": 873 + }, + { + "epoch": 0.10635838150289018, + "grad_norm": 0.7500321269035339, + "learning_rate": 1.9943541126402447e-05, + "loss": 0.4474, + "step": 874 + }, + { + "epoch": 0.10648007301490721, + "grad_norm": 0.9233241677284241, + "learning_rate": 1.994333394387165e-05, + "loss": 0.4565, + "step": 875 + }, + { + "epoch": 0.10660176452692424, + "grad_norm": 3.2471554279327393, + "learning_rate": 1.9943126382976413e-05, + "loss": 0.583, + "step": 876 + }, + { + "epoch": 0.10672345603894129, + "grad_norm": 1.9059737920761108, + "learning_rate": 1.994291844372464e-05, + "loss": 0.5059, + "step": 877 + }, + { + "epoch": 0.10684514755095832, + "grad_norm": 4.65517520904541, + "learning_rate": 1.9942710126124238e-05, + "loss": 0.4765, + "step": 878 + }, + { + "epoch": 0.10696683906297536, + "grad_norm": 4.789467811584473, + "learning_rate": 1.9942501430183135e-05, + "loss": 0.4697, + "step": 879 + }, + { + "epoch": 0.10708853057499239, + "grad_norm": 3.8766236305236816, + "learning_rate": 1.994229235590927e-05, + "loss": 0.47, + "step": 880 + }, + { + "epoch": 0.10721022208700944, + "grad_norm": 2.6545610427856445, + "learning_rate": 1.9942082903310607e-05, + "loss": 0.5592, + "step": 881 + }, + { + "epoch": 0.10733191359902647, + "grad_norm": 0.9934804439544678, + "learning_rate": 1.994187307239511e-05, + "loss": 0.4918, + "step": 882 + }, + { + "epoch": 0.1074536051110435, + "grad_norm": 0.7720689177513123, + "learning_rate": 1.994166286317076e-05, + "loss": 0.4565, + "step": 883 + }, + { + "epoch": 0.10757529662306055, + "grad_norm": 0.6046221852302551, + "learning_rate": 1.9941452275645562e-05, + "loss": 0.4436, + "step": 884 + }, + { + "epoch": 0.10769698813507758, + "grad_norm": 2.1446096897125244, + "learning_rate": 1.9941241309827525e-05, + "loss": 0.5119, + "step": 885 + }, + { + "epoch": 0.10781867964709461, + "grad_norm": 1.3840595483779907, + "learning_rate": 1.9941029965724683e-05, + "loss": 0.4771, + "step": 886 + }, + { + "epoch": 0.10794037115911165, + "grad_norm": 0.8104763031005859, + "learning_rate": 1.994081824334507e-05, + "loss": 0.4435, + "step": 887 + }, + { + "epoch": 0.1080620626711287, + "grad_norm": 0.6394281387329102, + "learning_rate": 1.994060614269675e-05, + "loss": 0.4484, + "step": 888 + }, + { + "epoch": 0.10818375418314573, + "grad_norm": 0.690391480922699, + "learning_rate": 1.9940393663787788e-05, + "loss": 0.4747, + "step": 889 + }, + { + "epoch": 0.10830544569516276, + "grad_norm": 1.1946213245391846, + "learning_rate": 1.9940180806626275e-05, + "loss": 0.4622, + "step": 890 + }, + { + "epoch": 0.1084271372071798, + "grad_norm": 1.191467046737671, + "learning_rate": 1.9939967571220306e-05, + "loss": 0.467, + "step": 891 + }, + { + "epoch": 0.10854882871919684, + "grad_norm": 1.7758926153182983, + "learning_rate": 1.9939753957577994e-05, + "loss": 0.4391, + "step": 892 + }, + { + "epoch": 0.10867052023121387, + "grad_norm": 3.4437081813812256, + "learning_rate": 1.993953996570747e-05, + "loss": 0.5467, + "step": 893 + }, + { + "epoch": 0.10879221174323091, + "grad_norm": 0.9177939891815186, + "learning_rate": 1.9939325595616878e-05, + "loss": 0.4543, + "step": 894 + }, + { + "epoch": 0.10891390325524794, + "grad_norm": 1.0351811647415161, + "learning_rate": 1.9939110847314373e-05, + "loss": 0.5189, + "step": 895 + }, + { + "epoch": 0.10903559476726499, + "grad_norm": 3.4975383281707764, + "learning_rate": 1.9938895720808127e-05, + "loss": 0.5733, + "step": 896 + }, + { + "epoch": 0.10915728627928202, + "grad_norm": 1.1447761058807373, + "learning_rate": 1.9938680216106326e-05, + "loss": 0.4417, + "step": 897 + }, + { + "epoch": 0.10927897779129905, + "grad_norm": 0.6356577277183533, + "learning_rate": 1.9938464333217175e-05, + "loss": 0.4615, + "step": 898 + }, + { + "epoch": 0.1094006693033161, + "grad_norm": 4.211939334869385, + "learning_rate": 1.993824807214888e-05, + "loss": 0.5406, + "step": 899 + }, + { + "epoch": 0.10952236081533313, + "grad_norm": 0.8971146941184998, + "learning_rate": 1.9938031432909675e-05, + "loss": 0.4292, + "step": 900 + }, + { + "epoch": 0.10964405232735017, + "grad_norm": 2.098039150238037, + "learning_rate": 1.9937814415507807e-05, + "loss": 0.5111, + "step": 901 + }, + { + "epoch": 0.1097657438393672, + "grad_norm": 0.7762995958328247, + "learning_rate": 1.9937597019951528e-05, + "loss": 0.5062, + "step": 902 + }, + { + "epoch": 0.10988743535138425, + "grad_norm": 0.6407161355018616, + "learning_rate": 1.9937379246249113e-05, + "loss": 0.5024, + "step": 903 + }, + { + "epoch": 0.11000912686340128, + "grad_norm": 0.8079186081886292, + "learning_rate": 1.9937161094408845e-05, + "loss": 0.5342, + "step": 904 + }, + { + "epoch": 0.11013081837541831, + "grad_norm": 5.116917133331299, + "learning_rate": 1.9936942564439033e-05, + "loss": 0.4633, + "step": 905 + }, + { + "epoch": 0.11025250988743535, + "grad_norm": 2.475346326828003, + "learning_rate": 1.9936723656347987e-05, + "loss": 0.4502, + "step": 906 + }, + { + "epoch": 0.11037420139945239, + "grad_norm": 1.9243186712265015, + "learning_rate": 1.9936504370144035e-05, + "loss": 0.4578, + "step": 907 + }, + { + "epoch": 0.11049589291146943, + "grad_norm": 1.1755101680755615, + "learning_rate": 1.993628470583553e-05, + "loss": 0.5217, + "step": 908 + }, + { + "epoch": 0.11061758442348646, + "grad_norm": 1.224277138710022, + "learning_rate": 1.9936064663430823e-05, + "loss": 0.4773, + "step": 909 + }, + { + "epoch": 0.11073927593550349, + "grad_norm": 1.4294017553329468, + "learning_rate": 1.9935844242938285e-05, + "loss": 0.4639, + "step": 910 + }, + { + "epoch": 0.11086096744752054, + "grad_norm": 2.0240399837493896, + "learning_rate": 1.993562344436631e-05, + "loss": 0.5299, + "step": 911 + }, + { + "epoch": 0.11098265895953757, + "grad_norm": 0.9139840602874756, + "learning_rate": 1.99354022677233e-05, + "loss": 0.4666, + "step": 912 + }, + { + "epoch": 0.1111043504715546, + "grad_norm": 3.7568726539611816, + "learning_rate": 1.9935180713017668e-05, + "loss": 0.5334, + "step": 913 + }, + { + "epoch": 0.11122604198357165, + "grad_norm": 0.7634105682373047, + "learning_rate": 1.9934958780257843e-05, + "loss": 0.4715, + "step": 914 + }, + { + "epoch": 0.11134773349558869, + "grad_norm": 2.5053353309631348, + "learning_rate": 1.9934736469452272e-05, + "loss": 0.4529, + "step": 915 + }, + { + "epoch": 0.11146942500760572, + "grad_norm": 0.9232064485549927, + "learning_rate": 1.9934513780609416e-05, + "loss": 0.5065, + "step": 916 + }, + { + "epoch": 0.11159111651962275, + "grad_norm": 1.6837064027786255, + "learning_rate": 1.9934290713737747e-05, + "loss": 0.4873, + "step": 917 + }, + { + "epoch": 0.1117128080316398, + "grad_norm": 4.035303592681885, + "learning_rate": 1.9934067268845753e-05, + "loss": 0.4216, + "step": 918 + }, + { + "epoch": 0.11183449954365683, + "grad_norm": 0.6953611373901367, + "learning_rate": 1.9933843445941935e-05, + "loss": 0.5105, + "step": 919 + }, + { + "epoch": 0.11195619105567386, + "grad_norm": 0.8081039190292358, + "learning_rate": 1.9933619245034818e-05, + "loss": 0.4463, + "step": 920 + }, + { + "epoch": 0.1120778825676909, + "grad_norm": 3.260673761367798, + "learning_rate": 1.9933394666132922e-05, + "loss": 0.5449, + "step": 921 + }, + { + "epoch": 0.11219957407970794, + "grad_norm": 0.915120542049408, + "learning_rate": 1.99331697092448e-05, + "loss": 0.4654, + "step": 922 + }, + { + "epoch": 0.11232126559172498, + "grad_norm": 1.1050307750701904, + "learning_rate": 1.993294437437901e-05, + "loss": 0.4771, + "step": 923 + }, + { + "epoch": 0.11244295710374201, + "grad_norm": 1.436227798461914, + "learning_rate": 1.9932718661544125e-05, + "loss": 0.454, + "step": 924 + }, + { + "epoch": 0.11256464861575904, + "grad_norm": 0.7246893048286438, + "learning_rate": 1.9932492570748737e-05, + "loss": 0.5215, + "step": 925 + }, + { + "epoch": 0.11268634012777609, + "grad_norm": 2.818067789077759, + "learning_rate": 1.9932266102001445e-05, + "loss": 0.4894, + "step": 926 + }, + { + "epoch": 0.11280803163979312, + "grad_norm": 1.345335841178894, + "learning_rate": 1.9932039255310873e-05, + "loss": 0.5307, + "step": 927 + }, + { + "epoch": 0.11292972315181016, + "grad_norm": 0.6055800914764404, + "learning_rate": 1.9931812030685646e-05, + "loss": 0.4788, + "step": 928 + }, + { + "epoch": 0.1130514146638272, + "grad_norm": 0.9500460028648376, + "learning_rate": 1.9931584428134413e-05, + "loss": 0.4571, + "step": 929 + }, + { + "epoch": 0.11317310617584424, + "grad_norm": 1.5998355150222778, + "learning_rate": 1.993135644766584e-05, + "loss": 0.4345, + "step": 930 + }, + { + "epoch": 0.11329479768786127, + "grad_norm": 3.776104211807251, + "learning_rate": 1.9931128089288592e-05, + "loss": 0.5406, + "step": 931 + }, + { + "epoch": 0.1134164891998783, + "grad_norm": 0.6229236721992493, + "learning_rate": 1.9930899353011365e-05, + "loss": 0.4151, + "step": 932 + }, + { + "epoch": 0.11353818071189535, + "grad_norm": 5.357823848724365, + "learning_rate": 1.9930670238842864e-05, + "loss": 0.5458, + "step": 933 + }, + { + "epoch": 0.11365987222391238, + "grad_norm": 2.321004629135132, + "learning_rate": 1.9930440746791806e-05, + "loss": 0.4422, + "step": 934 + }, + { + "epoch": 0.11378156373592942, + "grad_norm": 1.4499346017837524, + "learning_rate": 1.993021087686692e-05, + "loss": 0.4569, + "step": 935 + }, + { + "epoch": 0.11390325524794645, + "grad_norm": 0.6692584156990051, + "learning_rate": 1.9929980629076956e-05, + "loss": 0.468, + "step": 936 + }, + { + "epoch": 0.1140249467599635, + "grad_norm": 0.6575024127960205, + "learning_rate": 1.9929750003430673e-05, + "loss": 0.4698, + "step": 937 + }, + { + "epoch": 0.11414663827198053, + "grad_norm": 1.0745880603790283, + "learning_rate": 1.9929518999936854e-05, + "loss": 0.4704, + "step": 938 + }, + { + "epoch": 0.11426832978399756, + "grad_norm": 1.6050153970718384, + "learning_rate": 1.9929287618604282e-05, + "loss": 0.4374, + "step": 939 + }, + { + "epoch": 0.1143900212960146, + "grad_norm": 1.3916279077529907, + "learning_rate": 1.9929055859441765e-05, + "loss": 0.5018, + "step": 940 + }, + { + "epoch": 0.11451171280803164, + "grad_norm": 1.0368601083755493, + "learning_rate": 1.9928823722458115e-05, + "loss": 0.4426, + "step": 941 + }, + { + "epoch": 0.11463340432004868, + "grad_norm": 0.6375675797462463, + "learning_rate": 1.9928591207662175e-05, + "loss": 0.4695, + "step": 942 + }, + { + "epoch": 0.11475509583206571, + "grad_norm": 4.086197853088379, + "learning_rate": 1.992835831506279e-05, + "loss": 0.5444, + "step": 943 + }, + { + "epoch": 0.11487678734408276, + "grad_norm": 0.8371261358261108, + "learning_rate": 1.9928125044668816e-05, + "loss": 0.4071, + "step": 944 + }, + { + "epoch": 0.11499847885609979, + "grad_norm": 3.4294397830963135, + "learning_rate": 1.9927891396489138e-05, + "loss": 0.4922, + "step": 945 + }, + { + "epoch": 0.11512017036811682, + "grad_norm": 4.158844470977783, + "learning_rate": 1.9927657370532643e-05, + "loss": 0.5664, + "step": 946 + }, + { + "epoch": 0.11524186188013386, + "grad_norm": 1.3181960582733154, + "learning_rate": 1.9927422966808233e-05, + "loss": 0.4846, + "step": 947 + }, + { + "epoch": 0.1153635533921509, + "grad_norm": 3.0161585807800293, + "learning_rate": 1.992718818532483e-05, + "loss": 0.443, + "step": 948 + }, + { + "epoch": 0.11548524490416794, + "grad_norm": 4.553255081176758, + "learning_rate": 1.992695302609137e-05, + "loss": 0.4652, + "step": 949 + }, + { + "epoch": 0.11560693641618497, + "grad_norm": 3.0985097885131836, + "learning_rate": 1.99267174891168e-05, + "loss": 0.4747, + "step": 950 + }, + { + "epoch": 0.115728627928202, + "grad_norm": 0.7888035774230957, + "learning_rate": 1.9926481574410085e-05, + "loss": 0.4975, + "step": 951 + }, + { + "epoch": 0.11585031944021905, + "grad_norm": 1.0028328895568848, + "learning_rate": 1.9926245281980194e-05, + "loss": 0.4851, + "step": 952 + }, + { + "epoch": 0.11597201095223608, + "grad_norm": 0.7690591812133789, + "learning_rate": 1.9926008611836127e-05, + "loss": 0.5013, + "step": 953 + }, + { + "epoch": 0.11609370246425312, + "grad_norm": 4.568863868713379, + "learning_rate": 1.9925771563986885e-05, + "loss": 0.4469, + "step": 954 + }, + { + "epoch": 0.11621539397627015, + "grad_norm": 1.4368757009506226, + "learning_rate": 1.992553413844149e-05, + "loss": 0.4179, + "step": 955 + }, + { + "epoch": 0.1163370854882872, + "grad_norm": 0.8411977291107178, + "learning_rate": 1.992529633520898e-05, + "loss": 0.4744, + "step": 956 + }, + { + "epoch": 0.11645877700030423, + "grad_norm": 3.5928070545196533, + "learning_rate": 1.9925058154298397e-05, + "loss": 0.5184, + "step": 957 + }, + { + "epoch": 0.11658046851232126, + "grad_norm": 1.5290552377700806, + "learning_rate": 1.9924819595718805e-05, + "loss": 0.4537, + "step": 958 + }, + { + "epoch": 0.11670216002433831, + "grad_norm": 0.76931232213974, + "learning_rate": 1.9924580659479288e-05, + "loss": 0.4143, + "step": 959 + }, + { + "epoch": 0.11682385153635534, + "grad_norm": 2.371206760406494, + "learning_rate": 1.9924341345588935e-05, + "loss": 0.4811, + "step": 960 + }, + { + "epoch": 0.11694554304837237, + "grad_norm": 1.6431336402893066, + "learning_rate": 1.992410165405685e-05, + "loss": 0.4544, + "step": 961 + }, + { + "epoch": 0.11706723456038941, + "grad_norm": 1.5598409175872803, + "learning_rate": 1.9923861584892155e-05, + "loss": 0.458, + "step": 962 + }, + { + "epoch": 0.11718892607240645, + "grad_norm": 0.637040913105011, + "learning_rate": 1.9923621138103983e-05, + "loss": 0.4577, + "step": 963 + }, + { + "epoch": 0.11731061758442349, + "grad_norm": 2.3656256198883057, + "learning_rate": 1.992338031370149e-05, + "loss": 0.4685, + "step": 964 + }, + { + "epoch": 0.11743230909644052, + "grad_norm": 0.9912247061729431, + "learning_rate": 1.9923139111693833e-05, + "loss": 0.4945, + "step": 965 + }, + { + "epoch": 0.11755400060845755, + "grad_norm": 1.3829625844955444, + "learning_rate": 1.9922897532090194e-05, + "loss": 0.4901, + "step": 966 + }, + { + "epoch": 0.1176756921204746, + "grad_norm": 2.4134111404418945, + "learning_rate": 1.9922655574899762e-05, + "loss": 0.4622, + "step": 967 + }, + { + "epoch": 0.11779738363249163, + "grad_norm": 1.6261649131774902, + "learning_rate": 1.992241324013175e-05, + "loss": 0.4443, + "step": 968 + }, + { + "epoch": 0.11791907514450867, + "grad_norm": 0.6375196576118469, + "learning_rate": 1.9922170527795376e-05, + "loss": 0.4885, + "step": 969 + }, + { + "epoch": 0.11804076665652571, + "grad_norm": 3.2338030338287354, + "learning_rate": 1.9921927437899876e-05, + "loss": 0.5274, + "step": 970 + }, + { + "epoch": 0.11816245816854275, + "grad_norm": 0.527633786201477, + "learning_rate": 1.99216839704545e-05, + "loss": 0.4384, + "step": 971 + }, + { + "epoch": 0.11828414968055978, + "grad_norm": 2.055549383163452, + "learning_rate": 1.992144012546851e-05, + "loss": 0.4834, + "step": 972 + }, + { + "epoch": 0.11840584119257681, + "grad_norm": 2.95906925201416, + "learning_rate": 1.9921195902951187e-05, + "loss": 0.4957, + "step": 973 + }, + { + "epoch": 0.11852753270459386, + "grad_norm": 0.7035129070281982, + "learning_rate": 1.9920951302911823e-05, + "loss": 0.4753, + "step": 974 + }, + { + "epoch": 0.1186492242166109, + "grad_norm": 0.6045013070106506, + "learning_rate": 1.992070632535973e-05, + "loss": 0.4678, + "step": 975 + }, + { + "epoch": 0.11877091572862793, + "grad_norm": 1.5679330825805664, + "learning_rate": 1.9920460970304224e-05, + "loss": 0.4778, + "step": 976 + }, + { + "epoch": 0.11889260724064496, + "grad_norm": 1.5576215982437134, + "learning_rate": 1.9920215237754647e-05, + "loss": 0.4611, + "step": 977 + }, + { + "epoch": 0.119014298752662, + "grad_norm": 1.4160807132720947, + "learning_rate": 1.9919969127720345e-05, + "loss": 0.471, + "step": 978 + }, + { + "epoch": 0.11913599026467904, + "grad_norm": 0.7447370290756226, + "learning_rate": 1.9919722640210685e-05, + "loss": 0.4402, + "step": 979 + }, + { + "epoch": 0.11925768177669607, + "grad_norm": 1.8573156595230103, + "learning_rate": 1.9919475775235043e-05, + "loss": 0.5008, + "step": 980 + }, + { + "epoch": 0.1193793732887131, + "grad_norm": 4.865150451660156, + "learning_rate": 1.9919228532802817e-05, + "loss": 0.5459, + "step": 981 + }, + { + "epoch": 0.11950106480073015, + "grad_norm": 2.2894115447998047, + "learning_rate": 1.9918980912923412e-05, + "loss": 0.4876, + "step": 982 + }, + { + "epoch": 0.11962275631274719, + "grad_norm": 0.7747580409049988, + "learning_rate": 1.9918732915606255e-05, + "loss": 0.4849, + "step": 983 + }, + { + "epoch": 0.11974444782476422, + "grad_norm": 0.8401018977165222, + "learning_rate": 1.991848454086078e-05, + "loss": 0.5099, + "step": 984 + }, + { + "epoch": 0.11986613933678127, + "grad_norm": 1.1207845211029053, + "learning_rate": 1.9918235788696437e-05, + "loss": 0.5499, + "step": 985 + }, + { + "epoch": 0.1199878308487983, + "grad_norm": 1.5522438287734985, + "learning_rate": 1.9917986659122692e-05, + "loss": 0.5234, + "step": 986 + }, + { + "epoch": 0.12010952236081533, + "grad_norm": 4.20838737487793, + "learning_rate": 1.9917737152149027e-05, + "loss": 0.4454, + "step": 987 + }, + { + "epoch": 0.12023121387283237, + "grad_norm": 1.867698073387146, + "learning_rate": 1.9917487267784934e-05, + "loss": 0.4816, + "step": 988 + }, + { + "epoch": 0.12035290538484941, + "grad_norm": 4.351875305175781, + "learning_rate": 1.9917237006039922e-05, + "loss": 0.4132, + "step": 989 + }, + { + "epoch": 0.12047459689686645, + "grad_norm": 2.0159127712249756, + "learning_rate": 1.9916986366923517e-05, + "loss": 0.5015, + "step": 990 + }, + { + "epoch": 0.12059628840888348, + "grad_norm": 2.2653989791870117, + "learning_rate": 1.991673535044525e-05, + "loss": 0.497, + "step": 991 + }, + { + "epoch": 0.12071797992090051, + "grad_norm": 1.2745273113250732, + "learning_rate": 1.991648395661468e-05, + "loss": 0.4094, + "step": 992 + }, + { + "epoch": 0.12083967143291756, + "grad_norm": 2.639756441116333, + "learning_rate": 1.9916232185441365e-05, + "loss": 0.4572, + "step": 993 + }, + { + "epoch": 0.12096136294493459, + "grad_norm": 5.562560081481934, + "learning_rate": 1.9915980036934894e-05, + "loss": 0.613, + "step": 994 + }, + { + "epoch": 0.12108305445695162, + "grad_norm": 0.9916262626647949, + "learning_rate": 1.9915727511104854e-05, + "loss": 0.5221, + "step": 995 + }, + { + "epoch": 0.12120474596896866, + "grad_norm": 1.339257836341858, + "learning_rate": 1.991547460796086e-05, + "loss": 0.5748, + "step": 996 + }, + { + "epoch": 0.1213264374809857, + "grad_norm": 5.516851425170898, + "learning_rate": 1.9915221327512532e-05, + "loss": 0.5339, + "step": 997 + }, + { + "epoch": 0.12144812899300274, + "grad_norm": 4.493785381317139, + "learning_rate": 1.9914967669769512e-05, + "loss": 0.5209, + "step": 998 + }, + { + "epoch": 0.12156982050501977, + "grad_norm": 7.7040019035339355, + "learning_rate": 1.991471363474145e-05, + "loss": 0.5222, + "step": 999 + }, + { + "epoch": 0.12169151201703682, + "grad_norm": 6.866464138031006, + "learning_rate": 1.9914459222438006e-05, + "loss": 0.5507, + "step": 1000 + }, + { + "epoch": 0.12181320352905385, + "grad_norm": 6.365875720977783, + "learning_rate": 1.991420443286887e-05, + "loss": 0.5163, + "step": 1001 + }, + { + "epoch": 0.12193489504107088, + "grad_norm": 4.724626541137695, + "learning_rate": 1.9913949266043735e-05, + "loss": 0.5124, + "step": 1002 + }, + { + "epoch": 0.12205658655308792, + "grad_norm": 4.0745344161987305, + "learning_rate": 1.9913693721972307e-05, + "loss": 0.5122, + "step": 1003 + }, + { + "epoch": 0.12217827806510496, + "grad_norm": 1.1672906875610352, + "learning_rate": 1.9913437800664313e-05, + "loss": 0.4765, + "step": 1004 + }, + { + "epoch": 0.122299969577122, + "grad_norm": 2.6373729705810547, + "learning_rate": 1.9913181502129495e-05, + "loss": 0.4792, + "step": 1005 + }, + { + "epoch": 0.12242166108913903, + "grad_norm": 3.2573599815368652, + "learning_rate": 1.9912924826377598e-05, + "loss": 0.4979, + "step": 1006 + }, + { + "epoch": 0.12254335260115606, + "grad_norm": 5.804367542266846, + "learning_rate": 1.9912667773418394e-05, + "loss": 0.5343, + "step": 1007 + }, + { + "epoch": 0.12266504411317311, + "grad_norm": 6.964966297149658, + "learning_rate": 1.9912410343261664e-05, + "loss": 0.5789, + "step": 1008 + }, + { + "epoch": 0.12278673562519014, + "grad_norm": 4.244586944580078, + "learning_rate": 1.99121525359172e-05, + "loss": 0.496, + "step": 1009 + }, + { + "epoch": 0.12290842713720718, + "grad_norm": 1.811240315437317, + "learning_rate": 1.991189435139482e-05, + "loss": 0.455, + "step": 1010 + }, + { + "epoch": 0.12303011864922421, + "grad_norm": 1.173520803451538, + "learning_rate": 1.9911635789704338e-05, + "loss": 0.4285, + "step": 1011 + }, + { + "epoch": 0.12315181016124126, + "grad_norm": 5.163769721984863, + "learning_rate": 1.99113768508556e-05, + "loss": 0.5986, + "step": 1012 + }, + { + "epoch": 0.12327350167325829, + "grad_norm": 2.043832540512085, + "learning_rate": 1.991111753485846e-05, + "loss": 0.4694, + "step": 1013 + }, + { + "epoch": 0.12339519318527532, + "grad_norm": 3.5095207691192627, + "learning_rate": 1.991085784172278e-05, + "loss": 0.4978, + "step": 1014 + }, + { + "epoch": 0.12351688469729237, + "grad_norm": 4.421924591064453, + "learning_rate": 1.9910597771458446e-05, + "loss": 0.4866, + "step": 1015 + }, + { + "epoch": 0.1236385762093094, + "grad_norm": 5.892740726470947, + "learning_rate": 1.991033732407535e-05, + "loss": 0.4918, + "step": 1016 + }, + { + "epoch": 0.12376026772132644, + "grad_norm": 6.371843338012695, + "learning_rate": 1.991007649958341e-05, + "loss": 0.4837, + "step": 1017 + }, + { + "epoch": 0.12388195923334347, + "grad_norm": 1.5353717803955078, + "learning_rate": 1.990981529799254e-05, + "loss": 0.4839, + "step": 1018 + }, + { + "epoch": 0.12400365074536052, + "grad_norm": 2.3877511024475098, + "learning_rate": 1.9909553719312693e-05, + "loss": 0.52, + "step": 1019 + }, + { + "epoch": 0.12412534225737755, + "grad_norm": 1.7043259143829346, + "learning_rate": 1.9909291763553813e-05, + "loss": 0.4516, + "step": 1020 + }, + { + "epoch": 0.12424703376939458, + "grad_norm": 1.2159875631332397, + "learning_rate": 1.990902943072587e-05, + "loss": 0.4664, + "step": 1021 + }, + { + "epoch": 0.12436872528141162, + "grad_norm": 1.8231126070022583, + "learning_rate": 1.9908766720838847e-05, + "loss": 0.4709, + "step": 1022 + }, + { + "epoch": 0.12449041679342866, + "grad_norm": 0.6514496803283691, + "learning_rate": 1.990850363390274e-05, + "loss": 0.5004, + "step": 1023 + }, + { + "epoch": 0.1246121083054457, + "grad_norm": 0.9176408648490906, + "learning_rate": 1.990824016992756e-05, + "loss": 0.5215, + "step": 1024 + }, + { + "epoch": 0.12473379981746273, + "grad_norm": 3.3087456226348877, + "learning_rate": 1.990797632892333e-05, + "loss": 0.403, + "step": 1025 + }, + { + "epoch": 0.12485549132947976, + "grad_norm": 3.45039701461792, + "learning_rate": 1.9907712110900098e-05, + "loss": 0.4367, + "step": 1026 + }, + { + "epoch": 0.12497718284149681, + "grad_norm": 2.794262409210205, + "learning_rate": 1.9907447515867907e-05, + "loss": 0.4585, + "step": 1027 + }, + { + "epoch": 0.12509887435351386, + "grad_norm": 0.5666921138763428, + "learning_rate": 1.9907182543836835e-05, + "loss": 0.5058, + "step": 1028 + }, + { + "epoch": 0.12522056586553088, + "grad_norm": 0.8674224019050598, + "learning_rate": 1.990691719481696e-05, + "loss": 0.4928, + "step": 1029 + }, + { + "epoch": 0.12534225737754792, + "grad_norm": 1.3358700275421143, + "learning_rate": 1.9906651468818374e-05, + "loss": 0.4874, + "step": 1030 + }, + { + "epoch": 0.12546394888956494, + "grad_norm": 0.5441078543663025, + "learning_rate": 1.9906385365851198e-05, + "loss": 0.4483, + "step": 1031 + }, + { + "epoch": 0.125585640401582, + "grad_norm": 0.9660111665725708, + "learning_rate": 1.9906118885925558e-05, + "loss": 0.5085, + "step": 1032 + }, + { + "epoch": 0.12570733191359904, + "grad_norm": 0.888455331325531, + "learning_rate": 1.990585202905158e-05, + "loss": 0.4702, + "step": 1033 + }, + { + "epoch": 0.12582902342561605, + "grad_norm": 1.2749918699264526, + "learning_rate": 1.9905584795239435e-05, + "loss": 0.4653, + "step": 1034 + }, + { + "epoch": 0.1259507149376331, + "grad_norm": 1.0314130783081055, + "learning_rate": 1.9905317184499284e-05, + "loss": 0.4667, + "step": 1035 + }, + { + "epoch": 0.12607240644965015, + "grad_norm": 0.6198263764381409, + "learning_rate": 1.9905049196841313e-05, + "loss": 0.4537, + "step": 1036 + }, + { + "epoch": 0.12619409796166717, + "grad_norm": 0.7804372906684875, + "learning_rate": 1.9904780832275717e-05, + "loss": 0.4572, + "step": 1037 + }, + { + "epoch": 0.12631578947368421, + "grad_norm": 1.3657641410827637, + "learning_rate": 1.9904512090812705e-05, + "loss": 0.4797, + "step": 1038 + }, + { + "epoch": 0.12643748098570123, + "grad_norm": 3.615712881088257, + "learning_rate": 1.9904242972462507e-05, + "loss": 0.5061, + "step": 1039 + }, + { + "epoch": 0.12655917249771828, + "grad_norm": 0.7965696454048157, + "learning_rate": 1.9903973477235368e-05, + "loss": 0.4296, + "step": 1040 + }, + { + "epoch": 0.12668086400973533, + "grad_norm": 0.87240070104599, + "learning_rate": 1.9903703605141536e-05, + "loss": 0.4423, + "step": 1041 + }, + { + "epoch": 0.12680255552175235, + "grad_norm": 0.8806689381599426, + "learning_rate": 1.990343335619128e-05, + "loss": 0.4445, + "step": 1042 + }, + { + "epoch": 0.1269242470337694, + "grad_norm": 0.944068431854248, + "learning_rate": 1.990316273039489e-05, + "loss": 0.495, + "step": 1043 + }, + { + "epoch": 0.12704593854578644, + "grad_norm": 1.1978839635849, + "learning_rate": 1.9902891727762656e-05, + "loss": 0.5269, + "step": 1044 + }, + { + "epoch": 0.12716763005780346, + "grad_norm": 2.517319679260254, + "learning_rate": 1.9902620348304894e-05, + "loss": 0.4923, + "step": 1045 + }, + { + "epoch": 0.1272893215698205, + "grad_norm": 3.820152759552002, + "learning_rate": 1.9902348592031932e-05, + "loss": 0.4883, + "step": 1046 + }, + { + "epoch": 0.12741101308183755, + "grad_norm": 2.059230089187622, + "learning_rate": 1.990207645895411e-05, + "loss": 0.4895, + "step": 1047 + }, + { + "epoch": 0.12753270459385457, + "grad_norm": 4.584321022033691, + "learning_rate": 1.990180394908178e-05, + "loss": 0.4741, + "step": 1048 + }, + { + "epoch": 0.12765439610587162, + "grad_norm": 1.132431149482727, + "learning_rate": 1.9901531062425316e-05, + "loss": 0.4337, + "step": 1049 + }, + { + "epoch": 0.12777608761788864, + "grad_norm": 1.0330190658569336, + "learning_rate": 1.99012577989951e-05, + "loss": 0.481, + "step": 1050 + }, + { + "epoch": 0.1278977791299057, + "grad_norm": 5.490146160125732, + "learning_rate": 1.990098415880153e-05, + "loss": 0.6271, + "step": 1051 + }, + { + "epoch": 0.12801947064192273, + "grad_norm": 2.893369436264038, + "learning_rate": 1.9900710141855018e-05, + "loss": 0.5039, + "step": 1052 + }, + { + "epoch": 0.12814116215393975, + "grad_norm": 5.065304756164551, + "learning_rate": 1.9900435748165994e-05, + "loss": 0.5771, + "step": 1053 + }, + { + "epoch": 0.1282628536659568, + "grad_norm": 1.762570858001709, + "learning_rate": 1.9900160977744897e-05, + "loss": 0.5018, + "step": 1054 + }, + { + "epoch": 0.12838454517797385, + "grad_norm": 1.611161708831787, + "learning_rate": 1.989988583060218e-05, + "loss": 0.5268, + "step": 1055 + }, + { + "epoch": 0.12850623668999087, + "grad_norm": 3.819828510284424, + "learning_rate": 1.989961030674832e-05, + "loss": 0.4531, + "step": 1056 + }, + { + "epoch": 0.1286279282020079, + "grad_norm": 2.2731497287750244, + "learning_rate": 1.989933440619379e-05, + "loss": 0.5055, + "step": 1057 + }, + { + "epoch": 0.12874961971402496, + "grad_norm": 2.4282963275909424, + "learning_rate": 1.98990581289491e-05, + "loss": 0.5095, + "step": 1058 + }, + { + "epoch": 0.12887131122604198, + "grad_norm": 1.9960323572158813, + "learning_rate": 1.9898781475024755e-05, + "loss": 0.5085, + "step": 1059 + }, + { + "epoch": 0.12899300273805903, + "grad_norm": 1.3955953121185303, + "learning_rate": 1.989850444443129e-05, + "loss": 0.5128, + "step": 1060 + }, + { + "epoch": 0.12911469425007605, + "grad_norm": 3.0201621055603027, + "learning_rate": 1.989822703717924e-05, + "loss": 0.5401, + "step": 1061 + }, + { + "epoch": 0.1292363857620931, + "grad_norm": 1.393862247467041, + "learning_rate": 1.989794925327916e-05, + "loss": 0.4261, + "step": 1062 + }, + { + "epoch": 0.12935807727411014, + "grad_norm": 6.086862087249756, + "learning_rate": 1.989767109274163e-05, + "loss": 0.5931, + "step": 1063 + }, + { + "epoch": 0.12947976878612716, + "grad_norm": 3.9537577629089355, + "learning_rate": 1.989739255557722e-05, + "loss": 0.5358, + "step": 1064 + }, + { + "epoch": 0.1296014602981442, + "grad_norm": 1.3847154378890991, + "learning_rate": 1.989711364179654e-05, + "loss": 0.4831, + "step": 1065 + }, + { + "epoch": 0.12972315181016125, + "grad_norm": 0.8064104914665222, + "learning_rate": 1.98968343514102e-05, + "loss": 0.4341, + "step": 1066 + }, + { + "epoch": 0.12984484332217827, + "grad_norm": 1.4337934255599976, + "learning_rate": 1.989655468442883e-05, + "loss": 0.4775, + "step": 1067 + }, + { + "epoch": 0.12996653483419532, + "grad_norm": 2.249725103378296, + "learning_rate": 1.9896274640863068e-05, + "loss": 0.4609, + "step": 1068 + }, + { + "epoch": 0.13008822634621234, + "grad_norm": 4.285830974578857, + "learning_rate": 1.989599422072357e-05, + "loss": 0.4345, + "step": 1069 + }, + { + "epoch": 0.13020991785822938, + "grad_norm": 1.1885123252868652, + "learning_rate": 1.9895713424021013e-05, + "loss": 0.4692, + "step": 1070 + }, + { + "epoch": 0.13033160937024643, + "grad_norm": 0.7677265405654907, + "learning_rate": 1.9895432250766073e-05, + "loss": 0.5207, + "step": 1071 + }, + { + "epoch": 0.13045330088226345, + "grad_norm": 0.6993517279624939, + "learning_rate": 1.9895150700969453e-05, + "loss": 0.4858, + "step": 1072 + }, + { + "epoch": 0.1305749923942805, + "grad_norm": 3.629011392593384, + "learning_rate": 1.989486877464187e-05, + "loss": 0.4192, + "step": 1073 + }, + { + "epoch": 0.13069668390629754, + "grad_norm": 4.147429943084717, + "learning_rate": 1.9894586471794047e-05, + "loss": 0.5904, + "step": 1074 + }, + { + "epoch": 0.13081837541831456, + "grad_norm": 1.766664743423462, + "learning_rate": 1.989430379243673e-05, + "loss": 0.526, + "step": 1075 + }, + { + "epoch": 0.1309400669303316, + "grad_norm": 1.9937471151351929, + "learning_rate": 1.9894020736580672e-05, + "loss": 0.5269, + "step": 1076 + }, + { + "epoch": 0.13106175844234866, + "grad_norm": 1.8291488885879517, + "learning_rate": 1.9893737304236644e-05, + "loss": 0.5168, + "step": 1077 + }, + { + "epoch": 0.13118344995436568, + "grad_norm": 3.317469835281372, + "learning_rate": 1.9893453495415436e-05, + "loss": 0.4334, + "step": 1078 + }, + { + "epoch": 0.13130514146638272, + "grad_norm": 1.517987608909607, + "learning_rate": 1.989316931012784e-05, + "loss": 0.5321, + "step": 1079 + }, + { + "epoch": 0.13142683297839974, + "grad_norm": 3.121569871902466, + "learning_rate": 1.9892884748384678e-05, + "loss": 0.4948, + "step": 1080 + }, + { + "epoch": 0.1315485244904168, + "grad_norm": 4.9344024658203125, + "learning_rate": 1.9892599810196772e-05, + "loss": 0.4189, + "step": 1081 + }, + { + "epoch": 0.13167021600243384, + "grad_norm": 2.4887192249298096, + "learning_rate": 1.9892314495574967e-05, + "loss": 0.4245, + "step": 1082 + }, + { + "epoch": 0.13179190751445086, + "grad_norm": 1.7641483545303345, + "learning_rate": 1.989202880453012e-05, + "loss": 0.5314, + "step": 1083 + }, + { + "epoch": 0.1319135990264679, + "grad_norm": 5.092477321624756, + "learning_rate": 1.98917427370731e-05, + "loss": 0.5421, + "step": 1084 + }, + { + "epoch": 0.13203529053848495, + "grad_norm": 3.9195423126220703, + "learning_rate": 1.9891456293214797e-05, + "loss": 0.5188, + "step": 1085 + }, + { + "epoch": 0.13215698205050197, + "grad_norm": 3.7562026977539062, + "learning_rate": 1.9891169472966107e-05, + "loss": 0.5396, + "step": 1086 + }, + { + "epoch": 0.13227867356251902, + "grad_norm": 1.3222163915634155, + "learning_rate": 1.9890882276337943e-05, + "loss": 0.5105, + "step": 1087 + }, + { + "epoch": 0.13240036507453606, + "grad_norm": 0.5431426167488098, + "learning_rate": 1.9890594703341234e-05, + "loss": 0.4952, + "step": 1088 + }, + { + "epoch": 0.13252205658655308, + "grad_norm": 4.460387706756592, + "learning_rate": 1.9890306753986928e-05, + "loss": 0.4645, + "step": 1089 + }, + { + "epoch": 0.13264374809857013, + "grad_norm": 2.880514144897461, + "learning_rate": 1.9890018428285977e-05, + "loss": 0.4797, + "step": 1090 + }, + { + "epoch": 0.13276543961058715, + "grad_norm": 2.051021099090576, + "learning_rate": 1.9889729726249355e-05, + "loss": 0.4882, + "step": 1091 + }, + { + "epoch": 0.1328871311226042, + "grad_norm": 2.871445894241333, + "learning_rate": 1.9889440647888043e-05, + "loss": 0.5146, + "step": 1092 + }, + { + "epoch": 0.13300882263462124, + "grad_norm": 1.0191707611083984, + "learning_rate": 1.988915119321305e-05, + "loss": 0.4754, + "step": 1093 + }, + { + "epoch": 0.13313051414663826, + "grad_norm": 3.2318060398101807, + "learning_rate": 1.988886136223538e-05, + "loss": 0.5293, + "step": 1094 + }, + { + "epoch": 0.1332522056586553, + "grad_norm": 0.6224351525306702, + "learning_rate": 1.9888571154966065e-05, + "loss": 0.4171, + "step": 1095 + }, + { + "epoch": 0.13337389717067236, + "grad_norm": 2.7519643306732178, + "learning_rate": 1.988828057141615e-05, + "loss": 0.4937, + "step": 1096 + }, + { + "epoch": 0.13349558868268938, + "grad_norm": 1.3889471292495728, + "learning_rate": 1.9887989611596694e-05, + "loss": 0.4393, + "step": 1097 + }, + { + "epoch": 0.13361728019470642, + "grad_norm": 2.1229891777038574, + "learning_rate": 1.9887698275518764e-05, + "loss": 0.495, + "step": 1098 + }, + { + "epoch": 0.13373897170672347, + "grad_norm": 0.706048309803009, + "learning_rate": 1.9887406563193452e-05, + "loss": 0.5292, + "step": 1099 + }, + { + "epoch": 0.1338606632187405, + "grad_norm": 0.8292319178581238, + "learning_rate": 1.9887114474631852e-05, + "loss": 0.4669, + "step": 1100 + }, + { + "epoch": 0.13398235473075754, + "grad_norm": 2.162430763244629, + "learning_rate": 1.9886822009845082e-05, + "loss": 0.5202, + "step": 1101 + }, + { + "epoch": 0.13410404624277455, + "grad_norm": 3.347226142883301, + "learning_rate": 1.9886529168844267e-05, + "loss": 0.4971, + "step": 1102 + }, + { + "epoch": 0.1342257377547916, + "grad_norm": 5.355313777923584, + "learning_rate": 1.9886235951640556e-05, + "loss": 0.4803, + "step": 1103 + }, + { + "epoch": 0.13434742926680865, + "grad_norm": 2.244405746459961, + "learning_rate": 1.9885942358245104e-05, + "loss": 0.4865, + "step": 1104 + }, + { + "epoch": 0.13446912077882567, + "grad_norm": 4.953440189361572, + "learning_rate": 1.988564838866908e-05, + "loss": 0.428, + "step": 1105 + }, + { + "epoch": 0.13459081229084272, + "grad_norm": 0.7820557951927185, + "learning_rate": 1.9885354042923674e-05, + "loss": 0.4954, + "step": 1106 + }, + { + "epoch": 0.13471250380285976, + "grad_norm": 1.209432601928711, + "learning_rate": 1.9885059321020085e-05, + "loss": 0.4696, + "step": 1107 + }, + { + "epoch": 0.13483419531487678, + "grad_norm": 0.7294307351112366, + "learning_rate": 1.988476422296953e-05, + "loss": 0.487, + "step": 1108 + }, + { + "epoch": 0.13495588682689383, + "grad_norm": 0.777212381362915, + "learning_rate": 1.9884468748783236e-05, + "loss": 0.4852, + "step": 1109 + }, + { + "epoch": 0.13507757833891085, + "grad_norm": 0.8023810982704163, + "learning_rate": 1.9884172898472444e-05, + "loss": 0.5026, + "step": 1110 + }, + { + "epoch": 0.1351992698509279, + "grad_norm": 0.6087104678153992, + "learning_rate": 1.988387667204841e-05, + "loss": 0.4899, + "step": 1111 + }, + { + "epoch": 0.13532096136294494, + "grad_norm": 0.657290518283844, + "learning_rate": 1.9883580069522417e-05, + "loss": 0.5048, + "step": 1112 + }, + { + "epoch": 0.13544265287496196, + "grad_norm": 1.1142759323120117, + "learning_rate": 1.9883283090905744e-05, + "loss": 0.4823, + "step": 1113 + }, + { + "epoch": 0.135564344386979, + "grad_norm": 2.6991608142852783, + "learning_rate": 1.988298573620969e-05, + "loss": 0.4653, + "step": 1114 + }, + { + "epoch": 0.13568603589899605, + "grad_norm": 0.6051679253578186, + "learning_rate": 1.988268800544557e-05, + "loss": 0.475, + "step": 1115 + }, + { + "epoch": 0.13580772741101307, + "grad_norm": 1.133419394493103, + "learning_rate": 1.9882389898624716e-05, + "loss": 0.4446, + "step": 1116 + }, + { + "epoch": 0.13592941892303012, + "grad_norm": 3.801710367202759, + "learning_rate": 1.9882091415758474e-05, + "loss": 0.5287, + "step": 1117 + }, + { + "epoch": 0.13605111043504717, + "grad_norm": 2.743229866027832, + "learning_rate": 1.9881792556858197e-05, + "loss": 0.5043, + "step": 1118 + }, + { + "epoch": 0.1361728019470642, + "grad_norm": 0.6475219130516052, + "learning_rate": 1.9881493321935258e-05, + "loss": 0.4895, + "step": 1119 + }, + { + "epoch": 0.13629449345908123, + "grad_norm": 1.1221957206726074, + "learning_rate": 1.9881193711001048e-05, + "loss": 0.5212, + "step": 1120 + }, + { + "epoch": 0.13641618497109825, + "grad_norm": 1.2978185415267944, + "learning_rate": 1.9880893724066964e-05, + "loss": 0.4827, + "step": 1121 + }, + { + "epoch": 0.1365378764831153, + "grad_norm": 2.9070370197296143, + "learning_rate": 1.9880593361144415e-05, + "loss": 0.4286, + "step": 1122 + }, + { + "epoch": 0.13665956799513235, + "grad_norm": 0.9966745972633362, + "learning_rate": 1.9880292622244845e-05, + "loss": 0.4341, + "step": 1123 + }, + { + "epoch": 0.13678125950714937, + "grad_norm": 0.8091104030609131, + "learning_rate": 1.9879991507379686e-05, + "loss": 0.4283, + "step": 1124 + }, + { + "epoch": 0.1369029510191664, + "grad_norm": 2.4667422771453857, + "learning_rate": 1.98796900165604e-05, + "loss": 0.4976, + "step": 1125 + }, + { + "epoch": 0.13702464253118346, + "grad_norm": 4.526529788970947, + "learning_rate": 1.9879388149798456e-05, + "loss": 0.5393, + "step": 1126 + }, + { + "epoch": 0.13714633404320048, + "grad_norm": 4.00773286819458, + "learning_rate": 1.987908590710535e-05, + "loss": 0.5343, + "step": 1127 + }, + { + "epoch": 0.13726802555521753, + "grad_norm": 0.6866424679756165, + "learning_rate": 1.987878328849257e-05, + "loss": 0.4278, + "step": 1128 + }, + { + "epoch": 0.13738971706723457, + "grad_norm": 1.0679837465286255, + "learning_rate": 1.9878480293971646e-05, + "loss": 0.44, + "step": 1129 + }, + { + "epoch": 0.1375114085792516, + "grad_norm": 0.7804145812988281, + "learning_rate": 1.9878176923554093e-05, + "loss": 0.5078, + "step": 1130 + }, + { + "epoch": 0.13763310009126864, + "grad_norm": 1.4702852964401245, + "learning_rate": 1.9877873177251464e-05, + "loss": 0.4924, + "step": 1131 + }, + { + "epoch": 0.13775479160328566, + "grad_norm": 1.2784676551818848, + "learning_rate": 1.987756905507531e-05, + "loss": 0.5162, + "step": 1132 + }, + { + "epoch": 0.1378764831153027, + "grad_norm": 3.1514322757720947, + "learning_rate": 1.9877264557037213e-05, + "loss": 0.4654, + "step": 1133 + }, + { + "epoch": 0.13799817462731975, + "grad_norm": 0.7366341352462769, + "learning_rate": 1.9876959683148753e-05, + "loss": 0.4548, + "step": 1134 + }, + { + "epoch": 0.13811986613933677, + "grad_norm": 0.8866164088249207, + "learning_rate": 1.9876654433421534e-05, + "loss": 0.4808, + "step": 1135 + }, + { + "epoch": 0.13824155765135382, + "grad_norm": 2.6878104209899902, + "learning_rate": 1.987634880786717e-05, + "loss": 0.495, + "step": 1136 + }, + { + "epoch": 0.13836324916337087, + "grad_norm": 0.9557205438613892, + "learning_rate": 1.987604280649729e-05, + "loss": 0.4519, + "step": 1137 + }, + { + "epoch": 0.13848494067538789, + "grad_norm": 3.3331611156463623, + "learning_rate": 1.987573642932354e-05, + "loss": 0.4966, + "step": 1138 + }, + { + "epoch": 0.13860663218740493, + "grad_norm": 1.2201111316680908, + "learning_rate": 1.9875429676357576e-05, + "loss": 0.4481, + "step": 1139 + }, + { + "epoch": 0.13872832369942195, + "grad_norm": 1.5304169654846191, + "learning_rate": 1.9875122547611072e-05, + "loss": 0.4668, + "step": 1140 + }, + { + "epoch": 0.138850015211439, + "grad_norm": 0.8971834182739258, + "learning_rate": 1.9874815043095714e-05, + "loss": 0.5049, + "step": 1141 + }, + { + "epoch": 0.13897170672345605, + "grad_norm": 2.9369068145751953, + "learning_rate": 1.9874507162823205e-05, + "loss": 0.4672, + "step": 1142 + }, + { + "epoch": 0.13909339823547306, + "grad_norm": 1.979405403137207, + "learning_rate": 1.987419890680526e-05, + "loss": 0.5469, + "step": 1143 + }, + { + "epoch": 0.1392150897474901, + "grad_norm": 3.0535216331481934, + "learning_rate": 1.9873890275053606e-05, + "loss": 0.4846, + "step": 1144 + }, + { + "epoch": 0.13933678125950716, + "grad_norm": 7.240109443664551, + "learning_rate": 1.9873581267579992e-05, + "loss": 0.4554, + "step": 1145 + }, + { + "epoch": 0.13945847277152418, + "grad_norm": 6.796575546264648, + "learning_rate": 1.9873271884396173e-05, + "loss": 0.4877, + "step": 1146 + }, + { + "epoch": 0.13958016428354122, + "grad_norm": 4.516753673553467, + "learning_rate": 1.9872962125513922e-05, + "loss": 0.4755, + "step": 1147 + }, + { + "epoch": 0.13970185579555827, + "grad_norm": 2.8543522357940674, + "learning_rate": 1.9872651990945024e-05, + "loss": 0.4566, + "step": 1148 + }, + { + "epoch": 0.1398235473075753, + "grad_norm": 0.7286142706871033, + "learning_rate": 1.9872341480701286e-05, + "loss": 0.4856, + "step": 1149 + }, + { + "epoch": 0.13994523881959234, + "grad_norm": 2.64182448387146, + "learning_rate": 1.987203059479452e-05, + "loss": 0.4704, + "step": 1150 + }, + { + "epoch": 0.14006693033160936, + "grad_norm": 2.409075975418091, + "learning_rate": 1.987171933323655e-05, + "loss": 0.5042, + "step": 1151 + }, + { + "epoch": 0.1401886218436264, + "grad_norm": 5.348703861236572, + "learning_rate": 1.987140769603923e-05, + "loss": 0.5052, + "step": 1152 + }, + { + "epoch": 0.14031031335564345, + "grad_norm": 3.3560080528259277, + "learning_rate": 1.9871095683214414e-05, + "loss": 0.4862, + "step": 1153 + }, + { + "epoch": 0.14043200486766047, + "grad_norm": 1.841772437095642, + "learning_rate": 1.9870783294773977e-05, + "loss": 0.511, + "step": 1154 + }, + { + "epoch": 0.14055369637967752, + "grad_norm": 0.870384156703949, + "learning_rate": 1.9870470530729805e-05, + "loss": 0.4354, + "step": 1155 + }, + { + "epoch": 0.14067538789169456, + "grad_norm": 0.7190083861351013, + "learning_rate": 1.9870157391093795e-05, + "loss": 0.5154, + "step": 1156 + }, + { + "epoch": 0.14079707940371158, + "grad_norm": 0.9531543850898743, + "learning_rate": 1.9869843875877867e-05, + "loss": 0.5232, + "step": 1157 + }, + { + "epoch": 0.14091877091572863, + "grad_norm": 2.533087968826294, + "learning_rate": 1.986952998509395e-05, + "loss": 0.5027, + "step": 1158 + }, + { + "epoch": 0.14104046242774568, + "grad_norm": 4.98061466217041, + "learning_rate": 1.986921571875399e-05, + "loss": 0.4683, + "step": 1159 + }, + { + "epoch": 0.1411621539397627, + "grad_norm": 1.3997783660888672, + "learning_rate": 1.9868901076869946e-05, + "loss": 0.4841, + "step": 1160 + }, + { + "epoch": 0.14128384545177974, + "grad_norm": 4.7056474685668945, + "learning_rate": 1.986858605945378e-05, + "loss": 0.4607, + "step": 1161 + }, + { + "epoch": 0.14140553696379676, + "grad_norm": 3.663830041885376, + "learning_rate": 1.9868270666517496e-05, + "loss": 0.4455, + "step": 1162 + }, + { + "epoch": 0.1415272284758138, + "grad_norm": 1.792288899421692, + "learning_rate": 1.9867954898073084e-05, + "loss": 0.4995, + "step": 1163 + }, + { + "epoch": 0.14164891998783086, + "grad_norm": 2.9815714359283447, + "learning_rate": 1.9867638754132562e-05, + "loss": 0.5496, + "step": 1164 + }, + { + "epoch": 0.14177061149984788, + "grad_norm": 2.771726369857788, + "learning_rate": 1.9867322234707963e-05, + "loss": 0.5031, + "step": 1165 + }, + { + "epoch": 0.14189230301186492, + "grad_norm": 2.5088088512420654, + "learning_rate": 1.9867005339811324e-05, + "loss": 0.4973, + "step": 1166 + }, + { + "epoch": 0.14201399452388197, + "grad_norm": 1.4814538955688477, + "learning_rate": 1.986668806945471e-05, + "loss": 0.4504, + "step": 1167 + }, + { + "epoch": 0.142135686035899, + "grad_norm": 1.8390085697174072, + "learning_rate": 1.9866370423650194e-05, + "loss": 0.4916, + "step": 1168 + }, + { + "epoch": 0.14225737754791604, + "grad_norm": 2.3560585975646973, + "learning_rate": 1.986605240240986e-05, + "loss": 0.4252, + "step": 1169 + }, + { + "epoch": 0.14237906905993308, + "grad_norm": 0.6675092577934265, + "learning_rate": 1.9865734005745812e-05, + "loss": 0.4759, + "step": 1170 + }, + { + "epoch": 0.1425007605719501, + "grad_norm": 1.4101814031600952, + "learning_rate": 1.986541523367016e-05, + "loss": 0.494, + "step": 1171 + }, + { + "epoch": 0.14262245208396715, + "grad_norm": 0.9319317936897278, + "learning_rate": 1.986509608619504e-05, + "loss": 0.4938, + "step": 1172 + }, + { + "epoch": 0.14274414359598417, + "grad_norm": 1.551396369934082, + "learning_rate": 1.9864776563332598e-05, + "loss": 0.4423, + "step": 1173 + }, + { + "epoch": 0.14286583510800122, + "grad_norm": 0.7158803939819336, + "learning_rate": 1.9864456665094985e-05, + "loss": 0.4823, + "step": 1174 + }, + { + "epoch": 0.14298752662001826, + "grad_norm": 1.0481125116348267, + "learning_rate": 1.9864136391494376e-05, + "loss": 0.4491, + "step": 1175 + }, + { + "epoch": 0.14310921813203528, + "grad_norm": 0.7452911734580994, + "learning_rate": 1.9863815742542965e-05, + "loss": 0.4208, + "step": 1176 + }, + { + "epoch": 0.14323090964405233, + "grad_norm": 2.0615103244781494, + "learning_rate": 1.9863494718252945e-05, + "loss": 0.5336, + "step": 1177 + }, + { + "epoch": 0.14335260115606938, + "grad_norm": 1.0810911655426025, + "learning_rate": 1.9863173318636535e-05, + "loss": 0.473, + "step": 1178 + }, + { + "epoch": 0.1434742926680864, + "grad_norm": 0.989427387714386, + "learning_rate": 1.9862851543705965e-05, + "loss": 0.4505, + "step": 1179 + }, + { + "epoch": 0.14359598418010344, + "grad_norm": 1.1737773418426514, + "learning_rate": 1.9862529393473476e-05, + "loss": 0.5425, + "step": 1180 + }, + { + "epoch": 0.14371767569212046, + "grad_norm": 5.367320537567139, + "learning_rate": 1.986220686795133e-05, + "loss": 0.4171, + "step": 1181 + }, + { + "epoch": 0.1438393672041375, + "grad_norm": 2.964874744415283, + "learning_rate": 1.98618839671518e-05, + "loss": 0.5029, + "step": 1182 + }, + { + "epoch": 0.14396105871615456, + "grad_norm": 3.477079153060913, + "learning_rate": 1.9861560691087175e-05, + "loss": 0.4708, + "step": 1183 + }, + { + "epoch": 0.14408275022817157, + "grad_norm": 1.2514410018920898, + "learning_rate": 1.9861237039769752e-05, + "loss": 0.4466, + "step": 1184 + }, + { + "epoch": 0.14420444174018862, + "grad_norm": 2.359375238418579, + "learning_rate": 1.9860913013211848e-05, + "loss": 0.5482, + "step": 1185 + }, + { + "epoch": 0.14432613325220567, + "grad_norm": 2.930695056915283, + "learning_rate": 1.986058861142579e-05, + "loss": 0.4813, + "step": 1186 + }, + { + "epoch": 0.1444478247642227, + "grad_norm": 3.335024356842041, + "learning_rate": 1.9860263834423926e-05, + "loss": 0.5209, + "step": 1187 + }, + { + "epoch": 0.14456951627623973, + "grad_norm": 1.3997037410736084, + "learning_rate": 1.9859938682218615e-05, + "loss": 0.4741, + "step": 1188 + }, + { + "epoch": 0.14469120778825678, + "grad_norm": 0.7062254548072815, + "learning_rate": 1.9859613154822228e-05, + "loss": 0.4747, + "step": 1189 + }, + { + "epoch": 0.1448128993002738, + "grad_norm": 1.2905937433242798, + "learning_rate": 1.985928725224715e-05, + "loss": 0.4199, + "step": 1190 + }, + { + "epoch": 0.14493459081229085, + "grad_norm": 0.667495846748352, + "learning_rate": 1.9858960974505786e-05, + "loss": 0.4732, + "step": 1191 + }, + { + "epoch": 0.14505628232430787, + "grad_norm": 2.2584023475646973, + "learning_rate": 1.9858634321610553e-05, + "loss": 0.4406, + "step": 1192 + }, + { + "epoch": 0.14517797383632491, + "grad_norm": 2.2858972549438477, + "learning_rate": 1.9858307293573873e-05, + "loss": 0.5062, + "step": 1193 + }, + { + "epoch": 0.14529966534834196, + "grad_norm": 2.145503282546997, + "learning_rate": 1.9857979890408198e-05, + "loss": 0.4937, + "step": 1194 + }, + { + "epoch": 0.14542135686035898, + "grad_norm": 1.4335459470748901, + "learning_rate": 1.985765211212598e-05, + "loss": 0.5567, + "step": 1195 + }, + { + "epoch": 0.14554304837237603, + "grad_norm": 1.0593979358673096, + "learning_rate": 1.98573239587397e-05, + "loss": 0.5149, + "step": 1196 + }, + { + "epoch": 0.14566473988439307, + "grad_norm": 2.3031575679779053, + "learning_rate": 1.9856995430261837e-05, + "loss": 0.4734, + "step": 1197 + }, + { + "epoch": 0.1457864313964101, + "grad_norm": 3.046948194503784, + "learning_rate": 1.9856666526704893e-05, + "loss": 0.4595, + "step": 1198 + }, + { + "epoch": 0.14590812290842714, + "grad_norm": 0.8235745429992676, + "learning_rate": 1.9856337248081388e-05, + "loss": 0.523, + "step": 1199 + }, + { + "epoch": 0.1460298144204442, + "grad_norm": 0.8751189112663269, + "learning_rate": 1.9856007594403846e-05, + "loss": 0.51, + "step": 1200 + }, + { + "epoch": 0.1461515059324612, + "grad_norm": 1.4345073699951172, + "learning_rate": 1.985567756568482e-05, + "loss": 0.5157, + "step": 1201 + }, + { + "epoch": 0.14627319744447825, + "grad_norm": 0.7432553172111511, + "learning_rate": 1.9855347161936858e-05, + "loss": 0.4679, + "step": 1202 + }, + { + "epoch": 0.14639488895649527, + "grad_norm": 3.2682695388793945, + "learning_rate": 1.9855016383172538e-05, + "loss": 0.5176, + "step": 1203 + }, + { + "epoch": 0.14651658046851232, + "grad_norm": 1.7866331338882446, + "learning_rate": 1.9854685229404444e-05, + "loss": 0.5018, + "step": 1204 + }, + { + "epoch": 0.14663827198052937, + "grad_norm": 0.6467131972312927, + "learning_rate": 1.9854353700645185e-05, + "loss": 0.4415, + "step": 1205 + }, + { + "epoch": 0.14675996349254639, + "grad_norm": 0.9730408787727356, + "learning_rate": 1.9854021796907364e-05, + "loss": 0.4247, + "step": 1206 + }, + { + "epoch": 0.14688165500456343, + "grad_norm": 0.8538654446601868, + "learning_rate": 1.985368951820362e-05, + "loss": 0.4538, + "step": 1207 + }, + { + "epoch": 0.14700334651658048, + "grad_norm": 0.6595228314399719, + "learning_rate": 1.9853356864546595e-05, + "loss": 0.4631, + "step": 1208 + }, + { + "epoch": 0.1471250380285975, + "grad_norm": 1.1981980800628662, + "learning_rate": 1.9853023835948943e-05, + "loss": 0.4812, + "step": 1209 + }, + { + "epoch": 0.14724672954061455, + "grad_norm": 1.0465261936187744, + "learning_rate": 1.985269043242334e-05, + "loss": 0.4832, + "step": 1210 + }, + { + "epoch": 0.14736842105263157, + "grad_norm": 0.7678244709968567, + "learning_rate": 1.9852356653982473e-05, + "loss": 0.473, + "step": 1211 + }, + { + "epoch": 0.1474901125646486, + "grad_norm": 0.7757473587989807, + "learning_rate": 1.985202250063904e-05, + "loss": 0.5061, + "step": 1212 + }, + { + "epoch": 0.14761180407666566, + "grad_norm": 1.121665120124817, + "learning_rate": 1.9851687972405763e-05, + "loss": 0.4582, + "step": 1213 + }, + { + "epoch": 0.14773349558868268, + "grad_norm": 1.907209038734436, + "learning_rate": 1.985135306929536e-05, + "loss": 0.44, + "step": 1214 + }, + { + "epoch": 0.14785518710069973, + "grad_norm": 0.8149450421333313, + "learning_rate": 1.985101779132059e-05, + "loss": 0.432, + "step": 1215 + }, + { + "epoch": 0.14797687861271677, + "grad_norm": 3.072385549545288, + "learning_rate": 1.9850682138494193e-05, + "loss": 0.4966, + "step": 1216 + }, + { + "epoch": 0.1480985701247338, + "grad_norm": 2.164401054382324, + "learning_rate": 1.9850346110828957e-05, + "loss": 0.4656, + "step": 1217 + }, + { + "epoch": 0.14822026163675084, + "grad_norm": 0.923026978969574, + "learning_rate": 1.985000970833766e-05, + "loss": 0.4549, + "step": 1218 + }, + { + "epoch": 0.14834195314876789, + "grad_norm": 0.5282090902328491, + "learning_rate": 1.9849672931033107e-05, + "loss": 0.4801, + "step": 1219 + }, + { + "epoch": 0.1484636446607849, + "grad_norm": 1.6033319234848022, + "learning_rate": 1.984933577892811e-05, + "loss": 0.463, + "step": 1220 + }, + { + "epoch": 0.14858533617280195, + "grad_norm": 1.5123202800750732, + "learning_rate": 1.98489982520355e-05, + "loss": 0.4453, + "step": 1221 + }, + { + "epoch": 0.14870702768481897, + "grad_norm": 1.361024022102356, + "learning_rate": 1.984866035036812e-05, + "loss": 0.4721, + "step": 1222 + }, + { + "epoch": 0.14882871919683602, + "grad_norm": 3.041339159011841, + "learning_rate": 1.9848322073938832e-05, + "loss": 0.5307, + "step": 1223 + }, + { + "epoch": 0.14895041070885306, + "grad_norm": 0.6843611598014832, + "learning_rate": 1.98479834227605e-05, + "loss": 0.4554, + "step": 1224 + }, + { + "epoch": 0.14907210222087008, + "grad_norm": 1.0955889225006104, + "learning_rate": 1.984764439684601e-05, + "loss": 0.4861, + "step": 1225 + }, + { + "epoch": 0.14919379373288713, + "grad_norm": 1.1836271286010742, + "learning_rate": 1.9847304996208273e-05, + "loss": 0.5122, + "step": 1226 + }, + { + "epoch": 0.14931548524490418, + "grad_norm": 1.690466046333313, + "learning_rate": 1.9846965220860198e-05, + "loss": 0.5115, + "step": 1227 + }, + { + "epoch": 0.1494371767569212, + "grad_norm": 2.871076822280884, + "learning_rate": 1.9846625070814715e-05, + "loss": 0.4335, + "step": 1228 + }, + { + "epoch": 0.14955886826893824, + "grad_norm": 2.2395992279052734, + "learning_rate": 1.9846284546084766e-05, + "loss": 0.447, + "step": 1229 + }, + { + "epoch": 0.1496805597809553, + "grad_norm": 2.40677547454834, + "learning_rate": 1.984594364668331e-05, + "loss": 0.4331, + "step": 1230 + }, + { + "epoch": 0.1498022512929723, + "grad_norm": 1.767208456993103, + "learning_rate": 1.9845602372623313e-05, + "loss": 0.4661, + "step": 1231 + }, + { + "epoch": 0.14992394280498936, + "grad_norm": 2.165086030960083, + "learning_rate": 1.984526072391777e-05, + "loss": 0.4881, + "step": 1232 + }, + { + "epoch": 0.15004563431700638, + "grad_norm": 2.5997352600097656, + "learning_rate": 1.984491870057968e-05, + "loss": 0.5009, + "step": 1233 + }, + { + "epoch": 0.15016732582902342, + "grad_norm": 3.7121734619140625, + "learning_rate": 1.9844576302622054e-05, + "loss": 0.544, + "step": 1234 + }, + { + "epoch": 0.15028901734104047, + "grad_norm": 0.8754316568374634, + "learning_rate": 1.9844233530057923e-05, + "loss": 0.4954, + "step": 1235 + }, + { + "epoch": 0.1504107088530575, + "grad_norm": 3.065603017807007, + "learning_rate": 1.984389038290033e-05, + "loss": 0.4756, + "step": 1236 + }, + { + "epoch": 0.15053240036507454, + "grad_norm": 1.9599072933197021, + "learning_rate": 1.984354686116233e-05, + "loss": 0.4949, + "step": 1237 + }, + { + "epoch": 0.15065409187709158, + "grad_norm": 5.242061138153076, + "learning_rate": 1.9843202964856998e-05, + "loss": 0.4772, + "step": 1238 + }, + { + "epoch": 0.1507757833891086, + "grad_norm": 1.7279301881790161, + "learning_rate": 1.984285869399742e-05, + "loss": 0.5142, + "step": 1239 + }, + { + "epoch": 0.15089747490112565, + "grad_norm": 3.280184268951416, + "learning_rate": 1.9842514048596696e-05, + "loss": 0.4491, + "step": 1240 + }, + { + "epoch": 0.1510191664131427, + "grad_norm": 3.316697835922241, + "learning_rate": 1.9842169028667935e-05, + "loss": 0.5483, + "step": 1241 + }, + { + "epoch": 0.15114085792515972, + "grad_norm": 3.42592453956604, + "learning_rate": 1.9841823634224274e-05, + "loss": 0.5365, + "step": 1242 + }, + { + "epoch": 0.15126254943717676, + "grad_norm": 1.5819356441497803, + "learning_rate": 1.984147786527885e-05, + "loss": 0.4699, + "step": 1243 + }, + { + "epoch": 0.15138424094919378, + "grad_norm": 2.019106388092041, + "learning_rate": 1.9841131721844825e-05, + "loss": 0.4797, + "step": 1244 + }, + { + "epoch": 0.15150593246121083, + "grad_norm": 0.5561219453811646, + "learning_rate": 1.9840785203935366e-05, + "loss": 0.4535, + "step": 1245 + }, + { + "epoch": 0.15162762397322788, + "grad_norm": 1.6924302577972412, + "learning_rate": 1.9840438311563663e-05, + "loss": 0.4818, + "step": 1246 + }, + { + "epoch": 0.1517493154852449, + "grad_norm": 1.3197678327560425, + "learning_rate": 1.984009104474291e-05, + "loss": 0.4978, + "step": 1247 + }, + { + "epoch": 0.15187100699726194, + "grad_norm": 1.0036183595657349, + "learning_rate": 1.9839743403486326e-05, + "loss": 0.4692, + "step": 1248 + }, + { + "epoch": 0.151992698509279, + "grad_norm": 1.493024230003357, + "learning_rate": 1.9839395387807138e-05, + "loss": 0.4888, + "step": 1249 + }, + { + "epoch": 0.152114390021296, + "grad_norm": 1.1445404291152954, + "learning_rate": 1.983904699771859e-05, + "loss": 0.469, + "step": 1250 + }, + { + "epoch": 0.15223608153331306, + "grad_norm": 1.5749421119689941, + "learning_rate": 1.983869823323394e-05, + "loss": 0.4765, + "step": 1251 + }, + { + "epoch": 0.15235777304533007, + "grad_norm": 1.6942704916000366, + "learning_rate": 1.9838349094366457e-05, + "loss": 0.4437, + "step": 1252 + }, + { + "epoch": 0.15247946455734712, + "grad_norm": 0.5991498827934265, + "learning_rate": 1.9837999581129423e-05, + "loss": 0.466, + "step": 1253 + }, + { + "epoch": 0.15260115606936417, + "grad_norm": 1.3291352987289429, + "learning_rate": 1.9837649693536146e-05, + "loss": 0.4655, + "step": 1254 + }, + { + "epoch": 0.1527228475813812, + "grad_norm": 1.4382669925689697, + "learning_rate": 1.9837299431599934e-05, + "loss": 0.476, + "step": 1255 + }, + { + "epoch": 0.15284453909339824, + "grad_norm": 1.5753225088119507, + "learning_rate": 1.9836948795334113e-05, + "loss": 0.4147, + "step": 1256 + }, + { + "epoch": 0.15296623060541528, + "grad_norm": 3.0628366470336914, + "learning_rate": 1.9836597784752032e-05, + "loss": 0.5525, + "step": 1257 + }, + { + "epoch": 0.1530879221174323, + "grad_norm": 0.7171027064323425, + "learning_rate": 1.9836246399867043e-05, + "loss": 0.4853, + "step": 1258 + }, + { + "epoch": 0.15320961362944935, + "grad_norm": 0.6188491582870483, + "learning_rate": 1.983589464069252e-05, + "loss": 0.4688, + "step": 1259 + }, + { + "epoch": 0.1533313051414664, + "grad_norm": 2.889864444732666, + "learning_rate": 1.9835542507241847e-05, + "loss": 0.468, + "step": 1260 + }, + { + "epoch": 0.15345299665348341, + "grad_norm": 0.6995101571083069, + "learning_rate": 1.9835189999528425e-05, + "loss": 0.5083, + "step": 1261 + }, + { + "epoch": 0.15357468816550046, + "grad_norm": 1.239412784576416, + "learning_rate": 1.9834837117565662e-05, + "loss": 0.4339, + "step": 1262 + }, + { + "epoch": 0.15369637967751748, + "grad_norm": 1.104770541191101, + "learning_rate": 1.9834483861366992e-05, + "loss": 0.4905, + "step": 1263 + }, + { + "epoch": 0.15381807118953453, + "grad_norm": 1.2953804731369019, + "learning_rate": 1.9834130230945853e-05, + "loss": 0.4552, + "step": 1264 + }, + { + "epoch": 0.15393976270155157, + "grad_norm": 3.1955671310424805, + "learning_rate": 1.9833776226315705e-05, + "loss": 0.5419, + "step": 1265 + }, + { + "epoch": 0.1540614542135686, + "grad_norm": 0.6878771781921387, + "learning_rate": 1.9833421847490016e-05, + "loss": 0.4383, + "step": 1266 + }, + { + "epoch": 0.15418314572558564, + "grad_norm": 1.1765056848526, + "learning_rate": 1.983306709448227e-05, + "loss": 0.4682, + "step": 1267 + }, + { + "epoch": 0.1543048372376027, + "grad_norm": 3.464308023452759, + "learning_rate": 1.9832711967305972e-05, + "loss": 0.5212, + "step": 1268 + }, + { + "epoch": 0.1544265287496197, + "grad_norm": 0.6605225205421448, + "learning_rate": 1.9832356465974623e-05, + "loss": 0.4457, + "step": 1269 + }, + { + "epoch": 0.15454822026163675, + "grad_norm": 1.1634217500686646, + "learning_rate": 1.9832000590501764e-05, + "loss": 0.4741, + "step": 1270 + }, + { + "epoch": 0.1546699117736538, + "grad_norm": 0.6154793500900269, + "learning_rate": 1.9831644340900934e-05, + "loss": 0.4729, + "step": 1271 + }, + { + "epoch": 0.15479160328567082, + "grad_norm": 1.2960669994354248, + "learning_rate": 1.9831287717185685e-05, + "loss": 0.4494, + "step": 1272 + }, + { + "epoch": 0.15491329479768787, + "grad_norm": 1.1229777336120605, + "learning_rate": 1.9830930719369587e-05, + "loss": 0.4363, + "step": 1273 + }, + { + "epoch": 0.1550349863097049, + "grad_norm": 1.195603847503662, + "learning_rate": 1.9830573347466226e-05, + "loss": 0.4574, + "step": 1274 + }, + { + "epoch": 0.15515667782172193, + "grad_norm": 2.0545639991760254, + "learning_rate": 1.9830215601489202e-05, + "loss": 0.485, + "step": 1275 + }, + { + "epoch": 0.15527836933373898, + "grad_norm": 2.203857660293579, + "learning_rate": 1.9829857481452125e-05, + "loss": 0.5288, + "step": 1276 + }, + { + "epoch": 0.155400060845756, + "grad_norm": 1.737296462059021, + "learning_rate": 1.9829498987368626e-05, + "loss": 0.4403, + "step": 1277 + }, + { + "epoch": 0.15552175235777305, + "grad_norm": 2.9370386600494385, + "learning_rate": 1.9829140119252345e-05, + "loss": 0.4675, + "step": 1278 + }, + { + "epoch": 0.1556434438697901, + "grad_norm": 0.869575560092926, + "learning_rate": 1.9828780877116936e-05, + "loss": 0.5189, + "step": 1279 + }, + { + "epoch": 0.1557651353818071, + "grad_norm": 1.3392378091812134, + "learning_rate": 1.982842126097607e-05, + "loss": 0.4932, + "step": 1280 + }, + { + "epoch": 0.15588682689382416, + "grad_norm": 0.5321990847587585, + "learning_rate": 1.9828061270843434e-05, + "loss": 0.4607, + "step": 1281 + }, + { + "epoch": 0.15600851840584118, + "grad_norm": 2.7002933025360107, + "learning_rate": 1.982770090673272e-05, + "loss": 0.4551, + "step": 1282 + }, + { + "epoch": 0.15613020991785823, + "grad_norm": 1.1558703184127808, + "learning_rate": 1.9827340168657648e-05, + "loss": 0.4484, + "step": 1283 + }, + { + "epoch": 0.15625190142987527, + "grad_norm": 1.924221396446228, + "learning_rate": 1.982697905663194e-05, + "loss": 0.5133, + "step": 1284 + }, + { + "epoch": 0.1563735929418923, + "grad_norm": 0.8073462247848511, + "learning_rate": 1.9826617570669336e-05, + "loss": 0.4456, + "step": 1285 + }, + { + "epoch": 0.15649528445390934, + "grad_norm": 0.9620110988616943, + "learning_rate": 1.9826255710783595e-05, + "loss": 0.482, + "step": 1286 + }, + { + "epoch": 0.15661697596592639, + "grad_norm": 1.0379416942596436, + "learning_rate": 1.9825893476988485e-05, + "loss": 0.4754, + "step": 1287 + }, + { + "epoch": 0.1567386674779434, + "grad_norm": 1.1025772094726562, + "learning_rate": 1.9825530869297788e-05, + "loss": 0.4721, + "step": 1288 + }, + { + "epoch": 0.15686035898996045, + "grad_norm": 0.9665660858154297, + "learning_rate": 1.9825167887725305e-05, + "loss": 0.4766, + "step": 1289 + }, + { + "epoch": 0.1569820505019775, + "grad_norm": 1.3248168230056763, + "learning_rate": 1.9824804532284846e-05, + "loss": 0.4727, + "step": 1290 + }, + { + "epoch": 0.15710374201399452, + "grad_norm": 0.65620356798172, + "learning_rate": 1.982444080299024e-05, + "loss": 0.4748, + "step": 1291 + }, + { + "epoch": 0.15722543352601157, + "grad_norm": 0.9081881046295166, + "learning_rate": 1.9824076699855324e-05, + "loss": 0.5071, + "step": 1292 + }, + { + "epoch": 0.15734712503802858, + "grad_norm": 1.0297307968139648, + "learning_rate": 1.982371222289396e-05, + "loss": 0.5102, + "step": 1293 + }, + { + "epoch": 0.15746881655004563, + "grad_norm": 1.0757161378860474, + "learning_rate": 1.9823347372120008e-05, + "loss": 0.5321, + "step": 1294 + }, + { + "epoch": 0.15759050806206268, + "grad_norm": 1.443177580833435, + "learning_rate": 1.9822982147547353e-05, + "loss": 0.4689, + "step": 1295 + }, + { + "epoch": 0.1577121995740797, + "grad_norm": 2.8265788555145264, + "learning_rate": 1.9822616549189898e-05, + "loss": 0.4059, + "step": 1296 + }, + { + "epoch": 0.15783389108609674, + "grad_norm": 1.7425169944763184, + "learning_rate": 1.982225057706155e-05, + "loss": 0.494, + "step": 1297 + }, + { + "epoch": 0.1579555825981138, + "grad_norm": 0.5316634774208069, + "learning_rate": 1.9821884231176237e-05, + "loss": 0.4543, + "step": 1298 + }, + { + "epoch": 0.1580772741101308, + "grad_norm": 4.981151580810547, + "learning_rate": 1.98215175115479e-05, + "loss": 0.5478, + "step": 1299 + }, + { + "epoch": 0.15819896562214786, + "grad_norm": 1.6411571502685547, + "learning_rate": 1.9821150418190492e-05, + "loss": 0.3831, + "step": 1300 + }, + { + "epoch": 0.1583206571341649, + "grad_norm": 1.6752369403839111, + "learning_rate": 1.982078295111798e-05, + "loss": 0.4786, + "step": 1301 + }, + { + "epoch": 0.15844234864618192, + "grad_norm": 3.5548789501190186, + "learning_rate": 1.982041511034435e-05, + "loss": 0.552, + "step": 1302 + }, + { + "epoch": 0.15856404015819897, + "grad_norm": 0.6623461842536926, + "learning_rate": 1.9820046895883596e-05, + "loss": 0.4651, + "step": 1303 + }, + { + "epoch": 0.158685731670216, + "grad_norm": 0.5890607833862305, + "learning_rate": 1.9819678307749735e-05, + "loss": 0.4894, + "step": 1304 + }, + { + "epoch": 0.15880742318223304, + "grad_norm": 1.6179745197296143, + "learning_rate": 1.981930934595679e-05, + "loss": 0.4875, + "step": 1305 + }, + { + "epoch": 0.15892911469425008, + "grad_norm": 3.996370792388916, + "learning_rate": 1.9818940010518798e-05, + "loss": 0.4228, + "step": 1306 + }, + { + "epoch": 0.1590508062062671, + "grad_norm": 1.6369858980178833, + "learning_rate": 1.9818570301449812e-05, + "loss": 0.4584, + "step": 1307 + }, + { + "epoch": 0.15917249771828415, + "grad_norm": 2.4803812503814697, + "learning_rate": 1.98182002187639e-05, + "loss": 0.5223, + "step": 1308 + }, + { + "epoch": 0.1592941892303012, + "grad_norm": 1.377143383026123, + "learning_rate": 1.9817829762475154e-05, + "loss": 0.4676, + "step": 1309 + }, + { + "epoch": 0.15941588074231822, + "grad_norm": 1.393758773803711, + "learning_rate": 1.981745893259766e-05, + "loss": 0.4693, + "step": 1310 + }, + { + "epoch": 0.15953757225433526, + "grad_norm": 1.0268025398254395, + "learning_rate": 1.9817087729145532e-05, + "loss": 0.4291, + "step": 1311 + }, + { + "epoch": 0.1596592637663523, + "grad_norm": 4.2349348068237305, + "learning_rate": 1.9816716152132897e-05, + "loss": 0.552, + "step": 1312 + }, + { + "epoch": 0.15978095527836933, + "grad_norm": 0.6407625675201416, + "learning_rate": 1.9816344201573895e-05, + "loss": 0.4616, + "step": 1313 + }, + { + "epoch": 0.15990264679038638, + "grad_norm": 0.6401371359825134, + "learning_rate": 1.9815971877482676e-05, + "loss": 0.4627, + "step": 1314 + }, + { + "epoch": 0.1600243383024034, + "grad_norm": 1.0625510215759277, + "learning_rate": 1.9815599179873407e-05, + "loss": 0.4909, + "step": 1315 + }, + { + "epoch": 0.16014602981442044, + "grad_norm": 1.3133800029754639, + "learning_rate": 1.981522610876027e-05, + "loss": 0.4868, + "step": 1316 + }, + { + "epoch": 0.1602677213264375, + "grad_norm": 0.6777065396308899, + "learning_rate": 1.981485266415747e-05, + "loss": 0.5052, + "step": 1317 + }, + { + "epoch": 0.1603894128384545, + "grad_norm": 1.8892720937728882, + "learning_rate": 1.9814478846079206e-05, + "loss": 0.4298, + "step": 1318 + }, + { + "epoch": 0.16051110435047156, + "grad_norm": 2.186417818069458, + "learning_rate": 1.981410465453971e-05, + "loss": 0.4795, + "step": 1319 + }, + { + "epoch": 0.1606327958624886, + "grad_norm": 2.658979892730713, + "learning_rate": 1.9813730089553217e-05, + "loss": 0.5155, + "step": 1320 + }, + { + "epoch": 0.16075448737450562, + "grad_norm": 1.0589911937713623, + "learning_rate": 1.9813355151133977e-05, + "loss": 0.4451, + "step": 1321 + }, + { + "epoch": 0.16087617888652267, + "grad_norm": 1.619744062423706, + "learning_rate": 1.9812979839296264e-05, + "loss": 0.4959, + "step": 1322 + }, + { + "epoch": 0.1609978703985397, + "grad_norm": 0.577512800693512, + "learning_rate": 1.981260415405436e-05, + "loss": 0.4367, + "step": 1323 + }, + { + "epoch": 0.16111956191055674, + "grad_norm": 3.114175319671631, + "learning_rate": 1.981222809542255e-05, + "loss": 0.5297, + "step": 1324 + }, + { + "epoch": 0.16124125342257378, + "grad_norm": 0.6847550868988037, + "learning_rate": 1.9811851663415153e-05, + "loss": 0.5072, + "step": 1325 + }, + { + "epoch": 0.1613629449345908, + "grad_norm": 0.6069439649581909, + "learning_rate": 1.981147485804649e-05, + "loss": 0.5412, + "step": 1326 + }, + { + "epoch": 0.16148463644660785, + "grad_norm": 2.925081729888916, + "learning_rate": 1.9811097679330902e-05, + "loss": 0.48, + "step": 1327 + }, + { + "epoch": 0.1616063279586249, + "grad_norm": 4.44187068939209, + "learning_rate": 1.981072012728274e-05, + "loss": 0.4733, + "step": 1328 + }, + { + "epoch": 0.16172801947064191, + "grad_norm": 3.3925576210021973, + "learning_rate": 1.981034220191637e-05, + "loss": 0.4853, + "step": 1329 + }, + { + "epoch": 0.16184971098265896, + "grad_norm": 3.8604345321655273, + "learning_rate": 1.980996390324617e-05, + "loss": 0.439, + "step": 1330 + }, + { + "epoch": 0.161971402494676, + "grad_norm": 0.7895316481590271, + "learning_rate": 1.980958523128654e-05, + "loss": 0.4518, + "step": 1331 + }, + { + "epoch": 0.16209309400669303, + "grad_norm": 2.4462637901306152, + "learning_rate": 1.9809206186051888e-05, + "loss": 0.3835, + "step": 1332 + }, + { + "epoch": 0.16221478551871008, + "grad_norm": 5.019735336303711, + "learning_rate": 1.9808826767556636e-05, + "loss": 0.5215, + "step": 1333 + }, + { + "epoch": 0.1623364770307271, + "grad_norm": 1.9361854791641235, + "learning_rate": 1.9808446975815224e-05, + "loss": 0.4501, + "step": 1334 + }, + { + "epoch": 0.16245816854274414, + "grad_norm": 2.8780288696289062, + "learning_rate": 1.98080668108421e-05, + "loss": 0.4343, + "step": 1335 + }, + { + "epoch": 0.1625798600547612, + "grad_norm": 3.8520593643188477, + "learning_rate": 1.9807686272651732e-05, + "loss": 0.5221, + "step": 1336 + }, + { + "epoch": 0.1627015515667782, + "grad_norm": 3.5907115936279297, + "learning_rate": 1.9807305361258603e-05, + "loss": 0.4857, + "step": 1337 + }, + { + "epoch": 0.16282324307879525, + "grad_norm": 0.6669024229049683, + "learning_rate": 1.98069240766772e-05, + "loss": 0.4889, + "step": 1338 + }, + { + "epoch": 0.1629449345908123, + "grad_norm": 1.2091970443725586, + "learning_rate": 1.980654241892204e-05, + "loss": 0.5279, + "step": 1339 + }, + { + "epoch": 0.16306662610282932, + "grad_norm": 2.0911097526550293, + "learning_rate": 1.9806160388007644e-05, + "loss": 0.5004, + "step": 1340 + }, + { + "epoch": 0.16318831761484637, + "grad_norm": 2.878972053527832, + "learning_rate": 1.9805777983948545e-05, + "loss": 0.5446, + "step": 1341 + }, + { + "epoch": 0.16331000912686341, + "grad_norm": 4.621723175048828, + "learning_rate": 1.98053952067593e-05, + "loss": 0.5321, + "step": 1342 + }, + { + "epoch": 0.16343170063888043, + "grad_norm": 6.323509216308594, + "learning_rate": 1.980501205645447e-05, + "loss": 0.4841, + "step": 1343 + }, + { + "epoch": 0.16355339215089748, + "grad_norm": 4.261968612670898, + "learning_rate": 1.9804628533048635e-05, + "loss": 0.4904, + "step": 1344 + }, + { + "epoch": 0.1636750836629145, + "grad_norm": 1.5866800546646118, + "learning_rate": 1.980424463655639e-05, + "loss": 0.516, + "step": 1345 + }, + { + "epoch": 0.16379677517493155, + "grad_norm": 1.8813953399658203, + "learning_rate": 1.980386036699234e-05, + "loss": 0.4551, + "step": 1346 + }, + { + "epoch": 0.1639184666869486, + "grad_norm": 1.9495863914489746, + "learning_rate": 1.9803475724371114e-05, + "loss": 0.4733, + "step": 1347 + }, + { + "epoch": 0.1640401581989656, + "grad_norm": 5.486594200134277, + "learning_rate": 1.9803090708707344e-05, + "loss": 0.5705, + "step": 1348 + }, + { + "epoch": 0.16416184971098266, + "grad_norm": 4.218322277069092, + "learning_rate": 1.980270532001568e-05, + "loss": 0.5275, + "step": 1349 + }, + { + "epoch": 0.1642835412229997, + "grad_norm": 2.9367752075195312, + "learning_rate": 1.9802319558310793e-05, + "loss": 0.4456, + "step": 1350 + }, + { + "epoch": 0.16440523273501673, + "grad_norm": 3.1779539585113525, + "learning_rate": 1.980193342360735e-05, + "loss": 0.4785, + "step": 1351 + }, + { + "epoch": 0.16452692424703377, + "grad_norm": 3.3056774139404297, + "learning_rate": 1.9801546915920056e-05, + "loss": 0.5167, + "step": 1352 + }, + { + "epoch": 0.1646486157590508, + "grad_norm": 1.3719897270202637, + "learning_rate": 1.9801160035263615e-05, + "loss": 0.43, + "step": 1353 + }, + { + "epoch": 0.16477030727106784, + "grad_norm": 2.068129539489746, + "learning_rate": 1.9800772781652743e-05, + "loss": 0.4732, + "step": 1354 + }, + { + "epoch": 0.1648919987830849, + "grad_norm": 2.7401654720306396, + "learning_rate": 1.980038515510218e-05, + "loss": 0.4272, + "step": 1355 + }, + { + "epoch": 0.1650136902951019, + "grad_norm": 2.1367151737213135, + "learning_rate": 1.9799997155626677e-05, + "loss": 0.4377, + "step": 1356 + }, + { + "epoch": 0.16513538180711895, + "grad_norm": 2.0781209468841553, + "learning_rate": 1.9799608783241e-05, + "loss": 0.5002, + "step": 1357 + }, + { + "epoch": 0.165257073319136, + "grad_norm": 2.80288028717041, + "learning_rate": 1.9799220037959924e-05, + "loss": 0.4984, + "step": 1358 + }, + { + "epoch": 0.16537876483115302, + "grad_norm": 2.189610719680786, + "learning_rate": 1.979883091979824e-05, + "loss": 0.5228, + "step": 1359 + }, + { + "epoch": 0.16550045634317007, + "grad_norm": 3.734894037246704, + "learning_rate": 1.979844142877076e-05, + "loss": 0.4173, + "step": 1360 + }, + { + "epoch": 0.1656221478551871, + "grad_norm": 1.6671816110610962, + "learning_rate": 1.97980515648923e-05, + "loss": 0.4727, + "step": 1361 + }, + { + "epoch": 0.16574383936720413, + "grad_norm": 0.8954133987426758, + "learning_rate": 1.9797661328177696e-05, + "loss": 0.4993, + "step": 1362 + }, + { + "epoch": 0.16586553087922118, + "grad_norm": 0.8533859848976135, + "learning_rate": 1.9797270718641803e-05, + "loss": 0.4749, + "step": 1363 + }, + { + "epoch": 0.1659872223912382, + "grad_norm": 2.5973927974700928, + "learning_rate": 1.9796879736299476e-05, + "loss": 0.549, + "step": 1364 + }, + { + "epoch": 0.16610891390325525, + "grad_norm": 3.560835838317871, + "learning_rate": 1.9796488381165595e-05, + "loss": 0.3948, + "step": 1365 + }, + { + "epoch": 0.1662306054152723, + "grad_norm": 1.5512914657592773, + "learning_rate": 1.9796096653255056e-05, + "loss": 0.4799, + "step": 1366 + }, + { + "epoch": 0.1663522969272893, + "grad_norm": 0.5944809913635254, + "learning_rate": 1.9795704552582765e-05, + "loss": 0.4685, + "step": 1367 + }, + { + "epoch": 0.16647398843930636, + "grad_norm": 0.7071087956428528, + "learning_rate": 1.9795312079163634e-05, + "loss": 0.4407, + "step": 1368 + }, + { + "epoch": 0.1665956799513234, + "grad_norm": 1.7477627992630005, + "learning_rate": 1.979491923301261e-05, + "loss": 0.4873, + "step": 1369 + }, + { + "epoch": 0.16671737146334042, + "grad_norm": 2.589707136154175, + "learning_rate": 1.979452601414463e-05, + "loss": 0.5397, + "step": 1370 + }, + { + "epoch": 0.16683906297535747, + "grad_norm": 1.9456111192703247, + "learning_rate": 1.9794132422574663e-05, + "loss": 0.4219, + "step": 1371 + }, + { + "epoch": 0.16696075448737452, + "grad_norm": 0.8413524627685547, + "learning_rate": 1.9793738458317683e-05, + "loss": 0.4416, + "step": 1372 + }, + { + "epoch": 0.16708244599939154, + "grad_norm": 1.5911568403244019, + "learning_rate": 1.9793344121388687e-05, + "loss": 0.4726, + "step": 1373 + }, + { + "epoch": 0.16720413751140858, + "grad_norm": 1.8373585939407349, + "learning_rate": 1.979294941180267e-05, + "loss": 0.5415, + "step": 1374 + }, + { + "epoch": 0.1673258290234256, + "grad_norm": 0.7813757658004761, + "learning_rate": 1.9792554329574663e-05, + "loss": 0.4752, + "step": 1375 + }, + { + "epoch": 0.16744752053544265, + "grad_norm": 3.577542781829834, + "learning_rate": 1.979215887471969e-05, + "loss": 0.4165, + "step": 1376 + }, + { + "epoch": 0.1675692120474597, + "grad_norm": 0.7695497870445251, + "learning_rate": 1.9791763047252805e-05, + "loss": 0.475, + "step": 1377 + }, + { + "epoch": 0.16769090355947672, + "grad_norm": 0.779041588306427, + "learning_rate": 1.979136684718907e-05, + "loss": 0.4779, + "step": 1378 + }, + { + "epoch": 0.16781259507149376, + "grad_norm": 0.7756707072257996, + "learning_rate": 1.9790970274543557e-05, + "loss": 0.4676, + "step": 1379 + }, + { + "epoch": 0.1679342865835108, + "grad_norm": 0.7839758992195129, + "learning_rate": 1.979057332933136e-05, + "loss": 0.4524, + "step": 1380 + }, + { + "epoch": 0.16805597809552783, + "grad_norm": 0.6166628003120422, + "learning_rate": 1.9790176011567583e-05, + "loss": 0.4534, + "step": 1381 + }, + { + "epoch": 0.16817766960754488, + "grad_norm": 0.7479401230812073, + "learning_rate": 1.9789778321267343e-05, + "loss": 0.4626, + "step": 1382 + }, + { + "epoch": 0.16829936111956192, + "grad_norm": 0.7742567658424377, + "learning_rate": 1.9789380258445777e-05, + "loss": 0.4857, + "step": 1383 + }, + { + "epoch": 0.16842105263157894, + "grad_norm": 0.9864305853843689, + "learning_rate": 1.9788981823118027e-05, + "loss": 0.5222, + "step": 1384 + }, + { + "epoch": 0.168542744143596, + "grad_norm": 0.8407061100006104, + "learning_rate": 1.9788583015299254e-05, + "loss": 0.4125, + "step": 1385 + }, + { + "epoch": 0.168664435655613, + "grad_norm": 4.476330280303955, + "learning_rate": 1.978818383500464e-05, + "loss": 0.5687, + "step": 1386 + }, + { + "epoch": 0.16878612716763006, + "grad_norm": 0.6254599094390869, + "learning_rate": 1.9787784282249366e-05, + "loss": 0.5055, + "step": 1387 + }, + { + "epoch": 0.1689078186796471, + "grad_norm": 4.242245674133301, + "learning_rate": 1.9787384357048645e-05, + "loss": 0.408, + "step": 1388 + }, + { + "epoch": 0.16902951019166412, + "grad_norm": 0.7404464483261108, + "learning_rate": 1.978698405941769e-05, + "loss": 0.5097, + "step": 1389 + }, + { + "epoch": 0.16915120170368117, + "grad_norm": 3.682858467102051, + "learning_rate": 1.9786583389371733e-05, + "loss": 0.4462, + "step": 1390 + }, + { + "epoch": 0.16927289321569822, + "grad_norm": 0.7396299839019775, + "learning_rate": 1.978618234692602e-05, + "loss": 0.461, + "step": 1391 + }, + { + "epoch": 0.16939458472771524, + "grad_norm": 1.486547827720642, + "learning_rate": 1.9785780932095812e-05, + "loss": 0.4913, + "step": 1392 + }, + { + "epoch": 0.16951627623973228, + "grad_norm": 2.3887839317321777, + "learning_rate": 1.9785379144896385e-05, + "loss": 0.4833, + "step": 1393 + }, + { + "epoch": 0.1696379677517493, + "grad_norm": 3.684333324432373, + "learning_rate": 1.9784976985343028e-05, + "loss": 0.5747, + "step": 1394 + }, + { + "epoch": 0.16975965926376635, + "grad_norm": 1.318081259727478, + "learning_rate": 1.978457445345104e-05, + "loss": 0.5194, + "step": 1395 + }, + { + "epoch": 0.1698813507757834, + "grad_norm": 1.0038377046585083, + "learning_rate": 1.9784171549235743e-05, + "loss": 0.49, + "step": 1396 + }, + { + "epoch": 0.17000304228780042, + "grad_norm": 1.3077162504196167, + "learning_rate": 1.9783768272712467e-05, + "loss": 0.4569, + "step": 1397 + }, + { + "epoch": 0.17012473379981746, + "grad_norm": 4.728083610534668, + "learning_rate": 1.9783364623896557e-05, + "loss": 0.4363, + "step": 1398 + }, + { + "epoch": 0.1702464253118345, + "grad_norm": 0.90293288230896, + "learning_rate": 1.978296060280337e-05, + "loss": 0.5272, + "step": 1399 + }, + { + "epoch": 0.17036811682385153, + "grad_norm": 0.9383813738822937, + "learning_rate": 1.9782556209448285e-05, + "loss": 0.484, + "step": 1400 + }, + { + "epoch": 0.17048980833586858, + "grad_norm": 1.6882658004760742, + "learning_rate": 1.978215144384669e-05, + "loss": 0.4673, + "step": 1401 + }, + { + "epoch": 0.17061149984788562, + "grad_norm": 1.3856250047683716, + "learning_rate": 1.978174630601398e-05, + "loss": 0.4766, + "step": 1402 + }, + { + "epoch": 0.17073319135990264, + "grad_norm": 1.3125702142715454, + "learning_rate": 1.9781340795965577e-05, + "loss": 0.4922, + "step": 1403 + }, + { + "epoch": 0.1708548828719197, + "grad_norm": 2.7786688804626465, + "learning_rate": 1.978093491371691e-05, + "loss": 0.4305, + "step": 1404 + }, + { + "epoch": 0.1709765743839367, + "grad_norm": 1.0468382835388184, + "learning_rate": 1.9780528659283424e-05, + "loss": 0.494, + "step": 1405 + }, + { + "epoch": 0.17109826589595376, + "grad_norm": 0.6543203592300415, + "learning_rate": 1.9780122032680578e-05, + "loss": 0.4446, + "step": 1406 + }, + { + "epoch": 0.1712199574079708, + "grad_norm": 2.7950873374938965, + "learning_rate": 1.9779715033923846e-05, + "loss": 0.5344, + "step": 1407 + }, + { + "epoch": 0.17134164891998782, + "grad_norm": 1.3265191316604614, + "learning_rate": 1.977930766302872e-05, + "loss": 0.5046, + "step": 1408 + }, + { + "epoch": 0.17146334043200487, + "grad_norm": 2.336111068725586, + "learning_rate": 1.9778899920010682e-05, + "loss": 0.5123, + "step": 1409 + }, + { + "epoch": 0.17158503194402192, + "grad_norm": 2.7772269248962402, + "learning_rate": 1.977849180488527e-05, + "loss": 0.4722, + "step": 1410 + }, + { + "epoch": 0.17170672345603893, + "grad_norm": 0.832023561000824, + "learning_rate": 1.9778083317668004e-05, + "loss": 0.5053, + "step": 1411 + }, + { + "epoch": 0.17182841496805598, + "grad_norm": 2.150932550430298, + "learning_rate": 1.9777674458374428e-05, + "loss": 0.4809, + "step": 1412 + }, + { + "epoch": 0.17195010648007303, + "grad_norm": 3.3440325260162354, + "learning_rate": 1.9777265227020096e-05, + "loss": 0.4613, + "step": 1413 + }, + { + "epoch": 0.17207179799209005, + "grad_norm": 3.8201725482940674, + "learning_rate": 1.9776855623620588e-05, + "loss": 0.4687, + "step": 1414 + }, + { + "epoch": 0.1721934895041071, + "grad_norm": 0.9091955423355103, + "learning_rate": 1.9776445648191488e-05, + "loss": 0.5033, + "step": 1415 + }, + { + "epoch": 0.1723151810161241, + "grad_norm": 0.576927125453949, + "learning_rate": 1.977603530074839e-05, + "loss": 0.4989, + "step": 1416 + }, + { + "epoch": 0.17243687252814116, + "grad_norm": 0.6666160821914673, + "learning_rate": 1.9775624581306917e-05, + "loss": 0.4638, + "step": 1417 + }, + { + "epoch": 0.1725585640401582, + "grad_norm": 0.6030458211898804, + "learning_rate": 1.9775213489882693e-05, + "loss": 0.4479, + "step": 1418 + }, + { + "epoch": 0.17268025555217523, + "grad_norm": 2.0674116611480713, + "learning_rate": 1.9774802026491363e-05, + "loss": 0.4862, + "step": 1419 + }, + { + "epoch": 0.17280194706419227, + "grad_norm": 0.6450443863868713, + "learning_rate": 1.977439019114858e-05, + "loss": 0.4664, + "step": 1420 + }, + { + "epoch": 0.17292363857620932, + "grad_norm": 0.700458824634552, + "learning_rate": 1.9773977983870023e-05, + "loss": 0.452, + "step": 1421 + }, + { + "epoch": 0.17304533008822634, + "grad_norm": 1.8581122159957886, + "learning_rate": 1.9773565404671374e-05, + "loss": 0.4482, + "step": 1422 + }, + { + "epoch": 0.1731670216002434, + "grad_norm": 1.016422986984253, + "learning_rate": 1.9773152453568326e-05, + "loss": 0.3997, + "step": 1423 + }, + { + "epoch": 0.1732887131122604, + "grad_norm": 1.2067203521728516, + "learning_rate": 1.9772739130576598e-05, + "loss": 0.4319, + "step": 1424 + }, + { + "epoch": 0.17341040462427745, + "grad_norm": 2.8970839977264404, + "learning_rate": 1.977232543571192e-05, + "loss": 0.4704, + "step": 1425 + }, + { + "epoch": 0.1735320961362945, + "grad_norm": 2.0348024368286133, + "learning_rate": 1.9771911368990032e-05, + "loss": 0.4478, + "step": 1426 + }, + { + "epoch": 0.17365378764831152, + "grad_norm": 1.62767493724823, + "learning_rate": 1.977149693042669e-05, + "loss": 0.4721, + "step": 1427 + }, + { + "epoch": 0.17377547916032857, + "grad_norm": 2.4103634357452393, + "learning_rate": 1.9771082120037662e-05, + "loss": 0.4777, + "step": 1428 + }, + { + "epoch": 0.1738971706723456, + "grad_norm": 1.2694464921951294, + "learning_rate": 1.9770666937838736e-05, + "loss": 0.4837, + "step": 1429 + }, + { + "epoch": 0.17401886218436263, + "grad_norm": 0.7212273478507996, + "learning_rate": 1.9770251383845705e-05, + "loss": 0.4851, + "step": 1430 + }, + { + "epoch": 0.17414055369637968, + "grad_norm": 1.1466623544692993, + "learning_rate": 1.976983545807439e-05, + "loss": 0.5138, + "step": 1431 + }, + { + "epoch": 0.17426224520839673, + "grad_norm": 4.840686321258545, + "learning_rate": 1.976941916054061e-05, + "loss": 0.4672, + "step": 1432 + }, + { + "epoch": 0.17438393672041375, + "grad_norm": 2.6939663887023926, + "learning_rate": 1.9769002491260213e-05, + "loss": 0.4899, + "step": 1433 + }, + { + "epoch": 0.1745056282324308, + "grad_norm": 1.769325613975525, + "learning_rate": 1.976858545024905e-05, + "loss": 0.5061, + "step": 1434 + }, + { + "epoch": 0.1746273197444478, + "grad_norm": 2.664933681488037, + "learning_rate": 1.9768168037522984e-05, + "loss": 0.4518, + "step": 1435 + }, + { + "epoch": 0.17474901125646486, + "grad_norm": 1.2224141359329224, + "learning_rate": 1.9767750253097914e-05, + "loss": 0.4823, + "step": 1436 + }, + { + "epoch": 0.1748707027684819, + "grad_norm": 1.0934165716171265, + "learning_rate": 1.9767332096989725e-05, + "loss": 0.4899, + "step": 1437 + }, + { + "epoch": 0.17499239428049893, + "grad_norm": 4.517724990844727, + "learning_rate": 1.976691356921433e-05, + "loss": 0.5438, + "step": 1438 + }, + { + "epoch": 0.17511408579251597, + "grad_norm": 1.3791165351867676, + "learning_rate": 1.9766494669787662e-05, + "loss": 0.4578, + "step": 1439 + }, + { + "epoch": 0.17523577730453302, + "grad_norm": 1.3613044023513794, + "learning_rate": 1.9766075398725654e-05, + "loss": 0.5075, + "step": 1440 + }, + { + "epoch": 0.17535746881655004, + "grad_norm": 1.1534603834152222, + "learning_rate": 1.976565575604426e-05, + "loss": 0.465, + "step": 1441 + }, + { + "epoch": 0.17547916032856709, + "grad_norm": 2.8964574337005615, + "learning_rate": 1.9765235741759455e-05, + "loss": 0.4515, + "step": 1442 + }, + { + "epoch": 0.17560085184058413, + "grad_norm": 1.6747967004776, + "learning_rate": 1.9764815355887217e-05, + "loss": 0.499, + "step": 1443 + }, + { + "epoch": 0.17572254335260115, + "grad_norm": 2.8970320224761963, + "learning_rate": 1.976439459844354e-05, + "loss": 0.4272, + "step": 1444 + }, + { + "epoch": 0.1758442348646182, + "grad_norm": 2.2143208980560303, + "learning_rate": 1.976397346944444e-05, + "loss": 0.4318, + "step": 1445 + }, + { + "epoch": 0.17596592637663522, + "grad_norm": 2.0721049308776855, + "learning_rate": 1.9763551968905936e-05, + "loss": 0.4664, + "step": 1446 + }, + { + "epoch": 0.17608761788865226, + "grad_norm": 3.9072515964508057, + "learning_rate": 1.976313009684407e-05, + "loss": 0.5073, + "step": 1447 + }, + { + "epoch": 0.1762093094006693, + "grad_norm": 1.4120219945907593, + "learning_rate": 1.9762707853274898e-05, + "loss": 0.4753, + "step": 1448 + }, + { + "epoch": 0.17633100091268633, + "grad_norm": 1.265520691871643, + "learning_rate": 1.9762285238214485e-05, + "loss": 0.4801, + "step": 1449 + }, + { + "epoch": 0.17645269242470338, + "grad_norm": 0.5826008915901184, + "learning_rate": 1.976186225167891e-05, + "loss": 0.4543, + "step": 1450 + }, + { + "epoch": 0.17657438393672042, + "grad_norm": 0.7945842146873474, + "learning_rate": 1.976143889368427e-05, + "loss": 0.4836, + "step": 1451 + }, + { + "epoch": 0.17669607544873744, + "grad_norm": 1.0744837522506714, + "learning_rate": 1.9761015164246675e-05, + "loss": 0.5164, + "step": 1452 + }, + { + "epoch": 0.1768177669607545, + "grad_norm": 0.738902747631073, + "learning_rate": 1.976059106338225e-05, + "loss": 0.5643, + "step": 1453 + }, + { + "epoch": 0.17693945847277154, + "grad_norm": 3.688669204711914, + "learning_rate": 1.976016659110713e-05, + "loss": 0.5167, + "step": 1454 + }, + { + "epoch": 0.17706114998478856, + "grad_norm": 8.816498756408691, + "learning_rate": 1.975974174743747e-05, + "loss": 0.4466, + "step": 1455 + }, + { + "epoch": 0.1771828414968056, + "grad_norm": 6.204555034637451, + "learning_rate": 1.9759316532389436e-05, + "loss": 0.4853, + "step": 1456 + }, + { + "epoch": 0.17730453300882262, + "grad_norm": 3.5571959018707275, + "learning_rate": 1.9758890945979207e-05, + "loss": 0.521, + "step": 1457 + }, + { + "epoch": 0.17742622452083967, + "grad_norm": 0.8225950002670288, + "learning_rate": 1.9758464988222977e-05, + "loss": 0.5098, + "step": 1458 + }, + { + "epoch": 0.17754791603285672, + "grad_norm": 1.2101472616195679, + "learning_rate": 1.9758038659136953e-05, + "loss": 0.5098, + "step": 1459 + }, + { + "epoch": 0.17766960754487374, + "grad_norm": 0.677570641040802, + "learning_rate": 1.9757611958737364e-05, + "loss": 0.4861, + "step": 1460 + }, + { + "epoch": 0.17779129905689078, + "grad_norm": 0.9192119836807251, + "learning_rate": 1.975718488704044e-05, + "loss": 0.4325, + "step": 1461 + }, + { + "epoch": 0.17791299056890783, + "grad_norm": 5.186525344848633, + "learning_rate": 1.9756757444062436e-05, + "loss": 0.5557, + "step": 1462 + }, + { + "epoch": 0.17803468208092485, + "grad_norm": 5.008727073669434, + "learning_rate": 1.9756329629819613e-05, + "loss": 0.5383, + "step": 1463 + }, + { + "epoch": 0.1781563735929419, + "grad_norm": 4.002062797546387, + "learning_rate": 1.9755901444328257e-05, + "loss": 0.4975, + "step": 1464 + }, + { + "epoch": 0.17827806510495892, + "grad_norm": 2.736166477203369, + "learning_rate": 1.9755472887604657e-05, + "loss": 0.4873, + "step": 1465 + }, + { + "epoch": 0.17839975661697596, + "grad_norm": 1.2713899612426758, + "learning_rate": 1.975504395966512e-05, + "loss": 0.4485, + "step": 1466 + }, + { + "epoch": 0.178521448128993, + "grad_norm": 0.8007040023803711, + "learning_rate": 1.975461466052597e-05, + "loss": 0.4454, + "step": 1467 + }, + { + "epoch": 0.17864313964101003, + "grad_norm": 1.5794107913970947, + "learning_rate": 1.975418499020354e-05, + "loss": 0.4887, + "step": 1468 + }, + { + "epoch": 0.17876483115302708, + "grad_norm": 1.300399899482727, + "learning_rate": 1.975375494871418e-05, + "loss": 0.4707, + "step": 1469 + }, + { + "epoch": 0.17888652266504412, + "grad_norm": 0.7974724769592285, + "learning_rate": 1.9753324536074255e-05, + "loss": 0.5093, + "step": 1470 + }, + { + "epoch": 0.17900821417706114, + "grad_norm": 3.367178201675415, + "learning_rate": 1.9752893752300145e-05, + "loss": 0.441, + "step": 1471 + }, + { + "epoch": 0.1791299056890782, + "grad_norm": 0.9021749496459961, + "learning_rate": 1.975246259740824e-05, + "loss": 0.4766, + "step": 1472 + }, + { + "epoch": 0.17925159720109524, + "grad_norm": 0.5949093103408813, + "learning_rate": 1.9752031071414945e-05, + "loss": 0.4384, + "step": 1473 + }, + { + "epoch": 0.17937328871311226, + "grad_norm": 1.8567700386047363, + "learning_rate": 1.9751599174336683e-05, + "loss": 0.4416, + "step": 1474 + }, + { + "epoch": 0.1794949802251293, + "grad_norm": 3.348370313644409, + "learning_rate": 1.9751166906189886e-05, + "loss": 0.5317, + "step": 1475 + }, + { + "epoch": 0.17961667173714632, + "grad_norm": 4.627408504486084, + "learning_rate": 1.9750734266991006e-05, + "loss": 0.5239, + "step": 1476 + }, + { + "epoch": 0.17973836324916337, + "grad_norm": 4.465948104858398, + "learning_rate": 1.975030125675651e-05, + "loss": 0.5472, + "step": 1477 + }, + { + "epoch": 0.17986005476118042, + "grad_norm": 1.0233813524246216, + "learning_rate": 1.9749867875502858e-05, + "loss": 0.4415, + "step": 1478 + }, + { + "epoch": 0.17998174627319743, + "grad_norm": 5.391047954559326, + "learning_rate": 1.974943412324656e-05, + "loss": 0.4094, + "step": 1479 + }, + { + "epoch": 0.18010343778521448, + "grad_norm": 1.2071317434310913, + "learning_rate": 1.9749000000004113e-05, + "loss": 0.5348, + "step": 1480 + }, + { + "epoch": 0.18022512929723153, + "grad_norm": 1.624406337738037, + "learning_rate": 1.9748565505792033e-05, + "loss": 0.5067, + "step": 1481 + }, + { + "epoch": 0.18034682080924855, + "grad_norm": 1.272099494934082, + "learning_rate": 1.9748130640626858e-05, + "loss": 0.5282, + "step": 1482 + }, + { + "epoch": 0.1804685123212656, + "grad_norm": 3.5889365673065186, + "learning_rate": 1.9747695404525138e-05, + "loss": 0.4702, + "step": 1483 + }, + { + "epoch": 0.18059020383328264, + "grad_norm": 4.6624956130981445, + "learning_rate": 1.9747259797503425e-05, + "loss": 0.4418, + "step": 1484 + }, + { + "epoch": 0.18071189534529966, + "grad_norm": 1.1972137689590454, + "learning_rate": 1.974682381957831e-05, + "loss": 0.5065, + "step": 1485 + }, + { + "epoch": 0.1808335868573167, + "grad_norm": 0.8423951268196106, + "learning_rate": 1.9746387470766366e-05, + "loss": 0.4769, + "step": 1486 + }, + { + "epoch": 0.18095527836933373, + "grad_norm": 1.6587998867034912, + "learning_rate": 1.974595075108421e-05, + "loss": 0.4039, + "step": 1487 + }, + { + "epoch": 0.18107696988135077, + "grad_norm": 2.4894673824310303, + "learning_rate": 1.974551366054845e-05, + "loss": 0.5075, + "step": 1488 + }, + { + "epoch": 0.18119866139336782, + "grad_norm": 1.9014842510223389, + "learning_rate": 1.9745076199175724e-05, + "loss": 0.3931, + "step": 1489 + }, + { + "epoch": 0.18132035290538484, + "grad_norm": 2.464437246322632, + "learning_rate": 1.9744638366982684e-05, + "loss": 0.4796, + "step": 1490 + }, + { + "epoch": 0.1814420444174019, + "grad_norm": 6.043072700500488, + "learning_rate": 1.9744200163985975e-05, + "loss": 0.5681, + "step": 1491 + }, + { + "epoch": 0.18156373592941893, + "grad_norm": 1.4362983703613281, + "learning_rate": 1.9743761590202287e-05, + "loss": 0.3742, + "step": 1492 + }, + { + "epoch": 0.18168542744143595, + "grad_norm": 1.9414125680923462, + "learning_rate": 1.9743322645648297e-05, + "loss": 0.4798, + "step": 1493 + }, + { + "epoch": 0.181807118953453, + "grad_norm": 1.6063414812088013, + "learning_rate": 1.9742883330340715e-05, + "loss": 0.3964, + "step": 1494 + }, + { + "epoch": 0.18192881046547002, + "grad_norm": 1.3805633783340454, + "learning_rate": 1.9742443644296255e-05, + "loss": 0.4396, + "step": 1495 + }, + { + "epoch": 0.18205050197748707, + "grad_norm": 2.264557361602783, + "learning_rate": 1.974200358753165e-05, + "loss": 0.4916, + "step": 1496 + }, + { + "epoch": 0.18217219348950411, + "grad_norm": 2.068603038787842, + "learning_rate": 1.9741563160063643e-05, + "loss": 0.4703, + "step": 1497 + }, + { + "epoch": 0.18229388500152113, + "grad_norm": 1.1746764183044434, + "learning_rate": 1.9741122361908992e-05, + "loss": 0.5142, + "step": 1498 + }, + { + "epoch": 0.18241557651353818, + "grad_norm": 1.3434480428695679, + "learning_rate": 1.9740681193084478e-05, + "loss": 0.536, + "step": 1499 + }, + { + "epoch": 0.18253726802555523, + "grad_norm": 1.5712302923202515, + "learning_rate": 1.9740239653606876e-05, + "loss": 0.546, + "step": 1500 + }, + { + "epoch": 0.18265895953757225, + "grad_norm": 5.305239200592041, + "learning_rate": 1.9739797743492997e-05, + "loss": 0.4613, + "step": 1501 + }, + { + "epoch": 0.1827806510495893, + "grad_norm": 1.1593258380889893, + "learning_rate": 1.9739355462759652e-05, + "loss": 0.5387, + "step": 1502 + }, + { + "epoch": 0.18290234256160634, + "grad_norm": 0.6386822462081909, + "learning_rate": 1.9738912811423673e-05, + "loss": 0.5373, + "step": 1503 + }, + { + "epoch": 0.18302403407362336, + "grad_norm": 2.554844617843628, + "learning_rate": 1.9738469789501903e-05, + "loss": 0.4694, + "step": 1504 + }, + { + "epoch": 0.1831457255856404, + "grad_norm": 0.9370599389076233, + "learning_rate": 1.97380263970112e-05, + "loss": 0.5007, + "step": 1505 + }, + { + "epoch": 0.18326741709765743, + "grad_norm": 0.9100454449653625, + "learning_rate": 1.9737582633968436e-05, + "loss": 0.4997, + "step": 1506 + }, + { + "epoch": 0.18338910860967447, + "grad_norm": 1.140694499015808, + "learning_rate": 1.97371385003905e-05, + "loss": 0.4223, + "step": 1507 + }, + { + "epoch": 0.18351080012169152, + "grad_norm": 1.4786837100982666, + "learning_rate": 1.9736693996294286e-05, + "loss": 0.4445, + "step": 1508 + }, + { + "epoch": 0.18363249163370854, + "grad_norm": 3.574382781982422, + "learning_rate": 1.973624912169671e-05, + "loss": 0.4964, + "step": 1509 + }, + { + "epoch": 0.18375418314572559, + "grad_norm": 2.0820257663726807, + "learning_rate": 1.9735803876614705e-05, + "loss": 0.4301, + "step": 1510 + }, + { + "epoch": 0.18387587465774263, + "grad_norm": 2.8352251052856445, + "learning_rate": 1.973535826106521e-05, + "loss": 0.4673, + "step": 1511 + }, + { + "epoch": 0.18399756616975965, + "grad_norm": 1.9511176347732544, + "learning_rate": 1.9734912275065184e-05, + "loss": 0.473, + "step": 1512 + }, + { + "epoch": 0.1841192576817767, + "grad_norm": 1.2392737865447998, + "learning_rate": 1.9734465918631592e-05, + "loss": 0.4755, + "step": 1513 + }, + { + "epoch": 0.18424094919379375, + "grad_norm": 1.0681768655776978, + "learning_rate": 1.9734019191781425e-05, + "loss": 0.4941, + "step": 1514 + }, + { + "epoch": 0.18436264070581077, + "grad_norm": 4.178776741027832, + "learning_rate": 1.9733572094531676e-05, + "loss": 0.4713, + "step": 1515 + }, + { + "epoch": 0.1844843322178278, + "grad_norm": 4.6374311447143555, + "learning_rate": 1.9733124626899365e-05, + "loss": 0.4569, + "step": 1516 + }, + { + "epoch": 0.18460602372984483, + "grad_norm": 4.767979621887207, + "learning_rate": 1.9732676788901512e-05, + "loss": 0.516, + "step": 1517 + }, + { + "epoch": 0.18472771524186188, + "grad_norm": 4.664592266082764, + "learning_rate": 1.9732228580555166e-05, + "loss": 0.5147, + "step": 1518 + }, + { + "epoch": 0.18484940675387893, + "grad_norm": 2.1037893295288086, + "learning_rate": 1.9731780001877376e-05, + "loss": 0.5286, + "step": 1519 + }, + { + "epoch": 0.18497109826589594, + "grad_norm": 3.0732429027557373, + "learning_rate": 1.9731331052885214e-05, + "loss": 0.4729, + "step": 1520 + }, + { + "epoch": 0.185092789777913, + "grad_norm": 0.7659862041473389, + "learning_rate": 1.973088173359576e-05, + "loss": 0.4441, + "step": 1521 + }, + { + "epoch": 0.18521448128993004, + "grad_norm": 3.8449151515960693, + "learning_rate": 1.9730432044026117e-05, + "loss": 0.5271, + "step": 1522 + }, + { + "epoch": 0.18533617280194706, + "grad_norm": 2.932002544403076, + "learning_rate": 1.9729981984193393e-05, + "loss": 0.4535, + "step": 1523 + }, + { + "epoch": 0.1854578643139641, + "grad_norm": 4.661848068237305, + "learning_rate": 1.9729531554114713e-05, + "loss": 0.5529, + "step": 1524 + }, + { + "epoch": 0.18557955582598112, + "grad_norm": 3.4412319660186768, + "learning_rate": 1.9729080753807223e-05, + "loss": 0.5123, + "step": 1525 + }, + { + "epoch": 0.18570124733799817, + "grad_norm": 2.6023199558258057, + "learning_rate": 1.972862958328807e-05, + "loss": 0.4922, + "step": 1526 + }, + { + "epoch": 0.18582293885001522, + "grad_norm": 1.1642584800720215, + "learning_rate": 1.9728178042574422e-05, + "loss": 0.4206, + "step": 1527 + }, + { + "epoch": 0.18594463036203224, + "grad_norm": 1.6560192108154297, + "learning_rate": 1.9727726131683464e-05, + "loss": 0.4345, + "step": 1528 + }, + { + "epoch": 0.18606632187404928, + "grad_norm": 1.3707681894302368, + "learning_rate": 1.9727273850632393e-05, + "loss": 0.5126, + "step": 1529 + }, + { + "epoch": 0.18618801338606633, + "grad_norm": 1.8400931358337402, + "learning_rate": 1.9726821199438417e-05, + "loss": 0.5129, + "step": 1530 + }, + { + "epoch": 0.18630970489808335, + "grad_norm": 4.784855365753174, + "learning_rate": 1.9726368178118758e-05, + "loss": 0.4349, + "step": 1531 + }, + { + "epoch": 0.1864313964101004, + "grad_norm": 1.6330609321594238, + "learning_rate": 1.972591478669066e-05, + "loss": 0.5038, + "step": 1532 + }, + { + "epoch": 0.18655308792211744, + "grad_norm": 2.845757246017456, + "learning_rate": 1.9725461025171372e-05, + "loss": 0.4627, + "step": 1533 + }, + { + "epoch": 0.18667477943413446, + "grad_norm": 2.9922804832458496, + "learning_rate": 1.9725006893578165e-05, + "loss": 0.4335, + "step": 1534 + }, + { + "epoch": 0.1867964709461515, + "grad_norm": 1.7592154741287231, + "learning_rate": 1.972455239192831e-05, + "loss": 0.4252, + "step": 1535 + }, + { + "epoch": 0.18691816245816853, + "grad_norm": 2.723053216934204, + "learning_rate": 1.9724097520239113e-05, + "loss": 0.4712, + "step": 1536 + }, + { + "epoch": 0.18703985397018558, + "grad_norm": 2.3298447132110596, + "learning_rate": 1.9723642278527876e-05, + "loss": 0.4578, + "step": 1537 + }, + { + "epoch": 0.18716154548220262, + "grad_norm": 5.630123138427734, + "learning_rate": 1.9723186666811922e-05, + "loss": 0.5456, + "step": 1538 + }, + { + "epoch": 0.18728323699421964, + "grad_norm": 4.125349998474121, + "learning_rate": 1.9722730685108588e-05, + "loss": 0.4988, + "step": 1539 + }, + { + "epoch": 0.1874049285062367, + "grad_norm": 1.7700765132904053, + "learning_rate": 1.9722274333435233e-05, + "loss": 0.4519, + "step": 1540 + }, + { + "epoch": 0.18752662001825374, + "grad_norm": 1.5369956493377686, + "learning_rate": 1.9721817611809212e-05, + "loss": 0.49, + "step": 1541 + }, + { + "epoch": 0.18764831153027076, + "grad_norm": 0.5952844619750977, + "learning_rate": 1.97213605202479e-05, + "loss": 0.4692, + "step": 1542 + }, + { + "epoch": 0.1877700030422878, + "grad_norm": 2.1867940425872803, + "learning_rate": 1.972090305876871e-05, + "loss": 0.491, + "step": 1543 + }, + { + "epoch": 0.18789169455430485, + "grad_norm": 2.9622294902801514, + "learning_rate": 1.9720445227389032e-05, + "loss": 0.4972, + "step": 1544 + }, + { + "epoch": 0.18801338606632187, + "grad_norm": 3.5816261768341064, + "learning_rate": 1.9719987026126296e-05, + "loss": 0.5116, + "step": 1545 + }, + { + "epoch": 0.18813507757833892, + "grad_norm": 2.4237194061279297, + "learning_rate": 1.971952845499793e-05, + "loss": 0.4703, + "step": 1546 + }, + { + "epoch": 0.18825676909035594, + "grad_norm": 2.7580742835998535, + "learning_rate": 1.9719069514021393e-05, + "loss": 0.5255, + "step": 1547 + }, + { + "epoch": 0.18837846060237298, + "grad_norm": 2.2957465648651123, + "learning_rate": 1.9718610203214136e-05, + "loss": 0.4359, + "step": 1548 + }, + { + "epoch": 0.18850015211439003, + "grad_norm": 2.8581619262695312, + "learning_rate": 1.971815052259365e-05, + "loss": 0.5241, + "step": 1549 + }, + { + "epoch": 0.18862184362640705, + "grad_norm": 2.8525230884552, + "learning_rate": 1.971769047217742e-05, + "loss": 0.5052, + "step": 1550 + }, + { + "epoch": 0.1887435351384241, + "grad_norm": 2.728126049041748, + "learning_rate": 1.9717230051982958e-05, + "loss": 0.459, + "step": 1551 + }, + { + "epoch": 0.18886522665044114, + "grad_norm": 2.577486276626587, + "learning_rate": 1.971676926202777e-05, + "loss": 0.3763, + "step": 1552 + }, + { + "epoch": 0.18898691816245816, + "grad_norm": 0.9827896952629089, + "learning_rate": 1.97163081023294e-05, + "loss": 0.4509, + "step": 1553 + }, + { + "epoch": 0.1891086096744752, + "grad_norm": 5.716380596160889, + "learning_rate": 1.9715846572905402e-05, + "loss": 0.5873, + "step": 1554 + }, + { + "epoch": 0.18923030118649226, + "grad_norm": 0.8581305146217346, + "learning_rate": 1.9715384673773327e-05, + "loss": 0.453, + "step": 1555 + }, + { + "epoch": 0.18935199269850927, + "grad_norm": 1.1005945205688477, + "learning_rate": 1.9714922404950755e-05, + "loss": 0.4697, + "step": 1556 + }, + { + "epoch": 0.18947368421052632, + "grad_norm": 0.757716953754425, + "learning_rate": 1.971445976645528e-05, + "loss": 0.4548, + "step": 1557 + }, + { + "epoch": 0.18959537572254334, + "grad_norm": 1.0333950519561768, + "learning_rate": 1.9713996758304498e-05, + "loss": 0.4893, + "step": 1558 + }, + { + "epoch": 0.1897170672345604, + "grad_norm": 1.0344300270080566, + "learning_rate": 1.9713533380516037e-05, + "loss": 0.4795, + "step": 1559 + }, + { + "epoch": 0.18983875874657744, + "grad_norm": 0.6981343626976013, + "learning_rate": 1.9713069633107524e-05, + "loss": 0.4903, + "step": 1560 + }, + { + "epoch": 0.18996045025859445, + "grad_norm": 0.9314073920249939, + "learning_rate": 1.9712605516096604e-05, + "loss": 0.4881, + "step": 1561 + }, + { + "epoch": 0.1900821417706115, + "grad_norm": 1.9468327760696411, + "learning_rate": 1.971214102950094e-05, + "loss": 0.4537, + "step": 1562 + }, + { + "epoch": 0.19020383328262855, + "grad_norm": 0.5633378028869629, + "learning_rate": 1.971167617333821e-05, + "loss": 0.4775, + "step": 1563 + }, + { + "epoch": 0.19032552479464557, + "grad_norm": 0.9177316427230835, + "learning_rate": 1.9711210947626098e-05, + "loss": 0.5227, + "step": 1564 + }, + { + "epoch": 0.19044721630666261, + "grad_norm": 1.4895813465118408, + "learning_rate": 1.971074535238231e-05, + "loss": 0.4706, + "step": 1565 + }, + { + "epoch": 0.19056890781867963, + "grad_norm": 0.8878472447395325, + "learning_rate": 1.971027938762456e-05, + "loss": 0.4882, + "step": 1566 + }, + { + "epoch": 0.19069059933069668, + "grad_norm": 0.852823793888092, + "learning_rate": 1.9709813053370578e-05, + "loss": 0.4971, + "step": 1567 + }, + { + "epoch": 0.19081229084271373, + "grad_norm": 0.7474746108055115, + "learning_rate": 1.970934634963811e-05, + "loss": 0.517, + "step": 1568 + }, + { + "epoch": 0.19093398235473075, + "grad_norm": 1.4665369987487793, + "learning_rate": 1.970887927644492e-05, + "loss": 0.4324, + "step": 1569 + }, + { + "epoch": 0.1910556738667478, + "grad_norm": 1.3971643447875977, + "learning_rate": 1.9708411833808777e-05, + "loss": 0.4399, + "step": 1570 + }, + { + "epoch": 0.19117736537876484, + "grad_norm": 3.2567062377929688, + "learning_rate": 1.9707944021747468e-05, + "loss": 0.5218, + "step": 1571 + }, + { + "epoch": 0.19129905689078186, + "grad_norm": 1.3860619068145752, + "learning_rate": 1.9707475840278794e-05, + "loss": 0.5077, + "step": 1572 + }, + { + "epoch": 0.1914207484027989, + "grad_norm": 1.0201047658920288, + "learning_rate": 1.970700728942057e-05, + "loss": 0.467, + "step": 1573 + }, + { + "epoch": 0.19154243991481595, + "grad_norm": 2.747863531112671, + "learning_rate": 1.970653836919063e-05, + "loss": 0.5031, + "step": 1574 + }, + { + "epoch": 0.19166413142683297, + "grad_norm": 0.9433050751686096, + "learning_rate": 1.9706069079606812e-05, + "loss": 0.449, + "step": 1575 + }, + { + "epoch": 0.19178582293885002, + "grad_norm": 1.9547761678695679, + "learning_rate": 1.970559942068697e-05, + "loss": 0.4538, + "step": 1576 + }, + { + "epoch": 0.19190751445086704, + "grad_norm": 0.9808923602104187, + "learning_rate": 1.9705129392448987e-05, + "loss": 0.5191, + "step": 1577 + }, + { + "epoch": 0.1920292059628841, + "grad_norm": 2.032186508178711, + "learning_rate": 1.970465899491074e-05, + "loss": 0.4711, + "step": 1578 + }, + { + "epoch": 0.19215089747490113, + "grad_norm": 3.9353396892547607, + "learning_rate": 1.9704188228090127e-05, + "loss": 0.4053, + "step": 1579 + }, + { + "epoch": 0.19227258898691815, + "grad_norm": 1.908689022064209, + "learning_rate": 1.970371709200507e-05, + "loss": 0.546, + "step": 1580 + }, + { + "epoch": 0.1923942804989352, + "grad_norm": 0.7221613526344299, + "learning_rate": 1.9703245586673492e-05, + "loss": 0.4312, + "step": 1581 + }, + { + "epoch": 0.19251597201095225, + "grad_norm": 0.6049261093139648, + "learning_rate": 1.970277371211333e-05, + "loss": 0.4789, + "step": 1582 + }, + { + "epoch": 0.19263766352296927, + "grad_norm": 1.7574793100357056, + "learning_rate": 1.970230146834255e-05, + "loss": 0.4951, + "step": 1583 + }, + { + "epoch": 0.1927593550349863, + "grad_norm": 0.6227784156799316, + "learning_rate": 1.9701828855379113e-05, + "loss": 0.4605, + "step": 1584 + }, + { + "epoch": 0.19288104654700336, + "grad_norm": 2.2516350746154785, + "learning_rate": 1.970135587324101e-05, + "loss": 0.4836, + "step": 1585 + }, + { + "epoch": 0.19300273805902038, + "grad_norm": 1.1869683265686035, + "learning_rate": 1.9700882521946236e-05, + "loss": 0.4411, + "step": 1586 + }, + { + "epoch": 0.19312442957103743, + "grad_norm": 3.2054316997528076, + "learning_rate": 1.9700408801512797e-05, + "loss": 0.5176, + "step": 1587 + }, + { + "epoch": 0.19324612108305445, + "grad_norm": 1.0156232118606567, + "learning_rate": 1.9699934711958728e-05, + "loss": 0.4538, + "step": 1588 + }, + { + "epoch": 0.1933678125950715, + "grad_norm": 1.167963981628418, + "learning_rate": 1.9699460253302066e-05, + "loss": 0.5323, + "step": 1589 + }, + { + "epoch": 0.19348950410708854, + "grad_norm": 0.937066376209259, + "learning_rate": 1.969898542556086e-05, + "loss": 0.467, + "step": 1590 + }, + { + "epoch": 0.19361119561910556, + "grad_norm": 1.9532171487808228, + "learning_rate": 1.969851022875319e-05, + "loss": 0.5079, + "step": 1591 + }, + { + "epoch": 0.1937328871311226, + "grad_norm": 0.8318353891372681, + "learning_rate": 1.9698034662897125e-05, + "loss": 0.4786, + "step": 1592 + }, + { + "epoch": 0.19385457864313965, + "grad_norm": 2.816213846206665, + "learning_rate": 1.969755872801077e-05, + "loss": 0.4129, + "step": 1593 + }, + { + "epoch": 0.19397627015515667, + "grad_norm": 1.4102116823196411, + "learning_rate": 1.969708242411223e-05, + "loss": 0.4369, + "step": 1594 + }, + { + "epoch": 0.19409796166717372, + "grad_norm": 2.5704290866851807, + "learning_rate": 1.9696605751219634e-05, + "loss": 0.4941, + "step": 1595 + }, + { + "epoch": 0.19421965317919074, + "grad_norm": 0.6522196531295776, + "learning_rate": 1.969612870935112e-05, + "loss": 0.4207, + "step": 1596 + }, + { + "epoch": 0.19434134469120778, + "grad_norm": 1.5597845315933228, + "learning_rate": 1.9695651298524834e-05, + "loss": 0.4537, + "step": 1597 + }, + { + "epoch": 0.19446303620322483, + "grad_norm": 4.25393009185791, + "learning_rate": 1.969517351875895e-05, + "loss": 0.5641, + "step": 1598 + }, + { + "epoch": 0.19458472771524185, + "grad_norm": 1.259340524673462, + "learning_rate": 1.9694695370071645e-05, + "loss": 0.5148, + "step": 1599 + }, + { + "epoch": 0.1947064192272589, + "grad_norm": 1.2821242809295654, + "learning_rate": 1.9694216852481113e-05, + "loss": 0.4794, + "step": 1600 + }, + { + "epoch": 0.19482811073927594, + "grad_norm": 1.6673787832260132, + "learning_rate": 1.9693737966005567e-05, + "loss": 0.4423, + "step": 1601 + }, + { + "epoch": 0.19494980225129296, + "grad_norm": 5.536383152008057, + "learning_rate": 1.9693258710663226e-05, + "loss": 0.4358, + "step": 1602 + }, + { + "epoch": 0.19507149376331, + "grad_norm": 2.4902002811431885, + "learning_rate": 1.9692779086472328e-05, + "loss": 0.4815, + "step": 1603 + }, + { + "epoch": 0.19519318527532706, + "grad_norm": 3.3068079948425293, + "learning_rate": 1.969229909345112e-05, + "loss": 0.4255, + "step": 1604 + }, + { + "epoch": 0.19531487678734408, + "grad_norm": 2.6858489513397217, + "learning_rate": 1.969181873161787e-05, + "loss": 0.5322, + "step": 1605 + }, + { + "epoch": 0.19543656829936112, + "grad_norm": 3.3064382076263428, + "learning_rate": 1.9691338000990855e-05, + "loss": 0.5252, + "step": 1606 + }, + { + "epoch": 0.19555825981137814, + "grad_norm": 1.214639663696289, + "learning_rate": 1.9690856901588372e-05, + "loss": 0.4623, + "step": 1607 + }, + { + "epoch": 0.1956799513233952, + "grad_norm": 1.6819217205047607, + "learning_rate": 1.9690375433428723e-05, + "loss": 0.4662, + "step": 1608 + }, + { + "epoch": 0.19580164283541224, + "grad_norm": 0.8032550811767578, + "learning_rate": 1.968989359653023e-05, + "loss": 0.4565, + "step": 1609 + }, + { + "epoch": 0.19592333434742926, + "grad_norm": 1.0182809829711914, + "learning_rate": 1.9689411390911233e-05, + "loss": 0.4792, + "step": 1610 + }, + { + "epoch": 0.1960450258594463, + "grad_norm": 1.019756555557251, + "learning_rate": 1.9688928816590075e-05, + "loss": 0.468, + "step": 1611 + }, + { + "epoch": 0.19616671737146335, + "grad_norm": 3.679610252380371, + "learning_rate": 1.968844587358512e-05, + "loss": 0.4385, + "step": 1612 + }, + { + "epoch": 0.19628840888348037, + "grad_norm": 2.3145744800567627, + "learning_rate": 1.9687962561914743e-05, + "loss": 0.4934, + "step": 1613 + }, + { + "epoch": 0.19641010039549742, + "grad_norm": 1.0022114515304565, + "learning_rate": 1.968747888159734e-05, + "loss": 0.4672, + "step": 1614 + }, + { + "epoch": 0.19653179190751446, + "grad_norm": 0.7019039988517761, + "learning_rate": 1.968699483265131e-05, + "loss": 0.4291, + "step": 1615 + }, + { + "epoch": 0.19665348341953148, + "grad_norm": 3.1726160049438477, + "learning_rate": 1.9686510415095078e-05, + "loss": 0.5213, + "step": 1616 + }, + { + "epoch": 0.19677517493154853, + "grad_norm": 3.2987570762634277, + "learning_rate": 1.968602562894708e-05, + "loss": 0.4844, + "step": 1617 + }, + { + "epoch": 0.19689686644356555, + "grad_norm": 0.9264179468154907, + "learning_rate": 1.968554047422575e-05, + "loss": 0.4127, + "step": 1618 + }, + { + "epoch": 0.1970185579555826, + "grad_norm": 0.6382076740264893, + "learning_rate": 1.968505495094956e-05, + "loss": 0.4601, + "step": 1619 + }, + { + "epoch": 0.19714024946759964, + "grad_norm": 2.624558448791504, + "learning_rate": 1.9684569059136984e-05, + "loss": 0.5142, + "step": 1620 + }, + { + "epoch": 0.19726194097961666, + "grad_norm": 1.2857919931411743, + "learning_rate": 1.9684082798806507e-05, + "loss": 0.494, + "step": 1621 + }, + { + "epoch": 0.1973836324916337, + "grad_norm": 2.6425671577453613, + "learning_rate": 1.9683596169976637e-05, + "loss": 0.5788, + "step": 1622 + }, + { + "epoch": 0.19750532400365076, + "grad_norm": 2.8701817989349365, + "learning_rate": 1.9683109172665887e-05, + "loss": 0.4749, + "step": 1623 + }, + { + "epoch": 0.19762701551566778, + "grad_norm": 4.955239295959473, + "learning_rate": 1.968262180689279e-05, + "loss": 0.4661, + "step": 1624 + }, + { + "epoch": 0.19774870702768482, + "grad_norm": 1.945864200592041, + "learning_rate": 1.968213407267589e-05, + "loss": 0.5084, + "step": 1625 + }, + { + "epoch": 0.19787039853970187, + "grad_norm": 3.634922981262207, + "learning_rate": 1.968164597003375e-05, + "loss": 0.4742, + "step": 1626 + }, + { + "epoch": 0.1979920900517189, + "grad_norm": 3.1284406185150146, + "learning_rate": 1.968115749898494e-05, + "loss": 0.4812, + "step": 1627 + }, + { + "epoch": 0.19811378156373594, + "grad_norm": 0.6137954592704773, + "learning_rate": 1.9680668659548047e-05, + "loss": 0.5057, + "step": 1628 + }, + { + "epoch": 0.19823547307575295, + "grad_norm": 1.2982310056686401, + "learning_rate": 1.9680179451741676e-05, + "loss": 0.4971, + "step": 1629 + }, + { + "epoch": 0.19835716458777, + "grad_norm": 0.7524140477180481, + "learning_rate": 1.967968987558444e-05, + "loss": 0.4631, + "step": 1630 + }, + { + "epoch": 0.19847885609978705, + "grad_norm": 2.648993730545044, + "learning_rate": 1.9679199931094967e-05, + "loss": 0.4921, + "step": 1631 + }, + { + "epoch": 0.19860054761180407, + "grad_norm": 4.057670593261719, + "learning_rate": 1.9678709618291903e-05, + "loss": 0.5464, + "step": 1632 + }, + { + "epoch": 0.19872223912382111, + "grad_norm": 3.1056995391845703, + "learning_rate": 1.96782189371939e-05, + "loss": 0.5447, + "step": 1633 + }, + { + "epoch": 0.19884393063583816, + "grad_norm": 0.6378831267356873, + "learning_rate": 1.9677727887819637e-05, + "loss": 0.526, + "step": 1634 + }, + { + "epoch": 0.19896562214785518, + "grad_norm": 5.221714019775391, + "learning_rate": 1.9677236470187796e-05, + "loss": 0.4275, + "step": 1635 + }, + { + "epoch": 0.19908731365987223, + "grad_norm": 2.322024345397949, + "learning_rate": 1.9676744684317076e-05, + "loss": 0.5101, + "step": 1636 + }, + { + "epoch": 0.19920900517188925, + "grad_norm": 2.1963043212890625, + "learning_rate": 1.9676252530226195e-05, + "loss": 0.4707, + "step": 1637 + }, + { + "epoch": 0.1993306966839063, + "grad_norm": 1.522512674331665, + "learning_rate": 1.967576000793387e-05, + "loss": 0.4794, + "step": 1638 + }, + { + "epoch": 0.19945238819592334, + "grad_norm": 0.6875200867652893, + "learning_rate": 1.9675267117458853e-05, + "loss": 0.5085, + "step": 1639 + }, + { + "epoch": 0.19957407970794036, + "grad_norm": 1.8022245168685913, + "learning_rate": 1.9674773858819895e-05, + "loss": 0.5139, + "step": 1640 + }, + { + "epoch": 0.1996957712199574, + "grad_norm": 0.6288191676139832, + "learning_rate": 1.9674280232035766e-05, + "loss": 0.4801, + "step": 1641 + }, + { + "epoch": 0.19981746273197445, + "grad_norm": 0.6057789325714111, + "learning_rate": 1.9673786237125247e-05, + "loss": 0.4749, + "step": 1642 + }, + { + "epoch": 0.19993915424399147, + "grad_norm": 2.8260412216186523, + "learning_rate": 1.967329187410714e-05, + "loss": 0.5089, + "step": 1643 + }, + { + "epoch": 0.20006084575600852, + "grad_norm": 0.9718367457389832, + "learning_rate": 1.9672797143000254e-05, + "loss": 0.4772, + "step": 1644 + }, + { + "epoch": 0.20018253726802557, + "grad_norm": 2.362091302871704, + "learning_rate": 1.9672302043823418e-05, + "loss": 0.5211, + "step": 1645 + }, + { + "epoch": 0.2003042287800426, + "grad_norm": 2.3104591369628906, + "learning_rate": 1.9671806576595466e-05, + "loss": 0.5554, + "step": 1646 + }, + { + "epoch": 0.20042592029205963, + "grad_norm": 2.9208545684814453, + "learning_rate": 1.9671310741335253e-05, + "loss": 0.4374, + "step": 1647 + }, + { + "epoch": 0.20054761180407665, + "grad_norm": 1.604841709136963, + "learning_rate": 1.967081453806165e-05, + "loss": 0.4621, + "step": 1648 + }, + { + "epoch": 0.2006693033160937, + "grad_norm": 2.000429630279541, + "learning_rate": 1.9670317966793537e-05, + "loss": 0.4759, + "step": 1649 + }, + { + "epoch": 0.20079099482811075, + "grad_norm": 0.6824323534965515, + "learning_rate": 1.9669821027549804e-05, + "loss": 0.4973, + "step": 1650 + }, + { + "epoch": 0.20091268634012777, + "grad_norm": 2.5940921306610107, + "learning_rate": 1.9669323720349373e-05, + "loss": 0.5476, + "step": 1651 + }, + { + "epoch": 0.2010343778521448, + "grad_norm": 0.9436287879943848, + "learning_rate": 1.9668826045211154e-05, + "loss": 0.44, + "step": 1652 + }, + { + "epoch": 0.20115606936416186, + "grad_norm": 0.8677988648414612, + "learning_rate": 1.9668328002154092e-05, + "loss": 0.4688, + "step": 1653 + }, + { + "epoch": 0.20127776087617888, + "grad_norm": 1.9527193307876587, + "learning_rate": 1.966782959119714e-05, + "loss": 0.5002, + "step": 1654 + }, + { + "epoch": 0.20139945238819593, + "grad_norm": 1.1784852743148804, + "learning_rate": 1.9667330812359256e-05, + "loss": 0.4618, + "step": 1655 + }, + { + "epoch": 0.20152114390021297, + "grad_norm": 0.9972782731056213, + "learning_rate": 1.9666831665659426e-05, + "loss": 0.4955, + "step": 1656 + }, + { + "epoch": 0.20164283541223, + "grad_norm": 2.0112106800079346, + "learning_rate": 1.9666332151116644e-05, + "loss": 0.4423, + "step": 1657 + }, + { + "epoch": 0.20176452692424704, + "grad_norm": 3.010773181915283, + "learning_rate": 1.9665832268749915e-05, + "loss": 0.4715, + "step": 1658 + }, + { + "epoch": 0.20188621843626406, + "grad_norm": 2.0638082027435303, + "learning_rate": 1.966533201857826e-05, + "loss": 0.4301, + "step": 1659 + }, + { + "epoch": 0.2020079099482811, + "grad_norm": 1.6525065898895264, + "learning_rate": 1.9664831400620716e-05, + "loss": 0.492, + "step": 1660 + }, + { + "epoch": 0.20212960146029815, + "grad_norm": 1.4906165599822998, + "learning_rate": 1.966433041489633e-05, + "loss": 0.4813, + "step": 1661 + }, + { + "epoch": 0.20225129297231517, + "grad_norm": 0.6232516169548035, + "learning_rate": 1.9663829061424172e-05, + "loss": 0.4963, + "step": 1662 + }, + { + "epoch": 0.20237298448433222, + "grad_norm": 1.5922417640686035, + "learning_rate": 1.966332734022331e-05, + "loss": 0.4668, + "step": 1663 + }, + { + "epoch": 0.20249467599634927, + "grad_norm": 1.6148895025253296, + "learning_rate": 1.9662825251312845e-05, + "loss": 0.4853, + "step": 1664 + }, + { + "epoch": 0.20261636750836629, + "grad_norm": 0.6693671941757202, + "learning_rate": 1.9662322794711875e-05, + "loss": 0.4473, + "step": 1665 + }, + { + "epoch": 0.20273805902038333, + "grad_norm": 2.002835512161255, + "learning_rate": 1.9661819970439526e-05, + "loss": 0.4778, + "step": 1666 + }, + { + "epoch": 0.20285975053240035, + "grad_norm": 0.8164340853691101, + "learning_rate": 1.966131677851493e-05, + "loss": 0.4557, + "step": 1667 + }, + { + "epoch": 0.2029814420444174, + "grad_norm": 0.6746049523353577, + "learning_rate": 1.9660813218957226e-05, + "loss": 0.4741, + "step": 1668 + }, + { + "epoch": 0.20310313355643445, + "grad_norm": 0.8485199213027954, + "learning_rate": 1.9660309291785588e-05, + "loss": 0.47, + "step": 1669 + }, + { + "epoch": 0.20322482506845146, + "grad_norm": 1.3783713579177856, + "learning_rate": 1.965980499701918e-05, + "loss": 0.5104, + "step": 1670 + }, + { + "epoch": 0.2033465165804685, + "grad_norm": 2.0943398475646973, + "learning_rate": 1.96593003346772e-05, + "loss": 0.4353, + "step": 1671 + }, + { + "epoch": 0.20346820809248556, + "grad_norm": 1.8847817182540894, + "learning_rate": 1.965879530477885e-05, + "loss": 0.5095, + "step": 1672 + }, + { + "epoch": 0.20358989960450258, + "grad_norm": 1.30797278881073, + "learning_rate": 1.9658289907343344e-05, + "loss": 0.5466, + "step": 1673 + }, + { + "epoch": 0.20371159111651962, + "grad_norm": 1.2622805833816528, + "learning_rate": 1.9657784142389918e-05, + "loss": 0.4799, + "step": 1674 + }, + { + "epoch": 0.20383328262853667, + "grad_norm": 1.3247990608215332, + "learning_rate": 1.9657278009937813e-05, + "loss": 0.4787, + "step": 1675 + }, + { + "epoch": 0.2039549741405537, + "grad_norm": 3.931656837463379, + "learning_rate": 1.9656771510006288e-05, + "loss": 0.3798, + "step": 1676 + }, + { + "epoch": 0.20407666565257074, + "grad_norm": 1.6144198179244995, + "learning_rate": 1.9656264642614625e-05, + "loss": 0.3948, + "step": 1677 + }, + { + "epoch": 0.20419835716458776, + "grad_norm": 2.988980770111084, + "learning_rate": 1.96557574077821e-05, + "loss": 0.5197, + "step": 1678 + }, + { + "epoch": 0.2043200486766048, + "grad_norm": 4.086143970489502, + "learning_rate": 1.965524980552802e-05, + "loss": 0.506, + "step": 1679 + }, + { + "epoch": 0.20444174018862185, + "grad_norm": 3.4909090995788574, + "learning_rate": 1.96547418358717e-05, + "loss": 0.4732, + "step": 1680 + }, + { + "epoch": 0.20456343170063887, + "grad_norm": 5.454492568969727, + "learning_rate": 1.9654233498832467e-05, + "loss": 0.5418, + "step": 1681 + }, + { + "epoch": 0.20468512321265592, + "grad_norm": 3.660712242126465, + "learning_rate": 1.965372479442967e-05, + "loss": 0.4767, + "step": 1682 + }, + { + "epoch": 0.20480681472467296, + "grad_norm": 0.8374196887016296, + "learning_rate": 1.965321572268266e-05, + "loss": 0.4485, + "step": 1683 + }, + { + "epoch": 0.20492850623668998, + "grad_norm": 0.5816757082939148, + "learning_rate": 1.9652706283610814e-05, + "loss": 0.4607, + "step": 1684 + }, + { + "epoch": 0.20505019774870703, + "grad_norm": 0.9996181726455688, + "learning_rate": 1.965219647723351e-05, + "loss": 0.4796, + "step": 1685 + }, + { + "epoch": 0.20517188926072408, + "grad_norm": 1.0449223518371582, + "learning_rate": 1.9651686303570156e-05, + "loss": 0.509, + "step": 1686 + }, + { + "epoch": 0.2052935807727411, + "grad_norm": 4.236747741699219, + "learning_rate": 1.9651175762640155e-05, + "loss": 0.4712, + "step": 1687 + }, + { + "epoch": 0.20541527228475814, + "grad_norm": 3.6349127292633057, + "learning_rate": 1.965066485446294e-05, + "loss": 0.4773, + "step": 1688 + }, + { + "epoch": 0.20553696379677516, + "grad_norm": 4.4095869064331055, + "learning_rate": 1.9650153579057955e-05, + "loss": 0.4812, + "step": 1689 + }, + { + "epoch": 0.2056586553087922, + "grad_norm": 2.3108582496643066, + "learning_rate": 1.964964193644465e-05, + "loss": 0.4961, + "step": 1690 + }, + { + "epoch": 0.20578034682080926, + "grad_norm": 1.692794680595398, + "learning_rate": 1.964912992664249e-05, + "loss": 0.4767, + "step": 1691 + }, + { + "epoch": 0.20590203833282628, + "grad_norm": 0.6628739237785339, + "learning_rate": 1.9648617549670972e-05, + "loss": 0.4661, + "step": 1692 + }, + { + "epoch": 0.20602372984484332, + "grad_norm": 3.363628387451172, + "learning_rate": 1.964810480554958e-05, + "loss": 0.5465, + "step": 1693 + }, + { + "epoch": 0.20614542135686037, + "grad_norm": 0.7035858035087585, + "learning_rate": 1.9647591694297833e-05, + "loss": 0.4204, + "step": 1694 + }, + { + "epoch": 0.2062671128688774, + "grad_norm": 1.1090821027755737, + "learning_rate": 1.964707821593525e-05, + "loss": 0.4426, + "step": 1695 + }, + { + "epoch": 0.20638880438089444, + "grad_norm": 3.3618927001953125, + "learning_rate": 1.9646564370481373e-05, + "loss": 0.4973, + "step": 1696 + }, + { + "epoch": 0.20651049589291148, + "grad_norm": 3.866363763809204, + "learning_rate": 1.9646050157955752e-05, + "loss": 0.5566, + "step": 1697 + }, + { + "epoch": 0.2066321874049285, + "grad_norm": 2.323979616165161, + "learning_rate": 1.964553557837796e-05, + "loss": 0.5455, + "step": 1698 + }, + { + "epoch": 0.20675387891694555, + "grad_norm": 1.5137630701065063, + "learning_rate": 1.9645020631767574e-05, + "loss": 0.4519, + "step": 1699 + }, + { + "epoch": 0.20687557042896257, + "grad_norm": 1.2780119180679321, + "learning_rate": 1.9644505318144188e-05, + "loss": 0.4665, + "step": 1700 + }, + { + "epoch": 0.20699726194097962, + "grad_norm": 0.6163020730018616, + "learning_rate": 1.9643989637527412e-05, + "loss": 0.4992, + "step": 1701 + }, + { + "epoch": 0.20711895345299666, + "grad_norm": 1.6576118469238281, + "learning_rate": 1.9643473589936867e-05, + "loss": 0.4643, + "step": 1702 + }, + { + "epoch": 0.20724064496501368, + "grad_norm": 0.7050173878669739, + "learning_rate": 1.9642957175392192e-05, + "loss": 0.495, + "step": 1703 + }, + { + "epoch": 0.20736233647703073, + "grad_norm": 1.7582765817642212, + "learning_rate": 1.9642440393913034e-05, + "loss": 0.4285, + "step": 1704 + }, + { + "epoch": 0.20748402798904778, + "grad_norm": 1.6956439018249512, + "learning_rate": 1.9641923245519065e-05, + "loss": 0.4437, + "step": 1705 + }, + { + "epoch": 0.2076057195010648, + "grad_norm": 0.760586142539978, + "learning_rate": 1.9641405730229955e-05, + "loss": 0.4931, + "step": 1706 + }, + { + "epoch": 0.20772741101308184, + "grad_norm": 2.3825411796569824, + "learning_rate": 1.96408878480654e-05, + "loss": 0.452, + "step": 1707 + }, + { + "epoch": 0.20784910252509886, + "grad_norm": 0.6912671327590942, + "learning_rate": 1.9640369599045113e-05, + "loss": 0.4434, + "step": 1708 + }, + { + "epoch": 0.2079707940371159, + "grad_norm": 0.9357388615608215, + "learning_rate": 1.9639850983188802e-05, + "loss": 0.419, + "step": 1709 + }, + { + "epoch": 0.20809248554913296, + "grad_norm": 2.109111785888672, + "learning_rate": 1.963933200051621e-05, + "loss": 0.4641, + "step": 1710 + }, + { + "epoch": 0.20821417706114997, + "grad_norm": 1.1385763883590698, + "learning_rate": 1.9638812651047082e-05, + "loss": 0.4858, + "step": 1711 + }, + { + "epoch": 0.20833586857316702, + "grad_norm": 1.7680236101150513, + "learning_rate": 1.9638292934801183e-05, + "loss": 0.489, + "step": 1712 + }, + { + "epoch": 0.20845756008518407, + "grad_norm": 0.8580129742622375, + "learning_rate": 1.9637772851798287e-05, + "loss": 0.5272, + "step": 1713 + }, + { + "epoch": 0.2085792515972011, + "grad_norm": 3.5318949222564697, + "learning_rate": 1.9637252402058186e-05, + "loss": 0.48, + "step": 1714 + }, + { + "epoch": 0.20870094310921813, + "grad_norm": 4.347428321838379, + "learning_rate": 1.963673158560068e-05, + "loss": 0.4515, + "step": 1715 + }, + { + "epoch": 0.20882263462123518, + "grad_norm": 3.0599441528320312, + "learning_rate": 1.963621040244559e-05, + "loss": 0.524, + "step": 1716 + }, + { + "epoch": 0.2089443261332522, + "grad_norm": 4.8122053146362305, + "learning_rate": 1.963568885261275e-05, + "loss": 0.4874, + "step": 1717 + }, + { + "epoch": 0.20906601764526925, + "grad_norm": 2.8686513900756836, + "learning_rate": 1.9635166936122007e-05, + "loss": 0.4431, + "step": 1718 + }, + { + "epoch": 0.20918770915728627, + "grad_norm": 1.3506346940994263, + "learning_rate": 1.9634644652993217e-05, + "loss": 0.5089, + "step": 1719 + }, + { + "epoch": 0.2093094006693033, + "grad_norm": 0.822596549987793, + "learning_rate": 1.9634122003246253e-05, + "loss": 0.4589, + "step": 1720 + }, + { + "epoch": 0.20943109218132036, + "grad_norm": 1.0705163478851318, + "learning_rate": 1.9633598986901007e-05, + "loss": 0.45, + "step": 1721 + }, + { + "epoch": 0.20955278369333738, + "grad_norm": 2.329230308532715, + "learning_rate": 1.963307560397738e-05, + "loss": 0.4612, + "step": 1722 + }, + { + "epoch": 0.20967447520535443, + "grad_norm": 1.2663108110427856, + "learning_rate": 1.9632551854495285e-05, + "loss": 0.4053, + "step": 1723 + }, + { + "epoch": 0.20979616671737147, + "grad_norm": 2.3283092975616455, + "learning_rate": 1.9632027738474652e-05, + "loss": 0.4691, + "step": 1724 + }, + { + "epoch": 0.2099178582293885, + "grad_norm": 2.6995961666107178, + "learning_rate": 1.963150325593543e-05, + "loss": 0.4785, + "step": 1725 + }, + { + "epoch": 0.21003954974140554, + "grad_norm": 3.0928895473480225, + "learning_rate": 1.963097840689757e-05, + "loss": 0.5069, + "step": 1726 + }, + { + "epoch": 0.2101612412534226, + "grad_norm": 0.7816913723945618, + "learning_rate": 1.9630453191381053e-05, + "loss": 0.4427, + "step": 1727 + }, + { + "epoch": 0.2102829327654396, + "grad_norm": 1.0641828775405884, + "learning_rate": 1.962992760940585e-05, + "loss": 0.5262, + "step": 1728 + }, + { + "epoch": 0.21040462427745665, + "grad_norm": 1.9659558534622192, + "learning_rate": 1.9629401660991974e-05, + "loss": 0.4812, + "step": 1729 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 2.0269155502319336, + "learning_rate": 1.962887534615943e-05, + "loss": 0.4901, + "step": 1730 + }, + { + "epoch": 0.21064800730149072, + "grad_norm": 3.1222195625305176, + "learning_rate": 1.9628348664928247e-05, + "loss": 0.4477, + "step": 1731 + }, + { + "epoch": 0.21076969881350777, + "grad_norm": 2.626588821411133, + "learning_rate": 1.962782161731847e-05, + "loss": 0.4712, + "step": 1732 + }, + { + "epoch": 0.21089139032552479, + "grad_norm": 1.2941805124282837, + "learning_rate": 1.9627294203350152e-05, + "loss": 0.5096, + "step": 1733 + }, + { + "epoch": 0.21101308183754183, + "grad_norm": 0.8076706528663635, + "learning_rate": 1.9626766423043365e-05, + "loss": 0.4525, + "step": 1734 + }, + { + "epoch": 0.21113477334955888, + "grad_norm": 0.6155074834823608, + "learning_rate": 1.9626238276418187e-05, + "loss": 0.4853, + "step": 1735 + }, + { + "epoch": 0.2112564648615759, + "grad_norm": 0.9919223189353943, + "learning_rate": 1.9625709763494715e-05, + "loss": 0.4641, + "step": 1736 + }, + { + "epoch": 0.21137815637359295, + "grad_norm": 3.096661329269409, + "learning_rate": 1.9625180884293064e-05, + "loss": 0.5071, + "step": 1737 + }, + { + "epoch": 0.21149984788560997, + "grad_norm": 4.443472862243652, + "learning_rate": 1.9624651638833358e-05, + "loss": 0.5887, + "step": 1738 + }, + { + "epoch": 0.211621539397627, + "grad_norm": 0.8604888916015625, + "learning_rate": 1.9624122027135735e-05, + "loss": 0.4802, + "step": 1739 + }, + { + "epoch": 0.21174323090964406, + "grad_norm": 3.7411863803863525, + "learning_rate": 1.9623592049220347e-05, + "loss": 0.3979, + "step": 1740 + }, + { + "epoch": 0.21186492242166108, + "grad_norm": 3.657951593399048, + "learning_rate": 1.962306170510736e-05, + "loss": 0.45, + "step": 1741 + }, + { + "epoch": 0.21198661393367813, + "grad_norm": 2.9228360652923584, + "learning_rate": 1.9622530994816963e-05, + "loss": 0.4605, + "step": 1742 + }, + { + "epoch": 0.21210830544569517, + "grad_norm": 1.526663899421692, + "learning_rate": 1.962199991836934e-05, + "loss": 0.4615, + "step": 1743 + }, + { + "epoch": 0.2122299969577122, + "grad_norm": 1.1082109212875366, + "learning_rate": 1.9621468475784703e-05, + "loss": 0.4833, + "step": 1744 + }, + { + "epoch": 0.21235168846972924, + "grad_norm": 2.901003122329712, + "learning_rate": 1.9620936667083277e-05, + "loss": 0.5169, + "step": 1745 + }, + { + "epoch": 0.21247337998174629, + "grad_norm": 5.073161602020264, + "learning_rate": 1.9620404492285293e-05, + "loss": 0.566, + "step": 1746 + }, + { + "epoch": 0.2125950714937633, + "grad_norm": 1.83866286277771, + "learning_rate": 1.961987195141101e-05, + "loss": 0.4691, + "step": 1747 + }, + { + "epoch": 0.21271676300578035, + "grad_norm": 2.80662202835083, + "learning_rate": 1.9619339044480682e-05, + "loss": 0.5147, + "step": 1748 + }, + { + "epoch": 0.21283845451779737, + "grad_norm": 2.4437170028686523, + "learning_rate": 1.9618805771514596e-05, + "loss": 0.4068, + "step": 1749 + }, + { + "epoch": 0.21296014602981442, + "grad_norm": 1.8178179264068604, + "learning_rate": 1.961827213253304e-05, + "loss": 0.4783, + "step": 1750 + }, + { + "epoch": 0.21308183754183146, + "grad_norm": 2.1436269283294678, + "learning_rate": 1.961773812755632e-05, + "loss": 0.5199, + "step": 1751 + }, + { + "epoch": 0.21320352905384848, + "grad_norm": 1.8073639869689941, + "learning_rate": 1.9617203756604757e-05, + "loss": 0.4897, + "step": 1752 + }, + { + "epoch": 0.21332522056586553, + "grad_norm": 0.6793341040611267, + "learning_rate": 1.9616669019698683e-05, + "loss": 0.5321, + "step": 1753 + }, + { + "epoch": 0.21344691207788258, + "grad_norm": 2.9085850715637207, + "learning_rate": 1.961613391685845e-05, + "loss": 0.5059, + "step": 1754 + }, + { + "epoch": 0.2135686035898996, + "grad_norm": 3.8958041667938232, + "learning_rate": 1.9615598448104417e-05, + "loss": 0.415, + "step": 1755 + }, + { + "epoch": 0.21369029510191664, + "grad_norm": 3.782226324081421, + "learning_rate": 1.961506261345696e-05, + "loss": 0.409, + "step": 1756 + }, + { + "epoch": 0.2138119866139337, + "grad_norm": 1.3364371061325073, + "learning_rate": 1.9614526412936464e-05, + "loss": 0.5208, + "step": 1757 + }, + { + "epoch": 0.2139336781259507, + "grad_norm": 1.1603829860687256, + "learning_rate": 1.961398984656334e-05, + "loss": 0.4394, + "step": 1758 + }, + { + "epoch": 0.21405536963796776, + "grad_norm": 1.2047865390777588, + "learning_rate": 1.9613452914358002e-05, + "loss": 0.4675, + "step": 1759 + }, + { + "epoch": 0.21417706114998478, + "grad_norm": 1.6339871883392334, + "learning_rate": 1.9612915616340886e-05, + "loss": 0.4626, + "step": 1760 + }, + { + "epoch": 0.21429875266200182, + "grad_norm": 0.5711697340011597, + "learning_rate": 1.9612377952532428e-05, + "loss": 0.4651, + "step": 1761 + }, + { + "epoch": 0.21442044417401887, + "grad_norm": 0.6917303800582886, + "learning_rate": 1.9611839922953094e-05, + "loss": 0.4676, + "step": 1762 + }, + { + "epoch": 0.2145421356860359, + "grad_norm": 1.1803677082061768, + "learning_rate": 1.9611301527623358e-05, + "loss": 0.4282, + "step": 1763 + }, + { + "epoch": 0.21466382719805294, + "grad_norm": 1.2051255702972412, + "learning_rate": 1.96107627665637e-05, + "loss": 0.4997, + "step": 1764 + }, + { + "epoch": 0.21478551871006998, + "grad_norm": 3.2209393978118896, + "learning_rate": 1.9610223639794628e-05, + "loss": 0.4172, + "step": 1765 + }, + { + "epoch": 0.214907210222087, + "grad_norm": 0.9959712028503418, + "learning_rate": 1.9609684147336654e-05, + "loss": 0.4617, + "step": 1766 + }, + { + "epoch": 0.21502890173410405, + "grad_norm": 0.6463029384613037, + "learning_rate": 1.9609144289210308e-05, + "loss": 0.4563, + "step": 1767 + }, + { + "epoch": 0.2151505932461211, + "grad_norm": 1.8162944316864014, + "learning_rate": 1.960860406543613e-05, + "loss": 0.5229, + "step": 1768 + }, + { + "epoch": 0.21527228475813812, + "grad_norm": 2.373663902282715, + "learning_rate": 1.9608063476034683e-05, + "loss": 0.4438, + "step": 1769 + }, + { + "epoch": 0.21539397627015516, + "grad_norm": 0.7366934418678284, + "learning_rate": 1.960752252102653e-05, + "loss": 0.5095, + "step": 1770 + }, + { + "epoch": 0.21551566778217218, + "grad_norm": 1.5187993049621582, + "learning_rate": 1.9606981200432256e-05, + "loss": 0.4881, + "step": 1771 + }, + { + "epoch": 0.21563735929418923, + "grad_norm": 1.8819949626922607, + "learning_rate": 1.9606439514272465e-05, + "loss": 0.5457, + "step": 1772 + }, + { + "epoch": 0.21575905080620628, + "grad_norm": 1.7443825006484985, + "learning_rate": 1.9605897462567765e-05, + "loss": 0.4856, + "step": 1773 + }, + { + "epoch": 0.2158807423182233, + "grad_norm": 1.0818630456924438, + "learning_rate": 1.9605355045338784e-05, + "loss": 0.4808, + "step": 1774 + }, + { + "epoch": 0.21600243383024034, + "grad_norm": 3.5668601989746094, + "learning_rate": 1.9604812262606162e-05, + "loss": 0.4699, + "step": 1775 + }, + { + "epoch": 0.2161241253422574, + "grad_norm": 1.3467004299163818, + "learning_rate": 1.9604269114390556e-05, + "loss": 0.4751, + "step": 1776 + }, + { + "epoch": 0.2162458168542744, + "grad_norm": 0.5569159388542175, + "learning_rate": 1.9603725600712624e-05, + "loss": 0.4459, + "step": 1777 + }, + { + "epoch": 0.21636750836629146, + "grad_norm": 1.232100486755371, + "learning_rate": 1.9603181721593057e-05, + "loss": 0.4397, + "step": 1778 + }, + { + "epoch": 0.21648919987830847, + "grad_norm": 4.7086944580078125, + "learning_rate": 1.960263747705255e-05, + "loss": 0.5435, + "step": 1779 + }, + { + "epoch": 0.21661089139032552, + "grad_norm": 1.6949118375778198, + "learning_rate": 1.9602092867111807e-05, + "loss": 0.4974, + "step": 1780 + }, + { + "epoch": 0.21673258290234257, + "grad_norm": 2.6949243545532227, + "learning_rate": 1.9601547891791558e-05, + "loss": 0.5137, + "step": 1781 + }, + { + "epoch": 0.2168542744143596, + "grad_norm": 0.7370144724845886, + "learning_rate": 1.9601002551112537e-05, + "loss": 0.4492, + "step": 1782 + }, + { + "epoch": 0.21697596592637663, + "grad_norm": 1.397643804550171, + "learning_rate": 1.9600456845095498e-05, + "loss": 0.4988, + "step": 1783 + }, + { + "epoch": 0.21709765743839368, + "grad_norm": 4.239346981048584, + "learning_rate": 1.9599910773761198e-05, + "loss": 0.4688, + "step": 1784 + }, + { + "epoch": 0.2172193489504107, + "grad_norm": 4.348527431488037, + "learning_rate": 1.9599364337130425e-05, + "loss": 0.4913, + "step": 1785 + }, + { + "epoch": 0.21734104046242775, + "grad_norm": 2.584226131439209, + "learning_rate": 1.9598817535223974e-05, + "loss": 0.5575, + "step": 1786 + }, + { + "epoch": 0.2174627319744448, + "grad_norm": 1.4294018745422363, + "learning_rate": 1.9598270368062642e-05, + "loss": 0.5263, + "step": 1787 + }, + { + "epoch": 0.21758442348646181, + "grad_norm": 6.325389385223389, + "learning_rate": 1.9597722835667256e-05, + "loss": 0.4818, + "step": 1788 + }, + { + "epoch": 0.21770611499847886, + "grad_norm": 2.446532726287842, + "learning_rate": 1.9597174938058653e-05, + "loss": 0.4782, + "step": 1789 + }, + { + "epoch": 0.21782780651049588, + "grad_norm": 3.8551485538482666, + "learning_rate": 1.9596626675257673e-05, + "loss": 0.5752, + "step": 1790 + }, + { + "epoch": 0.21794949802251293, + "grad_norm": 0.8540073037147522, + "learning_rate": 1.959607804728519e-05, + "loss": 0.453, + "step": 1791 + }, + { + "epoch": 0.21807118953452997, + "grad_norm": 0.7740235328674316, + "learning_rate": 1.959552905416207e-05, + "loss": 0.3969, + "step": 1792 + }, + { + "epoch": 0.218192881046547, + "grad_norm": 2.930610179901123, + "learning_rate": 1.9594979695909212e-05, + "loss": 0.509, + "step": 1793 + }, + { + "epoch": 0.21831457255856404, + "grad_norm": 2.813776731491089, + "learning_rate": 1.9594429972547512e-05, + "loss": 0.4784, + "step": 1794 + }, + { + "epoch": 0.2184362640705811, + "grad_norm": 1.513473629951477, + "learning_rate": 1.9593879884097894e-05, + "loss": 0.4409, + "step": 1795 + }, + { + "epoch": 0.2185579555825981, + "grad_norm": 0.9543504118919373, + "learning_rate": 1.9593329430581288e-05, + "loss": 0.385, + "step": 1796 + }, + { + "epoch": 0.21867964709461515, + "grad_norm": 0.7908185720443726, + "learning_rate": 1.959277861201864e-05, + "loss": 0.4647, + "step": 1797 + }, + { + "epoch": 0.2188013386066322, + "grad_norm": 2.3689825534820557, + "learning_rate": 1.9592227428430914e-05, + "loss": 0.5096, + "step": 1798 + }, + { + "epoch": 0.21892303011864922, + "grad_norm": 1.5015161037445068, + "learning_rate": 1.9591675879839074e-05, + "loss": 0.4844, + "step": 1799 + }, + { + "epoch": 0.21904472163066627, + "grad_norm": 3.2143096923828125, + "learning_rate": 1.9591123966264113e-05, + "loss": 0.4031, + "step": 1800 + }, + { + "epoch": 0.2191664131426833, + "grad_norm": 3.81089448928833, + "learning_rate": 1.9590571687727035e-05, + "loss": 0.469, + "step": 1801 + }, + { + "epoch": 0.21928810465470033, + "grad_norm": 4.690742492675781, + "learning_rate": 1.959001904424885e-05, + "loss": 0.45, + "step": 1802 + }, + { + "epoch": 0.21940979616671738, + "grad_norm": 0.7875168919563293, + "learning_rate": 1.9589466035850595e-05, + "loss": 0.4531, + "step": 1803 + }, + { + "epoch": 0.2195314876787344, + "grad_norm": 2.462885856628418, + "learning_rate": 1.9588912662553302e-05, + "loss": 0.5211, + "step": 1804 + }, + { + "epoch": 0.21965317919075145, + "grad_norm": 1.3978126049041748, + "learning_rate": 1.958835892437804e-05, + "loss": 0.463, + "step": 1805 + }, + { + "epoch": 0.2197748707027685, + "grad_norm": 1.172132968902588, + "learning_rate": 1.9587804821345874e-05, + "loss": 0.4236, + "step": 1806 + }, + { + "epoch": 0.2198965622147855, + "grad_norm": 3.6447465419769287, + "learning_rate": 1.9587250353477885e-05, + "loss": 0.5574, + "step": 1807 + }, + { + "epoch": 0.22001825372680256, + "grad_norm": 0.9601985812187195, + "learning_rate": 1.9586695520795178e-05, + "loss": 0.4515, + "step": 1808 + }, + { + "epoch": 0.22013994523881958, + "grad_norm": 1.2912932634353638, + "learning_rate": 1.958614032331886e-05, + "loss": 0.5161, + "step": 1809 + }, + { + "epoch": 0.22026163675083663, + "grad_norm": 0.9475353956222534, + "learning_rate": 1.9585584761070064e-05, + "loss": 0.4697, + "step": 1810 + }, + { + "epoch": 0.22038332826285367, + "grad_norm": 0.9893306493759155, + "learning_rate": 1.9585028834069924e-05, + "loss": 0.5137, + "step": 1811 + }, + { + "epoch": 0.2205050197748707, + "grad_norm": 1.252672553062439, + "learning_rate": 1.9584472542339603e-05, + "loss": 0.4504, + "step": 1812 + }, + { + "epoch": 0.22062671128688774, + "grad_norm": 1.7876701354980469, + "learning_rate": 1.9583915885900255e-05, + "loss": 0.4579, + "step": 1813 + }, + { + "epoch": 0.22074840279890479, + "grad_norm": 2.5286545753479004, + "learning_rate": 1.9583358864773075e-05, + "loss": 0.5526, + "step": 1814 + }, + { + "epoch": 0.2208700943109218, + "grad_norm": 1.6176166534423828, + "learning_rate": 1.9582801478979254e-05, + "loss": 0.4994, + "step": 1815 + }, + { + "epoch": 0.22099178582293885, + "grad_norm": 3.4232685565948486, + "learning_rate": 1.958224372854e-05, + "loss": 0.5623, + "step": 1816 + }, + { + "epoch": 0.2211134773349559, + "grad_norm": 1.9408061504364014, + "learning_rate": 1.9581685613476536e-05, + "loss": 0.4838, + "step": 1817 + }, + { + "epoch": 0.22123516884697292, + "grad_norm": 1.1546335220336914, + "learning_rate": 1.9581127133810103e-05, + "loss": 0.4906, + "step": 1818 + }, + { + "epoch": 0.22135686035898997, + "grad_norm": 0.6846708059310913, + "learning_rate": 1.958056828956195e-05, + "loss": 0.5004, + "step": 1819 + }, + { + "epoch": 0.22147855187100698, + "grad_norm": 3.5709493160247803, + "learning_rate": 1.9580009080753343e-05, + "loss": 0.4759, + "step": 1820 + }, + { + "epoch": 0.22160024338302403, + "grad_norm": 0.5858487486839294, + "learning_rate": 1.957944950740556e-05, + "loss": 0.4857, + "step": 1821 + }, + { + "epoch": 0.22172193489504108, + "grad_norm": 0.6543298363685608, + "learning_rate": 1.9578889569539895e-05, + "loss": 0.4621, + "step": 1822 + }, + { + "epoch": 0.2218436264070581, + "grad_norm": 1.3765203952789307, + "learning_rate": 1.957832926717766e-05, + "loss": 0.4849, + "step": 1823 + }, + { + "epoch": 0.22196531791907514, + "grad_norm": 1.184235692024231, + "learning_rate": 1.9577768600340165e-05, + "loss": 0.4716, + "step": 1824 + }, + { + "epoch": 0.2220870094310922, + "grad_norm": 2.138306140899658, + "learning_rate": 1.957720756904875e-05, + "loss": 0.4938, + "step": 1825 + }, + { + "epoch": 0.2222087009431092, + "grad_norm": 2.1889560222625732, + "learning_rate": 1.9576646173324768e-05, + "loss": 0.4491, + "step": 1826 + }, + { + "epoch": 0.22233039245512626, + "grad_norm": 3.8328733444213867, + "learning_rate": 1.957608441318957e-05, + "loss": 0.6034, + "step": 1827 + }, + { + "epoch": 0.2224520839671433, + "grad_norm": 1.3347678184509277, + "learning_rate": 1.9575522288664542e-05, + "loss": 0.5146, + "step": 1828 + }, + { + "epoch": 0.22257377547916032, + "grad_norm": 0.8727834820747375, + "learning_rate": 1.957495979977107e-05, + "loss": 0.532, + "step": 1829 + }, + { + "epoch": 0.22269546699117737, + "grad_norm": 4.480216979980469, + "learning_rate": 1.957439694653056e-05, + "loss": 0.4885, + "step": 1830 + }, + { + "epoch": 0.2228171585031944, + "grad_norm": 3.6056365966796875, + "learning_rate": 1.9573833728964428e-05, + "loss": 0.5132, + "step": 1831 + }, + { + "epoch": 0.22293885001521144, + "grad_norm": 2.7152721881866455, + "learning_rate": 1.9573270147094104e-05, + "loss": 0.4854, + "step": 1832 + }, + { + "epoch": 0.22306054152722848, + "grad_norm": 1.096818447113037, + "learning_rate": 1.9572706200941036e-05, + "loss": 0.472, + "step": 1833 + }, + { + "epoch": 0.2231822330392455, + "grad_norm": 1.5718345642089844, + "learning_rate": 1.9572141890526684e-05, + "loss": 0.4515, + "step": 1834 + }, + { + "epoch": 0.22330392455126255, + "grad_norm": 1.770283818244934, + "learning_rate": 1.9571577215872518e-05, + "loss": 0.4712, + "step": 1835 + }, + { + "epoch": 0.2234256160632796, + "grad_norm": 1.5038691759109497, + "learning_rate": 1.957101217700003e-05, + "loss": 0.4915, + "step": 1836 + }, + { + "epoch": 0.22354730757529662, + "grad_norm": 0.8830704689025879, + "learning_rate": 1.9570446773930715e-05, + "loss": 0.4567, + "step": 1837 + }, + { + "epoch": 0.22366899908731366, + "grad_norm": 0.9205377697944641, + "learning_rate": 1.956988100668609e-05, + "loss": 0.4732, + "step": 1838 + }, + { + "epoch": 0.2237906905993307, + "grad_norm": 3.763309955596924, + "learning_rate": 1.9569314875287687e-05, + "loss": 0.4171, + "step": 1839 + }, + { + "epoch": 0.22391238211134773, + "grad_norm": 1.2489964962005615, + "learning_rate": 1.956874837975704e-05, + "loss": 0.4366, + "step": 1840 + }, + { + "epoch": 0.22403407362336478, + "grad_norm": 0.7394869327545166, + "learning_rate": 1.9568181520115717e-05, + "loss": 0.4668, + "step": 1841 + }, + { + "epoch": 0.2241557651353818, + "grad_norm": 0.8081273436546326, + "learning_rate": 1.956761429638528e-05, + "loss": 0.5224, + "step": 1842 + }, + { + "epoch": 0.22427745664739884, + "grad_norm": 1.2300883531570435, + "learning_rate": 1.9567046708587313e-05, + "loss": 0.4521, + "step": 1843 + }, + { + "epoch": 0.2243991481594159, + "grad_norm": 1.7750821113586426, + "learning_rate": 1.956647875674342e-05, + "loss": 0.3908, + "step": 1844 + }, + { + "epoch": 0.2245208396714329, + "grad_norm": 1.9537758827209473, + "learning_rate": 1.9565910440875203e-05, + "loss": 0.4675, + "step": 1845 + }, + { + "epoch": 0.22464253118344996, + "grad_norm": 1.7771425247192383, + "learning_rate": 1.95653417610043e-05, + "loss": 0.4941, + "step": 1846 + }, + { + "epoch": 0.224764222695467, + "grad_norm": 4.661746025085449, + "learning_rate": 1.956477271715234e-05, + "loss": 0.5875, + "step": 1847 + }, + { + "epoch": 0.22488591420748402, + "grad_norm": 0.7554677724838257, + "learning_rate": 1.9564203309340982e-05, + "loss": 0.4472, + "step": 1848 + }, + { + "epoch": 0.22500760571950107, + "grad_norm": 0.7332402467727661, + "learning_rate": 1.956363353759189e-05, + "loss": 0.5146, + "step": 1849 + }, + { + "epoch": 0.2251292972315181, + "grad_norm": 0.6872220635414124, + "learning_rate": 1.9563063401926747e-05, + "loss": 0.4928, + "step": 1850 + }, + { + "epoch": 0.22525098874353514, + "grad_norm": 2.8405821323394775, + "learning_rate": 1.9562492902367247e-05, + "loss": 0.4289, + "step": 1851 + }, + { + "epoch": 0.22537268025555218, + "grad_norm": 0.5601333975791931, + "learning_rate": 1.9561922038935096e-05, + "loss": 0.4823, + "step": 1852 + }, + { + "epoch": 0.2254943717675692, + "grad_norm": 1.0897819995880127, + "learning_rate": 1.9561350811652024e-05, + "loss": 0.4858, + "step": 1853 + }, + { + "epoch": 0.22561606327958625, + "grad_norm": 0.6712708473205566, + "learning_rate": 1.956077922053976e-05, + "loss": 0.4965, + "step": 1854 + }, + { + "epoch": 0.2257377547916033, + "grad_norm": 2.067476987838745, + "learning_rate": 1.9560207265620058e-05, + "loss": 0.4242, + "step": 1855 + }, + { + "epoch": 0.22585944630362031, + "grad_norm": 0.992720901966095, + "learning_rate": 1.955963494691468e-05, + "loss": 0.4779, + "step": 1856 + }, + { + "epoch": 0.22598113781563736, + "grad_norm": 2.4270431995391846, + "learning_rate": 1.9559062264445404e-05, + "loss": 0.5383, + "step": 1857 + }, + { + "epoch": 0.2261028293276544, + "grad_norm": 0.9442155957221985, + "learning_rate": 1.9558489218234023e-05, + "loss": 0.4629, + "step": 1858 + }, + { + "epoch": 0.22622452083967143, + "grad_norm": 1.953385829925537, + "learning_rate": 1.9557915808302344e-05, + "loss": 0.4717, + "step": 1859 + }, + { + "epoch": 0.22634621235168847, + "grad_norm": 3.446536064147949, + "learning_rate": 1.9557342034672184e-05, + "loss": 0.5155, + "step": 1860 + }, + { + "epoch": 0.2264679038637055, + "grad_norm": 2.581662654876709, + "learning_rate": 1.9556767897365375e-05, + "loss": 0.3887, + "step": 1861 + }, + { + "epoch": 0.22658959537572254, + "grad_norm": 0.994106113910675, + "learning_rate": 1.955619339640377e-05, + "loss": 0.4577, + "step": 1862 + }, + { + "epoch": 0.2267112868877396, + "grad_norm": 1.1811777353286743, + "learning_rate": 1.9555618531809225e-05, + "loss": 0.4742, + "step": 1863 + }, + { + "epoch": 0.2268329783997566, + "grad_norm": 0.8801113963127136, + "learning_rate": 1.9555043303603614e-05, + "loss": 0.4302, + "step": 1864 + }, + { + "epoch": 0.22695466991177365, + "grad_norm": 1.5900946855545044, + "learning_rate": 1.955446771180883e-05, + "loss": 0.4826, + "step": 1865 + }, + { + "epoch": 0.2270763614237907, + "grad_norm": 4.646539688110352, + "learning_rate": 1.955389175644677e-05, + "loss": 0.5922, + "step": 1866 + }, + { + "epoch": 0.22719805293580772, + "grad_norm": 0.7682361602783203, + "learning_rate": 1.9553315437539354e-05, + "loss": 0.4721, + "step": 1867 + }, + { + "epoch": 0.22731974444782477, + "grad_norm": 0.7815783619880676, + "learning_rate": 1.955273875510851e-05, + "loss": 0.4761, + "step": 1868 + }, + { + "epoch": 0.22744143595984181, + "grad_norm": 2.1399126052856445, + "learning_rate": 1.9552161709176186e-05, + "loss": 0.4418, + "step": 1869 + }, + { + "epoch": 0.22756312747185883, + "grad_norm": 1.1615606546401978, + "learning_rate": 1.9551584299764337e-05, + "loss": 0.4836, + "step": 1870 + }, + { + "epoch": 0.22768481898387588, + "grad_norm": 1.3012791872024536, + "learning_rate": 1.9551006526894937e-05, + "loss": 0.4472, + "step": 1871 + }, + { + "epoch": 0.2278065104958929, + "grad_norm": 1.6008267402648926, + "learning_rate": 1.9550428390589963e-05, + "loss": 0.5028, + "step": 1872 + }, + { + "epoch": 0.22792820200790995, + "grad_norm": 1.3280154466629028, + "learning_rate": 1.9549849890871423e-05, + "loss": 0.5007, + "step": 1873 + }, + { + "epoch": 0.228049893519927, + "grad_norm": 2.204503059387207, + "learning_rate": 1.954927102776133e-05, + "loss": 0.4493, + "step": 1874 + }, + { + "epoch": 0.228171585031944, + "grad_norm": 0.6567058563232422, + "learning_rate": 1.9548691801281706e-05, + "loss": 0.5283, + "step": 1875 + }, + { + "epoch": 0.22829327654396106, + "grad_norm": 0.6983780264854431, + "learning_rate": 1.9548112211454592e-05, + "loss": 0.4797, + "step": 1876 + }, + { + "epoch": 0.2284149680559781, + "grad_norm": 1.5089805126190186, + "learning_rate": 1.954753225830205e-05, + "loss": 0.5219, + "step": 1877 + }, + { + "epoch": 0.22853665956799513, + "grad_norm": 3.4219298362731934, + "learning_rate": 1.954695194184614e-05, + "loss": 0.4544, + "step": 1878 + }, + { + "epoch": 0.22865835108001217, + "grad_norm": 2.7527565956115723, + "learning_rate": 1.954637126210895e-05, + "loss": 0.4123, + "step": 1879 + }, + { + "epoch": 0.2287800425920292, + "grad_norm": 0.8203837275505066, + "learning_rate": 1.954579021911257e-05, + "loss": 0.459, + "step": 1880 + }, + { + "epoch": 0.22890173410404624, + "grad_norm": 0.6884888410568237, + "learning_rate": 1.9545208812879114e-05, + "loss": 0.4456, + "step": 1881 + }, + { + "epoch": 0.2290234256160633, + "grad_norm": 0.9354062676429749, + "learning_rate": 1.9544627043430706e-05, + "loss": 0.4658, + "step": 1882 + }, + { + "epoch": 0.2291451171280803, + "grad_norm": 2.0113630294799805, + "learning_rate": 1.9544044910789485e-05, + "loss": 0.5228, + "step": 1883 + }, + { + "epoch": 0.22926680864009735, + "grad_norm": 1.7827974557876587, + "learning_rate": 1.95434624149776e-05, + "loss": 0.5069, + "step": 1884 + }, + { + "epoch": 0.2293885001521144, + "grad_norm": 1.4230190515518188, + "learning_rate": 1.9542879556017212e-05, + "loss": 0.4563, + "step": 1885 + }, + { + "epoch": 0.22951019166413142, + "grad_norm": 2.9803624153137207, + "learning_rate": 1.9542296333930508e-05, + "loss": 0.5347, + "step": 1886 + }, + { + "epoch": 0.22963188317614847, + "grad_norm": 1.2775452136993408, + "learning_rate": 1.9541712748739675e-05, + "loss": 0.4578, + "step": 1887 + }, + { + "epoch": 0.2297535746881655, + "grad_norm": 2.397963285446167, + "learning_rate": 1.9541128800466923e-05, + "loss": 0.4792, + "step": 1888 + }, + { + "epoch": 0.22987526620018253, + "grad_norm": 2.783512830734253, + "learning_rate": 1.954054448913447e-05, + "loss": 0.4446, + "step": 1889 + }, + { + "epoch": 0.22999695771219958, + "grad_norm": 0.6921996474266052, + "learning_rate": 1.9539959814764553e-05, + "loss": 0.4802, + "step": 1890 + }, + { + "epoch": 0.2301186492242166, + "grad_norm": 2.020805835723877, + "learning_rate": 1.9539374777379416e-05, + "loss": 0.5665, + "step": 1891 + }, + { + "epoch": 0.23024034073623365, + "grad_norm": 1.0037791728973389, + "learning_rate": 1.9538789377001324e-05, + "loss": 0.4838, + "step": 1892 + }, + { + "epoch": 0.2303620322482507, + "grad_norm": 1.7108644247055054, + "learning_rate": 1.9538203613652555e-05, + "loss": 0.48, + "step": 1893 + }, + { + "epoch": 0.2304837237602677, + "grad_norm": 2.821938991546631, + "learning_rate": 1.9537617487355393e-05, + "loss": 0.4187, + "step": 1894 + }, + { + "epoch": 0.23060541527228476, + "grad_norm": 0.7069106101989746, + "learning_rate": 1.9537030998132144e-05, + "loss": 0.453, + "step": 1895 + }, + { + "epoch": 0.2307271067843018, + "grad_norm": 1.8053488731384277, + "learning_rate": 1.953644414600512e-05, + "loss": 0.4792, + "step": 1896 + }, + { + "epoch": 0.23084879829631882, + "grad_norm": 2.3494744300842285, + "learning_rate": 1.9535856930996664e-05, + "loss": 0.3955, + "step": 1897 + }, + { + "epoch": 0.23097048980833587, + "grad_norm": 3.206209421157837, + "learning_rate": 1.953526935312911e-05, + "loss": 0.4974, + "step": 1898 + }, + { + "epoch": 0.23109218132035292, + "grad_norm": 3.2301957607269287, + "learning_rate": 1.9534681412424824e-05, + "loss": 0.5038, + "step": 1899 + }, + { + "epoch": 0.23121387283236994, + "grad_norm": 1.8758232593536377, + "learning_rate": 1.953409310890617e-05, + "loss": 0.5013, + "step": 1900 + }, + { + "epoch": 0.23133556434438698, + "grad_norm": 0.5013694167137146, + "learning_rate": 1.953350444259554e-05, + "loss": 0.4573, + "step": 1901 + }, + { + "epoch": 0.231457255856404, + "grad_norm": 2.4453580379486084, + "learning_rate": 1.9532915413515337e-05, + "loss": 0.5048, + "step": 1902 + }, + { + "epoch": 0.23157894736842105, + "grad_norm": 1.0904576778411865, + "learning_rate": 1.9532326021687967e-05, + "loss": 0.5322, + "step": 1903 + }, + { + "epoch": 0.2317006388804381, + "grad_norm": 3.3153820037841797, + "learning_rate": 1.9531736267135858e-05, + "loss": 0.4781, + "step": 1904 + }, + { + "epoch": 0.23182233039245512, + "grad_norm": 5.554010391235352, + "learning_rate": 1.9531146149881456e-05, + "loss": 0.503, + "step": 1905 + }, + { + "epoch": 0.23194402190447216, + "grad_norm": 7.145920276641846, + "learning_rate": 1.953055566994722e-05, + "loss": 0.5097, + "step": 1906 + }, + { + "epoch": 0.2320657134164892, + "grad_norm": 5.55267333984375, + "learning_rate": 1.952996482735561e-05, + "loss": 0.5018, + "step": 1907 + }, + { + "epoch": 0.23218740492850623, + "grad_norm": 5.191817760467529, + "learning_rate": 1.9529373622129107e-05, + "loss": 0.5197, + "step": 1908 + }, + { + "epoch": 0.23230909644052328, + "grad_norm": 1.8097574710845947, + "learning_rate": 1.952878205429022e-05, + "loss": 0.4605, + "step": 1909 + }, + { + "epoch": 0.2324307879525403, + "grad_norm": 1.145939588546753, + "learning_rate": 1.952819012386145e-05, + "loss": 0.4898, + "step": 1910 + }, + { + "epoch": 0.23255247946455734, + "grad_norm": 1.1284998655319214, + "learning_rate": 1.9527597830865325e-05, + "loss": 0.4932, + "step": 1911 + }, + { + "epoch": 0.2326741709765744, + "grad_norm": 1.0728673934936523, + "learning_rate": 1.952700517532438e-05, + "loss": 0.4367, + "step": 1912 + }, + { + "epoch": 0.2327958624885914, + "grad_norm": 1.984288215637207, + "learning_rate": 1.952641215726117e-05, + "loss": 0.4561, + "step": 1913 + }, + { + "epoch": 0.23291755400060846, + "grad_norm": 2.4089512825012207, + "learning_rate": 1.9525818776698257e-05, + "loss": 0.4925, + "step": 1914 + }, + { + "epoch": 0.2330392455126255, + "grad_norm": 2.184720993041992, + "learning_rate": 1.9525225033658222e-05, + "loss": 0.4942, + "step": 1915 + }, + { + "epoch": 0.23316093702464252, + "grad_norm": 2.7158260345458984, + "learning_rate": 1.952463092816366e-05, + "loss": 0.502, + "step": 1916 + }, + { + "epoch": 0.23328262853665957, + "grad_norm": 0.6456950306892395, + "learning_rate": 1.952403646023717e-05, + "loss": 0.4662, + "step": 1917 + }, + { + "epoch": 0.23340432004867662, + "grad_norm": 1.2357690334320068, + "learning_rate": 1.952344162990139e-05, + "loss": 0.4933, + "step": 1918 + }, + { + "epoch": 0.23352601156069364, + "grad_norm": 2.7533791065216064, + "learning_rate": 1.9522846437178933e-05, + "loss": 0.4568, + "step": 1919 + }, + { + "epoch": 0.23364770307271068, + "grad_norm": 0.5824018120765686, + "learning_rate": 1.9522250882092465e-05, + "loss": 0.4831, + "step": 1920 + }, + { + "epoch": 0.2337693945847277, + "grad_norm": 2.744816780090332, + "learning_rate": 1.952165496466464e-05, + "loss": 0.4405, + "step": 1921 + }, + { + "epoch": 0.23389108609674475, + "grad_norm": 1.5789531469345093, + "learning_rate": 1.9521058684918133e-05, + "loss": 0.4395, + "step": 1922 + }, + { + "epoch": 0.2340127776087618, + "grad_norm": 3.622791051864624, + "learning_rate": 1.9520462042875635e-05, + "loss": 0.5167, + "step": 1923 + }, + { + "epoch": 0.23413446912077882, + "grad_norm": 0.9125657081604004, + "learning_rate": 1.951986503855985e-05, + "loss": 0.4775, + "step": 1924 + }, + { + "epoch": 0.23425616063279586, + "grad_norm": 1.3564062118530273, + "learning_rate": 1.9519267671993498e-05, + "loss": 0.4726, + "step": 1925 + }, + { + "epoch": 0.2343778521448129, + "grad_norm": 1.0720938444137573, + "learning_rate": 1.9518669943199303e-05, + "loss": 0.4711, + "step": 1926 + }, + { + "epoch": 0.23449954365682993, + "grad_norm": 2.363069772720337, + "learning_rate": 1.9518071852200017e-05, + "loss": 0.4189, + "step": 1927 + }, + { + "epoch": 0.23462123516884698, + "grad_norm": 0.6921089887619019, + "learning_rate": 1.9517473399018397e-05, + "loss": 0.4886, + "step": 1928 + }, + { + "epoch": 0.23474292668086402, + "grad_norm": 0.6319681406021118, + "learning_rate": 1.951687458367721e-05, + "loss": 0.5, + "step": 1929 + }, + { + "epoch": 0.23486461819288104, + "grad_norm": 2.0676770210266113, + "learning_rate": 1.951627540619925e-05, + "loss": 0.4935, + "step": 1930 + }, + { + "epoch": 0.2349863097048981, + "grad_norm": 0.8923307061195374, + "learning_rate": 1.951567586660731e-05, + "loss": 0.4506, + "step": 1931 + }, + { + "epoch": 0.2351080012169151, + "grad_norm": 1.6264599561691284, + "learning_rate": 1.951507596492421e-05, + "loss": 0.5304, + "step": 1932 + }, + { + "epoch": 0.23522969272893215, + "grad_norm": 1.3363028764724731, + "learning_rate": 1.9514475701172775e-05, + "loss": 0.5183, + "step": 1933 + }, + { + "epoch": 0.2353513842409492, + "grad_norm": 1.9410772323608398, + "learning_rate": 1.951387507537584e-05, + "loss": 0.4618, + "step": 1934 + }, + { + "epoch": 0.23547307575296622, + "grad_norm": 0.6874043941497803, + "learning_rate": 1.951327408755627e-05, + "loss": 0.4997, + "step": 1935 + }, + { + "epoch": 0.23559476726498327, + "grad_norm": 0.8465669751167297, + "learning_rate": 1.9512672737736932e-05, + "loss": 0.4785, + "step": 1936 + }, + { + "epoch": 0.23571645877700032, + "grad_norm": 0.6066112518310547, + "learning_rate": 1.9512071025940702e-05, + "loss": 0.4777, + "step": 1937 + }, + { + "epoch": 0.23583815028901733, + "grad_norm": 1.6517739295959473, + "learning_rate": 1.9511468952190482e-05, + "loss": 0.4206, + "step": 1938 + }, + { + "epoch": 0.23595984180103438, + "grad_norm": 2.3720359802246094, + "learning_rate": 1.951086651650918e-05, + "loss": 0.3877, + "step": 1939 + }, + { + "epoch": 0.23608153331305143, + "grad_norm": 3.838566780090332, + "learning_rate": 1.9510263718919723e-05, + "loss": 0.5187, + "step": 1940 + }, + { + "epoch": 0.23620322482506845, + "grad_norm": 0.7764478921890259, + "learning_rate": 1.9509660559445042e-05, + "loss": 0.4338, + "step": 1941 + }, + { + "epoch": 0.2363249163370855, + "grad_norm": 0.8405648469924927, + "learning_rate": 1.9509057038108095e-05, + "loss": 0.4133, + "step": 1942 + }, + { + "epoch": 0.2364466078491025, + "grad_norm": 2.0843472480773926, + "learning_rate": 1.950845315493185e-05, + "loss": 0.4736, + "step": 1943 + }, + { + "epoch": 0.23656829936111956, + "grad_norm": 1.8424642086029053, + "learning_rate": 1.9507848909939273e-05, + "loss": 0.4793, + "step": 1944 + }, + { + "epoch": 0.2366899908731366, + "grad_norm": 0.6707680821418762, + "learning_rate": 1.950724430315337e-05, + "loss": 0.4889, + "step": 1945 + }, + { + "epoch": 0.23681168238515363, + "grad_norm": 2.9982569217681885, + "learning_rate": 1.9506639334597137e-05, + "loss": 0.4055, + "step": 1946 + }, + { + "epoch": 0.23693337389717067, + "grad_norm": 1.0548427104949951, + "learning_rate": 1.9506034004293606e-05, + "loss": 0.486, + "step": 1947 + }, + { + "epoch": 0.23705506540918772, + "grad_norm": 0.7568971514701843, + "learning_rate": 1.9505428312265802e-05, + "loss": 0.4748, + "step": 1948 + }, + { + "epoch": 0.23717675692120474, + "grad_norm": 1.1806089878082275, + "learning_rate": 1.9504822258536773e-05, + "loss": 0.4741, + "step": 1949 + }, + { + "epoch": 0.2372984484332218, + "grad_norm": 1.6513252258300781, + "learning_rate": 1.9504215843129585e-05, + "loss": 0.4595, + "step": 1950 + }, + { + "epoch": 0.2374201399452388, + "grad_norm": 1.5319706201553345, + "learning_rate": 1.9503609066067315e-05, + "loss": 0.4736, + "step": 1951 + }, + { + "epoch": 0.23754183145725585, + "grad_norm": 1.2497912645339966, + "learning_rate": 1.9503001927373045e-05, + "loss": 0.4246, + "step": 1952 + }, + { + "epoch": 0.2376635229692729, + "grad_norm": 1.8263548612594604, + "learning_rate": 1.950239442706988e-05, + "loss": 0.4915, + "step": 1953 + }, + { + "epoch": 0.23778521448128992, + "grad_norm": 4.213039398193359, + "learning_rate": 1.9501786565180944e-05, + "loss": 0.5669, + "step": 1954 + }, + { + "epoch": 0.23790690599330697, + "grad_norm": 1.3360931873321533, + "learning_rate": 1.9501178341729356e-05, + "loss": 0.4602, + "step": 1955 + }, + { + "epoch": 0.238028597505324, + "grad_norm": 2.0655064582824707, + "learning_rate": 1.950056975673827e-05, + "loss": 0.4722, + "step": 1956 + }, + { + "epoch": 0.23815028901734103, + "grad_norm": 0.5991892218589783, + "learning_rate": 1.9499960810230836e-05, + "loss": 0.4838, + "step": 1957 + }, + { + "epoch": 0.23827198052935808, + "grad_norm": 1.3843307495117188, + "learning_rate": 1.949935150223023e-05, + "loss": 0.4921, + "step": 1958 + }, + { + "epoch": 0.23839367204137513, + "grad_norm": 2.5422253608703613, + "learning_rate": 1.9498741832759638e-05, + "loss": 0.4749, + "step": 1959 + }, + { + "epoch": 0.23851536355339215, + "grad_norm": 5.866820335388184, + "learning_rate": 1.9498131801842256e-05, + "loss": 0.4401, + "step": 1960 + }, + { + "epoch": 0.2386370550654092, + "grad_norm": 1.6477247476577759, + "learning_rate": 1.9497521409501302e-05, + "loss": 0.4534, + "step": 1961 + }, + { + "epoch": 0.2387587465774262, + "grad_norm": 1.0300894975662231, + "learning_rate": 1.9496910655759996e-05, + "loss": 0.4297, + "step": 1962 + }, + { + "epoch": 0.23888043808944326, + "grad_norm": 3.3744678497314453, + "learning_rate": 1.9496299540641586e-05, + "loss": 0.5299, + "step": 1963 + }, + { + "epoch": 0.2390021296014603, + "grad_norm": 1.0261774063110352, + "learning_rate": 1.949568806416932e-05, + "loss": 0.4944, + "step": 1964 + }, + { + "epoch": 0.23912382111347733, + "grad_norm": 0.6185161471366882, + "learning_rate": 1.949507622636647e-05, + "loss": 0.4847, + "step": 1965 + }, + { + "epoch": 0.23924551262549437, + "grad_norm": 1.6548773050308228, + "learning_rate": 1.9494464027256313e-05, + "loss": 0.4379, + "step": 1966 + }, + { + "epoch": 0.23936720413751142, + "grad_norm": 2.1984941959381104, + "learning_rate": 1.9493851466862147e-05, + "loss": 0.4849, + "step": 1967 + }, + { + "epoch": 0.23948889564952844, + "grad_norm": 1.665209174156189, + "learning_rate": 1.9493238545207284e-05, + "loss": 0.4208, + "step": 1968 + }, + { + "epoch": 0.23961058716154549, + "grad_norm": 1.1189358234405518, + "learning_rate": 1.9492625262315044e-05, + "loss": 0.506, + "step": 1969 + }, + { + "epoch": 0.23973227867356253, + "grad_norm": 2.0177454948425293, + "learning_rate": 1.9492011618208764e-05, + "loss": 0.4045, + "step": 1970 + }, + { + "epoch": 0.23985397018557955, + "grad_norm": 2.4533140659332275, + "learning_rate": 1.9491397612911793e-05, + "loss": 0.5428, + "step": 1971 + }, + { + "epoch": 0.2399756616975966, + "grad_norm": 2.858595371246338, + "learning_rate": 1.94907832464475e-05, + "loss": 0.5619, + "step": 1972 + }, + { + "epoch": 0.24009735320961362, + "grad_norm": 0.5678976774215698, + "learning_rate": 1.9490168518839255e-05, + "loss": 0.473, + "step": 1973 + }, + { + "epoch": 0.24021904472163066, + "grad_norm": 1.6049307584762573, + "learning_rate": 1.9489553430110458e-05, + "loss": 0.4734, + "step": 1974 + }, + { + "epoch": 0.2403407362336477, + "grad_norm": 2.4758260250091553, + "learning_rate": 1.9488937980284508e-05, + "loss": 0.4305, + "step": 1975 + }, + { + "epoch": 0.24046242774566473, + "grad_norm": 1.9499974250793457, + "learning_rate": 1.948832216938483e-05, + "loss": 0.4245, + "step": 1976 + }, + { + "epoch": 0.24058411925768178, + "grad_norm": 1.9068377017974854, + "learning_rate": 1.948770599743485e-05, + "loss": 0.5296, + "step": 1977 + }, + { + "epoch": 0.24070581076969882, + "grad_norm": 0.6975396275520325, + "learning_rate": 1.948708946445802e-05, + "loss": 0.4464, + "step": 1978 + }, + { + "epoch": 0.24082750228171584, + "grad_norm": 4.860745429992676, + "learning_rate": 1.94864725704778e-05, + "loss": 0.5933, + "step": 1979 + }, + { + "epoch": 0.2409491937937329, + "grad_norm": 0.5379089117050171, + "learning_rate": 1.948585531551766e-05, + "loss": 0.4426, + "step": 1980 + }, + { + "epoch": 0.2410708853057499, + "grad_norm": 1.878548502922058, + "learning_rate": 1.9485237699601095e-05, + "loss": 0.4766, + "step": 1981 + }, + { + "epoch": 0.24119257681776696, + "grad_norm": 1.4302805662155151, + "learning_rate": 1.9484619722751596e-05, + "loss": 0.4927, + "step": 1982 + }, + { + "epoch": 0.241314268329784, + "grad_norm": 1.1551105976104736, + "learning_rate": 1.948400138499269e-05, + "loss": 0.5121, + "step": 1983 + }, + { + "epoch": 0.24143595984180102, + "grad_norm": 2.576261043548584, + "learning_rate": 1.9483382686347898e-05, + "loss": 0.4855, + "step": 1984 + }, + { + "epoch": 0.24155765135381807, + "grad_norm": 1.2556712627410889, + "learning_rate": 1.9482763626840767e-05, + "loss": 0.5142, + "step": 1985 + }, + { + "epoch": 0.24167934286583512, + "grad_norm": 1.447229027748108, + "learning_rate": 1.948214420649485e-05, + "loss": 0.5148, + "step": 1986 + }, + { + "epoch": 0.24180103437785214, + "grad_norm": 5.459035396575928, + "learning_rate": 1.9481524425333717e-05, + "loss": 0.4225, + "step": 1987 + }, + { + "epoch": 0.24192272588986918, + "grad_norm": 2.6113619804382324, + "learning_rate": 1.948090428338096e-05, + "loss": 0.5247, + "step": 1988 + }, + { + "epoch": 0.24204441740188623, + "grad_norm": 2.7069947719573975, + "learning_rate": 1.9480283780660164e-05, + "loss": 0.4035, + "step": 1989 + }, + { + "epoch": 0.24216610891390325, + "grad_norm": 1.63068687915802, + "learning_rate": 1.947966291719495e-05, + "loss": 0.467, + "step": 1990 + }, + { + "epoch": 0.2422878004259203, + "grad_norm": 3.6869382858276367, + "learning_rate": 1.9479041693008944e-05, + "loss": 0.5476, + "step": 1991 + }, + { + "epoch": 0.24240949193793732, + "grad_norm": 3.250044584274292, + "learning_rate": 1.9478420108125773e-05, + "loss": 0.5629, + "step": 1992 + }, + { + "epoch": 0.24253118344995436, + "grad_norm": 3.5663516521453857, + "learning_rate": 1.9477798162569105e-05, + "loss": 0.4127, + "step": 1993 + }, + { + "epoch": 0.2426528749619714, + "grad_norm": 2.670294761657715, + "learning_rate": 1.9477175856362597e-05, + "loss": 0.4251, + "step": 1994 + }, + { + "epoch": 0.24277456647398843, + "grad_norm": 4.858532905578613, + "learning_rate": 1.9476553189529928e-05, + "loss": 0.4047, + "step": 1995 + }, + { + "epoch": 0.24289625798600548, + "grad_norm": 0.7859447002410889, + "learning_rate": 1.9475930162094797e-05, + "loss": 0.4906, + "step": 1996 + }, + { + "epoch": 0.24301794949802252, + "grad_norm": 0.641616702079773, + "learning_rate": 1.9475306774080906e-05, + "loss": 0.4803, + "step": 1997 + }, + { + "epoch": 0.24313964101003954, + "grad_norm": 2.0165393352508545, + "learning_rate": 1.9474683025511984e-05, + "loss": 0.4235, + "step": 1998 + }, + { + "epoch": 0.2432613325220566, + "grad_norm": 0.6107766032218933, + "learning_rate": 1.947405891641176e-05, + "loss": 0.4807, + "step": 1999 + }, + { + "epoch": 0.24338302403407364, + "grad_norm": 1.1288394927978516, + "learning_rate": 1.9473434446803983e-05, + "loss": 0.4626, + "step": 2000 + }, + { + "epoch": 0.24350471554609066, + "grad_norm": 0.6809804439544678, + "learning_rate": 1.9472809616712416e-05, + "loss": 0.4509, + "step": 2001 + }, + { + "epoch": 0.2436264070581077, + "grad_norm": 3.5753886699676514, + "learning_rate": 1.947218442616084e-05, + "loss": 0.5296, + "step": 2002 + }, + { + "epoch": 0.24374809857012472, + "grad_norm": 0.5915960073471069, + "learning_rate": 1.9471558875173032e-05, + "loss": 0.4311, + "step": 2003 + }, + { + "epoch": 0.24386979008214177, + "grad_norm": 3.084946870803833, + "learning_rate": 1.947093296377281e-05, + "loss": 0.5147, + "step": 2004 + }, + { + "epoch": 0.24399148159415882, + "grad_norm": 0.6248292922973633, + "learning_rate": 1.9470306691983985e-05, + "loss": 0.4607, + "step": 2005 + }, + { + "epoch": 0.24411317310617583, + "grad_norm": 2.3376266956329346, + "learning_rate": 1.9469680059830385e-05, + "loss": 0.5279, + "step": 2006 + }, + { + "epoch": 0.24423486461819288, + "grad_norm": 1.9937899112701416, + "learning_rate": 1.9469053067335858e-05, + "loss": 0.4314, + "step": 2007 + }, + { + "epoch": 0.24435655613020993, + "grad_norm": 1.7855887413024902, + "learning_rate": 1.9468425714524265e-05, + "loss": 0.5054, + "step": 2008 + }, + { + "epoch": 0.24447824764222695, + "grad_norm": 2.464897871017456, + "learning_rate": 1.9467798001419473e-05, + "loss": 0.4798, + "step": 2009 + }, + { + "epoch": 0.244599939154244, + "grad_norm": 1.9896260499954224, + "learning_rate": 1.9467169928045368e-05, + "loss": 0.4576, + "step": 2010 + }, + { + "epoch": 0.24472163066626104, + "grad_norm": 2.3477017879486084, + "learning_rate": 1.9466541494425853e-05, + "loss": 0.4582, + "step": 2011 + }, + { + "epoch": 0.24484332217827806, + "grad_norm": 1.1433109045028687, + "learning_rate": 1.946591270058484e-05, + "loss": 0.4259, + "step": 2012 + }, + { + "epoch": 0.2449650136902951, + "grad_norm": 3.811953544616699, + "learning_rate": 1.9465283546546256e-05, + "loss": 0.5063, + "step": 2013 + }, + { + "epoch": 0.24508670520231213, + "grad_norm": 3.000900983810425, + "learning_rate": 1.9464654032334037e-05, + "loss": 0.4729, + "step": 2014 + }, + { + "epoch": 0.24520839671432917, + "grad_norm": 4.58302640914917, + "learning_rate": 1.9464024157972147e-05, + "loss": 0.5203, + "step": 2015 + }, + { + "epoch": 0.24533008822634622, + "grad_norm": 3.0696282386779785, + "learning_rate": 1.9463393923484543e-05, + "loss": 0.4356, + "step": 2016 + }, + { + "epoch": 0.24545177973836324, + "grad_norm": 3.5197534561157227, + "learning_rate": 1.9462763328895214e-05, + "loss": 0.489, + "step": 2017 + }, + { + "epoch": 0.2455734712503803, + "grad_norm": 2.1193952560424805, + "learning_rate": 1.9462132374228154e-05, + "loss": 0.4487, + "step": 2018 + }, + { + "epoch": 0.24569516276239733, + "grad_norm": 0.969070315361023, + "learning_rate": 1.946150105950737e-05, + "loss": 0.4758, + "step": 2019 + }, + { + "epoch": 0.24581685427441435, + "grad_norm": 1.8888543844223022, + "learning_rate": 1.946086938475689e-05, + "loss": 0.4576, + "step": 2020 + }, + { + "epoch": 0.2459385457864314, + "grad_norm": 5.966174602508545, + "learning_rate": 1.9460237350000744e-05, + "loss": 0.4188, + "step": 2021 + }, + { + "epoch": 0.24606023729844842, + "grad_norm": 4.978442668914795, + "learning_rate": 1.945960495526299e-05, + "loss": 0.4571, + "step": 2022 + }, + { + "epoch": 0.24618192881046547, + "grad_norm": 4.704474449157715, + "learning_rate": 1.945897220056768e-05, + "loss": 0.4063, + "step": 2023 + }, + { + "epoch": 0.2463036203224825, + "grad_norm": 0.8123753070831299, + "learning_rate": 1.9458339085938902e-05, + "loss": 0.5244, + "step": 2024 + }, + { + "epoch": 0.24642531183449953, + "grad_norm": 0.9004320502281189, + "learning_rate": 1.9457705611400747e-05, + "loss": 0.4622, + "step": 2025 + }, + { + "epoch": 0.24654700334651658, + "grad_norm": 2.759958028793335, + "learning_rate": 1.945707177697731e-05, + "loss": 0.4771, + "step": 2026 + }, + { + "epoch": 0.24666869485853363, + "grad_norm": 2.455812692642212, + "learning_rate": 1.945643758269272e-05, + "loss": 0.5234, + "step": 2027 + }, + { + "epoch": 0.24679038637055065, + "grad_norm": 0.7070157527923584, + "learning_rate": 1.9455803028571108e-05, + "loss": 0.4688, + "step": 2028 + }, + { + "epoch": 0.2469120778825677, + "grad_norm": 1.804274320602417, + "learning_rate": 1.945516811463662e-05, + "loss": 0.4572, + "step": 2029 + }, + { + "epoch": 0.24703376939458474, + "grad_norm": 1.191839575767517, + "learning_rate": 1.945453284091341e-05, + "loss": 0.5136, + "step": 2030 + }, + { + "epoch": 0.24715546090660176, + "grad_norm": 3.599769115447998, + "learning_rate": 1.945389720742566e-05, + "loss": 0.5154, + "step": 2031 + }, + { + "epoch": 0.2472771524186188, + "grad_norm": 1.8192546367645264, + "learning_rate": 1.9453261214197548e-05, + "loss": 0.4857, + "step": 2032 + }, + { + "epoch": 0.24739884393063583, + "grad_norm": 2.1078298091888428, + "learning_rate": 1.945262486125328e-05, + "loss": 0.4766, + "step": 2033 + }, + { + "epoch": 0.24752053544265287, + "grad_norm": 1.5302866697311401, + "learning_rate": 1.9451988148617077e-05, + "loss": 0.4294, + "step": 2034 + }, + { + "epoch": 0.24764222695466992, + "grad_norm": 0.7828501462936401, + "learning_rate": 1.9451351076313153e-05, + "loss": 0.4909, + "step": 2035 + }, + { + "epoch": 0.24776391846668694, + "grad_norm": 1.9352378845214844, + "learning_rate": 1.9450713644365758e-05, + "loss": 0.5069, + "step": 2036 + }, + { + "epoch": 0.24788560997870399, + "grad_norm": 2.0285608768463135, + "learning_rate": 1.9450075852799148e-05, + "loss": 0.4491, + "step": 2037 + }, + { + "epoch": 0.24800730149072103, + "grad_norm": 1.0679059028625488, + "learning_rate": 1.9449437701637596e-05, + "loss": 0.4988, + "step": 2038 + }, + { + "epoch": 0.24812899300273805, + "grad_norm": 2.314944267272949, + "learning_rate": 1.9448799190905373e-05, + "loss": 0.4612, + "step": 2039 + }, + { + "epoch": 0.2482506845147551, + "grad_norm": 1.9832383394241333, + "learning_rate": 1.9448160320626787e-05, + "loss": 0.4437, + "step": 2040 + }, + { + "epoch": 0.24837237602677215, + "grad_norm": 2.0206549167633057, + "learning_rate": 1.9447521090826144e-05, + "loss": 0.5113, + "step": 2041 + }, + { + "epoch": 0.24849406753878917, + "grad_norm": 1.4016667604446411, + "learning_rate": 1.9446881501527764e-05, + "loss": 0.5089, + "step": 2042 + }, + { + "epoch": 0.2486157590508062, + "grad_norm": 2.2301433086395264, + "learning_rate": 1.9446241552755994e-05, + "loss": 0.4546, + "step": 2043 + }, + { + "epoch": 0.24873745056282323, + "grad_norm": 1.0734565258026123, + "learning_rate": 1.9445601244535178e-05, + "loss": 0.4469, + "step": 2044 + }, + { + "epoch": 0.24885914207484028, + "grad_norm": 0.7326973676681519, + "learning_rate": 1.9444960576889683e-05, + "loss": 0.4469, + "step": 2045 + }, + { + "epoch": 0.24898083358685733, + "grad_norm": 0.9987289905548096, + "learning_rate": 1.9444319549843887e-05, + "loss": 0.4473, + "step": 2046 + }, + { + "epoch": 0.24910252509887434, + "grad_norm": 2.1177191734313965, + "learning_rate": 1.9443678163422188e-05, + "loss": 0.5367, + "step": 2047 + }, + { + "epoch": 0.2492242166108914, + "grad_norm": 0.7155449986457825, + "learning_rate": 1.9443036417648983e-05, + "loss": 0.4839, + "step": 2048 + }, + { + "epoch": 0.24934590812290844, + "grad_norm": 0.7096008658409119, + "learning_rate": 1.94423943125487e-05, + "loss": 0.4979, + "step": 2049 + }, + { + "epoch": 0.24946759963492546, + "grad_norm": 1.0529060363769531, + "learning_rate": 1.9441751848145766e-05, + "loss": 0.4686, + "step": 2050 + }, + { + "epoch": 0.2495892911469425, + "grad_norm": 0.7185301184654236, + "learning_rate": 1.9441109024464633e-05, + "loss": 0.4914, + "step": 2051 + }, + { + "epoch": 0.24971098265895952, + "grad_norm": 0.927126944065094, + "learning_rate": 1.9440465841529756e-05, + "loss": 0.5258, + "step": 2052 + }, + { + "epoch": 0.24983267417097657, + "grad_norm": 0.8160197734832764, + "learning_rate": 1.9439822299365614e-05, + "loss": 0.5177, + "step": 2053 + }, + { + "epoch": 0.24995436568299362, + "grad_norm": 1.8633160591125488, + "learning_rate": 1.9439178397996697e-05, + "loss": 0.5176, + "step": 2054 + }, + { + "epoch": 0.25007605719501064, + "grad_norm": 0.8045934438705444, + "learning_rate": 1.9438534137447504e-05, + "loss": 0.4727, + "step": 2055 + }, + { + "epoch": 0.2501977487070277, + "grad_norm": 0.9101488590240479, + "learning_rate": 1.943788951774255e-05, + "loss": 0.4853, + "step": 2056 + }, + { + "epoch": 0.25031944021904473, + "grad_norm": 1.4784225225448608, + "learning_rate": 1.943724453890636e-05, + "loss": 0.4963, + "step": 2057 + }, + { + "epoch": 0.25044113173106175, + "grad_norm": 0.8246391415596008, + "learning_rate": 1.9436599200963483e-05, + "loss": 0.5007, + "step": 2058 + }, + { + "epoch": 0.25056282324307877, + "grad_norm": 3.6727147102355957, + "learning_rate": 1.9435953503938477e-05, + "loss": 0.4572, + "step": 2059 + }, + { + "epoch": 0.25068451475509584, + "grad_norm": 0.6610450148582458, + "learning_rate": 1.9435307447855907e-05, + "loss": 0.5188, + "step": 2060 + }, + { + "epoch": 0.25080620626711286, + "grad_norm": 3.301255464553833, + "learning_rate": 1.943466103274036e-05, + "loss": 0.4586, + "step": 2061 + }, + { + "epoch": 0.2509278977791299, + "grad_norm": 0.9013835191726685, + "learning_rate": 1.943401425861643e-05, + "loss": 0.5056, + "step": 2062 + }, + { + "epoch": 0.25104958929114696, + "grad_norm": 1.0015555620193481, + "learning_rate": 1.943336712550873e-05, + "loss": 0.5238, + "step": 2063 + }, + { + "epoch": 0.251171280803164, + "grad_norm": 1.1146483421325684, + "learning_rate": 1.9432719633441887e-05, + "loss": 0.4374, + "step": 2064 + }, + { + "epoch": 0.251292972315181, + "grad_norm": 0.6692948937416077, + "learning_rate": 1.9432071782440536e-05, + "loss": 0.4511, + "step": 2065 + }, + { + "epoch": 0.25141466382719807, + "grad_norm": 2.12496018409729, + "learning_rate": 1.943142357252933e-05, + "loss": 0.3818, + "step": 2066 + }, + { + "epoch": 0.2515363553392151, + "grad_norm": 3.132035732269287, + "learning_rate": 1.9430775003732934e-05, + "loss": 0.4566, + "step": 2067 + }, + { + "epoch": 0.2516580468512321, + "grad_norm": 0.588349461555481, + "learning_rate": 1.943012607607603e-05, + "loss": 0.4216, + "step": 2068 + }, + { + "epoch": 0.2517797383632492, + "grad_norm": 3.744241714477539, + "learning_rate": 1.942947678958331e-05, + "loss": 0.5176, + "step": 2069 + }, + { + "epoch": 0.2519014298752662, + "grad_norm": 0.6484661102294922, + "learning_rate": 1.9428827144279482e-05, + "loss": 0.4388, + "step": 2070 + }, + { + "epoch": 0.2520231213872832, + "grad_norm": 4.514074325561523, + "learning_rate": 1.942817714018926e-05, + "loss": 0.5645, + "step": 2071 + }, + { + "epoch": 0.2521448128993003, + "grad_norm": 0.5719945430755615, + "learning_rate": 1.9427526777337383e-05, + "loss": 0.45, + "step": 2072 + }, + { + "epoch": 0.2522665044113173, + "grad_norm": 3.7293598651885986, + "learning_rate": 1.94268760557486e-05, + "loss": 0.4308, + "step": 2073 + }, + { + "epoch": 0.25238819592333434, + "grad_norm": 0.8145673274993896, + "learning_rate": 1.942622497544767e-05, + "loss": 0.5112, + "step": 2074 + }, + { + "epoch": 0.2525098874353514, + "grad_norm": 0.6826562881469727, + "learning_rate": 1.942557353645937e-05, + "loss": 0.5207, + "step": 2075 + }, + { + "epoch": 0.25263157894736843, + "grad_norm": 0.5564038157463074, + "learning_rate": 1.9424921738808488e-05, + "loss": 0.4955, + "step": 2076 + }, + { + "epoch": 0.25275327045938545, + "grad_norm": 1.6042362451553345, + "learning_rate": 1.9424269582519823e-05, + "loss": 0.4865, + "step": 2077 + }, + { + "epoch": 0.25287496197140247, + "grad_norm": 2.475109100341797, + "learning_rate": 1.942361706761819e-05, + "loss": 0.4577, + "step": 2078 + }, + { + "epoch": 0.25299665348341954, + "grad_norm": 1.4047729969024658, + "learning_rate": 1.9422964194128427e-05, + "loss": 0.5013, + "step": 2079 + }, + { + "epoch": 0.25311834499543656, + "grad_norm": 0.605525016784668, + "learning_rate": 1.9422310962075372e-05, + "loss": 0.4243, + "step": 2080 + }, + { + "epoch": 0.2532400365074536, + "grad_norm": 2.4479782581329346, + "learning_rate": 1.942165737148388e-05, + "loss": 0.495, + "step": 2081 + }, + { + "epoch": 0.25336172801947066, + "grad_norm": 3.241241931915283, + "learning_rate": 1.942100342237882e-05, + "loss": 0.5247, + "step": 2082 + }, + { + "epoch": 0.2534834195314877, + "grad_norm": 1.4796130657196045, + "learning_rate": 1.9420349114785085e-05, + "loss": 0.4205, + "step": 2083 + }, + { + "epoch": 0.2536051110435047, + "grad_norm": 0.9417670369148254, + "learning_rate": 1.9419694448727566e-05, + "loss": 0.4824, + "step": 2084 + }, + { + "epoch": 0.25372680255552177, + "grad_norm": 1.0193896293640137, + "learning_rate": 1.9419039424231175e-05, + "loss": 0.474, + "step": 2085 + }, + { + "epoch": 0.2538484940675388, + "grad_norm": 0.62428218126297, + "learning_rate": 1.9418384041320835e-05, + "loss": 0.4622, + "step": 2086 + }, + { + "epoch": 0.2539701855795558, + "grad_norm": 0.6079922914505005, + "learning_rate": 1.9417728300021488e-05, + "loss": 0.4668, + "step": 2087 + }, + { + "epoch": 0.2540918770915729, + "grad_norm": 3.7671585083007812, + "learning_rate": 1.9417072200358086e-05, + "loss": 0.4308, + "step": 2088 + }, + { + "epoch": 0.2542135686035899, + "grad_norm": 0.777911365032196, + "learning_rate": 1.9416415742355596e-05, + "loss": 0.5113, + "step": 2089 + }, + { + "epoch": 0.2543352601156069, + "grad_norm": 3.967838764190674, + "learning_rate": 1.9415758926038997e-05, + "loss": 0.4617, + "step": 2090 + }, + { + "epoch": 0.254456951627624, + "grad_norm": 2.4600133895874023, + "learning_rate": 1.9415101751433278e-05, + "loss": 0.4774, + "step": 2091 + }, + { + "epoch": 0.254578643139641, + "grad_norm": 0.7972972989082336, + "learning_rate": 1.941444421856345e-05, + "loss": 0.4907, + "step": 2092 + }, + { + "epoch": 0.25470033465165803, + "grad_norm": 0.8862624764442444, + "learning_rate": 1.9413786327454534e-05, + "loss": 0.5256, + "step": 2093 + }, + { + "epoch": 0.2548220261636751, + "grad_norm": 3.9216880798339844, + "learning_rate": 1.941312807813156e-05, + "loss": 0.4142, + "step": 2094 + }, + { + "epoch": 0.2549437176756921, + "grad_norm": 0.7092095017433167, + "learning_rate": 1.9412469470619582e-05, + "loss": 0.4778, + "step": 2095 + }, + { + "epoch": 0.25506540918770915, + "grad_norm": 1.5533114671707153, + "learning_rate": 1.941181050494366e-05, + "loss": 0.4856, + "step": 2096 + }, + { + "epoch": 0.2551871006997262, + "grad_norm": 0.6967470049858093, + "learning_rate": 1.9411151181128862e-05, + "loss": 0.475, + "step": 2097 + }, + { + "epoch": 0.25530879221174324, + "grad_norm": 2.781846523284912, + "learning_rate": 1.9410491499200282e-05, + "loss": 0.4963, + "step": 2098 + }, + { + "epoch": 0.25543048372376026, + "grad_norm": 0.8223859071731567, + "learning_rate": 1.9409831459183022e-05, + "loss": 0.4714, + "step": 2099 + }, + { + "epoch": 0.2555521752357773, + "grad_norm": 0.7446115612983704, + "learning_rate": 1.94091710611022e-05, + "loss": 0.457, + "step": 2100 + }, + { + "epoch": 0.25567386674779435, + "grad_norm": 3.1892948150634766, + "learning_rate": 1.940851030498294e-05, + "loss": 0.4483, + "step": 2101 + }, + { + "epoch": 0.2557955582598114, + "grad_norm": 1.5362653732299805, + "learning_rate": 1.9407849190850392e-05, + "loss": 0.526, + "step": 2102 + }, + { + "epoch": 0.2559172497718284, + "grad_norm": 2.39308500289917, + "learning_rate": 1.9407187718729706e-05, + "loss": 0.4301, + "step": 2103 + }, + { + "epoch": 0.25603894128384547, + "grad_norm": 0.7289983034133911, + "learning_rate": 1.9406525888646056e-05, + "loss": 0.4903, + "step": 2104 + }, + { + "epoch": 0.2561606327958625, + "grad_norm": 1.737002968788147, + "learning_rate": 1.940586370062463e-05, + "loss": 0.502, + "step": 2105 + }, + { + "epoch": 0.2562823243078795, + "grad_norm": 1.964486002922058, + "learning_rate": 1.9405201154690613e-05, + "loss": 0.4537, + "step": 2106 + }, + { + "epoch": 0.2564040158198966, + "grad_norm": 1.0727633237838745, + "learning_rate": 1.9404538250869232e-05, + "loss": 0.4334, + "step": 2107 + }, + { + "epoch": 0.2565257073319136, + "grad_norm": 0.9376628398895264, + "learning_rate": 1.94038749891857e-05, + "loss": 0.4774, + "step": 2108 + }, + { + "epoch": 0.2566473988439306, + "grad_norm": 2.0631730556488037, + "learning_rate": 1.940321136966526e-05, + "loss": 0.4905, + "step": 2109 + }, + { + "epoch": 0.2567690903559477, + "grad_norm": 0.629659116268158, + "learning_rate": 1.9402547392333164e-05, + "loss": 0.4625, + "step": 2110 + }, + { + "epoch": 0.2568907818679647, + "grad_norm": 0.5920600891113281, + "learning_rate": 1.940188305721468e-05, + "loss": 0.4606, + "step": 2111 + }, + { + "epoch": 0.25701247337998173, + "grad_norm": 2.8812389373779297, + "learning_rate": 1.940121836433508e-05, + "loss": 0.5282, + "step": 2112 + }, + { + "epoch": 0.2571341648919988, + "grad_norm": 0.6083371043205261, + "learning_rate": 1.9400553313719665e-05, + "loss": 0.4579, + "step": 2113 + }, + { + "epoch": 0.2572558564040158, + "grad_norm": 1.8386752605438232, + "learning_rate": 1.9399887905393736e-05, + "loss": 0.4796, + "step": 2114 + }, + { + "epoch": 0.25737754791603284, + "grad_norm": 0.7643177509307861, + "learning_rate": 1.939922213938262e-05, + "loss": 0.5253, + "step": 2115 + }, + { + "epoch": 0.2574992394280499, + "grad_norm": 6.415184020996094, + "learning_rate": 1.9398556015711642e-05, + "loss": 0.4542, + "step": 2116 + }, + { + "epoch": 0.25762093094006694, + "grad_norm": 1.6722726821899414, + "learning_rate": 1.9397889534406157e-05, + "loss": 0.5623, + "step": 2117 + }, + { + "epoch": 0.25774262245208396, + "grad_norm": 6.019209861755371, + "learning_rate": 1.939722269549152e-05, + "loss": 0.4724, + "step": 2118 + }, + { + "epoch": 0.257864313964101, + "grad_norm": 5.110898494720459, + "learning_rate": 1.939655549899311e-05, + "loss": 0.4221, + "step": 2119 + }, + { + "epoch": 0.25798600547611805, + "grad_norm": 1.2796639204025269, + "learning_rate": 1.939588794493631e-05, + "loss": 0.4717, + "step": 2120 + }, + { + "epoch": 0.25810769698813507, + "grad_norm": 0.8888446688652039, + "learning_rate": 1.939522003334653e-05, + "loss": 0.4764, + "step": 2121 + }, + { + "epoch": 0.2582293885001521, + "grad_norm": 0.6068385243415833, + "learning_rate": 1.939455176424918e-05, + "loss": 0.4427, + "step": 2122 + }, + { + "epoch": 0.25835108001216917, + "grad_norm": 4.162399768829346, + "learning_rate": 1.9393883137669685e-05, + "loss": 0.5389, + "step": 2123 + }, + { + "epoch": 0.2584727715241862, + "grad_norm": 2.305088996887207, + "learning_rate": 1.93932141536335e-05, + "loss": 0.4304, + "step": 2124 + }, + { + "epoch": 0.2585944630362032, + "grad_norm": 0.608076810836792, + "learning_rate": 1.9392544812166067e-05, + "loss": 0.4071, + "step": 2125 + }, + { + "epoch": 0.2587161545482203, + "grad_norm": 2.529494047164917, + "learning_rate": 1.9391875113292867e-05, + "loss": 0.4576, + "step": 2126 + }, + { + "epoch": 0.2588378460602373, + "grad_norm": 1.8165841102600098, + "learning_rate": 1.9391205057039373e-05, + "loss": 0.4516, + "step": 2127 + }, + { + "epoch": 0.2589595375722543, + "grad_norm": 2.3291776180267334, + "learning_rate": 1.9390534643431095e-05, + "loss": 0.5445, + "step": 2128 + }, + { + "epoch": 0.2590812290842714, + "grad_norm": 0.573996901512146, + "learning_rate": 1.9389863872493532e-05, + "loss": 0.4904, + "step": 2129 + }, + { + "epoch": 0.2592029205962884, + "grad_norm": 1.5546201467514038, + "learning_rate": 1.9389192744252213e-05, + "loss": 0.5248, + "step": 2130 + }, + { + "epoch": 0.25932461210830543, + "grad_norm": 1.4292373657226562, + "learning_rate": 1.9388521258732675e-05, + "loss": 0.5139, + "step": 2131 + }, + { + "epoch": 0.2594463036203225, + "grad_norm": 4.89036750793457, + "learning_rate": 1.938784941596047e-05, + "loss": 0.448, + "step": 2132 + }, + { + "epoch": 0.2595679951323395, + "grad_norm": 4.473513126373291, + "learning_rate": 1.9387177215961167e-05, + "loss": 0.5163, + "step": 2133 + }, + { + "epoch": 0.25968968664435654, + "grad_norm": 4.129972457885742, + "learning_rate": 1.9386504658760334e-05, + "loss": 0.5059, + "step": 2134 + }, + { + "epoch": 0.2598113781563736, + "grad_norm": 3.488123655319214, + "learning_rate": 1.9385831744383577e-05, + "loss": 0.5243, + "step": 2135 + }, + { + "epoch": 0.25993306966839064, + "grad_norm": 1.467781662940979, + "learning_rate": 1.938515847285649e-05, + "loss": 0.5173, + "step": 2136 + }, + { + "epoch": 0.26005476118040766, + "grad_norm": 1.2999566793441772, + "learning_rate": 1.9384484844204698e-05, + "loss": 0.4551, + "step": 2137 + }, + { + "epoch": 0.2601764526924247, + "grad_norm": 1.2947548627853394, + "learning_rate": 1.938381085845383e-05, + "loss": 0.4794, + "step": 2138 + }, + { + "epoch": 0.26029814420444175, + "grad_norm": 3.792018175125122, + "learning_rate": 1.938313651562954e-05, + "loss": 0.5355, + "step": 2139 + }, + { + "epoch": 0.26041983571645877, + "grad_norm": 1.3959805965423584, + "learning_rate": 1.938246181575748e-05, + "loss": 0.438, + "step": 2140 + }, + { + "epoch": 0.2605415272284758, + "grad_norm": 1.5099070072174072, + "learning_rate": 1.938178675886333e-05, + "loss": 0.4443, + "step": 2141 + }, + { + "epoch": 0.26066321874049286, + "grad_norm": 0.9759753346443176, + "learning_rate": 1.9381111344972772e-05, + "loss": 0.4437, + "step": 2142 + }, + { + "epoch": 0.2607849102525099, + "grad_norm": 1.4681051969528198, + "learning_rate": 1.9380435574111512e-05, + "loss": 0.4046, + "step": 2143 + }, + { + "epoch": 0.2609066017645269, + "grad_norm": 1.7579940557479858, + "learning_rate": 1.9379759446305263e-05, + "loss": 0.4154, + "step": 2144 + }, + { + "epoch": 0.261028293276544, + "grad_norm": 2.2013461589813232, + "learning_rate": 1.9379082961579747e-05, + "loss": 0.4129, + "step": 2145 + }, + { + "epoch": 0.261149984788561, + "grad_norm": 1.7656598091125488, + "learning_rate": 1.9378406119960713e-05, + "loss": 0.4611, + "step": 2146 + }, + { + "epoch": 0.261271676300578, + "grad_norm": 1.525275707244873, + "learning_rate": 1.9377728921473915e-05, + "loss": 0.4184, + "step": 2147 + }, + { + "epoch": 0.2613933678125951, + "grad_norm": 2.439988613128662, + "learning_rate": 1.937705136614512e-05, + "loss": 0.4759, + "step": 2148 + }, + { + "epoch": 0.2615150593246121, + "grad_norm": 1.2350704669952393, + "learning_rate": 1.937637345400011e-05, + "loss": 0.4316, + "step": 2149 + }, + { + "epoch": 0.26163675083662913, + "grad_norm": 2.9700818061828613, + "learning_rate": 1.9375695185064686e-05, + "loss": 0.5293, + "step": 2150 + }, + { + "epoch": 0.2617584423486462, + "grad_norm": 2.5232958793640137, + "learning_rate": 1.937501655936465e-05, + "loss": 0.3677, + "step": 2151 + }, + { + "epoch": 0.2618801338606632, + "grad_norm": 2.983248472213745, + "learning_rate": 1.937433757692583e-05, + "loss": 0.5393, + "step": 2152 + }, + { + "epoch": 0.26200182537268024, + "grad_norm": 1.8255480527877808, + "learning_rate": 1.937365823777406e-05, + "loss": 0.5169, + "step": 2153 + }, + { + "epoch": 0.2621235168846973, + "grad_norm": 2.7404890060424805, + "learning_rate": 1.9372978541935192e-05, + "loss": 0.5093, + "step": 2154 + }, + { + "epoch": 0.26224520839671434, + "grad_norm": 0.8094709515571594, + "learning_rate": 1.937229848943509e-05, + "loss": 0.572, + "step": 2155 + }, + { + "epoch": 0.26236689990873135, + "grad_norm": 3.907942771911621, + "learning_rate": 1.9371618080299633e-05, + "loss": 0.4653, + "step": 2156 + }, + { + "epoch": 0.26248859142074843, + "grad_norm": 5.443722248077393, + "learning_rate": 1.9370937314554707e-05, + "loss": 0.4763, + "step": 2157 + }, + { + "epoch": 0.26261028293276545, + "grad_norm": 1.1331325769424438, + "learning_rate": 1.937025619222622e-05, + "loss": 0.5443, + "step": 2158 + }, + { + "epoch": 0.26273197444478247, + "grad_norm": 3.942476511001587, + "learning_rate": 1.936957471334009e-05, + "loss": 0.4644, + "step": 2159 + }, + { + "epoch": 0.2628536659567995, + "grad_norm": 4.187047481536865, + "learning_rate": 1.9368892877922248e-05, + "loss": 0.4084, + "step": 2160 + }, + { + "epoch": 0.26297535746881656, + "grad_norm": 0.6882132887840271, + "learning_rate": 1.9368210685998637e-05, + "loss": 0.4785, + "step": 2161 + }, + { + "epoch": 0.2630970489808336, + "grad_norm": 3.2719621658325195, + "learning_rate": 1.936752813759522e-05, + "loss": 0.5122, + "step": 2162 + }, + { + "epoch": 0.2632187404928506, + "grad_norm": 1.8161216974258423, + "learning_rate": 1.936684523273797e-05, + "loss": 0.4466, + "step": 2163 + }, + { + "epoch": 0.2633404320048677, + "grad_norm": 2.6128652095794678, + "learning_rate": 1.936616197145287e-05, + "loss": 0.4741, + "step": 2164 + }, + { + "epoch": 0.2634621235168847, + "grad_norm": 3.289337396621704, + "learning_rate": 1.936547835376592e-05, + "loss": 0.5019, + "step": 2165 + }, + { + "epoch": 0.2635838150289017, + "grad_norm": 0.7658073902130127, + "learning_rate": 1.936479437970313e-05, + "loss": 0.4821, + "step": 2166 + }, + { + "epoch": 0.2637055065409188, + "grad_norm": 0.5153932571411133, + "learning_rate": 1.936411004929053e-05, + "loss": 0.4461, + "step": 2167 + }, + { + "epoch": 0.2638271980529358, + "grad_norm": 1.6056749820709229, + "learning_rate": 1.9363425362554164e-05, + "loss": 0.4823, + "step": 2168 + }, + { + "epoch": 0.2639488895649528, + "grad_norm": 2.5974862575531006, + "learning_rate": 1.936274031952008e-05, + "loss": 0.4662, + "step": 2169 + }, + { + "epoch": 0.2640705810769699, + "grad_norm": 0.7344169616699219, + "learning_rate": 1.9362054920214347e-05, + "loss": 0.4949, + "step": 2170 + }, + { + "epoch": 0.2641922725889869, + "grad_norm": 0.9193450808525085, + "learning_rate": 1.9361369164663045e-05, + "loss": 0.5033, + "step": 2171 + }, + { + "epoch": 0.26431396410100394, + "grad_norm": 1.0487114191055298, + "learning_rate": 1.936068305289227e-05, + "loss": 0.5028, + "step": 2172 + }, + { + "epoch": 0.264435655613021, + "grad_norm": 3.5050742626190186, + "learning_rate": 1.935999658492813e-05, + "loss": 0.4179, + "step": 2173 + }, + { + "epoch": 0.26455734712503803, + "grad_norm": 2.2077624797821045, + "learning_rate": 1.9359309760796744e-05, + "loss": 0.4884, + "step": 2174 + }, + { + "epoch": 0.26467903863705505, + "grad_norm": 1.7855932712554932, + "learning_rate": 1.935862258052425e-05, + "loss": 0.4103, + "step": 2175 + }, + { + "epoch": 0.2648007301490721, + "grad_norm": 1.12953782081604, + "learning_rate": 1.9357935044136795e-05, + "loss": 0.4326, + "step": 2176 + }, + { + "epoch": 0.26492242166108915, + "grad_norm": 2.887451410293579, + "learning_rate": 1.935724715166054e-05, + "loss": 0.5096, + "step": 2177 + }, + { + "epoch": 0.26504411317310617, + "grad_norm": 3.2784857749938965, + "learning_rate": 1.935655890312167e-05, + "loss": 0.4511, + "step": 2178 + }, + { + "epoch": 0.2651658046851232, + "grad_norm": 4.850772380828857, + "learning_rate": 1.9355870298546358e-05, + "loss": 0.5255, + "step": 2179 + }, + { + "epoch": 0.26528749619714026, + "grad_norm": 3.1773927211761475, + "learning_rate": 1.935518133796082e-05, + "loss": 0.492, + "step": 2180 + }, + { + "epoch": 0.2654091877091573, + "grad_norm": 1.1647827625274658, + "learning_rate": 1.9354492021391265e-05, + "loss": 0.4258, + "step": 2181 + }, + { + "epoch": 0.2655308792211743, + "grad_norm": 4.982237339019775, + "learning_rate": 1.935380234886393e-05, + "loss": 0.5518, + "step": 2182 + }, + { + "epoch": 0.2656525707331914, + "grad_norm": 1.5905909538269043, + "learning_rate": 1.935311232040505e-05, + "loss": 0.3749, + "step": 2183 + }, + { + "epoch": 0.2657742622452084, + "grad_norm": 0.5936698317527771, + "learning_rate": 1.935242193604089e-05, + "loss": 0.4552, + "step": 2184 + }, + { + "epoch": 0.2658959537572254, + "grad_norm": 1.5796947479248047, + "learning_rate": 1.9351731195797715e-05, + "loss": 0.4618, + "step": 2185 + }, + { + "epoch": 0.2660176452692425, + "grad_norm": 1.928078532218933, + "learning_rate": 1.935104009970181e-05, + "loss": 0.5464, + "step": 2186 + }, + { + "epoch": 0.2661393367812595, + "grad_norm": 1.3768632411956787, + "learning_rate": 1.9350348647779476e-05, + "loss": 0.4793, + "step": 2187 + }, + { + "epoch": 0.2662610282932765, + "grad_norm": 4.635210037231445, + "learning_rate": 1.934965684005702e-05, + "loss": 0.4367, + "step": 2188 + }, + { + "epoch": 0.2663827198052936, + "grad_norm": 0.9606807231903076, + "learning_rate": 1.934896467656077e-05, + "loss": 0.4888, + "step": 2189 + }, + { + "epoch": 0.2665044113173106, + "grad_norm": 2.4115006923675537, + "learning_rate": 1.934827215731706e-05, + "loss": 0.4899, + "step": 2190 + }, + { + "epoch": 0.26662610282932764, + "grad_norm": 2.0559732913970947, + "learning_rate": 1.9347579282352246e-05, + "loss": 0.4114, + "step": 2191 + }, + { + "epoch": 0.2667477943413447, + "grad_norm": 1.51226007938385, + "learning_rate": 1.9346886051692694e-05, + "loss": 0.4894, + "step": 2192 + }, + { + "epoch": 0.26686948585336173, + "grad_norm": 0.567767858505249, + "learning_rate": 1.9346192465364776e-05, + "loss": 0.4321, + "step": 2193 + }, + { + "epoch": 0.26699117736537875, + "grad_norm": 0.7620263695716858, + "learning_rate": 1.9345498523394893e-05, + "loss": 0.4424, + "step": 2194 + }, + { + "epoch": 0.2671128688773958, + "grad_norm": 0.5864660143852234, + "learning_rate": 1.9344804225809445e-05, + "loss": 0.4139, + "step": 2195 + }, + { + "epoch": 0.26723456038941285, + "grad_norm": 1.7521392107009888, + "learning_rate": 1.9344109572634857e-05, + "loss": 0.4849, + "step": 2196 + }, + { + "epoch": 0.26735625190142986, + "grad_norm": 1.7217597961425781, + "learning_rate": 1.9343414563897555e-05, + "loss": 0.5002, + "step": 2197 + }, + { + "epoch": 0.26747794341344694, + "grad_norm": 0.6881518363952637, + "learning_rate": 1.934271919962399e-05, + "loss": 0.4608, + "step": 2198 + }, + { + "epoch": 0.26759963492546396, + "grad_norm": 2.678766965866089, + "learning_rate": 1.9342023479840622e-05, + "loss": 0.5312, + "step": 2199 + }, + { + "epoch": 0.267721326437481, + "grad_norm": 1.6380953788757324, + "learning_rate": 1.9341327404573925e-05, + "loss": 0.4994, + "step": 2200 + }, + { + "epoch": 0.267843017949498, + "grad_norm": 0.6163226962089539, + "learning_rate": 1.934063097385038e-05, + "loss": 0.5, + "step": 2201 + }, + { + "epoch": 0.26796470946151507, + "grad_norm": 4.325922012329102, + "learning_rate": 1.9339934187696498e-05, + "loss": 0.4136, + "step": 2202 + }, + { + "epoch": 0.2680864009735321, + "grad_norm": 1.6639729738235474, + "learning_rate": 1.933923704613878e-05, + "loss": 0.5488, + "step": 2203 + }, + { + "epoch": 0.2682080924855491, + "grad_norm": 1.3243522644042969, + "learning_rate": 1.9338539549203767e-05, + "loss": 0.5001, + "step": 2204 + }, + { + "epoch": 0.2683297839975662, + "grad_norm": 2.7837817668914795, + "learning_rate": 1.9337841696917996e-05, + "loss": 0.4861, + "step": 2205 + }, + { + "epoch": 0.2684514755095832, + "grad_norm": 3.3673646450042725, + "learning_rate": 1.9337143489308015e-05, + "loss": 0.441, + "step": 2206 + }, + { + "epoch": 0.2685731670216002, + "grad_norm": 1.975538969039917, + "learning_rate": 1.9336444926400402e-05, + "loss": 0.4832, + "step": 2207 + }, + { + "epoch": 0.2686948585336173, + "grad_norm": 0.7136867046356201, + "learning_rate": 1.9335746008221733e-05, + "loss": 0.4625, + "step": 2208 + }, + { + "epoch": 0.2688165500456343, + "grad_norm": 0.8701152205467224, + "learning_rate": 1.93350467347986e-05, + "loss": 0.4747, + "step": 2209 + }, + { + "epoch": 0.26893824155765134, + "grad_norm": 1.676513910293579, + "learning_rate": 1.933434710615762e-05, + "loss": 0.4889, + "step": 2210 + }, + { + "epoch": 0.2690599330696684, + "grad_norm": 1.9879202842712402, + "learning_rate": 1.933364712232541e-05, + "loss": 0.4739, + "step": 2211 + }, + { + "epoch": 0.26918162458168543, + "grad_norm": 1.4625931978225708, + "learning_rate": 1.933294678332861e-05, + "loss": 0.4926, + "step": 2212 + }, + { + "epoch": 0.26930331609370245, + "grad_norm": 1.0162174701690674, + "learning_rate": 1.9332246089193867e-05, + "loss": 0.4096, + "step": 2213 + }, + { + "epoch": 0.2694250076057195, + "grad_norm": 1.7706711292266846, + "learning_rate": 1.933154503994784e-05, + "loss": 0.5084, + "step": 2214 + }, + { + "epoch": 0.26954669911773654, + "grad_norm": 1.3316534757614136, + "learning_rate": 1.9330843635617212e-05, + "loss": 0.481, + "step": 2215 + }, + { + "epoch": 0.26966839062975356, + "grad_norm": 0.7049447894096375, + "learning_rate": 1.933014187622867e-05, + "loss": 0.4486, + "step": 2216 + }, + { + "epoch": 0.26979008214177064, + "grad_norm": 2.222529888153076, + "learning_rate": 1.9329439761808915e-05, + "loss": 0.4474, + "step": 2217 + }, + { + "epoch": 0.26991177365378766, + "grad_norm": 0.7434383630752563, + "learning_rate": 1.932873729238467e-05, + "loss": 0.4741, + "step": 2218 + }, + { + "epoch": 0.2700334651658047, + "grad_norm": 1.619347095489502, + "learning_rate": 1.932803446798266e-05, + "loss": 0.4741, + "step": 2219 + }, + { + "epoch": 0.2701551566778217, + "grad_norm": 1.62326180934906, + "learning_rate": 1.932733128862963e-05, + "loss": 0.4739, + "step": 2220 + }, + { + "epoch": 0.27027684818983877, + "grad_norm": 1.5984550714492798, + "learning_rate": 1.9326627754352336e-05, + "loss": 0.4963, + "step": 2221 + }, + { + "epoch": 0.2703985397018558, + "grad_norm": 0.9873847961425781, + "learning_rate": 1.9325923865177555e-05, + "loss": 0.447, + "step": 2222 + }, + { + "epoch": 0.2705202312138728, + "grad_norm": 2.9002130031585693, + "learning_rate": 1.9325219621132063e-05, + "loss": 0.4574, + "step": 2223 + }, + { + "epoch": 0.2706419227258899, + "grad_norm": 1.3462127447128296, + "learning_rate": 1.9324515022242664e-05, + "loss": 0.4172, + "step": 2224 + }, + { + "epoch": 0.2707636142379069, + "grad_norm": 2.764958620071411, + "learning_rate": 1.9323810068536167e-05, + "loss": 0.5009, + "step": 2225 + }, + { + "epoch": 0.2708853057499239, + "grad_norm": 0.8994090557098389, + "learning_rate": 1.93231047600394e-05, + "loss": 0.4367, + "step": 2226 + }, + { + "epoch": 0.271006997261941, + "grad_norm": 1.600385069847107, + "learning_rate": 1.9322399096779197e-05, + "loss": 0.4662, + "step": 2227 + }, + { + "epoch": 0.271128688773958, + "grad_norm": 1.9499895572662354, + "learning_rate": 1.932169307878241e-05, + "loss": 0.4469, + "step": 2228 + }, + { + "epoch": 0.27125038028597503, + "grad_norm": 2.8471550941467285, + "learning_rate": 1.9320986706075913e-05, + "loss": 0.4985, + "step": 2229 + }, + { + "epoch": 0.2713720717979921, + "grad_norm": 1.845637559890747, + "learning_rate": 1.9320279978686575e-05, + "loss": 0.5318, + "step": 2230 + }, + { + "epoch": 0.27149376331000913, + "grad_norm": 0.7418341636657715, + "learning_rate": 1.931957289664129e-05, + "loss": 0.4912, + "step": 2231 + }, + { + "epoch": 0.27161545482202615, + "grad_norm": 2.186032772064209, + "learning_rate": 1.9318865459966968e-05, + "loss": 0.4461, + "step": 2232 + }, + { + "epoch": 0.2717371463340432, + "grad_norm": 3.187054395675659, + "learning_rate": 1.9318157668690526e-05, + "loss": 0.4776, + "step": 2233 + }, + { + "epoch": 0.27185883784606024, + "grad_norm": 2.6151225566864014, + "learning_rate": 1.9317449522838896e-05, + "loss": 0.4735, + "step": 2234 + }, + { + "epoch": 0.27198052935807726, + "grad_norm": 1.8302274942398071, + "learning_rate": 1.9316741022439024e-05, + "loss": 0.5444, + "step": 2235 + }, + { + "epoch": 0.27210222087009434, + "grad_norm": 0.6191455125808716, + "learning_rate": 1.9316032167517876e-05, + "loss": 0.5168, + "step": 2236 + }, + { + "epoch": 0.27222391238211135, + "grad_norm": 1.1184285879135132, + "learning_rate": 1.9315322958102417e-05, + "loss": 0.4514, + "step": 2237 + }, + { + "epoch": 0.2723456038941284, + "grad_norm": 2.1851346492767334, + "learning_rate": 1.9314613394219642e-05, + "loss": 0.54, + "step": 2238 + }, + { + "epoch": 0.27246729540614545, + "grad_norm": 2.121798515319824, + "learning_rate": 1.9313903475896544e-05, + "loss": 0.4459, + "step": 2239 + }, + { + "epoch": 0.27258898691816247, + "grad_norm": 1.4100135564804077, + "learning_rate": 1.9313193203160142e-05, + "loss": 0.4631, + "step": 2240 + }, + { + "epoch": 0.2727106784301795, + "grad_norm": 1.1367652416229248, + "learning_rate": 1.9312482576037457e-05, + "loss": 0.4941, + "step": 2241 + }, + { + "epoch": 0.2728323699421965, + "grad_norm": 0.8598164319992065, + "learning_rate": 1.9311771594555537e-05, + "loss": 0.4661, + "step": 2242 + }, + { + "epoch": 0.2729540614542136, + "grad_norm": 0.7251045107841492, + "learning_rate": 1.9311060258741436e-05, + "loss": 0.4794, + "step": 2243 + }, + { + "epoch": 0.2730757529662306, + "grad_norm": 0.6071829199790955, + "learning_rate": 1.9310348568622218e-05, + "loss": 0.4597, + "step": 2244 + }, + { + "epoch": 0.2731974444782476, + "grad_norm": 0.7943944334983826, + "learning_rate": 1.9309636524224965e-05, + "loss": 0.461, + "step": 2245 + }, + { + "epoch": 0.2733191359902647, + "grad_norm": 0.5888009667396545, + "learning_rate": 1.930892412557677e-05, + "loss": 0.4669, + "step": 2246 + }, + { + "epoch": 0.2734408275022817, + "grad_norm": 4.058525085449219, + "learning_rate": 1.9308211372704745e-05, + "loss": 0.4134, + "step": 2247 + }, + { + "epoch": 0.27356251901429873, + "grad_norm": 0.9731010794639587, + "learning_rate": 1.9307498265636013e-05, + "loss": 0.4935, + "step": 2248 + }, + { + "epoch": 0.2736842105263158, + "grad_norm": 2.326507568359375, + "learning_rate": 1.9306784804397702e-05, + "loss": 0.4454, + "step": 2249 + }, + { + "epoch": 0.2738059020383328, + "grad_norm": 0.8813508749008179, + "learning_rate": 1.930607098901697e-05, + "loss": 0.5151, + "step": 2250 + }, + { + "epoch": 0.27392759355034985, + "grad_norm": 2.809396505355835, + "learning_rate": 1.930535681952097e-05, + "loss": 0.4301, + "step": 2251 + }, + { + "epoch": 0.2740492850623669, + "grad_norm": 1.8027896881103516, + "learning_rate": 1.9304642295936882e-05, + "loss": 0.5244, + "step": 2252 + }, + { + "epoch": 0.27417097657438394, + "grad_norm": 1.5659570693969727, + "learning_rate": 1.93039274182919e-05, + "loss": 0.4208, + "step": 2253 + }, + { + "epoch": 0.27429266808640096, + "grad_norm": 1.3783743381500244, + "learning_rate": 1.9303212186613218e-05, + "loss": 0.4895, + "step": 2254 + }, + { + "epoch": 0.27441435959841803, + "grad_norm": 4.133609294891357, + "learning_rate": 1.930249660092806e-05, + "loss": 0.5578, + "step": 2255 + }, + { + "epoch": 0.27453605111043505, + "grad_norm": 0.8889201283454895, + "learning_rate": 1.9301780661263647e-05, + "loss": 0.4732, + "step": 2256 + }, + { + "epoch": 0.2746577426224521, + "grad_norm": 2.5607032775878906, + "learning_rate": 1.9301064367647226e-05, + "loss": 0.5005, + "step": 2257 + }, + { + "epoch": 0.27477943413446915, + "grad_norm": 0.8294042348861694, + "learning_rate": 1.9300347720106054e-05, + "loss": 0.4443, + "step": 2258 + }, + { + "epoch": 0.27490112564648617, + "grad_norm": 2.740612030029297, + "learning_rate": 1.92996307186674e-05, + "loss": 0.413, + "step": 2259 + }, + { + "epoch": 0.2750228171585032, + "grad_norm": 2.467393636703491, + "learning_rate": 1.9298913363358552e-05, + "loss": 0.4236, + "step": 2260 + }, + { + "epoch": 0.2751445086705202, + "grad_norm": 0.8414928913116455, + "learning_rate": 1.92981956542068e-05, + "loss": 0.4725, + "step": 2261 + }, + { + "epoch": 0.2752662001825373, + "grad_norm": 0.662197470664978, + "learning_rate": 1.9297477591239456e-05, + "loss": 0.4877, + "step": 2262 + }, + { + "epoch": 0.2753878916945543, + "grad_norm": 2.279961109161377, + "learning_rate": 1.9296759174483846e-05, + "loss": 0.4785, + "step": 2263 + }, + { + "epoch": 0.2755095832065713, + "grad_norm": 0.7570948600769043, + "learning_rate": 1.9296040403967308e-05, + "loss": 0.4634, + "step": 2264 + }, + { + "epoch": 0.2756312747185884, + "grad_norm": 1.0017637014389038, + "learning_rate": 1.9295321279717187e-05, + "loss": 0.4686, + "step": 2265 + }, + { + "epoch": 0.2757529662306054, + "grad_norm": 0.9803701043128967, + "learning_rate": 1.9294601801760855e-05, + "loss": 0.4628, + "step": 2266 + }, + { + "epoch": 0.27587465774262243, + "grad_norm": 2.301069974899292, + "learning_rate": 1.9293881970125682e-05, + "loss": 0.4529, + "step": 2267 + }, + { + "epoch": 0.2759963492546395, + "grad_norm": 1.323573350906372, + "learning_rate": 1.9293161784839065e-05, + "loss": 0.448, + "step": 2268 + }, + { + "epoch": 0.2761180407666565, + "grad_norm": 1.5175703763961792, + "learning_rate": 1.9292441245928407e-05, + "loss": 0.4807, + "step": 2269 + }, + { + "epoch": 0.27623973227867354, + "grad_norm": 2.870685577392578, + "learning_rate": 1.9291720353421124e-05, + "loss": 0.4407, + "step": 2270 + }, + { + "epoch": 0.2763614237906906, + "grad_norm": 1.8498321771621704, + "learning_rate": 1.9290999107344647e-05, + "loss": 0.4528, + "step": 2271 + }, + { + "epoch": 0.27648311530270764, + "grad_norm": 2.5913422107696533, + "learning_rate": 1.9290277507726424e-05, + "loss": 0.4353, + "step": 2272 + }, + { + "epoch": 0.27660480681472466, + "grad_norm": 3.1198387145996094, + "learning_rate": 1.9289555554593914e-05, + "loss": 0.5134, + "step": 2273 + }, + { + "epoch": 0.27672649832674173, + "grad_norm": 1.7768486738204956, + "learning_rate": 1.928883324797459e-05, + "loss": 0.4734, + "step": 2274 + }, + { + "epoch": 0.27684818983875875, + "grad_norm": 1.2226992845535278, + "learning_rate": 1.9288110587895926e-05, + "loss": 0.4916, + "step": 2275 + }, + { + "epoch": 0.27696988135077577, + "grad_norm": 0.8427600264549255, + "learning_rate": 1.928738757438543e-05, + "loss": 0.4526, + "step": 2276 + }, + { + "epoch": 0.27709157286279285, + "grad_norm": 1.2037626504898071, + "learning_rate": 1.9286664207470618e-05, + "loss": 0.4697, + "step": 2277 + }, + { + "epoch": 0.27721326437480986, + "grad_norm": 4.869029521942139, + "learning_rate": 1.928594048717901e-05, + "loss": 0.5728, + "step": 2278 + }, + { + "epoch": 0.2773349558868269, + "grad_norm": 2.456289529800415, + "learning_rate": 1.928521641353814e-05, + "loss": 0.5, + "step": 2279 + }, + { + "epoch": 0.2774566473988439, + "grad_norm": 0.8228026032447815, + "learning_rate": 1.928449198657557e-05, + "loss": 0.4886, + "step": 2280 + }, + { + "epoch": 0.277578338910861, + "grad_norm": 0.7905141711235046, + "learning_rate": 1.9283767206318865e-05, + "loss": 0.4451, + "step": 2281 + }, + { + "epoch": 0.277700030422878, + "grad_norm": 0.5709277987480164, + "learning_rate": 1.9283042072795597e-05, + "loss": 0.4851, + "step": 2282 + }, + { + "epoch": 0.277821721934895, + "grad_norm": 0.796428382396698, + "learning_rate": 1.9282316586033362e-05, + "loss": 0.4954, + "step": 2283 + }, + { + "epoch": 0.2779434134469121, + "grad_norm": 0.6578648686408997, + "learning_rate": 1.928159074605977e-05, + "loss": 0.4905, + "step": 2284 + }, + { + "epoch": 0.2780651049589291, + "grad_norm": 1.1714102029800415, + "learning_rate": 1.928086455290244e-05, + "loss": 0.4908, + "step": 2285 + }, + { + "epoch": 0.27818679647094613, + "grad_norm": 0.7354956865310669, + "learning_rate": 1.9280138006589e-05, + "loss": 0.5065, + "step": 2286 + }, + { + "epoch": 0.2783084879829632, + "grad_norm": 0.6846218109130859, + "learning_rate": 1.9279411107147104e-05, + "loss": 0.5125, + "step": 2287 + }, + { + "epoch": 0.2784301794949802, + "grad_norm": 1.20828378200531, + "learning_rate": 1.9278683854604404e-05, + "loss": 0.5039, + "step": 2288 + }, + { + "epoch": 0.27855187100699724, + "grad_norm": 1.1589032411575317, + "learning_rate": 1.9277956248988578e-05, + "loss": 0.5256, + "step": 2289 + }, + { + "epoch": 0.2786735625190143, + "grad_norm": 2.371429443359375, + "learning_rate": 1.9277228290327318e-05, + "loss": 0.4737, + "step": 2290 + }, + { + "epoch": 0.27879525403103134, + "grad_norm": 2.052790880203247, + "learning_rate": 1.927649997864831e-05, + "loss": 0.4986, + "step": 2291 + }, + { + "epoch": 0.27891694554304836, + "grad_norm": 2.4020862579345703, + "learning_rate": 1.9275771313979284e-05, + "loss": 0.4932, + "step": 2292 + }, + { + "epoch": 0.27903863705506543, + "grad_norm": 1.0700608491897583, + "learning_rate": 1.9275042296347957e-05, + "loss": 0.5298, + "step": 2293 + }, + { + "epoch": 0.27916032856708245, + "grad_norm": 3.5983588695526123, + "learning_rate": 1.927431292578207e-05, + "loss": 0.4424, + "step": 2294 + }, + { + "epoch": 0.27928202007909947, + "grad_norm": 0.9342120885848999, + "learning_rate": 1.9273583202309382e-05, + "loss": 0.4334, + "step": 2295 + }, + { + "epoch": 0.27940371159111654, + "grad_norm": 2.4847373962402344, + "learning_rate": 1.927285312595766e-05, + "loss": 0.4921, + "step": 2296 + }, + { + "epoch": 0.27952540310313356, + "grad_norm": 1.8025132417678833, + "learning_rate": 1.9272122696754677e-05, + "loss": 0.4823, + "step": 2297 + }, + { + "epoch": 0.2796470946151506, + "grad_norm": 0.6341730356216431, + "learning_rate": 1.9271391914728236e-05, + "loss": 0.4618, + "step": 2298 + }, + { + "epoch": 0.27976878612716766, + "grad_norm": 0.7882016897201538, + "learning_rate": 1.9270660779906144e-05, + "loss": 0.4464, + "step": 2299 + }, + { + "epoch": 0.2798904776391847, + "grad_norm": 1.488471269607544, + "learning_rate": 1.926992929231622e-05, + "loss": 0.5159, + "step": 2300 + }, + { + "epoch": 0.2800121691512017, + "grad_norm": 2.271127700805664, + "learning_rate": 1.9269197451986295e-05, + "loss": 0.4994, + "step": 2301 + }, + { + "epoch": 0.2801338606632187, + "grad_norm": 2.7641971111297607, + "learning_rate": 1.926846525894422e-05, + "loss": 0.5357, + "step": 2302 + }, + { + "epoch": 0.2802555521752358, + "grad_norm": 1.1550077199935913, + "learning_rate": 1.9267732713217865e-05, + "loss": 0.5515, + "step": 2303 + }, + { + "epoch": 0.2803772436872528, + "grad_norm": 4.89072847366333, + "learning_rate": 1.926699981483509e-05, + "loss": 0.4561, + "step": 2304 + }, + { + "epoch": 0.2804989351992698, + "grad_norm": 5.637208938598633, + "learning_rate": 1.926626656382379e-05, + "loss": 0.4555, + "step": 2305 + }, + { + "epoch": 0.2806206267112869, + "grad_norm": 3.1426360607147217, + "learning_rate": 1.926553296021187e-05, + "loss": 0.4986, + "step": 2306 + }, + { + "epoch": 0.2807423182233039, + "grad_norm": 2.4551095962524414, + "learning_rate": 1.9264799004027245e-05, + "loss": 0.5442, + "step": 2307 + }, + { + "epoch": 0.28086400973532094, + "grad_norm": 5.031198501586914, + "learning_rate": 1.9264064695297836e-05, + "loss": 0.4493, + "step": 2308 + }, + { + "epoch": 0.280985701247338, + "grad_norm": 0.5657782554626465, + "learning_rate": 1.926333003405159e-05, + "loss": 0.491, + "step": 2309 + }, + { + "epoch": 0.28110739275935503, + "grad_norm": 0.5203744769096375, + "learning_rate": 1.9262595020316465e-05, + "loss": 0.5181, + "step": 2310 + }, + { + "epoch": 0.28122908427137205, + "grad_norm": 0.8510515093803406, + "learning_rate": 1.926185965412043e-05, + "loss": 0.4504, + "step": 2311 + }, + { + "epoch": 0.28135077578338913, + "grad_norm": 4.522860527038574, + "learning_rate": 1.9261123935491458e-05, + "loss": 0.5591, + "step": 2312 + }, + { + "epoch": 0.28147246729540615, + "grad_norm": 3.9935905933380127, + "learning_rate": 1.926038786445755e-05, + "loss": 0.5442, + "step": 2313 + }, + { + "epoch": 0.28159415880742317, + "grad_norm": 2.9965057373046875, + "learning_rate": 1.925965144104672e-05, + "loss": 0.5105, + "step": 2314 + }, + { + "epoch": 0.28171585031944024, + "grad_norm": 0.4628284275531769, + "learning_rate": 1.9258914665286983e-05, + "loss": 0.41, + "step": 2315 + }, + { + "epoch": 0.28183754183145726, + "grad_norm": 1.1634234189987183, + "learning_rate": 1.925817753720638e-05, + "loss": 0.4551, + "step": 2316 + }, + { + "epoch": 0.2819592333434743, + "grad_norm": 1.2035220861434937, + "learning_rate": 1.9257440056832955e-05, + "loss": 0.469, + "step": 2317 + }, + { + "epoch": 0.28208092485549136, + "grad_norm": 1.8352148532867432, + "learning_rate": 1.9256702224194778e-05, + "loss": 0.4468, + "step": 2318 + }, + { + "epoch": 0.2822026163675084, + "grad_norm": 0.6279745697975159, + "learning_rate": 1.925596403931992e-05, + "loss": 0.5322, + "step": 2319 + }, + { + "epoch": 0.2823243078795254, + "grad_norm": 4.472016334533691, + "learning_rate": 1.9255225502236473e-05, + "loss": 0.4378, + "step": 2320 + }, + { + "epoch": 0.2824459993915424, + "grad_norm": 4.093414783477783, + "learning_rate": 1.9254486612972538e-05, + "loss": 0.4489, + "step": 2321 + }, + { + "epoch": 0.2825676909035595, + "grad_norm": 2.149359703063965, + "learning_rate": 1.925374737155623e-05, + "loss": 0.4886, + "step": 2322 + }, + { + "epoch": 0.2826893824155765, + "grad_norm": 0.7144204378128052, + "learning_rate": 1.9253007778015682e-05, + "loss": 0.4277, + "step": 2323 + }, + { + "epoch": 0.2828110739275935, + "grad_norm": 1.666211485862732, + "learning_rate": 1.9252267832379035e-05, + "loss": 0.5334, + "step": 2324 + }, + { + "epoch": 0.2829327654396106, + "grad_norm": 2.787982940673828, + "learning_rate": 1.9251527534674444e-05, + "loss": 0.524, + "step": 2325 + }, + { + "epoch": 0.2830544569516276, + "grad_norm": 0.6031523942947388, + "learning_rate": 1.9250786884930083e-05, + "loss": 0.4115, + "step": 2326 + }, + { + "epoch": 0.28317614846364464, + "grad_norm": 1.570724606513977, + "learning_rate": 1.9250045883174132e-05, + "loss": 0.478, + "step": 2327 + }, + { + "epoch": 0.2832978399756617, + "grad_norm": 0.8946213126182556, + "learning_rate": 1.9249304529434786e-05, + "loss": 0.4306, + "step": 2328 + }, + { + "epoch": 0.28341953148767873, + "grad_norm": 0.9296962022781372, + "learning_rate": 1.9248562823740263e-05, + "loss": 0.4789, + "step": 2329 + }, + { + "epoch": 0.28354122299969575, + "grad_norm": 1.0657775402069092, + "learning_rate": 1.9247820766118773e-05, + "loss": 0.4362, + "step": 2330 + }, + { + "epoch": 0.2836629145117128, + "grad_norm": 2.9592397212982178, + "learning_rate": 1.9247078356598567e-05, + "loss": 0.4978, + "step": 2331 + }, + { + "epoch": 0.28378460602372985, + "grad_norm": 0.6225043535232544, + "learning_rate": 1.924633559520789e-05, + "loss": 0.446, + "step": 2332 + }, + { + "epoch": 0.28390629753574687, + "grad_norm": 1.293463945388794, + "learning_rate": 1.9245592481975004e-05, + "loss": 0.451, + "step": 2333 + }, + { + "epoch": 0.28402798904776394, + "grad_norm": 0.6287753582000732, + "learning_rate": 1.9244849016928184e-05, + "loss": 0.4997, + "step": 2334 + }, + { + "epoch": 0.28414968055978096, + "grad_norm": 1.572800636291504, + "learning_rate": 1.9244105200095723e-05, + "loss": 0.5002, + "step": 2335 + }, + { + "epoch": 0.284271372071798, + "grad_norm": 0.7218564748764038, + "learning_rate": 1.924336103150593e-05, + "loss": 0.4669, + "step": 2336 + }, + { + "epoch": 0.28439306358381505, + "grad_norm": 1.4338526725769043, + "learning_rate": 1.9242616511187113e-05, + "loss": 0.4811, + "step": 2337 + }, + { + "epoch": 0.2845147550958321, + "grad_norm": 1.0821044445037842, + "learning_rate": 1.9241871639167606e-05, + "loss": 0.4953, + "step": 2338 + }, + { + "epoch": 0.2846364466078491, + "grad_norm": 0.9856778979301453, + "learning_rate": 1.9241126415475755e-05, + "loss": 0.4809, + "step": 2339 + }, + { + "epoch": 0.28475813811986617, + "grad_norm": 0.9627021551132202, + "learning_rate": 1.9240380840139917e-05, + "loss": 0.5173, + "step": 2340 + }, + { + "epoch": 0.2848798296318832, + "grad_norm": 3.340620994567871, + "learning_rate": 1.9239634913188458e-05, + "loss": 0.488, + "step": 2341 + }, + { + "epoch": 0.2850015211439002, + "grad_norm": 4.829874515533447, + "learning_rate": 1.923888863464977e-05, + "loss": 0.4835, + "step": 2342 + }, + { + "epoch": 0.2851232126559172, + "grad_norm": 0.6045045852661133, + "learning_rate": 1.9238142004552243e-05, + "loss": 0.5201, + "step": 2343 + }, + { + "epoch": 0.2852449041679343, + "grad_norm": 0.9127470254898071, + "learning_rate": 1.923739502292429e-05, + "loss": 0.4976, + "step": 2344 + }, + { + "epoch": 0.2853665956799513, + "grad_norm": 3.985020875930786, + "learning_rate": 1.923664768979434e-05, + "loss": 0.4391, + "step": 2345 + }, + { + "epoch": 0.28548828719196834, + "grad_norm": 2.159661293029785, + "learning_rate": 1.923590000519082e-05, + "loss": 0.4728, + "step": 2346 + }, + { + "epoch": 0.2856099787039854, + "grad_norm": 0.770510733127594, + "learning_rate": 1.9235151969142194e-05, + "loss": 0.4209, + "step": 2347 + }, + { + "epoch": 0.28573167021600243, + "grad_norm": 3.129359483718872, + "learning_rate": 1.9234403581676917e-05, + "loss": 0.5098, + "step": 2348 + }, + { + "epoch": 0.28585336172801945, + "grad_norm": 3.219273805618286, + "learning_rate": 1.923365484282347e-05, + "loss": 0.5177, + "step": 2349 + }, + { + "epoch": 0.2859750532400365, + "grad_norm": 3.299837112426758, + "learning_rate": 1.9232905752610343e-05, + "loss": 0.5014, + "step": 2350 + }, + { + "epoch": 0.28609674475205354, + "grad_norm": 1.3421010971069336, + "learning_rate": 1.9232156311066045e-05, + "loss": 0.4848, + "step": 2351 + }, + { + "epoch": 0.28621843626407056, + "grad_norm": 1.5841246843338013, + "learning_rate": 1.9231406518219084e-05, + "loss": 0.5247, + "step": 2352 + }, + { + "epoch": 0.28634012777608764, + "grad_norm": 0.7412301301956177, + "learning_rate": 1.9230656374098e-05, + "loss": 0.4661, + "step": 2353 + }, + { + "epoch": 0.28646181928810466, + "grad_norm": 2.3621151447296143, + "learning_rate": 1.9229905878731333e-05, + "loss": 0.4927, + "step": 2354 + }, + { + "epoch": 0.2865835108001217, + "grad_norm": 3.8191757202148438, + "learning_rate": 1.9229155032147644e-05, + "loss": 0.4524, + "step": 2355 + }, + { + "epoch": 0.28670520231213875, + "grad_norm": 1.4366422891616821, + "learning_rate": 1.9228403834375502e-05, + "loss": 0.5542, + "step": 2356 + }, + { + "epoch": 0.28682689382415577, + "grad_norm": 1.9640687704086304, + "learning_rate": 1.9227652285443494e-05, + "loss": 0.485, + "step": 2357 + }, + { + "epoch": 0.2869485853361728, + "grad_norm": 1.7183088064193726, + "learning_rate": 1.9226900385380215e-05, + "loss": 0.5474, + "step": 2358 + }, + { + "epoch": 0.28707027684818986, + "grad_norm": 3.0479187965393066, + "learning_rate": 1.9226148134214275e-05, + "loss": 0.4701, + "step": 2359 + }, + { + "epoch": 0.2871919683602069, + "grad_norm": 1.2891998291015625, + "learning_rate": 1.9225395531974306e-05, + "loss": 0.4179, + "step": 2360 + }, + { + "epoch": 0.2873136598722239, + "grad_norm": 1.107069492340088, + "learning_rate": 1.9224642578688943e-05, + "loss": 0.4554, + "step": 2361 + }, + { + "epoch": 0.2874353513842409, + "grad_norm": 2.142453908920288, + "learning_rate": 1.922388927438683e-05, + "loss": 0.4842, + "step": 2362 + }, + { + "epoch": 0.287557042896258, + "grad_norm": 3.440673589706421, + "learning_rate": 1.9223135619096637e-05, + "loss": 0.4964, + "step": 2363 + }, + { + "epoch": 0.287678734408275, + "grad_norm": 3.190993070602417, + "learning_rate": 1.922238161284705e-05, + "loss": 0.4949, + "step": 2364 + }, + { + "epoch": 0.28780042592029204, + "grad_norm": 4.557056903839111, + "learning_rate": 1.9221627255666747e-05, + "loss": 0.4768, + "step": 2365 + }, + { + "epoch": 0.2879221174323091, + "grad_norm": 3.298712968826294, + "learning_rate": 1.9220872547584443e-05, + "loss": 0.4852, + "step": 2366 + }, + { + "epoch": 0.28804380894432613, + "grad_norm": 1.117448091506958, + "learning_rate": 1.922011748862885e-05, + "loss": 0.3987, + "step": 2367 + }, + { + "epoch": 0.28816550045634315, + "grad_norm": 2.4735140800476074, + "learning_rate": 1.92193620788287e-05, + "loss": 0.4845, + "step": 2368 + }, + { + "epoch": 0.2882871919683602, + "grad_norm": 2.5398001670837402, + "learning_rate": 1.9218606318212747e-05, + "loss": 0.4312, + "step": 2369 + }, + { + "epoch": 0.28840888348037724, + "grad_norm": 0.6697260141372681, + "learning_rate": 1.921785020680974e-05, + "loss": 0.4837, + "step": 2370 + }, + { + "epoch": 0.28853057499239426, + "grad_norm": 1.876666784286499, + "learning_rate": 1.9217093744648446e-05, + "loss": 0.4604, + "step": 2371 + }, + { + "epoch": 0.28865226650441134, + "grad_norm": 2.5812888145446777, + "learning_rate": 1.921633693175766e-05, + "loss": 0.4395, + "step": 2372 + }, + { + "epoch": 0.28877395801642836, + "grad_norm": 1.9372605085372925, + "learning_rate": 1.9215579768166183e-05, + "loss": 0.4555, + "step": 2373 + }, + { + "epoch": 0.2888956495284454, + "grad_norm": 0.7640296220779419, + "learning_rate": 1.9214822253902813e-05, + "loss": 0.4639, + "step": 2374 + }, + { + "epoch": 0.28901734104046245, + "grad_norm": 2.6883747577667236, + "learning_rate": 1.921406438899639e-05, + "loss": 0.4993, + "step": 2375 + }, + { + "epoch": 0.28913903255247947, + "grad_norm": 4.16229248046875, + "learning_rate": 1.921330617347574e-05, + "loss": 0.5439, + "step": 2376 + }, + { + "epoch": 0.2892607240644965, + "grad_norm": 0.6326306462287903, + "learning_rate": 1.921254760736972e-05, + "loss": 0.4194, + "step": 2377 + }, + { + "epoch": 0.28938241557651356, + "grad_norm": 1.1483418941497803, + "learning_rate": 1.9211788690707194e-05, + "loss": 0.5079, + "step": 2378 + }, + { + "epoch": 0.2895041070885306, + "grad_norm": 3.2198092937469482, + "learning_rate": 1.921102942351704e-05, + "loss": 0.5132, + "step": 2379 + }, + { + "epoch": 0.2896257986005476, + "grad_norm": 1.8041861057281494, + "learning_rate": 1.9210269805828152e-05, + "loss": 0.4677, + "step": 2380 + }, + { + "epoch": 0.2897474901125647, + "grad_norm": 1.8039244413375854, + "learning_rate": 1.9209509837669436e-05, + "loss": 0.4692, + "step": 2381 + }, + { + "epoch": 0.2898691816245817, + "grad_norm": 1.2893513441085815, + "learning_rate": 1.9208749519069805e-05, + "loss": 0.4675, + "step": 2382 + }, + { + "epoch": 0.2899908731365987, + "grad_norm": 2.565755605697632, + "learning_rate": 1.9207988850058193e-05, + "loss": 0.4988, + "step": 2383 + }, + { + "epoch": 0.29011256464861573, + "grad_norm": 0.6244593858718872, + "learning_rate": 1.920722783066355e-05, + "loss": 0.4613, + "step": 2384 + }, + { + "epoch": 0.2902342561606328, + "grad_norm": 3.351763963699341, + "learning_rate": 1.920646646091483e-05, + "loss": 0.5336, + "step": 2385 + }, + { + "epoch": 0.29035594767264983, + "grad_norm": 1.7007615566253662, + "learning_rate": 1.9205704740840998e-05, + "loss": 0.4341, + "step": 2386 + }, + { + "epoch": 0.29047763918466685, + "grad_norm": 1.734458088874817, + "learning_rate": 1.920494267047105e-05, + "loss": 0.5159, + "step": 2387 + }, + { + "epoch": 0.2905993306966839, + "grad_norm": 1.9868311882019043, + "learning_rate": 1.9204180249833984e-05, + "loss": 0.4485, + "step": 2388 + }, + { + "epoch": 0.29072102220870094, + "grad_norm": 1.497117519378662, + "learning_rate": 1.92034174789588e-05, + "loss": 0.4978, + "step": 2389 + }, + { + "epoch": 0.29084271372071796, + "grad_norm": 3.925015449523926, + "learning_rate": 1.920265435787454e-05, + "loss": 0.3847, + "step": 2390 + }, + { + "epoch": 0.29096440523273503, + "grad_norm": 1.5976030826568604, + "learning_rate": 1.9201890886610225e-05, + "loss": 0.4759, + "step": 2391 + }, + { + "epoch": 0.29108609674475205, + "grad_norm": 1.1353329420089722, + "learning_rate": 1.920112706519492e-05, + "loss": 0.4495, + "step": 2392 + }, + { + "epoch": 0.2912077882567691, + "grad_norm": 0.681307315826416, + "learning_rate": 1.9200362893657685e-05, + "loss": 0.4632, + "step": 2393 + }, + { + "epoch": 0.29132947976878615, + "grad_norm": 1.3313989639282227, + "learning_rate": 1.9199598372027593e-05, + "loss": 0.4714, + "step": 2394 + }, + { + "epoch": 0.29145117128080317, + "grad_norm": 1.5667229890823364, + "learning_rate": 1.9198833500333746e-05, + "loss": 0.4323, + "step": 2395 + }, + { + "epoch": 0.2915728627928202, + "grad_norm": 0.9350096583366394, + "learning_rate": 1.919806827860524e-05, + "loss": 0.4355, + "step": 2396 + }, + { + "epoch": 0.29169455430483726, + "grad_norm": 0.5832048654556274, + "learning_rate": 1.9197302706871202e-05, + "loss": 0.4359, + "step": 2397 + }, + { + "epoch": 0.2918162458168543, + "grad_norm": 3.143641471862793, + "learning_rate": 1.9196536785160755e-05, + "loss": 0.5044, + "step": 2398 + }, + { + "epoch": 0.2919379373288713, + "grad_norm": 3.0947747230529785, + "learning_rate": 1.919577051350305e-05, + "loss": 0.4606, + "step": 2399 + }, + { + "epoch": 0.2920596288408884, + "grad_norm": 1.3649258613586426, + "learning_rate": 1.919500389192724e-05, + "loss": 0.4519, + "step": 2400 + }, + { + "epoch": 0.2921813203529054, + "grad_norm": 3.5724737644195557, + "learning_rate": 1.91942369204625e-05, + "loss": 0.4938, + "step": 2401 + }, + { + "epoch": 0.2923030118649224, + "grad_norm": 0.7385405898094177, + "learning_rate": 1.9193469599138015e-05, + "loss": 0.4786, + "step": 2402 + }, + { + "epoch": 0.29242470337693943, + "grad_norm": 1.4629532098770142, + "learning_rate": 1.919270192798298e-05, + "loss": 0.4887, + "step": 2403 + }, + { + "epoch": 0.2925463948889565, + "grad_norm": 3.9168739318847656, + "learning_rate": 1.9191933907026612e-05, + "loss": 0.4213, + "step": 2404 + }, + { + "epoch": 0.2926680864009735, + "grad_norm": 2.0356836318969727, + "learning_rate": 1.9191165536298132e-05, + "loss": 0.5293, + "step": 2405 + }, + { + "epoch": 0.29278977791299055, + "grad_norm": 2.9639980792999268, + "learning_rate": 1.9190396815826777e-05, + "loss": 0.4895, + "step": 2406 + }, + { + "epoch": 0.2929114694250076, + "grad_norm": 2.15425968170166, + "learning_rate": 1.91896277456418e-05, + "loss": 0.4493, + "step": 2407 + }, + { + "epoch": 0.29303316093702464, + "grad_norm": 1.167723536491394, + "learning_rate": 1.9188858325772467e-05, + "loss": 0.5197, + "step": 2408 + }, + { + "epoch": 0.29315485244904166, + "grad_norm": 0.8349345922470093, + "learning_rate": 1.918808855624806e-05, + "loss": 0.4763, + "step": 2409 + }, + { + "epoch": 0.29327654396105873, + "grad_norm": 0.5412417650222778, + "learning_rate": 1.9187318437097855e-05, + "loss": 0.4501, + "step": 2410 + }, + { + "epoch": 0.29339823547307575, + "grad_norm": 2.312762975692749, + "learning_rate": 1.918654796835117e-05, + "loss": 0.4769, + "step": 2411 + }, + { + "epoch": 0.29351992698509277, + "grad_norm": 3.0606560707092285, + "learning_rate": 1.9185777150037323e-05, + "loss": 0.4928, + "step": 2412 + }, + { + "epoch": 0.29364161849710985, + "grad_norm": 0.9675942063331604, + "learning_rate": 1.918500598218564e-05, + "loss": 0.4834, + "step": 2413 + }, + { + "epoch": 0.29376331000912687, + "grad_norm": 0.9042759537696838, + "learning_rate": 1.918423446482547e-05, + "loss": 0.5189, + "step": 2414 + }, + { + "epoch": 0.2938850015211439, + "grad_norm": 1.7681832313537598, + "learning_rate": 1.9183462597986165e-05, + "loss": 0.4502, + "step": 2415 + }, + { + "epoch": 0.29400669303316096, + "grad_norm": 1.1707684993743896, + "learning_rate": 1.91826903816971e-05, + "loss": 0.5075, + "step": 2416 + }, + { + "epoch": 0.294128384545178, + "grad_norm": 5.285305023193359, + "learning_rate": 1.9181917815987658e-05, + "loss": 0.4428, + "step": 2417 + }, + { + "epoch": 0.294250076057195, + "grad_norm": 2.9431893825531006, + "learning_rate": 1.918114490088724e-05, + "loss": 0.5061, + "step": 2418 + }, + { + "epoch": 0.2943717675692121, + "grad_norm": 2.6012656688690186, + "learning_rate": 1.9180371636425253e-05, + "loss": 0.4953, + "step": 2419 + }, + { + "epoch": 0.2944934590812291, + "grad_norm": 1.6666074991226196, + "learning_rate": 1.9179598022631124e-05, + "loss": 0.4996, + "step": 2420 + }, + { + "epoch": 0.2946151505932461, + "grad_norm": 1.650451421737671, + "learning_rate": 1.917882405953429e-05, + "loss": 0.5031, + "step": 2421 + }, + { + "epoch": 0.29473684210526313, + "grad_norm": 1.5714176893234253, + "learning_rate": 1.9178049747164198e-05, + "loss": 0.5226, + "step": 2422 + }, + { + "epoch": 0.2948585336172802, + "grad_norm": 0.761840283870697, + "learning_rate": 1.9177275085550316e-05, + "loss": 0.4144, + "step": 2423 + }, + { + "epoch": 0.2949802251292972, + "grad_norm": 0.5682148933410645, + "learning_rate": 1.9176500074722125e-05, + "loss": 0.4423, + "step": 2424 + }, + { + "epoch": 0.29510191664131424, + "grad_norm": 0.9621097445487976, + "learning_rate": 1.917572471470911e-05, + "loss": 0.4437, + "step": 2425 + }, + { + "epoch": 0.2952236081533313, + "grad_norm": 1.0111867189407349, + "learning_rate": 1.9174949005540777e-05, + "loss": 0.4262, + "step": 2426 + }, + { + "epoch": 0.29534529966534834, + "grad_norm": 2.042614221572876, + "learning_rate": 1.9174172947246642e-05, + "loss": 0.4754, + "step": 2427 + }, + { + "epoch": 0.29546699117736536, + "grad_norm": 4.59519100189209, + "learning_rate": 1.9173396539856236e-05, + "loss": 0.553, + "step": 2428 + }, + { + "epoch": 0.29558868268938243, + "grad_norm": 1.9203051328659058, + "learning_rate": 1.9172619783399108e-05, + "loss": 0.4725, + "step": 2429 + }, + { + "epoch": 0.29571037420139945, + "grad_norm": 1.053370714187622, + "learning_rate": 1.9171842677904808e-05, + "loss": 0.4826, + "step": 2430 + }, + { + "epoch": 0.29583206571341647, + "grad_norm": 4.839434623718262, + "learning_rate": 1.9171065223402908e-05, + "loss": 0.4299, + "step": 2431 + }, + { + "epoch": 0.29595375722543354, + "grad_norm": 3.3164596557617188, + "learning_rate": 1.9170287419922993e-05, + "loss": 0.4689, + "step": 2432 + }, + { + "epoch": 0.29607544873745056, + "grad_norm": 3.1286797523498535, + "learning_rate": 1.9169509267494666e-05, + "loss": 0.481, + "step": 2433 + }, + { + "epoch": 0.2961971402494676, + "grad_norm": 2.202488660812378, + "learning_rate": 1.916873076614753e-05, + "loss": 0.5111, + "step": 2434 + }, + { + "epoch": 0.29631883176148466, + "grad_norm": 1.047048568725586, + "learning_rate": 1.9167951915911206e-05, + "loss": 0.5096, + "step": 2435 + }, + { + "epoch": 0.2964405232735017, + "grad_norm": 2.2479233741760254, + "learning_rate": 1.9167172716815335e-05, + "loss": 0.528, + "step": 2436 + }, + { + "epoch": 0.2965622147855187, + "grad_norm": 0.7420548796653748, + "learning_rate": 1.916639316888957e-05, + "loss": 0.4368, + "step": 2437 + }, + { + "epoch": 0.29668390629753577, + "grad_norm": 5.904504776000977, + "learning_rate": 1.916561327216357e-05, + "loss": 0.5687, + "step": 2438 + }, + { + "epoch": 0.2968055978095528, + "grad_norm": 2.3244457244873047, + "learning_rate": 1.9164833026667016e-05, + "loss": 0.4947, + "step": 2439 + }, + { + "epoch": 0.2969272893215698, + "grad_norm": 1.5631097555160522, + "learning_rate": 1.9164052432429588e-05, + "loss": 0.4252, + "step": 2440 + }, + { + "epoch": 0.2970489808335869, + "grad_norm": 2.7869882583618164, + "learning_rate": 1.9163271489481005e-05, + "loss": 0.5109, + "step": 2441 + }, + { + "epoch": 0.2971706723456039, + "grad_norm": 0.6710759997367859, + "learning_rate": 1.916249019785097e-05, + "loss": 0.4478, + "step": 2442 + }, + { + "epoch": 0.2972923638576209, + "grad_norm": 0.7273445725440979, + "learning_rate": 1.916170855756922e-05, + "loss": 0.434, + "step": 2443 + }, + { + "epoch": 0.29741405536963794, + "grad_norm": 1.0478951930999756, + "learning_rate": 1.9160926568665496e-05, + "loss": 0.4523, + "step": 2444 + }, + { + "epoch": 0.297535746881655, + "grad_norm": 1.6161104440689087, + "learning_rate": 1.9160144231169552e-05, + "loss": 0.4872, + "step": 2445 + }, + { + "epoch": 0.29765743839367204, + "grad_norm": 0.8364020586013794, + "learning_rate": 1.9159361545111158e-05, + "loss": 0.4355, + "step": 2446 + }, + { + "epoch": 0.29777912990568906, + "grad_norm": 0.6326636075973511, + "learning_rate": 1.9158578510520103e-05, + "loss": 0.4926, + "step": 2447 + }, + { + "epoch": 0.29790082141770613, + "grad_norm": 0.7602832913398743, + "learning_rate": 1.9157795127426174e-05, + "loss": 0.4765, + "step": 2448 + }, + { + "epoch": 0.29802251292972315, + "grad_norm": 2.0874526500701904, + "learning_rate": 1.9157011395859188e-05, + "loss": 0.4091, + "step": 2449 + }, + { + "epoch": 0.29814420444174017, + "grad_norm": 4.617832183837891, + "learning_rate": 1.9156227315848962e-05, + "loss": 0.6028, + "step": 2450 + }, + { + "epoch": 0.29826589595375724, + "grad_norm": 3.986806869506836, + "learning_rate": 1.9155442887425337e-05, + "loss": 0.557, + "step": 2451 + }, + { + "epoch": 0.29838758746577426, + "grad_norm": 1.9443796873092651, + "learning_rate": 1.9154658110618156e-05, + "loss": 0.4479, + "step": 2452 + }, + { + "epoch": 0.2985092789777913, + "grad_norm": 0.6482906341552734, + "learning_rate": 1.9153872985457287e-05, + "loss": 0.4684, + "step": 2453 + }, + { + "epoch": 0.29863097048980836, + "grad_norm": 0.6355880498886108, + "learning_rate": 1.9153087511972604e-05, + "loss": 0.4498, + "step": 2454 + }, + { + "epoch": 0.2987526620018254, + "grad_norm": 2.498286008834839, + "learning_rate": 1.915230169019399e-05, + "loss": 0.5195, + "step": 2455 + }, + { + "epoch": 0.2988743535138424, + "grad_norm": 1.5511423349380493, + "learning_rate": 1.915151552015136e-05, + "loss": 0.4857, + "step": 2456 + }, + { + "epoch": 0.29899604502585947, + "grad_norm": 0.9520419836044312, + "learning_rate": 1.9150729001874618e-05, + "loss": 0.4751, + "step": 2457 + }, + { + "epoch": 0.2991177365378765, + "grad_norm": 1.1225643157958984, + "learning_rate": 1.9149942135393696e-05, + "loss": 0.4656, + "step": 2458 + }, + { + "epoch": 0.2992394280498935, + "grad_norm": 1.2036497592926025, + "learning_rate": 1.914915492073854e-05, + "loss": 0.4623, + "step": 2459 + }, + { + "epoch": 0.2993611195619106, + "grad_norm": 2.4605398178100586, + "learning_rate": 1.91483673579391e-05, + "loss": 0.4863, + "step": 2460 + }, + { + "epoch": 0.2994828110739276, + "grad_norm": 0.9072971343994141, + "learning_rate": 1.9147579447025343e-05, + "loss": 0.4194, + "step": 2461 + }, + { + "epoch": 0.2996045025859446, + "grad_norm": 1.0341025590896606, + "learning_rate": 1.9146791188027257e-05, + "loss": 0.5028, + "step": 2462 + }, + { + "epoch": 0.29972619409796164, + "grad_norm": 2.1331140995025635, + "learning_rate": 1.914600258097483e-05, + "loss": 0.3947, + "step": 2463 + }, + { + "epoch": 0.2998478856099787, + "grad_norm": 1.425396203994751, + "learning_rate": 1.914521362589808e-05, + "loss": 0.4031, + "step": 2464 + }, + { + "epoch": 0.29996957712199573, + "grad_norm": 1.7659382820129395, + "learning_rate": 1.9144424322827016e-05, + "loss": 0.4646, + "step": 2465 + }, + { + "epoch": 0.30009126863401275, + "grad_norm": 0.8104053735733032, + "learning_rate": 1.914363467179168e-05, + "loss": 0.4547, + "step": 2466 + }, + { + "epoch": 0.30021296014602983, + "grad_norm": 1.4789149761199951, + "learning_rate": 1.914284467282212e-05, + "loss": 0.4649, + "step": 2467 + }, + { + "epoch": 0.30033465165804685, + "grad_norm": 0.697638988494873, + "learning_rate": 1.9142054325948393e-05, + "loss": 0.4429, + "step": 2468 + }, + { + "epoch": 0.30045634317006387, + "grad_norm": 1.612952709197998, + "learning_rate": 1.9141263631200578e-05, + "loss": 0.5509, + "step": 2469 + }, + { + "epoch": 0.30057803468208094, + "grad_norm": 1.3188133239746094, + "learning_rate": 1.914047258860876e-05, + "loss": 0.476, + "step": 2470 + }, + { + "epoch": 0.30069972619409796, + "grad_norm": 0.8661088347434998, + "learning_rate": 1.913968119820304e-05, + "loss": 0.5148, + "step": 2471 + }, + { + "epoch": 0.300821417706115, + "grad_norm": 1.7356548309326172, + "learning_rate": 1.913888946001353e-05, + "loss": 0.4534, + "step": 2472 + }, + { + "epoch": 0.30094310921813205, + "grad_norm": 2.5952534675598145, + "learning_rate": 1.9138097374070364e-05, + "loss": 0.4611, + "step": 2473 + }, + { + "epoch": 0.3010648007301491, + "grad_norm": 1.9293267726898193, + "learning_rate": 1.9137304940403678e-05, + "loss": 0.4856, + "step": 2474 + }, + { + "epoch": 0.3011864922421661, + "grad_norm": 0.8100680708885193, + "learning_rate": 1.913651215904362e-05, + "loss": 0.4947, + "step": 2475 + }, + { + "epoch": 0.30130818375418317, + "grad_norm": 1.1908873319625854, + "learning_rate": 1.9135719030020368e-05, + "loss": 0.4405, + "step": 2476 + }, + { + "epoch": 0.3014298752662002, + "grad_norm": 2.3379483222961426, + "learning_rate": 1.9134925553364097e-05, + "loss": 0.4851, + "step": 2477 + }, + { + "epoch": 0.3015515667782172, + "grad_norm": 2.4926347732543945, + "learning_rate": 1.9134131729104997e-05, + "loss": 0.507, + "step": 2478 + }, + { + "epoch": 0.3016732582902343, + "grad_norm": 1.1349623203277588, + "learning_rate": 1.9133337557273282e-05, + "loss": 0.4548, + "step": 2479 + }, + { + "epoch": 0.3017949498022513, + "grad_norm": 1.875536322593689, + "learning_rate": 1.9132543037899166e-05, + "loss": 0.4822, + "step": 2480 + }, + { + "epoch": 0.3019166413142683, + "grad_norm": 2.2871627807617188, + "learning_rate": 1.9131748171012882e-05, + "loss": 0.5135, + "step": 2481 + }, + { + "epoch": 0.3020383328262854, + "grad_norm": 1.4449636936187744, + "learning_rate": 1.913095295664468e-05, + "loss": 0.4997, + "step": 2482 + }, + { + "epoch": 0.3021600243383024, + "grad_norm": 2.2744860649108887, + "learning_rate": 1.913015739482482e-05, + "loss": 0.4831, + "step": 2483 + }, + { + "epoch": 0.30228171585031943, + "grad_norm": 0.9401661157608032, + "learning_rate": 1.9129361485583572e-05, + "loss": 0.4872, + "step": 2484 + }, + { + "epoch": 0.30240340736233645, + "grad_norm": 2.0501365661621094, + "learning_rate": 1.9128565228951223e-05, + "loss": 0.4214, + "step": 2485 + }, + { + "epoch": 0.3025250988743535, + "grad_norm": 3.1846559047698975, + "learning_rate": 1.912776862495807e-05, + "loss": 0.4215, + "step": 2486 + }, + { + "epoch": 0.30264679038637055, + "grad_norm": 2.1140296459198, + "learning_rate": 1.912697167363443e-05, + "loss": 0.5448, + "step": 2487 + }, + { + "epoch": 0.30276848189838756, + "grad_norm": 1.8529914617538452, + "learning_rate": 1.9126174375010622e-05, + "loss": 0.4128, + "step": 2488 + }, + { + "epoch": 0.30289017341040464, + "grad_norm": 3.9982242584228516, + "learning_rate": 1.9125376729116992e-05, + "loss": 0.4783, + "step": 2489 + }, + { + "epoch": 0.30301186492242166, + "grad_norm": 6.011218547821045, + "learning_rate": 1.912457873598389e-05, + "loss": 0.5774, + "step": 2490 + }, + { + "epoch": 0.3031335564344387, + "grad_norm": 1.1237566471099854, + "learning_rate": 1.9123780395641676e-05, + "loss": 0.4281, + "step": 2491 + }, + { + "epoch": 0.30325524794645575, + "grad_norm": 1.6968867778778076, + "learning_rate": 1.9122981708120734e-05, + "loss": 0.4728, + "step": 2492 + }, + { + "epoch": 0.30337693945847277, + "grad_norm": 0.6782594323158264, + "learning_rate": 1.9122182673451456e-05, + "loss": 0.407, + "step": 2493 + }, + { + "epoch": 0.3034986309704898, + "grad_norm": 1.179334044456482, + "learning_rate": 1.9121383291664245e-05, + "loss": 0.4575, + "step": 2494 + }, + { + "epoch": 0.30362032248250687, + "grad_norm": 0.8710147738456726, + "learning_rate": 1.9120583562789522e-05, + "loss": 0.4924, + "step": 2495 + }, + { + "epoch": 0.3037420139945239, + "grad_norm": 2.6657702922821045, + "learning_rate": 1.9119783486857712e-05, + "loss": 0.446, + "step": 2496 + }, + { + "epoch": 0.3038637055065409, + "grad_norm": 1.1198617219924927, + "learning_rate": 1.9118983063899266e-05, + "loss": 0.4392, + "step": 2497 + }, + { + "epoch": 0.303985397018558, + "grad_norm": 0.7784521579742432, + "learning_rate": 1.9118182293944638e-05, + "loss": 0.4623, + "step": 2498 + }, + { + "epoch": 0.304107088530575, + "grad_norm": 1.823186993598938, + "learning_rate": 1.9117381177024298e-05, + "loss": 0.4761, + "step": 2499 + }, + { + "epoch": 0.304228780042592, + "grad_norm": 1.5546553134918213, + "learning_rate": 1.9116579713168736e-05, + "loss": 0.4741, + "step": 2500 + }, + { + "epoch": 0.3043504715546091, + "grad_norm": 1.006614327430725, + "learning_rate": 1.9115777902408442e-05, + "loss": 0.4693, + "step": 2501 + }, + { + "epoch": 0.3044721630666261, + "grad_norm": 0.8246433734893799, + "learning_rate": 1.9114975744773937e-05, + "loss": 0.4552, + "step": 2502 + }, + { + "epoch": 0.30459385457864313, + "grad_norm": 0.7124652862548828, + "learning_rate": 1.9114173240295732e-05, + "loss": 0.4905, + "step": 2503 + }, + { + "epoch": 0.30471554609066015, + "grad_norm": 2.100367546081543, + "learning_rate": 1.9113370389004367e-05, + "loss": 0.5163, + "step": 2504 + }, + { + "epoch": 0.3048372376026772, + "grad_norm": 2.617018938064575, + "learning_rate": 1.9112567190930402e-05, + "loss": 0.428, + "step": 2505 + }, + { + "epoch": 0.30495892911469424, + "grad_norm": 2.650043249130249, + "learning_rate": 1.911176364610439e-05, + "loss": 0.4706, + "step": 2506 + }, + { + "epoch": 0.30508062062671126, + "grad_norm": 5.9851274490356445, + "learning_rate": 1.9110959754556914e-05, + "loss": 0.4172, + "step": 2507 + }, + { + "epoch": 0.30520231213872834, + "grad_norm": 0.9120343923568726, + "learning_rate": 1.9110155516318558e-05, + "loss": 0.4849, + "step": 2508 + }, + { + "epoch": 0.30532400365074536, + "grad_norm": 1.3405015468597412, + "learning_rate": 1.9109350931419926e-05, + "loss": 0.5099, + "step": 2509 + }, + { + "epoch": 0.3054456951627624, + "grad_norm": 0.6575984954833984, + "learning_rate": 1.910854599989164e-05, + "loss": 0.4623, + "step": 2510 + }, + { + "epoch": 0.30556738667477945, + "grad_norm": 0.6552987098693848, + "learning_rate": 1.910774072176432e-05, + "loss": 0.5027, + "step": 2511 + }, + { + "epoch": 0.30568907818679647, + "grad_norm": 0.8985508680343628, + "learning_rate": 1.9106935097068618e-05, + "loss": 0.4469, + "step": 2512 + }, + { + "epoch": 0.3058107696988135, + "grad_norm": 0.9726141095161438, + "learning_rate": 1.9106129125835183e-05, + "loss": 0.4518, + "step": 2513 + }, + { + "epoch": 0.30593246121083056, + "grad_norm": 0.8692079186439514, + "learning_rate": 1.9105322808094685e-05, + "loss": 0.433, + "step": 2514 + }, + { + "epoch": 0.3060541527228476, + "grad_norm": 2.1015286445617676, + "learning_rate": 1.910451614387781e-05, + "loss": 0.4818, + "step": 2515 + }, + { + "epoch": 0.3061758442348646, + "grad_norm": 0.8361689448356628, + "learning_rate": 1.910370913321525e-05, + "loss": 0.5331, + "step": 2516 + }, + { + "epoch": 0.3062975357468817, + "grad_norm": 1.3579707145690918, + "learning_rate": 1.910290177613771e-05, + "loss": 0.4276, + "step": 2517 + }, + { + "epoch": 0.3064192272588987, + "grad_norm": 0.7217264175415039, + "learning_rate": 1.910209407267592e-05, + "loss": 0.4899, + "step": 2518 + }, + { + "epoch": 0.3065409187709157, + "grad_norm": 1.9349385499954224, + "learning_rate": 1.910128602286061e-05, + "loss": 0.4995, + "step": 2519 + }, + { + "epoch": 0.3066626102829328, + "grad_norm": 0.823741614818573, + "learning_rate": 1.9100477626722525e-05, + "loss": 0.5206, + "step": 2520 + }, + { + "epoch": 0.3067843017949498, + "grad_norm": 3.0831916332244873, + "learning_rate": 1.909966888429243e-05, + "loss": 0.4678, + "step": 2521 + }, + { + "epoch": 0.30690599330696683, + "grad_norm": 4.734895706176758, + "learning_rate": 1.9098859795601102e-05, + "loss": 0.4239, + "step": 2522 + }, + { + "epoch": 0.3070276848189839, + "grad_norm": 2.184396982192993, + "learning_rate": 1.9098050360679323e-05, + "loss": 0.4342, + "step": 2523 + }, + { + "epoch": 0.3071493763310009, + "grad_norm": 1.0338777303695679, + "learning_rate": 1.9097240579557893e-05, + "loss": 0.4687, + "step": 2524 + }, + { + "epoch": 0.30727106784301794, + "grad_norm": 0.7237124443054199, + "learning_rate": 1.9096430452267633e-05, + "loss": 0.4232, + "step": 2525 + }, + { + "epoch": 0.30739275935503496, + "grad_norm": 0.7365623712539673, + "learning_rate": 1.9095619978839367e-05, + "loss": 0.4227, + "step": 2526 + }, + { + "epoch": 0.30751445086705204, + "grad_norm": 1.2416023015975952, + "learning_rate": 1.909480915930393e-05, + "loss": 0.4526, + "step": 2527 + }, + { + "epoch": 0.30763614237906906, + "grad_norm": 2.0551960468292236, + "learning_rate": 1.909399799369218e-05, + "loss": 0.5196, + "step": 2528 + }, + { + "epoch": 0.3077578338910861, + "grad_norm": 1.16765558719635, + "learning_rate": 1.9093186482034985e-05, + "loss": 0.4921, + "step": 2529 + }, + { + "epoch": 0.30787952540310315, + "grad_norm": 1.5175971984863281, + "learning_rate": 1.9092374624363218e-05, + "loss": 0.4539, + "step": 2530 + }, + { + "epoch": 0.30800121691512017, + "grad_norm": 1.5309423208236694, + "learning_rate": 1.909156242070778e-05, + "loss": 0.496, + "step": 2531 + }, + { + "epoch": 0.3081229084271372, + "grad_norm": 0.7406741976737976, + "learning_rate": 1.9090749871099573e-05, + "loss": 0.4751, + "step": 2532 + }, + { + "epoch": 0.30824459993915426, + "grad_norm": 0.831911027431488, + "learning_rate": 1.9089936975569516e-05, + "loss": 0.4986, + "step": 2533 + }, + { + "epoch": 0.3083662914511713, + "grad_norm": 0.7211702466011047, + "learning_rate": 1.9089123734148544e-05, + "loss": 0.489, + "step": 2534 + }, + { + "epoch": 0.3084879829631883, + "grad_norm": 1.2845163345336914, + "learning_rate": 1.9088310146867597e-05, + "loss": 0.4846, + "step": 2535 + }, + { + "epoch": 0.3086096744752054, + "grad_norm": 2.483842611312866, + "learning_rate": 1.9087496213757637e-05, + "loss": 0.4313, + "step": 2536 + }, + { + "epoch": 0.3087313659872224, + "grad_norm": 0.9204269051551819, + "learning_rate": 1.9086681934849636e-05, + "loss": 0.4669, + "step": 2537 + }, + { + "epoch": 0.3088530574992394, + "grad_norm": 1.0120172500610352, + "learning_rate": 1.9085867310174584e-05, + "loss": 0.497, + "step": 2538 + }, + { + "epoch": 0.3089747490112565, + "grad_norm": 1.2834872007369995, + "learning_rate": 1.908505233976347e-05, + "loss": 0.5095, + "step": 2539 + }, + { + "epoch": 0.3090964405232735, + "grad_norm": 1.1929011344909668, + "learning_rate": 1.908423702364731e-05, + "loss": 0.5056, + "step": 2540 + }, + { + "epoch": 0.3092181320352905, + "grad_norm": 0.6175735592842102, + "learning_rate": 1.908342136185713e-05, + "loss": 0.4391, + "step": 2541 + }, + { + "epoch": 0.3093398235473076, + "grad_norm": 2.8288943767547607, + "learning_rate": 1.9082605354423962e-05, + "loss": 0.3945, + "step": 2542 + }, + { + "epoch": 0.3094615150593246, + "grad_norm": 0.8661826848983765, + "learning_rate": 1.9081789001378862e-05, + "loss": 0.4429, + "step": 2543 + }, + { + "epoch": 0.30958320657134164, + "grad_norm": 0.6647040247917175, + "learning_rate": 1.9080972302752895e-05, + "loss": 0.4475, + "step": 2544 + }, + { + "epoch": 0.30970489808335866, + "grad_norm": 2.2310469150543213, + "learning_rate": 1.908015525857713e-05, + "loss": 0.504, + "step": 2545 + }, + { + "epoch": 0.30982658959537573, + "grad_norm": 2.417956590652466, + "learning_rate": 1.9079337868882667e-05, + "loss": 0.3962, + "step": 2546 + }, + { + "epoch": 0.30994828110739275, + "grad_norm": 1.634032130241394, + "learning_rate": 1.90785201337006e-05, + "loss": 0.447, + "step": 2547 + }, + { + "epoch": 0.3100699726194098, + "grad_norm": 0.724041223526001, + "learning_rate": 1.907770205306205e-05, + "loss": 0.4854, + "step": 2548 + }, + { + "epoch": 0.31019166413142685, + "grad_norm": 3.027937889099121, + "learning_rate": 1.9076883626998155e-05, + "loss": 0.5682, + "step": 2549 + }, + { + "epoch": 0.31031335564344387, + "grad_norm": 1.0180504322052002, + "learning_rate": 1.9076064855540045e-05, + "loss": 0.481, + "step": 2550 + }, + { + "epoch": 0.3104350471554609, + "grad_norm": 0.6276563405990601, + "learning_rate": 1.907524573871888e-05, + "loss": 0.498, + "step": 2551 + }, + { + "epoch": 0.31055673866747796, + "grad_norm": 1.1533331871032715, + "learning_rate": 1.907442627656583e-05, + "loss": 0.482, + "step": 2552 + }, + { + "epoch": 0.310678430179495, + "grad_norm": 0.6698744893074036, + "learning_rate": 1.907360646911208e-05, + "loss": 0.4765, + "step": 2553 + }, + { + "epoch": 0.310800121691512, + "grad_norm": 3.309514045715332, + "learning_rate": 1.907278631638882e-05, + "loss": 0.4083, + "step": 2554 + }, + { + "epoch": 0.3109218132035291, + "grad_norm": 0.6981541514396667, + "learning_rate": 1.9071965818427263e-05, + "loss": 0.4487, + "step": 2555 + }, + { + "epoch": 0.3110435047155461, + "grad_norm": 3.5589592456817627, + "learning_rate": 1.9071144975258626e-05, + "loss": 0.5174, + "step": 2556 + }, + { + "epoch": 0.3111651962275631, + "grad_norm": 3.27559232711792, + "learning_rate": 1.907032378691415e-05, + "loss": 0.4969, + "step": 2557 + }, + { + "epoch": 0.3112868877395802, + "grad_norm": 2.0862462520599365, + "learning_rate": 1.9069502253425075e-05, + "loss": 0.4246, + "step": 2558 + }, + { + "epoch": 0.3114085792515972, + "grad_norm": 0.8357444405555725, + "learning_rate": 1.906868037482267e-05, + "loss": 0.4468, + "step": 2559 + }, + { + "epoch": 0.3115302707636142, + "grad_norm": 4.043797492980957, + "learning_rate": 1.9067858151138202e-05, + "loss": 0.533, + "step": 2560 + }, + { + "epoch": 0.3116519622756313, + "grad_norm": 3.5708625316619873, + "learning_rate": 1.9067035582402966e-05, + "loss": 0.5395, + "step": 2561 + }, + { + "epoch": 0.3117736537876483, + "grad_norm": 1.6642167568206787, + "learning_rate": 1.9066212668648252e-05, + "loss": 0.4719, + "step": 2562 + }, + { + "epoch": 0.31189534529966534, + "grad_norm": 1.824846863746643, + "learning_rate": 1.9065389409905388e-05, + "loss": 0.504, + "step": 2563 + }, + { + "epoch": 0.31201703681168236, + "grad_norm": 1.5471199750900269, + "learning_rate": 1.906456580620569e-05, + "loss": 0.4801, + "step": 2564 + }, + { + "epoch": 0.31213872832369943, + "grad_norm": 3.82975697517395, + "learning_rate": 1.90637418575805e-05, + "loss": 0.4872, + "step": 2565 + }, + { + "epoch": 0.31226041983571645, + "grad_norm": 4.280481815338135, + "learning_rate": 1.906291756406117e-05, + "loss": 0.4568, + "step": 2566 + }, + { + "epoch": 0.31238211134773347, + "grad_norm": 3.213447332382202, + "learning_rate": 1.906209292567907e-05, + "loss": 0.5027, + "step": 2567 + }, + { + "epoch": 0.31250380285975055, + "grad_norm": 5.017328262329102, + "learning_rate": 1.906126794246557e-05, + "loss": 0.4715, + "step": 2568 + }, + { + "epoch": 0.31262549437176756, + "grad_norm": 2.812809467315674, + "learning_rate": 1.906044261445208e-05, + "loss": 0.4877, + "step": 2569 + }, + { + "epoch": 0.3127471858837846, + "grad_norm": 3.3021914958953857, + "learning_rate": 1.9059616941669984e-05, + "loss": 0.4812, + "step": 2570 + }, + { + "epoch": 0.31286887739580166, + "grad_norm": 0.8027843236923218, + "learning_rate": 1.9058790924150717e-05, + "loss": 0.4703, + "step": 2571 + }, + { + "epoch": 0.3129905689078187, + "grad_norm": 1.1845877170562744, + "learning_rate": 1.90579645619257e-05, + "loss": 0.4588, + "step": 2572 + }, + { + "epoch": 0.3131122604198357, + "grad_norm": 2.4159934520721436, + "learning_rate": 1.905713785502639e-05, + "loss": 0.4828, + "step": 2573 + }, + { + "epoch": 0.31323395193185277, + "grad_norm": 0.750566840171814, + "learning_rate": 1.9056310803484232e-05, + "loss": 0.4151, + "step": 2574 + }, + { + "epoch": 0.3133556434438698, + "grad_norm": 1.8776639699935913, + "learning_rate": 1.9055483407330705e-05, + "loss": 0.4723, + "step": 2575 + }, + { + "epoch": 0.3134773349558868, + "grad_norm": 5.200119972229004, + "learning_rate": 1.9054655666597287e-05, + "loss": 0.581, + "step": 2576 + }, + { + "epoch": 0.3135990264679039, + "grad_norm": 0.7368274331092834, + "learning_rate": 1.905382758131548e-05, + "loss": 0.463, + "step": 2577 + }, + { + "epoch": 0.3137207179799209, + "grad_norm": 2.442462921142578, + "learning_rate": 1.9052999151516792e-05, + "loss": 0.5122, + "step": 2578 + }, + { + "epoch": 0.3138424094919379, + "grad_norm": 0.9392907619476318, + "learning_rate": 1.9052170377232752e-05, + "loss": 0.4879, + "step": 2579 + }, + { + "epoch": 0.313964101003955, + "grad_norm": 2.1264443397521973, + "learning_rate": 1.905134125849489e-05, + "loss": 0.478, + "step": 2580 + }, + { + "epoch": 0.314085792515972, + "grad_norm": 1.442296028137207, + "learning_rate": 1.9050511795334756e-05, + "loss": 0.4788, + "step": 2581 + }, + { + "epoch": 0.31420748402798904, + "grad_norm": 4.7049946784973145, + "learning_rate": 1.9049681987783914e-05, + "loss": 0.4229, + "step": 2582 + }, + { + "epoch": 0.3143291755400061, + "grad_norm": 0.6403811573982239, + "learning_rate": 1.9048851835873945e-05, + "loss": 0.4954, + "step": 2583 + }, + { + "epoch": 0.31445086705202313, + "grad_norm": 0.6243477463722229, + "learning_rate": 1.9048021339636433e-05, + "loss": 0.48, + "step": 2584 + }, + { + "epoch": 0.31457255856404015, + "grad_norm": 1.9575669765472412, + "learning_rate": 1.9047190499102975e-05, + "loss": 0.4271, + "step": 2585 + }, + { + "epoch": 0.31469425007605717, + "grad_norm": 1.1199640035629272, + "learning_rate": 1.9046359314305198e-05, + "loss": 0.4391, + "step": 2586 + }, + { + "epoch": 0.31481594158807424, + "grad_norm": 3.7026619911193848, + "learning_rate": 1.904552778527472e-05, + "loss": 0.5125, + "step": 2587 + }, + { + "epoch": 0.31493763310009126, + "grad_norm": 3.1253371238708496, + "learning_rate": 1.904469591204319e-05, + "loss": 0.5123, + "step": 2588 + }, + { + "epoch": 0.3150593246121083, + "grad_norm": 2.278315782546997, + "learning_rate": 1.9043863694642258e-05, + "loss": 0.3863, + "step": 2589 + }, + { + "epoch": 0.31518101612412536, + "grad_norm": 2.6691672801971436, + "learning_rate": 1.9043031133103593e-05, + "loss": 0.4662, + "step": 2590 + }, + { + "epoch": 0.3153027076361424, + "grad_norm": 1.2145169973373413, + "learning_rate": 1.9042198227458873e-05, + "loss": 0.4847, + "step": 2591 + }, + { + "epoch": 0.3154243991481594, + "grad_norm": 2.419055223464966, + "learning_rate": 1.9041364977739792e-05, + "loss": 0.4087, + "step": 2592 + }, + { + "epoch": 0.31554609066017647, + "grad_norm": 1.7537059783935547, + "learning_rate": 1.9040531383978063e-05, + "loss": 0.5198, + "step": 2593 + }, + { + "epoch": 0.3156677821721935, + "grad_norm": 2.4825210571289062, + "learning_rate": 1.9039697446205405e-05, + "loss": 0.4276, + "step": 2594 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 2.5851564407348633, + "learning_rate": 1.9038863164453544e-05, + "loss": 0.4007, + "step": 2595 + }, + { + "epoch": 0.3159111651962276, + "grad_norm": 2.3171944618225098, + "learning_rate": 1.903802853875423e-05, + "loss": 0.4155, + "step": 2596 + }, + { + "epoch": 0.3160328567082446, + "grad_norm": 0.9399721622467041, + "learning_rate": 1.9037193569139224e-05, + "loss": 0.5118, + "step": 2597 + }, + { + "epoch": 0.3161545482202616, + "grad_norm": 3.951594114303589, + "learning_rate": 1.9036358255640298e-05, + "loss": 0.3415, + "step": 2598 + }, + { + "epoch": 0.3162762397322787, + "grad_norm": 0.9210025072097778, + "learning_rate": 1.903552259828923e-05, + "loss": 0.4397, + "step": 2599 + }, + { + "epoch": 0.3163979312442957, + "grad_norm": 3.0612332820892334, + "learning_rate": 1.9034686597117832e-05, + "loss": 0.496, + "step": 2600 + }, + { + "epoch": 0.31651962275631274, + "grad_norm": 1.8251466751098633, + "learning_rate": 1.90338502521579e-05, + "loss": 0.4521, + "step": 2601 + }, + { + "epoch": 0.3166413142683298, + "grad_norm": 0.791189432144165, + "learning_rate": 1.9033013563441276e-05, + "loss": 0.4273, + "step": 2602 + }, + { + "epoch": 0.31676300578034683, + "grad_norm": 1.7600021362304688, + "learning_rate": 1.9032176530999782e-05, + "loss": 0.4843, + "step": 2603 + }, + { + "epoch": 0.31688469729236385, + "grad_norm": 2.6301116943359375, + "learning_rate": 1.903133915486528e-05, + "loss": 0.4079, + "step": 2604 + }, + { + "epoch": 0.31700638880438087, + "grad_norm": 2.031949758529663, + "learning_rate": 1.9030501435069626e-05, + "loss": 0.4813, + "step": 2605 + }, + { + "epoch": 0.31712808031639794, + "grad_norm": 3.4081430435180664, + "learning_rate": 1.9029663371644698e-05, + "loss": 0.4335, + "step": 2606 + }, + { + "epoch": 0.31724977182841496, + "grad_norm": 0.8446583151817322, + "learning_rate": 1.9028824964622393e-05, + "loss": 0.4635, + "step": 2607 + }, + { + "epoch": 0.317371463340432, + "grad_norm": 2.818190574645996, + "learning_rate": 1.902798621403461e-05, + "loss": 0.4097, + "step": 2608 + }, + { + "epoch": 0.31749315485244906, + "grad_norm": 0.8132585287094116, + "learning_rate": 1.9027147119913263e-05, + "loss": 0.472, + "step": 2609 + }, + { + "epoch": 0.3176148463644661, + "grad_norm": 1.3215408325195312, + "learning_rate": 1.9026307682290284e-05, + "loss": 0.4712, + "step": 2610 + }, + { + "epoch": 0.3177365378764831, + "grad_norm": 1.2061406373977661, + "learning_rate": 1.902546790119761e-05, + "loss": 0.4666, + "step": 2611 + }, + { + "epoch": 0.31785822938850017, + "grad_norm": 1.0162789821624756, + "learning_rate": 1.9024627776667204e-05, + "loss": 0.4524, + "step": 2612 + }, + { + "epoch": 0.3179799209005172, + "grad_norm": 0.5247098803520203, + "learning_rate": 1.902378730873103e-05, + "loss": 0.4736, + "step": 2613 + }, + { + "epoch": 0.3181016124125342, + "grad_norm": 2.5346930027008057, + "learning_rate": 1.9022946497421074e-05, + "loss": 0.5081, + "step": 2614 + }, + { + "epoch": 0.3182233039245513, + "grad_norm": 1.1095057725906372, + "learning_rate": 1.9022105342769323e-05, + "loss": 0.4631, + "step": 2615 + }, + { + "epoch": 0.3183449954365683, + "grad_norm": 3.6375324726104736, + "learning_rate": 1.902126384480779e-05, + "loss": 0.413, + "step": 2616 + }, + { + "epoch": 0.3184666869485853, + "grad_norm": 0.7454293966293335, + "learning_rate": 1.9020422003568498e-05, + "loss": 0.4436, + "step": 2617 + }, + { + "epoch": 0.3185883784606024, + "grad_norm": 0.5880517363548279, + "learning_rate": 1.9019579819083475e-05, + "loss": 0.4692, + "step": 2618 + }, + { + "epoch": 0.3187100699726194, + "grad_norm": 1.3258553743362427, + "learning_rate": 1.901873729138477e-05, + "loss": 0.4984, + "step": 2619 + }, + { + "epoch": 0.31883176148463643, + "grad_norm": 0.8498595356941223, + "learning_rate": 1.9017894420504445e-05, + "loss": 0.4221, + "step": 2620 + }, + { + "epoch": 0.3189534529966535, + "grad_norm": 1.8798936605453491, + "learning_rate": 1.9017051206474568e-05, + "loss": 0.512, + "step": 2621 + }, + { + "epoch": 0.3190751445086705, + "grad_norm": 2.344435214996338, + "learning_rate": 1.901620764932723e-05, + "loss": 0.4559, + "step": 2622 + }, + { + "epoch": 0.31919683602068755, + "grad_norm": 0.6877545118331909, + "learning_rate": 1.901536374909453e-05, + "loss": 0.5253, + "step": 2623 + }, + { + "epoch": 0.3193185275327046, + "grad_norm": 0.9242064356803894, + "learning_rate": 1.9014519505808575e-05, + "loss": 0.504, + "step": 2624 + }, + { + "epoch": 0.31944021904472164, + "grad_norm": 1.3465564250946045, + "learning_rate": 1.9013674919501498e-05, + "loss": 0.4877, + "step": 2625 + }, + { + "epoch": 0.31956191055673866, + "grad_norm": 2.5336482524871826, + "learning_rate": 1.901282999020543e-05, + "loss": 0.4596, + "step": 2626 + }, + { + "epoch": 0.3196836020687557, + "grad_norm": 3.471236228942871, + "learning_rate": 1.901198471795253e-05, + "loss": 0.4903, + "step": 2627 + }, + { + "epoch": 0.31980529358077275, + "grad_norm": 3.1795148849487305, + "learning_rate": 1.9011139102774948e-05, + "loss": 0.4349, + "step": 2628 + }, + { + "epoch": 0.3199269850927898, + "grad_norm": 3.48786997795105, + "learning_rate": 1.901029314470488e-05, + "loss": 0.4243, + "step": 2629 + }, + { + "epoch": 0.3200486766048068, + "grad_norm": 1.9860074520111084, + "learning_rate": 1.9009446843774503e-05, + "loss": 0.5187, + "step": 2630 + }, + { + "epoch": 0.32017036811682387, + "grad_norm": 1.605178713798523, + "learning_rate": 1.900860020001603e-05, + "loss": 0.4712, + "step": 2631 + }, + { + "epoch": 0.3202920596288409, + "grad_norm": 1.0413576364517212, + "learning_rate": 1.9007753213461667e-05, + "loss": 0.5031, + "step": 2632 + }, + { + "epoch": 0.3204137511408579, + "grad_norm": 2.5105197429656982, + "learning_rate": 1.9006905884143652e-05, + "loss": 0.5191, + "step": 2633 + }, + { + "epoch": 0.320535442652875, + "grad_norm": 2.3670494556427, + "learning_rate": 1.9006058212094224e-05, + "loss": 0.456, + "step": 2634 + }, + { + "epoch": 0.320657134164892, + "grad_norm": 3.461168050765991, + "learning_rate": 1.900521019734564e-05, + "loss": 0.4252, + "step": 2635 + }, + { + "epoch": 0.320778825676909, + "grad_norm": 1.7053115367889404, + "learning_rate": 1.9004361839930167e-05, + "loss": 0.4593, + "step": 2636 + }, + { + "epoch": 0.3209005171889261, + "grad_norm": 1.234446406364441, + "learning_rate": 1.900351313988009e-05, + "loss": 0.4533, + "step": 2637 + }, + { + "epoch": 0.3210222087009431, + "grad_norm": 0.5966386795043945, + "learning_rate": 1.9002664097227702e-05, + "loss": 0.4492, + "step": 2638 + }, + { + "epoch": 0.32114390021296013, + "grad_norm": 1.6105024814605713, + "learning_rate": 1.9001814712005307e-05, + "loss": 0.409, + "step": 2639 + }, + { + "epoch": 0.3212655917249772, + "grad_norm": 2.2848470211029053, + "learning_rate": 1.9000964984245235e-05, + "loss": 0.4825, + "step": 2640 + }, + { + "epoch": 0.3213872832369942, + "grad_norm": 3.3710803985595703, + "learning_rate": 1.9000114913979812e-05, + "loss": 0.5075, + "step": 2641 + }, + { + "epoch": 0.32150897474901124, + "grad_norm": 3.251838207244873, + "learning_rate": 1.8999264501241383e-05, + "loss": 0.4821, + "step": 2642 + }, + { + "epoch": 0.3216306662610283, + "grad_norm": 1.7791250944137573, + "learning_rate": 1.899841374606232e-05, + "loss": 0.4193, + "step": 2643 + }, + { + "epoch": 0.32175235777304534, + "grad_norm": 3.4943878650665283, + "learning_rate": 1.8997562648474982e-05, + "loss": 0.4736, + "step": 2644 + }, + { + "epoch": 0.32187404928506236, + "grad_norm": 1.01828932762146, + "learning_rate": 1.8996711208511763e-05, + "loss": 0.4264, + "step": 2645 + }, + { + "epoch": 0.3219957407970794, + "grad_norm": 0.7106894850730896, + "learning_rate": 1.899585942620506e-05, + "loss": 0.4713, + "step": 2646 + }, + { + "epoch": 0.32211743230909645, + "grad_norm": 0.8335497379302979, + "learning_rate": 1.8995007301587286e-05, + "loss": 0.4871, + "step": 2647 + }, + { + "epoch": 0.32223912382111347, + "grad_norm": 1.4388506412506104, + "learning_rate": 1.8994154834690867e-05, + "loss": 0.4841, + "step": 2648 + }, + { + "epoch": 0.3223608153331305, + "grad_norm": 3.9006192684173584, + "learning_rate": 1.899330202554824e-05, + "loss": 0.442, + "step": 2649 + }, + { + "epoch": 0.32248250684514757, + "grad_norm": 4.2960100173950195, + "learning_rate": 1.8992448874191852e-05, + "loss": 0.4606, + "step": 2650 + }, + { + "epoch": 0.3226041983571646, + "grad_norm": 3.8702080249786377, + "learning_rate": 1.8991595380654173e-05, + "loss": 0.4501, + "step": 2651 + }, + { + "epoch": 0.3227258898691816, + "grad_norm": 0.8250051736831665, + "learning_rate": 1.8990741544967683e-05, + "loss": 0.5075, + "step": 2652 + }, + { + "epoch": 0.3228475813811987, + "grad_norm": 1.2080780267715454, + "learning_rate": 1.898988736716486e-05, + "loss": 0.4404, + "step": 2653 + }, + { + "epoch": 0.3229692728932157, + "grad_norm": 0.9255439043045044, + "learning_rate": 1.8989032847278216e-05, + "loss": 0.3903, + "step": 2654 + }, + { + "epoch": 0.3230909644052327, + "grad_norm": 1.1625902652740479, + "learning_rate": 1.8988177985340267e-05, + "loss": 0.4181, + "step": 2655 + }, + { + "epoch": 0.3232126559172498, + "grad_norm": 1.6690421104431152, + "learning_rate": 1.898732278138354e-05, + "loss": 0.4357, + "step": 2656 + }, + { + "epoch": 0.3233343474292668, + "grad_norm": 4.66319465637207, + "learning_rate": 1.8986467235440585e-05, + "loss": 0.5556, + "step": 2657 + }, + { + "epoch": 0.32345603894128383, + "grad_norm": 3.319766044616699, + "learning_rate": 1.8985611347543944e-05, + "loss": 0.4759, + "step": 2658 + }, + { + "epoch": 0.3235777304533009, + "grad_norm": 4.456297874450684, + "learning_rate": 1.8984755117726194e-05, + "loss": 0.5651, + "step": 2659 + }, + { + "epoch": 0.3236994219653179, + "grad_norm": 1.5082676410675049, + "learning_rate": 1.8983898546019916e-05, + "loss": 0.4919, + "step": 2660 + }, + { + "epoch": 0.32382111347733494, + "grad_norm": 1.5878376960754395, + "learning_rate": 1.89830416324577e-05, + "loss": 0.5402, + "step": 2661 + }, + { + "epoch": 0.323942804989352, + "grad_norm": 1.2165762186050415, + "learning_rate": 1.8982184377072155e-05, + "loss": 0.5297, + "step": 2662 + }, + { + "epoch": 0.32406449650136904, + "grad_norm": 3.985576629638672, + "learning_rate": 1.8981326779895905e-05, + "loss": 0.5141, + "step": 2663 + }, + { + "epoch": 0.32418618801338606, + "grad_norm": 6.296304702758789, + "learning_rate": 1.898046884096158e-05, + "loss": 0.5078, + "step": 2664 + }, + { + "epoch": 0.3243078795254031, + "grad_norm": 4.510111331939697, + "learning_rate": 1.8979610560301826e-05, + "loss": 0.4696, + "step": 2665 + }, + { + "epoch": 0.32442957103742015, + "grad_norm": 6.386861324310303, + "learning_rate": 1.8978751937949304e-05, + "loss": 0.4972, + "step": 2666 + }, + { + "epoch": 0.32455126254943717, + "grad_norm": 4.193247318267822, + "learning_rate": 1.8977892973936685e-05, + "loss": 0.4859, + "step": 2667 + }, + { + "epoch": 0.3246729540614542, + "grad_norm": 1.83122718334198, + "learning_rate": 1.8977033668296658e-05, + "loss": 0.5128, + "step": 2668 + }, + { + "epoch": 0.32479464557347126, + "grad_norm": 1.1672346591949463, + "learning_rate": 1.8976174021061917e-05, + "loss": 0.463, + "step": 2669 + }, + { + "epoch": 0.3249163370854883, + "grad_norm": 0.722151517868042, + "learning_rate": 1.8975314032265175e-05, + "loss": 0.4263, + "step": 2670 + }, + { + "epoch": 0.3250380285975053, + "grad_norm": 0.6169853210449219, + "learning_rate": 1.8974453701939155e-05, + "loss": 0.4185, + "step": 2671 + }, + { + "epoch": 0.3251597201095224, + "grad_norm": 7.150989055633545, + "learning_rate": 1.8973593030116597e-05, + "loss": 0.5815, + "step": 2672 + }, + { + "epoch": 0.3252814116215394, + "grad_norm": 1.5084476470947266, + "learning_rate": 1.897273201683025e-05, + "loss": 0.4425, + "step": 2673 + }, + { + "epoch": 0.3254031031335564, + "grad_norm": 2.0572752952575684, + "learning_rate": 1.8971870662112873e-05, + "loss": 0.4273, + "step": 2674 + }, + { + "epoch": 0.3255247946455735, + "grad_norm": 3.8607330322265625, + "learning_rate": 1.897100896599725e-05, + "loss": 0.5148, + "step": 2675 + }, + { + "epoch": 0.3256464861575905, + "grad_norm": 4.759674072265625, + "learning_rate": 1.8970146928516167e-05, + "loss": 0.5774, + "step": 2676 + }, + { + "epoch": 0.32576817766960753, + "grad_norm": 3.9468443393707275, + "learning_rate": 1.8969284549702425e-05, + "loss": 0.5379, + "step": 2677 + }, + { + "epoch": 0.3258898691816246, + "grad_norm": 3.6373515129089355, + "learning_rate": 1.8968421829588843e-05, + "loss": 0.5789, + "step": 2678 + }, + { + "epoch": 0.3260115606936416, + "grad_norm": 1.2568577527999878, + "learning_rate": 1.8967558768208244e-05, + "loss": 0.4792, + "step": 2679 + }, + { + "epoch": 0.32613325220565864, + "grad_norm": 3.928135633468628, + "learning_rate": 1.8966695365593476e-05, + "loss": 0.4718, + "step": 2680 + }, + { + "epoch": 0.3262549437176757, + "grad_norm": 2.526728868484497, + "learning_rate": 1.8965831621777385e-05, + "loss": 0.534, + "step": 2681 + }, + { + "epoch": 0.32637663522969274, + "grad_norm": 4.106503009796143, + "learning_rate": 1.8964967536792845e-05, + "loss": 0.5144, + "step": 2682 + }, + { + "epoch": 0.32649832674170975, + "grad_norm": 1.0128921270370483, + "learning_rate": 1.8964103110672734e-05, + "loss": 0.4953, + "step": 2683 + }, + { + "epoch": 0.32662001825372683, + "grad_norm": 0.894976794719696, + "learning_rate": 1.8963238343449945e-05, + "loss": 0.5, + "step": 2684 + }, + { + "epoch": 0.32674170976574385, + "grad_norm": 2.012657642364502, + "learning_rate": 1.8962373235157382e-05, + "loss": 0.4423, + "step": 2685 + }, + { + "epoch": 0.32686340127776087, + "grad_norm": 1.444948673248291, + "learning_rate": 1.8961507785827966e-05, + "loss": 0.4272, + "step": 2686 + }, + { + "epoch": 0.3269850927897779, + "grad_norm": 3.2959814071655273, + "learning_rate": 1.896064199549463e-05, + "loss": 0.5427, + "step": 2687 + }, + { + "epoch": 0.32710678430179496, + "grad_norm": 4.652558326721191, + "learning_rate": 1.895977586419032e-05, + "loss": 0.5288, + "step": 2688 + }, + { + "epoch": 0.327228475813812, + "grad_norm": 0.7047215104103088, + "learning_rate": 1.895890939194799e-05, + "loss": 0.3965, + "step": 2689 + }, + { + "epoch": 0.327350167325829, + "grad_norm": 3.409531354904175, + "learning_rate": 1.8958042578800614e-05, + "loss": 0.4796, + "step": 2690 + }, + { + "epoch": 0.3274718588378461, + "grad_norm": 1.6422313451766968, + "learning_rate": 1.895717542478118e-05, + "loss": 0.4466, + "step": 2691 + }, + { + "epoch": 0.3275935503498631, + "grad_norm": 5.620854377746582, + "learning_rate": 1.8956307929922676e-05, + "loss": 0.5593, + "step": 2692 + }, + { + "epoch": 0.3277152418618801, + "grad_norm": 1.3282305002212524, + "learning_rate": 1.895544009425812e-05, + "loss": 0.46, + "step": 2693 + }, + { + "epoch": 0.3278369333738972, + "grad_norm": 1.2437347173690796, + "learning_rate": 1.895457191782053e-05, + "loss": 0.4019, + "step": 2694 + }, + { + "epoch": 0.3279586248859142, + "grad_norm": 1.669395923614502, + "learning_rate": 1.8953703400642945e-05, + "loss": 0.485, + "step": 2695 + }, + { + "epoch": 0.3280803163979312, + "grad_norm": 1.0959354639053345, + "learning_rate": 1.8952834542758413e-05, + "loss": 0.452, + "step": 2696 + }, + { + "epoch": 0.3282020079099483, + "grad_norm": 2.905395984649658, + "learning_rate": 1.8951965344199995e-05, + "loss": 0.463, + "step": 2697 + }, + { + "epoch": 0.3283236994219653, + "grad_norm": 2.0541398525238037, + "learning_rate": 1.8951095805000762e-05, + "loss": 0.4591, + "step": 2698 + }, + { + "epoch": 0.32844539093398234, + "grad_norm": 0.7949023246765137, + "learning_rate": 1.895022592519381e-05, + "loss": 0.5016, + "step": 2699 + }, + { + "epoch": 0.3285670824459994, + "grad_norm": 0.7792330980300903, + "learning_rate": 1.8949355704812235e-05, + "loss": 0.4759, + "step": 2700 + }, + { + "epoch": 0.32868877395801643, + "grad_norm": 1.6458280086517334, + "learning_rate": 1.894848514388915e-05, + "loss": 0.4482, + "step": 2701 + }, + { + "epoch": 0.32881046547003345, + "grad_norm": 2.4719221591949463, + "learning_rate": 1.8947614242457685e-05, + "loss": 0.5209, + "step": 2702 + }, + { + "epoch": 0.3289321569820505, + "grad_norm": 3.014310359954834, + "learning_rate": 1.8946743000550975e-05, + "loss": 0.4965, + "step": 2703 + }, + { + "epoch": 0.32905384849406755, + "grad_norm": 0.6068018674850464, + "learning_rate": 1.8945871418202174e-05, + "loss": 0.4677, + "step": 2704 + }, + { + "epoch": 0.32917554000608457, + "grad_norm": 1.910712480545044, + "learning_rate": 1.894499949544445e-05, + "loss": 0.5107, + "step": 2705 + }, + { + "epoch": 0.3292972315181016, + "grad_norm": 1.9104607105255127, + "learning_rate": 1.8944127232310984e-05, + "loss": 0.4692, + "step": 2706 + }, + { + "epoch": 0.32941892303011866, + "grad_norm": 2.0082781314849854, + "learning_rate": 1.894325462883496e-05, + "loss": 0.4773, + "step": 2707 + }, + { + "epoch": 0.3295406145421357, + "grad_norm": 0.5868907570838928, + "learning_rate": 1.8942381685049587e-05, + "loss": 0.4841, + "step": 2708 + }, + { + "epoch": 0.3296623060541527, + "grad_norm": 0.6337372660636902, + "learning_rate": 1.894150840098808e-05, + "loss": 0.4778, + "step": 2709 + }, + { + "epoch": 0.3297839975661698, + "grad_norm": 2.131854772567749, + "learning_rate": 1.8940634776683672e-05, + "loss": 0.435, + "step": 2710 + }, + { + "epoch": 0.3299056890781868, + "grad_norm": 3.417280673980713, + "learning_rate": 1.89397608121696e-05, + "loss": 0.5719, + "step": 2711 + }, + { + "epoch": 0.3300273805902038, + "grad_norm": 0.8599411845207214, + "learning_rate": 1.893888650747913e-05, + "loss": 0.4387, + "step": 2712 + }, + { + "epoch": 0.3301490721022209, + "grad_norm": 2.0451738834381104, + "learning_rate": 1.8938011862645527e-05, + "loss": 0.4744, + "step": 2713 + }, + { + "epoch": 0.3302707636142379, + "grad_norm": 1.1521553993225098, + "learning_rate": 1.8937136877702066e-05, + "loss": 0.4606, + "step": 2714 + }, + { + "epoch": 0.3303924551262549, + "grad_norm": 0.6685805916786194, + "learning_rate": 1.8936261552682053e-05, + "loss": 0.4721, + "step": 2715 + }, + { + "epoch": 0.330514146638272, + "grad_norm": 0.9258437752723694, + "learning_rate": 1.893538588761879e-05, + "loss": 0.4562, + "step": 2716 + }, + { + "epoch": 0.330635838150289, + "grad_norm": 0.9971997141838074, + "learning_rate": 1.8934509882545597e-05, + "loss": 0.4747, + "step": 2717 + }, + { + "epoch": 0.33075752966230604, + "grad_norm": 2.043272018432617, + "learning_rate": 1.893363353749581e-05, + "loss": 0.5159, + "step": 2718 + }, + { + "epoch": 0.3308792211743231, + "grad_norm": 0.5416118502616882, + "learning_rate": 1.8932756852502777e-05, + "loss": 0.5034, + "step": 2719 + }, + { + "epoch": 0.33100091268634013, + "grad_norm": 1.4942349195480347, + "learning_rate": 1.8931879827599853e-05, + "loss": 0.5068, + "step": 2720 + }, + { + "epoch": 0.33112260419835715, + "grad_norm": 1.0185729265213013, + "learning_rate": 1.8931002462820416e-05, + "loss": 0.4905, + "step": 2721 + }, + { + "epoch": 0.3312442957103742, + "grad_norm": 1.0013389587402344, + "learning_rate": 1.893012475819785e-05, + "loss": 0.5051, + "step": 2722 + }, + { + "epoch": 0.33136598722239125, + "grad_norm": 0.8700234889984131, + "learning_rate": 1.8929246713765548e-05, + "loss": 0.4942, + "step": 2723 + }, + { + "epoch": 0.33148767873440826, + "grad_norm": 1.5754849910736084, + "learning_rate": 1.892836832955693e-05, + "loss": 0.4665, + "step": 2724 + }, + { + "epoch": 0.33160937024642534, + "grad_norm": 2.8131306171417236, + "learning_rate": 1.8927489605605414e-05, + "loss": 0.458, + "step": 2725 + }, + { + "epoch": 0.33173106175844236, + "grad_norm": 1.2453330755233765, + "learning_rate": 1.8926610541944443e-05, + "loss": 0.4675, + "step": 2726 + }, + { + "epoch": 0.3318527532704594, + "grad_norm": 0.7671390771865845, + "learning_rate": 1.892573113860746e-05, + "loss": 0.4825, + "step": 2727 + }, + { + "epoch": 0.3319744447824764, + "grad_norm": 1.2482337951660156, + "learning_rate": 1.8924851395627932e-05, + "loss": 0.4569, + "step": 2728 + }, + { + "epoch": 0.33209613629449347, + "grad_norm": 0.6935709714889526, + "learning_rate": 1.8923971313039336e-05, + "loss": 0.4748, + "step": 2729 + }, + { + "epoch": 0.3322178278065105, + "grad_norm": 0.9674332141876221, + "learning_rate": 1.8923090890875164e-05, + "loss": 0.506, + "step": 2730 + }, + { + "epoch": 0.3323395193185275, + "grad_norm": 1.2981398105621338, + "learning_rate": 1.892221012916891e-05, + "loss": 0.4117, + "step": 2731 + }, + { + "epoch": 0.3324612108305446, + "grad_norm": 0.8491005897521973, + "learning_rate": 1.892132902795409e-05, + "loss": 0.474, + "step": 2732 + }, + { + "epoch": 0.3325829023425616, + "grad_norm": 0.6901066899299622, + "learning_rate": 1.892044758726424e-05, + "loss": 0.4595, + "step": 2733 + }, + { + "epoch": 0.3327045938545786, + "grad_norm": 0.8076638579368591, + "learning_rate": 1.8919565807132893e-05, + "loss": 0.4713, + "step": 2734 + }, + { + "epoch": 0.3328262853665957, + "grad_norm": 1.3747620582580566, + "learning_rate": 1.89186836875936e-05, + "loss": 0.5038, + "step": 2735 + }, + { + "epoch": 0.3329479768786127, + "grad_norm": 0.7989272475242615, + "learning_rate": 1.891780122867994e-05, + "loss": 0.4633, + "step": 2736 + }, + { + "epoch": 0.33306966839062974, + "grad_norm": 0.6159539222717285, + "learning_rate": 1.891691843042548e-05, + "loss": 0.5099, + "step": 2737 + }, + { + "epoch": 0.3331913599026468, + "grad_norm": 4.381326198577881, + "learning_rate": 1.8916035292863816e-05, + "loss": 0.4413, + "step": 2738 + }, + { + "epoch": 0.33331305141466383, + "grad_norm": 1.493550181388855, + "learning_rate": 1.8915151816028554e-05, + "loss": 0.4773, + "step": 2739 + }, + { + "epoch": 0.33343474292668085, + "grad_norm": 2.293708086013794, + "learning_rate": 1.8914267999953314e-05, + "loss": 0.4657, + "step": 2740 + }, + { + "epoch": 0.3335564344386979, + "grad_norm": 1.616192102432251, + "learning_rate": 1.8913383844671722e-05, + "loss": 0.4863, + "step": 2741 + }, + { + "epoch": 0.33367812595071494, + "grad_norm": 1.026611566543579, + "learning_rate": 1.8912499350217427e-05, + "loss": 0.4974, + "step": 2742 + }, + { + "epoch": 0.33379981746273196, + "grad_norm": 1.9575178623199463, + "learning_rate": 1.8911614516624085e-05, + "loss": 0.5133, + "step": 2743 + }, + { + "epoch": 0.33392150897474904, + "grad_norm": 2.692417860031128, + "learning_rate": 1.8910729343925366e-05, + "loss": 0.5383, + "step": 2744 + }, + { + "epoch": 0.33404320048676606, + "grad_norm": 0.7036192417144775, + "learning_rate": 1.8909843832154946e-05, + "loss": 0.4751, + "step": 2745 + }, + { + "epoch": 0.3341648919987831, + "grad_norm": 1.0749365091323853, + "learning_rate": 1.890895798134653e-05, + "loss": 0.4912, + "step": 2746 + }, + { + "epoch": 0.3342865835108001, + "grad_norm": 2.491318702697754, + "learning_rate": 1.890807179153382e-05, + "loss": 0.4296, + "step": 2747 + }, + { + "epoch": 0.33440827502281717, + "grad_norm": 2.35545015335083, + "learning_rate": 1.890718526275054e-05, + "loss": 0.4132, + "step": 2748 + }, + { + "epoch": 0.3345299665348342, + "grad_norm": 0.6383217573165894, + "learning_rate": 1.8906298395030424e-05, + "loss": 0.4687, + "step": 2749 + }, + { + "epoch": 0.3346516580468512, + "grad_norm": 2.1951515674591064, + "learning_rate": 1.8905411188407223e-05, + "loss": 0.4882, + "step": 2750 + }, + { + "epoch": 0.3347733495588683, + "grad_norm": 1.6575039625167847, + "learning_rate": 1.8904523642914688e-05, + "loss": 0.493, + "step": 2751 + }, + { + "epoch": 0.3348950410708853, + "grad_norm": 1.6144286394119263, + "learning_rate": 1.89036357585866e-05, + "loss": 0.4798, + "step": 2752 + }, + { + "epoch": 0.3350167325829023, + "grad_norm": 2.8362350463867188, + "learning_rate": 1.890274753545674e-05, + "loss": 0.5066, + "step": 2753 + }, + { + "epoch": 0.3351384240949194, + "grad_norm": 4.57258939743042, + "learning_rate": 1.8901858973558908e-05, + "loss": 0.3973, + "step": 2754 + }, + { + "epoch": 0.3352601156069364, + "grad_norm": 0.7866657376289368, + "learning_rate": 1.890097007292692e-05, + "loss": 0.5257, + "step": 2755 + }, + { + "epoch": 0.33538180711895343, + "grad_norm": 0.8888288736343384, + "learning_rate": 1.890008083359459e-05, + "loss": 0.5212, + "step": 2756 + }, + { + "epoch": 0.3355034986309705, + "grad_norm": 2.7362277507781982, + "learning_rate": 1.8899191255595765e-05, + "loss": 0.4664, + "step": 2757 + }, + { + "epoch": 0.33562519014298753, + "grad_norm": 2.118499994277954, + "learning_rate": 1.8898301338964294e-05, + "loss": 0.5035, + "step": 2758 + }, + { + "epoch": 0.33574688165500455, + "grad_norm": 0.7798293232917786, + "learning_rate": 1.889741108373404e-05, + "loss": 0.4654, + "step": 2759 + }, + { + "epoch": 0.3358685731670216, + "grad_norm": 1.1053814888000488, + "learning_rate": 1.8896520489938875e-05, + "loss": 0.4615, + "step": 2760 + }, + { + "epoch": 0.33599026467903864, + "grad_norm": 1.2729030847549438, + "learning_rate": 1.889562955761269e-05, + "loss": 0.4872, + "step": 2761 + }, + { + "epoch": 0.33611195619105566, + "grad_norm": 0.782295286655426, + "learning_rate": 1.8894738286789392e-05, + "loss": 0.4995, + "step": 2762 + }, + { + "epoch": 0.33623364770307274, + "grad_norm": 0.6598064303398132, + "learning_rate": 1.889384667750289e-05, + "loss": 0.418, + "step": 2763 + }, + { + "epoch": 0.33635533921508975, + "grad_norm": 3.087890863418579, + "learning_rate": 1.889295472978711e-05, + "loss": 0.5344, + "step": 2764 + }, + { + "epoch": 0.3364770307271068, + "grad_norm": 0.7532784938812256, + "learning_rate": 1.8892062443675994e-05, + "loss": 0.4529, + "step": 2765 + }, + { + "epoch": 0.33659872223912385, + "grad_norm": 2.8095479011535645, + "learning_rate": 1.8891169819203495e-05, + "loss": 0.5191, + "step": 2766 + }, + { + "epoch": 0.33672041375114087, + "grad_norm": 1.919663429260254, + "learning_rate": 1.8890276856403587e-05, + "loss": 0.5213, + "step": 2767 + }, + { + "epoch": 0.3368421052631579, + "grad_norm": 4.026738166809082, + "learning_rate": 1.888938355531024e-05, + "loss": 0.445, + "step": 2768 + }, + { + "epoch": 0.3369637967751749, + "grad_norm": 2.9974539279937744, + "learning_rate": 1.8888489915957445e-05, + "loss": 0.4558, + "step": 2769 + }, + { + "epoch": 0.337085488287192, + "grad_norm": 4.151406288146973, + "learning_rate": 1.8887595938379215e-05, + "loss": 0.4215, + "step": 2770 + }, + { + "epoch": 0.337207179799209, + "grad_norm": 1.079087495803833, + "learning_rate": 1.888670162260956e-05, + "loss": 0.4717, + "step": 2771 + }, + { + "epoch": 0.337328871311226, + "grad_norm": 1.5083805322647095, + "learning_rate": 1.888580696868252e-05, + "loss": 0.461, + "step": 2772 + }, + { + "epoch": 0.3374505628232431, + "grad_norm": 1.3275517225265503, + "learning_rate": 1.888491197663213e-05, + "loss": 0.4833, + "step": 2773 + }, + { + "epoch": 0.3375722543352601, + "grad_norm": 4.457827091217041, + "learning_rate": 1.8884016646492443e-05, + "loss": 0.5247, + "step": 2774 + }, + { + "epoch": 0.33769394584727713, + "grad_norm": 1.0470054149627686, + "learning_rate": 1.8883120978297538e-05, + "loss": 0.4699, + "step": 2775 + }, + { + "epoch": 0.3378156373592942, + "grad_norm": 3.0794103145599365, + "learning_rate": 1.888222497208149e-05, + "loss": 0.5117, + "step": 2776 + }, + { + "epoch": 0.3379373288713112, + "grad_norm": 1.5797334909439087, + "learning_rate": 1.88813286278784e-05, + "loss": 0.5184, + "step": 2777 + }, + { + "epoch": 0.33805902038332825, + "grad_norm": 0.5934585332870483, + "learning_rate": 1.888043194572237e-05, + "loss": 0.4778, + "step": 2778 + }, + { + "epoch": 0.3381807118953453, + "grad_norm": 0.6023896336555481, + "learning_rate": 1.8879534925647526e-05, + "loss": 0.5243, + "step": 2779 + }, + { + "epoch": 0.33830240340736234, + "grad_norm": 3.0522515773773193, + "learning_rate": 1.8878637567687997e-05, + "loss": 0.4728, + "step": 2780 + }, + { + "epoch": 0.33842409491937936, + "grad_norm": 2.841778516769409, + "learning_rate": 1.887773987187793e-05, + "loss": 0.4514, + "step": 2781 + }, + { + "epoch": 0.33854578643139643, + "grad_norm": 1.7480906248092651, + "learning_rate": 1.8876841838251483e-05, + "loss": 0.4862, + "step": 2782 + }, + { + "epoch": 0.33866747794341345, + "grad_norm": 1.4940919876098633, + "learning_rate": 1.8875943466842833e-05, + "loss": 0.5177, + "step": 2783 + }, + { + "epoch": 0.33878916945543047, + "grad_norm": 2.3407397270202637, + "learning_rate": 1.8875044757686162e-05, + "loss": 0.5175, + "step": 2784 + }, + { + "epoch": 0.33891086096744755, + "grad_norm": 1.7706379890441895, + "learning_rate": 1.8874145710815666e-05, + "loss": 0.4893, + "step": 2785 + }, + { + "epoch": 0.33903255247946457, + "grad_norm": 1.0196385383605957, + "learning_rate": 1.8873246326265558e-05, + "loss": 0.4712, + "step": 2786 + }, + { + "epoch": 0.3391542439914816, + "grad_norm": 0.5567321181297302, + "learning_rate": 1.887234660407006e-05, + "loss": 0.4105, + "step": 2787 + }, + { + "epoch": 0.3392759355034986, + "grad_norm": 3.6515402793884277, + "learning_rate": 1.887144654426341e-05, + "loss": 0.5096, + "step": 2788 + }, + { + "epoch": 0.3393976270155157, + "grad_norm": 2.8964643478393555, + "learning_rate": 1.8870546146879854e-05, + "loss": 0.5019, + "step": 2789 + }, + { + "epoch": 0.3395193185275327, + "grad_norm": 4.397404193878174, + "learning_rate": 1.886964541195366e-05, + "loss": 0.5012, + "step": 2790 + }, + { + "epoch": 0.3396410100395497, + "grad_norm": 3.022035837173462, + "learning_rate": 1.8868744339519094e-05, + "loss": 0.5045, + "step": 2791 + }, + { + "epoch": 0.3397627015515668, + "grad_norm": 0.6503461003303528, + "learning_rate": 1.8867842929610454e-05, + "loss": 0.4702, + "step": 2792 + }, + { + "epoch": 0.3398843930635838, + "grad_norm": 1.7951877117156982, + "learning_rate": 1.886694118226203e-05, + "loss": 0.5322, + "step": 2793 + }, + { + "epoch": 0.34000608457560083, + "grad_norm": 1.2949520349502563, + "learning_rate": 1.8866039097508145e-05, + "loss": 0.4826, + "step": 2794 + }, + { + "epoch": 0.3401277760876179, + "grad_norm": 4.235204219818115, + "learning_rate": 1.8865136675383118e-05, + "loss": 0.4421, + "step": 2795 + }, + { + "epoch": 0.3402494675996349, + "grad_norm": 2.6653316020965576, + "learning_rate": 1.886423391592129e-05, + "loss": 0.4239, + "step": 2796 + }, + { + "epoch": 0.34037115911165194, + "grad_norm": 5.5826263427734375, + "learning_rate": 1.8863330819157017e-05, + "loss": 0.4413, + "step": 2797 + }, + { + "epoch": 0.340492850623669, + "grad_norm": 2.973708152770996, + "learning_rate": 1.8862427385124656e-05, + "loss": 0.4527, + "step": 2798 + }, + { + "epoch": 0.34061454213568604, + "grad_norm": 3.3249263763427734, + "learning_rate": 1.8861523613858592e-05, + "loss": 0.4075, + "step": 2799 + }, + { + "epoch": 0.34073623364770306, + "grad_norm": 0.6413004398345947, + "learning_rate": 1.8860619505393208e-05, + "loss": 0.4784, + "step": 2800 + }, + { + "epoch": 0.34085792515972013, + "grad_norm": 3.2820169925689697, + "learning_rate": 1.8859715059762914e-05, + "loss": 0.5052, + "step": 2801 + }, + { + "epoch": 0.34097961667173715, + "grad_norm": 3.1755149364471436, + "learning_rate": 1.8858810277002126e-05, + "loss": 0.5167, + "step": 2802 + }, + { + "epoch": 0.34110130818375417, + "grad_norm": 2.0215020179748535, + "learning_rate": 1.8857905157145267e-05, + "loss": 0.5109, + "step": 2803 + }, + { + "epoch": 0.34122299969577125, + "grad_norm": 3.8230559825897217, + "learning_rate": 1.8856999700226782e-05, + "loss": 0.5325, + "step": 2804 + }, + { + "epoch": 0.34134469120778826, + "grad_norm": 0.9028104543685913, + "learning_rate": 1.8856093906281128e-05, + "loss": 0.4747, + "step": 2805 + }, + { + "epoch": 0.3414663827198053, + "grad_norm": 1.2290637493133545, + "learning_rate": 1.8855187775342765e-05, + "loss": 0.4252, + "step": 2806 + }, + { + "epoch": 0.3415880742318223, + "grad_norm": 1.887423038482666, + "learning_rate": 1.885428130744618e-05, + "loss": 0.54, + "step": 2807 + }, + { + "epoch": 0.3417097657438394, + "grad_norm": 0.7684519290924072, + "learning_rate": 1.8853374502625868e-05, + "loss": 0.5034, + "step": 2808 + }, + { + "epoch": 0.3418314572558564, + "grad_norm": 3.2190868854522705, + "learning_rate": 1.8852467360916327e-05, + "loss": 0.4629, + "step": 2809 + }, + { + "epoch": 0.3419531487678734, + "grad_norm": 0.6683231592178345, + "learning_rate": 1.885155988235208e-05, + "loss": 0.5333, + "step": 2810 + }, + { + "epoch": 0.3420748402798905, + "grad_norm": 1.9942041635513306, + "learning_rate": 1.8850652066967656e-05, + "loss": 0.5002, + "step": 2811 + }, + { + "epoch": 0.3421965317919075, + "grad_norm": 2.207380533218384, + "learning_rate": 1.8849743914797603e-05, + "loss": 0.5059, + "step": 2812 + }, + { + "epoch": 0.34231822330392453, + "grad_norm": 1.8366376161575317, + "learning_rate": 1.8848835425876474e-05, + "loss": 0.5013, + "step": 2813 + }, + { + "epoch": 0.3424399148159416, + "grad_norm": 2.434509515762329, + "learning_rate": 1.8847926600238843e-05, + "loss": 0.4347, + "step": 2814 + }, + { + "epoch": 0.3425616063279586, + "grad_norm": 1.484734296798706, + "learning_rate": 1.884701743791929e-05, + "loss": 0.4256, + "step": 2815 + }, + { + "epoch": 0.34268329783997564, + "grad_norm": 1.5371334552764893, + "learning_rate": 1.8846107938952412e-05, + "loss": 0.4507, + "step": 2816 + }, + { + "epoch": 0.3428049893519927, + "grad_norm": 4.77773904800415, + "learning_rate": 1.8845198103372814e-05, + "loss": 0.5031, + "step": 2817 + }, + { + "epoch": 0.34292668086400974, + "grad_norm": 4.3006157875061035, + "learning_rate": 1.8844287931215123e-05, + "loss": 0.5207, + "step": 2818 + }, + { + "epoch": 0.34304837237602676, + "grad_norm": 0.9419856667518616, + "learning_rate": 1.884337742251397e-05, + "loss": 0.3967, + "step": 2819 + }, + { + "epoch": 0.34317006388804383, + "grad_norm": 4.042980670928955, + "learning_rate": 1.8842466577303997e-05, + "loss": 0.5164, + "step": 2820 + }, + { + "epoch": 0.34329175540006085, + "grad_norm": 1.0231975317001343, + "learning_rate": 1.884155539561987e-05, + "loss": 0.4028, + "step": 2821 + }, + { + "epoch": 0.34341344691207787, + "grad_norm": 3.537090301513672, + "learning_rate": 1.8840643877496258e-05, + "loss": 0.4964, + "step": 2822 + }, + { + "epoch": 0.34353513842409494, + "grad_norm": 2.2879130840301514, + "learning_rate": 1.883973202296785e-05, + "loss": 0.5221, + "step": 2823 + }, + { + "epoch": 0.34365682993611196, + "grad_norm": 1.6462324857711792, + "learning_rate": 1.883881983206934e-05, + "loss": 0.482, + "step": 2824 + }, + { + "epoch": 0.343778521448129, + "grad_norm": 2.8886613845825195, + "learning_rate": 1.883790730483544e-05, + "loss": 0.482, + "step": 2825 + }, + { + "epoch": 0.34390021296014606, + "grad_norm": 1.7604490518569946, + "learning_rate": 1.8836994441300875e-05, + "loss": 0.5086, + "step": 2826 + }, + { + "epoch": 0.3440219044721631, + "grad_norm": 3.2546560764312744, + "learning_rate": 1.8836081241500378e-05, + "loss": 0.5067, + "step": 2827 + }, + { + "epoch": 0.3441435959841801, + "grad_norm": 3.459789991378784, + "learning_rate": 1.88351677054687e-05, + "loss": 0.4771, + "step": 2828 + }, + { + "epoch": 0.3442652874961971, + "grad_norm": 2.373236894607544, + "learning_rate": 1.8834253833240603e-05, + "loss": 0.4621, + "step": 2829 + }, + { + "epoch": 0.3443869790082142, + "grad_norm": 1.322063684463501, + "learning_rate": 1.8833339624850864e-05, + "loss": 0.5272, + "step": 2830 + }, + { + "epoch": 0.3445086705202312, + "grad_norm": 1.5012385845184326, + "learning_rate": 1.8832425080334266e-05, + "loss": 0.411, + "step": 2831 + }, + { + "epoch": 0.3446303620322482, + "grad_norm": 1.3234150409698486, + "learning_rate": 1.883151019972561e-05, + "loss": 0.4698, + "step": 2832 + }, + { + "epoch": 0.3447520535442653, + "grad_norm": 2.324329137802124, + "learning_rate": 1.8830594983059712e-05, + "loss": 0.4496, + "step": 2833 + }, + { + "epoch": 0.3448737450562823, + "grad_norm": 1.3122886419296265, + "learning_rate": 1.88296794303714e-05, + "loss": 0.4643, + "step": 2834 + }, + { + "epoch": 0.34499543656829934, + "grad_norm": 1.234619140625, + "learning_rate": 1.8828763541695507e-05, + "loss": 0.4623, + "step": 2835 + }, + { + "epoch": 0.3451171280803164, + "grad_norm": 3.562305450439453, + "learning_rate": 1.8827847317066887e-05, + "loss": 0.5686, + "step": 2836 + }, + { + "epoch": 0.34523881959233343, + "grad_norm": 2.6911988258361816, + "learning_rate": 1.8826930756520402e-05, + "loss": 0.5208, + "step": 2837 + }, + { + "epoch": 0.34536051110435045, + "grad_norm": 1.0738780498504639, + "learning_rate": 1.882601386009093e-05, + "loss": 0.5003, + "step": 2838 + }, + { + "epoch": 0.34548220261636753, + "grad_norm": 2.8758702278137207, + "learning_rate": 1.8825096627813366e-05, + "loss": 0.5024, + "step": 2839 + }, + { + "epoch": 0.34560389412838455, + "grad_norm": 2.8085744380950928, + "learning_rate": 1.8824179059722607e-05, + "loss": 0.4672, + "step": 2840 + }, + { + "epoch": 0.34572558564040157, + "grad_norm": 1.3944097757339478, + "learning_rate": 1.882326115585357e-05, + "loss": 0.4967, + "step": 2841 + }, + { + "epoch": 0.34584727715241864, + "grad_norm": 1.1971148252487183, + "learning_rate": 1.882234291624118e-05, + "loss": 0.4531, + "step": 2842 + }, + { + "epoch": 0.34596896866443566, + "grad_norm": 0.6625984907150269, + "learning_rate": 1.8821424340920385e-05, + "loss": 0.4953, + "step": 2843 + }, + { + "epoch": 0.3460906601764527, + "grad_norm": 3.236550807952881, + "learning_rate": 1.882050542992613e-05, + "loss": 0.3911, + "step": 2844 + }, + { + "epoch": 0.34621235168846975, + "grad_norm": 3.334770441055298, + "learning_rate": 1.881958618329339e-05, + "loss": 0.4725, + "step": 2845 + }, + { + "epoch": 0.3463340432004868, + "grad_norm": 1.0479373931884766, + "learning_rate": 1.8818666601057138e-05, + "loss": 0.4473, + "step": 2846 + }, + { + "epoch": 0.3464557347125038, + "grad_norm": 4.242272853851318, + "learning_rate": 1.8817746683252368e-05, + "loss": 0.4854, + "step": 2847 + }, + { + "epoch": 0.3465774262245208, + "grad_norm": 2.097931385040283, + "learning_rate": 1.8816826429914087e-05, + "loss": 0.4628, + "step": 2848 + }, + { + "epoch": 0.3466991177365379, + "grad_norm": 0.7699806094169617, + "learning_rate": 1.8815905841077307e-05, + "loss": 0.4293, + "step": 2849 + }, + { + "epoch": 0.3468208092485549, + "grad_norm": 1.3720494508743286, + "learning_rate": 1.8814984916777062e-05, + "loss": 0.4073, + "step": 2850 + }, + { + "epoch": 0.3469425007605719, + "grad_norm": 0.8056511282920837, + "learning_rate": 1.88140636570484e-05, + "loss": 0.4344, + "step": 2851 + }, + { + "epoch": 0.347064192272589, + "grad_norm": 0.6837261915206909, + "learning_rate": 1.8813142061926365e-05, + "loss": 0.4409, + "step": 2852 + }, + { + "epoch": 0.347185883784606, + "grad_norm": 1.1518371105194092, + "learning_rate": 1.8812220131446036e-05, + "loss": 0.4991, + "step": 2853 + }, + { + "epoch": 0.34730757529662304, + "grad_norm": 1.1454092264175415, + "learning_rate": 1.8811297865642494e-05, + "loss": 0.4919, + "step": 2854 + }, + { + "epoch": 0.3474292668086401, + "grad_norm": 2.137289047241211, + "learning_rate": 1.8810375264550823e-05, + "loss": 0.4316, + "step": 2855 + }, + { + "epoch": 0.34755095832065713, + "grad_norm": 1.0075408220291138, + "learning_rate": 1.880945232820614e-05, + "loss": 0.5358, + "step": 2856 + }, + { + "epoch": 0.34767264983267415, + "grad_norm": 0.9391731023788452, + "learning_rate": 1.8808529056643564e-05, + "loss": 0.5343, + "step": 2857 + }, + { + "epoch": 0.3477943413446912, + "grad_norm": 1.0174161195755005, + "learning_rate": 1.8807605449898222e-05, + "loss": 0.527, + "step": 2858 + }, + { + "epoch": 0.34791603285670825, + "grad_norm": 1.3289825916290283, + "learning_rate": 1.880668150800526e-05, + "loss": 0.4528, + "step": 2859 + }, + { + "epoch": 0.34803772436872527, + "grad_norm": 0.6193122863769531, + "learning_rate": 1.880575723099984e-05, + "loss": 0.4765, + "step": 2860 + }, + { + "epoch": 0.34815941588074234, + "grad_norm": 0.7730928659439087, + "learning_rate": 1.8804832618917133e-05, + "loss": 0.4744, + "step": 2861 + }, + { + "epoch": 0.34828110739275936, + "grad_norm": 2.7276222705841064, + "learning_rate": 1.8803907671792317e-05, + "loss": 0.541, + "step": 2862 + }, + { + "epoch": 0.3484027989047764, + "grad_norm": 0.9313828945159912, + "learning_rate": 1.880298238966059e-05, + "loss": 0.4434, + "step": 2863 + }, + { + "epoch": 0.34852449041679345, + "grad_norm": 0.7989535331726074, + "learning_rate": 1.8802056772557162e-05, + "loss": 0.4701, + "step": 2864 + }, + { + "epoch": 0.3486461819288105, + "grad_norm": 2.94881010055542, + "learning_rate": 1.8801130820517256e-05, + "loss": 0.5601, + "step": 2865 + }, + { + "epoch": 0.3487678734408275, + "grad_norm": 0.5548211336135864, + "learning_rate": 1.8800204533576104e-05, + "loss": 0.4637, + "step": 2866 + }, + { + "epoch": 0.34888956495284457, + "grad_norm": 2.4702563285827637, + "learning_rate": 1.8799277911768957e-05, + "loss": 0.4482, + "step": 2867 + }, + { + "epoch": 0.3490112564648616, + "grad_norm": 0.616993248462677, + "learning_rate": 1.8798350955131068e-05, + "loss": 0.4901, + "step": 2868 + }, + { + "epoch": 0.3491329479768786, + "grad_norm": 0.8367584347724915, + "learning_rate": 1.8797423663697714e-05, + "loss": 0.4935, + "step": 2869 + }, + { + "epoch": 0.3492546394888956, + "grad_norm": 3.479548454284668, + "learning_rate": 1.8796496037504184e-05, + "loss": 0.4365, + "step": 2870 + }, + { + "epoch": 0.3493763310009127, + "grad_norm": 1.181849479675293, + "learning_rate": 1.8795568076585767e-05, + "loss": 0.4605, + "step": 2871 + }, + { + "epoch": 0.3494980225129297, + "grad_norm": 2.44211745262146, + "learning_rate": 1.8794639780977782e-05, + "loss": 0.441, + "step": 2872 + }, + { + "epoch": 0.34961971402494674, + "grad_norm": 2.3819055557250977, + "learning_rate": 1.8793711150715546e-05, + "loss": 0.5386, + "step": 2873 + }, + { + "epoch": 0.3497414055369638, + "grad_norm": 0.6934733986854553, + "learning_rate": 1.8792782185834402e-05, + "loss": 0.4636, + "step": 2874 + }, + { + "epoch": 0.34986309704898083, + "grad_norm": 0.6664032936096191, + "learning_rate": 1.8791852886369697e-05, + "loss": 0.4538, + "step": 2875 + }, + { + "epoch": 0.34998478856099785, + "grad_norm": 2.2818241119384766, + "learning_rate": 1.879092325235679e-05, + "loss": 0.4985, + "step": 2876 + }, + { + "epoch": 0.3501064800730149, + "grad_norm": 2.093757152557373, + "learning_rate": 1.8789993283831056e-05, + "loss": 0.4957, + "step": 2877 + }, + { + "epoch": 0.35022817158503194, + "grad_norm": 2.0471956729888916, + "learning_rate": 1.8789062980827882e-05, + "loss": 0.4531, + "step": 2878 + }, + { + "epoch": 0.35034986309704896, + "grad_norm": 0.890379011631012, + "learning_rate": 1.8788132343382672e-05, + "loss": 0.4521, + "step": 2879 + }, + { + "epoch": 0.35047155460906604, + "grad_norm": 0.76273113489151, + "learning_rate": 1.8787201371530834e-05, + "loss": 0.4405, + "step": 2880 + }, + { + "epoch": 0.35059324612108306, + "grad_norm": 0.5440983176231384, + "learning_rate": 1.8786270065307797e-05, + "loss": 0.4935, + "step": 2881 + }, + { + "epoch": 0.3507149376331001, + "grad_norm": 0.6818732023239136, + "learning_rate": 1.8785338424748997e-05, + "loss": 0.5389, + "step": 2882 + }, + { + "epoch": 0.35083662914511715, + "grad_norm": 1.2945020198822021, + "learning_rate": 1.8784406449889884e-05, + "loss": 0.4481, + "step": 2883 + }, + { + "epoch": 0.35095832065713417, + "grad_norm": 1.3245103359222412, + "learning_rate": 1.8783474140765925e-05, + "loss": 0.4908, + "step": 2884 + }, + { + "epoch": 0.3510800121691512, + "grad_norm": 1.9959620237350464, + "learning_rate": 1.878254149741259e-05, + "loss": 0.4373, + "step": 2885 + }, + { + "epoch": 0.35120170368116826, + "grad_norm": 1.1753344535827637, + "learning_rate": 1.878160851986538e-05, + "loss": 0.4317, + "step": 2886 + }, + { + "epoch": 0.3513233951931853, + "grad_norm": 0.8848109841346741, + "learning_rate": 1.878067520815978e-05, + "loss": 0.4722, + "step": 2887 + }, + { + "epoch": 0.3514450867052023, + "grad_norm": 1.23782479763031, + "learning_rate": 1.8779741562331318e-05, + "loss": 0.47, + "step": 2888 + }, + { + "epoch": 0.3515667782172193, + "grad_norm": 0.7877939343452454, + "learning_rate": 1.8778807582415517e-05, + "loss": 0.4515, + "step": 2889 + }, + { + "epoch": 0.3516884697292364, + "grad_norm": 1.2659249305725098, + "learning_rate": 1.8777873268447914e-05, + "loss": 0.4937, + "step": 2890 + }, + { + "epoch": 0.3518101612412534, + "grad_norm": 0.7893572449684143, + "learning_rate": 1.8776938620464065e-05, + "loss": 0.5063, + "step": 2891 + }, + { + "epoch": 0.35193185275327044, + "grad_norm": 2.6077706813812256, + "learning_rate": 1.8776003638499534e-05, + "loss": 0.4431, + "step": 2892 + }, + { + "epoch": 0.3520535442652875, + "grad_norm": 1.8502311706542969, + "learning_rate": 1.8775068322589898e-05, + "loss": 0.5301, + "step": 2893 + }, + { + "epoch": 0.35217523577730453, + "grad_norm": 0.7246078252792358, + "learning_rate": 1.877413267277075e-05, + "loss": 0.4604, + "step": 2894 + }, + { + "epoch": 0.35229692728932155, + "grad_norm": 0.6087958216667175, + "learning_rate": 1.8773196689077692e-05, + "loss": 0.4996, + "step": 2895 + }, + { + "epoch": 0.3524186188013386, + "grad_norm": 1.547107458114624, + "learning_rate": 1.8772260371546344e-05, + "loss": 0.4747, + "step": 2896 + }, + { + "epoch": 0.35254031031335564, + "grad_norm": 0.6520598530769348, + "learning_rate": 1.8771323720212326e-05, + "loss": 0.5005, + "step": 2897 + }, + { + "epoch": 0.35266200182537266, + "grad_norm": 3.044903039932251, + "learning_rate": 1.877038673511129e-05, + "loss": 0.5054, + "step": 2898 + }, + { + "epoch": 0.35278369333738974, + "grad_norm": 1.189260482788086, + "learning_rate": 1.8769449416278883e-05, + "loss": 0.481, + "step": 2899 + }, + { + "epoch": 0.35290538484940676, + "grad_norm": 0.6019036173820496, + "learning_rate": 1.8768511763750772e-05, + "loss": 0.4801, + "step": 2900 + }, + { + "epoch": 0.3530270763614238, + "grad_norm": 0.663816511631012, + "learning_rate": 1.876757377756264e-05, + "loss": 0.4882, + "step": 2901 + }, + { + "epoch": 0.35314876787344085, + "grad_norm": 3.2873332500457764, + "learning_rate": 1.8766635457750177e-05, + "loss": 0.5305, + "step": 2902 + }, + { + "epoch": 0.35327045938545787, + "grad_norm": 3.3045499324798584, + "learning_rate": 1.8765696804349088e-05, + "loss": 0.5321, + "step": 2903 + }, + { + "epoch": 0.3533921508974749, + "grad_norm": 3.791252374649048, + "learning_rate": 1.8764757817395094e-05, + "loss": 0.5519, + "step": 2904 + }, + { + "epoch": 0.35351384240949196, + "grad_norm": 0.915611207485199, + "learning_rate": 1.8763818496923924e-05, + "loss": 0.4589, + "step": 2905 + }, + { + "epoch": 0.353635533921509, + "grad_norm": 0.6651073098182678, + "learning_rate": 1.876287884297132e-05, + "loss": 0.4832, + "step": 2906 + }, + { + "epoch": 0.353757225433526, + "grad_norm": 1.3482612371444702, + "learning_rate": 1.8761938855573033e-05, + "loss": 0.482, + "step": 2907 + }, + { + "epoch": 0.3538789169455431, + "grad_norm": 2.7918756008148193, + "learning_rate": 1.876099853476484e-05, + "loss": 0.4702, + "step": 2908 + }, + { + "epoch": 0.3540006084575601, + "grad_norm": 1.6856144666671753, + "learning_rate": 1.876005788058252e-05, + "loss": 0.5149, + "step": 2909 + }, + { + "epoch": 0.3541222999695771, + "grad_norm": 1.2886600494384766, + "learning_rate": 1.875911689306186e-05, + "loss": 0.4733, + "step": 2910 + }, + { + "epoch": 0.35424399148159413, + "grad_norm": 0.736177384853363, + "learning_rate": 1.875817557223868e-05, + "loss": 0.4227, + "step": 2911 + }, + { + "epoch": 0.3543656829936112, + "grad_norm": 1.5236917734146118, + "learning_rate": 1.8757233918148784e-05, + "loss": 0.4576, + "step": 2912 + }, + { + "epoch": 0.3544873745056282, + "grad_norm": 2.465754270553589, + "learning_rate": 1.8756291930828013e-05, + "loss": 0.4282, + "step": 2913 + }, + { + "epoch": 0.35460906601764525, + "grad_norm": 3.590388536453247, + "learning_rate": 1.875534961031221e-05, + "loss": 0.4917, + "step": 2914 + }, + { + "epoch": 0.3547307575296623, + "grad_norm": 3.2868614196777344, + "learning_rate": 1.875440695663723e-05, + "loss": 0.4756, + "step": 2915 + }, + { + "epoch": 0.35485244904167934, + "grad_norm": 3.7432565689086914, + "learning_rate": 1.8753463969838946e-05, + "loss": 0.4873, + "step": 2916 + }, + { + "epoch": 0.35497414055369636, + "grad_norm": 1.5701254606246948, + "learning_rate": 1.875252064995324e-05, + "loss": 0.4789, + "step": 2917 + }, + { + "epoch": 0.35509583206571343, + "grad_norm": 2.1755101680755615, + "learning_rate": 1.8751576997016007e-05, + "loss": 0.4393, + "step": 2918 + }, + { + "epoch": 0.35521752357773045, + "grad_norm": 4.6662821769714355, + "learning_rate": 1.875063301106315e-05, + "loss": 0.424, + "step": 2919 + }, + { + "epoch": 0.3553392150897475, + "grad_norm": 1.7847932577133179, + "learning_rate": 1.87496886921306e-05, + "loss": 0.4688, + "step": 2920 + }, + { + "epoch": 0.35546090660176455, + "grad_norm": 4.292427062988281, + "learning_rate": 1.874874404025428e-05, + "loss": 0.442, + "step": 2921 + }, + { + "epoch": 0.35558259811378157, + "grad_norm": 0.6120730638504028, + "learning_rate": 1.8747799055470142e-05, + "loss": 0.4823, + "step": 2922 + }, + { + "epoch": 0.3557042896257986, + "grad_norm": 1.2348554134368896, + "learning_rate": 1.8746853737814144e-05, + "loss": 0.4693, + "step": 2923 + }, + { + "epoch": 0.35582598113781566, + "grad_norm": 1.578283667564392, + "learning_rate": 1.8745908087322254e-05, + "loss": 0.3873, + "step": 2924 + }, + { + "epoch": 0.3559476726498327, + "grad_norm": 1.4291319847106934, + "learning_rate": 1.874496210403046e-05, + "loss": 0.4529, + "step": 2925 + }, + { + "epoch": 0.3560693641618497, + "grad_norm": 4.548673629760742, + "learning_rate": 1.8744015787974757e-05, + "loss": 0.5328, + "step": 2926 + }, + { + "epoch": 0.3561910556738668, + "grad_norm": 3.222435712814331, + "learning_rate": 1.8743069139191154e-05, + "loss": 0.4922, + "step": 2927 + }, + { + "epoch": 0.3563127471858838, + "grad_norm": 2.703038454055786, + "learning_rate": 1.8742122157715673e-05, + "loss": 0.4921, + "step": 2928 + }, + { + "epoch": 0.3564344386979008, + "grad_norm": 1.7311506271362305, + "learning_rate": 1.874117484358435e-05, + "loss": 0.5038, + "step": 2929 + }, + { + "epoch": 0.35655613020991783, + "grad_norm": 0.9427840709686279, + "learning_rate": 1.874022719683323e-05, + "loss": 0.4464, + "step": 2930 + }, + { + "epoch": 0.3566778217219349, + "grad_norm": 0.9701077342033386, + "learning_rate": 1.873927921749837e-05, + "loss": 0.4763, + "step": 2931 + }, + { + "epoch": 0.3567995132339519, + "grad_norm": 1.4765790700912476, + "learning_rate": 1.8738330905615852e-05, + "loss": 0.4468, + "step": 2932 + }, + { + "epoch": 0.35692120474596895, + "grad_norm": 2.8618738651275635, + "learning_rate": 1.873738226122175e-05, + "loss": 0.4578, + "step": 2933 + }, + { + "epoch": 0.357042896257986, + "grad_norm": 2.3875904083251953, + "learning_rate": 1.8736433284352175e-05, + "loss": 0.4877, + "step": 2934 + }, + { + "epoch": 0.35716458777000304, + "grad_norm": 1.8850828409194946, + "learning_rate": 1.8735483975043225e-05, + "loss": 0.4546, + "step": 2935 + }, + { + "epoch": 0.35728627928202006, + "grad_norm": 0.5865916013717651, + "learning_rate": 1.873453433333103e-05, + "loss": 0.4806, + "step": 2936 + }, + { + "epoch": 0.35740797079403713, + "grad_norm": 3.0282232761383057, + "learning_rate": 1.8733584359251724e-05, + "loss": 0.4123, + "step": 2937 + }, + { + "epoch": 0.35752966230605415, + "grad_norm": 2.4615886211395264, + "learning_rate": 1.8732634052841456e-05, + "loss": 0.524, + "step": 2938 + }, + { + "epoch": 0.35765135381807117, + "grad_norm": 1.4478707313537598, + "learning_rate": 1.8731683414136386e-05, + "loss": 0.431, + "step": 2939 + }, + { + "epoch": 0.35777304533008825, + "grad_norm": 4.296384811401367, + "learning_rate": 1.8730732443172687e-05, + "loss": 0.4883, + "step": 2940 + }, + { + "epoch": 0.35789473684210527, + "grad_norm": 1.4367122650146484, + "learning_rate": 1.872978113998655e-05, + "loss": 0.4463, + "step": 2941 + }, + { + "epoch": 0.3580164283541223, + "grad_norm": 4.117356300354004, + "learning_rate": 1.8728829504614168e-05, + "loss": 0.5055, + "step": 2942 + }, + { + "epoch": 0.35813811986613936, + "grad_norm": 3.7410740852355957, + "learning_rate": 1.8727877537091758e-05, + "loss": 0.557, + "step": 2943 + }, + { + "epoch": 0.3582598113781564, + "grad_norm": 1.436547040939331, + "learning_rate": 1.872692523745554e-05, + "loss": 0.4756, + "step": 2944 + }, + { + "epoch": 0.3583815028901734, + "grad_norm": 2.0740349292755127, + "learning_rate": 1.8725972605741755e-05, + "loss": 0.4979, + "step": 2945 + }, + { + "epoch": 0.3585031944021905, + "grad_norm": 2.4010510444641113, + "learning_rate": 1.8725019641986647e-05, + "loss": 0.4986, + "step": 2946 + }, + { + "epoch": 0.3586248859142075, + "grad_norm": 1.252356767654419, + "learning_rate": 1.8724066346226485e-05, + "loss": 0.5293, + "step": 2947 + }, + { + "epoch": 0.3587465774262245, + "grad_norm": 3.0641136169433594, + "learning_rate": 1.8723112718497544e-05, + "loss": 0.4434, + "step": 2948 + }, + { + "epoch": 0.35886826893824153, + "grad_norm": 1.4979795217514038, + "learning_rate": 1.8722158758836103e-05, + "loss": 0.4689, + "step": 2949 + }, + { + "epoch": 0.3589899604502586, + "grad_norm": 1.061935544013977, + "learning_rate": 1.8721204467278465e-05, + "loss": 0.4475, + "step": 2950 + }, + { + "epoch": 0.3591116519622756, + "grad_norm": 2.4618287086486816, + "learning_rate": 1.8720249843860953e-05, + "loss": 0.4959, + "step": 2951 + }, + { + "epoch": 0.35923334347429264, + "grad_norm": 1.5725812911987305, + "learning_rate": 1.871929488861988e-05, + "loss": 0.4504, + "step": 2952 + }, + { + "epoch": 0.3593550349863097, + "grad_norm": 1.2420568466186523, + "learning_rate": 1.8718339601591585e-05, + "loss": 0.4554, + "step": 2953 + }, + { + "epoch": 0.35947672649832674, + "grad_norm": 0.7908331751823425, + "learning_rate": 1.871738398281243e-05, + "loss": 0.4604, + "step": 2954 + }, + { + "epoch": 0.35959841801034376, + "grad_norm": 0.5810034275054932, + "learning_rate": 1.871642803231876e-05, + "loss": 0.4401, + "step": 2955 + }, + { + "epoch": 0.35972010952236083, + "grad_norm": 2.186800718307495, + "learning_rate": 1.8715471750146972e-05, + "loss": 0.5402, + "step": 2956 + }, + { + "epoch": 0.35984180103437785, + "grad_norm": 2.936331272125244, + "learning_rate": 1.871451513633344e-05, + "loss": 0.4414, + "step": 2957 + }, + { + "epoch": 0.35996349254639487, + "grad_norm": 1.2168655395507812, + "learning_rate": 1.871355819091457e-05, + "loss": 0.5076, + "step": 2958 + }, + { + "epoch": 0.36008518405841194, + "grad_norm": 3.5353779792785645, + "learning_rate": 1.8712600913926772e-05, + "loss": 0.476, + "step": 2959 + }, + { + "epoch": 0.36020687557042896, + "grad_norm": 1.4204754829406738, + "learning_rate": 1.8711643305406477e-05, + "loss": 0.5084, + "step": 2960 + }, + { + "epoch": 0.360328567082446, + "grad_norm": 1.3160148859024048, + "learning_rate": 1.871068536539012e-05, + "loss": 0.4482, + "step": 2961 + }, + { + "epoch": 0.36045025859446306, + "grad_norm": 0.6225706934928894, + "learning_rate": 1.8709727093914157e-05, + "loss": 0.4804, + "step": 2962 + }, + { + "epoch": 0.3605719501064801, + "grad_norm": 1.5934213399887085, + "learning_rate": 1.870876849101505e-05, + "loss": 0.4316, + "step": 2963 + }, + { + "epoch": 0.3606936416184971, + "grad_norm": 3.3230910301208496, + "learning_rate": 1.8707809556729276e-05, + "loss": 0.4535, + "step": 2964 + }, + { + "epoch": 0.36081533313051417, + "grad_norm": 2.180410861968994, + "learning_rate": 1.8706850291093323e-05, + "loss": 0.4707, + "step": 2965 + }, + { + "epoch": 0.3609370246425312, + "grad_norm": 1.1301518678665161, + "learning_rate": 1.8705890694143693e-05, + "loss": 0.4177, + "step": 2966 + }, + { + "epoch": 0.3610587161545482, + "grad_norm": 4.07941198348999, + "learning_rate": 1.8704930765916905e-05, + "loss": 0.538, + "step": 2967 + }, + { + "epoch": 0.3611804076665653, + "grad_norm": 1.671410322189331, + "learning_rate": 1.870397050644948e-05, + "loss": 0.4319, + "step": 2968 + }, + { + "epoch": 0.3613020991785823, + "grad_norm": 1.7199392318725586, + "learning_rate": 1.8703009915777963e-05, + "loss": 0.4621, + "step": 2969 + }, + { + "epoch": 0.3614237906905993, + "grad_norm": 0.6137834787368774, + "learning_rate": 1.8702048993938902e-05, + "loss": 0.4629, + "step": 2970 + }, + { + "epoch": 0.36154548220261634, + "grad_norm": 3.8859007358551025, + "learning_rate": 1.8701087740968868e-05, + "loss": 0.4386, + "step": 2971 + }, + { + "epoch": 0.3616671737146334, + "grad_norm": 3.885434865951538, + "learning_rate": 1.8700126156904436e-05, + "loss": 0.4457, + "step": 2972 + }, + { + "epoch": 0.36178886522665044, + "grad_norm": 2.96514630317688, + "learning_rate": 1.8699164241782194e-05, + "loss": 0.4704, + "step": 2973 + }, + { + "epoch": 0.36191055673866745, + "grad_norm": 1.3474972248077393, + "learning_rate": 1.869820199563874e-05, + "loss": 0.5081, + "step": 2974 + }, + { + "epoch": 0.36203224825068453, + "grad_norm": 0.6787735223770142, + "learning_rate": 1.8697239418510707e-05, + "loss": 0.5376, + "step": 2975 + }, + { + "epoch": 0.36215393976270155, + "grad_norm": 2.3206374645233154, + "learning_rate": 1.8696276510434702e-05, + "loss": 0.4883, + "step": 2976 + }, + { + "epoch": 0.36227563127471857, + "grad_norm": 0.8279562592506409, + "learning_rate": 1.869531327144738e-05, + "loss": 0.5084, + "step": 2977 + }, + { + "epoch": 0.36239732278673564, + "grad_norm": 1.6186060905456543, + "learning_rate": 1.8694349701585392e-05, + "loss": 0.4235, + "step": 2978 + }, + { + "epoch": 0.36251901429875266, + "grad_norm": 2.840852975845337, + "learning_rate": 1.8693385800885398e-05, + "loss": 0.5106, + "step": 2979 + }, + { + "epoch": 0.3626407058107697, + "grad_norm": 2.2010650634765625, + "learning_rate": 1.869242156938408e-05, + "loss": 0.4212, + "step": 2980 + }, + { + "epoch": 0.36276239732278676, + "grad_norm": 1.0991575717926025, + "learning_rate": 1.869145700711813e-05, + "loss": 0.4654, + "step": 2981 + }, + { + "epoch": 0.3628840888348038, + "grad_norm": 1.6484620571136475, + "learning_rate": 1.869049211412425e-05, + "loss": 0.5005, + "step": 2982 + }, + { + "epoch": 0.3630057803468208, + "grad_norm": 1.2549841403961182, + "learning_rate": 1.8689526890439156e-05, + "loss": 0.462, + "step": 2983 + }, + { + "epoch": 0.36312747185883787, + "grad_norm": 1.1052154302597046, + "learning_rate": 1.868856133609958e-05, + "loss": 0.4376, + "step": 2984 + }, + { + "epoch": 0.3632491633708549, + "grad_norm": 2.9149084091186523, + "learning_rate": 1.8687595451142257e-05, + "loss": 0.5047, + "step": 2985 + }, + { + "epoch": 0.3633708548828719, + "grad_norm": 0.6402516961097717, + "learning_rate": 1.868662923560395e-05, + "loss": 0.4594, + "step": 2986 + }, + { + "epoch": 0.363492546394889, + "grad_norm": 1.3251070976257324, + "learning_rate": 1.8685662689521417e-05, + "loss": 0.4426, + "step": 2987 + }, + { + "epoch": 0.363614237906906, + "grad_norm": 0.6694589257240295, + "learning_rate": 1.8684695812931442e-05, + "loss": 0.4383, + "step": 2988 + }, + { + "epoch": 0.363735929418923, + "grad_norm": 1.0918630361557007, + "learning_rate": 1.868372860587081e-05, + "loss": 0.4706, + "step": 2989 + }, + { + "epoch": 0.36385762093094004, + "grad_norm": 0.621202826499939, + "learning_rate": 1.8682761068376335e-05, + "loss": 0.4735, + "step": 2990 + }, + { + "epoch": 0.3639793124429571, + "grad_norm": 2.1684494018554688, + "learning_rate": 1.8681793200484827e-05, + "loss": 0.5035, + "step": 2991 + }, + { + "epoch": 0.36410100395497413, + "grad_norm": 0.8980317711830139, + "learning_rate": 1.8680825002233122e-05, + "loss": 0.4568, + "step": 2992 + }, + { + "epoch": 0.36422269546699115, + "grad_norm": 0.6235323548316956, + "learning_rate": 1.8679856473658053e-05, + "loss": 0.524, + "step": 2993 + }, + { + "epoch": 0.36434438697900823, + "grad_norm": 1.199827790260315, + "learning_rate": 1.8678887614796475e-05, + "loss": 0.4607, + "step": 2994 + }, + { + "epoch": 0.36446607849102525, + "grad_norm": 1.9739487171173096, + "learning_rate": 1.8677918425685266e-05, + "loss": 0.5419, + "step": 2995 + }, + { + "epoch": 0.36458777000304227, + "grad_norm": 3.7945010662078857, + "learning_rate": 1.8676948906361295e-05, + "loss": 0.4466, + "step": 2996 + }, + { + "epoch": 0.36470946151505934, + "grad_norm": 0.9805242419242859, + "learning_rate": 1.8675979056861462e-05, + "loss": 0.5259, + "step": 2997 + }, + { + "epoch": 0.36483115302707636, + "grad_norm": 1.7036242485046387, + "learning_rate": 1.8675008877222664e-05, + "loss": 0.4341, + "step": 2998 + }, + { + "epoch": 0.3649528445390934, + "grad_norm": 1.3992851972579956, + "learning_rate": 1.867403836748182e-05, + "loss": 0.5244, + "step": 2999 + }, + { + "epoch": 0.36507453605111045, + "grad_norm": 1.4511429071426392, + "learning_rate": 1.8673067527675867e-05, + "loss": 0.4555, + "step": 3000 + }, + { + "epoch": 0.3651962275631275, + "grad_norm": 1.183764934539795, + "learning_rate": 1.867209635784174e-05, + "loss": 0.5047, + "step": 3001 + }, + { + "epoch": 0.3653179190751445, + "grad_norm": 0.9412627220153809, + "learning_rate": 1.8671124858016393e-05, + "loss": 0.4418, + "step": 3002 + }, + { + "epoch": 0.36543961058716157, + "grad_norm": 0.5016685724258423, + "learning_rate": 1.86701530282368e-05, + "loss": 0.426, + "step": 3003 + }, + { + "epoch": 0.3655613020991786, + "grad_norm": 1.155144453048706, + "learning_rate": 1.866918086853994e-05, + "loss": 0.4714, + "step": 3004 + }, + { + "epoch": 0.3656829936111956, + "grad_norm": 2.9690589904785156, + "learning_rate": 1.86682083789628e-05, + "loss": 0.5204, + "step": 3005 + }, + { + "epoch": 0.3658046851232127, + "grad_norm": 1.997794508934021, + "learning_rate": 1.866723555954239e-05, + "loss": 0.4133, + "step": 3006 + }, + { + "epoch": 0.3659263766352297, + "grad_norm": 2.890911102294922, + "learning_rate": 1.8666262410315727e-05, + "loss": 0.4246, + "step": 3007 + }, + { + "epoch": 0.3660480681472467, + "grad_norm": 0.6775578856468201, + "learning_rate": 1.8665288931319843e-05, + "loss": 0.4426, + "step": 3008 + }, + { + "epoch": 0.3661697596592638, + "grad_norm": 1.965939998626709, + "learning_rate": 1.8664315122591778e-05, + "loss": 0.4939, + "step": 3009 + }, + { + "epoch": 0.3662914511712808, + "grad_norm": 0.7638002038002014, + "learning_rate": 1.8663340984168585e-05, + "loss": 0.4658, + "step": 3010 + }, + { + "epoch": 0.36641314268329783, + "grad_norm": 0.7034749388694763, + "learning_rate": 1.866236651608734e-05, + "loss": 0.4643, + "step": 3011 + }, + { + "epoch": 0.36653483419531485, + "grad_norm": 1.5485363006591797, + "learning_rate": 1.8661391718385115e-05, + "loss": 0.4317, + "step": 3012 + }, + { + "epoch": 0.3666565257073319, + "grad_norm": 1.4653748273849487, + "learning_rate": 1.866041659109901e-05, + "loss": 0.4267, + "step": 3013 + }, + { + "epoch": 0.36677821721934895, + "grad_norm": 1.0314141511917114, + "learning_rate": 1.865944113426613e-05, + "loss": 0.445, + "step": 3014 + }, + { + "epoch": 0.36689990873136596, + "grad_norm": 1.2275699377059937, + "learning_rate": 1.8658465347923586e-05, + "loss": 0.4702, + "step": 3015 + }, + { + "epoch": 0.36702160024338304, + "grad_norm": 0.7662177085876465, + "learning_rate": 1.8657489232108514e-05, + "loss": 0.4413, + "step": 3016 + }, + { + "epoch": 0.36714329175540006, + "grad_norm": 3.074997901916504, + "learning_rate": 1.865651278685806e-05, + "loss": 0.498, + "step": 3017 + }, + { + "epoch": 0.3672649832674171, + "grad_norm": 4.269886493682861, + "learning_rate": 1.8655536012209373e-05, + "loss": 0.5312, + "step": 3018 + }, + { + "epoch": 0.36738667477943415, + "grad_norm": 1.078355312347412, + "learning_rate": 1.8654558908199627e-05, + "loss": 0.4139, + "step": 3019 + }, + { + "epoch": 0.36750836629145117, + "grad_norm": 0.8460218906402588, + "learning_rate": 1.8653581474865998e-05, + "loss": 0.4355, + "step": 3020 + }, + { + "epoch": 0.3676300578034682, + "grad_norm": 2.2422807216644287, + "learning_rate": 1.8652603712245685e-05, + "loss": 0.477, + "step": 3021 + }, + { + "epoch": 0.36775174931548527, + "grad_norm": 1.8369975090026855, + "learning_rate": 1.865162562037589e-05, + "loss": 0.4722, + "step": 3022 + }, + { + "epoch": 0.3678734408275023, + "grad_norm": 1.912951946258545, + "learning_rate": 1.865064719929383e-05, + "loss": 0.4495, + "step": 3023 + }, + { + "epoch": 0.3679951323395193, + "grad_norm": 0.7084397077560425, + "learning_rate": 1.864966844903674e-05, + "loss": 0.4629, + "step": 3024 + }, + { + "epoch": 0.3681168238515364, + "grad_norm": 1.1747092008590698, + "learning_rate": 1.864868936964186e-05, + "loss": 0.5198, + "step": 3025 + }, + { + "epoch": 0.3682385153635534, + "grad_norm": 0.6192000508308411, + "learning_rate": 1.864770996114645e-05, + "loss": 0.5062, + "step": 3026 + }, + { + "epoch": 0.3683602068755704, + "grad_norm": 0.6726124882698059, + "learning_rate": 1.8646730223587778e-05, + "loss": 0.4915, + "step": 3027 + }, + { + "epoch": 0.3684818983875875, + "grad_norm": 1.4156765937805176, + "learning_rate": 1.864575015700312e-05, + "loss": 0.4549, + "step": 3028 + }, + { + "epoch": 0.3686035898996045, + "grad_norm": 0.8093295097351074, + "learning_rate": 1.864476976142977e-05, + "loss": 0.5323, + "step": 3029 + }, + { + "epoch": 0.36872528141162153, + "grad_norm": 1.5512853860855103, + "learning_rate": 1.8643789036905042e-05, + "loss": 0.5057, + "step": 3030 + }, + { + "epoch": 0.36884697292363855, + "grad_norm": 1.6699373722076416, + "learning_rate": 1.8642807983466248e-05, + "loss": 0.4696, + "step": 3031 + }, + { + "epoch": 0.3689686644356556, + "grad_norm": 3.567721128463745, + "learning_rate": 1.864182660115072e-05, + "loss": 0.4849, + "step": 3032 + }, + { + "epoch": 0.36909035594767264, + "grad_norm": 1.2548764944076538, + "learning_rate": 1.8640844889995803e-05, + "loss": 0.4827, + "step": 3033 + }, + { + "epoch": 0.36921204745968966, + "grad_norm": 3.7645862102508545, + "learning_rate": 1.8639862850038854e-05, + "loss": 0.44, + "step": 3034 + }, + { + "epoch": 0.36933373897170674, + "grad_norm": 1.1269211769104004, + "learning_rate": 1.8638880481317237e-05, + "loss": 0.4405, + "step": 3035 + }, + { + "epoch": 0.36945543048372376, + "grad_norm": 2.2970995903015137, + "learning_rate": 1.8637897783868337e-05, + "loss": 0.5253, + "step": 3036 + }, + { + "epoch": 0.3695771219957408, + "grad_norm": 0.6540803909301758, + "learning_rate": 1.8636914757729548e-05, + "loss": 0.4067, + "step": 3037 + }, + { + "epoch": 0.36969881350775785, + "grad_norm": 4.421058177947998, + "learning_rate": 1.8635931402938273e-05, + "loss": 0.5236, + "step": 3038 + }, + { + "epoch": 0.36982050501977487, + "grad_norm": 3.468085765838623, + "learning_rate": 1.8634947719531936e-05, + "loss": 0.5116, + "step": 3039 + }, + { + "epoch": 0.3699421965317919, + "grad_norm": 2.381770372390747, + "learning_rate": 1.8633963707547963e-05, + "loss": 0.4558, + "step": 3040 + }, + { + "epoch": 0.37006388804380896, + "grad_norm": 0.6035611629486084, + "learning_rate": 1.86329793670238e-05, + "loss": 0.4219, + "step": 3041 + }, + { + "epoch": 0.370185579555826, + "grad_norm": 3.956756114959717, + "learning_rate": 1.86319946979969e-05, + "loss": 0.4186, + "step": 3042 + }, + { + "epoch": 0.370307271067843, + "grad_norm": 1.367324948310852, + "learning_rate": 1.8631009700504738e-05, + "loss": 0.4244, + "step": 3043 + }, + { + "epoch": 0.3704289625798601, + "grad_norm": 2.888293743133545, + "learning_rate": 1.8630024374584788e-05, + "loss": 0.5086, + "step": 3044 + }, + { + "epoch": 0.3705506540918771, + "grad_norm": 1.178636908531189, + "learning_rate": 1.862903872027455e-05, + "loss": 0.4545, + "step": 3045 + }, + { + "epoch": 0.3706723456038941, + "grad_norm": 0.98091721534729, + "learning_rate": 1.8628052737611528e-05, + "loss": 0.4393, + "step": 3046 + }, + { + "epoch": 0.3707940371159112, + "grad_norm": 2.4809136390686035, + "learning_rate": 1.862706642663324e-05, + "loss": 0.4855, + "step": 3047 + }, + { + "epoch": 0.3709157286279282, + "grad_norm": 2.2146472930908203, + "learning_rate": 1.8626079787377217e-05, + "loss": 0.5508, + "step": 3048 + }, + { + "epoch": 0.37103742013994523, + "grad_norm": 1.3630300760269165, + "learning_rate": 1.8625092819881e-05, + "loss": 0.4659, + "step": 3049 + }, + { + "epoch": 0.37115911165196225, + "grad_norm": 2.8149638175964355, + "learning_rate": 1.8624105524182156e-05, + "loss": 0.4705, + "step": 3050 + }, + { + "epoch": 0.3712808031639793, + "grad_norm": 3.239107370376587, + "learning_rate": 1.862311790031824e-05, + "loss": 0.4514, + "step": 3051 + }, + { + "epoch": 0.37140249467599634, + "grad_norm": 2.323507785797119, + "learning_rate": 1.8622129948326838e-05, + "loss": 0.4717, + "step": 3052 + }, + { + "epoch": 0.37152418618801336, + "grad_norm": 1.4264402389526367, + "learning_rate": 1.862114166824555e-05, + "loss": 0.4246, + "step": 3053 + }, + { + "epoch": 0.37164587770003044, + "grad_norm": 1.1614569425582886, + "learning_rate": 1.8620153060111973e-05, + "loss": 0.4682, + "step": 3054 + }, + { + "epoch": 0.37176756921204746, + "grad_norm": 0.7365112900733948, + "learning_rate": 1.861916412396373e-05, + "loss": 0.4603, + "step": 3055 + }, + { + "epoch": 0.3718892607240645, + "grad_norm": 2.646296262741089, + "learning_rate": 1.8618174859838452e-05, + "loss": 0.4112, + "step": 3056 + }, + { + "epoch": 0.37201095223608155, + "grad_norm": 2.904318332672119, + "learning_rate": 1.861718526777378e-05, + "loss": 0.4823, + "step": 3057 + }, + { + "epoch": 0.37213264374809857, + "grad_norm": 0.769153892993927, + "learning_rate": 1.8616195347807374e-05, + "loss": 0.409, + "step": 3058 + }, + { + "epoch": 0.3722543352601156, + "grad_norm": 2.342787981033325, + "learning_rate": 1.86152050999769e-05, + "loss": 0.4912, + "step": 3059 + }, + { + "epoch": 0.37237602677213266, + "grad_norm": 2.2397356033325195, + "learning_rate": 1.861421452432004e-05, + "loss": 0.4621, + "step": 3060 + }, + { + "epoch": 0.3724977182841497, + "grad_norm": 3.1565186977386475, + "learning_rate": 1.8613223620874486e-05, + "loss": 0.4904, + "step": 3061 + }, + { + "epoch": 0.3726194097961667, + "grad_norm": 2.013030767440796, + "learning_rate": 1.8612232389677943e-05, + "loss": 0.4241, + "step": 3062 + }, + { + "epoch": 0.3727411013081838, + "grad_norm": 4.367907524108887, + "learning_rate": 1.8611240830768134e-05, + "loss": 0.4096, + "step": 3063 + }, + { + "epoch": 0.3728627928202008, + "grad_norm": 0.6541080474853516, + "learning_rate": 1.8610248944182786e-05, + "loss": 0.5421, + "step": 3064 + }, + { + "epoch": 0.3729844843322178, + "grad_norm": 3.0682034492492676, + "learning_rate": 1.8609256729959642e-05, + "loss": 0.4723, + "step": 3065 + }, + { + "epoch": 0.3731061758442349, + "grad_norm": 3.624817132949829, + "learning_rate": 1.860826418813646e-05, + "loss": 0.4369, + "step": 3066 + }, + { + "epoch": 0.3732278673562519, + "grad_norm": 1.2501392364501953, + "learning_rate": 1.860727131875101e-05, + "loss": 0.5066, + "step": 3067 + }, + { + "epoch": 0.3733495588682689, + "grad_norm": 1.8625503778457642, + "learning_rate": 1.860627812184107e-05, + "loss": 0.4618, + "step": 3068 + }, + { + "epoch": 0.373471250380286, + "grad_norm": 1.5235109329223633, + "learning_rate": 1.860528459744443e-05, + "loss": 0.4613, + "step": 3069 + }, + { + "epoch": 0.373592941892303, + "grad_norm": 1.551566243171692, + "learning_rate": 1.8604290745598902e-05, + "loss": 0.4596, + "step": 3070 + }, + { + "epoch": 0.37371463340432004, + "grad_norm": 2.195451259613037, + "learning_rate": 1.8603296566342303e-05, + "loss": 0.4602, + "step": 3071 + }, + { + "epoch": 0.37383632491633706, + "grad_norm": 2.439582586288452, + "learning_rate": 1.8602302059712457e-05, + "loss": 0.4571, + "step": 3072 + }, + { + "epoch": 0.37395801642835413, + "grad_norm": 3.3360729217529297, + "learning_rate": 1.8601307225747213e-05, + "loss": 0.4833, + "step": 3073 + }, + { + "epoch": 0.37407970794037115, + "grad_norm": 1.1381869316101074, + "learning_rate": 1.8600312064484427e-05, + "loss": 0.4559, + "step": 3074 + }, + { + "epoch": 0.3742013994523882, + "grad_norm": 1.5621495246887207, + "learning_rate": 1.8599316575961964e-05, + "loss": 0.4815, + "step": 3075 + }, + { + "epoch": 0.37432309096440525, + "grad_norm": 0.6870940327644348, + "learning_rate": 1.8598320760217707e-05, + "loss": 0.477, + "step": 3076 + }, + { + "epoch": 0.37444478247642227, + "grad_norm": 1.9823907613754272, + "learning_rate": 1.859732461728955e-05, + "loss": 0.4602, + "step": 3077 + }, + { + "epoch": 0.3745664739884393, + "grad_norm": 1.0109331607818604, + "learning_rate": 1.8596328147215394e-05, + "loss": 0.4893, + "step": 3078 + }, + { + "epoch": 0.37468816550045636, + "grad_norm": 2.337618827819824, + "learning_rate": 1.8595331350033153e-05, + "loss": 0.409, + "step": 3079 + }, + { + "epoch": 0.3748098570124734, + "grad_norm": 0.6965295076370239, + "learning_rate": 1.8594334225780768e-05, + "loss": 0.5056, + "step": 3080 + }, + { + "epoch": 0.3749315485244904, + "grad_norm": 3.9322896003723145, + "learning_rate": 1.8593336774496175e-05, + "loss": 0.3649, + "step": 3081 + }, + { + "epoch": 0.3750532400365075, + "grad_norm": 1.7873562574386597, + "learning_rate": 1.8592338996217332e-05, + "loss": 0.4541, + "step": 3082 + }, + { + "epoch": 0.3751749315485245, + "grad_norm": 1.682086706161499, + "learning_rate": 1.8591340890982203e-05, + "loss": 0.4129, + "step": 3083 + }, + { + "epoch": 0.3752966230605415, + "grad_norm": 2.881730794906616, + "learning_rate": 1.859034245882877e-05, + "loss": 0.4704, + "step": 3084 + }, + { + "epoch": 0.3754183145725586, + "grad_norm": 0.9620259404182434, + "learning_rate": 1.858934369979503e-05, + "loss": 0.3721, + "step": 3085 + }, + { + "epoch": 0.3755400060845756, + "grad_norm": 3.6075077056884766, + "learning_rate": 1.8588344613918977e-05, + "loss": 0.4792, + "step": 3086 + }, + { + "epoch": 0.3756616975965926, + "grad_norm": 7.657585620880127, + "learning_rate": 1.8587345201238637e-05, + "loss": 0.5958, + "step": 3087 + }, + { + "epoch": 0.3757833891086097, + "grad_norm": 3.526669979095459, + "learning_rate": 1.8586345461792038e-05, + "loss": 0.448, + "step": 3088 + }, + { + "epoch": 0.3759050806206267, + "grad_norm": 3.5936107635498047, + "learning_rate": 1.8585345395617224e-05, + "loss": 0.4398, + "step": 3089 + }, + { + "epoch": 0.37602677213264374, + "grad_norm": 3.408573627471924, + "learning_rate": 1.858434500275224e-05, + "loss": 0.471, + "step": 3090 + }, + { + "epoch": 0.37614846364466076, + "grad_norm": 4.0012311935424805, + "learning_rate": 1.8583344283235165e-05, + "loss": 0.5293, + "step": 3091 + }, + { + "epoch": 0.37627015515667783, + "grad_norm": 0.6235780715942383, + "learning_rate": 1.8582343237104072e-05, + "loss": 0.4129, + "step": 3092 + }, + { + "epoch": 0.37639184666869485, + "grad_norm": 0.6681236624717712, + "learning_rate": 1.8581341864397055e-05, + "loss": 0.4596, + "step": 3093 + }, + { + "epoch": 0.37651353818071187, + "grad_norm": 0.9401674866676331, + "learning_rate": 1.858034016515222e-05, + "loss": 0.4958, + "step": 3094 + }, + { + "epoch": 0.37663522969272895, + "grad_norm": 2.4536235332489014, + "learning_rate": 1.8579338139407675e-05, + "loss": 0.4365, + "step": 3095 + }, + { + "epoch": 0.37675692120474596, + "grad_norm": 5.458724021911621, + "learning_rate": 1.8578335787201562e-05, + "loss": 0.4391, + "step": 3096 + }, + { + "epoch": 0.376878612716763, + "grad_norm": 1.9668610095977783, + "learning_rate": 1.8577333108572012e-05, + "loss": 0.4531, + "step": 3097 + }, + { + "epoch": 0.37700030422878006, + "grad_norm": 0.6157615780830383, + "learning_rate": 1.8576330103557187e-05, + "loss": 0.4675, + "step": 3098 + }, + { + "epoch": 0.3771219957407971, + "grad_norm": 0.9017341732978821, + "learning_rate": 1.8575326772195244e-05, + "loss": 0.4424, + "step": 3099 + }, + { + "epoch": 0.3772436872528141, + "grad_norm": 1.8384252786636353, + "learning_rate": 1.857432311452437e-05, + "loss": 0.4114, + "step": 3100 + }, + { + "epoch": 0.37736537876483117, + "grad_norm": 3.7494544982910156, + "learning_rate": 1.8573319130582756e-05, + "loss": 0.5206, + "step": 3101 + }, + { + "epoch": 0.3774870702768482, + "grad_norm": 1.9978009462356567, + "learning_rate": 1.85723148204086e-05, + "loss": 0.4427, + "step": 3102 + }, + { + "epoch": 0.3776087617888652, + "grad_norm": 1.0292961597442627, + "learning_rate": 1.8571310184040124e-05, + "loss": 0.4257, + "step": 3103 + }, + { + "epoch": 0.3777304533008823, + "grad_norm": 2.8446903228759766, + "learning_rate": 1.8570305221515556e-05, + "loss": 0.4582, + "step": 3104 + }, + { + "epoch": 0.3778521448128993, + "grad_norm": 4.5071587562561035, + "learning_rate": 1.856929993287313e-05, + "loss": 0.5599, + "step": 3105 + }, + { + "epoch": 0.3779738363249163, + "grad_norm": 2.5502004623413086, + "learning_rate": 1.8568294318151104e-05, + "loss": 0.4947, + "step": 3106 + }, + { + "epoch": 0.3780955278369334, + "grad_norm": 2.0243778228759766, + "learning_rate": 1.8567288377387745e-05, + "loss": 0.4753, + "step": 3107 + }, + { + "epoch": 0.3782172193489504, + "grad_norm": 2.132016181945801, + "learning_rate": 1.856628211062133e-05, + "loss": 0.4193, + "step": 3108 + }, + { + "epoch": 0.37833891086096744, + "grad_norm": 1.9406275749206543, + "learning_rate": 1.856527551789015e-05, + "loss": 0.551, + "step": 3109 + }, + { + "epoch": 0.3784606023729845, + "grad_norm": 2.985602617263794, + "learning_rate": 1.8564268599232507e-05, + "loss": 0.4908, + "step": 3110 + }, + { + "epoch": 0.37858229388500153, + "grad_norm": 3.1786415576934814, + "learning_rate": 1.8563261354686718e-05, + "loss": 0.4849, + "step": 3111 + }, + { + "epoch": 0.37870398539701855, + "grad_norm": 1.6061556339263916, + "learning_rate": 1.8562253784291108e-05, + "loss": 0.5219, + "step": 3112 + }, + { + "epoch": 0.37882567690903557, + "grad_norm": 2.637531042098999, + "learning_rate": 1.8561245888084017e-05, + "loss": 0.4505, + "step": 3113 + }, + { + "epoch": 0.37894736842105264, + "grad_norm": 3.5307555198669434, + "learning_rate": 1.8560237666103803e-05, + "loss": 0.4024, + "step": 3114 + }, + { + "epoch": 0.37906905993306966, + "grad_norm": 0.8938859701156616, + "learning_rate": 1.8559229118388825e-05, + "loss": 0.4621, + "step": 3115 + }, + { + "epoch": 0.3791907514450867, + "grad_norm": 0.8579207062721252, + "learning_rate": 1.8558220244977462e-05, + "loss": 0.4147, + "step": 3116 + }, + { + "epoch": 0.37931244295710376, + "grad_norm": 3.38228440284729, + "learning_rate": 1.8557211045908104e-05, + "loss": 0.4969, + "step": 3117 + }, + { + "epoch": 0.3794341344691208, + "grad_norm": 4.783172607421875, + "learning_rate": 1.8556201521219152e-05, + "loss": 0.5167, + "step": 3118 + }, + { + "epoch": 0.3795558259811378, + "grad_norm": 3.9394025802612305, + "learning_rate": 1.8555191670949028e-05, + "loss": 0.491, + "step": 3119 + }, + { + "epoch": 0.37967751749315487, + "grad_norm": 3.0613887310028076, + "learning_rate": 1.8554181495136146e-05, + "loss": 0.438, + "step": 3120 + }, + { + "epoch": 0.3797992090051719, + "grad_norm": 0.802926242351532, + "learning_rate": 1.855317099381895e-05, + "loss": 0.3874, + "step": 3121 + }, + { + "epoch": 0.3799209005171889, + "grad_norm": 2.5228464603424072, + "learning_rate": 1.8552160167035902e-05, + "loss": 0.489, + "step": 3122 + }, + { + "epoch": 0.380042592029206, + "grad_norm": 0.8206809163093567, + "learning_rate": 1.855114901482545e-05, + "loss": 0.4929, + "step": 3123 + }, + { + "epoch": 0.380164283541223, + "grad_norm": 1.3448454141616821, + "learning_rate": 1.8550137537226085e-05, + "loss": 0.4734, + "step": 3124 + }, + { + "epoch": 0.38028597505324, + "grad_norm": 1.8659371137619019, + "learning_rate": 1.8549125734276284e-05, + "loss": 0.4574, + "step": 3125 + }, + { + "epoch": 0.3804076665652571, + "grad_norm": 1.289597749710083, + "learning_rate": 1.854811360601456e-05, + "loss": 0.4602, + "step": 3126 + }, + { + "epoch": 0.3805293580772741, + "grad_norm": 1.759281039237976, + "learning_rate": 1.854710115247941e-05, + "loss": 0.5542, + "step": 3127 + }, + { + "epoch": 0.38065104958929113, + "grad_norm": 0.6956965327262878, + "learning_rate": 1.8546088373709373e-05, + "loss": 0.4512, + "step": 3128 + }, + { + "epoch": 0.3807727411013082, + "grad_norm": 2.488795757293701, + "learning_rate": 1.8545075269742986e-05, + "loss": 0.5226, + "step": 3129 + }, + { + "epoch": 0.38089443261332523, + "grad_norm": 1.9304710626602173, + "learning_rate": 1.8544061840618798e-05, + "loss": 0.5039, + "step": 3130 + }, + { + "epoch": 0.38101612412534225, + "grad_norm": 2.1361005306243896, + "learning_rate": 1.8543048086375368e-05, + "loss": 0.466, + "step": 3131 + }, + { + "epoch": 0.38113781563735927, + "grad_norm": 0.9239948987960815, + "learning_rate": 1.8542034007051278e-05, + "loss": 0.5134, + "step": 3132 + }, + { + "epoch": 0.38125950714937634, + "grad_norm": 1.5530760288238525, + "learning_rate": 1.854101960268511e-05, + "loss": 0.4617, + "step": 3133 + }, + { + "epoch": 0.38138119866139336, + "grad_norm": 2.8756372928619385, + "learning_rate": 1.854000487331547e-05, + "loss": 0.5835, + "step": 3134 + }, + { + "epoch": 0.3815028901734104, + "grad_norm": 1.1003234386444092, + "learning_rate": 1.8538989818980967e-05, + "loss": 0.4987, + "step": 3135 + }, + { + "epoch": 0.38162458168542746, + "grad_norm": 0.6150041222572327, + "learning_rate": 1.853797443972023e-05, + "loss": 0.5037, + "step": 3136 + }, + { + "epoch": 0.3817462731974445, + "grad_norm": 1.2901042699813843, + "learning_rate": 1.853695873557189e-05, + "loss": 0.473, + "step": 3137 + }, + { + "epoch": 0.3818679647094615, + "grad_norm": 2.51076078414917, + "learning_rate": 1.8535942706574598e-05, + "loss": 0.4695, + "step": 3138 + }, + { + "epoch": 0.38198965622147857, + "grad_norm": 1.9291123151779175, + "learning_rate": 1.853492635276702e-05, + "loss": 0.4401, + "step": 3139 + }, + { + "epoch": 0.3821113477334956, + "grad_norm": 2.271430730819702, + "learning_rate": 1.8533909674187828e-05, + "loss": 0.4886, + "step": 3140 + }, + { + "epoch": 0.3822330392455126, + "grad_norm": 3.22769832611084, + "learning_rate": 1.8532892670875707e-05, + "loss": 0.5278, + "step": 3141 + }, + { + "epoch": 0.3823547307575297, + "grad_norm": 1.6389480829238892, + "learning_rate": 1.8531875342869355e-05, + "loss": 0.4236, + "step": 3142 + }, + { + "epoch": 0.3824764222695467, + "grad_norm": 0.8063991665840149, + "learning_rate": 1.8530857690207492e-05, + "loss": 0.4253, + "step": 3143 + }, + { + "epoch": 0.3825981137815637, + "grad_norm": 0.7704494595527649, + "learning_rate": 1.852983971292883e-05, + "loss": 0.4495, + "step": 3144 + }, + { + "epoch": 0.3827198052935808, + "grad_norm": 1.41098952293396, + "learning_rate": 1.8528821411072118e-05, + "loss": 0.4881, + "step": 3145 + }, + { + "epoch": 0.3828414968055978, + "grad_norm": 0.8357442021369934, + "learning_rate": 1.852780278467609e-05, + "loss": 0.4332, + "step": 3146 + }, + { + "epoch": 0.38296318831761483, + "grad_norm": 2.3424081802368164, + "learning_rate": 1.852678383377952e-05, + "loss": 0.4608, + "step": 3147 + }, + { + "epoch": 0.3830848798296319, + "grad_norm": 1.8472098112106323, + "learning_rate": 1.8525764558421175e-05, + "loss": 0.4991, + "step": 3148 + }, + { + "epoch": 0.3832065713416489, + "grad_norm": 0.8137487173080444, + "learning_rate": 1.852474495863984e-05, + "loss": 0.494, + "step": 3149 + }, + { + "epoch": 0.38332826285366595, + "grad_norm": 1.8833178281784058, + "learning_rate": 1.8523725034474317e-05, + "loss": 0.4695, + "step": 3150 + }, + { + "epoch": 0.383449954365683, + "grad_norm": 1.8519325256347656, + "learning_rate": 1.8522704785963412e-05, + "loss": 0.4628, + "step": 3151 + }, + { + "epoch": 0.38357164587770004, + "grad_norm": 0.9221046566963196, + "learning_rate": 1.8521684213145947e-05, + "loss": 0.464, + "step": 3152 + }, + { + "epoch": 0.38369333738971706, + "grad_norm": 0.7965523600578308, + "learning_rate": 1.852066331606076e-05, + "loss": 0.4283, + "step": 3153 + }, + { + "epoch": 0.3838150289017341, + "grad_norm": 2.841887950897217, + "learning_rate": 1.8519642094746696e-05, + "loss": 0.4569, + "step": 3154 + }, + { + "epoch": 0.38393672041375115, + "grad_norm": 5.291511058807373, + "learning_rate": 1.8518620549242615e-05, + "loss": 0.517, + "step": 3155 + }, + { + "epoch": 0.3840584119257682, + "grad_norm": 3.747833490371704, + "learning_rate": 1.8517598679587393e-05, + "loss": 0.509, + "step": 3156 + }, + { + "epoch": 0.3841801034377852, + "grad_norm": 0.9247905611991882, + "learning_rate": 1.8516576485819907e-05, + "loss": 0.4003, + "step": 3157 + }, + { + "epoch": 0.38430179494980227, + "grad_norm": 2.4241535663604736, + "learning_rate": 1.8515553967979058e-05, + "loss": 0.4917, + "step": 3158 + }, + { + "epoch": 0.3844234864618193, + "grad_norm": 0.6749876737594604, + "learning_rate": 1.8514531126103756e-05, + "loss": 0.4372, + "step": 3159 + }, + { + "epoch": 0.3845451779738363, + "grad_norm": 2.327443838119507, + "learning_rate": 1.8513507960232918e-05, + "loss": 0.505, + "step": 3160 + }, + { + "epoch": 0.3846668694858534, + "grad_norm": 1.2845443487167358, + "learning_rate": 1.8512484470405482e-05, + "loss": 0.4301, + "step": 3161 + }, + { + "epoch": 0.3847885609978704, + "grad_norm": 0.9656535387039185, + "learning_rate": 1.8511460656660393e-05, + "loss": 0.5199, + "step": 3162 + }, + { + "epoch": 0.3849102525098874, + "grad_norm": 1.7029277086257935, + "learning_rate": 1.8510436519036606e-05, + "loss": 0.4509, + "step": 3163 + }, + { + "epoch": 0.3850319440219045, + "grad_norm": 1.0286000967025757, + "learning_rate": 1.8509412057573096e-05, + "loss": 0.4566, + "step": 3164 + }, + { + "epoch": 0.3851536355339215, + "grad_norm": 2.8110828399658203, + "learning_rate": 1.8508387272308843e-05, + "loss": 0.4685, + "step": 3165 + }, + { + "epoch": 0.38527532704593853, + "grad_norm": 0.7943376898765564, + "learning_rate": 1.8507362163282844e-05, + "loss": 0.4981, + "step": 3166 + }, + { + "epoch": 0.3853970185579556, + "grad_norm": 1.1829615831375122, + "learning_rate": 1.8506336730534104e-05, + "loss": 0.4821, + "step": 3167 + }, + { + "epoch": 0.3855187100699726, + "grad_norm": 2.0051746368408203, + "learning_rate": 1.8505310974101643e-05, + "loss": 0.4552, + "step": 3168 + }, + { + "epoch": 0.38564040158198964, + "grad_norm": 1.1509095430374146, + "learning_rate": 1.8504284894024497e-05, + "loss": 0.4286, + "step": 3169 + }, + { + "epoch": 0.3857620930940067, + "grad_norm": 2.34548020362854, + "learning_rate": 1.8503258490341706e-05, + "loss": 0.4734, + "step": 3170 + }, + { + "epoch": 0.38588378460602374, + "grad_norm": 0.5796806216239929, + "learning_rate": 1.850223176309233e-05, + "loss": 0.417, + "step": 3171 + }, + { + "epoch": 0.38600547611804076, + "grad_norm": 1.1869587898254395, + "learning_rate": 1.8501204712315433e-05, + "loss": 0.433, + "step": 3172 + }, + { + "epoch": 0.3861271676300578, + "grad_norm": 1.4356694221496582, + "learning_rate": 1.8500177338050104e-05, + "loss": 0.4626, + "step": 3173 + }, + { + "epoch": 0.38624885914207485, + "grad_norm": 3.639671802520752, + "learning_rate": 1.8499149640335432e-05, + "loss": 0.5392, + "step": 3174 + }, + { + "epoch": 0.38637055065409187, + "grad_norm": 1.6376925706863403, + "learning_rate": 1.8498121619210523e-05, + "loss": 0.4046, + "step": 3175 + }, + { + "epoch": 0.3864922421661089, + "grad_norm": 1.9681793451309204, + "learning_rate": 1.84970932747145e-05, + "loss": 0.4775, + "step": 3176 + }, + { + "epoch": 0.38661393367812596, + "grad_norm": 2.046285390853882, + "learning_rate": 1.8496064606886485e-05, + "loss": 0.3908, + "step": 3177 + }, + { + "epoch": 0.386735625190143, + "grad_norm": 1.4376453161239624, + "learning_rate": 1.8495035615765625e-05, + "loss": 0.5125, + "step": 3178 + }, + { + "epoch": 0.38685731670216, + "grad_norm": 0.7068982124328613, + "learning_rate": 1.8494006301391083e-05, + "loss": 0.5078, + "step": 3179 + }, + { + "epoch": 0.3869790082141771, + "grad_norm": 1.790786623954773, + "learning_rate": 1.8492976663802013e-05, + "loss": 0.5096, + "step": 3180 + }, + { + "epoch": 0.3871006997261941, + "grad_norm": 1.2692198753356934, + "learning_rate": 1.8491946703037604e-05, + "loss": 0.5065, + "step": 3181 + }, + { + "epoch": 0.3872223912382111, + "grad_norm": 1.8550636768341064, + "learning_rate": 1.8490916419137046e-05, + "loss": 0.4894, + "step": 3182 + }, + { + "epoch": 0.3873440827502282, + "grad_norm": 2.1490261554718018, + "learning_rate": 1.8489885812139543e-05, + "loss": 0.4483, + "step": 3183 + }, + { + "epoch": 0.3874657742622452, + "grad_norm": 1.4833805561065674, + "learning_rate": 1.8488854882084312e-05, + "loss": 0.4461, + "step": 3184 + }, + { + "epoch": 0.38758746577426223, + "grad_norm": 2.0343050956726074, + "learning_rate": 1.8487823629010582e-05, + "loss": 0.4807, + "step": 3185 + }, + { + "epoch": 0.3877091572862793, + "grad_norm": 1.3174079656600952, + "learning_rate": 1.8486792052957593e-05, + "loss": 0.4724, + "step": 3186 + }, + { + "epoch": 0.3878308487982963, + "grad_norm": 2.131606101989746, + "learning_rate": 1.84857601539646e-05, + "loss": 0.489, + "step": 3187 + }, + { + "epoch": 0.38795254031031334, + "grad_norm": 2.056992769241333, + "learning_rate": 1.8484727932070864e-05, + "loss": 0.4577, + "step": 3188 + }, + { + "epoch": 0.3880742318223304, + "grad_norm": 0.6132264137268066, + "learning_rate": 1.8483695387315675e-05, + "loss": 0.4851, + "step": 3189 + }, + { + "epoch": 0.38819592333434744, + "grad_norm": 2.0401928424835205, + "learning_rate": 1.8482662519738313e-05, + "loss": 0.4652, + "step": 3190 + }, + { + "epoch": 0.38831761484636446, + "grad_norm": 1.4590213298797607, + "learning_rate": 1.8481629329378084e-05, + "loss": 0.4427, + "step": 3191 + }, + { + "epoch": 0.3884393063583815, + "grad_norm": 1.439366102218628, + "learning_rate": 1.8480595816274305e-05, + "loss": 0.4303, + "step": 3192 + }, + { + "epoch": 0.38856099787039855, + "grad_norm": 0.7999074459075928, + "learning_rate": 1.84795619804663e-05, + "loss": 0.4477, + "step": 3193 + }, + { + "epoch": 0.38868268938241557, + "grad_norm": 1.023721694946289, + "learning_rate": 1.847852782199341e-05, + "loss": 0.4605, + "step": 3194 + }, + { + "epoch": 0.3888043808944326, + "grad_norm": 1.1404272317886353, + "learning_rate": 1.8477493340894984e-05, + "loss": 0.4423, + "step": 3195 + }, + { + "epoch": 0.38892607240644966, + "grad_norm": 1.0734004974365234, + "learning_rate": 1.8476458537210393e-05, + "loss": 0.4737, + "step": 3196 + }, + { + "epoch": 0.3890477639184667, + "grad_norm": 2.0375170707702637, + "learning_rate": 1.8475423410979006e-05, + "loss": 0.5029, + "step": 3197 + }, + { + "epoch": 0.3891694554304837, + "grad_norm": 2.0767176151275635, + "learning_rate": 1.8474387962240218e-05, + "loss": 0.4329, + "step": 3198 + }, + { + "epoch": 0.3892911469425008, + "grad_norm": 1.063238263130188, + "learning_rate": 1.8473352191033425e-05, + "loss": 0.4815, + "step": 3199 + }, + { + "epoch": 0.3894128384545178, + "grad_norm": 0.7509887218475342, + "learning_rate": 1.8472316097398045e-05, + "loss": 0.5098, + "step": 3200 + }, + { + "epoch": 0.3895345299665348, + "grad_norm": 0.647834062576294, + "learning_rate": 1.8471279681373496e-05, + "loss": 0.4813, + "step": 3201 + }, + { + "epoch": 0.3896562214785519, + "grad_norm": 1.1288594007492065, + "learning_rate": 1.8470242942999225e-05, + "loss": 0.4776, + "step": 3202 + }, + { + "epoch": 0.3897779129905689, + "grad_norm": 2.3126728534698486, + "learning_rate": 1.8469205882314675e-05, + "loss": 0.4457, + "step": 3203 + }, + { + "epoch": 0.38989960450258593, + "grad_norm": 1.7314441204071045, + "learning_rate": 1.8468168499359312e-05, + "loss": 0.5302, + "step": 3204 + }, + { + "epoch": 0.390021296014603, + "grad_norm": 0.7410517334938049, + "learning_rate": 1.8467130794172606e-05, + "loss": 0.4846, + "step": 3205 + }, + { + "epoch": 0.39014298752662, + "grad_norm": 1.3053652048110962, + "learning_rate": 1.8466092766794052e-05, + "loss": 0.4721, + "step": 3206 + }, + { + "epoch": 0.39026467903863704, + "grad_norm": 0.9628663659095764, + "learning_rate": 1.8465054417263142e-05, + "loss": 0.4574, + "step": 3207 + }, + { + "epoch": 0.3903863705506541, + "grad_norm": 0.7171255350112915, + "learning_rate": 1.8464015745619392e-05, + "loss": 0.497, + "step": 3208 + }, + { + "epoch": 0.39050806206267114, + "grad_norm": 1.5145084857940674, + "learning_rate": 1.846297675190232e-05, + "loss": 0.4119, + "step": 3209 + }, + { + "epoch": 0.39062975357468815, + "grad_norm": 1.139266848564148, + "learning_rate": 1.846193743615147e-05, + "loss": 0.439, + "step": 3210 + }, + { + "epoch": 0.39075144508670523, + "grad_norm": 1.806175947189331, + "learning_rate": 1.846089779840638e-05, + "loss": 0.4214, + "step": 3211 + }, + { + "epoch": 0.39087313659872225, + "grad_norm": 1.168665885925293, + "learning_rate": 1.8459857838706622e-05, + "loss": 0.4281, + "step": 3212 + }, + { + "epoch": 0.39099482811073927, + "grad_norm": 0.6928753852844238, + "learning_rate": 1.8458817557091757e-05, + "loss": 0.42, + "step": 3213 + }, + { + "epoch": 0.3911165196227563, + "grad_norm": 2.061511278152466, + "learning_rate": 1.8457776953601377e-05, + "loss": 0.4984, + "step": 3214 + }, + { + "epoch": 0.39123821113477336, + "grad_norm": 2.9306259155273438, + "learning_rate": 1.8456736028275075e-05, + "loss": 0.4753, + "step": 3215 + }, + { + "epoch": 0.3913599026467904, + "grad_norm": 1.4316598176956177, + "learning_rate": 1.8455694781152463e-05, + "loss": 0.4614, + "step": 3216 + }, + { + "epoch": 0.3914815941588074, + "grad_norm": 1.952510118484497, + "learning_rate": 1.8454653212273165e-05, + "loss": 0.5211, + "step": 3217 + }, + { + "epoch": 0.3916032856708245, + "grad_norm": 1.8742213249206543, + "learning_rate": 1.845361132167681e-05, + "loss": 0.5503, + "step": 3218 + }, + { + "epoch": 0.3917249771828415, + "grad_norm": 3.4823076725006104, + "learning_rate": 1.8452569109403045e-05, + "loss": 0.4356, + "step": 3219 + }, + { + "epoch": 0.3918466686948585, + "grad_norm": 2.8115499019622803, + "learning_rate": 1.8451526575491537e-05, + "loss": 0.4885, + "step": 3220 + }, + { + "epoch": 0.3919683602068756, + "grad_norm": 3.0544533729553223, + "learning_rate": 1.845048371998194e-05, + "loss": 0.5334, + "step": 3221 + }, + { + "epoch": 0.3920900517188926, + "grad_norm": 2.2687594890594482, + "learning_rate": 1.8449440542913953e-05, + "loss": 0.5203, + "step": 3222 + }, + { + "epoch": 0.3922117432309096, + "grad_norm": 6.608068466186523, + "learning_rate": 1.844839704432726e-05, + "loss": 0.4333, + "step": 3223 + }, + { + "epoch": 0.3923334347429267, + "grad_norm": 2.947314739227295, + "learning_rate": 1.8447353224261572e-05, + "loss": 0.4973, + "step": 3224 + }, + { + "epoch": 0.3924551262549437, + "grad_norm": 1.3329120874404907, + "learning_rate": 1.844630908275661e-05, + "loss": 0.4909, + "step": 3225 + }, + { + "epoch": 0.39257681776696074, + "grad_norm": 0.5378501415252686, + "learning_rate": 1.8445264619852103e-05, + "loss": 0.4375, + "step": 3226 + }, + { + "epoch": 0.3926985092789778, + "grad_norm": 2.282072067260742, + "learning_rate": 1.8444219835587798e-05, + "loss": 0.44, + "step": 3227 + }, + { + "epoch": 0.39282020079099483, + "grad_norm": 1.0764062404632568, + "learning_rate": 1.844317473000345e-05, + "loss": 0.3869, + "step": 3228 + }, + { + "epoch": 0.39294189230301185, + "grad_norm": 3.2027347087860107, + "learning_rate": 1.8442129303138825e-05, + "loss": 0.4877, + "step": 3229 + }, + { + "epoch": 0.3930635838150289, + "grad_norm": 2.674499988555908, + "learning_rate": 1.8441083555033706e-05, + "loss": 0.4407, + "step": 3230 + }, + { + "epoch": 0.39318527532704595, + "grad_norm": 3.659951686859131, + "learning_rate": 1.8440037485727887e-05, + "loss": 0.4509, + "step": 3231 + }, + { + "epoch": 0.39330696683906297, + "grad_norm": 6.1977105140686035, + "learning_rate": 1.843899109526117e-05, + "loss": 0.5455, + "step": 3232 + }, + { + "epoch": 0.39342865835108, + "grad_norm": 1.3173803091049194, + "learning_rate": 1.8437944383673377e-05, + "loss": 0.4109, + "step": 3233 + }, + { + "epoch": 0.39355034986309706, + "grad_norm": 2.1625349521636963, + "learning_rate": 1.8436897351004336e-05, + "loss": 0.4817, + "step": 3234 + }, + { + "epoch": 0.3936720413751141, + "grad_norm": 0.7124523520469666, + "learning_rate": 1.8435849997293883e-05, + "loss": 0.4062, + "step": 3235 + }, + { + "epoch": 0.3937937328871311, + "grad_norm": 2.6680171489715576, + "learning_rate": 1.8434802322581877e-05, + "loss": 0.5378, + "step": 3236 + }, + { + "epoch": 0.3939154243991482, + "grad_norm": 1.0353717803955078, + "learning_rate": 1.8433754326908185e-05, + "loss": 0.5727, + "step": 3237 + }, + { + "epoch": 0.3940371159111652, + "grad_norm": 1.7438045740127563, + "learning_rate": 1.8432706010312684e-05, + "loss": 0.5204, + "step": 3238 + }, + { + "epoch": 0.3941588074231822, + "grad_norm": 6.921764850616455, + "learning_rate": 1.8431657372835264e-05, + "loss": 0.4544, + "step": 3239 + }, + { + "epoch": 0.3942804989351993, + "grad_norm": 4.179312705993652, + "learning_rate": 1.8430608414515828e-05, + "loss": 0.5309, + "step": 3240 + }, + { + "epoch": 0.3944021904472163, + "grad_norm": 5.485282897949219, + "learning_rate": 1.842955913539429e-05, + "loss": 0.4554, + "step": 3241 + }, + { + "epoch": 0.3945238819592333, + "grad_norm": 7.443978309631348, + "learning_rate": 1.842850953551058e-05, + "loss": 0.4626, + "step": 3242 + }, + { + "epoch": 0.3946455734712504, + "grad_norm": 2.6822259426116943, + "learning_rate": 1.8427459614904636e-05, + "loss": 0.5191, + "step": 3243 + }, + { + "epoch": 0.3947672649832674, + "grad_norm": 4.467551231384277, + "learning_rate": 1.8426409373616412e-05, + "loss": 0.4317, + "step": 3244 + }, + { + "epoch": 0.39488895649528444, + "grad_norm": 2.4827613830566406, + "learning_rate": 1.842535881168587e-05, + "loss": 0.4193, + "step": 3245 + }, + { + "epoch": 0.3950106480073015, + "grad_norm": 1.2630842924118042, + "learning_rate": 1.8424307929152983e-05, + "loss": 0.4623, + "step": 3246 + }, + { + "epoch": 0.39513233951931853, + "grad_norm": 2.2650156021118164, + "learning_rate": 1.842325672605774e-05, + "loss": 0.4433, + "step": 3247 + }, + { + "epoch": 0.39525403103133555, + "grad_norm": 3.270244836807251, + "learning_rate": 1.8422205202440148e-05, + "loss": 0.4915, + "step": 3248 + }, + { + "epoch": 0.3953757225433526, + "grad_norm": 3.1242475509643555, + "learning_rate": 1.8421153358340213e-05, + "loss": 0.4862, + "step": 3249 + }, + { + "epoch": 0.39549741405536964, + "grad_norm": 3.5277795791625977, + "learning_rate": 1.842010119379796e-05, + "loss": 0.4808, + "step": 3250 + }, + { + "epoch": 0.39561910556738666, + "grad_norm": 2.312628746032715, + "learning_rate": 1.8419048708853433e-05, + "loss": 0.4654, + "step": 3251 + }, + { + "epoch": 0.39574079707940374, + "grad_norm": 0.6908182501792908, + "learning_rate": 1.841799590354667e-05, + "loss": 0.4268, + "step": 3252 + }, + { + "epoch": 0.39586248859142076, + "grad_norm": 4.046888828277588, + "learning_rate": 1.841694277791774e-05, + "loss": 0.555, + "step": 3253 + }, + { + "epoch": 0.3959841801034378, + "grad_norm": 1.4682831764221191, + "learning_rate": 1.8415889332006718e-05, + "loss": 0.5175, + "step": 3254 + }, + { + "epoch": 0.3961058716154548, + "grad_norm": 1.267814040184021, + "learning_rate": 1.8414835565853687e-05, + "loss": 0.497, + "step": 3255 + }, + { + "epoch": 0.39622756312747187, + "grad_norm": 3.2317216396331787, + "learning_rate": 1.8413781479498746e-05, + "loss": 0.4678, + "step": 3256 + }, + { + "epoch": 0.3963492546394889, + "grad_norm": 4.2852888107299805, + "learning_rate": 1.8412727072982e-05, + "loss": 0.4886, + "step": 3257 + }, + { + "epoch": 0.3964709461515059, + "grad_norm": 6.081776142120361, + "learning_rate": 1.8411672346343575e-05, + "loss": 0.4546, + "step": 3258 + }, + { + "epoch": 0.396592637663523, + "grad_norm": 3.301145315170288, + "learning_rate": 1.8410617299623607e-05, + "loss": 0.4797, + "step": 3259 + }, + { + "epoch": 0.39671432917554, + "grad_norm": 2.7169342041015625, + "learning_rate": 1.8409561932862244e-05, + "loss": 0.4636, + "step": 3260 + }, + { + "epoch": 0.396836020687557, + "grad_norm": 0.7840808033943176, + "learning_rate": 1.8408506246099644e-05, + "loss": 0.4668, + "step": 3261 + }, + { + "epoch": 0.3969577121995741, + "grad_norm": 0.6465523838996887, + "learning_rate": 1.8407450239375976e-05, + "loss": 0.4728, + "step": 3262 + }, + { + "epoch": 0.3970794037115911, + "grad_norm": 2.52044677734375, + "learning_rate": 1.840639391273142e-05, + "loss": 0.3897, + "step": 3263 + }, + { + "epoch": 0.39720109522360814, + "grad_norm": 4.680634021759033, + "learning_rate": 1.8405337266206178e-05, + "loss": 0.543, + "step": 3264 + }, + { + "epoch": 0.3973227867356252, + "grad_norm": 3.9424571990966797, + "learning_rate": 1.8404280299840452e-05, + "loss": 0.4894, + "step": 3265 + }, + { + "epoch": 0.39744447824764223, + "grad_norm": 2.4979941844940186, + "learning_rate": 1.840322301367447e-05, + "loss": 0.4603, + "step": 3266 + }, + { + "epoch": 0.39756616975965925, + "grad_norm": 3.376971483230591, + "learning_rate": 1.8402165407748453e-05, + "loss": 0.5014, + "step": 3267 + }, + { + "epoch": 0.3976878612716763, + "grad_norm": 1.4296678304672241, + "learning_rate": 1.8401107482102654e-05, + "loss": 0.3858, + "step": 3268 + }, + { + "epoch": 0.39780955278369334, + "grad_norm": 3.5760421752929688, + "learning_rate": 1.8400049236777328e-05, + "loss": 0.5213, + "step": 3269 + }, + { + "epoch": 0.39793124429571036, + "grad_norm": 3.6825640201568604, + "learning_rate": 1.8398990671812738e-05, + "loss": 0.4906, + "step": 3270 + }, + { + "epoch": 0.39805293580772744, + "grad_norm": 1.4299966096878052, + "learning_rate": 1.839793178724917e-05, + "loss": 0.484, + "step": 3271 + }, + { + "epoch": 0.39817462731974446, + "grad_norm": 2.198282241821289, + "learning_rate": 1.839687258312691e-05, + "loss": 0.4378, + "step": 3272 + }, + { + "epoch": 0.3982963188317615, + "grad_norm": 0.8088342547416687, + "learning_rate": 1.8395813059486273e-05, + "loss": 0.5123, + "step": 3273 + }, + { + "epoch": 0.3984180103437785, + "grad_norm": 2.553713083267212, + "learning_rate": 1.839475321636757e-05, + "loss": 0.501, + "step": 3274 + }, + { + "epoch": 0.39853970185579557, + "grad_norm": 2.351316213607788, + "learning_rate": 1.839369305381113e-05, + "loss": 0.4584, + "step": 3275 + }, + { + "epoch": 0.3986613933678126, + "grad_norm": 3.3632233142852783, + "learning_rate": 1.8392632571857294e-05, + "loss": 0.4518, + "step": 3276 + }, + { + "epoch": 0.3987830848798296, + "grad_norm": 1.2141530513763428, + "learning_rate": 1.839157177054642e-05, + "loss": 0.4583, + "step": 3277 + }, + { + "epoch": 0.3989047763918467, + "grad_norm": 1.5478171110153198, + "learning_rate": 1.8390510649918867e-05, + "loss": 0.4387, + "step": 3278 + }, + { + "epoch": 0.3990264679038637, + "grad_norm": 3.9410643577575684, + "learning_rate": 1.8389449210015017e-05, + "loss": 0.5018, + "step": 3279 + }, + { + "epoch": 0.3991481594158807, + "grad_norm": 1.6830229759216309, + "learning_rate": 1.838838745087526e-05, + "loss": 0.4476, + "step": 3280 + }, + { + "epoch": 0.3992698509278978, + "grad_norm": 3.652818441390991, + "learning_rate": 1.838732537254e-05, + "loss": 0.5367, + "step": 3281 + }, + { + "epoch": 0.3993915424399148, + "grad_norm": 3.036189556121826, + "learning_rate": 1.8386262975049644e-05, + "loss": 0.475, + "step": 3282 + }, + { + "epoch": 0.39951323395193183, + "grad_norm": 0.6534156203269958, + "learning_rate": 1.8385200258444628e-05, + "loss": 0.4087, + "step": 3283 + }, + { + "epoch": 0.3996349254639489, + "grad_norm": 2.4086899757385254, + "learning_rate": 1.8384137222765383e-05, + "loss": 0.4772, + "step": 3284 + }, + { + "epoch": 0.39975661697596593, + "grad_norm": 3.8731942176818848, + "learning_rate": 1.838307386805236e-05, + "loss": 0.5396, + "step": 3285 + }, + { + "epoch": 0.39987830848798295, + "grad_norm": 4.17984676361084, + "learning_rate": 1.838201019434603e-05, + "loss": 0.4055, + "step": 3286 + }, + { + "epoch": 0.4, + "grad_norm": 3.8723251819610596, + "learning_rate": 1.8380946201686857e-05, + "loss": 0.5522, + "step": 3287 + }, + { + "epoch": 0.40012169151201704, + "grad_norm": 1.8191925287246704, + "learning_rate": 1.8379881890115338e-05, + "loss": 0.4723, + "step": 3288 + }, + { + "epoch": 0.40024338302403406, + "grad_norm": 5.876353740692139, + "learning_rate": 1.8378817259671967e-05, + "loss": 0.5102, + "step": 3289 + }, + { + "epoch": 0.40036507453605114, + "grad_norm": 3.0011301040649414, + "learning_rate": 1.8377752310397254e-05, + "loss": 0.4784, + "step": 3290 + }, + { + "epoch": 0.40048676604806815, + "grad_norm": 3.0897092819213867, + "learning_rate": 1.8376687042331723e-05, + "loss": 0.4851, + "step": 3291 + }, + { + "epoch": 0.4006084575600852, + "grad_norm": 6.221704006195068, + "learning_rate": 1.8375621455515916e-05, + "loss": 0.4841, + "step": 3292 + }, + { + "epoch": 0.40073014907210225, + "grad_norm": 2.357998847961426, + "learning_rate": 1.837455554999037e-05, + "loss": 0.4625, + "step": 3293 + }, + { + "epoch": 0.40085184058411927, + "grad_norm": 1.0451438426971436, + "learning_rate": 1.8373489325795657e-05, + "loss": 0.4475, + "step": 3294 + }, + { + "epoch": 0.4009735320961363, + "grad_norm": 1.7552294731140137, + "learning_rate": 1.837242278297234e-05, + "loss": 0.4717, + "step": 3295 + }, + { + "epoch": 0.4010952236081533, + "grad_norm": 1.0604931116104126, + "learning_rate": 1.8371355921561007e-05, + "loss": 0.446, + "step": 3296 + }, + { + "epoch": 0.4012169151201704, + "grad_norm": 3.4094369411468506, + "learning_rate": 1.8370288741602255e-05, + "loss": 0.4922, + "step": 3297 + }, + { + "epoch": 0.4013386066321874, + "grad_norm": 3.9399912357330322, + "learning_rate": 1.836922124313669e-05, + "loss": 0.5141, + "step": 3298 + }, + { + "epoch": 0.4014602981442044, + "grad_norm": 3.1885385513305664, + "learning_rate": 1.8368153426204932e-05, + "loss": 0.4728, + "step": 3299 + }, + { + "epoch": 0.4015819896562215, + "grad_norm": 6.2420244216918945, + "learning_rate": 1.8367085290847612e-05, + "loss": 0.5498, + "step": 3300 + }, + { + "epoch": 0.4017036811682385, + "grad_norm": 0.5824435949325562, + "learning_rate": 1.836601683710538e-05, + "loss": 0.4129, + "step": 3301 + }, + { + "epoch": 0.40182537268025553, + "grad_norm": 1.4892432689666748, + "learning_rate": 1.836494806501889e-05, + "loss": 0.4185, + "step": 3302 + }, + { + "epoch": 0.4019470641922726, + "grad_norm": 0.5604004859924316, + "learning_rate": 1.8363878974628817e-05, + "loss": 0.4461, + "step": 3303 + }, + { + "epoch": 0.4020687557042896, + "grad_norm": 1.9960089921951294, + "learning_rate": 1.8362809565975832e-05, + "loss": 0.4965, + "step": 3304 + }, + { + "epoch": 0.40219044721630665, + "grad_norm": 0.629467248916626, + "learning_rate": 1.8361739839100634e-05, + "loss": 0.501, + "step": 3305 + }, + { + "epoch": 0.4023121387283237, + "grad_norm": 0.8893454074859619, + "learning_rate": 1.836066979404393e-05, + "loss": 0.4848, + "step": 3306 + }, + { + "epoch": 0.40243383024034074, + "grad_norm": 5.554185390472412, + "learning_rate": 1.835959943084643e-05, + "loss": 0.4314, + "step": 3307 + }, + { + "epoch": 0.40255552175235776, + "grad_norm": 3.0835459232330322, + "learning_rate": 1.835852874954887e-05, + "loss": 0.4508, + "step": 3308 + }, + { + "epoch": 0.40267721326437483, + "grad_norm": 3.098881721496582, + "learning_rate": 1.835745775019199e-05, + "loss": 0.4798, + "step": 3309 + }, + { + "epoch": 0.40279890477639185, + "grad_norm": 1.095411777496338, + "learning_rate": 1.835638643281654e-05, + "loss": 0.4785, + "step": 3310 + }, + { + "epoch": 0.40292059628840887, + "grad_norm": 4.372706413269043, + "learning_rate": 1.8355314797463297e-05, + "loss": 0.4251, + "step": 3311 + }, + { + "epoch": 0.40304228780042595, + "grad_norm": 2.5591952800750732, + "learning_rate": 1.8354242844173028e-05, + "loss": 0.4365, + "step": 3312 + }, + { + "epoch": 0.40316397931244297, + "grad_norm": 0.6773574948310852, + "learning_rate": 1.8353170572986523e-05, + "loss": 0.414, + "step": 3313 + }, + { + "epoch": 0.40328567082446, + "grad_norm": 1.0209612846374512, + "learning_rate": 1.8352097983944593e-05, + "loss": 0.3957, + "step": 3314 + }, + { + "epoch": 0.403407362336477, + "grad_norm": 3.8572816848754883, + "learning_rate": 1.8351025077088044e-05, + "loss": 0.5391, + "step": 3315 + }, + { + "epoch": 0.4035290538484941, + "grad_norm": 3.0534300804138184, + "learning_rate": 1.8349951852457707e-05, + "loss": 0.4318, + "step": 3316 + }, + { + "epoch": 0.4036507453605111, + "grad_norm": 5.404112339019775, + "learning_rate": 1.8348878310094415e-05, + "loss": 0.5699, + "step": 3317 + }, + { + "epoch": 0.4037724368725281, + "grad_norm": 6.7610602378845215, + "learning_rate": 1.8347804450039025e-05, + "loss": 0.5638, + "step": 3318 + }, + { + "epoch": 0.4038941283845452, + "grad_norm": 4.170486927032471, + "learning_rate": 1.8346730272332396e-05, + "loss": 0.4879, + "step": 3319 + }, + { + "epoch": 0.4040158198965622, + "grad_norm": 4.948755264282227, + "learning_rate": 1.8345655777015405e-05, + "loss": 0.5558, + "step": 3320 + }, + { + "epoch": 0.40413751140857923, + "grad_norm": 1.204403042793274, + "learning_rate": 1.8344580964128936e-05, + "loss": 0.4401, + "step": 3321 + }, + { + "epoch": 0.4042592029205963, + "grad_norm": 0.9451858401298523, + "learning_rate": 1.8343505833713885e-05, + "loss": 0.4616, + "step": 3322 + }, + { + "epoch": 0.4043808944326133, + "grad_norm": 1.7080293893814087, + "learning_rate": 1.8342430385811172e-05, + "loss": 0.53, + "step": 3323 + }, + { + "epoch": 0.40450258594463034, + "grad_norm": 0.9750133156776428, + "learning_rate": 1.8341354620461715e-05, + "loss": 0.5347, + "step": 3324 + }, + { + "epoch": 0.4046242774566474, + "grad_norm": 6.347476482391357, + "learning_rate": 1.8340278537706443e-05, + "loss": 0.4849, + "step": 3325 + }, + { + "epoch": 0.40474596896866444, + "grad_norm": 7.887727737426758, + "learning_rate": 1.8339202137586317e-05, + "loss": 0.4894, + "step": 3326 + }, + { + "epoch": 0.40486766048068146, + "grad_norm": 5.168460369110107, + "learning_rate": 1.8338125420142282e-05, + "loss": 0.471, + "step": 3327 + }, + { + "epoch": 0.40498935199269853, + "grad_norm": 3.7578842639923096, + "learning_rate": 1.8337048385415318e-05, + "loss": 0.5391, + "step": 3328 + }, + { + "epoch": 0.40511104350471555, + "grad_norm": 4.743853569030762, + "learning_rate": 1.8335971033446406e-05, + "loss": 0.481, + "step": 3329 + }, + { + "epoch": 0.40523273501673257, + "grad_norm": 2.081660509109497, + "learning_rate": 1.8334893364276545e-05, + "loss": 0.4572, + "step": 3330 + }, + { + "epoch": 0.40535442652874965, + "grad_norm": 0.922373354434967, + "learning_rate": 1.8333815377946736e-05, + "loss": 0.54, + "step": 3331 + }, + { + "epoch": 0.40547611804076666, + "grad_norm": 1.1141186952590942, + "learning_rate": 1.8332737074498004e-05, + "loss": 0.4836, + "step": 3332 + }, + { + "epoch": 0.4055978095527837, + "grad_norm": 2.6496200561523438, + "learning_rate": 1.833165845397138e-05, + "loss": 0.4913, + "step": 3333 + }, + { + "epoch": 0.4057195010648007, + "grad_norm": 2.3231499195098877, + "learning_rate": 1.83305795164079e-05, + "loss": 0.4828, + "step": 3334 + }, + { + "epoch": 0.4058411925768178, + "grad_norm": 1.9056240320205688, + "learning_rate": 1.832950026184863e-05, + "loss": 0.4781, + "step": 3335 + }, + { + "epoch": 0.4059628840888348, + "grad_norm": 0.7461123466491699, + "learning_rate": 1.832842069033463e-05, + "loss": 0.4762, + "step": 3336 + }, + { + "epoch": 0.4060845756008518, + "grad_norm": 1.2821344137191772, + "learning_rate": 1.832734080190699e-05, + "loss": 0.3855, + "step": 3337 + }, + { + "epoch": 0.4062062671128689, + "grad_norm": 0.6767551898956299, + "learning_rate": 1.8326260596606794e-05, + "loss": 0.4364, + "step": 3338 + }, + { + "epoch": 0.4063279586248859, + "grad_norm": 0.6140134930610657, + "learning_rate": 1.8325180074475146e-05, + "loss": 0.4084, + "step": 3339 + }, + { + "epoch": 0.40644965013690293, + "grad_norm": 1.4850640296936035, + "learning_rate": 1.8324099235553165e-05, + "loss": 0.4795, + "step": 3340 + }, + { + "epoch": 0.40657134164892, + "grad_norm": 2.3355071544647217, + "learning_rate": 1.8323018079881976e-05, + "loss": 0.5079, + "step": 3341 + }, + { + "epoch": 0.406693033160937, + "grad_norm": 1.6171776056289673, + "learning_rate": 1.8321936607502723e-05, + "loss": 0.5031, + "step": 3342 + }, + { + "epoch": 0.40681472467295404, + "grad_norm": 1.5494000911712646, + "learning_rate": 1.832085481845656e-05, + "loss": 0.4083, + "step": 3343 + }, + { + "epoch": 0.4069364161849711, + "grad_norm": 3.4000465869903564, + "learning_rate": 1.8319772712784646e-05, + "loss": 0.4416, + "step": 3344 + }, + { + "epoch": 0.40705810769698814, + "grad_norm": 1.4358357191085815, + "learning_rate": 1.8318690290528154e-05, + "loss": 0.5197, + "step": 3345 + }, + { + "epoch": 0.40717979920900516, + "grad_norm": 0.8626274466514587, + "learning_rate": 1.8317607551728285e-05, + "loss": 0.5032, + "step": 3346 + }, + { + "epoch": 0.40730149072102223, + "grad_norm": 0.861656129360199, + "learning_rate": 1.831652449642623e-05, + "loss": 0.4972, + "step": 3347 + }, + { + "epoch": 0.40742318223303925, + "grad_norm": 2.628462076187134, + "learning_rate": 1.8315441124663202e-05, + "loss": 0.4517, + "step": 3348 + }, + { + "epoch": 0.40754487374505627, + "grad_norm": 1.4902620315551758, + "learning_rate": 1.831435743648043e-05, + "loss": 0.4858, + "step": 3349 + }, + { + "epoch": 0.40766656525707334, + "grad_norm": 1.521276593208313, + "learning_rate": 1.831327343191914e-05, + "loss": 0.5016, + "step": 3350 + }, + { + "epoch": 0.40778825676909036, + "grad_norm": 3.085007667541504, + "learning_rate": 1.8312189111020595e-05, + "loss": 0.4451, + "step": 3351 + }, + { + "epoch": 0.4079099482811074, + "grad_norm": 1.2297166585922241, + "learning_rate": 1.831110447382605e-05, + "loss": 0.5072, + "step": 3352 + }, + { + "epoch": 0.40803163979312446, + "grad_norm": 0.931147575378418, + "learning_rate": 1.8310019520376773e-05, + "loss": 0.4918, + "step": 3353 + }, + { + "epoch": 0.4081533313051415, + "grad_norm": 1.8890422582626343, + "learning_rate": 1.8308934250714054e-05, + "loss": 0.4916, + "step": 3354 + }, + { + "epoch": 0.4082750228171585, + "grad_norm": 1.5281296968460083, + "learning_rate": 1.830784866487919e-05, + "loss": 0.4513, + "step": 3355 + }, + { + "epoch": 0.4083967143291755, + "grad_norm": 3.4499669075012207, + "learning_rate": 1.8306762762913487e-05, + "loss": 0.4268, + "step": 3356 + }, + { + "epoch": 0.4085184058411926, + "grad_norm": 0.7381409406661987, + "learning_rate": 1.8305676544858264e-05, + "loss": 0.4737, + "step": 3357 + }, + { + "epoch": 0.4086400973532096, + "grad_norm": 2.1498446464538574, + "learning_rate": 1.830459001075486e-05, + "loss": 0.4298, + "step": 3358 + }, + { + "epoch": 0.4087617888652266, + "grad_norm": 0.7460692524909973, + "learning_rate": 1.8303503160644616e-05, + "loss": 0.4171, + "step": 3359 + }, + { + "epoch": 0.4088834803772437, + "grad_norm": 0.7128798961639404, + "learning_rate": 1.8302415994568886e-05, + "loss": 0.4263, + "step": 3360 + }, + { + "epoch": 0.4090051718892607, + "grad_norm": 0.6934868693351746, + "learning_rate": 1.830132851256905e-05, + "loss": 0.4192, + "step": 3361 + }, + { + "epoch": 0.40912686340127774, + "grad_norm": 1.396048665046692, + "learning_rate": 1.8300240714686476e-05, + "loss": 0.4333, + "step": 3362 + }, + { + "epoch": 0.4092485549132948, + "grad_norm": 3.5061545372009277, + "learning_rate": 1.829915260096256e-05, + "loss": 0.5092, + "step": 3363 + }, + { + "epoch": 0.40937024642531183, + "grad_norm": 5.316830635070801, + "learning_rate": 1.829806417143871e-05, + "loss": 0.5483, + "step": 3364 + }, + { + "epoch": 0.40949193793732885, + "grad_norm": 3.1464223861694336, + "learning_rate": 1.8296975426156346e-05, + "loss": 0.4639, + "step": 3365 + }, + { + "epoch": 0.40961362944934593, + "grad_norm": 6.090936183929443, + "learning_rate": 1.8295886365156893e-05, + "loss": 0.5401, + "step": 3366 + }, + { + "epoch": 0.40973532096136295, + "grad_norm": 0.7507444620132446, + "learning_rate": 1.829479698848179e-05, + "loss": 0.4469, + "step": 3367 + }, + { + "epoch": 0.40985701247337997, + "grad_norm": 3.268221378326416, + "learning_rate": 1.8293707296172493e-05, + "loss": 0.4241, + "step": 3368 + }, + { + "epoch": 0.40997870398539704, + "grad_norm": 0.9399189949035645, + "learning_rate": 1.8292617288270467e-05, + "loss": 0.5179, + "step": 3369 + }, + { + "epoch": 0.41010039549741406, + "grad_norm": 1.1699978113174438, + "learning_rate": 1.829152696481719e-05, + "loss": 0.4816, + "step": 3370 + }, + { + "epoch": 0.4102220870094311, + "grad_norm": 0.703235387802124, + "learning_rate": 1.8290436325854143e-05, + "loss": 0.527, + "step": 3371 + }, + { + "epoch": 0.41034377852144815, + "grad_norm": 1.9470754861831665, + "learning_rate": 1.828934537142284e-05, + "loss": 0.4971, + "step": 3372 + }, + { + "epoch": 0.4104654700334652, + "grad_norm": 2.758213520050049, + "learning_rate": 1.8288254101564783e-05, + "loss": 0.445, + "step": 3373 + }, + { + "epoch": 0.4105871615454822, + "grad_norm": 2.5770461559295654, + "learning_rate": 1.8287162516321506e-05, + "loss": 0.4301, + "step": 3374 + }, + { + "epoch": 0.4107088530574992, + "grad_norm": 0.9836858510971069, + "learning_rate": 1.828607061573454e-05, + "loss": 0.4585, + "step": 3375 + }, + { + "epoch": 0.4108305445695163, + "grad_norm": 0.8131800293922424, + "learning_rate": 1.8284978399845435e-05, + "loss": 0.4285, + "step": 3376 + }, + { + "epoch": 0.4109522360815333, + "grad_norm": 1.0802713632583618, + "learning_rate": 1.8283885868695756e-05, + "loss": 0.4344, + "step": 3377 + }, + { + "epoch": 0.4110739275935503, + "grad_norm": 4.088990688323975, + "learning_rate": 1.8282793022327068e-05, + "loss": 0.5048, + "step": 3378 + }, + { + "epoch": 0.4111956191055674, + "grad_norm": 4.4374823570251465, + "learning_rate": 1.8281699860780965e-05, + "loss": 0.5403, + "step": 3379 + }, + { + "epoch": 0.4113173106175844, + "grad_norm": 2.3837692737579346, + "learning_rate": 1.828060638409904e-05, + "loss": 0.4597, + "step": 3380 + }, + { + "epoch": 0.41143900212960144, + "grad_norm": 3.1547815799713135, + "learning_rate": 1.8279512592322903e-05, + "loss": 0.486, + "step": 3381 + }, + { + "epoch": 0.4115606936416185, + "grad_norm": 2.082961082458496, + "learning_rate": 1.827841848549417e-05, + "loss": 0.4724, + "step": 3382 + }, + { + "epoch": 0.41168238515363553, + "grad_norm": 1.4442596435546875, + "learning_rate": 1.827732406365448e-05, + "loss": 0.4824, + "step": 3383 + }, + { + "epoch": 0.41180407666565255, + "grad_norm": 1.8321324586868286, + "learning_rate": 1.827622932684548e-05, + "loss": 0.5221, + "step": 3384 + }, + { + "epoch": 0.4119257681776696, + "grad_norm": 3.4394984245300293, + "learning_rate": 1.8275134275108817e-05, + "loss": 0.5057, + "step": 3385 + }, + { + "epoch": 0.41204745968968665, + "grad_norm": 2.7843782901763916, + "learning_rate": 1.827403890848617e-05, + "loss": 0.464, + "step": 3386 + }, + { + "epoch": 0.41216915120170367, + "grad_norm": 4.026331901550293, + "learning_rate": 1.8272943227019216e-05, + "loss": 0.4785, + "step": 3387 + }, + { + "epoch": 0.41229084271372074, + "grad_norm": 2.7709524631500244, + "learning_rate": 1.8271847230749643e-05, + "loss": 0.4911, + "step": 3388 + }, + { + "epoch": 0.41241253422573776, + "grad_norm": 1.947475552558899, + "learning_rate": 1.8270750919719167e-05, + "loss": 0.4404, + "step": 3389 + }, + { + "epoch": 0.4125342257377548, + "grad_norm": 1.105980396270752, + "learning_rate": 1.82696542939695e-05, + "loss": 0.5274, + "step": 3390 + }, + { + "epoch": 0.41265591724977185, + "grad_norm": 2.854992389678955, + "learning_rate": 1.8268557353542366e-05, + "loss": 0.4452, + "step": 3391 + }, + { + "epoch": 0.4127776087617889, + "grad_norm": 3.014014482498169, + "learning_rate": 1.826746009847951e-05, + "loss": 0.4757, + "step": 3392 + }, + { + "epoch": 0.4128993002738059, + "grad_norm": 0.8796677589416504, + "learning_rate": 1.8266362528822687e-05, + "loss": 0.4567, + "step": 3393 + }, + { + "epoch": 0.41302099178582297, + "grad_norm": 1.8404645919799805, + "learning_rate": 1.826526464461366e-05, + "loss": 0.4512, + "step": 3394 + }, + { + "epoch": 0.41314268329784, + "grad_norm": 1.5910371541976929, + "learning_rate": 1.8264166445894204e-05, + "loss": 0.4683, + "step": 3395 + }, + { + "epoch": 0.413264374809857, + "grad_norm": 0.6318998336791992, + "learning_rate": 1.8263067932706106e-05, + "loss": 0.4666, + "step": 3396 + }, + { + "epoch": 0.413386066321874, + "grad_norm": 2.4946372509002686, + "learning_rate": 1.8261969105091172e-05, + "loss": 0.507, + "step": 3397 + }, + { + "epoch": 0.4135077578338911, + "grad_norm": 0.6217384338378906, + "learning_rate": 1.826086996309121e-05, + "loss": 0.4719, + "step": 3398 + }, + { + "epoch": 0.4136294493459081, + "grad_norm": 1.6143803596496582, + "learning_rate": 1.825977050674805e-05, + "loss": 0.4742, + "step": 3399 + }, + { + "epoch": 0.41375114085792514, + "grad_norm": 1.202484369277954, + "learning_rate": 1.8258670736103523e-05, + "loss": 0.5338, + "step": 3400 + }, + { + "epoch": 0.4138728323699422, + "grad_norm": 0.6325135231018066, + "learning_rate": 1.8257570651199483e-05, + "loss": 0.4858, + "step": 3401 + }, + { + "epoch": 0.41399452388195923, + "grad_norm": 0.6948682069778442, + "learning_rate": 1.8256470252077786e-05, + "loss": 0.5216, + "step": 3402 + }, + { + "epoch": 0.41411621539397625, + "grad_norm": 4.379599094390869, + "learning_rate": 1.8255369538780307e-05, + "loss": 0.424, + "step": 3403 + }, + { + "epoch": 0.4142379069059933, + "grad_norm": 3.0073533058166504, + "learning_rate": 1.8254268511348926e-05, + "loss": 0.4858, + "step": 3404 + }, + { + "epoch": 0.41435959841801034, + "grad_norm": 3.610830783843994, + "learning_rate": 1.8253167169825545e-05, + "loss": 0.422, + "step": 3405 + }, + { + "epoch": 0.41448128993002736, + "grad_norm": 3.9430594444274902, + "learning_rate": 1.8252065514252068e-05, + "loss": 0.4568, + "step": 3406 + }, + { + "epoch": 0.41460298144204444, + "grad_norm": 2.9395554065704346, + "learning_rate": 1.8250963544670422e-05, + "loss": 0.3995, + "step": 3407 + }, + { + "epoch": 0.41472467295406146, + "grad_norm": 1.8223469257354736, + "learning_rate": 1.824986126112253e-05, + "loss": 0.4552, + "step": 3408 + }, + { + "epoch": 0.4148463644660785, + "grad_norm": 2.2542219161987305, + "learning_rate": 1.824875866365034e-05, + "loss": 0.4619, + "step": 3409 + }, + { + "epoch": 0.41496805597809555, + "grad_norm": 2.5607094764709473, + "learning_rate": 1.8247655752295814e-05, + "loss": 0.475, + "step": 3410 + }, + { + "epoch": 0.41508974749011257, + "grad_norm": 1.6788568496704102, + "learning_rate": 1.824655252710091e-05, + "loss": 0.4749, + "step": 3411 + }, + { + "epoch": 0.4152114390021296, + "grad_norm": 5.006715774536133, + "learning_rate": 1.824544898810762e-05, + "loss": 0.5662, + "step": 3412 + }, + { + "epoch": 0.41533313051414666, + "grad_norm": 2.8922886848449707, + "learning_rate": 1.8244345135357922e-05, + "loss": 0.5175, + "step": 3413 + }, + { + "epoch": 0.4154548220261637, + "grad_norm": 1.1472426652908325, + "learning_rate": 1.8243240968893827e-05, + "loss": 0.463, + "step": 3414 + }, + { + "epoch": 0.4155765135381807, + "grad_norm": 2.0325188636779785, + "learning_rate": 1.8242136488757355e-05, + "loss": 0.4689, + "step": 3415 + }, + { + "epoch": 0.4156982050501977, + "grad_norm": 1.1346851587295532, + "learning_rate": 1.8241031694990525e-05, + "loss": 0.4601, + "step": 3416 + }, + { + "epoch": 0.4158198965622148, + "grad_norm": 1.8192977905273438, + "learning_rate": 1.823992658763538e-05, + "loss": 0.501, + "step": 3417 + }, + { + "epoch": 0.4159415880742318, + "grad_norm": 2.941066026687622, + "learning_rate": 1.823882116673398e-05, + "loss": 0.4911, + "step": 3418 + }, + { + "epoch": 0.41606327958624884, + "grad_norm": 1.0002509355545044, + "learning_rate": 1.8237715432328372e-05, + "loss": 0.5257, + "step": 3419 + }, + { + "epoch": 0.4161849710982659, + "grad_norm": 0.8495650887489319, + "learning_rate": 1.8236609384460647e-05, + "loss": 0.4786, + "step": 3420 + }, + { + "epoch": 0.41630666261028293, + "grad_norm": 0.671356201171875, + "learning_rate": 1.8235503023172883e-05, + "loss": 0.474, + "step": 3421 + }, + { + "epoch": 0.41642835412229995, + "grad_norm": 0.6988985538482666, + "learning_rate": 1.8234396348507185e-05, + "loss": 0.4906, + "step": 3422 + }, + { + "epoch": 0.416550045634317, + "grad_norm": 0.5956538915634155, + "learning_rate": 1.8233289360505663e-05, + "loss": 0.4345, + "step": 3423 + }, + { + "epoch": 0.41667173714633404, + "grad_norm": 0.7774266600608826, + "learning_rate": 1.8232182059210434e-05, + "loss": 0.4751, + "step": 3424 + }, + { + "epoch": 0.41679342865835106, + "grad_norm": 0.6722429990768433, + "learning_rate": 1.823107444466364e-05, + "loss": 0.4674, + "step": 3425 + }, + { + "epoch": 0.41691512017036814, + "grad_norm": 1.5568604469299316, + "learning_rate": 1.8229966516907426e-05, + "loss": 0.4183, + "step": 3426 + }, + { + "epoch": 0.41703681168238516, + "grad_norm": 0.8228222727775574, + "learning_rate": 1.822885827598395e-05, + "loss": 0.4377, + "step": 3427 + }, + { + "epoch": 0.4171585031944022, + "grad_norm": 2.1870813369750977, + "learning_rate": 1.8227749721935386e-05, + "loss": 0.4758, + "step": 3428 + }, + { + "epoch": 0.41728019470641925, + "grad_norm": 0.8316046595573425, + "learning_rate": 1.8226640854803914e-05, + "loss": 0.4713, + "step": 3429 + }, + { + "epoch": 0.41740188621843627, + "grad_norm": 2.2942614555358887, + "learning_rate": 1.822553167463173e-05, + "loss": 0.4328, + "step": 3430 + }, + { + "epoch": 0.4175235777304533, + "grad_norm": 3.5903139114379883, + "learning_rate": 1.8224422181461035e-05, + "loss": 0.4831, + "step": 3431 + }, + { + "epoch": 0.41764526924247036, + "grad_norm": 1.6217827796936035, + "learning_rate": 1.8223312375334056e-05, + "loss": 0.4861, + "step": 3432 + }, + { + "epoch": 0.4177669607544874, + "grad_norm": 1.5356171131134033, + "learning_rate": 1.8222202256293022e-05, + "loss": 0.4808, + "step": 3433 + }, + { + "epoch": 0.4178886522665044, + "grad_norm": 3.0851364135742188, + "learning_rate": 1.8221091824380172e-05, + "loss": 0.5581, + "step": 3434 + }, + { + "epoch": 0.4180103437785214, + "grad_norm": 4.13778829574585, + "learning_rate": 1.8219981079637756e-05, + "loss": 0.4668, + "step": 3435 + }, + { + "epoch": 0.4181320352905385, + "grad_norm": 1.4807171821594238, + "learning_rate": 1.8218870022108052e-05, + "loss": 0.4669, + "step": 3436 + }, + { + "epoch": 0.4182537268025555, + "grad_norm": 1.7364503145217896, + "learning_rate": 1.821775865183333e-05, + "loss": 0.4634, + "step": 3437 + }, + { + "epoch": 0.41837541831457253, + "grad_norm": 2.777949571609497, + "learning_rate": 1.821664696885588e-05, + "loss": 0.4795, + "step": 3438 + }, + { + "epoch": 0.4184971098265896, + "grad_norm": 1.0836976766586304, + "learning_rate": 1.8215534973218005e-05, + "loss": 0.512, + "step": 3439 + }, + { + "epoch": 0.4186188013386066, + "grad_norm": 1.1404494047164917, + "learning_rate": 1.821442266496202e-05, + "loss": 0.424, + "step": 3440 + }, + { + "epoch": 0.41874049285062365, + "grad_norm": 0.692095160484314, + "learning_rate": 1.8213310044130252e-05, + "loss": 0.4325, + "step": 3441 + }, + { + "epoch": 0.4188621843626407, + "grad_norm": 0.6862213611602783, + "learning_rate": 1.821219711076503e-05, + "loss": 0.4487, + "step": 3442 + }, + { + "epoch": 0.41898387587465774, + "grad_norm": 3.447136640548706, + "learning_rate": 1.8211083864908716e-05, + "loss": 0.5145, + "step": 3443 + }, + { + "epoch": 0.41910556738667476, + "grad_norm": 1.4854779243469238, + "learning_rate": 1.820997030660366e-05, + "loss": 0.446, + "step": 3444 + }, + { + "epoch": 0.41922725889869183, + "grad_norm": 0.635477602481842, + "learning_rate": 1.8208856435892242e-05, + "loss": 0.4159, + "step": 3445 + }, + { + "epoch": 0.41934895041070885, + "grad_norm": 0.9970229268074036, + "learning_rate": 1.8207742252816848e-05, + "loss": 0.4346, + "step": 3446 + }, + { + "epoch": 0.4194706419227259, + "grad_norm": 2.287344217300415, + "learning_rate": 1.820662775741987e-05, + "loss": 0.3874, + "step": 3447 + }, + { + "epoch": 0.41959233343474295, + "grad_norm": 4.3752288818359375, + "learning_rate": 1.820551294974372e-05, + "loss": 0.5562, + "step": 3448 + }, + { + "epoch": 0.41971402494675997, + "grad_norm": 1.8130929470062256, + "learning_rate": 1.8204397829830816e-05, + "loss": 0.5413, + "step": 3449 + }, + { + "epoch": 0.419835716458777, + "grad_norm": 2.3292407989501953, + "learning_rate": 1.8203282397723595e-05, + "loss": 0.4668, + "step": 3450 + }, + { + "epoch": 0.41995740797079406, + "grad_norm": 2.0253348350524902, + "learning_rate": 1.8202166653464496e-05, + "loss": 0.4466, + "step": 3451 + }, + { + "epoch": 0.4200790994828111, + "grad_norm": 1.1529165506362915, + "learning_rate": 1.820105059709598e-05, + "loss": 0.4927, + "step": 3452 + }, + { + "epoch": 0.4202007909948281, + "grad_norm": 1.3928618431091309, + "learning_rate": 1.819993422866051e-05, + "loss": 0.4947, + "step": 3453 + }, + { + "epoch": 0.4203224825068452, + "grad_norm": 4.294394016265869, + "learning_rate": 1.8198817548200573e-05, + "loss": 0.4146, + "step": 3454 + }, + { + "epoch": 0.4204441740188622, + "grad_norm": 1.6274051666259766, + "learning_rate": 1.8197700555758656e-05, + "loss": 0.4835, + "step": 3455 + }, + { + "epoch": 0.4205658655308792, + "grad_norm": 2.516552448272705, + "learning_rate": 1.8196583251377266e-05, + "loss": 0.4625, + "step": 3456 + }, + { + "epoch": 0.42068755704289623, + "grad_norm": 1.104288101196289, + "learning_rate": 1.819546563509892e-05, + "loss": 0.4258, + "step": 3457 + }, + { + "epoch": 0.4208092485549133, + "grad_norm": 5.030282020568848, + "learning_rate": 1.819434770696614e-05, + "loss": 0.5489, + "step": 3458 + }, + { + "epoch": 0.4209309400669303, + "grad_norm": 3.4844894409179688, + "learning_rate": 1.8193229467021468e-05, + "loss": 0.5059, + "step": 3459 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 3.8286397457122803, + "learning_rate": 1.8192110915307454e-05, + "loss": 0.4693, + "step": 3460 + }, + { + "epoch": 0.4211743230909644, + "grad_norm": 4.959057807922363, + "learning_rate": 1.8190992051866664e-05, + "loss": 0.5689, + "step": 3461 + }, + { + "epoch": 0.42129601460298144, + "grad_norm": 2.2343626022338867, + "learning_rate": 1.8189872876741676e-05, + "loss": 0.4614, + "step": 3462 + }, + { + "epoch": 0.42141770611499846, + "grad_norm": 2.78779935836792, + "learning_rate": 1.818875338997507e-05, + "loss": 0.5115, + "step": 3463 + }, + { + "epoch": 0.42153939762701553, + "grad_norm": 0.9812670946121216, + "learning_rate": 1.818763359160945e-05, + "loss": 0.4569, + "step": 3464 + }, + { + "epoch": 0.42166108913903255, + "grad_norm": 2.293518304824829, + "learning_rate": 1.818651348168742e-05, + "loss": 0.4576, + "step": 3465 + }, + { + "epoch": 0.42178278065104957, + "grad_norm": 2.0170249938964844, + "learning_rate": 1.818539306025161e-05, + "loss": 0.457, + "step": 3466 + }, + { + "epoch": 0.42190447216306665, + "grad_norm": 2.2560160160064697, + "learning_rate": 1.818427232734465e-05, + "loss": 0.481, + "step": 3467 + }, + { + "epoch": 0.42202616367508367, + "grad_norm": 3.352612018585205, + "learning_rate": 1.8183151283009186e-05, + "loss": 0.4166, + "step": 3468 + }, + { + "epoch": 0.4221478551871007, + "grad_norm": 0.5273807644844055, + "learning_rate": 1.8182029927287878e-05, + "loss": 0.4574, + "step": 3469 + }, + { + "epoch": 0.42226954669911776, + "grad_norm": 0.9066219925880432, + "learning_rate": 1.8180908260223398e-05, + "loss": 0.4589, + "step": 3470 + }, + { + "epoch": 0.4223912382111348, + "grad_norm": 1.6761623620986938, + "learning_rate": 1.8179786281858423e-05, + "loss": 0.468, + "step": 3471 + }, + { + "epoch": 0.4225129297231518, + "grad_norm": 2.475247859954834, + "learning_rate": 1.817866399223565e-05, + "loss": 0.55, + "step": 3472 + }, + { + "epoch": 0.4226346212351689, + "grad_norm": 1.0085084438323975, + "learning_rate": 1.8177541391397783e-05, + "loss": 0.4677, + "step": 3473 + }, + { + "epoch": 0.4227563127471859, + "grad_norm": 1.6037474870681763, + "learning_rate": 1.8176418479387536e-05, + "loss": 0.4184, + "step": 3474 + }, + { + "epoch": 0.4228780042592029, + "grad_norm": 4.496516227722168, + "learning_rate": 1.8175295256247648e-05, + "loss": 0.3825, + "step": 3475 + }, + { + "epoch": 0.42299969577121993, + "grad_norm": 0.6811562180519104, + "learning_rate": 1.817417172202085e-05, + "loss": 0.4389, + "step": 3476 + }, + { + "epoch": 0.423121387283237, + "grad_norm": 1.438529372215271, + "learning_rate": 1.8173047876749898e-05, + "loss": 0.4591, + "step": 3477 + }, + { + "epoch": 0.423243078795254, + "grad_norm": 1.5971885919570923, + "learning_rate": 1.817192372047756e-05, + "loss": 0.4761, + "step": 3478 + }, + { + "epoch": 0.42336477030727104, + "grad_norm": 5.686800003051758, + "learning_rate": 1.81707992532466e-05, + "loss": 0.5873, + "step": 3479 + }, + { + "epoch": 0.4234864618192881, + "grad_norm": 0.879494845867157, + "learning_rate": 1.8169674475099827e-05, + "loss": 0.4553, + "step": 3480 + }, + { + "epoch": 0.42360815333130514, + "grad_norm": 2.3507444858551025, + "learning_rate": 1.8168549386080024e-05, + "loss": 0.4739, + "step": 3481 + }, + { + "epoch": 0.42372984484332216, + "grad_norm": 1.277342438697815, + "learning_rate": 1.8167423986230013e-05, + "loss": 0.4308, + "step": 3482 + }, + { + "epoch": 0.42385153635533923, + "grad_norm": 1.3545634746551514, + "learning_rate": 1.8166298275592612e-05, + "loss": 0.4413, + "step": 3483 + }, + { + "epoch": 0.42397322786735625, + "grad_norm": 1.9282746315002441, + "learning_rate": 1.8165172254210658e-05, + "loss": 0.4137, + "step": 3484 + }, + { + "epoch": 0.42409491937937327, + "grad_norm": 3.0479226112365723, + "learning_rate": 1.8164045922127e-05, + "loss": 0.4864, + "step": 3485 + }, + { + "epoch": 0.42421661089139034, + "grad_norm": 1.064579963684082, + "learning_rate": 1.816291927938449e-05, + "loss": 0.469, + "step": 3486 + }, + { + "epoch": 0.42433830240340736, + "grad_norm": 2.6730575561523438, + "learning_rate": 1.816179232602601e-05, + "loss": 0.4854, + "step": 3487 + }, + { + "epoch": 0.4244599939154244, + "grad_norm": 0.7642972469329834, + "learning_rate": 1.816066506209444e-05, + "loss": 0.4122, + "step": 3488 + }, + { + "epoch": 0.42458168542744146, + "grad_norm": 2.3168811798095703, + "learning_rate": 1.815953748763267e-05, + "loss": 0.4776, + "step": 3489 + }, + { + "epoch": 0.4247033769394585, + "grad_norm": 1.2830462455749512, + "learning_rate": 1.815840960268361e-05, + "loss": 0.5178, + "step": 3490 + }, + { + "epoch": 0.4248250684514755, + "grad_norm": 2.7923495769500732, + "learning_rate": 1.8157281407290176e-05, + "loss": 0.4382, + "step": 3491 + }, + { + "epoch": 0.42494675996349257, + "grad_norm": 1.8422168493270874, + "learning_rate": 1.8156152901495302e-05, + "loss": 0.5148, + "step": 3492 + }, + { + "epoch": 0.4250684514755096, + "grad_norm": 3.1832783222198486, + "learning_rate": 1.8155024085341924e-05, + "loss": 0.4743, + "step": 3493 + }, + { + "epoch": 0.4251901429875266, + "grad_norm": 4.278620719909668, + "learning_rate": 1.8153894958873005e-05, + "loss": 0.4392, + "step": 3494 + }, + { + "epoch": 0.4253118344995437, + "grad_norm": 5.984546661376953, + "learning_rate": 1.8152765522131503e-05, + "loss": 0.4783, + "step": 3495 + }, + { + "epoch": 0.4254335260115607, + "grad_norm": 2.6696994304656982, + "learning_rate": 1.8151635775160396e-05, + "loss": 0.5193, + "step": 3496 + }, + { + "epoch": 0.4255552175235777, + "grad_norm": 1.6294728517532349, + "learning_rate": 1.8150505718002676e-05, + "loss": 0.4854, + "step": 3497 + }, + { + "epoch": 0.42567690903559474, + "grad_norm": 1.8985587358474731, + "learning_rate": 1.814937535070134e-05, + "loss": 0.5055, + "step": 3498 + }, + { + "epoch": 0.4257986005476118, + "grad_norm": 1.1241909265518188, + "learning_rate": 1.814824467329941e-05, + "loss": 0.5013, + "step": 3499 + }, + { + "epoch": 0.42592029205962884, + "grad_norm": 0.8562899827957153, + "learning_rate": 1.8147113685839897e-05, + "loss": 0.4742, + "step": 3500 + }, + { + "epoch": 0.42604198357164585, + "grad_norm": 0.9550133347511292, + "learning_rate": 1.814598238836585e-05, + "loss": 0.4552, + "step": 3501 + }, + { + "epoch": 0.42616367508366293, + "grad_norm": 0.7729173898696899, + "learning_rate": 1.814485078092031e-05, + "loss": 0.4681, + "step": 3502 + }, + { + "epoch": 0.42628536659567995, + "grad_norm": 0.6184672713279724, + "learning_rate": 1.814371886354634e-05, + "loss": 0.4711, + "step": 3503 + }, + { + "epoch": 0.42640705810769697, + "grad_norm": 1.3956339359283447, + "learning_rate": 1.8142586636287006e-05, + "loss": 0.4401, + "step": 3504 + }, + { + "epoch": 0.42652874961971404, + "grad_norm": 1.0720890760421753, + "learning_rate": 1.8141454099185403e-05, + "loss": 0.4321, + "step": 3505 + }, + { + "epoch": 0.42665044113173106, + "grad_norm": 1.84479558467865, + "learning_rate": 1.8140321252284617e-05, + "loss": 0.4683, + "step": 3506 + }, + { + "epoch": 0.4267721326437481, + "grad_norm": 2.46502685546875, + "learning_rate": 1.8139188095627758e-05, + "loss": 0.4028, + "step": 3507 + }, + { + "epoch": 0.42689382415576516, + "grad_norm": 0.7737486958503723, + "learning_rate": 1.8138054629257943e-05, + "loss": 0.5058, + "step": 3508 + }, + { + "epoch": 0.4270155156677822, + "grad_norm": 2.7997729778289795, + "learning_rate": 1.8136920853218308e-05, + "loss": 0.4604, + "step": 3509 + }, + { + "epoch": 0.4271372071797992, + "grad_norm": 1.1051172018051147, + "learning_rate": 1.813578676755199e-05, + "loss": 0.523, + "step": 3510 + }, + { + "epoch": 0.42725889869181627, + "grad_norm": 4.384588718414307, + "learning_rate": 1.813465237230214e-05, + "loss": 0.4176, + "step": 3511 + }, + { + "epoch": 0.4273805902038333, + "grad_norm": 1.8338876962661743, + "learning_rate": 1.8133517667511936e-05, + "loss": 0.4727, + "step": 3512 + }, + { + "epoch": 0.4275022817158503, + "grad_norm": 0.812783420085907, + "learning_rate": 1.813238265322455e-05, + "loss": 0.4329, + "step": 3513 + }, + { + "epoch": 0.4276239732278674, + "grad_norm": 1.7692828178405762, + "learning_rate": 1.8131247329483167e-05, + "loss": 0.4753, + "step": 3514 + }, + { + "epoch": 0.4277456647398844, + "grad_norm": 4.737448692321777, + "learning_rate": 1.8130111696330995e-05, + "loss": 0.5285, + "step": 3515 + }, + { + "epoch": 0.4278673562519014, + "grad_norm": 4.863689422607422, + "learning_rate": 1.8128975753811242e-05, + "loss": 0.5593, + "step": 3516 + }, + { + "epoch": 0.42798904776391844, + "grad_norm": 2.1955201625823975, + "learning_rate": 1.812783950196714e-05, + "loss": 0.4665, + "step": 3517 + }, + { + "epoch": 0.4281107392759355, + "grad_norm": 2.6062629222869873, + "learning_rate": 1.8126702940841917e-05, + "loss": 0.4715, + "step": 3518 + }, + { + "epoch": 0.42823243078795253, + "grad_norm": 2.4371426105499268, + "learning_rate": 1.8125566070478825e-05, + "loss": 0.3948, + "step": 3519 + }, + { + "epoch": 0.42835412229996955, + "grad_norm": 0.6621604561805725, + "learning_rate": 1.8124428890921128e-05, + "loss": 0.4342, + "step": 3520 + }, + { + "epoch": 0.4284758138119866, + "grad_norm": 0.9522155523300171, + "learning_rate": 1.8123291402212096e-05, + "loss": 0.486, + "step": 3521 + }, + { + "epoch": 0.42859750532400365, + "grad_norm": 0.8200867176055908, + "learning_rate": 1.812215360439501e-05, + "loss": 0.4563, + "step": 3522 + }, + { + "epoch": 0.42871919683602067, + "grad_norm": 0.7225762605667114, + "learning_rate": 1.8121015497513166e-05, + "loss": 0.4863, + "step": 3523 + }, + { + "epoch": 0.42884088834803774, + "grad_norm": 0.9208165407180786, + "learning_rate": 1.8119877081609876e-05, + "loss": 0.4696, + "step": 3524 + }, + { + "epoch": 0.42896257986005476, + "grad_norm": 2.1651906967163086, + "learning_rate": 1.811873835672845e-05, + "loss": 0.4358, + "step": 3525 + }, + { + "epoch": 0.4290842713720718, + "grad_norm": 1.9516147375106812, + "learning_rate": 1.811759932291223e-05, + "loss": 0.5196, + "step": 3526 + }, + { + "epoch": 0.42920596288408885, + "grad_norm": 2.269613742828369, + "learning_rate": 1.811645998020455e-05, + "loss": 0.3981, + "step": 3527 + }, + { + "epoch": 0.4293276543961059, + "grad_norm": 1.2206636667251587, + "learning_rate": 1.8115320328648774e-05, + "loss": 0.453, + "step": 3528 + }, + { + "epoch": 0.4294493459081229, + "grad_norm": 0.9070826172828674, + "learning_rate": 1.8114180368288257e-05, + "loss": 0.4432, + "step": 3529 + }, + { + "epoch": 0.42957103742013997, + "grad_norm": 0.7746086120605469, + "learning_rate": 1.8113040099166383e-05, + "loss": 0.4246, + "step": 3530 + }, + { + "epoch": 0.429692728932157, + "grad_norm": 3.6724777221679688, + "learning_rate": 1.8111899521326535e-05, + "loss": 0.5331, + "step": 3531 + }, + { + "epoch": 0.429814420444174, + "grad_norm": 2.6812140941619873, + "learning_rate": 1.8110758634812123e-05, + "loss": 0.4455, + "step": 3532 + }, + { + "epoch": 0.4299361119561911, + "grad_norm": 1.895676851272583, + "learning_rate": 1.810961743966656e-05, + "loss": 0.4869, + "step": 3533 + }, + { + "epoch": 0.4300578034682081, + "grad_norm": 4.820573806762695, + "learning_rate": 1.8108475935933263e-05, + "loss": 0.5522, + "step": 3534 + }, + { + "epoch": 0.4301794949802251, + "grad_norm": 0.795985221862793, + "learning_rate": 1.8107334123655675e-05, + "loss": 0.4574, + "step": 3535 + }, + { + "epoch": 0.4303011864922422, + "grad_norm": 2.154789686203003, + "learning_rate": 1.8106192002877243e-05, + "loss": 0.5156, + "step": 3536 + }, + { + "epoch": 0.4304228780042592, + "grad_norm": 0.7947393655776978, + "learning_rate": 1.8105049573641423e-05, + "loss": 0.4721, + "step": 3537 + }, + { + "epoch": 0.43054456951627623, + "grad_norm": 2.2035183906555176, + "learning_rate": 1.8103906835991694e-05, + "loss": 0.4401, + "step": 3538 + }, + { + "epoch": 0.43066626102829325, + "grad_norm": 1.8686167001724243, + "learning_rate": 1.8102763789971534e-05, + "loss": 0.4722, + "step": 3539 + }, + { + "epoch": 0.4307879525403103, + "grad_norm": 4.780008316040039, + "learning_rate": 1.810162043562444e-05, + "loss": 0.3956, + "step": 3540 + }, + { + "epoch": 0.43090964405232735, + "grad_norm": 2.7494750022888184, + "learning_rate": 1.8100476772993918e-05, + "loss": 0.4507, + "step": 3541 + }, + { + "epoch": 0.43103133556434436, + "grad_norm": 0.79237961769104, + "learning_rate": 1.809933280212349e-05, + "loss": 0.4862, + "step": 3542 + }, + { + "epoch": 0.43115302707636144, + "grad_norm": 2.263197660446167, + "learning_rate": 1.8098188523056685e-05, + "loss": 0.4596, + "step": 3543 + }, + { + "epoch": 0.43127471858837846, + "grad_norm": 2.3083460330963135, + "learning_rate": 1.809704393583704e-05, + "loss": 0.3967, + "step": 3544 + }, + { + "epoch": 0.4313964101003955, + "grad_norm": 0.7695382833480835, + "learning_rate": 1.8095899040508115e-05, + "loss": 0.4085, + "step": 3545 + }, + { + "epoch": 0.43151810161241255, + "grad_norm": 1.6725908517837524, + "learning_rate": 1.8094753837113473e-05, + "loss": 0.4756, + "step": 3546 + }, + { + "epoch": 0.43163979312442957, + "grad_norm": 3.4811530113220215, + "learning_rate": 1.8093608325696695e-05, + "loss": 0.5039, + "step": 3547 + }, + { + "epoch": 0.4317614846364466, + "grad_norm": 1.961618185043335, + "learning_rate": 1.8092462506301367e-05, + "loss": 0.4096, + "step": 3548 + }, + { + "epoch": 0.43188317614846367, + "grad_norm": 3.5613763332366943, + "learning_rate": 1.809131637897109e-05, + "loss": 0.4797, + "step": 3549 + }, + { + "epoch": 0.4320048676604807, + "grad_norm": 0.792500376701355, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.428, + "step": 3550 + }, + { + "epoch": 0.4321265591724977, + "grad_norm": 0.8221563100814819, + "learning_rate": 1.8089023200680152e-05, + "loss": 0.4576, + "step": 3551 + }, + { + "epoch": 0.4322482506845148, + "grad_norm": 1.3412209749221802, + "learning_rate": 1.8087876149806746e-05, + "loss": 0.4646, + "step": 3552 + }, + { + "epoch": 0.4323699421965318, + "grad_norm": 1.2393990755081177, + "learning_rate": 1.8086728791172917e-05, + "loss": 0.5035, + "step": 3553 + }, + { + "epoch": 0.4324916337085488, + "grad_norm": 1.7637604475021362, + "learning_rate": 1.808558112482232e-05, + "loss": 0.423, + "step": 3554 + }, + { + "epoch": 0.4326133252205659, + "grad_norm": 1.3344647884368896, + "learning_rate": 1.808443315079862e-05, + "loss": 0.5133, + "step": 3555 + }, + { + "epoch": 0.4327350167325829, + "grad_norm": 0.5926569700241089, + "learning_rate": 1.8083284869145504e-05, + "loss": 0.48, + "step": 3556 + }, + { + "epoch": 0.43285670824459993, + "grad_norm": 1.8394889831542969, + "learning_rate": 1.8082136279906672e-05, + "loss": 0.4413, + "step": 3557 + }, + { + "epoch": 0.43297839975661695, + "grad_norm": 0.5895689129829407, + "learning_rate": 1.8080987383125826e-05, + "loss": 0.4586, + "step": 3558 + }, + { + "epoch": 0.433100091268634, + "grad_norm": 3.8162670135498047, + "learning_rate": 1.8079838178846677e-05, + "loss": 0.4662, + "step": 3559 + }, + { + "epoch": 0.43322178278065104, + "grad_norm": 0.640853762626648, + "learning_rate": 1.8078688667112965e-05, + "loss": 0.5245, + "step": 3560 + }, + { + "epoch": 0.43334347429266806, + "grad_norm": 1.9288088083267212, + "learning_rate": 1.807753884796843e-05, + "loss": 0.4369, + "step": 3561 + }, + { + "epoch": 0.43346516580468514, + "grad_norm": 0.737969696521759, + "learning_rate": 1.807638872145682e-05, + "loss": 0.4464, + "step": 3562 + }, + { + "epoch": 0.43358685731670216, + "grad_norm": 4.160451412200928, + "learning_rate": 1.80752382876219e-05, + "loss": 0.5351, + "step": 3563 + }, + { + "epoch": 0.4337085488287192, + "grad_norm": 1.9113166332244873, + "learning_rate": 1.8074087546507448e-05, + "loss": 0.4893, + "step": 3564 + }, + { + "epoch": 0.43383024034073625, + "grad_norm": 2.4951300621032715, + "learning_rate": 1.8072936498157254e-05, + "loss": 0.4872, + "step": 3565 + }, + { + "epoch": 0.43395193185275327, + "grad_norm": 3.5019259452819824, + "learning_rate": 1.8071785142615118e-05, + "loss": 0.5351, + "step": 3566 + }, + { + "epoch": 0.4340736233647703, + "grad_norm": 1.509994387626648, + "learning_rate": 1.807063347992485e-05, + "loss": 0.5239, + "step": 3567 + }, + { + "epoch": 0.43419531487678736, + "grad_norm": 4.691476821899414, + "learning_rate": 1.806948151013027e-05, + "loss": 0.4274, + "step": 3568 + }, + { + "epoch": 0.4343170063888044, + "grad_norm": 2.923957586288452, + "learning_rate": 1.8068329233275215e-05, + "loss": 0.4908, + "step": 3569 + }, + { + "epoch": 0.4344386979008214, + "grad_norm": 4.793639659881592, + "learning_rate": 1.8067176649403535e-05, + "loss": 0.4861, + "step": 3570 + }, + { + "epoch": 0.4345603894128385, + "grad_norm": 3.9228663444519043, + "learning_rate": 1.806602375855908e-05, + "loss": 0.4801, + "step": 3571 + }, + { + "epoch": 0.4346820809248555, + "grad_norm": 3.6681878566741943, + "learning_rate": 1.8064870560785734e-05, + "loss": 0.4598, + "step": 3572 + }, + { + "epoch": 0.4348037724368725, + "grad_norm": 1.9317867755889893, + "learning_rate": 1.8063717056127362e-05, + "loss": 0.4952, + "step": 3573 + }, + { + "epoch": 0.4349254639488896, + "grad_norm": 2.479525327682495, + "learning_rate": 1.806256324462787e-05, + "loss": 0.4097, + "step": 3574 + }, + { + "epoch": 0.4350471554609066, + "grad_norm": 0.7312712073326111, + "learning_rate": 1.8061409126331157e-05, + "loss": 0.4804, + "step": 3575 + }, + { + "epoch": 0.43516884697292363, + "grad_norm": 1.7317736148834229, + "learning_rate": 1.8060254701281137e-05, + "loss": 0.4731, + "step": 3576 + }, + { + "epoch": 0.43529053848494065, + "grad_norm": 2.882720470428467, + "learning_rate": 1.8059099969521747e-05, + "loss": 0.5266, + "step": 3577 + }, + { + "epoch": 0.4354122299969577, + "grad_norm": 0.724238932132721, + "learning_rate": 1.8057944931096914e-05, + "loss": 0.3851, + "step": 3578 + }, + { + "epoch": 0.43553392150897474, + "grad_norm": 3.183248281478882, + "learning_rate": 1.8056789586050603e-05, + "loss": 0.5077, + "step": 3579 + }, + { + "epoch": 0.43565561302099176, + "grad_norm": 1.8994688987731934, + "learning_rate": 1.805563393442677e-05, + "loss": 0.4547, + "step": 3580 + }, + { + "epoch": 0.43577730453300884, + "grad_norm": 3.0137557983398438, + "learning_rate": 1.8054477976269394e-05, + "loss": 0.4719, + "step": 3581 + }, + { + "epoch": 0.43589899604502585, + "grad_norm": 2.221926212310791, + "learning_rate": 1.805332171162246e-05, + "loss": 0.5107, + "step": 3582 + }, + { + "epoch": 0.4360206875570429, + "grad_norm": 2.2234017848968506, + "learning_rate": 1.805216514052996e-05, + "loss": 0.4966, + "step": 3583 + }, + { + "epoch": 0.43614237906905995, + "grad_norm": 4.759458065032959, + "learning_rate": 1.8051008263035913e-05, + "loss": 0.5852, + "step": 3584 + }, + { + "epoch": 0.43626407058107697, + "grad_norm": 3.5544915199279785, + "learning_rate": 1.8049851079184337e-05, + "loss": 0.4546, + "step": 3585 + }, + { + "epoch": 0.436385762093094, + "grad_norm": 3.342607259750366, + "learning_rate": 1.8048693589019267e-05, + "loss": 0.5096, + "step": 3586 + }, + { + "epoch": 0.43650745360511106, + "grad_norm": 6.241463661193848, + "learning_rate": 1.804753579258474e-05, + "loss": 0.4798, + "step": 3587 + }, + { + "epoch": 0.4366291451171281, + "grad_norm": 5.943205833435059, + "learning_rate": 1.8046377689924823e-05, + "loss": 0.5163, + "step": 3588 + }, + { + "epoch": 0.4367508366291451, + "grad_norm": 4.17158842086792, + "learning_rate": 1.804521928108358e-05, + "loss": 0.5011, + "step": 3589 + }, + { + "epoch": 0.4368725281411622, + "grad_norm": 5.012595176696777, + "learning_rate": 1.8044060566105088e-05, + "loss": 0.4918, + "step": 3590 + }, + { + "epoch": 0.4369942196531792, + "grad_norm": 2.1947054862976074, + "learning_rate": 1.8042901545033446e-05, + "loss": 0.5261, + "step": 3591 + }, + { + "epoch": 0.4371159111651962, + "grad_norm": 3.3039352893829346, + "learning_rate": 1.8041742217912746e-05, + "loss": 0.4359, + "step": 3592 + }, + { + "epoch": 0.4372376026772133, + "grad_norm": 0.761996865272522, + "learning_rate": 1.804058258478711e-05, + "loss": 0.4397, + "step": 3593 + }, + { + "epoch": 0.4373592941892303, + "grad_norm": 1.6251555681228638, + "learning_rate": 1.8039422645700672e-05, + "loss": 0.4399, + "step": 3594 + }, + { + "epoch": 0.4374809857012473, + "grad_norm": 3.0419235229492188, + "learning_rate": 1.8038262400697554e-05, + "loss": 0.4908, + "step": 3595 + }, + { + "epoch": 0.4376026772132644, + "grad_norm": 2.6626012325286865, + "learning_rate": 1.803710184982191e-05, + "loss": 0.4483, + "step": 3596 + }, + { + "epoch": 0.4377243687252814, + "grad_norm": 5.830317497253418, + "learning_rate": 1.803594099311791e-05, + "loss": 0.5787, + "step": 3597 + }, + { + "epoch": 0.43784606023729844, + "grad_norm": 0.9365830421447754, + "learning_rate": 1.8034779830629725e-05, + "loss": 0.4021, + "step": 3598 + }, + { + "epoch": 0.43796775174931546, + "grad_norm": 1.5541876554489136, + "learning_rate": 1.803361836240153e-05, + "loss": 0.4361, + "step": 3599 + }, + { + "epoch": 0.43808944326133253, + "grad_norm": 1.2261126041412354, + "learning_rate": 1.8032456588477533e-05, + "loss": 0.4349, + "step": 3600 + }, + { + "epoch": 0.43821113477334955, + "grad_norm": 1.5325047969818115, + "learning_rate": 1.803129450890193e-05, + "loss": 0.5074, + "step": 3601 + }, + { + "epoch": 0.4383328262853666, + "grad_norm": 0.6190427541732788, + "learning_rate": 1.8030132123718953e-05, + "loss": 0.4453, + "step": 3602 + }, + { + "epoch": 0.43845451779738365, + "grad_norm": 3.680974245071411, + "learning_rate": 1.8028969432972826e-05, + "loss": 0.4604, + "step": 3603 + }, + { + "epoch": 0.43857620930940067, + "grad_norm": 2.0287678241729736, + "learning_rate": 1.8027806436707792e-05, + "loss": 0.4754, + "step": 3604 + }, + { + "epoch": 0.4386979008214177, + "grad_norm": 2.9809305667877197, + "learning_rate": 1.8026643134968106e-05, + "loss": 0.4819, + "step": 3605 + }, + { + "epoch": 0.43881959233343476, + "grad_norm": 6.6440863609313965, + "learning_rate": 1.802547952779803e-05, + "loss": 0.4978, + "step": 3606 + }, + { + "epoch": 0.4389412838454518, + "grad_norm": 2.136735200881958, + "learning_rate": 1.8024315615241853e-05, + "loss": 0.4557, + "step": 3607 + }, + { + "epoch": 0.4390629753574688, + "grad_norm": 3.9290289878845215, + "learning_rate": 1.8023151397343857e-05, + "loss": 0.4094, + "step": 3608 + }, + { + "epoch": 0.4391846668694859, + "grad_norm": 1.2914810180664062, + "learning_rate": 1.8021986874148342e-05, + "loss": 0.4297, + "step": 3609 + }, + { + "epoch": 0.4393063583815029, + "grad_norm": 2.9472551345825195, + "learning_rate": 1.8020822045699622e-05, + "loss": 0.4975, + "step": 3610 + }, + { + "epoch": 0.4394280498935199, + "grad_norm": 0.6643158793449402, + "learning_rate": 1.801965691204202e-05, + "loss": 0.3759, + "step": 3611 + }, + { + "epoch": 0.439549741405537, + "grad_norm": 2.8930108547210693, + "learning_rate": 1.8018491473219875e-05, + "loss": 0.5077, + "step": 3612 + }, + { + "epoch": 0.439671432917554, + "grad_norm": 1.9661660194396973, + "learning_rate": 1.8017325729277532e-05, + "loss": 0.4441, + "step": 3613 + }, + { + "epoch": 0.439793124429571, + "grad_norm": 4.3794941902160645, + "learning_rate": 1.8016159680259346e-05, + "loss": 0.4921, + "step": 3614 + }, + { + "epoch": 0.4399148159415881, + "grad_norm": 3.2524712085723877, + "learning_rate": 1.8014993326209695e-05, + "loss": 0.4702, + "step": 3615 + }, + { + "epoch": 0.4400365074536051, + "grad_norm": 1.2992031574249268, + "learning_rate": 1.8013826667172956e-05, + "loss": 0.4833, + "step": 3616 + }, + { + "epoch": 0.44015819896562214, + "grad_norm": 1.4849412441253662, + "learning_rate": 1.8012659703193528e-05, + "loss": 0.4682, + "step": 3617 + }, + { + "epoch": 0.44027989047763916, + "grad_norm": 1.713822364807129, + "learning_rate": 1.801149243431581e-05, + "loss": 0.4883, + "step": 3618 + }, + { + "epoch": 0.44040158198965623, + "grad_norm": 1.120497465133667, + "learning_rate": 1.801032486058422e-05, + "loss": 0.4824, + "step": 3619 + }, + { + "epoch": 0.44052327350167325, + "grad_norm": 3.9051904678344727, + "learning_rate": 1.8009156982043192e-05, + "loss": 0.4432, + "step": 3620 + }, + { + "epoch": 0.44064496501369027, + "grad_norm": 4.045989990234375, + "learning_rate": 1.8007988798737163e-05, + "loss": 0.4757, + "step": 3621 + }, + { + "epoch": 0.44076665652570735, + "grad_norm": 4.53957986831665, + "learning_rate": 1.8006820310710584e-05, + "loss": 0.4469, + "step": 3622 + }, + { + "epoch": 0.44088834803772436, + "grad_norm": 1.0481483936309814, + "learning_rate": 1.8005651518007917e-05, + "loss": 0.5475, + "step": 3623 + }, + { + "epoch": 0.4410100395497414, + "grad_norm": 2.4604034423828125, + "learning_rate": 1.8004482420673642e-05, + "loss": 0.521, + "step": 3624 + }, + { + "epoch": 0.44113173106175846, + "grad_norm": 3.3843867778778076, + "learning_rate": 1.800331301875224e-05, + "loss": 0.4371, + "step": 3625 + }, + { + "epoch": 0.4412534225737755, + "grad_norm": 3.9853744506835938, + "learning_rate": 1.8002143312288213e-05, + "loss": 0.4224, + "step": 3626 + }, + { + "epoch": 0.4413751140857925, + "grad_norm": 1.23611581325531, + "learning_rate": 1.8000973301326068e-05, + "loss": 0.4731, + "step": 3627 + }, + { + "epoch": 0.44149680559780957, + "grad_norm": 1.6350594758987427, + "learning_rate": 1.799980298591033e-05, + "loss": 0.495, + "step": 3628 + }, + { + "epoch": 0.4416184971098266, + "grad_norm": 3.6791670322418213, + "learning_rate": 1.7998632366085527e-05, + "loss": 0.4994, + "step": 3629 + }, + { + "epoch": 0.4417401886218436, + "grad_norm": 3.62467098236084, + "learning_rate": 1.799746144189621e-05, + "loss": 0.5047, + "step": 3630 + }, + { + "epoch": 0.4418618801338607, + "grad_norm": 2.546487331390381, + "learning_rate": 1.7996290213386927e-05, + "loss": 0.4929, + "step": 3631 + }, + { + "epoch": 0.4419835716458777, + "grad_norm": 1.99821138381958, + "learning_rate": 1.799511868060225e-05, + "loss": 0.3992, + "step": 3632 + }, + { + "epoch": 0.4421052631578947, + "grad_norm": 2.085075855255127, + "learning_rate": 1.7993946843586766e-05, + "loss": 0.4839, + "step": 3633 + }, + { + "epoch": 0.4422269546699118, + "grad_norm": 0.5979752540588379, + "learning_rate": 1.799277470238505e-05, + "loss": 0.4611, + "step": 3634 + }, + { + "epoch": 0.4423486461819288, + "grad_norm": 1.6199902296066284, + "learning_rate": 1.7991602257041717e-05, + "loss": 0.4078, + "step": 3635 + }, + { + "epoch": 0.44247033769394584, + "grad_norm": 0.978445291519165, + "learning_rate": 1.799042950760137e-05, + "loss": 0.4959, + "step": 3636 + }, + { + "epoch": 0.4425920292059629, + "grad_norm": 0.7537236213684082, + "learning_rate": 1.7989256454108644e-05, + "loss": 0.4696, + "step": 3637 + }, + { + "epoch": 0.44271372071797993, + "grad_norm": 2.1018757820129395, + "learning_rate": 1.798808309660818e-05, + "loss": 0.4545, + "step": 3638 + }, + { + "epoch": 0.44283541222999695, + "grad_norm": 0.9911224246025085, + "learning_rate": 1.798690943514461e-05, + "loss": 0.4297, + "step": 3639 + }, + { + "epoch": 0.44295710374201397, + "grad_norm": 1.6557685136795044, + "learning_rate": 1.7985735469762608e-05, + "loss": 0.4334, + "step": 3640 + }, + { + "epoch": 0.44307879525403104, + "grad_norm": 1.01318359375, + "learning_rate": 1.798456120050684e-05, + "loss": 0.4271, + "step": 3641 + }, + { + "epoch": 0.44320048676604806, + "grad_norm": 1.455711841583252, + "learning_rate": 1.7983386627421996e-05, + "loss": 0.4834, + "step": 3642 + }, + { + "epoch": 0.4433221782780651, + "grad_norm": 1.481998324394226, + "learning_rate": 1.7982211750552763e-05, + "loss": 0.4819, + "step": 3643 + }, + { + "epoch": 0.44344386979008216, + "grad_norm": 1.1394925117492676, + "learning_rate": 1.798103656994385e-05, + "loss": 0.4671, + "step": 3644 + }, + { + "epoch": 0.4435655613020992, + "grad_norm": 3.476978302001953, + "learning_rate": 1.7979861085639975e-05, + "loss": 0.5468, + "step": 3645 + }, + { + "epoch": 0.4436872528141162, + "grad_norm": 0.9046595692634583, + "learning_rate": 1.797868529768587e-05, + "loss": 0.4951, + "step": 3646 + }, + { + "epoch": 0.44380894432613327, + "grad_norm": 0.703719437122345, + "learning_rate": 1.7977509206126274e-05, + "loss": 0.4727, + "step": 3647 + }, + { + "epoch": 0.4439306358381503, + "grad_norm": 1.4390543699264526, + "learning_rate": 1.7976332811005943e-05, + "loss": 0.5164, + "step": 3648 + }, + { + "epoch": 0.4440523273501673, + "grad_norm": 5.06640100479126, + "learning_rate": 1.7975156112369634e-05, + "loss": 0.4624, + "step": 3649 + }, + { + "epoch": 0.4441740188621844, + "grad_norm": 2.183990955352783, + "learning_rate": 1.797397911026213e-05, + "loss": 0.4968, + "step": 3650 + }, + { + "epoch": 0.4442957103742014, + "grad_norm": 3.0395090579986572, + "learning_rate": 1.7972801804728213e-05, + "loss": 0.4543, + "step": 3651 + }, + { + "epoch": 0.4444174018862184, + "grad_norm": 0.6118097305297852, + "learning_rate": 1.7971624195812687e-05, + "loss": 0.4827, + "step": 3652 + }, + { + "epoch": 0.4445390933982355, + "grad_norm": 1.3973321914672852, + "learning_rate": 1.7970446283560358e-05, + "loss": 0.5065, + "step": 3653 + }, + { + "epoch": 0.4446607849102525, + "grad_norm": 0.7725859880447388, + "learning_rate": 1.796926806801605e-05, + "loss": 0.509, + "step": 3654 + }, + { + "epoch": 0.44478247642226953, + "grad_norm": 1.0097979307174683, + "learning_rate": 1.7968089549224598e-05, + "loss": 0.4639, + "step": 3655 + }, + { + "epoch": 0.4449041679342866, + "grad_norm": 2.128896713256836, + "learning_rate": 1.7966910727230846e-05, + "loss": 0.4611, + "step": 3656 + }, + { + "epoch": 0.44502585944630363, + "grad_norm": 1.722802758216858, + "learning_rate": 1.796573160207965e-05, + "loss": 0.5142, + "step": 3657 + }, + { + "epoch": 0.44514755095832065, + "grad_norm": 2.6929211616516113, + "learning_rate": 1.796455217381588e-05, + "loss": 0.5164, + "step": 3658 + }, + { + "epoch": 0.44526924247033767, + "grad_norm": 1.578313946723938, + "learning_rate": 1.796337244248441e-05, + "loss": 0.4304, + "step": 3659 + }, + { + "epoch": 0.44539093398235474, + "grad_norm": 2.301638126373291, + "learning_rate": 1.7962192408130137e-05, + "loss": 0.4424, + "step": 3660 + }, + { + "epoch": 0.44551262549437176, + "grad_norm": 1.1318200826644897, + "learning_rate": 1.7961012070797964e-05, + "loss": 0.46, + "step": 3661 + }, + { + "epoch": 0.4456343170063888, + "grad_norm": 0.882349967956543, + "learning_rate": 1.79598314305328e-05, + "loss": 0.4947, + "step": 3662 + }, + { + "epoch": 0.44575600851840586, + "grad_norm": 1.035900354385376, + "learning_rate": 1.7958650487379574e-05, + "loss": 0.4493, + "step": 3663 + }, + { + "epoch": 0.4458777000304229, + "grad_norm": 4.451942443847656, + "learning_rate": 1.795746924138323e-05, + "loss": 0.5493, + "step": 3664 + }, + { + "epoch": 0.4459993915424399, + "grad_norm": 1.1115249395370483, + "learning_rate": 1.7956287692588704e-05, + "loss": 0.4852, + "step": 3665 + }, + { + "epoch": 0.44612108305445697, + "grad_norm": 2.1183042526245117, + "learning_rate": 1.7955105841040962e-05, + "loss": 0.4061, + "step": 3666 + }, + { + "epoch": 0.446242774566474, + "grad_norm": 1.7583872079849243, + "learning_rate": 1.7953923686784977e-05, + "loss": 0.4558, + "step": 3667 + }, + { + "epoch": 0.446364466078491, + "grad_norm": 0.9607902765274048, + "learning_rate": 1.7952741229865735e-05, + "loss": 0.4705, + "step": 3668 + }, + { + "epoch": 0.4464861575905081, + "grad_norm": 1.133246898651123, + "learning_rate": 1.795155847032823e-05, + "loss": 0.462, + "step": 3669 + }, + { + "epoch": 0.4466078491025251, + "grad_norm": 1.0391075611114502, + "learning_rate": 1.795037540821746e-05, + "loss": 0.5168, + "step": 3670 + }, + { + "epoch": 0.4467295406145421, + "grad_norm": 1.3781741857528687, + "learning_rate": 1.794919204357845e-05, + "loss": 0.4272, + "step": 3671 + }, + { + "epoch": 0.4468512321265592, + "grad_norm": 0.6506280303001404, + "learning_rate": 1.7948008376456235e-05, + "loss": 0.48, + "step": 3672 + }, + { + "epoch": 0.4469729236385762, + "grad_norm": 0.892476499080658, + "learning_rate": 1.7946824406895845e-05, + "loss": 0.4561, + "step": 3673 + }, + { + "epoch": 0.44709461515059323, + "grad_norm": 1.248900294303894, + "learning_rate": 1.7945640134942336e-05, + "loss": 0.5008, + "step": 3674 + }, + { + "epoch": 0.4472163066626103, + "grad_norm": 1.0760215520858765, + "learning_rate": 1.7944455560640775e-05, + "loss": 0.4878, + "step": 3675 + }, + { + "epoch": 0.4473379981746273, + "grad_norm": 3.092639684677124, + "learning_rate": 1.794327068403624e-05, + "loss": 0.3874, + "step": 3676 + }, + { + "epoch": 0.44745968968664435, + "grad_norm": 1.7849409580230713, + "learning_rate": 1.794208550517381e-05, + "loss": 0.4354, + "step": 3677 + }, + { + "epoch": 0.4475813811986614, + "grad_norm": 1.5701935291290283, + "learning_rate": 1.7940900024098585e-05, + "loss": 0.4115, + "step": 3678 + }, + { + "epoch": 0.44770307271067844, + "grad_norm": 1.1873188018798828, + "learning_rate": 1.793971424085568e-05, + "loss": 0.4755, + "step": 3679 + }, + { + "epoch": 0.44782476422269546, + "grad_norm": 1.8533138036727905, + "learning_rate": 1.7938528155490215e-05, + "loss": 0.4706, + "step": 3680 + }, + { + "epoch": 0.4479464557347125, + "grad_norm": 3.065023422241211, + "learning_rate": 1.7937341768047322e-05, + "loss": 0.504, + "step": 3681 + }, + { + "epoch": 0.44806814724672955, + "grad_norm": 1.467176914215088, + "learning_rate": 1.793615507857214e-05, + "loss": 0.4483, + "step": 3682 + }, + { + "epoch": 0.4481898387587466, + "grad_norm": 4.730821132659912, + "learning_rate": 1.7934968087109837e-05, + "loss": 0.5012, + "step": 3683 + }, + { + "epoch": 0.4483115302707636, + "grad_norm": 0.5738435387611389, + "learning_rate": 1.7933780793705572e-05, + "loss": 0.4191, + "step": 3684 + }, + { + "epoch": 0.44843322178278067, + "grad_norm": 0.4980515241622925, + "learning_rate": 1.7932593198404524e-05, + "loss": 0.4294, + "step": 3685 + }, + { + "epoch": 0.4485549132947977, + "grad_norm": 2.7868967056274414, + "learning_rate": 1.7931405301251885e-05, + "loss": 0.4866, + "step": 3686 + }, + { + "epoch": 0.4486766048068147, + "grad_norm": 1.3674237728118896, + "learning_rate": 1.793021710229286e-05, + "loss": 0.4485, + "step": 3687 + }, + { + "epoch": 0.4487982963188318, + "grad_norm": 1.4269791841506958, + "learning_rate": 1.7929028601572656e-05, + "loss": 0.5182, + "step": 3688 + }, + { + "epoch": 0.4489199878308488, + "grad_norm": 2.5022637844085693, + "learning_rate": 1.7927839799136505e-05, + "loss": 0.4469, + "step": 3689 + }, + { + "epoch": 0.4490416793428658, + "grad_norm": 1.9184142351150513, + "learning_rate": 1.7926650695029635e-05, + "loss": 0.4679, + "step": 3690 + }, + { + "epoch": 0.4491633708548829, + "grad_norm": 0.9711014032363892, + "learning_rate": 1.7925461289297303e-05, + "loss": 0.4837, + "step": 3691 + }, + { + "epoch": 0.4492850623668999, + "grad_norm": 0.6276342868804932, + "learning_rate": 1.7924271581984762e-05, + "loss": 0.4912, + "step": 3692 + }, + { + "epoch": 0.44940675387891693, + "grad_norm": 0.5797700881958008, + "learning_rate": 1.7923081573137287e-05, + "loss": 0.5034, + "step": 3693 + }, + { + "epoch": 0.449528445390934, + "grad_norm": 2.8033759593963623, + "learning_rate": 1.7921891262800158e-05, + "loss": 0.445, + "step": 3694 + }, + { + "epoch": 0.449650136902951, + "grad_norm": 1.6636983156204224, + "learning_rate": 1.7920700651018667e-05, + "loss": 0.4711, + "step": 3695 + }, + { + "epoch": 0.44977182841496804, + "grad_norm": 3.3158512115478516, + "learning_rate": 1.7919509737838123e-05, + "loss": 0.4434, + "step": 3696 + }, + { + "epoch": 0.4498935199269851, + "grad_norm": 1.4207450151443481, + "learning_rate": 1.7918318523303843e-05, + "loss": 0.4615, + "step": 3697 + }, + { + "epoch": 0.45001521143900214, + "grad_norm": 2.2780520915985107, + "learning_rate": 1.791712700746115e-05, + "loss": 0.5064, + "step": 3698 + }, + { + "epoch": 0.45013690295101916, + "grad_norm": 2.2248661518096924, + "learning_rate": 1.7915935190355386e-05, + "loss": 0.4462, + "step": 3699 + }, + { + "epoch": 0.4502585944630362, + "grad_norm": 0.8618022203445435, + "learning_rate": 1.7914743072031906e-05, + "loss": 0.3969, + "step": 3700 + }, + { + "epoch": 0.45038028597505325, + "grad_norm": 1.1378401517868042, + "learning_rate": 1.791355065253607e-05, + "loss": 0.3672, + "step": 3701 + }, + { + "epoch": 0.45050197748707027, + "grad_norm": 4.653499126434326, + "learning_rate": 1.7912357931913245e-05, + "loss": 0.494, + "step": 3702 + }, + { + "epoch": 0.4506236689990873, + "grad_norm": 2.465895652770996, + "learning_rate": 1.791116491020883e-05, + "loss": 0.4396, + "step": 3703 + }, + { + "epoch": 0.45074536051110436, + "grad_norm": 2.1950838565826416, + "learning_rate": 1.7909971587468212e-05, + "loss": 0.4512, + "step": 3704 + }, + { + "epoch": 0.4508670520231214, + "grad_norm": 3.510770320892334, + "learning_rate": 1.7908777963736802e-05, + "loss": 0.4879, + "step": 3705 + }, + { + "epoch": 0.4509887435351384, + "grad_norm": 1.9464043378829956, + "learning_rate": 1.790758403906002e-05, + "loss": 0.4959, + "step": 3706 + }, + { + "epoch": 0.4511104350471555, + "grad_norm": 1.292353630065918, + "learning_rate": 1.79063898134833e-05, + "loss": 0.4514, + "step": 3707 + }, + { + "epoch": 0.4512321265591725, + "grad_norm": 0.5941998362541199, + "learning_rate": 1.790519528705208e-05, + "loss": 0.4442, + "step": 3708 + }, + { + "epoch": 0.4513538180711895, + "grad_norm": 1.889320969581604, + "learning_rate": 1.790400045981182e-05, + "loss": 0.449, + "step": 3709 + }, + { + "epoch": 0.4514755095832066, + "grad_norm": 1.416853666305542, + "learning_rate": 1.7902805331807977e-05, + "loss": 0.5016, + "step": 3710 + }, + { + "epoch": 0.4515972010952236, + "grad_norm": 0.5840568542480469, + "learning_rate": 1.7901609903086036e-05, + "loss": 0.4585, + "step": 3711 + }, + { + "epoch": 0.45171889260724063, + "grad_norm": 2.6656131744384766, + "learning_rate": 1.7900414173691482e-05, + "loss": 0.4889, + "step": 3712 + }, + { + "epoch": 0.4518405841192577, + "grad_norm": 1.2066540718078613, + "learning_rate": 1.7899218143669817e-05, + "loss": 0.4516, + "step": 3713 + }, + { + "epoch": 0.4519622756312747, + "grad_norm": 0.5673147439956665, + "learning_rate": 1.789802181306655e-05, + "loss": 0.4816, + "step": 3714 + }, + { + "epoch": 0.45208396714329174, + "grad_norm": 2.687893867492676, + "learning_rate": 1.7896825181927203e-05, + "loss": 0.504, + "step": 3715 + }, + { + "epoch": 0.4522056586553088, + "grad_norm": 2.188190460205078, + "learning_rate": 1.789562825029732e-05, + "loss": 0.5118, + "step": 3716 + }, + { + "epoch": 0.45232735016732584, + "grad_norm": 2.8252346515655518, + "learning_rate": 1.789443101822243e-05, + "loss": 0.5364, + "step": 3717 + }, + { + "epoch": 0.45244904167934286, + "grad_norm": 0.5264298915863037, + "learning_rate": 1.7893233485748107e-05, + "loss": 0.4892, + "step": 3718 + }, + { + "epoch": 0.4525707331913599, + "grad_norm": 1.020469069480896, + "learning_rate": 1.7892035652919904e-05, + "loss": 0.4331, + "step": 3719 + }, + { + "epoch": 0.45269242470337695, + "grad_norm": 0.6001567840576172, + "learning_rate": 1.7890837519783414e-05, + "loss": 0.509, + "step": 3720 + }, + { + "epoch": 0.45281411621539397, + "grad_norm": 1.42613685131073, + "learning_rate": 1.788963908638422e-05, + "loss": 0.4818, + "step": 3721 + }, + { + "epoch": 0.452935807727411, + "grad_norm": 4.9061784744262695, + "learning_rate": 1.7888440352767927e-05, + "loss": 0.4356, + "step": 3722 + }, + { + "epoch": 0.45305749923942806, + "grad_norm": 4.788461208343506, + "learning_rate": 1.7887241318980156e-05, + "loss": 0.4378, + "step": 3723 + }, + { + "epoch": 0.4531791907514451, + "grad_norm": 1.0287023782730103, + "learning_rate": 1.7886041985066524e-05, + "loss": 0.4683, + "step": 3724 + }, + { + "epoch": 0.4533008822634621, + "grad_norm": 0.6777575612068176, + "learning_rate": 1.788484235107267e-05, + "loss": 0.4313, + "step": 3725 + }, + { + "epoch": 0.4534225737754792, + "grad_norm": 3.7466187477111816, + "learning_rate": 1.788364241704425e-05, + "loss": 0.5249, + "step": 3726 + }, + { + "epoch": 0.4535442652874962, + "grad_norm": 0.6062596440315247, + "learning_rate": 1.7882442183026908e-05, + "loss": 0.4085, + "step": 3727 + }, + { + "epoch": 0.4536659567995132, + "grad_norm": 4.0420684814453125, + "learning_rate": 1.788124164906633e-05, + "loss": 0.4956, + "step": 3728 + }, + { + "epoch": 0.4537876483115303, + "grad_norm": 5.275223255157471, + "learning_rate": 1.7880040815208196e-05, + "loss": 0.5215, + "step": 3729 + }, + { + "epoch": 0.4539093398235473, + "grad_norm": 2.709576368331909, + "learning_rate": 1.7878839681498195e-05, + "loss": 0.4773, + "step": 3730 + }, + { + "epoch": 0.45403103133556433, + "grad_norm": 1.9356625080108643, + "learning_rate": 1.787763824798204e-05, + "loss": 0.4709, + "step": 3731 + }, + { + "epoch": 0.4541527228475814, + "grad_norm": 2.830550193786621, + "learning_rate": 1.787643651470544e-05, + "loss": 0.4965, + "step": 3732 + }, + { + "epoch": 0.4542744143595984, + "grad_norm": 2.397204637527466, + "learning_rate": 1.7875234481714126e-05, + "loss": 0.4983, + "step": 3733 + }, + { + "epoch": 0.45439610587161544, + "grad_norm": 0.8092364072799683, + "learning_rate": 1.7874032149053844e-05, + "loss": 0.4383, + "step": 3734 + }, + { + "epoch": 0.4545177973836325, + "grad_norm": 2.5677106380462646, + "learning_rate": 1.787282951677034e-05, + "loss": 0.4482, + "step": 3735 + }, + { + "epoch": 0.45463948889564954, + "grad_norm": 2.810300827026367, + "learning_rate": 1.7871626584909374e-05, + "loss": 0.4965, + "step": 3736 + }, + { + "epoch": 0.45476118040766655, + "grad_norm": 2.3000433444976807, + "learning_rate": 1.7870423353516723e-05, + "loss": 0.4841, + "step": 3737 + }, + { + "epoch": 0.45488287191968363, + "grad_norm": 4.659969806671143, + "learning_rate": 1.7869219822638175e-05, + "loss": 0.4329, + "step": 3738 + }, + { + "epoch": 0.45500456343170065, + "grad_norm": 2.9188547134399414, + "learning_rate": 1.786801599231952e-05, + "loss": 0.4563, + "step": 3739 + }, + { + "epoch": 0.45512625494371767, + "grad_norm": 1.448610782623291, + "learning_rate": 1.7866811862606576e-05, + "loss": 0.5119, + "step": 3740 + }, + { + "epoch": 0.4552479464557347, + "grad_norm": 1.2721482515335083, + "learning_rate": 1.7865607433545154e-05, + "loss": 0.5271, + "step": 3741 + }, + { + "epoch": 0.45536963796775176, + "grad_norm": 4.151176452636719, + "learning_rate": 1.786440270518109e-05, + "loss": 0.557, + "step": 3742 + }, + { + "epoch": 0.4554913294797688, + "grad_norm": 0.549390435218811, + "learning_rate": 1.7863197677560222e-05, + "loss": 0.4078, + "step": 3743 + }, + { + "epoch": 0.4556130209917858, + "grad_norm": 1.7603040933609009, + "learning_rate": 1.786199235072841e-05, + "loss": 0.4618, + "step": 3744 + }, + { + "epoch": 0.4557347125038029, + "grad_norm": 2.47747540473938, + "learning_rate": 1.7860786724731512e-05, + "loss": 0.4762, + "step": 3745 + }, + { + "epoch": 0.4558564040158199, + "grad_norm": 1.6292123794555664, + "learning_rate": 1.785958079961541e-05, + "loss": 0.4939, + "step": 3746 + }, + { + "epoch": 0.4559780955278369, + "grad_norm": 1.9071540832519531, + "learning_rate": 1.785837457542599e-05, + "loss": 0.441, + "step": 3747 + }, + { + "epoch": 0.456099787039854, + "grad_norm": 0.6643903851509094, + "learning_rate": 1.785716805220915e-05, + "loss": 0.4735, + "step": 3748 + }, + { + "epoch": 0.456221478551871, + "grad_norm": 1.6004359722137451, + "learning_rate": 1.7855961230010804e-05, + "loss": 0.5101, + "step": 3749 + }, + { + "epoch": 0.456343170063888, + "grad_norm": 0.7148923873901367, + "learning_rate": 1.7854754108876874e-05, + "loss": 0.4871, + "step": 3750 + }, + { + "epoch": 0.4564648615759051, + "grad_norm": 3.2015488147735596, + "learning_rate": 1.785354668885329e-05, + "loss": 0.4596, + "step": 3751 + }, + { + "epoch": 0.4565865530879221, + "grad_norm": 3.9936769008636475, + "learning_rate": 1.7852338969985996e-05, + "loss": 0.4663, + "step": 3752 + }, + { + "epoch": 0.45670824459993914, + "grad_norm": 3.0331928730010986, + "learning_rate": 1.7851130952320953e-05, + "loss": 0.4671, + "step": 3753 + }, + { + "epoch": 0.4568299361119562, + "grad_norm": 0.7422975897789001, + "learning_rate": 1.7849922635904127e-05, + "loss": 0.5073, + "step": 3754 + }, + { + "epoch": 0.45695162762397323, + "grad_norm": 3.285104513168335, + "learning_rate": 1.7848714020781495e-05, + "loss": 0.4424, + "step": 3755 + }, + { + "epoch": 0.45707331913599025, + "grad_norm": 1.2517046928405762, + "learning_rate": 1.7847505106999047e-05, + "loss": 0.453, + "step": 3756 + }, + { + "epoch": 0.4571950106480073, + "grad_norm": 1.3074380159378052, + "learning_rate": 1.7846295894602787e-05, + "loss": 0.4556, + "step": 3757 + }, + { + "epoch": 0.45731670216002435, + "grad_norm": 0.7568677663803101, + "learning_rate": 1.7845086383638733e-05, + "loss": 0.463, + "step": 3758 + }, + { + "epoch": 0.45743839367204137, + "grad_norm": 0.7038098573684692, + "learning_rate": 1.7843876574152896e-05, + "loss": 0.4344, + "step": 3759 + }, + { + "epoch": 0.4575600851840584, + "grad_norm": 0.8059700727462769, + "learning_rate": 1.7842666466191323e-05, + "loss": 0.422, + "step": 3760 + }, + { + "epoch": 0.45768177669607546, + "grad_norm": 1.4205422401428223, + "learning_rate": 1.7841456059800057e-05, + "loss": 0.4254, + "step": 3761 + }, + { + "epoch": 0.4578034682080925, + "grad_norm": 4.686393737792969, + "learning_rate": 1.7840245355025158e-05, + "loss": 0.5376, + "step": 3762 + }, + { + "epoch": 0.4579251597201095, + "grad_norm": 4.416186332702637, + "learning_rate": 1.7839034351912695e-05, + "loss": 0.5207, + "step": 3763 + }, + { + "epoch": 0.4580468512321266, + "grad_norm": 4.030603885650635, + "learning_rate": 1.7837823050508748e-05, + "loss": 0.5198, + "step": 3764 + }, + { + "epoch": 0.4581685427441436, + "grad_norm": 0.7521973848342896, + "learning_rate": 1.783661145085941e-05, + "loss": 0.4701, + "step": 3765 + }, + { + "epoch": 0.4582902342561606, + "grad_norm": 2.4491348266601562, + "learning_rate": 1.7835399553010785e-05, + "loss": 0.4408, + "step": 3766 + }, + { + "epoch": 0.4584119257681777, + "grad_norm": 1.0065889358520508, + "learning_rate": 1.783418735700899e-05, + "loss": 0.4873, + "step": 3767 + }, + { + "epoch": 0.4585336172801947, + "grad_norm": 5.402151107788086, + "learning_rate": 1.7832974862900147e-05, + "loss": 0.4324, + "step": 3768 + }, + { + "epoch": 0.4586553087922117, + "grad_norm": 3.2790324687957764, + "learning_rate": 1.7831762070730402e-05, + "loss": 0.5019, + "step": 3769 + }, + { + "epoch": 0.4587770003042288, + "grad_norm": 4.150466442108154, + "learning_rate": 1.7830548980545895e-05, + "loss": 0.4097, + "step": 3770 + }, + { + "epoch": 0.4588986918162458, + "grad_norm": 0.6237979531288147, + "learning_rate": 1.7829335592392795e-05, + "loss": 0.5079, + "step": 3771 + }, + { + "epoch": 0.45902038332826284, + "grad_norm": 1.5224637985229492, + "learning_rate": 1.7828121906317268e-05, + "loss": 0.431, + "step": 3772 + }, + { + "epoch": 0.4591420748402799, + "grad_norm": 0.721524715423584, + "learning_rate": 1.78269079223655e-05, + "loss": 0.478, + "step": 3773 + }, + { + "epoch": 0.45926376635229693, + "grad_norm": 0.9786434769630432, + "learning_rate": 1.782569364058368e-05, + "loss": 0.4422, + "step": 3774 + }, + { + "epoch": 0.45938545786431395, + "grad_norm": 0.7823892831802368, + "learning_rate": 1.7824479061018025e-05, + "loss": 0.4519, + "step": 3775 + }, + { + "epoch": 0.459507149376331, + "grad_norm": 0.7161319851875305, + "learning_rate": 1.7823264183714742e-05, + "loss": 0.4428, + "step": 3776 + }, + { + "epoch": 0.45962884088834804, + "grad_norm": 2.006716728210449, + "learning_rate": 1.7822049008720066e-05, + "loss": 0.3923, + "step": 3777 + }, + { + "epoch": 0.45975053240036506, + "grad_norm": 0.8360805511474609, + "learning_rate": 1.7820833536080235e-05, + "loss": 0.4275, + "step": 3778 + }, + { + "epoch": 0.45987222391238214, + "grad_norm": 3.4726150035858154, + "learning_rate": 1.7819617765841496e-05, + "loss": 0.5008, + "step": 3779 + }, + { + "epoch": 0.45999391542439916, + "grad_norm": 6.386075019836426, + "learning_rate": 1.781840169805012e-05, + "loss": 0.6021, + "step": 3780 + }, + { + "epoch": 0.4601156069364162, + "grad_norm": 1.8586344718933105, + "learning_rate": 1.7817185332752372e-05, + "loss": 0.4739, + "step": 3781 + }, + { + "epoch": 0.4602372984484332, + "grad_norm": 0.9821096062660217, + "learning_rate": 1.7815968669994543e-05, + "loss": 0.4423, + "step": 3782 + }, + { + "epoch": 0.46035898996045027, + "grad_norm": 0.6338329315185547, + "learning_rate": 1.7814751709822927e-05, + "loss": 0.4833, + "step": 3783 + }, + { + "epoch": 0.4604806814724673, + "grad_norm": 1.1122868061065674, + "learning_rate": 1.7813534452283835e-05, + "loss": 0.4164, + "step": 3784 + }, + { + "epoch": 0.4606023729844843, + "grad_norm": 3.8239831924438477, + "learning_rate": 1.7812316897423584e-05, + "loss": 0.375, + "step": 3785 + }, + { + "epoch": 0.4607240644965014, + "grad_norm": 1.563788890838623, + "learning_rate": 1.78110990452885e-05, + "loss": 0.4274, + "step": 3786 + }, + { + "epoch": 0.4608457560085184, + "grad_norm": 1.5114758014678955, + "learning_rate": 1.7809880895924935e-05, + "loss": 0.5033, + "step": 3787 + }, + { + "epoch": 0.4609674475205354, + "grad_norm": 1.6998744010925293, + "learning_rate": 1.7808662449379233e-05, + "loss": 0.4505, + "step": 3788 + }, + { + "epoch": 0.4610891390325525, + "grad_norm": 1.6544452905654907, + "learning_rate": 1.780744370569776e-05, + "loss": 0.5071, + "step": 3789 + }, + { + "epoch": 0.4612108305445695, + "grad_norm": 3.371568202972412, + "learning_rate": 1.78062246649269e-05, + "loss": 0.5407, + "step": 3790 + }, + { + "epoch": 0.46133252205658654, + "grad_norm": 1.258341670036316, + "learning_rate": 1.7805005327113028e-05, + "loss": 0.4406, + "step": 3791 + }, + { + "epoch": 0.4614542135686036, + "grad_norm": 0.6823922395706177, + "learning_rate": 1.7803785692302548e-05, + "loss": 0.484, + "step": 3792 + }, + { + "epoch": 0.46157590508062063, + "grad_norm": 0.9683195352554321, + "learning_rate": 1.7802565760541873e-05, + "loss": 0.4713, + "step": 3793 + }, + { + "epoch": 0.46169759659263765, + "grad_norm": 4.262025833129883, + "learning_rate": 1.7801345531877417e-05, + "loss": 0.4367, + "step": 3794 + }, + { + "epoch": 0.4618192881046547, + "grad_norm": 3.687232255935669, + "learning_rate": 1.7800125006355618e-05, + "loss": 0.4934, + "step": 3795 + }, + { + "epoch": 0.46194097961667174, + "grad_norm": 3.1868460178375244, + "learning_rate": 1.7798904184022916e-05, + "loss": 0.4662, + "step": 3796 + }, + { + "epoch": 0.46206267112868876, + "grad_norm": 1.6585912704467773, + "learning_rate": 1.7797683064925767e-05, + "loss": 0.4582, + "step": 3797 + }, + { + "epoch": 0.46218436264070584, + "grad_norm": 1.2644832134246826, + "learning_rate": 1.7796461649110636e-05, + "loss": 0.4843, + "step": 3798 + }, + { + "epoch": 0.46230605415272286, + "grad_norm": 0.7805526852607727, + "learning_rate": 1.7795239936624004e-05, + "loss": 0.4628, + "step": 3799 + }, + { + "epoch": 0.4624277456647399, + "grad_norm": 1.4566998481750488, + "learning_rate": 1.7794017927512356e-05, + "loss": 0.5042, + "step": 3800 + }, + { + "epoch": 0.4625494371767569, + "grad_norm": 0.6934186816215515, + "learning_rate": 1.779279562182219e-05, + "loss": 0.4564, + "step": 3801 + }, + { + "epoch": 0.46267112868877397, + "grad_norm": 0.7121795415878296, + "learning_rate": 1.7791573019600024e-05, + "loss": 0.4521, + "step": 3802 + }, + { + "epoch": 0.462792820200791, + "grad_norm": 0.9816343784332275, + "learning_rate": 1.7790350120892376e-05, + "loss": 0.4665, + "step": 3803 + }, + { + "epoch": 0.462914511712808, + "grad_norm": 1.8246629238128662, + "learning_rate": 1.7789126925745778e-05, + "loss": 0.4029, + "step": 3804 + }, + { + "epoch": 0.4630362032248251, + "grad_norm": 3.1889076232910156, + "learning_rate": 1.7787903434206783e-05, + "loss": 0.4012, + "step": 3805 + }, + { + "epoch": 0.4631578947368421, + "grad_norm": 1.9276530742645264, + "learning_rate": 1.7786679646321937e-05, + "loss": 0.4858, + "step": 3806 + }, + { + "epoch": 0.4632795862488591, + "grad_norm": 2.8115413188934326, + "learning_rate": 1.7785455562137818e-05, + "loss": 0.5007, + "step": 3807 + }, + { + "epoch": 0.4634012777608762, + "grad_norm": 1.1064033508300781, + "learning_rate": 1.7784231181700998e-05, + "loss": 0.401, + "step": 3808 + }, + { + "epoch": 0.4635229692728932, + "grad_norm": 1.599065899848938, + "learning_rate": 1.7783006505058068e-05, + "loss": 0.4867, + "step": 3809 + }, + { + "epoch": 0.46364466078491023, + "grad_norm": 2.7338004112243652, + "learning_rate": 1.778178153225563e-05, + "loss": 0.4869, + "step": 3810 + }, + { + "epoch": 0.4637663522969273, + "grad_norm": 0.6401821374893188, + "learning_rate": 1.77805562633403e-05, + "loss": 0.4681, + "step": 3811 + }, + { + "epoch": 0.46388804380894433, + "grad_norm": 1.9539035558700562, + "learning_rate": 1.77793306983587e-05, + "loss": 0.4517, + "step": 3812 + }, + { + "epoch": 0.46400973532096135, + "grad_norm": 0.6471558809280396, + "learning_rate": 1.7778104837357462e-05, + "loss": 0.4657, + "step": 3813 + }, + { + "epoch": 0.4641314268329784, + "grad_norm": 0.9634807109832764, + "learning_rate": 1.7776878680383232e-05, + "loss": 0.471, + "step": 3814 + }, + { + "epoch": 0.46425311834499544, + "grad_norm": 2.7370593547821045, + "learning_rate": 1.7775652227482677e-05, + "loss": 0.429, + "step": 3815 + }, + { + "epoch": 0.46437480985701246, + "grad_norm": 2.71479868888855, + "learning_rate": 1.7774425478702458e-05, + "loss": 0.4274, + "step": 3816 + }, + { + "epoch": 0.46449650136902954, + "grad_norm": 1.3262461423873901, + "learning_rate": 1.7773198434089256e-05, + "loss": 0.4955, + "step": 3817 + }, + { + "epoch": 0.46461819288104655, + "grad_norm": 4.287867546081543, + "learning_rate": 1.7771971093689765e-05, + "loss": 0.5026, + "step": 3818 + }, + { + "epoch": 0.4647398843930636, + "grad_norm": 2.4017977714538574, + "learning_rate": 1.7770743457550688e-05, + "loss": 0.5088, + "step": 3819 + }, + { + "epoch": 0.4648615759050806, + "grad_norm": 0.6364077925682068, + "learning_rate": 1.7769515525718738e-05, + "loss": 0.4219, + "step": 3820 + }, + { + "epoch": 0.46498326741709767, + "grad_norm": 0.9554967284202576, + "learning_rate": 1.7768287298240638e-05, + "loss": 0.4376, + "step": 3821 + }, + { + "epoch": 0.4651049589291147, + "grad_norm": 1.2945671081542969, + "learning_rate": 1.7767058775163126e-05, + "loss": 0.4526, + "step": 3822 + }, + { + "epoch": 0.4652266504411317, + "grad_norm": 1.7694463729858398, + "learning_rate": 1.7765829956532953e-05, + "loss": 0.4589, + "step": 3823 + }, + { + "epoch": 0.4653483419531488, + "grad_norm": 1.8475998640060425, + "learning_rate": 1.7764600842396874e-05, + "loss": 0.4849, + "step": 3824 + }, + { + "epoch": 0.4654700334651658, + "grad_norm": 0.6160339117050171, + "learning_rate": 1.7763371432801663e-05, + "loss": 0.4598, + "step": 3825 + }, + { + "epoch": 0.4655917249771828, + "grad_norm": 0.7143591642379761, + "learning_rate": 1.77621417277941e-05, + "loss": 0.5174, + "step": 3826 + }, + { + "epoch": 0.4657134164891999, + "grad_norm": 1.369228482246399, + "learning_rate": 1.7760911727420977e-05, + "loss": 0.4692, + "step": 3827 + }, + { + "epoch": 0.4658351080012169, + "grad_norm": 2.0281574726104736, + "learning_rate": 1.7759681431729095e-05, + "loss": 0.4916, + "step": 3828 + }, + { + "epoch": 0.46595679951323393, + "grad_norm": 1.601747989654541, + "learning_rate": 1.7758450840765275e-05, + "loss": 0.5092, + "step": 3829 + }, + { + "epoch": 0.466078491025251, + "grad_norm": 1.9887176752090454, + "learning_rate": 1.7757219954576345e-05, + "loss": 0.4802, + "step": 3830 + }, + { + "epoch": 0.466200182537268, + "grad_norm": 1.197410225868225, + "learning_rate": 1.7755988773209135e-05, + "loss": 0.4655, + "step": 3831 + }, + { + "epoch": 0.46632187404928505, + "grad_norm": 1.164315938949585, + "learning_rate": 1.7754757296710502e-05, + "loss": 0.5029, + "step": 3832 + }, + { + "epoch": 0.4664435655613021, + "grad_norm": 2.4914932250976562, + "learning_rate": 1.77535255251273e-05, + "loss": 0.5014, + "step": 3833 + }, + { + "epoch": 0.46656525707331914, + "grad_norm": 1.3653773069381714, + "learning_rate": 1.77522934585064e-05, + "loss": 0.4461, + "step": 3834 + }, + { + "epoch": 0.46668694858533616, + "grad_norm": 1.459444284439087, + "learning_rate": 1.775106109689469e-05, + "loss": 0.4332, + "step": 3835 + }, + { + "epoch": 0.46680864009735323, + "grad_norm": 1.3079185485839844, + "learning_rate": 1.774982844033906e-05, + "loss": 0.4074, + "step": 3836 + }, + { + "epoch": 0.46693033160937025, + "grad_norm": 1.0709340572357178, + "learning_rate": 1.774859548888642e-05, + "loss": 0.4394, + "step": 3837 + }, + { + "epoch": 0.46705202312138727, + "grad_norm": 5.704166889190674, + "learning_rate": 1.774736224258368e-05, + "loss": 0.5509, + "step": 3838 + }, + { + "epoch": 0.46717371463340435, + "grad_norm": 1.1317018270492554, + "learning_rate": 1.7746128701477775e-05, + "loss": 0.4485, + "step": 3839 + }, + { + "epoch": 0.46729540614542137, + "grad_norm": 1.9302055835723877, + "learning_rate": 1.7744894865615638e-05, + "loss": 0.4492, + "step": 3840 + }, + { + "epoch": 0.4674170976574384, + "grad_norm": 3.718958616256714, + "learning_rate": 1.7743660735044216e-05, + "loss": 0.4925, + "step": 3841 + }, + { + "epoch": 0.4675387891694554, + "grad_norm": 1.6200578212738037, + "learning_rate": 1.7742426309810475e-05, + "loss": 0.4323, + "step": 3842 + }, + { + "epoch": 0.4676604806814725, + "grad_norm": 0.9008049964904785, + "learning_rate": 1.774119158996139e-05, + "loss": 0.4503, + "step": 3843 + }, + { + "epoch": 0.4677821721934895, + "grad_norm": 0.8503767251968384, + "learning_rate": 1.773995657554394e-05, + "loss": 0.4789, + "step": 3844 + }, + { + "epoch": 0.4679038637055065, + "grad_norm": 3.2512094974517822, + "learning_rate": 1.7738721266605122e-05, + "loss": 0.4485, + "step": 3845 + }, + { + "epoch": 0.4680255552175236, + "grad_norm": 0.7887652516365051, + "learning_rate": 1.7737485663191944e-05, + "loss": 0.5324, + "step": 3846 + }, + { + "epoch": 0.4681472467295406, + "grad_norm": 1.9994163513183594, + "learning_rate": 1.7736249765351418e-05, + "loss": 0.4199, + "step": 3847 + }, + { + "epoch": 0.46826893824155763, + "grad_norm": 0.9749098420143127, + "learning_rate": 1.7735013573130574e-05, + "loss": 0.4692, + "step": 3848 + }, + { + "epoch": 0.4683906297535747, + "grad_norm": 0.7642173171043396, + "learning_rate": 1.7733777086576457e-05, + "loss": 0.4222, + "step": 3849 + }, + { + "epoch": 0.4685123212655917, + "grad_norm": 3.0395326614379883, + "learning_rate": 1.773254030573611e-05, + "loss": 0.4655, + "step": 3850 + }, + { + "epoch": 0.46863401277760874, + "grad_norm": 4.106037139892578, + "learning_rate": 1.7731303230656598e-05, + "loss": 0.5463, + "step": 3851 + }, + { + "epoch": 0.4687557042896258, + "grad_norm": 3.6523020267486572, + "learning_rate": 1.7730065861384998e-05, + "loss": 0.5447, + "step": 3852 + }, + { + "epoch": 0.46887739580164284, + "grad_norm": 2.7071120738983154, + "learning_rate": 1.772882819796839e-05, + "loss": 0.5235, + "step": 3853 + }, + { + "epoch": 0.46899908731365986, + "grad_norm": 0.5655571222305298, + "learning_rate": 1.7727590240453872e-05, + "loss": 0.4853, + "step": 3854 + }, + { + "epoch": 0.46912077882567693, + "grad_norm": 3.7611331939697266, + "learning_rate": 1.772635198888855e-05, + "loss": 0.4106, + "step": 3855 + }, + { + "epoch": 0.46924247033769395, + "grad_norm": 0.7470240592956543, + "learning_rate": 1.7725113443319545e-05, + "loss": 0.4921, + "step": 3856 + }, + { + "epoch": 0.46936416184971097, + "grad_norm": 0.6359212398529053, + "learning_rate": 1.772387460379398e-05, + "loss": 0.4929, + "step": 3857 + }, + { + "epoch": 0.46948585336172804, + "grad_norm": 2.6233646869659424, + "learning_rate": 1.7722635470359e-05, + "loss": 0.4745, + "step": 3858 + }, + { + "epoch": 0.46960754487374506, + "grad_norm": 3.774162530899048, + "learning_rate": 1.7721396043061758e-05, + "loss": 0.4428, + "step": 3859 + }, + { + "epoch": 0.4697292363857621, + "grad_norm": 0.6341200470924377, + "learning_rate": 1.772015632194941e-05, + "loss": 0.5379, + "step": 3860 + }, + { + "epoch": 0.4698509278977791, + "grad_norm": 3.63291597366333, + "learning_rate": 1.7718916307069133e-05, + "loss": 0.4503, + "step": 3861 + }, + { + "epoch": 0.4699726194097962, + "grad_norm": 1.582391619682312, + "learning_rate": 1.7717675998468116e-05, + "loss": 0.448, + "step": 3862 + }, + { + "epoch": 0.4700943109218132, + "grad_norm": 1.2197394371032715, + "learning_rate": 1.7716435396193553e-05, + "loss": 0.459, + "step": 3863 + }, + { + "epoch": 0.4702160024338302, + "grad_norm": 0.724090039730072, + "learning_rate": 1.771519450029265e-05, + "loss": 0.4733, + "step": 3864 + }, + { + "epoch": 0.4703376939458473, + "grad_norm": 0.7258306741714478, + "learning_rate": 1.7713953310812626e-05, + "loss": 0.4367, + "step": 3865 + }, + { + "epoch": 0.4704593854578643, + "grad_norm": 1.6493923664093018, + "learning_rate": 1.7712711827800713e-05, + "loss": 0.3821, + "step": 3866 + }, + { + "epoch": 0.47058107696988133, + "grad_norm": 3.9862005710601807, + "learning_rate": 1.7711470051304148e-05, + "loss": 0.5409, + "step": 3867 + }, + { + "epoch": 0.4707027684818984, + "grad_norm": 4.961276531219482, + "learning_rate": 1.771022798137019e-05, + "loss": 0.5585, + "step": 3868 + }, + { + "epoch": 0.4708244599939154, + "grad_norm": 2.241069793701172, + "learning_rate": 1.7708985618046096e-05, + "loss": 0.4895, + "step": 3869 + }, + { + "epoch": 0.47094615150593244, + "grad_norm": 0.7463817596435547, + "learning_rate": 1.770774296137914e-05, + "loss": 0.488, + "step": 3870 + }, + { + "epoch": 0.4710678430179495, + "grad_norm": 0.6749276518821716, + "learning_rate": 1.7706500011416616e-05, + "loss": 0.4543, + "step": 3871 + }, + { + "epoch": 0.47118953452996654, + "grad_norm": 2.60927414894104, + "learning_rate": 1.7705256768205806e-05, + "loss": 0.5627, + "step": 3872 + }, + { + "epoch": 0.47131122604198356, + "grad_norm": 3.590068817138672, + "learning_rate": 1.7704013231794036e-05, + "loss": 0.4644, + "step": 3873 + }, + { + "epoch": 0.47143291755400063, + "grad_norm": 2.517420530319214, + "learning_rate": 1.770276940222861e-05, + "loss": 0.4595, + "step": 3874 + }, + { + "epoch": 0.47155460906601765, + "grad_norm": 2.272606372833252, + "learning_rate": 1.770152527955687e-05, + "loss": 0.4536, + "step": 3875 + }, + { + "epoch": 0.47167630057803467, + "grad_norm": 2.187446355819702, + "learning_rate": 1.7700280863826144e-05, + "loss": 0.444, + "step": 3876 + }, + { + "epoch": 0.47179799209005174, + "grad_norm": 1.4742701053619385, + "learning_rate": 1.7699036155083797e-05, + "loss": 0.5108, + "step": 3877 + }, + { + "epoch": 0.47191968360206876, + "grad_norm": 1.1884597539901733, + "learning_rate": 1.769779115337719e-05, + "loss": 0.4347, + "step": 3878 + }, + { + "epoch": 0.4720413751140858, + "grad_norm": 1.8193438053131104, + "learning_rate": 1.7696545858753693e-05, + "loss": 0.4224, + "step": 3879 + }, + { + "epoch": 0.47216306662610286, + "grad_norm": 2.2266461849212646, + "learning_rate": 1.76953002712607e-05, + "loss": 0.5117, + "step": 3880 + }, + { + "epoch": 0.4722847581381199, + "grad_norm": 0.5971178412437439, + "learning_rate": 1.7694054390945595e-05, + "loss": 0.4135, + "step": 3881 + }, + { + "epoch": 0.4724064496501369, + "grad_norm": 2.7110724449157715, + "learning_rate": 1.7692808217855803e-05, + "loss": 0.5097, + "step": 3882 + }, + { + "epoch": 0.4725281411621539, + "grad_norm": 2.0419669151306152, + "learning_rate": 1.769156175203873e-05, + "loss": 0.4734, + "step": 3883 + }, + { + "epoch": 0.472649832674171, + "grad_norm": 1.217822790145874, + "learning_rate": 1.7690314993541814e-05, + "loss": 0.4206, + "step": 3884 + }, + { + "epoch": 0.472771524186188, + "grad_norm": 2.3650529384613037, + "learning_rate": 1.7689067942412492e-05, + "loss": 0.4842, + "step": 3885 + }, + { + "epoch": 0.472893215698205, + "grad_norm": 0.6739344596862793, + "learning_rate": 1.768782059869822e-05, + "loss": 0.4683, + "step": 3886 + }, + { + "epoch": 0.4730149072102221, + "grad_norm": 1.8087377548217773, + "learning_rate": 1.768657296244646e-05, + "loss": 0.4023, + "step": 3887 + }, + { + "epoch": 0.4731365987222391, + "grad_norm": 1.481730341911316, + "learning_rate": 1.768532503370469e-05, + "loss": 0.524, + "step": 3888 + }, + { + "epoch": 0.47325829023425614, + "grad_norm": 1.256410837173462, + "learning_rate": 1.7684076812520397e-05, + "loss": 0.4728, + "step": 3889 + }, + { + "epoch": 0.4733799817462732, + "grad_norm": 2.8640084266662598, + "learning_rate": 1.7682828298941074e-05, + "loss": 0.465, + "step": 3890 + }, + { + "epoch": 0.47350167325829023, + "grad_norm": 1.838978886604309, + "learning_rate": 1.7681579493014232e-05, + "loss": 0.4309, + "step": 3891 + }, + { + "epoch": 0.47362336477030725, + "grad_norm": 0.6003907918930054, + "learning_rate": 1.768033039478739e-05, + "loss": 0.4523, + "step": 3892 + }, + { + "epoch": 0.47374505628232433, + "grad_norm": 0.8352644443511963, + "learning_rate": 1.767908100430808e-05, + "loss": 0.5105, + "step": 3893 + }, + { + "epoch": 0.47386674779434135, + "grad_norm": 0.6823041439056396, + "learning_rate": 1.767783132162384e-05, + "loss": 0.5165, + "step": 3894 + }, + { + "epoch": 0.47398843930635837, + "grad_norm": 0.9185401797294617, + "learning_rate": 1.767658134678223e-05, + "loss": 0.4721, + "step": 3895 + }, + { + "epoch": 0.47411013081837544, + "grad_norm": 1.2255065441131592, + "learning_rate": 1.767533107983081e-05, + "loss": 0.506, + "step": 3896 + }, + { + "epoch": 0.47423182233039246, + "grad_norm": 1.4775419235229492, + "learning_rate": 1.7674080520817153e-05, + "loss": 0.5556, + "step": 3897 + }, + { + "epoch": 0.4743535138424095, + "grad_norm": 0.6126468777656555, + "learning_rate": 1.7672829669788847e-05, + "loss": 0.4671, + "step": 3898 + }, + { + "epoch": 0.47447520535442655, + "grad_norm": 0.8431262373924255, + "learning_rate": 1.7671578526793492e-05, + "loss": 0.4915, + "step": 3899 + }, + { + "epoch": 0.4745968968664436, + "grad_norm": 4.0832438468933105, + "learning_rate": 1.7670327091878694e-05, + "loss": 0.4709, + "step": 3900 + }, + { + "epoch": 0.4747185883784606, + "grad_norm": 2.341308832168579, + "learning_rate": 1.7669075365092073e-05, + "loss": 0.482, + "step": 3901 + }, + { + "epoch": 0.4748402798904776, + "grad_norm": 0.706177830696106, + "learning_rate": 1.7667823346481263e-05, + "loss": 0.4873, + "step": 3902 + }, + { + "epoch": 0.4749619714024947, + "grad_norm": 0.6799188256263733, + "learning_rate": 1.76665710360939e-05, + "loss": 0.4703, + "step": 3903 + }, + { + "epoch": 0.4750836629145117, + "grad_norm": 1.3929799795150757, + "learning_rate": 1.766531843397764e-05, + "loss": 0.5084, + "step": 3904 + }, + { + "epoch": 0.4752053544265287, + "grad_norm": 0.5733524560928345, + "learning_rate": 1.766406554018015e-05, + "loss": 0.471, + "step": 3905 + }, + { + "epoch": 0.4753270459385458, + "grad_norm": 1.8817859888076782, + "learning_rate": 1.76628123547491e-05, + "loss": 0.4149, + "step": 3906 + }, + { + "epoch": 0.4754487374505628, + "grad_norm": 0.9669296145439148, + "learning_rate": 1.766155887773218e-05, + "loss": 0.4681, + "step": 3907 + }, + { + "epoch": 0.47557042896257984, + "grad_norm": 2.8782145977020264, + "learning_rate": 1.7660305109177087e-05, + "loss": 0.394, + "step": 3908 + }, + { + "epoch": 0.4756921204745969, + "grad_norm": 0.8162304162979126, + "learning_rate": 1.7659051049131527e-05, + "loss": 0.4689, + "step": 3909 + }, + { + "epoch": 0.47581381198661393, + "grad_norm": 1.816863775253296, + "learning_rate": 1.765779669764322e-05, + "loss": 0.5006, + "step": 3910 + }, + { + "epoch": 0.47593550349863095, + "grad_norm": 2.4650182723999023, + "learning_rate": 1.7656542054759902e-05, + "loss": 0.3802, + "step": 3911 + }, + { + "epoch": 0.476057195010648, + "grad_norm": 2.4364404678344727, + "learning_rate": 1.7655287120529307e-05, + "loss": 0.4814, + "step": 3912 + }, + { + "epoch": 0.47617888652266505, + "grad_norm": 1.3512928485870361, + "learning_rate": 1.7654031894999192e-05, + "loss": 0.4554, + "step": 3913 + }, + { + "epoch": 0.47630057803468207, + "grad_norm": 2.0477256774902344, + "learning_rate": 1.7652776378217324e-05, + "loss": 0.3982, + "step": 3914 + }, + { + "epoch": 0.47642226954669914, + "grad_norm": 2.214003324508667, + "learning_rate": 1.7651520570231467e-05, + "loss": 0.3799, + "step": 3915 + }, + { + "epoch": 0.47654396105871616, + "grad_norm": 0.8521255254745483, + "learning_rate": 1.7650264471089423e-05, + "loss": 0.4711, + "step": 3916 + }, + { + "epoch": 0.4766656525707332, + "grad_norm": 1.2974274158477783, + "learning_rate": 1.7649008080838976e-05, + "loss": 0.4625, + "step": 3917 + }, + { + "epoch": 0.47678734408275025, + "grad_norm": 2.397916078567505, + "learning_rate": 1.7647751399527938e-05, + "loss": 0.5298, + "step": 3918 + }, + { + "epoch": 0.47690903559476727, + "grad_norm": 2.0906403064727783, + "learning_rate": 1.764649442720413e-05, + "loss": 0.4034, + "step": 3919 + }, + { + "epoch": 0.4770307271067843, + "grad_norm": 1.348837971687317, + "learning_rate": 1.7645237163915383e-05, + "loss": 0.4621, + "step": 3920 + }, + { + "epoch": 0.47715241861880137, + "grad_norm": 2.266852855682373, + "learning_rate": 1.764397960970954e-05, + "loss": 0.4936, + "step": 3921 + }, + { + "epoch": 0.4772741101308184, + "grad_norm": 1.897757887840271, + "learning_rate": 1.7642721764634447e-05, + "loss": 0.5249, + "step": 3922 + }, + { + "epoch": 0.4773958016428354, + "grad_norm": 1.560788631439209, + "learning_rate": 1.764146362873797e-05, + "loss": 0.4601, + "step": 3923 + }, + { + "epoch": 0.4775174931548524, + "grad_norm": 2.0800445079803467, + "learning_rate": 1.7640205202067985e-05, + "loss": 0.5232, + "step": 3924 + }, + { + "epoch": 0.4776391846668695, + "grad_norm": 2.9814136028289795, + "learning_rate": 1.7638946484672382e-05, + "loss": 0.5021, + "step": 3925 + }, + { + "epoch": 0.4777608761788865, + "grad_norm": 6.038138389587402, + "learning_rate": 1.763768747659905e-05, + "loss": 0.4347, + "step": 3926 + }, + { + "epoch": 0.47788256769090354, + "grad_norm": 2.1282153129577637, + "learning_rate": 1.7636428177895902e-05, + "loss": 0.4785, + "step": 3927 + }, + { + "epoch": 0.4780042592029206, + "grad_norm": 3.8376569747924805, + "learning_rate": 1.7635168588610855e-05, + "loss": 0.5067, + "step": 3928 + }, + { + "epoch": 0.47812595071493763, + "grad_norm": 0.8172527551651001, + "learning_rate": 1.7633908708791837e-05, + "loss": 0.503, + "step": 3929 + }, + { + "epoch": 0.47824764222695465, + "grad_norm": 2.8463196754455566, + "learning_rate": 1.7632648538486792e-05, + "loss": 0.4964, + "step": 3930 + }, + { + "epoch": 0.4783693337389717, + "grad_norm": 0.7011560201644897, + "learning_rate": 1.7631388077743675e-05, + "loss": 0.4432, + "step": 3931 + }, + { + "epoch": 0.47849102525098874, + "grad_norm": 1.3020755052566528, + "learning_rate": 1.7630127326610442e-05, + "loss": 0.467, + "step": 3932 + }, + { + "epoch": 0.47861271676300576, + "grad_norm": 1.6074851751327515, + "learning_rate": 1.762886628513507e-05, + "loss": 0.4168, + "step": 3933 + }, + { + "epoch": 0.47873440827502284, + "grad_norm": 0.9652578234672546, + "learning_rate": 1.7627604953365548e-05, + "loss": 0.4363, + "step": 3934 + }, + { + "epoch": 0.47885609978703986, + "grad_norm": 1.9899330139160156, + "learning_rate": 1.7626343331349872e-05, + "loss": 0.4976, + "step": 3935 + }, + { + "epoch": 0.4789777912990569, + "grad_norm": 1.2797359228134155, + "learning_rate": 1.7625081419136042e-05, + "loss": 0.4216, + "step": 3936 + }, + { + "epoch": 0.47909948281107395, + "grad_norm": 3.8270087242126465, + "learning_rate": 1.762381921677208e-05, + "loss": 0.523, + "step": 3937 + }, + { + "epoch": 0.47922117432309097, + "grad_norm": 4.82899808883667, + "learning_rate": 1.7622556724306018e-05, + "loss": 0.5312, + "step": 3938 + }, + { + "epoch": 0.479342865835108, + "grad_norm": 1.2736430168151855, + "learning_rate": 1.7621293941785893e-05, + "loss": 0.442, + "step": 3939 + }, + { + "epoch": 0.47946455734712506, + "grad_norm": 0.8976635932922363, + "learning_rate": 1.7620030869259762e-05, + "loss": 0.4657, + "step": 3940 + }, + { + "epoch": 0.4795862488591421, + "grad_norm": 0.7164414525032043, + "learning_rate": 1.7618767506775683e-05, + "loss": 0.4773, + "step": 3941 + }, + { + "epoch": 0.4797079403711591, + "grad_norm": 1.1275383234024048, + "learning_rate": 1.7617503854381727e-05, + "loss": 0.4578, + "step": 3942 + }, + { + "epoch": 0.4798296318831761, + "grad_norm": 2.4435224533081055, + "learning_rate": 1.7616239912125985e-05, + "loss": 0.4429, + "step": 3943 + }, + { + "epoch": 0.4799513233951932, + "grad_norm": 1.6950342655181885, + "learning_rate": 1.761497568005655e-05, + "loss": 0.4977, + "step": 3944 + }, + { + "epoch": 0.4800730149072102, + "grad_norm": 3.6762256622314453, + "learning_rate": 1.7613711158221523e-05, + "loss": 0.3738, + "step": 3945 + }, + { + "epoch": 0.48019470641922724, + "grad_norm": 1.2391748428344727, + "learning_rate": 1.7612446346669034e-05, + "loss": 0.4362, + "step": 3946 + }, + { + "epoch": 0.4803163979312443, + "grad_norm": 1.591848611831665, + "learning_rate": 1.7611181245447203e-05, + "loss": 0.4849, + "step": 3947 + }, + { + "epoch": 0.48043808944326133, + "grad_norm": 2.195936441421509, + "learning_rate": 1.7609915854604168e-05, + "loss": 0.5437, + "step": 3948 + }, + { + "epoch": 0.48055978095527835, + "grad_norm": 4.923917293548584, + "learning_rate": 1.7608650174188087e-05, + "loss": 0.5803, + "step": 3949 + }, + { + "epoch": 0.4806814724672954, + "grad_norm": 1.3654608726501465, + "learning_rate": 1.7607384204247114e-05, + "loss": 0.4934, + "step": 3950 + }, + { + "epoch": 0.48080316397931244, + "grad_norm": 1.8725523948669434, + "learning_rate": 1.7606117944829427e-05, + "loss": 0.4706, + "step": 3951 + }, + { + "epoch": 0.48092485549132946, + "grad_norm": 1.3736755847930908, + "learning_rate": 1.7604851395983208e-05, + "loss": 0.4654, + "step": 3952 + }, + { + "epoch": 0.48104654700334654, + "grad_norm": 1.4079389572143555, + "learning_rate": 1.7603584557756655e-05, + "loss": 0.445, + "step": 3953 + }, + { + "epoch": 0.48116823851536356, + "grad_norm": 1.7103395462036133, + "learning_rate": 1.760231743019797e-05, + "loss": 0.5362, + "step": 3954 + }, + { + "epoch": 0.4812899300273806, + "grad_norm": 2.03585147857666, + "learning_rate": 1.7601050013355368e-05, + "loss": 0.4644, + "step": 3955 + }, + { + "epoch": 0.48141162153939765, + "grad_norm": 1.4892867803573608, + "learning_rate": 1.759978230727708e-05, + "loss": 0.5017, + "step": 3956 + }, + { + "epoch": 0.48153331305141467, + "grad_norm": 2.6341655254364014, + "learning_rate": 1.7598514312011348e-05, + "loss": 0.4664, + "step": 3957 + }, + { + "epoch": 0.4816550045634317, + "grad_norm": 0.9715067148208618, + "learning_rate": 1.7597246027606417e-05, + "loss": 0.5133, + "step": 3958 + }, + { + "epoch": 0.48177669607544876, + "grad_norm": 2.2763943672180176, + "learning_rate": 1.7595977454110547e-05, + "loss": 0.4355, + "step": 3959 + }, + { + "epoch": 0.4818983875874658, + "grad_norm": 3.025862216949463, + "learning_rate": 1.7594708591572013e-05, + "loss": 0.4154, + "step": 3960 + }, + { + "epoch": 0.4820200790994828, + "grad_norm": 1.073451042175293, + "learning_rate": 1.7593439440039098e-05, + "loss": 0.4503, + "step": 3961 + }, + { + "epoch": 0.4821417706114998, + "grad_norm": 2.2360925674438477, + "learning_rate": 1.7592169999560097e-05, + "loss": 0.5033, + "step": 3962 + }, + { + "epoch": 0.4822634621235169, + "grad_norm": 1.2230095863342285, + "learning_rate": 1.7590900270183308e-05, + "loss": 0.4539, + "step": 3963 + }, + { + "epoch": 0.4823851536355339, + "grad_norm": 2.842133045196533, + "learning_rate": 1.7589630251957052e-05, + "loss": 0.4501, + "step": 3964 + }, + { + "epoch": 0.48250684514755093, + "grad_norm": 1.4633009433746338, + "learning_rate": 1.7588359944929658e-05, + "loss": 0.4063, + "step": 3965 + }, + { + "epoch": 0.482628536659568, + "grad_norm": 1.738244891166687, + "learning_rate": 1.758708934914946e-05, + "loss": 0.4938, + "step": 3966 + }, + { + "epoch": 0.482750228171585, + "grad_norm": 0.9651433825492859, + "learning_rate": 1.758581846466481e-05, + "loss": 0.4369, + "step": 3967 + }, + { + "epoch": 0.48287191968360205, + "grad_norm": 0.5670275092124939, + "learning_rate": 1.758454729152406e-05, + "loss": 0.4598, + "step": 3968 + }, + { + "epoch": 0.4829936111956191, + "grad_norm": 2.052802562713623, + "learning_rate": 1.7583275829775593e-05, + "loss": 0.4577, + "step": 3969 + }, + { + "epoch": 0.48311530270763614, + "grad_norm": 1.7143601179122925, + "learning_rate": 1.7582004079467777e-05, + "loss": 0.4412, + "step": 3970 + }, + { + "epoch": 0.48323699421965316, + "grad_norm": 1.1367214918136597, + "learning_rate": 1.7580732040649016e-05, + "loss": 0.4603, + "step": 3971 + }, + { + "epoch": 0.48335868573167023, + "grad_norm": 0.6317379474639893, + "learning_rate": 1.757945971336771e-05, + "loss": 0.4584, + "step": 3972 + }, + { + "epoch": 0.48348037724368725, + "grad_norm": 1.0157256126403809, + "learning_rate": 1.7578187097672272e-05, + "loss": 0.4668, + "step": 3973 + }, + { + "epoch": 0.4836020687557043, + "grad_norm": 2.4871716499328613, + "learning_rate": 1.757691419361113e-05, + "loss": 0.5098, + "step": 3974 + }, + { + "epoch": 0.48372376026772135, + "grad_norm": 0.5689814686775208, + "learning_rate": 1.7575641001232718e-05, + "loss": 0.4674, + "step": 3975 + }, + { + "epoch": 0.48384545177973837, + "grad_norm": 1.0546921491622925, + "learning_rate": 1.7574367520585486e-05, + "loss": 0.4654, + "step": 3976 + }, + { + "epoch": 0.4839671432917554, + "grad_norm": 1.9708023071289062, + "learning_rate": 1.7573093751717895e-05, + "loss": 0.4678, + "step": 3977 + }, + { + "epoch": 0.48408883480377246, + "grad_norm": 2.288806676864624, + "learning_rate": 1.757181969467841e-05, + "loss": 0.4426, + "step": 3978 + }, + { + "epoch": 0.4842105263157895, + "grad_norm": 1.6643061637878418, + "learning_rate": 1.7570545349515508e-05, + "loss": 0.4832, + "step": 3979 + }, + { + "epoch": 0.4843322178278065, + "grad_norm": 3.4978158473968506, + "learning_rate": 1.756927071627769e-05, + "loss": 0.5471, + "step": 3980 + }, + { + "epoch": 0.4844539093398236, + "grad_norm": 0.7838190793991089, + "learning_rate": 1.7567995795013454e-05, + "loss": 0.4458, + "step": 3981 + }, + { + "epoch": 0.4845756008518406, + "grad_norm": 3.7718207836151123, + "learning_rate": 1.756672058577131e-05, + "loss": 0.516, + "step": 3982 + }, + { + "epoch": 0.4846972923638576, + "grad_norm": 3.249464273452759, + "learning_rate": 1.7565445088599788e-05, + "loss": 0.5103, + "step": 3983 + }, + { + "epoch": 0.48481898387587463, + "grad_norm": 0.5579743385314941, + "learning_rate": 1.7564169303547415e-05, + "loss": 0.4202, + "step": 3984 + }, + { + "epoch": 0.4849406753878917, + "grad_norm": 0.9039687514305115, + "learning_rate": 1.756289323066275e-05, + "loss": 0.4614, + "step": 3985 + }, + { + "epoch": 0.4850623668999087, + "grad_norm": 1.2483718395233154, + "learning_rate": 1.756161686999434e-05, + "loss": 0.4882, + "step": 3986 + }, + { + "epoch": 0.48518405841192574, + "grad_norm": 1.0861611366271973, + "learning_rate": 1.7560340221590756e-05, + "loss": 0.4555, + "step": 3987 + }, + { + "epoch": 0.4853057499239428, + "grad_norm": 1.099472999572754, + "learning_rate": 1.7559063285500578e-05, + "loss": 0.4417, + "step": 3988 + }, + { + "epoch": 0.48542744143595984, + "grad_norm": 3.4981207847595215, + "learning_rate": 1.75577860617724e-05, + "loss": 0.5534, + "step": 3989 + }, + { + "epoch": 0.48554913294797686, + "grad_norm": 2.388129949569702, + "learning_rate": 1.7556508550454815e-05, + "loss": 0.491, + "step": 3990 + }, + { + "epoch": 0.48567082445999393, + "grad_norm": 1.3636597394943237, + "learning_rate": 1.7555230751596437e-05, + "loss": 0.4993, + "step": 3991 + }, + { + "epoch": 0.48579251597201095, + "grad_norm": 0.9246495962142944, + "learning_rate": 1.755395266524589e-05, + "loss": 0.4778, + "step": 3992 + }, + { + "epoch": 0.48591420748402797, + "grad_norm": 2.5662167072296143, + "learning_rate": 1.7552674291451814e-05, + "loss": 0.4185, + "step": 3993 + }, + { + "epoch": 0.48603589899604505, + "grad_norm": 1.6272306442260742, + "learning_rate": 1.7551395630262843e-05, + "loss": 0.4665, + "step": 3994 + }, + { + "epoch": 0.48615759050806207, + "grad_norm": 1.5110605955123901, + "learning_rate": 1.7550116681727637e-05, + "loss": 0.4882, + "step": 3995 + }, + { + "epoch": 0.4862792820200791, + "grad_norm": 0.7350053191184998, + "learning_rate": 1.7548837445894866e-05, + "loss": 0.4889, + "step": 3996 + }, + { + "epoch": 0.48640097353209616, + "grad_norm": 2.356365442276001, + "learning_rate": 1.7547557922813204e-05, + "loss": 0.4135, + "step": 3997 + }, + { + "epoch": 0.4865226650441132, + "grad_norm": 3.077037811279297, + "learning_rate": 1.754627811253134e-05, + "loss": 0.4469, + "step": 3998 + }, + { + "epoch": 0.4866443565561302, + "grad_norm": 2.208920478820801, + "learning_rate": 1.7544998015097973e-05, + "loss": 0.4359, + "step": 3999 + }, + { + "epoch": 0.4867660480681473, + "grad_norm": 0.7632595300674438, + "learning_rate": 1.754371763056182e-05, + "loss": 0.4808, + "step": 4000 + }, + { + "epoch": 0.4868877395801643, + "grad_norm": 1.7443029880523682, + "learning_rate": 1.754243695897159e-05, + "loss": 0.4512, + "step": 4001 + }, + { + "epoch": 0.4870094310921813, + "grad_norm": 3.6838951110839844, + "learning_rate": 1.7541156000376025e-05, + "loss": 0.4925, + "step": 4002 + }, + { + "epoch": 0.48713112260419833, + "grad_norm": 2.964545726776123, + "learning_rate": 1.7539874754823863e-05, + "loss": 0.531, + "step": 4003 + }, + { + "epoch": 0.4872528141162154, + "grad_norm": 0.8194064497947693, + "learning_rate": 1.753859322236386e-05, + "loss": 0.4405, + "step": 4004 + }, + { + "epoch": 0.4873745056282324, + "grad_norm": 0.8101553916931152, + "learning_rate": 1.753731140304478e-05, + "loss": 0.4474, + "step": 4005 + }, + { + "epoch": 0.48749619714024944, + "grad_norm": 1.820739984512329, + "learning_rate": 1.75360292969154e-05, + "loss": 0.4061, + "step": 4006 + }, + { + "epoch": 0.4876178886522665, + "grad_norm": 1.4210189580917358, + "learning_rate": 1.753474690402451e-05, + "loss": 0.4641, + "step": 4007 + }, + { + "epoch": 0.48773958016428354, + "grad_norm": 3.2839109897613525, + "learning_rate": 1.7533464224420904e-05, + "loss": 0.5322, + "step": 4008 + }, + { + "epoch": 0.48786127167630056, + "grad_norm": 1.1567356586456299, + "learning_rate": 1.753218125815339e-05, + "loss": 0.4478, + "step": 4009 + }, + { + "epoch": 0.48798296318831763, + "grad_norm": 1.8849678039550781, + "learning_rate": 1.7530898005270788e-05, + "loss": 0.4845, + "step": 4010 + }, + { + "epoch": 0.48810465470033465, + "grad_norm": 1.187034249305725, + "learning_rate": 1.7529614465821928e-05, + "loss": 0.4692, + "step": 4011 + }, + { + "epoch": 0.48822634621235167, + "grad_norm": 2.560075044631958, + "learning_rate": 1.752833063985565e-05, + "loss": 0.4299, + "step": 4012 + }, + { + "epoch": 0.48834803772436874, + "grad_norm": 2.4375059604644775, + "learning_rate": 1.752704652742081e-05, + "loss": 0.4855, + "step": 4013 + }, + { + "epoch": 0.48846972923638576, + "grad_norm": 3.6712076663970947, + "learning_rate": 1.752576212856627e-05, + "loss": 0.4107, + "step": 4014 + }, + { + "epoch": 0.4885914207484028, + "grad_norm": 1.1576403379440308, + "learning_rate": 1.7524477443340905e-05, + "loss": 0.5098, + "step": 4015 + }, + { + "epoch": 0.48871311226041986, + "grad_norm": 2.328127145767212, + "learning_rate": 1.7523192471793598e-05, + "loss": 0.4331, + "step": 4016 + }, + { + "epoch": 0.4888348037724369, + "grad_norm": 2.9909980297088623, + "learning_rate": 1.7521907213973243e-05, + "loss": 0.3904, + "step": 4017 + }, + { + "epoch": 0.4889564952844539, + "grad_norm": 3.2087669372558594, + "learning_rate": 1.752062166992875e-05, + "loss": 0.5608, + "step": 4018 + }, + { + "epoch": 0.48907818679647097, + "grad_norm": 1.2506738901138306, + "learning_rate": 1.7519335839709035e-05, + "loss": 0.4497, + "step": 4019 + }, + { + "epoch": 0.489199878308488, + "grad_norm": 0.967567503452301, + "learning_rate": 1.751804972336303e-05, + "loss": 0.4856, + "step": 4020 + }, + { + "epoch": 0.489321569820505, + "grad_norm": 0.9390907287597656, + "learning_rate": 1.751676332093967e-05, + "loss": 0.4632, + "step": 4021 + }, + { + "epoch": 0.4894432613325221, + "grad_norm": 0.8339318037033081, + "learning_rate": 1.7515476632487907e-05, + "loss": 0.4618, + "step": 4022 + }, + { + "epoch": 0.4895649528445391, + "grad_norm": 1.5600872039794922, + "learning_rate": 1.75141896580567e-05, + "loss": 0.448, + "step": 4023 + }, + { + "epoch": 0.4896866443565561, + "grad_norm": 0.7890185117721558, + "learning_rate": 1.7512902397695025e-05, + "loss": 0.4597, + "step": 4024 + }, + { + "epoch": 0.48980833586857314, + "grad_norm": 2.2592806816101074, + "learning_rate": 1.7511614851451862e-05, + "loss": 0.4719, + "step": 4025 + }, + { + "epoch": 0.4899300273805902, + "grad_norm": 2.5629689693450928, + "learning_rate": 1.7510327019376205e-05, + "loss": 0.442, + "step": 4026 + }, + { + "epoch": 0.49005171889260724, + "grad_norm": 4.361266136169434, + "learning_rate": 1.7509038901517063e-05, + "loss": 0.4591, + "step": 4027 + }, + { + "epoch": 0.49017341040462425, + "grad_norm": 3.8432695865631104, + "learning_rate": 1.7507750497923444e-05, + "loss": 0.562, + "step": 4028 + }, + { + "epoch": 0.49029510191664133, + "grad_norm": 1.3878878355026245, + "learning_rate": 1.750646180864438e-05, + "loss": 0.4691, + "step": 4029 + }, + { + "epoch": 0.49041679342865835, + "grad_norm": 2.3850536346435547, + "learning_rate": 1.7505172833728905e-05, + "loss": 0.4728, + "step": 4030 + }, + { + "epoch": 0.49053848494067537, + "grad_norm": 3.909846067428589, + "learning_rate": 1.750388357322607e-05, + "loss": 0.3717, + "step": 4031 + }, + { + "epoch": 0.49066017645269244, + "grad_norm": 0.9928010702133179, + "learning_rate": 1.7502594027184937e-05, + "loss": 0.4621, + "step": 4032 + }, + { + "epoch": 0.49078186796470946, + "grad_norm": 2.579378128051758, + "learning_rate": 1.7501304195654564e-05, + "loss": 0.4829, + "step": 4033 + }, + { + "epoch": 0.4909035594767265, + "grad_norm": 4.5460686683654785, + "learning_rate": 1.7500014078684045e-05, + "loss": 0.5258, + "step": 4034 + }, + { + "epoch": 0.49102525098874356, + "grad_norm": 0.8646695613861084, + "learning_rate": 1.7498723676322464e-05, + "loss": 0.4579, + "step": 4035 + }, + { + "epoch": 0.4911469425007606, + "grad_norm": 2.715090751647949, + "learning_rate": 1.7497432988618926e-05, + "loss": 0.5009, + "step": 4036 + }, + { + "epoch": 0.4912686340127776, + "grad_norm": 1.2365626096725464, + "learning_rate": 1.7496142015622545e-05, + "loss": 0.4652, + "step": 4037 + }, + { + "epoch": 0.49139032552479467, + "grad_norm": 3.33669376373291, + "learning_rate": 1.7494850757382442e-05, + "loss": 0.4254, + "step": 4038 + }, + { + "epoch": 0.4915120170368117, + "grad_norm": 1.378563642501831, + "learning_rate": 1.7493559213947755e-05, + "loss": 0.5104, + "step": 4039 + }, + { + "epoch": 0.4916337085488287, + "grad_norm": 1.6664854288101196, + "learning_rate": 1.749226738536763e-05, + "loss": 0.4294, + "step": 4040 + }, + { + "epoch": 0.4917554000608458, + "grad_norm": 0.942799985408783, + "learning_rate": 1.7490975271691223e-05, + "loss": 0.5178, + "step": 4041 + }, + { + "epoch": 0.4918770915728628, + "grad_norm": 3.033684730529785, + "learning_rate": 1.74896828729677e-05, + "loss": 0.4936, + "step": 4042 + }, + { + "epoch": 0.4919987830848798, + "grad_norm": 2.8540375232696533, + "learning_rate": 1.7488390189246242e-05, + "loss": 0.5073, + "step": 4043 + }, + { + "epoch": 0.49212047459689684, + "grad_norm": 2.979496717453003, + "learning_rate": 1.7487097220576035e-05, + "loss": 0.5174, + "step": 4044 + }, + { + "epoch": 0.4922421661089139, + "grad_norm": 2.3568081855773926, + "learning_rate": 1.7485803967006287e-05, + "loss": 0.5338, + "step": 4045 + }, + { + "epoch": 0.49236385762093093, + "grad_norm": 1.4154558181762695, + "learning_rate": 1.7484510428586195e-05, + "loss": 0.4959, + "step": 4046 + }, + { + "epoch": 0.49248554913294795, + "grad_norm": 2.261134624481201, + "learning_rate": 1.7483216605364997e-05, + "loss": 0.4787, + "step": 4047 + }, + { + "epoch": 0.492607240644965, + "grad_norm": 1.465767741203308, + "learning_rate": 1.7481922497391912e-05, + "loss": 0.4968, + "step": 4048 + }, + { + "epoch": 0.49272893215698205, + "grad_norm": 2.4370086193084717, + "learning_rate": 1.7480628104716193e-05, + "loss": 0.4608, + "step": 4049 + }, + { + "epoch": 0.49285062366899907, + "grad_norm": 2.0654404163360596, + "learning_rate": 1.747933342738709e-05, + "loss": 0.5361, + "step": 4050 + }, + { + "epoch": 0.49297231518101614, + "grad_norm": 2.6077675819396973, + "learning_rate": 1.7478038465453866e-05, + "loss": 0.4227, + "step": 4051 + }, + { + "epoch": 0.49309400669303316, + "grad_norm": 0.9016644954681396, + "learning_rate": 1.7476743218965802e-05, + "loss": 0.4744, + "step": 4052 + }, + { + "epoch": 0.4932156982050502, + "grad_norm": 2.60406756401062, + "learning_rate": 1.747544768797218e-05, + "loss": 0.5289, + "step": 4053 + }, + { + "epoch": 0.49333738971706725, + "grad_norm": 2.225156545639038, + "learning_rate": 1.7474151872522305e-05, + "loss": 0.5364, + "step": 4054 + }, + { + "epoch": 0.4934590812290843, + "grad_norm": 1.802567958831787, + "learning_rate": 1.7472855772665477e-05, + "loss": 0.4567, + "step": 4055 + }, + { + "epoch": 0.4935807727411013, + "grad_norm": 0.8455933928489685, + "learning_rate": 1.747155938845102e-05, + "loss": 0.4737, + "step": 4056 + }, + { + "epoch": 0.49370246425311837, + "grad_norm": 2.3552300930023193, + "learning_rate": 1.747026271992826e-05, + "loss": 0.4232, + "step": 4057 + }, + { + "epoch": 0.4938241557651354, + "grad_norm": 4.302911281585693, + "learning_rate": 1.7468965767146545e-05, + "loss": 0.5367, + "step": 4058 + }, + { + "epoch": 0.4939458472771524, + "grad_norm": 1.4347680807113647, + "learning_rate": 1.7467668530155223e-05, + "loss": 0.4538, + "step": 4059 + }, + { + "epoch": 0.4940675387891695, + "grad_norm": 2.1260783672332764, + "learning_rate": 1.7466371009003652e-05, + "loss": 0.5289, + "step": 4060 + }, + { + "epoch": 0.4941892303011865, + "grad_norm": 1.401283860206604, + "learning_rate": 1.7465073203741215e-05, + "loss": 0.5127, + "step": 4061 + }, + { + "epoch": 0.4943109218132035, + "grad_norm": 2.4197702407836914, + "learning_rate": 1.7463775114417284e-05, + "loss": 0.4605, + "step": 4062 + }, + { + "epoch": 0.4944326133252206, + "grad_norm": 4.5010666847229, + "learning_rate": 1.7462476741081267e-05, + "loss": 0.4089, + "step": 4063 + }, + { + "epoch": 0.4945543048372376, + "grad_norm": 1.8668774366378784, + "learning_rate": 1.746117808378256e-05, + "loss": 0.5301, + "step": 4064 + }, + { + "epoch": 0.49467599634925463, + "grad_norm": 3.9548213481903076, + "learning_rate": 1.7459879142570585e-05, + "loss": 0.4425, + "step": 4065 + }, + { + "epoch": 0.49479768786127165, + "grad_norm": 3.71551251411438, + "learning_rate": 1.7458579917494767e-05, + "loss": 0.4331, + "step": 4066 + }, + { + "epoch": 0.4949193793732887, + "grad_norm": 0.6080107688903809, + "learning_rate": 1.7457280408604545e-05, + "loss": 0.4848, + "step": 4067 + }, + { + "epoch": 0.49504107088530575, + "grad_norm": 0.9849717020988464, + "learning_rate": 1.7455980615949365e-05, + "loss": 0.5123, + "step": 4068 + }, + { + "epoch": 0.49516276239732276, + "grad_norm": 1.4397056102752686, + "learning_rate": 1.7454680539578693e-05, + "loss": 0.4668, + "step": 4069 + }, + { + "epoch": 0.49528445390933984, + "grad_norm": 2.6759631633758545, + "learning_rate": 1.7453380179541996e-05, + "loss": 0.45, + "step": 4070 + }, + { + "epoch": 0.49540614542135686, + "grad_norm": 2.5754477977752686, + "learning_rate": 1.7452079535888755e-05, + "loss": 0.5208, + "step": 4071 + }, + { + "epoch": 0.4955278369333739, + "grad_norm": 1.1573103666305542, + "learning_rate": 1.745077860866846e-05, + "loss": 0.4589, + "step": 4072 + }, + { + "epoch": 0.49564952844539095, + "grad_norm": 1.0905152559280396, + "learning_rate": 1.744947739793062e-05, + "loss": 0.4502, + "step": 4073 + }, + { + "epoch": 0.49577121995740797, + "grad_norm": 0.6032271981239319, + "learning_rate": 1.744817590372474e-05, + "loss": 0.4309, + "step": 4074 + }, + { + "epoch": 0.495892911469425, + "grad_norm": 2.3655929565429688, + "learning_rate": 1.7446874126100356e-05, + "loss": 0.4804, + "step": 4075 + }, + { + "epoch": 0.49601460298144207, + "grad_norm": 2.5027287006378174, + "learning_rate": 1.7445572065106996e-05, + "loss": 0.4181, + "step": 4076 + }, + { + "epoch": 0.4961362944934591, + "grad_norm": 0.9157415628433228, + "learning_rate": 1.7444269720794206e-05, + "loss": 0.5058, + "step": 4077 + }, + { + "epoch": 0.4962579860054761, + "grad_norm": 1.4561556577682495, + "learning_rate": 1.7442967093211546e-05, + "loss": 0.5247, + "step": 4078 + }, + { + "epoch": 0.4963796775174932, + "grad_norm": 2.8743176460266113, + "learning_rate": 1.744166418240858e-05, + "loss": 0.4662, + "step": 4079 + }, + { + "epoch": 0.4965013690295102, + "grad_norm": 0.6691282391548157, + "learning_rate": 1.744036098843489e-05, + "loss": 0.4882, + "step": 4080 + }, + { + "epoch": 0.4966230605415272, + "grad_norm": 0.9167976379394531, + "learning_rate": 1.7439057511340064e-05, + "loss": 0.5171, + "step": 4081 + }, + { + "epoch": 0.4967447520535443, + "grad_norm": 2.1515262126922607, + "learning_rate": 1.74377537511737e-05, + "loss": 0.5415, + "step": 4082 + }, + { + "epoch": 0.4968664435655613, + "grad_norm": 2.418586015701294, + "learning_rate": 1.743644970798541e-05, + "loss": 0.4787, + "step": 4083 + }, + { + "epoch": 0.49698813507757833, + "grad_norm": 1.9286082983016968, + "learning_rate": 1.743514538182482e-05, + "loss": 0.4931, + "step": 4084 + }, + { + "epoch": 0.49710982658959535, + "grad_norm": 2.3758864402770996, + "learning_rate": 1.7433840772741556e-05, + "loss": 0.4512, + "step": 4085 + }, + { + "epoch": 0.4972315181016124, + "grad_norm": 3.0013015270233154, + "learning_rate": 1.743253588078526e-05, + "loss": 0.445, + "step": 4086 + }, + { + "epoch": 0.49735320961362944, + "grad_norm": 1.7188196182250977, + "learning_rate": 1.7431230706005596e-05, + "loss": 0.4143, + "step": 4087 + }, + { + "epoch": 0.49747490112564646, + "grad_norm": 1.4521846771240234, + "learning_rate": 1.7429925248452218e-05, + "loss": 0.4227, + "step": 4088 + }, + { + "epoch": 0.49759659263766354, + "grad_norm": 3.806412696838379, + "learning_rate": 1.7428619508174804e-05, + "loss": 0.4834, + "step": 4089 + }, + { + "epoch": 0.49771828414968056, + "grad_norm": 6.8284525871276855, + "learning_rate": 1.7427313485223045e-05, + "loss": 0.5896, + "step": 4090 + }, + { + "epoch": 0.4978399756616976, + "grad_norm": 4.187354564666748, + "learning_rate": 1.742600717964663e-05, + "loss": 0.4897, + "step": 4091 + }, + { + "epoch": 0.49796166717371465, + "grad_norm": 4.489995956420898, + "learning_rate": 1.7424700591495273e-05, + "loss": 0.4805, + "step": 4092 + }, + { + "epoch": 0.49808335868573167, + "grad_norm": 3.3616721630096436, + "learning_rate": 1.742339372081869e-05, + "loss": 0.4352, + "step": 4093 + }, + { + "epoch": 0.4982050501977487, + "grad_norm": 1.5986047983169556, + "learning_rate": 1.742208656766661e-05, + "loss": 0.4272, + "step": 4094 + }, + { + "epoch": 0.49832674170976576, + "grad_norm": 1.3044337034225464, + "learning_rate": 1.742077913208877e-05, + "loss": 0.4099, + "step": 4095 + }, + { + "epoch": 0.4984484332217828, + "grad_norm": 2.6791794300079346, + "learning_rate": 1.7419471414134932e-05, + "loss": 0.5036, + "step": 4096 + }, + { + "epoch": 0.4985701247337998, + "grad_norm": 2.7280631065368652, + "learning_rate": 1.7418163413854844e-05, + "loss": 0.4131, + "step": 4097 + }, + { + "epoch": 0.4986918162458169, + "grad_norm": 1.076400637626648, + "learning_rate": 1.7416855131298285e-05, + "loss": 0.4781, + "step": 4098 + }, + { + "epoch": 0.4988135077578339, + "grad_norm": 4.6368727684021, + "learning_rate": 1.7415546566515034e-05, + "loss": 0.4339, + "step": 4099 + }, + { + "epoch": 0.4989351992698509, + "grad_norm": 3.553502082824707, + "learning_rate": 1.7414237719554886e-05, + "loss": 0.4217, + "step": 4100 + }, + { + "epoch": 0.499056890781868, + "grad_norm": 2.262726068496704, + "learning_rate": 1.741292859046765e-05, + "loss": 0.4472, + "step": 4101 + }, + { + "epoch": 0.499178582293885, + "grad_norm": 1.033531904220581, + "learning_rate": 1.7411619179303136e-05, + "loss": 0.5153, + "step": 4102 + }, + { + "epoch": 0.49930027380590203, + "grad_norm": 4.11911153793335, + "learning_rate": 1.741030948611117e-05, + "loss": 0.377, + "step": 4103 + }, + { + "epoch": 0.49942196531791905, + "grad_norm": 1.7390393018722534, + "learning_rate": 1.740899951094159e-05, + "loss": 0.4826, + "step": 4104 + }, + { + "epoch": 0.4995436568299361, + "grad_norm": 2.917625665664673, + "learning_rate": 1.7407689253844244e-05, + "loss": 0.5216, + "step": 4105 + }, + { + "epoch": 0.49966534834195314, + "grad_norm": 1.3471200466156006, + "learning_rate": 1.7406378714868987e-05, + "loss": 0.4533, + "step": 4106 + }, + { + "epoch": 0.49978703985397016, + "grad_norm": 0.6451941132545471, + "learning_rate": 1.7405067894065692e-05, + "loss": 0.4281, + "step": 4107 + }, + { + "epoch": 0.49990873136598724, + "grad_norm": 2.406010389328003, + "learning_rate": 1.7403756791484232e-05, + "loss": 0.4903, + "step": 4108 + }, + { + "epoch": 0.5000304228780043, + "grad_norm": 2.0882816314697266, + "learning_rate": 1.7402445407174502e-05, + "loss": 0.4176, + "step": 4109 + }, + { + "epoch": 0.5001521143900213, + "grad_norm": 1.0812381505966187, + "learning_rate": 1.74011337411864e-05, + "loss": 0.4613, + "step": 4110 + }, + { + "epoch": 0.5002738059020383, + "grad_norm": 0.6587196588516235, + "learning_rate": 1.7399821793569847e-05, + "loss": 0.4721, + "step": 4111 + }, + { + "epoch": 0.5003954974140554, + "grad_norm": 1.8877878189086914, + "learning_rate": 1.7398509564374754e-05, + "loss": 0.53, + "step": 4112 + }, + { + "epoch": 0.5005171889260724, + "grad_norm": 1.9657156467437744, + "learning_rate": 1.7397197053651055e-05, + "loss": 0.4418, + "step": 4113 + }, + { + "epoch": 0.5006388804380895, + "grad_norm": 3.436377763748169, + "learning_rate": 1.73958842614487e-05, + "loss": 0.4596, + "step": 4114 + }, + { + "epoch": 0.5007605719501065, + "grad_norm": 0.9162855744361877, + "learning_rate": 1.7394571187817642e-05, + "loss": 0.5119, + "step": 4115 + }, + { + "epoch": 0.5008822634621235, + "grad_norm": 1.646669626235962, + "learning_rate": 1.7393257832807843e-05, + "loss": 0.4449, + "step": 4116 + }, + { + "epoch": 0.5010039549741405, + "grad_norm": 2.6974854469299316, + "learning_rate": 1.7391944196469278e-05, + "loss": 0.4252, + "step": 4117 + }, + { + "epoch": 0.5011256464861575, + "grad_norm": 0.7376638650894165, + "learning_rate": 1.7390630278851938e-05, + "loss": 0.4669, + "step": 4118 + }, + { + "epoch": 0.5012473379981747, + "grad_norm": 0.874047040939331, + "learning_rate": 1.7389316080005816e-05, + "loss": 0.4451, + "step": 4119 + }, + { + "epoch": 0.5013690295101917, + "grad_norm": 1.197197675704956, + "learning_rate": 1.7388001599980927e-05, + "loss": 0.4617, + "step": 4120 + }, + { + "epoch": 0.5014907210222087, + "grad_norm": 2.542201519012451, + "learning_rate": 1.738668683882728e-05, + "loss": 0.4511, + "step": 4121 + }, + { + "epoch": 0.5016124125342257, + "grad_norm": 1.7542531490325928, + "learning_rate": 1.738537179659491e-05, + "loss": 0.4955, + "step": 4122 + }, + { + "epoch": 0.5017341040462427, + "grad_norm": 0.7148008346557617, + "learning_rate": 1.7384056473333854e-05, + "loss": 0.4621, + "step": 4123 + }, + { + "epoch": 0.5018557955582598, + "grad_norm": 1.160097599029541, + "learning_rate": 1.738274086909417e-05, + "loss": 0.4712, + "step": 4124 + }, + { + "epoch": 0.5019774870702769, + "grad_norm": 3.369720697402954, + "learning_rate": 1.738142498392591e-05, + "loss": 0.4247, + "step": 4125 + }, + { + "epoch": 0.5020991785822939, + "grad_norm": 1.2303825616836548, + "learning_rate": 1.7380108817879156e-05, + "loss": 0.5338, + "step": 4126 + }, + { + "epoch": 0.5022208700943109, + "grad_norm": 0.8658193349838257, + "learning_rate": 1.737879237100398e-05, + "loss": 0.4616, + "step": 4127 + }, + { + "epoch": 0.502342561606328, + "grad_norm": 1.5830113887786865, + "learning_rate": 1.7377475643350484e-05, + "loss": 0.5055, + "step": 4128 + }, + { + "epoch": 0.502464253118345, + "grad_norm": 1.2154951095581055, + "learning_rate": 1.737615863496877e-05, + "loss": 0.4631, + "step": 4129 + }, + { + "epoch": 0.502585944630362, + "grad_norm": 0.8518238067626953, + "learning_rate": 1.737484134590895e-05, + "loss": 0.4541, + "step": 4130 + }, + { + "epoch": 0.5027076361423791, + "grad_norm": 3.4788103103637695, + "learning_rate": 1.7373523776221154e-05, + "loss": 0.5365, + "step": 4131 + }, + { + "epoch": 0.5028293276543961, + "grad_norm": 2.623056411743164, + "learning_rate": 1.7372205925955513e-05, + "loss": 0.4676, + "step": 4132 + }, + { + "epoch": 0.5029510191664132, + "grad_norm": 0.6838477253913879, + "learning_rate": 1.737088779516218e-05, + "loss": 0.4693, + "step": 4133 + }, + { + "epoch": 0.5030727106784302, + "grad_norm": 2.127175807952881, + "learning_rate": 1.7369569383891306e-05, + "loss": 0.4386, + "step": 4134 + }, + { + "epoch": 0.5031944021904472, + "grad_norm": 2.199289321899414, + "learning_rate": 1.7368250692193066e-05, + "loss": 0.4609, + "step": 4135 + }, + { + "epoch": 0.5033160937024642, + "grad_norm": 3.731417417526245, + "learning_rate": 1.7366931720117633e-05, + "loss": 0.4096, + "step": 4136 + }, + { + "epoch": 0.5034377852144812, + "grad_norm": 0.7778638005256653, + "learning_rate": 1.7365612467715203e-05, + "loss": 0.4376, + "step": 4137 + }, + { + "epoch": 0.5035594767264984, + "grad_norm": 1.3162226676940918, + "learning_rate": 1.7364292935035968e-05, + "loss": 0.4524, + "step": 4138 + }, + { + "epoch": 0.5036811682385154, + "grad_norm": 3.344547748565674, + "learning_rate": 1.7362973122130146e-05, + "loss": 0.4787, + "step": 4139 + }, + { + "epoch": 0.5038028597505324, + "grad_norm": 3.077211618423462, + "learning_rate": 1.7361653029047956e-05, + "loss": 0.4852, + "step": 4140 + }, + { + "epoch": 0.5039245512625494, + "grad_norm": 2.960585117340088, + "learning_rate": 1.7360332655839635e-05, + "loss": 0.4577, + "step": 4141 + }, + { + "epoch": 0.5040462427745664, + "grad_norm": 4.019593715667725, + "learning_rate": 1.7359012002555414e-05, + "loss": 0.526, + "step": 4142 + }, + { + "epoch": 0.5041679342865835, + "grad_norm": 0.6087198853492737, + "learning_rate": 1.7357691069245554e-05, + "loss": 0.4062, + "step": 4143 + }, + { + "epoch": 0.5042896257986006, + "grad_norm": 1.8577182292938232, + "learning_rate": 1.7356369855960323e-05, + "loss": 0.4851, + "step": 4144 + }, + { + "epoch": 0.5044113173106176, + "grad_norm": 1.0888867378234863, + "learning_rate": 1.7355048362749995e-05, + "loss": 0.423, + "step": 4145 + }, + { + "epoch": 0.5045330088226346, + "grad_norm": 1.065039873123169, + "learning_rate": 1.7353726589664847e-05, + "loss": 0.4904, + "step": 4146 + }, + { + "epoch": 0.5046547003346517, + "grad_norm": 2.658245086669922, + "learning_rate": 1.7352404536755185e-05, + "loss": 0.4414, + "step": 4147 + }, + { + "epoch": 0.5047763918466687, + "grad_norm": 0.7043006420135498, + "learning_rate": 1.735108220407131e-05, + "loss": 0.5126, + "step": 4148 + }, + { + "epoch": 0.5048980833586857, + "grad_norm": 2.154768943786621, + "learning_rate": 1.7349759591663538e-05, + "loss": 0.5068, + "step": 4149 + }, + { + "epoch": 0.5050197748707028, + "grad_norm": 2.659702777862549, + "learning_rate": 1.7348436699582204e-05, + "loss": 0.453, + "step": 4150 + }, + { + "epoch": 0.5051414663827198, + "grad_norm": 2.186054229736328, + "learning_rate": 1.7347113527877642e-05, + "loss": 0.5176, + "step": 4151 + }, + { + "epoch": 0.5052631578947369, + "grad_norm": 2.088869571685791, + "learning_rate": 1.7345790076600203e-05, + "loss": 0.542, + "step": 4152 + }, + { + "epoch": 0.5053848494067539, + "grad_norm": 3.6443979740142822, + "learning_rate": 1.7344466345800245e-05, + "loss": 0.457, + "step": 4153 + }, + { + "epoch": 0.5055065409187709, + "grad_norm": 2.7742044925689697, + "learning_rate": 1.7343142335528146e-05, + "loss": 0.4461, + "step": 4154 + }, + { + "epoch": 0.5056282324307879, + "grad_norm": 0.9349391460418701, + "learning_rate": 1.7341818045834274e-05, + "loss": 0.5243, + "step": 4155 + }, + { + "epoch": 0.5057499239428049, + "grad_norm": 1.9604241847991943, + "learning_rate": 1.7340493476769034e-05, + "loss": 0.4679, + "step": 4156 + }, + { + "epoch": 0.5058716154548221, + "grad_norm": 3.2405283451080322, + "learning_rate": 1.733916862838282e-05, + "loss": 0.4052, + "step": 4157 + }, + { + "epoch": 0.5059933069668391, + "grad_norm": 0.9567452073097229, + "learning_rate": 1.733784350072605e-05, + "loss": 0.4537, + "step": 4158 + }, + { + "epoch": 0.5061149984788561, + "grad_norm": 1.1237132549285889, + "learning_rate": 1.7336518093849145e-05, + "loss": 0.4919, + "step": 4159 + }, + { + "epoch": 0.5062366899908731, + "grad_norm": 1.4495265483856201, + "learning_rate": 1.7335192407802543e-05, + "loss": 0.472, + "step": 4160 + }, + { + "epoch": 0.5063583815028901, + "grad_norm": 0.9601306319236755, + "learning_rate": 1.7333866442636688e-05, + "loss": 0.4649, + "step": 4161 + }, + { + "epoch": 0.5064800730149072, + "grad_norm": 0.9483425617218018, + "learning_rate": 1.733254019840203e-05, + "loss": 0.4897, + "step": 4162 + }, + { + "epoch": 0.5066017645269243, + "grad_norm": 4.501932621002197, + "learning_rate": 1.733121367514904e-05, + "loss": 0.401, + "step": 4163 + }, + { + "epoch": 0.5067234560389413, + "grad_norm": 0.9702973365783691, + "learning_rate": 1.7329886872928198e-05, + "loss": 0.4985, + "step": 4164 + }, + { + "epoch": 0.5068451475509583, + "grad_norm": 0.978947639465332, + "learning_rate": 1.732855979178999e-05, + "loss": 0.4877, + "step": 4165 + }, + { + "epoch": 0.5069668390629753, + "grad_norm": 2.289695978164673, + "learning_rate": 1.7327232431784908e-05, + "loss": 0.4432, + "step": 4166 + }, + { + "epoch": 0.5070885305749924, + "grad_norm": 1.8299050331115723, + "learning_rate": 1.732590479296347e-05, + "loss": 0.4472, + "step": 4167 + }, + { + "epoch": 0.5072102220870094, + "grad_norm": 0.9049208760261536, + "learning_rate": 1.7324576875376185e-05, + "loss": 0.4436, + "step": 4168 + }, + { + "epoch": 0.5073319135990265, + "grad_norm": 0.6118660569190979, + "learning_rate": 1.7323248679073597e-05, + "loss": 0.4401, + "step": 4169 + }, + { + "epoch": 0.5074536051110435, + "grad_norm": 1.1906383037567139, + "learning_rate": 1.732192020410623e-05, + "loss": 0.4581, + "step": 4170 + }, + { + "epoch": 0.5075752966230606, + "grad_norm": 1.0905030965805054, + "learning_rate": 1.7320591450524648e-05, + "loss": 0.4124, + "step": 4171 + }, + { + "epoch": 0.5076969881350776, + "grad_norm": 2.1233954429626465, + "learning_rate": 1.731926241837941e-05, + "loss": 0.4598, + "step": 4172 + }, + { + "epoch": 0.5078186796470946, + "grad_norm": 1.2013047933578491, + "learning_rate": 1.7317933107721086e-05, + "loss": 0.4087, + "step": 4173 + }, + { + "epoch": 0.5079403711591116, + "grad_norm": 2.3826663494110107, + "learning_rate": 1.731660351860026e-05, + "loss": 0.4887, + "step": 4174 + }, + { + "epoch": 0.5080620626711286, + "grad_norm": 1.9369261264801025, + "learning_rate": 1.7315273651067524e-05, + "loss": 0.4968, + "step": 4175 + }, + { + "epoch": 0.5081837541831458, + "grad_norm": 0.9575729370117188, + "learning_rate": 1.7313943505173483e-05, + "loss": 0.4478, + "step": 4176 + }, + { + "epoch": 0.5083054456951628, + "grad_norm": 0.9614267349243164, + "learning_rate": 1.7312613080968753e-05, + "loss": 0.4365, + "step": 4177 + }, + { + "epoch": 0.5084271372071798, + "grad_norm": 3.17458438873291, + "learning_rate": 1.731128237850396e-05, + "loss": 0.4386, + "step": 4178 + }, + { + "epoch": 0.5085488287191968, + "grad_norm": 5.5925211906433105, + "learning_rate": 1.730995139782974e-05, + "loss": 0.4063, + "step": 4179 + }, + { + "epoch": 0.5086705202312138, + "grad_norm": 1.0321651697158813, + "learning_rate": 1.7308620138996738e-05, + "loss": 0.4571, + "step": 4180 + }, + { + "epoch": 0.5087922117432309, + "grad_norm": 2.9922969341278076, + "learning_rate": 1.730728860205561e-05, + "loss": 0.4258, + "step": 4181 + }, + { + "epoch": 0.508913903255248, + "grad_norm": 2.339219093322754, + "learning_rate": 1.7305956787057024e-05, + "loss": 0.5205, + "step": 4182 + }, + { + "epoch": 0.509035594767265, + "grad_norm": 0.655060887336731, + "learning_rate": 1.7304624694051668e-05, + "loss": 0.4319, + "step": 4183 + }, + { + "epoch": 0.509157286279282, + "grad_norm": 0.9896650910377502, + "learning_rate": 1.7303292323090213e-05, + "loss": 0.4283, + "step": 4184 + }, + { + "epoch": 0.509278977791299, + "grad_norm": 4.292147159576416, + "learning_rate": 1.730195967422337e-05, + "loss": 0.5668, + "step": 4185 + }, + { + "epoch": 0.5094006693033161, + "grad_norm": 2.3417439460754395, + "learning_rate": 1.7300626747501852e-05, + "loss": 0.4874, + "step": 4186 + }, + { + "epoch": 0.5095223608153331, + "grad_norm": 0.5930525660514832, + "learning_rate": 1.7299293542976373e-05, + "loss": 0.464, + "step": 4187 + }, + { + "epoch": 0.5096440523273502, + "grad_norm": 2.3383800983428955, + "learning_rate": 1.7297960060697666e-05, + "loss": 0.5812, + "step": 4188 + }, + { + "epoch": 0.5097657438393672, + "grad_norm": 1.579831838607788, + "learning_rate": 1.7296626300716467e-05, + "loss": 0.5273, + "step": 4189 + }, + { + "epoch": 0.5098874353513843, + "grad_norm": 1.0200680494308472, + "learning_rate": 1.7295292263083543e-05, + "loss": 0.5626, + "step": 4190 + }, + { + "epoch": 0.5100091268634013, + "grad_norm": 1.3273138999938965, + "learning_rate": 1.729395794784964e-05, + "loss": 0.5056, + "step": 4191 + }, + { + "epoch": 0.5101308183754183, + "grad_norm": 6.612588882446289, + "learning_rate": 1.7292623355065546e-05, + "loss": 0.4488, + "step": 4192 + }, + { + "epoch": 0.5102525098874353, + "grad_norm": 5.209959506988525, + "learning_rate": 1.7291288484782037e-05, + "loss": 0.4682, + "step": 4193 + }, + { + "epoch": 0.5103742013994524, + "grad_norm": 2.4689810276031494, + "learning_rate": 1.7289953337049907e-05, + "loss": 0.4987, + "step": 4194 + }, + { + "epoch": 0.5104958929114695, + "grad_norm": 3.125850200653076, + "learning_rate": 1.7288617911919963e-05, + "loss": 0.4687, + "step": 4195 + }, + { + "epoch": 0.5106175844234865, + "grad_norm": 0.8129388093948364, + "learning_rate": 1.7287282209443022e-05, + "loss": 0.4901, + "step": 4196 + }, + { + "epoch": 0.5107392759355035, + "grad_norm": 2.6493606567382812, + "learning_rate": 1.7285946229669906e-05, + "loss": 0.4648, + "step": 4197 + }, + { + "epoch": 0.5108609674475205, + "grad_norm": 1.5875645875930786, + "learning_rate": 1.728460997265146e-05, + "loss": 0.4635, + "step": 4198 + }, + { + "epoch": 0.5109826589595375, + "grad_norm": 1.5532900094985962, + "learning_rate": 1.7283273438438525e-05, + "loss": 0.4109, + "step": 4199 + }, + { + "epoch": 0.5111043504715546, + "grad_norm": 1.8107355833053589, + "learning_rate": 1.7281936627081957e-05, + "loss": 0.4249, + "step": 4200 + }, + { + "epoch": 0.5112260419835717, + "grad_norm": 4.1558942794799805, + "learning_rate": 1.728059953863263e-05, + "loss": 0.5209, + "step": 4201 + }, + { + "epoch": 0.5113477334955887, + "grad_norm": 0.5414964556694031, + "learning_rate": 1.727926217314142e-05, + "loss": 0.3922, + "step": 4202 + }, + { + "epoch": 0.5114694250076057, + "grad_norm": 0.9969562292098999, + "learning_rate": 1.7277924530659218e-05, + "loss": 0.4593, + "step": 4203 + }, + { + "epoch": 0.5115911165196227, + "grad_norm": 3.9075279235839844, + "learning_rate": 1.7276586611236923e-05, + "loss": 0.5432, + "step": 4204 + }, + { + "epoch": 0.5117128080316398, + "grad_norm": 2.7518815994262695, + "learning_rate": 1.7275248414925443e-05, + "loss": 0.5094, + "step": 4205 + }, + { + "epoch": 0.5118344995436568, + "grad_norm": 1.7149337530136108, + "learning_rate": 1.7273909941775705e-05, + "loss": 0.5003, + "step": 4206 + }, + { + "epoch": 0.5119561910556739, + "grad_norm": 1.7216975688934326, + "learning_rate": 1.7272571191838636e-05, + "loss": 0.4639, + "step": 4207 + }, + { + "epoch": 0.5120778825676909, + "grad_norm": 1.2411049604415894, + "learning_rate": 1.727123216516518e-05, + "loss": 0.4718, + "step": 4208 + }, + { + "epoch": 0.512199574079708, + "grad_norm": 1.085710883140564, + "learning_rate": 1.726989286180629e-05, + "loss": 0.4424, + "step": 4209 + }, + { + "epoch": 0.512321265591725, + "grad_norm": 3.1756680011749268, + "learning_rate": 1.726855328181293e-05, + "loss": 0.4665, + "step": 4210 + }, + { + "epoch": 0.512442957103742, + "grad_norm": 3.1350553035736084, + "learning_rate": 1.7267213425236068e-05, + "loss": 0.4327, + "step": 4211 + }, + { + "epoch": 0.512564648615759, + "grad_norm": 0.7822600603103638, + "learning_rate": 1.7265873292126698e-05, + "loss": 0.4245, + "step": 4212 + }, + { + "epoch": 0.5126863401277761, + "grad_norm": 1.2810771465301514, + "learning_rate": 1.7264532882535806e-05, + "loss": 0.4163, + "step": 4213 + }, + { + "epoch": 0.5128080316397932, + "grad_norm": 1.6901302337646484, + "learning_rate": 1.7263192196514404e-05, + "loss": 0.474, + "step": 4214 + }, + { + "epoch": 0.5129297231518102, + "grad_norm": 2.7114171981811523, + "learning_rate": 1.7261851234113504e-05, + "loss": 0.4763, + "step": 4215 + }, + { + "epoch": 0.5130514146638272, + "grad_norm": 4.1810622215271, + "learning_rate": 1.7260509995384134e-05, + "loss": 0.4644, + "step": 4216 + }, + { + "epoch": 0.5131731061758442, + "grad_norm": 2.705242156982422, + "learning_rate": 1.725916848037733e-05, + "loss": 0.4948, + "step": 4217 + }, + { + "epoch": 0.5132947976878612, + "grad_norm": 1.6102042198181152, + "learning_rate": 1.7257826689144136e-05, + "loss": 0.4279, + "step": 4218 + }, + { + "epoch": 0.5134164891998783, + "grad_norm": 3.9557557106018066, + "learning_rate": 1.7256484621735617e-05, + "loss": 0.5429, + "step": 4219 + }, + { + "epoch": 0.5135381807118954, + "grad_norm": 3.7163450717926025, + "learning_rate": 1.7255142278202837e-05, + "loss": 0.6007, + "step": 4220 + }, + { + "epoch": 0.5136598722239124, + "grad_norm": 1.8740004301071167, + "learning_rate": 1.7253799658596876e-05, + "loss": 0.4645, + "step": 4221 + }, + { + "epoch": 0.5137815637359294, + "grad_norm": 1.6189954280853271, + "learning_rate": 1.7252456762968824e-05, + "loss": 0.4754, + "step": 4222 + }, + { + "epoch": 0.5139032552479464, + "grad_norm": 2.928853988647461, + "learning_rate": 1.725111359136978e-05, + "loss": 0.4769, + "step": 4223 + }, + { + "epoch": 0.5140249467599635, + "grad_norm": 4.390937805175781, + "learning_rate": 1.7249770143850857e-05, + "loss": 0.4878, + "step": 4224 + }, + { + "epoch": 0.5141466382719805, + "grad_norm": 1.4352803230285645, + "learning_rate": 1.724842642046317e-05, + "loss": 0.4702, + "step": 4225 + }, + { + "epoch": 0.5142683297839976, + "grad_norm": 0.9098310470581055, + "learning_rate": 1.7247082421257857e-05, + "loss": 0.5597, + "step": 4226 + }, + { + "epoch": 0.5143900212960146, + "grad_norm": 1.9175128936767578, + "learning_rate": 1.7245738146286057e-05, + "loss": 0.5001, + "step": 4227 + }, + { + "epoch": 0.5145117128080317, + "grad_norm": 0.9920680522918701, + "learning_rate": 1.724439359559892e-05, + "loss": 0.4591, + "step": 4228 + }, + { + "epoch": 0.5146334043200487, + "grad_norm": 2.749835968017578, + "learning_rate": 1.7243048769247613e-05, + "loss": 0.5375, + "step": 4229 + }, + { + "epoch": 0.5147550958320657, + "grad_norm": 1.11312997341156, + "learning_rate": 1.7241703667283312e-05, + "loss": 0.4813, + "step": 4230 + }, + { + "epoch": 0.5148767873440827, + "grad_norm": 1.1330859661102295, + "learning_rate": 1.7240358289757195e-05, + "loss": 0.4896, + "step": 4231 + }, + { + "epoch": 0.5149984788560998, + "grad_norm": 1.01093327999115, + "learning_rate": 1.7239012636720455e-05, + "loss": 0.4682, + "step": 4232 + }, + { + "epoch": 0.5151201703681169, + "grad_norm": 2.669506311416626, + "learning_rate": 1.7237666708224305e-05, + "loss": 0.5285, + "step": 4233 + }, + { + "epoch": 0.5152418618801339, + "grad_norm": 1.8913952112197876, + "learning_rate": 1.723632050431995e-05, + "loss": 0.4625, + "step": 4234 + }, + { + "epoch": 0.5153635533921509, + "grad_norm": 3.2770209312438965, + "learning_rate": 1.7234974025058625e-05, + "loss": 0.4378, + "step": 4235 + }, + { + "epoch": 0.5154852449041679, + "grad_norm": 2.3642210960388184, + "learning_rate": 1.7233627270491563e-05, + "loss": 0.4806, + "step": 4236 + }, + { + "epoch": 0.5156069364161849, + "grad_norm": 0.9507365226745605, + "learning_rate": 1.723228024067001e-05, + "loss": 0.4633, + "step": 4237 + }, + { + "epoch": 0.515728627928202, + "grad_norm": 0.725978672504425, + "learning_rate": 1.723093293564522e-05, + "loss": 0.5193, + "step": 4238 + }, + { + "epoch": 0.5158503194402191, + "grad_norm": 1.2293250560760498, + "learning_rate": 1.7229585355468473e-05, + "loss": 0.4787, + "step": 4239 + }, + { + "epoch": 0.5159720109522361, + "grad_norm": 1.822168231010437, + "learning_rate": 1.722823750019103e-05, + "loss": 0.5467, + "step": 4240 + }, + { + "epoch": 0.5160937024642531, + "grad_norm": 1.5222446918487549, + "learning_rate": 1.7226889369864196e-05, + "loss": 0.5023, + "step": 4241 + }, + { + "epoch": 0.5162153939762701, + "grad_norm": 1.0482224225997925, + "learning_rate": 1.722554096453926e-05, + "loss": 0.4847, + "step": 4242 + }, + { + "epoch": 0.5163370854882872, + "grad_norm": 1.706272840499878, + "learning_rate": 1.7224192284267532e-05, + "loss": 0.5126, + "step": 4243 + }, + { + "epoch": 0.5164587770003042, + "grad_norm": 2.338200330734253, + "learning_rate": 1.7222843329100335e-05, + "loss": 0.512, + "step": 4244 + }, + { + "epoch": 0.5165804685123213, + "grad_norm": 1.325298547744751, + "learning_rate": 1.7221494099089e-05, + "loss": 0.4755, + "step": 4245 + }, + { + "epoch": 0.5167021600243383, + "grad_norm": 1.9629168510437012, + "learning_rate": 1.7220144594284867e-05, + "loss": 0.443, + "step": 4246 + }, + { + "epoch": 0.5168238515363553, + "grad_norm": 1.20656156539917, + "learning_rate": 1.7218794814739286e-05, + "loss": 0.508, + "step": 4247 + }, + { + "epoch": 0.5169455430483724, + "grad_norm": 2.1021134853363037, + "learning_rate": 1.721744476050362e-05, + "loss": 0.4782, + "step": 4248 + }, + { + "epoch": 0.5170672345603894, + "grad_norm": 1.4776021242141724, + "learning_rate": 1.7216094431629243e-05, + "loss": 0.4677, + "step": 4249 + }, + { + "epoch": 0.5171889260724064, + "grad_norm": 3.4578335285186768, + "learning_rate": 1.721474382816754e-05, + "loss": 0.4005, + "step": 4250 + }, + { + "epoch": 0.5173106175844235, + "grad_norm": 0.9584029912948608, + "learning_rate": 1.7213392950169893e-05, + "loss": 0.4322, + "step": 4251 + }, + { + "epoch": 0.5174323090964406, + "grad_norm": 3.120654821395874, + "learning_rate": 1.7212041797687718e-05, + "loss": 0.5056, + "step": 4252 + }, + { + "epoch": 0.5175540006084576, + "grad_norm": 1.416257381439209, + "learning_rate": 1.7210690370772424e-05, + "loss": 0.4517, + "step": 4253 + }, + { + "epoch": 0.5176756921204746, + "grad_norm": 2.2621262073516846, + "learning_rate": 1.7209338669475436e-05, + "loss": 0.4912, + "step": 4254 + }, + { + "epoch": 0.5177973836324916, + "grad_norm": 0.9535350799560547, + "learning_rate": 1.7207986693848188e-05, + "loss": 0.3897, + "step": 4255 + }, + { + "epoch": 0.5179190751445086, + "grad_norm": 2.8512415885925293, + "learning_rate": 1.7206634443942126e-05, + "loss": 0.4744, + "step": 4256 + }, + { + "epoch": 0.5180407666565257, + "grad_norm": 0.6961869597434998, + "learning_rate": 1.7205281919808708e-05, + "loss": 0.4297, + "step": 4257 + }, + { + "epoch": 0.5181624581685428, + "grad_norm": 0.9827209115028381, + "learning_rate": 1.7203929121499398e-05, + "loss": 0.4442, + "step": 4258 + }, + { + "epoch": 0.5182841496805598, + "grad_norm": 2.678150177001953, + "learning_rate": 1.7202576049065672e-05, + "loss": 0.5111, + "step": 4259 + }, + { + "epoch": 0.5184058411925768, + "grad_norm": 2.8638734817504883, + "learning_rate": 1.7201222702559024e-05, + "loss": 0.4201, + "step": 4260 + }, + { + "epoch": 0.5185275327045938, + "grad_norm": 2.1369612216949463, + "learning_rate": 1.7199869082030943e-05, + "loss": 0.5096, + "step": 4261 + }, + { + "epoch": 0.5186492242166109, + "grad_norm": 1.1570193767547607, + "learning_rate": 1.719851518753294e-05, + "loss": 0.4641, + "step": 4262 + }, + { + "epoch": 0.5187709157286279, + "grad_norm": 1.9469950199127197, + "learning_rate": 1.7197161019116536e-05, + "loss": 0.482, + "step": 4263 + }, + { + "epoch": 0.518892607240645, + "grad_norm": 0.6886633038520813, + "learning_rate": 1.7195806576833258e-05, + "loss": 0.4471, + "step": 4264 + }, + { + "epoch": 0.519014298752662, + "grad_norm": 0.8435122966766357, + "learning_rate": 1.7194451860734642e-05, + "loss": 0.4484, + "step": 4265 + }, + { + "epoch": 0.519135990264679, + "grad_norm": 2.783785820007324, + "learning_rate": 1.7193096870872245e-05, + "loss": 0.5064, + "step": 4266 + }, + { + "epoch": 0.5192576817766961, + "grad_norm": 3.0867910385131836, + "learning_rate": 1.7191741607297618e-05, + "loss": 0.551, + "step": 4267 + }, + { + "epoch": 0.5193793732887131, + "grad_norm": 0.8378987312316895, + "learning_rate": 1.7190386070062343e-05, + "loss": 0.4494, + "step": 4268 + }, + { + "epoch": 0.5195010648007301, + "grad_norm": 1.8553073406219482, + "learning_rate": 1.718903025921799e-05, + "loss": 0.5132, + "step": 4269 + }, + { + "epoch": 0.5196227563127472, + "grad_norm": 2.2718989849090576, + "learning_rate": 1.718767417481616e-05, + "loss": 0.5392, + "step": 4270 + }, + { + "epoch": 0.5197444478247643, + "grad_norm": 0.8725932240486145, + "learning_rate": 1.7186317816908446e-05, + "loss": 0.5039, + "step": 4271 + }, + { + "epoch": 0.5198661393367813, + "grad_norm": 1.375636339187622, + "learning_rate": 1.7184961185546462e-05, + "loss": 0.4961, + "step": 4272 + }, + { + "epoch": 0.5199878308487983, + "grad_norm": 0.8475624918937683, + "learning_rate": 1.7183604280781837e-05, + "loss": 0.4866, + "step": 4273 + }, + { + "epoch": 0.5201095223608153, + "grad_norm": 3.4090487957000732, + "learning_rate": 1.7182247102666202e-05, + "loss": 0.4834, + "step": 4274 + }, + { + "epoch": 0.5202312138728323, + "grad_norm": 4.4174323081970215, + "learning_rate": 1.7180889651251195e-05, + "loss": 0.4856, + "step": 4275 + }, + { + "epoch": 0.5203529053848494, + "grad_norm": 3.009592294692993, + "learning_rate": 1.7179531926588472e-05, + "loss": 0.5154, + "step": 4276 + }, + { + "epoch": 0.5204745968968665, + "grad_norm": 3.412261486053467, + "learning_rate": 1.71781739287297e-05, + "loss": 0.4374, + "step": 4277 + }, + { + "epoch": 0.5205962884088835, + "grad_norm": 1.509874701499939, + "learning_rate": 1.717681565772655e-05, + "loss": 0.4908, + "step": 4278 + }, + { + "epoch": 0.5207179799209005, + "grad_norm": 4.0937299728393555, + "learning_rate": 1.717545711363071e-05, + "loss": 0.4304, + "step": 4279 + }, + { + "epoch": 0.5208396714329175, + "grad_norm": 1.2221148014068604, + "learning_rate": 1.7174098296493875e-05, + "loss": 0.4548, + "step": 4280 + }, + { + "epoch": 0.5209613629449346, + "grad_norm": 3.9335718154907227, + "learning_rate": 1.717273920636775e-05, + "loss": 0.5566, + "step": 4281 + }, + { + "epoch": 0.5210830544569516, + "grad_norm": 1.5230729579925537, + "learning_rate": 1.7171379843304053e-05, + "loss": 0.3334, + "step": 4282 + }, + { + "epoch": 0.5212047459689687, + "grad_norm": 5.134873390197754, + "learning_rate": 1.7170020207354507e-05, + "loss": 0.5461, + "step": 4283 + }, + { + "epoch": 0.5213264374809857, + "grad_norm": 2.6143572330474854, + "learning_rate": 1.7168660298570855e-05, + "loss": 0.4184, + "step": 4284 + }, + { + "epoch": 0.5214481289930027, + "grad_norm": 4.081435203552246, + "learning_rate": 1.7167300117004836e-05, + "loss": 0.4742, + "step": 4285 + }, + { + "epoch": 0.5215698205050198, + "grad_norm": 4.400676250457764, + "learning_rate": 1.7165939662708215e-05, + "loss": 0.4768, + "step": 4286 + }, + { + "epoch": 0.5216915120170368, + "grad_norm": 5.75072717666626, + "learning_rate": 1.716457893573275e-05, + "loss": 0.5651, + "step": 4287 + }, + { + "epoch": 0.5218132035290538, + "grad_norm": 2.7458395957946777, + "learning_rate": 1.7163217936130237e-05, + "loss": 0.496, + "step": 4288 + }, + { + "epoch": 0.5219348950410709, + "grad_norm": 1.4024596214294434, + "learning_rate": 1.7161856663952446e-05, + "loss": 0.4286, + "step": 4289 + }, + { + "epoch": 0.522056586553088, + "grad_norm": 1.2655744552612305, + "learning_rate": 1.716049511925119e-05, + "loss": 0.4469, + "step": 4290 + }, + { + "epoch": 0.522178278065105, + "grad_norm": 1.254462480545044, + "learning_rate": 1.715913330207827e-05, + "loss": 0.3975, + "step": 4291 + }, + { + "epoch": 0.522299969577122, + "grad_norm": 5.87849235534668, + "learning_rate": 1.7157771212485514e-05, + "loss": 0.4475, + "step": 4292 + }, + { + "epoch": 0.522421661089139, + "grad_norm": 1.213152289390564, + "learning_rate": 1.715640885052474e-05, + "loss": 0.5433, + "step": 4293 + }, + { + "epoch": 0.522543352601156, + "grad_norm": 1.4798842668533325, + "learning_rate": 1.7155046216247803e-05, + "loss": 0.4764, + "step": 4294 + }, + { + "epoch": 0.5226650441131732, + "grad_norm": 2.092900276184082, + "learning_rate": 1.7153683309706547e-05, + "loss": 0.5102, + "step": 4295 + }, + { + "epoch": 0.5227867356251902, + "grad_norm": 2.112396001815796, + "learning_rate": 1.715232013095283e-05, + "loss": 0.4268, + "step": 4296 + }, + { + "epoch": 0.5229084271372072, + "grad_norm": 0.8308590650558472, + "learning_rate": 1.715095668003853e-05, + "loss": 0.446, + "step": 4297 + }, + { + "epoch": 0.5230301186492242, + "grad_norm": 1.8258624076843262, + "learning_rate": 1.7149592957015528e-05, + "loss": 0.4633, + "step": 4298 + }, + { + "epoch": 0.5231518101612412, + "grad_norm": 2.094789505004883, + "learning_rate": 1.714822896193571e-05, + "loss": 0.4796, + "step": 4299 + }, + { + "epoch": 0.5232735016732583, + "grad_norm": 2.63779878616333, + "learning_rate": 1.7146864694850988e-05, + "loss": 0.5265, + "step": 4300 + }, + { + "epoch": 0.5233951931852753, + "grad_norm": 0.6422003507614136, + "learning_rate": 1.714550015581327e-05, + "loss": 0.4783, + "step": 4301 + }, + { + "epoch": 0.5235168846972924, + "grad_norm": 2.620772123336792, + "learning_rate": 1.7144135344874482e-05, + "loss": 0.4379, + "step": 4302 + }, + { + "epoch": 0.5236385762093094, + "grad_norm": 0.6759090423583984, + "learning_rate": 1.7142770262086555e-05, + "loss": 0.4859, + "step": 4303 + }, + { + "epoch": 0.5237602677213264, + "grad_norm": 0.8766072988510132, + "learning_rate": 1.7141404907501433e-05, + "loss": 0.445, + "step": 4304 + }, + { + "epoch": 0.5238819592333435, + "grad_norm": 3.3468663692474365, + "learning_rate": 1.7140039281171078e-05, + "loss": 0.469, + "step": 4305 + }, + { + "epoch": 0.5240036507453605, + "grad_norm": 1.918367862701416, + "learning_rate": 1.7138673383147444e-05, + "loss": 0.4344, + "step": 4306 + }, + { + "epoch": 0.5241253422573775, + "grad_norm": 0.7115817070007324, + "learning_rate": 1.7137307213482513e-05, + "loss": 0.4826, + "step": 4307 + }, + { + "epoch": 0.5242470337693946, + "grad_norm": 0.6736893653869629, + "learning_rate": 1.713594077222827e-05, + "loss": 0.4772, + "step": 4308 + }, + { + "epoch": 0.5243687252814117, + "grad_norm": 1.109004259109497, + "learning_rate": 1.7134574059436708e-05, + "loss": 0.485, + "step": 4309 + }, + { + "epoch": 0.5244904167934287, + "grad_norm": 0.5608221888542175, + "learning_rate": 1.713320707515984e-05, + "loss": 0.4639, + "step": 4310 + }, + { + "epoch": 0.5246121083054457, + "grad_norm": 3.7170917987823486, + "learning_rate": 1.7131839819449673e-05, + "loss": 0.5261, + "step": 4311 + }, + { + "epoch": 0.5247337998174627, + "grad_norm": 0.5669408440589905, + "learning_rate": 1.713047229235824e-05, + "loss": 0.4629, + "step": 4312 + }, + { + "epoch": 0.5248554913294797, + "grad_norm": 1.5024335384368896, + "learning_rate": 1.712910449393758e-05, + "loss": 0.3937, + "step": 4313 + }, + { + "epoch": 0.5249771828414969, + "grad_norm": 1.9744325876235962, + "learning_rate": 1.7127736424239735e-05, + "loss": 0.5048, + "step": 4314 + }, + { + "epoch": 0.5250988743535139, + "grad_norm": 0.6904813647270203, + "learning_rate": 1.712636808331676e-05, + "loss": 0.4498, + "step": 4315 + }, + { + "epoch": 0.5252205658655309, + "grad_norm": 2.075857162475586, + "learning_rate": 1.712499947122074e-05, + "loss": 0.5032, + "step": 4316 + }, + { + "epoch": 0.5253422573775479, + "grad_norm": 1.9914791584014893, + "learning_rate": 1.7123630588003735e-05, + "loss": 0.5115, + "step": 4317 + }, + { + "epoch": 0.5254639488895649, + "grad_norm": 2.0719830989837646, + "learning_rate": 1.712226143371784e-05, + "loss": 0.4369, + "step": 4318 + }, + { + "epoch": 0.525585640401582, + "grad_norm": 2.386539936065674, + "learning_rate": 1.7120892008415156e-05, + "loss": 0.4711, + "step": 4319 + }, + { + "epoch": 0.525707331913599, + "grad_norm": 0.9945567846298218, + "learning_rate": 1.7119522312147797e-05, + "loss": 0.4932, + "step": 4320 + }, + { + "epoch": 0.5258290234256161, + "grad_norm": 0.8807507157325745, + "learning_rate": 1.7118152344967873e-05, + "loss": 0.4962, + "step": 4321 + }, + { + "epoch": 0.5259507149376331, + "grad_norm": 1.061413288116455, + "learning_rate": 1.7116782106927517e-05, + "loss": 0.4946, + "step": 4322 + }, + { + "epoch": 0.5260724064496501, + "grad_norm": 3.4664292335510254, + "learning_rate": 1.7115411598078874e-05, + "loss": 0.4208, + "step": 4323 + }, + { + "epoch": 0.5261940979616672, + "grad_norm": 2.399772882461548, + "learning_rate": 1.7114040818474095e-05, + "loss": 0.4239, + "step": 4324 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 1.0668370723724365, + "learning_rate": 1.711266976816533e-05, + "loss": 0.5125, + "step": 4325 + }, + { + "epoch": 0.5264374809857012, + "grad_norm": 0.8144875764846802, + "learning_rate": 1.711129844720476e-05, + "loss": 0.4769, + "step": 4326 + }, + { + "epoch": 0.5265591724977183, + "grad_norm": 0.6472660899162292, + "learning_rate": 1.7109926855644567e-05, + "loss": 0.4858, + "step": 4327 + }, + { + "epoch": 0.5266808640097353, + "grad_norm": 2.5389180183410645, + "learning_rate": 1.710855499353694e-05, + "loss": 0.4952, + "step": 4328 + }, + { + "epoch": 0.5268025555217524, + "grad_norm": 0.6194647550582886, + "learning_rate": 1.710718286093408e-05, + "loss": 0.4715, + "step": 4329 + }, + { + "epoch": 0.5269242470337694, + "grad_norm": 1.4323644638061523, + "learning_rate": 1.7105810457888202e-05, + "loss": 0.4727, + "step": 4330 + }, + { + "epoch": 0.5270459385457864, + "grad_norm": 1.7734172344207764, + "learning_rate": 1.7104437784451527e-05, + "loss": 0.4946, + "step": 4331 + }, + { + "epoch": 0.5271676300578034, + "grad_norm": 1.1468697786331177, + "learning_rate": 1.7103064840676287e-05, + "loss": 0.4526, + "step": 4332 + }, + { + "epoch": 0.5272893215698206, + "grad_norm": 2.776639461517334, + "learning_rate": 1.7101691626614735e-05, + "loss": 0.4451, + "step": 4333 + }, + { + "epoch": 0.5274110130818376, + "grad_norm": 1.2381566762924194, + "learning_rate": 1.710031814231911e-05, + "loss": 0.4351, + "step": 4334 + }, + { + "epoch": 0.5275327045938546, + "grad_norm": 1.198959231376648, + "learning_rate": 1.7098944387841686e-05, + "loss": 0.4433, + "step": 4335 + }, + { + "epoch": 0.5276543961058716, + "grad_norm": 1.1127218008041382, + "learning_rate": 1.709757036323473e-05, + "loss": 0.4534, + "step": 4336 + }, + { + "epoch": 0.5277760876178886, + "grad_norm": 2.3528897762298584, + "learning_rate": 1.7096196068550535e-05, + "loss": 0.5155, + "step": 4337 + }, + { + "epoch": 0.5278977791299057, + "grad_norm": 1.0404629707336426, + "learning_rate": 1.709482150384139e-05, + "loss": 0.49, + "step": 4338 + }, + { + "epoch": 0.5280194706419227, + "grad_norm": 0.5362629294395447, + "learning_rate": 1.70934466691596e-05, + "loss": 0.4199, + "step": 4339 + }, + { + "epoch": 0.5281411621539398, + "grad_norm": 1.6750553846359253, + "learning_rate": 1.709207156455748e-05, + "loss": 0.4848, + "step": 4340 + }, + { + "epoch": 0.5282628536659568, + "grad_norm": 3.2698941230773926, + "learning_rate": 1.7090696190087358e-05, + "loss": 0.5596, + "step": 4341 + }, + { + "epoch": 0.5283845451779738, + "grad_norm": 1.4459127187728882, + "learning_rate": 1.7089320545801574e-05, + "loss": 0.4529, + "step": 4342 + }, + { + "epoch": 0.5285062366899909, + "grad_norm": 1.2845115661621094, + "learning_rate": 1.7087944631752466e-05, + "loss": 0.4779, + "step": 4343 + }, + { + "epoch": 0.5286279282020079, + "grad_norm": 0.7503172755241394, + "learning_rate": 1.708656844799239e-05, + "loss": 0.5371, + "step": 4344 + }, + { + "epoch": 0.5287496197140249, + "grad_norm": 2.6030914783477783, + "learning_rate": 1.708519199457372e-05, + "loss": 0.4416, + "step": 4345 + }, + { + "epoch": 0.528871311226042, + "grad_norm": 2.975177526473999, + "learning_rate": 1.708381527154883e-05, + "loss": 0.5184, + "step": 4346 + }, + { + "epoch": 0.528993002738059, + "grad_norm": 1.4393500089645386, + "learning_rate": 1.7082438278970103e-05, + "loss": 0.4357, + "step": 4347 + }, + { + "epoch": 0.5291146942500761, + "grad_norm": 1.8966056108474731, + "learning_rate": 1.708106101688994e-05, + "loss": 0.4358, + "step": 4348 + }, + { + "epoch": 0.5292363857620931, + "grad_norm": 2.0543057918548584, + "learning_rate": 1.7079683485360753e-05, + "loss": 0.5132, + "step": 4349 + }, + { + "epoch": 0.5293580772741101, + "grad_norm": 3.377936363220215, + "learning_rate": 1.707830568443495e-05, + "loss": 0.5183, + "step": 4350 + }, + { + "epoch": 0.5294797687861271, + "grad_norm": 0.7073395848274231, + "learning_rate": 1.707692761416497e-05, + "loss": 0.4372, + "step": 4351 + }, + { + "epoch": 0.5296014602981443, + "grad_norm": 0.6043311953544617, + "learning_rate": 1.707554927460324e-05, + "loss": 0.4346, + "step": 4352 + }, + { + "epoch": 0.5297231518101613, + "grad_norm": 4.110519886016846, + "learning_rate": 1.7074170665802213e-05, + "loss": 0.5369, + "step": 4353 + }, + { + "epoch": 0.5298448433221783, + "grad_norm": 0.6491762399673462, + "learning_rate": 1.707279178781436e-05, + "loss": 0.4788, + "step": 4354 + }, + { + "epoch": 0.5299665348341953, + "grad_norm": 0.9395544528961182, + "learning_rate": 1.707141264069213e-05, + "loss": 0.4782, + "step": 4355 + }, + { + "epoch": 0.5300882263462123, + "grad_norm": 2.036043882369995, + "learning_rate": 1.7070033224488015e-05, + "loss": 0.4958, + "step": 4356 + }, + { + "epoch": 0.5302099178582294, + "grad_norm": 0.676498293876648, + "learning_rate": 1.70686535392545e-05, + "loss": 0.5, + "step": 4357 + }, + { + "epoch": 0.5303316093702464, + "grad_norm": 3.6512396335601807, + "learning_rate": 1.706727358504409e-05, + "loss": 0.427, + "step": 4358 + }, + { + "epoch": 0.5304533008822635, + "grad_norm": 2.562626600265503, + "learning_rate": 1.7065893361909287e-05, + "loss": 0.4477, + "step": 4359 + }, + { + "epoch": 0.5305749923942805, + "grad_norm": 0.6620128154754639, + "learning_rate": 1.706451286990262e-05, + "loss": 0.5259, + "step": 4360 + }, + { + "epoch": 0.5306966839062975, + "grad_norm": 1.16592538356781, + "learning_rate": 1.7063132109076614e-05, + "loss": 0.4656, + "step": 4361 + }, + { + "epoch": 0.5308183754183146, + "grad_norm": 2.391920566558838, + "learning_rate": 1.706175107948381e-05, + "loss": 0.4522, + "step": 4362 + }, + { + "epoch": 0.5309400669303316, + "grad_norm": 2.8118526935577393, + "learning_rate": 1.706036978117676e-05, + "loss": 0.3875, + "step": 4363 + }, + { + "epoch": 0.5310617584423486, + "grad_norm": 4.067058563232422, + "learning_rate": 1.705898821420803e-05, + "loss": 0.5152, + "step": 4364 + }, + { + "epoch": 0.5311834499543657, + "grad_norm": 0.6446700096130371, + "learning_rate": 1.7057606378630186e-05, + "loss": 0.4365, + "step": 4365 + }, + { + "epoch": 0.5313051414663827, + "grad_norm": 3.115743637084961, + "learning_rate": 1.7056224274495806e-05, + "loss": 0.5132, + "step": 4366 + }, + { + "epoch": 0.5314268329783998, + "grad_norm": 1.6996607780456543, + "learning_rate": 1.7054841901857494e-05, + "loss": 0.4611, + "step": 4367 + }, + { + "epoch": 0.5315485244904168, + "grad_norm": 4.770429611206055, + "learning_rate": 1.7053459260767837e-05, + "loss": 0.5108, + "step": 4368 + }, + { + "epoch": 0.5316702160024338, + "grad_norm": 1.3477798700332642, + "learning_rate": 1.705207635127946e-05, + "loss": 0.464, + "step": 4369 + }, + { + "epoch": 0.5317919075144508, + "grad_norm": 1.077989101409912, + "learning_rate": 1.7050693173444982e-05, + "loss": 0.4584, + "step": 4370 + }, + { + "epoch": 0.531913599026468, + "grad_norm": 3.331965446472168, + "learning_rate": 1.7049309727317035e-05, + "loss": 0.5296, + "step": 4371 + }, + { + "epoch": 0.532035290538485, + "grad_norm": 1.5123640298843384, + "learning_rate": 1.7047926012948255e-05, + "loss": 0.4677, + "step": 4372 + }, + { + "epoch": 0.532156982050502, + "grad_norm": 3.858304262161255, + "learning_rate": 1.7046542030391307e-05, + "loss": 0.4444, + "step": 4373 + }, + { + "epoch": 0.532278673562519, + "grad_norm": 2.681751012802124, + "learning_rate": 1.704515777969885e-05, + "loss": 0.5066, + "step": 4374 + }, + { + "epoch": 0.532400365074536, + "grad_norm": 4.16647481918335, + "learning_rate": 1.7043773260923557e-05, + "loss": 0.4516, + "step": 4375 + }, + { + "epoch": 0.532522056586553, + "grad_norm": 2.4314897060394287, + "learning_rate": 1.704238847411811e-05, + "loss": 0.4653, + "step": 4376 + }, + { + "epoch": 0.5326437480985702, + "grad_norm": 0.959051251411438, + "learning_rate": 1.7041003419335204e-05, + "loss": 0.4537, + "step": 4377 + }, + { + "epoch": 0.5327654396105872, + "grad_norm": 2.0506279468536377, + "learning_rate": 1.7039618096627545e-05, + "loss": 0.4241, + "step": 4378 + }, + { + "epoch": 0.5328871311226042, + "grad_norm": 0.9842852354049683, + "learning_rate": 1.7038232506047844e-05, + "loss": 0.4856, + "step": 4379 + }, + { + "epoch": 0.5330088226346212, + "grad_norm": 1.5740323066711426, + "learning_rate": 1.7036846647648827e-05, + "loss": 0.4695, + "step": 4380 + }, + { + "epoch": 0.5331305141466383, + "grad_norm": 3.1457581520080566, + "learning_rate": 1.703546052148323e-05, + "loss": 0.5303, + "step": 4381 + }, + { + "epoch": 0.5332522056586553, + "grad_norm": 4.844799518585205, + "learning_rate": 1.7034074127603805e-05, + "loss": 0.5308, + "step": 4382 + }, + { + "epoch": 0.5333738971706723, + "grad_norm": 1.412689208984375, + "learning_rate": 1.7032687466063292e-05, + "loss": 0.4884, + "step": 4383 + }, + { + "epoch": 0.5334955886826894, + "grad_norm": 0.8175567984580994, + "learning_rate": 1.7031300536914467e-05, + "loss": 0.4566, + "step": 4384 + }, + { + "epoch": 0.5336172801947064, + "grad_norm": 2.5051369667053223, + "learning_rate": 1.70299133402101e-05, + "loss": 0.5196, + "step": 4385 + }, + { + "epoch": 0.5337389717067235, + "grad_norm": 4.426215648651123, + "learning_rate": 1.702852587600298e-05, + "loss": 0.4202, + "step": 4386 + }, + { + "epoch": 0.5338606632187405, + "grad_norm": 0.9319576025009155, + "learning_rate": 1.7027138144345906e-05, + "loss": 0.486, + "step": 4387 + }, + { + "epoch": 0.5339823547307575, + "grad_norm": 3.2823240756988525, + "learning_rate": 1.702575014529168e-05, + "loss": 0.4387, + "step": 4388 + }, + { + "epoch": 0.5341040462427745, + "grad_norm": 2.318687915802002, + "learning_rate": 1.702436187889311e-05, + "loss": 0.4616, + "step": 4389 + }, + { + "epoch": 0.5342257377547917, + "grad_norm": 1.5169910192489624, + "learning_rate": 1.702297334520304e-05, + "loss": 0.4657, + "step": 4390 + }, + { + "epoch": 0.5343474292668087, + "grad_norm": 0.652877926826477, + "learning_rate": 1.7021584544274294e-05, + "loss": 0.4724, + "step": 4391 + }, + { + "epoch": 0.5344691207788257, + "grad_norm": 0.5499361157417297, + "learning_rate": 1.7020195476159724e-05, + "loss": 0.4621, + "step": 4392 + }, + { + "epoch": 0.5345908122908427, + "grad_norm": 3.1362850666046143, + "learning_rate": 1.701880614091218e-05, + "loss": 0.4163, + "step": 4393 + }, + { + "epoch": 0.5347125038028597, + "grad_norm": 0.5517951846122742, + "learning_rate": 1.701741653858454e-05, + "loss": 0.4089, + "step": 4394 + }, + { + "epoch": 0.5348341953148767, + "grad_norm": 4.176994323730469, + "learning_rate": 1.701602666922967e-05, + "loss": 0.5421, + "step": 4395 + }, + { + "epoch": 0.5349558868268939, + "grad_norm": 2.6072075366973877, + "learning_rate": 1.701463653290047e-05, + "loss": 0.4728, + "step": 4396 + }, + { + "epoch": 0.5350775783389109, + "grad_norm": 1.0330872535705566, + "learning_rate": 1.7013246129649825e-05, + "loss": 0.4258, + "step": 4397 + }, + { + "epoch": 0.5351992698509279, + "grad_norm": 3.934319496154785, + "learning_rate": 1.701185545953065e-05, + "loss": 0.5134, + "step": 4398 + }, + { + "epoch": 0.5353209613629449, + "grad_norm": 1.0257481336593628, + "learning_rate": 1.7010464522595863e-05, + "loss": 0.4314, + "step": 4399 + }, + { + "epoch": 0.535442652874962, + "grad_norm": 3.8877146244049072, + "learning_rate": 1.7009073318898386e-05, + "loss": 0.5703, + "step": 4400 + }, + { + "epoch": 0.535564344386979, + "grad_norm": 2.250124454498291, + "learning_rate": 1.7007681848491163e-05, + "loss": 0.3921, + "step": 4401 + }, + { + "epoch": 0.535686035898996, + "grad_norm": 1.0167269706726074, + "learning_rate": 1.700629011142714e-05, + "loss": 0.4644, + "step": 4402 + }, + { + "epoch": 0.5358077274110131, + "grad_norm": 0.91866135597229, + "learning_rate": 1.7004898107759276e-05, + "loss": 0.4841, + "step": 4403 + }, + { + "epoch": 0.5359294189230301, + "grad_norm": 1.4587697982788086, + "learning_rate": 1.7003505837540538e-05, + "loss": 0.4953, + "step": 4404 + }, + { + "epoch": 0.5360511104350472, + "grad_norm": 0.7243636250495911, + "learning_rate": 1.7002113300823914e-05, + "loss": 0.4664, + "step": 4405 + }, + { + "epoch": 0.5361728019470642, + "grad_norm": 1.1874521970748901, + "learning_rate": 1.7000720497662376e-05, + "loss": 0.4971, + "step": 4406 + }, + { + "epoch": 0.5362944934590812, + "grad_norm": 0.9897105097770691, + "learning_rate": 1.699932742810894e-05, + "loss": 0.4577, + "step": 4407 + }, + { + "epoch": 0.5364161849710982, + "grad_norm": 2.2329607009887695, + "learning_rate": 1.69979340922166e-05, + "loss": 0.4491, + "step": 4408 + }, + { + "epoch": 0.5365378764831154, + "grad_norm": 3.0158960819244385, + "learning_rate": 1.6996540490038387e-05, + "loss": 0.4425, + "step": 4409 + }, + { + "epoch": 0.5366595679951324, + "grad_norm": 1.7176218032836914, + "learning_rate": 1.6995146621627328e-05, + "loss": 0.4632, + "step": 4410 + }, + { + "epoch": 0.5367812595071494, + "grad_norm": 0.8636429905891418, + "learning_rate": 1.699375248703646e-05, + "loss": 0.474, + "step": 4411 + }, + { + "epoch": 0.5369029510191664, + "grad_norm": 1.9928045272827148, + "learning_rate": 1.6992358086318832e-05, + "loss": 0.4679, + "step": 4412 + }, + { + "epoch": 0.5370246425311834, + "grad_norm": 2.9881694316864014, + "learning_rate": 1.6990963419527507e-05, + "loss": 0.4901, + "step": 4413 + }, + { + "epoch": 0.5371463340432004, + "grad_norm": 1.7402344942092896, + "learning_rate": 1.698956848671555e-05, + "loss": 0.4655, + "step": 4414 + }, + { + "epoch": 0.5372680255552176, + "grad_norm": 1.499241590499878, + "learning_rate": 1.6988173287936044e-05, + "loss": 0.4655, + "step": 4415 + }, + { + "epoch": 0.5373897170672346, + "grad_norm": 4.834335803985596, + "learning_rate": 1.6986777823242087e-05, + "loss": 0.5442, + "step": 4416 + }, + { + "epoch": 0.5375114085792516, + "grad_norm": 0.8546738624572754, + "learning_rate": 1.6985382092686766e-05, + "loss": 0.434, + "step": 4417 + }, + { + "epoch": 0.5376331000912686, + "grad_norm": 3.5857698917388916, + "learning_rate": 1.6983986096323198e-05, + "loss": 0.5334, + "step": 4418 + }, + { + "epoch": 0.5377547916032857, + "grad_norm": 4.435749053955078, + "learning_rate": 1.6982589834204507e-05, + "loss": 0.3882, + "step": 4419 + }, + { + "epoch": 0.5378764831153027, + "grad_norm": 4.089552402496338, + "learning_rate": 1.6981193306383815e-05, + "loss": 0.4474, + "step": 4420 + }, + { + "epoch": 0.5379981746273197, + "grad_norm": 1.451317310333252, + "learning_rate": 1.6979796512914268e-05, + "loss": 0.4887, + "step": 4421 + }, + { + "epoch": 0.5381198661393368, + "grad_norm": 3.0299057960510254, + "learning_rate": 1.697839945384902e-05, + "loss": 0.4518, + "step": 4422 + }, + { + "epoch": 0.5382415576513538, + "grad_norm": 2.9058687686920166, + "learning_rate": 1.697700212924122e-05, + "loss": 0.4367, + "step": 4423 + }, + { + "epoch": 0.5383632491633709, + "grad_norm": 0.6400445699691772, + "learning_rate": 1.697560453914406e-05, + "loss": 0.5043, + "step": 4424 + }, + { + "epoch": 0.5384849406753879, + "grad_norm": 1.751715064048767, + "learning_rate": 1.69742066836107e-05, + "loss": 0.4979, + "step": 4425 + }, + { + "epoch": 0.5386066321874049, + "grad_norm": 2.2712137699127197, + "learning_rate": 1.6972808562694337e-05, + "loss": 0.5098, + "step": 4426 + }, + { + "epoch": 0.5387283236994219, + "grad_norm": 0.9613345861434937, + "learning_rate": 1.6971410176448183e-05, + "loss": 0.4499, + "step": 4427 + }, + { + "epoch": 0.538850015211439, + "grad_norm": 0.5806231498718262, + "learning_rate": 1.697001152492544e-05, + "loss": 0.4994, + "step": 4428 + }, + { + "epoch": 0.5389717067234561, + "grad_norm": 1.7357196807861328, + "learning_rate": 1.6968612608179328e-05, + "loss": 0.5158, + "step": 4429 + }, + { + "epoch": 0.5390933982354731, + "grad_norm": 1.5228915214538574, + "learning_rate": 1.6967213426263084e-05, + "loss": 0.4658, + "step": 4430 + }, + { + "epoch": 0.5392150897474901, + "grad_norm": 2.5115373134613037, + "learning_rate": 1.6965813979229947e-05, + "loss": 0.4553, + "step": 4431 + }, + { + "epoch": 0.5393367812595071, + "grad_norm": 0.6168383359909058, + "learning_rate": 1.696441426713317e-05, + "loss": 0.4806, + "step": 4432 + }, + { + "epoch": 0.5394584727715241, + "grad_norm": 1.9714595079421997, + "learning_rate": 1.6963014290026014e-05, + "loss": 0.4487, + "step": 4433 + }, + { + "epoch": 0.5395801642835413, + "grad_norm": 0.7653484344482422, + "learning_rate": 1.6961614047961755e-05, + "loss": 0.4851, + "step": 4434 + }, + { + "epoch": 0.5397018557955583, + "grad_norm": 1.018984317779541, + "learning_rate": 1.6960213540993668e-05, + "loss": 0.445, + "step": 4435 + }, + { + "epoch": 0.5398235473075753, + "grad_norm": 0.766427218914032, + "learning_rate": 1.695881276917505e-05, + "loss": 0.4643, + "step": 4436 + }, + { + "epoch": 0.5399452388195923, + "grad_norm": 2.314392566680908, + "learning_rate": 1.69574117325592e-05, + "loss": 0.507, + "step": 4437 + }, + { + "epoch": 0.5400669303316094, + "grad_norm": 1.6304572820663452, + "learning_rate": 1.6956010431199437e-05, + "loss": 0.488, + "step": 4438 + }, + { + "epoch": 0.5401886218436264, + "grad_norm": 1.6652350425720215, + "learning_rate": 1.6954608865149075e-05, + "loss": 0.4362, + "step": 4439 + }, + { + "epoch": 0.5403103133556434, + "grad_norm": 0.621074914932251, + "learning_rate": 1.6953207034461456e-05, + "loss": 0.4909, + "step": 4440 + }, + { + "epoch": 0.5404320048676605, + "grad_norm": 4.307516098022461, + "learning_rate": 1.6951804939189912e-05, + "loss": 0.3552, + "step": 4441 + }, + { + "epoch": 0.5405536963796775, + "grad_norm": 0.6626152992248535, + "learning_rate": 1.69504025793878e-05, + "loss": 0.4665, + "step": 4442 + }, + { + "epoch": 0.5406753878916946, + "grad_norm": 1.8344053030014038, + "learning_rate": 1.6948999955108484e-05, + "loss": 0.4216, + "step": 4443 + }, + { + "epoch": 0.5407970794037116, + "grad_norm": 2.0432820320129395, + "learning_rate": 1.6947597066405335e-05, + "loss": 0.5277, + "step": 4444 + }, + { + "epoch": 0.5409187709157286, + "grad_norm": 1.7651121616363525, + "learning_rate": 1.6946193913331742e-05, + "loss": 0.4775, + "step": 4445 + }, + { + "epoch": 0.5410404624277456, + "grad_norm": 0.6606485247612, + "learning_rate": 1.6944790495941094e-05, + "loss": 0.451, + "step": 4446 + }, + { + "epoch": 0.5411621539397627, + "grad_norm": 0.6312865018844604, + "learning_rate": 1.6943386814286785e-05, + "loss": 0.4736, + "step": 4447 + }, + { + "epoch": 0.5412838454517798, + "grad_norm": 2.1066336631774902, + "learning_rate": 1.694198286842224e-05, + "loss": 0.5037, + "step": 4448 + }, + { + "epoch": 0.5414055369637968, + "grad_norm": 2.693444013595581, + "learning_rate": 1.6940578658400876e-05, + "loss": 0.4198, + "step": 4449 + }, + { + "epoch": 0.5415272284758138, + "grad_norm": 1.0548605918884277, + "learning_rate": 1.693917418427613e-05, + "loss": 0.4843, + "step": 4450 + }, + { + "epoch": 0.5416489199878308, + "grad_norm": 1.1741557121276855, + "learning_rate": 1.6937769446101443e-05, + "loss": 0.4229, + "step": 4451 + }, + { + "epoch": 0.5417706114998478, + "grad_norm": 1.3165100812911987, + "learning_rate": 1.6936364443930267e-05, + "loss": 0.4929, + "step": 4452 + }, + { + "epoch": 0.541892303011865, + "grad_norm": 1.962891936302185, + "learning_rate": 1.6934959177816062e-05, + "loss": 0.4519, + "step": 4453 + }, + { + "epoch": 0.542013994523882, + "grad_norm": 1.664731502532959, + "learning_rate": 1.693355364781231e-05, + "loss": 0.4852, + "step": 4454 + }, + { + "epoch": 0.542135686035899, + "grad_norm": 1.6235625743865967, + "learning_rate": 1.693214785397249e-05, + "loss": 0.4579, + "step": 4455 + }, + { + "epoch": 0.542257377547916, + "grad_norm": 3.519458532333374, + "learning_rate": 1.6930741796350096e-05, + "loss": 0.5604, + "step": 4456 + }, + { + "epoch": 0.542379069059933, + "grad_norm": 1.6399915218353271, + "learning_rate": 1.692933547499863e-05, + "loss": 0.4062, + "step": 4457 + }, + { + "epoch": 0.5425007605719501, + "grad_norm": 5.24549674987793, + "learning_rate": 1.6927928889971606e-05, + "loss": 0.5516, + "step": 4458 + }, + { + "epoch": 0.5426224520839671, + "grad_norm": 0.7926618456840515, + "learning_rate": 1.692652204132255e-05, + "loss": 0.4773, + "step": 4459 + }, + { + "epoch": 0.5427441435959842, + "grad_norm": 0.8715618848800659, + "learning_rate": 1.692511492910499e-05, + "loss": 0.4178, + "step": 4460 + }, + { + "epoch": 0.5428658351080012, + "grad_norm": 2.7368240356445312, + "learning_rate": 1.6923707553372473e-05, + "loss": 0.5298, + "step": 4461 + }, + { + "epoch": 0.5429875266200183, + "grad_norm": 0.9556664228439331, + "learning_rate": 1.6922299914178556e-05, + "loss": 0.4706, + "step": 4462 + }, + { + "epoch": 0.5431092181320353, + "grad_norm": 1.2215858697891235, + "learning_rate": 1.6920892011576796e-05, + "loss": 0.4695, + "step": 4463 + }, + { + "epoch": 0.5432309096440523, + "grad_norm": 1.4884281158447266, + "learning_rate": 1.691948384562077e-05, + "loss": 0.4724, + "step": 4464 + }, + { + "epoch": 0.5433526011560693, + "grad_norm": 1.1030713319778442, + "learning_rate": 1.6918075416364065e-05, + "loss": 0.4794, + "step": 4465 + }, + { + "epoch": 0.5434742926680864, + "grad_norm": 1.4520702362060547, + "learning_rate": 1.6916666723860263e-05, + "loss": 0.5309, + "step": 4466 + }, + { + "epoch": 0.5435959841801035, + "grad_norm": 2.1015758514404297, + "learning_rate": 1.691525776816298e-05, + "loss": 0.4605, + "step": 4467 + }, + { + "epoch": 0.5437176756921205, + "grad_norm": 1.9199395179748535, + "learning_rate": 1.6913848549325825e-05, + "loss": 0.4908, + "step": 4468 + }, + { + "epoch": 0.5438393672041375, + "grad_norm": 0.8052958250045776, + "learning_rate": 1.691243906740242e-05, + "loss": 0.4665, + "step": 4469 + }, + { + "epoch": 0.5439610587161545, + "grad_norm": 0.5576810836791992, + "learning_rate": 1.6911029322446402e-05, + "loss": 0.4764, + "step": 4470 + }, + { + "epoch": 0.5440827502281715, + "grad_norm": 2.050319194793701, + "learning_rate": 1.6909619314511414e-05, + "loss": 0.4844, + "step": 4471 + }, + { + "epoch": 0.5442044417401887, + "grad_norm": 1.3106358051300049, + "learning_rate": 1.6908209043651107e-05, + "loss": 0.4114, + "step": 4472 + }, + { + "epoch": 0.5443261332522057, + "grad_norm": 1.462104320526123, + "learning_rate": 1.6906798509919147e-05, + "loss": 0.4581, + "step": 4473 + }, + { + "epoch": 0.5444478247642227, + "grad_norm": 0.8785805106163025, + "learning_rate": 1.6905387713369207e-05, + "loss": 0.4826, + "step": 4474 + }, + { + "epoch": 0.5445695162762397, + "grad_norm": 4.6345415115356445, + "learning_rate": 1.6903976654054973e-05, + "loss": 0.5868, + "step": 4475 + }, + { + "epoch": 0.5446912077882567, + "grad_norm": 1.8743029832839966, + "learning_rate": 1.6902565332030136e-05, + "loss": 0.4257, + "step": 4476 + }, + { + "epoch": 0.5448128993002738, + "grad_norm": 0.8281827569007874, + "learning_rate": 1.6901153747348403e-05, + "loss": 0.4734, + "step": 4477 + }, + { + "epoch": 0.5449345908122909, + "grad_norm": 3.947049856185913, + "learning_rate": 1.689974190006348e-05, + "loss": 0.4194, + "step": 4478 + }, + { + "epoch": 0.5450562823243079, + "grad_norm": 2.000213384628296, + "learning_rate": 1.68983297902291e-05, + "loss": 0.4394, + "step": 4479 + }, + { + "epoch": 0.5451779738363249, + "grad_norm": 1.2415028810501099, + "learning_rate": 1.6896917417898992e-05, + "loss": 0.458, + "step": 4480 + }, + { + "epoch": 0.545299665348342, + "grad_norm": 1.306781530380249, + "learning_rate": 1.6895504783126902e-05, + "loss": 0.4265, + "step": 4481 + }, + { + "epoch": 0.545421356860359, + "grad_norm": 1.994301676750183, + "learning_rate": 1.689409188596658e-05, + "loss": 0.467, + "step": 4482 + }, + { + "epoch": 0.545543048372376, + "grad_norm": 2.5383036136627197, + "learning_rate": 1.6892678726471788e-05, + "loss": 0.4743, + "step": 4483 + }, + { + "epoch": 0.545664739884393, + "grad_norm": 2.524864435195923, + "learning_rate": 1.689126530469631e-05, + "loss": 0.5043, + "step": 4484 + }, + { + "epoch": 0.5457864313964101, + "grad_norm": 1.3676741123199463, + "learning_rate": 1.688985162069392e-05, + "loss": 0.5021, + "step": 4485 + }, + { + "epoch": 0.5459081229084272, + "grad_norm": 1.1412726640701294, + "learning_rate": 1.6888437674518418e-05, + "loss": 0.4829, + "step": 4486 + }, + { + "epoch": 0.5460298144204442, + "grad_norm": 0.9708731770515442, + "learning_rate": 1.6887023466223596e-05, + "loss": 0.5063, + "step": 4487 + }, + { + "epoch": 0.5461515059324612, + "grad_norm": 0.9692050814628601, + "learning_rate": 1.6885608995863282e-05, + "loss": 0.4804, + "step": 4488 + }, + { + "epoch": 0.5462731974444782, + "grad_norm": 6.402462482452393, + "learning_rate": 1.6884194263491293e-05, + "loss": 0.4229, + "step": 4489 + }, + { + "epoch": 0.5463948889564952, + "grad_norm": 2.0523343086242676, + "learning_rate": 1.688277926916146e-05, + "loss": 0.4944, + "step": 4490 + }, + { + "epoch": 0.5465165804685124, + "grad_norm": 3.7021658420562744, + "learning_rate": 1.6881364012927633e-05, + "loss": 0.5076, + "step": 4491 + }, + { + "epoch": 0.5466382719805294, + "grad_norm": 2.4698987007141113, + "learning_rate": 1.687994849484366e-05, + "loss": 0.5152, + "step": 4492 + }, + { + "epoch": 0.5467599634925464, + "grad_norm": 3.081721544265747, + "learning_rate": 1.6878532714963406e-05, + "loss": 0.4497, + "step": 4493 + }, + { + "epoch": 0.5468816550045634, + "grad_norm": 1.5112048387527466, + "learning_rate": 1.6877116673340746e-05, + "loss": 0.4707, + "step": 4494 + }, + { + "epoch": 0.5470033465165804, + "grad_norm": 1.6402385234832764, + "learning_rate": 1.687570037002956e-05, + "loss": 0.4772, + "step": 4495 + }, + { + "epoch": 0.5471250380285975, + "grad_norm": 1.9205235242843628, + "learning_rate": 1.687428380508374e-05, + "loss": 0.4888, + "step": 4496 + }, + { + "epoch": 0.5472467295406146, + "grad_norm": 2.1676223278045654, + "learning_rate": 1.6872866978557197e-05, + "loss": 0.4948, + "step": 4497 + }, + { + "epoch": 0.5473684210526316, + "grad_norm": 3.0421810150146484, + "learning_rate": 1.687144989050384e-05, + "loss": 0.4957, + "step": 4498 + }, + { + "epoch": 0.5474901125646486, + "grad_norm": 1.840588927268982, + "learning_rate": 1.687003254097759e-05, + "loss": 0.4732, + "step": 4499 + }, + { + "epoch": 0.5476118040766657, + "grad_norm": 2.079371690750122, + "learning_rate": 1.686861493003238e-05, + "loss": 0.3964, + "step": 4500 + }, + { + "epoch": 0.5477334955886827, + "grad_norm": 0.7197351455688477, + "learning_rate": 1.6867197057722153e-05, + "loss": 0.4992, + "step": 4501 + }, + { + "epoch": 0.5478551871006997, + "grad_norm": 1.077666997909546, + "learning_rate": 1.6865778924100867e-05, + "loss": 0.4579, + "step": 4502 + }, + { + "epoch": 0.5479768786127167, + "grad_norm": 2.053227424621582, + "learning_rate": 1.686436052922248e-05, + "loss": 0.4269, + "step": 4503 + }, + { + "epoch": 0.5480985701247338, + "grad_norm": 0.6475696563720703, + "learning_rate": 1.686294187314097e-05, + "loss": 0.4638, + "step": 4504 + }, + { + "epoch": 0.5482202616367509, + "grad_norm": 1.3565797805786133, + "learning_rate": 1.6861522955910313e-05, + "loss": 0.4694, + "step": 4505 + }, + { + "epoch": 0.5483419531487679, + "grad_norm": 0.9397381544113159, + "learning_rate": 1.6860103777584502e-05, + "loss": 0.4249, + "step": 4506 + }, + { + "epoch": 0.5484636446607849, + "grad_norm": 0.6636385917663574, + "learning_rate": 1.6858684338217547e-05, + "loss": 0.4291, + "step": 4507 + }, + { + "epoch": 0.5485853361728019, + "grad_norm": 5.156121253967285, + "learning_rate": 1.6857264637863457e-05, + "loss": 0.5727, + "step": 4508 + }, + { + "epoch": 0.5487070276848189, + "grad_norm": 1.4113715887069702, + "learning_rate": 1.6855844676576253e-05, + "loss": 0.4652, + "step": 4509 + }, + { + "epoch": 0.5488287191968361, + "grad_norm": 1.1701862812042236, + "learning_rate": 1.6854424454409962e-05, + "loss": 0.4241, + "step": 4510 + }, + { + "epoch": 0.5489504107088531, + "grad_norm": 3.4081249237060547, + "learning_rate": 1.685300397141864e-05, + "loss": 0.5306, + "step": 4511 + }, + { + "epoch": 0.5490721022208701, + "grad_norm": 1.1083649396896362, + "learning_rate": 1.685158322765633e-05, + "loss": 0.4515, + "step": 4512 + }, + { + "epoch": 0.5491937937328871, + "grad_norm": 3.107776641845703, + "learning_rate": 1.6850162223177094e-05, + "loss": 0.5052, + "step": 4513 + }, + { + "epoch": 0.5493154852449041, + "grad_norm": 0.8299750685691833, + "learning_rate": 1.6848740958035012e-05, + "loss": 0.4737, + "step": 4514 + }, + { + "epoch": 0.5494371767569212, + "grad_norm": 5.90954065322876, + "learning_rate": 1.6847319432284155e-05, + "loss": 0.4333, + "step": 4515 + }, + { + "epoch": 0.5495588682689383, + "grad_norm": 4.6237382888793945, + "learning_rate": 1.6845897645978624e-05, + "loss": 0.4629, + "step": 4516 + }, + { + "epoch": 0.5496805597809553, + "grad_norm": 3.8987646102905273, + "learning_rate": 1.6844475599172516e-05, + "loss": 0.443, + "step": 4517 + }, + { + "epoch": 0.5498022512929723, + "grad_norm": 2.2025468349456787, + "learning_rate": 1.684305329191994e-05, + "loss": 0.4827, + "step": 4518 + }, + { + "epoch": 0.5499239428049894, + "grad_norm": 0.7520539164543152, + "learning_rate": 1.6841630724275027e-05, + "loss": 0.462, + "step": 4519 + }, + { + "epoch": 0.5500456343170064, + "grad_norm": 1.3843849897384644, + "learning_rate": 1.6840207896291905e-05, + "loss": 0.4426, + "step": 4520 + }, + { + "epoch": 0.5501673258290234, + "grad_norm": 2.7557339668273926, + "learning_rate": 1.683878480802471e-05, + "loss": 0.4157, + "step": 4521 + }, + { + "epoch": 0.5502890173410404, + "grad_norm": 1.0031745433807373, + "learning_rate": 1.6837361459527597e-05, + "loss": 0.4636, + "step": 4522 + }, + { + "epoch": 0.5504107088530575, + "grad_norm": 1.2858787775039673, + "learning_rate": 1.683593785085473e-05, + "loss": 0.429, + "step": 4523 + }, + { + "epoch": 0.5505324003650746, + "grad_norm": 0.6925680637359619, + "learning_rate": 1.6834513982060277e-05, + "loss": 0.4322, + "step": 4524 + }, + { + "epoch": 0.5506540918770916, + "grad_norm": 1.1897066831588745, + "learning_rate": 1.6833089853198422e-05, + "loss": 0.4261, + "step": 4525 + }, + { + "epoch": 0.5507757833891086, + "grad_norm": 1.5610861778259277, + "learning_rate": 1.6831665464323354e-05, + "loss": 0.4535, + "step": 4526 + }, + { + "epoch": 0.5508974749011256, + "grad_norm": 2.1373817920684814, + "learning_rate": 1.6830240815489274e-05, + "loss": 0.4717, + "step": 4527 + }, + { + "epoch": 0.5510191664131426, + "grad_norm": 1.7551811933517456, + "learning_rate": 1.6828815906750395e-05, + "loss": 0.4387, + "step": 4528 + }, + { + "epoch": 0.5511408579251598, + "grad_norm": 2.961994171142578, + "learning_rate": 1.6827390738160933e-05, + "loss": 0.5014, + "step": 4529 + }, + { + "epoch": 0.5512625494371768, + "grad_norm": 0.7889112830162048, + "learning_rate": 1.682596530977512e-05, + "loss": 0.4805, + "step": 4530 + }, + { + "epoch": 0.5513842409491938, + "grad_norm": 3.972547769546509, + "learning_rate": 1.6824539621647202e-05, + "loss": 0.4624, + "step": 4531 + }, + { + "epoch": 0.5515059324612108, + "grad_norm": 0.8660321235656738, + "learning_rate": 1.682311367383142e-05, + "loss": 0.5218, + "step": 4532 + }, + { + "epoch": 0.5516276239732278, + "grad_norm": 0.7364094853401184, + "learning_rate": 1.6821687466382046e-05, + "loss": 0.5325, + "step": 4533 + }, + { + "epoch": 0.5517493154852449, + "grad_norm": 3.64518404006958, + "learning_rate": 1.682026099935334e-05, + "loss": 0.4127, + "step": 4534 + }, + { + "epoch": 0.551871006997262, + "grad_norm": 1.7240809202194214, + "learning_rate": 1.6818834272799587e-05, + "loss": 0.4547, + "step": 4535 + }, + { + "epoch": 0.551992698509279, + "grad_norm": 1.6545079946517944, + "learning_rate": 1.6817407286775075e-05, + "loss": 0.4767, + "step": 4536 + }, + { + "epoch": 0.552114390021296, + "grad_norm": 2.3876535892486572, + "learning_rate": 1.6815980041334108e-05, + "loss": 0.5022, + "step": 4537 + }, + { + "epoch": 0.552236081533313, + "grad_norm": 1.983043909072876, + "learning_rate": 1.681455253653099e-05, + "loss": 0.451, + "step": 4538 + }, + { + "epoch": 0.5523577730453301, + "grad_norm": 0.5386200547218323, + "learning_rate": 1.681312477242004e-05, + "loss": 0.4298, + "step": 4539 + }, + { + "epoch": 0.5524794645573471, + "grad_norm": 1.1130664348602295, + "learning_rate": 1.6811696749055592e-05, + "loss": 0.3978, + "step": 4540 + }, + { + "epoch": 0.5526011560693641, + "grad_norm": 3.314008951187134, + "learning_rate": 1.681026846649198e-05, + "loss": 0.4727, + "step": 4541 + }, + { + "epoch": 0.5527228475813812, + "grad_norm": 1.0250221490859985, + "learning_rate": 1.6808839924783563e-05, + "loss": 0.4253, + "step": 4542 + }, + { + "epoch": 0.5528445390933983, + "grad_norm": 2.0319528579711914, + "learning_rate": 1.680741112398469e-05, + "loss": 0.4398, + "step": 4543 + }, + { + "epoch": 0.5529662306054153, + "grad_norm": 2.3150103092193604, + "learning_rate": 1.680598206414973e-05, + "loss": 0.4706, + "step": 4544 + }, + { + "epoch": 0.5530879221174323, + "grad_norm": 1.0327575206756592, + "learning_rate": 1.6804552745333073e-05, + "loss": 0.3994, + "step": 4545 + }, + { + "epoch": 0.5532096136294493, + "grad_norm": 1.6495288610458374, + "learning_rate": 1.6803123167589092e-05, + "loss": 0.4733, + "step": 4546 + }, + { + "epoch": 0.5533313051414663, + "grad_norm": 1.3810745477676392, + "learning_rate": 1.6801693330972195e-05, + "loss": 0.5053, + "step": 4547 + }, + { + "epoch": 0.5534529966534835, + "grad_norm": 1.2458540201187134, + "learning_rate": 1.6800263235536787e-05, + "loss": 0.4658, + "step": 4548 + }, + { + "epoch": 0.5535746881655005, + "grad_norm": 3.3685176372528076, + "learning_rate": 1.6798832881337287e-05, + "loss": 0.4695, + "step": 4549 + }, + { + "epoch": 0.5536963796775175, + "grad_norm": 3.4742231369018555, + "learning_rate": 1.6797402268428127e-05, + "loss": 0.5015, + "step": 4550 + }, + { + "epoch": 0.5538180711895345, + "grad_norm": 4.773903846740723, + "learning_rate": 1.679597139686374e-05, + "loss": 0.4695, + "step": 4551 + }, + { + "epoch": 0.5539397627015515, + "grad_norm": 1.949647068977356, + "learning_rate": 1.679454026669857e-05, + "loss": 0.5018, + "step": 4552 + }, + { + "epoch": 0.5540614542135686, + "grad_norm": 2.3221402168273926, + "learning_rate": 1.679310887798708e-05, + "loss": 0.4421, + "step": 4553 + }, + { + "epoch": 0.5541831457255857, + "grad_norm": 2.228231191635132, + "learning_rate": 1.679167723078374e-05, + "loss": 0.4202, + "step": 4554 + }, + { + "epoch": 0.5543048372376027, + "grad_norm": 0.7083761692047119, + "learning_rate": 1.679024532514302e-05, + "loss": 0.4665, + "step": 4555 + }, + { + "epoch": 0.5544265287496197, + "grad_norm": 2.922244071960449, + "learning_rate": 1.6788813161119412e-05, + "loss": 0.5286, + "step": 4556 + }, + { + "epoch": 0.5545482202616367, + "grad_norm": 2.4359800815582275, + "learning_rate": 1.6787380738767408e-05, + "loss": 0.5093, + "step": 4557 + }, + { + "epoch": 0.5546699117736538, + "grad_norm": 4.310858249664307, + "learning_rate": 1.6785948058141523e-05, + "loss": 0.5383, + "step": 4558 + }, + { + "epoch": 0.5547916032856708, + "grad_norm": 1.0454226732254028, + "learning_rate": 1.6784515119296262e-05, + "loss": 0.4027, + "step": 4559 + }, + { + "epoch": 0.5549132947976878, + "grad_norm": 1.8177512884140015, + "learning_rate": 1.6783081922286166e-05, + "loss": 0.4619, + "step": 4560 + }, + { + "epoch": 0.5550349863097049, + "grad_norm": 1.7751100063323975, + "learning_rate": 1.6781648467165755e-05, + "loss": 0.4635, + "step": 4561 + }, + { + "epoch": 0.555156677821722, + "grad_norm": 1.5312920808792114, + "learning_rate": 1.6780214753989587e-05, + "loss": 0.398, + "step": 4562 + }, + { + "epoch": 0.555278369333739, + "grad_norm": 0.9218560457229614, + "learning_rate": 1.6778780782812208e-05, + "loss": 0.4978, + "step": 4563 + }, + { + "epoch": 0.555400060845756, + "grad_norm": 0.9840953350067139, + "learning_rate": 1.6777346553688194e-05, + "loss": 0.4493, + "step": 4564 + }, + { + "epoch": 0.555521752357773, + "grad_norm": 0.9022337198257446, + "learning_rate": 1.6775912066672114e-05, + "loss": 0.4574, + "step": 4565 + }, + { + "epoch": 0.55564344386979, + "grad_norm": 1.912704348564148, + "learning_rate": 1.6774477321818558e-05, + "loss": 0.4326, + "step": 4566 + }, + { + "epoch": 0.5557651353818072, + "grad_norm": 1.1413514614105225, + "learning_rate": 1.6773042319182115e-05, + "loss": 0.4808, + "step": 4567 + }, + { + "epoch": 0.5558868268938242, + "grad_norm": 0.8798618316650391, + "learning_rate": 1.6771607058817388e-05, + "loss": 0.4272, + "step": 4568 + }, + { + "epoch": 0.5560085184058412, + "grad_norm": 1.481116771697998, + "learning_rate": 1.6770171540778998e-05, + "loss": 0.4009, + "step": 4569 + }, + { + "epoch": 0.5561302099178582, + "grad_norm": 1.1077401638031006, + "learning_rate": 1.6768735765121568e-05, + "loss": 0.509, + "step": 4570 + }, + { + "epoch": 0.5562519014298752, + "grad_norm": 2.603334426879883, + "learning_rate": 1.6767299731899735e-05, + "loss": 0.5018, + "step": 4571 + }, + { + "epoch": 0.5563735929418923, + "grad_norm": 0.6485960483551025, + "learning_rate": 1.6765863441168136e-05, + "loss": 0.4543, + "step": 4572 + }, + { + "epoch": 0.5564952844539094, + "grad_norm": 0.6065893769264221, + "learning_rate": 1.6764426892981428e-05, + "loss": 0.4407, + "step": 4573 + }, + { + "epoch": 0.5566169759659264, + "grad_norm": 1.0884718894958496, + "learning_rate": 1.676299008739428e-05, + "loss": 0.449, + "step": 4574 + }, + { + "epoch": 0.5567386674779434, + "grad_norm": 1.9185982942581177, + "learning_rate": 1.6761553024461353e-05, + "loss": 0.5235, + "step": 4575 + }, + { + "epoch": 0.5568603589899604, + "grad_norm": 1.969686508178711, + "learning_rate": 1.6760115704237345e-05, + "loss": 0.4874, + "step": 4576 + }, + { + "epoch": 0.5569820505019775, + "grad_norm": 4.09069299697876, + "learning_rate": 1.6758678126776935e-05, + "loss": 0.4007, + "step": 4577 + }, + { + "epoch": 0.5571037420139945, + "grad_norm": 1.188254714012146, + "learning_rate": 1.675724029213484e-05, + "loss": 0.4849, + "step": 4578 + }, + { + "epoch": 0.5572254335260116, + "grad_norm": 0.7444627285003662, + "learning_rate": 1.6755802200365757e-05, + "loss": 0.4778, + "step": 4579 + }, + { + "epoch": 0.5573471250380286, + "grad_norm": 0.8509902954101562, + "learning_rate": 1.6754363851524423e-05, + "loss": 0.4862, + "step": 4580 + }, + { + "epoch": 0.5574688165500457, + "grad_norm": 1.926151156425476, + "learning_rate": 1.675292524566556e-05, + "loss": 0.3914, + "step": 4581 + }, + { + "epoch": 0.5575905080620627, + "grad_norm": 1.415952205657959, + "learning_rate": 1.675148638284392e-05, + "loss": 0.4093, + "step": 4582 + }, + { + "epoch": 0.5577121995740797, + "grad_norm": 1.0945243835449219, + "learning_rate": 1.675004726311424e-05, + "loss": 0.4246, + "step": 4583 + }, + { + "epoch": 0.5578338910860967, + "grad_norm": 0.7974362373352051, + "learning_rate": 1.6748607886531293e-05, + "loss": 0.4402, + "step": 4584 + }, + { + "epoch": 0.5579555825981137, + "grad_norm": 3.542086124420166, + "learning_rate": 1.6747168253149847e-05, + "loss": 0.4661, + "step": 4585 + }, + { + "epoch": 0.5580772741101309, + "grad_norm": 4.370833873748779, + "learning_rate": 1.6745728363024685e-05, + "loss": 0.5445, + "step": 4586 + }, + { + "epoch": 0.5581989656221479, + "grad_norm": 3.091870069503784, + "learning_rate": 1.6744288216210593e-05, + "loss": 0.4522, + "step": 4587 + }, + { + "epoch": 0.5583206571341649, + "grad_norm": 3.192312002182007, + "learning_rate": 1.6742847812762377e-05, + "loss": 0.4739, + "step": 4588 + }, + { + "epoch": 0.5584423486461819, + "grad_norm": 3.6456682682037354, + "learning_rate": 1.6741407152734844e-05, + "loss": 0.524, + "step": 4589 + }, + { + "epoch": 0.5585640401581989, + "grad_norm": 1.131582498550415, + "learning_rate": 1.6739966236182814e-05, + "loss": 0.42, + "step": 4590 + }, + { + "epoch": 0.558685731670216, + "grad_norm": 2.22727370262146, + "learning_rate": 1.6738525063161117e-05, + "loss": 0.4722, + "step": 4591 + }, + { + "epoch": 0.5588074231822331, + "grad_norm": 0.9118644595146179, + "learning_rate": 1.6737083633724596e-05, + "loss": 0.4698, + "step": 4592 + }, + { + "epoch": 0.5589291146942501, + "grad_norm": 2.725691795349121, + "learning_rate": 1.6735641947928095e-05, + "loss": 0.4214, + "step": 4593 + }, + { + "epoch": 0.5590508062062671, + "grad_norm": 4.871251583099365, + "learning_rate": 1.673420000582648e-05, + "loss": 0.4596, + "step": 4594 + }, + { + "epoch": 0.5591724977182841, + "grad_norm": 3.398688316345215, + "learning_rate": 1.673275780747461e-05, + "loss": 0.4721, + "step": 4595 + }, + { + "epoch": 0.5592941892303012, + "grad_norm": 2.8676795959472656, + "learning_rate": 1.6731315352927375e-05, + "loss": 0.4868, + "step": 4596 + }, + { + "epoch": 0.5594158807423182, + "grad_norm": 3.3091068267822266, + "learning_rate": 1.6729872642239656e-05, + "loss": 0.4924, + "step": 4597 + }, + { + "epoch": 0.5595375722543353, + "grad_norm": 2.1699841022491455, + "learning_rate": 1.672842967546635e-05, + "loss": 0.5075, + "step": 4598 + }, + { + "epoch": 0.5596592637663523, + "grad_norm": 2.9286632537841797, + "learning_rate": 1.6726986452662372e-05, + "loss": 0.4725, + "step": 4599 + }, + { + "epoch": 0.5597809552783694, + "grad_norm": 1.9711849689483643, + "learning_rate": 1.672554297388263e-05, + "loss": 0.4611, + "step": 4600 + }, + { + "epoch": 0.5599026467903864, + "grad_norm": 2.653597831726074, + "learning_rate": 1.6724099239182062e-05, + "loss": 0.5205, + "step": 4601 + }, + { + "epoch": 0.5600243383024034, + "grad_norm": 1.3509598970413208, + "learning_rate": 1.6722655248615597e-05, + "loss": 0.4686, + "step": 4602 + }, + { + "epoch": 0.5601460298144204, + "grad_norm": 0.5680484771728516, + "learning_rate": 1.6721211002238185e-05, + "loss": 0.3957, + "step": 4603 + }, + { + "epoch": 0.5602677213264374, + "grad_norm": 2.985861301422119, + "learning_rate": 1.6719766500104782e-05, + "loss": 0.4969, + "step": 4604 + }, + { + "epoch": 0.5603894128384546, + "grad_norm": 4.2559895515441895, + "learning_rate": 1.6718321742270353e-05, + "loss": 0.5123, + "step": 4605 + }, + { + "epoch": 0.5605111043504716, + "grad_norm": 1.1454652547836304, + "learning_rate": 1.6716876728789882e-05, + "loss": 0.4393, + "step": 4606 + }, + { + "epoch": 0.5606327958624886, + "grad_norm": 2.8724396228790283, + "learning_rate": 1.671543145971834e-05, + "loss": 0.5274, + "step": 4607 + }, + { + "epoch": 0.5607544873745056, + "grad_norm": 0.716998815536499, + "learning_rate": 1.671398593511074e-05, + "loss": 0.4497, + "step": 4608 + }, + { + "epoch": 0.5608761788865226, + "grad_norm": 2.7783238887786865, + "learning_rate": 1.6712540155022072e-05, + "loss": 0.5338, + "step": 4609 + }, + { + "epoch": 0.5609978703985397, + "grad_norm": 1.3436685800552368, + "learning_rate": 1.6711094119507354e-05, + "loss": 0.4717, + "step": 4610 + }, + { + "epoch": 0.5611195619105568, + "grad_norm": 2.0009007453918457, + "learning_rate": 1.6709647828621614e-05, + "loss": 0.5594, + "step": 4611 + }, + { + "epoch": 0.5612412534225738, + "grad_norm": 3.040457248687744, + "learning_rate": 1.6708201282419893e-05, + "loss": 0.4543, + "step": 4612 + }, + { + "epoch": 0.5613629449345908, + "grad_norm": 2.340853691101074, + "learning_rate": 1.6706754480957225e-05, + "loss": 0.5073, + "step": 4613 + }, + { + "epoch": 0.5614846364466078, + "grad_norm": 2.7375338077545166, + "learning_rate": 1.6705307424288666e-05, + "loss": 0.4791, + "step": 4614 + }, + { + "epoch": 0.5616063279586249, + "grad_norm": 5.7058024406433105, + "learning_rate": 1.670386011246928e-05, + "loss": 0.4387, + "step": 4615 + }, + { + "epoch": 0.5617280194706419, + "grad_norm": 2.7753055095672607, + "learning_rate": 1.6702412545554142e-05, + "loss": 0.4808, + "step": 4616 + }, + { + "epoch": 0.561849710982659, + "grad_norm": 1.7488722801208496, + "learning_rate": 1.670096472359833e-05, + "loss": 0.4558, + "step": 4617 + }, + { + "epoch": 0.561971402494676, + "grad_norm": 0.9848224520683289, + "learning_rate": 1.6699516646656945e-05, + "loss": 0.4455, + "step": 4618 + }, + { + "epoch": 0.562093094006693, + "grad_norm": 1.3750373125076294, + "learning_rate": 1.6698068314785078e-05, + "loss": 0.3986, + "step": 4619 + }, + { + "epoch": 0.5622147855187101, + "grad_norm": 1.9118283987045288, + "learning_rate": 1.6696619728037853e-05, + "loss": 0.4601, + "step": 4620 + }, + { + "epoch": 0.5623364770307271, + "grad_norm": 2.759049892425537, + "learning_rate": 1.669517088647038e-05, + "loss": 0.4695, + "step": 4621 + }, + { + "epoch": 0.5624581685427441, + "grad_norm": 4.957734107971191, + "learning_rate": 1.66937217901378e-05, + "loss": 0.5114, + "step": 4622 + }, + { + "epoch": 0.5625798600547611, + "grad_norm": 3.4825074672698975, + "learning_rate": 1.6692272439095256e-05, + "loss": 0.4419, + "step": 4623 + }, + { + "epoch": 0.5627015515667783, + "grad_norm": 2.347583055496216, + "learning_rate": 1.6690822833397885e-05, + "loss": 0.4439, + "step": 4624 + }, + { + "epoch": 0.5628232430787953, + "grad_norm": 2.6734941005706787, + "learning_rate": 1.6689372973100862e-05, + "loss": 0.4732, + "step": 4625 + }, + { + "epoch": 0.5629449345908123, + "grad_norm": 0.9740279316902161, + "learning_rate": 1.668792285825935e-05, + "loss": 0.4302, + "step": 4626 + }, + { + "epoch": 0.5630666261028293, + "grad_norm": 1.0920979976654053, + "learning_rate": 1.6686472488928526e-05, + "loss": 0.4567, + "step": 4627 + }, + { + "epoch": 0.5631883176148463, + "grad_norm": 0.9627112150192261, + "learning_rate": 1.6685021865163587e-05, + "loss": 0.4918, + "step": 4628 + }, + { + "epoch": 0.5633100091268634, + "grad_norm": 1.3993078470230103, + "learning_rate": 1.668357098701973e-05, + "loss": 0.5383, + "step": 4629 + }, + { + "epoch": 0.5634317006388805, + "grad_norm": 1.691230297088623, + "learning_rate": 1.6682119854552156e-05, + "loss": 0.5084, + "step": 4630 + }, + { + "epoch": 0.5635533921508975, + "grad_norm": 3.0796499252319336, + "learning_rate": 1.6680668467816097e-05, + "loss": 0.5028, + "step": 4631 + }, + { + "epoch": 0.5636750836629145, + "grad_norm": 6.316005229949951, + "learning_rate": 1.667921682686677e-05, + "loss": 0.5222, + "step": 4632 + }, + { + "epoch": 0.5637967751749315, + "grad_norm": 5.254156589508057, + "learning_rate": 1.6677764931759423e-05, + "loss": 0.4946, + "step": 4633 + }, + { + "epoch": 0.5639184666869486, + "grad_norm": 5.059786796569824, + "learning_rate": 1.6676312782549292e-05, + "loss": 0.5197, + "step": 4634 + }, + { + "epoch": 0.5640401581989656, + "grad_norm": 6.235612869262695, + "learning_rate": 1.6674860379291646e-05, + "loss": 0.4901, + "step": 4635 + }, + { + "epoch": 0.5641618497109827, + "grad_norm": 5.896116733551025, + "learning_rate": 1.6673407722041744e-05, + "loss": 0.5305, + "step": 4636 + }, + { + "epoch": 0.5642835412229997, + "grad_norm": 4.427737236022949, + "learning_rate": 1.6671954810854865e-05, + "loss": 0.4569, + "step": 4637 + }, + { + "epoch": 0.5644052327350167, + "grad_norm": 3.9069371223449707, + "learning_rate": 1.6670501645786293e-05, + "loss": 0.4572, + "step": 4638 + }, + { + "epoch": 0.5645269242470338, + "grad_norm": 2.219663143157959, + "learning_rate": 1.666904822689133e-05, + "loss": 0.4588, + "step": 4639 + }, + { + "epoch": 0.5646486157590508, + "grad_norm": 2.792325019836426, + "learning_rate": 1.666759455422528e-05, + "loss": 0.399, + "step": 4640 + }, + { + "epoch": 0.5647703072710678, + "grad_norm": 3.0173792839050293, + "learning_rate": 1.666614062784345e-05, + "loss": 0.4876, + "step": 4641 + }, + { + "epoch": 0.5648919987830848, + "grad_norm": 3.6950814723968506, + "learning_rate": 1.6664686447801174e-05, + "loss": 0.5053, + "step": 4642 + }, + { + "epoch": 0.565013690295102, + "grad_norm": 4.100681781768799, + "learning_rate": 1.666323201415378e-05, + "loss": 0.4865, + "step": 4643 + }, + { + "epoch": 0.565135381807119, + "grad_norm": 5.905385971069336, + "learning_rate": 1.666177732695662e-05, + "loss": 0.5211, + "step": 4644 + }, + { + "epoch": 0.565257073319136, + "grad_norm": 7.304001808166504, + "learning_rate": 1.666032238626504e-05, + "loss": 0.6014, + "step": 4645 + }, + { + "epoch": 0.565378764831153, + "grad_norm": 0.6334952116012573, + "learning_rate": 1.665886719213441e-05, + "loss": 0.4107, + "step": 4646 + }, + { + "epoch": 0.56550045634317, + "grad_norm": 2.1371588706970215, + "learning_rate": 1.66574117446201e-05, + "loss": 0.466, + "step": 4647 + }, + { + "epoch": 0.565622147855187, + "grad_norm": 3.159528970718384, + "learning_rate": 1.6655956043777496e-05, + "loss": 0.4985, + "step": 4648 + }, + { + "epoch": 0.5657438393672042, + "grad_norm": 0.7443142533302307, + "learning_rate": 1.665450008966198e-05, + "loss": 0.4486, + "step": 4649 + }, + { + "epoch": 0.5658655308792212, + "grad_norm": 0.5971969962120056, + "learning_rate": 1.6653043882328965e-05, + "loss": 0.4701, + "step": 4650 + }, + { + "epoch": 0.5659872223912382, + "grad_norm": 2.6483771800994873, + "learning_rate": 1.665158742183386e-05, + "loss": 0.4667, + "step": 4651 + }, + { + "epoch": 0.5661089139032552, + "grad_norm": 2.697734832763672, + "learning_rate": 1.6650130708232088e-05, + "loss": 0.4719, + "step": 4652 + }, + { + "epoch": 0.5662306054152723, + "grad_norm": 3.1195905208587646, + "learning_rate": 1.6648673741579075e-05, + "loss": 0.4742, + "step": 4653 + }, + { + "epoch": 0.5663522969272893, + "grad_norm": 1.0706874132156372, + "learning_rate": 1.664721652193026e-05, + "loss": 0.5073, + "step": 4654 + }, + { + "epoch": 0.5664739884393064, + "grad_norm": 2.4989240169525146, + "learning_rate": 1.6645759049341103e-05, + "loss": 0.4466, + "step": 4655 + }, + { + "epoch": 0.5665956799513234, + "grad_norm": 2.144212007522583, + "learning_rate": 1.6644301323867058e-05, + "loss": 0.4208, + "step": 4656 + }, + { + "epoch": 0.5667173714633404, + "grad_norm": 1.0743597745895386, + "learning_rate": 1.664284334556359e-05, + "loss": 0.4878, + "step": 4657 + }, + { + "epoch": 0.5668390629753575, + "grad_norm": 2.698432445526123, + "learning_rate": 1.6641385114486188e-05, + "loss": 0.5191, + "step": 4658 + }, + { + "epoch": 0.5669607544873745, + "grad_norm": 0.7531874179840088, + "learning_rate": 1.6639926630690332e-05, + "loss": 0.4123, + "step": 4659 + }, + { + "epoch": 0.5670824459993915, + "grad_norm": 1.6510300636291504, + "learning_rate": 1.6638467894231527e-05, + "loss": 0.4555, + "step": 4660 + }, + { + "epoch": 0.5672041375114085, + "grad_norm": 2.069141387939453, + "learning_rate": 1.6637008905165274e-05, + "loss": 0.5063, + "step": 4661 + }, + { + "epoch": 0.5673258290234257, + "grad_norm": 0.9788318276405334, + "learning_rate": 1.6635549663547096e-05, + "loss": 0.4381, + "step": 4662 + }, + { + "epoch": 0.5674475205354427, + "grad_norm": 0.9708678126335144, + "learning_rate": 1.663409016943252e-05, + "loss": 0.4658, + "step": 4663 + }, + { + "epoch": 0.5675692120474597, + "grad_norm": 2.4511301517486572, + "learning_rate": 1.663263042287708e-05, + "loss": 0.5153, + "step": 4664 + }, + { + "epoch": 0.5676909035594767, + "grad_norm": 3.0183563232421875, + "learning_rate": 1.6631170423936326e-05, + "loss": 0.483, + "step": 4665 + }, + { + "epoch": 0.5678125950714937, + "grad_norm": 3.073075532913208, + "learning_rate": 1.6629710172665805e-05, + "loss": 0.5383, + "step": 4666 + }, + { + "epoch": 0.5679342865835108, + "grad_norm": 0.844896674156189, + "learning_rate": 1.6628249669121095e-05, + "loss": 0.4744, + "step": 4667 + }, + { + "epoch": 0.5680559780955279, + "grad_norm": 4.082496166229248, + "learning_rate": 1.6626788913357764e-05, + "loss": 0.4501, + "step": 4668 + }, + { + "epoch": 0.5681776696075449, + "grad_norm": 3.9005239009857178, + "learning_rate": 1.6625327905431395e-05, + "loss": 0.5026, + "step": 4669 + }, + { + "epoch": 0.5682993611195619, + "grad_norm": 3.5627479553222656, + "learning_rate": 1.662386664539759e-05, + "loss": 0.4466, + "step": 4670 + }, + { + "epoch": 0.5684210526315789, + "grad_norm": 2.6441195011138916, + "learning_rate": 1.6622405133311946e-05, + "loss": 0.4449, + "step": 4671 + }, + { + "epoch": 0.568542744143596, + "grad_norm": 1.890883207321167, + "learning_rate": 1.662094336923008e-05, + "loss": 0.5002, + "step": 4672 + }, + { + "epoch": 0.568664435655613, + "grad_norm": 3.0529839992523193, + "learning_rate": 1.6619481353207617e-05, + "loss": 0.4042, + "step": 4673 + }, + { + "epoch": 0.5687861271676301, + "grad_norm": 1.8808848857879639, + "learning_rate": 1.6618019085300183e-05, + "loss": 0.5237, + "step": 4674 + }, + { + "epoch": 0.5689078186796471, + "grad_norm": 0.9577593207359314, + "learning_rate": 1.6616556565563427e-05, + "loss": 0.4571, + "step": 4675 + }, + { + "epoch": 0.5690295101916641, + "grad_norm": 0.7071251273155212, + "learning_rate": 1.6615093794052995e-05, + "loss": 0.4333, + "step": 4676 + }, + { + "epoch": 0.5691512017036812, + "grad_norm": 0.5237108469009399, + "learning_rate": 1.6613630770824554e-05, + "loss": 0.4135, + "step": 4677 + }, + { + "epoch": 0.5692728932156982, + "grad_norm": 2.173288345336914, + "learning_rate": 1.6612167495933772e-05, + "loss": 0.4607, + "step": 4678 + }, + { + "epoch": 0.5693945847277152, + "grad_norm": 3.9257712364196777, + "learning_rate": 1.6610703969436328e-05, + "loss": 0.515, + "step": 4679 + }, + { + "epoch": 0.5695162762397323, + "grad_norm": 0.795545220375061, + "learning_rate": 1.6609240191387917e-05, + "loss": 0.3928, + "step": 4680 + }, + { + "epoch": 0.5696379677517494, + "grad_norm": 2.5526885986328125, + "learning_rate": 1.6607776161844237e-05, + "loss": 0.5013, + "step": 4681 + }, + { + "epoch": 0.5697596592637664, + "grad_norm": 1.6104010343551636, + "learning_rate": 1.6606311880860998e-05, + "loss": 0.3992, + "step": 4682 + }, + { + "epoch": 0.5698813507757834, + "grad_norm": 0.6600081324577332, + "learning_rate": 1.6604847348493914e-05, + "loss": 0.4367, + "step": 4683 + }, + { + "epoch": 0.5700030422878004, + "grad_norm": 0.7592805624008179, + "learning_rate": 1.6603382564798718e-05, + "loss": 0.4157, + "step": 4684 + }, + { + "epoch": 0.5701247337998174, + "grad_norm": 0.8882337808609009, + "learning_rate": 1.6601917529831147e-05, + "loss": 0.4152, + "step": 4685 + }, + { + "epoch": 0.5702464253118344, + "grad_norm": 1.7887595891952515, + "learning_rate": 1.6600452243646947e-05, + "loss": 0.4681, + "step": 4686 + }, + { + "epoch": 0.5703681168238516, + "grad_norm": 1.3326091766357422, + "learning_rate": 1.6598986706301882e-05, + "loss": 0.3958, + "step": 4687 + }, + { + "epoch": 0.5704898083358686, + "grad_norm": 0.5888066291809082, + "learning_rate": 1.659752091785171e-05, + "loss": 0.4339, + "step": 4688 + }, + { + "epoch": 0.5706114998478856, + "grad_norm": 1.2763609886169434, + "learning_rate": 1.6596054878352213e-05, + "loss": 0.4798, + "step": 4689 + }, + { + "epoch": 0.5707331913599026, + "grad_norm": 1.2049351930618286, + "learning_rate": 1.6594588587859174e-05, + "loss": 0.4304, + "step": 4690 + }, + { + "epoch": 0.5708548828719197, + "grad_norm": 1.304108738899231, + "learning_rate": 1.659312204642839e-05, + "loss": 0.4148, + "step": 4691 + }, + { + "epoch": 0.5709765743839367, + "grad_norm": 0.7838099598884583, + "learning_rate": 1.659165525411566e-05, + "loss": 0.4536, + "step": 4692 + }, + { + "epoch": 0.5710982658959538, + "grad_norm": 3.4736058712005615, + "learning_rate": 1.659018821097681e-05, + "loss": 0.4795, + "step": 4693 + }, + { + "epoch": 0.5712199574079708, + "grad_norm": 1.1521637439727783, + "learning_rate": 1.6588720917067655e-05, + "loss": 0.5262, + "step": 4694 + }, + { + "epoch": 0.5713416489199878, + "grad_norm": 0.8665807843208313, + "learning_rate": 1.6587253372444034e-05, + "loss": 0.4808, + "step": 4695 + }, + { + "epoch": 0.5714633404320049, + "grad_norm": 1.7835919857025146, + "learning_rate": 1.658578557716178e-05, + "loss": 0.4328, + "step": 4696 + }, + { + "epoch": 0.5715850319440219, + "grad_norm": 2.4548680782318115, + "learning_rate": 1.658431753127676e-05, + "loss": 0.4276, + "step": 4697 + }, + { + "epoch": 0.5717067234560389, + "grad_norm": 3.152401924133301, + "learning_rate": 1.658284923484483e-05, + "loss": 0.4058, + "step": 4698 + }, + { + "epoch": 0.571828414968056, + "grad_norm": 1.9626328945159912, + "learning_rate": 1.6581380687921856e-05, + "loss": 0.5017, + "step": 4699 + }, + { + "epoch": 0.571950106480073, + "grad_norm": 1.310911774635315, + "learning_rate": 1.6579911890563726e-05, + "loss": 0.4867, + "step": 4700 + }, + { + "epoch": 0.5720717979920901, + "grad_norm": 1.5087254047393799, + "learning_rate": 1.657844284282633e-05, + "loss": 0.4185, + "step": 4701 + }, + { + "epoch": 0.5721934895041071, + "grad_norm": 0.9733480215072632, + "learning_rate": 1.6576973544765564e-05, + "loss": 0.4241, + "step": 4702 + }, + { + "epoch": 0.5723151810161241, + "grad_norm": 3.9932644367218018, + "learning_rate": 1.6575503996437344e-05, + "loss": 0.5327, + "step": 4703 + }, + { + "epoch": 0.5724368725281411, + "grad_norm": 3.181094169616699, + "learning_rate": 1.6574034197897587e-05, + "loss": 0.4977, + "step": 4704 + }, + { + "epoch": 0.5725585640401581, + "grad_norm": 0.5844399929046631, + "learning_rate": 1.6572564149202217e-05, + "loss": 0.4598, + "step": 4705 + }, + { + "epoch": 0.5726802555521753, + "grad_norm": 2.7010364532470703, + "learning_rate": 1.657109385040718e-05, + "loss": 0.4997, + "step": 4706 + }, + { + "epoch": 0.5728019470641923, + "grad_norm": 1.0860276222229004, + "learning_rate": 1.656962330156842e-05, + "loss": 0.4276, + "step": 4707 + }, + { + "epoch": 0.5729236385762093, + "grad_norm": 2.9141764640808105, + "learning_rate": 1.6568152502741895e-05, + "loss": 0.5186, + "step": 4708 + }, + { + "epoch": 0.5730453300882263, + "grad_norm": 1.4544330835342407, + "learning_rate": 1.6566681453983574e-05, + "loss": 0.4707, + "step": 4709 + }, + { + "epoch": 0.5731670216002434, + "grad_norm": 1.860729694366455, + "learning_rate": 1.656521015534943e-05, + "loss": 0.4024, + "step": 4710 + }, + { + "epoch": 0.5732887131122604, + "grad_norm": 0.5835658311843872, + "learning_rate": 1.6563738606895447e-05, + "loss": 0.4861, + "step": 4711 + }, + { + "epoch": 0.5734104046242775, + "grad_norm": 2.6423275470733643, + "learning_rate": 1.6562266808677628e-05, + "loss": 0.5109, + "step": 4712 + }, + { + "epoch": 0.5735320961362945, + "grad_norm": 1.2133762836456299, + "learning_rate": 1.656079476075197e-05, + "loss": 0.4732, + "step": 4713 + }, + { + "epoch": 0.5736537876483115, + "grad_norm": 3.589836597442627, + "learning_rate": 1.6559322463174495e-05, + "loss": 0.4235, + "step": 4714 + }, + { + "epoch": 0.5737754791603286, + "grad_norm": 4.277742385864258, + "learning_rate": 1.655784991600122e-05, + "loss": 0.4232, + "step": 4715 + }, + { + "epoch": 0.5738971706723456, + "grad_norm": 4.2156662940979, + "learning_rate": 1.6556377119288185e-05, + "loss": 0.4285, + "step": 4716 + }, + { + "epoch": 0.5740188621843626, + "grad_norm": 2.3883912563323975, + "learning_rate": 1.655490407309143e-05, + "loss": 0.4379, + "step": 4717 + }, + { + "epoch": 0.5741405536963797, + "grad_norm": 2.569422721862793, + "learning_rate": 1.6553430777467004e-05, + "loss": 0.4769, + "step": 4718 + }, + { + "epoch": 0.5742622452083967, + "grad_norm": 2.017634391784668, + "learning_rate": 1.6551957232470973e-05, + "loss": 0.4085, + "step": 4719 + }, + { + "epoch": 0.5743839367204138, + "grad_norm": 2.7294881343841553, + "learning_rate": 1.6550483438159407e-05, + "loss": 0.4689, + "step": 4720 + }, + { + "epoch": 0.5745056282324308, + "grad_norm": 3.2133383750915527, + "learning_rate": 1.654900939458839e-05, + "loss": 0.4674, + "step": 4721 + }, + { + "epoch": 0.5746273197444478, + "grad_norm": 5.8939290046691895, + "learning_rate": 1.6547535101814007e-05, + "loss": 0.565, + "step": 4722 + }, + { + "epoch": 0.5747490112564648, + "grad_norm": 2.4256250858306885, + "learning_rate": 1.654606055989236e-05, + "loss": 0.4194, + "step": 4723 + }, + { + "epoch": 0.5748707027684818, + "grad_norm": 1.6022299528121948, + "learning_rate": 1.654458576887956e-05, + "loss": 0.4738, + "step": 4724 + }, + { + "epoch": 0.574992394280499, + "grad_norm": 0.6131394505500793, + "learning_rate": 1.6543110728831727e-05, + "loss": 0.4412, + "step": 4725 + }, + { + "epoch": 0.575114085792516, + "grad_norm": 0.6355972290039062, + "learning_rate": 1.6541635439804985e-05, + "loss": 0.4802, + "step": 4726 + }, + { + "epoch": 0.575235777304533, + "grad_norm": 0.8190802931785583, + "learning_rate": 1.6540159901855477e-05, + "loss": 0.4735, + "step": 4727 + }, + { + "epoch": 0.57535746881655, + "grad_norm": 1.7251677513122559, + "learning_rate": 1.6538684115039344e-05, + "loss": 0.4333, + "step": 4728 + }, + { + "epoch": 0.575479160328567, + "grad_norm": 0.7691818475723267, + "learning_rate": 1.6537208079412747e-05, + "loss": 0.5113, + "step": 4729 + }, + { + "epoch": 0.5756008518405841, + "grad_norm": 1.1802380084991455, + "learning_rate": 1.6535731795031853e-05, + "loss": 0.4724, + "step": 4730 + }, + { + "epoch": 0.5757225433526012, + "grad_norm": 2.2256858348846436, + "learning_rate": 1.6534255261952835e-05, + "loss": 0.4635, + "step": 4731 + }, + { + "epoch": 0.5758442348646182, + "grad_norm": 1.4814857244491577, + "learning_rate": 1.653277848023188e-05, + "loss": 0.4927, + "step": 4732 + }, + { + "epoch": 0.5759659263766352, + "grad_norm": 1.2086328268051147, + "learning_rate": 1.653130144992518e-05, + "loss": 0.4895, + "step": 4733 + }, + { + "epoch": 0.5760876178886523, + "grad_norm": 0.6060865521430969, + "learning_rate": 1.652982417108894e-05, + "loss": 0.4568, + "step": 4734 + }, + { + "epoch": 0.5762093094006693, + "grad_norm": 1.071037769317627, + "learning_rate": 1.6528346643779377e-05, + "loss": 0.4623, + "step": 4735 + }, + { + "epoch": 0.5763310009126863, + "grad_norm": 0.8550727963447571, + "learning_rate": 1.652686886805271e-05, + "loss": 0.4454, + "step": 4736 + }, + { + "epoch": 0.5764526924247034, + "grad_norm": 1.1249865293502808, + "learning_rate": 1.6525390843965172e-05, + "loss": 0.4224, + "step": 4737 + }, + { + "epoch": 0.5765743839367204, + "grad_norm": 1.7408998012542725, + "learning_rate": 1.6523912571573007e-05, + "loss": 0.4851, + "step": 4738 + }, + { + "epoch": 0.5766960754487375, + "grad_norm": 1.7351619005203247, + "learning_rate": 1.6522434050932466e-05, + "loss": 0.4468, + "step": 4739 + }, + { + "epoch": 0.5768177669607545, + "grad_norm": 0.5777702927589417, + "learning_rate": 1.6520955282099803e-05, + "loss": 0.45, + "step": 4740 + }, + { + "epoch": 0.5769394584727715, + "grad_norm": 1.0006364583969116, + "learning_rate": 1.6519476265131302e-05, + "loss": 0.4369, + "step": 4741 + }, + { + "epoch": 0.5770611499847885, + "grad_norm": 1.83672034740448, + "learning_rate": 1.6517997000083228e-05, + "loss": 0.4153, + "step": 4742 + }, + { + "epoch": 0.5771828414968055, + "grad_norm": 0.8941745758056641, + "learning_rate": 1.651651748701188e-05, + "loss": 0.4614, + "step": 4743 + }, + { + "epoch": 0.5773045330088227, + "grad_norm": 0.72663813829422, + "learning_rate": 1.651503772597355e-05, + "loss": 0.4476, + "step": 4744 + }, + { + "epoch": 0.5774262245208397, + "grad_norm": 1.300182819366455, + "learning_rate": 1.651355771702455e-05, + "loss": 0.429, + "step": 4745 + }, + { + "epoch": 0.5775479160328567, + "grad_norm": 2.005625009536743, + "learning_rate": 1.65120774602212e-05, + "loss": 0.4519, + "step": 4746 + }, + { + "epoch": 0.5776696075448737, + "grad_norm": 0.5779674649238586, + "learning_rate": 1.651059695561982e-05, + "loss": 0.4911, + "step": 4747 + }, + { + "epoch": 0.5777912990568908, + "grad_norm": 4.288958549499512, + "learning_rate": 1.650911620327675e-05, + "loss": 0.3728, + "step": 4748 + }, + { + "epoch": 0.5779129905689078, + "grad_norm": 0.7756676077842712, + "learning_rate": 1.6507635203248334e-05, + "loss": 0.4317, + "step": 4749 + }, + { + "epoch": 0.5780346820809249, + "grad_norm": 0.5822075009346008, + "learning_rate": 1.6506153955590932e-05, + "loss": 0.4457, + "step": 4750 + }, + { + "epoch": 0.5781563735929419, + "grad_norm": 1.170448660850525, + "learning_rate": 1.6504672460360907e-05, + "loss": 0.4572, + "step": 4751 + }, + { + "epoch": 0.5782780651049589, + "grad_norm": 1.5626784563064575, + "learning_rate": 1.6503190717614628e-05, + "loss": 0.4258, + "step": 4752 + }, + { + "epoch": 0.578399756616976, + "grad_norm": 1.3395341634750366, + "learning_rate": 1.650170872740848e-05, + "loss": 0.4338, + "step": 4753 + }, + { + "epoch": 0.578521448128993, + "grad_norm": 3.614774227142334, + "learning_rate": 1.6500226489798858e-05, + "loss": 0.5023, + "step": 4754 + }, + { + "epoch": 0.57864313964101, + "grad_norm": 3.1597650051116943, + "learning_rate": 1.649874400484216e-05, + "loss": 0.4769, + "step": 4755 + }, + { + "epoch": 0.5787648311530271, + "grad_norm": 0.8550236225128174, + "learning_rate": 1.6497261272594808e-05, + "loss": 0.4298, + "step": 4756 + }, + { + "epoch": 0.5788865226650441, + "grad_norm": 1.9913235902786255, + "learning_rate": 1.649577829311321e-05, + "loss": 0.5268, + "step": 4757 + }, + { + "epoch": 0.5790082141770612, + "grad_norm": 0.6691299080848694, + "learning_rate": 1.6494295066453806e-05, + "loss": 0.481, + "step": 4758 + }, + { + "epoch": 0.5791299056890782, + "grad_norm": 4.586913108825684, + "learning_rate": 1.649281159267303e-05, + "loss": 0.4638, + "step": 4759 + }, + { + "epoch": 0.5792515972010952, + "grad_norm": 3.3829920291900635, + "learning_rate": 1.6491327871827333e-05, + "loss": 0.4657, + "step": 4760 + }, + { + "epoch": 0.5793732887131122, + "grad_norm": 3.0412375926971436, + "learning_rate": 1.648984390397318e-05, + "loss": 0.4837, + "step": 4761 + }, + { + "epoch": 0.5794949802251294, + "grad_norm": 1.7043672800064087, + "learning_rate": 1.6488359689167027e-05, + "loss": 0.4748, + "step": 4762 + }, + { + "epoch": 0.5796166717371464, + "grad_norm": 3.3852155208587646, + "learning_rate": 1.6486875227465356e-05, + "loss": 0.4383, + "step": 4763 + }, + { + "epoch": 0.5797383632491634, + "grad_norm": 3.5761141777038574, + "learning_rate": 1.648539051892466e-05, + "loss": 0.4572, + "step": 4764 + }, + { + "epoch": 0.5798600547611804, + "grad_norm": 1.1287291049957275, + "learning_rate": 1.648390556360143e-05, + "loss": 0.4512, + "step": 4765 + }, + { + "epoch": 0.5799817462731974, + "grad_norm": 0.5648757815361023, + "learning_rate": 1.6482420361552173e-05, + "loss": 0.4424, + "step": 4766 + }, + { + "epoch": 0.5801034377852144, + "grad_norm": 3.5339434146881104, + "learning_rate": 1.64809349128334e-05, + "loss": 0.5179, + "step": 4767 + }, + { + "epoch": 0.5802251292972315, + "grad_norm": 2.2196714878082275, + "learning_rate": 1.6479449217501643e-05, + "loss": 0.4842, + "step": 4768 + }, + { + "epoch": 0.5803468208092486, + "grad_norm": 3.4961092472076416, + "learning_rate": 1.647796327561343e-05, + "loss": 0.4792, + "step": 4769 + }, + { + "epoch": 0.5804685123212656, + "grad_norm": 3.0751497745513916, + "learning_rate": 1.6476477087225306e-05, + "loss": 0.4604, + "step": 4770 + }, + { + "epoch": 0.5805902038332826, + "grad_norm": 2.4273838996887207, + "learning_rate": 1.6474990652393822e-05, + "loss": 0.4588, + "step": 4771 + }, + { + "epoch": 0.5807118953452997, + "grad_norm": 1.406319499015808, + "learning_rate": 1.647350397117554e-05, + "loss": 0.4464, + "step": 4772 + }, + { + "epoch": 0.5808335868573167, + "grad_norm": 0.7434306144714355, + "learning_rate": 1.647201704362704e-05, + "loss": 0.4546, + "step": 4773 + }, + { + "epoch": 0.5809552783693337, + "grad_norm": 0.7493419647216797, + "learning_rate": 1.6470529869804886e-05, + "loss": 0.4255, + "step": 4774 + }, + { + "epoch": 0.5810769698813508, + "grad_norm": 0.9410748481750488, + "learning_rate": 1.6469042449765682e-05, + "loss": 0.5143, + "step": 4775 + }, + { + "epoch": 0.5811986613933678, + "grad_norm": 1.2688018083572388, + "learning_rate": 1.646755478356602e-05, + "loss": 0.4925, + "step": 4776 + }, + { + "epoch": 0.5813203529053849, + "grad_norm": 2.6639444828033447, + "learning_rate": 1.6466066871262513e-05, + "loss": 0.4878, + "step": 4777 + }, + { + "epoch": 0.5814420444174019, + "grad_norm": 1.3994344472885132, + "learning_rate": 1.6464578712911776e-05, + "loss": 0.4807, + "step": 4778 + }, + { + "epoch": 0.5815637359294189, + "grad_norm": 2.3861780166625977, + "learning_rate": 1.646309030857044e-05, + "loss": 0.4415, + "step": 4779 + }, + { + "epoch": 0.5816854274414359, + "grad_norm": 1.6318391561508179, + "learning_rate": 1.646160165829514e-05, + "loss": 0.5171, + "step": 4780 + }, + { + "epoch": 0.581807118953453, + "grad_norm": 0.8817933797836304, + "learning_rate": 1.646011276214252e-05, + "loss": 0.4937, + "step": 4781 + }, + { + "epoch": 0.5819288104654701, + "grad_norm": 2.3189210891723633, + "learning_rate": 1.6458623620169238e-05, + "loss": 0.4736, + "step": 4782 + }, + { + "epoch": 0.5820505019774871, + "grad_norm": 1.1711735725402832, + "learning_rate": 1.6457134232431958e-05, + "loss": 0.4882, + "step": 4783 + }, + { + "epoch": 0.5821721934895041, + "grad_norm": 0.8435712456703186, + "learning_rate": 1.645564459898736e-05, + "loss": 0.4444, + "step": 4784 + }, + { + "epoch": 0.5822938850015211, + "grad_norm": 0.9287135004997253, + "learning_rate": 1.645415471989212e-05, + "loss": 0.4782, + "step": 4785 + }, + { + "epoch": 0.5824155765135381, + "grad_norm": 1.105912446975708, + "learning_rate": 1.645266459520293e-05, + "loss": 0.4619, + "step": 4786 + }, + { + "epoch": 0.5825372680255552, + "grad_norm": 2.156761646270752, + "learning_rate": 1.64511742249765e-05, + "loss": 0.4537, + "step": 4787 + }, + { + "epoch": 0.5826589595375723, + "grad_norm": 3.292728900909424, + "learning_rate": 1.6449683609269535e-05, + "loss": 0.4982, + "step": 4788 + }, + { + "epoch": 0.5827806510495893, + "grad_norm": 1.987899661064148, + "learning_rate": 1.644819274813876e-05, + "loss": 0.4016, + "step": 4789 + }, + { + "epoch": 0.5829023425616063, + "grad_norm": 1.496383786201477, + "learning_rate": 1.6446701641640904e-05, + "loss": 0.4981, + "step": 4790 + }, + { + "epoch": 0.5830240340736234, + "grad_norm": 1.2411843538284302, + "learning_rate": 1.6445210289832706e-05, + "loss": 0.542, + "step": 4791 + }, + { + "epoch": 0.5831457255856404, + "grad_norm": 0.8396580219268799, + "learning_rate": 1.6443718692770916e-05, + "loss": 0.4655, + "step": 4792 + }, + { + "epoch": 0.5832674170976574, + "grad_norm": 1.2516398429870605, + "learning_rate": 1.6442226850512292e-05, + "loss": 0.4523, + "step": 4793 + }, + { + "epoch": 0.5833891086096745, + "grad_norm": 2.404691696166992, + "learning_rate": 1.6440734763113598e-05, + "loss": 0.4642, + "step": 4794 + }, + { + "epoch": 0.5835108001216915, + "grad_norm": 1.21297287940979, + "learning_rate": 1.643924243063162e-05, + "loss": 0.48, + "step": 4795 + }, + { + "epoch": 0.5836324916337086, + "grad_norm": 2.2260003089904785, + "learning_rate": 1.643774985312314e-05, + "loss": 0.5069, + "step": 4796 + }, + { + "epoch": 0.5837541831457256, + "grad_norm": 1.9143424034118652, + "learning_rate": 1.643625703064495e-05, + "loss": 0.4693, + "step": 4797 + }, + { + "epoch": 0.5838758746577426, + "grad_norm": 1.9398530721664429, + "learning_rate": 1.6434763963253856e-05, + "loss": 0.4186, + "step": 4798 + }, + { + "epoch": 0.5839975661697596, + "grad_norm": 2.0223612785339355, + "learning_rate": 1.6433270651006676e-05, + "loss": 0.4722, + "step": 4799 + }, + { + "epoch": 0.5841192576817767, + "grad_norm": 0.7397261261940002, + "learning_rate": 1.6431777093960228e-05, + "loss": 0.4237, + "step": 4800 + }, + { + "epoch": 0.5842409491937938, + "grad_norm": 2.930478572845459, + "learning_rate": 1.643028329217135e-05, + "loss": 0.5151, + "step": 4801 + }, + { + "epoch": 0.5843626407058108, + "grad_norm": 1.4000325202941895, + "learning_rate": 1.6428789245696886e-05, + "loss": 0.4847, + "step": 4802 + }, + { + "epoch": 0.5844843322178278, + "grad_norm": 0.560815155506134, + "learning_rate": 1.6427294954593685e-05, + "loss": 0.4272, + "step": 4803 + }, + { + "epoch": 0.5846060237298448, + "grad_norm": 0.6566979885101318, + "learning_rate": 1.6425800418918604e-05, + "loss": 0.4515, + "step": 4804 + }, + { + "epoch": 0.5847277152418618, + "grad_norm": 2.36249041557312, + "learning_rate": 1.6424305638728516e-05, + "loss": 0.3983, + "step": 4805 + }, + { + "epoch": 0.5848494067538789, + "grad_norm": 0.9769846796989441, + "learning_rate": 1.64228106140803e-05, + "loss": 0.4773, + "step": 4806 + }, + { + "epoch": 0.584971098265896, + "grad_norm": 0.8010731339454651, + "learning_rate": 1.642131534503085e-05, + "loss": 0.5231, + "step": 4807 + }, + { + "epoch": 0.585092789777913, + "grad_norm": 2.609766960144043, + "learning_rate": 1.641981983163706e-05, + "loss": 0.4453, + "step": 4808 + }, + { + "epoch": 0.58521448128993, + "grad_norm": 1.526746153831482, + "learning_rate": 1.6418324073955833e-05, + "loss": 0.5037, + "step": 4809 + }, + { + "epoch": 0.585336172801947, + "grad_norm": 2.1547367572784424, + "learning_rate": 1.641682807204409e-05, + "loss": 0.452, + "step": 4810 + }, + { + "epoch": 0.5854578643139641, + "grad_norm": 1.9977123737335205, + "learning_rate": 1.6415331825958757e-05, + "loss": 0.4689, + "step": 4811 + }, + { + "epoch": 0.5855795558259811, + "grad_norm": 1.8976426124572754, + "learning_rate": 1.641383533575677e-05, + "loss": 0.4885, + "step": 4812 + }, + { + "epoch": 0.5857012473379982, + "grad_norm": 1.007218360900879, + "learning_rate": 1.6412338601495073e-05, + "loss": 0.4817, + "step": 4813 + }, + { + "epoch": 0.5858229388500152, + "grad_norm": 0.6481262445449829, + "learning_rate": 1.641084162323062e-05, + "loss": 0.4444, + "step": 4814 + }, + { + "epoch": 0.5859446303620323, + "grad_norm": 2.1297457218170166, + "learning_rate": 1.6409344401020372e-05, + "loss": 0.5232, + "step": 4815 + }, + { + "epoch": 0.5860663218740493, + "grad_norm": 2.920334815979004, + "learning_rate": 1.6407846934921304e-05, + "loss": 0.5061, + "step": 4816 + }, + { + "epoch": 0.5861880133860663, + "grad_norm": 1.4199961423873901, + "learning_rate": 1.6406349224990396e-05, + "loss": 0.4438, + "step": 4817 + }, + { + "epoch": 0.5863097048980833, + "grad_norm": 2.7833731174468994, + "learning_rate": 1.6404851271284638e-05, + "loss": 0.544, + "step": 4818 + }, + { + "epoch": 0.5864313964101004, + "grad_norm": 0.976802408695221, + "learning_rate": 1.6403353073861037e-05, + "loss": 0.4999, + "step": 4819 + }, + { + "epoch": 0.5865530879221175, + "grad_norm": 1.2913380861282349, + "learning_rate": 1.6401854632776594e-05, + "loss": 0.5261, + "step": 4820 + }, + { + "epoch": 0.5866747794341345, + "grad_norm": 1.2182037830352783, + "learning_rate": 1.6400355948088328e-05, + "loss": 0.4645, + "step": 4821 + }, + { + "epoch": 0.5867964709461515, + "grad_norm": 0.6297833323478699, + "learning_rate": 1.6398857019853272e-05, + "loss": 0.4733, + "step": 4822 + }, + { + "epoch": 0.5869181624581685, + "grad_norm": 1.6896119117736816, + "learning_rate": 1.6397357848128465e-05, + "loss": 0.4369, + "step": 4823 + }, + { + "epoch": 0.5870398539701855, + "grad_norm": 2.1172473430633545, + "learning_rate": 1.639585843297095e-05, + "loss": 0.5097, + "step": 4824 + }, + { + "epoch": 0.5871615454822026, + "grad_norm": 0.5329152941703796, + "learning_rate": 1.639435877443778e-05, + "loss": 0.4364, + "step": 4825 + }, + { + "epoch": 0.5872832369942197, + "grad_norm": 0.8318140506744385, + "learning_rate": 1.6392858872586022e-05, + "loss": 0.471, + "step": 4826 + }, + { + "epoch": 0.5874049285062367, + "grad_norm": 1.0805152654647827, + "learning_rate": 1.6391358727472755e-05, + "loss": 0.4668, + "step": 4827 + }, + { + "epoch": 0.5875266200182537, + "grad_norm": 1.5350522994995117, + "learning_rate": 1.6389858339155057e-05, + "loss": 0.4386, + "step": 4828 + }, + { + "epoch": 0.5876483115302708, + "grad_norm": 2.1716132164001465, + "learning_rate": 1.638835770769002e-05, + "loss": 0.5089, + "step": 4829 + }, + { + "epoch": 0.5877700030422878, + "grad_norm": 0.7079111337661743, + "learning_rate": 1.6386856833134753e-05, + "loss": 0.4696, + "step": 4830 + }, + { + "epoch": 0.5878916945543048, + "grad_norm": 0.7532975673675537, + "learning_rate": 1.6385355715546364e-05, + "loss": 0.4632, + "step": 4831 + }, + { + "epoch": 0.5880133860663219, + "grad_norm": 0.8625035881996155, + "learning_rate": 1.6383854354981972e-05, + "loss": 0.4494, + "step": 4832 + }, + { + "epoch": 0.5881350775783389, + "grad_norm": 0.8266791701316833, + "learning_rate": 1.6382352751498706e-05, + "loss": 0.4651, + "step": 4833 + }, + { + "epoch": 0.588256769090356, + "grad_norm": 1.7575665712356567, + "learning_rate": 1.6380850905153706e-05, + "loss": 0.4565, + "step": 4834 + }, + { + "epoch": 0.588378460602373, + "grad_norm": 1.365768313407898, + "learning_rate": 1.6379348816004127e-05, + "loss": 0.4584, + "step": 4835 + }, + { + "epoch": 0.58850015211439, + "grad_norm": 0.6995435357093811, + "learning_rate": 1.6377846484107116e-05, + "loss": 0.4273, + "step": 4836 + }, + { + "epoch": 0.588621843626407, + "grad_norm": 1.1196446418762207, + "learning_rate": 1.637634390951985e-05, + "loss": 0.4104, + "step": 4837 + }, + { + "epoch": 0.5887435351384241, + "grad_norm": 1.402665615081787, + "learning_rate": 1.6374841092299493e-05, + "loss": 0.4343, + "step": 4838 + }, + { + "epoch": 0.5888652266504412, + "grad_norm": 2.1505250930786133, + "learning_rate": 1.637333803250324e-05, + "loss": 0.4644, + "step": 4839 + }, + { + "epoch": 0.5889869181624582, + "grad_norm": 3.135927438735962, + "learning_rate": 1.6371834730188284e-05, + "loss": 0.5546, + "step": 4840 + }, + { + "epoch": 0.5891086096744752, + "grad_norm": 0.7046291828155518, + "learning_rate": 1.637033118541183e-05, + "loss": 0.4591, + "step": 4841 + }, + { + "epoch": 0.5892303011864922, + "grad_norm": 1.194037914276123, + "learning_rate": 1.6368827398231085e-05, + "loss": 0.5576, + "step": 4842 + }, + { + "epoch": 0.5893519926985092, + "grad_norm": 1.2430156469345093, + "learning_rate": 1.6367323368703275e-05, + "loss": 0.5076, + "step": 4843 + }, + { + "epoch": 0.5894736842105263, + "grad_norm": 0.9934839606285095, + "learning_rate": 1.6365819096885635e-05, + "loss": 0.4939, + "step": 4844 + }, + { + "epoch": 0.5895953757225434, + "grad_norm": 3.0190505981445312, + "learning_rate": 1.6364314582835395e-05, + "loss": 0.4703, + "step": 4845 + }, + { + "epoch": 0.5897170672345604, + "grad_norm": 1.2759708166122437, + "learning_rate": 1.6362809826609817e-05, + "loss": 0.4738, + "step": 4846 + }, + { + "epoch": 0.5898387587465774, + "grad_norm": 3.3026254177093506, + "learning_rate": 1.6361304828266153e-05, + "loss": 0.469, + "step": 4847 + }, + { + "epoch": 0.5899604502585944, + "grad_norm": 2.642491340637207, + "learning_rate": 1.6359799587861675e-05, + "loss": 0.4496, + "step": 4848 + }, + { + "epoch": 0.5900821417706115, + "grad_norm": 0.5912262797355652, + "learning_rate": 1.6358294105453656e-05, + "loss": 0.4696, + "step": 4849 + }, + { + "epoch": 0.5902038332826285, + "grad_norm": 1.8558549880981445, + "learning_rate": 1.6356788381099384e-05, + "loss": 0.4912, + "step": 4850 + }, + { + "epoch": 0.5903255247946456, + "grad_norm": 2.138456344604492, + "learning_rate": 1.6355282414856158e-05, + "loss": 0.5067, + "step": 4851 + }, + { + "epoch": 0.5904472163066626, + "grad_norm": 3.22438645362854, + "learning_rate": 1.635377620678128e-05, + "loss": 0.3721, + "step": 4852 + }, + { + "epoch": 0.5905689078186797, + "grad_norm": 1.7777165174484253, + "learning_rate": 1.6352269756932072e-05, + "loss": 0.4624, + "step": 4853 + }, + { + "epoch": 0.5906905993306967, + "grad_norm": 3.3200674057006836, + "learning_rate": 1.6350763065365846e-05, + "loss": 0.4839, + "step": 4854 + }, + { + "epoch": 0.5908122908427137, + "grad_norm": 1.2257426977157593, + "learning_rate": 1.634925613213994e-05, + "loss": 0.4666, + "step": 4855 + }, + { + "epoch": 0.5909339823547307, + "grad_norm": 1.2461884021759033, + "learning_rate": 1.6347748957311698e-05, + "loss": 0.4759, + "step": 4856 + }, + { + "epoch": 0.5910556738667478, + "grad_norm": 3.076406240463257, + "learning_rate": 1.6346241540938467e-05, + "loss": 0.4267, + "step": 4857 + }, + { + "epoch": 0.5911773653787649, + "grad_norm": 0.7298029065132141, + "learning_rate": 1.634473388307761e-05, + "loss": 0.4986, + "step": 4858 + }, + { + "epoch": 0.5912990568907819, + "grad_norm": 4.680472373962402, + "learning_rate": 1.6343225983786496e-05, + "loss": 0.4754, + "step": 4859 + }, + { + "epoch": 0.5914207484027989, + "grad_norm": 4.681982517242432, + "learning_rate": 1.6341717843122507e-05, + "loss": 0.4279, + "step": 4860 + }, + { + "epoch": 0.5915424399148159, + "grad_norm": 1.4455347061157227, + "learning_rate": 1.6340209461143023e-05, + "loss": 0.4768, + "step": 4861 + }, + { + "epoch": 0.5916641314268329, + "grad_norm": 2.154365301132202, + "learning_rate": 1.6338700837905446e-05, + "loss": 0.4205, + "step": 4862 + }, + { + "epoch": 0.5917858229388501, + "grad_norm": 0.7109108567237854, + "learning_rate": 1.6337191973467182e-05, + "loss": 0.4549, + "step": 4863 + }, + { + "epoch": 0.5919075144508671, + "grad_norm": 1.5381110906600952, + "learning_rate": 1.6335682867885646e-05, + "loss": 0.5015, + "step": 4864 + }, + { + "epoch": 0.5920292059628841, + "grad_norm": 2.0715560913085938, + "learning_rate": 1.633417352121826e-05, + "loss": 0.3974, + "step": 4865 + }, + { + "epoch": 0.5921508974749011, + "grad_norm": 1.9576348066329956, + "learning_rate": 1.6332663933522468e-05, + "loss": 0.4673, + "step": 4866 + }, + { + "epoch": 0.5922725889869181, + "grad_norm": 0.580463171005249, + "learning_rate": 1.63311541048557e-05, + "loss": 0.3903, + "step": 4867 + }, + { + "epoch": 0.5923942804989352, + "grad_norm": 4.284531116485596, + "learning_rate": 1.6329644035275416e-05, + "loss": 0.5127, + "step": 4868 + }, + { + "epoch": 0.5925159720109522, + "grad_norm": 1.575125813484192, + "learning_rate": 1.6328133724839074e-05, + "loss": 0.4162, + "step": 4869 + }, + { + "epoch": 0.5926376635229693, + "grad_norm": 0.6911008358001709, + "learning_rate": 1.6326623173604143e-05, + "loss": 0.4181, + "step": 4870 + }, + { + "epoch": 0.5927593550349863, + "grad_norm": 2.712672233581543, + "learning_rate": 1.6325112381628106e-05, + "loss": 0.4471, + "step": 4871 + }, + { + "epoch": 0.5928810465470034, + "grad_norm": 4.300261974334717, + "learning_rate": 1.6323601348968454e-05, + "loss": 0.5571, + "step": 4872 + }, + { + "epoch": 0.5930027380590204, + "grad_norm": 1.1900861263275146, + "learning_rate": 1.632209007568268e-05, + "loss": 0.4998, + "step": 4873 + }, + { + "epoch": 0.5931244295710374, + "grad_norm": 1.2653896808624268, + "learning_rate": 1.6320578561828288e-05, + "loss": 0.4469, + "step": 4874 + }, + { + "epoch": 0.5932461210830544, + "grad_norm": 1.5442241430282593, + "learning_rate": 1.63190668074628e-05, + "loss": 0.443, + "step": 4875 + }, + { + "epoch": 0.5933678125950715, + "grad_norm": 1.3200793266296387, + "learning_rate": 1.6317554812643744e-05, + "loss": 0.4788, + "step": 4876 + }, + { + "epoch": 0.5934895041070886, + "grad_norm": 2.5573348999023438, + "learning_rate": 1.631604257742865e-05, + "loss": 0.4422, + "step": 4877 + }, + { + "epoch": 0.5936111956191056, + "grad_norm": 2.103461503982544, + "learning_rate": 1.631453010187506e-05, + "loss": 0.4654, + "step": 4878 + }, + { + "epoch": 0.5937328871311226, + "grad_norm": 0.6830573678016663, + "learning_rate": 1.6313017386040532e-05, + "loss": 0.4731, + "step": 4879 + }, + { + "epoch": 0.5938545786431396, + "grad_norm": 1.3459645509719849, + "learning_rate": 1.6311504429982624e-05, + "loss": 0.4194, + "step": 4880 + }, + { + "epoch": 0.5939762701551566, + "grad_norm": 1.379117727279663, + "learning_rate": 1.6309991233758908e-05, + "loss": 0.4661, + "step": 4881 + }, + { + "epoch": 0.5940979616671738, + "grad_norm": 4.034727573394775, + "learning_rate": 1.6308477797426966e-05, + "loss": 0.5149, + "step": 4882 + }, + { + "epoch": 0.5942196531791908, + "grad_norm": 1.3604557514190674, + "learning_rate": 1.6306964121044386e-05, + "loss": 0.4949, + "step": 4883 + }, + { + "epoch": 0.5943413446912078, + "grad_norm": 0.6975970268249512, + "learning_rate": 1.6305450204668766e-05, + "loss": 0.3993, + "step": 4884 + }, + { + "epoch": 0.5944630362032248, + "grad_norm": 0.853491485118866, + "learning_rate": 1.630393604835771e-05, + "loss": 0.4374, + "step": 4885 + }, + { + "epoch": 0.5945847277152418, + "grad_norm": 1.17313551902771, + "learning_rate": 1.6302421652168846e-05, + "loss": 0.4897, + "step": 4886 + }, + { + "epoch": 0.5947064192272589, + "grad_norm": 1.399959683418274, + "learning_rate": 1.6300907016159787e-05, + "loss": 0.5056, + "step": 4887 + }, + { + "epoch": 0.5948281107392759, + "grad_norm": 1.9173386096954346, + "learning_rate": 1.6299392140388176e-05, + "loss": 0.5131, + "step": 4888 + }, + { + "epoch": 0.594949802251293, + "grad_norm": 2.126542091369629, + "learning_rate": 1.6297877024911658e-05, + "loss": 0.4163, + "step": 4889 + }, + { + "epoch": 0.59507149376331, + "grad_norm": 1.7467695474624634, + "learning_rate": 1.629636166978788e-05, + "loss": 0.5125, + "step": 4890 + }, + { + "epoch": 0.595193185275327, + "grad_norm": 1.3654292821884155, + "learning_rate": 1.629484607507451e-05, + "loss": 0.4825, + "step": 4891 + }, + { + "epoch": 0.5953148767873441, + "grad_norm": 0.8388196229934692, + "learning_rate": 1.6293330240829215e-05, + "loss": 0.4478, + "step": 4892 + }, + { + "epoch": 0.5954365682993611, + "grad_norm": 4.521686553955078, + "learning_rate": 1.6291814167109677e-05, + "loss": 0.4196, + "step": 4893 + }, + { + "epoch": 0.5955582598113781, + "grad_norm": 1.1228781938552856, + "learning_rate": 1.629029785397359e-05, + "loss": 0.4497, + "step": 4894 + }, + { + "epoch": 0.5956799513233952, + "grad_norm": 1.361053466796875, + "learning_rate": 1.6288781301478647e-05, + "loss": 0.5548, + "step": 4895 + }, + { + "epoch": 0.5958016428354123, + "grad_norm": 2.383488178253174, + "learning_rate": 1.6287264509682558e-05, + "loss": 0.4392, + "step": 4896 + }, + { + "epoch": 0.5959233343474293, + "grad_norm": 0.8069707751274109, + "learning_rate": 1.6285747478643042e-05, + "loss": 0.4718, + "step": 4897 + }, + { + "epoch": 0.5960450258594463, + "grad_norm": 0.800216794013977, + "learning_rate": 1.628423020841782e-05, + "loss": 0.47, + "step": 4898 + }, + { + "epoch": 0.5961667173714633, + "grad_norm": 2.162963390350342, + "learning_rate": 1.628271269906464e-05, + "loss": 0.4975, + "step": 4899 + }, + { + "epoch": 0.5962884088834803, + "grad_norm": 2.524778366088867, + "learning_rate": 1.6281194950641227e-05, + "loss": 0.4764, + "step": 4900 + }, + { + "epoch": 0.5964101003954975, + "grad_norm": 0.7725741267204285, + "learning_rate": 1.627967696320535e-05, + "loss": 0.4834, + "step": 4901 + }, + { + "epoch": 0.5965317919075145, + "grad_norm": 0.7840932011604309, + "learning_rate": 1.6278158736814765e-05, + "loss": 0.527, + "step": 4902 + }, + { + "epoch": 0.5966534834195315, + "grad_norm": 1.531099796295166, + "learning_rate": 1.6276640271527245e-05, + "loss": 0.4972, + "step": 4903 + }, + { + "epoch": 0.5967751749315485, + "grad_norm": 2.037529706954956, + "learning_rate": 1.6275121567400575e-05, + "loss": 0.4906, + "step": 4904 + }, + { + "epoch": 0.5968968664435655, + "grad_norm": 1.8573511838912964, + "learning_rate": 1.627360262449253e-05, + "loss": 0.4649, + "step": 4905 + }, + { + "epoch": 0.5970185579555826, + "grad_norm": 2.257289409637451, + "learning_rate": 1.627208344286093e-05, + "loss": 0.4452, + "step": 4906 + }, + { + "epoch": 0.5971402494675996, + "grad_norm": 0.8152996897697449, + "learning_rate": 1.627056402256357e-05, + "loss": 0.4529, + "step": 4907 + }, + { + "epoch": 0.5972619409796167, + "grad_norm": 0.8906998634338379, + "learning_rate": 1.6269044363658268e-05, + "loss": 0.4517, + "step": 4908 + }, + { + "epoch": 0.5973836324916337, + "grad_norm": 1.3469293117523193, + "learning_rate": 1.626752446620285e-05, + "loss": 0.4754, + "step": 4909 + }, + { + "epoch": 0.5975053240036508, + "grad_norm": 4.20992374420166, + "learning_rate": 1.6266004330255155e-05, + "loss": 0.5106, + "step": 4910 + }, + { + "epoch": 0.5976270155156678, + "grad_norm": 4.386756896972656, + "learning_rate": 1.6264483955873025e-05, + "loss": 0.5171, + "step": 4911 + }, + { + "epoch": 0.5977487070276848, + "grad_norm": 3.6281614303588867, + "learning_rate": 1.6262963343114316e-05, + "loss": 0.5142, + "step": 4912 + }, + { + "epoch": 0.5978703985397018, + "grad_norm": 4.057873249053955, + "learning_rate": 1.6261442492036887e-05, + "loss": 0.5312, + "step": 4913 + }, + { + "epoch": 0.5979920900517189, + "grad_norm": 1.9563565254211426, + "learning_rate": 1.625992140269861e-05, + "loss": 0.5086, + "step": 4914 + }, + { + "epoch": 0.598113781563736, + "grad_norm": 1.8507587909698486, + "learning_rate": 1.6258400075157362e-05, + "loss": 0.4166, + "step": 4915 + }, + { + "epoch": 0.598235473075753, + "grad_norm": 0.7084296345710754, + "learning_rate": 1.6256878509471043e-05, + "loss": 0.4427, + "step": 4916 + }, + { + "epoch": 0.59835716458777, + "grad_norm": 0.6602417230606079, + "learning_rate": 1.6255356705697548e-05, + "loss": 0.4912, + "step": 4917 + }, + { + "epoch": 0.598478856099787, + "grad_norm": 0.734244704246521, + "learning_rate": 1.6253834663894778e-05, + "loss": 0.4978, + "step": 4918 + }, + { + "epoch": 0.598600547611804, + "grad_norm": 2.2112808227539062, + "learning_rate": 1.6252312384120652e-05, + "loss": 0.4609, + "step": 4919 + }, + { + "epoch": 0.5987222391238212, + "grad_norm": 2.419268846511841, + "learning_rate": 1.6250789866433102e-05, + "loss": 0.48, + "step": 4920 + }, + { + "epoch": 0.5988439306358382, + "grad_norm": 1.058548927307129, + "learning_rate": 1.6249267110890057e-05, + "loss": 0.4747, + "step": 4921 + }, + { + "epoch": 0.5989656221478552, + "grad_norm": 0.5447883009910583, + "learning_rate": 1.624774411754946e-05, + "loss": 0.4808, + "step": 4922 + }, + { + "epoch": 0.5990873136598722, + "grad_norm": 0.8803136944770813, + "learning_rate": 1.624622088646928e-05, + "loss": 0.4378, + "step": 4923 + }, + { + "epoch": 0.5992090051718892, + "grad_norm": 0.6201842427253723, + "learning_rate": 1.6244697417707453e-05, + "loss": 0.4355, + "step": 4924 + }, + { + "epoch": 0.5993306966839063, + "grad_norm": 0.7090044617652893, + "learning_rate": 1.624317371132197e-05, + "loss": 0.3923, + "step": 4925 + }, + { + "epoch": 0.5994523881959233, + "grad_norm": 6.064487934112549, + "learning_rate": 1.62416497673708e-05, + "loss": 0.5719, + "step": 4926 + }, + { + "epoch": 0.5995740797079404, + "grad_norm": 4.118100643157959, + "learning_rate": 1.624012558591194e-05, + "loss": 0.4988, + "step": 4927 + }, + { + "epoch": 0.5996957712199574, + "grad_norm": 1.6848117113113403, + "learning_rate": 1.6238601167003384e-05, + "loss": 0.4782, + "step": 4928 + }, + { + "epoch": 0.5998174627319744, + "grad_norm": 1.9276460409164429, + "learning_rate": 1.6237076510703142e-05, + "loss": 0.4817, + "step": 4929 + }, + { + "epoch": 0.5999391542439915, + "grad_norm": 3.334984302520752, + "learning_rate": 1.6235551617069228e-05, + "loss": 0.5285, + "step": 4930 + }, + { + "epoch": 0.6000608457560085, + "grad_norm": 1.0082061290740967, + "learning_rate": 1.6234026486159668e-05, + "loss": 0.4259, + "step": 4931 + }, + { + "epoch": 0.6001825372680255, + "grad_norm": 2.297333002090454, + "learning_rate": 1.6232501118032496e-05, + "loss": 0.5576, + "step": 4932 + }, + { + "epoch": 0.6003042287800426, + "grad_norm": 1.1869359016418457, + "learning_rate": 1.6230975512745756e-05, + "loss": 0.4634, + "step": 4933 + }, + { + "epoch": 0.6004259202920597, + "grad_norm": 1.9856035709381104, + "learning_rate": 1.6229449670357502e-05, + "loss": 0.4876, + "step": 4934 + }, + { + "epoch": 0.6005476118040767, + "grad_norm": 0.6499016880989075, + "learning_rate": 1.6227923590925794e-05, + "loss": 0.4823, + "step": 4935 + }, + { + "epoch": 0.6006693033160937, + "grad_norm": 2.8664772510528564, + "learning_rate": 1.6226397274508697e-05, + "loss": 0.4258, + "step": 4936 + }, + { + "epoch": 0.6007909948281107, + "grad_norm": 2.8016884326934814, + "learning_rate": 1.6224870721164304e-05, + "loss": 0.448, + "step": 4937 + }, + { + "epoch": 0.6009126863401277, + "grad_norm": 0.6614217758178711, + "learning_rate": 1.622334393095069e-05, + "loss": 0.4562, + "step": 4938 + }, + { + "epoch": 0.6010343778521449, + "grad_norm": 1.4142911434173584, + "learning_rate": 1.6221816903925956e-05, + "loss": 0.4369, + "step": 4939 + }, + { + "epoch": 0.6011560693641619, + "grad_norm": 1.9742951393127441, + "learning_rate": 1.6220289640148214e-05, + "loss": 0.4935, + "step": 4940 + }, + { + "epoch": 0.6012777608761789, + "grad_norm": 0.5329909324645996, + "learning_rate": 1.6218762139675574e-05, + "loss": 0.4132, + "step": 4941 + }, + { + "epoch": 0.6013994523881959, + "grad_norm": 2.0596513748168945, + "learning_rate": 1.6217234402566165e-05, + "loss": 0.4943, + "step": 4942 + }, + { + "epoch": 0.6015211439002129, + "grad_norm": 2.7426199913024902, + "learning_rate": 1.6215706428878115e-05, + "loss": 0.3465, + "step": 4943 + }, + { + "epoch": 0.60164283541223, + "grad_norm": 4.940963268280029, + "learning_rate": 1.621417821866957e-05, + "loss": 0.5164, + "step": 4944 + }, + { + "epoch": 0.601764526924247, + "grad_norm": 0.9825218319892883, + "learning_rate": 1.6212649771998685e-05, + "loss": 0.4044, + "step": 4945 + }, + { + "epoch": 0.6018862184362641, + "grad_norm": 1.3012899160385132, + "learning_rate": 1.621112108892361e-05, + "loss": 0.4254, + "step": 4946 + }, + { + "epoch": 0.6020079099482811, + "grad_norm": 0.6120168566703796, + "learning_rate": 1.6209592169502527e-05, + "loss": 0.374, + "step": 4947 + }, + { + "epoch": 0.6021296014602981, + "grad_norm": 2.465064764022827, + "learning_rate": 1.6208063013793607e-05, + "loss": 0.4842, + "step": 4948 + }, + { + "epoch": 0.6022512929723152, + "grad_norm": 2.29573917388916, + "learning_rate": 1.6206533621855037e-05, + "loss": 0.3824, + "step": 4949 + }, + { + "epoch": 0.6023729844843322, + "grad_norm": 1.0123569965362549, + "learning_rate": 1.620500399374502e-05, + "loss": 0.4662, + "step": 4950 + }, + { + "epoch": 0.6024946759963492, + "grad_norm": 0.9911242127418518, + "learning_rate": 1.6203474129521753e-05, + "loss": 0.4553, + "step": 4951 + }, + { + "epoch": 0.6026163675083663, + "grad_norm": 1.0947352647781372, + "learning_rate": 1.620194402924346e-05, + "loss": 0.4781, + "step": 4952 + }, + { + "epoch": 0.6027380590203834, + "grad_norm": 2.374330759048462, + "learning_rate": 1.6200413692968354e-05, + "loss": 0.423, + "step": 4953 + }, + { + "epoch": 0.6028597505324004, + "grad_norm": 1.1348915100097656, + "learning_rate": 1.619888312075468e-05, + "loss": 0.5043, + "step": 4954 + }, + { + "epoch": 0.6029814420444174, + "grad_norm": 3.420475721359253, + "learning_rate": 1.6197352312660664e-05, + "loss": 0.4196, + "step": 4955 + }, + { + "epoch": 0.6031031335564344, + "grad_norm": 4.5161824226379395, + "learning_rate": 1.6195821268744566e-05, + "loss": 0.3938, + "step": 4956 + }, + { + "epoch": 0.6032248250684514, + "grad_norm": 1.612881064414978, + "learning_rate": 1.619428998906465e-05, + "loss": 0.4163, + "step": 4957 + }, + { + "epoch": 0.6033465165804686, + "grad_norm": 1.0352132320404053, + "learning_rate": 1.6192758473679175e-05, + "loss": 0.4616, + "step": 4958 + }, + { + "epoch": 0.6034682080924856, + "grad_norm": 1.2749437093734741, + "learning_rate": 1.619122672264642e-05, + "loss": 0.4593, + "step": 4959 + }, + { + "epoch": 0.6035898996045026, + "grad_norm": 4.1626691818237305, + "learning_rate": 1.6189694736024674e-05, + "loss": 0.5131, + "step": 4960 + }, + { + "epoch": 0.6037115911165196, + "grad_norm": 1.3867450952529907, + "learning_rate": 1.6188162513872234e-05, + "loss": 0.4607, + "step": 4961 + }, + { + "epoch": 0.6038332826285366, + "grad_norm": 1.3090866804122925, + "learning_rate": 1.61866300562474e-05, + "loss": 0.4699, + "step": 4962 + }, + { + "epoch": 0.6039549741405537, + "grad_norm": 2.3500983715057373, + "learning_rate": 1.6185097363208487e-05, + "loss": 0.484, + "step": 4963 + }, + { + "epoch": 0.6040766656525708, + "grad_norm": 2.8437275886535645, + "learning_rate": 1.6183564434813813e-05, + "loss": 0.5235, + "step": 4964 + }, + { + "epoch": 0.6041983571645878, + "grad_norm": 2.4429731369018555, + "learning_rate": 1.6182031271121718e-05, + "loss": 0.3938, + "step": 4965 + }, + { + "epoch": 0.6043200486766048, + "grad_norm": 1.2975447177886963, + "learning_rate": 1.6180497872190534e-05, + "loss": 0.468, + "step": 4966 + }, + { + "epoch": 0.6044417401886218, + "grad_norm": 0.9424712657928467, + "learning_rate": 1.6178964238078617e-05, + "loss": 0.4977, + "step": 4967 + }, + { + "epoch": 0.6045634317006389, + "grad_norm": 4.481785774230957, + "learning_rate": 1.6177430368844316e-05, + "loss": 0.441, + "step": 4968 + }, + { + "epoch": 0.6046851232126559, + "grad_norm": 1.465193271636963, + "learning_rate": 1.6175896264546005e-05, + "loss": 0.4188, + "step": 4969 + }, + { + "epoch": 0.6048068147246729, + "grad_norm": 0.6886785626411438, + "learning_rate": 1.617436192524206e-05, + "loss": 0.4769, + "step": 4970 + }, + { + "epoch": 0.60492850623669, + "grad_norm": 0.572315514087677, + "learning_rate": 1.617282735099086e-05, + "loss": 0.4988, + "step": 4971 + }, + { + "epoch": 0.605050197748707, + "grad_norm": 0.5896018147468567, + "learning_rate": 1.6171292541850805e-05, + "loss": 0.4711, + "step": 4972 + }, + { + "epoch": 0.6051718892607241, + "grad_norm": 0.6635491251945496, + "learning_rate": 1.6169757497880293e-05, + "loss": 0.4336, + "step": 4973 + }, + { + "epoch": 0.6052935807727411, + "grad_norm": 0.7088550925254822, + "learning_rate": 1.6168222219137736e-05, + "loss": 0.4356, + "step": 4974 + }, + { + "epoch": 0.6054152722847581, + "grad_norm": 2.279660224914551, + "learning_rate": 1.616668670568156e-05, + "loss": 0.4754, + "step": 4975 + }, + { + "epoch": 0.6055369637967751, + "grad_norm": 1.945552945137024, + "learning_rate": 1.6165150957570187e-05, + "loss": 0.4438, + "step": 4976 + }, + { + "epoch": 0.6056586553087923, + "grad_norm": 1.1030707359313965, + "learning_rate": 1.616361497486206e-05, + "loss": 0.4702, + "step": 4977 + }, + { + "epoch": 0.6057803468208093, + "grad_norm": 0.756426990032196, + "learning_rate": 1.616207875761563e-05, + "loss": 0.4668, + "step": 4978 + }, + { + "epoch": 0.6059020383328263, + "grad_norm": 0.6813930869102478, + "learning_rate": 1.616054230588934e-05, + "loss": 0.4458, + "step": 4979 + }, + { + "epoch": 0.6060237298448433, + "grad_norm": 0.7297816872596741, + "learning_rate": 1.6159005619741667e-05, + "loss": 0.4551, + "step": 4980 + }, + { + "epoch": 0.6061454213568603, + "grad_norm": 1.605370044708252, + "learning_rate": 1.6157468699231083e-05, + "loss": 0.5052, + "step": 4981 + }, + { + "epoch": 0.6062671128688774, + "grad_norm": 1.510328769683838, + "learning_rate": 1.6155931544416072e-05, + "loss": 0.4427, + "step": 4982 + }, + { + "epoch": 0.6063888043808945, + "grad_norm": 0.8366087675094604, + "learning_rate": 1.6154394155355122e-05, + "loss": 0.4812, + "step": 4983 + }, + { + "epoch": 0.6065104958929115, + "grad_norm": 2.1733524799346924, + "learning_rate": 1.6152856532106733e-05, + "loss": 0.442, + "step": 4984 + }, + { + "epoch": 0.6066321874049285, + "grad_norm": 2.20855712890625, + "learning_rate": 1.615131867472942e-05, + "loss": 0.4537, + "step": 4985 + }, + { + "epoch": 0.6067538789169455, + "grad_norm": 0.5362184643745422, + "learning_rate": 1.6149780583281698e-05, + "loss": 0.4579, + "step": 4986 + }, + { + "epoch": 0.6068755704289626, + "grad_norm": 1.5650177001953125, + "learning_rate": 1.6148242257822095e-05, + "loss": 0.469, + "step": 4987 + }, + { + "epoch": 0.6069972619409796, + "grad_norm": 4.210306167602539, + "learning_rate": 1.614670369840915e-05, + "loss": 0.5633, + "step": 4988 + }, + { + "epoch": 0.6071189534529966, + "grad_norm": 1.1583614349365234, + "learning_rate": 1.61451649051014e-05, + "loss": 0.4377, + "step": 4989 + }, + { + "epoch": 0.6072406449650137, + "grad_norm": 0.6843428015708923, + "learning_rate": 1.614362587795741e-05, + "loss": 0.4719, + "step": 4990 + }, + { + "epoch": 0.6073623364770308, + "grad_norm": 1.592503309249878, + "learning_rate": 1.614208661703574e-05, + "loss": 0.4186, + "step": 4991 + }, + { + "epoch": 0.6074840279890478, + "grad_norm": 0.6590865850448608, + "learning_rate": 1.6140547122394957e-05, + "loss": 0.4677, + "step": 4992 + }, + { + "epoch": 0.6076057195010648, + "grad_norm": 1.2795705795288086, + "learning_rate": 1.613900739409365e-05, + "loss": 0.4689, + "step": 4993 + }, + { + "epoch": 0.6077274110130818, + "grad_norm": 3.304614305496216, + "learning_rate": 1.61374674321904e-05, + "loss": 0.4189, + "step": 4994 + }, + { + "epoch": 0.6078491025250988, + "grad_norm": 2.031473159790039, + "learning_rate": 1.6135927236743814e-05, + "loss": 0.496, + "step": 4995 + }, + { + "epoch": 0.607970794037116, + "grad_norm": 2.451896905899048, + "learning_rate": 1.6134386807812497e-05, + "loss": 0.3828, + "step": 4996 + }, + { + "epoch": 0.608092485549133, + "grad_norm": 0.5683725476264954, + "learning_rate": 1.6132846145455064e-05, + "loss": 0.4231, + "step": 4997 + }, + { + "epoch": 0.60821417706115, + "grad_norm": 3.8416659832000732, + "learning_rate": 1.6131305249730137e-05, + "loss": 0.5397, + "step": 4998 + }, + { + "epoch": 0.608335868573167, + "grad_norm": 0.7293667793273926, + "learning_rate": 1.6129764120696358e-05, + "loss": 0.4245, + "step": 4999 + }, + { + "epoch": 0.608457560085184, + "grad_norm": 1.453599214553833, + "learning_rate": 1.6128222758412365e-05, + "loss": 0.4548, + "step": 5000 + }, + { + "epoch": 0.608579251597201, + "grad_norm": 0.8216196298599243, + "learning_rate": 1.612668116293681e-05, + "loss": 0.4281, + "step": 5001 + }, + { + "epoch": 0.6087009431092182, + "grad_norm": 1.4079035520553589, + "learning_rate": 1.6125139334328355e-05, + "loss": 0.4655, + "step": 5002 + }, + { + "epoch": 0.6088226346212352, + "grad_norm": 0.7399680614471436, + "learning_rate": 1.6123597272645673e-05, + "loss": 0.4446, + "step": 5003 + }, + { + "epoch": 0.6089443261332522, + "grad_norm": 3.4661307334899902, + "learning_rate": 1.612205497794744e-05, + "loss": 0.5774, + "step": 5004 + }, + { + "epoch": 0.6090660176452692, + "grad_norm": 1.2089195251464844, + "learning_rate": 1.612051245029234e-05, + "loss": 0.4944, + "step": 5005 + }, + { + "epoch": 0.6091877091572863, + "grad_norm": 1.1540513038635254, + "learning_rate": 1.6118969689739072e-05, + "loss": 0.4597, + "step": 5006 + }, + { + "epoch": 0.6093094006693033, + "grad_norm": 0.5720807909965515, + "learning_rate": 1.6117426696346345e-05, + "loss": 0.5137, + "step": 5007 + }, + { + "epoch": 0.6094310921813203, + "grad_norm": 4.510030746459961, + "learning_rate": 1.6115883470172867e-05, + "loss": 0.4496, + "step": 5008 + }, + { + "epoch": 0.6095527836933374, + "grad_norm": 3.608569383621216, + "learning_rate": 1.6114340011277365e-05, + "loss": 0.4726, + "step": 5009 + }, + { + "epoch": 0.6096744752053544, + "grad_norm": 5.447242259979248, + "learning_rate": 1.6112796319718568e-05, + "loss": 0.4202, + "step": 5010 + }, + { + "epoch": 0.6097961667173715, + "grad_norm": 0.8850874304771423, + "learning_rate": 1.6111252395555223e-05, + "loss": 0.4787, + "step": 5011 + }, + { + "epoch": 0.6099178582293885, + "grad_norm": 0.6327923536300659, + "learning_rate": 1.610970823884607e-05, + "loss": 0.4952, + "step": 5012 + }, + { + "epoch": 0.6100395497414055, + "grad_norm": 0.665271520614624, + "learning_rate": 1.6108163849649874e-05, + "loss": 0.4684, + "step": 5013 + }, + { + "epoch": 0.6101612412534225, + "grad_norm": 0.6051411032676697, + "learning_rate": 1.61066192280254e-05, + "loss": 0.4839, + "step": 5014 + }, + { + "epoch": 0.6102829327654397, + "grad_norm": 1.5371066331863403, + "learning_rate": 1.6105074374031425e-05, + "loss": 0.4579, + "step": 5015 + }, + { + "epoch": 0.6104046242774567, + "grad_norm": 2.597341299057007, + "learning_rate": 1.6103529287726733e-05, + "loss": 0.512, + "step": 5016 + }, + { + "epoch": 0.6105263157894737, + "grad_norm": 1.2913267612457275, + "learning_rate": 1.6101983969170117e-05, + "loss": 0.4294, + "step": 5017 + }, + { + "epoch": 0.6106480073014907, + "grad_norm": 3.880460500717163, + "learning_rate": 1.610043841842038e-05, + "loss": 0.5532, + "step": 5018 + }, + { + "epoch": 0.6107696988135077, + "grad_norm": 1.3539235591888428, + "learning_rate": 1.6098892635536336e-05, + "loss": 0.4773, + "step": 5019 + }, + { + "epoch": 0.6108913903255248, + "grad_norm": 0.8958723545074463, + "learning_rate": 1.60973466205768e-05, + "loss": 0.5105, + "step": 5020 + }, + { + "epoch": 0.6110130818375419, + "grad_norm": 3.1031079292297363, + "learning_rate": 1.609580037360061e-05, + "loss": 0.5389, + "step": 5021 + }, + { + "epoch": 0.6111347733495589, + "grad_norm": 2.0144450664520264, + "learning_rate": 1.6094253894666595e-05, + "loss": 0.4731, + "step": 5022 + }, + { + "epoch": 0.6112564648615759, + "grad_norm": 3.7965126037597656, + "learning_rate": 1.6092707183833605e-05, + "loss": 0.4452, + "step": 5023 + }, + { + "epoch": 0.6113781563735929, + "grad_norm": 2.4872331619262695, + "learning_rate": 1.6091160241160492e-05, + "loss": 0.4603, + "step": 5024 + }, + { + "epoch": 0.61149984788561, + "grad_norm": 3.876128673553467, + "learning_rate": 1.608961306670613e-05, + "loss": 0.444, + "step": 5025 + }, + { + "epoch": 0.611621539397627, + "grad_norm": 1.5480860471725464, + "learning_rate": 1.608806566052938e-05, + "loss": 0.4383, + "step": 5026 + }, + { + "epoch": 0.611743230909644, + "grad_norm": 1.0099836587905884, + "learning_rate": 1.6086518022689135e-05, + "loss": 0.4676, + "step": 5027 + }, + { + "epoch": 0.6118649224216611, + "grad_norm": 1.7212241888046265, + "learning_rate": 1.608497015324428e-05, + "loss": 0.4228, + "step": 5028 + }, + { + "epoch": 0.6119866139336781, + "grad_norm": 0.7636945247650146, + "learning_rate": 1.6083422052253713e-05, + "loss": 0.448, + "step": 5029 + }, + { + "epoch": 0.6121083054456952, + "grad_norm": 2.512002468109131, + "learning_rate": 1.6081873719776346e-05, + "loss": 0.4652, + "step": 5030 + }, + { + "epoch": 0.6122299969577122, + "grad_norm": 3.997149705886841, + "learning_rate": 1.6080325155871095e-05, + "loss": 0.507, + "step": 5031 + }, + { + "epoch": 0.6123516884697292, + "grad_norm": 4.454903602600098, + "learning_rate": 1.6078776360596885e-05, + "loss": 0.5244, + "step": 5032 + }, + { + "epoch": 0.6124733799817462, + "grad_norm": 4.167040824890137, + "learning_rate": 1.6077227334012657e-05, + "loss": 0.5063, + "step": 5033 + }, + { + "epoch": 0.6125950714937634, + "grad_norm": 3.052686929702759, + "learning_rate": 1.6075678076177345e-05, + "loss": 0.476, + "step": 5034 + }, + { + "epoch": 0.6127167630057804, + "grad_norm": 1.222151279449463, + "learning_rate": 1.607412858714991e-05, + "loss": 0.4604, + "step": 5035 + }, + { + "epoch": 0.6128384545177974, + "grad_norm": 1.5098296403884888, + "learning_rate": 1.6072578866989303e-05, + "loss": 0.4152, + "step": 5036 + }, + { + "epoch": 0.6129601460298144, + "grad_norm": 1.3683090209960938, + "learning_rate": 1.6071028915754505e-05, + "loss": 0.4434, + "step": 5037 + }, + { + "epoch": 0.6130818375418314, + "grad_norm": 1.541131854057312, + "learning_rate": 1.6069478733504494e-05, + "loss": 0.5116, + "step": 5038 + }, + { + "epoch": 0.6132035290538485, + "grad_norm": 6.025382041931152, + "learning_rate": 1.6067928320298247e-05, + "loss": 0.4294, + "step": 5039 + }, + { + "epoch": 0.6133252205658656, + "grad_norm": 1.479815125465393, + "learning_rate": 1.606637767619477e-05, + "loss": 0.4795, + "step": 5040 + }, + { + "epoch": 0.6134469120778826, + "grad_norm": 1.080148696899414, + "learning_rate": 1.6064826801253066e-05, + "loss": 0.5043, + "step": 5041 + }, + { + "epoch": 0.6135686035898996, + "grad_norm": 1.3082995414733887, + "learning_rate": 1.606327569553215e-05, + "loss": 0.431, + "step": 5042 + }, + { + "epoch": 0.6136902951019166, + "grad_norm": 1.3065533638000488, + "learning_rate": 1.606172435909104e-05, + "loss": 0.4555, + "step": 5043 + }, + { + "epoch": 0.6138119866139337, + "grad_norm": 1.8795653581619263, + "learning_rate": 1.6060172791988778e-05, + "loss": 0.4893, + "step": 5044 + }, + { + "epoch": 0.6139336781259507, + "grad_norm": 1.1706182956695557, + "learning_rate": 1.6058620994284394e-05, + "loss": 0.4463, + "step": 5045 + }, + { + "epoch": 0.6140553696379678, + "grad_norm": 0.9591489434242249, + "learning_rate": 1.6057068966036938e-05, + "loss": 0.504, + "step": 5046 + }, + { + "epoch": 0.6141770611499848, + "grad_norm": 0.6447705626487732, + "learning_rate": 1.6055516707305474e-05, + "loss": 0.4692, + "step": 5047 + }, + { + "epoch": 0.6142987526620018, + "grad_norm": 1.0074318647384644, + "learning_rate": 1.605396421814906e-05, + "loss": 0.491, + "step": 5048 + }, + { + "epoch": 0.6144204441740189, + "grad_norm": 2.2753360271453857, + "learning_rate": 1.6052411498626777e-05, + "loss": 0.4424, + "step": 5049 + }, + { + "epoch": 0.6145421356860359, + "grad_norm": 2.695660352706909, + "learning_rate": 1.6050858548797713e-05, + "loss": 0.4454, + "step": 5050 + }, + { + "epoch": 0.6146638271980529, + "grad_norm": 1.296058177947998, + "learning_rate": 1.6049305368720956e-05, + "loss": 0.4783, + "step": 5051 + }, + { + "epoch": 0.6147855187100699, + "grad_norm": 1.0239715576171875, + "learning_rate": 1.604775195845561e-05, + "loss": 0.4706, + "step": 5052 + }, + { + "epoch": 0.614907210222087, + "grad_norm": 0.730590283870697, + "learning_rate": 1.604619831806078e-05, + "loss": 0.485, + "step": 5053 + }, + { + "epoch": 0.6150289017341041, + "grad_norm": 0.6984491348266602, + "learning_rate": 1.6044644447595587e-05, + "loss": 0.4213, + "step": 5054 + }, + { + "epoch": 0.6151505932461211, + "grad_norm": 2.7646350860595703, + "learning_rate": 1.6043090347119165e-05, + "loss": 0.5108, + "step": 5055 + }, + { + "epoch": 0.6152722847581381, + "grad_norm": 1.0232855081558228, + "learning_rate": 1.6041536016690646e-05, + "loss": 0.4026, + "step": 5056 + }, + { + "epoch": 0.6153939762701551, + "grad_norm": 4.111287593841553, + "learning_rate": 1.6039981456369175e-05, + "loss": 0.5338, + "step": 5057 + }, + { + "epoch": 0.6155156677821721, + "grad_norm": 1.2919718027114868, + "learning_rate": 1.603842666621391e-05, + "loss": 0.46, + "step": 5058 + }, + { + "epoch": 0.6156373592941893, + "grad_norm": 1.4163293838500977, + "learning_rate": 1.603687164628401e-05, + "loss": 0.4058, + "step": 5059 + }, + { + "epoch": 0.6157590508062063, + "grad_norm": 3.231069564819336, + "learning_rate": 1.603531639663865e-05, + "loss": 0.5565, + "step": 5060 + }, + { + "epoch": 0.6158807423182233, + "grad_norm": 1.8095322847366333, + "learning_rate": 1.6033760917337007e-05, + "loss": 0.4078, + "step": 5061 + }, + { + "epoch": 0.6160024338302403, + "grad_norm": 0.8809127807617188, + "learning_rate": 1.6032205208438273e-05, + "loss": 0.4693, + "step": 5062 + }, + { + "epoch": 0.6161241253422574, + "grad_norm": 2.7180685997009277, + "learning_rate": 1.603064927000164e-05, + "loss": 0.5109, + "step": 5063 + }, + { + "epoch": 0.6162458168542744, + "grad_norm": 0.8069382905960083, + "learning_rate": 1.6029093102086325e-05, + "loss": 0.4567, + "step": 5064 + }, + { + "epoch": 0.6163675083662915, + "grad_norm": 1.958562970161438, + "learning_rate": 1.6027536704751535e-05, + "loss": 0.5393, + "step": 5065 + }, + { + "epoch": 0.6164891998783085, + "grad_norm": 1.3394745588302612, + "learning_rate": 1.60259800780565e-05, + "loss": 0.4854, + "step": 5066 + }, + { + "epoch": 0.6166108913903255, + "grad_norm": 2.8461692333221436, + "learning_rate": 1.602442322206045e-05, + "loss": 0.4376, + "step": 5067 + }, + { + "epoch": 0.6167325829023426, + "grad_norm": 2.2890515327453613, + "learning_rate": 1.6022866136822623e-05, + "loss": 0.4666, + "step": 5068 + }, + { + "epoch": 0.6168542744143596, + "grad_norm": 3.1733345985412598, + "learning_rate": 1.6021308822402277e-05, + "loss": 0.4322, + "step": 5069 + }, + { + "epoch": 0.6169759659263766, + "grad_norm": 2.5025064945220947, + "learning_rate": 1.601975127885866e-05, + "loss": 0.446, + "step": 5070 + }, + { + "epoch": 0.6170976574383936, + "grad_norm": 0.9727060794830322, + "learning_rate": 1.6018193506251053e-05, + "loss": 0.4911, + "step": 5071 + }, + { + "epoch": 0.6172193489504108, + "grad_norm": 2.2073404788970947, + "learning_rate": 1.6016635504638724e-05, + "loss": 0.4827, + "step": 5072 + }, + { + "epoch": 0.6173410404624278, + "grad_norm": 0.7293530106544495, + "learning_rate": 1.601507727408096e-05, + "loss": 0.4454, + "step": 5073 + }, + { + "epoch": 0.6174627319744448, + "grad_norm": 1.3858633041381836, + "learning_rate": 1.6013518814637055e-05, + "loss": 0.432, + "step": 5074 + }, + { + "epoch": 0.6175844234864618, + "grad_norm": 1.1621677875518799, + "learning_rate": 1.6011960126366314e-05, + "loss": 0.4445, + "step": 5075 + }, + { + "epoch": 0.6177061149984788, + "grad_norm": 0.6184937357902527, + "learning_rate": 1.6010401209328046e-05, + "loss": 0.4228, + "step": 5076 + }, + { + "epoch": 0.6178278065104958, + "grad_norm": 0.5854331254959106, + "learning_rate": 1.6008842063581566e-05, + "loss": 0.4689, + "step": 5077 + }, + { + "epoch": 0.617949498022513, + "grad_norm": 1.4424673318862915, + "learning_rate": 1.6007282689186215e-05, + "loss": 0.4507, + "step": 5078 + }, + { + "epoch": 0.61807118953453, + "grad_norm": 1.6235584020614624, + "learning_rate": 1.6005723086201318e-05, + "loss": 0.4512, + "step": 5079 + }, + { + "epoch": 0.618192881046547, + "grad_norm": 1.3193002939224243, + "learning_rate": 1.600416325468623e-05, + "loss": 0.471, + "step": 5080 + }, + { + "epoch": 0.618314572558564, + "grad_norm": 2.2656495571136475, + "learning_rate": 1.60026031947003e-05, + "loss": 0.492, + "step": 5081 + }, + { + "epoch": 0.618436264070581, + "grad_norm": 1.2294079065322876, + "learning_rate": 1.600104290630289e-05, + "loss": 0.4719, + "step": 5082 + }, + { + "epoch": 0.6185579555825981, + "grad_norm": 3.1018309593200684, + "learning_rate": 1.5999482389553382e-05, + "loss": 0.4441, + "step": 5083 + }, + { + "epoch": 0.6186796470946152, + "grad_norm": 0.8289585113525391, + "learning_rate": 1.5997921644511153e-05, + "loss": 0.4753, + "step": 5084 + }, + { + "epoch": 0.6188013386066322, + "grad_norm": 0.6501938104629517, + "learning_rate": 1.5996360671235588e-05, + "loss": 0.4853, + "step": 5085 + }, + { + "epoch": 0.6189230301186492, + "grad_norm": 1.0880118608474731, + "learning_rate": 1.599479946978608e-05, + "loss": 0.4962, + "step": 5086 + }, + { + "epoch": 0.6190447216306663, + "grad_norm": 0.6680058836936951, + "learning_rate": 1.5993238040222053e-05, + "loss": 0.4557, + "step": 5087 + }, + { + "epoch": 0.6191664131426833, + "grad_norm": 2.530580997467041, + "learning_rate": 1.599167638260291e-05, + "loss": 0.4418, + "step": 5088 + }, + { + "epoch": 0.6192881046547003, + "grad_norm": 3.3691353797912598, + "learning_rate": 1.5990114496988077e-05, + "loss": 0.4235, + "step": 5089 + }, + { + "epoch": 0.6194097961667173, + "grad_norm": 1.6397669315338135, + "learning_rate": 1.5988552383436992e-05, + "loss": 0.5388, + "step": 5090 + }, + { + "epoch": 0.6195314876787344, + "grad_norm": 1.6287271976470947, + "learning_rate": 1.598699004200909e-05, + "loss": 0.5284, + "step": 5091 + }, + { + "epoch": 0.6196531791907515, + "grad_norm": 1.6321117877960205, + "learning_rate": 1.5985427472763828e-05, + "loss": 0.457, + "step": 5092 + }, + { + "epoch": 0.6197748707027685, + "grad_norm": 1.9909961223602295, + "learning_rate": 1.598386467576066e-05, + "loss": 0.5002, + "step": 5093 + }, + { + "epoch": 0.6198965622147855, + "grad_norm": 1.2118253707885742, + "learning_rate": 1.598230165105905e-05, + "loss": 0.4897, + "step": 5094 + }, + { + "epoch": 0.6200182537268025, + "grad_norm": 1.9891451597213745, + "learning_rate": 1.5980738398718485e-05, + "loss": 0.4212, + "step": 5095 + }, + { + "epoch": 0.6201399452388195, + "grad_norm": 2.7410707473754883, + "learning_rate": 1.5979174918798447e-05, + "loss": 0.4292, + "step": 5096 + }, + { + "epoch": 0.6202616367508367, + "grad_norm": 1.1813677549362183, + "learning_rate": 1.597761121135842e-05, + "loss": 0.458, + "step": 5097 + }, + { + "epoch": 0.6203833282628537, + "grad_norm": 0.6601487398147583, + "learning_rate": 1.5976047276457916e-05, + "loss": 0.474, + "step": 5098 + }, + { + "epoch": 0.6205050197748707, + "grad_norm": 0.5981538891792297, + "learning_rate": 1.5974483114156447e-05, + "loss": 0.4616, + "step": 5099 + }, + { + "epoch": 0.6206267112868877, + "grad_norm": 1.1330775022506714, + "learning_rate": 1.5972918724513524e-05, + "loss": 0.4499, + "step": 5100 + }, + { + "epoch": 0.6207484027989048, + "grad_norm": 2.344569444656372, + "learning_rate": 1.5971354107588687e-05, + "loss": 0.4656, + "step": 5101 + }, + { + "epoch": 0.6208700943109218, + "grad_norm": 1.7827597856521606, + "learning_rate": 1.596978926344146e-05, + "loss": 0.5002, + "step": 5102 + }, + { + "epoch": 0.6209917858229389, + "grad_norm": 0.9423059821128845, + "learning_rate": 1.59682241921314e-05, + "loss": 0.455, + "step": 5103 + }, + { + "epoch": 0.6211134773349559, + "grad_norm": 0.8463561534881592, + "learning_rate": 1.596665889371805e-05, + "loss": 0.4902, + "step": 5104 + }, + { + "epoch": 0.6212351688469729, + "grad_norm": 0.6051124930381775, + "learning_rate": 1.596509336826098e-05, + "loss": 0.4697, + "step": 5105 + }, + { + "epoch": 0.62135686035899, + "grad_norm": 0.9681133031845093, + "learning_rate": 1.5963527615819764e-05, + "loss": 0.4791, + "step": 5106 + }, + { + "epoch": 0.621478551871007, + "grad_norm": 1.2018495798110962, + "learning_rate": 1.5961961636453974e-05, + "loss": 0.4505, + "step": 5107 + }, + { + "epoch": 0.621600243383024, + "grad_norm": 3.915792465209961, + "learning_rate": 1.5960395430223206e-05, + "loss": 0.4162, + "step": 5108 + }, + { + "epoch": 0.621721934895041, + "grad_norm": 1.6480474472045898, + "learning_rate": 1.5958828997187054e-05, + "loss": 0.4328, + "step": 5109 + }, + { + "epoch": 0.6218436264070581, + "grad_norm": 1.29371976852417, + "learning_rate": 1.5957262337405125e-05, + "loss": 0.4462, + "step": 5110 + }, + { + "epoch": 0.6219653179190752, + "grad_norm": 2.5986709594726562, + "learning_rate": 1.595569545093703e-05, + "loss": 0.4659, + "step": 5111 + }, + { + "epoch": 0.6220870094310922, + "grad_norm": 4.487428665161133, + "learning_rate": 1.5954128337842398e-05, + "loss": 0.5351, + "step": 5112 + }, + { + "epoch": 0.6222087009431092, + "grad_norm": 1.3018454313278198, + "learning_rate": 1.5952560998180858e-05, + "loss": 0.4611, + "step": 5113 + }, + { + "epoch": 0.6223303924551262, + "grad_norm": 1.7453690767288208, + "learning_rate": 1.5950993432012053e-05, + "loss": 0.4462, + "step": 5114 + }, + { + "epoch": 0.6224520839671432, + "grad_norm": 0.8891845345497131, + "learning_rate": 1.594942563939563e-05, + "loss": 0.4487, + "step": 5115 + }, + { + "epoch": 0.6225737754791604, + "grad_norm": 3.4997634887695312, + "learning_rate": 1.5947857620391243e-05, + "loss": 0.5302, + "step": 5116 + }, + { + "epoch": 0.6226954669911774, + "grad_norm": 0.6400924921035767, + "learning_rate": 1.5946289375058562e-05, + "loss": 0.4907, + "step": 5117 + }, + { + "epoch": 0.6228171585031944, + "grad_norm": 1.9319634437561035, + "learning_rate": 1.5944720903457266e-05, + "loss": 0.4963, + "step": 5118 + }, + { + "epoch": 0.6229388500152114, + "grad_norm": 3.117453098297119, + "learning_rate": 1.5943152205647035e-05, + "loss": 0.4496, + "step": 5119 + }, + { + "epoch": 0.6230605415272285, + "grad_norm": 1.8318192958831787, + "learning_rate": 1.594158328168756e-05, + "loss": 0.4525, + "step": 5120 + }, + { + "epoch": 0.6231822330392455, + "grad_norm": 0.903439462184906, + "learning_rate": 1.594001413163854e-05, + "loss": 0.4767, + "step": 5121 + }, + { + "epoch": 0.6233039245512626, + "grad_norm": 1.5297845602035522, + "learning_rate": 1.5938444755559688e-05, + "loss": 0.479, + "step": 5122 + }, + { + "epoch": 0.6234256160632796, + "grad_norm": 0.9639740586280823, + "learning_rate": 1.5936875153510723e-05, + "loss": 0.4581, + "step": 5123 + }, + { + "epoch": 0.6235473075752966, + "grad_norm": 2.8778951168060303, + "learning_rate": 1.5935305325551367e-05, + "loss": 0.4404, + "step": 5124 + }, + { + "epoch": 0.6236689990873137, + "grad_norm": 0.8338596224784851, + "learning_rate": 1.593373527174136e-05, + "loss": 0.5022, + "step": 5125 + }, + { + "epoch": 0.6237906905993307, + "grad_norm": 0.7757508754730225, + "learning_rate": 1.5932164992140443e-05, + "loss": 0.5042, + "step": 5126 + }, + { + "epoch": 0.6239123821113477, + "grad_norm": 0.849368155002594, + "learning_rate": 1.593059448680837e-05, + "loss": 0.4919, + "step": 5127 + }, + { + "epoch": 0.6240340736233647, + "grad_norm": 1.5418044328689575, + "learning_rate": 1.59290237558049e-05, + "loss": 0.4748, + "step": 5128 + }, + { + "epoch": 0.6241557651353818, + "grad_norm": 0.7975398898124695, + "learning_rate": 1.5927452799189804e-05, + "loss": 0.5243, + "step": 5129 + }, + { + "epoch": 0.6242774566473989, + "grad_norm": 3.799165964126587, + "learning_rate": 1.5925881617022862e-05, + "loss": 0.4201, + "step": 5130 + }, + { + "epoch": 0.6243991481594159, + "grad_norm": 2.2506279945373535, + "learning_rate": 1.5924310209363854e-05, + "loss": 0.432, + "step": 5131 + }, + { + "epoch": 0.6245208396714329, + "grad_norm": 0.8625126481056213, + "learning_rate": 1.5922738576272584e-05, + "loss": 0.424, + "step": 5132 + }, + { + "epoch": 0.6246425311834499, + "grad_norm": 0.9615429639816284, + "learning_rate": 1.592116671780885e-05, + "loss": 0.4782, + "step": 5133 + }, + { + "epoch": 0.6247642226954669, + "grad_norm": 0.7857763171195984, + "learning_rate": 1.5919594634032468e-05, + "loss": 0.4736, + "step": 5134 + }, + { + "epoch": 0.6248859142074841, + "grad_norm": 3.0265188217163086, + "learning_rate": 1.5918022325003258e-05, + "loss": 0.5136, + "step": 5135 + }, + { + "epoch": 0.6250076057195011, + "grad_norm": 0.7972736358642578, + "learning_rate": 1.5916449790781045e-05, + "loss": 0.4418, + "step": 5136 + }, + { + "epoch": 0.6251292972315181, + "grad_norm": 2.0893096923828125, + "learning_rate": 1.5914877031425674e-05, + "loss": 0.4794, + "step": 5137 + }, + { + "epoch": 0.6252509887435351, + "grad_norm": 1.120312213897705, + "learning_rate": 1.5913304046996986e-05, + "loss": 0.4673, + "step": 5138 + }, + { + "epoch": 0.6253726802555521, + "grad_norm": 1.1246203184127808, + "learning_rate": 1.5911730837554843e-05, + "loss": 0.4067, + "step": 5139 + }, + { + "epoch": 0.6254943717675692, + "grad_norm": 2.8888893127441406, + "learning_rate": 1.5910157403159102e-05, + "loss": 0.4083, + "step": 5140 + }, + { + "epoch": 0.6256160632795863, + "grad_norm": 2.039015769958496, + "learning_rate": 1.590858374386964e-05, + "loss": 0.4315, + "step": 5141 + }, + { + "epoch": 0.6257377547916033, + "grad_norm": 1.864123821258545, + "learning_rate": 1.5907009859746336e-05, + "loss": 0.4771, + "step": 5142 + }, + { + "epoch": 0.6258594463036203, + "grad_norm": 2.2958595752716064, + "learning_rate": 1.590543575084908e-05, + "loss": 0.4834, + "step": 5143 + }, + { + "epoch": 0.6259811378156374, + "grad_norm": 0.6923889517784119, + "learning_rate": 1.590386141723777e-05, + "loss": 0.4716, + "step": 5144 + }, + { + "epoch": 0.6261028293276544, + "grad_norm": 0.6824405789375305, + "learning_rate": 1.590228685897231e-05, + "loss": 0.4586, + "step": 5145 + }, + { + "epoch": 0.6262245208396714, + "grad_norm": 0.7675740122795105, + "learning_rate": 1.590071207611262e-05, + "loss": 0.4556, + "step": 5146 + }, + { + "epoch": 0.6263462123516885, + "grad_norm": 0.6634732484817505, + "learning_rate": 1.5899137068718624e-05, + "loss": 0.4344, + "step": 5147 + }, + { + "epoch": 0.6264679038637055, + "grad_norm": 0.9993885159492493, + "learning_rate": 1.5897561836850254e-05, + "loss": 0.4542, + "step": 5148 + }, + { + "epoch": 0.6265895953757226, + "grad_norm": 1.7292815446853638, + "learning_rate": 1.5895986380567444e-05, + "loss": 0.431, + "step": 5149 + }, + { + "epoch": 0.6267112868877396, + "grad_norm": 2.644493579864502, + "learning_rate": 1.589441069993015e-05, + "loss": 0.5238, + "step": 5150 + }, + { + "epoch": 0.6268329783997566, + "grad_norm": 2.9488461017608643, + "learning_rate": 1.5892834794998325e-05, + "loss": 0.3738, + "step": 5151 + }, + { + "epoch": 0.6269546699117736, + "grad_norm": 1.1818256378173828, + "learning_rate": 1.5891258665831942e-05, + "loss": 0.4326, + "step": 5152 + }, + { + "epoch": 0.6270763614237906, + "grad_norm": 0.8681172132492065, + "learning_rate": 1.588968231249097e-05, + "loss": 0.427, + "step": 5153 + }, + { + "epoch": 0.6271980529358078, + "grad_norm": 1.333761215209961, + "learning_rate": 1.58881057350354e-05, + "loss": 0.4909, + "step": 5154 + }, + { + "epoch": 0.6273197444478248, + "grad_norm": 1.2859219312667847, + "learning_rate": 1.5886528933525214e-05, + "loss": 0.394, + "step": 5155 + }, + { + "epoch": 0.6274414359598418, + "grad_norm": 0.5924080610275269, + "learning_rate": 1.5884951908020418e-05, + "loss": 0.4341, + "step": 5156 + }, + { + "epoch": 0.6275631274718588, + "grad_norm": 4.823788166046143, + "learning_rate": 1.5883374658581022e-05, + "loss": 0.5689, + "step": 5157 + }, + { + "epoch": 0.6276848189838758, + "grad_norm": 1.682992935180664, + "learning_rate": 1.5881797185267044e-05, + "loss": 0.4708, + "step": 5158 + }, + { + "epoch": 0.6278065104958929, + "grad_norm": 0.7032126784324646, + "learning_rate": 1.5880219488138507e-05, + "loss": 0.4509, + "step": 5159 + }, + { + "epoch": 0.62792820200791, + "grad_norm": 1.5909678936004639, + "learning_rate": 1.5878641567255447e-05, + "loss": 0.4401, + "step": 5160 + }, + { + "epoch": 0.628049893519927, + "grad_norm": 1.1590338945388794, + "learning_rate": 1.5877063422677904e-05, + "loss": 0.4821, + "step": 5161 + }, + { + "epoch": 0.628171585031944, + "grad_norm": 0.644655704498291, + "learning_rate": 1.587548505446594e-05, + "loss": 0.4964, + "step": 5162 + }, + { + "epoch": 0.628293276543961, + "grad_norm": 0.7603266835212708, + "learning_rate": 1.58739064626796e-05, + "loss": 0.4482, + "step": 5163 + }, + { + "epoch": 0.6284149680559781, + "grad_norm": 1.9220455884933472, + "learning_rate": 1.5872327647378968e-05, + "loss": 0.4739, + "step": 5164 + }, + { + "epoch": 0.6285366595679951, + "grad_norm": 2.9807608127593994, + "learning_rate": 1.587074860862411e-05, + "loss": 0.4803, + "step": 5165 + }, + { + "epoch": 0.6286583510800122, + "grad_norm": 1.0503041744232178, + "learning_rate": 1.5869169346475116e-05, + "loss": 0.4631, + "step": 5166 + }, + { + "epoch": 0.6287800425920292, + "grad_norm": 1.2424944639205933, + "learning_rate": 1.586758986099208e-05, + "loss": 0.4956, + "step": 5167 + }, + { + "epoch": 0.6289017341040463, + "grad_norm": 0.8208186626434326, + "learning_rate": 1.5866010152235105e-05, + "loss": 0.452, + "step": 5168 + }, + { + "epoch": 0.6290234256160633, + "grad_norm": 0.6422441601753235, + "learning_rate": 1.5864430220264303e-05, + "loss": 0.4307, + "step": 5169 + }, + { + "epoch": 0.6291451171280803, + "grad_norm": 1.340247631072998, + "learning_rate": 1.5862850065139788e-05, + "loss": 0.4408, + "step": 5170 + }, + { + "epoch": 0.6292668086400973, + "grad_norm": 3.4430768489837646, + "learning_rate": 1.58612696869217e-05, + "loss": 0.5098, + "step": 5171 + }, + { + "epoch": 0.6293885001521143, + "grad_norm": 1.6935397386550903, + "learning_rate": 1.5859689085670162e-05, + "loss": 0.49, + "step": 5172 + }, + { + "epoch": 0.6295101916641315, + "grad_norm": 3.508847951889038, + "learning_rate": 1.585810826144533e-05, + "loss": 0.5803, + "step": 5173 + }, + { + "epoch": 0.6296318831761485, + "grad_norm": 2.1378252506256104, + "learning_rate": 1.585652721430735e-05, + "loss": 0.4301, + "step": 5174 + }, + { + "epoch": 0.6297535746881655, + "grad_norm": 0.6643288731575012, + "learning_rate": 1.5854945944316385e-05, + "loss": 0.4668, + "step": 5175 + }, + { + "epoch": 0.6298752662001825, + "grad_norm": 0.8753386735916138, + "learning_rate": 1.5853364451532608e-05, + "loss": 0.5287, + "step": 5176 + }, + { + "epoch": 0.6299969577121995, + "grad_norm": 1.686498999595642, + "learning_rate": 1.5851782736016205e-05, + "loss": 0.4932, + "step": 5177 + }, + { + "epoch": 0.6301186492242166, + "grad_norm": 1.5822488069534302, + "learning_rate": 1.585020079782735e-05, + "loss": 0.5047, + "step": 5178 + }, + { + "epoch": 0.6302403407362337, + "grad_norm": 1.9763538837432861, + "learning_rate": 1.5848618637026248e-05, + "loss": 0.5038, + "step": 5179 + }, + { + "epoch": 0.6303620322482507, + "grad_norm": 3.5097556114196777, + "learning_rate": 1.5847036253673097e-05, + "loss": 0.4345, + "step": 5180 + }, + { + "epoch": 0.6304837237602677, + "grad_norm": 1.1224864721298218, + "learning_rate": 1.584545364782812e-05, + "loss": 0.4911, + "step": 5181 + }, + { + "epoch": 0.6306054152722848, + "grad_norm": 0.9945358633995056, + "learning_rate": 1.5843870819551526e-05, + "loss": 0.4711, + "step": 5182 + }, + { + "epoch": 0.6307271067843018, + "grad_norm": 2.5181162357330322, + "learning_rate": 1.5842287768903553e-05, + "loss": 0.429, + "step": 5183 + }, + { + "epoch": 0.6308487982963188, + "grad_norm": 0.7400104403495789, + "learning_rate": 1.584070449594444e-05, + "loss": 0.4913, + "step": 5184 + }, + { + "epoch": 0.6309704898083359, + "grad_norm": 2.210681438446045, + "learning_rate": 1.583912100073443e-05, + "loss": 0.4597, + "step": 5185 + }, + { + "epoch": 0.6310921813203529, + "grad_norm": 4.362020492553711, + "learning_rate": 1.5837537283333778e-05, + "loss": 0.5229, + "step": 5186 + }, + { + "epoch": 0.63121387283237, + "grad_norm": 1.9521353244781494, + "learning_rate": 1.5835953343802752e-05, + "loss": 0.4793, + "step": 5187 + }, + { + "epoch": 0.631335564344387, + "grad_norm": 1.97042977809906, + "learning_rate": 1.5834369182201622e-05, + "loss": 0.5017, + "step": 5188 + }, + { + "epoch": 0.631457255856404, + "grad_norm": 1.4353485107421875, + "learning_rate": 1.5832784798590667e-05, + "loss": 0.4165, + "step": 5189 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.9788597822189331, + "learning_rate": 1.5831200193030178e-05, + "loss": 0.5053, + "step": 5190 + }, + { + "epoch": 0.631700638880438, + "grad_norm": 2.2736315727233887, + "learning_rate": 1.582961536558045e-05, + "loss": 0.526, + "step": 5191 + }, + { + "epoch": 0.6318223303924552, + "grad_norm": 2.653142213821411, + "learning_rate": 1.5828030316301788e-05, + "loss": 0.4812, + "step": 5192 + }, + { + "epoch": 0.6319440219044722, + "grad_norm": 3.02508282661438, + "learning_rate": 1.5826445045254512e-05, + "loss": 0.4367, + "step": 5193 + }, + { + "epoch": 0.6320657134164892, + "grad_norm": 2.291142463684082, + "learning_rate": 1.5824859552498944e-05, + "loss": 0.4363, + "step": 5194 + }, + { + "epoch": 0.6321874049285062, + "grad_norm": 2.344771385192871, + "learning_rate": 1.582327383809541e-05, + "loss": 0.4709, + "step": 5195 + }, + { + "epoch": 0.6323090964405232, + "grad_norm": 1.9913361072540283, + "learning_rate": 1.5821687902104254e-05, + "loss": 0.426, + "step": 5196 + }, + { + "epoch": 0.6324307879525403, + "grad_norm": 1.2045221328735352, + "learning_rate": 1.5820101744585824e-05, + "loss": 0.4792, + "step": 5197 + }, + { + "epoch": 0.6325524794645574, + "grad_norm": 0.5647122263908386, + "learning_rate": 1.5818515365600476e-05, + "loss": 0.4754, + "step": 5198 + }, + { + "epoch": 0.6326741709765744, + "grad_norm": 0.9412499666213989, + "learning_rate": 1.5816928765208573e-05, + "loss": 0.4264, + "step": 5199 + }, + { + "epoch": 0.6327958624885914, + "grad_norm": 0.7304482460021973, + "learning_rate": 1.5815341943470494e-05, + "loss": 0.4226, + "step": 5200 + }, + { + "epoch": 0.6329175540006085, + "grad_norm": 4.112847328186035, + "learning_rate": 1.5813754900446613e-05, + "loss": 0.5466, + "step": 5201 + }, + { + "epoch": 0.6330392455126255, + "grad_norm": 3.4612157344818115, + "learning_rate": 1.5812167636197323e-05, + "loss": 0.4729, + "step": 5202 + }, + { + "epoch": 0.6331609370246425, + "grad_norm": 2.4185543060302734, + "learning_rate": 1.5810580150783024e-05, + "loss": 0.4932, + "step": 5203 + }, + { + "epoch": 0.6332826285366596, + "grad_norm": 2.4247961044311523, + "learning_rate": 1.5808992444264128e-05, + "loss": 0.4842, + "step": 5204 + }, + { + "epoch": 0.6334043200486766, + "grad_norm": 1.3789006471633911, + "learning_rate": 1.5807404516701043e-05, + "loss": 0.5269, + "step": 5205 + }, + { + "epoch": 0.6335260115606937, + "grad_norm": 4.834405899047852, + "learning_rate": 1.580581636815419e-05, + "loss": 0.4216, + "step": 5206 + }, + { + "epoch": 0.6336477030727107, + "grad_norm": 2.637833595275879, + "learning_rate": 1.5804227998684013e-05, + "loss": 0.4934, + "step": 5207 + }, + { + "epoch": 0.6337693945847277, + "grad_norm": 5.018129348754883, + "learning_rate": 1.580263940835095e-05, + "loss": 0.4937, + "step": 5208 + }, + { + "epoch": 0.6338910860967447, + "grad_norm": 2.8750083446502686, + "learning_rate": 1.580105059721544e-05, + "loss": 0.4857, + "step": 5209 + }, + { + "epoch": 0.6340127776087617, + "grad_norm": 3.6261367797851562, + "learning_rate": 1.579946156533795e-05, + "loss": 0.4281, + "step": 5210 + }, + { + "epoch": 0.6341344691207789, + "grad_norm": 3.6124765872955322, + "learning_rate": 1.5797872312778944e-05, + "loss": 0.4266, + "step": 5211 + }, + { + "epoch": 0.6342561606327959, + "grad_norm": 2.5368010997772217, + "learning_rate": 1.5796282839598892e-05, + "loss": 0.4392, + "step": 5212 + }, + { + "epoch": 0.6343778521448129, + "grad_norm": 1.7947090864181519, + "learning_rate": 1.579469314585828e-05, + "loss": 0.5077, + "step": 5213 + }, + { + "epoch": 0.6344995436568299, + "grad_norm": 0.8332177996635437, + "learning_rate": 1.5793103231617603e-05, + "loss": 0.4674, + "step": 5214 + }, + { + "epoch": 0.6346212351688469, + "grad_norm": 0.6878183484077454, + "learning_rate": 1.5791513096937356e-05, + "loss": 0.4312, + "step": 5215 + }, + { + "epoch": 0.634742926680864, + "grad_norm": 1.2983524799346924, + "learning_rate": 1.578992274187805e-05, + "loss": 0.4759, + "step": 5216 + }, + { + "epoch": 0.6348646181928811, + "grad_norm": 1.0577523708343506, + "learning_rate": 1.5788332166500196e-05, + "loss": 0.4247, + "step": 5217 + }, + { + "epoch": 0.6349863097048981, + "grad_norm": 1.254083275794983, + "learning_rate": 1.578674137086432e-05, + "loss": 0.4704, + "step": 5218 + }, + { + "epoch": 0.6351080012169151, + "grad_norm": 1.862012267112732, + "learning_rate": 1.578515035503096e-05, + "loss": 0.5051, + "step": 5219 + }, + { + "epoch": 0.6352296927289321, + "grad_norm": 1.180809497833252, + "learning_rate": 1.5783559119060656e-05, + "loss": 0.5547, + "step": 5220 + }, + { + "epoch": 0.6353513842409492, + "grad_norm": 1.7578306198120117, + "learning_rate": 1.5781967663013954e-05, + "loss": 0.4508, + "step": 5221 + }, + { + "epoch": 0.6354730757529662, + "grad_norm": 1.8263388872146606, + "learning_rate": 1.5780375986951417e-05, + "loss": 0.4601, + "step": 5222 + }, + { + "epoch": 0.6355947672649833, + "grad_norm": 4.632490158081055, + "learning_rate": 1.5778784090933607e-05, + "loss": 0.4253, + "step": 5223 + }, + { + "epoch": 0.6357164587770003, + "grad_norm": 0.77925044298172, + "learning_rate": 1.57771919750211e-05, + "loss": 0.5082, + "step": 5224 + }, + { + "epoch": 0.6358381502890174, + "grad_norm": 3.3072025775909424, + "learning_rate": 1.577559963927448e-05, + "loss": 0.4455, + "step": 5225 + }, + { + "epoch": 0.6359598418010344, + "grad_norm": 0.5746170282363892, + "learning_rate": 1.5774007083754338e-05, + "loss": 0.5056, + "step": 5226 + }, + { + "epoch": 0.6360815333130514, + "grad_norm": 1.3007668256759644, + "learning_rate": 1.5772414308521278e-05, + "loss": 0.503, + "step": 5227 + }, + { + "epoch": 0.6362032248250684, + "grad_norm": 0.714690625667572, + "learning_rate": 1.5770821313635906e-05, + "loss": 0.5201, + "step": 5228 + }, + { + "epoch": 0.6363249163370854, + "grad_norm": 1.153825283050537, + "learning_rate": 1.5769228099158835e-05, + "loss": 0.4634, + "step": 5229 + }, + { + "epoch": 0.6364466078491026, + "grad_norm": 0.6355719566345215, + "learning_rate": 1.5767634665150692e-05, + "loss": 0.4479, + "step": 5230 + }, + { + "epoch": 0.6365682993611196, + "grad_norm": 1.5121638774871826, + "learning_rate": 1.5766041011672114e-05, + "loss": 0.5044, + "step": 5231 + }, + { + "epoch": 0.6366899908731366, + "grad_norm": 1.667244553565979, + "learning_rate": 1.576444713878374e-05, + "loss": 0.3769, + "step": 5232 + }, + { + "epoch": 0.6368116823851536, + "grad_norm": 2.893745183944702, + "learning_rate": 1.576285304654622e-05, + "loss": 0.4857, + "step": 5233 + }, + { + "epoch": 0.6369333738971706, + "grad_norm": 1.582780122756958, + "learning_rate": 1.576125873502022e-05, + "loss": 0.458, + "step": 5234 + }, + { + "epoch": 0.6370550654091877, + "grad_norm": 2.3318421840667725, + "learning_rate": 1.5759664204266392e-05, + "loss": 0.5, + "step": 5235 + }, + { + "epoch": 0.6371767569212048, + "grad_norm": 2.5021045207977295, + "learning_rate": 1.575806945434542e-05, + "loss": 0.5116, + "step": 5236 + }, + { + "epoch": 0.6372984484332218, + "grad_norm": 0.8516119122505188, + "learning_rate": 1.5756474485317986e-05, + "loss": 0.4837, + "step": 5237 + }, + { + "epoch": 0.6374201399452388, + "grad_norm": 3.058852434158325, + "learning_rate": 1.5754879297244786e-05, + "loss": 0.4431, + "step": 5238 + }, + { + "epoch": 0.6375418314572558, + "grad_norm": 0.5639399290084839, + "learning_rate": 1.5753283890186515e-05, + "loss": 0.4668, + "step": 5239 + }, + { + "epoch": 0.6376635229692729, + "grad_norm": 0.5681480169296265, + "learning_rate": 1.5751688264203885e-05, + "loss": 0.4879, + "step": 5240 + }, + { + "epoch": 0.6377852144812899, + "grad_norm": 1.5465158224105835, + "learning_rate": 1.575009241935761e-05, + "loss": 0.4864, + "step": 5241 + }, + { + "epoch": 0.637906905993307, + "grad_norm": 2.9557650089263916, + "learning_rate": 1.5748496355708416e-05, + "loss": 0.4475, + "step": 5242 + }, + { + "epoch": 0.638028597505324, + "grad_norm": 2.6665782928466797, + "learning_rate": 1.5746900073317037e-05, + "loss": 0.4435, + "step": 5243 + }, + { + "epoch": 0.638150289017341, + "grad_norm": 1.3420488834381104, + "learning_rate": 1.5745303572244215e-05, + "loss": 0.5205, + "step": 5244 + }, + { + "epoch": 0.6382719805293581, + "grad_norm": 3.566497325897217, + "learning_rate": 1.57437068525507e-05, + "loss": 0.5772, + "step": 5245 + }, + { + "epoch": 0.6383936720413751, + "grad_norm": 0.9798552393913269, + "learning_rate": 1.574210991429725e-05, + "loss": 0.4787, + "step": 5246 + }, + { + "epoch": 0.6385153635533921, + "grad_norm": 2.5602974891662598, + "learning_rate": 1.5740512757544634e-05, + "loss": 0.4915, + "step": 5247 + }, + { + "epoch": 0.6386370550654092, + "grad_norm": 3.6115434169769287, + "learning_rate": 1.5738915382353624e-05, + "loss": 0.4268, + "step": 5248 + }, + { + "epoch": 0.6387587465774263, + "grad_norm": 1.4270596504211426, + "learning_rate": 1.5737317788785e-05, + "loss": 0.4992, + "step": 5249 + }, + { + "epoch": 0.6388804380894433, + "grad_norm": 0.8352439999580383, + "learning_rate": 1.5735719976899566e-05, + "loss": 0.4929, + "step": 5250 + }, + { + "epoch": 0.6390021296014603, + "grad_norm": 4.70586633682251, + "learning_rate": 1.5734121946758114e-05, + "loss": 0.4813, + "step": 5251 + }, + { + "epoch": 0.6391238211134773, + "grad_norm": 1.007424235343933, + "learning_rate": 1.5732523698421452e-05, + "loss": 0.5212, + "step": 5252 + }, + { + "epoch": 0.6392455126254943, + "grad_norm": 3.383769989013672, + "learning_rate": 1.5730925231950395e-05, + "loss": 0.4723, + "step": 5253 + }, + { + "epoch": 0.6393672041375114, + "grad_norm": 2.987266778945923, + "learning_rate": 1.5729326547405772e-05, + "loss": 0.4829, + "step": 5254 + }, + { + "epoch": 0.6394888956495285, + "grad_norm": 0.930807888507843, + "learning_rate": 1.5727727644848415e-05, + "loss": 0.5075, + "step": 5255 + }, + { + "epoch": 0.6396105871615455, + "grad_norm": 1.6078609228134155, + "learning_rate": 1.5726128524339163e-05, + "loss": 0.5098, + "step": 5256 + }, + { + "epoch": 0.6397322786735625, + "grad_norm": 0.6467624306678772, + "learning_rate": 1.572452918593887e-05, + "loss": 0.4819, + "step": 5257 + }, + { + "epoch": 0.6398539701855795, + "grad_norm": 1.394906759262085, + "learning_rate": 1.5722929629708397e-05, + "loss": 0.496, + "step": 5258 + }, + { + "epoch": 0.6399756616975966, + "grad_norm": 1.2083563804626465, + "learning_rate": 1.57213298557086e-05, + "loss": 0.4524, + "step": 5259 + }, + { + "epoch": 0.6400973532096136, + "grad_norm": 1.187841534614563, + "learning_rate": 1.5719729864000363e-05, + "loss": 0.4273, + "step": 5260 + }, + { + "epoch": 0.6402190447216307, + "grad_norm": 1.1763041019439697, + "learning_rate": 1.5718129654644562e-05, + "loss": 0.4677, + "step": 5261 + }, + { + "epoch": 0.6403407362336477, + "grad_norm": 1.6368167400360107, + "learning_rate": 1.571652922770209e-05, + "loss": 0.4532, + "step": 5262 + }, + { + "epoch": 0.6404624277456648, + "grad_norm": 1.949811577796936, + "learning_rate": 1.5714928583233854e-05, + "loss": 0.4179, + "step": 5263 + }, + { + "epoch": 0.6405841192576818, + "grad_norm": 1.9312878847122192, + "learning_rate": 1.5713327721300753e-05, + "loss": 0.4226, + "step": 5264 + }, + { + "epoch": 0.6407058107696988, + "grad_norm": 1.3562158346176147, + "learning_rate": 1.5711726641963708e-05, + "loss": 0.4908, + "step": 5265 + }, + { + "epoch": 0.6408275022817158, + "grad_norm": 1.574460506439209, + "learning_rate": 1.571012534528364e-05, + "loss": 0.472, + "step": 5266 + }, + { + "epoch": 0.6409491937937329, + "grad_norm": 2.8433024883270264, + "learning_rate": 1.5708523831321482e-05, + "loss": 0.5206, + "step": 5267 + }, + { + "epoch": 0.64107088530575, + "grad_norm": 1.0959405899047852, + "learning_rate": 1.570692210013818e-05, + "loss": 0.4416, + "step": 5268 + }, + { + "epoch": 0.641192576817767, + "grad_norm": 1.6597760915756226, + "learning_rate": 1.570532015179468e-05, + "loss": 0.4574, + "step": 5269 + }, + { + "epoch": 0.641314268329784, + "grad_norm": 2.3682327270507812, + "learning_rate": 1.5703717986351934e-05, + "loss": 0.5196, + "step": 5270 + }, + { + "epoch": 0.641435959841801, + "grad_norm": 1.7758485078811646, + "learning_rate": 1.5702115603870914e-05, + "loss": 0.502, + "step": 5271 + }, + { + "epoch": 0.641557651353818, + "grad_norm": 1.2930625677108765, + "learning_rate": 1.5700513004412593e-05, + "loss": 0.4532, + "step": 5272 + }, + { + "epoch": 0.6416793428658351, + "grad_norm": 1.7209218740463257, + "learning_rate": 1.5698910188037954e-05, + "loss": 0.4361, + "step": 5273 + }, + { + "epoch": 0.6418010343778522, + "grad_norm": 0.6990587711334229, + "learning_rate": 1.569730715480799e-05, + "loss": 0.4443, + "step": 5274 + }, + { + "epoch": 0.6419227258898692, + "grad_norm": 1.332281231880188, + "learning_rate": 1.569570390478369e-05, + "loss": 0.5104, + "step": 5275 + }, + { + "epoch": 0.6420444174018862, + "grad_norm": 0.722692608833313, + "learning_rate": 1.5694100438026066e-05, + "loss": 0.439, + "step": 5276 + }, + { + "epoch": 0.6421661089139032, + "grad_norm": 0.8775246739387512, + "learning_rate": 1.569249675459614e-05, + "loss": 0.5065, + "step": 5277 + }, + { + "epoch": 0.6422878004259203, + "grad_norm": 0.7255299687385559, + "learning_rate": 1.5690892854554926e-05, + "loss": 0.4966, + "step": 5278 + }, + { + "epoch": 0.6424094919379373, + "grad_norm": 0.5717247724533081, + "learning_rate": 1.568928873796346e-05, + "loss": 0.4658, + "step": 5279 + }, + { + "epoch": 0.6425311834499544, + "grad_norm": 2.2494089603424072, + "learning_rate": 1.568768440488278e-05, + "loss": 0.4658, + "step": 5280 + }, + { + "epoch": 0.6426528749619714, + "grad_norm": 1.175611138343811, + "learning_rate": 1.5686079855373936e-05, + "loss": 0.4917, + "step": 5281 + }, + { + "epoch": 0.6427745664739885, + "grad_norm": 1.840100884437561, + "learning_rate": 1.5684475089497983e-05, + "loss": 0.4713, + "step": 5282 + }, + { + "epoch": 0.6428962579860055, + "grad_norm": 0.8531798720359802, + "learning_rate": 1.568287010731599e-05, + "loss": 0.4593, + "step": 5283 + }, + { + "epoch": 0.6430179494980225, + "grad_norm": 1.5386767387390137, + "learning_rate": 1.5681264908889026e-05, + "loss": 0.4133, + "step": 5284 + }, + { + "epoch": 0.6431396410100395, + "grad_norm": 1.202364444732666, + "learning_rate": 1.567965949427817e-05, + "loss": 0.5303, + "step": 5285 + }, + { + "epoch": 0.6432613325220566, + "grad_norm": 2.0715887546539307, + "learning_rate": 1.5678053863544516e-05, + "loss": 0.5159, + "step": 5286 + }, + { + "epoch": 0.6433830240340737, + "grad_norm": 0.6678662300109863, + "learning_rate": 1.567644801674916e-05, + "loss": 0.4942, + "step": 5287 + }, + { + "epoch": 0.6435047155460907, + "grad_norm": 3.980654239654541, + "learning_rate": 1.5674841953953205e-05, + "loss": 0.5529, + "step": 5288 + }, + { + "epoch": 0.6436264070581077, + "grad_norm": 1.2205555438995361, + "learning_rate": 1.567323567521777e-05, + "loss": 0.4752, + "step": 5289 + }, + { + "epoch": 0.6437480985701247, + "grad_norm": 2.3752331733703613, + "learning_rate": 1.5671629180603972e-05, + "loss": 0.4849, + "step": 5290 + }, + { + "epoch": 0.6438697900821417, + "grad_norm": 3.106736660003662, + "learning_rate": 1.5670022470172947e-05, + "loss": 0.485, + "step": 5291 + }, + { + "epoch": 0.6439914815941588, + "grad_norm": 1.166581153869629, + "learning_rate": 1.5668415543985828e-05, + "loss": 0.5282, + "step": 5292 + }, + { + "epoch": 0.6441131731061759, + "grad_norm": 2.677962303161621, + "learning_rate": 1.5666808402103764e-05, + "loss": 0.4469, + "step": 5293 + }, + { + "epoch": 0.6442348646181929, + "grad_norm": 4.46078634262085, + "learning_rate": 1.5665201044587912e-05, + "loss": 0.3993, + "step": 5294 + }, + { + "epoch": 0.6443565561302099, + "grad_norm": 1.144884705543518, + "learning_rate": 1.5663593471499434e-05, + "loss": 0.4871, + "step": 5295 + }, + { + "epoch": 0.6444782476422269, + "grad_norm": 0.7435542941093445, + "learning_rate": 1.5661985682899503e-05, + "loss": 0.4552, + "step": 5296 + }, + { + "epoch": 0.644599939154244, + "grad_norm": 0.851441502571106, + "learning_rate": 1.5660377678849298e-05, + "loss": 0.4103, + "step": 5297 + }, + { + "epoch": 0.644721630666261, + "grad_norm": 2.650285482406616, + "learning_rate": 1.565876945941e-05, + "loss": 0.4894, + "step": 5298 + }, + { + "epoch": 0.6448433221782781, + "grad_norm": 1.9493920803070068, + "learning_rate": 1.5657161024642817e-05, + "loss": 0.4432, + "step": 5299 + }, + { + "epoch": 0.6449650136902951, + "grad_norm": 2.567227840423584, + "learning_rate": 1.5655552374608945e-05, + "loss": 0.4976, + "step": 5300 + }, + { + "epoch": 0.6450867052023121, + "grad_norm": 0.7542309761047363, + "learning_rate": 1.56539435093696e-05, + "loss": 0.4631, + "step": 5301 + }, + { + "epoch": 0.6452083967143292, + "grad_norm": 0.6160406470298767, + "learning_rate": 1.5652334428985998e-05, + "loss": 0.443, + "step": 5302 + }, + { + "epoch": 0.6453300882263462, + "grad_norm": 0.9996970891952515, + "learning_rate": 1.5650725133519376e-05, + "loss": 0.4547, + "step": 5303 + }, + { + "epoch": 0.6454517797383632, + "grad_norm": 0.5419983863830566, + "learning_rate": 1.564911562303096e-05, + "loss": 0.4629, + "step": 5304 + }, + { + "epoch": 0.6455734712503803, + "grad_norm": 0.7758529782295227, + "learning_rate": 1.5647505897582006e-05, + "loss": 0.4724, + "step": 5305 + }, + { + "epoch": 0.6456951627623974, + "grad_norm": 3.16382098197937, + "learning_rate": 1.5645895957233765e-05, + "loss": 0.4316, + "step": 5306 + }, + { + "epoch": 0.6458168542744144, + "grad_norm": 0.7268728613853455, + "learning_rate": 1.5644285802047493e-05, + "loss": 0.4823, + "step": 5307 + }, + { + "epoch": 0.6459385457864314, + "grad_norm": 2.1866092681884766, + "learning_rate": 1.5642675432084463e-05, + "loss": 0.4537, + "step": 5308 + }, + { + "epoch": 0.6460602372984484, + "grad_norm": 2.061885356903076, + "learning_rate": 1.5641064847405957e-05, + "loss": 0.4952, + "step": 5309 + }, + { + "epoch": 0.6461819288104654, + "grad_norm": 1.0812735557556152, + "learning_rate": 1.5639454048073256e-05, + "loss": 0.4955, + "step": 5310 + }, + { + "epoch": 0.6463036203224825, + "grad_norm": 2.22967529296875, + "learning_rate": 1.563784303414765e-05, + "loss": 0.4669, + "step": 5311 + }, + { + "epoch": 0.6464253118344996, + "grad_norm": 0.7430720925331116, + "learning_rate": 1.563623180569045e-05, + "loss": 0.4569, + "step": 5312 + }, + { + "epoch": 0.6465470033465166, + "grad_norm": 3.014439105987549, + "learning_rate": 1.5634620362762963e-05, + "loss": 0.4925, + "step": 5313 + }, + { + "epoch": 0.6466686948585336, + "grad_norm": 2.8883817195892334, + "learning_rate": 1.563300870542651e-05, + "loss": 0.4759, + "step": 5314 + }, + { + "epoch": 0.6467903863705506, + "grad_norm": 1.9737778902053833, + "learning_rate": 1.5631396833742417e-05, + "loss": 0.4736, + "step": 5315 + }, + { + "epoch": 0.6469120778825677, + "grad_norm": 3.116694211959839, + "learning_rate": 1.562978474777202e-05, + "loss": 0.543, + "step": 5316 + }, + { + "epoch": 0.6470337693945847, + "grad_norm": 0.757223904132843, + "learning_rate": 1.5628172447576652e-05, + "loss": 0.4693, + "step": 5317 + }, + { + "epoch": 0.6471554609066018, + "grad_norm": 3.094352960586548, + "learning_rate": 1.562655993321768e-05, + "loss": 0.5112, + "step": 5318 + }, + { + "epoch": 0.6472771524186188, + "grad_norm": 1.4840407371520996, + "learning_rate": 1.5624947204756454e-05, + "loss": 0.4901, + "step": 5319 + }, + { + "epoch": 0.6473988439306358, + "grad_norm": 1.1860746145248413, + "learning_rate": 1.5623334262254343e-05, + "loss": 0.5034, + "step": 5320 + }, + { + "epoch": 0.6475205354426529, + "grad_norm": 1.8971335887908936, + "learning_rate": 1.562172110577272e-05, + "loss": 0.4777, + "step": 5321 + }, + { + "epoch": 0.6476422269546699, + "grad_norm": 2.2858195304870605, + "learning_rate": 1.562010773537298e-05, + "loss": 0.4795, + "step": 5322 + }, + { + "epoch": 0.6477639184666869, + "grad_norm": 2.7201476097106934, + "learning_rate": 1.5618494151116505e-05, + "loss": 0.539, + "step": 5323 + }, + { + "epoch": 0.647885609978704, + "grad_norm": 2.1227800846099854, + "learning_rate": 1.5616880353064696e-05, + "loss": 0.5149, + "step": 5324 + }, + { + "epoch": 0.648007301490721, + "grad_norm": 2.547903299331665, + "learning_rate": 1.5615266341278966e-05, + "loss": 0.4574, + "step": 5325 + }, + { + "epoch": 0.6481289930027381, + "grad_norm": 2.562516689300537, + "learning_rate": 1.561365211582073e-05, + "loss": 0.4475, + "step": 5326 + }, + { + "epoch": 0.6482506845147551, + "grad_norm": 0.7874716520309448, + "learning_rate": 1.5612037676751407e-05, + "loss": 0.483, + "step": 5327 + }, + { + "epoch": 0.6483723760267721, + "grad_norm": 1.3339600563049316, + "learning_rate": 1.5610423024132435e-05, + "loss": 0.4822, + "step": 5328 + }, + { + "epoch": 0.6484940675387891, + "grad_norm": 0.5802419781684875, + "learning_rate": 1.5608808158025256e-05, + "loss": 0.4078, + "step": 5329 + }, + { + "epoch": 0.6486157590508062, + "grad_norm": 1.7999463081359863, + "learning_rate": 1.5607193078491314e-05, + "loss": 0.4811, + "step": 5330 + }, + { + "epoch": 0.6487374505628233, + "grad_norm": 1.8625408411026, + "learning_rate": 1.5605577785592073e-05, + "loss": 0.4349, + "step": 5331 + }, + { + "epoch": 0.6488591420748403, + "grad_norm": 1.455593228340149, + "learning_rate": 1.560396227938899e-05, + "loss": 0.4569, + "step": 5332 + }, + { + "epoch": 0.6489808335868573, + "grad_norm": 1.1102403402328491, + "learning_rate": 1.5602346559943548e-05, + "loss": 0.4034, + "step": 5333 + }, + { + "epoch": 0.6491025250988743, + "grad_norm": 0.7270345091819763, + "learning_rate": 1.560073062731722e-05, + "loss": 0.4186, + "step": 5334 + }, + { + "epoch": 0.6492242166108914, + "grad_norm": 1.4078460931777954, + "learning_rate": 1.5599114481571498e-05, + "loss": 0.4467, + "step": 5335 + }, + { + "epoch": 0.6493459081229084, + "grad_norm": 1.071409821510315, + "learning_rate": 1.5597498122767884e-05, + "loss": 0.473, + "step": 5336 + }, + { + "epoch": 0.6494675996349255, + "grad_norm": 1.9835041761398315, + "learning_rate": 1.5595881550967873e-05, + "loss": 0.4919, + "step": 5337 + }, + { + "epoch": 0.6495892911469425, + "grad_norm": 2.0246047973632812, + "learning_rate": 1.5594264766232993e-05, + "loss": 0.5229, + "step": 5338 + }, + { + "epoch": 0.6497109826589595, + "grad_norm": 0.6315078735351562, + "learning_rate": 1.5592647768624756e-05, + "loss": 0.4514, + "step": 5339 + }, + { + "epoch": 0.6498326741709766, + "grad_norm": 2.390519142150879, + "learning_rate": 1.5591030558204696e-05, + "loss": 0.5189, + "step": 5340 + }, + { + "epoch": 0.6499543656829936, + "grad_norm": 2.800323486328125, + "learning_rate": 1.558941313503435e-05, + "loss": 0.5062, + "step": 5341 + }, + { + "epoch": 0.6500760571950106, + "grad_norm": 4.180983543395996, + "learning_rate": 1.5587795499175265e-05, + "loss": 0.4858, + "step": 5342 + }, + { + "epoch": 0.6501977487070277, + "grad_norm": 3.3033080101013184, + "learning_rate": 1.5586177650688996e-05, + "loss": 0.4585, + "step": 5343 + }, + { + "epoch": 0.6503194402190448, + "grad_norm": 2.4165728092193604, + "learning_rate": 1.5584559589637108e-05, + "loss": 0.471, + "step": 5344 + }, + { + "epoch": 0.6504411317310618, + "grad_norm": 4.680166244506836, + "learning_rate": 1.558294131608116e-05, + "loss": 0.3864, + "step": 5345 + }, + { + "epoch": 0.6505628232430788, + "grad_norm": 1.562940001487732, + "learning_rate": 1.5581322830082747e-05, + "loss": 0.481, + "step": 5346 + }, + { + "epoch": 0.6506845147550958, + "grad_norm": 1.728887677192688, + "learning_rate": 1.5579704131703442e-05, + "loss": 0.4934, + "step": 5347 + }, + { + "epoch": 0.6508062062671128, + "grad_norm": 2.84002423286438, + "learning_rate": 1.557808522100485e-05, + "loss": 0.4873, + "step": 5348 + }, + { + "epoch": 0.65092789777913, + "grad_norm": 1.885184407234192, + "learning_rate": 1.5576466098048566e-05, + "loss": 0.3914, + "step": 5349 + }, + { + "epoch": 0.651049589291147, + "grad_norm": 0.6747542023658752, + "learning_rate": 1.5574846762896204e-05, + "loss": 0.4383, + "step": 5350 + }, + { + "epoch": 0.651171280803164, + "grad_norm": 2.4659926891326904, + "learning_rate": 1.5573227215609383e-05, + "loss": 0.4715, + "step": 5351 + }, + { + "epoch": 0.651292972315181, + "grad_norm": 2.432208299636841, + "learning_rate": 1.557160745624973e-05, + "loss": 0.5088, + "step": 5352 + }, + { + "epoch": 0.651414663827198, + "grad_norm": 2.133415937423706, + "learning_rate": 1.5569987484878887e-05, + "loss": 0.4315, + "step": 5353 + }, + { + "epoch": 0.6515363553392151, + "grad_norm": 0.7251248359680176, + "learning_rate": 1.5568367301558486e-05, + "loss": 0.3825, + "step": 5354 + }, + { + "epoch": 0.6516580468512321, + "grad_norm": 3.2056667804718018, + "learning_rate": 1.5566746906350184e-05, + "loss": 0.4797, + "step": 5355 + }, + { + "epoch": 0.6517797383632492, + "grad_norm": 0.8430180549621582, + "learning_rate": 1.556512629931564e-05, + "loss": 0.4634, + "step": 5356 + }, + { + "epoch": 0.6519014298752662, + "grad_norm": 1.3142163753509521, + "learning_rate": 1.5563505480516517e-05, + "loss": 0.4085, + "step": 5357 + }, + { + "epoch": 0.6520231213872832, + "grad_norm": 1.23605477809906, + "learning_rate": 1.55618844500145e-05, + "loss": 0.4429, + "step": 5358 + }, + { + "epoch": 0.6521448128993003, + "grad_norm": 1.4192166328430176, + "learning_rate": 1.5560263207871263e-05, + "loss": 0.5062, + "step": 5359 + }, + { + "epoch": 0.6522665044113173, + "grad_norm": 0.722477376461029, + "learning_rate": 1.5558641754148506e-05, + "loss": 0.4979, + "step": 5360 + }, + { + "epoch": 0.6523881959233343, + "grad_norm": 1.8833805322647095, + "learning_rate": 1.5557020088907924e-05, + "loss": 0.4776, + "step": 5361 + }, + { + "epoch": 0.6525098874353514, + "grad_norm": 1.0407469272613525, + "learning_rate": 1.5555398212211225e-05, + "loss": 0.4656, + "step": 5362 + }, + { + "epoch": 0.6526315789473685, + "grad_norm": 1.2440108060836792, + "learning_rate": 1.5553776124120123e-05, + "loss": 0.4512, + "step": 5363 + }, + { + "epoch": 0.6527532704593855, + "grad_norm": 1.5020626783370972, + "learning_rate": 1.5552153824696344e-05, + "loss": 0.4823, + "step": 5364 + }, + { + "epoch": 0.6528749619714025, + "grad_norm": 1.579473614692688, + "learning_rate": 1.555053131400162e-05, + "loss": 0.4724, + "step": 5365 + }, + { + "epoch": 0.6529966534834195, + "grad_norm": 1.1490020751953125, + "learning_rate": 1.5548908592097692e-05, + "loss": 0.4654, + "step": 5366 + }, + { + "epoch": 0.6531183449954365, + "grad_norm": 1.049917459487915, + "learning_rate": 1.554728565904631e-05, + "loss": 0.494, + "step": 5367 + }, + { + "epoch": 0.6532400365074537, + "grad_norm": 1.5266389846801758, + "learning_rate": 1.554566251490922e-05, + "loss": 0.5075, + "step": 5368 + }, + { + "epoch": 0.6533617280194707, + "grad_norm": 2.6105377674102783, + "learning_rate": 1.5544039159748194e-05, + "loss": 0.4033, + "step": 5369 + }, + { + "epoch": 0.6534834195314877, + "grad_norm": 0.7732549905776978, + "learning_rate": 1.5542415593625002e-05, + "loss": 0.4629, + "step": 5370 + }, + { + "epoch": 0.6536051110435047, + "grad_norm": 2.330594062805176, + "learning_rate": 1.5540791816601423e-05, + "loss": 0.5096, + "step": 5371 + }, + { + "epoch": 0.6537268025555217, + "grad_norm": 2.2057223320007324, + "learning_rate": 1.553916782873925e-05, + "loss": 0.5194, + "step": 5372 + }, + { + "epoch": 0.6538484940675388, + "grad_norm": 1.3206000328063965, + "learning_rate": 1.5537543630100277e-05, + "loss": 0.507, + "step": 5373 + }, + { + "epoch": 0.6539701855795558, + "grad_norm": 0.6833221316337585, + "learning_rate": 1.55359192207463e-05, + "loss": 0.474, + "step": 5374 + }, + { + "epoch": 0.6540918770915729, + "grad_norm": 0.6391467452049255, + "learning_rate": 1.5534294600739143e-05, + "loss": 0.5138, + "step": 5375 + }, + { + "epoch": 0.6542135686035899, + "grad_norm": 4.589588642120361, + "learning_rate": 1.553266977014062e-05, + "loss": 0.4424, + "step": 5376 + }, + { + "epoch": 0.6543352601156069, + "grad_norm": 2.5533909797668457, + "learning_rate": 1.5531044729012558e-05, + "loss": 0.4476, + "step": 5377 + }, + { + "epoch": 0.654456951627624, + "grad_norm": 3.4447693824768066, + "learning_rate": 1.55294194774168e-05, + "loss": 0.4461, + "step": 5378 + }, + { + "epoch": 0.654578643139641, + "grad_norm": 1.4094719886779785, + "learning_rate": 1.5527794015415184e-05, + "loss": 0.4644, + "step": 5379 + }, + { + "epoch": 0.654700334651658, + "grad_norm": 1.311448335647583, + "learning_rate": 1.5526168343069567e-05, + "loss": 0.4886, + "step": 5380 + }, + { + "epoch": 0.6548220261636751, + "grad_norm": 0.9678633809089661, + "learning_rate": 1.55245424604418e-05, + "loss": 0.4304, + "step": 5381 + }, + { + "epoch": 0.6549437176756921, + "grad_norm": 0.8762246370315552, + "learning_rate": 1.552291636759376e-05, + "loss": 0.3999, + "step": 5382 + }, + { + "epoch": 0.6550654091877092, + "grad_norm": 1.3736207485198975, + "learning_rate": 1.552129006458732e-05, + "loss": 0.393, + "step": 5383 + }, + { + "epoch": 0.6551871006997262, + "grad_norm": 3.8743481636047363, + "learning_rate": 1.5519663551484363e-05, + "loss": 0.4919, + "step": 5384 + }, + { + "epoch": 0.6553087922117432, + "grad_norm": 4.454881191253662, + "learning_rate": 1.5518036828346785e-05, + "loss": 0.479, + "step": 5385 + }, + { + "epoch": 0.6554304837237602, + "grad_norm": 7.7504963874816895, + "learning_rate": 1.5516409895236485e-05, + "loss": 0.6064, + "step": 5386 + }, + { + "epoch": 0.6555521752357774, + "grad_norm": 3.546290397644043, + "learning_rate": 1.5514782752215368e-05, + "loss": 0.4805, + "step": 5387 + }, + { + "epoch": 0.6556738667477944, + "grad_norm": 0.7145805358886719, + "learning_rate": 1.551315539934535e-05, + "loss": 0.3471, + "step": 5388 + }, + { + "epoch": 0.6557955582598114, + "grad_norm": 1.957080364227295, + "learning_rate": 1.5511527836688364e-05, + "loss": 0.4635, + "step": 5389 + }, + { + "epoch": 0.6559172497718284, + "grad_norm": 4.414008617401123, + "learning_rate": 1.550990006430633e-05, + "loss": 0.5479, + "step": 5390 + }, + { + "epoch": 0.6560389412838454, + "grad_norm": 2.238805055618286, + "learning_rate": 1.5508272082261196e-05, + "loss": 0.4913, + "step": 5391 + }, + { + "epoch": 0.6561606327958625, + "grad_norm": 1.4231898784637451, + "learning_rate": 1.5506643890614906e-05, + "loss": 0.4437, + "step": 5392 + }, + { + "epoch": 0.6562823243078795, + "grad_norm": 1.734437108039856, + "learning_rate": 1.550501548942942e-05, + "loss": 0.4563, + "step": 5393 + }, + { + "epoch": 0.6564040158198966, + "grad_norm": 0.7681286931037903, + "learning_rate": 1.5503386878766698e-05, + "loss": 0.5023, + "step": 5394 + }, + { + "epoch": 0.6565257073319136, + "grad_norm": 0.8565199375152588, + "learning_rate": 1.5501758058688717e-05, + "loss": 0.4895, + "step": 5395 + }, + { + "epoch": 0.6566473988439306, + "grad_norm": 4.39603853225708, + "learning_rate": 1.550012902925745e-05, + "loss": 0.3932, + "step": 5396 + }, + { + "epoch": 0.6567690903559477, + "grad_norm": 4.179236888885498, + "learning_rate": 1.5498499790534892e-05, + "loss": 0.4024, + "step": 5397 + }, + { + "epoch": 0.6568907818679647, + "grad_norm": 3.2235324382781982, + "learning_rate": 1.5496870342583033e-05, + "loss": 0.434, + "step": 5398 + }, + { + "epoch": 0.6570124733799817, + "grad_norm": 0.9747445583343506, + "learning_rate": 1.5495240685463882e-05, + "loss": 0.4964, + "step": 5399 + }, + { + "epoch": 0.6571341648919988, + "grad_norm": 4.188055038452148, + "learning_rate": 1.5493610819239446e-05, + "loss": 0.4294, + "step": 5400 + }, + { + "epoch": 0.6572558564040158, + "grad_norm": 2.397346258163452, + "learning_rate": 1.5491980743971747e-05, + "loss": 0.4005, + "step": 5401 + }, + { + "epoch": 0.6573775479160329, + "grad_norm": 2.431401491165161, + "learning_rate": 1.5490350459722815e-05, + "loss": 0.4571, + "step": 5402 + }, + { + "epoch": 0.6574992394280499, + "grad_norm": 0.6270154118537903, + "learning_rate": 1.5488719966554685e-05, + "loss": 0.3771, + "step": 5403 + }, + { + "epoch": 0.6576209309400669, + "grad_norm": 2.432560682296753, + "learning_rate": 1.5487089264529395e-05, + "loss": 0.4197, + "step": 5404 + }, + { + "epoch": 0.6577426224520839, + "grad_norm": 4.105227947235107, + "learning_rate": 1.5485458353709003e-05, + "loss": 0.5095, + "step": 5405 + }, + { + "epoch": 0.657864313964101, + "grad_norm": 4.4798970222473145, + "learning_rate": 1.5483827234155566e-05, + "loss": 0.5282, + "step": 5406 + }, + { + "epoch": 0.6579860054761181, + "grad_norm": 1.9804679155349731, + "learning_rate": 1.5482195905931155e-05, + "loss": 0.435, + "step": 5407 + }, + { + "epoch": 0.6581076969881351, + "grad_norm": 2.009260416030884, + "learning_rate": 1.548056436909784e-05, + "loss": 0.4863, + "step": 5408 + }, + { + "epoch": 0.6582293885001521, + "grad_norm": 1.7735986709594727, + "learning_rate": 1.5478932623717704e-05, + "loss": 0.458, + "step": 5409 + }, + { + "epoch": 0.6583510800121691, + "grad_norm": 1.1705528497695923, + "learning_rate": 1.547730066985284e-05, + "loss": 0.4082, + "step": 5410 + }, + { + "epoch": 0.6584727715241862, + "grad_norm": 1.4086755514144897, + "learning_rate": 1.5475668507565355e-05, + "loss": 0.5022, + "step": 5411 + }, + { + "epoch": 0.6585944630362032, + "grad_norm": 3.3585305213928223, + "learning_rate": 1.5474036136917343e-05, + "loss": 0.4414, + "step": 5412 + }, + { + "epoch": 0.6587161545482203, + "grad_norm": 1.8085182905197144, + "learning_rate": 1.547240355797093e-05, + "loss": 0.509, + "step": 5413 + }, + { + "epoch": 0.6588378460602373, + "grad_norm": 3.6988284587860107, + "learning_rate": 1.547077077078823e-05, + "loss": 0.4333, + "step": 5414 + }, + { + "epoch": 0.6589595375722543, + "grad_norm": 1.0893356800079346, + "learning_rate": 1.546913777543138e-05, + "loss": 0.5473, + "step": 5415 + }, + { + "epoch": 0.6590812290842714, + "grad_norm": 2.670675277709961, + "learning_rate": 1.546750457196252e-05, + "loss": 0.4735, + "step": 5416 + }, + { + "epoch": 0.6592029205962884, + "grad_norm": 2.467972993850708, + "learning_rate": 1.546587116044379e-05, + "loss": 0.4606, + "step": 5417 + }, + { + "epoch": 0.6593246121083054, + "grad_norm": 2.314988136291504, + "learning_rate": 1.546423754093735e-05, + "loss": 0.4472, + "step": 5418 + }, + { + "epoch": 0.6594463036203225, + "grad_norm": 1.3508634567260742, + "learning_rate": 1.546260371350536e-05, + "loss": 0.4122, + "step": 5419 + }, + { + "epoch": 0.6595679951323395, + "grad_norm": 0.7820689678192139, + "learning_rate": 1.5460969678209992e-05, + "loss": 0.4673, + "step": 5420 + }, + { + "epoch": 0.6596896866443566, + "grad_norm": 4.751261234283447, + "learning_rate": 1.5459335435113427e-05, + "loss": 0.5512, + "step": 5421 + }, + { + "epoch": 0.6598113781563736, + "grad_norm": 5.589266300201416, + "learning_rate": 1.5457700984277844e-05, + "loss": 0.5782, + "step": 5422 + }, + { + "epoch": 0.6599330696683906, + "grad_norm": 3.081406593322754, + "learning_rate": 1.5456066325765446e-05, + "loss": 0.4664, + "step": 5423 + }, + { + "epoch": 0.6600547611804076, + "grad_norm": 0.7796817421913147, + "learning_rate": 1.545443145963843e-05, + "loss": 0.3979, + "step": 5424 + }, + { + "epoch": 0.6601764526924248, + "grad_norm": 3.147052764892578, + "learning_rate": 1.5452796385959007e-05, + "loss": 0.5019, + "step": 5425 + }, + { + "epoch": 0.6602981442044418, + "grad_norm": 3.250377893447876, + "learning_rate": 1.5451161104789392e-05, + "loss": 0.5038, + "step": 5426 + }, + { + "epoch": 0.6604198357164588, + "grad_norm": 3.661663770675659, + "learning_rate": 1.5449525616191817e-05, + "loss": 0.5171, + "step": 5427 + }, + { + "epoch": 0.6605415272284758, + "grad_norm": 3.851524591445923, + "learning_rate": 1.544788992022851e-05, + "loss": 0.5036, + "step": 5428 + }, + { + "epoch": 0.6606632187404928, + "grad_norm": 1.04959237575531, + "learning_rate": 1.544625401696172e-05, + "loss": 0.4604, + "step": 5429 + }, + { + "epoch": 0.6607849102525098, + "grad_norm": 0.8070778846740723, + "learning_rate": 1.5444617906453685e-05, + "loss": 0.4762, + "step": 5430 + }, + { + "epoch": 0.660906601764527, + "grad_norm": 0.5956177711486816, + "learning_rate": 1.544298158876667e-05, + "loss": 0.4951, + "step": 5431 + }, + { + "epoch": 0.661028293276544, + "grad_norm": 2.463252305984497, + "learning_rate": 1.544134506396294e-05, + "loss": 0.484, + "step": 5432 + }, + { + "epoch": 0.661149984788561, + "grad_norm": 4.864258289337158, + "learning_rate": 1.5439708332104772e-05, + "loss": 0.48, + "step": 5433 + }, + { + "epoch": 0.661271676300578, + "grad_norm": 1.0821592807769775, + "learning_rate": 1.5438071393254437e-05, + "loss": 0.4501, + "step": 5434 + }, + { + "epoch": 0.6613933678125951, + "grad_norm": 0.5609249472618103, + "learning_rate": 1.543643424747423e-05, + "loss": 0.4688, + "step": 5435 + }, + { + "epoch": 0.6615150593246121, + "grad_norm": 1.0449097156524658, + "learning_rate": 1.543479689482645e-05, + "loss": 0.4628, + "step": 5436 + }, + { + "epoch": 0.6616367508366291, + "grad_norm": 2.5006985664367676, + "learning_rate": 1.5433159335373396e-05, + "loss": 0.4908, + "step": 5437 + }, + { + "epoch": 0.6617584423486462, + "grad_norm": 1.795767903327942, + "learning_rate": 1.5431521569177382e-05, + "loss": 0.481, + "step": 5438 + }, + { + "epoch": 0.6618801338606632, + "grad_norm": 2.1494393348693848, + "learning_rate": 1.5429883596300732e-05, + "loss": 0.4861, + "step": 5439 + }, + { + "epoch": 0.6620018253726803, + "grad_norm": 0.6086944937705994, + "learning_rate": 1.5428245416805773e-05, + "loss": 0.4491, + "step": 5440 + }, + { + "epoch": 0.6621235168846973, + "grad_norm": 1.006093144416809, + "learning_rate": 1.5426607030754833e-05, + "loss": 0.4501, + "step": 5441 + }, + { + "epoch": 0.6622452083967143, + "grad_norm": 0.8190460801124573, + "learning_rate": 1.5424968438210274e-05, + "loss": 0.4526, + "step": 5442 + }, + { + "epoch": 0.6623668999087313, + "grad_norm": 0.9641715288162231, + "learning_rate": 1.542332963923443e-05, + "loss": 0.4415, + "step": 5443 + }, + { + "epoch": 0.6624885914207485, + "grad_norm": 2.7706751823425293, + "learning_rate": 1.542169063388967e-05, + "loss": 0.5111, + "step": 5444 + }, + { + "epoch": 0.6626102829327655, + "grad_norm": 0.5637524724006653, + "learning_rate": 1.5420051422238356e-05, + "loss": 0.4413, + "step": 5445 + }, + { + "epoch": 0.6627319744447825, + "grad_norm": 1.2848612070083618, + "learning_rate": 1.5418412004342867e-05, + "loss": 0.4529, + "step": 5446 + }, + { + "epoch": 0.6628536659567995, + "grad_norm": 0.5230322480201721, + "learning_rate": 1.541677238026559e-05, + "loss": 0.4474, + "step": 5447 + }, + { + "epoch": 0.6629753574688165, + "grad_norm": 2.7056941986083984, + "learning_rate": 1.541513255006891e-05, + "loss": 0.4824, + "step": 5448 + }, + { + "epoch": 0.6630970489808335, + "grad_norm": 1.772239089012146, + "learning_rate": 1.5413492513815226e-05, + "loss": 0.4833, + "step": 5449 + }, + { + "epoch": 0.6632187404928507, + "grad_norm": 2.0207180976867676, + "learning_rate": 1.5411852271566945e-05, + "loss": 0.4532, + "step": 5450 + }, + { + "epoch": 0.6633404320048677, + "grad_norm": 0.6144729852676392, + "learning_rate": 1.5410211823386488e-05, + "loss": 0.5102, + "step": 5451 + }, + { + "epoch": 0.6634621235168847, + "grad_norm": 3.442842960357666, + "learning_rate": 1.540857116933627e-05, + "loss": 0.4237, + "step": 5452 + }, + { + "epoch": 0.6635838150289017, + "grad_norm": 0.664132833480835, + "learning_rate": 1.5406930309478726e-05, + "loss": 0.4604, + "step": 5453 + }, + { + "epoch": 0.6637055065409188, + "grad_norm": 1.3695380687713623, + "learning_rate": 1.5405289243876286e-05, + "loss": 0.4292, + "step": 5454 + }, + { + "epoch": 0.6638271980529358, + "grad_norm": 0.751721978187561, + "learning_rate": 1.5403647972591407e-05, + "loss": 0.4654, + "step": 5455 + }, + { + "epoch": 0.6639488895649528, + "grad_norm": 2.613236904144287, + "learning_rate": 1.5402006495686536e-05, + "loss": 0.3987, + "step": 5456 + }, + { + "epoch": 0.6640705810769699, + "grad_norm": 0.7441973090171814, + "learning_rate": 1.540036481322414e-05, + "loss": 0.4523, + "step": 5457 + }, + { + "epoch": 0.6641922725889869, + "grad_norm": 3.3111002445220947, + "learning_rate": 1.5398722925266678e-05, + "loss": 0.5148, + "step": 5458 + }, + { + "epoch": 0.664313964101004, + "grad_norm": 1.8782531023025513, + "learning_rate": 1.5397080831876642e-05, + "loss": 0.4594, + "step": 5459 + }, + { + "epoch": 0.664435655613021, + "grad_norm": 0.7957891821861267, + "learning_rate": 1.53954385331165e-05, + "loss": 0.4332, + "step": 5460 + }, + { + "epoch": 0.664557347125038, + "grad_norm": 1.368462324142456, + "learning_rate": 1.539379602904876e-05, + "loss": 0.4418, + "step": 5461 + }, + { + "epoch": 0.664679038637055, + "grad_norm": 2.4055609703063965, + "learning_rate": 1.5392153319735913e-05, + "loss": 0.5075, + "step": 5462 + }, + { + "epoch": 0.6648007301490721, + "grad_norm": 1.81770920753479, + "learning_rate": 1.5390510405240476e-05, + "loss": 0.4736, + "step": 5463 + }, + { + "epoch": 0.6649224216610892, + "grad_norm": 0.7309744358062744, + "learning_rate": 1.5388867285624955e-05, + "loss": 0.4946, + "step": 5464 + }, + { + "epoch": 0.6650441131731062, + "grad_norm": 2.999840497970581, + "learning_rate": 1.5387223960951882e-05, + "loss": 0.5051, + "step": 5465 + }, + { + "epoch": 0.6651658046851232, + "grad_norm": 1.6216768026351929, + "learning_rate": 1.538558043128379e-05, + "loss": 0.4504, + "step": 5466 + }, + { + "epoch": 0.6652874961971402, + "grad_norm": 2.0688908100128174, + "learning_rate": 1.538393669668321e-05, + "loss": 0.4362, + "step": 5467 + }, + { + "epoch": 0.6654091877091572, + "grad_norm": 2.377845525741577, + "learning_rate": 1.5382292757212694e-05, + "loss": 0.5253, + "step": 5468 + }, + { + "epoch": 0.6655308792211744, + "grad_norm": 4.468915939331055, + "learning_rate": 1.53806486129348e-05, + "loss": 0.409, + "step": 5469 + }, + { + "epoch": 0.6656525707331914, + "grad_norm": 0.770176351070404, + "learning_rate": 1.537900426391209e-05, + "loss": 0.4928, + "step": 5470 + }, + { + "epoch": 0.6657742622452084, + "grad_norm": 1.4929450750350952, + "learning_rate": 1.5377359710207132e-05, + "loss": 0.4306, + "step": 5471 + }, + { + "epoch": 0.6658959537572254, + "grad_norm": 1.6689743995666504, + "learning_rate": 1.5375714951882505e-05, + "loss": 0.4661, + "step": 5472 + }, + { + "epoch": 0.6660176452692425, + "grad_norm": 0.7525126934051514, + "learning_rate": 1.53740699890008e-05, + "loss": 0.4826, + "step": 5473 + }, + { + "epoch": 0.6661393367812595, + "grad_norm": 2.3113670349121094, + "learning_rate": 1.5372424821624606e-05, + "loss": 0.5149, + "step": 5474 + }, + { + "epoch": 0.6662610282932765, + "grad_norm": 1.2864298820495605, + "learning_rate": 1.5370779449816526e-05, + "loss": 0.4331, + "step": 5475 + }, + { + "epoch": 0.6663827198052936, + "grad_norm": 4.866365432739258, + "learning_rate": 1.5369133873639176e-05, + "loss": 0.5607, + "step": 5476 + }, + { + "epoch": 0.6665044113173106, + "grad_norm": 1.059059500694275, + "learning_rate": 1.5367488093155168e-05, + "loss": 0.4295, + "step": 5477 + }, + { + "epoch": 0.6666261028293277, + "grad_norm": 1.578230619430542, + "learning_rate": 1.536584210842712e-05, + "loss": 0.5176, + "step": 5478 + }, + { + "epoch": 0.6667477943413447, + "grad_norm": 2.8086841106414795, + "learning_rate": 1.536419591951768e-05, + "loss": 0.5332, + "step": 5479 + }, + { + "epoch": 0.6668694858533617, + "grad_norm": 1.0661730766296387, + "learning_rate": 1.5362549526489484e-05, + "loss": 0.4423, + "step": 5480 + }, + { + "epoch": 0.6669911773653787, + "grad_norm": 2.354785203933716, + "learning_rate": 1.5360902929405176e-05, + "loss": 0.5158, + "step": 5481 + }, + { + "epoch": 0.6671128688773958, + "grad_norm": 1.889513373374939, + "learning_rate": 1.5359256128327413e-05, + "loss": 0.4993, + "step": 5482 + }, + { + "epoch": 0.6672345603894129, + "grad_norm": 0.8707626461982727, + "learning_rate": 1.5357609123318864e-05, + "loss": 0.4848, + "step": 5483 + }, + { + "epoch": 0.6673562519014299, + "grad_norm": 2.764751434326172, + "learning_rate": 1.5355961914442198e-05, + "loss": 0.4228, + "step": 5484 + }, + { + "epoch": 0.6674779434134469, + "grad_norm": 3.466641664505005, + "learning_rate": 1.5354314501760093e-05, + "loss": 0.4407, + "step": 5485 + }, + { + "epoch": 0.6675996349254639, + "grad_norm": 2.373563766479492, + "learning_rate": 1.535266688533524e-05, + "loss": 0.4894, + "step": 5486 + }, + { + "epoch": 0.6677213264374809, + "grad_norm": 3.4814956188201904, + "learning_rate": 1.535101906523033e-05, + "loss": 0.4381, + "step": 5487 + }, + { + "epoch": 0.6678430179494981, + "grad_norm": 2.9427573680877686, + "learning_rate": 1.5349371041508073e-05, + "loss": 0.4122, + "step": 5488 + }, + { + "epoch": 0.6679647094615151, + "grad_norm": 0.8163983821868896, + "learning_rate": 1.5347722814231172e-05, + "loss": 0.4651, + "step": 5489 + }, + { + "epoch": 0.6680864009735321, + "grad_norm": 0.5386254787445068, + "learning_rate": 1.534607438346235e-05, + "loss": 0.4104, + "step": 5490 + }, + { + "epoch": 0.6682080924855491, + "grad_norm": 5.744544506072998, + "learning_rate": 1.5344425749264332e-05, + "loss": 0.5607, + "step": 5491 + }, + { + "epoch": 0.6683297839975662, + "grad_norm": 6.783387184143066, + "learning_rate": 1.5342776911699853e-05, + "loss": 0.5923, + "step": 5492 + }, + { + "epoch": 0.6684514755095832, + "grad_norm": 4.496654987335205, + "learning_rate": 1.534112787083165e-05, + "loss": 0.5294, + "step": 5493 + }, + { + "epoch": 0.6685731670216002, + "grad_norm": 2.8708765506744385, + "learning_rate": 1.533947862672248e-05, + "loss": 0.4533, + "step": 5494 + }, + { + "epoch": 0.6686948585336173, + "grad_norm": 4.694194793701172, + "learning_rate": 1.5337829179435094e-05, + "loss": 0.5418, + "step": 5495 + }, + { + "epoch": 0.6688165500456343, + "grad_norm": 1.285262107849121, + "learning_rate": 1.533617952903226e-05, + "loss": 0.4647, + "step": 5496 + }, + { + "epoch": 0.6689382415576514, + "grad_norm": 2.4946682453155518, + "learning_rate": 1.5334529675576744e-05, + "loss": 0.4507, + "step": 5497 + }, + { + "epoch": 0.6690599330696684, + "grad_norm": 1.1124497652053833, + "learning_rate": 1.533287961913134e-05, + "loss": 0.416, + "step": 5498 + }, + { + "epoch": 0.6691816245816854, + "grad_norm": 2.001948118209839, + "learning_rate": 1.5331229359758822e-05, + "loss": 0.4752, + "step": 5499 + }, + { + "epoch": 0.6693033160937024, + "grad_norm": 2.354109764099121, + "learning_rate": 1.5329578897521996e-05, + "loss": 0.4701, + "step": 5500 + }, + { + "epoch": 0.6694250076057195, + "grad_norm": 3.811180353164673, + "learning_rate": 1.5327928232483662e-05, + "loss": 0.3861, + "step": 5501 + }, + { + "epoch": 0.6695466991177366, + "grad_norm": 1.0407211780548096, + "learning_rate": 1.5326277364706627e-05, + "loss": 0.4993, + "step": 5502 + }, + { + "epoch": 0.6696683906297536, + "grad_norm": 1.975663423538208, + "learning_rate": 1.5324626294253717e-05, + "loss": 0.4661, + "step": 5503 + }, + { + "epoch": 0.6697900821417706, + "grad_norm": 0.8591955900192261, + "learning_rate": 1.532297502118775e-05, + "loss": 0.4676, + "step": 5504 + }, + { + "epoch": 0.6699117736537876, + "grad_norm": 1.231006145477295, + "learning_rate": 1.532132354557157e-05, + "loss": 0.5024, + "step": 5505 + }, + { + "epoch": 0.6700334651658046, + "grad_norm": 1.9702006578445435, + "learning_rate": 1.5319671867468013e-05, + "loss": 0.442, + "step": 5506 + }, + { + "epoch": 0.6701551566778218, + "grad_norm": 2.3092076778411865, + "learning_rate": 1.531801998693993e-05, + "loss": 0.4003, + "step": 5507 + }, + { + "epoch": 0.6702768481898388, + "grad_norm": 1.0356674194335938, + "learning_rate": 1.531636790405018e-05, + "loss": 0.4754, + "step": 5508 + }, + { + "epoch": 0.6703985397018558, + "grad_norm": 2.3935837745666504, + "learning_rate": 1.5314715618861626e-05, + "loss": 0.5382, + "step": 5509 + }, + { + "epoch": 0.6705202312138728, + "grad_norm": 1.5197428464889526, + "learning_rate": 1.531306313143714e-05, + "loss": 0.4783, + "step": 5510 + }, + { + "epoch": 0.6706419227258898, + "grad_norm": 1.051122784614563, + "learning_rate": 1.531141044183961e-05, + "loss": 0.4467, + "step": 5511 + }, + { + "epoch": 0.6707636142379069, + "grad_norm": 0.6544637680053711, + "learning_rate": 1.5309757550131913e-05, + "loss": 0.4602, + "step": 5512 + }, + { + "epoch": 0.6708853057499239, + "grad_norm": 1.5131142139434814, + "learning_rate": 1.5308104456376955e-05, + "loss": 0.4584, + "step": 5513 + }, + { + "epoch": 0.671006997261941, + "grad_norm": 1.4282640218734741, + "learning_rate": 1.5306451160637633e-05, + "loss": 0.4791, + "step": 5514 + }, + { + "epoch": 0.671128688773958, + "grad_norm": 1.8938522338867188, + "learning_rate": 1.530479766297686e-05, + "loss": 0.4683, + "step": 5515 + }, + { + "epoch": 0.6712503802859751, + "grad_norm": 1.9751620292663574, + "learning_rate": 1.5303143963457558e-05, + "loss": 0.4577, + "step": 5516 + }, + { + "epoch": 0.6713720717979921, + "grad_norm": 1.6509391069412231, + "learning_rate": 1.530149006214265e-05, + "loss": 0.4275, + "step": 5517 + }, + { + "epoch": 0.6714937633100091, + "grad_norm": 4.49000358581543, + "learning_rate": 1.529983595909507e-05, + "loss": 0.3828, + "step": 5518 + }, + { + "epoch": 0.6716154548220261, + "grad_norm": 2.0262069702148438, + "learning_rate": 1.5298181654377763e-05, + "loss": 0.4375, + "step": 5519 + }, + { + "epoch": 0.6717371463340432, + "grad_norm": 0.6181219816207886, + "learning_rate": 1.5296527148053677e-05, + "loss": 0.438, + "step": 5520 + }, + { + "epoch": 0.6718588378460603, + "grad_norm": 1.0754001140594482, + "learning_rate": 1.5294872440185772e-05, + "loss": 0.4789, + "step": 5521 + }, + { + "epoch": 0.6719805293580773, + "grad_norm": 1.1276960372924805, + "learning_rate": 1.529321753083701e-05, + "loss": 0.4451, + "step": 5522 + }, + { + "epoch": 0.6721022208700943, + "grad_norm": 1.2000776529312134, + "learning_rate": 1.529156242007036e-05, + "loss": 0.4504, + "step": 5523 + }, + { + "epoch": 0.6722239123821113, + "grad_norm": 2.402139663696289, + "learning_rate": 1.5289907107948813e-05, + "loss": 0.4979, + "step": 5524 + }, + { + "epoch": 0.6723456038941283, + "grad_norm": 1.8518776893615723, + "learning_rate": 1.528825159453535e-05, + "loss": 0.4142, + "step": 5525 + }, + { + "epoch": 0.6724672954061455, + "grad_norm": 1.219595193862915, + "learning_rate": 1.5286595879892964e-05, + "loss": 0.494, + "step": 5526 + }, + { + "epoch": 0.6725889869181625, + "grad_norm": 2.703617572784424, + "learning_rate": 1.5284939964084664e-05, + "loss": 0.5516, + "step": 5527 + }, + { + "epoch": 0.6727106784301795, + "grad_norm": 0.9608203768730164, + "learning_rate": 1.528328384717346e-05, + "loss": 0.4381, + "step": 5528 + }, + { + "epoch": 0.6728323699421965, + "grad_norm": 2.390717029571533, + "learning_rate": 1.528162752922237e-05, + "loss": 0.4369, + "step": 5529 + }, + { + "epoch": 0.6729540614542135, + "grad_norm": 1.0250049829483032, + "learning_rate": 1.527997101029442e-05, + "loss": 0.4801, + "step": 5530 + }, + { + "epoch": 0.6730757529662306, + "grad_norm": 1.4124159812927246, + "learning_rate": 1.5278314290452643e-05, + "loss": 0.4673, + "step": 5531 + }, + { + "epoch": 0.6731974444782477, + "grad_norm": 1.3681262731552124, + "learning_rate": 1.527665736976008e-05, + "loss": 0.4686, + "step": 5532 + }, + { + "epoch": 0.6733191359902647, + "grad_norm": 0.6639538407325745, + "learning_rate": 1.527500024827979e-05, + "loss": 0.4615, + "step": 5533 + }, + { + "epoch": 0.6734408275022817, + "grad_norm": 1.2778096199035645, + "learning_rate": 1.5273342926074816e-05, + "loss": 0.4352, + "step": 5534 + }, + { + "epoch": 0.6735625190142988, + "grad_norm": 3.330084800720215, + "learning_rate": 1.5271685403208226e-05, + "loss": 0.5171, + "step": 5535 + }, + { + "epoch": 0.6736842105263158, + "grad_norm": 1.220238447189331, + "learning_rate": 1.5270027679743095e-05, + "loss": 0.4413, + "step": 5536 + }, + { + "epoch": 0.6738059020383328, + "grad_norm": 1.125942349433899, + "learning_rate": 1.5268369755742507e-05, + "loss": 0.4512, + "step": 5537 + }, + { + "epoch": 0.6739275935503498, + "grad_norm": 2.4602012634277344, + "learning_rate": 1.5266711631269543e-05, + "loss": 0.5589, + "step": 5538 + }, + { + "epoch": 0.6740492850623669, + "grad_norm": 2.895455837249756, + "learning_rate": 1.52650533063873e-05, + "loss": 0.5227, + "step": 5539 + }, + { + "epoch": 0.674170976574384, + "grad_norm": 1.3872512578964233, + "learning_rate": 1.526339478115888e-05, + "loss": 0.5064, + "step": 5540 + }, + { + "epoch": 0.674292668086401, + "grad_norm": 3.270491600036621, + "learning_rate": 1.5261736055647397e-05, + "loss": 0.3966, + "step": 5541 + }, + { + "epoch": 0.674414359598418, + "grad_norm": 0.9828242659568787, + "learning_rate": 1.526007712991596e-05, + "loss": 0.5008, + "step": 5542 + }, + { + "epoch": 0.674536051110435, + "grad_norm": 0.8394197821617126, + "learning_rate": 1.5258418004027705e-05, + "loss": 0.4652, + "step": 5543 + }, + { + "epoch": 0.674657742622452, + "grad_norm": 1.5907459259033203, + "learning_rate": 1.5256758678045757e-05, + "loss": 0.4611, + "step": 5544 + }, + { + "epoch": 0.6747794341344692, + "grad_norm": 0.7604082226753235, + "learning_rate": 1.5255099152033266e-05, + "loss": 0.4791, + "step": 5545 + }, + { + "epoch": 0.6749011256464862, + "grad_norm": 1.8490198850631714, + "learning_rate": 1.525343942605337e-05, + "loss": 0.4198, + "step": 5546 + }, + { + "epoch": 0.6750228171585032, + "grad_norm": 0.7479878664016724, + "learning_rate": 1.525177950016923e-05, + "loss": 0.4598, + "step": 5547 + }, + { + "epoch": 0.6751445086705202, + "grad_norm": 0.5962470173835754, + "learning_rate": 1.525011937444401e-05, + "loss": 0.4445, + "step": 5548 + }, + { + "epoch": 0.6752662001825372, + "grad_norm": 2.5411951541900635, + "learning_rate": 1.5248459048940882e-05, + "loss": 0.4763, + "step": 5549 + }, + { + "epoch": 0.6753878916945543, + "grad_norm": 2.578291893005371, + "learning_rate": 1.5246798523723023e-05, + "loss": 0.512, + "step": 5550 + }, + { + "epoch": 0.6755095832065714, + "grad_norm": 1.4908937215805054, + "learning_rate": 1.524513779885362e-05, + "loss": 0.4611, + "step": 5551 + }, + { + "epoch": 0.6756312747185884, + "grad_norm": 1.360190987586975, + "learning_rate": 1.5243476874395866e-05, + "loss": 0.4921, + "step": 5552 + }, + { + "epoch": 0.6757529662306054, + "grad_norm": 1.984915018081665, + "learning_rate": 1.5241815750412964e-05, + "loss": 0.468, + "step": 5553 + }, + { + "epoch": 0.6758746577426225, + "grad_norm": 0.7403093576431274, + "learning_rate": 1.5240154426968124e-05, + "loss": 0.4391, + "step": 5554 + }, + { + "epoch": 0.6759963492546395, + "grad_norm": 0.7237421870231628, + "learning_rate": 1.523849290412456e-05, + "loss": 0.5147, + "step": 5555 + }, + { + "epoch": 0.6761180407666565, + "grad_norm": 3.2059812545776367, + "learning_rate": 1.5236831181945499e-05, + "loss": 0.4522, + "step": 5556 + }, + { + "epoch": 0.6762397322786735, + "grad_norm": 0.7338497638702393, + "learning_rate": 1.5235169260494171e-05, + "loss": 0.5182, + "step": 5557 + }, + { + "epoch": 0.6763614237906906, + "grad_norm": 1.3542253971099854, + "learning_rate": 1.5233507139833816e-05, + "loss": 0.4887, + "step": 5558 + }, + { + "epoch": 0.6764831153027077, + "grad_norm": 3.261929750442505, + "learning_rate": 1.5231844820027681e-05, + "loss": 0.4913, + "step": 5559 + }, + { + "epoch": 0.6766048068147247, + "grad_norm": 4.380527973175049, + "learning_rate": 1.5230182301139025e-05, + "loss": 0.4261, + "step": 5560 + }, + { + "epoch": 0.6767264983267417, + "grad_norm": 1.559615969657898, + "learning_rate": 1.5228519583231102e-05, + "loss": 0.4594, + "step": 5561 + }, + { + "epoch": 0.6768481898387587, + "grad_norm": 1.7029471397399902, + "learning_rate": 1.5226856666367189e-05, + "loss": 0.4205, + "step": 5562 + }, + { + "epoch": 0.6769698813507757, + "grad_norm": 2.043337345123291, + "learning_rate": 1.5225193550610563e-05, + "loss": 0.393, + "step": 5563 + }, + { + "epoch": 0.6770915728627929, + "grad_norm": 0.7511071562767029, + "learning_rate": 1.5223530236024502e-05, + "loss": 0.4422, + "step": 5564 + }, + { + "epoch": 0.6772132643748099, + "grad_norm": 3.5116820335388184, + "learning_rate": 1.5221866722672305e-05, + "loss": 0.4958, + "step": 5565 + }, + { + "epoch": 0.6773349558868269, + "grad_norm": 5.950035572052002, + "learning_rate": 1.5220203010617272e-05, + "loss": 0.573, + "step": 5566 + }, + { + "epoch": 0.6774566473988439, + "grad_norm": 3.390221118927002, + "learning_rate": 1.5218539099922708e-05, + "loss": 0.4837, + "step": 5567 + }, + { + "epoch": 0.6775783389108609, + "grad_norm": 2.9605467319488525, + "learning_rate": 1.521687499065193e-05, + "loss": 0.4167, + "step": 5568 + }, + { + "epoch": 0.677700030422878, + "grad_norm": 4.046869277954102, + "learning_rate": 1.521521068286826e-05, + "loss": 0.4792, + "step": 5569 + }, + { + "epoch": 0.6778217219348951, + "grad_norm": 3.218365430831909, + "learning_rate": 1.5213546176635028e-05, + "loss": 0.4844, + "step": 5570 + }, + { + "epoch": 0.6779434134469121, + "grad_norm": 2.4036426544189453, + "learning_rate": 1.5211881472015575e-05, + "loss": 0.4585, + "step": 5571 + }, + { + "epoch": 0.6780651049589291, + "grad_norm": 1.279919981956482, + "learning_rate": 1.521021656907324e-05, + "loss": 0.4493, + "step": 5572 + }, + { + "epoch": 0.6781867964709462, + "grad_norm": 0.6202378869056702, + "learning_rate": 1.5208551467871382e-05, + "loss": 0.4631, + "step": 5573 + }, + { + "epoch": 0.6783084879829632, + "grad_norm": 0.7910367250442505, + "learning_rate": 1.520688616847336e-05, + "loss": 0.5026, + "step": 5574 + }, + { + "epoch": 0.6784301794949802, + "grad_norm": 0.6538732647895813, + "learning_rate": 1.5205220670942544e-05, + "loss": 0.4999, + "step": 5575 + }, + { + "epoch": 0.6785518710069972, + "grad_norm": 5.974656105041504, + "learning_rate": 1.5203554975342303e-05, + "loss": 0.4544, + "step": 5576 + }, + { + "epoch": 0.6786735625190143, + "grad_norm": 4.890570163726807, + "learning_rate": 1.5201889081736026e-05, + "loss": 0.4672, + "step": 5577 + }, + { + "epoch": 0.6787952540310314, + "grad_norm": 3.031325340270996, + "learning_rate": 1.52002229901871e-05, + "loss": 0.5157, + "step": 5578 + }, + { + "epoch": 0.6789169455430484, + "grad_norm": 4.464944362640381, + "learning_rate": 1.519855670075893e-05, + "loss": 0.4632, + "step": 5579 + }, + { + "epoch": 0.6790386370550654, + "grad_norm": 4.358697891235352, + "learning_rate": 1.5196890213514915e-05, + "loss": 0.4865, + "step": 5580 + }, + { + "epoch": 0.6791603285670824, + "grad_norm": 1.2585469484329224, + "learning_rate": 1.5195223528518468e-05, + "loss": 0.5086, + "step": 5581 + }, + { + "epoch": 0.6792820200790994, + "grad_norm": 0.8789167404174805, + "learning_rate": 1.5193556645833014e-05, + "loss": 0.5191, + "step": 5582 + }, + { + "epoch": 0.6794037115911166, + "grad_norm": 4.2059149742126465, + "learning_rate": 1.5191889565521982e-05, + "loss": 0.4085, + "step": 5583 + }, + { + "epoch": 0.6795254031031336, + "grad_norm": 0.7056273818016052, + "learning_rate": 1.5190222287648802e-05, + "loss": 0.4414, + "step": 5584 + }, + { + "epoch": 0.6796470946151506, + "grad_norm": 2.5234735012054443, + "learning_rate": 1.5188554812276918e-05, + "loss": 0.4537, + "step": 5585 + }, + { + "epoch": 0.6797687861271676, + "grad_norm": 0.6492642164230347, + "learning_rate": 1.518688713946979e-05, + "loss": 0.4398, + "step": 5586 + }, + { + "epoch": 0.6798904776391846, + "grad_norm": 2.277618408203125, + "learning_rate": 1.518521926929087e-05, + "loss": 0.5147, + "step": 5587 + }, + { + "epoch": 0.6800121691512017, + "grad_norm": 0.8896979093551636, + "learning_rate": 1.5183551201803622e-05, + "loss": 0.4478, + "step": 5588 + }, + { + "epoch": 0.6801338606632188, + "grad_norm": 2.678936243057251, + "learning_rate": 1.518188293707152e-05, + "loss": 0.4671, + "step": 5589 + }, + { + "epoch": 0.6802555521752358, + "grad_norm": 1.265942096710205, + "learning_rate": 1.5180214475158045e-05, + "loss": 0.4043, + "step": 5590 + }, + { + "epoch": 0.6803772436872528, + "grad_norm": 2.5065581798553467, + "learning_rate": 1.5178545816126692e-05, + "loss": 0.489, + "step": 5591 + }, + { + "epoch": 0.6804989351992698, + "grad_norm": 1.4736247062683105, + "learning_rate": 1.5176876960040949e-05, + "loss": 0.4395, + "step": 5592 + }, + { + "epoch": 0.6806206267112869, + "grad_norm": 0.8289631009101868, + "learning_rate": 1.5175207906964322e-05, + "loss": 0.4188, + "step": 5593 + }, + { + "epoch": 0.6807423182233039, + "grad_norm": 1.6755791902542114, + "learning_rate": 1.5173538656960323e-05, + "loss": 0.4852, + "step": 5594 + }, + { + "epoch": 0.6808640097353209, + "grad_norm": 1.860309362411499, + "learning_rate": 1.5171869210092467e-05, + "loss": 0.5106, + "step": 5595 + }, + { + "epoch": 0.680985701247338, + "grad_norm": 1.8324044942855835, + "learning_rate": 1.5170199566424284e-05, + "loss": 0.4293, + "step": 5596 + }, + { + "epoch": 0.6811073927593551, + "grad_norm": 2.2882726192474365, + "learning_rate": 1.5168529726019305e-05, + "loss": 0.3954, + "step": 5597 + }, + { + "epoch": 0.6812290842713721, + "grad_norm": 1.8817601203918457, + "learning_rate": 1.5166859688941076e-05, + "loss": 0.4848, + "step": 5598 + }, + { + "epoch": 0.6813507757833891, + "grad_norm": 2.001265048980713, + "learning_rate": 1.5165189455253133e-05, + "loss": 0.501, + "step": 5599 + }, + { + "epoch": 0.6814724672954061, + "grad_norm": 0.9268087148666382, + "learning_rate": 1.5163519025019046e-05, + "loss": 0.5207, + "step": 5600 + }, + { + "epoch": 0.6815941588074231, + "grad_norm": 2.2409093379974365, + "learning_rate": 1.516184839830237e-05, + "loss": 0.4014, + "step": 5601 + }, + { + "epoch": 0.6817158503194403, + "grad_norm": 0.9919574856758118, + "learning_rate": 1.5160177575166679e-05, + "loss": 0.5077, + "step": 5602 + }, + { + "epoch": 0.6818375418314573, + "grad_norm": 1.652867078781128, + "learning_rate": 1.515850655567555e-05, + "loss": 0.4699, + "step": 5603 + }, + { + "epoch": 0.6819592333434743, + "grad_norm": 1.7148882150650024, + "learning_rate": 1.5156835339892568e-05, + "loss": 0.458, + "step": 5604 + }, + { + "epoch": 0.6820809248554913, + "grad_norm": 1.8744171857833862, + "learning_rate": 1.515516392788133e-05, + "loss": 0.4774, + "step": 5605 + }, + { + "epoch": 0.6822026163675083, + "grad_norm": 1.464301586151123, + "learning_rate": 1.515349231970543e-05, + "loss": 0.4594, + "step": 5606 + }, + { + "epoch": 0.6823243078795254, + "grad_norm": 3.034067392349243, + "learning_rate": 1.5151820515428482e-05, + "loss": 0.5391, + "step": 5607 + }, + { + "epoch": 0.6824459993915425, + "grad_norm": 0.645681619644165, + "learning_rate": 1.5150148515114099e-05, + "loss": 0.4468, + "step": 5608 + }, + { + "epoch": 0.6825676909035595, + "grad_norm": 0.6690253019332886, + "learning_rate": 1.5148476318825909e-05, + "loss": 0.4275, + "step": 5609 + }, + { + "epoch": 0.6826893824155765, + "grad_norm": 2.369357109069824, + "learning_rate": 1.5146803926627534e-05, + "loss": 0.3906, + "step": 5610 + }, + { + "epoch": 0.6828110739275935, + "grad_norm": 2.59771466255188, + "learning_rate": 1.5145131338582615e-05, + "loss": 0.4812, + "step": 5611 + }, + { + "epoch": 0.6829327654396106, + "grad_norm": 2.3747684955596924, + "learning_rate": 1.51434585547548e-05, + "loss": 0.5041, + "step": 5612 + }, + { + "epoch": 0.6830544569516276, + "grad_norm": 2.551231861114502, + "learning_rate": 1.5141785575207744e-05, + "loss": 0.4599, + "step": 5613 + }, + { + "epoch": 0.6831761484636446, + "grad_norm": 0.8567521572113037, + "learning_rate": 1.5140112400005103e-05, + "loss": 0.4329, + "step": 5614 + }, + { + "epoch": 0.6832978399756617, + "grad_norm": 1.7583280801773071, + "learning_rate": 1.5138439029210542e-05, + "loss": 0.4723, + "step": 5615 + }, + { + "epoch": 0.6834195314876788, + "grad_norm": 0.6423031687736511, + "learning_rate": 1.5136765462887742e-05, + "loss": 0.4496, + "step": 5616 + }, + { + "epoch": 0.6835412229996958, + "grad_norm": 2.487212657928467, + "learning_rate": 1.5135091701100384e-05, + "loss": 0.4933, + "step": 5617 + }, + { + "epoch": 0.6836629145117128, + "grad_norm": 4.152840614318848, + "learning_rate": 1.5133417743912157e-05, + "loss": 0.4278, + "step": 5618 + }, + { + "epoch": 0.6837846060237298, + "grad_norm": 2.1671485900878906, + "learning_rate": 1.5131743591386758e-05, + "loss": 0.445, + "step": 5619 + }, + { + "epoch": 0.6839062975357468, + "grad_norm": 0.6360027194023132, + "learning_rate": 1.5130069243587895e-05, + "loss": 0.4911, + "step": 5620 + }, + { + "epoch": 0.684027989047764, + "grad_norm": 0.8293319940567017, + "learning_rate": 1.5128394700579281e-05, + "loss": 0.5493, + "step": 5621 + }, + { + "epoch": 0.684149680559781, + "grad_norm": 4.631020545959473, + "learning_rate": 1.512671996242463e-05, + "loss": 0.4577, + "step": 5622 + }, + { + "epoch": 0.684271372071798, + "grad_norm": 2.609022855758667, + "learning_rate": 1.5125045029187673e-05, + "loss": 0.446, + "step": 5623 + }, + { + "epoch": 0.684393063583815, + "grad_norm": 2.240832805633545, + "learning_rate": 1.5123369900932146e-05, + "loss": 0.4519, + "step": 5624 + }, + { + "epoch": 0.684514755095832, + "grad_norm": 1.183343768119812, + "learning_rate": 1.5121694577721786e-05, + "loss": 0.4758, + "step": 5625 + }, + { + "epoch": 0.6846364466078491, + "grad_norm": 1.699865460395813, + "learning_rate": 1.512001905962035e-05, + "loss": 0.4708, + "step": 5626 + }, + { + "epoch": 0.6847581381198662, + "grad_norm": 0.5912694334983826, + "learning_rate": 1.5118343346691591e-05, + "loss": 0.4246, + "step": 5627 + }, + { + "epoch": 0.6848798296318832, + "grad_norm": 0.5839297771453857, + "learning_rate": 1.511666743899927e-05, + "loss": 0.4443, + "step": 5628 + }, + { + "epoch": 0.6850015211439002, + "grad_norm": 1.018214464187622, + "learning_rate": 1.5114991336607165e-05, + "loss": 0.4821, + "step": 5629 + }, + { + "epoch": 0.6851232126559172, + "grad_norm": 1.6293895244598389, + "learning_rate": 1.511331503957905e-05, + "loss": 0.4425, + "step": 5630 + }, + { + "epoch": 0.6852449041679343, + "grad_norm": 3.308887481689453, + "learning_rate": 1.5111638547978716e-05, + "loss": 0.5099, + "step": 5631 + }, + { + "epoch": 0.6853665956799513, + "grad_norm": 1.809523344039917, + "learning_rate": 1.5109961861869953e-05, + "loss": 0.3913, + "step": 5632 + }, + { + "epoch": 0.6854882871919684, + "grad_norm": 0.7841582298278809, + "learning_rate": 1.5108284981316565e-05, + "loss": 0.4681, + "step": 5633 + }, + { + "epoch": 0.6856099787039854, + "grad_norm": 1.096095085144043, + "learning_rate": 1.5106607906382358e-05, + "loss": 0.486, + "step": 5634 + }, + { + "epoch": 0.6857316702160025, + "grad_norm": 0.6582747101783752, + "learning_rate": 1.5104930637131153e-05, + "loss": 0.4538, + "step": 5635 + }, + { + "epoch": 0.6858533617280195, + "grad_norm": 0.7895265221595764, + "learning_rate": 1.5103253173626768e-05, + "loss": 0.4495, + "step": 5636 + }, + { + "epoch": 0.6859750532400365, + "grad_norm": 1.1880378723144531, + "learning_rate": 1.510157551593304e-05, + "loss": 0.5013, + "step": 5637 + }, + { + "epoch": 0.6860967447520535, + "grad_norm": 0.7382751703262329, + "learning_rate": 1.5099897664113798e-05, + "loss": 0.4878, + "step": 5638 + }, + { + "epoch": 0.6862184362640705, + "grad_norm": 2.165334463119507, + "learning_rate": 1.5098219618232901e-05, + "loss": 0.4351, + "step": 5639 + }, + { + "epoch": 0.6863401277760877, + "grad_norm": 2.846501350402832, + "learning_rate": 1.509654137835419e-05, + "loss": 0.4026, + "step": 5640 + }, + { + "epoch": 0.6864618192881047, + "grad_norm": 0.986497700214386, + "learning_rate": 1.5094862944541531e-05, + "loss": 0.4494, + "step": 5641 + }, + { + "epoch": 0.6865835108001217, + "grad_norm": 0.9017858505249023, + "learning_rate": 1.509318431685879e-05, + "loss": 0.45, + "step": 5642 + }, + { + "epoch": 0.6867052023121387, + "grad_norm": 0.8901139497756958, + "learning_rate": 1.509150549536985e-05, + "loss": 0.4423, + "step": 5643 + }, + { + "epoch": 0.6868268938241557, + "grad_norm": 2.297985553741455, + "learning_rate": 1.5089826480138578e-05, + "loss": 0.4732, + "step": 5644 + }, + { + "epoch": 0.6869485853361728, + "grad_norm": 1.1578972339630127, + "learning_rate": 1.508814727122888e-05, + "loss": 0.3979, + "step": 5645 + }, + { + "epoch": 0.6870702768481899, + "grad_norm": 2.9678287506103516, + "learning_rate": 1.5086467868704641e-05, + "loss": 0.462, + "step": 5646 + }, + { + "epoch": 0.6871919683602069, + "grad_norm": 1.9689537286758423, + "learning_rate": 1.5084788272629776e-05, + "loss": 0.4712, + "step": 5647 + }, + { + "epoch": 0.6873136598722239, + "grad_norm": 3.804685354232788, + "learning_rate": 1.5083108483068187e-05, + "loss": 0.5089, + "step": 5648 + }, + { + "epoch": 0.687435351384241, + "grad_norm": 3.8038132190704346, + "learning_rate": 1.5081428500083803e-05, + "loss": 0.5358, + "step": 5649 + }, + { + "epoch": 0.687557042896258, + "grad_norm": 0.9257991909980774, + "learning_rate": 1.5079748323740546e-05, + "loss": 0.4731, + "step": 5650 + }, + { + "epoch": 0.687678734408275, + "grad_norm": 1.4556457996368408, + "learning_rate": 1.507806795410235e-05, + "loss": 0.5403, + "step": 5651 + }, + { + "epoch": 0.6878004259202921, + "grad_norm": 3.0026285648345947, + "learning_rate": 1.5076387391233156e-05, + "loss": 0.4319, + "step": 5652 + }, + { + "epoch": 0.6879221174323091, + "grad_norm": 2.762108564376831, + "learning_rate": 1.5074706635196917e-05, + "loss": 0.4569, + "step": 5653 + }, + { + "epoch": 0.6880438089443262, + "grad_norm": 3.7026870250701904, + "learning_rate": 1.5073025686057584e-05, + "loss": 0.4529, + "step": 5654 + }, + { + "epoch": 0.6881655004563432, + "grad_norm": 0.7228449583053589, + "learning_rate": 1.5071344543879127e-05, + "loss": 0.5242, + "step": 5655 + }, + { + "epoch": 0.6882871919683602, + "grad_norm": 5.228067398071289, + "learning_rate": 1.506966320872551e-05, + "loss": 0.4504, + "step": 5656 + }, + { + "epoch": 0.6884088834803772, + "grad_norm": 1.1155802011489868, + "learning_rate": 1.5067981680660713e-05, + "loss": 0.5388, + "step": 5657 + }, + { + "epoch": 0.6885305749923942, + "grad_norm": 3.316756248474121, + "learning_rate": 1.5066299959748727e-05, + "loss": 0.412, + "step": 5658 + }, + { + "epoch": 0.6886522665044114, + "grad_norm": 3.818772077560425, + "learning_rate": 1.5064618046053536e-05, + "loss": 0.3747, + "step": 5659 + }, + { + "epoch": 0.6887739580164284, + "grad_norm": 1.189836859703064, + "learning_rate": 1.506293593963915e-05, + "loss": 0.4608, + "step": 5660 + }, + { + "epoch": 0.6888956495284454, + "grad_norm": 1.1319499015808105, + "learning_rate": 1.5061253640569567e-05, + "loss": 0.4224, + "step": 5661 + }, + { + "epoch": 0.6890173410404624, + "grad_norm": 1.5180237293243408, + "learning_rate": 1.5059571148908808e-05, + "loss": 0.4526, + "step": 5662 + }, + { + "epoch": 0.6891390325524794, + "grad_norm": 1.438735842704773, + "learning_rate": 1.5057888464720899e-05, + "loss": 0.4363, + "step": 5663 + }, + { + "epoch": 0.6892607240644965, + "grad_norm": 0.6268392205238342, + "learning_rate": 1.5056205588069856e-05, + "loss": 0.4438, + "step": 5664 + }, + { + "epoch": 0.6893824155765136, + "grad_norm": 1.0462889671325684, + "learning_rate": 1.5054522519019731e-05, + "loss": 0.4679, + "step": 5665 + }, + { + "epoch": 0.6895041070885306, + "grad_norm": 4.216460704803467, + "learning_rate": 1.5052839257634562e-05, + "loss": 0.5472, + "step": 5666 + }, + { + "epoch": 0.6896257986005476, + "grad_norm": 1.432166337966919, + "learning_rate": 1.5051155803978397e-05, + "loss": 0.4426, + "step": 5667 + }, + { + "epoch": 0.6897474901125646, + "grad_norm": 1.6374738216400146, + "learning_rate": 1.5049472158115302e-05, + "loss": 0.4699, + "step": 5668 + }, + { + "epoch": 0.6898691816245817, + "grad_norm": 4.411357402801514, + "learning_rate": 1.5047788320109335e-05, + "loss": 0.4548, + "step": 5669 + }, + { + "epoch": 0.6899908731365987, + "grad_norm": 1.863133430480957, + "learning_rate": 1.5046104290024577e-05, + "loss": 0.4958, + "step": 5670 + }, + { + "epoch": 0.6901125646486158, + "grad_norm": 1.1183873414993286, + "learning_rate": 1.5044420067925104e-05, + "loss": 0.4668, + "step": 5671 + }, + { + "epoch": 0.6902342561606328, + "grad_norm": 0.8334675431251526, + "learning_rate": 1.5042735653875008e-05, + "loss": 0.4572, + "step": 5672 + }, + { + "epoch": 0.6903559476726499, + "grad_norm": 1.472253441810608, + "learning_rate": 1.5041051047938387e-05, + "loss": 0.4418, + "step": 5673 + }, + { + "epoch": 0.6904776391846669, + "grad_norm": 1.1135720014572144, + "learning_rate": 1.503936625017933e-05, + "loss": 0.4828, + "step": 5674 + }, + { + "epoch": 0.6905993306966839, + "grad_norm": 1.8146601915359497, + "learning_rate": 1.5037681260661963e-05, + "loss": 0.4016, + "step": 5675 + }, + { + "epoch": 0.6907210222087009, + "grad_norm": 2.1689348220825195, + "learning_rate": 1.5035996079450392e-05, + "loss": 0.4691, + "step": 5676 + }, + { + "epoch": 0.6908427137207179, + "grad_norm": 2.723686456680298, + "learning_rate": 1.503431070660875e-05, + "loss": 0.4806, + "step": 5677 + }, + { + "epoch": 0.6909644052327351, + "grad_norm": 5.7829437255859375, + "learning_rate": 1.5032625142201163e-05, + "loss": 0.5798, + "step": 5678 + }, + { + "epoch": 0.6910860967447521, + "grad_norm": 0.8585506081581116, + "learning_rate": 1.5030939386291775e-05, + "loss": 0.455, + "step": 5679 + }, + { + "epoch": 0.6912077882567691, + "grad_norm": 0.9236282110214233, + "learning_rate": 1.5029253438944725e-05, + "loss": 0.4338, + "step": 5680 + }, + { + "epoch": 0.6913294797687861, + "grad_norm": 1.8230072259902954, + "learning_rate": 1.5027567300224175e-05, + "loss": 0.4613, + "step": 5681 + }, + { + "epoch": 0.6914511712808031, + "grad_norm": 0.6471611261367798, + "learning_rate": 1.5025880970194282e-05, + "loss": 0.4345, + "step": 5682 + }, + { + "epoch": 0.6915728627928202, + "grad_norm": 1.7413103580474854, + "learning_rate": 1.5024194448919217e-05, + "loss": 0.4108, + "step": 5683 + }, + { + "epoch": 0.6916945543048373, + "grad_norm": 0.8616017699241638, + "learning_rate": 1.502250773646315e-05, + "loss": 0.4738, + "step": 5684 + }, + { + "epoch": 0.6918162458168543, + "grad_norm": 0.7052682638168335, + "learning_rate": 1.502082083289027e-05, + "loss": 0.4773, + "step": 5685 + }, + { + "epoch": 0.6919379373288713, + "grad_norm": 0.9195804595947266, + "learning_rate": 1.5019133738264764e-05, + "loss": 0.453, + "step": 5686 + }, + { + "epoch": 0.6920596288408883, + "grad_norm": 3.081632614135742, + "learning_rate": 1.501744645265083e-05, + "loss": 0.492, + "step": 5687 + }, + { + "epoch": 0.6921813203529054, + "grad_norm": 2.3086626529693604, + "learning_rate": 1.5015758976112675e-05, + "loss": 0.5479, + "step": 5688 + }, + { + "epoch": 0.6923030118649224, + "grad_norm": 2.546926259994507, + "learning_rate": 1.5014071308714508e-05, + "loss": 0.4258, + "step": 5689 + }, + { + "epoch": 0.6924247033769395, + "grad_norm": 1.9054832458496094, + "learning_rate": 1.5012383450520549e-05, + "loss": 0.4514, + "step": 5690 + }, + { + "epoch": 0.6925463948889565, + "grad_norm": 1.3438888788223267, + "learning_rate": 1.5010695401595024e-05, + "loss": 0.4868, + "step": 5691 + }, + { + "epoch": 0.6926680864009735, + "grad_norm": 0.9181016683578491, + "learning_rate": 1.5009007162002167e-05, + "loss": 0.4442, + "step": 5692 + }, + { + "epoch": 0.6927897779129906, + "grad_norm": 1.5741522312164307, + "learning_rate": 1.500731873180622e-05, + "loss": 0.4839, + "step": 5693 + }, + { + "epoch": 0.6929114694250076, + "grad_norm": 1.6062912940979004, + "learning_rate": 1.500563011107143e-05, + "loss": 0.473, + "step": 5694 + }, + { + "epoch": 0.6930331609370246, + "grad_norm": 2.201292037963867, + "learning_rate": 1.5003941299862055e-05, + "loss": 0.4974, + "step": 5695 + }, + { + "epoch": 0.6931548524490416, + "grad_norm": 0.7437626123428345, + "learning_rate": 1.5002252298242356e-05, + "loss": 0.4317, + "step": 5696 + }, + { + "epoch": 0.6932765439610588, + "grad_norm": 0.65212082862854, + "learning_rate": 1.50005631062766e-05, + "loss": 0.4424, + "step": 5697 + }, + { + "epoch": 0.6933982354730758, + "grad_norm": 2.1397335529327393, + "learning_rate": 1.4998873724029068e-05, + "loss": 0.4782, + "step": 5698 + }, + { + "epoch": 0.6935199269850928, + "grad_norm": 0.9799501299858093, + "learning_rate": 1.4997184151564046e-05, + "loss": 0.4234, + "step": 5699 + }, + { + "epoch": 0.6936416184971098, + "grad_norm": 1.6238120794296265, + "learning_rate": 1.4995494388945821e-05, + "loss": 0.4468, + "step": 5700 + }, + { + "epoch": 0.6937633100091268, + "grad_norm": 1.225841999053955, + "learning_rate": 1.4993804436238696e-05, + "loss": 0.4318, + "step": 5701 + }, + { + "epoch": 0.6938850015211439, + "grad_norm": 0.6529251337051392, + "learning_rate": 1.4992114293506976e-05, + "loss": 0.419, + "step": 5702 + }, + { + "epoch": 0.694006693033161, + "grad_norm": 1.2169039249420166, + "learning_rate": 1.4990423960814972e-05, + "loss": 0.3727, + "step": 5703 + }, + { + "epoch": 0.694128384545178, + "grad_norm": 0.755707859992981, + "learning_rate": 1.4988733438227008e-05, + "loss": 0.5071, + "step": 5704 + }, + { + "epoch": 0.694250076057195, + "grad_norm": 1.2898119688034058, + "learning_rate": 1.498704272580741e-05, + "loss": 0.5125, + "step": 5705 + }, + { + "epoch": 0.694371767569212, + "grad_norm": 1.5574294328689575, + "learning_rate": 1.4985351823620516e-05, + "loss": 0.5136, + "step": 5706 + }, + { + "epoch": 0.6944934590812291, + "grad_norm": 1.4448221921920776, + "learning_rate": 1.4983660731730663e-05, + "loss": 0.4437, + "step": 5707 + }, + { + "epoch": 0.6946151505932461, + "grad_norm": 2.195535182952881, + "learning_rate": 1.4981969450202204e-05, + "loss": 0.3932, + "step": 5708 + }, + { + "epoch": 0.6947368421052632, + "grad_norm": 0.743669331073761, + "learning_rate": 1.4980277979099499e-05, + "loss": 0.4938, + "step": 5709 + }, + { + "epoch": 0.6948585336172802, + "grad_norm": 2.3098220825195312, + "learning_rate": 1.4978586318486903e-05, + "loss": 0.5058, + "step": 5710 + }, + { + "epoch": 0.6949802251292972, + "grad_norm": 1.493943452835083, + "learning_rate": 1.4976894468428796e-05, + "loss": 0.512, + "step": 5711 + }, + { + "epoch": 0.6951019166413143, + "grad_norm": 2.427304744720459, + "learning_rate": 1.497520242898955e-05, + "loss": 0.5396, + "step": 5712 + }, + { + "epoch": 0.6952236081533313, + "grad_norm": 0.9201792478561401, + "learning_rate": 1.4973510200233556e-05, + "loss": 0.487, + "step": 5713 + }, + { + "epoch": 0.6953452996653483, + "grad_norm": 2.018176317214966, + "learning_rate": 1.4971817782225202e-05, + "loss": 0.4906, + "step": 5714 + }, + { + "epoch": 0.6954669911773653, + "grad_norm": 2.691685199737549, + "learning_rate": 1.4970125175028894e-05, + "loss": 0.4541, + "step": 5715 + }, + { + "epoch": 0.6955886826893825, + "grad_norm": 1.2946938276290894, + "learning_rate": 1.4968432378709032e-05, + "loss": 0.4812, + "step": 5716 + }, + { + "epoch": 0.6957103742013995, + "grad_norm": 0.8203717470169067, + "learning_rate": 1.4966739393330034e-05, + "loss": 0.4848, + "step": 5717 + }, + { + "epoch": 0.6958320657134165, + "grad_norm": 1.0984758138656616, + "learning_rate": 1.4965046218956324e-05, + "loss": 0.4805, + "step": 5718 + }, + { + "epoch": 0.6959537572254335, + "grad_norm": 0.6645656228065491, + "learning_rate": 1.4963352855652326e-05, + "loss": 0.4598, + "step": 5719 + }, + { + "epoch": 0.6960754487374505, + "grad_norm": 1.2504998445510864, + "learning_rate": 1.4961659303482478e-05, + "loss": 0.4238, + "step": 5720 + }, + { + "epoch": 0.6961971402494675, + "grad_norm": 1.5913711786270142, + "learning_rate": 1.4959965562511224e-05, + "loss": 0.513, + "step": 5721 + }, + { + "epoch": 0.6963188317614847, + "grad_norm": 1.1032487154006958, + "learning_rate": 1.4958271632803017e-05, + "loss": 0.4615, + "step": 5722 + }, + { + "epoch": 0.6964405232735017, + "grad_norm": 1.172655701637268, + "learning_rate": 1.4956577514422308e-05, + "loss": 0.4485, + "step": 5723 + }, + { + "epoch": 0.6965622147855187, + "grad_norm": 1.7108309268951416, + "learning_rate": 1.4954883207433566e-05, + "loss": 0.429, + "step": 5724 + }, + { + "epoch": 0.6966839062975357, + "grad_norm": 1.714949369430542, + "learning_rate": 1.4953188711901261e-05, + "loss": 0.403, + "step": 5725 + }, + { + "epoch": 0.6968055978095528, + "grad_norm": 1.5385502576828003, + "learning_rate": 1.4951494027889872e-05, + "loss": 0.4724, + "step": 5726 + }, + { + "epoch": 0.6969272893215698, + "grad_norm": 0.7456535696983337, + "learning_rate": 1.494979915546389e-05, + "loss": 0.4401, + "step": 5727 + }, + { + "epoch": 0.6970489808335869, + "grad_norm": 0.6776686906814575, + "learning_rate": 1.4948104094687802e-05, + "loss": 0.4696, + "step": 5728 + }, + { + "epoch": 0.6971706723456039, + "grad_norm": 1.2693860530853271, + "learning_rate": 1.494640884562611e-05, + "loss": 0.426, + "step": 5729 + }, + { + "epoch": 0.697292363857621, + "grad_norm": 0.7595444917678833, + "learning_rate": 1.4944713408343325e-05, + "loss": 0.395, + "step": 5730 + }, + { + "epoch": 0.697414055369638, + "grad_norm": 1.4870376586914062, + "learning_rate": 1.494301778290396e-05, + "loss": 0.4203, + "step": 5731 + }, + { + "epoch": 0.697535746881655, + "grad_norm": 2.3356668949127197, + "learning_rate": 1.4941321969372536e-05, + "loss": 0.4833, + "step": 5732 + }, + { + "epoch": 0.697657438393672, + "grad_norm": 0.7620767951011658, + "learning_rate": 1.4939625967813581e-05, + "loss": 0.466, + "step": 5733 + }, + { + "epoch": 0.6977791299056891, + "grad_norm": 3.1272616386413574, + "learning_rate": 1.4937929778291638e-05, + "loss": 0.5087, + "step": 5734 + }, + { + "epoch": 0.6979008214177062, + "grad_norm": 3.444051742553711, + "learning_rate": 1.4936233400871242e-05, + "loss": 0.5168, + "step": 5735 + }, + { + "epoch": 0.6980225129297232, + "grad_norm": 1.9624532461166382, + "learning_rate": 1.4934536835616951e-05, + "loss": 0.4502, + "step": 5736 + }, + { + "epoch": 0.6981442044417402, + "grad_norm": 2.0625057220458984, + "learning_rate": 1.4932840082593317e-05, + "loss": 0.4729, + "step": 5737 + }, + { + "epoch": 0.6982658959537572, + "grad_norm": 1.064643144607544, + "learning_rate": 1.4931143141864908e-05, + "loss": 0.4574, + "step": 5738 + }, + { + "epoch": 0.6983875874657742, + "grad_norm": 1.589437484741211, + "learning_rate": 1.4929446013496297e-05, + "loss": 0.4285, + "step": 5739 + }, + { + "epoch": 0.6985092789777912, + "grad_norm": 0.9425216913223267, + "learning_rate": 1.4927748697552058e-05, + "loss": 0.4235, + "step": 5740 + }, + { + "epoch": 0.6986309704898084, + "grad_norm": 0.6291329860687256, + "learning_rate": 1.4926051194096785e-05, + "loss": 0.4441, + "step": 5741 + }, + { + "epoch": 0.6987526620018254, + "grad_norm": 1.7625223398208618, + "learning_rate": 1.4924353503195068e-05, + "loss": 0.4293, + "step": 5742 + }, + { + "epoch": 0.6988743535138424, + "grad_norm": 1.1574428081512451, + "learning_rate": 1.4922655624911503e-05, + "loss": 0.4465, + "step": 5743 + }, + { + "epoch": 0.6989960450258594, + "grad_norm": 2.1988799571990967, + "learning_rate": 1.4920957559310704e-05, + "loss": 0.5103, + "step": 5744 + }, + { + "epoch": 0.6991177365378765, + "grad_norm": 0.7254383563995361, + "learning_rate": 1.4919259306457289e-05, + "loss": 0.4544, + "step": 5745 + }, + { + "epoch": 0.6992394280498935, + "grad_norm": 1.0539625883102417, + "learning_rate": 1.4917560866415868e-05, + "loss": 0.4843, + "step": 5746 + }, + { + "epoch": 0.6993611195619106, + "grad_norm": 1.2837316989898682, + "learning_rate": 1.4915862239251081e-05, + "loss": 0.4656, + "step": 5747 + }, + { + "epoch": 0.6994828110739276, + "grad_norm": 0.7310906052589417, + "learning_rate": 1.4914163425027559e-05, + "loss": 0.4958, + "step": 5748 + }, + { + "epoch": 0.6996045025859446, + "grad_norm": 0.942044734954834, + "learning_rate": 1.491246442380995e-05, + "loss": 0.4321, + "step": 5749 + }, + { + "epoch": 0.6997261940979617, + "grad_norm": 2.467073678970337, + "learning_rate": 1.4910765235662898e-05, + "loss": 0.4225, + "step": 5750 + }, + { + "epoch": 0.6998478856099787, + "grad_norm": 2.887951374053955, + "learning_rate": 1.4909065860651064e-05, + "loss": 0.4274, + "step": 5751 + }, + { + "epoch": 0.6999695771219957, + "grad_norm": 0.6570051908493042, + "learning_rate": 1.4907366298839115e-05, + "loss": 0.4795, + "step": 5752 + }, + { + "epoch": 0.7000912686340128, + "grad_norm": 0.7301941514015198, + "learning_rate": 1.4905666550291723e-05, + "loss": 0.4611, + "step": 5753 + }, + { + "epoch": 0.7002129601460299, + "grad_norm": 2.472259521484375, + "learning_rate": 1.4903966615073558e-05, + "loss": 0.4582, + "step": 5754 + }, + { + "epoch": 0.7003346516580469, + "grad_norm": 2.261049270629883, + "learning_rate": 1.4902266493249315e-05, + "loss": 0.5226, + "step": 5755 + }, + { + "epoch": 0.7004563431700639, + "grad_norm": 1.559216856956482, + "learning_rate": 1.4900566184883687e-05, + "loss": 0.4821, + "step": 5756 + }, + { + "epoch": 0.7005780346820809, + "grad_norm": 3.4437015056610107, + "learning_rate": 1.489886569004137e-05, + "loss": 0.3834, + "step": 5757 + }, + { + "epoch": 0.7006997261940979, + "grad_norm": 2.4797775745391846, + "learning_rate": 1.4897165008787075e-05, + "loss": 0.451, + "step": 5758 + }, + { + "epoch": 0.700821417706115, + "grad_norm": 1.560303807258606, + "learning_rate": 1.4895464141185513e-05, + "loss": 0.4415, + "step": 5759 + }, + { + "epoch": 0.7009431092181321, + "grad_norm": 0.6299927234649658, + "learning_rate": 1.4893763087301409e-05, + "loss": 0.4945, + "step": 5760 + }, + { + "epoch": 0.7010648007301491, + "grad_norm": 0.8315796256065369, + "learning_rate": 1.489206184719949e-05, + "loss": 0.4082, + "step": 5761 + }, + { + "epoch": 0.7011864922421661, + "grad_norm": 2.6863346099853516, + "learning_rate": 1.489036042094449e-05, + "loss": 0.5307, + "step": 5762 + }, + { + "epoch": 0.7013081837541831, + "grad_norm": 1.8284863233566284, + "learning_rate": 1.4888658808601154e-05, + "loss": 0.3859, + "step": 5763 + }, + { + "epoch": 0.7014298752662002, + "grad_norm": 0.75916588306427, + "learning_rate": 1.4886957010234234e-05, + "loss": 0.4393, + "step": 5764 + }, + { + "epoch": 0.7015515667782172, + "grad_norm": 2.136780261993408, + "learning_rate": 1.4885255025908483e-05, + "loss": 0.4948, + "step": 5765 + }, + { + "epoch": 0.7016732582902343, + "grad_norm": 2.5012426376342773, + "learning_rate": 1.4883552855688663e-05, + "loss": 0.4917, + "step": 5766 + }, + { + "epoch": 0.7017949498022513, + "grad_norm": 2.0736212730407715, + "learning_rate": 1.488185049963955e-05, + "loss": 0.505, + "step": 5767 + }, + { + "epoch": 0.7019166413142683, + "grad_norm": 1.3989472389221191, + "learning_rate": 1.4880147957825923e-05, + "loss": 0.4589, + "step": 5768 + }, + { + "epoch": 0.7020383328262854, + "grad_norm": 2.407496452331543, + "learning_rate": 1.4878445230312564e-05, + "loss": 0.4992, + "step": 5769 + }, + { + "epoch": 0.7021600243383024, + "grad_norm": 0.6087340116500854, + "learning_rate": 1.4876742317164266e-05, + "loss": 0.4994, + "step": 5770 + }, + { + "epoch": 0.7022817158503194, + "grad_norm": 3.714498996734619, + "learning_rate": 1.4875039218445829e-05, + "loss": 0.4473, + "step": 5771 + }, + { + "epoch": 0.7024034073623365, + "grad_norm": 1.873753309249878, + "learning_rate": 1.4873335934222058e-05, + "loss": 0.4541, + "step": 5772 + }, + { + "epoch": 0.7025250988743535, + "grad_norm": 0.7393276691436768, + "learning_rate": 1.4871632464557765e-05, + "loss": 0.4965, + "step": 5773 + }, + { + "epoch": 0.7026467903863706, + "grad_norm": 4.063852787017822, + "learning_rate": 1.4869928809517775e-05, + "loss": 0.4466, + "step": 5774 + }, + { + "epoch": 0.7027684818983876, + "grad_norm": 1.7568515539169312, + "learning_rate": 1.4868224969166916e-05, + "loss": 0.4829, + "step": 5775 + }, + { + "epoch": 0.7028901734104046, + "grad_norm": 0.6663513779640198, + "learning_rate": 1.486652094357002e-05, + "loss": 0.4925, + "step": 5776 + }, + { + "epoch": 0.7030118649224216, + "grad_norm": 1.2031806707382202, + "learning_rate": 1.4864816732791924e-05, + "loss": 0.4852, + "step": 5777 + }, + { + "epoch": 0.7031335564344386, + "grad_norm": 3.4352993965148926, + "learning_rate": 1.4863112336897488e-05, + "loss": 0.4169, + "step": 5778 + }, + { + "epoch": 0.7032552479464558, + "grad_norm": 0.6917054057121277, + "learning_rate": 1.4861407755951558e-05, + "loss": 0.5017, + "step": 5779 + }, + { + "epoch": 0.7033769394584728, + "grad_norm": 0.9249730110168457, + "learning_rate": 1.4859702990019e-05, + "loss": 0.4599, + "step": 5780 + }, + { + "epoch": 0.7034986309704898, + "grad_norm": 1.2479867935180664, + "learning_rate": 1.4857998039164688e-05, + "loss": 0.5164, + "step": 5781 + }, + { + "epoch": 0.7036203224825068, + "grad_norm": 2.1523680686950684, + "learning_rate": 1.4856292903453488e-05, + "loss": 0.4649, + "step": 5782 + }, + { + "epoch": 0.7037420139945239, + "grad_norm": 1.4893996715545654, + "learning_rate": 1.4854587582950297e-05, + "loss": 0.4061, + "step": 5783 + }, + { + "epoch": 0.7038637055065409, + "grad_norm": 1.368286371231079, + "learning_rate": 1.4852882077719998e-05, + "loss": 0.4246, + "step": 5784 + }, + { + "epoch": 0.703985397018558, + "grad_norm": 0.6861351728439331, + "learning_rate": 1.4851176387827491e-05, + "loss": 0.452, + "step": 5785 + }, + { + "epoch": 0.704107088530575, + "grad_norm": 2.3142786026000977, + "learning_rate": 1.484947051333768e-05, + "loss": 0.4771, + "step": 5786 + }, + { + "epoch": 0.704228780042592, + "grad_norm": 1.445865273475647, + "learning_rate": 1.4847764454315483e-05, + "loss": 0.3717, + "step": 5787 + }, + { + "epoch": 0.7043504715546091, + "grad_norm": 0.9301458597183228, + "learning_rate": 1.484605821082581e-05, + "loss": 0.4165, + "step": 5788 + }, + { + "epoch": 0.7044721630666261, + "grad_norm": 1.196312427520752, + "learning_rate": 1.484435178293359e-05, + "loss": 0.4422, + "step": 5789 + }, + { + "epoch": 0.7045938545786431, + "grad_norm": 0.9635326266288757, + "learning_rate": 1.484264517070376e-05, + "loss": 0.451, + "step": 5790 + }, + { + "epoch": 0.7047155460906602, + "grad_norm": 2.4443612098693848, + "learning_rate": 1.4840938374201258e-05, + "loss": 0.4943, + "step": 5791 + }, + { + "epoch": 0.7048372376026772, + "grad_norm": 2.8149466514587402, + "learning_rate": 1.4839231393491028e-05, + "loss": 0.444, + "step": 5792 + }, + { + "epoch": 0.7049589291146943, + "grad_norm": 0.6951661109924316, + "learning_rate": 1.483752422863803e-05, + "loss": 0.4546, + "step": 5793 + }, + { + "epoch": 0.7050806206267113, + "grad_norm": 1.6646931171417236, + "learning_rate": 1.483581687970722e-05, + "loss": 0.4362, + "step": 5794 + }, + { + "epoch": 0.7052023121387283, + "grad_norm": 2.798440933227539, + "learning_rate": 1.4834109346763568e-05, + "loss": 0.4205, + "step": 5795 + }, + { + "epoch": 0.7053240036507453, + "grad_norm": 0.9285602569580078, + "learning_rate": 1.4832401629872048e-05, + "loss": 0.4837, + "step": 5796 + }, + { + "epoch": 0.7054456951627623, + "grad_norm": 3.0468952655792236, + "learning_rate": 1.4830693729097646e-05, + "loss": 0.4108, + "step": 5797 + }, + { + "epoch": 0.7055673866747795, + "grad_norm": 1.4386199712753296, + "learning_rate": 1.4828985644505349e-05, + "loss": 0.4557, + "step": 5798 + }, + { + "epoch": 0.7056890781867965, + "grad_norm": 1.4633842706680298, + "learning_rate": 1.4827277376160152e-05, + "loss": 0.4327, + "step": 5799 + }, + { + "epoch": 0.7058107696988135, + "grad_norm": 4.524606227874756, + "learning_rate": 1.4825568924127059e-05, + "loss": 0.5267, + "step": 5800 + }, + { + "epoch": 0.7059324612108305, + "grad_norm": 1.3131097555160522, + "learning_rate": 1.4823860288471076e-05, + "loss": 0.4354, + "step": 5801 + }, + { + "epoch": 0.7060541527228476, + "grad_norm": 0.942453920841217, + "learning_rate": 1.4822151469257228e-05, + "loss": 0.4598, + "step": 5802 + }, + { + "epoch": 0.7061758442348646, + "grad_norm": 1.812514066696167, + "learning_rate": 1.4820442466550536e-05, + "loss": 0.4776, + "step": 5803 + }, + { + "epoch": 0.7062975357468817, + "grad_norm": 0.9279342889785767, + "learning_rate": 1.4818733280416032e-05, + "loss": 0.4733, + "step": 5804 + }, + { + "epoch": 0.7064192272588987, + "grad_norm": 0.8237505555152893, + "learning_rate": 1.4817023910918749e-05, + "loss": 0.4654, + "step": 5805 + }, + { + "epoch": 0.7065409187709157, + "grad_norm": 2.42615008354187, + "learning_rate": 1.4815314358123736e-05, + "loss": 0.4265, + "step": 5806 + }, + { + "epoch": 0.7066626102829328, + "grad_norm": 1.8335754871368408, + "learning_rate": 1.4813604622096044e-05, + "loss": 0.4393, + "step": 5807 + }, + { + "epoch": 0.7067843017949498, + "grad_norm": 3.235401153564453, + "learning_rate": 1.4811894702900733e-05, + "loss": 0.4272, + "step": 5808 + }, + { + "epoch": 0.7069059933069668, + "grad_norm": 0.8789718747138977, + "learning_rate": 1.481018460060287e-05, + "loss": 0.4828, + "step": 5809 + }, + { + "epoch": 0.7070276848189839, + "grad_norm": 1.8483392000198364, + "learning_rate": 1.4808474315267528e-05, + "loss": 0.4666, + "step": 5810 + }, + { + "epoch": 0.707149376331001, + "grad_norm": 2.138598918914795, + "learning_rate": 1.480676384695978e-05, + "loss": 0.4116, + "step": 5811 + }, + { + "epoch": 0.707271067843018, + "grad_norm": 1.3106712102890015, + "learning_rate": 1.4805053195744721e-05, + "loss": 0.4412, + "step": 5812 + }, + { + "epoch": 0.707392759355035, + "grad_norm": 3.930408477783203, + "learning_rate": 1.4803342361687444e-05, + "loss": 0.5671, + "step": 5813 + }, + { + "epoch": 0.707514450867052, + "grad_norm": 1.1056636571884155, + "learning_rate": 1.4801631344853043e-05, + "loss": 0.4313, + "step": 5814 + }, + { + "epoch": 0.707636142379069, + "grad_norm": 1.9966340065002441, + "learning_rate": 1.4799920145306632e-05, + "loss": 0.4761, + "step": 5815 + }, + { + "epoch": 0.7077578338910862, + "grad_norm": 0.7812060713768005, + "learning_rate": 1.4798208763113326e-05, + "loss": 0.4313, + "step": 5816 + }, + { + "epoch": 0.7078795254031032, + "grad_norm": 0.795521080493927, + "learning_rate": 1.4796497198338245e-05, + "loss": 0.3767, + "step": 5817 + }, + { + "epoch": 0.7080012169151202, + "grad_norm": 0.5660823583602905, + "learning_rate": 1.4794785451046517e-05, + "loss": 0.4524, + "step": 5818 + }, + { + "epoch": 0.7081229084271372, + "grad_norm": 1.1923648118972778, + "learning_rate": 1.4793073521303277e-05, + "loss": 0.4397, + "step": 5819 + }, + { + "epoch": 0.7082445999391542, + "grad_norm": 1.0181834697723389, + "learning_rate": 1.4791361409173668e-05, + "loss": 0.4499, + "step": 5820 + }, + { + "epoch": 0.7083662914511712, + "grad_norm": 0.6459413766860962, + "learning_rate": 1.478964911472284e-05, + "loss": 0.4158, + "step": 5821 + }, + { + "epoch": 0.7084879829631883, + "grad_norm": 0.7527841925621033, + "learning_rate": 1.4787936638015951e-05, + "loss": 0.4379, + "step": 5822 + }, + { + "epoch": 0.7086096744752054, + "grad_norm": 1.0385246276855469, + "learning_rate": 1.4786223979118161e-05, + "loss": 0.4924, + "step": 5823 + }, + { + "epoch": 0.7087313659872224, + "grad_norm": 2.1502439975738525, + "learning_rate": 1.4784511138094642e-05, + "loss": 0.4311, + "step": 5824 + }, + { + "epoch": 0.7088530574992394, + "grad_norm": 3.1310477256774902, + "learning_rate": 1.478279811501057e-05, + "loss": 0.5512, + "step": 5825 + }, + { + "epoch": 0.7089747490112565, + "grad_norm": 2.4541409015655518, + "learning_rate": 1.478108490993113e-05, + "loss": 0.5089, + "step": 5826 + }, + { + "epoch": 0.7090964405232735, + "grad_norm": 2.9800031185150146, + "learning_rate": 1.4779371522921513e-05, + "loss": 0.4152, + "step": 5827 + }, + { + "epoch": 0.7092181320352905, + "grad_norm": 3.3575425148010254, + "learning_rate": 1.477765795404692e-05, + "loss": 0.5093, + "step": 5828 + }, + { + "epoch": 0.7093398235473076, + "grad_norm": 4.567439556121826, + "learning_rate": 1.4775944203372547e-05, + "loss": 0.4041, + "step": 5829 + }, + { + "epoch": 0.7094615150593246, + "grad_norm": 1.471593976020813, + "learning_rate": 1.4774230270963614e-05, + "loss": 0.4673, + "step": 5830 + }, + { + "epoch": 0.7095832065713417, + "grad_norm": 1.7912744283676147, + "learning_rate": 1.4772516156885337e-05, + "loss": 0.5279, + "step": 5831 + }, + { + "epoch": 0.7097048980833587, + "grad_norm": 0.7484567165374756, + "learning_rate": 1.4770801861202943e-05, + "loss": 0.4824, + "step": 5832 + }, + { + "epoch": 0.7098265895953757, + "grad_norm": 1.3792976140975952, + "learning_rate": 1.4769087383981663e-05, + "loss": 0.4234, + "step": 5833 + }, + { + "epoch": 0.7099482811073927, + "grad_norm": 2.3992929458618164, + "learning_rate": 1.4767372725286735e-05, + "loss": 0.4106, + "step": 5834 + }, + { + "epoch": 0.7100699726194099, + "grad_norm": 0.61894291639328, + "learning_rate": 1.4765657885183407e-05, + "loss": 0.3979, + "step": 5835 + }, + { + "epoch": 0.7101916641314269, + "grad_norm": 2.3444623947143555, + "learning_rate": 1.4763942863736933e-05, + "loss": 0.4568, + "step": 5836 + }, + { + "epoch": 0.7103133556434439, + "grad_norm": 3.8838231563568115, + "learning_rate": 1.4762227661012572e-05, + "loss": 0.5065, + "step": 5837 + }, + { + "epoch": 0.7104350471554609, + "grad_norm": 4.756305694580078, + "learning_rate": 1.476051227707559e-05, + "loss": 0.5242, + "step": 5838 + }, + { + "epoch": 0.7105567386674779, + "grad_norm": 5.430038928985596, + "learning_rate": 1.4758796711991266e-05, + "loss": 0.5217, + "step": 5839 + }, + { + "epoch": 0.710678430179495, + "grad_norm": 3.640932321548462, + "learning_rate": 1.4757080965824875e-05, + "loss": 0.5106, + "step": 5840 + }, + { + "epoch": 0.710800121691512, + "grad_norm": 0.9580857753753662, + "learning_rate": 1.4755365038641704e-05, + "loss": 0.4355, + "step": 5841 + }, + { + "epoch": 0.7109218132035291, + "grad_norm": 2.5009093284606934, + "learning_rate": 1.475364893050705e-05, + "loss": 0.4956, + "step": 5842 + }, + { + "epoch": 0.7110435047155461, + "grad_norm": 2.2408010959625244, + "learning_rate": 1.4751932641486218e-05, + "loss": 0.5174, + "step": 5843 + }, + { + "epoch": 0.7111651962275631, + "grad_norm": 0.7956969141960144, + "learning_rate": 1.4750216171644512e-05, + "loss": 0.4878, + "step": 5844 + }, + { + "epoch": 0.7112868877395802, + "grad_norm": 1.8937441110610962, + "learning_rate": 1.4748499521047249e-05, + "loss": 0.5596, + "step": 5845 + }, + { + "epoch": 0.7114085792515972, + "grad_norm": 7.486874580383301, + "learning_rate": 1.4746782689759749e-05, + "loss": 0.4819, + "step": 5846 + }, + { + "epoch": 0.7115302707636142, + "grad_norm": 3.9388303756713867, + "learning_rate": 1.4745065677847343e-05, + "loss": 0.5238, + "step": 5847 + }, + { + "epoch": 0.7116519622756313, + "grad_norm": 8.442525863647461, + "learning_rate": 1.4743348485375366e-05, + "loss": 0.5231, + "step": 5848 + }, + { + "epoch": 0.7117736537876483, + "grad_norm": 7.823458671569824, + "learning_rate": 1.474163111240916e-05, + "loss": 0.518, + "step": 5849 + }, + { + "epoch": 0.7118953452996654, + "grad_norm": 6.702693462371826, + "learning_rate": 1.4739913559014076e-05, + "loss": 0.4794, + "step": 5850 + }, + { + "epoch": 0.7120170368116824, + "grad_norm": 5.421233654022217, + "learning_rate": 1.4738195825255473e-05, + "loss": 0.465, + "step": 5851 + }, + { + "epoch": 0.7121387283236994, + "grad_norm": 5.18204402923584, + "learning_rate": 1.4736477911198707e-05, + "loss": 0.5068, + "step": 5852 + }, + { + "epoch": 0.7122604198357164, + "grad_norm": 4.880541801452637, + "learning_rate": 1.4734759816909155e-05, + "loss": 0.4628, + "step": 5853 + }, + { + "epoch": 0.7123821113477335, + "grad_norm": 4.086813449859619, + "learning_rate": 1.4733041542452191e-05, + "loss": 0.4474, + "step": 5854 + }, + { + "epoch": 0.7125038028597506, + "grad_norm": 2.828464984893799, + "learning_rate": 1.4731323087893197e-05, + "loss": 0.4365, + "step": 5855 + }, + { + "epoch": 0.7126254943717676, + "grad_norm": 1.823262095451355, + "learning_rate": 1.472960445329757e-05, + "loss": 0.5002, + "step": 5856 + }, + { + "epoch": 0.7127471858837846, + "grad_norm": 2.4425671100616455, + "learning_rate": 1.47278856387307e-05, + "loss": 0.4883, + "step": 5857 + }, + { + "epoch": 0.7128688773958016, + "grad_norm": 2.2251341342926025, + "learning_rate": 1.4726166644258e-05, + "loss": 0.4824, + "step": 5858 + }, + { + "epoch": 0.7129905689078186, + "grad_norm": 1.1528090238571167, + "learning_rate": 1.4724447469944873e-05, + "loss": 0.4208, + "step": 5859 + }, + { + "epoch": 0.7131122604198357, + "grad_norm": 2.3191587924957275, + "learning_rate": 1.4722728115856742e-05, + "loss": 0.4846, + "step": 5860 + }, + { + "epoch": 0.7132339519318528, + "grad_norm": 1.4297856092453003, + "learning_rate": 1.4721008582059029e-05, + "loss": 0.4338, + "step": 5861 + }, + { + "epoch": 0.7133556434438698, + "grad_norm": 2.150423765182495, + "learning_rate": 1.4719288868617172e-05, + "loss": 0.4886, + "step": 5862 + }, + { + "epoch": 0.7134773349558868, + "grad_norm": 3.3077571392059326, + "learning_rate": 1.4717568975596601e-05, + "loss": 0.5114, + "step": 5863 + }, + { + "epoch": 0.7135990264679039, + "grad_norm": 0.746435821056366, + "learning_rate": 1.4715848903062766e-05, + "loss": 0.4851, + "step": 5864 + }, + { + "epoch": 0.7137207179799209, + "grad_norm": 1.1853212118148804, + "learning_rate": 1.4714128651081118e-05, + "loss": 0.4575, + "step": 5865 + }, + { + "epoch": 0.7138424094919379, + "grad_norm": 3.2545742988586426, + "learning_rate": 1.471240821971712e-05, + "loss": 0.4258, + "step": 5866 + }, + { + "epoch": 0.713964101003955, + "grad_norm": 2.2967448234558105, + "learning_rate": 1.4710687609036234e-05, + "loss": 0.4682, + "step": 5867 + }, + { + "epoch": 0.714085792515972, + "grad_norm": 2.816882371902466, + "learning_rate": 1.4708966819103936e-05, + "loss": 0.4652, + "step": 5868 + }, + { + "epoch": 0.7142074840279891, + "grad_norm": 1.005064845085144, + "learning_rate": 1.47072458499857e-05, + "loss": 0.47, + "step": 5869 + }, + { + "epoch": 0.7143291755400061, + "grad_norm": 2.8408045768737793, + "learning_rate": 1.4705524701747018e-05, + "loss": 0.4671, + "step": 5870 + }, + { + "epoch": 0.7144508670520231, + "grad_norm": 0.9220614433288574, + "learning_rate": 1.470380337445338e-05, + "loss": 0.4954, + "step": 5871 + }, + { + "epoch": 0.7145725585640401, + "grad_norm": 0.7577651739120483, + "learning_rate": 1.4702081868170286e-05, + "loss": 0.4406, + "step": 5872 + }, + { + "epoch": 0.7146942500760572, + "grad_norm": 2.3003900051116943, + "learning_rate": 1.4700360182963246e-05, + "loss": 0.4832, + "step": 5873 + }, + { + "epoch": 0.7148159415880743, + "grad_norm": 3.662625312805176, + "learning_rate": 1.4698638318897773e-05, + "loss": 0.5059, + "step": 5874 + }, + { + "epoch": 0.7149376331000913, + "grad_norm": 3.140347480773926, + "learning_rate": 1.4696916276039386e-05, + "loss": 0.4814, + "step": 5875 + }, + { + "epoch": 0.7150593246121083, + "grad_norm": 2.9149513244628906, + "learning_rate": 1.469519405445361e-05, + "loss": 0.5218, + "step": 5876 + }, + { + "epoch": 0.7151810161241253, + "grad_norm": 0.9762936234474182, + "learning_rate": 1.4693471654205986e-05, + "loss": 0.4256, + "step": 5877 + }, + { + "epoch": 0.7153027076361423, + "grad_norm": 0.8658326268196106, + "learning_rate": 1.4691749075362047e-05, + "loss": 0.475, + "step": 5878 + }, + { + "epoch": 0.7154243991481594, + "grad_norm": 2.1648755073547363, + "learning_rate": 1.4690026317987345e-05, + "loss": 0.5038, + "step": 5879 + }, + { + "epoch": 0.7155460906601765, + "grad_norm": 0.8065931797027588, + "learning_rate": 1.4688303382147433e-05, + "loss": 0.4604, + "step": 5880 + }, + { + "epoch": 0.7156677821721935, + "grad_norm": 1.0664634704589844, + "learning_rate": 1.4686580267907874e-05, + "loss": 0.4805, + "step": 5881 + }, + { + "epoch": 0.7157894736842105, + "grad_norm": 1.085752248764038, + "learning_rate": 1.4684856975334236e-05, + "loss": 0.4813, + "step": 5882 + }, + { + "epoch": 0.7159111651962276, + "grad_norm": 3.1130802631378174, + "learning_rate": 1.468313350449209e-05, + "loss": 0.4718, + "step": 5883 + }, + { + "epoch": 0.7160328567082446, + "grad_norm": 1.487403154373169, + "learning_rate": 1.4681409855447025e-05, + "loss": 0.4896, + "step": 5884 + }, + { + "epoch": 0.7161545482202616, + "grad_norm": 3.086695432662964, + "learning_rate": 1.4679686028264623e-05, + "loss": 0.4607, + "step": 5885 + }, + { + "epoch": 0.7162762397322787, + "grad_norm": 0.9925442337989807, + "learning_rate": 1.4677962023010482e-05, + "loss": 0.492, + "step": 5886 + }, + { + "epoch": 0.7163979312442957, + "grad_norm": 2.014378309249878, + "learning_rate": 1.46762378397502e-05, + "loss": 0.4685, + "step": 5887 + }, + { + "epoch": 0.7165196227563128, + "grad_norm": 3.0304832458496094, + "learning_rate": 1.4674513478549392e-05, + "loss": 0.5593, + "step": 5888 + }, + { + "epoch": 0.7166413142683298, + "grad_norm": 0.8478283882141113, + "learning_rate": 1.467278893947367e-05, + "loss": 0.4695, + "step": 5889 + }, + { + "epoch": 0.7167630057803468, + "grad_norm": 1.388184905052185, + "learning_rate": 1.4671064222588655e-05, + "loss": 0.4229, + "step": 5890 + }, + { + "epoch": 0.7168846972923638, + "grad_norm": 3.3339555263519287, + "learning_rate": 1.466933932795998e-05, + "loss": 0.5191, + "step": 5891 + }, + { + "epoch": 0.717006388804381, + "grad_norm": 2.435145854949951, + "learning_rate": 1.466761425565328e-05, + "loss": 0.3819, + "step": 5892 + }, + { + "epoch": 0.717128080316398, + "grad_norm": 2.5282704830169678, + "learning_rate": 1.466588900573419e-05, + "loss": 0.3678, + "step": 5893 + }, + { + "epoch": 0.717249771828415, + "grad_norm": 4.2987799644470215, + "learning_rate": 1.466416357826837e-05, + "loss": 0.5137, + "step": 5894 + }, + { + "epoch": 0.717371463340432, + "grad_norm": 2.4013993740081787, + "learning_rate": 1.466243797332147e-05, + "loss": 0.4572, + "step": 5895 + }, + { + "epoch": 0.717493154852449, + "grad_norm": 1.6005690097808838, + "learning_rate": 1.4660712190959156e-05, + "loss": 0.4596, + "step": 5896 + }, + { + "epoch": 0.717614846364466, + "grad_norm": 1.3890860080718994, + "learning_rate": 1.4658986231247096e-05, + "loss": 0.4286, + "step": 5897 + }, + { + "epoch": 0.7177365378764831, + "grad_norm": 2.1254982948303223, + "learning_rate": 1.4657260094250965e-05, + "loss": 0.5241, + "step": 5898 + }, + { + "epoch": 0.7178582293885002, + "grad_norm": 0.6179935336112976, + "learning_rate": 1.4655533780036447e-05, + "loss": 0.4128, + "step": 5899 + }, + { + "epoch": 0.7179799209005172, + "grad_norm": 1.0235605239868164, + "learning_rate": 1.4653807288669234e-05, + "loss": 0.459, + "step": 5900 + }, + { + "epoch": 0.7181016124125342, + "grad_norm": 1.7978922128677368, + "learning_rate": 1.4652080620215019e-05, + "loss": 0.4812, + "step": 5901 + }, + { + "epoch": 0.7182233039245512, + "grad_norm": 0.7480745315551758, + "learning_rate": 1.465035377473951e-05, + "loss": 0.5349, + "step": 5902 + }, + { + "epoch": 0.7183449954365683, + "grad_norm": 3.734602928161621, + "learning_rate": 1.4648626752308411e-05, + "loss": 0.4396, + "step": 5903 + }, + { + "epoch": 0.7184666869485853, + "grad_norm": 1.0031163692474365, + "learning_rate": 1.4646899552987444e-05, + "loss": 0.5399, + "step": 5904 + }, + { + "epoch": 0.7185883784606024, + "grad_norm": 1.9062103033065796, + "learning_rate": 1.4645172176842328e-05, + "loss": 0.5394, + "step": 5905 + }, + { + "epoch": 0.7187100699726194, + "grad_norm": 1.9268572330474854, + "learning_rate": 1.4643444623938795e-05, + "loss": 0.4902, + "step": 5906 + }, + { + "epoch": 0.7188317614846365, + "grad_norm": 4.705201625823975, + "learning_rate": 1.4641716894342588e-05, + "loss": 0.4799, + "step": 5907 + }, + { + "epoch": 0.7189534529966535, + "grad_norm": 3.5488810539245605, + "learning_rate": 1.4639988988119442e-05, + "loss": 0.4139, + "step": 5908 + }, + { + "epoch": 0.7190751445086705, + "grad_norm": 1.1344313621520996, + "learning_rate": 1.4638260905335112e-05, + "loss": 0.4726, + "step": 5909 + }, + { + "epoch": 0.7191968360206875, + "grad_norm": 1.1475894451141357, + "learning_rate": 1.4636532646055353e-05, + "loss": 0.459, + "step": 5910 + }, + { + "epoch": 0.7193185275327046, + "grad_norm": 2.2852580547332764, + "learning_rate": 1.463480421034593e-05, + "loss": 0.5121, + "step": 5911 + }, + { + "epoch": 0.7194402190447217, + "grad_norm": 1.8809605836868286, + "learning_rate": 1.463307559827261e-05, + "loss": 0.4656, + "step": 5912 + }, + { + "epoch": 0.7195619105567387, + "grad_norm": 2.616227865219116, + "learning_rate": 1.4631346809901177e-05, + "loss": 0.4341, + "step": 5913 + }, + { + "epoch": 0.7196836020687557, + "grad_norm": 3.135422468185425, + "learning_rate": 1.4629617845297412e-05, + "loss": 0.5123, + "step": 5914 + }, + { + "epoch": 0.7198052935807727, + "grad_norm": 0.6078523993492126, + "learning_rate": 1.4627888704527106e-05, + "loss": 0.4089, + "step": 5915 + }, + { + "epoch": 0.7199269850927897, + "grad_norm": 1.3834606409072876, + "learning_rate": 1.4626159387656053e-05, + "loss": 0.4314, + "step": 5916 + }, + { + "epoch": 0.7200486766048069, + "grad_norm": 1.716572880744934, + "learning_rate": 1.4624429894750058e-05, + "loss": 0.4798, + "step": 5917 + }, + { + "epoch": 0.7201703681168239, + "grad_norm": 1.5965250730514526, + "learning_rate": 1.4622700225874937e-05, + "loss": 0.4619, + "step": 5918 + }, + { + "epoch": 0.7202920596288409, + "grad_norm": 2.600443124771118, + "learning_rate": 1.4620970381096502e-05, + "loss": 0.4933, + "step": 5919 + }, + { + "epoch": 0.7204137511408579, + "grad_norm": 1.8338534832000732, + "learning_rate": 1.4619240360480582e-05, + "loss": 0.4453, + "step": 5920 + }, + { + "epoch": 0.720535442652875, + "grad_norm": 0.6676712036132812, + "learning_rate": 1.4617510164093004e-05, + "loss": 0.4644, + "step": 5921 + }, + { + "epoch": 0.720657134164892, + "grad_norm": 1.648997187614441, + "learning_rate": 1.4615779791999605e-05, + "loss": 0.4346, + "step": 5922 + }, + { + "epoch": 0.720778825676909, + "grad_norm": 0.579127848148346, + "learning_rate": 1.4614049244266231e-05, + "loss": 0.4259, + "step": 5923 + }, + { + "epoch": 0.7209005171889261, + "grad_norm": 2.6304335594177246, + "learning_rate": 1.4612318520958732e-05, + "loss": 0.503, + "step": 5924 + }, + { + "epoch": 0.7210222087009431, + "grad_norm": 0.8772050738334656, + "learning_rate": 1.4610587622142967e-05, + "loss": 0.4569, + "step": 5925 + }, + { + "epoch": 0.7211439002129602, + "grad_norm": 0.6525957584381104, + "learning_rate": 1.4608856547884801e-05, + "loss": 0.417, + "step": 5926 + }, + { + "epoch": 0.7212655917249772, + "grad_norm": 0.6636439561843872, + "learning_rate": 1.4607125298250102e-05, + "loss": 0.4363, + "step": 5927 + }, + { + "epoch": 0.7213872832369942, + "grad_norm": 1.632570505142212, + "learning_rate": 1.460539387330475e-05, + "loss": 0.4237, + "step": 5928 + }, + { + "epoch": 0.7215089747490112, + "grad_norm": 1.3413593769073486, + "learning_rate": 1.4603662273114627e-05, + "loss": 0.5053, + "step": 5929 + }, + { + "epoch": 0.7216306662610283, + "grad_norm": 1.0939706563949585, + "learning_rate": 1.4601930497745627e-05, + "loss": 0.4152, + "step": 5930 + }, + { + "epoch": 0.7217523577730454, + "grad_norm": 3.562614917755127, + "learning_rate": 1.4600198547263646e-05, + "loss": 0.3813, + "step": 5931 + }, + { + "epoch": 0.7218740492850624, + "grad_norm": 2.0604302883148193, + "learning_rate": 1.4598466421734587e-05, + "loss": 0.4275, + "step": 5932 + }, + { + "epoch": 0.7219957407970794, + "grad_norm": 2.1646599769592285, + "learning_rate": 1.4596734121224364e-05, + "loss": 0.512, + "step": 5933 + }, + { + "epoch": 0.7221174323090964, + "grad_norm": 0.6308103203773499, + "learning_rate": 1.4595001645798893e-05, + "loss": 0.4383, + "step": 5934 + }, + { + "epoch": 0.7222391238211134, + "grad_norm": 0.6690347790718079, + "learning_rate": 1.4593268995524097e-05, + "loss": 0.4302, + "step": 5935 + }, + { + "epoch": 0.7223608153331306, + "grad_norm": 1.080430030822754, + "learning_rate": 1.4591536170465907e-05, + "loss": 0.4319, + "step": 5936 + }, + { + "epoch": 0.7224825068451476, + "grad_norm": 0.5825051069259644, + "learning_rate": 1.4589803170690265e-05, + "loss": 0.425, + "step": 5937 + }, + { + "epoch": 0.7226041983571646, + "grad_norm": 0.9939306378364563, + "learning_rate": 1.4588069996263112e-05, + "loss": 0.4695, + "step": 5938 + }, + { + "epoch": 0.7227258898691816, + "grad_norm": 1.0614670515060425, + "learning_rate": 1.4586336647250396e-05, + "loss": 0.4963, + "step": 5939 + }, + { + "epoch": 0.7228475813811986, + "grad_norm": 0.8363815546035767, + "learning_rate": 1.4584603123718077e-05, + "loss": 0.4382, + "step": 5940 + }, + { + "epoch": 0.7229692728932157, + "grad_norm": 1.1701054573059082, + "learning_rate": 1.4582869425732123e-05, + "loss": 0.4902, + "step": 5941 + }, + { + "epoch": 0.7230909644052327, + "grad_norm": 3.5971455574035645, + "learning_rate": 1.4581135553358499e-05, + "loss": 0.4136, + "step": 5942 + }, + { + "epoch": 0.7232126559172498, + "grad_norm": 0.7977434396743774, + "learning_rate": 1.4579401506663187e-05, + "loss": 0.4752, + "step": 5943 + }, + { + "epoch": 0.7233343474292668, + "grad_norm": 3.086181640625, + "learning_rate": 1.4577667285712164e-05, + "loss": 0.4305, + "step": 5944 + }, + { + "epoch": 0.7234560389412839, + "grad_norm": 1.7511723041534424, + "learning_rate": 1.4575932890571431e-05, + "loss": 0.4408, + "step": 5945 + }, + { + "epoch": 0.7235777304533009, + "grad_norm": 2.2279856204986572, + "learning_rate": 1.4574198321306976e-05, + "loss": 0.5178, + "step": 5946 + }, + { + "epoch": 0.7236994219653179, + "grad_norm": 1.210328221321106, + "learning_rate": 1.4572463577984807e-05, + "loss": 0.4651, + "step": 5947 + }, + { + "epoch": 0.7238211134773349, + "grad_norm": 0.8268551230430603, + "learning_rate": 1.457072866067094e-05, + "loss": 0.4507, + "step": 5948 + }, + { + "epoch": 0.723942804989352, + "grad_norm": 3.1050729751586914, + "learning_rate": 1.4568993569431382e-05, + "loss": 0.5243, + "step": 5949 + }, + { + "epoch": 0.7240644965013691, + "grad_norm": 1.531017780303955, + "learning_rate": 1.4567258304332162e-05, + "loss": 0.4292, + "step": 5950 + }, + { + "epoch": 0.7241861880133861, + "grad_norm": 2.9399163722991943, + "learning_rate": 1.4565522865439309e-05, + "loss": 0.5025, + "step": 5951 + }, + { + "epoch": 0.7243078795254031, + "grad_norm": 1.609010100364685, + "learning_rate": 1.4563787252818862e-05, + "loss": 0.4655, + "step": 5952 + }, + { + "epoch": 0.7244295710374201, + "grad_norm": 1.4937900304794312, + "learning_rate": 1.456205146653686e-05, + "loss": 0.4875, + "step": 5953 + }, + { + "epoch": 0.7245512625494371, + "grad_norm": 0.7539805769920349, + "learning_rate": 1.4560315506659362e-05, + "loss": 0.4559, + "step": 5954 + }, + { + "epoch": 0.7246729540614543, + "grad_norm": 2.937546491622925, + "learning_rate": 1.4558579373252414e-05, + "loss": 0.4344, + "step": 5955 + }, + { + "epoch": 0.7247946455734713, + "grad_norm": 2.2965593338012695, + "learning_rate": 1.4556843066382091e-05, + "loss": 0.4613, + "step": 5956 + }, + { + "epoch": 0.7249163370854883, + "grad_norm": 2.042698621749878, + "learning_rate": 1.4555106586114453e-05, + "loss": 0.4273, + "step": 5957 + }, + { + "epoch": 0.7250380285975053, + "grad_norm": 1.3926624059677124, + "learning_rate": 1.4553369932515582e-05, + "loss": 0.5431, + "step": 5958 + }, + { + "epoch": 0.7251597201095223, + "grad_norm": 2.5994324684143066, + "learning_rate": 1.4551633105651561e-05, + "loss": 0.5101, + "step": 5959 + }, + { + "epoch": 0.7252814116215394, + "grad_norm": 0.6208077073097229, + "learning_rate": 1.4549896105588481e-05, + "loss": 0.4981, + "step": 5960 + }, + { + "epoch": 0.7254031031335564, + "grad_norm": 0.6817079186439514, + "learning_rate": 1.4548158932392432e-05, + "loss": 0.487, + "step": 5961 + }, + { + "epoch": 0.7255247946455735, + "grad_norm": 1.5651510953903198, + "learning_rate": 1.4546421586129524e-05, + "loss": 0.4199, + "step": 5962 + }, + { + "epoch": 0.7256464861575905, + "grad_norm": 2.3398616313934326, + "learning_rate": 1.4544684066865861e-05, + "loss": 0.4183, + "step": 5963 + }, + { + "epoch": 0.7257681776696076, + "grad_norm": 0.9032950401306152, + "learning_rate": 1.4542946374667565e-05, + "loss": 0.4485, + "step": 5964 + }, + { + "epoch": 0.7258898691816246, + "grad_norm": 3.494946002960205, + "learning_rate": 1.4541208509600756e-05, + "loss": 0.5279, + "step": 5965 + }, + { + "epoch": 0.7260115606936416, + "grad_norm": 1.360865592956543, + "learning_rate": 1.4539470471731564e-05, + "loss": 0.4893, + "step": 5966 + }, + { + "epoch": 0.7261332522056586, + "grad_norm": 1.9501760005950928, + "learning_rate": 1.4537732261126123e-05, + "loss": 0.4504, + "step": 5967 + }, + { + "epoch": 0.7262549437176757, + "grad_norm": 3.098060369491577, + "learning_rate": 1.4535993877850578e-05, + "loss": 0.357, + "step": 5968 + }, + { + "epoch": 0.7263766352296928, + "grad_norm": 1.314513921737671, + "learning_rate": 1.4534255321971077e-05, + "loss": 0.4599, + "step": 5969 + }, + { + "epoch": 0.7264983267417098, + "grad_norm": 1.2205268144607544, + "learning_rate": 1.4532516593553773e-05, + "loss": 0.5137, + "step": 5970 + }, + { + "epoch": 0.7266200182537268, + "grad_norm": 0.7066939473152161, + "learning_rate": 1.4530777692664835e-05, + "loss": 0.4336, + "step": 5971 + }, + { + "epoch": 0.7267417097657438, + "grad_norm": 1.753503680229187, + "learning_rate": 1.4529038619370425e-05, + "loss": 0.4184, + "step": 5972 + }, + { + "epoch": 0.7268634012777608, + "grad_norm": 1.8298426866531372, + "learning_rate": 1.4527299373736721e-05, + "loss": 0.4486, + "step": 5973 + }, + { + "epoch": 0.726985092789778, + "grad_norm": 3.290282964706421, + "learning_rate": 1.4525559955829904e-05, + "loss": 0.4086, + "step": 5974 + }, + { + "epoch": 0.727106784301795, + "grad_norm": 1.6207035779953003, + "learning_rate": 1.4523820365716166e-05, + "loss": 0.4068, + "step": 5975 + }, + { + "epoch": 0.727228475813812, + "grad_norm": 1.3039968013763428, + "learning_rate": 1.4522080603461696e-05, + "loss": 0.4258, + "step": 5976 + }, + { + "epoch": 0.727350167325829, + "grad_norm": 2.7305996417999268, + "learning_rate": 1.4520340669132704e-05, + "loss": 0.5397, + "step": 5977 + }, + { + "epoch": 0.727471858837846, + "grad_norm": 4.471368312835693, + "learning_rate": 1.4518600562795389e-05, + "loss": 0.514, + "step": 5978 + }, + { + "epoch": 0.7275935503498631, + "grad_norm": 6.474067211151123, + "learning_rate": 1.451686028451597e-05, + "loss": 0.5442, + "step": 5979 + }, + { + "epoch": 0.7277152418618801, + "grad_norm": 3.210071086883545, + "learning_rate": 1.4515119834360667e-05, + "loss": 0.4903, + "step": 5980 + }, + { + "epoch": 0.7278369333738972, + "grad_norm": 0.7314683198928833, + "learning_rate": 1.4513379212395709e-05, + "loss": 0.4202, + "step": 5981 + }, + { + "epoch": 0.7279586248859142, + "grad_norm": 1.355797290802002, + "learning_rate": 1.4511638418687331e-05, + "loss": 0.3614, + "step": 5982 + }, + { + "epoch": 0.7280803163979312, + "grad_norm": 1.362479329109192, + "learning_rate": 1.4509897453301774e-05, + "loss": 0.381, + "step": 5983 + }, + { + "epoch": 0.7282020079099483, + "grad_norm": 0.6450584530830383, + "learning_rate": 1.450815631630528e-05, + "loss": 0.3976, + "step": 5984 + }, + { + "epoch": 0.7283236994219653, + "grad_norm": 2.72170090675354, + "learning_rate": 1.4506415007764107e-05, + "loss": 0.5003, + "step": 5985 + }, + { + "epoch": 0.7284453909339823, + "grad_norm": 4.203404426574707, + "learning_rate": 1.4504673527744518e-05, + "loss": 0.5351, + "step": 5986 + }, + { + "epoch": 0.7285670824459994, + "grad_norm": 4.757719993591309, + "learning_rate": 1.4502931876312775e-05, + "loss": 0.5548, + "step": 5987 + }, + { + "epoch": 0.7286887739580165, + "grad_norm": 0.9673572778701782, + "learning_rate": 1.4501190053535154e-05, + "loss": 0.4309, + "step": 5988 + }, + { + "epoch": 0.7288104654700335, + "grad_norm": 1.2794528007507324, + "learning_rate": 1.4499448059477937e-05, + "loss": 0.4278, + "step": 5989 + }, + { + "epoch": 0.7289321569820505, + "grad_norm": 2.066556692123413, + "learning_rate": 1.4497705894207406e-05, + "loss": 0.4871, + "step": 5990 + }, + { + "epoch": 0.7290538484940675, + "grad_norm": 1.3739701509475708, + "learning_rate": 1.4495963557789854e-05, + "loss": 0.4573, + "step": 5991 + }, + { + "epoch": 0.7291755400060845, + "grad_norm": 0.831739604473114, + "learning_rate": 1.4494221050291583e-05, + "loss": 0.4695, + "step": 5992 + }, + { + "epoch": 0.7292972315181017, + "grad_norm": 1.7267261743545532, + "learning_rate": 1.44924783717789e-05, + "loss": 0.4176, + "step": 5993 + }, + { + "epoch": 0.7294189230301187, + "grad_norm": 3.431475877761841, + "learning_rate": 1.4490735522318118e-05, + "loss": 0.469, + "step": 5994 + }, + { + "epoch": 0.7295406145421357, + "grad_norm": 2.1612870693206787, + "learning_rate": 1.4488992501975551e-05, + "loss": 0.4403, + "step": 5995 + }, + { + "epoch": 0.7296623060541527, + "grad_norm": 1.2238311767578125, + "learning_rate": 1.4487249310817529e-05, + "loss": 0.4537, + "step": 5996 + }, + { + "epoch": 0.7297839975661697, + "grad_norm": 2.493213176727295, + "learning_rate": 1.4485505948910381e-05, + "loss": 0.4746, + "step": 5997 + }, + { + "epoch": 0.7299056890781868, + "grad_norm": 1.2177133560180664, + "learning_rate": 1.448376241632045e-05, + "loss": 0.4385, + "step": 5998 + }, + { + "epoch": 0.7300273805902038, + "grad_norm": 1.7608141899108887, + "learning_rate": 1.4482018713114076e-05, + "loss": 0.5151, + "step": 5999 + }, + { + "epoch": 0.7301490721022209, + "grad_norm": 1.5174391269683838, + "learning_rate": 1.4480274839357614e-05, + "loss": 0.4775, + "step": 6000 + }, + { + "epoch": 0.7302707636142379, + "grad_norm": 1.2709519863128662, + "learning_rate": 1.4478530795117418e-05, + "loss": 0.4722, + "step": 6001 + }, + { + "epoch": 0.730392455126255, + "grad_norm": 1.2479143142700195, + "learning_rate": 1.4476786580459858e-05, + "loss": 0.4332, + "step": 6002 + }, + { + "epoch": 0.730514146638272, + "grad_norm": 0.7227367758750916, + "learning_rate": 1.44750421954513e-05, + "loss": 0.4708, + "step": 6003 + }, + { + "epoch": 0.730635838150289, + "grad_norm": 0.7163572907447815, + "learning_rate": 1.4473297640158126e-05, + "loss": 0.4706, + "step": 6004 + }, + { + "epoch": 0.730757529662306, + "grad_norm": 0.9213974475860596, + "learning_rate": 1.4471552914646716e-05, + "loss": 0.4079, + "step": 6005 + }, + { + "epoch": 0.7308792211743231, + "grad_norm": 1.2564480304718018, + "learning_rate": 1.4469808018983464e-05, + "loss": 0.5273, + "step": 6006 + }, + { + "epoch": 0.7310009126863402, + "grad_norm": 0.6403021812438965, + "learning_rate": 1.4468062953234762e-05, + "loss": 0.4486, + "step": 6007 + }, + { + "epoch": 0.7311226041983572, + "grad_norm": 0.9182046055793762, + "learning_rate": 1.4466317717467017e-05, + "loss": 0.4827, + "step": 6008 + }, + { + "epoch": 0.7312442957103742, + "grad_norm": 1.4530596733093262, + "learning_rate": 1.4464572311746641e-05, + "loss": 0.4827, + "step": 6009 + }, + { + "epoch": 0.7313659872223912, + "grad_norm": 0.9574877619743347, + "learning_rate": 1.4462826736140044e-05, + "loss": 0.489, + "step": 6010 + }, + { + "epoch": 0.7314876787344082, + "grad_norm": 1.3845919370651245, + "learning_rate": 1.4461080990713652e-05, + "loss": 0.4515, + "step": 6011 + }, + { + "epoch": 0.7316093702464254, + "grad_norm": 0.7245004177093506, + "learning_rate": 1.4459335075533898e-05, + "loss": 0.4878, + "step": 6012 + }, + { + "epoch": 0.7317310617584424, + "grad_norm": 0.7056178450584412, + "learning_rate": 1.4457588990667213e-05, + "loss": 0.5015, + "step": 6013 + }, + { + "epoch": 0.7318527532704594, + "grad_norm": 1.8699898719787598, + "learning_rate": 1.4455842736180037e-05, + "loss": 0.497, + "step": 6014 + }, + { + "epoch": 0.7319744447824764, + "grad_norm": 0.7331851720809937, + "learning_rate": 1.4454096312138824e-05, + "loss": 0.4927, + "step": 6015 + }, + { + "epoch": 0.7320961362944934, + "grad_norm": 2.5568206310272217, + "learning_rate": 1.4452349718610026e-05, + "loss": 0.4548, + "step": 6016 + }, + { + "epoch": 0.7322178278065105, + "grad_norm": 0.5771918892860413, + "learning_rate": 1.4450602955660104e-05, + "loss": 0.4428, + "step": 6017 + }, + { + "epoch": 0.7323395193185276, + "grad_norm": 2.8147075176239014, + "learning_rate": 1.4448856023355534e-05, + "loss": 0.4689, + "step": 6018 + }, + { + "epoch": 0.7324612108305446, + "grad_norm": 1.0193289518356323, + "learning_rate": 1.4447108921762776e-05, + "loss": 0.3991, + "step": 6019 + }, + { + "epoch": 0.7325829023425616, + "grad_norm": 2.1485724449157715, + "learning_rate": 1.4445361650948322e-05, + "loss": 0.4746, + "step": 6020 + }, + { + "epoch": 0.7327045938545786, + "grad_norm": 0.8030677437782288, + "learning_rate": 1.4443614210978653e-05, + "loss": 0.4164, + "step": 6021 + }, + { + "epoch": 0.7328262853665957, + "grad_norm": 2.0524990558624268, + "learning_rate": 1.4441866601920267e-05, + "loss": 0.4891, + "step": 6022 + }, + { + "epoch": 0.7329479768786127, + "grad_norm": 0.626239538192749, + "learning_rate": 1.4440118823839666e-05, + "loss": 0.4295, + "step": 6023 + }, + { + "epoch": 0.7330696683906297, + "grad_norm": 3.944857597351074, + "learning_rate": 1.4438370876803352e-05, + "loss": 0.5429, + "step": 6024 + }, + { + "epoch": 0.7331913599026468, + "grad_norm": 1.0516984462738037, + "learning_rate": 1.4436622760877837e-05, + "loss": 0.489, + "step": 6025 + }, + { + "epoch": 0.7333130514146639, + "grad_norm": 1.464741587638855, + "learning_rate": 1.443487447612964e-05, + "loss": 0.4528, + "step": 6026 + }, + { + "epoch": 0.7334347429266809, + "grad_norm": 1.0691699981689453, + "learning_rate": 1.4433126022625294e-05, + "loss": 0.4712, + "step": 6027 + }, + { + "epoch": 0.7335564344386979, + "grad_norm": 3.160796880722046, + "learning_rate": 1.4431377400431325e-05, + "loss": 0.4193, + "step": 6028 + }, + { + "epoch": 0.7336781259507149, + "grad_norm": 1.5413644313812256, + "learning_rate": 1.4429628609614277e-05, + "loss": 0.4308, + "step": 6029 + }, + { + "epoch": 0.7337998174627319, + "grad_norm": 2.5708987712860107, + "learning_rate": 1.4427879650240689e-05, + "loss": 0.4258, + "step": 6030 + }, + { + "epoch": 0.7339215089747491, + "grad_norm": 2.6990456581115723, + "learning_rate": 1.4426130522377114e-05, + "loss": 0.4086, + "step": 6031 + }, + { + "epoch": 0.7340432004867661, + "grad_norm": 0.9847862124443054, + "learning_rate": 1.4424381226090112e-05, + "loss": 0.4118, + "step": 6032 + }, + { + "epoch": 0.7341648919987831, + "grad_norm": 3.2449450492858887, + "learning_rate": 1.4422631761446244e-05, + "loss": 0.5218, + "step": 6033 + }, + { + "epoch": 0.7342865835108001, + "grad_norm": 1.1541451215744019, + "learning_rate": 1.4420882128512083e-05, + "loss": 0.4187, + "step": 6034 + }, + { + "epoch": 0.7344082750228171, + "grad_norm": 4.895941257476807, + "learning_rate": 1.4419132327354212e-05, + "loss": 0.5096, + "step": 6035 + }, + { + "epoch": 0.7345299665348342, + "grad_norm": 1.1692439317703247, + "learning_rate": 1.4417382358039207e-05, + "loss": 0.4297, + "step": 6036 + }, + { + "epoch": 0.7346516580468513, + "grad_norm": 0.8970011472702026, + "learning_rate": 1.4415632220633655e-05, + "loss": 0.4166, + "step": 6037 + }, + { + "epoch": 0.7347733495588683, + "grad_norm": 4.252860069274902, + "learning_rate": 1.441388191520416e-05, + "loss": 0.5466, + "step": 6038 + }, + { + "epoch": 0.7348950410708853, + "grad_norm": 3.2991886138916016, + "learning_rate": 1.4412131441817325e-05, + "loss": 0.5121, + "step": 6039 + }, + { + "epoch": 0.7350167325829023, + "grad_norm": 1.2423970699310303, + "learning_rate": 1.4410380800539751e-05, + "loss": 0.4284, + "step": 6040 + }, + { + "epoch": 0.7351384240949194, + "grad_norm": 1.6417961120605469, + "learning_rate": 1.4408629991438064e-05, + "loss": 0.5017, + "step": 6041 + }, + { + "epoch": 0.7352601156069364, + "grad_norm": 0.8933529257774353, + "learning_rate": 1.4406879014578875e-05, + "loss": 0.4613, + "step": 6042 + }, + { + "epoch": 0.7353818071189534, + "grad_norm": 1.0704344511032104, + "learning_rate": 1.440512787002882e-05, + "loss": 0.4568, + "step": 6043 + }, + { + "epoch": 0.7355034986309705, + "grad_norm": 1.5222461223602295, + "learning_rate": 1.4403376557854531e-05, + "loss": 0.484, + "step": 6044 + }, + { + "epoch": 0.7356251901429876, + "grad_norm": 1.502522349357605, + "learning_rate": 1.440162507812265e-05, + "loss": 0.4512, + "step": 6045 + }, + { + "epoch": 0.7357468816550046, + "grad_norm": 2.4995596408843994, + "learning_rate": 1.4399873430899826e-05, + "loss": 0.4204, + "step": 6046 + }, + { + "epoch": 0.7358685731670216, + "grad_norm": 0.9355295896530151, + "learning_rate": 1.439812161625271e-05, + "loss": 0.4922, + "step": 6047 + }, + { + "epoch": 0.7359902646790386, + "grad_norm": 0.6649131178855896, + "learning_rate": 1.439636963424796e-05, + "loss": 0.4532, + "step": 6048 + }, + { + "epoch": 0.7361119561910556, + "grad_norm": 0.8592821359634399, + "learning_rate": 1.4394617484952247e-05, + "loss": 0.4581, + "step": 6049 + }, + { + "epoch": 0.7362336477030728, + "grad_norm": 2.093097448348999, + "learning_rate": 1.4392865168432242e-05, + "loss": 0.448, + "step": 6050 + }, + { + "epoch": 0.7363553392150898, + "grad_norm": 2.3281893730163574, + "learning_rate": 1.4391112684754624e-05, + "loss": 0.5024, + "step": 6051 + }, + { + "epoch": 0.7364770307271068, + "grad_norm": 1.9858554601669312, + "learning_rate": 1.4389360033986081e-05, + "loss": 0.4949, + "step": 6052 + }, + { + "epoch": 0.7365987222391238, + "grad_norm": 1.0654797554016113, + "learning_rate": 1.4387607216193301e-05, + "loss": 0.4536, + "step": 6053 + }, + { + "epoch": 0.7367204137511408, + "grad_norm": 0.8458957076072693, + "learning_rate": 1.4385854231442988e-05, + "loss": 0.5319, + "step": 6054 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 1.7338603734970093, + "learning_rate": 1.438410107980184e-05, + "loss": 0.5203, + "step": 6055 + }, + { + "epoch": 0.736963796775175, + "grad_norm": 0.9456865787506104, + "learning_rate": 1.4382347761336572e-05, + "loss": 0.4419, + "step": 6056 + }, + { + "epoch": 0.737085488287192, + "grad_norm": 0.7714319229125977, + "learning_rate": 1.4380594276113899e-05, + "loss": 0.4369, + "step": 6057 + }, + { + "epoch": 0.737207179799209, + "grad_norm": 0.9238236546516418, + "learning_rate": 1.4378840624200554e-05, + "loss": 0.4569, + "step": 6058 + }, + { + "epoch": 0.737328871311226, + "grad_norm": 1.561295509338379, + "learning_rate": 1.4377086805663253e-05, + "loss": 0.4599, + "step": 6059 + }, + { + "epoch": 0.7374505628232431, + "grad_norm": 1.9544538259506226, + "learning_rate": 1.437533282056874e-05, + "loss": 0.4566, + "step": 6060 + }, + { + "epoch": 0.7375722543352601, + "grad_norm": 1.2616060972213745, + "learning_rate": 1.4373578668983754e-05, + "loss": 0.5024, + "step": 6061 + }, + { + "epoch": 0.7376939458472771, + "grad_norm": 1.5177977085113525, + "learning_rate": 1.4371824350975052e-05, + "loss": 0.4671, + "step": 6062 + }, + { + "epoch": 0.7378156373592942, + "grad_norm": 1.683990240097046, + "learning_rate": 1.4370069866609381e-05, + "loss": 0.4766, + "step": 6063 + }, + { + "epoch": 0.7379373288713112, + "grad_norm": 3.2027511596679688, + "learning_rate": 1.4368315215953508e-05, + "loss": 0.5095, + "step": 6064 + }, + { + "epoch": 0.7380590203833283, + "grad_norm": 1.4263639450073242, + "learning_rate": 1.4366560399074199e-05, + "loss": 0.4772, + "step": 6065 + }, + { + "epoch": 0.7381807118953453, + "grad_norm": 0.7903024554252625, + "learning_rate": 1.4364805416038229e-05, + "loss": 0.4954, + "step": 6066 + }, + { + "epoch": 0.7383024034073623, + "grad_norm": 4.501694202423096, + "learning_rate": 1.4363050266912375e-05, + "loss": 0.4043, + "step": 6067 + }, + { + "epoch": 0.7384240949193793, + "grad_norm": 1.428065299987793, + "learning_rate": 1.4361294951763429e-05, + "loss": 0.448, + "step": 6068 + }, + { + "epoch": 0.7385457864313965, + "grad_norm": 3.916456937789917, + "learning_rate": 1.4359539470658184e-05, + "loss": 0.4458, + "step": 6069 + }, + { + "epoch": 0.7386674779434135, + "grad_norm": 5.319366931915283, + "learning_rate": 1.4357783823663439e-05, + "loss": 0.3954, + "step": 6070 + }, + { + "epoch": 0.7387891694554305, + "grad_norm": 3.7769486904144287, + "learning_rate": 1.4356028010845995e-05, + "loss": 0.4294, + "step": 6071 + }, + { + "epoch": 0.7389108609674475, + "grad_norm": 0.56814044713974, + "learning_rate": 1.4354272032272671e-05, + "loss": 0.479, + "step": 6072 + }, + { + "epoch": 0.7390325524794645, + "grad_norm": 3.2404799461364746, + "learning_rate": 1.4352515888010285e-05, + "loss": 0.5203, + "step": 6073 + }, + { + "epoch": 0.7391542439914816, + "grad_norm": 0.6911748051643372, + "learning_rate": 1.435075957812566e-05, + "loss": 0.4232, + "step": 6074 + }, + { + "epoch": 0.7392759355034987, + "grad_norm": 0.900762677192688, + "learning_rate": 1.4349003102685627e-05, + "loss": 0.4974, + "step": 6075 + }, + { + "epoch": 0.7393976270155157, + "grad_norm": 1.5303701162338257, + "learning_rate": 1.4347246461757022e-05, + "loss": 0.4523, + "step": 6076 + }, + { + "epoch": 0.7395193185275327, + "grad_norm": 1.6663451194763184, + "learning_rate": 1.4345489655406695e-05, + "loss": 0.4831, + "step": 6077 + }, + { + "epoch": 0.7396410100395497, + "grad_norm": 0.6217003464698792, + "learning_rate": 1.4343732683701489e-05, + "loss": 0.4503, + "step": 6078 + }, + { + "epoch": 0.7397627015515668, + "grad_norm": 0.7839460372924805, + "learning_rate": 1.434197554670826e-05, + "loss": 0.4368, + "step": 6079 + }, + { + "epoch": 0.7398843930635838, + "grad_norm": 0.7203729748725891, + "learning_rate": 1.434021824449388e-05, + "loss": 0.437, + "step": 6080 + }, + { + "epoch": 0.7400060845756008, + "grad_norm": 0.592182457447052, + "learning_rate": 1.4338460777125211e-05, + "loss": 0.4376, + "step": 6081 + }, + { + "epoch": 0.7401277760876179, + "grad_norm": 2.769014596939087, + "learning_rate": 1.4336703144669129e-05, + "loss": 0.4962, + "step": 6082 + }, + { + "epoch": 0.740249467599635, + "grad_norm": 2.8925223350524902, + "learning_rate": 1.4334945347192512e-05, + "loss": 0.3603, + "step": 6083 + }, + { + "epoch": 0.740371159111652, + "grad_norm": 1.656357765197754, + "learning_rate": 1.4333187384762255e-05, + "loss": 0.4227, + "step": 6084 + }, + { + "epoch": 0.740492850623669, + "grad_norm": 0.7605194449424744, + "learning_rate": 1.4331429257445248e-05, + "loss": 0.4204, + "step": 6085 + }, + { + "epoch": 0.740614542135686, + "grad_norm": 1.3129829168319702, + "learning_rate": 1.4329670965308393e-05, + "loss": 0.5163, + "step": 6086 + }, + { + "epoch": 0.740736233647703, + "grad_norm": 1.6419928073883057, + "learning_rate": 1.4327912508418596e-05, + "loss": 0.4725, + "step": 6087 + }, + { + "epoch": 0.7408579251597202, + "grad_norm": 1.90644109249115, + "learning_rate": 1.432615388684277e-05, + "loss": 0.457, + "step": 6088 + }, + { + "epoch": 0.7409796166717372, + "grad_norm": 0.9284657835960388, + "learning_rate": 1.4324395100647834e-05, + "loss": 0.4033, + "step": 6089 + }, + { + "epoch": 0.7411013081837542, + "grad_norm": 2.17691969871521, + "learning_rate": 1.432263614990071e-05, + "loss": 0.4851, + "step": 6090 + }, + { + "epoch": 0.7412229996957712, + "grad_norm": 1.805864691734314, + "learning_rate": 1.4320877034668334e-05, + "loss": 0.4579, + "step": 6091 + }, + { + "epoch": 0.7413446912077882, + "grad_norm": 0.7513481378555298, + "learning_rate": 1.431911775501765e-05, + "loss": 0.4591, + "step": 6092 + }, + { + "epoch": 0.7414663827198053, + "grad_norm": 2.5264878273010254, + "learning_rate": 1.431735831101559e-05, + "loss": 0.4501, + "step": 6093 + }, + { + "epoch": 0.7415880742318224, + "grad_norm": 1.6010634899139404, + "learning_rate": 1.4315598702729108e-05, + "loss": 0.4412, + "step": 6094 + }, + { + "epoch": 0.7417097657438394, + "grad_norm": 0.9898052215576172, + "learning_rate": 1.4313838930225163e-05, + "loss": 0.4001, + "step": 6095 + }, + { + "epoch": 0.7418314572558564, + "grad_norm": 0.9766753911972046, + "learning_rate": 1.4312078993570722e-05, + "loss": 0.515, + "step": 6096 + }, + { + "epoch": 0.7419531487678734, + "grad_norm": 1.4401381015777588, + "learning_rate": 1.4310318892832746e-05, + "loss": 0.4613, + "step": 6097 + }, + { + "epoch": 0.7420748402798905, + "grad_norm": 1.6686652898788452, + "learning_rate": 1.4308558628078216e-05, + "loss": 0.4764, + "step": 6098 + }, + { + "epoch": 0.7421965317919075, + "grad_norm": 2.4315953254699707, + "learning_rate": 1.430679819937411e-05, + "loss": 0.3932, + "step": 6099 + }, + { + "epoch": 0.7423182233039245, + "grad_norm": 0.7192448377609253, + "learning_rate": 1.430503760678742e-05, + "loss": 0.4288, + "step": 6100 + }, + { + "epoch": 0.7424399148159416, + "grad_norm": 1.8684046268463135, + "learning_rate": 1.4303276850385138e-05, + "loss": 0.4748, + "step": 6101 + }, + { + "epoch": 0.7425616063279586, + "grad_norm": 1.9394108057022095, + "learning_rate": 1.430151593023426e-05, + "loss": 0.473, + "step": 6102 + }, + { + "epoch": 0.7426832978399757, + "grad_norm": 0.9935706257820129, + "learning_rate": 1.4299754846401803e-05, + "loss": 0.4586, + "step": 6103 + }, + { + "epoch": 0.7428049893519927, + "grad_norm": 2.932865858078003, + "learning_rate": 1.4297993598954773e-05, + "loss": 0.507, + "step": 6104 + }, + { + "epoch": 0.7429266808640097, + "grad_norm": 0.6139119863510132, + "learning_rate": 1.4296232187960188e-05, + "loss": 0.4553, + "step": 6105 + }, + { + "epoch": 0.7430483723760267, + "grad_norm": 1.5251184701919556, + "learning_rate": 1.4294470613485076e-05, + "loss": 0.4521, + "step": 6106 + }, + { + "epoch": 0.7431700638880439, + "grad_norm": 2.93698787689209, + "learning_rate": 1.429270887559647e-05, + "loss": 0.4311, + "step": 6107 + }, + { + "epoch": 0.7432917554000609, + "grad_norm": 0.6989178657531738, + "learning_rate": 1.4290946974361406e-05, + "loss": 0.475, + "step": 6108 + }, + { + "epoch": 0.7434134469120779, + "grad_norm": 1.882533311843872, + "learning_rate": 1.4289184909846925e-05, + "loss": 0.3862, + "step": 6109 + }, + { + "epoch": 0.7435351384240949, + "grad_norm": 1.3065087795257568, + "learning_rate": 1.4287422682120083e-05, + "loss": 0.4972, + "step": 6110 + }, + { + "epoch": 0.7436568299361119, + "grad_norm": 1.854225516319275, + "learning_rate": 1.4285660291247934e-05, + "loss": 0.5097, + "step": 6111 + }, + { + "epoch": 0.743778521448129, + "grad_norm": 1.8840022087097168, + "learning_rate": 1.4283897737297536e-05, + "loss": 0.4979, + "step": 6112 + }, + { + "epoch": 0.7439002129601461, + "grad_norm": 1.655224084854126, + "learning_rate": 1.4282135020335962e-05, + "loss": 0.4028, + "step": 6113 + }, + { + "epoch": 0.7440219044721631, + "grad_norm": 1.3481104373931885, + "learning_rate": 1.4280372140430292e-05, + "loss": 0.4418, + "step": 6114 + }, + { + "epoch": 0.7441435959841801, + "grad_norm": 1.0108870267868042, + "learning_rate": 1.42786090976476e-05, + "loss": 0.4448, + "step": 6115 + }, + { + "epoch": 0.7442652874961971, + "grad_norm": 3.476997137069702, + "learning_rate": 1.4276845892054973e-05, + "loss": 0.3725, + "step": 6116 + }, + { + "epoch": 0.7443869790082142, + "grad_norm": 0.9589830040931702, + "learning_rate": 1.427508252371951e-05, + "loss": 0.4116, + "step": 6117 + }, + { + "epoch": 0.7445086705202312, + "grad_norm": 0.716291606426239, + "learning_rate": 1.4273318992708306e-05, + "loss": 0.4057, + "step": 6118 + }, + { + "epoch": 0.7446303620322483, + "grad_norm": 0.9443511366844177, + "learning_rate": 1.4271555299088471e-05, + "loss": 0.4097, + "step": 6119 + }, + { + "epoch": 0.7447520535442653, + "grad_norm": 2.6531548500061035, + "learning_rate": 1.4269791442927112e-05, + "loss": 0.4657, + "step": 6120 + }, + { + "epoch": 0.7448737450562823, + "grad_norm": 2.101766586303711, + "learning_rate": 1.4268027424291355e-05, + "loss": 0.4665, + "step": 6121 + }, + { + "epoch": 0.7449954365682994, + "grad_norm": 4.310474872589111, + "learning_rate": 1.426626324324832e-05, + "loss": 0.522, + "step": 6122 + }, + { + "epoch": 0.7451171280803164, + "grad_norm": 1.9548985958099365, + "learning_rate": 1.4264498899865133e-05, + "loss": 0.4162, + "step": 6123 + }, + { + "epoch": 0.7452388195923334, + "grad_norm": 2.3424088954925537, + "learning_rate": 1.4262734394208938e-05, + "loss": 0.5095, + "step": 6124 + }, + { + "epoch": 0.7453605111043504, + "grad_norm": 0.6676538586616516, + "learning_rate": 1.4260969726346878e-05, + "loss": 0.4723, + "step": 6125 + }, + { + "epoch": 0.7454822026163676, + "grad_norm": 3.512550115585327, + "learning_rate": 1.4259204896346099e-05, + "loss": 0.4389, + "step": 6126 + }, + { + "epoch": 0.7456038941283846, + "grad_norm": 3.5912954807281494, + "learning_rate": 1.425743990427376e-05, + "loss": 0.5473, + "step": 6127 + }, + { + "epoch": 0.7457255856404016, + "grad_norm": 1.7361263036727905, + "learning_rate": 1.4255674750197017e-05, + "loss": 0.4941, + "step": 6128 + }, + { + "epoch": 0.7458472771524186, + "grad_norm": 3.1802353858947754, + "learning_rate": 1.425390943418304e-05, + "loss": 0.4235, + "step": 6129 + }, + { + "epoch": 0.7459689686644356, + "grad_norm": 0.8718621730804443, + "learning_rate": 1.4252143956299008e-05, + "loss": 0.5157, + "step": 6130 + }, + { + "epoch": 0.7460906601764526, + "grad_norm": 3.226101875305176, + "learning_rate": 1.4250378316612094e-05, + "loss": 0.4383, + "step": 6131 + }, + { + "epoch": 0.7462123516884698, + "grad_norm": 2.2750840187072754, + "learning_rate": 1.4248612515189486e-05, + "loss": 0.4161, + "step": 6132 + }, + { + "epoch": 0.7463340432004868, + "grad_norm": 1.2184933423995972, + "learning_rate": 1.4246846552098382e-05, + "loss": 0.5146, + "step": 6133 + }, + { + "epoch": 0.7464557347125038, + "grad_norm": 0.802947998046875, + "learning_rate": 1.4245080427405975e-05, + "loss": 0.4813, + "step": 6134 + }, + { + "epoch": 0.7465774262245208, + "grad_norm": 0.8590162992477417, + "learning_rate": 1.4243314141179467e-05, + "loss": 0.4744, + "step": 6135 + }, + { + "epoch": 0.7466991177365379, + "grad_norm": 1.9463750123977661, + "learning_rate": 1.4241547693486075e-05, + "loss": 0.4756, + "step": 6136 + }, + { + "epoch": 0.7468208092485549, + "grad_norm": 0.5914576649665833, + "learning_rate": 1.4239781084393017e-05, + "loss": 0.4592, + "step": 6137 + }, + { + "epoch": 0.746942500760572, + "grad_norm": 0.985463559627533, + "learning_rate": 1.423801431396751e-05, + "loss": 0.4602, + "step": 6138 + }, + { + "epoch": 0.747064192272589, + "grad_norm": 1.106080174446106, + "learning_rate": 1.4236247382276787e-05, + "loss": 0.4303, + "step": 6139 + }, + { + "epoch": 0.747185883784606, + "grad_norm": 0.8888548612594604, + "learning_rate": 1.4234480289388079e-05, + "loss": 0.4443, + "step": 6140 + }, + { + "epoch": 0.7473075752966231, + "grad_norm": 0.67289799451828, + "learning_rate": 1.4232713035368637e-05, + "loss": 0.4474, + "step": 6141 + }, + { + "epoch": 0.7474292668086401, + "grad_norm": 1.9233694076538086, + "learning_rate": 1.42309456202857e-05, + "loss": 0.4564, + "step": 6142 + }, + { + "epoch": 0.7475509583206571, + "grad_norm": 1.9132890701293945, + "learning_rate": 1.4229178044206522e-05, + "loss": 0.4799, + "step": 6143 + }, + { + "epoch": 0.7476726498326741, + "grad_norm": 0.6200768947601318, + "learning_rate": 1.4227410307198368e-05, + "loss": 0.4231, + "step": 6144 + }, + { + "epoch": 0.7477943413446912, + "grad_norm": 0.9088713526725769, + "learning_rate": 1.4225642409328504e-05, + "loss": 0.4461, + "step": 6145 + }, + { + "epoch": 0.7479160328567083, + "grad_norm": 1.5293049812316895, + "learning_rate": 1.4223874350664195e-05, + "loss": 0.4041, + "step": 6146 + }, + { + "epoch": 0.7480377243687253, + "grad_norm": 1.091378092765808, + "learning_rate": 1.4222106131272726e-05, + "loss": 0.4448, + "step": 6147 + }, + { + "epoch": 0.7481594158807423, + "grad_norm": 1.5216516256332397, + "learning_rate": 1.422033775122138e-05, + "loss": 0.465, + "step": 6148 + }, + { + "epoch": 0.7482811073927593, + "grad_norm": 0.6210158467292786, + "learning_rate": 1.4218569210577446e-05, + "loss": 0.4484, + "step": 6149 + }, + { + "epoch": 0.7484027989047763, + "grad_norm": 1.097381830215454, + "learning_rate": 1.4216800509408222e-05, + "loss": 0.4377, + "step": 6150 + }, + { + "epoch": 0.7485244904167935, + "grad_norm": 1.5110297203063965, + "learning_rate": 1.421503164778101e-05, + "loss": 0.4224, + "step": 6151 + }, + { + "epoch": 0.7486461819288105, + "grad_norm": 2.089169979095459, + "learning_rate": 1.421326262576312e-05, + "loss": 0.4273, + "step": 6152 + }, + { + "epoch": 0.7487678734408275, + "grad_norm": 0.6483123302459717, + "learning_rate": 1.4211493443421867e-05, + "loss": 0.4432, + "step": 6153 + }, + { + "epoch": 0.7488895649528445, + "grad_norm": 0.6911384463310242, + "learning_rate": 1.4209724100824569e-05, + "loss": 0.4114, + "step": 6154 + }, + { + "epoch": 0.7490112564648616, + "grad_norm": 2.1643879413604736, + "learning_rate": 1.4207954598038554e-05, + "loss": 0.4626, + "step": 6155 + }, + { + "epoch": 0.7491329479768786, + "grad_norm": 1.4167627096176147, + "learning_rate": 1.4206184935131163e-05, + "loss": 0.4771, + "step": 6156 + }, + { + "epoch": 0.7492546394888957, + "grad_norm": 5.108611583709717, + "learning_rate": 1.4204415112169722e-05, + "loss": 0.5653, + "step": 6157 + }, + { + "epoch": 0.7493763310009127, + "grad_norm": 1.5637836456298828, + "learning_rate": 1.4202645129221586e-05, + "loss": 0.4755, + "step": 6158 + }, + { + "epoch": 0.7494980225129297, + "grad_norm": 1.8591865301132202, + "learning_rate": 1.4200874986354103e-05, + "loss": 0.4654, + "step": 6159 + }, + { + "epoch": 0.7496197140249468, + "grad_norm": 0.8010194301605225, + "learning_rate": 1.4199104683634632e-05, + "loss": 0.475, + "step": 6160 + }, + { + "epoch": 0.7497414055369638, + "grad_norm": 3.587212085723877, + "learning_rate": 1.4197334221130536e-05, + "loss": 0.4319, + "step": 6161 + }, + { + "epoch": 0.7498630970489808, + "grad_norm": 1.6658228635787964, + "learning_rate": 1.4195563598909186e-05, + "loss": 0.504, + "step": 6162 + }, + { + "epoch": 0.7499847885609978, + "grad_norm": 4.480776786804199, + "learning_rate": 1.4193792817037956e-05, + "loss": 0.4505, + "step": 6163 + }, + { + "epoch": 0.750106480073015, + "grad_norm": 2.69167160987854, + "learning_rate": 1.419202187558423e-05, + "loss": 0.506, + "step": 6164 + }, + { + "epoch": 0.750228171585032, + "grad_norm": 1.3299251794815063, + "learning_rate": 1.4190250774615391e-05, + "loss": 0.4919, + "step": 6165 + }, + { + "epoch": 0.750349863097049, + "grad_norm": 5.226564884185791, + "learning_rate": 1.4188479514198839e-05, + "loss": 0.3578, + "step": 6166 + }, + { + "epoch": 0.750471554609066, + "grad_norm": 1.679444670677185, + "learning_rate": 1.4186708094401974e-05, + "loss": 0.458, + "step": 6167 + }, + { + "epoch": 0.750593246121083, + "grad_norm": 2.688237190246582, + "learning_rate": 1.4184936515292197e-05, + "loss": 0.4935, + "step": 6168 + }, + { + "epoch": 0.7507149376331, + "grad_norm": 1.224530577659607, + "learning_rate": 1.4183164776936924e-05, + "loss": 0.4572, + "step": 6169 + }, + { + "epoch": 0.7508366291451172, + "grad_norm": 0.6217115521430969, + "learning_rate": 1.4181392879403571e-05, + "loss": 0.4263, + "step": 6170 + }, + { + "epoch": 0.7509583206571342, + "grad_norm": 1.9961968660354614, + "learning_rate": 1.4179620822759565e-05, + "loss": 0.4795, + "step": 6171 + }, + { + "epoch": 0.7510800121691512, + "grad_norm": 0.787127673625946, + "learning_rate": 1.4177848607072338e-05, + "loss": 0.4202, + "step": 6172 + }, + { + "epoch": 0.7512017036811682, + "grad_norm": 3.706024408340454, + "learning_rate": 1.4176076232409321e-05, + "loss": 0.515, + "step": 6173 + }, + { + "epoch": 0.7513233951931853, + "grad_norm": 1.176655888557434, + "learning_rate": 1.4174303698837959e-05, + "loss": 0.4429, + "step": 6174 + }, + { + "epoch": 0.7514450867052023, + "grad_norm": 0.9679132699966431, + "learning_rate": 1.4172531006425702e-05, + "loss": 0.4408, + "step": 6175 + }, + { + "epoch": 0.7515667782172194, + "grad_norm": 2.6041781902313232, + "learning_rate": 1.4170758155240002e-05, + "loss": 0.509, + "step": 6176 + }, + { + "epoch": 0.7516884697292364, + "grad_norm": 4.564065456390381, + "learning_rate": 1.4168985145348325e-05, + "loss": 0.3952, + "step": 6177 + }, + { + "epoch": 0.7518101612412534, + "grad_norm": 3.0847840309143066, + "learning_rate": 1.4167211976818131e-05, + "loss": 0.4491, + "step": 6178 + }, + { + "epoch": 0.7519318527532705, + "grad_norm": 4.316932201385498, + "learning_rate": 1.4165438649716899e-05, + "loss": 0.3967, + "step": 6179 + }, + { + "epoch": 0.7520535442652875, + "grad_norm": 2.947129726409912, + "learning_rate": 1.4163665164112099e-05, + "loss": 0.4491, + "step": 6180 + }, + { + "epoch": 0.7521752357773045, + "grad_norm": 1.3479586839675903, + "learning_rate": 1.4161891520071222e-05, + "loss": 0.471, + "step": 6181 + }, + { + "epoch": 0.7522969272893215, + "grad_norm": 0.9378282427787781, + "learning_rate": 1.4160117717661761e-05, + "loss": 0.4513, + "step": 6182 + }, + { + "epoch": 0.7524186188013386, + "grad_norm": 1.7979302406311035, + "learning_rate": 1.4158343756951207e-05, + "loss": 0.502, + "step": 6183 + }, + { + "epoch": 0.7525403103133557, + "grad_norm": 1.7357534170150757, + "learning_rate": 1.4156569638007067e-05, + "loss": 0.4453, + "step": 6184 + }, + { + "epoch": 0.7526620018253727, + "grad_norm": 0.6115503311157227, + "learning_rate": 1.4154795360896851e-05, + "loss": 0.4293, + "step": 6185 + }, + { + "epoch": 0.7527836933373897, + "grad_norm": 0.5923071503639221, + "learning_rate": 1.4153020925688067e-05, + "loss": 0.428, + "step": 6186 + }, + { + "epoch": 0.7529053848494067, + "grad_norm": 1.1753947734832764, + "learning_rate": 1.4151246332448245e-05, + "loss": 0.4243, + "step": 6187 + }, + { + "epoch": 0.7530270763614237, + "grad_norm": 1.2448301315307617, + "learning_rate": 1.4149471581244904e-05, + "loss": 0.4523, + "step": 6188 + }, + { + "epoch": 0.7531487678734409, + "grad_norm": 2.440727949142456, + "learning_rate": 1.4147696672145581e-05, + "loss": 0.3989, + "step": 6189 + }, + { + "epoch": 0.7532704593854579, + "grad_norm": 1.6212975978851318, + "learning_rate": 1.4145921605217818e-05, + "loss": 0.4734, + "step": 6190 + }, + { + "epoch": 0.7533921508974749, + "grad_norm": 2.1013755798339844, + "learning_rate": 1.4144146380529153e-05, + "loss": 0.4352, + "step": 6191 + }, + { + "epoch": 0.7535138424094919, + "grad_norm": 1.486703634262085, + "learning_rate": 1.4142370998147142e-05, + "loss": 0.3978, + "step": 6192 + }, + { + "epoch": 0.753635533921509, + "grad_norm": 1.0408581495285034, + "learning_rate": 1.4140595458139336e-05, + "loss": 0.4439, + "step": 6193 + }, + { + "epoch": 0.753757225433526, + "grad_norm": 3.242246389389038, + "learning_rate": 1.4138819760573307e-05, + "loss": 0.5018, + "step": 6194 + }, + { + "epoch": 0.7538789169455431, + "grad_norm": 4.305569648742676, + "learning_rate": 1.4137043905516618e-05, + "loss": 0.4965, + "step": 6195 + }, + { + "epoch": 0.7540006084575601, + "grad_norm": 2.4217042922973633, + "learning_rate": 1.4135267893036846e-05, + "loss": 0.4811, + "step": 6196 + }, + { + "epoch": 0.7541222999695771, + "grad_norm": 1.1083225011825562, + "learning_rate": 1.4133491723201568e-05, + "loss": 0.4583, + "step": 6197 + }, + { + "epoch": 0.7542439914815942, + "grad_norm": 1.0425829887390137, + "learning_rate": 1.4131715396078378e-05, + "loss": 0.3968, + "step": 6198 + }, + { + "epoch": 0.7543656829936112, + "grad_norm": 1.5561612844467163, + "learning_rate": 1.4129938911734862e-05, + "loss": 0.4804, + "step": 6199 + }, + { + "epoch": 0.7544873745056282, + "grad_norm": 1.563738226890564, + "learning_rate": 1.4128162270238624e-05, + "loss": 0.4653, + "step": 6200 + }, + { + "epoch": 0.7546090660176453, + "grad_norm": 0.6354861259460449, + "learning_rate": 1.4126385471657264e-05, + "loss": 0.4298, + "step": 6201 + }, + { + "epoch": 0.7547307575296623, + "grad_norm": 2.2225916385650635, + "learning_rate": 1.41246085160584e-05, + "loss": 0.5005, + "step": 6202 + }, + { + "epoch": 0.7548524490416794, + "grad_norm": 1.5832833051681519, + "learning_rate": 1.412283140350964e-05, + "loss": 0.4206, + "step": 6203 + }, + { + "epoch": 0.7549741405536964, + "grad_norm": 0.7486116886138916, + "learning_rate": 1.4121054134078614e-05, + "loss": 0.4735, + "step": 6204 + }, + { + "epoch": 0.7550958320657134, + "grad_norm": 1.1001203060150146, + "learning_rate": 1.4119276707832947e-05, + "loss": 0.4766, + "step": 6205 + }, + { + "epoch": 0.7552175235777304, + "grad_norm": 3.053910732269287, + "learning_rate": 1.4117499124840275e-05, + "loss": 0.4291, + "step": 6206 + }, + { + "epoch": 0.7553392150897474, + "grad_norm": 1.651018500328064, + "learning_rate": 1.4115721385168238e-05, + "loss": 0.446, + "step": 6207 + }, + { + "epoch": 0.7554609066017646, + "grad_norm": 2.9158413410186768, + "learning_rate": 1.4113943488884484e-05, + "loss": 0.4949, + "step": 6208 + }, + { + "epoch": 0.7555825981137816, + "grad_norm": 3.128105401992798, + "learning_rate": 1.4112165436056665e-05, + "loss": 0.3843, + "step": 6209 + }, + { + "epoch": 0.7557042896257986, + "grad_norm": 4.592965602874756, + "learning_rate": 1.411038722675244e-05, + "loss": 0.5558, + "step": 6210 + }, + { + "epoch": 0.7558259811378156, + "grad_norm": 2.2755916118621826, + "learning_rate": 1.4108608861039471e-05, + "loss": 0.4778, + "step": 6211 + }, + { + "epoch": 0.7559476726498326, + "grad_norm": 2.344993829727173, + "learning_rate": 1.410683033898543e-05, + "loss": 0.4826, + "step": 6212 + }, + { + "epoch": 0.7560693641618497, + "grad_norm": 0.717507004737854, + "learning_rate": 1.4105051660658e-05, + "loss": 0.4191, + "step": 6213 + }, + { + "epoch": 0.7561910556738668, + "grad_norm": 1.4596961736679077, + "learning_rate": 1.4103272826124851e-05, + "loss": 0.3941, + "step": 6214 + }, + { + "epoch": 0.7563127471858838, + "grad_norm": 4.0464067459106445, + "learning_rate": 1.410149383545368e-05, + "loss": 0.5399, + "step": 6215 + }, + { + "epoch": 0.7564344386979008, + "grad_norm": 1.7963706254959106, + "learning_rate": 1.4099714688712176e-05, + "loss": 0.4045, + "step": 6216 + }, + { + "epoch": 0.7565561302099179, + "grad_norm": 0.8785014152526855, + "learning_rate": 1.4097935385968047e-05, + "loss": 0.4383, + "step": 6217 + }, + { + "epoch": 0.7566778217219349, + "grad_norm": 0.9378146529197693, + "learning_rate": 1.409615592728899e-05, + "loss": 0.4987, + "step": 6218 + }, + { + "epoch": 0.7567995132339519, + "grad_norm": 0.8495815396308899, + "learning_rate": 1.4094376312742726e-05, + "loss": 0.4569, + "step": 6219 + }, + { + "epoch": 0.756921204745969, + "grad_norm": 3.7105400562286377, + "learning_rate": 1.4092596542396964e-05, + "loss": 0.3826, + "step": 6220 + }, + { + "epoch": 0.757042896257986, + "grad_norm": 0.7114666104316711, + "learning_rate": 1.4090816616319436e-05, + "loss": 0.4599, + "step": 6221 + }, + { + "epoch": 0.7571645877700031, + "grad_norm": 0.859508216381073, + "learning_rate": 1.4089036534577865e-05, + "loss": 0.442, + "step": 6222 + }, + { + "epoch": 0.7572862792820201, + "grad_norm": 1.110184907913208, + "learning_rate": 1.4087256297239992e-05, + "loss": 0.4818, + "step": 6223 + }, + { + "epoch": 0.7574079707940371, + "grad_norm": 0.813687801361084, + "learning_rate": 1.4085475904373555e-05, + "loss": 0.4465, + "step": 6224 + }, + { + "epoch": 0.7575296623060541, + "grad_norm": 0.7737751007080078, + "learning_rate": 1.4083695356046305e-05, + "loss": 0.4602, + "step": 6225 + }, + { + "epoch": 0.7576513538180711, + "grad_norm": 1.0113240480422974, + "learning_rate": 1.4081914652325993e-05, + "loss": 0.4702, + "step": 6226 + }, + { + "epoch": 0.7577730453300883, + "grad_norm": 0.7339016795158386, + "learning_rate": 1.408013379328038e-05, + "loss": 0.4586, + "step": 6227 + }, + { + "epoch": 0.7578947368421053, + "grad_norm": 3.585162878036499, + "learning_rate": 1.407835277897723e-05, + "loss": 0.5134, + "step": 6228 + }, + { + "epoch": 0.7580164283541223, + "grad_norm": 2.021517515182495, + "learning_rate": 1.4076571609484315e-05, + "loss": 0.492, + "step": 6229 + }, + { + "epoch": 0.7581381198661393, + "grad_norm": 0.9600722789764404, + "learning_rate": 1.4074790284869413e-05, + "loss": 0.4608, + "step": 6230 + }, + { + "epoch": 0.7582598113781563, + "grad_norm": 0.7058702111244202, + "learning_rate": 1.4073008805200305e-05, + "loss": 0.5258, + "step": 6231 + }, + { + "epoch": 0.7583815028901734, + "grad_norm": 1.4849412441253662, + "learning_rate": 1.4071227170544785e-05, + "loss": 0.4701, + "step": 6232 + }, + { + "epoch": 0.7585031944021905, + "grad_norm": 2.2401154041290283, + "learning_rate": 1.4069445380970642e-05, + "loss": 0.4296, + "step": 6233 + }, + { + "epoch": 0.7586248859142075, + "grad_norm": 2.012572765350342, + "learning_rate": 1.4067663436545678e-05, + "loss": 0.503, + "step": 6234 + }, + { + "epoch": 0.7587465774262245, + "grad_norm": 2.148432493209839, + "learning_rate": 1.4065881337337702e-05, + "loss": 0.4142, + "step": 6235 + }, + { + "epoch": 0.7588682689382416, + "grad_norm": 2.492414951324463, + "learning_rate": 1.4064099083414524e-05, + "loss": 0.4166, + "step": 6236 + }, + { + "epoch": 0.7589899604502586, + "grad_norm": 0.5464257001876831, + "learning_rate": 1.4062316674843963e-05, + "loss": 0.4455, + "step": 6237 + }, + { + "epoch": 0.7591116519622756, + "grad_norm": 0.5835963487625122, + "learning_rate": 1.4060534111693844e-05, + "loss": 0.4577, + "step": 6238 + }, + { + "epoch": 0.7592333434742927, + "grad_norm": 0.5076159238815308, + "learning_rate": 1.4058751394032e-05, + "loss": 0.4497, + "step": 6239 + }, + { + "epoch": 0.7593550349863097, + "grad_norm": 0.6692099571228027, + "learning_rate": 1.4056968521926263e-05, + "loss": 0.4483, + "step": 6240 + }, + { + "epoch": 0.7594767264983268, + "grad_norm": 2.1873650550842285, + "learning_rate": 1.4055185495444471e-05, + "loss": 0.4726, + "step": 6241 + }, + { + "epoch": 0.7595984180103438, + "grad_norm": 2.5355031490325928, + "learning_rate": 1.4053402314654485e-05, + "loss": 0.507, + "step": 6242 + }, + { + "epoch": 0.7597201095223608, + "grad_norm": 0.678715169429779, + "learning_rate": 1.4051618979624148e-05, + "loss": 0.4156, + "step": 6243 + }, + { + "epoch": 0.7598418010343778, + "grad_norm": 1.4492064714431763, + "learning_rate": 1.4049835490421322e-05, + "loss": 0.4665, + "step": 6244 + }, + { + "epoch": 0.7599634925463948, + "grad_norm": 1.2751160860061646, + "learning_rate": 1.4048051847113872e-05, + "loss": 0.4664, + "step": 6245 + }, + { + "epoch": 0.760085184058412, + "grad_norm": 1.7432068586349487, + "learning_rate": 1.4046268049769671e-05, + "loss": 0.5113, + "step": 6246 + }, + { + "epoch": 0.760206875570429, + "grad_norm": 1.3685929775238037, + "learning_rate": 1.4044484098456596e-05, + "loss": 0.4781, + "step": 6247 + }, + { + "epoch": 0.760328567082446, + "grad_norm": 0.6925094723701477, + "learning_rate": 1.404269999324253e-05, + "loss": 0.4705, + "step": 6248 + }, + { + "epoch": 0.760450258594463, + "grad_norm": 2.2157018184661865, + "learning_rate": 1.4040915734195362e-05, + "loss": 0.5278, + "step": 6249 + }, + { + "epoch": 0.76057195010648, + "grad_norm": 0.6025604009628296, + "learning_rate": 1.4039131321382981e-05, + "loss": 0.463, + "step": 6250 + }, + { + "epoch": 0.7606936416184971, + "grad_norm": 2.4737188816070557, + "learning_rate": 1.4037346754873297e-05, + "loss": 0.4322, + "step": 6251 + }, + { + "epoch": 0.7608153331305142, + "grad_norm": 0.7999370694160461, + "learning_rate": 1.4035562034734212e-05, + "loss": 0.4926, + "step": 6252 + }, + { + "epoch": 0.7609370246425312, + "grad_norm": 1.7495086193084717, + "learning_rate": 1.4033777161033636e-05, + "loss": 0.4549, + "step": 6253 + }, + { + "epoch": 0.7610587161545482, + "grad_norm": 3.3959546089172363, + "learning_rate": 1.4031992133839493e-05, + "loss": 0.4324, + "step": 6254 + }, + { + "epoch": 0.7611804076665653, + "grad_norm": 1.3222390413284302, + "learning_rate": 1.4030206953219703e-05, + "loss": 0.3875, + "step": 6255 + }, + { + "epoch": 0.7613020991785823, + "grad_norm": 3.336341619491577, + "learning_rate": 1.4028421619242195e-05, + "loss": 0.518, + "step": 6256 + }, + { + "epoch": 0.7614237906905993, + "grad_norm": 1.691164493560791, + "learning_rate": 1.4026636131974905e-05, + "loss": 0.4753, + "step": 6257 + }, + { + "epoch": 0.7615454822026164, + "grad_norm": 1.8130719661712646, + "learning_rate": 1.4024850491485777e-05, + "loss": 0.4808, + "step": 6258 + }, + { + "epoch": 0.7616671737146334, + "grad_norm": 2.3917202949523926, + "learning_rate": 1.4023064697842759e-05, + "loss": 0.4737, + "step": 6259 + }, + { + "epoch": 0.7617888652266505, + "grad_norm": 2.974717140197754, + "learning_rate": 1.4021278751113798e-05, + "loss": 0.5057, + "step": 6260 + }, + { + "epoch": 0.7619105567386675, + "grad_norm": 0.7627979516983032, + "learning_rate": 1.4019492651366857e-05, + "loss": 0.452, + "step": 6261 + }, + { + "epoch": 0.7620322482506845, + "grad_norm": 1.5494791269302368, + "learning_rate": 1.4017706398669903e-05, + "loss": 0.5055, + "step": 6262 + }, + { + "epoch": 0.7621539397627015, + "grad_norm": 0.7332980036735535, + "learning_rate": 1.4015919993090903e-05, + "loss": 0.4968, + "step": 6263 + }, + { + "epoch": 0.7622756312747185, + "grad_norm": 1.1335598230361938, + "learning_rate": 1.4014133434697834e-05, + "loss": 0.5001, + "step": 6264 + }, + { + "epoch": 0.7623973227867357, + "grad_norm": 4.4941911697387695, + "learning_rate": 1.4012346723558682e-05, + "loss": 0.4083, + "step": 6265 + }, + { + "epoch": 0.7625190142987527, + "grad_norm": 3.962347984313965, + "learning_rate": 1.4010559859741432e-05, + "loss": 0.4492, + "step": 6266 + }, + { + "epoch": 0.7626407058107697, + "grad_norm": 0.9826950430870056, + "learning_rate": 1.4008772843314073e-05, + "loss": 0.5136, + "step": 6267 + }, + { + "epoch": 0.7627623973227867, + "grad_norm": 0.7330333590507507, + "learning_rate": 1.4006985674344614e-05, + "loss": 0.5176, + "step": 6268 + }, + { + "epoch": 0.7628840888348037, + "grad_norm": 2.1018736362457275, + "learning_rate": 1.4005198352901057e-05, + "loss": 0.4189, + "step": 6269 + }, + { + "epoch": 0.7630057803468208, + "grad_norm": 0.92769855260849, + "learning_rate": 1.4003410879051409e-05, + "loss": 0.4839, + "step": 6270 + }, + { + "epoch": 0.7631274718588379, + "grad_norm": 1.6458044052124023, + "learning_rate": 1.4001623252863692e-05, + "loss": 0.4103, + "step": 6271 + }, + { + "epoch": 0.7632491633708549, + "grad_norm": 1.2086974382400513, + "learning_rate": 1.3999835474405927e-05, + "loss": 0.402, + "step": 6272 + }, + { + "epoch": 0.7633708548828719, + "grad_norm": 1.6740870475769043, + "learning_rate": 1.3998047543746144e-05, + "loss": 0.4497, + "step": 6273 + }, + { + "epoch": 0.763492546394889, + "grad_norm": 2.8389978408813477, + "learning_rate": 1.3996259460952373e-05, + "loss": 0.4674, + "step": 6274 + }, + { + "epoch": 0.763614237906906, + "grad_norm": 3.967487096786499, + "learning_rate": 1.399447122609266e-05, + "loss": 0.506, + "step": 6275 + }, + { + "epoch": 0.763735929418923, + "grad_norm": 3.609405755996704, + "learning_rate": 1.3992682839235048e-05, + "loss": 0.5023, + "step": 6276 + }, + { + "epoch": 0.7638576209309401, + "grad_norm": 2.6832504272460938, + "learning_rate": 1.3990894300447594e-05, + "loss": 0.4316, + "step": 6277 + }, + { + "epoch": 0.7639793124429571, + "grad_norm": 0.5120570659637451, + "learning_rate": 1.3989105609798346e-05, + "loss": 0.4213, + "step": 6278 + }, + { + "epoch": 0.7641010039549742, + "grad_norm": 1.5480200052261353, + "learning_rate": 1.3987316767355375e-05, + "loss": 0.4727, + "step": 6279 + }, + { + "epoch": 0.7642226954669912, + "grad_norm": 0.5076949596405029, + "learning_rate": 1.3985527773186744e-05, + "loss": 0.4282, + "step": 6280 + }, + { + "epoch": 0.7643443869790082, + "grad_norm": 0.569638192653656, + "learning_rate": 1.3983738627360536e-05, + "loss": 0.4255, + "step": 6281 + }, + { + "epoch": 0.7644660784910252, + "grad_norm": 2.1035823822021484, + "learning_rate": 1.3981949329944828e-05, + "loss": 0.4337, + "step": 6282 + }, + { + "epoch": 0.7645877700030422, + "grad_norm": 1.754557490348816, + "learning_rate": 1.3980159881007703e-05, + "loss": 0.428, + "step": 6283 + }, + { + "epoch": 0.7647094615150594, + "grad_norm": 2.9902520179748535, + "learning_rate": 1.3978370280617255e-05, + "loss": 0.541, + "step": 6284 + }, + { + "epoch": 0.7648311530270764, + "grad_norm": 0.9860368371009827, + "learning_rate": 1.3976580528841588e-05, + "loss": 0.4641, + "step": 6285 + }, + { + "epoch": 0.7649528445390934, + "grad_norm": 1.414223551750183, + "learning_rate": 1.3974790625748795e-05, + "loss": 0.4627, + "step": 6286 + }, + { + "epoch": 0.7650745360511104, + "grad_norm": 0.855566680431366, + "learning_rate": 1.3973000571406995e-05, + "loss": 0.4921, + "step": 6287 + }, + { + "epoch": 0.7651962275631274, + "grad_norm": 0.7208280563354492, + "learning_rate": 1.39712103658843e-05, + "loss": 0.4161, + "step": 6288 + }, + { + "epoch": 0.7653179190751445, + "grad_norm": 1.3667845726013184, + "learning_rate": 1.3969420009248829e-05, + "loss": 0.4772, + "step": 6289 + }, + { + "epoch": 0.7654396105871616, + "grad_norm": 1.8280746936798096, + "learning_rate": 1.3967629501568709e-05, + "loss": 0.4807, + "step": 6290 + }, + { + "epoch": 0.7655613020991786, + "grad_norm": 0.6588313579559326, + "learning_rate": 1.3965838842912076e-05, + "loss": 0.5001, + "step": 6291 + }, + { + "epoch": 0.7656829936111956, + "grad_norm": 1.9605486392974854, + "learning_rate": 1.3964048033347066e-05, + "loss": 0.4445, + "step": 6292 + }, + { + "epoch": 0.7658046851232126, + "grad_norm": 2.34366512298584, + "learning_rate": 1.3962257072941823e-05, + "loss": 0.4059, + "step": 6293 + }, + { + "epoch": 0.7659263766352297, + "grad_norm": 1.2740964889526367, + "learning_rate": 1.3960465961764497e-05, + "loss": 0.4971, + "step": 6294 + }, + { + "epoch": 0.7660480681472467, + "grad_norm": 2.097116470336914, + "learning_rate": 1.3958674699883244e-05, + "loss": 0.4322, + "step": 6295 + }, + { + "epoch": 0.7661697596592638, + "grad_norm": 2.9974052906036377, + "learning_rate": 1.3956883287366223e-05, + "loss": 0.3888, + "step": 6296 + }, + { + "epoch": 0.7662914511712808, + "grad_norm": 1.7853320837020874, + "learning_rate": 1.3955091724281603e-05, + "loss": 0.3838, + "step": 6297 + }, + { + "epoch": 0.7664131426832979, + "grad_norm": 2.2220449447631836, + "learning_rate": 1.3953300010697558e-05, + "loss": 0.4776, + "step": 6298 + }, + { + "epoch": 0.7665348341953149, + "grad_norm": 4.222470760345459, + "learning_rate": 1.3951508146682265e-05, + "loss": 0.5196, + "step": 6299 + }, + { + "epoch": 0.7666565257073319, + "grad_norm": 2.9106667041778564, + "learning_rate": 1.3949716132303905e-05, + "loss": 0.4976, + "step": 6300 + }, + { + "epoch": 0.7667782172193489, + "grad_norm": 0.9559108018875122, + "learning_rate": 1.3947923967630673e-05, + "loss": 0.4347, + "step": 6301 + }, + { + "epoch": 0.766899908731366, + "grad_norm": 1.8563309907913208, + "learning_rate": 1.3946131652730758e-05, + "loss": 0.4669, + "step": 6302 + }, + { + "epoch": 0.7670216002433831, + "grad_norm": 0.6632311344146729, + "learning_rate": 1.3944339187672373e-05, + "loss": 0.4586, + "step": 6303 + }, + { + "epoch": 0.7671432917554001, + "grad_norm": 1.7021671533584595, + "learning_rate": 1.3942546572523713e-05, + "loss": 0.4099, + "step": 6304 + }, + { + "epoch": 0.7672649832674171, + "grad_norm": 1.3959519863128662, + "learning_rate": 1.3940753807352993e-05, + "loss": 0.4419, + "step": 6305 + }, + { + "epoch": 0.7673866747794341, + "grad_norm": 1.1899783611297607, + "learning_rate": 1.3938960892228439e-05, + "loss": 0.4721, + "step": 6306 + }, + { + "epoch": 0.7675083662914511, + "grad_norm": 1.665818452835083, + "learning_rate": 1.3937167827218269e-05, + "loss": 0.4059, + "step": 6307 + }, + { + "epoch": 0.7676300578034682, + "grad_norm": 2.4116342067718506, + "learning_rate": 1.393537461239071e-05, + "loss": 0.4999, + "step": 6308 + }, + { + "epoch": 0.7677517493154853, + "grad_norm": 1.0609209537506104, + "learning_rate": 1.3933581247814003e-05, + "loss": 0.471, + "step": 6309 + }, + { + "epoch": 0.7678734408275023, + "grad_norm": 0.7659875154495239, + "learning_rate": 1.3931787733556388e-05, + "loss": 0.4423, + "step": 6310 + }, + { + "epoch": 0.7679951323395193, + "grad_norm": 1.3102385997772217, + "learning_rate": 1.3929994069686114e-05, + "loss": 0.411, + "step": 6311 + }, + { + "epoch": 0.7681168238515363, + "grad_norm": 0.6528482437133789, + "learning_rate": 1.3928200256271427e-05, + "loss": 0.4424, + "step": 6312 + }, + { + "epoch": 0.7682385153635534, + "grad_norm": 1.4142247438430786, + "learning_rate": 1.392640629338059e-05, + "loss": 0.4585, + "step": 6313 + }, + { + "epoch": 0.7683602068755704, + "grad_norm": 2.222587823867798, + "learning_rate": 1.3924612181081865e-05, + "loss": 0.496, + "step": 6314 + }, + { + "epoch": 0.7684818983875875, + "grad_norm": 0.6123923063278198, + "learning_rate": 1.3922817919443525e-05, + "loss": 0.419, + "step": 6315 + }, + { + "epoch": 0.7686035898996045, + "grad_norm": 1.7828913927078247, + "learning_rate": 1.3921023508533844e-05, + "loss": 0.4894, + "step": 6316 + }, + { + "epoch": 0.7687252814116216, + "grad_norm": 1.040553331375122, + "learning_rate": 1.3919228948421102e-05, + "loss": 0.44, + "step": 6317 + }, + { + "epoch": 0.7688469729236386, + "grad_norm": 0.8742536902427673, + "learning_rate": 1.3917434239173586e-05, + "loss": 0.449, + "step": 6318 + }, + { + "epoch": 0.7689686644356556, + "grad_norm": 0.8310899138450623, + "learning_rate": 1.3915639380859589e-05, + "loss": 0.4681, + "step": 6319 + }, + { + "epoch": 0.7690903559476726, + "grad_norm": 0.8623323440551758, + "learning_rate": 1.3913844373547407e-05, + "loss": 0.4616, + "step": 6320 + }, + { + "epoch": 0.7692120474596897, + "grad_norm": 2.6715378761291504, + "learning_rate": 1.3912049217305346e-05, + "loss": 0.4024, + "step": 6321 + }, + { + "epoch": 0.7693337389717068, + "grad_norm": 0.5722630023956299, + "learning_rate": 1.3910253912201717e-05, + "loss": 0.4581, + "step": 6322 + }, + { + "epoch": 0.7694554304837238, + "grad_norm": 0.7350225448608398, + "learning_rate": 1.3908458458304833e-05, + "loss": 0.459, + "step": 6323 + }, + { + "epoch": 0.7695771219957408, + "grad_norm": 2.838193655014038, + "learning_rate": 1.3906662855683012e-05, + "loss": 0.4299, + "step": 6324 + }, + { + "epoch": 0.7696988135077578, + "grad_norm": 0.8740293383598328, + "learning_rate": 1.390486710440458e-05, + "loss": 0.4643, + "step": 6325 + }, + { + "epoch": 0.7698205050197748, + "grad_norm": 1.383274793624878, + "learning_rate": 1.3903071204537877e-05, + "loss": 0.451, + "step": 6326 + }, + { + "epoch": 0.7699421965317919, + "grad_norm": 1.5800137519836426, + "learning_rate": 1.3901275156151232e-05, + "loss": 0.4928, + "step": 6327 + }, + { + "epoch": 0.770063888043809, + "grad_norm": 0.6011481285095215, + "learning_rate": 1.3899478959312995e-05, + "loss": 0.4498, + "step": 6328 + }, + { + "epoch": 0.770185579555826, + "grad_norm": 1.9011449813842773, + "learning_rate": 1.3897682614091514e-05, + "loss": 0.469, + "step": 6329 + }, + { + "epoch": 0.770307271067843, + "grad_norm": 1.7227898836135864, + "learning_rate": 1.389588612055514e-05, + "loss": 0.4258, + "step": 6330 + }, + { + "epoch": 0.77042896257986, + "grad_norm": 2.1644248962402344, + "learning_rate": 1.3894089478772236e-05, + "loss": 0.4913, + "step": 6331 + }, + { + "epoch": 0.7705506540918771, + "grad_norm": 1.188430666923523, + "learning_rate": 1.3892292688811162e-05, + "loss": 0.4784, + "step": 6332 + }, + { + "epoch": 0.7706723456038941, + "grad_norm": 1.5333168506622314, + "learning_rate": 1.3890495750740299e-05, + "loss": 0.4415, + "step": 6333 + }, + { + "epoch": 0.7707940371159112, + "grad_norm": 4.04938268661499, + "learning_rate": 1.388869866462802e-05, + "loss": 0.4251, + "step": 6334 + }, + { + "epoch": 0.7709157286279282, + "grad_norm": 0.6987572312355042, + "learning_rate": 1.3886901430542705e-05, + "loss": 0.4885, + "step": 6335 + }, + { + "epoch": 0.7710374201399453, + "grad_norm": 0.6160205602645874, + "learning_rate": 1.3885104048552746e-05, + "loss": 0.4983, + "step": 6336 + }, + { + "epoch": 0.7711591116519623, + "grad_norm": 2.124755859375, + "learning_rate": 1.3883306518726537e-05, + "loss": 0.4489, + "step": 6337 + }, + { + "epoch": 0.7712808031639793, + "grad_norm": 0.6458317637443542, + "learning_rate": 1.3881508841132478e-05, + "loss": 0.4841, + "step": 6338 + }, + { + "epoch": 0.7714024946759963, + "grad_norm": 0.7657617926597595, + "learning_rate": 1.3879711015838971e-05, + "loss": 0.4808, + "step": 6339 + }, + { + "epoch": 0.7715241861880134, + "grad_norm": 0.8904387354850769, + "learning_rate": 1.3877913042914433e-05, + "loss": 0.4857, + "step": 6340 + }, + { + "epoch": 0.7716458777000305, + "grad_norm": 0.642325758934021, + "learning_rate": 1.3876114922427273e-05, + "loss": 0.4086, + "step": 6341 + }, + { + "epoch": 0.7717675692120475, + "grad_norm": 1.4109954833984375, + "learning_rate": 1.3874316654445918e-05, + "loss": 0.4338, + "step": 6342 + }, + { + "epoch": 0.7718892607240645, + "grad_norm": 0.6055235266685486, + "learning_rate": 1.3872518239038795e-05, + "loss": 0.4362, + "step": 6343 + }, + { + "epoch": 0.7720109522360815, + "grad_norm": 2.2909557819366455, + "learning_rate": 1.3870719676274335e-05, + "loss": 0.4864, + "step": 6344 + }, + { + "epoch": 0.7721326437480985, + "grad_norm": 3.124408006668091, + "learning_rate": 1.3868920966220984e-05, + "loss": 0.5032, + "step": 6345 + }, + { + "epoch": 0.7722543352601156, + "grad_norm": 2.1755948066711426, + "learning_rate": 1.3867122108947182e-05, + "loss": 0.4677, + "step": 6346 + }, + { + "epoch": 0.7723760267721327, + "grad_norm": 0.8197031617164612, + "learning_rate": 1.3865323104521378e-05, + "loss": 0.4477, + "step": 6347 + }, + { + "epoch": 0.7724977182841497, + "grad_norm": 0.6858694553375244, + "learning_rate": 1.3863523953012026e-05, + "loss": 0.4034, + "step": 6348 + }, + { + "epoch": 0.7726194097961667, + "grad_norm": 0.8114713430404663, + "learning_rate": 1.3861724654487593e-05, + "loss": 0.4808, + "step": 6349 + }, + { + "epoch": 0.7727411013081837, + "grad_norm": 2.3523550033569336, + "learning_rate": 1.3859925209016543e-05, + "loss": 0.4682, + "step": 6350 + }, + { + "epoch": 0.7728627928202008, + "grad_norm": 1.0293583869934082, + "learning_rate": 1.385812561666735e-05, + "loss": 0.4593, + "step": 6351 + }, + { + "epoch": 0.7729844843322178, + "grad_norm": 2.6615138053894043, + "learning_rate": 1.3856325877508491e-05, + "loss": 0.4122, + "step": 6352 + }, + { + "epoch": 0.7731061758442349, + "grad_norm": 0.6460965871810913, + "learning_rate": 1.3854525991608452e-05, + "loss": 0.4832, + "step": 6353 + }, + { + "epoch": 0.7732278673562519, + "grad_norm": 0.6187189817428589, + "learning_rate": 1.3852725959035718e-05, + "loss": 0.4675, + "step": 6354 + }, + { + "epoch": 0.773349558868269, + "grad_norm": 1.2971177101135254, + "learning_rate": 1.3850925779858786e-05, + "loss": 0.477, + "step": 6355 + }, + { + "epoch": 0.773471250380286, + "grad_norm": 3.847198486328125, + "learning_rate": 1.384912545414616e-05, + "loss": 0.4911, + "step": 6356 + }, + { + "epoch": 0.773592941892303, + "grad_norm": 0.6393918991088867, + "learning_rate": 1.3847324981966344e-05, + "loss": 0.4697, + "step": 6357 + }, + { + "epoch": 0.77371463340432, + "grad_norm": 0.7775894999504089, + "learning_rate": 1.3845524363387845e-05, + "loss": 0.4614, + "step": 6358 + }, + { + "epoch": 0.7738363249163371, + "grad_norm": 1.077245831489563, + "learning_rate": 1.3843723598479185e-05, + "loss": 0.4592, + "step": 6359 + }, + { + "epoch": 0.7739580164283542, + "grad_norm": 0.734381914138794, + "learning_rate": 1.3841922687308889e-05, + "loss": 0.4915, + "step": 6360 + }, + { + "epoch": 0.7740797079403712, + "grad_norm": 0.8254387378692627, + "learning_rate": 1.3840121629945478e-05, + "loss": 0.4889, + "step": 6361 + }, + { + "epoch": 0.7742013994523882, + "grad_norm": 3.6123549938201904, + "learning_rate": 1.3838320426457493e-05, + "loss": 0.4165, + "step": 6362 + }, + { + "epoch": 0.7743230909644052, + "grad_norm": 0.5983392000198364, + "learning_rate": 1.383651907691347e-05, + "loss": 0.435, + "step": 6363 + }, + { + "epoch": 0.7744447824764222, + "grad_norm": 1.523672342300415, + "learning_rate": 1.3834717581381956e-05, + "loss": 0.4771, + "step": 6364 + }, + { + "epoch": 0.7745664739884393, + "grad_norm": 0.6426738500595093, + "learning_rate": 1.38329159399315e-05, + "loss": 0.4386, + "step": 6365 + }, + { + "epoch": 0.7746881655004564, + "grad_norm": 2.6999123096466064, + "learning_rate": 1.3831114152630658e-05, + "loss": 0.4972, + "step": 6366 + }, + { + "epoch": 0.7748098570124734, + "grad_norm": 1.011676549911499, + "learning_rate": 1.3829312219547995e-05, + "loss": 0.455, + "step": 6367 + }, + { + "epoch": 0.7749315485244904, + "grad_norm": 2.438202142715454, + "learning_rate": 1.3827510140752071e-05, + "loss": 0.4724, + "step": 6368 + }, + { + "epoch": 0.7750532400365074, + "grad_norm": 0.7217767238616943, + "learning_rate": 1.3825707916311468e-05, + "loss": 0.4771, + "step": 6369 + }, + { + "epoch": 0.7751749315485245, + "grad_norm": 1.510183572769165, + "learning_rate": 1.3823905546294758e-05, + "loss": 0.4212, + "step": 6370 + }, + { + "epoch": 0.7752966230605415, + "grad_norm": 2.2356646060943604, + "learning_rate": 1.3822103030770531e-05, + "loss": 0.4242, + "step": 6371 + }, + { + "epoch": 0.7754183145725586, + "grad_norm": 0.5703055262565613, + "learning_rate": 1.3820300369807367e-05, + "loss": 0.483, + "step": 6372 + }, + { + "epoch": 0.7755400060845756, + "grad_norm": 2.6634860038757324, + "learning_rate": 1.3818497563473867e-05, + "loss": 0.4255, + "step": 6373 + }, + { + "epoch": 0.7756616975965926, + "grad_norm": 0.7997645735740662, + "learning_rate": 1.3816694611838633e-05, + "loss": 0.5027, + "step": 6374 + }, + { + "epoch": 0.7757833891086097, + "grad_norm": 2.9507253170013428, + "learning_rate": 1.381489151497027e-05, + "loss": 0.5292, + "step": 6375 + }, + { + "epoch": 0.7759050806206267, + "grad_norm": 0.6053027510643005, + "learning_rate": 1.3813088272937387e-05, + "loss": 0.4847, + "step": 6376 + }, + { + "epoch": 0.7760267721326437, + "grad_norm": 0.9724107980728149, + "learning_rate": 1.38112848858086e-05, + "loss": 0.4819, + "step": 6377 + }, + { + "epoch": 0.7761484636446608, + "grad_norm": 0.6126247644424438, + "learning_rate": 1.3809481353652536e-05, + "loss": 0.4758, + "step": 6378 + }, + { + "epoch": 0.7762701551566779, + "grad_norm": 2.095560312271118, + "learning_rate": 1.3807677676537823e-05, + "loss": 0.4395, + "step": 6379 + }, + { + "epoch": 0.7763918466686949, + "grad_norm": 0.6928943991661072, + "learning_rate": 1.3805873854533093e-05, + "loss": 0.4961, + "step": 6380 + }, + { + "epoch": 0.7765135381807119, + "grad_norm": 1.153184413909912, + "learning_rate": 1.380406988770698e-05, + "loss": 0.4788, + "step": 6381 + }, + { + "epoch": 0.7766352296927289, + "grad_norm": 1.0534050464630127, + "learning_rate": 1.3802265776128139e-05, + "loss": 0.4809, + "step": 6382 + }, + { + "epoch": 0.7767569212047459, + "grad_norm": 1.61732816696167, + "learning_rate": 1.3800461519865214e-05, + "loss": 0.469, + "step": 6383 + }, + { + "epoch": 0.776878612716763, + "grad_norm": 0.7235574126243591, + "learning_rate": 1.379865711898686e-05, + "loss": 0.4702, + "step": 6384 + }, + { + "epoch": 0.7770003042287801, + "grad_norm": 1.3811155557632446, + "learning_rate": 1.379685257356174e-05, + "loss": 0.4118, + "step": 6385 + }, + { + "epoch": 0.7771219957407971, + "grad_norm": 0.6938037276268005, + "learning_rate": 1.3795047883658523e-05, + "loss": 0.448, + "step": 6386 + }, + { + "epoch": 0.7772436872528141, + "grad_norm": 1.8938543796539307, + "learning_rate": 1.379324304934588e-05, + "loss": 0.4431, + "step": 6387 + }, + { + "epoch": 0.7773653787648311, + "grad_norm": 4.036731243133545, + "learning_rate": 1.3791438070692483e-05, + "loss": 0.5117, + "step": 6388 + }, + { + "epoch": 0.7774870702768482, + "grad_norm": 0.718258261680603, + "learning_rate": 1.378963294776702e-05, + "loss": 0.3954, + "step": 6389 + }, + { + "epoch": 0.7776087617888652, + "grad_norm": 2.801138401031494, + "learning_rate": 1.3787827680638181e-05, + "loss": 0.5315, + "step": 6390 + }, + { + "epoch": 0.7777304533008823, + "grad_norm": 3.210613965988159, + "learning_rate": 1.3786022269374656e-05, + "loss": 0.5369, + "step": 6391 + }, + { + "epoch": 0.7778521448128993, + "grad_norm": 0.7968564629554749, + "learning_rate": 1.378421671404515e-05, + "loss": 0.4276, + "step": 6392 + }, + { + "epoch": 0.7779738363249163, + "grad_norm": 1.0648247003555298, + "learning_rate": 1.3782411014718363e-05, + "loss": 0.4272, + "step": 6393 + }, + { + "epoch": 0.7780955278369334, + "grad_norm": 2.704493284225464, + "learning_rate": 1.3780605171463011e-05, + "loss": 0.5124, + "step": 6394 + }, + { + "epoch": 0.7782172193489504, + "grad_norm": 2.9222958087921143, + "learning_rate": 1.3778799184347802e-05, + "loss": 0.4211, + "step": 6395 + }, + { + "epoch": 0.7783389108609674, + "grad_norm": 0.6378592848777771, + "learning_rate": 1.3776993053441463e-05, + "loss": 0.4479, + "step": 6396 + }, + { + "epoch": 0.7784606023729845, + "grad_norm": 2.8318779468536377, + "learning_rate": 1.3775186778812724e-05, + "loss": 0.4044, + "step": 6397 + }, + { + "epoch": 0.7785822938850016, + "grad_norm": 2.004495620727539, + "learning_rate": 1.3773380360530312e-05, + "loss": 0.5328, + "step": 6398 + }, + { + "epoch": 0.7787039853970186, + "grad_norm": 1.049403190612793, + "learning_rate": 1.3771573798662966e-05, + "loss": 0.4416, + "step": 6399 + }, + { + "epoch": 0.7788256769090356, + "grad_norm": 1.4881782531738281, + "learning_rate": 1.3769767093279432e-05, + "loss": 0.5014, + "step": 6400 + }, + { + "epoch": 0.7789473684210526, + "grad_norm": 1.335480809211731, + "learning_rate": 1.3767960244448457e-05, + "loss": 0.4849, + "step": 6401 + }, + { + "epoch": 0.7790690599330696, + "grad_norm": 0.7853442430496216, + "learning_rate": 1.3766153252238794e-05, + "loss": 0.5364, + "step": 6402 + }, + { + "epoch": 0.7791907514450868, + "grad_norm": 2.952008008956909, + "learning_rate": 1.3764346116719204e-05, + "loss": 0.4707, + "step": 6403 + }, + { + "epoch": 0.7793124429571038, + "grad_norm": 1.7441281080245972, + "learning_rate": 1.3762538837958454e-05, + "loss": 0.4456, + "step": 6404 + }, + { + "epoch": 0.7794341344691208, + "grad_norm": 3.066791296005249, + "learning_rate": 1.3760731416025314e-05, + "loss": 0.4796, + "step": 6405 + }, + { + "epoch": 0.7795558259811378, + "grad_norm": 1.7521189451217651, + "learning_rate": 1.3758923850988556e-05, + "loss": 0.4341, + "step": 6406 + }, + { + "epoch": 0.7796775174931548, + "grad_norm": 1.0489593744277954, + "learning_rate": 1.3757116142916967e-05, + "loss": 0.489, + "step": 6407 + }, + { + "epoch": 0.7797992090051719, + "grad_norm": 0.6687289476394653, + "learning_rate": 1.375530829187933e-05, + "loss": 0.4091, + "step": 6408 + }, + { + "epoch": 0.7799209005171889, + "grad_norm": 4.361695289611816, + "learning_rate": 1.3753500297944444e-05, + "loss": 0.5757, + "step": 6409 + }, + { + "epoch": 0.780042592029206, + "grad_norm": 1.25162672996521, + "learning_rate": 1.3751692161181099e-05, + "loss": 0.4427, + "step": 6410 + }, + { + "epoch": 0.780164283541223, + "grad_norm": 3.646221160888672, + "learning_rate": 1.3749883881658101e-05, + "loss": 0.5428, + "step": 6411 + }, + { + "epoch": 0.78028597505324, + "grad_norm": 2.618098735809326, + "learning_rate": 1.3748075459444258e-05, + "loss": 0.4906, + "step": 6412 + }, + { + "epoch": 0.7804076665652571, + "grad_norm": 2.6059303283691406, + "learning_rate": 1.374626689460839e-05, + "loss": 0.4919, + "step": 6413 + }, + { + "epoch": 0.7805293580772741, + "grad_norm": 0.7968857288360596, + "learning_rate": 1.3744458187219312e-05, + "loss": 0.4373, + "step": 6414 + }, + { + "epoch": 0.7806510495892911, + "grad_norm": 1.2142266035079956, + "learning_rate": 1.3742649337345849e-05, + "loss": 0.4482, + "step": 6415 + }, + { + "epoch": 0.7807727411013082, + "grad_norm": 0.8644128441810608, + "learning_rate": 1.374084034505683e-05, + "loss": 0.476, + "step": 6416 + }, + { + "epoch": 0.7808944326133253, + "grad_norm": 1.1000736951828003, + "learning_rate": 1.3739031210421094e-05, + "loss": 0.5062, + "step": 6417 + }, + { + "epoch": 0.7810161241253423, + "grad_norm": 3.9336023330688477, + "learning_rate": 1.3737221933507481e-05, + "loss": 0.4391, + "step": 6418 + }, + { + "epoch": 0.7811378156373593, + "grad_norm": 1.4541451930999756, + "learning_rate": 1.3735412514384837e-05, + "loss": 0.4629, + "step": 6419 + }, + { + "epoch": 0.7812595071493763, + "grad_norm": 2.3276968002319336, + "learning_rate": 1.3733602953122014e-05, + "loss": 0.4614, + "step": 6420 + }, + { + "epoch": 0.7813811986613933, + "grad_norm": 0.907979428768158, + "learning_rate": 1.3731793249787875e-05, + "loss": 0.4683, + "step": 6421 + }, + { + "epoch": 0.7815028901734105, + "grad_norm": 3.948151111602783, + "learning_rate": 1.3729983404451272e-05, + "loss": 0.4201, + "step": 6422 + }, + { + "epoch": 0.7816245816854275, + "grad_norm": 0.6311213374137878, + "learning_rate": 1.3728173417181085e-05, + "loss": 0.4654, + "step": 6423 + }, + { + "epoch": 0.7817462731974445, + "grad_norm": 1.8003267049789429, + "learning_rate": 1.3726363288046181e-05, + "loss": 0.4549, + "step": 6424 + }, + { + "epoch": 0.7818679647094615, + "grad_norm": 1.109677791595459, + "learning_rate": 1.3724553017115438e-05, + "loss": 0.4337, + "step": 6425 + }, + { + "epoch": 0.7819896562214785, + "grad_norm": 4.777871608734131, + "learning_rate": 1.3722742604457747e-05, + "loss": 0.5414, + "step": 6426 + }, + { + "epoch": 0.7821113477334956, + "grad_norm": 1.626053810119629, + "learning_rate": 1.3720932050141996e-05, + "loss": 0.4728, + "step": 6427 + }, + { + "epoch": 0.7822330392455126, + "grad_norm": 1.4872353076934814, + "learning_rate": 1.3719121354237074e-05, + "loss": 0.4589, + "step": 6428 + }, + { + "epoch": 0.7823547307575297, + "grad_norm": 3.58205246925354, + "learning_rate": 1.3717310516811888e-05, + "loss": 0.5007, + "step": 6429 + }, + { + "epoch": 0.7824764222695467, + "grad_norm": 0.6488206386566162, + "learning_rate": 1.3715499537935341e-05, + "loss": 0.4383, + "step": 6430 + }, + { + "epoch": 0.7825981137815637, + "grad_norm": 0.6844986081123352, + "learning_rate": 1.371368841767635e-05, + "loss": 0.4375, + "step": 6431 + }, + { + "epoch": 0.7827198052935808, + "grad_norm": 1.920110821723938, + "learning_rate": 1.3711877156103826e-05, + "loss": 0.4727, + "step": 6432 + }, + { + "epoch": 0.7828414968055978, + "grad_norm": 0.9400794506072998, + "learning_rate": 1.371006575328669e-05, + "loss": 0.5225, + "step": 6433 + }, + { + "epoch": 0.7829631883176148, + "grad_norm": 3.376373767852783, + "learning_rate": 1.3708254209293874e-05, + "loss": 0.4412, + "step": 6434 + }, + { + "epoch": 0.7830848798296319, + "grad_norm": 0.8922308683395386, + "learning_rate": 1.3706442524194313e-05, + "loss": 0.4354, + "step": 6435 + }, + { + "epoch": 0.783206571341649, + "grad_norm": 2.258458137512207, + "learning_rate": 1.3704630698056938e-05, + "loss": 0.4349, + "step": 6436 + }, + { + "epoch": 0.783328262853666, + "grad_norm": 1.4515469074249268, + "learning_rate": 1.3702818730950698e-05, + "loss": 0.4934, + "step": 6437 + }, + { + "epoch": 0.783449954365683, + "grad_norm": 1.3646801710128784, + "learning_rate": 1.3701006622944541e-05, + "loss": 0.4672, + "step": 6438 + }, + { + "epoch": 0.7835716458777, + "grad_norm": 0.8932005763053894, + "learning_rate": 1.3699194374107423e-05, + "loss": 0.4127, + "step": 6439 + }, + { + "epoch": 0.783693337389717, + "grad_norm": 0.6561334133148193, + "learning_rate": 1.36973819845083e-05, + "loss": 0.4465, + "step": 6440 + }, + { + "epoch": 0.7838150289017342, + "grad_norm": 1.804722785949707, + "learning_rate": 1.3695569454216138e-05, + "loss": 0.3787, + "step": 6441 + }, + { + "epoch": 0.7839367204137512, + "grad_norm": 5.419088840484619, + "learning_rate": 1.3693756783299912e-05, + "loss": 0.5702, + "step": 6442 + }, + { + "epoch": 0.7840584119257682, + "grad_norm": 1.484358787536621, + "learning_rate": 1.3691943971828594e-05, + "loss": 0.4366, + "step": 6443 + }, + { + "epoch": 0.7841801034377852, + "grad_norm": 1.8780895471572876, + "learning_rate": 1.3690131019871167e-05, + "loss": 0.4635, + "step": 6444 + }, + { + "epoch": 0.7843017949498022, + "grad_norm": 0.8616920113563538, + "learning_rate": 1.3688317927496612e-05, + "loss": 0.398, + "step": 6445 + }, + { + "epoch": 0.7844234864618193, + "grad_norm": 3.895115375518799, + "learning_rate": 1.368650469477393e-05, + "loss": 0.5957, + "step": 6446 + }, + { + "epoch": 0.7845451779738363, + "grad_norm": 1.2453265190124512, + "learning_rate": 1.3684691321772113e-05, + "loss": 0.4492, + "step": 6447 + }, + { + "epoch": 0.7846668694858534, + "grad_norm": 1.2877627611160278, + "learning_rate": 1.3682877808560161e-05, + "loss": 0.3868, + "step": 6448 + }, + { + "epoch": 0.7847885609978704, + "grad_norm": 0.6936625242233276, + "learning_rate": 1.3681064155207088e-05, + "loss": 0.4509, + "step": 6449 + }, + { + "epoch": 0.7849102525098874, + "grad_norm": 0.8250743746757507, + "learning_rate": 1.3679250361781907e-05, + "loss": 0.4356, + "step": 6450 + }, + { + "epoch": 0.7850319440219045, + "grad_norm": 0.7703726291656494, + "learning_rate": 1.3677436428353632e-05, + "loss": 0.4986, + "step": 6451 + }, + { + "epoch": 0.7851536355339215, + "grad_norm": 1.0868650674819946, + "learning_rate": 1.3675622354991286e-05, + "loss": 0.4504, + "step": 6452 + }, + { + "epoch": 0.7852753270459385, + "grad_norm": 1.223739504814148, + "learning_rate": 1.3673808141763904e-05, + "loss": 0.4237, + "step": 6453 + }, + { + "epoch": 0.7853970185579556, + "grad_norm": 2.5764353275299072, + "learning_rate": 1.3671993788740519e-05, + "loss": 0.4736, + "step": 6454 + }, + { + "epoch": 0.7855187100699726, + "grad_norm": 0.646618664264679, + "learning_rate": 1.3670179295990169e-05, + "loss": 0.4239, + "step": 6455 + }, + { + "epoch": 0.7856404015819897, + "grad_norm": 3.6138246059417725, + "learning_rate": 1.3668364663581899e-05, + "loss": 0.3765, + "step": 6456 + }, + { + "epoch": 0.7857620930940067, + "grad_norm": 1.7653045654296875, + "learning_rate": 1.3666549891584759e-05, + "loss": 0.5194, + "step": 6457 + }, + { + "epoch": 0.7858837846060237, + "grad_norm": 1.4856643676757812, + "learning_rate": 1.366473498006781e-05, + "loss": 0.4613, + "step": 6458 + }, + { + "epoch": 0.7860054761180407, + "grad_norm": 2.887765407562256, + "learning_rate": 1.3662919929100107e-05, + "loss": 0.3844, + "step": 6459 + }, + { + "epoch": 0.7861271676300579, + "grad_norm": 0.6231181621551514, + "learning_rate": 1.3661104738750716e-05, + "loss": 0.4294, + "step": 6460 + }, + { + "epoch": 0.7862488591420749, + "grad_norm": 1.4189949035644531, + "learning_rate": 1.3659289409088719e-05, + "loss": 0.4769, + "step": 6461 + }, + { + "epoch": 0.7863705506540919, + "grad_norm": 3.7589845657348633, + "learning_rate": 1.3657473940183182e-05, + "loss": 0.3839, + "step": 6462 + }, + { + "epoch": 0.7864922421661089, + "grad_norm": 2.8911733627319336, + "learning_rate": 1.365565833210319e-05, + "loss": 0.5308, + "step": 6463 + }, + { + "epoch": 0.7866139336781259, + "grad_norm": 1.43215811252594, + "learning_rate": 1.3653842584917832e-05, + "loss": 0.4179, + "step": 6464 + }, + { + "epoch": 0.786735625190143, + "grad_norm": 2.66440749168396, + "learning_rate": 1.3652026698696201e-05, + "loss": 0.4608, + "step": 6465 + }, + { + "epoch": 0.78685731670216, + "grad_norm": 1.143523931503296, + "learning_rate": 1.3650210673507393e-05, + "loss": 0.3864, + "step": 6466 + }, + { + "epoch": 0.7869790082141771, + "grad_norm": 0.8716341257095337, + "learning_rate": 1.3648394509420517e-05, + "loss": 0.3965, + "step": 6467 + }, + { + "epoch": 0.7871006997261941, + "grad_norm": 2.1278419494628906, + "learning_rate": 1.3646578206504675e-05, + "loss": 0.4714, + "step": 6468 + }, + { + "epoch": 0.7872223912382111, + "grad_norm": 2.4792592525482178, + "learning_rate": 1.3644761764828986e-05, + "loss": 0.4715, + "step": 6469 + }, + { + "epoch": 0.7873440827502282, + "grad_norm": 3.393587112426758, + "learning_rate": 1.3642945184462569e-05, + "loss": 0.4737, + "step": 6470 + }, + { + "epoch": 0.7874657742622452, + "grad_norm": 1.6722385883331299, + "learning_rate": 1.3641128465474543e-05, + "loss": 0.4715, + "step": 6471 + }, + { + "epoch": 0.7875874657742622, + "grad_norm": 0.9631937146186829, + "learning_rate": 1.3639311607934042e-05, + "loss": 0.4704, + "step": 6472 + }, + { + "epoch": 0.7877091572862793, + "grad_norm": 2.8960633277893066, + "learning_rate": 1.363749461191021e-05, + "loss": 0.4971, + "step": 6473 + }, + { + "epoch": 0.7878308487982963, + "grad_norm": 1.4005905389785767, + "learning_rate": 1.3635677477472168e-05, + "loss": 0.429, + "step": 6474 + }, + { + "epoch": 0.7879525403103134, + "grad_norm": 3.2016022205352783, + "learning_rate": 1.3633860204689078e-05, + "loss": 0.4018, + "step": 6475 + }, + { + "epoch": 0.7880742318223304, + "grad_norm": 0.6304191946983337, + "learning_rate": 1.3632042793630086e-05, + "loss": 0.4765, + "step": 6476 + }, + { + "epoch": 0.7881959233343474, + "grad_norm": 3.028447151184082, + "learning_rate": 1.3630225244364345e-05, + "loss": 0.417, + "step": 6477 + }, + { + "epoch": 0.7883176148463644, + "grad_norm": 1.7520768642425537, + "learning_rate": 1.3628407556961023e-05, + "loss": 0.4991, + "step": 6478 + }, + { + "epoch": 0.7884393063583816, + "grad_norm": 5.242979049682617, + "learning_rate": 1.3626589731489279e-05, + "loss": 0.3984, + "step": 6479 + }, + { + "epoch": 0.7885609978703986, + "grad_norm": 1.7049250602722168, + "learning_rate": 1.362477176801829e-05, + "loss": 0.4551, + "step": 6480 + }, + { + "epoch": 0.7886826893824156, + "grad_norm": 1.8115218877792358, + "learning_rate": 1.3622953666617236e-05, + "loss": 0.5037, + "step": 6481 + }, + { + "epoch": 0.7888043808944326, + "grad_norm": 0.8826643824577332, + "learning_rate": 1.3621135427355293e-05, + "loss": 0.5147, + "step": 6482 + }, + { + "epoch": 0.7889260724064496, + "grad_norm": 1.7377963066101074, + "learning_rate": 1.361931705030165e-05, + "loss": 0.4976, + "step": 6483 + }, + { + "epoch": 0.7890477639184666, + "grad_norm": 2.7146713733673096, + "learning_rate": 1.3617498535525504e-05, + "loss": 0.5104, + "step": 6484 + }, + { + "epoch": 0.7891694554304837, + "grad_norm": 1.4817066192626953, + "learning_rate": 1.3615679883096051e-05, + "loss": 0.4647, + "step": 6485 + }, + { + "epoch": 0.7892911469425008, + "grad_norm": 1.098913311958313, + "learning_rate": 1.3613861093082494e-05, + "loss": 0.4601, + "step": 6486 + }, + { + "epoch": 0.7894128384545178, + "grad_norm": 2.1476383209228516, + "learning_rate": 1.361204216555404e-05, + "loss": 0.5228, + "step": 6487 + }, + { + "epoch": 0.7895345299665348, + "grad_norm": 0.5649075508117676, + "learning_rate": 1.361022310057991e-05, + "loss": 0.4664, + "step": 6488 + }, + { + "epoch": 0.7896562214785519, + "grad_norm": 2.738413095474243, + "learning_rate": 1.3608403898229314e-05, + "loss": 0.4699, + "step": 6489 + }, + { + "epoch": 0.7897779129905689, + "grad_norm": 1.8481881618499756, + "learning_rate": 1.3606584558571483e-05, + "loss": 0.4628, + "step": 6490 + }, + { + "epoch": 0.7898996045025859, + "grad_norm": 1.9711811542510986, + "learning_rate": 1.3604765081675641e-05, + "loss": 0.449, + "step": 6491 + }, + { + "epoch": 0.790021296014603, + "grad_norm": 1.1668801307678223, + "learning_rate": 1.3602945467611031e-05, + "loss": 0.4673, + "step": 6492 + }, + { + "epoch": 0.79014298752662, + "grad_norm": 2.104055166244507, + "learning_rate": 1.3601125716446885e-05, + "loss": 0.4204, + "step": 6493 + }, + { + "epoch": 0.7902646790386371, + "grad_norm": 1.897060751914978, + "learning_rate": 1.3599305828252452e-05, + "loss": 0.4173, + "step": 6494 + }, + { + "epoch": 0.7903863705506541, + "grad_norm": 0.6285289525985718, + "learning_rate": 1.3597485803096983e-05, + "loss": 0.4019, + "step": 6495 + }, + { + "epoch": 0.7905080620626711, + "grad_norm": 5.3628339767456055, + "learning_rate": 1.3595665641049734e-05, + "loss": 0.5362, + "step": 6496 + }, + { + "epoch": 0.7906297535746881, + "grad_norm": 3.2266054153442383, + "learning_rate": 1.359384534217996e-05, + "loss": 0.4644, + "step": 6497 + }, + { + "epoch": 0.7907514450867053, + "grad_norm": 4.081913471221924, + "learning_rate": 1.3592024906556936e-05, + "loss": 0.4825, + "step": 6498 + }, + { + "epoch": 0.7908731365987223, + "grad_norm": 2.8375980854034424, + "learning_rate": 1.3590204334249928e-05, + "loss": 0.5015, + "step": 6499 + }, + { + "epoch": 0.7909948281107393, + "grad_norm": 1.9742200374603271, + "learning_rate": 1.3588383625328215e-05, + "loss": 0.4875, + "step": 6500 + }, + { + "epoch": 0.7911165196227563, + "grad_norm": 0.7385501861572266, + "learning_rate": 1.3586562779861077e-05, + "loss": 0.4159, + "step": 6501 + }, + { + "epoch": 0.7912382111347733, + "grad_norm": 1.6306407451629639, + "learning_rate": 1.35847417979178e-05, + "loss": 0.3884, + "step": 6502 + }, + { + "epoch": 0.7913599026467903, + "grad_norm": 0.6484414935112, + "learning_rate": 1.3582920679567679e-05, + "loss": 0.3928, + "step": 6503 + }, + { + "epoch": 0.7914815941588075, + "grad_norm": 1.0021167993545532, + "learning_rate": 1.3581099424880009e-05, + "loss": 0.3776, + "step": 6504 + }, + { + "epoch": 0.7916032856708245, + "grad_norm": 1.3124027252197266, + "learning_rate": 1.3579278033924093e-05, + "loss": 0.4218, + "step": 6505 + }, + { + "epoch": 0.7917249771828415, + "grad_norm": 1.2102559804916382, + "learning_rate": 1.3577456506769238e-05, + "loss": 0.3956, + "step": 6506 + }, + { + "epoch": 0.7918466686948585, + "grad_norm": 0.8479447960853577, + "learning_rate": 1.3575634843484764e-05, + "loss": 0.4437, + "step": 6507 + }, + { + "epoch": 0.7919683602068756, + "grad_norm": 1.2440087795257568, + "learning_rate": 1.3573813044139975e-05, + "loss": 0.4555, + "step": 6508 + }, + { + "epoch": 0.7920900517188926, + "grad_norm": 2.2641000747680664, + "learning_rate": 1.3571991108804208e-05, + "loss": 0.4961, + "step": 6509 + }, + { + "epoch": 0.7922117432309096, + "grad_norm": 2.6826422214508057, + "learning_rate": 1.3570169037546781e-05, + "loss": 0.4992, + "step": 6510 + }, + { + "epoch": 0.7923334347429267, + "grad_norm": 1.5702519416809082, + "learning_rate": 1.3568346830437039e-05, + "loss": 0.4136, + "step": 6511 + }, + { + "epoch": 0.7924551262549437, + "grad_norm": 1.8496413230895996, + "learning_rate": 1.3566524487544308e-05, + "loss": 0.4888, + "step": 6512 + }, + { + "epoch": 0.7925768177669608, + "grad_norm": 1.3136520385742188, + "learning_rate": 1.356470200893794e-05, + "loss": 0.4155, + "step": 6513 + }, + { + "epoch": 0.7926985092789778, + "grad_norm": 1.2406553030014038, + "learning_rate": 1.3562879394687283e-05, + "loss": 0.4158, + "step": 6514 + }, + { + "epoch": 0.7928202007909948, + "grad_norm": 2.73810076713562, + "learning_rate": 1.356105664486169e-05, + "loss": 0.3995, + "step": 6515 + }, + { + "epoch": 0.7929418923030118, + "grad_norm": 4.002604007720947, + "learning_rate": 1.355923375953052e-05, + "loss": 0.5193, + "step": 6516 + }, + { + "epoch": 0.793063583815029, + "grad_norm": 2.522327184677124, + "learning_rate": 1.3557410738763137e-05, + "loss": 0.5068, + "step": 6517 + }, + { + "epoch": 0.793185275327046, + "grad_norm": 3.4677863121032715, + "learning_rate": 1.3555587582628913e-05, + "loss": 0.5471, + "step": 6518 + }, + { + "epoch": 0.793306966839063, + "grad_norm": 1.7428756952285767, + "learning_rate": 1.3553764291197224e-05, + "loss": 0.4736, + "step": 6519 + }, + { + "epoch": 0.79342865835108, + "grad_norm": 0.9846490025520325, + "learning_rate": 1.3551940864537445e-05, + "loss": 0.4776, + "step": 6520 + }, + { + "epoch": 0.793550349863097, + "grad_norm": 1.8506323099136353, + "learning_rate": 1.3550117302718964e-05, + "loss": 0.4826, + "step": 6521 + }, + { + "epoch": 0.793672041375114, + "grad_norm": 4.4684739112854, + "learning_rate": 1.3548293605811174e-05, + "loss": 0.4113, + "step": 6522 + }, + { + "epoch": 0.7937937328871312, + "grad_norm": 0.8352494835853577, + "learning_rate": 1.3546469773883463e-05, + "loss": 0.4626, + "step": 6523 + }, + { + "epoch": 0.7939154243991482, + "grad_norm": 1.014554500579834, + "learning_rate": 1.354464580700524e-05, + "loss": 0.4993, + "step": 6524 + }, + { + "epoch": 0.7940371159111652, + "grad_norm": 2.263448476791382, + "learning_rate": 1.3542821705245908e-05, + "loss": 0.4553, + "step": 6525 + }, + { + "epoch": 0.7941588074231822, + "grad_norm": 5.160703659057617, + "learning_rate": 1.3540997468674876e-05, + "loss": 0.3738, + "step": 6526 + }, + { + "epoch": 0.7942804989351993, + "grad_norm": 2.4914934635162354, + "learning_rate": 1.353917309736156e-05, + "loss": 0.4812, + "step": 6527 + }, + { + "epoch": 0.7944021904472163, + "grad_norm": 3.265282392501831, + "learning_rate": 1.353734859137538e-05, + "loss": 0.3995, + "step": 6528 + }, + { + "epoch": 0.7945238819592333, + "grad_norm": 0.5951906442642212, + "learning_rate": 1.3535523950785768e-05, + "loss": 0.4473, + "step": 6529 + }, + { + "epoch": 0.7946455734712504, + "grad_norm": 3.6978344917297363, + "learning_rate": 1.3533699175662149e-05, + "loss": 0.481, + "step": 6530 + }, + { + "epoch": 0.7947672649832674, + "grad_norm": 3.979156017303467, + "learning_rate": 1.3531874266073963e-05, + "loss": 0.5491, + "step": 6531 + }, + { + "epoch": 0.7948889564952845, + "grad_norm": 0.5644336342811584, + "learning_rate": 1.353004922209065e-05, + "loss": 0.421, + "step": 6532 + }, + { + "epoch": 0.7950106480073015, + "grad_norm": 1.8630427122116089, + "learning_rate": 1.3528224043781659e-05, + "loss": 0.4421, + "step": 6533 + }, + { + "epoch": 0.7951323395193185, + "grad_norm": 1.3852897882461548, + "learning_rate": 1.3526398731216438e-05, + "loss": 0.4247, + "step": 6534 + }, + { + "epoch": 0.7952540310313355, + "grad_norm": 0.5969976186752319, + "learning_rate": 1.3524573284464444e-05, + "loss": 0.4231, + "step": 6535 + }, + { + "epoch": 0.7953757225433526, + "grad_norm": 1.3499457836151123, + "learning_rate": 1.3522747703595145e-05, + "loss": 0.3978, + "step": 6536 + }, + { + "epoch": 0.7954974140553697, + "grad_norm": 1.8621306419372559, + "learning_rate": 1.3520921988678003e-05, + "loss": 0.4872, + "step": 6537 + }, + { + "epoch": 0.7956191055673867, + "grad_norm": 1.6049827337265015, + "learning_rate": 1.3519096139782493e-05, + "loss": 0.4328, + "step": 6538 + }, + { + "epoch": 0.7957407970794037, + "grad_norm": 0.6554694771766663, + "learning_rate": 1.3517270156978088e-05, + "loss": 0.4753, + "step": 6539 + }, + { + "epoch": 0.7958624885914207, + "grad_norm": 0.6777941584587097, + "learning_rate": 1.3515444040334274e-05, + "loss": 0.4845, + "step": 6540 + }, + { + "epoch": 0.7959841801034377, + "grad_norm": 0.7399910092353821, + "learning_rate": 1.3513617789920538e-05, + "loss": 0.4673, + "step": 6541 + }, + { + "epoch": 0.7961058716154549, + "grad_norm": 2.4419808387756348, + "learning_rate": 1.3511791405806374e-05, + "loss": 0.4347, + "step": 6542 + }, + { + "epoch": 0.7962275631274719, + "grad_norm": 0.7896532416343689, + "learning_rate": 1.3509964888061276e-05, + "loss": 0.4232, + "step": 6543 + }, + { + "epoch": 0.7963492546394889, + "grad_norm": 0.5458998680114746, + "learning_rate": 1.3508138236754746e-05, + "loss": 0.4167, + "step": 6544 + }, + { + "epoch": 0.7964709461515059, + "grad_norm": 1.0785274505615234, + "learning_rate": 1.35063114519563e-05, + "loss": 0.4212, + "step": 6545 + }, + { + "epoch": 0.796592637663523, + "grad_norm": 1.268502950668335, + "learning_rate": 1.3504484533735442e-05, + "loss": 0.4492, + "step": 6546 + }, + { + "epoch": 0.79671432917554, + "grad_norm": 2.303605079650879, + "learning_rate": 1.3502657482161699e-05, + "loss": 0.4725, + "step": 6547 + }, + { + "epoch": 0.796836020687557, + "grad_norm": 5.79015588760376, + "learning_rate": 1.3500830297304582e-05, + "loss": 0.5871, + "step": 6548 + }, + { + "epoch": 0.7969577121995741, + "grad_norm": 4.665104389190674, + "learning_rate": 1.349900297923363e-05, + "loss": 0.5182, + "step": 6549 + }, + { + "epoch": 0.7970794037115911, + "grad_norm": 2.7406017780303955, + "learning_rate": 1.3497175528018369e-05, + "loss": 0.4544, + "step": 6550 + }, + { + "epoch": 0.7972010952236082, + "grad_norm": 0.5910564064979553, + "learning_rate": 1.3495347943728341e-05, + "loss": 0.3936, + "step": 6551 + }, + { + "epoch": 0.7973227867356252, + "grad_norm": 1.666337013244629, + "learning_rate": 1.3493520226433093e-05, + "loss": 0.4549, + "step": 6552 + }, + { + "epoch": 0.7974444782476422, + "grad_norm": 1.374701738357544, + "learning_rate": 1.3491692376202165e-05, + "loss": 0.5081, + "step": 6553 + }, + { + "epoch": 0.7975661697596592, + "grad_norm": 0.7142630815505981, + "learning_rate": 1.3489864393105117e-05, + "loss": 0.4929, + "step": 6554 + }, + { + "epoch": 0.7976878612716763, + "grad_norm": 2.436876058578491, + "learning_rate": 1.3488036277211502e-05, + "loss": 0.4503, + "step": 6555 + }, + { + "epoch": 0.7978095527836934, + "grad_norm": 2.3436686992645264, + "learning_rate": 1.348620802859089e-05, + "loss": 0.4649, + "step": 6556 + }, + { + "epoch": 0.7979312442957104, + "grad_norm": 0.8214684724807739, + "learning_rate": 1.3484379647312844e-05, + "loss": 0.5103, + "step": 6557 + }, + { + "epoch": 0.7980529358077274, + "grad_norm": 2.7678382396698, + "learning_rate": 1.3482551133446942e-05, + "loss": 0.4188, + "step": 6558 + }, + { + "epoch": 0.7981746273197444, + "grad_norm": 2.1724841594696045, + "learning_rate": 1.3480722487062759e-05, + "loss": 0.4948, + "step": 6559 + }, + { + "epoch": 0.7982963188317614, + "grad_norm": 1.8633304834365845, + "learning_rate": 1.3478893708229883e-05, + "loss": 0.4662, + "step": 6560 + }, + { + "epoch": 0.7984180103437786, + "grad_norm": 0.9057595133781433, + "learning_rate": 1.34770647970179e-05, + "loss": 0.4964, + "step": 6561 + }, + { + "epoch": 0.7985397018557956, + "grad_norm": 0.6968687176704407, + "learning_rate": 1.3475235753496403e-05, + "loss": 0.4332, + "step": 6562 + }, + { + "epoch": 0.7986613933678126, + "grad_norm": 2.182694673538208, + "learning_rate": 1.3473406577734993e-05, + "loss": 0.5144, + "step": 6563 + }, + { + "epoch": 0.7987830848798296, + "grad_norm": 0.9346823692321777, + "learning_rate": 1.3471577269803274e-05, + "loss": 0.4359, + "step": 6564 + }, + { + "epoch": 0.7989047763918466, + "grad_norm": 2.3150224685668945, + "learning_rate": 1.3469747829770854e-05, + "loss": 0.482, + "step": 6565 + }, + { + "epoch": 0.7990264679038637, + "grad_norm": 1.5315240621566772, + "learning_rate": 1.3467918257707344e-05, + "loss": 0.4033, + "step": 6566 + }, + { + "epoch": 0.7991481594158807, + "grad_norm": 1.580141305923462, + "learning_rate": 1.346608855368237e-05, + "loss": 0.4982, + "step": 6567 + }, + { + "epoch": 0.7992698509278978, + "grad_norm": 4.9104228019714355, + "learning_rate": 1.3464258717765551e-05, + "loss": 0.5695, + "step": 6568 + }, + { + "epoch": 0.7993915424399148, + "grad_norm": 0.6425305604934692, + "learning_rate": 1.3462428750026514e-05, + "loss": 0.4191, + "step": 6569 + }, + { + "epoch": 0.7995132339519319, + "grad_norm": 0.6832162737846375, + "learning_rate": 1.3460598650534902e-05, + "loss": 0.4127, + "step": 6570 + }, + { + "epoch": 0.7996349254639489, + "grad_norm": 0.6767962574958801, + "learning_rate": 1.3458768419360344e-05, + "loss": 0.4449, + "step": 6571 + }, + { + "epoch": 0.7997566169759659, + "grad_norm": 4.664670944213867, + "learning_rate": 1.3456938056572489e-05, + "loss": 0.5902, + "step": 6572 + }, + { + "epoch": 0.7998783084879829, + "grad_norm": 0.5448784828186035, + "learning_rate": 1.3455107562240985e-05, + "loss": 0.408, + "step": 6573 + }, + { + "epoch": 0.8, + "grad_norm": 2.1333236694335938, + "learning_rate": 1.3453276936435486e-05, + "loss": 0.4352, + "step": 6574 + }, + { + "epoch": 0.8001216915120171, + "grad_norm": 2.676685333251953, + "learning_rate": 1.3451446179225655e-05, + "loss": 0.4436, + "step": 6575 + }, + { + "epoch": 0.8002433830240341, + "grad_norm": 2.743525981903076, + "learning_rate": 1.3449615290681154e-05, + "loss": 0.4025, + "step": 6576 + }, + { + "epoch": 0.8003650745360511, + "grad_norm": 1.2232776880264282, + "learning_rate": 1.3447784270871646e-05, + "loss": 0.4367, + "step": 6577 + }, + { + "epoch": 0.8004867660480681, + "grad_norm": 0.6163656115531921, + "learning_rate": 1.3445953119866813e-05, + "loss": 0.4581, + "step": 6578 + }, + { + "epoch": 0.8006084575600851, + "grad_norm": 0.7034110426902771, + "learning_rate": 1.344412183773633e-05, + "loss": 0.4409, + "step": 6579 + }, + { + "epoch": 0.8007301490721023, + "grad_norm": 4.315017223358154, + "learning_rate": 1.3442290424549882e-05, + "loss": 0.5704, + "step": 6580 + }, + { + "epoch": 0.8008518405841193, + "grad_norm": 1.6207174062728882, + "learning_rate": 1.3440458880377156e-05, + "loss": 0.4527, + "step": 6581 + }, + { + "epoch": 0.8009735320961363, + "grad_norm": 0.8339226245880127, + "learning_rate": 1.3438627205287853e-05, + "loss": 0.3911, + "step": 6582 + }, + { + "epoch": 0.8010952236081533, + "grad_norm": 0.7527160048484802, + "learning_rate": 1.3436795399351665e-05, + "loss": 0.4591, + "step": 6583 + }, + { + "epoch": 0.8012169151201703, + "grad_norm": 0.7859740257263184, + "learning_rate": 1.3434963462638298e-05, + "loss": 0.4251, + "step": 6584 + }, + { + "epoch": 0.8013386066321874, + "grad_norm": 1.3363676071166992, + "learning_rate": 1.343313139521746e-05, + "loss": 0.4363, + "step": 6585 + }, + { + "epoch": 0.8014602981442045, + "grad_norm": 2.3438727855682373, + "learning_rate": 1.3431299197158869e-05, + "loss": 0.4807, + "step": 6586 + }, + { + "epoch": 0.8015819896562215, + "grad_norm": 0.6119399666786194, + "learning_rate": 1.3429466868532239e-05, + "loss": 0.4138, + "step": 6587 + }, + { + "epoch": 0.8017036811682385, + "grad_norm": 0.8425772190093994, + "learning_rate": 1.3427634409407298e-05, + "loss": 0.4803, + "step": 6588 + }, + { + "epoch": 0.8018253726802556, + "grad_norm": 0.7016222476959229, + "learning_rate": 1.342580181985377e-05, + "loss": 0.4458, + "step": 6589 + }, + { + "epoch": 0.8019470641922726, + "grad_norm": 2.7810826301574707, + "learning_rate": 1.3423969099941396e-05, + "loss": 0.4113, + "step": 6590 + }, + { + "epoch": 0.8020687557042896, + "grad_norm": 1.088829517364502, + "learning_rate": 1.3422136249739906e-05, + "loss": 0.4257, + "step": 6591 + }, + { + "epoch": 0.8021904472163066, + "grad_norm": 0.992914617061615, + "learning_rate": 1.3420303269319051e-05, + "loss": 0.4797, + "step": 6592 + }, + { + "epoch": 0.8023121387283237, + "grad_norm": 1.0340906381607056, + "learning_rate": 1.3418470158748575e-05, + "loss": 0.4216, + "step": 6593 + }, + { + "epoch": 0.8024338302403408, + "grad_norm": 1.5709484815597534, + "learning_rate": 1.3416636918098239e-05, + "loss": 0.4455, + "step": 6594 + }, + { + "epoch": 0.8025555217523578, + "grad_norm": 3.0877702236175537, + "learning_rate": 1.3414803547437789e-05, + "loss": 0.3774, + "step": 6595 + }, + { + "epoch": 0.8026772132643748, + "grad_norm": 1.7690620422363281, + "learning_rate": 1.3412970046837e-05, + "loss": 0.4586, + "step": 6596 + }, + { + "epoch": 0.8027989047763918, + "grad_norm": 3.440791606903076, + "learning_rate": 1.3411136416365635e-05, + "loss": 0.5166, + "step": 6597 + }, + { + "epoch": 0.8029205962884088, + "grad_norm": 0.8563041090965271, + "learning_rate": 1.3409302656093468e-05, + "loss": 0.4341, + "step": 6598 + }, + { + "epoch": 0.803042287800426, + "grad_norm": 2.609707832336426, + "learning_rate": 1.340746876609028e-05, + "loss": 0.4871, + "step": 6599 + }, + { + "epoch": 0.803163979312443, + "grad_norm": 0.8562548160552979, + "learning_rate": 1.340563474642585e-05, + "loss": 0.4687, + "step": 6600 + }, + { + "epoch": 0.80328567082446, + "grad_norm": 0.8712829947471619, + "learning_rate": 1.3403800597169971e-05, + "loss": 0.4597, + "step": 6601 + }, + { + "epoch": 0.803407362336477, + "grad_norm": 0.7331824898719788, + "learning_rate": 1.3401966318392433e-05, + "loss": 0.4464, + "step": 6602 + }, + { + "epoch": 0.803529053848494, + "grad_norm": 1.2333920001983643, + "learning_rate": 1.3400131910163032e-05, + "loss": 0.4577, + "step": 6603 + }, + { + "epoch": 0.8036507453605111, + "grad_norm": 0.9844228029251099, + "learning_rate": 1.3398297372551577e-05, + "loss": 0.4685, + "step": 6604 + }, + { + "epoch": 0.8037724368725282, + "grad_norm": 1.145633339881897, + "learning_rate": 1.3396462705627875e-05, + "loss": 0.4617, + "step": 6605 + }, + { + "epoch": 0.8038941283845452, + "grad_norm": 3.1598007678985596, + "learning_rate": 1.3394627909461733e-05, + "loss": 0.4209, + "step": 6606 + }, + { + "epoch": 0.8040158198965622, + "grad_norm": 1.0236690044403076, + "learning_rate": 1.3392792984122973e-05, + "loss": 0.4654, + "step": 6607 + }, + { + "epoch": 0.8041375114085793, + "grad_norm": 1.344252109527588, + "learning_rate": 1.3390957929681416e-05, + "loss": 0.495, + "step": 6608 + }, + { + "epoch": 0.8042592029205963, + "grad_norm": 0.7431895732879639, + "learning_rate": 1.3389122746206896e-05, + "loss": 0.4865, + "step": 6609 + }, + { + "epoch": 0.8043808944326133, + "grad_norm": 0.7696148753166199, + "learning_rate": 1.3387287433769236e-05, + "loss": 0.4916, + "step": 6610 + }, + { + "epoch": 0.8045025859446303, + "grad_norm": 1.1978672742843628, + "learning_rate": 1.338545199243828e-05, + "loss": 0.463, + "step": 6611 + }, + { + "epoch": 0.8046242774566474, + "grad_norm": 0.9850582480430603, + "learning_rate": 1.3383616422283865e-05, + "loss": 0.4874, + "step": 6612 + }, + { + "epoch": 0.8047459689686645, + "grad_norm": 1.781611680984497, + "learning_rate": 1.3381780723375845e-05, + "loss": 0.4051, + "step": 6613 + }, + { + "epoch": 0.8048676604806815, + "grad_norm": 0.579318106174469, + "learning_rate": 1.3379944895784067e-05, + "loss": 0.4391, + "step": 6614 + }, + { + "epoch": 0.8049893519926985, + "grad_norm": 1.0190308094024658, + "learning_rate": 1.337810893957839e-05, + "loss": 0.4578, + "step": 6615 + }, + { + "epoch": 0.8051110435047155, + "grad_norm": 1.111925482749939, + "learning_rate": 1.3376272854828675e-05, + "loss": 0.451, + "step": 6616 + }, + { + "epoch": 0.8052327350167325, + "grad_norm": 1.1480294466018677, + "learning_rate": 1.3374436641604791e-05, + "loss": 0.4072, + "step": 6617 + }, + { + "epoch": 0.8053544265287497, + "grad_norm": 0.5749523639678955, + "learning_rate": 1.3372600299976606e-05, + "loss": 0.4472, + "step": 6618 + }, + { + "epoch": 0.8054761180407667, + "grad_norm": 1.753135323524475, + "learning_rate": 1.3370763830014e-05, + "loss": 0.4757, + "step": 6619 + }, + { + "epoch": 0.8055978095527837, + "grad_norm": 1.2991291284561157, + "learning_rate": 1.3368927231786853e-05, + "loss": 0.4878, + "step": 6620 + }, + { + "epoch": 0.8057195010648007, + "grad_norm": 0.6958457827568054, + "learning_rate": 1.3367090505365051e-05, + "loss": 0.4391, + "step": 6621 + }, + { + "epoch": 0.8058411925768177, + "grad_norm": 0.7624719142913818, + "learning_rate": 1.3365253650818485e-05, + "loss": 0.4363, + "step": 6622 + }, + { + "epoch": 0.8059628840888348, + "grad_norm": 0.706021785736084, + "learning_rate": 1.3363416668217055e-05, + "loss": 0.4787, + "step": 6623 + }, + { + "epoch": 0.8060845756008519, + "grad_norm": 4.138987064361572, + "learning_rate": 1.3361579557630658e-05, + "loss": 0.4307, + "step": 6624 + }, + { + "epoch": 0.8062062671128689, + "grad_norm": 1.60456383228302, + "learning_rate": 1.3359742319129199e-05, + "loss": 0.5063, + "step": 6625 + }, + { + "epoch": 0.8063279586248859, + "grad_norm": 2.763915538787842, + "learning_rate": 1.3357904952782587e-05, + "loss": 0.4472, + "step": 6626 + }, + { + "epoch": 0.806449650136903, + "grad_norm": 2.176271677017212, + "learning_rate": 1.3356067458660749e-05, + "loss": 0.4464, + "step": 6627 + }, + { + "epoch": 0.80657134164892, + "grad_norm": 0.8584074974060059, + "learning_rate": 1.3354229836833594e-05, + "loss": 0.4601, + "step": 6628 + }, + { + "epoch": 0.806693033160937, + "grad_norm": 2.678847074508667, + "learning_rate": 1.335239208737105e-05, + "loss": 0.384, + "step": 6629 + }, + { + "epoch": 0.806814724672954, + "grad_norm": 2.9203453063964844, + "learning_rate": 1.3350554210343048e-05, + "loss": 0.4937, + "step": 6630 + }, + { + "epoch": 0.8069364161849711, + "grad_norm": 1.1138005256652832, + "learning_rate": 1.3348716205819523e-05, + "loss": 0.5036, + "step": 6631 + }, + { + "epoch": 0.8070581076969882, + "grad_norm": 0.6669288873672485, + "learning_rate": 1.3346878073870415e-05, + "loss": 0.5199, + "step": 6632 + }, + { + "epoch": 0.8071797992090052, + "grad_norm": 3.3877501487731934, + "learning_rate": 1.3345039814565668e-05, + "loss": 0.3926, + "step": 6633 + }, + { + "epoch": 0.8073014907210222, + "grad_norm": 1.7293299436569214, + "learning_rate": 1.3343201427975234e-05, + "loss": 0.4229, + "step": 6634 + }, + { + "epoch": 0.8074231822330392, + "grad_norm": 1.7728866338729858, + "learning_rate": 1.3341362914169067e-05, + "loss": 0.4609, + "step": 6635 + }, + { + "epoch": 0.8075448737450562, + "grad_norm": 2.9592413902282715, + "learning_rate": 1.3339524273217122e-05, + "loss": 0.4021, + "step": 6636 + }, + { + "epoch": 0.8076665652570734, + "grad_norm": 1.5425691604614258, + "learning_rate": 1.3337685505189364e-05, + "loss": 0.373, + "step": 6637 + }, + { + "epoch": 0.8077882567690904, + "grad_norm": 1.6649357080459595, + "learning_rate": 1.3335846610155767e-05, + "loss": 0.4277, + "step": 6638 + }, + { + "epoch": 0.8079099482811074, + "grad_norm": 5.9295268058776855, + "learning_rate": 1.3334007588186301e-05, + "loss": 0.5674, + "step": 6639 + }, + { + "epoch": 0.8080316397931244, + "grad_norm": 0.8094533681869507, + "learning_rate": 1.3332168439350948e-05, + "loss": 0.4052, + "step": 6640 + }, + { + "epoch": 0.8081533313051414, + "grad_norm": 1.8929312229156494, + "learning_rate": 1.3330329163719684e-05, + "loss": 0.4286, + "step": 6641 + }, + { + "epoch": 0.8082750228171585, + "grad_norm": 2.9139959812164307, + "learning_rate": 1.3328489761362505e-05, + "loss": 0.4411, + "step": 6642 + }, + { + "epoch": 0.8083967143291756, + "grad_norm": 2.1933822631835938, + "learning_rate": 1.33266502323494e-05, + "loss": 0.4252, + "step": 6643 + }, + { + "epoch": 0.8085184058411926, + "grad_norm": 1.8865286111831665, + "learning_rate": 1.3324810576750369e-05, + "loss": 0.4757, + "step": 6644 + }, + { + "epoch": 0.8086400973532096, + "grad_norm": 2.809324026107788, + "learning_rate": 1.3322970794635412e-05, + "loss": 0.5048, + "step": 6645 + }, + { + "epoch": 0.8087617888652266, + "grad_norm": 0.6368886828422546, + "learning_rate": 1.3321130886074538e-05, + "loss": 0.4258, + "step": 6646 + }, + { + "epoch": 0.8088834803772437, + "grad_norm": 2.1594743728637695, + "learning_rate": 1.3319290851137763e-05, + "loss": 0.4035, + "step": 6647 + }, + { + "epoch": 0.8090051718892607, + "grad_norm": 1.0266525745391846, + "learning_rate": 1.3317450689895095e-05, + "loss": 0.4867, + "step": 6648 + }, + { + "epoch": 0.8091268634012777, + "grad_norm": 1.2973319292068481, + "learning_rate": 1.3315610402416563e-05, + "loss": 0.4697, + "step": 6649 + }, + { + "epoch": 0.8092485549132948, + "grad_norm": 1.4728530645370483, + "learning_rate": 1.3313769988772195e-05, + "loss": 0.5034, + "step": 6650 + }, + { + "epoch": 0.8093702464253119, + "grad_norm": 0.8336383700370789, + "learning_rate": 1.331192944903202e-05, + "loss": 0.4768, + "step": 6651 + }, + { + "epoch": 0.8094919379373289, + "grad_norm": 1.8269641399383545, + "learning_rate": 1.3310088783266071e-05, + "loss": 0.4789, + "step": 6652 + }, + { + "epoch": 0.8096136294493459, + "grad_norm": 3.5603976249694824, + "learning_rate": 1.3308247991544392e-05, + "loss": 0.4306, + "step": 6653 + }, + { + "epoch": 0.8097353209613629, + "grad_norm": 0.7099655270576477, + "learning_rate": 1.3306407073937031e-05, + "loss": 0.4638, + "step": 6654 + }, + { + "epoch": 0.8098570124733799, + "grad_norm": 1.9727869033813477, + "learning_rate": 1.3304566030514037e-05, + "loss": 0.4353, + "step": 6655 + }, + { + "epoch": 0.8099787039853971, + "grad_norm": 2.759723663330078, + "learning_rate": 1.3302724861345464e-05, + "loss": 0.3676, + "step": 6656 + }, + { + "epoch": 0.8101003954974141, + "grad_norm": 1.030905842781067, + "learning_rate": 1.3300883566501376e-05, + "loss": 0.4789, + "step": 6657 + }, + { + "epoch": 0.8102220870094311, + "grad_norm": 0.8978413343429565, + "learning_rate": 1.3299042146051837e-05, + "loss": 0.4918, + "step": 6658 + }, + { + "epoch": 0.8103437785214481, + "grad_norm": 1.9734023809432983, + "learning_rate": 1.3297200600066912e-05, + "loss": 0.4365, + "step": 6659 + }, + { + "epoch": 0.8104654700334651, + "grad_norm": 6.101864337921143, + "learning_rate": 1.3295358928616679e-05, + "loss": 0.5943, + "step": 6660 + }, + { + "epoch": 0.8105871615454822, + "grad_norm": 1.9487838745117188, + "learning_rate": 1.3293517131771224e-05, + "loss": 0.4412, + "step": 6661 + }, + { + "epoch": 0.8107088530574993, + "grad_norm": 0.6250563263893127, + "learning_rate": 1.3291675209600619e-05, + "loss": 0.4365, + "step": 6662 + }, + { + "epoch": 0.8108305445695163, + "grad_norm": 2.8699235916137695, + "learning_rate": 1.328983316217496e-05, + "loss": 0.3765, + "step": 6663 + }, + { + "epoch": 0.8109522360815333, + "grad_norm": 2.4612600803375244, + "learning_rate": 1.3287990989564338e-05, + "loss": 0.3853, + "step": 6664 + }, + { + "epoch": 0.8110739275935503, + "grad_norm": 2.3636205196380615, + "learning_rate": 1.3286148691838859e-05, + "loss": 0.4363, + "step": 6665 + }, + { + "epoch": 0.8111956191055674, + "grad_norm": 4.065032482147217, + "learning_rate": 1.3284306269068614e-05, + "loss": 0.5549, + "step": 6666 + }, + { + "epoch": 0.8113173106175844, + "grad_norm": 2.5878663063049316, + "learning_rate": 1.328246372132372e-05, + "loss": 0.4955, + "step": 6667 + }, + { + "epoch": 0.8114390021296014, + "grad_norm": 0.8897340297698975, + "learning_rate": 1.3280621048674288e-05, + "loss": 0.4586, + "step": 6668 + }, + { + "epoch": 0.8115606936416185, + "grad_norm": 1.4318222999572754, + "learning_rate": 1.3278778251190431e-05, + "loss": 0.4908, + "step": 6669 + }, + { + "epoch": 0.8116823851536356, + "grad_norm": 0.6420795321464539, + "learning_rate": 1.327693532894228e-05, + "loss": 0.4176, + "step": 6670 + }, + { + "epoch": 0.8118040766656526, + "grad_norm": 2.915782928466797, + "learning_rate": 1.3275092281999952e-05, + "loss": 0.3939, + "step": 6671 + }, + { + "epoch": 0.8119257681776696, + "grad_norm": 1.9668383598327637, + "learning_rate": 1.3273249110433587e-05, + "loss": 0.4195, + "step": 6672 + }, + { + "epoch": 0.8120474596896866, + "grad_norm": 0.6759459972381592, + "learning_rate": 1.3271405814313316e-05, + "loss": 0.4455, + "step": 6673 + }, + { + "epoch": 0.8121691512017036, + "grad_norm": 0.6696354150772095, + "learning_rate": 1.3269562393709286e-05, + "loss": 0.4332, + "step": 6674 + }, + { + "epoch": 0.8122908427137208, + "grad_norm": 1.029154658317566, + "learning_rate": 1.3267718848691634e-05, + "loss": 0.4161, + "step": 6675 + }, + { + "epoch": 0.8124125342257378, + "grad_norm": 1.3693962097167969, + "learning_rate": 1.3265875179330517e-05, + "loss": 0.4497, + "step": 6676 + }, + { + "epoch": 0.8125342257377548, + "grad_norm": 3.642892837524414, + "learning_rate": 1.3264031385696092e-05, + "loss": 0.5223, + "step": 6677 + }, + { + "epoch": 0.8126559172497718, + "grad_norm": 0.8846643567085266, + "learning_rate": 1.3262187467858514e-05, + "loss": 0.4769, + "step": 6678 + }, + { + "epoch": 0.8127776087617888, + "grad_norm": 2.3803341388702393, + "learning_rate": 1.326034342588795e-05, + "loss": 0.4949, + "step": 6679 + }, + { + "epoch": 0.8128993002738059, + "grad_norm": 3.128622531890869, + "learning_rate": 1.3258499259854576e-05, + "loss": 0.5033, + "step": 6680 + }, + { + "epoch": 0.813020991785823, + "grad_norm": 0.6702750325202942, + "learning_rate": 1.3256654969828558e-05, + "loss": 0.4666, + "step": 6681 + }, + { + "epoch": 0.81314268329784, + "grad_norm": 2.27921724319458, + "learning_rate": 1.3254810555880074e-05, + "loss": 0.5003, + "step": 6682 + }, + { + "epoch": 0.813264374809857, + "grad_norm": 0.7695069909095764, + "learning_rate": 1.3252966018079312e-05, + "loss": 0.5037, + "step": 6683 + }, + { + "epoch": 0.813386066321874, + "grad_norm": 0.6124937534332275, + "learning_rate": 1.3251121356496462e-05, + "loss": 0.4561, + "step": 6684 + }, + { + "epoch": 0.8135077578338911, + "grad_norm": 2.236314296722412, + "learning_rate": 1.3249276571201714e-05, + "loss": 0.4811, + "step": 6685 + }, + { + "epoch": 0.8136294493459081, + "grad_norm": 3.784168243408203, + "learning_rate": 1.3247431662265267e-05, + "loss": 0.4334, + "step": 6686 + }, + { + "epoch": 0.8137511408579252, + "grad_norm": 5.357327461242676, + "learning_rate": 1.3245586629757323e-05, + "loss": 0.4391, + "step": 6687 + }, + { + "epoch": 0.8138728323699422, + "grad_norm": 2.820408821105957, + "learning_rate": 1.3243741473748091e-05, + "loss": 0.4778, + "step": 6688 + }, + { + "epoch": 0.8139945238819593, + "grad_norm": 3.5945191383361816, + "learning_rate": 1.324189619430778e-05, + "loss": 0.4732, + "step": 6689 + }, + { + "epoch": 0.8141162153939763, + "grad_norm": 2.078001022338867, + "learning_rate": 1.3240050791506609e-05, + "loss": 0.4883, + "step": 6690 + }, + { + "epoch": 0.8142379069059933, + "grad_norm": 2.0544700622558594, + "learning_rate": 1.32382052654148e-05, + "loss": 0.5085, + "step": 6691 + }, + { + "epoch": 0.8143595984180103, + "grad_norm": 3.0333774089813232, + "learning_rate": 1.3236359616102576e-05, + "loss": 0.3931, + "step": 6692 + }, + { + "epoch": 0.8144812899300273, + "grad_norm": 0.9521600008010864, + "learning_rate": 1.3234513843640171e-05, + "loss": 0.4128, + "step": 6693 + }, + { + "epoch": 0.8146029814420445, + "grad_norm": 0.8809683322906494, + "learning_rate": 1.3232667948097818e-05, + "loss": 0.4374, + "step": 6694 + }, + { + "epoch": 0.8147246729540615, + "grad_norm": 5.278363227844238, + "learning_rate": 1.3230821929545758e-05, + "loss": 0.5436, + "step": 6695 + }, + { + "epoch": 0.8148463644660785, + "grad_norm": 2.8285536766052246, + "learning_rate": 1.322897578805424e-05, + "loss": 0.4687, + "step": 6696 + }, + { + "epoch": 0.8149680559780955, + "grad_norm": 0.9588330984115601, + "learning_rate": 1.3227129523693507e-05, + "loss": 0.4441, + "step": 6697 + }, + { + "epoch": 0.8150897474901125, + "grad_norm": 4.537371635437012, + "learning_rate": 1.3225283136533815e-05, + "loss": 0.5379, + "step": 6698 + }, + { + "epoch": 0.8152114390021296, + "grad_norm": 7.747735977172852, + "learning_rate": 1.3223436626645423e-05, + "loss": 0.609, + "step": 6699 + }, + { + "epoch": 0.8153331305141467, + "grad_norm": 3.778442859649658, + "learning_rate": 1.3221589994098598e-05, + "loss": 0.4583, + "step": 6700 + }, + { + "epoch": 0.8154548220261637, + "grad_norm": 0.7494981288909912, + "learning_rate": 1.3219743238963603e-05, + "loss": 0.422, + "step": 6701 + }, + { + "epoch": 0.8155765135381807, + "grad_norm": 0.6733980774879456, + "learning_rate": 1.3217896361310713e-05, + "loss": 0.4108, + "step": 6702 + }, + { + "epoch": 0.8156982050501977, + "grad_norm": 3.6487674713134766, + "learning_rate": 1.3216049361210208e-05, + "loss": 0.3315, + "step": 6703 + }, + { + "epoch": 0.8158198965622148, + "grad_norm": 0.939558744430542, + "learning_rate": 1.3214202238732367e-05, + "loss": 0.4469, + "step": 6704 + }, + { + "epoch": 0.8159415880742318, + "grad_norm": 0.7197710275650024, + "learning_rate": 1.3212354993947478e-05, + "loss": 0.4774, + "step": 6705 + }, + { + "epoch": 0.8160632795862489, + "grad_norm": 2.546074390411377, + "learning_rate": 1.3210507626925833e-05, + "loss": 0.4595, + "step": 6706 + }, + { + "epoch": 0.8161849710982659, + "grad_norm": 1.2112523317337036, + "learning_rate": 1.3208660137737725e-05, + "loss": 0.5203, + "step": 6707 + }, + { + "epoch": 0.816306662610283, + "grad_norm": 3.5929346084594727, + "learning_rate": 1.3206812526453458e-05, + "loss": 0.4012, + "step": 6708 + }, + { + "epoch": 0.8164283541223, + "grad_norm": 3.0846757888793945, + "learning_rate": 1.320496479314334e-05, + "loss": 0.4433, + "step": 6709 + }, + { + "epoch": 0.816550045634317, + "grad_norm": 0.7732279300689697, + "learning_rate": 1.3203116937877674e-05, + "loss": 0.4835, + "step": 6710 + }, + { + "epoch": 0.816671737146334, + "grad_norm": 2.4987215995788574, + "learning_rate": 1.320126896072678e-05, + "loss": 0.4186, + "step": 6711 + }, + { + "epoch": 0.816793428658351, + "grad_norm": 1.4232234954833984, + "learning_rate": 1.3199420861760974e-05, + "loss": 0.5002, + "step": 6712 + }, + { + "epoch": 0.8169151201703682, + "grad_norm": 2.0465641021728516, + "learning_rate": 1.3197572641050583e-05, + "loss": 0.4509, + "step": 6713 + }, + { + "epoch": 0.8170368116823852, + "grad_norm": 1.7632215023040771, + "learning_rate": 1.3195724298665935e-05, + "loss": 0.4045, + "step": 6714 + }, + { + "epoch": 0.8171585031944022, + "grad_norm": 0.7635778188705444, + "learning_rate": 1.3193875834677363e-05, + "loss": 0.4632, + "step": 6715 + }, + { + "epoch": 0.8172801947064192, + "grad_norm": 0.5357716083526611, + "learning_rate": 1.3192027249155205e-05, + "loss": 0.4081, + "step": 6716 + }, + { + "epoch": 0.8174018862184362, + "grad_norm": 2.5138120651245117, + "learning_rate": 1.31901785421698e-05, + "loss": 0.5036, + "step": 6717 + }, + { + "epoch": 0.8175235777304533, + "grad_norm": 1.8262012004852295, + "learning_rate": 1.3188329713791502e-05, + "loss": 0.4514, + "step": 6718 + }, + { + "epoch": 0.8176452692424704, + "grad_norm": 2.4968302249908447, + "learning_rate": 1.3186480764090655e-05, + "loss": 0.4719, + "step": 6719 + }, + { + "epoch": 0.8177669607544874, + "grad_norm": 0.6859037280082703, + "learning_rate": 1.3184631693137622e-05, + "loss": 0.4342, + "step": 6720 + }, + { + "epoch": 0.8178886522665044, + "grad_norm": 2.740236282348633, + "learning_rate": 1.3182782501002757e-05, + "loss": 0.4796, + "step": 6721 + }, + { + "epoch": 0.8180103437785214, + "grad_norm": 0.7942762970924377, + "learning_rate": 1.3180933187756435e-05, + "loss": 0.43, + "step": 6722 + }, + { + "epoch": 0.8181320352905385, + "grad_norm": 0.9182960391044617, + "learning_rate": 1.3179083753469018e-05, + "loss": 0.4821, + "step": 6723 + }, + { + "epoch": 0.8182537268025555, + "grad_norm": 1.0311622619628906, + "learning_rate": 1.3177234198210885e-05, + "loss": 0.5229, + "step": 6724 + }, + { + "epoch": 0.8183754183145726, + "grad_norm": 2.3608343601226807, + "learning_rate": 1.3175384522052413e-05, + "loss": 0.4715, + "step": 6725 + }, + { + "epoch": 0.8184971098265896, + "grad_norm": 3.9457805156707764, + "learning_rate": 1.3173534725063992e-05, + "loss": 0.4546, + "step": 6726 + }, + { + "epoch": 0.8186188013386066, + "grad_norm": 2.941842555999756, + "learning_rate": 1.3171684807316e-05, + "loss": 0.5047, + "step": 6727 + }, + { + "epoch": 0.8187404928506237, + "grad_norm": 3.6807191371917725, + "learning_rate": 1.316983476887884e-05, + "loss": 0.5019, + "step": 6728 + }, + { + "epoch": 0.8188621843626407, + "grad_norm": 5.569693565368652, + "learning_rate": 1.3167984609822901e-05, + "loss": 0.4589, + "step": 6729 + }, + { + "epoch": 0.8189838758746577, + "grad_norm": 1.4934221506118774, + "learning_rate": 1.3166134330218597e-05, + "loss": 0.485, + "step": 6730 + }, + { + "epoch": 0.8191055673866747, + "grad_norm": 2.0929012298583984, + "learning_rate": 1.3164283930136321e-05, + "loss": 0.4819, + "step": 6731 + }, + { + "epoch": 0.8192272588986919, + "grad_norm": 1.0450574159622192, + "learning_rate": 1.3162433409646499e-05, + "loss": 0.4977, + "step": 6732 + }, + { + "epoch": 0.8193489504107089, + "grad_norm": 0.6460596323013306, + "learning_rate": 1.3160582768819534e-05, + "loss": 0.4696, + "step": 6733 + }, + { + "epoch": 0.8194706419227259, + "grad_norm": 1.1410826444625854, + "learning_rate": 1.3158732007725856e-05, + "loss": 0.4508, + "step": 6734 + }, + { + "epoch": 0.8195923334347429, + "grad_norm": 0.6852778196334839, + "learning_rate": 1.3156881126435885e-05, + "loss": 0.4372, + "step": 6735 + }, + { + "epoch": 0.8197140249467599, + "grad_norm": 4.219789028167725, + "learning_rate": 1.3155030125020051e-05, + "loss": 0.5393, + "step": 6736 + }, + { + "epoch": 0.819835716458777, + "grad_norm": 3.642214775085449, + "learning_rate": 1.3153179003548794e-05, + "loss": 0.4818, + "step": 6737 + }, + { + "epoch": 0.8199574079707941, + "grad_norm": 2.299353837966919, + "learning_rate": 1.3151327762092549e-05, + "loss": 0.4565, + "step": 6738 + }, + { + "epoch": 0.8200790994828111, + "grad_norm": 0.5892159342765808, + "learning_rate": 1.314947640072176e-05, + "loss": 0.3876, + "step": 6739 + }, + { + "epoch": 0.8202007909948281, + "grad_norm": 0.6633529663085938, + "learning_rate": 1.3147624919506872e-05, + "loss": 0.385, + "step": 6740 + }, + { + "epoch": 0.8203224825068451, + "grad_norm": 5.231131076812744, + "learning_rate": 1.3145773318518342e-05, + "loss": 0.5464, + "step": 6741 + }, + { + "epoch": 0.8204441740188622, + "grad_norm": 1.2580164670944214, + "learning_rate": 1.3143921597826625e-05, + "loss": 0.432, + "step": 6742 + }, + { + "epoch": 0.8205658655308792, + "grad_norm": 0.8528087735176086, + "learning_rate": 1.3142069757502187e-05, + "loss": 0.432, + "step": 6743 + }, + { + "epoch": 0.8206875570428963, + "grad_norm": 0.9162859916687012, + "learning_rate": 1.3140217797615486e-05, + "loss": 0.4255, + "step": 6744 + }, + { + "epoch": 0.8208092485549133, + "grad_norm": 1.9959614276885986, + "learning_rate": 1.3138365718237002e-05, + "loss": 0.4994, + "step": 6745 + }, + { + "epoch": 0.8209309400669303, + "grad_norm": 1.2443194389343262, + "learning_rate": 1.3136513519437204e-05, + "loss": 0.4776, + "step": 6746 + }, + { + "epoch": 0.8210526315789474, + "grad_norm": 1.8186997175216675, + "learning_rate": 1.3134661201286573e-05, + "loss": 0.5325, + "step": 6747 + }, + { + "epoch": 0.8211743230909644, + "grad_norm": 1.274896502494812, + "learning_rate": 1.31328087638556e-05, + "loss": 0.4833, + "step": 6748 + }, + { + "epoch": 0.8212960146029814, + "grad_norm": 2.247789144515991, + "learning_rate": 1.3130956207214767e-05, + "loss": 0.4409, + "step": 6749 + }, + { + "epoch": 0.8214177061149984, + "grad_norm": 4.778298377990723, + "learning_rate": 1.3129103531434569e-05, + "loss": 0.3931, + "step": 6750 + }, + { + "epoch": 0.8215393976270156, + "grad_norm": 1.2628536224365234, + "learning_rate": 1.3127250736585504e-05, + "loss": 0.4313, + "step": 6751 + }, + { + "epoch": 0.8216610891390326, + "grad_norm": 0.8084657788276672, + "learning_rate": 1.3125397822738075e-05, + "loss": 0.4557, + "step": 6752 + }, + { + "epoch": 0.8217827806510496, + "grad_norm": 1.6310756206512451, + "learning_rate": 1.312354478996279e-05, + "loss": 0.4337, + "step": 6753 + }, + { + "epoch": 0.8219044721630666, + "grad_norm": 0.6733612418174744, + "learning_rate": 1.312169163833016e-05, + "loss": 0.4298, + "step": 6754 + }, + { + "epoch": 0.8220261636750836, + "grad_norm": 0.6573811173439026, + "learning_rate": 1.3119838367910704e-05, + "loss": 0.46, + "step": 6755 + }, + { + "epoch": 0.8221478551871007, + "grad_norm": 0.8170048594474792, + "learning_rate": 1.3117984978774941e-05, + "loss": 0.49, + "step": 6756 + }, + { + "epoch": 0.8222695466991178, + "grad_norm": 4.894285678863525, + "learning_rate": 1.3116131470993391e-05, + "loss": 0.5681, + "step": 6757 + }, + { + "epoch": 0.8223912382111348, + "grad_norm": 0.9377179741859436, + "learning_rate": 1.3114277844636592e-05, + "loss": 0.4883, + "step": 6758 + }, + { + "epoch": 0.8225129297231518, + "grad_norm": 2.1831438541412354, + "learning_rate": 1.3112424099775071e-05, + "loss": 0.4006, + "step": 6759 + }, + { + "epoch": 0.8226346212351688, + "grad_norm": 2.9049322605133057, + "learning_rate": 1.3110570236479374e-05, + "loss": 0.5278, + "step": 6760 + }, + { + "epoch": 0.8227563127471859, + "grad_norm": 1.5541597604751587, + "learning_rate": 1.3108716254820043e-05, + "loss": 0.429, + "step": 6761 + }, + { + "epoch": 0.8228780042592029, + "grad_norm": 2.308603048324585, + "learning_rate": 1.3106862154867619e-05, + "loss": 0.4838, + "step": 6762 + }, + { + "epoch": 0.82299969577122, + "grad_norm": 3.5870141983032227, + "learning_rate": 1.3105007936692662e-05, + "loss": 0.5245, + "step": 6763 + }, + { + "epoch": 0.823121387283237, + "grad_norm": 0.8772350549697876, + "learning_rate": 1.3103153600365726e-05, + "loss": 0.398, + "step": 6764 + }, + { + "epoch": 0.823243078795254, + "grad_norm": 1.807956337928772, + "learning_rate": 1.3101299145957372e-05, + "loss": 0.4522, + "step": 6765 + }, + { + "epoch": 0.8233647703072711, + "grad_norm": 1.533187747001648, + "learning_rate": 1.3099444573538164e-05, + "loss": 0.5183, + "step": 6766 + }, + { + "epoch": 0.8234864618192881, + "grad_norm": 0.8093053102493286, + "learning_rate": 1.3097589883178677e-05, + "loss": 0.4431, + "step": 6767 + }, + { + "epoch": 0.8236081533313051, + "grad_norm": 0.859160840511322, + "learning_rate": 1.3095735074949485e-05, + "loss": 0.4724, + "step": 6768 + }, + { + "epoch": 0.8237298448433221, + "grad_norm": 2.8490512371063232, + "learning_rate": 1.3093880148921162e-05, + "loss": 0.4371, + "step": 6769 + }, + { + "epoch": 0.8238515363553393, + "grad_norm": 1.9302666187286377, + "learning_rate": 1.3092025105164297e-05, + "loss": 0.4655, + "step": 6770 + }, + { + "epoch": 0.8239732278673563, + "grad_norm": 1.8279956579208374, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.4513, + "step": 6771 + }, + { + "epoch": 0.8240949193793733, + "grad_norm": 0.9794836044311523, + "learning_rate": 1.3088314664747293e-05, + "loss": 0.4507, + "step": 6772 + }, + { + "epoch": 0.8242166108913903, + "grad_norm": 1.7553751468658447, + "learning_rate": 1.3086459268228345e-05, + "loss": 0.3991, + "step": 6773 + }, + { + "epoch": 0.8243383024034073, + "grad_norm": 2.9451494216918945, + "learning_rate": 1.308460375426323e-05, + "loss": 0.5058, + "step": 6774 + }, + { + "epoch": 0.8244599939154243, + "grad_norm": 1.7878193855285645, + "learning_rate": 1.3082748122922562e-05, + "loss": 0.4512, + "step": 6775 + }, + { + "epoch": 0.8245816854274415, + "grad_norm": 2.5065412521362305, + "learning_rate": 1.3080892374276943e-05, + "loss": 0.4822, + "step": 6776 + }, + { + "epoch": 0.8247033769394585, + "grad_norm": 0.6355451345443726, + "learning_rate": 1.3079036508396991e-05, + "loss": 0.4485, + "step": 6777 + }, + { + "epoch": 0.8248250684514755, + "grad_norm": 2.876383066177368, + "learning_rate": 1.3077180525353332e-05, + "loss": 0.3951, + "step": 6778 + }, + { + "epoch": 0.8249467599634925, + "grad_norm": 2.601952314376831, + "learning_rate": 1.3075324425216583e-05, + "loss": 0.5184, + "step": 6779 + }, + { + "epoch": 0.8250684514755096, + "grad_norm": 1.2750346660614014, + "learning_rate": 1.3073468208057372e-05, + "loss": 0.4566, + "step": 6780 + }, + { + "epoch": 0.8251901429875266, + "grad_norm": 2.2961859703063965, + "learning_rate": 1.3071611873946335e-05, + "loss": 0.4345, + "step": 6781 + }, + { + "epoch": 0.8253118344995437, + "grad_norm": 0.566150426864624, + "learning_rate": 1.306975542295411e-05, + "loss": 0.4692, + "step": 6782 + }, + { + "epoch": 0.8254335260115607, + "grad_norm": 2.2687556743621826, + "learning_rate": 1.3067898855151333e-05, + "loss": 0.4322, + "step": 6783 + }, + { + "epoch": 0.8255552175235777, + "grad_norm": 0.7312899231910706, + "learning_rate": 1.3066042170608658e-05, + "loss": 0.4089, + "step": 6784 + }, + { + "epoch": 0.8256769090355948, + "grad_norm": 0.7542760968208313, + "learning_rate": 1.306418536939673e-05, + "loss": 0.441, + "step": 6785 + }, + { + "epoch": 0.8257986005476118, + "grad_norm": 1.011281967163086, + "learning_rate": 1.3062328451586209e-05, + "loss": 0.4073, + "step": 6786 + }, + { + "epoch": 0.8259202920596288, + "grad_norm": 3.765763759613037, + "learning_rate": 1.3060471417247746e-05, + "loss": 0.5107, + "step": 6787 + }, + { + "epoch": 0.8260419835716459, + "grad_norm": 0.9791185259819031, + "learning_rate": 1.3058614266452014e-05, + "loss": 0.4392, + "step": 6788 + }, + { + "epoch": 0.826163675083663, + "grad_norm": 3.819981813430786, + "learning_rate": 1.3056756999269679e-05, + "loss": 0.4919, + "step": 6789 + }, + { + "epoch": 0.82628536659568, + "grad_norm": 0.8946986794471741, + "learning_rate": 1.3054899615771414e-05, + "loss": 0.4398, + "step": 6790 + }, + { + "epoch": 0.826407058107697, + "grad_norm": 0.6221190094947815, + "learning_rate": 1.3053042116027895e-05, + "loss": 0.4638, + "step": 6791 + }, + { + "epoch": 0.826528749619714, + "grad_norm": 1.3485747575759888, + "learning_rate": 1.3051184500109801e-05, + "loss": 0.4613, + "step": 6792 + }, + { + "epoch": 0.826650441131731, + "grad_norm": 1.647303581237793, + "learning_rate": 1.3049326768087821e-05, + "loss": 0.4878, + "step": 6793 + }, + { + "epoch": 0.826772132643748, + "grad_norm": 1.5896073579788208, + "learning_rate": 1.3047468920032651e-05, + "loss": 0.4391, + "step": 6794 + }, + { + "epoch": 0.8268938241557652, + "grad_norm": 1.961309552192688, + "learning_rate": 1.3045610956014978e-05, + "loss": 0.4564, + "step": 6795 + }, + { + "epoch": 0.8270155156677822, + "grad_norm": 2.0452980995178223, + "learning_rate": 1.3043752876105504e-05, + "loss": 0.4475, + "step": 6796 + }, + { + "epoch": 0.8271372071797992, + "grad_norm": 0.641858696937561, + "learning_rate": 1.3041894680374932e-05, + "loss": 0.474, + "step": 6797 + }, + { + "epoch": 0.8272588986918162, + "grad_norm": 0.9289491176605225, + "learning_rate": 1.304003636889397e-05, + "loss": 0.4717, + "step": 6798 + }, + { + "epoch": 0.8273805902038333, + "grad_norm": 1.0018569231033325, + "learning_rate": 1.3038177941733333e-05, + "loss": 0.4869, + "step": 6799 + }, + { + "epoch": 0.8275022817158503, + "grad_norm": 0.9068577289581299, + "learning_rate": 1.3036319398963737e-05, + "loss": 0.4649, + "step": 6800 + }, + { + "epoch": 0.8276239732278674, + "grad_norm": 1.6282252073287964, + "learning_rate": 1.3034460740655903e-05, + "loss": 0.481, + "step": 6801 + }, + { + "epoch": 0.8277456647398844, + "grad_norm": 1.6111154556274414, + "learning_rate": 1.3032601966880558e-05, + "loss": 0.4279, + "step": 6802 + }, + { + "epoch": 0.8278673562519014, + "grad_norm": 2.688147783279419, + "learning_rate": 1.303074307770843e-05, + "loss": 0.4764, + "step": 6803 + }, + { + "epoch": 0.8279890477639185, + "grad_norm": 2.1374621391296387, + "learning_rate": 1.3028884073210253e-05, + "loss": 0.5043, + "step": 6804 + }, + { + "epoch": 0.8281107392759355, + "grad_norm": 1.8053512573242188, + "learning_rate": 1.302702495345677e-05, + "loss": 0.4679, + "step": 6805 + }, + { + "epoch": 0.8282324307879525, + "grad_norm": 0.7423006892204285, + "learning_rate": 1.302516571851872e-05, + "loss": 0.4426, + "step": 6806 + }, + { + "epoch": 0.8283541222999696, + "grad_norm": 1.0323774814605713, + "learning_rate": 1.3023306368466853e-05, + "loss": 0.4747, + "step": 6807 + }, + { + "epoch": 0.8284758138119866, + "grad_norm": 2.339226484298706, + "learning_rate": 1.3021446903371922e-05, + "loss": 0.4342, + "step": 6808 + }, + { + "epoch": 0.8285975053240037, + "grad_norm": 1.1705610752105713, + "learning_rate": 1.301958732330468e-05, + "loss": 0.4637, + "step": 6809 + }, + { + "epoch": 0.8287191968360207, + "grad_norm": 0.7676281929016113, + "learning_rate": 1.3017727628335892e-05, + "loss": 0.4896, + "step": 6810 + }, + { + "epoch": 0.8288408883480377, + "grad_norm": 2.7097115516662598, + "learning_rate": 1.3015867818536321e-05, + "loss": 0.4419, + "step": 6811 + }, + { + "epoch": 0.8289625798600547, + "grad_norm": 1.556746482849121, + "learning_rate": 1.3014007893976737e-05, + "loss": 0.5079, + "step": 6812 + }, + { + "epoch": 0.8290842713720717, + "grad_norm": 0.8942267894744873, + "learning_rate": 1.3012147854727917e-05, + "loss": 0.433, + "step": 6813 + }, + { + "epoch": 0.8292059628840889, + "grad_norm": 1.9167677164077759, + "learning_rate": 1.3010287700860632e-05, + "loss": 0.4215, + "step": 6814 + }, + { + "epoch": 0.8293276543961059, + "grad_norm": 1.3114110231399536, + "learning_rate": 1.3008427432445667e-05, + "loss": 0.4681, + "step": 6815 + }, + { + "epoch": 0.8294493459081229, + "grad_norm": 1.196704387664795, + "learning_rate": 1.3006567049553818e-05, + "loss": 0.456, + "step": 6816 + }, + { + "epoch": 0.8295710374201399, + "grad_norm": 1.963807225227356, + "learning_rate": 1.3004706552255863e-05, + "loss": 0.4448, + "step": 6817 + }, + { + "epoch": 0.829692728932157, + "grad_norm": 3.58132266998291, + "learning_rate": 1.3002845940622609e-05, + "loss": 0.5033, + "step": 6818 + }, + { + "epoch": 0.829814420444174, + "grad_norm": 1.2175681591033936, + "learning_rate": 1.3000985214724848e-05, + "loss": 0.441, + "step": 6819 + }, + { + "epoch": 0.8299361119561911, + "grad_norm": 0.669368326663971, + "learning_rate": 1.2999124374633392e-05, + "loss": 0.4174, + "step": 6820 + }, + { + "epoch": 0.8300578034682081, + "grad_norm": 4.004969596862793, + "learning_rate": 1.2997263420419042e-05, + "loss": 0.503, + "step": 6821 + }, + { + "epoch": 0.8301794949802251, + "grad_norm": 3.592200994491577, + "learning_rate": 1.2995402352152614e-05, + "loss": 0.5281, + "step": 6822 + }, + { + "epoch": 0.8303011864922422, + "grad_norm": 1.0471320152282715, + "learning_rate": 1.2993541169904928e-05, + "loss": 0.4788, + "step": 6823 + }, + { + "epoch": 0.8304228780042592, + "grad_norm": 4.47477912902832, + "learning_rate": 1.299167987374681e-05, + "loss": 0.5547, + "step": 6824 + }, + { + "epoch": 0.8305445695162762, + "grad_norm": 1.6746526956558228, + "learning_rate": 1.2989818463749073e-05, + "loss": 0.4444, + "step": 6825 + }, + { + "epoch": 0.8306662610282933, + "grad_norm": 0.6307390332221985, + "learning_rate": 1.2987956939982556e-05, + "loss": 0.487, + "step": 6826 + }, + { + "epoch": 0.8307879525403103, + "grad_norm": 2.1108851432800293, + "learning_rate": 1.2986095302518092e-05, + "loss": 0.4754, + "step": 6827 + }, + { + "epoch": 0.8309096440523274, + "grad_norm": 3.383765459060669, + "learning_rate": 1.2984233551426525e-05, + "loss": 0.4277, + "step": 6828 + }, + { + "epoch": 0.8310313355643444, + "grad_norm": 3.48599910736084, + "learning_rate": 1.298237168677869e-05, + "loss": 0.4248, + "step": 6829 + }, + { + "epoch": 0.8311530270763614, + "grad_norm": 1.4197397232055664, + "learning_rate": 1.2980509708645441e-05, + "loss": 0.4709, + "step": 6830 + }, + { + "epoch": 0.8312747185883784, + "grad_norm": 0.9290311336517334, + "learning_rate": 1.2978647617097629e-05, + "loss": 0.4969, + "step": 6831 + }, + { + "epoch": 0.8313964101003954, + "grad_norm": 1.0463742017745972, + "learning_rate": 1.297678541220611e-05, + "loss": 0.4636, + "step": 6832 + }, + { + "epoch": 0.8315181016124126, + "grad_norm": 0.6120090484619141, + "learning_rate": 1.2974923094041741e-05, + "loss": 0.431, + "step": 6833 + }, + { + "epoch": 0.8316397931244296, + "grad_norm": 1.3695932626724243, + "learning_rate": 1.2973060662675391e-05, + "loss": 0.4979, + "step": 6834 + }, + { + "epoch": 0.8317614846364466, + "grad_norm": 1.5041674375534058, + "learning_rate": 1.2971198118177932e-05, + "loss": 0.4218, + "step": 6835 + }, + { + "epoch": 0.8318831761484636, + "grad_norm": 1.0865904092788696, + "learning_rate": 1.2969335460620236e-05, + "loss": 0.4389, + "step": 6836 + }, + { + "epoch": 0.8320048676604807, + "grad_norm": 1.5804051160812378, + "learning_rate": 1.2967472690073176e-05, + "loss": 0.4821, + "step": 6837 + }, + { + "epoch": 0.8321265591724977, + "grad_norm": 0.8305590152740479, + "learning_rate": 1.2965609806607637e-05, + "loss": 0.4192, + "step": 6838 + }, + { + "epoch": 0.8322482506845148, + "grad_norm": 0.9001047015190125, + "learning_rate": 1.296374681029451e-05, + "loss": 0.4589, + "step": 6839 + }, + { + "epoch": 0.8323699421965318, + "grad_norm": 1.0995434522628784, + "learning_rate": 1.296188370120468e-05, + "loss": 0.4638, + "step": 6840 + }, + { + "epoch": 0.8324916337085488, + "grad_norm": 2.307422161102295, + "learning_rate": 1.2960020479409043e-05, + "loss": 0.488, + "step": 6841 + }, + { + "epoch": 0.8326133252205659, + "grad_norm": 1.6505639553070068, + "learning_rate": 1.2958157144978503e-05, + "loss": 0.46, + "step": 6842 + }, + { + "epoch": 0.8327350167325829, + "grad_norm": 1.4252490997314453, + "learning_rate": 1.2956293697983959e-05, + "loss": 0.4589, + "step": 6843 + }, + { + "epoch": 0.8328567082445999, + "grad_norm": 1.731046438217163, + "learning_rate": 1.295443013849632e-05, + "loss": 0.4576, + "step": 6844 + }, + { + "epoch": 0.832978399756617, + "grad_norm": 3.6126105785369873, + "learning_rate": 1.29525664665865e-05, + "loss": 0.3881, + "step": 6845 + }, + { + "epoch": 0.833100091268634, + "grad_norm": 0.7489259243011475, + "learning_rate": 1.2950702682325415e-05, + "loss": 0.471, + "step": 6846 + }, + { + "epoch": 0.8332217827806511, + "grad_norm": 0.6991187334060669, + "learning_rate": 1.2948838785783986e-05, + "loss": 0.4702, + "step": 6847 + }, + { + "epoch": 0.8333434742926681, + "grad_norm": 1.273205041885376, + "learning_rate": 1.2946974777033135e-05, + "loss": 0.484, + "step": 6848 + }, + { + "epoch": 0.8334651658046851, + "grad_norm": 0.6542345285415649, + "learning_rate": 1.2945110656143793e-05, + "loss": 0.4822, + "step": 6849 + }, + { + "epoch": 0.8335868573167021, + "grad_norm": 1.059162974357605, + "learning_rate": 1.2943246423186897e-05, + "loss": 0.478, + "step": 6850 + }, + { + "epoch": 0.8337085488287191, + "grad_norm": 0.596342146396637, + "learning_rate": 1.294138207823338e-05, + "loss": 0.4319, + "step": 6851 + }, + { + "epoch": 0.8338302403407363, + "grad_norm": 3.7490267753601074, + "learning_rate": 1.2939517621354187e-05, + "loss": 0.3969, + "step": 6852 + }, + { + "epoch": 0.8339519318527533, + "grad_norm": 1.5371110439300537, + "learning_rate": 1.2937653052620266e-05, + "loss": 0.451, + "step": 6853 + }, + { + "epoch": 0.8340736233647703, + "grad_norm": 0.8665494918823242, + "learning_rate": 1.2935788372102566e-05, + "loss": 0.4436, + "step": 6854 + }, + { + "epoch": 0.8341953148767873, + "grad_norm": 1.6559678316116333, + "learning_rate": 1.2933923579872042e-05, + "loss": 0.42, + "step": 6855 + }, + { + "epoch": 0.8343170063888043, + "grad_norm": 2.324580669403076, + "learning_rate": 1.2932058675999651e-05, + "loss": 0.5133, + "step": 6856 + }, + { + "epoch": 0.8344386979008214, + "grad_norm": 1.1224908828735352, + "learning_rate": 1.2930193660556356e-05, + "loss": 0.4425, + "step": 6857 + }, + { + "epoch": 0.8345603894128385, + "grad_norm": 0.6706041097640991, + "learning_rate": 1.2928328533613135e-05, + "loss": 0.472, + "step": 6858 + }, + { + "epoch": 0.8346820809248555, + "grad_norm": 1.2882907390594482, + "learning_rate": 1.2926463295240945e-05, + "loss": 0.4124, + "step": 6859 + }, + { + "epoch": 0.8348037724368725, + "grad_norm": 1.9551949501037598, + "learning_rate": 1.2924597945510771e-05, + "loss": 0.4866, + "step": 6860 + }, + { + "epoch": 0.8349254639488896, + "grad_norm": 0.8849619626998901, + "learning_rate": 1.292273248449359e-05, + "loss": 0.4067, + "step": 6861 + }, + { + "epoch": 0.8350471554609066, + "grad_norm": 0.9172243475914001, + "learning_rate": 1.292086691226039e-05, + "loss": 0.4562, + "step": 6862 + }, + { + "epoch": 0.8351688469729236, + "grad_norm": 1.9458006620407104, + "learning_rate": 1.2919001228882157e-05, + "loss": 0.4622, + "step": 6863 + }, + { + "epoch": 0.8352905384849407, + "grad_norm": 3.4780547618865967, + "learning_rate": 1.2917135434429888e-05, + "loss": 0.5445, + "step": 6864 + }, + { + "epoch": 0.8354122299969577, + "grad_norm": 1.3642549514770508, + "learning_rate": 1.2915269528974576e-05, + "loss": 0.4394, + "step": 6865 + }, + { + "epoch": 0.8355339215089748, + "grad_norm": 0.6495612263679504, + "learning_rate": 1.2913403512587227e-05, + "loss": 0.4251, + "step": 6866 + }, + { + "epoch": 0.8356556130209918, + "grad_norm": 0.8254455327987671, + "learning_rate": 1.2911537385338836e-05, + "loss": 0.5089, + "step": 6867 + }, + { + "epoch": 0.8357773045330088, + "grad_norm": 2.1402394771575928, + "learning_rate": 1.2909671147300427e-05, + "loss": 0.5193, + "step": 6868 + }, + { + "epoch": 0.8358989960450258, + "grad_norm": 1.8830991983413696, + "learning_rate": 1.2907804798543006e-05, + "loss": 0.4631, + "step": 6869 + }, + { + "epoch": 0.8360206875570428, + "grad_norm": 3.3242721557617188, + "learning_rate": 1.2905938339137598e-05, + "loss": 0.4568, + "step": 6870 + }, + { + "epoch": 0.83614237906906, + "grad_norm": 1.459088921546936, + "learning_rate": 1.2904071769155215e-05, + "loss": 0.5031, + "step": 6871 + }, + { + "epoch": 0.836264070581077, + "grad_norm": 2.3964123725891113, + "learning_rate": 1.2902205088666894e-05, + "loss": 0.4945, + "step": 6872 + }, + { + "epoch": 0.836385762093094, + "grad_norm": 1.7578063011169434, + "learning_rate": 1.2900338297743661e-05, + "loss": 0.4726, + "step": 6873 + }, + { + "epoch": 0.836507453605111, + "grad_norm": 1.5480945110321045, + "learning_rate": 1.2898471396456554e-05, + "loss": 0.4515, + "step": 6874 + }, + { + "epoch": 0.836629145117128, + "grad_norm": 2.650372266769409, + "learning_rate": 1.2896604384876608e-05, + "loss": 0.4032, + "step": 6875 + }, + { + "epoch": 0.8367508366291451, + "grad_norm": 2.133085012435913, + "learning_rate": 1.2894737263074872e-05, + "loss": 0.5347, + "step": 6876 + }, + { + "epoch": 0.8368725281411622, + "grad_norm": 3.180300712585449, + "learning_rate": 1.289287003112239e-05, + "loss": 0.5483, + "step": 6877 + }, + { + "epoch": 0.8369942196531792, + "grad_norm": 1.7303646802902222, + "learning_rate": 1.2891002689090215e-05, + "loss": 0.4755, + "step": 6878 + }, + { + "epoch": 0.8371159111651962, + "grad_norm": 2.27557110786438, + "learning_rate": 1.2889135237049405e-05, + "loss": 0.4871, + "step": 6879 + }, + { + "epoch": 0.8372376026772133, + "grad_norm": 0.836880087852478, + "learning_rate": 1.2887267675071018e-05, + "loss": 0.4039, + "step": 6880 + }, + { + "epoch": 0.8373592941892303, + "grad_norm": 3.1311147212982178, + "learning_rate": 1.2885400003226118e-05, + "loss": 0.5287, + "step": 6881 + }, + { + "epoch": 0.8374809857012473, + "grad_norm": 1.0516445636749268, + "learning_rate": 1.288353222158578e-05, + "loss": 0.4073, + "step": 6882 + }, + { + "epoch": 0.8376026772132644, + "grad_norm": 0.9085931181907654, + "learning_rate": 1.2881664330221069e-05, + "loss": 0.4554, + "step": 6883 + }, + { + "epoch": 0.8377243687252814, + "grad_norm": 0.946977972984314, + "learning_rate": 1.2879796329203067e-05, + "loss": 0.4458, + "step": 6884 + }, + { + "epoch": 0.8378460602372985, + "grad_norm": 1.1708645820617676, + "learning_rate": 1.2877928218602853e-05, + "loss": 0.4093, + "step": 6885 + }, + { + "epoch": 0.8379677517493155, + "grad_norm": 2.924179792404175, + "learning_rate": 1.2876059998491513e-05, + "loss": 0.4055, + "step": 6886 + }, + { + "epoch": 0.8380894432613325, + "grad_norm": 0.7647840976715088, + "learning_rate": 1.2874191668940136e-05, + "loss": 0.4459, + "step": 6887 + }, + { + "epoch": 0.8382111347733495, + "grad_norm": 2.0494229793548584, + "learning_rate": 1.2872323230019822e-05, + "loss": 0.4948, + "step": 6888 + }, + { + "epoch": 0.8383328262853667, + "grad_norm": 0.9894136190414429, + "learning_rate": 1.2870454681801658e-05, + "loss": 0.4954, + "step": 6889 + }, + { + "epoch": 0.8384545177973837, + "grad_norm": 1.4254064559936523, + "learning_rate": 1.2868586024356753e-05, + "loss": 0.4589, + "step": 6890 + }, + { + "epoch": 0.8385762093094007, + "grad_norm": 2.4123616218566895, + "learning_rate": 1.2866717257756212e-05, + "loss": 0.4195, + "step": 6891 + }, + { + "epoch": 0.8386979008214177, + "grad_norm": 1.3306949138641357, + "learning_rate": 1.2864848382071147e-05, + "loss": 0.4329, + "step": 6892 + }, + { + "epoch": 0.8388195923334347, + "grad_norm": 0.8937929272651672, + "learning_rate": 1.286297939737267e-05, + "loss": 0.4117, + "step": 6893 + }, + { + "epoch": 0.8389412838454517, + "grad_norm": 1.6041615009307861, + "learning_rate": 1.2861110303731904e-05, + "loss": 0.4808, + "step": 6894 + }, + { + "epoch": 0.8390629753574688, + "grad_norm": 0.73006671667099, + "learning_rate": 1.2859241101219964e-05, + "loss": 0.4072, + "step": 6895 + }, + { + "epoch": 0.8391846668694859, + "grad_norm": 2.1428239345550537, + "learning_rate": 1.2857371789907984e-05, + "loss": 0.5044, + "step": 6896 + }, + { + "epoch": 0.8393063583815029, + "grad_norm": 1.0156277418136597, + "learning_rate": 1.2855502369867092e-05, + "loss": 0.4348, + "step": 6897 + }, + { + "epoch": 0.8394280498935199, + "grad_norm": 0.6417730450630188, + "learning_rate": 1.2853632841168424e-05, + "loss": 0.4338, + "step": 6898 + }, + { + "epoch": 0.839549741405537, + "grad_norm": 0.69439697265625, + "learning_rate": 1.2851763203883122e-05, + "loss": 0.4667, + "step": 6899 + }, + { + "epoch": 0.839671432917554, + "grad_norm": 2.310706377029419, + "learning_rate": 1.2849893458082328e-05, + "loss": 0.4788, + "step": 6900 + }, + { + "epoch": 0.839793124429571, + "grad_norm": 0.5879369378089905, + "learning_rate": 1.2848023603837185e-05, + "loss": 0.4435, + "step": 6901 + }, + { + "epoch": 0.8399148159415881, + "grad_norm": 0.8880062699317932, + "learning_rate": 1.2846153641218851e-05, + "loss": 0.467, + "step": 6902 + }, + { + "epoch": 0.8400365074536051, + "grad_norm": 2.1720924377441406, + "learning_rate": 1.2844283570298481e-05, + "loss": 0.4848, + "step": 6903 + }, + { + "epoch": 0.8401581989656222, + "grad_norm": 2.6391782760620117, + "learning_rate": 1.284241339114723e-05, + "loss": 0.4548, + "step": 6904 + }, + { + "epoch": 0.8402798904776392, + "grad_norm": 3.1797895431518555, + "learning_rate": 1.2840543103836271e-05, + "loss": 0.4227, + "step": 6905 + }, + { + "epoch": 0.8404015819896562, + "grad_norm": 2.2403340339660645, + "learning_rate": 1.2838672708436764e-05, + "loss": 0.4866, + "step": 6906 + }, + { + "epoch": 0.8405232735016732, + "grad_norm": 1.0137779712677002, + "learning_rate": 1.2836802205019887e-05, + "loss": 0.4771, + "step": 6907 + }, + { + "epoch": 0.8406449650136903, + "grad_norm": 2.729987382888794, + "learning_rate": 1.2834931593656812e-05, + "loss": 0.459, + "step": 6908 + }, + { + "epoch": 0.8407666565257074, + "grad_norm": 4.043386936187744, + "learning_rate": 1.2833060874418722e-05, + "loss": 0.4533, + "step": 6909 + }, + { + "epoch": 0.8408883480377244, + "grad_norm": 1.1356867551803589, + "learning_rate": 1.2831190047376802e-05, + "loss": 0.4259, + "step": 6910 + }, + { + "epoch": 0.8410100395497414, + "grad_norm": 0.9805501699447632, + "learning_rate": 1.282931911260224e-05, + "loss": 0.4577, + "step": 6911 + }, + { + "epoch": 0.8411317310617584, + "grad_norm": 1.4176305532455444, + "learning_rate": 1.282744807016623e-05, + "loss": 0.4371, + "step": 6912 + }, + { + "epoch": 0.8412534225737754, + "grad_norm": 0.6131776571273804, + "learning_rate": 1.2825576920139969e-05, + "loss": 0.4187, + "step": 6913 + }, + { + "epoch": 0.8413751140857925, + "grad_norm": 2.1293554306030273, + "learning_rate": 1.2823705662594658e-05, + "loss": 0.4521, + "step": 6914 + }, + { + "epoch": 0.8414968055978096, + "grad_norm": 4.38294792175293, + "learning_rate": 1.2821834297601498e-05, + "loss": 0.5177, + "step": 6915 + }, + { + "epoch": 0.8416184971098266, + "grad_norm": 2.352923631668091, + "learning_rate": 1.2819962825231707e-05, + "loss": 0.4575, + "step": 6916 + }, + { + "epoch": 0.8417401886218436, + "grad_norm": 0.7982110977172852, + "learning_rate": 1.281809124555649e-05, + "loss": 0.4086, + "step": 6917 + }, + { + "epoch": 0.8418618801338607, + "grad_norm": 3.7458646297454834, + "learning_rate": 1.2816219558647072e-05, + "loss": 0.5379, + "step": 6918 + }, + { + "epoch": 0.8419835716458777, + "grad_norm": 0.6902008056640625, + "learning_rate": 1.2814347764574666e-05, + "loss": 0.4685, + "step": 6919 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 4.48379373550415, + "learning_rate": 1.2812475863410505e-05, + "loss": 0.5657, + "step": 6920 + }, + { + "epoch": 0.8422269546699118, + "grad_norm": 2.713804006576538, + "learning_rate": 1.2810603855225811e-05, + "loss": 0.4333, + "step": 6921 + }, + { + "epoch": 0.8423486461819288, + "grad_norm": 1.6324368715286255, + "learning_rate": 1.2808731740091834e-05, + "loss": 0.4431, + "step": 6922 + }, + { + "epoch": 0.8424703376939459, + "grad_norm": 0.8463773131370544, + "learning_rate": 1.280685951807979e-05, + "loss": 0.4789, + "step": 6923 + }, + { + "epoch": 0.8425920292059629, + "grad_norm": 1.5698280334472656, + "learning_rate": 1.2804987189260933e-05, + "loss": 0.4959, + "step": 6924 + }, + { + "epoch": 0.8427137207179799, + "grad_norm": 4.177562713623047, + "learning_rate": 1.280311475370651e-05, + "loss": 0.3915, + "step": 6925 + }, + { + "epoch": 0.8428354122299969, + "grad_norm": 2.055678606033325, + "learning_rate": 1.2801242211487768e-05, + "loss": 0.4247, + "step": 6926 + }, + { + "epoch": 0.842957103742014, + "grad_norm": 2.990649938583374, + "learning_rate": 1.279936956267596e-05, + "loss": 0.4974, + "step": 6927 + }, + { + "epoch": 0.8430787952540311, + "grad_norm": 1.3391287326812744, + "learning_rate": 1.2797496807342349e-05, + "loss": 0.4509, + "step": 6928 + }, + { + "epoch": 0.8432004867660481, + "grad_norm": 1.2605738639831543, + "learning_rate": 1.2795623945558191e-05, + "loss": 0.4482, + "step": 6929 + }, + { + "epoch": 0.8433221782780651, + "grad_norm": 0.6613712906837463, + "learning_rate": 1.2793750977394756e-05, + "loss": 0.4305, + "step": 6930 + }, + { + "epoch": 0.8434438697900821, + "grad_norm": 1.0457223653793335, + "learning_rate": 1.2791877902923312e-05, + "loss": 0.4542, + "step": 6931 + }, + { + "epoch": 0.8435655613020991, + "grad_norm": 1.8725061416625977, + "learning_rate": 1.2790004722215137e-05, + "loss": 0.4481, + "step": 6932 + }, + { + "epoch": 0.8436872528141162, + "grad_norm": 3.1485283374786377, + "learning_rate": 1.278813143534151e-05, + "loss": 0.4629, + "step": 6933 + }, + { + "epoch": 0.8438089443261333, + "grad_norm": 1.9086508750915527, + "learning_rate": 1.2786258042373707e-05, + "loss": 0.4748, + "step": 6934 + }, + { + "epoch": 0.8439306358381503, + "grad_norm": 5.065165996551514, + "learning_rate": 1.2784384543383017e-05, + "loss": 0.5661, + "step": 6935 + }, + { + "epoch": 0.8440523273501673, + "grad_norm": 0.6957314610481262, + "learning_rate": 1.2782510938440735e-05, + "loss": 0.4371, + "step": 6936 + }, + { + "epoch": 0.8441740188621843, + "grad_norm": 1.8577358722686768, + "learning_rate": 1.2780637227618152e-05, + "loss": 0.5035, + "step": 6937 + }, + { + "epoch": 0.8442957103742014, + "grad_norm": 0.8081720471382141, + "learning_rate": 1.2778763410986565e-05, + "loss": 0.4894, + "step": 6938 + }, + { + "epoch": 0.8444174018862184, + "grad_norm": 2.4398961067199707, + "learning_rate": 1.2776889488617279e-05, + "loss": 0.4733, + "step": 6939 + }, + { + "epoch": 0.8445390933982355, + "grad_norm": 3.0027964115142822, + "learning_rate": 1.2775015460581602e-05, + "loss": 0.4543, + "step": 6940 + }, + { + "epoch": 0.8446607849102525, + "grad_norm": 0.6925974488258362, + "learning_rate": 1.2773141326950842e-05, + "loss": 0.543, + "step": 6941 + }, + { + "epoch": 0.8447824764222696, + "grad_norm": 1.5968799591064453, + "learning_rate": 1.2771267087796312e-05, + "loss": 0.4453, + "step": 6942 + }, + { + "epoch": 0.8449041679342866, + "grad_norm": 0.6327440142631531, + "learning_rate": 1.2769392743189335e-05, + "loss": 0.4714, + "step": 6943 + }, + { + "epoch": 0.8450258594463036, + "grad_norm": 2.0380287170410156, + "learning_rate": 1.2767518293201232e-05, + "loss": 0.4506, + "step": 6944 + }, + { + "epoch": 0.8451475509583206, + "grad_norm": 1.119584083557129, + "learning_rate": 1.2765643737903331e-05, + "loss": 0.4757, + "step": 6945 + }, + { + "epoch": 0.8452692424703377, + "grad_norm": 0.9903263449668884, + "learning_rate": 1.276376907736696e-05, + "loss": 0.429, + "step": 6946 + }, + { + "epoch": 0.8453909339823548, + "grad_norm": 1.4283775091171265, + "learning_rate": 1.2761894311663451e-05, + "loss": 0.4905, + "step": 6947 + }, + { + "epoch": 0.8455126254943718, + "grad_norm": 1.6286977529525757, + "learning_rate": 1.276001944086415e-05, + "loss": 0.4565, + "step": 6948 + }, + { + "epoch": 0.8456343170063888, + "grad_norm": 1.2994225025177002, + "learning_rate": 1.2758144465040396e-05, + "loss": 0.4266, + "step": 6949 + }, + { + "epoch": 0.8457560085184058, + "grad_norm": 0.5361411571502686, + "learning_rate": 1.2756269384263536e-05, + "loss": 0.4064, + "step": 6950 + }, + { + "epoch": 0.8458777000304228, + "grad_norm": 0.6261139512062073, + "learning_rate": 1.2754394198604923e-05, + "loss": 0.4544, + "step": 6951 + }, + { + "epoch": 0.8459993915424399, + "grad_norm": 1.6742547750473022, + "learning_rate": 1.275251890813591e-05, + "loss": 0.5153, + "step": 6952 + }, + { + "epoch": 0.846121083054457, + "grad_norm": 1.8770606517791748, + "learning_rate": 1.2750643512927849e-05, + "loss": 0.424, + "step": 6953 + }, + { + "epoch": 0.846242774566474, + "grad_norm": 0.8181618452072144, + "learning_rate": 1.2748768013052113e-05, + "loss": 0.4654, + "step": 6954 + }, + { + "epoch": 0.846364466078491, + "grad_norm": 1.6564304828643799, + "learning_rate": 1.2746892408580062e-05, + "loss": 0.4019, + "step": 6955 + }, + { + "epoch": 0.846486157590508, + "grad_norm": 1.423676609992981, + "learning_rate": 1.2745016699583074e-05, + "loss": 0.4491, + "step": 6956 + }, + { + "epoch": 0.8466078491025251, + "grad_norm": 0.8537080883979797, + "learning_rate": 1.2743140886132514e-05, + "loss": 0.4357, + "step": 6957 + }, + { + "epoch": 0.8467295406145421, + "grad_norm": 2.9443321228027344, + "learning_rate": 1.2741264968299767e-05, + "loss": 0.4013, + "step": 6958 + }, + { + "epoch": 0.8468512321265592, + "grad_norm": 1.2850115299224854, + "learning_rate": 1.2739388946156215e-05, + "loss": 0.4111, + "step": 6959 + }, + { + "epoch": 0.8469729236385762, + "grad_norm": 0.7939808368682861, + "learning_rate": 1.273751281977324e-05, + "loss": 0.4055, + "step": 6960 + }, + { + "epoch": 0.8470946151505933, + "grad_norm": 1.5699455738067627, + "learning_rate": 1.273563658922224e-05, + "loss": 0.4129, + "step": 6961 + }, + { + "epoch": 0.8472163066626103, + "grad_norm": 1.7347068786621094, + "learning_rate": 1.2733760254574606e-05, + "loss": 0.458, + "step": 6962 + }, + { + "epoch": 0.8473379981746273, + "grad_norm": 4.550609588623047, + "learning_rate": 1.2731883815901731e-05, + "loss": 0.5723, + "step": 6963 + }, + { + "epoch": 0.8474596896866443, + "grad_norm": 2.323796272277832, + "learning_rate": 1.2730007273275025e-05, + "loss": 0.4742, + "step": 6964 + }, + { + "epoch": 0.8475813811986614, + "grad_norm": 6.394164562225342, + "learning_rate": 1.2728130626765892e-05, + "loss": 0.554, + "step": 6965 + }, + { + "epoch": 0.8477030727106785, + "grad_norm": 2.1003170013427734, + "learning_rate": 1.2726253876445738e-05, + "loss": 0.4306, + "step": 6966 + }, + { + "epoch": 0.8478247642226955, + "grad_norm": 1.0300663709640503, + "learning_rate": 1.2724377022385985e-05, + "loss": 0.4196, + "step": 6967 + }, + { + "epoch": 0.8479464557347125, + "grad_norm": 2.2606160640716553, + "learning_rate": 1.2722500064658045e-05, + "loss": 0.5082, + "step": 6968 + }, + { + "epoch": 0.8480681472467295, + "grad_norm": 0.9641751646995544, + "learning_rate": 1.2720623003333343e-05, + "loss": 0.4564, + "step": 6969 + }, + { + "epoch": 0.8481898387587465, + "grad_norm": 3.5402324199676514, + "learning_rate": 1.2718745838483304e-05, + "loss": 0.4244, + "step": 6970 + }, + { + "epoch": 0.8483115302707637, + "grad_norm": 1.4322373867034912, + "learning_rate": 1.2716868570179359e-05, + "loss": 0.5406, + "step": 6971 + }, + { + "epoch": 0.8484332217827807, + "grad_norm": 1.7457278966903687, + "learning_rate": 1.271499119849294e-05, + "loss": 0.4959, + "step": 6972 + }, + { + "epoch": 0.8485549132947977, + "grad_norm": 1.1944416761398315, + "learning_rate": 1.2713113723495485e-05, + "loss": 0.514, + "step": 6973 + }, + { + "epoch": 0.8486766048068147, + "grad_norm": 6.41412353515625, + "learning_rate": 1.271123614525844e-05, + "loss": 0.4585, + "step": 6974 + }, + { + "epoch": 0.8487982963188317, + "grad_norm": 5.333705902099609, + "learning_rate": 1.2709358463853249e-05, + "loss": 0.4747, + "step": 6975 + }, + { + "epoch": 0.8489199878308488, + "grad_norm": 4.668219566345215, + "learning_rate": 1.2707480679351358e-05, + "loss": 0.4706, + "step": 6976 + }, + { + "epoch": 0.8490416793428658, + "grad_norm": 4.608364105224609, + "learning_rate": 1.2705602791824224e-05, + "loss": 0.4438, + "step": 6977 + }, + { + "epoch": 0.8491633708548829, + "grad_norm": 1.8792756795883179, + "learning_rate": 1.2703724801343303e-05, + "loss": 0.4628, + "step": 6978 + }, + { + "epoch": 0.8492850623668999, + "grad_norm": 2.620440721511841, + "learning_rate": 1.2701846707980056e-05, + "loss": 0.4182, + "step": 6979 + }, + { + "epoch": 0.849406753878917, + "grad_norm": 2.442648410797119, + "learning_rate": 1.269996851180595e-05, + "loss": 0.475, + "step": 6980 + }, + { + "epoch": 0.849528445390934, + "grad_norm": 1.1661570072174072, + "learning_rate": 1.2698090212892452e-05, + "loss": 0.4674, + "step": 6981 + }, + { + "epoch": 0.849650136902951, + "grad_norm": 2.131560802459717, + "learning_rate": 1.269621181131104e-05, + "loss": 0.4829, + "step": 6982 + }, + { + "epoch": 0.849771828414968, + "grad_norm": 1.2309439182281494, + "learning_rate": 1.2694333307133184e-05, + "loss": 0.4513, + "step": 6983 + }, + { + "epoch": 0.8498935199269851, + "grad_norm": 5.524863243103027, + "learning_rate": 1.2692454700430369e-05, + "loss": 0.5625, + "step": 6984 + }, + { + "epoch": 0.8500152114390022, + "grad_norm": 1.8256990909576416, + "learning_rate": 1.2690575991274083e-05, + "loss": 0.5141, + "step": 6985 + }, + { + "epoch": 0.8501369029510192, + "grad_norm": 0.8253006935119629, + "learning_rate": 1.268869717973581e-05, + "loss": 0.4413, + "step": 6986 + }, + { + "epoch": 0.8502585944630362, + "grad_norm": 0.9531517624855042, + "learning_rate": 1.2686818265887042e-05, + "loss": 0.4575, + "step": 6987 + }, + { + "epoch": 0.8503802859750532, + "grad_norm": 3.4873125553131104, + "learning_rate": 1.2684939249799277e-05, + "loss": 0.4095, + "step": 6988 + }, + { + "epoch": 0.8505019774870702, + "grad_norm": 1.0143766403198242, + "learning_rate": 1.2683060131544018e-05, + "loss": 0.468, + "step": 6989 + }, + { + "epoch": 0.8506236689990874, + "grad_norm": 1.0920214653015137, + "learning_rate": 1.2681180911192767e-05, + "loss": 0.4626, + "step": 6990 + }, + { + "epoch": 0.8507453605111044, + "grad_norm": 0.7049693465232849, + "learning_rate": 1.2679301588817034e-05, + "loss": 0.4518, + "step": 6991 + }, + { + "epoch": 0.8508670520231214, + "grad_norm": 0.6504801511764526, + "learning_rate": 1.2677422164488328e-05, + "loss": 0.4709, + "step": 6992 + }, + { + "epoch": 0.8509887435351384, + "grad_norm": 1.0879284143447876, + "learning_rate": 1.2675542638278166e-05, + "loss": 0.4197, + "step": 6993 + }, + { + "epoch": 0.8511104350471554, + "grad_norm": 1.1372443437576294, + "learning_rate": 1.2673663010258073e-05, + "loss": 0.4018, + "step": 6994 + }, + { + "epoch": 0.8512321265591725, + "grad_norm": 1.4385079145431519, + "learning_rate": 1.2671783280499563e-05, + "loss": 0.5101, + "step": 6995 + }, + { + "epoch": 0.8513538180711895, + "grad_norm": 1.0067007541656494, + "learning_rate": 1.266990344907417e-05, + "loss": 0.434, + "step": 6996 + }, + { + "epoch": 0.8514755095832066, + "grad_norm": 0.6502609848976135, + "learning_rate": 1.2668023516053426e-05, + "loss": 0.3992, + "step": 6997 + }, + { + "epoch": 0.8515972010952236, + "grad_norm": 2.3904335498809814, + "learning_rate": 1.2666143481508865e-05, + "loss": 0.478, + "step": 6998 + }, + { + "epoch": 0.8517188926072407, + "grad_norm": 0.9435008764266968, + "learning_rate": 1.2664263345512024e-05, + "loss": 0.4797, + "step": 6999 + }, + { + "epoch": 0.8518405841192577, + "grad_norm": 1.326151728630066, + "learning_rate": 1.2662383108134448e-05, + "loss": 0.496, + "step": 7000 + }, + { + "epoch": 0.8519622756312747, + "grad_norm": 1.5210660696029663, + "learning_rate": 1.2660502769447686e-05, + "loss": 0.4509, + "step": 7001 + }, + { + "epoch": 0.8520839671432917, + "grad_norm": 0.9029418230056763, + "learning_rate": 1.2658622329523287e-05, + "loss": 0.5236, + "step": 7002 + }, + { + "epoch": 0.8522056586553088, + "grad_norm": 2.1396021842956543, + "learning_rate": 1.2656741788432805e-05, + "loss": 0.3981, + "step": 7003 + }, + { + "epoch": 0.8523273501673259, + "grad_norm": 0.6995888948440552, + "learning_rate": 1.2654861146247794e-05, + "loss": 0.4642, + "step": 7004 + }, + { + "epoch": 0.8524490416793429, + "grad_norm": 4.560553550720215, + "learning_rate": 1.2652980403039827e-05, + "loss": 0.4239, + "step": 7005 + }, + { + "epoch": 0.8525707331913599, + "grad_norm": 2.2662789821624756, + "learning_rate": 1.2651099558880462e-05, + "loss": 0.4247, + "step": 7006 + }, + { + "epoch": 0.8526924247033769, + "grad_norm": 2.8812103271484375, + "learning_rate": 1.264921861384127e-05, + "loss": 0.4502, + "step": 7007 + }, + { + "epoch": 0.8528141162153939, + "grad_norm": 0.6335647702217102, + "learning_rate": 1.264733756799383e-05, + "loss": 0.4236, + "step": 7008 + }, + { + "epoch": 0.8529358077274111, + "grad_norm": 0.8655391931533813, + "learning_rate": 1.2645456421409712e-05, + "loss": 0.4455, + "step": 7009 + }, + { + "epoch": 0.8530574992394281, + "grad_norm": 2.688206672668457, + "learning_rate": 1.2643575174160503e-05, + "loss": 0.4724, + "step": 7010 + }, + { + "epoch": 0.8531791907514451, + "grad_norm": 0.983683705329895, + "learning_rate": 1.2641693826317785e-05, + "loss": 0.4607, + "step": 7011 + }, + { + "epoch": 0.8533008822634621, + "grad_norm": 0.7513974905014038, + "learning_rate": 1.2639812377953152e-05, + "loss": 0.4052, + "step": 7012 + }, + { + "epoch": 0.8534225737754791, + "grad_norm": 3.3914999961853027, + "learning_rate": 1.2637930829138192e-05, + "loss": 0.5243, + "step": 7013 + }, + { + "epoch": 0.8535442652874962, + "grad_norm": 1.1194955110549927, + "learning_rate": 1.2636049179944502e-05, + "loss": 0.445, + "step": 7014 + }, + { + "epoch": 0.8536659567995132, + "grad_norm": 1.1755385398864746, + "learning_rate": 1.2634167430443687e-05, + "loss": 0.4314, + "step": 7015 + }, + { + "epoch": 0.8537876483115303, + "grad_norm": 3.3257882595062256, + "learning_rate": 1.2632285580707349e-05, + "loss": 0.514, + "step": 7016 + }, + { + "epoch": 0.8539093398235473, + "grad_norm": 0.9307615756988525, + "learning_rate": 1.2630403630807096e-05, + "loss": 0.5101, + "step": 7017 + }, + { + "epoch": 0.8540310313355644, + "grad_norm": 1.614334225654602, + "learning_rate": 1.262852158081454e-05, + "loss": 0.4671, + "step": 7018 + }, + { + "epoch": 0.8541527228475814, + "grad_norm": 0.7288075089454651, + "learning_rate": 1.2626639430801293e-05, + "loss": 0.4697, + "step": 7019 + }, + { + "epoch": 0.8542744143595984, + "grad_norm": 2.6823551654815674, + "learning_rate": 1.2624757180838987e-05, + "loss": 0.4346, + "step": 7020 + }, + { + "epoch": 0.8543961058716154, + "grad_norm": 2.2687909603118896, + "learning_rate": 1.262287483099923e-05, + "loss": 0.4285, + "step": 7021 + }, + { + "epoch": 0.8545177973836325, + "grad_norm": 2.0029501914978027, + "learning_rate": 1.262099238135366e-05, + "loss": 0.4403, + "step": 7022 + }, + { + "epoch": 0.8546394888956496, + "grad_norm": 1.5380704402923584, + "learning_rate": 1.2619109831973903e-05, + "loss": 0.4675, + "step": 7023 + }, + { + "epoch": 0.8547611804076666, + "grad_norm": 1.8958693742752075, + "learning_rate": 1.2617227182931597e-05, + "loss": 0.386, + "step": 7024 + }, + { + "epoch": 0.8548828719196836, + "grad_norm": 2.458688259124756, + "learning_rate": 1.261534443429838e-05, + "loss": 0.5004, + "step": 7025 + }, + { + "epoch": 0.8550045634317006, + "grad_norm": 0.9578983187675476, + "learning_rate": 1.2613461586145892e-05, + "loss": 0.4134, + "step": 7026 + }, + { + "epoch": 0.8551262549437176, + "grad_norm": 1.2905571460723877, + "learning_rate": 1.2611578638545783e-05, + "loss": 0.4597, + "step": 7027 + }, + { + "epoch": 0.8552479464557348, + "grad_norm": 9.036710739135742, + "learning_rate": 1.26096955915697e-05, + "loss": 0.6334, + "step": 7028 + }, + { + "epoch": 0.8553696379677518, + "grad_norm": 2.6435916423797607, + "learning_rate": 1.2607812445289297e-05, + "loss": 0.4937, + "step": 7029 + }, + { + "epoch": 0.8554913294797688, + "grad_norm": 3.5096120834350586, + "learning_rate": 1.2605929199776234e-05, + "loss": 0.5237, + "step": 7030 + }, + { + "epoch": 0.8556130209917858, + "grad_norm": 1.8670804500579834, + "learning_rate": 1.2604045855102172e-05, + "loss": 0.4498, + "step": 7031 + }, + { + "epoch": 0.8557347125038028, + "grad_norm": 0.5056993961334229, + "learning_rate": 1.2602162411338775e-05, + "loss": 0.4072, + "step": 7032 + }, + { + "epoch": 0.8558564040158199, + "grad_norm": 1.6217615604400635, + "learning_rate": 1.260027886855771e-05, + "loss": 0.4865, + "step": 7033 + }, + { + "epoch": 0.8559780955278369, + "grad_norm": 1.4946330785751343, + "learning_rate": 1.259839522683065e-05, + "loss": 0.4389, + "step": 7034 + }, + { + "epoch": 0.856099787039854, + "grad_norm": 0.7999100089073181, + "learning_rate": 1.2596511486229277e-05, + "loss": 0.4596, + "step": 7035 + }, + { + "epoch": 0.856221478551871, + "grad_norm": 1.6567870378494263, + "learning_rate": 1.2594627646825265e-05, + "loss": 0.4584, + "step": 7036 + }, + { + "epoch": 0.856343170063888, + "grad_norm": 1.1645410060882568, + "learning_rate": 1.2592743708690305e-05, + "loss": 0.4197, + "step": 7037 + }, + { + "epoch": 0.8564648615759051, + "grad_norm": 0.6656481027603149, + "learning_rate": 1.2590859671896076e-05, + "loss": 0.4582, + "step": 7038 + }, + { + "epoch": 0.8565865530879221, + "grad_norm": 0.6635571718215942, + "learning_rate": 1.2588975536514276e-05, + "loss": 0.4571, + "step": 7039 + }, + { + "epoch": 0.8567082445999391, + "grad_norm": 0.7041589617729187, + "learning_rate": 1.2587091302616594e-05, + "loss": 0.4551, + "step": 7040 + }, + { + "epoch": 0.8568299361119562, + "grad_norm": 1.993066668510437, + "learning_rate": 1.2585206970274734e-05, + "loss": 0.4401, + "step": 7041 + }, + { + "epoch": 0.8569516276239733, + "grad_norm": 2.9001550674438477, + "learning_rate": 1.2583322539560401e-05, + "loss": 0.3901, + "step": 7042 + }, + { + "epoch": 0.8570733191359903, + "grad_norm": 0.9461142420768738, + "learning_rate": 1.25814380105453e-05, + "loss": 0.4657, + "step": 7043 + }, + { + "epoch": 0.8571950106480073, + "grad_norm": 1.4206129312515259, + "learning_rate": 1.2579553383301134e-05, + "loss": 0.4596, + "step": 7044 + }, + { + "epoch": 0.8573167021600243, + "grad_norm": 1.6657085418701172, + "learning_rate": 1.2577668657899622e-05, + "loss": 0.422, + "step": 7045 + }, + { + "epoch": 0.8574383936720413, + "grad_norm": 2.3552663326263428, + "learning_rate": 1.2575783834412488e-05, + "loss": 0.5153, + "step": 7046 + }, + { + "epoch": 0.8575600851840585, + "grad_norm": 0.9289820790290833, + "learning_rate": 1.2573898912911442e-05, + "loss": 0.4273, + "step": 7047 + }, + { + "epoch": 0.8576817766960755, + "grad_norm": 2.8469924926757812, + "learning_rate": 1.2572013893468216e-05, + "loss": 0.4772, + "step": 7048 + }, + { + "epoch": 0.8578034682080925, + "grad_norm": 2.5421817302703857, + "learning_rate": 1.2570128776154538e-05, + "loss": 0.5141, + "step": 7049 + }, + { + "epoch": 0.8579251597201095, + "grad_norm": 1.3275059461593628, + "learning_rate": 1.2568243561042141e-05, + "loss": 0.4079, + "step": 7050 + }, + { + "epoch": 0.8580468512321265, + "grad_norm": 0.9563422203063965, + "learning_rate": 1.2566358248202758e-05, + "loss": 0.4907, + "step": 7051 + }, + { + "epoch": 0.8581685427441436, + "grad_norm": 0.7685717940330505, + "learning_rate": 1.2564472837708134e-05, + "loss": 0.4813, + "step": 7052 + }, + { + "epoch": 0.8582902342561606, + "grad_norm": 1.4967350959777832, + "learning_rate": 1.2562587329630009e-05, + "loss": 0.4423, + "step": 7053 + }, + { + "epoch": 0.8584119257681777, + "grad_norm": 1.104519009590149, + "learning_rate": 1.2560701724040136e-05, + "loss": 0.4902, + "step": 7054 + }, + { + "epoch": 0.8585336172801947, + "grad_norm": 2.1416592597961426, + "learning_rate": 1.2558816021010259e-05, + "loss": 0.4408, + "step": 7055 + }, + { + "epoch": 0.8586553087922117, + "grad_norm": 0.6394123435020447, + "learning_rate": 1.2556930220612134e-05, + "loss": 0.4507, + "step": 7056 + }, + { + "epoch": 0.8587770003042288, + "grad_norm": 0.7312126159667969, + "learning_rate": 1.2555044322917523e-05, + "loss": 0.4624, + "step": 7057 + }, + { + "epoch": 0.8588986918162458, + "grad_norm": 2.855180025100708, + "learning_rate": 1.2553158327998186e-05, + "loss": 0.5386, + "step": 7058 + }, + { + "epoch": 0.8590203833282628, + "grad_norm": 2.4441428184509277, + "learning_rate": 1.255127223592589e-05, + "loss": 0.4996, + "step": 7059 + }, + { + "epoch": 0.8591420748402799, + "grad_norm": 1.8379968404769897, + "learning_rate": 1.2549386046772408e-05, + "loss": 0.4044, + "step": 7060 + }, + { + "epoch": 0.859263766352297, + "grad_norm": 1.8424160480499268, + "learning_rate": 1.2547499760609507e-05, + "loss": 0.417, + "step": 7061 + }, + { + "epoch": 0.859385457864314, + "grad_norm": 0.595614492893219, + "learning_rate": 1.2545613377508967e-05, + "loss": 0.4212, + "step": 7062 + }, + { + "epoch": 0.859507149376331, + "grad_norm": 1.5273895263671875, + "learning_rate": 1.254372689754257e-05, + "loss": 0.4303, + "step": 7063 + }, + { + "epoch": 0.859628840888348, + "grad_norm": 0.9880451560020447, + "learning_rate": 1.2541840320782097e-05, + "loss": 0.4197, + "step": 7064 + }, + { + "epoch": 0.859750532400365, + "grad_norm": 3.9398880004882812, + "learning_rate": 1.2539953647299341e-05, + "loss": 0.4562, + "step": 7065 + }, + { + "epoch": 0.8598722239123822, + "grad_norm": 1.0115792751312256, + "learning_rate": 1.2538066877166093e-05, + "loss": 0.4358, + "step": 7066 + }, + { + "epoch": 0.8599939154243992, + "grad_norm": 0.5628373026847839, + "learning_rate": 1.2536180010454142e-05, + "loss": 0.3998, + "step": 7067 + }, + { + "epoch": 0.8601156069364162, + "grad_norm": 1.36702561378479, + "learning_rate": 1.2534293047235292e-05, + "loss": 0.4534, + "step": 7068 + }, + { + "epoch": 0.8602372984484332, + "grad_norm": 1.2451943159103394, + "learning_rate": 1.253240598758135e-05, + "loss": 0.3906, + "step": 7069 + }, + { + "epoch": 0.8603589899604502, + "grad_norm": 1.6277918815612793, + "learning_rate": 1.2530518831564119e-05, + "loss": 0.4692, + "step": 7070 + }, + { + "epoch": 0.8604806814724673, + "grad_norm": 1.4523383378982544, + "learning_rate": 1.2528631579255404e-05, + "loss": 0.43, + "step": 7071 + }, + { + "epoch": 0.8606023729844844, + "grad_norm": 2.8507893085479736, + "learning_rate": 1.2526744230727027e-05, + "loss": 0.5151, + "step": 7072 + }, + { + "epoch": 0.8607240644965014, + "grad_norm": 2.7794971466064453, + "learning_rate": 1.25248567860508e-05, + "loss": 0.3745, + "step": 7073 + }, + { + "epoch": 0.8608457560085184, + "grad_norm": 0.602327823638916, + "learning_rate": 1.2522969245298546e-05, + "loss": 0.4562, + "step": 7074 + }, + { + "epoch": 0.8609674475205354, + "grad_norm": 2.184446334838867, + "learning_rate": 1.2521081608542089e-05, + "loss": 0.4258, + "step": 7075 + }, + { + "epoch": 0.8610891390325525, + "grad_norm": 0.6675443053245544, + "learning_rate": 1.2519193875853261e-05, + "loss": 0.5, + "step": 7076 + }, + { + "epoch": 0.8612108305445695, + "grad_norm": 0.8092020153999329, + "learning_rate": 1.2517306047303893e-05, + "loss": 0.4604, + "step": 7077 + }, + { + "epoch": 0.8613325220565865, + "grad_norm": 2.3450775146484375, + "learning_rate": 1.2515418122965817e-05, + "loss": 0.4237, + "step": 7078 + }, + { + "epoch": 0.8614542135686036, + "grad_norm": 2.036470890045166, + "learning_rate": 1.2513530102910873e-05, + "loss": 0.5295, + "step": 7079 + }, + { + "epoch": 0.8615759050806207, + "grad_norm": 0.8510911464691162, + "learning_rate": 1.251164198721091e-05, + "loss": 0.5133, + "step": 7080 + }, + { + "epoch": 0.8616975965926377, + "grad_norm": 3.924497365951538, + "learning_rate": 1.2509753775937765e-05, + "loss": 0.4127, + "step": 7081 + }, + { + "epoch": 0.8618192881046547, + "grad_norm": 2.202623128890991, + "learning_rate": 1.2507865469163298e-05, + "loss": 0.531, + "step": 7082 + }, + { + "epoch": 0.8619409796166717, + "grad_norm": 2.0649092197418213, + "learning_rate": 1.2505977066959358e-05, + "loss": 0.422, + "step": 7083 + }, + { + "epoch": 0.8620626711286887, + "grad_norm": 1.8406898975372314, + "learning_rate": 1.2504088569397805e-05, + "loss": 0.4662, + "step": 7084 + }, + { + "epoch": 0.8621843626407059, + "grad_norm": 2.1745588779449463, + "learning_rate": 1.2502199976550498e-05, + "loss": 0.5012, + "step": 7085 + }, + { + "epoch": 0.8623060541527229, + "grad_norm": 2.647151470184326, + "learning_rate": 1.25003112884893e-05, + "loss": 0.4238, + "step": 7086 + }, + { + "epoch": 0.8624277456647399, + "grad_norm": 0.8039729595184326, + "learning_rate": 1.2498422505286082e-05, + "loss": 0.4899, + "step": 7087 + }, + { + "epoch": 0.8625494371767569, + "grad_norm": 1.5099488496780396, + "learning_rate": 1.2496533627012719e-05, + "loss": 0.4487, + "step": 7088 + }, + { + "epoch": 0.8626711286887739, + "grad_norm": 0.8914634585380554, + "learning_rate": 1.2494644653741087e-05, + "loss": 0.4621, + "step": 7089 + }, + { + "epoch": 0.862792820200791, + "grad_norm": 2.4344775676727295, + "learning_rate": 1.2492755585543055e-05, + "loss": 0.4127, + "step": 7090 + }, + { + "epoch": 0.8629145117128081, + "grad_norm": 1.771337628364563, + "learning_rate": 1.2490866422490515e-05, + "loss": 0.4578, + "step": 7091 + }, + { + "epoch": 0.8630362032248251, + "grad_norm": 0.6396241188049316, + "learning_rate": 1.2488977164655355e-05, + "loss": 0.4695, + "step": 7092 + }, + { + "epoch": 0.8631578947368421, + "grad_norm": 1.6561543941497803, + "learning_rate": 1.248708781210946e-05, + "loss": 0.4, + "step": 7093 + }, + { + "epoch": 0.8632795862488591, + "grad_norm": 0.6191179752349854, + "learning_rate": 1.2485198364924726e-05, + "loss": 0.4182, + "step": 7094 + }, + { + "epoch": 0.8634012777608762, + "grad_norm": 0.6067349314689636, + "learning_rate": 1.2483308823173052e-05, + "loss": 0.4489, + "step": 7095 + }, + { + "epoch": 0.8635229692728932, + "grad_norm": 1.1665351390838623, + "learning_rate": 1.2481419186926335e-05, + "loss": 0.4723, + "step": 7096 + }, + { + "epoch": 0.8636446607849102, + "grad_norm": 1.1563318967819214, + "learning_rate": 1.2479529456256483e-05, + "loss": 0.4638, + "step": 7097 + }, + { + "epoch": 0.8637663522969273, + "grad_norm": 2.5375454425811768, + "learning_rate": 1.24776396312354e-05, + "loss": 0.5132, + "step": 7098 + }, + { + "epoch": 0.8638880438089444, + "grad_norm": 1.057485580444336, + "learning_rate": 1.2475749711935006e-05, + "loss": 0.4776, + "step": 7099 + }, + { + "epoch": 0.8640097353209614, + "grad_norm": 0.8790457844734192, + "learning_rate": 1.2473859698427209e-05, + "loss": 0.4589, + "step": 7100 + }, + { + "epoch": 0.8641314268329784, + "grad_norm": 3.862192153930664, + "learning_rate": 1.2471969590783926e-05, + "loss": 0.3849, + "step": 7101 + }, + { + "epoch": 0.8642531183449954, + "grad_norm": 0.7336155772209167, + "learning_rate": 1.2470079389077085e-05, + "loss": 0.4815, + "step": 7102 + }, + { + "epoch": 0.8643748098570124, + "grad_norm": 4.680565357208252, + "learning_rate": 1.2468189093378613e-05, + "loss": 0.4125, + "step": 7103 + }, + { + "epoch": 0.8644965013690296, + "grad_norm": 1.2660815715789795, + "learning_rate": 1.2466298703760434e-05, + "loss": 0.484, + "step": 7104 + }, + { + "epoch": 0.8646181928810466, + "grad_norm": 0.9602317810058594, + "learning_rate": 1.2464408220294483e-05, + "loss": 0.4982, + "step": 7105 + }, + { + "epoch": 0.8647398843930636, + "grad_norm": 1.0130568742752075, + "learning_rate": 1.2462517643052704e-05, + "loss": 0.4701, + "step": 7106 + }, + { + "epoch": 0.8648615759050806, + "grad_norm": 1.9429272413253784, + "learning_rate": 1.246062697210703e-05, + "loss": 0.4102, + "step": 7107 + }, + { + "epoch": 0.8649832674170976, + "grad_norm": 0.5669442415237427, + "learning_rate": 1.2458736207529403e-05, + "loss": 0.4701, + "step": 7108 + }, + { + "epoch": 0.8651049589291147, + "grad_norm": 2.781632423400879, + "learning_rate": 1.2456845349391776e-05, + "loss": 0.5548, + "step": 7109 + }, + { + "epoch": 0.8652266504411318, + "grad_norm": 1.8443169593811035, + "learning_rate": 1.2454954397766099e-05, + "loss": 0.455, + "step": 7110 + }, + { + "epoch": 0.8653483419531488, + "grad_norm": 0.7192366719245911, + "learning_rate": 1.2453063352724324e-05, + "loss": 0.4606, + "step": 7111 + }, + { + "epoch": 0.8654700334651658, + "grad_norm": 3.1163535118103027, + "learning_rate": 1.2451172214338411e-05, + "loss": 0.5635, + "step": 7112 + }, + { + "epoch": 0.8655917249771828, + "grad_norm": 0.7690176367759705, + "learning_rate": 1.2449280982680324e-05, + "loss": 0.4425, + "step": 7113 + }, + { + "epoch": 0.8657134164891999, + "grad_norm": 1.1607067584991455, + "learning_rate": 1.2447389657822025e-05, + "loss": 0.3777, + "step": 7114 + }, + { + "epoch": 0.8658351080012169, + "grad_norm": 0.8777136206626892, + "learning_rate": 1.2445498239835481e-05, + "loss": 0.4317, + "step": 7115 + }, + { + "epoch": 0.8659567995132339, + "grad_norm": 1.2214540243148804, + "learning_rate": 1.2443606728792667e-05, + "loss": 0.4019, + "step": 7116 + }, + { + "epoch": 0.866078491025251, + "grad_norm": 0.7902207374572754, + "learning_rate": 1.244171512476556e-05, + "loss": 0.3849, + "step": 7117 + }, + { + "epoch": 0.866200182537268, + "grad_norm": 3.835721969604492, + "learning_rate": 1.2439823427826144e-05, + "loss": 0.4813, + "step": 7118 + }, + { + "epoch": 0.8663218740492851, + "grad_norm": 2.578134298324585, + "learning_rate": 1.243793163804639e-05, + "loss": 0.4414, + "step": 7119 + }, + { + "epoch": 0.8664435655613021, + "grad_norm": 0.7175315618515015, + "learning_rate": 1.2436039755498292e-05, + "loss": 0.4005, + "step": 7120 + }, + { + "epoch": 0.8665652570733191, + "grad_norm": 2.298964262008667, + "learning_rate": 1.2434147780253837e-05, + "loss": 0.4534, + "step": 7121 + }, + { + "epoch": 0.8666869485853361, + "grad_norm": 1.3722517490386963, + "learning_rate": 1.2432255712385025e-05, + "loss": 0.427, + "step": 7122 + }, + { + "epoch": 0.8668086400973533, + "grad_norm": 4.137610912322998, + "learning_rate": 1.2430363551963849e-05, + "loss": 0.5157, + "step": 7123 + }, + { + "epoch": 0.8669303316093703, + "grad_norm": 0.8999399542808533, + "learning_rate": 1.2428471299062306e-05, + "loss": 0.4493, + "step": 7124 + }, + { + "epoch": 0.8670520231213873, + "grad_norm": 0.6382309794425964, + "learning_rate": 1.2426578953752404e-05, + "loss": 0.4157, + "step": 7125 + }, + { + "epoch": 0.8671737146334043, + "grad_norm": 2.151242971420288, + "learning_rate": 1.2424686516106153e-05, + "loss": 0.4643, + "step": 7126 + }, + { + "epoch": 0.8672954061454213, + "grad_norm": 1.0443936586380005, + "learning_rate": 1.2422793986195556e-05, + "loss": 0.5304, + "step": 7127 + }, + { + "epoch": 0.8674170976574384, + "grad_norm": 1.3880633115768433, + "learning_rate": 1.2420901364092634e-05, + "loss": 0.4677, + "step": 7128 + }, + { + "epoch": 0.8675387891694555, + "grad_norm": 0.8804852366447449, + "learning_rate": 1.2419008649869408e-05, + "loss": 0.444, + "step": 7129 + }, + { + "epoch": 0.8676604806814725, + "grad_norm": 3.4794905185699463, + "learning_rate": 1.2417115843597896e-05, + "loss": 0.4505, + "step": 7130 + }, + { + "epoch": 0.8677821721934895, + "grad_norm": 1.386530876159668, + "learning_rate": 1.2415222945350118e-05, + "loss": 0.4642, + "step": 7131 + }, + { + "epoch": 0.8679038637055065, + "grad_norm": 4.581042766571045, + "learning_rate": 1.241332995519811e-05, + "loss": 0.564, + "step": 7132 + }, + { + "epoch": 0.8680255552175236, + "grad_norm": 2.8398358821868896, + "learning_rate": 1.2411436873213902e-05, + "loss": 0.4133, + "step": 7133 + }, + { + "epoch": 0.8681472467295406, + "grad_norm": 1.2776179313659668, + "learning_rate": 1.2409543699469531e-05, + "loss": 0.479, + "step": 7134 + }, + { + "epoch": 0.8682689382415576, + "grad_norm": 0.623054027557373, + "learning_rate": 1.2407650434037036e-05, + "loss": 0.4777, + "step": 7135 + }, + { + "epoch": 0.8683906297535747, + "grad_norm": 1.7206213474273682, + "learning_rate": 1.2405757076988454e-05, + "loss": 0.4599, + "step": 7136 + }, + { + "epoch": 0.8685123212655917, + "grad_norm": 2.46237850189209, + "learning_rate": 1.2403863628395838e-05, + "loss": 0.3843, + "step": 7137 + }, + { + "epoch": 0.8686340127776088, + "grad_norm": 2.648427724838257, + "learning_rate": 1.2401970088331234e-05, + "loss": 0.4233, + "step": 7138 + }, + { + "epoch": 0.8687557042896258, + "grad_norm": 1.5909925699234009, + "learning_rate": 1.2400076456866696e-05, + "loss": 0.4339, + "step": 7139 + }, + { + "epoch": 0.8688773958016428, + "grad_norm": 1.2027496099472046, + "learning_rate": 1.2398182734074282e-05, + "loss": 0.4169, + "step": 7140 + }, + { + "epoch": 0.8689990873136598, + "grad_norm": 1.9359771013259888, + "learning_rate": 1.2396288920026055e-05, + "loss": 0.4506, + "step": 7141 + }, + { + "epoch": 0.869120778825677, + "grad_norm": 0.8780108690261841, + "learning_rate": 1.2394395014794066e-05, + "loss": 0.3983, + "step": 7142 + }, + { + "epoch": 0.869242470337694, + "grad_norm": 2.922348976135254, + "learning_rate": 1.2392501018450395e-05, + "loss": 0.5137, + "step": 7143 + }, + { + "epoch": 0.869364161849711, + "grad_norm": 2.718005418777466, + "learning_rate": 1.239060693106711e-05, + "loss": 0.4996, + "step": 7144 + }, + { + "epoch": 0.869485853361728, + "grad_norm": 1.3465734720230103, + "learning_rate": 1.2388712752716281e-05, + "loss": 0.4439, + "step": 7145 + }, + { + "epoch": 0.869607544873745, + "grad_norm": 1.294167399406433, + "learning_rate": 1.238681848346999e-05, + "loss": 0.3801, + "step": 7146 + }, + { + "epoch": 0.869729236385762, + "grad_norm": 2.0159707069396973, + "learning_rate": 1.238492412340031e-05, + "loss": 0.5163, + "step": 7147 + }, + { + "epoch": 0.8698509278977792, + "grad_norm": 0.953387975692749, + "learning_rate": 1.2383029672579335e-05, + "loss": 0.4632, + "step": 7148 + }, + { + "epoch": 0.8699726194097962, + "grad_norm": 2.860001802444458, + "learning_rate": 1.2381135131079145e-05, + "loss": 0.4683, + "step": 7149 + }, + { + "epoch": 0.8700943109218132, + "grad_norm": 1.1687285900115967, + "learning_rate": 1.2379240498971837e-05, + "loss": 0.4774, + "step": 7150 + }, + { + "epoch": 0.8702160024338302, + "grad_norm": 3.3176209926605225, + "learning_rate": 1.2377345776329502e-05, + "loss": 0.4032, + "step": 7151 + }, + { + "epoch": 0.8703376939458473, + "grad_norm": 0.8278681635856628, + "learning_rate": 1.237545096322424e-05, + "loss": 0.4556, + "step": 7152 + }, + { + "epoch": 0.8704593854578643, + "grad_norm": 2.099942684173584, + "learning_rate": 1.2373556059728153e-05, + "loss": 0.4425, + "step": 7153 + }, + { + "epoch": 0.8705810769698813, + "grad_norm": 1.1819173097610474, + "learning_rate": 1.2371661065913343e-05, + "loss": 0.4999, + "step": 7154 + }, + { + "epoch": 0.8707027684818984, + "grad_norm": 2.0561752319335938, + "learning_rate": 1.236976598185192e-05, + "loss": 0.4994, + "step": 7155 + }, + { + "epoch": 0.8708244599939154, + "grad_norm": 1.27903413772583, + "learning_rate": 1.2367870807615998e-05, + "loss": 0.4209, + "step": 7156 + }, + { + "epoch": 0.8709461515059325, + "grad_norm": 1.2673401832580566, + "learning_rate": 1.2365975543277688e-05, + "loss": 0.4317, + "step": 7157 + }, + { + "epoch": 0.8710678430179495, + "grad_norm": 2.4399564266204834, + "learning_rate": 1.2364080188909116e-05, + "loss": 0.418, + "step": 7158 + }, + { + "epoch": 0.8711895345299665, + "grad_norm": 5.944895267486572, + "learning_rate": 1.2362184744582396e-05, + "loss": 0.5677, + "step": 7159 + }, + { + "epoch": 0.8713112260419835, + "grad_norm": 3.566612958908081, + "learning_rate": 1.2360289210369658e-05, + "loss": 0.5076, + "step": 7160 + }, + { + "epoch": 0.8714329175540007, + "grad_norm": 0.9285900592803955, + "learning_rate": 1.235839358634303e-05, + "loss": 0.4202, + "step": 7161 + }, + { + "epoch": 0.8715546090660177, + "grad_norm": 0.7251654267311096, + "learning_rate": 1.2356497872574642e-05, + "loss": 0.4246, + "step": 7162 + }, + { + "epoch": 0.8716763005780347, + "grad_norm": 0.7016144394874573, + "learning_rate": 1.2354602069136636e-05, + "loss": 0.4016, + "step": 7163 + }, + { + "epoch": 0.8717979920900517, + "grad_norm": 2.1173007488250732, + "learning_rate": 1.2352706176101147e-05, + "loss": 0.4487, + "step": 7164 + }, + { + "epoch": 0.8719196836020687, + "grad_norm": 1.1782103776931763, + "learning_rate": 1.2350810193540318e-05, + "loss": 0.4655, + "step": 7165 + }, + { + "epoch": 0.8720413751140857, + "grad_norm": 2.83894419670105, + "learning_rate": 1.2348914121526292e-05, + "loss": 0.4468, + "step": 7166 + }, + { + "epoch": 0.8721630666261029, + "grad_norm": 4.003988742828369, + "learning_rate": 1.2347017960131225e-05, + "loss": 0.4113, + "step": 7167 + }, + { + "epoch": 0.8722847581381199, + "grad_norm": 2.185760259628296, + "learning_rate": 1.2345121709427265e-05, + "loss": 0.4315, + "step": 7168 + }, + { + "epoch": 0.8724064496501369, + "grad_norm": 0.6458480954170227, + "learning_rate": 1.2343225369486569e-05, + "loss": 0.4639, + "step": 7169 + }, + { + "epoch": 0.8725281411621539, + "grad_norm": 0.8225185871124268, + "learning_rate": 1.23413289403813e-05, + "loss": 0.4299, + "step": 7170 + }, + { + "epoch": 0.872649832674171, + "grad_norm": 2.206825017929077, + "learning_rate": 1.233943242218362e-05, + "loss": 0.5521, + "step": 7171 + }, + { + "epoch": 0.872771524186188, + "grad_norm": 1.1052758693695068, + "learning_rate": 1.2337535814965688e-05, + "loss": 0.4625, + "step": 7172 + }, + { + "epoch": 0.8728932156982051, + "grad_norm": 1.6416510343551636, + "learning_rate": 1.2335639118799682e-05, + "loss": 0.459, + "step": 7173 + }, + { + "epoch": 0.8730149072102221, + "grad_norm": 0.891686737537384, + "learning_rate": 1.2333742333757776e-05, + "loss": 0.4849, + "step": 7174 + }, + { + "epoch": 0.8731365987222391, + "grad_norm": 0.7240790128707886, + "learning_rate": 1.2331845459912144e-05, + "loss": 0.4103, + "step": 7175 + }, + { + "epoch": 0.8732582902342562, + "grad_norm": 0.6536197066307068, + "learning_rate": 1.2329948497334963e-05, + "loss": 0.443, + "step": 7176 + }, + { + "epoch": 0.8733799817462732, + "grad_norm": 1.9579108953475952, + "learning_rate": 1.2328051446098418e-05, + "loss": 0.4965, + "step": 7177 + }, + { + "epoch": 0.8735016732582902, + "grad_norm": 2.1566762924194336, + "learning_rate": 1.2326154306274698e-05, + "loss": 0.5056, + "step": 7178 + }, + { + "epoch": 0.8736233647703072, + "grad_norm": 1.443878173828125, + "learning_rate": 1.2324257077935993e-05, + "loss": 0.4966, + "step": 7179 + }, + { + "epoch": 0.8737450562823244, + "grad_norm": 1.3102716207504272, + "learning_rate": 1.2322359761154493e-05, + "loss": 0.4265, + "step": 7180 + }, + { + "epoch": 0.8738667477943414, + "grad_norm": 0.9915708899497986, + "learning_rate": 1.2320462356002399e-05, + "loss": 0.476, + "step": 7181 + }, + { + "epoch": 0.8739884393063584, + "grad_norm": 3.007075786590576, + "learning_rate": 1.2318564862551908e-05, + "loss": 0.424, + "step": 7182 + }, + { + "epoch": 0.8741101308183754, + "grad_norm": 0.8018471002578735, + "learning_rate": 1.2316667280875226e-05, + "loss": 0.4277, + "step": 7183 + }, + { + "epoch": 0.8742318223303924, + "grad_norm": 2.0959227085113525, + "learning_rate": 1.2314769611044557e-05, + "loss": 0.4004, + "step": 7184 + }, + { + "epoch": 0.8743535138424094, + "grad_norm": 0.8163623809814453, + "learning_rate": 1.2312871853132114e-05, + "loss": 0.4715, + "step": 7185 + }, + { + "epoch": 0.8744752053544266, + "grad_norm": 1.8603123426437378, + "learning_rate": 1.231097400721011e-05, + "loss": 0.4353, + "step": 7186 + }, + { + "epoch": 0.8745968968664436, + "grad_norm": 0.8322412967681885, + "learning_rate": 1.2309076073350761e-05, + "loss": 0.4262, + "step": 7187 + }, + { + "epoch": 0.8747185883784606, + "grad_norm": 0.9720268249511719, + "learning_rate": 1.2307178051626287e-05, + "loss": 0.4559, + "step": 7188 + }, + { + "epoch": 0.8748402798904776, + "grad_norm": 0.630632758140564, + "learning_rate": 1.2305279942108914e-05, + "loss": 0.4697, + "step": 7189 + }, + { + "epoch": 0.8749619714024947, + "grad_norm": 2.5271854400634766, + "learning_rate": 1.2303381744870868e-05, + "loss": 0.4924, + "step": 7190 + }, + { + "epoch": 0.8750836629145117, + "grad_norm": 1.0941441059112549, + "learning_rate": 1.2301483459984375e-05, + "loss": 0.4539, + "step": 7191 + }, + { + "epoch": 0.8752053544265288, + "grad_norm": 0.6484375596046448, + "learning_rate": 1.2299585087521675e-05, + "loss": 0.4354, + "step": 7192 + }, + { + "epoch": 0.8753270459385458, + "grad_norm": 3.5244596004486084, + "learning_rate": 1.2297686627555006e-05, + "loss": 0.532, + "step": 7193 + }, + { + "epoch": 0.8754487374505628, + "grad_norm": 0.7241256833076477, + "learning_rate": 1.2295788080156604e-05, + "loss": 0.434, + "step": 7194 + }, + { + "epoch": 0.8755704289625799, + "grad_norm": 1.1277343034744263, + "learning_rate": 1.229388944539871e-05, + "loss": 0.497, + "step": 7195 + }, + { + "epoch": 0.8756921204745969, + "grad_norm": 1.9367152452468872, + "learning_rate": 1.2291990723353573e-05, + "loss": 0.4692, + "step": 7196 + }, + { + "epoch": 0.8758138119866139, + "grad_norm": 1.7133387327194214, + "learning_rate": 1.229009191409345e-05, + "loss": 0.5249, + "step": 7197 + }, + { + "epoch": 0.8759355034986309, + "grad_norm": 2.437053680419922, + "learning_rate": 1.2288193017690588e-05, + "loss": 0.4532, + "step": 7198 + }, + { + "epoch": 0.876057195010648, + "grad_norm": 1.1704226732254028, + "learning_rate": 1.2286294034217243e-05, + "loss": 0.4483, + "step": 7199 + }, + { + "epoch": 0.8761788865226651, + "grad_norm": 2.379946708679199, + "learning_rate": 1.2284394963745679e-05, + "loss": 0.4841, + "step": 7200 + }, + { + "epoch": 0.8763005780346821, + "grad_norm": 1.1669596433639526, + "learning_rate": 1.228249580634816e-05, + "loss": 0.4965, + "step": 7201 + }, + { + "epoch": 0.8764222695466991, + "grad_norm": 1.0141856670379639, + "learning_rate": 1.2280596562096947e-05, + "loss": 0.464, + "step": 7202 + }, + { + "epoch": 0.8765439610587161, + "grad_norm": 2.4254069328308105, + "learning_rate": 1.2278697231064317e-05, + "loss": 0.4599, + "step": 7203 + }, + { + "epoch": 0.8766656525707331, + "grad_norm": 2.0517711639404297, + "learning_rate": 1.2276797813322541e-05, + "loss": 0.3785, + "step": 7204 + }, + { + "epoch": 0.8767873440827503, + "grad_norm": 1.5930449962615967, + "learning_rate": 1.2274898308943896e-05, + "loss": 0.4806, + "step": 7205 + }, + { + "epoch": 0.8769090355947673, + "grad_norm": 2.6375033855438232, + "learning_rate": 1.227299871800066e-05, + "loss": 0.4879, + "step": 7206 + }, + { + "epoch": 0.8770307271067843, + "grad_norm": 3.8798134326934814, + "learning_rate": 1.2271099040565118e-05, + "loss": 0.5059, + "step": 7207 + }, + { + "epoch": 0.8771524186188013, + "grad_norm": 2.800853729248047, + "learning_rate": 1.2269199276709555e-05, + "loss": 0.5118, + "step": 7208 + }, + { + "epoch": 0.8772741101308184, + "grad_norm": 0.7687027454376221, + "learning_rate": 1.2267299426506267e-05, + "loss": 0.413, + "step": 7209 + }, + { + "epoch": 0.8773958016428354, + "grad_norm": 4.138387203216553, + "learning_rate": 1.226539949002754e-05, + "loss": 0.5305, + "step": 7210 + }, + { + "epoch": 0.8775174931548525, + "grad_norm": 0.5947757363319397, + "learning_rate": 1.2263499467345673e-05, + "loss": 0.4685, + "step": 7211 + }, + { + "epoch": 0.8776391846668695, + "grad_norm": 0.7174993753433228, + "learning_rate": 1.2261599358532965e-05, + "loss": 0.4538, + "step": 7212 + }, + { + "epoch": 0.8777608761788865, + "grad_norm": 1.968227505683899, + "learning_rate": 1.2259699163661725e-05, + "loss": 0.4028, + "step": 7213 + }, + { + "epoch": 0.8778825676909036, + "grad_norm": 1.0651625394821167, + "learning_rate": 1.225779888280425e-05, + "loss": 0.4257, + "step": 7214 + }, + { + "epoch": 0.8780042592029206, + "grad_norm": 2.9992117881774902, + "learning_rate": 1.2255898516032853e-05, + "loss": 0.4499, + "step": 7215 + }, + { + "epoch": 0.8781259507149376, + "grad_norm": 1.5156244039535522, + "learning_rate": 1.2253998063419852e-05, + "loss": 0.4771, + "step": 7216 + }, + { + "epoch": 0.8782476422269546, + "grad_norm": 0.7558187246322632, + "learning_rate": 1.2252097525037558e-05, + "loss": 0.4056, + "step": 7217 + }, + { + "epoch": 0.8783693337389717, + "grad_norm": 0.6978296041488647, + "learning_rate": 1.2250196900958291e-05, + "loss": 0.41, + "step": 7218 + }, + { + "epoch": 0.8784910252509888, + "grad_norm": 0.7302688360214233, + "learning_rate": 1.2248296191254375e-05, + "loss": 0.3786, + "step": 7219 + }, + { + "epoch": 0.8786127167630058, + "grad_norm": 1.1655205488204956, + "learning_rate": 1.2246395395998135e-05, + "loss": 0.4495, + "step": 7220 + }, + { + "epoch": 0.8787344082750228, + "grad_norm": 0.8397741317749023, + "learning_rate": 1.2244494515261902e-05, + "loss": 0.467, + "step": 7221 + }, + { + "epoch": 0.8788560997870398, + "grad_norm": 2.247478723526001, + "learning_rate": 1.2242593549118003e-05, + "loss": 0.4866, + "step": 7222 + }, + { + "epoch": 0.8789777912990568, + "grad_norm": 1.0904483795166016, + "learning_rate": 1.2240692497638778e-05, + "loss": 0.4166, + "step": 7223 + }, + { + "epoch": 0.879099482811074, + "grad_norm": 0.6437256932258606, + "learning_rate": 1.2238791360896569e-05, + "loss": 0.3745, + "step": 7224 + }, + { + "epoch": 0.879221174323091, + "grad_norm": 0.959119439125061, + "learning_rate": 1.2236890138963711e-05, + "loss": 0.4285, + "step": 7225 + }, + { + "epoch": 0.879342865835108, + "grad_norm": 1.366894245147705, + "learning_rate": 1.2234988831912553e-05, + "loss": 0.4823, + "step": 7226 + }, + { + "epoch": 0.879464557347125, + "grad_norm": 2.7522079944610596, + "learning_rate": 1.2233087439815447e-05, + "loss": 0.4738, + "step": 7227 + }, + { + "epoch": 0.879586248859142, + "grad_norm": 1.0359946489334106, + "learning_rate": 1.2231185962744742e-05, + "loss": 0.4299, + "step": 7228 + }, + { + "epoch": 0.8797079403711591, + "grad_norm": 1.0898159742355347, + "learning_rate": 1.222928440077279e-05, + "loss": 0.4624, + "step": 7229 + }, + { + "epoch": 0.8798296318831762, + "grad_norm": 1.9964925050735474, + "learning_rate": 1.2227382753971953e-05, + "loss": 0.5015, + "step": 7230 + }, + { + "epoch": 0.8799513233951932, + "grad_norm": 0.7144758701324463, + "learning_rate": 1.2225481022414592e-05, + "loss": 0.4422, + "step": 7231 + }, + { + "epoch": 0.8800730149072102, + "grad_norm": 1.533362865447998, + "learning_rate": 1.2223579206173071e-05, + "loss": 0.4291, + "step": 7232 + }, + { + "epoch": 0.8801947064192273, + "grad_norm": 1.6278109550476074, + "learning_rate": 1.2221677305319762e-05, + "loss": 0.4924, + "step": 7233 + }, + { + "epoch": 0.8803163979312443, + "grad_norm": 0.7324231863021851, + "learning_rate": 1.2219775319927027e-05, + "loss": 0.4821, + "step": 7234 + }, + { + "epoch": 0.8804380894432613, + "grad_norm": 2.150149345397949, + "learning_rate": 1.2217873250067252e-05, + "loss": 0.4658, + "step": 7235 + }, + { + "epoch": 0.8805597809552783, + "grad_norm": 4.297105312347412, + "learning_rate": 1.2215971095812805e-05, + "loss": 0.4161, + "step": 7236 + }, + { + "epoch": 0.8806814724672954, + "grad_norm": 1.6495928764343262, + "learning_rate": 1.2214068857236072e-05, + "loss": 0.4556, + "step": 7237 + }, + { + "epoch": 0.8808031639793125, + "grad_norm": 1.1916217803955078, + "learning_rate": 1.2212166534409436e-05, + "loss": 0.4998, + "step": 7238 + }, + { + "epoch": 0.8809248554913295, + "grad_norm": 1.4659065008163452, + "learning_rate": 1.2210264127405287e-05, + "loss": 0.4226, + "step": 7239 + }, + { + "epoch": 0.8810465470033465, + "grad_norm": 1.8044829368591309, + "learning_rate": 1.2208361636296012e-05, + "loss": 0.4885, + "step": 7240 + }, + { + "epoch": 0.8811682385153635, + "grad_norm": 0.8489356637001038, + "learning_rate": 1.2206459061154004e-05, + "loss": 0.4815, + "step": 7241 + }, + { + "epoch": 0.8812899300273805, + "grad_norm": 1.1724481582641602, + "learning_rate": 1.2204556402051659e-05, + "loss": 0.4341, + "step": 7242 + }, + { + "epoch": 0.8814116215393977, + "grad_norm": 0.7249171137809753, + "learning_rate": 1.2202653659061385e-05, + "loss": 0.4524, + "step": 7243 + }, + { + "epoch": 0.8815333130514147, + "grad_norm": 2.6195907592773438, + "learning_rate": 1.2200750832255578e-05, + "loss": 0.3956, + "step": 7244 + }, + { + "epoch": 0.8816550045634317, + "grad_norm": 1.4745408296585083, + "learning_rate": 1.2198847921706646e-05, + "loss": 0.4319, + "step": 7245 + }, + { + "epoch": 0.8817766960754487, + "grad_norm": 2.000275135040283, + "learning_rate": 1.2196944927487e-05, + "loss": 0.4587, + "step": 7246 + }, + { + "epoch": 0.8818983875874657, + "grad_norm": 1.536466121673584, + "learning_rate": 1.2195041849669054e-05, + "loss": 0.385, + "step": 7247 + }, + { + "epoch": 0.8820200790994828, + "grad_norm": 1.8848366737365723, + "learning_rate": 1.2193138688325218e-05, + "loss": 0.4976, + "step": 7248 + }, + { + "epoch": 0.8821417706114999, + "grad_norm": 2.3491923809051514, + "learning_rate": 1.2191235443527919e-05, + "loss": 0.4517, + "step": 7249 + }, + { + "epoch": 0.8822634621235169, + "grad_norm": 0.7377593517303467, + "learning_rate": 1.2189332115349573e-05, + "loss": 0.4509, + "step": 7250 + }, + { + "epoch": 0.8823851536355339, + "grad_norm": 2.4110500812530518, + "learning_rate": 1.2187428703862612e-05, + "loss": 0.5268, + "step": 7251 + }, + { + "epoch": 0.882506845147551, + "grad_norm": 1.6337910890579224, + "learning_rate": 1.2185525209139457e-05, + "loss": 0.4134, + "step": 7252 + }, + { + "epoch": 0.882628536659568, + "grad_norm": 0.6018592119216919, + "learning_rate": 1.2183621631252544e-05, + "loss": 0.4462, + "step": 7253 + }, + { + "epoch": 0.882750228171585, + "grad_norm": 0.7575495839118958, + "learning_rate": 1.2181717970274312e-05, + "loss": 0.4576, + "step": 7254 + }, + { + "epoch": 0.882871919683602, + "grad_norm": 1.188772439956665, + "learning_rate": 1.2179814226277191e-05, + "loss": 0.4451, + "step": 7255 + }, + { + "epoch": 0.8829936111956191, + "grad_norm": 2.6152424812316895, + "learning_rate": 1.217791039933363e-05, + "loss": 0.5052, + "step": 7256 + }, + { + "epoch": 0.8831153027076362, + "grad_norm": 3.8781511783599854, + "learning_rate": 1.2176006489516068e-05, + "loss": 0.3836, + "step": 7257 + }, + { + "epoch": 0.8832369942196532, + "grad_norm": 0.968696117401123, + "learning_rate": 1.2174102496896959e-05, + "loss": 0.5065, + "step": 7258 + }, + { + "epoch": 0.8833586857316702, + "grad_norm": 0.9881165623664856, + "learning_rate": 1.2172198421548745e-05, + "loss": 0.463, + "step": 7259 + }, + { + "epoch": 0.8834803772436872, + "grad_norm": 1.793229341506958, + "learning_rate": 1.2170294263543884e-05, + "loss": 0.4729, + "step": 7260 + }, + { + "epoch": 0.8836020687557042, + "grad_norm": 0.7428906559944153, + "learning_rate": 1.216839002295484e-05, + "loss": 0.4582, + "step": 7261 + }, + { + "epoch": 0.8837237602677214, + "grad_norm": 1.8898204565048218, + "learning_rate": 1.2166485699854064e-05, + "loss": 0.4011, + "step": 7262 + }, + { + "epoch": 0.8838454517797384, + "grad_norm": 0.725135326385498, + "learning_rate": 1.2164581294314022e-05, + "loss": 0.4659, + "step": 7263 + }, + { + "epoch": 0.8839671432917554, + "grad_norm": 1.1981191635131836, + "learning_rate": 1.2162676806407181e-05, + "loss": 0.47, + "step": 7264 + }, + { + "epoch": 0.8840888348037724, + "grad_norm": 1.1754435300827026, + "learning_rate": 1.2160772236206014e-05, + "loss": 0.4856, + "step": 7265 + }, + { + "epoch": 0.8842105263157894, + "grad_norm": 2.904205322265625, + "learning_rate": 1.2158867583782988e-05, + "loss": 0.5361, + "step": 7266 + }, + { + "epoch": 0.8843322178278065, + "grad_norm": 0.8026193380355835, + "learning_rate": 1.2156962849210584e-05, + "loss": 0.4773, + "step": 7267 + }, + { + "epoch": 0.8844539093398236, + "grad_norm": 0.7588707208633423, + "learning_rate": 1.2155058032561278e-05, + "loss": 0.4551, + "step": 7268 + }, + { + "epoch": 0.8845756008518406, + "grad_norm": 1.2164289951324463, + "learning_rate": 1.2153153133907554e-05, + "loss": 0.4417, + "step": 7269 + }, + { + "epoch": 0.8846972923638576, + "grad_norm": 0.9452533721923828, + "learning_rate": 1.2151248153321892e-05, + "loss": 0.5107, + "step": 7270 + }, + { + "epoch": 0.8848189838758747, + "grad_norm": 0.8283289074897766, + "learning_rate": 1.2149343090876788e-05, + "loss": 0.5, + "step": 7271 + }, + { + "epoch": 0.8849406753878917, + "grad_norm": 0.9408752918243408, + "learning_rate": 1.2147437946644731e-05, + "loss": 0.544, + "step": 7272 + }, + { + "epoch": 0.8850623668999087, + "grad_norm": 1.163599967956543, + "learning_rate": 1.2145532720698219e-05, + "loss": 0.5075, + "step": 7273 + }, + { + "epoch": 0.8851840584119258, + "grad_norm": 3.3686656951904297, + "learning_rate": 1.214362741310974e-05, + "loss": 0.4778, + "step": 7274 + }, + { + "epoch": 0.8853057499239428, + "grad_norm": 2.4053075313568115, + "learning_rate": 1.2141722023951801e-05, + "loss": 0.4655, + "step": 7275 + }, + { + "epoch": 0.8854274414359599, + "grad_norm": 5.931481838226318, + "learning_rate": 1.2139816553296908e-05, + "loss": 0.4043, + "step": 7276 + }, + { + "epoch": 0.8855491329479769, + "grad_norm": 3.4912824630737305, + "learning_rate": 1.2137911001217565e-05, + "loss": 0.4501, + "step": 7277 + }, + { + "epoch": 0.8856708244599939, + "grad_norm": 0.6624214053153992, + "learning_rate": 1.2136005367786284e-05, + "loss": 0.4562, + "step": 7278 + }, + { + "epoch": 0.8857925159720109, + "grad_norm": 0.6605298519134521, + "learning_rate": 1.2134099653075579e-05, + "loss": 0.4765, + "step": 7279 + }, + { + "epoch": 0.8859142074840279, + "grad_norm": 2.066791296005249, + "learning_rate": 1.213219385715796e-05, + "loss": 0.3924, + "step": 7280 + }, + { + "epoch": 0.8860358989960451, + "grad_norm": 0.65671706199646, + "learning_rate": 1.2130287980105958e-05, + "loss": 0.4331, + "step": 7281 + }, + { + "epoch": 0.8861575905080621, + "grad_norm": 0.5998497605323792, + "learning_rate": 1.2128382021992084e-05, + "loss": 0.421, + "step": 7282 + }, + { + "epoch": 0.8862792820200791, + "grad_norm": 2.2659995555877686, + "learning_rate": 1.2126475982888868e-05, + "loss": 0.4705, + "step": 7283 + }, + { + "epoch": 0.8864009735320961, + "grad_norm": 0.7178171873092651, + "learning_rate": 1.2124569862868842e-05, + "loss": 0.3816, + "step": 7284 + }, + { + "epoch": 0.8865226650441131, + "grad_norm": 4.186813831329346, + "learning_rate": 1.2122663662004536e-05, + "loss": 0.5712, + "step": 7285 + }, + { + "epoch": 0.8866443565561302, + "grad_norm": 1.2549664974212646, + "learning_rate": 1.2120757380368481e-05, + "loss": 0.417, + "step": 7286 + }, + { + "epoch": 0.8867660480681473, + "grad_norm": 2.6965174674987793, + "learning_rate": 1.2118851018033219e-05, + "loss": 0.4671, + "step": 7287 + }, + { + "epoch": 0.8868877395801643, + "grad_norm": 2.5200355052948, + "learning_rate": 1.2116944575071293e-05, + "loss": 0.3725, + "step": 7288 + }, + { + "epoch": 0.8870094310921813, + "grad_norm": 0.7925547957420349, + "learning_rate": 1.2115038051555239e-05, + "loss": 0.4236, + "step": 7289 + }, + { + "epoch": 0.8871311226041984, + "grad_norm": 2.165656566619873, + "learning_rate": 1.2113131447557614e-05, + "loss": 0.4924, + "step": 7290 + }, + { + "epoch": 0.8872528141162154, + "grad_norm": 0.9340012669563293, + "learning_rate": 1.211122476315096e-05, + "loss": 0.4407, + "step": 7291 + }, + { + "epoch": 0.8873745056282324, + "grad_norm": 1.1947416067123413, + "learning_rate": 1.2109317998407834e-05, + "loss": 0.513, + "step": 7292 + }, + { + "epoch": 0.8874961971402495, + "grad_norm": 2.4484124183654785, + "learning_rate": 1.2107411153400793e-05, + "loss": 0.406, + "step": 7293 + }, + { + "epoch": 0.8876178886522665, + "grad_norm": 0.7745649814605713, + "learning_rate": 1.2105504228202394e-05, + "loss": 0.4893, + "step": 7294 + }, + { + "epoch": 0.8877395801642836, + "grad_norm": 2.3933732509613037, + "learning_rate": 1.2103597222885204e-05, + "loss": 0.4451, + "step": 7295 + }, + { + "epoch": 0.8878612716763006, + "grad_norm": 3.0880136489868164, + "learning_rate": 1.2101690137521785e-05, + "loss": 0.4411, + "step": 7296 + }, + { + "epoch": 0.8879829631883176, + "grad_norm": 2.0262298583984375, + "learning_rate": 1.2099782972184704e-05, + "loss": 0.4405, + "step": 7297 + }, + { + "epoch": 0.8881046547003346, + "grad_norm": 2.956406593322754, + "learning_rate": 1.2097875726946535e-05, + "loss": 0.4667, + "step": 7298 + }, + { + "epoch": 0.8882263462123516, + "grad_norm": 2.091343402862549, + "learning_rate": 1.2095968401879854e-05, + "loss": 0.3916, + "step": 7299 + }, + { + "epoch": 0.8883480377243688, + "grad_norm": 1.1204673051834106, + "learning_rate": 1.2094060997057233e-05, + "loss": 0.439, + "step": 7300 + }, + { + "epoch": 0.8884697292363858, + "grad_norm": 1.1490085124969482, + "learning_rate": 1.2092153512551258e-05, + "loss": 0.4843, + "step": 7301 + }, + { + "epoch": 0.8885914207484028, + "grad_norm": 0.8195436596870422, + "learning_rate": 1.2090245948434514e-05, + "loss": 0.3827, + "step": 7302 + }, + { + "epoch": 0.8887131122604198, + "grad_norm": 1.1624146699905396, + "learning_rate": 1.2088338304779586e-05, + "loss": 0.5068, + "step": 7303 + }, + { + "epoch": 0.8888348037724368, + "grad_norm": 2.6067113876342773, + "learning_rate": 1.2086430581659058e-05, + "loss": 0.4732, + "step": 7304 + }, + { + "epoch": 0.8889564952844539, + "grad_norm": 5.265781402587891, + "learning_rate": 1.2084522779145531e-05, + "loss": 0.5815, + "step": 7305 + }, + { + "epoch": 0.889078186796471, + "grad_norm": 1.8413618803024292, + "learning_rate": 1.2082614897311595e-05, + "loss": 0.4129, + "step": 7306 + }, + { + "epoch": 0.889199878308488, + "grad_norm": 2.391411542892456, + "learning_rate": 1.2080706936229854e-05, + "loss": 0.4605, + "step": 7307 + }, + { + "epoch": 0.889321569820505, + "grad_norm": 0.9443608522415161, + "learning_rate": 1.2078798895972908e-05, + "loss": 0.4439, + "step": 7308 + }, + { + "epoch": 0.889443261332522, + "grad_norm": 2.8269126415252686, + "learning_rate": 1.207689077661336e-05, + "loss": 0.4953, + "step": 7309 + }, + { + "epoch": 0.8895649528445391, + "grad_norm": 0.7635732293128967, + "learning_rate": 1.2074982578223815e-05, + "loss": 0.4872, + "step": 7310 + }, + { + "epoch": 0.8896866443565561, + "grad_norm": 2.8566296100616455, + "learning_rate": 1.2073074300876893e-05, + "loss": 0.45, + "step": 7311 + }, + { + "epoch": 0.8898083358685732, + "grad_norm": 1.6522257328033447, + "learning_rate": 1.20711659446452e-05, + "loss": 0.4648, + "step": 7312 + }, + { + "epoch": 0.8899300273805902, + "grad_norm": 4.044620513916016, + "learning_rate": 1.2069257509601356e-05, + "loss": 0.4064, + "step": 7313 + }, + { + "epoch": 0.8900517188926073, + "grad_norm": 4.702152252197266, + "learning_rate": 1.2067348995817983e-05, + "loss": 0.3677, + "step": 7314 + }, + { + "epoch": 0.8901734104046243, + "grad_norm": 1.3512136936187744, + "learning_rate": 1.20654404033677e-05, + "loss": 0.4851, + "step": 7315 + }, + { + "epoch": 0.8902951019166413, + "grad_norm": 1.169438123703003, + "learning_rate": 1.2063531732323135e-05, + "loss": 0.4954, + "step": 7316 + }, + { + "epoch": 0.8904167934286583, + "grad_norm": 1.094239592552185, + "learning_rate": 1.2061622982756916e-05, + "loss": 0.4881, + "step": 7317 + }, + { + "epoch": 0.8905384849406753, + "grad_norm": 1.7024352550506592, + "learning_rate": 1.2059714154741675e-05, + "loss": 0.4796, + "step": 7318 + }, + { + "epoch": 0.8906601764526925, + "grad_norm": 1.6726089715957642, + "learning_rate": 1.2057805248350047e-05, + "loss": 0.5149, + "step": 7319 + }, + { + "epoch": 0.8907818679647095, + "grad_norm": 1.5365747213363647, + "learning_rate": 1.205589626365467e-05, + "loss": 0.4815, + "step": 7320 + }, + { + "epoch": 0.8909035594767265, + "grad_norm": 1.5698299407958984, + "learning_rate": 1.2053987200728183e-05, + "loss": 0.4367, + "step": 7321 + }, + { + "epoch": 0.8910252509887435, + "grad_norm": 1.539196252822876, + "learning_rate": 1.2052078059643235e-05, + "loss": 0.4727, + "step": 7322 + }, + { + "epoch": 0.8911469425007605, + "grad_norm": 1.279540777206421, + "learning_rate": 1.2050168840472466e-05, + "loss": 0.4997, + "step": 7323 + }, + { + "epoch": 0.8912686340127776, + "grad_norm": 5.007602214813232, + "learning_rate": 1.2048259543288529e-05, + "loss": 0.3912, + "step": 7324 + }, + { + "epoch": 0.8913903255247947, + "grad_norm": 2.6805882453918457, + "learning_rate": 1.204635016816408e-05, + "loss": 0.4709, + "step": 7325 + }, + { + "epoch": 0.8915120170368117, + "grad_norm": 0.5695067048072815, + "learning_rate": 1.204444071517177e-05, + "loss": 0.4758, + "step": 7326 + }, + { + "epoch": 0.8916337085488287, + "grad_norm": 3.1602389812469482, + "learning_rate": 1.2042531184384258e-05, + "loss": 0.4345, + "step": 7327 + }, + { + "epoch": 0.8917554000608457, + "grad_norm": 3.4763336181640625, + "learning_rate": 1.2040621575874208e-05, + "loss": 0.3752, + "step": 7328 + }, + { + "epoch": 0.8918770915728628, + "grad_norm": 0.6043762564659119, + "learning_rate": 1.2038711889714282e-05, + "loss": 0.4491, + "step": 7329 + }, + { + "epoch": 0.8919987830848798, + "grad_norm": 2.324209213256836, + "learning_rate": 1.2036802125977149e-05, + "loss": 0.4579, + "step": 7330 + }, + { + "epoch": 0.8921204745968969, + "grad_norm": 1.1331285238265991, + "learning_rate": 1.203489228473548e-05, + "loss": 0.4105, + "step": 7331 + }, + { + "epoch": 0.8922421661089139, + "grad_norm": 1.273290991783142, + "learning_rate": 1.2032982366061945e-05, + "loss": 0.3622, + "step": 7332 + }, + { + "epoch": 0.892363857620931, + "grad_norm": 3.2276227474212646, + "learning_rate": 1.2031072370029227e-05, + "loss": 0.4421, + "step": 7333 + }, + { + "epoch": 0.892485549132948, + "grad_norm": 4.9920148849487305, + "learning_rate": 1.2029162296709998e-05, + "loss": 0.5286, + "step": 7334 + }, + { + "epoch": 0.892607240644965, + "grad_norm": 4.205452919006348, + "learning_rate": 1.2027252146176943e-05, + "loss": 0.5436, + "step": 7335 + }, + { + "epoch": 0.892728932156982, + "grad_norm": 1.5160728693008423, + "learning_rate": 1.2025341918502748e-05, + "loss": 0.3205, + "step": 7336 + }, + { + "epoch": 0.892850623668999, + "grad_norm": 3.0665640830993652, + "learning_rate": 1.2023431613760106e-05, + "loss": 0.4703, + "step": 7337 + }, + { + "epoch": 0.8929723151810162, + "grad_norm": 0.7929785847663879, + "learning_rate": 1.2021521232021698e-05, + "loss": 0.4062, + "step": 7338 + }, + { + "epoch": 0.8930940066930332, + "grad_norm": 1.8136169910430908, + "learning_rate": 1.2019610773360222e-05, + "loss": 0.4626, + "step": 7339 + }, + { + "epoch": 0.8932156982050502, + "grad_norm": 0.8245934247970581, + "learning_rate": 1.2017700237848375e-05, + "loss": 0.4614, + "step": 7340 + }, + { + "epoch": 0.8933373897170672, + "grad_norm": 1.2006621360778809, + "learning_rate": 1.2015789625558862e-05, + "loss": 0.4077, + "step": 7341 + }, + { + "epoch": 0.8934590812290842, + "grad_norm": 0.699004054069519, + "learning_rate": 1.201387893656438e-05, + "loss": 0.4188, + "step": 7342 + }, + { + "epoch": 0.8935807727411013, + "grad_norm": 1.2466466426849365, + "learning_rate": 1.2011968170937634e-05, + "loss": 0.4306, + "step": 7343 + }, + { + "epoch": 0.8937024642531184, + "grad_norm": 1.9796028137207031, + "learning_rate": 1.2010057328751335e-05, + "loss": 0.4271, + "step": 7344 + }, + { + "epoch": 0.8938241557651354, + "grad_norm": 2.442113161087036, + "learning_rate": 1.2008146410078195e-05, + "loss": 0.3871, + "step": 7345 + }, + { + "epoch": 0.8939458472771524, + "grad_norm": 0.6641497611999512, + "learning_rate": 1.2006235414990925e-05, + "loss": 0.4854, + "step": 7346 + }, + { + "epoch": 0.8940675387891694, + "grad_norm": 1.1614047288894653, + "learning_rate": 1.2004324343562246e-05, + "loss": 0.4383, + "step": 7347 + }, + { + "epoch": 0.8941892303011865, + "grad_norm": 1.7455962896347046, + "learning_rate": 1.200241319586488e-05, + "loss": 0.4646, + "step": 7348 + }, + { + "epoch": 0.8943109218132035, + "grad_norm": 0.7281454205513, + "learning_rate": 1.2000501971971546e-05, + "loss": 0.4736, + "step": 7349 + }, + { + "epoch": 0.8944326133252206, + "grad_norm": 3.365340232849121, + "learning_rate": 1.1998590671954969e-05, + "loss": 0.4921, + "step": 7350 + }, + { + "epoch": 0.8945543048372376, + "grad_norm": 0.8367118835449219, + "learning_rate": 1.1996679295887881e-05, + "loss": 0.411, + "step": 7351 + }, + { + "epoch": 0.8946759963492547, + "grad_norm": 1.1888952255249023, + "learning_rate": 1.1994767843843013e-05, + "loss": 0.4386, + "step": 7352 + }, + { + "epoch": 0.8947976878612717, + "grad_norm": 1.3032678365707397, + "learning_rate": 1.19928563158931e-05, + "loss": 0.4851, + "step": 7353 + }, + { + "epoch": 0.8949193793732887, + "grad_norm": 2.7483127117156982, + "learning_rate": 1.199094471211088e-05, + "loss": 0.3948, + "step": 7354 + }, + { + "epoch": 0.8950410708853057, + "grad_norm": 0.7429040670394897, + "learning_rate": 1.1989033032569091e-05, + "loss": 0.4517, + "step": 7355 + }, + { + "epoch": 0.8951627623973228, + "grad_norm": 0.5595561265945435, + "learning_rate": 1.198712127734048e-05, + "loss": 0.4758, + "step": 7356 + }, + { + "epoch": 0.8952844539093399, + "grad_norm": 1.5698339939117432, + "learning_rate": 1.1985209446497788e-05, + "loss": 0.5223, + "step": 7357 + }, + { + "epoch": 0.8954061454213569, + "grad_norm": 3.6477086544036865, + "learning_rate": 1.1983297540113767e-05, + "loss": 0.3842, + "step": 7358 + }, + { + "epoch": 0.8955278369333739, + "grad_norm": 1.0380009412765503, + "learning_rate": 1.1981385558261173e-05, + "loss": 0.4511, + "step": 7359 + }, + { + "epoch": 0.8956495284453909, + "grad_norm": 1.0342826843261719, + "learning_rate": 1.1979473501012757e-05, + "loss": 0.4634, + "step": 7360 + }, + { + "epoch": 0.8957712199574079, + "grad_norm": 1.539221167564392, + "learning_rate": 1.1977561368441275e-05, + "loss": 0.481, + "step": 7361 + }, + { + "epoch": 0.895892911469425, + "grad_norm": 0.6270670890808105, + "learning_rate": 1.1975649160619488e-05, + "loss": 0.4722, + "step": 7362 + }, + { + "epoch": 0.8960146029814421, + "grad_norm": 0.6505113840103149, + "learning_rate": 1.1973736877620166e-05, + "loss": 0.464, + "step": 7363 + }, + { + "epoch": 0.8961362944934591, + "grad_norm": 2.1215133666992188, + "learning_rate": 1.1971824519516066e-05, + "loss": 0.4914, + "step": 7364 + }, + { + "epoch": 0.8962579860054761, + "grad_norm": 0.8316892981529236, + "learning_rate": 1.1969912086379968e-05, + "loss": 0.4485, + "step": 7365 + }, + { + "epoch": 0.8963796775174931, + "grad_norm": 0.6590808033943176, + "learning_rate": 1.196799957828463e-05, + "loss": 0.4684, + "step": 7366 + }, + { + "epoch": 0.8965013690295102, + "grad_norm": 1.3061925172805786, + "learning_rate": 1.196608699530284e-05, + "loss": 0.4935, + "step": 7367 + }, + { + "epoch": 0.8966230605415272, + "grad_norm": 2.2511823177337646, + "learning_rate": 1.1964174337507366e-05, + "loss": 0.4563, + "step": 7368 + }, + { + "epoch": 0.8967447520535443, + "grad_norm": 1.2181216478347778, + "learning_rate": 1.1962261604970994e-05, + "loss": 0.4609, + "step": 7369 + }, + { + "epoch": 0.8968664435655613, + "grad_norm": 0.5540208220481873, + "learning_rate": 1.1960348797766505e-05, + "loss": 0.4736, + "step": 7370 + }, + { + "epoch": 0.8969881350775784, + "grad_norm": 0.9309449195861816, + "learning_rate": 1.1958435915966692e-05, + "loss": 0.4526, + "step": 7371 + }, + { + "epoch": 0.8971098265895954, + "grad_norm": 1.24196195602417, + "learning_rate": 1.1956522959644334e-05, + "loss": 0.4407, + "step": 7372 + }, + { + "epoch": 0.8972315181016124, + "grad_norm": 1.3524045944213867, + "learning_rate": 1.1954609928872229e-05, + "loss": 0.4742, + "step": 7373 + }, + { + "epoch": 0.8973532096136294, + "grad_norm": 1.753662347793579, + "learning_rate": 1.1952696823723169e-05, + "loss": 0.4339, + "step": 7374 + }, + { + "epoch": 0.8974749011256465, + "grad_norm": 2.8566055297851562, + "learning_rate": 1.1950783644269956e-05, + "loss": 0.4882, + "step": 7375 + }, + { + "epoch": 0.8975965926376636, + "grad_norm": 2.9239728450775146, + "learning_rate": 1.1948870390585386e-05, + "loss": 0.5488, + "step": 7376 + }, + { + "epoch": 0.8977182841496806, + "grad_norm": 1.1236060857772827, + "learning_rate": 1.1946957062742263e-05, + "loss": 0.4221, + "step": 7377 + }, + { + "epoch": 0.8978399756616976, + "grad_norm": 0.7916250824928284, + "learning_rate": 1.1945043660813393e-05, + "loss": 0.4487, + "step": 7378 + }, + { + "epoch": 0.8979616671737146, + "grad_norm": 0.6620454788208008, + "learning_rate": 1.1943130184871588e-05, + "loss": 0.4188, + "step": 7379 + }, + { + "epoch": 0.8980833586857316, + "grad_norm": 2.3451781272888184, + "learning_rate": 1.1941216634989656e-05, + "loss": 0.4309, + "step": 7380 + }, + { + "epoch": 0.8982050501977487, + "grad_norm": 0.7819168567657471, + "learning_rate": 1.1939303011240413e-05, + "loss": 0.4162, + "step": 7381 + }, + { + "epoch": 0.8983267417097658, + "grad_norm": 1.7895784378051758, + "learning_rate": 1.1937389313696677e-05, + "loss": 0.5011, + "step": 7382 + }, + { + "epoch": 0.8984484332217828, + "grad_norm": 2.3792643547058105, + "learning_rate": 1.1935475542431268e-05, + "loss": 0.5144, + "step": 7383 + }, + { + "epoch": 0.8985701247337998, + "grad_norm": 2.1358370780944824, + "learning_rate": 1.1933561697517006e-05, + "loss": 0.4459, + "step": 7384 + }, + { + "epoch": 0.8986918162458168, + "grad_norm": 0.6661283373832703, + "learning_rate": 1.193164777902672e-05, + "loss": 0.4378, + "step": 7385 + }, + { + "epoch": 0.8988135077578339, + "grad_norm": 0.6997846364974976, + "learning_rate": 1.192973378703324e-05, + "loss": 0.4555, + "step": 7386 + }, + { + "epoch": 0.8989351992698509, + "grad_norm": 4.277323246002197, + "learning_rate": 1.192781972160939e-05, + "loss": 0.5582, + "step": 7387 + }, + { + "epoch": 0.899056890781868, + "grad_norm": 1.2798309326171875, + "learning_rate": 1.1925905582828015e-05, + "loss": 0.4706, + "step": 7388 + }, + { + "epoch": 0.899178582293885, + "grad_norm": 2.09159517288208, + "learning_rate": 1.1923991370761942e-05, + "loss": 0.4634, + "step": 7389 + }, + { + "epoch": 0.899300273805902, + "grad_norm": 1.8460427522659302, + "learning_rate": 1.1922077085484018e-05, + "loss": 0.5384, + "step": 7390 + }, + { + "epoch": 0.8994219653179191, + "grad_norm": 3.0884695053100586, + "learning_rate": 1.1920162727067082e-05, + "loss": 0.4529, + "step": 7391 + }, + { + "epoch": 0.8995436568299361, + "grad_norm": 1.5712168216705322, + "learning_rate": 1.1918248295583977e-05, + "loss": 0.5111, + "step": 7392 + }, + { + "epoch": 0.8996653483419531, + "grad_norm": 2.340404510498047, + "learning_rate": 1.1916333791107558e-05, + "loss": 0.4486, + "step": 7393 + }, + { + "epoch": 0.8997870398539702, + "grad_norm": 3.346893787384033, + "learning_rate": 1.1914419213710669e-05, + "loss": 0.4525, + "step": 7394 + }, + { + "epoch": 0.8999087313659873, + "grad_norm": 2.9239792823791504, + "learning_rate": 1.1912504563466165e-05, + "loss": 0.4413, + "step": 7395 + }, + { + "epoch": 0.9000304228780043, + "grad_norm": 1.453131914138794, + "learning_rate": 1.1910589840446904e-05, + "loss": 0.4343, + "step": 7396 + }, + { + "epoch": 0.9001521143900213, + "grad_norm": 0.6888946890830994, + "learning_rate": 1.1908675044725749e-05, + "loss": 0.4343, + "step": 7397 + }, + { + "epoch": 0.9002738059020383, + "grad_norm": 1.2008459568023682, + "learning_rate": 1.1906760176375551e-05, + "loss": 0.4219, + "step": 7398 + }, + { + "epoch": 0.9003954974140553, + "grad_norm": 1.9007642269134521, + "learning_rate": 1.1904845235469184e-05, + "loss": 0.451, + "step": 7399 + }, + { + "epoch": 0.9005171889260724, + "grad_norm": 3.8408656120300293, + "learning_rate": 1.1902930222079516e-05, + "loss": 0.5274, + "step": 7400 + }, + { + "epoch": 0.9006388804380895, + "grad_norm": 1.6050820350646973, + "learning_rate": 1.1901015136279415e-05, + "loss": 0.4759, + "step": 7401 + }, + { + "epoch": 0.9007605719501065, + "grad_norm": 1.1178021430969238, + "learning_rate": 1.1899099978141748e-05, + "loss": 0.4682, + "step": 7402 + }, + { + "epoch": 0.9008822634621235, + "grad_norm": 0.864290714263916, + "learning_rate": 1.1897184747739398e-05, + "loss": 0.444, + "step": 7403 + }, + { + "epoch": 0.9010039549741405, + "grad_norm": 1.1707119941711426, + "learning_rate": 1.1895269445145241e-05, + "loss": 0.4448, + "step": 7404 + }, + { + "epoch": 0.9011256464861576, + "grad_norm": 2.937156915664673, + "learning_rate": 1.1893354070432161e-05, + "loss": 0.5074, + "step": 7405 + }, + { + "epoch": 0.9012473379981746, + "grad_norm": 0.6468366384506226, + "learning_rate": 1.1891438623673039e-05, + "loss": 0.4692, + "step": 7406 + }, + { + "epoch": 0.9013690295101917, + "grad_norm": 0.8036524057388306, + "learning_rate": 1.1889523104940762e-05, + "loss": 0.4641, + "step": 7407 + }, + { + "epoch": 0.9014907210222087, + "grad_norm": 2.6297693252563477, + "learning_rate": 1.1887607514308218e-05, + "loss": 0.4292, + "step": 7408 + }, + { + "epoch": 0.9016124125342257, + "grad_norm": 4.296541213989258, + "learning_rate": 1.1885691851848302e-05, + "loss": 0.4809, + "step": 7409 + }, + { + "epoch": 0.9017341040462428, + "grad_norm": 3.5419514179229736, + "learning_rate": 1.1883776117633907e-05, + "loss": 0.4349, + "step": 7410 + }, + { + "epoch": 0.9018557955582598, + "grad_norm": 2.7667031288146973, + "learning_rate": 1.1881860311737934e-05, + "loss": 0.4575, + "step": 7411 + }, + { + "epoch": 0.9019774870702768, + "grad_norm": 3.723479986190796, + "learning_rate": 1.187994443423328e-05, + "loss": 0.4109, + "step": 7412 + }, + { + "epoch": 0.9020991785822939, + "grad_norm": 2.6736536026000977, + "learning_rate": 1.187802848519285e-05, + "loss": 0.4412, + "step": 7413 + }, + { + "epoch": 0.902220870094311, + "grad_norm": 0.7212310433387756, + "learning_rate": 1.1876112464689547e-05, + "loss": 0.4576, + "step": 7414 + }, + { + "epoch": 0.902342561606328, + "grad_norm": 1.5392414331436157, + "learning_rate": 1.1874196372796278e-05, + "loss": 0.4342, + "step": 7415 + }, + { + "epoch": 0.902464253118345, + "grad_norm": 1.6171127557754517, + "learning_rate": 1.1872280209585965e-05, + "loss": 0.4437, + "step": 7416 + }, + { + "epoch": 0.902585944630362, + "grad_norm": 5.273656845092773, + "learning_rate": 1.1870363975131511e-05, + "loss": 0.5091, + "step": 7417 + }, + { + "epoch": 0.902707636142379, + "grad_norm": 2.4460678100585938, + "learning_rate": 1.1868447669505836e-05, + "loss": 0.5266, + "step": 7418 + }, + { + "epoch": 0.902829327654396, + "grad_norm": 2.832026958465576, + "learning_rate": 1.1866531292781858e-05, + "loss": 0.4937, + "step": 7419 + }, + { + "epoch": 0.9029510191664132, + "grad_norm": 2.171494245529175, + "learning_rate": 1.1864614845032504e-05, + "loss": 0.4102, + "step": 7420 + }, + { + "epoch": 0.9030727106784302, + "grad_norm": 1.6888043880462646, + "learning_rate": 1.1862698326330694e-05, + "loss": 0.449, + "step": 7421 + }, + { + "epoch": 0.9031944021904472, + "grad_norm": 0.7823889255523682, + "learning_rate": 1.1860781736749355e-05, + "loss": 0.4129, + "step": 7422 + }, + { + "epoch": 0.9033160937024642, + "grad_norm": 0.9548131227493286, + "learning_rate": 1.1858865076361423e-05, + "loss": 0.4522, + "step": 7423 + }, + { + "epoch": 0.9034377852144813, + "grad_norm": 0.9475173354148865, + "learning_rate": 1.1856948345239827e-05, + "loss": 0.4368, + "step": 7424 + }, + { + "epoch": 0.9035594767264983, + "grad_norm": 1.3620988130569458, + "learning_rate": 1.1855031543457502e-05, + "loss": 0.4828, + "step": 7425 + }, + { + "epoch": 0.9036811682385154, + "grad_norm": 0.6987718343734741, + "learning_rate": 1.1853114671087387e-05, + "loss": 0.4597, + "step": 7426 + }, + { + "epoch": 0.9038028597505324, + "grad_norm": 3.693128824234009, + "learning_rate": 1.1851197728202423e-05, + "loss": 0.4131, + "step": 7427 + }, + { + "epoch": 0.9039245512625494, + "grad_norm": 2.660182237625122, + "learning_rate": 1.1849280714875552e-05, + "loss": 0.4346, + "step": 7428 + }, + { + "epoch": 0.9040462427745665, + "grad_norm": 3.2342629432678223, + "learning_rate": 1.1847363631179724e-05, + "loss": 0.402, + "step": 7429 + }, + { + "epoch": 0.9041679342865835, + "grad_norm": 1.0060064792633057, + "learning_rate": 1.1845446477187886e-05, + "loss": 0.4664, + "step": 7430 + }, + { + "epoch": 0.9042896257986005, + "grad_norm": 0.7507879734039307, + "learning_rate": 1.184352925297299e-05, + "loss": 0.432, + "step": 7431 + }, + { + "epoch": 0.9044113173106176, + "grad_norm": 2.0956578254699707, + "learning_rate": 1.1841611958607988e-05, + "loss": 0.5218, + "step": 7432 + }, + { + "epoch": 0.9045330088226347, + "grad_norm": 3.350637674331665, + "learning_rate": 1.183969459416584e-05, + "loss": 0.5159, + "step": 7433 + }, + { + "epoch": 0.9046547003346517, + "grad_norm": 2.215989351272583, + "learning_rate": 1.1837777159719506e-05, + "loss": 0.4057, + "step": 7434 + }, + { + "epoch": 0.9047763918466687, + "grad_norm": 1.168872594833374, + "learning_rate": 1.1835859655341948e-05, + "loss": 0.3942, + "step": 7435 + }, + { + "epoch": 0.9048980833586857, + "grad_norm": 0.8087822794914246, + "learning_rate": 1.1833942081106127e-05, + "loss": 0.4598, + "step": 7436 + }, + { + "epoch": 0.9050197748707027, + "grad_norm": 0.8854578137397766, + "learning_rate": 1.1832024437085015e-05, + "loss": 0.4295, + "step": 7437 + }, + { + "epoch": 0.9051414663827198, + "grad_norm": 1.3707044124603271, + "learning_rate": 1.1830106723351578e-05, + "loss": 0.433, + "step": 7438 + }, + { + "epoch": 0.9052631578947369, + "grad_norm": 2.760690212249756, + "learning_rate": 1.1828188939978798e-05, + "loss": 0.5251, + "step": 7439 + }, + { + "epoch": 0.9053848494067539, + "grad_norm": 0.8044907450675964, + "learning_rate": 1.1826271087039643e-05, + "loss": 0.3961, + "step": 7440 + }, + { + "epoch": 0.9055065409187709, + "grad_norm": 2.011630058288574, + "learning_rate": 1.182435316460709e-05, + "loss": 0.504, + "step": 7441 + }, + { + "epoch": 0.9056282324307879, + "grad_norm": 1.7443996667861938, + "learning_rate": 1.1822435172754123e-05, + "loss": 0.4619, + "step": 7442 + }, + { + "epoch": 0.905749923942805, + "grad_norm": 1.1376187801361084, + "learning_rate": 1.1820517111553729e-05, + "loss": 0.4551, + "step": 7443 + }, + { + "epoch": 0.905871615454822, + "grad_norm": 0.6837798953056335, + "learning_rate": 1.1818598981078888e-05, + "loss": 0.4405, + "step": 7444 + }, + { + "epoch": 0.9059933069668391, + "grad_norm": 0.5845446586608887, + "learning_rate": 1.1816680781402592e-05, + "loss": 0.4486, + "step": 7445 + }, + { + "epoch": 0.9061149984788561, + "grad_norm": 3.576133966445923, + "learning_rate": 1.1814762512597833e-05, + "loss": 0.5182, + "step": 7446 + }, + { + "epoch": 0.9062366899908731, + "grad_norm": 2.3765969276428223, + "learning_rate": 1.1812844174737603e-05, + "loss": 0.5145, + "step": 7447 + }, + { + "epoch": 0.9063583815028902, + "grad_norm": 1.5969698429107666, + "learning_rate": 1.18109257678949e-05, + "loss": 0.4703, + "step": 7448 + }, + { + "epoch": 0.9064800730149072, + "grad_norm": 0.770078182220459, + "learning_rate": 1.1809007292142723e-05, + "loss": 0.533, + "step": 7449 + }, + { + "epoch": 0.9066017645269242, + "grad_norm": 5.47883415222168, + "learning_rate": 1.1807088747554074e-05, + "loss": 0.4112, + "step": 7450 + }, + { + "epoch": 0.9067234560389413, + "grad_norm": 0.868976891040802, + "learning_rate": 1.1805170134201957e-05, + "loss": 0.4785, + "step": 7451 + }, + { + "epoch": 0.9068451475509584, + "grad_norm": 4.280722618103027, + "learning_rate": 1.1803251452159383e-05, + "loss": 0.4133, + "step": 7452 + }, + { + "epoch": 0.9069668390629754, + "grad_norm": 2.8576579093933105, + "learning_rate": 1.1801332701499355e-05, + "loss": 0.4578, + "step": 7453 + }, + { + "epoch": 0.9070885305749924, + "grad_norm": 1.9192330837249756, + "learning_rate": 1.1799413882294893e-05, + "loss": 0.4472, + "step": 7454 + }, + { + "epoch": 0.9072102220870094, + "grad_norm": 1.2525372505187988, + "learning_rate": 1.1797494994619003e-05, + "loss": 0.4659, + "step": 7455 + }, + { + "epoch": 0.9073319135990264, + "grad_norm": 0.9913723468780518, + "learning_rate": 1.1795576038544709e-05, + "loss": 0.4375, + "step": 7456 + }, + { + "epoch": 0.9074536051110436, + "grad_norm": 1.3874493837356567, + "learning_rate": 1.1793657014145034e-05, + "loss": 0.4531, + "step": 7457 + }, + { + "epoch": 0.9075752966230606, + "grad_norm": 2.199550151824951, + "learning_rate": 1.1791737921492997e-05, + "loss": 0.5044, + "step": 7458 + }, + { + "epoch": 0.9076969881350776, + "grad_norm": 1.8949652910232544, + "learning_rate": 1.178981876066162e-05, + "loss": 0.5028, + "step": 7459 + }, + { + "epoch": 0.9078186796470946, + "grad_norm": 1.4145840406417847, + "learning_rate": 1.1787899531723935e-05, + "loss": 0.3584, + "step": 7460 + }, + { + "epoch": 0.9079403711591116, + "grad_norm": 0.9962152242660522, + "learning_rate": 1.1785980234752975e-05, + "loss": 0.3894, + "step": 7461 + }, + { + "epoch": 0.9080620626711287, + "grad_norm": 4.3158721923828125, + "learning_rate": 1.1784060869821766e-05, + "loss": 0.5194, + "step": 7462 + }, + { + "epoch": 0.9081837541831457, + "grad_norm": 2.5999600887298584, + "learning_rate": 1.178214143700335e-05, + "loss": 0.5032, + "step": 7463 + }, + { + "epoch": 0.9083054456951628, + "grad_norm": 2.517180919647217, + "learning_rate": 1.1780221936370761e-05, + "loss": 0.4338, + "step": 7464 + }, + { + "epoch": 0.9084271372071798, + "grad_norm": 2.1384212970733643, + "learning_rate": 1.1778302367997048e-05, + "loss": 0.4647, + "step": 7465 + }, + { + "epoch": 0.9085488287191968, + "grad_norm": 2.1684670448303223, + "learning_rate": 1.1776382731955245e-05, + "loss": 0.4728, + "step": 7466 + }, + { + "epoch": 0.9086705202312139, + "grad_norm": 3.1083970069885254, + "learning_rate": 1.1774463028318401e-05, + "loss": 0.3706, + "step": 7467 + }, + { + "epoch": 0.9087922117432309, + "grad_norm": 1.0006368160247803, + "learning_rate": 1.1772543257159568e-05, + "loss": 0.4787, + "step": 7468 + }, + { + "epoch": 0.9089139032552479, + "grad_norm": 0.7188969850540161, + "learning_rate": 1.1770623418551798e-05, + "loss": 0.4405, + "step": 7469 + }, + { + "epoch": 0.909035594767265, + "grad_norm": 3.0520830154418945, + "learning_rate": 1.1768703512568139e-05, + "loss": 0.4465, + "step": 7470 + }, + { + "epoch": 0.909157286279282, + "grad_norm": 2.5445644855499268, + "learning_rate": 1.176678353928165e-05, + "loss": 0.464, + "step": 7471 + }, + { + "epoch": 0.9092789777912991, + "grad_norm": 1.9943443536758423, + "learning_rate": 1.1764863498765388e-05, + "loss": 0.4769, + "step": 7472 + }, + { + "epoch": 0.9094006693033161, + "grad_norm": 1.370458960533142, + "learning_rate": 1.1762943391092421e-05, + "loss": 0.4377, + "step": 7473 + }, + { + "epoch": 0.9095223608153331, + "grad_norm": 1.6611050367355347, + "learning_rate": 1.1761023216335807e-05, + "loss": 0.4448, + "step": 7474 + }, + { + "epoch": 0.9096440523273501, + "grad_norm": 3.1339404582977295, + "learning_rate": 1.1759102974568616e-05, + "loss": 0.497, + "step": 7475 + }, + { + "epoch": 0.9097657438393673, + "grad_norm": 1.0105726718902588, + "learning_rate": 1.1757182665863913e-05, + "loss": 0.445, + "step": 7476 + }, + { + "epoch": 0.9098874353513843, + "grad_norm": 0.903846025466919, + "learning_rate": 1.1755262290294776e-05, + "loss": 0.456, + "step": 7477 + }, + { + "epoch": 0.9100091268634013, + "grad_norm": 1.3151462078094482, + "learning_rate": 1.1753341847934272e-05, + "loss": 0.3545, + "step": 7478 + }, + { + "epoch": 0.9101308183754183, + "grad_norm": 3.2192325592041016, + "learning_rate": 1.1751421338855483e-05, + "loss": 0.5171, + "step": 7479 + }, + { + "epoch": 0.9102525098874353, + "grad_norm": 0.8245891332626343, + "learning_rate": 1.1749500763131488e-05, + "loss": 0.4194, + "step": 7480 + }, + { + "epoch": 0.9103742013994524, + "grad_norm": 4.182288646697998, + "learning_rate": 1.1747580120835367e-05, + "loss": 0.4919, + "step": 7481 + }, + { + "epoch": 0.9104958929114694, + "grad_norm": 1.492456316947937, + "learning_rate": 1.1745659412040202e-05, + "loss": 0.3937, + "step": 7482 + }, + { + "epoch": 0.9106175844234865, + "grad_norm": 4.322916507720947, + "learning_rate": 1.1743738636819087e-05, + "loss": 0.5162, + "step": 7483 + }, + { + "epoch": 0.9107392759355035, + "grad_norm": 5.514195442199707, + "learning_rate": 1.1741817795245107e-05, + "loss": 0.5864, + "step": 7484 + }, + { + "epoch": 0.9108609674475205, + "grad_norm": 0.722213089466095, + "learning_rate": 1.1739896887391353e-05, + "loss": 0.4183, + "step": 7485 + }, + { + "epoch": 0.9109826589595376, + "grad_norm": 1.453924298286438, + "learning_rate": 1.173797591333092e-05, + "loss": 0.4374, + "step": 7486 + }, + { + "epoch": 0.9111043504715546, + "grad_norm": 1.0098438262939453, + "learning_rate": 1.1736054873136909e-05, + "loss": 0.444, + "step": 7487 + }, + { + "epoch": 0.9112260419835716, + "grad_norm": 0.6750814318656921, + "learning_rate": 1.1734133766882416e-05, + "loss": 0.42, + "step": 7488 + }, + { + "epoch": 0.9113477334955887, + "grad_norm": 0.855018138885498, + "learning_rate": 1.1732212594640541e-05, + "loss": 0.4508, + "step": 7489 + }, + { + "epoch": 0.9114694250076057, + "grad_norm": 0.7046476602554321, + "learning_rate": 1.1730291356484394e-05, + "loss": 0.4117, + "step": 7490 + }, + { + "epoch": 0.9115911165196228, + "grad_norm": 2.677027702331543, + "learning_rate": 1.1728370052487082e-05, + "loss": 0.4285, + "step": 7491 + }, + { + "epoch": 0.9117128080316398, + "grad_norm": 1.3213434219360352, + "learning_rate": 1.172644868272171e-05, + "loss": 0.4502, + "step": 7492 + }, + { + "epoch": 0.9118344995436568, + "grad_norm": 0.5881977677345276, + "learning_rate": 1.1724527247261389e-05, + "loss": 0.4433, + "step": 7493 + }, + { + "epoch": 0.9119561910556738, + "grad_norm": 1.3601247072219849, + "learning_rate": 1.1722605746179237e-05, + "loss": 0.4671, + "step": 7494 + }, + { + "epoch": 0.912077882567691, + "grad_norm": 1.056037187576294, + "learning_rate": 1.1720684179548373e-05, + "loss": 0.4797, + "step": 7495 + }, + { + "epoch": 0.912199574079708, + "grad_norm": 1.241981863975525, + "learning_rate": 1.1718762547441913e-05, + "loss": 0.4758, + "step": 7496 + }, + { + "epoch": 0.912321265591725, + "grad_norm": 1.5421791076660156, + "learning_rate": 1.171684084993298e-05, + "loss": 0.4242, + "step": 7497 + }, + { + "epoch": 0.912442957103742, + "grad_norm": 3.0606703758239746, + "learning_rate": 1.1714919087094703e-05, + "loss": 0.369, + "step": 7498 + }, + { + "epoch": 0.912564648615759, + "grad_norm": 0.6550858616828918, + "learning_rate": 1.1712997259000203e-05, + "loss": 0.4452, + "step": 7499 + }, + { + "epoch": 0.912686340127776, + "grad_norm": 2.153909683227539, + "learning_rate": 1.1711075365722608e-05, + "loss": 0.4649, + "step": 7500 + }, + { + "epoch": 0.9128080316397931, + "grad_norm": 2.1597583293914795, + "learning_rate": 1.1709153407335057e-05, + "loss": 0.3629, + "step": 7501 + }, + { + "epoch": 0.9129297231518102, + "grad_norm": 1.0668424367904663, + "learning_rate": 1.170723138391068e-05, + "loss": 0.4301, + "step": 7502 + }, + { + "epoch": 0.9130514146638272, + "grad_norm": 1.500740647315979, + "learning_rate": 1.1705309295522615e-05, + "loss": 0.4778, + "step": 7503 + }, + { + "epoch": 0.9131731061758442, + "grad_norm": 2.7584002017974854, + "learning_rate": 1.1703387142244005e-05, + "loss": 0.4581, + "step": 7504 + }, + { + "epoch": 0.9132947976878613, + "grad_norm": 1.1044843196868896, + "learning_rate": 1.1701464924147983e-05, + "loss": 0.4057, + "step": 7505 + }, + { + "epoch": 0.9134164891998783, + "grad_norm": 2.9148340225219727, + "learning_rate": 1.1699542641307701e-05, + "loss": 0.4766, + "step": 7506 + }, + { + "epoch": 0.9135381807118953, + "grad_norm": 2.5233004093170166, + "learning_rate": 1.1697620293796306e-05, + "loss": 0.4927, + "step": 7507 + }, + { + "epoch": 0.9136598722239124, + "grad_norm": 3.836001396179199, + "learning_rate": 1.1695697881686941e-05, + "loss": 0.5249, + "step": 7508 + }, + { + "epoch": 0.9137815637359294, + "grad_norm": 0.6965628266334534, + "learning_rate": 1.1693775405052764e-05, + "loss": 0.4303, + "step": 7509 + }, + { + "epoch": 0.9139032552479465, + "grad_norm": 1.6198863983154297, + "learning_rate": 1.1691852863966926e-05, + "loss": 0.4665, + "step": 7510 + }, + { + "epoch": 0.9140249467599635, + "grad_norm": 2.0401527881622314, + "learning_rate": 1.1689930258502587e-05, + "loss": 0.415, + "step": 7511 + }, + { + "epoch": 0.9141466382719805, + "grad_norm": 0.9270548820495605, + "learning_rate": 1.16880075887329e-05, + "loss": 0.4653, + "step": 7512 + }, + { + "epoch": 0.9142683297839975, + "grad_norm": 3.1184487342834473, + "learning_rate": 1.168608485473103e-05, + "loss": 0.3977, + "step": 7513 + }, + { + "epoch": 0.9143900212960147, + "grad_norm": 1.2735332250595093, + "learning_rate": 1.1684162056570146e-05, + "loss": 0.4227, + "step": 7514 + }, + { + "epoch": 0.9145117128080317, + "grad_norm": 5.183312892913818, + "learning_rate": 1.1682239194323408e-05, + "loss": 0.3778, + "step": 7515 + }, + { + "epoch": 0.9146334043200487, + "grad_norm": 0.7662824988365173, + "learning_rate": 1.1680316268063984e-05, + "loss": 0.4914, + "step": 7516 + }, + { + "epoch": 0.9147550958320657, + "grad_norm": 2.325381278991699, + "learning_rate": 1.167839327786505e-05, + "loss": 0.4475, + "step": 7517 + }, + { + "epoch": 0.9148767873440827, + "grad_norm": 0.6889393925666809, + "learning_rate": 1.1676470223799776e-05, + "loss": 0.4448, + "step": 7518 + }, + { + "epoch": 0.9149984788560998, + "grad_norm": 0.9693259596824646, + "learning_rate": 1.1674547105941341e-05, + "loss": 0.4316, + "step": 7519 + }, + { + "epoch": 0.9151201703681168, + "grad_norm": 0.8678448796272278, + "learning_rate": 1.1672623924362922e-05, + "loss": 0.4413, + "step": 7520 + }, + { + "epoch": 0.9152418618801339, + "grad_norm": 1.315632939338684, + "learning_rate": 1.1670700679137703e-05, + "loss": 0.4647, + "step": 7521 + }, + { + "epoch": 0.9153635533921509, + "grad_norm": 2.9123716354370117, + "learning_rate": 1.1668777370338864e-05, + "loss": 0.484, + "step": 7522 + }, + { + "epoch": 0.9154852449041679, + "grad_norm": 1.3041127920150757, + "learning_rate": 1.166685399803959e-05, + "loss": 0.4509, + "step": 7523 + }, + { + "epoch": 0.915606936416185, + "grad_norm": 1.2023916244506836, + "learning_rate": 1.166493056231307e-05, + "loss": 0.4204, + "step": 7524 + }, + { + "epoch": 0.915728627928202, + "grad_norm": 1.6076616048812866, + "learning_rate": 1.1663007063232501e-05, + "loss": 0.4671, + "step": 7525 + }, + { + "epoch": 0.915850319440219, + "grad_norm": 1.4111597537994385, + "learning_rate": 1.1661083500871066e-05, + "loss": 0.5001, + "step": 7526 + }, + { + "epoch": 0.9159720109522361, + "grad_norm": 0.858390748500824, + "learning_rate": 1.1659159875301968e-05, + "loss": 0.4709, + "step": 7527 + }, + { + "epoch": 0.9160937024642531, + "grad_norm": 1.0628103017807007, + "learning_rate": 1.1657236186598401e-05, + "loss": 0.493, + "step": 7528 + }, + { + "epoch": 0.9162153939762702, + "grad_norm": 2.3629496097564697, + "learning_rate": 1.165531243483357e-05, + "loss": 0.4638, + "step": 7529 + }, + { + "epoch": 0.9163370854882872, + "grad_norm": 5.7145867347717285, + "learning_rate": 1.1653388620080672e-05, + "loss": 0.3881, + "step": 7530 + }, + { + "epoch": 0.9164587770003042, + "grad_norm": 2.803727865219116, + "learning_rate": 1.1651464742412915e-05, + "loss": 0.4856, + "step": 7531 + }, + { + "epoch": 0.9165804685123212, + "grad_norm": 1.3399631977081299, + "learning_rate": 1.1649540801903506e-05, + "loss": 0.51, + "step": 7532 + }, + { + "epoch": 0.9167021600243384, + "grad_norm": 3.168067455291748, + "learning_rate": 1.1647616798625659e-05, + "loss": 0.4752, + "step": 7533 + }, + { + "epoch": 0.9168238515363554, + "grad_norm": 1.5800386667251587, + "learning_rate": 1.1645692732652577e-05, + "loss": 0.4972, + "step": 7534 + }, + { + "epoch": 0.9169455430483724, + "grad_norm": 2.391519069671631, + "learning_rate": 1.1643768604057482e-05, + "loss": 0.4715, + "step": 7535 + }, + { + "epoch": 0.9170672345603894, + "grad_norm": 2.9794158935546875, + "learning_rate": 1.1641844412913588e-05, + "loss": 0.3831, + "step": 7536 + }, + { + "epoch": 0.9171889260724064, + "grad_norm": 1.9917330741882324, + "learning_rate": 1.1639920159294119e-05, + "loss": 0.5072, + "step": 7537 + }, + { + "epoch": 0.9173106175844234, + "grad_norm": 2.534503698348999, + "learning_rate": 1.163799584327229e-05, + "loss": 0.4568, + "step": 7538 + }, + { + "epoch": 0.9174323090964405, + "grad_norm": 3.330085515975952, + "learning_rate": 1.163607146492133e-05, + "loss": 0.4921, + "step": 7539 + }, + { + "epoch": 0.9175540006084576, + "grad_norm": 2.5798494815826416, + "learning_rate": 1.1634147024314463e-05, + "loss": 0.5148, + "step": 7540 + }, + { + "epoch": 0.9176756921204746, + "grad_norm": 3.7334725856781006, + "learning_rate": 1.1632222521524923e-05, + "loss": 0.5113, + "step": 7541 + }, + { + "epoch": 0.9177973836324916, + "grad_norm": 4.550693988800049, + "learning_rate": 1.1630297956625934e-05, + "loss": 0.5152, + "step": 7542 + }, + { + "epoch": 0.9179190751445087, + "grad_norm": 0.8807277083396912, + "learning_rate": 1.1628373329690732e-05, + "loss": 0.4625, + "step": 7543 + }, + { + "epoch": 0.9180407666565257, + "grad_norm": 2.635573625564575, + "learning_rate": 1.1626448640792558e-05, + "loss": 0.4093, + "step": 7544 + }, + { + "epoch": 0.9181624581685427, + "grad_norm": 1.0053194761276245, + "learning_rate": 1.1624523890004646e-05, + "loss": 0.4681, + "step": 7545 + }, + { + "epoch": 0.9182841496805598, + "grad_norm": 1.5999778509140015, + "learning_rate": 1.1622599077400236e-05, + "loss": 0.4843, + "step": 7546 + }, + { + "epoch": 0.9184058411925768, + "grad_norm": 1.8159563541412354, + "learning_rate": 1.1620674203052574e-05, + "loss": 0.436, + "step": 7547 + }, + { + "epoch": 0.9185275327045939, + "grad_norm": 1.4748505353927612, + "learning_rate": 1.1618749267034904e-05, + "loss": 0.4914, + "step": 7548 + }, + { + "epoch": 0.9186492242166109, + "grad_norm": 4.531437397003174, + "learning_rate": 1.161682426942047e-05, + "loss": 0.4053, + "step": 7549 + }, + { + "epoch": 0.9187709157286279, + "grad_norm": 2.7811291217803955, + "learning_rate": 1.1614899210282531e-05, + "loss": 0.4047, + "step": 7550 + }, + { + "epoch": 0.9188926072406449, + "grad_norm": 1.8396155834197998, + "learning_rate": 1.161297408969433e-05, + "loss": 0.4214, + "step": 7551 + }, + { + "epoch": 0.919014298752662, + "grad_norm": 0.8901538252830505, + "learning_rate": 1.161104890772913e-05, + "loss": 0.447, + "step": 7552 + }, + { + "epoch": 0.9191359902646791, + "grad_norm": 1.542142629623413, + "learning_rate": 1.1609123664460183e-05, + "loss": 0.4491, + "step": 7553 + }, + { + "epoch": 0.9192576817766961, + "grad_norm": 2.397416114807129, + "learning_rate": 1.1607198359960748e-05, + "loss": 0.4692, + "step": 7554 + }, + { + "epoch": 0.9193793732887131, + "grad_norm": 0.9102994799613953, + "learning_rate": 1.1605272994304091e-05, + "loss": 0.426, + "step": 7555 + }, + { + "epoch": 0.9195010648007301, + "grad_norm": 0.5880528092384338, + "learning_rate": 1.1603347567563474e-05, + "loss": 0.3992, + "step": 7556 + }, + { + "epoch": 0.9196227563127471, + "grad_norm": 0.758091926574707, + "learning_rate": 1.1601422079812163e-05, + "loss": 0.4286, + "step": 7557 + }, + { + "epoch": 0.9197444478247643, + "grad_norm": 3.115398406982422, + "learning_rate": 1.1599496531123426e-05, + "loss": 0.4922, + "step": 7558 + }, + { + "epoch": 0.9198661393367813, + "grad_norm": 3.4865224361419678, + "learning_rate": 1.1597570921570536e-05, + "loss": 0.4673, + "step": 7559 + }, + { + "epoch": 0.9199878308487983, + "grad_norm": 2.4543771743774414, + "learning_rate": 1.1595645251226766e-05, + "loss": 0.4802, + "step": 7560 + }, + { + "epoch": 0.9201095223608153, + "grad_norm": 1.5124155282974243, + "learning_rate": 1.1593719520165392e-05, + "loss": 0.4534, + "step": 7561 + }, + { + "epoch": 0.9202312138728324, + "grad_norm": 0.8431416153907776, + "learning_rate": 1.1591793728459689e-05, + "loss": 0.4483, + "step": 7562 + }, + { + "epoch": 0.9203529053848494, + "grad_norm": 0.9718037843704224, + "learning_rate": 1.1589867876182941e-05, + "loss": 0.4385, + "step": 7563 + }, + { + "epoch": 0.9204745968968664, + "grad_norm": 2.2607533931732178, + "learning_rate": 1.1587941963408429e-05, + "loss": 0.3986, + "step": 7564 + }, + { + "epoch": 0.9205962884088835, + "grad_norm": 2.767716407775879, + "learning_rate": 1.1586015990209439e-05, + "loss": 0.4265, + "step": 7565 + }, + { + "epoch": 0.9207179799209005, + "grad_norm": 1.0288968086242676, + "learning_rate": 1.1584089956659254e-05, + "loss": 0.4565, + "step": 7566 + }, + { + "epoch": 0.9208396714329176, + "grad_norm": 0.7784706354141235, + "learning_rate": 1.1582163862831175e-05, + "loss": 0.5245, + "step": 7567 + }, + { + "epoch": 0.9209613629449346, + "grad_norm": 0.5957329869270325, + "learning_rate": 1.1580237708798482e-05, + "loss": 0.4513, + "step": 7568 + }, + { + "epoch": 0.9210830544569516, + "grad_norm": 2.8319318294525146, + "learning_rate": 1.1578311494634474e-05, + "loss": 0.5012, + "step": 7569 + }, + { + "epoch": 0.9212047459689686, + "grad_norm": 1.967776894569397, + "learning_rate": 1.1576385220412442e-05, + "loss": 0.4083, + "step": 7570 + }, + { + "epoch": 0.9213264374809857, + "grad_norm": 1.3437954187393188, + "learning_rate": 1.1574458886205698e-05, + "loss": 0.3761, + "step": 7571 + }, + { + "epoch": 0.9214481289930028, + "grad_norm": 0.6504002809524536, + "learning_rate": 1.1572532492087527e-05, + "loss": 0.4464, + "step": 7572 + }, + { + "epoch": 0.9215698205050198, + "grad_norm": 0.5987673401832581, + "learning_rate": 1.1570606038131245e-05, + "loss": 0.4289, + "step": 7573 + }, + { + "epoch": 0.9216915120170368, + "grad_norm": 2.900508165359497, + "learning_rate": 1.1568679524410147e-05, + "loss": 0.4928, + "step": 7574 + }, + { + "epoch": 0.9218132035290538, + "grad_norm": 0.6257543563842773, + "learning_rate": 1.156675295099755e-05, + "loss": 0.4191, + "step": 7575 + }, + { + "epoch": 0.9219348950410708, + "grad_norm": 1.9884703159332275, + "learning_rate": 1.156482631796676e-05, + "loss": 0.458, + "step": 7576 + }, + { + "epoch": 0.922056586553088, + "grad_norm": 1.192807912826538, + "learning_rate": 1.1562899625391086e-05, + "loss": 0.4594, + "step": 7577 + }, + { + "epoch": 0.922178278065105, + "grad_norm": 1.2598100900650024, + "learning_rate": 1.1560972873343852e-05, + "loss": 0.4703, + "step": 7578 + }, + { + "epoch": 0.922299969577122, + "grad_norm": 1.4153900146484375, + "learning_rate": 1.1559046061898367e-05, + "loss": 0.4706, + "step": 7579 + }, + { + "epoch": 0.922421661089139, + "grad_norm": 0.853300929069519, + "learning_rate": 1.155711919112795e-05, + "loss": 0.4465, + "step": 7580 + }, + { + "epoch": 0.922543352601156, + "grad_norm": 0.5372827053070068, + "learning_rate": 1.1555192261105925e-05, + "loss": 0.4326, + "step": 7581 + }, + { + "epoch": 0.9226650441131731, + "grad_norm": 0.7183429598808289, + "learning_rate": 1.1553265271905619e-05, + "loss": 0.4755, + "step": 7582 + }, + { + "epoch": 0.9227867356251901, + "grad_norm": 3.2192203998565674, + "learning_rate": 1.155133822360035e-05, + "loss": 0.4516, + "step": 7583 + }, + { + "epoch": 0.9229084271372072, + "grad_norm": 4.470743656158447, + "learning_rate": 1.1549411116263454e-05, + "loss": 0.4699, + "step": 7584 + }, + { + "epoch": 0.9230301186492242, + "grad_norm": 2.28182053565979, + "learning_rate": 1.1547483949968254e-05, + "loss": 0.4633, + "step": 7585 + }, + { + "epoch": 0.9231518101612413, + "grad_norm": 1.8521881103515625, + "learning_rate": 1.154555672478809e-05, + "loss": 0.4858, + "step": 7586 + }, + { + "epoch": 0.9232735016732583, + "grad_norm": 2.6726293563842773, + "learning_rate": 1.1543629440796291e-05, + "loss": 0.479, + "step": 7587 + }, + { + "epoch": 0.9233951931852753, + "grad_norm": 4.213736534118652, + "learning_rate": 1.1541702098066199e-05, + "loss": 0.4363, + "step": 7588 + }, + { + "epoch": 0.9235168846972923, + "grad_norm": 1.0938345193862915, + "learning_rate": 1.1539774696671151e-05, + "loss": 0.4592, + "step": 7589 + }, + { + "epoch": 0.9236385762093094, + "grad_norm": 0.6823290586471558, + "learning_rate": 1.1537847236684487e-05, + "loss": 0.4207, + "step": 7590 + }, + { + "epoch": 0.9237602677213265, + "grad_norm": 1.2085717916488647, + "learning_rate": 1.1535919718179554e-05, + "loss": 0.4407, + "step": 7591 + }, + { + "epoch": 0.9238819592333435, + "grad_norm": 0.6506874561309814, + "learning_rate": 1.1533992141229693e-05, + "loss": 0.448, + "step": 7592 + }, + { + "epoch": 0.9240036507453605, + "grad_norm": 3.0533831119537354, + "learning_rate": 1.153206450590826e-05, + "loss": 0.4598, + "step": 7593 + }, + { + "epoch": 0.9241253422573775, + "grad_norm": 3.3513081073760986, + "learning_rate": 1.15301368122886e-05, + "loss": 0.5091, + "step": 7594 + }, + { + "epoch": 0.9242470337693945, + "grad_norm": 0.7972692251205444, + "learning_rate": 1.1528209060444065e-05, + "loss": 0.4345, + "step": 7595 + }, + { + "epoch": 0.9243687252814117, + "grad_norm": 1.1738011837005615, + "learning_rate": 1.1526281250448015e-05, + "loss": 0.3849, + "step": 7596 + }, + { + "epoch": 0.9244904167934287, + "grad_norm": 0.7687472105026245, + "learning_rate": 1.1524353382373806e-05, + "loss": 0.4731, + "step": 7597 + }, + { + "epoch": 0.9246121083054457, + "grad_norm": 1.6522984504699707, + "learning_rate": 1.152242545629479e-05, + "loss": 0.4614, + "step": 7598 + }, + { + "epoch": 0.9247337998174627, + "grad_norm": 2.1395158767700195, + "learning_rate": 1.1520497472284337e-05, + "loss": 0.4891, + "step": 7599 + }, + { + "epoch": 0.9248554913294798, + "grad_norm": 1.6859644651412964, + "learning_rate": 1.1518569430415806e-05, + "loss": 0.4822, + "step": 7600 + }, + { + "epoch": 0.9249771828414968, + "grad_norm": 3.8348793983459473, + "learning_rate": 1.1516641330762567e-05, + "loss": 0.566, + "step": 7601 + }, + { + "epoch": 0.9250988743535138, + "grad_norm": 1.5323084592819214, + "learning_rate": 1.1514713173397989e-05, + "loss": 0.4458, + "step": 7602 + }, + { + "epoch": 0.9252205658655309, + "grad_norm": 0.7786366939544678, + "learning_rate": 1.1512784958395436e-05, + "loss": 0.4926, + "step": 7603 + }, + { + "epoch": 0.9253422573775479, + "grad_norm": 2.2099411487579346, + "learning_rate": 1.1510856685828283e-05, + "loss": 0.4048, + "step": 7604 + }, + { + "epoch": 0.925463948889565, + "grad_norm": 3.047236204147339, + "learning_rate": 1.1508928355769907e-05, + "loss": 0.4442, + "step": 7605 + }, + { + "epoch": 0.925585640401582, + "grad_norm": 3.9911813735961914, + "learning_rate": 1.1506999968293683e-05, + "loss": 0.4097, + "step": 7606 + }, + { + "epoch": 0.925707331913599, + "grad_norm": 4.025291442871094, + "learning_rate": 1.1505071523472992e-05, + "loss": 0.4271, + "step": 7607 + }, + { + "epoch": 0.925829023425616, + "grad_norm": 0.8336036205291748, + "learning_rate": 1.1503143021381213e-05, + "loss": 0.4559, + "step": 7608 + }, + { + "epoch": 0.9259507149376331, + "grad_norm": 0.9317759871482849, + "learning_rate": 1.1501214462091734e-05, + "loss": 0.4526, + "step": 7609 + }, + { + "epoch": 0.9260724064496502, + "grad_norm": 2.1263556480407715, + "learning_rate": 1.1499285845677934e-05, + "loss": 0.3997, + "step": 7610 + }, + { + "epoch": 0.9261940979616672, + "grad_norm": 1.8488959074020386, + "learning_rate": 1.1497357172213204e-05, + "loss": 0.5209, + "step": 7611 + }, + { + "epoch": 0.9263157894736842, + "grad_norm": 0.9055743217468262, + "learning_rate": 1.1495428441770937e-05, + "loss": 0.4589, + "step": 7612 + }, + { + "epoch": 0.9264374809857012, + "grad_norm": 0.6781041622161865, + "learning_rate": 1.1493499654424523e-05, + "loss": 0.4829, + "step": 7613 + }, + { + "epoch": 0.9265591724977182, + "grad_norm": 2.101365804672241, + "learning_rate": 1.1491570810247351e-05, + "loss": 0.5071, + "step": 7614 + }, + { + "epoch": 0.9266808640097354, + "grad_norm": 1.1475825309753418, + "learning_rate": 1.1489641909312825e-05, + "loss": 0.4539, + "step": 7615 + }, + { + "epoch": 0.9268025555217524, + "grad_norm": 1.1658658981323242, + "learning_rate": 1.148771295169434e-05, + "loss": 0.4879, + "step": 7616 + }, + { + "epoch": 0.9269242470337694, + "grad_norm": 2.036994695663452, + "learning_rate": 1.14857839374653e-05, + "loss": 0.4839, + "step": 7617 + }, + { + "epoch": 0.9270459385457864, + "grad_norm": 0.8018247485160828, + "learning_rate": 1.1483854866699102e-05, + "loss": 0.5041, + "step": 7618 + }, + { + "epoch": 0.9271676300578034, + "grad_norm": 2.9389092922210693, + "learning_rate": 1.148192573946916e-05, + "loss": 0.4927, + "step": 7619 + }, + { + "epoch": 0.9272893215698205, + "grad_norm": 2.154796838760376, + "learning_rate": 1.1479996555848874e-05, + "loss": 0.4972, + "step": 7620 + }, + { + "epoch": 0.9274110130818375, + "grad_norm": 2.108656406402588, + "learning_rate": 1.1478067315911653e-05, + "loss": 0.5127, + "step": 7621 + }, + { + "epoch": 0.9275327045938546, + "grad_norm": 3.507293224334717, + "learning_rate": 1.1476138019730913e-05, + "loss": 0.4675, + "step": 7622 + }, + { + "epoch": 0.9276543961058716, + "grad_norm": 2.8766088485717773, + "learning_rate": 1.147420866738007e-05, + "loss": 0.4614, + "step": 7623 + }, + { + "epoch": 0.9277760876178887, + "grad_norm": 2.3016607761383057, + "learning_rate": 1.147227925893253e-05, + "loss": 0.5216, + "step": 7624 + }, + { + "epoch": 0.9278977791299057, + "grad_norm": 1.2048115730285645, + "learning_rate": 1.1470349794461719e-05, + "loss": 0.5709, + "step": 7625 + }, + { + "epoch": 0.9280194706419227, + "grad_norm": 0.7632753849029541, + "learning_rate": 1.1468420274041054e-05, + "loss": 0.477, + "step": 7626 + }, + { + "epoch": 0.9281411621539397, + "grad_norm": 4.178420543670654, + "learning_rate": 1.1466490697743962e-05, + "loss": 0.4384, + "step": 7627 + }, + { + "epoch": 0.9282628536659568, + "grad_norm": 1.5677729845046997, + "learning_rate": 1.1464561065643858e-05, + "loss": 0.453, + "step": 7628 + }, + { + "epoch": 0.9283845451779739, + "grad_norm": 0.7407684326171875, + "learning_rate": 1.1462631377814175e-05, + "loss": 0.481, + "step": 7629 + }, + { + "epoch": 0.9285062366899909, + "grad_norm": 2.8354055881500244, + "learning_rate": 1.1460701634328344e-05, + "loss": 0.5165, + "step": 7630 + }, + { + "epoch": 0.9286279282020079, + "grad_norm": 1.4617078304290771, + "learning_rate": 1.1458771835259787e-05, + "loss": 0.4801, + "step": 7631 + }, + { + "epoch": 0.9287496197140249, + "grad_norm": 2.3162219524383545, + "learning_rate": 1.1456841980681948e-05, + "loss": 0.4921, + "step": 7632 + }, + { + "epoch": 0.9288713112260419, + "grad_norm": 1.2323050498962402, + "learning_rate": 1.1454912070668254e-05, + "loss": 0.4334, + "step": 7633 + }, + { + "epoch": 0.9289930027380591, + "grad_norm": 2.2230920791625977, + "learning_rate": 1.1452982105292141e-05, + "loss": 0.4812, + "step": 7634 + }, + { + "epoch": 0.9291146942500761, + "grad_norm": 0.9971070289611816, + "learning_rate": 1.1451052084627055e-05, + "loss": 0.4549, + "step": 7635 + }, + { + "epoch": 0.9292363857620931, + "grad_norm": 2.170532703399658, + "learning_rate": 1.1449122008746434e-05, + "loss": 0.4398, + "step": 7636 + }, + { + "epoch": 0.9293580772741101, + "grad_norm": 1.6843416690826416, + "learning_rate": 1.144719187772372e-05, + "loss": 0.4763, + "step": 7637 + }, + { + "epoch": 0.9294797687861271, + "grad_norm": 0.6193882822990417, + "learning_rate": 1.144526169163236e-05, + "loss": 0.3935, + "step": 7638 + }, + { + "epoch": 0.9296014602981442, + "grad_norm": 0.9456354379653931, + "learning_rate": 1.14433314505458e-05, + "loss": 0.4211, + "step": 7639 + }, + { + "epoch": 0.9297231518101612, + "grad_norm": 2.8768813610076904, + "learning_rate": 1.144140115453749e-05, + "loss": 0.4857, + "step": 7640 + }, + { + "epoch": 0.9298448433221783, + "grad_norm": 0.8512792587280273, + "learning_rate": 1.1439470803680884e-05, + "loss": 0.4374, + "step": 7641 + }, + { + "epoch": 0.9299665348341953, + "grad_norm": 1.271427035331726, + "learning_rate": 1.1437540398049433e-05, + "loss": 0.4334, + "step": 7642 + }, + { + "epoch": 0.9300882263462124, + "grad_norm": 1.7403056621551514, + "learning_rate": 1.1435609937716599e-05, + "loss": 0.399, + "step": 7643 + }, + { + "epoch": 0.9302099178582294, + "grad_norm": 1.6461127996444702, + "learning_rate": 1.143367942275583e-05, + "loss": 0.4877, + "step": 7644 + }, + { + "epoch": 0.9303316093702464, + "grad_norm": 1.8353902101516724, + "learning_rate": 1.1431748853240591e-05, + "loss": 0.4439, + "step": 7645 + }, + { + "epoch": 0.9304533008822634, + "grad_norm": 2.511136293411255, + "learning_rate": 1.1429818229244349e-05, + "loss": 0.4193, + "step": 7646 + }, + { + "epoch": 0.9305749923942805, + "grad_norm": 2.807276725769043, + "learning_rate": 1.142788755084056e-05, + "loss": 0.3767, + "step": 7647 + }, + { + "epoch": 0.9306966839062976, + "grad_norm": 0.7150379419326782, + "learning_rate": 1.1425956818102696e-05, + "loss": 0.4879, + "step": 7648 + }, + { + "epoch": 0.9308183754183146, + "grad_norm": 3.0271072387695312, + "learning_rate": 1.1424026031104223e-05, + "loss": 0.4003, + "step": 7649 + }, + { + "epoch": 0.9309400669303316, + "grad_norm": 2.6720077991485596, + "learning_rate": 1.1422095189918614e-05, + "loss": 0.436, + "step": 7650 + }, + { + "epoch": 0.9310617584423486, + "grad_norm": 0.6751669645309448, + "learning_rate": 1.1420164294619336e-05, + "loss": 0.4393, + "step": 7651 + }, + { + "epoch": 0.9311834499543656, + "grad_norm": 2.4788801670074463, + "learning_rate": 1.1418233345279868e-05, + "loss": 0.3788, + "step": 7652 + }, + { + "epoch": 0.9313051414663828, + "grad_norm": 1.3087700605392456, + "learning_rate": 1.1416302341973689e-05, + "loss": 0.4118, + "step": 7653 + }, + { + "epoch": 0.9314268329783998, + "grad_norm": 2.1150686740875244, + "learning_rate": 1.141437128477427e-05, + "loss": 0.4446, + "step": 7654 + }, + { + "epoch": 0.9315485244904168, + "grad_norm": 3.462644338607788, + "learning_rate": 1.1412440173755098e-05, + "loss": 0.4835, + "step": 7655 + }, + { + "epoch": 0.9316702160024338, + "grad_norm": 2.551018714904785, + "learning_rate": 1.1410509008989652e-05, + "loss": 0.5146, + "step": 7656 + }, + { + "epoch": 0.9317919075144508, + "grad_norm": 2.713292121887207, + "learning_rate": 1.140857779055142e-05, + "loss": 0.4853, + "step": 7657 + }, + { + "epoch": 0.9319135990264679, + "grad_norm": 0.8535588979721069, + "learning_rate": 1.1406646518513888e-05, + "loss": 0.4075, + "step": 7658 + }, + { + "epoch": 0.932035290538485, + "grad_norm": 1.7017369270324707, + "learning_rate": 1.1404715192950543e-05, + "loss": 0.4578, + "step": 7659 + }, + { + "epoch": 0.932156982050502, + "grad_norm": 1.3032909631729126, + "learning_rate": 1.140278381393488e-05, + "loss": 0.4138, + "step": 7660 + }, + { + "epoch": 0.932278673562519, + "grad_norm": 0.8046365976333618, + "learning_rate": 1.1400852381540385e-05, + "loss": 0.4335, + "step": 7661 + }, + { + "epoch": 0.932400365074536, + "grad_norm": 1.8814215660095215, + "learning_rate": 1.1398920895840561e-05, + "loss": 0.4021, + "step": 7662 + }, + { + "epoch": 0.9325220565865531, + "grad_norm": 3.4482665061950684, + "learning_rate": 1.1396989356908899e-05, + "loss": 0.4154, + "step": 7663 + }, + { + "epoch": 0.9326437480985701, + "grad_norm": 1.482643723487854, + "learning_rate": 1.13950577648189e-05, + "loss": 0.4834, + "step": 7664 + }, + { + "epoch": 0.9327654396105871, + "grad_norm": 0.714070737361908, + "learning_rate": 1.1393126119644068e-05, + "loss": 0.422, + "step": 7665 + }, + { + "epoch": 0.9328871311226042, + "grad_norm": 2.219923496246338, + "learning_rate": 1.1391194421457905e-05, + "loss": 0.4054, + "step": 7666 + }, + { + "epoch": 0.9330088226346213, + "grad_norm": 1.8329023122787476, + "learning_rate": 1.138926267033391e-05, + "loss": 0.4334, + "step": 7667 + }, + { + "epoch": 0.9331305141466383, + "grad_norm": 2.2366943359375, + "learning_rate": 1.1387330866345596e-05, + "loss": 0.4228, + "step": 7668 + }, + { + "epoch": 0.9332522056586553, + "grad_norm": 4.111137390136719, + "learning_rate": 1.1385399009566473e-05, + "loss": 0.5119, + "step": 7669 + }, + { + "epoch": 0.9333738971706723, + "grad_norm": 2.5041019916534424, + "learning_rate": 1.1383467100070046e-05, + "loss": 0.4909, + "step": 7670 + }, + { + "epoch": 0.9334955886826893, + "grad_norm": 2.5130231380462646, + "learning_rate": 1.1381535137929837e-05, + "loss": 0.4771, + "step": 7671 + }, + { + "epoch": 0.9336172801947065, + "grad_norm": 1.2716031074523926, + "learning_rate": 1.1379603123219353e-05, + "loss": 0.4239, + "step": 7672 + }, + { + "epoch": 0.9337389717067235, + "grad_norm": 1.0430330038070679, + "learning_rate": 1.1377671056012119e-05, + "loss": 0.4412, + "step": 7673 + }, + { + "epoch": 0.9338606632187405, + "grad_norm": 2.0244851112365723, + "learning_rate": 1.1375738936381644e-05, + "loss": 0.4621, + "step": 7674 + }, + { + "epoch": 0.9339823547307575, + "grad_norm": 1.8305261135101318, + "learning_rate": 1.1373806764401459e-05, + "loss": 0.4551, + "step": 7675 + }, + { + "epoch": 0.9341040462427745, + "grad_norm": 2.0901358127593994, + "learning_rate": 1.137187454014508e-05, + "loss": 0.4825, + "step": 7676 + }, + { + "epoch": 0.9342257377547916, + "grad_norm": 2.215219020843506, + "learning_rate": 1.1369942263686038e-05, + "loss": 0.5068, + "step": 7677 + }, + { + "epoch": 0.9343474292668087, + "grad_norm": 1.669443964958191, + "learning_rate": 1.1368009935097856e-05, + "loss": 0.4864, + "step": 7678 + }, + { + "epoch": 0.9344691207788257, + "grad_norm": 2.9809720516204834, + "learning_rate": 1.1366077554454062e-05, + "loss": 0.4143, + "step": 7679 + }, + { + "epoch": 0.9345908122908427, + "grad_norm": 3.228161334991455, + "learning_rate": 1.1364145121828195e-05, + "loss": 0.4205, + "step": 7680 + }, + { + "epoch": 0.9347125038028598, + "grad_norm": 3.5505542755126953, + "learning_rate": 1.1362212637293777e-05, + "loss": 0.4767, + "step": 7681 + }, + { + "epoch": 0.9348341953148768, + "grad_norm": 2.12424373626709, + "learning_rate": 1.136028010092435e-05, + "loss": 0.4426, + "step": 7682 + }, + { + "epoch": 0.9349558868268938, + "grad_norm": 0.9856480956077576, + "learning_rate": 1.1358347512793451e-05, + "loss": 0.4322, + "step": 7683 + }, + { + "epoch": 0.9350775783389108, + "grad_norm": 1.7891829013824463, + "learning_rate": 1.1356414872974617e-05, + "loss": 0.4414, + "step": 7684 + }, + { + "epoch": 0.9351992698509279, + "grad_norm": 0.9244073033332825, + "learning_rate": 1.1354482181541389e-05, + "loss": 0.4606, + "step": 7685 + }, + { + "epoch": 0.935320961362945, + "grad_norm": 0.7673761248588562, + "learning_rate": 1.1352549438567308e-05, + "loss": 0.4425, + "step": 7686 + }, + { + "epoch": 0.935442652874962, + "grad_norm": 0.7276907563209534, + "learning_rate": 1.1350616644125925e-05, + "loss": 0.4191, + "step": 7687 + }, + { + "epoch": 0.935564344386979, + "grad_norm": 2.1830379962921143, + "learning_rate": 1.1348683798290782e-05, + "loss": 0.3948, + "step": 7688 + }, + { + "epoch": 0.935686035898996, + "grad_norm": 1.2450603246688843, + "learning_rate": 1.1346750901135431e-05, + "loss": 0.468, + "step": 7689 + }, + { + "epoch": 0.935807727411013, + "grad_norm": 1.1602596044540405, + "learning_rate": 1.1344817952733416e-05, + "loss": 0.4631, + "step": 7690 + }, + { + "epoch": 0.9359294189230302, + "grad_norm": 1.6805158853530884, + "learning_rate": 1.1342884953158295e-05, + "loss": 0.4661, + "step": 7691 + }, + { + "epoch": 0.9360511104350472, + "grad_norm": 3.5069258213043213, + "learning_rate": 1.1340951902483623e-05, + "loss": 0.4618, + "step": 7692 + }, + { + "epoch": 0.9361728019470642, + "grad_norm": 1.3578343391418457, + "learning_rate": 1.1339018800782956e-05, + "loss": 0.4815, + "step": 7693 + }, + { + "epoch": 0.9362944934590812, + "grad_norm": 2.591719388961792, + "learning_rate": 1.1337085648129853e-05, + "loss": 0.4983, + "step": 7694 + }, + { + "epoch": 0.9364161849710982, + "grad_norm": 3.20554256439209, + "learning_rate": 1.1335152444597872e-05, + "loss": 0.4021, + "step": 7695 + }, + { + "epoch": 0.9365378764831153, + "grad_norm": 1.2479726076126099, + "learning_rate": 1.1333219190260578e-05, + "loss": 0.4433, + "step": 7696 + }, + { + "epoch": 0.9366595679951324, + "grad_norm": 3.1756629943847656, + "learning_rate": 1.1331285885191533e-05, + "loss": 0.436, + "step": 7697 + }, + { + "epoch": 0.9367812595071494, + "grad_norm": 1.2032915353775024, + "learning_rate": 1.1329352529464304e-05, + "loss": 0.4449, + "step": 7698 + }, + { + "epoch": 0.9369029510191664, + "grad_norm": 1.4280959367752075, + "learning_rate": 1.1327419123152461e-05, + "loss": 0.441, + "step": 7699 + }, + { + "epoch": 0.9370246425311834, + "grad_norm": 0.6721690893173218, + "learning_rate": 1.1325485666329573e-05, + "loss": 0.4451, + "step": 7700 + }, + { + "epoch": 0.9371463340432005, + "grad_norm": 1.0141255855560303, + "learning_rate": 1.1323552159069211e-05, + "loss": 0.4365, + "step": 7701 + }, + { + "epoch": 0.9372680255552175, + "grad_norm": 1.9130144119262695, + "learning_rate": 1.1321618601444947e-05, + "loss": 0.4723, + "step": 7702 + }, + { + "epoch": 0.9373897170672345, + "grad_norm": 1.519945502281189, + "learning_rate": 1.1319684993530366e-05, + "loss": 0.4234, + "step": 7703 + }, + { + "epoch": 0.9375114085792516, + "grad_norm": 1.5344319343566895, + "learning_rate": 1.1317751335399034e-05, + "loss": 0.4235, + "step": 7704 + }, + { + "epoch": 0.9376331000912687, + "grad_norm": 0.768884539604187, + "learning_rate": 1.1315817627124538e-05, + "loss": 0.3798, + "step": 7705 + }, + { + "epoch": 0.9377547916032857, + "grad_norm": 2.5807077884674072, + "learning_rate": 1.1313883868780458e-05, + "loss": 0.4691, + "step": 7706 + }, + { + "epoch": 0.9378764831153027, + "grad_norm": 2.1828975677490234, + "learning_rate": 1.1311950060440377e-05, + "loss": 0.4533, + "step": 7707 + }, + { + "epoch": 0.9379981746273197, + "grad_norm": 2.83746337890625, + "learning_rate": 1.131001620217788e-05, + "loss": 0.4487, + "step": 7708 + }, + { + "epoch": 0.9381198661393367, + "grad_norm": 2.4260363578796387, + "learning_rate": 1.1308082294066556e-05, + "loss": 0.4927, + "step": 7709 + }, + { + "epoch": 0.9382415576513539, + "grad_norm": 0.9822359085083008, + "learning_rate": 1.1306148336179992e-05, + "loss": 0.46, + "step": 7710 + }, + { + "epoch": 0.9383632491633709, + "grad_norm": 2.834970712661743, + "learning_rate": 1.1304214328591783e-05, + "loss": 0.5125, + "step": 7711 + }, + { + "epoch": 0.9384849406753879, + "grad_norm": 1.2395174503326416, + "learning_rate": 1.1302280271375516e-05, + "loss": 0.5118, + "step": 7712 + }, + { + "epoch": 0.9386066321874049, + "grad_norm": 0.6867635250091553, + "learning_rate": 1.130034616460479e-05, + "loss": 0.4497, + "step": 7713 + }, + { + "epoch": 0.9387283236994219, + "grad_norm": 1.8402960300445557, + "learning_rate": 1.1298412008353201e-05, + "loss": 0.441, + "step": 7714 + }, + { + "epoch": 0.938850015211439, + "grad_norm": 2.4694528579711914, + "learning_rate": 1.1296477802694345e-05, + "loss": 0.4654, + "step": 7715 + }, + { + "epoch": 0.9389717067234561, + "grad_norm": 0.6076860427856445, + "learning_rate": 1.1294543547701828e-05, + "loss": 0.4502, + "step": 7716 + }, + { + "epoch": 0.9390933982354731, + "grad_norm": 0.8330790996551514, + "learning_rate": 1.129260924344925e-05, + "loss": 0.4533, + "step": 7717 + }, + { + "epoch": 0.9392150897474901, + "grad_norm": 1.718134880065918, + "learning_rate": 1.1290674890010216e-05, + "loss": 0.4553, + "step": 7718 + }, + { + "epoch": 0.9393367812595071, + "grad_norm": 2.0217318534851074, + "learning_rate": 1.1288740487458327e-05, + "loss": 0.4683, + "step": 7719 + }, + { + "epoch": 0.9394584727715242, + "grad_norm": 2.768575429916382, + "learning_rate": 1.1286806035867195e-05, + "loss": 0.4406, + "step": 7720 + }, + { + "epoch": 0.9395801642835412, + "grad_norm": 0.8233996033668518, + "learning_rate": 1.1284871535310432e-05, + "loss": 0.5018, + "step": 7721 + }, + { + "epoch": 0.9397018557955582, + "grad_norm": 2.154956579208374, + "learning_rate": 1.1282936985861647e-05, + "loss": 0.4169, + "step": 7722 + }, + { + "epoch": 0.9398235473075753, + "grad_norm": 0.7472934126853943, + "learning_rate": 1.1281002387594455e-05, + "loss": 0.4237, + "step": 7723 + }, + { + "epoch": 0.9399452388195924, + "grad_norm": 3.3204634189605713, + "learning_rate": 1.1279067740582468e-05, + "loss": 0.5002, + "step": 7724 + }, + { + "epoch": 0.9400669303316094, + "grad_norm": 1.817721962928772, + "learning_rate": 1.1277133044899307e-05, + "loss": 0.4804, + "step": 7725 + }, + { + "epoch": 0.9401886218436264, + "grad_norm": 1.491402506828308, + "learning_rate": 1.1275198300618591e-05, + "loss": 0.4862, + "step": 7726 + }, + { + "epoch": 0.9403103133556434, + "grad_norm": 0.7314885258674622, + "learning_rate": 1.127326350781394e-05, + "loss": 0.4537, + "step": 7727 + }, + { + "epoch": 0.9404320048676604, + "grad_norm": 1.3579291105270386, + "learning_rate": 1.1271328666558978e-05, + "loss": 0.4434, + "step": 7728 + }, + { + "epoch": 0.9405536963796776, + "grad_norm": 1.0967844724655151, + "learning_rate": 1.1269393776927327e-05, + "loss": 0.4219, + "step": 7729 + }, + { + "epoch": 0.9406753878916946, + "grad_norm": 0.7108358144760132, + "learning_rate": 1.1267458838992616e-05, + "loss": 0.4716, + "step": 7730 + }, + { + "epoch": 0.9407970794037116, + "grad_norm": 0.7127029895782471, + "learning_rate": 1.126552385282847e-05, + "loss": 0.452, + "step": 7731 + }, + { + "epoch": 0.9409187709157286, + "grad_norm": 0.7911932468414307, + "learning_rate": 1.1263588818508524e-05, + "loss": 0.4367, + "step": 7732 + }, + { + "epoch": 0.9410404624277456, + "grad_norm": 0.7735646367073059, + "learning_rate": 1.1261653736106411e-05, + "loss": 0.452, + "step": 7733 + }, + { + "epoch": 0.9411621539397627, + "grad_norm": 1.4993888139724731, + "learning_rate": 1.125971860569576e-05, + "loss": 0.4146, + "step": 7734 + }, + { + "epoch": 0.9412838454517798, + "grad_norm": 1.8502323627471924, + "learning_rate": 1.1257783427350207e-05, + "loss": 0.4132, + "step": 7735 + }, + { + "epoch": 0.9414055369637968, + "grad_norm": 1.0543010234832764, + "learning_rate": 1.125584820114339e-05, + "loss": 0.3663, + "step": 7736 + }, + { + "epoch": 0.9415272284758138, + "grad_norm": 0.7789441347122192, + "learning_rate": 1.1253912927148954e-05, + "loss": 0.46, + "step": 7737 + }, + { + "epoch": 0.9416489199878308, + "grad_norm": 3.0533335208892822, + "learning_rate": 1.1251977605440532e-05, + "loss": 0.4897, + "step": 7738 + }, + { + "epoch": 0.9417706114998479, + "grad_norm": 2.086733818054199, + "learning_rate": 1.125004223609177e-05, + "loss": 0.4754, + "step": 7739 + }, + { + "epoch": 0.9418923030118649, + "grad_norm": 2.711740732192993, + "learning_rate": 1.1248106819176317e-05, + "loss": 0.4434, + "step": 7740 + }, + { + "epoch": 0.942013994523882, + "grad_norm": 1.6003062725067139, + "learning_rate": 1.1246171354767813e-05, + "loss": 0.4299, + "step": 7741 + }, + { + "epoch": 0.942135686035899, + "grad_norm": 0.9266567230224609, + "learning_rate": 1.1244235842939912e-05, + "loss": 0.4865, + "step": 7742 + }, + { + "epoch": 0.942257377547916, + "grad_norm": 1.6402803659439087, + "learning_rate": 1.1242300283766258e-05, + "loss": 0.4187, + "step": 7743 + }, + { + "epoch": 0.9423790690599331, + "grad_norm": 1.8191728591918945, + "learning_rate": 1.1240364677320513e-05, + "loss": 0.4103, + "step": 7744 + }, + { + "epoch": 0.9425007605719501, + "grad_norm": 1.5551636219024658, + "learning_rate": 1.1238429023676317e-05, + "loss": 0.4612, + "step": 7745 + }, + { + "epoch": 0.9426224520839671, + "grad_norm": 2.4148149490356445, + "learning_rate": 1.1236493322907341e-05, + "loss": 0.5204, + "step": 7746 + }, + { + "epoch": 0.9427441435959841, + "grad_norm": 1.983602523803711, + "learning_rate": 1.123455757508723e-05, + "loss": 0.454, + "step": 7747 + }, + { + "epoch": 0.9428658351080013, + "grad_norm": 1.3249270915985107, + "learning_rate": 1.123262178028965e-05, + "loss": 0.422, + "step": 7748 + }, + { + "epoch": 0.9429875266200183, + "grad_norm": 1.3510316610336304, + "learning_rate": 1.1230685938588257e-05, + "loss": 0.5, + "step": 7749 + }, + { + "epoch": 0.9431092181320353, + "grad_norm": 1.6859837770462036, + "learning_rate": 1.1228750050056718e-05, + "loss": 0.4248, + "step": 7750 + }, + { + "epoch": 0.9432309096440523, + "grad_norm": 0.818302571773529, + "learning_rate": 1.1226814114768696e-05, + "loss": 0.4821, + "step": 7751 + }, + { + "epoch": 0.9433526011560693, + "grad_norm": 1.6927692890167236, + "learning_rate": 1.122487813279786e-05, + "loss": 0.4331, + "step": 7752 + }, + { + "epoch": 0.9434742926680864, + "grad_norm": 0.570755124092102, + "learning_rate": 1.1222942104217874e-05, + "loss": 0.4724, + "step": 7753 + }, + { + "epoch": 0.9435959841801035, + "grad_norm": 4.1477837562561035, + "learning_rate": 1.1221006029102408e-05, + "loss": 0.5048, + "step": 7754 + }, + { + "epoch": 0.9437176756921205, + "grad_norm": 3.0145456790924072, + "learning_rate": 1.1219069907525136e-05, + "loss": 0.5098, + "step": 7755 + }, + { + "epoch": 0.9438393672041375, + "grad_norm": 0.701015293598175, + "learning_rate": 1.1217133739559731e-05, + "loss": 0.4806, + "step": 7756 + }, + { + "epoch": 0.9439610587161545, + "grad_norm": 0.7368866801261902, + "learning_rate": 1.121519752527987e-05, + "loss": 0.4193, + "step": 7757 + }, + { + "epoch": 0.9440827502281716, + "grad_norm": 0.7859698534011841, + "learning_rate": 1.1213261264759226e-05, + "loss": 0.4399, + "step": 7758 + }, + { + "epoch": 0.9442044417401886, + "grad_norm": 0.7844321727752686, + "learning_rate": 1.1211324958071477e-05, + "loss": 0.435, + "step": 7759 + }, + { + "epoch": 0.9443261332522057, + "grad_norm": 1.2891095876693726, + "learning_rate": 1.120938860529031e-05, + "loss": 0.423, + "step": 7760 + }, + { + "epoch": 0.9444478247642227, + "grad_norm": 1.3656929731369019, + "learning_rate": 1.1207452206489402e-05, + "loss": 0.454, + "step": 7761 + }, + { + "epoch": 0.9445695162762398, + "grad_norm": 3.1759777069091797, + "learning_rate": 1.1205515761742436e-05, + "loss": 0.5318, + "step": 7762 + }, + { + "epoch": 0.9446912077882568, + "grad_norm": 2.6389236450195312, + "learning_rate": 1.1203579271123103e-05, + "loss": 0.5193, + "step": 7763 + }, + { + "epoch": 0.9448128993002738, + "grad_norm": 3.1276309490203857, + "learning_rate": 1.1201642734705089e-05, + "loss": 0.4167, + "step": 7764 + }, + { + "epoch": 0.9449345908122908, + "grad_norm": 2.766252040863037, + "learning_rate": 1.1199706152562077e-05, + "loss": 0.4579, + "step": 7765 + }, + { + "epoch": 0.9450562823243078, + "grad_norm": 2.338665246963501, + "learning_rate": 1.1197769524767765e-05, + "loss": 0.451, + "step": 7766 + }, + { + "epoch": 0.945177973836325, + "grad_norm": 3.3780970573425293, + "learning_rate": 1.1195832851395844e-05, + "loss": 0.5034, + "step": 7767 + }, + { + "epoch": 0.945299665348342, + "grad_norm": 2.698901891708374, + "learning_rate": 1.1193896132520006e-05, + "loss": 0.4493, + "step": 7768 + }, + { + "epoch": 0.945421356860359, + "grad_norm": 1.6006723642349243, + "learning_rate": 1.1191959368213952e-05, + "loss": 0.4806, + "step": 7769 + }, + { + "epoch": 0.945543048372376, + "grad_norm": 0.8120681047439575, + "learning_rate": 1.1190022558551372e-05, + "loss": 0.4707, + "step": 7770 + }, + { + "epoch": 0.945664739884393, + "grad_norm": 3.7931699752807617, + "learning_rate": 1.1188085703605976e-05, + "loss": 0.3507, + "step": 7771 + }, + { + "epoch": 0.94578643139641, + "grad_norm": 1.8634275197982788, + "learning_rate": 1.1186148803451455e-05, + "loss": 0.3815, + "step": 7772 + }, + { + "epoch": 0.9459081229084272, + "grad_norm": 1.5907340049743652, + "learning_rate": 1.1184211858161517e-05, + "loss": 0.4136, + "step": 7773 + }, + { + "epoch": 0.9460298144204442, + "grad_norm": 0.8339758515357971, + "learning_rate": 1.118227486780987e-05, + "loss": 0.3881, + "step": 7774 + }, + { + "epoch": 0.9461515059324612, + "grad_norm": 5.796790599822998, + "learning_rate": 1.1180337832470218e-05, + "loss": 0.5456, + "step": 7775 + }, + { + "epoch": 0.9462731974444782, + "grad_norm": 3.344573736190796, + "learning_rate": 1.1178400752216265e-05, + "loss": 0.4823, + "step": 7776 + }, + { + "epoch": 0.9463948889564953, + "grad_norm": 6.225649356842041, + "learning_rate": 1.1176463627121723e-05, + "loss": 0.5337, + "step": 7777 + }, + { + "epoch": 0.9465165804685123, + "grad_norm": 2.477018117904663, + "learning_rate": 1.1174526457260309e-05, + "loss": 0.4146, + "step": 7778 + }, + { + "epoch": 0.9466382719805294, + "grad_norm": 3.7175071239471436, + "learning_rate": 1.117258924270573e-05, + "loss": 0.4607, + "step": 7779 + }, + { + "epoch": 0.9467599634925464, + "grad_norm": 3.969531536102295, + "learning_rate": 1.1170651983531704e-05, + "loss": 0.5194, + "step": 7780 + }, + { + "epoch": 0.9468816550045634, + "grad_norm": 1.6964995861053467, + "learning_rate": 1.1168714679811945e-05, + "loss": 0.4055, + "step": 7781 + }, + { + "epoch": 0.9470033465165805, + "grad_norm": 1.2695993185043335, + "learning_rate": 1.1166777331620175e-05, + "loss": 0.4172, + "step": 7782 + }, + { + "epoch": 0.9471250380285975, + "grad_norm": 3.6955606937408447, + "learning_rate": 1.1164839939030112e-05, + "loss": 0.5375, + "step": 7783 + }, + { + "epoch": 0.9472467295406145, + "grad_norm": 2.6843903064727783, + "learning_rate": 1.1162902502115476e-05, + "loss": 0.5435, + "step": 7784 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 1.131152868270874, + "learning_rate": 1.1160965020949994e-05, + "loss": 0.4948, + "step": 7785 + }, + { + "epoch": 0.9474901125646487, + "grad_norm": 2.265782594680786, + "learning_rate": 1.1159027495607395e-05, + "loss": 0.4427, + "step": 7786 + }, + { + "epoch": 0.9476118040766657, + "grad_norm": 4.863142967224121, + "learning_rate": 1.1157089926161395e-05, + "loss": 0.4785, + "step": 7787 + }, + { + "epoch": 0.9477334955886827, + "grad_norm": 4.738919734954834, + "learning_rate": 1.115515231268573e-05, + "loss": 0.4153, + "step": 7788 + }, + { + "epoch": 0.9478551871006997, + "grad_norm": 5.452841281890869, + "learning_rate": 1.1153214655254126e-05, + "loss": 0.4179, + "step": 7789 + }, + { + "epoch": 0.9479768786127167, + "grad_norm": 2.430560827255249, + "learning_rate": 1.115127695394032e-05, + "loss": 0.4652, + "step": 7790 + }, + { + "epoch": 0.9480985701247338, + "grad_norm": 1.904536247253418, + "learning_rate": 1.1149339208818042e-05, + "loss": 0.4662, + "step": 7791 + }, + { + "epoch": 0.9482202616367509, + "grad_norm": 1.2570780515670776, + "learning_rate": 1.114740141996103e-05, + "loss": 0.4943, + "step": 7792 + }, + { + "epoch": 0.9483419531487679, + "grad_norm": 2.6728243827819824, + "learning_rate": 1.1145463587443016e-05, + "loss": 0.4437, + "step": 7793 + }, + { + "epoch": 0.9484636446607849, + "grad_norm": 0.8751927018165588, + "learning_rate": 1.1143525711337743e-05, + "loss": 0.4869, + "step": 7794 + }, + { + "epoch": 0.9485853361728019, + "grad_norm": 3.1249399185180664, + "learning_rate": 1.1141587791718951e-05, + "loss": 0.4167, + "step": 7795 + }, + { + "epoch": 0.948707027684819, + "grad_norm": 0.6778789162635803, + "learning_rate": 1.1139649828660377e-05, + "loss": 0.4662, + "step": 7796 + }, + { + "epoch": 0.948828719196836, + "grad_norm": 1.434187650680542, + "learning_rate": 1.1137711822235773e-05, + "loss": 0.4959, + "step": 7797 + }, + { + "epoch": 0.9489504107088531, + "grad_norm": 1.2248542308807373, + "learning_rate": 1.1135773772518877e-05, + "loss": 0.4752, + "step": 7798 + }, + { + "epoch": 0.9490721022208701, + "grad_norm": 0.7918793559074402, + "learning_rate": 1.1133835679583438e-05, + "loss": 0.3909, + "step": 7799 + }, + { + "epoch": 0.9491937937328871, + "grad_norm": 1.0940003395080566, + "learning_rate": 1.1131897543503203e-05, + "loss": 0.4337, + "step": 7800 + }, + { + "epoch": 0.9493154852449042, + "grad_norm": 0.6187695264816284, + "learning_rate": 1.1129959364351925e-05, + "loss": 0.4122, + "step": 7801 + }, + { + "epoch": 0.9494371767569212, + "grad_norm": 0.647481381893158, + "learning_rate": 1.1128021142203354e-05, + "loss": 0.4565, + "step": 7802 + }, + { + "epoch": 0.9495588682689382, + "grad_norm": 1.5043638944625854, + "learning_rate": 1.1126082877131244e-05, + "loss": 0.4489, + "step": 7803 + }, + { + "epoch": 0.9496805597809552, + "grad_norm": 3.3977267742156982, + "learning_rate": 1.1124144569209347e-05, + "loss": 0.4837, + "step": 7804 + }, + { + "epoch": 0.9498022512929724, + "grad_norm": 0.715659499168396, + "learning_rate": 1.1122206218511427e-05, + "loss": 0.4442, + "step": 7805 + }, + { + "epoch": 0.9499239428049894, + "grad_norm": 0.8718197345733643, + "learning_rate": 1.1120267825111235e-05, + "loss": 0.4297, + "step": 7806 + }, + { + "epoch": 0.9500456343170064, + "grad_norm": 0.855361819267273, + "learning_rate": 1.1118329389082532e-05, + "loss": 0.4581, + "step": 7807 + }, + { + "epoch": 0.9501673258290234, + "grad_norm": 1.6435692310333252, + "learning_rate": 1.1116390910499085e-05, + "loss": 0.4583, + "step": 7808 + }, + { + "epoch": 0.9502890173410404, + "grad_norm": 0.9460768699645996, + "learning_rate": 1.1114452389434652e-05, + "loss": 0.4686, + "step": 7809 + }, + { + "epoch": 0.9504107088530575, + "grad_norm": 2.108517646789551, + "learning_rate": 1.1112513825962994e-05, + "loss": 0.4201, + "step": 7810 + }, + { + "epoch": 0.9505324003650746, + "grad_norm": 1.9522459506988525, + "learning_rate": 1.1110575220157886e-05, + "loss": 0.4228, + "step": 7811 + }, + { + "epoch": 0.9506540918770916, + "grad_norm": 1.0424877405166626, + "learning_rate": 1.1108636572093092e-05, + "loss": 0.3555, + "step": 7812 + }, + { + "epoch": 0.9507757833891086, + "grad_norm": 0.8412451148033142, + "learning_rate": 1.1106697881842381e-05, + "loss": 0.4947, + "step": 7813 + }, + { + "epoch": 0.9508974749011256, + "grad_norm": 1.0469846725463867, + "learning_rate": 1.1104759149479525e-05, + "loss": 0.4358, + "step": 7814 + }, + { + "epoch": 0.9510191664131427, + "grad_norm": 0.6246845722198486, + "learning_rate": 1.1102820375078296e-05, + "loss": 0.4381, + "step": 7815 + }, + { + "epoch": 0.9511408579251597, + "grad_norm": 1.6863676309585571, + "learning_rate": 1.1100881558712473e-05, + "loss": 0.4764, + "step": 7816 + }, + { + "epoch": 0.9512625494371768, + "grad_norm": 2.348029851913452, + "learning_rate": 1.1098942700455823e-05, + "loss": 0.487, + "step": 7817 + }, + { + "epoch": 0.9513842409491938, + "grad_norm": 1.1672379970550537, + "learning_rate": 1.1097003800382129e-05, + "loss": 0.4594, + "step": 7818 + }, + { + "epoch": 0.9515059324612108, + "grad_norm": 1.862343430519104, + "learning_rate": 1.109506485856517e-05, + "loss": 0.5072, + "step": 7819 + }, + { + "epoch": 0.9516276239732279, + "grad_norm": 1.7099848985671997, + "learning_rate": 1.1093125875078732e-05, + "loss": 0.5179, + "step": 7820 + }, + { + "epoch": 0.9517493154852449, + "grad_norm": 1.5098011493682861, + "learning_rate": 1.1091186849996585e-05, + "loss": 0.5294, + "step": 7821 + }, + { + "epoch": 0.9518710069972619, + "grad_norm": 2.793037176132202, + "learning_rate": 1.1089247783392523e-05, + "loss": 0.4304, + "step": 7822 + }, + { + "epoch": 0.9519926985092789, + "grad_norm": 0.8674801588058472, + "learning_rate": 1.1087308675340324e-05, + "loss": 0.5023, + "step": 7823 + }, + { + "epoch": 0.952114390021296, + "grad_norm": 6.732608795166016, + "learning_rate": 1.1085369525913784e-05, + "loss": 0.4349, + "step": 7824 + }, + { + "epoch": 0.9522360815333131, + "grad_norm": 4.1696014404296875, + "learning_rate": 1.1083430335186687e-05, + "loss": 0.4973, + "step": 7825 + }, + { + "epoch": 0.9523577730453301, + "grad_norm": 5.230327129364014, + "learning_rate": 1.1081491103232822e-05, + "loss": 0.4882, + "step": 7826 + }, + { + "epoch": 0.9524794645573471, + "grad_norm": 4.096656799316406, + "learning_rate": 1.107955183012598e-05, + "loss": 0.4937, + "step": 7827 + }, + { + "epoch": 0.9526011560693641, + "grad_norm": 5.5169572830200195, + "learning_rate": 1.1077612515939959e-05, + "loss": 0.4352, + "step": 7828 + }, + { + "epoch": 0.9527228475813811, + "grad_norm": 1.130476713180542, + "learning_rate": 1.107567316074855e-05, + "loss": 0.4749, + "step": 7829 + }, + { + "epoch": 0.9528445390933983, + "grad_norm": 3.9312586784362793, + "learning_rate": 1.107373376462555e-05, + "loss": 0.4433, + "step": 7830 + }, + { + "epoch": 0.9529662306054153, + "grad_norm": 1.3090933561325073, + "learning_rate": 1.107179432764476e-05, + "loss": 0.5189, + "step": 7831 + }, + { + "epoch": 0.9530879221174323, + "grad_norm": 1.763823390007019, + "learning_rate": 1.1069854849879977e-05, + "loss": 0.384, + "step": 7832 + }, + { + "epoch": 0.9532096136294493, + "grad_norm": 1.0364371538162231, + "learning_rate": 1.1067915331405002e-05, + "loss": 0.4412, + "step": 7833 + }, + { + "epoch": 0.9533313051414664, + "grad_norm": 4.0301642417907715, + "learning_rate": 1.1065975772293635e-05, + "loss": 0.4838, + "step": 7834 + }, + { + "epoch": 0.9534529966534834, + "grad_norm": 2.8961918354034424, + "learning_rate": 1.1064036172619688e-05, + "loss": 0.4697, + "step": 7835 + }, + { + "epoch": 0.9535746881655005, + "grad_norm": 4.301506042480469, + "learning_rate": 1.1062096532456958e-05, + "loss": 0.4825, + "step": 7836 + }, + { + "epoch": 0.9536963796775175, + "grad_norm": 4.657220363616943, + "learning_rate": 1.1060156851879257e-05, + "loss": 0.5186, + "step": 7837 + }, + { + "epoch": 0.9538180711895345, + "grad_norm": 0.9875574707984924, + "learning_rate": 1.1058217130960398e-05, + "loss": 0.3917, + "step": 7838 + }, + { + "epoch": 0.9539397627015516, + "grad_norm": 3.456489086151123, + "learning_rate": 1.1056277369774186e-05, + "loss": 0.4665, + "step": 7839 + }, + { + "epoch": 0.9540614542135686, + "grad_norm": 2.8122589588165283, + "learning_rate": 1.105433756839443e-05, + "loss": 0.4774, + "step": 7840 + }, + { + "epoch": 0.9541831457255856, + "grad_norm": 0.8617849946022034, + "learning_rate": 1.1052397726894949e-05, + "loss": 0.3969, + "step": 7841 + }, + { + "epoch": 0.9543048372376027, + "grad_norm": 0.8713522553443909, + "learning_rate": 1.1050457845349557e-05, + "loss": 0.4523, + "step": 7842 + }, + { + "epoch": 0.9544265287496198, + "grad_norm": 1.827952265739441, + "learning_rate": 1.104851792383207e-05, + "loss": 0.4539, + "step": 7843 + }, + { + "epoch": 0.9545482202616368, + "grad_norm": 1.5133836269378662, + "learning_rate": 1.1046577962416303e-05, + "loss": 0.4833, + "step": 7844 + }, + { + "epoch": 0.9546699117736538, + "grad_norm": 2.6935362815856934, + "learning_rate": 1.1044637961176079e-05, + "loss": 0.4413, + "step": 7845 + }, + { + "epoch": 0.9547916032856708, + "grad_norm": 2.455949068069458, + "learning_rate": 1.1042697920185218e-05, + "loss": 0.466, + "step": 7846 + }, + { + "epoch": 0.9549132947976878, + "grad_norm": 3.4989287853240967, + "learning_rate": 1.1040757839517544e-05, + "loss": 0.4032, + "step": 7847 + }, + { + "epoch": 0.9550349863097048, + "grad_norm": 4.018179416656494, + "learning_rate": 1.103881771924688e-05, + "loss": 0.4783, + "step": 7848 + }, + { + "epoch": 0.955156677821722, + "grad_norm": 2.9321541786193848, + "learning_rate": 1.1036877559447052e-05, + "loss": 0.437, + "step": 7849 + }, + { + "epoch": 0.955278369333739, + "grad_norm": 1.1128114461898804, + "learning_rate": 1.1034937360191887e-05, + "loss": 0.489, + "step": 7850 + }, + { + "epoch": 0.955400060845756, + "grad_norm": 1.6909722089767456, + "learning_rate": 1.103299712155521e-05, + "loss": 0.4762, + "step": 7851 + }, + { + "epoch": 0.955521752357773, + "grad_norm": 1.436305284500122, + "learning_rate": 1.1031056843610856e-05, + "loss": 0.437, + "step": 7852 + }, + { + "epoch": 0.95564344386979, + "grad_norm": 2.8903472423553467, + "learning_rate": 1.1029116526432655e-05, + "loss": 0.5151, + "step": 7853 + }, + { + "epoch": 0.9557651353818071, + "grad_norm": 1.2533063888549805, + "learning_rate": 1.1027176170094442e-05, + "loss": 0.4662, + "step": 7854 + }, + { + "epoch": 0.9558868268938242, + "grad_norm": 0.7719179391860962, + "learning_rate": 1.1025235774670048e-05, + "loss": 0.417, + "step": 7855 + }, + { + "epoch": 0.9560085184058412, + "grad_norm": 2.024005174636841, + "learning_rate": 1.102329534023331e-05, + "loss": 0.4928, + "step": 7856 + }, + { + "epoch": 0.9561302099178582, + "grad_norm": 2.4934709072113037, + "learning_rate": 1.1021354866858067e-05, + "loss": 0.4653, + "step": 7857 + }, + { + "epoch": 0.9562519014298753, + "grad_norm": 1.243676781654358, + "learning_rate": 1.1019414354618158e-05, + "loss": 0.3537, + "step": 7858 + }, + { + "epoch": 0.9563735929418923, + "grad_norm": 2.9375689029693604, + "learning_rate": 1.101747380358742e-05, + "loss": 0.5234, + "step": 7859 + }, + { + "epoch": 0.9564952844539093, + "grad_norm": 0.78957599401474, + "learning_rate": 1.10155332138397e-05, + "loss": 0.4446, + "step": 7860 + }, + { + "epoch": 0.9566169759659264, + "grad_norm": 0.7856225967407227, + "learning_rate": 1.101359258544884e-05, + "loss": 0.4417, + "step": 7861 + }, + { + "epoch": 0.9567386674779434, + "grad_norm": 0.7760050296783447, + "learning_rate": 1.1011651918488683e-05, + "loss": 0.4326, + "step": 7862 + }, + { + "epoch": 0.9568603589899605, + "grad_norm": 1.8795158863067627, + "learning_rate": 1.1009711213033076e-05, + "loss": 0.5253, + "step": 7863 + }, + { + "epoch": 0.9569820505019775, + "grad_norm": 1.5947014093399048, + "learning_rate": 1.1007770469155865e-05, + "loss": 0.5076, + "step": 7864 + }, + { + "epoch": 0.9571037420139945, + "grad_norm": 1.5609192848205566, + "learning_rate": 1.1005829686930906e-05, + "loss": 0.4749, + "step": 7865 + }, + { + "epoch": 0.9572254335260115, + "grad_norm": 1.0820679664611816, + "learning_rate": 1.1003888866432047e-05, + "loss": 0.5074, + "step": 7866 + }, + { + "epoch": 0.9573471250380285, + "grad_norm": 1.0442790985107422, + "learning_rate": 1.1001948007733135e-05, + "loss": 0.478, + "step": 7867 + }, + { + "epoch": 0.9574688165500457, + "grad_norm": 1.3109970092773438, + "learning_rate": 1.1000007110908025e-05, + "loss": 0.4551, + "step": 7868 + }, + { + "epoch": 0.9575905080620627, + "grad_norm": 1.043371558189392, + "learning_rate": 1.099806617603058e-05, + "loss": 0.461, + "step": 7869 + }, + { + "epoch": 0.9577121995740797, + "grad_norm": 4.1665778160095215, + "learning_rate": 1.0996125203174645e-05, + "loss": 0.4379, + "step": 7870 + }, + { + "epoch": 0.9578338910860967, + "grad_norm": 0.6852086186408997, + "learning_rate": 1.0994184192414088e-05, + "loss": 0.4537, + "step": 7871 + }, + { + "epoch": 0.9579555825981138, + "grad_norm": 0.8128134608268738, + "learning_rate": 1.0992243143822764e-05, + "loss": 0.4858, + "step": 7872 + }, + { + "epoch": 0.9580772741101308, + "grad_norm": 0.9471670389175415, + "learning_rate": 1.0990302057474537e-05, + "loss": 0.4835, + "step": 7873 + }, + { + "epoch": 0.9581989656221479, + "grad_norm": 0.9546118974685669, + "learning_rate": 1.0988360933443264e-05, + "loss": 0.4539, + "step": 7874 + }, + { + "epoch": 0.9583206571341649, + "grad_norm": 0.7892980575561523, + "learning_rate": 1.0986419771802812e-05, + "loss": 0.446, + "step": 7875 + }, + { + "epoch": 0.9584423486461819, + "grad_norm": 2.0440328121185303, + "learning_rate": 1.0984478572627049e-05, + "loss": 0.4483, + "step": 7876 + }, + { + "epoch": 0.958564040158199, + "grad_norm": 2.138730525970459, + "learning_rate": 1.0982537335989833e-05, + "loss": 0.4118, + "step": 7877 + }, + { + "epoch": 0.958685731670216, + "grad_norm": 0.5984135270118713, + "learning_rate": 1.0980596061965043e-05, + "loss": 0.4379, + "step": 7878 + }, + { + "epoch": 0.958807423182233, + "grad_norm": 2.018582820892334, + "learning_rate": 1.0978654750626538e-05, + "loss": 0.4464, + "step": 7879 + }, + { + "epoch": 0.9589291146942501, + "grad_norm": 1.2306653261184692, + "learning_rate": 1.09767134020482e-05, + "loss": 0.4465, + "step": 7880 + }, + { + "epoch": 0.9590508062062671, + "grad_norm": 1.4559270143508911, + "learning_rate": 1.0974772016303889e-05, + "loss": 0.4586, + "step": 7881 + }, + { + "epoch": 0.9591724977182842, + "grad_norm": 2.112050771713257, + "learning_rate": 1.097283059346749e-05, + "loss": 0.4912, + "step": 7882 + }, + { + "epoch": 0.9592941892303012, + "grad_norm": 2.625102996826172, + "learning_rate": 1.097088913361287e-05, + "loss": 0.5085, + "step": 7883 + }, + { + "epoch": 0.9594158807423182, + "grad_norm": 0.7203896045684814, + "learning_rate": 1.0968947636813913e-05, + "loss": 0.4644, + "step": 7884 + }, + { + "epoch": 0.9595375722543352, + "grad_norm": 0.7176701426506042, + "learning_rate": 1.0967006103144488e-05, + "loss": 0.4436, + "step": 7885 + }, + { + "epoch": 0.9596592637663522, + "grad_norm": 1.31585693359375, + "learning_rate": 1.0965064532678483e-05, + "loss": 0.4951, + "step": 7886 + }, + { + "epoch": 0.9597809552783694, + "grad_norm": 2.936957359313965, + "learning_rate": 1.096312292548977e-05, + "loss": 0.4465, + "step": 7887 + }, + { + "epoch": 0.9599026467903864, + "grad_norm": 1.3865190744400024, + "learning_rate": 1.096118128165224e-05, + "loss": 0.5442, + "step": 7888 + }, + { + "epoch": 0.9600243383024034, + "grad_norm": 2.1573688983917236, + "learning_rate": 1.0959239601239773e-05, + "loss": 0.4627, + "step": 7889 + }, + { + "epoch": 0.9601460298144204, + "grad_norm": 2.1060612201690674, + "learning_rate": 1.0957297884326252e-05, + "loss": 0.5051, + "step": 7890 + }, + { + "epoch": 0.9602677213264375, + "grad_norm": 3.0208218097686768, + "learning_rate": 1.0955356130985566e-05, + "loss": 0.4453, + "step": 7891 + }, + { + "epoch": 0.9603894128384545, + "grad_norm": 3.036423444747925, + "learning_rate": 1.0953414341291602e-05, + "loss": 0.4282, + "step": 7892 + }, + { + "epoch": 0.9605111043504716, + "grad_norm": 4.146114826202393, + "learning_rate": 1.0951472515318249e-05, + "loss": 0.3671, + "step": 7893 + }, + { + "epoch": 0.9606327958624886, + "grad_norm": 1.8409923315048218, + "learning_rate": 1.0949530653139395e-05, + "loss": 0.4574, + "step": 7894 + }, + { + "epoch": 0.9607544873745056, + "grad_norm": 0.992895781993866, + "learning_rate": 1.0947588754828937e-05, + "loss": 0.452, + "step": 7895 + }, + { + "epoch": 0.9608761788865227, + "grad_norm": 3.341982364654541, + "learning_rate": 1.0945646820460765e-05, + "loss": 0.4647, + "step": 7896 + }, + { + "epoch": 0.9609978703985397, + "grad_norm": 0.7378147840499878, + "learning_rate": 1.0943704850108774e-05, + "loss": 0.3954, + "step": 7897 + }, + { + "epoch": 0.9611195619105567, + "grad_norm": 0.9263473749160767, + "learning_rate": 1.0941762843846857e-05, + "loss": 0.3905, + "step": 7898 + }, + { + "epoch": 0.9612412534225738, + "grad_norm": 5.433279037475586, + "learning_rate": 1.0939820801748919e-05, + "loss": 0.5553, + "step": 7899 + }, + { + "epoch": 0.9613629449345908, + "grad_norm": 2.567471981048584, + "learning_rate": 1.093787872388885e-05, + "loss": 0.4695, + "step": 7900 + }, + { + "epoch": 0.9614846364466079, + "grad_norm": 4.649729251861572, + "learning_rate": 1.0935936610340559e-05, + "loss": 0.4914, + "step": 7901 + }, + { + "epoch": 0.9616063279586249, + "grad_norm": 2.801370859146118, + "learning_rate": 1.093399446117794e-05, + "loss": 0.4613, + "step": 7902 + }, + { + "epoch": 0.9617280194706419, + "grad_norm": 3.3774945735931396, + "learning_rate": 1.0932052276474898e-05, + "loss": 0.4858, + "step": 7903 + }, + { + "epoch": 0.9618497109826589, + "grad_norm": 1.2636725902557373, + "learning_rate": 1.0930110056305339e-05, + "loss": 0.4887, + "step": 7904 + }, + { + "epoch": 0.9619714024946759, + "grad_norm": 1.8090736865997314, + "learning_rate": 1.0928167800743164e-05, + "loss": 0.4384, + "step": 7905 + }, + { + "epoch": 0.9620930940066931, + "grad_norm": 0.94919753074646, + "learning_rate": 1.0926225509862288e-05, + "loss": 0.4764, + "step": 7906 + }, + { + "epoch": 0.9622147855187101, + "grad_norm": 2.3158748149871826, + "learning_rate": 1.0924283183736613e-05, + "loss": 0.4355, + "step": 7907 + }, + { + "epoch": 0.9623364770307271, + "grad_norm": 3.156244993209839, + "learning_rate": 1.0922340822440045e-05, + "loss": 0.4394, + "step": 7908 + }, + { + "epoch": 0.9624581685427441, + "grad_norm": 2.830498218536377, + "learning_rate": 1.0920398426046503e-05, + "loss": 0.4476, + "step": 7909 + }, + { + "epoch": 0.9625798600547611, + "grad_norm": 3.9664883613586426, + "learning_rate": 1.0918455994629898e-05, + "loss": 0.3834, + "step": 7910 + }, + { + "epoch": 0.9627015515667782, + "grad_norm": 3.5587096214294434, + "learning_rate": 1.0916513528264136e-05, + "loss": 0.4311, + "step": 7911 + }, + { + "epoch": 0.9628232430787953, + "grad_norm": 0.6019009947776794, + "learning_rate": 1.0914571027023139e-05, + "loss": 0.4633, + "step": 7912 + }, + { + "epoch": 0.9629449345908123, + "grad_norm": 1.592553973197937, + "learning_rate": 1.0912628490980826e-05, + "loss": 0.4046, + "step": 7913 + }, + { + "epoch": 0.9630666261028293, + "grad_norm": 1.5847268104553223, + "learning_rate": 1.0910685920211106e-05, + "loss": 0.4608, + "step": 7914 + }, + { + "epoch": 0.9631883176148464, + "grad_norm": 0.9771660566329956, + "learning_rate": 1.0908743314787901e-05, + "loss": 0.4617, + "step": 7915 + }, + { + "epoch": 0.9633100091268634, + "grad_norm": 1.221008062362671, + "learning_rate": 1.0906800674785132e-05, + "loss": 0.4406, + "step": 7916 + }, + { + "epoch": 0.9634317006388804, + "grad_norm": 3.2355761528015137, + "learning_rate": 1.0904858000276719e-05, + "loss": 0.4787, + "step": 7917 + }, + { + "epoch": 0.9635533921508975, + "grad_norm": 2.323673963546753, + "learning_rate": 1.0902915291336594e-05, + "loss": 0.4961, + "step": 7918 + }, + { + "epoch": 0.9636750836629145, + "grad_norm": 3.9719388484954834, + "learning_rate": 1.0900972548038666e-05, + "loss": 0.5399, + "step": 7919 + }, + { + "epoch": 0.9637967751749316, + "grad_norm": 0.7563192844390869, + "learning_rate": 1.0899029770456869e-05, + "loss": 0.4525, + "step": 7920 + }, + { + "epoch": 0.9639184666869486, + "grad_norm": 1.9598678350448608, + "learning_rate": 1.0897086958665126e-05, + "loss": 0.5004, + "step": 7921 + }, + { + "epoch": 0.9640401581989656, + "grad_norm": 0.7722530364990234, + "learning_rate": 1.0895144112737372e-05, + "loss": 0.46, + "step": 7922 + }, + { + "epoch": 0.9641618497109826, + "grad_norm": 0.8511223793029785, + "learning_rate": 1.0893201232747527e-05, + "loss": 0.4829, + "step": 7923 + }, + { + "epoch": 0.9642835412229996, + "grad_norm": 0.705536425113678, + "learning_rate": 1.089125831876953e-05, + "loss": 0.4622, + "step": 7924 + }, + { + "epoch": 0.9644052327350168, + "grad_norm": 3.8996047973632812, + "learning_rate": 1.088931537087731e-05, + "loss": 0.4224, + "step": 7925 + }, + { + "epoch": 0.9645269242470338, + "grad_norm": 1.6934938430786133, + "learning_rate": 1.0887372389144797e-05, + "loss": 0.4959, + "step": 7926 + }, + { + "epoch": 0.9646486157590508, + "grad_norm": 2.820828676223755, + "learning_rate": 1.0885429373645928e-05, + "loss": 0.4182, + "step": 7927 + }, + { + "epoch": 0.9647703072710678, + "grad_norm": 6.752305507659912, + "learning_rate": 1.0883486324454637e-05, + "loss": 0.4019, + "step": 7928 + }, + { + "epoch": 0.9648919987830848, + "grad_norm": 3.006600856781006, + "learning_rate": 1.0881543241644864e-05, + "loss": 0.4482, + "step": 7929 + }, + { + "epoch": 0.9650136902951019, + "grad_norm": 2.8017425537109375, + "learning_rate": 1.087960012529055e-05, + "loss": 0.4382, + "step": 7930 + }, + { + "epoch": 0.965135381807119, + "grad_norm": 4.913672924041748, + "learning_rate": 1.0877656975465625e-05, + "loss": 0.4139, + "step": 7931 + }, + { + "epoch": 0.965257073319136, + "grad_norm": 2.5266127586364746, + "learning_rate": 1.0875713792244038e-05, + "loss": 0.4548, + "step": 7932 + }, + { + "epoch": 0.965378764831153, + "grad_norm": 2.3445112705230713, + "learning_rate": 1.087377057569973e-05, + "loss": 0.3933, + "step": 7933 + }, + { + "epoch": 0.96550045634317, + "grad_norm": 1.4738410711288452, + "learning_rate": 1.0871827325906638e-05, + "loss": 0.4935, + "step": 7934 + }, + { + "epoch": 0.9656221478551871, + "grad_norm": 3.2260611057281494, + "learning_rate": 1.0869884042938714e-05, + "loss": 0.4784, + "step": 7935 + }, + { + "epoch": 0.9657438393672041, + "grad_norm": 0.6361846327781677, + "learning_rate": 1.0867940726869903e-05, + "loss": 0.4354, + "step": 7936 + }, + { + "epoch": 0.9658655308792212, + "grad_norm": 4.284882545471191, + "learning_rate": 1.086599737777415e-05, + "loss": 0.5149, + "step": 7937 + }, + { + "epoch": 0.9659872223912382, + "grad_norm": 3.0723934173583984, + "learning_rate": 1.0864053995725405e-05, + "loss": 0.4787, + "step": 7938 + }, + { + "epoch": 0.9661089139032553, + "grad_norm": 1.0320440530776978, + "learning_rate": 1.0862110580797615e-05, + "loss": 0.459, + "step": 7939 + }, + { + "epoch": 0.9662306054152723, + "grad_norm": 2.403982639312744, + "learning_rate": 1.0860167133064737e-05, + "loss": 0.4482, + "step": 7940 + }, + { + "epoch": 0.9663522969272893, + "grad_norm": 1.6134270429611206, + "learning_rate": 1.0858223652600717e-05, + "loss": 0.4297, + "step": 7941 + }, + { + "epoch": 0.9664739884393063, + "grad_norm": 1.1011158227920532, + "learning_rate": 1.085628013947951e-05, + "loss": 0.4411, + "step": 7942 + }, + { + "epoch": 0.9665956799513234, + "grad_norm": 2.1511731147766113, + "learning_rate": 1.085433659377507e-05, + "loss": 0.4684, + "step": 7943 + }, + { + "epoch": 0.9667173714633405, + "grad_norm": 1.5668193101882935, + "learning_rate": 1.0852393015561356e-05, + "loss": 0.3859, + "step": 7944 + }, + { + "epoch": 0.9668390629753575, + "grad_norm": 1.4748427867889404, + "learning_rate": 1.0850449404912323e-05, + "loss": 0.4024, + "step": 7945 + }, + { + "epoch": 0.9669607544873745, + "grad_norm": 0.9931533336639404, + "learning_rate": 1.0848505761901926e-05, + "loss": 0.4947, + "step": 7946 + }, + { + "epoch": 0.9670824459993915, + "grad_norm": 1.8887418508529663, + "learning_rate": 1.0846562086604135e-05, + "loss": 0.4781, + "step": 7947 + }, + { + "epoch": 0.9672041375114085, + "grad_norm": 1.539760947227478, + "learning_rate": 1.0844618379092901e-05, + "loss": 0.4573, + "step": 7948 + }, + { + "epoch": 0.9673258290234256, + "grad_norm": 0.7642394304275513, + "learning_rate": 1.084267463944219e-05, + "loss": 0.4918, + "step": 7949 + }, + { + "epoch": 0.9674475205354427, + "grad_norm": 0.912656843662262, + "learning_rate": 1.0840730867725964e-05, + "loss": 0.4864, + "step": 7950 + }, + { + "epoch": 0.9675692120474597, + "grad_norm": 1.3620154857635498, + "learning_rate": 1.0838787064018187e-05, + "loss": 0.3881, + "step": 7951 + }, + { + "epoch": 0.9676909035594767, + "grad_norm": 1.8657649755477905, + "learning_rate": 1.0836843228392831e-05, + "loss": 0.5264, + "step": 7952 + }, + { + "epoch": 0.9678125950714938, + "grad_norm": 2.46170711517334, + "learning_rate": 1.0834899360923853e-05, + "loss": 0.4703, + "step": 7953 + }, + { + "epoch": 0.9679342865835108, + "grad_norm": 1.167879581451416, + "learning_rate": 1.0832955461685228e-05, + "loss": 0.3924, + "step": 7954 + }, + { + "epoch": 0.9680559780955278, + "grad_norm": 0.729956328868866, + "learning_rate": 1.0831011530750923e-05, + "loss": 0.4273, + "step": 7955 + }, + { + "epoch": 0.9681776696075449, + "grad_norm": 0.868808388710022, + "learning_rate": 1.0829067568194911e-05, + "loss": 0.4107, + "step": 7956 + }, + { + "epoch": 0.9682993611195619, + "grad_norm": 0.6682521104812622, + "learning_rate": 1.082712357409116e-05, + "loss": 0.4158, + "step": 7957 + }, + { + "epoch": 0.968421052631579, + "grad_norm": 1.4129011631011963, + "learning_rate": 1.0825179548513644e-05, + "loss": 0.4656, + "step": 7958 + }, + { + "epoch": 0.968542744143596, + "grad_norm": 1.1172596216201782, + "learning_rate": 1.082323549153634e-05, + "loss": 0.4157, + "step": 7959 + }, + { + "epoch": 0.968664435655613, + "grad_norm": 0.6368606686592102, + "learning_rate": 1.0821291403233226e-05, + "loss": 0.4312, + "step": 7960 + }, + { + "epoch": 0.96878612716763, + "grad_norm": 3.5384790897369385, + "learning_rate": 1.0819347283678268e-05, + "loss": 0.5049, + "step": 7961 + }, + { + "epoch": 0.9689078186796471, + "grad_norm": 2.799133539199829, + "learning_rate": 1.081740313294545e-05, + "loss": 0.4962, + "step": 7962 + }, + { + "epoch": 0.9690295101916642, + "grad_norm": 1.3229289054870605, + "learning_rate": 1.0815458951108753e-05, + "loss": 0.4062, + "step": 7963 + }, + { + "epoch": 0.9691512017036812, + "grad_norm": 0.7160334587097168, + "learning_rate": 1.0813514738242154e-05, + "loss": 0.4056, + "step": 7964 + }, + { + "epoch": 0.9692728932156982, + "grad_norm": 1.8991798162460327, + "learning_rate": 1.0811570494419636e-05, + "loss": 0.3743, + "step": 7965 + }, + { + "epoch": 0.9693945847277152, + "grad_norm": 0.9729048013687134, + "learning_rate": 1.080962621971518e-05, + "loss": 0.4048, + "step": 7966 + }, + { + "epoch": 0.9695162762397322, + "grad_norm": 2.3653435707092285, + "learning_rate": 1.0807681914202773e-05, + "loss": 0.4834, + "step": 7967 + }, + { + "epoch": 0.9696379677517493, + "grad_norm": 4.200676441192627, + "learning_rate": 1.0805737577956393e-05, + "loss": 0.5534, + "step": 7968 + }, + { + "epoch": 0.9697596592637664, + "grad_norm": 0.9078891277313232, + "learning_rate": 1.0803793211050032e-05, + "loss": 0.4479, + "step": 7969 + }, + { + "epoch": 0.9698813507757834, + "grad_norm": 1.4626671075820923, + "learning_rate": 1.0801848813557677e-05, + "loss": 0.4325, + "step": 7970 + }, + { + "epoch": 0.9700030422878004, + "grad_norm": 1.7735768556594849, + "learning_rate": 1.0799904385553315e-05, + "loss": 0.4878, + "step": 7971 + }, + { + "epoch": 0.9701247337998175, + "grad_norm": 2.8348875045776367, + "learning_rate": 1.0797959927110934e-05, + "loss": 0.423, + "step": 7972 + }, + { + "epoch": 0.9702464253118345, + "grad_norm": 2.85650897026062, + "learning_rate": 1.0796015438304526e-05, + "loss": 0.4507, + "step": 7973 + }, + { + "epoch": 0.9703681168238515, + "grad_norm": 3.913283109664917, + "learning_rate": 1.0794070919208084e-05, + "loss": 0.4315, + "step": 7974 + }, + { + "epoch": 0.9704898083358686, + "grad_norm": 4.523223400115967, + "learning_rate": 1.0792126369895599e-05, + "loss": 0.3945, + "step": 7975 + }, + { + "epoch": 0.9706114998478856, + "grad_norm": 2.4726908206939697, + "learning_rate": 1.0790181790441069e-05, + "loss": 0.4747, + "step": 7976 + }, + { + "epoch": 0.9707331913599027, + "grad_norm": 1.4418996572494507, + "learning_rate": 1.0788237180918481e-05, + "loss": 0.4491, + "step": 7977 + }, + { + "epoch": 0.9708548828719197, + "grad_norm": 2.8546974658966064, + "learning_rate": 1.0786292541401842e-05, + "loss": 0.4155, + "step": 7978 + }, + { + "epoch": 0.9709765743839367, + "grad_norm": 0.6050379276275635, + "learning_rate": 1.078434787196514e-05, + "loss": 0.4322, + "step": 7979 + }, + { + "epoch": 0.9710982658959537, + "grad_norm": 0.8127751350402832, + "learning_rate": 1.0782403172682378e-05, + "loss": 0.428, + "step": 7980 + }, + { + "epoch": 0.9712199574079708, + "grad_norm": 3.4307353496551514, + "learning_rate": 1.0780458443627558e-05, + "loss": 0.4747, + "step": 7981 + }, + { + "epoch": 0.9713416489199879, + "grad_norm": 1.7517369985580444, + "learning_rate": 1.077851368487468e-05, + "loss": 0.4546, + "step": 7982 + }, + { + "epoch": 0.9714633404320049, + "grad_norm": 2.215606212615967, + "learning_rate": 1.0776568896497744e-05, + "loss": 0.4481, + "step": 7983 + }, + { + "epoch": 0.9715850319440219, + "grad_norm": 3.4851198196411133, + "learning_rate": 1.077462407857075e-05, + "loss": 0.5047, + "step": 7984 + }, + { + "epoch": 0.9717067234560389, + "grad_norm": 1.1918259859085083, + "learning_rate": 1.0772679231167709e-05, + "loss": 0.4192, + "step": 7985 + }, + { + "epoch": 0.9718284149680559, + "grad_norm": 1.1821300983428955, + "learning_rate": 1.0770734354362627e-05, + "loss": 0.4501, + "step": 7986 + }, + { + "epoch": 0.971950106480073, + "grad_norm": 4.396644115447998, + "learning_rate": 1.0768789448229504e-05, + "loss": 0.5528, + "step": 7987 + }, + { + "epoch": 0.9720717979920901, + "grad_norm": 0.6211681962013245, + "learning_rate": 1.0766844512842351e-05, + "loss": 0.4304, + "step": 7988 + }, + { + "epoch": 0.9721934895041071, + "grad_norm": 1.9274446964263916, + "learning_rate": 1.0764899548275179e-05, + "loss": 0.3943, + "step": 7989 + }, + { + "epoch": 0.9723151810161241, + "grad_norm": 1.3689601421356201, + "learning_rate": 1.0762954554601996e-05, + "loss": 0.4719, + "step": 7990 + }, + { + "epoch": 0.9724368725281411, + "grad_norm": 2.6538727283477783, + "learning_rate": 1.0761009531896811e-05, + "loss": 0.4165, + "step": 7991 + }, + { + "epoch": 0.9725585640401582, + "grad_norm": 3.45505952835083, + "learning_rate": 1.0759064480233639e-05, + "loss": 0.3796, + "step": 7992 + }, + { + "epoch": 0.9726802555521752, + "grad_norm": 0.7931990027427673, + "learning_rate": 1.0757119399686494e-05, + "loss": 0.4602, + "step": 7993 + }, + { + "epoch": 0.9728019470641923, + "grad_norm": 2.7926878929138184, + "learning_rate": 1.0755174290329386e-05, + "loss": 0.4818, + "step": 7994 + }, + { + "epoch": 0.9729236385762093, + "grad_norm": 2.470961332321167, + "learning_rate": 1.0753229152236335e-05, + "loss": 0.4783, + "step": 7995 + }, + { + "epoch": 0.9730453300882264, + "grad_norm": 1.097302794456482, + "learning_rate": 1.0751283985481353e-05, + "loss": 0.4102, + "step": 7996 + }, + { + "epoch": 0.9731670216002434, + "grad_norm": 1.4294880628585815, + "learning_rate": 1.0749338790138464e-05, + "loss": 0.4144, + "step": 7997 + }, + { + "epoch": 0.9732887131122604, + "grad_norm": 1.4758702516555786, + "learning_rate": 1.074739356628168e-05, + "loss": 0.3761, + "step": 7998 + }, + { + "epoch": 0.9734104046242774, + "grad_norm": 3.157487392425537, + "learning_rate": 1.0745448313985026e-05, + "loss": 0.5063, + "step": 7999 + }, + { + "epoch": 0.9735320961362945, + "grad_norm": 2.099881410598755, + "learning_rate": 1.074350303332252e-05, + "loss": 0.4658, + "step": 8000 + }, + { + "epoch": 0.9736537876483116, + "grad_norm": 2.018212080001831, + "learning_rate": 1.0741557724368183e-05, + "loss": 0.4919, + "step": 8001 + }, + { + "epoch": 0.9737754791603286, + "grad_norm": 0.6164096593856812, + "learning_rate": 1.073961238719604e-05, + "loss": 0.4311, + "step": 8002 + }, + { + "epoch": 0.9738971706723456, + "grad_norm": 0.6218458414077759, + "learning_rate": 1.0737667021880117e-05, + "loss": 0.4306, + "step": 8003 + }, + { + "epoch": 0.9740188621843626, + "grad_norm": 2.3931751251220703, + "learning_rate": 1.0735721628494436e-05, + "loss": 0.3992, + "step": 8004 + }, + { + "epoch": 0.9741405536963796, + "grad_norm": 2.135546922683716, + "learning_rate": 1.0733776207113025e-05, + "loss": 0.5176, + "step": 8005 + }, + { + "epoch": 0.9742622452083967, + "grad_norm": 0.7601376175880432, + "learning_rate": 1.0731830757809908e-05, + "loss": 0.4715, + "step": 8006 + }, + { + "epoch": 0.9743839367204138, + "grad_norm": 0.591479480266571, + "learning_rate": 1.0729885280659116e-05, + "loss": 0.4278, + "step": 8007 + }, + { + "epoch": 0.9745056282324308, + "grad_norm": 1.0128854513168335, + "learning_rate": 1.0727939775734682e-05, + "loss": 0.455, + "step": 8008 + }, + { + "epoch": 0.9746273197444478, + "grad_norm": 1.5185626745224, + "learning_rate": 1.0725994243110628e-05, + "loss": 0.4675, + "step": 8009 + }, + { + "epoch": 0.9747490112564648, + "grad_norm": 0.6635894775390625, + "learning_rate": 1.0724048682860995e-05, + "loss": 0.4645, + "step": 8010 + }, + { + "epoch": 0.9748707027684819, + "grad_norm": 2.5288872718811035, + "learning_rate": 1.0722103095059806e-05, + "loss": 0.4553, + "step": 8011 + }, + { + "epoch": 0.9749923942804989, + "grad_norm": 1.3128995895385742, + "learning_rate": 1.0720157479781103e-05, + "loss": 0.464, + "step": 8012 + }, + { + "epoch": 0.975114085792516, + "grad_norm": 0.8010328412055969, + "learning_rate": 1.0718211837098915e-05, + "loss": 0.4679, + "step": 8013 + }, + { + "epoch": 0.975235777304533, + "grad_norm": 0.6297222971916199, + "learning_rate": 1.071626616708728e-05, + "loss": 0.4577, + "step": 8014 + }, + { + "epoch": 0.97535746881655, + "grad_norm": 3.365506887435913, + "learning_rate": 1.0714320469820236e-05, + "loss": 0.3953, + "step": 8015 + }, + { + "epoch": 0.9754791603285671, + "grad_norm": 1.4621787071228027, + "learning_rate": 1.0712374745371822e-05, + "loss": 0.4813, + "step": 8016 + }, + { + "epoch": 0.9756008518405841, + "grad_norm": 0.8708003163337708, + "learning_rate": 1.0710428993816073e-05, + "loss": 0.4664, + "step": 8017 + }, + { + "epoch": 0.9757225433526011, + "grad_norm": 1.6532293558120728, + "learning_rate": 1.0708483215227028e-05, + "loss": 0.4473, + "step": 8018 + }, + { + "epoch": 0.9758442348646182, + "grad_norm": 2.949024200439453, + "learning_rate": 1.070653740967873e-05, + "loss": 0.3964, + "step": 8019 + }, + { + "epoch": 0.9759659263766353, + "grad_norm": 0.9826018810272217, + "learning_rate": 1.0704591577245225e-05, + "loss": 0.43, + "step": 8020 + }, + { + "epoch": 0.9760876178886523, + "grad_norm": 1.0905530452728271, + "learning_rate": 1.0702645718000549e-05, + "loss": 0.435, + "step": 8021 + }, + { + "epoch": 0.9762093094006693, + "grad_norm": 0.9655706286430359, + "learning_rate": 1.0700699832018751e-05, + "loss": 0.418, + "step": 8022 + }, + { + "epoch": 0.9763310009126863, + "grad_norm": 0.6622583866119385, + "learning_rate": 1.069875391937387e-05, + "loss": 0.4271, + "step": 8023 + }, + { + "epoch": 0.9764526924247033, + "grad_norm": 2.9581708908081055, + "learning_rate": 1.069680798013996e-05, + "loss": 0.5099, + "step": 8024 + }, + { + "epoch": 0.9765743839367204, + "grad_norm": 1.334496021270752, + "learning_rate": 1.069486201439106e-05, + "loss": 0.4278, + "step": 8025 + }, + { + "epoch": 0.9766960754487375, + "grad_norm": 2.130648136138916, + "learning_rate": 1.0692916022201226e-05, + "loss": 0.4931, + "step": 8026 + }, + { + "epoch": 0.9768177669607545, + "grad_norm": 1.12907075881958, + "learning_rate": 1.0690970003644503e-05, + "loss": 0.4844, + "step": 8027 + }, + { + "epoch": 0.9769394584727715, + "grad_norm": 1.4944736957550049, + "learning_rate": 1.0689023958794942e-05, + "loss": 0.4369, + "step": 8028 + }, + { + "epoch": 0.9770611499847885, + "grad_norm": 1.1342177391052246, + "learning_rate": 1.0687077887726589e-05, + "loss": 0.4793, + "step": 8029 + }, + { + "epoch": 0.9771828414968056, + "grad_norm": 0.7727245688438416, + "learning_rate": 1.0685131790513502e-05, + "loss": 0.4604, + "step": 8030 + }, + { + "epoch": 0.9773045330088226, + "grad_norm": 1.1200982332229614, + "learning_rate": 1.0683185667229733e-05, + "loss": 0.4478, + "step": 8031 + }, + { + "epoch": 0.9774262245208397, + "grad_norm": 1.6211756467819214, + "learning_rate": 1.0681239517949336e-05, + "loss": 0.4423, + "step": 8032 + }, + { + "epoch": 0.9775479160328567, + "grad_norm": 3.1147689819335938, + "learning_rate": 1.0679293342746362e-05, + "loss": 0.5705, + "step": 8033 + }, + { + "epoch": 0.9776696075448738, + "grad_norm": 1.3835824728012085, + "learning_rate": 1.0677347141694874e-05, + "loss": 0.456, + "step": 8034 + }, + { + "epoch": 0.9777912990568908, + "grad_norm": 1.6693867444992065, + "learning_rate": 1.0675400914868924e-05, + "loss": 0.4734, + "step": 8035 + }, + { + "epoch": 0.9779129905689078, + "grad_norm": 2.5441606044769287, + "learning_rate": 1.0673454662342571e-05, + "loss": 0.4351, + "step": 8036 + }, + { + "epoch": 0.9780346820809248, + "grad_norm": 3.679476499557495, + "learning_rate": 1.0671508384189872e-05, + "loss": 0.4211, + "step": 8037 + }, + { + "epoch": 0.9781563735929419, + "grad_norm": 2.839444637298584, + "learning_rate": 1.0669562080484892e-05, + "loss": 0.4505, + "step": 8038 + }, + { + "epoch": 0.978278065104959, + "grad_norm": 3.4824390411376953, + "learning_rate": 1.066761575130169e-05, + "loss": 0.4268, + "step": 8039 + }, + { + "epoch": 0.978399756616976, + "grad_norm": 2.349884510040283, + "learning_rate": 1.0665669396714322e-05, + "loss": 0.4043, + "step": 8040 + }, + { + "epoch": 0.978521448128993, + "grad_norm": 1.8292826414108276, + "learning_rate": 1.0663723016796859e-05, + "loss": 0.4386, + "step": 8041 + }, + { + "epoch": 0.97864313964101, + "grad_norm": 0.918819785118103, + "learning_rate": 1.066177661162336e-05, + "loss": 0.4734, + "step": 8042 + }, + { + "epoch": 0.978764831153027, + "grad_norm": 1.3980753421783447, + "learning_rate": 1.0659830181267891e-05, + "loss": 0.3627, + "step": 8043 + }, + { + "epoch": 0.9788865226650442, + "grad_norm": 3.8002572059631348, + "learning_rate": 1.0657883725804518e-05, + "loss": 0.4821, + "step": 8044 + }, + { + "epoch": 0.9790082141770612, + "grad_norm": 1.8831466436386108, + "learning_rate": 1.0655937245307308e-05, + "loss": 0.4471, + "step": 8045 + }, + { + "epoch": 0.9791299056890782, + "grad_norm": 2.864485502243042, + "learning_rate": 1.065399073985033e-05, + "loss": 0.4561, + "step": 8046 + }, + { + "epoch": 0.9792515972010952, + "grad_norm": 3.2202043533325195, + "learning_rate": 1.0652044209507648e-05, + "loss": 0.5084, + "step": 8047 + }, + { + "epoch": 0.9793732887131122, + "grad_norm": 3.6493804454803467, + "learning_rate": 1.0650097654353335e-05, + "loss": 0.4981, + "step": 8048 + }, + { + "epoch": 0.9794949802251293, + "grad_norm": 2.2826626300811768, + "learning_rate": 1.0648151074461459e-05, + "loss": 0.4728, + "step": 8049 + }, + { + "epoch": 0.9796166717371463, + "grad_norm": 1.0844839811325073, + "learning_rate": 1.0646204469906096e-05, + "loss": 0.4585, + "step": 8050 + }, + { + "epoch": 0.9797383632491634, + "grad_norm": 2.003732204437256, + "learning_rate": 1.0644257840761317e-05, + "loss": 0.4725, + "step": 8051 + }, + { + "epoch": 0.9798600547611804, + "grad_norm": 1.4262032508850098, + "learning_rate": 1.0642311187101189e-05, + "loss": 0.4778, + "step": 8052 + }, + { + "epoch": 0.9799817462731975, + "grad_norm": 1.154963493347168, + "learning_rate": 1.064036450899979e-05, + "loss": 0.457, + "step": 8053 + }, + { + "epoch": 0.9801034377852145, + "grad_norm": 1.2797378301620483, + "learning_rate": 1.06384178065312e-05, + "loss": 0.4689, + "step": 8054 + }, + { + "epoch": 0.9802251292972315, + "grad_norm": 1.3024284839630127, + "learning_rate": 1.0636471079769488e-05, + "loss": 0.4614, + "step": 8055 + }, + { + "epoch": 0.9803468208092485, + "grad_norm": 3.3173351287841797, + "learning_rate": 1.0634524328788736e-05, + "loss": 0.4246, + "step": 8056 + }, + { + "epoch": 0.9804685123212656, + "grad_norm": 3.05373477935791, + "learning_rate": 1.0632577553663019e-05, + "loss": 0.4104, + "step": 8057 + }, + { + "epoch": 0.9805902038332827, + "grad_norm": 1.5117512941360474, + "learning_rate": 1.0630630754466419e-05, + "loss": 0.5033, + "step": 8058 + }, + { + "epoch": 0.9807118953452997, + "grad_norm": 1.2367396354675293, + "learning_rate": 1.0628683931273009e-05, + "loss": 0.5227, + "step": 8059 + }, + { + "epoch": 0.9808335868573167, + "grad_norm": 1.2027482986450195, + "learning_rate": 1.0626737084156878e-05, + "loss": 0.4124, + "step": 8060 + }, + { + "epoch": 0.9809552783693337, + "grad_norm": 0.9053662419319153, + "learning_rate": 1.0624790213192102e-05, + "loss": 0.4552, + "step": 8061 + }, + { + "epoch": 0.9810769698813507, + "grad_norm": 1.1773267984390259, + "learning_rate": 1.0622843318452767e-05, + "loss": 0.466, + "step": 8062 + }, + { + "epoch": 0.9811986613933679, + "grad_norm": 2.4418399333953857, + "learning_rate": 1.0620896400012952e-05, + "loss": 0.4115, + "step": 8063 + }, + { + "epoch": 0.9813203529053849, + "grad_norm": 1.4186867475509644, + "learning_rate": 1.0618949457946743e-05, + "loss": 0.4203, + "step": 8064 + }, + { + "epoch": 0.9814420444174019, + "grad_norm": 0.6780378818511963, + "learning_rate": 1.0617002492328228e-05, + "loss": 0.4326, + "step": 8065 + }, + { + "epoch": 0.9815637359294189, + "grad_norm": 0.7168206572532654, + "learning_rate": 1.0615055503231491e-05, + "loss": 0.4493, + "step": 8066 + }, + { + "epoch": 0.9816854274414359, + "grad_norm": 0.6558755040168762, + "learning_rate": 1.0613108490730617e-05, + "loss": 0.3912, + "step": 8067 + }, + { + "epoch": 0.981807118953453, + "grad_norm": 2.957249641418457, + "learning_rate": 1.06111614548997e-05, + "loss": 0.4703, + "step": 8068 + }, + { + "epoch": 0.98192881046547, + "grad_norm": 0.7346828579902649, + "learning_rate": 1.0609214395812821e-05, + "loss": 0.4539, + "step": 8069 + }, + { + "epoch": 0.9820505019774871, + "grad_norm": 0.9538126587867737, + "learning_rate": 1.0607267313544074e-05, + "loss": 0.4513, + "step": 8070 + }, + { + "epoch": 0.9821721934895041, + "grad_norm": 2.4194936752319336, + "learning_rate": 1.0605320208167549e-05, + "loss": 0.3865, + "step": 8071 + }, + { + "epoch": 0.9822938850015211, + "grad_norm": 1.0028941631317139, + "learning_rate": 1.0603373079757338e-05, + "loss": 0.4133, + "step": 8072 + }, + { + "epoch": 0.9824155765135382, + "grad_norm": 2.4903273582458496, + "learning_rate": 1.0601425928387533e-05, + "loss": 0.3828, + "step": 8073 + }, + { + "epoch": 0.9825372680255552, + "grad_norm": 1.5925891399383545, + "learning_rate": 1.0599478754132224e-05, + "loss": 0.501, + "step": 8074 + }, + { + "epoch": 0.9826589595375722, + "grad_norm": 1.3841291666030884, + "learning_rate": 1.0597531557065508e-05, + "loss": 0.442, + "step": 8075 + }, + { + "epoch": 0.9827806510495893, + "grad_norm": 1.2095823287963867, + "learning_rate": 1.0595584337261483e-05, + "loss": 0.494, + "step": 8076 + }, + { + "epoch": 0.9829023425616064, + "grad_norm": 1.2328397035598755, + "learning_rate": 1.0593637094794236e-05, + "loss": 0.4506, + "step": 8077 + }, + { + "epoch": 0.9830240340736234, + "grad_norm": 0.9837632179260254, + "learning_rate": 1.059168982973787e-05, + "loss": 0.4636, + "step": 8078 + }, + { + "epoch": 0.9831457255856404, + "grad_norm": 1.1830470561981201, + "learning_rate": 1.0589742542166482e-05, + "loss": 0.4417, + "step": 8079 + }, + { + "epoch": 0.9832674170976574, + "grad_norm": 1.3787841796875, + "learning_rate": 1.0587795232154174e-05, + "loss": 0.4026, + "step": 8080 + }, + { + "epoch": 0.9833891086096744, + "grad_norm": 2.046830892562866, + "learning_rate": 1.0585847899775034e-05, + "loss": 0.5416, + "step": 8081 + }, + { + "epoch": 0.9835108001216916, + "grad_norm": 1.3431073427200317, + "learning_rate": 1.0583900545103171e-05, + "loss": 0.4506, + "step": 8082 + }, + { + "epoch": 0.9836324916337086, + "grad_norm": 3.117535352706909, + "learning_rate": 1.0581953168212684e-05, + "loss": 0.4619, + "step": 8083 + }, + { + "epoch": 0.9837541831457256, + "grad_norm": 3.2632980346679688, + "learning_rate": 1.0580005769177674e-05, + "loss": 0.5134, + "step": 8084 + }, + { + "epoch": 0.9838758746577426, + "grad_norm": 0.6580621004104614, + "learning_rate": 1.0578058348072247e-05, + "loss": 0.468, + "step": 8085 + }, + { + "epoch": 0.9839975661697596, + "grad_norm": 3.4985034465789795, + "learning_rate": 1.05761109049705e-05, + "loss": 0.3818, + "step": 8086 + }, + { + "epoch": 0.9841192576817767, + "grad_norm": 0.6641891598701477, + "learning_rate": 1.0574163439946541e-05, + "loss": 0.4628, + "step": 8087 + }, + { + "epoch": 0.9842409491937937, + "grad_norm": 2.8652753829956055, + "learning_rate": 1.0572215953074475e-05, + "loss": 0.4703, + "step": 8088 + }, + { + "epoch": 0.9843626407058108, + "grad_norm": 1.4694719314575195, + "learning_rate": 1.0570268444428406e-05, + "loss": 0.4086, + "step": 8089 + }, + { + "epoch": 0.9844843322178278, + "grad_norm": 0.5935743451118469, + "learning_rate": 1.0568320914082444e-05, + "loss": 0.4627, + "step": 8090 + }, + { + "epoch": 0.9846060237298448, + "grad_norm": 0.6337292194366455, + "learning_rate": 1.0566373362110695e-05, + "loss": 0.4272, + "step": 8091 + }, + { + "epoch": 0.9847277152418619, + "grad_norm": 1.4187356233596802, + "learning_rate": 1.0564425788587269e-05, + "loss": 0.3871, + "step": 8092 + }, + { + "epoch": 0.9848494067538789, + "grad_norm": 3.1834704875946045, + "learning_rate": 1.0562478193586272e-05, + "loss": 0.3792, + "step": 8093 + }, + { + "epoch": 0.9849710982658959, + "grad_norm": 0.5928133726119995, + "learning_rate": 1.0560530577181813e-05, + "loss": 0.4406, + "step": 8094 + }, + { + "epoch": 0.985092789777913, + "grad_norm": 1.171393871307373, + "learning_rate": 1.0558582939448008e-05, + "loss": 0.476, + "step": 8095 + }, + { + "epoch": 0.98521448128993, + "grad_norm": 1.6947165727615356, + "learning_rate": 1.0556635280458964e-05, + "loss": 0.4708, + "step": 8096 + }, + { + "epoch": 0.9853361728019471, + "grad_norm": 1.3873652219772339, + "learning_rate": 1.0554687600288797e-05, + "loss": 0.4371, + "step": 8097 + }, + { + "epoch": 0.9854578643139641, + "grad_norm": 2.427231788635254, + "learning_rate": 1.0552739899011619e-05, + "loss": 0.5076, + "step": 8098 + }, + { + "epoch": 0.9855795558259811, + "grad_norm": 1.2444276809692383, + "learning_rate": 1.0550792176701543e-05, + "loss": 0.43, + "step": 8099 + }, + { + "epoch": 0.9857012473379981, + "grad_norm": 2.5057947635650635, + "learning_rate": 1.0548844433432685e-05, + "loss": 0.4122, + "step": 8100 + }, + { + "epoch": 0.9858229388500153, + "grad_norm": 0.8455339074134827, + "learning_rate": 1.0546896669279158e-05, + "loss": 0.3948, + "step": 8101 + }, + { + "epoch": 0.9859446303620323, + "grad_norm": 1.0947209596633911, + "learning_rate": 1.0544948884315085e-05, + "loss": 0.4276, + "step": 8102 + }, + { + "epoch": 0.9860663218740493, + "grad_norm": 0.5759259462356567, + "learning_rate": 1.0543001078614576e-05, + "loss": 0.4281, + "step": 8103 + }, + { + "epoch": 0.9861880133860663, + "grad_norm": 2.048048257827759, + "learning_rate": 1.0541053252251751e-05, + "loss": 0.5305, + "step": 8104 + }, + { + "epoch": 0.9863097048980833, + "grad_norm": 0.7636784315109253, + "learning_rate": 1.0539105405300731e-05, + "loss": 0.4454, + "step": 8105 + }, + { + "epoch": 0.9864313964101004, + "grad_norm": 2.2165699005126953, + "learning_rate": 1.0537157537835635e-05, + "loss": 0.515, + "step": 8106 + }, + { + "epoch": 0.9865530879221174, + "grad_norm": 1.2681456804275513, + "learning_rate": 1.0535209649930584e-05, + "loss": 0.4387, + "step": 8107 + }, + { + "epoch": 0.9866747794341345, + "grad_norm": 2.2847306728363037, + "learning_rate": 1.0533261741659697e-05, + "loss": 0.4726, + "step": 8108 + }, + { + "epoch": 0.9867964709461515, + "grad_norm": 0.9131169319152832, + "learning_rate": 1.0531313813097097e-05, + "loss": 0.4478, + "step": 8109 + }, + { + "epoch": 0.9869181624581685, + "grad_norm": 2.2885148525238037, + "learning_rate": 1.0529365864316907e-05, + "loss": 0.4037, + "step": 8110 + }, + { + "epoch": 0.9870398539701856, + "grad_norm": 0.6963400840759277, + "learning_rate": 1.0527417895393248e-05, + "loss": 0.4684, + "step": 8111 + }, + { + "epoch": 0.9871615454822026, + "grad_norm": 0.8338130712509155, + "learning_rate": 1.0525469906400248e-05, + "loss": 0.4678, + "step": 8112 + }, + { + "epoch": 0.9872832369942196, + "grad_norm": 0.8437964916229248, + "learning_rate": 1.0523521897412028e-05, + "loss": 0.3977, + "step": 8113 + }, + { + "epoch": 0.9874049285062367, + "grad_norm": 0.8176448941230774, + "learning_rate": 1.0521573868502719e-05, + "loss": 0.4377, + "step": 8114 + }, + { + "epoch": 0.9875266200182538, + "grad_norm": 0.7634590268135071, + "learning_rate": 1.0519625819746447e-05, + "loss": 0.4158, + "step": 8115 + }, + { + "epoch": 0.9876483115302708, + "grad_norm": 1.011165976524353, + "learning_rate": 1.0517677751217332e-05, + "loss": 0.4631, + "step": 8116 + }, + { + "epoch": 0.9877700030422878, + "grad_norm": 1.6365612745285034, + "learning_rate": 1.051572966298951e-05, + "loss": 0.5118, + "step": 8117 + }, + { + "epoch": 0.9878916945543048, + "grad_norm": 2.087566614151001, + "learning_rate": 1.0513781555137105e-05, + "loss": 0.4119, + "step": 8118 + }, + { + "epoch": 0.9880133860663218, + "grad_norm": 1.306759238243103, + "learning_rate": 1.0511833427734249e-05, + "loss": 0.4218, + "step": 8119 + }, + { + "epoch": 0.988135077578339, + "grad_norm": 0.6562801599502563, + "learning_rate": 1.0509885280855073e-05, + "loss": 0.441, + "step": 8120 + }, + { + "epoch": 0.988256769090356, + "grad_norm": 0.8264156579971313, + "learning_rate": 1.0507937114573703e-05, + "loss": 0.375, + "step": 8121 + }, + { + "epoch": 0.988378460602373, + "grad_norm": 1.1634833812713623, + "learning_rate": 1.0505988928964279e-05, + "loss": 0.4765, + "step": 8122 + }, + { + "epoch": 0.98850015211439, + "grad_norm": 1.3989956378936768, + "learning_rate": 1.0504040724100925e-05, + "loss": 0.4342, + "step": 8123 + }, + { + "epoch": 0.988621843626407, + "grad_norm": 0.9042601585388184, + "learning_rate": 1.0502092500057781e-05, + "loss": 0.4459, + "step": 8124 + }, + { + "epoch": 0.9887435351384241, + "grad_norm": 1.1182591915130615, + "learning_rate": 1.0500144256908977e-05, + "loss": 0.4743, + "step": 8125 + }, + { + "epoch": 0.9888652266504412, + "grad_norm": 1.599513053894043, + "learning_rate": 1.0498195994728651e-05, + "loss": 0.3931, + "step": 8126 + }, + { + "epoch": 0.9889869181624582, + "grad_norm": 1.2687733173370361, + "learning_rate": 1.0496247713590933e-05, + "loss": 0.4575, + "step": 8127 + }, + { + "epoch": 0.9891086096744752, + "grad_norm": 4.393535614013672, + "learning_rate": 1.0494299413569962e-05, + "loss": 0.5469, + "step": 8128 + }, + { + "epoch": 0.9892303011864922, + "grad_norm": 3.1072423458099365, + "learning_rate": 1.0492351094739879e-05, + "loss": 0.4975, + "step": 8129 + }, + { + "epoch": 0.9893519926985093, + "grad_norm": 1.6055103540420532, + "learning_rate": 1.0490402757174814e-05, + "loss": 0.4647, + "step": 8130 + }, + { + "epoch": 0.9894736842105263, + "grad_norm": 0.6324020028114319, + "learning_rate": 1.0488454400948911e-05, + "loss": 0.4582, + "step": 8131 + }, + { + "epoch": 0.9895953757225433, + "grad_norm": 0.8509452939033508, + "learning_rate": 1.0486506026136304e-05, + "loss": 0.5157, + "step": 8132 + }, + { + "epoch": 0.9897170672345604, + "grad_norm": 3.5176970958709717, + "learning_rate": 1.048455763281114e-05, + "loss": 0.4282, + "step": 8133 + }, + { + "epoch": 0.9898387587465775, + "grad_norm": 2.642094135284424, + "learning_rate": 1.0482609221047552e-05, + "loss": 0.4644, + "step": 8134 + }, + { + "epoch": 0.9899604502585945, + "grad_norm": 3.5484108924865723, + "learning_rate": 1.0480660790919686e-05, + "loss": 0.4293, + "step": 8135 + }, + { + "epoch": 0.9900821417706115, + "grad_norm": 1.7466529607772827, + "learning_rate": 1.0478712342501682e-05, + "loss": 0.4277, + "step": 8136 + }, + { + "epoch": 0.9902038332826285, + "grad_norm": 1.8871071338653564, + "learning_rate": 1.0476763875867682e-05, + "loss": 0.4307, + "step": 8137 + }, + { + "epoch": 0.9903255247946455, + "grad_norm": 0.8592079877853394, + "learning_rate": 1.0474815391091828e-05, + "loss": 0.4214, + "step": 8138 + }, + { + "epoch": 0.9904472163066627, + "grad_norm": 0.5436782836914062, + "learning_rate": 1.0472866888248267e-05, + "loss": 0.4249, + "step": 8139 + }, + { + "epoch": 0.9905689078186797, + "grad_norm": 2.1338112354278564, + "learning_rate": 1.0470918367411143e-05, + "loss": 0.4682, + "step": 8140 + }, + { + "epoch": 0.9906905993306967, + "grad_norm": 2.0289323329925537, + "learning_rate": 1.0468969828654598e-05, + "loss": 0.4192, + "step": 8141 + }, + { + "epoch": 0.9908122908427137, + "grad_norm": 2.854349374771118, + "learning_rate": 1.0467021272052782e-05, + "loss": 0.4502, + "step": 8142 + }, + { + "epoch": 0.9909339823547307, + "grad_norm": 0.7675443291664124, + "learning_rate": 1.0465072697679842e-05, + "loss": 0.3835, + "step": 8143 + }, + { + "epoch": 0.9910556738667478, + "grad_norm": 4.075288772583008, + "learning_rate": 1.0463124105609918e-05, + "loss": 0.4956, + "step": 8144 + }, + { + "epoch": 0.9911773653787649, + "grad_norm": 0.7009711861610413, + "learning_rate": 1.0461175495917168e-05, + "loss": 0.4574, + "step": 8145 + }, + { + "epoch": 0.9912990568907819, + "grad_norm": 2.847881317138672, + "learning_rate": 1.0459226868675734e-05, + "loss": 0.4974, + "step": 8146 + }, + { + "epoch": 0.9914207484027989, + "grad_norm": 1.2025774717330933, + "learning_rate": 1.0457278223959766e-05, + "loss": 0.4279, + "step": 8147 + }, + { + "epoch": 0.9915424399148159, + "grad_norm": 1.956728458404541, + "learning_rate": 1.0455329561843417e-05, + "loss": 0.5224, + "step": 8148 + }, + { + "epoch": 0.991664131426833, + "grad_norm": 0.6114785671234131, + "learning_rate": 1.0453380882400834e-05, + "loss": 0.425, + "step": 8149 + }, + { + "epoch": 0.99178582293885, + "grad_norm": 2.43511962890625, + "learning_rate": 1.0451432185706172e-05, + "loss": 0.4496, + "step": 8150 + }, + { + "epoch": 0.991907514450867, + "grad_norm": 1.9524028301239014, + "learning_rate": 1.0449483471833576e-05, + "loss": 0.4857, + "step": 8151 + }, + { + "epoch": 0.9920292059628841, + "grad_norm": 1.7438323497772217, + "learning_rate": 1.044753474085721e-05, + "loss": 0.4801, + "step": 8152 + }, + { + "epoch": 0.9921508974749012, + "grad_norm": 2.3419482707977295, + "learning_rate": 1.0445585992851217e-05, + "loss": 0.4749, + "step": 8153 + }, + { + "epoch": 0.9922725889869182, + "grad_norm": 2.623054027557373, + "learning_rate": 1.0443637227889756e-05, + "loss": 0.4209, + "step": 8154 + }, + { + "epoch": 0.9923942804989352, + "grad_norm": 1.219497561454773, + "learning_rate": 1.044168844604698e-05, + "loss": 0.4762, + "step": 8155 + }, + { + "epoch": 0.9925159720109522, + "grad_norm": 2.0181477069854736, + "learning_rate": 1.0439739647397046e-05, + "loss": 0.4959, + "step": 8156 + }, + { + "epoch": 0.9926376635229692, + "grad_norm": 0.9117008447647095, + "learning_rate": 1.0437790832014106e-05, + "loss": 0.5067, + "step": 8157 + }, + { + "epoch": 0.9927593550349864, + "grad_norm": 2.0771126747131348, + "learning_rate": 1.0435841999972316e-05, + "loss": 0.4743, + "step": 8158 + }, + { + "epoch": 0.9928810465470034, + "grad_norm": 0.7010003924369812, + "learning_rate": 1.0433893151345839e-05, + "loss": 0.463, + "step": 8159 + }, + { + "epoch": 0.9930027380590204, + "grad_norm": 1.738229513168335, + "learning_rate": 1.0431944286208833e-05, + "loss": 0.4638, + "step": 8160 + }, + { + "epoch": 0.9931244295710374, + "grad_norm": 0.6289927363395691, + "learning_rate": 1.0429995404635447e-05, + "loss": 0.4477, + "step": 8161 + }, + { + "epoch": 0.9932461210830544, + "grad_norm": 3.480825424194336, + "learning_rate": 1.0428046506699847e-05, + "loss": 0.5465, + "step": 8162 + }, + { + "epoch": 0.9933678125950715, + "grad_norm": 0.818274199962616, + "learning_rate": 1.0426097592476194e-05, + "loss": 0.44, + "step": 8163 + }, + { + "epoch": 0.9934895041070886, + "grad_norm": 3.45645809173584, + "learning_rate": 1.0424148662038643e-05, + "loss": 0.5326, + "step": 8164 + }, + { + "epoch": 0.9936111956191056, + "grad_norm": 0.7355402708053589, + "learning_rate": 1.0422199715461357e-05, + "loss": 0.4693, + "step": 8165 + }, + { + "epoch": 0.9937328871311226, + "grad_norm": 0.5822993516921997, + "learning_rate": 1.0420250752818502e-05, + "loss": 0.4469, + "step": 8166 + }, + { + "epoch": 0.9938545786431396, + "grad_norm": 1.0866787433624268, + "learning_rate": 1.0418301774184234e-05, + "loss": 0.486, + "step": 8167 + }, + { + "epoch": 0.9939762701551567, + "grad_norm": 4.596395969390869, + "learning_rate": 1.0416352779632714e-05, + "loss": 0.4246, + "step": 8168 + }, + { + "epoch": 0.9940979616671737, + "grad_norm": 2.685981273651123, + "learning_rate": 1.0414403769238112e-05, + "loss": 0.4356, + "step": 8169 + }, + { + "epoch": 0.9942196531791907, + "grad_norm": 3.646270513534546, + "learning_rate": 1.0412454743074588e-05, + "loss": 0.4195, + "step": 8170 + }, + { + "epoch": 0.9943413446912078, + "grad_norm": 0.9976490139961243, + "learning_rate": 1.0410505701216308e-05, + "loss": 0.4838, + "step": 8171 + }, + { + "epoch": 0.9944630362032248, + "grad_norm": 2.3518176078796387, + "learning_rate": 1.0408556643737439e-05, + "loss": 0.4278, + "step": 8172 + }, + { + "epoch": 0.9945847277152419, + "grad_norm": 1.7078113555908203, + "learning_rate": 1.040660757071214e-05, + "loss": 0.5016, + "step": 8173 + }, + { + "epoch": 0.9947064192272589, + "grad_norm": 1.3084639310836792, + "learning_rate": 1.0404658482214582e-05, + "loss": 0.4896, + "step": 8174 + }, + { + "epoch": 0.9948281107392759, + "grad_norm": 0.8833538293838501, + "learning_rate": 1.0402709378318934e-05, + "loss": 0.4566, + "step": 8175 + }, + { + "epoch": 0.9949498022512929, + "grad_norm": 0.837421178817749, + "learning_rate": 1.0400760259099356e-05, + "loss": 0.3917, + "step": 8176 + }, + { + "epoch": 0.99507149376331, + "grad_norm": 0.7357523441314697, + "learning_rate": 1.0398811124630024e-05, + "loss": 0.4046, + "step": 8177 + }, + { + "epoch": 0.9951931852753271, + "grad_norm": 1.0310944318771362, + "learning_rate": 1.0396861974985103e-05, + "loss": 0.4362, + "step": 8178 + }, + { + "epoch": 0.9953148767873441, + "grad_norm": 0.57745361328125, + "learning_rate": 1.0394912810238762e-05, + "loss": 0.4271, + "step": 8179 + }, + { + "epoch": 0.9954365682993611, + "grad_norm": 0.6754943132400513, + "learning_rate": 1.039296363046517e-05, + "loss": 0.4182, + "step": 8180 + }, + { + "epoch": 0.9955582598113781, + "grad_norm": 2.6498584747314453, + "learning_rate": 1.0391014435738498e-05, + "loss": 0.5195, + "step": 8181 + }, + { + "epoch": 0.9956799513233952, + "grad_norm": 0.6231868863105774, + "learning_rate": 1.0389065226132922e-05, + "loss": 0.3748, + "step": 8182 + }, + { + "epoch": 0.9958016428354123, + "grad_norm": 1.6310969591140747, + "learning_rate": 1.0387116001722605e-05, + "loss": 0.4782, + "step": 8183 + }, + { + "epoch": 0.9959233343474293, + "grad_norm": 1.122788429260254, + "learning_rate": 1.0385166762581722e-05, + "loss": 0.4788, + "step": 8184 + }, + { + "epoch": 0.9960450258594463, + "grad_norm": 1.01002037525177, + "learning_rate": 1.0383217508784447e-05, + "loss": 0.4427, + "step": 8185 + }, + { + "epoch": 0.9961667173714633, + "grad_norm": 0.8244919776916504, + "learning_rate": 1.0381268240404956e-05, + "loss": 0.4694, + "step": 8186 + }, + { + "epoch": 0.9962884088834804, + "grad_norm": 1.242445945739746, + "learning_rate": 1.0379318957517414e-05, + "loss": 0.4883, + "step": 8187 + }, + { + "epoch": 0.9964101003954974, + "grad_norm": 2.397307872772217, + "learning_rate": 1.0377369660196004e-05, + "loss": 0.4223, + "step": 8188 + }, + { + "epoch": 0.9965317919075144, + "grad_norm": 1.7231874465942383, + "learning_rate": 1.0375420348514898e-05, + "loss": 0.43, + "step": 8189 + }, + { + "epoch": 0.9966534834195315, + "grad_norm": 0.785193920135498, + "learning_rate": 1.037347102254827e-05, + "loss": 0.4362, + "step": 8190 + }, + { + "epoch": 0.9967751749315485, + "grad_norm": 2.0725505352020264, + "learning_rate": 1.0371521682370294e-05, + "loss": 0.3877, + "step": 8191 + }, + { + "epoch": 0.9968968664435656, + "grad_norm": 1.4224882125854492, + "learning_rate": 1.0369572328055149e-05, + "loss": 0.4718, + "step": 8192 + }, + { + "epoch": 0.9970185579555826, + "grad_norm": 0.6750485301017761, + "learning_rate": 1.0367622959677015e-05, + "loss": 0.467, + "step": 8193 + }, + { + "epoch": 0.9971402494675996, + "grad_norm": 0.8924117088317871, + "learning_rate": 1.0365673577310065e-05, + "loss": 0.4152, + "step": 8194 + }, + { + "epoch": 0.9972619409796166, + "grad_norm": 1.3113890886306763, + "learning_rate": 1.0363724181028479e-05, + "loss": 0.4625, + "step": 8195 + }, + { + "epoch": 0.9973836324916338, + "grad_norm": 2.259995460510254, + "learning_rate": 1.0361774770906434e-05, + "loss": 0.496, + "step": 8196 + }, + { + "epoch": 0.9975053240036508, + "grad_norm": 2.002284049987793, + "learning_rate": 1.0359825347018111e-05, + "loss": 0.4816, + "step": 8197 + }, + { + "epoch": 0.9976270155156678, + "grad_norm": 1.1135305166244507, + "learning_rate": 1.035787590943769e-05, + "loss": 0.4589, + "step": 8198 + }, + { + "epoch": 0.9977487070276848, + "grad_norm": 1.1858857870101929, + "learning_rate": 1.0355926458239346e-05, + "loss": 0.4791, + "step": 8199 + }, + { + "epoch": 0.9978703985397018, + "grad_norm": 0.6245574355125427, + "learning_rate": 1.0353976993497265e-05, + "loss": 0.4505, + "step": 8200 + }, + { + "epoch": 0.9979920900517188, + "grad_norm": 1.240212321281433, + "learning_rate": 1.035202751528563e-05, + "loss": 0.5072, + "step": 8201 + }, + { + "epoch": 0.998113781563736, + "grad_norm": 2.408939838409424, + "learning_rate": 1.0350078023678616e-05, + "loss": 0.4241, + "step": 8202 + }, + { + "epoch": 0.998235473075753, + "grad_norm": 1.2402812242507935, + "learning_rate": 1.0348128518750409e-05, + "loss": 0.4347, + "step": 8203 + }, + { + "epoch": 0.99835716458777, + "grad_norm": 1.7202292680740356, + "learning_rate": 1.0346179000575191e-05, + "loss": 0.425, + "step": 8204 + }, + { + "epoch": 0.998478856099787, + "grad_norm": 4.543060302734375, + "learning_rate": 1.0344229469227148e-05, + "loss": 0.4079, + "step": 8205 + }, + { + "epoch": 0.9986005476118041, + "grad_norm": 0.8375499248504639, + "learning_rate": 1.034227992478046e-05, + "loss": 0.5166, + "step": 8206 + }, + { + "epoch": 0.9987222391238211, + "grad_norm": 0.60439532995224, + "learning_rate": 1.034033036730931e-05, + "loss": 0.444, + "step": 8207 + }, + { + "epoch": 0.9988439306358381, + "grad_norm": 1.0107192993164062, + "learning_rate": 1.0338380796887888e-05, + "loss": 0.4738, + "step": 8208 + }, + { + "epoch": 0.9989656221478552, + "grad_norm": 0.79546058177948, + "learning_rate": 1.0336431213590377e-05, + "loss": 0.4708, + "step": 8209 + }, + { + "epoch": 0.9990873136598722, + "grad_norm": 0.7611692547798157, + "learning_rate": 1.033448161749096e-05, + "loss": 0.456, + "step": 8210 + }, + { + "epoch": 0.9992090051718893, + "grad_norm": 0.8569793701171875, + "learning_rate": 1.0332532008663823e-05, + "loss": 0.4847, + "step": 8211 + }, + { + "epoch": 0.9993306966839063, + "grad_norm": 1.7697551250457764, + "learning_rate": 1.0330582387183156e-05, + "loss": 0.4477, + "step": 8212 + }, + { + "epoch": 0.9994523881959233, + "grad_norm": 2.360614776611328, + "learning_rate": 1.0328632753123149e-05, + "loss": 0.4372, + "step": 8213 + }, + { + "epoch": 0.9995740797079403, + "grad_norm": 2.1063103675842285, + "learning_rate": 1.0326683106557982e-05, + "loss": 0.4407, + "step": 8214 + }, + { + "epoch": 0.9996957712199575, + "grad_norm": 0.9967697858810425, + "learning_rate": 1.0324733447561845e-05, + "loss": 0.4375, + "step": 8215 + }, + { + "epoch": 0.9998174627319745, + "grad_norm": 1.2738629579544067, + "learning_rate": 1.0322783776208932e-05, + "loss": 0.4834, + "step": 8216 + }, + { + "epoch": 0.9999391542439915, + "grad_norm": 1.358147382736206, + "learning_rate": 1.0320834092573426e-05, + "loss": 0.4268, + "step": 8217 + }, + { + "epoch": 1.0000608457560085, + "grad_norm": 2.3392598628997803, + "learning_rate": 1.031888439672952e-05, + "loss": 0.4757, + "step": 8218 + }, + { + "epoch": 1.0001825372680255, + "grad_norm": 2.130840301513672, + "learning_rate": 1.0316934688751401e-05, + "loss": 0.4377, + "step": 8219 + }, + { + "epoch": 1.0003042287800425, + "grad_norm": 2.0060911178588867, + "learning_rate": 1.0314984968713262e-05, + "loss": 0.4483, + "step": 8220 + }, + { + "epoch": 1.0004259202920596, + "grad_norm": 0.7743332386016846, + "learning_rate": 1.0313035236689293e-05, + "loss": 0.4587, + "step": 8221 + }, + { + "epoch": 1.0005476118040766, + "grad_norm": 0.8040422797203064, + "learning_rate": 1.0311085492753683e-05, + "loss": 0.4174, + "step": 8222 + }, + { + "epoch": 1.0006693033160936, + "grad_norm": 1.2777271270751953, + "learning_rate": 1.030913573698063e-05, + "loss": 0.3941, + "step": 8223 + }, + { + "epoch": 1.0007909948281108, + "grad_norm": 0.7108688354492188, + "learning_rate": 1.0307185969444322e-05, + "loss": 0.4358, + "step": 8224 + }, + { + "epoch": 1.0009126863401279, + "grad_norm": 1.1994948387145996, + "learning_rate": 1.0305236190218947e-05, + "loss": 0.4465, + "step": 8225 + }, + { + "epoch": 1.0010343778521449, + "grad_norm": 3.568629503250122, + "learning_rate": 1.0303286399378706e-05, + "loss": 0.3845, + "step": 8226 + }, + { + "epoch": 1.001156069364162, + "grad_norm": 1.0252903699874878, + "learning_rate": 1.0301336596997792e-05, + "loss": 0.4039, + "step": 8227 + }, + { + "epoch": 1.001277760876179, + "grad_norm": 0.7791546583175659, + "learning_rate": 1.0299386783150395e-05, + "loss": 0.4395, + "step": 8228 + }, + { + "epoch": 1.001399452388196, + "grad_norm": 1.6991033554077148, + "learning_rate": 1.0297436957910713e-05, + "loss": 0.4063, + "step": 8229 + }, + { + "epoch": 1.001521143900213, + "grad_norm": 0.9151224493980408, + "learning_rate": 1.0295487121352936e-05, + "loss": 0.3929, + "step": 8230 + }, + { + "epoch": 1.00164283541223, + "grad_norm": 0.763907253742218, + "learning_rate": 1.0293537273551266e-05, + "loss": 0.3901, + "step": 8231 + }, + { + "epoch": 1.001764526924247, + "grad_norm": 1.2544831037521362, + "learning_rate": 1.029158741457989e-05, + "loss": 0.3418, + "step": 8232 + }, + { + "epoch": 1.001886218436264, + "grad_norm": 2.4731099605560303, + "learning_rate": 1.0289637544513014e-05, + "loss": 0.4695, + "step": 8233 + }, + { + "epoch": 1.002007909948281, + "grad_norm": 2.8757359981536865, + "learning_rate": 1.0287687663424826e-05, + "loss": 0.4298, + "step": 8234 + }, + { + "epoch": 1.002129601460298, + "grad_norm": 0.9083685874938965, + "learning_rate": 1.0285737771389532e-05, + "loss": 0.4306, + "step": 8235 + }, + { + "epoch": 1.002251292972315, + "grad_norm": 1.6674168109893799, + "learning_rate": 1.0283787868481322e-05, + "loss": 0.4163, + "step": 8236 + }, + { + "epoch": 1.0023729844843323, + "grad_norm": 0.9981405138969421, + "learning_rate": 1.0281837954774395e-05, + "loss": 0.38, + "step": 8237 + }, + { + "epoch": 1.0024946759963493, + "grad_norm": 1.7152098417282104, + "learning_rate": 1.027988803034295e-05, + "loss": 0.4261, + "step": 8238 + }, + { + "epoch": 1.0026163675083664, + "grad_norm": 1.7241359949111938, + "learning_rate": 1.0277938095261189e-05, + "loss": 0.3885, + "step": 8239 + }, + { + "epoch": 1.0027380590203834, + "grad_norm": 1.241711974143982, + "learning_rate": 1.0275988149603305e-05, + "loss": 0.5193, + "step": 8240 + }, + { + "epoch": 1.0028597505324004, + "grad_norm": 1.847245693206787, + "learning_rate": 1.0274038193443503e-05, + "loss": 0.409, + "step": 8241 + }, + { + "epoch": 1.0029814420444174, + "grad_norm": 1.475324034690857, + "learning_rate": 1.0272088226855979e-05, + "loss": 0.3985, + "step": 8242 + }, + { + "epoch": 1.0031031335564344, + "grad_norm": 3.2274577617645264, + "learning_rate": 1.0270138249914936e-05, + "loss": 0.3604, + "step": 8243 + }, + { + "epoch": 1.0032248250684515, + "grad_norm": 0.855535626411438, + "learning_rate": 1.0268188262694571e-05, + "loss": 0.4186, + "step": 8244 + }, + { + "epoch": 1.0033465165804685, + "grad_norm": 2.6282224655151367, + "learning_rate": 1.0266238265269088e-05, + "loss": 0.3621, + "step": 8245 + }, + { + "epoch": 1.0034682080924855, + "grad_norm": 2.8923850059509277, + "learning_rate": 1.0264288257712691e-05, + "loss": 0.3931, + "step": 8246 + }, + { + "epoch": 1.0035898996045025, + "grad_norm": 1.0964852571487427, + "learning_rate": 1.0262338240099579e-05, + "loss": 0.3694, + "step": 8247 + }, + { + "epoch": 1.0037115911165195, + "grad_norm": 1.529205560684204, + "learning_rate": 1.026038821250395e-05, + "loss": 0.3994, + "step": 8248 + }, + { + "epoch": 1.0038332826285365, + "grad_norm": 1.7007806301116943, + "learning_rate": 1.0258438175000011e-05, + "loss": 0.3804, + "step": 8249 + }, + { + "epoch": 1.0039549741405538, + "grad_norm": 1.1398683786392212, + "learning_rate": 1.025648812766197e-05, + "loss": 0.3983, + "step": 8250 + }, + { + "epoch": 1.0040766656525708, + "grad_norm": 1.3885143995285034, + "learning_rate": 1.025453807056402e-05, + "loss": 0.4128, + "step": 8251 + }, + { + "epoch": 1.0041983571645878, + "grad_norm": 1.401789903640747, + "learning_rate": 1.025258800378037e-05, + "loss": 0.3435, + "step": 8252 + }, + { + "epoch": 1.0043200486766048, + "grad_norm": 3.9362881183624268, + "learning_rate": 1.0250637927385223e-05, + "loss": 0.5015, + "step": 8253 + }, + { + "epoch": 1.0044417401886219, + "grad_norm": 1.0945338010787964, + "learning_rate": 1.0248687841452787e-05, + "loss": 0.4033, + "step": 8254 + }, + { + "epoch": 1.0045634317006389, + "grad_norm": 0.976088285446167, + "learning_rate": 1.0246737746057261e-05, + "loss": 0.3983, + "step": 8255 + }, + { + "epoch": 1.004685123212656, + "grad_norm": 2.0697836875915527, + "learning_rate": 1.0244787641272852e-05, + "loss": 0.3914, + "step": 8256 + }, + { + "epoch": 1.004806814724673, + "grad_norm": 1.0286303758621216, + "learning_rate": 1.024283752717377e-05, + "loss": 0.3357, + "step": 8257 + }, + { + "epoch": 1.00492850623669, + "grad_norm": 3.1036605834960938, + "learning_rate": 1.0240887403834218e-05, + "loss": 0.4642, + "step": 8258 + }, + { + "epoch": 1.005050197748707, + "grad_norm": 0.9778620004653931, + "learning_rate": 1.0238937271328398e-05, + "loss": 0.463, + "step": 8259 + }, + { + "epoch": 1.005171889260724, + "grad_norm": 2.1090810298919678, + "learning_rate": 1.0236987129730522e-05, + "loss": 0.4064, + "step": 8260 + }, + { + "epoch": 1.005293580772741, + "grad_norm": 2.055647611618042, + "learning_rate": 1.0235036979114796e-05, + "loss": 0.437, + "step": 8261 + }, + { + "epoch": 1.0054152722847582, + "grad_norm": 1.9423773288726807, + "learning_rate": 1.0233086819555424e-05, + "loss": 0.3509, + "step": 8262 + }, + { + "epoch": 1.0055369637967753, + "grad_norm": 1.6141581535339355, + "learning_rate": 1.0231136651126616e-05, + "loss": 0.4145, + "step": 8263 + }, + { + "epoch": 1.0056586553087923, + "grad_norm": 1.7718088626861572, + "learning_rate": 1.0229186473902583e-05, + "loss": 0.3689, + "step": 8264 + }, + { + "epoch": 1.0057803468208093, + "grad_norm": 1.3700051307678223, + "learning_rate": 1.0227236287957532e-05, + "loss": 0.4207, + "step": 8265 + }, + { + "epoch": 1.0059020383328263, + "grad_norm": 1.1339055299758911, + "learning_rate": 1.0225286093365665e-05, + "loss": 0.4443, + "step": 8266 + }, + { + "epoch": 1.0060237298448433, + "grad_norm": 3.1934893131256104, + "learning_rate": 1.0223335890201195e-05, + "loss": 0.3983, + "step": 8267 + }, + { + "epoch": 1.0061454213568604, + "grad_norm": 2.673259735107422, + "learning_rate": 1.0221385678538335e-05, + "loss": 0.4212, + "step": 8268 + }, + { + "epoch": 1.0062671128688774, + "grad_norm": 1.2998101711273193, + "learning_rate": 1.0219435458451292e-05, + "loss": 0.4295, + "step": 8269 + }, + { + "epoch": 1.0063888043808944, + "grad_norm": 1.089137077331543, + "learning_rate": 1.0217485230014278e-05, + "loss": 0.419, + "step": 8270 + }, + { + "epoch": 1.0065104958929114, + "grad_norm": 1.3858327865600586, + "learning_rate": 1.0215534993301496e-05, + "loss": 0.4549, + "step": 8271 + }, + { + "epoch": 1.0066321874049284, + "grad_norm": 2.626344680786133, + "learning_rate": 1.0213584748387163e-05, + "loss": 0.3498, + "step": 8272 + }, + { + "epoch": 1.0067538789169455, + "grad_norm": 1.2461892366409302, + "learning_rate": 1.0211634495345488e-05, + "loss": 0.4532, + "step": 8273 + }, + { + "epoch": 1.0068755704289625, + "grad_norm": 1.191898226737976, + "learning_rate": 1.0209684234250683e-05, + "loss": 0.412, + "step": 8274 + }, + { + "epoch": 1.0069972619409797, + "grad_norm": 1.1674786806106567, + "learning_rate": 1.020773396517696e-05, + "loss": 0.3746, + "step": 8275 + }, + { + "epoch": 1.0071189534529967, + "grad_norm": 1.2406764030456543, + "learning_rate": 1.0205783688198527e-05, + "loss": 0.3944, + "step": 8276 + }, + { + "epoch": 1.0072406449650138, + "grad_norm": 2.467060089111328, + "learning_rate": 1.0203833403389601e-05, + "loss": 0.491, + "step": 8277 + }, + { + "epoch": 1.0073623364770308, + "grad_norm": 2.001375436782837, + "learning_rate": 1.0201883110824391e-05, + "loss": 0.3867, + "step": 8278 + }, + { + "epoch": 1.0074840279890478, + "grad_norm": 2.6476292610168457, + "learning_rate": 1.019993281057711e-05, + "loss": 0.4524, + "step": 8279 + }, + { + "epoch": 1.0076057195010648, + "grad_norm": 1.4431946277618408, + "learning_rate": 1.0197982502721973e-05, + "loss": 0.391, + "step": 8280 + }, + { + "epoch": 1.0077274110130818, + "grad_norm": 0.9354568123817444, + "learning_rate": 1.0196032187333194e-05, + "loss": 0.3646, + "step": 8281 + }, + { + "epoch": 1.0078491025250988, + "grad_norm": 1.1406186819076538, + "learning_rate": 1.019408186448498e-05, + "loss": 0.4219, + "step": 8282 + }, + { + "epoch": 1.0079707940371159, + "grad_norm": 2.238574504852295, + "learning_rate": 1.019213153425155e-05, + "loss": 0.3707, + "step": 8283 + }, + { + "epoch": 1.0080924855491329, + "grad_norm": 0.9136751294136047, + "learning_rate": 1.0190181196707121e-05, + "loss": 0.4287, + "step": 8284 + }, + { + "epoch": 1.00821417706115, + "grad_norm": 0.9762387275695801, + "learning_rate": 1.01882308519259e-05, + "loss": 0.393, + "step": 8285 + }, + { + "epoch": 1.008335868573167, + "grad_norm": 1.1271981000900269, + "learning_rate": 1.0186280499982107e-05, + "loss": 0.4125, + "step": 8286 + }, + { + "epoch": 1.0084575600851842, + "grad_norm": 1.9838197231292725, + "learning_rate": 1.0184330140949956e-05, + "loss": 0.3889, + "step": 8287 + }, + { + "epoch": 1.0085792515972012, + "grad_norm": 1.7504675388336182, + "learning_rate": 1.0182379774903662e-05, + "loss": 0.343, + "step": 8288 + }, + { + "epoch": 1.0087009431092182, + "grad_norm": 1.7771848440170288, + "learning_rate": 1.0180429401917438e-05, + "loss": 0.3894, + "step": 8289 + }, + { + "epoch": 1.0088226346212352, + "grad_norm": 1.4864696264266968, + "learning_rate": 1.0178479022065501e-05, + "loss": 0.4092, + "step": 8290 + }, + { + "epoch": 1.0089443261332522, + "grad_norm": 0.9380364418029785, + "learning_rate": 1.0176528635422072e-05, + "loss": 0.3772, + "step": 8291 + }, + { + "epoch": 1.0090660176452693, + "grad_norm": 2.18104887008667, + "learning_rate": 1.017457824206136e-05, + "loss": 0.4245, + "step": 8292 + }, + { + "epoch": 1.0091877091572863, + "grad_norm": 0.9738626480102539, + "learning_rate": 1.0172627842057588e-05, + "loss": 0.3595, + "step": 8293 + }, + { + "epoch": 1.0093094006693033, + "grad_norm": 0.7704262733459473, + "learning_rate": 1.0170677435484964e-05, + "loss": 0.3229, + "step": 8294 + }, + { + "epoch": 1.0094310921813203, + "grad_norm": 2.369701623916626, + "learning_rate": 1.0168727022417715e-05, + "loss": 0.4364, + "step": 8295 + }, + { + "epoch": 1.0095527836933373, + "grad_norm": 2.02177095413208, + "learning_rate": 1.0166776602930051e-05, + "loss": 0.4443, + "step": 8296 + }, + { + "epoch": 1.0096744752053544, + "grad_norm": 1.763436198234558, + "learning_rate": 1.0164826177096189e-05, + "loss": 0.4182, + "step": 8297 + }, + { + "epoch": 1.0097961667173714, + "grad_norm": 2.5117340087890625, + "learning_rate": 1.0162875744990357e-05, + "loss": 0.4028, + "step": 8298 + }, + { + "epoch": 1.0099178582293884, + "grad_norm": 1.0737465620040894, + "learning_rate": 1.0160925306686762e-05, + "loss": 0.3655, + "step": 8299 + }, + { + "epoch": 1.0100395497414056, + "grad_norm": 2.2438952922821045, + "learning_rate": 1.0158974862259626e-05, + "loss": 0.3568, + "step": 8300 + }, + { + "epoch": 1.0101612412534227, + "grad_norm": 1.2577983140945435, + "learning_rate": 1.0157024411783165e-05, + "loss": 0.4057, + "step": 8301 + }, + { + "epoch": 1.0102829327654397, + "grad_norm": 1.3593028783798218, + "learning_rate": 1.0155073955331603e-05, + "loss": 0.4129, + "step": 8302 + }, + { + "epoch": 1.0104046242774567, + "grad_norm": 1.9193497896194458, + "learning_rate": 1.0153123492979156e-05, + "loss": 0.3816, + "step": 8303 + }, + { + "epoch": 1.0105263157894737, + "grad_norm": 1.3715996742248535, + "learning_rate": 1.0151173024800045e-05, + "loss": 0.3462, + "step": 8304 + }, + { + "epoch": 1.0106480073014907, + "grad_norm": 1.147500991821289, + "learning_rate": 1.0149222550868483e-05, + "loss": 0.3727, + "step": 8305 + }, + { + "epoch": 1.0107696988135078, + "grad_norm": 2.520171642303467, + "learning_rate": 1.0147272071258695e-05, + "loss": 0.4317, + "step": 8306 + }, + { + "epoch": 1.0108913903255248, + "grad_norm": 1.0026464462280273, + "learning_rate": 1.0145321586044903e-05, + "loss": 0.3443, + "step": 8307 + }, + { + "epoch": 1.0110130818375418, + "grad_norm": 3.3628146648406982, + "learning_rate": 1.0143371095301323e-05, + "loss": 0.4607, + "step": 8308 + }, + { + "epoch": 1.0111347733495588, + "grad_norm": 1.7303556203842163, + "learning_rate": 1.0141420599102172e-05, + "loss": 0.393, + "step": 8309 + }, + { + "epoch": 1.0112564648615758, + "grad_norm": 2.9159469604492188, + "learning_rate": 1.013947009752168e-05, + "loss": 0.3767, + "step": 8310 + }, + { + "epoch": 1.0113781563735929, + "grad_norm": 3.6203832626342773, + "learning_rate": 1.013751959063406e-05, + "loss": 0.4231, + "step": 8311 + }, + { + "epoch": 1.0114998478856099, + "grad_norm": 1.3753291368484497, + "learning_rate": 1.0135569078513534e-05, + "loss": 0.3822, + "step": 8312 + }, + { + "epoch": 1.0116215393976271, + "grad_norm": 2.3555948734283447, + "learning_rate": 1.0133618561234322e-05, + "loss": 0.3742, + "step": 8313 + }, + { + "epoch": 1.0117432309096441, + "grad_norm": 2.5891268253326416, + "learning_rate": 1.0131668038870653e-05, + "loss": 0.4598, + "step": 8314 + }, + { + "epoch": 1.0118649224216612, + "grad_norm": 4.6124162673950195, + "learning_rate": 1.0129717511496738e-05, + "loss": 0.3334, + "step": 8315 + }, + { + "epoch": 1.0119866139336782, + "grad_norm": 0.944620668888092, + "learning_rate": 1.0127766979186804e-05, + "loss": 0.374, + "step": 8316 + }, + { + "epoch": 1.0121083054456952, + "grad_norm": 1.506091594696045, + "learning_rate": 1.0125816442015073e-05, + "loss": 0.4436, + "step": 8317 + }, + { + "epoch": 1.0122299969577122, + "grad_norm": 3.5881686210632324, + "learning_rate": 1.0123865900055765e-05, + "loss": 0.4124, + "step": 8318 + }, + { + "epoch": 1.0123516884697292, + "grad_norm": 2.568885087966919, + "learning_rate": 1.0121915353383103e-05, + "loss": 0.4077, + "step": 8319 + }, + { + "epoch": 1.0124733799817462, + "grad_norm": 1.9236934185028076, + "learning_rate": 1.011996480207131e-05, + "loss": 0.37, + "step": 8320 + }, + { + "epoch": 1.0125950714937633, + "grad_norm": 2.2964608669281006, + "learning_rate": 1.0118014246194607e-05, + "loss": 0.438, + "step": 8321 + }, + { + "epoch": 1.0127167630057803, + "grad_norm": 1.2992522716522217, + "learning_rate": 1.0116063685827222e-05, + "loss": 0.4205, + "step": 8322 + }, + { + "epoch": 1.0128384545177973, + "grad_norm": 1.1891727447509766, + "learning_rate": 1.0114113121043367e-05, + "loss": 0.4003, + "step": 8323 + }, + { + "epoch": 1.0129601460298143, + "grad_norm": 3.500699996948242, + "learning_rate": 1.0112162551917275e-05, + "loss": 0.2874, + "step": 8324 + }, + { + "epoch": 1.0130818375418316, + "grad_norm": 1.4891784191131592, + "learning_rate": 1.0110211978523166e-05, + "loss": 0.4533, + "step": 8325 + }, + { + "epoch": 1.0132035290538486, + "grad_norm": 1.2669380903244019, + "learning_rate": 1.0108261400935262e-05, + "loss": 0.4277, + "step": 8326 + }, + { + "epoch": 1.0133252205658656, + "grad_norm": 1.8431144952774048, + "learning_rate": 1.0106310819227789e-05, + "loss": 0.385, + "step": 8327 + }, + { + "epoch": 1.0134469120778826, + "grad_norm": 1.4549047946929932, + "learning_rate": 1.0104360233474967e-05, + "loss": 0.3893, + "step": 8328 + }, + { + "epoch": 1.0135686035898996, + "grad_norm": 1.4614136219024658, + "learning_rate": 1.0102409643751025e-05, + "loss": 0.3559, + "step": 8329 + }, + { + "epoch": 1.0136902951019167, + "grad_norm": 1.1574559211730957, + "learning_rate": 1.0100459050130182e-05, + "loss": 0.3239, + "step": 8330 + }, + { + "epoch": 1.0138119866139337, + "grad_norm": 1.0747854709625244, + "learning_rate": 1.0098508452686664e-05, + "loss": 0.3846, + "step": 8331 + }, + { + "epoch": 1.0139336781259507, + "grad_norm": 1.212923288345337, + "learning_rate": 1.0096557851494695e-05, + "loss": 0.315, + "step": 8332 + }, + { + "epoch": 1.0140553696379677, + "grad_norm": 3.0859949588775635, + "learning_rate": 1.0094607246628505e-05, + "loss": 0.4746, + "step": 8333 + }, + { + "epoch": 1.0141770611499847, + "grad_norm": 1.2591862678527832, + "learning_rate": 1.0092656638162309e-05, + "loss": 0.3338, + "step": 8334 + }, + { + "epoch": 1.0142987526620018, + "grad_norm": 1.3367829322814941, + "learning_rate": 1.0090706026170334e-05, + "loss": 0.3447, + "step": 8335 + }, + { + "epoch": 1.0144204441740188, + "grad_norm": 1.5000842809677124, + "learning_rate": 1.0088755410726809e-05, + "loss": 0.4379, + "step": 8336 + }, + { + "epoch": 1.0145421356860358, + "grad_norm": 2.5861613750457764, + "learning_rate": 1.008680479190596e-05, + "loss": 0.3318, + "step": 8337 + }, + { + "epoch": 1.014663827198053, + "grad_norm": 2.0502939224243164, + "learning_rate": 1.0084854169782006e-05, + "loss": 0.349, + "step": 8338 + }, + { + "epoch": 1.01478551871007, + "grad_norm": 1.8105688095092773, + "learning_rate": 1.0082903544429176e-05, + "loss": 0.4004, + "step": 8339 + }, + { + "epoch": 1.014907210222087, + "grad_norm": 1.7817609310150146, + "learning_rate": 1.0080952915921694e-05, + "loss": 0.3905, + "step": 8340 + }, + { + "epoch": 1.015028901734104, + "grad_norm": 1.7019411325454712, + "learning_rate": 1.0079002284333785e-05, + "loss": 0.4047, + "step": 8341 + }, + { + "epoch": 1.0151505932461211, + "grad_norm": 1.144789457321167, + "learning_rate": 1.0077051649739678e-05, + "loss": 0.3553, + "step": 8342 + }, + { + "epoch": 1.0152722847581381, + "grad_norm": 3.05646014213562, + "learning_rate": 1.0075101012213592e-05, + "loss": 0.4102, + "step": 8343 + }, + { + "epoch": 1.0153939762701552, + "grad_norm": 1.4944475889205933, + "learning_rate": 1.0073150371829763e-05, + "loss": 0.3906, + "step": 8344 + }, + { + "epoch": 1.0155156677821722, + "grad_norm": 2.971116781234741, + "learning_rate": 1.0071199728662409e-05, + "loss": 0.3853, + "step": 8345 + }, + { + "epoch": 1.0156373592941892, + "grad_norm": 1.6913899183273315, + "learning_rate": 1.0069249082785755e-05, + "loss": 0.3914, + "step": 8346 + }, + { + "epoch": 1.0157590508062062, + "grad_norm": 4.988412857055664, + "learning_rate": 1.0067298434274031e-05, + "loss": 0.499, + "step": 8347 + }, + { + "epoch": 1.0158807423182232, + "grad_norm": 3.384791851043701, + "learning_rate": 1.0065347783201464e-05, + "loss": 0.3158, + "step": 8348 + }, + { + "epoch": 1.0160024338302402, + "grad_norm": 3.1956398487091064, + "learning_rate": 1.0063397129642278e-05, + "loss": 0.2915, + "step": 8349 + }, + { + "epoch": 1.0161241253422575, + "grad_norm": 1.756445050239563, + "learning_rate": 1.0061446473670701e-05, + "loss": 0.3852, + "step": 8350 + }, + { + "epoch": 1.0162458168542745, + "grad_norm": 1.7911667823791504, + "learning_rate": 1.0059495815360956e-05, + "loss": 0.4497, + "step": 8351 + }, + { + "epoch": 1.0163675083662915, + "grad_norm": 1.727837324142456, + "learning_rate": 1.0057545154787276e-05, + "loss": 0.3689, + "step": 8352 + }, + { + "epoch": 1.0164891998783085, + "grad_norm": 3.2925519943237305, + "learning_rate": 1.005559449202388e-05, + "loss": 0.4847, + "step": 8353 + }, + { + "epoch": 1.0166108913903256, + "grad_norm": 1.6262190341949463, + "learning_rate": 1.0053643827144999e-05, + "loss": 0.4004, + "step": 8354 + }, + { + "epoch": 1.0167325829023426, + "grad_norm": 1.9189738035202026, + "learning_rate": 1.005169316022486e-05, + "loss": 0.4345, + "step": 8355 + }, + { + "epoch": 1.0168542744143596, + "grad_norm": 1.4556199312210083, + "learning_rate": 1.0049742491337691e-05, + "loss": 0.4061, + "step": 8356 + }, + { + "epoch": 1.0169759659263766, + "grad_norm": 1.363373041152954, + "learning_rate": 1.0047791820557715e-05, + "loss": 0.4366, + "step": 8357 + }, + { + "epoch": 1.0170976574383936, + "grad_norm": 2.4752964973449707, + "learning_rate": 1.0045841147959161e-05, + "loss": 0.3997, + "step": 8358 + }, + { + "epoch": 1.0172193489504107, + "grad_norm": 2.4163966178894043, + "learning_rate": 1.0043890473616258e-05, + "loss": 0.3868, + "step": 8359 + }, + { + "epoch": 1.0173410404624277, + "grad_norm": 1.5908942222595215, + "learning_rate": 1.0041939797603231e-05, + "loss": 0.3949, + "step": 8360 + }, + { + "epoch": 1.0174627319744447, + "grad_norm": 2.44240140914917, + "learning_rate": 1.0039989119994306e-05, + "loss": 0.353, + "step": 8361 + }, + { + "epoch": 1.0175844234864617, + "grad_norm": 2.568925619125366, + "learning_rate": 1.0038038440863715e-05, + "loss": 0.478, + "step": 8362 + }, + { + "epoch": 1.017706114998479, + "grad_norm": 1.585746169090271, + "learning_rate": 1.0036087760285682e-05, + "loss": 0.3459, + "step": 8363 + }, + { + "epoch": 1.017827806510496, + "grad_norm": 1.1615705490112305, + "learning_rate": 1.0034137078334434e-05, + "loss": 0.3797, + "step": 8364 + }, + { + "epoch": 1.017949498022513, + "grad_norm": 1.4657227993011475, + "learning_rate": 1.00321863950842e-05, + "loss": 0.3588, + "step": 8365 + }, + { + "epoch": 1.01807118953453, + "grad_norm": 1.4175293445587158, + "learning_rate": 1.0030235710609206e-05, + "loss": 0.3784, + "step": 8366 + }, + { + "epoch": 1.018192881046547, + "grad_norm": 1.450131893157959, + "learning_rate": 1.0028285024983681e-05, + "loss": 0.4149, + "step": 8367 + }, + { + "epoch": 1.018314572558564, + "grad_norm": 2.158311605453491, + "learning_rate": 1.0026334338281856e-05, + "loss": 0.3856, + "step": 8368 + }, + { + "epoch": 1.018436264070581, + "grad_norm": 1.6758432388305664, + "learning_rate": 1.0024383650577952e-05, + "loss": 0.4044, + "step": 8369 + }, + { + "epoch": 1.018557955582598, + "grad_norm": 1.2153000831604004, + "learning_rate": 1.0022432961946197e-05, + "loss": 0.3824, + "step": 8370 + }, + { + "epoch": 1.0186796470946151, + "grad_norm": 1.2526644468307495, + "learning_rate": 1.0020482272460825e-05, + "loss": 0.3422, + "step": 8371 + }, + { + "epoch": 1.0188013386066321, + "grad_norm": 3.0249688625335693, + "learning_rate": 1.0018531582196059e-05, + "loss": 0.2929, + "step": 8372 + }, + { + "epoch": 1.0189230301186492, + "grad_norm": 1.8084049224853516, + "learning_rate": 1.001658089122613e-05, + "loss": 0.3341, + "step": 8373 + }, + { + "epoch": 1.0190447216306662, + "grad_norm": 1.1040598154067993, + "learning_rate": 1.001463019962526e-05, + "loss": 0.3424, + "step": 8374 + }, + { + "epoch": 1.0191664131426832, + "grad_norm": 1.4620301723480225, + "learning_rate": 1.0012679507467683e-05, + "loss": 0.3775, + "step": 8375 + }, + { + "epoch": 1.0192881046547004, + "grad_norm": 1.6535565853118896, + "learning_rate": 1.0010728814827623e-05, + "loss": 0.4196, + "step": 8376 + }, + { + "epoch": 1.0194097961667175, + "grad_norm": 1.3088042736053467, + "learning_rate": 1.000877812177931e-05, + "loss": 0.3615, + "step": 8377 + }, + { + "epoch": 1.0195314876787345, + "grad_norm": 1.5654534101486206, + "learning_rate": 1.0006827428396972e-05, + "loss": 0.3696, + "step": 8378 + }, + { + "epoch": 1.0196531791907515, + "grad_norm": 2.361877679824829, + "learning_rate": 1.0004876734754838e-05, + "loss": 0.4321, + "step": 8379 + }, + { + "epoch": 1.0197748707027685, + "grad_norm": 1.1035239696502686, + "learning_rate": 1.000292604092713e-05, + "loss": 0.4019, + "step": 8380 + }, + { + "epoch": 1.0198965622147855, + "grad_norm": 1.299080729484558, + "learning_rate": 1.0000975346988081e-05, + "loss": 0.3164, + "step": 8381 + }, + { + "epoch": 1.0200182537268025, + "grad_norm": 1.5622678995132446, + "learning_rate": 9.99902465301192e-06, + "loss": 0.4089, + "step": 8382 + }, + { + "epoch": 1.0201399452388196, + "grad_norm": 1.23862624168396, + "learning_rate": 9.997073959072872e-06, + "loss": 0.4181, + "step": 8383 + }, + { + "epoch": 1.0202616367508366, + "grad_norm": 1.8458223342895508, + "learning_rate": 9.995123265245165e-06, + "loss": 0.4401, + "step": 8384 + }, + { + "epoch": 1.0203833282628536, + "grad_norm": 1.8534002304077148, + "learning_rate": 9.99317257160303e-06, + "loss": 0.3928, + "step": 8385 + }, + { + "epoch": 1.0205050197748706, + "grad_norm": 1.2381389141082764, + "learning_rate": 9.991221878220691e-06, + "loss": 0.434, + "step": 8386 + }, + { + "epoch": 1.0206267112868876, + "grad_norm": 1.700703501701355, + "learning_rate": 9.989271185172376e-06, + "loss": 0.3741, + "step": 8387 + }, + { + "epoch": 1.0207484027989047, + "grad_norm": 2.201749324798584, + "learning_rate": 9.987320492532322e-06, + "loss": 0.3951, + "step": 8388 + }, + { + "epoch": 1.020870094310922, + "grad_norm": 1.0145998001098633, + "learning_rate": 9.985369800374743e-06, + "loss": 0.3689, + "step": 8389 + }, + { + "epoch": 1.020991785822939, + "grad_norm": 1.4275449514389038, + "learning_rate": 9.983419108773876e-06, + "loss": 0.3933, + "step": 8390 + }, + { + "epoch": 1.021113477334956, + "grad_norm": 2.5123236179351807, + "learning_rate": 9.981468417803945e-06, + "loss": 0.3489, + "step": 8391 + }, + { + "epoch": 1.021235168846973, + "grad_norm": 1.210709571838379, + "learning_rate": 9.979517727539177e-06, + "loss": 0.4135, + "step": 8392 + }, + { + "epoch": 1.02135686035899, + "grad_norm": 1.035210371017456, + "learning_rate": 9.977567038053804e-06, + "loss": 0.3461, + "step": 8393 + }, + { + "epoch": 1.021478551871007, + "grad_norm": 2.321002244949341, + "learning_rate": 9.975616349422052e-06, + "loss": 0.416, + "step": 8394 + }, + { + "epoch": 1.021600243383024, + "grad_norm": 1.3586382865905762, + "learning_rate": 9.973665661718147e-06, + "loss": 0.4224, + "step": 8395 + }, + { + "epoch": 1.021721934895041, + "grad_norm": 1.3766831159591675, + "learning_rate": 9.971714975016318e-06, + "loss": 0.3822, + "step": 8396 + }, + { + "epoch": 1.021843626407058, + "grad_norm": 1.6176156997680664, + "learning_rate": 9.969764289390796e-06, + "loss": 0.372, + "step": 8397 + }, + { + "epoch": 1.021965317919075, + "grad_norm": 1.1981098651885986, + "learning_rate": 9.967813604915801e-06, + "loss": 0.4232, + "step": 8398 + }, + { + "epoch": 1.022087009431092, + "grad_norm": 1.5008865594863892, + "learning_rate": 9.96586292166557e-06, + "loss": 0.3796, + "step": 8399 + }, + { + "epoch": 1.0222087009431091, + "grad_norm": 1.383406639099121, + "learning_rate": 9.963912239714323e-06, + "loss": 0.3915, + "step": 8400 + }, + { + "epoch": 1.0223303924551264, + "grad_norm": 0.9409666657447815, + "learning_rate": 9.96196155913629e-06, + "loss": 0.3455, + "step": 8401 + }, + { + "epoch": 1.0224520839671434, + "grad_norm": 1.1549283266067505, + "learning_rate": 9.960010880005697e-06, + "loss": 0.3877, + "step": 8402 + }, + { + "epoch": 1.0225737754791604, + "grad_norm": 2.7519052028656006, + "learning_rate": 9.958060202396774e-06, + "loss": 0.4259, + "step": 8403 + }, + { + "epoch": 1.0226954669911774, + "grad_norm": 1.1812822818756104, + "learning_rate": 9.956109526383745e-06, + "loss": 0.4274, + "step": 8404 + }, + { + "epoch": 1.0228171585031944, + "grad_norm": 2.096966505050659, + "learning_rate": 9.954158852040842e-06, + "loss": 0.4345, + "step": 8405 + }, + { + "epoch": 1.0229388500152115, + "grad_norm": 0.9592916369438171, + "learning_rate": 9.952208179442288e-06, + "loss": 0.3592, + "step": 8406 + }, + { + "epoch": 1.0230605415272285, + "grad_norm": 1.6376439332962036, + "learning_rate": 9.950257508662312e-06, + "loss": 0.3201, + "step": 8407 + }, + { + "epoch": 1.0231822330392455, + "grad_norm": 1.5710434913635254, + "learning_rate": 9.948306839775143e-06, + "loss": 0.3452, + "step": 8408 + }, + { + "epoch": 1.0233039245512625, + "grad_norm": 1.166159749031067, + "learning_rate": 9.946356172855003e-06, + "loss": 0.3465, + "step": 8409 + }, + { + "epoch": 1.0234256160632795, + "grad_norm": 1.8354363441467285, + "learning_rate": 9.944405507976122e-06, + "loss": 0.4257, + "step": 8410 + }, + { + "epoch": 1.0235473075752965, + "grad_norm": 2.438997745513916, + "learning_rate": 9.94245484521273e-06, + "loss": 0.3444, + "step": 8411 + }, + { + "epoch": 1.0236689990873136, + "grad_norm": 1.3106298446655273, + "learning_rate": 9.940504184639046e-06, + "loss": 0.4019, + "step": 8412 + }, + { + "epoch": 1.0237906905993306, + "grad_norm": 1.2060514688491821, + "learning_rate": 9.938553526329304e-06, + "loss": 0.377, + "step": 8413 + }, + { + "epoch": 1.0239123821113478, + "grad_norm": 1.3195699453353882, + "learning_rate": 9.936602870357725e-06, + "loss": 0.387, + "step": 8414 + }, + { + "epoch": 1.0240340736233648, + "grad_norm": 1.252862572669983, + "learning_rate": 9.934652216798537e-06, + "loss": 0.383, + "step": 8415 + }, + { + "epoch": 1.0241557651353819, + "grad_norm": 2.6345582008361816, + "learning_rate": 9.93270156572597e-06, + "loss": 0.3047, + "step": 8416 + }, + { + "epoch": 1.0242774566473989, + "grad_norm": 1.6867715120315552, + "learning_rate": 9.930750917214246e-06, + "loss": 0.3688, + "step": 8417 + }, + { + "epoch": 1.024399148159416, + "grad_norm": 3.6022634506225586, + "learning_rate": 9.928800271337595e-06, + "loss": 0.4462, + "step": 8418 + }, + { + "epoch": 1.024520839671433, + "grad_norm": 2.9298648834228516, + "learning_rate": 9.92684962817024e-06, + "loss": 0.4327, + "step": 8419 + }, + { + "epoch": 1.02464253118345, + "grad_norm": 3.2493534088134766, + "learning_rate": 9.924898987786408e-06, + "loss": 0.413, + "step": 8420 + }, + { + "epoch": 1.024764222695467, + "grad_norm": 2.7008402347564697, + "learning_rate": 9.922948350260323e-06, + "loss": 0.4117, + "step": 8421 + }, + { + "epoch": 1.024885914207484, + "grad_norm": 1.7091697454452515, + "learning_rate": 9.920997715666218e-06, + "loss": 0.3822, + "step": 8422 + }, + { + "epoch": 1.025007605719501, + "grad_norm": 1.5987539291381836, + "learning_rate": 9.919047084078311e-06, + "loss": 0.401, + "step": 8423 + }, + { + "epoch": 1.025129297231518, + "grad_norm": 1.2505837678909302, + "learning_rate": 9.917096455570829e-06, + "loss": 0.3448, + "step": 8424 + }, + { + "epoch": 1.025250988743535, + "grad_norm": 1.6886801719665527, + "learning_rate": 9.915145830217999e-06, + "loss": 0.3685, + "step": 8425 + }, + { + "epoch": 1.0253726802555523, + "grad_norm": 2.0024356842041016, + "learning_rate": 9.913195208094043e-06, + "loss": 0.4339, + "step": 8426 + }, + { + "epoch": 1.0254943717675693, + "grad_norm": 2.178471326828003, + "learning_rate": 9.911244589273193e-06, + "loss": 0.3396, + "step": 8427 + }, + { + "epoch": 1.0256160632795863, + "grad_norm": 1.8820332288742065, + "learning_rate": 9.909293973829667e-06, + "loss": 0.4167, + "step": 8428 + }, + { + "epoch": 1.0257377547916033, + "grad_norm": 1.6188408136367798, + "learning_rate": 9.907343361837694e-06, + "loss": 0.42, + "step": 8429 + }, + { + "epoch": 1.0258594463036204, + "grad_norm": 1.5058780908584595, + "learning_rate": 9.9053927533715e-06, + "loss": 0.3765, + "step": 8430 + }, + { + "epoch": 1.0259811378156374, + "grad_norm": 3.7936012744903564, + "learning_rate": 9.903442148505305e-06, + "loss": 0.3855, + "step": 8431 + }, + { + "epoch": 1.0261028293276544, + "grad_norm": 1.5638360977172852, + "learning_rate": 9.901491547313336e-06, + "loss": 0.4067, + "step": 8432 + }, + { + "epoch": 1.0262245208396714, + "grad_norm": 1.5825241804122925, + "learning_rate": 9.89954094986982e-06, + "loss": 0.3515, + "step": 8433 + }, + { + "epoch": 1.0263462123516884, + "grad_norm": 2.1016364097595215, + "learning_rate": 9.897590356248979e-06, + "loss": 0.3441, + "step": 8434 + }, + { + "epoch": 1.0264679038637055, + "grad_norm": 1.3313311338424683, + "learning_rate": 9.895639766525038e-06, + "loss": 0.3558, + "step": 8435 + }, + { + "epoch": 1.0265895953757225, + "grad_norm": 2.491170644760132, + "learning_rate": 9.893689180772216e-06, + "loss": 0.4247, + "step": 8436 + }, + { + "epoch": 1.0267112868877395, + "grad_norm": 3.083341121673584, + "learning_rate": 9.891738599064741e-06, + "loss": 0.4331, + "step": 8437 + }, + { + "epoch": 1.0268329783997565, + "grad_norm": 4.732062339782715, + "learning_rate": 9.889788021476836e-06, + "loss": 0.5266, + "step": 8438 + }, + { + "epoch": 1.0269546699117738, + "grad_norm": 2.2248575687408447, + "learning_rate": 9.887837448082728e-06, + "loss": 0.3503, + "step": 8439 + }, + { + "epoch": 1.0270763614237908, + "grad_norm": 1.8852100372314453, + "learning_rate": 9.885886878956634e-06, + "loss": 0.3964, + "step": 8440 + }, + { + "epoch": 1.0271980529358078, + "grad_norm": 2.4943511486053467, + "learning_rate": 9.883936314172783e-06, + "loss": 0.4099, + "step": 8441 + }, + { + "epoch": 1.0273197444478248, + "grad_norm": 2.6485531330108643, + "learning_rate": 9.881985753805394e-06, + "loss": 0.3922, + "step": 8442 + }, + { + "epoch": 1.0274414359598418, + "grad_norm": 1.6862001419067383, + "learning_rate": 9.880035197928692e-06, + "loss": 0.4046, + "step": 8443 + }, + { + "epoch": 1.0275631274718589, + "grad_norm": 2.152690887451172, + "learning_rate": 9.878084646616897e-06, + "loss": 0.4025, + "step": 8444 + }, + { + "epoch": 1.0276848189838759, + "grad_norm": 3.224316358566284, + "learning_rate": 9.87613409994424e-06, + "loss": 0.3862, + "step": 8445 + }, + { + "epoch": 1.0278065104958929, + "grad_norm": 2.362570285797119, + "learning_rate": 9.87418355798493e-06, + "loss": 0.4232, + "step": 8446 + }, + { + "epoch": 1.02792820200791, + "grad_norm": 4.146801948547363, + "learning_rate": 9.8722330208132e-06, + "loss": 0.3559, + "step": 8447 + }, + { + "epoch": 1.028049893519927, + "grad_norm": 1.7981668710708618, + "learning_rate": 9.870282488503266e-06, + "loss": 0.4, + "step": 8448 + }, + { + "epoch": 1.028171585031944, + "grad_norm": 2.4671928882598877, + "learning_rate": 9.868331961129352e-06, + "loss": 0.3622, + "step": 8449 + }, + { + "epoch": 1.028293276543961, + "grad_norm": 2.83992600440979, + "learning_rate": 9.86638143876568e-06, + "loss": 0.3442, + "step": 8450 + }, + { + "epoch": 1.0284149680559782, + "grad_norm": 1.4505071640014648, + "learning_rate": 9.86443092148647e-06, + "loss": 0.3732, + "step": 8451 + }, + { + "epoch": 1.0285366595679952, + "grad_norm": 1.1865346431732178, + "learning_rate": 9.862480409365942e-06, + "loss": 0.3853, + "step": 8452 + }, + { + "epoch": 1.0286583510800122, + "grad_norm": 1.0423961877822876, + "learning_rate": 9.860529902478324e-06, + "loss": 0.3462, + "step": 8453 + }, + { + "epoch": 1.0287800425920293, + "grad_norm": 2.1626217365264893, + "learning_rate": 9.85857940089783e-06, + "loss": 0.3861, + "step": 8454 + }, + { + "epoch": 1.0289017341040463, + "grad_norm": 1.8315163850784302, + "learning_rate": 9.856628904698679e-06, + "loss": 0.3745, + "step": 8455 + }, + { + "epoch": 1.0290234256160633, + "grad_norm": 1.2824817895889282, + "learning_rate": 9.854678413955098e-06, + "loss": 0.3668, + "step": 8456 + }, + { + "epoch": 1.0291451171280803, + "grad_norm": 1.1688225269317627, + "learning_rate": 9.852727928741307e-06, + "loss": 0.3328, + "step": 8457 + }, + { + "epoch": 1.0292668086400973, + "grad_norm": 3.8883888721466064, + "learning_rate": 9.850777449131522e-06, + "loss": 0.4166, + "step": 8458 + }, + { + "epoch": 1.0293885001521144, + "grad_norm": 1.1900955438613892, + "learning_rate": 9.848826975199961e-06, + "loss": 0.3376, + "step": 8459 + }, + { + "epoch": 1.0295101916641314, + "grad_norm": 2.726933717727661, + "learning_rate": 9.846876507020847e-06, + "loss": 0.4204, + "step": 8460 + }, + { + "epoch": 1.0296318831761484, + "grad_norm": 1.4068548679351807, + "learning_rate": 9.8449260446684e-06, + "loss": 0.3828, + "step": 8461 + }, + { + "epoch": 1.0297535746881654, + "grad_norm": 1.3711700439453125, + "learning_rate": 9.842975588216838e-06, + "loss": 0.3966, + "step": 8462 + }, + { + "epoch": 1.0298752662001824, + "grad_norm": 1.7826921939849854, + "learning_rate": 9.841025137740377e-06, + "loss": 0.3647, + "step": 8463 + }, + { + "epoch": 1.0299969577121997, + "grad_norm": 1.4266414642333984, + "learning_rate": 9.839074693313241e-06, + "loss": 0.4476, + "step": 8464 + }, + { + "epoch": 1.0301186492242167, + "grad_norm": 1.5153205394744873, + "learning_rate": 9.837124255009647e-06, + "loss": 0.3638, + "step": 8465 + }, + { + "epoch": 1.0302403407362337, + "grad_norm": 1.2447034120559692, + "learning_rate": 9.83517382290381e-06, + "loss": 0.4118, + "step": 8466 + }, + { + "epoch": 1.0303620322482507, + "grad_norm": 2.4332399368286133, + "learning_rate": 9.83322339706995e-06, + "loss": 0.3432, + "step": 8467 + }, + { + "epoch": 1.0304837237602678, + "grad_norm": 1.9648536443710327, + "learning_rate": 9.83127297758229e-06, + "loss": 0.3456, + "step": 8468 + }, + { + "epoch": 1.0306054152722848, + "grad_norm": 1.266471266746521, + "learning_rate": 9.829322564515041e-06, + "loss": 0.4106, + "step": 8469 + }, + { + "epoch": 1.0307271067843018, + "grad_norm": 0.9075831174850464, + "learning_rate": 9.827372157942419e-06, + "loss": 0.345, + "step": 8470 + }, + { + "epoch": 1.0308487982963188, + "grad_norm": 1.0461357831954956, + "learning_rate": 9.825421757938642e-06, + "loss": 0.4258, + "step": 8471 + }, + { + "epoch": 1.0309704898083358, + "grad_norm": 1.6484135389328003, + "learning_rate": 9.82347136457793e-06, + "loss": 0.3812, + "step": 8472 + }, + { + "epoch": 1.0310921813203529, + "grad_norm": 2.3146800994873047, + "learning_rate": 9.8215209779345e-06, + "loss": 0.3719, + "step": 8473 + }, + { + "epoch": 1.0312138728323699, + "grad_norm": 1.8806352615356445, + "learning_rate": 9.819570598082564e-06, + "loss": 0.4018, + "step": 8474 + }, + { + "epoch": 1.031335564344387, + "grad_norm": 1.2429287433624268, + "learning_rate": 9.81762022509634e-06, + "loss": 0.3971, + "step": 8475 + }, + { + "epoch": 1.031457255856404, + "grad_norm": 1.5485801696777344, + "learning_rate": 9.815669859050046e-06, + "loss": 0.3715, + "step": 8476 + }, + { + "epoch": 1.0315789473684212, + "grad_norm": 1.1920545101165771, + "learning_rate": 9.813719500017895e-06, + "loss": 0.3639, + "step": 8477 + }, + { + "epoch": 1.0317006388804382, + "grad_norm": 1.3843580484390259, + "learning_rate": 9.8117691480741e-06, + "loss": 0.3614, + "step": 8478 + }, + { + "epoch": 1.0318223303924552, + "grad_norm": 1.169682264328003, + "learning_rate": 9.809818803292882e-06, + "loss": 0.4095, + "step": 8479 + }, + { + "epoch": 1.0319440219044722, + "grad_norm": 1.1997531652450562, + "learning_rate": 9.807868465748453e-06, + "loss": 0.3971, + "step": 8480 + }, + { + "epoch": 1.0320657134164892, + "grad_norm": 1.1105883121490479, + "learning_rate": 9.805918135515025e-06, + "loss": 0.3703, + "step": 8481 + }, + { + "epoch": 1.0321874049285062, + "grad_norm": 2.0782408714294434, + "learning_rate": 9.803967812666812e-06, + "loss": 0.3923, + "step": 8482 + }, + { + "epoch": 1.0323090964405233, + "grad_norm": 4.343510627746582, + "learning_rate": 9.80201749727803e-06, + "loss": 0.4519, + "step": 8483 + }, + { + "epoch": 1.0324307879525403, + "grad_norm": 1.1199190616607666, + "learning_rate": 9.800067189422894e-06, + "loss": 0.3378, + "step": 8484 + }, + { + "epoch": 1.0325524794645573, + "grad_norm": 1.6541204452514648, + "learning_rate": 9.798116889175614e-06, + "loss": 0.4255, + "step": 8485 + }, + { + "epoch": 1.0326741709765743, + "grad_norm": 1.1295616626739502, + "learning_rate": 9.7961665966104e-06, + "loss": 0.3888, + "step": 8486 + }, + { + "epoch": 1.0327958624885913, + "grad_norm": 1.1412512063980103, + "learning_rate": 9.794216311801477e-06, + "loss": 0.368, + "step": 8487 + }, + { + "epoch": 1.0329175540006084, + "grad_norm": 1.2467306852340698, + "learning_rate": 9.792266034823044e-06, + "loss": 0.3577, + "step": 8488 + }, + { + "epoch": 1.0330392455126254, + "grad_norm": 1.0315723419189453, + "learning_rate": 9.790315765749319e-06, + "loss": 0.3634, + "step": 8489 + }, + { + "epoch": 1.0331609370246426, + "grad_norm": 1.0767110586166382, + "learning_rate": 9.788365504654512e-06, + "loss": 0.3715, + "step": 8490 + }, + { + "epoch": 1.0332826285366596, + "grad_norm": 1.5041097402572632, + "learning_rate": 9.78641525161284e-06, + "loss": 0.397, + "step": 8491 + }, + { + "epoch": 1.0334043200486767, + "grad_norm": 1.2021644115447998, + "learning_rate": 9.78446500669851e-06, + "loss": 0.3752, + "step": 8492 + }, + { + "epoch": 1.0335260115606937, + "grad_norm": 2.2690627574920654, + "learning_rate": 9.782514769985727e-06, + "loss": 0.3089, + "step": 8493 + }, + { + "epoch": 1.0336477030727107, + "grad_norm": 3.597916603088379, + "learning_rate": 9.78056454154871e-06, + "loss": 0.4476, + "step": 8494 + }, + { + "epoch": 1.0337693945847277, + "grad_norm": 1.4413632154464722, + "learning_rate": 9.778614321461668e-06, + "loss": 0.3987, + "step": 8495 + }, + { + "epoch": 1.0338910860967447, + "grad_norm": 1.7488199472427368, + "learning_rate": 9.776664109798806e-06, + "loss": 0.3999, + "step": 8496 + }, + { + "epoch": 1.0340127776087618, + "grad_norm": 3.842494010925293, + "learning_rate": 9.774713906634338e-06, + "loss": 0.3472, + "step": 8497 + }, + { + "epoch": 1.0341344691207788, + "grad_norm": 2.5658071041107178, + "learning_rate": 9.772763712042473e-06, + "loss": 0.4204, + "step": 8498 + }, + { + "epoch": 1.0342561606327958, + "grad_norm": 1.6479542255401611, + "learning_rate": 9.770813526097419e-06, + "loss": 0.4029, + "step": 8499 + }, + { + "epoch": 1.0343778521448128, + "grad_norm": 1.7017662525177002, + "learning_rate": 9.768863348873384e-06, + "loss": 0.3774, + "step": 8500 + }, + { + "epoch": 1.0344995436568298, + "grad_norm": 1.4452855587005615, + "learning_rate": 9.766913180444576e-06, + "loss": 0.4026, + "step": 8501 + }, + { + "epoch": 1.034621235168847, + "grad_norm": 2.4560563564300537, + "learning_rate": 9.764963020885208e-06, + "loss": 0.4194, + "step": 8502 + }, + { + "epoch": 1.034742926680864, + "grad_norm": 2.3867592811584473, + "learning_rate": 9.763012870269484e-06, + "loss": 0.3994, + "step": 8503 + }, + { + "epoch": 1.0348646181928811, + "grad_norm": 1.6456576585769653, + "learning_rate": 9.761062728671607e-06, + "loss": 0.3625, + "step": 8504 + }, + { + "epoch": 1.0349863097048981, + "grad_norm": 1.394225835800171, + "learning_rate": 9.759112596165787e-06, + "loss": 0.4008, + "step": 8505 + }, + { + "epoch": 1.0351080012169152, + "grad_norm": 1.8049538135528564, + "learning_rate": 9.757162472826232e-06, + "loss": 0.395, + "step": 8506 + }, + { + "epoch": 1.0352296927289322, + "grad_norm": 3.1767828464508057, + "learning_rate": 9.755212358727151e-06, + "loss": 0.368, + "step": 8507 + }, + { + "epoch": 1.0353513842409492, + "grad_norm": 2.3020334243774414, + "learning_rate": 9.753262253942742e-06, + "loss": 0.4263, + "step": 8508 + }, + { + "epoch": 1.0354730757529662, + "grad_norm": 1.5832927227020264, + "learning_rate": 9.751312158547216e-06, + "loss": 0.3424, + "step": 8509 + }, + { + "epoch": 1.0355947672649832, + "grad_norm": 1.5203386545181274, + "learning_rate": 9.749362072614779e-06, + "loss": 0.3642, + "step": 8510 + }, + { + "epoch": 1.0357164587770002, + "grad_norm": 3.6001834869384766, + "learning_rate": 9.747411996219633e-06, + "loss": 0.3466, + "step": 8511 + }, + { + "epoch": 1.0358381502890173, + "grad_norm": 1.806621789932251, + "learning_rate": 9.745461929435982e-06, + "loss": 0.4198, + "step": 8512 + }, + { + "epoch": 1.0359598418010343, + "grad_norm": 2.951817750930786, + "learning_rate": 9.743511872338034e-06, + "loss": 0.36, + "step": 8513 + }, + { + "epoch": 1.0360815333130513, + "grad_norm": 2.2316105365753174, + "learning_rate": 9.74156182499999e-06, + "loss": 0.4014, + "step": 8514 + }, + { + "epoch": 1.0362032248250685, + "grad_norm": 1.3834682703018188, + "learning_rate": 9.739611787496054e-06, + "loss": 0.3548, + "step": 8515 + }, + { + "epoch": 1.0363249163370856, + "grad_norm": 3.6755449771881104, + "learning_rate": 9.737661759900426e-06, + "loss": 0.4702, + "step": 8516 + }, + { + "epoch": 1.0364466078491026, + "grad_norm": 2.0684592723846436, + "learning_rate": 9.73571174228731e-06, + "loss": 0.4418, + "step": 8517 + }, + { + "epoch": 1.0365682993611196, + "grad_norm": 2.0509941577911377, + "learning_rate": 9.733761734730913e-06, + "loss": 0.4023, + "step": 8518 + }, + { + "epoch": 1.0366899908731366, + "grad_norm": 2.624793291091919, + "learning_rate": 9.731811737305432e-06, + "loss": 0.4104, + "step": 8519 + }, + { + "epoch": 1.0368116823851536, + "grad_norm": 1.7117773294448853, + "learning_rate": 9.729861750085066e-06, + "loss": 0.3884, + "step": 8520 + }, + { + "epoch": 1.0369333738971707, + "grad_norm": 2.3872177600860596, + "learning_rate": 9.727911773144024e-06, + "loss": 0.3559, + "step": 8521 + }, + { + "epoch": 1.0370550654091877, + "grad_norm": 3.1806843280792236, + "learning_rate": 9.7259618065565e-06, + "loss": 0.3488, + "step": 8522 + }, + { + "epoch": 1.0371767569212047, + "grad_norm": 2.596280097961426, + "learning_rate": 9.724011850396697e-06, + "loss": 0.3712, + "step": 8523 + }, + { + "epoch": 1.0372984484332217, + "grad_norm": 1.2299621105194092, + "learning_rate": 9.722061904738811e-06, + "loss": 0.3875, + "step": 8524 + }, + { + "epoch": 1.0374201399452387, + "grad_norm": 1.091686487197876, + "learning_rate": 9.720111969657052e-06, + "loss": 0.3545, + "step": 8525 + }, + { + "epoch": 1.0375418314572558, + "grad_norm": 1.6109882593154907, + "learning_rate": 9.71816204522561e-06, + "loss": 0.3877, + "step": 8526 + }, + { + "epoch": 1.037663522969273, + "grad_norm": 1.3331568241119385, + "learning_rate": 9.716212131518683e-06, + "loss": 0.3777, + "step": 8527 + }, + { + "epoch": 1.03778521448129, + "grad_norm": 3.5424487590789795, + "learning_rate": 9.714262228610471e-06, + "loss": 0.4608, + "step": 8528 + }, + { + "epoch": 1.037906905993307, + "grad_norm": 1.1005115509033203, + "learning_rate": 9.712312336575175e-06, + "loss": 0.3841, + "step": 8529 + }, + { + "epoch": 1.038028597505324, + "grad_norm": 2.815463066101074, + "learning_rate": 9.71036245548699e-06, + "loss": 0.2905, + "step": 8530 + }, + { + "epoch": 1.038150289017341, + "grad_norm": 1.3579227924346924, + "learning_rate": 9.708412585420111e-06, + "loss": 0.3057, + "step": 8531 + }, + { + "epoch": 1.038271980529358, + "grad_norm": 2.1427407264709473, + "learning_rate": 9.706462726448738e-06, + "loss": 0.4161, + "step": 8532 + }, + { + "epoch": 1.0383936720413751, + "grad_norm": 1.2537420988082886, + "learning_rate": 9.704512878647066e-06, + "loss": 0.358, + "step": 8533 + }, + { + "epoch": 1.0385153635533921, + "grad_norm": 1.2238306999206543, + "learning_rate": 9.70256304208929e-06, + "loss": 0.3449, + "step": 8534 + }, + { + "epoch": 1.0386370550654092, + "grad_norm": 1.53570556640625, + "learning_rate": 9.700613216849605e-06, + "loss": 0.3847, + "step": 8535 + }, + { + "epoch": 1.0387587465774262, + "grad_norm": 2.3927245140075684, + "learning_rate": 9.69866340300221e-06, + "loss": 0.4218, + "step": 8536 + }, + { + "epoch": 1.0388804380894432, + "grad_norm": 1.9626636505126953, + "learning_rate": 9.696713600621292e-06, + "loss": 0.437, + "step": 8537 + }, + { + "epoch": 1.0390021296014602, + "grad_norm": 1.0661342144012451, + "learning_rate": 9.694763809781056e-06, + "loss": 0.3301, + "step": 8538 + }, + { + "epoch": 1.0391238211134772, + "grad_norm": 1.2507702112197876, + "learning_rate": 9.692814030555683e-06, + "loss": 0.3564, + "step": 8539 + }, + { + "epoch": 1.0392455126254945, + "grad_norm": 1.2951289415359497, + "learning_rate": 9.690864263019372e-06, + "loss": 0.3778, + "step": 8540 + }, + { + "epoch": 1.0393672041375115, + "grad_norm": 1.6303167343139648, + "learning_rate": 9.68891450724632e-06, + "loss": 0.3563, + "step": 8541 + }, + { + "epoch": 1.0394888956495285, + "grad_norm": 1.423923373222351, + "learning_rate": 9.686964763310712e-06, + "loss": 0.3839, + "step": 8542 + }, + { + "epoch": 1.0396105871615455, + "grad_norm": 1.3353351354599, + "learning_rate": 9.685015031286741e-06, + "loss": 0.3779, + "step": 8543 + }, + { + "epoch": 1.0397322786735625, + "grad_norm": 1.1837313175201416, + "learning_rate": 9.683065311248602e-06, + "loss": 0.3522, + "step": 8544 + }, + { + "epoch": 1.0398539701855796, + "grad_norm": 1.2641229629516602, + "learning_rate": 9.681115603270484e-06, + "loss": 0.3511, + "step": 8545 + }, + { + "epoch": 1.0399756616975966, + "grad_norm": 1.8671753406524658, + "learning_rate": 9.679165907426576e-06, + "loss": 0.3627, + "step": 8546 + }, + { + "epoch": 1.0400973532096136, + "grad_norm": 2.247713088989258, + "learning_rate": 9.67721622379107e-06, + "loss": 0.3977, + "step": 8547 + }, + { + "epoch": 1.0402190447216306, + "grad_norm": 3.400810956954956, + "learning_rate": 9.675266552438155e-06, + "loss": 0.4593, + "step": 8548 + }, + { + "epoch": 1.0403407362336476, + "grad_norm": 1.6697648763656616, + "learning_rate": 9.673316893442025e-06, + "loss": 0.334, + "step": 8549 + }, + { + "epoch": 1.0404624277456647, + "grad_norm": 1.9920457601547241, + "learning_rate": 9.671367246876856e-06, + "loss": 0.4137, + "step": 8550 + }, + { + "epoch": 1.0405841192576817, + "grad_norm": 1.3066365718841553, + "learning_rate": 9.669417612816846e-06, + "loss": 0.344, + "step": 8551 + }, + { + "epoch": 1.040705810769699, + "grad_norm": 1.6996352672576904, + "learning_rate": 9.66746799133618e-06, + "loss": 0.4173, + "step": 8552 + }, + { + "epoch": 1.040827502281716, + "grad_norm": 1.5663464069366455, + "learning_rate": 9.665518382509046e-06, + "loss": 0.4363, + "step": 8553 + }, + { + "epoch": 1.040949193793733, + "grad_norm": 1.4060596227645874, + "learning_rate": 9.663568786409628e-06, + "loss": 0.3948, + "step": 8554 + }, + { + "epoch": 1.04107088530575, + "grad_norm": 2.2613673210144043, + "learning_rate": 9.661619203112115e-06, + "loss": 0.4088, + "step": 8555 + }, + { + "epoch": 1.041192576817767, + "grad_norm": 1.3217846155166626, + "learning_rate": 9.659669632690691e-06, + "loss": 0.419, + "step": 8556 + }, + { + "epoch": 1.041314268329784, + "grad_norm": 2.6818349361419678, + "learning_rate": 9.657720075219542e-06, + "loss": 0.3835, + "step": 8557 + }, + { + "epoch": 1.041435959841801, + "grad_norm": 1.4916032552719116, + "learning_rate": 9.655770530772854e-06, + "loss": 0.3406, + "step": 8558 + }, + { + "epoch": 1.041557651353818, + "grad_norm": 1.768543004989624, + "learning_rate": 9.65382099942481e-06, + "loss": 0.3617, + "step": 8559 + }, + { + "epoch": 1.041679342865835, + "grad_norm": 1.6199798583984375, + "learning_rate": 9.651871481249591e-06, + "loss": 0.3767, + "step": 8560 + }, + { + "epoch": 1.041801034377852, + "grad_norm": 1.3501124382019043, + "learning_rate": 9.649921976321388e-06, + "loss": 0.405, + "step": 8561 + }, + { + "epoch": 1.0419227258898691, + "grad_norm": 1.4673627614974976, + "learning_rate": 9.647972484714374e-06, + "loss": 0.3952, + "step": 8562 + }, + { + "epoch": 1.0420444174018861, + "grad_norm": 1.5570197105407715, + "learning_rate": 9.646023006502738e-06, + "loss": 0.3859, + "step": 8563 + }, + { + "epoch": 1.0421661089139032, + "grad_norm": 1.3670626878738403, + "learning_rate": 9.644073541760656e-06, + "loss": 0.3899, + "step": 8564 + }, + { + "epoch": 1.0422878004259204, + "grad_norm": 1.4149664640426636, + "learning_rate": 9.642124090562314e-06, + "loss": 0.3713, + "step": 8565 + }, + { + "epoch": 1.0424094919379374, + "grad_norm": 2.66680908203125, + "learning_rate": 9.64017465298189e-06, + "loss": 0.4047, + "step": 8566 + }, + { + "epoch": 1.0425311834499544, + "grad_norm": 1.4891482591629028, + "learning_rate": 9.638225229093568e-06, + "loss": 0.3903, + "step": 8567 + }, + { + "epoch": 1.0426528749619715, + "grad_norm": 1.4337607622146606, + "learning_rate": 9.636275818971523e-06, + "loss": 0.3711, + "step": 8568 + }, + { + "epoch": 1.0427745664739885, + "grad_norm": 2.509216785430908, + "learning_rate": 9.634326422689935e-06, + "loss": 0.4449, + "step": 8569 + }, + { + "epoch": 1.0428962579860055, + "grad_norm": 2.597322702407837, + "learning_rate": 9.632377040322988e-06, + "loss": 0.4404, + "step": 8570 + }, + { + "epoch": 1.0430179494980225, + "grad_norm": 1.3111467361450195, + "learning_rate": 9.630427671944851e-06, + "loss": 0.3985, + "step": 8571 + }, + { + "epoch": 1.0431396410100395, + "grad_norm": 1.3656083345413208, + "learning_rate": 9.62847831762971e-06, + "loss": 0.373, + "step": 8572 + }, + { + "epoch": 1.0432613325220566, + "grad_norm": 1.302311897277832, + "learning_rate": 9.626528977451735e-06, + "loss": 0.4065, + "step": 8573 + }, + { + "epoch": 1.0433830240340736, + "grad_norm": 1.2328763008117676, + "learning_rate": 9.624579651485107e-06, + "loss": 0.3925, + "step": 8574 + }, + { + "epoch": 1.0435047155460906, + "grad_norm": 1.8730429410934448, + "learning_rate": 9.622630339804e-06, + "loss": 0.4329, + "step": 8575 + }, + { + "epoch": 1.0436264070581076, + "grad_norm": 1.287023901939392, + "learning_rate": 9.620681042482587e-06, + "loss": 0.4223, + "step": 8576 + }, + { + "epoch": 1.0437480985701246, + "grad_norm": 1.6565619707107544, + "learning_rate": 9.618731759595049e-06, + "loss": 0.3709, + "step": 8577 + }, + { + "epoch": 1.0438697900821419, + "grad_norm": 1.4961142539978027, + "learning_rate": 9.616782491215555e-06, + "loss": 0.3484, + "step": 8578 + }, + { + "epoch": 1.0439914815941589, + "grad_norm": 1.7269107103347778, + "learning_rate": 9.61483323741828e-06, + "loss": 0.4001, + "step": 8579 + }, + { + "epoch": 1.044113173106176, + "grad_norm": 2.211500883102417, + "learning_rate": 9.612883998277398e-06, + "loss": 0.3564, + "step": 8580 + }, + { + "epoch": 1.044234864618193, + "grad_norm": 1.8098746538162231, + "learning_rate": 9.610934773867083e-06, + "loss": 0.3972, + "step": 8581 + }, + { + "epoch": 1.04435655613021, + "grad_norm": 2.49975848197937, + "learning_rate": 9.608985564261502e-06, + "loss": 0.3494, + "step": 8582 + }, + { + "epoch": 1.044478247642227, + "grad_norm": 1.2084003686904907, + "learning_rate": 9.60703636953483e-06, + "loss": 0.3629, + "step": 8583 + }, + { + "epoch": 1.044599939154244, + "grad_norm": 1.2928369045257568, + "learning_rate": 9.605087189761244e-06, + "loss": 0.3882, + "step": 8584 + }, + { + "epoch": 1.044721630666261, + "grad_norm": 1.0913563966751099, + "learning_rate": 9.6031380250149e-06, + "loss": 0.3421, + "step": 8585 + }, + { + "epoch": 1.044843322178278, + "grad_norm": 1.943540334701538, + "learning_rate": 9.60118887536998e-06, + "loss": 0.3426, + "step": 8586 + }, + { + "epoch": 1.044965013690295, + "grad_norm": 1.5081590414047241, + "learning_rate": 9.599239740900647e-06, + "loss": 0.3626, + "step": 8587 + }, + { + "epoch": 1.045086705202312, + "grad_norm": 2.3020002841949463, + "learning_rate": 9.59729062168107e-06, + "loss": 0.338, + "step": 8588 + }, + { + "epoch": 1.045208396714329, + "grad_norm": 2.5086286067962646, + "learning_rate": 9.59534151778542e-06, + "loss": 0.3969, + "step": 8589 + }, + { + "epoch": 1.045330088226346, + "grad_norm": 2.89668345451355, + "learning_rate": 9.593392429287861e-06, + "loss": 0.4072, + "step": 8590 + }, + { + "epoch": 1.0454517797383633, + "grad_norm": 1.1879040002822876, + "learning_rate": 9.591443356262564e-06, + "loss": 0.3883, + "step": 8591 + }, + { + "epoch": 1.0455734712503804, + "grad_norm": 1.5011917352676392, + "learning_rate": 9.589494298783692e-06, + "loss": 0.3596, + "step": 8592 + }, + { + "epoch": 1.0456951627623974, + "grad_norm": 3.433818817138672, + "learning_rate": 9.587545256925412e-06, + "loss": 0.4029, + "step": 8593 + }, + { + "epoch": 1.0458168542744144, + "grad_norm": 2.515930652618408, + "learning_rate": 9.58559623076189e-06, + "loss": 0.4472, + "step": 8594 + }, + { + "epoch": 1.0459385457864314, + "grad_norm": 1.3873485326766968, + "learning_rate": 9.583647220367291e-06, + "loss": 0.3614, + "step": 8595 + }, + { + "epoch": 1.0460602372984484, + "grad_norm": 1.8964276313781738, + "learning_rate": 9.581698225815771e-06, + "loss": 0.4187, + "step": 8596 + }, + { + "epoch": 1.0461819288104655, + "grad_norm": 1.5862632989883423, + "learning_rate": 9.579749247181502e-06, + "loss": 0.3632, + "step": 8597 + }, + { + "epoch": 1.0463036203224825, + "grad_norm": 2.7335329055786133, + "learning_rate": 9.577800284538645e-06, + "loss": 0.4063, + "step": 8598 + }, + { + "epoch": 1.0464253118344995, + "grad_norm": 3.465146064758301, + "learning_rate": 9.57585133796136e-06, + "loss": 0.3964, + "step": 8599 + }, + { + "epoch": 1.0465470033465165, + "grad_norm": 3.3736541271209717, + "learning_rate": 9.57390240752381e-06, + "loss": 0.3591, + "step": 8600 + }, + { + "epoch": 1.0466686948585335, + "grad_norm": 3.1835906505584717, + "learning_rate": 9.571953493300156e-06, + "loss": 0.3519, + "step": 8601 + }, + { + "epoch": 1.0467903863705506, + "grad_norm": 1.5710586309432983, + "learning_rate": 9.570004595364557e-06, + "loss": 0.4274, + "step": 8602 + }, + { + "epoch": 1.0469120778825678, + "grad_norm": 1.8897483348846436, + "learning_rate": 9.56805571379117e-06, + "loss": 0.4137, + "step": 8603 + }, + { + "epoch": 1.0470337693945848, + "grad_norm": 1.7368671894073486, + "learning_rate": 9.566106848654163e-06, + "loss": 0.3819, + "step": 8604 + }, + { + "epoch": 1.0471554609066018, + "grad_norm": 1.4896665811538696, + "learning_rate": 9.564158000027685e-06, + "loss": 0.3523, + "step": 8605 + }, + { + "epoch": 1.0472771524186189, + "grad_norm": 1.4961308240890503, + "learning_rate": 9.562209167985896e-06, + "loss": 0.3786, + "step": 8606 + }, + { + "epoch": 1.0473988439306359, + "grad_norm": 2.8934786319732666, + "learning_rate": 9.56026035260296e-06, + "loss": 0.3932, + "step": 8607 + }, + { + "epoch": 1.047520535442653, + "grad_norm": 1.2765803337097168, + "learning_rate": 9.558311553953023e-06, + "loss": 0.3441, + "step": 8608 + }, + { + "epoch": 1.04764222695467, + "grad_norm": 2.254314661026001, + "learning_rate": 9.556362772110249e-06, + "loss": 0.3895, + "step": 8609 + }, + { + "epoch": 1.047763918466687, + "grad_norm": 2.7281601428985596, + "learning_rate": 9.554414007148786e-06, + "loss": 0.4032, + "step": 8610 + }, + { + "epoch": 1.047885609978704, + "grad_norm": 1.9062083959579468, + "learning_rate": 9.552465259142793e-06, + "loss": 0.3693, + "step": 8611 + }, + { + "epoch": 1.048007301490721, + "grad_norm": 2.7098045349121094, + "learning_rate": 9.550516528166425e-06, + "loss": 0.4419, + "step": 8612 + }, + { + "epoch": 1.048128993002738, + "grad_norm": 4.986382007598877, + "learning_rate": 9.548567814293831e-06, + "loss": 0.5056, + "step": 8613 + }, + { + "epoch": 1.048250684514755, + "grad_norm": 1.4649035930633545, + "learning_rate": 9.546619117599167e-06, + "loss": 0.3925, + "step": 8614 + }, + { + "epoch": 1.048372376026772, + "grad_norm": 1.3123399019241333, + "learning_rate": 9.544670438156588e-06, + "loss": 0.3392, + "step": 8615 + }, + { + "epoch": 1.0484940675387893, + "grad_norm": 1.9751774072647095, + "learning_rate": 9.542721776040236e-06, + "loss": 0.4658, + "step": 8616 + }, + { + "epoch": 1.0486157590508063, + "grad_norm": 1.6269317865371704, + "learning_rate": 9.540773131324267e-06, + "loss": 0.3948, + "step": 8617 + }, + { + "epoch": 1.0487374505628233, + "grad_norm": 2.909550189971924, + "learning_rate": 9.538824504082837e-06, + "loss": 0.389, + "step": 8618 + }, + { + "epoch": 1.0488591420748403, + "grad_norm": 3.067458152770996, + "learning_rate": 9.536875894390083e-06, + "loss": 0.443, + "step": 8619 + }, + { + "epoch": 1.0489808335868573, + "grad_norm": 2.9201252460479736, + "learning_rate": 9.534927302320165e-06, + "loss": 0.3534, + "step": 8620 + }, + { + "epoch": 1.0491025250988744, + "grad_norm": 1.5402511358261108, + "learning_rate": 9.532978727947221e-06, + "loss": 0.3872, + "step": 8621 + }, + { + "epoch": 1.0492242166108914, + "grad_norm": 1.5486410856246948, + "learning_rate": 9.531030171345405e-06, + "loss": 0.4344, + "step": 8622 + }, + { + "epoch": 1.0493459081229084, + "grad_norm": 1.9518561363220215, + "learning_rate": 9.529081632588858e-06, + "loss": 0.3875, + "step": 8623 + }, + { + "epoch": 1.0494675996349254, + "grad_norm": 1.8062916994094849, + "learning_rate": 9.527133111751736e-06, + "loss": 0.3393, + "step": 8624 + }, + { + "epoch": 1.0495892911469424, + "grad_norm": 2.0456032752990723, + "learning_rate": 9.525184608908175e-06, + "loss": 0.3419, + "step": 8625 + }, + { + "epoch": 1.0497109826589595, + "grad_norm": 1.0896750688552856, + "learning_rate": 9.52323612413232e-06, + "loss": 0.3771, + "step": 8626 + }, + { + "epoch": 1.0498326741709765, + "grad_norm": 2.2507569789886475, + "learning_rate": 9.521287657498322e-06, + "loss": 0.4071, + "step": 8627 + }, + { + "epoch": 1.0499543656829937, + "grad_norm": 1.2501325607299805, + "learning_rate": 9.519339209080316e-06, + "loss": 0.3853, + "step": 8628 + }, + { + "epoch": 1.0500760571950107, + "grad_norm": 1.1588976383209229, + "learning_rate": 9.517390778952448e-06, + "loss": 0.3499, + "step": 8629 + }, + { + "epoch": 1.0501977487070278, + "grad_norm": 1.8816875219345093, + "learning_rate": 9.515442367188866e-06, + "loss": 0.3683, + "step": 8630 + }, + { + "epoch": 1.0503194402190448, + "grad_norm": 3.6141932010650635, + "learning_rate": 9.513493973863697e-06, + "loss": 0.4278, + "step": 8631 + }, + { + "epoch": 1.0504411317310618, + "grad_norm": 1.2083204984664917, + "learning_rate": 9.511545599051094e-06, + "loss": 0.3318, + "step": 8632 + }, + { + "epoch": 1.0505628232430788, + "grad_norm": 2.6953628063201904, + "learning_rate": 9.50959724282519e-06, + "loss": 0.4406, + "step": 8633 + }, + { + "epoch": 1.0506845147550958, + "grad_norm": 3.108588695526123, + "learning_rate": 9.507648905260125e-06, + "loss": 0.3972, + "step": 8634 + }, + { + "epoch": 1.0508062062671129, + "grad_norm": 1.7507376670837402, + "learning_rate": 9.505700586430042e-06, + "loss": 0.4256, + "step": 8635 + }, + { + "epoch": 1.0509278977791299, + "grad_norm": 2.2934391498565674, + "learning_rate": 9.50375228640907e-06, + "loss": 0.3484, + "step": 8636 + }, + { + "epoch": 1.051049589291147, + "grad_norm": 1.4860869646072388, + "learning_rate": 9.501804005271352e-06, + "loss": 0.4162, + "step": 8637 + }, + { + "epoch": 1.051171280803164, + "grad_norm": 2.9421560764312744, + "learning_rate": 9.499855743091026e-06, + "loss": 0.4091, + "step": 8638 + }, + { + "epoch": 1.051292972315181, + "grad_norm": 3.1615567207336426, + "learning_rate": 9.49790749994222e-06, + "loss": 0.383, + "step": 8639 + }, + { + "epoch": 1.051414663827198, + "grad_norm": 1.2670717239379883, + "learning_rate": 9.495959275899075e-06, + "loss": 0.374, + "step": 8640 + }, + { + "epoch": 1.0515363553392152, + "grad_norm": 1.1805905103683472, + "learning_rate": 9.494011071035726e-06, + "loss": 0.4205, + "step": 8641 + }, + { + "epoch": 1.0516580468512322, + "grad_norm": 1.6930080652236938, + "learning_rate": 9.492062885426299e-06, + "loss": 0.4114, + "step": 8642 + }, + { + "epoch": 1.0517797383632492, + "grad_norm": 1.0549845695495605, + "learning_rate": 9.490114719144932e-06, + "loss": 0.4335, + "step": 8643 + }, + { + "epoch": 1.0519014298752662, + "grad_norm": 2.2710280418395996, + "learning_rate": 9.488166572265754e-06, + "loss": 0.4464, + "step": 8644 + }, + { + "epoch": 1.0520231213872833, + "grad_norm": 1.156900405883789, + "learning_rate": 9.486218444862898e-06, + "loss": 0.4372, + "step": 8645 + }, + { + "epoch": 1.0521448128993003, + "grad_norm": 1.38802170753479, + "learning_rate": 9.484270337010494e-06, + "loss": 0.3541, + "step": 8646 + }, + { + "epoch": 1.0522665044113173, + "grad_norm": 3.0831542015075684, + "learning_rate": 9.482322248782671e-06, + "loss": 0.504, + "step": 8647 + }, + { + "epoch": 1.0523881959233343, + "grad_norm": 1.4287234544754028, + "learning_rate": 9.480374180253557e-06, + "loss": 0.4476, + "step": 8648 + }, + { + "epoch": 1.0525098874353513, + "grad_norm": 1.5159581899642944, + "learning_rate": 9.478426131497284e-06, + "loss": 0.3851, + "step": 8649 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 1.4192575216293335, + "learning_rate": 9.476478102587973e-06, + "loss": 0.4047, + "step": 8650 + }, + { + "epoch": 1.0527532704593854, + "grad_norm": 1.8735322952270508, + "learning_rate": 9.474530093599754e-06, + "loss": 0.393, + "step": 8651 + }, + { + "epoch": 1.0528749619714024, + "grad_norm": 1.8461400270462036, + "learning_rate": 9.472582104606752e-06, + "loss": 0.3292, + "step": 8652 + }, + { + "epoch": 1.0529966534834196, + "grad_norm": 1.4358570575714111, + "learning_rate": 9.4706341356831e-06, + "loss": 0.3882, + "step": 8653 + }, + { + "epoch": 1.0531183449954367, + "grad_norm": 2.3466813564300537, + "learning_rate": 9.468686186902908e-06, + "loss": 0.3251, + "step": 8654 + }, + { + "epoch": 1.0532400365074537, + "grad_norm": 1.7165895700454712, + "learning_rate": 9.466738258340308e-06, + "loss": 0.4862, + "step": 8655 + }, + { + "epoch": 1.0533617280194707, + "grad_norm": 1.2065918445587158, + "learning_rate": 9.464790350069419e-06, + "loss": 0.3958, + "step": 8656 + }, + { + "epoch": 1.0534834195314877, + "grad_norm": 1.5157067775726318, + "learning_rate": 9.462842462164366e-06, + "loss": 0.3655, + "step": 8657 + }, + { + "epoch": 1.0536051110435047, + "grad_norm": 2.298922061920166, + "learning_rate": 9.46089459469927e-06, + "loss": 0.3877, + "step": 8658 + }, + { + "epoch": 1.0537268025555218, + "grad_norm": 3.1769769191741943, + "learning_rate": 9.45894674774825e-06, + "loss": 0.4215, + "step": 8659 + }, + { + "epoch": 1.0538484940675388, + "grad_norm": 1.8627218008041382, + "learning_rate": 9.456998921385427e-06, + "loss": 0.4379, + "step": 8660 + }, + { + "epoch": 1.0539701855795558, + "grad_norm": 3.351285934448242, + "learning_rate": 9.455051115684919e-06, + "loss": 0.465, + "step": 8661 + }, + { + "epoch": 1.0540918770915728, + "grad_norm": 1.3899731636047363, + "learning_rate": 9.453103330720842e-06, + "loss": 0.4141, + "step": 8662 + }, + { + "epoch": 1.0542135686035898, + "grad_norm": 1.1285886764526367, + "learning_rate": 9.451155566567317e-06, + "loss": 0.3705, + "step": 8663 + }, + { + "epoch": 1.0543352601156069, + "grad_norm": 1.893664002418518, + "learning_rate": 9.44920782329846e-06, + "loss": 0.3667, + "step": 8664 + }, + { + "epoch": 1.0544569516276239, + "grad_norm": 2.149561882019043, + "learning_rate": 9.447260100988385e-06, + "loss": 0.3469, + "step": 8665 + }, + { + "epoch": 1.0545786431396411, + "grad_norm": 1.609248161315918, + "learning_rate": 9.445312399711206e-06, + "loss": 0.3795, + "step": 8666 + }, + { + "epoch": 1.0547003346516581, + "grad_norm": 1.1167349815368652, + "learning_rate": 9.443364719541038e-06, + "loss": 0.3944, + "step": 8667 + }, + { + "epoch": 1.0548220261636752, + "grad_norm": 1.164547324180603, + "learning_rate": 9.441417060551994e-06, + "loss": 0.3642, + "step": 8668 + }, + { + "epoch": 1.0549437176756922, + "grad_norm": 1.6111547946929932, + "learning_rate": 9.43946942281819e-06, + "loss": 0.3918, + "step": 8669 + }, + { + "epoch": 1.0550654091877092, + "grad_norm": 2.4160683155059814, + "learning_rate": 9.437521806413733e-06, + "loss": 0.3723, + "step": 8670 + }, + { + "epoch": 1.0551871006997262, + "grad_norm": 1.576104760169983, + "learning_rate": 9.435574211412734e-06, + "loss": 0.4009, + "step": 8671 + }, + { + "epoch": 1.0553087922117432, + "grad_norm": 1.3091334104537964, + "learning_rate": 9.433626637889306e-06, + "loss": 0.3651, + "step": 8672 + }, + { + "epoch": 1.0554304837237602, + "grad_norm": 2.005645275115967, + "learning_rate": 9.431679085917556e-06, + "loss": 0.3496, + "step": 8673 + }, + { + "epoch": 1.0555521752357773, + "grad_norm": 1.513327956199646, + "learning_rate": 9.429731555571592e-06, + "loss": 0.4306, + "step": 8674 + }, + { + "epoch": 1.0556738667477943, + "grad_norm": 3.060948133468628, + "learning_rate": 9.427784046925526e-06, + "loss": 0.4433, + "step": 8675 + }, + { + "epoch": 1.0557955582598113, + "grad_norm": 1.9053220748901367, + "learning_rate": 9.425836560053462e-06, + "loss": 0.3622, + "step": 8676 + }, + { + "epoch": 1.0559172497718283, + "grad_norm": 2.5148892402648926, + "learning_rate": 9.423889095029505e-06, + "loss": 0.3924, + "step": 8677 + }, + { + "epoch": 1.0560389412838453, + "grad_norm": 3.0455322265625, + "learning_rate": 9.421941651927757e-06, + "loss": 0.4315, + "step": 8678 + }, + { + "epoch": 1.0561606327958626, + "grad_norm": 3.107144594192505, + "learning_rate": 9.419994230822328e-06, + "loss": 0.3726, + "step": 8679 + }, + { + "epoch": 1.0562823243078796, + "grad_norm": 1.6978682279586792, + "learning_rate": 9.41804683178732e-06, + "loss": 0.3486, + "step": 8680 + }, + { + "epoch": 1.0564040158198966, + "grad_norm": 1.8656402826309204, + "learning_rate": 9.41609945489683e-06, + "loss": 0.3767, + "step": 8681 + }, + { + "epoch": 1.0565257073319136, + "grad_norm": 1.498172640800476, + "learning_rate": 9.414152100224968e-06, + "loss": 0.3394, + "step": 8682 + }, + { + "epoch": 1.0566473988439307, + "grad_norm": 2.0387983322143555, + "learning_rate": 9.41220476784583e-06, + "loss": 0.3797, + "step": 8683 + }, + { + "epoch": 1.0567690903559477, + "grad_norm": 2.857862949371338, + "learning_rate": 9.410257457833518e-06, + "loss": 0.4023, + "step": 8684 + }, + { + "epoch": 1.0568907818679647, + "grad_norm": 1.7712801694869995, + "learning_rate": 9.40831017026213e-06, + "loss": 0.384, + "step": 8685 + }, + { + "epoch": 1.0570124733799817, + "grad_norm": 1.6085553169250488, + "learning_rate": 9.406362905205765e-06, + "loss": 0.4231, + "step": 8686 + }, + { + "epoch": 1.0571341648919987, + "grad_norm": 1.550455927848816, + "learning_rate": 9.404415662738522e-06, + "loss": 0.4289, + "step": 8687 + }, + { + "epoch": 1.0572558564040158, + "grad_norm": 2.210205078125, + "learning_rate": 9.402468442934497e-06, + "loss": 0.3495, + "step": 8688 + }, + { + "epoch": 1.0573775479160328, + "grad_norm": 3.5440616607666016, + "learning_rate": 9.400521245867781e-06, + "loss": 0.4284, + "step": 8689 + }, + { + "epoch": 1.0574992394280498, + "grad_norm": 2.110807180404663, + "learning_rate": 9.398574071612472e-06, + "loss": 0.4182, + "step": 8690 + }, + { + "epoch": 1.0576209309400668, + "grad_norm": 3.383354902267456, + "learning_rate": 9.396626920242664e-06, + "loss": 0.3348, + "step": 8691 + }, + { + "epoch": 1.057742622452084, + "grad_norm": 3.4444661140441895, + "learning_rate": 9.394679791832455e-06, + "loss": 0.371, + "step": 8692 + }, + { + "epoch": 1.057864313964101, + "grad_norm": 1.4034574031829834, + "learning_rate": 9.392732686455927e-06, + "loss": 0.3931, + "step": 8693 + }, + { + "epoch": 1.057986005476118, + "grad_norm": 1.4356985092163086, + "learning_rate": 9.39078560418718e-06, + "loss": 0.3658, + "step": 8694 + }, + { + "epoch": 1.0581076969881351, + "grad_norm": 2.41737699508667, + "learning_rate": 9.388838545100304e-06, + "loss": 0.4302, + "step": 8695 + }, + { + "epoch": 1.0582293885001521, + "grad_norm": 2.68820858001709, + "learning_rate": 9.386891509269383e-06, + "loss": 0.306, + "step": 8696 + }, + { + "epoch": 1.0583510800121692, + "grad_norm": 1.6873308420181274, + "learning_rate": 9.38494449676851e-06, + "loss": 0.4345, + "step": 8697 + }, + { + "epoch": 1.0584727715241862, + "grad_norm": 3.743823289871216, + "learning_rate": 9.382997507671774e-06, + "loss": 0.4291, + "step": 8698 + }, + { + "epoch": 1.0585944630362032, + "grad_norm": 1.1851119995117188, + "learning_rate": 9.38105054205326e-06, + "loss": 0.3115, + "step": 8699 + }, + { + "epoch": 1.0587161545482202, + "grad_norm": 1.9133899211883545, + "learning_rate": 9.379103599987053e-06, + "loss": 0.3641, + "step": 8700 + }, + { + "epoch": 1.0588378460602372, + "grad_norm": 1.8852527141571045, + "learning_rate": 9.37715668154724e-06, + "loss": 0.4106, + "step": 8701 + }, + { + "epoch": 1.0589595375722543, + "grad_norm": 2.589930534362793, + "learning_rate": 9.375209786807901e-06, + "loss": 0.4151, + "step": 8702 + }, + { + "epoch": 1.0590812290842713, + "grad_norm": 4.11845064163208, + "learning_rate": 9.373262915843127e-06, + "loss": 0.3209, + "step": 8703 + }, + { + "epoch": 1.0592029205962885, + "grad_norm": 1.5849689245224, + "learning_rate": 9.371316068726993e-06, + "loss": 0.4191, + "step": 8704 + }, + { + "epoch": 1.0593246121083055, + "grad_norm": 1.9058828353881836, + "learning_rate": 9.369369245533585e-06, + "loss": 0.3362, + "step": 8705 + }, + { + "epoch": 1.0594463036203225, + "grad_norm": 1.423583745956421, + "learning_rate": 9.367422446336984e-06, + "loss": 0.4166, + "step": 8706 + }, + { + "epoch": 1.0595679951323396, + "grad_norm": 3.1402063369750977, + "learning_rate": 9.365475671211266e-06, + "loss": 0.4016, + "step": 8707 + }, + { + "epoch": 1.0596896866443566, + "grad_norm": 3.0349225997924805, + "learning_rate": 9.363528920230512e-06, + "loss": 0.3238, + "step": 8708 + }, + { + "epoch": 1.0598113781563736, + "grad_norm": 2.0186822414398193, + "learning_rate": 9.361582193468801e-06, + "loss": 0.3772, + "step": 8709 + }, + { + "epoch": 1.0599330696683906, + "grad_norm": 2.000281810760498, + "learning_rate": 9.359635491000214e-06, + "loss": 0.3737, + "step": 8710 + }, + { + "epoch": 1.0600547611804076, + "grad_norm": 1.2245726585388184, + "learning_rate": 9.357688812898818e-06, + "loss": 0.4214, + "step": 8711 + }, + { + "epoch": 1.0601764526924247, + "grad_norm": 1.5693227052688599, + "learning_rate": 9.35574215923869e-06, + "loss": 0.3292, + "step": 8712 + }, + { + "epoch": 1.0602981442044417, + "grad_norm": 1.3693928718566895, + "learning_rate": 9.353795530093906e-06, + "loss": 0.3625, + "step": 8713 + }, + { + "epoch": 1.0604198357164587, + "grad_norm": 1.4481825828552246, + "learning_rate": 9.351848925538543e-06, + "loss": 0.39, + "step": 8714 + }, + { + "epoch": 1.0605415272284757, + "grad_norm": 1.5233197212219238, + "learning_rate": 9.349902345646669e-06, + "loss": 0.3279, + "step": 8715 + }, + { + "epoch": 1.0606632187404927, + "grad_norm": 2.397857189178467, + "learning_rate": 9.347955790492354e-06, + "loss": 0.3597, + "step": 8716 + }, + { + "epoch": 1.06078491025251, + "grad_norm": 4.655124664306641, + "learning_rate": 9.346009260149673e-06, + "loss": 0.4989, + "step": 8717 + }, + { + "epoch": 1.060906601764527, + "grad_norm": 1.253547191619873, + "learning_rate": 9.344062754692694e-06, + "loss": 0.3722, + "step": 8718 + }, + { + "epoch": 1.061028293276544, + "grad_norm": 1.3941712379455566, + "learning_rate": 9.342116274195484e-06, + "loss": 0.3944, + "step": 8719 + }, + { + "epoch": 1.061149984788561, + "grad_norm": 1.6175655126571655, + "learning_rate": 9.340169818732109e-06, + "loss": 0.4208, + "step": 8720 + }, + { + "epoch": 1.061271676300578, + "grad_norm": 3.4816179275512695, + "learning_rate": 9.338223388376642e-06, + "loss": 0.4428, + "step": 8721 + }, + { + "epoch": 1.061393367812595, + "grad_norm": 3.7616019248962402, + "learning_rate": 9.336276983203148e-06, + "loss": 0.3549, + "step": 8722 + }, + { + "epoch": 1.061515059324612, + "grad_norm": 3.567310333251953, + "learning_rate": 9.334330603285683e-06, + "loss": 0.3903, + "step": 8723 + }, + { + "epoch": 1.0616367508366291, + "grad_norm": 1.2563401460647583, + "learning_rate": 9.332384248698316e-06, + "loss": 0.4021, + "step": 8724 + }, + { + "epoch": 1.0617584423486461, + "grad_norm": 1.5313869714736938, + "learning_rate": 9.330437919515111e-06, + "loss": 0.4242, + "step": 8725 + }, + { + "epoch": 1.0618801338606632, + "grad_norm": 2.9950108528137207, + "learning_rate": 9.328491615810131e-06, + "loss": 0.3551, + "step": 8726 + }, + { + "epoch": 1.0620018253726802, + "grad_norm": 2.3801920413970947, + "learning_rate": 9.326545337657434e-06, + "loss": 0.3934, + "step": 8727 + }, + { + "epoch": 1.0621235168846972, + "grad_norm": 1.8461861610412598, + "learning_rate": 9.324599085131078e-06, + "loss": 0.4328, + "step": 8728 + }, + { + "epoch": 1.0622452083967144, + "grad_norm": 2.450988292694092, + "learning_rate": 9.32265285830513e-06, + "loss": 0.4012, + "step": 8729 + }, + { + "epoch": 1.0623668999087315, + "grad_norm": 1.270151972770691, + "learning_rate": 9.320706657253638e-06, + "loss": 0.4078, + "step": 8730 + }, + { + "epoch": 1.0624885914207485, + "grad_norm": 1.7064636945724487, + "learning_rate": 9.318760482050665e-06, + "loss": 0.4137, + "step": 8731 + }, + { + "epoch": 1.0626102829327655, + "grad_norm": 2.0447769165039062, + "learning_rate": 9.31681433277027e-06, + "loss": 0.3445, + "step": 8732 + }, + { + "epoch": 1.0627319744447825, + "grad_norm": 1.4277113676071167, + "learning_rate": 9.3148682094865e-06, + "loss": 0.3636, + "step": 8733 + }, + { + "epoch": 1.0628536659567995, + "grad_norm": 3.52839732170105, + "learning_rate": 9.312922112273416e-06, + "loss": 0.4182, + "step": 8734 + }, + { + "epoch": 1.0629753574688166, + "grad_norm": 4.402937412261963, + "learning_rate": 9.310976041205065e-06, + "loss": 0.4483, + "step": 8735 + }, + { + "epoch": 1.0630970489808336, + "grad_norm": 2.3507585525512695, + "learning_rate": 9.309029996355499e-06, + "loss": 0.3871, + "step": 8736 + }, + { + "epoch": 1.0632187404928506, + "grad_norm": 1.6308351755142212, + "learning_rate": 9.307083977798777e-06, + "loss": 0.3349, + "step": 8737 + }, + { + "epoch": 1.0633404320048676, + "grad_norm": 1.4599895477294922, + "learning_rate": 9.305137985608941e-06, + "loss": 0.3478, + "step": 8738 + }, + { + "epoch": 1.0634621235168846, + "grad_norm": 1.2582248449325562, + "learning_rate": 9.303192019860042e-06, + "loss": 0.3317, + "step": 8739 + }, + { + "epoch": 1.0635838150289016, + "grad_norm": 1.1287273168563843, + "learning_rate": 9.301246080626133e-06, + "loss": 0.3687, + "step": 8740 + }, + { + "epoch": 1.0637055065409187, + "grad_norm": 1.214354395866394, + "learning_rate": 9.299300167981254e-06, + "loss": 0.4112, + "step": 8741 + }, + { + "epoch": 1.063827198052936, + "grad_norm": 3.5570178031921387, + "learning_rate": 9.297354281999453e-06, + "loss": 0.4338, + "step": 8742 + }, + { + "epoch": 1.063948889564953, + "grad_norm": 2.9221506118774414, + "learning_rate": 9.295408422754777e-06, + "loss": 0.4156, + "step": 8743 + }, + { + "epoch": 1.06407058107697, + "grad_norm": 1.737151026725769, + "learning_rate": 9.29346259032127e-06, + "loss": 0.4072, + "step": 8744 + }, + { + "epoch": 1.064192272588987, + "grad_norm": 1.730349063873291, + "learning_rate": 9.291516784772977e-06, + "loss": 0.4002, + "step": 8745 + }, + { + "epoch": 1.064313964101004, + "grad_norm": 2.5676918029785156, + "learning_rate": 9.289571006183932e-06, + "loss": 0.3182, + "step": 8746 + }, + { + "epoch": 1.064435655613021, + "grad_norm": 2.8133351802825928, + "learning_rate": 9.287625254628183e-06, + "loss": 0.3892, + "step": 8747 + }, + { + "epoch": 1.064557347125038, + "grad_norm": 1.8877850770950317, + "learning_rate": 9.285679530179766e-06, + "loss": 0.3755, + "step": 8748 + }, + { + "epoch": 1.064679038637055, + "grad_norm": 1.8910564184188843, + "learning_rate": 9.28373383291272e-06, + "loss": 0.4811, + "step": 8749 + }, + { + "epoch": 1.064800730149072, + "grad_norm": 1.5362210273742676, + "learning_rate": 9.281788162901087e-06, + "loss": 0.4273, + "step": 8750 + }, + { + "epoch": 1.064922421661089, + "grad_norm": 2.644871950149536, + "learning_rate": 9.279842520218898e-06, + "loss": 0.3503, + "step": 8751 + }, + { + "epoch": 1.065044113173106, + "grad_norm": 2.3236091136932373, + "learning_rate": 9.277896904940196e-06, + "loss": 0.3814, + "step": 8752 + }, + { + "epoch": 1.0651658046851231, + "grad_norm": 1.8920600414276123, + "learning_rate": 9.275951317139008e-06, + "loss": 0.4223, + "step": 8753 + }, + { + "epoch": 1.0652874961971404, + "grad_norm": 1.8277065753936768, + "learning_rate": 9.274005756889373e-06, + "loss": 0.3635, + "step": 8754 + }, + { + "epoch": 1.0654091877091574, + "grad_norm": 1.6762092113494873, + "learning_rate": 9.272060224265321e-06, + "loss": 0.3715, + "step": 8755 + }, + { + "epoch": 1.0655308792211744, + "grad_norm": 3.025296211242676, + "learning_rate": 9.270114719340889e-06, + "loss": 0.3354, + "step": 8756 + }, + { + "epoch": 1.0656525707331914, + "grad_norm": 1.6936650276184082, + "learning_rate": 9.268169242190097e-06, + "loss": 0.3951, + "step": 8757 + }, + { + "epoch": 1.0657742622452084, + "grad_norm": 2.7245430946350098, + "learning_rate": 9.26622379288698e-06, + "loss": 0.4547, + "step": 8758 + }, + { + "epoch": 1.0658959537572255, + "grad_norm": 2.2041938304901123, + "learning_rate": 9.264278371505568e-06, + "loss": 0.4219, + "step": 8759 + }, + { + "epoch": 1.0660176452692425, + "grad_norm": 2.5318431854248047, + "learning_rate": 9.262332978119887e-06, + "loss": 0.3871, + "step": 8760 + }, + { + "epoch": 1.0661393367812595, + "grad_norm": 1.6668375730514526, + "learning_rate": 9.260387612803961e-06, + "loss": 0.3858, + "step": 8761 + }, + { + "epoch": 1.0662610282932765, + "grad_norm": 3.3045215606689453, + "learning_rate": 9.258442275631818e-06, + "loss": 0.416, + "step": 8762 + }, + { + "epoch": 1.0663827198052935, + "grad_norm": 1.2915488481521606, + "learning_rate": 9.256496966677484e-06, + "loss": 0.379, + "step": 8763 + }, + { + "epoch": 1.0665044113173106, + "grad_norm": 2.8317501544952393, + "learning_rate": 9.254551686014976e-06, + "loss": 0.3517, + "step": 8764 + }, + { + "epoch": 1.0666261028293276, + "grad_norm": 2.2278428077697754, + "learning_rate": 9.252606433718321e-06, + "loss": 0.3576, + "step": 8765 + }, + { + "epoch": 1.0667477943413446, + "grad_norm": 3.0616414546966553, + "learning_rate": 9.25066120986154e-06, + "loss": 0.3451, + "step": 8766 + }, + { + "epoch": 1.0668694858533618, + "grad_norm": 1.6762421131134033, + "learning_rate": 9.248716014518647e-06, + "loss": 0.4452, + "step": 8767 + }, + { + "epoch": 1.0669911773653789, + "grad_norm": 2.5014431476593018, + "learning_rate": 9.246770847763671e-06, + "loss": 0.3339, + "step": 8768 + }, + { + "epoch": 1.0671128688773959, + "grad_norm": 1.3409093618392944, + "learning_rate": 9.244825709670619e-06, + "loss": 0.3892, + "step": 8769 + }, + { + "epoch": 1.067234560389413, + "grad_norm": 2.126960277557373, + "learning_rate": 9.242880600313511e-06, + "loss": 0.4607, + "step": 8770 + }, + { + "epoch": 1.06735625190143, + "grad_norm": 1.0942060947418213, + "learning_rate": 9.240935519766365e-06, + "loss": 0.3462, + "step": 8771 + }, + { + "epoch": 1.067477943413447, + "grad_norm": 1.2500476837158203, + "learning_rate": 9.238990468103192e-06, + "loss": 0.364, + "step": 8772 + }, + { + "epoch": 1.067599634925464, + "grad_norm": 1.6133836507797241, + "learning_rate": 9.237045445398007e-06, + "loss": 0.4237, + "step": 8773 + }, + { + "epoch": 1.067721326437481, + "grad_norm": 1.4467661380767822, + "learning_rate": 9.235100451724825e-06, + "loss": 0.3572, + "step": 8774 + }, + { + "epoch": 1.067843017949498, + "grad_norm": 1.4626907110214233, + "learning_rate": 9.23315548715765e-06, + "loss": 0.4046, + "step": 8775 + }, + { + "epoch": 1.067964709461515, + "grad_norm": 1.370842456817627, + "learning_rate": 9.231210551770497e-06, + "loss": 0.326, + "step": 8776 + }, + { + "epoch": 1.068086400973532, + "grad_norm": 1.4305375814437866, + "learning_rate": 9.229265645637375e-06, + "loss": 0.4208, + "step": 8777 + }, + { + "epoch": 1.068208092485549, + "grad_norm": 1.6250733137130737, + "learning_rate": 9.227320768832293e-06, + "loss": 0.3933, + "step": 8778 + }, + { + "epoch": 1.0683297839975663, + "grad_norm": 1.7090439796447754, + "learning_rate": 9.225375921429255e-06, + "loss": 0.4064, + "step": 8779 + }, + { + "epoch": 1.0684514755095833, + "grad_norm": 3.8332064151763916, + "learning_rate": 9.223431103502263e-06, + "loss": 0.4336, + "step": 8780 + }, + { + "epoch": 1.0685731670216003, + "grad_norm": 2.608372688293457, + "learning_rate": 9.221486315125325e-06, + "loss": 0.3625, + "step": 8781 + }, + { + "epoch": 1.0686948585336173, + "grad_norm": 1.9305294752120972, + "learning_rate": 9.219541556372447e-06, + "loss": 0.3772, + "step": 8782 + }, + { + "epoch": 1.0688165500456344, + "grad_norm": 1.221787929534912, + "learning_rate": 9.217596827317624e-06, + "loss": 0.4123, + "step": 8783 + }, + { + "epoch": 1.0689382415576514, + "grad_norm": 1.9899630546569824, + "learning_rate": 9.215652128034864e-06, + "loss": 0.4315, + "step": 8784 + }, + { + "epoch": 1.0690599330696684, + "grad_norm": 2.623753547668457, + "learning_rate": 9.213707458598163e-06, + "loss": 0.3293, + "step": 8785 + }, + { + "epoch": 1.0691816245816854, + "grad_norm": 1.6596559286117554, + "learning_rate": 9.211762819081522e-06, + "loss": 0.3473, + "step": 8786 + }, + { + "epoch": 1.0693033160937024, + "grad_norm": 1.4267139434814453, + "learning_rate": 9.209818209558935e-06, + "loss": 0.405, + "step": 8787 + }, + { + "epoch": 1.0694250076057195, + "grad_norm": 1.655957818031311, + "learning_rate": 9.207873630104401e-06, + "loss": 0.4123, + "step": 8788 + }, + { + "epoch": 1.0695466991177365, + "grad_norm": 2.438105821609497, + "learning_rate": 9.205929080791919e-06, + "loss": 0.4156, + "step": 8789 + }, + { + "epoch": 1.0696683906297535, + "grad_norm": 1.2603689432144165, + "learning_rate": 9.203984561695476e-06, + "loss": 0.3776, + "step": 8790 + }, + { + "epoch": 1.0697900821417705, + "grad_norm": 2.1065311431884766, + "learning_rate": 9.202040072889071e-06, + "loss": 0.3181, + "step": 8791 + }, + { + "epoch": 1.0699117736537875, + "grad_norm": 2.2273366451263428, + "learning_rate": 9.200095614446689e-06, + "loss": 0.4216, + "step": 8792 + }, + { + "epoch": 1.0700334651658048, + "grad_norm": 1.4721736907958984, + "learning_rate": 9.198151186442325e-06, + "loss": 0.3372, + "step": 8793 + }, + { + "epoch": 1.0701551566778218, + "grad_norm": 1.687626838684082, + "learning_rate": 9.19620678894997e-06, + "loss": 0.409, + "step": 8794 + }, + { + "epoch": 1.0702768481898388, + "grad_norm": 1.4892747402191162, + "learning_rate": 9.194262422043609e-06, + "loss": 0.3486, + "step": 8795 + }, + { + "epoch": 1.0703985397018558, + "grad_norm": 2.552877426147461, + "learning_rate": 9.192318085797232e-06, + "loss": 0.4032, + "step": 8796 + }, + { + "epoch": 1.0705202312138729, + "grad_norm": 1.3186944723129272, + "learning_rate": 9.190373780284824e-06, + "loss": 0.3962, + "step": 8797 + }, + { + "epoch": 1.0706419227258899, + "grad_norm": 1.7931240797042847, + "learning_rate": 9.188429505580366e-06, + "loss": 0.4171, + "step": 8798 + }, + { + "epoch": 1.070763614237907, + "grad_norm": 1.9625226259231567, + "learning_rate": 9.186485261757848e-06, + "loss": 0.3774, + "step": 8799 + }, + { + "epoch": 1.070885305749924, + "grad_norm": 1.2802168130874634, + "learning_rate": 9.18454104889125e-06, + "loss": 0.3809, + "step": 8800 + }, + { + "epoch": 1.071006997261941, + "grad_norm": 3.9507319927215576, + "learning_rate": 9.182596867054551e-06, + "loss": 0.3209, + "step": 8801 + }, + { + "epoch": 1.071128688773958, + "grad_norm": 1.738481879234314, + "learning_rate": 9.180652716321739e-06, + "loss": 0.3867, + "step": 8802 + }, + { + "epoch": 1.071250380285975, + "grad_norm": 1.4252978563308716, + "learning_rate": 9.17870859676678e-06, + "loss": 0.3912, + "step": 8803 + }, + { + "epoch": 1.071372071797992, + "grad_norm": 1.5513497591018677, + "learning_rate": 9.176764508463661e-06, + "loss": 0.3632, + "step": 8804 + }, + { + "epoch": 1.0714937633100092, + "grad_norm": 2.4214656352996826, + "learning_rate": 9.174820451486357e-06, + "loss": 0.3568, + "step": 8805 + }, + { + "epoch": 1.0716154548220262, + "grad_norm": 1.6125140190124512, + "learning_rate": 9.172876425908843e-06, + "loss": 0.3933, + "step": 8806 + }, + { + "epoch": 1.0717371463340433, + "grad_norm": 1.800565242767334, + "learning_rate": 9.17093243180509e-06, + "loss": 0.3946, + "step": 8807 + }, + { + "epoch": 1.0718588378460603, + "grad_norm": 1.5824241638183594, + "learning_rate": 9.168988469249079e-06, + "loss": 0.3349, + "step": 8808 + }, + { + "epoch": 1.0719805293580773, + "grad_norm": 1.8837624788284302, + "learning_rate": 9.167044538314774e-06, + "loss": 0.3671, + "step": 8809 + }, + { + "epoch": 1.0721022208700943, + "grad_norm": 3.5118982791900635, + "learning_rate": 9.165100639076148e-06, + "loss": 0.3186, + "step": 8810 + }, + { + "epoch": 1.0722239123821113, + "grad_norm": 1.37621009349823, + "learning_rate": 9.16315677160717e-06, + "loss": 0.367, + "step": 8811 + }, + { + "epoch": 1.0723456038941284, + "grad_norm": 1.3543016910552979, + "learning_rate": 9.161212935981812e-06, + "loss": 0.379, + "step": 8812 + }, + { + "epoch": 1.0724672954061454, + "grad_norm": 2.310431718826294, + "learning_rate": 9.159269132274036e-06, + "loss": 0.4218, + "step": 8813 + }, + { + "epoch": 1.0725889869181624, + "grad_norm": 1.7307064533233643, + "learning_rate": 9.157325360557815e-06, + "loss": 0.3893, + "step": 8814 + }, + { + "epoch": 1.0727106784301794, + "grad_norm": 1.887952446937561, + "learning_rate": 9.155381620907102e-06, + "loss": 0.4116, + "step": 8815 + }, + { + "epoch": 1.0728323699421964, + "grad_norm": 1.7383230924606323, + "learning_rate": 9.15343791339587e-06, + "loss": 0.3697, + "step": 8816 + }, + { + "epoch": 1.0729540614542135, + "grad_norm": 1.839990496635437, + "learning_rate": 9.151494238098075e-06, + "loss": 0.4193, + "step": 8817 + }, + { + "epoch": 1.0730757529662307, + "grad_norm": 1.9042268991470337, + "learning_rate": 9.149550595087682e-06, + "loss": 0.4027, + "step": 8818 + }, + { + "epoch": 1.0731974444782477, + "grad_norm": 3.3323802947998047, + "learning_rate": 9.147606984438647e-06, + "loss": 0.4564, + "step": 8819 + }, + { + "epoch": 1.0733191359902647, + "grad_norm": 1.3060674667358398, + "learning_rate": 9.145663406224935e-06, + "loss": 0.3908, + "step": 8820 + }, + { + "epoch": 1.0734408275022818, + "grad_norm": 1.3116837739944458, + "learning_rate": 9.143719860520494e-06, + "loss": 0.4063, + "step": 8821 + }, + { + "epoch": 1.0735625190142988, + "grad_norm": 1.5498803853988647, + "learning_rate": 9.141776347399286e-06, + "loss": 0.4034, + "step": 8822 + }, + { + "epoch": 1.0736842105263158, + "grad_norm": 2.3495378494262695, + "learning_rate": 9.139832866935266e-06, + "loss": 0.3526, + "step": 8823 + }, + { + "epoch": 1.0738059020383328, + "grad_norm": 1.535097599029541, + "learning_rate": 9.137889419202385e-06, + "loss": 0.4213, + "step": 8824 + }, + { + "epoch": 1.0739275935503498, + "grad_norm": 3.1412715911865234, + "learning_rate": 9.135946004274595e-06, + "loss": 0.3762, + "step": 8825 + }, + { + "epoch": 1.0740492850623669, + "grad_norm": 1.5283244848251343, + "learning_rate": 9.134002622225853e-06, + "loss": 0.3751, + "step": 8826 + }, + { + "epoch": 1.0741709765743839, + "grad_norm": 1.8251268863677979, + "learning_rate": 9.1320592731301e-06, + "loss": 0.395, + "step": 8827 + }, + { + "epoch": 1.074292668086401, + "grad_norm": 1.467444658279419, + "learning_rate": 9.130115957061289e-06, + "loss": 0.4248, + "step": 8828 + }, + { + "epoch": 1.074414359598418, + "grad_norm": 1.3606672286987305, + "learning_rate": 9.128172674093363e-06, + "loss": 0.343, + "step": 8829 + }, + { + "epoch": 1.0745360511104352, + "grad_norm": 2.5812063217163086, + "learning_rate": 9.126229424300274e-06, + "loss": 0.3869, + "step": 8830 + }, + { + "epoch": 1.0746577426224522, + "grad_norm": 3.732025384902954, + "learning_rate": 9.124286207755966e-06, + "loss": 0.4326, + "step": 8831 + }, + { + "epoch": 1.0747794341344692, + "grad_norm": 2.6255147457122803, + "learning_rate": 9.122343024534377e-06, + "loss": 0.4407, + "step": 8832 + }, + { + "epoch": 1.0749011256464862, + "grad_norm": 2.0973968505859375, + "learning_rate": 9.120399874709453e-06, + "loss": 0.3559, + "step": 8833 + }, + { + "epoch": 1.0750228171585032, + "grad_norm": 3.68205189704895, + "learning_rate": 9.118456758355137e-06, + "loss": 0.4265, + "step": 8834 + }, + { + "epoch": 1.0751445086705202, + "grad_norm": 2.149353265762329, + "learning_rate": 9.116513675545363e-06, + "loss": 0.3803, + "step": 8835 + }, + { + "epoch": 1.0752662001825373, + "grad_norm": 1.8083761930465698, + "learning_rate": 9.114570626354073e-06, + "loss": 0.4038, + "step": 8836 + }, + { + "epoch": 1.0753878916945543, + "grad_norm": 1.7582157850265503, + "learning_rate": 9.112627610855208e-06, + "loss": 0.4099, + "step": 8837 + }, + { + "epoch": 1.0755095832065713, + "grad_norm": 1.6295803785324097, + "learning_rate": 9.110684629122695e-06, + "loss": 0.4194, + "step": 8838 + }, + { + "epoch": 1.0756312747185883, + "grad_norm": 1.2005438804626465, + "learning_rate": 9.108741681230475e-06, + "loss": 0.394, + "step": 8839 + }, + { + "epoch": 1.0757529662306053, + "grad_norm": 1.6948192119598389, + "learning_rate": 9.106798767252474e-06, + "loss": 0.3906, + "step": 8840 + }, + { + "epoch": 1.0758746577426224, + "grad_norm": 2.4337706565856934, + "learning_rate": 9.104855887262631e-06, + "loss": 0.4182, + "step": 8841 + }, + { + "epoch": 1.0759963492546394, + "grad_norm": 3.97274112701416, + "learning_rate": 9.102913041334877e-06, + "loss": 0.3682, + "step": 8842 + }, + { + "epoch": 1.0761180407666566, + "grad_norm": 2.991827964782715, + "learning_rate": 9.100970229543135e-06, + "loss": 0.3702, + "step": 8843 + }, + { + "epoch": 1.0762397322786736, + "grad_norm": 3.40278697013855, + "learning_rate": 9.099027451961337e-06, + "loss": 0.3512, + "step": 8844 + }, + { + "epoch": 1.0763614237906907, + "grad_norm": 3.2497072219848633, + "learning_rate": 9.09708470866341e-06, + "loss": 0.4095, + "step": 8845 + }, + { + "epoch": 1.0764831153027077, + "grad_norm": 1.726302981376648, + "learning_rate": 9.09514199972328e-06, + "loss": 0.4569, + "step": 8846 + }, + { + "epoch": 1.0766048068147247, + "grad_norm": 1.5319948196411133, + "learning_rate": 9.093199325214868e-06, + "loss": 0.364, + "step": 8847 + }, + { + "epoch": 1.0767264983267417, + "grad_norm": 3.0809860229492188, + "learning_rate": 9.0912566852121e-06, + "loss": 0.4536, + "step": 8848 + }, + { + "epoch": 1.0768481898387587, + "grad_norm": 1.3485472202301025, + "learning_rate": 9.089314079788899e-06, + "loss": 0.3907, + "step": 8849 + }, + { + "epoch": 1.0769698813507758, + "grad_norm": 1.5011563301086426, + "learning_rate": 9.08737150901918e-06, + "loss": 0.3879, + "step": 8850 + }, + { + "epoch": 1.0770915728627928, + "grad_norm": 2.9348831176757812, + "learning_rate": 9.085428972976863e-06, + "loss": 0.3307, + "step": 8851 + }, + { + "epoch": 1.0772132643748098, + "grad_norm": 1.3488210439682007, + "learning_rate": 9.083486471735866e-06, + "loss": 0.3311, + "step": 8852 + }, + { + "epoch": 1.0773349558868268, + "grad_norm": 2.135638952255249, + "learning_rate": 9.081544005370107e-06, + "loss": 0.3848, + "step": 8853 + }, + { + "epoch": 1.0774566473988438, + "grad_norm": 1.6729844808578491, + "learning_rate": 9.0796015739535e-06, + "loss": 0.4042, + "step": 8854 + }, + { + "epoch": 1.077578338910861, + "grad_norm": 2.3693199157714844, + "learning_rate": 9.077659177559957e-06, + "loss": 0.4112, + "step": 8855 + }, + { + "epoch": 1.077700030422878, + "grad_norm": 2.576648473739624, + "learning_rate": 9.075716816263392e-06, + "loss": 0.4163, + "step": 8856 + }, + { + "epoch": 1.0778217219348951, + "grad_norm": 2.359023332595825, + "learning_rate": 9.073774490137716e-06, + "loss": 0.3977, + "step": 8857 + }, + { + "epoch": 1.0779434134469121, + "grad_norm": 1.4538905620574951, + "learning_rate": 9.071832199256836e-06, + "loss": 0.3907, + "step": 8858 + }, + { + "epoch": 1.0780651049589292, + "grad_norm": 2.0680110454559326, + "learning_rate": 9.069889943694663e-06, + "loss": 0.3331, + "step": 8859 + }, + { + "epoch": 1.0781867964709462, + "grad_norm": 3.00321102142334, + "learning_rate": 9.067947723525107e-06, + "loss": 0.4598, + "step": 8860 + }, + { + "epoch": 1.0783084879829632, + "grad_norm": 2.321582078933716, + "learning_rate": 9.066005538822064e-06, + "loss": 0.324, + "step": 8861 + }, + { + "epoch": 1.0784301794949802, + "grad_norm": 1.5175471305847168, + "learning_rate": 9.064063389659446e-06, + "loss": 0.3803, + "step": 8862 + }, + { + "epoch": 1.0785518710069972, + "grad_norm": 1.7753382921218872, + "learning_rate": 9.062121276111152e-06, + "loss": 0.451, + "step": 8863 + }, + { + "epoch": 1.0786735625190143, + "grad_norm": 1.395018219947815, + "learning_rate": 9.060179198251085e-06, + "loss": 0.346, + "step": 8864 + }, + { + "epoch": 1.0787952540310313, + "grad_norm": 1.4062145948410034, + "learning_rate": 9.058237156153145e-06, + "loss": 0.3591, + "step": 8865 + }, + { + "epoch": 1.0789169455430483, + "grad_norm": 2.2314913272857666, + "learning_rate": 9.05629514989123e-06, + "loss": 0.3756, + "step": 8866 + }, + { + "epoch": 1.0790386370550653, + "grad_norm": 2.2746989727020264, + "learning_rate": 9.054353179539237e-06, + "loss": 0.3447, + "step": 8867 + }, + { + "epoch": 1.0791603285670825, + "grad_norm": 3.5027270317077637, + "learning_rate": 9.052411245171066e-06, + "loss": 0.3327, + "step": 8868 + }, + { + "epoch": 1.0792820200790996, + "grad_norm": 2.264431953430176, + "learning_rate": 9.050469346860606e-06, + "loss": 0.3397, + "step": 8869 + }, + { + "epoch": 1.0794037115911166, + "grad_norm": 1.7220628261566162, + "learning_rate": 9.048527484681753e-06, + "loss": 0.3998, + "step": 8870 + }, + { + "epoch": 1.0795254031031336, + "grad_norm": 1.2057875394821167, + "learning_rate": 9.046585658708398e-06, + "loss": 0.4028, + "step": 8871 + }, + { + "epoch": 1.0796470946151506, + "grad_norm": 1.6283847093582153, + "learning_rate": 9.044643869014437e-06, + "loss": 0.3383, + "step": 8872 + }, + { + "epoch": 1.0797687861271676, + "grad_norm": 1.2450268268585205, + "learning_rate": 9.042702115673751e-06, + "loss": 0.3279, + "step": 8873 + }, + { + "epoch": 1.0798904776391847, + "grad_norm": 1.7437647581100464, + "learning_rate": 9.04076039876023e-06, + "loss": 0.3888, + "step": 8874 + }, + { + "epoch": 1.0800121691512017, + "grad_norm": 2.5526015758514404, + "learning_rate": 9.038818718347761e-06, + "loss": 0.4001, + "step": 8875 + }, + { + "epoch": 1.0801338606632187, + "grad_norm": 2.628814458847046, + "learning_rate": 9.036877074510233e-06, + "loss": 0.4228, + "step": 8876 + }, + { + "epoch": 1.0802555521752357, + "grad_norm": 1.5292574167251587, + "learning_rate": 9.034935467321522e-06, + "loss": 0.3849, + "step": 8877 + }, + { + "epoch": 1.0803772436872527, + "grad_norm": 3.0046262741088867, + "learning_rate": 9.032993896855514e-06, + "loss": 0.4562, + "step": 8878 + }, + { + "epoch": 1.0804989351992698, + "grad_norm": 1.7847601175308228, + "learning_rate": 9.03105236318609e-06, + "loss": 0.3832, + "step": 8879 + }, + { + "epoch": 1.080620626711287, + "grad_norm": 2.2581450939178467, + "learning_rate": 9.029110866387132e-06, + "loss": 0.455, + "step": 8880 + }, + { + "epoch": 1.080742318223304, + "grad_norm": 1.386810064315796, + "learning_rate": 9.027169406532512e-06, + "loss": 0.3573, + "step": 8881 + }, + { + "epoch": 1.080864009735321, + "grad_norm": 1.7611544132232666, + "learning_rate": 9.02522798369611e-06, + "loss": 0.4436, + "step": 8882 + }, + { + "epoch": 1.080985701247338, + "grad_norm": 1.5873936414718628, + "learning_rate": 9.023286597951805e-06, + "loss": 0.4315, + "step": 8883 + }, + { + "epoch": 1.081107392759355, + "grad_norm": 2.20467209815979, + "learning_rate": 9.021345249373465e-06, + "loss": 0.4533, + "step": 8884 + }, + { + "epoch": 1.081229084271372, + "grad_norm": 3.7632782459259033, + "learning_rate": 9.019403938034962e-06, + "loss": 0.4037, + "step": 8885 + }, + { + "epoch": 1.0813507757833891, + "grad_norm": 4.266801357269287, + "learning_rate": 9.01746266401017e-06, + "loss": 0.3872, + "step": 8886 + }, + { + "epoch": 1.0814724672954061, + "grad_norm": 5.662842273712158, + "learning_rate": 9.015521427372955e-06, + "loss": 0.3623, + "step": 8887 + }, + { + "epoch": 1.0815941588074232, + "grad_norm": 4.647026538848877, + "learning_rate": 9.013580228197191e-06, + "loss": 0.367, + "step": 8888 + }, + { + "epoch": 1.0817158503194402, + "grad_norm": 3.4710135459899902, + "learning_rate": 9.011639066556737e-06, + "loss": 0.3964, + "step": 8889 + }, + { + "epoch": 1.0818375418314572, + "grad_norm": 3.1195731163024902, + "learning_rate": 9.009697942525464e-06, + "loss": 0.3439, + "step": 8890 + }, + { + "epoch": 1.0819592333434742, + "grad_norm": 1.837060570716858, + "learning_rate": 9.007756856177237e-06, + "loss": 0.3648, + "step": 8891 + }, + { + "epoch": 1.0820809248554912, + "grad_norm": 1.3897647857666016, + "learning_rate": 9.005815807585912e-06, + "loss": 0.4354, + "step": 8892 + }, + { + "epoch": 1.0822026163675083, + "grad_norm": 1.2845149040222168, + "learning_rate": 9.003874796825353e-06, + "loss": 0.4081, + "step": 8893 + }, + { + "epoch": 1.0823243078795255, + "grad_norm": 1.4223989248275757, + "learning_rate": 9.001933823969424e-06, + "loss": 0.3724, + "step": 8894 + }, + { + "epoch": 1.0824459993915425, + "grad_norm": 3.233513116836548, + "learning_rate": 8.999992889091977e-06, + "loss": 0.4095, + "step": 8895 + }, + { + "epoch": 1.0825676909035595, + "grad_norm": 1.3134852647781372, + "learning_rate": 8.998051992266872e-06, + "loss": 0.4355, + "step": 8896 + }, + { + "epoch": 1.0826893824155766, + "grad_norm": 1.2544664144515991, + "learning_rate": 8.99611113356796e-06, + "loss": 0.3631, + "step": 8897 + }, + { + "epoch": 1.0828110739275936, + "grad_norm": 2.711484432220459, + "learning_rate": 8.994170313069095e-06, + "loss": 0.4238, + "step": 8898 + }, + { + "epoch": 1.0829327654396106, + "grad_norm": 1.5422803163528442, + "learning_rate": 8.992229530844136e-06, + "loss": 0.3907, + "step": 8899 + }, + { + "epoch": 1.0830544569516276, + "grad_norm": 1.4914556741714478, + "learning_rate": 8.990288786966927e-06, + "loss": 0.3557, + "step": 8900 + }, + { + "epoch": 1.0831761484636446, + "grad_norm": 2.2798070907592773, + "learning_rate": 8.98834808151132e-06, + "loss": 0.4193, + "step": 8901 + }, + { + "epoch": 1.0832978399756616, + "grad_norm": 3.790067434310913, + "learning_rate": 8.986407414551164e-06, + "loss": 0.4607, + "step": 8902 + }, + { + "epoch": 1.0834195314876787, + "grad_norm": 1.3658655881881714, + "learning_rate": 8.984466786160302e-06, + "loss": 0.4044, + "step": 8903 + }, + { + "epoch": 1.0835412229996957, + "grad_norm": 1.631594181060791, + "learning_rate": 8.98252619641258e-06, + "loss": 0.4105, + "step": 8904 + }, + { + "epoch": 1.0836629145117127, + "grad_norm": 2.061410427093506, + "learning_rate": 8.980585645381844e-06, + "loss": 0.3873, + "step": 8905 + }, + { + "epoch": 1.08378460602373, + "grad_norm": 1.6519362926483154, + "learning_rate": 8.978645133141936e-06, + "loss": 0.4416, + "step": 8906 + }, + { + "epoch": 1.083906297535747, + "grad_norm": 2.980961799621582, + "learning_rate": 8.976704659766694e-06, + "loss": 0.3371, + "step": 8907 + }, + { + "epoch": 1.084027989047764, + "grad_norm": 2.6576199531555176, + "learning_rate": 8.974764225329957e-06, + "loss": 0.3535, + "step": 8908 + }, + { + "epoch": 1.084149680559781, + "grad_norm": 1.69933021068573, + "learning_rate": 8.972823829905561e-06, + "loss": 0.4165, + "step": 8909 + }, + { + "epoch": 1.084271372071798, + "grad_norm": 3.7051103115081787, + "learning_rate": 8.970883473567348e-06, + "loss": 0.3918, + "step": 8910 + }, + { + "epoch": 1.084393063583815, + "grad_norm": 1.5553418397903442, + "learning_rate": 8.968943156389146e-06, + "loss": 0.4838, + "step": 8911 + }, + { + "epoch": 1.084514755095832, + "grad_norm": 3.2772324085235596, + "learning_rate": 8.967002878444791e-06, + "loss": 0.3841, + "step": 8912 + }, + { + "epoch": 1.084636446607849, + "grad_norm": 1.2527636289596558, + "learning_rate": 8.965062639808116e-06, + "loss": 0.3816, + "step": 8913 + }, + { + "epoch": 1.084758138119866, + "grad_norm": 3.1844944953918457, + "learning_rate": 8.963122440552951e-06, + "loss": 0.3513, + "step": 8914 + }, + { + "epoch": 1.0848798296318831, + "grad_norm": 2.1681220531463623, + "learning_rate": 8.961182280753122e-06, + "loss": 0.3885, + "step": 8915 + }, + { + "epoch": 1.0850015211439001, + "grad_norm": 2.5879220962524414, + "learning_rate": 8.959242160482456e-06, + "loss": 0.4517, + "step": 8916 + }, + { + "epoch": 1.0851232126559172, + "grad_norm": 1.6130682229995728, + "learning_rate": 8.957302079814783e-06, + "loss": 0.3424, + "step": 8917 + }, + { + "epoch": 1.0852449041679342, + "grad_norm": 1.7708659172058105, + "learning_rate": 8.955362038823926e-06, + "loss": 0.3234, + "step": 8918 + }, + { + "epoch": 1.0853665956799514, + "grad_norm": 5.493325710296631, + "learning_rate": 8.953422037583702e-06, + "loss": 0.4904, + "step": 8919 + }, + { + "epoch": 1.0854882871919684, + "grad_norm": 4.483700752258301, + "learning_rate": 8.951482076167935e-06, + "loss": 0.4188, + "step": 8920 + }, + { + "epoch": 1.0856099787039855, + "grad_norm": 2.5243887901306152, + "learning_rate": 8.949542154650445e-06, + "loss": 0.4062, + "step": 8921 + }, + { + "epoch": 1.0857316702160025, + "grad_norm": 2.6309549808502197, + "learning_rate": 8.947602273105055e-06, + "loss": 0.4069, + "step": 8922 + }, + { + "epoch": 1.0858533617280195, + "grad_norm": 4.536709785461426, + "learning_rate": 8.945662431605573e-06, + "loss": 0.4769, + "step": 8923 + }, + { + "epoch": 1.0859750532400365, + "grad_norm": 5.025242328643799, + "learning_rate": 8.943722630225817e-06, + "loss": 0.4726, + "step": 8924 + }, + { + "epoch": 1.0860967447520535, + "grad_norm": 2.740510940551758, + "learning_rate": 8.941782869039604e-06, + "loss": 0.3125, + "step": 8925 + }, + { + "epoch": 1.0862184362640706, + "grad_norm": 1.3224332332611084, + "learning_rate": 8.939843148120741e-06, + "loss": 0.3817, + "step": 8926 + }, + { + "epoch": 1.0863401277760876, + "grad_norm": 2.4443953037261963, + "learning_rate": 8.937903467543042e-06, + "loss": 0.4152, + "step": 8927 + }, + { + "epoch": 1.0864618192881046, + "grad_norm": 1.2838761806488037, + "learning_rate": 8.935963827380315e-06, + "loss": 0.4078, + "step": 8928 + }, + { + "epoch": 1.0865835108001216, + "grad_norm": 1.5518044233322144, + "learning_rate": 8.934024227706366e-06, + "loss": 0.3563, + "step": 8929 + }, + { + "epoch": 1.0867052023121386, + "grad_norm": 1.638936996459961, + "learning_rate": 8.932084668595005e-06, + "loss": 0.3953, + "step": 8930 + }, + { + "epoch": 1.0868268938241559, + "grad_norm": 4.243192672729492, + "learning_rate": 8.930145150120028e-06, + "loss": 0.3295, + "step": 8931 + }, + { + "epoch": 1.086948585336173, + "grad_norm": 2.638798952102661, + "learning_rate": 8.928205672355244e-06, + "loss": 0.4957, + "step": 8932 + }, + { + "epoch": 1.08707027684819, + "grad_norm": 3.9487528800964355, + "learning_rate": 8.926266235374454e-06, + "loss": 0.3609, + "step": 8933 + }, + { + "epoch": 1.087191968360207, + "grad_norm": 3.4017183780670166, + "learning_rate": 8.924326839251452e-06, + "loss": 0.3467, + "step": 8934 + }, + { + "epoch": 1.087313659872224, + "grad_norm": 1.3985438346862793, + "learning_rate": 8.922387484060043e-06, + "loss": 0.4121, + "step": 8935 + }, + { + "epoch": 1.087435351384241, + "grad_norm": 1.3551188707351685, + "learning_rate": 8.920448169874023e-06, + "loss": 0.38, + "step": 8936 + }, + { + "epoch": 1.087557042896258, + "grad_norm": 2.9014194011688232, + "learning_rate": 8.918508896767181e-06, + "loss": 0.3386, + "step": 8937 + }, + { + "epoch": 1.087678734408275, + "grad_norm": 1.9644094705581665, + "learning_rate": 8.916569664813315e-06, + "loss": 0.3995, + "step": 8938 + }, + { + "epoch": 1.087800425920292, + "grad_norm": 1.6576064825057983, + "learning_rate": 8.914630474086216e-06, + "loss": 0.3901, + "step": 8939 + }, + { + "epoch": 1.087922117432309, + "grad_norm": 3.281888008117676, + "learning_rate": 8.912691324659676e-06, + "loss": 0.4632, + "step": 8940 + }, + { + "epoch": 1.088043808944326, + "grad_norm": 1.4651509523391724, + "learning_rate": 8.910752216607483e-06, + "loss": 0.4087, + "step": 8941 + }, + { + "epoch": 1.088165500456343, + "grad_norm": 1.5841832160949707, + "learning_rate": 8.908813150003418e-06, + "loss": 0.3013, + "step": 8942 + }, + { + "epoch": 1.08828719196836, + "grad_norm": 1.4435245990753174, + "learning_rate": 8.906874124921274e-06, + "loss": 0.3077, + "step": 8943 + }, + { + "epoch": 1.0884088834803773, + "grad_norm": 3.5181400775909424, + "learning_rate": 8.904935141434833e-06, + "loss": 0.4537, + "step": 8944 + }, + { + "epoch": 1.0885305749923944, + "grad_norm": 2.530200242996216, + "learning_rate": 8.902996199617875e-06, + "loss": 0.4344, + "step": 8945 + }, + { + "epoch": 1.0886522665044114, + "grad_norm": 1.2385213375091553, + "learning_rate": 8.90105729954418e-06, + "loss": 0.3527, + "step": 8946 + }, + { + "epoch": 1.0887739580164284, + "grad_norm": 1.6580597162246704, + "learning_rate": 8.899118441287532e-06, + "loss": 0.3766, + "step": 8947 + }, + { + "epoch": 1.0888956495284454, + "grad_norm": 1.090971827507019, + "learning_rate": 8.897179624921706e-06, + "loss": 0.356, + "step": 8948 + }, + { + "epoch": 1.0890173410404624, + "grad_norm": 1.24204421043396, + "learning_rate": 8.895240850520477e-06, + "loss": 0.3309, + "step": 8949 + }, + { + "epoch": 1.0891390325524795, + "grad_norm": 1.9583499431610107, + "learning_rate": 8.893302118157619e-06, + "loss": 0.429, + "step": 8950 + }, + { + "epoch": 1.0892607240644965, + "grad_norm": 1.2250736951828003, + "learning_rate": 8.891363427906911e-06, + "loss": 0.3833, + "step": 8951 + }, + { + "epoch": 1.0893824155765135, + "grad_norm": 1.339637279510498, + "learning_rate": 8.88942477984212e-06, + "loss": 0.3988, + "step": 8952 + }, + { + "epoch": 1.0895041070885305, + "grad_norm": 2.841278314590454, + "learning_rate": 8.88748617403701e-06, + "loss": 0.2837, + "step": 8953 + }, + { + "epoch": 1.0896257986005475, + "grad_norm": 1.1660181283950806, + "learning_rate": 8.885547610565355e-06, + "loss": 0.3354, + "step": 8954 + }, + { + "epoch": 1.0897474901125646, + "grad_norm": 1.377590537071228, + "learning_rate": 8.883609089500919e-06, + "loss": 0.3999, + "step": 8955 + }, + { + "epoch": 1.0898691816245818, + "grad_norm": 1.1790530681610107, + "learning_rate": 8.881670610917471e-06, + "loss": 0.3932, + "step": 8956 + }, + { + "epoch": 1.0899908731365988, + "grad_norm": 1.4846265316009521, + "learning_rate": 8.87973217488877e-06, + "loss": 0.388, + "step": 8957 + }, + { + "epoch": 1.0901125646486158, + "grad_norm": 1.8230562210083008, + "learning_rate": 8.877793781488575e-06, + "loss": 0.4082, + "step": 8958 + }, + { + "epoch": 1.0902342561606329, + "grad_norm": 2.732302188873291, + "learning_rate": 8.875855430790655e-06, + "loss": 0.4456, + "step": 8959 + }, + { + "epoch": 1.0903559476726499, + "grad_norm": 1.9497780799865723, + "learning_rate": 8.87391712286876e-06, + "loss": 0.3993, + "step": 8960 + }, + { + "epoch": 1.090477639184667, + "grad_norm": 1.7974390983581543, + "learning_rate": 8.871978857796648e-06, + "loss": 0.3589, + "step": 8961 + }, + { + "epoch": 1.090599330696684, + "grad_norm": 2.550435781478882, + "learning_rate": 8.870040635648078e-06, + "loss": 0.3376, + "step": 8962 + }, + { + "epoch": 1.090721022208701, + "grad_norm": 1.7439709901809692, + "learning_rate": 8.868102456496799e-06, + "loss": 0.3484, + "step": 8963 + }, + { + "epoch": 1.090842713720718, + "grad_norm": 1.7590044736862183, + "learning_rate": 8.866164320416568e-06, + "loss": 0.4406, + "step": 8964 + }, + { + "epoch": 1.090964405232735, + "grad_norm": 1.5954684019088745, + "learning_rate": 8.864226227481127e-06, + "loss": 0.4353, + "step": 8965 + }, + { + "epoch": 1.091086096744752, + "grad_norm": 1.9115084409713745, + "learning_rate": 8.862288177764232e-06, + "loss": 0.3942, + "step": 8966 + }, + { + "epoch": 1.091207788256769, + "grad_norm": 1.9837453365325928, + "learning_rate": 8.860350171339626e-06, + "loss": 0.4004, + "step": 8967 + }, + { + "epoch": 1.091329479768786, + "grad_norm": 1.2382985353469849, + "learning_rate": 8.858412208281052e-06, + "loss": 0.4126, + "step": 8968 + }, + { + "epoch": 1.0914511712808033, + "grad_norm": 1.5390450954437256, + "learning_rate": 8.856474288662258e-06, + "loss": 0.3302, + "step": 8969 + }, + { + "epoch": 1.0915728627928203, + "grad_norm": 1.4210050106048584, + "learning_rate": 8.854536412556987e-06, + "loss": 0.3664, + "step": 8970 + }, + { + "epoch": 1.0916945543048373, + "grad_norm": 2.797977924346924, + "learning_rate": 8.852598580038974e-06, + "loss": 0.4233, + "step": 8971 + }, + { + "epoch": 1.0918162458168543, + "grad_norm": 1.2185076475143433, + "learning_rate": 8.850660791181958e-06, + "loss": 0.329, + "step": 8972 + }, + { + "epoch": 1.0919379373288713, + "grad_norm": 1.5773074626922607, + "learning_rate": 8.84872304605968e-06, + "loss": 0.3643, + "step": 8973 + }, + { + "epoch": 1.0920596288408884, + "grad_norm": 1.7411830425262451, + "learning_rate": 8.846785344745876e-06, + "loss": 0.3819, + "step": 8974 + }, + { + "epoch": 1.0921813203529054, + "grad_norm": 1.5986206531524658, + "learning_rate": 8.844847687314276e-06, + "loss": 0.3522, + "step": 8975 + }, + { + "epoch": 1.0923030118649224, + "grad_norm": 2.6501529216766357, + "learning_rate": 8.84291007383861e-06, + "loss": 0.392, + "step": 8976 + }, + { + "epoch": 1.0924247033769394, + "grad_norm": 1.5997583866119385, + "learning_rate": 8.840972504392612e-06, + "loss": 0.3987, + "step": 8977 + }, + { + "epoch": 1.0925463948889564, + "grad_norm": 1.9058746099472046, + "learning_rate": 8.83903497905001e-06, + "loss": 0.4024, + "step": 8978 + }, + { + "epoch": 1.0926680864009735, + "grad_norm": 2.0032706260681152, + "learning_rate": 8.837097497884527e-06, + "loss": 0.4052, + "step": 8979 + }, + { + "epoch": 1.0927897779129905, + "grad_norm": 1.894484043121338, + "learning_rate": 8.835160060969891e-06, + "loss": 0.3937, + "step": 8980 + }, + { + "epoch": 1.0929114694250077, + "grad_norm": 1.5778229236602783, + "learning_rate": 8.833222668379828e-06, + "loss": 0.4101, + "step": 8981 + }, + { + "epoch": 1.0930331609370247, + "grad_norm": 4.399184703826904, + "learning_rate": 8.83128532018806e-06, + "loss": 0.3433, + "step": 8982 + }, + { + "epoch": 1.0931548524490418, + "grad_norm": 1.858673095703125, + "learning_rate": 8.829348016468299e-06, + "loss": 0.3592, + "step": 8983 + }, + { + "epoch": 1.0932765439610588, + "grad_norm": 1.6883916854858398, + "learning_rate": 8.827410757294272e-06, + "loss": 0.4269, + "step": 8984 + }, + { + "epoch": 1.0933982354730758, + "grad_norm": 1.6884262561798096, + "learning_rate": 8.825473542739695e-06, + "loss": 0.3854, + "step": 8985 + }, + { + "epoch": 1.0935199269850928, + "grad_norm": 1.9616135358810425, + "learning_rate": 8.823536372878277e-06, + "loss": 0.4414, + "step": 8986 + }, + { + "epoch": 1.0936416184971098, + "grad_norm": 1.9091920852661133, + "learning_rate": 8.821599247783742e-06, + "loss": 0.4148, + "step": 8987 + }, + { + "epoch": 1.0937633100091269, + "grad_norm": 1.4490082263946533, + "learning_rate": 8.819662167529787e-06, + "loss": 0.381, + "step": 8988 + }, + { + "epoch": 1.0938850015211439, + "grad_norm": 1.411734938621521, + "learning_rate": 8.817725132190132e-06, + "loss": 0.4299, + "step": 8989 + }, + { + "epoch": 1.094006693033161, + "grad_norm": 1.2066599130630493, + "learning_rate": 8.815788141838484e-06, + "loss": 0.3801, + "step": 8990 + }, + { + "epoch": 1.094128384545178, + "grad_norm": 2.4151101112365723, + "learning_rate": 8.813851196548547e-06, + "loss": 0.3141, + "step": 8991 + }, + { + "epoch": 1.094250076057195, + "grad_norm": 1.2600125074386597, + "learning_rate": 8.811914296394027e-06, + "loss": 0.3488, + "step": 8992 + }, + { + "epoch": 1.094371767569212, + "grad_norm": 1.534321665763855, + "learning_rate": 8.80997744144863e-06, + "loss": 0.4236, + "step": 8993 + }, + { + "epoch": 1.094493459081229, + "grad_norm": 2.4045729637145996, + "learning_rate": 8.808040631786052e-06, + "loss": 0.4665, + "step": 8994 + }, + { + "epoch": 1.0946151505932462, + "grad_norm": 2.070873737335205, + "learning_rate": 8.806103867479994e-06, + "loss": 0.3362, + "step": 8995 + }, + { + "epoch": 1.0947368421052632, + "grad_norm": 1.5666120052337646, + "learning_rate": 8.804167148604159e-06, + "loss": 0.4153, + "step": 8996 + }, + { + "epoch": 1.0948585336172802, + "grad_norm": 1.8920365571975708, + "learning_rate": 8.802230475232237e-06, + "loss": 0.3489, + "step": 8997 + }, + { + "epoch": 1.0949802251292973, + "grad_norm": 1.5304187536239624, + "learning_rate": 8.800293847437928e-06, + "loss": 0.3528, + "step": 8998 + }, + { + "epoch": 1.0951019166413143, + "grad_norm": 1.4707058668136597, + "learning_rate": 8.798357265294916e-06, + "loss": 0.3852, + "step": 8999 + }, + { + "epoch": 1.0952236081533313, + "grad_norm": 1.4516830444335938, + "learning_rate": 8.796420728876898e-06, + "loss": 0.3903, + "step": 9000 + }, + { + "epoch": 1.0953452996653483, + "grad_norm": 2.015927314758301, + "learning_rate": 8.794484238257568e-06, + "loss": 0.3625, + "step": 9001 + }, + { + "epoch": 1.0954669911773653, + "grad_norm": 1.454624056816101, + "learning_rate": 8.792547793510602e-06, + "loss": 0.3123, + "step": 9002 + }, + { + "epoch": 1.0955886826893824, + "grad_norm": 1.3874398469924927, + "learning_rate": 8.790611394709693e-06, + "loss": 0.3987, + "step": 9003 + }, + { + "epoch": 1.0957103742013994, + "grad_norm": 1.4493876695632935, + "learning_rate": 8.788675041928525e-06, + "loss": 0.3916, + "step": 9004 + }, + { + "epoch": 1.0958320657134164, + "grad_norm": 1.4685176610946655, + "learning_rate": 8.786738735240777e-06, + "loss": 0.3743, + "step": 9005 + }, + { + "epoch": 1.0959537572254334, + "grad_norm": 2.2720980644226074, + "learning_rate": 8.784802474720133e-06, + "loss": 0.4268, + "step": 9006 + }, + { + "epoch": 1.0960754487374507, + "grad_norm": 1.6242934465408325, + "learning_rate": 8.782866260440268e-06, + "loss": 0.3399, + "step": 9007 + }, + { + "epoch": 1.0961971402494677, + "grad_norm": 2.1982522010803223, + "learning_rate": 8.780930092474866e-06, + "loss": 0.4336, + "step": 9008 + }, + { + "epoch": 1.0963188317614847, + "grad_norm": 1.506544589996338, + "learning_rate": 8.778993970897593e-06, + "loss": 0.3508, + "step": 9009 + }, + { + "epoch": 1.0964405232735017, + "grad_norm": 2.265421152114868, + "learning_rate": 8.777057895782131e-06, + "loss": 0.4363, + "step": 9010 + }, + { + "epoch": 1.0965622147855187, + "grad_norm": 5.215832233428955, + "learning_rate": 8.775121867202144e-06, + "loss": 0.3515, + "step": 9011 + }, + { + "epoch": 1.0966839062975358, + "grad_norm": 2.833876609802246, + "learning_rate": 8.773185885231307e-06, + "loss": 0.3791, + "step": 9012 + }, + { + "epoch": 1.0968055978095528, + "grad_norm": 1.7682383060455322, + "learning_rate": 8.771249949943285e-06, + "loss": 0.4235, + "step": 9013 + }, + { + "epoch": 1.0969272893215698, + "grad_norm": 1.5311999320983887, + "learning_rate": 8.769314061411746e-06, + "loss": 0.4589, + "step": 9014 + }, + { + "epoch": 1.0970489808335868, + "grad_norm": 3.27245831489563, + "learning_rate": 8.767378219710353e-06, + "loss": 0.3361, + "step": 9015 + }, + { + "epoch": 1.0971706723456038, + "grad_norm": 1.8526626825332642, + "learning_rate": 8.765442424912774e-06, + "loss": 0.384, + "step": 9016 + }, + { + "epoch": 1.0972923638576209, + "grad_norm": 1.8122971057891846, + "learning_rate": 8.763506677092662e-06, + "loss": 0.3677, + "step": 9017 + }, + { + "epoch": 1.0974140553696379, + "grad_norm": 1.3476673364639282, + "learning_rate": 8.76157097632368e-06, + "loss": 0.419, + "step": 9018 + }, + { + "epoch": 1.097535746881655, + "grad_norm": 1.5779483318328857, + "learning_rate": 8.75963532267949e-06, + "loss": 0.3504, + "step": 9019 + }, + { + "epoch": 1.0976574383936721, + "grad_norm": 4.563393592834473, + "learning_rate": 8.757699716233742e-06, + "loss": 0.3593, + "step": 9020 + }, + { + "epoch": 1.0977791299056892, + "grad_norm": 2.21390700340271, + "learning_rate": 8.755764157060093e-06, + "loss": 0.3094, + "step": 9021 + }, + { + "epoch": 1.0979008214177062, + "grad_norm": 2.7866342067718506, + "learning_rate": 8.75382864523219e-06, + "loss": 0.3956, + "step": 9022 + }, + { + "epoch": 1.0980225129297232, + "grad_norm": 1.9960756301879883, + "learning_rate": 8.751893180823686e-06, + "loss": 0.3557, + "step": 9023 + }, + { + "epoch": 1.0981442044417402, + "grad_norm": 1.3319308757781982, + "learning_rate": 8.749957763908232e-06, + "loss": 0.3524, + "step": 9024 + }, + { + "epoch": 1.0982658959537572, + "grad_norm": 1.9648208618164062, + "learning_rate": 8.748022394559472e-06, + "loss": 0.4304, + "step": 9025 + }, + { + "epoch": 1.0983875874657743, + "grad_norm": 1.5563209056854248, + "learning_rate": 8.74608707285105e-06, + "loss": 0.3136, + "step": 9026 + }, + { + "epoch": 1.0985092789777913, + "grad_norm": 2.40421724319458, + "learning_rate": 8.744151798856613e-06, + "loss": 0.4276, + "step": 9027 + }, + { + "epoch": 1.0986309704898083, + "grad_norm": 2.3067102432250977, + "learning_rate": 8.742216572649797e-06, + "loss": 0.376, + "step": 9028 + }, + { + "epoch": 1.0987526620018253, + "grad_norm": 3.315098762512207, + "learning_rate": 8.740281394304243e-06, + "loss": 0.4297, + "step": 9029 + }, + { + "epoch": 1.0988743535138423, + "grad_norm": 1.6769059896469116, + "learning_rate": 8.738346263893592e-06, + "loss": 0.381, + "step": 9030 + }, + { + "epoch": 1.0989960450258593, + "grad_norm": 2.220285177230835, + "learning_rate": 8.736411181491476e-06, + "loss": 0.4493, + "step": 9031 + }, + { + "epoch": 1.0991177365378766, + "grad_norm": 2.3177056312561035, + "learning_rate": 8.73447614717153e-06, + "loss": 0.387, + "step": 9032 + }, + { + "epoch": 1.0992394280498936, + "grad_norm": 2.4205918312072754, + "learning_rate": 8.732541161007389e-06, + "loss": 0.3581, + "step": 9033 + }, + { + "epoch": 1.0993611195619106, + "grad_norm": 1.4935222864151, + "learning_rate": 8.730606223072678e-06, + "loss": 0.3938, + "step": 9034 + }, + { + "epoch": 1.0994828110739276, + "grad_norm": 3.0986626148223877, + "learning_rate": 8.728671333441027e-06, + "loss": 0.4781, + "step": 9035 + }, + { + "epoch": 1.0996045025859447, + "grad_norm": 2.061501979827881, + "learning_rate": 8.726736492186062e-06, + "loss": 0.4442, + "step": 9036 + }, + { + "epoch": 1.0997261940979617, + "grad_norm": 1.3886741399765015, + "learning_rate": 8.72480169938141e-06, + "loss": 0.4234, + "step": 9037 + }, + { + "epoch": 1.0998478856099787, + "grad_norm": 2.325425148010254, + "learning_rate": 8.722866955100697e-06, + "loss": 0.4441, + "step": 9038 + }, + { + "epoch": 1.0999695771219957, + "grad_norm": 2.2290287017822266, + "learning_rate": 8.720932259417536e-06, + "loss": 0.3339, + "step": 9039 + }, + { + "epoch": 1.1000912686340127, + "grad_norm": 3.4437472820281982, + "learning_rate": 8.718997612405548e-06, + "loss": 0.3829, + "step": 9040 + }, + { + "epoch": 1.1002129601460298, + "grad_norm": 1.9450740814208984, + "learning_rate": 8.717063014138354e-06, + "loss": 0.4499, + "step": 9041 + }, + { + "epoch": 1.1003346516580468, + "grad_norm": 2.27532696723938, + "learning_rate": 8.71512846468957e-06, + "loss": 0.4174, + "step": 9042 + }, + { + "epoch": 1.1004563431700638, + "grad_norm": 1.454737663269043, + "learning_rate": 8.713193964132805e-06, + "loss": 0.4493, + "step": 9043 + }, + { + "epoch": 1.1005780346820808, + "grad_norm": 1.8113757371902466, + "learning_rate": 8.711259512541678e-06, + "loss": 0.3786, + "step": 9044 + }, + { + "epoch": 1.100699726194098, + "grad_norm": 3.0968167781829834, + "learning_rate": 8.70932510998979e-06, + "loss": 0.3427, + "step": 9045 + }, + { + "epoch": 1.100821417706115, + "grad_norm": 1.5274698734283447, + "learning_rate": 8.707390756550755e-06, + "loss": 0.3505, + "step": 9046 + }, + { + "epoch": 1.100943109218132, + "grad_norm": 1.8361154794692993, + "learning_rate": 8.705456452298175e-06, + "loss": 0.3611, + "step": 9047 + }, + { + "epoch": 1.1010648007301491, + "grad_norm": 1.2664860486984253, + "learning_rate": 8.703522197305657e-06, + "loss": 0.3678, + "step": 9048 + }, + { + "epoch": 1.1011864922421661, + "grad_norm": 1.4269071817398071, + "learning_rate": 8.701587991646802e-06, + "loss": 0.3288, + "step": 9049 + }, + { + "epoch": 1.1013081837541832, + "grad_norm": 2.48342227935791, + "learning_rate": 8.699653835395215e-06, + "loss": 0.2848, + "step": 9050 + }, + { + "epoch": 1.1014298752662002, + "grad_norm": 1.5357853174209595, + "learning_rate": 8.697719728624487e-06, + "loss": 0.316, + "step": 9051 + }, + { + "epoch": 1.1015515667782172, + "grad_norm": 2.211686372756958, + "learning_rate": 8.69578567140822e-06, + "loss": 0.3841, + "step": 9052 + }, + { + "epoch": 1.1016732582902342, + "grad_norm": 4.63115930557251, + "learning_rate": 8.69385166382001e-06, + "loss": 0.4341, + "step": 9053 + }, + { + "epoch": 1.1017949498022512, + "grad_norm": 4.628605365753174, + "learning_rate": 8.691917705933445e-06, + "loss": 0.3662, + "step": 9054 + }, + { + "epoch": 1.1019166413142683, + "grad_norm": 3.7070493698120117, + "learning_rate": 8.68998379782212e-06, + "loss": 0.3668, + "step": 9055 + }, + { + "epoch": 1.1020383328262853, + "grad_norm": 5.416265964508057, + "learning_rate": 8.688049939559626e-06, + "loss": 0.4415, + "step": 9056 + }, + { + "epoch": 1.1021600243383025, + "grad_norm": 2.968954086303711, + "learning_rate": 8.686116131219544e-06, + "loss": 0.3658, + "step": 9057 + }, + { + "epoch": 1.1022817158503195, + "grad_norm": 4.6864519119262695, + "learning_rate": 8.684182372875465e-06, + "loss": 0.432, + "step": 9058 + }, + { + "epoch": 1.1024034073623366, + "grad_norm": 4.840615749359131, + "learning_rate": 8.682248664600969e-06, + "loss": 0.5121, + "step": 9059 + }, + { + "epoch": 1.1025250988743536, + "grad_norm": 2.544372797012329, + "learning_rate": 8.680315006469638e-06, + "loss": 0.3224, + "step": 9060 + }, + { + "epoch": 1.1026467903863706, + "grad_norm": 1.6478246450424194, + "learning_rate": 8.678381398555054e-06, + "loss": 0.3644, + "step": 9061 + }, + { + "epoch": 1.1027684818983876, + "grad_norm": 2.746987819671631, + "learning_rate": 8.676447840930792e-06, + "loss": 0.3744, + "step": 9062 + }, + { + "epoch": 1.1028901734104046, + "grad_norm": 2.1918320655822754, + "learning_rate": 8.674514333670428e-06, + "loss": 0.3869, + "step": 9063 + }, + { + "epoch": 1.1030118649224216, + "grad_norm": 1.4808180332183838, + "learning_rate": 8.672580876847542e-06, + "loss": 0.411, + "step": 9064 + }, + { + "epoch": 1.1031335564344387, + "grad_norm": 1.4325323104858398, + "learning_rate": 8.670647470535698e-06, + "loss": 0.4233, + "step": 9065 + }, + { + "epoch": 1.1032552479464557, + "grad_norm": 1.5158377885818481, + "learning_rate": 8.668714114808467e-06, + "loss": 0.4147, + "step": 9066 + }, + { + "epoch": 1.1033769394584727, + "grad_norm": 1.4325822591781616, + "learning_rate": 8.666780809739427e-06, + "loss": 0.3805, + "step": 9067 + }, + { + "epoch": 1.1034986309704897, + "grad_norm": 2.2317309379577637, + "learning_rate": 8.66484755540213e-06, + "loss": 0.4299, + "step": 9068 + }, + { + "epoch": 1.1036203224825067, + "grad_norm": 2.6835567951202393, + "learning_rate": 8.66291435187015e-06, + "loss": 0.3028, + "step": 9069 + }, + { + "epoch": 1.103742013994524, + "grad_norm": 3.496803045272827, + "learning_rate": 8.660981199217047e-06, + "loss": 0.3697, + "step": 9070 + }, + { + "epoch": 1.103863705506541, + "grad_norm": 2.323664903640747, + "learning_rate": 8.659048097516378e-06, + "loss": 0.365, + "step": 9071 + }, + { + "epoch": 1.103985397018558, + "grad_norm": 2.18013596534729, + "learning_rate": 8.657115046841708e-06, + "loss": 0.346, + "step": 9072 + }, + { + "epoch": 1.104107088530575, + "grad_norm": 2.304213047027588, + "learning_rate": 8.655182047266587e-06, + "loss": 0.3382, + "step": 9073 + }, + { + "epoch": 1.104228780042592, + "grad_norm": 1.591457486152649, + "learning_rate": 8.653249098864574e-06, + "loss": 0.385, + "step": 9074 + }, + { + "epoch": 1.104350471554609, + "grad_norm": 1.8094230890274048, + "learning_rate": 8.65131620170922e-06, + "loss": 0.3944, + "step": 9075 + }, + { + "epoch": 1.104472163066626, + "grad_norm": 1.4685986042022705, + "learning_rate": 8.649383355874077e-06, + "loss": 0.3502, + "step": 9076 + }, + { + "epoch": 1.1045938545786431, + "grad_norm": 2.0377144813537598, + "learning_rate": 8.647450561432692e-06, + "loss": 0.4188, + "step": 9077 + }, + { + "epoch": 1.1047155460906601, + "grad_norm": 1.2891614437103271, + "learning_rate": 8.645517818458611e-06, + "loss": 0.3349, + "step": 9078 + }, + { + "epoch": 1.1048372376026772, + "grad_norm": 1.2488749027252197, + "learning_rate": 8.643585127025388e-06, + "loss": 0.3321, + "step": 9079 + }, + { + "epoch": 1.1049589291146942, + "grad_norm": 4.072513103485107, + "learning_rate": 8.641652487206552e-06, + "loss": 0.4602, + "step": 9080 + }, + { + "epoch": 1.1050806206267112, + "grad_norm": 2.200491189956665, + "learning_rate": 8.639719899075654e-06, + "loss": 0.393, + "step": 9081 + }, + { + "epoch": 1.1052023121387284, + "grad_norm": 1.5780788660049438, + "learning_rate": 8.637787362706227e-06, + "loss": 0.3448, + "step": 9082 + }, + { + "epoch": 1.1053240036507455, + "grad_norm": 2.6987144947052, + "learning_rate": 8.63585487817181e-06, + "loss": 0.4521, + "step": 9083 + }, + { + "epoch": 1.1054456951627625, + "grad_norm": 1.9878219366073608, + "learning_rate": 8.63392244554594e-06, + "loss": 0.4167, + "step": 9084 + }, + { + "epoch": 1.1055673866747795, + "grad_norm": 1.4324803352355957, + "learning_rate": 8.631990064902147e-06, + "loss": 0.3928, + "step": 9085 + }, + { + "epoch": 1.1056890781867965, + "grad_norm": 1.9192440509796143, + "learning_rate": 8.630057736313964e-06, + "loss": 0.3828, + "step": 9086 + }, + { + "epoch": 1.1058107696988135, + "grad_norm": 2.194176197052002, + "learning_rate": 8.628125459854922e-06, + "loss": 0.3858, + "step": 9087 + }, + { + "epoch": 1.1059324612108306, + "grad_norm": 3.5617263317108154, + "learning_rate": 8.626193235598543e-06, + "loss": 0.3727, + "step": 9088 + }, + { + "epoch": 1.1060541527228476, + "grad_norm": 4.5613112449646, + "learning_rate": 8.624261063618356e-06, + "loss": 0.3473, + "step": 9089 + }, + { + "epoch": 1.1061758442348646, + "grad_norm": 2.2122268676757812, + "learning_rate": 8.622328943987886e-06, + "loss": 0.4251, + "step": 9090 + }, + { + "epoch": 1.1062975357468816, + "grad_norm": 2.576096534729004, + "learning_rate": 8.620396876780648e-06, + "loss": 0.4217, + "step": 9091 + }, + { + "epoch": 1.1064192272588986, + "grad_norm": 2.0854804515838623, + "learning_rate": 8.618464862070168e-06, + "loss": 0.3648, + "step": 9092 + }, + { + "epoch": 1.1065409187709156, + "grad_norm": 1.4130666255950928, + "learning_rate": 8.616532899929955e-06, + "loss": 0.3559, + "step": 9093 + }, + { + "epoch": 1.1066626102829327, + "grad_norm": 1.5977747440338135, + "learning_rate": 8.61460099043353e-06, + "loss": 0.3893, + "step": 9094 + }, + { + "epoch": 1.1067843017949497, + "grad_norm": 2.612947463989258, + "learning_rate": 8.612669133654406e-06, + "loss": 0.4082, + "step": 9095 + }, + { + "epoch": 1.106905993306967, + "grad_norm": 1.413270354270935, + "learning_rate": 8.610737329666092e-06, + "loss": 0.3817, + "step": 9096 + }, + { + "epoch": 1.107027684818984, + "grad_norm": 1.6126654148101807, + "learning_rate": 8.608805578542099e-06, + "loss": 0.3163, + "step": 9097 + }, + { + "epoch": 1.107149376331001, + "grad_norm": 2.7629494667053223, + "learning_rate": 8.606873880355933e-06, + "loss": 0.4096, + "step": 9098 + }, + { + "epoch": 1.107271067843018, + "grad_norm": 2.252469778060913, + "learning_rate": 8.6049422351811e-06, + "loss": 0.3871, + "step": 9099 + }, + { + "epoch": 1.107392759355035, + "grad_norm": 2.1956517696380615, + "learning_rate": 8.603010643091101e-06, + "loss": 0.3415, + "step": 9100 + }, + { + "epoch": 1.107514450867052, + "grad_norm": 1.7556936740875244, + "learning_rate": 8.60107910415944e-06, + "loss": 0.3619, + "step": 9101 + }, + { + "epoch": 1.107636142379069, + "grad_norm": 2.947998046875, + "learning_rate": 8.599147618459618e-06, + "loss": 0.4105, + "step": 9102 + }, + { + "epoch": 1.107757833891086, + "grad_norm": 2.2991061210632324, + "learning_rate": 8.597216186065125e-06, + "loss": 0.3917, + "step": 9103 + }, + { + "epoch": 1.107879525403103, + "grad_norm": 1.6912895441055298, + "learning_rate": 8.59528480704946e-06, + "loss": 0.3173, + "step": 9104 + }, + { + "epoch": 1.10800121691512, + "grad_norm": 1.5079476833343506, + "learning_rate": 8.593353481486115e-06, + "loss": 0.3756, + "step": 9105 + }, + { + "epoch": 1.1081229084271371, + "grad_norm": 2.070499897003174, + "learning_rate": 8.591422209448582e-06, + "loss": 0.3856, + "step": 9106 + }, + { + "epoch": 1.1082445999391541, + "grad_norm": 1.5848976373672485, + "learning_rate": 8.589490991010351e-06, + "loss": 0.3934, + "step": 9107 + }, + { + "epoch": 1.1083662914511714, + "grad_norm": 2.169907569885254, + "learning_rate": 8.587559826244904e-06, + "loss": 0.3406, + "step": 9108 + }, + { + "epoch": 1.1084879829631884, + "grad_norm": 1.6242631673812866, + "learning_rate": 8.585628715225731e-06, + "loss": 0.4194, + "step": 9109 + }, + { + "epoch": 1.1086096744752054, + "grad_norm": 1.4864424467086792, + "learning_rate": 8.583697658026316e-06, + "loss": 0.4087, + "step": 9110 + }, + { + "epoch": 1.1087313659872224, + "grad_norm": 2.291318655014038, + "learning_rate": 8.581766654720132e-06, + "loss": 0.3521, + "step": 9111 + }, + { + "epoch": 1.1088530574992395, + "grad_norm": 1.442834496498108, + "learning_rate": 8.579835705380664e-06, + "loss": 0.3855, + "step": 9112 + }, + { + "epoch": 1.1089747490112565, + "grad_norm": 1.7348870038986206, + "learning_rate": 8.57790481008139e-06, + "loss": 0.405, + "step": 9113 + }, + { + "epoch": 1.1090964405232735, + "grad_norm": 1.9099787473678589, + "learning_rate": 8.57597396889578e-06, + "loss": 0.4049, + "step": 9114 + }, + { + "epoch": 1.1092181320352905, + "grad_norm": 1.8496531248092651, + "learning_rate": 8.574043181897309e-06, + "loss": 0.4312, + "step": 9115 + }, + { + "epoch": 1.1093398235473075, + "grad_norm": 1.6800591945648193, + "learning_rate": 8.572112449159442e-06, + "loss": 0.3626, + "step": 9116 + }, + { + "epoch": 1.1094615150593246, + "grad_norm": 1.8353216648101807, + "learning_rate": 8.570181770755654e-06, + "loss": 0.4022, + "step": 9117 + }, + { + "epoch": 1.1095832065713416, + "grad_norm": 1.3841429948806763, + "learning_rate": 8.56825114675941e-06, + "loss": 0.3756, + "step": 9118 + }, + { + "epoch": 1.1097048980833586, + "grad_norm": 1.9450618028640747, + "learning_rate": 8.566320577244174e-06, + "loss": 0.3856, + "step": 9119 + }, + { + "epoch": 1.1098265895953756, + "grad_norm": 1.8709648847579956, + "learning_rate": 8.564390062283405e-06, + "loss": 0.3316, + "step": 9120 + }, + { + "epoch": 1.1099482811073929, + "grad_norm": 2.1731176376342773, + "learning_rate": 8.562459601950568e-06, + "loss": 0.446, + "step": 9121 + }, + { + "epoch": 1.1100699726194099, + "grad_norm": 1.196104645729065, + "learning_rate": 8.560529196319118e-06, + "loss": 0.3708, + "step": 9122 + }, + { + "epoch": 1.110191664131427, + "grad_norm": 2.1624343395233154, + "learning_rate": 8.558598845462512e-06, + "loss": 0.3673, + "step": 9123 + }, + { + "epoch": 1.110313355643444, + "grad_norm": 1.9917705059051514, + "learning_rate": 8.556668549454203e-06, + "loss": 0.3972, + "step": 9124 + }, + { + "epoch": 1.110435047155461, + "grad_norm": 1.3727229833602905, + "learning_rate": 8.554738308367645e-06, + "loss": 0.3699, + "step": 9125 + }, + { + "epoch": 1.110556738667478, + "grad_norm": 1.489043116569519, + "learning_rate": 8.552808122276286e-06, + "loss": 0.3719, + "step": 9126 + }, + { + "epoch": 1.110678430179495, + "grad_norm": 1.4416900873184204, + "learning_rate": 8.55087799125357e-06, + "loss": 0.3998, + "step": 9127 + }, + { + "epoch": 1.110800121691512, + "grad_norm": 1.2847437858581543, + "learning_rate": 8.548947915372947e-06, + "loss": 0.3837, + "step": 9128 + }, + { + "epoch": 1.110921813203529, + "grad_norm": 2.7136335372924805, + "learning_rate": 8.54701789470786e-06, + "loss": 0.4881, + "step": 9129 + }, + { + "epoch": 1.111043504715546, + "grad_norm": 1.9693403244018555, + "learning_rate": 8.545087929331751e-06, + "loss": 0.4192, + "step": 9130 + }, + { + "epoch": 1.111165196227563, + "grad_norm": 2.21697735786438, + "learning_rate": 8.543158019318053e-06, + "loss": 0.3393, + "step": 9131 + }, + { + "epoch": 1.11128688773958, + "grad_norm": 1.8898667097091675, + "learning_rate": 8.541228164740214e-06, + "loss": 0.4462, + "step": 9132 + }, + { + "epoch": 1.1114085792515973, + "grad_norm": 2.183990955352783, + "learning_rate": 8.53929836567166e-06, + "loss": 0.4004, + "step": 9133 + }, + { + "epoch": 1.1115302707636143, + "grad_norm": 1.8716946840286255, + "learning_rate": 8.537368622185825e-06, + "loss": 0.4424, + "step": 9134 + }, + { + "epoch": 1.1116519622756313, + "grad_norm": 1.44870924949646, + "learning_rate": 8.535438934356142e-06, + "loss": 0.39, + "step": 9135 + }, + { + "epoch": 1.1117736537876484, + "grad_norm": 1.6148954629898071, + "learning_rate": 8.533509302256042e-06, + "loss": 0.3959, + "step": 9136 + }, + { + "epoch": 1.1118953452996654, + "grad_norm": 3.0691044330596924, + "learning_rate": 8.531579725958951e-06, + "loss": 0.479, + "step": 9137 + }, + { + "epoch": 1.1120170368116824, + "grad_norm": 2.1254775524139404, + "learning_rate": 8.529650205538285e-06, + "loss": 0.47, + "step": 9138 + }, + { + "epoch": 1.1121387283236994, + "grad_norm": 2.366456985473633, + "learning_rate": 8.527720741067473e-06, + "loss": 0.4024, + "step": 9139 + }, + { + "epoch": 1.1122604198357164, + "grad_norm": 1.6609021425247192, + "learning_rate": 8.525791332619934e-06, + "loss": 0.4353, + "step": 9140 + }, + { + "epoch": 1.1123821113477335, + "grad_norm": 2.807094097137451, + "learning_rate": 8.523861980269088e-06, + "loss": 0.2954, + "step": 9141 + }, + { + "epoch": 1.1125038028597505, + "grad_norm": 2.602961540222168, + "learning_rate": 8.521932684088348e-06, + "loss": 0.3672, + "step": 9142 + }, + { + "epoch": 1.1126254943717675, + "grad_norm": 1.7842473983764648, + "learning_rate": 8.520003444151128e-06, + "loss": 0.3832, + "step": 9143 + }, + { + "epoch": 1.1127471858837845, + "grad_norm": 2.3027572631835938, + "learning_rate": 8.518074260530842e-06, + "loss": 0.4034, + "step": 9144 + }, + { + "epoch": 1.1128688773958015, + "grad_norm": 2.4500296115875244, + "learning_rate": 8.516145133300896e-06, + "loss": 0.355, + "step": 9145 + }, + { + "epoch": 1.1129905689078188, + "grad_norm": 2.3904201984405518, + "learning_rate": 8.514216062534702e-06, + "loss": 0.4425, + "step": 9146 + }, + { + "epoch": 1.1131122604198358, + "grad_norm": 2.657768964767456, + "learning_rate": 8.512287048305661e-06, + "loss": 0.3289, + "step": 9147 + }, + { + "epoch": 1.1132339519318528, + "grad_norm": 2.2551615238189697, + "learning_rate": 8.510358090687178e-06, + "loss": 0.4052, + "step": 9148 + }, + { + "epoch": 1.1133556434438698, + "grad_norm": 2.1249330043792725, + "learning_rate": 8.508429189752654e-06, + "loss": 0.355, + "step": 9149 + }, + { + "epoch": 1.1134773349558869, + "grad_norm": 4.088589191436768, + "learning_rate": 8.506500345575484e-06, + "loss": 0.4759, + "step": 9150 + }, + { + "epoch": 1.1135990264679039, + "grad_norm": 2.492638349533081, + "learning_rate": 8.504571558229067e-06, + "loss": 0.3861, + "step": 9151 + }, + { + "epoch": 1.113720717979921, + "grad_norm": 2.263943910598755, + "learning_rate": 8.502642827786799e-06, + "loss": 0.3854, + "step": 9152 + }, + { + "epoch": 1.113842409491938, + "grad_norm": 2.477700710296631, + "learning_rate": 8.500714154322069e-06, + "loss": 0.4286, + "step": 9153 + }, + { + "epoch": 1.113964101003955, + "grad_norm": 1.5313174724578857, + "learning_rate": 8.49878553790827e-06, + "loss": 0.3613, + "step": 9154 + }, + { + "epoch": 1.114085792515972, + "grad_norm": 2.7664523124694824, + "learning_rate": 8.496856978618788e-06, + "loss": 0.3821, + "step": 9155 + }, + { + "epoch": 1.114207484027989, + "grad_norm": 1.5243641138076782, + "learning_rate": 8.49492847652701e-06, + "loss": 0.3569, + "step": 9156 + }, + { + "epoch": 1.114329175540006, + "grad_norm": 1.43416166305542, + "learning_rate": 8.493000031706318e-06, + "loss": 0.3354, + "step": 9157 + }, + { + "epoch": 1.1144508670520232, + "grad_norm": 1.70289945602417, + "learning_rate": 8.491071644230095e-06, + "loss": 0.3995, + "step": 9158 + }, + { + "epoch": 1.1145725585640402, + "grad_norm": 1.881446361541748, + "learning_rate": 8.489143314171719e-06, + "loss": 0.3974, + "step": 9159 + }, + { + "epoch": 1.1146942500760573, + "grad_norm": 1.7493022680282593, + "learning_rate": 8.487215041604571e-06, + "loss": 0.4416, + "step": 9160 + }, + { + "epoch": 1.1148159415880743, + "grad_norm": 1.911035180091858, + "learning_rate": 8.485286826602018e-06, + "loss": 0.4283, + "step": 9161 + }, + { + "epoch": 1.1149376331000913, + "grad_norm": 3.0140066146850586, + "learning_rate": 8.483358669237434e-06, + "loss": 0.3294, + "step": 9162 + }, + { + "epoch": 1.1150593246121083, + "grad_norm": 1.4261788129806519, + "learning_rate": 8.481430569584197e-06, + "loss": 0.4374, + "step": 9163 + }, + { + "epoch": 1.1151810161241253, + "grad_norm": 2.5744519233703613, + "learning_rate": 8.479502527715666e-06, + "loss": 0.3847, + "step": 9164 + }, + { + "epoch": 1.1153027076361424, + "grad_norm": 3.1720798015594482, + "learning_rate": 8.477574543705211e-06, + "loss": 0.4055, + "step": 9165 + }, + { + "epoch": 1.1154243991481594, + "grad_norm": 3.349222421646118, + "learning_rate": 8.475646617626198e-06, + "loss": 0.3601, + "step": 9166 + }, + { + "epoch": 1.1155460906601764, + "grad_norm": 2.2097604274749756, + "learning_rate": 8.473718749551987e-06, + "loss": 0.4389, + "step": 9167 + }, + { + "epoch": 1.1156677821721934, + "grad_norm": 2.555494546890259, + "learning_rate": 8.471790939555935e-06, + "loss": 0.3649, + "step": 9168 + }, + { + "epoch": 1.1157894736842104, + "grad_norm": 1.521283745765686, + "learning_rate": 8.4698631877114e-06, + "loss": 0.4093, + "step": 9169 + }, + { + "epoch": 1.1159111651962275, + "grad_norm": 1.1920220851898193, + "learning_rate": 8.467935494091743e-06, + "loss": 0.3774, + "step": 9170 + }, + { + "epoch": 1.1160328567082447, + "grad_norm": 2.5965051651000977, + "learning_rate": 8.46600785877031e-06, + "loss": 0.3704, + "step": 9171 + }, + { + "epoch": 1.1161545482202617, + "grad_norm": 1.5529069900512695, + "learning_rate": 8.464080281820453e-06, + "loss": 0.394, + "step": 9172 + }, + { + "epoch": 1.1162762397322787, + "grad_norm": 2.5242857933044434, + "learning_rate": 8.462152763315516e-06, + "loss": 0.3802, + "step": 9173 + }, + { + "epoch": 1.1163979312442958, + "grad_norm": 1.7580677270889282, + "learning_rate": 8.460225303328854e-06, + "loss": 0.3439, + "step": 9174 + }, + { + "epoch": 1.1165196227563128, + "grad_norm": 1.6584011316299438, + "learning_rate": 8.458297901933805e-06, + "loss": 0.3621, + "step": 9175 + }, + { + "epoch": 1.1166413142683298, + "grad_norm": 2.3109233379364014, + "learning_rate": 8.45637055920371e-06, + "loss": 0.3843, + "step": 9176 + }, + { + "epoch": 1.1167630057803468, + "grad_norm": 2.325183629989624, + "learning_rate": 8.454443275211912e-06, + "loss": 0.4008, + "step": 9177 + }, + { + "epoch": 1.1168846972923638, + "grad_norm": 2.5963947772979736, + "learning_rate": 8.452516050031749e-06, + "loss": 0.4097, + "step": 9178 + }, + { + "epoch": 1.1170063888043809, + "grad_norm": 3.48311185836792, + "learning_rate": 8.45058888373655e-06, + "loss": 0.4332, + "step": 9179 + }, + { + "epoch": 1.1171280803163979, + "grad_norm": 3.0110666751861572, + "learning_rate": 8.448661776399652e-06, + "loss": 0.3904, + "step": 9180 + }, + { + "epoch": 1.117249771828415, + "grad_norm": 4.495974063873291, + "learning_rate": 8.446734728094386e-06, + "loss": 0.4892, + "step": 9181 + }, + { + "epoch": 1.117371463340432, + "grad_norm": 2.3228940963745117, + "learning_rate": 8.444807738894077e-06, + "loss": 0.3903, + "step": 9182 + }, + { + "epoch": 1.1174931548524492, + "grad_norm": 1.5135903358459473, + "learning_rate": 8.442880808872057e-06, + "loss": 0.3918, + "step": 9183 + }, + { + "epoch": 1.1176148463644662, + "grad_norm": 2.5475640296936035, + "learning_rate": 8.44095393810164e-06, + "loss": 0.4213, + "step": 9184 + }, + { + "epoch": 1.1177365378764832, + "grad_norm": 3.3910868167877197, + "learning_rate": 8.439027126656151e-06, + "loss": 0.2829, + "step": 9185 + }, + { + "epoch": 1.1178582293885002, + "grad_norm": 2.7618460655212402, + "learning_rate": 8.437100374608916e-06, + "loss": 0.3769, + "step": 9186 + }, + { + "epoch": 1.1179799209005172, + "grad_norm": 1.492190957069397, + "learning_rate": 8.435173682033245e-06, + "loss": 0.426, + "step": 9187 + }, + { + "epoch": 1.1181016124125343, + "grad_norm": 2.698499917984009, + "learning_rate": 8.433247049002453e-06, + "loss": 0.3676, + "step": 9188 + }, + { + "epoch": 1.1182233039245513, + "grad_norm": 1.435552716255188, + "learning_rate": 8.431320475589855e-06, + "loss": 0.4303, + "step": 9189 + }, + { + "epoch": 1.1183449954365683, + "grad_norm": 2.180490016937256, + "learning_rate": 8.42939396186876e-06, + "loss": 0.3763, + "step": 9190 + }, + { + "epoch": 1.1184666869485853, + "grad_norm": 1.9120277166366577, + "learning_rate": 8.427467507912473e-06, + "loss": 0.3883, + "step": 9191 + }, + { + "epoch": 1.1185883784606023, + "grad_norm": 3.7214653491973877, + "learning_rate": 8.425541113794304e-06, + "loss": 0.3713, + "step": 9192 + }, + { + "epoch": 1.1187100699726193, + "grad_norm": 1.3134238719940186, + "learning_rate": 8.423614779587556e-06, + "loss": 0.401, + "step": 9193 + }, + { + "epoch": 1.1188317614846364, + "grad_norm": 1.6264400482177734, + "learning_rate": 8.421688505365533e-06, + "loss": 0.3972, + "step": 9194 + }, + { + "epoch": 1.1189534529966534, + "grad_norm": 2.182687759399414, + "learning_rate": 8.419762291201523e-06, + "loss": 0.3879, + "step": 9195 + }, + { + "epoch": 1.1190751445086704, + "grad_norm": 2.0381920337677, + "learning_rate": 8.417836137168828e-06, + "loss": 0.3713, + "step": 9196 + }, + { + "epoch": 1.1191968360206876, + "grad_norm": 2.056309700012207, + "learning_rate": 8.415910043340747e-06, + "loss": 0.3585, + "step": 9197 + }, + { + "epoch": 1.1193185275327047, + "grad_norm": 1.435675859451294, + "learning_rate": 8.413984009790565e-06, + "loss": 0.3617, + "step": 9198 + }, + { + "epoch": 1.1194402190447217, + "grad_norm": 1.9840588569641113, + "learning_rate": 8.412058036591573e-06, + "loss": 0.4369, + "step": 9199 + }, + { + "epoch": 1.1195619105567387, + "grad_norm": 1.6251378059387207, + "learning_rate": 8.41013212381706e-06, + "loss": 0.4057, + "step": 9200 + }, + { + "epoch": 1.1196836020687557, + "grad_norm": 2.0687034130096436, + "learning_rate": 8.408206271540314e-06, + "loss": 0.352, + "step": 9201 + }, + { + "epoch": 1.1198052935807727, + "grad_norm": 4.334172248840332, + "learning_rate": 8.406280479834612e-06, + "loss": 0.4505, + "step": 9202 + }, + { + "epoch": 1.1199269850927898, + "grad_norm": 2.7174582481384277, + "learning_rate": 8.404354748773235e-06, + "loss": 0.4329, + "step": 9203 + }, + { + "epoch": 1.1200486766048068, + "grad_norm": 1.7748609781265259, + "learning_rate": 8.402429078429466e-06, + "loss": 0.3516, + "step": 9204 + }, + { + "epoch": 1.1201703681168238, + "grad_norm": 1.4807707071304321, + "learning_rate": 8.400503468876576e-06, + "loss": 0.3765, + "step": 9205 + }, + { + "epoch": 1.1202920596288408, + "grad_norm": 1.528741717338562, + "learning_rate": 8.398577920187842e-06, + "loss": 0.4212, + "step": 9206 + }, + { + "epoch": 1.1204137511408578, + "grad_norm": 1.4302330017089844, + "learning_rate": 8.39665243243653e-06, + "loss": 0.3877, + "step": 9207 + }, + { + "epoch": 1.1205354426528749, + "grad_norm": 3.2975523471832275, + "learning_rate": 8.39472700569591e-06, + "loss": 0.3145, + "step": 9208 + }, + { + "epoch": 1.120657134164892, + "grad_norm": 1.322030782699585, + "learning_rate": 8.392801640039254e-06, + "loss": 0.3785, + "step": 9209 + }, + { + "epoch": 1.1207788256769091, + "grad_norm": 1.6766749620437622, + "learning_rate": 8.39087633553982e-06, + "loss": 0.408, + "step": 9210 + }, + { + "epoch": 1.1209005171889261, + "grad_norm": 1.4917322397232056, + "learning_rate": 8.388951092270871e-06, + "loss": 0.3892, + "step": 9211 + }, + { + "epoch": 1.1210222087009432, + "grad_norm": 3.555649518966675, + "learning_rate": 8.387025910305671e-06, + "loss": 0.3748, + "step": 9212 + }, + { + "epoch": 1.1211439002129602, + "grad_norm": 2.2035226821899414, + "learning_rate": 8.385100789717472e-06, + "loss": 0.4583, + "step": 9213 + }, + { + "epoch": 1.1212655917249772, + "grad_norm": 1.5977227687835693, + "learning_rate": 8.383175730579528e-06, + "loss": 0.3636, + "step": 9214 + }, + { + "epoch": 1.1213872832369942, + "grad_norm": 1.6033726930618286, + "learning_rate": 8.3812507329651e-06, + "loss": 0.3817, + "step": 9215 + }, + { + "epoch": 1.1215089747490112, + "grad_norm": 1.445035457611084, + "learning_rate": 8.379325796947428e-06, + "loss": 0.3518, + "step": 9216 + }, + { + "epoch": 1.1216306662610283, + "grad_norm": 2.4747064113616943, + "learning_rate": 8.377400922599769e-06, + "loss": 0.3803, + "step": 9217 + }, + { + "epoch": 1.1217523577730453, + "grad_norm": 3.3628170490264893, + "learning_rate": 8.375476109995359e-06, + "loss": 0.4531, + "step": 9218 + }, + { + "epoch": 1.1218740492850623, + "grad_norm": 2.5855064392089844, + "learning_rate": 8.373551359207445e-06, + "loss": 0.4042, + "step": 9219 + }, + { + "epoch": 1.1219957407970793, + "grad_norm": 1.3485170602798462, + "learning_rate": 8.37162667030927e-06, + "loss": 0.3386, + "step": 9220 + }, + { + "epoch": 1.1221174323090963, + "grad_norm": 2.2683331966400146, + "learning_rate": 8.36970204337407e-06, + "loss": 0.4098, + "step": 9221 + }, + { + "epoch": 1.1222391238211136, + "grad_norm": 2.050955057144165, + "learning_rate": 8.367777478475082e-06, + "loss": 0.349, + "step": 9222 + }, + { + "epoch": 1.1223608153331306, + "grad_norm": 2.4875850677490234, + "learning_rate": 8.36585297568554e-06, + "loss": 0.4101, + "step": 9223 + }, + { + "epoch": 1.1224825068451476, + "grad_norm": 1.9098100662231445, + "learning_rate": 8.363928535078674e-06, + "loss": 0.3642, + "step": 9224 + }, + { + "epoch": 1.1226041983571646, + "grad_norm": 1.338596224784851, + "learning_rate": 8.362004156727712e-06, + "loss": 0.3923, + "step": 9225 + }, + { + "epoch": 1.1227258898691816, + "grad_norm": 1.468164324760437, + "learning_rate": 8.360079840705883e-06, + "loss": 0.4224, + "step": 9226 + }, + { + "epoch": 1.1228475813811987, + "grad_norm": 1.3696421384811401, + "learning_rate": 8.358155587086414e-06, + "loss": 0.376, + "step": 9227 + }, + { + "epoch": 1.1229692728932157, + "grad_norm": 1.6756013631820679, + "learning_rate": 8.35623139594252e-06, + "loss": 0.3851, + "step": 9228 + }, + { + "epoch": 1.1230909644052327, + "grad_norm": 1.3554133176803589, + "learning_rate": 8.354307267347428e-06, + "loss": 0.3774, + "step": 9229 + }, + { + "epoch": 1.1232126559172497, + "grad_norm": 1.647902011871338, + "learning_rate": 8.352383201374348e-06, + "loss": 0.4031, + "step": 9230 + }, + { + "epoch": 1.1233343474292667, + "grad_norm": 1.9269342422485352, + "learning_rate": 8.350459198096499e-06, + "loss": 0.3639, + "step": 9231 + }, + { + "epoch": 1.1234560389412838, + "grad_norm": 1.6590639352798462, + "learning_rate": 8.348535257587089e-06, + "loss": 0.4309, + "step": 9232 + }, + { + "epoch": 1.1235777304533008, + "grad_norm": 1.882504940032959, + "learning_rate": 8.346611379919331e-06, + "loss": 0.392, + "step": 9233 + }, + { + "epoch": 1.123699421965318, + "grad_norm": 1.2456754446029663, + "learning_rate": 8.344687565166431e-06, + "loss": 0.3953, + "step": 9234 + }, + { + "epoch": 1.123821113477335, + "grad_norm": 1.6376365423202515, + "learning_rate": 8.3427638134016e-06, + "loss": 0.3902, + "step": 9235 + }, + { + "epoch": 1.123942804989352, + "grad_norm": 1.4814348220825195, + "learning_rate": 8.340840124698033e-06, + "loss": 0.3828, + "step": 9236 + }, + { + "epoch": 1.124064496501369, + "grad_norm": 1.7772419452667236, + "learning_rate": 8.338916499128934e-06, + "loss": 0.4027, + "step": 9237 + }, + { + "epoch": 1.124186188013386, + "grad_norm": 1.310228943824768, + "learning_rate": 8.336992936767502e-06, + "loss": 0.3816, + "step": 9238 + }, + { + "epoch": 1.1243078795254031, + "grad_norm": 1.4920202493667603, + "learning_rate": 8.33506943768693e-06, + "loss": 0.3356, + "step": 9239 + }, + { + "epoch": 1.1244295710374201, + "grad_norm": 1.5532442331314087, + "learning_rate": 8.333146001960414e-06, + "loss": 0.3891, + "step": 9240 + }, + { + "epoch": 1.1245512625494372, + "grad_norm": 1.3390220403671265, + "learning_rate": 8.331222629661141e-06, + "loss": 0.4108, + "step": 9241 + }, + { + "epoch": 1.1246729540614542, + "grad_norm": 2.116316556930542, + "learning_rate": 8.3292993208623e-06, + "loss": 0.4052, + "step": 9242 + }, + { + "epoch": 1.1247946455734712, + "grad_norm": 1.5120859146118164, + "learning_rate": 8.32737607563708e-06, + "loss": 0.3689, + "step": 9243 + }, + { + "epoch": 1.1249163370854882, + "grad_norm": 3.3274855613708496, + "learning_rate": 8.32545289405866e-06, + "loss": 0.3176, + "step": 9244 + }, + { + "epoch": 1.1250380285975052, + "grad_norm": 1.7269874811172485, + "learning_rate": 8.323529776200225e-06, + "loss": 0.3963, + "step": 9245 + }, + { + "epoch": 1.1251597201095223, + "grad_norm": 1.530533790588379, + "learning_rate": 8.321606722134954e-06, + "loss": 0.4131, + "step": 9246 + }, + { + "epoch": 1.1252814116215393, + "grad_norm": 1.2506675720214844, + "learning_rate": 8.31968373193602e-06, + "loss": 0.4242, + "step": 9247 + }, + { + "epoch": 1.1254031031335565, + "grad_norm": 1.1500694751739502, + "learning_rate": 8.317760805676595e-06, + "loss": 0.3202, + "step": 9248 + }, + { + "epoch": 1.1255247946455735, + "grad_norm": 1.965857744216919, + "learning_rate": 8.315837943429858e-06, + "loss": 0.3931, + "step": 9249 + }, + { + "epoch": 1.1256464861575906, + "grad_norm": 1.6869757175445557, + "learning_rate": 8.313915145268969e-06, + "loss": 0.3322, + "step": 9250 + }, + { + "epoch": 1.1257681776696076, + "grad_norm": 3.3146512508392334, + "learning_rate": 8.3119924112671e-06, + "loss": 0.3118, + "step": 9251 + }, + { + "epoch": 1.1258898691816246, + "grad_norm": 1.2627558708190918, + "learning_rate": 8.310069741497418e-06, + "loss": 0.3802, + "step": 9252 + }, + { + "epoch": 1.1260115606936416, + "grad_norm": 1.7003408670425415, + "learning_rate": 8.308147136033077e-06, + "loss": 0.3377, + "step": 9253 + }, + { + "epoch": 1.1261332522056586, + "grad_norm": 4.627403259277344, + "learning_rate": 8.30622459494724e-06, + "loss": 0.5301, + "step": 9254 + }, + { + "epoch": 1.1262549437176756, + "grad_norm": 2.9649951457977295, + "learning_rate": 8.304302118313062e-06, + "loss": 0.4097, + "step": 9255 + }, + { + "epoch": 1.1263766352296927, + "grad_norm": 1.5506354570388794, + "learning_rate": 8.302379706203699e-06, + "loss": 0.4451, + "step": 9256 + }, + { + "epoch": 1.1264983267417097, + "grad_norm": 1.629686951637268, + "learning_rate": 8.300457358692302e-06, + "loss": 0.4682, + "step": 9257 + }, + { + "epoch": 1.1266200182537267, + "grad_norm": 1.4924015998840332, + "learning_rate": 8.298535075852018e-06, + "loss": 0.4257, + "step": 9258 + }, + { + "epoch": 1.126741709765744, + "grad_norm": 1.261066198348999, + "learning_rate": 8.296612857755999e-06, + "loss": 0.3768, + "step": 9259 + }, + { + "epoch": 1.126863401277761, + "grad_norm": 1.8587522506713867, + "learning_rate": 8.294690704477385e-06, + "loss": 0.3911, + "step": 9260 + }, + { + "epoch": 1.126985092789778, + "grad_norm": 1.5861588716506958, + "learning_rate": 8.292768616089321e-06, + "loss": 0.4208, + "step": 9261 + }, + { + "epoch": 1.127106784301795, + "grad_norm": 1.6087912321090698, + "learning_rate": 8.290846592664944e-06, + "loss": 0.4646, + "step": 9262 + }, + { + "epoch": 1.127228475813812, + "grad_norm": 1.486458659172058, + "learning_rate": 8.288924634277395e-06, + "loss": 0.4132, + "step": 9263 + }, + { + "epoch": 1.127350167325829, + "grad_norm": 2.022153854370117, + "learning_rate": 8.287002740999804e-06, + "loss": 0.4231, + "step": 9264 + }, + { + "epoch": 1.127471858837846, + "grad_norm": 1.5830142498016357, + "learning_rate": 8.285080912905304e-06, + "loss": 0.3751, + "step": 9265 + }, + { + "epoch": 1.127593550349863, + "grad_norm": 1.609625220298767, + "learning_rate": 8.283159150067021e-06, + "loss": 0.445, + "step": 9266 + }, + { + "epoch": 1.12771524186188, + "grad_norm": 2.2812600135803223, + "learning_rate": 8.28123745255809e-06, + "loss": 0.4171, + "step": 9267 + }, + { + "epoch": 1.1278369333738971, + "grad_norm": 1.4414499998092651, + "learning_rate": 8.279315820451629e-06, + "loss": 0.3523, + "step": 9268 + }, + { + "epoch": 1.1279586248859141, + "grad_norm": 2.064664602279663, + "learning_rate": 8.277394253820765e-06, + "loss": 0.4462, + "step": 9269 + }, + { + "epoch": 1.1280803163979312, + "grad_norm": 2.7489442825317383, + "learning_rate": 8.275472752738613e-06, + "loss": 0.3337, + "step": 9270 + }, + { + "epoch": 1.1282020079099482, + "grad_norm": 1.8284212350845337, + "learning_rate": 8.273551317278294e-06, + "loss": 0.3644, + "step": 9271 + }, + { + "epoch": 1.1283236994219652, + "grad_norm": 1.5729069709777832, + "learning_rate": 8.271629947512922e-06, + "loss": 0.4098, + "step": 9272 + }, + { + "epoch": 1.1284453909339824, + "grad_norm": 1.5879161357879639, + "learning_rate": 8.269708643515606e-06, + "loss": 0.3688, + "step": 9273 + }, + { + "epoch": 1.1285670824459995, + "grad_norm": 2.1110644340515137, + "learning_rate": 8.267787405359457e-06, + "loss": 0.3482, + "step": 9274 + }, + { + "epoch": 1.1286887739580165, + "grad_norm": 1.3235408067703247, + "learning_rate": 8.265866233117589e-06, + "loss": 0.4, + "step": 9275 + }, + { + "epoch": 1.1288104654700335, + "grad_norm": 1.99334716796875, + "learning_rate": 8.263945126863093e-06, + "loss": 0.4066, + "step": 9276 + }, + { + "epoch": 1.1289321569820505, + "grad_norm": 1.5238032341003418, + "learning_rate": 8.262024086669083e-06, + "loss": 0.3942, + "step": 9277 + }, + { + "epoch": 1.1290538484940675, + "grad_norm": 1.3232477903366089, + "learning_rate": 8.26010311260865e-06, + "loss": 0.3316, + "step": 9278 + }, + { + "epoch": 1.1291755400060846, + "grad_norm": 1.5495525598526, + "learning_rate": 8.258182204754896e-06, + "loss": 0.3627, + "step": 9279 + }, + { + "epoch": 1.1292972315181016, + "grad_norm": 3.8505191802978516, + "learning_rate": 8.256261363180917e-06, + "loss": 0.4347, + "step": 9280 + }, + { + "epoch": 1.1294189230301186, + "grad_norm": 1.376433253288269, + "learning_rate": 8.2543405879598e-06, + "loss": 0.3677, + "step": 9281 + }, + { + "epoch": 1.1295406145421356, + "grad_norm": 2.3608484268188477, + "learning_rate": 8.252419879164637e-06, + "loss": 0.3811, + "step": 9282 + }, + { + "epoch": 1.1296623060541526, + "grad_norm": 1.8226901292800903, + "learning_rate": 8.250499236868517e-06, + "loss": 0.3393, + "step": 9283 + }, + { + "epoch": 1.1297839975661699, + "grad_norm": 2.5892343521118164, + "learning_rate": 8.248578661144519e-06, + "loss": 0.3924, + "step": 9284 + }, + { + "epoch": 1.129905689078187, + "grad_norm": 1.4924262762069702, + "learning_rate": 8.24665815206573e-06, + "loss": 0.3799, + "step": 9285 + }, + { + "epoch": 1.130027380590204, + "grad_norm": 1.3813202381134033, + "learning_rate": 8.24473770970523e-06, + "loss": 0.3664, + "step": 9286 + }, + { + "epoch": 1.130149072102221, + "grad_norm": 1.8140778541564941, + "learning_rate": 8.24281733413609e-06, + "loss": 0.3809, + "step": 9287 + }, + { + "epoch": 1.130270763614238, + "grad_norm": 1.605759620666504, + "learning_rate": 8.24089702543139e-06, + "loss": 0.3871, + "step": 9288 + }, + { + "epoch": 1.130392455126255, + "grad_norm": 1.528796911239624, + "learning_rate": 8.238976783664196e-06, + "loss": 0.3757, + "step": 9289 + }, + { + "epoch": 1.130514146638272, + "grad_norm": 2.570866584777832, + "learning_rate": 8.237056608907582e-06, + "loss": 0.3257, + "step": 9290 + }, + { + "epoch": 1.130635838150289, + "grad_norm": 1.2629880905151367, + "learning_rate": 8.235136501234615e-06, + "loss": 0.3844, + "step": 9291 + }, + { + "epoch": 1.130757529662306, + "grad_norm": 1.410779356956482, + "learning_rate": 8.233216460718354e-06, + "loss": 0.4059, + "step": 9292 + }, + { + "epoch": 1.130879221174323, + "grad_norm": 1.5605778694152832, + "learning_rate": 8.231296487431865e-06, + "loss": 0.3685, + "step": 9293 + }, + { + "epoch": 1.13100091268634, + "grad_norm": 2.2018704414367676, + "learning_rate": 8.229376581448203e-06, + "loss": 0.342, + "step": 9294 + }, + { + "epoch": 1.131122604198357, + "grad_norm": 2.390566825866699, + "learning_rate": 8.227456742840432e-06, + "loss": 0.4373, + "step": 9295 + }, + { + "epoch": 1.131244295710374, + "grad_norm": 1.6698867082595825, + "learning_rate": 8.225536971681599e-06, + "loss": 0.3872, + "step": 9296 + }, + { + "epoch": 1.1313659872223911, + "grad_norm": 1.9178253412246704, + "learning_rate": 8.223617268044755e-06, + "loss": 0.3579, + "step": 9297 + }, + { + "epoch": 1.1314876787344084, + "grad_norm": 2.1450469493865967, + "learning_rate": 8.221697632002957e-06, + "loss": 0.3704, + "step": 9298 + }, + { + "epoch": 1.1316093702464254, + "grad_norm": 1.6373666524887085, + "learning_rate": 8.219778063629242e-06, + "loss": 0.4104, + "step": 9299 + }, + { + "epoch": 1.1317310617584424, + "grad_norm": 2.797213554382324, + "learning_rate": 8.217858562996655e-06, + "loss": 0.3473, + "step": 9300 + }, + { + "epoch": 1.1318527532704594, + "grad_norm": 1.3088014125823975, + "learning_rate": 8.215939130178238e-06, + "loss": 0.3543, + "step": 9301 + }, + { + "epoch": 1.1319744447824764, + "grad_norm": 2.1467719078063965, + "learning_rate": 8.21401976524703e-06, + "loss": 0.4568, + "step": 9302 + }, + { + "epoch": 1.1320961362944935, + "grad_norm": 1.691737413406372, + "learning_rate": 8.212100468276068e-06, + "loss": 0.3544, + "step": 9303 + }, + { + "epoch": 1.1322178278065105, + "grad_norm": 1.674431324005127, + "learning_rate": 8.210181239338385e-06, + "loss": 0.3666, + "step": 9304 + }, + { + "epoch": 1.1323395193185275, + "grad_norm": 1.9545689821243286, + "learning_rate": 8.208262078507007e-06, + "loss": 0.3792, + "step": 9305 + }, + { + "epoch": 1.1324612108305445, + "grad_norm": 1.3984578847885132, + "learning_rate": 8.206342985854969e-06, + "loss": 0.3438, + "step": 9306 + }, + { + "epoch": 1.1325829023425615, + "grad_norm": 2.0488922595977783, + "learning_rate": 8.20442396145529e-06, + "loss": 0.4096, + "step": 9307 + }, + { + "epoch": 1.1327045938545786, + "grad_norm": 2.844698905944824, + "learning_rate": 8.202505005380997e-06, + "loss": 0.3938, + "step": 9308 + }, + { + "epoch": 1.1328262853665958, + "grad_norm": 2.3427329063415527, + "learning_rate": 8.200586117705114e-06, + "loss": 0.3957, + "step": 9309 + }, + { + "epoch": 1.1329479768786128, + "grad_norm": 1.4236059188842773, + "learning_rate": 8.198667298500648e-06, + "loss": 0.3811, + "step": 9310 + }, + { + "epoch": 1.1330696683906298, + "grad_norm": 1.2733254432678223, + "learning_rate": 8.196748547840622e-06, + "loss": 0.337, + "step": 9311 + }, + { + "epoch": 1.1331913599026469, + "grad_norm": 1.3997050523757935, + "learning_rate": 8.194829865798045e-06, + "loss": 0.3535, + "step": 9312 + }, + { + "epoch": 1.1333130514146639, + "grad_norm": 1.8210163116455078, + "learning_rate": 8.19291125244593e-06, + "loss": 0.3624, + "step": 9313 + }, + { + "epoch": 1.133434742926681, + "grad_norm": 3.7033047676086426, + "learning_rate": 8.190992707857282e-06, + "loss": 0.4599, + "step": 9314 + }, + { + "epoch": 1.133556434438698, + "grad_norm": 2.7419254779815674, + "learning_rate": 8.189074232105103e-06, + "loss": 0.3757, + "step": 9315 + }, + { + "epoch": 1.133678125950715, + "grad_norm": 1.3748414516448975, + "learning_rate": 8.187155825262398e-06, + "loss": 0.3675, + "step": 9316 + }, + { + "epoch": 1.133799817462732, + "grad_norm": 1.866584062576294, + "learning_rate": 8.18523748740217e-06, + "loss": 0.4044, + "step": 9317 + }, + { + "epoch": 1.133921508974749, + "grad_norm": 1.968496322631836, + "learning_rate": 8.18331921859741e-06, + "loss": 0.3317, + "step": 9318 + }, + { + "epoch": 1.134043200486766, + "grad_norm": 1.6435009241104126, + "learning_rate": 8.181401018921114e-06, + "loss": 0.4012, + "step": 9319 + }, + { + "epoch": 1.134164891998783, + "grad_norm": 2.9385650157928467, + "learning_rate": 8.179482888446271e-06, + "loss": 0.3563, + "step": 9320 + }, + { + "epoch": 1.1342865835108, + "grad_norm": 3.4491231441497803, + "learning_rate": 8.177564827245879e-06, + "loss": 0.3935, + "step": 9321 + }, + { + "epoch": 1.134408275022817, + "grad_norm": 2.5235519409179688, + "learning_rate": 8.175646835392915e-06, + "loss": 0.4009, + "step": 9322 + }, + { + "epoch": 1.1345299665348343, + "grad_norm": 1.8858593702316284, + "learning_rate": 8.173728912960362e-06, + "loss": 0.3625, + "step": 9323 + }, + { + "epoch": 1.1346516580468513, + "grad_norm": 1.8929585218429565, + "learning_rate": 8.171811060021206e-06, + "loss": 0.3495, + "step": 9324 + }, + { + "epoch": 1.1347733495588683, + "grad_norm": 2.0479977130889893, + "learning_rate": 8.169893276648423e-06, + "loss": 0.3461, + "step": 9325 + }, + { + "epoch": 1.1348950410708853, + "grad_norm": 2.209444761276245, + "learning_rate": 8.16797556291499e-06, + "loss": 0.3812, + "step": 9326 + }, + { + "epoch": 1.1350167325829024, + "grad_norm": 1.4777228832244873, + "learning_rate": 8.166057918893876e-06, + "loss": 0.3582, + "step": 9327 + }, + { + "epoch": 1.1351384240949194, + "grad_norm": 1.1509705781936646, + "learning_rate": 8.164140344658057e-06, + "loss": 0.3489, + "step": 9328 + }, + { + "epoch": 1.1352601156069364, + "grad_norm": 1.1321022510528564, + "learning_rate": 8.162222840280497e-06, + "loss": 0.3523, + "step": 9329 + }, + { + "epoch": 1.1353818071189534, + "grad_norm": 1.3123786449432373, + "learning_rate": 8.160305405834162e-06, + "loss": 0.3658, + "step": 9330 + }, + { + "epoch": 1.1355034986309704, + "grad_norm": 2.0186431407928467, + "learning_rate": 8.158388041392012e-06, + "loss": 0.4565, + "step": 9331 + }, + { + "epoch": 1.1356251901429875, + "grad_norm": 3.378861904144287, + "learning_rate": 8.156470747027015e-06, + "loss": 0.4604, + "step": 9332 + }, + { + "epoch": 1.1357468816550045, + "grad_norm": 3.521886110305786, + "learning_rate": 8.154553522812118e-06, + "loss": 0.2668, + "step": 9333 + }, + { + "epoch": 1.1358685731670217, + "grad_norm": 1.5217844247817993, + "learning_rate": 8.152636368820279e-06, + "loss": 0.3835, + "step": 9334 + }, + { + "epoch": 1.1359902646790387, + "grad_norm": 1.7410460710525513, + "learning_rate": 8.150719285124451e-06, + "loss": 0.3861, + "step": 9335 + }, + { + "epoch": 1.1361119561910558, + "grad_norm": 1.5696029663085938, + "learning_rate": 8.14880227179758e-06, + "loss": 0.3902, + "step": 9336 + }, + { + "epoch": 1.1362336477030728, + "grad_norm": 1.8855774402618408, + "learning_rate": 8.146885328912618e-06, + "loss": 0.3451, + "step": 9337 + }, + { + "epoch": 1.1363553392150898, + "grad_norm": 1.956186056137085, + "learning_rate": 8.144968456542502e-06, + "loss": 0.4759, + "step": 9338 + }, + { + "epoch": 1.1364770307271068, + "grad_norm": 1.9672017097473145, + "learning_rate": 8.143051654760174e-06, + "loss": 0.3978, + "step": 9339 + }, + { + "epoch": 1.1365987222391238, + "grad_norm": 1.4946990013122559, + "learning_rate": 8.141134923638579e-06, + "loss": 0.3736, + "step": 9340 + }, + { + "epoch": 1.1367204137511409, + "grad_norm": 1.4063612222671509, + "learning_rate": 8.139218263250644e-06, + "loss": 0.4137, + "step": 9341 + }, + { + "epoch": 1.1368421052631579, + "grad_norm": 2.0300369262695312, + "learning_rate": 8.137301673669306e-06, + "loss": 0.3085, + "step": 9342 + }, + { + "epoch": 1.136963796775175, + "grad_norm": 2.0914924144744873, + "learning_rate": 8.135385154967499e-06, + "loss": 0.4254, + "step": 9343 + }, + { + "epoch": 1.137085488287192, + "grad_norm": 1.692579984664917, + "learning_rate": 8.133468707218145e-06, + "loss": 0.3931, + "step": 9344 + }, + { + "epoch": 1.137207179799209, + "grad_norm": 1.779434084892273, + "learning_rate": 8.13155233049417e-06, + "loss": 0.3496, + "step": 9345 + }, + { + "epoch": 1.137328871311226, + "grad_norm": 1.9572780132293701, + "learning_rate": 8.129636024868495e-06, + "loss": 0.3248, + "step": 9346 + }, + { + "epoch": 1.137450562823243, + "grad_norm": 1.767472743988037, + "learning_rate": 8.12771979041404e-06, + "loss": 0.4445, + "step": 9347 + }, + { + "epoch": 1.13757225433526, + "grad_norm": 1.6177034378051758, + "learning_rate": 8.125803627203724e-06, + "loss": 0.4019, + "step": 9348 + }, + { + "epoch": 1.1376939458472772, + "grad_norm": 1.58851957321167, + "learning_rate": 8.123887535310457e-06, + "loss": 0.3887, + "step": 9349 + }, + { + "epoch": 1.1378156373592943, + "grad_norm": 2.0800893306732178, + "learning_rate": 8.121971514807154e-06, + "loss": 0.4332, + "step": 9350 + }, + { + "epoch": 1.1379373288713113, + "grad_norm": 1.859734296798706, + "learning_rate": 8.120055565766723e-06, + "loss": 0.3338, + "step": 9351 + }, + { + "epoch": 1.1380590203833283, + "grad_norm": 1.957097053527832, + "learning_rate": 8.118139688262067e-06, + "loss": 0.4234, + "step": 9352 + }, + { + "epoch": 1.1381807118953453, + "grad_norm": 1.6906638145446777, + "learning_rate": 8.116223882366091e-06, + "loss": 0.3642, + "step": 9353 + }, + { + "epoch": 1.1383024034073623, + "grad_norm": 2.7752556800842285, + "learning_rate": 8.114308148151697e-06, + "loss": 0.3455, + "step": 9354 + }, + { + "epoch": 1.1384240949193793, + "grad_norm": 1.729985237121582, + "learning_rate": 8.112392485691784e-06, + "loss": 0.4193, + "step": 9355 + }, + { + "epoch": 1.1385457864313964, + "grad_norm": 2.2575976848602295, + "learning_rate": 8.110476895059243e-06, + "loss": 0.3086, + "step": 9356 + }, + { + "epoch": 1.1386674779434134, + "grad_norm": 1.2856334447860718, + "learning_rate": 8.108561376326966e-06, + "loss": 0.3668, + "step": 9357 + }, + { + "epoch": 1.1387891694554304, + "grad_norm": 2.5189595222473145, + "learning_rate": 8.106645929567842e-06, + "loss": 0.3139, + "step": 9358 + }, + { + "epoch": 1.1389108609674474, + "grad_norm": 2.3296563625335693, + "learning_rate": 8.10473055485476e-06, + "loss": 0.3866, + "step": 9359 + }, + { + "epoch": 1.1390325524794647, + "grad_norm": 2.5810060501098633, + "learning_rate": 8.102815252260604e-06, + "loss": 0.3913, + "step": 9360 + }, + { + "epoch": 1.1391542439914817, + "grad_norm": 2.634850025177002, + "learning_rate": 8.100900021858253e-06, + "loss": 0.4001, + "step": 9361 + }, + { + "epoch": 1.1392759355034987, + "grad_norm": 2.4573683738708496, + "learning_rate": 8.098984863720588e-06, + "loss": 0.4085, + "step": 9362 + }, + { + "epoch": 1.1393976270155157, + "grad_norm": 1.4731765985488892, + "learning_rate": 8.097069777920486e-06, + "loss": 0.3693, + "step": 9363 + }, + { + "epoch": 1.1395193185275327, + "grad_norm": 1.8492422103881836, + "learning_rate": 8.095154764530814e-06, + "loss": 0.3682, + "step": 9364 + }, + { + "epoch": 1.1396410100395498, + "grad_norm": 1.3343756198883057, + "learning_rate": 8.093239823624449e-06, + "loss": 0.3421, + "step": 9365 + }, + { + "epoch": 1.1397627015515668, + "grad_norm": 2.886051893234253, + "learning_rate": 8.091324955274256e-06, + "loss": 0.4413, + "step": 9366 + }, + { + "epoch": 1.1398843930635838, + "grad_norm": 3.353578567504883, + "learning_rate": 8.0894101595531e-06, + "loss": 0.4363, + "step": 9367 + }, + { + "epoch": 1.1400060845756008, + "grad_norm": 1.4267032146453857, + "learning_rate": 8.08749543653384e-06, + "loss": 0.4186, + "step": 9368 + }, + { + "epoch": 1.1401277760876178, + "grad_norm": 2.3749492168426514, + "learning_rate": 8.085580786289336e-06, + "loss": 0.3865, + "step": 9369 + }, + { + "epoch": 1.1402494675996349, + "grad_norm": 2.1741597652435303, + "learning_rate": 8.083666208892447e-06, + "loss": 0.4266, + "step": 9370 + }, + { + "epoch": 1.1403711591116519, + "grad_norm": 1.214575171470642, + "learning_rate": 8.081751704416027e-06, + "loss": 0.3935, + "step": 9371 + }, + { + "epoch": 1.140492850623669, + "grad_norm": 2.4452576637268066, + "learning_rate": 8.079837272932923e-06, + "loss": 0.3847, + "step": 9372 + }, + { + "epoch": 1.140614542135686, + "grad_norm": 1.7560369968414307, + "learning_rate": 8.077922914515984e-06, + "loss": 0.4192, + "step": 9373 + }, + { + "epoch": 1.1407362336477032, + "grad_norm": 1.2648472785949707, + "learning_rate": 8.07600862923806e-06, + "loss": 0.391, + "step": 9374 + }, + { + "epoch": 1.1408579251597202, + "grad_norm": 1.4950051307678223, + "learning_rate": 8.074094417171987e-06, + "loss": 0.4275, + "step": 9375 + }, + { + "epoch": 1.1409796166717372, + "grad_norm": 2.6746535301208496, + "learning_rate": 8.07218027839061e-06, + "loss": 0.3993, + "step": 9376 + }, + { + "epoch": 1.1411013081837542, + "grad_norm": 1.4658801555633545, + "learning_rate": 8.070266212966765e-06, + "loss": 0.3736, + "step": 9377 + }, + { + "epoch": 1.1412229996957712, + "grad_norm": 2.162775993347168, + "learning_rate": 8.068352220973284e-06, + "loss": 0.3678, + "step": 9378 + }, + { + "epoch": 1.1413446912077883, + "grad_norm": 3.0236072540283203, + "learning_rate": 8.066438302482999e-06, + "loss": 0.4055, + "step": 9379 + }, + { + "epoch": 1.1414663827198053, + "grad_norm": 3.2523727416992188, + "learning_rate": 8.064524457568736e-06, + "loss": 0.3519, + "step": 9380 + }, + { + "epoch": 1.1415880742318223, + "grad_norm": 1.3547011613845825, + "learning_rate": 8.062610686303326e-06, + "loss": 0.3739, + "step": 9381 + }, + { + "epoch": 1.1417097657438393, + "grad_norm": 1.3020365238189697, + "learning_rate": 8.06069698875959e-06, + "loss": 0.3894, + "step": 9382 + }, + { + "epoch": 1.1418314572558563, + "grad_norm": 2.1218912601470947, + "learning_rate": 8.058783365010348e-06, + "loss": 0.4066, + "step": 9383 + }, + { + "epoch": 1.1419531487678733, + "grad_norm": 2.3884449005126953, + "learning_rate": 8.056869815128414e-06, + "loss": 0.4146, + "step": 9384 + }, + { + "epoch": 1.1420748402798906, + "grad_norm": 4.383427143096924, + "learning_rate": 8.05495633918661e-06, + "loss": 0.4325, + "step": 9385 + }, + { + "epoch": 1.1421965317919076, + "grad_norm": 1.297877311706543, + "learning_rate": 8.053042937257739e-06, + "loss": 0.3786, + "step": 9386 + }, + { + "epoch": 1.1423182233039246, + "grad_norm": 2.1148841381073, + "learning_rate": 8.051129609414616e-06, + "loss": 0.3984, + "step": 9387 + }, + { + "epoch": 1.1424399148159416, + "grad_norm": 1.0986117124557495, + "learning_rate": 8.049216355730046e-06, + "loss": 0.3043, + "step": 9388 + }, + { + "epoch": 1.1425616063279587, + "grad_norm": 2.2859504222869873, + "learning_rate": 8.047303176276831e-06, + "loss": 0.3026, + "step": 9389 + }, + { + "epoch": 1.1426832978399757, + "grad_norm": 3.8585379123687744, + "learning_rate": 8.045390071127776e-06, + "loss": 0.4313, + "step": 9390 + }, + { + "epoch": 1.1428049893519927, + "grad_norm": 1.3305115699768066, + "learning_rate": 8.043477040355669e-06, + "loss": 0.3836, + "step": 9391 + }, + { + "epoch": 1.1429266808640097, + "grad_norm": 2.696345567703247, + "learning_rate": 8.041564084033311e-06, + "loss": 0.4444, + "step": 9392 + }, + { + "epoch": 1.1430483723760267, + "grad_norm": 1.7355663776397705, + "learning_rate": 8.039651202233497e-06, + "loss": 0.4127, + "step": 9393 + }, + { + "epoch": 1.1431700638880438, + "grad_norm": 2.0472328662872314, + "learning_rate": 8.037738395029009e-06, + "loss": 0.3923, + "step": 9394 + }, + { + "epoch": 1.1432917554000608, + "grad_norm": 1.3737040758132935, + "learning_rate": 8.035825662492637e-06, + "loss": 0.3666, + "step": 9395 + }, + { + "epoch": 1.1434134469120778, + "grad_norm": 2.1715660095214844, + "learning_rate": 8.033913004697164e-06, + "loss": 0.4093, + "step": 9396 + }, + { + "epoch": 1.1435351384240948, + "grad_norm": 1.5705746412277222, + "learning_rate": 8.032000421715372e-06, + "loss": 0.4284, + "step": 9397 + }, + { + "epoch": 1.1436568299361118, + "grad_norm": 1.605338454246521, + "learning_rate": 8.030087913620036e-06, + "loss": 0.4397, + "step": 9398 + }, + { + "epoch": 1.143778521448129, + "grad_norm": 1.901050090789795, + "learning_rate": 8.028175480483932e-06, + "loss": 0.37, + "step": 9399 + }, + { + "epoch": 1.143900212960146, + "grad_norm": 1.8695341348648071, + "learning_rate": 8.026263122379835e-06, + "loss": 0.3656, + "step": 9400 + }, + { + "epoch": 1.1440219044721631, + "grad_norm": 1.4707010984420776, + "learning_rate": 8.02435083938051e-06, + "loss": 0.3743, + "step": 9401 + }, + { + "epoch": 1.1441435959841801, + "grad_norm": 1.474139928817749, + "learning_rate": 8.02243863155873e-06, + "loss": 0.3596, + "step": 9402 + }, + { + "epoch": 1.1442652874961972, + "grad_norm": 1.4010915756225586, + "learning_rate": 8.020526498987248e-06, + "loss": 0.3748, + "step": 9403 + }, + { + "epoch": 1.1443869790082142, + "grad_norm": 1.3375674486160278, + "learning_rate": 8.01861444173883e-06, + "loss": 0.3559, + "step": 9404 + }, + { + "epoch": 1.1445086705202312, + "grad_norm": 1.069501280784607, + "learning_rate": 8.016702459886235e-06, + "loss": 0.3508, + "step": 9405 + }, + { + "epoch": 1.1446303620322482, + "grad_norm": 2.4618308544158936, + "learning_rate": 8.014790553502215e-06, + "loss": 0.3773, + "step": 9406 + }, + { + "epoch": 1.1447520535442652, + "grad_norm": 2.4717705249786377, + "learning_rate": 8.012878722659526e-06, + "loss": 0.4141, + "step": 9407 + }, + { + "epoch": 1.1448737450562823, + "grad_norm": 1.9223419427871704, + "learning_rate": 8.010966967430914e-06, + "loss": 0.3935, + "step": 9408 + }, + { + "epoch": 1.1449954365682993, + "grad_norm": 1.3690690994262695, + "learning_rate": 8.009055287889124e-06, + "loss": 0.3107, + "step": 9409 + }, + { + "epoch": 1.1451171280803165, + "grad_norm": 2.378538131713867, + "learning_rate": 8.007143684106901e-06, + "loss": 0.3318, + "step": 9410 + }, + { + "epoch": 1.1452388195923335, + "grad_norm": 1.3570115566253662, + "learning_rate": 8.00523215615699e-06, + "loss": 0.3661, + "step": 9411 + }, + { + "epoch": 1.1453605111043506, + "grad_norm": 1.7419265508651733, + "learning_rate": 8.00332070411212e-06, + "loss": 0.3471, + "step": 9412 + }, + { + "epoch": 1.1454822026163676, + "grad_norm": 3.308589458465576, + "learning_rate": 8.001409328045036e-06, + "loss": 0.4324, + "step": 9413 + }, + { + "epoch": 1.1456038941283846, + "grad_norm": 3.726086378097534, + "learning_rate": 7.999498028028459e-06, + "loss": 0.461, + "step": 9414 + }, + { + "epoch": 1.1457255856404016, + "grad_norm": 1.404338002204895, + "learning_rate": 7.997586804135124e-06, + "loss": 0.3102, + "step": 9415 + }, + { + "epoch": 1.1458472771524186, + "grad_norm": 1.4919569492340088, + "learning_rate": 7.995675656437756e-06, + "loss": 0.3622, + "step": 9416 + }, + { + "epoch": 1.1459689686644356, + "grad_norm": 1.9902676343917847, + "learning_rate": 7.993764585009078e-06, + "loss": 0.3578, + "step": 9417 + }, + { + "epoch": 1.1460906601764527, + "grad_norm": 1.3841243982315063, + "learning_rate": 7.991853589921807e-06, + "loss": 0.3436, + "step": 9418 + }, + { + "epoch": 1.1462123516884697, + "grad_norm": 1.3774094581604004, + "learning_rate": 7.98994267124867e-06, + "loss": 0.3874, + "step": 9419 + }, + { + "epoch": 1.1463340432004867, + "grad_norm": 3.0227530002593994, + "learning_rate": 7.98803182906237e-06, + "loss": 0.4731, + "step": 9420 + }, + { + "epoch": 1.1464557347125037, + "grad_norm": 1.482176423072815, + "learning_rate": 7.986121063435623e-06, + "loss": 0.3768, + "step": 9421 + }, + { + "epoch": 1.1465774262245207, + "grad_norm": 1.861307978630066, + "learning_rate": 7.98421037444114e-06, + "loss": 0.34, + "step": 9422 + }, + { + "epoch": 1.1466991177365378, + "grad_norm": 1.1559419631958008, + "learning_rate": 7.982299762151625e-06, + "loss": 0.3539, + "step": 9423 + }, + { + "epoch": 1.146820809248555, + "grad_norm": 1.223334550857544, + "learning_rate": 7.98038922663978e-06, + "loss": 0.3921, + "step": 9424 + }, + { + "epoch": 1.146942500760572, + "grad_norm": 2.4468960762023926, + "learning_rate": 7.978478767978308e-06, + "loss": 0.3745, + "step": 9425 + }, + { + "epoch": 1.147064192272589, + "grad_norm": 1.5288887023925781, + "learning_rate": 7.9765683862399e-06, + "loss": 0.3728, + "step": 9426 + }, + { + "epoch": 1.147185883784606, + "grad_norm": 2.264828681945801, + "learning_rate": 7.974658081497255e-06, + "loss": 0.4029, + "step": 9427 + }, + { + "epoch": 1.147307575296623, + "grad_norm": 1.2708088159561157, + "learning_rate": 7.972747853823059e-06, + "loss": 0.3864, + "step": 9428 + }, + { + "epoch": 1.14742926680864, + "grad_norm": 2.664573907852173, + "learning_rate": 7.970837703290006e-06, + "loss": 0.4648, + "step": 9429 + }, + { + "epoch": 1.1475509583206571, + "grad_norm": 1.6222294569015503, + "learning_rate": 7.968927629970776e-06, + "loss": 0.3818, + "step": 9430 + }, + { + "epoch": 1.1476726498326741, + "grad_norm": 1.9193700551986694, + "learning_rate": 7.967017633938057e-06, + "loss": 0.3729, + "step": 9431 + }, + { + "epoch": 1.1477943413446912, + "grad_norm": 2.1691250801086426, + "learning_rate": 7.965107715264523e-06, + "loss": 0.3274, + "step": 9432 + }, + { + "epoch": 1.1479160328567082, + "grad_norm": 1.5296939611434937, + "learning_rate": 7.963197874022853e-06, + "loss": 0.4097, + "step": 9433 + }, + { + "epoch": 1.1480377243687252, + "grad_norm": 1.1184592247009277, + "learning_rate": 7.961288110285721e-06, + "loss": 0.3531, + "step": 9434 + }, + { + "epoch": 1.1481594158807424, + "grad_norm": 1.580876111984253, + "learning_rate": 7.959378424125794e-06, + "loss": 0.3329, + "step": 9435 + }, + { + "epoch": 1.1482811073927595, + "grad_norm": 1.337518572807312, + "learning_rate": 7.957468815615747e-06, + "loss": 0.3967, + "step": 9436 + }, + { + "epoch": 1.1484027989047765, + "grad_norm": 4.1903767585754395, + "learning_rate": 7.955559284828234e-06, + "loss": 0.4417, + "step": 9437 + }, + { + "epoch": 1.1485244904167935, + "grad_norm": 1.5820752382278442, + "learning_rate": 7.953649831835923e-06, + "loss": 0.4218, + "step": 9438 + }, + { + "epoch": 1.1486461819288105, + "grad_norm": 2.5573534965515137, + "learning_rate": 7.951740456711473e-06, + "loss": 0.3377, + "step": 9439 + }, + { + "epoch": 1.1487678734408275, + "grad_norm": 1.11497962474823, + "learning_rate": 7.949831159527537e-06, + "loss": 0.3298, + "step": 9440 + }, + { + "epoch": 1.1488895649528446, + "grad_norm": 1.5248496532440186, + "learning_rate": 7.947921940356767e-06, + "loss": 0.3813, + "step": 9441 + }, + { + "epoch": 1.1490112564648616, + "grad_norm": 3.7922863960266113, + "learning_rate": 7.946012799271818e-06, + "loss": 0.4952, + "step": 9442 + }, + { + "epoch": 1.1491329479768786, + "grad_norm": 2.071898937225342, + "learning_rate": 7.944103736345332e-06, + "loss": 0.3852, + "step": 9443 + }, + { + "epoch": 1.1492546394888956, + "grad_norm": 1.9802888631820679, + "learning_rate": 7.942194751649955e-06, + "loss": 0.4081, + "step": 9444 + }, + { + "epoch": 1.1493763310009126, + "grad_norm": 3.9047274589538574, + "learning_rate": 7.940285845258328e-06, + "loss": 0.4395, + "step": 9445 + }, + { + "epoch": 1.1494980225129297, + "grad_norm": 1.3604471683502197, + "learning_rate": 7.938377017243086e-06, + "loss": 0.3886, + "step": 9446 + }, + { + "epoch": 1.1496197140249467, + "grad_norm": 3.498446464538574, + "learning_rate": 7.936468267676865e-06, + "loss": 0.3412, + "step": 9447 + }, + { + "epoch": 1.1497414055369637, + "grad_norm": 3.1781203746795654, + "learning_rate": 7.934559596632303e-06, + "loss": 0.3846, + "step": 9448 + }, + { + "epoch": 1.1498630970489807, + "grad_norm": 1.4405567646026611, + "learning_rate": 7.932651004182019e-06, + "loss": 0.3895, + "step": 9449 + }, + { + "epoch": 1.149984788560998, + "grad_norm": 1.8151578903198242, + "learning_rate": 7.930742490398646e-06, + "loss": 0.368, + "step": 9450 + }, + { + "epoch": 1.150106480073015, + "grad_norm": 1.7208164930343628, + "learning_rate": 7.928834055354803e-06, + "loss": 0.431, + "step": 9451 + }, + { + "epoch": 1.150228171585032, + "grad_norm": 2.3178372383117676, + "learning_rate": 7.926925699123109e-06, + "loss": 0.3344, + "step": 9452 + }, + { + "epoch": 1.150349863097049, + "grad_norm": 1.9946759939193726, + "learning_rate": 7.925017421776188e-06, + "loss": 0.4022, + "step": 9453 + }, + { + "epoch": 1.150471554609066, + "grad_norm": 2.473324775695801, + "learning_rate": 7.923109223386644e-06, + "loss": 0.3413, + "step": 9454 + }, + { + "epoch": 1.150593246121083, + "grad_norm": 1.7858223915100098, + "learning_rate": 7.921201104027095e-06, + "loss": 0.3399, + "step": 9455 + }, + { + "epoch": 1.1507149376331, + "grad_norm": 2.6613070964813232, + "learning_rate": 7.919293063770147e-06, + "loss": 0.422, + "step": 9456 + }, + { + "epoch": 1.150836629145117, + "grad_norm": 1.9846540689468384, + "learning_rate": 7.917385102688407e-06, + "loss": 0.3686, + "step": 9457 + }, + { + "epoch": 1.150958320657134, + "grad_norm": 1.9592974185943604, + "learning_rate": 7.91547722085447e-06, + "loss": 0.3084, + "step": 9458 + }, + { + "epoch": 1.1510800121691511, + "grad_norm": 2.1782310009002686, + "learning_rate": 7.913569418340947e-06, + "loss": 0.3826, + "step": 9459 + }, + { + "epoch": 1.1512017036811681, + "grad_norm": 1.5790261030197144, + "learning_rate": 7.911661695220419e-06, + "loss": 0.3544, + "step": 9460 + }, + { + "epoch": 1.1513233951931854, + "grad_norm": 4.169583320617676, + "learning_rate": 7.90975405156549e-06, + "loss": 0.4341, + "step": 9461 + }, + { + "epoch": 1.1514450867052024, + "grad_norm": 2.2710633277893066, + "learning_rate": 7.907846487448743e-06, + "loss": 0.3897, + "step": 9462 + }, + { + "epoch": 1.1515667782172194, + "grad_norm": 1.6063332557678223, + "learning_rate": 7.905939002942769e-06, + "loss": 0.3778, + "step": 9463 + }, + { + "epoch": 1.1516884697292364, + "grad_norm": 2.9622507095336914, + "learning_rate": 7.90403159812015e-06, + "loss": 0.424, + "step": 9464 + }, + { + "epoch": 1.1518101612412535, + "grad_norm": 1.6363670825958252, + "learning_rate": 7.902124273053469e-06, + "loss": 0.3507, + "step": 9465 + }, + { + "epoch": 1.1519318527532705, + "grad_norm": 1.6198620796203613, + "learning_rate": 7.900217027815299e-06, + "loss": 0.3514, + "step": 9466 + }, + { + "epoch": 1.1520535442652875, + "grad_norm": 2.052090644836426, + "learning_rate": 7.898309862478219e-06, + "loss": 0.3794, + "step": 9467 + }, + { + "epoch": 1.1521752357773045, + "grad_norm": 1.5545611381530762, + "learning_rate": 7.896402777114799e-06, + "loss": 0.388, + "step": 9468 + }, + { + "epoch": 1.1522969272893215, + "grad_norm": 1.7451837062835693, + "learning_rate": 7.894495771797607e-06, + "loss": 0.3818, + "step": 9469 + }, + { + "epoch": 1.1524186188013386, + "grad_norm": 4.016990661621094, + "learning_rate": 7.892588846599207e-06, + "loss": 0.367, + "step": 9470 + }, + { + "epoch": 1.1525403103133556, + "grad_norm": 1.678607702255249, + "learning_rate": 7.89068200159217e-06, + "loss": 0.3721, + "step": 9471 + }, + { + "epoch": 1.1526620018253726, + "grad_norm": 1.4158412218093872, + "learning_rate": 7.888775236849045e-06, + "loss": 0.3955, + "step": 9472 + }, + { + "epoch": 1.1527836933373896, + "grad_norm": 1.546684980392456, + "learning_rate": 7.886868552442393e-06, + "loss": 0.3704, + "step": 9473 + }, + { + "epoch": 1.1529053848494066, + "grad_norm": 1.6489508152008057, + "learning_rate": 7.884961948444763e-06, + "loss": 0.4321, + "step": 9474 + }, + { + "epoch": 1.1530270763614239, + "grad_norm": 2.5284345149993896, + "learning_rate": 7.883055424928712e-06, + "loss": 0.4659, + "step": 9475 + }, + { + "epoch": 1.153148767873441, + "grad_norm": 1.6549315452575684, + "learning_rate": 7.881148981966784e-06, + "loss": 0.379, + "step": 9476 + }, + { + "epoch": 1.153270459385458, + "grad_norm": 2.288231611251831, + "learning_rate": 7.87924261963152e-06, + "loss": 0.3886, + "step": 9477 + }, + { + "epoch": 1.153392150897475, + "grad_norm": 2.0967490673065186, + "learning_rate": 7.877336337995465e-06, + "loss": 0.3451, + "step": 9478 + }, + { + "epoch": 1.153513842409492, + "grad_norm": 1.1757196187973022, + "learning_rate": 7.87543013713116e-06, + "loss": 0.3828, + "step": 9479 + }, + { + "epoch": 1.153635533921509, + "grad_norm": 1.3049899339675903, + "learning_rate": 7.873524017111132e-06, + "loss": 0.4098, + "step": 9480 + }, + { + "epoch": 1.153757225433526, + "grad_norm": 2.831350803375244, + "learning_rate": 7.871617978007918e-06, + "loss": 0.4293, + "step": 9481 + }, + { + "epoch": 1.153878916945543, + "grad_norm": 2.961888551712036, + "learning_rate": 7.869712019894047e-06, + "loss": 0.4551, + "step": 9482 + }, + { + "epoch": 1.15400060845756, + "grad_norm": 1.5543574094772339, + "learning_rate": 7.867806142842041e-06, + "loss": 0.4219, + "step": 9483 + }, + { + "epoch": 1.154122299969577, + "grad_norm": 1.2422986030578613, + "learning_rate": 7.865900346924426e-06, + "loss": 0.3535, + "step": 9484 + }, + { + "epoch": 1.154243991481594, + "grad_norm": 1.8995840549468994, + "learning_rate": 7.863994632213718e-06, + "loss": 0.415, + "step": 9485 + }, + { + "epoch": 1.1543656829936113, + "grad_norm": 1.3236974477767944, + "learning_rate": 7.862088998782436e-06, + "loss": 0.3878, + "step": 9486 + }, + { + "epoch": 1.1544873745056283, + "grad_norm": 1.3614071607589722, + "learning_rate": 7.860183446703096e-06, + "loss": 0.3808, + "step": 9487 + }, + { + "epoch": 1.1546090660176453, + "grad_norm": 1.6428714990615845, + "learning_rate": 7.8582779760482e-06, + "loss": 0.2749, + "step": 9488 + }, + { + "epoch": 1.1547307575296624, + "grad_norm": 1.4920694828033447, + "learning_rate": 7.856372586890262e-06, + "loss": 0.3991, + "step": 9489 + }, + { + "epoch": 1.1548524490416794, + "grad_norm": 1.3494758605957031, + "learning_rate": 7.854467279301785e-06, + "loss": 0.3659, + "step": 9490 + }, + { + "epoch": 1.1549741405536964, + "grad_norm": 2.7925567626953125, + "learning_rate": 7.85256205335527e-06, + "loss": 0.3223, + "step": 9491 + }, + { + "epoch": 1.1550958320657134, + "grad_norm": 1.8954964876174927, + "learning_rate": 7.850656909123212e-06, + "loss": 0.4144, + "step": 9492 + }, + { + "epoch": 1.1552175235777304, + "grad_norm": 2.248262643814087, + "learning_rate": 7.848751846678106e-06, + "loss": 0.3885, + "step": 9493 + }, + { + "epoch": 1.1553392150897475, + "grad_norm": 1.478324055671692, + "learning_rate": 7.846846866092452e-06, + "loss": 0.3959, + "step": 9494 + }, + { + "epoch": 1.1554609066017645, + "grad_norm": 1.411107063293457, + "learning_rate": 7.844941967438729e-06, + "loss": 0.3684, + "step": 9495 + }, + { + "epoch": 1.1555825981137815, + "grad_norm": 4.012429237365723, + "learning_rate": 7.84303715078942e-06, + "loss": 0.4168, + "step": 9496 + }, + { + "epoch": 1.1557042896257985, + "grad_norm": 1.3292492628097534, + "learning_rate": 7.841132416217014e-06, + "loss": 0.3785, + "step": 9497 + }, + { + "epoch": 1.1558259811378155, + "grad_norm": 1.4239649772644043, + "learning_rate": 7.839227763793988e-06, + "loss": 0.3594, + "step": 9498 + }, + { + "epoch": 1.1559476726498326, + "grad_norm": 2.2892231941223145, + "learning_rate": 7.83732319359282e-06, + "loss": 0.3809, + "step": 9499 + }, + { + "epoch": 1.1560693641618498, + "grad_norm": 1.491167664527893, + "learning_rate": 7.83541870568598e-06, + "loss": 0.3873, + "step": 9500 + }, + { + "epoch": 1.1561910556738668, + "grad_norm": 1.4967931509017944, + "learning_rate": 7.833514300145937e-06, + "loss": 0.3836, + "step": 9501 + }, + { + "epoch": 1.1563127471858838, + "grad_norm": 1.2635350227355957, + "learning_rate": 7.831609977045164e-06, + "loss": 0.3119, + "step": 9502 + }, + { + "epoch": 1.1564344386979009, + "grad_norm": 2.0637807846069336, + "learning_rate": 7.829705736456114e-06, + "loss": 0.3688, + "step": 9503 + }, + { + "epoch": 1.1565561302099179, + "grad_norm": 1.5053224563598633, + "learning_rate": 7.827801578451255e-06, + "loss": 0.4093, + "step": 9504 + }, + { + "epoch": 1.156677821721935, + "grad_norm": 1.4580307006835938, + "learning_rate": 7.825897503103046e-06, + "loss": 0.4102, + "step": 9505 + }, + { + "epoch": 1.156799513233952, + "grad_norm": 1.7580782175064087, + "learning_rate": 7.823993510483934e-06, + "loss": 0.3734, + "step": 9506 + }, + { + "epoch": 1.156921204745969, + "grad_norm": 2.484034299850464, + "learning_rate": 7.822089600666373e-06, + "loss": 0.4252, + "step": 9507 + }, + { + "epoch": 1.157042896257986, + "grad_norm": 1.912593126296997, + "learning_rate": 7.820185773722812e-06, + "loss": 0.3722, + "step": 9508 + }, + { + "epoch": 1.157164587770003, + "grad_norm": 1.8472379446029663, + "learning_rate": 7.818282029725691e-06, + "loss": 0.411, + "step": 9509 + }, + { + "epoch": 1.15728627928202, + "grad_norm": 1.7662009000778198, + "learning_rate": 7.816378368747459e-06, + "loss": 0.356, + "step": 9510 + }, + { + "epoch": 1.1574079707940372, + "grad_norm": 2.1545989513397217, + "learning_rate": 7.814474790860546e-06, + "loss": 0.3269, + "step": 9511 + }, + { + "epoch": 1.1575296623060543, + "grad_norm": 1.389992117881775, + "learning_rate": 7.812571296137392e-06, + "loss": 0.3876, + "step": 9512 + }, + { + "epoch": 1.1576513538180713, + "grad_norm": 1.614871859550476, + "learning_rate": 7.810667884650429e-06, + "loss": 0.3792, + "step": 9513 + }, + { + "epoch": 1.1577730453300883, + "grad_norm": 1.670812964439392, + "learning_rate": 7.808764556472083e-06, + "loss": 0.3475, + "step": 9514 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 1.552437663078308, + "learning_rate": 7.806861311674783e-06, + "loss": 0.3635, + "step": 9515 + }, + { + "epoch": 1.1580164283541223, + "grad_norm": 1.466321587562561, + "learning_rate": 7.804958150330947e-06, + "loss": 0.3797, + "step": 9516 + }, + { + "epoch": 1.1581381198661393, + "grad_norm": 1.5108704566955566, + "learning_rate": 7.803055072513003e-06, + "loss": 0.3225, + "step": 9517 + }, + { + "epoch": 1.1582598113781564, + "grad_norm": 2.024376392364502, + "learning_rate": 7.801152078293357e-06, + "loss": 0.3914, + "step": 9518 + }, + { + "epoch": 1.1583815028901734, + "grad_norm": 2.414236307144165, + "learning_rate": 7.799249167744425e-06, + "loss": 0.3444, + "step": 9519 + }, + { + "epoch": 1.1585031944021904, + "grad_norm": 4.399750232696533, + "learning_rate": 7.797346340938618e-06, + "loss": 0.4668, + "step": 9520 + }, + { + "epoch": 1.1586248859142074, + "grad_norm": 1.9857077598571777, + "learning_rate": 7.795443597948343e-06, + "loss": 0.3944, + "step": 9521 + }, + { + "epoch": 1.1587465774262244, + "grad_norm": 1.38261079788208, + "learning_rate": 7.793540938846e-06, + "loss": 0.3501, + "step": 9522 + }, + { + "epoch": 1.1588682689382415, + "grad_norm": 1.349539875984192, + "learning_rate": 7.791638363703992e-06, + "loss": 0.3397, + "step": 9523 + }, + { + "epoch": 1.1589899604502585, + "grad_norm": 2.1774744987487793, + "learning_rate": 7.789735872594714e-06, + "loss": 0.3167, + "step": 9524 + }, + { + "epoch": 1.1591116519622757, + "grad_norm": 1.8662327527999878, + "learning_rate": 7.787833465590566e-06, + "loss": 0.3149, + "step": 9525 + }, + { + "epoch": 1.1592333434742927, + "grad_norm": 1.6271507740020752, + "learning_rate": 7.78593114276393e-06, + "loss": 0.4027, + "step": 9526 + }, + { + "epoch": 1.1593550349863098, + "grad_norm": 2.30841064453125, + "learning_rate": 7.784028904187195e-06, + "loss": 0.4197, + "step": 9527 + }, + { + "epoch": 1.1594767264983268, + "grad_norm": 1.694427728652954, + "learning_rate": 7.782126749932753e-06, + "loss": 0.3307, + "step": 9528 + }, + { + "epoch": 1.1595984180103438, + "grad_norm": 2.769146203994751, + "learning_rate": 7.780224680072978e-06, + "loss": 0.4299, + "step": 9529 + }, + { + "epoch": 1.1597201095223608, + "grad_norm": 1.845624327659607, + "learning_rate": 7.778322694680245e-06, + "loss": 0.4046, + "step": 9530 + }, + { + "epoch": 1.1598418010343778, + "grad_norm": 2.73650860786438, + "learning_rate": 7.776420793826932e-06, + "loss": 0.4572, + "step": 9531 + }, + { + "epoch": 1.1599634925463949, + "grad_norm": 1.5545141696929932, + "learning_rate": 7.77451897758541e-06, + "loss": 0.3782, + "step": 9532 + }, + { + "epoch": 1.1600851840584119, + "grad_norm": 1.8445744514465332, + "learning_rate": 7.772617246028052e-06, + "loss": 0.3725, + "step": 9533 + }, + { + "epoch": 1.160206875570429, + "grad_norm": 3.447798013687134, + "learning_rate": 7.770715599227214e-06, + "loss": 0.3537, + "step": 9534 + }, + { + "epoch": 1.160328567082446, + "grad_norm": 1.9832454919815063, + "learning_rate": 7.768814037255261e-06, + "loss": 0.4088, + "step": 9535 + }, + { + "epoch": 1.1604502585944632, + "grad_norm": 1.8081860542297363, + "learning_rate": 7.766912560184555e-06, + "loss": 0.3834, + "step": 9536 + }, + { + "epoch": 1.1605719501064802, + "grad_norm": 2.8829517364501953, + "learning_rate": 7.765011168087447e-06, + "loss": 0.3133, + "step": 9537 + }, + { + "epoch": 1.1606936416184972, + "grad_norm": 2.1916608810424805, + "learning_rate": 7.76310986103629e-06, + "loss": 0.3603, + "step": 9538 + }, + { + "epoch": 1.1608153331305142, + "grad_norm": 1.6348036527633667, + "learning_rate": 7.761208639103435e-06, + "loss": 0.3911, + "step": 9539 + }, + { + "epoch": 1.1609370246425312, + "grad_norm": 1.549636721611023, + "learning_rate": 7.759307502361224e-06, + "loss": 0.3496, + "step": 9540 + }, + { + "epoch": 1.1610587161545483, + "grad_norm": 2.0951642990112305, + "learning_rate": 7.757406450882002e-06, + "loss": 0.3438, + "step": 9541 + }, + { + "epoch": 1.1611804076665653, + "grad_norm": 2.578937530517578, + "learning_rate": 7.755505484738103e-06, + "loss": 0.3842, + "step": 9542 + }, + { + "epoch": 1.1613020991785823, + "grad_norm": 1.7002606391906738, + "learning_rate": 7.753604604001867e-06, + "loss": 0.3123, + "step": 9543 + }, + { + "epoch": 1.1614237906905993, + "grad_norm": 2.162013292312622, + "learning_rate": 7.751703808745629e-06, + "loss": 0.4071, + "step": 9544 + }, + { + "epoch": 1.1615454822026163, + "grad_norm": 1.66376531124115, + "learning_rate": 7.749803099041712e-06, + "loss": 0.4031, + "step": 9545 + }, + { + "epoch": 1.1616671737146333, + "grad_norm": 1.3813987970352173, + "learning_rate": 7.747902474962444e-06, + "loss": 0.3427, + "step": 9546 + }, + { + "epoch": 1.1617888652266504, + "grad_norm": 2.2929234504699707, + "learning_rate": 7.74600193658015e-06, + "loss": 0.3727, + "step": 9547 + }, + { + "epoch": 1.1619105567386674, + "grad_norm": 4.208395004272461, + "learning_rate": 7.744101483967147e-06, + "loss": 0.4426, + "step": 9548 + }, + { + "epoch": 1.1620322482506844, + "grad_norm": 1.5069340467453003, + "learning_rate": 7.742201117195751e-06, + "loss": 0.3932, + "step": 9549 + }, + { + "epoch": 1.1621539397627014, + "grad_norm": 1.8345972299575806, + "learning_rate": 7.740300836338276e-06, + "loss": 0.3631, + "step": 9550 + }, + { + "epoch": 1.1622756312747187, + "grad_norm": 1.3778551816940308, + "learning_rate": 7.738400641467037e-06, + "loss": 0.3853, + "step": 9551 + }, + { + "epoch": 1.1623973227867357, + "grad_norm": 2.1200578212738037, + "learning_rate": 7.736500532654332e-06, + "loss": 0.4271, + "step": 9552 + }, + { + "epoch": 1.1625190142987527, + "grad_norm": 1.5334471464157104, + "learning_rate": 7.734600509972465e-06, + "loss": 0.4049, + "step": 9553 + }, + { + "epoch": 1.1626407058107697, + "grad_norm": 3.031325340270996, + "learning_rate": 7.732700573493736e-06, + "loss": 0.2943, + "step": 9554 + }, + { + "epoch": 1.1627623973227867, + "grad_norm": 1.5473443269729614, + "learning_rate": 7.730800723290448e-06, + "loss": 0.3508, + "step": 9555 + }, + { + "epoch": 1.1628840888348038, + "grad_norm": 1.4994291067123413, + "learning_rate": 7.728900959434885e-06, + "loss": 0.3533, + "step": 9556 + }, + { + "epoch": 1.1630057803468208, + "grad_norm": 2.1711630821228027, + "learning_rate": 7.727001281999342e-06, + "loss": 0.4169, + "step": 9557 + }, + { + "epoch": 1.1631274718588378, + "grad_norm": 1.7278785705566406, + "learning_rate": 7.725101691056107e-06, + "loss": 0.4028, + "step": 9558 + }, + { + "epoch": 1.1632491633708548, + "grad_norm": 1.880561113357544, + "learning_rate": 7.72320218667746e-06, + "loss": 0.3925, + "step": 9559 + }, + { + "epoch": 1.1633708548828718, + "grad_norm": 1.623520851135254, + "learning_rate": 7.721302768935683e-06, + "loss": 0.3819, + "step": 9560 + }, + { + "epoch": 1.1634925463948889, + "grad_norm": 1.408267617225647, + "learning_rate": 7.719403437903053e-06, + "loss": 0.4024, + "step": 9561 + }, + { + "epoch": 1.163614237906906, + "grad_norm": 1.4975931644439697, + "learning_rate": 7.717504193651843e-06, + "loss": 0.3827, + "step": 9562 + }, + { + "epoch": 1.1637359294189231, + "grad_norm": 1.5320889949798584, + "learning_rate": 7.715605036254323e-06, + "loss": 0.3858, + "step": 9563 + }, + { + "epoch": 1.1638576209309401, + "grad_norm": 1.6493922472000122, + "learning_rate": 7.71370596578276e-06, + "loss": 0.3898, + "step": 9564 + }, + { + "epoch": 1.1639793124429572, + "grad_norm": 2.1500160694122314, + "learning_rate": 7.711806982309416e-06, + "loss": 0.3732, + "step": 9565 + }, + { + "epoch": 1.1641010039549742, + "grad_norm": 2.043473720550537, + "learning_rate": 7.709908085906553e-06, + "loss": 0.4098, + "step": 9566 + }, + { + "epoch": 1.1642226954669912, + "grad_norm": 2.018477439880371, + "learning_rate": 7.708009276646428e-06, + "loss": 0.3647, + "step": 9567 + }, + { + "epoch": 1.1643443869790082, + "grad_norm": 1.6066014766693115, + "learning_rate": 7.706110554601293e-06, + "loss": 0.3598, + "step": 9568 + }, + { + "epoch": 1.1644660784910252, + "grad_norm": 1.5660622119903564, + "learning_rate": 7.704211919843401e-06, + "loss": 0.3656, + "step": 9569 + }, + { + "epoch": 1.1645877700030423, + "grad_norm": 2.9562876224517822, + "learning_rate": 7.702313372444998e-06, + "loss": 0.4249, + "step": 9570 + }, + { + "epoch": 1.1647094615150593, + "grad_norm": 1.541069507598877, + "learning_rate": 7.700414912478324e-06, + "loss": 0.3297, + "step": 9571 + }, + { + "epoch": 1.1648311530270763, + "grad_norm": 3.7196342945098877, + "learning_rate": 7.698516540015623e-06, + "loss": 0.4506, + "step": 9572 + }, + { + "epoch": 1.1649528445390933, + "grad_norm": 1.3341870307922363, + "learning_rate": 7.696618255129135e-06, + "loss": 0.3751, + "step": 9573 + }, + { + "epoch": 1.1650745360511103, + "grad_norm": 1.5453259944915771, + "learning_rate": 7.69472005789109e-06, + "loss": 0.4101, + "step": 9574 + }, + { + "epoch": 1.1651962275631274, + "grad_norm": 1.5480189323425293, + "learning_rate": 7.692821948373718e-06, + "loss": 0.3633, + "step": 9575 + }, + { + "epoch": 1.1653179190751446, + "grad_norm": 2.057725191116333, + "learning_rate": 7.690923926649242e-06, + "loss": 0.3647, + "step": 9576 + }, + { + "epoch": 1.1654396105871616, + "grad_norm": 1.6835273504257202, + "learning_rate": 7.689025992789893e-06, + "loss": 0.4442, + "step": 9577 + }, + { + "epoch": 1.1655613020991786, + "grad_norm": 2.4741570949554443, + "learning_rate": 7.687128146867891e-06, + "loss": 0.3646, + "step": 9578 + }, + { + "epoch": 1.1656829936111957, + "grad_norm": 2.169314384460449, + "learning_rate": 7.685230388955447e-06, + "loss": 0.4438, + "step": 9579 + }, + { + "epoch": 1.1658046851232127, + "grad_norm": 2.268157958984375, + "learning_rate": 7.683332719124778e-06, + "loss": 0.313, + "step": 9580 + }, + { + "epoch": 1.1659263766352297, + "grad_norm": 1.5645852088928223, + "learning_rate": 7.681435137448095e-06, + "loss": 0.3983, + "step": 9581 + }, + { + "epoch": 1.1660480681472467, + "grad_norm": 1.926119089126587, + "learning_rate": 7.679537643997604e-06, + "loss": 0.4253, + "step": 9582 + }, + { + "epoch": 1.1661697596592637, + "grad_norm": 2.4873692989349365, + "learning_rate": 7.677640238845509e-06, + "loss": 0.3784, + "step": 9583 + }, + { + "epoch": 1.1662914511712807, + "grad_norm": 2.1079134941101074, + "learning_rate": 7.675742922064008e-06, + "loss": 0.3709, + "step": 9584 + }, + { + "epoch": 1.1664131426832978, + "grad_norm": 1.2665895223617554, + "learning_rate": 7.673845693725304e-06, + "loss": 0.363, + "step": 9585 + }, + { + "epoch": 1.1665348341953148, + "grad_norm": 2.4232044219970703, + "learning_rate": 7.671948553901587e-06, + "loss": 0.326, + "step": 9586 + }, + { + "epoch": 1.166656525707332, + "grad_norm": 5.492870807647705, + "learning_rate": 7.670051502665042e-06, + "loss": 0.5059, + "step": 9587 + }, + { + "epoch": 1.166778217219349, + "grad_norm": 1.7048954963684082, + "learning_rate": 7.668154540087861e-06, + "loss": 0.4105, + "step": 9588 + }, + { + "epoch": 1.166899908731366, + "grad_norm": 1.9728702306747437, + "learning_rate": 7.666257666242225e-06, + "loss": 0.2991, + "step": 9589 + }, + { + "epoch": 1.167021600243383, + "grad_norm": 1.2459090948104858, + "learning_rate": 7.66436088120032e-06, + "loss": 0.3501, + "step": 9590 + }, + { + "epoch": 1.1671432917554, + "grad_norm": 1.744399905204773, + "learning_rate": 7.662464185034313e-06, + "loss": 0.3504, + "step": 9591 + }, + { + "epoch": 1.1672649832674171, + "grad_norm": 1.4024866819381714, + "learning_rate": 7.660567577816383e-06, + "loss": 0.3744, + "step": 9592 + }, + { + "epoch": 1.1673866747794341, + "grad_norm": 2.816185474395752, + "learning_rate": 7.658671059618703e-06, + "loss": 0.4085, + "step": 9593 + }, + { + "epoch": 1.1675083662914512, + "grad_norm": 2.363020896911621, + "learning_rate": 7.656774630513431e-06, + "loss": 0.389, + "step": 9594 + }, + { + "epoch": 1.1676300578034682, + "grad_norm": 2.158487558364868, + "learning_rate": 7.654878290572737e-06, + "loss": 0.3419, + "step": 9595 + }, + { + "epoch": 1.1677517493154852, + "grad_norm": 1.329377293586731, + "learning_rate": 7.652982039868777e-06, + "loss": 0.3486, + "step": 9596 + }, + { + "epoch": 1.1678734408275022, + "grad_norm": 1.7175960540771484, + "learning_rate": 7.65108587847371e-06, + "loss": 0.4068, + "step": 9597 + }, + { + "epoch": 1.1679951323395192, + "grad_norm": 2.4583442211151123, + "learning_rate": 7.649189806459687e-06, + "loss": 0.4199, + "step": 9598 + }, + { + "epoch": 1.1681168238515363, + "grad_norm": 2.073394775390625, + "learning_rate": 7.647293823898858e-06, + "loss": 0.411, + "step": 9599 + }, + { + "epoch": 1.1682385153635533, + "grad_norm": 2.0896148681640625, + "learning_rate": 7.645397930863366e-06, + "loss": 0.4237, + "step": 9600 + }, + { + "epoch": 1.1683602068755705, + "grad_norm": 1.3931843042373657, + "learning_rate": 7.643502127425359e-06, + "loss": 0.3768, + "step": 9601 + }, + { + "epoch": 1.1684818983875875, + "grad_norm": 1.3291289806365967, + "learning_rate": 7.641606413656974e-06, + "loss": 0.3568, + "step": 9602 + }, + { + "epoch": 1.1686035898996046, + "grad_norm": 2.253200054168701, + "learning_rate": 7.639710789630344e-06, + "loss": 0.2967, + "step": 9603 + }, + { + "epoch": 1.1687252814116216, + "grad_norm": 2.661163091659546, + "learning_rate": 7.637815255417606e-06, + "loss": 0.4568, + "step": 9604 + }, + { + "epoch": 1.1688469729236386, + "grad_norm": 1.7723489999771118, + "learning_rate": 7.635919811090887e-06, + "loss": 0.4396, + "step": 9605 + }, + { + "epoch": 1.1689686644356556, + "grad_norm": 1.6135865449905396, + "learning_rate": 7.63402445672231e-06, + "loss": 0.3781, + "step": 9606 + }, + { + "epoch": 1.1690903559476726, + "grad_norm": 1.3138494491577148, + "learning_rate": 7.632129192384005e-06, + "loss": 0.4024, + "step": 9607 + }, + { + "epoch": 1.1692120474596897, + "grad_norm": 1.2561352252960205, + "learning_rate": 7.63023401814808e-06, + "loss": 0.3739, + "step": 9608 + }, + { + "epoch": 1.1693337389717067, + "grad_norm": 2.543386459350586, + "learning_rate": 7.628338934086662e-06, + "loss": 0.3233, + "step": 9609 + }, + { + "epoch": 1.1694554304837237, + "grad_norm": 4.182769775390625, + "learning_rate": 7.626443940271853e-06, + "loss": 0.3412, + "step": 9610 + }, + { + "epoch": 1.1695771219957407, + "grad_norm": 3.469388008117676, + "learning_rate": 7.624549036775764e-06, + "loss": 0.3312, + "step": 9611 + }, + { + "epoch": 1.169698813507758, + "grad_norm": 1.8135895729064941, + "learning_rate": 7.622654223670502e-06, + "loss": 0.3475, + "step": 9612 + }, + { + "epoch": 1.169820505019775, + "grad_norm": 2.1234781742095947, + "learning_rate": 7.6207595010281675e-06, + "loss": 0.3414, + "step": 9613 + }, + { + "epoch": 1.169942196531792, + "grad_norm": 1.6187366247177124, + "learning_rate": 7.618864868920858e-06, + "loss": 0.3791, + "step": 9614 + }, + { + "epoch": 1.170063888043809, + "grad_norm": 2.2679715156555176, + "learning_rate": 7.6169703274206685e-06, + "loss": 0.3374, + "step": 9615 + }, + { + "epoch": 1.170185579555826, + "grad_norm": 2.1995928287506104, + "learning_rate": 7.615075876599692e-06, + "loss": 0.3545, + "step": 9616 + }, + { + "epoch": 1.170307271067843, + "grad_norm": 2.2592668533325195, + "learning_rate": 7.613181516530015e-06, + "loss": 0.4188, + "step": 9617 + }, + { + "epoch": 1.17042896257986, + "grad_norm": 2.811978816986084, + "learning_rate": 7.611287247283721e-06, + "loss": 0.4238, + "step": 9618 + }, + { + "epoch": 1.170550654091877, + "grad_norm": 3.0794565677642822, + "learning_rate": 7.6093930689328935e-06, + "loss": 0.3908, + "step": 9619 + }, + { + "epoch": 1.170672345603894, + "grad_norm": 1.6768244504928589, + "learning_rate": 7.607498981549609e-06, + "loss": 0.2969, + "step": 9620 + }, + { + "epoch": 1.1707940371159111, + "grad_norm": 2.241546869277954, + "learning_rate": 7.605604985205937e-06, + "loss": 0.3922, + "step": 9621 + }, + { + "epoch": 1.1709157286279281, + "grad_norm": 3.904524326324463, + "learning_rate": 7.603711079973952e-06, + "loss": 0.4114, + "step": 9622 + }, + { + "epoch": 1.1710374201399452, + "grad_norm": 3.861419200897217, + "learning_rate": 7.60181726592572e-06, + "loss": 0.4676, + "step": 9623 + }, + { + "epoch": 1.1711591116519622, + "grad_norm": 2.285588502883911, + "learning_rate": 7.599923543133307e-06, + "loss": 0.4361, + "step": 9624 + }, + { + "epoch": 1.1712808031639792, + "grad_norm": 1.8813375234603882, + "learning_rate": 7.5980299116687695e-06, + "loss": 0.4012, + "step": 9625 + }, + { + "epoch": 1.1714024946759964, + "grad_norm": 1.4907963275909424, + "learning_rate": 7.596136371604165e-06, + "loss": 0.3759, + "step": 9626 + }, + { + "epoch": 1.1715241861880135, + "grad_norm": 1.702423334121704, + "learning_rate": 7.59424292301155e-06, + "loss": 0.3906, + "step": 9627 + }, + { + "epoch": 1.1716458777000305, + "grad_norm": 2.7096595764160156, + "learning_rate": 7.592349565962968e-06, + "loss": 0.3805, + "step": 9628 + }, + { + "epoch": 1.1717675692120475, + "grad_norm": 1.9202237129211426, + "learning_rate": 7.590456300530471e-06, + "loss": 0.3681, + "step": 9629 + }, + { + "epoch": 1.1718892607240645, + "grad_norm": 3.4236960411071777, + "learning_rate": 7.588563126786099e-06, + "loss": 0.3606, + "step": 9630 + }, + { + "epoch": 1.1720109522360815, + "grad_norm": 2.2568109035491943, + "learning_rate": 7.58667004480189e-06, + "loss": 0.4256, + "step": 9631 + }, + { + "epoch": 1.1721326437480986, + "grad_norm": 2.0422170162200928, + "learning_rate": 7.584777054649886e-06, + "loss": 0.3705, + "step": 9632 + }, + { + "epoch": 1.1722543352601156, + "grad_norm": 1.7017064094543457, + "learning_rate": 7.582884156402111e-06, + "loss": 0.4099, + "step": 9633 + }, + { + "epoch": 1.1723760267721326, + "grad_norm": 3.462963104248047, + "learning_rate": 7.580991350130594e-06, + "loss": 0.4323, + "step": 9634 + }, + { + "epoch": 1.1724977182841496, + "grad_norm": 2.1411759853363037, + "learning_rate": 7.579098635907367e-06, + "loss": 0.4156, + "step": 9635 + }, + { + "epoch": 1.1726194097961666, + "grad_norm": 1.6380159854888916, + "learning_rate": 7.577206013804446e-06, + "loss": 0.4219, + "step": 9636 + }, + { + "epoch": 1.1727411013081839, + "grad_norm": 1.9403705596923828, + "learning_rate": 7.575313483893851e-06, + "loss": 0.4444, + "step": 9637 + }, + { + "epoch": 1.172862792820201, + "grad_norm": 1.975189208984375, + "learning_rate": 7.573421046247598e-06, + "loss": 0.3766, + "step": 9638 + }, + { + "epoch": 1.172984484332218, + "grad_norm": 1.2692707777023315, + "learning_rate": 7.571528700937696e-06, + "loss": 0.3673, + "step": 9639 + }, + { + "epoch": 1.173106175844235, + "grad_norm": 1.503158688545227, + "learning_rate": 7.569636448036154e-06, + "loss": 0.4124, + "step": 9640 + }, + { + "epoch": 1.173227867356252, + "grad_norm": 1.2525988817214966, + "learning_rate": 7.567744287614976e-06, + "loss": 0.3587, + "step": 9641 + }, + { + "epoch": 1.173349558868269, + "grad_norm": 1.385740041732788, + "learning_rate": 7.565852219746162e-06, + "loss": 0.3972, + "step": 9642 + }, + { + "epoch": 1.173471250380286, + "grad_norm": 1.5023621320724487, + "learning_rate": 7.563960244501714e-06, + "loss": 0.3456, + "step": 9643 + }, + { + "epoch": 1.173592941892303, + "grad_norm": 1.666089415550232, + "learning_rate": 7.562068361953614e-06, + "loss": 0.3258, + "step": 9644 + }, + { + "epoch": 1.17371463340432, + "grad_norm": 1.487396001815796, + "learning_rate": 7.5601765721738605e-06, + "loss": 0.3475, + "step": 9645 + }, + { + "epoch": 1.173836324916337, + "grad_norm": 1.9156768321990967, + "learning_rate": 7.558284875234441e-06, + "loss": 0.3504, + "step": 9646 + }, + { + "epoch": 1.173958016428354, + "grad_norm": 2.0139801502227783, + "learning_rate": 7.556393271207334e-06, + "loss": 0.3415, + "step": 9647 + }, + { + "epoch": 1.174079707940371, + "grad_norm": 2.9057841300964355, + "learning_rate": 7.554501760164521e-06, + "loss": 0.3603, + "step": 9648 + }, + { + "epoch": 1.174201399452388, + "grad_norm": 1.9443708658218384, + "learning_rate": 7.5526103421779785e-06, + "loss": 0.4088, + "step": 9649 + }, + { + "epoch": 1.1743230909644051, + "grad_norm": 2.8054327964782715, + "learning_rate": 7.55071901731968e-06, + "loss": 0.4205, + "step": 9650 + }, + { + "epoch": 1.1744447824764224, + "grad_norm": 1.5990451574325562, + "learning_rate": 7.54882778566159e-06, + "loss": 0.373, + "step": 9651 + }, + { + "epoch": 1.1745664739884394, + "grad_norm": 2.325359582901001, + "learning_rate": 7.546936647275676e-06, + "loss": 0.3566, + "step": 9652 + }, + { + "epoch": 1.1746881655004564, + "grad_norm": 1.5325226783752441, + "learning_rate": 7.545045602233904e-06, + "loss": 0.331, + "step": 9653 + }, + { + "epoch": 1.1748098570124734, + "grad_norm": 1.6761375665664673, + "learning_rate": 7.543154650608224e-06, + "loss": 0.409, + "step": 9654 + }, + { + "epoch": 1.1749315485244904, + "grad_norm": 3.7169172763824463, + "learning_rate": 7.541263792470601e-06, + "loss": 0.4699, + "step": 9655 + }, + { + "epoch": 1.1750532400365075, + "grad_norm": 1.3243436813354492, + "learning_rate": 7.539373027892976e-06, + "loss": 0.3336, + "step": 9656 + }, + { + "epoch": 1.1751749315485245, + "grad_norm": 1.3653485774993896, + "learning_rate": 7.537482356947299e-06, + "loss": 0.3249, + "step": 9657 + }, + { + "epoch": 1.1752966230605415, + "grad_norm": 2.285682439804077, + "learning_rate": 7.535591779705518e-06, + "loss": 0.4148, + "step": 9658 + }, + { + "epoch": 1.1754183145725585, + "grad_norm": 1.385016918182373, + "learning_rate": 7.533701296239568e-06, + "loss": 0.3052, + "step": 9659 + }, + { + "epoch": 1.1755400060845755, + "grad_norm": 1.9938673973083496, + "learning_rate": 7.531810906621391e-06, + "loss": 0.374, + "step": 9660 + }, + { + "epoch": 1.1756616975965926, + "grad_norm": 1.736778736114502, + "learning_rate": 7.529920610922918e-06, + "loss": 0.4145, + "step": 9661 + }, + { + "epoch": 1.1757833891086096, + "grad_norm": 2.017746686935425, + "learning_rate": 7.5280304092160775e-06, + "loss": 0.4215, + "step": 9662 + }, + { + "epoch": 1.1759050806206268, + "grad_norm": 1.3665560483932495, + "learning_rate": 7.526140301572795e-06, + "loss": 0.3773, + "step": 9663 + }, + { + "epoch": 1.1760267721326438, + "grad_norm": 1.4421672821044922, + "learning_rate": 7.524250288064998e-06, + "loss": 0.372, + "step": 9664 + }, + { + "epoch": 1.1761484636446609, + "grad_norm": 1.7199981212615967, + "learning_rate": 7.522360368764599e-06, + "loss": 0.3864, + "step": 9665 + }, + { + "epoch": 1.1762701551566779, + "grad_norm": 1.7247111797332764, + "learning_rate": 7.520470543743522e-06, + "loss": 0.3953, + "step": 9666 + }, + { + "epoch": 1.176391846668695, + "grad_norm": 1.8313848972320557, + "learning_rate": 7.518580813073668e-06, + "loss": 0.3981, + "step": 9667 + }, + { + "epoch": 1.176513538180712, + "grad_norm": 1.5674959421157837, + "learning_rate": 7.516691176826951e-06, + "loss": 0.4093, + "step": 9668 + }, + { + "epoch": 1.176635229692729, + "grad_norm": 2.8634324073791504, + "learning_rate": 7.514801635075277e-06, + "loss": 0.3387, + "step": 9669 + }, + { + "epoch": 1.176756921204746, + "grad_norm": 1.6176297664642334, + "learning_rate": 7.512912187890542e-06, + "loss": 0.3732, + "step": 9670 + }, + { + "epoch": 1.176878612716763, + "grad_norm": 1.6058251857757568, + "learning_rate": 7.511022835344647e-06, + "loss": 0.394, + "step": 9671 + }, + { + "epoch": 1.17700030422878, + "grad_norm": 1.987532138824463, + "learning_rate": 7.509133577509486e-06, + "loss": 0.3916, + "step": 9672 + }, + { + "epoch": 1.177121995740797, + "grad_norm": 2.52313494682312, + "learning_rate": 7.507244414456947e-06, + "loss": 0.301, + "step": 9673 + }, + { + "epoch": 1.177243687252814, + "grad_norm": 1.3120063543319702, + "learning_rate": 7.505355346258918e-06, + "loss": 0.3575, + "step": 9674 + }, + { + "epoch": 1.177365378764831, + "grad_norm": 2.4810423851013184, + "learning_rate": 7.50346637298728e-06, + "loss": 0.352, + "step": 9675 + }, + { + "epoch": 1.177487070276848, + "grad_norm": 2.557352066040039, + "learning_rate": 7.5015774947139185e-06, + "loss": 0.4296, + "step": 9676 + }, + { + "epoch": 1.1776087617888653, + "grad_norm": 1.4737499952316284, + "learning_rate": 7.499688711510702e-06, + "loss": 0.3795, + "step": 9677 + }, + { + "epoch": 1.1777304533008823, + "grad_norm": 1.374341607093811, + "learning_rate": 7.497800023449509e-06, + "loss": 0.3717, + "step": 9678 + }, + { + "epoch": 1.1778521448128993, + "grad_norm": 2.6653521060943604, + "learning_rate": 7.4959114306022005e-06, + "loss": 0.3967, + "step": 9679 + }, + { + "epoch": 1.1779738363249164, + "grad_norm": 1.7362717390060425, + "learning_rate": 7.494022933040646e-06, + "loss": 0.3598, + "step": 9680 + }, + { + "epoch": 1.1780955278369334, + "grad_norm": 3.3668129444122314, + "learning_rate": 7.492134530836705e-06, + "loss": 0.4189, + "step": 9681 + }, + { + "epoch": 1.1782172193489504, + "grad_norm": 3.8593332767486572, + "learning_rate": 7.4902462240622364e-06, + "loss": 0.4334, + "step": 9682 + }, + { + "epoch": 1.1783389108609674, + "grad_norm": 2.8784804344177246, + "learning_rate": 7.488358012789094e-06, + "loss": 0.4429, + "step": 9683 + }, + { + "epoch": 1.1784606023729844, + "grad_norm": 1.2115579843521118, + "learning_rate": 7.4864698970891305e-06, + "loss": 0.3521, + "step": 9684 + }, + { + "epoch": 1.1785822938850015, + "grad_norm": 2.027649402618408, + "learning_rate": 7.484581877034187e-06, + "loss": 0.3684, + "step": 9685 + }, + { + "epoch": 1.1787039853970185, + "grad_norm": 2.467210292816162, + "learning_rate": 7.48269395269611e-06, + "loss": 0.3297, + "step": 9686 + }, + { + "epoch": 1.1788256769090355, + "grad_norm": 1.1987730264663696, + "learning_rate": 7.48080612414674e-06, + "loss": 0.3585, + "step": 9687 + }, + { + "epoch": 1.1789473684210527, + "grad_norm": 1.5257595777511597, + "learning_rate": 7.47891839145791e-06, + "loss": 0.4022, + "step": 9688 + }, + { + "epoch": 1.1790690599330698, + "grad_norm": 2.9447364807128906, + "learning_rate": 7.477030754701454e-06, + "loss": 0.3705, + "step": 9689 + }, + { + "epoch": 1.1791907514450868, + "grad_norm": 2.2697343826293945, + "learning_rate": 7.475143213949204e-06, + "loss": 0.3544, + "step": 9690 + }, + { + "epoch": 1.1793124429571038, + "grad_norm": 2.7392823696136475, + "learning_rate": 7.473255769272977e-06, + "loss": 0.4308, + "step": 9691 + }, + { + "epoch": 1.1794341344691208, + "grad_norm": 2.2082958221435547, + "learning_rate": 7.4713684207446e-06, + "loss": 0.3614, + "step": 9692 + }, + { + "epoch": 1.1795558259811378, + "grad_norm": 1.5046038627624512, + "learning_rate": 7.469481168435886e-06, + "loss": 0.3863, + "step": 9693 + }, + { + "epoch": 1.1796775174931549, + "grad_norm": 2.409872055053711, + "learning_rate": 7.4675940124186504e-06, + "loss": 0.438, + "step": 9694 + }, + { + "epoch": 1.1797992090051719, + "grad_norm": 1.4781765937805176, + "learning_rate": 7.465706952764708e-06, + "loss": 0.3759, + "step": 9695 + }, + { + "epoch": 1.179920900517189, + "grad_norm": 1.8360564708709717, + "learning_rate": 7.463819989545859e-06, + "loss": 0.3712, + "step": 9696 + }, + { + "epoch": 1.180042592029206, + "grad_norm": 1.7445881366729736, + "learning_rate": 7.461933122833911e-06, + "loss": 0.4061, + "step": 9697 + }, + { + "epoch": 1.180164283541223, + "grad_norm": 1.9159667491912842, + "learning_rate": 7.460046352700661e-06, + "loss": 0.3885, + "step": 9698 + }, + { + "epoch": 1.18028597505324, + "grad_norm": 2.351688861846924, + "learning_rate": 7.4581596792179034e-06, + "loss": 0.4035, + "step": 9699 + }, + { + "epoch": 1.180407666565257, + "grad_norm": 3.483651638031006, + "learning_rate": 7.456273102457432e-06, + "loss": 0.4115, + "step": 9700 + }, + { + "epoch": 1.180529358077274, + "grad_norm": 1.9175450801849365, + "learning_rate": 7.454386622491037e-06, + "loss": 0.3779, + "step": 9701 + }, + { + "epoch": 1.1806510495892912, + "grad_norm": 2.3120839595794678, + "learning_rate": 7.4525002393904965e-06, + "loss": 0.356, + "step": 9702 + }, + { + "epoch": 1.1807727411013083, + "grad_norm": 2.721449851989746, + "learning_rate": 7.4506139532275965e-06, + "loss": 0.3882, + "step": 9703 + }, + { + "epoch": 1.1808944326133253, + "grad_norm": 2.6561784744262695, + "learning_rate": 7.448727764074112e-06, + "loss": 0.3323, + "step": 9704 + }, + { + "epoch": 1.1810161241253423, + "grad_norm": 1.5090900659561157, + "learning_rate": 7.446841672001815e-06, + "loss": 0.4024, + "step": 9705 + }, + { + "epoch": 1.1811378156373593, + "grad_norm": 1.976524829864502, + "learning_rate": 7.4449556770824814e-06, + "loss": 0.3893, + "step": 9706 + }, + { + "epoch": 1.1812595071493763, + "grad_norm": 1.6574209928512573, + "learning_rate": 7.44306977938787e-06, + "loss": 0.3967, + "step": 9707 + }, + { + "epoch": 1.1813811986613934, + "grad_norm": 1.6262218952178955, + "learning_rate": 7.441183978989745e-06, + "loss": 0.3693, + "step": 9708 + }, + { + "epoch": 1.1815028901734104, + "grad_norm": 2.3745126724243164, + "learning_rate": 7.439298275959866e-06, + "loss": 0.3243, + "step": 9709 + }, + { + "epoch": 1.1816245816854274, + "grad_norm": 1.668454647064209, + "learning_rate": 7.437412670369992e-06, + "loss": 0.3735, + "step": 9710 + }, + { + "epoch": 1.1817462731974444, + "grad_norm": 1.3087494373321533, + "learning_rate": 7.435527162291867e-06, + "loss": 0.3566, + "step": 9711 + }, + { + "epoch": 1.1818679647094614, + "grad_norm": 2.0906765460968018, + "learning_rate": 7.433641751797241e-06, + "loss": 0.3043, + "step": 9712 + }, + { + "epoch": 1.1819896562214787, + "grad_norm": 2.554093837738037, + "learning_rate": 7.4317564389578624e-06, + "loss": 0.4262, + "step": 9713 + }, + { + "epoch": 1.1821113477334957, + "grad_norm": 1.4899507761001587, + "learning_rate": 7.429871223845466e-06, + "loss": 0.3661, + "step": 9714 + }, + { + "epoch": 1.1822330392455127, + "grad_norm": 2.2557945251464844, + "learning_rate": 7.4279861065317885e-06, + "loss": 0.3676, + "step": 9715 + }, + { + "epoch": 1.1823547307575297, + "grad_norm": 1.3241007328033447, + "learning_rate": 7.426101087088562e-06, + "loss": 0.3787, + "step": 9716 + }, + { + "epoch": 1.1824764222695467, + "grad_norm": 2.7104976177215576, + "learning_rate": 7.4242161655875165e-06, + "loss": 0.4282, + "step": 9717 + }, + { + "epoch": 1.1825981137815638, + "grad_norm": 2.966212034225464, + "learning_rate": 7.42233134210038e-06, + "loss": 0.4097, + "step": 9718 + }, + { + "epoch": 1.1827198052935808, + "grad_norm": 1.4211074113845825, + "learning_rate": 7.420446616698869e-06, + "loss": 0.357, + "step": 9719 + }, + { + "epoch": 1.1828414968055978, + "grad_norm": 1.311387538909912, + "learning_rate": 7.418561989454705e-06, + "loss": 0.3527, + "step": 9720 + }, + { + "epoch": 1.1829631883176148, + "grad_norm": 1.228264570236206, + "learning_rate": 7.4166774604396e-06, + "loss": 0.3358, + "step": 9721 + }, + { + "epoch": 1.1830848798296318, + "grad_norm": 1.6796833276748657, + "learning_rate": 7.414793029725265e-06, + "loss": 0.369, + "step": 9722 + }, + { + "epoch": 1.1832065713416489, + "grad_norm": 1.2923469543457031, + "learning_rate": 7.4129086973834055e-06, + "loss": 0.3645, + "step": 9723 + }, + { + "epoch": 1.1833282628536659, + "grad_norm": 1.2359907627105713, + "learning_rate": 7.4110244634857295e-06, + "loss": 0.3565, + "step": 9724 + }, + { + "epoch": 1.183449954365683, + "grad_norm": 2.5839452743530273, + "learning_rate": 7.409140328103927e-06, + "loss": 0.396, + "step": 9725 + }, + { + "epoch": 1.1835716458777, + "grad_norm": 2.000016450881958, + "learning_rate": 7.407256291309702e-06, + "loss": 0.3948, + "step": 9726 + }, + { + "epoch": 1.1836933373897172, + "grad_norm": 1.8811098337173462, + "learning_rate": 7.4053723531747355e-06, + "loss": 0.4016, + "step": 9727 + }, + { + "epoch": 1.1838150289017342, + "grad_norm": 1.4274972677230835, + "learning_rate": 7.403488513770724e-06, + "loss": 0.4323, + "step": 9728 + }, + { + "epoch": 1.1839367204137512, + "grad_norm": 1.712700366973877, + "learning_rate": 7.401604773169352e-06, + "loss": 0.3595, + "step": 9729 + }, + { + "epoch": 1.1840584119257682, + "grad_norm": 1.2554993629455566, + "learning_rate": 7.399721131442294e-06, + "loss": 0.3462, + "step": 9730 + }, + { + "epoch": 1.1841801034377852, + "grad_norm": 2.3004250526428223, + "learning_rate": 7.397837588661229e-06, + "loss": 0.3885, + "step": 9731 + }, + { + "epoch": 1.1843017949498023, + "grad_norm": 2.6135618686676025, + "learning_rate": 7.395954144897831e-06, + "loss": 0.3391, + "step": 9732 + }, + { + "epoch": 1.1844234864618193, + "grad_norm": 1.9746272563934326, + "learning_rate": 7.3940708002237675e-06, + "loss": 0.4412, + "step": 9733 + }, + { + "epoch": 1.1845451779738363, + "grad_norm": 3.0108489990234375, + "learning_rate": 7.3921875547107035e-06, + "loss": 0.3767, + "step": 9734 + }, + { + "epoch": 1.1846668694858533, + "grad_norm": 1.8141714334487915, + "learning_rate": 7.3903044084303e-06, + "loss": 0.3114, + "step": 9735 + }, + { + "epoch": 1.1847885609978703, + "grad_norm": 1.6276839971542358, + "learning_rate": 7.388421361454221e-06, + "loss": 0.3967, + "step": 9736 + }, + { + "epoch": 1.1849102525098874, + "grad_norm": 1.4108457565307617, + "learning_rate": 7.386538413854112e-06, + "loss": 0.3832, + "step": 9737 + }, + { + "epoch": 1.1850319440219046, + "grad_norm": 1.443261742591858, + "learning_rate": 7.384655565701624e-06, + "loss": 0.3648, + "step": 9738 + }, + { + "epoch": 1.1851536355339216, + "grad_norm": 1.2201173305511475, + "learning_rate": 7.3827728170684045e-06, + "loss": 0.3359, + "step": 9739 + }, + { + "epoch": 1.1852753270459386, + "grad_norm": 3.517454147338867, + "learning_rate": 7.3808901680261e-06, + "loss": 0.4007, + "step": 9740 + }, + { + "epoch": 1.1853970185579557, + "grad_norm": 1.5070616006851196, + "learning_rate": 7.379007618646343e-06, + "loss": 0.3682, + "step": 9741 + }, + { + "epoch": 1.1855187100699727, + "grad_norm": 1.5357564687728882, + "learning_rate": 7.377125169000772e-06, + "loss": 0.3386, + "step": 9742 + }, + { + "epoch": 1.1856404015819897, + "grad_norm": 2.7693560123443604, + "learning_rate": 7.375242819161017e-06, + "loss": 0.4197, + "step": 9743 + }, + { + "epoch": 1.1857620930940067, + "grad_norm": 1.6653414964675903, + "learning_rate": 7.373360569198706e-06, + "loss": 0.3906, + "step": 9744 + }, + { + "epoch": 1.1858837846060237, + "grad_norm": 2.5514349937438965, + "learning_rate": 7.371478419185462e-06, + "loss": 0.3186, + "step": 9745 + }, + { + "epoch": 1.1860054761180407, + "grad_norm": 1.5120397806167603, + "learning_rate": 7.369596369192905e-06, + "loss": 0.3917, + "step": 9746 + }, + { + "epoch": 1.1861271676300578, + "grad_norm": 1.4133343696594238, + "learning_rate": 7.3677144192926555e-06, + "loss": 0.3929, + "step": 9747 + }, + { + "epoch": 1.1862488591420748, + "grad_norm": 1.445682168006897, + "learning_rate": 7.365832569556317e-06, + "loss": 0.3389, + "step": 9748 + }, + { + "epoch": 1.1863705506540918, + "grad_norm": 2.119556427001953, + "learning_rate": 7.363950820055501e-06, + "loss": 0.4039, + "step": 9749 + }, + { + "epoch": 1.1864922421661088, + "grad_norm": 1.8560068607330322, + "learning_rate": 7.362069170861812e-06, + "loss": 0.3172, + "step": 9750 + }, + { + "epoch": 1.1866139336781258, + "grad_norm": 1.8694087266921997, + "learning_rate": 7.360187622046851e-06, + "loss": 0.3566, + "step": 9751 + }, + { + "epoch": 1.186735625190143, + "grad_norm": 1.401943564414978, + "learning_rate": 7.358306173682217e-06, + "loss": 0.3761, + "step": 9752 + }, + { + "epoch": 1.18685731670216, + "grad_norm": 1.337256908416748, + "learning_rate": 7.356424825839501e-06, + "loss": 0.3357, + "step": 9753 + }, + { + "epoch": 1.1869790082141771, + "grad_norm": 1.3931041955947876, + "learning_rate": 7.35454357859029e-06, + "loss": 0.3937, + "step": 9754 + }, + { + "epoch": 1.1871006997261941, + "grad_norm": 4.364648818969727, + "learning_rate": 7.352662432006175e-06, + "loss": 0.4306, + "step": 9755 + }, + { + "epoch": 1.1872223912382112, + "grad_norm": 1.8168553113937378, + "learning_rate": 7.3507813861587315e-06, + "loss": 0.3763, + "step": 9756 + }, + { + "epoch": 1.1873440827502282, + "grad_norm": 1.891121745109558, + "learning_rate": 7.3489004411195395e-06, + "loss": 0.3256, + "step": 9757 + }, + { + "epoch": 1.1874657742622452, + "grad_norm": 1.7567882537841797, + "learning_rate": 7.347019596960177e-06, + "loss": 0.3887, + "step": 9758 + }, + { + "epoch": 1.1875874657742622, + "grad_norm": 1.993302822113037, + "learning_rate": 7.345138853752207e-06, + "loss": 0.3819, + "step": 9759 + }, + { + "epoch": 1.1877091572862792, + "grad_norm": 1.6605867147445679, + "learning_rate": 7.343258211567201e-06, + "loss": 0.4151, + "step": 9760 + }, + { + "epoch": 1.1878308487982963, + "grad_norm": 1.5491313934326172, + "learning_rate": 7.341377670476717e-06, + "loss": 0.4117, + "step": 9761 + }, + { + "epoch": 1.1879525403103133, + "grad_norm": 3.200650691986084, + "learning_rate": 7.339497230552316e-06, + "loss": 0.4397, + "step": 9762 + }, + { + "epoch": 1.1880742318223303, + "grad_norm": 2.271812915802002, + "learning_rate": 7.3376168918655536e-06, + "loss": 0.3352, + "step": 9763 + }, + { + "epoch": 1.1881959233343475, + "grad_norm": 2.0693397521972656, + "learning_rate": 7.335736654487978e-06, + "loss": 0.3671, + "step": 9764 + }, + { + "epoch": 1.1883176148463646, + "grad_norm": 2.6654064655303955, + "learning_rate": 7.333856518491137e-06, + "loss": 0.3299, + "step": 9765 + }, + { + "epoch": 1.1884393063583816, + "grad_norm": 1.5677542686462402, + "learning_rate": 7.331976483946577e-06, + "loss": 0.4042, + "step": 9766 + }, + { + "epoch": 1.1885609978703986, + "grad_norm": 1.3392091989517212, + "learning_rate": 7.330096550925831e-06, + "loss": 0.3886, + "step": 9767 + }, + { + "epoch": 1.1886826893824156, + "grad_norm": 1.5454658269882202, + "learning_rate": 7.328216719500437e-06, + "loss": 0.3749, + "step": 9768 + }, + { + "epoch": 1.1888043808944326, + "grad_norm": 1.7193052768707275, + "learning_rate": 7.326336989741929e-06, + "loss": 0.4039, + "step": 9769 + }, + { + "epoch": 1.1889260724064497, + "grad_norm": 1.9421888589859009, + "learning_rate": 7.324457361721835e-06, + "loss": 0.305, + "step": 9770 + }, + { + "epoch": 1.1890477639184667, + "grad_norm": 1.9311881065368652, + "learning_rate": 7.322577835511676e-06, + "loss": 0.3984, + "step": 9771 + }, + { + "epoch": 1.1891694554304837, + "grad_norm": 1.4759806394577026, + "learning_rate": 7.32069841118297e-06, + "loss": 0.3929, + "step": 9772 + }, + { + "epoch": 1.1892911469425007, + "grad_norm": 1.4620578289031982, + "learning_rate": 7.318819088807234e-06, + "loss": 0.381, + "step": 9773 + }, + { + "epoch": 1.1894128384545177, + "grad_norm": 2.0390918254852295, + "learning_rate": 7.316939868455985e-06, + "loss": 0.4152, + "step": 9774 + }, + { + "epoch": 1.1895345299665347, + "grad_norm": 2.3117787837982178, + "learning_rate": 7.3150607502007246e-06, + "loss": 0.3536, + "step": 9775 + }, + { + "epoch": 1.1896562214785518, + "grad_norm": 2.2581448554992676, + "learning_rate": 7.313181734112961e-06, + "loss": 0.3817, + "step": 9776 + }, + { + "epoch": 1.1897779129905688, + "grad_norm": 1.3782665729522705, + "learning_rate": 7.311302820264193e-06, + "loss": 0.3219, + "step": 9777 + }, + { + "epoch": 1.189899604502586, + "grad_norm": 1.7863227128982544, + "learning_rate": 7.3094240087259205e-06, + "loss": 0.3513, + "step": 9778 + }, + { + "epoch": 1.190021296014603, + "grad_norm": 1.720304250717163, + "learning_rate": 7.307545299569631e-06, + "loss": 0.4215, + "step": 9779 + }, + { + "epoch": 1.19014298752662, + "grad_norm": 2.7664988040924072, + "learning_rate": 7.305666692866817e-06, + "loss": 0.4042, + "step": 9780 + }, + { + "epoch": 1.190264679038637, + "grad_norm": 2.4821979999542236, + "learning_rate": 7.303788188688964e-06, + "loss": 0.3629, + "step": 9781 + }, + { + "epoch": 1.190386370550654, + "grad_norm": 1.498619794845581, + "learning_rate": 7.301909787107553e-06, + "loss": 0.3873, + "step": 9782 + }, + { + "epoch": 1.1905080620626711, + "grad_norm": 2.3452537059783936, + "learning_rate": 7.300031488194055e-06, + "loss": 0.3863, + "step": 9783 + }, + { + "epoch": 1.1906297535746881, + "grad_norm": 1.4993228912353516, + "learning_rate": 7.298153292019948e-06, + "loss": 0.3699, + "step": 9784 + }, + { + "epoch": 1.1907514450867052, + "grad_norm": 2.058849811553955, + "learning_rate": 7.296275198656701e-06, + "loss": 0.3875, + "step": 9785 + }, + { + "epoch": 1.1908731365987222, + "grad_norm": 3.372253894805908, + "learning_rate": 7.2943972081757805e-06, + "loss": 0.3094, + "step": 9786 + }, + { + "epoch": 1.1909948281107392, + "grad_norm": 3.1086041927337646, + "learning_rate": 7.292519320648646e-06, + "loss": 0.4501, + "step": 9787 + }, + { + "epoch": 1.1911165196227562, + "grad_norm": 2.2981343269348145, + "learning_rate": 7.290641536146753e-06, + "loss": 0.3557, + "step": 9788 + }, + { + "epoch": 1.1912382111347735, + "grad_norm": 1.3525437116622925, + "learning_rate": 7.28876385474156e-06, + "loss": 0.3874, + "step": 9789 + }, + { + "epoch": 1.1913599026467905, + "grad_norm": 2.6758930683135986, + "learning_rate": 7.286886276504514e-06, + "loss": 0.3771, + "step": 9790 + }, + { + "epoch": 1.1914815941588075, + "grad_norm": 1.3250406980514526, + "learning_rate": 7.285008801507061e-06, + "loss": 0.3683, + "step": 9791 + }, + { + "epoch": 1.1916032856708245, + "grad_norm": 2.3617355823516846, + "learning_rate": 7.283131429820644e-06, + "loss": 0.4301, + "step": 9792 + }, + { + "epoch": 1.1917249771828415, + "grad_norm": 1.3408892154693604, + "learning_rate": 7.2812541615166995e-06, + "loss": 0.4119, + "step": 9793 + }, + { + "epoch": 1.1918466686948586, + "grad_norm": 1.8691617250442505, + "learning_rate": 7.279376996666662e-06, + "loss": 0.3411, + "step": 9794 + }, + { + "epoch": 1.1919683602068756, + "grad_norm": 2.074599027633667, + "learning_rate": 7.277499935341959e-06, + "loss": 0.4041, + "step": 9795 + }, + { + "epoch": 1.1920900517188926, + "grad_norm": 1.9167262315750122, + "learning_rate": 7.2756229776140185e-06, + "loss": 0.4124, + "step": 9796 + }, + { + "epoch": 1.1922117432309096, + "grad_norm": 1.5892618894577026, + "learning_rate": 7.273746123554265e-06, + "loss": 0.3583, + "step": 9797 + }, + { + "epoch": 1.1923334347429266, + "grad_norm": 1.6339491605758667, + "learning_rate": 7.271869373234113e-06, + "loss": 0.3575, + "step": 9798 + }, + { + "epoch": 1.1924551262549437, + "grad_norm": 2.667067289352417, + "learning_rate": 7.269992726724977e-06, + "loss": 0.3877, + "step": 9799 + }, + { + "epoch": 1.1925768177669607, + "grad_norm": 1.3575387001037598, + "learning_rate": 7.268116184098273e-06, + "loss": 0.4284, + "step": 9800 + }, + { + "epoch": 1.1926985092789777, + "grad_norm": 2.481475830078125, + "learning_rate": 7.266239745425398e-06, + "loss": 0.3566, + "step": 9801 + }, + { + "epoch": 1.1928202007909947, + "grad_norm": 2.239607810974121, + "learning_rate": 7.264363410777761e-06, + "loss": 0.4492, + "step": 9802 + }, + { + "epoch": 1.192941892303012, + "grad_norm": 2.346715211868286, + "learning_rate": 7.262487180226758e-06, + "loss": 0.3635, + "step": 9803 + }, + { + "epoch": 1.193063583815029, + "grad_norm": 1.6116138696670532, + "learning_rate": 7.260611053843787e-06, + "loss": 0.38, + "step": 9804 + }, + { + "epoch": 1.193185275327046, + "grad_norm": 1.5168038606643677, + "learning_rate": 7.258735031700237e-06, + "loss": 0.4297, + "step": 9805 + }, + { + "epoch": 1.193306966839063, + "grad_norm": 1.8599116802215576, + "learning_rate": 7.256859113867489e-06, + "loss": 0.3472, + "step": 9806 + }, + { + "epoch": 1.19342865835108, + "grad_norm": 1.5954008102416992, + "learning_rate": 7.25498330041693e-06, + "loss": 0.3332, + "step": 9807 + }, + { + "epoch": 1.193550349863097, + "grad_norm": 2.0614848136901855, + "learning_rate": 7.25310759141994e-06, + "loss": 0.4302, + "step": 9808 + }, + { + "epoch": 1.193672041375114, + "grad_norm": 1.8109195232391357, + "learning_rate": 7.251231986947891e-06, + "loss": 0.4274, + "step": 9809 + }, + { + "epoch": 1.193793732887131, + "grad_norm": 2.0193774700164795, + "learning_rate": 7.249356487072153e-06, + "loss": 0.3392, + "step": 9810 + }, + { + "epoch": 1.193915424399148, + "grad_norm": 1.418201208114624, + "learning_rate": 7.2474810918640945e-06, + "loss": 0.349, + "step": 9811 + }, + { + "epoch": 1.1940371159111651, + "grad_norm": 1.9252541065216064, + "learning_rate": 7.245605801395081e-06, + "loss": 0.3656, + "step": 9812 + }, + { + "epoch": 1.1941588074231821, + "grad_norm": 1.3925299644470215, + "learning_rate": 7.243730615736464e-06, + "loss": 0.3959, + "step": 9813 + }, + { + "epoch": 1.1942804989351994, + "grad_norm": 2.263493776321411, + "learning_rate": 7.2418555349596035e-06, + "loss": 0.3544, + "step": 9814 + }, + { + "epoch": 1.1944021904472164, + "grad_norm": 1.6489735841751099, + "learning_rate": 7.239980559135851e-06, + "loss": 0.3908, + "step": 9815 + }, + { + "epoch": 1.1945238819592334, + "grad_norm": 1.5460681915283203, + "learning_rate": 7.238105688336554e-06, + "loss": 0.3676, + "step": 9816 + }, + { + "epoch": 1.1946455734712504, + "grad_norm": 1.6218175888061523, + "learning_rate": 7.2362309226330474e-06, + "loss": 0.4264, + "step": 9817 + }, + { + "epoch": 1.1947672649832675, + "grad_norm": 2.4622278213500977, + "learning_rate": 7.234356262096675e-06, + "loss": 0.354, + "step": 9818 + }, + { + "epoch": 1.1948889564952845, + "grad_norm": 2.2065672874450684, + "learning_rate": 7.232481706798771e-06, + "loss": 0.4001, + "step": 9819 + }, + { + "epoch": 1.1950106480073015, + "grad_norm": 2.0798799991607666, + "learning_rate": 7.230607256810669e-06, + "loss": 0.425, + "step": 9820 + }, + { + "epoch": 1.1951323395193185, + "grad_norm": 1.8250443935394287, + "learning_rate": 7.228732912203691e-06, + "loss": 0.3903, + "step": 9821 + }, + { + "epoch": 1.1952540310313355, + "grad_norm": 2.283653497695923, + "learning_rate": 7.2268586730491615e-06, + "loss": 0.4084, + "step": 9822 + }, + { + "epoch": 1.1953757225433526, + "grad_norm": 1.6580002307891846, + "learning_rate": 7.224984539418402e-06, + "loss": 0.4022, + "step": 9823 + }, + { + "epoch": 1.1954974140553696, + "grad_norm": 1.3858612775802612, + "learning_rate": 7.2231105113827226e-06, + "loss": 0.3303, + "step": 9824 + }, + { + "epoch": 1.1956191055673866, + "grad_norm": 1.97579824924469, + "learning_rate": 7.2212365890134365e-06, + "loss": 0.4339, + "step": 9825 + }, + { + "epoch": 1.1957407970794036, + "grad_norm": 3.906874895095825, + "learning_rate": 7.219362772381851e-06, + "loss": 0.4547, + "step": 9826 + }, + { + "epoch": 1.1958624885914206, + "grad_norm": 1.5980494022369385, + "learning_rate": 7.217489061559266e-06, + "loss": 0.3882, + "step": 9827 + }, + { + "epoch": 1.1959841801034379, + "grad_norm": 1.6412523984909058, + "learning_rate": 7.215615456616987e-06, + "loss": 0.4059, + "step": 9828 + }, + { + "epoch": 1.196105871615455, + "grad_norm": 1.7347638607025146, + "learning_rate": 7.213741957626297e-06, + "loss": 0.3622, + "step": 9829 + }, + { + "epoch": 1.196227563127472, + "grad_norm": 2.732839345932007, + "learning_rate": 7.2118685646584955e-06, + "loss": 0.3609, + "step": 9830 + }, + { + "epoch": 1.196349254639489, + "grad_norm": 2.1362144947052, + "learning_rate": 7.209995277784866e-06, + "loss": 0.3514, + "step": 9831 + }, + { + "epoch": 1.196470946151506, + "grad_norm": 1.543515682220459, + "learning_rate": 7.20812209707669e-06, + "loss": 0.3733, + "step": 9832 + }, + { + "epoch": 1.196592637663523, + "grad_norm": 1.354123592376709, + "learning_rate": 7.2062490226052464e-06, + "loss": 0.3699, + "step": 9833 + }, + { + "epoch": 1.19671432917554, + "grad_norm": 1.7345744371414185, + "learning_rate": 7.204376054441813e-06, + "loss": 0.3448, + "step": 9834 + }, + { + "epoch": 1.196836020687557, + "grad_norm": 3.250802993774414, + "learning_rate": 7.202503192657655e-06, + "loss": 0.4241, + "step": 9835 + }, + { + "epoch": 1.196957712199574, + "grad_norm": 2.1031477451324463, + "learning_rate": 7.200630437324041e-06, + "loss": 0.391, + "step": 9836 + }, + { + "epoch": 1.197079403711591, + "grad_norm": 1.759873628616333, + "learning_rate": 7.198757788512233e-06, + "loss": 0.36, + "step": 9837 + }, + { + "epoch": 1.197201095223608, + "grad_norm": 2.229681968688965, + "learning_rate": 7.196885246293492e-06, + "loss": 0.4182, + "step": 9838 + }, + { + "epoch": 1.1973227867356253, + "grad_norm": 2.7650949954986572, + "learning_rate": 7.19501281073907e-06, + "loss": 0.4363, + "step": 9839 + }, + { + "epoch": 1.1974444782476423, + "grad_norm": 1.41323721408844, + "learning_rate": 7.193140481920215e-06, + "loss": 0.3438, + "step": 9840 + }, + { + "epoch": 1.1975661697596593, + "grad_norm": 2.160066843032837, + "learning_rate": 7.191268259908173e-06, + "loss": 0.4054, + "step": 9841 + }, + { + "epoch": 1.1976878612716764, + "grad_norm": 2.335728168487549, + "learning_rate": 7.18939614477419e-06, + "loss": 0.3609, + "step": 9842 + }, + { + "epoch": 1.1978095527836934, + "grad_norm": 1.6418588161468506, + "learning_rate": 7.187524136589499e-06, + "loss": 0.4025, + "step": 9843 + }, + { + "epoch": 1.1979312442957104, + "grad_norm": 3.662625312805176, + "learning_rate": 7.185652235425335e-06, + "loss": 0.3183, + "step": 9844 + }, + { + "epoch": 1.1980529358077274, + "grad_norm": 1.84652841091156, + "learning_rate": 7.183780441352931e-06, + "loss": 0.4042, + "step": 9845 + }, + { + "epoch": 1.1981746273197444, + "grad_norm": 1.9028189182281494, + "learning_rate": 7.1819087544435115e-06, + "loss": 0.3812, + "step": 9846 + }, + { + "epoch": 1.1982963188317615, + "grad_norm": 1.8770074844360352, + "learning_rate": 7.180037174768295e-06, + "loss": 0.4045, + "step": 9847 + }, + { + "epoch": 1.1984180103437785, + "grad_norm": 1.9270813465118408, + "learning_rate": 7.178165702398501e-06, + "loss": 0.4106, + "step": 9848 + }, + { + "epoch": 1.1985397018557955, + "grad_norm": 2.323920726776123, + "learning_rate": 7.176294337405345e-06, + "loss": 0.4461, + "step": 9849 + }, + { + "epoch": 1.1986613933678125, + "grad_norm": 2.5973894596099854, + "learning_rate": 7.174423079860032e-06, + "loss": 0.4, + "step": 9850 + }, + { + "epoch": 1.1987830848798295, + "grad_norm": 1.7463017702102661, + "learning_rate": 7.1725519298337745e-06, + "loss": 0.4001, + "step": 9851 + }, + { + "epoch": 1.1989047763918466, + "grad_norm": 2.995480537414551, + "learning_rate": 7.170680887397763e-06, + "loss": 0.3835, + "step": 9852 + }, + { + "epoch": 1.1990264679038638, + "grad_norm": 2.71461820602417, + "learning_rate": 7.1688099526232015e-06, + "loss": 0.4077, + "step": 9853 + }, + { + "epoch": 1.1991481594158808, + "grad_norm": 2.7108864784240723, + "learning_rate": 7.166939125581283e-06, + "loss": 0.3734, + "step": 9854 + }, + { + "epoch": 1.1992698509278978, + "grad_norm": 1.7506965398788452, + "learning_rate": 7.165068406343192e-06, + "loss": 0.4198, + "step": 9855 + }, + { + "epoch": 1.1993915424399149, + "grad_norm": 2.925708770751953, + "learning_rate": 7.163197794980117e-06, + "loss": 0.3223, + "step": 9856 + }, + { + "epoch": 1.1995132339519319, + "grad_norm": 1.3340356349945068, + "learning_rate": 7.161327291563239e-06, + "loss": 0.3423, + "step": 9857 + }, + { + "epoch": 1.199634925463949, + "grad_norm": 3.1486504077911377, + "learning_rate": 7.1594568961637325e-06, + "loss": 0.4445, + "step": 9858 + }, + { + "epoch": 1.199756616975966, + "grad_norm": 1.4620342254638672, + "learning_rate": 7.157586608852769e-06, + "loss": 0.3572, + "step": 9859 + }, + { + "epoch": 1.199878308487983, + "grad_norm": 1.8884685039520264, + "learning_rate": 7.155716429701522e-06, + "loss": 0.4246, + "step": 9860 + }, + { + "epoch": 1.2, + "grad_norm": 1.7373028993606567, + "learning_rate": 7.153846358781149e-06, + "loss": 0.3394, + "step": 9861 + }, + { + "epoch": 1.200121691512017, + "grad_norm": 1.6023287773132324, + "learning_rate": 7.151976396162818e-06, + "loss": 0.3618, + "step": 9862 + }, + { + "epoch": 1.200243383024034, + "grad_norm": 3.0164480209350586, + "learning_rate": 7.1501065419176775e-06, + "loss": 0.3861, + "step": 9863 + }, + { + "epoch": 1.2003650745360512, + "grad_norm": 1.3019382953643799, + "learning_rate": 7.148236796116881e-06, + "loss": 0.3772, + "step": 9864 + }, + { + "epoch": 1.2004867660480683, + "grad_norm": 1.565251350402832, + "learning_rate": 7.146367158831578e-06, + "loss": 0.3929, + "step": 9865 + }, + { + "epoch": 1.2006084575600853, + "grad_norm": 2.9135732650756836, + "learning_rate": 7.1444976301329105e-06, + "loss": 0.4156, + "step": 9866 + }, + { + "epoch": 1.2007301490721023, + "grad_norm": 1.5117042064666748, + "learning_rate": 7.142628210092019e-06, + "loss": 0.3535, + "step": 9867 + }, + { + "epoch": 1.2008518405841193, + "grad_norm": 1.5485661029815674, + "learning_rate": 7.14075889878004e-06, + "loss": 0.4021, + "step": 9868 + }, + { + "epoch": 1.2009735320961363, + "grad_norm": 1.477531909942627, + "learning_rate": 7.138889696268101e-06, + "loss": 0.3909, + "step": 9869 + }, + { + "epoch": 1.2010952236081534, + "grad_norm": 1.7225297689437866, + "learning_rate": 7.137020602627332e-06, + "loss": 0.3782, + "step": 9870 + }, + { + "epoch": 1.2012169151201704, + "grad_norm": 1.5606352090835571, + "learning_rate": 7.135151617928855e-06, + "loss": 0.3743, + "step": 9871 + }, + { + "epoch": 1.2013386066321874, + "grad_norm": 1.413008689880371, + "learning_rate": 7.13328274224379e-06, + "loss": 0.3959, + "step": 9872 + }, + { + "epoch": 1.2014602981442044, + "grad_norm": 1.8778789043426514, + "learning_rate": 7.131413975643249e-06, + "loss": 0.3499, + "step": 9873 + }, + { + "epoch": 1.2015819896562214, + "grad_norm": 1.7032098770141602, + "learning_rate": 7.1295453181983475e-06, + "loss": 0.3843, + "step": 9874 + }, + { + "epoch": 1.2017036811682384, + "grad_norm": 1.5106642246246338, + "learning_rate": 7.127676769980185e-06, + "loss": 0.3485, + "step": 9875 + }, + { + "epoch": 1.2018253726802555, + "grad_norm": 1.6289196014404297, + "learning_rate": 7.125808331059868e-06, + "loss": 0.3643, + "step": 9876 + }, + { + "epoch": 1.2019470641922725, + "grad_norm": 3.55741286277771, + "learning_rate": 7.123940001508491e-06, + "loss": 0.3463, + "step": 9877 + }, + { + "epoch": 1.2020687557042895, + "grad_norm": 1.4855420589447021, + "learning_rate": 7.122071781397151e-06, + "loss": 0.36, + "step": 9878 + }, + { + "epoch": 1.2021904472163067, + "grad_norm": 1.5825939178466797, + "learning_rate": 7.120203670796936e-06, + "loss": 0.3507, + "step": 9879 + }, + { + "epoch": 1.2023121387283238, + "grad_norm": 2.6188483238220215, + "learning_rate": 7.118335669778934e-06, + "loss": 0.335, + "step": 9880 + }, + { + "epoch": 1.2024338302403408, + "grad_norm": 1.9618849754333496, + "learning_rate": 7.116467778414223e-06, + "loss": 0.411, + "step": 9881 + }, + { + "epoch": 1.2025555217523578, + "grad_norm": 1.689078450202942, + "learning_rate": 7.114599996773881e-06, + "loss": 0.4108, + "step": 9882 + }, + { + "epoch": 1.2026772132643748, + "grad_norm": 1.631308674812317, + "learning_rate": 7.112732324928985e-06, + "loss": 0.3441, + "step": 9883 + }, + { + "epoch": 1.2027989047763918, + "grad_norm": 3.095848798751831, + "learning_rate": 7.110864762950598e-06, + "loss": 0.4489, + "step": 9884 + }, + { + "epoch": 1.2029205962884089, + "grad_norm": 3.32705020904541, + "learning_rate": 7.1089973109097894e-06, + "loss": 0.4167, + "step": 9885 + }, + { + "epoch": 1.2030422878004259, + "grad_norm": 1.5482968091964722, + "learning_rate": 7.1071299688776155e-06, + "loss": 0.358, + "step": 9886 + }, + { + "epoch": 1.203163979312443, + "grad_norm": 3.5254554748535156, + "learning_rate": 7.105262736925132e-06, + "loss": 0.3866, + "step": 9887 + }, + { + "epoch": 1.20328567082446, + "grad_norm": 3.1695315837860107, + "learning_rate": 7.103395615123396e-06, + "loss": 0.3123, + "step": 9888 + }, + { + "epoch": 1.203407362336477, + "grad_norm": 1.5443141460418701, + "learning_rate": 7.101528603543451e-06, + "loss": 0.3557, + "step": 9889 + }, + { + "epoch": 1.2035290538484942, + "grad_norm": 1.659907579421997, + "learning_rate": 7.099661702256341e-06, + "loss": 0.4285, + "step": 9890 + }, + { + "epoch": 1.2036507453605112, + "grad_norm": 1.6083381175994873, + "learning_rate": 7.097794911333109e-06, + "loss": 0.4032, + "step": 9891 + }, + { + "epoch": 1.2037724368725282, + "grad_norm": 1.3338496685028076, + "learning_rate": 7.095928230844786e-06, + "loss": 0.3515, + "step": 9892 + }, + { + "epoch": 1.2038941283845452, + "grad_norm": 3.1042778491973877, + "learning_rate": 7.0940616608624056e-06, + "loss": 0.3583, + "step": 9893 + }, + { + "epoch": 1.2040158198965623, + "grad_norm": 2.0684478282928467, + "learning_rate": 7.092195201456995e-06, + "loss": 0.3558, + "step": 9894 + }, + { + "epoch": 1.2041375114085793, + "grad_norm": 4.512422561645508, + "learning_rate": 7.0903288526995736e-06, + "loss": 0.3876, + "step": 9895 + }, + { + "epoch": 1.2042592029205963, + "grad_norm": 2.115954637527466, + "learning_rate": 7.088462614661163e-06, + "loss": 0.4278, + "step": 9896 + }, + { + "epoch": 1.2043808944326133, + "grad_norm": 1.716399073600769, + "learning_rate": 7.08659648741278e-06, + "loss": 0.3915, + "step": 9897 + }, + { + "epoch": 1.2045025859446303, + "grad_norm": 1.7902320623397827, + "learning_rate": 7.084730471025427e-06, + "loss": 0.4327, + "step": 9898 + }, + { + "epoch": 1.2046242774566474, + "grad_norm": 2.3689777851104736, + "learning_rate": 7.0828645655701155e-06, + "loss": 0.3079, + "step": 9899 + }, + { + "epoch": 1.2047459689686644, + "grad_norm": 1.9941939115524292, + "learning_rate": 7.080998771117844e-06, + "loss": 0.4092, + "step": 9900 + }, + { + "epoch": 1.2048676604806814, + "grad_norm": 1.7526655197143555, + "learning_rate": 7.079133087739611e-06, + "loss": 0.3985, + "step": 9901 + }, + { + "epoch": 1.2049893519926984, + "grad_norm": 1.446198582649231, + "learning_rate": 7.077267515506413e-06, + "loss": 0.4139, + "step": 9902 + }, + { + "epoch": 1.2051110435047154, + "grad_norm": 1.6286088228225708, + "learning_rate": 7.075402054489234e-06, + "loss": 0.3994, + "step": 9903 + }, + { + "epoch": 1.2052327350167327, + "grad_norm": 1.712848424911499, + "learning_rate": 7.073536704759059e-06, + "loss": 0.421, + "step": 9904 + }, + { + "epoch": 1.2053544265287497, + "grad_norm": 1.4952540397644043, + "learning_rate": 7.071671466386869e-06, + "loss": 0.3555, + "step": 9905 + }, + { + "epoch": 1.2054761180407667, + "grad_norm": 1.493610143661499, + "learning_rate": 7.0698063394436435e-06, + "loss": 0.3761, + "step": 9906 + }, + { + "epoch": 1.2055978095527837, + "grad_norm": 1.6017245054244995, + "learning_rate": 7.067941324000352e-06, + "loss": 0.3529, + "step": 9907 + }, + { + "epoch": 1.2057195010648007, + "grad_norm": 1.9543555974960327, + "learning_rate": 7.066076420127964e-06, + "loss": 0.4133, + "step": 9908 + }, + { + "epoch": 1.2058411925768178, + "grad_norm": 2.609764575958252, + "learning_rate": 7.064211627897437e-06, + "loss": 0.4384, + "step": 9909 + }, + { + "epoch": 1.2059628840888348, + "grad_norm": 2.833123207092285, + "learning_rate": 7.062346947379738e-06, + "loss": 0.3833, + "step": 9910 + }, + { + "epoch": 1.2060845756008518, + "grad_norm": 1.7996008396148682, + "learning_rate": 7.060482378645814e-06, + "loss": 0.3262, + "step": 9911 + }, + { + "epoch": 1.2062062671128688, + "grad_norm": 1.8962863683700562, + "learning_rate": 7.058617921766622e-06, + "loss": 0.3332, + "step": 9912 + }, + { + "epoch": 1.2063279586248858, + "grad_norm": 2.3121402263641357, + "learning_rate": 7.056753576813106e-06, + "loss": 0.4039, + "step": 9913 + }, + { + "epoch": 1.2064496501369029, + "grad_norm": 1.6381328105926514, + "learning_rate": 7.0548893438562105e-06, + "loss": 0.4022, + "step": 9914 + }, + { + "epoch": 1.20657134164892, + "grad_norm": 1.5254966020584106, + "learning_rate": 7.05302522296687e-06, + "loss": 0.3737, + "step": 9915 + }, + { + "epoch": 1.2066930331609371, + "grad_norm": 2.949911594390869, + "learning_rate": 7.051161214216018e-06, + "loss": 0.4068, + "step": 9916 + }, + { + "epoch": 1.2068147246729541, + "grad_norm": 2.3714778423309326, + "learning_rate": 7.049297317674588e-06, + "loss": 0.3557, + "step": 9917 + }, + { + "epoch": 1.2069364161849712, + "grad_norm": 1.5922006368637085, + "learning_rate": 7.047433533413501e-06, + "loss": 0.3946, + "step": 9918 + }, + { + "epoch": 1.2070581076969882, + "grad_norm": 1.5477550029754639, + "learning_rate": 7.04556986150368e-06, + "loss": 0.3763, + "step": 9919 + }, + { + "epoch": 1.2071797992090052, + "grad_norm": 2.199549913406372, + "learning_rate": 7.0437063020160455e-06, + "loss": 0.4445, + "step": 9920 + }, + { + "epoch": 1.2073014907210222, + "grad_norm": 2.6356499195098877, + "learning_rate": 7.041842855021501e-06, + "loss": 0.3384, + "step": 9921 + }, + { + "epoch": 1.2074231822330392, + "grad_norm": 1.3380391597747803, + "learning_rate": 7.03997952059096e-06, + "loss": 0.3674, + "step": 9922 + }, + { + "epoch": 1.2075448737450563, + "grad_norm": 1.6883161067962646, + "learning_rate": 7.038116298795323e-06, + "loss": 0.4058, + "step": 9923 + }, + { + "epoch": 1.2076665652570733, + "grad_norm": 1.5603445768356323, + "learning_rate": 7.036253189705493e-06, + "loss": 0.3278, + "step": 9924 + }, + { + "epoch": 1.2077882567690903, + "grad_norm": 1.529004693031311, + "learning_rate": 7.034390193392366e-06, + "loss": 0.4173, + "step": 9925 + }, + { + "epoch": 1.2079099482811073, + "grad_norm": 1.2442429065704346, + "learning_rate": 7.032527309926827e-06, + "loss": 0.3665, + "step": 9926 + }, + { + "epoch": 1.2080316397931243, + "grad_norm": 1.8064535856246948, + "learning_rate": 7.030664539379768e-06, + "loss": 0.3875, + "step": 9927 + }, + { + "epoch": 1.2081533313051414, + "grad_norm": 2.5870845317840576, + "learning_rate": 7.028801881822069e-06, + "loss": 0.4509, + "step": 9928 + }, + { + "epoch": 1.2082750228171586, + "grad_norm": 1.7757675647735596, + "learning_rate": 7.026939337324609e-06, + "loss": 0.3831, + "step": 9929 + }, + { + "epoch": 1.2083967143291756, + "grad_norm": 2.2062106132507324, + "learning_rate": 7.025076905958259e-06, + "loss": 0.3626, + "step": 9930 + }, + { + "epoch": 1.2085184058411926, + "grad_norm": 1.3745630979537964, + "learning_rate": 7.0232145877938965e-06, + "loss": 0.372, + "step": 9931 + }, + { + "epoch": 1.2086400973532097, + "grad_norm": 1.5313496589660645, + "learning_rate": 7.021352382902375e-06, + "loss": 0.3815, + "step": 9932 + }, + { + "epoch": 1.2087617888652267, + "grad_norm": 1.932791829109192, + "learning_rate": 7.019490291354563e-06, + "loss": 0.4015, + "step": 9933 + }, + { + "epoch": 1.2088834803772437, + "grad_norm": 1.8884142637252808, + "learning_rate": 7.0176283132213145e-06, + "loss": 0.3278, + "step": 9934 + }, + { + "epoch": 1.2090051718892607, + "grad_norm": 1.3709759712219238, + "learning_rate": 7.01576644857348e-06, + "loss": 0.3884, + "step": 9935 + }, + { + "epoch": 1.2091268634012777, + "grad_norm": 1.4755620956420898, + "learning_rate": 7.01390469748191e-06, + "loss": 0.3341, + "step": 9936 + }, + { + "epoch": 1.2092485549132947, + "grad_norm": 1.6526072025299072, + "learning_rate": 7.012043060017447e-06, + "loss": 0.3783, + "step": 9937 + }, + { + "epoch": 1.2093702464253118, + "grad_norm": 1.6409109830856323, + "learning_rate": 7.01018153625093e-06, + "loss": 0.4409, + "step": 9938 + }, + { + "epoch": 1.2094919379373288, + "grad_norm": 3.184429883956909, + "learning_rate": 7.008320126253194e-06, + "loss": 0.4623, + "step": 9939 + }, + { + "epoch": 1.209613629449346, + "grad_norm": 1.2408146858215332, + "learning_rate": 7.006458830095072e-06, + "loss": 0.3218, + "step": 9940 + }, + { + "epoch": 1.209735320961363, + "grad_norm": 2.685734748840332, + "learning_rate": 7.004597647847386e-06, + "loss": 0.3654, + "step": 9941 + }, + { + "epoch": 1.20985701247338, + "grad_norm": 2.2608585357666016, + "learning_rate": 7.002736579580958e-06, + "loss": 0.4121, + "step": 9942 + }, + { + "epoch": 1.209978703985397, + "grad_norm": 1.6932148933410645, + "learning_rate": 7.000875625366613e-06, + "loss": 0.3663, + "step": 9943 + }, + { + "epoch": 1.210100395497414, + "grad_norm": 1.6316936016082764, + "learning_rate": 6.999014785275155e-06, + "loss": 0.3944, + "step": 9944 + }, + { + "epoch": 1.2102220870094311, + "grad_norm": 1.554034948348999, + "learning_rate": 6.9971540593773955e-06, + "loss": 0.3744, + "step": 9945 + }, + { + "epoch": 1.2103437785214481, + "grad_norm": 3.209688663482666, + "learning_rate": 6.995293447744138e-06, + "loss": 0.344, + "step": 9946 + }, + { + "epoch": 1.2104654700334652, + "grad_norm": 1.660309910774231, + "learning_rate": 6.993432950446187e-06, + "loss": 0.3896, + "step": 9947 + }, + { + "epoch": 1.2105871615454822, + "grad_norm": 1.545854926109314, + "learning_rate": 6.991572567554334e-06, + "loss": 0.3323, + "step": 9948 + }, + { + "epoch": 1.2107088530574992, + "grad_norm": 1.3997136354446411, + "learning_rate": 6.989712299139372e-06, + "loss": 0.3802, + "step": 9949 + }, + { + "epoch": 1.2108305445695162, + "grad_norm": 1.8171509504318237, + "learning_rate": 6.987852145272087e-06, + "loss": 0.3664, + "step": 9950 + }, + { + "epoch": 1.2109522360815332, + "grad_norm": 1.4273338317871094, + "learning_rate": 6.985992106023265e-06, + "loss": 0.3818, + "step": 9951 + }, + { + "epoch": 1.2110739275935503, + "grad_norm": 2.940361976623535, + "learning_rate": 6.984132181463681e-06, + "loss": 0.4578, + "step": 9952 + }, + { + "epoch": 1.2111956191055673, + "grad_norm": 1.6295067071914673, + "learning_rate": 6.9822723716641084e-06, + "loss": 0.3527, + "step": 9953 + }, + { + "epoch": 1.2113173106175845, + "grad_norm": 2.1354970932006836, + "learning_rate": 6.9804126766953225e-06, + "loss": 0.3786, + "step": 9954 + }, + { + "epoch": 1.2114390021296015, + "grad_norm": 1.4374134540557861, + "learning_rate": 6.978553096628082e-06, + "loss": 0.3605, + "step": 9955 + }, + { + "epoch": 1.2115606936416186, + "grad_norm": 1.7722989320755005, + "learning_rate": 6.976693631533151e-06, + "loss": 0.3307, + "step": 9956 + }, + { + "epoch": 1.2116823851536356, + "grad_norm": 1.442394495010376, + "learning_rate": 6.974834281481284e-06, + "loss": 0.3688, + "step": 9957 + }, + { + "epoch": 1.2118040766656526, + "grad_norm": 1.6845611333847046, + "learning_rate": 6.972975046543233e-06, + "loss": 0.3842, + "step": 9958 + }, + { + "epoch": 1.2119257681776696, + "grad_norm": 2.095054864883423, + "learning_rate": 6.971115926789751e-06, + "loss": 0.4221, + "step": 9959 + }, + { + "epoch": 1.2120474596896866, + "grad_norm": 1.698492407798767, + "learning_rate": 6.969256922291574e-06, + "loss": 0.3146, + "step": 9960 + }, + { + "epoch": 1.2121691512017037, + "grad_norm": 1.813895583152771, + "learning_rate": 6.967398033119445e-06, + "loss": 0.339, + "step": 9961 + }, + { + "epoch": 1.2122908427137207, + "grad_norm": 1.9622679948806763, + "learning_rate": 6.9655392593440985e-06, + "loss": 0.3981, + "step": 9962 + }, + { + "epoch": 1.2124125342257377, + "grad_norm": 1.5471811294555664, + "learning_rate": 6.963680601036264e-06, + "loss": 0.3574, + "step": 9963 + }, + { + "epoch": 1.2125342257377547, + "grad_norm": 2.0501105785369873, + "learning_rate": 6.961822058266667e-06, + "loss": 0.3415, + "step": 9964 + }, + { + "epoch": 1.212655917249772, + "grad_norm": 1.4566954374313354, + "learning_rate": 6.959963631106029e-06, + "loss": 0.4249, + "step": 9965 + }, + { + "epoch": 1.212777608761789, + "grad_norm": 1.7529780864715576, + "learning_rate": 6.958105319625073e-06, + "loss": 0.3655, + "step": 9966 + }, + { + "epoch": 1.212899300273806, + "grad_norm": 1.6072533130645752, + "learning_rate": 6.956247123894502e-06, + "loss": 0.3096, + "step": 9967 + }, + { + "epoch": 1.213020991785823, + "grad_norm": 1.5779048204421997, + "learning_rate": 6.954389043985026e-06, + "loss": 0.3452, + "step": 9968 + }, + { + "epoch": 1.21314268329784, + "grad_norm": 2.105333089828491, + "learning_rate": 6.952531079967352e-06, + "loss": 0.3881, + "step": 9969 + }, + { + "epoch": 1.213264374809857, + "grad_norm": 1.9414513111114502, + "learning_rate": 6.950673231912179e-06, + "loss": 0.4065, + "step": 9970 + }, + { + "epoch": 1.213386066321874, + "grad_norm": 1.4344571828842163, + "learning_rate": 6.948815499890201e-06, + "loss": 0.376, + "step": 9971 + }, + { + "epoch": 1.213507757833891, + "grad_norm": 1.4175901412963867, + "learning_rate": 6.946957883972107e-06, + "loss": 0.3402, + "step": 9972 + }, + { + "epoch": 1.213629449345908, + "grad_norm": 2.5647964477539062, + "learning_rate": 6.945100384228587e-06, + "loss": 0.3717, + "step": 9973 + }, + { + "epoch": 1.2137511408579251, + "grad_norm": 2.6028192043304443, + "learning_rate": 6.943243000730322e-06, + "loss": 0.3441, + "step": 9974 + }, + { + "epoch": 1.2138728323699421, + "grad_norm": 2.2155604362487793, + "learning_rate": 6.941385733547985e-06, + "loss": 0.3948, + "step": 9975 + }, + { + "epoch": 1.2139945238819592, + "grad_norm": 1.776086449623108, + "learning_rate": 6.939528582752253e-06, + "loss": 0.3442, + "step": 9976 + }, + { + "epoch": 1.2141162153939762, + "grad_norm": 2.011115312576294, + "learning_rate": 6.937671548413796e-06, + "loss": 0.4114, + "step": 9977 + }, + { + "epoch": 1.2142379069059932, + "grad_norm": 1.7712589502334595, + "learning_rate": 6.935814630603275e-06, + "loss": 0.36, + "step": 9978 + }, + { + "epoch": 1.2143595984180102, + "grad_norm": 1.51893150806427, + "learning_rate": 6.933957829391346e-06, + "loss": 0.3782, + "step": 9979 + }, + { + "epoch": 1.2144812899300275, + "grad_norm": 2.2671079635620117, + "learning_rate": 6.932101144848669e-06, + "loss": 0.3938, + "step": 9980 + }, + { + "epoch": 1.2146029814420445, + "grad_norm": 1.7446645498275757, + "learning_rate": 6.930244577045895e-06, + "loss": 0.3357, + "step": 9981 + }, + { + "epoch": 1.2147246729540615, + "grad_norm": 2.676668405532837, + "learning_rate": 6.928388126053669e-06, + "loss": 0.4248, + "step": 9982 + }, + { + "epoch": 1.2148463644660785, + "grad_norm": 3.3868157863616943, + "learning_rate": 6.92653179194263e-06, + "loss": 0.4796, + "step": 9983 + }, + { + "epoch": 1.2149680559780955, + "grad_norm": 1.732919454574585, + "learning_rate": 6.92467557478342e-06, + "loss": 0.35, + "step": 9984 + }, + { + "epoch": 1.2150897474901126, + "grad_norm": 1.8993979692459106, + "learning_rate": 6.922819474646672e-06, + "loss": 0.3858, + "step": 9985 + }, + { + "epoch": 1.2152114390021296, + "grad_norm": 1.9380769729614258, + "learning_rate": 6.920963491603008e-06, + "loss": 0.3871, + "step": 9986 + }, + { + "epoch": 1.2153331305141466, + "grad_norm": 1.2939344644546509, + "learning_rate": 6.919107625723058e-06, + "loss": 0.398, + "step": 9987 + }, + { + "epoch": 1.2154548220261636, + "grad_norm": 1.5331484079360962, + "learning_rate": 6.917251877077442e-06, + "loss": 0.332, + "step": 9988 + }, + { + "epoch": 1.2155765135381806, + "grad_norm": 1.5995283126831055, + "learning_rate": 6.915396245736773e-06, + "loss": 0.3808, + "step": 9989 + }, + { + "epoch": 1.2156982050501977, + "grad_norm": 1.3807971477508545, + "learning_rate": 6.913540731771662e-06, + "loss": 0.3857, + "step": 9990 + }, + { + "epoch": 1.215819896562215, + "grad_norm": 2.8587934970855713, + "learning_rate": 6.9116853352527115e-06, + "loss": 0.3431, + "step": 9991 + }, + { + "epoch": 1.215941588074232, + "grad_norm": 1.5146300792694092, + "learning_rate": 6.909830056250527e-06, + "loss": 0.3872, + "step": 9992 + }, + { + "epoch": 1.216063279586249, + "grad_norm": 1.8994717597961426, + "learning_rate": 6.907974894835708e-06, + "loss": 0.387, + "step": 9993 + }, + { + "epoch": 1.216184971098266, + "grad_norm": 1.5792795419692993, + "learning_rate": 6.906119851078841e-06, + "loss": 0.3821, + "step": 9994 + }, + { + "epoch": 1.216306662610283, + "grad_norm": 1.907387137413025, + "learning_rate": 6.9042649250505186e-06, + "loss": 0.4274, + "step": 9995 + }, + { + "epoch": 1.2164283541223, + "grad_norm": 2.6051313877105713, + "learning_rate": 6.9024101168213255e-06, + "loss": 0.3032, + "step": 9996 + }, + { + "epoch": 1.216550045634317, + "grad_norm": 2.166860818862915, + "learning_rate": 6.900555426461837e-06, + "loss": 0.3578, + "step": 9997 + }, + { + "epoch": 1.216671737146334, + "grad_norm": 1.4204481840133667, + "learning_rate": 6.89870085404263e-06, + "loss": 0.3766, + "step": 9998 + }, + { + "epoch": 1.216793428658351, + "grad_norm": 1.676345705986023, + "learning_rate": 6.8968463996342736e-06, + "loss": 0.3709, + "step": 9999 + }, + { + "epoch": 1.216915120170368, + "grad_norm": 3.2839457988739014, + "learning_rate": 6.89499206330734e-06, + "loss": 0.3924, + "step": 10000 + }, + { + "epoch": 1.217036811682385, + "grad_norm": 1.7567495107650757, + "learning_rate": 6.8931378451323846e-06, + "loss": 0.3009, + "step": 10001 + }, + { + "epoch": 1.217158503194402, + "grad_norm": 1.9860072135925293, + "learning_rate": 6.891283745179962e-06, + "loss": 0.4073, + "step": 10002 + }, + { + "epoch": 1.2172801947064191, + "grad_norm": 1.4669981002807617, + "learning_rate": 6.889429763520627e-06, + "loss": 0.3701, + "step": 10003 + }, + { + "epoch": 1.2174018862184361, + "grad_norm": 1.966073751449585, + "learning_rate": 6.887575900224931e-06, + "loss": 0.3667, + "step": 10004 + }, + { + "epoch": 1.2175235777304534, + "grad_norm": 2.080537796020508, + "learning_rate": 6.885722155363413e-06, + "loss": 0.316, + "step": 10005 + }, + { + "epoch": 1.2176452692424704, + "grad_norm": 1.2806013822555542, + "learning_rate": 6.883868529006611e-06, + "loss": 0.308, + "step": 10006 + }, + { + "epoch": 1.2177669607544874, + "grad_norm": 4.204131603240967, + "learning_rate": 6.882015021225062e-06, + "loss": 0.4191, + "step": 10007 + }, + { + "epoch": 1.2178886522665044, + "grad_norm": 1.7619069814682007, + "learning_rate": 6.8801616320893e-06, + "loss": 0.3843, + "step": 10008 + }, + { + "epoch": 1.2180103437785215, + "grad_norm": 2.0150258541107178, + "learning_rate": 6.878308361669841e-06, + "loss": 0.3972, + "step": 10009 + }, + { + "epoch": 1.2181320352905385, + "grad_norm": 1.7189114093780518, + "learning_rate": 6.876455210037209e-06, + "loss": 0.3605, + "step": 10010 + }, + { + "epoch": 1.2182537268025555, + "grad_norm": 1.3818416595458984, + "learning_rate": 6.874602177261926e-06, + "loss": 0.3659, + "step": 10011 + }, + { + "epoch": 1.2183754183145725, + "grad_norm": 1.6566050052642822, + "learning_rate": 6.872749263414502e-06, + "loss": 0.3668, + "step": 10012 + }, + { + "epoch": 1.2184971098265895, + "grad_norm": 1.6086212396621704, + "learning_rate": 6.870896468565437e-06, + "loss": 0.415, + "step": 10013 + }, + { + "epoch": 1.2186188013386066, + "grad_norm": 2.278784990310669, + "learning_rate": 6.869043792785237e-06, + "loss": 0.3187, + "step": 10014 + }, + { + "epoch": 1.2187404928506236, + "grad_norm": 1.5243163108825684, + "learning_rate": 6.867191236144402e-06, + "loss": 0.4224, + "step": 10015 + }, + { + "epoch": 1.2188621843626408, + "grad_norm": 2.868350028991699, + "learning_rate": 6.8653387987134275e-06, + "loss": 0.4393, + "step": 10016 + }, + { + "epoch": 1.2189838758746578, + "grad_norm": 1.6022921800613403, + "learning_rate": 6.863486480562798e-06, + "loss": 0.375, + "step": 10017 + }, + { + "epoch": 1.2191055673866749, + "grad_norm": 2.5440804958343506, + "learning_rate": 6.861634281763e-06, + "loss": 0.4253, + "step": 10018 + }, + { + "epoch": 1.2192272588986919, + "grad_norm": 1.742983341217041, + "learning_rate": 6.859782202384515e-06, + "loss": 0.3718, + "step": 10019 + }, + { + "epoch": 1.219348950410709, + "grad_norm": 1.916079044342041, + "learning_rate": 6.857930242497817e-06, + "loss": 0.3954, + "step": 10020 + }, + { + "epoch": 1.219470641922726, + "grad_norm": 2.3970232009887695, + "learning_rate": 6.8560784021733755e-06, + "loss": 0.3671, + "step": 10021 + }, + { + "epoch": 1.219592333434743, + "grad_norm": 1.591254472732544, + "learning_rate": 6.8542266814816594e-06, + "loss": 0.3837, + "step": 10022 + }, + { + "epoch": 1.21971402494676, + "grad_norm": 2.426933765411377, + "learning_rate": 6.85237508049313e-06, + "loss": 0.346, + "step": 10023 + }, + { + "epoch": 1.219835716458777, + "grad_norm": 1.8062587976455688, + "learning_rate": 6.850523599278246e-06, + "loss": 0.431, + "step": 10024 + }, + { + "epoch": 1.219957407970794, + "grad_norm": 1.314801812171936, + "learning_rate": 6.8486722379074545e-06, + "loss": 0.3489, + "step": 10025 + }, + { + "epoch": 1.220079099482811, + "grad_norm": 1.3319562673568726, + "learning_rate": 6.8468209964512074e-06, + "loss": 0.3814, + "step": 10026 + }, + { + "epoch": 1.220200790994828, + "grad_norm": 1.550036907196045, + "learning_rate": 6.84496987497995e-06, + "loss": 0.3752, + "step": 10027 + }, + { + "epoch": 1.220322482506845, + "grad_norm": 1.9958138465881348, + "learning_rate": 6.843118873564119e-06, + "loss": 0.3645, + "step": 10028 + }, + { + "epoch": 1.220444174018862, + "grad_norm": 1.5241824388504028, + "learning_rate": 6.841267992274147e-06, + "loss": 0.4009, + "step": 10029 + }, + { + "epoch": 1.2205658655308793, + "grad_norm": 1.9343754053115845, + "learning_rate": 6.839417231180469e-06, + "loss": 0.3696, + "step": 10030 + }, + { + "epoch": 1.2206875570428963, + "grad_norm": 1.2802180051803589, + "learning_rate": 6.837566590353506e-06, + "loss": 0.3188, + "step": 10031 + }, + { + "epoch": 1.2208092485549134, + "grad_norm": 1.5967847108840942, + "learning_rate": 6.835716069863678e-06, + "loss": 0.4215, + "step": 10032 + }, + { + "epoch": 1.2209309400669304, + "grad_norm": 1.9407161474227905, + "learning_rate": 6.8338656697814055e-06, + "loss": 0.3554, + "step": 10033 + }, + { + "epoch": 1.2210526315789474, + "grad_norm": 1.7045966386795044, + "learning_rate": 6.8320153901770995e-06, + "loss": 0.3964, + "step": 10034 + }, + { + "epoch": 1.2211743230909644, + "grad_norm": 1.8515386581420898, + "learning_rate": 6.830165231121166e-06, + "loss": 0.418, + "step": 10035 + }, + { + "epoch": 1.2212960146029814, + "grad_norm": 2.134194850921631, + "learning_rate": 6.828315192684004e-06, + "loss": 0.4145, + "step": 10036 + }, + { + "epoch": 1.2214177061149984, + "grad_norm": 2.974844455718994, + "learning_rate": 6.8264652749360136e-06, + "loss": 0.416, + "step": 10037 + }, + { + "epoch": 1.2215393976270155, + "grad_norm": 1.8573710918426514, + "learning_rate": 6.824615477947589e-06, + "loss": 0.4382, + "step": 10038 + }, + { + "epoch": 1.2216610891390325, + "grad_norm": 3.1503186225891113, + "learning_rate": 6.822765801789117e-06, + "loss": 0.4425, + "step": 10039 + }, + { + "epoch": 1.2217827806510495, + "grad_norm": 1.7028475999832153, + "learning_rate": 6.820916246530984e-06, + "loss": 0.4468, + "step": 10040 + }, + { + "epoch": 1.2219044721630667, + "grad_norm": 1.371720314025879, + "learning_rate": 6.819066812243567e-06, + "loss": 0.3636, + "step": 10041 + }, + { + "epoch": 1.2220261636750838, + "grad_norm": 2.932171106338501, + "learning_rate": 6.8172174989972435e-06, + "loss": 0.3632, + "step": 10042 + }, + { + "epoch": 1.2221478551871008, + "grad_norm": 1.4529882669448853, + "learning_rate": 6.815368306862382e-06, + "loss": 0.3776, + "step": 10043 + }, + { + "epoch": 1.2222695466991178, + "grad_norm": 2.41349196434021, + "learning_rate": 6.813519235909347e-06, + "loss": 0.3981, + "step": 10044 + }, + { + "epoch": 1.2223912382111348, + "grad_norm": 1.9547290802001953, + "learning_rate": 6.811670286208503e-06, + "loss": 0.3828, + "step": 10045 + }, + { + "epoch": 1.2225129297231518, + "grad_norm": 3.3234975337982178, + "learning_rate": 6.809821457830201e-06, + "loss": 0.3358, + "step": 10046 + }, + { + "epoch": 1.2226346212351689, + "grad_norm": 2.3116579055786133, + "learning_rate": 6.807972750844801e-06, + "loss": 0.3848, + "step": 10047 + }, + { + "epoch": 1.2227563127471859, + "grad_norm": 1.3412984609603882, + "learning_rate": 6.806124165322641e-06, + "loss": 0.4085, + "step": 10048 + }, + { + "epoch": 1.222878004259203, + "grad_norm": 1.237619161605835, + "learning_rate": 6.8042757013340665e-06, + "loss": 0.3525, + "step": 10049 + }, + { + "epoch": 1.22299969577122, + "grad_norm": 1.3679311275482178, + "learning_rate": 6.8024273589494195e-06, + "loss": 0.3829, + "step": 10050 + }, + { + "epoch": 1.223121387283237, + "grad_norm": 1.5347281694412231, + "learning_rate": 6.800579138239028e-06, + "loss": 0.3576, + "step": 10051 + }, + { + "epoch": 1.223243078795254, + "grad_norm": 2.21738338470459, + "learning_rate": 6.7987310392732235e-06, + "loss": 0.3385, + "step": 10052 + }, + { + "epoch": 1.223364770307271, + "grad_norm": 2.2998335361480713, + "learning_rate": 6.796883062122329e-06, + "loss": 0.4308, + "step": 10053 + }, + { + "epoch": 1.223486461819288, + "grad_norm": 1.3314573764801025, + "learning_rate": 6.795035206856663e-06, + "loss": 0.3532, + "step": 10054 + }, + { + "epoch": 1.2236081533313052, + "grad_norm": 1.3780741691589355, + "learning_rate": 6.793187473546541e-06, + "loss": 0.3867, + "step": 10055 + }, + { + "epoch": 1.2237298448433223, + "grad_norm": 2.9927659034729004, + "learning_rate": 6.791339862262276e-06, + "loss": 0.3601, + "step": 10056 + }, + { + "epoch": 1.2238515363553393, + "grad_norm": 3.4382476806640625, + "learning_rate": 6.789492373074169e-06, + "loss": 0.438, + "step": 10057 + }, + { + "epoch": 1.2239732278673563, + "grad_norm": 2.0436625480651855, + "learning_rate": 6.787645006052525e-06, + "loss": 0.341, + "step": 10058 + }, + { + "epoch": 1.2240949193793733, + "grad_norm": 1.3106759786605835, + "learning_rate": 6.785797761267636e-06, + "loss": 0.3361, + "step": 10059 + }, + { + "epoch": 1.2242166108913903, + "grad_norm": 2.131434917449951, + "learning_rate": 6.7839506387897935e-06, + "loss": 0.4025, + "step": 10060 + }, + { + "epoch": 1.2243383024034074, + "grad_norm": 1.2191104888916016, + "learning_rate": 6.7821036386892905e-06, + "loss": 0.344, + "step": 10061 + }, + { + "epoch": 1.2244599939154244, + "grad_norm": 2.634955644607544, + "learning_rate": 6.7802567610364e-06, + "loss": 0.3743, + "step": 10062 + }, + { + "epoch": 1.2245816854274414, + "grad_norm": 1.5233964920043945, + "learning_rate": 6.778410005901406e-06, + "loss": 0.3452, + "step": 10063 + }, + { + "epoch": 1.2247033769394584, + "grad_norm": 1.5571216344833374, + "learning_rate": 6.77656337335458e-06, + "loss": 0.4033, + "step": 10064 + }, + { + "epoch": 1.2248250684514754, + "grad_norm": 2.993842601776123, + "learning_rate": 6.774716863466189e-06, + "loss": 0.4045, + "step": 10065 + }, + { + "epoch": 1.2249467599634927, + "grad_norm": 1.3878711462020874, + "learning_rate": 6.772870476306496e-06, + "loss": 0.395, + "step": 10066 + }, + { + "epoch": 1.2250684514755097, + "grad_norm": 2.649106502532959, + "learning_rate": 6.771024211945762e-06, + "loss": 0.379, + "step": 10067 + }, + { + "epoch": 1.2251901429875267, + "grad_norm": 2.9401559829711914, + "learning_rate": 6.769178070454243e-06, + "loss": 0.3604, + "step": 10068 + }, + { + "epoch": 1.2253118344995437, + "grad_norm": 1.7393020391464233, + "learning_rate": 6.767332051902182e-06, + "loss": 0.3582, + "step": 10069 + }, + { + "epoch": 1.2254335260115607, + "grad_norm": 2.9692940711975098, + "learning_rate": 6.765486156359834e-06, + "loss": 0.3832, + "step": 10070 + }, + { + "epoch": 1.2255552175235778, + "grad_norm": 2.12664532661438, + "learning_rate": 6.7636403838974276e-06, + "loss": 0.3998, + "step": 10071 + }, + { + "epoch": 1.2256769090355948, + "grad_norm": 1.749603509902954, + "learning_rate": 6.761794734585203e-06, + "loss": 0.4046, + "step": 10072 + }, + { + "epoch": 1.2257986005476118, + "grad_norm": 1.5714327096939087, + "learning_rate": 6.759949208493395e-06, + "loss": 0.3969, + "step": 10073 + }, + { + "epoch": 1.2259202920596288, + "grad_norm": 1.6170574426651, + "learning_rate": 6.758103805692223e-06, + "loss": 0.3675, + "step": 10074 + }, + { + "epoch": 1.2260419835716458, + "grad_norm": 1.404693603515625, + "learning_rate": 6.756258526251912e-06, + "loss": 0.3699, + "step": 10075 + }, + { + "epoch": 1.2261636750836629, + "grad_norm": 1.5548479557037354, + "learning_rate": 6.754413370242681e-06, + "loss": 0.4125, + "step": 10076 + }, + { + "epoch": 1.2262853665956799, + "grad_norm": 2.4601943492889404, + "learning_rate": 6.752568337734735e-06, + "loss": 0.3723, + "step": 10077 + }, + { + "epoch": 1.226407058107697, + "grad_norm": 1.7282521724700928, + "learning_rate": 6.750723428798287e-06, + "loss": 0.4007, + "step": 10078 + }, + { + "epoch": 1.226528749619714, + "grad_norm": 3.0546650886535645, + "learning_rate": 6.74887864350354e-06, + "loss": 0.3832, + "step": 10079 + }, + { + "epoch": 1.226650441131731, + "grad_norm": 1.4626129865646362, + "learning_rate": 6.747033981920688e-06, + "loss": 0.3536, + "step": 10080 + }, + { + "epoch": 1.2267721326437482, + "grad_norm": 1.3409062623977661, + "learning_rate": 6.74518944411993e-06, + "loss": 0.3313, + "step": 10081 + }, + { + "epoch": 1.2268938241557652, + "grad_norm": 1.5369508266448975, + "learning_rate": 6.743345030171448e-06, + "loss": 0.3651, + "step": 10082 + }, + { + "epoch": 1.2270155156677822, + "grad_norm": 1.8735958337783813, + "learning_rate": 6.741500740145428e-06, + "loss": 0.3617, + "step": 10083 + }, + { + "epoch": 1.2271372071797992, + "grad_norm": 2.009453058242798, + "learning_rate": 6.73965657411205e-06, + "loss": 0.3338, + "step": 10084 + }, + { + "epoch": 1.2272588986918163, + "grad_norm": 2.2092716693878174, + "learning_rate": 6.737812532141488e-06, + "loss": 0.3531, + "step": 10085 + }, + { + "epoch": 1.2273805902038333, + "grad_norm": 1.8117817640304565, + "learning_rate": 6.735968614303911e-06, + "loss": 0.4075, + "step": 10086 + }, + { + "epoch": 1.2275022817158503, + "grad_norm": 2.3265016078948975, + "learning_rate": 6.734124820669485e-06, + "loss": 0.4198, + "step": 10087 + }, + { + "epoch": 1.2276239732278673, + "grad_norm": 1.1171107292175293, + "learning_rate": 6.732281151308369e-06, + "loss": 0.3141, + "step": 10088 + }, + { + "epoch": 1.2277456647398843, + "grad_norm": 1.3437461853027344, + "learning_rate": 6.730437606290718e-06, + "loss": 0.3325, + "step": 10089 + }, + { + "epoch": 1.2278673562519014, + "grad_norm": 1.692828893661499, + "learning_rate": 6.728594185686687e-06, + "loss": 0.3521, + "step": 10090 + }, + { + "epoch": 1.2279890477639184, + "grad_norm": 1.748609185218811, + "learning_rate": 6.726750889566416e-06, + "loss": 0.4422, + "step": 10091 + }, + { + "epoch": 1.2281107392759356, + "grad_norm": 1.7633051872253418, + "learning_rate": 6.724907718000049e-06, + "loss": 0.3008, + "step": 10092 + }, + { + "epoch": 1.2282324307879526, + "grad_norm": 1.5486901998519897, + "learning_rate": 6.7230646710577265e-06, + "loss": 0.3776, + "step": 10093 + }, + { + "epoch": 1.2283541222999697, + "grad_norm": 1.6553490161895752, + "learning_rate": 6.721221748809571e-06, + "loss": 0.3271, + "step": 10094 + }, + { + "epoch": 1.2284758138119867, + "grad_norm": 4.259578227996826, + "learning_rate": 6.719378951325718e-06, + "loss": 0.4764, + "step": 10095 + }, + { + "epoch": 1.2285975053240037, + "grad_norm": 1.6228665113449097, + "learning_rate": 6.717536278676284e-06, + "loss": 0.4343, + "step": 10096 + }, + { + "epoch": 1.2287191968360207, + "grad_norm": 3.021451950073242, + "learning_rate": 6.715693730931389e-06, + "loss": 0.4, + "step": 10097 + }, + { + "epoch": 1.2288408883480377, + "grad_norm": 1.5281996726989746, + "learning_rate": 6.713851308161146e-06, + "loss": 0.3604, + "step": 10098 + }, + { + "epoch": 1.2289625798600547, + "grad_norm": 1.3944039344787598, + "learning_rate": 6.712009010435663e-06, + "loss": 0.3637, + "step": 10099 + }, + { + "epoch": 1.2290842713720718, + "grad_norm": 1.44630765914917, + "learning_rate": 6.710166837825043e-06, + "loss": 0.3759, + "step": 10100 + }, + { + "epoch": 1.2292059628840888, + "grad_norm": 1.4572608470916748, + "learning_rate": 6.708324790399383e-06, + "loss": 0.3688, + "step": 10101 + }, + { + "epoch": 1.2293276543961058, + "grad_norm": 3.140326499938965, + "learning_rate": 6.70648286822878e-06, + "loss": 0.4223, + "step": 10102 + }, + { + "epoch": 1.2294493459081228, + "grad_norm": 1.6996397972106934, + "learning_rate": 6.70464107138332e-06, + "loss": 0.3813, + "step": 10103 + }, + { + "epoch": 1.2295710374201398, + "grad_norm": 1.4673484563827515, + "learning_rate": 6.702799399933093e-06, + "loss": 0.3678, + "step": 10104 + }, + { + "epoch": 1.2296927289321569, + "grad_norm": 1.7116461992263794, + "learning_rate": 6.700957853948168e-06, + "loss": 0.3603, + "step": 10105 + }, + { + "epoch": 1.229814420444174, + "grad_norm": 1.863695502281189, + "learning_rate": 6.699116433498626e-06, + "loss": 0.4474, + "step": 10106 + }, + { + "epoch": 1.2299361119561911, + "grad_norm": 1.6932259798049927, + "learning_rate": 6.6972751386545376e-06, + "loss": 0.3891, + "step": 10107 + }, + { + "epoch": 1.2300578034682081, + "grad_norm": 2.165278196334839, + "learning_rate": 6.695433969485965e-06, + "loss": 0.3865, + "step": 10108 + }, + { + "epoch": 1.2301794949802252, + "grad_norm": 2.702542543411255, + "learning_rate": 6.693592926062971e-06, + "loss": 0.3932, + "step": 10109 + }, + { + "epoch": 1.2303011864922422, + "grad_norm": 2.1497843265533447, + "learning_rate": 6.69175200845561e-06, + "loss": 0.3509, + "step": 10110 + }, + { + "epoch": 1.2304228780042592, + "grad_norm": 1.8543788194656372, + "learning_rate": 6.689911216733932e-06, + "loss": 0.4296, + "step": 10111 + }, + { + "epoch": 1.2305445695162762, + "grad_norm": 2.2742533683776855, + "learning_rate": 6.688070550967983e-06, + "loss": 0.3936, + "step": 10112 + }, + { + "epoch": 1.2306662610282932, + "grad_norm": 1.6529985666275024, + "learning_rate": 6.686230011227808e-06, + "loss": 0.3612, + "step": 10113 + }, + { + "epoch": 1.2307879525403103, + "grad_norm": 2.7607147693634033, + "learning_rate": 6.684389597583437e-06, + "loss": 0.3032, + "step": 10114 + }, + { + "epoch": 1.2309096440523273, + "grad_norm": 1.5222597122192383, + "learning_rate": 6.682549310104905e-06, + "loss": 0.3835, + "step": 10115 + }, + { + "epoch": 1.2310313355643443, + "grad_norm": 1.828380823135376, + "learning_rate": 6.680709148862243e-06, + "loss": 0.3611, + "step": 10116 + }, + { + "epoch": 1.2311530270763615, + "grad_norm": 2.4681479930877686, + "learning_rate": 6.678869113925465e-06, + "loss": 0.4313, + "step": 10117 + }, + { + "epoch": 1.2312747185883786, + "grad_norm": 1.53141188621521, + "learning_rate": 6.677029205364592e-06, + "loss": 0.3677, + "step": 10118 + }, + { + "epoch": 1.2313964101003956, + "grad_norm": 1.5276798009872437, + "learning_rate": 6.675189423249635e-06, + "loss": 0.3886, + "step": 10119 + }, + { + "epoch": 1.2315181016124126, + "grad_norm": 1.2358633279800415, + "learning_rate": 6.673349767650602e-06, + "loss": 0.3303, + "step": 10120 + }, + { + "epoch": 1.2316397931244296, + "grad_norm": 1.5187122821807861, + "learning_rate": 6.671510238637498e-06, + "loss": 0.3367, + "step": 10121 + }, + { + "epoch": 1.2317614846364466, + "grad_norm": 3.9124903678894043, + "learning_rate": 6.6696708362803174e-06, + "loss": 0.4285, + "step": 10122 + }, + { + "epoch": 1.2318831761484637, + "grad_norm": 2.3285555839538574, + "learning_rate": 6.667831560649054e-06, + "loss": 0.3948, + "step": 10123 + }, + { + "epoch": 1.2320048676604807, + "grad_norm": 3.6912076473236084, + "learning_rate": 6.6659924118136996e-06, + "loss": 0.4472, + "step": 10124 + }, + { + "epoch": 1.2321265591724977, + "grad_norm": 1.5694102048873901, + "learning_rate": 6.664153389844234e-06, + "loss": 0.317, + "step": 10125 + }, + { + "epoch": 1.2322482506845147, + "grad_norm": 1.686165690422058, + "learning_rate": 6.662314494810636e-06, + "loss": 0.3926, + "step": 10126 + }, + { + "epoch": 1.2323699421965317, + "grad_norm": 4.579761505126953, + "learning_rate": 6.660475726782883e-06, + "loss": 0.4725, + "step": 10127 + }, + { + "epoch": 1.2324916337085488, + "grad_norm": 1.8019663095474243, + "learning_rate": 6.658637085830939e-06, + "loss": 0.4087, + "step": 10128 + }, + { + "epoch": 1.2326133252205658, + "grad_norm": 2.6613926887512207, + "learning_rate": 6.65679857202477e-06, + "loss": 0.4409, + "step": 10129 + }, + { + "epoch": 1.2327350167325828, + "grad_norm": 1.8014506101608276, + "learning_rate": 6.6549601854343345e-06, + "loss": 0.4228, + "step": 10130 + }, + { + "epoch": 1.2328567082446, + "grad_norm": 1.81883704662323, + "learning_rate": 6.653121926129588e-06, + "loss": 0.3724, + "step": 10131 + }, + { + "epoch": 1.232978399756617, + "grad_norm": 1.9304344654083252, + "learning_rate": 6.651283794180479e-06, + "loss": 0.4326, + "step": 10132 + }, + { + "epoch": 1.233100091268634, + "grad_norm": 2.783109188079834, + "learning_rate": 6.6494457896569564e-06, + "loss": 0.3593, + "step": 10133 + }, + { + "epoch": 1.233221782780651, + "grad_norm": 2.8495147228240967, + "learning_rate": 6.647607912628953e-06, + "loss": 0.398, + "step": 10134 + }, + { + "epoch": 1.233343474292668, + "grad_norm": 2.1942520141601562, + "learning_rate": 6.645770163166409e-06, + "loss": 0.4162, + "step": 10135 + }, + { + "epoch": 1.2334651658046851, + "grad_norm": 2.531790256500244, + "learning_rate": 6.6439325413392556e-06, + "loss": 0.4235, + "step": 10136 + }, + { + "epoch": 1.2335868573167021, + "grad_norm": 3.0320332050323486, + "learning_rate": 6.642095047217412e-06, + "loss": 0.3819, + "step": 10137 + }, + { + "epoch": 1.2337085488287192, + "grad_norm": 1.732704997062683, + "learning_rate": 6.640257680870803e-06, + "loss": 0.4441, + "step": 10138 + }, + { + "epoch": 1.2338302403407362, + "grad_norm": 1.34881591796875, + "learning_rate": 6.638420442369349e-06, + "loss": 0.4119, + "step": 10139 + }, + { + "epoch": 1.2339519318527532, + "grad_norm": 3.5255260467529297, + "learning_rate": 6.636583331782949e-06, + "loss": 0.3677, + "step": 10140 + }, + { + "epoch": 1.2340736233647702, + "grad_norm": 2.842275619506836, + "learning_rate": 6.634746349181518e-06, + "loss": 0.3659, + "step": 10141 + }, + { + "epoch": 1.2341953148767875, + "grad_norm": 1.715838074684143, + "learning_rate": 6.6329094946349515e-06, + "loss": 0.3604, + "step": 10142 + }, + { + "epoch": 1.2343170063888045, + "grad_norm": 1.959950566291809, + "learning_rate": 6.631072768213149e-06, + "loss": 0.4179, + "step": 10143 + }, + { + "epoch": 1.2344386979008215, + "grad_norm": 1.688847303390503, + "learning_rate": 6.629236169986004e-06, + "loss": 0.3726, + "step": 10144 + }, + { + "epoch": 1.2345603894128385, + "grad_norm": 1.3267831802368164, + "learning_rate": 6.6273997000233955e-06, + "loss": 0.3284, + "step": 10145 + }, + { + "epoch": 1.2346820809248555, + "grad_norm": 1.2984108924865723, + "learning_rate": 6.62556335839521e-06, + "loss": 0.348, + "step": 10146 + }, + { + "epoch": 1.2348037724368726, + "grad_norm": 2.858015298843384, + "learning_rate": 6.623727145171327e-06, + "loss": 0.4159, + "step": 10147 + }, + { + "epoch": 1.2349254639488896, + "grad_norm": 1.4354840517044067, + "learning_rate": 6.621891060421613e-06, + "loss": 0.3848, + "step": 10148 + }, + { + "epoch": 1.2350471554609066, + "grad_norm": 1.2782552242279053, + "learning_rate": 6.620055104215933e-06, + "loss": 0.3356, + "step": 10149 + }, + { + "epoch": 1.2351688469729236, + "grad_norm": 1.8523955345153809, + "learning_rate": 6.6182192766241596e-06, + "loss": 0.3372, + "step": 10150 + }, + { + "epoch": 1.2352905384849406, + "grad_norm": 4.148677825927734, + "learning_rate": 6.616383577716137e-06, + "loss": 0.4638, + "step": 10151 + }, + { + "epoch": 1.2354122299969577, + "grad_norm": 1.3652501106262207, + "learning_rate": 6.614548007561725e-06, + "loss": 0.3417, + "step": 10152 + }, + { + "epoch": 1.2355339215089747, + "grad_norm": 1.4691781997680664, + "learning_rate": 6.612712566230768e-06, + "loss": 0.355, + "step": 10153 + }, + { + "epoch": 1.2356556130209917, + "grad_norm": 1.8688024282455444, + "learning_rate": 6.6108772537931075e-06, + "loss": 0.3737, + "step": 10154 + }, + { + "epoch": 1.2357773045330087, + "grad_norm": 2.493049383163452, + "learning_rate": 6.609042070318585e-06, + "loss": 0.3017, + "step": 10155 + }, + { + "epoch": 1.235898996045026, + "grad_norm": 1.7352840900421143, + "learning_rate": 6.6072070158770295e-06, + "loss": 0.3994, + "step": 10156 + }, + { + "epoch": 1.236020687557043, + "grad_norm": 1.376338005065918, + "learning_rate": 6.605372090538269e-06, + "loss": 0.3413, + "step": 10157 + }, + { + "epoch": 1.23614237906906, + "grad_norm": 1.3053025007247925, + "learning_rate": 6.603537294372127e-06, + "loss": 0.3294, + "step": 10158 + }, + { + "epoch": 1.236264070581077, + "grad_norm": 1.4417226314544678, + "learning_rate": 6.6017026274484245e-06, + "loss": 0.3933, + "step": 10159 + }, + { + "epoch": 1.236385762093094, + "grad_norm": 3.6771433353424072, + "learning_rate": 6.599868089836968e-06, + "loss": 0.4538, + "step": 10160 + }, + { + "epoch": 1.236507453605111, + "grad_norm": 1.7547956705093384, + "learning_rate": 6.598033681607568e-06, + "loss": 0.3958, + "step": 10161 + }, + { + "epoch": 1.236629145117128, + "grad_norm": 1.8888362646102905, + "learning_rate": 6.596199402830034e-06, + "loss": 0.3507, + "step": 10162 + }, + { + "epoch": 1.236750836629145, + "grad_norm": 2.2229413986206055, + "learning_rate": 6.594365253574155e-06, + "loss": 0.4569, + "step": 10163 + }, + { + "epoch": 1.236872528141162, + "grad_norm": 1.3697724342346191, + "learning_rate": 6.5925312339097245e-06, + "loss": 0.3815, + "step": 10164 + }, + { + "epoch": 1.2369942196531791, + "grad_norm": 1.8411109447479248, + "learning_rate": 6.590697343906535e-06, + "loss": 0.3435, + "step": 10165 + }, + { + "epoch": 1.2371159111651961, + "grad_norm": 2.2527406215667725, + "learning_rate": 6.588863583634369e-06, + "loss": 0.3543, + "step": 10166 + }, + { + "epoch": 1.2372376026772134, + "grad_norm": 1.8241209983825684, + "learning_rate": 6.587029953163004e-06, + "loss": 0.4076, + "step": 10167 + }, + { + "epoch": 1.2373592941892304, + "grad_norm": 1.5561507940292358, + "learning_rate": 6.5851964525622125e-06, + "loss": 0.3842, + "step": 10168 + }, + { + "epoch": 1.2374809857012474, + "grad_norm": 1.294209599494934, + "learning_rate": 6.583363081901765e-06, + "loss": 0.3522, + "step": 10169 + }, + { + "epoch": 1.2376026772132644, + "grad_norm": 2.8769376277923584, + "learning_rate": 6.581529841251426e-06, + "loss": 0.3822, + "step": 10170 + }, + { + "epoch": 1.2377243687252815, + "grad_norm": 1.2180982828140259, + "learning_rate": 6.5796967306809515e-06, + "loss": 0.3649, + "step": 10171 + }, + { + "epoch": 1.2378460602372985, + "grad_norm": 1.7926931381225586, + "learning_rate": 6.577863750260094e-06, + "loss": 0.4302, + "step": 10172 + }, + { + "epoch": 1.2379677517493155, + "grad_norm": 1.6825025081634521, + "learning_rate": 6.57603090005861e-06, + "loss": 0.371, + "step": 10173 + }, + { + "epoch": 1.2380894432613325, + "grad_norm": 1.562591552734375, + "learning_rate": 6.574198180146232e-06, + "loss": 0.3653, + "step": 10174 + }, + { + "epoch": 1.2382111347733495, + "grad_norm": 2.818526029586792, + "learning_rate": 6.572365590592706e-06, + "loss": 0.4531, + "step": 10175 + }, + { + "epoch": 1.2383328262853666, + "grad_norm": 1.4376499652862549, + "learning_rate": 6.570533131467763e-06, + "loss": 0.3616, + "step": 10176 + }, + { + "epoch": 1.2384545177973836, + "grad_norm": 2.309314012527466, + "learning_rate": 6.568700802841134e-06, + "loss": 0.3877, + "step": 10177 + }, + { + "epoch": 1.2385762093094006, + "grad_norm": 1.6091163158416748, + "learning_rate": 6.566868604782542e-06, + "loss": 0.3977, + "step": 10178 + }, + { + "epoch": 1.2386979008214176, + "grad_norm": 1.6217697858810425, + "learning_rate": 6.565036537361704e-06, + "loss": 0.3667, + "step": 10179 + }, + { + "epoch": 1.2388195923334346, + "grad_norm": 3.172600030899048, + "learning_rate": 6.5632046006483375e-06, + "loss": 0.4775, + "step": 10180 + }, + { + "epoch": 1.2389412838454517, + "grad_norm": 1.9307940006256104, + "learning_rate": 6.561372794712151e-06, + "loss": 0.3849, + "step": 10181 + }, + { + "epoch": 1.239062975357469, + "grad_norm": 1.5082497596740723, + "learning_rate": 6.559541119622844e-06, + "loss": 0.3834, + "step": 10182 + }, + { + "epoch": 1.239184666869486, + "grad_norm": 1.4714469909667969, + "learning_rate": 6.55770957545012e-06, + "loss": 0.377, + "step": 10183 + }, + { + "epoch": 1.239306358381503, + "grad_norm": 3.1042087078094482, + "learning_rate": 6.555878162263672e-06, + "loss": 0.32, + "step": 10184 + }, + { + "epoch": 1.23942804989352, + "grad_norm": 1.747067928314209, + "learning_rate": 6.554046880133192e-06, + "loss": 0.3043, + "step": 10185 + }, + { + "epoch": 1.239549741405537, + "grad_norm": 1.8358150720596313, + "learning_rate": 6.552215729128358e-06, + "loss": 0.3467, + "step": 10186 + }, + { + "epoch": 1.239671432917554, + "grad_norm": 1.4600703716278076, + "learning_rate": 6.5503847093188515e-06, + "loss": 0.3745, + "step": 10187 + }, + { + "epoch": 1.239793124429571, + "grad_norm": 1.4450266361236572, + "learning_rate": 6.548553820774346e-06, + "loss": 0.3873, + "step": 10188 + }, + { + "epoch": 1.239914815941588, + "grad_norm": 2.450169801712036, + "learning_rate": 6.546723063564515e-06, + "loss": 0.4485, + "step": 10189 + }, + { + "epoch": 1.240036507453605, + "grad_norm": 2.0614356994628906, + "learning_rate": 6.544892437759016e-06, + "loss": 0.3637, + "step": 10190 + }, + { + "epoch": 1.240158198965622, + "grad_norm": 1.4805302619934082, + "learning_rate": 6.543061943427513e-06, + "loss": 0.4028, + "step": 10191 + }, + { + "epoch": 1.240279890477639, + "grad_norm": 2.427363634109497, + "learning_rate": 6.541231580639657e-06, + "loss": 0.4194, + "step": 10192 + }, + { + "epoch": 1.2404015819896563, + "grad_norm": 2.1027939319610596, + "learning_rate": 6.539401349465102e-06, + "loss": 0.4009, + "step": 10193 + }, + { + "epoch": 1.2405232735016734, + "grad_norm": 2.545494794845581, + "learning_rate": 6.537571249973487e-06, + "loss": 0.3337, + "step": 10194 + }, + { + "epoch": 1.2406449650136904, + "grad_norm": 1.5310806035995483, + "learning_rate": 6.5357412822344515e-06, + "loss": 0.3811, + "step": 10195 + }, + { + "epoch": 1.2407666565257074, + "grad_norm": 1.6960728168487549, + "learning_rate": 6.533911446317635e-06, + "loss": 0.3441, + "step": 10196 + }, + { + "epoch": 1.2408883480377244, + "grad_norm": 1.7053056955337524, + "learning_rate": 6.532081742292661e-06, + "loss": 0.3558, + "step": 10197 + }, + { + "epoch": 1.2410100395497414, + "grad_norm": 3.8674912452697754, + "learning_rate": 6.530252170229152e-06, + "loss": 0.4513, + "step": 10198 + }, + { + "epoch": 1.2411317310617584, + "grad_norm": 1.6030267477035522, + "learning_rate": 6.52842273019673e-06, + "loss": 0.3747, + "step": 10199 + }, + { + "epoch": 1.2412534225737755, + "grad_norm": 1.641355276107788, + "learning_rate": 6.52659342226501e-06, + "loss": 0.3494, + "step": 10200 + }, + { + "epoch": 1.2413751140857925, + "grad_norm": 1.4010977745056152, + "learning_rate": 6.524764246503601e-06, + "loss": 0.3742, + "step": 10201 + }, + { + "epoch": 1.2414968055978095, + "grad_norm": 1.8821971416473389, + "learning_rate": 6.522935202982104e-06, + "loss": 0.3698, + "step": 10202 + }, + { + "epoch": 1.2416184971098265, + "grad_norm": 1.9448819160461426, + "learning_rate": 6.521106291770118e-06, + "loss": 0.3454, + "step": 10203 + }, + { + "epoch": 1.2417401886218435, + "grad_norm": 2.2565510272979736, + "learning_rate": 6.519277512937243e-06, + "loss": 0.3388, + "step": 10204 + }, + { + "epoch": 1.2418618801338606, + "grad_norm": 2.1581931114196777, + "learning_rate": 6.51744886655306e-06, + "loss": 0.3958, + "step": 10205 + }, + { + "epoch": 1.2419835716458776, + "grad_norm": 2.816659450531006, + "learning_rate": 6.515620352687157e-06, + "loss": 0.311, + "step": 10206 + }, + { + "epoch": 1.2421052631578948, + "grad_norm": 1.8190574645996094, + "learning_rate": 6.5137919714091135e-06, + "loss": 0.4218, + "step": 10207 + }, + { + "epoch": 1.2422269546699118, + "grad_norm": 1.860929250717163, + "learning_rate": 6.511963722788501e-06, + "loss": 0.393, + "step": 10208 + }, + { + "epoch": 1.2423486461819289, + "grad_norm": 1.3672665357589722, + "learning_rate": 6.51013560689489e-06, + "loss": 0.3877, + "step": 10209 + }, + { + "epoch": 1.2424703376939459, + "grad_norm": 2.0359270572662354, + "learning_rate": 6.508307623797838e-06, + "loss": 0.3544, + "step": 10210 + }, + { + "epoch": 1.242592029205963, + "grad_norm": 3.789458990097046, + "learning_rate": 6.506479773566912e-06, + "loss": 0.433, + "step": 10211 + }, + { + "epoch": 1.24271372071798, + "grad_norm": 1.888265609741211, + "learning_rate": 6.504652056271661e-06, + "loss": 0.4067, + "step": 10212 + }, + { + "epoch": 1.242835412229997, + "grad_norm": 2.016385316848755, + "learning_rate": 6.502824471981633e-06, + "loss": 0.3379, + "step": 10213 + }, + { + "epoch": 1.242957103742014, + "grad_norm": 2.9620585441589355, + "learning_rate": 6.500997020766373e-06, + "loss": 0.3188, + "step": 10214 + }, + { + "epoch": 1.243078795254031, + "grad_norm": 1.5754475593566895, + "learning_rate": 6.49916970269542e-06, + "loss": 0.402, + "step": 10215 + }, + { + "epoch": 1.243200486766048, + "grad_norm": 3.2240493297576904, + "learning_rate": 6.4973425178383055e-06, + "loss": 0.3597, + "step": 10216 + }, + { + "epoch": 1.243322178278065, + "grad_norm": 1.8452606201171875, + "learning_rate": 6.495515466264557e-06, + "loss": 0.3952, + "step": 10217 + }, + { + "epoch": 1.2434438697900823, + "grad_norm": 1.7084141969680786, + "learning_rate": 6.4936885480437e-06, + "loss": 0.397, + "step": 10218 + }, + { + "epoch": 1.2435655613020993, + "grad_norm": 1.5936448574066162, + "learning_rate": 6.491861763245255e-06, + "loss": 0.3668, + "step": 10219 + }, + { + "epoch": 1.2436872528141163, + "grad_norm": 3.834557294845581, + "learning_rate": 6.49003511193873e-06, + "loss": 0.3288, + "step": 10220 + }, + { + "epoch": 1.2438089443261333, + "grad_norm": 2.0085062980651855, + "learning_rate": 6.4882085941936305e-06, + "loss": 0.3986, + "step": 10221 + }, + { + "epoch": 1.2439306358381503, + "grad_norm": 2.1349685192108154, + "learning_rate": 6.486382210079465e-06, + "loss": 0.3752, + "step": 10222 + }, + { + "epoch": 1.2440523273501674, + "grad_norm": 1.5375348329544067, + "learning_rate": 6.48455595966573e-06, + "loss": 0.3818, + "step": 10223 + }, + { + "epoch": 1.2441740188621844, + "grad_norm": 2.064540147781372, + "learning_rate": 6.482729843021916e-06, + "loss": 0.3664, + "step": 10224 + }, + { + "epoch": 1.2442957103742014, + "grad_norm": 1.913798213005066, + "learning_rate": 6.48090386021751e-06, + "loss": 0.3562, + "step": 10225 + }, + { + "epoch": 1.2444174018862184, + "grad_norm": 1.8298752307891846, + "learning_rate": 6.479078011321997e-06, + "loss": 0.3599, + "step": 10226 + }, + { + "epoch": 1.2445390933982354, + "grad_norm": 2.9885013103485107, + "learning_rate": 6.477252296404857e-06, + "loss": 0.3852, + "step": 10227 + }, + { + "epoch": 1.2446607849102524, + "grad_norm": 1.758154034614563, + "learning_rate": 6.4754267155355555e-06, + "loss": 0.3362, + "step": 10228 + }, + { + "epoch": 1.2447824764222695, + "grad_norm": 1.5940898656845093, + "learning_rate": 6.473601268783564e-06, + "loss": 0.3527, + "step": 10229 + }, + { + "epoch": 1.2449041679342865, + "grad_norm": 2.102858543395996, + "learning_rate": 6.471775956218345e-06, + "loss": 0.4091, + "step": 10230 + }, + { + "epoch": 1.2450258594463035, + "grad_norm": 1.4504879713058472, + "learning_rate": 6.4699507779093555e-06, + "loss": 0.3457, + "step": 10231 + }, + { + "epoch": 1.2451475509583207, + "grad_norm": 2.159846544265747, + "learning_rate": 6.4681257339260425e-06, + "loss": 0.3592, + "step": 10232 + }, + { + "epoch": 1.2452692424703378, + "grad_norm": 1.8087475299835205, + "learning_rate": 6.466300824337854e-06, + "loss": 0.3693, + "step": 10233 + }, + { + "epoch": 1.2453909339823548, + "grad_norm": 1.5439751148223877, + "learning_rate": 6.464476049214235e-06, + "loss": 0.4035, + "step": 10234 + }, + { + "epoch": 1.2455126254943718, + "grad_norm": 2.1327357292175293, + "learning_rate": 6.462651408624623e-06, + "loss": 0.3915, + "step": 10235 + }, + { + "epoch": 1.2456343170063888, + "grad_norm": 1.6450012922286987, + "learning_rate": 6.460826902638444e-06, + "loss": 0.4, + "step": 10236 + }, + { + "epoch": 1.2457560085184058, + "grad_norm": 1.636935830116272, + "learning_rate": 6.459002531325127e-06, + "loss": 0.396, + "step": 10237 + }, + { + "epoch": 1.2458777000304229, + "grad_norm": 2.4607183933258057, + "learning_rate": 6.4571782947540965e-06, + "loss": 0.363, + "step": 10238 + }, + { + "epoch": 1.2459993915424399, + "grad_norm": 1.7880399227142334, + "learning_rate": 6.4553541929947605e-06, + "loss": 0.3323, + "step": 10239 + }, + { + "epoch": 1.246121083054457, + "grad_norm": 1.579453468322754, + "learning_rate": 6.453530226116536e-06, + "loss": 0.3864, + "step": 10240 + }, + { + "epoch": 1.246242774566474, + "grad_norm": 3.1174049377441406, + "learning_rate": 6.451706394188829e-06, + "loss": 0.4405, + "step": 10241 + }, + { + "epoch": 1.246364466078491, + "grad_norm": 2.23661470413208, + "learning_rate": 6.449882697281038e-06, + "loss": 0.398, + "step": 10242 + }, + { + "epoch": 1.2464861575905082, + "grad_norm": 3.240889072418213, + "learning_rate": 6.44805913546256e-06, + "loss": 0.2665, + "step": 10243 + }, + { + "epoch": 1.2466078491025252, + "grad_norm": 1.862520694732666, + "learning_rate": 6.446235708802782e-06, + "loss": 0.3625, + "step": 10244 + }, + { + "epoch": 1.2467295406145422, + "grad_norm": 1.716979742050171, + "learning_rate": 6.444412417371089e-06, + "loss": 0.3963, + "step": 10245 + }, + { + "epoch": 1.2468512321265592, + "grad_norm": 1.7534376382827759, + "learning_rate": 6.442589261236867e-06, + "loss": 0.3965, + "step": 10246 + }, + { + "epoch": 1.2469729236385763, + "grad_norm": 1.667065143585205, + "learning_rate": 6.440766240469485e-06, + "loss": 0.3614, + "step": 10247 + }, + { + "epoch": 1.2470946151505933, + "grad_norm": 1.7638033628463745, + "learning_rate": 6.438943355138313e-06, + "loss": 0.3301, + "step": 10248 + }, + { + "epoch": 1.2472163066626103, + "grad_norm": 1.2614213228225708, + "learning_rate": 6.437120605312721e-06, + "loss": 0.3725, + "step": 10249 + }, + { + "epoch": 1.2473379981746273, + "grad_norm": 1.9589706659317017, + "learning_rate": 6.435297991062061e-06, + "loss": 0.3194, + "step": 10250 + }, + { + "epoch": 1.2474596896866443, + "grad_norm": 1.2972359657287598, + "learning_rate": 6.433475512455694e-06, + "loss": 0.3961, + "step": 10251 + }, + { + "epoch": 1.2475813811986614, + "grad_norm": 1.3767954111099243, + "learning_rate": 6.431653169562964e-06, + "loss": 0.3558, + "step": 10252 + }, + { + "epoch": 1.2477030727106784, + "grad_norm": 3.4938995838165283, + "learning_rate": 6.429830962453219e-06, + "loss": 0.4091, + "step": 10253 + }, + { + "epoch": 1.2478247642226954, + "grad_norm": 1.1383659839630127, + "learning_rate": 6.428008891195798e-06, + "loss": 0.298, + "step": 10254 + }, + { + "epoch": 1.2479464557347124, + "grad_norm": 2.4070913791656494, + "learning_rate": 6.426186955860028e-06, + "loss": 0.3771, + "step": 10255 + }, + { + "epoch": 1.2480681472467294, + "grad_norm": 2.375009536743164, + "learning_rate": 6.424365156515242e-06, + "loss": 0.437, + "step": 10256 + }, + { + "epoch": 1.2481898387587467, + "grad_norm": 3.548109769821167, + "learning_rate": 6.422543493230765e-06, + "loss": 0.4835, + "step": 10257 + }, + { + "epoch": 1.2483115302707637, + "grad_norm": 2.1627519130706787, + "learning_rate": 6.4207219660759116e-06, + "loss": 0.3335, + "step": 10258 + }, + { + "epoch": 1.2484332217827807, + "grad_norm": 1.7186542749404907, + "learning_rate": 6.4189005751199955e-06, + "loss": 0.3683, + "step": 10259 + }, + { + "epoch": 1.2485549132947977, + "grad_norm": 2.8404996395111084, + "learning_rate": 6.417079320432324e-06, + "loss": 0.4033, + "step": 10260 + }, + { + "epoch": 1.2486766048068147, + "grad_norm": 2.7294185161590576, + "learning_rate": 6.415258202082204e-06, + "loss": 0.4078, + "step": 10261 + }, + { + "epoch": 1.2487982963188318, + "grad_norm": 1.9257264137268066, + "learning_rate": 6.413437220138927e-06, + "loss": 0.3943, + "step": 10262 + }, + { + "epoch": 1.2489199878308488, + "grad_norm": 2.3244078159332275, + "learning_rate": 6.411616374671788e-06, + "loss": 0.4044, + "step": 10263 + }, + { + "epoch": 1.2490416793428658, + "grad_norm": 3.294506311416626, + "learning_rate": 6.409795665750074e-06, + "loss": 0.3042, + "step": 10264 + }, + { + "epoch": 1.2491633708548828, + "grad_norm": 2.544189214706421, + "learning_rate": 6.407975093443065e-06, + "loss": 0.3241, + "step": 10265 + }, + { + "epoch": 1.2492850623668998, + "grad_norm": 2.8588218688964844, + "learning_rate": 6.406154657820043e-06, + "loss": 0.3496, + "step": 10266 + }, + { + "epoch": 1.2494067538789169, + "grad_norm": 2.1826016902923584, + "learning_rate": 6.404334358950271e-06, + "loss": 0.334, + "step": 10267 + }, + { + "epoch": 1.249528445390934, + "grad_norm": 1.4517606496810913, + "learning_rate": 6.4025141969030205e-06, + "loss": 0.3756, + "step": 10268 + }, + { + "epoch": 1.2496501369029511, + "grad_norm": 1.3809586763381958, + "learning_rate": 6.400694171747552e-06, + "loss": 0.3607, + "step": 10269 + }, + { + "epoch": 1.2497718284149681, + "grad_norm": 2.6558356285095215, + "learning_rate": 6.398874283553118e-06, + "loss": 0.3177, + "step": 10270 + }, + { + "epoch": 1.2498935199269852, + "grad_norm": 1.7323626279830933, + "learning_rate": 6.397054532388972e-06, + "loss": 0.3724, + "step": 10271 + }, + { + "epoch": 1.2500152114390022, + "grad_norm": 2.186450719833374, + "learning_rate": 6.39523491832436e-06, + "loss": 0.4119, + "step": 10272 + }, + { + "epoch": 1.2501369029510192, + "grad_norm": 5.2563862800598145, + "learning_rate": 6.39341544142852e-06, + "loss": 0.471, + "step": 10273 + }, + { + "epoch": 1.2502585944630362, + "grad_norm": 1.3286327123641968, + "learning_rate": 6.391596101770687e-06, + "loss": 0.3021, + "step": 10274 + }, + { + "epoch": 1.2503802859750532, + "grad_norm": 1.8804073333740234, + "learning_rate": 6.389776899420094e-06, + "loss": 0.3226, + "step": 10275 + }, + { + "epoch": 1.2505019774870703, + "grad_norm": 2.5674660205841064, + "learning_rate": 6.387957834445959e-06, + "loss": 0.4329, + "step": 10276 + }, + { + "epoch": 1.2506236689990873, + "grad_norm": 1.4156824350357056, + "learning_rate": 6.386138906917512e-06, + "loss": 0.3409, + "step": 10277 + }, + { + "epoch": 1.2507453605111043, + "grad_norm": 1.4968140125274658, + "learning_rate": 6.3843201169039524e-06, + "loss": 0.3516, + "step": 10278 + }, + { + "epoch": 1.2508670520231213, + "grad_norm": 1.9649887084960938, + "learning_rate": 6.382501464474499e-06, + "loss": 0.3675, + "step": 10279 + }, + { + "epoch": 1.2509887435351383, + "grad_norm": 1.5103336572647095, + "learning_rate": 6.380682949698354e-06, + "loss": 0.3452, + "step": 10280 + }, + { + "epoch": 1.2511104350471554, + "grad_norm": 1.548187017440796, + "learning_rate": 6.378864572644712e-06, + "loss": 0.3453, + "step": 10281 + }, + { + "epoch": 1.2512321265591724, + "grad_norm": 2.288419485092163, + "learning_rate": 6.377046333382768e-06, + "loss": 0.4032, + "step": 10282 + }, + { + "epoch": 1.2513538180711896, + "grad_norm": 2.0537350177764893, + "learning_rate": 6.375228231981711e-06, + "loss": 0.4204, + "step": 10283 + }, + { + "epoch": 1.2514755095832066, + "grad_norm": 1.1866052150726318, + "learning_rate": 6.373410268510722e-06, + "loss": 0.3567, + "step": 10284 + }, + { + "epoch": 1.2515972010952237, + "grad_norm": 2.002417802810669, + "learning_rate": 6.371592443038981e-06, + "loss": 0.394, + "step": 10285 + }, + { + "epoch": 1.2517188926072407, + "grad_norm": 2.173264741897583, + "learning_rate": 6.369774755635655e-06, + "loss": 0.3891, + "step": 10286 + }, + { + "epoch": 1.2518405841192577, + "grad_norm": 1.8141224384307861, + "learning_rate": 6.367957206369917e-06, + "loss": 0.3315, + "step": 10287 + }, + { + "epoch": 1.2519622756312747, + "grad_norm": 2.7340219020843506, + "learning_rate": 6.3661397953109225e-06, + "loss": 0.3624, + "step": 10288 + }, + { + "epoch": 1.2520839671432917, + "grad_norm": 2.2916109561920166, + "learning_rate": 6.364322522527835e-06, + "loss": 0.3399, + "step": 10289 + }, + { + "epoch": 1.2522056586553088, + "grad_norm": 1.6363352537155151, + "learning_rate": 6.362505388089797e-06, + "loss": 0.3806, + "step": 10290 + }, + { + "epoch": 1.2523273501673258, + "grad_norm": 1.769532322883606, + "learning_rate": 6.360688392065959e-06, + "loss": 0.357, + "step": 10291 + }, + { + "epoch": 1.2524490416793428, + "grad_norm": 2.0367841720581055, + "learning_rate": 6.35887153452546e-06, + "loss": 0.4071, + "step": 10292 + }, + { + "epoch": 1.25257073319136, + "grad_norm": 1.7407128810882568, + "learning_rate": 6.357054815537435e-06, + "loss": 0.4159, + "step": 10293 + }, + { + "epoch": 1.252692424703377, + "grad_norm": 1.7790228128433228, + "learning_rate": 6.355238235171015e-06, + "loss": 0.3962, + "step": 10294 + }, + { + "epoch": 1.252814116215394, + "grad_norm": 3.8156418800354004, + "learning_rate": 6.353421793495327e-06, + "loss": 0.499, + "step": 10295 + }, + { + "epoch": 1.252935807727411, + "grad_norm": 2.255209445953369, + "learning_rate": 6.351605490579486e-06, + "loss": 0.4571, + "step": 10296 + }, + { + "epoch": 1.253057499239428, + "grad_norm": 1.682839274406433, + "learning_rate": 6.349789326492607e-06, + "loss": 0.3459, + "step": 10297 + }, + { + "epoch": 1.2531791907514451, + "grad_norm": 1.8207453489303589, + "learning_rate": 6.347973301303802e-06, + "loss": 0.4251, + "step": 10298 + }, + { + "epoch": 1.2533008822634621, + "grad_norm": 1.597184419631958, + "learning_rate": 6.34615741508217e-06, + "loss": 0.3675, + "step": 10299 + }, + { + "epoch": 1.2534225737754792, + "grad_norm": 2.73740553855896, + "learning_rate": 6.3443416678968165e-06, + "loss": 0.4034, + "step": 10300 + }, + { + "epoch": 1.2535442652874962, + "grad_norm": 2.7080307006835938, + "learning_rate": 6.342526059816822e-06, + "loss": 0.3327, + "step": 10301 + }, + { + "epoch": 1.2536659567995132, + "grad_norm": 1.590369701385498, + "learning_rate": 6.340710590911285e-06, + "loss": 0.3874, + "step": 10302 + }, + { + "epoch": 1.2537876483115302, + "grad_norm": 1.3904131650924683, + "learning_rate": 6.338895261249285e-06, + "loss": 0.3501, + "step": 10303 + }, + { + "epoch": 1.2539093398235472, + "grad_norm": 2.322983980178833, + "learning_rate": 6.337080070899897e-06, + "loss": 0.3466, + "step": 10304 + }, + { + "epoch": 1.2540310313355643, + "grad_norm": 1.473681092262268, + "learning_rate": 6.335265019932193e-06, + "loss": 0.3856, + "step": 10305 + }, + { + "epoch": 1.2541527228475813, + "grad_norm": 1.3059062957763672, + "learning_rate": 6.333450108415243e-06, + "loss": 0.3513, + "step": 10306 + }, + { + "epoch": 1.2542744143595983, + "grad_norm": 1.4276163578033447, + "learning_rate": 6.331635336418104e-06, + "loss": 0.3417, + "step": 10307 + }, + { + "epoch": 1.2543961058716153, + "grad_norm": 2.6507444381713867, + "learning_rate": 6.329820704009833e-06, + "loss": 0.4115, + "step": 10308 + }, + { + "epoch": 1.2545177973836326, + "grad_norm": 1.5199767351150513, + "learning_rate": 6.3280062112594835e-06, + "loss": 0.3512, + "step": 10309 + }, + { + "epoch": 1.2546394888956496, + "grad_norm": 2.149470329284668, + "learning_rate": 6.326191858236097e-06, + "loss": 0.3676, + "step": 10310 + }, + { + "epoch": 1.2547611804076666, + "grad_norm": 1.625446081161499, + "learning_rate": 6.3243776450087135e-06, + "loss": 0.3881, + "step": 10311 + }, + { + "epoch": 1.2548828719196836, + "grad_norm": 2.1296708583831787, + "learning_rate": 6.322563571646373e-06, + "loss": 0.4005, + "step": 10312 + }, + { + "epoch": 1.2550045634317006, + "grad_norm": 1.544885277748108, + "learning_rate": 6.320749638218097e-06, + "loss": 0.3482, + "step": 10313 + }, + { + "epoch": 1.2551262549437177, + "grad_norm": 1.4518001079559326, + "learning_rate": 6.318935844792915e-06, + "loss": 0.3343, + "step": 10314 + }, + { + "epoch": 1.2552479464557347, + "grad_norm": 2.0721778869628906, + "learning_rate": 6.317122191439839e-06, + "loss": 0.4879, + "step": 10315 + }, + { + "epoch": 1.2553696379677517, + "grad_norm": 2.368177890777588, + "learning_rate": 6.31530867822789e-06, + "loss": 0.4587, + "step": 10316 + }, + { + "epoch": 1.2554913294797687, + "grad_norm": 2.0135738849639893, + "learning_rate": 6.313495305226074e-06, + "loss": 0.3674, + "step": 10317 + }, + { + "epoch": 1.255613020991786, + "grad_norm": 2.9008350372314453, + "learning_rate": 6.3116820725033885e-06, + "loss": 0.4191, + "step": 10318 + }, + { + "epoch": 1.255734712503803, + "grad_norm": 1.9180659055709839, + "learning_rate": 6.309868980128837e-06, + "loss": 0.3526, + "step": 10319 + }, + { + "epoch": 1.25585640401582, + "grad_norm": 2.159235715866089, + "learning_rate": 6.308056028171407e-06, + "loss": 0.369, + "step": 10320 + }, + { + "epoch": 1.255978095527837, + "grad_norm": 1.8591614961624146, + "learning_rate": 6.30624321670009e-06, + "loss": 0.372, + "step": 10321 + }, + { + "epoch": 1.256099787039854, + "grad_norm": 3.3553457260131836, + "learning_rate": 6.3044305457838615e-06, + "loss": 0.3147, + "step": 10322 + }, + { + "epoch": 1.256221478551871, + "grad_norm": 1.7500940561294556, + "learning_rate": 6.3026180154917046e-06, + "loss": 0.389, + "step": 10323 + }, + { + "epoch": 1.256343170063888, + "grad_norm": 2.3331050872802734, + "learning_rate": 6.300805625892581e-06, + "loss": 0.3456, + "step": 10324 + }, + { + "epoch": 1.256464861575905, + "grad_norm": 1.8292016983032227, + "learning_rate": 6.298993377055463e-06, + "loss": 0.3568, + "step": 10325 + }, + { + "epoch": 1.256586553087922, + "grad_norm": 1.286298155784607, + "learning_rate": 6.297181269049306e-06, + "loss": 0.3377, + "step": 10326 + }, + { + "epoch": 1.2567082445999391, + "grad_norm": 1.907997488975525, + "learning_rate": 6.295369301943065e-06, + "loss": 0.363, + "step": 10327 + }, + { + "epoch": 1.2568299361119561, + "grad_norm": 1.6424189805984497, + "learning_rate": 6.29355747580569e-06, + "loss": 0.3547, + "step": 10328 + }, + { + "epoch": 1.2569516276239732, + "grad_norm": 3.8504080772399902, + "learning_rate": 6.291745790706129e-06, + "loss": 0.4063, + "step": 10329 + }, + { + "epoch": 1.2570733191359902, + "grad_norm": 3.5502266883850098, + "learning_rate": 6.2899342467133115e-06, + "loss": 0.4361, + "step": 10330 + }, + { + "epoch": 1.2571950106480072, + "grad_norm": 1.7539945840835571, + "learning_rate": 6.288122843896179e-06, + "loss": 0.3525, + "step": 10331 + }, + { + "epoch": 1.2573167021600242, + "grad_norm": 1.3641618490219116, + "learning_rate": 6.286311582323654e-06, + "loss": 0.3734, + "step": 10332 + }, + { + "epoch": 1.2574383936720412, + "grad_norm": 1.7663099765777588, + "learning_rate": 6.28450046206466e-06, + "loss": 0.375, + "step": 10333 + }, + { + "epoch": 1.2575600851840585, + "grad_norm": 1.6508175134658813, + "learning_rate": 6.282689483188113e-06, + "loss": 0.3093, + "step": 10334 + }, + { + "epoch": 1.2576817766960755, + "grad_norm": 1.401906132698059, + "learning_rate": 6.28087864576293e-06, + "loss": 0.4104, + "step": 10335 + }, + { + "epoch": 1.2578034682080925, + "grad_norm": 1.5382647514343262, + "learning_rate": 6.279067949858009e-06, + "loss": 0.3662, + "step": 10336 + }, + { + "epoch": 1.2579251597201095, + "grad_norm": 1.4446736574172974, + "learning_rate": 6.277257395542256e-06, + "loss": 0.2971, + "step": 10337 + }, + { + "epoch": 1.2580468512321266, + "grad_norm": 1.5693936347961426, + "learning_rate": 6.275446982884563e-06, + "loss": 0.4226, + "step": 10338 + }, + { + "epoch": 1.2581685427441436, + "grad_norm": 3.176318645477295, + "learning_rate": 6.273636711953821e-06, + "loss": 0.456, + "step": 10339 + }, + { + "epoch": 1.2582902342561606, + "grad_norm": 2.546290874481201, + "learning_rate": 6.271826582818918e-06, + "loss": 0.379, + "step": 10340 + }, + { + "epoch": 1.2584119257681776, + "grad_norm": 2.663435459136963, + "learning_rate": 6.270016595548729e-06, + "loss": 0.4254, + "step": 10341 + }, + { + "epoch": 1.2585336172801946, + "grad_norm": 2.669820785522461, + "learning_rate": 6.268206750212129e-06, + "loss": 0.3611, + "step": 10342 + }, + { + "epoch": 1.2586553087922119, + "grad_norm": 1.6334837675094604, + "learning_rate": 6.266397046877988e-06, + "loss": 0.3197, + "step": 10343 + }, + { + "epoch": 1.258777000304229, + "grad_norm": 1.5402953624725342, + "learning_rate": 6.264587485615166e-06, + "loss": 0.3859, + "step": 10344 + }, + { + "epoch": 1.258898691816246, + "grad_norm": 1.406273365020752, + "learning_rate": 6.262778066492521e-06, + "loss": 0.2967, + "step": 10345 + }, + { + "epoch": 1.259020383328263, + "grad_norm": 1.5371147394180298, + "learning_rate": 6.260968789578911e-06, + "loss": 0.3625, + "step": 10346 + }, + { + "epoch": 1.25914207484028, + "grad_norm": 2.012908697128296, + "learning_rate": 6.259159654943173e-06, + "loss": 0.3578, + "step": 10347 + }, + { + "epoch": 1.259263766352297, + "grad_norm": 2.0378217697143555, + "learning_rate": 6.2573506626541555e-06, + "loss": 0.3543, + "step": 10348 + }, + { + "epoch": 1.259385457864314, + "grad_norm": 1.4626818895339966, + "learning_rate": 6.255541812780692e-06, + "loss": 0.3738, + "step": 10349 + }, + { + "epoch": 1.259507149376331, + "grad_norm": 2.4697868824005127, + "learning_rate": 6.2537331053916105e-06, + "loss": 0.3798, + "step": 10350 + }, + { + "epoch": 1.259628840888348, + "grad_norm": 3.879641532897949, + "learning_rate": 6.2519245405557424e-06, + "loss": 0.4534, + "step": 10351 + }, + { + "epoch": 1.259750532400365, + "grad_norm": 2.0256338119506836, + "learning_rate": 6.250116118341901e-06, + "loss": 0.3961, + "step": 10352 + }, + { + "epoch": 1.259872223912382, + "grad_norm": 1.9154810905456543, + "learning_rate": 6.248307838818904e-06, + "loss": 0.4165, + "step": 10353 + }, + { + "epoch": 1.259993915424399, + "grad_norm": 1.7380759716033936, + "learning_rate": 6.246499702055559e-06, + "loss": 0.4114, + "step": 10354 + }, + { + "epoch": 1.260115606936416, + "grad_norm": 1.822481393814087, + "learning_rate": 6.2446917081206705e-06, + "loss": 0.3156, + "step": 10355 + }, + { + "epoch": 1.2602372984484331, + "grad_norm": 1.9142069816589355, + "learning_rate": 6.242883857083034e-06, + "loss": 0.4287, + "step": 10356 + }, + { + "epoch": 1.2603589899604501, + "grad_norm": 2.1088061332702637, + "learning_rate": 6.241076149011444e-06, + "loss": 0.4112, + "step": 10357 + }, + { + "epoch": 1.2604806814724672, + "grad_norm": 2.0033791065216064, + "learning_rate": 6.239268583974692e-06, + "loss": 0.3466, + "step": 10358 + }, + { + "epoch": 1.2606023729844844, + "grad_norm": 2.1132278442382812, + "learning_rate": 6.2374611620415516e-06, + "loss": 0.4509, + "step": 10359 + }, + { + "epoch": 1.2607240644965014, + "grad_norm": 1.5643467903137207, + "learning_rate": 6.235653883280799e-06, + "loss": 0.4395, + "step": 10360 + }, + { + "epoch": 1.2608457560085184, + "grad_norm": 1.7210465669631958, + "learning_rate": 6.23384674776121e-06, + "loss": 0.3896, + "step": 10361 + }, + { + "epoch": 1.2609674475205355, + "grad_norm": 2.4575791358947754, + "learning_rate": 6.2320397555515465e-06, + "loss": 0.3392, + "step": 10362 + }, + { + "epoch": 1.2610891390325525, + "grad_norm": 1.9225492477416992, + "learning_rate": 6.230232906720572e-06, + "loss": 0.3109, + "step": 10363 + }, + { + "epoch": 1.2612108305445695, + "grad_norm": 1.8778384923934937, + "learning_rate": 6.2284262013370344e-06, + "loss": 0.3385, + "step": 10364 + }, + { + "epoch": 1.2613325220565865, + "grad_norm": 2.1595747470855713, + "learning_rate": 6.226619639469689e-06, + "loss": 0.3921, + "step": 10365 + }, + { + "epoch": 1.2614542135686035, + "grad_norm": 2.5503785610198975, + "learning_rate": 6.224813221187278e-06, + "loss": 0.3422, + "step": 10366 + }, + { + "epoch": 1.2615759050806206, + "grad_norm": 1.3293473720550537, + "learning_rate": 6.223006946558536e-06, + "loss": 0.3688, + "step": 10367 + }, + { + "epoch": 1.2616975965926376, + "grad_norm": 2.0430235862731934, + "learning_rate": 6.2212008156521975e-06, + "loss": 0.3498, + "step": 10368 + }, + { + "epoch": 1.2618192881046548, + "grad_norm": 1.8556219339370728, + "learning_rate": 6.219394828536996e-06, + "loss": 0.3842, + "step": 10369 + }, + { + "epoch": 1.2619409796166718, + "grad_norm": 1.4079780578613281, + "learning_rate": 6.21758898528164e-06, + "loss": 0.3325, + "step": 10370 + }, + { + "epoch": 1.2620626711286889, + "grad_norm": 1.5446381568908691, + "learning_rate": 6.215783285954855e-06, + "loss": 0.3633, + "step": 10371 + }, + { + "epoch": 1.2621843626407059, + "grad_norm": 2.038118600845337, + "learning_rate": 6.213977730625347e-06, + "loss": 0.3862, + "step": 10372 + }, + { + "epoch": 1.262306054152723, + "grad_norm": 1.4054160118103027, + "learning_rate": 6.2121723193618225e-06, + "loss": 0.3166, + "step": 10373 + }, + { + "epoch": 1.26242774566474, + "grad_norm": 3.0983309745788574, + "learning_rate": 6.210367052232984e-06, + "loss": 0.4216, + "step": 10374 + }, + { + "epoch": 1.262549437176757, + "grad_norm": 1.440018653869629, + "learning_rate": 6.208561929307521e-06, + "loss": 0.368, + "step": 10375 + }, + { + "epoch": 1.262671128688774, + "grad_norm": 2.5499024391174316, + "learning_rate": 6.206756950654125e-06, + "loss": 0.3805, + "step": 10376 + }, + { + "epoch": 1.262792820200791, + "grad_norm": 1.522775411605835, + "learning_rate": 6.204952116341481e-06, + "loss": 0.3586, + "step": 10377 + }, + { + "epoch": 1.262914511712808, + "grad_norm": 2.750680446624756, + "learning_rate": 6.203147426438261e-06, + "loss": 0.441, + "step": 10378 + }, + { + "epoch": 1.263036203224825, + "grad_norm": 2.937898635864258, + "learning_rate": 6.20134288101314e-06, + "loss": 0.3937, + "step": 10379 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 1.5787930488586426, + "learning_rate": 6.199538480134786e-06, + "loss": 0.381, + "step": 10380 + }, + { + "epoch": 1.263279586248859, + "grad_norm": 1.8676972389221191, + "learning_rate": 6.197734223871864e-06, + "loss": 0.3818, + "step": 10381 + }, + { + "epoch": 1.263401277760876, + "grad_norm": 1.419344186782837, + "learning_rate": 6.195930112293023e-06, + "loss": 0.4017, + "step": 10382 + }, + { + "epoch": 1.263522969272893, + "grad_norm": 1.6382222175598145, + "learning_rate": 6.1941261454669125e-06, + "loss": 0.4107, + "step": 10383 + }, + { + "epoch": 1.2636446607849103, + "grad_norm": 1.8934613466262817, + "learning_rate": 6.192322323462181e-06, + "loss": 0.4257, + "step": 10384 + }, + { + "epoch": 1.2637663522969274, + "grad_norm": 1.9939029216766357, + "learning_rate": 6.190518646347468e-06, + "loss": 0.3989, + "step": 10385 + }, + { + "epoch": 1.2638880438089444, + "grad_norm": 1.7058422565460205, + "learning_rate": 6.1887151141914025e-06, + "loss": 0.4125, + "step": 10386 + }, + { + "epoch": 1.2640097353209614, + "grad_norm": 2.658456325531006, + "learning_rate": 6.186911727062617e-06, + "loss": 0.3489, + "step": 10387 + }, + { + "epoch": 1.2641314268329784, + "grad_norm": 3.096217632293701, + "learning_rate": 6.185108485029731e-06, + "loss": 0.3478, + "step": 10388 + }, + { + "epoch": 1.2642531183449954, + "grad_norm": 1.8618708848953247, + "learning_rate": 6.183305388161369e-06, + "loss": 0.3766, + "step": 10389 + }, + { + "epoch": 1.2643748098570124, + "grad_norm": 1.5148802995681763, + "learning_rate": 6.181502436526132e-06, + "loss": 0.3652, + "step": 10390 + }, + { + "epoch": 1.2644965013690295, + "grad_norm": 1.4201743602752686, + "learning_rate": 6.179699630192634e-06, + "loss": 0.3551, + "step": 10391 + }, + { + "epoch": 1.2646181928810465, + "grad_norm": 2.339498996734619, + "learning_rate": 6.177896969229476e-06, + "loss": 0.4219, + "step": 10392 + }, + { + "epoch": 1.2647398843930635, + "grad_norm": 4.591128349304199, + "learning_rate": 6.176094453705246e-06, + "loss": 0.4586, + "step": 10393 + }, + { + "epoch": 1.2648615759050807, + "grad_norm": 1.3773913383483887, + "learning_rate": 6.174292083688536e-06, + "loss": 0.3465, + "step": 10394 + }, + { + "epoch": 1.2649832674170978, + "grad_norm": 1.5853371620178223, + "learning_rate": 6.17248985924793e-06, + "loss": 0.3678, + "step": 10395 + }, + { + "epoch": 1.2651049589291148, + "grad_norm": 1.7863986492156982, + "learning_rate": 6.170687780452009e-06, + "loss": 0.3967, + "step": 10396 + }, + { + "epoch": 1.2652266504411318, + "grad_norm": 1.4064834117889404, + "learning_rate": 6.168885847369346e-06, + "loss": 0.4242, + "step": 10397 + }, + { + "epoch": 1.2653483419531488, + "grad_norm": 2.5834035873413086, + "learning_rate": 6.167084060068502e-06, + "loss": 0.4319, + "step": 10398 + }, + { + "epoch": 1.2654700334651658, + "grad_norm": 2.945024013519287, + "learning_rate": 6.165282418618046e-06, + "loss": 0.4173, + "step": 10399 + }, + { + "epoch": 1.2655917249771829, + "grad_norm": 1.9413625001907349, + "learning_rate": 6.163480923086534e-06, + "loss": 0.4148, + "step": 10400 + }, + { + "epoch": 1.2657134164891999, + "grad_norm": 1.3249505758285522, + "learning_rate": 6.1616795735425095e-06, + "loss": 0.3529, + "step": 10401 + }, + { + "epoch": 1.265835108001217, + "grad_norm": 2.0023956298828125, + "learning_rate": 6.159878370054523e-06, + "loss": 0.3941, + "step": 10402 + }, + { + "epoch": 1.265956799513234, + "grad_norm": 2.4751486778259277, + "learning_rate": 6.158077312691116e-06, + "loss": 0.3906, + "step": 10403 + }, + { + "epoch": 1.266078491025251, + "grad_norm": 1.5486822128295898, + "learning_rate": 6.156276401520818e-06, + "loss": 0.3909, + "step": 10404 + }, + { + "epoch": 1.266200182537268, + "grad_norm": 1.4864442348480225, + "learning_rate": 6.15447563661216e-06, + "loss": 0.3464, + "step": 10405 + }, + { + "epoch": 1.266321874049285, + "grad_norm": 1.8442856073379517, + "learning_rate": 6.152675018033662e-06, + "loss": 0.3722, + "step": 10406 + }, + { + "epoch": 1.266443565561302, + "grad_norm": 1.901624321937561, + "learning_rate": 6.150874545853842e-06, + "loss": 0.4188, + "step": 10407 + }, + { + "epoch": 1.266565257073319, + "grad_norm": 1.8881466388702393, + "learning_rate": 6.149074220141216e-06, + "loss": 0.3879, + "step": 10408 + }, + { + "epoch": 1.266686948585336, + "grad_norm": 1.4268056154251099, + "learning_rate": 6.1472740409642854e-06, + "loss": 0.3486, + "step": 10409 + }, + { + "epoch": 1.2668086400973533, + "grad_norm": 2.9135031700134277, + "learning_rate": 6.145474008391551e-06, + "loss": 0.3641, + "step": 10410 + }, + { + "epoch": 1.2669303316093703, + "grad_norm": 2.232487440109253, + "learning_rate": 6.143674122491512e-06, + "loss": 0.3718, + "step": 10411 + }, + { + "epoch": 1.2670520231213873, + "grad_norm": 2.7165050506591797, + "learning_rate": 6.141874383332651e-06, + "loss": 0.3647, + "step": 10412 + }, + { + "epoch": 1.2671737146334043, + "grad_norm": 1.9660296440124512, + "learning_rate": 6.140074790983457e-06, + "loss": 0.3844, + "step": 10413 + }, + { + "epoch": 1.2672954061454214, + "grad_norm": 2.1090433597564697, + "learning_rate": 6.138275345512407e-06, + "loss": 0.4064, + "step": 10414 + }, + { + "epoch": 1.2674170976574384, + "grad_norm": 2.349531650543213, + "learning_rate": 6.136476046987977e-06, + "loss": 0.3735, + "step": 10415 + }, + { + "epoch": 1.2675387891694554, + "grad_norm": 1.6262054443359375, + "learning_rate": 6.134676895478628e-06, + "loss": 0.3674, + "step": 10416 + }, + { + "epoch": 1.2676604806814724, + "grad_norm": 1.6123547554016113, + "learning_rate": 6.132877891052823e-06, + "loss": 0.3453, + "step": 10417 + }, + { + "epoch": 1.2677821721934894, + "grad_norm": 1.8852487802505493, + "learning_rate": 6.131079033779017e-06, + "loss": 0.3866, + "step": 10418 + }, + { + "epoch": 1.2679038637055067, + "grad_norm": 2.070927381515503, + "learning_rate": 6.129280323725666e-06, + "loss": 0.4192, + "step": 10419 + }, + { + "epoch": 1.2680255552175237, + "grad_norm": 1.594740629196167, + "learning_rate": 6.127481760961208e-06, + "loss": 0.363, + "step": 10420 + }, + { + "epoch": 1.2681472467295407, + "grad_norm": 2.4988176822662354, + "learning_rate": 6.125683345554085e-06, + "loss": 0.4495, + "step": 10421 + }, + { + "epoch": 1.2682689382415577, + "grad_norm": 1.6801209449768066, + "learning_rate": 6.123885077572729e-06, + "loss": 0.2916, + "step": 10422 + }, + { + "epoch": 1.2683906297535747, + "grad_norm": 2.5788590908050537, + "learning_rate": 6.122086957085571e-06, + "loss": 0.4272, + "step": 10423 + }, + { + "epoch": 1.2685123212655918, + "grad_norm": 1.5483750104904175, + "learning_rate": 6.120288984161029e-06, + "loss": 0.3907, + "step": 10424 + }, + { + "epoch": 1.2686340127776088, + "grad_norm": 1.8146661520004272, + "learning_rate": 6.118491158867523e-06, + "loss": 0.4039, + "step": 10425 + }, + { + "epoch": 1.2687557042896258, + "grad_norm": 1.6989229917526245, + "learning_rate": 6.1166934812734635e-06, + "loss": 0.3721, + "step": 10426 + }, + { + "epoch": 1.2688773958016428, + "grad_norm": 1.8849676847457886, + "learning_rate": 6.114895951447259e-06, + "loss": 0.3449, + "step": 10427 + }, + { + "epoch": 1.2689990873136598, + "grad_norm": 2.7213823795318604, + "learning_rate": 6.113098569457299e-06, + "loss": 0.4148, + "step": 10428 + }, + { + "epoch": 1.2691207788256769, + "grad_norm": 4.174452304840088, + "learning_rate": 6.111301335371984e-06, + "loss": 0.4482, + "step": 10429 + }, + { + "epoch": 1.2692424703376939, + "grad_norm": 1.5713180303573608, + "learning_rate": 6.109504249259703e-06, + "loss": 0.3635, + "step": 10430 + }, + { + "epoch": 1.269364161849711, + "grad_norm": 3.1803181171417236, + "learning_rate": 6.107707311188841e-06, + "loss": 0.4462, + "step": 10431 + }, + { + "epoch": 1.269485853361728, + "grad_norm": 1.4651141166687012, + "learning_rate": 6.1059105212277695e-06, + "loss": 0.3747, + "step": 10432 + }, + { + "epoch": 1.269607544873745, + "grad_norm": 1.3476706743240356, + "learning_rate": 6.104113879444863e-06, + "loss": 0.3531, + "step": 10433 + }, + { + "epoch": 1.269729236385762, + "grad_norm": 1.4035331010818481, + "learning_rate": 6.102317385908489e-06, + "loss": 0.3541, + "step": 10434 + }, + { + "epoch": 1.2698509278977792, + "grad_norm": 2.7053279876708984, + "learning_rate": 6.100521040687005e-06, + "loss": 0.3502, + "step": 10435 + }, + { + "epoch": 1.2699726194097962, + "grad_norm": 2.0300729274749756, + "learning_rate": 6.098724843848767e-06, + "loss": 0.3674, + "step": 10436 + }, + { + "epoch": 1.2700943109218132, + "grad_norm": 1.6558020114898682, + "learning_rate": 6.096928795462125e-06, + "loss": 0.3625, + "step": 10437 + }, + { + "epoch": 1.2702160024338303, + "grad_norm": 3.5191328525543213, + "learning_rate": 6.0951328955954225e-06, + "loss": 0.468, + "step": 10438 + }, + { + "epoch": 1.2703376939458473, + "grad_norm": 1.4504538774490356, + "learning_rate": 6.093337144316995e-06, + "loss": 0.382, + "step": 10439 + }, + { + "epoch": 1.2704593854578643, + "grad_norm": 1.5944061279296875, + "learning_rate": 6.091541541695174e-06, + "loss": 0.4136, + "step": 10440 + }, + { + "epoch": 1.2705810769698813, + "grad_norm": 2.0001142024993896, + "learning_rate": 6.089746087798287e-06, + "loss": 0.3918, + "step": 10441 + }, + { + "epoch": 1.2707027684818983, + "grad_norm": 1.9967511892318726, + "learning_rate": 6.087950782694657e-06, + "loss": 0.3508, + "step": 10442 + }, + { + "epoch": 1.2708244599939154, + "grad_norm": 1.6681797504425049, + "learning_rate": 6.086155626452596e-06, + "loss": 0.4393, + "step": 10443 + }, + { + "epoch": 1.2709461515059326, + "grad_norm": 1.7271015644073486, + "learning_rate": 6.084360619140414e-06, + "loss": 0.4042, + "step": 10444 + }, + { + "epoch": 1.2710678430179496, + "grad_norm": 1.9814927577972412, + "learning_rate": 6.082565760826417e-06, + "loss": 0.3668, + "step": 10445 + }, + { + "epoch": 1.2711895345299666, + "grad_norm": 1.9423177242279053, + "learning_rate": 6.0807710515789e-06, + "loss": 0.3419, + "step": 10446 + }, + { + "epoch": 1.2713112260419837, + "grad_norm": 1.3841135501861572, + "learning_rate": 6.078976491466158e-06, + "loss": 0.406, + "step": 10447 + }, + { + "epoch": 1.2714329175540007, + "grad_norm": 1.5829485654830933, + "learning_rate": 6.077182080556474e-06, + "loss": 0.4288, + "step": 10448 + }, + { + "epoch": 1.2715546090660177, + "grad_norm": 1.6783592700958252, + "learning_rate": 6.0753878189181345e-06, + "loss": 0.3824, + "step": 10449 + }, + { + "epoch": 1.2716763005780347, + "grad_norm": 1.9294824600219727, + "learning_rate": 6.073593706619416e-06, + "loss": 0.3789, + "step": 10450 + }, + { + "epoch": 1.2717979920900517, + "grad_norm": 2.5260190963745117, + "learning_rate": 6.071799743728578e-06, + "loss": 0.4571, + "step": 10451 + }, + { + "epoch": 1.2719196836020688, + "grad_norm": 2.35833740234375, + "learning_rate": 6.070005930313891e-06, + "loss": 0.3704, + "step": 10452 + }, + { + "epoch": 1.2720413751140858, + "grad_norm": 1.851613163948059, + "learning_rate": 6.068212266443616e-06, + "loss": 0.2806, + "step": 10453 + }, + { + "epoch": 1.2721630666261028, + "grad_norm": 2.70015811920166, + "learning_rate": 6.0664187521859994e-06, + "loss": 0.4471, + "step": 10454 + }, + { + "epoch": 1.2722847581381198, + "grad_norm": 1.429207444190979, + "learning_rate": 6.064625387609292e-06, + "loss": 0.3852, + "step": 10455 + }, + { + "epoch": 1.2724064496501368, + "grad_norm": 1.3841800689697266, + "learning_rate": 6.062832172781734e-06, + "loss": 0.3861, + "step": 10456 + }, + { + "epoch": 1.2725281411621538, + "grad_norm": 2.082512140274048, + "learning_rate": 6.061039107771565e-06, + "loss": 0.3865, + "step": 10457 + }, + { + "epoch": 1.2726498326741709, + "grad_norm": 1.7528374195098877, + "learning_rate": 6.0592461926470055e-06, + "loss": 0.3884, + "step": 10458 + }, + { + "epoch": 1.2727715241861879, + "grad_norm": 2.5518510341644287, + "learning_rate": 6.057453427476289e-06, + "loss": 0.3653, + "step": 10459 + }, + { + "epoch": 1.2728932156982051, + "grad_norm": 1.5256962776184082, + "learning_rate": 6.055660812327631e-06, + "loss": 0.3882, + "step": 10460 + }, + { + "epoch": 1.2730149072102221, + "grad_norm": 1.8758482933044434, + "learning_rate": 6.053868347269245e-06, + "loss": 0.3497, + "step": 10461 + }, + { + "epoch": 1.2731365987222392, + "grad_norm": 1.7514450550079346, + "learning_rate": 6.052076032369332e-06, + "loss": 0.353, + "step": 10462 + }, + { + "epoch": 1.2732582902342562, + "grad_norm": 1.7371220588684082, + "learning_rate": 6.050283867696099e-06, + "loss": 0.3624, + "step": 10463 + }, + { + "epoch": 1.2733799817462732, + "grad_norm": 1.4457263946533203, + "learning_rate": 6.04849185331774e-06, + "loss": 0.3553, + "step": 10464 + }, + { + "epoch": 1.2735016732582902, + "grad_norm": 3.211045503616333, + "learning_rate": 6.046699989302446e-06, + "loss": 0.4267, + "step": 10465 + }, + { + "epoch": 1.2736233647703072, + "grad_norm": 1.5058374404907227, + "learning_rate": 6.0449082757184e-06, + "loss": 0.361, + "step": 10466 + }, + { + "epoch": 1.2737450562823243, + "grad_norm": 2.124441385269165, + "learning_rate": 6.043116712633778e-06, + "loss": 0.3761, + "step": 10467 + }, + { + "epoch": 1.2738667477943413, + "grad_norm": 1.8949488401412964, + "learning_rate": 6.04132530011676e-06, + "loss": 0.4325, + "step": 10468 + }, + { + "epoch": 1.2739884393063583, + "grad_norm": 1.6625025272369385, + "learning_rate": 6.039534038235505e-06, + "loss": 0.4106, + "step": 10469 + }, + { + "epoch": 1.2741101308183755, + "grad_norm": 1.4373219013214111, + "learning_rate": 6.037742927058179e-06, + "loss": 0.358, + "step": 10470 + }, + { + "epoch": 1.2742318223303926, + "grad_norm": 1.883716344833374, + "learning_rate": 6.035951966652936e-06, + "loss": 0.405, + "step": 10471 + }, + { + "epoch": 1.2743535138424096, + "grad_norm": 2.272364377975464, + "learning_rate": 6.034161157087926e-06, + "loss": 0.3156, + "step": 10472 + }, + { + "epoch": 1.2744752053544266, + "grad_norm": 1.715686559677124, + "learning_rate": 6.032370498431294e-06, + "loss": 0.3643, + "step": 10473 + }, + { + "epoch": 1.2745968968664436, + "grad_norm": 1.8858438730239868, + "learning_rate": 6.030579990751175e-06, + "loss": 0.4144, + "step": 10474 + }, + { + "epoch": 1.2747185883784606, + "grad_norm": 1.255143642425537, + "learning_rate": 6.028789634115704e-06, + "loss": 0.3419, + "step": 10475 + }, + { + "epoch": 1.2748402798904777, + "grad_norm": 1.8643040657043457, + "learning_rate": 6.026999428593009e-06, + "loss": 0.355, + "step": 10476 + }, + { + "epoch": 1.2749619714024947, + "grad_norm": 2.9114835262298584, + "learning_rate": 6.025209374251206e-06, + "loss": 0.3684, + "step": 10477 + }, + { + "epoch": 1.2750836629145117, + "grad_norm": 2.755558729171753, + "learning_rate": 6.023419471158416e-06, + "loss": 0.4078, + "step": 10478 + }, + { + "epoch": 1.2752053544265287, + "grad_norm": 2.8887221813201904, + "learning_rate": 6.021629719382746e-06, + "loss": 0.3392, + "step": 10479 + }, + { + "epoch": 1.2753270459385457, + "grad_norm": 1.4835281372070312, + "learning_rate": 6.0198401189922995e-06, + "loss": 0.3395, + "step": 10480 + }, + { + "epoch": 1.2754487374505628, + "grad_norm": 1.3329451084136963, + "learning_rate": 6.018050670055174e-06, + "loss": 0.3471, + "step": 10481 + }, + { + "epoch": 1.2755704289625798, + "grad_norm": 2.1320884227752686, + "learning_rate": 6.016261372639464e-06, + "loss": 0.3477, + "step": 10482 + }, + { + "epoch": 1.2756921204745968, + "grad_norm": 1.4166489839553833, + "learning_rate": 6.0144722268132546e-06, + "loss": 0.3639, + "step": 10483 + }, + { + "epoch": 1.2758138119866138, + "grad_norm": 2.165843963623047, + "learning_rate": 6.01268323264463e-06, + "loss": 0.336, + "step": 10484 + }, + { + "epoch": 1.275935503498631, + "grad_norm": 2.06430721282959, + "learning_rate": 6.010894390201658e-06, + "loss": 0.381, + "step": 10485 + }, + { + "epoch": 1.276057195010648, + "grad_norm": 1.5169594287872314, + "learning_rate": 6.009105699552411e-06, + "loss": 0.3166, + "step": 10486 + }, + { + "epoch": 1.276178886522665, + "grad_norm": 1.3819568157196045, + "learning_rate": 6.007317160764954e-06, + "loss": 0.3702, + "step": 10487 + }, + { + "epoch": 1.276300578034682, + "grad_norm": 2.10349178314209, + "learning_rate": 6.005528773907343e-06, + "loss": 0.3391, + "step": 10488 + }, + { + "epoch": 1.2764222695466991, + "grad_norm": 2.059483289718628, + "learning_rate": 6.003740539047629e-06, + "loss": 0.2821, + "step": 10489 + }, + { + "epoch": 1.2765439610587161, + "grad_norm": 2.134270429611206, + "learning_rate": 6.001952456253859e-06, + "loss": 0.3515, + "step": 10490 + }, + { + "epoch": 1.2766656525707332, + "grad_norm": 2.585449457168579, + "learning_rate": 6.000164525594076e-06, + "loss": 0.3941, + "step": 10491 + }, + { + "epoch": 1.2767873440827502, + "grad_norm": 1.4174909591674805, + "learning_rate": 5.998376747136311e-06, + "loss": 0.3334, + "step": 10492 + }, + { + "epoch": 1.2769090355947672, + "grad_norm": 2.21463942527771, + "learning_rate": 5.996589120948593e-06, + "loss": 0.4005, + "step": 10493 + }, + { + "epoch": 1.2770307271067842, + "grad_norm": 2.017909288406372, + "learning_rate": 5.9948016470989465e-06, + "loss": 0.3409, + "step": 10494 + }, + { + "epoch": 1.2771524186188015, + "grad_norm": 2.3163602352142334, + "learning_rate": 5.993014325655386e-06, + "loss": 0.3779, + "step": 10495 + }, + { + "epoch": 1.2772741101308185, + "grad_norm": 2.354531764984131, + "learning_rate": 5.99122715668593e-06, + "loss": 0.3876, + "step": 10496 + }, + { + "epoch": 1.2773958016428355, + "grad_norm": 2.2206008434295654, + "learning_rate": 5.989440140258573e-06, + "loss": 0.3968, + "step": 10497 + }, + { + "epoch": 1.2775174931548525, + "grad_norm": 1.627376914024353, + "learning_rate": 5.987653276441321e-06, + "loss": 0.4265, + "step": 10498 + }, + { + "epoch": 1.2776391846668695, + "grad_norm": 2.0795376300811768, + "learning_rate": 5.9858665653021675e-06, + "loss": 0.4345, + "step": 10499 + }, + { + "epoch": 1.2777608761788866, + "grad_norm": 1.858400583267212, + "learning_rate": 5.984080006909099e-06, + "loss": 0.3971, + "step": 10500 + }, + { + "epoch": 1.2778825676909036, + "grad_norm": 2.803896188735962, + "learning_rate": 5.982293601330099e-06, + "loss": 0.4094, + "step": 10501 + }, + { + "epoch": 1.2780042592029206, + "grad_norm": 1.7287530899047852, + "learning_rate": 5.980507348633146e-06, + "loss": 0.3817, + "step": 10502 + }, + { + "epoch": 1.2781259507149376, + "grad_norm": 3.074690580368042, + "learning_rate": 5.978721248886206e-06, + "loss": 0.3365, + "step": 10503 + }, + { + "epoch": 1.2782476422269546, + "grad_norm": 3.238543748855591, + "learning_rate": 5.976935302157245e-06, + "loss": 0.3338, + "step": 10504 + }, + { + "epoch": 1.2783693337389717, + "grad_norm": 1.3183553218841553, + "learning_rate": 5.9751495085142254e-06, + "loss": 0.3598, + "step": 10505 + }, + { + "epoch": 1.2784910252509887, + "grad_norm": 1.4870617389678955, + "learning_rate": 5.973363868025096e-06, + "loss": 0.4025, + "step": 10506 + }, + { + "epoch": 1.2786127167630057, + "grad_norm": 3.533945083618164, + "learning_rate": 5.9715783807578095e-06, + "loss": 0.3561, + "step": 10507 + }, + { + "epoch": 1.2787344082750227, + "grad_norm": 1.8328453302383423, + "learning_rate": 5.9697930467803015e-06, + "loss": 0.3965, + "step": 10508 + }, + { + "epoch": 1.2788560997870397, + "grad_norm": 2.160099744796753, + "learning_rate": 5.96800786616051e-06, + "loss": 0.4167, + "step": 10509 + }, + { + "epoch": 1.2789777912990568, + "grad_norm": 2.485539436340332, + "learning_rate": 5.966222838966367e-06, + "loss": 0.4213, + "step": 10510 + }, + { + "epoch": 1.279099482811074, + "grad_norm": 3.2244744300842285, + "learning_rate": 5.96443796526579e-06, + "loss": 0.3162, + "step": 10511 + }, + { + "epoch": 1.279221174323091, + "grad_norm": 1.4829498529434204, + "learning_rate": 5.962653245126704e-06, + "loss": 0.396, + "step": 10512 + }, + { + "epoch": 1.279342865835108, + "grad_norm": 2.4302046298980713, + "learning_rate": 5.96086867861702e-06, + "loss": 0.3655, + "step": 10513 + }, + { + "epoch": 1.279464557347125, + "grad_norm": 2.0691308975219727, + "learning_rate": 5.959084265804643e-06, + "loss": 0.3993, + "step": 10514 + }, + { + "epoch": 1.279586248859142, + "grad_norm": 1.4534651041030884, + "learning_rate": 5.957300006757472e-06, + "loss": 0.3915, + "step": 10515 + }, + { + "epoch": 1.279707940371159, + "grad_norm": 3.1938302516937256, + "learning_rate": 5.955515901543404e-06, + "loss": 0.3721, + "step": 10516 + }, + { + "epoch": 1.279829631883176, + "grad_norm": 1.5802381038665771, + "learning_rate": 5.953731950230331e-06, + "loss": 0.3975, + "step": 10517 + }, + { + "epoch": 1.2799513233951931, + "grad_norm": 1.2114176750183105, + "learning_rate": 5.951948152886129e-06, + "loss": 0.3343, + "step": 10518 + }, + { + "epoch": 1.2800730149072101, + "grad_norm": 1.7845396995544434, + "learning_rate": 5.950164509578682e-06, + "loss": 0.4012, + "step": 10519 + }, + { + "epoch": 1.2801947064192274, + "grad_norm": 1.4808484315872192, + "learning_rate": 5.9483810203758555e-06, + "loss": 0.3533, + "step": 10520 + }, + { + "epoch": 1.2803163979312444, + "grad_norm": 1.723787784576416, + "learning_rate": 5.946597685345519e-06, + "loss": 0.3638, + "step": 10521 + }, + { + "epoch": 1.2804380894432614, + "grad_norm": 1.3908257484436035, + "learning_rate": 5.9448145045555294e-06, + "loss": 0.3404, + "step": 10522 + }, + { + "epoch": 1.2805597809552784, + "grad_norm": 1.4263622760772705, + "learning_rate": 5.9430314780737416e-06, + "loss": 0.3533, + "step": 10523 + }, + { + "epoch": 1.2806814724672955, + "grad_norm": 1.8044443130493164, + "learning_rate": 5.941248605968003e-06, + "loss": 0.3934, + "step": 10524 + }, + { + "epoch": 1.2808031639793125, + "grad_norm": 3.0128695964813232, + "learning_rate": 5.939465888306159e-06, + "loss": 0.4611, + "step": 10525 + }, + { + "epoch": 1.2809248554913295, + "grad_norm": 1.506617546081543, + "learning_rate": 5.93768332515604e-06, + "loss": 0.4022, + "step": 10526 + }, + { + "epoch": 1.2810465470033465, + "grad_norm": 2.5495035648345947, + "learning_rate": 5.935900916585478e-06, + "loss": 0.3399, + "step": 10527 + }, + { + "epoch": 1.2811682385153635, + "grad_norm": 2.3683578968048096, + "learning_rate": 5.9341186626623025e-06, + "loss": 0.4227, + "step": 10528 + }, + { + "epoch": 1.2812899300273806, + "grad_norm": 2.409839153289795, + "learning_rate": 5.932336563454324e-06, + "loss": 0.4357, + "step": 10529 + }, + { + "epoch": 1.2814116215393976, + "grad_norm": 2.0458576679229736, + "learning_rate": 5.9305546190293635e-06, + "loss": 0.4032, + "step": 10530 + }, + { + "epoch": 1.2815333130514146, + "grad_norm": 2.188157558441162, + "learning_rate": 5.9287728294552195e-06, + "loss": 0.3893, + "step": 10531 + }, + { + "epoch": 1.2816550045634316, + "grad_norm": 2.3667166233062744, + "learning_rate": 5.926991194799696e-06, + "loss": 0.3384, + "step": 10532 + }, + { + "epoch": 1.2817766960754486, + "grad_norm": 3.17220401763916, + "learning_rate": 5.925209715130591e-06, + "loss": 0.309, + "step": 10533 + }, + { + "epoch": 1.2818983875874657, + "grad_norm": 2.1808674335479736, + "learning_rate": 5.923428390515686e-06, + "loss": 0.3614, + "step": 10534 + }, + { + "epoch": 1.2820200790994827, + "grad_norm": 1.6956264972686768, + "learning_rate": 5.921647221022772e-06, + "loss": 0.393, + "step": 10535 + }, + { + "epoch": 1.2821417706115, + "grad_norm": 3.252528667449951, + "learning_rate": 5.919866206719623e-06, + "loss": 0.498, + "step": 10536 + }, + { + "epoch": 1.282263462123517, + "grad_norm": 1.5510379076004028, + "learning_rate": 5.91808534767401e-06, + "loss": 0.3572, + "step": 10537 + }, + { + "epoch": 1.282385153635534, + "grad_norm": 1.9704879522323608, + "learning_rate": 5.916304643953696e-06, + "loss": 0.3447, + "step": 10538 + }, + { + "epoch": 1.282506845147551, + "grad_norm": 1.686834454536438, + "learning_rate": 5.9145240956264475e-06, + "loss": 0.4191, + "step": 10539 + }, + { + "epoch": 1.282628536659568, + "grad_norm": 2.781834840774536, + "learning_rate": 5.912743702760011e-06, + "loss": 0.3845, + "step": 10540 + }, + { + "epoch": 1.282750228171585, + "grad_norm": 1.7784477472305298, + "learning_rate": 5.9109634654221356e-06, + "loss": 0.3653, + "step": 10541 + }, + { + "epoch": 1.282871919683602, + "grad_norm": 1.4981151819229126, + "learning_rate": 5.90918338368057e-06, + "loss": 0.3537, + "step": 10542 + }, + { + "epoch": 1.282993611195619, + "grad_norm": 1.455384373664856, + "learning_rate": 5.907403457603038e-06, + "loss": 0.3289, + "step": 10543 + }, + { + "epoch": 1.283115302707636, + "grad_norm": 1.6621510982513428, + "learning_rate": 5.905623687257279e-06, + "loss": 0.3691, + "step": 10544 + }, + { + "epoch": 1.2832369942196533, + "grad_norm": 3.7067105770111084, + "learning_rate": 5.903844072711011e-06, + "loss": 0.4419, + "step": 10545 + }, + { + "epoch": 1.2833586857316703, + "grad_norm": 2.1652414798736572, + "learning_rate": 5.9020646140319555e-06, + "loss": 0.3358, + "step": 10546 + }, + { + "epoch": 1.2834803772436874, + "grad_norm": 2.1550374031066895, + "learning_rate": 5.900285311287826e-06, + "loss": 0.4066, + "step": 10547 + }, + { + "epoch": 1.2836020687557044, + "grad_norm": 1.7441656589508057, + "learning_rate": 5.898506164546323e-06, + "loss": 0.3564, + "step": 10548 + }, + { + "epoch": 1.2837237602677214, + "grad_norm": 1.982452392578125, + "learning_rate": 5.896727173875151e-06, + "loss": 0.3417, + "step": 10549 + }, + { + "epoch": 1.2838454517797384, + "grad_norm": 2.337707281112671, + "learning_rate": 5.894948339342003e-06, + "loss": 0.411, + "step": 10550 + }, + { + "epoch": 1.2839671432917554, + "grad_norm": 1.798502802848816, + "learning_rate": 5.89316966101457e-06, + "loss": 0.3947, + "step": 10551 + }, + { + "epoch": 1.2840888348037724, + "grad_norm": 1.6420990228652954, + "learning_rate": 5.891391138960529e-06, + "loss": 0.3632, + "step": 10552 + }, + { + "epoch": 1.2842105263157895, + "grad_norm": 1.4075000286102295, + "learning_rate": 5.889612773247561e-06, + "loss": 0.3591, + "step": 10553 + }, + { + "epoch": 1.2843322178278065, + "grad_norm": 1.4349812269210815, + "learning_rate": 5.887834563943338e-06, + "loss": 0.3356, + "step": 10554 + }, + { + "epoch": 1.2844539093398235, + "grad_norm": 1.5117881298065186, + "learning_rate": 5.886056511115519e-06, + "loss": 0.3521, + "step": 10555 + }, + { + "epoch": 1.2845756008518405, + "grad_norm": 1.259488582611084, + "learning_rate": 5.884278614831765e-06, + "loss": 0.3609, + "step": 10556 + }, + { + "epoch": 1.2846972923638575, + "grad_norm": 3.270118474960327, + "learning_rate": 5.882500875159728e-06, + "loss": 0.4136, + "step": 10557 + }, + { + "epoch": 1.2848189838758746, + "grad_norm": 1.7463117837905884, + "learning_rate": 5.880723292167054e-06, + "loss": 0.3992, + "step": 10558 + }, + { + "epoch": 1.2849406753878916, + "grad_norm": 1.4743537902832031, + "learning_rate": 5.878945865921389e-06, + "loss": 0.3141, + "step": 10559 + }, + { + "epoch": 1.2850623668999086, + "grad_norm": 1.7920653820037842, + "learning_rate": 5.877168596490361e-06, + "loss": 0.3667, + "step": 10560 + }, + { + "epoch": 1.2851840584119258, + "grad_norm": 1.4804673194885254, + "learning_rate": 5.875391483941603e-06, + "loss": 0.3988, + "step": 10561 + }, + { + "epoch": 1.2853057499239429, + "grad_norm": 1.9991148710250854, + "learning_rate": 5.873614528342737e-06, + "loss": 0.3508, + "step": 10562 + }, + { + "epoch": 1.2854274414359599, + "grad_norm": 1.456757664680481, + "learning_rate": 5.871837729761378e-06, + "loss": 0.3761, + "step": 10563 + }, + { + "epoch": 1.285549132947977, + "grad_norm": 1.9267749786376953, + "learning_rate": 5.870061088265138e-06, + "loss": 0.357, + "step": 10564 + }, + { + "epoch": 1.285670824459994, + "grad_norm": 1.635720133781433, + "learning_rate": 5.868284603921626e-06, + "loss": 0.3508, + "step": 10565 + }, + { + "epoch": 1.285792515972011, + "grad_norm": 1.7783342599868774, + "learning_rate": 5.866508276798434e-06, + "loss": 0.321, + "step": 10566 + }, + { + "epoch": 1.285914207484028, + "grad_norm": 1.870968222618103, + "learning_rate": 5.864732106963158e-06, + "loss": 0.4141, + "step": 10567 + }, + { + "epoch": 1.286035898996045, + "grad_norm": 1.668931007385254, + "learning_rate": 5.862956094483385e-06, + "loss": 0.3479, + "step": 10568 + }, + { + "epoch": 1.286157590508062, + "grad_norm": 1.4777226448059082, + "learning_rate": 5.8611802394266944e-06, + "loss": 0.3287, + "step": 10569 + }, + { + "epoch": 1.286279282020079, + "grad_norm": 2.1105763912200928, + "learning_rate": 5.859404541860665e-06, + "loss": 0.4109, + "step": 10570 + }, + { + "epoch": 1.2864009735320963, + "grad_norm": 2.2539310455322266, + "learning_rate": 5.8576290018528615e-06, + "loss": 0.4011, + "step": 10571 + }, + { + "epoch": 1.2865226650441133, + "grad_norm": 2.440284252166748, + "learning_rate": 5.855853619470848e-06, + "loss": 0.285, + "step": 10572 + }, + { + "epoch": 1.2866443565561303, + "grad_norm": 1.7038896083831787, + "learning_rate": 5.8540783947821856e-06, + "loss": 0.4046, + "step": 10573 + }, + { + "epoch": 1.2867660480681473, + "grad_norm": 1.7720586061477661, + "learning_rate": 5.852303327854418e-06, + "loss": 0.3774, + "step": 10574 + }, + { + "epoch": 1.2868877395801643, + "grad_norm": 2.3250913619995117, + "learning_rate": 5.850528418755095e-06, + "loss": 0.3411, + "step": 10575 + }, + { + "epoch": 1.2870094310921814, + "grad_norm": 1.7034109830856323, + "learning_rate": 5.848753667551755e-06, + "loss": 0.3591, + "step": 10576 + }, + { + "epoch": 1.2871311226041984, + "grad_norm": 2.0150527954101562, + "learning_rate": 5.846979074311934e-06, + "loss": 0.4218, + "step": 10577 + }, + { + "epoch": 1.2872528141162154, + "grad_norm": 2.4230263233184814, + "learning_rate": 5.845204639103155e-06, + "loss": 0.4298, + "step": 10578 + }, + { + "epoch": 1.2873745056282324, + "grad_norm": 1.7150936126708984, + "learning_rate": 5.8434303619929345e-06, + "loss": 0.3683, + "step": 10579 + }, + { + "epoch": 1.2874961971402494, + "grad_norm": 2.0455844402313232, + "learning_rate": 5.841656243048794e-06, + "loss": 0.3148, + "step": 10580 + }, + { + "epoch": 1.2876178886522665, + "grad_norm": 2.2733371257781982, + "learning_rate": 5.83988228233824e-06, + "loss": 0.4184, + "step": 10581 + }, + { + "epoch": 1.2877395801642835, + "grad_norm": 1.4156023263931274, + "learning_rate": 5.8381084799287776e-06, + "loss": 0.3499, + "step": 10582 + }, + { + "epoch": 1.2878612716763005, + "grad_norm": 1.6243747472763062, + "learning_rate": 5.836334835887905e-06, + "loss": 0.4141, + "step": 10583 + }, + { + "epoch": 1.2879829631883175, + "grad_norm": 2.7751576900482178, + "learning_rate": 5.834561350283108e-06, + "loss": 0.3387, + "step": 10584 + }, + { + "epoch": 1.2881046547003345, + "grad_norm": 1.280044436454773, + "learning_rate": 5.83278802318187e-06, + "loss": 0.3603, + "step": 10585 + }, + { + "epoch": 1.2882263462123518, + "grad_norm": 3.362574815750122, + "learning_rate": 5.831014854651678e-06, + "loss": 0.3225, + "step": 10586 + }, + { + "epoch": 1.2883480377243688, + "grad_norm": 1.702264428138733, + "learning_rate": 5.829241844759998e-06, + "loss": 0.3199, + "step": 10587 + }, + { + "epoch": 1.2884697292363858, + "grad_norm": 1.7620378732681274, + "learning_rate": 5.827468993574301e-06, + "loss": 0.2932, + "step": 10588 + }, + { + "epoch": 1.2885914207484028, + "grad_norm": 2.1318371295928955, + "learning_rate": 5.825696301162046e-06, + "loss": 0.4398, + "step": 10589 + }, + { + "epoch": 1.2887131122604198, + "grad_norm": 2.232062578201294, + "learning_rate": 5.823923767590682e-06, + "loss": 0.302, + "step": 10590 + }, + { + "epoch": 1.2888348037724369, + "grad_norm": 2.7815849781036377, + "learning_rate": 5.822151392927666e-06, + "loss": 0.4297, + "step": 10591 + }, + { + "epoch": 1.2889564952844539, + "grad_norm": 1.9335200786590576, + "learning_rate": 5.820379177240438e-06, + "loss": 0.4234, + "step": 10592 + }, + { + "epoch": 1.289078186796471, + "grad_norm": 1.7936642169952393, + "learning_rate": 5.81860712059643e-06, + "loss": 0.358, + "step": 10593 + }, + { + "epoch": 1.289199878308488, + "grad_norm": 1.7385996580123901, + "learning_rate": 5.81683522306308e-06, + "loss": 0.3083, + "step": 10594 + }, + { + "epoch": 1.289321569820505, + "grad_norm": 1.6687299013137817, + "learning_rate": 5.815063484707808e-06, + "loss": 0.343, + "step": 10595 + }, + { + "epoch": 1.2894432613325222, + "grad_norm": 1.531218409538269, + "learning_rate": 5.813291905598029e-06, + "loss": 0.3396, + "step": 10596 + }, + { + "epoch": 1.2895649528445392, + "grad_norm": 1.9220924377441406, + "learning_rate": 5.811520485801162e-06, + "loss": 0.4286, + "step": 10597 + }, + { + "epoch": 1.2896866443565562, + "grad_norm": 1.8867710828781128, + "learning_rate": 5.809749225384611e-06, + "loss": 0.4201, + "step": 10598 + }, + { + "epoch": 1.2898083358685732, + "grad_norm": 1.9965869188308716, + "learning_rate": 5.807978124415772e-06, + "loss": 0.4259, + "step": 10599 + }, + { + "epoch": 1.2899300273805903, + "grad_norm": 2.204007625579834, + "learning_rate": 5.8062071829620494e-06, + "loss": 0.3852, + "step": 10600 + }, + { + "epoch": 1.2900517188926073, + "grad_norm": 1.8582518100738525, + "learning_rate": 5.804436401090816e-06, + "loss": 0.355, + "step": 10601 + }, + { + "epoch": 1.2901734104046243, + "grad_norm": 1.6306055784225464, + "learning_rate": 5.802665778869466e-06, + "loss": 0.3789, + "step": 10602 + }, + { + "epoch": 1.2902951019166413, + "grad_norm": 2.7174384593963623, + "learning_rate": 5.800895316365371e-06, + "loss": 0.3226, + "step": 10603 + }, + { + "epoch": 1.2904167934286583, + "grad_norm": 1.3473379611968994, + "learning_rate": 5.799125013645899e-06, + "loss": 0.3615, + "step": 10604 + }, + { + "epoch": 1.2905384849406754, + "grad_norm": 2.451268434524536, + "learning_rate": 5.797354870778418e-06, + "loss": 0.4094, + "step": 10605 + }, + { + "epoch": 1.2906601764526924, + "grad_norm": 1.7882784605026245, + "learning_rate": 5.795584887830282e-06, + "loss": 0.3843, + "step": 10606 + }, + { + "epoch": 1.2907818679647094, + "grad_norm": 2.148496389389038, + "learning_rate": 5.7938150648688406e-06, + "loss": 0.3286, + "step": 10607 + }, + { + "epoch": 1.2909035594767264, + "grad_norm": 3.206817626953125, + "learning_rate": 5.7920454019614455e-06, + "loss": 0.3113, + "step": 10608 + }, + { + "epoch": 1.2910252509887434, + "grad_norm": 2.004594564437866, + "learning_rate": 5.790275899175435e-06, + "loss": 0.3567, + "step": 10609 + }, + { + "epoch": 1.2911469425007605, + "grad_norm": 1.7616199254989624, + "learning_rate": 5.788506556578135e-06, + "loss": 0.4024, + "step": 10610 + }, + { + "epoch": 1.2912686340127775, + "grad_norm": 2.1099853515625, + "learning_rate": 5.786737374236886e-06, + "loss": 0.3781, + "step": 10611 + }, + { + "epoch": 1.2913903255247947, + "grad_norm": 1.5661702156066895, + "learning_rate": 5.784968352218991e-06, + "loss": 0.3564, + "step": 10612 + }, + { + "epoch": 1.2915120170368117, + "grad_norm": 2.1131184101104736, + "learning_rate": 5.783199490591782e-06, + "loss": 0.3801, + "step": 10613 + }, + { + "epoch": 1.2916337085488288, + "grad_norm": 3.104557991027832, + "learning_rate": 5.781430789422559e-06, + "loss": 0.3239, + "step": 10614 + }, + { + "epoch": 1.2917554000608458, + "grad_norm": 2.0992746353149414, + "learning_rate": 5.7796622487786225e-06, + "loss": 0.3855, + "step": 10615 + }, + { + "epoch": 1.2918770915728628, + "grad_norm": 1.8904378414154053, + "learning_rate": 5.777893868727278e-06, + "loss": 0.3703, + "step": 10616 + }, + { + "epoch": 1.2919987830848798, + "grad_norm": 1.4152432680130005, + "learning_rate": 5.7761256493358086e-06, + "loss": 0.3598, + "step": 10617 + }, + { + "epoch": 1.2921204745968968, + "grad_norm": 1.6704233884811401, + "learning_rate": 5.7743575906715e-06, + "loss": 0.3916, + "step": 10618 + }, + { + "epoch": 1.2922421661089138, + "grad_norm": 1.9063706398010254, + "learning_rate": 5.7725896928016336e-06, + "loss": 0.3548, + "step": 10619 + }, + { + "epoch": 1.2923638576209309, + "grad_norm": 1.5738579034805298, + "learning_rate": 5.770821955793482e-06, + "loss": 0.3452, + "step": 10620 + }, + { + "epoch": 1.292485549132948, + "grad_norm": 4.074136734008789, + "learning_rate": 5.769054379714303e-06, + "loss": 0.4183, + "step": 10621 + }, + { + "epoch": 1.2926072406449651, + "grad_norm": 1.93488609790802, + "learning_rate": 5.767286964631367e-06, + "loss": 0.3467, + "step": 10622 + }, + { + "epoch": 1.2927289321569821, + "grad_norm": 1.6406809091567993, + "learning_rate": 5.765519710611922e-06, + "loss": 0.3344, + "step": 10623 + }, + { + "epoch": 1.2928506236689992, + "grad_norm": 1.8336777687072754, + "learning_rate": 5.763752617723218e-06, + "loss": 0.3799, + "step": 10624 + }, + { + "epoch": 1.2929723151810162, + "grad_norm": 1.4141950607299805, + "learning_rate": 5.7619856860324964e-06, + "loss": 0.3339, + "step": 10625 + }, + { + "epoch": 1.2930940066930332, + "grad_norm": 1.5580123662948608, + "learning_rate": 5.760218915606987e-06, + "loss": 0.3661, + "step": 10626 + }, + { + "epoch": 1.2932156982050502, + "grad_norm": 2.565035581588745, + "learning_rate": 5.758452306513927e-06, + "loss": 0.378, + "step": 10627 + }, + { + "epoch": 1.2933373897170672, + "grad_norm": 1.2089426517486572, + "learning_rate": 5.756685858820532e-06, + "loss": 0.3125, + "step": 10628 + }, + { + "epoch": 1.2934590812290843, + "grad_norm": 1.7160195112228394, + "learning_rate": 5.7549195725940284e-06, + "loss": 0.3794, + "step": 10629 + }, + { + "epoch": 1.2935807727411013, + "grad_norm": 1.249646544456482, + "learning_rate": 5.753153447901621e-06, + "loss": 0.3489, + "step": 10630 + }, + { + "epoch": 1.2937024642531183, + "grad_norm": 2.717773914337158, + "learning_rate": 5.751387484810512e-06, + "loss": 0.4082, + "step": 10631 + }, + { + "epoch": 1.2938241557651353, + "grad_norm": 1.9328533411026, + "learning_rate": 5.749621683387907e-06, + "loss": 0.3335, + "step": 10632 + }, + { + "epoch": 1.2939458472771523, + "grad_norm": 1.5630487203598022, + "learning_rate": 5.7478560437009955e-06, + "loss": 0.373, + "step": 10633 + }, + { + "epoch": 1.2940675387891694, + "grad_norm": 2.057236671447754, + "learning_rate": 5.746090565816962e-06, + "loss": 0.3707, + "step": 10634 + }, + { + "epoch": 1.2941892303011864, + "grad_norm": 1.8337665796279907, + "learning_rate": 5.744325249802989e-06, + "loss": 0.3275, + "step": 10635 + }, + { + "epoch": 1.2943109218132034, + "grad_norm": 1.6739068031311035, + "learning_rate": 5.742560095726244e-06, + "loss": 0.3691, + "step": 10636 + }, + { + "epoch": 1.2944326133252206, + "grad_norm": 2.1306872367858887, + "learning_rate": 5.7407951036539025e-06, + "loss": 0.412, + "step": 10637 + }, + { + "epoch": 1.2945543048372377, + "grad_norm": 2.0116026401519775, + "learning_rate": 5.7390302736531255e-06, + "loss": 0.4278, + "step": 10638 + }, + { + "epoch": 1.2946759963492547, + "grad_norm": 1.5661468505859375, + "learning_rate": 5.737265605791062e-06, + "loss": 0.3651, + "step": 10639 + }, + { + "epoch": 1.2947976878612717, + "grad_norm": 2.0398988723754883, + "learning_rate": 5.7355011001348686e-06, + "loss": 0.467, + "step": 10640 + }, + { + "epoch": 1.2949193793732887, + "grad_norm": 1.9183504581451416, + "learning_rate": 5.733736756751686e-06, + "loss": 0.4234, + "step": 10641 + }, + { + "epoch": 1.2950410708853057, + "grad_norm": 2.205601453781128, + "learning_rate": 5.731972575708646e-06, + "loss": 0.394, + "step": 10642 + }, + { + "epoch": 1.2951627623973228, + "grad_norm": 2.178201913833618, + "learning_rate": 5.730208557072887e-06, + "loss": 0.4267, + "step": 10643 + }, + { + "epoch": 1.2952844539093398, + "grad_norm": 2.3975582122802734, + "learning_rate": 5.728444700911533e-06, + "loss": 0.3761, + "step": 10644 + }, + { + "epoch": 1.2954061454213568, + "grad_norm": 1.3895853757858276, + "learning_rate": 5.7266810072916936e-06, + "loss": 0.376, + "step": 10645 + }, + { + "epoch": 1.295527836933374, + "grad_norm": 2.3043224811553955, + "learning_rate": 5.724917476280496e-06, + "loss": 0.4607, + "step": 10646 + }, + { + "epoch": 1.295649528445391, + "grad_norm": 2.0370213985443115, + "learning_rate": 5.723154107945029e-06, + "loss": 0.3345, + "step": 10647 + }, + { + "epoch": 1.295771219957408, + "grad_norm": 2.3514859676361084, + "learning_rate": 5.721390902352405e-06, + "loss": 0.3619, + "step": 10648 + }, + { + "epoch": 1.295892911469425, + "grad_norm": 2.2256734371185303, + "learning_rate": 5.719627859569714e-06, + "loss": 0.3425, + "step": 10649 + }, + { + "epoch": 1.296014602981442, + "grad_norm": 1.5580493211746216, + "learning_rate": 5.717864979664038e-06, + "loss": 0.3368, + "step": 10650 + }, + { + "epoch": 1.2961362944934591, + "grad_norm": 2.3323075771331787, + "learning_rate": 5.7161022627024675e-06, + "loss": 0.3701, + "step": 10651 + }, + { + "epoch": 1.2962579860054761, + "grad_norm": 2.9248900413513184, + "learning_rate": 5.714339708752072e-06, + "loss": 0.4025, + "step": 10652 + }, + { + "epoch": 1.2963796775174932, + "grad_norm": 1.6240113973617554, + "learning_rate": 5.712577317879918e-06, + "loss": 0.3722, + "step": 10653 + }, + { + "epoch": 1.2965013690295102, + "grad_norm": 1.3895801305770874, + "learning_rate": 5.7108150901530766e-06, + "loss": 0.312, + "step": 10654 + }, + { + "epoch": 1.2966230605415272, + "grad_norm": 1.9655169248580933, + "learning_rate": 5.709053025638598e-06, + "loss": 0.3301, + "step": 10655 + }, + { + "epoch": 1.2967447520535442, + "grad_norm": 1.5269062519073486, + "learning_rate": 5.70729112440353e-06, + "loss": 0.3574, + "step": 10656 + }, + { + "epoch": 1.2968664435655612, + "grad_norm": 3.039123773574829, + "learning_rate": 5.705529386514928e-06, + "loss": 0.4181, + "step": 10657 + }, + { + "epoch": 1.2969881350775783, + "grad_norm": 3.578644275665283, + "learning_rate": 5.703767812039813e-06, + "loss": 0.3562, + "step": 10658 + }, + { + "epoch": 1.2971098265895953, + "grad_norm": 1.9839129447937012, + "learning_rate": 5.702006401045231e-06, + "loss": 0.3227, + "step": 10659 + }, + { + "epoch": 1.2972315181016123, + "grad_norm": 1.5482141971588135, + "learning_rate": 5.700245153598201e-06, + "loss": 0.3848, + "step": 10660 + }, + { + "epoch": 1.2973532096136293, + "grad_norm": 1.9916326999664307, + "learning_rate": 5.698484069765739e-06, + "loss": 0.4203, + "step": 10661 + }, + { + "epoch": 1.2974749011256466, + "grad_norm": 1.2304826974868774, + "learning_rate": 5.696723149614866e-06, + "loss": 0.3572, + "step": 10662 + }, + { + "epoch": 1.2975965926376636, + "grad_norm": 1.5947169065475464, + "learning_rate": 5.694962393212584e-06, + "loss": 0.4098, + "step": 10663 + }, + { + "epoch": 1.2977182841496806, + "grad_norm": 1.5367302894592285, + "learning_rate": 5.693201800625892e-06, + "loss": 0.3788, + "step": 10664 + }, + { + "epoch": 1.2978399756616976, + "grad_norm": 1.4731502532958984, + "learning_rate": 5.691441371921787e-06, + "loss": 0.3261, + "step": 10665 + }, + { + "epoch": 1.2979616671737146, + "grad_norm": 1.7778632640838623, + "learning_rate": 5.689681107167258e-06, + "loss": 0.3284, + "step": 10666 + }, + { + "epoch": 1.2980833586857317, + "grad_norm": 1.5393034219741821, + "learning_rate": 5.68792100642928e-06, + "loss": 0.3748, + "step": 10667 + }, + { + "epoch": 1.2982050501977487, + "grad_norm": 1.6737003326416016, + "learning_rate": 5.686161069774837e-06, + "loss": 0.3791, + "step": 10668 + }, + { + "epoch": 1.2983267417097657, + "grad_norm": 2.314873218536377, + "learning_rate": 5.684401297270894e-06, + "loss": 0.4198, + "step": 10669 + }, + { + "epoch": 1.2984484332217827, + "grad_norm": 1.4829821586608887, + "learning_rate": 5.682641688984416e-06, + "loss": 0.3542, + "step": 10670 + }, + { + "epoch": 1.2985701247337997, + "grad_norm": 1.8030195236206055, + "learning_rate": 5.680882244982357e-06, + "loss": 0.4122, + "step": 10671 + }, + { + "epoch": 1.298691816245817, + "grad_norm": 1.9843130111694336, + "learning_rate": 5.679122965331665e-06, + "loss": 0.3588, + "step": 10672 + }, + { + "epoch": 1.298813507757834, + "grad_norm": 1.4244213104248047, + "learning_rate": 5.677363850099293e-06, + "loss": 0.3552, + "step": 10673 + }, + { + "epoch": 1.298935199269851, + "grad_norm": 2.3985137939453125, + "learning_rate": 5.675604899352171e-06, + "loss": 0.3475, + "step": 10674 + }, + { + "epoch": 1.299056890781868, + "grad_norm": 2.278571367263794, + "learning_rate": 5.673846113157232e-06, + "loss": 0.3637, + "step": 10675 + }, + { + "epoch": 1.299178582293885, + "grad_norm": 1.2912546396255493, + "learning_rate": 5.672087491581406e-06, + "loss": 0.3433, + "step": 10676 + }, + { + "epoch": 1.299300273805902, + "grad_norm": 1.8979377746582031, + "learning_rate": 5.670329034691611e-06, + "loss": 0.3946, + "step": 10677 + }, + { + "epoch": 1.299421965317919, + "grad_norm": 1.7651402950286865, + "learning_rate": 5.668570742554752e-06, + "loss": 0.3755, + "step": 10678 + }, + { + "epoch": 1.2995436568299361, + "grad_norm": 1.1389776468276978, + "learning_rate": 5.666812615237747e-06, + "loss": 0.3238, + "step": 10679 + }, + { + "epoch": 1.2996653483419531, + "grad_norm": 1.4201276302337646, + "learning_rate": 5.6650546528074905e-06, + "loss": 0.3834, + "step": 10680 + }, + { + "epoch": 1.2997870398539701, + "grad_norm": 1.8664674758911133, + "learning_rate": 5.663296855330878e-06, + "loss": 0.3467, + "step": 10681 + }, + { + "epoch": 1.2999087313659872, + "grad_norm": 2.3464839458465576, + "learning_rate": 5.661539222874795e-06, + "loss": 0.3807, + "step": 10682 + }, + { + "epoch": 1.3000304228780042, + "grad_norm": 1.6741420030593872, + "learning_rate": 5.659781755506122e-06, + "loss": 0.3618, + "step": 10683 + }, + { + "epoch": 1.3001521143900212, + "grad_norm": 1.6574125289916992, + "learning_rate": 5.658024453291741e-06, + "loss": 0.3533, + "step": 10684 + }, + { + "epoch": 1.3002738059020382, + "grad_norm": 1.9747450351715088, + "learning_rate": 5.656267316298517e-06, + "loss": 0.343, + "step": 10685 + }, + { + "epoch": 1.3003954974140552, + "grad_norm": 2.5942485332489014, + "learning_rate": 5.654510344593309e-06, + "loss": 0.2748, + "step": 10686 + }, + { + "epoch": 1.3005171889260725, + "grad_norm": 1.6068029403686523, + "learning_rate": 5.652753538242981e-06, + "loss": 0.3765, + "step": 10687 + }, + { + "epoch": 1.3006388804380895, + "grad_norm": 1.290798306465149, + "learning_rate": 5.650996897314374e-06, + "loss": 0.2886, + "step": 10688 + }, + { + "epoch": 1.3007605719501065, + "grad_norm": 2.5814549922943115, + "learning_rate": 5.649240421874342e-06, + "loss": 0.3965, + "step": 10689 + }, + { + "epoch": 1.3008822634621235, + "grad_norm": 1.8892725706100464, + "learning_rate": 5.647484111989716e-06, + "loss": 0.3884, + "step": 10690 + }, + { + "epoch": 1.3010039549741406, + "grad_norm": 1.7181422710418701, + "learning_rate": 5.645727967727327e-06, + "loss": 0.3445, + "step": 10691 + }, + { + "epoch": 1.3011256464861576, + "grad_norm": 2.149254560470581, + "learning_rate": 5.643971989154009e-06, + "loss": 0.3446, + "step": 10692 + }, + { + "epoch": 1.3012473379981746, + "grad_norm": 1.5193428993225098, + "learning_rate": 5.642216176336568e-06, + "loss": 0.3554, + "step": 10693 + }, + { + "epoch": 1.3013690295101916, + "grad_norm": 1.44844388961792, + "learning_rate": 5.640460529341819e-06, + "loss": 0.351, + "step": 10694 + }, + { + "epoch": 1.3014907210222086, + "grad_norm": 2.9043166637420654, + "learning_rate": 5.638705048236574e-06, + "loss": 0.3533, + "step": 10695 + }, + { + "epoch": 1.3016124125342257, + "grad_norm": 2.613502025604248, + "learning_rate": 5.636949733087626e-06, + "loss": 0.3205, + "step": 10696 + }, + { + "epoch": 1.301734104046243, + "grad_norm": 1.4322253465652466, + "learning_rate": 5.635194583961775e-06, + "loss": 0.3664, + "step": 10697 + }, + { + "epoch": 1.30185579555826, + "grad_norm": 2.0034613609313965, + "learning_rate": 5.633439600925805e-06, + "loss": 0.4041, + "step": 10698 + }, + { + "epoch": 1.301977487070277, + "grad_norm": 1.4181780815124512, + "learning_rate": 5.631684784046492e-06, + "loss": 0.3659, + "step": 10699 + }, + { + "epoch": 1.302099178582294, + "grad_norm": 1.8612855672836304, + "learning_rate": 5.6299301333906194e-06, + "loss": 0.3833, + "step": 10700 + }, + { + "epoch": 1.302220870094311, + "grad_norm": 2.5197055339813232, + "learning_rate": 5.628175649024951e-06, + "loss": 0.4205, + "step": 10701 + }, + { + "epoch": 1.302342561606328, + "grad_norm": 2.4088521003723145, + "learning_rate": 5.626421331016243e-06, + "loss": 0.4156, + "step": 10702 + }, + { + "epoch": 1.302464253118345, + "grad_norm": 1.8909119367599487, + "learning_rate": 5.624667179431265e-06, + "loss": 0.3487, + "step": 10703 + }, + { + "epoch": 1.302585944630362, + "grad_norm": 2.771806478500366, + "learning_rate": 5.62291319433675e-06, + "loss": 0.4214, + "step": 10704 + }, + { + "epoch": 1.302707636142379, + "grad_norm": 2.0864429473876953, + "learning_rate": 5.621159375799452e-06, + "loss": 0.3821, + "step": 10705 + }, + { + "epoch": 1.302829327654396, + "grad_norm": 2.33351993560791, + "learning_rate": 5.619405723886102e-06, + "loss": 0.4004, + "step": 10706 + }, + { + "epoch": 1.302951019166413, + "grad_norm": 1.4285186529159546, + "learning_rate": 5.6176522386634294e-06, + "loss": 0.3605, + "step": 10707 + }, + { + "epoch": 1.3030727106784301, + "grad_norm": 1.5547945499420166, + "learning_rate": 5.615898920198162e-06, + "loss": 0.3996, + "step": 10708 + }, + { + "epoch": 1.3031944021904471, + "grad_norm": 2.567091703414917, + "learning_rate": 5.614145768557017e-06, + "loss": 0.3799, + "step": 10709 + }, + { + "epoch": 1.3033160937024642, + "grad_norm": 2.726357936859131, + "learning_rate": 5.612392783806699e-06, + "loss": 0.3692, + "step": 10710 + }, + { + "epoch": 1.3034377852144812, + "grad_norm": 1.7168859243392944, + "learning_rate": 5.610639966013921e-06, + "loss": 0.3679, + "step": 10711 + }, + { + "epoch": 1.3035594767264982, + "grad_norm": 3.060255289077759, + "learning_rate": 5.608887315245378e-06, + "loss": 0.3629, + "step": 10712 + }, + { + "epoch": 1.3036811682385154, + "grad_norm": 1.4523892402648926, + "learning_rate": 5.6071348315677585e-06, + "loss": 0.304, + "step": 10713 + }, + { + "epoch": 1.3038028597505325, + "grad_norm": 1.5616751909255981, + "learning_rate": 5.605382515047755e-06, + "loss": 0.3587, + "step": 10714 + }, + { + "epoch": 1.3039245512625495, + "grad_norm": 1.6984819173812866, + "learning_rate": 5.603630365752043e-06, + "loss": 0.3928, + "step": 10715 + }, + { + "epoch": 1.3040462427745665, + "grad_norm": 1.3906701803207397, + "learning_rate": 5.601878383747295e-06, + "loss": 0.3763, + "step": 10716 + }, + { + "epoch": 1.3041679342865835, + "grad_norm": 2.763263702392578, + "learning_rate": 5.6001265691001795e-06, + "loss": 0.3068, + "step": 10717 + }, + { + "epoch": 1.3042896257986005, + "grad_norm": 1.929206132888794, + "learning_rate": 5.5983749218773496e-06, + "loss": 0.4249, + "step": 10718 + }, + { + "epoch": 1.3044113173106175, + "grad_norm": 2.449132204055786, + "learning_rate": 5.59662344214547e-06, + "loss": 0.4549, + "step": 10719 + }, + { + "epoch": 1.3045330088226346, + "grad_norm": 1.7186609506607056, + "learning_rate": 5.594872129971183e-06, + "loss": 0.3823, + "step": 10720 + }, + { + "epoch": 1.3046547003346516, + "grad_norm": 1.771635890007019, + "learning_rate": 5.593120985421125e-06, + "loss": 0.3568, + "step": 10721 + }, + { + "epoch": 1.3047763918466688, + "grad_norm": 1.5166743993759155, + "learning_rate": 5.59137000856194e-06, + "loss": 0.384, + "step": 10722 + }, + { + "epoch": 1.3048980833586858, + "grad_norm": 1.4323418140411377, + "learning_rate": 5.58961919946025e-06, + "loss": 0.3916, + "step": 10723 + }, + { + "epoch": 1.3050197748707029, + "grad_norm": 2.5157432556152344, + "learning_rate": 5.587868558182676e-06, + "loss": 0.314, + "step": 10724 + }, + { + "epoch": 1.3051414663827199, + "grad_norm": 1.4859994649887085, + "learning_rate": 5.58611808479584e-06, + "loss": 0.3415, + "step": 10725 + }, + { + "epoch": 1.305263157894737, + "grad_norm": 1.9007946252822876, + "learning_rate": 5.584367779366346e-06, + "loss": 0.3763, + "step": 10726 + }, + { + "epoch": 1.305384849406754, + "grad_norm": 2.0490918159484863, + "learning_rate": 5.582617641960799e-06, + "loss": 0.3638, + "step": 10727 + }, + { + "epoch": 1.305506540918771, + "grad_norm": 1.6577391624450684, + "learning_rate": 5.580867672645793e-06, + "loss": 0.3861, + "step": 10728 + }, + { + "epoch": 1.305628232430788, + "grad_norm": 2.240692615509033, + "learning_rate": 5.579117871487915e-06, + "loss": 0.4245, + "step": 10729 + }, + { + "epoch": 1.305749923942805, + "grad_norm": 1.5075914859771729, + "learning_rate": 5.5773682385537575e-06, + "loss": 0.3699, + "step": 10730 + }, + { + "epoch": 1.305871615454822, + "grad_norm": 2.7460334300994873, + "learning_rate": 5.5756187739098945e-06, + "loss": 0.4151, + "step": 10731 + }, + { + "epoch": 1.305993306966839, + "grad_norm": 1.6418988704681396, + "learning_rate": 5.573869477622889e-06, + "loss": 0.3999, + "step": 10732 + }, + { + "epoch": 1.306114998478856, + "grad_norm": 1.6874862909317017, + "learning_rate": 5.5721203497593155e-06, + "loss": 0.3925, + "step": 10733 + }, + { + "epoch": 1.306236689990873, + "grad_norm": 1.4764375686645508, + "learning_rate": 5.5703713903857285e-06, + "loss": 0.3305, + "step": 10734 + }, + { + "epoch": 1.30635838150289, + "grad_norm": 1.5199888944625854, + "learning_rate": 5.568622599568674e-06, + "loss": 0.3697, + "step": 10735 + }, + { + "epoch": 1.306480073014907, + "grad_norm": 1.8549597263336182, + "learning_rate": 5.566873977374707e-06, + "loss": 0.3898, + "step": 10736 + }, + { + "epoch": 1.3066017645269241, + "grad_norm": 2.7467002868652344, + "learning_rate": 5.565125523870361e-06, + "loss": 0.3731, + "step": 10737 + }, + { + "epoch": 1.3067234560389414, + "grad_norm": 2.816716432571411, + "learning_rate": 5.5633772391221685e-06, + "loss": 0.3947, + "step": 10738 + }, + { + "epoch": 1.3068451475509584, + "grad_norm": 1.8271758556365967, + "learning_rate": 5.561629123196656e-06, + "loss": 0.3973, + "step": 10739 + }, + { + "epoch": 1.3069668390629754, + "grad_norm": 1.7588022947311401, + "learning_rate": 5.559881176160338e-06, + "loss": 0.4333, + "step": 10740 + }, + { + "epoch": 1.3070885305749924, + "grad_norm": 3.211850643157959, + "learning_rate": 5.558133398079735e-06, + "loss": 0.4231, + "step": 10741 + }, + { + "epoch": 1.3072102220870094, + "grad_norm": 2.4309945106506348, + "learning_rate": 5.55638578902135e-06, + "loss": 0.3266, + "step": 10742 + }, + { + "epoch": 1.3073319135990265, + "grad_norm": 2.169997215270996, + "learning_rate": 5.554638349051679e-06, + "loss": 0.3928, + "step": 10743 + }, + { + "epoch": 1.3074536051110435, + "grad_norm": 1.443649411201477, + "learning_rate": 5.552891078237227e-06, + "loss": 0.4134, + "step": 10744 + }, + { + "epoch": 1.3075752966230605, + "grad_norm": 3.0654757022857666, + "learning_rate": 5.551143976644473e-06, + "loss": 0.4624, + "step": 10745 + }, + { + "epoch": 1.3076969881350775, + "grad_norm": 1.5582939386367798, + "learning_rate": 5.5493970443398945e-06, + "loss": 0.3789, + "step": 10746 + }, + { + "epoch": 1.3078186796470948, + "grad_norm": 1.8564646244049072, + "learning_rate": 5.5476502813899755e-06, + "loss": 0.3987, + "step": 10747 + }, + { + "epoch": 1.3079403711591118, + "grad_norm": 1.7201308012008667, + "learning_rate": 5.545903687861176e-06, + "loss": 0.3771, + "step": 10748 + }, + { + "epoch": 1.3080620626711288, + "grad_norm": 1.351222038269043, + "learning_rate": 5.544157263819967e-06, + "loss": 0.3648, + "step": 10749 + }, + { + "epoch": 1.3081837541831458, + "grad_norm": 1.7537788152694702, + "learning_rate": 5.542411009332795e-06, + "loss": 0.3215, + "step": 10750 + }, + { + "epoch": 1.3083054456951628, + "grad_norm": 1.9958311319351196, + "learning_rate": 5.540664924466106e-06, + "loss": 0.3602, + "step": 10751 + }, + { + "epoch": 1.3084271372071798, + "grad_norm": 3.8747355937957764, + "learning_rate": 5.53891900928635e-06, + "loss": 0.4722, + "step": 10752 + }, + { + "epoch": 1.3085488287191969, + "grad_norm": 1.4203927516937256, + "learning_rate": 5.537173263859961e-06, + "loss": 0.3344, + "step": 10753 + }, + { + "epoch": 1.3086705202312139, + "grad_norm": 2.0952188968658447, + "learning_rate": 5.535427688253362e-06, + "loss": 0.4384, + "step": 10754 + }, + { + "epoch": 1.308792211743231, + "grad_norm": 2.8315587043762207, + "learning_rate": 5.5336822825329836e-06, + "loss": 0.3081, + "step": 10755 + }, + { + "epoch": 1.308913903255248, + "grad_norm": 2.5170958042144775, + "learning_rate": 5.531937046765237e-06, + "loss": 0.3943, + "step": 10756 + }, + { + "epoch": 1.309035594767265, + "grad_norm": 1.618561863899231, + "learning_rate": 5.530191981016537e-06, + "loss": 0.4014, + "step": 10757 + }, + { + "epoch": 1.309157286279282, + "grad_norm": 1.997988224029541, + "learning_rate": 5.5284470853532855e-06, + "loss": 0.3795, + "step": 10758 + }, + { + "epoch": 1.309278977791299, + "grad_norm": 1.512786865234375, + "learning_rate": 5.526702359841873e-06, + "loss": 0.4177, + "step": 10759 + }, + { + "epoch": 1.309400669303316, + "grad_norm": 1.3573386669158936, + "learning_rate": 5.524957804548698e-06, + "loss": 0.3428, + "step": 10760 + }, + { + "epoch": 1.309522360815333, + "grad_norm": 1.9457066059112549, + "learning_rate": 5.523213419540147e-06, + "loss": 0.4584, + "step": 10761 + }, + { + "epoch": 1.30964405232735, + "grad_norm": 2.474031925201416, + "learning_rate": 5.521469204882584e-06, + "loss": 0.451, + "step": 10762 + }, + { + "epoch": 1.3097657438393673, + "grad_norm": 1.7259163856506348, + "learning_rate": 5.519725160642391e-06, + "loss": 0.4013, + "step": 10763 + }, + { + "epoch": 1.3098874353513843, + "grad_norm": 3.171025514602661, + "learning_rate": 5.517981286885925e-06, + "loss": 0.3812, + "step": 10764 + }, + { + "epoch": 1.3100091268634013, + "grad_norm": 1.64792799949646, + "learning_rate": 5.516237583679554e-06, + "loss": 0.4041, + "step": 10765 + }, + { + "epoch": 1.3101308183754183, + "grad_norm": 2.7931742668151855, + "learning_rate": 5.514494051089622e-06, + "loss": 0.3502, + "step": 10766 + }, + { + "epoch": 1.3102525098874354, + "grad_norm": 1.377469778060913, + "learning_rate": 5.512750689182471e-06, + "loss": 0.4009, + "step": 10767 + }, + { + "epoch": 1.3103742013994524, + "grad_norm": 1.6480742692947388, + "learning_rate": 5.51100749802445e-06, + "loss": 0.3846, + "step": 10768 + }, + { + "epoch": 1.3104958929114694, + "grad_norm": 2.1429126262664795, + "learning_rate": 5.509264477681886e-06, + "loss": 0.3927, + "step": 10769 + }, + { + "epoch": 1.3106175844234864, + "grad_norm": 1.7459510564804077, + "learning_rate": 5.507521628221099e-06, + "loss": 0.4333, + "step": 10770 + }, + { + "epoch": 1.3107392759355034, + "grad_norm": 1.4798492193222046, + "learning_rate": 5.505778949708416e-06, + "loss": 0.3349, + "step": 10771 + }, + { + "epoch": 1.3108609674475205, + "grad_norm": 1.6968584060668945, + "learning_rate": 5.504036442210148e-06, + "loss": 0.3759, + "step": 10772 + }, + { + "epoch": 1.3109826589595377, + "grad_norm": 2.111764907836914, + "learning_rate": 5.502294105792598e-06, + "loss": 0.3781, + "step": 10773 + }, + { + "epoch": 1.3111043504715547, + "grad_norm": 1.8388253450393677, + "learning_rate": 5.50055194052207e-06, + "loss": 0.3338, + "step": 10774 + }, + { + "epoch": 1.3112260419835717, + "grad_norm": 3.487530469894409, + "learning_rate": 5.498809946464847e-06, + "loss": 0.3922, + "step": 10775 + }, + { + "epoch": 1.3113477334955888, + "grad_norm": 1.7745195627212524, + "learning_rate": 5.497068123687228e-06, + "loss": 0.3338, + "step": 10776 + }, + { + "epoch": 1.3114694250076058, + "grad_norm": 1.4318186044692993, + "learning_rate": 5.495326472255486e-06, + "loss": 0.3459, + "step": 10777 + }, + { + "epoch": 1.3115911165196228, + "grad_norm": 2.126408100128174, + "learning_rate": 5.4935849922358944e-06, + "loss": 0.3566, + "step": 10778 + }, + { + "epoch": 1.3117128080316398, + "grad_norm": 1.5190150737762451, + "learning_rate": 5.491843683694723e-06, + "loss": 0.3168, + "step": 10779 + }, + { + "epoch": 1.3118344995436568, + "grad_norm": 1.3851282596588135, + "learning_rate": 5.490102546698233e-06, + "loss": 0.3795, + "step": 10780 + }, + { + "epoch": 1.3119561910556738, + "grad_norm": 2.327291488647461, + "learning_rate": 5.48836158131267e-06, + "loss": 0.3821, + "step": 10781 + }, + { + "epoch": 1.3120778825676909, + "grad_norm": 2.2141382694244385, + "learning_rate": 5.486620787604292e-06, + "loss": 0.3996, + "step": 10782 + }, + { + "epoch": 1.3121995740797079, + "grad_norm": 2.753286838531494, + "learning_rate": 5.484880165639336e-06, + "loss": 0.4084, + "step": 10783 + }, + { + "epoch": 1.312321265591725, + "grad_norm": 1.3257920742034912, + "learning_rate": 5.483139715484035e-06, + "loss": 0.331, + "step": 10784 + }, + { + "epoch": 1.312442957103742, + "grad_norm": 1.4459072351455688, + "learning_rate": 5.481399437204618e-06, + "loss": 0.3779, + "step": 10785 + }, + { + "epoch": 1.312564648615759, + "grad_norm": 2.0659801959991455, + "learning_rate": 5.4796593308673e-06, + "loss": 0.3852, + "step": 10786 + }, + { + "epoch": 1.312686340127776, + "grad_norm": 1.7968263626098633, + "learning_rate": 5.477919396538305e-06, + "loss": 0.3869, + "step": 10787 + }, + { + "epoch": 1.3128080316397932, + "grad_norm": 2.095046281814575, + "learning_rate": 5.476179634283838e-06, + "loss": 0.3218, + "step": 10788 + }, + { + "epoch": 1.3129297231518102, + "grad_norm": 1.79927396774292, + "learning_rate": 5.474440044170095e-06, + "loss": 0.4439, + "step": 10789 + }, + { + "epoch": 1.3130514146638272, + "grad_norm": 1.4196207523345947, + "learning_rate": 5.472700626263281e-06, + "loss": 0.3987, + "step": 10790 + }, + { + "epoch": 1.3131731061758443, + "grad_norm": 1.6785361766815186, + "learning_rate": 5.470961380629578e-06, + "loss": 0.3916, + "step": 10791 + }, + { + "epoch": 1.3132947976878613, + "grad_norm": 2.620760917663574, + "learning_rate": 5.4692223073351665e-06, + "loss": 0.4446, + "step": 10792 + }, + { + "epoch": 1.3134164891998783, + "grad_norm": 2.1112852096557617, + "learning_rate": 5.467483406446228e-06, + "loss": 0.3836, + "step": 10793 + }, + { + "epoch": 1.3135381807118953, + "grad_norm": 2.9808385372161865, + "learning_rate": 5.465744678028926e-06, + "loss": 0.4082, + "step": 10794 + }, + { + "epoch": 1.3136598722239123, + "grad_norm": 1.2926630973815918, + "learning_rate": 5.464006122149426e-06, + "loss": 0.3539, + "step": 10795 + }, + { + "epoch": 1.3137815637359294, + "grad_norm": 2.858870506286621, + "learning_rate": 5.462267738873881e-06, + "loss": 0.349, + "step": 10796 + }, + { + "epoch": 1.3139032552479464, + "grad_norm": 3.8252112865448, + "learning_rate": 5.4605295282684375e-06, + "loss": 0.3765, + "step": 10797 + }, + { + "epoch": 1.3140249467599636, + "grad_norm": 2.7853469848632812, + "learning_rate": 5.458791490399247e-06, + "loss": 0.3613, + "step": 10798 + }, + { + "epoch": 1.3141466382719806, + "grad_norm": 1.9212602376937866, + "learning_rate": 5.457053625332438e-06, + "loss": 0.4021, + "step": 10799 + }, + { + "epoch": 1.3142683297839977, + "grad_norm": 1.397031545639038, + "learning_rate": 5.455315933134139e-06, + "loss": 0.3773, + "step": 10800 + }, + { + "epoch": 1.3143900212960147, + "grad_norm": 1.456973910331726, + "learning_rate": 5.45357841387048e-06, + "loss": 0.3913, + "step": 10801 + }, + { + "epoch": 1.3145117128080317, + "grad_norm": 3.196429491043091, + "learning_rate": 5.451841067607572e-06, + "loss": 0.4016, + "step": 10802 + }, + { + "epoch": 1.3146334043200487, + "grad_norm": 2.8266994953155518, + "learning_rate": 5.450103894411522e-06, + "loss": 0.3874, + "step": 10803 + }, + { + "epoch": 1.3147550958320657, + "grad_norm": 2.5611789226531982, + "learning_rate": 5.44836689434844e-06, + "loss": 0.4418, + "step": 10804 + }, + { + "epoch": 1.3148767873440828, + "grad_norm": 1.2674380540847778, + "learning_rate": 5.446630067484419e-06, + "loss": 0.3815, + "step": 10805 + }, + { + "epoch": 1.3149984788560998, + "grad_norm": 2.272742748260498, + "learning_rate": 5.4448934138855466e-06, + "loss": 0.3356, + "step": 10806 + }, + { + "epoch": 1.3151201703681168, + "grad_norm": 2.0775740146636963, + "learning_rate": 5.443156933617915e-06, + "loss": 0.4188, + "step": 10807 + }, + { + "epoch": 1.3152418618801338, + "grad_norm": 2.362074375152588, + "learning_rate": 5.441420626747586e-06, + "loss": 0.3616, + "step": 10808 + }, + { + "epoch": 1.3153635533921508, + "grad_norm": 1.909151554107666, + "learning_rate": 5.439684493340644e-06, + "loss": 0.4277, + "step": 10809 + }, + { + "epoch": 1.3154852449041678, + "grad_norm": 1.517953872680664, + "learning_rate": 5.437948533463145e-06, + "loss": 0.3889, + "step": 10810 + }, + { + "epoch": 1.3156069364161849, + "grad_norm": 3.1799468994140625, + "learning_rate": 5.436212747181142e-06, + "loss": 0.4338, + "step": 10811 + }, + { + "epoch": 1.3157286279282019, + "grad_norm": 3.3358051776885986, + "learning_rate": 5.4344771345606965e-06, + "loss": 0.3598, + "step": 10812 + }, + { + "epoch": 1.315850319440219, + "grad_norm": 2.6447231769561768, + "learning_rate": 5.432741695667845e-06, + "loss": 0.3384, + "step": 10813 + }, + { + "epoch": 1.3159720109522361, + "grad_norm": 1.3671926259994507, + "learning_rate": 5.431006430568622e-06, + "loss": 0.3934, + "step": 10814 + }, + { + "epoch": 1.3160937024642532, + "grad_norm": 1.8913819789886475, + "learning_rate": 5.429271339329065e-06, + "loss": 0.4453, + "step": 10815 + }, + { + "epoch": 1.3162153939762702, + "grad_norm": 2.112213134765625, + "learning_rate": 5.42753642201519e-06, + "loss": 0.4043, + "step": 10816 + }, + { + "epoch": 1.3163370854882872, + "grad_norm": 1.8930308818817139, + "learning_rate": 5.425801678693023e-06, + "loss": 0.4618, + "step": 10817 + }, + { + "epoch": 1.3164587770003042, + "grad_norm": 1.4577845335006714, + "learning_rate": 5.424067109428574e-06, + "loss": 0.3792, + "step": 10818 + }, + { + "epoch": 1.3165804685123212, + "grad_norm": 2.575348138809204, + "learning_rate": 5.422332714287837e-06, + "loss": 0.3006, + "step": 10819 + }, + { + "epoch": 1.3167021600243383, + "grad_norm": 1.3824480772018433, + "learning_rate": 5.420598493336818e-06, + "loss": 0.3531, + "step": 10820 + }, + { + "epoch": 1.3168238515363553, + "grad_norm": 1.181577444076538, + "learning_rate": 5.418864446641505e-06, + "loss": 0.3998, + "step": 10821 + }, + { + "epoch": 1.3169455430483723, + "grad_norm": 1.3310928344726562, + "learning_rate": 5.41713057426788e-06, + "loss": 0.3639, + "step": 10822 + }, + { + "epoch": 1.3170672345603895, + "grad_norm": 1.439767837524414, + "learning_rate": 5.4153968762819256e-06, + "loss": 0.3961, + "step": 10823 + }, + { + "epoch": 1.3171889260724066, + "grad_norm": 1.7048437595367432, + "learning_rate": 5.413663352749605e-06, + "loss": 0.3459, + "step": 10824 + }, + { + "epoch": 1.3173106175844236, + "grad_norm": 1.9653639793395996, + "learning_rate": 5.411930003736892e-06, + "loss": 0.3475, + "step": 10825 + }, + { + "epoch": 1.3174323090964406, + "grad_norm": 2.0754644870758057, + "learning_rate": 5.410196829309738e-06, + "loss": 0.4101, + "step": 10826 + }, + { + "epoch": 1.3175540006084576, + "grad_norm": 2.066164255142212, + "learning_rate": 5.4084638295340915e-06, + "loss": 0.343, + "step": 10827 + }, + { + "epoch": 1.3176756921204746, + "grad_norm": 1.9408520460128784, + "learning_rate": 5.406731004475905e-06, + "loss": 0.3826, + "step": 10828 + }, + { + "epoch": 1.3177973836324917, + "grad_norm": 1.6110838651657104, + "learning_rate": 5.40499835420111e-06, + "loss": 0.3858, + "step": 10829 + }, + { + "epoch": 1.3179190751445087, + "grad_norm": 2.0025970935821533, + "learning_rate": 5.40326587877564e-06, + "loss": 0.4049, + "step": 10830 + }, + { + "epoch": 1.3180407666565257, + "grad_norm": 1.7093302011489868, + "learning_rate": 5.401533578265416e-06, + "loss": 0.3293, + "step": 10831 + }, + { + "epoch": 1.3181624581685427, + "grad_norm": 1.944166660308838, + "learning_rate": 5.399801452736356e-06, + "loss": 0.3493, + "step": 10832 + }, + { + "epoch": 1.3182841496805597, + "grad_norm": 1.8357899188995361, + "learning_rate": 5.3980695022543764e-06, + "loss": 0.3223, + "step": 10833 + }, + { + "epoch": 1.3184058411925768, + "grad_norm": 1.4382829666137695, + "learning_rate": 5.396337726885377e-06, + "loss": 0.3225, + "step": 10834 + }, + { + "epoch": 1.3185275327045938, + "grad_norm": 1.2934057712554932, + "learning_rate": 5.394606126695252e-06, + "loss": 0.3611, + "step": 10835 + }, + { + "epoch": 1.3186492242166108, + "grad_norm": 2.2172439098358154, + "learning_rate": 5.3928747017499e-06, + "loss": 0.2899, + "step": 10836 + }, + { + "epoch": 1.3187709157286278, + "grad_norm": 1.8254777193069458, + "learning_rate": 5.391143452115203e-06, + "loss": 0.3467, + "step": 10837 + }, + { + "epoch": 1.3188926072406448, + "grad_norm": 1.2833013534545898, + "learning_rate": 5.389412377857033e-06, + "loss": 0.318, + "step": 10838 + }, + { + "epoch": 1.319014298752662, + "grad_norm": 1.3253841400146484, + "learning_rate": 5.387681479041269e-06, + "loss": 0.2984, + "step": 10839 + }, + { + "epoch": 1.319135990264679, + "grad_norm": 2.0427513122558594, + "learning_rate": 5.385950755733773e-06, + "loss": 0.4273, + "step": 10840 + }, + { + "epoch": 1.3192576817766961, + "grad_norm": 1.7031419277191162, + "learning_rate": 5.384220208000396e-06, + "loss": 0.388, + "step": 10841 + }, + { + "epoch": 1.3193793732887131, + "grad_norm": 1.6727646589279175, + "learning_rate": 5.382489835907002e-06, + "loss": 0.3427, + "step": 10842 + }, + { + "epoch": 1.3195010648007302, + "grad_norm": 1.460808277130127, + "learning_rate": 5.380759639519421e-06, + "loss": 0.343, + "step": 10843 + }, + { + "epoch": 1.3196227563127472, + "grad_norm": 1.7380757331848145, + "learning_rate": 5.3790296189035e-06, + "loss": 0.3006, + "step": 10844 + }, + { + "epoch": 1.3197444478247642, + "grad_norm": 3.1223533153533936, + "learning_rate": 5.3772997741250665e-06, + "loss": 0.4569, + "step": 10845 + }, + { + "epoch": 1.3198661393367812, + "grad_norm": 2.2328264713287354, + "learning_rate": 5.375570105249941e-06, + "loss": 0.4092, + "step": 10846 + }, + { + "epoch": 1.3199878308487982, + "grad_norm": 2.401493787765503, + "learning_rate": 5.37384061234395e-06, + "loss": 0.4062, + "step": 10847 + }, + { + "epoch": 1.3201095223608155, + "grad_norm": 1.501154899597168, + "learning_rate": 5.372111295472899e-06, + "loss": 0.3733, + "step": 10848 + }, + { + "epoch": 1.3202312138728325, + "grad_norm": 2.488508701324463, + "learning_rate": 5.370382154702589e-06, + "loss": 0.3935, + "step": 10849 + }, + { + "epoch": 1.3203529053848495, + "grad_norm": 2.3892276287078857, + "learning_rate": 5.368653190098824e-06, + "loss": 0.4005, + "step": 10850 + }, + { + "epoch": 1.3204745968968665, + "grad_norm": 3.7241599559783936, + "learning_rate": 5.366924401727391e-06, + "loss": 0.3493, + "step": 10851 + }, + { + "epoch": 1.3205962884088835, + "grad_norm": 3.4036762714385986, + "learning_rate": 5.365195789654072e-06, + "loss": 0.4669, + "step": 10852 + }, + { + "epoch": 1.3207179799209006, + "grad_norm": 1.8968693017959595, + "learning_rate": 5.3634673539446535e-06, + "loss": 0.3649, + "step": 10853 + }, + { + "epoch": 1.3208396714329176, + "grad_norm": 2.0806210041046143, + "learning_rate": 5.361739094664891e-06, + "loss": 0.357, + "step": 10854 + }, + { + "epoch": 1.3209613629449346, + "grad_norm": 2.1857049465179443, + "learning_rate": 5.360011011880562e-06, + "loss": 0.331, + "step": 10855 + }, + { + "epoch": 1.3210830544569516, + "grad_norm": 3.2242050170898438, + "learning_rate": 5.358283105657417e-06, + "loss": 0.3365, + "step": 10856 + }, + { + "epoch": 1.3212047459689686, + "grad_norm": 1.7904189825057983, + "learning_rate": 5.356555376061204e-06, + "loss": 0.3895, + "step": 10857 + }, + { + "epoch": 1.3213264374809857, + "grad_norm": 2.237102508544922, + "learning_rate": 5.354827823157674e-06, + "loss": 0.3828, + "step": 10858 + }, + { + "epoch": 1.3214481289930027, + "grad_norm": 2.2091522216796875, + "learning_rate": 5.35310044701256e-06, + "loss": 0.3804, + "step": 10859 + }, + { + "epoch": 1.3215698205050197, + "grad_norm": 2.3677759170532227, + "learning_rate": 5.35137324769159e-06, + "loss": 0.4076, + "step": 10860 + }, + { + "epoch": 1.3216915120170367, + "grad_norm": 2.804358720779419, + "learning_rate": 5.349646225260494e-06, + "loss": 0.4119, + "step": 10861 + }, + { + "epoch": 1.3218132035290537, + "grad_norm": 2.1119935512542725, + "learning_rate": 5.347919379784984e-06, + "loss": 0.3886, + "step": 10862 + }, + { + "epoch": 1.3219348950410708, + "grad_norm": 1.7128105163574219, + "learning_rate": 5.3461927113307664e-06, + "loss": 0.3656, + "step": 10863 + }, + { + "epoch": 1.322056586553088, + "grad_norm": 1.809964895248413, + "learning_rate": 5.344466219963553e-06, + "loss": 0.4227, + "step": 10864 + }, + { + "epoch": 1.322178278065105, + "grad_norm": 1.4565883874893188, + "learning_rate": 5.342739905749038e-06, + "loss": 0.3923, + "step": 10865 + }, + { + "epoch": 1.322299969577122, + "grad_norm": 1.6060867309570312, + "learning_rate": 5.341013768752908e-06, + "loss": 0.3665, + "step": 10866 + }, + { + "epoch": 1.322421661089139, + "grad_norm": 2.354559898376465, + "learning_rate": 5.33928780904085e-06, + "loss": 0.3822, + "step": 10867 + }, + { + "epoch": 1.322543352601156, + "grad_norm": 2.4192521572113037, + "learning_rate": 5.3375620266785315e-06, + "loss": 0.3545, + "step": 10868 + }, + { + "epoch": 1.322665044113173, + "grad_norm": 1.8301217555999756, + "learning_rate": 5.335836421731633e-06, + "loss": 0.3552, + "step": 10869 + }, + { + "epoch": 1.3227867356251901, + "grad_norm": 1.254237174987793, + "learning_rate": 5.3341109942658135e-06, + "loss": 0.344, + "step": 10870 + }, + { + "epoch": 1.3229084271372071, + "grad_norm": 2.161576509475708, + "learning_rate": 5.332385744346724e-06, + "loss": 0.351, + "step": 10871 + }, + { + "epoch": 1.3230301186492242, + "grad_norm": 2.287412405014038, + "learning_rate": 5.3306606720400224e-06, + "loss": 0.3964, + "step": 10872 + }, + { + "epoch": 1.3231518101612412, + "grad_norm": 2.5778119564056396, + "learning_rate": 5.328935777411348e-06, + "loss": 0.4066, + "step": 10873 + }, + { + "epoch": 1.3232735016732584, + "grad_norm": 2.265530586242676, + "learning_rate": 5.327211060526331e-06, + "loss": 0.398, + "step": 10874 + }, + { + "epoch": 1.3233951931852754, + "grad_norm": 1.5640454292297363, + "learning_rate": 5.3254865214506095e-06, + "loss": 0.3738, + "step": 10875 + }, + { + "epoch": 1.3235168846972925, + "grad_norm": 1.3931158781051636, + "learning_rate": 5.323762160249802e-06, + "loss": 0.346, + "step": 10876 + }, + { + "epoch": 1.3236385762093095, + "grad_norm": 2.767368793487549, + "learning_rate": 5.322037976989522e-06, + "loss": 0.4648, + "step": 10877 + }, + { + "epoch": 1.3237602677213265, + "grad_norm": 1.3860892057418823, + "learning_rate": 5.320313971735383e-06, + "loss": 0.3881, + "step": 10878 + }, + { + "epoch": 1.3238819592333435, + "grad_norm": 2.4217209815979004, + "learning_rate": 5.318590144552978e-06, + "loss": 0.4659, + "step": 10879 + }, + { + "epoch": 1.3240036507453605, + "grad_norm": 2.8353214263916016, + "learning_rate": 5.316866495507911e-06, + "loss": 0.3475, + "step": 10880 + }, + { + "epoch": 1.3241253422573775, + "grad_norm": 1.6553510427474976, + "learning_rate": 5.315143024665769e-06, + "loss": 0.3787, + "step": 10881 + }, + { + "epoch": 1.3242470337693946, + "grad_norm": 1.603233814239502, + "learning_rate": 5.3134197320921265e-06, + "loss": 0.4088, + "step": 10882 + }, + { + "epoch": 1.3243687252814116, + "grad_norm": 2.316512107849121, + "learning_rate": 5.3116966178525685e-06, + "loss": 0.3828, + "step": 10883 + }, + { + "epoch": 1.3244904167934286, + "grad_norm": 1.7286615371704102, + "learning_rate": 5.309973682012655e-06, + "loss": 0.3632, + "step": 10884 + }, + { + "epoch": 1.3246121083054456, + "grad_norm": 1.503751516342163, + "learning_rate": 5.308250924637955e-06, + "loss": 0.4247, + "step": 10885 + }, + { + "epoch": 1.3247337998174626, + "grad_norm": 2.024657964706421, + "learning_rate": 5.306528345794018e-06, + "loss": 0.359, + "step": 10886 + }, + { + "epoch": 1.3248554913294797, + "grad_norm": 1.592625379562378, + "learning_rate": 5.304805945546388e-06, + "loss": 0.3936, + "step": 10887 + }, + { + "epoch": 1.3249771828414967, + "grad_norm": 2.4652585983276367, + "learning_rate": 5.303083723960619e-06, + "loss": 0.3413, + "step": 10888 + }, + { + "epoch": 1.325098874353514, + "grad_norm": 1.7822929620742798, + "learning_rate": 5.301361681102233e-06, + "loss": 0.3564, + "step": 10889 + }, + { + "epoch": 1.325220565865531, + "grad_norm": 1.8490363359451294, + "learning_rate": 5.299639817036754e-06, + "loss": 0.377, + "step": 10890 + }, + { + "epoch": 1.325342257377548, + "grad_norm": 1.8932664394378662, + "learning_rate": 5.297918131829716e-06, + "loss": 0.3983, + "step": 10891 + }, + { + "epoch": 1.325463948889565, + "grad_norm": 1.6952601671218872, + "learning_rate": 5.296196625546621e-06, + "loss": 0.4208, + "step": 10892 + }, + { + "epoch": 1.325585640401582, + "grad_norm": 1.800462245941162, + "learning_rate": 5.294475298252985e-06, + "loss": 0.3771, + "step": 10893 + }, + { + "epoch": 1.325707331913599, + "grad_norm": 1.980384349822998, + "learning_rate": 5.292754150014303e-06, + "loss": 0.3828, + "step": 10894 + }, + { + "epoch": 1.325829023425616, + "grad_norm": 1.3604977130889893, + "learning_rate": 5.2910331808960655e-06, + "loss": 0.3379, + "step": 10895 + }, + { + "epoch": 1.325950714937633, + "grad_norm": 1.4096249341964722, + "learning_rate": 5.289312390963766e-06, + "loss": 0.3629, + "step": 10896 + }, + { + "epoch": 1.32607240644965, + "grad_norm": 1.9162101745605469, + "learning_rate": 5.287591780282881e-06, + "loss": 0.4101, + "step": 10897 + }, + { + "epoch": 1.326194097961667, + "grad_norm": 1.796937108039856, + "learning_rate": 5.2858713489188795e-06, + "loss": 0.3996, + "step": 10898 + }, + { + "epoch": 1.3263157894736843, + "grad_norm": 2.8475372791290283, + "learning_rate": 5.2841510969372375e-06, + "loss": 0.354, + "step": 10899 + }, + { + "epoch": 1.3264374809857014, + "grad_norm": 1.438982367515564, + "learning_rate": 5.282431024403401e-06, + "loss": 0.3605, + "step": 10900 + }, + { + "epoch": 1.3265591724977184, + "grad_norm": 1.7382758855819702, + "learning_rate": 5.280711131382832e-06, + "loss": 0.3984, + "step": 10901 + }, + { + "epoch": 1.3266808640097354, + "grad_norm": 3.1082699298858643, + "learning_rate": 5.278991417940973e-06, + "loss": 0.3469, + "step": 10902 + }, + { + "epoch": 1.3268025555217524, + "grad_norm": 1.4664252996444702, + "learning_rate": 5.277271884143259e-06, + "loss": 0.3312, + "step": 10903 + }, + { + "epoch": 1.3269242470337694, + "grad_norm": 1.8658509254455566, + "learning_rate": 5.2755525300551295e-06, + "loss": 0.3507, + "step": 10904 + }, + { + "epoch": 1.3270459385457865, + "grad_norm": 2.7156667709350586, + "learning_rate": 5.2738333557420045e-06, + "loss": 0.3672, + "step": 10905 + }, + { + "epoch": 1.3271676300578035, + "grad_norm": 2.6489243507385254, + "learning_rate": 5.272114361269298e-06, + "loss": 0.3829, + "step": 10906 + }, + { + "epoch": 1.3272893215698205, + "grad_norm": 1.456113576889038, + "learning_rate": 5.270395546702433e-06, + "loss": 0.307, + "step": 10907 + }, + { + "epoch": 1.3274110130818375, + "grad_norm": 1.3985824584960938, + "learning_rate": 5.268676912106805e-06, + "loss": 0.368, + "step": 10908 + }, + { + "epoch": 1.3275327045938545, + "grad_norm": 2.178978204727173, + "learning_rate": 5.26695845754781e-06, + "loss": 0.4253, + "step": 10909 + }, + { + "epoch": 1.3276543961058715, + "grad_norm": 4.004980087280273, + "learning_rate": 5.2652401830908474e-06, + "loss": 0.3909, + "step": 10910 + }, + { + "epoch": 1.3277760876178886, + "grad_norm": 1.9495490789413452, + "learning_rate": 5.263522088801296e-06, + "loss": 0.4048, + "step": 10911 + }, + { + "epoch": 1.3278977791299056, + "grad_norm": 1.7496061325073242, + "learning_rate": 5.261804174744533e-06, + "loss": 0.4121, + "step": 10912 + }, + { + "epoch": 1.3280194706419226, + "grad_norm": 2.009120225906372, + "learning_rate": 5.2600864409859275e-06, + "loss": 0.3716, + "step": 10913 + }, + { + "epoch": 1.3281411621539396, + "grad_norm": 1.8285285234451294, + "learning_rate": 5.258368887590841e-06, + "loss": 0.3674, + "step": 10914 + }, + { + "epoch": 1.3282628536659569, + "grad_norm": 1.4351909160614014, + "learning_rate": 5.256651514624638e-06, + "loss": 0.355, + "step": 10915 + }, + { + "epoch": 1.3283845451779739, + "grad_norm": 1.6309455633163452, + "learning_rate": 5.254934322152662e-06, + "loss": 0.3654, + "step": 10916 + }, + { + "epoch": 1.328506236689991, + "grad_norm": 1.6981252431869507, + "learning_rate": 5.253217310240253e-06, + "loss": 0.3963, + "step": 10917 + }, + { + "epoch": 1.328627928202008, + "grad_norm": 1.5298811197280884, + "learning_rate": 5.2515004789527535e-06, + "loss": 0.3659, + "step": 10918 + }, + { + "epoch": 1.328749619714025, + "grad_norm": 1.5758899450302124, + "learning_rate": 5.249783828355492e-06, + "loss": 0.3805, + "step": 10919 + }, + { + "epoch": 1.328871311226042, + "grad_norm": 4.301183700561523, + "learning_rate": 5.248067358513782e-06, + "loss": 0.3676, + "step": 10920 + }, + { + "epoch": 1.328993002738059, + "grad_norm": 1.7789143323898315, + "learning_rate": 5.24635106949295e-06, + "loss": 0.3816, + "step": 10921 + }, + { + "epoch": 1.329114694250076, + "grad_norm": 2.5354020595550537, + "learning_rate": 5.244634961358299e-06, + "loss": 0.3886, + "step": 10922 + }, + { + "epoch": 1.329236385762093, + "grad_norm": 1.7551963329315186, + "learning_rate": 5.242919034175131e-06, + "loss": 0.369, + "step": 10923 + }, + { + "epoch": 1.3293580772741103, + "grad_norm": 2.167905569076538, + "learning_rate": 5.2412032880087405e-06, + "loss": 0.3587, + "step": 10924 + }, + { + "epoch": 1.3294797687861273, + "grad_norm": 2.0046024322509766, + "learning_rate": 5.23948772292441e-06, + "loss": 0.4379, + "step": 10925 + }, + { + "epoch": 1.3296014602981443, + "grad_norm": 2.3789079189300537, + "learning_rate": 5.23777233898743e-06, + "loss": 0.3971, + "step": 10926 + }, + { + "epoch": 1.3297231518101613, + "grad_norm": 2.103271484375, + "learning_rate": 5.236057136263071e-06, + "loss": 0.4176, + "step": 10927 + }, + { + "epoch": 1.3298448433221783, + "grad_norm": 2.9267852306365967, + "learning_rate": 5.234342114816594e-06, + "loss": 0.2982, + "step": 10928 + }, + { + "epoch": 1.3299665348341954, + "grad_norm": 1.86089289188385, + "learning_rate": 5.232627274713267e-06, + "loss": 0.3925, + "step": 10929 + }, + { + "epoch": 1.3300882263462124, + "grad_norm": 2.7386250495910645, + "learning_rate": 5.230912616018341e-06, + "loss": 0.3149, + "step": 10930 + }, + { + "epoch": 1.3302099178582294, + "grad_norm": 2.2234046459198, + "learning_rate": 5.229198138797058e-06, + "loss": 0.3928, + "step": 10931 + }, + { + "epoch": 1.3303316093702464, + "grad_norm": 1.5488901138305664, + "learning_rate": 5.227483843114663e-06, + "loss": 0.3629, + "step": 10932 + }, + { + "epoch": 1.3304533008822634, + "grad_norm": 1.6239206790924072, + "learning_rate": 5.2257697290363875e-06, + "loss": 0.3808, + "step": 10933 + }, + { + "epoch": 1.3305749923942805, + "grad_norm": 1.7368719577789307, + "learning_rate": 5.2240557966274564e-06, + "loss": 0.3652, + "step": 10934 + }, + { + "epoch": 1.3306966839062975, + "grad_norm": 1.4475150108337402, + "learning_rate": 5.222342045953087e-06, + "loss": 0.3542, + "step": 10935 + }, + { + "epoch": 1.3308183754183145, + "grad_norm": 1.8466107845306396, + "learning_rate": 5.220628477078489e-06, + "loss": 0.3924, + "step": 10936 + }, + { + "epoch": 1.3309400669303315, + "grad_norm": 1.828380823135376, + "learning_rate": 5.218915090068873e-06, + "loss": 0.3881, + "step": 10937 + }, + { + "epoch": 1.3310617584423485, + "grad_norm": 2.7511560916900635, + "learning_rate": 5.217201884989435e-06, + "loss": 0.3828, + "step": 10938 + }, + { + "epoch": 1.3311834499543655, + "grad_norm": 1.4727199077606201, + "learning_rate": 5.21548886190536e-06, + "loss": 0.3536, + "step": 10939 + }, + { + "epoch": 1.3313051414663828, + "grad_norm": 1.5623347759246826, + "learning_rate": 5.213776020881842e-06, + "loss": 0.386, + "step": 10940 + }, + { + "epoch": 1.3314268329783998, + "grad_norm": 1.6486202478408813, + "learning_rate": 5.212063361984054e-06, + "loss": 0.3669, + "step": 10941 + }, + { + "epoch": 1.3315485244904168, + "grad_norm": 2.885010004043579, + "learning_rate": 5.21035088527716e-06, + "loss": 0.4332, + "step": 10942 + }, + { + "epoch": 1.3316702160024338, + "grad_norm": 1.7678548097610474, + "learning_rate": 5.208638590826334e-06, + "loss": 0.3471, + "step": 10943 + }, + { + "epoch": 1.3317919075144509, + "grad_norm": 2.591498374938965, + "learning_rate": 5.206926478696723e-06, + "loss": 0.3486, + "step": 10944 + }, + { + "epoch": 1.3319135990264679, + "grad_norm": 2.8749332427978516, + "learning_rate": 5.2052145489534885e-06, + "loss": 0.3323, + "step": 10945 + }, + { + "epoch": 1.332035290538485, + "grad_norm": 1.5610345602035522, + "learning_rate": 5.203502801661762e-06, + "loss": 0.3617, + "step": 10946 + }, + { + "epoch": 1.332156982050502, + "grad_norm": 2.0968525409698486, + "learning_rate": 5.2017912368866765e-06, + "loss": 0.3361, + "step": 10947 + }, + { + "epoch": 1.332278673562519, + "grad_norm": 1.8223997354507446, + "learning_rate": 5.2000798546933705e-06, + "loss": 0.3799, + "step": 10948 + }, + { + "epoch": 1.3324003650745362, + "grad_norm": 1.9765418767929077, + "learning_rate": 5.1983686551469615e-06, + "loss": 0.3482, + "step": 10949 + }, + { + "epoch": 1.3325220565865532, + "grad_norm": 2.3010082244873047, + "learning_rate": 5.196657638312561e-06, + "loss": 0.3744, + "step": 10950 + }, + { + "epoch": 1.3326437480985702, + "grad_norm": 1.559601902961731, + "learning_rate": 5.194946804255283e-06, + "loss": 0.3385, + "step": 10951 + }, + { + "epoch": 1.3327654396105872, + "grad_norm": 1.601814866065979, + "learning_rate": 5.1932361530402195e-06, + "loss": 0.3439, + "step": 10952 + }, + { + "epoch": 1.3328871311226043, + "grad_norm": 2.0287764072418213, + "learning_rate": 5.191525684732477e-06, + "loss": 0.2864, + "step": 10953 + }, + { + "epoch": 1.3330088226346213, + "grad_norm": 1.5020554065704346, + "learning_rate": 5.189815399397133e-06, + "loss": 0.2729, + "step": 10954 + }, + { + "epoch": 1.3331305141466383, + "grad_norm": 3.7550368309020996, + "learning_rate": 5.188105297099266e-06, + "loss": 0.4167, + "step": 10955 + }, + { + "epoch": 1.3332522056586553, + "grad_norm": 2.2288944721221924, + "learning_rate": 5.186395377903957e-06, + "loss": 0.3785, + "step": 10956 + }, + { + "epoch": 1.3333738971706723, + "grad_norm": 1.3996793031692505, + "learning_rate": 5.18468564187627e-06, + "loss": 0.3485, + "step": 10957 + }, + { + "epoch": 1.3334955886826894, + "grad_norm": 1.8882296085357666, + "learning_rate": 5.182976089081255e-06, + "loss": 0.3639, + "step": 10958 + }, + { + "epoch": 1.3336172801947064, + "grad_norm": 2.441636562347412, + "learning_rate": 5.181266719583974e-06, + "loss": 0.3695, + "step": 10959 + }, + { + "epoch": 1.3337389717067234, + "grad_norm": 3.7391064167022705, + "learning_rate": 5.179557533449464e-06, + "loss": 0.4286, + "step": 10960 + }, + { + "epoch": 1.3338606632187404, + "grad_norm": 2.417163133621216, + "learning_rate": 5.177848530742773e-06, + "loss": 0.3759, + "step": 10961 + }, + { + "epoch": 1.3339823547307574, + "grad_norm": 3.05417537689209, + "learning_rate": 5.1761397115289255e-06, + "loss": 0.3989, + "step": 10962 + }, + { + "epoch": 1.3341040462427745, + "grad_norm": 1.6889636516571045, + "learning_rate": 5.174431075872943e-06, + "loss": 0.4065, + "step": 10963 + }, + { + "epoch": 1.3342257377547915, + "grad_norm": 1.7248908281326294, + "learning_rate": 5.172722623839851e-06, + "loss": 0.4049, + "step": 10964 + }, + { + "epoch": 1.3343474292668087, + "grad_norm": 3.922619581222534, + "learning_rate": 5.171014355494654e-06, + "loss": 0.3583, + "step": 10965 + }, + { + "epoch": 1.3344691207788257, + "grad_norm": 2.3132288455963135, + "learning_rate": 5.169306270902354e-06, + "loss": 0.4408, + "step": 10966 + }, + { + "epoch": 1.3345908122908428, + "grad_norm": 2.1008007526397705, + "learning_rate": 5.167598370127952e-06, + "loss": 0.3859, + "step": 10967 + }, + { + "epoch": 1.3347125038028598, + "grad_norm": 2.025282859802246, + "learning_rate": 5.165890653236435e-06, + "loss": 0.3615, + "step": 10968 + }, + { + "epoch": 1.3348341953148768, + "grad_norm": 2.9155805110931396, + "learning_rate": 5.164183120292785e-06, + "loss": 0.3736, + "step": 10969 + }, + { + "epoch": 1.3349558868268938, + "grad_norm": 2.0699708461761475, + "learning_rate": 5.1624757713619766e-06, + "loss": 0.3756, + "step": 10970 + }, + { + "epoch": 1.3350775783389108, + "grad_norm": 2.333455801010132, + "learning_rate": 5.160768606508973e-06, + "loss": 0.3889, + "step": 10971 + }, + { + "epoch": 1.3351992698509279, + "grad_norm": 2.908857583999634, + "learning_rate": 5.159061625798747e-06, + "loss": 0.3909, + "step": 10972 + }, + { + "epoch": 1.3353209613629449, + "grad_norm": 2.964445114135742, + "learning_rate": 5.157354829296244e-06, + "loss": 0.3294, + "step": 10973 + }, + { + "epoch": 1.3354426528749619, + "grad_norm": 1.7204053401947021, + "learning_rate": 5.155648217066411e-06, + "loss": 0.3511, + "step": 10974 + }, + { + "epoch": 1.3355643443869791, + "grad_norm": 2.3303165435791016, + "learning_rate": 5.153941789174194e-06, + "loss": 0.3879, + "step": 10975 + }, + { + "epoch": 1.3356860358989961, + "grad_norm": 1.7581350803375244, + "learning_rate": 5.152235545684523e-06, + "loss": 0.4142, + "step": 10976 + }, + { + "epoch": 1.3358077274110132, + "grad_norm": 1.7393678426742554, + "learning_rate": 5.150529486662319e-06, + "loss": 0.3486, + "step": 10977 + }, + { + "epoch": 1.3359294189230302, + "grad_norm": 1.4734742641448975, + "learning_rate": 5.148823612172511e-06, + "loss": 0.3759, + "step": 10978 + }, + { + "epoch": 1.3360511104350472, + "grad_norm": 1.6412012577056885, + "learning_rate": 5.147117922280005e-06, + "loss": 0.3195, + "step": 10979 + }, + { + "epoch": 1.3361728019470642, + "grad_norm": 2.7351601123809814, + "learning_rate": 5.145412417049707e-06, + "loss": 0.3785, + "step": 10980 + }, + { + "epoch": 1.3362944934590812, + "grad_norm": 1.4758644104003906, + "learning_rate": 5.143707096546515e-06, + "loss": 0.3697, + "step": 10981 + }, + { + "epoch": 1.3364161849710983, + "grad_norm": 1.6413012742996216, + "learning_rate": 5.142001960835316e-06, + "loss": 0.3696, + "step": 10982 + }, + { + "epoch": 1.3365378764831153, + "grad_norm": 2.974024534225464, + "learning_rate": 5.140297009981003e-06, + "loss": 0.3704, + "step": 10983 + }, + { + "epoch": 1.3366595679951323, + "grad_norm": 4.0569167137146, + "learning_rate": 5.138592244048447e-06, + "loss": 0.4529, + "step": 10984 + }, + { + "epoch": 1.3367812595071493, + "grad_norm": 1.8888123035430908, + "learning_rate": 5.136887663102514e-06, + "loss": 0.3136, + "step": 10985 + }, + { + "epoch": 1.3369029510191663, + "grad_norm": 1.581889033317566, + "learning_rate": 5.135183267208077e-06, + "loss": 0.3576, + "step": 10986 + }, + { + "epoch": 1.3370246425311834, + "grad_norm": 2.990297317504883, + "learning_rate": 5.133479056429986e-06, + "loss": 0.3445, + "step": 10987 + }, + { + "epoch": 1.3371463340432004, + "grad_norm": 3.5888357162475586, + "learning_rate": 5.131775030833085e-06, + "loss": 0.4068, + "step": 10988 + }, + { + "epoch": 1.3372680255552174, + "grad_norm": 1.796064019203186, + "learning_rate": 5.130071190482225e-06, + "loss": 0.3147, + "step": 10989 + }, + { + "epoch": 1.3373897170672346, + "grad_norm": 2.4908409118652344, + "learning_rate": 5.128367535442237e-06, + "loss": 0.3955, + "step": 10990 + }, + { + "epoch": 1.3375114085792517, + "grad_norm": 1.7676397562026978, + "learning_rate": 5.126664065777947e-06, + "loss": 0.3926, + "step": 10991 + }, + { + "epoch": 1.3376331000912687, + "grad_norm": 1.6447612047195435, + "learning_rate": 5.124960781554178e-06, + "loss": 0.3353, + "step": 10992 + }, + { + "epoch": 1.3377547916032857, + "grad_norm": 1.9016059637069702, + "learning_rate": 5.123257682835737e-06, + "loss": 0.34, + "step": 10993 + }, + { + "epoch": 1.3378764831153027, + "grad_norm": 3.1524505615234375, + "learning_rate": 5.12155476968744e-06, + "loss": 0.3321, + "step": 10994 + }, + { + "epoch": 1.3379981746273197, + "grad_norm": 1.9542748928070068, + "learning_rate": 5.1198520421740816e-06, + "loss": 0.39, + "step": 10995 + }, + { + "epoch": 1.3381198661393368, + "grad_norm": 1.716983675956726, + "learning_rate": 5.1181495003604495e-06, + "loss": 0.3991, + "step": 10996 + }, + { + "epoch": 1.3382415576513538, + "grad_norm": 2.202787160873413, + "learning_rate": 5.116447144311339e-06, + "loss": 0.3492, + "step": 10997 + }, + { + "epoch": 1.3383632491633708, + "grad_norm": 1.6653982400894165, + "learning_rate": 5.114744974091523e-06, + "loss": 0.3534, + "step": 10998 + }, + { + "epoch": 1.3384849406753878, + "grad_norm": 1.9553024768829346, + "learning_rate": 5.113042989765769e-06, + "loss": 0.3404, + "step": 10999 + }, + { + "epoch": 1.338606632187405, + "grad_norm": 2.9529871940612793, + "learning_rate": 5.111341191398847e-06, + "loss": 0.3477, + "step": 11000 + }, + { + "epoch": 1.338728323699422, + "grad_norm": 1.39569091796875, + "learning_rate": 5.109639579055513e-06, + "loss": 0.3418, + "step": 11001 + }, + { + "epoch": 1.338850015211439, + "grad_norm": 1.9004137516021729, + "learning_rate": 5.1079381528005115e-06, + "loss": 0.3201, + "step": 11002 + }, + { + "epoch": 1.3389717067234561, + "grad_norm": 1.811944603919983, + "learning_rate": 5.106236912698597e-06, + "loss": 0.3368, + "step": 11003 + }, + { + "epoch": 1.3390933982354731, + "grad_norm": 1.8880418539047241, + "learning_rate": 5.1045358588144885e-06, + "loss": 0.3987, + "step": 11004 + }, + { + "epoch": 1.3392150897474902, + "grad_norm": 2.5061557292938232, + "learning_rate": 5.10283499121293e-06, + "loss": 0.3529, + "step": 11005 + }, + { + "epoch": 1.3393367812595072, + "grad_norm": 2.251542091369629, + "learning_rate": 5.101134309958634e-06, + "loss": 0.3721, + "step": 11006 + }, + { + "epoch": 1.3394584727715242, + "grad_norm": 3.39823317527771, + "learning_rate": 5.099433815116315e-06, + "loss": 0.4286, + "step": 11007 + }, + { + "epoch": 1.3395801642835412, + "grad_norm": 3.1394879817962646, + "learning_rate": 5.097733506750687e-06, + "loss": 0.3897, + "step": 11008 + }, + { + "epoch": 1.3397018557955582, + "grad_norm": 2.324920892715454, + "learning_rate": 5.0960333849264465e-06, + "loss": 0.3739, + "step": 11009 + }, + { + "epoch": 1.3398235473075752, + "grad_norm": 1.7832491397857666, + "learning_rate": 5.0943334497082815e-06, + "loss": 0.3615, + "step": 11010 + }, + { + "epoch": 1.3399452388195923, + "grad_norm": 1.780424952507019, + "learning_rate": 5.0926337011608875e-06, + "loss": 0.3665, + "step": 11011 + }, + { + "epoch": 1.3400669303316093, + "grad_norm": 2.180391788482666, + "learning_rate": 5.0909341393489345e-06, + "loss": 0.3355, + "step": 11012 + }, + { + "epoch": 1.3401886218436263, + "grad_norm": 1.671390175819397, + "learning_rate": 5.089234764337103e-06, + "loss": 0.3471, + "step": 11013 + }, + { + "epoch": 1.3403103133556433, + "grad_norm": 2.992978096008301, + "learning_rate": 5.087535576190057e-06, + "loss": 0.4249, + "step": 11014 + }, + { + "epoch": 1.3404320048676603, + "grad_norm": 1.7745660543441772, + "learning_rate": 5.0858365749724425e-06, + "loss": 0.3514, + "step": 11015 + }, + { + "epoch": 1.3405536963796776, + "grad_norm": 2.2330129146575928, + "learning_rate": 5.084137760748923e-06, + "loss": 0.4246, + "step": 11016 + }, + { + "epoch": 1.3406753878916946, + "grad_norm": 2.6199281215667725, + "learning_rate": 5.082439133584136e-06, + "loss": 0.4635, + "step": 11017 + }, + { + "epoch": 1.3407970794037116, + "grad_norm": 2.5350892543792725, + "learning_rate": 5.0807406935427165e-06, + "loss": 0.3737, + "step": 11018 + }, + { + "epoch": 1.3409187709157286, + "grad_norm": 2.2068848609924316, + "learning_rate": 5.0790424406892966e-06, + "loss": 0.3746, + "step": 11019 + }, + { + "epoch": 1.3410404624277457, + "grad_norm": 1.8008586168289185, + "learning_rate": 5.0773443750884965e-06, + "loss": 0.4095, + "step": 11020 + }, + { + "epoch": 1.3411621539397627, + "grad_norm": 3.852505922317505, + "learning_rate": 5.075646496804936e-06, + "loss": 0.3295, + "step": 11021 + }, + { + "epoch": 1.3412838454517797, + "grad_norm": 1.5841361284255981, + "learning_rate": 5.073948805903217e-06, + "loss": 0.3955, + "step": 11022 + }, + { + "epoch": 1.3414055369637967, + "grad_norm": 2.9455623626708984, + "learning_rate": 5.072251302447942e-06, + "loss": 0.3759, + "step": 11023 + }, + { + "epoch": 1.3415272284758137, + "grad_norm": 2.2216196060180664, + "learning_rate": 5.070553986503706e-06, + "loss": 0.374, + "step": 11024 + }, + { + "epoch": 1.341648919987831, + "grad_norm": 3.1855056285858154, + "learning_rate": 5.068856858135094e-06, + "loss": 0.3967, + "step": 11025 + }, + { + "epoch": 1.341770611499848, + "grad_norm": 3.5295183658599854, + "learning_rate": 5.0671599174066865e-06, + "loss": 0.3371, + "step": 11026 + }, + { + "epoch": 1.341892303011865, + "grad_norm": 2.3580572605133057, + "learning_rate": 5.065463164383054e-06, + "loss": 0.3989, + "step": 11027 + }, + { + "epoch": 1.342013994523882, + "grad_norm": 4.957223892211914, + "learning_rate": 5.0637665991287585e-06, + "loss": 0.3306, + "step": 11028 + }, + { + "epoch": 1.342135686035899, + "grad_norm": 2.0160727500915527, + "learning_rate": 5.0620702217083665e-06, + "loss": 0.4112, + "step": 11029 + }, + { + "epoch": 1.342257377547916, + "grad_norm": 1.795076847076416, + "learning_rate": 5.0603740321864215e-06, + "loss": 0.323, + "step": 11030 + }, + { + "epoch": 1.342379069059933, + "grad_norm": 2.9094138145446777, + "learning_rate": 5.058678030627466e-06, + "loss": 0.275, + "step": 11031 + }, + { + "epoch": 1.3425007605719501, + "grad_norm": 1.4378584623336792, + "learning_rate": 5.056982217096043e-06, + "loss": 0.312, + "step": 11032 + }, + { + "epoch": 1.3426224520839671, + "grad_norm": 1.352764368057251, + "learning_rate": 5.055286591656678e-06, + "loss": 0.3493, + "step": 11033 + }, + { + "epoch": 1.3427441435959842, + "grad_norm": 1.5660258531570435, + "learning_rate": 5.053591154373891e-06, + "loss": 0.3427, + "step": 11034 + }, + { + "epoch": 1.3428658351080012, + "grad_norm": 3.100874423980713, + "learning_rate": 5.051895905312201e-06, + "loss": 0.3818, + "step": 11035 + }, + { + "epoch": 1.3429875266200182, + "grad_norm": 2.558004140853882, + "learning_rate": 5.050200844536114e-06, + "loss": 0.3746, + "step": 11036 + }, + { + "epoch": 1.3431092181320352, + "grad_norm": 4.097098350524902, + "learning_rate": 5.048505972110131e-06, + "loss": 0.4383, + "step": 11037 + }, + { + "epoch": 1.3432309096440522, + "grad_norm": 2.1375014781951904, + "learning_rate": 5.046811288098745e-06, + "loss": 0.3771, + "step": 11038 + }, + { + "epoch": 1.3433526011560692, + "grad_norm": 1.6098134517669678, + "learning_rate": 5.045116792566438e-06, + "loss": 0.3305, + "step": 11039 + }, + { + "epoch": 1.3434742926680863, + "grad_norm": 2.4263265132904053, + "learning_rate": 5.043422485577696e-06, + "loss": 0.3789, + "step": 11040 + }, + { + "epoch": 1.3435959841801035, + "grad_norm": 2.0546152591705322, + "learning_rate": 5.041728367196988e-06, + "loss": 0.3486, + "step": 11041 + }, + { + "epoch": 1.3437176756921205, + "grad_norm": 2.3783087730407715, + "learning_rate": 5.0400344374887765e-06, + "loss": 0.3779, + "step": 11042 + }, + { + "epoch": 1.3438393672041375, + "grad_norm": 4.532798767089844, + "learning_rate": 5.038340696517523e-06, + "loss": 0.4212, + "step": 11043 + }, + { + "epoch": 1.3439610587161546, + "grad_norm": 1.5730412006378174, + "learning_rate": 5.0366471443476775e-06, + "loss": 0.353, + "step": 11044 + }, + { + "epoch": 1.3440827502281716, + "grad_norm": 1.2899422645568848, + "learning_rate": 5.034953781043678e-06, + "loss": 0.3765, + "step": 11045 + }, + { + "epoch": 1.3442044417401886, + "grad_norm": 2.337146282196045, + "learning_rate": 5.033260606669968e-06, + "loss": 0.3261, + "step": 11046 + }, + { + "epoch": 1.3443261332522056, + "grad_norm": 1.57066011428833, + "learning_rate": 5.0315676212909715e-06, + "loss": 0.3513, + "step": 11047 + }, + { + "epoch": 1.3444478247642226, + "grad_norm": 2.039445400238037, + "learning_rate": 5.029874824971108e-06, + "loss": 0.3483, + "step": 11048 + }, + { + "epoch": 1.3445695162762397, + "grad_norm": 1.8404277563095093, + "learning_rate": 5.028182217774802e-06, + "loss": 0.3445, + "step": 11049 + }, + { + "epoch": 1.344691207788257, + "grad_norm": 1.4669520854949951, + "learning_rate": 5.026489799766447e-06, + "loss": 0.3664, + "step": 11050 + }, + { + "epoch": 1.344812899300274, + "grad_norm": 1.6410268545150757, + "learning_rate": 5.024797571010454e-06, + "loss": 0.3669, + "step": 11051 + }, + { + "epoch": 1.344934590812291, + "grad_norm": 1.673137903213501, + "learning_rate": 5.0231055315712095e-06, + "loss": 0.3959, + "step": 11052 + }, + { + "epoch": 1.345056282324308, + "grad_norm": 1.8528831005096436, + "learning_rate": 5.0214136815130985e-06, + "loss": 0.3912, + "step": 11053 + }, + { + "epoch": 1.345177973836325, + "grad_norm": 1.6221925020217896, + "learning_rate": 5.019722020900506e-06, + "loss": 0.3897, + "step": 11054 + }, + { + "epoch": 1.345299665348342, + "grad_norm": 2.3154237270355225, + "learning_rate": 5.018030549797799e-06, + "loss": 0.3461, + "step": 11055 + }, + { + "epoch": 1.345421356860359, + "grad_norm": 2.1729185581207275, + "learning_rate": 5.016339268269338e-06, + "loss": 0.3425, + "step": 11056 + }, + { + "epoch": 1.345543048372376, + "grad_norm": 1.5695834159851074, + "learning_rate": 5.014648176379487e-06, + "loss": 0.3625, + "step": 11057 + }, + { + "epoch": 1.345664739884393, + "grad_norm": 2.028552770614624, + "learning_rate": 5.012957274192592e-06, + "loss": 0.4064, + "step": 11058 + }, + { + "epoch": 1.34578643139641, + "grad_norm": 2.6498844623565674, + "learning_rate": 5.011266561772992e-06, + "loss": 0.3956, + "step": 11059 + }, + { + "epoch": 1.345908122908427, + "grad_norm": 1.7356956005096436, + "learning_rate": 5.009576039185034e-06, + "loss": 0.3046, + "step": 11060 + }, + { + "epoch": 1.3460298144204441, + "grad_norm": 2.5046088695526123, + "learning_rate": 5.007885706493028e-06, + "loss": 0.3949, + "step": 11061 + }, + { + "epoch": 1.3461515059324611, + "grad_norm": 2.589621067047119, + "learning_rate": 5.006195563761307e-06, + "loss": 0.4018, + "step": 11062 + }, + { + "epoch": 1.3462731974444782, + "grad_norm": 1.794716238975525, + "learning_rate": 5.004505611054182e-06, + "loss": 0.3862, + "step": 11063 + }, + { + "epoch": 1.3463948889564952, + "grad_norm": 1.9696511030197144, + "learning_rate": 5.002815848435956e-06, + "loss": 0.3539, + "step": 11064 + }, + { + "epoch": 1.3465165804685122, + "grad_norm": 2.4545681476593018, + "learning_rate": 5.001126275970934e-06, + "loss": 0.3609, + "step": 11065 + }, + { + "epoch": 1.3466382719805294, + "grad_norm": 3.0479249954223633, + "learning_rate": 4.999436893723404e-06, + "loss": 0.336, + "step": 11066 + }, + { + "epoch": 1.3467599634925465, + "grad_norm": 1.4956940412521362, + "learning_rate": 4.997747701757647e-06, + "loss": 0.3373, + "step": 11067 + }, + { + "epoch": 1.3468816550045635, + "grad_norm": 2.651766300201416, + "learning_rate": 4.996058700137948e-06, + "loss": 0.3302, + "step": 11068 + }, + { + "epoch": 1.3470033465165805, + "grad_norm": 2.408039093017578, + "learning_rate": 4.994369888928573e-06, + "loss": 0.3779, + "step": 11069 + }, + { + "epoch": 1.3471250380285975, + "grad_norm": 4.037755489349365, + "learning_rate": 4.992681268193781e-06, + "loss": 0.4393, + "step": 11070 + }, + { + "epoch": 1.3472467295406145, + "grad_norm": 2.839583158493042, + "learning_rate": 4.990992837997836e-06, + "loss": 0.4211, + "step": 11071 + }, + { + "epoch": 1.3473684210526315, + "grad_norm": 1.764530062675476, + "learning_rate": 4.989304598404981e-06, + "loss": 0.3597, + "step": 11072 + }, + { + "epoch": 1.3474901125646486, + "grad_norm": 1.5183846950531006, + "learning_rate": 4.987616549479456e-06, + "loss": 0.3285, + "step": 11073 + }, + { + "epoch": 1.3476118040766656, + "grad_norm": 4.086575508117676, + "learning_rate": 4.9859286912854975e-06, + "loss": 0.4522, + "step": 11074 + }, + { + "epoch": 1.3477334955886826, + "grad_norm": 1.79486882686615, + "learning_rate": 4.984241023887327e-06, + "loss": 0.3729, + "step": 11075 + }, + { + "epoch": 1.3478551871006998, + "grad_norm": 2.400681734085083, + "learning_rate": 4.982553547349173e-06, + "loss": 0.4159, + "step": 11076 + }, + { + "epoch": 1.3479768786127169, + "grad_norm": 1.9631298780441284, + "learning_rate": 4.980866261735237e-06, + "loss": 0.3716, + "step": 11077 + }, + { + "epoch": 1.3480985701247339, + "grad_norm": 1.5370818376541138, + "learning_rate": 4.979179167109732e-06, + "loss": 0.329, + "step": 11078 + }, + { + "epoch": 1.348220261636751, + "grad_norm": 1.6928389072418213, + "learning_rate": 4.977492263536853e-06, + "loss": 0.332, + "step": 11079 + }, + { + "epoch": 1.348341953148768, + "grad_norm": 1.4783992767333984, + "learning_rate": 4.975805551080784e-06, + "loss": 0.3804, + "step": 11080 + }, + { + "epoch": 1.348463644660785, + "grad_norm": 1.3643468618392944, + "learning_rate": 4.97411902980572e-06, + "loss": 0.3536, + "step": 11081 + }, + { + "epoch": 1.348585336172802, + "grad_norm": 1.8942606449127197, + "learning_rate": 4.972432699775827e-06, + "loss": 0.35, + "step": 11082 + }, + { + "epoch": 1.348707027684819, + "grad_norm": 1.7877579927444458, + "learning_rate": 4.970746561055278e-06, + "loss": 0.2827, + "step": 11083 + }, + { + "epoch": 1.348828719196836, + "grad_norm": 1.600560188293457, + "learning_rate": 4.969060613708232e-06, + "loss": 0.3747, + "step": 11084 + }, + { + "epoch": 1.348950410708853, + "grad_norm": 1.5231003761291504, + "learning_rate": 4.967374857798839e-06, + "loss": 0.4006, + "step": 11085 + }, + { + "epoch": 1.34907210222087, + "grad_norm": 1.6011158227920532, + "learning_rate": 4.965689293391254e-06, + "loss": 0.3422, + "step": 11086 + }, + { + "epoch": 1.349193793732887, + "grad_norm": 3.4856338500976562, + "learning_rate": 4.964003920549612e-06, + "loss": 0.3868, + "step": 11087 + }, + { + "epoch": 1.349315485244904, + "grad_norm": 4.397075176239014, + "learning_rate": 4.962318739338039e-06, + "loss": 0.4357, + "step": 11088 + }, + { + "epoch": 1.349437176756921, + "grad_norm": 1.8966400623321533, + "learning_rate": 4.9606337498206715e-06, + "loss": 0.4009, + "step": 11089 + }, + { + "epoch": 1.3495588682689381, + "grad_norm": 1.6261204481124878, + "learning_rate": 4.95894895206162e-06, + "loss": 0.4101, + "step": 11090 + }, + { + "epoch": 1.3496805597809554, + "grad_norm": 1.422709345817566, + "learning_rate": 4.95726434612499e-06, + "loss": 0.3949, + "step": 11091 + }, + { + "epoch": 1.3498022512929724, + "grad_norm": 1.7959972620010376, + "learning_rate": 4.955579932074896e-06, + "loss": 0.2994, + "step": 11092 + }, + { + "epoch": 1.3499239428049894, + "grad_norm": 3.601912498474121, + "learning_rate": 4.953895709975425e-06, + "loss": 0.4894, + "step": 11093 + }, + { + "epoch": 1.3500456343170064, + "grad_norm": 1.9235748052597046, + "learning_rate": 4.952211679890664e-06, + "loss": 0.4036, + "step": 11094 + }, + { + "epoch": 1.3501673258290234, + "grad_norm": 1.8110328912734985, + "learning_rate": 4.950527841884705e-06, + "loss": 0.3139, + "step": 11095 + }, + { + "epoch": 1.3502890173410405, + "grad_norm": 1.5664286613464355, + "learning_rate": 4.948844196021605e-06, + "loss": 0.3698, + "step": 11096 + }, + { + "epoch": 1.3504107088530575, + "grad_norm": 1.812437653541565, + "learning_rate": 4.947160742365442e-06, + "loss": 0.4136, + "step": 11097 + }, + { + "epoch": 1.3505324003650745, + "grad_norm": 1.8608636856079102, + "learning_rate": 4.945477480980273e-06, + "loss": 0.3985, + "step": 11098 + }, + { + "epoch": 1.3506540918770915, + "grad_norm": 1.386445164680481, + "learning_rate": 4.943794411930143e-06, + "loss": 0.3881, + "step": 11099 + }, + { + "epoch": 1.3507757833891085, + "grad_norm": 1.3475912809371948, + "learning_rate": 4.942111535279106e-06, + "loss": 0.33, + "step": 11100 + }, + { + "epoch": 1.3508974749011258, + "grad_norm": 1.3175837993621826, + "learning_rate": 4.940428851091195e-06, + "loss": 0.3595, + "step": 11101 + }, + { + "epoch": 1.3510191664131428, + "grad_norm": 2.505901336669922, + "learning_rate": 4.938746359430434e-06, + "loss": 0.3406, + "step": 11102 + }, + { + "epoch": 1.3511408579251598, + "grad_norm": 1.8229089975357056, + "learning_rate": 4.937064060360854e-06, + "loss": 0.4044, + "step": 11103 + }, + { + "epoch": 1.3512625494371768, + "grad_norm": 1.2791388034820557, + "learning_rate": 4.9353819539464664e-06, + "loss": 0.3495, + "step": 11104 + }, + { + "epoch": 1.3513842409491938, + "grad_norm": 1.715727686882019, + "learning_rate": 4.933700040251276e-06, + "loss": 0.3581, + "step": 11105 + }, + { + "epoch": 1.3515059324612109, + "grad_norm": 2.3101534843444824, + "learning_rate": 4.932018319339291e-06, + "loss": 0.3331, + "step": 11106 + }, + { + "epoch": 1.3516276239732279, + "grad_norm": 1.4475650787353516, + "learning_rate": 4.930336791274494e-06, + "loss": 0.3815, + "step": 11107 + }, + { + "epoch": 1.351749315485245, + "grad_norm": 1.5390148162841797, + "learning_rate": 4.928655456120878e-06, + "loss": 0.4125, + "step": 11108 + }, + { + "epoch": 1.351871006997262, + "grad_norm": 1.4437994956970215, + "learning_rate": 4.92697431394242e-06, + "loss": 0.3737, + "step": 11109 + }, + { + "epoch": 1.351992698509279, + "grad_norm": 1.6687712669372559, + "learning_rate": 4.925293364803084e-06, + "loss": 0.3661, + "step": 11110 + }, + { + "epoch": 1.352114390021296, + "grad_norm": 1.7024102210998535, + "learning_rate": 4.923612608766846e-06, + "loss": 0.3334, + "step": 11111 + }, + { + "epoch": 1.352236081533313, + "grad_norm": 2.6565771102905273, + "learning_rate": 4.921932045897654e-06, + "loss": 0.4211, + "step": 11112 + }, + { + "epoch": 1.35235777304533, + "grad_norm": 1.6554299592971802, + "learning_rate": 4.920251676259455e-06, + "loss": 0.3413, + "step": 11113 + }, + { + "epoch": 1.352479464557347, + "grad_norm": 4.151785373687744, + "learning_rate": 4.918571499916199e-06, + "loss": 0.455, + "step": 11114 + }, + { + "epoch": 1.352601156069364, + "grad_norm": 2.1160130500793457, + "learning_rate": 4.9168915169318155e-06, + "loss": 0.3694, + "step": 11115 + }, + { + "epoch": 1.352722847581381, + "grad_norm": 2.1712119579315186, + "learning_rate": 4.915211727370226e-06, + "loss": 0.3696, + "step": 11116 + }, + { + "epoch": 1.3528445390933983, + "grad_norm": 1.4879486560821533, + "learning_rate": 4.91353213129536e-06, + "loss": 0.3153, + "step": 11117 + }, + { + "epoch": 1.3529662306054153, + "grad_norm": 1.6520042419433594, + "learning_rate": 4.911852728771125e-06, + "loss": 0.365, + "step": 11118 + }, + { + "epoch": 1.3530879221174323, + "grad_norm": 1.4935072660446167, + "learning_rate": 4.910173519861425e-06, + "loss": 0.4036, + "step": 11119 + }, + { + "epoch": 1.3532096136294494, + "grad_norm": 2.27902889251709, + "learning_rate": 4.908494504630158e-06, + "loss": 0.4058, + "step": 11120 + }, + { + "epoch": 1.3533313051414664, + "grad_norm": 2.504993200302124, + "learning_rate": 4.906815683141211e-06, + "loss": 0.3431, + "step": 11121 + }, + { + "epoch": 1.3534529966534834, + "grad_norm": 2.718082904815674, + "learning_rate": 4.905137055458472e-06, + "loss": 0.3761, + "step": 11122 + }, + { + "epoch": 1.3535746881655004, + "grad_norm": 1.9521477222442627, + "learning_rate": 4.903458621645815e-06, + "loss": 0.3712, + "step": 11123 + }, + { + "epoch": 1.3536963796775174, + "grad_norm": 1.510922908782959, + "learning_rate": 4.901780381767103e-06, + "loss": 0.3575, + "step": 11124 + }, + { + "epoch": 1.3538180711895345, + "grad_norm": 2.4058730602264404, + "learning_rate": 4.9001023358862036e-06, + "loss": 0.3185, + "step": 11125 + }, + { + "epoch": 1.3539397627015517, + "grad_norm": 2.4367456436157227, + "learning_rate": 4.898424484066966e-06, + "loss": 0.3175, + "step": 11126 + }, + { + "epoch": 1.3540614542135687, + "grad_norm": 1.4776997566223145, + "learning_rate": 4.896746826373232e-06, + "loss": 0.3438, + "step": 11127 + }, + { + "epoch": 1.3541831457255857, + "grad_norm": 1.9074993133544922, + "learning_rate": 4.8950693628688495e-06, + "loss": 0.339, + "step": 11128 + }, + { + "epoch": 1.3543048372376028, + "grad_norm": 2.4589500427246094, + "learning_rate": 4.893392093617645e-06, + "loss": 0.2854, + "step": 11129 + }, + { + "epoch": 1.3544265287496198, + "grad_norm": 3.9764435291290283, + "learning_rate": 4.89171501868344e-06, + "loss": 0.434, + "step": 11130 + }, + { + "epoch": 1.3545482202616368, + "grad_norm": 1.4589622020721436, + "learning_rate": 4.8900381381300524e-06, + "loss": 0.3141, + "step": 11131 + }, + { + "epoch": 1.3546699117736538, + "grad_norm": 2.069058895111084, + "learning_rate": 4.888361452021288e-06, + "loss": 0.3738, + "step": 11132 + }, + { + "epoch": 1.3547916032856708, + "grad_norm": 1.5543010234832764, + "learning_rate": 4.886684960420953e-06, + "loss": 0.343, + "step": 11133 + }, + { + "epoch": 1.3549132947976879, + "grad_norm": 2.2902374267578125, + "learning_rate": 4.88500866339284e-06, + "loss": 0.3737, + "step": 11134 + }, + { + "epoch": 1.3550349863097049, + "grad_norm": 1.6328562498092651, + "learning_rate": 4.883332561000731e-06, + "loss": 0.3629, + "step": 11135 + }, + { + "epoch": 1.3551566778217219, + "grad_norm": 1.4300404787063599, + "learning_rate": 4.881656653308413e-06, + "loss": 0.3383, + "step": 11136 + }, + { + "epoch": 1.355278369333739, + "grad_norm": 1.3524949550628662, + "learning_rate": 4.879980940379649e-06, + "loss": 0.3324, + "step": 11137 + }, + { + "epoch": 1.355400060845756, + "grad_norm": 1.8797709941864014, + "learning_rate": 4.878305422278213e-06, + "loss": 0.4218, + "step": 11138 + }, + { + "epoch": 1.355521752357773, + "grad_norm": 1.634743571281433, + "learning_rate": 4.8766300990678576e-06, + "loss": 0.3653, + "step": 11139 + }, + { + "epoch": 1.35564344386979, + "grad_norm": 1.4865691661834717, + "learning_rate": 4.874954970812326e-06, + "loss": 0.3549, + "step": 11140 + }, + { + "epoch": 1.355765135381807, + "grad_norm": 2.021141529083252, + "learning_rate": 4.873280037575375e-06, + "loss": 0.3067, + "step": 11141 + }, + { + "epoch": 1.3558868268938242, + "grad_norm": 1.5794376134872437, + "learning_rate": 4.8716052994207254e-06, + "loss": 0.3709, + "step": 11142 + }, + { + "epoch": 1.3560085184058412, + "grad_norm": 1.9783570766448975, + "learning_rate": 4.869930756412107e-06, + "loss": 0.3104, + "step": 11143 + }, + { + "epoch": 1.3561302099178583, + "grad_norm": 1.8349593877792358, + "learning_rate": 4.868256408613244e-06, + "loss": 0.4265, + "step": 11144 + }, + { + "epoch": 1.3562519014298753, + "grad_norm": 1.4364620447158813, + "learning_rate": 4.866582256087845e-06, + "loss": 0.3353, + "step": 11145 + }, + { + "epoch": 1.3563735929418923, + "grad_norm": 1.933782696723938, + "learning_rate": 4.8649082988996185e-06, + "loss": 0.3553, + "step": 11146 + }, + { + "epoch": 1.3564952844539093, + "grad_norm": 1.6134716272354126, + "learning_rate": 4.863234537112261e-06, + "loss": 0.382, + "step": 11147 + }, + { + "epoch": 1.3566169759659263, + "grad_norm": 1.8891401290893555, + "learning_rate": 4.86156097078946e-06, + "loss": 0.3518, + "step": 11148 + }, + { + "epoch": 1.3567386674779434, + "grad_norm": 3.6986119747161865, + "learning_rate": 4.8598875999949015e-06, + "loss": 0.457, + "step": 11149 + }, + { + "epoch": 1.3568603589899604, + "grad_norm": 2.8852577209472656, + "learning_rate": 4.8582144247922606e-06, + "loss": 0.409, + "step": 11150 + }, + { + "epoch": 1.3569820505019776, + "grad_norm": 1.6108317375183105, + "learning_rate": 4.8565414452451986e-06, + "loss": 0.4154, + "step": 11151 + }, + { + "epoch": 1.3571037420139946, + "grad_norm": 1.7871568202972412, + "learning_rate": 4.854868661417385e-06, + "loss": 0.3947, + "step": 11152 + }, + { + "epoch": 1.3572254335260117, + "grad_norm": 1.8654072284698486, + "learning_rate": 4.85319607337247e-06, + "loss": 0.3953, + "step": 11153 + }, + { + "epoch": 1.3573471250380287, + "grad_norm": 1.3134747743606567, + "learning_rate": 4.8515236811740965e-06, + "loss": 0.3352, + "step": 11154 + }, + { + "epoch": 1.3574688165500457, + "grad_norm": 1.8670462369918823, + "learning_rate": 4.849851484885905e-06, + "loss": 0.4032, + "step": 11155 + }, + { + "epoch": 1.3575905080620627, + "grad_norm": 1.9213924407958984, + "learning_rate": 4.8481794845715195e-06, + "loss": 0.3549, + "step": 11156 + }, + { + "epoch": 1.3577121995740797, + "grad_norm": 2.5830886363983154, + "learning_rate": 4.846507680294572e-06, + "loss": 0.3792, + "step": 11157 + }, + { + "epoch": 1.3578338910860968, + "grad_norm": 2.2905547618865967, + "learning_rate": 4.844836072118676e-06, + "loss": 0.3398, + "step": 11158 + }, + { + "epoch": 1.3579555825981138, + "grad_norm": 2.351374626159668, + "learning_rate": 4.843164660107432e-06, + "loss": 0.3302, + "step": 11159 + }, + { + "epoch": 1.3580772741101308, + "grad_norm": 1.4515116214752197, + "learning_rate": 4.841493444324452e-06, + "loss": 0.3071, + "step": 11160 + }, + { + "epoch": 1.3581989656221478, + "grad_norm": 1.2680836915969849, + "learning_rate": 4.839822424833324e-06, + "loss": 0.3357, + "step": 11161 + }, + { + "epoch": 1.3583206571341648, + "grad_norm": 2.178415298461914, + "learning_rate": 4.8381516016976305e-06, + "loss": 0.3983, + "step": 11162 + }, + { + "epoch": 1.3584423486461819, + "grad_norm": 1.6810710430145264, + "learning_rate": 4.836480974980956e-06, + "loss": 0.3884, + "step": 11163 + }, + { + "epoch": 1.3585640401581989, + "grad_norm": 1.5945277214050293, + "learning_rate": 4.834810544746867e-06, + "loss": 0.3873, + "step": 11164 + }, + { + "epoch": 1.358685731670216, + "grad_norm": 5.168517589569092, + "learning_rate": 4.83314031105893e-06, + "loss": 0.3778, + "step": 11165 + }, + { + "epoch": 1.358807423182233, + "grad_norm": 2.691084623336792, + "learning_rate": 4.8314702739806984e-06, + "loss": 0.416, + "step": 11166 + }, + { + "epoch": 1.3589291146942502, + "grad_norm": 2.2331182956695557, + "learning_rate": 4.829800433575717e-06, + "loss": 0.4013, + "step": 11167 + }, + { + "epoch": 1.3590508062062672, + "grad_norm": 1.7573703527450562, + "learning_rate": 4.828130789907536e-06, + "loss": 0.3484, + "step": 11168 + }, + { + "epoch": 1.3591724977182842, + "grad_norm": 2.2589328289031982, + "learning_rate": 4.826461343039682e-06, + "loss": 0.3059, + "step": 11169 + }, + { + "epoch": 1.3592941892303012, + "grad_norm": 5.430410861968994, + "learning_rate": 4.8247920930356795e-06, + "loss": 0.4594, + "step": 11170 + }, + { + "epoch": 1.3594158807423182, + "grad_norm": 1.2312405109405518, + "learning_rate": 4.823123039959054e-06, + "loss": 0.3406, + "step": 11171 + }, + { + "epoch": 1.3595375722543352, + "grad_norm": 1.5096276998519897, + "learning_rate": 4.821454183873312e-06, + "loss": 0.3146, + "step": 11172 + }, + { + "epoch": 1.3596592637663523, + "grad_norm": 1.4163577556610107, + "learning_rate": 4.819785524841954e-06, + "loss": 0.3208, + "step": 11173 + }, + { + "epoch": 1.3597809552783693, + "grad_norm": 1.7854074239730835, + "learning_rate": 4.818117062928481e-06, + "loss": 0.3653, + "step": 11174 + }, + { + "epoch": 1.3599026467903863, + "grad_norm": 3.257835865020752, + "learning_rate": 4.8164487981963816e-06, + "loss": 0.4086, + "step": 11175 + }, + { + "epoch": 1.3600243383024035, + "grad_norm": 1.231567621231079, + "learning_rate": 4.8147807307091345e-06, + "loss": 0.3517, + "step": 11176 + }, + { + "epoch": 1.3601460298144206, + "grad_norm": 1.357016921043396, + "learning_rate": 4.813112860530213e-06, + "loss": 0.3041, + "step": 11177 + }, + { + "epoch": 1.3602677213264376, + "grad_norm": 2.265507459640503, + "learning_rate": 4.811445187723081e-06, + "loss": 0.3566, + "step": 11178 + }, + { + "epoch": 1.3603894128384546, + "grad_norm": 1.910288691520691, + "learning_rate": 4.809777712351202e-06, + "loss": 0.3672, + "step": 11179 + }, + { + "epoch": 1.3605111043504716, + "grad_norm": 2.2209866046905518, + "learning_rate": 4.808110434478024e-06, + "loss": 0.4218, + "step": 11180 + }, + { + "epoch": 1.3606327958624886, + "grad_norm": 3.924684524536133, + "learning_rate": 4.806443354166986e-06, + "loss": 0.4281, + "step": 11181 + }, + { + "epoch": 1.3607544873745057, + "grad_norm": 1.9214997291564941, + "learning_rate": 4.804776471481534e-06, + "loss": 0.3956, + "step": 11182 + }, + { + "epoch": 1.3608761788865227, + "grad_norm": 2.4110934734344482, + "learning_rate": 4.803109786485089e-06, + "loss": 0.3236, + "step": 11183 + }, + { + "epoch": 1.3609978703985397, + "grad_norm": 1.5801680088043213, + "learning_rate": 4.801443299241072e-06, + "loss": 0.375, + "step": 11184 + }, + { + "epoch": 1.3611195619105567, + "grad_norm": 1.6021701097488403, + "learning_rate": 4.7997770098129e-06, + "loss": 0.3742, + "step": 11185 + }, + { + "epoch": 1.3612412534225737, + "grad_norm": 1.6113061904907227, + "learning_rate": 4.798110918263978e-06, + "loss": 0.2971, + "step": 11186 + }, + { + "epoch": 1.3613629449345908, + "grad_norm": 1.3568525314331055, + "learning_rate": 4.796445024657701e-06, + "loss": 0.3095, + "step": 11187 + }, + { + "epoch": 1.3614846364466078, + "grad_norm": 2.1532278060913086, + "learning_rate": 4.794779329057463e-06, + "loss": 0.4125, + "step": 11188 + }, + { + "epoch": 1.3616063279586248, + "grad_norm": 1.948684811592102, + "learning_rate": 4.793113831526641e-06, + "loss": 0.39, + "step": 11189 + }, + { + "epoch": 1.3617280194706418, + "grad_norm": 3.202752113342285, + "learning_rate": 4.791448532128621e-06, + "loss": 0.44, + "step": 11190 + }, + { + "epoch": 1.3618497109826588, + "grad_norm": 1.9924675226211548, + "learning_rate": 4.789783430926763e-06, + "loss": 0.3694, + "step": 11191 + }, + { + "epoch": 1.361971402494676, + "grad_norm": 1.710642695426941, + "learning_rate": 4.7881185279844275e-06, + "loss": 0.3514, + "step": 11192 + }, + { + "epoch": 1.362093094006693, + "grad_norm": 1.8457367420196533, + "learning_rate": 4.786453823364975e-06, + "loss": 0.4265, + "step": 11193 + }, + { + "epoch": 1.3622147855187101, + "grad_norm": 1.7339402437210083, + "learning_rate": 4.784789317131744e-06, + "loss": 0.4053, + "step": 11194 + }, + { + "epoch": 1.3623364770307271, + "grad_norm": 1.512866497039795, + "learning_rate": 4.783125009348072e-06, + "loss": 0.3624, + "step": 11195 + }, + { + "epoch": 1.3624581685427442, + "grad_norm": 1.9473291635513306, + "learning_rate": 4.781460900077294e-06, + "loss": 0.3932, + "step": 11196 + }, + { + "epoch": 1.3625798600547612, + "grad_norm": 1.3948858976364136, + "learning_rate": 4.779796989382728e-06, + "loss": 0.3708, + "step": 11197 + }, + { + "epoch": 1.3627015515667782, + "grad_norm": 1.562318205833435, + "learning_rate": 4.7781332773276945e-06, + "loss": 0.3409, + "step": 11198 + }, + { + "epoch": 1.3628232430787952, + "grad_norm": 4.144116401672363, + "learning_rate": 4.776469763975503e-06, + "loss": 0.3516, + "step": 11199 + }, + { + "epoch": 1.3629449345908122, + "grad_norm": 4.386763095855713, + "learning_rate": 4.774806449389442e-06, + "loss": 0.3822, + "step": 11200 + }, + { + "epoch": 1.3630666261028292, + "grad_norm": 1.58619225025177, + "learning_rate": 4.773143333632814e-06, + "loss": 0.4008, + "step": 11201 + }, + { + "epoch": 1.3631883176148465, + "grad_norm": 2.9979045391082764, + "learning_rate": 4.771480416768902e-06, + "loss": 0.3307, + "step": 11202 + }, + { + "epoch": 1.3633100091268635, + "grad_norm": 1.6023509502410889, + "learning_rate": 4.769817698860978e-06, + "loss": 0.3813, + "step": 11203 + }, + { + "epoch": 1.3634317006388805, + "grad_norm": 2.7294957637786865, + "learning_rate": 4.768155179972321e-06, + "loss": 0.4011, + "step": 11204 + }, + { + "epoch": 1.3635533921508975, + "grad_norm": 1.89555823802948, + "learning_rate": 4.7664928601661844e-06, + "loss": 0.4013, + "step": 11205 + }, + { + "epoch": 1.3636750836629146, + "grad_norm": 1.9388405084609985, + "learning_rate": 4.76483073950583e-06, + "loss": 0.437, + "step": 11206 + }, + { + "epoch": 1.3637967751749316, + "grad_norm": 2.4625535011291504, + "learning_rate": 4.763168818054505e-06, + "loss": 0.2918, + "step": 11207 + }, + { + "epoch": 1.3639184666869486, + "grad_norm": 2.10256028175354, + "learning_rate": 4.761507095875439e-06, + "loss": 0.3831, + "step": 11208 + }, + { + "epoch": 1.3640401581989656, + "grad_norm": 3.1596407890319824, + "learning_rate": 4.759845573031877e-06, + "loss": 0.4174, + "step": 11209 + }, + { + "epoch": 1.3641618497109826, + "grad_norm": 1.5453768968582153, + "learning_rate": 4.758184249587041e-06, + "loss": 0.3806, + "step": 11210 + }, + { + "epoch": 1.3642835412229997, + "grad_norm": 2.5782411098480225, + "learning_rate": 4.756523125604137e-06, + "loss": 0.4121, + "step": 11211 + }, + { + "epoch": 1.3644052327350167, + "grad_norm": 1.759214162826538, + "learning_rate": 4.754862201146384e-06, + "loss": 0.3601, + "step": 11212 + }, + { + "epoch": 1.3645269242470337, + "grad_norm": 3.6430277824401855, + "learning_rate": 4.753201476276978e-06, + "loss": 0.4007, + "step": 11213 + }, + { + "epoch": 1.3646486157590507, + "grad_norm": 6.325989246368408, + "learning_rate": 4.75154095105912e-06, + "loss": 0.3273, + "step": 11214 + }, + { + "epoch": 1.3647703072710677, + "grad_norm": 2.039947986602783, + "learning_rate": 4.749880625555991e-06, + "loss": 0.3529, + "step": 11215 + }, + { + "epoch": 1.3648919987830848, + "grad_norm": 2.276052951812744, + "learning_rate": 4.74822049983077e-06, + "loss": 0.4259, + "step": 11216 + }, + { + "epoch": 1.3650136902951018, + "grad_norm": 1.8155701160430908, + "learning_rate": 4.746560573946633e-06, + "loss": 0.4126, + "step": 11217 + }, + { + "epoch": 1.365135381807119, + "grad_norm": 2.353361129760742, + "learning_rate": 4.744900847966739e-06, + "loss": 0.3198, + "step": 11218 + }, + { + "epoch": 1.365257073319136, + "grad_norm": 2.2139289379119873, + "learning_rate": 4.743241321954242e-06, + "loss": 0.4392, + "step": 11219 + }, + { + "epoch": 1.365378764831153, + "grad_norm": 1.3920056819915771, + "learning_rate": 4.741581995972298e-06, + "loss": 0.3511, + "step": 11220 + }, + { + "epoch": 1.36550045634317, + "grad_norm": 2.7540769577026367, + "learning_rate": 4.739922870084041e-06, + "loss": 0.3217, + "step": 11221 + }, + { + "epoch": 1.365622147855187, + "grad_norm": 1.6999354362487793, + "learning_rate": 4.738263944352609e-06, + "loss": 0.3583, + "step": 11222 + }, + { + "epoch": 1.3657438393672041, + "grad_norm": 1.662536382675171, + "learning_rate": 4.736605218841124e-06, + "loss": 0.3725, + "step": 11223 + }, + { + "epoch": 1.3658655308792211, + "grad_norm": 1.5856044292449951, + "learning_rate": 4.734946693612702e-06, + "loss": 0.4252, + "step": 11224 + }, + { + "epoch": 1.3659872223912382, + "grad_norm": 4.299996852874756, + "learning_rate": 4.733288368730459e-06, + "loss": 0.3554, + "step": 11225 + }, + { + "epoch": 1.3661089139032552, + "grad_norm": 1.7832560539245605, + "learning_rate": 4.731630244257497e-06, + "loss": 0.3323, + "step": 11226 + }, + { + "epoch": 1.3662306054152724, + "grad_norm": 1.8432408571243286, + "learning_rate": 4.729972320256903e-06, + "loss": 0.3535, + "step": 11227 + }, + { + "epoch": 1.3663522969272894, + "grad_norm": 1.6611496210098267, + "learning_rate": 4.728314596791777e-06, + "loss": 0.3071, + "step": 11228 + }, + { + "epoch": 1.3664739884393065, + "grad_norm": 1.9368723630905151, + "learning_rate": 4.726657073925189e-06, + "loss": 0.3531, + "step": 11229 + }, + { + "epoch": 1.3665956799513235, + "grad_norm": 1.4649137258529663, + "learning_rate": 4.724999751720214e-06, + "loss": 0.3766, + "step": 11230 + }, + { + "epoch": 1.3667173714633405, + "grad_norm": 1.8864132165908813, + "learning_rate": 4.723342630239919e-06, + "loss": 0.3952, + "step": 11231 + }, + { + "epoch": 1.3668390629753575, + "grad_norm": 3.544740915298462, + "learning_rate": 4.72168570954736e-06, + "loss": 0.4523, + "step": 11232 + }, + { + "epoch": 1.3669607544873745, + "grad_norm": 2.1816415786743164, + "learning_rate": 4.720028989705584e-06, + "loss": 0.3944, + "step": 11233 + }, + { + "epoch": 1.3670824459993915, + "grad_norm": 1.3318713903427124, + "learning_rate": 4.718372470777635e-06, + "loss": 0.3632, + "step": 11234 + }, + { + "epoch": 1.3672041375114086, + "grad_norm": 1.5592565536499023, + "learning_rate": 4.716716152826542e-06, + "loss": 0.3181, + "step": 11235 + }, + { + "epoch": 1.3673258290234256, + "grad_norm": 2.314906597137451, + "learning_rate": 4.7150600359153385e-06, + "loss": 0.3936, + "step": 11236 + }, + { + "epoch": 1.3674475205354426, + "grad_norm": 1.9761723279953003, + "learning_rate": 4.7134041201070406e-06, + "loss": 0.4037, + "step": 11237 + }, + { + "epoch": 1.3675692120474596, + "grad_norm": 3.8374202251434326, + "learning_rate": 4.711748405464655e-06, + "loss": 0.4521, + "step": 11238 + }, + { + "epoch": 1.3676909035594766, + "grad_norm": 2.2602813243865967, + "learning_rate": 4.710092892051191e-06, + "loss": 0.3984, + "step": 11239 + }, + { + "epoch": 1.3678125950714937, + "grad_norm": 1.8705501556396484, + "learning_rate": 4.708437579929642e-06, + "loss": 0.4067, + "step": 11240 + }, + { + "epoch": 1.3679342865835107, + "grad_norm": 1.5998222827911377, + "learning_rate": 4.706782469162994e-06, + "loss": 0.3658, + "step": 11241 + }, + { + "epoch": 1.3680559780955277, + "grad_norm": 2.1664841175079346, + "learning_rate": 4.705127559814231e-06, + "loss": 0.3904, + "step": 11242 + }, + { + "epoch": 1.368177669607545, + "grad_norm": 1.323347806930542, + "learning_rate": 4.703472851946325e-06, + "loss": 0.3457, + "step": 11243 + }, + { + "epoch": 1.368299361119562, + "grad_norm": 1.7622110843658447, + "learning_rate": 4.701818345622238e-06, + "loss": 0.3091, + "step": 11244 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 1.828781008720398, + "learning_rate": 4.700164040904936e-06, + "loss": 0.3753, + "step": 11245 + }, + { + "epoch": 1.368542744143596, + "grad_norm": 1.8733725547790527, + "learning_rate": 4.698509937857354e-06, + "loss": 0.4113, + "step": 11246 + }, + { + "epoch": 1.368664435655613, + "grad_norm": 1.968394160270691, + "learning_rate": 4.696856036542447e-06, + "loss": 0.3656, + "step": 11247 + }, + { + "epoch": 1.36878612716763, + "grad_norm": 1.6193249225616455, + "learning_rate": 4.695202337023143e-06, + "loss": 0.4199, + "step": 11248 + }, + { + "epoch": 1.368907818679647, + "grad_norm": 2.1883695125579834, + "learning_rate": 4.693548839362368e-06, + "loss": 0.3244, + "step": 11249 + }, + { + "epoch": 1.369029510191664, + "grad_norm": 1.4155499935150146, + "learning_rate": 4.691895543623048e-06, + "loss": 0.375, + "step": 11250 + }, + { + "epoch": 1.369151201703681, + "grad_norm": 1.5698657035827637, + "learning_rate": 4.690242449868089e-06, + "loss": 0.3605, + "step": 11251 + }, + { + "epoch": 1.3692728932156983, + "grad_norm": 2.13388991355896, + "learning_rate": 4.6885895581603915e-06, + "loss": 0.4531, + "step": 11252 + }, + { + "epoch": 1.3693945847277154, + "grad_norm": 1.7883191108703613, + "learning_rate": 4.686936868562859e-06, + "loss": 0.3171, + "step": 11253 + }, + { + "epoch": 1.3695162762397324, + "grad_norm": 1.669055700302124, + "learning_rate": 4.685284381138376e-06, + "loss": 0.3846, + "step": 11254 + }, + { + "epoch": 1.3696379677517494, + "grad_norm": 2.980462074279785, + "learning_rate": 4.683632095949821e-06, + "loss": 0.3179, + "step": 11255 + }, + { + "epoch": 1.3697596592637664, + "grad_norm": 3.4764130115509033, + "learning_rate": 4.681980013060075e-06, + "loss": 0.3374, + "step": 11256 + }, + { + "epoch": 1.3698813507757834, + "grad_norm": 1.9100525379180908, + "learning_rate": 4.680328132531989e-06, + "loss": 0.3807, + "step": 11257 + }, + { + "epoch": 1.3700030422878005, + "grad_norm": 1.5455349683761597, + "learning_rate": 4.678676454428434e-06, + "loss": 0.3704, + "step": 11258 + }, + { + "epoch": 1.3701247337998175, + "grad_norm": 1.6455035209655762, + "learning_rate": 4.677024978812254e-06, + "loss": 0.3814, + "step": 11259 + }, + { + "epoch": 1.3702464253118345, + "grad_norm": 1.8717092275619507, + "learning_rate": 4.675373705746287e-06, + "loss": 0.3903, + "step": 11260 + }, + { + "epoch": 1.3703681168238515, + "grad_norm": 1.6716012954711914, + "learning_rate": 4.673722635293375e-06, + "loss": 0.3497, + "step": 11261 + }, + { + "epoch": 1.3704898083358685, + "grad_norm": 2.577040195465088, + "learning_rate": 4.672071767516343e-06, + "loss": 0.3992, + "step": 11262 + }, + { + "epoch": 1.3706114998478856, + "grad_norm": 1.626160740852356, + "learning_rate": 4.670421102478004e-06, + "loss": 0.3655, + "step": 11263 + }, + { + "epoch": 1.3707331913599026, + "grad_norm": 1.9731955528259277, + "learning_rate": 4.668770640241178e-06, + "loss": 0.369, + "step": 11264 + }, + { + "epoch": 1.3708548828719196, + "grad_norm": 1.845909833908081, + "learning_rate": 4.667120380868661e-06, + "loss": 0.3667, + "step": 11265 + }, + { + "epoch": 1.3709765743839366, + "grad_norm": 1.9171596765518188, + "learning_rate": 4.665470324423255e-06, + "loss": 0.302, + "step": 11266 + }, + { + "epoch": 1.3710982658959536, + "grad_norm": 1.7243751287460327, + "learning_rate": 4.6638204709677445e-06, + "loss": 0.3544, + "step": 11267 + }, + { + "epoch": 1.3712199574079709, + "grad_norm": 1.6981006860733032, + "learning_rate": 4.6621708205649105e-06, + "loss": 0.4065, + "step": 11268 + }, + { + "epoch": 1.3713416489199879, + "grad_norm": 1.5829240083694458, + "learning_rate": 4.660521373277525e-06, + "loss": 0.3857, + "step": 11269 + }, + { + "epoch": 1.371463340432005, + "grad_norm": 1.477675199508667, + "learning_rate": 4.658872129168355e-06, + "loss": 0.3129, + "step": 11270 + }, + { + "epoch": 1.371585031944022, + "grad_norm": 1.269054651260376, + "learning_rate": 4.657223088300151e-06, + "loss": 0.3613, + "step": 11271 + }, + { + "epoch": 1.371706723456039, + "grad_norm": 3.0516796112060547, + "learning_rate": 4.655574250735671e-06, + "loss": 0.4244, + "step": 11272 + }, + { + "epoch": 1.371828414968056, + "grad_norm": 2.2524068355560303, + "learning_rate": 4.653925616537651e-06, + "loss": 0.3565, + "step": 11273 + }, + { + "epoch": 1.371950106480073, + "grad_norm": 1.4977946281433105, + "learning_rate": 4.65227718576883e-06, + "loss": 0.3243, + "step": 11274 + }, + { + "epoch": 1.37207179799209, + "grad_norm": 3.0709052085876465, + "learning_rate": 4.650628958491931e-06, + "loss": 0.3919, + "step": 11275 + }, + { + "epoch": 1.372193489504107, + "grad_norm": 1.6849305629730225, + "learning_rate": 4.648980934769668e-06, + "loss": 0.348, + "step": 11276 + }, + { + "epoch": 1.3723151810161243, + "grad_norm": 1.665854811668396, + "learning_rate": 4.647333114664762e-06, + "loss": 0.3312, + "step": 11277 + }, + { + "epoch": 1.3724368725281413, + "grad_norm": 1.9021978378295898, + "learning_rate": 4.645685498239909e-06, + "loss": 0.3391, + "step": 11278 + }, + { + "epoch": 1.3725585640401583, + "grad_norm": 1.8014719486236572, + "learning_rate": 4.644038085557806e-06, + "loss": 0.344, + "step": 11279 + }, + { + "epoch": 1.3726802555521753, + "grad_norm": 1.668756127357483, + "learning_rate": 4.642390876681141e-06, + "loss": 0.3821, + "step": 11280 + }, + { + "epoch": 1.3728019470641923, + "grad_norm": 1.6157748699188232, + "learning_rate": 4.640743871672588e-06, + "loss": 0.3467, + "step": 11281 + }, + { + "epoch": 1.3729236385762094, + "grad_norm": 1.963357925415039, + "learning_rate": 4.639097070594828e-06, + "loss": 0.3915, + "step": 11282 + }, + { + "epoch": 1.3730453300882264, + "grad_norm": 1.9872376918792725, + "learning_rate": 4.637450473510521e-06, + "loss": 0.3925, + "step": 11283 + }, + { + "epoch": 1.3731670216002434, + "grad_norm": 2.877149820327759, + "learning_rate": 4.635804080482319e-06, + "loss": 0.4215, + "step": 11284 + }, + { + "epoch": 1.3732887131122604, + "grad_norm": 2.3329317569732666, + "learning_rate": 4.634157891572879e-06, + "loss": 0.4413, + "step": 11285 + }, + { + "epoch": 1.3734104046242774, + "grad_norm": 2.8469250202178955, + "learning_rate": 4.632511906844837e-06, + "loss": 0.4668, + "step": 11286 + }, + { + "epoch": 1.3735320961362945, + "grad_norm": 2.2457942962646484, + "learning_rate": 4.630866126360824e-06, + "loss": 0.4771, + "step": 11287 + }, + { + "epoch": 1.3736537876483115, + "grad_norm": 2.9940145015716553, + "learning_rate": 4.629220550183473e-06, + "loss": 0.296, + "step": 11288 + }, + { + "epoch": 1.3737754791603285, + "grad_norm": 1.4465837478637695, + "learning_rate": 4.6275751783753964e-06, + "loss": 0.3473, + "step": 11289 + }, + { + "epoch": 1.3738971706723455, + "grad_norm": 2.904381036758423, + "learning_rate": 4.625930010999201e-06, + "loss": 0.3652, + "step": 11290 + }, + { + "epoch": 1.3740188621843625, + "grad_norm": 1.8450020551681519, + "learning_rate": 4.6242850481174995e-06, + "loss": 0.4307, + "step": 11291 + }, + { + "epoch": 1.3741405536963796, + "grad_norm": 2.3697352409362793, + "learning_rate": 4.6226402897928715e-06, + "loss": 0.3899, + "step": 11292 + }, + { + "epoch": 1.3742622452083968, + "grad_norm": 1.4843295812606812, + "learning_rate": 4.620995736087915e-06, + "loss": 0.3708, + "step": 11293 + }, + { + "epoch": 1.3743839367204138, + "grad_norm": 1.7908962965011597, + "learning_rate": 4.619351387065205e-06, + "loss": 0.3973, + "step": 11294 + }, + { + "epoch": 1.3745056282324308, + "grad_norm": 1.9400103092193604, + "learning_rate": 4.6177072427873075e-06, + "loss": 0.3714, + "step": 11295 + }, + { + "epoch": 1.3746273197444479, + "grad_norm": 1.4860111474990845, + "learning_rate": 4.616063303316795e-06, + "loss": 0.4392, + "step": 11296 + }, + { + "epoch": 1.3747490112564649, + "grad_norm": 1.3192747831344604, + "learning_rate": 4.614419568716216e-06, + "loss": 0.3788, + "step": 11297 + }, + { + "epoch": 1.374870702768482, + "grad_norm": 2.877135992050171, + "learning_rate": 4.612776039048118e-06, + "loss": 0.5236, + "step": 11298 + }, + { + "epoch": 1.374992394280499, + "grad_norm": 1.6368414163589478, + "learning_rate": 4.611132714375046e-06, + "loss": 0.3863, + "step": 11299 + }, + { + "epoch": 1.375114085792516, + "grad_norm": 1.3492391109466553, + "learning_rate": 4.609489594759528e-06, + "loss": 0.3683, + "step": 11300 + }, + { + "epoch": 1.375235777304533, + "grad_norm": 1.476997971534729, + "learning_rate": 4.6078466802640855e-06, + "loss": 0.3085, + "step": 11301 + }, + { + "epoch": 1.37535746881655, + "grad_norm": 1.7999513149261475, + "learning_rate": 4.606203970951245e-06, + "loss": 0.3299, + "step": 11302 + }, + { + "epoch": 1.3754791603285672, + "grad_norm": 1.768798828125, + "learning_rate": 4.604561466883502e-06, + "loss": 0.3842, + "step": 11303 + }, + { + "epoch": 1.3756008518405842, + "grad_norm": 4.651552200317383, + "learning_rate": 4.602919168123366e-06, + "loss": 0.3592, + "step": 11304 + }, + { + "epoch": 1.3757225433526012, + "grad_norm": 1.3234188556671143, + "learning_rate": 4.6012770747333255e-06, + "loss": 0.3503, + "step": 11305 + }, + { + "epoch": 1.3758442348646183, + "grad_norm": 1.2548002004623413, + "learning_rate": 4.599635186775865e-06, + "loss": 0.3456, + "step": 11306 + }, + { + "epoch": 1.3759659263766353, + "grad_norm": 3.3660314083099365, + "learning_rate": 4.597993504313466e-06, + "loss": 0.2967, + "step": 11307 + }, + { + "epoch": 1.3760876178886523, + "grad_norm": 2.4953486919403076, + "learning_rate": 4.596352027408597e-06, + "loss": 0.3775, + "step": 11308 + }, + { + "epoch": 1.3762093094006693, + "grad_norm": 1.842812180519104, + "learning_rate": 4.594710756123715e-06, + "loss": 0.3858, + "step": 11309 + }, + { + "epoch": 1.3763310009126863, + "grad_norm": 1.8454877138137817, + "learning_rate": 4.593069690521279e-06, + "loss": 0.3118, + "step": 11310 + }, + { + "epoch": 1.3764526924247034, + "grad_norm": 3.179696559906006, + "learning_rate": 4.5914288306637346e-06, + "loss": 0.476, + "step": 11311 + }, + { + "epoch": 1.3765743839367204, + "grad_norm": 2.8451809883117676, + "learning_rate": 4.5897881766135136e-06, + "loss": 0.4487, + "step": 11312 + }, + { + "epoch": 1.3766960754487374, + "grad_norm": 2.830974817276001, + "learning_rate": 4.588147728433055e-06, + "loss": 0.2987, + "step": 11313 + }, + { + "epoch": 1.3768177669607544, + "grad_norm": 1.6668356657028198, + "learning_rate": 4.586507486184778e-06, + "loss": 0.3726, + "step": 11314 + }, + { + "epoch": 1.3769394584727714, + "grad_norm": 1.8309497833251953, + "learning_rate": 4.584867449931095e-06, + "loss": 0.4037, + "step": 11315 + }, + { + "epoch": 1.3770611499847885, + "grad_norm": 2.203819751739502, + "learning_rate": 4.5832276197344165e-06, + "loss": 0.4586, + "step": 11316 + }, + { + "epoch": 1.3771828414968055, + "grad_norm": 1.5618456602096558, + "learning_rate": 4.581587995657133e-06, + "loss": 0.3356, + "step": 11317 + }, + { + "epoch": 1.3773045330088227, + "grad_norm": 1.8384875059127808, + "learning_rate": 4.579948577761647e-06, + "loss": 0.4344, + "step": 11318 + }, + { + "epoch": 1.3774262245208397, + "grad_norm": 2.093533992767334, + "learning_rate": 4.578309366110336e-06, + "loss": 0.343, + "step": 11319 + }, + { + "epoch": 1.3775479160328568, + "grad_norm": 1.7189770936965942, + "learning_rate": 4.576670360765572e-06, + "loss": 0.3907, + "step": 11320 + }, + { + "epoch": 1.3776696075448738, + "grad_norm": 1.4578478336334229, + "learning_rate": 4.57503156178973e-06, + "loss": 0.3138, + "step": 11321 + }, + { + "epoch": 1.3777912990568908, + "grad_norm": 2.045624256134033, + "learning_rate": 4.573392969245167e-06, + "loss": 0.4335, + "step": 11322 + }, + { + "epoch": 1.3779129905689078, + "grad_norm": 1.5540378093719482, + "learning_rate": 4.57175458319423e-06, + "loss": 0.3568, + "step": 11323 + }, + { + "epoch": 1.3780346820809248, + "grad_norm": 2.0552492141723633, + "learning_rate": 4.5701164036992706e-06, + "loss": 0.338, + "step": 11324 + }, + { + "epoch": 1.3781563735929419, + "grad_norm": 1.56297767162323, + "learning_rate": 4.56847843082262e-06, + "loss": 0.3403, + "step": 11325 + }, + { + "epoch": 1.3782780651049589, + "grad_norm": 1.832349181175232, + "learning_rate": 4.56684066462661e-06, + "loss": 0.4034, + "step": 11326 + }, + { + "epoch": 1.378399756616976, + "grad_norm": 2.4731876850128174, + "learning_rate": 4.565203105173557e-06, + "loss": 0.339, + "step": 11327 + }, + { + "epoch": 1.3785214481289931, + "grad_norm": 1.8244894742965698, + "learning_rate": 4.563565752525773e-06, + "loss": 0.3866, + "step": 11328 + }, + { + "epoch": 1.3786431396410102, + "grad_norm": 2.375509023666382, + "learning_rate": 4.561928606745567e-06, + "loss": 0.2979, + "step": 11329 + }, + { + "epoch": 1.3787648311530272, + "grad_norm": 1.6645686626434326, + "learning_rate": 4.560291667895235e-06, + "loss": 0.3358, + "step": 11330 + }, + { + "epoch": 1.3788865226650442, + "grad_norm": 1.4858492612838745, + "learning_rate": 4.55865493603706e-06, + "loss": 0.3173, + "step": 11331 + }, + { + "epoch": 1.3790082141770612, + "grad_norm": 3.204472780227661, + "learning_rate": 4.557018411233332e-06, + "loss": 0.3732, + "step": 11332 + }, + { + "epoch": 1.3791299056890782, + "grad_norm": 1.775059700012207, + "learning_rate": 4.555382093546316e-06, + "loss": 0.3285, + "step": 11333 + }, + { + "epoch": 1.3792515972010952, + "grad_norm": 2.6282403469085693, + "learning_rate": 4.553745983038285e-06, + "loss": 0.4236, + "step": 11334 + }, + { + "epoch": 1.3793732887131123, + "grad_norm": 4.959954738616943, + "learning_rate": 4.552110079771491e-06, + "loss": 0.4402, + "step": 11335 + }, + { + "epoch": 1.3794949802251293, + "grad_norm": 1.8816577196121216, + "learning_rate": 4.550474383808184e-06, + "loss": 0.3597, + "step": 11336 + }, + { + "epoch": 1.3796166717371463, + "grad_norm": 1.4939597845077515, + "learning_rate": 4.548838895210611e-06, + "loss": 0.352, + "step": 11337 + }, + { + "epoch": 1.3797383632491633, + "grad_norm": 1.2710790634155273, + "learning_rate": 4.547203614040999e-06, + "loss": 0.3423, + "step": 11338 + }, + { + "epoch": 1.3798600547611803, + "grad_norm": 1.7677258253097534, + "learning_rate": 4.545568540361572e-06, + "loss": 0.2839, + "step": 11339 + }, + { + "epoch": 1.3799817462731974, + "grad_norm": 2.359163999557495, + "learning_rate": 4.543933674234558e-06, + "loss": 0.3293, + "step": 11340 + }, + { + "epoch": 1.3801034377852144, + "grad_norm": 2.50783634185791, + "learning_rate": 4.542299015722155e-06, + "loss": 0.3739, + "step": 11341 + }, + { + "epoch": 1.3802251292972314, + "grad_norm": 1.5659841299057007, + "learning_rate": 4.540664564886575e-06, + "loss": 0.3727, + "step": 11342 + }, + { + "epoch": 1.3803468208092484, + "grad_norm": 5.482038974761963, + "learning_rate": 4.53903032179001e-06, + "loss": 0.482, + "step": 11343 + }, + { + "epoch": 1.3804685123212657, + "grad_norm": 2.9925808906555176, + "learning_rate": 4.5373962864946395e-06, + "loss": 0.4054, + "step": 11344 + }, + { + "epoch": 1.3805902038332827, + "grad_norm": 1.8215121030807495, + "learning_rate": 4.535762459062653e-06, + "loss": 0.3994, + "step": 11345 + }, + { + "epoch": 1.3807118953452997, + "grad_norm": 1.8021489381790161, + "learning_rate": 4.534128839556213e-06, + "loss": 0.3685, + "step": 11346 + }, + { + "epoch": 1.3808335868573167, + "grad_norm": 3.0243401527404785, + "learning_rate": 4.532495428037482e-06, + "loss": 0.3313, + "step": 11347 + }, + { + "epoch": 1.3809552783693337, + "grad_norm": 1.5342028141021729, + "learning_rate": 4.530862224568624e-06, + "loss": 0.3824, + "step": 11348 + }, + { + "epoch": 1.3810769698813508, + "grad_norm": 1.3513374328613281, + "learning_rate": 4.5292292292117724e-06, + "loss": 0.3075, + "step": 11349 + }, + { + "epoch": 1.3811986613933678, + "grad_norm": 1.4740082025527954, + "learning_rate": 4.527596442029075e-06, + "loss": 0.3729, + "step": 11350 + }, + { + "epoch": 1.3813203529053848, + "grad_norm": 1.7651846408843994, + "learning_rate": 4.5259638630826605e-06, + "loss": 0.4135, + "step": 11351 + }, + { + "epoch": 1.3814420444174018, + "grad_norm": 2.790583610534668, + "learning_rate": 4.5243314924346485e-06, + "loss": 0.359, + "step": 11352 + }, + { + "epoch": 1.381563735929419, + "grad_norm": 2.103055000305176, + "learning_rate": 4.52269933014716e-06, + "loss": 0.3836, + "step": 11353 + }, + { + "epoch": 1.381685427441436, + "grad_norm": 1.776471495628357, + "learning_rate": 4.521067376282301e-06, + "loss": 0.4036, + "step": 11354 + }, + { + "epoch": 1.381807118953453, + "grad_norm": 1.5256181955337524, + "learning_rate": 4.519435630902164e-06, + "loss": 0.3902, + "step": 11355 + }, + { + "epoch": 1.3819288104654701, + "grad_norm": 2.557204484939575, + "learning_rate": 4.517804094068849e-06, + "loss": 0.4893, + "step": 11356 + }, + { + "epoch": 1.3820505019774871, + "grad_norm": 3.6220035552978516, + "learning_rate": 4.516172765844436e-06, + "loss": 0.3794, + "step": 11357 + }, + { + "epoch": 1.3821721934895042, + "grad_norm": 2.16264009475708, + "learning_rate": 4.5145416462909976e-06, + "loss": 0.459, + "step": 11358 + }, + { + "epoch": 1.3822938850015212, + "grad_norm": 1.595890760421753, + "learning_rate": 4.512910735470606e-06, + "loss": 0.3543, + "step": 11359 + }, + { + "epoch": 1.3824155765135382, + "grad_norm": 1.916425347328186, + "learning_rate": 4.5112800334453185e-06, + "loss": 0.3977, + "step": 11360 + }, + { + "epoch": 1.3825372680255552, + "grad_norm": 1.6924155950546265, + "learning_rate": 4.509649540277188e-06, + "loss": 0.3828, + "step": 11361 + }, + { + "epoch": 1.3826589595375722, + "grad_norm": 1.6039602756500244, + "learning_rate": 4.5080192560282564e-06, + "loss": 0.3664, + "step": 11362 + }, + { + "epoch": 1.3827806510495892, + "grad_norm": 1.7230364084243774, + "learning_rate": 4.506389180760557e-06, + "loss": 0.3634, + "step": 11363 + }, + { + "epoch": 1.3829023425616063, + "grad_norm": 1.7189810276031494, + "learning_rate": 4.504759314536123e-06, + "loss": 0.4175, + "step": 11364 + }, + { + "epoch": 1.3830240340736233, + "grad_norm": 1.8771231174468994, + "learning_rate": 4.503129657416971e-06, + "loss": 0.3689, + "step": 11365 + }, + { + "epoch": 1.3831457255856403, + "grad_norm": 1.9943089485168457, + "learning_rate": 4.50150020946511e-06, + "loss": 0.3966, + "step": 11366 + }, + { + "epoch": 1.3832674170976573, + "grad_norm": 2.8318376541137695, + "learning_rate": 4.4998709707425525e-06, + "loss": 0.3531, + "step": 11367 + }, + { + "epoch": 1.3833891086096743, + "grad_norm": 2.465700626373291, + "learning_rate": 4.498241941311288e-06, + "loss": 0.4314, + "step": 11368 + }, + { + "epoch": 1.3835108001216916, + "grad_norm": 2.425323009490967, + "learning_rate": 4.496613121233302e-06, + "loss": 0.3252, + "step": 11369 + }, + { + "epoch": 1.3836324916337086, + "grad_norm": 1.7423495054244995, + "learning_rate": 4.494984510570583e-06, + "loss": 0.3892, + "step": 11370 + }, + { + "epoch": 1.3837541831457256, + "grad_norm": 1.561933994293213, + "learning_rate": 4.493356109385097e-06, + "loss": 0.3891, + "step": 11371 + }, + { + "epoch": 1.3838758746577426, + "grad_norm": 2.1172046661376953, + "learning_rate": 4.4917279177388085e-06, + "loss": 0.402, + "step": 11372 + }, + { + "epoch": 1.3839975661697597, + "grad_norm": 1.6774377822875977, + "learning_rate": 4.490099935693675e-06, + "loss": 0.3604, + "step": 11373 + }, + { + "epoch": 1.3841192576817767, + "grad_norm": 1.4198288917541504, + "learning_rate": 4.48847216331164e-06, + "loss": 0.3604, + "step": 11374 + }, + { + "epoch": 1.3842409491937937, + "grad_norm": 1.3024564981460571, + "learning_rate": 4.486844600654651e-06, + "loss": 0.3492, + "step": 11375 + }, + { + "epoch": 1.3843626407058107, + "grad_norm": 1.8111578226089478, + "learning_rate": 4.485217247784638e-06, + "loss": 0.3669, + "step": 11376 + }, + { + "epoch": 1.3844843322178277, + "grad_norm": 1.1767746210098267, + "learning_rate": 4.483590104763519e-06, + "loss": 0.3423, + "step": 11377 + }, + { + "epoch": 1.384606023729845, + "grad_norm": 1.3728400468826294, + "learning_rate": 4.481963171653217e-06, + "loss": 0.3543, + "step": 11378 + }, + { + "epoch": 1.384727715241862, + "grad_norm": 1.6849199533462524, + "learning_rate": 4.48033644851564e-06, + "loss": 0.3278, + "step": 11379 + }, + { + "epoch": 1.384849406753879, + "grad_norm": 1.7433174848556519, + "learning_rate": 4.478709935412681e-06, + "loss": 0.4172, + "step": 11380 + }, + { + "epoch": 1.384971098265896, + "grad_norm": 1.8216497898101807, + "learning_rate": 4.477083632406243e-06, + "loss": 0.3028, + "step": 11381 + }, + { + "epoch": 1.385092789777913, + "grad_norm": 1.8246872425079346, + "learning_rate": 4.475457539558203e-06, + "loss": 0.3328, + "step": 11382 + }, + { + "epoch": 1.38521448128993, + "grad_norm": 1.909900426864624, + "learning_rate": 4.473831656930439e-06, + "loss": 0.3353, + "step": 11383 + }, + { + "epoch": 1.385336172801947, + "grad_norm": 1.9581698179244995, + "learning_rate": 4.472205984584821e-06, + "loss": 0.3799, + "step": 11384 + }, + { + "epoch": 1.3854578643139641, + "grad_norm": 2.3372864723205566, + "learning_rate": 4.470580522583201e-06, + "loss": 0.4281, + "step": 11385 + }, + { + "epoch": 1.3855795558259811, + "grad_norm": 2.010037422180176, + "learning_rate": 4.468955270987443e-06, + "loss": 0.3909, + "step": 11386 + }, + { + "epoch": 1.3857012473379982, + "grad_norm": 1.598394513130188, + "learning_rate": 4.467330229859383e-06, + "loss": 0.3731, + "step": 11387 + }, + { + "epoch": 1.3858229388500152, + "grad_norm": 1.558213233947754, + "learning_rate": 4.465705399260859e-06, + "loss": 0.3479, + "step": 11388 + }, + { + "epoch": 1.3859446303620322, + "grad_norm": 1.814674735069275, + "learning_rate": 4.4640807792537025e-06, + "loss": 0.4653, + "step": 11389 + }, + { + "epoch": 1.3860663218740492, + "grad_norm": 1.3796453475952148, + "learning_rate": 4.46245636989973e-06, + "loss": 0.3472, + "step": 11390 + }, + { + "epoch": 1.3861880133860662, + "grad_norm": 1.381128191947937, + "learning_rate": 4.460832171260752e-06, + "loss": 0.3375, + "step": 11391 + }, + { + "epoch": 1.3863097048980833, + "grad_norm": 2.082923173904419, + "learning_rate": 4.459208183398579e-06, + "loss": 0.3844, + "step": 11392 + }, + { + "epoch": 1.3864313964101003, + "grad_norm": 1.6432932615280151, + "learning_rate": 4.457584406374999e-06, + "loss": 0.3989, + "step": 11393 + }, + { + "epoch": 1.3865530879221175, + "grad_norm": 1.6915321350097656, + "learning_rate": 4.455960840251813e-06, + "loss": 0.4035, + "step": 11394 + }, + { + "epoch": 1.3866747794341345, + "grad_norm": 1.5092614889144897, + "learning_rate": 4.454337485090789e-06, + "loss": 0.3719, + "step": 11395 + }, + { + "epoch": 1.3867964709461515, + "grad_norm": 1.3614698648452759, + "learning_rate": 4.452714340953698e-06, + "loss": 0.3252, + "step": 11396 + }, + { + "epoch": 1.3869181624581686, + "grad_norm": 1.6111360788345337, + "learning_rate": 4.4510914079023115e-06, + "loss": 0.3535, + "step": 11397 + }, + { + "epoch": 1.3870398539701856, + "grad_norm": 1.8023755550384521, + "learning_rate": 4.4494686859983835e-06, + "loss": 0.4033, + "step": 11398 + }, + { + "epoch": 1.3871615454822026, + "grad_norm": 1.939241886138916, + "learning_rate": 4.447846175303657e-06, + "loss": 0.3782, + "step": 11399 + }, + { + "epoch": 1.3872832369942196, + "grad_norm": 1.8333767652511597, + "learning_rate": 4.44622387587988e-06, + "loss": 0.3926, + "step": 11400 + }, + { + "epoch": 1.3874049285062366, + "grad_norm": 1.9471579790115356, + "learning_rate": 4.444601787788777e-06, + "loss": 0.2893, + "step": 11401 + }, + { + "epoch": 1.3875266200182537, + "grad_norm": 2.24674391746521, + "learning_rate": 4.442979911092078e-06, + "loss": 0.3827, + "step": 11402 + }, + { + "epoch": 1.3876483115302707, + "grad_norm": 2.093810558319092, + "learning_rate": 4.4413582458514955e-06, + "loss": 0.3692, + "step": 11403 + }, + { + "epoch": 1.387770003042288, + "grad_norm": 1.5624829530715942, + "learning_rate": 4.439736792128735e-06, + "loss": 0.3367, + "step": 11404 + }, + { + "epoch": 1.387891694554305, + "grad_norm": 1.9874701499938965, + "learning_rate": 4.438115549985501e-06, + "loss": 0.3984, + "step": 11405 + }, + { + "epoch": 1.388013386066322, + "grad_norm": 1.513587474822998, + "learning_rate": 4.436494519483487e-06, + "loss": 0.3688, + "step": 11406 + }, + { + "epoch": 1.388135077578339, + "grad_norm": 2.258695363998413, + "learning_rate": 4.434873700684365e-06, + "loss": 0.3907, + "step": 11407 + }, + { + "epoch": 1.388256769090356, + "grad_norm": 1.6159029006958008, + "learning_rate": 4.433253093649822e-06, + "loss": 0.4083, + "step": 11408 + }, + { + "epoch": 1.388378460602373, + "grad_norm": 3.6507537364959717, + "learning_rate": 4.431632698441517e-06, + "loss": 0.4805, + "step": 11409 + }, + { + "epoch": 1.38850015211439, + "grad_norm": 1.2714152336120605, + "learning_rate": 4.430012515121117e-06, + "loss": 0.3325, + "step": 11410 + }, + { + "epoch": 1.388621843626407, + "grad_norm": 1.7035398483276367, + "learning_rate": 4.428392543750271e-06, + "loss": 0.3462, + "step": 11411 + }, + { + "epoch": 1.388743535138424, + "grad_norm": 3.3468666076660156, + "learning_rate": 4.426772784390617e-06, + "loss": 0.458, + "step": 11412 + }, + { + "epoch": 1.388865226650441, + "grad_norm": 3.2629191875457764, + "learning_rate": 4.425153237103799e-06, + "loss": 0.4103, + "step": 11413 + }, + { + "epoch": 1.3889869181624581, + "grad_norm": 2.212270736694336, + "learning_rate": 4.423533901951439e-06, + "loss": 0.403, + "step": 11414 + }, + { + "epoch": 1.3891086096744751, + "grad_norm": 1.7704497575759888, + "learning_rate": 4.421914778995154e-06, + "loss": 0.4272, + "step": 11415 + }, + { + "epoch": 1.3892303011864922, + "grad_norm": 2.283195972442627, + "learning_rate": 4.4202958682965595e-06, + "loss": 0.379, + "step": 11416 + }, + { + "epoch": 1.3893519926985092, + "grad_norm": 1.9091100692749023, + "learning_rate": 4.418677169917257e-06, + "loss": 0.4111, + "step": 11417 + }, + { + "epoch": 1.3894736842105262, + "grad_norm": 1.59117591381073, + "learning_rate": 4.417058683918841e-06, + "loss": 0.421, + "step": 11418 + }, + { + "epoch": 1.3895953757225434, + "grad_norm": 1.984761357307434, + "learning_rate": 4.415440410362899e-06, + "loss": 0.3825, + "step": 11419 + }, + { + "epoch": 1.3897170672345605, + "grad_norm": 1.6840970516204834, + "learning_rate": 4.413822349311005e-06, + "loss": 0.4399, + "step": 11420 + }, + { + "epoch": 1.3898387587465775, + "grad_norm": 2.9814164638519287, + "learning_rate": 4.412204500824736e-06, + "loss": 0.4524, + "step": 11421 + }, + { + "epoch": 1.3899604502585945, + "grad_norm": 1.4622747898101807, + "learning_rate": 4.410586864965653e-06, + "loss": 0.3557, + "step": 11422 + }, + { + "epoch": 1.3900821417706115, + "grad_norm": 5.3173322677612305, + "learning_rate": 4.408969441795305e-06, + "loss": 0.3576, + "step": 11423 + }, + { + "epoch": 1.3902038332826285, + "grad_norm": 4.087156772613525, + "learning_rate": 4.407352231375246e-06, + "loss": 0.3436, + "step": 11424 + }, + { + "epoch": 1.3903255247946456, + "grad_norm": 3.7394979000091553, + "learning_rate": 4.4057352337670114e-06, + "loss": 0.3127, + "step": 11425 + }, + { + "epoch": 1.3904472163066626, + "grad_norm": 3.23745059967041, + "learning_rate": 4.4041184490321265e-06, + "loss": 0.3613, + "step": 11426 + }, + { + "epoch": 1.3905689078186796, + "grad_norm": 2.2304234504699707, + "learning_rate": 4.402501877232121e-06, + "loss": 0.396, + "step": 11427 + }, + { + "epoch": 1.3906905993306966, + "grad_norm": 1.6873611211776733, + "learning_rate": 4.4008855184285045e-06, + "loss": 0.3306, + "step": 11428 + }, + { + "epoch": 1.3908122908427138, + "grad_norm": 1.6144554615020752, + "learning_rate": 4.3992693726827855e-06, + "loss": 0.3194, + "step": 11429 + }, + { + "epoch": 1.3909339823547309, + "grad_norm": 1.2821298837661743, + "learning_rate": 4.397653440056459e-06, + "loss": 0.3537, + "step": 11430 + }, + { + "epoch": 1.3910556738667479, + "grad_norm": 1.6831331253051758, + "learning_rate": 4.39603772061101e-06, + "loss": 0.3916, + "step": 11431 + }, + { + "epoch": 1.391177365378765, + "grad_norm": 1.4870332479476929, + "learning_rate": 4.394422214407932e-06, + "loss": 0.3698, + "step": 11432 + }, + { + "epoch": 1.391299056890782, + "grad_norm": 1.2417694330215454, + "learning_rate": 4.3928069215086896e-06, + "loss": 0.3397, + "step": 11433 + }, + { + "epoch": 1.391420748402799, + "grad_norm": 1.4579051733016968, + "learning_rate": 4.391191841974747e-06, + "loss": 0.304, + "step": 11434 + }, + { + "epoch": 1.391542439914816, + "grad_norm": 4.942966461181641, + "learning_rate": 4.389576975867568e-06, + "loss": 0.4639, + "step": 11435 + }, + { + "epoch": 1.391664131426833, + "grad_norm": 2.9270763397216797, + "learning_rate": 4.387962323248598e-06, + "loss": 0.3798, + "step": 11436 + }, + { + "epoch": 1.39178582293885, + "grad_norm": 5.224781513214111, + "learning_rate": 4.386347884179274e-06, + "loss": 0.4888, + "step": 11437 + }, + { + "epoch": 1.391907514450867, + "grad_norm": 4.197554111480713, + "learning_rate": 4.384733658721036e-06, + "loss": 0.4567, + "step": 11438 + }, + { + "epoch": 1.392029205962884, + "grad_norm": 1.351085901260376, + "learning_rate": 4.3831196469353056e-06, + "loss": 0.3538, + "step": 11439 + }, + { + "epoch": 1.392150897474901, + "grad_norm": 2.068093776702881, + "learning_rate": 4.381505848883496e-06, + "loss": 0.2867, + "step": 11440 + }, + { + "epoch": 1.392272588986918, + "grad_norm": 1.9180797338485718, + "learning_rate": 4.379892264627025e-06, + "loss": 0.432, + "step": 11441 + }, + { + "epoch": 1.392394280498935, + "grad_norm": 2.743220329284668, + "learning_rate": 4.378278894227279e-06, + "loss": 0.3381, + "step": 11442 + }, + { + "epoch": 1.3925159720109521, + "grad_norm": 1.7271233797073364, + "learning_rate": 4.376665737745661e-06, + "loss": 0.3046, + "step": 11443 + }, + { + "epoch": 1.3926376635229691, + "grad_norm": 2.314553737640381, + "learning_rate": 4.3750527952435514e-06, + "loss": 0.4137, + "step": 11444 + }, + { + "epoch": 1.3927593550349864, + "grad_norm": 5.153557300567627, + "learning_rate": 4.373440066782323e-06, + "loss": 0.4349, + "step": 11445 + }, + { + "epoch": 1.3928810465470034, + "grad_norm": 3.608574390411377, + "learning_rate": 4.371827552423349e-06, + "loss": 0.4626, + "step": 11446 + }, + { + "epoch": 1.3930027380590204, + "grad_norm": 1.3261542320251465, + "learning_rate": 4.370215252227987e-06, + "loss": 0.3246, + "step": 11447 + }, + { + "epoch": 1.3931244295710374, + "grad_norm": 2.7190096378326416, + "learning_rate": 4.368603166257584e-06, + "loss": 0.3735, + "step": 11448 + }, + { + "epoch": 1.3932461210830545, + "grad_norm": 2.8585147857666016, + "learning_rate": 4.36699129457349e-06, + "loss": 0.39, + "step": 11449 + }, + { + "epoch": 1.3933678125950715, + "grad_norm": 2.667886734008789, + "learning_rate": 4.365379637237037e-06, + "loss": 0.3437, + "step": 11450 + }, + { + "epoch": 1.3934895041070885, + "grad_norm": 1.6503410339355469, + "learning_rate": 4.363768194309549e-06, + "loss": 0.3958, + "step": 11451 + }, + { + "epoch": 1.3936111956191055, + "grad_norm": 1.6660648584365845, + "learning_rate": 4.362156965852354e-06, + "loss": 0.3694, + "step": 11452 + }, + { + "epoch": 1.3937328871311225, + "grad_norm": 1.6979115009307861, + "learning_rate": 4.36054595192675e-06, + "loss": 0.4006, + "step": 11453 + }, + { + "epoch": 1.3938545786431398, + "grad_norm": 3.540794849395752, + "learning_rate": 4.358935152594048e-06, + "loss": 0.4581, + "step": 11454 + }, + { + "epoch": 1.3939762701551568, + "grad_norm": 2.0232386589050293, + "learning_rate": 4.357324567915541e-06, + "loss": 0.3625, + "step": 11455 + }, + { + "epoch": 1.3940979616671738, + "grad_norm": 1.5823118686676025, + "learning_rate": 4.355714197952508e-06, + "loss": 0.4557, + "step": 11456 + }, + { + "epoch": 1.3942196531791908, + "grad_norm": 2.286123275756836, + "learning_rate": 4.354104042766238e-06, + "loss": 0.3242, + "step": 11457 + }, + { + "epoch": 1.3943413446912079, + "grad_norm": 4.621548175811768, + "learning_rate": 4.352494102417996e-06, + "loss": 0.347, + "step": 11458 + }, + { + "epoch": 1.3944630362032249, + "grad_norm": 2.9518280029296875, + "learning_rate": 4.350884376969039e-06, + "loss": 0.3284, + "step": 11459 + }, + { + "epoch": 1.394584727715242, + "grad_norm": 1.5486581325531006, + "learning_rate": 4.349274866480627e-06, + "loss": 0.3911, + "step": 11460 + }, + { + "epoch": 1.394706419227259, + "grad_norm": 2.0710244178771973, + "learning_rate": 4.347665571014e-06, + "loss": 0.4565, + "step": 11461 + }, + { + "epoch": 1.394828110739276, + "grad_norm": 2.828049659729004, + "learning_rate": 4.346056490630403e-06, + "loss": 0.3245, + "step": 11462 + }, + { + "epoch": 1.394949802251293, + "grad_norm": 1.3766549825668335, + "learning_rate": 4.344447625391057e-06, + "loss": 0.371, + "step": 11463 + }, + { + "epoch": 1.39507149376331, + "grad_norm": 1.8920338153839111, + "learning_rate": 4.342838975357187e-06, + "loss": 0.3637, + "step": 11464 + }, + { + "epoch": 1.395193185275327, + "grad_norm": 1.9529694318771362, + "learning_rate": 4.341230540590003e-06, + "loss": 0.3806, + "step": 11465 + }, + { + "epoch": 1.395314876787344, + "grad_norm": 1.478337287902832, + "learning_rate": 4.339622321150709e-06, + "loss": 0.3799, + "step": 11466 + }, + { + "epoch": 1.395436568299361, + "grad_norm": 1.3936914205551147, + "learning_rate": 4.3380143171004996e-06, + "loss": 0.3239, + "step": 11467 + }, + { + "epoch": 1.395558259811378, + "grad_norm": 1.5418885946273804, + "learning_rate": 4.336406528500567e-06, + "loss": 0.3663, + "step": 11468 + }, + { + "epoch": 1.395679951323395, + "grad_norm": 1.9808118343353271, + "learning_rate": 4.3347989554120876e-06, + "loss": 0.3557, + "step": 11469 + }, + { + "epoch": 1.3958016428354123, + "grad_norm": 2.683344841003418, + "learning_rate": 4.333191597896237e-06, + "loss": 0.443, + "step": 11470 + }, + { + "epoch": 1.3959233343474293, + "grad_norm": 2.1260132789611816, + "learning_rate": 4.331584456014175e-06, + "loss": 0.3739, + "step": 11471 + }, + { + "epoch": 1.3960450258594463, + "grad_norm": 1.494470477104187, + "learning_rate": 4.329977529827055e-06, + "loss": 0.3471, + "step": 11472 + }, + { + "epoch": 1.3961667173714634, + "grad_norm": 2.17232084274292, + "learning_rate": 4.3283708193960295e-06, + "loss": 0.329, + "step": 11473 + }, + { + "epoch": 1.3962884088834804, + "grad_norm": 2.302095651626587, + "learning_rate": 4.326764324782234e-06, + "loss": 0.3771, + "step": 11474 + }, + { + "epoch": 1.3964101003954974, + "grad_norm": 3.183126211166382, + "learning_rate": 4.325158046046798e-06, + "loss": 0.4264, + "step": 11475 + }, + { + "epoch": 1.3965317919075144, + "grad_norm": 1.6612156629562378, + "learning_rate": 4.323551983250846e-06, + "loss": 0.3465, + "step": 11476 + }, + { + "epoch": 1.3966534834195314, + "grad_norm": 1.7970367670059204, + "learning_rate": 4.321946136455486e-06, + "loss": 0.4116, + "step": 11477 + }, + { + "epoch": 1.3967751749315485, + "grad_norm": 2.088991641998291, + "learning_rate": 4.320340505721833e-06, + "loss": 0.3573, + "step": 11478 + }, + { + "epoch": 1.3968968664435657, + "grad_norm": 1.435523271560669, + "learning_rate": 4.318735091110979e-06, + "loss": 0.4107, + "step": 11479 + }, + { + "epoch": 1.3970185579555827, + "grad_norm": 1.9187802076339722, + "learning_rate": 4.3171298926840114e-06, + "loss": 0.3931, + "step": 11480 + }, + { + "epoch": 1.3971402494675997, + "grad_norm": 1.7297875881195068, + "learning_rate": 4.315524910502017e-06, + "loss": 0.4248, + "step": 11481 + }, + { + "epoch": 1.3972619409796168, + "grad_norm": 1.4448972940444946, + "learning_rate": 4.313920144626067e-06, + "loss": 0.334, + "step": 11482 + }, + { + "epoch": 1.3973836324916338, + "grad_norm": 1.4364911317825317, + "learning_rate": 4.312315595117221e-06, + "loss": 0.3802, + "step": 11483 + }, + { + "epoch": 1.3975053240036508, + "grad_norm": 1.8982006311416626, + "learning_rate": 4.310711262036543e-06, + "loss": 0.3451, + "step": 11484 + }, + { + "epoch": 1.3976270155156678, + "grad_norm": 1.715415120124817, + "learning_rate": 4.3091071454450785e-06, + "loss": 0.3993, + "step": 11485 + }, + { + "epoch": 1.3977487070276848, + "grad_norm": 1.5801661014556885, + "learning_rate": 4.307503245403862e-06, + "loss": 0.3968, + "step": 11486 + }, + { + "epoch": 1.3978703985397019, + "grad_norm": 1.4895697832107544, + "learning_rate": 4.305899561973937e-06, + "loss": 0.4462, + "step": 11487 + }, + { + "epoch": 1.3979920900517189, + "grad_norm": 1.5394319295883179, + "learning_rate": 4.304296095216314e-06, + "loss": 0.3441, + "step": 11488 + }, + { + "epoch": 1.398113781563736, + "grad_norm": 1.4770684242248535, + "learning_rate": 4.302692845192016e-06, + "loss": 0.4017, + "step": 11489 + }, + { + "epoch": 1.398235473075753, + "grad_norm": 1.7553646564483643, + "learning_rate": 4.301089811962048e-06, + "loss": 0.4142, + "step": 11490 + }, + { + "epoch": 1.39835716458777, + "grad_norm": 2.124732732772827, + "learning_rate": 4.299486995587406e-06, + "loss": 0.3152, + "step": 11491 + }, + { + "epoch": 1.398478856099787, + "grad_norm": 2.317671298980713, + "learning_rate": 4.2978843961290875e-06, + "loss": 0.3979, + "step": 11492 + }, + { + "epoch": 1.398600547611804, + "grad_norm": 1.6811238527297974, + "learning_rate": 4.296282013648069e-06, + "loss": 0.3974, + "step": 11493 + }, + { + "epoch": 1.398722239123821, + "grad_norm": 2.0519702434539795, + "learning_rate": 4.294679848205323e-06, + "loss": 0.4209, + "step": 11494 + }, + { + "epoch": 1.3988439306358382, + "grad_norm": 1.7124383449554443, + "learning_rate": 4.293077899861821e-06, + "loss": 0.3319, + "step": 11495 + }, + { + "epoch": 1.3989656221478552, + "grad_norm": 1.9717894792556763, + "learning_rate": 4.2914761686785186e-06, + "loss": 0.3663, + "step": 11496 + }, + { + "epoch": 1.3990873136598723, + "grad_norm": 1.6932785511016846, + "learning_rate": 4.28987465471636e-06, + "loss": 0.3505, + "step": 11497 + }, + { + "epoch": 1.3992090051718893, + "grad_norm": 1.7399998903274536, + "learning_rate": 4.288273358036298e-06, + "loss": 0.3124, + "step": 11498 + }, + { + "epoch": 1.3993306966839063, + "grad_norm": 3.1257483959198, + "learning_rate": 4.286672278699249e-06, + "loss": 0.316, + "step": 11499 + }, + { + "epoch": 1.3994523881959233, + "grad_norm": 1.7928040027618408, + "learning_rate": 4.28507141676615e-06, + "loss": 0.4447, + "step": 11500 + }, + { + "epoch": 1.3995740797079403, + "grad_norm": 3.268521547317505, + "learning_rate": 4.283470772297912e-06, + "loss": 0.3814, + "step": 11501 + }, + { + "epoch": 1.3996957712199574, + "grad_norm": 1.739776849746704, + "learning_rate": 4.281870345355441e-06, + "loss": 0.394, + "step": 11502 + }, + { + "epoch": 1.3998174627319744, + "grad_norm": 2.0779480934143066, + "learning_rate": 4.280270135999642e-06, + "loss": 0.3376, + "step": 11503 + }, + { + "epoch": 1.3999391542439914, + "grad_norm": 2.316032886505127, + "learning_rate": 4.278670144291405e-06, + "loss": 0.4113, + "step": 11504 + }, + { + "epoch": 1.4000608457560086, + "grad_norm": 1.5981416702270508, + "learning_rate": 4.277070370291606e-06, + "loss": 0.3413, + "step": 11505 + }, + { + "epoch": 1.4001825372680257, + "grad_norm": 2.4327147006988525, + "learning_rate": 4.27547081406113e-06, + "loss": 0.3933, + "step": 11506 + }, + { + "epoch": 1.4003042287800427, + "grad_norm": 1.9172366857528687, + "learning_rate": 4.273871475660839e-06, + "loss": 0.3367, + "step": 11507 + }, + { + "epoch": 1.4004259202920597, + "grad_norm": 1.5525368452072144, + "learning_rate": 4.272272355151586e-06, + "loss": 0.3646, + "step": 11508 + }, + { + "epoch": 1.4005476118040767, + "grad_norm": 1.6372947692871094, + "learning_rate": 4.27067345259423e-06, + "loss": 0.3979, + "step": 11509 + }, + { + "epoch": 1.4006693033160937, + "grad_norm": 3.172664165496826, + "learning_rate": 4.2690747680496085e-06, + "loss": 0.4206, + "step": 11510 + }, + { + "epoch": 1.4007909948281108, + "grad_norm": 1.5217859745025635, + "learning_rate": 4.267476301578554e-06, + "loss": 0.3368, + "step": 11511 + }, + { + "epoch": 1.4009126863401278, + "grad_norm": 2.774683952331543, + "learning_rate": 4.265878053241892e-06, + "loss": 0.4314, + "step": 11512 + }, + { + "epoch": 1.4010343778521448, + "grad_norm": 2.561152935028076, + "learning_rate": 4.264280023100435e-06, + "loss": 0.3476, + "step": 11513 + }, + { + "epoch": 1.4011560693641618, + "grad_norm": 1.487470269203186, + "learning_rate": 4.262682211214999e-06, + "loss": 0.3923, + "step": 11514 + }, + { + "epoch": 1.4012777608761788, + "grad_norm": 3.286255359649658, + "learning_rate": 4.261084617646382e-06, + "loss": 0.3526, + "step": 11515 + }, + { + "epoch": 1.4013994523881959, + "grad_norm": 1.7256561517715454, + "learning_rate": 4.259487242455369e-06, + "loss": 0.3762, + "step": 11516 + }, + { + "epoch": 1.4015211439002129, + "grad_norm": 1.8141251802444458, + "learning_rate": 4.257890085702753e-06, + "loss": 0.2966, + "step": 11517 + }, + { + "epoch": 1.40164283541223, + "grad_norm": 1.8115521669387817, + "learning_rate": 4.2562931474493044e-06, + "loss": 0.3801, + "step": 11518 + }, + { + "epoch": 1.401764526924247, + "grad_norm": 1.7352278232574463, + "learning_rate": 4.2546964277557854e-06, + "loss": 0.3983, + "step": 11519 + }, + { + "epoch": 1.4018862184362642, + "grad_norm": 1.9483028650283813, + "learning_rate": 4.253099926682965e-06, + "loss": 0.4182, + "step": 11520 + }, + { + "epoch": 1.4020079099482812, + "grad_norm": 1.8020724058151245, + "learning_rate": 4.251503644291587e-06, + "loss": 0.4037, + "step": 11521 + }, + { + "epoch": 1.4021296014602982, + "grad_norm": 1.4089369773864746, + "learning_rate": 4.2499075806423954e-06, + "loss": 0.3354, + "step": 11522 + }, + { + "epoch": 1.4022512929723152, + "grad_norm": 1.9180755615234375, + "learning_rate": 4.24831173579612e-06, + "loss": 0.3392, + "step": 11523 + }, + { + "epoch": 1.4023729844843322, + "grad_norm": 1.626389980316162, + "learning_rate": 4.2467161098134864e-06, + "loss": 0.378, + "step": 11524 + }, + { + "epoch": 1.4024946759963492, + "grad_norm": 1.5367153882980347, + "learning_rate": 4.2451207027552164e-06, + "loss": 0.3744, + "step": 11525 + }, + { + "epoch": 1.4026163675083663, + "grad_norm": 2.8640105724334717, + "learning_rate": 4.243525514682017e-06, + "loss": 0.3655, + "step": 11526 + }, + { + "epoch": 1.4027380590203833, + "grad_norm": 1.424145221710205, + "learning_rate": 4.241930545654582e-06, + "loss": 0.3609, + "step": 11527 + }, + { + "epoch": 1.4028597505324003, + "grad_norm": 2.524362802505493, + "learning_rate": 4.240335795733612e-06, + "loss": 0.3687, + "step": 11528 + }, + { + "epoch": 1.4029814420444173, + "grad_norm": 1.7690109014511108, + "learning_rate": 4.238741264979783e-06, + "loss": 0.3613, + "step": 11529 + }, + { + "epoch": 1.4031031335564346, + "grad_norm": 2.6616768836975098, + "learning_rate": 4.237146953453779e-06, + "loss": 0.389, + "step": 11530 + }, + { + "epoch": 1.4032248250684516, + "grad_norm": 1.9031298160552979, + "learning_rate": 4.235552861216261e-06, + "loss": 0.3357, + "step": 11531 + }, + { + "epoch": 1.4033465165804686, + "grad_norm": 2.886734962463379, + "learning_rate": 4.233958988327884e-06, + "loss": 0.3888, + "step": 11532 + }, + { + "epoch": 1.4034682080924856, + "grad_norm": 1.8774120807647705, + "learning_rate": 4.232365334849311e-06, + "loss": 0.3595, + "step": 11533 + }, + { + "epoch": 1.4035898996045026, + "grad_norm": 3.600778818130493, + "learning_rate": 4.230771900841168e-06, + "loss": 0.4511, + "step": 11534 + }, + { + "epoch": 1.4037115911165197, + "grad_norm": 2.5160017013549805, + "learning_rate": 4.229178686364098e-06, + "loss": 0.4237, + "step": 11535 + }, + { + "epoch": 1.4038332826285367, + "grad_norm": 2.9807233810424805, + "learning_rate": 4.227585691478726e-06, + "loss": 0.3137, + "step": 11536 + }, + { + "epoch": 1.4039549741405537, + "grad_norm": 2.2398462295532227, + "learning_rate": 4.2259929162456616e-06, + "loss": 0.4278, + "step": 11537 + }, + { + "epoch": 1.4040766656525707, + "grad_norm": 1.914557933807373, + "learning_rate": 4.224400360725522e-06, + "loss": 0.3701, + "step": 11538 + }, + { + "epoch": 1.4041983571645877, + "grad_norm": 3.069185972213745, + "learning_rate": 4.222808024978905e-06, + "loss": 0.4091, + "step": 11539 + }, + { + "epoch": 1.4043200486766048, + "grad_norm": 1.572724461555481, + "learning_rate": 4.221215909066395e-06, + "loss": 0.3355, + "step": 11540 + }, + { + "epoch": 1.4044417401886218, + "grad_norm": 2.696105480194092, + "learning_rate": 4.219624013048587e-06, + "loss": 0.3454, + "step": 11541 + }, + { + "epoch": 1.4045634317006388, + "grad_norm": 1.6319700479507446, + "learning_rate": 4.218032336986048e-06, + "loss": 0.3653, + "step": 11542 + }, + { + "epoch": 1.4046851232126558, + "grad_norm": 1.9677331447601318, + "learning_rate": 4.216440880939344e-06, + "loss": 0.4175, + "step": 11543 + }, + { + "epoch": 1.4048068147246728, + "grad_norm": 3.3107752799987793, + "learning_rate": 4.214849644969042e-06, + "loss": 0.3172, + "step": 11544 + }, + { + "epoch": 1.4049285062366899, + "grad_norm": 2.2747137546539307, + "learning_rate": 4.213258629135679e-06, + "loss": 0.3307, + "step": 11545 + }, + { + "epoch": 1.405050197748707, + "grad_norm": 3.6300511360168457, + "learning_rate": 4.211667833499807e-06, + "loss": 0.2855, + "step": 11546 + }, + { + "epoch": 1.4051718892607241, + "grad_norm": 1.8248096704483032, + "learning_rate": 4.210077258121955e-06, + "loss": 0.4238, + "step": 11547 + }, + { + "epoch": 1.4052935807727411, + "grad_norm": 1.8915961980819702, + "learning_rate": 4.208486903062644e-06, + "loss": 0.3683, + "step": 11548 + }, + { + "epoch": 1.4054152722847582, + "grad_norm": 1.917109727859497, + "learning_rate": 4.206896768382398e-06, + "loss": 0.4559, + "step": 11549 + }, + { + "epoch": 1.4055369637967752, + "grad_norm": 1.8176966905593872, + "learning_rate": 4.205306854141722e-06, + "loss": 0.3801, + "step": 11550 + }, + { + "epoch": 1.4056586553087922, + "grad_norm": 1.6042671203613281, + "learning_rate": 4.2037171604011096e-06, + "loss": 0.3847, + "step": 11551 + }, + { + "epoch": 1.4057803468208092, + "grad_norm": 2.1626269817352295, + "learning_rate": 4.2021276872210605e-06, + "loss": 0.3817, + "step": 11552 + }, + { + "epoch": 1.4059020383328262, + "grad_norm": 1.3933879137039185, + "learning_rate": 4.200538434662054e-06, + "loss": 0.3506, + "step": 11553 + }, + { + "epoch": 1.4060237298448433, + "grad_norm": 1.5468640327453613, + "learning_rate": 4.198949402784561e-06, + "loss": 0.3542, + "step": 11554 + }, + { + "epoch": 1.4061454213568605, + "grad_norm": 2.298234701156616, + "learning_rate": 4.197360591649053e-06, + "loss": 0.3936, + "step": 11555 + }, + { + "epoch": 1.4062671128688775, + "grad_norm": 1.9949541091918945, + "learning_rate": 4.1957720013159874e-06, + "loss": 0.3653, + "step": 11556 + }, + { + "epoch": 1.4063888043808945, + "grad_norm": 2.873588800430298, + "learning_rate": 4.19418363184581e-06, + "loss": 0.4122, + "step": 11557 + }, + { + "epoch": 1.4065104958929115, + "grad_norm": 4.114194393157959, + "learning_rate": 4.192595483298964e-06, + "loss": 0.4581, + "step": 11558 + }, + { + "epoch": 1.4066321874049286, + "grad_norm": 2.229585647583008, + "learning_rate": 4.191007555735876e-06, + "loss": 0.3804, + "step": 11559 + }, + { + "epoch": 1.4067538789169456, + "grad_norm": 1.6459980010986328, + "learning_rate": 4.189419849216977e-06, + "loss": 0.3162, + "step": 11560 + }, + { + "epoch": 1.4068755704289626, + "grad_norm": 2.128148078918457, + "learning_rate": 4.187832363802681e-06, + "loss": 0.423, + "step": 11561 + }, + { + "epoch": 1.4069972619409796, + "grad_norm": 2.1356284618377686, + "learning_rate": 4.186245099553391e-06, + "loss": 0.3911, + "step": 11562 + }, + { + "epoch": 1.4071189534529966, + "grad_norm": 1.7619588375091553, + "learning_rate": 4.184658056529511e-06, + "loss": 0.3467, + "step": 11563 + }, + { + "epoch": 1.4072406449650137, + "grad_norm": 1.9123762845993042, + "learning_rate": 4.183071234791431e-06, + "loss": 0.415, + "step": 11564 + }, + { + "epoch": 1.4073623364770307, + "grad_norm": 1.8862640857696533, + "learning_rate": 4.181484634399526e-06, + "loss": 0.3797, + "step": 11565 + }, + { + "epoch": 1.4074840279890477, + "grad_norm": 2.488956928253174, + "learning_rate": 4.179898255414178e-06, + "loss": 0.3823, + "step": 11566 + }, + { + "epoch": 1.4076057195010647, + "grad_norm": 1.537616491317749, + "learning_rate": 4.178312097895748e-06, + "loss": 0.3446, + "step": 11567 + }, + { + "epoch": 1.4077274110130817, + "grad_norm": 2.983016014099121, + "learning_rate": 4.176726161904593e-06, + "loss": 0.4306, + "step": 11568 + }, + { + "epoch": 1.4078491025250988, + "grad_norm": 1.7719531059265137, + "learning_rate": 4.175140447501061e-06, + "loss": 0.3656, + "step": 11569 + }, + { + "epoch": 1.4079707940371158, + "grad_norm": 1.6745668649673462, + "learning_rate": 4.173554954745489e-06, + "loss": 0.3083, + "step": 11570 + }, + { + "epoch": 1.408092485549133, + "grad_norm": 2.064340353012085, + "learning_rate": 4.171969683698215e-06, + "loss": 0.439, + "step": 11571 + }, + { + "epoch": 1.40821417706115, + "grad_norm": 1.612450361251831, + "learning_rate": 4.1703846344195565e-06, + "loss": 0.3397, + "step": 11572 + }, + { + "epoch": 1.408335868573167, + "grad_norm": 1.6694869995117188, + "learning_rate": 4.168799806969827e-06, + "loss": 0.388, + "step": 11573 + }, + { + "epoch": 1.408457560085184, + "grad_norm": 3.5114548206329346, + "learning_rate": 4.167215201409337e-06, + "loss": 0.2922, + "step": 11574 + }, + { + "epoch": 1.408579251597201, + "grad_norm": 3.0673043727874756, + "learning_rate": 4.165630817798383e-06, + "loss": 0.3318, + "step": 11575 + }, + { + "epoch": 1.4087009431092181, + "grad_norm": 2.271623134613037, + "learning_rate": 4.164046656197248e-06, + "loss": 0.357, + "step": 11576 + }, + { + "epoch": 1.4088226346212351, + "grad_norm": 1.4271769523620605, + "learning_rate": 4.162462716666222e-06, + "loss": 0.3588, + "step": 11577 + }, + { + "epoch": 1.4089443261332522, + "grad_norm": 1.7905840873718262, + "learning_rate": 4.160878999265573e-06, + "loss": 0.3455, + "step": 11578 + }, + { + "epoch": 1.4090660176452692, + "grad_norm": 2.0919485092163086, + "learning_rate": 4.159295504055564e-06, + "loss": 0.3094, + "step": 11579 + }, + { + "epoch": 1.4091877091572864, + "grad_norm": 2.895566940307617, + "learning_rate": 4.15771223109645e-06, + "loss": 0.449, + "step": 11580 + }, + { + "epoch": 1.4093094006693034, + "grad_norm": 2.123392343521118, + "learning_rate": 4.156129180448476e-06, + "loss": 0.3325, + "step": 11581 + }, + { + "epoch": 1.4094310921813205, + "grad_norm": 1.632764458656311, + "learning_rate": 4.154546352171885e-06, + "loss": 0.3615, + "step": 11582 + }, + { + "epoch": 1.4095527836933375, + "grad_norm": 2.295473337173462, + "learning_rate": 4.152963746326907e-06, + "loss": 0.3268, + "step": 11583 + }, + { + "epoch": 1.4096744752053545, + "grad_norm": 1.784664273262024, + "learning_rate": 4.1513813629737556e-06, + "loss": 0.3758, + "step": 11584 + }, + { + "epoch": 1.4097961667173715, + "grad_norm": 2.5662484169006348, + "learning_rate": 4.149799202172653e-06, + "loss": 0.3607, + "step": 11585 + }, + { + "epoch": 1.4099178582293885, + "grad_norm": 1.2922815084457397, + "learning_rate": 4.1482172639837966e-06, + "loss": 0.3072, + "step": 11586 + }, + { + "epoch": 1.4100395497414056, + "grad_norm": 2.3864541053771973, + "learning_rate": 4.14663554846739e-06, + "loss": 0.3599, + "step": 11587 + }, + { + "epoch": 1.4101612412534226, + "grad_norm": 2.0226659774780273, + "learning_rate": 4.145054055683616e-06, + "loss": 0.3749, + "step": 11588 + }, + { + "epoch": 1.4102829327654396, + "grad_norm": 2.0993988513946533, + "learning_rate": 4.143472785692652e-06, + "loss": 0.3672, + "step": 11589 + }, + { + "epoch": 1.4104046242774566, + "grad_norm": 1.5659104585647583, + "learning_rate": 4.1418917385546766e-06, + "loss": 0.3596, + "step": 11590 + }, + { + "epoch": 1.4105263157894736, + "grad_norm": 2.0968754291534424, + "learning_rate": 4.140310914329843e-06, + "loss": 0.3578, + "step": 11591 + }, + { + "epoch": 1.4106480073014906, + "grad_norm": 1.5416018962860107, + "learning_rate": 4.138730313078304e-06, + "loss": 0.3326, + "step": 11592 + }, + { + "epoch": 1.4107696988135077, + "grad_norm": 2.7538788318634033, + "learning_rate": 4.137149934860213e-06, + "loss": 0.3819, + "step": 11593 + }, + { + "epoch": 1.4108913903255247, + "grad_norm": 1.3816417455673218, + "learning_rate": 4.1355697797356985e-06, + "loss": 0.3746, + "step": 11594 + }, + { + "epoch": 1.4110130818375417, + "grad_norm": 1.8974199295043945, + "learning_rate": 4.133989847764897e-06, + "loss": 0.3768, + "step": 11595 + }, + { + "epoch": 1.411134773349559, + "grad_norm": 1.9509323835372925, + "learning_rate": 4.1324101390079215e-06, + "loss": 0.3915, + "step": 11596 + }, + { + "epoch": 1.411256464861576, + "grad_norm": 3.6561601161956787, + "learning_rate": 4.130830653524885e-06, + "loss": 0.3351, + "step": 11597 + }, + { + "epoch": 1.411378156373593, + "grad_norm": 1.790881872177124, + "learning_rate": 4.129251391375891e-06, + "loss": 0.3948, + "step": 11598 + }, + { + "epoch": 1.41149984788561, + "grad_norm": 1.3854141235351562, + "learning_rate": 4.127672352621035e-06, + "loss": 0.3927, + "step": 11599 + }, + { + "epoch": 1.411621539397627, + "grad_norm": 1.6384949684143066, + "learning_rate": 4.126093537320398e-06, + "loss": 0.3742, + "step": 11600 + }, + { + "epoch": 1.411743230909644, + "grad_norm": 2.4517855644226074, + "learning_rate": 4.124514945534063e-06, + "loss": 0.358, + "step": 11601 + }, + { + "epoch": 1.411864922421661, + "grad_norm": 1.6706990003585815, + "learning_rate": 4.122936577322096e-06, + "loss": 0.3935, + "step": 11602 + }, + { + "epoch": 1.411986613933678, + "grad_norm": 2.1631698608398438, + "learning_rate": 4.121358432744558e-06, + "loss": 0.3471, + "step": 11603 + }, + { + "epoch": 1.412108305445695, + "grad_norm": 1.5076167583465576, + "learning_rate": 4.119780511861499e-06, + "loss": 0.3795, + "step": 11604 + }, + { + "epoch": 1.4122299969577121, + "grad_norm": 3.6191859245300293, + "learning_rate": 4.118202814732959e-06, + "loss": 0.3478, + "step": 11605 + }, + { + "epoch": 1.4123516884697294, + "grad_norm": 1.7758814096450806, + "learning_rate": 4.11662534141898e-06, + "loss": 0.3883, + "step": 11606 + }, + { + "epoch": 1.4124733799817464, + "grad_norm": 3.163689374923706, + "learning_rate": 4.115048091979584e-06, + "loss": 0.3568, + "step": 11607 + }, + { + "epoch": 1.4125950714937634, + "grad_norm": 2.9635977745056152, + "learning_rate": 4.113471066474788e-06, + "loss": 0.3138, + "step": 11608 + }, + { + "epoch": 1.4127167630057804, + "grad_norm": 2.9105312824249268, + "learning_rate": 4.111894264964604e-06, + "loss": 0.3379, + "step": 11609 + }, + { + "epoch": 1.4128384545177974, + "grad_norm": 2.5093648433685303, + "learning_rate": 4.110317687509033e-06, + "loss": 0.333, + "step": 11610 + }, + { + "epoch": 1.4129601460298145, + "grad_norm": 1.6861945390701294, + "learning_rate": 4.108741334168059e-06, + "loss": 0.4078, + "step": 11611 + }, + { + "epoch": 1.4130818375418315, + "grad_norm": 1.7538056373596191, + "learning_rate": 4.107165205001676e-06, + "loss": 0.3883, + "step": 11612 + }, + { + "epoch": 1.4132035290538485, + "grad_norm": 1.4961466789245605, + "learning_rate": 4.105589300069855e-06, + "loss": 0.3697, + "step": 11613 + }, + { + "epoch": 1.4133252205658655, + "grad_norm": 3.3296072483062744, + "learning_rate": 4.104013619432561e-06, + "loss": 0.4123, + "step": 11614 + }, + { + "epoch": 1.4134469120778825, + "grad_norm": 1.4338408708572388, + "learning_rate": 4.102438163149753e-06, + "loss": 0.3279, + "step": 11615 + }, + { + "epoch": 1.4135686035898996, + "grad_norm": 4.271087169647217, + "learning_rate": 4.100862931281378e-06, + "loss": 0.4388, + "step": 11616 + }, + { + "epoch": 1.4136902951019166, + "grad_norm": 1.2837892770767212, + "learning_rate": 4.099287923887381e-06, + "loss": 0.3554, + "step": 11617 + }, + { + "epoch": 1.4138119866139336, + "grad_norm": 2.8733601570129395, + "learning_rate": 4.097713141027692e-06, + "loss": 0.3854, + "step": 11618 + }, + { + "epoch": 1.4139336781259506, + "grad_norm": 1.828884482383728, + "learning_rate": 4.096138582762232e-06, + "loss": 0.3675, + "step": 11619 + }, + { + "epoch": 1.4140553696379676, + "grad_norm": 3.118661403656006, + "learning_rate": 4.094564249150923e-06, + "loss": 0.3981, + "step": 11620 + }, + { + "epoch": 1.4141770611499849, + "grad_norm": 1.6310796737670898, + "learning_rate": 4.0929901402536676e-06, + "loss": 0.4055, + "step": 11621 + }, + { + "epoch": 1.414298752662002, + "grad_norm": 2.197155714035034, + "learning_rate": 4.091416256130362e-06, + "loss": 0.3165, + "step": 11622 + }, + { + "epoch": 1.414420444174019, + "grad_norm": 2.284440755844116, + "learning_rate": 4.0898425968409e-06, + "loss": 0.3538, + "step": 11623 + }, + { + "epoch": 1.414542135686036, + "grad_norm": 1.3533656597137451, + "learning_rate": 4.088269162445161e-06, + "loss": 0.3745, + "step": 11624 + }, + { + "epoch": 1.414663827198053, + "grad_norm": 2.233342170715332, + "learning_rate": 4.086695953003017e-06, + "loss": 0.3001, + "step": 11625 + }, + { + "epoch": 1.41478551871007, + "grad_norm": 3.6351616382598877, + "learning_rate": 4.085122968574332e-06, + "loss": 0.4268, + "step": 11626 + }, + { + "epoch": 1.414907210222087, + "grad_norm": 3.2335991859436035, + "learning_rate": 4.0835502092189575e-06, + "loss": 0.4177, + "step": 11627 + }, + { + "epoch": 1.415028901734104, + "grad_norm": 2.717853546142578, + "learning_rate": 4.081977674996746e-06, + "loss": 0.4144, + "step": 11628 + }, + { + "epoch": 1.415150593246121, + "grad_norm": 2.9840035438537598, + "learning_rate": 4.080405365967536e-06, + "loss": 0.3131, + "step": 11629 + }, + { + "epoch": 1.415272284758138, + "grad_norm": 1.9089453220367432, + "learning_rate": 4.07883328219115e-06, + "loss": 0.3306, + "step": 11630 + }, + { + "epoch": 1.4153939762701553, + "grad_norm": 1.6232671737670898, + "learning_rate": 4.077261423727418e-06, + "loss": 0.371, + "step": 11631 + }, + { + "epoch": 1.4155156677821723, + "grad_norm": 1.6911869049072266, + "learning_rate": 4.075689790636147e-06, + "loss": 0.3758, + "step": 11632 + }, + { + "epoch": 1.4156373592941893, + "grad_norm": 1.5354719161987305, + "learning_rate": 4.074118382977139e-06, + "loss": 0.3222, + "step": 11633 + }, + { + "epoch": 1.4157590508062063, + "grad_norm": 3.1932930946350098, + "learning_rate": 4.072547200810196e-06, + "loss": 0.3415, + "step": 11634 + }, + { + "epoch": 1.4158807423182234, + "grad_norm": 1.4369210004806519, + "learning_rate": 4.070976244195102e-06, + "loss": 0.337, + "step": 11635 + }, + { + "epoch": 1.4160024338302404, + "grad_norm": 1.9137746095657349, + "learning_rate": 4.069405513191634e-06, + "loss": 0.3774, + "step": 11636 + }, + { + "epoch": 1.4161241253422574, + "grad_norm": 1.9250438213348389, + "learning_rate": 4.067835007859562e-06, + "loss": 0.3551, + "step": 11637 + }, + { + "epoch": 1.4162458168542744, + "grad_norm": 1.6549873352050781, + "learning_rate": 4.066264728258642e-06, + "loss": 0.3784, + "step": 11638 + }, + { + "epoch": 1.4163675083662914, + "grad_norm": 1.709864616394043, + "learning_rate": 4.064694674448636e-06, + "loss": 0.3809, + "step": 11639 + }, + { + "epoch": 1.4164891998783085, + "grad_norm": 1.252031922340393, + "learning_rate": 4.0631248464892825e-06, + "loss": 0.3522, + "step": 11640 + }, + { + "epoch": 1.4166108913903255, + "grad_norm": 1.765067458152771, + "learning_rate": 4.061555244440314e-06, + "loss": 0.3058, + "step": 11641 + }, + { + "epoch": 1.4167325829023425, + "grad_norm": 1.901914358139038, + "learning_rate": 4.059985868361464e-06, + "loss": 0.3765, + "step": 11642 + }, + { + "epoch": 1.4168542744143595, + "grad_norm": 1.3037997484207153, + "learning_rate": 4.058416718312447e-06, + "loss": 0.3065, + "step": 11643 + }, + { + "epoch": 1.4169759659263765, + "grad_norm": 2.255718946456909, + "learning_rate": 4.056847794352968e-06, + "loss": 0.3515, + "step": 11644 + }, + { + "epoch": 1.4170976574383936, + "grad_norm": 1.8065667152404785, + "learning_rate": 4.055279096542736e-06, + "loss": 0.3469, + "step": 11645 + }, + { + "epoch": 1.4172193489504106, + "grad_norm": 1.3460602760314941, + "learning_rate": 4.053710624941436e-06, + "loss": 0.3414, + "step": 11646 + }, + { + "epoch": 1.4173410404624278, + "grad_norm": 1.5945611000061035, + "learning_rate": 4.052142379608758e-06, + "loss": 0.385, + "step": 11647 + }, + { + "epoch": 1.4174627319744448, + "grad_norm": 1.628516435623169, + "learning_rate": 4.050574360604378e-06, + "loss": 0.3145, + "step": 11648 + }, + { + "epoch": 1.4175844234864619, + "grad_norm": 1.7932103872299194, + "learning_rate": 4.049006567987951e-06, + "loss": 0.3489, + "step": 11649 + }, + { + "epoch": 1.4177061149984789, + "grad_norm": 1.348899245262146, + "learning_rate": 4.047439001819144e-06, + "loss": 0.3242, + "step": 11650 + }, + { + "epoch": 1.417827806510496, + "grad_norm": 1.6273852586746216, + "learning_rate": 4.045871662157605e-06, + "loss": 0.3842, + "step": 11651 + }, + { + "epoch": 1.417949498022513, + "grad_norm": 1.4806421995162964, + "learning_rate": 4.044304549062972e-06, + "loss": 0.3879, + "step": 11652 + }, + { + "epoch": 1.41807118953453, + "grad_norm": 1.8799042701721191, + "learning_rate": 4.04273766259488e-06, + "loss": 0.3662, + "step": 11653 + }, + { + "epoch": 1.418192881046547, + "grad_norm": 1.725543737411499, + "learning_rate": 4.041171002812947e-06, + "loss": 0.3752, + "step": 11654 + }, + { + "epoch": 1.418314572558564, + "grad_norm": 1.7824726104736328, + "learning_rate": 4.039604569776796e-06, + "loss": 0.3588, + "step": 11655 + }, + { + "epoch": 1.4184362640705812, + "grad_norm": 1.758988380432129, + "learning_rate": 4.038038363546028e-06, + "loss": 0.3467, + "step": 11656 + }, + { + "epoch": 1.4185579555825982, + "grad_norm": 1.4466619491577148, + "learning_rate": 4.036472384180238e-06, + "loss": 0.3394, + "step": 11657 + }, + { + "epoch": 1.4186796470946152, + "grad_norm": 1.6314854621887207, + "learning_rate": 4.03490663173902e-06, + "loss": 0.3508, + "step": 11658 + }, + { + "epoch": 1.4188013386066323, + "grad_norm": 2.778224468231201, + "learning_rate": 4.033341106281956e-06, + "loss": 0.3291, + "step": 11659 + }, + { + "epoch": 1.4189230301186493, + "grad_norm": 1.6974302530288696, + "learning_rate": 4.031775807868606e-06, + "loss": 0.3611, + "step": 11660 + }, + { + "epoch": 1.4190447216306663, + "grad_norm": 2.893326759338379, + "learning_rate": 4.0302107365585445e-06, + "loss": 0.4098, + "step": 11661 + }, + { + "epoch": 1.4191664131426833, + "grad_norm": 1.8200770616531372, + "learning_rate": 4.028645892411316e-06, + "loss": 0.3686, + "step": 11662 + }, + { + "epoch": 1.4192881046547003, + "grad_norm": 2.087089776992798, + "learning_rate": 4.027081275486476e-06, + "loss": 0.3142, + "step": 11663 + }, + { + "epoch": 1.4194097961667174, + "grad_norm": 1.401525855064392, + "learning_rate": 4.025516885843557e-06, + "loss": 0.3441, + "step": 11664 + }, + { + "epoch": 1.4195314876787344, + "grad_norm": 1.5148062705993652, + "learning_rate": 4.023952723542083e-06, + "loss": 0.3847, + "step": 11665 + }, + { + "epoch": 1.4196531791907514, + "grad_norm": 1.7427053451538086, + "learning_rate": 4.02238878864158e-06, + "loss": 0.3287, + "step": 11666 + }, + { + "epoch": 1.4197748707027684, + "grad_norm": 1.761740803718567, + "learning_rate": 4.020825081201558e-06, + "loss": 0.4148, + "step": 11667 + }, + { + "epoch": 1.4198965622147854, + "grad_norm": 1.9368391036987305, + "learning_rate": 4.019261601281514e-06, + "loss": 0.2846, + "step": 11668 + }, + { + "epoch": 1.4200182537268025, + "grad_norm": 1.5413599014282227, + "learning_rate": 4.017698348940949e-06, + "loss": 0.3407, + "step": 11669 + }, + { + "epoch": 1.4201399452388195, + "grad_norm": 3.175100803375244, + "learning_rate": 4.016135324239344e-06, + "loss": 0.4225, + "step": 11670 + }, + { + "epoch": 1.4202616367508365, + "grad_norm": 2.09101939201355, + "learning_rate": 4.014572527236177e-06, + "loss": 0.4306, + "step": 11671 + }, + { + "epoch": 1.4203833282628537, + "grad_norm": 1.820353388786316, + "learning_rate": 4.013009957990913e-06, + "loss": 0.3679, + "step": 11672 + }, + { + "epoch": 1.4205050197748708, + "grad_norm": 2.646521806716919, + "learning_rate": 4.01144761656301e-06, + "loss": 0.2648, + "step": 11673 + }, + { + "epoch": 1.4206267112868878, + "grad_norm": 2.137427568435669, + "learning_rate": 4.0098855030119245e-06, + "loss": 0.3471, + "step": 11674 + }, + { + "epoch": 1.4207484027989048, + "grad_norm": 3.1136484146118164, + "learning_rate": 4.008323617397094e-06, + "loss": 0.3904, + "step": 11675 + }, + { + "epoch": 1.4208700943109218, + "grad_norm": 1.9761993885040283, + "learning_rate": 4.006761959777948e-06, + "loss": 0.3784, + "step": 11676 + }, + { + "epoch": 1.4209917858229388, + "grad_norm": 2.099623441696167, + "learning_rate": 4.005200530213919e-06, + "loss": 0.3342, + "step": 11677 + }, + { + "epoch": 1.4211134773349559, + "grad_norm": 1.583484172821045, + "learning_rate": 4.003639328764418e-06, + "loss": 0.3922, + "step": 11678 + }, + { + "epoch": 1.4212351688469729, + "grad_norm": 1.5651918649673462, + "learning_rate": 4.002078355488849e-06, + "loss": 0.4242, + "step": 11679 + }, + { + "epoch": 1.42135686035899, + "grad_norm": 3.1957945823669434, + "learning_rate": 4.000517610446617e-06, + "loss": 0.507, + "step": 11680 + }, + { + "epoch": 1.4214785518710071, + "grad_norm": 1.4976094961166382, + "learning_rate": 3.998957093697109e-06, + "loss": 0.3484, + "step": 11681 + }, + { + "epoch": 1.4216002433830242, + "grad_norm": 1.3932868242263794, + "learning_rate": 3.997396805299703e-06, + "loss": 0.3209, + "step": 11682 + }, + { + "epoch": 1.4217219348950412, + "grad_norm": 1.8951544761657715, + "learning_rate": 3.9958367453137754e-06, + "loss": 0.3818, + "step": 11683 + }, + { + "epoch": 1.4218436264070582, + "grad_norm": 2.4057655334472656, + "learning_rate": 3.994276913798684e-06, + "loss": 0.3066, + "step": 11684 + }, + { + "epoch": 1.4219653179190752, + "grad_norm": 1.3851104974746704, + "learning_rate": 3.99271731081379e-06, + "loss": 0.3387, + "step": 11685 + }, + { + "epoch": 1.4220870094310922, + "grad_norm": 1.9990328550338745, + "learning_rate": 3.991157936418436e-06, + "loss": 0.3664, + "step": 11686 + }, + { + "epoch": 1.4222087009431092, + "grad_norm": 2.7234108448028564, + "learning_rate": 3.989598790671957e-06, + "loss": 0.3085, + "step": 11687 + }, + { + "epoch": 1.4223303924551263, + "grad_norm": 1.5538443326950073, + "learning_rate": 3.988039873633689e-06, + "loss": 0.3131, + "step": 11688 + }, + { + "epoch": 1.4224520839671433, + "grad_norm": 1.3935569524765015, + "learning_rate": 3.986481185362947e-06, + "loss": 0.314, + "step": 11689 + }, + { + "epoch": 1.4225737754791603, + "grad_norm": 1.9987400770187378, + "learning_rate": 3.98492272591904e-06, + "loss": 0.3539, + "step": 11690 + }, + { + "epoch": 1.4226954669911773, + "grad_norm": 1.3666478395462036, + "learning_rate": 3.9833644953612785e-06, + "loss": 0.3419, + "step": 11691 + }, + { + "epoch": 1.4228171585031943, + "grad_norm": 4.389426231384277, + "learning_rate": 3.981806493748949e-06, + "loss": 0.4448, + "step": 11692 + }, + { + "epoch": 1.4229388500152114, + "grad_norm": 1.5672175884246826, + "learning_rate": 3.980248721141339e-06, + "loss": 0.3707, + "step": 11693 + }, + { + "epoch": 1.4230605415272284, + "grad_norm": 1.8768202066421509, + "learning_rate": 3.978691177597731e-06, + "loss": 0.389, + "step": 11694 + }, + { + "epoch": 1.4231822330392454, + "grad_norm": 1.4674445390701294, + "learning_rate": 3.977133863177381e-06, + "loss": 0.38, + "step": 11695 + }, + { + "epoch": 1.4233039245512624, + "grad_norm": 1.3890225887298584, + "learning_rate": 3.975576777939556e-06, + "loss": 0.3341, + "step": 11696 + }, + { + "epoch": 1.4234256160632797, + "grad_norm": 1.265735149383545, + "learning_rate": 3.974019921943505e-06, + "loss": 0.3712, + "step": 11697 + }, + { + "epoch": 1.4235473075752967, + "grad_norm": 2.511049509048462, + "learning_rate": 3.972463295248465e-06, + "loss": 0.3819, + "step": 11698 + }, + { + "epoch": 1.4236689990873137, + "grad_norm": 3.0046520233154297, + "learning_rate": 3.970906897913678e-06, + "loss": 0.4123, + "step": 11699 + }, + { + "epoch": 1.4237906905993307, + "grad_norm": 2.1112117767333984, + "learning_rate": 3.969350729998362e-06, + "loss": 0.3653, + "step": 11700 + }, + { + "epoch": 1.4239123821113477, + "grad_norm": 1.5638095140457153, + "learning_rate": 3.967794791561731e-06, + "loss": 0.4055, + "step": 11701 + }, + { + "epoch": 1.4240340736233648, + "grad_norm": 2.9030213356018066, + "learning_rate": 3.966239082662996e-06, + "loss": 0.3683, + "step": 11702 + }, + { + "epoch": 1.4241557651353818, + "grad_norm": 1.6132267713546753, + "learning_rate": 3.964683603361355e-06, + "loss": 0.4415, + "step": 11703 + }, + { + "epoch": 1.4242774566473988, + "grad_norm": 1.4999154806137085, + "learning_rate": 3.96312835371599e-06, + "loss": 0.3537, + "step": 11704 + }, + { + "epoch": 1.4243991481594158, + "grad_norm": 2.0627658367156982, + "learning_rate": 3.9615733337860915e-06, + "loss": 0.4057, + "step": 11705 + }, + { + "epoch": 1.4245208396714328, + "grad_norm": 2.203972339630127, + "learning_rate": 3.9600185436308255e-06, + "loss": 0.3672, + "step": 11706 + }, + { + "epoch": 1.42464253118345, + "grad_norm": 1.9013432264328003, + "learning_rate": 3.958463983309357e-06, + "loss": 0.3331, + "step": 11707 + }, + { + "epoch": 1.424764222695467, + "grad_norm": 1.4748945236206055, + "learning_rate": 3.956909652880839e-06, + "loss": 0.3533, + "step": 11708 + }, + { + "epoch": 1.4248859142074841, + "grad_norm": 1.6467561721801758, + "learning_rate": 3.955355552404414e-06, + "loss": 0.3832, + "step": 11709 + }, + { + "epoch": 1.4250076057195011, + "grad_norm": 3.121297597885132, + "learning_rate": 3.953801681939225e-06, + "loss": 0.3639, + "step": 11710 + }, + { + "epoch": 1.4251292972315182, + "grad_norm": 1.8061774969100952, + "learning_rate": 3.952248041544397e-06, + "loss": 0.3764, + "step": 11711 + }, + { + "epoch": 1.4252509887435352, + "grad_norm": 2.0275731086730957, + "learning_rate": 3.950694631279045e-06, + "loss": 0.329, + "step": 11712 + }, + { + "epoch": 1.4253726802555522, + "grad_norm": 1.9685959815979004, + "learning_rate": 3.949141451202288e-06, + "loss": 0.3271, + "step": 11713 + }, + { + "epoch": 1.4254943717675692, + "grad_norm": 2.367690324783325, + "learning_rate": 3.9475885013732195e-06, + "loss": 0.4217, + "step": 11714 + }, + { + "epoch": 1.4256160632795862, + "grad_norm": 1.5932812690734863, + "learning_rate": 3.9460357818509396e-06, + "loss": 0.3827, + "step": 11715 + }, + { + "epoch": 1.4257377547916033, + "grad_norm": 3.4910826683044434, + "learning_rate": 3.944483292694531e-06, + "loss": 0.4391, + "step": 11716 + }, + { + "epoch": 1.4258594463036203, + "grad_norm": 2.5507473945617676, + "learning_rate": 3.942931033963066e-06, + "loss": 0.4097, + "step": 11717 + }, + { + "epoch": 1.4259811378156373, + "grad_norm": 1.3516994714736938, + "learning_rate": 3.9413790057156124e-06, + "loss": 0.3461, + "step": 11718 + }, + { + "epoch": 1.4261028293276543, + "grad_norm": 1.527092695236206, + "learning_rate": 3.939827208011228e-06, + "loss": 0.3823, + "step": 11719 + }, + { + "epoch": 1.4262245208396713, + "grad_norm": 1.394440770149231, + "learning_rate": 3.938275640908959e-06, + "loss": 0.3598, + "step": 11720 + }, + { + "epoch": 1.4263462123516883, + "grad_norm": 1.4894014596939087, + "learning_rate": 3.936724304467853e-06, + "loss": 0.3684, + "step": 11721 + }, + { + "epoch": 1.4264679038637056, + "grad_norm": 1.5485066175460815, + "learning_rate": 3.9351731987469336e-06, + "loss": 0.37, + "step": 11722 + }, + { + "epoch": 1.4265895953757226, + "grad_norm": 1.475352168083191, + "learning_rate": 3.933622323805231e-06, + "loss": 0.3328, + "step": 11723 + }, + { + "epoch": 1.4267112868877396, + "grad_norm": 1.4506787061691284, + "learning_rate": 3.932071679701755e-06, + "loss": 0.3116, + "step": 11724 + }, + { + "epoch": 1.4268329783997566, + "grad_norm": 1.5375412702560425, + "learning_rate": 3.93052126649551e-06, + "loss": 0.3718, + "step": 11725 + }, + { + "epoch": 1.4269546699117737, + "grad_norm": 1.848252534866333, + "learning_rate": 3.928971084245495e-06, + "loss": 0.3976, + "step": 11726 + }, + { + "epoch": 1.4270763614237907, + "grad_norm": 4.220135688781738, + "learning_rate": 3.927421133010698e-06, + "loss": 0.408, + "step": 11727 + }, + { + "epoch": 1.4271980529358077, + "grad_norm": 1.7481589317321777, + "learning_rate": 3.925871412850093e-06, + "loss": 0.3421, + "step": 11728 + }, + { + "epoch": 1.4273197444478247, + "grad_norm": 1.9360086917877197, + "learning_rate": 3.92432192382266e-06, + "loss": 0.3238, + "step": 11729 + }, + { + "epoch": 1.4274414359598417, + "grad_norm": 1.5617867708206177, + "learning_rate": 3.922772665987347e-06, + "loss": 0.3866, + "step": 11730 + }, + { + "epoch": 1.4275631274718588, + "grad_norm": 1.4919909238815308, + "learning_rate": 3.921223639403117e-06, + "loss": 0.3566, + "step": 11731 + }, + { + "epoch": 1.427684818983876, + "grad_norm": 1.6973735094070435, + "learning_rate": 3.919674844128909e-06, + "loss": 0.3859, + "step": 11732 + }, + { + "epoch": 1.427806510495893, + "grad_norm": 1.8454428911209106, + "learning_rate": 3.918126280223657e-06, + "loss": 0.3984, + "step": 11733 + }, + { + "epoch": 1.42792820200791, + "grad_norm": 1.8268356323242188, + "learning_rate": 3.91657794774629e-06, + "loss": 0.2983, + "step": 11734 + }, + { + "epoch": 1.428049893519927, + "grad_norm": 3.7695889472961426, + "learning_rate": 3.915029846755726e-06, + "loss": 0.4432, + "step": 11735 + }, + { + "epoch": 1.428171585031944, + "grad_norm": 2.152716875076294, + "learning_rate": 3.913481977310867e-06, + "loss": 0.3962, + "step": 11736 + }, + { + "epoch": 1.428293276543961, + "grad_norm": 1.473909616470337, + "learning_rate": 3.91193433947062e-06, + "loss": 0.3062, + "step": 11737 + }, + { + "epoch": 1.4284149680559781, + "grad_norm": 2.0247116088867188, + "learning_rate": 3.9103869332938745e-06, + "loss": 0.4168, + "step": 11738 + }, + { + "epoch": 1.4285366595679951, + "grad_norm": 1.6107152700424194, + "learning_rate": 3.908839758839506e-06, + "loss": 0.3707, + "step": 11739 + }, + { + "epoch": 1.4286583510800122, + "grad_norm": 1.9907371997833252, + "learning_rate": 3.907292816166401e-06, + "loss": 0.3497, + "step": 11740 + }, + { + "epoch": 1.4287800425920292, + "grad_norm": 1.8290342092514038, + "learning_rate": 3.905746105333409e-06, + "loss": 0.3482, + "step": 11741 + }, + { + "epoch": 1.4289017341040462, + "grad_norm": 3.007143020629883, + "learning_rate": 3.904199626399395e-06, + "loss": 0.384, + "step": 11742 + }, + { + "epoch": 1.4290234256160632, + "grad_norm": 1.9857935905456543, + "learning_rate": 3.902653379423202e-06, + "loss": 0.3575, + "step": 11743 + }, + { + "epoch": 1.4291451171280802, + "grad_norm": 1.9340925216674805, + "learning_rate": 3.901107364463666e-06, + "loss": 0.4019, + "step": 11744 + }, + { + "epoch": 1.4292668086400973, + "grad_norm": 1.9911713600158691, + "learning_rate": 3.8995615815796214e-06, + "loss": 0.3237, + "step": 11745 + }, + { + "epoch": 1.4293885001521143, + "grad_norm": 1.326897382736206, + "learning_rate": 3.898016030829886e-06, + "loss": 0.3059, + "step": 11746 + }, + { + "epoch": 1.4295101916641313, + "grad_norm": 1.8565852642059326, + "learning_rate": 3.896470712273269e-06, + "loss": 0.3276, + "step": 11747 + }, + { + "epoch": 1.4296318831761485, + "grad_norm": 1.446336269378662, + "learning_rate": 3.894925625968578e-06, + "loss": 0.3664, + "step": 11748 + }, + { + "epoch": 1.4297535746881656, + "grad_norm": 3.1635358333587646, + "learning_rate": 3.893380771974603e-06, + "loss": 0.4464, + "step": 11749 + }, + { + "epoch": 1.4298752662001826, + "grad_norm": 1.8254159688949585, + "learning_rate": 3.891836150350127e-06, + "loss": 0.4186, + "step": 11750 + }, + { + "epoch": 1.4299969577121996, + "grad_norm": 1.7408673763275146, + "learning_rate": 3.89029176115393e-06, + "loss": 0.3052, + "step": 11751 + }, + { + "epoch": 1.4301186492242166, + "grad_norm": 2.184199571609497, + "learning_rate": 3.88874760444478e-06, + "loss": 0.4172, + "step": 11752 + }, + { + "epoch": 1.4302403407362336, + "grad_norm": 1.6625468730926514, + "learning_rate": 3.887203680281433e-06, + "loss": 0.3711, + "step": 11753 + }, + { + "epoch": 1.4303620322482506, + "grad_norm": 2.4049620628356934, + "learning_rate": 3.8856599887226386e-06, + "loss": 0.4471, + "step": 11754 + }, + { + "epoch": 1.4304837237602677, + "grad_norm": 1.7581387758255005, + "learning_rate": 3.884116529827134e-06, + "loss": 0.4297, + "step": 11755 + }, + { + "epoch": 1.4306054152722847, + "grad_norm": 1.495323896408081, + "learning_rate": 3.882573303653658e-06, + "loss": 0.3354, + "step": 11756 + }, + { + "epoch": 1.430727106784302, + "grad_norm": 2.040177345275879, + "learning_rate": 3.88103031026093e-06, + "loss": 0.3901, + "step": 11757 + }, + { + "epoch": 1.430848798296319, + "grad_norm": 1.8757530450820923, + "learning_rate": 3.8794875497076625e-06, + "loss": 0.368, + "step": 11758 + }, + { + "epoch": 1.430970489808336, + "grad_norm": 2.0033326148986816, + "learning_rate": 3.877945022052564e-06, + "loss": 0.3475, + "step": 11759 + }, + { + "epoch": 1.431092181320353, + "grad_norm": 1.5022252798080444, + "learning_rate": 3.87640272735433e-06, + "loss": 0.3681, + "step": 11760 + }, + { + "epoch": 1.43121387283237, + "grad_norm": 1.929874062538147, + "learning_rate": 3.874860665671644e-06, + "loss": 0.3533, + "step": 11761 + }, + { + "epoch": 1.431335564344387, + "grad_norm": 1.5449442863464355, + "learning_rate": 3.8733188370631915e-06, + "loss": 0.3603, + "step": 11762 + }, + { + "epoch": 1.431457255856404, + "grad_norm": 2.9099409580230713, + "learning_rate": 3.871777241587638e-06, + "loss": 0.4045, + "step": 11763 + }, + { + "epoch": 1.431578947368421, + "grad_norm": 2.839965581893921, + "learning_rate": 3.870235879303646e-06, + "loss": 0.4013, + "step": 11764 + }, + { + "epoch": 1.431700638880438, + "grad_norm": 1.417090892791748, + "learning_rate": 3.868694750269867e-06, + "loss": 0.3344, + "step": 11765 + }, + { + "epoch": 1.431822330392455, + "grad_norm": 1.6607081890106201, + "learning_rate": 3.867153854544939e-06, + "loss": 0.3903, + "step": 11766 + }, + { + "epoch": 1.4319440219044721, + "grad_norm": 1.6627159118652344, + "learning_rate": 3.865613192187506e-06, + "loss": 0.3605, + "step": 11767 + }, + { + "epoch": 1.4320657134164891, + "grad_norm": 1.7895677089691162, + "learning_rate": 3.864072763256188e-06, + "loss": 0.3654, + "step": 11768 + }, + { + "epoch": 1.4321874049285062, + "grad_norm": 1.3444929122924805, + "learning_rate": 3.862532567809599e-06, + "loss": 0.357, + "step": 11769 + }, + { + "epoch": 1.4323090964405232, + "grad_norm": 1.7331032752990723, + "learning_rate": 3.860992605906353e-06, + "loss": 0.3871, + "step": 11770 + }, + { + "epoch": 1.4324307879525402, + "grad_norm": 2.553884983062744, + "learning_rate": 3.859452877605044e-06, + "loss": 0.4075, + "step": 11771 + }, + { + "epoch": 1.4325524794645572, + "grad_norm": 2.045876979827881, + "learning_rate": 3.857913382964261e-06, + "loss": 0.3976, + "step": 11772 + }, + { + "epoch": 1.4326741709765745, + "grad_norm": 1.304618239402771, + "learning_rate": 3.856374122042591e-06, + "loss": 0.3049, + "step": 11773 + }, + { + "epoch": 1.4327958624885915, + "grad_norm": 1.9653626680374146, + "learning_rate": 3.854835094898598e-06, + "loss": 0.4501, + "step": 11774 + }, + { + "epoch": 1.4329175540006085, + "grad_norm": 1.49382483959198, + "learning_rate": 3.853296301590857e-06, + "loss": 0.335, + "step": 11775 + }, + { + "epoch": 1.4330392455126255, + "grad_norm": 1.4614157676696777, + "learning_rate": 3.851757742177912e-06, + "loss": 0.3579, + "step": 11776 + }, + { + "epoch": 1.4331609370246425, + "grad_norm": 2.047070264816284, + "learning_rate": 3.850219416718306e-06, + "loss": 0.3805, + "step": 11777 + }, + { + "epoch": 1.4332826285366596, + "grad_norm": 1.5494405031204224, + "learning_rate": 3.848681325270585e-06, + "loss": 0.34, + "step": 11778 + }, + { + "epoch": 1.4334043200486766, + "grad_norm": 1.595807433128357, + "learning_rate": 3.847143467893271e-06, + "loss": 0.3889, + "step": 11779 + }, + { + "epoch": 1.4335260115606936, + "grad_norm": 1.5325510501861572, + "learning_rate": 3.845605844644881e-06, + "loss": 0.3825, + "step": 11780 + }, + { + "epoch": 1.4336477030727106, + "grad_norm": 1.9219975471496582, + "learning_rate": 3.844068455583932e-06, + "loss": 0.3713, + "step": 11781 + }, + { + "epoch": 1.4337693945847279, + "grad_norm": 2.2359395027160645, + "learning_rate": 3.842531300768915e-06, + "loss": 0.3794, + "step": 11782 + }, + { + "epoch": 1.4338910860967449, + "grad_norm": 2.6751513481140137, + "learning_rate": 3.840994380258332e-06, + "loss": 0.325, + "step": 11783 + }, + { + "epoch": 1.434012777608762, + "grad_norm": 1.601576328277588, + "learning_rate": 3.83945769411066e-06, + "loss": 0.3825, + "step": 11784 + }, + { + "epoch": 1.434134469120779, + "grad_norm": 1.6968153715133667, + "learning_rate": 3.837921242384372e-06, + "loss": 0.3246, + "step": 11785 + }, + { + "epoch": 1.434256160632796, + "grad_norm": 2.769641160964966, + "learning_rate": 3.836385025137942e-06, + "loss": 0.3494, + "step": 11786 + }, + { + "epoch": 1.434377852144813, + "grad_norm": 1.795230507850647, + "learning_rate": 3.834849042429817e-06, + "loss": 0.3866, + "step": 11787 + }, + { + "epoch": 1.43449954365683, + "grad_norm": 2.0857882499694824, + "learning_rate": 3.833313294318442e-06, + "loss": 0.4158, + "step": 11788 + }, + { + "epoch": 1.434621235168847, + "grad_norm": 2.680990695953369, + "learning_rate": 3.831777780862266e-06, + "loss": 0.3682, + "step": 11789 + }, + { + "epoch": 1.434742926680864, + "grad_norm": 1.160776138305664, + "learning_rate": 3.830242502119709e-06, + "loss": 0.3289, + "step": 11790 + }, + { + "epoch": 1.434864618192881, + "grad_norm": 1.482572078704834, + "learning_rate": 3.828707458149198e-06, + "loss": 0.3864, + "step": 11791 + }, + { + "epoch": 1.434986309704898, + "grad_norm": 1.968894362449646, + "learning_rate": 3.827172649009142e-06, + "loss": 0.3856, + "step": 11792 + }, + { + "epoch": 1.435108001216915, + "grad_norm": 2.602078676223755, + "learning_rate": 3.825638074757941e-06, + "loss": 0.4144, + "step": 11793 + }, + { + "epoch": 1.435229692728932, + "grad_norm": 2.042684316635132, + "learning_rate": 3.824103735453995e-06, + "loss": 0.3899, + "step": 11794 + }, + { + "epoch": 1.435351384240949, + "grad_norm": 1.7161979675292969, + "learning_rate": 3.822569631155686e-06, + "loss": 0.3642, + "step": 11795 + }, + { + "epoch": 1.4354730757529661, + "grad_norm": 1.9018559455871582, + "learning_rate": 3.821035761921385e-06, + "loss": 0.3232, + "step": 11796 + }, + { + "epoch": 1.4355947672649831, + "grad_norm": 1.9780354499816895, + "learning_rate": 3.8195021278094656e-06, + "loss": 0.3364, + "step": 11797 + }, + { + "epoch": 1.4357164587770004, + "grad_norm": 1.8928637504577637, + "learning_rate": 3.817968728878284e-06, + "loss": 0.3205, + "step": 11798 + }, + { + "epoch": 1.4358381502890174, + "grad_norm": 1.4485167264938354, + "learning_rate": 3.816435565186189e-06, + "loss": 0.359, + "step": 11799 + }, + { + "epoch": 1.4359598418010344, + "grad_norm": 1.6143624782562256, + "learning_rate": 3.814902636791519e-06, + "loss": 0.3548, + "step": 11800 + }, + { + "epoch": 1.4360815333130514, + "grad_norm": 1.4651527404785156, + "learning_rate": 3.8133699437526016e-06, + "loss": 0.2954, + "step": 11801 + }, + { + "epoch": 1.4362032248250685, + "grad_norm": 1.8096466064453125, + "learning_rate": 3.8118374861277685e-06, + "loss": 0.3804, + "step": 11802 + }, + { + "epoch": 1.4363249163370855, + "grad_norm": 3.1749520301818848, + "learning_rate": 3.810305263975328e-06, + "loss": 0.4436, + "step": 11803 + }, + { + "epoch": 1.4364466078491025, + "grad_norm": 1.4107646942138672, + "learning_rate": 3.80877327735358e-06, + "loss": 0.3028, + "step": 11804 + }, + { + "epoch": 1.4365682993611195, + "grad_norm": 1.6116647720336914, + "learning_rate": 3.8072415263208275e-06, + "loss": 0.3476, + "step": 11805 + }, + { + "epoch": 1.4366899908731365, + "grad_norm": 1.5863817930221558, + "learning_rate": 3.805710010935354e-06, + "loss": 0.3669, + "step": 11806 + }, + { + "epoch": 1.4368116823851536, + "grad_norm": 2.238461494445801, + "learning_rate": 3.8041787312554312e-06, + "loss": 0.3938, + "step": 11807 + }, + { + "epoch": 1.4369333738971708, + "grad_norm": 1.3938432931900024, + "learning_rate": 3.802647687339337e-06, + "loss": 0.3412, + "step": 11808 + }, + { + "epoch": 1.4370550654091878, + "grad_norm": 2.802577495574951, + "learning_rate": 3.8011168792453256e-06, + "loss": 0.2997, + "step": 11809 + }, + { + "epoch": 1.4371767569212048, + "grad_norm": 2.1231021881103516, + "learning_rate": 3.7995863070316473e-06, + "loss": 0.4151, + "step": 11810 + }, + { + "epoch": 1.4372984484332219, + "grad_norm": 1.5988826751708984, + "learning_rate": 3.7980559707565455e-06, + "loss": 0.4082, + "step": 11811 + }, + { + "epoch": 1.4374201399452389, + "grad_norm": 2.132887840270996, + "learning_rate": 3.796525870478247e-06, + "loss": 0.3783, + "step": 11812 + }, + { + "epoch": 1.437541831457256, + "grad_norm": 1.3722410202026367, + "learning_rate": 3.794996006254984e-06, + "loss": 0.3728, + "step": 11813 + }, + { + "epoch": 1.437663522969273, + "grad_norm": 1.8361307382583618, + "learning_rate": 3.7934663781449654e-06, + "loss": 0.3767, + "step": 11814 + }, + { + "epoch": 1.43778521448129, + "grad_norm": 1.7051621675491333, + "learning_rate": 3.791936986206395e-06, + "loss": 0.3715, + "step": 11815 + }, + { + "epoch": 1.437906905993307, + "grad_norm": 2.7762765884399414, + "learning_rate": 3.7904078304974756e-06, + "loss": 0.4432, + "step": 11816 + }, + { + "epoch": 1.438028597505324, + "grad_norm": 1.4747251272201538, + "learning_rate": 3.7888789110763915e-06, + "loss": 0.3014, + "step": 11817 + }, + { + "epoch": 1.438150289017341, + "grad_norm": 1.429440975189209, + "learning_rate": 3.7873502280013174e-06, + "loss": 0.3243, + "step": 11818 + }, + { + "epoch": 1.438271980529358, + "grad_norm": 2.280829906463623, + "learning_rate": 3.785821781330431e-06, + "loss": 0.4309, + "step": 11819 + }, + { + "epoch": 1.438393672041375, + "grad_norm": 1.8403630256652832, + "learning_rate": 3.7842935711218876e-06, + "loss": 0.4118, + "step": 11820 + }, + { + "epoch": 1.438515363553392, + "grad_norm": 2.2532782554626465, + "learning_rate": 3.7827655974338397e-06, + "loss": 0.3616, + "step": 11821 + }, + { + "epoch": 1.438637055065409, + "grad_norm": 1.6302249431610107, + "learning_rate": 3.7812378603244306e-06, + "loss": 0.3878, + "step": 11822 + }, + { + "epoch": 1.4387587465774263, + "grad_norm": 2.0840559005737305, + "learning_rate": 3.779710359851788e-06, + "loss": 0.3749, + "step": 11823 + }, + { + "epoch": 1.4388804380894433, + "grad_norm": 2.1594605445861816, + "learning_rate": 3.778183096074046e-06, + "loss": 0.3359, + "step": 11824 + }, + { + "epoch": 1.4390021296014603, + "grad_norm": 3.089811086654663, + "learning_rate": 3.776656069049316e-06, + "loss": 0.4711, + "step": 11825 + }, + { + "epoch": 1.4391238211134774, + "grad_norm": 1.4995942115783691, + "learning_rate": 3.7751292788357007e-06, + "loss": 0.3762, + "step": 11826 + }, + { + "epoch": 1.4392455126254944, + "grad_norm": 1.6174546480178833, + "learning_rate": 3.773602725491303e-06, + "loss": 0.3667, + "step": 11827 + }, + { + "epoch": 1.4393672041375114, + "grad_norm": 2.0613558292388916, + "learning_rate": 3.7720764090742112e-06, + "loss": 0.3034, + "step": 11828 + }, + { + "epoch": 1.4394888956495284, + "grad_norm": 3.3389575481414795, + "learning_rate": 3.7705503296424996e-06, + "loss": 0.4531, + "step": 11829 + }, + { + "epoch": 1.4396105871615454, + "grad_norm": 1.471073865890503, + "learning_rate": 3.769024487254245e-06, + "loss": 0.4015, + "step": 11830 + }, + { + "epoch": 1.4397322786735625, + "grad_norm": 2.452979803085327, + "learning_rate": 3.7674988819675064e-06, + "loss": 0.4444, + "step": 11831 + }, + { + "epoch": 1.4398539701855795, + "grad_norm": 1.6104811429977417, + "learning_rate": 3.7659735138403354e-06, + "loss": 0.3864, + "step": 11832 + }, + { + "epoch": 1.4399756616975967, + "grad_norm": 1.776667833328247, + "learning_rate": 3.7644483829307764e-06, + "loss": 0.3966, + "step": 11833 + }, + { + "epoch": 1.4400973532096137, + "grad_norm": 2.803340435028076, + "learning_rate": 3.76292348929686e-06, + "loss": 0.4458, + "step": 11834 + }, + { + "epoch": 1.4402190447216308, + "grad_norm": 1.9894449710845947, + "learning_rate": 3.761398832996618e-06, + "loss": 0.334, + "step": 11835 + }, + { + "epoch": 1.4403407362336478, + "grad_norm": 1.810025691986084, + "learning_rate": 3.759874414088064e-06, + "loss": 0.4043, + "step": 11836 + }, + { + "epoch": 1.4404624277456648, + "grad_norm": 1.4075738191604614, + "learning_rate": 3.758350232629201e-06, + "loss": 0.4105, + "step": 11837 + }, + { + "epoch": 1.4405841192576818, + "grad_norm": 1.4784384965896606, + "learning_rate": 3.756826288678035e-06, + "loss": 0.4071, + "step": 11838 + }, + { + "epoch": 1.4407058107696988, + "grad_norm": 2.2317841053009033, + "learning_rate": 3.7553025822925505e-06, + "loss": 0.4204, + "step": 11839 + }, + { + "epoch": 1.4408275022817159, + "grad_norm": 1.469972848892212, + "learning_rate": 3.7537791135307258e-06, + "loss": 0.324, + "step": 11840 + }, + { + "epoch": 1.4409491937937329, + "grad_norm": 1.5468652248382568, + "learning_rate": 3.7522558824505384e-06, + "loss": 0.3675, + "step": 11841 + }, + { + "epoch": 1.44107088530575, + "grad_norm": 1.3524500131607056, + "learning_rate": 3.7507328891099425e-06, + "loss": 0.3084, + "step": 11842 + }, + { + "epoch": 1.441192576817767, + "grad_norm": 3.782473087310791, + "learning_rate": 3.7492101335668995e-06, + "loss": 0.3221, + "step": 11843 + }, + { + "epoch": 1.441314268329784, + "grad_norm": 2.458317518234253, + "learning_rate": 3.747687615879353e-06, + "loss": 0.2772, + "step": 11844 + }, + { + "epoch": 1.441435959841801, + "grad_norm": 1.239877700805664, + "learning_rate": 3.7461653361052276e-06, + "loss": 0.3302, + "step": 11845 + }, + { + "epoch": 1.441557651353818, + "grad_norm": 3.6148478984832764, + "learning_rate": 3.744643294302459e-06, + "loss": 0.4576, + "step": 11846 + }, + { + "epoch": 1.441679342865835, + "grad_norm": 3.3129348754882812, + "learning_rate": 3.7431214905289602e-06, + "loss": 0.4444, + "step": 11847 + }, + { + "epoch": 1.441801034377852, + "grad_norm": 1.6403416395187378, + "learning_rate": 3.7415999248426373e-06, + "loss": 0.3334, + "step": 11848 + }, + { + "epoch": 1.4419227258898693, + "grad_norm": 1.4138256311416626, + "learning_rate": 3.7400785973013944e-06, + "loss": 0.336, + "step": 11849 + }, + { + "epoch": 1.4420444174018863, + "grad_norm": 4.209815502166748, + "learning_rate": 3.738557507963115e-06, + "loss": 0.4022, + "step": 11850 + }, + { + "epoch": 1.4421661089139033, + "grad_norm": 2.5665271282196045, + "learning_rate": 3.7370366568856864e-06, + "loss": 0.3204, + "step": 11851 + }, + { + "epoch": 1.4422878004259203, + "grad_norm": 2.005777597427368, + "learning_rate": 3.735516044126977e-06, + "loss": 0.3902, + "step": 11852 + }, + { + "epoch": 1.4424094919379373, + "grad_norm": 1.5276718139648438, + "learning_rate": 3.7339956697448444e-06, + "loss": 0.3977, + "step": 11853 + }, + { + "epoch": 1.4425311834499543, + "grad_norm": 1.3939039707183838, + "learning_rate": 3.73247553379715e-06, + "loss": 0.4163, + "step": 11854 + }, + { + "epoch": 1.4426528749619714, + "grad_norm": 3.7330803871154785, + "learning_rate": 3.7309556363417388e-06, + "loss": 0.4434, + "step": 11855 + }, + { + "epoch": 1.4427745664739884, + "grad_norm": 2.1777427196502686, + "learning_rate": 3.7294359774364343e-06, + "loss": 0.3846, + "step": 11856 + }, + { + "epoch": 1.4428962579860054, + "grad_norm": 2.101963758468628, + "learning_rate": 3.727916557139074e-06, + "loss": 0.3819, + "step": 11857 + }, + { + "epoch": 1.4430179494980226, + "grad_norm": 1.8848042488098145, + "learning_rate": 3.726397375507468e-06, + "loss": 0.3851, + "step": 11858 + }, + { + "epoch": 1.4431396410100397, + "grad_norm": 1.75052011013031, + "learning_rate": 3.72487843259943e-06, + "loss": 0.3621, + "step": 11859 + }, + { + "epoch": 1.4432613325220567, + "grad_norm": 2.2883315086364746, + "learning_rate": 3.7233597284727573e-06, + "loss": 0.3541, + "step": 11860 + }, + { + "epoch": 1.4433830240340737, + "grad_norm": 1.6344425678253174, + "learning_rate": 3.721841263185235e-06, + "loss": 0.3794, + "step": 11861 + }, + { + "epoch": 1.4435047155460907, + "grad_norm": 1.5160001516342163, + "learning_rate": 3.720323036794652e-06, + "loss": 0.3713, + "step": 11862 + }, + { + "epoch": 1.4436264070581077, + "grad_norm": 1.483747124671936, + "learning_rate": 3.7188050493587746e-06, + "loss": 0.3758, + "step": 11863 + }, + { + "epoch": 1.4437480985701248, + "grad_norm": 1.8093531131744385, + "learning_rate": 3.7172873009353637e-06, + "loss": 0.3687, + "step": 11864 + }, + { + "epoch": 1.4438697900821418, + "grad_norm": 2.2343521118164062, + "learning_rate": 3.7157697915821787e-06, + "loss": 0.3168, + "step": 11865 + }, + { + "epoch": 1.4439914815941588, + "grad_norm": 3.938396453857422, + "learning_rate": 3.7142525213569615e-06, + "loss": 0.3098, + "step": 11866 + }, + { + "epoch": 1.4441131731061758, + "grad_norm": 1.385934591293335, + "learning_rate": 3.712735490317445e-06, + "loss": 0.3536, + "step": 11867 + }, + { + "epoch": 1.4442348646181928, + "grad_norm": 2.027087688446045, + "learning_rate": 3.711218698521358e-06, + "loss": 0.3843, + "step": 11868 + }, + { + "epoch": 1.4443565561302099, + "grad_norm": 2.3759968280792236, + "learning_rate": 3.709702146026413e-06, + "loss": 0.3281, + "step": 11869 + }, + { + "epoch": 1.4444782476422269, + "grad_norm": 1.6625268459320068, + "learning_rate": 3.7081858328903254e-06, + "loss": 0.3773, + "step": 11870 + }, + { + "epoch": 1.444599939154244, + "grad_norm": 3.1017301082611084, + "learning_rate": 3.70666975917079e-06, + "loss": 0.309, + "step": 11871 + }, + { + "epoch": 1.444721630666261, + "grad_norm": 1.438114047050476, + "learning_rate": 3.705153924925493e-06, + "loss": 0.3391, + "step": 11872 + }, + { + "epoch": 1.444843322178278, + "grad_norm": 1.330441951751709, + "learning_rate": 3.703638330212123e-06, + "loss": 0.3506, + "step": 11873 + }, + { + "epoch": 1.4449650136902952, + "grad_norm": 1.717523217201233, + "learning_rate": 3.7021229750883456e-06, + "loss": 0.3191, + "step": 11874 + }, + { + "epoch": 1.4450867052023122, + "grad_norm": 3.1649675369262695, + "learning_rate": 3.700607859611822e-06, + "loss": 0.4068, + "step": 11875 + }, + { + "epoch": 1.4452083967143292, + "grad_norm": 2.1028671264648438, + "learning_rate": 3.6990929838402134e-06, + "loss": 0.3753, + "step": 11876 + }, + { + "epoch": 1.4453300882263462, + "grad_norm": 1.4174188375473022, + "learning_rate": 3.6975783478311577e-06, + "loss": 0.3268, + "step": 11877 + }, + { + "epoch": 1.4454517797383633, + "grad_norm": 1.5427979230880737, + "learning_rate": 3.6960639516422914e-06, + "loss": 0.3437, + "step": 11878 + }, + { + "epoch": 1.4455734712503803, + "grad_norm": 4.0022969245910645, + "learning_rate": 3.6945497953312393e-06, + "loss": 0.4219, + "step": 11879 + }, + { + "epoch": 1.4456951627623973, + "grad_norm": 4.140995025634766, + "learning_rate": 3.6930358789556166e-06, + "loss": 0.4492, + "step": 11880 + }, + { + "epoch": 1.4458168542744143, + "grad_norm": 1.6450996398925781, + "learning_rate": 3.691522202573037e-06, + "loss": 0.3716, + "step": 11881 + }, + { + "epoch": 1.4459385457864313, + "grad_norm": 2.075960874557495, + "learning_rate": 3.6900087662410944e-06, + "loss": 0.378, + "step": 11882 + }, + { + "epoch": 1.4460602372984486, + "grad_norm": 1.8620332479476929, + "learning_rate": 3.688495570017376e-06, + "loss": 0.3823, + "step": 11883 + }, + { + "epoch": 1.4461819288104656, + "grad_norm": 1.837510585784912, + "learning_rate": 3.68698261395947e-06, + "loss": 0.3609, + "step": 11884 + }, + { + "epoch": 1.4463036203224826, + "grad_norm": 1.5520201921463013, + "learning_rate": 3.6854698981249416e-06, + "loss": 0.3907, + "step": 11885 + }, + { + "epoch": 1.4464253118344996, + "grad_norm": 1.6463804244995117, + "learning_rate": 3.6839574225713505e-06, + "loss": 0.3894, + "step": 11886 + }, + { + "epoch": 1.4465470033465166, + "grad_norm": 2.3525595664978027, + "learning_rate": 3.682445187356257e-06, + "loss": 0.4395, + "step": 11887 + }, + { + "epoch": 1.4466686948585337, + "grad_norm": 2.8774092197418213, + "learning_rate": 3.6809331925372005e-06, + "loss": 0.3538, + "step": 11888 + }, + { + "epoch": 1.4467903863705507, + "grad_norm": 1.9676817655563354, + "learning_rate": 3.679421438171712e-06, + "loss": 0.366, + "step": 11889 + }, + { + "epoch": 1.4469120778825677, + "grad_norm": 1.7366608381271362, + "learning_rate": 3.6779099243173277e-06, + "loss": 0.3624, + "step": 11890 + }, + { + "epoch": 1.4470337693945847, + "grad_norm": 1.9219353199005127, + "learning_rate": 3.6763986510315506e-06, + "loss": 0.371, + "step": 11891 + }, + { + "epoch": 1.4471554609066017, + "grad_norm": 3.129209041595459, + "learning_rate": 3.674887618371896e-06, + "loss": 0.318, + "step": 11892 + }, + { + "epoch": 1.4472771524186188, + "grad_norm": 2.042206048965454, + "learning_rate": 3.6733768263958615e-06, + "loss": 0.4412, + "step": 11893 + }, + { + "epoch": 1.4473988439306358, + "grad_norm": 2.022676706314087, + "learning_rate": 3.671866275160929e-06, + "loss": 0.3759, + "step": 11894 + }, + { + "epoch": 1.4475205354426528, + "grad_norm": 2.3494081497192383, + "learning_rate": 3.670355964724588e-06, + "loss": 0.3779, + "step": 11895 + }, + { + "epoch": 1.4476422269546698, + "grad_norm": 2.8019931316375732, + "learning_rate": 3.668845895144304e-06, + "loss": 0.3019, + "step": 11896 + }, + { + "epoch": 1.4477639184666868, + "grad_norm": 2.2933526039123535, + "learning_rate": 3.6673360664775337e-06, + "loss": 0.4281, + "step": 11897 + }, + { + "epoch": 1.4478856099787039, + "grad_norm": 1.5204875469207764, + "learning_rate": 3.665826478781739e-06, + "loss": 0.3532, + "step": 11898 + }, + { + "epoch": 1.448007301490721, + "grad_norm": 1.6592719554901123, + "learning_rate": 3.6643171321143568e-06, + "loss": 0.383, + "step": 11899 + }, + { + "epoch": 1.4481289930027381, + "grad_norm": 1.5485148429870605, + "learning_rate": 3.6628080265328193e-06, + "loss": 0.4048, + "step": 11900 + }, + { + "epoch": 1.4482506845147551, + "grad_norm": 1.8762445449829102, + "learning_rate": 3.661299162094559e-06, + "loss": 0.4269, + "step": 11901 + }, + { + "epoch": 1.4483723760267722, + "grad_norm": 1.6729037761688232, + "learning_rate": 3.659790538856981e-06, + "loss": 0.3873, + "step": 11902 + }, + { + "epoch": 1.4484940675387892, + "grad_norm": 1.9526728391647339, + "learning_rate": 3.6582821568774997e-06, + "loss": 0.3149, + "step": 11903 + }, + { + "epoch": 1.4486157590508062, + "grad_norm": 2.0222623348236084, + "learning_rate": 3.656774016213508e-06, + "loss": 0.3841, + "step": 11904 + }, + { + "epoch": 1.4487374505628232, + "grad_norm": 2.0432982444763184, + "learning_rate": 3.6552661169223912e-06, + "loss": 0.3788, + "step": 11905 + }, + { + "epoch": 1.4488591420748402, + "grad_norm": 1.6651802062988281, + "learning_rate": 3.653758459061536e-06, + "loss": 0.3382, + "step": 11906 + }, + { + "epoch": 1.4489808335868573, + "grad_norm": 1.8043780326843262, + "learning_rate": 3.6522510426883064e-06, + "loss": 0.4021, + "step": 11907 + }, + { + "epoch": 1.4491025250988743, + "grad_norm": 1.5165379047393799, + "learning_rate": 3.6507438678600604e-06, + "loss": 0.3457, + "step": 11908 + }, + { + "epoch": 1.4492242166108915, + "grad_norm": 1.8150503635406494, + "learning_rate": 3.649236934634156e-06, + "loss": 0.4514, + "step": 11909 + }, + { + "epoch": 1.4493459081229085, + "grad_norm": 3.865121841430664, + "learning_rate": 3.6477302430679294e-06, + "loss": 0.4431, + "step": 11910 + }, + { + "epoch": 1.4494675996349256, + "grad_norm": 3.333472967147827, + "learning_rate": 3.646223793218717e-06, + "loss": 0.4465, + "step": 11911 + }, + { + "epoch": 1.4495892911469426, + "grad_norm": 2.021068572998047, + "learning_rate": 3.644717585143842e-06, + "loss": 0.3345, + "step": 11912 + }, + { + "epoch": 1.4497109826589596, + "grad_norm": 2.026867628097534, + "learning_rate": 3.6432116189006185e-06, + "loss": 0.3145, + "step": 11913 + }, + { + "epoch": 1.4498326741709766, + "grad_norm": 1.637890100479126, + "learning_rate": 3.641705894546349e-06, + "loss": 0.3591, + "step": 11914 + }, + { + "epoch": 1.4499543656829936, + "grad_norm": 1.6168177127838135, + "learning_rate": 3.6402004121383316e-06, + "loss": 0.3411, + "step": 11915 + }, + { + "epoch": 1.4500760571950106, + "grad_norm": 1.5202151536941528, + "learning_rate": 3.6386951717338494e-06, + "loss": 0.3333, + "step": 11916 + }, + { + "epoch": 1.4501977487070277, + "grad_norm": 2.092015266418457, + "learning_rate": 3.637190173390187e-06, + "loss": 0.3933, + "step": 11917 + }, + { + "epoch": 1.4503194402190447, + "grad_norm": 1.5029441118240356, + "learning_rate": 3.635685417164605e-06, + "loss": 0.3518, + "step": 11918 + }, + { + "epoch": 1.4504411317310617, + "grad_norm": 1.5704599618911743, + "learning_rate": 3.6341809031143695e-06, + "loss": 0.3498, + "step": 11919 + }, + { + "epoch": 1.4505628232430787, + "grad_norm": 1.5989277362823486, + "learning_rate": 3.6326766312967267e-06, + "loss": 0.3436, + "step": 11920 + }, + { + "epoch": 1.4506845147550957, + "grad_norm": 1.2575188875198364, + "learning_rate": 3.6311726017689154e-06, + "loss": 0.3572, + "step": 11921 + }, + { + "epoch": 1.4508062062671128, + "grad_norm": 3.50934100151062, + "learning_rate": 3.629668814588172e-06, + "loss": 0.4281, + "step": 11922 + }, + { + "epoch": 1.4509278977791298, + "grad_norm": 1.7198940515518188, + "learning_rate": 3.6281652698117163e-06, + "loss": 0.4014, + "step": 11923 + }, + { + "epoch": 1.451049589291147, + "grad_norm": 1.6251307725906372, + "learning_rate": 3.626661967496761e-06, + "loss": 0.3663, + "step": 11924 + }, + { + "epoch": 1.451171280803164, + "grad_norm": 1.9700456857681274, + "learning_rate": 3.62515890770051e-06, + "loss": 0.4203, + "step": 11925 + }, + { + "epoch": 1.451292972315181, + "grad_norm": 2.024721622467041, + "learning_rate": 3.6236560904801545e-06, + "loss": 0.3733, + "step": 11926 + }, + { + "epoch": 1.451414663827198, + "grad_norm": 2.203974723815918, + "learning_rate": 3.6221535158928857e-06, + "loss": 0.4132, + "step": 11927 + }, + { + "epoch": 1.451536355339215, + "grad_norm": 1.67996084690094, + "learning_rate": 3.620651183995878e-06, + "loss": 0.3888, + "step": 11928 + }, + { + "epoch": 1.4516580468512321, + "grad_norm": 3.2756850719451904, + "learning_rate": 3.6191490948462937e-06, + "loss": 0.3353, + "step": 11929 + }, + { + "epoch": 1.4517797383632491, + "grad_norm": 1.6566343307495117, + "learning_rate": 3.617647248501297e-06, + "loss": 0.3667, + "step": 11930 + }, + { + "epoch": 1.4519014298752662, + "grad_norm": 1.4988598823547363, + "learning_rate": 3.616145645018033e-06, + "loss": 0.365, + "step": 11931 + }, + { + "epoch": 1.4520231213872832, + "grad_norm": 1.937064290046692, + "learning_rate": 3.614644284453639e-06, + "loss": 0.391, + "step": 11932 + }, + { + "epoch": 1.4521448128993002, + "grad_norm": 1.973625659942627, + "learning_rate": 3.6131431668652482e-06, + "loss": 0.374, + "step": 11933 + }, + { + "epoch": 1.4522665044113174, + "grad_norm": 1.3987855911254883, + "learning_rate": 3.6116422923099814e-06, + "loss": 0.3773, + "step": 11934 + }, + { + "epoch": 1.4523881959233345, + "grad_norm": 2.3750061988830566, + "learning_rate": 3.6101416608449457e-06, + "loss": 0.4469, + "step": 11935 + }, + { + "epoch": 1.4525098874353515, + "grad_norm": 1.690173864364624, + "learning_rate": 3.608641272527251e-06, + "loss": 0.4323, + "step": 11936 + }, + { + "epoch": 1.4526315789473685, + "grad_norm": 3.122556447982788, + "learning_rate": 3.60714112741398e-06, + "loss": 0.3131, + "step": 11937 + }, + { + "epoch": 1.4527532704593855, + "grad_norm": 1.7985584735870361, + "learning_rate": 3.605641225562225e-06, + "loss": 0.3807, + "step": 11938 + }, + { + "epoch": 1.4528749619714025, + "grad_norm": 2.649031639099121, + "learning_rate": 3.604141567029057e-06, + "loss": 0.3515, + "step": 11939 + }, + { + "epoch": 1.4529966534834196, + "grad_norm": 3.458806037902832, + "learning_rate": 3.602642151871537e-06, + "loss": 0.4738, + "step": 11940 + }, + { + "epoch": 1.4531183449954366, + "grad_norm": 1.8226171731948853, + "learning_rate": 3.601142980146728e-06, + "loss": 0.344, + "step": 11941 + }, + { + "epoch": 1.4532400365074536, + "grad_norm": 1.8936265707015991, + "learning_rate": 3.599644051911674e-06, + "loss": 0.3878, + "step": 11942 + }, + { + "epoch": 1.4533617280194706, + "grad_norm": 1.7052464485168457, + "learning_rate": 3.598145367223409e-06, + "loss": 0.3156, + "step": 11943 + }, + { + "epoch": 1.4534834195314876, + "grad_norm": 1.5494877099990845, + "learning_rate": 3.596646926138967e-06, + "loss": 0.3175, + "step": 11944 + }, + { + "epoch": 1.4536051110435046, + "grad_norm": 1.7864489555358887, + "learning_rate": 3.595148728715363e-06, + "loss": 0.3378, + "step": 11945 + }, + { + "epoch": 1.4537268025555217, + "grad_norm": 1.6891885995864868, + "learning_rate": 3.593650775009604e-06, + "loss": 0.3571, + "step": 11946 + }, + { + "epoch": 1.4538484940675387, + "grad_norm": 1.2989020347595215, + "learning_rate": 3.5921530650787008e-06, + "loss": 0.332, + "step": 11947 + }, + { + "epoch": 1.4539701855795557, + "grad_norm": 2.4695024490356445, + "learning_rate": 3.59065559897963e-06, + "loss": 0.4224, + "step": 11948 + }, + { + "epoch": 1.4540918770915727, + "grad_norm": 2.111696243286133, + "learning_rate": 3.5891583767693825e-06, + "loss": 0.4453, + "step": 11949 + }, + { + "epoch": 1.45421356860359, + "grad_norm": 1.526094913482666, + "learning_rate": 3.5876613985049302e-06, + "loss": 0.3938, + "step": 11950 + }, + { + "epoch": 1.454335260115607, + "grad_norm": 1.7118867635726929, + "learning_rate": 3.5861646642432304e-06, + "loss": 0.413, + "step": 11951 + }, + { + "epoch": 1.454456951627624, + "grad_norm": 1.8664582967758179, + "learning_rate": 3.5846681740412447e-06, + "loss": 0.4039, + "step": 11952 + }, + { + "epoch": 1.454578643139641, + "grad_norm": 1.5890673398971558, + "learning_rate": 3.583171927955913e-06, + "loss": 0.3706, + "step": 11953 + }, + { + "epoch": 1.454700334651658, + "grad_norm": 1.3090864419937134, + "learning_rate": 3.581675926044169e-06, + "loss": 0.3452, + "step": 11954 + }, + { + "epoch": 1.454822026163675, + "grad_norm": 2.5138299465179443, + "learning_rate": 3.5801801683629444e-06, + "loss": 0.4256, + "step": 11955 + }, + { + "epoch": 1.454943717675692, + "grad_norm": 1.707141637802124, + "learning_rate": 3.5786846549691532e-06, + "loss": 0.3863, + "step": 11956 + }, + { + "epoch": 1.455065409187709, + "grad_norm": 1.729933261871338, + "learning_rate": 3.5771893859196984e-06, + "loss": 0.4005, + "step": 11957 + }, + { + "epoch": 1.4551871006997261, + "grad_norm": 2.7479751110076904, + "learning_rate": 3.5756943612714856e-06, + "loss": 0.33, + "step": 11958 + }, + { + "epoch": 1.4553087922117434, + "grad_norm": 1.5847742557525635, + "learning_rate": 3.574199581081399e-06, + "loss": 0.3682, + "step": 11959 + }, + { + "epoch": 1.4554304837237604, + "grad_norm": 2.085653066635132, + "learning_rate": 3.5727050454063205e-06, + "loss": 0.3414, + "step": 11960 + }, + { + "epoch": 1.4555521752357774, + "grad_norm": 2.3502464294433594, + "learning_rate": 3.571210754303117e-06, + "loss": 0.3308, + "step": 11961 + }, + { + "epoch": 1.4556738667477944, + "grad_norm": 1.8462797403335571, + "learning_rate": 3.56971670782865e-06, + "loss": 0.4166, + "step": 11962 + }, + { + "epoch": 1.4557955582598114, + "grad_norm": 1.438338279724121, + "learning_rate": 3.5682229060397744e-06, + "loss": 0.3399, + "step": 11963 + }, + { + "epoch": 1.4559172497718285, + "grad_norm": 2.339810371398926, + "learning_rate": 3.5667293489933298e-06, + "loss": 0.3943, + "step": 11964 + }, + { + "epoch": 1.4560389412838455, + "grad_norm": 1.5432193279266357, + "learning_rate": 3.565236036746147e-06, + "loss": 0.3556, + "step": 11965 + }, + { + "epoch": 1.4561606327958625, + "grad_norm": 2.055554151535034, + "learning_rate": 3.5637429693550552e-06, + "loss": 0.4105, + "step": 11966 + }, + { + "epoch": 1.4562823243078795, + "grad_norm": 1.8492835760116577, + "learning_rate": 3.5622501468768654e-06, + "loss": 0.3668, + "step": 11967 + }, + { + "epoch": 1.4564040158198965, + "grad_norm": 2.7835192680358887, + "learning_rate": 3.5607575693683806e-06, + "loss": 0.452, + "step": 11968 + }, + { + "epoch": 1.4565257073319136, + "grad_norm": 1.5163910388946533, + "learning_rate": 3.559265236886401e-06, + "loss": 0.352, + "step": 11969 + }, + { + "epoch": 1.4566473988439306, + "grad_norm": 1.883076548576355, + "learning_rate": 3.557773149487711e-06, + "loss": 0.4034, + "step": 11970 + }, + { + "epoch": 1.4567690903559476, + "grad_norm": 1.7124474048614502, + "learning_rate": 3.556281307229088e-06, + "loss": 0.3203, + "step": 11971 + }, + { + "epoch": 1.4568907818679646, + "grad_norm": 1.624720573425293, + "learning_rate": 3.554789710167299e-06, + "loss": 0.3725, + "step": 11972 + }, + { + "epoch": 1.4570124733799816, + "grad_norm": 1.664584994316101, + "learning_rate": 3.5532983583590985e-06, + "loss": 0.3516, + "step": 11973 + }, + { + "epoch": 1.4571341648919987, + "grad_norm": 1.4626065492630005, + "learning_rate": 3.5518072518612434e-06, + "loss": 0.4068, + "step": 11974 + }, + { + "epoch": 1.457255856404016, + "grad_norm": 3.118086576461792, + "learning_rate": 3.550316390730468e-06, + "loss": 0.4359, + "step": 11975 + }, + { + "epoch": 1.457377547916033, + "grad_norm": 1.965084195137024, + "learning_rate": 3.5488257750235024e-06, + "loss": 0.4313, + "step": 11976 + }, + { + "epoch": 1.45749923942805, + "grad_norm": 1.603909969329834, + "learning_rate": 3.547335404797072e-06, + "loss": 0.3767, + "step": 11977 + }, + { + "epoch": 1.457620930940067, + "grad_norm": 1.9170496463775635, + "learning_rate": 3.5458452801078814e-06, + "loss": 0.4499, + "step": 11978 + }, + { + "epoch": 1.457742622452084, + "grad_norm": 1.6861101388931274, + "learning_rate": 3.5443554010126413e-06, + "loss": 0.4329, + "step": 11979 + }, + { + "epoch": 1.457864313964101, + "grad_norm": 1.5012520551681519, + "learning_rate": 3.542865767568041e-06, + "loss": 0.3644, + "step": 11980 + }, + { + "epoch": 1.457986005476118, + "grad_norm": 2.4398927688598633, + "learning_rate": 3.54137637983076e-06, + "loss": 0.3288, + "step": 11981 + }, + { + "epoch": 1.458107696988135, + "grad_norm": 1.597055196762085, + "learning_rate": 3.539887237857483e-06, + "loss": 0.3522, + "step": 11982 + }, + { + "epoch": 1.458229388500152, + "grad_norm": 2.284586191177368, + "learning_rate": 3.538398341704866e-06, + "loss": 0.3993, + "step": 11983 + }, + { + "epoch": 1.4583510800121693, + "grad_norm": 2.395482063293457, + "learning_rate": 3.536909691429563e-06, + "loss": 0.3262, + "step": 11984 + }, + { + "epoch": 1.4584727715241863, + "grad_norm": 2.9379451274871826, + "learning_rate": 3.5354212870882264e-06, + "loss": 0.3193, + "step": 11985 + }, + { + "epoch": 1.4585944630362033, + "grad_norm": 2.296464681625366, + "learning_rate": 3.5339331287374888e-06, + "loss": 0.384, + "step": 11986 + }, + { + "epoch": 1.4587161545482203, + "grad_norm": 2.4101970195770264, + "learning_rate": 3.532445216433982e-06, + "loss": 0.3865, + "step": 11987 + }, + { + "epoch": 1.4588378460602374, + "grad_norm": 1.3474007844924927, + "learning_rate": 3.530957550234323e-06, + "loss": 0.3637, + "step": 11988 + }, + { + "epoch": 1.4589595375722544, + "grad_norm": 1.4934849739074707, + "learning_rate": 3.529470130195115e-06, + "loss": 0.3237, + "step": 11989 + }, + { + "epoch": 1.4590812290842714, + "grad_norm": 2.1972146034240723, + "learning_rate": 3.527982956372965e-06, + "loss": 0.3321, + "step": 11990 + }, + { + "epoch": 1.4592029205962884, + "grad_norm": 1.6108638048171997, + "learning_rate": 3.526496028824461e-06, + "loss": 0.3198, + "step": 11991 + }, + { + "epoch": 1.4593246121083054, + "grad_norm": 2.217272996902466, + "learning_rate": 3.5250093476061777e-06, + "loss": 0.3584, + "step": 11992 + }, + { + "epoch": 1.4594463036203225, + "grad_norm": 3.880554676055908, + "learning_rate": 3.5235229127746962e-06, + "loss": 0.4702, + "step": 11993 + }, + { + "epoch": 1.4595679951323395, + "grad_norm": 2.569859027862549, + "learning_rate": 3.5220367243865716e-06, + "loss": 0.3564, + "step": 11994 + }, + { + "epoch": 1.4596896866443565, + "grad_norm": 1.963043451309204, + "learning_rate": 3.5205507824983597e-06, + "loss": 0.3921, + "step": 11995 + }, + { + "epoch": 1.4598113781563735, + "grad_norm": 1.3961774110794067, + "learning_rate": 3.519065087166602e-06, + "loss": 0.3369, + "step": 11996 + }, + { + "epoch": 1.4599330696683905, + "grad_norm": 3.0910370349884033, + "learning_rate": 3.517579638447829e-06, + "loss": 0.2967, + "step": 11997 + }, + { + "epoch": 1.4600547611804076, + "grad_norm": 1.6313807964324951, + "learning_rate": 3.5160944363985716e-06, + "loss": 0.3744, + "step": 11998 + }, + { + "epoch": 1.4601764526924246, + "grad_norm": 2.917386770248413, + "learning_rate": 3.514609481075343e-06, + "loss": 0.3837, + "step": 11999 + }, + { + "epoch": 1.4602981442044418, + "grad_norm": 2.9949278831481934, + "learning_rate": 3.513124772534643e-06, + "loss": 0.3854, + "step": 12000 + }, + { + "epoch": 1.4604198357164588, + "grad_norm": 2.0964198112487793, + "learning_rate": 3.5116403108329767e-06, + "loss": 0.3759, + "step": 12001 + }, + { + "epoch": 1.4605415272284759, + "grad_norm": 1.702501654624939, + "learning_rate": 3.5101560960268267e-06, + "loss": 0.3552, + "step": 12002 + }, + { + "epoch": 1.4606632187404929, + "grad_norm": 1.7093279361724854, + "learning_rate": 3.5086721281726665e-06, + "loss": 0.3581, + "step": 12003 + }, + { + "epoch": 1.46078491025251, + "grad_norm": 2.492913246154785, + "learning_rate": 3.5071884073269713e-06, + "loss": 0.3737, + "step": 12004 + }, + { + "epoch": 1.460906601764527, + "grad_norm": 2.7656280994415283, + "learning_rate": 3.505704933546198e-06, + "loss": 0.3497, + "step": 12005 + }, + { + "epoch": 1.461028293276544, + "grad_norm": 1.679345726966858, + "learning_rate": 3.5042217068867935e-06, + "loss": 0.368, + "step": 12006 + }, + { + "epoch": 1.461149984788561, + "grad_norm": 1.9937193393707275, + "learning_rate": 3.5027387274051985e-06, + "loss": 0.3724, + "step": 12007 + }, + { + "epoch": 1.461271676300578, + "grad_norm": 2.082627296447754, + "learning_rate": 3.5012559951578395e-06, + "loss": 0.3904, + "step": 12008 + }, + { + "epoch": 1.461393367812595, + "grad_norm": 1.383838415145874, + "learning_rate": 3.4997735102011464e-06, + "loss": 0.3145, + "step": 12009 + }, + { + "epoch": 1.4615150593246122, + "grad_norm": 2.8954319953918457, + "learning_rate": 3.498291272591525e-06, + "loss": 0.467, + "step": 12010 + }, + { + "epoch": 1.4616367508366293, + "grad_norm": 1.8391460180282593, + "learning_rate": 3.496809282385375e-06, + "loss": 0.3086, + "step": 12011 + }, + { + "epoch": 1.4617584423486463, + "grad_norm": 2.2016239166259766, + "learning_rate": 3.4953275396390972e-06, + "loss": 0.3987, + "step": 12012 + }, + { + "epoch": 1.4618801338606633, + "grad_norm": 4.661727428436279, + "learning_rate": 3.4938460444090695e-06, + "loss": 0.4281, + "step": 12013 + }, + { + "epoch": 1.4620018253726803, + "grad_norm": 1.52132248878479, + "learning_rate": 3.492364796751664e-06, + "loss": 0.3512, + "step": 12014 + }, + { + "epoch": 1.4621235168846973, + "grad_norm": 1.402281641960144, + "learning_rate": 3.49088379672325e-06, + "loss": 0.3362, + "step": 12015 + }, + { + "epoch": 1.4622452083967143, + "grad_norm": 2.3437771797180176, + "learning_rate": 3.489403044380182e-06, + "loss": 0.3978, + "step": 12016 + }, + { + "epoch": 1.4623668999087314, + "grad_norm": 1.3682541847229004, + "learning_rate": 3.4879225397788042e-06, + "loss": 0.3377, + "step": 12017 + }, + { + "epoch": 1.4624885914207484, + "grad_norm": 1.7647216320037842, + "learning_rate": 3.486442282975453e-06, + "loss": 0.4121, + "step": 12018 + }, + { + "epoch": 1.4626102829327654, + "grad_norm": 2.1108624935150146, + "learning_rate": 3.484962274026451e-06, + "loss": 0.4369, + "step": 12019 + }, + { + "epoch": 1.4627319744447824, + "grad_norm": 1.7634366750717163, + "learning_rate": 3.4834825129881244e-06, + "loss": 0.4307, + "step": 12020 + }, + { + "epoch": 1.4628536659567994, + "grad_norm": 1.9122487306594849, + "learning_rate": 3.4820029999167758e-06, + "loss": 0.388, + "step": 12021 + }, + { + "epoch": 1.4629753574688165, + "grad_norm": 2.420741319656372, + "learning_rate": 3.4805237348687025e-06, + "loss": 0.3697, + "step": 12022 + }, + { + "epoch": 1.4630970489808335, + "grad_norm": 1.7084821462631226, + "learning_rate": 3.4790447179001974e-06, + "loss": 0.3942, + "step": 12023 + }, + { + "epoch": 1.4632187404928505, + "grad_norm": 2.1634058952331543, + "learning_rate": 3.4775659490675394e-06, + "loss": 0.406, + "step": 12024 + }, + { + "epoch": 1.4633404320048677, + "grad_norm": 1.580566167831421, + "learning_rate": 3.476087428426994e-06, + "loss": 0.3885, + "step": 12025 + }, + { + "epoch": 1.4634621235168848, + "grad_norm": 1.4776626825332642, + "learning_rate": 3.4746091560348295e-06, + "loss": 0.3787, + "step": 12026 + }, + { + "epoch": 1.4635838150289018, + "grad_norm": 1.5963472127914429, + "learning_rate": 3.4731311319472926e-06, + "loss": 0.3267, + "step": 12027 + }, + { + "epoch": 1.4637055065409188, + "grad_norm": 2.0312659740448, + "learning_rate": 3.4716533562206266e-06, + "loss": 0.3485, + "step": 12028 + }, + { + "epoch": 1.4638271980529358, + "grad_norm": 1.6842656135559082, + "learning_rate": 3.4701758289110642e-06, + "loss": 0.4169, + "step": 12029 + }, + { + "epoch": 1.4639488895649528, + "grad_norm": 1.9565033912658691, + "learning_rate": 3.4686985500748226e-06, + "loss": 0.3502, + "step": 12030 + }, + { + "epoch": 1.4640705810769699, + "grad_norm": 1.7969274520874023, + "learning_rate": 3.467221519768125e-06, + "loss": 0.3997, + "step": 12031 + }, + { + "epoch": 1.4641922725889869, + "grad_norm": 1.9222075939178467, + "learning_rate": 3.46574473804717e-06, + "loss": 0.439, + "step": 12032 + }, + { + "epoch": 1.464313964101004, + "grad_norm": 3.1811234951019287, + "learning_rate": 3.464268204968149e-06, + "loss": 0.3264, + "step": 12033 + }, + { + "epoch": 1.464435655613021, + "grad_norm": 1.7962932586669922, + "learning_rate": 3.462791920587254e-06, + "loss": 0.3697, + "step": 12034 + }, + { + "epoch": 1.4645573471250382, + "grad_norm": 1.8668720722198486, + "learning_rate": 3.461315884960659e-06, + "loss": 0.3675, + "step": 12035 + }, + { + "epoch": 1.4646790386370552, + "grad_norm": 2.976686954498291, + "learning_rate": 3.4598400981445244e-06, + "loss": 0.3709, + "step": 12036 + }, + { + "epoch": 1.4648007301490722, + "grad_norm": 2.3302364349365234, + "learning_rate": 3.4583645601950153e-06, + "loss": 0.3864, + "step": 12037 + }, + { + "epoch": 1.4649224216610892, + "grad_norm": 2.78696608543396, + "learning_rate": 3.456889271168272e-06, + "loss": 0.4148, + "step": 12038 + }, + { + "epoch": 1.4650441131731062, + "grad_norm": 1.9746683835983276, + "learning_rate": 3.4554142311204397e-06, + "loss": 0.3723, + "step": 12039 + }, + { + "epoch": 1.4651658046851233, + "grad_norm": 2.125821113586426, + "learning_rate": 3.4539394401076443e-06, + "loss": 0.3339, + "step": 12040 + }, + { + "epoch": 1.4652874961971403, + "grad_norm": 1.5860700607299805, + "learning_rate": 3.4524648981859965e-06, + "loss": 0.3673, + "step": 12041 + }, + { + "epoch": 1.4654091877091573, + "grad_norm": 2.7654309272766113, + "learning_rate": 3.450990605411616e-06, + "loss": 0.4254, + "step": 12042 + }, + { + "epoch": 1.4655308792211743, + "grad_norm": 1.426876425743103, + "learning_rate": 3.4495165618405947e-06, + "loss": 0.3114, + "step": 12043 + }, + { + "epoch": 1.4656525707331913, + "grad_norm": 2.3017098903656006, + "learning_rate": 3.4480427675290306e-06, + "loss": 0.3791, + "step": 12044 + }, + { + "epoch": 1.4657742622452083, + "grad_norm": 2.061642646789551, + "learning_rate": 3.4465692225330007e-06, + "loss": 0.3914, + "step": 12045 + }, + { + "epoch": 1.4658959537572254, + "grad_norm": 1.9715923070907593, + "learning_rate": 3.4450959269085725e-06, + "loss": 0.3855, + "step": 12046 + }, + { + "epoch": 1.4660176452692424, + "grad_norm": 1.894641399383545, + "learning_rate": 3.4436228807118168e-06, + "loss": 0.3273, + "step": 12047 + }, + { + "epoch": 1.4661393367812594, + "grad_norm": 1.7752881050109863, + "learning_rate": 3.442150083998782e-06, + "loss": 0.345, + "step": 12048 + }, + { + "epoch": 1.4662610282932764, + "grad_norm": 2.1672749519348145, + "learning_rate": 3.4406775368255062e-06, + "loss": 0.3675, + "step": 12049 + }, + { + "epoch": 1.4663827198052934, + "grad_norm": 1.4513928890228271, + "learning_rate": 3.43920523924803e-06, + "loss": 0.3514, + "step": 12050 + }, + { + "epoch": 1.4665044113173107, + "grad_norm": 1.7190641164779663, + "learning_rate": 3.4377331913223756e-06, + "loss": 0.3328, + "step": 12051 + }, + { + "epoch": 1.4666261028293277, + "grad_norm": 1.7609766721725464, + "learning_rate": 3.436261393104556e-06, + "loss": 0.3829, + "step": 12052 + }, + { + "epoch": 1.4667477943413447, + "grad_norm": 3.0008814334869385, + "learning_rate": 3.434789844650577e-06, + "loss": 0.4678, + "step": 12053 + }, + { + "epoch": 1.4668694858533617, + "grad_norm": 17.622514724731445, + "learning_rate": 3.43331854601643e-06, + "loss": 0.3467, + "step": 12054 + }, + { + "epoch": 1.4669911773653788, + "grad_norm": 2.601954460144043, + "learning_rate": 3.431847497258107e-06, + "loss": 0.3984, + "step": 12055 + }, + { + "epoch": 1.4671128688773958, + "grad_norm": 1.7869925498962402, + "learning_rate": 3.4303766984315834e-06, + "loss": 0.3568, + "step": 12056 + }, + { + "epoch": 1.4672345603894128, + "grad_norm": 1.9167250394821167, + "learning_rate": 3.428906149592821e-06, + "loss": 0.3639, + "step": 12057 + }, + { + "epoch": 1.4673562519014298, + "grad_norm": 1.9740310907363892, + "learning_rate": 3.4274358507977846e-06, + "loss": 0.3733, + "step": 12058 + }, + { + "epoch": 1.4674779434134468, + "grad_norm": 1.9413151741027832, + "learning_rate": 3.425965802102418e-06, + "loss": 0.3117, + "step": 12059 + }, + { + "epoch": 1.467599634925464, + "grad_norm": 1.640803575515747, + "learning_rate": 3.424496003562656e-06, + "loss": 0.4151, + "step": 12060 + }, + { + "epoch": 1.467721326437481, + "grad_norm": 2.9612045288085938, + "learning_rate": 3.423026455234437e-06, + "loss": 0.3484, + "step": 12061 + }, + { + "epoch": 1.4678430179494981, + "grad_norm": 1.853200078010559, + "learning_rate": 3.421557157173674e-06, + "loss": 0.4105, + "step": 12062 + }, + { + "epoch": 1.4679647094615151, + "grad_norm": 1.6028133630752563, + "learning_rate": 3.420088109436277e-06, + "loss": 0.4252, + "step": 12063 + }, + { + "epoch": 1.4680864009735322, + "grad_norm": 2.3818438053131104, + "learning_rate": 3.4186193120781485e-06, + "loss": 0.327, + "step": 12064 + }, + { + "epoch": 1.4682080924855492, + "grad_norm": 1.8532391786575317, + "learning_rate": 3.417150765155174e-06, + "loss": 0.3683, + "step": 12065 + }, + { + "epoch": 1.4683297839975662, + "grad_norm": 1.5387063026428223, + "learning_rate": 3.4156824687232413e-06, + "loss": 0.3411, + "step": 12066 + }, + { + "epoch": 1.4684514755095832, + "grad_norm": 1.6694824695587158, + "learning_rate": 3.4142144228382203e-06, + "loss": 0.3726, + "step": 12067 + }, + { + "epoch": 1.4685731670216002, + "grad_norm": 2.199007749557495, + "learning_rate": 3.4127466275559685e-06, + "loss": 0.3869, + "step": 12068 + }, + { + "epoch": 1.4686948585336173, + "grad_norm": 1.4756526947021484, + "learning_rate": 3.411279082932346e-06, + "loss": 0.3549, + "step": 12069 + }, + { + "epoch": 1.4688165500456343, + "grad_norm": 2.005603551864624, + "learning_rate": 3.409811789023193e-06, + "loss": 0.3589, + "step": 12070 + }, + { + "epoch": 1.4689382415576513, + "grad_norm": 1.502556324005127, + "learning_rate": 3.4083447458843376e-06, + "loss": 0.3628, + "step": 12071 + }, + { + "epoch": 1.4690599330696683, + "grad_norm": 2.0606141090393066, + "learning_rate": 3.4068779535716133e-06, + "loss": 0.3447, + "step": 12072 + }, + { + "epoch": 1.4691816245816853, + "grad_norm": 1.632533311843872, + "learning_rate": 3.405411412140829e-06, + "loss": 0.4003, + "step": 12073 + }, + { + "epoch": 1.4693033160937023, + "grad_norm": 1.3820217847824097, + "learning_rate": 3.4039451216477903e-06, + "loss": 0.3776, + "step": 12074 + }, + { + "epoch": 1.4694250076057194, + "grad_norm": 1.6384810209274292, + "learning_rate": 3.402479082148293e-06, + "loss": 0.3988, + "step": 12075 + }, + { + "epoch": 1.4695466991177366, + "grad_norm": 1.8373570442199707, + "learning_rate": 3.40101329369812e-06, + "loss": 0.4093, + "step": 12076 + }, + { + "epoch": 1.4696683906297536, + "grad_norm": 1.817745327949524, + "learning_rate": 3.3995477563530533e-06, + "loss": 0.4081, + "step": 12077 + }, + { + "epoch": 1.4697900821417706, + "grad_norm": 2.7041759490966797, + "learning_rate": 3.3980824701688576e-06, + "loss": 0.4207, + "step": 12078 + }, + { + "epoch": 1.4699117736537877, + "grad_norm": 2.0243911743164062, + "learning_rate": 3.396617435201284e-06, + "loss": 0.3708, + "step": 12079 + }, + { + "epoch": 1.4700334651658047, + "grad_norm": 2.2408790588378906, + "learning_rate": 3.3951526515060895e-06, + "loss": 0.3973, + "step": 12080 + }, + { + "epoch": 1.4701551566778217, + "grad_norm": 2.189399480819702, + "learning_rate": 3.3936881191390083e-06, + "loss": 0.351, + "step": 12081 + }, + { + "epoch": 1.4702768481898387, + "grad_norm": 1.7004191875457764, + "learning_rate": 3.392223838155765e-06, + "loss": 0.3859, + "step": 12082 + }, + { + "epoch": 1.4703985397018557, + "grad_norm": 1.840395212173462, + "learning_rate": 3.390759808612084e-06, + "loss": 0.377, + "step": 12083 + }, + { + "epoch": 1.4705202312138728, + "grad_norm": 2.119800329208374, + "learning_rate": 3.3892960305636747e-06, + "loss": 0.3085, + "step": 12084 + }, + { + "epoch": 1.47064192272589, + "grad_norm": 2.191340208053589, + "learning_rate": 3.3878325040662297e-06, + "loss": 0.3916, + "step": 12085 + }, + { + "epoch": 1.470763614237907, + "grad_norm": 2.118105411529541, + "learning_rate": 3.3863692291754523e-06, + "loss": 0.3561, + "step": 12086 + }, + { + "epoch": 1.470885305749924, + "grad_norm": 2.6521801948547363, + "learning_rate": 3.3849062059470073e-06, + "loss": 0.4316, + "step": 12087 + }, + { + "epoch": 1.471006997261941, + "grad_norm": 2.1146295070648193, + "learning_rate": 3.3834434344365784e-06, + "loss": 0.3892, + "step": 12088 + }, + { + "epoch": 1.471128688773958, + "grad_norm": 2.861733913421631, + "learning_rate": 3.3819809146998216e-06, + "loss": 0.3464, + "step": 12089 + }, + { + "epoch": 1.471250380285975, + "grad_norm": 1.8602722883224487, + "learning_rate": 3.3805186467923855e-06, + "loss": 0.3542, + "step": 12090 + }, + { + "epoch": 1.4713720717979921, + "grad_norm": 1.3949251174926758, + "learning_rate": 3.379056630769921e-06, + "loss": 0.3686, + "step": 12091 + }, + { + "epoch": 1.4714937633100091, + "grad_norm": 1.4818003177642822, + "learning_rate": 3.3775948666880566e-06, + "loss": 0.3793, + "step": 12092 + }, + { + "epoch": 1.4716154548220262, + "grad_norm": 1.301381230354309, + "learning_rate": 3.376133354602411e-06, + "loss": 0.3386, + "step": 12093 + }, + { + "epoch": 1.4717371463340432, + "grad_norm": 2.334848642349243, + "learning_rate": 3.374672094568605e-06, + "loss": 0.4194, + "step": 12094 + }, + { + "epoch": 1.4718588378460602, + "grad_norm": 1.5721102952957153, + "learning_rate": 3.3732110866422364e-06, + "loss": 0.3587, + "step": 12095 + }, + { + "epoch": 1.4719805293580772, + "grad_norm": 1.7915695905685425, + "learning_rate": 3.3717503308789056e-06, + "loss": 0.3563, + "step": 12096 + }, + { + "epoch": 1.4721022208700942, + "grad_norm": 2.92470645904541, + "learning_rate": 3.3702898273341987e-06, + "loss": 0.3223, + "step": 12097 + }, + { + "epoch": 1.4722239123821113, + "grad_norm": 3.1266584396362305, + "learning_rate": 3.368829576063679e-06, + "loss": 0.433, + "step": 12098 + }, + { + "epoch": 1.4723456038941283, + "grad_norm": 1.9093586206436157, + "learning_rate": 3.367369577122924e-06, + "loss": 0.3603, + "step": 12099 + }, + { + "epoch": 1.4724672954061453, + "grad_norm": 2.7054426670074463, + "learning_rate": 3.3659098305674855e-06, + "loss": 0.4244, + "step": 12100 + }, + { + "epoch": 1.4725889869181625, + "grad_norm": 1.9046179056167603, + "learning_rate": 3.3644503364529047e-06, + "loss": 0.3284, + "step": 12101 + }, + { + "epoch": 1.4727106784301796, + "grad_norm": 1.6244277954101562, + "learning_rate": 3.362991094834729e-06, + "loss": 0.3715, + "step": 12102 + }, + { + "epoch": 1.4728323699421966, + "grad_norm": 1.5298324823379517, + "learning_rate": 3.3615321057684747e-06, + "loss": 0.3457, + "step": 12103 + }, + { + "epoch": 1.4729540614542136, + "grad_norm": 2.714778423309326, + "learning_rate": 3.3600733693096686e-06, + "loss": 0.3881, + "step": 12104 + }, + { + "epoch": 1.4730757529662306, + "grad_norm": 3.4847700595855713, + "learning_rate": 3.3586148855138157e-06, + "loss": 0.4263, + "step": 12105 + }, + { + "epoch": 1.4731974444782476, + "grad_norm": 1.8944257497787476, + "learning_rate": 3.3571566544364086e-06, + "loss": 0.3414, + "step": 12106 + }, + { + "epoch": 1.4733191359902647, + "grad_norm": 1.9065788984298706, + "learning_rate": 3.3556986761329445e-06, + "loss": 0.3471, + "step": 12107 + }, + { + "epoch": 1.4734408275022817, + "grad_norm": 1.49790620803833, + "learning_rate": 3.354240950658899e-06, + "loss": 0.3368, + "step": 12108 + }, + { + "epoch": 1.4735625190142987, + "grad_norm": 1.9343204498291016, + "learning_rate": 3.352783478069741e-06, + "loss": 0.3494, + "step": 12109 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 1.758023977279663, + "learning_rate": 3.3513262584209315e-06, + "loss": 0.3451, + "step": 12110 + }, + { + "epoch": 1.473805902038333, + "grad_norm": 1.5986489057540894, + "learning_rate": 3.3498692917679154e-06, + "loss": 0.3503, + "step": 12111 + }, + { + "epoch": 1.47392759355035, + "grad_norm": 1.632510781288147, + "learning_rate": 3.3484125781661414e-06, + "loss": 0.34, + "step": 12112 + }, + { + "epoch": 1.474049285062367, + "grad_norm": 2.4656519889831543, + "learning_rate": 3.3469561176710376e-06, + "loss": 0.4074, + "step": 12113 + }, + { + "epoch": 1.474170976574384, + "grad_norm": 1.7424700260162354, + "learning_rate": 3.3454999103380202e-06, + "loss": 0.3615, + "step": 12114 + }, + { + "epoch": 1.474292668086401, + "grad_norm": 1.3843655586242676, + "learning_rate": 3.344043956222509e-06, + "loss": 0.3445, + "step": 12115 + }, + { + "epoch": 1.474414359598418, + "grad_norm": 1.902854323387146, + "learning_rate": 3.3425882553799026e-06, + "loss": 0.43, + "step": 12116 + }, + { + "epoch": 1.474536051110435, + "grad_norm": 1.8997992277145386, + "learning_rate": 3.341132807865589e-06, + "loss": 0.3671, + "step": 12117 + }, + { + "epoch": 1.474657742622452, + "grad_norm": 3.5117878913879395, + "learning_rate": 3.3396776137349595e-06, + "loss": 0.4317, + "step": 12118 + }, + { + "epoch": 1.474779434134469, + "grad_norm": 1.873831033706665, + "learning_rate": 3.338222673043383e-06, + "loss": 0.362, + "step": 12119 + }, + { + "epoch": 1.4749011256464861, + "grad_norm": 1.730852723121643, + "learning_rate": 3.336767985846222e-06, + "loss": 0.3248, + "step": 12120 + }, + { + "epoch": 1.4750228171585031, + "grad_norm": 2.0717759132385254, + "learning_rate": 3.335313552198832e-06, + "loss": 0.422, + "step": 12121 + }, + { + "epoch": 1.4751445086705202, + "grad_norm": 2.1184513568878174, + "learning_rate": 3.333859372156553e-06, + "loss": 0.411, + "step": 12122 + }, + { + "epoch": 1.4752662001825372, + "grad_norm": 2.4771809577941895, + "learning_rate": 3.3324054457747258e-06, + "loss": 0.3334, + "step": 12123 + }, + { + "epoch": 1.4753878916945542, + "grad_norm": 1.6757452487945557, + "learning_rate": 3.330951773108673e-06, + "loss": 0.3808, + "step": 12124 + }, + { + "epoch": 1.4755095832065712, + "grad_norm": 1.8989259004592896, + "learning_rate": 3.3294983542137062e-06, + "loss": 0.3271, + "step": 12125 + }, + { + "epoch": 1.4756312747185885, + "grad_norm": 1.9451688528060913, + "learning_rate": 3.328045189145137e-06, + "loss": 0.3893, + "step": 12126 + }, + { + "epoch": 1.4757529662306055, + "grad_norm": 2.464289903640747, + "learning_rate": 3.3265922779582593e-06, + "loss": 0.3895, + "step": 12127 + }, + { + "epoch": 1.4758746577426225, + "grad_norm": 3.6289501190185547, + "learning_rate": 3.325139620708354e-06, + "loss": 0.4726, + "step": 12128 + }, + { + "epoch": 1.4759963492546395, + "grad_norm": 2.3231496810913086, + "learning_rate": 3.3236872174507072e-06, + "loss": 0.3482, + "step": 12129 + }, + { + "epoch": 1.4761180407666565, + "grad_norm": 1.5108734369277954, + "learning_rate": 3.32223506824058e-06, + "loss": 0.3742, + "step": 12130 + }, + { + "epoch": 1.4762397322786736, + "grad_norm": 2.2625434398651123, + "learning_rate": 3.3207831731332284e-06, + "loss": 0.3627, + "step": 12131 + }, + { + "epoch": 1.4763614237906906, + "grad_norm": 3.0276291370391846, + "learning_rate": 3.319331532183908e-06, + "loss": 0.3702, + "step": 12132 + }, + { + "epoch": 1.4764831153027076, + "grad_norm": 2.318291425704956, + "learning_rate": 3.317880145447845e-06, + "loss": 0.438, + "step": 12133 + }, + { + "epoch": 1.4766048068147246, + "grad_norm": 2.4994823932647705, + "learning_rate": 3.316429012980276e-06, + "loss": 0.4382, + "step": 12134 + }, + { + "epoch": 1.4767264983267416, + "grad_norm": 3.6462714672088623, + "learning_rate": 3.3149781348364185e-06, + "loss": 0.2762, + "step": 12135 + }, + { + "epoch": 1.4768481898387589, + "grad_norm": 1.6942640542984009, + "learning_rate": 3.313527511071476e-06, + "loss": 0.4008, + "step": 12136 + }, + { + "epoch": 1.476969881350776, + "grad_norm": 1.773508906364441, + "learning_rate": 3.312077141740655e-06, + "loss": 0.382, + "step": 12137 + }, + { + "epoch": 1.477091572862793, + "grad_norm": 1.9742419719696045, + "learning_rate": 3.3106270268991425e-06, + "loss": 0.378, + "step": 12138 + }, + { + "epoch": 1.47721326437481, + "grad_norm": 1.8255996704101562, + "learning_rate": 3.3091771666021146e-06, + "loss": 0.435, + "step": 12139 + }, + { + "epoch": 1.477334955886827, + "grad_norm": 2.394289016723633, + "learning_rate": 3.3077275609047486e-06, + "loss": 0.3651, + "step": 12140 + }, + { + "epoch": 1.477456647398844, + "grad_norm": 1.5934715270996094, + "learning_rate": 3.3062782098622004e-06, + "loss": 0.3511, + "step": 12141 + }, + { + "epoch": 1.477578338910861, + "grad_norm": 1.7620885372161865, + "learning_rate": 3.3048291135296185e-06, + "loss": 0.4043, + "step": 12142 + }, + { + "epoch": 1.477700030422878, + "grad_norm": 2.0183396339416504, + "learning_rate": 3.3033802719621533e-06, + "loss": 0.3745, + "step": 12143 + }, + { + "epoch": 1.477821721934895, + "grad_norm": 4.824260711669922, + "learning_rate": 3.301931685214924e-06, + "loss": 0.4956, + "step": 12144 + }, + { + "epoch": 1.477943413446912, + "grad_norm": 1.6940417289733887, + "learning_rate": 3.3004833533430615e-06, + "loss": 0.3681, + "step": 12145 + }, + { + "epoch": 1.478065104958929, + "grad_norm": 1.8248379230499268, + "learning_rate": 3.299035276401674e-06, + "loss": 0.4143, + "step": 12146 + }, + { + "epoch": 1.478186796470946, + "grad_norm": 2.2205722332000732, + "learning_rate": 3.297587454445862e-06, + "loss": 0.396, + "step": 12147 + }, + { + "epoch": 1.478308487982963, + "grad_norm": 1.820279598236084, + "learning_rate": 3.2961398875307238e-06, + "loss": 0.4017, + "step": 12148 + }, + { + "epoch": 1.4784301794949801, + "grad_norm": 1.601264238357544, + "learning_rate": 3.294692575711339e-06, + "loss": 0.3632, + "step": 12149 + }, + { + "epoch": 1.4785518710069971, + "grad_norm": 1.8530502319335938, + "learning_rate": 3.293245519042777e-06, + "loss": 0.3576, + "step": 12150 + }, + { + "epoch": 1.4786735625190142, + "grad_norm": 1.7436153888702393, + "learning_rate": 3.291798717580109e-06, + "loss": 0.3867, + "step": 12151 + }, + { + "epoch": 1.4787952540310314, + "grad_norm": 1.6185338497161865, + "learning_rate": 3.290352171378385e-06, + "loss": 0.3272, + "step": 12152 + }, + { + "epoch": 1.4789169455430484, + "grad_norm": 1.5198289155960083, + "learning_rate": 3.2889058804926455e-06, + "loss": 0.3829, + "step": 12153 + }, + { + "epoch": 1.4790386370550654, + "grad_norm": 1.5279617309570312, + "learning_rate": 3.287459844977933e-06, + "loss": 0.4059, + "step": 12154 + }, + { + "epoch": 1.4791603285670825, + "grad_norm": 1.4052528142929077, + "learning_rate": 3.2860140648892657e-06, + "loss": 0.3874, + "step": 12155 + }, + { + "epoch": 1.4792820200790995, + "grad_norm": 1.8463801145553589, + "learning_rate": 3.284568540281662e-06, + "loss": 0.387, + "step": 12156 + }, + { + "epoch": 1.4794037115911165, + "grad_norm": 2.442044496536255, + "learning_rate": 3.2831232712101245e-06, + "loss": 0.3689, + "step": 12157 + }, + { + "epoch": 1.4795254031031335, + "grad_norm": 1.3333532810211182, + "learning_rate": 3.281678257729647e-06, + "loss": 0.3751, + "step": 12158 + }, + { + "epoch": 1.4796470946151505, + "grad_norm": 1.7679804563522339, + "learning_rate": 3.280233499895221e-06, + "loss": 0.385, + "step": 12159 + }, + { + "epoch": 1.4797687861271676, + "grad_norm": 2.173210620880127, + "learning_rate": 3.2787889977618194e-06, + "loss": 0.388, + "step": 12160 + }, + { + "epoch": 1.4798904776391848, + "grad_norm": 1.9507856369018555, + "learning_rate": 3.2773447513844058e-06, + "loss": 0.3477, + "step": 12161 + }, + { + "epoch": 1.4800121691512018, + "grad_norm": 1.4006422758102417, + "learning_rate": 3.275900760817943e-06, + "loss": 0.3612, + "step": 12162 + }, + { + "epoch": 1.4801338606632188, + "grad_norm": 1.785213589668274, + "learning_rate": 3.274457026117369e-06, + "loss": 0.4075, + "step": 12163 + }, + { + "epoch": 1.4802555521752359, + "grad_norm": 1.889771819114685, + "learning_rate": 3.2730135473376324e-06, + "loss": 0.2934, + "step": 12164 + }, + { + "epoch": 1.4803772436872529, + "grad_norm": 1.497920036315918, + "learning_rate": 3.271570324533653e-06, + "loss": 0.3495, + "step": 12165 + }, + { + "epoch": 1.48049893519927, + "grad_norm": 1.4012306928634644, + "learning_rate": 3.27012735776035e-06, + "loss": 0.3788, + "step": 12166 + }, + { + "epoch": 1.480620626711287, + "grad_norm": 1.801177978515625, + "learning_rate": 3.2686846470726306e-06, + "loss": 0.4004, + "step": 12167 + }, + { + "epoch": 1.480742318223304, + "grad_norm": 1.9686452150344849, + "learning_rate": 3.2672421925253938e-06, + "loss": 0.3819, + "step": 12168 + }, + { + "epoch": 1.480864009735321, + "grad_norm": 3.5021603107452393, + "learning_rate": 3.265799994173524e-06, + "loss": 0.4607, + "step": 12169 + }, + { + "epoch": 1.480985701247338, + "grad_norm": 1.4304561614990234, + "learning_rate": 3.2643580520719075e-06, + "loss": 0.3915, + "step": 12170 + }, + { + "epoch": 1.481107392759355, + "grad_norm": 1.576928973197937, + "learning_rate": 3.262916366275406e-06, + "loss": 0.3759, + "step": 12171 + }, + { + "epoch": 1.481229084271372, + "grad_norm": 2.650402545928955, + "learning_rate": 3.2614749368388842e-06, + "loss": 0.2997, + "step": 12172 + }, + { + "epoch": 1.481350775783389, + "grad_norm": 2.327786445617676, + "learning_rate": 3.2600337638171897e-06, + "loss": 0.3778, + "step": 12173 + }, + { + "epoch": 1.481472467295406, + "grad_norm": 1.6953985691070557, + "learning_rate": 3.2585928472651573e-06, + "loss": 0.4064, + "step": 12174 + }, + { + "epoch": 1.481594158807423, + "grad_norm": 2.0793440341949463, + "learning_rate": 3.2571521872376243e-06, + "loss": 0.4006, + "step": 12175 + }, + { + "epoch": 1.48171585031944, + "grad_norm": 2.706766128540039, + "learning_rate": 3.255711783789408e-06, + "loss": 0.3322, + "step": 12176 + }, + { + "epoch": 1.4818375418314573, + "grad_norm": 1.3230936527252197, + "learning_rate": 3.2542716369753156e-06, + "loss": 0.3511, + "step": 12177 + }, + { + "epoch": 1.4819592333434743, + "grad_norm": 1.5271930694580078, + "learning_rate": 3.2528317468501557e-06, + "loss": 0.3682, + "step": 12178 + }, + { + "epoch": 1.4820809248554914, + "grad_norm": 3.0575695037841797, + "learning_rate": 3.2513921134687086e-06, + "loss": 0.3287, + "step": 12179 + }, + { + "epoch": 1.4822026163675084, + "grad_norm": 1.637450933456421, + "learning_rate": 3.249952736885762e-06, + "loss": 0.3497, + "step": 12180 + }, + { + "epoch": 1.4823243078795254, + "grad_norm": 1.9152295589447021, + "learning_rate": 3.248513617156087e-06, + "loss": 0.365, + "step": 12181 + }, + { + "epoch": 1.4824459993915424, + "grad_norm": 2.66620135307312, + "learning_rate": 3.2470747543344394e-06, + "loss": 0.4223, + "step": 12182 + }, + { + "epoch": 1.4825676909035594, + "grad_norm": 1.3882116079330444, + "learning_rate": 3.2456361484755795e-06, + "loss": 0.3241, + "step": 12183 + }, + { + "epoch": 1.4826893824155765, + "grad_norm": 1.5836743116378784, + "learning_rate": 3.2441977996342443e-06, + "loss": 0.3478, + "step": 12184 + }, + { + "epoch": 1.4828110739275935, + "grad_norm": 2.3171470165252686, + "learning_rate": 3.242759707865163e-06, + "loss": 0.3475, + "step": 12185 + }, + { + "epoch": 1.4829327654396107, + "grad_norm": 1.8339293003082275, + "learning_rate": 3.2413218732230643e-06, + "loss": 0.3714, + "step": 12186 + }, + { + "epoch": 1.4830544569516277, + "grad_norm": 1.4502087831497192, + "learning_rate": 3.2398842957626596e-06, + "loss": 0.3598, + "step": 12187 + }, + { + "epoch": 1.4831761484636448, + "grad_norm": 1.9981969594955444, + "learning_rate": 3.238446975538646e-06, + "loss": 0.3509, + "step": 12188 + }, + { + "epoch": 1.4832978399756618, + "grad_norm": 2.196707010269165, + "learning_rate": 3.2370099126057277e-06, + "loss": 0.3101, + "step": 12189 + }, + { + "epoch": 1.4834195314876788, + "grad_norm": 1.7515709400177002, + "learning_rate": 3.2355731070185737e-06, + "loss": 0.4177, + "step": 12190 + }, + { + "epoch": 1.4835412229996958, + "grad_norm": 1.9297406673431396, + "learning_rate": 3.2341365588318676e-06, + "loss": 0.3684, + "step": 12191 + }, + { + "epoch": 1.4836629145117128, + "grad_norm": 1.7228189706802368, + "learning_rate": 3.2327002681002706e-06, + "loss": 0.3595, + "step": 12192 + }, + { + "epoch": 1.4837846060237299, + "grad_norm": 2.0338523387908936, + "learning_rate": 3.231264234878432e-06, + "loss": 0.4159, + "step": 12193 + }, + { + "epoch": 1.4839062975357469, + "grad_norm": 1.951887845993042, + "learning_rate": 3.229828459221004e-06, + "loss": 0.383, + "step": 12194 + }, + { + "epoch": 1.484027989047764, + "grad_norm": 1.9407411813735962, + "learning_rate": 3.2283929411826152e-06, + "loss": 0.3774, + "step": 12195 + }, + { + "epoch": 1.484149680559781, + "grad_norm": 1.7167006731033325, + "learning_rate": 3.2269576808178895e-06, + "loss": 0.3505, + "step": 12196 + }, + { + "epoch": 1.484271372071798, + "grad_norm": 1.6156951189041138, + "learning_rate": 3.2255226781814465e-06, + "loss": 0.3908, + "step": 12197 + }, + { + "epoch": 1.484393063583815, + "grad_norm": 1.6708869934082031, + "learning_rate": 3.2240879333278872e-06, + "loss": 0.4116, + "step": 12198 + }, + { + "epoch": 1.484514755095832, + "grad_norm": 1.7810593843460083, + "learning_rate": 3.2226534463118055e-06, + "loss": 0.3949, + "step": 12199 + }, + { + "epoch": 1.484636446607849, + "grad_norm": 1.5908105373382568, + "learning_rate": 3.221219217187791e-06, + "loss": 0.3814, + "step": 12200 + }, + { + "epoch": 1.484758138119866, + "grad_norm": 2.0193192958831787, + "learning_rate": 3.219785246010416e-06, + "loss": 0.4046, + "step": 12201 + }, + { + "epoch": 1.4848798296318833, + "grad_norm": 1.867391586303711, + "learning_rate": 3.2183515328342473e-06, + "loss": 0.3854, + "step": 12202 + }, + { + "epoch": 1.4850015211439003, + "grad_norm": 3.2296524047851562, + "learning_rate": 3.2169180777138407e-06, + "loss": 0.3279, + "step": 12203 + }, + { + "epoch": 1.4851232126559173, + "grad_norm": 1.8475428819656372, + "learning_rate": 3.2154848807037376e-06, + "loss": 0.4015, + "step": 12204 + }, + { + "epoch": 1.4852449041679343, + "grad_norm": 1.554054856300354, + "learning_rate": 3.2140519418584805e-06, + "loss": 0.4448, + "step": 12205 + }, + { + "epoch": 1.4853665956799513, + "grad_norm": 1.6051626205444336, + "learning_rate": 3.2126192612325934e-06, + "loss": 0.3706, + "step": 12206 + }, + { + "epoch": 1.4854882871919683, + "grad_norm": 2.1893246173858643, + "learning_rate": 3.21118683888059e-06, + "loss": 0.4109, + "step": 12207 + }, + { + "epoch": 1.4856099787039854, + "grad_norm": 3.1736106872558594, + "learning_rate": 3.209754674856982e-06, + "loss": 0.3119, + "step": 12208 + }, + { + "epoch": 1.4857316702160024, + "grad_norm": 3.722097158432007, + "learning_rate": 3.208322769216264e-06, + "loss": 0.318, + "step": 12209 + }, + { + "epoch": 1.4858533617280194, + "grad_norm": 1.6310083866119385, + "learning_rate": 3.2068911220129197e-06, + "loss": 0.3542, + "step": 12210 + }, + { + "epoch": 1.4859750532400364, + "grad_norm": 3.6910557746887207, + "learning_rate": 3.2054597333014328e-06, + "loss": 0.3327, + "step": 12211 + }, + { + "epoch": 1.4860967447520537, + "grad_norm": 1.5434637069702148, + "learning_rate": 3.2040286031362654e-06, + "loss": 0.3696, + "step": 12212 + }, + { + "epoch": 1.4862184362640707, + "grad_norm": 1.804858684539795, + "learning_rate": 3.202597731571878e-06, + "loss": 0.4362, + "step": 12213 + }, + { + "epoch": 1.4863401277760877, + "grad_norm": 1.7253929376602173, + "learning_rate": 3.201167118662716e-06, + "loss": 0.3495, + "step": 12214 + }, + { + "epoch": 1.4864618192881047, + "grad_norm": 2.440077781677246, + "learning_rate": 3.1997367644632148e-06, + "loss": 0.3428, + "step": 12215 + }, + { + "epoch": 1.4865835108001217, + "grad_norm": 1.3952327966690063, + "learning_rate": 3.198306669027809e-06, + "loss": 0.3885, + "step": 12216 + }, + { + "epoch": 1.4867052023121388, + "grad_norm": 4.270806312561035, + "learning_rate": 3.1968768324109123e-06, + "loss": 0.3218, + "step": 12217 + }, + { + "epoch": 1.4868268938241558, + "grad_norm": 1.5582537651062012, + "learning_rate": 3.1954472546669313e-06, + "loss": 0.348, + "step": 12218 + }, + { + "epoch": 1.4869485853361728, + "grad_norm": 1.860722303390503, + "learning_rate": 3.19401793585027e-06, + "loss": 0.3393, + "step": 12219 + }, + { + "epoch": 1.4870702768481898, + "grad_norm": 2.8334057331085205, + "learning_rate": 3.1925888760153144e-06, + "loss": 0.3748, + "step": 12220 + }, + { + "epoch": 1.4871919683602068, + "grad_norm": 1.6063847541809082, + "learning_rate": 3.1911600752164383e-06, + "loss": 0.3489, + "step": 12221 + }, + { + "epoch": 1.4873136598722239, + "grad_norm": 3.600930690765381, + "learning_rate": 3.189731533508019e-06, + "loss": 0.4335, + "step": 12222 + }, + { + "epoch": 1.4874353513842409, + "grad_norm": 2.0729432106018066, + "learning_rate": 3.188303250944408e-06, + "loss": 0.3805, + "step": 12223 + }, + { + "epoch": 1.487557042896258, + "grad_norm": 1.4001272916793823, + "learning_rate": 3.1868752275799643e-06, + "loss": 0.3465, + "step": 12224 + }, + { + "epoch": 1.487678734408275, + "grad_norm": 2.107030153274536, + "learning_rate": 3.185447463469017e-06, + "loss": 0.374, + "step": 12225 + }, + { + "epoch": 1.487800425920292, + "grad_norm": 2.100602865219116, + "learning_rate": 3.184019958665896e-06, + "loss": 0.4118, + "step": 12226 + }, + { + "epoch": 1.4879221174323092, + "grad_norm": 3.5554583072662354, + "learning_rate": 3.1825927132249266e-06, + "loss": 0.4575, + "step": 12227 + }, + { + "epoch": 1.4880438089443262, + "grad_norm": 1.6292518377304077, + "learning_rate": 3.181165727200416e-06, + "loss": 0.3751, + "step": 12228 + }, + { + "epoch": 1.4881655004563432, + "grad_norm": 2.390096664428711, + "learning_rate": 3.179739000646661e-06, + "loss": 0.3181, + "step": 12229 + }, + { + "epoch": 1.4882871919683602, + "grad_norm": 1.8347625732421875, + "learning_rate": 3.1783125336179577e-06, + "loss": 0.4166, + "step": 12230 + }, + { + "epoch": 1.4884088834803773, + "grad_norm": 3.1557180881500244, + "learning_rate": 3.176886326168578e-06, + "loss": 0.3343, + "step": 12231 + }, + { + "epoch": 1.4885305749923943, + "grad_norm": 1.8456231355667114, + "learning_rate": 3.1754603783528e-06, + "loss": 0.4191, + "step": 12232 + }, + { + "epoch": 1.4886522665044113, + "grad_norm": 2.0623562335968018, + "learning_rate": 3.174034690224882e-06, + "loss": 0.3064, + "step": 12233 + }, + { + "epoch": 1.4887739580164283, + "grad_norm": 1.72364342212677, + "learning_rate": 3.172609261839069e-06, + "loss": 0.3882, + "step": 12234 + }, + { + "epoch": 1.4888956495284453, + "grad_norm": 1.563386082649231, + "learning_rate": 3.1711840932496117e-06, + "loss": 0.3801, + "step": 12235 + }, + { + "epoch": 1.4890173410404623, + "grad_norm": 3.1208484172821045, + "learning_rate": 3.1697591845107324e-06, + "loss": 0.3294, + "step": 12236 + }, + { + "epoch": 1.4891390325524796, + "grad_norm": 1.7949192523956299, + "learning_rate": 3.168334535676649e-06, + "loss": 0.4018, + "step": 12237 + }, + { + "epoch": 1.4892607240644966, + "grad_norm": 2.5957248210906982, + "learning_rate": 3.166910146801582e-06, + "loss": 0.34, + "step": 12238 + }, + { + "epoch": 1.4893824155765136, + "grad_norm": 2.835603713989258, + "learning_rate": 3.165486017939724e-06, + "loss": 0.2921, + "step": 12239 + }, + { + "epoch": 1.4895041070885306, + "grad_norm": 1.5516680479049683, + "learning_rate": 3.1640621491452726e-06, + "loss": 0.3857, + "step": 12240 + }, + { + "epoch": 1.4896257986005477, + "grad_norm": 1.36301851272583, + "learning_rate": 3.162638540472406e-06, + "loss": 0.3229, + "step": 12241 + }, + { + "epoch": 1.4897474901125647, + "grad_norm": 2.6442418098449707, + "learning_rate": 3.161215191975292e-06, + "loss": 0.3383, + "step": 12242 + }, + { + "epoch": 1.4898691816245817, + "grad_norm": 1.591164231300354, + "learning_rate": 3.1597921037080993e-06, + "loss": 0.3961, + "step": 12243 + }, + { + "epoch": 1.4899908731365987, + "grad_norm": 1.8963242769241333, + "learning_rate": 3.1583692757249752e-06, + "loss": 0.431, + "step": 12244 + }, + { + "epoch": 1.4901125646486157, + "grad_norm": 1.5690912008285522, + "learning_rate": 3.156946708080059e-06, + "loss": 0.3149, + "step": 12245 + }, + { + "epoch": 1.4902342561606328, + "grad_norm": 3.6722986698150635, + "learning_rate": 3.1555244008274878e-06, + "loss": 0.4469, + "step": 12246 + }, + { + "epoch": 1.4903559476726498, + "grad_norm": 1.4763903617858887, + "learning_rate": 3.15410235402138e-06, + "loss": 0.343, + "step": 12247 + }, + { + "epoch": 1.4904776391846668, + "grad_norm": 1.4336321353912354, + "learning_rate": 3.1526805677158477e-06, + "loss": 0.3849, + "step": 12248 + }, + { + "epoch": 1.4905993306966838, + "grad_norm": 2.9285664558410645, + "learning_rate": 3.1512590419649935e-06, + "loss": 0.4136, + "step": 12249 + }, + { + "epoch": 1.4907210222087008, + "grad_norm": 1.8608766794204712, + "learning_rate": 3.149837776822906e-06, + "loss": 0.3577, + "step": 12250 + }, + { + "epoch": 1.4908427137207179, + "grad_norm": 1.4334529638290405, + "learning_rate": 3.1484167723436723e-06, + "loss": 0.3481, + "step": 12251 + }, + { + "epoch": 1.4909644052327349, + "grad_norm": 3.0986416339874268, + "learning_rate": 3.146996028581363e-06, + "loss": 0.3838, + "step": 12252 + }, + { + "epoch": 1.4910860967447521, + "grad_norm": 2.436950922012329, + "learning_rate": 3.145575545590036e-06, + "loss": 0.4266, + "step": 12253 + }, + { + "epoch": 1.4912077882567691, + "grad_norm": 2.039731502532959, + "learning_rate": 3.144155323423752e-06, + "loss": 0.3808, + "step": 12254 + }, + { + "epoch": 1.4913294797687862, + "grad_norm": 1.6108180284500122, + "learning_rate": 3.1427353621365474e-06, + "loss": 0.4109, + "step": 12255 + }, + { + "epoch": 1.4914511712808032, + "grad_norm": 2.0467262268066406, + "learning_rate": 3.1413156617824537e-06, + "loss": 0.4496, + "step": 12256 + }, + { + "epoch": 1.4915728627928202, + "grad_norm": 2.6676440238952637, + "learning_rate": 3.1398962224154983e-06, + "loss": 0.3434, + "step": 12257 + }, + { + "epoch": 1.4916945543048372, + "grad_norm": 1.7897522449493408, + "learning_rate": 3.1384770440896917e-06, + "loss": 0.382, + "step": 12258 + }, + { + "epoch": 1.4918162458168542, + "grad_norm": 1.4741199016571045, + "learning_rate": 3.137058126859036e-06, + "loss": 0.3462, + "step": 12259 + }, + { + "epoch": 1.4919379373288713, + "grad_norm": 2.7347793579101562, + "learning_rate": 3.135639470777524e-06, + "loss": 0.3285, + "step": 12260 + }, + { + "epoch": 1.4920596288408883, + "grad_norm": 2.7070937156677246, + "learning_rate": 3.1342210758991355e-06, + "loss": 0.4233, + "step": 12261 + }, + { + "epoch": 1.4921813203529055, + "grad_norm": 2.465555191040039, + "learning_rate": 3.1328029422778494e-06, + "loss": 0.3469, + "step": 12262 + }, + { + "epoch": 1.4923030118649225, + "grad_norm": 1.5377061367034912, + "learning_rate": 3.1313850699676264e-06, + "loss": 0.3873, + "step": 12263 + }, + { + "epoch": 1.4924247033769396, + "grad_norm": 1.359940528869629, + "learning_rate": 3.129967459022415e-06, + "loss": 0.425, + "step": 12264 + }, + { + "epoch": 1.4925463948889566, + "grad_norm": 2.1438324451446533, + "learning_rate": 3.1285501094961645e-06, + "loss": 0.353, + "step": 12265 + }, + { + "epoch": 1.4926680864009736, + "grad_norm": 1.7696703672409058, + "learning_rate": 3.1271330214428063e-06, + "loss": 0.3193, + "step": 12266 + }, + { + "epoch": 1.4927897779129906, + "grad_norm": 2.484936237335205, + "learning_rate": 3.1257161949162598e-06, + "loss": 0.3175, + "step": 12267 + }, + { + "epoch": 1.4929114694250076, + "grad_norm": 1.596076250076294, + "learning_rate": 3.1242996299704432e-06, + "loss": 0.3954, + "step": 12268 + }, + { + "epoch": 1.4930331609370247, + "grad_norm": 1.520337700843811, + "learning_rate": 3.1228833266592585e-06, + "loss": 0.351, + "step": 12269 + }, + { + "epoch": 1.4931548524490417, + "grad_norm": 1.6884106397628784, + "learning_rate": 3.121467285036598e-06, + "loss": 0.4404, + "step": 12270 + }, + { + "epoch": 1.4932765439610587, + "grad_norm": 2.1992413997650146, + "learning_rate": 3.1200515051563442e-06, + "loss": 0.2917, + "step": 12271 + }, + { + "epoch": 1.4933982354730757, + "grad_norm": 1.7355977296829224, + "learning_rate": 3.118635987072369e-06, + "loss": 0.4039, + "step": 12272 + }, + { + "epoch": 1.4935199269850927, + "grad_norm": 2.3108272552490234, + "learning_rate": 3.1172207308385406e-06, + "loss": 0.3668, + "step": 12273 + }, + { + "epoch": 1.4936416184971097, + "grad_norm": 1.553948998451233, + "learning_rate": 3.11580573650871e-06, + "loss": 0.3946, + "step": 12274 + }, + { + "epoch": 1.4937633100091268, + "grad_norm": 2.9436333179473877, + "learning_rate": 3.1143910041367185e-06, + "loss": 0.3571, + "step": 12275 + }, + { + "epoch": 1.4938850015211438, + "grad_norm": 2.040424346923828, + "learning_rate": 3.112976533776404e-06, + "loss": 0.3794, + "step": 12276 + }, + { + "epoch": 1.4940066930331608, + "grad_norm": 2.03022837638855, + "learning_rate": 3.1115623254815886e-06, + "loss": 0.3916, + "step": 12277 + }, + { + "epoch": 1.494128384545178, + "grad_norm": 1.7078419923782349, + "learning_rate": 3.1101483793060816e-06, + "loss": 0.391, + "step": 12278 + }, + { + "epoch": 1.494250076057195, + "grad_norm": 1.7216203212738037, + "learning_rate": 3.1087346953036924e-06, + "loss": 0.3995, + "step": 12279 + }, + { + "epoch": 1.494371767569212, + "grad_norm": 2.206731081008911, + "learning_rate": 3.107321273528212e-06, + "loss": 0.3593, + "step": 12280 + }, + { + "epoch": 1.494493459081229, + "grad_norm": 2.5501060485839844, + "learning_rate": 3.1059081140334224e-06, + "loss": 0.3681, + "step": 12281 + }, + { + "epoch": 1.4946151505932461, + "grad_norm": 2.21803617477417, + "learning_rate": 3.1044952168731047e-06, + "loss": 0.3587, + "step": 12282 + }, + { + "epoch": 1.4947368421052631, + "grad_norm": 3.62646484375, + "learning_rate": 3.10308258210101e-06, + "loss": 0.4794, + "step": 12283 + }, + { + "epoch": 1.4948585336172802, + "grad_norm": 1.92630934715271, + "learning_rate": 3.101670209770903e-06, + "loss": 0.3491, + "step": 12284 + }, + { + "epoch": 1.4949802251292972, + "grad_norm": 3.7176802158355713, + "learning_rate": 3.1002580999365228e-06, + "loss": 0.3309, + "step": 12285 + }, + { + "epoch": 1.4951019166413142, + "grad_norm": 1.882136583328247, + "learning_rate": 3.0988462526516004e-06, + "loss": 0.4068, + "step": 12286 + }, + { + "epoch": 1.4952236081533314, + "grad_norm": 3.4674339294433594, + "learning_rate": 3.0974346679698653e-06, + "loss": 0.3601, + "step": 12287 + }, + { + "epoch": 1.4953452996653485, + "grad_norm": 1.4098377227783203, + "learning_rate": 3.0960233459450295e-06, + "loss": 0.3596, + "step": 12288 + }, + { + "epoch": 1.4954669911773655, + "grad_norm": 2.281935691833496, + "learning_rate": 3.0946122866307925e-06, + "loss": 0.3438, + "step": 12289 + }, + { + "epoch": 1.4955886826893825, + "grad_norm": 1.6393074989318848, + "learning_rate": 3.093201490080854e-06, + "loss": 0.3468, + "step": 12290 + }, + { + "epoch": 1.4957103742013995, + "grad_norm": 2.3345911502838135, + "learning_rate": 3.0917909563488924e-06, + "loss": 0.3274, + "step": 12291 + }, + { + "epoch": 1.4958320657134165, + "grad_norm": 1.6504086256027222, + "learning_rate": 3.0903806854885875e-06, + "loss": 0.3735, + "step": 12292 + }, + { + "epoch": 1.4959537572254336, + "grad_norm": 1.6430517435073853, + "learning_rate": 3.0889706775536023e-06, + "loss": 0.3549, + "step": 12293 + }, + { + "epoch": 1.4960754487374506, + "grad_norm": 2.504863977432251, + "learning_rate": 3.0875609325975818e-06, + "loss": 0.3972, + "step": 12294 + }, + { + "epoch": 1.4961971402494676, + "grad_norm": 1.7399461269378662, + "learning_rate": 3.0861514506741786e-06, + "loss": 0.3185, + "step": 12295 + }, + { + "epoch": 1.4963188317614846, + "grad_norm": 3.7056572437286377, + "learning_rate": 3.0847422318370247e-06, + "loss": 0.4252, + "step": 12296 + }, + { + "epoch": 1.4964405232735016, + "grad_norm": 1.4100056886672974, + "learning_rate": 3.083333276139738e-06, + "loss": 0.3506, + "step": 12297 + }, + { + "epoch": 1.4965622147855187, + "grad_norm": 1.956782579421997, + "learning_rate": 3.0819245836359413e-06, + "loss": 0.3145, + "step": 12298 + }, + { + "epoch": 1.4966839062975357, + "grad_norm": 1.6348721981048584, + "learning_rate": 3.0805161543792293e-06, + "loss": 0.3186, + "step": 12299 + }, + { + "epoch": 1.4968055978095527, + "grad_norm": 2.610935688018799, + "learning_rate": 3.0791079884232044e-06, + "loss": 0.4086, + "step": 12300 + }, + { + "epoch": 1.4969272893215697, + "grad_norm": 2.711599588394165, + "learning_rate": 3.0777000858214458e-06, + "loss": 0.3667, + "step": 12301 + }, + { + "epoch": 1.4970489808335867, + "grad_norm": 4.114050388336182, + "learning_rate": 3.0762924466275244e-06, + "loss": 0.4347, + "step": 12302 + }, + { + "epoch": 1.497170672345604, + "grad_norm": 3.9740138053894043, + "learning_rate": 3.0748850708950096e-06, + "loss": 0.4042, + "step": 12303 + }, + { + "epoch": 1.497292363857621, + "grad_norm": 2.804448127746582, + "learning_rate": 3.073477958677451e-06, + "loss": 0.4266, + "step": 12304 + }, + { + "epoch": 1.497414055369638, + "grad_norm": 4.193317890167236, + "learning_rate": 3.072071110028395e-06, + "loss": 0.4139, + "step": 12305 + }, + { + "epoch": 1.497535746881655, + "grad_norm": 1.7350159883499146, + "learning_rate": 3.0706645250013722e-06, + "loss": 0.337, + "step": 12306 + }, + { + "epoch": 1.497657438393672, + "grad_norm": 2.3582441806793213, + "learning_rate": 3.069258203649904e-06, + "loss": 0.3674, + "step": 12307 + }, + { + "epoch": 1.497779129905689, + "grad_norm": 1.4542208909988403, + "learning_rate": 3.06785214602751e-06, + "loss": 0.3679, + "step": 12308 + }, + { + "epoch": 1.497900821417706, + "grad_norm": 3.017589569091797, + "learning_rate": 3.066446352187692e-06, + "loss": 0.2985, + "step": 12309 + }, + { + "epoch": 1.498022512929723, + "grad_norm": 1.587577223777771, + "learning_rate": 3.0650408221839365e-06, + "loss": 0.3747, + "step": 12310 + }, + { + "epoch": 1.4981442044417401, + "grad_norm": 1.7981516122817993, + "learning_rate": 3.063635556069737e-06, + "loss": 0.4031, + "step": 12311 + }, + { + "epoch": 1.4982658959537571, + "grad_norm": 2.5378201007843018, + "learning_rate": 3.062230553898562e-06, + "loss": 0.406, + "step": 12312 + }, + { + "epoch": 1.4983875874657744, + "grad_norm": 1.496023178100586, + "learning_rate": 3.060825815723871e-06, + "loss": 0.3844, + "step": 12313 + }, + { + "epoch": 1.4985092789777914, + "grad_norm": 1.455578088760376, + "learning_rate": 3.059421341599126e-06, + "loss": 0.344, + "step": 12314 + }, + { + "epoch": 1.4986309704898084, + "grad_norm": 1.6901764869689941, + "learning_rate": 3.058017131577763e-06, + "loss": 0.3795, + "step": 12315 + }, + { + "epoch": 1.4987526620018254, + "grad_norm": 1.5380728244781494, + "learning_rate": 3.0566131857132186e-06, + "loss": 0.4138, + "step": 12316 + }, + { + "epoch": 1.4988743535138425, + "grad_norm": 1.7879115343093872, + "learning_rate": 3.0552095040589134e-06, + "loss": 0.3994, + "step": 12317 + }, + { + "epoch": 1.4989960450258595, + "grad_norm": 3.5872092247009277, + "learning_rate": 3.0538060866682596e-06, + "loss": 0.3033, + "step": 12318 + }, + { + "epoch": 1.4991177365378765, + "grad_norm": 1.6339712142944336, + "learning_rate": 3.052402933594664e-06, + "loss": 0.3853, + "step": 12319 + }, + { + "epoch": 1.4992394280498935, + "grad_norm": 1.9802073240280151, + "learning_rate": 3.0510000448915177e-06, + "loss": 0.4178, + "step": 12320 + }, + { + "epoch": 1.4993611195619105, + "grad_norm": 3.277489185333252, + "learning_rate": 3.0495974206122015e-06, + "loss": 0.3994, + "step": 12321 + }, + { + "epoch": 1.4994828110739276, + "grad_norm": 1.8114643096923828, + "learning_rate": 3.048195060810092e-06, + "loss": 0.4076, + "step": 12322 + }, + { + "epoch": 1.4996045025859446, + "grad_norm": 1.8180392980575562, + "learning_rate": 3.04679296553855e-06, + "loss": 0.4102, + "step": 12323 + }, + { + "epoch": 1.4997261940979616, + "grad_norm": 1.731507658958435, + "learning_rate": 3.0453911348509246e-06, + "loss": 0.4115, + "step": 12324 + }, + { + "epoch": 1.4998478856099786, + "grad_norm": 1.3817925453186035, + "learning_rate": 3.043989568800565e-06, + "loss": 0.4276, + "step": 12325 + }, + { + "epoch": 1.4999695771219956, + "grad_norm": 1.6861214637756348, + "learning_rate": 3.0425882674408012e-06, + "loss": 0.3839, + "step": 12326 + }, + { + "epoch": 1.5000912686340127, + "grad_norm": 2.5139353275299072, + "learning_rate": 3.041187230824951e-06, + "loss": 0.3891, + "step": 12327 + }, + { + "epoch": 1.5002129601460297, + "grad_norm": 2.5731089115142822, + "learning_rate": 3.0397864590063377e-06, + "loss": 0.3831, + "step": 12328 + }, + { + "epoch": 1.5003346516580467, + "grad_norm": 1.5949252843856812, + "learning_rate": 3.0383859520382485e-06, + "loss": 0.3552, + "step": 12329 + }, + { + "epoch": 1.500456343170064, + "grad_norm": 1.2993236780166626, + "learning_rate": 3.0369857099739884e-06, + "loss": 0.3684, + "step": 12330 + }, + { + "epoch": 1.500578034682081, + "grad_norm": 2.266195058822632, + "learning_rate": 3.0355857328668337e-06, + "loss": 0.3409, + "step": 12331 + }, + { + "epoch": 1.500699726194098, + "grad_norm": 1.638208270072937, + "learning_rate": 3.034186020770055e-06, + "loss": 0.3756, + "step": 12332 + }, + { + "epoch": 1.500821417706115, + "grad_norm": 2.017385482788086, + "learning_rate": 3.032786573736919e-06, + "loss": 0.3811, + "step": 12333 + }, + { + "epoch": 1.500943109218132, + "grad_norm": 1.5251866579055786, + "learning_rate": 3.031387391820676e-06, + "loss": 0.3349, + "step": 12334 + }, + { + "epoch": 1.501064800730149, + "grad_norm": 1.678708553314209, + "learning_rate": 3.0299884750745635e-06, + "loss": 0.4216, + "step": 12335 + }, + { + "epoch": 1.5011864922421663, + "grad_norm": 1.482534646987915, + "learning_rate": 3.0285898235518197e-06, + "loss": 0.3626, + "step": 12336 + }, + { + "epoch": 1.5013081837541833, + "grad_norm": 2.3882389068603516, + "learning_rate": 3.027191437305663e-06, + "loss": 0.3867, + "step": 12337 + }, + { + "epoch": 1.5014298752662003, + "grad_norm": 2.533510684967041, + "learning_rate": 3.025793316389303e-06, + "loss": 0.4303, + "step": 12338 + }, + { + "epoch": 1.5015515667782173, + "grad_norm": 1.591119408607483, + "learning_rate": 3.024395460855948e-06, + "loss": 0.3339, + "step": 12339 + }, + { + "epoch": 1.5016732582902343, + "grad_norm": 1.8293664455413818, + "learning_rate": 3.022997870758779e-06, + "loss": 0.3485, + "step": 12340 + }, + { + "epoch": 1.5017949498022514, + "grad_norm": 2.356353282928467, + "learning_rate": 3.021600546150986e-06, + "loss": 0.3984, + "step": 12341 + }, + { + "epoch": 1.5019166413142684, + "grad_norm": 1.9324548244476318, + "learning_rate": 3.0202034870857354e-06, + "loss": 0.3387, + "step": 12342 + }, + { + "epoch": 1.5020383328262854, + "grad_norm": 1.3541065454483032, + "learning_rate": 3.0188066936161875e-06, + "loss": 0.3919, + "step": 12343 + }, + { + "epoch": 1.5021600243383024, + "grad_norm": 2.017469882965088, + "learning_rate": 3.017410165795498e-06, + "loss": 0.3456, + "step": 12344 + }, + { + "epoch": 1.5022817158503194, + "grad_norm": 1.600197672843933, + "learning_rate": 3.0160139036768045e-06, + "loss": 0.3802, + "step": 12345 + }, + { + "epoch": 1.5024034073623365, + "grad_norm": 1.4959287643432617, + "learning_rate": 3.0146179073132355e-06, + "loss": 0.3474, + "step": 12346 + }, + { + "epoch": 1.5025250988743535, + "grad_norm": 1.5233023166656494, + "learning_rate": 3.0132221767579164e-06, + "loss": 0.3382, + "step": 12347 + }, + { + "epoch": 1.5026467903863705, + "grad_norm": 3.6474270820617676, + "learning_rate": 3.0118267120639557e-06, + "loss": 0.4429, + "step": 12348 + }, + { + "epoch": 1.5027684818983875, + "grad_norm": 1.9193079471588135, + "learning_rate": 3.010431513284451e-06, + "loss": 0.3909, + "step": 12349 + }, + { + "epoch": 1.5028901734104045, + "grad_norm": 1.6091670989990234, + "learning_rate": 3.009036580472496e-06, + "loss": 0.3952, + "step": 12350 + }, + { + "epoch": 1.5030118649224216, + "grad_norm": 1.455883502960205, + "learning_rate": 3.0076419136811717e-06, + "loss": 0.3619, + "step": 12351 + }, + { + "epoch": 1.5031335564344386, + "grad_norm": 1.6678181886672974, + "learning_rate": 3.006247512963545e-06, + "loss": 0.3523, + "step": 12352 + }, + { + "epoch": 1.5032552479464556, + "grad_norm": 2.2384750843048096, + "learning_rate": 3.0048533783726774e-06, + "loss": 0.4137, + "step": 12353 + }, + { + "epoch": 1.5033769394584726, + "grad_norm": 1.8170641660690308, + "learning_rate": 3.0034595099616137e-06, + "loss": 0.3231, + "step": 12354 + }, + { + "epoch": 1.5034986309704899, + "grad_norm": 1.6375850439071655, + "learning_rate": 3.0020659077834014e-06, + "loss": 0.3981, + "step": 12355 + }, + { + "epoch": 1.5036203224825069, + "grad_norm": 1.9837037324905396, + "learning_rate": 3.000672571891067e-06, + "loss": 0.4079, + "step": 12356 + }, + { + "epoch": 1.503742013994524, + "grad_norm": 1.7627718448638916, + "learning_rate": 2.999279502337624e-06, + "loss": 0.3472, + "step": 12357 + }, + { + "epoch": 1.503863705506541, + "grad_norm": 1.6974141597747803, + "learning_rate": 2.9978866991760912e-06, + "loss": 0.3918, + "step": 12358 + }, + { + "epoch": 1.503985397018558, + "grad_norm": 1.7849807739257812, + "learning_rate": 2.99649416245946e-06, + "loss": 0.3682, + "step": 12359 + }, + { + "epoch": 1.504107088530575, + "grad_norm": 2.7937188148498535, + "learning_rate": 2.995101892240725e-06, + "loss": 0.37, + "step": 12360 + }, + { + "epoch": 1.5042287800425922, + "grad_norm": 2.260521650314331, + "learning_rate": 2.993709888572862e-06, + "loss": 0.4007, + "step": 12361 + }, + { + "epoch": 1.5043504715546092, + "grad_norm": 2.6758663654327393, + "learning_rate": 2.9923181515088407e-06, + "loss": 0.3588, + "step": 12362 + }, + { + "epoch": 1.5044721630666262, + "grad_norm": 1.8291544914245605, + "learning_rate": 2.990926681101619e-06, + "loss": 0.3485, + "step": 12363 + }, + { + "epoch": 1.5045938545786433, + "grad_norm": 1.969672679901123, + "learning_rate": 2.989535477404144e-06, + "loss": 0.4138, + "step": 12364 + }, + { + "epoch": 1.5047155460906603, + "grad_norm": 2.022167921066284, + "learning_rate": 2.9881445404693533e-06, + "loss": 0.3373, + "step": 12365 + }, + { + "epoch": 1.5048372376026773, + "grad_norm": 1.8693008422851562, + "learning_rate": 2.9867538703501788e-06, + "loss": 0.3704, + "step": 12366 + }, + { + "epoch": 1.5049589291146943, + "grad_norm": 2.33036208152771, + "learning_rate": 2.9853634670995323e-06, + "loss": 0.3394, + "step": 12367 + }, + { + "epoch": 1.5050806206267113, + "grad_norm": 2.0149638652801514, + "learning_rate": 2.98397333077033e-06, + "loss": 0.3277, + "step": 12368 + }, + { + "epoch": 1.5052023121387283, + "grad_norm": 2.071512460708618, + "learning_rate": 2.982583461415464e-06, + "loss": 0.2941, + "step": 12369 + }, + { + "epoch": 1.5053240036507454, + "grad_norm": 1.4121744632720947, + "learning_rate": 2.9811938590878197e-06, + "loss": 0.3543, + "step": 12370 + }, + { + "epoch": 1.5054456951627624, + "grad_norm": 3.1035492420196533, + "learning_rate": 2.9798045238402794e-06, + "loss": 0.4504, + "step": 12371 + }, + { + "epoch": 1.5055673866747794, + "grad_norm": 1.631083369255066, + "learning_rate": 2.9784154557257094e-06, + "loss": 0.3375, + "step": 12372 + }, + { + "epoch": 1.5056890781867964, + "grad_norm": 1.757651686668396, + "learning_rate": 2.977026654796962e-06, + "loss": 0.3235, + "step": 12373 + }, + { + "epoch": 1.5058107696988134, + "grad_norm": 1.765066146850586, + "learning_rate": 2.975638121106892e-06, + "loss": 0.3623, + "step": 12374 + }, + { + "epoch": 1.5059324612108305, + "grad_norm": 2.4763071537017822, + "learning_rate": 2.9742498547083254e-06, + "loss": 0.4339, + "step": 12375 + }, + { + "epoch": 1.5060541527228475, + "grad_norm": 1.6039776802062988, + "learning_rate": 2.9728618556540976e-06, + "loss": 0.3895, + "step": 12376 + }, + { + "epoch": 1.5061758442348645, + "grad_norm": 1.8766047954559326, + "learning_rate": 2.9714741239970223e-06, + "loss": 0.3891, + "step": 12377 + }, + { + "epoch": 1.5062975357468815, + "grad_norm": 3.1880428791046143, + "learning_rate": 2.9700866597899016e-06, + "loss": 0.3314, + "step": 12378 + }, + { + "epoch": 1.5064192272588985, + "grad_norm": 1.4541547298431396, + "learning_rate": 2.968699463085537e-06, + "loss": 0.3526, + "step": 12379 + }, + { + "epoch": 1.5065409187709158, + "grad_norm": 1.342864990234375, + "learning_rate": 2.9673125339367113e-06, + "loss": 0.3583, + "step": 12380 + }, + { + "epoch": 1.5066626102829328, + "grad_norm": 1.548964262008667, + "learning_rate": 2.9659258723961993e-06, + "loss": 0.358, + "step": 12381 + }, + { + "epoch": 1.5067843017949498, + "grad_norm": 2.163334846496582, + "learning_rate": 2.9645394785167693e-06, + "loss": 0.4127, + "step": 12382 + }, + { + "epoch": 1.5069059933069668, + "grad_norm": 2.184917449951172, + "learning_rate": 2.963153352351175e-06, + "loss": 0.3757, + "step": 12383 + }, + { + "epoch": 1.5070276848189839, + "grad_norm": 1.9177147150039673, + "learning_rate": 2.961767493952158e-06, + "loss": 0.3478, + "step": 12384 + }, + { + "epoch": 1.5071493763310009, + "grad_norm": 1.805364727973938, + "learning_rate": 2.960381903372462e-06, + "loss": 0.3776, + "step": 12385 + }, + { + "epoch": 1.507271067843018, + "grad_norm": 1.5803080797195435, + "learning_rate": 2.9589965806648004e-06, + "loss": 0.3481, + "step": 12386 + }, + { + "epoch": 1.5073927593550351, + "grad_norm": 1.3207484483718872, + "learning_rate": 2.957611525881896e-06, + "loss": 0.3245, + "step": 12387 + }, + { + "epoch": 1.5075144508670522, + "grad_norm": 2.636009931564331, + "learning_rate": 2.9562267390764497e-06, + "loss": 0.4136, + "step": 12388 + }, + { + "epoch": 1.5076361423790692, + "grad_norm": 1.4088382720947266, + "learning_rate": 2.9548422203011527e-06, + "loss": 0.3853, + "step": 12389 + }, + { + "epoch": 1.5077578338910862, + "grad_norm": 2.709378957748413, + "learning_rate": 2.9534579696086953e-06, + "loss": 0.3597, + "step": 12390 + }, + { + "epoch": 1.5078795254031032, + "grad_norm": 1.9916608333587646, + "learning_rate": 2.9520739870517468e-06, + "loss": 0.3141, + "step": 12391 + }, + { + "epoch": 1.5080012169151202, + "grad_norm": 3.2743358612060547, + "learning_rate": 2.950690272682969e-06, + "loss": 0.366, + "step": 12392 + }, + { + "epoch": 1.5081229084271373, + "grad_norm": 1.5566741228103638, + "learning_rate": 2.9493068265550207e-06, + "loss": 0.3648, + "step": 12393 + }, + { + "epoch": 1.5082445999391543, + "grad_norm": 2.2265734672546387, + "learning_rate": 2.947923648720542e-06, + "loss": 0.3565, + "step": 12394 + }, + { + "epoch": 1.5083662914511713, + "grad_norm": 1.8998069763183594, + "learning_rate": 2.946540739232162e-06, + "loss": 0.3338, + "step": 12395 + }, + { + "epoch": 1.5084879829631883, + "grad_norm": 1.8418159484863281, + "learning_rate": 2.94515809814251e-06, + "loss": 0.3828, + "step": 12396 + }, + { + "epoch": 1.5086096744752053, + "grad_norm": 2.126296043395996, + "learning_rate": 2.9437757255041955e-06, + "loss": 0.3855, + "step": 12397 + }, + { + "epoch": 1.5087313659872224, + "grad_norm": 4.020832061767578, + "learning_rate": 2.94239362136982e-06, + "loss": 0.4502, + "step": 12398 + }, + { + "epoch": 1.5088530574992394, + "grad_norm": 1.7214140892028809, + "learning_rate": 2.941011785791975e-06, + "loss": 0.3744, + "step": 12399 + }, + { + "epoch": 1.5089747490112564, + "grad_norm": 1.624267816543579, + "learning_rate": 2.9396302188232405e-06, + "loss": 0.3765, + "step": 12400 + }, + { + "epoch": 1.5090964405232734, + "grad_norm": 2.235553503036499, + "learning_rate": 2.9382489205161926e-06, + "loss": 0.3796, + "step": 12401 + }, + { + "epoch": 1.5092181320352904, + "grad_norm": 1.3899976015090942, + "learning_rate": 2.9368678909233917e-06, + "loss": 0.3368, + "step": 12402 + }, + { + "epoch": 1.5093398235473074, + "grad_norm": 1.6225268840789795, + "learning_rate": 2.935487130097383e-06, + "loss": 0.3764, + "step": 12403 + }, + { + "epoch": 1.5094615150593245, + "grad_norm": 1.8236868381500244, + "learning_rate": 2.9341066380907157e-06, + "loss": 0.3919, + "step": 12404 + }, + { + "epoch": 1.5095832065713415, + "grad_norm": 1.3602375984191895, + "learning_rate": 2.9327264149559154e-06, + "loss": 0.3399, + "step": 12405 + }, + { + "epoch": 1.5097048980833587, + "grad_norm": 1.8719074726104736, + "learning_rate": 2.931346460745501e-06, + "loss": 0.3856, + "step": 12406 + }, + { + "epoch": 1.5098265895953757, + "grad_norm": 2.090416431427002, + "learning_rate": 2.929966775511989e-06, + "loss": 0.3272, + "step": 12407 + }, + { + "epoch": 1.5099482811073928, + "grad_norm": 1.2473992109298706, + "learning_rate": 2.928587359307874e-06, + "loss": 0.3621, + "step": 12408 + }, + { + "epoch": 1.5100699726194098, + "grad_norm": 2.0594165325164795, + "learning_rate": 2.927208212185647e-06, + "loss": 0.3726, + "step": 12409 + }, + { + "epoch": 1.5101916641314268, + "grad_norm": 1.631903052330017, + "learning_rate": 2.9258293341977884e-06, + "loss": 0.3792, + "step": 12410 + }, + { + "epoch": 1.5103133556434438, + "grad_norm": 1.4740819931030273, + "learning_rate": 2.9244507253967625e-06, + "loss": 0.362, + "step": 12411 + }, + { + "epoch": 1.510435047155461, + "grad_norm": 2.6567625999450684, + "learning_rate": 2.9230723858350353e-06, + "loss": 0.4339, + "step": 12412 + }, + { + "epoch": 1.510556738667478, + "grad_norm": 1.7639751434326172, + "learning_rate": 2.9216943155650524e-06, + "loss": 0.3229, + "step": 12413 + }, + { + "epoch": 1.510678430179495, + "grad_norm": 1.784854531288147, + "learning_rate": 2.9203165146392496e-06, + "loss": 0.3856, + "step": 12414 + }, + { + "epoch": 1.5108001216915121, + "grad_norm": 1.7744238376617432, + "learning_rate": 2.9189389831100603e-06, + "loss": 0.3556, + "step": 12415 + }, + { + "epoch": 1.5109218132035291, + "grad_norm": 1.7978202104568481, + "learning_rate": 2.917561721029899e-06, + "loss": 0.3962, + "step": 12416 + }, + { + "epoch": 1.5110435047155462, + "grad_norm": 1.4225305318832397, + "learning_rate": 2.9161847284511715e-06, + "loss": 0.357, + "step": 12417 + }, + { + "epoch": 1.5111651962275632, + "grad_norm": 1.4157168865203857, + "learning_rate": 2.9148080054262805e-06, + "loss": 0.3326, + "step": 12418 + }, + { + "epoch": 1.5112868877395802, + "grad_norm": 1.620936632156372, + "learning_rate": 2.9134315520076072e-06, + "loss": 0.3285, + "step": 12419 + }, + { + "epoch": 1.5114085792515972, + "grad_norm": 2.4878740310668945, + "learning_rate": 2.9120553682475394e-06, + "loss": 0.4731, + "step": 12420 + }, + { + "epoch": 1.5115302707636142, + "grad_norm": 1.468770146369934, + "learning_rate": 2.9106794541984316e-06, + "loss": 0.3874, + "step": 12421 + }, + { + "epoch": 1.5116519622756313, + "grad_norm": 1.8907181024551392, + "learning_rate": 2.9093038099126416e-06, + "loss": 0.3466, + "step": 12422 + }, + { + "epoch": 1.5117736537876483, + "grad_norm": 2.9018452167510986, + "learning_rate": 2.907928435442522e-06, + "loss": 0.3131, + "step": 12423 + }, + { + "epoch": 1.5118953452996653, + "grad_norm": 1.9180059432983398, + "learning_rate": 2.9065533308404046e-06, + "loss": 0.3924, + "step": 12424 + }, + { + "epoch": 1.5120170368116823, + "grad_norm": 3.1403298377990723, + "learning_rate": 2.9051784961586138e-06, + "loss": 0.367, + "step": 12425 + }, + { + "epoch": 1.5121387283236993, + "grad_norm": 1.482758641242981, + "learning_rate": 2.903803931449469e-06, + "loss": 0.3428, + "step": 12426 + }, + { + "epoch": 1.5122604198357164, + "grad_norm": 1.934027075767517, + "learning_rate": 2.902429636765269e-06, + "loss": 0.3327, + "step": 12427 + }, + { + "epoch": 1.5123821113477334, + "grad_norm": 1.9481582641601562, + "learning_rate": 2.9010556121583176e-06, + "loss": 0.3512, + "step": 12428 + }, + { + "epoch": 1.5125038028597504, + "grad_norm": 1.6440929174423218, + "learning_rate": 2.8996818576808926e-06, + "loss": 0.3366, + "step": 12429 + }, + { + "epoch": 1.5126254943717674, + "grad_norm": 1.5502510070800781, + "learning_rate": 2.8983083733852666e-06, + "loss": 0.3289, + "step": 12430 + }, + { + "epoch": 1.5127471858837847, + "grad_norm": 1.5090821981430054, + "learning_rate": 2.896935159323714e-06, + "loss": 0.3845, + "step": 12431 + }, + { + "epoch": 1.5128688773958017, + "grad_norm": 2.690147638320923, + "learning_rate": 2.8955622155484777e-06, + "loss": 0.3743, + "step": 12432 + }, + { + "epoch": 1.5129905689078187, + "grad_norm": 1.9491065740585327, + "learning_rate": 2.8941895421118004e-06, + "loss": 0.3994, + "step": 12433 + }, + { + "epoch": 1.5131122604198357, + "grad_norm": 1.8123835325241089, + "learning_rate": 2.8928171390659234e-06, + "loss": 0.3529, + "step": 12434 + }, + { + "epoch": 1.5132339519318527, + "grad_norm": 2.0912837982177734, + "learning_rate": 2.891445006463062e-06, + "loss": 0.3105, + "step": 12435 + }, + { + "epoch": 1.5133556434438697, + "grad_norm": 1.463146448135376, + "learning_rate": 2.8900731443554354e-06, + "loss": 0.3564, + "step": 12436 + }, + { + "epoch": 1.513477334955887, + "grad_norm": 1.616438865661621, + "learning_rate": 2.8887015527952412e-06, + "loss": 0.3844, + "step": 12437 + }, + { + "epoch": 1.513599026467904, + "grad_norm": 1.9377530813217163, + "learning_rate": 2.887330231834671e-06, + "loss": 0.4028, + "step": 12438 + }, + { + "epoch": 1.513720717979921, + "grad_norm": 2.1920993328094482, + "learning_rate": 2.8859591815259092e-06, + "loss": 0.3064, + "step": 12439 + }, + { + "epoch": 1.513842409491938, + "grad_norm": 1.726955771446228, + "learning_rate": 2.8845884019211268e-06, + "loss": 0.3637, + "step": 12440 + }, + { + "epoch": 1.513964101003955, + "grad_norm": 2.236267328262329, + "learning_rate": 2.883217893072481e-06, + "loss": 0.3733, + "step": 12441 + }, + { + "epoch": 1.514085792515972, + "grad_norm": 3.0263047218322754, + "learning_rate": 2.8818476550321282e-06, + "loss": 0.3966, + "step": 12442 + }, + { + "epoch": 1.514207484027989, + "grad_norm": 2.3909547328948975, + "learning_rate": 2.8804776878522056e-06, + "loss": 0.3394, + "step": 12443 + }, + { + "epoch": 1.5143291755400061, + "grad_norm": 1.9603559970855713, + "learning_rate": 2.879107991584843e-06, + "loss": 0.3672, + "step": 12444 + }, + { + "epoch": 1.5144508670520231, + "grad_norm": 2.0302340984344482, + "learning_rate": 2.8777385662821623e-06, + "loss": 0.3737, + "step": 12445 + }, + { + "epoch": 1.5145725585640402, + "grad_norm": 2.8463363647460938, + "learning_rate": 2.8763694119962672e-06, + "loss": 0.3996, + "step": 12446 + }, + { + "epoch": 1.5146942500760572, + "grad_norm": 1.9522300958633423, + "learning_rate": 2.875000528779265e-06, + "loss": 0.2895, + "step": 12447 + }, + { + "epoch": 1.5148159415880742, + "grad_norm": 1.9654104709625244, + "learning_rate": 2.8736319166832393e-06, + "loss": 0.4268, + "step": 12448 + }, + { + "epoch": 1.5149376331000912, + "grad_norm": 1.757759928703308, + "learning_rate": 2.872263575760268e-06, + "loss": 0.4314, + "step": 12449 + }, + { + "epoch": 1.5150593246121082, + "grad_norm": 3.167323350906372, + "learning_rate": 2.8708955060624245e-06, + "loss": 0.3707, + "step": 12450 + }, + { + "epoch": 1.5151810161241253, + "grad_norm": 2.120405912399292, + "learning_rate": 2.8695277076417626e-06, + "loss": 0.3659, + "step": 12451 + }, + { + "epoch": 1.5153027076361423, + "grad_norm": 3.1846115589141846, + "learning_rate": 2.8681601805503278e-06, + "loss": 0.312, + "step": 12452 + }, + { + "epoch": 1.5154243991481593, + "grad_norm": 1.6613367795944214, + "learning_rate": 2.8667929248401626e-06, + "loss": 0.3389, + "step": 12453 + }, + { + "epoch": 1.5155460906601763, + "grad_norm": 2.0838582515716553, + "learning_rate": 2.865425940563293e-06, + "loss": 0.3417, + "step": 12454 + }, + { + "epoch": 1.5156677821721933, + "grad_norm": 1.655215859413147, + "learning_rate": 2.8640592277717337e-06, + "loss": 0.3561, + "step": 12455 + }, + { + "epoch": 1.5157894736842106, + "grad_norm": 2.4793524742126465, + "learning_rate": 2.8626927865174914e-06, + "loss": 0.2821, + "step": 12456 + }, + { + "epoch": 1.5159111651962276, + "grad_norm": 1.7385034561157227, + "learning_rate": 2.8613266168525577e-06, + "loss": 0.3545, + "step": 12457 + }, + { + "epoch": 1.5160328567082446, + "grad_norm": 1.9146374464035034, + "learning_rate": 2.859960718828927e-06, + "loss": 0.3674, + "step": 12458 + }, + { + "epoch": 1.5161545482202616, + "grad_norm": 3.5179266929626465, + "learning_rate": 2.8585950924985695e-06, + "loss": 0.4101, + "step": 12459 + }, + { + "epoch": 1.5162762397322787, + "grad_norm": 3.0011661052703857, + "learning_rate": 2.8572297379134472e-06, + "loss": 0.4241, + "step": 12460 + }, + { + "epoch": 1.5163979312442957, + "grad_norm": 3.7639846801757812, + "learning_rate": 2.855864655125522e-06, + "loss": 0.4429, + "step": 12461 + }, + { + "epoch": 1.516519622756313, + "grad_norm": 2.267141103744507, + "learning_rate": 2.8544998441867333e-06, + "loss": 0.2562, + "step": 12462 + }, + { + "epoch": 1.51664131426833, + "grad_norm": 1.5660878419876099, + "learning_rate": 2.853135305149014e-06, + "loss": 0.3535, + "step": 12463 + }, + { + "epoch": 1.516763005780347, + "grad_norm": 1.9517383575439453, + "learning_rate": 2.8517710380642916e-06, + "loss": 0.3922, + "step": 12464 + }, + { + "epoch": 1.516884697292364, + "grad_norm": 1.3870606422424316, + "learning_rate": 2.850407042984478e-06, + "loss": 0.3472, + "step": 12465 + }, + { + "epoch": 1.517006388804381, + "grad_norm": 2.4180305004119873, + "learning_rate": 2.849043319961474e-06, + "loss": 0.3357, + "step": 12466 + }, + { + "epoch": 1.517128080316398, + "grad_norm": 2.154930591583252, + "learning_rate": 2.8476798690471742e-06, + "loss": 0.3861, + "step": 12467 + }, + { + "epoch": 1.517249771828415, + "grad_norm": 1.5330091714859009, + "learning_rate": 2.846316690293457e-06, + "loss": 0.3422, + "step": 12468 + }, + { + "epoch": 1.517371463340432, + "grad_norm": 1.9610263109207153, + "learning_rate": 2.8449537837522e-06, + "loss": 0.3979, + "step": 12469 + }, + { + "epoch": 1.517493154852449, + "grad_norm": 1.2669917345046997, + "learning_rate": 2.843591149475261e-06, + "loss": 0.3288, + "step": 12470 + }, + { + "epoch": 1.517614846364466, + "grad_norm": 1.6009371280670166, + "learning_rate": 2.8422287875144895e-06, + "loss": 0.4017, + "step": 12471 + }, + { + "epoch": 1.517736537876483, + "grad_norm": 2.4372897148132324, + "learning_rate": 2.8408666979217315e-06, + "loss": 0.4656, + "step": 12472 + }, + { + "epoch": 1.5178582293885001, + "grad_norm": 2.3142457008361816, + "learning_rate": 2.8395048807488136e-06, + "loss": 0.4048, + "step": 12473 + }, + { + "epoch": 1.5179799209005171, + "grad_norm": 1.6589879989624023, + "learning_rate": 2.838143336047554e-06, + "loss": 0.3546, + "step": 12474 + }, + { + "epoch": 1.5181016124125342, + "grad_norm": 2.1142783164978027, + "learning_rate": 2.8367820638697672e-06, + "loss": 0.2929, + "step": 12475 + }, + { + "epoch": 1.5182233039245512, + "grad_norm": 2.1573996543884277, + "learning_rate": 2.8354210642672497e-06, + "loss": 0.3343, + "step": 12476 + }, + { + "epoch": 1.5183449954365682, + "grad_norm": 2.197007179260254, + "learning_rate": 2.8340603372917907e-06, + "loss": 0.3919, + "step": 12477 + }, + { + "epoch": 1.5184666869485852, + "grad_norm": 1.6790157556533813, + "learning_rate": 2.832699882995169e-06, + "loss": 0.3679, + "step": 12478 + }, + { + "epoch": 1.5185883784606022, + "grad_norm": 2.010957956314087, + "learning_rate": 2.8313397014291486e-06, + "loss": 0.4218, + "step": 12479 + }, + { + "epoch": 1.5187100699726193, + "grad_norm": 2.464397668838501, + "learning_rate": 2.8299797926454954e-06, + "loss": 0.449, + "step": 12480 + }, + { + "epoch": 1.5188317614846365, + "grad_norm": 1.7504805326461792, + "learning_rate": 2.8286201566959504e-06, + "loss": 0.3144, + "step": 12481 + }, + { + "epoch": 1.5189534529966535, + "grad_norm": 1.7859468460083008, + "learning_rate": 2.8272607936322504e-06, + "loss": 0.3881, + "step": 12482 + }, + { + "epoch": 1.5190751445086705, + "grad_norm": 1.5858334302902222, + "learning_rate": 2.8259017035061266e-06, + "loss": 0.3449, + "step": 12483 + }, + { + "epoch": 1.5191968360206876, + "grad_norm": 3.0674517154693604, + "learning_rate": 2.8245428863692925e-06, + "loss": 0.3004, + "step": 12484 + }, + { + "epoch": 1.5193185275327046, + "grad_norm": 2.073861598968506, + "learning_rate": 2.8231843422734507e-06, + "loss": 0.4116, + "step": 12485 + }, + { + "epoch": 1.5194402190447216, + "grad_norm": 1.8102351427078247, + "learning_rate": 2.821826071270304e-06, + "loss": 0.3684, + "step": 12486 + }, + { + "epoch": 1.5195619105567386, + "grad_norm": 2.0589699745178223, + "learning_rate": 2.8204680734115285e-06, + "loss": 0.3298, + "step": 12487 + }, + { + "epoch": 1.5196836020687559, + "grad_norm": 1.4765851497650146, + "learning_rate": 2.819110348748808e-06, + "loss": 0.3685, + "step": 12488 + }, + { + "epoch": 1.5198052935807729, + "grad_norm": 1.4881231784820557, + "learning_rate": 2.817752897333805e-06, + "loss": 0.3507, + "step": 12489 + }, + { + "epoch": 1.51992698509279, + "grad_norm": 2.6291379928588867, + "learning_rate": 2.8163957192181644e-06, + "loss": 0.3788, + "step": 12490 + }, + { + "epoch": 1.520048676604807, + "grad_norm": 1.3584263324737549, + "learning_rate": 2.81503881445354e-06, + "loss": 0.4073, + "step": 12491 + }, + { + "epoch": 1.520170368116824, + "grad_norm": 1.951324224472046, + "learning_rate": 2.8136821830915593e-06, + "loss": 0.4247, + "step": 12492 + }, + { + "epoch": 1.520292059628841, + "grad_norm": 1.4104790687561035, + "learning_rate": 2.812325825183845e-06, + "loss": 0.3687, + "step": 12493 + }, + { + "epoch": 1.520413751140858, + "grad_norm": 2.132877826690674, + "learning_rate": 2.8109697407820124e-06, + "loss": 0.3875, + "step": 12494 + }, + { + "epoch": 1.520535442652875, + "grad_norm": 2.3127293586730957, + "learning_rate": 2.809613929937659e-06, + "loss": 0.3874, + "step": 12495 + }, + { + "epoch": 1.520657134164892, + "grad_norm": 1.6824495792388916, + "learning_rate": 2.8082583927023823e-06, + "loss": 0.395, + "step": 12496 + }, + { + "epoch": 1.520778825676909, + "grad_norm": 1.815323829650879, + "learning_rate": 2.8069031291277592e-06, + "loss": 0.3297, + "step": 12497 + }, + { + "epoch": 1.520900517188926, + "grad_norm": 1.9569213390350342, + "learning_rate": 2.8055481392653584e-06, + "loss": 0.3321, + "step": 12498 + }, + { + "epoch": 1.521022208700943, + "grad_norm": 1.4917882680892944, + "learning_rate": 2.8041934231667445e-06, + "loss": 0.3932, + "step": 12499 + }, + { + "epoch": 1.52114390021296, + "grad_norm": 1.4959702491760254, + "learning_rate": 2.802838980883467e-06, + "loss": 0.3731, + "step": 12500 + }, + { + "epoch": 1.521265591724977, + "grad_norm": 2.0857603549957275, + "learning_rate": 2.8014848124670626e-06, + "loss": 0.4038, + "step": 12501 + }, + { + "epoch": 1.5213872832369941, + "grad_norm": 2.008976697921753, + "learning_rate": 2.8001309179690607e-06, + "loss": 0.4068, + "step": 12502 + }, + { + "epoch": 1.5215089747490111, + "grad_norm": 2.03781795501709, + "learning_rate": 2.798777297440979e-06, + "loss": 0.3361, + "step": 12503 + }, + { + "epoch": 1.5216306662610282, + "grad_norm": 1.6787981986999512, + "learning_rate": 2.797423950934328e-06, + "loss": 0.3921, + "step": 12504 + }, + { + "epoch": 1.5217523577730452, + "grad_norm": 3.9700543880462646, + "learning_rate": 2.796070878500605e-06, + "loss": 0.3056, + "step": 12505 + }, + { + "epoch": 1.5218740492850622, + "grad_norm": 1.6673107147216797, + "learning_rate": 2.794718080191293e-06, + "loss": 0.3423, + "step": 12506 + }, + { + "epoch": 1.5219957407970794, + "grad_norm": 1.9476972818374634, + "learning_rate": 2.7933655560578767e-06, + "loss": 0.2832, + "step": 12507 + }, + { + "epoch": 1.5221174323090965, + "grad_norm": 1.8355255126953125, + "learning_rate": 2.792013306151816e-06, + "loss": 0.3628, + "step": 12508 + }, + { + "epoch": 1.5222391238211135, + "grad_norm": 1.585462212562561, + "learning_rate": 2.7906613305245666e-06, + "loss": 0.3607, + "step": 12509 + }, + { + "epoch": 1.5223608153331305, + "grad_norm": 1.41182541847229, + "learning_rate": 2.789309629227579e-06, + "loss": 0.346, + "step": 12510 + }, + { + "epoch": 1.5224825068451475, + "grad_norm": 2.200190544128418, + "learning_rate": 2.7879582023122854e-06, + "loss": 0.4043, + "step": 12511 + }, + { + "epoch": 1.5226041983571645, + "grad_norm": 1.837247371673584, + "learning_rate": 2.7866070498301103e-06, + "loss": 0.392, + "step": 12512 + }, + { + "epoch": 1.5227258898691818, + "grad_norm": 1.808407187461853, + "learning_rate": 2.785256171832468e-06, + "loss": 0.4363, + "step": 12513 + }, + { + "epoch": 1.5228475813811988, + "grad_norm": 2.131739377975464, + "learning_rate": 2.783905568370758e-06, + "loss": 0.3802, + "step": 12514 + }, + { + "epoch": 1.5229692728932158, + "grad_norm": 1.746535062789917, + "learning_rate": 2.782555239496382e-06, + "loss": 0.3729, + "step": 12515 + }, + { + "epoch": 1.5230909644052328, + "grad_norm": 1.4207675457000732, + "learning_rate": 2.781205185260717e-06, + "loss": 0.3262, + "step": 12516 + }, + { + "epoch": 1.5232126559172499, + "grad_norm": 2.992722749710083, + "learning_rate": 2.779855405715135e-06, + "loss": 0.4238, + "step": 12517 + }, + { + "epoch": 1.5233343474292669, + "grad_norm": 3.160243272781372, + "learning_rate": 2.7785059009110025e-06, + "loss": 0.4457, + "step": 12518 + }, + { + "epoch": 1.523456038941284, + "grad_norm": 1.7625850439071655, + "learning_rate": 2.7771566708996676e-06, + "loss": 0.3357, + "step": 12519 + }, + { + "epoch": 1.523577730453301, + "grad_norm": 2.4827253818511963, + "learning_rate": 2.775807715732469e-06, + "loss": 0.4042, + "step": 12520 + }, + { + "epoch": 1.523699421965318, + "grad_norm": 2.242046594619751, + "learning_rate": 2.7744590354607436e-06, + "loss": 0.392, + "step": 12521 + }, + { + "epoch": 1.523821113477335, + "grad_norm": 1.9286634922027588, + "learning_rate": 2.7731106301358077e-06, + "loss": 0.3805, + "step": 12522 + }, + { + "epoch": 1.523942804989352, + "grad_norm": 3.6629810333251953, + "learning_rate": 2.771762499808971e-06, + "loss": 0.4606, + "step": 12523 + }, + { + "epoch": 1.524064496501369, + "grad_norm": 2.2985353469848633, + "learning_rate": 2.7704146445315326e-06, + "loss": 0.3612, + "step": 12524 + }, + { + "epoch": 1.524186188013386, + "grad_norm": 1.843945026397705, + "learning_rate": 2.769067064354779e-06, + "loss": 0.3587, + "step": 12525 + }, + { + "epoch": 1.524307879525403, + "grad_norm": 1.5582345724105835, + "learning_rate": 2.767719759329993e-06, + "loss": 0.3748, + "step": 12526 + }, + { + "epoch": 1.52442957103742, + "grad_norm": 1.6034371852874756, + "learning_rate": 2.766372729508441e-06, + "loss": 0.3417, + "step": 12527 + }, + { + "epoch": 1.524551262549437, + "grad_norm": 1.9791405200958252, + "learning_rate": 2.7650259749413765e-06, + "loss": 0.4141, + "step": 12528 + }, + { + "epoch": 1.524672954061454, + "grad_norm": 1.4464818239212036, + "learning_rate": 2.7636794956800516e-06, + "loss": 0.322, + "step": 12529 + }, + { + "epoch": 1.524794645573471, + "grad_norm": 1.4464612007141113, + "learning_rate": 2.7623332917757005e-06, + "loss": 0.3524, + "step": 12530 + }, + { + "epoch": 1.5249163370854881, + "grad_norm": 3.289478302001953, + "learning_rate": 2.7609873632795458e-06, + "loss": 0.3381, + "step": 12531 + }, + { + "epoch": 1.5250380285975054, + "grad_norm": 1.9363148212432861, + "learning_rate": 2.7596417102428085e-06, + "loss": 0.377, + "step": 12532 + }, + { + "epoch": 1.5251597201095224, + "grad_norm": 1.9632848501205444, + "learning_rate": 2.7582963327166913e-06, + "loss": 0.3776, + "step": 12533 + }, + { + "epoch": 1.5252814116215394, + "grad_norm": 2.836686134338379, + "learning_rate": 2.756951230752385e-06, + "loss": 0.3297, + "step": 12534 + }, + { + "epoch": 1.5254031031335564, + "grad_norm": 1.3542596101760864, + "learning_rate": 2.7556064044010822e-06, + "loss": 0.3276, + "step": 12535 + }, + { + "epoch": 1.5255247946455734, + "grad_norm": 1.9509390592575073, + "learning_rate": 2.7542618537139455e-06, + "loss": 0.3747, + "step": 12536 + }, + { + "epoch": 1.5256464861575905, + "grad_norm": 3.561317205429077, + "learning_rate": 2.7529175787421457e-06, + "loss": 0.4237, + "step": 12537 + }, + { + "epoch": 1.5257681776696077, + "grad_norm": 1.8905047178268433, + "learning_rate": 2.751573579536834e-06, + "loss": 0.3218, + "step": 12538 + }, + { + "epoch": 1.5258898691816247, + "grad_norm": 2.1115753650665283, + "learning_rate": 2.750229856149146e-06, + "loss": 0.3886, + "step": 12539 + }, + { + "epoch": 1.5260115606936417, + "grad_norm": 1.7737010717391968, + "learning_rate": 2.7488864086302225e-06, + "loss": 0.3517, + "step": 12540 + }, + { + "epoch": 1.5261332522056588, + "grad_norm": 2.041555643081665, + "learning_rate": 2.7475432370311793e-06, + "loss": 0.3652, + "step": 12541 + }, + { + "epoch": 1.5262549437176758, + "grad_norm": 3.035809278488159, + "learning_rate": 2.7462003414031245e-06, + "loss": 0.389, + "step": 12542 + }, + { + "epoch": 1.5263766352296928, + "grad_norm": 1.6282237768173218, + "learning_rate": 2.744857721797165e-06, + "loss": 0.4061, + "step": 12543 + }, + { + "epoch": 1.5264983267417098, + "grad_norm": 3.464914321899414, + "learning_rate": 2.7435153782643863e-06, + "loss": 0.4089, + "step": 12544 + }, + { + "epoch": 1.5266200182537268, + "grad_norm": 3.303316354751587, + "learning_rate": 2.7421733108558647e-06, + "loss": 0.3361, + "step": 12545 + }, + { + "epoch": 1.5267417097657439, + "grad_norm": 2.564096689224243, + "learning_rate": 2.7408315196226774e-06, + "loss": 0.4571, + "step": 12546 + }, + { + "epoch": 1.5268634012777609, + "grad_norm": 2.6128838062286377, + "learning_rate": 2.73949000461587e-06, + "loss": 0.3588, + "step": 12547 + }, + { + "epoch": 1.526985092789778, + "grad_norm": 1.7256296873092651, + "learning_rate": 2.7381487658865003e-06, + "loss": 0.3735, + "step": 12548 + }, + { + "epoch": 1.527106784301795, + "grad_norm": 2.496760368347168, + "learning_rate": 2.7368078034856004e-06, + "loss": 0.3611, + "step": 12549 + }, + { + "epoch": 1.527228475813812, + "grad_norm": 3.9976139068603516, + "learning_rate": 2.7354671174641943e-06, + "loss": 0.4633, + "step": 12550 + }, + { + "epoch": 1.527350167325829, + "grad_norm": 2.3817555904388428, + "learning_rate": 2.734126707873306e-06, + "loss": 0.3473, + "step": 12551 + }, + { + "epoch": 1.527471858837846, + "grad_norm": 2.1352765560150146, + "learning_rate": 2.7327865747639315e-06, + "loss": 0.4786, + "step": 12552 + }, + { + "epoch": 1.527593550349863, + "grad_norm": 1.520741581916809, + "learning_rate": 2.731446718187073e-06, + "loss": 0.3758, + "step": 12553 + }, + { + "epoch": 1.52771524186188, + "grad_norm": 2.741384983062744, + "learning_rate": 2.7301071381937115e-06, + "loss": 0.3677, + "step": 12554 + }, + { + "epoch": 1.527836933373897, + "grad_norm": 2.115781307220459, + "learning_rate": 2.72876783483482e-06, + "loss": 0.3661, + "step": 12555 + }, + { + "epoch": 1.527958624885914, + "grad_norm": 2.0032315254211426, + "learning_rate": 2.7274288081613643e-06, + "loss": 0.3689, + "step": 12556 + }, + { + "epoch": 1.5280803163979313, + "grad_norm": 1.7844867706298828, + "learning_rate": 2.7260900582242966e-06, + "loss": 0.4048, + "step": 12557 + }, + { + "epoch": 1.5282020079099483, + "grad_norm": 2.108513593673706, + "learning_rate": 2.7247515850745586e-06, + "loss": 0.3594, + "step": 12558 + }, + { + "epoch": 1.5283236994219653, + "grad_norm": 1.8052276372909546, + "learning_rate": 2.723413388763082e-06, + "loss": 0.3934, + "step": 12559 + }, + { + "epoch": 1.5284453909339824, + "grad_norm": 3.2891016006469727, + "learning_rate": 2.7220754693407834e-06, + "loss": 0.3259, + "step": 12560 + }, + { + "epoch": 1.5285670824459994, + "grad_norm": 2.9912776947021484, + "learning_rate": 2.7207378268585817e-06, + "loss": 0.4031, + "step": 12561 + }, + { + "epoch": 1.5286887739580164, + "grad_norm": 1.5246210098266602, + "learning_rate": 2.719400461367373e-06, + "loss": 0.3852, + "step": 12562 + }, + { + "epoch": 1.5288104654700336, + "grad_norm": 1.4531372785568237, + "learning_rate": 2.7180633729180427e-06, + "loss": 0.403, + "step": 12563 + }, + { + "epoch": 1.5289321569820506, + "grad_norm": 1.4160289764404297, + "learning_rate": 2.716726561561478e-06, + "loss": 0.3216, + "step": 12564 + }, + { + "epoch": 1.5290538484940677, + "grad_norm": 2.1517996788024902, + "learning_rate": 2.7153900273485424e-06, + "loss": 0.3597, + "step": 12565 + }, + { + "epoch": 1.5291755400060847, + "grad_norm": 1.7108573913574219, + "learning_rate": 2.714053770330092e-06, + "loss": 0.3644, + "step": 12566 + }, + { + "epoch": 1.5292972315181017, + "grad_norm": 2.7664637565612793, + "learning_rate": 2.7127177905569803e-06, + "loss": 0.3552, + "step": 12567 + }, + { + "epoch": 1.5294189230301187, + "grad_norm": 2.8855767250061035, + "learning_rate": 2.711382088080039e-06, + "loss": 0.4201, + "step": 12568 + }, + { + "epoch": 1.5295406145421357, + "grad_norm": 1.5161004066467285, + "learning_rate": 2.7100466629500944e-06, + "loss": 0.3911, + "step": 12569 + }, + { + "epoch": 1.5296623060541528, + "grad_norm": 1.2162444591522217, + "learning_rate": 2.7087115152179686e-06, + "loss": 0.3736, + "step": 12570 + }, + { + "epoch": 1.5297839975661698, + "grad_norm": 2.8383493423461914, + "learning_rate": 2.707376644934456e-06, + "loss": 0.3689, + "step": 12571 + }, + { + "epoch": 1.5299056890781868, + "grad_norm": 3.169105291366577, + "learning_rate": 2.7060420521503607e-06, + "loss": 0.3228, + "step": 12572 + }, + { + "epoch": 1.5300273805902038, + "grad_norm": 2.56186842918396, + "learning_rate": 2.704707736916462e-06, + "loss": 0.3461, + "step": 12573 + }, + { + "epoch": 1.5301490721022208, + "grad_norm": 2.33776593208313, + "learning_rate": 2.7033736992835314e-06, + "loss": 0.3976, + "step": 12574 + }, + { + "epoch": 1.5302707636142379, + "grad_norm": 1.4289809465408325, + "learning_rate": 2.7020399393023377e-06, + "loss": 0.396, + "step": 12575 + }, + { + "epoch": 1.5303924551262549, + "grad_norm": 2.5968401432037354, + "learning_rate": 2.700706457023631e-06, + "loss": 0.3633, + "step": 12576 + }, + { + "epoch": 1.530514146638272, + "grad_norm": 1.9646767377853394, + "learning_rate": 2.6993732524981484e-06, + "loss": 0.3955, + "step": 12577 + }, + { + "epoch": 1.530635838150289, + "grad_norm": 1.4681382179260254, + "learning_rate": 2.6980403257766287e-06, + "loss": 0.3423, + "step": 12578 + }, + { + "epoch": 1.530757529662306, + "grad_norm": 1.5288289785385132, + "learning_rate": 2.696707676909789e-06, + "loss": 0.379, + "step": 12579 + }, + { + "epoch": 1.530879221174323, + "grad_norm": 1.6512926816940308, + "learning_rate": 2.6953753059483357e-06, + "loss": 0.4064, + "step": 12580 + }, + { + "epoch": 1.53100091268634, + "grad_norm": 1.8012267351150513, + "learning_rate": 2.6940432129429785e-06, + "loss": 0.3575, + "step": 12581 + }, + { + "epoch": 1.5311226041983572, + "grad_norm": 1.7500895261764526, + "learning_rate": 2.692711397944392e-06, + "loss": 0.3951, + "step": 12582 + }, + { + "epoch": 1.5312442957103742, + "grad_norm": 1.5793031454086304, + "learning_rate": 2.691379861003266e-06, + "loss": 0.35, + "step": 12583 + }, + { + "epoch": 1.5313659872223913, + "grad_norm": 1.933671236038208, + "learning_rate": 2.690048602170264e-06, + "loss": 0.3761, + "step": 12584 + }, + { + "epoch": 1.5314876787344083, + "grad_norm": 2.2148663997650146, + "learning_rate": 2.6887176214960408e-06, + "loss": 0.4297, + "step": 12585 + }, + { + "epoch": 1.5316093702464253, + "grad_norm": 1.6534515619277954, + "learning_rate": 2.6873869190312483e-06, + "loss": 0.3652, + "step": 12586 + }, + { + "epoch": 1.5317310617584423, + "grad_norm": 1.443554401397705, + "learning_rate": 2.686056494826521e-06, + "loss": 0.3685, + "step": 12587 + }, + { + "epoch": 1.5318527532704593, + "grad_norm": 3.234772205352783, + "learning_rate": 2.6847263489324793e-06, + "loss": 0.3262, + "step": 12588 + }, + { + "epoch": 1.5319744447824766, + "grad_norm": 1.585789680480957, + "learning_rate": 2.683396481399744e-06, + "loss": 0.4032, + "step": 12589 + }, + { + "epoch": 1.5320961362944936, + "grad_norm": 3.123924732208252, + "learning_rate": 2.6820668922789185e-06, + "loss": 0.3612, + "step": 12590 + }, + { + "epoch": 1.5322178278065106, + "grad_norm": 1.779894232749939, + "learning_rate": 2.6807375816205904e-06, + "loss": 0.3907, + "step": 12591 + }, + { + "epoch": 1.5323395193185276, + "grad_norm": 3.158414363861084, + "learning_rate": 2.679408549475352e-06, + "loss": 0.3387, + "step": 12592 + }, + { + "epoch": 1.5324612108305447, + "grad_norm": 2.2403247356414795, + "learning_rate": 2.67807979589377e-06, + "loss": 0.4036, + "step": 12593 + }, + { + "epoch": 1.5325829023425617, + "grad_norm": 2.8860385417938232, + "learning_rate": 2.6767513209264084e-06, + "loss": 0.3161, + "step": 12594 + }, + { + "epoch": 1.5327045938545787, + "grad_norm": 2.474834680557251, + "learning_rate": 2.675423124623816e-06, + "loss": 0.346, + "step": 12595 + }, + { + "epoch": 1.5328262853665957, + "grad_norm": 1.8335844278335571, + "learning_rate": 2.674095207036532e-06, + "loss": 0.3623, + "step": 12596 + }, + { + "epoch": 1.5329479768786127, + "grad_norm": 1.397382140159607, + "learning_rate": 2.672767568215093e-06, + "loss": 0.3943, + "step": 12597 + }, + { + "epoch": 1.5330696683906297, + "grad_norm": 1.7127958536148071, + "learning_rate": 2.6714402082100143e-06, + "loss": 0.3074, + "step": 12598 + }, + { + "epoch": 1.5331913599026468, + "grad_norm": 1.7539894580841064, + "learning_rate": 2.6701131270718016e-06, + "loss": 0.3474, + "step": 12599 + }, + { + "epoch": 1.5333130514146638, + "grad_norm": 1.297405481338501, + "learning_rate": 2.6687863248509595e-06, + "loss": 0.3126, + "step": 12600 + }, + { + "epoch": 1.5334347429266808, + "grad_norm": 4.5004963874816895, + "learning_rate": 2.6674598015979723e-06, + "loss": 0.471, + "step": 12601 + }, + { + "epoch": 1.5335564344386978, + "grad_norm": 3.0616583824157715, + "learning_rate": 2.666133557363315e-06, + "loss": 0.406, + "step": 12602 + }, + { + "epoch": 1.5336781259507148, + "grad_norm": 1.5206066370010376, + "learning_rate": 2.664807592197458e-06, + "loss": 0.3555, + "step": 12603 + }, + { + "epoch": 1.5337998174627319, + "grad_norm": 1.8446646928787231, + "learning_rate": 2.6634819061508564e-06, + "loss": 0.3495, + "step": 12604 + }, + { + "epoch": 1.5339215089747489, + "grad_norm": 2.185856819152832, + "learning_rate": 2.6621564992739533e-06, + "loss": 0.421, + "step": 12605 + }, + { + "epoch": 1.534043200486766, + "grad_norm": 1.5439691543579102, + "learning_rate": 2.660831371617184e-06, + "loss": 0.3377, + "step": 12606 + }, + { + "epoch": 1.534164891998783, + "grad_norm": 1.750883936882019, + "learning_rate": 2.659506523230969e-06, + "loss": 0.3909, + "step": 12607 + }, + { + "epoch": 1.5342865835108002, + "grad_norm": 2.113356828689575, + "learning_rate": 2.658181954165728e-06, + "loss": 0.3931, + "step": 12608 + }, + { + "epoch": 1.5344082750228172, + "grad_norm": 1.461066722869873, + "learning_rate": 2.65685766447186e-06, + "loss": 0.2957, + "step": 12609 + }, + { + "epoch": 1.5345299665348342, + "grad_norm": 2.9559671878814697, + "learning_rate": 2.6555336541997555e-06, + "loss": 0.4491, + "step": 12610 + }, + { + "epoch": 1.5346516580468512, + "grad_norm": 1.8662328720092773, + "learning_rate": 2.6542099233997996e-06, + "loss": 0.3755, + "step": 12611 + }, + { + "epoch": 1.5347733495588682, + "grad_norm": 3.054624080657959, + "learning_rate": 2.652886472122359e-06, + "loss": 0.3513, + "step": 12612 + }, + { + "epoch": 1.5348950410708853, + "grad_norm": 1.3648021221160889, + "learning_rate": 2.6515633004177977e-06, + "loss": 0.3407, + "step": 12613 + }, + { + "epoch": 1.5350167325829025, + "grad_norm": 1.4538992643356323, + "learning_rate": 2.650240408336464e-06, + "loss": 0.3808, + "step": 12614 + }, + { + "epoch": 1.5351384240949195, + "grad_norm": 1.3085802793502808, + "learning_rate": 2.648917795928693e-06, + "loss": 0.3486, + "step": 12615 + }, + { + "epoch": 1.5352601156069365, + "grad_norm": 1.4782772064208984, + "learning_rate": 2.647595463244821e-06, + "loss": 0.362, + "step": 12616 + }, + { + "epoch": 1.5353818071189536, + "grad_norm": 1.7189334630966187, + "learning_rate": 2.6462734103351573e-06, + "loss": 0.3146, + "step": 12617 + }, + { + "epoch": 1.5355034986309706, + "grad_norm": 2.4061315059661865, + "learning_rate": 2.6449516372500093e-06, + "loss": 0.3933, + "step": 12618 + }, + { + "epoch": 1.5356251901429876, + "grad_norm": 3.2007806301116943, + "learning_rate": 2.6436301440396785e-06, + "loss": 0.425, + "step": 12619 + }, + { + "epoch": 1.5357468816550046, + "grad_norm": 1.963255763053894, + "learning_rate": 2.6423089307544447e-06, + "loss": 0.4086, + "step": 12620 + }, + { + "epoch": 1.5358685731670216, + "grad_norm": 1.522980809211731, + "learning_rate": 2.6409879974445883e-06, + "loss": 0.3698, + "step": 12621 + }, + { + "epoch": 1.5359902646790387, + "grad_norm": 1.7028958797454834, + "learning_rate": 2.6396673441603713e-06, + "loss": 0.3353, + "step": 12622 + }, + { + "epoch": 1.5361119561910557, + "grad_norm": 2.8061985969543457, + "learning_rate": 2.6383469709520436e-06, + "loss": 0.3911, + "step": 12623 + }, + { + "epoch": 1.5362336477030727, + "grad_norm": 1.8408160209655762, + "learning_rate": 2.637026877869855e-06, + "loss": 0.3534, + "step": 12624 + }, + { + "epoch": 1.5363553392150897, + "grad_norm": 2.483170509338379, + "learning_rate": 2.635707064964034e-06, + "loss": 0.3697, + "step": 12625 + }, + { + "epoch": 1.5364770307271067, + "grad_norm": 2.4414095878601074, + "learning_rate": 2.634387532284799e-06, + "loss": 0.2921, + "step": 12626 + }, + { + "epoch": 1.5365987222391237, + "grad_norm": 1.7491705417633057, + "learning_rate": 2.6330682798823704e-06, + "loss": 0.3718, + "step": 12627 + }, + { + "epoch": 1.5367204137511408, + "grad_norm": 3.8672852516174316, + "learning_rate": 2.631749307806937e-06, + "loss": 0.3717, + "step": 12628 + }, + { + "epoch": 1.5368421052631578, + "grad_norm": 1.512840986251831, + "learning_rate": 2.630430616108696e-06, + "loss": 0.3807, + "step": 12629 + }, + { + "epoch": 1.5369637967751748, + "grad_norm": 1.7544865608215332, + "learning_rate": 2.6291122048378246e-06, + "loss": 0.3682, + "step": 12630 + }, + { + "epoch": 1.5370854882871918, + "grad_norm": 1.583282232284546, + "learning_rate": 2.6277940740444874e-06, + "loss": 0.433, + "step": 12631 + }, + { + "epoch": 1.5372071797992088, + "grad_norm": 1.7873797416687012, + "learning_rate": 2.626476223778849e-06, + "loss": 0.4123, + "step": 12632 + }, + { + "epoch": 1.537328871311226, + "grad_norm": 1.5935440063476562, + "learning_rate": 2.6251586540910523e-06, + "loss": 0.3075, + "step": 12633 + }, + { + "epoch": 1.537450562823243, + "grad_norm": 2.0723776817321777, + "learning_rate": 2.623841365031231e-06, + "loss": 0.3452, + "step": 12634 + }, + { + "epoch": 1.5375722543352601, + "grad_norm": 1.8005446195602417, + "learning_rate": 2.622524356649516e-06, + "loss": 0.3785, + "step": 12635 + }, + { + "epoch": 1.5376939458472771, + "grad_norm": 1.8446670770645142, + "learning_rate": 2.6212076289960207e-06, + "loss": 0.4001, + "step": 12636 + }, + { + "epoch": 1.5378156373592942, + "grad_norm": 2.093743324279785, + "learning_rate": 2.619891182120845e-06, + "loss": 0.3409, + "step": 12637 + }, + { + "epoch": 1.5379373288713112, + "grad_norm": 1.894636869430542, + "learning_rate": 2.6185750160740885e-06, + "loss": 0.4198, + "step": 12638 + }, + { + "epoch": 1.5380590203833284, + "grad_norm": 2.0566294193267822, + "learning_rate": 2.617259130905833e-06, + "loss": 0.3987, + "step": 12639 + }, + { + "epoch": 1.5381807118953454, + "grad_norm": 2.1996231079101562, + "learning_rate": 2.615943526666147e-06, + "loss": 0.3262, + "step": 12640 + }, + { + "epoch": 1.5383024034073625, + "grad_norm": 1.6906081438064575, + "learning_rate": 2.614628203405094e-06, + "loss": 0.2881, + "step": 12641 + }, + { + "epoch": 1.5384240949193795, + "grad_norm": 2.8557193279266357, + "learning_rate": 2.6133131611727225e-06, + "loss": 0.4133, + "step": 12642 + }, + { + "epoch": 1.5385457864313965, + "grad_norm": 2.2630748748779297, + "learning_rate": 2.611998400019078e-06, + "loss": 0.4035, + "step": 12643 + }, + { + "epoch": 1.5386674779434135, + "grad_norm": 1.6517478227615356, + "learning_rate": 2.6106839199941868e-06, + "loss": 0.3366, + "step": 12644 + }, + { + "epoch": 1.5387891694554305, + "grad_norm": 2.2785139083862305, + "learning_rate": 2.609369721148064e-06, + "loss": 0.4114, + "step": 12645 + }, + { + "epoch": 1.5389108609674476, + "grad_norm": 2.9818496704101562, + "learning_rate": 2.6080558035307237e-06, + "loss": 0.3984, + "step": 12646 + }, + { + "epoch": 1.5390325524794646, + "grad_norm": 1.423867106437683, + "learning_rate": 2.606742167192161e-06, + "loss": 0.3766, + "step": 12647 + }, + { + "epoch": 1.5391542439914816, + "grad_norm": 2.039567708969116, + "learning_rate": 2.605428812182359e-06, + "loss": 0.3352, + "step": 12648 + }, + { + "epoch": 1.5392759355034986, + "grad_norm": 1.451438069343567, + "learning_rate": 2.6041157385512993e-06, + "loss": 0.4101, + "step": 12649 + }, + { + "epoch": 1.5393976270155156, + "grad_norm": 1.6416186094284058, + "learning_rate": 2.6028029463489456e-06, + "loss": 0.3564, + "step": 12650 + }, + { + "epoch": 1.5395193185275327, + "grad_norm": 1.7495027780532837, + "learning_rate": 2.60149043562525e-06, + "loss": 0.3604, + "step": 12651 + }, + { + "epoch": 1.5396410100395497, + "grad_norm": 1.9021443128585815, + "learning_rate": 2.6001782064301572e-06, + "loss": 0.4371, + "step": 12652 + }, + { + "epoch": 1.5397627015515667, + "grad_norm": 2.226966142654419, + "learning_rate": 2.598866258813598e-06, + "loss": 0.2856, + "step": 12653 + }, + { + "epoch": 1.5398843930635837, + "grad_norm": 2.5288314819335938, + "learning_rate": 2.597554592825501e-06, + "loss": 0.3907, + "step": 12654 + }, + { + "epoch": 1.5400060845756007, + "grad_norm": 1.9600913524627686, + "learning_rate": 2.596243208515773e-06, + "loss": 0.3877, + "step": 12655 + }, + { + "epoch": 1.5401277760876178, + "grad_norm": 2.3895750045776367, + "learning_rate": 2.5949321059343123e-06, + "loss": 0.3088, + "step": 12656 + }, + { + "epoch": 1.5402494675996348, + "grad_norm": 1.6667580604553223, + "learning_rate": 2.593621285131016e-06, + "loss": 0.3901, + "step": 12657 + }, + { + "epoch": 1.540371159111652, + "grad_norm": 1.8805153369903564, + "learning_rate": 2.5923107461557604e-06, + "loss": 0.3665, + "step": 12658 + }, + { + "epoch": 1.540492850623669, + "grad_norm": 2.1743226051330566, + "learning_rate": 2.5910004890584116e-06, + "loss": 0.4053, + "step": 12659 + }, + { + "epoch": 1.540614542135686, + "grad_norm": 1.501466155052185, + "learning_rate": 2.5896905138888316e-06, + "loss": 0.3491, + "step": 12660 + }, + { + "epoch": 1.540736233647703, + "grad_norm": 1.759179711341858, + "learning_rate": 2.5883808206968675e-06, + "loss": 0.3437, + "step": 12661 + }, + { + "epoch": 1.54085792515972, + "grad_norm": 2.846353054046631, + "learning_rate": 2.5870714095323536e-06, + "loss": 0.3885, + "step": 12662 + }, + { + "epoch": 1.540979616671737, + "grad_norm": 1.4050909280776978, + "learning_rate": 2.5857622804451165e-06, + "loss": 0.3338, + "step": 12663 + }, + { + "epoch": 1.5411013081837543, + "grad_norm": 1.7222799062728882, + "learning_rate": 2.584453433484968e-06, + "loss": 0.3404, + "step": 12664 + }, + { + "epoch": 1.5412229996957714, + "grad_norm": 2.3639652729034424, + "learning_rate": 2.583144868701719e-06, + "loss": 0.3892, + "step": 12665 + }, + { + "epoch": 1.5413446912077884, + "grad_norm": 1.5261552333831787, + "learning_rate": 2.5818365861451598e-06, + "loss": 0.3724, + "step": 12666 + }, + { + "epoch": 1.5414663827198054, + "grad_norm": 2.142681837081909, + "learning_rate": 2.580528585865071e-06, + "loss": 0.3862, + "step": 12667 + }, + { + "epoch": 1.5415880742318224, + "grad_norm": 1.5395457744598389, + "learning_rate": 2.5792208679112285e-06, + "loss": 0.3548, + "step": 12668 + }, + { + "epoch": 1.5417097657438394, + "grad_norm": 1.441227912902832, + "learning_rate": 2.5779134323333934e-06, + "loss": 0.317, + "step": 12669 + }, + { + "epoch": 1.5418314572558565, + "grad_norm": 1.4216769933700562, + "learning_rate": 2.5766062791813117e-06, + "loss": 0.2936, + "step": 12670 + }, + { + "epoch": 1.5419531487678735, + "grad_norm": 1.5779895782470703, + "learning_rate": 2.575299408504729e-06, + "loss": 0.3485, + "step": 12671 + }, + { + "epoch": 1.5420748402798905, + "grad_norm": 1.695778727531433, + "learning_rate": 2.5739928203533695e-06, + "loss": 0.3479, + "step": 12672 + }, + { + "epoch": 1.5421965317919075, + "grad_norm": 2.2750704288482666, + "learning_rate": 2.572686514776961e-06, + "loss": 0.3086, + "step": 12673 + }, + { + "epoch": 1.5423182233039245, + "grad_norm": 2.7548768520355225, + "learning_rate": 2.5713804918252004e-06, + "loss": 0.3797, + "step": 12674 + }, + { + "epoch": 1.5424399148159416, + "grad_norm": 2.270379066467285, + "learning_rate": 2.5700747515477853e-06, + "loss": 0.2839, + "step": 12675 + }, + { + "epoch": 1.5425616063279586, + "grad_norm": 3.0020194053649902, + "learning_rate": 2.568769293994409e-06, + "loss": 0.4161, + "step": 12676 + }, + { + "epoch": 1.5426832978399756, + "grad_norm": 1.421528697013855, + "learning_rate": 2.567464119214742e-06, + "loss": 0.3568, + "step": 12677 + }, + { + "epoch": 1.5428049893519926, + "grad_norm": 2.8901584148406982, + "learning_rate": 2.566159227258447e-06, + "loss": 0.45, + "step": 12678 + }, + { + "epoch": 1.5429266808640096, + "grad_norm": 2.137016773223877, + "learning_rate": 2.564854618175183e-06, + "loss": 0.3737, + "step": 12679 + }, + { + "epoch": 1.5430483723760267, + "grad_norm": 2.089994192123413, + "learning_rate": 2.5635502920145885e-06, + "loss": 0.4103, + "step": 12680 + }, + { + "epoch": 1.5431700638880437, + "grad_norm": 2.317518711090088, + "learning_rate": 2.5622462488263012e-06, + "loss": 0.3185, + "step": 12681 + }, + { + "epoch": 1.5432917554000607, + "grad_norm": 1.6309014558792114, + "learning_rate": 2.560942488659939e-06, + "loss": 0.3335, + "step": 12682 + }, + { + "epoch": 1.543413446912078, + "grad_norm": 2.184443235397339, + "learning_rate": 2.5596390115651105e-06, + "loss": 0.364, + "step": 12683 + }, + { + "epoch": 1.543535138424095, + "grad_norm": 2.5433719158172607, + "learning_rate": 2.558335817591421e-06, + "loss": 0.3728, + "step": 12684 + }, + { + "epoch": 1.543656829936112, + "grad_norm": 2.0452566146850586, + "learning_rate": 2.5570329067884593e-06, + "loss": 0.3588, + "step": 12685 + }, + { + "epoch": 1.543778521448129, + "grad_norm": 1.9246655702590942, + "learning_rate": 2.555730279205797e-06, + "loss": 0.3765, + "step": 12686 + }, + { + "epoch": 1.543900212960146, + "grad_norm": 1.7227977514266968, + "learning_rate": 2.5544279348930077e-06, + "loss": 0.3605, + "step": 12687 + }, + { + "epoch": 1.544021904472163, + "grad_norm": 2.0520403385162354, + "learning_rate": 2.5531258738996454e-06, + "loss": 0.3775, + "step": 12688 + }, + { + "epoch": 1.54414359598418, + "grad_norm": 2.2975685596466064, + "learning_rate": 2.5518240962752603e-06, + "loss": 0.3495, + "step": 12689 + }, + { + "epoch": 1.5442652874961973, + "grad_norm": 1.9765340089797974, + "learning_rate": 2.550522602069385e-06, + "loss": 0.3245, + "step": 12690 + }, + { + "epoch": 1.5443869790082143, + "grad_norm": 1.8159561157226562, + "learning_rate": 2.549221391331542e-06, + "loss": 0.4079, + "step": 12691 + }, + { + "epoch": 1.5445086705202313, + "grad_norm": 1.4668906927108765, + "learning_rate": 2.5479204641112496e-06, + "loss": 0.3639, + "step": 12692 + }, + { + "epoch": 1.5446303620322483, + "grad_norm": 1.7074501514434814, + "learning_rate": 2.5466198204580083e-06, + "loss": 0.3647, + "step": 12693 + }, + { + "epoch": 1.5447520535442654, + "grad_norm": 2.0606911182403564, + "learning_rate": 2.545319460421308e-06, + "loss": 0.4033, + "step": 12694 + }, + { + "epoch": 1.5448737450562824, + "grad_norm": 1.7580406665802002, + "learning_rate": 2.5440193840506353e-06, + "loss": 0.378, + "step": 12695 + }, + { + "epoch": 1.5449954365682994, + "grad_norm": 2.8147008419036865, + "learning_rate": 2.5427195913954583e-06, + "loss": 0.4229, + "step": 12696 + }, + { + "epoch": 1.5451171280803164, + "grad_norm": 1.6291056871414185, + "learning_rate": 2.541420082505237e-06, + "loss": 0.3818, + "step": 12697 + }, + { + "epoch": 1.5452388195923334, + "grad_norm": 1.9070744514465332, + "learning_rate": 2.540120857429419e-06, + "loss": 0.4015, + "step": 12698 + }, + { + "epoch": 1.5453605111043505, + "grad_norm": 1.6448923349380493, + "learning_rate": 2.538821916217441e-06, + "loss": 0.3558, + "step": 12699 + }, + { + "epoch": 1.5454822026163675, + "grad_norm": 1.9214787483215332, + "learning_rate": 2.537523258918736e-06, + "loss": 0.4017, + "step": 12700 + }, + { + "epoch": 1.5456038941283845, + "grad_norm": 2.2004947662353516, + "learning_rate": 2.5362248855827175e-06, + "loss": 0.3898, + "step": 12701 + }, + { + "epoch": 1.5457255856404015, + "grad_norm": 2.111344337463379, + "learning_rate": 2.5349267962587875e-06, + "loss": 0.4226, + "step": 12702 + }, + { + "epoch": 1.5458472771524185, + "grad_norm": 1.5048394203186035, + "learning_rate": 2.5336289909963484e-06, + "loss": 0.3805, + "step": 12703 + }, + { + "epoch": 1.5459689686644356, + "grad_norm": 1.6047000885009766, + "learning_rate": 2.532331469844781e-06, + "loss": 0.3461, + "step": 12704 + }, + { + "epoch": 1.5460906601764526, + "grad_norm": 1.5765515565872192, + "learning_rate": 2.5310342328534552e-06, + "loss": 0.3516, + "step": 12705 + }, + { + "epoch": 1.5462123516884696, + "grad_norm": 1.9117792844772339, + "learning_rate": 2.529737280071739e-06, + "loss": 0.3896, + "step": 12706 + }, + { + "epoch": 1.5463340432004866, + "grad_norm": 1.7108114957809448, + "learning_rate": 2.5284406115489835e-06, + "loss": 0.4254, + "step": 12707 + }, + { + "epoch": 1.5464557347125036, + "grad_norm": 2.0911877155303955, + "learning_rate": 2.527144227334527e-06, + "loss": 0.3505, + "step": 12708 + }, + { + "epoch": 1.5465774262245209, + "grad_norm": 2.0561294555664062, + "learning_rate": 2.5258481274777e-06, + "loss": 0.3383, + "step": 12709 + }, + { + "epoch": 1.546699117736538, + "grad_norm": 3.7373759746551514, + "learning_rate": 2.5245523120278205e-06, + "loss": 0.3182, + "step": 12710 + }, + { + "epoch": 1.546820809248555, + "grad_norm": 2.76027512550354, + "learning_rate": 2.5232567810342014e-06, + "loss": 0.3901, + "step": 12711 + }, + { + "epoch": 1.546942500760572, + "grad_norm": 1.8573646545410156, + "learning_rate": 2.521961534546138e-06, + "loss": 0.3712, + "step": 12712 + }, + { + "epoch": 1.547064192272589, + "grad_norm": 1.6403520107269287, + "learning_rate": 2.520666572612913e-06, + "loss": 0.3693, + "step": 12713 + }, + { + "epoch": 1.547185883784606, + "grad_norm": 1.7222304344177246, + "learning_rate": 2.5193718952838096e-06, + "loss": 0.4221, + "step": 12714 + }, + { + "epoch": 1.5473075752966232, + "grad_norm": 1.9656339883804321, + "learning_rate": 2.5180775026080905e-06, + "loss": 0.3913, + "step": 12715 + }, + { + "epoch": 1.5474292668086402, + "grad_norm": 1.713310956954956, + "learning_rate": 2.5167833946350052e-06, + "loss": 0.3801, + "step": 12716 + }, + { + "epoch": 1.5475509583206573, + "grad_norm": 2.168942928314209, + "learning_rate": 2.515489571413805e-06, + "loss": 0.3898, + "step": 12717 + }, + { + "epoch": 1.5476726498326743, + "grad_norm": 1.9084094762802124, + "learning_rate": 2.5141960329937175e-06, + "loss": 0.3706, + "step": 12718 + }, + { + "epoch": 1.5477943413446913, + "grad_norm": 2.4918160438537598, + "learning_rate": 2.512902779423967e-06, + "loss": 0.4113, + "step": 12719 + }, + { + "epoch": 1.5479160328567083, + "grad_norm": 1.8096450567245483, + "learning_rate": 2.5116098107537624e-06, + "loss": 0.3314, + "step": 12720 + }, + { + "epoch": 1.5480377243687253, + "grad_norm": 1.514387607574463, + "learning_rate": 2.5103171270323023e-06, + "loss": 0.4034, + "step": 12721 + }, + { + "epoch": 1.5481594158807424, + "grad_norm": 1.594966173171997, + "learning_rate": 2.509024728308781e-06, + "loss": 0.3761, + "step": 12722 + }, + { + "epoch": 1.5482811073927594, + "grad_norm": 1.6208900213241577, + "learning_rate": 2.5077326146323746e-06, + "loss": 0.337, + "step": 12723 + }, + { + "epoch": 1.5484027989047764, + "grad_norm": 2.314403772354126, + "learning_rate": 2.5064407860522465e-06, + "loss": 0.4299, + "step": 12724 + }, + { + "epoch": 1.5485244904167934, + "grad_norm": 2.0890610218048096, + "learning_rate": 2.5051492426175604e-06, + "loss": 0.406, + "step": 12725 + }, + { + "epoch": 1.5486461819288104, + "grad_norm": 2.2321109771728516, + "learning_rate": 2.503857984377459e-06, + "loss": 0.397, + "step": 12726 + }, + { + "epoch": 1.5487678734408274, + "grad_norm": 1.3876774311065674, + "learning_rate": 2.5025670113810753e-06, + "loss": 0.3795, + "step": 12727 + }, + { + "epoch": 1.5488895649528445, + "grad_norm": 1.8247785568237305, + "learning_rate": 2.5012763236775385e-06, + "loss": 0.4279, + "step": 12728 + }, + { + "epoch": 1.5490112564648615, + "grad_norm": 2.8193259239196777, + "learning_rate": 2.4999859213159584e-06, + "loss": 0.4488, + "step": 12729 + }, + { + "epoch": 1.5491329479768785, + "grad_norm": 2.129063367843628, + "learning_rate": 2.4986958043454356e-06, + "loss": 0.3799, + "step": 12730 + }, + { + "epoch": 1.5492546394888955, + "grad_norm": 1.6212902069091797, + "learning_rate": 2.4974059728150702e-06, + "loss": 0.3337, + "step": 12731 + }, + { + "epoch": 1.5493763310009125, + "grad_norm": 1.7969318628311157, + "learning_rate": 2.4961164267739312e-06, + "loss": 0.4288, + "step": 12732 + }, + { + "epoch": 1.5494980225129296, + "grad_norm": 1.800437569618225, + "learning_rate": 2.494827166271098e-06, + "loss": 0.4234, + "step": 12733 + }, + { + "epoch": 1.5496197140249468, + "grad_norm": 1.8014652729034424, + "learning_rate": 2.4935381913556243e-06, + "loss": 0.3432, + "step": 12734 + }, + { + "epoch": 1.5497414055369638, + "grad_norm": 2.6505277156829834, + "learning_rate": 2.492249502076558e-06, + "loss": 0.3559, + "step": 12735 + }, + { + "epoch": 1.5498630970489808, + "grad_norm": 1.578000545501709, + "learning_rate": 2.4909610984829414e-06, + "loss": 0.3662, + "step": 12736 + }, + { + "epoch": 1.5499847885609979, + "grad_norm": 1.5680572986602783, + "learning_rate": 2.4896729806237973e-06, + "loss": 0.3494, + "step": 12737 + }, + { + "epoch": 1.5501064800730149, + "grad_norm": 2.3017964363098145, + "learning_rate": 2.4883851485481393e-06, + "loss": 0.367, + "step": 12738 + }, + { + "epoch": 1.550228171585032, + "grad_norm": 2.2530407905578613, + "learning_rate": 2.4870976023049775e-06, + "loss": 0.3815, + "step": 12739 + }, + { + "epoch": 1.5503498630970491, + "grad_norm": 1.668676495552063, + "learning_rate": 2.4858103419433e-06, + "loss": 0.3726, + "step": 12740 + }, + { + "epoch": 1.5504715546090662, + "grad_norm": 1.8456331491470337, + "learning_rate": 2.4845233675120948e-06, + "loss": 0.3418, + "step": 12741 + }, + { + "epoch": 1.5505932461210832, + "grad_norm": 1.8216592073440552, + "learning_rate": 2.483236679060336e-06, + "loss": 0.37, + "step": 12742 + }, + { + "epoch": 1.5507149376331002, + "grad_norm": 1.434902548789978, + "learning_rate": 2.4819502766369728e-06, + "loss": 0.3701, + "step": 12743 + }, + { + "epoch": 1.5508366291451172, + "grad_norm": 2.3084845542907715, + "learning_rate": 2.4806641602909675e-06, + "loss": 0.3341, + "step": 12744 + }, + { + "epoch": 1.5509583206571342, + "grad_norm": 2.484041213989258, + "learning_rate": 2.4793783300712536e-06, + "loss": 0.3357, + "step": 12745 + }, + { + "epoch": 1.5510800121691513, + "grad_norm": 1.3554069995880127, + "learning_rate": 2.478092786026759e-06, + "loss": 0.3694, + "step": 12746 + }, + { + "epoch": 1.5512017036811683, + "grad_norm": 1.8760665655136108, + "learning_rate": 2.476807528206406e-06, + "loss": 0.3463, + "step": 12747 + }, + { + "epoch": 1.5513233951931853, + "grad_norm": 1.754461407661438, + "learning_rate": 2.4755225566590966e-06, + "loss": 0.3425, + "step": 12748 + }, + { + "epoch": 1.5514450867052023, + "grad_norm": 1.6037449836730957, + "learning_rate": 2.474237871433731e-06, + "loss": 0.2969, + "step": 12749 + }, + { + "epoch": 1.5515667782172193, + "grad_norm": 2.366725444793701, + "learning_rate": 2.4729534725791915e-06, + "loss": 0.3015, + "step": 12750 + }, + { + "epoch": 1.5516884697292364, + "grad_norm": 1.6609134674072266, + "learning_rate": 2.4716693601443507e-06, + "loss": 0.3362, + "step": 12751 + }, + { + "epoch": 1.5518101612412534, + "grad_norm": 1.4698946475982666, + "learning_rate": 2.470385534178076e-06, + "loss": 0.3661, + "step": 12752 + }, + { + "epoch": 1.5519318527532704, + "grad_norm": 2.472661018371582, + "learning_rate": 2.469101994729216e-06, + "loss": 0.4133, + "step": 12753 + }, + { + "epoch": 1.5520535442652874, + "grad_norm": 2.0256452560424805, + "learning_rate": 2.467818741846615e-06, + "loss": 0.3491, + "step": 12754 + }, + { + "epoch": 1.5521752357773044, + "grad_norm": 1.8092035055160522, + "learning_rate": 2.4665357755791006e-06, + "loss": 0.3854, + "step": 12755 + }, + { + "epoch": 1.5522969272893214, + "grad_norm": 2.6981921195983887, + "learning_rate": 2.4652530959754907e-06, + "loss": 0.4196, + "step": 12756 + }, + { + "epoch": 1.5524186188013385, + "grad_norm": 1.7002185583114624, + "learning_rate": 2.4639707030845993e-06, + "loss": 0.2888, + "step": 12757 + }, + { + "epoch": 1.5525403103133555, + "grad_norm": 2.7183339595794678, + "learning_rate": 2.462688596955222e-06, + "loss": 0.3204, + "step": 12758 + }, + { + "epoch": 1.5526620018253727, + "grad_norm": 3.830136299133301, + "learning_rate": 2.4614067776361406e-06, + "loss": 0.427, + "step": 12759 + }, + { + "epoch": 1.5527836933373897, + "grad_norm": 1.677081823348999, + "learning_rate": 2.460125245176139e-06, + "loss": 0.3515, + "step": 12760 + }, + { + "epoch": 1.5529053848494068, + "grad_norm": 2.07553768157959, + "learning_rate": 2.458843999623979e-06, + "loss": 0.4079, + "step": 12761 + }, + { + "epoch": 1.5530270763614238, + "grad_norm": 2.483259916305542, + "learning_rate": 2.4575630410284112e-06, + "loss": 0.4308, + "step": 12762 + }, + { + "epoch": 1.5531487678734408, + "grad_norm": 1.7689406871795654, + "learning_rate": 2.4562823694381845e-06, + "loss": 0.3739, + "step": 12763 + }, + { + "epoch": 1.5532704593854578, + "grad_norm": 1.9724507331848145, + "learning_rate": 2.455001984902028e-06, + "loss": 0.3871, + "step": 12764 + }, + { + "epoch": 1.553392150897475, + "grad_norm": 1.475845217704773, + "learning_rate": 2.453721887468663e-06, + "loss": 0.3674, + "step": 12765 + }, + { + "epoch": 1.553513842409492, + "grad_norm": 2.475400447845459, + "learning_rate": 2.452442077186801e-06, + "loss": 0.348, + "step": 12766 + }, + { + "epoch": 1.553635533921509, + "grad_norm": 1.8045141696929932, + "learning_rate": 2.4511625541051365e-06, + "loss": 0.3423, + "step": 12767 + }, + { + "epoch": 1.5537572254335261, + "grad_norm": 2.7508513927459717, + "learning_rate": 2.449883318272366e-06, + "loss": 0.372, + "step": 12768 + }, + { + "epoch": 1.5538789169455431, + "grad_norm": 1.6447900533676147, + "learning_rate": 2.448604369737162e-06, + "loss": 0.372, + "step": 12769 + }, + { + "epoch": 1.5540006084575602, + "grad_norm": 1.582385778427124, + "learning_rate": 2.4473257085481903e-06, + "loss": 0.353, + "step": 12770 + }, + { + "epoch": 1.5541222999695772, + "grad_norm": 1.6140209436416626, + "learning_rate": 2.4460473347541112e-06, + "loss": 0.337, + "step": 12771 + }, + { + "epoch": 1.5542439914815942, + "grad_norm": 1.881425142288208, + "learning_rate": 2.444769248403567e-06, + "loss": 0.3433, + "step": 12772 + }, + { + "epoch": 1.5543656829936112, + "grad_norm": 1.553879976272583, + "learning_rate": 2.443491449545188e-06, + "loss": 0.3668, + "step": 12773 + }, + { + "epoch": 1.5544873745056282, + "grad_norm": 1.7438925504684448, + "learning_rate": 2.4422139382276033e-06, + "loss": 0.4055, + "step": 12774 + }, + { + "epoch": 1.5546090660176453, + "grad_norm": 1.6766334772109985, + "learning_rate": 2.4409367144994223e-06, + "loss": 0.3352, + "step": 12775 + }, + { + "epoch": 1.5547307575296623, + "grad_norm": 2.2803711891174316, + "learning_rate": 2.4396597784092437e-06, + "loss": 0.3722, + "step": 12776 + }, + { + "epoch": 1.5548524490416793, + "grad_norm": 1.987220287322998, + "learning_rate": 2.438383130005664e-06, + "loss": 0.2965, + "step": 12777 + }, + { + "epoch": 1.5549741405536963, + "grad_norm": 1.925665020942688, + "learning_rate": 2.437106769337253e-06, + "loss": 0.4832, + "step": 12778 + }, + { + "epoch": 1.5550958320657133, + "grad_norm": 1.7567967176437378, + "learning_rate": 2.435830696452586e-06, + "loss": 0.3726, + "step": 12779 + }, + { + "epoch": 1.5552175235777304, + "grad_norm": 2.1436469554901123, + "learning_rate": 2.434554911400219e-06, + "loss": 0.3789, + "step": 12780 + }, + { + "epoch": 1.5553392150897474, + "grad_norm": 2.099574565887451, + "learning_rate": 2.433279414228693e-06, + "loss": 0.3108, + "step": 12781 + }, + { + "epoch": 1.5554609066017644, + "grad_norm": 2.9680776596069336, + "learning_rate": 2.432004204986551e-06, + "loss": 0.3337, + "step": 12782 + }, + { + "epoch": 1.5555825981137814, + "grad_norm": 1.6419986486434937, + "learning_rate": 2.430729283722314e-06, + "loss": 0.3845, + "step": 12783 + }, + { + "epoch": 1.5557042896257987, + "grad_norm": 2.0534918308258057, + "learning_rate": 2.429454650484493e-06, + "loss": 0.3865, + "step": 12784 + }, + { + "epoch": 1.5558259811378157, + "grad_norm": 1.4777759313583374, + "learning_rate": 2.4281803053215946e-06, + "loss": 0.3446, + "step": 12785 + }, + { + "epoch": 1.5559476726498327, + "grad_norm": 2.158703327178955, + "learning_rate": 2.4269062482821084e-06, + "loss": 0.3056, + "step": 12786 + }, + { + "epoch": 1.5560693641618497, + "grad_norm": 2.2385034561157227, + "learning_rate": 2.4256324794145124e-06, + "loss": 0.415, + "step": 12787 + }, + { + "epoch": 1.5561910556738667, + "grad_norm": 1.4018527269363403, + "learning_rate": 2.4243589987672845e-06, + "loss": 0.3467, + "step": 12788 + }, + { + "epoch": 1.5563127471858837, + "grad_norm": 1.909488558769226, + "learning_rate": 2.4230858063888717e-06, + "loss": 0.3654, + "step": 12789 + }, + { + "epoch": 1.5564344386979008, + "grad_norm": 1.8015897274017334, + "learning_rate": 2.421812902327729e-06, + "loss": 0.3994, + "step": 12790 + }, + { + "epoch": 1.556556130209918, + "grad_norm": 1.7957444190979004, + "learning_rate": 2.4205402866322938e-06, + "loss": 0.3705, + "step": 12791 + }, + { + "epoch": 1.556677821721935, + "grad_norm": 2.5776891708374023, + "learning_rate": 2.419267959350984e-06, + "loss": 0.4047, + "step": 12792 + }, + { + "epoch": 1.556799513233952, + "grad_norm": 1.8470677137374878, + "learning_rate": 2.4179959205322234e-06, + "loss": 0.352, + "step": 12793 + }, + { + "epoch": 1.556921204745969, + "grad_norm": 1.678146243095398, + "learning_rate": 2.416724170224413e-06, + "loss": 0.3677, + "step": 12794 + }, + { + "epoch": 1.557042896257986, + "grad_norm": 4.77911901473999, + "learning_rate": 2.4154527084759393e-06, + "loss": 0.4445, + "step": 12795 + }, + { + "epoch": 1.557164587770003, + "grad_norm": 1.6903306245803833, + "learning_rate": 2.414181535335194e-06, + "loss": 0.4057, + "step": 12796 + }, + { + "epoch": 1.5572862792820201, + "grad_norm": 1.584194302558899, + "learning_rate": 2.412910650850542e-06, + "loss": 0.2984, + "step": 12797 + }, + { + "epoch": 1.5574079707940371, + "grad_norm": 1.7317028045654297, + "learning_rate": 2.4116400550703423e-06, + "loss": 0.4066, + "step": 12798 + }, + { + "epoch": 1.5575296623060542, + "grad_norm": 2.235231399536133, + "learning_rate": 2.4103697480429477e-06, + "loss": 0.3974, + "step": 12799 + }, + { + "epoch": 1.5576513538180712, + "grad_norm": 1.6978048086166382, + "learning_rate": 2.409099729816694e-06, + "loss": 0.3741, + "step": 12800 + }, + { + "epoch": 1.5577730453300882, + "grad_norm": 1.6473702192306519, + "learning_rate": 2.407830000439907e-06, + "loss": 0.344, + "step": 12801 + }, + { + "epoch": 1.5578947368421052, + "grad_norm": 1.9948610067367554, + "learning_rate": 2.4065605599609044e-06, + "loss": 0.3957, + "step": 12802 + }, + { + "epoch": 1.5580164283541222, + "grad_norm": 1.2500169277191162, + "learning_rate": 2.4052914084279878e-06, + "loss": 0.3295, + "step": 12803 + }, + { + "epoch": 1.5581381198661393, + "grad_norm": 2.2401773929595947, + "learning_rate": 2.404022545889455e-06, + "loss": 0.3978, + "step": 12804 + }, + { + "epoch": 1.5582598113781563, + "grad_norm": 2.3625476360321045, + "learning_rate": 2.402753972393588e-06, + "loss": 0.4333, + "step": 12805 + }, + { + "epoch": 1.5583815028901733, + "grad_norm": 2.5307910442352295, + "learning_rate": 2.401485687988654e-06, + "loss": 0.4218, + "step": 12806 + }, + { + "epoch": 1.5585031944021903, + "grad_norm": 1.6290391683578491, + "learning_rate": 2.4002176927229203e-06, + "loss": 0.3591, + "step": 12807 + }, + { + "epoch": 1.5586248859142073, + "grad_norm": 1.7677561044692993, + "learning_rate": 2.398949986644632e-06, + "loss": 0.3591, + "step": 12808 + }, + { + "epoch": 1.5587465774262244, + "grad_norm": 1.9134935140609741, + "learning_rate": 2.3976825698020336e-06, + "loss": 0.3735, + "step": 12809 + }, + { + "epoch": 1.5588682689382416, + "grad_norm": 2.1838998794555664, + "learning_rate": 2.396415442243347e-06, + "loss": 0.35, + "step": 12810 + }, + { + "epoch": 1.5589899604502586, + "grad_norm": 1.5391291379928589, + "learning_rate": 2.3951486040167936e-06, + "loss": 0.3581, + "step": 12811 + }, + { + "epoch": 1.5591116519622756, + "grad_norm": 2.85530424118042, + "learning_rate": 2.3938820551705766e-06, + "loss": 0.2958, + "step": 12812 + }, + { + "epoch": 1.5592333434742927, + "grad_norm": 2.349602222442627, + "learning_rate": 2.3926157957528907e-06, + "loss": 0.3646, + "step": 12813 + }, + { + "epoch": 1.5593550349863097, + "grad_norm": 1.5135033130645752, + "learning_rate": 2.391349825811917e-06, + "loss": 0.359, + "step": 12814 + }, + { + "epoch": 1.5594767264983267, + "grad_norm": 2.149428606033325, + "learning_rate": 2.3900841453958344e-06, + "loss": 0.3531, + "step": 12815 + }, + { + "epoch": 1.559598418010344, + "grad_norm": 3.258680820465088, + "learning_rate": 2.3888187545527995e-06, + "loss": 0.403, + "step": 12816 + }, + { + "epoch": 1.559720109522361, + "grad_norm": 1.4699033498764038, + "learning_rate": 2.3875536533309685e-06, + "loss": 0.3263, + "step": 12817 + }, + { + "epoch": 1.559841801034378, + "grad_norm": 1.6422860622406006, + "learning_rate": 2.386288841778477e-06, + "loss": 0.3478, + "step": 12818 + }, + { + "epoch": 1.559963492546395, + "grad_norm": 1.3670568466186523, + "learning_rate": 2.385024319943452e-06, + "loss": 0.4089, + "step": 12819 + }, + { + "epoch": 1.560085184058412, + "grad_norm": 1.6304458379745483, + "learning_rate": 2.3837600878740166e-06, + "loss": 0.3463, + "step": 12820 + }, + { + "epoch": 1.560206875570429, + "grad_norm": 2.3730359077453613, + "learning_rate": 2.3824961456182747e-06, + "loss": 0.4077, + "step": 12821 + }, + { + "epoch": 1.560328567082446, + "grad_norm": 3.8978514671325684, + "learning_rate": 2.381232493224319e-06, + "loss": 0.4965, + "step": 12822 + }, + { + "epoch": 1.560450258594463, + "grad_norm": 4.243291854858398, + "learning_rate": 2.379969130740243e-06, + "loss": 0.3972, + "step": 12823 + }, + { + "epoch": 1.56057195010648, + "grad_norm": 2.884408950805664, + "learning_rate": 2.3787060582141074e-06, + "loss": 0.4588, + "step": 12824 + }, + { + "epoch": 1.560693641618497, + "grad_norm": 2.60561203956604, + "learning_rate": 2.3774432756939845e-06, + "loss": 0.3777, + "step": 12825 + }, + { + "epoch": 1.5608153331305141, + "grad_norm": 1.5718512535095215, + "learning_rate": 2.376180783227924e-06, + "loss": 0.3555, + "step": 12826 + }, + { + "epoch": 1.5609370246425311, + "grad_norm": 2.756662368774414, + "learning_rate": 2.3749185808639617e-06, + "loss": 0.3112, + "step": 12827 + }, + { + "epoch": 1.5610587161545482, + "grad_norm": 3.597346782684326, + "learning_rate": 2.3736566686501327e-06, + "loss": 0.4183, + "step": 12828 + }, + { + "epoch": 1.5611804076665652, + "grad_norm": 1.5573253631591797, + "learning_rate": 2.3723950466344535e-06, + "loss": 0.3611, + "step": 12829 + }, + { + "epoch": 1.5613020991785822, + "grad_norm": 1.6890100240707397, + "learning_rate": 2.3711337148649284e-06, + "loss": 0.3363, + "step": 12830 + }, + { + "epoch": 1.5614237906905992, + "grad_norm": 1.6519485712051392, + "learning_rate": 2.3698726733895596e-06, + "loss": 0.4162, + "step": 12831 + }, + { + "epoch": 1.5615454822026162, + "grad_norm": 1.9749494791030884, + "learning_rate": 2.3686119222563273e-06, + "loss": 0.3962, + "step": 12832 + }, + { + "epoch": 1.5616671737146333, + "grad_norm": 2.018336772918701, + "learning_rate": 2.3673514615132067e-06, + "loss": 0.3416, + "step": 12833 + }, + { + "epoch": 1.5617888652266503, + "grad_norm": 2.254579544067383, + "learning_rate": 2.3660912912081668e-06, + "loss": 0.3467, + "step": 12834 + }, + { + "epoch": 1.5619105567386675, + "grad_norm": 1.8689029216766357, + "learning_rate": 2.364831411389148e-06, + "loss": 0.3953, + "step": 12835 + }, + { + "epoch": 1.5620322482506845, + "grad_norm": 2.1085586547851562, + "learning_rate": 2.3635718221041014e-06, + "loss": 0.3573, + "step": 12836 + }, + { + "epoch": 1.5621539397627016, + "grad_norm": 2.7029595375061035, + "learning_rate": 2.3623125234009537e-06, + "loss": 0.3134, + "step": 12837 + }, + { + "epoch": 1.5622756312747186, + "grad_norm": 2.319758892059326, + "learning_rate": 2.3610535153276202e-06, + "loss": 0.3918, + "step": 12838 + }, + { + "epoch": 1.5623973227867356, + "grad_norm": 1.5745046138763428, + "learning_rate": 2.3597947979320157e-06, + "loss": 0.3674, + "step": 12839 + }, + { + "epoch": 1.5625190142987526, + "grad_norm": 1.9525219202041626, + "learning_rate": 2.3585363712620333e-06, + "loss": 0.3783, + "step": 12840 + }, + { + "epoch": 1.5626407058107699, + "grad_norm": 3.5742135047912598, + "learning_rate": 2.3572782353655555e-06, + "loss": 0.4636, + "step": 12841 + }, + { + "epoch": 1.5627623973227869, + "grad_norm": 2.036787748336792, + "learning_rate": 2.3560203902904646e-06, + "loss": 0.4117, + "step": 12842 + }, + { + "epoch": 1.562884088834804, + "grad_norm": 1.618148922920227, + "learning_rate": 2.3547628360846188e-06, + "loss": 0.3611, + "step": 12843 + }, + { + "epoch": 1.563005780346821, + "grad_norm": 2.6135871410369873, + "learning_rate": 2.353505572795869e-06, + "loss": 0.3808, + "step": 12844 + }, + { + "epoch": 1.563127471858838, + "grad_norm": 2.4590752124786377, + "learning_rate": 2.3522486004720635e-06, + "loss": 0.3229, + "step": 12845 + }, + { + "epoch": 1.563249163370855, + "grad_norm": 2.022947072982788, + "learning_rate": 2.3509919191610274e-06, + "loss": 0.4051, + "step": 12846 + }, + { + "epoch": 1.563370854882872, + "grad_norm": 3.2890682220458984, + "learning_rate": 2.3497355289105827e-06, + "loss": 0.4558, + "step": 12847 + }, + { + "epoch": 1.563492546394889, + "grad_norm": 1.9923243522644043, + "learning_rate": 2.3484794297685353e-06, + "loss": 0.3799, + "step": 12848 + }, + { + "epoch": 1.563614237906906, + "grad_norm": 2.6818184852600098, + "learning_rate": 2.3472236217826807e-06, + "loss": 0.4284, + "step": 12849 + }, + { + "epoch": 1.563735929418923, + "grad_norm": 1.4004765748977661, + "learning_rate": 2.345968105000811e-06, + "loss": 0.349, + "step": 12850 + }, + { + "epoch": 1.56385762093094, + "grad_norm": 2.124483585357666, + "learning_rate": 2.3447128794706975e-06, + "loss": 0.3637, + "step": 12851 + }, + { + "epoch": 1.563979312442957, + "grad_norm": 1.6781656742095947, + "learning_rate": 2.3434579452401007e-06, + "loss": 0.3741, + "step": 12852 + }, + { + "epoch": 1.564101003954974, + "grad_norm": 1.8484046459197998, + "learning_rate": 2.3422033023567814e-06, + "loss": 0.4061, + "step": 12853 + }, + { + "epoch": 1.564222695466991, + "grad_norm": 1.6275197267532349, + "learning_rate": 2.3409489508684767e-06, + "loss": 0.3517, + "step": 12854 + }, + { + "epoch": 1.5643443869790081, + "grad_norm": 1.8312777280807495, + "learning_rate": 2.3396948908229155e-06, + "loss": 0.3217, + "step": 12855 + }, + { + "epoch": 1.5644660784910251, + "grad_norm": 1.6628172397613525, + "learning_rate": 2.3384411222678215e-06, + "loss": 0.4065, + "step": 12856 + }, + { + "epoch": 1.5645877700030422, + "grad_norm": 1.5507392883300781, + "learning_rate": 2.3371876452509024e-06, + "loss": 0.3574, + "step": 12857 + }, + { + "epoch": 1.5647094615150592, + "grad_norm": 1.7578529119491577, + "learning_rate": 2.3359344598198543e-06, + "loss": 0.3231, + "step": 12858 + }, + { + "epoch": 1.5648311530270762, + "grad_norm": 2.074265956878662, + "learning_rate": 2.334681566022363e-06, + "loss": 0.3898, + "step": 12859 + }, + { + "epoch": 1.5649528445390934, + "grad_norm": 1.7688438892364502, + "learning_rate": 2.3334289639061026e-06, + "loss": 0.3453, + "step": 12860 + }, + { + "epoch": 1.5650745360511105, + "grad_norm": 1.606223225593567, + "learning_rate": 2.332176653518742e-06, + "loss": 0.3823, + "step": 12861 + }, + { + "epoch": 1.5651962275631275, + "grad_norm": 1.3803592920303345, + "learning_rate": 2.3309246349079306e-06, + "loss": 0.3314, + "step": 12862 + }, + { + "epoch": 1.5653179190751445, + "grad_norm": 2.2646665573120117, + "learning_rate": 2.3296729081213077e-06, + "loss": 0.3543, + "step": 12863 + }, + { + "epoch": 1.5654396105871615, + "grad_norm": 2.098952293395996, + "learning_rate": 2.328421473206511e-06, + "loss": 0.3423, + "step": 12864 + }, + { + "epoch": 1.5655613020991785, + "grad_norm": 1.7026997804641724, + "learning_rate": 2.327170330211156e-06, + "loss": 0.3363, + "step": 12865 + }, + { + "epoch": 1.5656829936111958, + "grad_norm": 5.189004898071289, + "learning_rate": 2.32591947918285e-06, + "loss": 0.4746, + "step": 12866 + }, + { + "epoch": 1.5658046851232128, + "grad_norm": 1.9921131134033203, + "learning_rate": 2.3246689201691943e-06, + "loss": 0.3542, + "step": 12867 + }, + { + "epoch": 1.5659263766352298, + "grad_norm": 2.1459908485412598, + "learning_rate": 2.3234186532177695e-06, + "loss": 0.4049, + "step": 12868 + }, + { + "epoch": 1.5660480681472468, + "grad_norm": 1.9813066720962524, + "learning_rate": 2.3221686783761623e-06, + "loss": 0.3422, + "step": 12869 + }, + { + "epoch": 1.5661697596592639, + "grad_norm": 1.6277923583984375, + "learning_rate": 2.3209189956919243e-06, + "loss": 0.3814, + "step": 12870 + }, + { + "epoch": 1.5662914511712809, + "grad_norm": 1.8347305059432983, + "learning_rate": 2.3196696052126122e-06, + "loss": 0.3676, + "step": 12871 + }, + { + "epoch": 1.566413142683298, + "grad_norm": 1.9091711044311523, + "learning_rate": 2.318420506985771e-06, + "loss": 0.4076, + "step": 12872 + }, + { + "epoch": 1.566534834195315, + "grad_norm": 1.3776190280914307, + "learning_rate": 2.317171701058929e-06, + "loss": 0.3459, + "step": 12873 + }, + { + "epoch": 1.566656525707332, + "grad_norm": 1.734372615814209, + "learning_rate": 2.315923187479604e-06, + "loss": 0.3479, + "step": 12874 + }, + { + "epoch": 1.566778217219349, + "grad_norm": 1.9317102432250977, + "learning_rate": 2.3146749662953105e-06, + "loss": 0.355, + "step": 12875 + }, + { + "epoch": 1.566899908731366, + "grad_norm": 1.7534362077713013, + "learning_rate": 2.3134270375535384e-06, + "loss": 0.3511, + "step": 12876 + }, + { + "epoch": 1.567021600243383, + "grad_norm": 2.6851043701171875, + "learning_rate": 2.312179401301782e-06, + "loss": 0.3539, + "step": 12877 + }, + { + "epoch": 1.5671432917554, + "grad_norm": 1.8159314393997192, + "learning_rate": 2.3109320575875104e-06, + "loss": 0.3932, + "step": 12878 + }, + { + "epoch": 1.567264983267417, + "grad_norm": 2.1902477741241455, + "learning_rate": 2.3096850064581878e-06, + "loss": 0.3778, + "step": 12879 + }, + { + "epoch": 1.567386674779434, + "grad_norm": 2.8415133953094482, + "learning_rate": 2.308438247961272e-06, + "loss": 0.3721, + "step": 12880 + }, + { + "epoch": 1.567508366291451, + "grad_norm": 2.1026830673217773, + "learning_rate": 2.3071917821442037e-06, + "loss": 0.4102, + "step": 12881 + }, + { + "epoch": 1.567630057803468, + "grad_norm": 2.2709245681762695, + "learning_rate": 2.3059456090544053e-06, + "loss": 0.3372, + "step": 12882 + }, + { + "epoch": 1.567751749315485, + "grad_norm": 1.5737155675888062, + "learning_rate": 2.3046997287393056e-06, + "loss": 0.3801, + "step": 12883 + }, + { + "epoch": 1.5678734408275021, + "grad_norm": 2.4267656803131104, + "learning_rate": 2.3034541412463075e-06, + "loss": 0.3498, + "step": 12884 + }, + { + "epoch": 1.5679951323395194, + "grad_norm": 2.2090985774993896, + "learning_rate": 2.302208846622812e-06, + "loss": 0.3471, + "step": 12885 + }, + { + "epoch": 1.5681168238515364, + "grad_norm": 2.4376964569091797, + "learning_rate": 2.300963844916204e-06, + "loss": 0.3633, + "step": 12886 + }, + { + "epoch": 1.5682385153635534, + "grad_norm": 2.6460013389587402, + "learning_rate": 2.2997191361738545e-06, + "loss": 0.3374, + "step": 12887 + }, + { + "epoch": 1.5683602068755704, + "grad_norm": 2.4712672233581543, + "learning_rate": 2.2984747204431345e-06, + "loss": 0.361, + "step": 12888 + }, + { + "epoch": 1.5684818983875874, + "grad_norm": 1.6455574035644531, + "learning_rate": 2.2972305977713918e-06, + "loss": 0.3658, + "step": 12889 + }, + { + "epoch": 1.5686035898996045, + "grad_norm": 2.4127776622772217, + "learning_rate": 2.2959867682059654e-06, + "loss": 0.3467, + "step": 12890 + }, + { + "epoch": 1.5687252814116215, + "grad_norm": 1.2532869577407837, + "learning_rate": 2.294743231794193e-06, + "loss": 0.3152, + "step": 12891 + }, + { + "epoch": 1.5688469729236387, + "grad_norm": 2.937251567840576, + "learning_rate": 2.2934999885833885e-06, + "loss": 0.4389, + "step": 12892 + }, + { + "epoch": 1.5689686644356557, + "grad_norm": 2.0350735187530518, + "learning_rate": 2.292257038620862e-06, + "loss": 0.363, + "step": 12893 + }, + { + "epoch": 1.5690903559476728, + "grad_norm": 2.337066888809204, + "learning_rate": 2.2910143819539087e-06, + "loss": 0.3391, + "step": 12894 + }, + { + "epoch": 1.5692120474596898, + "grad_norm": 1.6901695728302002, + "learning_rate": 2.289772018629812e-06, + "loss": 0.3862, + "step": 12895 + }, + { + "epoch": 1.5693337389717068, + "grad_norm": 2.0855400562286377, + "learning_rate": 2.2885299486958524e-06, + "loss": 0.3763, + "step": 12896 + }, + { + "epoch": 1.5694554304837238, + "grad_norm": 2.361114740371704, + "learning_rate": 2.2872881721992903e-06, + "loss": 0.3595, + "step": 12897 + }, + { + "epoch": 1.5695771219957408, + "grad_norm": 1.7107714414596558, + "learning_rate": 2.2860466891873743e-06, + "loss": 0.3336, + "step": 12898 + }, + { + "epoch": 1.5696988135077579, + "grad_norm": 2.0851094722747803, + "learning_rate": 2.284805499707352e-06, + "loss": 0.4057, + "step": 12899 + }, + { + "epoch": 1.5698205050197749, + "grad_norm": 3.4108095169067383, + "learning_rate": 2.283564603806451e-06, + "loss": 0.4048, + "step": 12900 + }, + { + "epoch": 1.569942196531792, + "grad_norm": 1.425576090812683, + "learning_rate": 2.282324001531885e-06, + "loss": 0.3624, + "step": 12901 + }, + { + "epoch": 1.570063888043809, + "grad_norm": 2.004263401031494, + "learning_rate": 2.281083692930868e-06, + "loss": 0.3748, + "step": 12902 + }, + { + "epoch": 1.570185579555826, + "grad_norm": 2.1680753231048584, + "learning_rate": 2.2798436780505937e-06, + "loss": 0.3442, + "step": 12903 + }, + { + "epoch": 1.570307271067843, + "grad_norm": 2.3241710662841797, + "learning_rate": 2.278603956938248e-06, + "loss": 0.2941, + "step": 12904 + }, + { + "epoch": 1.57042896257986, + "grad_norm": 1.7455425262451172, + "learning_rate": 2.277364529641004e-06, + "loss": 0.3813, + "step": 12905 + }, + { + "epoch": 1.570550654091877, + "grad_norm": 1.4648090600967407, + "learning_rate": 2.2761253962060216e-06, + "loss": 0.3449, + "step": 12906 + }, + { + "epoch": 1.570672345603894, + "grad_norm": 2.0846569538116455, + "learning_rate": 2.274886556680458e-06, + "loss": 0.3482, + "step": 12907 + }, + { + "epoch": 1.570794037115911, + "grad_norm": 1.5035845041275024, + "learning_rate": 2.2736480111114512e-06, + "loss": 0.3419, + "step": 12908 + }, + { + "epoch": 1.570915728627928, + "grad_norm": 2.0656044483184814, + "learning_rate": 2.2724097595461283e-06, + "loss": 0.3455, + "step": 12909 + }, + { + "epoch": 1.571037420139945, + "grad_norm": 1.9806627035140991, + "learning_rate": 2.271171802031611e-06, + "loss": 0.4156, + "step": 12910 + }, + { + "epoch": 1.5711591116519623, + "grad_norm": 2.290090322494507, + "learning_rate": 2.2699341386150044e-06, + "loss": 0.3184, + "step": 12911 + }, + { + "epoch": 1.5712808031639793, + "grad_norm": 2.0771074295043945, + "learning_rate": 2.2686967693434024e-06, + "loss": 0.385, + "step": 12912 + }, + { + "epoch": 1.5714024946759964, + "grad_norm": 1.6272902488708496, + "learning_rate": 2.2674596942638927e-06, + "loss": 0.3513, + "step": 12913 + }, + { + "epoch": 1.5715241861880134, + "grad_norm": 1.7542093992233276, + "learning_rate": 2.2662229134235482e-06, + "loss": 0.415, + "step": 12914 + }, + { + "epoch": 1.5716458777000304, + "grad_norm": 1.5956990718841553, + "learning_rate": 2.2649864268694298e-06, + "loss": 0.3545, + "step": 12915 + }, + { + "epoch": 1.5717675692120474, + "grad_norm": 1.7092236280441284, + "learning_rate": 2.2637502346485875e-06, + "loss": 0.3977, + "step": 12916 + }, + { + "epoch": 1.5718892607240647, + "grad_norm": 2.1471784114837646, + "learning_rate": 2.2625143368080604e-06, + "loss": 0.434, + "step": 12917 + }, + { + "epoch": 1.5720109522360817, + "grad_norm": 1.582375168800354, + "learning_rate": 2.2612787333948805e-06, + "loss": 0.3339, + "step": 12918 + }, + { + "epoch": 1.5721326437480987, + "grad_norm": 2.6699843406677246, + "learning_rate": 2.2600434244560632e-06, + "loss": 0.2913, + "step": 12919 + }, + { + "epoch": 1.5722543352601157, + "grad_norm": 2.6099820137023926, + "learning_rate": 2.258808410038612e-06, + "loss": 0.4198, + "step": 12920 + }, + { + "epoch": 1.5723760267721327, + "grad_norm": 1.3790663480758667, + "learning_rate": 2.257573690189526e-06, + "loss": 0.3356, + "step": 12921 + }, + { + "epoch": 1.5724977182841497, + "grad_norm": 1.509026050567627, + "learning_rate": 2.256339264955788e-06, + "loss": 0.3649, + "step": 12922 + }, + { + "epoch": 1.5726194097961668, + "grad_norm": 1.3774064779281616, + "learning_rate": 2.2551051343843666e-06, + "loss": 0.3561, + "step": 12923 + }, + { + "epoch": 1.5727411013081838, + "grad_norm": 3.1276044845581055, + "learning_rate": 2.253871298522229e-06, + "loss": 0.3388, + "step": 12924 + }, + { + "epoch": 1.5728627928202008, + "grad_norm": 2.3575377464294434, + "learning_rate": 2.2526377574163204e-06, + "loss": 0.3746, + "step": 12925 + }, + { + "epoch": 1.5729844843322178, + "grad_norm": 2.5346429347991943, + "learning_rate": 2.2514045111135796e-06, + "loss": 0.3006, + "step": 12926 + }, + { + "epoch": 1.5731061758442348, + "grad_norm": 1.745758056640625, + "learning_rate": 2.250171559660942e-06, + "loss": 0.3988, + "step": 12927 + }, + { + "epoch": 1.5732278673562519, + "grad_norm": 3.1045055389404297, + "learning_rate": 2.248938903105312e-06, + "loss": 0.4123, + "step": 12928 + }, + { + "epoch": 1.5733495588682689, + "grad_norm": 2.329341173171997, + "learning_rate": 2.247706541493603e-06, + "loss": 0.4086, + "step": 12929 + }, + { + "epoch": 1.573471250380286, + "grad_norm": 1.997171401977539, + "learning_rate": 2.2464744748727065e-06, + "loss": 0.391, + "step": 12930 + }, + { + "epoch": 1.573592941892303, + "grad_norm": 1.5768593549728394, + "learning_rate": 2.245242703289502e-06, + "loss": 0.37, + "step": 12931 + }, + { + "epoch": 1.57371463340432, + "grad_norm": 1.5939037799835205, + "learning_rate": 2.2440112267908678e-06, + "loss": 0.3793, + "step": 12932 + }, + { + "epoch": 1.573836324916337, + "grad_norm": 3.788174629211426, + "learning_rate": 2.2427800454236603e-06, + "loss": 0.2802, + "step": 12933 + }, + { + "epoch": 1.573958016428354, + "grad_norm": 1.968440055847168, + "learning_rate": 2.241549159234725e-06, + "loss": 0.3588, + "step": 12934 + }, + { + "epoch": 1.574079707940371, + "grad_norm": 1.7688645124435425, + "learning_rate": 2.240318568270906e-06, + "loss": 0.3725, + "step": 12935 + }, + { + "epoch": 1.5742013994523882, + "grad_norm": 2.482125759124756, + "learning_rate": 2.2390882725790255e-06, + "loss": 0.28, + "step": 12936 + }, + { + "epoch": 1.5743230909644053, + "grad_norm": 1.6216914653778076, + "learning_rate": 2.2378582722059027e-06, + "loss": 0.3724, + "step": 12937 + }, + { + "epoch": 1.5744447824764223, + "grad_norm": 1.8637388944625854, + "learning_rate": 2.2366285671983424e-06, + "loss": 0.3854, + "step": 12938 + }, + { + "epoch": 1.5745664739884393, + "grad_norm": 2.8731648921966553, + "learning_rate": 2.2353991576031286e-06, + "loss": 0.4259, + "step": 12939 + }, + { + "epoch": 1.5746881655004563, + "grad_norm": 1.7532217502593994, + "learning_rate": 2.234170043467051e-06, + "loss": 0.3367, + "step": 12940 + }, + { + "epoch": 1.5748098570124733, + "grad_norm": 1.7763198614120483, + "learning_rate": 2.2329412248368786e-06, + "loss": 0.3825, + "step": 12941 + }, + { + "epoch": 1.5749315485244906, + "grad_norm": 1.456264615058899, + "learning_rate": 2.2317127017593654e-06, + "loss": 0.3418, + "step": 12942 + }, + { + "epoch": 1.5750532400365076, + "grad_norm": 1.8870912790298462, + "learning_rate": 2.230484474281267e-06, + "loss": 0.4288, + "step": 12943 + }, + { + "epoch": 1.5751749315485246, + "grad_norm": 1.6714569330215454, + "learning_rate": 2.229256542449314e-06, + "loss": 0.3915, + "step": 12944 + }, + { + "epoch": 1.5752966230605416, + "grad_norm": 1.7826646566390991, + "learning_rate": 2.2280289063102365e-06, + "loss": 0.4295, + "step": 12945 + }, + { + "epoch": 1.5754183145725587, + "grad_norm": 1.8751949071884155, + "learning_rate": 2.2268015659107456e-06, + "loss": 0.3883, + "step": 12946 + }, + { + "epoch": 1.5755400060845757, + "grad_norm": 1.8790628910064697, + "learning_rate": 2.2255745212975433e-06, + "loss": 0.3593, + "step": 12947 + }, + { + "epoch": 1.5756616975965927, + "grad_norm": 1.7039514780044556, + "learning_rate": 2.2243477725173246e-06, + "loss": 0.3979, + "step": 12948 + }, + { + "epoch": 1.5757833891086097, + "grad_norm": 1.5635474920272827, + "learning_rate": 2.2231213196167678e-06, + "loss": 0.3082, + "step": 12949 + }, + { + "epoch": 1.5759050806206267, + "grad_norm": 2.9554731845855713, + "learning_rate": 2.221895162642542e-06, + "loss": 0.289, + "step": 12950 + }, + { + "epoch": 1.5760267721326437, + "grad_norm": 2.4131555557250977, + "learning_rate": 2.220669301641305e-06, + "loss": 0.3073, + "step": 12951 + }, + { + "epoch": 1.5761484636446608, + "grad_norm": 2.1077520847320557, + "learning_rate": 2.219443736659701e-06, + "loss": 0.3339, + "step": 12952 + }, + { + "epoch": 1.5762701551566778, + "grad_norm": 2.155916213989258, + "learning_rate": 2.218218467744371e-06, + "loss": 0.3404, + "step": 12953 + }, + { + "epoch": 1.5763918466686948, + "grad_norm": 2.0844171047210693, + "learning_rate": 2.2169934949419345e-06, + "loss": 0.3724, + "step": 12954 + }, + { + "epoch": 1.5765135381807118, + "grad_norm": 1.671710729598999, + "learning_rate": 2.2157688182990033e-06, + "loss": 0.4115, + "step": 12955 + }, + { + "epoch": 1.5766352296927288, + "grad_norm": 1.7781239748001099, + "learning_rate": 2.2145444378621837e-06, + "loss": 0.3941, + "step": 12956 + }, + { + "epoch": 1.5767569212047459, + "grad_norm": 1.587677240371704, + "learning_rate": 2.2133203536780635e-06, + "loss": 0.363, + "step": 12957 + }, + { + "epoch": 1.5768786127167629, + "grad_norm": 1.8353302478790283, + "learning_rate": 2.212096565793218e-06, + "loss": 0.3517, + "step": 12958 + }, + { + "epoch": 1.57700030422878, + "grad_norm": 2.0180232524871826, + "learning_rate": 2.2108730742542215e-06, + "loss": 0.3442, + "step": 12959 + }, + { + "epoch": 1.577121995740797, + "grad_norm": 2.873677968978882, + "learning_rate": 2.209649879107627e-06, + "loss": 0.4144, + "step": 12960 + }, + { + "epoch": 1.5772436872528142, + "grad_norm": 2.216658115386963, + "learning_rate": 2.2084269803999803e-06, + "loss": 0.3685, + "step": 12961 + }, + { + "epoch": 1.5773653787648312, + "grad_norm": 2.5188417434692383, + "learning_rate": 2.207204378177814e-06, + "loss": 0.3765, + "step": 12962 + }, + { + "epoch": 1.5774870702768482, + "grad_norm": 1.9065752029418945, + "learning_rate": 2.2059820724876478e-06, + "loss": 0.364, + "step": 12963 + }, + { + "epoch": 1.5776087617888652, + "grad_norm": 1.6783907413482666, + "learning_rate": 2.204760063376e-06, + "loss": 0.3583, + "step": 12964 + }, + { + "epoch": 1.5777304533008822, + "grad_norm": 2.5551974773406982, + "learning_rate": 2.2035383508893682e-06, + "loss": 0.4302, + "step": 12965 + }, + { + "epoch": 1.5778521448128993, + "grad_norm": 3.1080923080444336, + "learning_rate": 2.202316935074236e-06, + "loss": 0.339, + "step": 12966 + }, + { + "epoch": 1.5779738363249165, + "grad_norm": 1.7808754444122314, + "learning_rate": 2.201095815977087e-06, + "loss": 0.3371, + "step": 12967 + }, + { + "epoch": 1.5780955278369335, + "grad_norm": 1.6282432079315186, + "learning_rate": 2.1998749936443864e-06, + "loss": 0.3551, + "step": 12968 + }, + { + "epoch": 1.5782172193489505, + "grad_norm": 1.782135248184204, + "learning_rate": 2.198654468122584e-06, + "loss": 0.3512, + "step": 12969 + }, + { + "epoch": 1.5783389108609676, + "grad_norm": 1.5888426303863525, + "learning_rate": 2.1974342394581293e-06, + "loss": 0.3344, + "step": 12970 + }, + { + "epoch": 1.5784606023729846, + "grad_norm": 2.6294143199920654, + "learning_rate": 2.196214307697453e-06, + "loss": 0.3854, + "step": 12971 + }, + { + "epoch": 1.5785822938850016, + "grad_norm": 2.163494825363159, + "learning_rate": 2.194994672886973e-06, + "loss": 0.4103, + "step": 12972 + }, + { + "epoch": 1.5787039853970186, + "grad_norm": 2.9790916442871094, + "learning_rate": 2.193775335073106e-06, + "loss": 0.4395, + "step": 12973 + }, + { + "epoch": 1.5788256769090356, + "grad_norm": 1.5964429378509521, + "learning_rate": 2.192556294302239e-06, + "loss": 0.3879, + "step": 12974 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 3.1039741039276123, + "learning_rate": 2.1913375506207703e-06, + "loss": 0.4087, + "step": 12975 + }, + { + "epoch": 1.5790690599330697, + "grad_norm": 1.4071731567382812, + "learning_rate": 2.190119104075069e-06, + "loss": 0.3467, + "step": 12976 + }, + { + "epoch": 1.5791907514450867, + "grad_norm": 2.2165424823760986, + "learning_rate": 2.1889009547115003e-06, + "loss": 0.3726, + "step": 12977 + }, + { + "epoch": 1.5793124429571037, + "grad_norm": 1.5575865507125854, + "learning_rate": 2.18768310257642e-06, + "loss": 0.3843, + "step": 12978 + }, + { + "epoch": 1.5794341344691207, + "grad_norm": 2.5923357009887695, + "learning_rate": 2.186465547716168e-06, + "loss": 0.4024, + "step": 12979 + }, + { + "epoch": 1.5795558259811378, + "grad_norm": 2.053741931915283, + "learning_rate": 2.1852482901770734e-06, + "loss": 0.3783, + "step": 12980 + }, + { + "epoch": 1.5796775174931548, + "grad_norm": 1.9022127389907837, + "learning_rate": 2.1840313300054592e-06, + "loss": 0.4115, + "step": 12981 + }, + { + "epoch": 1.5797992090051718, + "grad_norm": 1.603399634361267, + "learning_rate": 2.18281466724763e-06, + "loss": 0.3688, + "step": 12982 + }, + { + "epoch": 1.5799209005171888, + "grad_norm": 1.7983378171920776, + "learning_rate": 2.181598301949882e-06, + "loss": 0.3906, + "step": 12983 + }, + { + "epoch": 1.5800425920292058, + "grad_norm": 1.6356163024902344, + "learning_rate": 2.1803822341585066e-06, + "loss": 0.365, + "step": 12984 + }, + { + "epoch": 1.5801642835412228, + "grad_norm": 2.31428861618042, + "learning_rate": 2.1791664639197683e-06, + "loss": 0.3253, + "step": 12985 + }, + { + "epoch": 1.58028597505324, + "grad_norm": 4.362576961517334, + "learning_rate": 2.1779509912799367e-06, + "loss": 0.3318, + "step": 12986 + }, + { + "epoch": 1.580407666565257, + "grad_norm": 1.7996320724487305, + "learning_rate": 2.17673581628526e-06, + "loss": 0.383, + "step": 12987 + }, + { + "epoch": 1.5805293580772741, + "grad_norm": 1.5419590473175049, + "learning_rate": 2.1755209389819764e-06, + "loss": 0.3869, + "step": 12988 + }, + { + "epoch": 1.5806510495892911, + "grad_norm": 3.000788927078247, + "learning_rate": 2.1743063594163205e-06, + "loss": 0.4218, + "step": 12989 + }, + { + "epoch": 1.5807727411013082, + "grad_norm": 3.0800633430480957, + "learning_rate": 2.1730920776345043e-06, + "loss": 0.3232, + "step": 12990 + }, + { + "epoch": 1.5808944326133252, + "grad_norm": 1.633508563041687, + "learning_rate": 2.171878093682733e-06, + "loss": 0.3576, + "step": 12991 + }, + { + "epoch": 1.5810161241253422, + "grad_norm": 3.2565622329711914, + "learning_rate": 2.170664407607207e-06, + "loss": 0.2872, + "step": 12992 + }, + { + "epoch": 1.5811378156373594, + "grad_norm": 1.6280717849731445, + "learning_rate": 2.1694510194541063e-06, + "loss": 0.322, + "step": 12993 + }, + { + "epoch": 1.5812595071493765, + "grad_norm": 3.7198541164398193, + "learning_rate": 2.1682379292695997e-06, + "loss": 0.4513, + "step": 12994 + }, + { + "epoch": 1.5813811986613935, + "grad_norm": 2.5929200649261475, + "learning_rate": 2.167025137099853e-06, + "loss": 0.3744, + "step": 12995 + }, + { + "epoch": 1.5815028901734105, + "grad_norm": 1.6465598344802856, + "learning_rate": 2.1658126429910133e-06, + "loss": 0.3362, + "step": 12996 + }, + { + "epoch": 1.5816245816854275, + "grad_norm": 1.9195952415466309, + "learning_rate": 2.164600446989219e-06, + "loss": 0.3038, + "step": 12997 + }, + { + "epoch": 1.5817462731974445, + "grad_norm": 1.4766395092010498, + "learning_rate": 2.163388549140595e-06, + "loss": 0.3336, + "step": 12998 + }, + { + "epoch": 1.5818679647094616, + "grad_norm": 1.4439332485198975, + "learning_rate": 2.1621769494912558e-06, + "loss": 0.3296, + "step": 12999 + }, + { + "epoch": 1.5819896562214786, + "grad_norm": 1.8840376138687134, + "learning_rate": 2.1609656480873097e-06, + "loss": 0.3503, + "step": 13000 + }, + { + "epoch": 1.5821113477334956, + "grad_norm": 2.572530508041382, + "learning_rate": 2.1597546449748463e-06, + "loss": 0.3997, + "step": 13001 + }, + { + "epoch": 1.5822330392455126, + "grad_norm": 2.019007921218872, + "learning_rate": 2.158543940199944e-06, + "loss": 0.384, + "step": 13002 + }, + { + "epoch": 1.5823547307575296, + "grad_norm": 2.364663600921631, + "learning_rate": 2.1573335338086785e-06, + "loss": 0.4355, + "step": 13003 + }, + { + "epoch": 1.5824764222695467, + "grad_norm": 1.8007522821426392, + "learning_rate": 2.156123425847103e-06, + "loss": 0.3396, + "step": 13004 + }, + { + "epoch": 1.5825981137815637, + "grad_norm": 1.7086308002471924, + "learning_rate": 2.1549136163612703e-06, + "loss": 0.3405, + "step": 13005 + }, + { + "epoch": 1.5827198052935807, + "grad_norm": 1.6272873878479004, + "learning_rate": 2.153704105397213e-06, + "loss": 0.387, + "step": 13006 + }, + { + "epoch": 1.5828414968055977, + "grad_norm": 1.6640149354934692, + "learning_rate": 2.1524948930009548e-06, + "loss": 0.3016, + "step": 13007 + }, + { + "epoch": 1.5829631883176147, + "grad_norm": 2.0438344478607178, + "learning_rate": 2.151285979218509e-06, + "loss": 0.3294, + "step": 13008 + }, + { + "epoch": 1.5830848798296318, + "grad_norm": 3.8273937702178955, + "learning_rate": 2.150077364095876e-06, + "loss": 0.4172, + "step": 13009 + }, + { + "epoch": 1.5832065713416488, + "grad_norm": 2.200937509536743, + "learning_rate": 2.1488690476790497e-06, + "loss": 0.378, + "step": 13010 + }, + { + "epoch": 1.5833282628536658, + "grad_norm": 1.7602485418319702, + "learning_rate": 2.147661030014008e-06, + "loss": 0.3285, + "step": 13011 + }, + { + "epoch": 1.583449954365683, + "grad_norm": 2.479182720184326, + "learning_rate": 2.1464533111467136e-06, + "loss": 0.4261, + "step": 13012 + }, + { + "epoch": 1.5835716458777, + "grad_norm": 1.859266757965088, + "learning_rate": 2.14524589112313e-06, + "loss": 0.3381, + "step": 13013 + }, + { + "epoch": 1.583693337389717, + "grad_norm": 1.5318127870559692, + "learning_rate": 2.1440387699891986e-06, + "loss": 0.3229, + "step": 13014 + }, + { + "epoch": 1.583815028901734, + "grad_norm": 1.622456669807434, + "learning_rate": 2.14283194779085e-06, + "loss": 0.3122, + "step": 13015 + }, + { + "epoch": 1.583936720413751, + "grad_norm": 1.5536085367202759, + "learning_rate": 2.141625424574012e-06, + "loss": 0.3398, + "step": 13016 + }, + { + "epoch": 1.5840584119257681, + "grad_norm": 2.3423728942871094, + "learning_rate": 2.1404192003845924e-06, + "loss": 0.4037, + "step": 13017 + }, + { + "epoch": 1.5841801034377854, + "grad_norm": 2.2760283946990967, + "learning_rate": 2.1392132752684893e-06, + "loss": 0.3952, + "step": 13018 + }, + { + "epoch": 1.5843017949498024, + "grad_norm": 1.316992163658142, + "learning_rate": 2.1380076492715963e-06, + "loss": 0.3143, + "step": 13019 + }, + { + "epoch": 1.5844234864618194, + "grad_norm": 2.216892957687378, + "learning_rate": 2.1368023224397793e-06, + "loss": 0.4307, + "step": 13020 + }, + { + "epoch": 1.5845451779738364, + "grad_norm": 1.8537057638168335, + "learning_rate": 2.135597294818914e-06, + "loss": 0.3856, + "step": 13021 + }, + { + "epoch": 1.5846668694858534, + "grad_norm": 2.3461999893188477, + "learning_rate": 2.1343925664548505e-06, + "loss": 0.3605, + "step": 13022 + }, + { + "epoch": 1.5847885609978705, + "grad_norm": 1.6738178730010986, + "learning_rate": 2.133188137393427e-06, + "loss": 0.3754, + "step": 13023 + }, + { + "epoch": 1.5849102525098875, + "grad_norm": 1.6088544130325317, + "learning_rate": 2.131984007680481e-06, + "loss": 0.3256, + "step": 13024 + }, + { + "epoch": 1.5850319440219045, + "grad_norm": 1.8200000524520874, + "learning_rate": 2.1307801773618296e-06, + "loss": 0.4206, + "step": 13025 + }, + { + "epoch": 1.5851536355339215, + "grad_norm": 1.6171196699142456, + "learning_rate": 2.1295766464832777e-06, + "loss": 0.3908, + "step": 13026 + }, + { + "epoch": 1.5852753270459385, + "grad_norm": 1.6358188390731812, + "learning_rate": 2.1283734150906288e-06, + "loss": 0.3749, + "step": 13027 + }, + { + "epoch": 1.5853970185579556, + "grad_norm": 2.141417980194092, + "learning_rate": 2.127170483229665e-06, + "loss": 0.4208, + "step": 13028 + }, + { + "epoch": 1.5855187100699726, + "grad_norm": 1.5128798484802246, + "learning_rate": 2.1259678509461567e-06, + "loss": 0.3967, + "step": 13029 + }, + { + "epoch": 1.5856404015819896, + "grad_norm": 2.1067793369293213, + "learning_rate": 2.124765518285875e-06, + "loss": 0.3414, + "step": 13030 + }, + { + "epoch": 1.5857620930940066, + "grad_norm": 2.328778028488159, + "learning_rate": 2.123563485294562e-06, + "loss": 0.3812, + "step": 13031 + }, + { + "epoch": 1.5858837846060236, + "grad_norm": 1.975500226020813, + "learning_rate": 2.122361752017964e-06, + "loss": 0.4166, + "step": 13032 + }, + { + "epoch": 1.5860054761180407, + "grad_norm": 1.896972417831421, + "learning_rate": 2.121160318501807e-06, + "loss": 0.3922, + "step": 13033 + }, + { + "epoch": 1.5861271676300577, + "grad_norm": 2.4338061809539795, + "learning_rate": 2.1199591847918053e-06, + "loss": 0.4501, + "step": 13034 + }, + { + "epoch": 1.5862488591420747, + "grad_norm": 4.087060451507568, + "learning_rate": 2.1187583509336707e-06, + "loss": 0.2745, + "step": 13035 + }, + { + "epoch": 1.5863705506540917, + "grad_norm": 2.049020528793335, + "learning_rate": 2.1175578169730936e-06, + "loss": 0.3481, + "step": 13036 + }, + { + "epoch": 1.586492242166109, + "grad_norm": 1.684091567993164, + "learning_rate": 2.116357582955755e-06, + "loss": 0.4156, + "step": 13037 + }, + { + "epoch": 1.586613933678126, + "grad_norm": 1.9463108777999878, + "learning_rate": 2.1151576489273306e-06, + "loss": 0.3598, + "step": 13038 + }, + { + "epoch": 1.586735625190143, + "grad_norm": 1.5346837043762207, + "learning_rate": 2.113958014933478e-06, + "loss": 0.3811, + "step": 13039 + }, + { + "epoch": 1.58685731670216, + "grad_norm": 1.8729277849197388, + "learning_rate": 2.1127586810198454e-06, + "loss": 0.371, + "step": 13040 + }, + { + "epoch": 1.586979008214177, + "grad_norm": 2.132922887802124, + "learning_rate": 2.111559647232072e-06, + "loss": 0.4112, + "step": 13041 + }, + { + "epoch": 1.587100699726194, + "grad_norm": 2.478147506713867, + "learning_rate": 2.110360913615782e-06, + "loss": 0.3597, + "step": 13042 + }, + { + "epoch": 1.5872223912382113, + "grad_norm": 2.210583448410034, + "learning_rate": 2.10916248021659e-06, + "loss": 0.3784, + "step": 13043 + }, + { + "epoch": 1.5873440827502283, + "grad_norm": 3.000032901763916, + "learning_rate": 2.107964347080099e-06, + "loss": 0.3495, + "step": 13044 + }, + { + "epoch": 1.5874657742622453, + "grad_norm": 1.791088581085205, + "learning_rate": 2.1067665142518976e-06, + "loss": 0.3636, + "step": 13045 + }, + { + "epoch": 1.5875874657742624, + "grad_norm": 1.8577041625976562, + "learning_rate": 2.1055689817775714e-06, + "loss": 0.3525, + "step": 13046 + }, + { + "epoch": 1.5877091572862794, + "grad_norm": 1.9904578924179077, + "learning_rate": 2.1043717497026863e-06, + "loss": 0.3866, + "step": 13047 + }, + { + "epoch": 1.5878308487982964, + "grad_norm": 1.590952754020691, + "learning_rate": 2.103174818072796e-06, + "loss": 0.3487, + "step": 13048 + }, + { + "epoch": 1.5879525403103134, + "grad_norm": 1.942191481590271, + "learning_rate": 2.101978186933452e-06, + "loss": 0.3676, + "step": 13049 + }, + { + "epoch": 1.5880742318223304, + "grad_norm": 1.9794538021087646, + "learning_rate": 2.1007818563301863e-06, + "loss": 0.3179, + "step": 13050 + }, + { + "epoch": 1.5881959233343474, + "grad_norm": 1.604133129119873, + "learning_rate": 2.099585826308518e-06, + "loss": 0.3418, + "step": 13051 + }, + { + "epoch": 1.5883176148463645, + "grad_norm": 1.8325567245483398, + "learning_rate": 2.098390096913965e-06, + "loss": 0.3253, + "step": 13052 + }, + { + "epoch": 1.5884393063583815, + "grad_norm": 3.32185435295105, + "learning_rate": 2.097194668192025e-06, + "loss": 0.4288, + "step": 13053 + }, + { + "epoch": 1.5885609978703985, + "grad_norm": 1.7816901206970215, + "learning_rate": 2.095999540188185e-06, + "loss": 0.3651, + "step": 13054 + }, + { + "epoch": 1.5886826893824155, + "grad_norm": 3.4924569129943848, + "learning_rate": 2.094804712947923e-06, + "loss": 0.4112, + "step": 13055 + }, + { + "epoch": 1.5888043808944325, + "grad_norm": 1.9935883283615112, + "learning_rate": 2.093610186516701e-06, + "loss": 0.4249, + "step": 13056 + }, + { + "epoch": 1.5889260724064496, + "grad_norm": 1.885443925857544, + "learning_rate": 2.0924159609399808e-06, + "loss": 0.3687, + "step": 13057 + }, + { + "epoch": 1.5890477639184666, + "grad_norm": 1.448865294456482, + "learning_rate": 2.0912220362632005e-06, + "loss": 0.3873, + "step": 13058 + }, + { + "epoch": 1.5891694554304836, + "grad_norm": 1.9033385515213013, + "learning_rate": 2.0900284125317906e-06, + "loss": 0.3221, + "step": 13059 + }, + { + "epoch": 1.5892911469425006, + "grad_norm": 1.7482808828353882, + "learning_rate": 2.0888350897911736e-06, + "loss": 0.3581, + "step": 13060 + }, + { + "epoch": 1.5894128384545176, + "grad_norm": 1.5110366344451904, + "learning_rate": 2.087642068086754e-06, + "loss": 0.3589, + "step": 13061 + }, + { + "epoch": 1.5895345299665349, + "grad_norm": 1.431944727897644, + "learning_rate": 2.0864493474639335e-06, + "loss": 0.3495, + "step": 13062 + }, + { + "epoch": 1.589656221478552, + "grad_norm": 2.4168877601623535, + "learning_rate": 2.0852569279680967e-06, + "loss": 0.3776, + "step": 13063 + }, + { + "epoch": 1.589777912990569, + "grad_norm": 1.8519970178604126, + "learning_rate": 2.0840648096446135e-06, + "loss": 0.3793, + "step": 13064 + }, + { + "epoch": 1.589899604502586, + "grad_norm": 2.2232577800750732, + "learning_rate": 2.0828729925388547e-06, + "loss": 0.3782, + "step": 13065 + }, + { + "epoch": 1.590021296014603, + "grad_norm": 2.82106614112854, + "learning_rate": 2.0816814766961633e-06, + "loss": 0.4127, + "step": 13066 + }, + { + "epoch": 1.59014298752662, + "grad_norm": 2.5844547748565674, + "learning_rate": 2.0804902621618784e-06, + "loss": 0.4223, + "step": 13067 + }, + { + "epoch": 1.5902646790386372, + "grad_norm": 2.7374610900878906, + "learning_rate": 2.079299348981335e-06, + "loss": 0.3127, + "step": 13068 + }, + { + "epoch": 1.5903863705506542, + "grad_norm": 2.436783790588379, + "learning_rate": 2.0781087371998433e-06, + "loss": 0.3122, + "step": 13069 + }, + { + "epoch": 1.5905080620626713, + "grad_norm": 2.7289369106292725, + "learning_rate": 2.076918426862715e-06, + "loss": 0.315, + "step": 13070 + }, + { + "epoch": 1.5906297535746883, + "grad_norm": 1.6221330165863037, + "learning_rate": 2.0757284180152392e-06, + "loss": 0.3614, + "step": 13071 + }, + { + "epoch": 1.5907514450867053, + "grad_norm": 1.865172028541565, + "learning_rate": 2.074538710702697e-06, + "loss": 0.3647, + "step": 13072 + }, + { + "epoch": 1.5908731365987223, + "grad_norm": 1.4285074472427368, + "learning_rate": 2.073349304970366e-06, + "loss": 0.3416, + "step": 13073 + }, + { + "epoch": 1.5909948281107393, + "grad_norm": 2.0247199535369873, + "learning_rate": 2.0721602008634987e-06, + "loss": 0.3166, + "step": 13074 + }, + { + "epoch": 1.5911165196227564, + "grad_norm": 1.9108790159225464, + "learning_rate": 2.070971398427345e-06, + "loss": 0.3427, + "step": 13075 + }, + { + "epoch": 1.5912382111347734, + "grad_norm": 1.8942264318466187, + "learning_rate": 2.0697828977071456e-06, + "loss": 0.3535, + "step": 13076 + }, + { + "epoch": 1.5913599026467904, + "grad_norm": 3.7403979301452637, + "learning_rate": 2.0685946987481165e-06, + "loss": 0.4554, + "step": 13077 + }, + { + "epoch": 1.5914815941588074, + "grad_norm": 2.674006700515747, + "learning_rate": 2.0674068015954797e-06, + "loss": 0.433, + "step": 13078 + }, + { + "epoch": 1.5916032856708244, + "grad_norm": 2.5970053672790527, + "learning_rate": 2.066219206294433e-06, + "loss": 0.3701, + "step": 13079 + }, + { + "epoch": 1.5917249771828414, + "grad_norm": 1.6705175638198853, + "learning_rate": 2.0650319128901654e-06, + "loss": 0.3817, + "step": 13080 + }, + { + "epoch": 1.5918466686948585, + "grad_norm": 1.850736379623413, + "learning_rate": 2.06384492142786e-06, + "loss": 0.3169, + "step": 13081 + }, + { + "epoch": 1.5919683602068755, + "grad_norm": 1.7765023708343506, + "learning_rate": 2.062658231952682e-06, + "loss": 0.4159, + "step": 13082 + }, + { + "epoch": 1.5920900517188925, + "grad_norm": 1.430756688117981, + "learning_rate": 2.061471844509786e-06, + "loss": 0.366, + "step": 13083 + }, + { + "epoch": 1.5922117432309095, + "grad_norm": 2.4478580951690674, + "learning_rate": 2.0602857591443205e-06, + "loss": 0.3344, + "step": 13084 + }, + { + "epoch": 1.5923334347429265, + "grad_norm": 1.5864261388778687, + "learning_rate": 2.059099975901416e-06, + "loss": 0.3823, + "step": 13085 + }, + { + "epoch": 1.5924551262549436, + "grad_norm": 1.5701196193695068, + "learning_rate": 2.0579144948261918e-06, + "loss": 0.3122, + "step": 13086 + }, + { + "epoch": 1.5925768177669608, + "grad_norm": 1.8844201564788818, + "learning_rate": 2.0567293159637624e-06, + "loss": 0.3892, + "step": 13087 + }, + { + "epoch": 1.5926985092789778, + "grad_norm": 1.6080620288848877, + "learning_rate": 2.055544439359225e-06, + "loss": 0.3795, + "step": 13088 + }, + { + "epoch": 1.5928202007909948, + "grad_norm": 1.5697171688079834, + "learning_rate": 2.0543598650576645e-06, + "loss": 0.3777, + "step": 13089 + }, + { + "epoch": 1.5929418923030119, + "grad_norm": 2.5041136741638184, + "learning_rate": 2.0531755931041588e-06, + "loss": 0.3119, + "step": 13090 + }, + { + "epoch": 1.5930635838150289, + "grad_norm": 1.8614110946655273, + "learning_rate": 2.0519916235437674e-06, + "loss": 0.3164, + "step": 13091 + }, + { + "epoch": 1.593185275327046, + "grad_norm": 3.230666160583496, + "learning_rate": 2.05080795642155e-06, + "loss": 0.3372, + "step": 13092 + }, + { + "epoch": 1.593306966839063, + "grad_norm": 2.014282464981079, + "learning_rate": 2.049624591782543e-06, + "loss": 0.3377, + "step": 13093 + }, + { + "epoch": 1.5934286583510802, + "grad_norm": 1.7611933946609497, + "learning_rate": 2.048441529671774e-06, + "loss": 0.3384, + "step": 13094 + }, + { + "epoch": 1.5935503498630972, + "grad_norm": 2.6231980323791504, + "learning_rate": 2.047258770134266e-06, + "loss": 0.3954, + "step": 13095 + }, + { + "epoch": 1.5936720413751142, + "grad_norm": 1.7613611221313477, + "learning_rate": 2.046076313215024e-06, + "loss": 0.4118, + "step": 13096 + }, + { + "epoch": 1.5937937328871312, + "grad_norm": 1.9959487915039062, + "learning_rate": 2.044894158959039e-06, + "loss": 0.3672, + "step": 13097 + }, + { + "epoch": 1.5939154243991482, + "grad_norm": 1.6007417440414429, + "learning_rate": 2.0437123074112996e-06, + "loss": 0.4135, + "step": 13098 + }, + { + "epoch": 1.5940371159111653, + "grad_norm": 3.7402307987213135, + "learning_rate": 2.0425307586167763e-06, + "loss": 0.3884, + "step": 13099 + }, + { + "epoch": 1.5941588074231823, + "grad_norm": 1.7162389755249023, + "learning_rate": 2.0413495126204274e-06, + "loss": 0.3883, + "step": 13100 + }, + { + "epoch": 1.5942804989351993, + "grad_norm": 2.3073923587799072, + "learning_rate": 2.0401685694672045e-06, + "loss": 0.3392, + "step": 13101 + }, + { + "epoch": 1.5944021904472163, + "grad_norm": 2.4966931343078613, + "learning_rate": 2.03898792920204e-06, + "loss": 0.4019, + "step": 13102 + }, + { + "epoch": 1.5945238819592333, + "grad_norm": 1.4600239992141724, + "learning_rate": 2.037807591869866e-06, + "loss": 0.3528, + "step": 13103 + }, + { + "epoch": 1.5946455734712504, + "grad_norm": 1.5368931293487549, + "learning_rate": 2.0366275575155936e-06, + "loss": 0.3938, + "step": 13104 + }, + { + "epoch": 1.5947672649832674, + "grad_norm": 1.6008902788162231, + "learning_rate": 2.035447826184124e-06, + "loss": 0.3337, + "step": 13105 + }, + { + "epoch": 1.5948889564952844, + "grad_norm": 2.228052854537964, + "learning_rate": 2.0342683979203527e-06, + "loss": 0.3756, + "step": 13106 + }, + { + "epoch": 1.5950106480073014, + "grad_norm": 2.35479736328125, + "learning_rate": 2.0330892727691564e-06, + "loss": 0.363, + "step": 13107 + }, + { + "epoch": 1.5951323395193184, + "grad_norm": 2.6611642837524414, + "learning_rate": 2.031910450775402e-06, + "loss": 0.4209, + "step": 13108 + }, + { + "epoch": 1.5952540310313355, + "grad_norm": 1.578197956085205, + "learning_rate": 2.03073193198395e-06, + "loss": 0.3965, + "step": 13109 + }, + { + "epoch": 1.5953757225433525, + "grad_norm": 1.3807129859924316, + "learning_rate": 2.029553716439644e-06, + "loss": 0.3383, + "step": 13110 + }, + { + "epoch": 1.5954974140553695, + "grad_norm": 2.376840591430664, + "learning_rate": 2.028375804187317e-06, + "loss": 0.2854, + "step": 13111 + }, + { + "epoch": 1.5956191055673865, + "grad_norm": 2.8759920597076416, + "learning_rate": 2.027198195271791e-06, + "loss": 0.3942, + "step": 13112 + }, + { + "epoch": 1.5957407970794037, + "grad_norm": 2.0009405612945557, + "learning_rate": 2.026020889737873e-06, + "loss": 0.3133, + "step": 13113 + }, + { + "epoch": 1.5958624885914208, + "grad_norm": 2.055126667022705, + "learning_rate": 2.024843887630369e-06, + "loss": 0.3896, + "step": 13114 + }, + { + "epoch": 1.5959841801034378, + "grad_norm": 1.771736741065979, + "learning_rate": 2.023667188994063e-06, + "loss": 0.3711, + "step": 13115 + }, + { + "epoch": 1.5961058716154548, + "grad_norm": 2.2798218727111816, + "learning_rate": 2.022490793873727e-06, + "loss": 0.3171, + "step": 13116 + }, + { + "epoch": 1.5962275631274718, + "grad_norm": 1.9011422395706177, + "learning_rate": 2.021314702314132e-06, + "loss": 0.387, + "step": 13117 + }, + { + "epoch": 1.5963492546394888, + "grad_norm": 3.230950117111206, + "learning_rate": 2.020138914360027e-06, + "loss": 0.4211, + "step": 13118 + }, + { + "epoch": 1.596470946151506, + "grad_norm": 3.131316900253296, + "learning_rate": 2.0189634300561513e-06, + "loss": 0.3896, + "step": 13119 + }, + { + "epoch": 1.596592637663523, + "grad_norm": 2.0703797340393066, + "learning_rate": 2.01778824944724e-06, + "loss": 0.2909, + "step": 13120 + }, + { + "epoch": 1.5967143291755401, + "grad_norm": 2.714980125427246, + "learning_rate": 2.016613372578005e-06, + "loss": 0.4278, + "step": 13121 + }, + { + "epoch": 1.5968360206875571, + "grad_norm": 2.3640944957733154, + "learning_rate": 2.0154387994931613e-06, + "loss": 0.3698, + "step": 13122 + }, + { + "epoch": 1.5969577121995742, + "grad_norm": 1.8098775148391724, + "learning_rate": 2.014264530237395e-06, + "loss": 0.3759, + "step": 13123 + }, + { + "epoch": 1.5970794037115912, + "grad_norm": 3.8114538192749023, + "learning_rate": 2.0130905648553922e-06, + "loss": 0.4176, + "step": 13124 + }, + { + "epoch": 1.5972010952236082, + "grad_norm": 1.8549028635025024, + "learning_rate": 2.011916903391826e-06, + "loss": 0.3216, + "step": 13125 + }, + { + "epoch": 1.5973227867356252, + "grad_norm": 2.009453535079956, + "learning_rate": 2.0107435458913573e-06, + "loss": 0.4127, + "step": 13126 + }, + { + "epoch": 1.5974444782476422, + "grad_norm": 1.6705901622772217, + "learning_rate": 2.0095704923986305e-06, + "loss": 0.379, + "step": 13127 + }, + { + "epoch": 1.5975661697596593, + "grad_norm": 1.6024035215377808, + "learning_rate": 2.0083977429582892e-06, + "loss": 0.3469, + "step": 13128 + }, + { + "epoch": 1.5976878612716763, + "grad_norm": 1.6293784379959106, + "learning_rate": 2.0072252976149508e-06, + "loss": 0.3898, + "step": 13129 + }, + { + "epoch": 1.5978095527836933, + "grad_norm": 1.6171514987945557, + "learning_rate": 2.006053156413238e-06, + "loss": 0.3543, + "step": 13130 + }, + { + "epoch": 1.5979312442957103, + "grad_norm": 1.3208644390106201, + "learning_rate": 2.004881319397749e-06, + "loss": 0.3965, + "step": 13131 + }, + { + "epoch": 1.5980529358077273, + "grad_norm": 1.501453161239624, + "learning_rate": 2.003709786613073e-06, + "loss": 0.3518, + "step": 13132 + }, + { + "epoch": 1.5981746273197444, + "grad_norm": 1.4601036310195923, + "learning_rate": 2.0025385581037927e-06, + "loss": 0.3484, + "step": 13133 + }, + { + "epoch": 1.5982963188317614, + "grad_norm": 1.8425116539001465, + "learning_rate": 2.001367633914476e-06, + "loss": 0.3267, + "step": 13134 + }, + { + "epoch": 1.5984180103437784, + "grad_norm": 1.508164644241333, + "learning_rate": 2.000197014089673e-06, + "loss": 0.3286, + "step": 13135 + }, + { + "epoch": 1.5985397018557954, + "grad_norm": 1.4628454446792603, + "learning_rate": 1.999026698673935e-06, + "loss": 0.3948, + "step": 13136 + }, + { + "epoch": 1.5986613933678124, + "grad_norm": 2.1673059463500977, + "learning_rate": 1.997856687711789e-06, + "loss": 0.411, + "step": 13137 + }, + { + "epoch": 1.5987830848798297, + "grad_norm": 2.268766164779663, + "learning_rate": 1.9966869812477618e-06, + "loss": 0.3785, + "step": 13138 + }, + { + "epoch": 1.5989047763918467, + "grad_norm": 1.3966866731643677, + "learning_rate": 1.9955175793263628e-06, + "loss": 0.3734, + "step": 13139 + }, + { + "epoch": 1.5990264679038637, + "grad_norm": 1.3291488885879517, + "learning_rate": 1.9943484819920832e-06, + "loss": 0.3357, + "step": 13140 + }, + { + "epoch": 1.5991481594158807, + "grad_norm": 3.306013345718384, + "learning_rate": 1.9931796892894186e-06, + "loss": 0.2922, + "step": 13141 + }, + { + "epoch": 1.5992698509278978, + "grad_norm": 1.9285842180252075, + "learning_rate": 1.9920112012628403e-06, + "loss": 0.4253, + "step": 13142 + }, + { + "epoch": 1.5993915424399148, + "grad_norm": 1.5274900197982788, + "learning_rate": 1.990843017956808e-06, + "loss": 0.3867, + "step": 13143 + }, + { + "epoch": 1.599513233951932, + "grad_norm": 1.4659581184387207, + "learning_rate": 1.98967513941578e-06, + "loss": 0.3609, + "step": 13144 + }, + { + "epoch": 1.599634925463949, + "grad_norm": 2.233898878097534, + "learning_rate": 1.9885075656841933e-06, + "loss": 0.3883, + "step": 13145 + }, + { + "epoch": 1.599756616975966, + "grad_norm": 1.4824124574661255, + "learning_rate": 1.987340296806477e-06, + "loss": 0.3154, + "step": 13146 + }, + { + "epoch": 1.599878308487983, + "grad_norm": 2.120619297027588, + "learning_rate": 1.986173332827047e-06, + "loss": 0.3706, + "step": 13147 + }, + { + "epoch": 1.6, + "grad_norm": 1.8847951889038086, + "learning_rate": 1.985006673790307e-06, + "loss": 0.356, + "step": 13148 + }, + { + "epoch": 1.600121691512017, + "grad_norm": 1.7535452842712402, + "learning_rate": 1.983840319740655e-06, + "loss": 0.3707, + "step": 13149 + }, + { + "epoch": 1.6002433830240341, + "grad_norm": 1.942562222480774, + "learning_rate": 1.9826742707224735e-06, + "loss": 0.404, + "step": 13150 + }, + { + "epoch": 1.6003650745360511, + "grad_norm": 3.911184072494507, + "learning_rate": 1.981508526780127e-06, + "loss": 0.4187, + "step": 13151 + }, + { + "epoch": 1.6004867660480682, + "grad_norm": 1.8654357194900513, + "learning_rate": 1.980343087957981e-06, + "loss": 0.3659, + "step": 13152 + }, + { + "epoch": 1.6006084575600852, + "grad_norm": 2.3797109127044678, + "learning_rate": 1.97917795430038e-06, + "loss": 0.3535, + "step": 13153 + }, + { + "epoch": 1.6007301490721022, + "grad_norm": 2.240915298461914, + "learning_rate": 1.9780131258516578e-06, + "loss": 0.3888, + "step": 13154 + }, + { + "epoch": 1.6008518405841192, + "grad_norm": 1.3131917715072632, + "learning_rate": 1.976848602656144e-06, + "loss": 0.3645, + "step": 13155 + }, + { + "epoch": 1.6009735320961362, + "grad_norm": 1.5870047807693481, + "learning_rate": 1.9756843847581463e-06, + "loss": 0.3661, + "step": 13156 + }, + { + "epoch": 1.6010952236081533, + "grad_norm": 1.7342826128005981, + "learning_rate": 1.9745204722019683e-06, + "loss": 0.3082, + "step": 13157 + }, + { + "epoch": 1.6012169151201703, + "grad_norm": 1.8618814945220947, + "learning_rate": 1.973356865031898e-06, + "loss": 0.4428, + "step": 13158 + }, + { + "epoch": 1.6013386066321873, + "grad_norm": 1.919959545135498, + "learning_rate": 1.9721935632922107e-06, + "loss": 0.4016, + "step": 13159 + }, + { + "epoch": 1.6014602981442043, + "grad_norm": 1.9391942024230957, + "learning_rate": 1.9710305670271778e-06, + "loss": 0.3585, + "step": 13160 + }, + { + "epoch": 1.6015819896562213, + "grad_norm": 1.6574639081954956, + "learning_rate": 1.969867876281051e-06, + "loss": 0.3401, + "step": 13161 + }, + { + "epoch": 1.6017036811682384, + "grad_norm": 3.038248062133789, + "learning_rate": 1.9687054910980695e-06, + "loss": 0.299, + "step": 13162 + }, + { + "epoch": 1.6018253726802556, + "grad_norm": 1.6359108686447144, + "learning_rate": 1.9675434115224713e-06, + "loss": 0.4085, + "step": 13163 + }, + { + "epoch": 1.6019470641922726, + "grad_norm": 2.4834847450256348, + "learning_rate": 1.9663816375984725e-06, + "loss": 0.3036, + "step": 13164 + }, + { + "epoch": 1.6020687557042896, + "grad_norm": 3.4020845890045166, + "learning_rate": 1.9652201693702775e-06, + "loss": 0.3211, + "step": 13165 + }, + { + "epoch": 1.6021904472163067, + "grad_norm": 1.8100526332855225, + "learning_rate": 1.9640590068820896e-06, + "loss": 0.4085, + "step": 13166 + }, + { + "epoch": 1.6023121387283237, + "grad_norm": 1.6588760614395142, + "learning_rate": 1.9628981501780897e-06, + "loss": 0.3957, + "step": 13167 + }, + { + "epoch": 1.6024338302403407, + "grad_norm": 1.6608449220657349, + "learning_rate": 1.961737599302449e-06, + "loss": 0.3919, + "step": 13168 + }, + { + "epoch": 1.602555521752358, + "grad_norm": 2.2597062587738037, + "learning_rate": 1.9605773542993357e-06, + "loss": 0.3197, + "step": 13169 + }, + { + "epoch": 1.602677213264375, + "grad_norm": 2.0825250148773193, + "learning_rate": 1.95941741521289e-06, + "loss": 0.3723, + "step": 13170 + }, + { + "epoch": 1.602798904776392, + "grad_norm": 1.9156825542449951, + "learning_rate": 1.958257782087256e-06, + "loss": 0.314, + "step": 13171 + }, + { + "epoch": 1.602920596288409, + "grad_norm": 1.633423924446106, + "learning_rate": 1.9570984549665607e-06, + "loss": 0.3614, + "step": 13172 + }, + { + "epoch": 1.603042287800426, + "grad_norm": 1.5988203287124634, + "learning_rate": 1.955939433894913e-06, + "loss": 0.3831, + "step": 13173 + }, + { + "epoch": 1.603163979312443, + "grad_norm": 1.5619373321533203, + "learning_rate": 1.9547807189164236e-06, + "loss": 0.3824, + "step": 13174 + }, + { + "epoch": 1.60328567082446, + "grad_norm": 1.826727271080017, + "learning_rate": 1.9536223100751793e-06, + "loss": 0.4226, + "step": 13175 + }, + { + "epoch": 1.603407362336477, + "grad_norm": 1.3738070726394653, + "learning_rate": 1.95246420741526e-06, + "loss": 0.3401, + "step": 13176 + }, + { + "epoch": 1.603529053848494, + "grad_norm": 2.9618113040924072, + "learning_rate": 1.951306410980738e-06, + "loss": 0.3939, + "step": 13177 + }, + { + "epoch": 1.603650745360511, + "grad_norm": 1.620686411857605, + "learning_rate": 1.9501489208156654e-06, + "loss": 0.318, + "step": 13178 + }, + { + "epoch": 1.6037724368725281, + "grad_norm": 2.3203670978546143, + "learning_rate": 1.9489917369640865e-06, + "loss": 0.3824, + "step": 13179 + }, + { + "epoch": 1.6038941283845451, + "grad_norm": 1.9192984104156494, + "learning_rate": 1.9478348594700424e-06, + "loss": 0.334, + "step": 13180 + }, + { + "epoch": 1.6040158198965622, + "grad_norm": 1.9321638345718384, + "learning_rate": 1.9466782883775437e-06, + "loss": 0.4316, + "step": 13181 + }, + { + "epoch": 1.6041375114085792, + "grad_norm": 2.7354652881622314, + "learning_rate": 1.9455220237306085e-06, + "loss": 0.4319, + "step": 13182 + }, + { + "epoch": 1.6042592029205962, + "grad_norm": 3.1546716690063477, + "learning_rate": 1.9443660655732312e-06, + "loss": 0.4116, + "step": 13183 + }, + { + "epoch": 1.6043808944326132, + "grad_norm": 2.883622169494629, + "learning_rate": 1.943210413949398e-06, + "loss": 0.3548, + "step": 13184 + }, + { + "epoch": 1.6045025859446302, + "grad_norm": 2.8604094982147217, + "learning_rate": 1.942055068903087e-06, + "loss": 0.3278, + "step": 13185 + }, + { + "epoch": 1.6046242774566473, + "grad_norm": 2.840311288833618, + "learning_rate": 1.9409000304782588e-06, + "loss": 0.4517, + "step": 13186 + }, + { + "epoch": 1.6047459689686643, + "grad_norm": 1.6016236543655396, + "learning_rate": 1.9397452987188646e-06, + "loss": 0.3896, + "step": 13187 + }, + { + "epoch": 1.6048676604806815, + "grad_norm": 1.4885835647583008, + "learning_rate": 1.9385908736688475e-06, + "loss": 0.3517, + "step": 13188 + }, + { + "epoch": 1.6049893519926985, + "grad_norm": 2.3673877716064453, + "learning_rate": 1.937436755372132e-06, + "loss": 0.3725, + "step": 13189 + }, + { + "epoch": 1.6051110435047156, + "grad_norm": 1.6779899597167969, + "learning_rate": 1.9362829438726384e-06, + "loss": 0.3373, + "step": 13190 + }, + { + "epoch": 1.6052327350167326, + "grad_norm": 2.5539908409118652, + "learning_rate": 1.9351294392142706e-06, + "loss": 0.3235, + "step": 13191 + }, + { + "epoch": 1.6053544265287496, + "grad_norm": 3.1633963584899902, + "learning_rate": 1.93397624144092e-06, + "loss": 0.4091, + "step": 13192 + }, + { + "epoch": 1.6054761180407666, + "grad_norm": 1.5882463455200195, + "learning_rate": 1.93282335059647e-06, + "loss": 0.345, + "step": 13193 + }, + { + "epoch": 1.6055978095527836, + "grad_norm": 1.4868581295013428, + "learning_rate": 1.9316707667247893e-06, + "loss": 0.3462, + "step": 13194 + }, + { + "epoch": 1.6057195010648009, + "grad_norm": 2.319028615951538, + "learning_rate": 1.930518489869734e-06, + "loss": 0.421, + "step": 13195 + }, + { + "epoch": 1.605841192576818, + "grad_norm": 1.7415883541107178, + "learning_rate": 1.9293665200751545e-06, + "loss": 0.4073, + "step": 13196 + }, + { + "epoch": 1.605962884088835, + "grad_norm": 1.8426426649093628, + "learning_rate": 1.9282148573848825e-06, + "loss": 0.4099, + "step": 13197 + }, + { + "epoch": 1.606084575600852, + "grad_norm": 1.710612416267395, + "learning_rate": 1.9270635018427463e-06, + "loss": 0.3859, + "step": 13198 + }, + { + "epoch": 1.606206267112869, + "grad_norm": 2.2894039154052734, + "learning_rate": 1.9259124534925533e-06, + "loss": 0.3537, + "step": 13199 + }, + { + "epoch": 1.606327958624886, + "grad_norm": 2.2786309719085693, + "learning_rate": 1.9247617123781005e-06, + "loss": 0.3482, + "step": 13200 + }, + { + "epoch": 1.606449650136903, + "grad_norm": 1.9055657386779785, + "learning_rate": 1.9236112785431825e-06, + "loss": 0.3787, + "step": 13201 + }, + { + "epoch": 1.60657134164892, + "grad_norm": 1.6727559566497803, + "learning_rate": 1.9224611520315726e-06, + "loss": 0.355, + "step": 13202 + }, + { + "epoch": 1.606693033160937, + "grad_norm": 1.7929413318634033, + "learning_rate": 1.921311332887036e-06, + "loss": 0.3634, + "step": 13203 + }, + { + "epoch": 1.606814724672954, + "grad_norm": 1.9333524703979492, + "learning_rate": 1.9201618211533246e-06, + "loss": 0.3852, + "step": 13204 + }, + { + "epoch": 1.606936416184971, + "grad_norm": 1.8331294059753418, + "learning_rate": 1.9190126168741776e-06, + "loss": 0.4045, + "step": 13205 + }, + { + "epoch": 1.607058107696988, + "grad_norm": 2.7449755668640137, + "learning_rate": 1.9178637200933303e-06, + "loss": 0.3119, + "step": 13206 + }, + { + "epoch": 1.6071797992090051, + "grad_norm": 1.8609424829483032, + "learning_rate": 1.9167151308544973e-06, + "loss": 0.3977, + "step": 13207 + }, + { + "epoch": 1.6073014907210221, + "grad_norm": 1.6123592853546143, + "learning_rate": 1.9155668492013823e-06, + "loss": 0.374, + "step": 13208 + }, + { + "epoch": 1.6074231822330391, + "grad_norm": 2.7192881107330322, + "learning_rate": 1.914418875177685e-06, + "loss": 0.3939, + "step": 13209 + }, + { + "epoch": 1.6075448737450562, + "grad_norm": 2.2225780487060547, + "learning_rate": 1.9132712088270854e-06, + "loss": 0.3548, + "step": 13210 + }, + { + "epoch": 1.6076665652570732, + "grad_norm": 4.248149394989014, + "learning_rate": 1.9121238501932537e-06, + "loss": 0.4465, + "step": 13211 + }, + { + "epoch": 1.6077882567690902, + "grad_norm": 1.9062058925628662, + "learning_rate": 1.9109767993198513e-06, + "loss": 0.3828, + "step": 13212 + }, + { + "epoch": 1.6079099482811072, + "grad_norm": 2.6968767642974854, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.4181, + "step": 13213 + }, + { + "epoch": 1.6080316397931245, + "grad_norm": 1.5170238018035889, + "learning_rate": 1.9086836210289107e-06, + "loss": 0.3497, + "step": 13214 + }, + { + "epoch": 1.6081533313051415, + "grad_norm": 2.2596681118011475, + "learning_rate": 1.907537493698637e-06, + "loss": 0.3901, + "step": 13215 + }, + { + "epoch": 1.6082750228171585, + "grad_norm": 2.2486374378204346, + "learning_rate": 1.9063916743033061e-06, + "loss": 0.3655, + "step": 13216 + }, + { + "epoch": 1.6083967143291755, + "grad_norm": 1.4816728830337524, + "learning_rate": 1.905246162886527e-06, + "loss": 0.364, + "step": 13217 + }, + { + "epoch": 1.6085184058411925, + "grad_norm": 1.6641440391540527, + "learning_rate": 1.904100959491888e-06, + "loss": 0.3946, + "step": 13218 + }, + { + "epoch": 1.6086400973532096, + "grad_norm": 1.7364550828933716, + "learning_rate": 1.9029560641629618e-06, + "loss": 0.4108, + "step": 13219 + }, + { + "epoch": 1.6087617888652268, + "grad_norm": 1.4976606369018555, + "learning_rate": 1.9018114769433193e-06, + "loss": 0.3788, + "step": 13220 + }, + { + "epoch": 1.6088834803772438, + "grad_norm": 1.3347288370132446, + "learning_rate": 1.9006671978765135e-06, + "loss": 0.365, + "step": 13221 + }, + { + "epoch": 1.6090051718892608, + "grad_norm": 1.9878966808319092, + "learning_rate": 1.8995232270060826e-06, + "loss": 0.3747, + "step": 13222 + }, + { + "epoch": 1.6091268634012779, + "grad_norm": 2.4338021278381348, + "learning_rate": 1.898379564375562e-06, + "loss": 0.3688, + "step": 13223 + }, + { + "epoch": 1.6092485549132949, + "grad_norm": 1.8237535953521729, + "learning_rate": 1.897236210028469e-06, + "loss": 0.3574, + "step": 13224 + }, + { + "epoch": 1.609370246425312, + "grad_norm": 2.5246663093566895, + "learning_rate": 1.896093164008307e-06, + "loss": 0.3475, + "step": 13225 + }, + { + "epoch": 1.609491937937329, + "grad_norm": 2.1172122955322266, + "learning_rate": 1.8949504263585804e-06, + "loss": 0.3452, + "step": 13226 + }, + { + "epoch": 1.609613629449346, + "grad_norm": 1.6885406970977783, + "learning_rate": 1.8938079971227608e-06, + "loss": 0.3291, + "step": 13227 + }, + { + "epoch": 1.609735320961363, + "grad_norm": 2.7502851486206055, + "learning_rate": 1.8926658763443284e-06, + "loss": 0.3592, + "step": 13228 + }, + { + "epoch": 1.60985701247338, + "grad_norm": 2.3753843307495117, + "learning_rate": 1.89152406406674e-06, + "loss": 0.3481, + "step": 13229 + }, + { + "epoch": 1.609978703985397, + "grad_norm": 1.8891501426696777, + "learning_rate": 1.8903825603334426e-06, + "loss": 0.4036, + "step": 13230 + }, + { + "epoch": 1.610100395497414, + "grad_norm": 2.0609076023101807, + "learning_rate": 1.889241365187877e-06, + "loss": 0.3969, + "step": 13231 + }, + { + "epoch": 1.610222087009431, + "grad_norm": 2.677816390991211, + "learning_rate": 1.8881004786734668e-06, + "loss": 0.3087, + "step": 13232 + }, + { + "epoch": 1.610343778521448, + "grad_norm": 1.7660808563232422, + "learning_rate": 1.8869599008336203e-06, + "loss": 0.332, + "step": 13233 + }, + { + "epoch": 1.610465470033465, + "grad_norm": 1.5322084426879883, + "learning_rate": 1.885819631711746e-06, + "loss": 0.3374, + "step": 13234 + }, + { + "epoch": 1.610587161545482, + "grad_norm": 2.8294854164123535, + "learning_rate": 1.8846796713512305e-06, + "loss": 0.4058, + "step": 13235 + }, + { + "epoch": 1.6107088530574991, + "grad_norm": 1.9636582136154175, + "learning_rate": 1.8835400197954479e-06, + "loss": 0.3704, + "step": 13236 + }, + { + "epoch": 1.6108305445695161, + "grad_norm": 1.8096963167190552, + "learning_rate": 1.8824006770877712e-06, + "loss": 0.3478, + "step": 13237 + }, + { + "epoch": 1.6109522360815332, + "grad_norm": 1.3713093996047974, + "learning_rate": 1.8812616432715503e-06, + "loss": 0.3484, + "step": 13238 + }, + { + "epoch": 1.6110739275935504, + "grad_norm": 4.010976314544678, + "learning_rate": 1.8801229183901293e-06, + "loss": 0.4626, + "step": 13239 + }, + { + "epoch": 1.6111956191055674, + "grad_norm": 1.7447024583816528, + "learning_rate": 1.8789845024868381e-06, + "loss": 0.3495, + "step": 13240 + }, + { + "epoch": 1.6113173106175844, + "grad_norm": 1.570378065109253, + "learning_rate": 1.877846395604993e-06, + "loss": 0.4024, + "step": 13241 + }, + { + "epoch": 1.6114390021296014, + "grad_norm": 1.969346523284912, + "learning_rate": 1.8767085977879085e-06, + "loss": 0.3739, + "step": 13242 + }, + { + "epoch": 1.6115606936416185, + "grad_norm": 2.5684289932250977, + "learning_rate": 1.8755711090788753e-06, + "loss": 0.4212, + "step": 13243 + }, + { + "epoch": 1.6116823851536355, + "grad_norm": 1.56687593460083, + "learning_rate": 1.8744339295211755e-06, + "loss": 0.3667, + "step": 13244 + }, + { + "epoch": 1.6118040766656527, + "grad_norm": 2.4005539417266846, + "learning_rate": 1.8732970591580857e-06, + "loss": 0.3319, + "step": 13245 + }, + { + "epoch": 1.6119257681776697, + "grad_norm": 2.3146398067474365, + "learning_rate": 1.8721604980328656e-06, + "loss": 0.3721, + "step": 13246 + }, + { + "epoch": 1.6120474596896868, + "grad_norm": 1.469888687133789, + "learning_rate": 1.8710242461887584e-06, + "loss": 0.3417, + "step": 13247 + }, + { + "epoch": 1.6121691512017038, + "grad_norm": 1.7390164136886597, + "learning_rate": 1.8698883036690075e-06, + "loss": 0.3542, + "step": 13248 + }, + { + "epoch": 1.6122908427137208, + "grad_norm": 1.7231286764144897, + "learning_rate": 1.8687526705168356e-06, + "loss": 0.3592, + "step": 13249 + }, + { + "epoch": 1.6124125342257378, + "grad_norm": 1.5603148937225342, + "learning_rate": 1.8676173467754544e-06, + "loss": 0.3409, + "step": 13250 + }, + { + "epoch": 1.6125342257377548, + "grad_norm": 2.4028549194335938, + "learning_rate": 1.8664823324880677e-06, + "loss": 0.3115, + "step": 13251 + }, + { + "epoch": 1.6126559172497719, + "grad_norm": 1.8136959075927734, + "learning_rate": 1.8653476276978599e-06, + "loss": 0.3528, + "step": 13252 + }, + { + "epoch": 1.6127776087617889, + "grad_norm": 1.9303573369979858, + "learning_rate": 1.8642132324480156e-06, + "loss": 0.361, + "step": 13253 + }, + { + "epoch": 1.612899300273806, + "grad_norm": 1.7942339181900024, + "learning_rate": 1.8630791467816979e-06, + "loss": 0.379, + "step": 13254 + }, + { + "epoch": 1.613020991785823, + "grad_norm": 1.3791202306747437, + "learning_rate": 1.8619453707420586e-06, + "loss": 0.3402, + "step": 13255 + }, + { + "epoch": 1.61314268329784, + "grad_norm": 1.652978777885437, + "learning_rate": 1.8608119043722462e-06, + "loss": 0.3817, + "step": 13256 + }, + { + "epoch": 1.613264374809857, + "grad_norm": 2.38712739944458, + "learning_rate": 1.8596787477153844e-06, + "loss": 0.3303, + "step": 13257 + }, + { + "epoch": 1.613386066321874, + "grad_norm": 1.327331781387329, + "learning_rate": 1.858545900814599e-06, + "loss": 0.3295, + "step": 13258 + }, + { + "epoch": 1.613507757833891, + "grad_norm": 2.4914231300354004, + "learning_rate": 1.8574133637129932e-06, + "loss": 0.4301, + "step": 13259 + }, + { + "epoch": 1.613629449345908, + "grad_norm": 2.065092086791992, + "learning_rate": 1.8562811364536614e-06, + "loss": 0.3564, + "step": 13260 + }, + { + "epoch": 1.613751140857925, + "grad_norm": 1.8509316444396973, + "learning_rate": 1.855149219079694e-06, + "loss": 0.344, + "step": 13261 + }, + { + "epoch": 1.613872832369942, + "grad_norm": 2.01120924949646, + "learning_rate": 1.8540176116341547e-06, + "loss": 0.3999, + "step": 13262 + }, + { + "epoch": 1.613994523881959, + "grad_norm": 2.0076661109924316, + "learning_rate": 1.8528863141601038e-06, + "loss": 0.3909, + "step": 13263 + }, + { + "epoch": 1.6141162153939763, + "grad_norm": 2.4938056468963623, + "learning_rate": 1.851755326700595e-06, + "loss": 0.3445, + "step": 13264 + }, + { + "epoch": 1.6142379069059933, + "grad_norm": 1.689375638961792, + "learning_rate": 1.850624649298659e-06, + "loss": 0.4045, + "step": 13265 + }, + { + "epoch": 1.6143595984180104, + "grad_norm": 1.6277165412902832, + "learning_rate": 1.8494942819973272e-06, + "loss": 0.3972, + "step": 13266 + }, + { + "epoch": 1.6144812899300274, + "grad_norm": 2.8080849647521973, + "learning_rate": 1.8483642248396072e-06, + "loss": 0.4208, + "step": 13267 + }, + { + "epoch": 1.6146029814420444, + "grad_norm": 1.7079126834869385, + "learning_rate": 1.8472344778685002e-06, + "loss": 0.3858, + "step": 13268 + }, + { + "epoch": 1.6147246729540614, + "grad_norm": 2.390308380126953, + "learning_rate": 1.8461050411269975e-06, + "loss": 0.4407, + "step": 13269 + }, + { + "epoch": 1.6148463644660787, + "grad_norm": 3.835757255554199, + "learning_rate": 1.844975914658077e-06, + "loss": 0.3367, + "step": 13270 + }, + { + "epoch": 1.6149680559780957, + "grad_norm": 1.788669466972351, + "learning_rate": 1.8438470985046997e-06, + "loss": 0.3264, + "step": 13271 + }, + { + "epoch": 1.6150897474901127, + "grad_norm": 2.648322582244873, + "learning_rate": 1.842718592709829e-06, + "loss": 0.4316, + "step": 13272 + }, + { + "epoch": 1.6152114390021297, + "grad_norm": 1.4995954036712646, + "learning_rate": 1.841590397316394e-06, + "loss": 0.3349, + "step": 13273 + }, + { + "epoch": 1.6153331305141467, + "grad_norm": 1.5123441219329834, + "learning_rate": 1.8404625123673336e-06, + "loss": 0.3521, + "step": 13274 + }, + { + "epoch": 1.6154548220261638, + "grad_norm": 1.4551340341567993, + "learning_rate": 1.8393349379055647e-06, + "loss": 0.3822, + "step": 13275 + }, + { + "epoch": 1.6155765135381808, + "grad_norm": 1.5746889114379883, + "learning_rate": 1.8382076739739907e-06, + "loss": 0.3589, + "step": 13276 + }, + { + "epoch": 1.6156982050501978, + "grad_norm": 2.464028835296631, + "learning_rate": 1.8370807206155106e-06, + "loss": 0.3583, + "step": 13277 + }, + { + "epoch": 1.6158198965622148, + "grad_norm": 2.8867733478546143, + "learning_rate": 1.8359540778730066e-06, + "loss": 0.3368, + "step": 13278 + }, + { + "epoch": 1.6159415880742318, + "grad_norm": 2.114790201187134, + "learning_rate": 1.8348277457893449e-06, + "loss": 0.3481, + "step": 13279 + }, + { + "epoch": 1.6160632795862488, + "grad_norm": 1.7462207078933716, + "learning_rate": 1.8337017244073907e-06, + "loss": 0.392, + "step": 13280 + }, + { + "epoch": 1.6161849710982659, + "grad_norm": 1.5940678119659424, + "learning_rate": 1.83257601376999e-06, + "loss": 0.3301, + "step": 13281 + }, + { + "epoch": 1.6163066626102829, + "grad_norm": 1.5645939111709595, + "learning_rate": 1.831450613919975e-06, + "loss": 0.3141, + "step": 13282 + }, + { + "epoch": 1.6164283541223, + "grad_norm": 1.8331551551818848, + "learning_rate": 1.830325524900175e-06, + "loss": 0.3663, + "step": 13283 + }, + { + "epoch": 1.616550045634317, + "grad_norm": 2.1362781524658203, + "learning_rate": 1.8292007467533978e-06, + "loss": 0.4157, + "step": 13284 + }, + { + "epoch": 1.616671737146334, + "grad_norm": 1.87754487991333, + "learning_rate": 1.828076279522446e-06, + "loss": 0.3922, + "step": 13285 + }, + { + "epoch": 1.616793428658351, + "grad_norm": 1.4716932773590088, + "learning_rate": 1.8269521232501065e-06, + "loss": 0.3364, + "step": 13286 + }, + { + "epoch": 1.616915120170368, + "grad_norm": 2.4856133460998535, + "learning_rate": 1.8258282779791524e-06, + "loss": 0.4281, + "step": 13287 + }, + { + "epoch": 1.617036811682385, + "grad_norm": 1.9493603706359863, + "learning_rate": 1.8247047437523557e-06, + "loss": 0.3567, + "step": 13288 + }, + { + "epoch": 1.6171585031944022, + "grad_norm": 3.3769569396972656, + "learning_rate": 1.8235815206124653e-06, + "loss": 0.4216, + "step": 13289 + }, + { + "epoch": 1.6172801947064193, + "grad_norm": 1.5918591022491455, + "learning_rate": 1.822458608602219e-06, + "loss": 0.3517, + "step": 13290 + }, + { + "epoch": 1.6174018862184363, + "grad_norm": 2.097656011581421, + "learning_rate": 1.8213360077643527e-06, + "loss": 0.3257, + "step": 13291 + }, + { + "epoch": 1.6175235777304533, + "grad_norm": 1.5599029064178467, + "learning_rate": 1.8202137181415802e-06, + "loss": 0.3685, + "step": 13292 + }, + { + "epoch": 1.6176452692424703, + "grad_norm": 2.296247959136963, + "learning_rate": 1.819091739776604e-06, + "loss": 0.4051, + "step": 13293 + }, + { + "epoch": 1.6177669607544873, + "grad_norm": 1.7134901285171509, + "learning_rate": 1.817970072712123e-06, + "loss": 0.3444, + "step": 13294 + }, + { + "epoch": 1.6178886522665044, + "grad_norm": 1.8877848386764526, + "learning_rate": 1.8168487169908166e-06, + "loss": 0.3817, + "step": 13295 + }, + { + "epoch": 1.6180103437785216, + "grad_norm": 2.377495527267456, + "learning_rate": 1.8157276726553552e-06, + "loss": 0.3206, + "step": 13296 + }, + { + "epoch": 1.6181320352905386, + "grad_norm": 1.5760836601257324, + "learning_rate": 1.8146069397483956e-06, + "loss": 0.356, + "step": 13297 + }, + { + "epoch": 1.6182537268025556, + "grad_norm": 2.420151710510254, + "learning_rate": 1.8134865183125828e-06, + "loss": 0.3043, + "step": 13298 + }, + { + "epoch": 1.6183754183145727, + "grad_norm": 1.7536336183547974, + "learning_rate": 1.8123664083905556e-06, + "loss": 0.3705, + "step": 13299 + }, + { + "epoch": 1.6184971098265897, + "grad_norm": 1.9137368202209473, + "learning_rate": 1.8112466100249337e-06, + "loss": 0.3887, + "step": 13300 + }, + { + "epoch": 1.6186188013386067, + "grad_norm": 2.0884597301483154, + "learning_rate": 1.8101271232583263e-06, + "loss": 0.3329, + "step": 13301 + }, + { + "epoch": 1.6187404928506237, + "grad_norm": 2.6297354698181152, + "learning_rate": 1.8090079481333357e-06, + "loss": 0.395, + "step": 13302 + }, + { + "epoch": 1.6188621843626407, + "grad_norm": 2.0360443592071533, + "learning_rate": 1.8078890846925478e-06, + "loss": 0.3607, + "step": 13303 + }, + { + "epoch": 1.6189838758746578, + "grad_norm": 3.129077672958374, + "learning_rate": 1.8067705329785334e-06, + "loss": 0.3456, + "step": 13304 + }, + { + "epoch": 1.6191055673866748, + "grad_norm": 1.5659916400909424, + "learning_rate": 1.8056522930338627e-06, + "loss": 0.3676, + "step": 13305 + }, + { + "epoch": 1.6192272588986918, + "grad_norm": 1.5475733280181885, + "learning_rate": 1.8045343649010838e-06, + "loss": 0.3443, + "step": 13306 + }, + { + "epoch": 1.6193489504107088, + "grad_norm": 3.415072202682495, + "learning_rate": 1.803416748622736e-06, + "loss": 0.4651, + "step": 13307 + }, + { + "epoch": 1.6194706419227258, + "grad_norm": 1.8677818775177002, + "learning_rate": 1.8022994442413466e-06, + "loss": 0.4125, + "step": 13308 + }, + { + "epoch": 1.6195923334347428, + "grad_norm": 1.4603604078292847, + "learning_rate": 1.801182451799428e-06, + "loss": 0.3511, + "step": 13309 + }, + { + "epoch": 1.6197140249467599, + "grad_norm": 1.7580389976501465, + "learning_rate": 1.800065771339492e-06, + "loss": 0.3603, + "step": 13310 + }, + { + "epoch": 1.6198357164587769, + "grad_norm": 2.6074488162994385, + "learning_rate": 1.7989494029040255e-06, + "loss": 0.4099, + "step": 13311 + }, + { + "epoch": 1.619957407970794, + "grad_norm": 1.353723406791687, + "learning_rate": 1.797833346535507e-06, + "loss": 0.3365, + "step": 13312 + }, + { + "epoch": 1.620079099482811, + "grad_norm": 1.5630590915679932, + "learning_rate": 1.79671760227641e-06, + "loss": 0.2888, + "step": 13313 + }, + { + "epoch": 1.6202007909948282, + "grad_norm": 2.4240448474884033, + "learning_rate": 1.7956021701691873e-06, + "loss": 0.4259, + "step": 13314 + }, + { + "epoch": 1.6203224825068452, + "grad_norm": 3.40415620803833, + "learning_rate": 1.7944870502562827e-06, + "loss": 0.4158, + "step": 13315 + }, + { + "epoch": 1.6204441740188622, + "grad_norm": 1.6393496990203857, + "learning_rate": 1.7933722425801326e-06, + "loss": 0.3731, + "step": 13316 + }, + { + "epoch": 1.6205658655308792, + "grad_norm": 3.0124433040618896, + "learning_rate": 1.7922577471831526e-06, + "loss": 0.4152, + "step": 13317 + }, + { + "epoch": 1.6206875570428962, + "grad_norm": 1.7624109983444214, + "learning_rate": 1.791143564107759e-06, + "loss": 0.3863, + "step": 13318 + }, + { + "epoch": 1.6208092485549133, + "grad_norm": 1.688957691192627, + "learning_rate": 1.7900296933963424e-06, + "loss": 0.3945, + "step": 13319 + }, + { + "epoch": 1.6209309400669303, + "grad_norm": 2.223710536956787, + "learning_rate": 1.7889161350912876e-06, + "loss": 0.4087, + "step": 13320 + }, + { + "epoch": 1.6210526315789475, + "grad_norm": 4.020733833312988, + "learning_rate": 1.7878028892349719e-06, + "loss": 0.2963, + "step": 13321 + }, + { + "epoch": 1.6211743230909645, + "grad_norm": 1.4559962749481201, + "learning_rate": 1.7866899558697549e-06, + "loss": 0.3353, + "step": 13322 + }, + { + "epoch": 1.6212960146029816, + "grad_norm": 3.1958560943603516, + "learning_rate": 1.7855773350379824e-06, + "loss": 0.3114, + "step": 13323 + }, + { + "epoch": 1.6214177061149986, + "grad_norm": 1.6606889963150024, + "learning_rate": 1.7844650267819973e-06, + "loss": 0.3654, + "step": 13324 + }, + { + "epoch": 1.6215393976270156, + "grad_norm": 1.6357158422470093, + "learning_rate": 1.7833530311441215e-06, + "loss": 0.3343, + "step": 13325 + }, + { + "epoch": 1.6216610891390326, + "grad_norm": 2.2711021900177, + "learning_rate": 1.7822413481666733e-06, + "loss": 0.4024, + "step": 13326 + }, + { + "epoch": 1.6217827806510496, + "grad_norm": 4.657652378082275, + "learning_rate": 1.781129977891951e-06, + "loss": 0.4325, + "step": 13327 + }, + { + "epoch": 1.6219044721630667, + "grad_norm": 1.7236744165420532, + "learning_rate": 1.780018920362243e-06, + "loss": 0.3315, + "step": 13328 + }, + { + "epoch": 1.6220261636750837, + "grad_norm": 2.7690746784210205, + "learning_rate": 1.7789081756198324e-06, + "loss": 0.3449, + "step": 13329 + }, + { + "epoch": 1.6221478551871007, + "grad_norm": 1.634344220161438, + "learning_rate": 1.7777977437069838e-06, + "loss": 0.3698, + "step": 13330 + }, + { + "epoch": 1.6222695466991177, + "grad_norm": 1.9576445817947388, + "learning_rate": 1.7766876246659458e-06, + "loss": 0.3291, + "step": 13331 + }, + { + "epoch": 1.6223912382111347, + "grad_norm": 1.6200907230377197, + "learning_rate": 1.775577818538967e-06, + "loss": 0.3436, + "step": 13332 + }, + { + "epoch": 1.6225129297231518, + "grad_norm": 2.087721586227417, + "learning_rate": 1.7744683253682737e-06, + "loss": 0.3358, + "step": 13333 + }, + { + "epoch": 1.6226346212351688, + "grad_norm": 1.6513155698776245, + "learning_rate": 1.7733591451960896e-06, + "loss": 0.3475, + "step": 13334 + }, + { + "epoch": 1.6227563127471858, + "grad_norm": 2.0104594230651855, + "learning_rate": 1.7722502780646178e-06, + "loss": 0.3424, + "step": 13335 + }, + { + "epoch": 1.6228780042592028, + "grad_norm": 1.5387104749679565, + "learning_rate": 1.77114172401605e-06, + "loss": 0.3474, + "step": 13336 + }, + { + "epoch": 1.6229996957712198, + "grad_norm": 1.7255566120147705, + "learning_rate": 1.7700334830925758e-06, + "loss": 0.3607, + "step": 13337 + }, + { + "epoch": 1.6231213872832368, + "grad_norm": 2.8821768760681152, + "learning_rate": 1.7689255553363627e-06, + "loss": 0.4013, + "step": 13338 + }, + { + "epoch": 1.6232430787952539, + "grad_norm": 1.585805058479309, + "learning_rate": 1.7678179407895667e-06, + "loss": 0.3556, + "step": 13339 + }, + { + "epoch": 1.623364770307271, + "grad_norm": 1.6983035802841187, + "learning_rate": 1.7667106394943413e-06, + "loss": 0.3919, + "step": 13340 + }, + { + "epoch": 1.6234864618192881, + "grad_norm": 2.624363422393799, + "learning_rate": 1.7656036514928166e-06, + "loss": 0.3028, + "step": 13341 + }, + { + "epoch": 1.6236081533313051, + "grad_norm": 1.7962149381637573, + "learning_rate": 1.764496976827118e-06, + "loss": 0.3503, + "step": 13342 + }, + { + "epoch": 1.6237298448433222, + "grad_norm": 1.9360661506652832, + "learning_rate": 1.7633906155393566e-06, + "loss": 0.3293, + "step": 13343 + }, + { + "epoch": 1.6238515363553392, + "grad_norm": 2.4604246616363525, + "learning_rate": 1.7622845676716271e-06, + "loss": 0.3854, + "step": 13344 + }, + { + "epoch": 1.6239732278673562, + "grad_norm": 1.814049243927002, + "learning_rate": 1.761178833266024e-06, + "loss": 0.3753, + "step": 13345 + }, + { + "epoch": 1.6240949193793734, + "grad_norm": 2.614799737930298, + "learning_rate": 1.7600734123646202e-06, + "loss": 0.4417, + "step": 13346 + }, + { + "epoch": 1.6242166108913905, + "grad_norm": 3.1902172565460205, + "learning_rate": 1.7589683050094763e-06, + "loss": 0.4114, + "step": 13347 + }, + { + "epoch": 1.6243383024034075, + "grad_norm": 1.580715298652649, + "learning_rate": 1.757863511242649e-06, + "loss": 0.3327, + "step": 13348 + }, + { + "epoch": 1.6244599939154245, + "grad_norm": 1.7152653932571411, + "learning_rate": 1.7567590311061743e-06, + "loss": 0.3681, + "step": 13349 + }, + { + "epoch": 1.6245816854274415, + "grad_norm": 1.7957149744033813, + "learning_rate": 1.755654864642079e-06, + "loss": 0.3784, + "step": 13350 + }, + { + "epoch": 1.6247033769394585, + "grad_norm": 1.749819278717041, + "learning_rate": 1.7545510118923847e-06, + "loss": 0.3579, + "step": 13351 + }, + { + "epoch": 1.6248250684514756, + "grad_norm": 1.7374850511550903, + "learning_rate": 1.7534474728990902e-06, + "loss": 0.3683, + "step": 13352 + }, + { + "epoch": 1.6249467599634926, + "grad_norm": 1.8143401145935059, + "learning_rate": 1.7523442477041885e-06, + "loss": 0.386, + "step": 13353 + }, + { + "epoch": 1.6250684514755096, + "grad_norm": 1.9732637405395508, + "learning_rate": 1.7512413363496606e-06, + "loss": 0.3592, + "step": 13354 + }, + { + "epoch": 1.6251901429875266, + "grad_norm": 2.349351167678833, + "learning_rate": 1.7501387388774716e-06, + "loss": 0.3869, + "step": 13355 + }, + { + "epoch": 1.6253118344995436, + "grad_norm": 2.04888653755188, + "learning_rate": 1.7490364553295825e-06, + "loss": 0.3378, + "step": 13356 + }, + { + "epoch": 1.6254335260115607, + "grad_norm": 1.7062122821807861, + "learning_rate": 1.7479344857479342e-06, + "loss": 0.3666, + "step": 13357 + }, + { + "epoch": 1.6255552175235777, + "grad_norm": 1.9820282459259033, + "learning_rate": 1.7468328301744564e-06, + "loss": 0.3677, + "step": 13358 + }, + { + "epoch": 1.6256769090355947, + "grad_norm": 2.2650251388549805, + "learning_rate": 1.7457314886510756e-06, + "loss": 0.3629, + "step": 13359 + }, + { + "epoch": 1.6257986005476117, + "grad_norm": 1.5225366353988647, + "learning_rate": 1.7446304612196973e-06, + "loss": 0.396, + "step": 13360 + }, + { + "epoch": 1.6259202920596287, + "grad_norm": 1.715078592300415, + "learning_rate": 1.7435297479222157e-06, + "loss": 0.3218, + "step": 13361 + }, + { + "epoch": 1.6260419835716458, + "grad_norm": 2.031444787979126, + "learning_rate": 1.7424293488005196e-06, + "loss": 0.3531, + "step": 13362 + }, + { + "epoch": 1.6261636750836628, + "grad_norm": 1.4962328672409058, + "learning_rate": 1.7413292638964773e-06, + "loss": 0.3546, + "step": 13363 + }, + { + "epoch": 1.6262853665956798, + "grad_norm": 2.2736563682556152, + "learning_rate": 1.7402294932519525e-06, + "loss": 0.4149, + "step": 13364 + }, + { + "epoch": 1.626407058107697, + "grad_norm": 2.057826042175293, + "learning_rate": 1.7391300369087928e-06, + "loss": 0.3686, + "step": 13365 + }, + { + "epoch": 1.626528749619714, + "grad_norm": 4.780618667602539, + "learning_rate": 1.73803089490883e-06, + "loss": 0.4655, + "step": 13366 + }, + { + "epoch": 1.626650441131731, + "grad_norm": 1.945212721824646, + "learning_rate": 1.736932067293896e-06, + "loss": 0.413, + "step": 13367 + }, + { + "epoch": 1.626772132643748, + "grad_norm": 2.0224502086639404, + "learning_rate": 1.7358335541058013e-06, + "loss": 0.3933, + "step": 13368 + }, + { + "epoch": 1.6268938241557651, + "grad_norm": 2.113107919692993, + "learning_rate": 1.7347353553863433e-06, + "loss": 0.3167, + "step": 13369 + }, + { + "epoch": 1.6270155156677821, + "grad_norm": 1.8089020252227783, + "learning_rate": 1.7336374711773152e-06, + "loss": 0.3683, + "step": 13370 + }, + { + "epoch": 1.6271372071797994, + "grad_norm": 2.2153992652893066, + "learning_rate": 1.7325399015204913e-06, + "loss": 0.3509, + "step": 13371 + }, + { + "epoch": 1.6272588986918164, + "grad_norm": 1.6916420459747314, + "learning_rate": 1.7314426464576351e-06, + "loss": 0.4115, + "step": 13372 + }, + { + "epoch": 1.6273805902038334, + "grad_norm": 2.248929023742676, + "learning_rate": 1.730345706030503e-06, + "loss": 0.3691, + "step": 13373 + }, + { + "epoch": 1.6275022817158504, + "grad_norm": 2.0479023456573486, + "learning_rate": 1.7292490802808349e-06, + "loss": 0.3565, + "step": 13374 + }, + { + "epoch": 1.6276239732278674, + "grad_norm": 2.097653388977051, + "learning_rate": 1.7281527692503553e-06, + "loss": 0.4014, + "step": 13375 + }, + { + "epoch": 1.6277456647398845, + "grad_norm": 1.6671005487442017, + "learning_rate": 1.7270567729807897e-06, + "loss": 0.3532, + "step": 13376 + }, + { + "epoch": 1.6278673562519015, + "grad_norm": 1.3938034772872925, + "learning_rate": 1.7259610915138336e-06, + "loss": 0.3799, + "step": 13377 + }, + { + "epoch": 1.6279890477639185, + "grad_norm": 1.6788058280944824, + "learning_rate": 1.724865724891187e-06, + "loss": 0.3988, + "step": 13378 + }, + { + "epoch": 1.6281107392759355, + "grad_norm": 3.041322708129883, + "learning_rate": 1.7237706731545278e-06, + "loss": 0.2951, + "step": 13379 + }, + { + "epoch": 1.6282324307879525, + "grad_norm": 2.0745925903320312, + "learning_rate": 1.7226759363455225e-06, + "loss": 0.337, + "step": 13380 + }, + { + "epoch": 1.6283541222999696, + "grad_norm": 2.732539176940918, + "learning_rate": 1.7215815145058335e-06, + "loss": 0.476, + "step": 13381 + }, + { + "epoch": 1.6284758138119866, + "grad_norm": 1.6170086860656738, + "learning_rate": 1.7204874076771038e-06, + "loss": 0.3839, + "step": 13382 + }, + { + "epoch": 1.6285975053240036, + "grad_norm": 2.6791563034057617, + "learning_rate": 1.7193936159009627e-06, + "loss": 0.4347, + "step": 13383 + }, + { + "epoch": 1.6287191968360206, + "grad_norm": 2.8652350902557373, + "learning_rate": 1.718300139219037e-06, + "loss": 0.401, + "step": 13384 + }, + { + "epoch": 1.6288408883480376, + "grad_norm": 1.598162293434143, + "learning_rate": 1.7172069776729305e-06, + "loss": 0.3853, + "step": 13385 + }, + { + "epoch": 1.6289625798600547, + "grad_norm": 1.6476985216140747, + "learning_rate": 1.7161141313042463e-06, + "loss": 0.3461, + "step": 13386 + }, + { + "epoch": 1.6290842713720717, + "grad_norm": 1.8403626680374146, + "learning_rate": 1.7150216001545684e-06, + "loss": 0.396, + "step": 13387 + }, + { + "epoch": 1.6292059628840887, + "grad_norm": 2.9103140830993652, + "learning_rate": 1.7139293842654625e-06, + "loss": 0.4184, + "step": 13388 + }, + { + "epoch": 1.6293276543961057, + "grad_norm": 2.9887588024139404, + "learning_rate": 1.7128374836784967e-06, + "loss": 0.423, + "step": 13389 + }, + { + "epoch": 1.629449345908123, + "grad_norm": 2.2097301483154297, + "learning_rate": 1.7117458984352186e-06, + "loss": 0.313, + "step": 13390 + }, + { + "epoch": 1.62957103742014, + "grad_norm": 1.5335193872451782, + "learning_rate": 1.7106546285771618e-06, + "loss": 0.376, + "step": 13391 + }, + { + "epoch": 1.629692728932157, + "grad_norm": 2.8846802711486816, + "learning_rate": 1.7095636741458576e-06, + "loss": 0.3729, + "step": 13392 + }, + { + "epoch": 1.629814420444174, + "grad_norm": 1.8995273113250732, + "learning_rate": 1.7084730351828138e-06, + "loss": 0.3517, + "step": 13393 + }, + { + "epoch": 1.629936111956191, + "grad_norm": 2.5518577098846436, + "learning_rate": 1.7073827117295349e-06, + "loss": 0.3734, + "step": 13394 + }, + { + "epoch": 1.630057803468208, + "grad_norm": 2.041529417037964, + "learning_rate": 1.706292703827509e-06, + "loss": 0.4001, + "step": 13395 + }, + { + "epoch": 1.630179494980225, + "grad_norm": 1.9116995334625244, + "learning_rate": 1.7052030115182105e-06, + "loss": 0.4086, + "step": 13396 + }, + { + "epoch": 1.6303011864922423, + "grad_norm": 1.8903470039367676, + "learning_rate": 1.7041136348431087e-06, + "loss": 0.3801, + "step": 13397 + }, + { + "epoch": 1.6304228780042593, + "grad_norm": 1.2369699478149414, + "learning_rate": 1.7030245738436547e-06, + "loss": 0.3393, + "step": 13398 + }, + { + "epoch": 1.6305445695162764, + "grad_norm": 2.136042833328247, + "learning_rate": 1.7019358285612897e-06, + "loss": 0.3791, + "step": 13399 + }, + { + "epoch": 1.6306662610282934, + "grad_norm": 2.6420884132385254, + "learning_rate": 1.700847399037443e-06, + "loss": 0.3927, + "step": 13400 + }, + { + "epoch": 1.6307879525403104, + "grad_norm": 1.8712613582611084, + "learning_rate": 1.699759285313528e-06, + "loss": 0.3222, + "step": 13401 + }, + { + "epoch": 1.6309096440523274, + "grad_norm": 1.794637680053711, + "learning_rate": 1.698671487430955e-06, + "loss": 0.3563, + "step": 13402 + }, + { + "epoch": 1.6310313355643444, + "grad_norm": 1.964385747909546, + "learning_rate": 1.6975840054311143e-06, + "loss": 0.3008, + "step": 13403 + }, + { + "epoch": 1.6311530270763615, + "grad_norm": 1.9887021780014038, + "learning_rate": 1.696496839355386e-06, + "loss": 0.3737, + "step": 13404 + }, + { + "epoch": 1.6312747185883785, + "grad_norm": 1.9794068336486816, + "learning_rate": 1.6954099892451426e-06, + "loss": 0.3321, + "step": 13405 + }, + { + "epoch": 1.6313964101003955, + "grad_norm": 1.858148455619812, + "learning_rate": 1.6943234551417375e-06, + "loss": 0.4109, + "step": 13406 + }, + { + "epoch": 1.6315181016124125, + "grad_norm": 1.4207062721252441, + "learning_rate": 1.6932372370865147e-06, + "loss": 0.3746, + "step": 13407 + }, + { + "epoch": 1.6316397931244295, + "grad_norm": 1.9442638158798218, + "learning_rate": 1.692151335120812e-06, + "loss": 0.4241, + "step": 13408 + }, + { + "epoch": 1.6317614846364465, + "grad_norm": 2.0604519844055176, + "learning_rate": 1.6910657492859473e-06, + "loss": 0.3387, + "step": 13409 + }, + { + "epoch": 1.6318831761484636, + "grad_norm": 1.3367180824279785, + "learning_rate": 1.6899804796232288e-06, + "loss": 0.3026, + "step": 13410 + }, + { + "epoch": 1.6320048676604806, + "grad_norm": 1.6336756944656372, + "learning_rate": 1.6888955261739549e-06, + "loss": 0.4121, + "step": 13411 + }, + { + "epoch": 1.6321265591724976, + "grad_norm": 2.9144794940948486, + "learning_rate": 1.6878108889794065e-06, + "loss": 0.2901, + "step": 13412 + }, + { + "epoch": 1.6322482506845146, + "grad_norm": 2.1106197834014893, + "learning_rate": 1.6867265680808608e-06, + "loss": 0.3281, + "step": 13413 + }, + { + "epoch": 1.6323699421965316, + "grad_norm": 1.767088770866394, + "learning_rate": 1.6856425635195771e-06, + "loss": 0.3519, + "step": 13414 + }, + { + "epoch": 1.6324916337085489, + "grad_norm": 1.462937593460083, + "learning_rate": 1.6845588753368014e-06, + "loss": 0.3725, + "step": 13415 + }, + { + "epoch": 1.632613325220566, + "grad_norm": 1.2741680145263672, + "learning_rate": 1.6834755035737749e-06, + "loss": 0.3075, + "step": 13416 + }, + { + "epoch": 1.632735016732583, + "grad_norm": 1.4983042478561401, + "learning_rate": 1.6823924482717192e-06, + "loss": 0.3593, + "step": 13417 + }, + { + "epoch": 1.6328567082446, + "grad_norm": 1.7222654819488525, + "learning_rate": 1.6813097094718455e-06, + "loss": 0.3305, + "step": 13418 + }, + { + "epoch": 1.632978399756617, + "grad_norm": 2.429203748703003, + "learning_rate": 1.680227287215358e-06, + "loss": 0.403, + "step": 13419 + }, + { + "epoch": 1.633100091268634, + "grad_norm": 1.4949259757995605, + "learning_rate": 1.6791451815434445e-06, + "loss": 0.3288, + "step": 13420 + }, + { + "epoch": 1.633221782780651, + "grad_norm": 2.1316640377044678, + "learning_rate": 1.6780633924972766e-06, + "loss": 0.3838, + "step": 13421 + }, + { + "epoch": 1.6333434742926682, + "grad_norm": 1.9163856506347656, + "learning_rate": 1.6769819201180271e-06, + "loss": 0.356, + "step": 13422 + }, + { + "epoch": 1.6334651658046853, + "grad_norm": 1.452164888381958, + "learning_rate": 1.6759007644468384e-06, + "loss": 0.3316, + "step": 13423 + }, + { + "epoch": 1.6335868573167023, + "grad_norm": 2.5675241947174072, + "learning_rate": 1.6748199255248576e-06, + "loss": 0.4283, + "step": 13424 + }, + { + "epoch": 1.6337085488287193, + "grad_norm": 2.2385644912719727, + "learning_rate": 1.6737394033932108e-06, + "loss": 0.3341, + "step": 13425 + }, + { + "epoch": 1.6338302403407363, + "grad_norm": 1.9260796308517456, + "learning_rate": 1.6726591980930117e-06, + "loss": 0.4016, + "step": 13426 + }, + { + "epoch": 1.6339519318527533, + "grad_norm": 2.012775421142578, + "learning_rate": 1.6715793096653698e-06, + "loss": 0.4048, + "step": 13427 + }, + { + "epoch": 1.6340736233647704, + "grad_norm": 1.357103705406189, + "learning_rate": 1.6704997381513744e-06, + "loss": 0.3463, + "step": 13428 + }, + { + "epoch": 1.6341953148767874, + "grad_norm": 2.2745702266693115, + "learning_rate": 1.6694204835921014e-06, + "loss": 0.3746, + "step": 13429 + }, + { + "epoch": 1.6343170063888044, + "grad_norm": 1.7849301099777222, + "learning_rate": 1.6683415460286256e-06, + "loss": 0.3575, + "step": 13430 + }, + { + "epoch": 1.6344386979008214, + "grad_norm": 1.900254726409912, + "learning_rate": 1.667262925501999e-06, + "loss": 0.3847, + "step": 13431 + }, + { + "epoch": 1.6345603894128384, + "grad_norm": 2.481780767440796, + "learning_rate": 1.6661846220532641e-06, + "loss": 0.3043, + "step": 13432 + }, + { + "epoch": 1.6346820809248555, + "grad_norm": 1.8348278999328613, + "learning_rate": 1.665106635723457e-06, + "loss": 0.3284, + "step": 13433 + }, + { + "epoch": 1.6348037724368725, + "grad_norm": 3.7086610794067383, + "learning_rate": 1.6640289665535935e-06, + "loss": 0.3311, + "step": 13434 + }, + { + "epoch": 1.6349254639488895, + "grad_norm": 2.66515851020813, + "learning_rate": 1.6629516145846836e-06, + "loss": 0.4147, + "step": 13435 + }, + { + "epoch": 1.6350471554609065, + "grad_norm": 1.8951871395111084, + "learning_rate": 1.6618745798577207e-06, + "loss": 0.3318, + "step": 13436 + }, + { + "epoch": 1.6351688469729235, + "grad_norm": 2.4279592037200928, + "learning_rate": 1.6607978624136868e-06, + "loss": 0.3737, + "step": 13437 + }, + { + "epoch": 1.6352905384849405, + "grad_norm": 1.6277490854263306, + "learning_rate": 1.6597214622935576e-06, + "loss": 0.3477, + "step": 13438 + }, + { + "epoch": 1.6354122299969576, + "grad_norm": 1.6486586332321167, + "learning_rate": 1.6586453795382906e-06, + "loss": 0.3532, + "step": 13439 + }, + { + "epoch": 1.6355339215089746, + "grad_norm": 1.428605079650879, + "learning_rate": 1.6575696141888298e-06, + "loss": 0.3293, + "step": 13440 + }, + { + "epoch": 1.6356556130209918, + "grad_norm": 3.075319528579712, + "learning_rate": 1.6564941662861156e-06, + "loss": 0.4263, + "step": 13441 + }, + { + "epoch": 1.6357773045330088, + "grad_norm": 2.772754192352295, + "learning_rate": 1.6554190358710688e-06, + "loss": 0.4277, + "step": 13442 + }, + { + "epoch": 1.6358989960450259, + "grad_norm": 2.264282703399658, + "learning_rate": 1.6543442229845973e-06, + "loss": 0.3724, + "step": 13443 + }, + { + "epoch": 1.6360206875570429, + "grad_norm": 2.1718485355377197, + "learning_rate": 1.6532697276676056e-06, + "loss": 0.3485, + "step": 13444 + }, + { + "epoch": 1.63614237906906, + "grad_norm": 1.5994768142700195, + "learning_rate": 1.6521955499609776e-06, + "loss": 0.3993, + "step": 13445 + }, + { + "epoch": 1.636264070581077, + "grad_norm": 2.743475914001465, + "learning_rate": 1.651121689905587e-06, + "loss": 0.4191, + "step": 13446 + }, + { + "epoch": 1.6363857620930942, + "grad_norm": 1.5631766319274902, + "learning_rate": 1.650048147542298e-06, + "loss": 0.3511, + "step": 13447 + }, + { + "epoch": 1.6365074536051112, + "grad_norm": 1.707613468170166, + "learning_rate": 1.648974922911958e-06, + "loss": 0.3621, + "step": 13448 + }, + { + "epoch": 1.6366291451171282, + "grad_norm": 2.5252771377563477, + "learning_rate": 1.6479020160554093e-06, + "loss": 0.3122, + "step": 13449 + }, + { + "epoch": 1.6367508366291452, + "grad_norm": 1.849367380142212, + "learning_rate": 1.6468294270134777e-06, + "loss": 0.3764, + "step": 13450 + }, + { + "epoch": 1.6368725281411622, + "grad_norm": 2.094403028488159, + "learning_rate": 1.6457571558269747e-06, + "loss": 0.3988, + "step": 13451 + }, + { + "epoch": 1.6369942196531793, + "grad_norm": 3.9948136806488037, + "learning_rate": 1.6446852025367055e-06, + "loss": 0.3298, + "step": 13452 + }, + { + "epoch": 1.6371159111651963, + "grad_norm": 1.7776210308074951, + "learning_rate": 1.6436135671834574e-06, + "loss": 0.3812, + "step": 13453 + }, + { + "epoch": 1.6372376026772133, + "grad_norm": 1.5117802619934082, + "learning_rate": 1.6425422498080112e-06, + "loss": 0.3159, + "step": 13454 + }, + { + "epoch": 1.6373592941892303, + "grad_norm": 2.207461357116699, + "learning_rate": 1.641471250451132e-06, + "loss": 0.3026, + "step": 13455 + }, + { + "epoch": 1.6374809857012473, + "grad_norm": 1.9573259353637695, + "learning_rate": 1.6404005691535707e-06, + "loss": 0.3378, + "step": 13456 + }, + { + "epoch": 1.6376026772132644, + "grad_norm": 2.1857869625091553, + "learning_rate": 1.6393302059560756e-06, + "loss": 0.3382, + "step": 13457 + }, + { + "epoch": 1.6377243687252814, + "grad_norm": 1.6931297779083252, + "learning_rate": 1.6382601608993698e-06, + "loss": 0.3608, + "step": 13458 + }, + { + "epoch": 1.6378460602372984, + "grad_norm": 1.9838147163391113, + "learning_rate": 1.6371904340241694e-06, + "loss": 0.3602, + "step": 13459 + }, + { + "epoch": 1.6379677517493154, + "grad_norm": 2.7030320167541504, + "learning_rate": 1.6361210253711868e-06, + "loss": 0.3968, + "step": 13460 + }, + { + "epoch": 1.6380894432613324, + "grad_norm": 1.8258942365646362, + "learning_rate": 1.6350519349811078e-06, + "loss": 0.3636, + "step": 13461 + }, + { + "epoch": 1.6382111347733495, + "grad_norm": 1.7149296998977661, + "learning_rate": 1.6339831628946202e-06, + "loss": 0.3665, + "step": 13462 + }, + { + "epoch": 1.6383328262853665, + "grad_norm": 2.036803722381592, + "learning_rate": 1.63291470915239e-06, + "loss": 0.3888, + "step": 13463 + }, + { + "epoch": 1.6384545177973835, + "grad_norm": 3.1245319843292236, + "learning_rate": 1.6318465737950718e-06, + "loss": 0.4389, + "step": 13464 + }, + { + "epoch": 1.6385762093094005, + "grad_norm": 1.8545536994934082, + "learning_rate": 1.630778756863315e-06, + "loss": 0.4056, + "step": 13465 + }, + { + "epoch": 1.6386979008214178, + "grad_norm": 2.2261717319488525, + "learning_rate": 1.6297112583977493e-06, + "loss": 0.363, + "step": 13466 + }, + { + "epoch": 1.6388195923334348, + "grad_norm": 2.0839240550994873, + "learning_rate": 1.6286440784389934e-06, + "loss": 0.3556, + "step": 13467 + }, + { + "epoch": 1.6389412838454518, + "grad_norm": 1.6345210075378418, + "learning_rate": 1.6275772170276638e-06, + "loss": 0.3517, + "step": 13468 + }, + { + "epoch": 1.6390629753574688, + "grad_norm": 2.0055270195007324, + "learning_rate": 1.6265106742043446e-06, + "loss": 0.3985, + "step": 13469 + }, + { + "epoch": 1.6391846668694858, + "grad_norm": 1.2477595806121826, + "learning_rate": 1.62544445000963e-06, + "loss": 0.3402, + "step": 13470 + }, + { + "epoch": 1.6393063583815028, + "grad_norm": 2.4617700576782227, + "learning_rate": 1.6243785444840888e-06, + "loss": 0.4238, + "step": 13471 + }, + { + "epoch": 1.63942804989352, + "grad_norm": 1.89751398563385, + "learning_rate": 1.623312957668277e-06, + "loss": 0.4055, + "step": 13472 + }, + { + "epoch": 1.639549741405537, + "grad_norm": 1.828223705291748, + "learning_rate": 1.6222476896027495e-06, + "loss": 0.349, + "step": 13473 + }, + { + "epoch": 1.6396714329175541, + "grad_norm": 1.4702894687652588, + "learning_rate": 1.6211827403280378e-06, + "loss": 0.3763, + "step": 13474 + }, + { + "epoch": 1.6397931244295711, + "grad_norm": 3.080322504043579, + "learning_rate": 1.620118109884663e-06, + "loss": 0.4438, + "step": 13475 + }, + { + "epoch": 1.6399148159415882, + "grad_norm": 2.0052330493927, + "learning_rate": 1.6190537983131428e-06, + "loss": 0.3738, + "step": 13476 + }, + { + "epoch": 1.6400365074536052, + "grad_norm": 1.811352252960205, + "learning_rate": 1.6179898056539734e-06, + "loss": 0.3878, + "step": 13477 + }, + { + "epoch": 1.6401581989656222, + "grad_norm": 2.2385783195495605, + "learning_rate": 1.6169261319476392e-06, + "loss": 0.346, + "step": 13478 + }, + { + "epoch": 1.6402798904776392, + "grad_norm": 1.8236132860183716, + "learning_rate": 1.615862777234619e-06, + "loss": 0.3227, + "step": 13479 + }, + { + "epoch": 1.6404015819896562, + "grad_norm": 1.4718546867370605, + "learning_rate": 1.6147997415553751e-06, + "loss": 0.3456, + "step": 13480 + }, + { + "epoch": 1.6405232735016733, + "grad_norm": 3.5592145919799805, + "learning_rate": 1.6137370249503582e-06, + "loss": 0.3065, + "step": 13481 + }, + { + "epoch": 1.6406449650136903, + "grad_norm": 2.163372755050659, + "learning_rate": 1.6126746274600048e-06, + "loss": 0.3826, + "step": 13482 + }, + { + "epoch": 1.6407666565257073, + "grad_norm": 3.2274222373962402, + "learning_rate": 1.6116125491247413e-06, + "loss": 0.3623, + "step": 13483 + }, + { + "epoch": 1.6408883480377243, + "grad_norm": 1.6441229581832886, + "learning_rate": 1.6105507899849847e-06, + "loss": 0.3639, + "step": 13484 + }, + { + "epoch": 1.6410100395497413, + "grad_norm": 2.035132884979248, + "learning_rate": 1.609489350081137e-06, + "loss": 0.3367, + "step": 13485 + }, + { + "epoch": 1.6411317310617584, + "grad_norm": 2.9197516441345215, + "learning_rate": 1.6084282294535835e-06, + "loss": 0.4271, + "step": 13486 + }, + { + "epoch": 1.6412534225737754, + "grad_norm": 1.5328617095947266, + "learning_rate": 1.6073674281427075e-06, + "loss": 0.3806, + "step": 13487 + }, + { + "epoch": 1.6413751140857924, + "grad_norm": 1.8179408311843872, + "learning_rate": 1.6063069461888737e-06, + "loss": 0.3669, + "step": 13488 + }, + { + "epoch": 1.6414968055978094, + "grad_norm": 1.252159595489502, + "learning_rate": 1.6052467836324315e-06, + "loss": 0.3202, + "step": 13489 + }, + { + "epoch": 1.6416184971098264, + "grad_norm": 1.6930807828903198, + "learning_rate": 1.604186940513729e-06, + "loss": 0.3136, + "step": 13490 + }, + { + "epoch": 1.6417401886218437, + "grad_norm": 3.6778435707092285, + "learning_rate": 1.6031274168730903e-06, + "loss": 0.4552, + "step": 13491 + }, + { + "epoch": 1.6418618801338607, + "grad_norm": 1.778458833694458, + "learning_rate": 1.6020682127508348e-06, + "loss": 0.3638, + "step": 13492 + }, + { + "epoch": 1.6419835716458777, + "grad_norm": 1.3359156847000122, + "learning_rate": 1.6010093281872662e-06, + "loss": 0.3473, + "step": 13493 + }, + { + "epoch": 1.6421052631578947, + "grad_norm": 1.4263921976089478, + "learning_rate": 1.5999507632226752e-06, + "loss": 0.3303, + "step": 13494 + }, + { + "epoch": 1.6422269546699118, + "grad_norm": 2.1085805892944336, + "learning_rate": 1.598892517897348e-06, + "loss": 0.3819, + "step": 13495 + }, + { + "epoch": 1.6423486461819288, + "grad_norm": 2.020066738128662, + "learning_rate": 1.5978345922515481e-06, + "loss": 0.3257, + "step": 13496 + }, + { + "epoch": 1.6424703376939458, + "grad_norm": 1.9756035804748535, + "learning_rate": 1.5967769863255322e-06, + "loss": 0.3569, + "step": 13497 + }, + { + "epoch": 1.642592029205963, + "grad_norm": 2.3926239013671875, + "learning_rate": 1.595719700159548e-06, + "loss": 0.3828, + "step": 13498 + }, + { + "epoch": 1.64271372071798, + "grad_norm": 1.6480560302734375, + "learning_rate": 1.594662733793826e-06, + "loss": 0.3631, + "step": 13499 + }, + { + "epoch": 1.642835412229997, + "grad_norm": 2.0234663486480713, + "learning_rate": 1.5936060872685821e-06, + "loss": 0.3821, + "step": 13500 + }, + { + "epoch": 1.642957103742014, + "grad_norm": 1.726359486579895, + "learning_rate": 1.592549760624028e-06, + "loss": 0.3989, + "step": 13501 + }, + { + "epoch": 1.643078795254031, + "grad_norm": 1.9464523792266846, + "learning_rate": 1.5914937539003594e-06, + "loss": 0.4144, + "step": 13502 + }, + { + "epoch": 1.6432004867660481, + "grad_norm": 2.832329511642456, + "learning_rate": 1.5904380671377574e-06, + "loss": 0.2845, + "step": 13503 + }, + { + "epoch": 1.6433221782780651, + "grad_norm": 1.3468236923217773, + "learning_rate": 1.589382700376394e-06, + "loss": 0.3184, + "step": 13504 + }, + { + "epoch": 1.6434438697900822, + "grad_norm": 1.4346479177474976, + "learning_rate": 1.5883276536564263e-06, + "loss": 0.3541, + "step": 13505 + }, + { + "epoch": 1.6435655613020992, + "grad_norm": 2.4173312187194824, + "learning_rate": 1.5872729270180043e-06, + "loss": 0.3372, + "step": 13506 + }, + { + "epoch": 1.6436872528141162, + "grad_norm": 1.4922704696655273, + "learning_rate": 1.5862185205012603e-06, + "loss": 0.3518, + "step": 13507 + }, + { + "epoch": 1.6438089443261332, + "grad_norm": 2.3446948528289795, + "learning_rate": 1.5851644341463157e-06, + "loss": 0.3378, + "step": 13508 + }, + { + "epoch": 1.6439306358381502, + "grad_norm": 2.416332483291626, + "learning_rate": 1.5841106679932838e-06, + "loss": 0.4321, + "step": 13509 + }, + { + "epoch": 1.6440523273501673, + "grad_norm": 1.436244249343872, + "learning_rate": 1.5830572220822604e-06, + "loss": 0.3636, + "step": 13510 + }, + { + "epoch": 1.6441740188621843, + "grad_norm": 2.0416808128356934, + "learning_rate": 1.5820040964533313e-06, + "loss": 0.3768, + "step": 13511 + }, + { + "epoch": 1.6442957103742013, + "grad_norm": 2.190077304840088, + "learning_rate": 1.5809512911465708e-06, + "loss": 0.4151, + "step": 13512 + }, + { + "epoch": 1.6444174018862183, + "grad_norm": 3.4898064136505127, + "learning_rate": 1.5798988062020392e-06, + "loss": 0.4347, + "step": 13513 + }, + { + "epoch": 1.6445390933982353, + "grad_norm": 2.559004306793213, + "learning_rate": 1.5788466416597914e-06, + "loss": 0.383, + "step": 13514 + }, + { + "epoch": 1.6446607849102524, + "grad_norm": 2.573141574859619, + "learning_rate": 1.5777947975598573e-06, + "loss": 0.4365, + "step": 13515 + }, + { + "epoch": 1.6447824764222696, + "grad_norm": 1.6363483667373657, + "learning_rate": 1.5767432739422607e-06, + "loss": 0.3973, + "step": 13516 + }, + { + "epoch": 1.6449041679342866, + "grad_norm": 5.089287281036377, + "learning_rate": 1.5756920708470213e-06, + "loss": 0.2951, + "step": 13517 + }, + { + "epoch": 1.6450258594463036, + "grad_norm": 1.376924991607666, + "learning_rate": 1.5746411883141332e-06, + "loss": 0.3305, + "step": 13518 + }, + { + "epoch": 1.6451475509583207, + "grad_norm": 1.7490031719207764, + "learning_rate": 1.5735906263835898e-06, + "loss": 0.3357, + "step": 13519 + }, + { + "epoch": 1.6452692424703377, + "grad_norm": 1.9388436079025269, + "learning_rate": 1.5725403850953647e-06, + "loss": 0.3967, + "step": 13520 + }, + { + "epoch": 1.6453909339823547, + "grad_norm": 1.8339444398880005, + "learning_rate": 1.571490464489419e-06, + "loss": 0.4305, + "step": 13521 + }, + { + "epoch": 1.6455126254943717, + "grad_norm": 1.4353859424591064, + "learning_rate": 1.5704408646057101e-06, + "loss": 0.3237, + "step": 13522 + }, + { + "epoch": 1.645634317006389, + "grad_norm": 1.4388935565948486, + "learning_rate": 1.5693915854841747e-06, + "loss": 0.4038, + "step": 13523 + }, + { + "epoch": 1.645756008518406, + "grad_norm": 1.8594416379928589, + "learning_rate": 1.5683426271647373e-06, + "loss": 0.4112, + "step": 13524 + }, + { + "epoch": 1.645877700030423, + "grad_norm": 2.123471975326538, + "learning_rate": 1.5672939896873184e-06, + "loss": 0.41, + "step": 13525 + }, + { + "epoch": 1.64599939154244, + "grad_norm": 2.739448308944702, + "learning_rate": 1.5662456730918174e-06, + "loss": 0.4007, + "step": 13526 + }, + { + "epoch": 1.646121083054457, + "grad_norm": 1.8444348573684692, + "learning_rate": 1.5651976774181255e-06, + "loss": 0.3317, + "step": 13527 + }, + { + "epoch": 1.646242774566474, + "grad_norm": 1.8997925519943237, + "learning_rate": 1.5641500027061206e-06, + "loss": 0.3291, + "step": 13528 + }, + { + "epoch": 1.646364466078491, + "grad_norm": 2.3686363697052, + "learning_rate": 1.563102648995668e-06, + "loss": 0.3302, + "step": 13529 + }, + { + "epoch": 1.646486157590508, + "grad_norm": 4.1441426277160645, + "learning_rate": 1.5620556163266244e-06, + "loss": 0.4486, + "step": 13530 + }, + { + "epoch": 1.6466078491025251, + "grad_norm": 2.0939393043518066, + "learning_rate": 1.5610089047388311e-06, + "loss": 0.3737, + "step": 13531 + }, + { + "epoch": 1.6467295406145421, + "grad_norm": 1.6156666278839111, + "learning_rate": 1.559962514272113e-06, + "loss": 0.3721, + "step": 13532 + }, + { + "epoch": 1.6468512321265592, + "grad_norm": 1.9121696949005127, + "learning_rate": 1.558916444966294e-06, + "loss": 0.4047, + "step": 13533 + }, + { + "epoch": 1.6469729236385762, + "grad_norm": 1.62413489818573, + "learning_rate": 1.557870696861178e-06, + "loss": 0.3614, + "step": 13534 + }, + { + "epoch": 1.6470946151505932, + "grad_norm": 1.4637845754623413, + "learning_rate": 1.5568252699965514e-06, + "loss": 0.3871, + "step": 13535 + }, + { + "epoch": 1.6472163066626102, + "grad_norm": 2.2136142253875732, + "learning_rate": 1.555780164412204e-06, + "loss": 0.345, + "step": 13536 + }, + { + "epoch": 1.6473379981746272, + "grad_norm": 1.998266577720642, + "learning_rate": 1.5547353801478993e-06, + "loss": 0.3507, + "step": 13537 + }, + { + "epoch": 1.6474596896866442, + "grad_norm": 1.9016863107681274, + "learning_rate": 1.5536909172433935e-06, + "loss": 0.4068, + "step": 13538 + }, + { + "epoch": 1.6475813811986613, + "grad_norm": 1.7950299978256226, + "learning_rate": 1.5526467757384322e-06, + "loss": 0.3323, + "step": 13539 + }, + { + "epoch": 1.6477030727106783, + "grad_norm": 1.8009721040725708, + "learning_rate": 1.551602955672743e-06, + "loss": 0.3648, + "step": 13540 + }, + { + "epoch": 1.6478247642226953, + "grad_norm": 1.5335566997528076, + "learning_rate": 1.5505594570860505e-06, + "loss": 0.399, + "step": 13541 + }, + { + "epoch": 1.6479464557347125, + "grad_norm": 2.015314817428589, + "learning_rate": 1.5495162800180608e-06, + "loss": 0.3872, + "step": 13542 + }, + { + "epoch": 1.6480681472467296, + "grad_norm": 2.049790382385254, + "learning_rate": 1.5484734245084665e-06, + "loss": 0.4352, + "step": 13543 + }, + { + "epoch": 1.6481898387587466, + "grad_norm": 1.487663984298706, + "learning_rate": 1.5474308905969537e-06, + "loss": 0.3068, + "step": 13544 + }, + { + "epoch": 1.6483115302707636, + "grad_norm": 1.8583332300186157, + "learning_rate": 1.5463886783231906e-06, + "loss": 0.3648, + "step": 13545 + }, + { + "epoch": 1.6484332217827806, + "grad_norm": 1.8660967350006104, + "learning_rate": 1.5453467877268346e-06, + "loss": 0.343, + "step": 13546 + }, + { + "epoch": 1.6485549132947976, + "grad_norm": 1.659022331237793, + "learning_rate": 1.544305218847536e-06, + "loss": 0.4167, + "step": 13547 + }, + { + "epoch": 1.6486766048068149, + "grad_norm": 1.9124284982681274, + "learning_rate": 1.5432639717249266e-06, + "loss": 0.308, + "step": 13548 + }, + { + "epoch": 1.648798296318832, + "grad_norm": 3.2666680812835693, + "learning_rate": 1.5422230463986277e-06, + "loss": 0.3038, + "step": 13549 + }, + { + "epoch": 1.648919987830849, + "grad_norm": 2.1303718090057373, + "learning_rate": 1.5411824429082478e-06, + "loss": 0.365, + "step": 13550 + }, + { + "epoch": 1.649041679342866, + "grad_norm": 2.429307699203491, + "learning_rate": 1.540142161293382e-06, + "loss": 0.3163, + "step": 13551 + }, + { + "epoch": 1.649163370854883, + "grad_norm": 1.8923447132110596, + "learning_rate": 1.5391022015936209e-06, + "loss": 0.3512, + "step": 13552 + }, + { + "epoch": 1.6492850623669, + "grad_norm": 2.2367300987243652, + "learning_rate": 1.5380625638485337e-06, + "loss": 0.4674, + "step": 13553 + }, + { + "epoch": 1.649406753878917, + "grad_norm": 1.349216341972351, + "learning_rate": 1.5370232480976787e-06, + "loss": 0.3204, + "step": 13554 + }, + { + "epoch": 1.649528445390934, + "grad_norm": 1.6021007299423218, + "learning_rate": 1.5359842543806103e-06, + "loss": 0.3889, + "step": 13555 + }, + { + "epoch": 1.649650136902951, + "grad_norm": 2.0629358291625977, + "learning_rate": 1.5349455827368586e-06, + "loss": 0.4152, + "step": 13556 + }, + { + "epoch": 1.649771828414968, + "grad_norm": 2.156932830810547, + "learning_rate": 1.533907233205948e-06, + "loss": 0.3068, + "step": 13557 + }, + { + "epoch": 1.649893519926985, + "grad_norm": 1.9429261684417725, + "learning_rate": 1.532869205827393e-06, + "loss": 0.3664, + "step": 13558 + }, + { + "epoch": 1.650015211439002, + "grad_norm": 1.6506551504135132, + "learning_rate": 1.5318315006406915e-06, + "loss": 0.3775, + "step": 13559 + }, + { + "epoch": 1.6501369029510191, + "grad_norm": 1.749122977256775, + "learning_rate": 1.5307941176853292e-06, + "loss": 0.3346, + "step": 13560 + }, + { + "epoch": 1.6502585944630361, + "grad_norm": 1.6698367595672607, + "learning_rate": 1.5297570570007802e-06, + "loss": 0.3525, + "step": 13561 + }, + { + "epoch": 1.6503802859750532, + "grad_norm": 3.209825038909912, + "learning_rate": 1.5287203186265055e-06, + "loss": 0.423, + "step": 13562 + }, + { + "epoch": 1.6505019774870702, + "grad_norm": 2.068608522415161, + "learning_rate": 1.5276839026019597e-06, + "loss": 0.3923, + "step": 13563 + }, + { + "epoch": 1.6506236689990872, + "grad_norm": 1.9705032110214233, + "learning_rate": 1.5266478089665793e-06, + "loss": 0.3152, + "step": 13564 + }, + { + "epoch": 1.6507453605111042, + "grad_norm": 1.3871736526489258, + "learning_rate": 1.525612037759785e-06, + "loss": 0.3552, + "step": 13565 + }, + { + "epoch": 1.6508670520231212, + "grad_norm": 2.300062417984009, + "learning_rate": 1.5245765890209963e-06, + "loss": 0.3359, + "step": 13566 + }, + { + "epoch": 1.6509887435351385, + "grad_norm": 1.5899654626846313, + "learning_rate": 1.5235414627896117e-06, + "loss": 0.3174, + "step": 13567 + }, + { + "epoch": 1.6511104350471555, + "grad_norm": 2.0018270015716553, + "learning_rate": 1.5225066591050174e-06, + "loss": 0.3696, + "step": 13568 + }, + { + "epoch": 1.6512321265591725, + "grad_norm": 2.9894344806671143, + "learning_rate": 1.5214721780065944e-06, + "loss": 0.4339, + "step": 13569 + }, + { + "epoch": 1.6513538180711895, + "grad_norm": 2.1566507816314697, + "learning_rate": 1.5204380195337022e-06, + "loss": 0.3463, + "step": 13570 + }, + { + "epoch": 1.6514755095832065, + "grad_norm": 1.2636442184448242, + "learning_rate": 1.5194041837256979e-06, + "loss": 0.3406, + "step": 13571 + }, + { + "epoch": 1.6515972010952236, + "grad_norm": 2.55454683303833, + "learning_rate": 1.5183706706219202e-06, + "loss": 0.3964, + "step": 13572 + }, + { + "epoch": 1.6517188926072408, + "grad_norm": 1.8894301652908325, + "learning_rate": 1.5173374802616892e-06, + "loss": 0.3143, + "step": 13573 + }, + { + "epoch": 1.6518405841192578, + "grad_norm": 2.53320050239563, + "learning_rate": 1.5163046126843283e-06, + "loss": 0.389, + "step": 13574 + }, + { + "epoch": 1.6519622756312748, + "grad_norm": 3.6553306579589844, + "learning_rate": 1.5152720679291377e-06, + "loss": 0.317, + "step": 13575 + }, + { + "epoch": 1.6520839671432919, + "grad_norm": 1.83319890499115, + "learning_rate": 1.5142398460354036e-06, + "loss": 0.3252, + "step": 13576 + }, + { + "epoch": 1.6522056586553089, + "grad_norm": 1.5476677417755127, + "learning_rate": 1.513207947042411e-06, + "loss": 0.3582, + "step": 13577 + }, + { + "epoch": 1.652327350167326, + "grad_norm": 2.0710372924804688, + "learning_rate": 1.512176370989421e-06, + "loss": 0.3973, + "step": 13578 + }, + { + "epoch": 1.652449041679343, + "grad_norm": 3.0273845195770264, + "learning_rate": 1.511145117915691e-06, + "loss": 0.3033, + "step": 13579 + }, + { + "epoch": 1.65257073319136, + "grad_norm": 2.030923366546631, + "learning_rate": 1.5101141878604598e-06, + "loss": 0.2935, + "step": 13580 + }, + { + "epoch": 1.652692424703377, + "grad_norm": 2.000373601913452, + "learning_rate": 1.5090835808629544e-06, + "loss": 0.4074, + "step": 13581 + }, + { + "epoch": 1.652814116215394, + "grad_norm": 1.9685217142105103, + "learning_rate": 1.5080532969623963e-06, + "loss": 0.4195, + "step": 13582 + }, + { + "epoch": 1.652935807727411, + "grad_norm": 1.5527143478393555, + "learning_rate": 1.5070233361979902e-06, + "loss": 0.3376, + "step": 13583 + }, + { + "epoch": 1.653057499239428, + "grad_norm": 2.4072747230529785, + "learning_rate": 1.5059936986089208e-06, + "loss": 0.4189, + "step": 13584 + }, + { + "epoch": 1.653179190751445, + "grad_norm": 3.1512880325317383, + "learning_rate": 1.5049643842343753e-06, + "loss": 0.427, + "step": 13585 + }, + { + "epoch": 1.653300882263462, + "grad_norm": 1.9627091884613037, + "learning_rate": 1.5039353931135169e-06, + "loss": 0.3999, + "step": 13586 + }, + { + "epoch": 1.653422573775479, + "grad_norm": 1.905883550643921, + "learning_rate": 1.5029067252855045e-06, + "loss": 0.3998, + "step": 13587 + }, + { + "epoch": 1.653544265287496, + "grad_norm": 2.125684976577759, + "learning_rate": 1.5018783807894789e-06, + "loss": 0.3692, + "step": 13588 + }, + { + "epoch": 1.6536659567995131, + "grad_norm": 1.6453704833984375, + "learning_rate": 1.5008503596645697e-06, + "loss": 0.3312, + "step": 13589 + }, + { + "epoch": 1.6537876483115301, + "grad_norm": 1.7019684314727783, + "learning_rate": 1.4998226619498979e-06, + "loss": 0.3556, + "step": 13590 + }, + { + "epoch": 1.6539093398235472, + "grad_norm": 2.6745917797088623, + "learning_rate": 1.498795287684569e-06, + "loss": 0.312, + "step": 13591 + }, + { + "epoch": 1.6540310313355644, + "grad_norm": 2.4666833877563477, + "learning_rate": 1.497768236907673e-06, + "loss": 0.3479, + "step": 13592 + }, + { + "epoch": 1.6541527228475814, + "grad_norm": 1.7143189907073975, + "learning_rate": 1.4967415096582972e-06, + "loss": 0.3898, + "step": 13593 + }, + { + "epoch": 1.6542744143595984, + "grad_norm": 1.3360440731048584, + "learning_rate": 1.495715105975507e-06, + "loss": 0.2944, + "step": 13594 + }, + { + "epoch": 1.6543961058716155, + "grad_norm": 2.0141937732696533, + "learning_rate": 1.4946890258983603e-06, + "loss": 0.4944, + "step": 13595 + }, + { + "epoch": 1.6545177973836325, + "grad_norm": 1.803688883781433, + "learning_rate": 1.4936632694659004e-06, + "loss": 0.3728, + "step": 13596 + }, + { + "epoch": 1.6546394888956495, + "grad_norm": 1.8607743978500366, + "learning_rate": 1.492637836717159e-06, + "loss": 0.3296, + "step": 13597 + }, + { + "epoch": 1.6547611804076665, + "grad_norm": 1.640473484992981, + "learning_rate": 1.4916127276911596e-06, + "loss": 0.3749, + "step": 13598 + }, + { + "epoch": 1.6548828719196838, + "grad_norm": 1.659511685371399, + "learning_rate": 1.4905879424269066e-06, + "loss": 0.3677, + "step": 13599 + }, + { + "epoch": 1.6550045634317008, + "grad_norm": 3.7554428577423096, + "learning_rate": 1.489563480963394e-06, + "loss": 0.3897, + "step": 13600 + }, + { + "epoch": 1.6551262549437178, + "grad_norm": 2.6372745037078857, + "learning_rate": 1.4885393433396089e-06, + "loss": 0.4192, + "step": 13601 + }, + { + "epoch": 1.6552479464557348, + "grad_norm": 2.3624267578125, + "learning_rate": 1.4875155295945187e-06, + "loss": 0.3046, + "step": 13602 + }, + { + "epoch": 1.6553696379677518, + "grad_norm": 1.6589763164520264, + "learning_rate": 1.4864920397670812e-06, + "loss": 0.3852, + "step": 13603 + }, + { + "epoch": 1.6554913294797688, + "grad_norm": 1.946058750152588, + "learning_rate": 1.4854688738962452e-06, + "loss": 0.3701, + "step": 13604 + }, + { + "epoch": 1.6556130209917859, + "grad_norm": 2.2866172790527344, + "learning_rate": 1.4844460320209431e-06, + "loss": 0.2729, + "step": 13605 + }, + { + "epoch": 1.6557347125038029, + "grad_norm": 2.7910420894622803, + "learning_rate": 1.4834235141800957e-06, + "loss": 0.3243, + "step": 13606 + }, + { + "epoch": 1.65585640401582, + "grad_norm": 2.127629041671753, + "learning_rate": 1.4824013204126119e-06, + "loss": 0.3572, + "step": 13607 + }, + { + "epoch": 1.655978095527837, + "grad_norm": 1.3885986804962158, + "learning_rate": 1.4813794507573865e-06, + "loss": 0.3237, + "step": 13608 + }, + { + "epoch": 1.656099787039854, + "grad_norm": 1.5610814094543457, + "learning_rate": 1.4803579052533068e-06, + "loss": 0.3115, + "step": 13609 + }, + { + "epoch": 1.656221478551871, + "grad_norm": 1.7771389484405518, + "learning_rate": 1.4793366839392443e-06, + "loss": 0.382, + "step": 13610 + }, + { + "epoch": 1.656343170063888, + "grad_norm": 1.7746970653533936, + "learning_rate": 1.4783157868540555e-06, + "loss": 0.3823, + "step": 13611 + }, + { + "epoch": 1.656464861575905, + "grad_norm": 1.6492103338241577, + "learning_rate": 1.4772952140365914e-06, + "loss": 0.3275, + "step": 13612 + }, + { + "epoch": 1.656586553087922, + "grad_norm": 1.9623640775680542, + "learning_rate": 1.4762749655256859e-06, + "loss": 0.396, + "step": 13613 + }, + { + "epoch": 1.656708244599939, + "grad_norm": 2.6154098510742188, + "learning_rate": 1.4752550413601586e-06, + "loss": 0.3634, + "step": 13614 + }, + { + "epoch": 1.656829936111956, + "grad_norm": 1.8920204639434814, + "learning_rate": 1.4742354415788252e-06, + "loss": 0.2903, + "step": 13615 + }, + { + "epoch": 1.656951627623973, + "grad_norm": 1.7871285676956177, + "learning_rate": 1.4732161662204803e-06, + "loss": 0.3853, + "step": 13616 + }, + { + "epoch": 1.6570733191359903, + "grad_norm": 1.8500959873199463, + "learning_rate": 1.4721972153239073e-06, + "loss": 0.2934, + "step": 13617 + }, + { + "epoch": 1.6571950106480073, + "grad_norm": 2.0763282775878906, + "learning_rate": 1.4711785889278863e-06, + "loss": 0.3775, + "step": 13618 + }, + { + "epoch": 1.6573167021600244, + "grad_norm": 2.6651928424835205, + "learning_rate": 1.4701602870711696e-06, + "loss": 0.422, + "step": 13619 + }, + { + "epoch": 1.6574383936720414, + "grad_norm": 2.9481241703033447, + "learning_rate": 1.4691423097925117e-06, + "loss": 0.4025, + "step": 13620 + }, + { + "epoch": 1.6575600851840584, + "grad_norm": 1.7668037414550781, + "learning_rate": 1.4681246571306474e-06, + "loss": 0.3455, + "step": 13621 + }, + { + "epoch": 1.6576817766960754, + "grad_norm": 2.107320785522461, + "learning_rate": 1.4671073291242965e-06, + "loss": 0.3867, + "step": 13622 + }, + { + "epoch": 1.6578034682080924, + "grad_norm": 1.5007433891296387, + "learning_rate": 1.4660903258121773e-06, + "loss": 0.3592, + "step": 13623 + }, + { + "epoch": 1.6579251597201097, + "grad_norm": 1.5743193626403809, + "learning_rate": 1.465073647232984e-06, + "loss": 0.3674, + "step": 13624 + }, + { + "epoch": 1.6580468512321267, + "grad_norm": 2.820394992828369, + "learning_rate": 1.4640572934254037e-06, + "loss": 0.4089, + "step": 13625 + }, + { + "epoch": 1.6581685427441437, + "grad_norm": 1.837266206741333, + "learning_rate": 1.4630412644281133e-06, + "loss": 0.3932, + "step": 13626 + }, + { + "epoch": 1.6582902342561607, + "grad_norm": 1.6454945802688599, + "learning_rate": 1.462025560279774e-06, + "loss": 0.33, + "step": 13627 + }, + { + "epoch": 1.6584119257681778, + "grad_norm": 2.0698416233062744, + "learning_rate": 1.4610101810190325e-06, + "loss": 0.398, + "step": 13628 + }, + { + "epoch": 1.6585336172801948, + "grad_norm": 2.274641513824463, + "learning_rate": 1.4599951266845325e-06, + "loss": 0.3592, + "step": 13629 + }, + { + "epoch": 1.6586553087922118, + "grad_norm": 2.0959949493408203, + "learning_rate": 1.458980397314891e-06, + "loss": 0.3721, + "step": 13630 + }, + { + "epoch": 1.6587770003042288, + "grad_norm": 1.6033892631530762, + "learning_rate": 1.4579659929487255e-06, + "loss": 0.3524, + "step": 13631 + }, + { + "epoch": 1.6588986918162458, + "grad_norm": 1.241445779800415, + "learning_rate": 1.4569519136246357e-06, + "loss": 0.3135, + "step": 13632 + }, + { + "epoch": 1.6590203833282628, + "grad_norm": 2.6747539043426514, + "learning_rate": 1.455938159381206e-06, + "loss": 0.4672, + "step": 13633 + }, + { + "epoch": 1.6591420748402799, + "grad_norm": 1.6344259977340698, + "learning_rate": 1.4549247302570169e-06, + "loss": 0.33, + "step": 13634 + }, + { + "epoch": 1.6592637663522969, + "grad_norm": 1.8994158506393433, + "learning_rate": 1.4539116262906295e-06, + "loss": 0.3706, + "step": 13635 + }, + { + "epoch": 1.659385457864314, + "grad_norm": 1.5130999088287354, + "learning_rate": 1.4528988475205918e-06, + "loss": 0.328, + "step": 13636 + }, + { + "epoch": 1.659507149376331, + "grad_norm": 1.4553889036178589, + "learning_rate": 1.4518863939854467e-06, + "loss": 0.3324, + "step": 13637 + }, + { + "epoch": 1.659628840888348, + "grad_norm": 1.8493916988372803, + "learning_rate": 1.4508742657237152e-06, + "loss": 0.3741, + "step": 13638 + }, + { + "epoch": 1.659750532400365, + "grad_norm": 3.459730863571167, + "learning_rate": 1.4498624627739167e-06, + "loss": 0.4211, + "step": 13639 + }, + { + "epoch": 1.659872223912382, + "grad_norm": 1.845629334449768, + "learning_rate": 1.4488509851745491e-06, + "loss": 0.3729, + "step": 13640 + }, + { + "epoch": 1.659993915424399, + "grad_norm": 1.6316800117492676, + "learning_rate": 1.4478398329641007e-06, + "loss": 0.3716, + "step": 13641 + }, + { + "epoch": 1.660115606936416, + "grad_norm": 2.327666759490967, + "learning_rate": 1.4468290061810497e-06, + "loss": 0.3982, + "step": 13642 + }, + { + "epoch": 1.6602372984484333, + "grad_norm": 4.206064224243164, + "learning_rate": 1.4458185048638584e-06, + "loss": 0.3973, + "step": 13643 + }, + { + "epoch": 1.6603589899604503, + "grad_norm": 1.6471965312957764, + "learning_rate": 1.4448083290509774e-06, + "loss": 0.3003, + "step": 13644 + }, + { + "epoch": 1.6604806814724673, + "grad_norm": 2.684356451034546, + "learning_rate": 1.443798478780849e-06, + "loss": 0.3836, + "step": 13645 + }, + { + "epoch": 1.6606023729844843, + "grad_norm": 1.5354562997817993, + "learning_rate": 1.442788954091897e-06, + "loss": 0.3421, + "step": 13646 + }, + { + "epoch": 1.6607240644965013, + "grad_norm": 2.4430270195007324, + "learning_rate": 1.441779755022541e-06, + "loss": 0.3507, + "step": 13647 + }, + { + "epoch": 1.6608457560085184, + "grad_norm": 2.050832748413086, + "learning_rate": 1.4407708816111787e-06, + "loss": 0.3486, + "step": 13648 + }, + { + "epoch": 1.6609674475205356, + "grad_norm": 2.2159924507141113, + "learning_rate": 1.4397623338961996e-06, + "loss": 0.3305, + "step": 13649 + }, + { + "epoch": 1.6610891390325526, + "grad_norm": 1.7031782865524292, + "learning_rate": 1.4387541119159842e-06, + "loss": 0.3476, + "step": 13650 + }, + { + "epoch": 1.6612108305445696, + "grad_norm": 5.4927263259887695, + "learning_rate": 1.437746215708895e-06, + "loss": 0.5212, + "step": 13651 + }, + { + "epoch": 1.6613325220565867, + "grad_norm": 1.991536021232605, + "learning_rate": 1.436738645313286e-06, + "loss": 0.3299, + "step": 13652 + }, + { + "epoch": 1.6614542135686037, + "grad_norm": 1.5333884954452515, + "learning_rate": 1.4357314007674972e-06, + "loss": 0.2913, + "step": 13653 + }, + { + "epoch": 1.6615759050806207, + "grad_norm": 2.365474224090576, + "learning_rate": 1.4347244821098526e-06, + "loss": 0.4077, + "step": 13654 + }, + { + "epoch": 1.6616975965926377, + "grad_norm": 2.1574690341949463, + "learning_rate": 1.433717889378673e-06, + "loss": 0.4334, + "step": 13655 + }, + { + "epoch": 1.6618192881046547, + "grad_norm": 1.1991745233535767, + "learning_rate": 1.4327116226122584e-06, + "loss": 0.3381, + "step": 13656 + }, + { + "epoch": 1.6619409796166718, + "grad_norm": 1.8424403667449951, + "learning_rate": 1.4317056818488983e-06, + "loss": 0.3181, + "step": 13657 + }, + { + "epoch": 1.6620626711286888, + "grad_norm": 2.9418604373931885, + "learning_rate": 1.4307000671268746e-06, + "loss": 0.3835, + "step": 13658 + }, + { + "epoch": 1.6621843626407058, + "grad_norm": 1.6366314888000488, + "learning_rate": 1.4296947784844505e-06, + "loss": 0.3818, + "step": 13659 + }, + { + "epoch": 1.6623060541527228, + "grad_norm": 2.9563655853271484, + "learning_rate": 1.4286898159598772e-06, + "loss": 0.4524, + "step": 13660 + }, + { + "epoch": 1.6624277456647398, + "grad_norm": 2.4889585971832275, + "learning_rate": 1.4276851795914003e-06, + "loss": 0.3964, + "step": 13661 + }, + { + "epoch": 1.6625494371767569, + "grad_norm": 2.7162702083587646, + "learning_rate": 1.4266808694172463e-06, + "loss": 0.2923, + "step": 13662 + }, + { + "epoch": 1.6626711286887739, + "grad_norm": 1.4771445989608765, + "learning_rate": 1.4256768854756286e-06, + "loss": 0.3484, + "step": 13663 + }, + { + "epoch": 1.6627928202007909, + "grad_norm": 1.7932910919189453, + "learning_rate": 1.4246732278047582e-06, + "loss": 0.3438, + "step": 13664 + }, + { + "epoch": 1.662914511712808, + "grad_norm": 1.894774079322815, + "learning_rate": 1.4236698964428164e-06, + "loss": 0.3853, + "step": 13665 + }, + { + "epoch": 1.663036203224825, + "grad_norm": 1.359831690788269, + "learning_rate": 1.4226668914279907e-06, + "loss": 0.349, + "step": 13666 + }, + { + "epoch": 1.663157894736842, + "grad_norm": 1.9268769025802612, + "learning_rate": 1.4216642127984426e-06, + "loss": 0.3697, + "step": 13667 + }, + { + "epoch": 1.6632795862488592, + "grad_norm": 2.676494598388672, + "learning_rate": 1.420661860592325e-06, + "loss": 0.4374, + "step": 13668 + }, + { + "epoch": 1.6634012777608762, + "grad_norm": 1.6084446907043457, + "learning_rate": 1.4196598348477842e-06, + "loss": 0.3356, + "step": 13669 + }, + { + "epoch": 1.6635229692728932, + "grad_norm": 1.7682455778121948, + "learning_rate": 1.4186581356029472e-06, + "loss": 0.3538, + "step": 13670 + }, + { + "epoch": 1.6636446607849102, + "grad_norm": 1.6002421379089355, + "learning_rate": 1.4176567628959282e-06, + "loss": 0.2985, + "step": 13671 + }, + { + "epoch": 1.6637663522969273, + "grad_norm": 3.169933795928955, + "learning_rate": 1.4166557167648365e-06, + "loss": 0.4102, + "step": 13672 + }, + { + "epoch": 1.6638880438089443, + "grad_norm": 3.072920322418213, + "learning_rate": 1.4156549972477618e-06, + "loss": 0.3968, + "step": 13673 + }, + { + "epoch": 1.6640097353209615, + "grad_norm": 1.5023761987686157, + "learning_rate": 1.4146546043827791e-06, + "loss": 0.3693, + "step": 13674 + }, + { + "epoch": 1.6641314268329785, + "grad_norm": 1.723341703414917, + "learning_rate": 1.4136545382079658e-06, + "loss": 0.348, + "step": 13675 + }, + { + "epoch": 1.6642531183449956, + "grad_norm": 2.0811023712158203, + "learning_rate": 1.4126547987613649e-06, + "loss": 0.3323, + "step": 13676 + }, + { + "epoch": 1.6643748098570126, + "grad_norm": 2.1050126552581787, + "learning_rate": 1.4116553860810256e-06, + "loss": 0.351, + "step": 13677 + }, + { + "epoch": 1.6644965013690296, + "grad_norm": 1.8099220991134644, + "learning_rate": 1.4106563002049756e-06, + "loss": 0.3854, + "step": 13678 + }, + { + "epoch": 1.6646181928810466, + "grad_norm": 2.0233612060546875, + "learning_rate": 1.4096575411712299e-06, + "loss": 0.2744, + "step": 13679 + }, + { + "epoch": 1.6647398843930636, + "grad_norm": 2.3380508422851562, + "learning_rate": 1.4086591090177993e-06, + "loss": 0.3445, + "step": 13680 + }, + { + "epoch": 1.6648615759050807, + "grad_norm": 1.6650595664978027, + "learning_rate": 1.4076610037826722e-06, + "loss": 0.3802, + "step": 13681 + }, + { + "epoch": 1.6649832674170977, + "grad_norm": 2.3621175289154053, + "learning_rate": 1.406663225503826e-06, + "loss": 0.4072, + "step": 13682 + }, + { + "epoch": 1.6651049589291147, + "grad_norm": 1.7309842109680176, + "learning_rate": 1.4056657742192336e-06, + "loss": 0.3742, + "step": 13683 + }, + { + "epoch": 1.6652266504411317, + "grad_norm": 2.152414321899414, + "learning_rate": 1.4046686499668493e-06, + "loss": 0.3451, + "step": 13684 + }, + { + "epoch": 1.6653483419531487, + "grad_norm": 1.6705527305603027, + "learning_rate": 1.4036718527846104e-06, + "loss": 0.3528, + "step": 13685 + }, + { + "epoch": 1.6654700334651658, + "grad_norm": 2.721670150756836, + "learning_rate": 1.4026753827104533e-06, + "loss": 0.3112, + "step": 13686 + }, + { + "epoch": 1.6655917249771828, + "grad_norm": 2.670419216156006, + "learning_rate": 1.4016792397822943e-06, + "loss": 0.3597, + "step": 13687 + }, + { + "epoch": 1.6657134164891998, + "grad_norm": 1.557462453842163, + "learning_rate": 1.4006834240380373e-06, + "loss": 0.327, + "step": 13688 + }, + { + "epoch": 1.6658351080012168, + "grad_norm": 2.4295411109924316, + "learning_rate": 1.3996879355155757e-06, + "loss": 0.3235, + "step": 13689 + }, + { + "epoch": 1.6659567995132338, + "grad_norm": 1.8355549573898315, + "learning_rate": 1.3986927742527878e-06, + "loss": 0.3706, + "step": 13690 + }, + { + "epoch": 1.6660784910252509, + "grad_norm": 2.6848819255828857, + "learning_rate": 1.3976979402875458e-06, + "loss": 0.4611, + "step": 13691 + }, + { + "epoch": 1.6662001825372679, + "grad_norm": 1.8984572887420654, + "learning_rate": 1.3967034336577024e-06, + "loss": 0.3423, + "step": 13692 + }, + { + "epoch": 1.6663218740492851, + "grad_norm": 1.914689302444458, + "learning_rate": 1.3957092544010986e-06, + "loss": 0.4128, + "step": 13693 + }, + { + "epoch": 1.6664435655613021, + "grad_norm": 1.6922495365142822, + "learning_rate": 1.394715402555571e-06, + "loss": 0.3139, + "step": 13694 + }, + { + "epoch": 1.6665652570733192, + "grad_norm": 2.5484304428100586, + "learning_rate": 1.3937218781589335e-06, + "loss": 0.4163, + "step": 13695 + }, + { + "epoch": 1.6666869485853362, + "grad_norm": 2.0473713874816895, + "learning_rate": 1.3927286812489904e-06, + "loss": 0.3593, + "step": 13696 + }, + { + "epoch": 1.6668086400973532, + "grad_norm": 2.2794864177703857, + "learning_rate": 1.3917358118635394e-06, + "loss": 0.2856, + "step": 13697 + }, + { + "epoch": 1.6669303316093702, + "grad_norm": 1.7777392864227295, + "learning_rate": 1.390743270040359e-06, + "loss": 0.3394, + "step": 13698 + }, + { + "epoch": 1.6670520231213874, + "grad_norm": 2.142390251159668, + "learning_rate": 1.3897510558172178e-06, + "loss": 0.4225, + "step": 13699 + }, + { + "epoch": 1.6671737146334045, + "grad_norm": 3.37737774848938, + "learning_rate": 1.3887591692318704e-06, + "loss": 0.3955, + "step": 13700 + }, + { + "epoch": 1.6672954061454215, + "grad_norm": 1.6774508953094482, + "learning_rate": 1.3877676103220594e-06, + "loss": 0.3749, + "step": 13701 + }, + { + "epoch": 1.6674170976574385, + "grad_norm": 1.7301967144012451, + "learning_rate": 1.3867763791255184e-06, + "loss": 0.4008, + "step": 13702 + }, + { + "epoch": 1.6675387891694555, + "grad_norm": 2.0341153144836426, + "learning_rate": 1.3857854756799649e-06, + "loss": 0.3601, + "step": 13703 + }, + { + "epoch": 1.6676604806814725, + "grad_norm": 1.6317696571350098, + "learning_rate": 1.384794900023103e-06, + "loss": 0.3255, + "step": 13704 + }, + { + "epoch": 1.6677821721934896, + "grad_norm": 2.622546911239624, + "learning_rate": 1.3838046521926286e-06, + "loss": 0.4034, + "step": 13705 + }, + { + "epoch": 1.6679038637055066, + "grad_norm": 2.03069806098938, + "learning_rate": 1.3828147322262209e-06, + "loss": 0.4118, + "step": 13706 + }, + { + "epoch": 1.6680255552175236, + "grad_norm": 1.7591652870178223, + "learning_rate": 1.3818251401615501e-06, + "loss": 0.3499, + "step": 13707 + }, + { + "epoch": 1.6681472467295406, + "grad_norm": 2.0143213272094727, + "learning_rate": 1.3808358760362728e-06, + "loss": 0.3228, + "step": 13708 + }, + { + "epoch": 1.6682689382415576, + "grad_norm": 1.5251108407974243, + "learning_rate": 1.3798469398880277e-06, + "loss": 0.3599, + "step": 13709 + }, + { + "epoch": 1.6683906297535747, + "grad_norm": 1.732371211051941, + "learning_rate": 1.3788583317544546e-06, + "loss": 0.418, + "step": 13710 + }, + { + "epoch": 1.6685123212655917, + "grad_norm": 3.6833293437957764, + "learning_rate": 1.3778700516731636e-06, + "loss": 0.4541, + "step": 13711 + }, + { + "epoch": 1.6686340127776087, + "grad_norm": 1.495136022567749, + "learning_rate": 1.3768820996817623e-06, + "loss": 0.3087, + "step": 13712 + }, + { + "epoch": 1.6687557042896257, + "grad_norm": 1.7879563570022583, + "learning_rate": 1.3758944758178482e-06, + "loss": 0.3407, + "step": 13713 + }, + { + "epoch": 1.6688773958016427, + "grad_norm": 2.007225275039673, + "learning_rate": 1.3749071801189983e-06, + "loss": 0.3569, + "step": 13714 + }, + { + "epoch": 1.6689990873136598, + "grad_norm": 1.7144052982330322, + "learning_rate": 1.3739202126227847e-06, + "loss": 0.3957, + "step": 13715 + }, + { + "epoch": 1.6691207788256768, + "grad_norm": 2.2162551879882812, + "learning_rate": 1.3729335733667627e-06, + "loss": 0.4748, + "step": 13716 + }, + { + "epoch": 1.6692424703376938, + "grad_norm": 2.049804210662842, + "learning_rate": 1.3719472623884723e-06, + "loss": 0.342, + "step": 13717 + }, + { + "epoch": 1.669364161849711, + "grad_norm": 2.3981854915618896, + "learning_rate": 1.3709612797254502e-06, + "loss": 0.4169, + "step": 13718 + }, + { + "epoch": 1.669485853361728, + "grad_norm": 1.3542389869689941, + "learning_rate": 1.369975625415213e-06, + "loss": 0.3145, + "step": 13719 + }, + { + "epoch": 1.669607544873745, + "grad_norm": 2.2779381275177, + "learning_rate": 1.3689902994952631e-06, + "loss": 0.3296, + "step": 13720 + }, + { + "epoch": 1.669729236385762, + "grad_norm": 2.5263664722442627, + "learning_rate": 1.3680053020031004e-06, + "loss": 0.2926, + "step": 13721 + }, + { + "epoch": 1.6698509278977791, + "grad_norm": 1.904555082321167, + "learning_rate": 1.3670206329762037e-06, + "loss": 0.3938, + "step": 13722 + }, + { + "epoch": 1.6699726194097961, + "grad_norm": 2.192382574081421, + "learning_rate": 1.366036292452041e-06, + "loss": 0.3653, + "step": 13723 + }, + { + "epoch": 1.6700943109218132, + "grad_norm": 2.2709121704101562, + "learning_rate": 1.3650522804680676e-06, + "loss": 0.3385, + "step": 13724 + }, + { + "epoch": 1.6702160024338304, + "grad_norm": 1.7489142417907715, + "learning_rate": 1.3640685970617274e-06, + "loss": 0.3904, + "step": 13725 + }, + { + "epoch": 1.6703376939458474, + "grad_norm": 1.9560399055480957, + "learning_rate": 1.3630852422704544e-06, + "loss": 0.4114, + "step": 13726 + }, + { + "epoch": 1.6704593854578644, + "grad_norm": 2.172668218612671, + "learning_rate": 1.3621022161316654e-06, + "loss": 0.4002, + "step": 13727 + }, + { + "epoch": 1.6705810769698815, + "grad_norm": 1.9341093301773071, + "learning_rate": 1.361119518682764e-06, + "loss": 0.3356, + "step": 13728 + }, + { + "epoch": 1.6707027684818985, + "grad_norm": 2.005704164505005, + "learning_rate": 1.3601371499611483e-06, + "loss": 0.3891, + "step": 13729 + }, + { + "epoch": 1.6708244599939155, + "grad_norm": 2.575528860092163, + "learning_rate": 1.3591551100041977e-06, + "loss": 0.3683, + "step": 13730 + }, + { + "epoch": 1.6709461515059325, + "grad_norm": 1.7623628377914429, + "learning_rate": 1.3581733988492795e-06, + "loss": 0.3643, + "step": 13731 + }, + { + "epoch": 1.6710678430179495, + "grad_norm": 1.4995167255401611, + "learning_rate": 1.3571920165337527e-06, + "loss": 0.3649, + "step": 13732 + }, + { + "epoch": 1.6711895345299665, + "grad_norm": 1.898901104927063, + "learning_rate": 1.3562109630949593e-06, + "loss": 0.3681, + "step": 13733 + }, + { + "epoch": 1.6713112260419836, + "grad_norm": 3.507563591003418, + "learning_rate": 1.3552302385702297e-06, + "loss": 0.2993, + "step": 13734 + }, + { + "epoch": 1.6714329175540006, + "grad_norm": 2.015133857727051, + "learning_rate": 1.3542498429968841e-06, + "loss": 0.3996, + "step": 13735 + }, + { + "epoch": 1.6715546090660176, + "grad_norm": 1.8106026649475098, + "learning_rate": 1.353269776412225e-06, + "loss": 0.3882, + "step": 13736 + }, + { + "epoch": 1.6716763005780346, + "grad_norm": 1.6142646074295044, + "learning_rate": 1.3522900388535509e-06, + "loss": 0.4044, + "step": 13737 + }, + { + "epoch": 1.6717979920900516, + "grad_norm": 2.2084908485412598, + "learning_rate": 1.3513106303581413e-06, + "loss": 0.3058, + "step": 13738 + }, + { + "epoch": 1.6719196836020687, + "grad_norm": 2.259007692337036, + "learning_rate": 1.3503315509632608e-06, + "loss": 0.4203, + "step": 13739 + }, + { + "epoch": 1.6720413751140857, + "grad_norm": 1.9059878587722778, + "learning_rate": 1.3493528007061718e-06, + "loss": 0.3516, + "step": 13740 + }, + { + "epoch": 1.6721630666261027, + "grad_norm": 2.7167346477508545, + "learning_rate": 1.3483743796241144e-06, + "loss": 0.3222, + "step": 13741 + }, + { + "epoch": 1.6722847581381197, + "grad_norm": 2.21913743019104, + "learning_rate": 1.3473962877543167e-06, + "loss": 0.4206, + "step": 13742 + }, + { + "epoch": 1.6724064496501367, + "grad_norm": 1.735192894935608, + "learning_rate": 1.3464185251340035e-06, + "loss": 0.4098, + "step": 13743 + }, + { + "epoch": 1.672528141162154, + "grad_norm": 1.9332529306411743, + "learning_rate": 1.3454410918003758e-06, + "loss": 0.4072, + "step": 13744 + }, + { + "epoch": 1.672649832674171, + "grad_norm": 1.307860255241394, + "learning_rate": 1.3444639877906296e-06, + "loss": 0.3356, + "step": 13745 + }, + { + "epoch": 1.672771524186188, + "grad_norm": 3.0803723335266113, + "learning_rate": 1.3434872131419452e-06, + "loss": 0.3316, + "step": 13746 + }, + { + "epoch": 1.672893215698205, + "grad_norm": 2.2157514095306396, + "learning_rate": 1.3425107678914872e-06, + "loss": 0.3123, + "step": 13747 + }, + { + "epoch": 1.673014907210222, + "grad_norm": 1.4930944442749023, + "learning_rate": 1.341534652076417e-06, + "loss": 0.3511, + "step": 13748 + }, + { + "epoch": 1.673136598722239, + "grad_norm": 2.276279926300049, + "learning_rate": 1.3405588657338754e-06, + "loss": 0.3332, + "step": 13749 + }, + { + "epoch": 1.6732582902342563, + "grad_norm": 3.006722927093506, + "learning_rate": 1.3395834089009906e-06, + "loss": 0.3942, + "step": 13750 + }, + { + "epoch": 1.6733799817462733, + "grad_norm": 1.903573989868164, + "learning_rate": 1.3386082816148848e-06, + "loss": 0.4068, + "step": 13751 + }, + { + "epoch": 1.6735016732582904, + "grad_norm": 2.9941470623016357, + "learning_rate": 1.3376334839126625e-06, + "loss": 0.2982, + "step": 13752 + }, + { + "epoch": 1.6736233647703074, + "grad_norm": 1.5512161254882812, + "learning_rate": 1.336659015831414e-06, + "loss": 0.3322, + "step": 13753 + }, + { + "epoch": 1.6737450562823244, + "grad_norm": 3.2248644828796387, + "learning_rate": 1.3356848774082255e-06, + "loss": 0.4243, + "step": 13754 + }, + { + "epoch": 1.6738667477943414, + "grad_norm": 1.7863333225250244, + "learning_rate": 1.33471106868016e-06, + "loss": 0.3175, + "step": 13755 + }, + { + "epoch": 1.6739884393063584, + "grad_norm": 1.6618281602859497, + "learning_rate": 1.3337375896842753e-06, + "loss": 0.3883, + "step": 13756 + }, + { + "epoch": 1.6741101308183755, + "grad_norm": 1.7667138576507568, + "learning_rate": 1.3327644404576135e-06, + "loss": 0.368, + "step": 13757 + }, + { + "epoch": 1.6742318223303925, + "grad_norm": 1.4757273197174072, + "learning_rate": 1.3317916210372018e-06, + "loss": 0.3654, + "step": 13758 + }, + { + "epoch": 1.6743535138424095, + "grad_norm": 1.6524502038955688, + "learning_rate": 1.330819131460064e-06, + "loss": 0.3866, + "step": 13759 + }, + { + "epoch": 1.6744752053544265, + "grad_norm": 2.698432207107544, + "learning_rate": 1.3298469717632023e-06, + "loss": 0.317, + "step": 13760 + }, + { + "epoch": 1.6745968968664435, + "grad_norm": 1.7345067262649536, + "learning_rate": 1.3288751419836077e-06, + "loss": 0.3255, + "step": 13761 + }, + { + "epoch": 1.6747185883784605, + "grad_norm": 1.4904327392578125, + "learning_rate": 1.3279036421582648e-06, + "loss": 0.3853, + "step": 13762 + }, + { + "epoch": 1.6748402798904776, + "grad_norm": 1.903839349746704, + "learning_rate": 1.3269324723241373e-06, + "loss": 0.3981, + "step": 13763 + }, + { + "epoch": 1.6749619714024946, + "grad_norm": 1.7542340755462646, + "learning_rate": 1.32596163251818e-06, + "loss": 0.3119, + "step": 13764 + }, + { + "epoch": 1.6750836629145116, + "grad_norm": 1.9150886535644531, + "learning_rate": 1.3249911227773393e-06, + "loss": 0.3797, + "step": 13765 + }, + { + "epoch": 1.6752053544265286, + "grad_norm": 1.7638375759124756, + "learning_rate": 1.3240209431385399e-06, + "loss": 0.3333, + "step": 13766 + }, + { + "epoch": 1.6753270459385456, + "grad_norm": 2.2814157009124756, + "learning_rate": 1.3230510936387042e-06, + "loss": 0.4052, + "step": 13767 + }, + { + "epoch": 1.6754487374505627, + "grad_norm": 1.629783034324646, + "learning_rate": 1.3220815743147374e-06, + "loss": 0.335, + "step": 13768 + }, + { + "epoch": 1.67557042896258, + "grad_norm": 2.9230308532714844, + "learning_rate": 1.3211123852035245e-06, + "loss": 0.39, + "step": 13769 + }, + { + "epoch": 1.675692120474597, + "grad_norm": 3.9670817852020264, + "learning_rate": 1.3201435263419514e-06, + "loss": 0.4153, + "step": 13770 + }, + { + "epoch": 1.675813811986614, + "grad_norm": 2.5139501094818115, + "learning_rate": 1.3191749977668833e-06, + "loss": 0.4397, + "step": 13771 + }, + { + "epoch": 1.675935503498631, + "grad_norm": 1.3515489101409912, + "learning_rate": 1.3182067995151736e-06, + "loss": 0.3472, + "step": 13772 + }, + { + "epoch": 1.676057195010648, + "grad_norm": 2.188394069671631, + "learning_rate": 1.3172389316236667e-06, + "loss": 0.3905, + "step": 13773 + }, + { + "epoch": 1.676178886522665, + "grad_norm": 2.337156057357788, + "learning_rate": 1.3162713941291895e-06, + "loss": 0.3573, + "step": 13774 + }, + { + "epoch": 1.6763005780346822, + "grad_norm": 3.3132240772247314, + "learning_rate": 1.3153041870685611e-06, + "loss": 0.4008, + "step": 13775 + }, + { + "epoch": 1.6764222695466993, + "grad_norm": 1.6074503660202026, + "learning_rate": 1.3143373104785862e-06, + "loss": 0.3729, + "step": 13776 + }, + { + "epoch": 1.6765439610587163, + "grad_norm": 2.6904752254486084, + "learning_rate": 1.3133707643960515e-06, + "loss": 0.3097, + "step": 13777 + }, + { + "epoch": 1.6766656525707333, + "grad_norm": 2.233013868331909, + "learning_rate": 1.3124045488577419e-06, + "loss": 0.3278, + "step": 13778 + }, + { + "epoch": 1.6767873440827503, + "grad_norm": 2.1369080543518066, + "learning_rate": 1.311438663900424e-06, + "loss": 0.3418, + "step": 13779 + }, + { + "epoch": 1.6769090355947673, + "grad_norm": 2.949039936065674, + "learning_rate": 1.3104731095608448e-06, + "loss": 0.4202, + "step": 13780 + }, + { + "epoch": 1.6770307271067844, + "grad_norm": 1.3142260313034058, + "learning_rate": 1.3095078858757527e-06, + "loss": 0.3365, + "step": 13781 + }, + { + "epoch": 1.6771524186188014, + "grad_norm": 1.9385156631469727, + "learning_rate": 1.3085429928818716e-06, + "loss": 0.3743, + "step": 13782 + }, + { + "epoch": 1.6772741101308184, + "grad_norm": 1.6352856159210205, + "learning_rate": 1.3075784306159222e-06, + "loss": 0.288, + "step": 13783 + }, + { + "epoch": 1.6773958016428354, + "grad_norm": 1.819472312927246, + "learning_rate": 1.3066141991146053e-06, + "loss": 0.4, + "step": 13784 + }, + { + "epoch": 1.6775174931548524, + "grad_norm": 4.280248641967773, + "learning_rate": 1.3056502984146103e-06, + "loss": 0.4649, + "step": 13785 + }, + { + "epoch": 1.6776391846668695, + "grad_norm": 1.7480182647705078, + "learning_rate": 1.304686728552621e-06, + "loss": 0.3945, + "step": 13786 + }, + { + "epoch": 1.6777608761788865, + "grad_norm": 2.491863250732422, + "learning_rate": 1.3037234895652984e-06, + "loss": 0.3953, + "step": 13787 + }, + { + "epoch": 1.6778825676909035, + "grad_norm": 1.9823706150054932, + "learning_rate": 1.3027605814892963e-06, + "loss": 0.3679, + "step": 13788 + }, + { + "epoch": 1.6780042592029205, + "grad_norm": 2.13973331451416, + "learning_rate": 1.3017980043612578e-06, + "loss": 0.396, + "step": 13789 + }, + { + "epoch": 1.6781259507149375, + "grad_norm": 1.9010581970214844, + "learning_rate": 1.3008357582178099e-06, + "loss": 0.3497, + "step": 13790 + }, + { + "epoch": 1.6782476422269546, + "grad_norm": 1.6171354055404663, + "learning_rate": 1.299873843095568e-06, + "loss": 0.3662, + "step": 13791 + }, + { + "epoch": 1.6783693337389716, + "grad_norm": 1.9463456869125366, + "learning_rate": 1.2989122590311342e-06, + "loss": 0.4269, + "step": 13792 + }, + { + "epoch": 1.6784910252509886, + "grad_norm": 2.181964874267578, + "learning_rate": 1.2979510060610978e-06, + "loss": 0.4205, + "step": 13793 + }, + { + "epoch": 1.6786127167630058, + "grad_norm": 1.7647093534469604, + "learning_rate": 1.2969900842220396e-06, + "loss": 0.3909, + "step": 13794 + }, + { + "epoch": 1.6787344082750228, + "grad_norm": 3.173133134841919, + "learning_rate": 1.2960294935505224e-06, + "loss": 0.4152, + "step": 13795 + }, + { + "epoch": 1.6788560997870399, + "grad_norm": 1.5895024538040161, + "learning_rate": 1.2950692340830972e-06, + "loss": 0.3698, + "step": 13796 + }, + { + "epoch": 1.6789777912990569, + "grad_norm": 1.9458640813827515, + "learning_rate": 1.2941093058563082e-06, + "loss": 0.3907, + "step": 13797 + }, + { + "epoch": 1.679099482811074, + "grad_norm": 1.5639913082122803, + "learning_rate": 1.2931497089066802e-06, + "loss": 0.3709, + "step": 13798 + }, + { + "epoch": 1.679221174323091, + "grad_norm": 2.37680721282959, + "learning_rate": 1.2921904432707255e-06, + "loss": 0.4122, + "step": 13799 + }, + { + "epoch": 1.6793428658351082, + "grad_norm": 1.9247478246688843, + "learning_rate": 1.2912315089849515e-06, + "loss": 0.3492, + "step": 13800 + }, + { + "epoch": 1.6794645573471252, + "grad_norm": 1.4941282272338867, + "learning_rate": 1.2902729060858444e-06, + "loss": 0.3421, + "step": 13801 + }, + { + "epoch": 1.6795862488591422, + "grad_norm": 2.026052951812744, + "learning_rate": 1.2893146346098817e-06, + "loss": 0.3603, + "step": 13802 + }, + { + "epoch": 1.6797079403711592, + "grad_norm": 2.213083028793335, + "learning_rate": 1.2883566945935266e-06, + "loss": 0.3487, + "step": 13803 + }, + { + "epoch": 1.6798296318831762, + "grad_norm": 2.2838826179504395, + "learning_rate": 1.28739908607323e-06, + "loss": 0.3832, + "step": 13804 + }, + { + "epoch": 1.6799513233951933, + "grad_norm": 1.776839017868042, + "learning_rate": 1.2864418090854337e-06, + "loss": 0.4198, + "step": 13805 + }, + { + "epoch": 1.6800730149072103, + "grad_norm": 1.4478399753570557, + "learning_rate": 1.285484863666563e-06, + "loss": 0.3101, + "step": 13806 + }, + { + "epoch": 1.6801947064192273, + "grad_norm": 1.535431981086731, + "learning_rate": 1.2845282498530298e-06, + "loss": 0.3448, + "step": 13807 + }, + { + "epoch": 1.6803163979312443, + "grad_norm": 2.038459300994873, + "learning_rate": 1.2835719676812374e-06, + "loss": 0.357, + "step": 13808 + }, + { + "epoch": 1.6804380894432613, + "grad_norm": 3.2336442470550537, + "learning_rate": 1.2826160171875746e-06, + "loss": 0.35, + "step": 13809 + }, + { + "epoch": 1.6805597809552784, + "grad_norm": 1.693761944770813, + "learning_rate": 1.2816603984084142e-06, + "loss": 0.3633, + "step": 13810 + }, + { + "epoch": 1.6806814724672954, + "grad_norm": 2.309576988220215, + "learning_rate": 1.280705111380124e-06, + "loss": 0.4252, + "step": 13811 + }, + { + "epoch": 1.6808031639793124, + "grad_norm": 2.395458221435547, + "learning_rate": 1.2797501561390513e-06, + "loss": 0.4361, + "step": 13812 + }, + { + "epoch": 1.6809248554913294, + "grad_norm": 1.6999075412750244, + "learning_rate": 1.278795532721534e-06, + "loss": 0.3988, + "step": 13813 + }, + { + "epoch": 1.6810465470033464, + "grad_norm": 1.5286123752593994, + "learning_rate": 1.2778412411639018e-06, + "loss": 0.3844, + "step": 13814 + }, + { + "epoch": 1.6811682385153635, + "grad_norm": 2.7576487064361572, + "learning_rate": 1.2768872815024614e-06, + "loss": 0.3417, + "step": 13815 + }, + { + "epoch": 1.6812899300273805, + "grad_norm": 1.5065091848373413, + "learning_rate": 1.2759336537735168e-06, + "loss": 0.334, + "step": 13816 + }, + { + "epoch": 1.6814116215393975, + "grad_norm": 1.6406195163726807, + "learning_rate": 1.2749803580133547e-06, + "loss": 0.3307, + "step": 13817 + }, + { + "epoch": 1.6815333130514145, + "grad_norm": 1.4684678316116333, + "learning_rate": 1.2740273942582481e-06, + "loss": 0.3893, + "step": 13818 + }, + { + "epoch": 1.6816550045634318, + "grad_norm": 1.5859050750732422, + "learning_rate": 1.273074762544463e-06, + "loss": 0.3381, + "step": 13819 + }, + { + "epoch": 1.6817766960754488, + "grad_norm": 1.3808832168579102, + "learning_rate": 1.2721224629082463e-06, + "loss": 0.3572, + "step": 13820 + }, + { + "epoch": 1.6818983875874658, + "grad_norm": 2.258392095565796, + "learning_rate": 1.271170495385834e-06, + "loss": 0.3928, + "step": 13821 + }, + { + "epoch": 1.6820200790994828, + "grad_norm": 1.7828283309936523, + "learning_rate": 1.2702188600134536e-06, + "loss": 0.4007, + "step": 13822 + }, + { + "epoch": 1.6821417706114998, + "grad_norm": 2.0993125438690186, + "learning_rate": 1.269267556827315e-06, + "loss": 0.3462, + "step": 13823 + }, + { + "epoch": 1.6822634621235169, + "grad_norm": 1.4628483057022095, + "learning_rate": 1.2683165858636159e-06, + "loss": 0.3699, + "step": 13824 + }, + { + "epoch": 1.6823851536355339, + "grad_norm": 2.096174716949463, + "learning_rate": 1.2673659471585486e-06, + "loss": 0.426, + "step": 13825 + }, + { + "epoch": 1.6825068451475511, + "grad_norm": 2.4189236164093018, + "learning_rate": 1.2664156407482787e-06, + "loss": 0.3812, + "step": 13826 + }, + { + "epoch": 1.6826285366595681, + "grad_norm": 1.8713542222976685, + "learning_rate": 1.2654656666689724e-06, + "loss": 0.4129, + "step": 13827 + }, + { + "epoch": 1.6827502281715851, + "grad_norm": 2.3637099266052246, + "learning_rate": 1.2645160249567777e-06, + "loss": 0.3983, + "step": 13828 + }, + { + "epoch": 1.6828719196836022, + "grad_norm": 1.902443766593933, + "learning_rate": 1.263566715647827e-06, + "loss": 0.3261, + "step": 13829 + }, + { + "epoch": 1.6829936111956192, + "grad_norm": 2.1366183757781982, + "learning_rate": 1.262617738778249e-06, + "loss": 0.3142, + "step": 13830 + }, + { + "epoch": 1.6831153027076362, + "grad_norm": 2.6139023303985596, + "learning_rate": 1.2616690943841515e-06, + "loss": 0.3623, + "step": 13831 + }, + { + "epoch": 1.6832369942196532, + "grad_norm": 1.7788294553756714, + "learning_rate": 1.26072078250163e-06, + "loss": 0.2905, + "step": 13832 + }, + { + "epoch": 1.6833586857316702, + "grad_norm": 1.4570634365081787, + "learning_rate": 1.2597728031667733e-06, + "loss": 0.3651, + "step": 13833 + }, + { + "epoch": 1.6834803772436873, + "grad_norm": 1.4675289392471313, + "learning_rate": 1.2588251564156517e-06, + "loss": 0.3437, + "step": 13834 + }, + { + "epoch": 1.6836020687557043, + "grad_norm": 2.492671489715576, + "learning_rate": 1.2578778422843275e-06, + "loss": 0.4255, + "step": 13835 + }, + { + "epoch": 1.6837237602677213, + "grad_norm": 1.3692865371704102, + "learning_rate": 1.2569308608088471e-06, + "loss": 0.341, + "step": 13836 + }, + { + "epoch": 1.6838454517797383, + "grad_norm": 1.5555249452590942, + "learning_rate": 1.2559842120252442e-06, + "loss": 0.3959, + "step": 13837 + }, + { + "epoch": 1.6839671432917553, + "grad_norm": 1.4702510833740234, + "learning_rate": 1.255037895969542e-06, + "loss": 0.3353, + "step": 13838 + }, + { + "epoch": 1.6840888348037724, + "grad_norm": 2.1564106941223145, + "learning_rate": 1.2540919126777484e-06, + "loss": 0.3636, + "step": 13839 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 2.056885242462158, + "learning_rate": 1.253146262185858e-06, + "loss": 0.3657, + "step": 13840 + }, + { + "epoch": 1.6843322178278064, + "grad_norm": 2.310302495956421, + "learning_rate": 1.2522009445298599e-06, + "loss": 0.3451, + "step": 13841 + }, + { + "epoch": 1.6844539093398234, + "grad_norm": 2.25286865234375, + "learning_rate": 1.2512559597457208e-06, + "loss": 0.3619, + "step": 13842 + }, + { + "epoch": 1.6845756008518404, + "grad_norm": 1.9163131713867188, + "learning_rate": 1.2503113078694028e-06, + "loss": 0.4133, + "step": 13843 + }, + { + "epoch": 1.6846972923638575, + "grad_norm": 2.0300254821777344, + "learning_rate": 1.249366988936851e-06, + "loss": 0.4232, + "step": 13844 + }, + { + "epoch": 1.6848189838758747, + "grad_norm": 2.266204833984375, + "learning_rate": 1.248423002983996e-06, + "loss": 0.4228, + "step": 13845 + }, + { + "epoch": 1.6849406753878917, + "grad_norm": 2.1649701595306396, + "learning_rate": 1.247479350046762e-06, + "loss": 0.3464, + "step": 13846 + }, + { + "epoch": 1.6850623668999087, + "grad_norm": 2.1098644733428955, + "learning_rate": 1.2465360301610562e-06, + "loss": 0.3811, + "step": 13847 + }, + { + "epoch": 1.6851840584119258, + "grad_norm": 2.8243815898895264, + "learning_rate": 1.2455930433627728e-06, + "loss": 0.3138, + "step": 13848 + }, + { + "epoch": 1.6853057499239428, + "grad_norm": 1.9501984119415283, + "learning_rate": 1.2446503896877948e-06, + "loss": 0.3886, + "step": 13849 + }, + { + "epoch": 1.6854274414359598, + "grad_norm": 1.5686861276626587, + "learning_rate": 1.2437080691719894e-06, + "loss": 0.3512, + "step": 13850 + }, + { + "epoch": 1.685549132947977, + "grad_norm": 1.653573751449585, + "learning_rate": 1.24276608185122e-06, + "loss": 0.3179, + "step": 13851 + }, + { + "epoch": 1.685670824459994, + "grad_norm": 1.947106122970581, + "learning_rate": 1.2418244277613267e-06, + "loss": 0.3815, + "step": 13852 + }, + { + "epoch": 1.685792515972011, + "grad_norm": 2.215709924697876, + "learning_rate": 1.24088310693814e-06, + "loss": 0.4164, + "step": 13853 + }, + { + "epoch": 1.685914207484028, + "grad_norm": 1.3895610570907593, + "learning_rate": 1.2399421194174833e-06, + "loss": 0.3508, + "step": 13854 + }, + { + "epoch": 1.6860358989960451, + "grad_norm": 1.9541537761688232, + "learning_rate": 1.2390014652351622e-06, + "loss": 0.37, + "step": 13855 + }, + { + "epoch": 1.6861575905080621, + "grad_norm": 2.2421467304229736, + "learning_rate": 1.2380611444269674e-06, + "loss": 0.3755, + "step": 13856 + }, + { + "epoch": 1.6862792820200792, + "grad_norm": 1.8879858255386353, + "learning_rate": 1.2371211570286835e-06, + "loss": 0.3922, + "step": 13857 + }, + { + "epoch": 1.6864009735320962, + "grad_norm": 1.4031667709350586, + "learning_rate": 1.2361815030760793e-06, + "loss": 0.3025, + "step": 13858 + }, + { + "epoch": 1.6865226650441132, + "grad_norm": 1.7046468257904053, + "learning_rate": 1.2352421826049054e-06, + "loss": 0.3903, + "step": 13859 + }, + { + "epoch": 1.6866443565561302, + "grad_norm": 2.5772104263305664, + "learning_rate": 1.2343031956509133e-06, + "loss": 0.2999, + "step": 13860 + }, + { + "epoch": 1.6867660480681472, + "grad_norm": 2.414714813232422, + "learning_rate": 1.2333645422498241e-06, + "loss": 0.3657, + "step": 13861 + }, + { + "epoch": 1.6868877395801642, + "grad_norm": 1.5761195421218872, + "learning_rate": 1.2324262224373628e-06, + "loss": 0.3329, + "step": 13862 + }, + { + "epoch": 1.6870094310921813, + "grad_norm": 1.5399327278137207, + "learning_rate": 1.2314882362492308e-06, + "loss": 0.3726, + "step": 13863 + }, + { + "epoch": 1.6871311226041983, + "grad_norm": 2.0807573795318604, + "learning_rate": 1.2305505837211195e-06, + "loss": 0.3647, + "step": 13864 + }, + { + "epoch": 1.6872528141162153, + "grad_norm": 1.6232025623321533, + "learning_rate": 1.2296132648887128e-06, + "loss": 0.3591, + "step": 13865 + }, + { + "epoch": 1.6873745056282323, + "grad_norm": 1.7631481885910034, + "learning_rate": 1.228676279787675e-06, + "loss": 0.3674, + "step": 13866 + }, + { + "epoch": 1.6874961971402493, + "grad_norm": 1.5286754369735718, + "learning_rate": 1.227739628453658e-06, + "loss": 0.3852, + "step": 13867 + }, + { + "epoch": 1.6876178886522664, + "grad_norm": 1.6232655048370361, + "learning_rate": 1.2268033109223077e-06, + "loss": 0.343, + "step": 13868 + }, + { + "epoch": 1.6877395801642834, + "grad_norm": 1.570472240447998, + "learning_rate": 1.2258673272292509e-06, + "loss": 0.395, + "step": 13869 + }, + { + "epoch": 1.6878612716763006, + "grad_norm": 2.6342594623565674, + "learning_rate": 1.2249316774101017e-06, + "loss": 0.3615, + "step": 13870 + }, + { + "epoch": 1.6879829631883176, + "grad_norm": 1.5583102703094482, + "learning_rate": 1.2239963615004701e-06, + "loss": 0.3216, + "step": 13871 + }, + { + "epoch": 1.6881046547003347, + "grad_norm": 2.0572757720947266, + "learning_rate": 1.223061379535938e-06, + "loss": 0.3879, + "step": 13872 + }, + { + "epoch": 1.6882263462123517, + "grad_norm": 1.362613558769226, + "learning_rate": 1.222126731552089e-06, + "loss": 0.2967, + "step": 13873 + }, + { + "epoch": 1.6883480377243687, + "grad_norm": 1.8617174625396729, + "learning_rate": 1.221192417584488e-06, + "loss": 0.343, + "step": 13874 + }, + { + "epoch": 1.6884697292363857, + "grad_norm": 3.0676445960998535, + "learning_rate": 1.2202584376686843e-06, + "loss": 0.416, + "step": 13875 + }, + { + "epoch": 1.688591420748403, + "grad_norm": 1.53612220287323, + "learning_rate": 1.2193247918402217e-06, + "loss": 0.4226, + "step": 13876 + }, + { + "epoch": 1.68871311226042, + "grad_norm": 1.6260018348693848, + "learning_rate": 1.2183914801346264e-06, + "loss": 0.3674, + "step": 13877 + }, + { + "epoch": 1.688834803772437, + "grad_norm": 1.915148138999939, + "learning_rate": 1.2174585025874097e-06, + "loss": 0.3835, + "step": 13878 + }, + { + "epoch": 1.688956495284454, + "grad_norm": 2.3580193519592285, + "learning_rate": 1.216525859234079e-06, + "loss": 0.4044, + "step": 13879 + }, + { + "epoch": 1.689078186796471, + "grad_norm": 1.9624818563461304, + "learning_rate": 1.2155935501101191e-06, + "loss": 0.347, + "step": 13880 + }, + { + "epoch": 1.689199878308488, + "grad_norm": 1.4094874858856201, + "learning_rate": 1.214661575251005e-06, + "loss": 0.3167, + "step": 13881 + }, + { + "epoch": 1.689321569820505, + "grad_norm": 2.3209450244903564, + "learning_rate": 1.213729934692205e-06, + "loss": 0.4138, + "step": 13882 + }, + { + "epoch": 1.689443261332522, + "grad_norm": 1.5316599607467651, + "learning_rate": 1.2127986284691683e-06, + "loss": 0.3751, + "step": 13883 + }, + { + "epoch": 1.6895649528445391, + "grad_norm": 1.7891693115234375, + "learning_rate": 1.2118676566173316e-06, + "loss": 0.4153, + "step": 13884 + }, + { + "epoch": 1.6896866443565561, + "grad_norm": 1.642689824104309, + "learning_rate": 1.2109370191721203e-06, + "loss": 0.3587, + "step": 13885 + }, + { + "epoch": 1.6898083358685732, + "grad_norm": 2.266117572784424, + "learning_rate": 1.2100067161689465e-06, + "loss": 0.4444, + "step": 13886 + }, + { + "epoch": 1.6899300273805902, + "grad_norm": 1.7410967350006104, + "learning_rate": 1.2090767476432142e-06, + "loss": 0.3354, + "step": 13887 + }, + { + "epoch": 1.6900517188926072, + "grad_norm": 1.5513304471969604, + "learning_rate": 1.208147113630307e-06, + "loss": 0.3374, + "step": 13888 + }, + { + "epoch": 1.6901734104046242, + "grad_norm": 2.0188941955566406, + "learning_rate": 1.2072178141655988e-06, + "loss": 0.3215, + "step": 13889 + }, + { + "epoch": 1.6902951019166412, + "grad_norm": 1.7523757219314575, + "learning_rate": 1.2062888492844539e-06, + "loss": 0.332, + "step": 13890 + }, + { + "epoch": 1.6904167934286582, + "grad_norm": 1.408253788948059, + "learning_rate": 1.2053602190222214e-06, + "loss": 0.3423, + "step": 13891 + }, + { + "epoch": 1.6905384849406753, + "grad_norm": 2.106708526611328, + "learning_rate": 1.2044319234142331e-06, + "loss": 0.3765, + "step": 13892 + }, + { + "epoch": 1.6906601764526923, + "grad_norm": 1.4886817932128906, + "learning_rate": 1.203503962495819e-06, + "loss": 0.3559, + "step": 13893 + }, + { + "epoch": 1.6907818679647093, + "grad_norm": 1.8618484735488892, + "learning_rate": 1.2025763363022869e-06, + "loss": 0.35, + "step": 13894 + }, + { + "epoch": 1.6909035594767265, + "grad_norm": 1.4520701169967651, + "learning_rate": 1.2016490448689344e-06, + "loss": 0.322, + "step": 13895 + }, + { + "epoch": 1.6910252509887436, + "grad_norm": 1.3975701332092285, + "learning_rate": 1.2007220882310478e-06, + "loss": 0.335, + "step": 13896 + }, + { + "epoch": 1.6911469425007606, + "grad_norm": 2.045530319213867, + "learning_rate": 1.1997954664238964e-06, + "loss": 0.4157, + "step": 13897 + }, + { + "epoch": 1.6912686340127776, + "grad_norm": 1.7703114748001099, + "learning_rate": 1.1988691794827457e-06, + "loss": 0.3433, + "step": 13898 + }, + { + "epoch": 1.6913903255247946, + "grad_norm": 1.9776839017868042, + "learning_rate": 1.1979432274428405e-06, + "loss": 0.3808, + "step": 13899 + }, + { + "epoch": 1.6915120170368116, + "grad_norm": 1.9306470155715942, + "learning_rate": 1.1970176103394115e-06, + "loss": 0.307, + "step": 13900 + }, + { + "epoch": 1.6916337085488289, + "grad_norm": 2.20820951461792, + "learning_rate": 1.1960923282076863e-06, + "loss": 0.3394, + "step": 13901 + }, + { + "epoch": 1.691755400060846, + "grad_norm": 1.6490033864974976, + "learning_rate": 1.195167381082869e-06, + "loss": 0.3451, + "step": 13902 + }, + { + "epoch": 1.691877091572863, + "grad_norm": 2.2584750652313232, + "learning_rate": 1.19424276900016e-06, + "loss": 0.3957, + "step": 13903 + }, + { + "epoch": 1.69199878308488, + "grad_norm": 1.813049554824829, + "learning_rate": 1.1933184919947404e-06, + "loss": 0.39, + "step": 13904 + }, + { + "epoch": 1.692120474596897, + "grad_norm": 1.6137909889221191, + "learning_rate": 1.1923945501017786e-06, + "loss": 0.3493, + "step": 13905 + }, + { + "epoch": 1.692242166108914, + "grad_norm": 2.3704895973205566, + "learning_rate": 1.1914709433564398e-06, + "loss": 0.3918, + "step": 13906 + }, + { + "epoch": 1.692363857620931, + "grad_norm": 2.6435279846191406, + "learning_rate": 1.1905476717938614e-06, + "loss": 0.3163, + "step": 13907 + }, + { + "epoch": 1.692485549132948, + "grad_norm": 2.7807271480560303, + "learning_rate": 1.1896247354491774e-06, + "loss": 0.4237, + "step": 13908 + }, + { + "epoch": 1.692607240644965, + "grad_norm": 2.0644032955169678, + "learning_rate": 1.18870213435751e-06, + "loss": 0.4196, + "step": 13909 + }, + { + "epoch": 1.692728932156982, + "grad_norm": 1.563829779624939, + "learning_rate": 1.1877798685539633e-06, + "loss": 0.3664, + "step": 13910 + }, + { + "epoch": 1.692850623668999, + "grad_norm": 2.3204684257507324, + "learning_rate": 1.1868579380736344e-06, + "loss": 0.3799, + "step": 13911 + }, + { + "epoch": 1.692972315181016, + "grad_norm": 1.9118934869766235, + "learning_rate": 1.1859363429516035e-06, + "loss": 0.4313, + "step": 13912 + }, + { + "epoch": 1.6930940066930331, + "grad_norm": 2.6560192108154297, + "learning_rate": 1.1850150832229368e-06, + "loss": 0.4295, + "step": 13913 + }, + { + "epoch": 1.6932156982050501, + "grad_norm": 1.937640905380249, + "learning_rate": 1.184094158922695e-06, + "loss": 0.3631, + "step": 13914 + }, + { + "epoch": 1.6933373897170672, + "grad_norm": 2.101738929748535, + "learning_rate": 1.183173570085917e-06, + "loss": 0.3749, + "step": 13915 + }, + { + "epoch": 1.6934590812290842, + "grad_norm": 1.9780380725860596, + "learning_rate": 1.1822533167476336e-06, + "loss": 0.3338, + "step": 13916 + }, + { + "epoch": 1.6935807727411012, + "grad_norm": 1.7169417142868042, + "learning_rate": 1.1813333989428665e-06, + "loss": 0.3714, + "step": 13917 + }, + { + "epoch": 1.6937024642531182, + "grad_norm": 3.403289556503296, + "learning_rate": 1.1804138167066136e-06, + "loss": 0.4806, + "step": 13918 + }, + { + "epoch": 1.6938241557651352, + "grad_norm": 1.8074461221694946, + "learning_rate": 1.1794945700738724e-06, + "loss": 0.3785, + "step": 13919 + }, + { + "epoch": 1.6939458472771525, + "grad_norm": 1.990272879600525, + "learning_rate": 1.1785756590796204e-06, + "loss": 0.3894, + "step": 13920 + }, + { + "epoch": 1.6940675387891695, + "grad_norm": 1.5768609046936035, + "learning_rate": 1.1776570837588208e-06, + "loss": 0.3387, + "step": 13921 + }, + { + "epoch": 1.6941892303011865, + "grad_norm": 2.194485902786255, + "learning_rate": 1.1767388441464333e-06, + "loss": 0.3989, + "step": 13922 + }, + { + "epoch": 1.6943109218132035, + "grad_norm": 1.8552031517028809, + "learning_rate": 1.1758209402773967e-06, + "loss": 0.4083, + "step": 13923 + }, + { + "epoch": 1.6944326133252205, + "grad_norm": 3.6940269470214844, + "learning_rate": 1.1749033721866354e-06, + "loss": 0.3644, + "step": 13924 + }, + { + "epoch": 1.6945543048372376, + "grad_norm": 3.08841872215271, + "learning_rate": 1.1739861399090702e-06, + "loss": 0.39, + "step": 13925 + }, + { + "epoch": 1.6946759963492546, + "grad_norm": 3.911529302597046, + "learning_rate": 1.173069243479601e-06, + "loss": 0.356, + "step": 13926 + }, + { + "epoch": 1.6947976878612718, + "grad_norm": 2.4748220443725586, + "learning_rate": 1.1721526829331153e-06, + "loss": 0.3627, + "step": 13927 + }, + { + "epoch": 1.6949193793732888, + "grad_norm": 1.4452297687530518, + "learning_rate": 1.1712364583044954e-06, + "loss": 0.3711, + "step": 13928 + }, + { + "epoch": 1.6950410708853059, + "grad_norm": 2.475959539413452, + "learning_rate": 1.1703205696286024e-06, + "loss": 0.3188, + "step": 13929 + }, + { + "epoch": 1.6951627623973229, + "grad_norm": 1.9028396606445312, + "learning_rate": 1.1694050169402883e-06, + "loss": 0.3438, + "step": 13930 + }, + { + "epoch": 1.69528445390934, + "grad_norm": 2.206341505050659, + "learning_rate": 1.1684898002743916e-06, + "loss": 0.3854, + "step": 13931 + }, + { + "epoch": 1.695406145421357, + "grad_norm": 1.3340353965759277, + "learning_rate": 1.167574919665736e-06, + "loss": 0.3312, + "step": 13932 + }, + { + "epoch": 1.695527836933374, + "grad_norm": 1.6846473217010498, + "learning_rate": 1.1666603751491402e-06, + "loss": 0.3982, + "step": 13933 + }, + { + "epoch": 1.695649528445391, + "grad_norm": 1.9827247858047485, + "learning_rate": 1.1657461667593994e-06, + "loss": 0.3971, + "step": 13934 + }, + { + "epoch": 1.695771219957408, + "grad_norm": 1.3461912870407104, + "learning_rate": 1.1648322945313017e-06, + "loss": 0.3421, + "step": 13935 + }, + { + "epoch": 1.695892911469425, + "grad_norm": 2.17278790473938, + "learning_rate": 1.1639187584996248e-06, + "loss": 0.3444, + "step": 13936 + }, + { + "epoch": 1.696014602981442, + "grad_norm": 1.8735146522521973, + "learning_rate": 1.1630055586991285e-06, + "loss": 0.3776, + "step": 13937 + }, + { + "epoch": 1.696136294493459, + "grad_norm": 1.9665249586105347, + "learning_rate": 1.1620926951645606e-06, + "loss": 0.4169, + "step": 13938 + }, + { + "epoch": 1.696257986005476, + "grad_norm": 1.7326833009719849, + "learning_rate": 1.161180167930661e-06, + "loss": 0.3695, + "step": 13939 + }, + { + "epoch": 1.696379677517493, + "grad_norm": 2.694737195968628, + "learning_rate": 1.1602679770321512e-06, + "loss": 0.3375, + "step": 13940 + }, + { + "epoch": 1.69650136902951, + "grad_norm": 1.9490004777908325, + "learning_rate": 1.1593561225037431e-06, + "loss": 0.3942, + "step": 13941 + }, + { + "epoch": 1.6966230605415271, + "grad_norm": 2.8919477462768555, + "learning_rate": 1.1584446043801323e-06, + "loss": 0.4085, + "step": 13942 + }, + { + "epoch": 1.6967447520535441, + "grad_norm": 1.697523832321167, + "learning_rate": 1.1575334226960045e-06, + "loss": 0.3673, + "step": 13943 + }, + { + "epoch": 1.6968664435655612, + "grad_norm": 1.7414662837982178, + "learning_rate": 1.156622577486034e-06, + "loss": 0.3127, + "step": 13944 + }, + { + "epoch": 1.6969881350775782, + "grad_norm": 1.6480767726898193, + "learning_rate": 1.15571206878488e-06, + "loss": 0.3199, + "step": 13945 + }, + { + "epoch": 1.6971098265895954, + "grad_norm": 1.942179560661316, + "learning_rate": 1.1548018966271856e-06, + "loss": 0.35, + "step": 13946 + }, + { + "epoch": 1.6972315181016124, + "grad_norm": 2.7838218212127686, + "learning_rate": 1.15389206104759e-06, + "loss": 0.3975, + "step": 13947 + }, + { + "epoch": 1.6973532096136295, + "grad_norm": 1.9651570320129395, + "learning_rate": 1.1529825620807121e-06, + "loss": 0.4118, + "step": 13948 + }, + { + "epoch": 1.6974749011256465, + "grad_norm": 3.165354013442993, + "learning_rate": 1.1520733997611577e-06, + "loss": 0.4557, + "step": 13949 + }, + { + "epoch": 1.6975965926376635, + "grad_norm": 1.6352638006210327, + "learning_rate": 1.1511645741235267e-06, + "loss": 0.3404, + "step": 13950 + }, + { + "epoch": 1.6977182841496805, + "grad_norm": 2.1372931003570557, + "learning_rate": 1.1502560852023992e-06, + "loss": 0.3674, + "step": 13951 + }, + { + "epoch": 1.6978399756616978, + "grad_norm": 2.2757043838500977, + "learning_rate": 1.1493479330323466e-06, + "loss": 0.4067, + "step": 13952 + }, + { + "epoch": 1.6979616671737148, + "grad_norm": 2.541049003601074, + "learning_rate": 1.1484401176479243e-06, + "loss": 0.3318, + "step": 13953 + }, + { + "epoch": 1.6980833586857318, + "grad_norm": 1.514437198638916, + "learning_rate": 1.147532639083676e-06, + "loss": 0.363, + "step": 13954 + }, + { + "epoch": 1.6982050501977488, + "grad_norm": 2.9432213306427, + "learning_rate": 1.1466254973741364e-06, + "loss": 0.3848, + "step": 13955 + }, + { + "epoch": 1.6983267417097658, + "grad_norm": 3.3483433723449707, + "learning_rate": 1.145718692553821e-06, + "loss": 0.2824, + "step": 13956 + }, + { + "epoch": 1.6984484332217828, + "grad_norm": 1.9449719190597534, + "learning_rate": 1.1448122246572356e-06, + "loss": 0.3729, + "step": 13957 + }, + { + "epoch": 1.6985701247337999, + "grad_norm": 2.138198137283325, + "learning_rate": 1.1439060937188762e-06, + "loss": 0.2778, + "step": 13958 + }, + { + "epoch": 1.6986918162458169, + "grad_norm": 1.4589548110961914, + "learning_rate": 1.1430002997732204e-06, + "loss": 0.3471, + "step": 13959 + }, + { + "epoch": 1.698813507757834, + "grad_norm": 2.0117695331573486, + "learning_rate": 1.1420948428547352e-06, + "loss": 0.309, + "step": 13960 + }, + { + "epoch": 1.698935199269851, + "grad_norm": 1.8093459606170654, + "learning_rate": 1.1411897229978774e-06, + "loss": 0.3968, + "step": 13961 + }, + { + "epoch": 1.699056890781868, + "grad_norm": 1.7502095699310303, + "learning_rate": 1.1402849402370853e-06, + "loss": 0.4172, + "step": 13962 + }, + { + "epoch": 1.699178582293885, + "grad_norm": 1.983483910560608, + "learning_rate": 1.1393804946067943e-06, + "loss": 0.3856, + "step": 13963 + }, + { + "epoch": 1.699300273805902, + "grad_norm": 1.778069257736206, + "learning_rate": 1.138476386141414e-06, + "loss": 0.3451, + "step": 13964 + }, + { + "epoch": 1.699421965317919, + "grad_norm": 1.9403551816940308, + "learning_rate": 1.1375726148753464e-06, + "loss": 0.3637, + "step": 13965 + }, + { + "epoch": 1.699543656829936, + "grad_norm": 1.8398736715316772, + "learning_rate": 1.1366691808429886e-06, + "loss": 0.3688, + "step": 13966 + }, + { + "epoch": 1.699665348341953, + "grad_norm": 1.6638399362564087, + "learning_rate": 1.1357660840787133e-06, + "loss": 0.3604, + "step": 13967 + }, + { + "epoch": 1.69978703985397, + "grad_norm": 1.6198534965515137, + "learning_rate": 1.134863324616885e-06, + "loss": 0.393, + "step": 13968 + }, + { + "epoch": 1.699908731365987, + "grad_norm": 2.1225740909576416, + "learning_rate": 1.1339609024918586e-06, + "loss": 0.3491, + "step": 13969 + }, + { + "epoch": 1.700030422878004, + "grad_norm": 1.749290108680725, + "learning_rate": 1.1330588177379698e-06, + "loss": 0.3901, + "step": 13970 + }, + { + "epoch": 1.7001521143900213, + "grad_norm": 2.071774959564209, + "learning_rate": 1.132157070389549e-06, + "loss": 0.3573, + "step": 13971 + }, + { + "epoch": 1.7002738059020384, + "grad_norm": 2.022597312927246, + "learning_rate": 1.1312556604809067e-06, + "loss": 0.3951, + "step": 13972 + }, + { + "epoch": 1.7003954974140554, + "grad_norm": 2.1706387996673584, + "learning_rate": 1.1303545880463418e-06, + "loss": 0.3426, + "step": 13973 + }, + { + "epoch": 1.7005171889260724, + "grad_norm": 2.5023021697998047, + "learning_rate": 1.129453853120146e-06, + "loss": 0.338, + "step": 13974 + }, + { + "epoch": 1.7006388804380894, + "grad_norm": 1.8623839616775513, + "learning_rate": 1.1285534557365929e-06, + "loss": 0.3595, + "step": 13975 + }, + { + "epoch": 1.7007605719501064, + "grad_norm": 3.5104005336761475, + "learning_rate": 1.127653395929943e-06, + "loss": 0.414, + "step": 13976 + }, + { + "epoch": 1.7008822634621237, + "grad_norm": 1.9361910820007324, + "learning_rate": 1.1267536737344465e-06, + "loss": 0.3411, + "step": 13977 + }, + { + "epoch": 1.7010039549741407, + "grad_norm": 3.107074737548828, + "learning_rate": 1.125854289184336e-06, + "loss": 0.2885, + "step": 13978 + }, + { + "epoch": 1.7011256464861577, + "grad_norm": 2.2797510623931885, + "learning_rate": 1.124955242313842e-06, + "loss": 0.3911, + "step": 13979 + }, + { + "epoch": 1.7012473379981747, + "grad_norm": 1.9399561882019043, + "learning_rate": 1.1240565331571707e-06, + "loss": 0.3593, + "step": 13980 + }, + { + "epoch": 1.7013690295101918, + "grad_norm": 2.1943323612213135, + "learning_rate": 1.1231581617485176e-06, + "loss": 0.3065, + "step": 13981 + }, + { + "epoch": 1.7014907210222088, + "grad_norm": 1.545628547668457, + "learning_rate": 1.1222601281220736e-06, + "loss": 0.3482, + "step": 13982 + }, + { + "epoch": 1.7016124125342258, + "grad_norm": 1.8336818218231201, + "learning_rate": 1.1213624323120076e-06, + "loss": 0.4191, + "step": 13983 + }, + { + "epoch": 1.7017341040462428, + "grad_norm": 1.9648090600967407, + "learning_rate": 1.1204650743524759e-06, + "loss": 0.4103, + "step": 13984 + }, + { + "epoch": 1.7018557955582598, + "grad_norm": 2.643662929534912, + "learning_rate": 1.1195680542776311e-06, + "loss": 0.3016, + "step": 13985 + }, + { + "epoch": 1.7019774870702769, + "grad_norm": 1.5862783193588257, + "learning_rate": 1.1186713721216025e-06, + "loss": 0.3496, + "step": 13986 + }, + { + "epoch": 1.7020991785822939, + "grad_norm": 2.0508296489715576, + "learning_rate": 1.1177750279185118e-06, + "loss": 0.3929, + "step": 13987 + }, + { + "epoch": 1.702220870094311, + "grad_norm": 1.6170121431350708, + "learning_rate": 1.116879021702466e-06, + "loss": 0.3313, + "step": 13988 + }, + { + "epoch": 1.702342561606328, + "grad_norm": 1.9966620206832886, + "learning_rate": 1.1159833535075594e-06, + "loss": 0.3838, + "step": 13989 + }, + { + "epoch": 1.702464253118345, + "grad_norm": 1.8811813592910767, + "learning_rate": 1.1150880233678764e-06, + "loss": 0.3447, + "step": 13990 + }, + { + "epoch": 1.702585944630362, + "grad_norm": 1.8958288431167603, + "learning_rate": 1.1141930313174843e-06, + "loss": 0.3547, + "step": 13991 + }, + { + "epoch": 1.702707636142379, + "grad_norm": 1.6099693775177002, + "learning_rate": 1.1132983773904393e-06, + "loss": 0.3381, + "step": 13992 + }, + { + "epoch": 1.702829327654396, + "grad_norm": 1.6218100786209106, + "learning_rate": 1.1124040616207866e-06, + "loss": 0.359, + "step": 13993 + }, + { + "epoch": 1.702951019166413, + "grad_norm": 1.8699159622192383, + "learning_rate": 1.1115100840425564e-06, + "loss": 0.3438, + "step": 13994 + }, + { + "epoch": 1.70307271067843, + "grad_norm": 1.7054446935653687, + "learning_rate": 1.1106164446897628e-06, + "loss": 0.3581, + "step": 13995 + }, + { + "epoch": 1.7031944021904473, + "grad_norm": 3.486537218093872, + "learning_rate": 1.1097231435964162e-06, + "loss": 0.3141, + "step": 13996 + }, + { + "epoch": 1.7033160937024643, + "grad_norm": 2.0044102668762207, + "learning_rate": 1.1088301807965052e-06, + "loss": 0.4258, + "step": 13997 + }, + { + "epoch": 1.7034377852144813, + "grad_norm": 2.704425096511841, + "learning_rate": 1.1079375563240103e-06, + "loss": 0.448, + "step": 13998 + }, + { + "epoch": 1.7035594767264983, + "grad_norm": 2.560760259628296, + "learning_rate": 1.1070452702128965e-06, + "loss": 0.2965, + "step": 13999 + }, + { + "epoch": 1.7036811682385153, + "grad_norm": 1.3174242973327637, + "learning_rate": 1.1061533224971156e-06, + "loss": 0.3196, + "step": 14000 + }, + { + "epoch": 1.7038028597505324, + "grad_norm": 1.7981939315795898, + "learning_rate": 1.1052617132106113e-06, + "loss": 0.3245, + "step": 14001 + }, + { + "epoch": 1.7039245512625496, + "grad_norm": 1.9320846796035767, + "learning_rate": 1.1043704423873104e-06, + "loss": 0.4217, + "step": 14002 + }, + { + "epoch": 1.7040462427745666, + "grad_norm": 1.9292234182357788, + "learning_rate": 1.1034795100611252e-06, + "loss": 0.3626, + "step": 14003 + }, + { + "epoch": 1.7041679342865836, + "grad_norm": 2.313657760620117, + "learning_rate": 1.102588916265962e-06, + "loss": 0.3527, + "step": 14004 + }, + { + "epoch": 1.7042896257986007, + "grad_norm": 1.4426449537277222, + "learning_rate": 1.1016986610357072e-06, + "loss": 0.3316, + "step": 14005 + }, + { + "epoch": 1.7044113173106177, + "grad_norm": 2.132216453552246, + "learning_rate": 1.1008087444042336e-06, + "loss": 0.3235, + "step": 14006 + }, + { + "epoch": 1.7045330088226347, + "grad_norm": 3.11110258102417, + "learning_rate": 1.099919166405411e-06, + "loss": 0.4182, + "step": 14007 + }, + { + "epoch": 1.7046547003346517, + "grad_norm": 2.1369504928588867, + "learning_rate": 1.0990299270730854e-06, + "loss": 0.3247, + "step": 14008 + }, + { + "epoch": 1.7047763918466687, + "grad_norm": 2.776984691619873, + "learning_rate": 1.098141026441093e-06, + "loss": 0.3989, + "step": 14009 + }, + { + "epoch": 1.7048980833586858, + "grad_norm": 1.7858883142471313, + "learning_rate": 1.0972524645432647e-06, + "loss": 0.3294, + "step": 14010 + }, + { + "epoch": 1.7050197748707028, + "grad_norm": 3.6378374099731445, + "learning_rate": 1.0963642414134035e-06, + "loss": 0.4295, + "step": 14011 + }, + { + "epoch": 1.7051414663827198, + "grad_norm": 2.4956066608428955, + "learning_rate": 1.0954763570853155e-06, + "loss": 0.353, + "step": 14012 + }, + { + "epoch": 1.7052631578947368, + "grad_norm": 1.9461241960525513, + "learning_rate": 1.0945888115927816e-06, + "loss": 0.3219, + "step": 14013 + }, + { + "epoch": 1.7053848494067538, + "grad_norm": 1.7975083589553833, + "learning_rate": 1.0937016049695759e-06, + "loss": 0.3573, + "step": 14014 + }, + { + "epoch": 1.7055065409187709, + "grad_norm": 2.1224591732025146, + "learning_rate": 1.0928147372494613e-06, + "loss": 0.3831, + "step": 14015 + }, + { + "epoch": 1.7056282324307879, + "grad_norm": 1.5556366443634033, + "learning_rate": 1.091928208466182e-06, + "loss": 0.3425, + "step": 14016 + }, + { + "epoch": 1.705749923942805, + "grad_norm": 1.7902332544326782, + "learning_rate": 1.0910420186534721e-06, + "loss": 0.349, + "step": 14017 + }, + { + "epoch": 1.705871615454822, + "grad_norm": 2.7425386905670166, + "learning_rate": 1.0901561678450546e-06, + "loss": 0.3289, + "step": 14018 + }, + { + "epoch": 1.705993306966839, + "grad_norm": 2.3676950931549072, + "learning_rate": 1.089270656074638e-06, + "loss": 0.3434, + "step": 14019 + }, + { + "epoch": 1.706114998478856, + "grad_norm": 1.9426798820495605, + "learning_rate": 1.0883854833759155e-06, + "loss": 0.3944, + "step": 14020 + }, + { + "epoch": 1.7062366899908732, + "grad_norm": 3.295860767364502, + "learning_rate": 1.0875006497825747e-06, + "loss": 0.4305, + "step": 14021 + }, + { + "epoch": 1.7063583815028902, + "grad_norm": 2.0086307525634766, + "learning_rate": 1.0866161553282784e-06, + "loss": 0.3842, + "step": 14022 + }, + { + "epoch": 1.7064800730149072, + "grad_norm": 1.6595697402954102, + "learning_rate": 1.0857320000466898e-06, + "loss": 0.3505, + "step": 14023 + }, + { + "epoch": 1.7066017645269242, + "grad_norm": 1.8961855173110962, + "learning_rate": 1.0848481839714487e-06, + "loss": 0.3619, + "step": 14024 + }, + { + "epoch": 1.7067234560389413, + "grad_norm": 2.3203125, + "learning_rate": 1.083964707136187e-06, + "loss": 0.3429, + "step": 14025 + }, + { + "epoch": 1.7068451475509583, + "grad_norm": 1.9095147848129272, + "learning_rate": 1.0830815695745245e-06, + "loss": 0.2919, + "step": 14026 + }, + { + "epoch": 1.7069668390629753, + "grad_norm": 1.5612170696258545, + "learning_rate": 1.0821987713200631e-06, + "loss": 0.3202, + "step": 14027 + }, + { + "epoch": 1.7070885305749925, + "grad_norm": 2.065566062927246, + "learning_rate": 1.0813163124063996e-06, + "loss": 0.4277, + "step": 14028 + }, + { + "epoch": 1.7072102220870096, + "grad_norm": 2.7088770866394043, + "learning_rate": 1.0804341928671102e-06, + "loss": 0.3151, + "step": 14029 + }, + { + "epoch": 1.7073319135990266, + "grad_norm": 1.5059829950332642, + "learning_rate": 1.0795524127357616e-06, + "loss": 0.3335, + "step": 14030 + }, + { + "epoch": 1.7074536051110436, + "grad_norm": 3.178105354309082, + "learning_rate": 1.07867097204591e-06, + "loss": 0.4036, + "step": 14031 + }, + { + "epoch": 1.7075752966230606, + "grad_norm": 1.8254328966140747, + "learning_rate": 1.077789870831093e-06, + "loss": 0.3365, + "step": 14032 + }, + { + "epoch": 1.7076969881350776, + "grad_norm": 1.327294111251831, + "learning_rate": 1.0769091091248397e-06, + "loss": 0.322, + "step": 14033 + }, + { + "epoch": 1.7078186796470947, + "grad_norm": 1.9347903728485107, + "learning_rate": 1.076028686960665e-06, + "loss": 0.3764, + "step": 14034 + }, + { + "epoch": 1.7079403711591117, + "grad_norm": 1.7627278566360474, + "learning_rate": 1.0751486043720683e-06, + "loss": 0.3264, + "step": 14035 + }, + { + "epoch": 1.7080620626711287, + "grad_norm": 2.2402665615081787, + "learning_rate": 1.0742688613925422e-06, + "loss": 0.4097, + "step": 14036 + }, + { + "epoch": 1.7081837541831457, + "grad_norm": 1.9333844184875488, + "learning_rate": 1.0733894580555616e-06, + "loss": 0.3718, + "step": 14037 + }, + { + "epoch": 1.7083054456951627, + "grad_norm": 3.3544297218322754, + "learning_rate": 1.072510394394587e-06, + "loss": 0.3651, + "step": 14038 + }, + { + "epoch": 1.7084271372071798, + "grad_norm": 1.7452428340911865, + "learning_rate": 1.071631670443073e-06, + "loss": 0.3477, + "step": 14039 + }, + { + "epoch": 1.7085488287191968, + "grad_norm": 1.6241368055343628, + "learning_rate": 1.0707532862344538e-06, + "loss": 0.3513, + "step": 14040 + }, + { + "epoch": 1.7086705202312138, + "grad_norm": 1.8884460926055908, + "learning_rate": 1.0698752418021542e-06, + "loss": 0.3792, + "step": 14041 + }, + { + "epoch": 1.7087922117432308, + "grad_norm": 1.3426518440246582, + "learning_rate": 1.0689975371795868e-06, + "loss": 0.3302, + "step": 14042 + }, + { + "epoch": 1.7089139032552478, + "grad_norm": 2.08935284614563, + "learning_rate": 1.0681201724001488e-06, + "loss": 0.3719, + "step": 14043 + }, + { + "epoch": 1.7090355947672649, + "grad_norm": 1.592146873474121, + "learning_rate": 1.0672431474972279e-06, + "loss": 0.3668, + "step": 14044 + }, + { + "epoch": 1.7091572862792819, + "grad_norm": 1.8158204555511475, + "learning_rate": 1.0663664625041936e-06, + "loss": 0.3592, + "step": 14045 + }, + { + "epoch": 1.709278977791299, + "grad_norm": 1.6156903505325317, + "learning_rate": 1.0654901174544053e-06, + "loss": 0.3461, + "step": 14046 + }, + { + "epoch": 1.7094006693033161, + "grad_norm": 2.518251895904541, + "learning_rate": 1.0646141123812136e-06, + "loss": 0.4092, + "step": 14047 + }, + { + "epoch": 1.7095223608153332, + "grad_norm": 2.2245280742645264, + "learning_rate": 1.0637384473179502e-06, + "loss": 0.3436, + "step": 14048 + }, + { + "epoch": 1.7096440523273502, + "grad_norm": 2.391563653945923, + "learning_rate": 1.0628631222979346e-06, + "loss": 0.3524, + "step": 14049 + }, + { + "epoch": 1.7097657438393672, + "grad_norm": 1.895557165145874, + "learning_rate": 1.0619881373544772e-06, + "loss": 0.3581, + "step": 14050 + }, + { + "epoch": 1.7098874353513842, + "grad_norm": 1.6016755104064941, + "learning_rate": 1.0611134925208722e-06, + "loss": 0.2983, + "step": 14051 + }, + { + "epoch": 1.7100091268634012, + "grad_norm": 1.8078246116638184, + "learning_rate": 1.0602391878303987e-06, + "loss": 0.3369, + "step": 14052 + }, + { + "epoch": 1.7101308183754185, + "grad_norm": 1.8278409242630005, + "learning_rate": 1.059365223316331e-06, + "loss": 0.3903, + "step": 14053 + }, + { + "epoch": 1.7102525098874355, + "grad_norm": 1.732277274131775, + "learning_rate": 1.0584915990119216e-06, + "loss": 0.3851, + "step": 14054 + }, + { + "epoch": 1.7103742013994525, + "grad_norm": 1.5919405221939087, + "learning_rate": 1.0576183149504137e-06, + "loss": 0.3369, + "step": 14055 + }, + { + "epoch": 1.7104958929114695, + "grad_norm": 1.998347282409668, + "learning_rate": 1.0567453711650432e-06, + "loss": 0.2933, + "step": 14056 + }, + { + "epoch": 1.7106175844234865, + "grad_norm": 2.7652010917663574, + "learning_rate": 1.055872767689018e-06, + "loss": 0.3579, + "step": 14057 + }, + { + "epoch": 1.7107392759355036, + "grad_norm": 2.5122175216674805, + "learning_rate": 1.055000504555549e-06, + "loss": 0.3179, + "step": 14058 + }, + { + "epoch": 1.7108609674475206, + "grad_norm": 1.6296882629394531, + "learning_rate": 1.0541285817978265e-06, + "loss": 0.3535, + "step": 14059 + }, + { + "epoch": 1.7109826589595376, + "grad_norm": 1.8032680749893188, + "learning_rate": 1.0532569994490271e-06, + "loss": 0.3228, + "step": 14060 + }, + { + "epoch": 1.7111043504715546, + "grad_norm": 2.861661195755005, + "learning_rate": 1.052385757542318e-06, + "loss": 0.3332, + "step": 14061 + }, + { + "epoch": 1.7112260419835716, + "grad_norm": 1.7813626527786255, + "learning_rate": 1.0515148561108524e-06, + "loss": 0.3764, + "step": 14062 + }, + { + "epoch": 1.7113477334955887, + "grad_norm": 1.5075675249099731, + "learning_rate": 1.050644295187767e-06, + "loss": 0.3224, + "step": 14063 + }, + { + "epoch": 1.7114694250076057, + "grad_norm": 1.7072079181671143, + "learning_rate": 1.0497740748061925e-06, + "loss": 0.3507, + "step": 14064 + }, + { + "epoch": 1.7115911165196227, + "grad_norm": 1.846217155456543, + "learning_rate": 1.0489041949992395e-06, + "loss": 0.3647, + "step": 14065 + }, + { + "epoch": 1.7117128080316397, + "grad_norm": 1.7694567441940308, + "learning_rate": 1.0480346558000076e-06, + "loss": 0.3647, + "step": 14066 + }, + { + "epoch": 1.7118344995436567, + "grad_norm": 1.5622608661651611, + "learning_rate": 1.0471654572415912e-06, + "loss": 0.358, + "step": 14067 + }, + { + "epoch": 1.7119561910556738, + "grad_norm": 2.289029121398926, + "learning_rate": 1.0462965993570562e-06, + "loss": 0.4145, + "step": 14068 + }, + { + "epoch": 1.7120778825676908, + "grad_norm": 1.8689907789230347, + "learning_rate": 1.0454280821794717e-06, + "loss": 0.3602, + "step": 14069 + }, + { + "epoch": 1.7121995740797078, + "grad_norm": 2.038823127746582, + "learning_rate": 1.0445599057418821e-06, + "loss": 0.3491, + "step": 14070 + }, + { + "epoch": 1.7123212655917248, + "grad_norm": 2.37129282951355, + "learning_rate": 1.0436920700773234e-06, + "loss": 0.3959, + "step": 14071 + }, + { + "epoch": 1.712442957103742, + "grad_norm": 2.348007917404175, + "learning_rate": 1.0428245752188226e-06, + "loss": 0.3277, + "step": 14072 + }, + { + "epoch": 1.712564648615759, + "grad_norm": 2.007065773010254, + "learning_rate": 1.0419574211993866e-06, + "loss": 0.3582, + "step": 14073 + }, + { + "epoch": 1.712686340127776, + "grad_norm": 2.0469300746917725, + "learning_rate": 1.04109060805201e-06, + "loss": 0.4017, + "step": 14074 + }, + { + "epoch": 1.7128080316397931, + "grad_norm": 1.606426477432251, + "learning_rate": 1.0402241358096832e-06, + "loss": 0.3854, + "step": 14075 + }, + { + "epoch": 1.7129297231518101, + "grad_norm": 1.681236743927002, + "learning_rate": 1.039358004505373e-06, + "loss": 0.3097, + "step": 14076 + }, + { + "epoch": 1.7130514146638272, + "grad_norm": 1.9548197984695435, + "learning_rate": 1.0384922141720356e-06, + "loss": 0.3289, + "step": 14077 + }, + { + "epoch": 1.7131731061758444, + "grad_norm": 2.0898630619049072, + "learning_rate": 1.037626764842622e-06, + "loss": 0.3583, + "step": 14078 + }, + { + "epoch": 1.7132947976878614, + "grad_norm": 2.0821692943573, + "learning_rate": 1.0367616565500604e-06, + "loss": 0.322, + "step": 14079 + }, + { + "epoch": 1.7134164891998784, + "grad_norm": 1.8662997484207153, + "learning_rate": 1.0358968893272702e-06, + "loss": 0.3544, + "step": 14080 + }, + { + "epoch": 1.7135381807118955, + "grad_norm": 1.5352036952972412, + "learning_rate": 1.0350324632071597e-06, + "loss": 0.3544, + "step": 14081 + }, + { + "epoch": 1.7136598722239125, + "grad_norm": 1.5609310865402222, + "learning_rate": 1.0341683782226164e-06, + "loss": 0.3731, + "step": 14082 + }, + { + "epoch": 1.7137815637359295, + "grad_norm": 2.0905563831329346, + "learning_rate": 1.0333046344065277e-06, + "loss": 0.3993, + "step": 14083 + }, + { + "epoch": 1.7139032552479465, + "grad_norm": 3.518739938735962, + "learning_rate": 1.0324412317917575e-06, + "loss": 0.4176, + "step": 14084 + }, + { + "epoch": 1.7140249467599635, + "grad_norm": 1.8051915168762207, + "learning_rate": 1.0315781704111583e-06, + "loss": 0.4026, + "step": 14085 + }, + { + "epoch": 1.7141466382719805, + "grad_norm": 2.331281900405884, + "learning_rate": 1.0307154502975759e-06, + "loss": 0.4104, + "step": 14086 + }, + { + "epoch": 1.7142683297839976, + "grad_norm": 1.7676728963851929, + "learning_rate": 1.0298530714838328e-06, + "loss": 0.3323, + "step": 14087 + }, + { + "epoch": 1.7143900212960146, + "grad_norm": 1.867126703262329, + "learning_rate": 1.0289910340027508e-06, + "loss": 0.3528, + "step": 14088 + }, + { + "epoch": 1.7145117128080316, + "grad_norm": 1.8186051845550537, + "learning_rate": 1.0281293378871283e-06, + "loss": 0.3911, + "step": 14089 + }, + { + "epoch": 1.7146334043200486, + "grad_norm": 2.3229928016662598, + "learning_rate": 1.0272679831697553e-06, + "loss": 0.3946, + "step": 14090 + }, + { + "epoch": 1.7147550958320656, + "grad_norm": 2.176506519317627, + "learning_rate": 1.0264069698834077e-06, + "loss": 0.318, + "step": 14091 + }, + { + "epoch": 1.7148767873440827, + "grad_norm": 3.387120246887207, + "learning_rate": 1.025546298060849e-06, + "loss": 0.4279, + "step": 14092 + }, + { + "epoch": 1.7149984788560997, + "grad_norm": 2.252229690551758, + "learning_rate": 1.0246859677348287e-06, + "loss": 0.3884, + "step": 14093 + }, + { + "epoch": 1.7151201703681167, + "grad_norm": 1.9507129192352295, + "learning_rate": 1.0238259789380867e-06, + "loss": 0.3478, + "step": 14094 + }, + { + "epoch": 1.7152418618801337, + "grad_norm": 2.2231152057647705, + "learning_rate": 1.0229663317033433e-06, + "loss": 0.3785, + "step": 14095 + }, + { + "epoch": 1.7153635533921507, + "grad_norm": 2.567385196685791, + "learning_rate": 1.022107026063316e-06, + "loss": 0.3482, + "step": 14096 + }, + { + "epoch": 1.715485244904168, + "grad_norm": 2.702180862426758, + "learning_rate": 1.0212480620506982e-06, + "loss": 0.3353, + "step": 14097 + }, + { + "epoch": 1.715606936416185, + "grad_norm": 1.8773272037506104, + "learning_rate": 1.020389439698175e-06, + "loss": 0.413, + "step": 14098 + }, + { + "epoch": 1.715728627928202, + "grad_norm": 2.5362932682037354, + "learning_rate": 1.0195311590384227e-06, + "loss": 0.3734, + "step": 14099 + }, + { + "epoch": 1.715850319440219, + "grad_norm": 2.7064740657806396, + "learning_rate": 1.018673220104097e-06, + "loss": 0.2847, + "step": 14100 + }, + { + "epoch": 1.715972010952236, + "grad_norm": 2.8950181007385254, + "learning_rate": 1.0178156229278458e-06, + "loss": 0.3263, + "step": 14101 + }, + { + "epoch": 1.716093702464253, + "grad_norm": 1.702979564666748, + "learning_rate": 1.0169583675423044e-06, + "loss": 0.3786, + "step": 14102 + }, + { + "epoch": 1.7162153939762703, + "grad_norm": 2.6131691932678223, + "learning_rate": 1.0161014539800885e-06, + "loss": 0.416, + "step": 14103 + }, + { + "epoch": 1.7163370854882873, + "grad_norm": 2.7310736179351807, + "learning_rate": 1.0152448822738093e-06, + "loss": 0.3944, + "step": 14104 + }, + { + "epoch": 1.7164587770003044, + "grad_norm": 1.8591914176940918, + "learning_rate": 1.0143886524560587e-06, + "loss": 0.3547, + "step": 14105 + }, + { + "epoch": 1.7165804685123214, + "grad_norm": 2.777805805206299, + "learning_rate": 1.0135327645594184e-06, + "loss": 0.3179, + "step": 14106 + }, + { + "epoch": 1.7167021600243384, + "grad_norm": 1.5503928661346436, + "learning_rate": 1.0126772186164592e-06, + "loss": 0.3647, + "step": 14107 + }, + { + "epoch": 1.7168238515363554, + "grad_norm": 2.6914825439453125, + "learning_rate": 1.0118220146597334e-06, + "loss": 0.3728, + "step": 14108 + }, + { + "epoch": 1.7169455430483724, + "grad_norm": 1.7153568267822266, + "learning_rate": 1.0109671527217835e-06, + "loss": 0.3973, + "step": 14109 + }, + { + "epoch": 1.7170672345603895, + "grad_norm": 1.873653769493103, + "learning_rate": 1.0101126328351418e-06, + "loss": 0.3731, + "step": 14110 + }, + { + "epoch": 1.7171889260724065, + "grad_norm": 1.6782232522964478, + "learning_rate": 1.0092584550323225e-06, + "loss": 0.4116, + "step": 14111 + }, + { + "epoch": 1.7173106175844235, + "grad_norm": 1.7122968435287476, + "learning_rate": 1.0084046193458263e-06, + "loss": 0.3643, + "step": 14112 + }, + { + "epoch": 1.7174323090964405, + "grad_norm": 1.8513596057891846, + "learning_rate": 1.0075511258081505e-06, + "loss": 0.3981, + "step": 14113 + }, + { + "epoch": 1.7175540006084575, + "grad_norm": 3.84171199798584, + "learning_rate": 1.0066979744517636e-06, + "loss": 0.3184, + "step": 14114 + }, + { + "epoch": 1.7176756921204746, + "grad_norm": 2.873119354248047, + "learning_rate": 1.005845165309135e-06, + "loss": 0.3286, + "step": 14115 + }, + { + "epoch": 1.7177973836324916, + "grad_norm": 1.6686207056045532, + "learning_rate": 1.0049926984127156e-06, + "loss": 0.3981, + "step": 14116 + }, + { + "epoch": 1.7179190751445086, + "grad_norm": 1.8420835733413696, + "learning_rate": 1.0041405737949406e-06, + "loss": 0.3602, + "step": 14117 + }, + { + "epoch": 1.7180407666565256, + "grad_norm": 3.320587396621704, + "learning_rate": 1.0032887914882394e-06, + "loss": 0.4768, + "step": 14118 + }, + { + "epoch": 1.7181624581685426, + "grad_norm": 2.6647303104400635, + "learning_rate": 1.0024373515250207e-06, + "loss": 0.4366, + "step": 14119 + }, + { + "epoch": 1.7182841496805596, + "grad_norm": 1.8319995403289795, + "learning_rate": 1.0015862539376842e-06, + "loss": 0.434, + "step": 14120 + }, + { + "epoch": 1.7184058411925767, + "grad_norm": 1.3675659894943237, + "learning_rate": 1.0007354987586171e-06, + "loss": 0.3198, + "step": 14121 + }, + { + "epoch": 1.718527532704594, + "grad_norm": 2.028301239013672, + "learning_rate": 9.998850860201925e-07, + "loss": 0.3149, + "step": 14122 + }, + { + "epoch": 1.718649224216611, + "grad_norm": 1.7637901306152344, + "learning_rate": 9.99035015754768e-07, + "loss": 0.3436, + "step": 14123 + }, + { + "epoch": 1.718770915728628, + "grad_norm": 1.949903964996338, + "learning_rate": 9.98185287994693e-07, + "loss": 0.3579, + "step": 14124 + }, + { + "epoch": 1.718892607240645, + "grad_norm": 1.688572883605957, + "learning_rate": 9.973359027723017e-07, + "loss": 0.37, + "step": 14125 + }, + { + "epoch": 1.719014298752662, + "grad_norm": 1.9323683977127075, + "learning_rate": 9.964868601199129e-07, + "loss": 0.379, + "step": 14126 + }, + { + "epoch": 1.719135990264679, + "grad_norm": 2.9864230155944824, + "learning_rate": 9.95638160069835e-07, + "loss": 0.3602, + "step": 14127 + }, + { + "epoch": 1.719257681776696, + "grad_norm": 1.9064257144927979, + "learning_rate": 9.947898026543622e-07, + "loss": 0.3822, + "step": 14128 + }, + { + "epoch": 1.7193793732887133, + "grad_norm": 2.079899549484253, + "learning_rate": 9.93941787905779e-07, + "loss": 0.316, + "step": 14129 + }, + { + "epoch": 1.7195010648007303, + "grad_norm": 2.1837902069091797, + "learning_rate": 9.930941158563512e-07, + "loss": 0.3962, + "step": 14130 + }, + { + "epoch": 1.7196227563127473, + "grad_norm": 1.7883656024932861, + "learning_rate": 9.922467865383344e-07, + "loss": 0.3655, + "step": 14131 + }, + { + "epoch": 1.7197444478247643, + "grad_norm": 2.2328412532806396, + "learning_rate": 9.913997999839742e-07, + "loss": 0.3123, + "step": 14132 + }, + { + "epoch": 1.7198661393367813, + "grad_norm": 1.6640217304229736, + "learning_rate": 9.905531562254988e-07, + "loss": 0.3419, + "step": 14133 + }, + { + "epoch": 1.7199878308487984, + "grad_norm": 2.8707523345947266, + "learning_rate": 9.897068552951228e-07, + "loss": 0.4164, + "step": 14134 + }, + { + "epoch": 1.7201095223608154, + "grad_norm": 1.5813701152801514, + "learning_rate": 9.888608972250523e-07, + "loss": 0.3304, + "step": 14135 + }, + { + "epoch": 1.7202312138728324, + "grad_norm": 1.7172472476959229, + "learning_rate": 9.880152820474765e-07, + "loss": 0.3657, + "step": 14136 + }, + { + "epoch": 1.7203529053848494, + "grad_norm": 2.234194278717041, + "learning_rate": 9.871700097945737e-07, + "loss": 0.3952, + "step": 14137 + }, + { + "epoch": 1.7204745968968664, + "grad_norm": 1.6735930442810059, + "learning_rate": 9.86325080498507e-07, + "loss": 0.4146, + "step": 14138 + }, + { + "epoch": 1.7205962884088835, + "grad_norm": 1.8130608797073364, + "learning_rate": 9.854804941914264e-07, + "loss": 0.3659, + "step": 14139 + }, + { + "epoch": 1.7207179799209005, + "grad_norm": 1.8729840517044067, + "learning_rate": 9.84636250905474e-07, + "loss": 0.329, + "step": 14140 + }, + { + "epoch": 1.7208396714329175, + "grad_norm": 1.7210155725479126, + "learning_rate": 9.83792350672773e-07, + "loss": 0.3519, + "step": 14141 + }, + { + "epoch": 1.7209613629449345, + "grad_norm": 3.2441020011901855, + "learning_rate": 9.829487935254345e-07, + "loss": 0.4159, + "step": 14142 + }, + { + "epoch": 1.7210830544569515, + "grad_norm": 1.4905451536178589, + "learning_rate": 9.821055794955603e-07, + "loss": 0.3581, + "step": 14143 + }, + { + "epoch": 1.7212047459689686, + "grad_norm": 1.6636959314346313, + "learning_rate": 9.812627086152338e-07, + "loss": 0.3813, + "step": 14144 + }, + { + "epoch": 1.7213264374809856, + "grad_norm": 3.2838358879089355, + "learning_rate": 9.80420180916528e-07, + "loss": 0.443, + "step": 14145 + }, + { + "epoch": 1.7214481289930026, + "grad_norm": 1.567162275314331, + "learning_rate": 9.795779964315056e-07, + "loss": 0.4183, + "step": 14146 + }, + { + "epoch": 1.7215698205050196, + "grad_norm": 1.8367794752120972, + "learning_rate": 9.787361551922092e-07, + "loss": 0.369, + "step": 14147 + }, + { + "epoch": 1.7216915120170369, + "grad_norm": 2.2278058528900146, + "learning_rate": 9.77894657230679e-07, + "loss": 0.3683, + "step": 14148 + }, + { + "epoch": 1.7218132035290539, + "grad_norm": 2.4154059886932373, + "learning_rate": 9.770535025789307e-07, + "loss": 0.3844, + "step": 14149 + }, + { + "epoch": 1.721934895041071, + "grad_norm": 1.7388941049575806, + "learning_rate": 9.762126912689707e-07, + "loss": 0.3821, + "step": 14150 + }, + { + "epoch": 1.722056586553088, + "grad_norm": 1.5595877170562744, + "learning_rate": 9.753722233327977e-07, + "loss": 0.3938, + "step": 14151 + }, + { + "epoch": 1.722178278065105, + "grad_norm": 1.57517409324646, + "learning_rate": 9.745320988023921e-07, + "loss": 0.2974, + "step": 14152 + }, + { + "epoch": 1.722299969577122, + "grad_norm": 1.5863057374954224, + "learning_rate": 9.7369231770972e-07, + "loss": 0.3452, + "step": 14153 + }, + { + "epoch": 1.7224216610891392, + "grad_norm": 1.5435655117034912, + "learning_rate": 9.728528800867398e-07, + "loss": 0.3594, + "step": 14154 + }, + { + "epoch": 1.7225433526011562, + "grad_norm": 1.5653700828552246, + "learning_rate": 9.72013785965391e-07, + "loss": 0.3922, + "step": 14155 + }, + { + "epoch": 1.7226650441131732, + "grad_norm": 1.9242876768112183, + "learning_rate": 9.711750353776072e-07, + "loss": 0.3363, + "step": 14156 + }, + { + "epoch": 1.7227867356251902, + "grad_norm": 1.4263627529144287, + "learning_rate": 9.703366283553006e-07, + "loss": 0.3611, + "step": 14157 + }, + { + "epoch": 1.7229084271372073, + "grad_norm": 1.5623769760131836, + "learning_rate": 9.694985649303745e-07, + "loss": 0.3852, + "step": 14158 + }, + { + "epoch": 1.7230301186492243, + "grad_norm": 1.5293095111846924, + "learning_rate": 9.686608451347245e-07, + "loss": 0.3582, + "step": 14159 + }, + { + "epoch": 1.7231518101612413, + "grad_norm": 2.105797052383423, + "learning_rate": 9.678234690002208e-07, + "loss": 0.3962, + "step": 14160 + }, + { + "epoch": 1.7232735016732583, + "grad_norm": 2.047987937927246, + "learning_rate": 9.669864365587279e-07, + "loss": 0.3334, + "step": 14161 + }, + { + "epoch": 1.7233951931852753, + "grad_norm": 1.7671821117401123, + "learning_rate": 9.661497478421001e-07, + "loss": 0.3821, + "step": 14162 + }, + { + "epoch": 1.7235168846972924, + "grad_norm": 1.756811499595642, + "learning_rate": 9.653134028821709e-07, + "loss": 0.3765, + "step": 14163 + }, + { + "epoch": 1.7236385762093094, + "grad_norm": 2.402477741241455, + "learning_rate": 9.644774017107705e-07, + "loss": 0.4495, + "step": 14164 + }, + { + "epoch": 1.7237602677213264, + "grad_norm": 1.4955434799194336, + "learning_rate": 9.636417443597067e-07, + "loss": 0.3573, + "step": 14165 + }, + { + "epoch": 1.7238819592333434, + "grad_norm": 1.8756307363510132, + "learning_rate": 9.628064308607777e-07, + "loss": 0.3663, + "step": 14166 + }, + { + "epoch": 1.7240036507453604, + "grad_norm": 1.492052674293518, + "learning_rate": 9.619714612457708e-07, + "loss": 0.3174, + "step": 14167 + }, + { + "epoch": 1.7241253422573775, + "grad_norm": 1.9347213506698608, + "learning_rate": 9.61136835546459e-07, + "loss": 0.3399, + "step": 14168 + }, + { + "epoch": 1.7242470337693945, + "grad_norm": 1.6814157962799072, + "learning_rate": 9.603025537945965e-07, + "loss": 0.3748, + "step": 14169 + }, + { + "epoch": 1.7243687252814115, + "grad_norm": 1.3750447034835815, + "learning_rate": 9.594686160219358e-07, + "loss": 0.3215, + "step": 14170 + }, + { + "epoch": 1.7244904167934285, + "grad_norm": 1.697706699371338, + "learning_rate": 9.58635022260207e-07, + "loss": 0.4036, + "step": 14171 + }, + { + "epoch": 1.7246121083054455, + "grad_norm": 2.6654069423675537, + "learning_rate": 9.578017725411304e-07, + "loss": 0.3375, + "step": 14172 + }, + { + "epoch": 1.7247337998174628, + "grad_norm": 1.6381762027740479, + "learning_rate": 9.569688668964116e-07, + "loss": 0.4025, + "step": 14173 + }, + { + "epoch": 1.7248554913294798, + "grad_norm": 1.474560022354126, + "learning_rate": 9.561363053577443e-07, + "loss": 0.3708, + "step": 14174 + }, + { + "epoch": 1.7249771828414968, + "grad_norm": 3.0399885177612305, + "learning_rate": 9.553040879568121e-07, + "loss": 0.4065, + "step": 14175 + }, + { + "epoch": 1.7250988743535138, + "grad_norm": 1.9203499555587769, + "learning_rate": 9.544722147252816e-07, + "loss": 0.3009, + "step": 14176 + }, + { + "epoch": 1.7252205658655309, + "grad_norm": 1.8001118898391724, + "learning_rate": 9.536406856948043e-07, + "loss": 0.3371, + "step": 14177 + }, + { + "epoch": 1.7253422573775479, + "grad_norm": 2.507826328277588, + "learning_rate": 9.528095008970251e-07, + "loss": 0.4456, + "step": 14178 + }, + { + "epoch": 1.7254639488895651, + "grad_norm": 2.496150255203247, + "learning_rate": 9.519786603635717e-07, + "loss": 0.3957, + "step": 14179 + }, + { + "epoch": 1.7255856404015821, + "grad_norm": 2.605888605117798, + "learning_rate": 9.511481641260567e-07, + "loss": 0.4224, + "step": 14180 + }, + { + "epoch": 1.7257073319135992, + "grad_norm": 3.6426854133605957, + "learning_rate": 9.50318012216086e-07, + "loss": 0.4519, + "step": 14181 + }, + { + "epoch": 1.7258290234256162, + "grad_norm": 1.860112190246582, + "learning_rate": 9.494882046652465e-07, + "loss": 0.3922, + "step": 14182 + }, + { + "epoch": 1.7259507149376332, + "grad_norm": 1.4793708324432373, + "learning_rate": 9.486587415051141e-07, + "loss": 0.3461, + "step": 14183 + }, + { + "epoch": 1.7260724064496502, + "grad_norm": 1.375121831893921, + "learning_rate": 9.478296227672523e-07, + "loss": 0.3433, + "step": 14184 + }, + { + "epoch": 1.7261940979616672, + "grad_norm": 1.5559217929840088, + "learning_rate": 9.470008484832083e-07, + "loss": 0.38, + "step": 14185 + }, + { + "epoch": 1.7263157894736842, + "grad_norm": 1.5865850448608398, + "learning_rate": 9.461724186845222e-07, + "loss": 0.3344, + "step": 14186 + }, + { + "epoch": 1.7264374809857013, + "grad_norm": 1.8463729619979858, + "learning_rate": 9.453443334027168e-07, + "loss": 0.3334, + "step": 14187 + }, + { + "epoch": 1.7265591724977183, + "grad_norm": 1.7615233659744263, + "learning_rate": 9.445165926692989e-07, + "loss": 0.3891, + "step": 14188 + }, + { + "epoch": 1.7266808640097353, + "grad_norm": 1.7121474742889404, + "learning_rate": 9.436891965157713e-07, + "loss": 0.4313, + "step": 14189 + }, + { + "epoch": 1.7268025555217523, + "grad_norm": 1.343726634979248, + "learning_rate": 9.428621449736142e-07, + "loss": 0.37, + "step": 14190 + }, + { + "epoch": 1.7269242470337693, + "grad_norm": 1.5749168395996094, + "learning_rate": 9.420354380742991e-07, + "loss": 0.3003, + "step": 14191 + }, + { + "epoch": 1.7270459385457864, + "grad_norm": 2.3141303062438965, + "learning_rate": 9.412090758492853e-07, + "loss": 0.4167, + "step": 14192 + }, + { + "epoch": 1.7271676300578034, + "grad_norm": 1.5556703805923462, + "learning_rate": 9.403830583300166e-07, + "loss": 0.3328, + "step": 14193 + }, + { + "epoch": 1.7272893215698204, + "grad_norm": 2.4870657920837402, + "learning_rate": 9.395573855479257e-07, + "loss": 0.3366, + "step": 14194 + }, + { + "epoch": 1.7274110130818374, + "grad_norm": 2.103954553604126, + "learning_rate": 9.387320575344305e-07, + "loss": 0.3761, + "step": 14195 + }, + { + "epoch": 1.7275327045938544, + "grad_norm": 1.3927984237670898, + "learning_rate": 9.37907074320934e-07, + "loss": 0.3617, + "step": 14196 + }, + { + "epoch": 1.7276543961058715, + "grad_norm": 1.64122474193573, + "learning_rate": 9.370824359388331e-07, + "loss": 0.3938, + "step": 14197 + }, + { + "epoch": 1.7277760876178887, + "grad_norm": 1.7214555740356445, + "learning_rate": 9.36258142419505e-07, + "loss": 0.3528, + "step": 14198 + }, + { + "epoch": 1.7278977791299057, + "grad_norm": 1.8959749937057495, + "learning_rate": 9.354341937943124e-07, + "loss": 0.4055, + "step": 14199 + }, + { + "epoch": 1.7280194706419227, + "grad_norm": 1.9538798332214355, + "learning_rate": 9.346105900946145e-07, + "loss": 0.3981, + "step": 14200 + }, + { + "epoch": 1.7281411621539398, + "grad_norm": 3.2480194568634033, + "learning_rate": 9.337873313517465e-07, + "loss": 0.4434, + "step": 14201 + }, + { + "epoch": 1.7282628536659568, + "grad_norm": 1.9845527410507202, + "learning_rate": 9.329644175970365e-07, + "loss": 0.4204, + "step": 14202 + }, + { + "epoch": 1.7283845451779738, + "grad_norm": 1.5241584777832031, + "learning_rate": 9.321418488617983e-07, + "loss": 0.3893, + "step": 14203 + }, + { + "epoch": 1.728506236689991, + "grad_norm": 2.9985358715057373, + "learning_rate": 9.313196251773338e-07, + "loss": 0.4007, + "step": 14204 + }, + { + "epoch": 1.728627928202008, + "grad_norm": 1.9829671382904053, + "learning_rate": 9.304977465749276e-07, + "loss": 0.4106, + "step": 14205 + }, + { + "epoch": 1.728749619714025, + "grad_norm": 2.017364740371704, + "learning_rate": 9.29676213085855e-07, + "loss": 0.3168, + "step": 14206 + }, + { + "epoch": 1.728871311226042, + "grad_norm": 2.7571353912353516, + "learning_rate": 9.288550247413752e-07, + "loss": 0.3981, + "step": 14207 + }, + { + "epoch": 1.7289930027380591, + "grad_norm": 2.408407688140869, + "learning_rate": 9.2803418157274e-07, + "loss": 0.3056, + "step": 14208 + }, + { + "epoch": 1.7291146942500761, + "grad_norm": 1.5391234159469604, + "learning_rate": 9.272136836111822e-07, + "loss": 0.3796, + "step": 14209 + }, + { + "epoch": 1.7292363857620932, + "grad_norm": 1.7516957521438599, + "learning_rate": 9.263935308879224e-07, + "loss": 0.3899, + "step": 14210 + }, + { + "epoch": 1.7293580772741102, + "grad_norm": 1.509750485420227, + "learning_rate": 9.25573723434171e-07, + "loss": 0.3808, + "step": 14211 + }, + { + "epoch": 1.7294797687861272, + "grad_norm": 1.683774471282959, + "learning_rate": 9.247542612811222e-07, + "loss": 0.3905, + "step": 14212 + }, + { + "epoch": 1.7296014602981442, + "grad_norm": 2.21616792678833, + "learning_rate": 9.239351444599564e-07, + "loss": 0.4452, + "step": 14213 + }, + { + "epoch": 1.7297231518101612, + "grad_norm": 1.5090255737304688, + "learning_rate": 9.231163730018478e-07, + "loss": 0.292, + "step": 14214 + }, + { + "epoch": 1.7298448433221782, + "grad_norm": 1.7390064001083374, + "learning_rate": 9.222979469379467e-07, + "loss": 0.3731, + "step": 14215 + }, + { + "epoch": 1.7299665348341953, + "grad_norm": 1.449793815612793, + "learning_rate": 9.214798662994006e-07, + "loss": 0.3619, + "step": 14216 + }, + { + "epoch": 1.7300882263462123, + "grad_norm": 2.0085577964782715, + "learning_rate": 9.206621311173391e-07, + "loss": 0.3728, + "step": 14217 + }, + { + "epoch": 1.7302099178582293, + "grad_norm": 3.629009246826172, + "learning_rate": 9.198447414228728e-07, + "loss": 0.4148, + "step": 14218 + }, + { + "epoch": 1.7303316093702463, + "grad_norm": 2.0576725006103516, + "learning_rate": 9.190276972471101e-07, + "loss": 0.4253, + "step": 14219 + }, + { + "epoch": 1.7304533008822633, + "grad_norm": 2.8660964965820312, + "learning_rate": 9.182109986211407e-07, + "loss": 0.3434, + "step": 14220 + }, + { + "epoch": 1.7305749923942804, + "grad_norm": 1.589560866355896, + "learning_rate": 9.173946455760396e-07, + "loss": 0.3497, + "step": 14221 + }, + { + "epoch": 1.7306966839062974, + "grad_norm": 1.5878167152404785, + "learning_rate": 9.165786381428732e-07, + "loss": 0.396, + "step": 14222 + }, + { + "epoch": 1.7308183754183146, + "grad_norm": 4.360469341278076, + "learning_rate": 9.15762976352691e-07, + "loss": 0.3111, + "step": 14223 + }, + { + "epoch": 1.7309400669303316, + "grad_norm": 1.77071213722229, + "learning_rate": 9.149476602365315e-07, + "loss": 0.386, + "step": 14224 + }, + { + "epoch": 1.7310617584423487, + "grad_norm": 2.8939599990844727, + "learning_rate": 9.141326898254188e-07, + "loss": 0.3852, + "step": 14225 + }, + { + "epoch": 1.7311834499543657, + "grad_norm": 1.6369757652282715, + "learning_rate": 9.133180651503615e-07, + "loss": 0.4005, + "step": 14226 + }, + { + "epoch": 1.7313051414663827, + "grad_norm": 3.2018487453460693, + "learning_rate": 9.125037862423625e-07, + "loss": 0.3807, + "step": 14227 + }, + { + "epoch": 1.7314268329783997, + "grad_norm": 2.0804715156555176, + "learning_rate": 9.11689853132407e-07, + "loss": 0.3278, + "step": 14228 + }, + { + "epoch": 1.7315485244904167, + "grad_norm": 1.760297417640686, + "learning_rate": 9.108762658514603e-07, + "loss": 0.3474, + "step": 14229 + }, + { + "epoch": 1.731670216002434, + "grad_norm": 2.205538034439087, + "learning_rate": 9.100630244304865e-07, + "loss": 0.3857, + "step": 14230 + }, + { + "epoch": 1.731791907514451, + "grad_norm": 1.6773427724838257, + "learning_rate": 9.092501289004285e-07, + "loss": 0.381, + "step": 14231 + }, + { + "epoch": 1.731913599026468, + "grad_norm": 2.9241650104522705, + "learning_rate": 9.084375792922217e-07, + "loss": 0.3144, + "step": 14232 + }, + { + "epoch": 1.732035290538485, + "grad_norm": 1.6965457201004028, + "learning_rate": 9.076253756367836e-07, + "loss": 0.3461, + "step": 14233 + }, + { + "epoch": 1.732156982050502, + "grad_norm": 1.8271918296813965, + "learning_rate": 9.068135179650173e-07, + "loss": 0.3891, + "step": 14234 + }, + { + "epoch": 1.732278673562519, + "grad_norm": 1.8727154731750488, + "learning_rate": 9.060020063078212e-07, + "loss": 0.3893, + "step": 14235 + }, + { + "epoch": 1.732400365074536, + "grad_norm": 3.413839817047119, + "learning_rate": 9.051908406960719e-07, + "loss": 0.3922, + "step": 14236 + }, + { + "epoch": 1.7325220565865531, + "grad_norm": 1.5881644487380981, + "learning_rate": 9.043800211606357e-07, + "loss": 0.3659, + "step": 14237 + }, + { + "epoch": 1.7326437480985701, + "grad_norm": 1.6653475761413574, + "learning_rate": 9.035695477323669e-07, + "loss": 0.3386, + "step": 14238 + }, + { + "epoch": 1.7327654396105872, + "grad_norm": 1.7836490869522095, + "learning_rate": 9.027594204421064e-07, + "loss": 0.331, + "step": 14239 + }, + { + "epoch": 1.7328871311226042, + "grad_norm": 2.1213221549987793, + "learning_rate": 9.019496393206794e-07, + "loss": 0.3592, + "step": 14240 + }, + { + "epoch": 1.7330088226346212, + "grad_norm": 2.1884124279022217, + "learning_rate": 9.011402043989015e-07, + "loss": 0.3906, + "step": 14241 + }, + { + "epoch": 1.7331305141466382, + "grad_norm": 2.9316279888153076, + "learning_rate": 9.0033111570757e-07, + "loss": 0.345, + "step": 14242 + }, + { + "epoch": 1.7332522056586552, + "grad_norm": 2.2177157402038574, + "learning_rate": 8.99522373277476e-07, + "loss": 0.3075, + "step": 14243 + }, + { + "epoch": 1.7333738971706723, + "grad_norm": 1.7550849914550781, + "learning_rate": 8.987139771393938e-07, + "loss": 0.4, + "step": 14244 + }, + { + "epoch": 1.7334955886826893, + "grad_norm": 3.208246946334839, + "learning_rate": 8.979059273240809e-07, + "loss": 0.4879, + "step": 14245 + }, + { + "epoch": 1.7336172801947063, + "grad_norm": 2.1785266399383545, + "learning_rate": 8.970982238622905e-07, + "loss": 0.4097, + "step": 14246 + }, + { + "epoch": 1.7337389717067233, + "grad_norm": 2.0655219554901123, + "learning_rate": 8.962908667847536e-07, + "loss": 0.3392, + "step": 14247 + }, + { + "epoch": 1.7338606632187403, + "grad_norm": 1.8200823068618774, + "learning_rate": 8.954838561221912e-07, + "loss": 0.3534, + "step": 14248 + }, + { + "epoch": 1.7339823547307576, + "grad_norm": 1.938529133796692, + "learning_rate": 8.946771919053154e-07, + "loss": 0.3548, + "step": 14249 + }, + { + "epoch": 1.7341040462427746, + "grad_norm": 2.110961437225342, + "learning_rate": 8.938708741648194e-07, + "loss": 0.407, + "step": 14250 + }, + { + "epoch": 1.7342257377547916, + "grad_norm": 2.3713085651397705, + "learning_rate": 8.930649029313854e-07, + "loss": 0.3517, + "step": 14251 + }, + { + "epoch": 1.7343474292668086, + "grad_norm": 2.8605287075042725, + "learning_rate": 8.92259278235682e-07, + "loss": 0.3971, + "step": 14252 + }, + { + "epoch": 1.7344691207788256, + "grad_norm": 3.02201247215271, + "learning_rate": 8.914540001083638e-07, + "loss": 0.4127, + "step": 14253 + }, + { + "epoch": 1.7345908122908427, + "grad_norm": 1.4600318670272827, + "learning_rate": 8.906490685800761e-07, + "loss": 0.3628, + "step": 14254 + }, + { + "epoch": 1.73471250380286, + "grad_norm": 2.366039752960205, + "learning_rate": 8.898444836814457e-07, + "loss": 0.4365, + "step": 14255 + }, + { + "epoch": 1.734834195314877, + "grad_norm": 2.0894320011138916, + "learning_rate": 8.89040245443089e-07, + "loss": 0.3759, + "step": 14256 + }, + { + "epoch": 1.734955886826894, + "grad_norm": 1.6230913400650024, + "learning_rate": 8.882363538956107e-07, + "loss": 0.3321, + "step": 14257 + }, + { + "epoch": 1.735077578338911, + "grad_norm": 1.7111051082611084, + "learning_rate": 8.874328090696005e-07, + "loss": 0.3605, + "step": 14258 + }, + { + "epoch": 1.735199269850928, + "grad_norm": 1.5047584772109985, + "learning_rate": 8.866296109956308e-07, + "loss": 0.3789, + "step": 14259 + }, + { + "epoch": 1.735320961362945, + "grad_norm": 2.572357177734375, + "learning_rate": 8.858267597042713e-07, + "loss": 0.3548, + "step": 14260 + }, + { + "epoch": 1.735442652874962, + "grad_norm": 1.667695164680481, + "learning_rate": 8.850242552260679e-07, + "loss": 0.3724, + "step": 14261 + }, + { + "epoch": 1.735564344386979, + "grad_norm": 1.6652452945709229, + "learning_rate": 8.842220975915561e-07, + "loss": 0.3255, + "step": 14262 + }, + { + "epoch": 1.735686035898996, + "grad_norm": 1.9200270175933838, + "learning_rate": 8.834202868312669e-07, + "loss": 0.3256, + "step": 14263 + }, + { + "epoch": 1.735807727411013, + "grad_norm": 1.69968843460083, + "learning_rate": 8.826188229757027e-07, + "loss": 0.3669, + "step": 14264 + }, + { + "epoch": 1.73592941892303, + "grad_norm": 1.806928277015686, + "learning_rate": 8.818177060553645e-07, + "loss": 0.3806, + "step": 14265 + }, + { + "epoch": 1.7360511104350471, + "grad_norm": 1.348667025566101, + "learning_rate": 8.81016936100737e-07, + "loss": 0.3306, + "step": 14266 + }, + { + "epoch": 1.7361728019470641, + "grad_norm": 2.4229934215545654, + "learning_rate": 8.802165131422891e-07, + "loss": 0.4349, + "step": 14267 + }, + { + "epoch": 1.7362944934590812, + "grad_norm": 1.4708613157272339, + "learning_rate": 8.794164372104807e-07, + "loss": 0.3745, + "step": 14268 + }, + { + "epoch": 1.7364161849710982, + "grad_norm": 1.7326991558074951, + "learning_rate": 8.786167083357566e-07, + "loss": 0.3747, + "step": 14269 + }, + { + "epoch": 1.7365378764831152, + "grad_norm": 1.5292972326278687, + "learning_rate": 8.778173265485446e-07, + "loss": 0.3911, + "step": 14270 + }, + { + "epoch": 1.7366595679951322, + "grad_norm": 1.3409945964813232, + "learning_rate": 8.770182918792669e-07, + "loss": 0.3427, + "step": 14271 + }, + { + "epoch": 1.7367812595071492, + "grad_norm": 1.6310046911239624, + "learning_rate": 8.76219604358326e-07, + "loss": 0.356, + "step": 14272 + }, + { + "epoch": 1.7369029510191663, + "grad_norm": 1.6166120767593384, + "learning_rate": 8.754212640161131e-07, + "loss": 0.3769, + "step": 14273 + }, + { + "epoch": 1.7370246425311835, + "grad_norm": 2.4560840129852295, + "learning_rate": 8.746232708830116e-07, + "loss": 0.3959, + "step": 14274 + }, + { + "epoch": 1.7371463340432005, + "grad_norm": 1.5992571115493774, + "learning_rate": 8.738256249893795e-07, + "loss": 0.3468, + "step": 14275 + }, + { + "epoch": 1.7372680255552175, + "grad_norm": 2.3675191402435303, + "learning_rate": 8.730283263655748e-07, + "loss": 0.3028, + "step": 14276 + }, + { + "epoch": 1.7373897170672346, + "grad_norm": 2.043269634246826, + "learning_rate": 8.722313750419343e-07, + "loss": 0.331, + "step": 14277 + }, + { + "epoch": 1.7375114085792516, + "grad_norm": 1.6750531196594238, + "learning_rate": 8.714347710487803e-07, + "loss": 0.3809, + "step": 14278 + }, + { + "epoch": 1.7376331000912686, + "grad_norm": 1.8722851276397705, + "learning_rate": 8.706385144164309e-07, + "loss": 0.3916, + "step": 14279 + }, + { + "epoch": 1.7377547916032858, + "grad_norm": 2.0383670330047607, + "learning_rate": 8.698426051751829e-07, + "loss": 0.378, + "step": 14280 + }, + { + "epoch": 1.7378764831153029, + "grad_norm": 2.141855239868164, + "learning_rate": 8.6904704335532e-07, + "loss": 0.3251, + "step": 14281 + }, + { + "epoch": 1.7379981746273199, + "grad_norm": 2.53654408454895, + "learning_rate": 8.682518289871189e-07, + "loss": 0.2825, + "step": 14282 + }, + { + "epoch": 1.7381198661393369, + "grad_norm": 2.078965902328491, + "learning_rate": 8.674569621008366e-07, + "loss": 0.4302, + "step": 14283 + }, + { + "epoch": 1.738241557651354, + "grad_norm": 2.0348286628723145, + "learning_rate": 8.666624427267212e-07, + "loss": 0.3378, + "step": 14284 + }, + { + "epoch": 1.738363249163371, + "grad_norm": 2.64128041267395, + "learning_rate": 8.65868270895004e-07, + "loss": 0.3456, + "step": 14285 + }, + { + "epoch": 1.738484940675388, + "grad_norm": 1.6047821044921875, + "learning_rate": 8.650744466359074e-07, + "loss": 0.3479, + "step": 14286 + }, + { + "epoch": 1.738606632187405, + "grad_norm": 1.5302748680114746, + "learning_rate": 8.64280969979635e-07, + "loss": 0.3611, + "step": 14287 + }, + { + "epoch": 1.738728323699422, + "grad_norm": 2.3115594387054443, + "learning_rate": 8.634878409563818e-07, + "loss": 0.3515, + "step": 14288 + }, + { + "epoch": 1.738850015211439, + "grad_norm": 1.7235603332519531, + "learning_rate": 8.626950595963269e-07, + "loss": 0.3535, + "step": 14289 + }, + { + "epoch": 1.738971706723456, + "grad_norm": 1.5920923948287964, + "learning_rate": 8.619026259296393e-07, + "loss": 0.3659, + "step": 14290 + }, + { + "epoch": 1.739093398235473, + "grad_norm": 1.8308765888214111, + "learning_rate": 8.611105399864694e-07, + "loss": 0.374, + "step": 14291 + }, + { + "epoch": 1.73921508974749, + "grad_norm": 3.2287888526916504, + "learning_rate": 8.603188017969621e-07, + "loss": 0.3936, + "step": 14292 + }, + { + "epoch": 1.739336781259507, + "grad_norm": 3.4937808513641357, + "learning_rate": 8.595274113912432e-07, + "loss": 0.4142, + "step": 14293 + }, + { + "epoch": 1.739458472771524, + "grad_norm": 2.5637383460998535, + "learning_rate": 8.587363687994233e-07, + "loss": 0.4221, + "step": 14294 + }, + { + "epoch": 1.7395801642835411, + "grad_norm": 1.9632245302200317, + "learning_rate": 8.579456740516079e-07, + "loss": 0.3726, + "step": 14295 + }, + { + "epoch": 1.7397018557955581, + "grad_norm": 1.7519874572753906, + "learning_rate": 8.571553271778832e-07, + "loss": 0.3518, + "step": 14296 + }, + { + "epoch": 1.7398235473075752, + "grad_norm": 1.641635537147522, + "learning_rate": 8.563653282083206e-07, + "loss": 0.3414, + "step": 14297 + }, + { + "epoch": 1.7399452388195922, + "grad_norm": 3.0177409648895264, + "learning_rate": 8.555756771729873e-07, + "loss": 0.4441, + "step": 14298 + }, + { + "epoch": 1.7400669303316094, + "grad_norm": 2.0765628814697266, + "learning_rate": 8.547863741019235e-07, + "loss": 0.3443, + "step": 14299 + }, + { + "epoch": 1.7401886218436264, + "grad_norm": 1.8263086080551147, + "learning_rate": 8.539974190251699e-07, + "loss": 0.3345, + "step": 14300 + }, + { + "epoch": 1.7403103133556435, + "grad_norm": 1.9698293209075928, + "learning_rate": 8.532088119727455e-07, + "loss": 0.3348, + "step": 14301 + }, + { + "epoch": 1.7404320048676605, + "grad_norm": 1.9641646146774292, + "learning_rate": 8.524205529746576e-07, + "loss": 0.3888, + "step": 14302 + }, + { + "epoch": 1.7405536963796775, + "grad_norm": 3.123169422149658, + "learning_rate": 8.516326420609033e-07, + "loss": 0.294, + "step": 14303 + }, + { + "epoch": 1.7406753878916945, + "grad_norm": 2.204244613647461, + "learning_rate": 8.508450792614631e-07, + "loss": 0.3043, + "step": 14304 + }, + { + "epoch": 1.7407970794037118, + "grad_norm": 1.8235293626785278, + "learning_rate": 8.500578646063029e-07, + "loss": 0.3215, + "step": 14305 + }, + { + "epoch": 1.7409187709157288, + "grad_norm": 3.7659685611724854, + "learning_rate": 8.492709981253822e-07, + "loss": 0.4288, + "step": 14306 + }, + { + "epoch": 1.7410404624277458, + "grad_norm": 1.9809975624084473, + "learning_rate": 8.484844798486414e-07, + "loss": 0.3776, + "step": 14307 + }, + { + "epoch": 1.7411621539397628, + "grad_norm": 1.818855881690979, + "learning_rate": 8.476983098060076e-07, + "loss": 0.3598, + "step": 14308 + }, + { + "epoch": 1.7412838454517798, + "grad_norm": 1.9004418849945068, + "learning_rate": 8.469124880274004e-07, + "loss": 0.3407, + "step": 14309 + }, + { + "epoch": 1.7414055369637969, + "grad_norm": 1.6751792430877686, + "learning_rate": 8.461270145427147e-07, + "loss": 0.395, + "step": 14310 + }, + { + "epoch": 1.7415272284758139, + "grad_norm": 1.9813085794448853, + "learning_rate": 8.453418893818455e-07, + "loss": 0.4133, + "step": 14311 + }, + { + "epoch": 1.741648919987831, + "grad_norm": 1.6053643226623535, + "learning_rate": 8.445571125746677e-07, + "loss": 0.3515, + "step": 14312 + }, + { + "epoch": 1.741770611499848, + "grad_norm": 1.8470227718353271, + "learning_rate": 8.437726841510396e-07, + "loss": 0.3951, + "step": 14313 + }, + { + "epoch": 1.741892303011865, + "grad_norm": 2.454641580581665, + "learning_rate": 8.429886041408153e-07, + "loss": 0.3849, + "step": 14314 + }, + { + "epoch": 1.742013994523882, + "grad_norm": 1.8138149976730347, + "learning_rate": 8.422048725738286e-07, + "loss": 0.353, + "step": 14315 + }, + { + "epoch": 1.742135686035899, + "grad_norm": 1.6105372905731201, + "learning_rate": 8.414214894799e-07, + "loss": 0.3365, + "step": 14316 + }, + { + "epoch": 1.742257377547916, + "grad_norm": 1.6992546319961548, + "learning_rate": 8.406384548888425e-07, + "loss": 0.3342, + "step": 14317 + }, + { + "epoch": 1.742379069059933, + "grad_norm": 2.5218346118927, + "learning_rate": 8.39855768830452e-07, + "loss": 0.375, + "step": 14318 + }, + { + "epoch": 1.74250076057195, + "grad_norm": 1.8298931121826172, + "learning_rate": 8.390734313345073e-07, + "loss": 0.3968, + "step": 14319 + }, + { + "epoch": 1.742622452083967, + "grad_norm": 4.075117588043213, + "learning_rate": 8.38291442430782e-07, + "loss": 0.3403, + "step": 14320 + }, + { + "epoch": 1.742744143595984, + "grad_norm": 1.9006128311157227, + "learning_rate": 8.375098021490313e-07, + "loss": 0.3717, + "step": 14321 + }, + { + "epoch": 1.742865835108001, + "grad_norm": 1.9493316411972046, + "learning_rate": 8.367285105189982e-07, + "loss": 0.3399, + "step": 14322 + }, + { + "epoch": 1.742987526620018, + "grad_norm": 1.9841796159744263, + "learning_rate": 8.359475675704121e-07, + "loss": 0.309, + "step": 14323 + }, + { + "epoch": 1.7431092181320353, + "grad_norm": 2.4708316326141357, + "learning_rate": 8.35166973332987e-07, + "loss": 0.3522, + "step": 14324 + }, + { + "epoch": 1.7432309096440524, + "grad_norm": 1.964381456375122, + "learning_rate": 8.343867278364315e-07, + "loss": 0.4194, + "step": 14325 + }, + { + "epoch": 1.7433526011560694, + "grad_norm": 1.8712615966796875, + "learning_rate": 8.336068311104328e-07, + "loss": 0.421, + "step": 14326 + }, + { + "epoch": 1.7434742926680864, + "grad_norm": 1.8085203170776367, + "learning_rate": 8.328272831846651e-07, + "loss": 0.3532, + "step": 14327 + }, + { + "epoch": 1.7435959841801034, + "grad_norm": 2.2099030017852783, + "learning_rate": 8.320480840887968e-07, + "loss": 0.4259, + "step": 14328 + }, + { + "epoch": 1.7437176756921204, + "grad_norm": 2.0019686222076416, + "learning_rate": 8.312692338524752e-07, + "loss": 0.3269, + "step": 14329 + }, + { + "epoch": 1.7438393672041375, + "grad_norm": 1.9506771564483643, + "learning_rate": 8.304907325053357e-07, + "loss": 0.3551, + "step": 14330 + }, + { + "epoch": 1.7439610587161547, + "grad_norm": 2.5915844440460205, + "learning_rate": 8.297125800770056e-07, + "loss": 0.4505, + "step": 14331 + }, + { + "epoch": 1.7440827502281717, + "grad_norm": 1.4320746660232544, + "learning_rate": 8.289347765970934e-07, + "loss": 0.3492, + "step": 14332 + }, + { + "epoch": 1.7442044417401887, + "grad_norm": 1.7862533330917358, + "learning_rate": 8.281573220951955e-07, + "loss": 0.3566, + "step": 14333 + }, + { + "epoch": 1.7443261332522058, + "grad_norm": 2.7786672115325928, + "learning_rate": 8.27380216600896e-07, + "loss": 0.3574, + "step": 14334 + }, + { + "epoch": 1.7444478247642228, + "grad_norm": 2.1462345123291016, + "learning_rate": 8.266034601437645e-07, + "loss": 0.3105, + "step": 14335 + }, + { + "epoch": 1.7445695162762398, + "grad_norm": 2.914695978164673, + "learning_rate": 8.258270527533607e-07, + "loss": 0.325, + "step": 14336 + }, + { + "epoch": 1.7446912077882568, + "grad_norm": 1.4044674634933472, + "learning_rate": 8.250509944592267e-07, + "loss": 0.3654, + "step": 14337 + }, + { + "epoch": 1.7448128993002738, + "grad_norm": 2.0682713985443115, + "learning_rate": 8.24275285290892e-07, + "loss": 0.3975, + "step": 14338 + }, + { + "epoch": 1.7449345908122909, + "grad_norm": 1.9333986043930054, + "learning_rate": 8.234999252778775e-07, + "loss": 0.3841, + "step": 14339 + }, + { + "epoch": 1.7450562823243079, + "grad_norm": 2.2306602001190186, + "learning_rate": 8.227249144496852e-07, + "loss": 0.4137, + "step": 14340 + }, + { + "epoch": 1.745177973836325, + "grad_norm": 1.8189393281936646, + "learning_rate": 8.219502528358036e-07, + "loss": 0.3604, + "step": 14341 + }, + { + "epoch": 1.745299665348342, + "grad_norm": 1.6609610319137573, + "learning_rate": 8.211759404657138e-07, + "loss": 0.3843, + "step": 14342 + }, + { + "epoch": 1.745421356860359, + "grad_norm": 1.6170681715011597, + "learning_rate": 8.204019773688776e-07, + "loss": 0.3643, + "step": 14343 + }, + { + "epoch": 1.745543048372376, + "grad_norm": 1.571121096611023, + "learning_rate": 8.196283635747504e-07, + "loss": 0.3501, + "step": 14344 + }, + { + "epoch": 1.745664739884393, + "grad_norm": 2.148019790649414, + "learning_rate": 8.188550991127642e-07, + "loss": 0.2856, + "step": 14345 + }, + { + "epoch": 1.74578643139641, + "grad_norm": 2.059107542037964, + "learning_rate": 8.180821840123432e-07, + "loss": 0.3551, + "step": 14346 + }, + { + "epoch": 1.745908122908427, + "grad_norm": 2.4106245040893555, + "learning_rate": 8.173096183029027e-07, + "loss": 0.2867, + "step": 14347 + }, + { + "epoch": 1.746029814420444, + "grad_norm": 1.6454966068267822, + "learning_rate": 8.165374020138384e-07, + "loss": 0.3789, + "step": 14348 + }, + { + "epoch": 1.746151505932461, + "grad_norm": 2.545868158340454, + "learning_rate": 8.15765535174533e-07, + "loss": 0.4312, + "step": 14349 + }, + { + "epoch": 1.7462731974444783, + "grad_norm": 8.065531730651855, + "learning_rate": 8.149940178143611e-07, + "loss": 0.3585, + "step": 14350 + }, + { + "epoch": 1.7463948889564953, + "grad_norm": 1.8935617208480835, + "learning_rate": 8.142228499626769e-07, + "loss": 0.3774, + "step": 14351 + }, + { + "epoch": 1.7465165804685123, + "grad_norm": 1.7391681671142578, + "learning_rate": 8.134520316488292e-07, + "loss": 0.3394, + "step": 14352 + }, + { + "epoch": 1.7466382719805293, + "grad_norm": 2.0660479068756104, + "learning_rate": 8.126815629021456e-07, + "loss": 0.457, + "step": 14353 + }, + { + "epoch": 1.7467599634925464, + "grad_norm": 1.928277850151062, + "learning_rate": 8.119114437519448e-07, + "loss": 0.3631, + "step": 14354 + }, + { + "epoch": 1.7468816550045634, + "grad_norm": 2.140286922454834, + "learning_rate": 8.111416742275347e-07, + "loss": 0.3132, + "step": 14355 + }, + { + "epoch": 1.7470033465165806, + "grad_norm": 3.118319511413574, + "learning_rate": 8.103722543582016e-07, + "loss": 0.3733, + "step": 14356 + }, + { + "epoch": 1.7471250380285976, + "grad_norm": 2.1216111183166504, + "learning_rate": 8.096031841732243e-07, + "loss": 0.3299, + "step": 14357 + }, + { + "epoch": 1.7472467295406147, + "grad_norm": 2.7144100666046143, + "learning_rate": 8.088344637018708e-07, + "loss": 0.391, + "step": 14358 + }, + { + "epoch": 1.7473684210526317, + "grad_norm": 1.6950066089630127, + "learning_rate": 8.080660929733885e-07, + "loss": 0.2955, + "step": 14359 + }, + { + "epoch": 1.7474901125646487, + "grad_norm": 1.693657636642456, + "learning_rate": 8.072980720170198e-07, + "loss": 0.3566, + "step": 14360 + }, + { + "epoch": 1.7476118040766657, + "grad_norm": 2.463639497756958, + "learning_rate": 8.065304008619878e-07, + "loss": 0.3585, + "step": 14361 + }, + { + "epoch": 1.7477334955886827, + "grad_norm": 1.5246741771697998, + "learning_rate": 8.057630795375004e-07, + "loss": 0.3634, + "step": 14362 + }, + { + "epoch": 1.7478551871006998, + "grad_norm": 2.191861152648926, + "learning_rate": 8.049961080727619e-07, + "loss": 0.3907, + "step": 14363 + }, + { + "epoch": 1.7479768786127168, + "grad_norm": 1.908806324005127, + "learning_rate": 8.042294864969536e-07, + "loss": 0.3199, + "step": 14364 + }, + { + "epoch": 1.7480985701247338, + "grad_norm": 1.8936090469360352, + "learning_rate": 8.034632148392452e-07, + "loss": 0.3729, + "step": 14365 + }, + { + "epoch": 1.7482202616367508, + "grad_norm": 1.4756361246109009, + "learning_rate": 8.026972931288001e-07, + "loss": 0.3561, + "step": 14366 + }, + { + "epoch": 1.7483419531487678, + "grad_norm": 1.8983566761016846, + "learning_rate": 8.019317213947597e-07, + "loss": 0.4401, + "step": 14367 + }, + { + "epoch": 1.7484636446607849, + "grad_norm": 3.4970991611480713, + "learning_rate": 8.011664996662561e-07, + "loss": 0.3247, + "step": 14368 + }, + { + "epoch": 1.7485853361728019, + "grad_norm": 3.1445930004119873, + "learning_rate": 8.004016279724081e-07, + "loss": 0.4137, + "step": 14369 + }, + { + "epoch": 1.748707027684819, + "grad_norm": 2.1475565433502197, + "learning_rate": 7.996371063423181e-07, + "loss": 0.3976, + "step": 14370 + }, + { + "epoch": 1.748828719196836, + "grad_norm": 2.2538836002349854, + "learning_rate": 7.988729348050817e-07, + "loss": 0.3747, + "step": 14371 + }, + { + "epoch": 1.748950410708853, + "grad_norm": 1.8164736032485962, + "learning_rate": 7.981091133897756e-07, + "loss": 0.3836, + "step": 14372 + }, + { + "epoch": 1.74907210222087, + "grad_norm": 1.7587019205093384, + "learning_rate": 7.973456421254633e-07, + "loss": 0.3501, + "step": 14373 + }, + { + "epoch": 1.749193793732887, + "grad_norm": 1.6233000755310059, + "learning_rate": 7.965825210411993e-07, + "loss": 0.3591, + "step": 14374 + }, + { + "epoch": 1.7493154852449042, + "grad_norm": 1.580428123474121, + "learning_rate": 7.958197501660203e-07, + "loss": 0.3298, + "step": 14375 + }, + { + "epoch": 1.7494371767569212, + "grad_norm": 1.9146944284439087, + "learning_rate": 7.950573295289499e-07, + "loss": 0.4258, + "step": 14376 + }, + { + "epoch": 1.7495588682689382, + "grad_norm": 2.0037081241607666, + "learning_rate": 7.942952591590025e-07, + "loss": 0.3407, + "step": 14377 + }, + { + "epoch": 1.7496805597809553, + "grad_norm": 1.64399254322052, + "learning_rate": 7.935335390851751e-07, + "loss": 0.3668, + "step": 14378 + }, + { + "epoch": 1.7498022512929723, + "grad_norm": 1.9744069576263428, + "learning_rate": 7.927721693364531e-07, + "loss": 0.3618, + "step": 14379 + }, + { + "epoch": 1.7499239428049893, + "grad_norm": 1.8075882196426392, + "learning_rate": 7.920111499418082e-07, + "loss": 0.3507, + "step": 14380 + }, + { + "epoch": 1.7500456343170065, + "grad_norm": 1.5442469120025635, + "learning_rate": 7.912504809301969e-07, + "loss": 0.3364, + "step": 14381 + }, + { + "epoch": 1.7501673258290236, + "grad_norm": 2.457545757293701, + "learning_rate": 7.904901623305672e-07, + "loss": 0.4603, + "step": 14382 + }, + { + "epoch": 1.7502890173410406, + "grad_norm": 2.4344992637634277, + "learning_rate": 7.897301941718505e-07, + "loss": 0.3341, + "step": 14383 + }, + { + "epoch": 1.7504107088530576, + "grad_norm": 1.8127057552337646, + "learning_rate": 7.889705764829614e-07, + "loss": 0.3776, + "step": 14384 + }, + { + "epoch": 1.7505324003650746, + "grad_norm": 1.8539972305297852, + "learning_rate": 7.88211309292809e-07, + "loss": 0.3802, + "step": 14385 + }, + { + "epoch": 1.7506540918770916, + "grad_norm": 1.6395106315612793, + "learning_rate": 7.874523926302846e-07, + "loss": 0.3587, + "step": 14386 + }, + { + "epoch": 1.7507757833891087, + "grad_norm": 2.129948616027832, + "learning_rate": 7.86693826524263e-07, + "loss": 0.3364, + "step": 14387 + }, + { + "epoch": 1.7508974749011257, + "grad_norm": 2.1072161197662354, + "learning_rate": 7.859356110036143e-07, + "loss": 0.3612, + "step": 14388 + }, + { + "epoch": 1.7510191664131427, + "grad_norm": 1.4651302099227905, + "learning_rate": 7.851777460971877e-07, + "loss": 0.3282, + "step": 14389 + }, + { + "epoch": 1.7511408579251597, + "grad_norm": 1.5033385753631592, + "learning_rate": 7.844202318338212e-07, + "loss": 0.3119, + "step": 14390 + }, + { + "epoch": 1.7512625494371767, + "grad_norm": 3.124924421310425, + "learning_rate": 7.836630682423396e-07, + "loss": 0.4177, + "step": 14391 + }, + { + "epoch": 1.7513842409491938, + "grad_norm": 2.8099687099456787, + "learning_rate": 7.829062553515543e-07, + "loss": 0.3768, + "step": 14392 + }, + { + "epoch": 1.7515059324612108, + "grad_norm": 2.5157947540283203, + "learning_rate": 7.821497931902655e-07, + "loss": 0.3346, + "step": 14393 + }, + { + "epoch": 1.7516276239732278, + "grad_norm": 2.5333807468414307, + "learning_rate": 7.81393681787257e-07, + "loss": 0.3606, + "step": 14394 + }, + { + "epoch": 1.7517493154852448, + "grad_norm": 1.9541484117507935, + "learning_rate": 7.806379211712978e-07, + "loss": 0.3742, + "step": 14395 + }, + { + "epoch": 1.7518710069972618, + "grad_norm": 1.5055828094482422, + "learning_rate": 7.798825113711516e-07, + "loss": 0.3868, + "step": 14396 + }, + { + "epoch": 1.7519926985092789, + "grad_norm": 2.6457931995391846, + "learning_rate": 7.791274524155601e-07, + "loss": 0.3693, + "step": 14397 + }, + { + "epoch": 1.7521143900212959, + "grad_norm": 1.9580260515213013, + "learning_rate": 7.783727443332534e-07, + "loss": 0.3472, + "step": 14398 + }, + { + "epoch": 1.752236081533313, + "grad_norm": 2.075838327407837, + "learning_rate": 7.77618387152953e-07, + "loss": 0.3959, + "step": 14399 + }, + { + "epoch": 1.7523577730453301, + "grad_norm": 1.451200246810913, + "learning_rate": 7.768643809033626e-07, + "loss": 0.3778, + "step": 14400 + }, + { + "epoch": 1.7524794645573472, + "grad_norm": 1.5976767539978027, + "learning_rate": 7.761107256131739e-07, + "loss": 0.324, + "step": 14401 + }, + { + "epoch": 1.7526011560693642, + "grad_norm": 1.5861172676086426, + "learning_rate": 7.753574213110637e-07, + "loss": 0.3853, + "step": 14402 + }, + { + "epoch": 1.7527228475813812, + "grad_norm": 1.511989712715149, + "learning_rate": 7.746044680256959e-07, + "loss": 0.3411, + "step": 14403 + }, + { + "epoch": 1.7528445390933982, + "grad_norm": 1.9029408693313599, + "learning_rate": 7.738518657857252e-07, + "loss": 0.3656, + "step": 14404 + }, + { + "epoch": 1.7529662306054152, + "grad_norm": 2.7883009910583496, + "learning_rate": 7.730996146197889e-07, + "loss": 0.4121, + "step": 14405 + }, + { + "epoch": 1.7530879221174325, + "grad_norm": 2.30157470703125, + "learning_rate": 7.723477145565083e-07, + "loss": 0.4029, + "step": 14406 + }, + { + "epoch": 1.7532096136294495, + "grad_norm": 2.4816009998321533, + "learning_rate": 7.715961656244997e-07, + "loss": 0.4004, + "step": 14407 + }, + { + "epoch": 1.7533313051414665, + "grad_norm": 1.7638683319091797, + "learning_rate": 7.708449678523588e-07, + "loss": 0.3736, + "step": 14408 + }, + { + "epoch": 1.7534529966534835, + "grad_norm": 1.883698582649231, + "learning_rate": 7.700941212686674e-07, + "loss": 0.3736, + "step": 14409 + }, + { + "epoch": 1.7535746881655006, + "grad_norm": 2.497305154800415, + "learning_rate": 7.693436259020026e-07, + "loss": 0.3629, + "step": 14410 + }, + { + "epoch": 1.7536963796775176, + "grad_norm": 1.5503554344177246, + "learning_rate": 7.68593481780917e-07, + "loss": 0.3107, + "step": 14411 + }, + { + "epoch": 1.7538180711895346, + "grad_norm": 1.9252413511276245, + "learning_rate": 7.678436889339591e-07, + "loss": 0.3278, + "step": 14412 + }, + { + "epoch": 1.7539397627015516, + "grad_norm": 1.7362934350967407, + "learning_rate": 7.670942473896592e-07, + "loss": 0.3628, + "step": 14413 + }, + { + "epoch": 1.7540614542135686, + "grad_norm": 2.065354108810425, + "learning_rate": 7.663451571765323e-07, + "loss": 0.3384, + "step": 14414 + }, + { + "epoch": 1.7541831457255856, + "grad_norm": 1.5617612600326538, + "learning_rate": 7.655964183230857e-07, + "loss": 0.3428, + "step": 14415 + }, + { + "epoch": 1.7543048372376027, + "grad_norm": 1.2491282224655151, + "learning_rate": 7.648480308578088e-07, + "loss": 0.3193, + "step": 14416 + }, + { + "epoch": 1.7544265287496197, + "grad_norm": 1.5279039144515991, + "learning_rate": 7.640999948091799e-07, + "loss": 0.3744, + "step": 14417 + }, + { + "epoch": 1.7545482202616367, + "grad_norm": 2.2969255447387695, + "learning_rate": 7.633523102056639e-07, + "loss": 0.3548, + "step": 14418 + }, + { + "epoch": 1.7546699117736537, + "grad_norm": 2.429680585861206, + "learning_rate": 7.626049770757104e-07, + "loss": 0.4012, + "step": 14419 + }, + { + "epoch": 1.7547916032856707, + "grad_norm": 2.0860016345977783, + "learning_rate": 7.618579954477589e-07, + "loss": 0.3395, + "step": 14420 + }, + { + "epoch": 1.7549132947976878, + "grad_norm": 1.5536525249481201, + "learning_rate": 7.611113653502333e-07, + "loss": 0.3173, + "step": 14421 + }, + { + "epoch": 1.7550349863097048, + "grad_norm": 1.4375079870224, + "learning_rate": 7.60365086811542e-07, + "loss": 0.3146, + "step": 14422 + }, + { + "epoch": 1.7551566778217218, + "grad_norm": 1.7376062870025635, + "learning_rate": 7.596191598600854e-07, + "loss": 0.3409, + "step": 14423 + }, + { + "epoch": 1.7552783693337388, + "grad_norm": 1.731907844543457, + "learning_rate": 7.588735845242467e-07, + "loss": 0.3972, + "step": 14424 + }, + { + "epoch": 1.755400060845756, + "grad_norm": 2.6565561294555664, + "learning_rate": 7.581283608323953e-07, + "loss": 0.3874, + "step": 14425 + }, + { + "epoch": 1.755521752357773, + "grad_norm": 1.4553632736206055, + "learning_rate": 7.573834888128906e-07, + "loss": 0.3705, + "step": 14426 + }, + { + "epoch": 1.75564344386979, + "grad_norm": 1.4372460842132568, + "learning_rate": 7.566389684940734e-07, + "loss": 0.3149, + "step": 14427 + }, + { + "epoch": 1.7557651353818071, + "grad_norm": 2.6474924087524414, + "learning_rate": 7.558947999042765e-07, + "loss": 0.3704, + "step": 14428 + }, + { + "epoch": 1.7558868268938241, + "grad_norm": 1.6344373226165771, + "learning_rate": 7.551509830718185e-07, + "loss": 0.3461, + "step": 14429 + }, + { + "epoch": 1.7560085184058412, + "grad_norm": 3.925034523010254, + "learning_rate": 7.544075180249988e-07, + "loss": 0.413, + "step": 14430 + }, + { + "epoch": 1.7561302099178582, + "grad_norm": 1.50836980342865, + "learning_rate": 7.536644047921116e-07, + "loss": 0.3432, + "step": 14431 + }, + { + "epoch": 1.7562519014298754, + "grad_norm": 2.3834168910980225, + "learning_rate": 7.52921643401433e-07, + "loss": 0.3944, + "step": 14432 + }, + { + "epoch": 1.7563735929418924, + "grad_norm": 1.8044732809066772, + "learning_rate": 7.52179233881225e-07, + "loss": 0.4377, + "step": 14433 + }, + { + "epoch": 1.7564952844539095, + "grad_norm": 2.5352587699890137, + "learning_rate": 7.514371762597405e-07, + "loss": 0.394, + "step": 14434 + }, + { + "epoch": 1.7566169759659265, + "grad_norm": 1.7870938777923584, + "learning_rate": 7.506954705652158e-07, + "loss": 0.4185, + "step": 14435 + }, + { + "epoch": 1.7567386674779435, + "grad_norm": 2.400815010070801, + "learning_rate": 7.499541168258728e-07, + "loss": 0.3877, + "step": 14436 + }, + { + "epoch": 1.7568603589899605, + "grad_norm": 2.1925830841064453, + "learning_rate": 7.492131150699211e-07, + "loss": 0.3233, + "step": 14437 + }, + { + "epoch": 1.7569820505019775, + "grad_norm": 1.7983744144439697, + "learning_rate": 7.484724653255581e-07, + "loss": 0.3404, + "step": 14438 + }, + { + "epoch": 1.7571037420139946, + "grad_norm": 2.345271348953247, + "learning_rate": 7.477321676209692e-07, + "loss": 0.3949, + "step": 14439 + }, + { + "epoch": 1.7572254335260116, + "grad_norm": 1.5213383436203003, + "learning_rate": 7.469922219843218e-07, + "loss": 0.3571, + "step": 14440 + }, + { + "epoch": 1.7573471250380286, + "grad_norm": 1.5926969051361084, + "learning_rate": 7.46252628443771e-07, + "loss": 0.3249, + "step": 14441 + }, + { + "epoch": 1.7574688165500456, + "grad_norm": 2.315727472305298, + "learning_rate": 7.455133870274645e-07, + "loss": 0.3795, + "step": 14442 + }, + { + "epoch": 1.7575905080620626, + "grad_norm": 1.7920769453048706, + "learning_rate": 7.447744977635285e-07, + "loss": 0.3804, + "step": 14443 + }, + { + "epoch": 1.7577121995740796, + "grad_norm": 1.3890974521636963, + "learning_rate": 7.440359606800796e-07, + "loss": 0.3986, + "step": 14444 + }, + { + "epoch": 1.7578338910860967, + "grad_norm": 1.3588234186172485, + "learning_rate": 7.432977758052217e-07, + "loss": 0.3592, + "step": 14445 + }, + { + "epoch": 1.7579555825981137, + "grad_norm": 2.4143753051757812, + "learning_rate": 7.425599431670438e-07, + "loss": 0.3905, + "step": 14446 + }, + { + "epoch": 1.7580772741101307, + "grad_norm": 2.8679070472717285, + "learning_rate": 7.418224627936221e-07, + "loss": 0.3409, + "step": 14447 + }, + { + "epoch": 1.7581989656221477, + "grad_norm": 2.49882173538208, + "learning_rate": 7.410853347130198e-07, + "loss": 0.3791, + "step": 14448 + }, + { + "epoch": 1.7583206571341647, + "grad_norm": 3.792658567428589, + "learning_rate": 7.403485589532833e-07, + "loss": 0.2899, + "step": 14449 + }, + { + "epoch": 1.7584423486461818, + "grad_norm": 2.197274923324585, + "learning_rate": 7.396121355424523e-07, + "loss": 0.3129, + "step": 14450 + }, + { + "epoch": 1.758564040158199, + "grad_norm": 2.00026798248291, + "learning_rate": 7.388760645085469e-07, + "loss": 0.3615, + "step": 14451 + }, + { + "epoch": 1.758685731670216, + "grad_norm": 1.3426164388656616, + "learning_rate": 7.381403458795755e-07, + "loss": 0.3495, + "step": 14452 + }, + { + "epoch": 1.758807423182233, + "grad_norm": 3.0518858432769775, + "learning_rate": 7.374049796835369e-07, + "loss": 0.4337, + "step": 14453 + }, + { + "epoch": 1.75892911469425, + "grad_norm": 1.4964326620101929, + "learning_rate": 7.366699659484111e-07, + "loss": 0.3453, + "step": 14454 + }, + { + "epoch": 1.759050806206267, + "grad_norm": 2.0643231868743896, + "learning_rate": 7.359353047021656e-07, + "loss": 0.3474, + "step": 14455 + }, + { + "epoch": 1.759172497718284, + "grad_norm": 1.6798450946807861, + "learning_rate": 7.352009959727591e-07, + "loss": 0.3761, + "step": 14456 + }, + { + "epoch": 1.7592941892303013, + "grad_norm": 2.020517110824585, + "learning_rate": 7.344670397881304e-07, + "loss": 0.3914, + "step": 14457 + }, + { + "epoch": 1.7594158807423184, + "grad_norm": 2.052978038787842, + "learning_rate": 7.337334361762094e-07, + "loss": 0.3522, + "step": 14458 + }, + { + "epoch": 1.7595375722543354, + "grad_norm": 1.6676478385925293, + "learning_rate": 7.330001851649138e-07, + "loss": 0.3355, + "step": 14459 + }, + { + "epoch": 1.7596592637663524, + "grad_norm": 2.627361536026001, + "learning_rate": 7.322672867821401e-07, + "loss": 0.3036, + "step": 14460 + }, + { + "epoch": 1.7597809552783694, + "grad_norm": 2.615955114364624, + "learning_rate": 7.315347410557804e-07, + "loss": 0.2908, + "step": 14461 + }, + { + "epoch": 1.7599026467903864, + "grad_norm": 1.537392258644104, + "learning_rate": 7.308025480137082e-07, + "loss": 0.3664, + "step": 14462 + }, + { + "epoch": 1.7600243383024035, + "grad_norm": 1.6661725044250488, + "learning_rate": 7.300707076837843e-07, + "loss": 0.355, + "step": 14463 + }, + { + "epoch": 1.7601460298144205, + "grad_norm": 2.1293396949768066, + "learning_rate": 7.293392200938587e-07, + "loss": 0.3904, + "step": 14464 + }, + { + "epoch": 1.7602677213264375, + "grad_norm": 1.4980497360229492, + "learning_rate": 7.286080852717659e-07, + "loss": 0.3552, + "step": 14465 + }, + { + "epoch": 1.7603894128384545, + "grad_norm": 3.4383912086486816, + "learning_rate": 7.278773032453235e-07, + "loss": 0.4565, + "step": 14466 + }, + { + "epoch": 1.7605111043504715, + "grad_norm": 1.9814101457595825, + "learning_rate": 7.271468740423449e-07, + "loss": 0.3661, + "step": 14467 + }, + { + "epoch": 1.7606327958624886, + "grad_norm": 1.84766685962677, + "learning_rate": 7.2641679769062e-07, + "loss": 0.3867, + "step": 14468 + }, + { + "epoch": 1.7607544873745056, + "grad_norm": 1.3371798992156982, + "learning_rate": 7.2568707421793e-07, + "loss": 0.3142, + "step": 14469 + }, + { + "epoch": 1.7608761788865226, + "grad_norm": 1.9628312587738037, + "learning_rate": 7.249577036520472e-07, + "loss": 0.3652, + "step": 14470 + }, + { + "epoch": 1.7609978703985396, + "grad_norm": 1.8537945747375488, + "learning_rate": 7.242286860207192e-07, + "loss": 0.3374, + "step": 14471 + }, + { + "epoch": 1.7611195619105566, + "grad_norm": 1.5959616899490356, + "learning_rate": 7.235000213516907e-07, + "loss": 0.3622, + "step": 14472 + }, + { + "epoch": 1.7612412534225736, + "grad_norm": 2.3115346431732178, + "learning_rate": 7.227717096726872e-07, + "loss": 0.361, + "step": 14473 + }, + { + "epoch": 1.7613629449345907, + "grad_norm": 2.1264383792877197, + "learning_rate": 7.22043751011422e-07, + "loss": 0.3836, + "step": 14474 + }, + { + "epoch": 1.7614846364466077, + "grad_norm": 2.4236934185028076, + "learning_rate": 7.213161453955974e-07, + "loss": 0.4318, + "step": 14475 + }, + { + "epoch": 1.761606327958625, + "grad_norm": 1.7381917238235474, + "learning_rate": 7.205888928529003e-07, + "loss": 0.3795, + "step": 14476 + }, + { + "epoch": 1.761728019470642, + "grad_norm": 2.0093178749084473, + "learning_rate": 7.198619934110007e-07, + "loss": 0.3751, + "step": 14477 + }, + { + "epoch": 1.761849710982659, + "grad_norm": 1.860365867614746, + "learning_rate": 7.191354470975631e-07, + "loss": 0.3591, + "step": 14478 + }, + { + "epoch": 1.761971402494676, + "grad_norm": 1.6120810508728027, + "learning_rate": 7.184092539402298e-07, + "loss": 0.3364, + "step": 14479 + }, + { + "epoch": 1.762093094006693, + "grad_norm": 3.0284183025360107, + "learning_rate": 7.176834139666378e-07, + "loss": 0.3923, + "step": 14480 + }, + { + "epoch": 1.76221478551871, + "grad_norm": 3.31282901763916, + "learning_rate": 7.169579272044058e-07, + "loss": 0.4327, + "step": 14481 + }, + { + "epoch": 1.7623364770307273, + "grad_norm": 1.8907090425491333, + "learning_rate": 7.162327936811397e-07, + "loss": 0.3964, + "step": 14482 + }, + { + "epoch": 1.7624581685427443, + "grad_norm": 1.7526501417160034, + "learning_rate": 7.155080134244319e-07, + "loss": 0.3163, + "step": 14483 + }, + { + "epoch": 1.7625798600547613, + "grad_norm": 2.4112749099731445, + "learning_rate": 7.147835864618602e-07, + "loss": 0.3856, + "step": 14484 + }, + { + "epoch": 1.7627015515667783, + "grad_norm": 2.010303020477295, + "learning_rate": 7.140595128209948e-07, + "loss": 0.2901, + "step": 14485 + }, + { + "epoch": 1.7628232430787953, + "grad_norm": 2.1501972675323486, + "learning_rate": 7.133357925293849e-07, + "loss": 0.3352, + "step": 14486 + }, + { + "epoch": 1.7629449345908124, + "grad_norm": 1.453220009803772, + "learning_rate": 7.126124256145694e-07, + "loss": 0.351, + "step": 14487 + }, + { + "epoch": 1.7630666261028294, + "grad_norm": 1.5813199281692505, + "learning_rate": 7.118894121040754e-07, + "loss": 0.3832, + "step": 14488 + }, + { + "epoch": 1.7631883176148464, + "grad_norm": 2.1204910278320312, + "learning_rate": 7.111667520254162e-07, + "loss": 0.3537, + "step": 14489 + }, + { + "epoch": 1.7633100091268634, + "grad_norm": 2.2804338932037354, + "learning_rate": 7.104444454060866e-07, + "loss": 0.2907, + "step": 14490 + }, + { + "epoch": 1.7634317006388804, + "grad_norm": 1.4901660680770874, + "learning_rate": 7.097224922735757e-07, + "loss": 0.3844, + "step": 14491 + }, + { + "epoch": 1.7635533921508975, + "grad_norm": 1.5875693559646606, + "learning_rate": 7.090008926553538e-07, + "loss": 0.3641, + "step": 14492 + }, + { + "epoch": 1.7636750836629145, + "grad_norm": 1.7893704175949097, + "learning_rate": 7.082796465788789e-07, + "loss": 0.325, + "step": 14493 + }, + { + "epoch": 1.7637967751749315, + "grad_norm": 2.1332480907440186, + "learning_rate": 7.075587540715967e-07, + "loss": 0.3292, + "step": 14494 + }, + { + "epoch": 1.7639184666869485, + "grad_norm": 1.689132571220398, + "learning_rate": 7.068382151609366e-07, + "loss": 0.3313, + "step": 14495 + }, + { + "epoch": 1.7640401581989655, + "grad_norm": 1.5373564958572388, + "learning_rate": 7.061180298743198e-07, + "loss": 0.3407, + "step": 14496 + }, + { + "epoch": 1.7641618497109826, + "grad_norm": 1.3875147104263306, + "learning_rate": 7.053981982391489e-07, + "loss": 0.2987, + "step": 14497 + }, + { + "epoch": 1.7642835412229996, + "grad_norm": 1.885918140411377, + "learning_rate": 7.046787202828142e-07, + "loss": 0.3586, + "step": 14498 + }, + { + "epoch": 1.7644052327350166, + "grad_norm": 2.2079832553863525, + "learning_rate": 7.039595960326951e-07, + "loss": 0.3362, + "step": 14499 + }, + { + "epoch": 1.7645269242470336, + "grad_norm": 1.5553251504898071, + "learning_rate": 7.03240825516156e-07, + "loss": 0.3856, + "step": 14500 + }, + { + "epoch": 1.7646486157590509, + "grad_norm": 2.093270778656006, + "learning_rate": 7.025224087605453e-07, + "loss": 0.3548, + "step": 14501 + }, + { + "epoch": 1.7647703072710679, + "grad_norm": 2.286315679550171, + "learning_rate": 7.018043457932034e-07, + "loss": 0.3291, + "step": 14502 + }, + { + "epoch": 1.764891998783085, + "grad_norm": 1.7934011220932007, + "learning_rate": 7.010866366414514e-07, + "loss": 0.325, + "step": 14503 + }, + { + "epoch": 1.765013690295102, + "grad_norm": 1.7132139205932617, + "learning_rate": 7.003692813325991e-07, + "loss": 0.3593, + "step": 14504 + }, + { + "epoch": 1.765135381807119, + "grad_norm": 2.294631242752075, + "learning_rate": 6.996522798939487e-07, + "loss": 0.4345, + "step": 14505 + }, + { + "epoch": 1.765257073319136, + "grad_norm": 1.472137451171875, + "learning_rate": 6.989356323527763e-07, + "loss": 0.3399, + "step": 14506 + }, + { + "epoch": 1.7653787648311532, + "grad_norm": 1.6393823623657227, + "learning_rate": 6.982193387363568e-07, + "loss": 0.4025, + "step": 14507 + }, + { + "epoch": 1.7655004563431702, + "grad_norm": 1.7435812950134277, + "learning_rate": 6.975033990719449e-07, + "loss": 0.3642, + "step": 14508 + }, + { + "epoch": 1.7656221478551872, + "grad_norm": 2.0616307258605957, + "learning_rate": 6.967878133867822e-07, + "loss": 0.3539, + "step": 14509 + }, + { + "epoch": 1.7657438393672042, + "grad_norm": 1.8341875076293945, + "learning_rate": 6.960725817081015e-07, + "loss": 0.3802, + "step": 14510 + }, + { + "epoch": 1.7658655308792213, + "grad_norm": 2.115043878555298, + "learning_rate": 6.953577040631176e-07, + "loss": 0.3355, + "step": 14511 + }, + { + "epoch": 1.7659872223912383, + "grad_norm": 1.9199072122573853, + "learning_rate": 6.946431804790299e-07, + "loss": 0.3446, + "step": 14512 + }, + { + "epoch": 1.7661089139032553, + "grad_norm": 1.793259620666504, + "learning_rate": 6.939290109830322e-07, + "loss": 0.3459, + "step": 14513 + }, + { + "epoch": 1.7662306054152723, + "grad_norm": 1.6848077774047852, + "learning_rate": 6.932151956022992e-07, + "loss": 0.3637, + "step": 14514 + }, + { + "epoch": 1.7663522969272893, + "grad_norm": 1.8250707387924194, + "learning_rate": 6.925017343639895e-07, + "loss": 0.3652, + "step": 14515 + }, + { + "epoch": 1.7664739884393064, + "grad_norm": 2.0465261936187744, + "learning_rate": 6.917886272952578e-07, + "loss": 0.3911, + "step": 14516 + }, + { + "epoch": 1.7665956799513234, + "grad_norm": 4.026511192321777, + "learning_rate": 6.910758744232315e-07, + "loss": 0.4958, + "step": 14517 + }, + { + "epoch": 1.7667173714633404, + "grad_norm": 1.5430704355239868, + "learning_rate": 6.9036347577504e-07, + "loss": 0.316, + "step": 14518 + }, + { + "epoch": 1.7668390629753574, + "grad_norm": 1.7800544500350952, + "learning_rate": 6.896514313777869e-07, + "loss": 0.3462, + "step": 14519 + }, + { + "epoch": 1.7669607544873744, + "grad_norm": 3.350831985473633, + "learning_rate": 6.889397412585664e-07, + "loss": 0.4432, + "step": 14520 + }, + { + "epoch": 1.7670824459993915, + "grad_norm": 1.5889924764633179, + "learning_rate": 6.882284054444632e-07, + "loss": 0.3625, + "step": 14521 + }, + { + "epoch": 1.7672041375114085, + "grad_norm": 1.7387501001358032, + "learning_rate": 6.875174239625448e-07, + "loss": 0.3761, + "step": 14522 + }, + { + "epoch": 1.7673258290234255, + "grad_norm": 1.6038727760314941, + "learning_rate": 6.868067968398618e-07, + "loss": 0.3546, + "step": 14523 + }, + { + "epoch": 1.7674475205354425, + "grad_norm": 2.071868896484375, + "learning_rate": 6.860965241034589e-07, + "loss": 0.3919, + "step": 14524 + }, + { + "epoch": 1.7675692120474595, + "grad_norm": 3.5060112476348877, + "learning_rate": 6.853866057803615e-07, + "loss": 0.4265, + "step": 14525 + }, + { + "epoch": 1.7676909035594768, + "grad_norm": 1.730776309967041, + "learning_rate": 6.846770418975835e-07, + "loss": 0.4121, + "step": 14526 + }, + { + "epoch": 1.7678125950714938, + "grad_norm": 1.892000436782837, + "learning_rate": 6.839678324821264e-07, + "loss": 0.3776, + "step": 14527 + }, + { + "epoch": 1.7679342865835108, + "grad_norm": 1.9229711294174194, + "learning_rate": 6.832589775609766e-07, + "loss": 0.361, + "step": 14528 + }, + { + "epoch": 1.7680559780955278, + "grad_norm": 1.7723015546798706, + "learning_rate": 6.825504771611069e-07, + "loss": 0.3915, + "step": 14529 + }, + { + "epoch": 1.7681776696075449, + "grad_norm": 1.7557389736175537, + "learning_rate": 6.818423313094791e-07, + "loss": 0.3661, + "step": 14530 + }, + { + "epoch": 1.7682993611195619, + "grad_norm": 2.8860726356506348, + "learning_rate": 6.811345400330349e-07, + "loss": 0.3782, + "step": 14531 + }, + { + "epoch": 1.768421052631579, + "grad_norm": 1.7480837106704712, + "learning_rate": 6.804271033587129e-07, + "loss": 0.3289, + "step": 14532 + }, + { + "epoch": 1.7685427441435961, + "grad_norm": 1.9398764371871948, + "learning_rate": 6.797200213134292e-07, + "loss": 0.3304, + "step": 14533 + }, + { + "epoch": 1.7686644356556132, + "grad_norm": 1.4975380897521973, + "learning_rate": 6.790132939240901e-07, + "loss": 0.3634, + "step": 14534 + }, + { + "epoch": 1.7687861271676302, + "grad_norm": 2.078226089477539, + "learning_rate": 6.783069212175897e-07, + "loss": 0.3395, + "step": 14535 + }, + { + "epoch": 1.7689078186796472, + "grad_norm": 1.5778584480285645, + "learning_rate": 6.776009032208042e-07, + "loss": 0.3233, + "step": 14536 + }, + { + "epoch": 1.7690295101916642, + "grad_norm": 2.097545623779297, + "learning_rate": 6.768952399606021e-07, + "loss": 0.3869, + "step": 14537 + }, + { + "epoch": 1.7691512017036812, + "grad_norm": 1.9245706796646118, + "learning_rate": 6.761899314638343e-07, + "loss": 0.3381, + "step": 14538 + }, + { + "epoch": 1.7692728932156983, + "grad_norm": 2.8432419300079346, + "learning_rate": 6.75484977757338e-07, + "loss": 0.4162, + "step": 14539 + }, + { + "epoch": 1.7693945847277153, + "grad_norm": 1.7467833757400513, + "learning_rate": 6.747803788679397e-07, + "loss": 0.3607, + "step": 14540 + }, + { + "epoch": 1.7695162762397323, + "grad_norm": 2.521716356277466, + "learning_rate": 6.7407613482245e-07, + "loss": 0.4369, + "step": 14541 + }, + { + "epoch": 1.7696379677517493, + "grad_norm": 1.7306139469146729, + "learning_rate": 6.733722456476654e-07, + "loss": 0.375, + "step": 14542 + }, + { + "epoch": 1.7697596592637663, + "grad_norm": 2.0977470874786377, + "learning_rate": 6.726687113703733e-07, + "loss": 0.3335, + "step": 14543 + }, + { + "epoch": 1.7698813507757833, + "grad_norm": 2.7140793800354004, + "learning_rate": 6.719655320173424e-07, + "loss": 0.3927, + "step": 14544 + }, + { + "epoch": 1.7700030422878004, + "grad_norm": 1.6032614707946777, + "learning_rate": 6.712627076153322e-07, + "loss": 0.3836, + "step": 14545 + }, + { + "epoch": 1.7701247337998174, + "grad_norm": 2.5950915813446045, + "learning_rate": 6.705602381910848e-07, + "loss": 0.3697, + "step": 14546 + }, + { + "epoch": 1.7702464253118344, + "grad_norm": 1.907840371131897, + "learning_rate": 6.698581237713298e-07, + "loss": 0.3228, + "step": 14547 + }, + { + "epoch": 1.7703681168238514, + "grad_norm": 2.101078987121582, + "learning_rate": 6.69156364382787e-07, + "loss": 0.3372, + "step": 14548 + }, + { + "epoch": 1.7704898083358684, + "grad_norm": 1.3305882215499878, + "learning_rate": 6.684549600521595e-07, + "loss": 0.3332, + "step": 14549 + }, + { + "epoch": 1.7706114998478855, + "grad_norm": 2.558901309967041, + "learning_rate": 6.677539108061326e-07, + "loss": 0.3533, + "step": 14550 + }, + { + "epoch": 1.7707331913599025, + "grad_norm": 1.8484891653060913, + "learning_rate": 6.670532166713906e-07, + "loss": 0.3568, + "step": 14551 + }, + { + "epoch": 1.7708548828719197, + "grad_norm": 2.8344151973724365, + "learning_rate": 6.663528776745886e-07, + "loss": 0.3123, + "step": 14552 + }, + { + "epoch": 1.7709765743839367, + "grad_norm": 2.365994453430176, + "learning_rate": 6.6565289384238e-07, + "loss": 0.3006, + "step": 14553 + }, + { + "epoch": 1.7710982658959538, + "grad_norm": 2.0647196769714355, + "learning_rate": 6.649532652014002e-07, + "loss": 0.3394, + "step": 14554 + }, + { + "epoch": 1.7712199574079708, + "grad_norm": 3.8903493881225586, + "learning_rate": 6.642539917782698e-07, + "loss": 0.2907, + "step": 14555 + }, + { + "epoch": 1.7713416489199878, + "grad_norm": 3.136115074157715, + "learning_rate": 6.635550735995999e-07, + "loss": 0.337, + "step": 14556 + }, + { + "epoch": 1.7714633404320048, + "grad_norm": 2.2149486541748047, + "learning_rate": 6.628565106919859e-07, + "loss": 0.3507, + "step": 14557 + }, + { + "epoch": 1.771585031944022, + "grad_norm": 1.881397008895874, + "learning_rate": 6.621583030820067e-07, + "loss": 0.318, + "step": 14558 + }, + { + "epoch": 1.771706723456039, + "grad_norm": 2.2142624855041504, + "learning_rate": 6.61460450796233e-07, + "loss": 0.4046, + "step": 14559 + }, + { + "epoch": 1.771828414968056, + "grad_norm": 1.8249690532684326, + "learning_rate": 6.607629538612192e-07, + "loss": 0.3841, + "step": 14560 + }, + { + "epoch": 1.7719501064800731, + "grad_norm": 1.9165360927581787, + "learning_rate": 6.600658123035053e-07, + "loss": 0.3891, + "step": 14561 + }, + { + "epoch": 1.7720717979920901, + "grad_norm": 1.7116965055465698, + "learning_rate": 6.593690261496222e-07, + "loss": 0.3797, + "step": 14562 + }, + { + "epoch": 1.7721934895041072, + "grad_norm": 1.6220554113388062, + "learning_rate": 6.586725954260787e-07, + "loss": 0.3658, + "step": 14563 + }, + { + "epoch": 1.7723151810161242, + "grad_norm": 1.531381368637085, + "learning_rate": 6.579765201593802e-07, + "loss": 0.3732, + "step": 14564 + }, + { + "epoch": 1.7724368725281412, + "grad_norm": 2.1630656719207764, + "learning_rate": 6.572808003760123e-07, + "loss": 0.3413, + "step": 14565 + }, + { + "epoch": 1.7725585640401582, + "grad_norm": 1.581964135169983, + "learning_rate": 6.56585436102446e-07, + "loss": 0.3743, + "step": 14566 + }, + { + "epoch": 1.7726802555521752, + "grad_norm": 1.8824985027313232, + "learning_rate": 6.558904273651457e-07, + "loss": 0.3843, + "step": 14567 + }, + { + "epoch": 1.7728019470641923, + "grad_norm": 3.0243945121765137, + "learning_rate": 6.551957741905556e-07, + "loss": 0.4064, + "step": 14568 + }, + { + "epoch": 1.7729236385762093, + "grad_norm": 1.7638392448425293, + "learning_rate": 6.545014766051084e-07, + "loss": 0.389, + "step": 14569 + }, + { + "epoch": 1.7730453300882263, + "grad_norm": 1.540652871131897, + "learning_rate": 6.538075346352246e-07, + "loss": 0.3169, + "step": 14570 + }, + { + "epoch": 1.7731670216002433, + "grad_norm": 3.324019432067871, + "learning_rate": 6.531139483073101e-07, + "loss": 0.4413, + "step": 14571 + }, + { + "epoch": 1.7732887131122603, + "grad_norm": 1.8459844589233398, + "learning_rate": 6.524207176477549e-07, + "loss": 0.3558, + "step": 14572 + }, + { + "epoch": 1.7734104046242773, + "grad_norm": 1.7923648357391357, + "learning_rate": 6.51727842682942e-07, + "loss": 0.3417, + "step": 14573 + }, + { + "epoch": 1.7735320961362944, + "grad_norm": 2.2081899642944336, + "learning_rate": 6.510353234392341e-07, + "loss": 0.3729, + "step": 14574 + }, + { + "epoch": 1.7736537876483114, + "grad_norm": 1.6217221021652222, + "learning_rate": 6.503431599429833e-07, + "loss": 0.3404, + "step": 14575 + }, + { + "epoch": 1.7737754791603284, + "grad_norm": 1.4036725759506226, + "learning_rate": 6.496513522205283e-07, + "loss": 0.3215, + "step": 14576 + }, + { + "epoch": 1.7738971706723456, + "grad_norm": 1.5899603366851807, + "learning_rate": 6.489599002981917e-07, + "loss": 0.324, + "step": 14577 + }, + { + "epoch": 1.7740188621843627, + "grad_norm": 1.9212393760681152, + "learning_rate": 6.482688042022889e-07, + "loss": 0.4103, + "step": 14578 + }, + { + "epoch": 1.7741405536963797, + "grad_norm": 1.695272445678711, + "learning_rate": 6.475780639591145e-07, + "loss": 0.358, + "step": 14579 + }, + { + "epoch": 1.7742622452083967, + "grad_norm": 2.3171448707580566, + "learning_rate": 6.468876795949508e-07, + "loss": 0.3968, + "step": 14580 + }, + { + "epoch": 1.7743839367204137, + "grad_norm": 2.148740291595459, + "learning_rate": 6.461976511360735e-07, + "loss": 0.3952, + "step": 14581 + }, + { + "epoch": 1.7745056282324307, + "grad_norm": 2.0499255657196045, + "learning_rate": 6.45507978608737e-07, + "loss": 0.4096, + "step": 14582 + }, + { + "epoch": 1.774627319744448, + "grad_norm": 1.8709224462509155, + "learning_rate": 6.448186620391828e-07, + "loss": 0.3363, + "step": 14583 + }, + { + "epoch": 1.774749011256465, + "grad_norm": 1.5541064739227295, + "learning_rate": 6.44129701453644e-07, + "loss": 0.3803, + "step": 14584 + }, + { + "epoch": 1.774870702768482, + "grad_norm": 1.7476093769073486, + "learning_rate": 6.434410968783355e-07, + "loss": 0.41, + "step": 14585 + }, + { + "epoch": 1.774992394280499, + "grad_norm": 1.8392903804779053, + "learning_rate": 6.427528483394607e-07, + "loss": 0.368, + "step": 14586 + }, + { + "epoch": 1.775114085792516, + "grad_norm": 1.8348569869995117, + "learning_rate": 6.420649558632075e-07, + "loss": 0.3953, + "step": 14587 + }, + { + "epoch": 1.775235777304533, + "grad_norm": 2.5158095359802246, + "learning_rate": 6.413774194757516e-07, + "loss": 0.4437, + "step": 14588 + }, + { + "epoch": 1.77535746881655, + "grad_norm": 1.5707505941390991, + "learning_rate": 6.406902392032588e-07, + "loss": 0.4074, + "step": 14589 + }, + { + "epoch": 1.7754791603285671, + "grad_norm": 2.4350297451019287, + "learning_rate": 6.400034150718737e-07, + "loss": 0.3898, + "step": 14590 + }, + { + "epoch": 1.7756008518405841, + "grad_norm": 3.083111047744751, + "learning_rate": 6.39316947107732e-07, + "loss": 0.4341, + "step": 14591 + }, + { + "epoch": 1.7757225433526012, + "grad_norm": 1.6954346895217896, + "learning_rate": 6.386308353369575e-07, + "loss": 0.3665, + "step": 14592 + }, + { + "epoch": 1.7758442348646182, + "grad_norm": 2.392301559448242, + "learning_rate": 6.379450797856557e-07, + "loss": 0.3624, + "step": 14593 + }, + { + "epoch": 1.7759659263766352, + "grad_norm": 1.7334064245224, + "learning_rate": 6.372596804799213e-07, + "loss": 0.3371, + "step": 14594 + }, + { + "epoch": 1.7760876178886522, + "grad_norm": 2.2518150806427, + "learning_rate": 6.365746374458381e-07, + "loss": 0.3714, + "step": 14595 + }, + { + "epoch": 1.7762093094006692, + "grad_norm": 1.614601492881775, + "learning_rate": 6.358899507094684e-07, + "loss": 0.325, + "step": 14596 + }, + { + "epoch": 1.7763310009126863, + "grad_norm": 1.6773942708969116, + "learning_rate": 6.352056202968726e-07, + "loss": 0.4112, + "step": 14597 + }, + { + "epoch": 1.7764526924247033, + "grad_norm": 2.3715524673461914, + "learning_rate": 6.345216462340853e-07, + "loss": 0.4126, + "step": 14598 + }, + { + "epoch": 1.7765743839367203, + "grad_norm": 1.8623312711715698, + "learning_rate": 6.338380285471324e-07, + "loss": 0.3809, + "step": 14599 + }, + { + "epoch": 1.7766960754487373, + "grad_norm": 2.9922993183135986, + "learning_rate": 6.331547672620319e-07, + "loss": 0.3062, + "step": 14600 + }, + { + "epoch": 1.7768177669607543, + "grad_norm": 1.7961255311965942, + "learning_rate": 6.324718624047809e-07, + "loss": 0.3675, + "step": 14601 + }, + { + "epoch": 1.7769394584727716, + "grad_norm": 2.4649550914764404, + "learning_rate": 6.317893140013631e-07, + "loss": 0.4175, + "step": 14602 + }, + { + "epoch": 1.7770611499847886, + "grad_norm": 2.225714921951294, + "learning_rate": 6.311071220777554e-07, + "loss": 0.4176, + "step": 14603 + }, + { + "epoch": 1.7771828414968056, + "grad_norm": 1.9400768280029297, + "learning_rate": 6.304252866599125e-07, + "loss": 0.3301, + "step": 14604 + }, + { + "epoch": 1.7773045330088226, + "grad_norm": 2.111846923828125, + "learning_rate": 6.297438077737816e-07, + "loss": 0.3309, + "step": 14605 + }, + { + "epoch": 1.7774262245208396, + "grad_norm": 1.546210765838623, + "learning_rate": 6.290626854452953e-07, + "loss": 0.3496, + "step": 14606 + }, + { + "epoch": 1.7775479160328567, + "grad_norm": 1.7552134990692139, + "learning_rate": 6.283819197003693e-07, + "loss": 0.3258, + "step": 14607 + }, + { + "epoch": 1.777669607544874, + "grad_norm": 3.7536916732788086, + "learning_rate": 6.27701510564911e-07, + "loss": 0.2992, + "step": 14608 + }, + { + "epoch": 1.777791299056891, + "grad_norm": 1.7868030071258545, + "learning_rate": 6.270214580648104e-07, + "loss": 0.3631, + "step": 14609 + }, + { + "epoch": 1.777912990568908, + "grad_norm": 2.1418042182922363, + "learning_rate": 6.263417622259427e-07, + "loss": 0.4327, + "step": 14610 + }, + { + "epoch": 1.778034682080925, + "grad_norm": 1.5145152807235718, + "learning_rate": 6.256624230741737e-07, + "loss": 0.3675, + "step": 14611 + }, + { + "epoch": 1.778156373592942, + "grad_norm": 2.19519305229187, + "learning_rate": 6.249834406353517e-07, + "loss": 0.3599, + "step": 14612 + }, + { + "epoch": 1.778278065104959, + "grad_norm": 1.9231786727905273, + "learning_rate": 6.243048149353171e-07, + "loss": 0.3864, + "step": 14613 + }, + { + "epoch": 1.778399756616976, + "grad_norm": 1.6109853982925415, + "learning_rate": 6.236265459998914e-07, + "loss": 0.3511, + "step": 14614 + }, + { + "epoch": 1.778521448128993, + "grad_norm": 3.507779598236084, + "learning_rate": 6.229486338548807e-07, + "loss": 0.4456, + "step": 14615 + }, + { + "epoch": 1.77864313964101, + "grad_norm": 1.6540296077728271, + "learning_rate": 6.222710785260866e-07, + "loss": 0.3717, + "step": 14616 + }, + { + "epoch": 1.778764831153027, + "grad_norm": 2.3607757091522217, + "learning_rate": 6.215938800392884e-07, + "loss": 0.3278, + "step": 14617 + }, + { + "epoch": 1.778886522665044, + "grad_norm": 1.7540985345840454, + "learning_rate": 6.209170384202534e-07, + "loss": 0.367, + "step": 14618 + }, + { + "epoch": 1.7790082141770611, + "grad_norm": 1.715063452720642, + "learning_rate": 6.202405536947409e-07, + "loss": 0.3603, + "step": 14619 + }, + { + "epoch": 1.7791299056890781, + "grad_norm": 1.8997678756713867, + "learning_rate": 6.195644258884903e-07, + "loss": 0.3851, + "step": 14620 + }, + { + "epoch": 1.7792515972010952, + "grad_norm": 1.4899977445602417, + "learning_rate": 6.188886550272299e-07, + "loss": 0.3277, + "step": 14621 + }, + { + "epoch": 1.7793732887131122, + "grad_norm": 2.952059030532837, + "learning_rate": 6.182132411366737e-07, + "loss": 0.4346, + "step": 14622 + }, + { + "epoch": 1.7794949802251292, + "grad_norm": 3.236086368560791, + "learning_rate": 6.17538184242521e-07, + "loss": 0.344, + "step": 14623 + }, + { + "epoch": 1.7796166717371462, + "grad_norm": 1.8219189643859863, + "learning_rate": 6.168634843704624e-07, + "loss": 0.3215, + "step": 14624 + }, + { + "epoch": 1.7797383632491632, + "grad_norm": 1.7022855281829834, + "learning_rate": 6.161891415461718e-07, + "loss": 0.3929, + "step": 14625 + }, + { + "epoch": 1.7798600547611803, + "grad_norm": 1.6795920133590698, + "learning_rate": 6.155151557953043e-07, + "loss": 0.3315, + "step": 14626 + }, + { + "epoch": 1.7799817462731975, + "grad_norm": 1.9836817979812622, + "learning_rate": 6.148415271435127e-07, + "loss": 0.3413, + "step": 14627 + }, + { + "epoch": 1.7801034377852145, + "grad_norm": 1.9625375270843506, + "learning_rate": 6.141682556164274e-07, + "loss": 0.3142, + "step": 14628 + }, + { + "epoch": 1.7802251292972315, + "grad_norm": 1.6465593576431274, + "learning_rate": 6.134953412396649e-07, + "loss": 0.3313, + "step": 14629 + }, + { + "epoch": 1.7803468208092486, + "grad_norm": 1.4613187313079834, + "learning_rate": 6.128227840388357e-07, + "loss": 0.3499, + "step": 14630 + }, + { + "epoch": 1.7804685123212656, + "grad_norm": 1.7990692853927612, + "learning_rate": 6.121505840395303e-07, + "loss": 0.3475, + "step": 14631 + }, + { + "epoch": 1.7805902038332826, + "grad_norm": 1.4610713720321655, + "learning_rate": 6.114787412673262e-07, + "loss": 0.3392, + "step": 14632 + }, + { + "epoch": 1.7807118953452996, + "grad_norm": 1.584815502166748, + "learning_rate": 6.108072557477906e-07, + "loss": 0.3899, + "step": 14633 + }, + { + "epoch": 1.7808335868573169, + "grad_norm": 2.032013416290283, + "learning_rate": 6.10136127506471e-07, + "loss": 0.3498, + "step": 14634 + }, + { + "epoch": 1.7809552783693339, + "grad_norm": 1.9763884544372559, + "learning_rate": 6.09465356568909e-07, + "loss": 0.339, + "step": 14635 + }, + { + "epoch": 1.781076969881351, + "grad_norm": 1.6281328201293945, + "learning_rate": 6.087949429606277e-07, + "loss": 0.3715, + "step": 14636 + }, + { + "epoch": 1.781198661393368, + "grad_norm": 1.9488134384155273, + "learning_rate": 6.081248867071366e-07, + "loss": 0.3505, + "step": 14637 + }, + { + "epoch": 1.781320352905385, + "grad_norm": 2.1327097415924072, + "learning_rate": 6.074551878339341e-07, + "loss": 0.3973, + "step": 14638 + }, + { + "epoch": 1.781442044417402, + "grad_norm": 2.186551570892334, + "learning_rate": 6.067858463665043e-07, + "loss": 0.4052, + "step": 14639 + }, + { + "epoch": 1.781563735929419, + "grad_norm": 2.0402286052703857, + "learning_rate": 6.061168623303137e-07, + "loss": 0.4237, + "step": 14640 + }, + { + "epoch": 1.781685427441436, + "grad_norm": 1.7206637859344482, + "learning_rate": 6.054482357508229e-07, + "loss": 0.3605, + "step": 14641 + }, + { + "epoch": 1.781807118953453, + "grad_norm": 1.7428456544876099, + "learning_rate": 6.047799666534715e-07, + "loss": 0.3451, + "step": 14642 + }, + { + "epoch": 1.78192881046547, + "grad_norm": 1.942031741142273, + "learning_rate": 6.041120550636903e-07, + "loss": 0.3564, + "step": 14643 + }, + { + "epoch": 1.782050501977487, + "grad_norm": 2.4761927127838135, + "learning_rate": 6.034445010068934e-07, + "loss": 0.3296, + "step": 14644 + }, + { + "epoch": 1.782172193489504, + "grad_norm": 1.6716270446777344, + "learning_rate": 6.027773045084817e-07, + "loss": 0.4037, + "step": 14645 + }, + { + "epoch": 1.782293885001521, + "grad_norm": 1.6377651691436768, + "learning_rate": 6.021104655938459e-07, + "loss": 0.3719, + "step": 14646 + }, + { + "epoch": 1.782415576513538, + "grad_norm": 2.054234266281128, + "learning_rate": 6.01443984288359e-07, + "loss": 0.3563, + "step": 14647 + }, + { + "epoch": 1.7825372680255551, + "grad_norm": 1.8019675016403198, + "learning_rate": 6.00777860617382e-07, + "loss": 0.3349, + "step": 14648 + }, + { + "epoch": 1.7826589595375721, + "grad_norm": 2.7690815925598145, + "learning_rate": 6.001120946062633e-07, + "loss": 0.4577, + "step": 14649 + }, + { + "epoch": 1.7827806510495892, + "grad_norm": 3.760871410369873, + "learning_rate": 5.994466862803372e-07, + "loss": 0.4424, + "step": 14650 + }, + { + "epoch": 1.7829023425616062, + "grad_norm": 1.5450057983398438, + "learning_rate": 5.987816356649201e-07, + "loss": 0.3597, + "step": 14651 + }, + { + "epoch": 1.7830240340736232, + "grad_norm": 2.7767693996429443, + "learning_rate": 5.981169427853239e-07, + "loss": 0.4472, + "step": 14652 + }, + { + "epoch": 1.7831457255856404, + "grad_norm": 1.9481687545776367, + "learning_rate": 5.974526076668386e-07, + "loss": 0.3374, + "step": 14653 + }, + { + "epoch": 1.7832674170976575, + "grad_norm": 1.608691930770874, + "learning_rate": 5.967886303347415e-07, + "loss": 0.3735, + "step": 14654 + }, + { + "epoch": 1.7833891086096745, + "grad_norm": 1.670374870300293, + "learning_rate": 5.961250108143035e-07, + "loss": 0.3309, + "step": 14655 + }, + { + "epoch": 1.7835108001216915, + "grad_norm": 2.5748684406280518, + "learning_rate": 5.954617491307713e-07, + "loss": 0.3914, + "step": 14656 + }, + { + "epoch": 1.7836324916337085, + "grad_norm": 1.6508787870407104, + "learning_rate": 5.947988453093878e-07, + "loss": 0.367, + "step": 14657 + }, + { + "epoch": 1.7837541831457255, + "grad_norm": 1.5863659381866455, + "learning_rate": 5.941362993753752e-07, + "loss": 0.3813, + "step": 14658 + }, + { + "epoch": 1.7838758746577428, + "grad_norm": 1.7271991968154907, + "learning_rate": 5.934741113539444e-07, + "loss": 0.2967, + "step": 14659 + }, + { + "epoch": 1.7839975661697598, + "grad_norm": 1.7320616245269775, + "learning_rate": 5.928122812702963e-07, + "loss": 0.3732, + "step": 14660 + }, + { + "epoch": 1.7841192576817768, + "grad_norm": 2.0183608531951904, + "learning_rate": 5.92150809149612e-07, + "loss": 0.2761, + "step": 14661 + }, + { + "epoch": 1.7842409491937938, + "grad_norm": 1.841736912727356, + "learning_rate": 5.914896950170601e-07, + "loss": 0.3946, + "step": 14662 + }, + { + "epoch": 1.7843626407058109, + "grad_norm": 1.9585292339324951, + "learning_rate": 5.908289388978028e-07, + "loss": 0.3385, + "step": 14663 + }, + { + "epoch": 1.7844843322178279, + "grad_norm": 2.909271001815796, + "learning_rate": 5.901685408169777e-07, + "loss": 0.415, + "step": 14664 + }, + { + "epoch": 1.784606023729845, + "grad_norm": 2.813800096511841, + "learning_rate": 5.895085007997192e-07, + "loss": 0.3874, + "step": 14665 + }, + { + "epoch": 1.784727715241862, + "grad_norm": 2.633601665496826, + "learning_rate": 5.888488188711427e-07, + "loss": 0.3921, + "step": 14666 + }, + { + "epoch": 1.784849406753879, + "grad_norm": 2.4266695976257324, + "learning_rate": 5.881894950563449e-07, + "loss": 0.3867, + "step": 14667 + }, + { + "epoch": 1.784971098265896, + "grad_norm": 1.7731428146362305, + "learning_rate": 5.875305293804201e-07, + "loss": 0.3018, + "step": 14668 + }, + { + "epoch": 1.785092789777913, + "grad_norm": 2.9412267208099365, + "learning_rate": 5.868719218684405e-07, + "loss": 0.3444, + "step": 14669 + }, + { + "epoch": 1.78521448128993, + "grad_norm": 3.0094337463378906, + "learning_rate": 5.862136725454681e-07, + "loss": 0.4223, + "step": 14670 + }, + { + "epoch": 1.785336172801947, + "grad_norm": 1.6128140687942505, + "learning_rate": 5.85555781436552e-07, + "loss": 0.4037, + "step": 14671 + }, + { + "epoch": 1.785457864313964, + "grad_norm": 1.832460641860962, + "learning_rate": 5.848982485667232e-07, + "loss": 0.3374, + "step": 14672 + }, + { + "epoch": 1.785579555825981, + "grad_norm": 2.198385715484619, + "learning_rate": 5.842410739610061e-07, + "loss": 0.4153, + "step": 14673 + }, + { + "epoch": 1.785701247337998, + "grad_norm": 1.7360938787460327, + "learning_rate": 5.835842576444062e-07, + "loss": 0.3957, + "step": 14674 + }, + { + "epoch": 1.785822938850015, + "grad_norm": 1.9352519512176514, + "learning_rate": 5.829277996419135e-07, + "loss": 0.4056, + "step": 14675 + }, + { + "epoch": 1.785944630362032, + "grad_norm": 1.5521093606948853, + "learning_rate": 5.822716999785127e-07, + "loss": 0.3601, + "step": 14676 + }, + { + "epoch": 1.7860663218740491, + "grad_norm": 2.041236639022827, + "learning_rate": 5.816159586791669e-07, + "loss": 0.3897, + "step": 14677 + }, + { + "epoch": 1.7861880133860664, + "grad_norm": 2.1369783878326416, + "learning_rate": 5.809605757688297e-07, + "loss": 0.4439, + "step": 14678 + }, + { + "epoch": 1.7863097048980834, + "grad_norm": 1.8396812677383423, + "learning_rate": 5.803055512724387e-07, + "loss": 0.3837, + "step": 14679 + }, + { + "epoch": 1.7864313964101004, + "grad_norm": 1.8973432779312134, + "learning_rate": 5.796508852149174e-07, + "loss": 0.3535, + "step": 14680 + }, + { + "epoch": 1.7865530879221174, + "grad_norm": 1.9612892866134644, + "learning_rate": 5.789965776211804e-07, + "loss": 0.3817, + "step": 14681 + }, + { + "epoch": 1.7866747794341344, + "grad_norm": 2.2423877716064453, + "learning_rate": 5.783426285161243e-07, + "loss": 0.4269, + "step": 14682 + }, + { + "epoch": 1.7867964709461515, + "grad_norm": 2.1847662925720215, + "learning_rate": 5.776890379246303e-07, + "loss": 0.3473, + "step": 14683 + }, + { + "epoch": 1.7869181624581687, + "grad_norm": 2.904357433319092, + "learning_rate": 5.770358058715742e-07, + "loss": 0.3336, + "step": 14684 + }, + { + "epoch": 1.7870398539701857, + "grad_norm": 2.066694498062134, + "learning_rate": 5.763829323818104e-07, + "loss": 0.367, + "step": 14685 + }, + { + "epoch": 1.7871615454822027, + "grad_norm": 2.2317566871643066, + "learning_rate": 5.757304174801792e-07, + "loss": 0.321, + "step": 14686 + }, + { + "epoch": 1.7872832369942198, + "grad_norm": 1.8779890537261963, + "learning_rate": 5.750782611915151e-07, + "loss": 0.3562, + "step": 14687 + }, + { + "epoch": 1.7874049285062368, + "grad_norm": 1.9288618564605713, + "learning_rate": 5.744264635406316e-07, + "loss": 0.3415, + "step": 14688 + }, + { + "epoch": 1.7875266200182538, + "grad_norm": 1.723334550857544, + "learning_rate": 5.737750245523311e-07, + "loss": 0.3908, + "step": 14689 + }, + { + "epoch": 1.7876483115302708, + "grad_norm": 2.3007218837738037, + "learning_rate": 5.731239442514014e-07, + "loss": 0.3273, + "step": 14690 + }, + { + "epoch": 1.7877700030422878, + "grad_norm": 1.4876885414123535, + "learning_rate": 5.724732226626173e-07, + "loss": 0.397, + "step": 14691 + }, + { + "epoch": 1.7878916945543049, + "grad_norm": 2.6491878032684326, + "learning_rate": 5.718228598107422e-07, + "loss": 0.4083, + "step": 14692 + }, + { + "epoch": 1.7880133860663219, + "grad_norm": 1.5451598167419434, + "learning_rate": 5.711728557205232e-07, + "loss": 0.3937, + "step": 14693 + }, + { + "epoch": 1.788135077578339, + "grad_norm": 1.640594720840454, + "learning_rate": 5.705232104166913e-07, + "loss": 0.3476, + "step": 14694 + }, + { + "epoch": 1.788256769090356, + "grad_norm": 1.6284178495407104, + "learning_rate": 5.698739239239715e-07, + "loss": 0.3906, + "step": 14695 + }, + { + "epoch": 1.788378460602373, + "grad_norm": 1.477540373802185, + "learning_rate": 5.692249962670671e-07, + "loss": 0.3591, + "step": 14696 + }, + { + "epoch": 1.78850015211439, + "grad_norm": 1.769839882850647, + "learning_rate": 5.685764274706717e-07, + "loss": 0.3395, + "step": 14697 + }, + { + "epoch": 1.788621843626407, + "grad_norm": 2.4802558422088623, + "learning_rate": 5.679282175594669e-07, + "loss": 0.3165, + "step": 14698 + }, + { + "epoch": 1.788743535138424, + "grad_norm": 1.9474537372589111, + "learning_rate": 5.672803665581161e-07, + "loss": 0.376, + "step": 14699 + }, + { + "epoch": 1.788865226650441, + "grad_norm": 1.8837029933929443, + "learning_rate": 5.666328744912708e-07, + "loss": 0.3021, + "step": 14700 + }, + { + "epoch": 1.788986918162458, + "grad_norm": 1.8293421268463135, + "learning_rate": 5.659857413835735e-07, + "loss": 0.376, + "step": 14701 + }, + { + "epoch": 1.789108609674475, + "grad_norm": 2.456915855407715, + "learning_rate": 5.653389672596421e-07, + "loss": 0.4254, + "step": 14702 + }, + { + "epoch": 1.7892303011864923, + "grad_norm": 2.629037618637085, + "learning_rate": 5.646925521440949e-07, + "loss": 0.3457, + "step": 14703 + }, + { + "epoch": 1.7893519926985093, + "grad_norm": 1.7283847332000732, + "learning_rate": 5.640464960615255e-07, + "loss": 0.4401, + "step": 14704 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 2.4867005348205566, + "learning_rate": 5.634007990365165e-07, + "loss": 0.3969, + "step": 14705 + }, + { + "epoch": 1.7895953757225433, + "grad_norm": 2.7062764167785645, + "learning_rate": 5.627554610936414e-07, + "loss": 0.4235, + "step": 14706 + }, + { + "epoch": 1.7897170672345604, + "grad_norm": 2.072197675704956, + "learning_rate": 5.621104822574542e-07, + "loss": 0.3981, + "step": 14707 + }, + { + "epoch": 1.7898387587465774, + "grad_norm": 1.6879130601882935, + "learning_rate": 5.614658625524983e-07, + "loss": 0.3516, + "step": 14708 + }, + { + "epoch": 1.7899604502585946, + "grad_norm": 3.3290631771087646, + "learning_rate": 5.608216020033042e-07, + "loss": 0.4062, + "step": 14709 + }, + { + "epoch": 1.7900821417706116, + "grad_norm": 1.9005234241485596, + "learning_rate": 5.601777006343856e-07, + "loss": 0.3903, + "step": 14710 + }, + { + "epoch": 1.7902038332826287, + "grad_norm": 1.6743403673171997, + "learning_rate": 5.59534158470244e-07, + "loss": 0.3932, + "step": 14711 + }, + { + "epoch": 1.7903255247946457, + "grad_norm": 1.645206332206726, + "learning_rate": 5.588909755353711e-07, + "loss": 0.3814, + "step": 14712 + }, + { + "epoch": 1.7904472163066627, + "grad_norm": 2.1297378540039062, + "learning_rate": 5.58248151854236e-07, + "loss": 0.4193, + "step": 14713 + }, + { + "epoch": 1.7905689078186797, + "grad_norm": 1.9844765663146973, + "learning_rate": 5.576056874513025e-07, + "loss": 0.3002, + "step": 14714 + }, + { + "epoch": 1.7906905993306967, + "grad_norm": 2.4359207153320312, + "learning_rate": 5.569635823510178e-07, + "loss": 0.3936, + "step": 14715 + }, + { + "epoch": 1.7908122908427138, + "grad_norm": 3.388141632080078, + "learning_rate": 5.563218365778134e-07, + "loss": 0.3248, + "step": 14716 + }, + { + "epoch": 1.7909339823547308, + "grad_norm": 2.2163338661193848, + "learning_rate": 5.556804501561131e-07, + "loss": 0.3437, + "step": 14717 + }, + { + "epoch": 1.7910556738667478, + "grad_norm": 1.8829245567321777, + "learning_rate": 5.550394231103185e-07, + "loss": 0.4333, + "step": 14718 + }, + { + "epoch": 1.7911773653787648, + "grad_norm": 1.7104471921920776, + "learning_rate": 5.543987554648234e-07, + "loss": 0.3525, + "step": 14719 + }, + { + "epoch": 1.7912990568907818, + "grad_norm": 1.5799250602722168, + "learning_rate": 5.537584472440083e-07, + "loss": 0.3834, + "step": 14720 + }, + { + "epoch": 1.7914207484027989, + "grad_norm": 1.6776342391967773, + "learning_rate": 5.531184984722371e-07, + "loss": 0.3475, + "step": 14721 + }, + { + "epoch": 1.7915424399148159, + "grad_norm": 2.1516518592834473, + "learning_rate": 5.524789091738592e-07, + "loss": 0.3278, + "step": 14722 + }, + { + "epoch": 1.791664131426833, + "grad_norm": 2.270328998565674, + "learning_rate": 5.51839679373215e-07, + "loss": 0.3314, + "step": 14723 + }, + { + "epoch": 1.79178582293885, + "grad_norm": 1.5874958038330078, + "learning_rate": 5.512008090946286e-07, + "loss": 0.3335, + "step": 14724 + }, + { + "epoch": 1.791907514450867, + "grad_norm": 1.5988296270370483, + "learning_rate": 5.505622983624093e-07, + "loss": 0.3681, + "step": 14725 + }, + { + "epoch": 1.792029205962884, + "grad_norm": 2.482724905014038, + "learning_rate": 5.499241472008532e-07, + "loss": 0.3181, + "step": 14726 + }, + { + "epoch": 1.792150897474901, + "grad_norm": 2.6830341815948486, + "learning_rate": 5.49286355634242e-07, + "loss": 0.3056, + "step": 14727 + }, + { + "epoch": 1.7922725889869182, + "grad_norm": 1.734123945236206, + "learning_rate": 5.486489236868497e-07, + "loss": 0.3785, + "step": 14728 + }, + { + "epoch": 1.7923942804989352, + "grad_norm": 2.384275197982788, + "learning_rate": 5.48011851382928e-07, + "loss": 0.3665, + "step": 14729 + }, + { + "epoch": 1.7925159720109523, + "grad_norm": 1.6589666604995728, + "learning_rate": 5.473751387467196e-07, + "loss": 0.3716, + "step": 14730 + }, + { + "epoch": 1.7926376635229693, + "grad_norm": 2.1683309078216553, + "learning_rate": 5.467387858024532e-07, + "loss": 0.3661, + "step": 14731 + }, + { + "epoch": 1.7927593550349863, + "grad_norm": 1.929255723953247, + "learning_rate": 5.461027925743422e-07, + "loss": 0.3525, + "step": 14732 + }, + { + "epoch": 1.7928810465470033, + "grad_norm": 3.0995941162109375, + "learning_rate": 5.4546715908659e-07, + "loss": 0.2643, + "step": 14733 + }, + { + "epoch": 1.7930027380590203, + "grad_norm": 2.034583568572998, + "learning_rate": 5.448318853633827e-07, + "loss": 0.3769, + "step": 14734 + }, + { + "epoch": 1.7931244295710376, + "grad_norm": 2.0144851207733154, + "learning_rate": 5.441969714288919e-07, + "loss": 0.3886, + "step": 14735 + }, + { + "epoch": 1.7932461210830546, + "grad_norm": 1.7579948902130127, + "learning_rate": 5.435624173072807e-07, + "loss": 0.295, + "step": 14736 + }, + { + "epoch": 1.7933678125950716, + "grad_norm": 1.5953644514083862, + "learning_rate": 5.429282230226918e-07, + "loss": 0.382, + "step": 14737 + }, + { + "epoch": 1.7934895041070886, + "grad_norm": 1.584346890449524, + "learning_rate": 5.422943885992582e-07, + "loss": 0.3248, + "step": 14738 + }, + { + "epoch": 1.7936111956191056, + "grad_norm": 1.7310121059417725, + "learning_rate": 5.416609140611006e-07, + "loss": 0.3619, + "step": 14739 + }, + { + "epoch": 1.7937328871311227, + "grad_norm": 3.3062760829925537, + "learning_rate": 5.41027799432321e-07, + "loss": 0.4008, + "step": 14740 + }, + { + "epoch": 1.7938545786431397, + "grad_norm": 2.234459638595581, + "learning_rate": 5.403950447370154e-07, + "loss": 0.3899, + "step": 14741 + }, + { + "epoch": 1.7939762701551567, + "grad_norm": 1.7101892232894897, + "learning_rate": 5.39762649999258e-07, + "loss": 0.3742, + "step": 14742 + }, + { + "epoch": 1.7940979616671737, + "grad_norm": 3.6123085021972656, + "learning_rate": 5.39130615243112e-07, + "loss": 0.4417, + "step": 14743 + }, + { + "epoch": 1.7942196531791907, + "grad_norm": 1.5546348094940186, + "learning_rate": 5.3849894049263e-07, + "loss": 0.381, + "step": 14744 + }, + { + "epoch": 1.7943413446912078, + "grad_norm": 1.6390193700790405, + "learning_rate": 5.378676257718474e-07, + "loss": 0.3573, + "step": 14745 + }, + { + "epoch": 1.7944630362032248, + "grad_norm": 1.231501817703247, + "learning_rate": 5.372366711047871e-07, + "loss": 0.3133, + "step": 14746 + }, + { + "epoch": 1.7945847277152418, + "grad_norm": 1.6342763900756836, + "learning_rate": 5.366060765154602e-07, + "loss": 0.3301, + "step": 14747 + }, + { + "epoch": 1.7947064192272588, + "grad_norm": 3.78139066696167, + "learning_rate": 5.359758420278571e-07, + "loss": 0.4606, + "step": 14748 + }, + { + "epoch": 1.7948281107392758, + "grad_norm": 1.6679884195327759, + "learning_rate": 5.353459676659645e-07, + "loss": 0.347, + "step": 14749 + }, + { + "epoch": 1.7949498022512929, + "grad_norm": 1.5740777254104614, + "learning_rate": 5.347164534537486e-07, + "loss": 0.3313, + "step": 14750 + }, + { + "epoch": 1.7950714937633099, + "grad_norm": 1.7896395921707153, + "learning_rate": 5.340872994151625e-07, + "loss": 0.3734, + "step": 14751 + }, + { + "epoch": 1.795193185275327, + "grad_norm": 3.9227335453033447, + "learning_rate": 5.334585055741492e-07, + "loss": 0.4533, + "step": 14752 + }, + { + "epoch": 1.795314876787344, + "grad_norm": 1.646697998046875, + "learning_rate": 5.328300719546342e-07, + "loss": 0.3643, + "step": 14753 + }, + { + "epoch": 1.7954365682993612, + "grad_norm": 1.4338403940200806, + "learning_rate": 5.322019985805294e-07, + "loss": 0.3384, + "step": 14754 + }, + { + "epoch": 1.7955582598113782, + "grad_norm": 1.9854618310928345, + "learning_rate": 5.315742854757378e-07, + "loss": 0.4001, + "step": 14755 + }, + { + "epoch": 1.7956799513233952, + "grad_norm": 1.8640873432159424, + "learning_rate": 5.309469326641437e-07, + "loss": 0.3835, + "step": 14756 + }, + { + "epoch": 1.7958016428354122, + "grad_norm": 2.356013059616089, + "learning_rate": 5.303199401696157e-07, + "loss": 0.4477, + "step": 14757 + }, + { + "epoch": 1.7959233343474292, + "grad_norm": 2.144286632537842, + "learning_rate": 5.296933080160194e-07, + "loss": 0.3604, + "step": 14758 + }, + { + "epoch": 1.7960450258594463, + "grad_norm": 1.867668628692627, + "learning_rate": 5.29067036227191e-07, + "loss": 0.4201, + "step": 14759 + }, + { + "epoch": 1.7961667173714635, + "grad_norm": 2.3393490314483643, + "learning_rate": 5.284411248269683e-07, + "loss": 0.2951, + "step": 14760 + }, + { + "epoch": 1.7962884088834805, + "grad_norm": 2.8287689685821533, + "learning_rate": 5.278155738391655e-07, + "loss": 0.3541, + "step": 14761 + }, + { + "epoch": 1.7964101003954975, + "grad_norm": 1.6722416877746582, + "learning_rate": 5.271903832875847e-07, + "loss": 0.3737, + "step": 14762 + }, + { + "epoch": 1.7965317919075146, + "grad_norm": 1.562496304512024, + "learning_rate": 5.265655531960189e-07, + "loss": 0.3359, + "step": 14763 + }, + { + "epoch": 1.7966534834195316, + "grad_norm": 2.3614540100097656, + "learning_rate": 5.259410835882429e-07, + "loss": 0.3255, + "step": 14764 + }, + { + "epoch": 1.7967751749315486, + "grad_norm": 1.6277563571929932, + "learning_rate": 5.253169744880182e-07, + "loss": 0.3295, + "step": 14765 + }, + { + "epoch": 1.7968968664435656, + "grad_norm": 2.1053812503814697, + "learning_rate": 5.24693225919094e-07, + "loss": 0.4114, + "step": 14766 + }, + { + "epoch": 1.7970185579555826, + "grad_norm": 1.4443774223327637, + "learning_rate": 5.240698379052067e-07, + "loss": 0.3216, + "step": 14767 + }, + { + "epoch": 1.7971402494675996, + "grad_norm": 3.4049203395843506, + "learning_rate": 5.234468104700741e-07, + "loss": 0.3822, + "step": 14768 + }, + { + "epoch": 1.7972619409796167, + "grad_norm": 1.7134525775909424, + "learning_rate": 5.228241436374071e-07, + "loss": 0.3893, + "step": 14769 + }, + { + "epoch": 1.7973836324916337, + "grad_norm": 1.5483367443084717, + "learning_rate": 5.22201837430899e-07, + "loss": 0.3281, + "step": 14770 + }, + { + "epoch": 1.7975053240036507, + "grad_norm": 1.4085713624954224, + "learning_rate": 5.215798918742276e-07, + "loss": 0.3698, + "step": 14771 + }, + { + "epoch": 1.7976270155156677, + "grad_norm": 1.5848411321640015, + "learning_rate": 5.209583069910617e-07, + "loss": 0.3478, + "step": 14772 + }, + { + "epoch": 1.7977487070276847, + "grad_norm": 1.6619746685028076, + "learning_rate": 5.203370828050503e-07, + "loss": 0.3173, + "step": 14773 + }, + { + "epoch": 1.7978703985397018, + "grad_norm": 1.9243499040603638, + "learning_rate": 5.197162193398375e-07, + "loss": 0.4045, + "step": 14774 + }, + { + "epoch": 1.7979920900517188, + "grad_norm": 1.8584126234054565, + "learning_rate": 5.190957166190447e-07, + "loss": 0.3574, + "step": 14775 + }, + { + "epoch": 1.7981137815637358, + "grad_norm": 1.852081537246704, + "learning_rate": 5.184755746662828e-07, + "loss": 0.3889, + "step": 14776 + }, + { + "epoch": 1.7982354730757528, + "grad_norm": 1.581626057624817, + "learning_rate": 5.178557935051532e-07, + "loss": 0.3373, + "step": 14777 + }, + { + "epoch": 1.7983571645877698, + "grad_norm": 1.7249395847320557, + "learning_rate": 5.172363731592367e-07, + "loss": 0.401, + "step": 14778 + }, + { + "epoch": 1.798478856099787, + "grad_norm": 2.10213303565979, + "learning_rate": 5.166173136521035e-07, + "loss": 0.3693, + "step": 14779 + }, + { + "epoch": 1.798600547611804, + "grad_norm": 1.5888627767562866, + "learning_rate": 5.159986150073126e-07, + "loss": 0.3742, + "step": 14780 + }, + { + "epoch": 1.7987222391238211, + "grad_norm": 3.600254774093628, + "learning_rate": 5.15380277248404e-07, + "loss": 0.4371, + "step": 14781 + }, + { + "epoch": 1.7988439306358381, + "grad_norm": 4.297314643859863, + "learning_rate": 5.147623003989089e-07, + "loss": 0.4079, + "step": 14782 + }, + { + "epoch": 1.7989656221478552, + "grad_norm": 1.860015869140625, + "learning_rate": 5.141446844823417e-07, + "loss": 0.3244, + "step": 14783 + }, + { + "epoch": 1.7990873136598722, + "grad_norm": 2.0766143798828125, + "learning_rate": 5.135274295222026e-07, + "loss": 0.4172, + "step": 14784 + }, + { + "epoch": 1.7992090051718894, + "grad_norm": 3.5105597972869873, + "learning_rate": 5.129105355419816e-07, + "loss": 0.4489, + "step": 14785 + }, + { + "epoch": 1.7993306966839064, + "grad_norm": 1.5098934173583984, + "learning_rate": 5.122940025651523e-07, + "loss": 0.3369, + "step": 14786 + }, + { + "epoch": 1.7994523881959235, + "grad_norm": 3.6979687213897705, + "learning_rate": 5.116778306151737e-07, + "loss": 0.4223, + "step": 14787 + }, + { + "epoch": 1.7995740797079405, + "grad_norm": 1.7551203966140747, + "learning_rate": 5.110620197154947e-07, + "loss": 0.3753, + "step": 14788 + }, + { + "epoch": 1.7996957712199575, + "grad_norm": 1.981912612915039, + "learning_rate": 5.104465698895456e-07, + "loss": 0.3695, + "step": 14789 + }, + { + "epoch": 1.7998174627319745, + "grad_norm": 1.7611268758773804, + "learning_rate": 5.098314811607463e-07, + "loss": 0.3461, + "step": 14790 + }, + { + "epoch": 1.7999391542439915, + "grad_norm": 2.163121461868286, + "learning_rate": 5.09216753552505e-07, + "loss": 0.4022, + "step": 14791 + }, + { + "epoch": 1.8000608457560086, + "grad_norm": 1.6740963459014893, + "learning_rate": 5.086023870882084e-07, + "loss": 0.3712, + "step": 14792 + }, + { + "epoch": 1.8001825372680256, + "grad_norm": 1.6616315841674805, + "learning_rate": 5.079883817912401e-07, + "loss": 0.3612, + "step": 14793 + }, + { + "epoch": 1.8003042287800426, + "grad_norm": 2.0691816806793213, + "learning_rate": 5.073747376849602e-07, + "loss": 0.3233, + "step": 14794 + }, + { + "epoch": 1.8004259202920596, + "grad_norm": 1.9312821626663208, + "learning_rate": 5.06761454792718e-07, + "loss": 0.3963, + "step": 14795 + }, + { + "epoch": 1.8005476118040766, + "grad_norm": 1.9697308540344238, + "learning_rate": 5.061485331378546e-07, + "loss": 0.3656, + "step": 14796 + }, + { + "epoch": 1.8006693033160937, + "grad_norm": 1.6139159202575684, + "learning_rate": 5.055359727436904e-07, + "loss": 0.329, + "step": 14797 + }, + { + "epoch": 1.8007909948281107, + "grad_norm": 2.5449719429016113, + "learning_rate": 5.049237736335343e-07, + "loss": 0.3022, + "step": 14798 + }, + { + "epoch": 1.8009126863401277, + "grad_norm": 2.2044992446899414, + "learning_rate": 5.043119358306825e-07, + "loss": 0.3018, + "step": 14799 + }, + { + "epoch": 1.8010343778521447, + "grad_norm": 2.2404911518096924, + "learning_rate": 5.037004593584161e-07, + "loss": 0.3494, + "step": 14800 + }, + { + "epoch": 1.8011560693641617, + "grad_norm": 1.726481318473816, + "learning_rate": 5.030893442400042e-07, + "loss": 0.3636, + "step": 14801 + }, + { + "epoch": 1.8012777608761787, + "grad_norm": 3.1688575744628906, + "learning_rate": 5.024785904987006e-07, + "loss": 0.5256, + "step": 14802 + }, + { + "epoch": 1.8013994523881958, + "grad_norm": 1.8053702116012573, + "learning_rate": 5.018681981577445e-07, + "loss": 0.4006, + "step": 14803 + }, + { + "epoch": 1.801521143900213, + "grad_norm": 2.033916711807251, + "learning_rate": 5.012581672403649e-07, + "loss": 0.3265, + "step": 14804 + }, + { + "epoch": 1.80164283541223, + "grad_norm": 2.071986436843872, + "learning_rate": 5.006484977697735e-07, + "loss": 0.4078, + "step": 14805 + }, + { + "epoch": 1.801764526924247, + "grad_norm": 1.6140072345733643, + "learning_rate": 5.00039189769167e-07, + "loss": 0.3562, + "step": 14806 + }, + { + "epoch": 1.801886218436264, + "grad_norm": 1.711897373199463, + "learning_rate": 4.994302432617348e-07, + "loss": 0.4249, + "step": 14807 + }, + { + "epoch": 1.802007909948281, + "grad_norm": 2.0763955116271973, + "learning_rate": 4.988216582706451e-07, + "loss": 0.3749, + "step": 14808 + }, + { + "epoch": 1.802129601460298, + "grad_norm": 1.9720592498779297, + "learning_rate": 4.982134348190604e-07, + "loss": 0.4262, + "step": 14809 + }, + { + "epoch": 1.8022512929723153, + "grad_norm": 1.66851007938385, + "learning_rate": 4.976055729301211e-07, + "loss": 0.3761, + "step": 14810 + }, + { + "epoch": 1.8023729844843324, + "grad_norm": 1.9438172578811646, + "learning_rate": 4.969980726269574e-07, + "loss": 0.3898, + "step": 14811 + }, + { + "epoch": 1.8024946759963494, + "grad_norm": 2.012744903564453, + "learning_rate": 4.963909339326877e-07, + "loss": 0.3273, + "step": 14812 + }, + { + "epoch": 1.8026163675083664, + "grad_norm": 1.625656008720398, + "learning_rate": 4.957841568704158e-07, + "loss": 0.3645, + "step": 14813 + }, + { + "epoch": 1.8027380590203834, + "grad_norm": 1.7917683124542236, + "learning_rate": 4.951777414632275e-07, + "loss": 0.4012, + "step": 14814 + }, + { + "epoch": 1.8028597505324004, + "grad_norm": 1.8424171209335327, + "learning_rate": 4.945716877342011e-07, + "loss": 0.3865, + "step": 14815 + }, + { + "epoch": 1.8029814420444175, + "grad_norm": 1.930957555770874, + "learning_rate": 4.939659957063969e-07, + "loss": 0.3136, + "step": 14816 + }, + { + "epoch": 1.8031031335564345, + "grad_norm": 1.6637803316116333, + "learning_rate": 4.933606654028633e-07, + "loss": 0.3502, + "step": 14817 + }, + { + "epoch": 1.8032248250684515, + "grad_norm": 2.7403173446655273, + "learning_rate": 4.927556968466351e-07, + "loss": 0.3471, + "step": 14818 + }, + { + "epoch": 1.8033465165804685, + "grad_norm": 1.8273621797561646, + "learning_rate": 4.921510900607285e-07, + "loss": 0.3741, + "step": 14819 + }, + { + "epoch": 1.8034682080924855, + "grad_norm": 1.7146549224853516, + "learning_rate": 4.915468450681559e-07, + "loss": 0.3427, + "step": 14820 + }, + { + "epoch": 1.8035898996045026, + "grad_norm": 1.6572078466415405, + "learning_rate": 4.909429618919059e-07, + "loss": 0.3271, + "step": 14821 + }, + { + "epoch": 1.8037115911165196, + "grad_norm": 1.6790177822113037, + "learning_rate": 4.903394405549589e-07, + "loss": 0.3641, + "step": 14822 + }, + { + "epoch": 1.8038332826285366, + "grad_norm": 2.3604867458343506, + "learning_rate": 4.897362810802808e-07, + "loss": 0.4224, + "step": 14823 + }, + { + "epoch": 1.8039549741405536, + "grad_norm": 3.3323137760162354, + "learning_rate": 4.891334834908224e-07, + "loss": 0.4482, + "step": 14824 + }, + { + "epoch": 1.8040766656525706, + "grad_norm": 2.1242923736572266, + "learning_rate": 4.885310478095196e-07, + "loss": 0.3864, + "step": 14825 + }, + { + "epoch": 1.8041983571645877, + "grad_norm": 2.0595483779907227, + "learning_rate": 4.879289740592996e-07, + "loss": 0.4129, + "step": 14826 + }, + { + "epoch": 1.8043200486766047, + "grad_norm": 1.8756804466247559, + "learning_rate": 4.873272622630709e-07, + "loss": 0.3411, + "step": 14827 + }, + { + "epoch": 1.8044417401886217, + "grad_norm": 2.0814154148101807, + "learning_rate": 4.867259124437307e-07, + "loss": 0.4108, + "step": 14828 + }, + { + "epoch": 1.804563431700639, + "grad_norm": 1.3498259782791138, + "learning_rate": 4.861249246241606e-07, + "loss": 0.2879, + "step": 14829 + }, + { + "epoch": 1.804685123212656, + "grad_norm": 2.0821359157562256, + "learning_rate": 4.855242988272279e-07, + "loss": 0.3517, + "step": 14830 + }, + { + "epoch": 1.804806814724673, + "grad_norm": 1.918953776359558, + "learning_rate": 4.849240350757911e-07, + "loss": 0.3694, + "step": 14831 + }, + { + "epoch": 1.80492850623669, + "grad_norm": 2.6444332599639893, + "learning_rate": 4.843241333926907e-07, + "loss": 0.4012, + "step": 14832 + }, + { + "epoch": 1.805050197748707, + "grad_norm": 1.9890787601470947, + "learning_rate": 4.837245938007518e-07, + "loss": 0.3355, + "step": 14833 + }, + { + "epoch": 1.805171889260724, + "grad_norm": 2.329502582550049, + "learning_rate": 4.831254163227906e-07, + "loss": 0.3957, + "step": 14834 + }, + { + "epoch": 1.805293580772741, + "grad_norm": 2.8570895195007324, + "learning_rate": 4.825266009816054e-07, + "loss": 0.3137, + "step": 14835 + }, + { + "epoch": 1.8054152722847583, + "grad_norm": 1.8884459733963013, + "learning_rate": 4.819281477999826e-07, + "loss": 0.3874, + "step": 14836 + }, + { + "epoch": 1.8055369637967753, + "grad_norm": 2.1398398876190186, + "learning_rate": 4.813300568006973e-07, + "loss": 0.3895, + "step": 14837 + }, + { + "epoch": 1.8056586553087923, + "grad_norm": 4.1433186531066895, + "learning_rate": 4.807323280065046e-07, + "loss": 0.4288, + "step": 14838 + }, + { + "epoch": 1.8057803468208093, + "grad_norm": 1.590999960899353, + "learning_rate": 4.801349614401518e-07, + "loss": 0.3796, + "step": 14839 + }, + { + "epoch": 1.8059020383328264, + "grad_norm": 3.455644369125366, + "learning_rate": 4.795379571243675e-07, + "loss": 0.2993, + "step": 14840 + }, + { + "epoch": 1.8060237298448434, + "grad_norm": 2.1806375980377197, + "learning_rate": 4.7894131508187e-07, + "loss": 0.3579, + "step": 14841 + }, + { + "epoch": 1.8061454213568604, + "grad_norm": 1.8905553817749023, + "learning_rate": 4.783450353353636e-07, + "loss": 0.3729, + "step": 14842 + }, + { + "epoch": 1.8062671128688774, + "grad_norm": 1.8146286010742188, + "learning_rate": 4.777491179075378e-07, + "loss": 0.3684, + "step": 14843 + }, + { + "epoch": 1.8063888043808944, + "grad_norm": 2.0119943618774414, + "learning_rate": 4.771535628210655e-07, + "loss": 0.3966, + "step": 14844 + }, + { + "epoch": 1.8065104958929115, + "grad_norm": 2.6113085746765137, + "learning_rate": 4.7655837009861427e-07, + "loss": 0.4367, + "step": 14845 + }, + { + "epoch": 1.8066321874049285, + "grad_norm": 2.5403854846954346, + "learning_rate": 4.7596353976282814e-07, + "loss": 0.3695, + "step": 14846 + }, + { + "epoch": 1.8067538789169455, + "grad_norm": 2.138422727584839, + "learning_rate": 4.753690718363424e-07, + "loss": 0.294, + "step": 14847 + }, + { + "epoch": 1.8068755704289625, + "grad_norm": 3.077648162841797, + "learning_rate": 4.7477496634178e-07, + "loss": 0.3872, + "step": 14848 + }, + { + "epoch": 1.8069972619409795, + "grad_norm": 1.725256085395813, + "learning_rate": 4.741812233017462e-07, + "loss": 0.3501, + "step": 14849 + }, + { + "epoch": 1.8071189534529966, + "grad_norm": 1.8914635181427002, + "learning_rate": 4.7358784273883407e-07, + "loss": 0.3695, + "step": 14850 + }, + { + "epoch": 1.8072406449650136, + "grad_norm": 1.6651238203048706, + "learning_rate": 4.729948246756222e-07, + "loss": 0.3612, + "step": 14851 + }, + { + "epoch": 1.8073623364770306, + "grad_norm": 1.5391969680786133, + "learning_rate": 4.7240216913467697e-07, + "loss": 0.3509, + "step": 14852 + }, + { + "epoch": 1.8074840279890476, + "grad_norm": 1.8276183605194092, + "learning_rate": 4.7180987613855147e-07, + "loss": 0.3932, + "step": 14853 + }, + { + "epoch": 1.8076057195010649, + "grad_norm": 1.6572011709213257, + "learning_rate": 4.712179457097821e-07, + "loss": 0.3907, + "step": 14854 + }, + { + "epoch": 1.8077274110130819, + "grad_norm": 2.9096972942352295, + "learning_rate": 4.706263778708919e-07, + "loss": 0.3413, + "step": 14855 + }, + { + "epoch": 1.807849102525099, + "grad_norm": 2.8286430835723877, + "learning_rate": 4.7003517264439413e-07, + "loss": 0.4235, + "step": 14856 + }, + { + "epoch": 1.807970794037116, + "grad_norm": 1.8517764806747437, + "learning_rate": 4.69444330052784e-07, + "loss": 0.3491, + "step": 14857 + }, + { + "epoch": 1.808092485549133, + "grad_norm": 1.8784458637237549, + "learning_rate": 4.6885385011854243e-07, + "loss": 0.3562, + "step": 14858 + }, + { + "epoch": 1.80821417706115, + "grad_norm": 2.420989751815796, + "learning_rate": 4.682637328641426e-07, + "loss": 0.43, + "step": 14859 + }, + { + "epoch": 1.808335868573167, + "grad_norm": 2.6593501567840576, + "learning_rate": 4.6767397831203543e-07, + "loss": 0.2992, + "step": 14860 + }, + { + "epoch": 1.8084575600851842, + "grad_norm": 3.873929977416992, + "learning_rate": 4.6708458648466625e-07, + "loss": 0.4299, + "step": 14861 + }, + { + "epoch": 1.8085792515972012, + "grad_norm": 2.11767840385437, + "learning_rate": 4.664955574044616e-07, + "loss": 0.4236, + "step": 14862 + }, + { + "epoch": 1.8087009431092183, + "grad_norm": 1.478190541267395, + "learning_rate": 4.6590689109383136e-07, + "loss": 0.3178, + "step": 14863 + }, + { + "epoch": 1.8088226346212353, + "grad_norm": 1.6360585689544678, + "learning_rate": 4.653185875751798e-07, + "loss": 0.3348, + "step": 14864 + }, + { + "epoch": 1.8089443261332523, + "grad_norm": 2.4978084564208984, + "learning_rate": 4.647306468708912e-07, + "loss": 0.3962, + "step": 14865 + }, + { + "epoch": 1.8090660176452693, + "grad_norm": 2.796816825866699, + "learning_rate": 4.641430690033377e-07, + "loss": 0.292, + "step": 14866 + }, + { + "epoch": 1.8091877091572863, + "grad_norm": 1.680859923362732, + "learning_rate": 4.635558539948803e-07, + "loss": 0.3859, + "step": 14867 + }, + { + "epoch": 1.8093094006693033, + "grad_norm": 2.0321059226989746, + "learning_rate": 4.629690018678601e-07, + "loss": 0.3246, + "step": 14868 + }, + { + "epoch": 1.8094310921813204, + "grad_norm": 1.5931979417800903, + "learning_rate": 4.6238251264461134e-07, + "loss": 0.3724, + "step": 14869 + }, + { + "epoch": 1.8095527836933374, + "grad_norm": 1.7058771848678589, + "learning_rate": 4.617963863474495e-07, + "loss": 0.3151, + "step": 14870 + }, + { + "epoch": 1.8096744752053544, + "grad_norm": 1.4641793966293335, + "learning_rate": 4.612106229986768e-07, + "loss": 0.3245, + "step": 14871 + }, + { + "epoch": 1.8097961667173714, + "grad_norm": 1.888519525527954, + "learning_rate": 4.606252226205854e-07, + "loss": 0.3844, + "step": 14872 + }, + { + "epoch": 1.8099178582293884, + "grad_norm": 1.5693230628967285, + "learning_rate": 4.6004018523545077e-07, + "loss": 0.374, + "step": 14873 + }, + { + "epoch": 1.8100395497414055, + "grad_norm": 1.6680448055267334, + "learning_rate": 4.5945551086553297e-07, + "loss": 0.3775, + "step": 14874 + }, + { + "epoch": 1.8101612412534225, + "grad_norm": 2.2927894592285156, + "learning_rate": 4.588711995330808e-07, + "loss": 0.3812, + "step": 14875 + }, + { + "epoch": 1.8102829327654395, + "grad_norm": 2.147362232208252, + "learning_rate": 4.5828725126032646e-07, + "loss": 0.3567, + "step": 14876 + }, + { + "epoch": 1.8104046242774565, + "grad_norm": 2.65213942527771, + "learning_rate": 4.5770366606949443e-07, + "loss": 0.3201, + "step": 14877 + }, + { + "epoch": 1.8105263157894735, + "grad_norm": 2.5378568172454834, + "learning_rate": 4.5712044398279033e-07, + "loss": 0.316, + "step": 14878 + }, + { + "epoch": 1.8106480073014906, + "grad_norm": 2.001706838607788, + "learning_rate": 4.56537585022403e-07, + "loss": 0.3454, + "step": 14879 + }, + { + "epoch": 1.8107696988135078, + "grad_norm": 1.8229354619979858, + "learning_rate": 4.559550892105169e-07, + "loss": 0.3525, + "step": 14880 + }, + { + "epoch": 1.8108913903255248, + "grad_norm": 1.6335780620574951, + "learning_rate": 4.5537295656929435e-07, + "loss": 0.3956, + "step": 14881 + }, + { + "epoch": 1.8110130818375418, + "grad_norm": 2.341587543487549, + "learning_rate": 4.5479118712088656e-07, + "loss": 0.3871, + "step": 14882 + }, + { + "epoch": 1.8111347733495589, + "grad_norm": 2.632615089416504, + "learning_rate": 4.5420978088743127e-07, + "loss": 0.3112, + "step": 14883 + }, + { + "epoch": 1.8112564648615759, + "grad_norm": 1.8785203695297241, + "learning_rate": 4.536287378910542e-07, + "loss": 0.4, + "step": 14884 + }, + { + "epoch": 1.811378156373593, + "grad_norm": 1.6285902261734009, + "learning_rate": 4.530480581538632e-07, + "loss": 0.3821, + "step": 14885 + }, + { + "epoch": 1.8114998478856101, + "grad_norm": 1.6032397747039795, + "learning_rate": 4.524677416979539e-07, + "loss": 0.3828, + "step": 14886 + }, + { + "epoch": 1.8116215393976272, + "grad_norm": 1.7675167322158813, + "learning_rate": 4.518877885454087e-07, + "loss": 0.3975, + "step": 14887 + }, + { + "epoch": 1.8117432309096442, + "grad_norm": 3.0974245071411133, + "learning_rate": 4.5130819871829766e-07, + "loss": 0.3921, + "step": 14888 + }, + { + "epoch": 1.8118649224216612, + "grad_norm": 2.473191976547241, + "learning_rate": 4.507289722386743e-07, + "loss": 0.3729, + "step": 14889 + }, + { + "epoch": 1.8119866139336782, + "grad_norm": 1.5676767826080322, + "learning_rate": 4.501501091285787e-07, + "loss": 0.3165, + "step": 14890 + }, + { + "epoch": 1.8121083054456952, + "grad_norm": 1.580743432044983, + "learning_rate": 4.4957160941003884e-07, + "loss": 0.3757, + "step": 14891 + }, + { + "epoch": 1.8122299969577123, + "grad_norm": 1.9960728883743286, + "learning_rate": 4.4899347310506824e-07, + "loss": 0.3213, + "step": 14892 + }, + { + "epoch": 1.8123516884697293, + "grad_norm": 2.9530646800994873, + "learning_rate": 4.4841570023566374e-07, + "loss": 0.4389, + "step": 14893 + }, + { + "epoch": 1.8124733799817463, + "grad_norm": 2.1559576988220215, + "learning_rate": 4.4783829082381435e-07, + "loss": 0.458, + "step": 14894 + }, + { + "epoch": 1.8125950714937633, + "grad_norm": 1.826980471611023, + "learning_rate": 4.472612448914904e-07, + "loss": 0.3778, + "step": 14895 + }, + { + "epoch": 1.8127167630057803, + "grad_norm": 1.6393816471099854, + "learning_rate": 4.466845624606464e-07, + "loss": 0.3268, + "step": 14896 + }, + { + "epoch": 1.8128384545177973, + "grad_norm": 1.5732077360153198, + "learning_rate": 4.461082435532327e-07, + "loss": 0.3603, + "step": 14897 + }, + { + "epoch": 1.8129601460298144, + "grad_norm": 1.806353211402893, + "learning_rate": 4.455322881911728e-07, + "loss": 0.3734, + "step": 14898 + }, + { + "epoch": 1.8130818375418314, + "grad_norm": 1.83968186378479, + "learning_rate": 4.4495669639638803e-07, + "loss": 0.3991, + "step": 14899 + }, + { + "epoch": 1.8132035290538484, + "grad_norm": 1.6772584915161133, + "learning_rate": 4.4438146819077765e-07, + "loss": 0.3343, + "step": 14900 + }, + { + "epoch": 1.8133252205658654, + "grad_norm": 1.7154022455215454, + "learning_rate": 4.4380660359623074e-07, + "loss": 0.3976, + "step": 14901 + }, + { + "epoch": 1.8134469120778824, + "grad_norm": 1.4220342636108398, + "learning_rate": 4.432321026346242e-07, + "loss": 0.3425, + "step": 14902 + }, + { + "epoch": 1.8135686035898995, + "grad_norm": 2.66960072517395, + "learning_rate": 4.4265796532781737e-07, + "loss": 0.3369, + "step": 14903 + }, + { + "epoch": 1.8136902951019165, + "grad_norm": 2.686824321746826, + "learning_rate": 4.4208419169765705e-07, + "loss": 0.3861, + "step": 14904 + }, + { + "epoch": 1.8138119866139337, + "grad_norm": 1.9637571573257446, + "learning_rate": 4.4151078176597694e-07, + "loss": 0.3985, + "step": 14905 + }, + { + "epoch": 1.8139336781259507, + "grad_norm": 2.498554229736328, + "learning_rate": 4.409377355545974e-07, + "loss": 0.3809, + "step": 14906 + }, + { + "epoch": 1.8140553696379678, + "grad_norm": 3.8453824520111084, + "learning_rate": 4.4036505308532207e-07, + "loss": 0.3014, + "step": 14907 + }, + { + "epoch": 1.8141770611499848, + "grad_norm": 1.4222275018692017, + "learning_rate": 4.3979273437994575e-07, + "loss": 0.3955, + "step": 14908 + }, + { + "epoch": 1.8142987526620018, + "grad_norm": 2.0273361206054688, + "learning_rate": 4.3922077946024213e-07, + "loss": 0.4061, + "step": 14909 + }, + { + "epoch": 1.8144204441740188, + "grad_norm": 1.940845251083374, + "learning_rate": 4.3864918834797934e-07, + "loss": 0.4214, + "step": 14910 + }, + { + "epoch": 1.814542135686036, + "grad_norm": 2.002195119857788, + "learning_rate": 4.380779610649055e-07, + "loss": 0.3415, + "step": 14911 + }, + { + "epoch": 1.814663827198053, + "grad_norm": 1.8036713600158691, + "learning_rate": 4.3750709763275554e-07, + "loss": 0.3641, + "step": 14912 + }, + { + "epoch": 1.81478551871007, + "grad_norm": 1.459445834159851, + "learning_rate": 4.369365980732565e-07, + "loss": 0.3373, + "step": 14913 + }, + { + "epoch": 1.8149072102220871, + "grad_norm": 1.7466773986816406, + "learning_rate": 4.363664624081132e-07, + "loss": 0.3968, + "step": 14914 + }, + { + "epoch": 1.8150289017341041, + "grad_norm": 2.2615108489990234, + "learning_rate": 4.357966906590205e-07, + "loss": 0.3276, + "step": 14915 + }, + { + "epoch": 1.8151505932461212, + "grad_norm": 1.3322075605392456, + "learning_rate": 4.3522728284766224e-07, + "loss": 0.3451, + "step": 14916 + }, + { + "epoch": 1.8152722847581382, + "grad_norm": 2.68404221534729, + "learning_rate": 4.3465823899570323e-07, + "loss": 0.4284, + "step": 14917 + }, + { + "epoch": 1.8153939762701552, + "grad_norm": 3.1832666397094727, + "learning_rate": 4.340895591247962e-07, + "loss": 0.3442, + "step": 14918 + }, + { + "epoch": 1.8155156677821722, + "grad_norm": 1.6485973596572876, + "learning_rate": 4.3352124325658385e-07, + "loss": 0.4397, + "step": 14919 + }, + { + "epoch": 1.8156373592941892, + "grad_norm": 2.054135322570801, + "learning_rate": 4.3295329141268885e-07, + "loss": 0.3407, + "step": 14920 + }, + { + "epoch": 1.8157590508062063, + "grad_norm": 2.019174098968506, + "learning_rate": 4.323857036147239e-07, + "loss": 0.3445, + "step": 14921 + }, + { + "epoch": 1.8158807423182233, + "grad_norm": 1.528096079826355, + "learning_rate": 4.3181847988428726e-07, + "loss": 0.357, + "step": 14922 + }, + { + "epoch": 1.8160024338302403, + "grad_norm": 1.6700773239135742, + "learning_rate": 4.3125162024296065e-07, + "loss": 0.359, + "step": 14923 + }, + { + "epoch": 1.8161241253422573, + "grad_norm": 1.8885477781295776, + "learning_rate": 4.306851247123178e-07, + "loss": 0.4057, + "step": 14924 + }, + { + "epoch": 1.8162458168542743, + "grad_norm": 1.8417980670928955, + "learning_rate": 4.3011899331391273e-07, + "loss": 0.3414, + "step": 14925 + }, + { + "epoch": 1.8163675083662914, + "grad_norm": 2.0142858028411865, + "learning_rate": 4.2955322606928807e-07, + "loss": 0.4231, + "step": 14926 + }, + { + "epoch": 1.8164891998783084, + "grad_norm": 1.8136248588562012, + "learning_rate": 4.2898782299997445e-07, + "loss": 0.393, + "step": 14927 + }, + { + "epoch": 1.8166108913903254, + "grad_norm": 1.5395504236221313, + "learning_rate": 4.284227841274824e-07, + "loss": 0.3433, + "step": 14928 + }, + { + "epoch": 1.8167325829023424, + "grad_norm": 1.5359646081924438, + "learning_rate": 4.278581094733181e-07, + "loss": 0.3847, + "step": 14929 + }, + { + "epoch": 1.8168542744143596, + "grad_norm": 1.5286551713943481, + "learning_rate": 4.272937990589654e-07, + "loss": 0.3432, + "step": 14930 + }, + { + "epoch": 1.8169759659263767, + "grad_norm": 2.710171699523926, + "learning_rate": 4.267298529058983e-07, + "loss": 0.428, + "step": 14931 + }, + { + "epoch": 1.8170976574383937, + "grad_norm": 1.830812692642212, + "learning_rate": 4.2616627103557516e-07, + "loss": 0.3591, + "step": 14932 + }, + { + "epoch": 1.8172193489504107, + "grad_norm": 1.8068969249725342, + "learning_rate": 4.256030534694422e-07, + "loss": 0.3809, + "step": 14933 + }, + { + "epoch": 1.8173410404624277, + "grad_norm": 1.635646939277649, + "learning_rate": 4.250402002289311e-07, + "loss": 0.3444, + "step": 14934 + }, + { + "epoch": 1.8174627319744447, + "grad_norm": 1.634432077407837, + "learning_rate": 4.2447771133546037e-07, + "loss": 0.3948, + "step": 14935 + }, + { + "epoch": 1.8175844234864618, + "grad_norm": 1.7847291231155396, + "learning_rate": 4.2391558681043057e-07, + "loss": 0.3804, + "step": 14936 + }, + { + "epoch": 1.817706114998479, + "grad_norm": 2.725579261779785, + "learning_rate": 4.233538266752357e-07, + "loss": 0.3558, + "step": 14937 + }, + { + "epoch": 1.817827806510496, + "grad_norm": 1.476682424545288, + "learning_rate": 4.22792430951251e-07, + "loss": 0.3558, + "step": 14938 + }, + { + "epoch": 1.817949498022513, + "grad_norm": 1.8366916179656982, + "learning_rate": 4.2223139965983595e-07, + "loss": 0.3411, + "step": 14939 + }, + { + "epoch": 1.81807118953453, + "grad_norm": 2.21850848197937, + "learning_rate": 4.216707328223424e-07, + "loss": 0.4216, + "step": 14940 + }, + { + "epoch": 1.818192881046547, + "grad_norm": 1.8436192274093628, + "learning_rate": 4.2111043046010434e-07, + "loss": 0.3422, + "step": 14941 + }, + { + "epoch": 1.818314572558564, + "grad_norm": 2.9281909465789795, + "learning_rate": 4.2055049259443923e-07, + "loss": 0.3926, + "step": 14942 + }, + { + "epoch": 1.8184362640705811, + "grad_norm": 2.58005690574646, + "learning_rate": 4.1999091924666e-07, + "loss": 0.3505, + "step": 14943 + }, + { + "epoch": 1.8185579555825981, + "grad_norm": 2.444199562072754, + "learning_rate": 4.19431710438053e-07, + "loss": 0.3934, + "step": 14944 + }, + { + "epoch": 1.8186796470946152, + "grad_norm": 1.4753987789154053, + "learning_rate": 4.188728661899e-07, + "loss": 0.3076, + "step": 14945 + }, + { + "epoch": 1.8188013386066322, + "grad_norm": 1.6509904861450195, + "learning_rate": 4.1831438652346733e-07, + "loss": 0.3872, + "step": 14946 + }, + { + "epoch": 1.8189230301186492, + "grad_norm": 2.4174916744232178, + "learning_rate": 4.177562714600047e-07, + "loss": 0.3958, + "step": 14947 + }, + { + "epoch": 1.8190447216306662, + "grad_norm": 1.382598638534546, + "learning_rate": 4.1719852102075076e-07, + "loss": 0.326, + "step": 14948 + }, + { + "epoch": 1.8191664131426832, + "grad_norm": 2.24420428276062, + "learning_rate": 4.166411352269284e-07, + "loss": 0.3767, + "step": 14949 + }, + { + "epoch": 1.8192881046547003, + "grad_norm": 2.2212283611297607, + "learning_rate": 4.160841140997451e-07, + "loss": 0.3377, + "step": 14950 + }, + { + "epoch": 1.8194097961667173, + "grad_norm": 1.5659985542297363, + "learning_rate": 4.1552745766040073e-07, + "loss": 0.3968, + "step": 14951 + }, + { + "epoch": 1.8195314876787343, + "grad_norm": 1.841123104095459, + "learning_rate": 4.149711659300759e-07, + "loss": 0.3932, + "step": 14952 + }, + { + "epoch": 1.8196531791907513, + "grad_norm": 2.78578782081604, + "learning_rate": 4.1441523892993716e-07, + "loss": 0.4348, + "step": 14953 + }, + { + "epoch": 1.8197748707027683, + "grad_norm": 2.753040313720703, + "learning_rate": 4.1385967668114093e-07, + "loss": 0.4322, + "step": 14954 + }, + { + "epoch": 1.8198965622147856, + "grad_norm": 3.5378448963165283, + "learning_rate": 4.1330447920482466e-07, + "loss": 0.2601, + "step": 14955 + }, + { + "epoch": 1.8200182537268026, + "grad_norm": 1.5717259645462036, + "learning_rate": 4.12749646522117e-07, + "loss": 0.382, + "step": 14956 + }, + { + "epoch": 1.8201399452388196, + "grad_norm": 1.9543402194976807, + "learning_rate": 4.1219517865413006e-07, + "loss": 0.3695, + "step": 14957 + }, + { + "epoch": 1.8202616367508366, + "grad_norm": 2.4204328060150146, + "learning_rate": 4.116410756219613e-07, + "loss": 0.4302, + "step": 14958 + }, + { + "epoch": 1.8203833282628537, + "grad_norm": 1.7716232538223267, + "learning_rate": 4.1108733744669726e-07, + "loss": 0.3662, + "step": 14959 + }, + { + "epoch": 1.8205050197748707, + "grad_norm": 1.4448550939559937, + "learning_rate": 4.105339641494077e-07, + "loss": 0.3022, + "step": 14960 + }, + { + "epoch": 1.8206267112868877, + "grad_norm": 2.1423330307006836, + "learning_rate": 4.099809557511492e-07, + "loss": 0.393, + "step": 14961 + }, + { + "epoch": 1.820748402798905, + "grad_norm": 1.6046580076217651, + "learning_rate": 4.0942831227296695e-07, + "loss": 0.4013, + "step": 14962 + }, + { + "epoch": 1.820870094310922, + "grad_norm": 1.9217573404312134, + "learning_rate": 4.088760337358888e-07, + "loss": 0.3932, + "step": 14963 + }, + { + "epoch": 1.820991785822939, + "grad_norm": 1.4996533393859863, + "learning_rate": 4.083241201609278e-07, + "loss": 0.3465, + "step": 14964 + }, + { + "epoch": 1.821113477334956, + "grad_norm": 1.8665032386779785, + "learning_rate": 4.0777257156909053e-07, + "loss": 0.3991, + "step": 14965 + }, + { + "epoch": 1.821235168846973, + "grad_norm": 1.8192962408065796, + "learning_rate": 4.072213879813602e-07, + "loss": 0.3755, + "step": 14966 + }, + { + "epoch": 1.82135686035899, + "grad_norm": 1.821222186088562, + "learning_rate": 4.066705694187134e-07, + "loss": 0.3722, + "step": 14967 + }, + { + "epoch": 1.821478551871007, + "grad_norm": 2.230487108230591, + "learning_rate": 4.0612011590210775e-07, + "loss": 0.328, + "step": 14968 + }, + { + "epoch": 1.821600243383024, + "grad_norm": 1.6029232740402222, + "learning_rate": 4.0557002745248875e-07, + "loss": 0.4133, + "step": 14969 + }, + { + "epoch": 1.821721934895041, + "grad_norm": 3.296739101409912, + "learning_rate": 4.0502030409079076e-07, + "loss": 0.3287, + "step": 14970 + }, + { + "epoch": 1.821843626407058, + "grad_norm": 1.5507313013076782, + "learning_rate": 4.0447094583793143e-07, + "loss": 0.3753, + "step": 14971 + }, + { + "epoch": 1.8219653179190751, + "grad_norm": 1.5675057172775269, + "learning_rate": 4.039219527148119e-07, + "loss": 0.3921, + "step": 14972 + }, + { + "epoch": 1.8220870094310921, + "grad_norm": 1.8070685863494873, + "learning_rate": 4.033733247423266e-07, + "loss": 0.353, + "step": 14973 + }, + { + "epoch": 1.8222087009431092, + "grad_norm": 2.161892890930176, + "learning_rate": 4.0282506194135096e-07, + "loss": 0.317, + "step": 14974 + }, + { + "epoch": 1.8223303924551262, + "grad_norm": 1.746654987335205, + "learning_rate": 4.02277164332745e-07, + "loss": 0.3619, + "step": 14975 + }, + { + "epoch": 1.8224520839671432, + "grad_norm": 1.7208940982818604, + "learning_rate": 4.017296319373598e-07, + "loss": 0.3373, + "step": 14976 + }, + { + "epoch": 1.8225737754791602, + "grad_norm": 2.248431444168091, + "learning_rate": 4.0118246477602987e-07, + "loss": 0.435, + "step": 14977 + }, + { + "epoch": 1.8226954669911772, + "grad_norm": 1.7662694454193115, + "learning_rate": 4.006356628695751e-07, + "loss": 0.3162, + "step": 14978 + }, + { + "epoch": 1.8228171585031943, + "grad_norm": 1.6861603260040283, + "learning_rate": 4.000892262388045e-07, + "loss": 0.3182, + "step": 14979 + }, + { + "epoch": 1.8229388500152113, + "grad_norm": 1.6651843786239624, + "learning_rate": 3.99543154904507e-07, + "loss": 0.3371, + "step": 14980 + }, + { + "epoch": 1.8230605415272285, + "grad_norm": 1.7793020009994507, + "learning_rate": 3.989974488874659e-07, + "loss": 0.332, + "step": 14981 + }, + { + "epoch": 1.8231822330392455, + "grad_norm": 2.287501811981201, + "learning_rate": 3.984521082084447e-07, + "loss": 0.3918, + "step": 14982 + }, + { + "epoch": 1.8233039245512626, + "grad_norm": 2.995178461074829, + "learning_rate": 3.9790713288819337e-07, + "loss": 0.3287, + "step": 14983 + }, + { + "epoch": 1.8234256160632796, + "grad_norm": 2.0127651691436768, + "learning_rate": 3.9736252294745315e-07, + "loss": 0.4102, + "step": 14984 + }, + { + "epoch": 1.8235473075752966, + "grad_norm": 1.4879025220870972, + "learning_rate": 3.968182784069441e-07, + "loss": 0.3306, + "step": 14985 + }, + { + "epoch": 1.8236689990873136, + "grad_norm": 1.9716264009475708, + "learning_rate": 3.962743992873763e-07, + "loss": 0.3785, + "step": 14986 + }, + { + "epoch": 1.8237906905993309, + "grad_norm": 1.5481476783752441, + "learning_rate": 3.957308856094477e-07, + "loss": 0.3375, + "step": 14987 + }, + { + "epoch": 1.8239123821113479, + "grad_norm": 1.5322394371032715, + "learning_rate": 3.951877373938373e-07, + "loss": 0.3049, + "step": 14988 + }, + { + "epoch": 1.824034073623365, + "grad_norm": 1.819868803024292, + "learning_rate": 3.9464495466121633e-07, + "loss": 0.3383, + "step": 14989 + }, + { + "epoch": 1.824155765135382, + "grad_norm": 2.616342306137085, + "learning_rate": 3.9410253743223605e-07, + "loss": 0.3958, + "step": 14990 + }, + { + "epoch": 1.824277456647399, + "grad_norm": 1.671526551246643, + "learning_rate": 3.9356048572753655e-07, + "loss": 0.3629, + "step": 14991 + }, + { + "epoch": 1.824399148159416, + "grad_norm": 1.829897165298462, + "learning_rate": 3.930187995677459e-07, + "loss": 0.3209, + "step": 14992 + }, + { + "epoch": 1.824520839671433, + "grad_norm": 1.5978320837020874, + "learning_rate": 3.924774789734731e-07, + "loss": 0.3816, + "step": 14993 + }, + { + "epoch": 1.82464253118345, + "grad_norm": 2.3925981521606445, + "learning_rate": 3.9193652396532056e-07, + "loss": 0.3377, + "step": 14994 + }, + { + "epoch": 1.824764222695467, + "grad_norm": 1.997206687927246, + "learning_rate": 3.913959345638707e-07, + "loss": 0.4114, + "step": 14995 + }, + { + "epoch": 1.824885914207484, + "grad_norm": 1.9419773817062378, + "learning_rate": 3.9085571078969374e-07, + "loss": 0.3384, + "step": 14996 + }, + { + "epoch": 1.825007605719501, + "grad_norm": 1.563836693763733, + "learning_rate": 3.903158526633477e-07, + "loss": 0.3305, + "step": 14997 + }, + { + "epoch": 1.825129297231518, + "grad_norm": 3.358605146408081, + "learning_rate": 3.897763602053739e-07, + "loss": 0.2995, + "step": 14998 + }, + { + "epoch": 1.825250988743535, + "grad_norm": 2.1497631072998047, + "learning_rate": 3.8923723343630146e-07, + "loss": 0.3611, + "step": 14999 + }, + { + "epoch": 1.825372680255552, + "grad_norm": 1.9597254991531372, + "learning_rate": 3.8869847237664627e-07, + "loss": 0.332, + "step": 15000 + }, + { + "epoch": 1.8254943717675691, + "grad_norm": 1.8248443603515625, + "learning_rate": 3.881600770469074e-07, + "loss": 0.3586, + "step": 15001 + }, + { + "epoch": 1.8256160632795861, + "grad_norm": 1.559475064277649, + "learning_rate": 3.8762204746757403e-07, + "loss": 0.3672, + "step": 15002 + }, + { + "epoch": 1.8257377547916032, + "grad_norm": 2.0782363414764404, + "learning_rate": 3.870843836591176e-07, + "loss": 0.3074, + "step": 15003 + }, + { + "epoch": 1.8258594463036202, + "grad_norm": 2.658369302749634, + "learning_rate": 3.865470856419973e-07, + "loss": 0.405, + "step": 15004 + }, + { + "epoch": 1.8259811378156372, + "grad_norm": 1.6301521062850952, + "learning_rate": 3.8601015343666113e-07, + "loss": 0.3411, + "step": 15005 + }, + { + "epoch": 1.8261028293276544, + "grad_norm": 2.93886399269104, + "learning_rate": 3.8547358706353733e-07, + "loss": 0.4146, + "step": 15006 + }, + { + "epoch": 1.8262245208396715, + "grad_norm": 2.226526975631714, + "learning_rate": 3.849373865430439e-07, + "loss": 0.4016, + "step": 15007 + }, + { + "epoch": 1.8263462123516885, + "grad_norm": 2.2823421955108643, + "learning_rate": 3.844015518955857e-07, + "loss": 0.3723, + "step": 15008 + }, + { + "epoch": 1.8264679038637055, + "grad_norm": 1.8886463642120361, + "learning_rate": 3.8386608314155195e-07, + "loss": 0.4396, + "step": 15009 + }, + { + "epoch": 1.8265895953757225, + "grad_norm": 1.7071367502212524, + "learning_rate": 3.8333098030131745e-07, + "loss": 0.3098, + "step": 15010 + }, + { + "epoch": 1.8267112868877395, + "grad_norm": 1.6018060445785522, + "learning_rate": 3.827962433952448e-07, + "loss": 0.3382, + "step": 15011 + }, + { + "epoch": 1.8268329783997568, + "grad_norm": 2.620126485824585, + "learning_rate": 3.8226187244368216e-07, + "loss": 0.3337, + "step": 15012 + }, + { + "epoch": 1.8269546699117738, + "grad_norm": 1.7114201784133911, + "learning_rate": 3.8172786746696223e-07, + "loss": 0.3633, + "step": 15013 + }, + { + "epoch": 1.8270763614237908, + "grad_norm": 1.8939762115478516, + "learning_rate": 3.8119422848540644e-07, + "loss": 0.391, + "step": 15014 + }, + { + "epoch": 1.8271980529358078, + "grad_norm": 2.332385540008545, + "learning_rate": 3.806609555193186e-07, + "loss": 0.4033, + "step": 15015 + }, + { + "epoch": 1.8273197444478249, + "grad_norm": 2.4864838123321533, + "learning_rate": 3.8012804858899353e-07, + "loss": 0.4233, + "step": 15016 + }, + { + "epoch": 1.8274414359598419, + "grad_norm": 1.6785238981246948, + "learning_rate": 3.7959550771470845e-07, + "loss": 0.367, + "step": 15017 + }, + { + "epoch": 1.827563127471859, + "grad_norm": 1.452405571937561, + "learning_rate": 3.7906333291672594e-07, + "loss": 0.3055, + "step": 15018 + }, + { + "epoch": 1.827684818983876, + "grad_norm": 1.5213143825531006, + "learning_rate": 3.785315242152998e-07, + "loss": 0.2946, + "step": 15019 + }, + { + "epoch": 1.827806510495893, + "grad_norm": 2.9879708290100098, + "learning_rate": 3.7800008163066394e-07, + "loss": 0.4212, + "step": 15020 + }, + { + "epoch": 1.82792820200791, + "grad_norm": 1.889747142791748, + "learning_rate": 3.7746900518303984e-07, + "loss": 0.3937, + "step": 15021 + }, + { + "epoch": 1.828049893519927, + "grad_norm": 1.8190747499465942, + "learning_rate": 3.769382948926392e-07, + "loss": 0.3118, + "step": 15022 + }, + { + "epoch": 1.828171585031944, + "grad_norm": 1.568833351135254, + "learning_rate": 3.7640795077965475e-07, + "loss": 0.2836, + "step": 15023 + }, + { + "epoch": 1.828293276543961, + "grad_norm": 1.5308839082717896, + "learning_rate": 3.7587797286426806e-07, + "loss": 0.3454, + "step": 15024 + }, + { + "epoch": 1.828414968055978, + "grad_norm": 2.0651838779449463, + "learning_rate": 3.753483611666453e-07, + "loss": 0.3839, + "step": 15025 + }, + { + "epoch": 1.828536659567995, + "grad_norm": 1.687839388847351, + "learning_rate": 3.748191157069381e-07, + "loss": 0.314, + "step": 15026 + }, + { + "epoch": 1.828658351080012, + "grad_norm": 1.7700896263122559, + "learning_rate": 3.74290236505287e-07, + "loss": 0.3191, + "step": 15027 + }, + { + "epoch": 1.828780042592029, + "grad_norm": 1.647594928741455, + "learning_rate": 3.7376172358181696e-07, + "loss": 0.3375, + "step": 15028 + }, + { + "epoch": 1.828901734104046, + "grad_norm": 2.0445783138275146, + "learning_rate": 3.7323357695663754e-07, + "loss": 0.3821, + "step": 15029 + }, + { + "epoch": 1.8290234256160631, + "grad_norm": 1.649852991104126, + "learning_rate": 3.727057966498482e-07, + "loss": 0.3384, + "step": 15030 + }, + { + "epoch": 1.8291451171280804, + "grad_norm": 3.663666009902954, + "learning_rate": 3.721783826815306e-07, + "loss": 0.4125, + "step": 15031 + }, + { + "epoch": 1.8292668086400974, + "grad_norm": 1.822206974029541, + "learning_rate": 3.716513350717532e-07, + "loss": 0.3325, + "step": 15032 + }, + { + "epoch": 1.8293885001521144, + "grad_norm": 1.4554885625839233, + "learning_rate": 3.7112465384057215e-07, + "loss": 0.3452, + "step": 15033 + }, + { + "epoch": 1.8295101916641314, + "grad_norm": 1.5985687971115112, + "learning_rate": 3.705983390080303e-07, + "loss": 0.3472, + "step": 15034 + }, + { + "epoch": 1.8296318831761484, + "grad_norm": 1.9971174001693726, + "learning_rate": 3.7007239059415165e-07, + "loss": 0.3875, + "step": 15035 + }, + { + "epoch": 1.8297535746881655, + "grad_norm": 1.4277081489562988, + "learning_rate": 3.695468086189524e-07, + "loss": 0.3395, + "step": 15036 + }, + { + "epoch": 1.8298752662001825, + "grad_norm": 2.9293782711029053, + "learning_rate": 3.690215931024299e-07, + "loss": 0.4317, + "step": 15037 + }, + { + "epoch": 1.8299969577121997, + "grad_norm": 2.684494733810425, + "learning_rate": 3.684967440645715e-07, + "loss": 0.3208, + "step": 15038 + }, + { + "epoch": 1.8301186492242167, + "grad_norm": 1.6981792449951172, + "learning_rate": 3.679722615253489e-07, + "loss": 0.3645, + "step": 15039 + }, + { + "epoch": 1.8302403407362338, + "grad_norm": 1.5855158567428589, + "learning_rate": 3.674481455047174e-07, + "loss": 0.3293, + "step": 15040 + }, + { + "epoch": 1.8303620322482508, + "grad_norm": 2.08746337890625, + "learning_rate": 3.669243960226232e-07, + "loss": 0.3535, + "step": 15041 + }, + { + "epoch": 1.8304837237602678, + "grad_norm": 2.050413131713867, + "learning_rate": 3.6640101309899477e-07, + "loss": 0.4106, + "step": 15042 + }, + { + "epoch": 1.8306054152722848, + "grad_norm": 1.5070345401763916, + "learning_rate": 3.6587799675374844e-07, + "loss": 0.3533, + "step": 15043 + }, + { + "epoch": 1.8307271067843018, + "grad_norm": 2.5825185775756836, + "learning_rate": 3.653553470067861e-07, + "loss": 0.3073, + "step": 15044 + }, + { + "epoch": 1.8308487982963189, + "grad_norm": 1.5110516548156738, + "learning_rate": 3.6483306387799397e-07, + "loss": 0.3057, + "step": 15045 + }, + { + "epoch": 1.8309704898083359, + "grad_norm": 2.3110103607177734, + "learning_rate": 3.643111473872496e-07, + "loss": 0.3908, + "step": 15046 + }, + { + "epoch": 1.831092181320353, + "grad_norm": 3.019033193588257, + "learning_rate": 3.6378959755441035e-07, + "loss": 0.324, + "step": 15047 + }, + { + "epoch": 1.83121387283237, + "grad_norm": 1.8984547853469849, + "learning_rate": 3.6326841439932147e-07, + "loss": 0.3604, + "step": 15048 + }, + { + "epoch": 1.831335564344387, + "grad_norm": 2.5512828826904297, + "learning_rate": 3.627475979418182e-07, + "loss": 0.4046, + "step": 15049 + }, + { + "epoch": 1.831457255856404, + "grad_norm": 1.5460147857666016, + "learning_rate": 3.622271482017159e-07, + "loss": 0.3642, + "step": 15050 + }, + { + "epoch": 1.831578947368421, + "grad_norm": 2.112250804901123, + "learning_rate": 3.617070651988186e-07, + "loss": 0.3605, + "step": 15051 + }, + { + "epoch": 1.831700638880438, + "grad_norm": 2.0232927799224854, + "learning_rate": 3.611873489529194e-07, + "loss": 0.393, + "step": 15052 + }, + { + "epoch": 1.831822330392455, + "grad_norm": 2.1189792156219482, + "learning_rate": 3.606679994837903e-07, + "loss": 0.3369, + "step": 15053 + }, + { + "epoch": 1.831944021904472, + "grad_norm": 2.586393356323242, + "learning_rate": 3.601490168111987e-07, + "loss": 0.3743, + "step": 15054 + }, + { + "epoch": 1.832065713416489, + "grad_norm": 1.8087722063064575, + "learning_rate": 3.596304009548901e-07, + "loss": 0.3956, + "step": 15055 + }, + { + "epoch": 1.8321874049285063, + "grad_norm": 1.9052002429962158, + "learning_rate": 3.591121519345975e-07, + "loss": 0.4009, + "step": 15056 + }, + { + "epoch": 1.8323090964405233, + "grad_norm": 3.31994891166687, + "learning_rate": 3.585942697700451e-07, + "loss": 0.4156, + "step": 15057 + }, + { + "epoch": 1.8324307879525403, + "grad_norm": 1.8642053604125977, + "learning_rate": 3.580767544809383e-07, + "loss": 0.3964, + "step": 15058 + }, + { + "epoch": 1.8325524794645573, + "grad_norm": 2.8211753368377686, + "learning_rate": 3.575596060869657e-07, + "loss": 0.3874, + "step": 15059 + }, + { + "epoch": 1.8326741709765744, + "grad_norm": 1.7911968231201172, + "learning_rate": 3.5704282460781057e-07, + "loss": 0.3658, + "step": 15060 + }, + { + "epoch": 1.8327958624885914, + "grad_norm": 1.4197808504104614, + "learning_rate": 3.5652641006313384e-07, + "loss": 0.3368, + "step": 15061 + }, + { + "epoch": 1.8329175540006084, + "grad_norm": 1.7975486516952515, + "learning_rate": 3.560103624725908e-07, + "loss": 0.3851, + "step": 15062 + }, + { + "epoch": 1.8330392455126256, + "grad_norm": 4.038433074951172, + "learning_rate": 3.554946818558136e-07, + "loss": 0.2721, + "step": 15063 + }, + { + "epoch": 1.8331609370246427, + "grad_norm": 1.4514188766479492, + "learning_rate": 3.5497936823242653e-07, + "loss": 0.3719, + "step": 15064 + }, + { + "epoch": 1.8332826285366597, + "grad_norm": 2.8439102172851562, + "learning_rate": 3.544644216220405e-07, + "loss": 0.3495, + "step": 15065 + }, + { + "epoch": 1.8334043200486767, + "grad_norm": 1.6029181480407715, + "learning_rate": 3.5394984204424776e-07, + "loss": 0.3445, + "step": 15066 + }, + { + "epoch": 1.8335260115606937, + "grad_norm": 1.7452284097671509, + "learning_rate": 3.534356295186281e-07, + "loss": 0.3849, + "step": 15067 + }, + { + "epoch": 1.8336477030727107, + "grad_norm": 1.9027994871139526, + "learning_rate": 3.529217840647514e-07, + "loss": 0.3716, + "step": 15068 + }, + { + "epoch": 1.8337693945847278, + "grad_norm": 2.0823395252227783, + "learning_rate": 3.524083057021699e-07, + "loss": 0.3348, + "step": 15069 + }, + { + "epoch": 1.8338910860967448, + "grad_norm": 1.9788453578948975, + "learning_rate": 3.518951944504212e-07, + "loss": 0.3973, + "step": 15070 + }, + { + "epoch": 1.8340127776087618, + "grad_norm": 1.5093154907226562, + "learning_rate": 3.5138245032903086e-07, + "loss": 0.4161, + "step": 15071 + }, + { + "epoch": 1.8341344691207788, + "grad_norm": 2.215609312057495, + "learning_rate": 3.508700733575088e-07, + "loss": 0.3169, + "step": 15072 + }, + { + "epoch": 1.8342561606327958, + "grad_norm": 1.928064227104187, + "learning_rate": 3.5035806355535385e-07, + "loss": 0.3305, + "step": 15073 + }, + { + "epoch": 1.8343778521448129, + "grad_norm": 3.4645094871520996, + "learning_rate": 3.4984642094204824e-07, + "loss": 0.4295, + "step": 15074 + }, + { + "epoch": 1.8344995436568299, + "grad_norm": 2.571255922317505, + "learning_rate": 3.493351455370597e-07, + "loss": 0.3183, + "step": 15075 + }, + { + "epoch": 1.834621235168847, + "grad_norm": 1.6098570823669434, + "learning_rate": 3.4882423735984716e-07, + "loss": 0.3828, + "step": 15076 + }, + { + "epoch": 1.834742926680864, + "grad_norm": 1.7994794845581055, + "learning_rate": 3.4831369642984724e-07, + "loss": 0.381, + "step": 15077 + }, + { + "epoch": 1.834864618192881, + "grad_norm": 1.593248724937439, + "learning_rate": 3.4780352276649e-07, + "loss": 0.3426, + "step": 15078 + }, + { + "epoch": 1.834986309704898, + "grad_norm": 2.4551877975463867, + "learning_rate": 3.472937163891876e-07, + "loss": 0.3939, + "step": 15079 + }, + { + "epoch": 1.835108001216915, + "grad_norm": 2.4965217113494873, + "learning_rate": 3.467842773173391e-07, + "loss": 0.386, + "step": 15080 + }, + { + "epoch": 1.835229692728932, + "grad_norm": 2.558504581451416, + "learning_rate": 3.4627520557033e-07, + "loss": 0.3221, + "step": 15081 + }, + { + "epoch": 1.8353513842409492, + "grad_norm": 1.6214685440063477, + "learning_rate": 3.4576650116753265e-07, + "loss": 0.3636, + "step": 15082 + }, + { + "epoch": 1.8354730757529663, + "grad_norm": 1.7680699825286865, + "learning_rate": 3.4525816412830036e-07, + "loss": 0.3357, + "step": 15083 + }, + { + "epoch": 1.8355947672649833, + "grad_norm": 1.8350796699523926, + "learning_rate": 3.447501944719811e-07, + "loss": 0.3534, + "step": 15084 + }, + { + "epoch": 1.8357164587770003, + "grad_norm": 1.7724733352661133, + "learning_rate": 3.4424259221790155e-07, + "loss": 0.3712, + "step": 15085 + }, + { + "epoch": 1.8358381502890173, + "grad_norm": 2.4058008193969727, + "learning_rate": 3.4373535738537746e-07, + "loss": 0.3934, + "step": 15086 + }, + { + "epoch": 1.8359598418010343, + "grad_norm": 1.8062288761138916, + "learning_rate": 3.4322848999371106e-07, + "loss": 0.4319, + "step": 15087 + }, + { + "epoch": 1.8360815333130516, + "grad_norm": 1.8181830644607544, + "learning_rate": 3.427219900621892e-07, + "loss": 0.3181, + "step": 15088 + }, + { + "epoch": 1.8362032248250686, + "grad_norm": 1.898027777671814, + "learning_rate": 3.422158576100842e-07, + "loss": 0.357, + "step": 15089 + }, + { + "epoch": 1.8363249163370856, + "grad_norm": 2.4604787826538086, + "learning_rate": 3.4171009265665633e-07, + "loss": 0.307, + "step": 15090 + }, + { + "epoch": 1.8364466078491026, + "grad_norm": 2.7061424255371094, + "learning_rate": 3.412046952211523e-07, + "loss": 0.3301, + "step": 15091 + }, + { + "epoch": 1.8365682993611196, + "grad_norm": 1.8778622150421143, + "learning_rate": 3.4069966532280116e-07, + "loss": 0.3737, + "step": 15092 + }, + { + "epoch": 1.8366899908731367, + "grad_norm": 1.9590085744857788, + "learning_rate": 3.401950029808221e-07, + "loss": 0.447, + "step": 15093 + }, + { + "epoch": 1.8368116823851537, + "grad_norm": 1.6582603454589844, + "learning_rate": 3.396907082144174e-07, + "loss": 0.355, + "step": 15094 + }, + { + "epoch": 1.8369333738971707, + "grad_norm": 2.134652614593506, + "learning_rate": 3.3918678104277737e-07, + "loss": 0.3619, + "step": 15095 + }, + { + "epoch": 1.8370550654091877, + "grad_norm": 1.419028639793396, + "learning_rate": 3.386832214850766e-07, + "loss": 0.3579, + "step": 15096 + }, + { + "epoch": 1.8371767569212047, + "grad_norm": 1.4091005325317383, + "learning_rate": 3.381800295604765e-07, + "loss": 0.3231, + "step": 15097 + }, + { + "epoch": 1.8372984484332218, + "grad_norm": 1.621938943862915, + "learning_rate": 3.376772052881261e-07, + "loss": 0.3788, + "step": 15098 + }, + { + "epoch": 1.8374201399452388, + "grad_norm": 2.6516916751861572, + "learning_rate": 3.371747486871579e-07, + "loss": 0.3172, + "step": 15099 + }, + { + "epoch": 1.8375418314572558, + "grad_norm": 1.6465470790863037, + "learning_rate": 3.3667265977669117e-07, + "loss": 0.369, + "step": 15100 + }, + { + "epoch": 1.8376635229692728, + "grad_norm": 2.082284688949585, + "learning_rate": 3.361709385758316e-07, + "loss": 0.3221, + "step": 15101 + }, + { + "epoch": 1.8377852144812898, + "grad_norm": 1.710175633430481, + "learning_rate": 3.3566958510367177e-07, + "loss": 0.3632, + "step": 15102 + }, + { + "epoch": 1.8379069059933069, + "grad_norm": 2.417053699493408, + "learning_rate": 3.351685993792864e-07, + "loss": 0.3403, + "step": 15103 + }, + { + "epoch": 1.8380285975053239, + "grad_norm": 1.7789360284805298, + "learning_rate": 3.3466798142174353e-07, + "loss": 0.3055, + "step": 15104 + }, + { + "epoch": 1.838150289017341, + "grad_norm": 2.2343626022338867, + "learning_rate": 3.3416773125008797e-07, + "loss": 0.31, + "step": 15105 + }, + { + "epoch": 1.838271980529358, + "grad_norm": 1.6454588174819946, + "learning_rate": 3.336678488833589e-07, + "loss": 0.3604, + "step": 15106 + }, + { + "epoch": 1.8383936720413752, + "grad_norm": 1.929332971572876, + "learning_rate": 3.331683343405756e-07, + "loss": 0.3654, + "step": 15107 + }, + { + "epoch": 1.8385153635533922, + "grad_norm": 2.3010215759277344, + "learning_rate": 3.3266918764074616e-07, + "loss": 0.4226, + "step": 15108 + }, + { + "epoch": 1.8386370550654092, + "grad_norm": 2.4542360305786133, + "learning_rate": 3.3217040880286435e-07, + "loss": 0.398, + "step": 15109 + }, + { + "epoch": 1.8387587465774262, + "grad_norm": 2.573143243789673, + "learning_rate": 3.316719978459104e-07, + "loss": 0.4139, + "step": 15110 + }, + { + "epoch": 1.8388804380894432, + "grad_norm": 2.1813714504241943, + "learning_rate": 3.3117395478884815e-07, + "loss": 0.3524, + "step": 15111 + }, + { + "epoch": 1.8390021296014603, + "grad_norm": 1.7164812088012695, + "learning_rate": 3.306762796506313e-07, + "loss": 0.3183, + "step": 15112 + }, + { + "epoch": 1.8391238211134775, + "grad_norm": 2.0599563121795654, + "learning_rate": 3.3017897245019583e-07, + "loss": 0.4082, + "step": 15113 + }, + { + "epoch": 1.8392455126254945, + "grad_norm": 1.7602815628051758, + "learning_rate": 3.2968203320646655e-07, + "loss": 0.4196, + "step": 15114 + }, + { + "epoch": 1.8393672041375115, + "grad_norm": 2.440704584121704, + "learning_rate": 3.291854619383539e-07, + "loss": 0.3172, + "step": 15115 + }, + { + "epoch": 1.8394888956495286, + "grad_norm": 1.635888695716858, + "learning_rate": 3.286892586647494e-07, + "loss": 0.3669, + "step": 15116 + }, + { + "epoch": 1.8396105871615456, + "grad_norm": 2.2009193897247314, + "learning_rate": 3.2819342340453807e-07, + "loss": 0.3385, + "step": 15117 + }, + { + "epoch": 1.8397322786735626, + "grad_norm": 1.8456350564956665, + "learning_rate": 3.2769795617658694e-07, + "loss": 0.3331, + "step": 15118 + }, + { + "epoch": 1.8398539701855796, + "grad_norm": 2.1818649768829346, + "learning_rate": 3.2720285699974764e-07, + "loss": 0.3314, + "step": 15119 + }, + { + "epoch": 1.8399756616975966, + "grad_norm": 1.5128535032272339, + "learning_rate": 3.2670812589286285e-07, + "loss": 0.3716, + "step": 15120 + }, + { + "epoch": 1.8400973532096137, + "grad_norm": 2.060521125793457, + "learning_rate": 3.2621376287475416e-07, + "loss": 0.3641, + "step": 15121 + }, + { + "epoch": 1.8402190447216307, + "grad_norm": 1.9721227884292603, + "learning_rate": 3.2571976796423767e-07, + "loss": 0.3446, + "step": 15122 + }, + { + "epoch": 1.8403407362336477, + "grad_norm": 2.0915703773498535, + "learning_rate": 3.252261411801083e-07, + "loss": 0.3902, + "step": 15123 + }, + { + "epoch": 1.8404624277456647, + "grad_norm": 1.8351625204086304, + "learning_rate": 3.2473288254114886e-07, + "loss": 0.3585, + "step": 15124 + }, + { + "epoch": 1.8405841192576817, + "grad_norm": 2.097517967224121, + "learning_rate": 3.24239992066131e-07, + "loss": 0.3783, + "step": 15125 + }, + { + "epoch": 1.8407058107696987, + "grad_norm": 3.4559755325317383, + "learning_rate": 3.2374746977380965e-07, + "loss": 0.3982, + "step": 15126 + }, + { + "epoch": 1.8408275022817158, + "grad_norm": 1.6628080606460571, + "learning_rate": 3.2325531568292545e-07, + "loss": 0.3737, + "step": 15127 + }, + { + "epoch": 1.8409491937937328, + "grad_norm": 1.8398913145065308, + "learning_rate": 3.227635298122056e-07, + "loss": 0.3331, + "step": 15128 + }, + { + "epoch": 1.8410708853057498, + "grad_norm": 1.7329350709915161, + "learning_rate": 3.2227211218036404e-07, + "loss": 0.371, + "step": 15129 + }, + { + "epoch": 1.8411925768177668, + "grad_norm": 3.174161195755005, + "learning_rate": 3.2178106280610135e-07, + "loss": 0.4001, + "step": 15130 + }, + { + "epoch": 1.8413142683297838, + "grad_norm": 1.6231906414031982, + "learning_rate": 3.2129038170810143e-07, + "loss": 0.3269, + "step": 15131 + }, + { + "epoch": 1.841435959841801, + "grad_norm": 2.7355408668518066, + "learning_rate": 3.208000689050361e-07, + "loss": 0.3541, + "step": 15132 + }, + { + "epoch": 1.841557651353818, + "grad_norm": 2.3104732036590576, + "learning_rate": 3.2031012441556375e-07, + "loss": 0.3618, + "step": 15133 + }, + { + "epoch": 1.8416793428658351, + "grad_norm": 1.5579098463058472, + "learning_rate": 3.198205482583261e-07, + "loss": 0.3479, + "step": 15134 + }, + { + "epoch": 1.8418010343778521, + "grad_norm": 2.145002841949463, + "learning_rate": 3.193313404519538e-07, + "loss": 0.366, + "step": 15135 + }, + { + "epoch": 1.8419227258898692, + "grad_norm": 2.3516693115234375, + "learning_rate": 3.18842501015062e-07, + "loss": 0.2951, + "step": 15136 + }, + { + "epoch": 1.8420444174018862, + "grad_norm": 1.9513347148895264, + "learning_rate": 3.183540299662524e-07, + "loss": 0.3569, + "step": 15137 + }, + { + "epoch": 1.8421661089139034, + "grad_norm": 2.0186514854431152, + "learning_rate": 3.1786592732411136e-07, + "loss": 0.3985, + "step": 15138 + }, + { + "epoch": 1.8422878004259204, + "grad_norm": 1.64278244972229, + "learning_rate": 3.1737819310721395e-07, + "loss": 0.3467, + "step": 15139 + }, + { + "epoch": 1.8424094919379375, + "grad_norm": 2.21455979347229, + "learning_rate": 3.168908273341154e-07, + "loss": 0.4267, + "step": 15140 + }, + { + "epoch": 1.8425311834499545, + "grad_norm": 1.6781272888183594, + "learning_rate": 3.164038300233663e-07, + "loss": 0.3837, + "step": 15141 + }, + { + "epoch": 1.8426528749619715, + "grad_norm": 1.504429817199707, + "learning_rate": 3.1591720119349525e-07, + "loss": 0.3743, + "step": 15142 + }, + { + "epoch": 1.8427745664739885, + "grad_norm": 1.5023126602172852, + "learning_rate": 3.154309408630174e-07, + "loss": 0.3861, + "step": 15143 + }, + { + "epoch": 1.8428962579860055, + "grad_norm": 1.779693603515625, + "learning_rate": 3.149450490504402e-07, + "loss": 0.337, + "step": 15144 + }, + { + "epoch": 1.8430179494980226, + "grad_norm": 2.134334087371826, + "learning_rate": 3.144595257742511e-07, + "loss": 0.404, + "step": 15145 + }, + { + "epoch": 1.8431396410100396, + "grad_norm": 3.270294189453125, + "learning_rate": 3.1397437105292307e-07, + "loss": 0.3595, + "step": 15146 + }, + { + "epoch": 1.8432613325220566, + "grad_norm": 2.270339250564575, + "learning_rate": 3.134895849049213e-07, + "loss": 0.271, + "step": 15147 + }, + { + "epoch": 1.8433830240340736, + "grad_norm": 1.5615133047103882, + "learning_rate": 3.1300516734869e-07, + "loss": 0.364, + "step": 15148 + }, + { + "epoch": 1.8435047155460906, + "grad_norm": 1.4227180480957031, + "learning_rate": 3.125211184026622e-07, + "loss": 0.3286, + "step": 15149 + }, + { + "epoch": 1.8436264070581077, + "grad_norm": 2.8514857292175293, + "learning_rate": 3.1203743808525975e-07, + "loss": 0.3491, + "step": 15150 + }, + { + "epoch": 1.8437480985701247, + "grad_norm": 1.4530268907546997, + "learning_rate": 3.1155412641488356e-07, + "loss": 0.3486, + "step": 15151 + }, + { + "epoch": 1.8438697900821417, + "grad_norm": 1.465126872062683, + "learning_rate": 3.1107118340992895e-07, + "loss": 0.3542, + "step": 15152 + }, + { + "epoch": 1.8439914815941587, + "grad_norm": 2.551419258117676, + "learning_rate": 3.1058860908876997e-07, + "loss": 0.3966, + "step": 15153 + }, + { + "epoch": 1.8441131731061757, + "grad_norm": 2.1102793216705322, + "learning_rate": 3.101064034697698e-07, + "loss": 0.3871, + "step": 15154 + }, + { + "epoch": 1.8442348646181927, + "grad_norm": 1.720428466796875, + "learning_rate": 3.096245665712783e-07, + "loss": 0.3529, + "step": 15155 + }, + { + "epoch": 1.8443565561302098, + "grad_norm": 2.118997573852539, + "learning_rate": 3.091430984116306e-07, + "loss": 0.3602, + "step": 15156 + }, + { + "epoch": 1.844478247642227, + "grad_norm": 1.8666397333145142, + "learning_rate": 3.0866199900914553e-07, + "loss": 0.3575, + "step": 15157 + }, + { + "epoch": 1.844599939154244, + "grad_norm": 1.8927935361862183, + "learning_rate": 3.081812683821328e-07, + "loss": 0.3797, + "step": 15158 + }, + { + "epoch": 1.844721630666261, + "grad_norm": 1.6338809728622437, + "learning_rate": 3.077009065488834e-07, + "loss": 0.4088, + "step": 15159 + }, + { + "epoch": 1.844843322178278, + "grad_norm": 1.8532401323318481, + "learning_rate": 3.072209135276749e-07, + "loss": 0.3875, + "step": 15160 + }, + { + "epoch": 1.844965013690295, + "grad_norm": 3.4864485263824463, + "learning_rate": 3.0674128933677603e-07, + "loss": 0.423, + "step": 15161 + }, + { + "epoch": 1.845086705202312, + "grad_norm": 3.0972182750701904, + "learning_rate": 3.062620339944344e-07, + "loss": 0.362, + "step": 15162 + }, + { + "epoch": 1.8452083967143291, + "grad_norm": 2.235102891921997, + "learning_rate": 3.057831475188866e-07, + "loss": 0.3345, + "step": 15163 + }, + { + "epoch": 1.8453300882263464, + "grad_norm": 2.061497688293457, + "learning_rate": 3.0530462992835687e-07, + "loss": 0.3522, + "step": 15164 + }, + { + "epoch": 1.8454517797383634, + "grad_norm": 2.01693058013916, + "learning_rate": 3.0482648124105176e-07, + "loss": 0.3951, + "step": 15165 + }, + { + "epoch": 1.8455734712503804, + "grad_norm": 3.279359817504883, + "learning_rate": 3.043487014751678e-07, + "loss": 0.4126, + "step": 15166 + }, + { + "epoch": 1.8456951627623974, + "grad_norm": 2.221430778503418, + "learning_rate": 3.03871290648885e-07, + "loss": 0.3545, + "step": 15167 + }, + { + "epoch": 1.8458168542744144, + "grad_norm": 1.5250096321105957, + "learning_rate": 3.033942487803676e-07, + "loss": 0.3335, + "step": 15168 + }, + { + "epoch": 1.8459385457864315, + "grad_norm": 1.7814027070999146, + "learning_rate": 3.0291757588777116e-07, + "loss": 0.3662, + "step": 15169 + }, + { + "epoch": 1.8460602372984485, + "grad_norm": 1.7331829071044922, + "learning_rate": 3.024412719892333e-07, + "loss": 0.36, + "step": 15170 + }, + { + "epoch": 1.8461819288104655, + "grad_norm": 1.3819475173950195, + "learning_rate": 3.0196533710287636e-07, + "loss": 0.3452, + "step": 15171 + }, + { + "epoch": 1.8463036203224825, + "grad_norm": 2.0687620639801025, + "learning_rate": 3.0148977124681343e-07, + "loss": 0.3245, + "step": 15172 + }, + { + "epoch": 1.8464253118344995, + "grad_norm": 2.9504642486572266, + "learning_rate": 3.010145744391402e-07, + "loss": 0.4151, + "step": 15173 + }, + { + "epoch": 1.8465470033465166, + "grad_norm": 3.3617234230041504, + "learning_rate": 3.005397466979376e-07, + "loss": 0.3657, + "step": 15174 + }, + { + "epoch": 1.8466686948585336, + "grad_norm": 2.615053415298462, + "learning_rate": 3.0006528804127466e-07, + "loss": 0.3949, + "step": 15175 + }, + { + "epoch": 1.8467903863705506, + "grad_norm": 2.1150665283203125, + "learning_rate": 2.995911984872035e-07, + "loss": 0.3486, + "step": 15176 + }, + { + "epoch": 1.8469120778825676, + "grad_norm": 1.580841302871704, + "learning_rate": 2.9911747805376754e-07, + "loss": 0.3805, + "step": 15177 + }, + { + "epoch": 1.8470337693945846, + "grad_norm": 2.793536424636841, + "learning_rate": 2.9864412675899125e-07, + "loss": 0.2972, + "step": 15178 + }, + { + "epoch": 1.8471554609066017, + "grad_norm": 2.1198718547821045, + "learning_rate": 2.981711446208857e-07, + "loss": 0.357, + "step": 15179 + }, + { + "epoch": 1.8472771524186187, + "grad_norm": 1.786293864250183, + "learning_rate": 2.9769853165744986e-07, + "loss": 0.3824, + "step": 15180 + }, + { + "epoch": 1.8473988439306357, + "grad_norm": 1.8318227529525757, + "learning_rate": 2.972262878866683e-07, + "loss": 0.3819, + "step": 15181 + }, + { + "epoch": 1.8475205354426527, + "grad_norm": 1.9865673780441284, + "learning_rate": 2.967544133265099e-07, + "loss": 0.3809, + "step": 15182 + }, + { + "epoch": 1.84764222695467, + "grad_norm": 1.9973698854446411, + "learning_rate": 2.962829079949314e-07, + "loss": 0.3838, + "step": 15183 + }, + { + "epoch": 1.847763918466687, + "grad_norm": 1.7482609748840332, + "learning_rate": 2.9581177190987185e-07, + "loss": 0.3674, + "step": 15184 + }, + { + "epoch": 1.847885609978704, + "grad_norm": 2.033256769180298, + "learning_rate": 2.9534100508926355e-07, + "loss": 0.4078, + "step": 15185 + }, + { + "epoch": 1.848007301490721, + "grad_norm": 1.7239772081375122, + "learning_rate": 2.948706075510166e-07, + "loss": 0.4165, + "step": 15186 + }, + { + "epoch": 1.848128993002738, + "grad_norm": 2.1953210830688477, + "learning_rate": 2.9440057931303e-07, + "loss": 0.4181, + "step": 15187 + }, + { + "epoch": 1.848250684514755, + "grad_norm": 1.8499325513839722, + "learning_rate": 2.9393092039319284e-07, + "loss": 0.3764, + "step": 15188 + }, + { + "epoch": 1.8483723760267723, + "grad_norm": 3.3107144832611084, + "learning_rate": 2.93461630809373e-07, + "loss": 0.3034, + "step": 15189 + }, + { + "epoch": 1.8484940675387893, + "grad_norm": 1.5686134099960327, + "learning_rate": 2.929927105794306e-07, + "loss": 0.3414, + "step": 15190 + }, + { + "epoch": 1.8486157590508063, + "grad_norm": 1.612724781036377, + "learning_rate": 2.925241597212081e-07, + "loss": 0.3761, + "step": 15191 + }, + { + "epoch": 1.8487374505628233, + "grad_norm": 2.137622594833374, + "learning_rate": 2.920559782525334e-07, + "loss": 0.3147, + "step": 15192 + }, + { + "epoch": 1.8488591420748404, + "grad_norm": 1.8977779150009155, + "learning_rate": 2.9158816619122457e-07, + "loss": 0.4063, + "step": 15193 + }, + { + "epoch": 1.8489808335868574, + "grad_norm": 2.2496049404144287, + "learning_rate": 2.911207235550806e-07, + "loss": 0.4324, + "step": 15194 + }, + { + "epoch": 1.8491025250988744, + "grad_norm": 1.6088521480560303, + "learning_rate": 2.906536503618884e-07, + "loss": 0.3934, + "step": 15195 + }, + { + "epoch": 1.8492242166108914, + "grad_norm": 2.3476169109344482, + "learning_rate": 2.901869466294249e-07, + "loss": 0.3294, + "step": 15196 + }, + { + "epoch": 1.8493459081229084, + "grad_norm": 1.5555016994476318, + "learning_rate": 2.897206123754437e-07, + "loss": 0.3347, + "step": 15197 + }, + { + "epoch": 1.8494675996349255, + "grad_norm": 1.8243275880813599, + "learning_rate": 2.8925464761769385e-07, + "loss": 0.3261, + "step": 15198 + }, + { + "epoch": 1.8495892911469425, + "grad_norm": 2.080986738204956, + "learning_rate": 2.887890523739045e-07, + "loss": 0.3702, + "step": 15199 + }, + { + "epoch": 1.8497109826589595, + "grad_norm": 1.6361138820648193, + "learning_rate": 2.883238266617916e-07, + "loss": 0.3394, + "step": 15200 + }, + { + "epoch": 1.8498326741709765, + "grad_norm": 1.9605023860931396, + "learning_rate": 2.8785897049906086e-07, + "loss": 0.4017, + "step": 15201 + }, + { + "epoch": 1.8499543656829935, + "grad_norm": 2.717085123062134, + "learning_rate": 2.873944839033982e-07, + "loss": 0.3728, + "step": 15202 + }, + { + "epoch": 1.8500760571950106, + "grad_norm": 1.5822906494140625, + "learning_rate": 2.8693036689247943e-07, + "loss": 0.3602, + "step": 15203 + }, + { + "epoch": 1.8501977487070276, + "grad_norm": 1.9276150465011597, + "learning_rate": 2.8646661948396494e-07, + "loss": 0.3797, + "step": 15204 + }, + { + "epoch": 1.8503194402190446, + "grad_norm": 1.3737303018569946, + "learning_rate": 2.860032416955028e-07, + "loss": 0.355, + "step": 15205 + }, + { + "epoch": 1.8504411317310616, + "grad_norm": 1.8732149600982666, + "learning_rate": 2.855402335447222e-07, + "loss": 0.3691, + "step": 15206 + }, + { + "epoch": 1.8505628232430786, + "grad_norm": 1.601396083831787, + "learning_rate": 2.850775950492457e-07, + "loss": 0.3833, + "step": 15207 + }, + { + "epoch": 1.8506845147550959, + "grad_norm": 2.2502572536468506, + "learning_rate": 2.8461532622667486e-07, + "loss": 0.3758, + "step": 15208 + }, + { + "epoch": 1.850806206267113, + "grad_norm": 3.461381196975708, + "learning_rate": 2.8415342709460003e-07, + "loss": 0.4597, + "step": 15209 + }, + { + "epoch": 1.85092789777913, + "grad_norm": 1.6245616674423218, + "learning_rate": 2.8369189767059823e-07, + "loss": 0.3802, + "step": 15210 + }, + { + "epoch": 1.851049589291147, + "grad_norm": 2.85707950592041, + "learning_rate": 2.83230737972231e-07, + "loss": 0.3331, + "step": 15211 + }, + { + "epoch": 1.851171280803164, + "grad_norm": 3.1344103813171387, + "learning_rate": 2.827699480170476e-07, + "loss": 0.4383, + "step": 15212 + }, + { + "epoch": 1.851292972315181, + "grad_norm": 1.7445528507232666, + "learning_rate": 2.823095278225807e-07, + "loss": 0.3499, + "step": 15213 + }, + { + "epoch": 1.8514146638271982, + "grad_norm": 1.7127130031585693, + "learning_rate": 2.818494774063507e-07, + "loss": 0.376, + "step": 15214 + }, + { + "epoch": 1.8515363553392152, + "grad_norm": 1.6919753551483154, + "learning_rate": 2.813897967858636e-07, + "loss": 0.3692, + "step": 15215 + }, + { + "epoch": 1.8516580468512323, + "grad_norm": 2.6670053005218506, + "learning_rate": 2.8093048597861107e-07, + "loss": 0.3234, + "step": 15216 + }, + { + "epoch": 1.8517797383632493, + "grad_norm": 2.696377992630005, + "learning_rate": 2.8047154500207117e-07, + "loss": 0.3818, + "step": 15217 + }, + { + "epoch": 1.8519014298752663, + "grad_norm": 3.575444221496582, + "learning_rate": 2.800129738737067e-07, + "loss": 0.4169, + "step": 15218 + }, + { + "epoch": 1.8520231213872833, + "grad_norm": 1.633947491645813, + "learning_rate": 2.795547726109693e-07, + "loss": 0.4023, + "step": 15219 + }, + { + "epoch": 1.8521448128993003, + "grad_norm": 1.7125251293182373, + "learning_rate": 2.790969412312927e-07, + "loss": 0.3347, + "step": 15220 + }, + { + "epoch": 1.8522665044113173, + "grad_norm": 2.236868143081665, + "learning_rate": 2.7863947975209747e-07, + "loss": 0.3169, + "step": 15221 + }, + { + "epoch": 1.8523881959233344, + "grad_norm": 3.2324862480163574, + "learning_rate": 2.781823881907919e-07, + "loss": 0.3111, + "step": 15222 + }, + { + "epoch": 1.8525098874353514, + "grad_norm": 1.803200364112854, + "learning_rate": 2.7772566656477096e-07, + "loss": 0.3723, + "step": 15223 + }, + { + "epoch": 1.8526315789473684, + "grad_norm": 3.2707302570343018, + "learning_rate": 2.7726931489141186e-07, + "loss": 0.4346, + "step": 15224 + }, + { + "epoch": 1.8527532704593854, + "grad_norm": 2.8390579223632812, + "learning_rate": 2.7681333318807956e-07, + "loss": 0.3423, + "step": 15225 + }, + { + "epoch": 1.8528749619714024, + "grad_norm": 2.4574379920959473, + "learning_rate": 2.763577214721269e-07, + "loss": 0.3909, + "step": 15226 + }, + { + "epoch": 1.8529966534834195, + "grad_norm": 1.9136743545532227, + "learning_rate": 2.7590247976089004e-07, + "loss": 0.3789, + "step": 15227 + }, + { + "epoch": 1.8531183449954365, + "grad_norm": 1.6445471048355103, + "learning_rate": 2.754476080716906e-07, + "loss": 0.3508, + "step": 15228 + }, + { + "epoch": 1.8532400365074535, + "grad_norm": 1.5453400611877441, + "learning_rate": 2.7499310642183807e-07, + "loss": 0.407, + "step": 15229 + }, + { + "epoch": 1.8533617280194705, + "grad_norm": 2.1475095748901367, + "learning_rate": 2.745389748286287e-07, + "loss": 0.3421, + "step": 15230 + }, + { + "epoch": 1.8534834195314875, + "grad_norm": 1.6891398429870605, + "learning_rate": 2.740852133093419e-07, + "loss": 0.3363, + "step": 15231 + }, + { + "epoch": 1.8536051110435046, + "grad_norm": 3.554314136505127, + "learning_rate": 2.7363182188124396e-07, + "loss": 0.4546, + "step": 15232 + }, + { + "epoch": 1.8537268025555218, + "grad_norm": 1.834157109260559, + "learning_rate": 2.731788005615865e-07, + "loss": 0.3502, + "step": 15233 + }, + { + "epoch": 1.8538484940675388, + "grad_norm": 2.061800956726074, + "learning_rate": 2.7272614936761033e-07, + "loss": 0.3579, + "step": 15234 + }, + { + "epoch": 1.8539701855795558, + "grad_norm": 1.362410306930542, + "learning_rate": 2.7227386831653933e-07, + "loss": 0.3333, + "step": 15235 + }, + { + "epoch": 1.8540918770915729, + "grad_norm": 1.7776938676834106, + "learning_rate": 2.718219574255809e-07, + "loss": 0.3627, + "step": 15236 + }, + { + "epoch": 1.8542135686035899, + "grad_norm": 1.718324899673462, + "learning_rate": 2.7137041671193463e-07, + "loss": 0.4081, + "step": 15237 + }, + { + "epoch": 1.854335260115607, + "grad_norm": 1.710709810256958, + "learning_rate": 2.709192461927801e-07, + "loss": 0.3901, + "step": 15238 + }, + { + "epoch": 1.8544569516276241, + "grad_norm": 4.159104347229004, + "learning_rate": 2.7046844588528684e-07, + "loss": 0.4716, + "step": 15239 + }, + { + "epoch": 1.8545786431396412, + "grad_norm": 1.7879972457885742, + "learning_rate": 2.700180158066079e-07, + "loss": 0.3916, + "step": 15240 + }, + { + "epoch": 1.8547003346516582, + "grad_norm": 1.9301176071166992, + "learning_rate": 2.6956795597388396e-07, + "loss": 0.3835, + "step": 15241 + }, + { + "epoch": 1.8548220261636752, + "grad_norm": 1.9738837480545044, + "learning_rate": 2.691182664042413e-07, + "loss": 0.3659, + "step": 15242 + }, + { + "epoch": 1.8549437176756922, + "grad_norm": 2.536100387573242, + "learning_rate": 2.686689471147896e-07, + "loss": 0.4327, + "step": 15243 + }, + { + "epoch": 1.8550654091877092, + "grad_norm": 6.848067760467529, + "learning_rate": 2.682199981226252e-07, + "loss": 0.395, + "step": 15244 + }, + { + "epoch": 1.8551871006997263, + "grad_norm": 1.7462328672409058, + "learning_rate": 2.6777141944483553e-07, + "loss": 0.3478, + "step": 15245 + }, + { + "epoch": 1.8553087922117433, + "grad_norm": 2.3958640098571777, + "learning_rate": 2.67323211098488e-07, + "loss": 0.3869, + "step": 15246 + }, + { + "epoch": 1.8554304837237603, + "grad_norm": 1.5834935903549194, + "learning_rate": 2.6687537310063685e-07, + "loss": 0.3268, + "step": 15247 + }, + { + "epoch": 1.8555521752357773, + "grad_norm": 2.825343608856201, + "learning_rate": 2.66427905468325e-07, + "loss": 0.3079, + "step": 15248 + }, + { + "epoch": 1.8556738667477943, + "grad_norm": 1.737581729888916, + "learning_rate": 2.659808082185777e-07, + "loss": 0.399, + "step": 15249 + }, + { + "epoch": 1.8557955582598114, + "grad_norm": 2.0236270427703857, + "learning_rate": 2.655340813684104e-07, + "loss": 0.3246, + "step": 15250 + }, + { + "epoch": 1.8559172497718284, + "grad_norm": 2.477104425430298, + "learning_rate": 2.650877249348205e-07, + "loss": 0.4306, + "step": 15251 + }, + { + "epoch": 1.8560389412838454, + "grad_norm": 1.8327890634536743, + "learning_rate": 2.6464173893479104e-07, + "loss": 0.403, + "step": 15252 + }, + { + "epoch": 1.8561606327958624, + "grad_norm": 1.4874261617660522, + "learning_rate": 2.641961233852963e-07, + "loss": 0.3513, + "step": 15253 + }, + { + "epoch": 1.8562823243078794, + "grad_norm": 2.209660768508911, + "learning_rate": 2.637508783032916e-07, + "loss": 0.376, + "step": 15254 + }, + { + "epoch": 1.8564040158198964, + "grad_norm": 1.6877037286758423, + "learning_rate": 2.6330600370571667e-07, + "loss": 0.352, + "step": 15255 + }, + { + "epoch": 1.8565257073319135, + "grad_norm": 2.268897294998169, + "learning_rate": 2.6286149960950357e-07, + "loss": 0.3148, + "step": 15256 + }, + { + "epoch": 1.8566473988439305, + "grad_norm": 1.7968549728393555, + "learning_rate": 2.6241736603156433e-07, + "loss": 0.3961, + "step": 15257 + }, + { + "epoch": 1.8567690903559477, + "grad_norm": 3.1995365619659424, + "learning_rate": 2.619736029888009e-07, + "loss": 0.3272, + "step": 15258 + }, + { + "epoch": 1.8568907818679647, + "grad_norm": 2.673677921295166, + "learning_rate": 2.615302104980988e-07, + "loss": 0.444, + "step": 15259 + }, + { + "epoch": 1.8570124733799818, + "grad_norm": 2.7062039375305176, + "learning_rate": 2.6108718857632776e-07, + "loss": 0.395, + "step": 15260 + }, + { + "epoch": 1.8571341648919988, + "grad_norm": 1.8717856407165527, + "learning_rate": 2.6064453724034986e-07, + "loss": 0.376, + "step": 15261 + }, + { + "epoch": 1.8572558564040158, + "grad_norm": 2.5740532875061035, + "learning_rate": 2.602022565070061e-07, + "loss": 0.406, + "step": 15262 + }, + { + "epoch": 1.8573775479160328, + "grad_norm": 2.5122294425964355, + "learning_rate": 2.5976034639312533e-07, + "loss": 0.3715, + "step": 15263 + }, + { + "epoch": 1.8574992394280498, + "grad_norm": 2.4028637409210205, + "learning_rate": 2.5931880691552613e-07, + "loss": 0.3769, + "step": 15264 + }, + { + "epoch": 1.857620930940067, + "grad_norm": 1.950917363166809, + "learning_rate": 2.588776380910074e-07, + "loss": 0.3324, + "step": 15265 + }, + { + "epoch": 1.857742622452084, + "grad_norm": 1.5598349571228027, + "learning_rate": 2.5843683993635903e-07, + "loss": 0.3523, + "step": 15266 + }, + { + "epoch": 1.8578643139641011, + "grad_norm": 2.1785759925842285, + "learning_rate": 2.5799641246835206e-07, + "loss": 0.3828, + "step": 15267 + }, + { + "epoch": 1.8579860054761181, + "grad_norm": 1.9486876726150513, + "learning_rate": 2.5755635570374525e-07, + "loss": 0.3908, + "step": 15268 + }, + { + "epoch": 1.8581076969881352, + "grad_norm": 1.6697043180465698, + "learning_rate": 2.571166696592864e-07, + "loss": 0.3804, + "step": 15269 + }, + { + "epoch": 1.8582293885001522, + "grad_norm": 2.2978954315185547, + "learning_rate": 2.5667735435170426e-07, + "loss": 0.3852, + "step": 15270 + }, + { + "epoch": 1.8583510800121692, + "grad_norm": 1.653773546218872, + "learning_rate": 2.5623840979771665e-07, + "loss": 0.3304, + "step": 15271 + }, + { + "epoch": 1.8584727715241862, + "grad_norm": 2.2193028926849365, + "learning_rate": 2.557998360140257e-07, + "loss": 0.3923, + "step": 15272 + }, + { + "epoch": 1.8585944630362032, + "grad_norm": 1.6757093667984009, + "learning_rate": 2.553616330173214e-07, + "loss": 0.3615, + "step": 15273 + }, + { + "epoch": 1.8587161545482203, + "grad_norm": 1.5936883687973022, + "learning_rate": 2.5492380082427495e-07, + "loss": 0.3563, + "step": 15274 + }, + { + "epoch": 1.8588378460602373, + "grad_norm": 1.9530621767044067, + "learning_rate": 2.544863394515507e-07, + "loss": 0.3807, + "step": 15275 + }, + { + "epoch": 1.8589595375722543, + "grad_norm": 4.100805282592773, + "learning_rate": 2.540492489157942e-07, + "loss": 0.3468, + "step": 15276 + }, + { + "epoch": 1.8590812290842713, + "grad_norm": 1.8061838150024414, + "learning_rate": 2.5361252923363553e-07, + "loss": 0.3802, + "step": 15277 + }, + { + "epoch": 1.8592029205962883, + "grad_norm": 2.2591936588287354, + "learning_rate": 2.531761804216948e-07, + "loss": 0.3756, + "step": 15278 + }, + { + "epoch": 1.8593246121083054, + "grad_norm": 2.2334301471710205, + "learning_rate": 2.5274020249657414e-07, + "loss": 0.3608, + "step": 15279 + }, + { + "epoch": 1.8594463036203224, + "grad_norm": 1.7504310607910156, + "learning_rate": 2.523045954748649e-07, + "loss": 0.3622, + "step": 15280 + }, + { + "epoch": 1.8595679951323394, + "grad_norm": 1.748343825340271, + "learning_rate": 2.518693593731425e-07, + "loss": 0.3355, + "step": 15281 + }, + { + "epoch": 1.8596896866443564, + "grad_norm": 1.4133572578430176, + "learning_rate": 2.5143449420796826e-07, + "loss": 0.3691, + "step": 15282 + }, + { + "epoch": 1.8598113781563734, + "grad_norm": 3.210618495941162, + "learning_rate": 2.509999999958912e-07, + "loss": 0.4337, + "step": 15283 + }, + { + "epoch": 1.8599330696683907, + "grad_norm": 1.4826692342758179, + "learning_rate": 2.5056587675344245e-07, + "loss": 0.3766, + "step": 15284 + }, + { + "epoch": 1.8600547611804077, + "grad_norm": 2.6870193481445312, + "learning_rate": 2.5013212449714106e-07, + "loss": 0.3954, + "step": 15285 + }, + { + "epoch": 1.8601764526924247, + "grad_norm": 3.0309722423553467, + "learning_rate": 2.49698743243495e-07, + "loss": 0.3014, + "step": 15286 + }, + { + "epoch": 1.8602981442044417, + "grad_norm": 2.19174861907959, + "learning_rate": 2.4926573300899426e-07, + "loss": 0.3673, + "step": 15287 + }, + { + "epoch": 1.8604198357164587, + "grad_norm": 1.744202733039856, + "learning_rate": 2.488330938101147e-07, + "loss": 0.3411, + "step": 15288 + }, + { + "epoch": 1.8605415272284758, + "grad_norm": 2.1084840297698975, + "learning_rate": 2.484008256633197e-07, + "loss": 0.384, + "step": 15289 + }, + { + "epoch": 1.860663218740493, + "grad_norm": 1.4459794759750366, + "learning_rate": 2.479689285850573e-07, + "loss": 0.3464, + "step": 15290 + }, + { + "epoch": 1.86078491025251, + "grad_norm": 2.2902674674987793, + "learning_rate": 2.4753740259176427e-07, + "loss": 0.4493, + "step": 15291 + }, + { + "epoch": 1.860906601764527, + "grad_norm": 1.6270930767059326, + "learning_rate": 2.471062476998587e-07, + "loss": 0.3248, + "step": 15292 + }, + { + "epoch": 1.861028293276544, + "grad_norm": 1.6411547660827637, + "learning_rate": 2.466754639257474e-07, + "loss": 0.3784, + "step": 15293 + }, + { + "epoch": 1.861149984788561, + "grad_norm": 2.00333833694458, + "learning_rate": 2.4624505128582276e-07, + "loss": 0.3918, + "step": 15294 + }, + { + "epoch": 1.861271676300578, + "grad_norm": 1.498854637145996, + "learning_rate": 2.45815009796464e-07, + "loss": 0.3572, + "step": 15295 + }, + { + "epoch": 1.8613933678125951, + "grad_norm": 2.089965343475342, + "learning_rate": 2.4538533947403355e-07, + "loss": 0.3509, + "step": 15296 + }, + { + "epoch": 1.8615150593246121, + "grad_norm": 1.6366585493087769, + "learning_rate": 2.4495604033488165e-07, + "loss": 0.3424, + "step": 15297 + }, + { + "epoch": 1.8616367508366292, + "grad_norm": 1.997127652168274, + "learning_rate": 2.445271123953452e-07, + "loss": 0.3526, + "step": 15298 + }, + { + "epoch": 1.8617584423486462, + "grad_norm": 2.454453706741333, + "learning_rate": 2.4409855567174347e-07, + "loss": 0.3492, + "step": 15299 + }, + { + "epoch": 1.8618801338606632, + "grad_norm": 1.4662631750106812, + "learning_rate": 2.4367037018038777e-07, + "loss": 0.2955, + "step": 15300 + }, + { + "epoch": 1.8620018253726802, + "grad_norm": 2.0737311840057373, + "learning_rate": 2.4324255593756727e-07, + "loss": 0.3533, + "step": 15301 + }, + { + "epoch": 1.8621235168846972, + "grad_norm": 1.6240527629852295, + "learning_rate": 2.428151129595624e-07, + "loss": 0.3655, + "step": 15302 + }, + { + "epoch": 1.8622452083967143, + "grad_norm": 1.8514087200164795, + "learning_rate": 2.4238804126264006e-07, + "loss": 0.3334, + "step": 15303 + }, + { + "epoch": 1.8623668999087313, + "grad_norm": 1.7822703123092651, + "learning_rate": 2.4196134086304835e-07, + "loss": 0.3779, + "step": 15304 + }, + { + "epoch": 1.8624885914207483, + "grad_norm": 1.6161086559295654, + "learning_rate": 2.4153501177702656e-07, + "loss": 0.395, + "step": 15305 + }, + { + "epoch": 1.8626102829327653, + "grad_norm": 2.832211971282959, + "learning_rate": 2.4110905402079723e-07, + "loss": 0.4319, + "step": 15306 + }, + { + "epoch": 1.8627319744447823, + "grad_norm": 1.7324323654174805, + "learning_rate": 2.4068346761056627e-07, + "loss": 0.3761, + "step": 15307 + }, + { + "epoch": 1.8628536659567994, + "grad_norm": 1.9043869972229004, + "learning_rate": 2.4025825256253186e-07, + "loss": 0.374, + "step": 15308 + }, + { + "epoch": 1.8629753574688166, + "grad_norm": 3.8759820461273193, + "learning_rate": 2.398334088928711e-07, + "loss": 0.3906, + "step": 15309 + }, + { + "epoch": 1.8630970489808336, + "grad_norm": 1.8025131225585938, + "learning_rate": 2.394089366177521e-07, + "loss": 0.3806, + "step": 15310 + }, + { + "epoch": 1.8632187404928506, + "grad_norm": 1.7823030948638916, + "learning_rate": 2.3898483575332866e-07, + "loss": 0.391, + "step": 15311 + }, + { + "epoch": 1.8633404320048677, + "grad_norm": 1.5300712585449219, + "learning_rate": 2.3856110631573336e-07, + "loss": 0.375, + "step": 15312 + }, + { + "epoch": 1.8634621235168847, + "grad_norm": 1.6364474296569824, + "learning_rate": 2.3813774832109338e-07, + "loss": 0.3543, + "step": 15313 + }, + { + "epoch": 1.8635838150289017, + "grad_norm": 1.6200556755065918, + "learning_rate": 2.377147617855191e-07, + "loss": 0.3764, + "step": 15314 + }, + { + "epoch": 1.863705506540919, + "grad_norm": 3.346451997756958, + "learning_rate": 2.3729214672510436e-07, + "loss": 0.4574, + "step": 15315 + }, + { + "epoch": 1.863827198052936, + "grad_norm": 2.808462142944336, + "learning_rate": 2.3686990315593071e-07, + "loss": 0.4139, + "step": 15316 + }, + { + "epoch": 1.863948889564953, + "grad_norm": 1.7323073148727417, + "learning_rate": 2.3644803109406643e-07, + "loss": 0.3828, + "step": 15317 + }, + { + "epoch": 1.86407058107697, + "grad_norm": 1.712723731994629, + "learning_rate": 2.3602653055556424e-07, + "loss": 0.344, + "step": 15318 + }, + { + "epoch": 1.864192272588987, + "grad_norm": 2.311335802078247, + "learning_rate": 2.356054015564624e-07, + "loss": 0.3344, + "step": 15319 + }, + { + "epoch": 1.864313964101004, + "grad_norm": 1.6981480121612549, + "learning_rate": 2.3518464411278586e-07, + "loss": 0.3042, + "step": 15320 + }, + { + "epoch": 1.864435655613021, + "grad_norm": 1.6295275688171387, + "learning_rate": 2.3476425824054627e-07, + "loss": 0.3374, + "step": 15321 + }, + { + "epoch": 1.864557347125038, + "grad_norm": 2.2025394439697266, + "learning_rate": 2.3434424395573973e-07, + "loss": 0.3913, + "step": 15322 + }, + { + "epoch": 1.864679038637055, + "grad_norm": 1.3795732259750366, + "learning_rate": 2.3392460127434903e-07, + "loss": 0.3406, + "step": 15323 + }, + { + "epoch": 1.864800730149072, + "grad_norm": 1.7225010395050049, + "learning_rate": 2.3350533021234134e-07, + "loss": 0.3303, + "step": 15324 + }, + { + "epoch": 1.8649224216610891, + "grad_norm": 1.5403178930282593, + "learning_rate": 2.3308643078567062e-07, + "loss": 0.397, + "step": 15325 + }, + { + "epoch": 1.8650441131731061, + "grad_norm": 1.648912787437439, + "learning_rate": 2.3266790301027853e-07, + "loss": 0.311, + "step": 15326 + }, + { + "epoch": 1.8651658046851232, + "grad_norm": 1.8623329401016235, + "learning_rate": 2.3224974690209013e-07, + "loss": 0.3852, + "step": 15327 + }, + { + "epoch": 1.8652874961971402, + "grad_norm": 1.4475737810134888, + "learning_rate": 2.318319624770149e-07, + "loss": 0.3351, + "step": 15328 + }, + { + "epoch": 1.8654091877091572, + "grad_norm": 1.4394919872283936, + "learning_rate": 2.3141454975095457e-07, + "loss": 0.3422, + "step": 15329 + }, + { + "epoch": 1.8655308792211742, + "grad_norm": 1.9390974044799805, + "learning_rate": 2.3099750873979088e-07, + "loss": 0.3536, + "step": 15330 + }, + { + "epoch": 1.8656525707331912, + "grad_norm": 1.6818997859954834, + "learning_rate": 2.3058083945939114e-07, + "loss": 0.3544, + "step": 15331 + }, + { + "epoch": 1.8657742622452083, + "grad_norm": 1.7142422199249268, + "learning_rate": 2.3016454192561267e-07, + "loss": 0.3336, + "step": 15332 + }, + { + "epoch": 1.8658959537572253, + "grad_norm": 1.8560435771942139, + "learning_rate": 2.2974861615429612e-07, + "loss": 0.3293, + "step": 15333 + }, + { + "epoch": 1.8660176452692425, + "grad_norm": 2.3031299114227295, + "learning_rate": 2.293330621612677e-07, + "loss": 0.3405, + "step": 15334 + }, + { + "epoch": 1.8661393367812595, + "grad_norm": 4.432735443115234, + "learning_rate": 2.289178799623415e-07, + "loss": 0.4603, + "step": 15335 + }, + { + "epoch": 1.8662610282932766, + "grad_norm": 1.8172458410263062, + "learning_rate": 2.285030695733137e-07, + "loss": 0.3746, + "step": 15336 + }, + { + "epoch": 1.8663827198052936, + "grad_norm": 2.4081640243530273, + "learning_rate": 2.280886310099706e-07, + "loss": 0.3503, + "step": 15337 + }, + { + "epoch": 1.8665044113173106, + "grad_norm": 1.8210499286651611, + "learning_rate": 2.2767456428808177e-07, + "loss": 0.4008, + "step": 15338 + }, + { + "epoch": 1.8666261028293276, + "grad_norm": 2.1988589763641357, + "learning_rate": 2.2726086942340243e-07, + "loss": 0.4072, + "step": 15339 + }, + { + "epoch": 1.8667477943413449, + "grad_norm": 1.9303128719329834, + "learning_rate": 2.2684754643167662e-07, + "loss": 0.3619, + "step": 15340 + }, + { + "epoch": 1.8668694858533619, + "grad_norm": 2.564828872680664, + "learning_rate": 2.2643459532863067e-07, + "loss": 0.3622, + "step": 15341 + }, + { + "epoch": 1.866991177365379, + "grad_norm": 1.9748661518096924, + "learning_rate": 2.2602201612997754e-07, + "loss": 0.3693, + "step": 15342 + }, + { + "epoch": 1.867112868877396, + "grad_norm": 1.621315360069275, + "learning_rate": 2.2560980885141913e-07, + "loss": 0.3503, + "step": 15343 + }, + { + "epoch": 1.867234560389413, + "grad_norm": 1.4281065464019775, + "learning_rate": 2.2519797350863848e-07, + "loss": 0.3465, + "step": 15344 + }, + { + "epoch": 1.86735625190143, + "grad_norm": 1.5208765268325806, + "learning_rate": 2.2478651011730745e-07, + "loss": 0.3136, + "step": 15345 + }, + { + "epoch": 1.867477943413447, + "grad_norm": 1.624214768409729, + "learning_rate": 2.2437541869308577e-07, + "loss": 0.3631, + "step": 15346 + }, + { + "epoch": 1.867599634925464, + "grad_norm": 2.243067979812622, + "learning_rate": 2.239646992516109e-07, + "loss": 0.4242, + "step": 15347 + }, + { + "epoch": 1.867721326437481, + "grad_norm": 2.440727710723877, + "learning_rate": 2.2355435180851592e-07, + "loss": 0.3705, + "step": 15348 + }, + { + "epoch": 1.867843017949498, + "grad_norm": 1.867019534111023, + "learning_rate": 2.231443763794139e-07, + "loss": 0.4287, + "step": 15349 + }, + { + "epoch": 1.867964709461515, + "grad_norm": 1.9152992963790894, + "learning_rate": 2.2273477297990453e-07, + "loss": 0.3443, + "step": 15350 + }, + { + "epoch": 1.868086400973532, + "grad_norm": 1.8000767230987549, + "learning_rate": 2.223255416255754e-07, + "loss": 0.3629, + "step": 15351 + }, + { + "epoch": 1.868208092485549, + "grad_norm": 1.6828755140304565, + "learning_rate": 2.2191668233199848e-07, + "loss": 0.3457, + "step": 15352 + }, + { + "epoch": 1.868329783997566, + "grad_norm": 7.77219820022583, + "learning_rate": 2.215081951147302e-07, + "loss": 0.3679, + "step": 15353 + }, + { + "epoch": 1.8684514755095831, + "grad_norm": 1.7122520208358765, + "learning_rate": 2.2110007998931705e-07, + "loss": 0.3527, + "step": 15354 + }, + { + "epoch": 1.8685731670216001, + "grad_norm": 1.5874251127243042, + "learning_rate": 2.2069233697128657e-07, + "loss": 0.3767, + "step": 15355 + }, + { + "epoch": 1.8686948585336172, + "grad_norm": 1.8478678464889526, + "learning_rate": 2.2028496607615303e-07, + "loss": 0.3479, + "step": 15356 + }, + { + "epoch": 1.8688165500456342, + "grad_norm": 2.899851083755493, + "learning_rate": 2.1987796731942178e-07, + "loss": 0.409, + "step": 15357 + }, + { + "epoch": 1.8689382415576512, + "grad_norm": 1.7577241659164429, + "learning_rate": 2.1947134071657607e-07, + "loss": 0.3985, + "step": 15358 + }, + { + "epoch": 1.8690599330696684, + "grad_norm": 1.868581771850586, + "learning_rate": 2.190650862830912e-07, + "loss": 0.3799, + "step": 15359 + }, + { + "epoch": 1.8691816245816855, + "grad_norm": 1.7281532287597656, + "learning_rate": 2.1865920403442598e-07, + "loss": 0.3182, + "step": 15360 + }, + { + "epoch": 1.8693033160937025, + "grad_norm": 2.024193525314331, + "learning_rate": 2.1825369398602247e-07, + "loss": 0.3637, + "step": 15361 + }, + { + "epoch": 1.8694250076057195, + "grad_norm": 1.9359935522079468, + "learning_rate": 2.178485561533139e-07, + "loss": 0.3745, + "step": 15362 + }, + { + "epoch": 1.8695466991177365, + "grad_norm": 3.233109712600708, + "learning_rate": 2.174437905517157e-07, + "loss": 0.2921, + "step": 15363 + }, + { + "epoch": 1.8696683906297535, + "grad_norm": 2.986978054046631, + "learning_rate": 2.1703939719663004e-07, + "loss": 0.3269, + "step": 15364 + }, + { + "epoch": 1.8697900821417706, + "grad_norm": 1.7951374053955078, + "learning_rate": 2.1663537610344453e-07, + "loss": 0.3309, + "step": 15365 + }, + { + "epoch": 1.8699117736537878, + "grad_norm": 1.7709498405456543, + "learning_rate": 2.1623172728753472e-07, + "loss": 0.4393, + "step": 15366 + }, + { + "epoch": 1.8700334651658048, + "grad_norm": 1.9826070070266724, + "learning_rate": 2.158284507642572e-07, + "loss": 0.4589, + "step": 15367 + }, + { + "epoch": 1.8701551566778218, + "grad_norm": 1.6621332168579102, + "learning_rate": 2.1542554654896076e-07, + "loss": 0.3578, + "step": 15368 + }, + { + "epoch": 1.8702768481898389, + "grad_norm": 1.561108946800232, + "learning_rate": 2.150230146569743e-07, + "loss": 0.3652, + "step": 15369 + }, + { + "epoch": 1.8703985397018559, + "grad_norm": 1.57069730758667, + "learning_rate": 2.1462085510361664e-07, + "loss": 0.3255, + "step": 15370 + }, + { + "epoch": 1.870520231213873, + "grad_norm": 1.6626564264297485, + "learning_rate": 2.1421906790419001e-07, + "loss": 0.3069, + "step": 15371 + }, + { + "epoch": 1.87064192272589, + "grad_norm": 1.2337212562561035, + "learning_rate": 2.1381765307398216e-07, + "loss": 0.3105, + "step": 15372 + }, + { + "epoch": 1.870763614237907, + "grad_norm": 2.4486405849456787, + "learning_rate": 2.134166106282698e-07, + "loss": 0.3505, + "step": 15373 + }, + { + "epoch": 1.870885305749924, + "grad_norm": 1.6470117568969727, + "learning_rate": 2.130159405823129e-07, + "loss": 0.382, + "step": 15374 + }, + { + "epoch": 1.871006997261941, + "grad_norm": 2.046447277069092, + "learning_rate": 2.1261564295135596e-07, + "loss": 0.3482, + "step": 15375 + }, + { + "epoch": 1.871128688773958, + "grad_norm": 1.8801674842834473, + "learning_rate": 2.1221571775063343e-07, + "loss": 0.3365, + "step": 15376 + }, + { + "epoch": 1.871250380285975, + "grad_norm": 2.0557637214660645, + "learning_rate": 2.1181616499536206e-07, + "loss": 0.342, + "step": 15377 + }, + { + "epoch": 1.871372071797992, + "grad_norm": 2.9107978343963623, + "learning_rate": 2.1141698470074746e-07, + "loss": 0.3432, + "step": 15378 + }, + { + "epoch": 1.871493763310009, + "grad_norm": 2.4534497261047363, + "learning_rate": 2.1101817688197633e-07, + "loss": 0.3972, + "step": 15379 + }, + { + "epoch": 1.871615454822026, + "grad_norm": 1.4412715435028076, + "learning_rate": 2.106197415542266e-07, + "loss": 0.3127, + "step": 15380 + }, + { + "epoch": 1.871737146334043, + "grad_norm": 2.2530510425567627, + "learning_rate": 2.102216787326583e-07, + "loss": 0.3331, + "step": 15381 + }, + { + "epoch": 1.87185883784606, + "grad_norm": 1.7369531393051147, + "learning_rate": 2.0982398843241937e-07, + "loss": 0.3607, + "step": 15382 + }, + { + "epoch": 1.8719805293580771, + "grad_norm": 1.5745741128921509, + "learning_rate": 2.0942667066864099e-07, + "loss": 0.3367, + "step": 15383 + }, + { + "epoch": 1.8721022208700941, + "grad_norm": 2.732997417449951, + "learning_rate": 2.090297254564444e-07, + "loss": 0.3978, + "step": 15384 + }, + { + "epoch": 1.8722239123821114, + "grad_norm": 1.5832599401474, + "learning_rate": 2.0863315281093088e-07, + "loss": 0.3261, + "step": 15385 + }, + { + "epoch": 1.8723456038941284, + "grad_norm": 1.58500075340271, + "learning_rate": 2.0823695274719503e-07, + "loss": 0.3575, + "step": 15386 + }, + { + "epoch": 1.8724672954061454, + "grad_norm": 1.656009316444397, + "learning_rate": 2.078411252803103e-07, + "loss": 0.3785, + "step": 15387 + }, + { + "epoch": 1.8725889869181624, + "grad_norm": 3.3702714443206787, + "learning_rate": 2.0744567042533914e-07, + "loss": 0.2634, + "step": 15388 + }, + { + "epoch": 1.8727106784301795, + "grad_norm": 2.3989360332489014, + "learning_rate": 2.070505881973306e-07, + "loss": 0.3286, + "step": 15389 + }, + { + "epoch": 1.8728323699421965, + "grad_norm": 1.991233468055725, + "learning_rate": 2.0665587861131707e-07, + "loss": 0.3526, + "step": 15390 + }, + { + "epoch": 1.8729540614542137, + "grad_norm": 1.6651912927627563, + "learning_rate": 2.062615416823177e-07, + "loss": 0.3437, + "step": 15391 + }, + { + "epoch": 1.8730757529662307, + "grad_norm": 1.644325852394104, + "learning_rate": 2.0586757742533937e-07, + "loss": 0.3465, + "step": 15392 + }, + { + "epoch": 1.8731974444782478, + "grad_norm": 2.5114777088165283, + "learning_rate": 2.0547398585537225e-07, + "loss": 0.413, + "step": 15393 + }, + { + "epoch": 1.8733191359902648, + "grad_norm": 2.8682193756103516, + "learning_rate": 2.0508076698739333e-07, + "loss": 0.4729, + "step": 15394 + }, + { + "epoch": 1.8734408275022818, + "grad_norm": 1.4600131511688232, + "learning_rate": 2.0468792083636612e-07, + "loss": 0.3513, + "step": 15395 + }, + { + "epoch": 1.8735625190142988, + "grad_norm": 1.5856443643569946, + "learning_rate": 2.042954474172376e-07, + "loss": 0.3222, + "step": 15396 + }, + { + "epoch": 1.8736842105263158, + "grad_norm": 2.255934953689575, + "learning_rate": 2.0390334674494473e-07, + "loss": 0.3912, + "step": 15397 + }, + { + "epoch": 1.8738059020383329, + "grad_norm": 1.7907888889312744, + "learning_rate": 2.035116188344055e-07, + "loss": 0.3687, + "step": 15398 + }, + { + "epoch": 1.8739275935503499, + "grad_norm": 1.64853036403656, + "learning_rate": 2.0312026370052696e-07, + "loss": 0.3127, + "step": 15399 + }, + { + "epoch": 1.874049285062367, + "grad_norm": 1.8663063049316406, + "learning_rate": 2.0272928135820047e-07, + "loss": 0.3636, + "step": 15400 + }, + { + "epoch": 1.874170976574384, + "grad_norm": 1.712429165840149, + "learning_rate": 2.0233867182230527e-07, + "loss": 0.341, + "step": 15401 + }, + { + "epoch": 1.874292668086401, + "grad_norm": 1.8692585229873657, + "learning_rate": 2.019484351077017e-07, + "loss": 0.3972, + "step": 15402 + }, + { + "epoch": 1.874414359598418, + "grad_norm": 2.110351324081421, + "learning_rate": 2.0155857122924337e-07, + "loss": 0.3629, + "step": 15403 + }, + { + "epoch": 1.874536051110435, + "grad_norm": 1.7450706958770752, + "learning_rate": 2.0116908020176073e-07, + "loss": 0.4152, + "step": 15404 + }, + { + "epoch": 1.874657742622452, + "grad_norm": 1.606277585029602, + "learning_rate": 2.0077996204007855e-07, + "loss": 0.4053, + "step": 15405 + }, + { + "epoch": 1.874779434134469, + "grad_norm": 1.9735743999481201, + "learning_rate": 2.0039121675900163e-07, + "loss": 0.3732, + "step": 15406 + }, + { + "epoch": 1.874901125646486, + "grad_norm": 2.281611442565918, + "learning_rate": 2.0000284437332264e-07, + "loss": 0.4196, + "step": 15407 + }, + { + "epoch": 1.875022817158503, + "grad_norm": 2.0328779220581055, + "learning_rate": 1.996148448978208e-07, + "loss": 0.3523, + "step": 15408 + }, + { + "epoch": 1.87514450867052, + "grad_norm": 1.740028738975525, + "learning_rate": 1.9922721834725988e-07, + "loss": 0.3739, + "step": 15409 + }, + { + "epoch": 1.8752662001825373, + "grad_norm": 1.7364581823349, + "learning_rate": 1.9883996473638922e-07, + "loss": 0.3868, + "step": 15410 + }, + { + "epoch": 1.8753878916945543, + "grad_norm": 2.5917022228240967, + "learning_rate": 1.984530840799459e-07, + "loss": 0.4013, + "step": 15411 + }, + { + "epoch": 1.8755095832065714, + "grad_norm": 3.0971415042877197, + "learning_rate": 1.9806657639265037e-07, + "loss": 0.4174, + "step": 15412 + }, + { + "epoch": 1.8756312747185884, + "grad_norm": 1.984106183052063, + "learning_rate": 1.9768044168920974e-07, + "loss": 0.3486, + "step": 15413 + }, + { + "epoch": 1.8757529662306054, + "grad_norm": 3.0246214866638184, + "learning_rate": 1.9729467998432007e-07, + "loss": 0.3716, + "step": 15414 + }, + { + "epoch": 1.8758746577426224, + "grad_norm": 1.9176599979400635, + "learning_rate": 1.9690929129265735e-07, + "loss": 0.3722, + "step": 15415 + }, + { + "epoch": 1.8759963492546397, + "grad_norm": 1.8779288530349731, + "learning_rate": 1.9652427562888653e-07, + "loss": 0.3107, + "step": 15416 + }, + { + "epoch": 1.8761180407666567, + "grad_norm": 1.471330165863037, + "learning_rate": 1.9613963300766038e-07, + "loss": 0.3379, + "step": 15417 + }, + { + "epoch": 1.8762397322786737, + "grad_norm": 2.8038527965545654, + "learning_rate": 1.957553634436127e-07, + "loss": 0.3973, + "step": 15418 + }, + { + "epoch": 1.8763614237906907, + "grad_norm": 1.803779125213623, + "learning_rate": 1.9537146695136843e-07, + "loss": 0.3537, + "step": 15419 + }, + { + "epoch": 1.8764831153027077, + "grad_norm": 2.6164183616638184, + "learning_rate": 1.9498794354553374e-07, + "loss": 0.3273, + "step": 15420 + }, + { + "epoch": 1.8766048068147247, + "grad_norm": 2.1499290466308594, + "learning_rate": 1.9460479324070247e-07, + "loss": 0.3379, + "step": 15421 + }, + { + "epoch": 1.8767264983267418, + "grad_norm": 2.333621025085449, + "learning_rate": 1.942220160514552e-07, + "loss": 0.4227, + "step": 15422 + }, + { + "epoch": 1.8768481898387588, + "grad_norm": 3.023346185684204, + "learning_rate": 1.93839611992358e-07, + "loss": 0.4454, + "step": 15423 + }, + { + "epoch": 1.8769698813507758, + "grad_norm": 1.423231840133667, + "learning_rate": 1.9345758107795932e-07, + "loss": 0.3203, + "step": 15424 + }, + { + "epoch": 1.8770915728627928, + "grad_norm": 1.7558668851852417, + "learning_rate": 1.9307592332279966e-07, + "loss": 0.3339, + "step": 15425 + }, + { + "epoch": 1.8772132643748098, + "grad_norm": 1.9423383474349976, + "learning_rate": 1.926946387413997e-07, + "loss": 0.3719, + "step": 15426 + }, + { + "epoch": 1.8773349558868269, + "grad_norm": 1.695603847503662, + "learning_rate": 1.9231372734827004e-07, + "loss": 0.3467, + "step": 15427 + }, + { + "epoch": 1.8774566473988439, + "grad_norm": 1.5282888412475586, + "learning_rate": 1.9193318915790237e-07, + "loss": 0.401, + "step": 15428 + }, + { + "epoch": 1.877578338910861, + "grad_norm": 2.061734437942505, + "learning_rate": 1.915530241847785e-07, + "loss": 0.3678, + "step": 15429 + }, + { + "epoch": 1.877700030422878, + "grad_norm": 1.6862027645111084, + "learning_rate": 1.911732324433646e-07, + "loss": 0.4015, + "step": 15430 + }, + { + "epoch": 1.877821721934895, + "grad_norm": 2.7050788402557373, + "learning_rate": 1.907938139481136e-07, + "loss": 0.3686, + "step": 15431 + }, + { + "epoch": 1.877943413446912, + "grad_norm": 2.4505159854888916, + "learning_rate": 1.9041476871345943e-07, + "loss": 0.4082, + "step": 15432 + }, + { + "epoch": 1.878065104958929, + "grad_norm": 2.776437997817993, + "learning_rate": 1.900360967538306e-07, + "loss": 0.4408, + "step": 15433 + }, + { + "epoch": 1.878186796470946, + "grad_norm": 1.6510276794433594, + "learning_rate": 1.8965779808363227e-07, + "loss": 0.3673, + "step": 15434 + }, + { + "epoch": 1.8783084879829632, + "grad_norm": 1.5085569620132446, + "learning_rate": 1.8927987271726067e-07, + "loss": 0.3514, + "step": 15435 + }, + { + "epoch": 1.8784301794949803, + "grad_norm": 1.781345009803772, + "learning_rate": 1.8890232066909764e-07, + "loss": 0.3616, + "step": 15436 + }, + { + "epoch": 1.8785518710069973, + "grad_norm": 1.3456544876098633, + "learning_rate": 1.8852514195350947e-07, + "loss": 0.3266, + "step": 15437 + }, + { + "epoch": 1.8786735625190143, + "grad_norm": 1.8861418962478638, + "learning_rate": 1.881483365848491e-07, + "loss": 0.404, + "step": 15438 + }, + { + "epoch": 1.8787952540310313, + "grad_norm": 1.5723501443862915, + "learning_rate": 1.8777190457745287e-07, + "loss": 0.3765, + "step": 15439 + }, + { + "epoch": 1.8789169455430483, + "grad_norm": 1.626685619354248, + "learning_rate": 1.8739584594564485e-07, + "loss": 0.3452, + "step": 15440 + }, + { + "epoch": 1.8790386370550656, + "grad_norm": 3.0832009315490723, + "learning_rate": 1.8702016070373807e-07, + "loss": 0.4342, + "step": 15441 + }, + { + "epoch": 1.8791603285670826, + "grad_norm": 2.1513757705688477, + "learning_rate": 1.866448488660244e-07, + "loss": 0.402, + "step": 15442 + }, + { + "epoch": 1.8792820200790996, + "grad_norm": 1.746595859527588, + "learning_rate": 1.8626991044678688e-07, + "loss": 0.3934, + "step": 15443 + }, + { + "epoch": 1.8794037115911166, + "grad_norm": 1.7503626346588135, + "learning_rate": 1.8589534546029297e-07, + "loss": 0.3429, + "step": 15444 + }, + { + "epoch": 1.8795254031031337, + "grad_norm": 2.16701340675354, + "learning_rate": 1.8552115392079462e-07, + "loss": 0.3243, + "step": 15445 + }, + { + "epoch": 1.8796470946151507, + "grad_norm": 3.6273088455200195, + "learning_rate": 1.8514733584253153e-07, + "loss": 0.3482, + "step": 15446 + }, + { + "epoch": 1.8797687861271677, + "grad_norm": 1.6590532064437866, + "learning_rate": 1.84773891239729e-07, + "loss": 0.362, + "step": 15447 + }, + { + "epoch": 1.8798904776391847, + "grad_norm": 1.4681428670883179, + "learning_rate": 1.8440082012659455e-07, + "loss": 0.3314, + "step": 15448 + }, + { + "epoch": 1.8800121691512017, + "grad_norm": 2.2805116176605225, + "learning_rate": 1.8402812251732683e-07, + "loss": 0.3541, + "step": 15449 + }, + { + "epoch": 1.8801338606632187, + "grad_norm": 2.2875566482543945, + "learning_rate": 1.836557984261078e-07, + "loss": 0.4365, + "step": 15450 + }, + { + "epoch": 1.8802555521752358, + "grad_norm": 1.4187343120574951, + "learning_rate": 1.8328384786710396e-07, + "loss": 0.3401, + "step": 15451 + }, + { + "epoch": 1.8803772436872528, + "grad_norm": 1.6635375022888184, + "learning_rate": 1.829122708544695e-07, + "loss": 0.3574, + "step": 15452 + }, + { + "epoch": 1.8804989351992698, + "grad_norm": 1.9890598058700562, + "learning_rate": 1.82541067402342e-07, + "loss": 0.3674, + "step": 15453 + }, + { + "epoch": 1.8806206267112868, + "grad_norm": 1.6499123573303223, + "learning_rate": 1.8217023752484907e-07, + "loss": 0.3469, + "step": 15454 + }, + { + "epoch": 1.8807423182233038, + "grad_norm": 3.207336664199829, + "learning_rate": 1.8179978123609942e-07, + "loss": 0.4323, + "step": 15455 + }, + { + "epoch": 1.8808640097353209, + "grad_norm": 1.9012500047683716, + "learning_rate": 1.8142969855019067e-07, + "loss": 0.4465, + "step": 15456 + }, + { + "epoch": 1.8809857012473379, + "grad_norm": 2.123323917388916, + "learning_rate": 1.8105998948120595e-07, + "loss": 0.3544, + "step": 15457 + }, + { + "epoch": 1.881107392759355, + "grad_norm": 1.8889573812484741, + "learning_rate": 1.8069065404321295e-07, + "loss": 0.3495, + "step": 15458 + }, + { + "epoch": 1.881229084271372, + "grad_norm": 2.0151238441467285, + "learning_rate": 1.8032169225026487e-07, + "loss": 0.4075, + "step": 15459 + }, + { + "epoch": 1.8813507757833892, + "grad_norm": 1.5618926286697388, + "learning_rate": 1.7995310411640266e-07, + "loss": 0.3821, + "step": 15460 + }, + { + "epoch": 1.8814724672954062, + "grad_norm": 2.0830960273742676, + "learning_rate": 1.795848896556507e-07, + "loss": 0.3659, + "step": 15461 + }, + { + "epoch": 1.8815941588074232, + "grad_norm": 2.1608448028564453, + "learning_rate": 1.7921704888202107e-07, + "loss": 0.3866, + "step": 15462 + }, + { + "epoch": 1.8817158503194402, + "grad_norm": 2.254701852798462, + "learning_rate": 1.788495818095104e-07, + "loss": 0.3561, + "step": 15463 + }, + { + "epoch": 1.8818375418314572, + "grad_norm": 2.0414462089538574, + "learning_rate": 1.7848248845210192e-07, + "loss": 0.3438, + "step": 15464 + }, + { + "epoch": 1.8819592333434743, + "grad_norm": 1.682150959968567, + "learning_rate": 1.7811576882376446e-07, + "loss": 0.3432, + "step": 15465 + }, + { + "epoch": 1.8820809248554913, + "grad_norm": 1.691027045249939, + "learning_rate": 1.7774942293845132e-07, + "loss": 0.3851, + "step": 15466 + }, + { + "epoch": 1.8822026163675085, + "grad_norm": 1.6096749305725098, + "learning_rate": 1.7738345081010354e-07, + "loss": 0.4077, + "step": 15467 + }, + { + "epoch": 1.8823243078795255, + "grad_norm": 2.0246798992156982, + "learning_rate": 1.7701785245264779e-07, + "loss": 0.3679, + "step": 15468 + }, + { + "epoch": 1.8824459993915426, + "grad_norm": 2.1744384765625, + "learning_rate": 1.7665262787999625e-07, + "loss": 0.4022, + "step": 15469 + }, + { + "epoch": 1.8825676909035596, + "grad_norm": 2.3079404830932617, + "learning_rate": 1.762877771060434e-07, + "loss": 0.418, + "step": 15470 + }, + { + "epoch": 1.8826893824155766, + "grad_norm": 1.723802089691162, + "learning_rate": 1.7592330014467586e-07, + "loss": 0.3776, + "step": 15471 + }, + { + "epoch": 1.8828110739275936, + "grad_norm": 1.6644879579544067, + "learning_rate": 1.7555919700976144e-07, + "loss": 0.3448, + "step": 15472 + }, + { + "epoch": 1.8829327654396106, + "grad_norm": 1.8033770322799683, + "learning_rate": 1.751954677151546e-07, + "loss": 0.3741, + "step": 15473 + }, + { + "epoch": 1.8830544569516277, + "grad_norm": 2.4408578872680664, + "learning_rate": 1.7483211227469654e-07, + "loss": 0.4361, + "step": 15474 + }, + { + "epoch": 1.8831761484636447, + "grad_norm": 4.081774711608887, + "learning_rate": 1.7446913070221283e-07, + "loss": 0.2735, + "step": 15475 + }, + { + "epoch": 1.8832978399756617, + "grad_norm": 1.7569750547409058, + "learning_rate": 1.7410652301151798e-07, + "loss": 0.4259, + "step": 15476 + }, + { + "epoch": 1.8834195314876787, + "grad_norm": 1.794790267944336, + "learning_rate": 1.7374428921640761e-07, + "loss": 0.3754, + "step": 15477 + }, + { + "epoch": 1.8835412229996957, + "grad_norm": 1.802484154701233, + "learning_rate": 1.733824293306663e-07, + "loss": 0.3809, + "step": 15478 + }, + { + "epoch": 1.8836629145117127, + "grad_norm": 1.5311236381530762, + "learning_rate": 1.7302094336806298e-07, + "loss": 0.325, + "step": 15479 + }, + { + "epoch": 1.8837846060237298, + "grad_norm": 1.5800395011901855, + "learning_rate": 1.7265983134235442e-07, + "loss": 0.3426, + "step": 15480 + }, + { + "epoch": 1.8839062975357468, + "grad_norm": 1.874714732170105, + "learning_rate": 1.7229909326727968e-07, + "loss": 0.3422, + "step": 15481 + }, + { + "epoch": 1.8840279890477638, + "grad_norm": 2.0682287216186523, + "learning_rate": 1.7193872915656773e-07, + "loss": 0.3549, + "step": 15482 + }, + { + "epoch": 1.8841496805597808, + "grad_norm": 1.8225303888320923, + "learning_rate": 1.7157873902392986e-07, + "loss": 0.3752, + "step": 15483 + }, + { + "epoch": 1.8842713720717978, + "grad_norm": 2.681377649307251, + "learning_rate": 1.7121912288306508e-07, + "loss": 0.2461, + "step": 15484 + }, + { + "epoch": 1.8843930635838149, + "grad_norm": 1.4650429487228394, + "learning_rate": 1.7085988074765692e-07, + "loss": 0.3798, + "step": 15485 + }, + { + "epoch": 1.884514755095832, + "grad_norm": 1.6445276737213135, + "learning_rate": 1.7050101263137443e-07, + "loss": 0.3276, + "step": 15486 + }, + { + "epoch": 1.8846364466078491, + "grad_norm": 2.489896059036255, + "learning_rate": 1.7014251854787556e-07, + "loss": 0.3001, + "step": 15487 + }, + { + "epoch": 1.8847581381198661, + "grad_norm": 2.9276480674743652, + "learning_rate": 1.6978439851080054e-07, + "loss": 0.3048, + "step": 15488 + }, + { + "epoch": 1.8848798296318832, + "grad_norm": 1.4694931507110596, + "learning_rate": 1.6942665253377622e-07, + "loss": 0.3504, + "step": 15489 + }, + { + "epoch": 1.8850015211439002, + "grad_norm": 2.2043704986572266, + "learning_rate": 1.6906928063041504e-07, + "loss": 0.3718, + "step": 15490 + }, + { + "epoch": 1.8851232126559172, + "grad_norm": 2.199695587158203, + "learning_rate": 1.687122828143184e-07, + "loss": 0.3321, + "step": 15491 + }, + { + "epoch": 1.8852449041679344, + "grad_norm": 1.8922832012176514, + "learning_rate": 1.6835565909906758e-07, + "loss": 0.3783, + "step": 15492 + }, + { + "epoch": 1.8853665956799515, + "grad_norm": 1.9197243452072144, + "learning_rate": 1.6799940949823513e-07, + "loss": 0.3661, + "step": 15493 + }, + { + "epoch": 1.8854882871919685, + "grad_norm": 1.7733432054519653, + "learning_rate": 1.676435340253757e-07, + "loss": 0.3622, + "step": 15494 + }, + { + "epoch": 1.8856099787039855, + "grad_norm": 1.6435930728912354, + "learning_rate": 1.6728803269403182e-07, + "loss": 0.4033, + "step": 15495 + }, + { + "epoch": 1.8857316702160025, + "grad_norm": 2.256098747253418, + "learning_rate": 1.6693290551773155e-07, + "loss": 0.3568, + "step": 15496 + }, + { + "epoch": 1.8858533617280195, + "grad_norm": 1.4301310777664185, + "learning_rate": 1.6657815250998744e-07, + "loss": 0.3644, + "step": 15497 + }, + { + "epoch": 1.8859750532400366, + "grad_norm": 1.7706140279769897, + "learning_rate": 1.6622377368429864e-07, + "loss": 0.3567, + "step": 15498 + }, + { + "epoch": 1.8860967447520536, + "grad_norm": 2.2572269439697266, + "learning_rate": 1.6586976905414997e-07, + "loss": 0.4194, + "step": 15499 + }, + { + "epoch": 1.8862184362640706, + "grad_norm": 1.787251591682434, + "learning_rate": 1.6551613863301063e-07, + "loss": 0.3687, + "step": 15500 + }, + { + "epoch": 1.8863401277760876, + "grad_norm": 1.6112436056137085, + "learning_rate": 1.6516288243433986e-07, + "loss": 0.3715, + "step": 15501 + }, + { + "epoch": 1.8864618192881046, + "grad_norm": 2.2957050800323486, + "learning_rate": 1.6481000047157803e-07, + "loss": 0.3914, + "step": 15502 + }, + { + "epoch": 1.8865835108001217, + "grad_norm": 1.4749581813812256, + "learning_rate": 1.6445749275815326e-07, + "loss": 0.3638, + "step": 15503 + }, + { + "epoch": 1.8867052023121387, + "grad_norm": 2.2407288551330566, + "learning_rate": 1.641053593074804e-07, + "loss": 0.2987, + "step": 15504 + }, + { + "epoch": 1.8868268938241557, + "grad_norm": 2.0497801303863525, + "learning_rate": 1.637536001329565e-07, + "loss": 0.3919, + "step": 15505 + }, + { + "epoch": 1.8869485853361727, + "grad_norm": 2.0333075523376465, + "learning_rate": 1.634022152479686e-07, + "loss": 0.4193, + "step": 15506 + }, + { + "epoch": 1.8870702768481897, + "grad_norm": 1.5225858688354492, + "learning_rate": 1.630512046658883e-07, + "loss": 0.3381, + "step": 15507 + }, + { + "epoch": 1.8871919683602068, + "grad_norm": 1.4889533519744873, + "learning_rate": 1.6270056840006932e-07, + "loss": 0.3417, + "step": 15508 + }, + { + "epoch": 1.8873136598722238, + "grad_norm": 2.391403913497925, + "learning_rate": 1.6235030646385652e-07, + "loss": 0.4248, + "step": 15509 + }, + { + "epoch": 1.8874353513842408, + "grad_norm": 1.5303593873977661, + "learning_rate": 1.6200041887057705e-07, + "loss": 0.3308, + "step": 15510 + }, + { + "epoch": 1.887557042896258, + "grad_norm": 1.7096195220947266, + "learning_rate": 1.616509056335458e-07, + "loss": 0.3759, + "step": 15511 + }, + { + "epoch": 1.887678734408275, + "grad_norm": 3.1133453845977783, + "learning_rate": 1.6130176676606101e-07, + "loss": 0.4211, + "step": 15512 + }, + { + "epoch": 1.887800425920292, + "grad_norm": 1.7224997282028198, + "learning_rate": 1.6095300228140876e-07, + "loss": 0.357, + "step": 15513 + }, + { + "epoch": 1.887922117432309, + "grad_norm": 1.5754659175872803, + "learning_rate": 1.6060461219286172e-07, + "loss": 0.3594, + "step": 15514 + }, + { + "epoch": 1.888043808944326, + "grad_norm": 2.473769426345825, + "learning_rate": 1.6025659651367487e-07, + "loss": 0.3307, + "step": 15515 + }, + { + "epoch": 1.8881655004563431, + "grad_norm": 1.7727367877960205, + "learning_rate": 1.5990895525709094e-07, + "loss": 0.3692, + "step": 15516 + }, + { + "epoch": 1.8882871919683604, + "grad_norm": 2.061589002609253, + "learning_rate": 1.5956168843634046e-07, + "loss": 0.4008, + "step": 15517 + }, + { + "epoch": 1.8884088834803774, + "grad_norm": 1.6778526306152344, + "learning_rate": 1.592147960646362e-07, + "loss": 0.4159, + "step": 15518 + }, + { + "epoch": 1.8885305749923944, + "grad_norm": 3.4384090900421143, + "learning_rate": 1.5886827815517757e-07, + "loss": 0.4471, + "step": 15519 + }, + { + "epoch": 1.8886522665044114, + "grad_norm": 1.743391513824463, + "learning_rate": 1.5852213472115187e-07, + "loss": 0.368, + "step": 15520 + }, + { + "epoch": 1.8887739580164284, + "grad_norm": 2.0557942390441895, + "learning_rate": 1.5817636577572847e-07, + "loss": 0.3795, + "step": 15521 + }, + { + "epoch": 1.8888956495284455, + "grad_norm": 1.49049973487854, + "learning_rate": 1.5783097133206694e-07, + "loss": 0.3636, + "step": 15522 + }, + { + "epoch": 1.8890173410404625, + "grad_norm": 1.6343085765838623, + "learning_rate": 1.574859514033089e-07, + "loss": 0.386, + "step": 15523 + }, + { + "epoch": 1.8891390325524795, + "grad_norm": 1.5958449840545654, + "learning_rate": 1.571413060025828e-07, + "loss": 0.3486, + "step": 15524 + }, + { + "epoch": 1.8892607240644965, + "grad_norm": 2.0006320476531982, + "learning_rate": 1.5679703514300372e-07, + "loss": 0.3787, + "step": 15525 + }, + { + "epoch": 1.8893824155765135, + "grad_norm": 1.7781809568405151, + "learning_rate": 1.564531388376722e-07, + "loss": 0.3478, + "step": 15526 + }, + { + "epoch": 1.8895041070885306, + "grad_norm": 1.8301674127578735, + "learning_rate": 1.5610961709967233e-07, + "loss": 0.3798, + "step": 15527 + }, + { + "epoch": 1.8896257986005476, + "grad_norm": 1.5751827955245972, + "learning_rate": 1.5576646994207912e-07, + "loss": 0.3759, + "step": 15528 + }, + { + "epoch": 1.8897474901125646, + "grad_norm": 2.0050911903381348, + "learning_rate": 1.554236973779477e-07, + "loss": 0.3099, + "step": 15529 + }, + { + "epoch": 1.8898691816245816, + "grad_norm": 2.4886465072631836, + "learning_rate": 1.5508129942032103e-07, + "loss": 0.304, + "step": 15530 + }, + { + "epoch": 1.8899908731365986, + "grad_norm": 2.0082767009735107, + "learning_rate": 1.547392760822297e-07, + "loss": 0.3541, + "step": 15531 + }, + { + "epoch": 1.8901125646486157, + "grad_norm": 1.6126676797866821, + "learning_rate": 1.5439762737668672e-07, + "loss": 0.3408, + "step": 15532 + }, + { + "epoch": 1.8902342561606327, + "grad_norm": 1.9257162809371948, + "learning_rate": 1.540563533166939e-07, + "loss": 0.3124, + "step": 15533 + }, + { + "epoch": 1.8903559476726497, + "grad_norm": 2.029927968978882, + "learning_rate": 1.5371545391523636e-07, + "loss": 0.3296, + "step": 15534 + }, + { + "epoch": 1.8904776391846667, + "grad_norm": 2.923900604248047, + "learning_rate": 1.5337492918528596e-07, + "loss": 0.3741, + "step": 15535 + }, + { + "epoch": 1.890599330696684, + "grad_norm": 1.9904184341430664, + "learning_rate": 1.5303477913980235e-07, + "loss": 0.3912, + "step": 15536 + }, + { + "epoch": 1.890721022208701, + "grad_norm": 1.8978614807128906, + "learning_rate": 1.526950037917263e-07, + "loss": 0.3184, + "step": 15537 + }, + { + "epoch": 1.890842713720718, + "grad_norm": 1.5287790298461914, + "learning_rate": 1.5235560315398856e-07, + "loss": 0.3667, + "step": 15538 + }, + { + "epoch": 1.890964405232735, + "grad_norm": 1.7170741558074951, + "learning_rate": 1.5201657723950326e-07, + "loss": 0.4334, + "step": 15539 + }, + { + "epoch": 1.891086096744752, + "grad_norm": 2.079960823059082, + "learning_rate": 1.5167792606117226e-07, + "loss": 0.3985, + "step": 15540 + }, + { + "epoch": 1.891207788256769, + "grad_norm": 1.5038026571273804, + "learning_rate": 1.5133964963187975e-07, + "loss": 0.362, + "step": 15541 + }, + { + "epoch": 1.8913294797687863, + "grad_norm": 1.8691158294677734, + "learning_rate": 1.5100174796450096e-07, + "loss": 0.3201, + "step": 15542 + }, + { + "epoch": 1.8914511712808033, + "grad_norm": 2.1943020820617676, + "learning_rate": 1.5066422107189117e-07, + "loss": 0.3209, + "step": 15543 + }, + { + "epoch": 1.8915728627928203, + "grad_norm": 2.417699098587036, + "learning_rate": 1.5032706896689452e-07, + "loss": 0.2917, + "step": 15544 + }, + { + "epoch": 1.8916945543048374, + "grad_norm": 1.629287838935852, + "learning_rate": 1.499902916623408e-07, + "loss": 0.3766, + "step": 15545 + }, + { + "epoch": 1.8918162458168544, + "grad_norm": 3.198397159576416, + "learning_rate": 1.496538891710442e-07, + "loss": 0.4089, + "step": 15546 + }, + { + "epoch": 1.8919379373288714, + "grad_norm": 1.9084103107452393, + "learning_rate": 1.4931786150580775e-07, + "loss": 0.3456, + "step": 15547 + }, + { + "epoch": 1.8920596288408884, + "grad_norm": 1.6851654052734375, + "learning_rate": 1.489822086794157e-07, + "loss": 0.3023, + "step": 15548 + }, + { + "epoch": 1.8921813203529054, + "grad_norm": 2.7930757999420166, + "learning_rate": 1.486469307046401e-07, + "loss": 0.383, + "step": 15549 + }, + { + "epoch": 1.8923030118649224, + "grad_norm": 1.7203458547592163, + "learning_rate": 1.4831202759424068e-07, + "loss": 0.3327, + "step": 15550 + }, + { + "epoch": 1.8924247033769395, + "grad_norm": 2.0448713302612305, + "learning_rate": 1.4797749936096173e-07, + "loss": 0.4002, + "step": 15551 + }, + { + "epoch": 1.8925463948889565, + "grad_norm": 2.388394594192505, + "learning_rate": 1.4764334601752972e-07, + "loss": 0.391, + "step": 15552 + }, + { + "epoch": 1.8926680864009735, + "grad_norm": 4.091358661651611, + "learning_rate": 1.4730956757666337e-07, + "loss": 0.4297, + "step": 15553 + }, + { + "epoch": 1.8927897779129905, + "grad_norm": 1.76665461063385, + "learning_rate": 1.469761640510603e-07, + "loss": 0.2893, + "step": 15554 + }, + { + "epoch": 1.8929114694250075, + "grad_norm": 1.7423388957977295, + "learning_rate": 1.4664313545340926e-07, + "loss": 0.3966, + "step": 15555 + }, + { + "epoch": 1.8930331609370246, + "grad_norm": 1.6806039810180664, + "learning_rate": 1.463104817963823e-07, + "loss": 0.3056, + "step": 15556 + }, + { + "epoch": 1.8931548524490416, + "grad_norm": 1.9396659135818481, + "learning_rate": 1.4597820309263712e-07, + "loss": 0.424, + "step": 15557 + }, + { + "epoch": 1.8932765439610586, + "grad_norm": 3.064082145690918, + "learning_rate": 1.4564629935481912e-07, + "loss": 0.3772, + "step": 15558 + }, + { + "epoch": 1.8933982354730756, + "grad_norm": 1.7717182636260986, + "learning_rate": 1.45314770595556e-07, + "loss": 0.3954, + "step": 15559 + }, + { + "epoch": 1.8935199269850926, + "grad_norm": 1.7230448722839355, + "learning_rate": 1.449836168274632e-07, + "loss": 0.3335, + "step": 15560 + }, + { + "epoch": 1.8936416184971099, + "grad_norm": 1.8982417583465576, + "learning_rate": 1.4465283806314402e-07, + "loss": 0.3958, + "step": 15561 + }, + { + "epoch": 1.893763310009127, + "grad_norm": 1.5071955919265747, + "learning_rate": 1.4432243431518279e-07, + "loss": 0.3356, + "step": 15562 + }, + { + "epoch": 1.893885001521144, + "grad_norm": 1.5439248085021973, + "learning_rate": 1.439924055961539e-07, + "loss": 0.3759, + "step": 15563 + }, + { + "epoch": 1.894006693033161, + "grad_norm": 4.547462463378906, + "learning_rate": 1.43662751918614e-07, + "loss": 0.4813, + "step": 15564 + }, + { + "epoch": 1.894128384545178, + "grad_norm": 1.606374979019165, + "learning_rate": 1.433334732951086e-07, + "loss": 0.3566, + "step": 15565 + }, + { + "epoch": 1.894250076057195, + "grad_norm": 2.2416622638702393, + "learning_rate": 1.4300456973816656e-07, + "loss": 0.4402, + "step": 15566 + }, + { + "epoch": 1.894371767569212, + "grad_norm": 1.9301303625106812, + "learning_rate": 1.4267604126030454e-07, + "loss": 0.3631, + "step": 15567 + }, + { + "epoch": 1.8944934590812292, + "grad_norm": 3.0308613777160645, + "learning_rate": 1.4234788787402033e-07, + "loss": 0.3651, + "step": 15568 + }, + { + "epoch": 1.8946151505932463, + "grad_norm": 2.5868449211120605, + "learning_rate": 1.42020109591805e-07, + "loss": 0.3535, + "step": 15569 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 1.9669126272201538, + "learning_rate": 1.4169270642612755e-07, + "loss": 0.3292, + "step": 15570 + }, + { + "epoch": 1.8948585336172803, + "grad_norm": 2.1283349990844727, + "learning_rate": 1.4136567838945015e-07, + "loss": 0.3361, + "step": 15571 + }, + { + "epoch": 1.8949802251292973, + "grad_norm": 1.5383472442626953, + "learning_rate": 1.41039025494214e-07, + "loss": 0.3531, + "step": 15572 + }, + { + "epoch": 1.8951019166413143, + "grad_norm": 2.2978322505950928, + "learning_rate": 1.4071274775285027e-07, + "loss": 0.3438, + "step": 15573 + }, + { + "epoch": 1.8952236081533314, + "grad_norm": 1.8739210367202759, + "learning_rate": 1.4038684517777347e-07, + "loss": 0.382, + "step": 15574 + }, + { + "epoch": 1.8953452996653484, + "grad_norm": 1.5733166933059692, + "learning_rate": 1.4006131778138698e-07, + "loss": 0.3613, + "step": 15575 + }, + { + "epoch": 1.8954669911773654, + "grad_norm": 1.9286514520645142, + "learning_rate": 1.3973616557607538e-07, + "loss": 0.4076, + "step": 15576 + }, + { + "epoch": 1.8955886826893824, + "grad_norm": 4.407425880432129, + "learning_rate": 1.3941138857421211e-07, + "loss": 0.3411, + "step": 15577 + }, + { + "epoch": 1.8957103742013994, + "grad_norm": 1.8577276468276978, + "learning_rate": 1.39086986788155e-07, + "loss": 0.4108, + "step": 15578 + }, + { + "epoch": 1.8958320657134164, + "grad_norm": 2.144033193588257, + "learning_rate": 1.387629602302498e-07, + "loss": 0.4027, + "step": 15579 + }, + { + "epoch": 1.8959537572254335, + "grad_norm": 1.8283655643463135, + "learning_rate": 1.3843930891282665e-07, + "loss": 0.364, + "step": 15580 + }, + { + "epoch": 1.8960754487374505, + "grad_norm": 1.5492373704910278, + "learning_rate": 1.3811603284819786e-07, + "loss": 0.3841, + "step": 15581 + }, + { + "epoch": 1.8961971402494675, + "grad_norm": 2.024186372756958, + "learning_rate": 1.377931320486692e-07, + "loss": 0.3897, + "step": 15582 + }, + { + "epoch": 1.8963188317614845, + "grad_norm": 2.529646396636963, + "learning_rate": 1.374706065265241e-07, + "loss": 0.3494, + "step": 15583 + }, + { + "epoch": 1.8964405232735015, + "grad_norm": 2.182511329650879, + "learning_rate": 1.371484562940373e-07, + "loss": 0.3583, + "step": 15584 + }, + { + "epoch": 1.8965622147855186, + "grad_norm": 2.0404226779937744, + "learning_rate": 1.3682668136346667e-07, + "loss": 0.3291, + "step": 15585 + }, + { + "epoch": 1.8966839062975356, + "grad_norm": 1.476284146308899, + "learning_rate": 1.365052817470569e-07, + "loss": 0.3515, + "step": 15586 + }, + { + "epoch": 1.8968055978095528, + "grad_norm": 1.8726391792297363, + "learning_rate": 1.36184257457036e-07, + "loss": 0.3278, + "step": 15587 + }, + { + "epoch": 1.8969272893215698, + "grad_norm": 3.92242693901062, + "learning_rate": 1.3586360850562307e-07, + "loss": 0.4368, + "step": 15588 + }, + { + "epoch": 1.8970489808335869, + "grad_norm": 1.8666071891784668, + "learning_rate": 1.3554333490501614e-07, + "loss": 0.3376, + "step": 15589 + }, + { + "epoch": 1.8971706723456039, + "grad_norm": 1.6082663536071777, + "learning_rate": 1.3522343666740435e-07, + "loss": 0.3344, + "step": 15590 + }, + { + "epoch": 1.897292363857621, + "grad_norm": 1.8468172550201416, + "learning_rate": 1.3490391380495905e-07, + "loss": 0.3396, + "step": 15591 + }, + { + "epoch": 1.897414055369638, + "grad_norm": 1.9696063995361328, + "learning_rate": 1.345847663298394e-07, + "loss": 0.3169, + "step": 15592 + }, + { + "epoch": 1.8975357468816552, + "grad_norm": 1.651789665222168, + "learning_rate": 1.342659942541902e-07, + "loss": 0.3724, + "step": 15593 + }, + { + "epoch": 1.8976574383936722, + "grad_norm": 1.7387598752975464, + "learning_rate": 1.3394759759014164e-07, + "loss": 0.2967, + "step": 15594 + }, + { + "epoch": 1.8977791299056892, + "grad_norm": 1.9992504119873047, + "learning_rate": 1.3362957634980744e-07, + "loss": 0.3944, + "step": 15595 + }, + { + "epoch": 1.8979008214177062, + "grad_norm": 1.7703934907913208, + "learning_rate": 1.3331193054529012e-07, + "loss": 0.3752, + "step": 15596 + }, + { + "epoch": 1.8980225129297232, + "grad_norm": 1.9858839511871338, + "learning_rate": 1.329946601886778e-07, + "loss": 0.3696, + "step": 15597 + }, + { + "epoch": 1.8981442044417403, + "grad_norm": 1.469612717628479, + "learning_rate": 1.326777652920408e-07, + "loss": 0.3892, + "step": 15598 + }, + { + "epoch": 1.8982658959537573, + "grad_norm": 1.9799754619598389, + "learning_rate": 1.3236124586744058e-07, + "loss": 0.3085, + "step": 15599 + }, + { + "epoch": 1.8983875874657743, + "grad_norm": 3.9746949672698975, + "learning_rate": 1.3204510192691867e-07, + "loss": 0.4683, + "step": 15600 + }, + { + "epoch": 1.8985092789777913, + "grad_norm": 1.6107414960861206, + "learning_rate": 1.3172933348250649e-07, + "loss": 0.3116, + "step": 15601 + }, + { + "epoch": 1.8986309704898083, + "grad_norm": 2.864382028579712, + "learning_rate": 1.3141394054621892e-07, + "loss": 0.4334, + "step": 15602 + }, + { + "epoch": 1.8987526620018254, + "grad_norm": 1.5157403945922852, + "learning_rate": 1.3109892313005856e-07, + "loss": 0.3401, + "step": 15603 + }, + { + "epoch": 1.8988743535138424, + "grad_norm": 1.9197707176208496, + "learning_rate": 1.307842812460114e-07, + "loss": 0.3541, + "step": 15604 + }, + { + "epoch": 1.8989960450258594, + "grad_norm": 2.0894527435302734, + "learning_rate": 1.3047001490605005e-07, + "loss": 0.366, + "step": 15605 + }, + { + "epoch": 1.8991177365378764, + "grad_norm": 1.8883907794952393, + "learning_rate": 1.3015612412213386e-07, + "loss": 0.3127, + "step": 15606 + }, + { + "epoch": 1.8992394280498934, + "grad_norm": 2.121696949005127, + "learning_rate": 1.298426089062066e-07, + "loss": 0.3748, + "step": 15607 + }, + { + "epoch": 1.8993611195619104, + "grad_norm": 1.4395122528076172, + "learning_rate": 1.2952946927019872e-07, + "loss": 0.3828, + "step": 15608 + }, + { + "epoch": 1.8994828110739275, + "grad_norm": 1.9695862531661987, + "learning_rate": 1.2921670522602402e-07, + "loss": 0.4084, + "step": 15609 + }, + { + "epoch": 1.8996045025859445, + "grad_norm": 1.5089620351791382, + "learning_rate": 1.2890431678558635e-07, + "loss": 0.3479, + "step": 15610 + }, + { + "epoch": 1.8997261940979615, + "grad_norm": 1.7086600065231323, + "learning_rate": 1.285923039607706e-07, + "loss": 0.3913, + "step": 15611 + }, + { + "epoch": 1.8998478856099787, + "grad_norm": 1.719649076461792, + "learning_rate": 1.2828066676345064e-07, + "loss": 0.3901, + "step": 15612 + }, + { + "epoch": 1.8999695771219958, + "grad_norm": 2.038633108139038, + "learning_rate": 1.2796940520548473e-07, + "loss": 0.4471, + "step": 15613 + }, + { + "epoch": 1.9000912686340128, + "grad_norm": 2.121171712875366, + "learning_rate": 1.2765851929871675e-07, + "loss": 0.38, + "step": 15614 + }, + { + "epoch": 1.9002129601460298, + "grad_norm": 2.4557530879974365, + "learning_rate": 1.2734800905497725e-07, + "loss": 0.4629, + "step": 15615 + }, + { + "epoch": 1.9003346516580468, + "grad_norm": 1.3311986923217773, + "learning_rate": 1.2703787448608118e-07, + "loss": 0.3179, + "step": 15616 + }, + { + "epoch": 1.9004563431700638, + "grad_norm": 3.0475714206695557, + "learning_rate": 1.2672811560382913e-07, + "loss": 0.4957, + "step": 15617 + }, + { + "epoch": 1.900578034682081, + "grad_norm": 1.663323163986206, + "learning_rate": 1.2641873242000946e-07, + "loss": 0.3727, + "step": 15618 + }, + { + "epoch": 1.900699726194098, + "grad_norm": 2.1675822734832764, + "learning_rate": 1.2610972494639384e-07, + "loss": 0.3994, + "step": 15619 + }, + { + "epoch": 1.9008214177061151, + "grad_norm": 1.463021159172058, + "learning_rate": 1.2580109319474065e-07, + "loss": 0.3273, + "step": 15620 + }, + { + "epoch": 1.9009431092181321, + "grad_norm": 2.210233211517334, + "learning_rate": 1.2549283717679607e-07, + "loss": 0.3225, + "step": 15621 + }, + { + "epoch": 1.9010648007301492, + "grad_norm": 1.7576329708099365, + "learning_rate": 1.2518495690428623e-07, + "loss": 0.3378, + "step": 15622 + }, + { + "epoch": 1.9011864922421662, + "grad_norm": 1.4421683549880981, + "learning_rate": 1.2487745238892956e-07, + "loss": 0.3749, + "step": 15623 + }, + { + "epoch": 1.9013081837541832, + "grad_norm": 1.74948251247406, + "learning_rate": 1.245703236424267e-07, + "loss": 0.3327, + "step": 15624 + }, + { + "epoch": 1.9014298752662002, + "grad_norm": 2.2133705615997314, + "learning_rate": 1.2426357067646278e-07, + "loss": 0.2968, + "step": 15625 + }, + { + "epoch": 1.9015515667782172, + "grad_norm": 1.6672260761260986, + "learning_rate": 1.2395719350271174e-07, + "loss": 0.3839, + "step": 15626 + }, + { + "epoch": 1.9016732582902343, + "grad_norm": 1.4690799713134766, + "learning_rate": 1.2365119213283317e-07, + "loss": 0.335, + "step": 15627 + }, + { + "epoch": 1.9017949498022513, + "grad_norm": 1.741631031036377, + "learning_rate": 1.2334556657846774e-07, + "loss": 0.3582, + "step": 15628 + }, + { + "epoch": 1.9019166413142683, + "grad_norm": 2.2371408939361572, + "learning_rate": 1.230403168512484e-07, + "loss": 0.4062, + "step": 15629 + }, + { + "epoch": 1.9020383328262853, + "grad_norm": 3.551867723464966, + "learning_rate": 1.2273544296278806e-07, + "loss": 0.4439, + "step": 15630 + }, + { + "epoch": 1.9021600243383023, + "grad_norm": 1.673030972480774, + "learning_rate": 1.2243094492468964e-07, + "loss": 0.383, + "step": 15631 + }, + { + "epoch": 1.9022817158503194, + "grad_norm": 2.83841609954834, + "learning_rate": 1.221268227485395e-07, + "loss": 0.4466, + "step": 15632 + }, + { + "epoch": 1.9024034073623364, + "grad_norm": 1.5153568983078003, + "learning_rate": 1.218230764459094e-07, + "loss": 0.357, + "step": 15633 + }, + { + "epoch": 1.9025250988743534, + "grad_norm": 1.9943279027938843, + "learning_rate": 1.2151970602835904e-07, + "loss": 0.3939, + "step": 15634 + }, + { + "epoch": 1.9026467903863704, + "grad_norm": 1.7895900011062622, + "learning_rate": 1.212167115074303e-07, + "loss": 0.4009, + "step": 15635 + }, + { + "epoch": 1.9027684818983874, + "grad_norm": 2.7744574546813965, + "learning_rate": 1.2091409289465283e-07, + "loss": 0.3392, + "step": 15636 + }, + { + "epoch": 1.9028901734104047, + "grad_norm": 2.0328197479248047, + "learning_rate": 1.2061185020154409e-07, + "loss": 0.3749, + "step": 15637 + }, + { + "epoch": 1.9030118649224217, + "grad_norm": 2.0816256999969482, + "learning_rate": 1.2030998343960265e-07, + "loss": 0.4097, + "step": 15638 + }, + { + "epoch": 1.9031335564344387, + "grad_norm": 1.815958023071289, + "learning_rate": 1.200084926203171e-07, + "loss": 0.3476, + "step": 15639 + }, + { + "epoch": 1.9032552479464557, + "grad_norm": 1.5143921375274658, + "learning_rate": 1.197073777551583e-07, + "loss": 0.3698, + "step": 15640 + }, + { + "epoch": 1.9033769394584727, + "grad_norm": 2.451936721801758, + "learning_rate": 1.1940663885558368e-07, + "loss": 0.3102, + "step": 15641 + }, + { + "epoch": 1.9034986309704898, + "grad_norm": 3.6819612979888916, + "learning_rate": 1.1910627593303969e-07, + "loss": 0.3438, + "step": 15642 + }, + { + "epoch": 1.903620322482507, + "grad_norm": 1.8531838655471802, + "learning_rate": 1.1880628899895496e-07, + "loss": 0.4247, + "step": 15643 + }, + { + "epoch": 1.903742013994524, + "grad_norm": 2.0949432849884033, + "learning_rate": 1.1850667806474148e-07, + "loss": 0.3146, + "step": 15644 + }, + { + "epoch": 1.903863705506541, + "grad_norm": 1.786766529083252, + "learning_rate": 1.1820744314180455e-07, + "loss": 0.3664, + "step": 15645 + }, + { + "epoch": 1.903985397018558, + "grad_norm": 2.2288992404937744, + "learning_rate": 1.179085842415284e-07, + "loss": 0.3379, + "step": 15646 + }, + { + "epoch": 1.904107088530575, + "grad_norm": 1.7041383981704712, + "learning_rate": 1.1761010137528506e-07, + "loss": 0.3304, + "step": 15647 + }, + { + "epoch": 1.904228780042592, + "grad_norm": 3.200002670288086, + "learning_rate": 1.173119945544321e-07, + "loss": 0.3029, + "step": 15648 + }, + { + "epoch": 1.9043504715546091, + "grad_norm": 1.7898532152175903, + "learning_rate": 1.1701426379031377e-07, + "loss": 0.4038, + "step": 15649 + }, + { + "epoch": 1.9044721630666261, + "grad_norm": 3.10357403755188, + "learning_rate": 1.1671690909425992e-07, + "loss": 0.2895, + "step": 15650 + }, + { + "epoch": 1.9045938545786432, + "grad_norm": 1.7661055326461792, + "learning_rate": 1.1641993047758593e-07, + "loss": 0.3555, + "step": 15651 + }, + { + "epoch": 1.9047155460906602, + "grad_norm": 1.898592233657837, + "learning_rate": 1.1612332795158943e-07, + "loss": 0.3322, + "step": 15652 + }, + { + "epoch": 1.9048372376026772, + "grad_norm": 1.8657114505767822, + "learning_rate": 1.1582710152756027e-07, + "loss": 0.3779, + "step": 15653 + }, + { + "epoch": 1.9049589291146942, + "grad_norm": 1.4222372770309448, + "learning_rate": 1.1553125121676833e-07, + "loss": 0.3662, + "step": 15654 + }, + { + "epoch": 1.9050806206267112, + "grad_norm": 2.719543695449829, + "learning_rate": 1.1523577703047239e-07, + "loss": 0.3852, + "step": 15655 + }, + { + "epoch": 1.9052023121387283, + "grad_norm": 1.905992865562439, + "learning_rate": 1.1494067897991568e-07, + "loss": 0.4082, + "step": 15656 + }, + { + "epoch": 1.9053240036507453, + "grad_norm": 3.3642287254333496, + "learning_rate": 1.1464595707632697e-07, + "loss": 0.4085, + "step": 15657 + }, + { + "epoch": 1.9054456951627623, + "grad_norm": 4.392092704772949, + "learning_rate": 1.1435161133092065e-07, + "loss": 0.2837, + "step": 15658 + }, + { + "epoch": 1.9055673866747793, + "grad_norm": 2.469750165939331, + "learning_rate": 1.1405764175489887e-07, + "loss": 0.3988, + "step": 15659 + }, + { + "epoch": 1.9056890781867963, + "grad_norm": 1.797170639038086, + "learning_rate": 1.1376404835944488e-07, + "loss": 0.3257, + "step": 15660 + }, + { + "epoch": 1.9058107696988134, + "grad_norm": 1.7183449268341064, + "learning_rate": 1.134708311557331e-07, + "loss": 0.3756, + "step": 15661 + }, + { + "epoch": 1.9059324612108306, + "grad_norm": 1.552887201309204, + "learning_rate": 1.1317799015492014e-07, + "loss": 0.4094, + "step": 15662 + }, + { + "epoch": 1.9060541527228476, + "grad_norm": 3.746464967727661, + "learning_rate": 1.1288552536814934e-07, + "loss": 0.3351, + "step": 15663 + }, + { + "epoch": 1.9061758442348646, + "grad_norm": 2.0496416091918945, + "learning_rate": 1.1259343680654955e-07, + "loss": 0.2993, + "step": 15664 + }, + { + "epoch": 1.9062975357468817, + "grad_norm": 2.1597886085510254, + "learning_rate": 1.1230172448123522e-07, + "loss": 0.3792, + "step": 15665 + }, + { + "epoch": 1.9064192272588987, + "grad_norm": 2.3849825859069824, + "learning_rate": 1.1201038840330636e-07, + "loss": 0.3366, + "step": 15666 + }, + { + "epoch": 1.9065409187709157, + "grad_norm": 1.6846128702163696, + "learning_rate": 1.1171942858384966e-07, + "loss": 0.34, + "step": 15667 + }, + { + "epoch": 1.9066626102829327, + "grad_norm": 1.7930631637573242, + "learning_rate": 1.1142884503393625e-07, + "loss": 0.3384, + "step": 15668 + }, + { + "epoch": 1.90678430179495, + "grad_norm": 2.7511916160583496, + "learning_rate": 1.1113863776462286e-07, + "loss": 0.452, + "step": 15669 + }, + { + "epoch": 1.906905993306967, + "grad_norm": 1.9824519157409668, + "learning_rate": 1.1084880678695397e-07, + "loss": 0.3485, + "step": 15670 + }, + { + "epoch": 1.907027684818984, + "grad_norm": 1.8829624652862549, + "learning_rate": 1.1055935211195745e-07, + "loss": 0.3514, + "step": 15671 + }, + { + "epoch": 1.907149376331001, + "grad_norm": 1.7604342699050903, + "learning_rate": 1.1027027375064669e-07, + "loss": 0.3201, + "step": 15672 + }, + { + "epoch": 1.907271067843018, + "grad_norm": 1.853173851966858, + "learning_rate": 1.0998157171402402e-07, + "loss": 0.3277, + "step": 15673 + }, + { + "epoch": 1.907392759355035, + "grad_norm": 1.6976581811904907, + "learning_rate": 1.0969324601307174e-07, + "loss": 0.3902, + "step": 15674 + }, + { + "epoch": 1.907514450867052, + "grad_norm": 1.4061689376831055, + "learning_rate": 1.0940529665876554e-07, + "loss": 0.319, + "step": 15675 + }, + { + "epoch": 1.907636142379069, + "grad_norm": 1.9750257730484009, + "learning_rate": 1.0911772366205886e-07, + "loss": 0.3862, + "step": 15676 + }, + { + "epoch": 1.907757833891086, + "grad_norm": 2.3953728675842285, + "learning_rate": 1.0883052703389518e-07, + "loss": 0.3656, + "step": 15677 + }, + { + "epoch": 1.9078795254031031, + "grad_norm": 1.772693395614624, + "learning_rate": 1.0854370678520465e-07, + "loss": 0.3592, + "step": 15678 + }, + { + "epoch": 1.9080012169151201, + "grad_norm": 2.816084861755371, + "learning_rate": 1.0825726292690075e-07, + "loss": 0.3902, + "step": 15679 + }, + { + "epoch": 1.9081229084271372, + "grad_norm": 1.6737862825393677, + "learning_rate": 1.0797119546988255e-07, + "loss": 0.325, + "step": 15680 + }, + { + "epoch": 1.9082445999391542, + "grad_norm": 2.518547296524048, + "learning_rate": 1.0768550442503466e-07, + "loss": 0.3202, + "step": 15681 + }, + { + "epoch": 1.9083662914511712, + "grad_norm": 1.8919771909713745, + "learning_rate": 1.0740018980322953e-07, + "loss": 0.3589, + "step": 15682 + }, + { + "epoch": 1.9084879829631882, + "grad_norm": 1.4554303884506226, + "learning_rate": 1.071152516153251e-07, + "loss": 0.3503, + "step": 15683 + }, + { + "epoch": 1.9086096744752052, + "grad_norm": 1.468931794166565, + "learning_rate": 1.068306898721616e-07, + "loss": 0.3455, + "step": 15684 + }, + { + "epoch": 1.9087313659872223, + "grad_norm": 1.76096773147583, + "learning_rate": 1.0654650458456705e-07, + "loss": 0.3422, + "step": 15685 + }, + { + "epoch": 1.9088530574992393, + "grad_norm": 1.7063286304473877, + "learning_rate": 1.0626269576335723e-07, + "loss": 0.3293, + "step": 15686 + }, + { + "epoch": 1.9089747490112563, + "grad_norm": 1.6289657354354858, + "learning_rate": 1.059792634193313e-07, + "loss": 0.3133, + "step": 15687 + }, + { + "epoch": 1.9090964405232735, + "grad_norm": 1.7014787197113037, + "learning_rate": 1.0569620756327281e-07, + "loss": 0.4098, + "step": 15688 + }, + { + "epoch": 1.9092181320352906, + "grad_norm": 2.8411800861358643, + "learning_rate": 1.054135282059543e-07, + "loss": 0.403, + "step": 15689 + }, + { + "epoch": 1.9093398235473076, + "grad_norm": 2.6975014209747314, + "learning_rate": 1.0513122535813158e-07, + "loss": 0.464, + "step": 15690 + }, + { + "epoch": 1.9094615150593246, + "grad_norm": 1.574360728263855, + "learning_rate": 1.048492990305483e-07, + "loss": 0.3582, + "step": 15691 + }, + { + "epoch": 1.9095832065713416, + "grad_norm": 1.4731158018112183, + "learning_rate": 1.0456774923392921e-07, + "loss": 0.3162, + "step": 15692 + }, + { + "epoch": 1.9097048980833586, + "grad_norm": 1.5337339639663696, + "learning_rate": 1.0428657597899016e-07, + "loss": 0.3401, + "step": 15693 + }, + { + "epoch": 1.9098265895953759, + "grad_norm": 1.8068760633468628, + "learning_rate": 1.0400577927643041e-07, + "loss": 0.3964, + "step": 15694 + }, + { + "epoch": 1.909948281107393, + "grad_norm": 2.491594076156616, + "learning_rate": 1.037253591369336e-07, + "loss": 0.3843, + "step": 15695 + }, + { + "epoch": 1.91006997261941, + "grad_norm": 1.5058473348617554, + "learning_rate": 1.034453155711712e-07, + "loss": 0.3571, + "step": 15696 + }, + { + "epoch": 1.910191664131427, + "grad_norm": 2.8071610927581787, + "learning_rate": 1.0316564858980027e-07, + "loss": 0.3699, + "step": 15697 + }, + { + "epoch": 1.910313355643444, + "grad_norm": 1.8608394861221313, + "learning_rate": 1.0288635820346004e-07, + "loss": 0.3273, + "step": 15698 + }, + { + "epoch": 1.910435047155461, + "grad_norm": 2.825749635696411, + "learning_rate": 1.0260744442278093e-07, + "loss": 0.4064, + "step": 15699 + }, + { + "epoch": 1.910556738667478, + "grad_norm": 2.0415709018707275, + "learning_rate": 1.0232890725837441e-07, + "loss": 0.3803, + "step": 15700 + }, + { + "epoch": 1.910678430179495, + "grad_norm": 1.860910177230835, + "learning_rate": 1.0205074672084092e-07, + "loss": 0.4217, + "step": 15701 + }, + { + "epoch": 1.910800121691512, + "grad_norm": 1.3647487163543701, + "learning_rate": 1.017729628207631e-07, + "loss": 0.3409, + "step": 15702 + }, + { + "epoch": 1.910921813203529, + "grad_norm": 2.0112497806549072, + "learning_rate": 1.014955555687136e-07, + "loss": 0.4113, + "step": 15703 + }, + { + "epoch": 1.911043504715546, + "grad_norm": 2.5934998989105225, + "learning_rate": 1.012185249752462e-07, + "loss": 0.4251, + "step": 15704 + }, + { + "epoch": 1.911165196227563, + "grad_norm": 1.5455267429351807, + "learning_rate": 1.0094187105090358e-07, + "loss": 0.3572, + "step": 15705 + }, + { + "epoch": 1.91128688773958, + "grad_norm": 2.318258285522461, + "learning_rate": 1.0066559380621177e-07, + "loss": 0.4247, + "step": 15706 + }, + { + "epoch": 1.9114085792515971, + "grad_norm": 1.9367578029632568, + "learning_rate": 1.0038969325168569e-07, + "loss": 0.3123, + "step": 15707 + }, + { + "epoch": 1.9115302707636141, + "grad_norm": 3.868952512741089, + "learning_rate": 1.0011416939782248e-07, + "loss": 0.4518, + "step": 15708 + }, + { + "epoch": 1.9116519622756312, + "grad_norm": 1.7361475229263306, + "learning_rate": 9.9839022255106e-08, + "loss": 0.3311, + "step": 15709 + }, + { + "epoch": 1.9117736537876482, + "grad_norm": 1.4297748804092407, + "learning_rate": 9.956425183400786e-08, + "loss": 0.3352, + "step": 15710 + }, + { + "epoch": 1.9118953452996652, + "grad_norm": 1.944142460823059, + "learning_rate": 9.928985814498193e-08, + "loss": 0.3698, + "step": 15711 + }, + { + "epoch": 1.9120170368116822, + "grad_norm": 1.6730833053588867, + "learning_rate": 9.901584119847096e-08, + "loss": 0.365, + "step": 15712 + }, + { + "epoch": 1.9121387283236995, + "grad_norm": 2.269139289855957, + "learning_rate": 9.874220100490106e-08, + "loss": 0.3638, + "step": 15713 + }, + { + "epoch": 1.9122604198357165, + "grad_norm": 1.6421120166778564, + "learning_rate": 9.8468937574685e-08, + "loss": 0.4002, + "step": 15714 + }, + { + "epoch": 1.9123821113477335, + "grad_norm": 1.5852395296096802, + "learning_rate": 9.819605091822004e-08, + "loss": 0.3799, + "step": 15715 + }, + { + "epoch": 1.9125038028597505, + "grad_norm": 2.432729721069336, + "learning_rate": 9.79235410458923e-08, + "loss": 0.3478, + "step": 15716 + }, + { + "epoch": 1.9126254943717675, + "grad_norm": 1.454453945159912, + "learning_rate": 9.765140796806904e-08, + "loss": 0.3429, + "step": 15717 + }, + { + "epoch": 1.9127471858837846, + "grad_norm": 2.1835670471191406, + "learning_rate": 9.737965169510644e-08, + "loss": 0.3391, + "step": 15718 + }, + { + "epoch": 1.9128688773958018, + "grad_norm": 1.790948748588562, + "learning_rate": 9.710827223734621e-08, + "loss": 0.3522, + "step": 15719 + }, + { + "epoch": 1.9129905689078188, + "grad_norm": 3.2709999084472656, + "learning_rate": 9.683726960511231e-08, + "loss": 0.3517, + "step": 15720 + }, + { + "epoch": 1.9131122604198358, + "grad_norm": 1.9048463106155396, + "learning_rate": 9.656664380872094e-08, + "loss": 0.3838, + "step": 15721 + }, + { + "epoch": 1.9132339519318529, + "grad_norm": 1.8403513431549072, + "learning_rate": 9.62963948584661e-08, + "loss": 0.3448, + "step": 15722 + }, + { + "epoch": 1.9133556434438699, + "grad_norm": 1.816890835762024, + "learning_rate": 9.602652276463398e-08, + "loss": 0.4082, + "step": 15723 + }, + { + "epoch": 1.913477334955887, + "grad_norm": 1.6230320930480957, + "learning_rate": 9.575702753749194e-08, + "loss": 0.3814, + "step": 15724 + }, + { + "epoch": 1.913599026467904, + "grad_norm": 1.7542905807495117, + "learning_rate": 9.548790918729622e-08, + "loss": 0.3474, + "step": 15725 + }, + { + "epoch": 1.913720717979921, + "grad_norm": 2.545095205307007, + "learning_rate": 9.521916772428752e-08, + "loss": 0.4076, + "step": 15726 + }, + { + "epoch": 1.913842409491938, + "grad_norm": 1.2396000623703003, + "learning_rate": 9.495080315868988e-08, + "loss": 0.3157, + "step": 15727 + }, + { + "epoch": 1.913964101003955, + "grad_norm": 1.5171116590499878, + "learning_rate": 9.468281550071734e-08, + "loss": 0.3628, + "step": 15728 + }, + { + "epoch": 1.914085792515972, + "grad_norm": 2.3970634937286377, + "learning_rate": 9.441520476056621e-08, + "loss": 0.3573, + "step": 15729 + }, + { + "epoch": 1.914207484027989, + "grad_norm": 2.6460959911346436, + "learning_rate": 9.414797094842054e-08, + "loss": 0.4904, + "step": 15730 + }, + { + "epoch": 1.914329175540006, + "grad_norm": 1.9427192211151123, + "learning_rate": 9.388111407444666e-08, + "loss": 0.3427, + "step": 15731 + }, + { + "epoch": 1.914450867052023, + "grad_norm": 1.7334887981414795, + "learning_rate": 9.361463414880311e-08, + "loss": 0.4313, + "step": 15732 + }, + { + "epoch": 1.91457255856404, + "grad_norm": 2.8944931030273438, + "learning_rate": 9.334853118162735e-08, + "loss": 0.3892, + "step": 15733 + }, + { + "epoch": 1.914694250076057, + "grad_norm": 2.0976483821868896, + "learning_rate": 9.308280518304458e-08, + "loss": 0.3385, + "step": 15734 + }, + { + "epoch": 1.914815941588074, + "grad_norm": 1.70675528049469, + "learning_rate": 9.281745616316784e-08, + "loss": 0.4182, + "step": 15735 + }, + { + "epoch": 1.9149376331000911, + "grad_norm": 1.8453956842422485, + "learning_rate": 9.25524841320946e-08, + "loss": 0.3672, + "step": 15736 + }, + { + "epoch": 1.9150593246121081, + "grad_norm": 1.8507795333862305, + "learning_rate": 9.22878890999046e-08, + "loss": 0.3733, + "step": 15737 + }, + { + "epoch": 1.9151810161241254, + "grad_norm": 2.1185927391052246, + "learning_rate": 9.202367107667087e-08, + "loss": 0.3521, + "step": 15738 + }, + { + "epoch": 1.9153027076361424, + "grad_norm": 1.6565144062042236, + "learning_rate": 9.175983007244316e-08, + "loss": 0.3779, + "step": 15739 + }, + { + "epoch": 1.9154243991481594, + "grad_norm": 2.4728691577911377, + "learning_rate": 9.149636609726231e-08, + "loss": 0.3876, + "step": 15740 + }, + { + "epoch": 1.9155460906601764, + "grad_norm": 1.765580415725708, + "learning_rate": 9.123327916115588e-08, + "loss": 0.3668, + "step": 15741 + }, + { + "epoch": 1.9156677821721935, + "grad_norm": 1.8593168258666992, + "learning_rate": 9.097056927413139e-08, + "loss": 0.3295, + "step": 15742 + }, + { + "epoch": 1.9157894736842105, + "grad_norm": 1.524196982383728, + "learning_rate": 9.070823644618865e-08, + "loss": 0.3489, + "step": 15743 + }, + { + "epoch": 1.9159111651962277, + "grad_norm": 2.2295501232147217, + "learning_rate": 9.044628068730854e-08, + "loss": 0.4168, + "step": 15744 + }, + { + "epoch": 1.9160328567082447, + "grad_norm": 2.052743911743164, + "learning_rate": 9.018470200745866e-08, + "loss": 0.3469, + "step": 15745 + }, + { + "epoch": 1.9161545482202618, + "grad_norm": 2.7995383739471436, + "learning_rate": 8.992350041659325e-08, + "loss": 0.2683, + "step": 15746 + }, + { + "epoch": 1.9162762397322788, + "grad_norm": 1.984447717666626, + "learning_rate": 8.966267592465105e-08, + "loss": 0.4216, + "step": 15747 + }, + { + "epoch": 1.9163979312442958, + "grad_norm": 1.9310307502746582, + "learning_rate": 8.940222854155745e-08, + "loss": 0.4126, + "step": 15748 + }, + { + "epoch": 1.9165196227563128, + "grad_norm": 2.677833318710327, + "learning_rate": 8.914215827722338e-08, + "loss": 0.4041, + "step": 15749 + }, + { + "epoch": 1.9166413142683298, + "grad_norm": 1.8265058994293213, + "learning_rate": 8.888246514154431e-08, + "loss": 0.3288, + "step": 15750 + }, + { + "epoch": 1.9167630057803469, + "grad_norm": 1.5710548162460327, + "learning_rate": 8.862314914440229e-08, + "loss": 0.3424, + "step": 15751 + }, + { + "epoch": 1.9168846972923639, + "grad_norm": 2.239837408065796, + "learning_rate": 8.836421029566388e-08, + "loss": 0.3345, + "step": 15752 + }, + { + "epoch": 1.917006388804381, + "grad_norm": 1.3584866523742676, + "learning_rate": 8.810564860518456e-08, + "loss": 0.3143, + "step": 15753 + }, + { + "epoch": 1.917128080316398, + "grad_norm": 2.688485860824585, + "learning_rate": 8.784746408280087e-08, + "loss": 0.4219, + "step": 15754 + }, + { + "epoch": 1.917249771828415, + "grad_norm": 1.5568344593048096, + "learning_rate": 8.758965673833942e-08, + "loss": 0.3493, + "step": 15755 + }, + { + "epoch": 1.917371463340432, + "grad_norm": 1.7253013849258423, + "learning_rate": 8.73322265816079e-08, + "loss": 0.3759, + "step": 15756 + }, + { + "epoch": 1.917493154852449, + "grad_norm": 1.5808874368667603, + "learning_rate": 8.707517362240292e-08, + "loss": 0.3862, + "step": 15757 + }, + { + "epoch": 1.917614846364466, + "grad_norm": 1.8901225328445435, + "learning_rate": 8.681849787050667e-08, + "loss": 0.3183, + "step": 15758 + }, + { + "epoch": 1.917736537876483, + "grad_norm": 3.3234410285949707, + "learning_rate": 8.656219933568688e-08, + "loss": 0.3031, + "step": 15759 + }, + { + "epoch": 1.9178582293885, + "grad_norm": 2.0144050121307373, + "learning_rate": 8.630627802769465e-08, + "loss": 0.327, + "step": 15760 + }, + { + "epoch": 1.917979920900517, + "grad_norm": 1.5334062576293945, + "learning_rate": 8.605073395626772e-08, + "loss": 0.3797, + "step": 15761 + }, + { + "epoch": 1.918101612412534, + "grad_norm": 1.4984906911849976, + "learning_rate": 8.579556713113169e-08, + "loss": 0.3441, + "step": 15762 + }, + { + "epoch": 1.9182233039245513, + "grad_norm": 1.9602434635162354, + "learning_rate": 8.554077756199652e-08, + "loss": 0.3244, + "step": 15763 + }, + { + "epoch": 1.9183449954365683, + "grad_norm": 2.1524736881256104, + "learning_rate": 8.52863652585556e-08, + "loss": 0.4062, + "step": 15764 + }, + { + "epoch": 1.9184666869485854, + "grad_norm": 1.5244736671447754, + "learning_rate": 8.503233023049006e-08, + "loss": 0.3846, + "step": 15765 + }, + { + "epoch": 1.9185883784606024, + "grad_norm": 1.5238925218582153, + "learning_rate": 8.477867248746773e-08, + "loss": 0.3626, + "step": 15766 + }, + { + "epoch": 1.9187100699726194, + "grad_norm": 1.6026420593261719, + "learning_rate": 8.452539203914089e-08, + "loss": 0.3641, + "step": 15767 + }, + { + "epoch": 1.9188317614846364, + "grad_norm": 2.1201412677764893, + "learning_rate": 8.427248889514738e-08, + "loss": 0.3758, + "step": 15768 + }, + { + "epoch": 1.9189534529966534, + "grad_norm": 3.2570815086364746, + "learning_rate": 8.40199630651084e-08, + "loss": 0.312, + "step": 15769 + }, + { + "epoch": 1.9190751445086707, + "grad_norm": 3.0580813884735107, + "learning_rate": 8.376781455863625e-08, + "loss": 0.4261, + "step": 15770 + }, + { + "epoch": 1.9191968360206877, + "grad_norm": 2.4813880920410156, + "learning_rate": 8.351604338532327e-08, + "loss": 0.3485, + "step": 15771 + }, + { + "epoch": 1.9193185275327047, + "grad_norm": 2.4027113914489746, + "learning_rate": 8.326464955475177e-08, + "loss": 0.4122, + "step": 15772 + }, + { + "epoch": 1.9194402190447217, + "grad_norm": 1.8295562267303467, + "learning_rate": 8.301363307648636e-08, + "loss": 0.3475, + "step": 15773 + }, + { + "epoch": 1.9195619105567387, + "grad_norm": 2.1208078861236572, + "learning_rate": 8.276299396007825e-08, + "loss": 0.41, + "step": 15774 + }, + { + "epoch": 1.9196836020687558, + "grad_norm": 2.149620771408081, + "learning_rate": 8.251273221506762e-08, + "loss": 0.3311, + "step": 15775 + }, + { + "epoch": 1.9198052935807728, + "grad_norm": 2.189979076385498, + "learning_rate": 8.22628478509746e-08, + "loss": 0.3161, + "step": 15776 + }, + { + "epoch": 1.9199269850927898, + "grad_norm": 1.9679685831069946, + "learning_rate": 8.201334087730828e-08, + "loss": 0.321, + "step": 15777 + }, + { + "epoch": 1.9200486766048068, + "grad_norm": 2.358872413635254, + "learning_rate": 8.176421130356438e-08, + "loss": 0.4111, + "step": 15778 + }, + { + "epoch": 1.9201703681168238, + "grad_norm": 1.8389935493469238, + "learning_rate": 8.151545913922198e-08, + "loss": 0.4004, + "step": 15779 + }, + { + "epoch": 1.9202920596288409, + "grad_norm": 2.633004903793335, + "learning_rate": 8.126708439374464e-08, + "loss": 0.4396, + "step": 15780 + }, + { + "epoch": 1.9204137511408579, + "grad_norm": 1.9778844118118286, + "learning_rate": 8.101908707658702e-08, + "loss": 0.3944, + "step": 15781 + }, + { + "epoch": 1.920535442652875, + "grad_norm": 1.9150971174240112, + "learning_rate": 8.077146719718487e-08, + "loss": 0.3514, + "step": 15782 + }, + { + "epoch": 1.920657134164892, + "grad_norm": 1.8329989910125732, + "learning_rate": 8.052422476495846e-08, + "loss": 0.3841, + "step": 15783 + }, + { + "epoch": 1.920778825676909, + "grad_norm": 1.71555757522583, + "learning_rate": 8.027735978931917e-08, + "loss": 0.339, + "step": 15784 + }, + { + "epoch": 1.920900517188926, + "grad_norm": 1.8899199962615967, + "learning_rate": 8.003087227965723e-08, + "loss": 0.293, + "step": 15785 + }, + { + "epoch": 1.921022208700943, + "grad_norm": 1.4862643480300903, + "learning_rate": 7.978476224535514e-08, + "loss": 0.3697, + "step": 15786 + }, + { + "epoch": 1.92114390021296, + "grad_norm": 2.145812511444092, + "learning_rate": 7.953902969577653e-08, + "loss": 0.3748, + "step": 15787 + }, + { + "epoch": 1.921265591724977, + "grad_norm": 1.9779895544052124, + "learning_rate": 7.92936746402717e-08, + "loss": 0.3314, + "step": 15788 + }, + { + "epoch": 1.9213872832369943, + "grad_norm": 2.167039155960083, + "learning_rate": 7.904869708817653e-08, + "loss": 0.3366, + "step": 15789 + }, + { + "epoch": 1.9215089747490113, + "grad_norm": 2.6096041202545166, + "learning_rate": 7.880409704881464e-08, + "loss": 0.3251, + "step": 15790 + }, + { + "epoch": 1.9216306662610283, + "grad_norm": 1.6183769702911377, + "learning_rate": 7.855987453149195e-08, + "loss": 0.406, + "step": 15791 + }, + { + "epoch": 1.9217523577730453, + "grad_norm": 1.6336644887924194, + "learning_rate": 7.831602954550322e-08, + "loss": 0.3797, + "step": 15792 + }, + { + "epoch": 1.9218740492850623, + "grad_norm": 2.0201914310455322, + "learning_rate": 7.807256210012659e-08, + "loss": 0.3829, + "step": 15793 + }, + { + "epoch": 1.9219957407970794, + "grad_norm": 2.7254021167755127, + "learning_rate": 7.782947220462466e-08, + "loss": 0.4408, + "step": 15794 + }, + { + "epoch": 1.9221174323090966, + "grad_norm": 2.432457208633423, + "learning_rate": 7.758675986825115e-08, + "loss": 0.4338, + "step": 15795 + }, + { + "epoch": 1.9222391238211136, + "grad_norm": 2.04353666305542, + "learning_rate": 7.734442510023754e-08, + "loss": 0.339, + "step": 15796 + }, + { + "epoch": 1.9223608153331306, + "grad_norm": 1.6202785968780518, + "learning_rate": 7.710246790980869e-08, + "loss": 0.3562, + "step": 15797 + }, + { + "epoch": 1.9224825068451477, + "grad_norm": 2.105527400970459, + "learning_rate": 7.686088830616945e-08, + "loss": 0.3797, + "step": 15798 + }, + { + "epoch": 1.9226041983571647, + "grad_norm": 1.8749754428863525, + "learning_rate": 7.661968629851357e-08, + "loss": 0.3421, + "step": 15799 + }, + { + "epoch": 1.9227258898691817, + "grad_norm": 2.8890326023101807, + "learning_rate": 7.637886189601817e-08, + "loss": 0.2965, + "step": 15800 + }, + { + "epoch": 1.9228475813811987, + "grad_norm": 1.6345950365066528, + "learning_rate": 7.613841510784925e-08, + "loss": 0.3284, + "step": 15801 + }, + { + "epoch": 1.9229692728932157, + "grad_norm": 2.3004817962646484, + "learning_rate": 7.589834594315393e-08, + "loss": 0.3615, + "step": 15802 + }, + { + "epoch": 1.9230909644052328, + "grad_norm": 2.3893001079559326, + "learning_rate": 7.565865441106823e-08, + "loss": 0.3655, + "step": 15803 + }, + { + "epoch": 1.9232126559172498, + "grad_norm": 2.915879249572754, + "learning_rate": 7.541934052071375e-08, + "loss": 0.4081, + "step": 15804 + }, + { + "epoch": 1.9233343474292668, + "grad_norm": 3.374998092651367, + "learning_rate": 7.518040428119433e-08, + "loss": 0.3004, + "step": 15805 + }, + { + "epoch": 1.9234560389412838, + "grad_norm": 1.637037992477417, + "learning_rate": 7.494184570160601e-08, + "loss": 0.338, + "step": 15806 + }, + { + "epoch": 1.9235777304533008, + "grad_norm": 1.5632528066635132, + "learning_rate": 7.470366479102265e-08, + "loss": 0.3495, + "step": 15807 + }, + { + "epoch": 1.9236994219653178, + "grad_norm": 4.725611209869385, + "learning_rate": 7.446586155851032e-08, + "loss": 0.4816, + "step": 15808 + }, + { + "epoch": 1.9238211134773349, + "grad_norm": 2.030162811279297, + "learning_rate": 7.422843601311624e-08, + "loss": 0.4317, + "step": 15809 + }, + { + "epoch": 1.9239428049893519, + "grad_norm": 1.5780426263809204, + "learning_rate": 7.39913881638743e-08, + "loss": 0.3599, + "step": 15810 + }, + { + "epoch": 1.924064496501369, + "grad_norm": 1.7926127910614014, + "learning_rate": 7.375471801980727e-08, + "loss": 0.3422, + "step": 15811 + }, + { + "epoch": 1.924186188013386, + "grad_norm": 1.820106029510498, + "learning_rate": 7.351842558991906e-08, + "loss": 0.3676, + "step": 15812 + }, + { + "epoch": 1.924307879525403, + "grad_norm": 1.8103939294815063, + "learning_rate": 7.328251088320027e-08, + "loss": 0.393, + "step": 15813 + }, + { + "epoch": 1.9244295710374202, + "grad_norm": 2.31059193611145, + "learning_rate": 7.304697390863036e-08, + "loss": 0.3972, + "step": 15814 + }, + { + "epoch": 1.9245512625494372, + "grad_norm": 1.330661654472351, + "learning_rate": 7.281181467517106e-08, + "loss": 0.2745, + "step": 15815 + }, + { + "epoch": 1.9246729540614542, + "grad_norm": 2.3426334857940674, + "learning_rate": 7.257703319176968e-08, + "loss": 0.3793, + "step": 15816 + }, + { + "epoch": 1.9247946455734712, + "grad_norm": 1.8138560056686401, + "learning_rate": 7.234262946736015e-08, + "loss": 0.424, + "step": 15817 + }, + { + "epoch": 1.9249163370854883, + "grad_norm": 1.8721749782562256, + "learning_rate": 7.210860351086423e-08, + "loss": 0.3555, + "step": 15818 + }, + { + "epoch": 1.9250380285975053, + "grad_norm": 2.384075403213501, + "learning_rate": 7.187495533118482e-08, + "loss": 0.3465, + "step": 15819 + }, + { + "epoch": 1.9251597201095225, + "grad_norm": 1.7856847047805786, + "learning_rate": 7.164168493721368e-08, + "loss": 0.3739, + "step": 15820 + }, + { + "epoch": 1.9252814116215395, + "grad_norm": 2.844778299331665, + "learning_rate": 7.140879233782594e-08, + "loss": 0.3887, + "step": 15821 + }, + { + "epoch": 1.9254031031335566, + "grad_norm": 1.6135432720184326, + "learning_rate": 7.117627754188561e-08, + "loss": 0.3408, + "step": 15822 + }, + { + "epoch": 1.9255247946455736, + "grad_norm": 1.8190762996673584, + "learning_rate": 7.094414055824006e-08, + "loss": 0.3555, + "step": 15823 + }, + { + "epoch": 1.9256464861575906, + "grad_norm": 2.879199504852295, + "learning_rate": 7.071238139572001e-08, + "loss": 0.4137, + "step": 15824 + }, + { + "epoch": 1.9257681776696076, + "grad_norm": 2.9349617958068848, + "learning_rate": 7.048100006314839e-08, + "loss": 0.4067, + "step": 15825 + }, + { + "epoch": 1.9258898691816246, + "grad_norm": 1.6504396200180054, + "learning_rate": 7.024999656932597e-08, + "loss": 0.3466, + "step": 15826 + }, + { + "epoch": 1.9260115606936417, + "grad_norm": 2.51949143409729, + "learning_rate": 7.001937092304568e-08, + "loss": 0.4156, + "step": 15827 + }, + { + "epoch": 1.9261332522056587, + "grad_norm": 1.7395340204238892, + "learning_rate": 6.978912313308272e-08, + "loss": 0.3635, + "step": 15828 + }, + { + "epoch": 1.9262549437176757, + "grad_norm": 2.0017318725585938, + "learning_rate": 6.955925320819678e-08, + "loss": 0.391, + "step": 15829 + }, + { + "epoch": 1.9263766352296927, + "grad_norm": 2.966348886489868, + "learning_rate": 6.932976115713752e-08, + "loss": 0.321, + "step": 15830 + }, + { + "epoch": 1.9264983267417097, + "grad_norm": 1.6849247217178345, + "learning_rate": 6.910064698863572e-08, + "loss": 0.3536, + "step": 15831 + }, + { + "epoch": 1.9266200182537268, + "grad_norm": 2.099560260772705, + "learning_rate": 6.887191071140886e-08, + "loss": 0.3786, + "step": 15832 + }, + { + "epoch": 1.9267417097657438, + "grad_norm": 2.352947235107422, + "learning_rate": 6.86435523341633e-08, + "loss": 0.3312, + "step": 15833 + }, + { + "epoch": 1.9268634012777608, + "grad_norm": 1.2084604501724243, + "learning_rate": 6.841557186558767e-08, + "loss": 0.3361, + "step": 15834 + }, + { + "epoch": 1.9269850927897778, + "grad_norm": 1.5980653762817383, + "learning_rate": 6.818796931435612e-08, + "loss": 0.3373, + "step": 15835 + }, + { + "epoch": 1.9271067843017948, + "grad_norm": 1.4440498352050781, + "learning_rate": 6.796074468913061e-08, + "loss": 0.3445, + "step": 15836 + }, + { + "epoch": 1.9272284758138118, + "grad_norm": 1.4435983896255493, + "learning_rate": 6.773389799855534e-08, + "loss": 0.3818, + "step": 15837 + }, + { + "epoch": 1.9273501673258289, + "grad_norm": 2.4774882793426514, + "learning_rate": 6.750742925126563e-08, + "loss": 0.3883, + "step": 15838 + }, + { + "epoch": 1.927471858837846, + "grad_norm": 1.8974932432174683, + "learning_rate": 6.728133845587681e-08, + "loss": 0.415, + "step": 15839 + }, + { + "epoch": 1.9275935503498631, + "grad_norm": 2.4979612827301025, + "learning_rate": 6.70556256209931e-08, + "loss": 0.4277, + "step": 15840 + }, + { + "epoch": 1.9277152418618801, + "grad_norm": 2.7445714473724365, + "learning_rate": 6.68302907552032e-08, + "loss": 0.4246, + "step": 15841 + }, + { + "epoch": 1.9278369333738972, + "grad_norm": 1.7106176614761353, + "learning_rate": 6.660533386708023e-08, + "loss": 0.4145, + "step": 15842 + }, + { + "epoch": 1.9279586248859142, + "grad_norm": 1.5466960668563843, + "learning_rate": 6.638075496518515e-08, + "loss": 0.3284, + "step": 15843 + }, + { + "epoch": 1.9280803163979312, + "grad_norm": 3.4106321334838867, + "learning_rate": 6.615655405806442e-08, + "loss": 0.3081, + "step": 15844 + }, + { + "epoch": 1.9282020079099484, + "grad_norm": 1.6802036762237549, + "learning_rate": 6.593273115424903e-08, + "loss": 0.3772, + "step": 15845 + }, + { + "epoch": 1.9283236994219655, + "grad_norm": 1.694337010383606, + "learning_rate": 6.570928626225548e-08, + "loss": 0.3815, + "step": 15846 + }, + { + "epoch": 1.9284453909339825, + "grad_norm": 2.3030476570129395, + "learning_rate": 6.548621939058585e-08, + "loss": 0.4042, + "step": 15847 + }, + { + "epoch": 1.9285670824459995, + "grad_norm": 2.331472158432007, + "learning_rate": 6.526353054772894e-08, + "loss": 0.4184, + "step": 15848 + }, + { + "epoch": 1.9286887739580165, + "grad_norm": 2.278980016708374, + "learning_rate": 6.504121974215904e-08, + "loss": 0.4, + "step": 15849 + }, + { + "epoch": 1.9288104654700335, + "grad_norm": 1.4729667901992798, + "learning_rate": 6.481928698233497e-08, + "loss": 0.3584, + "step": 15850 + }, + { + "epoch": 1.9289321569820506, + "grad_norm": 3.1923413276672363, + "learning_rate": 6.459773227670107e-08, + "loss": 0.4412, + "step": 15851 + }, + { + "epoch": 1.9290538484940676, + "grad_norm": 2.1268229484558105, + "learning_rate": 6.437655563368838e-08, + "loss": 0.3147, + "step": 15852 + }, + { + "epoch": 1.9291755400060846, + "grad_norm": 1.7822880744934082, + "learning_rate": 6.415575706171462e-08, + "loss": 0.3684, + "step": 15853 + }, + { + "epoch": 1.9292972315181016, + "grad_norm": 1.8047879934310913, + "learning_rate": 6.393533656917972e-08, + "loss": 0.3204, + "step": 15854 + }, + { + "epoch": 1.9294189230301186, + "grad_norm": 1.5757339000701904, + "learning_rate": 6.371529416447254e-08, + "loss": 0.3826, + "step": 15855 + }, + { + "epoch": 1.9295406145421357, + "grad_norm": 1.5452866554260254, + "learning_rate": 6.349562985596413e-08, + "loss": 0.3321, + "step": 15856 + }, + { + "epoch": 1.9296623060541527, + "grad_norm": 2.2325963973999023, + "learning_rate": 6.327634365201452e-08, + "loss": 0.4216, + "step": 15857 + }, + { + "epoch": 1.9297839975661697, + "grad_norm": 1.651720404624939, + "learning_rate": 6.305743556096922e-08, + "loss": 0.2962, + "step": 15858 + }, + { + "epoch": 1.9299056890781867, + "grad_norm": 1.6414620876312256, + "learning_rate": 6.28389055911549e-08, + "loss": 0.4104, + "step": 15859 + }, + { + "epoch": 1.9300273805902037, + "grad_norm": 2.2379367351531982, + "learning_rate": 6.262075375089049e-08, + "loss": 0.3987, + "step": 15860 + }, + { + "epoch": 1.9301490721022208, + "grad_norm": 1.4453548192977905, + "learning_rate": 6.240298004847489e-08, + "loss": 0.3214, + "step": 15861 + }, + { + "epoch": 1.9302707636142378, + "grad_norm": 1.6983914375305176, + "learning_rate": 6.218558449219591e-08, + "loss": 0.354, + "step": 15862 + }, + { + "epoch": 1.9303924551262548, + "grad_norm": 1.6801940202713013, + "learning_rate": 6.196856709032584e-08, + "loss": 0.3805, + "step": 15863 + }, + { + "epoch": 1.930514146638272, + "grad_norm": 2.0834500789642334, + "learning_rate": 6.17519278511225e-08, + "loss": 0.4061, + "step": 15864 + }, + { + "epoch": 1.930635838150289, + "grad_norm": 2.9435620307922363, + "learning_rate": 6.15356667828293e-08, + "loss": 0.4177, + "step": 15865 + }, + { + "epoch": 1.930757529662306, + "grad_norm": 2.922804117202759, + "learning_rate": 6.131978389367522e-08, + "loss": 0.2928, + "step": 15866 + }, + { + "epoch": 1.930879221174323, + "grad_norm": 2.215226888656616, + "learning_rate": 6.110427919187478e-08, + "loss": 0.3764, + "step": 15867 + }, + { + "epoch": 1.93100091268634, + "grad_norm": 2.2295496463775635, + "learning_rate": 6.088915268562922e-08, + "loss": 0.3259, + "step": 15868 + }, + { + "epoch": 1.9311226041983571, + "grad_norm": 3.030305862426758, + "learning_rate": 6.067440438312532e-08, + "loss": 0.436, + "step": 15869 + }, + { + "epoch": 1.9312442957103741, + "grad_norm": 2.6606290340423584, + "learning_rate": 6.046003429253211e-08, + "loss": 0.2812, + "step": 15870 + }, + { + "epoch": 1.9313659872223914, + "grad_norm": 1.3542592525482178, + "learning_rate": 6.02460424220086e-08, + "loss": 0.3399, + "step": 15871 + }, + { + "epoch": 1.9314876787344084, + "grad_norm": 1.776929497718811, + "learning_rate": 6.003242877969828e-08, + "loss": 0.3546, + "step": 15872 + }, + { + "epoch": 1.9316093702464254, + "grad_norm": 3.7559821605682373, + "learning_rate": 5.981919337372688e-08, + "loss": 0.292, + "step": 15873 + }, + { + "epoch": 1.9317310617584424, + "grad_norm": 1.4685168266296387, + "learning_rate": 5.960633621221235e-08, + "loss": 0.3764, + "step": 15874 + }, + { + "epoch": 1.9318527532704595, + "grad_norm": 1.5051461458206177, + "learning_rate": 5.939385730325042e-08, + "loss": 0.3203, + "step": 15875 + }, + { + "epoch": 1.9319744447824765, + "grad_norm": 1.6219699382781982, + "learning_rate": 5.918175665492909e-08, + "loss": 0.3176, + "step": 15876 + }, + { + "epoch": 1.9320961362944935, + "grad_norm": 1.6281224489212036, + "learning_rate": 5.897003427531855e-08, + "loss": 0.3485, + "step": 15877 + }, + { + "epoch": 1.9322178278065105, + "grad_norm": 2.3135859966278076, + "learning_rate": 5.87586901724746e-08, + "loss": 0.3808, + "step": 15878 + }, + { + "epoch": 1.9323395193185275, + "grad_norm": 1.6280550956726074, + "learning_rate": 5.8547724354439673e-08, + "loss": 0.3935, + "step": 15879 + }, + { + "epoch": 1.9324612108305446, + "grad_norm": 1.603858470916748, + "learning_rate": 5.83371368292418e-08, + "loss": 0.3458, + "step": 15880 + }, + { + "epoch": 1.9325829023425616, + "grad_norm": 1.6695835590362549, + "learning_rate": 5.8126927604893467e-08, + "loss": 0.369, + "step": 15881 + }, + { + "epoch": 1.9327045938545786, + "grad_norm": 1.996770977973938, + "learning_rate": 5.791709668939383e-08, + "loss": 0.3828, + "step": 15882 + }, + { + "epoch": 1.9328262853665956, + "grad_norm": 1.8552868366241455, + "learning_rate": 5.770764409072871e-08, + "loss": 0.3891, + "step": 15883 + }, + { + "epoch": 1.9329479768786126, + "grad_norm": 1.8187626600265503, + "learning_rate": 5.749856981686619e-08, + "loss": 0.3679, + "step": 15884 + }, + { + "epoch": 1.9330696683906297, + "grad_norm": 2.197300672531128, + "learning_rate": 5.7289873875763234e-08, + "loss": 0.3222, + "step": 15885 + }, + { + "epoch": 1.9331913599026467, + "grad_norm": 1.9732637405395508, + "learning_rate": 5.708155627536127e-08, + "loss": 0.398, + "step": 15886 + }, + { + "epoch": 1.9333130514146637, + "grad_norm": 2.3674983978271484, + "learning_rate": 5.687361702358618e-08, + "loss": 0.3764, + "step": 15887 + }, + { + "epoch": 1.9334347429266807, + "grad_norm": 3.2757568359375, + "learning_rate": 5.6666056128351634e-08, + "loss": 0.4391, + "step": 15888 + }, + { + "epoch": 1.9335564344386977, + "grad_norm": 1.9373129606246948, + "learning_rate": 5.6458873597554645e-08, + "loss": 0.4269, + "step": 15889 + }, + { + "epoch": 1.933678125950715, + "grad_norm": 2.645625352859497, + "learning_rate": 5.6252069439080014e-08, + "loss": 0.3673, + "step": 15890 + }, + { + "epoch": 1.933799817462732, + "grad_norm": 1.5224021673202515, + "learning_rate": 5.604564366079701e-08, + "loss": 0.3461, + "step": 15891 + }, + { + "epoch": 1.933921508974749, + "grad_norm": 1.5954240560531616, + "learning_rate": 5.583959627055935e-08, + "loss": 0.3779, + "step": 15892 + }, + { + "epoch": 1.934043200486766, + "grad_norm": 2.3299670219421387, + "learning_rate": 5.563392727620853e-08, + "loss": 0.3916, + "step": 15893 + }, + { + "epoch": 1.934164891998783, + "grad_norm": 2.9327049255371094, + "learning_rate": 5.5428636685570524e-08, + "loss": 0.4028, + "step": 15894 + }, + { + "epoch": 1.9342865835108, + "grad_norm": 1.5220563411712646, + "learning_rate": 5.522372450645686e-08, + "loss": 0.3277, + "step": 15895 + }, + { + "epoch": 1.9344082750228173, + "grad_norm": 2.57047438621521, + "learning_rate": 5.501919074666462e-08, + "loss": 0.3375, + "step": 15896 + }, + { + "epoch": 1.9345299665348343, + "grad_norm": 2.0873324871063232, + "learning_rate": 5.481503541397759e-08, + "loss": 0.2955, + "step": 15897 + }, + { + "epoch": 1.9346516580468514, + "grad_norm": 1.910020112991333, + "learning_rate": 5.4611258516164e-08, + "loss": 0.4043, + "step": 15898 + }, + { + "epoch": 1.9347733495588684, + "grad_norm": 1.6588650941848755, + "learning_rate": 5.440786006097876e-08, + "loss": 0.3143, + "step": 15899 + }, + { + "epoch": 1.9348950410708854, + "grad_norm": 2.2277395725250244, + "learning_rate": 5.4204840056159e-08, + "loss": 0.3514, + "step": 15900 + }, + { + "epoch": 1.9350167325829024, + "grad_norm": 2.939955234527588, + "learning_rate": 5.4002198509433004e-08, + "loss": 0.391, + "step": 15901 + }, + { + "epoch": 1.9351384240949194, + "grad_norm": 1.6179908514022827, + "learning_rate": 5.379993542850903e-08, + "loss": 0.3564, + "step": 15902 + }, + { + "epoch": 1.9352601156069364, + "grad_norm": 2.887336492538452, + "learning_rate": 5.359805082108649e-08, + "loss": 0.3235, + "step": 15903 + }, + { + "epoch": 1.9353818071189535, + "grad_norm": 1.9876681566238403, + "learning_rate": 5.33965446948459e-08, + "loss": 0.3223, + "step": 15904 + }, + { + "epoch": 1.9355034986309705, + "grad_norm": 3.082040786743164, + "learning_rate": 5.3195417057454455e-08, + "loss": 0.3496, + "step": 15905 + }, + { + "epoch": 1.9356251901429875, + "grad_norm": 2.421233654022217, + "learning_rate": 5.299466791656604e-08, + "loss": 0.3516, + "step": 15906 + }, + { + "epoch": 1.9357468816550045, + "grad_norm": 2.028790235519409, + "learning_rate": 5.279429727982011e-08, + "loss": 0.2996, + "step": 15907 + }, + { + "epoch": 1.9358685731670215, + "grad_norm": 1.9469631910324097, + "learning_rate": 5.259430515484054e-08, + "loss": 0.3539, + "step": 15908 + }, + { + "epoch": 1.9359902646790386, + "grad_norm": 3.711719512939453, + "learning_rate": 5.239469154923793e-08, + "loss": 0.4552, + "step": 15909 + }, + { + "epoch": 1.9361119561910556, + "grad_norm": 1.7252106666564941, + "learning_rate": 5.219545647060731e-08, + "loss": 0.389, + "step": 15910 + }, + { + "epoch": 1.9362336477030726, + "grad_norm": 2.494966506958008, + "learning_rate": 5.1996599926531496e-08, + "loss": 0.3958, + "step": 15911 + }, + { + "epoch": 1.9363553392150896, + "grad_norm": 1.5225744247436523, + "learning_rate": 5.1798121924575565e-08, + "loss": 0.3981, + "step": 15912 + }, + { + "epoch": 1.9364770307271066, + "grad_norm": 1.4590448141098022, + "learning_rate": 5.160002247229234e-08, + "loss": 0.321, + "step": 15913 + }, + { + "epoch": 1.9365987222391237, + "grad_norm": 2.6050050258636475, + "learning_rate": 5.140230157722026e-08, + "loss": 0.4119, + "step": 15914 + }, + { + "epoch": 1.936720413751141, + "grad_norm": 2.958155870437622, + "learning_rate": 5.120495924688329e-08, + "loss": 0.3796, + "step": 15915 + }, + { + "epoch": 1.936842105263158, + "grad_norm": 2.356903553009033, + "learning_rate": 5.100799548879099e-08, + "loss": 0.4146, + "step": 15916 + }, + { + "epoch": 1.936963796775175, + "grad_norm": 2.283723831176758, + "learning_rate": 5.0811410310437346e-08, + "loss": 0.425, + "step": 15917 + }, + { + "epoch": 1.937085488287192, + "grad_norm": 1.717651128768921, + "learning_rate": 5.061520371930306e-08, + "loss": 0.3578, + "step": 15918 + }, + { + "epoch": 1.937207179799209, + "grad_norm": 2.4509005546569824, + "learning_rate": 5.041937572285438e-08, + "loss": 0.4109, + "step": 15919 + }, + { + "epoch": 1.937328871311226, + "grad_norm": 2.104342460632324, + "learning_rate": 5.022392632854311e-08, + "loss": 0.3676, + "step": 15920 + }, + { + "epoch": 1.9374505628232432, + "grad_norm": 1.9883087873458862, + "learning_rate": 5.0028855543806654e-08, + "loss": 0.3678, + "step": 15921 + }, + { + "epoch": 1.9375722543352603, + "grad_norm": 1.5696316957473755, + "learning_rate": 4.9834163376066836e-08, + "loss": 0.3506, + "step": 15922 + }, + { + "epoch": 1.9376939458472773, + "grad_norm": 2.4884350299835205, + "learning_rate": 4.963984983273329e-08, + "loss": 0.4102, + "step": 15923 + }, + { + "epoch": 1.9378156373592943, + "grad_norm": 1.818729043006897, + "learning_rate": 4.94459149212001e-08, + "loss": 0.3828, + "step": 15924 + }, + { + "epoch": 1.9379373288713113, + "grad_norm": 1.5971750020980835, + "learning_rate": 4.925235864884581e-08, + "loss": 0.3442, + "step": 15925 + }, + { + "epoch": 1.9380590203833283, + "grad_norm": 1.9785457849502563, + "learning_rate": 4.905918102303564e-08, + "loss": 0.3104, + "step": 15926 + }, + { + "epoch": 1.9381807118953454, + "grad_norm": 1.3062514066696167, + "learning_rate": 4.886638205112149e-08, + "loss": 0.3409, + "step": 15927 + }, + { + "epoch": 1.9383024034073624, + "grad_norm": 1.5457227230072021, + "learning_rate": 4.86739617404397e-08, + "loss": 0.3599, + "step": 15928 + }, + { + "epoch": 1.9384240949193794, + "grad_norm": 1.4135849475860596, + "learning_rate": 4.8481920098311095e-08, + "loss": 0.3431, + "step": 15929 + }, + { + "epoch": 1.9385457864313964, + "grad_norm": 3.181957244873047, + "learning_rate": 4.8290257132044274e-08, + "loss": 0.345, + "step": 15930 + }, + { + "epoch": 1.9386674779434134, + "grad_norm": 1.5733072757720947, + "learning_rate": 4.809897284893117e-08, + "loss": 0.3679, + "step": 15931 + }, + { + "epoch": 1.9387891694554305, + "grad_norm": 1.9676730632781982, + "learning_rate": 4.790806725625263e-08, + "loss": 0.3481, + "step": 15932 + }, + { + "epoch": 1.9389108609674475, + "grad_norm": 2.5600335597991943, + "learning_rate": 4.7717540361271744e-08, + "loss": 0.3413, + "step": 15933 + }, + { + "epoch": 1.9390325524794645, + "grad_norm": 1.895156979560852, + "learning_rate": 4.752739217123825e-08, + "loss": 0.4411, + "step": 15934 + }, + { + "epoch": 1.9391542439914815, + "grad_norm": 2.6482043266296387, + "learning_rate": 4.7337622693387486e-08, + "loss": 0.2969, + "step": 15935 + }, + { + "epoch": 1.9392759355034985, + "grad_norm": 2.233037233352661, + "learning_rate": 4.7148231934941445e-08, + "loss": 0.3547, + "step": 15936 + }, + { + "epoch": 1.9393976270155155, + "grad_norm": 1.7740463018417358, + "learning_rate": 4.695921990310659e-08, + "loss": 0.3599, + "step": 15937 + }, + { + "epoch": 1.9395193185275326, + "grad_norm": 3.1780738830566406, + "learning_rate": 4.6770586605074945e-08, + "loss": 0.3708, + "step": 15938 + }, + { + "epoch": 1.9396410100395496, + "grad_norm": 2.140378952026367, + "learning_rate": 4.65823320480252e-08, + "loss": 0.3065, + "step": 15939 + }, + { + "epoch": 1.9397627015515668, + "grad_norm": 1.6263338327407837, + "learning_rate": 4.6394456239119424e-08, + "loss": 0.389, + "step": 15940 + }, + { + "epoch": 1.9398843930635838, + "grad_norm": 2.4274227619171143, + "learning_rate": 4.6206959185507437e-08, + "loss": 0.3412, + "step": 15941 + }, + { + "epoch": 1.9400060845756009, + "grad_norm": 1.710370421409607, + "learning_rate": 4.6019840894323544e-08, + "loss": 0.3445, + "step": 15942 + }, + { + "epoch": 1.9401277760876179, + "grad_norm": 2.23415207862854, + "learning_rate": 4.5833101372688706e-08, + "loss": 0.3304, + "step": 15943 + }, + { + "epoch": 1.940249467599635, + "grad_norm": 2.093322992324829, + "learning_rate": 4.564674062770835e-08, + "loss": 0.3868, + "step": 15944 + }, + { + "epoch": 1.940371159111652, + "grad_norm": 1.8244776725769043, + "learning_rate": 4.546075866647459e-08, + "loss": 0.3683, + "step": 15945 + }, + { + "epoch": 1.9404928506236692, + "grad_norm": 1.6568856239318848, + "learning_rate": 4.5275155496062873e-08, + "loss": 0.3578, + "step": 15946 + }, + { + "epoch": 1.9406145421356862, + "grad_norm": 2.0331029891967773, + "learning_rate": 4.508993112353754e-08, + "loss": 0.4123, + "step": 15947 + }, + { + "epoch": 1.9407362336477032, + "grad_norm": 1.6933448314666748, + "learning_rate": 4.490508555594519e-08, + "loss": 0.3644, + "step": 15948 + }, + { + "epoch": 1.9408579251597202, + "grad_norm": 4.004017353057861, + "learning_rate": 4.4720618800319085e-08, + "loss": 0.428, + "step": 15949 + }, + { + "epoch": 1.9409796166717372, + "grad_norm": 1.8626937866210938, + "learning_rate": 4.453653086368137e-08, + "loss": 0.3997, + "step": 15950 + }, + { + "epoch": 1.9411013081837543, + "grad_norm": 1.7269723415374756, + "learning_rate": 4.4352821753035345e-08, + "loss": 0.3515, + "step": 15951 + }, + { + "epoch": 1.9412229996957713, + "grad_norm": 3.0375661849975586, + "learning_rate": 4.4169491475370975e-08, + "loss": 0.3747, + "step": 15952 + }, + { + "epoch": 1.9413446912077883, + "grad_norm": 1.7200435400009155, + "learning_rate": 4.3986540037664896e-08, + "loss": 0.3163, + "step": 15953 + }, + { + "epoch": 1.9414663827198053, + "grad_norm": 1.8690217733383179, + "learning_rate": 4.3803967446878204e-08, + "loss": 0.3644, + "step": 15954 + }, + { + "epoch": 1.9415880742318223, + "grad_norm": 2.563432216644287, + "learning_rate": 4.362177370995979e-08, + "loss": 0.3973, + "step": 15955 + }, + { + "epoch": 1.9417097657438394, + "grad_norm": 2.1452534198760986, + "learning_rate": 4.343995883384078e-08, + "loss": 0.3255, + "step": 15956 + }, + { + "epoch": 1.9418314572558564, + "grad_norm": 1.381900668144226, + "learning_rate": 4.325852282544119e-08, + "loss": 0.3546, + "step": 15957 + }, + { + "epoch": 1.9419531487678734, + "grad_norm": 1.7779309749603271, + "learning_rate": 4.3077465691663264e-08, + "loss": 0.3581, + "step": 15958 + }, + { + "epoch": 1.9420748402798904, + "grad_norm": 1.611080288887024, + "learning_rate": 4.289678743939707e-08, + "loss": 0.3374, + "step": 15959 + }, + { + "epoch": 1.9421965317919074, + "grad_norm": 2.6402385234832764, + "learning_rate": 4.2716488075519314e-08, + "loss": 0.3906, + "step": 15960 + }, + { + "epoch": 1.9423182233039245, + "grad_norm": 1.5428540706634521, + "learning_rate": 4.253656760688896e-08, + "loss": 0.3611, + "step": 15961 + }, + { + "epoch": 1.9424399148159415, + "grad_norm": 1.8161102533340454, + "learning_rate": 4.2357026040352744e-08, + "loss": 0.3672, + "step": 15962 + }, + { + "epoch": 1.9425616063279585, + "grad_norm": 2.851447105407715, + "learning_rate": 4.21778633827441e-08, + "loss": 0.3657, + "step": 15963 + }, + { + "epoch": 1.9426832978399755, + "grad_norm": 2.0196964740753174, + "learning_rate": 4.199907964087757e-08, + "loss": 0.3698, + "step": 15964 + }, + { + "epoch": 1.9428049893519928, + "grad_norm": 2.4098381996154785, + "learning_rate": 4.1820674821558825e-08, + "loss": 0.3131, + "step": 15965 + }, + { + "epoch": 1.9429266808640098, + "grad_norm": 1.8653944730758667, + "learning_rate": 4.164264893157577e-08, + "loss": 0.3446, + "step": 15966 + }, + { + "epoch": 1.9430483723760268, + "grad_norm": 2.68943190574646, + "learning_rate": 4.146500197770298e-08, + "loss": 0.4029, + "step": 15967 + }, + { + "epoch": 1.9431700638880438, + "grad_norm": 2.2966091632843018, + "learning_rate": 4.1287733966699495e-08, + "loss": 0.394, + "step": 15968 + }, + { + "epoch": 1.9432917554000608, + "grad_norm": 1.923325538635254, + "learning_rate": 4.111084490531214e-08, + "loss": 0.3354, + "step": 15969 + }, + { + "epoch": 1.9434134469120778, + "grad_norm": 1.472470760345459, + "learning_rate": 4.093433480026887e-08, + "loss": 0.3755, + "step": 15970 + }, + { + "epoch": 1.9435351384240949, + "grad_norm": 2.5652997493743896, + "learning_rate": 4.075820365828986e-08, + "loss": 0.4053, + "step": 15971 + }, + { + "epoch": 1.943656829936112, + "grad_norm": 1.952427625656128, + "learning_rate": 4.058245148607532e-08, + "loss": 0.3667, + "step": 15972 + }, + { + "epoch": 1.9437785214481291, + "grad_norm": 1.5961142778396606, + "learning_rate": 4.040707829031321e-08, + "loss": 0.3591, + "step": 15973 + }, + { + "epoch": 1.9439002129601461, + "grad_norm": 1.9078994989395142, + "learning_rate": 4.023208407767709e-08, + "loss": 0.3099, + "step": 15974 + }, + { + "epoch": 1.9440219044721632, + "grad_norm": 1.7736799716949463, + "learning_rate": 4.005746885482609e-08, + "loss": 0.4071, + "step": 15975 + }, + { + "epoch": 1.9441435959841802, + "grad_norm": 2.0563454627990723, + "learning_rate": 3.9883232628403766e-08, + "loss": 0.3942, + "step": 15976 + }, + { + "epoch": 1.9442652874961972, + "grad_norm": 1.8751639127731323, + "learning_rate": 3.970937540504039e-08, + "loss": 0.3518, + "step": 15977 + }, + { + "epoch": 1.9443869790082142, + "grad_norm": 1.75934636592865, + "learning_rate": 3.953589719135287e-08, + "loss": 0.3584, + "step": 15978 + }, + { + "epoch": 1.9445086705202312, + "grad_norm": 2.3915746212005615, + "learning_rate": 3.936279799394149e-08, + "loss": 0.3663, + "step": 15979 + }, + { + "epoch": 1.9446303620322483, + "grad_norm": 1.4720486402511597, + "learning_rate": 3.9190077819393214e-08, + "loss": 0.3363, + "step": 15980 + }, + { + "epoch": 1.9447520535442653, + "grad_norm": 1.9294682741165161, + "learning_rate": 3.901773667427944e-08, + "loss": 0.3896, + "step": 15981 + }, + { + "epoch": 1.9448737450562823, + "grad_norm": 2.3747177124023438, + "learning_rate": 3.884577456515826e-08, + "loss": 0.4731, + "step": 15982 + }, + { + "epoch": 1.9449954365682993, + "grad_norm": 1.736438274383545, + "learning_rate": 3.867419149857554e-08, + "loss": 0.3822, + "step": 15983 + }, + { + "epoch": 1.9451171280803163, + "grad_norm": 2.718778371810913, + "learning_rate": 3.8502987481057185e-08, + "loss": 0.3277, + "step": 15984 + }, + { + "epoch": 1.9452388195923334, + "grad_norm": 2.622084856033325, + "learning_rate": 3.8332162519120196e-08, + "loss": 0.394, + "step": 15985 + }, + { + "epoch": 1.9453605111043504, + "grad_norm": 1.9948043823242188, + "learning_rate": 3.816171661926382e-08, + "loss": 0.3631, + "step": 15986 + }, + { + "epoch": 1.9454822026163674, + "grad_norm": 1.9194804430007935, + "learning_rate": 3.799164978797398e-08, + "loss": 0.2973, + "step": 15987 + }, + { + "epoch": 1.9456038941283844, + "grad_norm": 2.271669864654541, + "learning_rate": 3.782196203172217e-08, + "loss": 0.3971, + "step": 15988 + }, + { + "epoch": 1.9457255856404014, + "grad_norm": 1.896245002746582, + "learning_rate": 3.765265335696433e-08, + "loss": 0.4208, + "step": 15989 + }, + { + "epoch": 1.9458472771524185, + "grad_norm": 1.5037046670913696, + "learning_rate": 3.7483723770145306e-08, + "loss": 0.3632, + "step": 15990 + }, + { + "epoch": 1.9459689686644357, + "grad_norm": 2.6111867427825928, + "learning_rate": 3.7315173277691075e-08, + "loss": 0.3129, + "step": 15991 + }, + { + "epoch": 1.9460906601764527, + "grad_norm": 2.1137571334838867, + "learning_rate": 3.71470018860165e-08, + "loss": 0.3001, + "step": 15992 + }, + { + "epoch": 1.9462123516884697, + "grad_norm": 1.4770915508270264, + "learning_rate": 3.697920960151979e-08, + "loss": 0.3151, + "step": 15993 + }, + { + "epoch": 1.9463340432004868, + "grad_norm": 1.6807974576950073, + "learning_rate": 3.6811796430588074e-08, + "loss": 0.3619, + "step": 15994 + }, + { + "epoch": 1.9464557347125038, + "grad_norm": 1.555757999420166, + "learning_rate": 3.664476237958847e-08, + "loss": 0.3426, + "step": 15995 + }, + { + "epoch": 1.9465774262245208, + "grad_norm": 2.976696252822876, + "learning_rate": 3.6478107454879234e-08, + "loss": 0.4211, + "step": 15996 + }, + { + "epoch": 1.946699117736538, + "grad_norm": 1.7319098711013794, + "learning_rate": 3.631183166280194e-08, + "loss": 0.3544, + "step": 15997 + }, + { + "epoch": 1.946820809248555, + "grad_norm": 1.7513951063156128, + "learning_rate": 3.6145935009681554e-08, + "loss": 0.3818, + "step": 15998 + }, + { + "epoch": 1.946942500760572, + "grad_norm": 2.1964359283447266, + "learning_rate": 3.598041750183412e-08, + "loss": 0.3394, + "step": 15999 + }, + { + "epoch": 1.947064192272589, + "grad_norm": 1.5427451133728027, + "learning_rate": 3.5815279145555716e-08, + "loss": 0.3621, + "step": 16000 + }, + { + "epoch": 1.947185883784606, + "grad_norm": 1.4409089088439941, + "learning_rate": 3.56505199471302e-08, + "loss": 0.3287, + "step": 16001 + }, + { + "epoch": 1.9473075752966231, + "grad_norm": 1.894395112991333, + "learning_rate": 3.548613991282812e-08, + "loss": 0.4158, + "step": 16002 + }, + { + "epoch": 1.9474292668086401, + "grad_norm": 2.9766595363616943, + "learning_rate": 3.532213904890336e-08, + "loss": 0.3318, + "step": 16003 + }, + { + "epoch": 1.9475509583206572, + "grad_norm": 2.333193778991699, + "learning_rate": 3.515851736159648e-08, + "loss": 0.3275, + "step": 16004 + }, + { + "epoch": 1.9476726498326742, + "grad_norm": 1.5578022003173828, + "learning_rate": 3.499527485713583e-08, + "loss": 0.3629, + "step": 16005 + }, + { + "epoch": 1.9477943413446912, + "grad_norm": 1.478871464729309, + "learning_rate": 3.483241154172978e-08, + "loss": 0.3571, + "step": 16006 + }, + { + "epoch": 1.9479160328567082, + "grad_norm": 1.7768558263778687, + "learning_rate": 3.466992742157782e-08, + "loss": 0.3865, + "step": 16007 + }, + { + "epoch": 1.9480377243687252, + "grad_norm": 4.546865463256836, + "learning_rate": 3.4507822502861666e-08, + "loss": 0.4083, + "step": 16008 + }, + { + "epoch": 1.9481594158807423, + "grad_norm": 1.4959570169448853, + "learning_rate": 3.4346096791750825e-08, + "loss": 0.3811, + "step": 16009 + }, + { + "epoch": 1.9482811073927593, + "grad_norm": 1.3378942012786865, + "learning_rate": 3.418475029439927e-08, + "loss": 0.3596, + "step": 16010 + }, + { + "epoch": 1.9484027989047763, + "grad_norm": 1.8297083377838135, + "learning_rate": 3.402378301694431e-08, + "loss": 0.3718, + "step": 16011 + }, + { + "epoch": 1.9485244904167933, + "grad_norm": 2.106321334838867, + "learning_rate": 3.386319496551438e-08, + "loss": 0.4147, + "step": 16012 + }, + { + "epoch": 1.9486461819288103, + "grad_norm": 2.4800286293029785, + "learning_rate": 3.370298614621903e-08, + "loss": 0.3483, + "step": 16013 + }, + { + "epoch": 1.9487678734408274, + "grad_norm": 1.6255266666412354, + "learning_rate": 3.354315656515339e-08, + "loss": 0.3275, + "step": 16014 + }, + { + "epoch": 1.9488895649528444, + "grad_norm": 1.6460814476013184, + "learning_rate": 3.338370622840037e-08, + "loss": 0.3962, + "step": 16015 + }, + { + "epoch": 1.9490112564648616, + "grad_norm": 2.4279298782348633, + "learning_rate": 3.322463514202623e-08, + "loss": 0.3808, + "step": 16016 + }, + { + "epoch": 1.9491329479768786, + "grad_norm": 2.511500358581543, + "learning_rate": 3.306594331208501e-08, + "loss": 0.3024, + "step": 16017 + }, + { + "epoch": 1.9492546394888957, + "grad_norm": 2.590264320373535, + "learning_rate": 3.2907630744616335e-08, + "loss": 0.4136, + "step": 16018 + }, + { + "epoch": 1.9493763310009127, + "grad_norm": 2.1603734493255615, + "learning_rate": 3.274969744564205e-08, + "loss": 0.4408, + "step": 16019 + }, + { + "epoch": 1.9494980225129297, + "grad_norm": 1.8866347074508667, + "learning_rate": 3.259214342117289e-08, + "loss": 0.3826, + "step": 16020 + }, + { + "epoch": 1.9496197140249467, + "grad_norm": 1.7558435201644897, + "learning_rate": 3.243496867720408e-08, + "loss": 0.4097, + "step": 16021 + }, + { + "epoch": 1.949741405536964, + "grad_norm": 2.186951160430908, + "learning_rate": 3.227817321971638e-08, + "loss": 0.3438, + "step": 16022 + }, + { + "epoch": 1.949863097048981, + "grad_norm": 1.4934724569320679, + "learning_rate": 3.212175705467613e-08, + "loss": 0.3182, + "step": 16023 + }, + { + "epoch": 1.949984788560998, + "grad_norm": 2.3411896228790283, + "learning_rate": 3.1965720188036345e-08, + "loss": 0.3446, + "step": 16024 + }, + { + "epoch": 1.950106480073015, + "grad_norm": 1.7738183736801147, + "learning_rate": 3.1810062625732274e-08, + "loss": 0.3253, + "step": 16025 + }, + { + "epoch": 1.950228171585032, + "grad_norm": 1.5678980350494385, + "learning_rate": 3.1654784373690295e-08, + "loss": 0.3764, + "step": 16026 + }, + { + "epoch": 1.950349863097049, + "grad_norm": 2.606781482696533, + "learning_rate": 3.149988543781457e-08, + "loss": 0.4517, + "step": 16027 + }, + { + "epoch": 1.950471554609066, + "grad_norm": 2.046215772628784, + "learning_rate": 3.134536582400372e-08, + "loss": 0.3378, + "step": 16028 + }, + { + "epoch": 1.950593246121083, + "grad_norm": 1.9816588163375854, + "learning_rate": 3.119122553813525e-08, + "loss": 0.4039, + "step": 16029 + }, + { + "epoch": 1.9507149376331, + "grad_norm": 2.3498826026916504, + "learning_rate": 3.103746458607448e-08, + "loss": 0.4089, + "step": 16030 + }, + { + "epoch": 1.9508366291451171, + "grad_norm": 1.5352933406829834, + "learning_rate": 3.08840829736734e-08, + "loss": 0.3538, + "step": 16031 + }, + { + "epoch": 1.9509583206571341, + "grad_norm": 2.039468288421631, + "learning_rate": 3.073108070676733e-08, + "loss": 0.3778, + "step": 16032 + }, + { + "epoch": 1.9510800121691512, + "grad_norm": 2.404261350631714, + "learning_rate": 3.0578457791179404e-08, + "loss": 0.3906, + "step": 16033 + }, + { + "epoch": 1.9512017036811682, + "grad_norm": 1.7531213760375977, + "learning_rate": 3.042621423271608e-08, + "loss": 0.3398, + "step": 16034 + }, + { + "epoch": 1.9513233951931852, + "grad_norm": 1.822922706604004, + "learning_rate": 3.027435003717161e-08, + "loss": 0.3521, + "step": 16035 + }, + { + "epoch": 1.9514450867052022, + "grad_norm": 2.2279117107391357, + "learning_rate": 3.0122865210324704e-08, + "loss": 0.4432, + "step": 16036 + }, + { + "epoch": 1.9515667782172192, + "grad_norm": 2.238145112991333, + "learning_rate": 2.9971759757939644e-08, + "loss": 0.3728, + "step": 16037 + }, + { + "epoch": 1.9516884697292363, + "grad_norm": 1.7669050693511963, + "learning_rate": 2.9821033685764055e-08, + "loss": 0.429, + "step": 16038 + }, + { + "epoch": 1.9518101612412533, + "grad_norm": 1.7786117792129517, + "learning_rate": 2.967068699953668e-08, + "loss": 0.3257, + "step": 16039 + }, + { + "epoch": 1.9519318527532703, + "grad_norm": 1.7841204404830933, + "learning_rate": 2.9520719704977386e-08, + "loss": 0.3802, + "step": 16040 + }, + { + "epoch": 1.9520535442652875, + "grad_norm": 2.8488128185272217, + "learning_rate": 2.937113180779161e-08, + "loss": 0.3097, + "step": 16041 + }, + { + "epoch": 1.9521752357773046, + "grad_norm": 1.5512897968292236, + "learning_rate": 2.9221923313672573e-08, + "loss": 0.3635, + "step": 16042 + }, + { + "epoch": 1.9522969272893216, + "grad_norm": 1.670295000076294, + "learning_rate": 2.907309422829796e-08, + "loss": 0.3826, + "step": 16043 + }, + { + "epoch": 1.9524186188013386, + "grad_norm": 1.8494594097137451, + "learning_rate": 2.8924644557331015e-08, + "loss": 0.3266, + "step": 16044 + }, + { + "epoch": 1.9525403103133556, + "grad_norm": 2.493541955947876, + "learning_rate": 2.8776574306419446e-08, + "loss": 0.3569, + "step": 16045 + }, + { + "epoch": 1.9526620018253726, + "grad_norm": 1.8433974981307983, + "learning_rate": 2.8628883481198743e-08, + "loss": 0.4145, + "step": 16046 + }, + { + "epoch": 1.9527836933373899, + "grad_norm": 1.6594692468643188, + "learning_rate": 2.8481572087288855e-08, + "loss": 0.3873, + "step": 16047 + }, + { + "epoch": 1.952905384849407, + "grad_norm": 1.4882068634033203, + "learning_rate": 2.83346401302953e-08, + "loss": 0.3582, + "step": 16048 + }, + { + "epoch": 1.953027076361424, + "grad_norm": 2.0764756202697754, + "learning_rate": 2.8188087615809157e-08, + "loss": 0.3712, + "step": 16049 + }, + { + "epoch": 1.953148767873441, + "grad_norm": 2.594630002975464, + "learning_rate": 2.8041914549405967e-08, + "loss": 0.4236, + "step": 16050 + }, + { + "epoch": 1.953270459385458, + "grad_norm": 2.0064070224761963, + "learning_rate": 2.7896120936650174e-08, + "loss": 0.4179, + "step": 16051 + }, + { + "epoch": 1.953392150897475, + "grad_norm": 2.3302161693573, + "learning_rate": 2.7750706783086222e-08, + "loss": 0.3522, + "step": 16052 + }, + { + "epoch": 1.953513842409492, + "grad_norm": 1.6610610485076904, + "learning_rate": 2.7605672094250802e-08, + "loss": 0.3613, + "step": 16053 + }, + { + "epoch": 1.953635533921509, + "grad_norm": 5.792760848999023, + "learning_rate": 2.7461016875660608e-08, + "loss": 0.5273, + "step": 16054 + }, + { + "epoch": 1.953757225433526, + "grad_norm": 1.9264659881591797, + "learning_rate": 2.7316741132820122e-08, + "loss": 0.3776, + "step": 16055 + }, + { + "epoch": 1.953878916945543, + "grad_norm": 1.7694602012634277, + "learning_rate": 2.7172844871221625e-08, + "loss": 0.3428, + "step": 16056 + }, + { + "epoch": 1.95400060845756, + "grad_norm": 2.1041338443756104, + "learning_rate": 2.7029328096337403e-08, + "loss": 0.3819, + "step": 16057 + }, + { + "epoch": 1.954122299969577, + "grad_norm": 3.1114678382873535, + "learning_rate": 2.6886190813630862e-08, + "loss": 0.3021, + "step": 16058 + }, + { + "epoch": 1.9542439914815941, + "grad_norm": 2.371156692504883, + "learning_rate": 2.674343302854765e-08, + "loss": 0.4205, + "step": 16059 + }, + { + "epoch": 1.9543656829936111, + "grad_norm": 2.124628782272339, + "learning_rate": 2.6601054746521194e-08, + "loss": 0.4336, + "step": 16060 + }, + { + "epoch": 1.9544873745056281, + "grad_norm": 2.71779203414917, + "learning_rate": 2.6459055972967162e-08, + "loss": 0.3969, + "step": 16061 + }, + { + "epoch": 1.9546090660176452, + "grad_norm": 1.8790838718414307, + "learning_rate": 2.6317436713291233e-08, + "loss": 0.3227, + "step": 16062 + }, + { + "epoch": 1.9547307575296622, + "grad_norm": 1.962717056274414, + "learning_rate": 2.6176196972880208e-08, + "loss": 0.4185, + "step": 16063 + }, + { + "epoch": 1.9548524490416792, + "grad_norm": 2.240490198135376, + "learning_rate": 2.6035336757110896e-08, + "loss": 0.3573, + "step": 16064 + }, + { + "epoch": 1.9549741405536962, + "grad_norm": 2.883488178253174, + "learning_rate": 2.5894856071341236e-08, + "loss": 0.3443, + "step": 16065 + }, + { + "epoch": 1.9550958320657135, + "grad_norm": 1.692785620689392, + "learning_rate": 2.5754754920916947e-08, + "loss": 0.3398, + "step": 16066 + }, + { + "epoch": 1.9552175235777305, + "grad_norm": 2.052774667739868, + "learning_rate": 2.5615033311170433e-08, + "loss": 0.3912, + "step": 16067 + }, + { + "epoch": 1.9553392150897475, + "grad_norm": 1.7729231119155884, + "learning_rate": 2.5475691247417444e-08, + "loss": 0.389, + "step": 16068 + }, + { + "epoch": 1.9554609066017645, + "grad_norm": 4.135091304779053, + "learning_rate": 2.5336728734960403e-08, + "loss": 0.4294, + "step": 16069 + }, + { + "epoch": 1.9555825981137815, + "grad_norm": 1.6622675657272339, + "learning_rate": 2.5198145779088413e-08, + "loss": 0.3551, + "step": 16070 + }, + { + "epoch": 1.9557042896257986, + "grad_norm": 1.3635361194610596, + "learning_rate": 2.505994238507281e-08, + "loss": 0.3084, + "step": 16071 + }, + { + "epoch": 1.9558259811378156, + "grad_norm": 1.7262868881225586, + "learning_rate": 2.4922118558173835e-08, + "loss": 0.3765, + "step": 16072 + }, + { + "epoch": 1.9559476726498328, + "grad_norm": 2.049860954284668, + "learning_rate": 2.478467430363507e-08, + "loss": 0.4249, + "step": 16073 + }, + { + "epoch": 1.9560693641618498, + "grad_norm": 1.459931492805481, + "learning_rate": 2.4647609626686774e-08, + "loss": 0.3565, + "step": 16074 + }, + { + "epoch": 1.9561910556738669, + "grad_norm": 2.6633172035217285, + "learning_rate": 2.4510924532544777e-08, + "loss": 0.3888, + "step": 16075 + }, + { + "epoch": 1.9563127471858839, + "grad_norm": 2.477372884750366, + "learning_rate": 2.4374619026410473e-08, + "loss": 0.3738, + "step": 16076 + }, + { + "epoch": 1.956434438697901, + "grad_norm": 2.220189094543457, + "learning_rate": 2.4238693113470824e-08, + "loss": 0.3724, + "step": 16077 + }, + { + "epoch": 1.956556130209918, + "grad_norm": 2.200385093688965, + "learning_rate": 2.4103146798897247e-08, + "loss": 0.3123, + "step": 16078 + }, + { + "epoch": 1.956677821721935, + "grad_norm": 2.3237109184265137, + "learning_rate": 2.3967980087847842e-08, + "loss": 0.3445, + "step": 16079 + }, + { + "epoch": 1.956799513233952, + "grad_norm": 2.357635498046875, + "learning_rate": 2.383319298546627e-08, + "loss": 0.3084, + "step": 16080 + }, + { + "epoch": 1.956921204745969, + "grad_norm": 1.877802848815918, + "learning_rate": 2.369878549688065e-08, + "loss": 0.3306, + "step": 16081 + }, + { + "epoch": 1.957042896257986, + "grad_norm": 1.9022371768951416, + "learning_rate": 2.356475762720578e-08, + "loss": 0.363, + "step": 16082 + }, + { + "epoch": 1.957164587770003, + "grad_norm": 1.8036013841629028, + "learning_rate": 2.3431109381543137e-08, + "loss": 0.3516, + "step": 16083 + }, + { + "epoch": 1.95728627928202, + "grad_norm": 1.7448277473449707, + "learning_rate": 2.3297840764976432e-08, + "loss": 0.3648, + "step": 16084 + }, + { + "epoch": 1.957407970794037, + "grad_norm": 2.354055643081665, + "learning_rate": 2.316495178257827e-08, + "loss": 0.3157, + "step": 16085 + }, + { + "epoch": 1.957529662306054, + "grad_norm": 1.772237777709961, + "learning_rate": 2.3032442439403502e-08, + "loss": 0.3831, + "step": 16086 + }, + { + "epoch": 1.957651353818071, + "grad_norm": 1.6501935720443726, + "learning_rate": 2.2900312740495866e-08, + "loss": 0.3756, + "step": 16087 + }, + { + "epoch": 1.9577730453300881, + "grad_norm": 2.007291078567505, + "learning_rate": 2.2768562690883568e-08, + "loss": 0.3488, + "step": 16088 + }, + { + "epoch": 1.9578947368421051, + "grad_norm": 1.6944053173065186, + "learning_rate": 2.2637192295578147e-08, + "loss": 0.3762, + "step": 16089 + }, + { + "epoch": 1.9580164283541222, + "grad_norm": 1.4936788082122803, + "learning_rate": 2.2506201559580056e-08, + "loss": 0.3153, + "step": 16090 + }, + { + "epoch": 1.9581381198661392, + "grad_norm": 1.573096752166748, + "learning_rate": 2.2375590487873077e-08, + "loss": 0.377, + "step": 16091 + }, + { + "epoch": 1.9582598113781564, + "grad_norm": 2.0307915210723877, + "learning_rate": 2.2245359085427687e-08, + "loss": 0.3704, + "step": 16092 + }, + { + "epoch": 1.9583815028901734, + "grad_norm": 1.6768741607666016, + "learning_rate": 2.2115507357198806e-08, + "loss": 0.3644, + "step": 16093 + }, + { + "epoch": 1.9585031944021905, + "grad_norm": 2.0135579109191895, + "learning_rate": 2.198603530812915e-08, + "loss": 0.396, + "step": 16094 + }, + { + "epoch": 1.9586248859142075, + "grad_norm": 1.7168631553649902, + "learning_rate": 2.1856942943142556e-08, + "loss": 0.3222, + "step": 16095 + }, + { + "epoch": 1.9587465774262245, + "grad_norm": 1.6933259963989258, + "learning_rate": 2.172823026715398e-08, + "loss": 0.405, + "step": 16096 + }, + { + "epoch": 1.9588682689382415, + "grad_norm": 3.757620096206665, + "learning_rate": 2.1599897285059514e-08, + "loss": 0.406, + "step": 16097 + }, + { + "epoch": 1.9589899604502587, + "grad_norm": 2.306986093521118, + "learning_rate": 2.1471944001744128e-08, + "loss": 0.4266, + "step": 16098 + }, + { + "epoch": 1.9591116519622758, + "grad_norm": 1.6254311800003052, + "learning_rate": 2.1344370422075043e-08, + "loss": 0.373, + "step": 16099 + }, + { + "epoch": 1.9592333434742928, + "grad_norm": 1.3391528129577637, + "learning_rate": 2.121717655090727e-08, + "loss": 0.3522, + "step": 16100 + }, + { + "epoch": 1.9593550349863098, + "grad_norm": 1.8421125411987305, + "learning_rate": 2.1090362393080266e-08, + "loss": 0.3164, + "step": 16101 + }, + { + "epoch": 1.9594767264983268, + "grad_norm": 1.8858824968338013, + "learning_rate": 2.0963927953421283e-08, + "loss": 0.3655, + "step": 16102 + }, + { + "epoch": 1.9595984180103438, + "grad_norm": 1.868971824645996, + "learning_rate": 2.0837873236739803e-08, + "loss": 0.3726, + "step": 16103 + }, + { + "epoch": 1.9597201095223609, + "grad_norm": 1.9227591753005981, + "learning_rate": 2.0712198247831994e-08, + "loss": 0.3896, + "step": 16104 + }, + { + "epoch": 1.9598418010343779, + "grad_norm": 2.1970951557159424, + "learning_rate": 2.0586902991482915e-08, + "loss": 0.3871, + "step": 16105 + }, + { + "epoch": 1.959963492546395, + "grad_norm": 1.4184823036193848, + "learning_rate": 2.0461987472457644e-08, + "loss": 0.3336, + "step": 16106 + }, + { + "epoch": 1.960085184058412, + "grad_norm": 1.4658305644989014, + "learning_rate": 2.033745169551016e-08, + "loss": 0.3737, + "step": 16107 + }, + { + "epoch": 1.960206875570429, + "grad_norm": 2.533212900161743, + "learning_rate": 2.0213295665378885e-08, + "loss": 0.2941, + "step": 16108 + }, + { + "epoch": 1.960328567082446, + "grad_norm": 1.6192365884780884, + "learning_rate": 2.0089519386788935e-08, + "loss": 0.3499, + "step": 16109 + }, + { + "epoch": 1.960450258594463, + "grad_norm": 1.6580541133880615, + "learning_rate": 1.9966122864450986e-08, + "loss": 0.3724, + "step": 16110 + }, + { + "epoch": 1.96057195010648, + "grad_norm": 1.4288220405578613, + "learning_rate": 1.9843106103059063e-08, + "loss": 0.3662, + "step": 16111 + }, + { + "epoch": 1.960693641618497, + "grad_norm": 1.8832440376281738, + "learning_rate": 1.972046910729386e-08, + "loss": 0.4455, + "step": 16112 + }, + { + "epoch": 1.960815333130514, + "grad_norm": 1.7494819164276123, + "learning_rate": 1.9598211881823868e-08, + "loss": 0.3949, + "step": 16113 + }, + { + "epoch": 1.960937024642531, + "grad_norm": 1.9469034671783447, + "learning_rate": 1.9476334431299815e-08, + "loss": 0.3741, + "step": 16114 + }, + { + "epoch": 1.961058716154548, + "grad_norm": 1.8368041515350342, + "learning_rate": 1.9354836760360206e-08, + "loss": 0.4006, + "step": 16115 + }, + { + "epoch": 1.961180407666565, + "grad_norm": 1.9660885334014893, + "learning_rate": 1.9233718873628016e-08, + "loss": 0.3155, + "step": 16116 + }, + { + "epoch": 1.9613020991785823, + "grad_norm": 2.2395100593566895, + "learning_rate": 1.9112980775711775e-08, + "loss": 0.4024, + "step": 16117 + }, + { + "epoch": 1.9614237906905994, + "grad_norm": 2.684568405151367, + "learning_rate": 1.8992622471205592e-08, + "loss": 0.4188, + "step": 16118 + }, + { + "epoch": 1.9615454822026164, + "grad_norm": 2.0264804363250732, + "learning_rate": 1.887264396468913e-08, + "loss": 0.3289, + "step": 16119 + }, + { + "epoch": 1.9616671737146334, + "grad_norm": 1.872266411781311, + "learning_rate": 1.8753045260728742e-08, + "loss": 0.3763, + "step": 16120 + }, + { + "epoch": 1.9617888652266504, + "grad_norm": 2.1223602294921875, + "learning_rate": 1.863382636387523e-08, + "loss": 0.3701, + "step": 16121 + }, + { + "epoch": 1.9619105567386674, + "grad_norm": 1.5758085250854492, + "learning_rate": 1.8514987278664966e-08, + "loss": 0.381, + "step": 16122 + }, + { + "epoch": 1.9620322482506847, + "grad_norm": 1.8471357822418213, + "learning_rate": 1.8396528009619887e-08, + "loss": 0.3878, + "step": 16123 + }, + { + "epoch": 1.9621539397627017, + "grad_norm": 2.659285545349121, + "learning_rate": 1.8278448561247496e-08, + "loss": 0.3419, + "step": 16124 + }, + { + "epoch": 1.9622756312747187, + "grad_norm": 1.811944603919983, + "learning_rate": 1.8160748938041984e-08, + "loss": 0.3854, + "step": 16125 + }, + { + "epoch": 1.9623973227867357, + "grad_norm": 2.0797219276428223, + "learning_rate": 1.8043429144480874e-08, + "loss": 0.3321, + "step": 16126 + }, + { + "epoch": 1.9625190142987528, + "grad_norm": 1.619655966758728, + "learning_rate": 1.7926489185028373e-08, + "loss": 0.3394, + "step": 16127 + }, + { + "epoch": 1.9626407058107698, + "grad_norm": 1.9596425294876099, + "learning_rate": 1.780992906413537e-08, + "loss": 0.3439, + "step": 16128 + }, + { + "epoch": 1.9627623973227868, + "grad_norm": 1.8175820112228394, + "learning_rate": 1.7693748786236087e-08, + "loss": 0.3718, + "step": 16129 + }, + { + "epoch": 1.9628840888348038, + "grad_norm": 1.6436271667480469, + "learning_rate": 1.757794835575144e-08, + "loss": 0.3715, + "step": 16130 + }, + { + "epoch": 1.9630057803468208, + "grad_norm": 2.266298532485962, + "learning_rate": 1.746252777708901e-08, + "loss": 0.3947, + "step": 16131 + }, + { + "epoch": 1.9631274718588378, + "grad_norm": 1.6623202562332153, + "learning_rate": 1.7347487054639732e-08, + "loss": 0.3988, + "step": 16132 + }, + { + "epoch": 1.9632491633708549, + "grad_norm": 1.9400012493133545, + "learning_rate": 1.7232826192782327e-08, + "loss": 0.4185, + "step": 16133 + }, + { + "epoch": 1.9633708548828719, + "grad_norm": 2.1501822471618652, + "learning_rate": 1.7118545195877745e-08, + "loss": 0.3492, + "step": 16134 + }, + { + "epoch": 1.963492546394889, + "grad_norm": 2.000150203704834, + "learning_rate": 1.7004644068276955e-08, + "loss": 0.3468, + "step": 16135 + }, + { + "epoch": 1.963614237906906, + "grad_norm": 2.302781820297241, + "learning_rate": 1.6891122814313154e-08, + "loss": 0.3665, + "step": 16136 + }, + { + "epoch": 1.963735929418923, + "grad_norm": 1.699919581413269, + "learning_rate": 1.6777981438305113e-08, + "loss": 0.3105, + "step": 16137 + }, + { + "epoch": 1.96385762093094, + "grad_norm": 1.5578714609146118, + "learning_rate": 1.6665219944560497e-08, + "loss": 0.3423, + "step": 16138 + }, + { + "epoch": 1.963979312442957, + "grad_norm": 1.8221158981323242, + "learning_rate": 1.6552838337366983e-08, + "loss": 0.3813, + "step": 16139 + }, + { + "epoch": 1.964101003954974, + "grad_norm": 1.7642605304718018, + "learning_rate": 1.6440836621003375e-08, + "loss": 0.3716, + "step": 16140 + }, + { + "epoch": 1.964222695466991, + "grad_norm": 1.5965232849121094, + "learning_rate": 1.6329214799731817e-08, + "loss": 0.3911, + "step": 16141 + }, + { + "epoch": 1.9643443869790083, + "grad_norm": 1.5525596141815186, + "learning_rate": 1.6217972877796695e-08, + "loss": 0.333, + "step": 16142 + }, + { + "epoch": 1.9644660784910253, + "grad_norm": 1.4319900274276733, + "learning_rate": 1.6107110859434617e-08, + "loss": 0.3741, + "step": 16143 + }, + { + "epoch": 1.9645877700030423, + "grad_norm": 2.2056970596313477, + "learning_rate": 1.59966287488611e-08, + "loss": 0.3785, + "step": 16144 + }, + { + "epoch": 1.9647094615150593, + "grad_norm": 1.7470972537994385, + "learning_rate": 1.5886526550282776e-08, + "loss": 0.3641, + "step": 16145 + }, + { + "epoch": 1.9648311530270763, + "grad_norm": 2.975201368331909, + "learning_rate": 1.5776804267887414e-08, + "loss": 0.3732, + "step": 16146 + }, + { + "epoch": 1.9649528445390934, + "grad_norm": 1.6766966581344604, + "learning_rate": 1.5667461905850556e-08, + "loss": 0.334, + "step": 16147 + }, + { + "epoch": 1.9650745360511106, + "grad_norm": 1.821400761604309, + "learning_rate": 1.5558499468334432e-08, + "loss": 0.4138, + "step": 16148 + }, + { + "epoch": 1.9651962275631276, + "grad_norm": 1.8996427059173584, + "learning_rate": 1.5449916959483502e-08, + "loss": 0.4095, + "step": 16149 + }, + { + "epoch": 1.9653179190751446, + "grad_norm": 2.5826408863067627, + "learning_rate": 1.5341714383430018e-08, + "loss": 0.3497, + "step": 16150 + }, + { + "epoch": 1.9654396105871617, + "grad_norm": 1.8483394384384155, + "learning_rate": 1.5233891744290684e-08, + "loss": 0.35, + "step": 16151 + }, + { + "epoch": 1.9655613020991787, + "grad_norm": 1.6593778133392334, + "learning_rate": 1.512644904617e-08, + "loss": 0.4012, + "step": 16152 + }, + { + "epoch": 1.9656829936111957, + "grad_norm": 1.8529874086380005, + "learning_rate": 1.50193862931558e-08, + "loss": 0.3777, + "step": 16153 + }, + { + "epoch": 1.9658046851232127, + "grad_norm": 1.5400457382202148, + "learning_rate": 1.49127034893215e-08, + "loss": 0.3583, + "step": 16154 + }, + { + "epoch": 1.9659263766352297, + "grad_norm": 2.7630953788757324, + "learning_rate": 1.480640063872607e-08, + "loss": 0.3829, + "step": 16155 + }, + { + "epoch": 1.9660480681472468, + "grad_norm": 4.0489068031311035, + "learning_rate": 1.4700477745416275e-08, + "loss": 0.4039, + "step": 16156 + }, + { + "epoch": 1.9661697596592638, + "grad_norm": 1.8888777494430542, + "learning_rate": 1.4594934813421113e-08, + "loss": 0.4238, + "step": 16157 + }, + { + "epoch": 1.9662914511712808, + "grad_norm": 1.5959055423736572, + "learning_rate": 1.4489771846757373e-08, + "loss": 0.352, + "step": 16158 + }, + { + "epoch": 1.9664131426832978, + "grad_norm": 1.896551251411438, + "learning_rate": 1.4384988849426296e-08, + "loss": 0.4144, + "step": 16159 + }, + { + "epoch": 1.9665348341953148, + "grad_norm": 1.9961283206939697, + "learning_rate": 1.4280585825415805e-08, + "loss": 0.3635, + "step": 16160 + }, + { + "epoch": 1.9666565257073318, + "grad_norm": 2.557892322540283, + "learning_rate": 1.4176562778698278e-08, + "loss": 0.3433, + "step": 16161 + }, + { + "epoch": 1.9667782172193489, + "grad_norm": 2.7010657787323, + "learning_rate": 1.407291971323277e-08, + "loss": 0.3392, + "step": 16162 + }, + { + "epoch": 1.9668999087313659, + "grad_norm": 1.998677372932434, + "learning_rate": 1.3969656632961682e-08, + "loss": 0.3896, + "step": 16163 + }, + { + "epoch": 1.967021600243383, + "grad_norm": 1.8208341598510742, + "learning_rate": 1.3866773541815204e-08, + "loss": 0.3376, + "step": 16164 + }, + { + "epoch": 1.9671432917554, + "grad_norm": 3.2257418632507324, + "learning_rate": 1.3764270443709094e-08, + "loss": 0.3229, + "step": 16165 + }, + { + "epoch": 1.967264983267417, + "grad_norm": 1.6839380264282227, + "learning_rate": 1.3662147342541344e-08, + "loss": 0.3679, + "step": 16166 + }, + { + "epoch": 1.9673866747794342, + "grad_norm": 2.1701269149780273, + "learning_rate": 1.3560404242199954e-08, + "loss": 0.3186, + "step": 16167 + }, + { + "epoch": 1.9675083662914512, + "grad_norm": 2.662816286087036, + "learning_rate": 1.3459041146556272e-08, + "loss": 0.3982, + "step": 16168 + }, + { + "epoch": 1.9676300578034682, + "grad_norm": 1.735885500907898, + "learning_rate": 1.3358058059467217e-08, + "loss": 0.373, + "step": 16169 + }, + { + "epoch": 1.9677517493154852, + "grad_norm": 2.0652430057525635, + "learning_rate": 1.3257454984775265e-08, + "loss": 0.3565, + "step": 16170 + }, + { + "epoch": 1.9678734408275023, + "grad_norm": 3.1041510105133057, + "learning_rate": 1.3157231926308467e-08, + "loss": 0.3567, + "step": 16171 + }, + { + "epoch": 1.9679951323395193, + "grad_norm": 2.1822738647460938, + "learning_rate": 1.305738888788044e-08, + "loss": 0.3221, + "step": 16172 + }, + { + "epoch": 1.9681168238515363, + "grad_norm": 3.687045097351074, + "learning_rate": 1.2957925873290367e-08, + "loss": 0.3152, + "step": 16173 + }, + { + "epoch": 1.9682385153635535, + "grad_norm": 1.8525265455245972, + "learning_rate": 1.2858842886324108e-08, + "loss": 0.3924, + "step": 16174 + }, + { + "epoch": 1.9683602068755706, + "grad_norm": 1.6356488466262817, + "learning_rate": 1.276013993075087e-08, + "loss": 0.3358, + "step": 16175 + }, + { + "epoch": 1.9684818983875876, + "grad_norm": 1.6764427423477173, + "learning_rate": 1.2661817010326538e-08, + "loss": 0.3529, + "step": 16176 + }, + { + "epoch": 1.9686035898996046, + "grad_norm": 1.6589916944503784, + "learning_rate": 1.2563874128792564e-08, + "loss": 0.3771, + "step": 16177 + }, + { + "epoch": 1.9687252814116216, + "grad_norm": 2.913634777069092, + "learning_rate": 1.2466311289875965e-08, + "loss": 0.4074, + "step": 16178 + }, + { + "epoch": 1.9688469729236386, + "grad_norm": 1.6322712898254395, + "learning_rate": 1.2369128497289329e-08, + "loss": 0.3433, + "step": 16179 + }, + { + "epoch": 1.9689686644356557, + "grad_norm": 2.0338664054870605, + "learning_rate": 1.2272325754730807e-08, + "loss": 0.3754, + "step": 16180 + }, + { + "epoch": 1.9690903559476727, + "grad_norm": 2.6447393894195557, + "learning_rate": 1.217590306588301e-08, + "loss": 0.3151, + "step": 16181 + }, + { + "epoch": 1.9692120474596897, + "grad_norm": 2.7490193843841553, + "learning_rate": 1.2079860434416335e-08, + "loss": 0.2946, + "step": 16182 + }, + { + "epoch": 1.9693337389717067, + "grad_norm": 1.993751049041748, + "learning_rate": 1.1984197863985637e-08, + "loss": 0.3396, + "step": 16183 + }, + { + "epoch": 1.9694554304837237, + "grad_norm": 2.14383602142334, + "learning_rate": 1.1888915358229114e-08, + "loss": 0.388, + "step": 16184 + }, + { + "epoch": 1.9695771219957408, + "grad_norm": 1.6024258136749268, + "learning_rate": 1.1794012920773867e-08, + "loss": 0.4065, + "step": 16185 + }, + { + "epoch": 1.9696988135077578, + "grad_norm": 1.5283539295196533, + "learning_rate": 1.1699490555231452e-08, + "loss": 0.3292, + "step": 16186 + }, + { + "epoch": 1.9698205050197748, + "grad_norm": 2.4187207221984863, + "learning_rate": 1.160534826519788e-08, + "loss": 0.2955, + "step": 16187 + }, + { + "epoch": 1.9699421965317918, + "grad_norm": 2.8960108757019043, + "learning_rate": 1.1511586054254731e-08, + "loss": 0.399, + "step": 16188 + }, + { + "epoch": 1.9700638880438088, + "grad_norm": 1.844589114189148, + "learning_rate": 1.1418203925972482e-08, + "loss": 0.4326, + "step": 16189 + }, + { + "epoch": 1.9701855795558258, + "grad_norm": 1.4832305908203125, + "learning_rate": 1.1325201883901626e-08, + "loss": 0.3673, + "step": 16190 + }, + { + "epoch": 1.9703072710678429, + "grad_norm": 2.5775561332702637, + "learning_rate": 1.1232579931582666e-08, + "loss": 0.3513, + "step": 16191 + }, + { + "epoch": 1.9704289625798599, + "grad_norm": 1.834261178970337, + "learning_rate": 1.1140338072539448e-08, + "loss": 0.3442, + "step": 16192 + }, + { + "epoch": 1.9705506540918771, + "grad_norm": 2.057279348373413, + "learning_rate": 1.1048476310281386e-08, + "loss": 0.4094, + "step": 16193 + }, + { + "epoch": 1.9706723456038941, + "grad_norm": 1.5632085800170898, + "learning_rate": 1.0956994648305685e-08, + "loss": 0.3743, + "step": 16194 + }, + { + "epoch": 1.9707940371159112, + "grad_norm": 2.784318208694458, + "learning_rate": 1.0865893090092894e-08, + "loss": 0.3268, + "step": 16195 + }, + { + "epoch": 1.9709157286279282, + "grad_norm": 2.4518778324127197, + "learning_rate": 1.0775171639108018e-08, + "loss": 0.4051, + "step": 16196 + }, + { + "epoch": 1.9710374201399452, + "grad_norm": 2.298598527908325, + "learning_rate": 1.068483029880496e-08, + "loss": 0.327, + "step": 16197 + }, + { + "epoch": 1.9711591116519622, + "grad_norm": 1.675595760345459, + "learning_rate": 1.0594869072622083e-08, + "loss": 0.3892, + "step": 16198 + }, + { + "epoch": 1.9712808031639795, + "grad_norm": 2.040555715560913, + "learning_rate": 1.0505287963979981e-08, + "loss": 0.4143, + "step": 16199 + }, + { + "epoch": 1.9714024946759965, + "grad_norm": 2.217957019805908, + "learning_rate": 1.0416086976289263e-08, + "loss": 0.2885, + "step": 16200 + }, + { + "epoch": 1.9715241861880135, + "grad_norm": 1.6215285062789917, + "learning_rate": 1.0327266112943879e-08, + "loss": 0.3627, + "step": 16201 + }, + { + "epoch": 1.9716458777000305, + "grad_norm": 2.09826397895813, + "learning_rate": 1.0238825377322237e-08, + "loss": 0.4525, + "step": 16202 + }, + { + "epoch": 1.9717675692120475, + "grad_norm": 2.2282772064208984, + "learning_rate": 1.0150764772792755e-08, + "loss": 0.3621, + "step": 16203 + }, + { + "epoch": 1.9718892607240646, + "grad_norm": 2.02528715133667, + "learning_rate": 1.0063084302703862e-08, + "loss": 0.4205, + "step": 16204 + }, + { + "epoch": 1.9720109522360816, + "grad_norm": 2.36088490486145, + "learning_rate": 9.975783970391783e-09, + "loss": 0.3504, + "step": 16205 + }, + { + "epoch": 1.9721326437480986, + "grad_norm": 2.020817518234253, + "learning_rate": 9.888863779180524e-09, + "loss": 0.3479, + "step": 16206 + }, + { + "epoch": 1.9722543352601156, + "grad_norm": 2.278749704360962, + "learning_rate": 9.802323732375219e-09, + "loss": 0.3466, + "step": 16207 + }, + { + "epoch": 1.9723760267721326, + "grad_norm": 2.411249876022339, + "learning_rate": 9.71616383327101e-09, + "loss": 0.3689, + "step": 16208 + }, + { + "epoch": 1.9724977182841497, + "grad_norm": 2.818837881088257, + "learning_rate": 9.630384085145273e-09, + "loss": 0.2697, + "step": 16209 + }, + { + "epoch": 1.9726194097961667, + "grad_norm": 1.7738935947418213, + "learning_rate": 9.544984491263177e-09, + "loss": 0.3966, + "step": 16210 + }, + { + "epoch": 1.9727411013081837, + "grad_norm": 1.6400485038757324, + "learning_rate": 9.459965054872122e-09, + "loss": 0.3621, + "step": 16211 + }, + { + "epoch": 1.9728627928202007, + "grad_norm": 1.6486150026321411, + "learning_rate": 9.375325779209521e-09, + "loss": 0.3513, + "step": 16212 + }, + { + "epoch": 1.9729844843322177, + "grad_norm": 1.7734371423721313, + "learning_rate": 9.29106666749502e-09, + "loss": 0.3557, + "step": 16213 + }, + { + "epoch": 1.9731061758442348, + "grad_norm": 1.3012028932571411, + "learning_rate": 9.20718772293494e-09, + "loss": 0.3305, + "step": 16214 + }, + { + "epoch": 1.9732278673562518, + "grad_norm": 1.6839501857757568, + "learning_rate": 9.123688948721176e-09, + "loss": 0.3148, + "step": 16215 + }, + { + "epoch": 1.9733495588682688, + "grad_norm": 3.3136820793151855, + "learning_rate": 9.040570348031185e-09, + "loss": 0.4215, + "step": 16216 + }, + { + "epoch": 1.9734712503802858, + "grad_norm": 1.7796372175216675, + "learning_rate": 8.957831924027993e-09, + "loss": 0.3669, + "step": 16217 + }, + { + "epoch": 1.973592941892303, + "grad_norm": 1.924067497253418, + "learning_rate": 8.87547367985908e-09, + "loss": 0.4166, + "step": 16218 + }, + { + "epoch": 1.97371463340432, + "grad_norm": 3.0644567012786865, + "learning_rate": 8.79349561865861e-09, + "loss": 0.3381, + "step": 16219 + }, + { + "epoch": 1.973836324916337, + "grad_norm": 1.8464536666870117, + "learning_rate": 8.711897743547416e-09, + "loss": 0.3084, + "step": 16220 + }, + { + "epoch": 1.9739580164283541, + "grad_norm": 1.678406000137329, + "learning_rate": 8.630680057628572e-09, + "loss": 0.382, + "step": 16221 + }, + { + "epoch": 1.9740797079403711, + "grad_norm": 2.144219160079956, + "learning_rate": 8.549842563992939e-09, + "loss": 0.4112, + "step": 16222 + }, + { + "epoch": 1.9742013994523882, + "grad_norm": 2.707542896270752, + "learning_rate": 8.469385265718055e-09, + "loss": 0.4175, + "step": 16223 + }, + { + "epoch": 1.9743230909644054, + "grad_norm": 2.1397154331207275, + "learning_rate": 8.389308165863696e-09, + "loss": 0.3802, + "step": 16224 + }, + { + "epoch": 1.9744447824764224, + "grad_norm": 2.093106269836426, + "learning_rate": 8.309611267477424e-09, + "loss": 0.365, + "step": 16225 + }, + { + "epoch": 1.9745664739884394, + "grad_norm": 1.6024620532989502, + "learning_rate": 8.230294573592368e-09, + "loss": 0.3809, + "step": 16226 + }, + { + "epoch": 1.9746881655004564, + "grad_norm": 1.8772417306900024, + "learning_rate": 8.151358087226113e-09, + "loss": 0.3627, + "step": 16227 + }, + { + "epoch": 1.9748098570124735, + "grad_norm": 2.2356367111206055, + "learning_rate": 8.072801811384034e-09, + "loss": 0.3675, + "step": 16228 + }, + { + "epoch": 1.9749315485244905, + "grad_norm": 1.6852242946624756, + "learning_rate": 7.994625749052632e-09, + "loss": 0.3483, + "step": 16229 + }, + { + "epoch": 1.9750532400365075, + "grad_norm": 2.2831907272338867, + "learning_rate": 7.916829903208411e-09, + "loss": 0.2979, + "step": 16230 + }, + { + "epoch": 1.9751749315485245, + "grad_norm": 1.5482177734375, + "learning_rate": 7.83941427681123e-09, + "loss": 0.3406, + "step": 16231 + }, + { + "epoch": 1.9752966230605415, + "grad_norm": 3.005664825439453, + "learning_rate": 7.762378872806509e-09, + "loss": 0.3234, + "step": 16232 + }, + { + "epoch": 1.9754183145725586, + "grad_norm": 2.0898256301879883, + "learning_rate": 7.685723694127456e-09, + "loss": 0.3982, + "step": 16233 + }, + { + "epoch": 1.9755400060845756, + "grad_norm": 2.306986093521118, + "learning_rate": 7.609448743688407e-09, + "loss": 0.3629, + "step": 16234 + }, + { + "epoch": 1.9756616975965926, + "grad_norm": 1.5853625535964966, + "learning_rate": 7.533554024392598e-09, + "loss": 0.373, + "step": 16235 + }, + { + "epoch": 1.9757833891086096, + "grad_norm": 1.8018343448638916, + "learning_rate": 7.458039539129935e-09, + "loss": 0.403, + "step": 16236 + }, + { + "epoch": 1.9759050806206266, + "grad_norm": 3.2835536003112793, + "learning_rate": 7.382905290770348e-09, + "loss": 0.4353, + "step": 16237 + }, + { + "epoch": 1.9760267721326437, + "grad_norm": 1.5987604856491089, + "learning_rate": 7.30815128217599e-09, + "loss": 0.3654, + "step": 16238 + }, + { + "epoch": 1.9761484636446607, + "grad_norm": 1.9460461139678955, + "learning_rate": 7.233777516190144e-09, + "loss": 0.3835, + "step": 16239 + }, + { + "epoch": 1.9762701551566777, + "grad_norm": 1.9640945196151733, + "learning_rate": 7.1597839956427665e-09, + "loss": 0.3879, + "step": 16240 + }, + { + "epoch": 1.9763918466686947, + "grad_norm": 3.649320363998413, + "learning_rate": 7.086170723350494e-09, + "loss": 0.2882, + "step": 16241 + }, + { + "epoch": 1.9765135381807117, + "grad_norm": 2.2662365436553955, + "learning_rate": 7.0129377021121996e-09, + "loss": 0.409, + "step": 16242 + }, + { + "epoch": 1.976635229692729, + "grad_norm": 1.8585344552993774, + "learning_rate": 6.940084934716762e-09, + "loss": 0.4072, + "step": 16243 + }, + { + "epoch": 1.976756921204746, + "grad_norm": 2.306539297103882, + "learning_rate": 6.867612423935299e-09, + "loss": 0.3298, + "step": 16244 + }, + { + "epoch": 1.976878612716763, + "grad_norm": 2.2497708797454834, + "learning_rate": 6.795520172525605e-09, + "loss": 0.4123, + "step": 16245 + }, + { + "epoch": 1.97700030422878, + "grad_norm": 1.7580674886703491, + "learning_rate": 6.72380818323215e-09, + "loss": 0.3592, + "step": 16246 + }, + { + "epoch": 1.977121995740797, + "grad_norm": 2.7794220447540283, + "learning_rate": 6.6524764587816425e-09, + "loss": 0.4345, + "step": 16247 + }, + { + "epoch": 1.977243687252814, + "grad_norm": 1.8133848905563354, + "learning_rate": 6.581525001890798e-09, + "loss": 0.3562, + "step": 16248 + }, + { + "epoch": 1.9773653787648313, + "grad_norm": 2.026087760925293, + "learning_rate": 6.510953815256349e-09, + "loss": 0.3765, + "step": 16249 + }, + { + "epoch": 1.9774870702768483, + "grad_norm": 2.466622829437256, + "learning_rate": 6.440762901567254e-09, + "loss": 0.3347, + "step": 16250 + }, + { + "epoch": 1.9776087617888654, + "grad_norm": 3.245797872543335, + "learning_rate": 6.370952263491381e-09, + "loss": 0.421, + "step": 16251 + }, + { + "epoch": 1.9777304533008824, + "grad_norm": 1.4088108539581299, + "learning_rate": 6.301521903687713e-09, + "loss": 0.3869, + "step": 16252 + }, + { + "epoch": 1.9778521448128994, + "grad_norm": 2.29914927482605, + "learning_rate": 6.23247182479525e-09, + "loss": 0.3552, + "step": 16253 + }, + { + "epoch": 1.9779738363249164, + "grad_norm": 1.9885870218276978, + "learning_rate": 6.1638020294441105e-09, + "loss": 0.4096, + "step": 16254 + }, + { + "epoch": 1.9780955278369334, + "grad_norm": 2.1441006660461426, + "learning_rate": 6.095512520246649e-09, + "loss": 0.3472, + "step": 16255 + }, + { + "epoch": 1.9782172193489505, + "grad_norm": 1.488221526145935, + "learning_rate": 6.027603299800788e-09, + "loss": 0.3222, + "step": 16256 + }, + { + "epoch": 1.9783389108609675, + "grad_norm": 1.8380842208862305, + "learning_rate": 5.960074370692237e-09, + "loss": 0.3661, + "step": 16257 + }, + { + "epoch": 1.9784606023729845, + "grad_norm": 2.064291000366211, + "learning_rate": 5.892925735487831e-09, + "loss": 0.3727, + "step": 16258 + }, + { + "epoch": 1.9785822938850015, + "grad_norm": 2.0364344120025635, + "learning_rate": 5.826157396744414e-09, + "loss": 0.3331, + "step": 16259 + }, + { + "epoch": 1.9787039853970185, + "grad_norm": 1.9616774320602417, + "learning_rate": 5.759769357003287e-09, + "loss": 0.3762, + "step": 16260 + }, + { + "epoch": 1.9788256769090355, + "grad_norm": 1.8364132642745972, + "learning_rate": 5.6937616187890954e-09, + "loss": 0.3147, + "step": 16261 + }, + { + "epoch": 1.9789473684210526, + "grad_norm": 1.9974987506866455, + "learning_rate": 5.628134184615386e-09, + "loss": 0.357, + "step": 16262 + }, + { + "epoch": 1.9790690599330696, + "grad_norm": 1.3815975189208984, + "learning_rate": 5.562887056977939e-09, + "loss": 0.3701, + "step": 16263 + }, + { + "epoch": 1.9791907514450866, + "grad_norm": 2.103435754776001, + "learning_rate": 5.498020238359214e-09, + "loss": 0.3786, + "step": 16264 + }, + { + "epoch": 1.9793124429571036, + "grad_norm": 1.6844791173934937, + "learning_rate": 5.4335337312283465e-09, + "loss": 0.3856, + "step": 16265 + }, + { + "epoch": 1.9794341344691206, + "grad_norm": 1.456583857536316, + "learning_rate": 5.369427538040039e-09, + "loss": 0.3776, + "step": 16266 + }, + { + "epoch": 1.9795558259811377, + "grad_norm": 2.071108341217041, + "learning_rate": 5.305701661232343e-09, + "loss": 0.3246, + "step": 16267 + }, + { + "epoch": 1.979677517493155, + "grad_norm": 1.4558171033859253, + "learning_rate": 5.242356103231094e-09, + "loss": 0.3382, + "step": 16268 + }, + { + "epoch": 1.979799209005172, + "grad_norm": 1.5601192712783813, + "learning_rate": 5.179390866445477e-09, + "loss": 0.3436, + "step": 16269 + }, + { + "epoch": 1.979920900517189, + "grad_norm": 1.646645426750183, + "learning_rate": 5.116805953272464e-09, + "loss": 0.4212, + "step": 16270 + }, + { + "epoch": 1.980042592029206, + "grad_norm": 2.3790793418884277, + "learning_rate": 5.054601366093481e-09, + "loss": 0.3075, + "step": 16271 + }, + { + "epoch": 1.980164283541223, + "grad_norm": 1.6709163188934326, + "learning_rate": 4.9927771072744154e-09, + "loss": 0.3358, + "step": 16272 + }, + { + "epoch": 1.98028597505324, + "grad_norm": 1.6739990711212158, + "learning_rate": 4.93133317917005e-09, + "loss": 0.3622, + "step": 16273 + }, + { + "epoch": 1.980407666565257, + "grad_norm": 1.8143036365509033, + "learning_rate": 4.870269584116294e-09, + "loss": 0.3606, + "step": 16274 + }, + { + "epoch": 1.9805293580772743, + "grad_norm": 2.032560348510742, + "learning_rate": 4.809586324437954e-09, + "loss": 0.3398, + "step": 16275 + }, + { + "epoch": 1.9806510495892913, + "grad_norm": 1.6154775619506836, + "learning_rate": 4.749283402443183e-09, + "loss": 0.3544, + "step": 16276 + }, + { + "epoch": 1.9807727411013083, + "grad_norm": 1.6917407512664795, + "learning_rate": 4.689360820427924e-09, + "loss": 0.3929, + "step": 16277 + }, + { + "epoch": 1.9808944326133253, + "grad_norm": 1.994578242301941, + "learning_rate": 4.629818580670353e-09, + "loss": 0.3394, + "step": 16278 + }, + { + "epoch": 1.9810161241253423, + "grad_norm": 1.7406855821609497, + "learning_rate": 4.570656685438657e-09, + "loss": 0.3199, + "step": 16279 + }, + { + "epoch": 1.9811378156373594, + "grad_norm": 1.9072710275650024, + "learning_rate": 4.511875136983257e-09, + "loss": 0.3633, + "step": 16280 + }, + { + "epoch": 1.9812595071493764, + "grad_norm": 1.6412389278411865, + "learning_rate": 4.453473937539033e-09, + "loss": 0.3626, + "step": 16281 + }, + { + "epoch": 1.9813811986613934, + "grad_norm": 1.6243566274642944, + "learning_rate": 4.39545308933087e-09, + "loss": 0.324, + "step": 16282 + }, + { + "epoch": 1.9815028901734104, + "grad_norm": 1.5476268529891968, + "learning_rate": 4.3378125945647835e-09, + "loss": 0.3918, + "step": 16283 + }, + { + "epoch": 1.9816245816854274, + "grad_norm": 1.8445093631744385, + "learning_rate": 4.280552455435683e-09, + "loss": 0.2988, + "step": 16284 + }, + { + "epoch": 1.9817462731974445, + "grad_norm": 2.0630104541778564, + "learning_rate": 4.223672674120716e-09, + "loss": 0.346, + "step": 16285 + }, + { + "epoch": 1.9818679647094615, + "grad_norm": 1.8288735151290894, + "learning_rate": 4.167173252785928e-09, + "loss": 0.3808, + "step": 16286 + }, + { + "epoch": 1.9819896562214785, + "grad_norm": 2.116935968399048, + "learning_rate": 4.1110541935796e-09, + "loss": 0.4183, + "step": 16287 + }, + { + "epoch": 1.9821113477334955, + "grad_norm": 2.6456518173217773, + "learning_rate": 4.055315498638912e-09, + "loss": 0.3277, + "step": 16288 + }, + { + "epoch": 1.9822330392455125, + "grad_norm": 1.4434562921524048, + "learning_rate": 3.999957170083279e-09, + "loss": 0.3202, + "step": 16289 + }, + { + "epoch": 1.9823547307575295, + "grad_norm": 1.547217845916748, + "learning_rate": 3.944979210019906e-09, + "loss": 0.3672, + "step": 16290 + }, + { + "epoch": 1.9824764222695466, + "grad_norm": 1.7337281703948975, + "learning_rate": 3.890381620541561e-09, + "loss": 0.3867, + "step": 16291 + }, + { + "epoch": 1.9825981137815636, + "grad_norm": 2.7446372509002686, + "learning_rate": 3.836164403724363e-09, + "loss": 0.4059, + "step": 16292 + }, + { + "epoch": 1.9827198052935808, + "grad_norm": 2.2903504371643066, + "learning_rate": 3.7823275616322155e-09, + "loss": 0.4208, + "step": 16293 + }, + { + "epoch": 1.9828414968055978, + "grad_norm": 2.8361003398895264, + "learning_rate": 3.728871096312369e-09, + "loss": 0.4194, + "step": 16294 + }, + { + "epoch": 1.9829631883176149, + "grad_norm": 1.6007697582244873, + "learning_rate": 3.6757950098020854e-09, + "loss": 0.338, + "step": 16295 + }, + { + "epoch": 1.9830848798296319, + "grad_norm": 2.583404064178467, + "learning_rate": 3.623099304117528e-09, + "loss": 0.4514, + "step": 16296 + }, + { + "epoch": 1.983206571341649, + "grad_norm": 1.7916226387023926, + "learning_rate": 3.5707839812659796e-09, + "loss": 0.3888, + "step": 16297 + }, + { + "epoch": 1.983328262853666, + "grad_norm": 2.3017799854278564, + "learning_rate": 3.5188490432380704e-09, + "loss": 0.3621, + "step": 16298 + }, + { + "epoch": 1.983449954365683, + "grad_norm": 1.813517451286316, + "learning_rate": 3.467294492008888e-09, + "loss": 0.3652, + "step": 16299 + }, + { + "epoch": 1.9835716458777002, + "grad_norm": 1.5048482418060303, + "learning_rate": 3.4161203295401955e-09, + "loss": 0.3617, + "step": 16300 + }, + { + "epoch": 1.9836933373897172, + "grad_norm": 1.6744508743286133, + "learning_rate": 3.3653265577815454e-09, + "loss": 0.356, + "step": 16301 + }, + { + "epoch": 1.9838150289017342, + "grad_norm": 1.656672716140747, + "learning_rate": 3.3149131786625046e-09, + "loss": 0.3753, + "step": 16302 + }, + { + "epoch": 1.9839367204137512, + "grad_norm": 1.8303706645965576, + "learning_rate": 3.264880194103759e-09, + "loss": 0.3477, + "step": 16303 + }, + { + "epoch": 1.9840584119257683, + "grad_norm": 2.753491163253784, + "learning_rate": 3.2152276060093413e-09, + "loss": 0.4346, + "step": 16304 + }, + { + "epoch": 1.9841801034377853, + "grad_norm": 2.1673684120178223, + "learning_rate": 3.1659554162666307e-09, + "loss": 0.3974, + "step": 16305 + }, + { + "epoch": 1.9843017949498023, + "grad_norm": 2.2132246494293213, + "learning_rate": 3.117063626751904e-09, + "loss": 0.3672, + "step": 16306 + }, + { + "epoch": 1.9844234864618193, + "grad_norm": 1.270691990852356, + "learning_rate": 3.0685522393247846e-09, + "loss": 0.3121, + "step": 16307 + }, + { + "epoch": 1.9845451779738363, + "grad_norm": 3.200900077819824, + "learning_rate": 3.020421255832684e-09, + "loss": 0.4375, + "step": 16308 + }, + { + "epoch": 1.9846668694858534, + "grad_norm": 1.888856053352356, + "learning_rate": 2.9726706781052493e-09, + "loss": 0.3802, + "step": 16309 + }, + { + "epoch": 1.9847885609978704, + "grad_norm": 2.113400459289551, + "learning_rate": 2.9253005079610265e-09, + "loss": 0.341, + "step": 16310 + }, + { + "epoch": 1.9849102525098874, + "grad_norm": 1.684683084487915, + "learning_rate": 2.878310747201907e-09, + "loss": 0.2993, + "step": 16311 + }, + { + "epoch": 1.9850319440219044, + "grad_norm": 1.3620672225952148, + "learning_rate": 2.8317013976153496e-09, + "loss": 0.2926, + "step": 16312 + }, + { + "epoch": 1.9851536355339214, + "grad_norm": 2.3848178386688232, + "learning_rate": 2.7854724609766014e-09, + "loss": 0.3355, + "step": 16313 + }, + { + "epoch": 1.9852753270459385, + "grad_norm": 2.0870959758758545, + "learning_rate": 2.7396239390431455e-09, + "loss": 0.3394, + "step": 16314 + }, + { + "epoch": 1.9853970185579555, + "grad_norm": 1.4668195247650146, + "learning_rate": 2.6941558335602523e-09, + "loss": 0.3769, + "step": 16315 + }, + { + "epoch": 1.9855187100699725, + "grad_norm": 1.9599826335906982, + "learning_rate": 2.6490681462576497e-09, + "loss": 0.337, + "step": 16316 + }, + { + "epoch": 1.9856404015819895, + "grad_norm": 1.9957466125488281, + "learning_rate": 2.6043608788517415e-09, + "loss": 0.4528, + "step": 16317 + }, + { + "epoch": 1.9857620930940065, + "grad_norm": 1.765285849571228, + "learning_rate": 2.5600340330433905e-09, + "loss": 0.3745, + "step": 16318 + }, + { + "epoch": 1.9858837846060238, + "grad_norm": 2.070235252380371, + "learning_rate": 2.5160876105190245e-09, + "loss": 0.3708, + "step": 16319 + }, + { + "epoch": 1.9860054761180408, + "grad_norm": 1.7341270446777344, + "learning_rate": 2.4725216129517506e-09, + "loss": 0.3626, + "step": 16320 + }, + { + "epoch": 1.9861271676300578, + "grad_norm": 1.7934695482254028, + "learning_rate": 2.4293360419980204e-09, + "loss": 0.3657, + "step": 16321 + }, + { + "epoch": 1.9862488591420748, + "grad_norm": 2.795962333679199, + "learning_rate": 2.3865308993020754e-09, + "loss": 0.3037, + "step": 16322 + }, + { + "epoch": 1.9863705506540918, + "grad_norm": 2.0234384536743164, + "learning_rate": 2.3441061864926118e-09, + "loss": 0.4152, + "step": 16323 + }, + { + "epoch": 1.9864922421661089, + "grad_norm": 2.041461229324341, + "learning_rate": 2.3020619051850046e-09, + "loss": 0.3394, + "step": 16324 + }, + { + "epoch": 1.986613933678126, + "grad_norm": 1.6153767108917236, + "learning_rate": 2.2603980569768648e-09, + "loss": 0.358, + "step": 16325 + }, + { + "epoch": 1.9867356251901431, + "grad_norm": 1.8474339246749878, + "learning_rate": 2.2191146434547006e-09, + "loss": 0.3957, + "step": 16326 + }, + { + "epoch": 1.9868573167021601, + "grad_norm": 2.107572078704834, + "learning_rate": 2.1782116661894782e-09, + "loss": 0.3802, + "step": 16327 + }, + { + "epoch": 1.9869790082141772, + "grad_norm": 1.6258896589279175, + "learning_rate": 2.13768912673884e-09, + "loss": 0.3612, + "step": 16328 + }, + { + "epoch": 1.9871006997261942, + "grad_norm": 3.6315109729766846, + "learning_rate": 2.0975470266426657e-09, + "loss": 0.4427, + "step": 16329 + }, + { + "epoch": 1.9872223912382112, + "grad_norm": 2.224144220352173, + "learning_rate": 2.057785367428622e-09, + "loss": 0.4167, + "step": 16330 + }, + { + "epoch": 1.9873440827502282, + "grad_norm": 1.6486867666244507, + "learning_rate": 2.0184041506121633e-09, + "loss": 0.3673, + "step": 16331 + }, + { + "epoch": 1.9874657742622452, + "grad_norm": 1.7491037845611572, + "learning_rate": 1.97940337768876e-09, + "loss": 0.4017, + "step": 16332 + }, + { + "epoch": 1.9875874657742623, + "grad_norm": 2.3490676879882812, + "learning_rate": 1.9407830501438907e-09, + "loss": 0.3274, + "step": 16333 + }, + { + "epoch": 1.9877091572862793, + "grad_norm": 1.7487815618515015, + "learning_rate": 1.9025431694474907e-09, + "loss": 0.3174, + "step": 16334 + }, + { + "epoch": 1.9878308487982963, + "grad_norm": 2.012241840362549, + "learning_rate": 1.8646837370539515e-09, + "loss": 0.3311, + "step": 16335 + }, + { + "epoch": 1.9879525403103133, + "grad_norm": 1.857872724533081, + "learning_rate": 1.8272047544043436e-09, + "loss": 0.3877, + "step": 16336 + }, + { + "epoch": 1.9880742318223303, + "grad_norm": 1.7917633056640625, + "learning_rate": 1.790106222925303e-09, + "loss": 0.3811, + "step": 16337 + }, + { + "epoch": 1.9881959233343474, + "grad_norm": 2.7612621784210205, + "learning_rate": 1.7533881440268131e-09, + "loss": 0.4067, + "step": 16338 + }, + { + "epoch": 1.9883176148463644, + "grad_norm": 1.765322208404541, + "learning_rate": 1.7170505191077546e-09, + "loss": 0.3418, + "step": 16339 + }, + { + "epoch": 1.9884393063583814, + "grad_norm": 2.567042350769043, + "learning_rate": 1.6810933495503556e-09, + "loss": 0.3368, + "step": 16340 + }, + { + "epoch": 1.9885609978703984, + "grad_norm": 1.4109476804733276, + "learning_rate": 1.6455166367224106e-09, + "loss": 0.3518, + "step": 16341 + }, + { + "epoch": 1.9886826893824154, + "grad_norm": 1.6678557395935059, + "learning_rate": 1.610320381978392e-09, + "loss": 0.399, + "step": 16342 + }, + { + "epoch": 1.9888043808944325, + "grad_norm": 1.952375054359436, + "learning_rate": 1.5755045866572283e-09, + "loss": 0.3809, + "step": 16343 + }, + { + "epoch": 1.9889260724064497, + "grad_norm": 2.4348909854888916, + "learning_rate": 1.5410692520834158e-09, + "loss": 0.359, + "step": 16344 + }, + { + "epoch": 1.9890477639184667, + "grad_norm": 1.7893550395965576, + "learning_rate": 1.5070143795670177e-09, + "loss": 0.3626, + "step": 16345 + }, + { + "epoch": 1.9891694554304837, + "grad_norm": 1.501473307609558, + "learning_rate": 1.4733399704058848e-09, + "loss": 0.3764, + "step": 16346 + }, + { + "epoch": 1.9892911469425008, + "grad_norm": 1.5196964740753174, + "learning_rate": 1.440046025878994e-09, + "loss": 0.3354, + "step": 16347 + }, + { + "epoch": 1.9894128384545178, + "grad_norm": 1.4251240491867065, + "learning_rate": 1.4071325472542197e-09, + "loss": 0.3727, + "step": 16348 + }, + { + "epoch": 1.9895345299665348, + "grad_norm": 1.7477946281433105, + "learning_rate": 1.3745995357838937e-09, + "loss": 0.2962, + "step": 16349 + }, + { + "epoch": 1.989656221478552, + "grad_norm": 2.126129150390625, + "learning_rate": 1.342446992707025e-09, + "loss": 0.3324, + "step": 16350 + }, + { + "epoch": 1.989777912990569, + "grad_norm": 1.8252509832382202, + "learning_rate": 1.3106749192448587e-09, + "loss": 0.381, + "step": 16351 + }, + { + "epoch": 1.989899604502586, + "grad_norm": 2.0943539142608643, + "learning_rate": 1.2792833166086483e-09, + "loss": 0.3907, + "step": 16352 + }, + { + "epoch": 1.990021296014603, + "grad_norm": 2.352362632751465, + "learning_rate": 1.2482721859907731e-09, + "loss": 0.4269, + "step": 16353 + }, + { + "epoch": 1.99014298752662, + "grad_norm": 1.5144728422164917, + "learning_rate": 1.2176415285736209e-09, + "loss": 0.3615, + "step": 16354 + }, + { + "epoch": 1.9902646790386371, + "grad_norm": 2.0436851978302, + "learning_rate": 1.1873913455207053e-09, + "loss": 0.3611, + "step": 16355 + }, + { + "epoch": 1.9903863705506541, + "grad_norm": 1.8111693859100342, + "learning_rate": 1.1575216379844378e-09, + "loss": 0.3684, + "step": 16356 + }, + { + "epoch": 1.9905080620626712, + "grad_norm": 1.826930046081543, + "learning_rate": 1.1280324071016868e-09, + "loss": 0.3116, + "step": 16357 + }, + { + "epoch": 1.9906297535746882, + "grad_norm": 2.86380672454834, + "learning_rate": 1.0989236539926673e-09, + "loss": 0.3789, + "step": 16358 + }, + { + "epoch": 1.9907514450867052, + "grad_norm": 3.0312790870666504, + "learning_rate": 1.070195379766492e-09, + "loss": 0.4541, + "step": 16359 + }, + { + "epoch": 1.9908731365987222, + "grad_norm": 2.439685583114624, + "learning_rate": 1.0418475855156207e-09, + "loss": 0.3987, + "step": 16360 + }, + { + "epoch": 1.9909948281107392, + "grad_norm": 2.344956159591675, + "learning_rate": 1.0138802723203001e-09, + "loss": 0.2862, + "step": 16361 + }, + { + "epoch": 1.9911165196227563, + "grad_norm": 2.3051962852478027, + "learning_rate": 9.862934412430135e-10, + "loss": 0.4021, + "step": 16362 + }, + { + "epoch": 1.9912382111347733, + "grad_norm": 2.1198222637176514, + "learning_rate": 9.590870933340324e-10, + "loss": 0.3418, + "step": 16363 + }, + { + "epoch": 1.9913599026467903, + "grad_norm": 1.7135363817214966, + "learning_rate": 9.322612296280842e-10, + "loss": 0.4111, + "step": 16364 + }, + { + "epoch": 1.9914815941588073, + "grad_norm": 2.4856932163238525, + "learning_rate": 9.058158511476845e-10, + "loss": 0.4134, + "step": 16365 + }, + { + "epoch": 1.9916032856708243, + "grad_norm": 1.8987823724746704, + "learning_rate": 8.797509588964748e-10, + "loss": 0.4072, + "step": 16366 + }, + { + "epoch": 1.9917249771828414, + "grad_norm": 1.743479609489441, + "learning_rate": 8.54066553869215e-10, + "loss": 0.3501, + "step": 16367 + }, + { + "epoch": 1.9918466686948584, + "grad_norm": 1.894911527633667, + "learning_rate": 8.287626370406809e-10, + "loss": 0.3745, + "step": 16368 + }, + { + "epoch": 1.9919683602068756, + "grad_norm": 2.3067073822021484, + "learning_rate": 8.038392093756564e-10, + "loss": 0.3584, + "step": 16369 + }, + { + "epoch": 1.9920900517188926, + "grad_norm": 1.5864689350128174, + "learning_rate": 7.792962718222719e-10, + "loss": 0.3831, + "step": 16370 + }, + { + "epoch": 1.9922117432309097, + "grad_norm": 2.287158250808716, + "learning_rate": 7.551338253131147e-10, + "loss": 0.4143, + "step": 16371 + }, + { + "epoch": 1.9923334347429267, + "grad_norm": 2.90242862701416, + "learning_rate": 7.313518707685596e-10, + "loss": 0.4469, + "step": 16372 + }, + { + "epoch": 1.9924551262549437, + "grad_norm": 1.7477468252182007, + "learning_rate": 7.079504090934386e-10, + "loss": 0.3422, + "step": 16373 + }, + { + "epoch": 1.9925768177669607, + "grad_norm": 1.8733853101730347, + "learning_rate": 6.849294411781504e-10, + "loss": 0.3696, + "step": 16374 + }, + { + "epoch": 1.9926985092789777, + "grad_norm": 3.6517434120178223, + "learning_rate": 6.62288967898661e-10, + "loss": 0.397, + "step": 16375 + }, + { + "epoch": 1.992820200790995, + "grad_norm": 1.9230009317398071, + "learning_rate": 6.400289901165036e-10, + "loss": 0.3587, + "step": 16376 + }, + { + "epoch": 1.992941892303012, + "grad_norm": 2.0829169750213623, + "learning_rate": 6.181495086787781e-10, + "loss": 0.3699, + "step": 16377 + }, + { + "epoch": 1.993063583815029, + "grad_norm": 1.6313719749450684, + "learning_rate": 5.966505244181519e-10, + "loss": 0.362, + "step": 16378 + }, + { + "epoch": 1.993185275327046, + "grad_norm": 1.5598270893096924, + "learning_rate": 5.755320381528595e-10, + "loss": 0.3335, + "step": 16379 + }, + { + "epoch": 1.993306966839063, + "grad_norm": 2.51648211479187, + "learning_rate": 5.54794050685592e-10, + "loss": 0.4075, + "step": 16380 + }, + { + "epoch": 1.99342865835108, + "grad_norm": 2.1215717792510986, + "learning_rate": 5.344365628068282e-10, + "loss": 0.3458, + "step": 16381 + }, + { + "epoch": 1.993550349863097, + "grad_norm": 2.1789610385894775, + "learning_rate": 5.144595752903936e-10, + "loss": 0.3712, + "step": 16382 + }, + { + "epoch": 1.9936720413751141, + "grad_norm": 1.5859107971191406, + "learning_rate": 4.948630888956807e-10, + "loss": 0.3295, + "step": 16383 + }, + { + "epoch": 1.9937937328871311, + "grad_norm": 1.3667292594909668, + "learning_rate": 4.756471043698696e-10, + "loss": 0.339, + "step": 16384 + }, + { + "epoch": 1.9939154243991482, + "grad_norm": 1.7453447580337524, + "learning_rate": 4.568116224445973e-10, + "loss": 0.3853, + "step": 16385 + }, + { + "epoch": 1.9940371159111652, + "grad_norm": 2.116302728652954, + "learning_rate": 4.3835664383373724e-10, + "loss": 0.3441, + "step": 16386 + }, + { + "epoch": 1.9941588074231822, + "grad_norm": 1.5490386486053467, + "learning_rate": 4.202821692422809e-10, + "loss": 0.3576, + "step": 16387 + }, + { + "epoch": 1.9942804989351992, + "grad_norm": 1.4974098205566406, + "learning_rate": 4.0258819935745654e-10, + "loss": 0.3656, + "step": 16388 + }, + { + "epoch": 1.9944021904472162, + "grad_norm": 1.8241342306137085, + "learning_rate": 3.852747348520591e-10, + "loss": 0.398, + "step": 16389 + }, + { + "epoch": 1.9945238819592332, + "grad_norm": 1.5747286081314087, + "learning_rate": 3.68341776384451e-10, + "loss": 0.3647, + "step": 16390 + }, + { + "epoch": 1.9946455734712503, + "grad_norm": 2.0876684188842773, + "learning_rate": 3.5178932459967175e-10, + "loss": 0.339, + "step": 16391 + }, + { + "epoch": 1.9947672649832673, + "grad_norm": 1.552120566368103, + "learning_rate": 3.35617380128328e-10, + "loss": 0.3347, + "step": 16392 + }, + { + "epoch": 1.9948889564952843, + "grad_norm": 2.3483457565307617, + "learning_rate": 3.198259435832629e-10, + "loss": 0.4246, + "step": 16393 + }, + { + "epoch": 1.9950106480073015, + "grad_norm": 1.9285962581634521, + "learning_rate": 3.044150155684378e-10, + "loss": 0.3341, + "step": 16394 + }, + { + "epoch": 1.9951323395193186, + "grad_norm": 1.5609934329986572, + "learning_rate": 2.8938459666894016e-10, + "loss": 0.3261, + "step": 16395 + }, + { + "epoch": 1.9952540310313356, + "grad_norm": 1.5706086158752441, + "learning_rate": 2.747346874554246e-10, + "loss": 0.368, + "step": 16396 + }, + { + "epoch": 1.9953757225433526, + "grad_norm": 1.583938717842102, + "learning_rate": 2.604652884874437e-10, + "loss": 0.3634, + "step": 16397 + }, + { + "epoch": 1.9954974140553696, + "grad_norm": 1.7183619737625122, + "learning_rate": 2.4657640030678607e-10, + "loss": 0.34, + "step": 16398 + }, + { + "epoch": 1.9956191055673866, + "grad_norm": 4.436192989349365, + "learning_rate": 2.3306802344191804e-10, + "loss": 0.4817, + "step": 16399 + }, + { + "epoch": 1.9957407970794037, + "grad_norm": 1.836579442024231, + "learning_rate": 2.19940158407983e-10, + "loss": 0.3699, + "step": 16400 + }, + { + "epoch": 1.995862488591421, + "grad_norm": 1.6603862047195435, + "learning_rate": 2.0719280570347112e-10, + "loss": 0.3768, + "step": 16401 + }, + { + "epoch": 1.995984180103438, + "grad_norm": 3.0537452697753906, + "learning_rate": 1.9482596581354984e-10, + "loss": 0.3005, + "step": 16402 + }, + { + "epoch": 1.996105871615455, + "grad_norm": 1.9862174987792969, + "learning_rate": 1.8283963920895375e-10, + "loss": 0.3786, + "step": 16403 + }, + { + "epoch": 1.996227563127472, + "grad_norm": 1.6122006177902222, + "learning_rate": 1.712338263459845e-10, + "loss": 0.3504, + "step": 16404 + }, + { + "epoch": 1.996349254639489, + "grad_norm": 2.2194674015045166, + "learning_rate": 1.6000852766540064e-10, + "loss": 0.3099, + "step": 16405 + }, + { + "epoch": 1.996470946151506, + "grad_norm": 1.7488921880722046, + "learning_rate": 1.4916374359574825e-10, + "loss": 0.3493, + "step": 16406 + }, + { + "epoch": 1.996592637663523, + "grad_norm": 1.438949704170227, + "learning_rate": 1.3869947454892007e-10, + "loss": 0.339, + "step": 16407 + }, + { + "epoch": 1.99671432917554, + "grad_norm": 2.3451156616210938, + "learning_rate": 1.2861572092237595e-10, + "loss": 0.4242, + "step": 16408 + }, + { + "epoch": 1.996836020687557, + "grad_norm": 1.373085618019104, + "learning_rate": 1.1891248310136328e-10, + "loss": 0.3164, + "step": 16409 + }, + { + "epoch": 1.996957712199574, + "grad_norm": 1.339638352394104, + "learning_rate": 1.0958976145447609e-10, + "loss": 0.3573, + "step": 16410 + }, + { + "epoch": 1.997079403711591, + "grad_norm": 2.091158866882324, + "learning_rate": 1.0064755633587553e-10, + "loss": 0.3794, + "step": 16411 + }, + { + "epoch": 1.9972010952236081, + "grad_norm": 2.4368090629577637, + "learning_rate": 9.208586808640008e-11, + "loss": 0.2876, + "step": 16412 + }, + { + "epoch": 1.9973227867356251, + "grad_norm": 1.7995102405548096, + "learning_rate": 8.390469703134507e-11, + "loss": 0.393, + "step": 16413 + }, + { + "epoch": 1.9974444782476422, + "grad_norm": 2.7665796279907227, + "learning_rate": 7.610404348379341e-11, + "loss": 0.4315, + "step": 16414 + }, + { + "epoch": 1.9975661697596592, + "grad_norm": 1.7597237825393677, + "learning_rate": 6.868390773795419e-11, + "loss": 0.2918, + "step": 16415 + }, + { + "epoch": 1.9976878612716762, + "grad_norm": 2.6422579288482666, + "learning_rate": 6.16442900780445e-11, + "loss": 0.4158, + "step": 16416 + }, + { + "epoch": 1.9978095527836932, + "grad_norm": 1.8573795557022095, + "learning_rate": 5.498519077051789e-11, + "loss": 0.394, + "step": 16417 + }, + { + "epoch": 1.9979312442957102, + "grad_norm": 1.8805248737335205, + "learning_rate": 4.870661006961541e-11, + "loss": 0.3527, + "step": 16418 + }, + { + "epoch": 1.9980529358077272, + "grad_norm": 1.790338397026062, + "learning_rate": 4.280854821514524e-11, + "loss": 0.3372, + "step": 16419 + }, + { + "epoch": 1.9981746273197445, + "grad_norm": 1.7787131071090698, + "learning_rate": 3.729100543026221e-11, + "loss": 0.3965, + "step": 16420 + }, + { + "epoch": 1.9982963188317615, + "grad_norm": 1.8633769750595093, + "learning_rate": 3.215398192479846e-11, + "loss": 0.3527, + "step": 16421 + }, + { + "epoch": 1.9984180103437785, + "grad_norm": 2.000217914581299, + "learning_rate": 2.7397477894153257e-11, + "loss": 0.349, + "step": 16422 + }, + { + "epoch": 1.9985397018557955, + "grad_norm": 2.3797690868377686, + "learning_rate": 2.3021493521513395e-11, + "loss": 0.3906, + "step": 16423 + }, + { + "epoch": 1.9986613933678126, + "grad_norm": 2.995281457901001, + "learning_rate": 1.9026028970081656e-11, + "loss": 0.4525, + "step": 16424 + }, + { + "epoch": 1.9987830848798296, + "grad_norm": 1.4523077011108398, + "learning_rate": 1.5411084394179044e-11, + "loss": 0.3783, + "step": 16425 + }, + { + "epoch": 1.9989047763918468, + "grad_norm": 1.8615294694900513, + "learning_rate": 1.2176659931473212e-11, + "loss": 0.3783, + "step": 16426 + }, + { + "epoch": 1.9990264679038638, + "grad_norm": 2.6837189197540283, + "learning_rate": 9.32275570297847e-12, + "loss": 0.3881, + "step": 16427 + }, + { + "epoch": 1.9991481594158809, + "grad_norm": 1.8235241174697876, + "learning_rate": 6.849371819717121e-12, + "loss": 0.3603, + "step": 16428 + }, + { + "epoch": 1.9992698509278979, + "grad_norm": 1.722519040107727, + "learning_rate": 4.756508373837676e-12, + "loss": 0.3998, + "step": 16429 + }, + { + "epoch": 1.999391542439915, + "grad_norm": 3.307969808578491, + "learning_rate": 3.0441654463864156e-12, + "loss": 0.2877, + "step": 16430 + }, + { + "epoch": 1.999513233951932, + "grad_norm": 2.423255443572998, + "learning_rate": 1.7123431017562752e-12, + "loss": 0.4201, + "step": 16431 + }, + { + "epoch": 1.999634925463949, + "grad_norm": 1.7691155672073364, + "learning_rate": 7.61041391017514e-13, + "loss": 0.3697, + "step": 16432 + }, + { + "epoch": 1.999756616975966, + "grad_norm": 1.6485546827316284, + "learning_rate": 1.902603496972688e-13, + "loss": 0.3719, + "step": 16433 + }, + { + "epoch": 1.999878308487983, + "grad_norm": 1.541142463684082, + "learning_rate": 0.0, + "loss": 0.4099, + "step": 16434 + }, + { + "epoch": 1.999878308487983, + "step": 16434, + "total_flos": 7.693823835761541e+17, + "train_loss": 0.4233635055173034, + "train_runtime": 43491.2027, + "train_samples_per_second": 48.37, + "train_steps_per_second": 0.378 + } + ], + "logging_steps": 1.0, + "max_steps": 16434, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.693823835761541e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}