diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11313 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9995807518516795, + "eval_steps": 10500, + "global_step": 16098, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001863325103647459, + "grad_norm": 8.962045669555664, + "learning_rate": 0.00019987576096409493, + "loss": 2.809, + "step": 10 + }, + { + "epoch": 0.003726650207294918, + "grad_norm": 3.3185620307922363, + "learning_rate": 0.00019975152192818984, + "loss": 0.4593, + "step": 20 + }, + { + "epoch": 0.0055899753109423765, + "grad_norm": 3.248004674911499, + "learning_rate": 0.00019962728289228476, + "loss": 0.2631, + "step": 30 + }, + { + "epoch": 0.007453300414589836, + "grad_norm": 1.8456782102584839, + "learning_rate": 0.00019950304385637967, + "loss": 0.186, + "step": 40 + }, + { + "epoch": 0.009316625518237294, + "grad_norm": 1.2301431894302368, + "learning_rate": 0.0001993788048204746, + "loss": 0.1519, + "step": 50 + }, + { + "epoch": 0.011179950621884753, + "grad_norm": 0.8274269104003906, + "learning_rate": 0.0001992545657845695, + "loss": 0.1339, + "step": 60 + }, + { + "epoch": 0.013043275725532212, + "grad_norm": 1.3983819484710693, + "learning_rate": 0.00019913032674866442, + "loss": 0.1403, + "step": 70 + }, + { + "epoch": 0.014906600829179672, + "grad_norm": 0.9112318158149719, + "learning_rate": 0.00019900608771275934, + "loss": 0.119, + "step": 80 + }, + { + "epoch": 0.01676992593282713, + "grad_norm": 0.9845889806747437, + "learning_rate": 0.00019888184867685428, + "loss": 0.1003, + "step": 90 + }, + { + "epoch": 0.018633251036474587, + "grad_norm": 0.6126352548599243, + "learning_rate": 0.0001987576096409492, + "loss": 0.1063, + "step": 100 + }, + { + "epoch": 0.02049657614012205, + "grad_norm": 0.6967755556106567, + "learning_rate": 0.0001986333706050441, + "loss": 0.1097, + "step": 110 + }, + { + "epoch": 0.022359901243769506, + "grad_norm": 0.994688093662262, + "learning_rate": 0.00019850913156913903, + "loss": 0.1087, + "step": 120 + }, + { + "epoch": 0.024223226347416967, + "grad_norm": 0.9732158184051514, + "learning_rate": 0.00019838489253323397, + "loss": 0.0942, + "step": 130 + }, + { + "epoch": 0.026086551451064425, + "grad_norm": 0.45771920680999756, + "learning_rate": 0.00019826065349732889, + "loss": 0.0915, + "step": 140 + }, + { + "epoch": 0.027949876554711883, + "grad_norm": 0.6761167645454407, + "learning_rate": 0.0001981364144614238, + "loss": 0.0882, + "step": 150 + }, + { + "epoch": 0.029813201658359344, + "grad_norm": 0.8559990525245667, + "learning_rate": 0.00019801217542551872, + "loss": 0.0842, + "step": 160 + }, + { + "epoch": 0.0316765267620068, + "grad_norm": 0.6518134474754333, + "learning_rate": 0.00019788793638961363, + "loss": 0.0891, + "step": 170 + }, + { + "epoch": 0.03353985186565426, + "grad_norm": 0.6274904608726501, + "learning_rate": 0.00019776369735370855, + "loss": 0.0851, + "step": 180 + }, + { + "epoch": 0.03540317696930172, + "grad_norm": 0.4390016496181488, + "learning_rate": 0.00019763945831780346, + "loss": 0.0932, + "step": 190 + }, + { + "epoch": 0.037266502072949174, + "grad_norm": 0.5347189903259277, + "learning_rate": 0.00019751521928189838, + "loss": 0.0766, + "step": 200 + }, + { + "epoch": 0.03912982717659664, + "grad_norm": 2.5851993560791016, + "learning_rate": 0.0001973909802459933, + "loss": 0.0769, + "step": 210 + }, + { + "epoch": 0.0409931522802441, + "grad_norm": 0.7917025089263916, + "learning_rate": 0.0001972667412100882, + "loss": 0.0891, + "step": 220 + }, + { + "epoch": 0.042856477383891554, + "grad_norm": 1.9254189729690552, + "learning_rate": 0.00019714250217418313, + "loss": 0.0748, + "step": 230 + }, + { + "epoch": 0.04471980248753901, + "grad_norm": 0.4789351224899292, + "learning_rate": 0.00019701826313827804, + "loss": 0.0762, + "step": 240 + }, + { + "epoch": 0.04658312759118647, + "grad_norm": 0.4679170548915863, + "learning_rate": 0.00019689402410237296, + "loss": 0.0716, + "step": 250 + }, + { + "epoch": 0.048446452694833934, + "grad_norm": 0.6142864227294922, + "learning_rate": 0.0001967697850664679, + "loss": 0.0717, + "step": 260 + }, + { + "epoch": 0.05030977779848139, + "grad_norm": 0.6055127382278442, + "learning_rate": 0.00019664554603056282, + "loss": 0.0784, + "step": 270 + }, + { + "epoch": 0.05217310290212885, + "grad_norm": 0.5234378576278687, + "learning_rate": 0.00019652130699465773, + "loss": 0.0773, + "step": 280 + }, + { + "epoch": 0.05403642800577631, + "grad_norm": 0.6473028063774109, + "learning_rate": 0.00019639706795875265, + "loss": 0.0801, + "step": 290 + }, + { + "epoch": 0.055899753109423765, + "grad_norm": 0.5021067261695862, + "learning_rate": 0.00019627282892284757, + "loss": 0.0821, + "step": 300 + }, + { + "epoch": 0.05776307821307122, + "grad_norm": 0.5899055600166321, + "learning_rate": 0.00019614858988694248, + "loss": 0.0829, + "step": 310 + }, + { + "epoch": 0.05962640331671869, + "grad_norm": 0.9202219247817993, + "learning_rate": 0.0001960243508510374, + "loss": 0.0723, + "step": 320 + }, + { + "epoch": 0.061489728420366145, + "grad_norm": 0.45627737045288086, + "learning_rate": 0.0001959001118151323, + "loss": 0.0815, + "step": 330 + }, + { + "epoch": 0.0633530535240136, + "grad_norm": 0.4791790246963501, + "learning_rate": 0.00019577587277922726, + "loss": 0.0754, + "step": 340 + }, + { + "epoch": 0.06521637862766107, + "grad_norm": 0.48401352763175964, + "learning_rate": 0.00019565163374332217, + "loss": 0.0834, + "step": 350 + }, + { + "epoch": 0.06707970373130852, + "grad_norm": 0.45857805013656616, + "learning_rate": 0.0001955273947074171, + "loss": 0.0794, + "step": 360 + }, + { + "epoch": 0.06894302883495598, + "grad_norm": 0.3174554705619812, + "learning_rate": 0.000195403155671512, + "loss": 0.0567, + "step": 370 + }, + { + "epoch": 0.07080635393860343, + "grad_norm": 0.4005182087421417, + "learning_rate": 0.00019527891663560692, + "loss": 0.0593, + "step": 380 + }, + { + "epoch": 0.0726696790422509, + "grad_norm": 0.6048309206962585, + "learning_rate": 0.00019515467759970183, + "loss": 0.0802, + "step": 390 + }, + { + "epoch": 0.07453300414589835, + "grad_norm": 1.142407774925232, + "learning_rate": 0.00019503043856379675, + "loss": 0.0695, + "step": 400 + }, + { + "epoch": 0.07639632924954581, + "grad_norm": 0.4419887065887451, + "learning_rate": 0.00019490619952789167, + "loss": 0.0601, + "step": 410 + }, + { + "epoch": 0.07825965435319328, + "grad_norm": 0.33916333317756653, + "learning_rate": 0.00019478196049198658, + "loss": 0.052, + "step": 420 + }, + { + "epoch": 0.08012297945684073, + "grad_norm": 0.8268319964408875, + "learning_rate": 0.00019465772145608152, + "loss": 0.0602, + "step": 430 + }, + { + "epoch": 0.0819863045604882, + "grad_norm": 0.3288143575191498, + "learning_rate": 0.00019453348242017644, + "loss": 0.0559, + "step": 440 + }, + { + "epoch": 0.08384962966413564, + "grad_norm": 1.1518080234527588, + "learning_rate": 0.00019440924338427136, + "loss": 0.0586, + "step": 450 + }, + { + "epoch": 0.08571295476778311, + "grad_norm": 0.4662655293941498, + "learning_rate": 0.00019428500434836627, + "loss": 0.0828, + "step": 460 + }, + { + "epoch": 0.08757627987143057, + "grad_norm": 0.5718135237693787, + "learning_rate": 0.0001941607653124612, + "loss": 0.0538, + "step": 470 + }, + { + "epoch": 0.08943960497507802, + "grad_norm": 0.40107667446136475, + "learning_rate": 0.0001940365262765561, + "loss": 0.0492, + "step": 480 + }, + { + "epoch": 0.09130293007872549, + "grad_norm": 3.8866326808929443, + "learning_rate": 0.00019391228724065102, + "loss": 0.0554, + "step": 490 + }, + { + "epoch": 0.09316625518237294, + "grad_norm": 0.42105963826179504, + "learning_rate": 0.00019378804820474594, + "loss": 0.0597, + "step": 500 + }, + { + "epoch": 0.0950295802860204, + "grad_norm": 0.49508970975875854, + "learning_rate": 0.00019366380916884085, + "loss": 0.0541, + "step": 510 + }, + { + "epoch": 0.09689290538966787, + "grad_norm": 0.35285356640815735, + "learning_rate": 0.00019353957013293577, + "loss": 0.0489, + "step": 520 + }, + { + "epoch": 0.09875623049331532, + "grad_norm": 0.5713452100753784, + "learning_rate": 0.00019341533109703068, + "loss": 0.0606, + "step": 530 + }, + { + "epoch": 0.10061955559696278, + "grad_norm": 0.5628998279571533, + "learning_rate": 0.0001932910920611256, + "loss": 0.0604, + "step": 540 + }, + { + "epoch": 0.10248288070061023, + "grad_norm": 0.4256053864955902, + "learning_rate": 0.00019316685302522054, + "loss": 0.054, + "step": 550 + }, + { + "epoch": 0.1043462058042577, + "grad_norm": 0.4267830550670624, + "learning_rate": 0.00019304261398931546, + "loss": 0.0687, + "step": 560 + }, + { + "epoch": 0.10620953090790515, + "grad_norm": 0.5250473618507385, + "learning_rate": 0.00019291837495341037, + "loss": 0.0587, + "step": 570 + }, + { + "epoch": 0.10807285601155261, + "grad_norm": 0.45175454020500183, + "learning_rate": 0.0001927941359175053, + "loss": 0.0532, + "step": 580 + }, + { + "epoch": 0.10993618111520008, + "grad_norm": 0.40816521644592285, + "learning_rate": 0.0001926698968816002, + "loss": 0.0583, + "step": 590 + }, + { + "epoch": 0.11179950621884753, + "grad_norm": 0.5700634717941284, + "learning_rate": 0.00019254565784569515, + "loss": 0.0507, + "step": 600 + }, + { + "epoch": 0.113662831322495, + "grad_norm": 0.32277169823646545, + "learning_rate": 0.00019242141880979006, + "loss": 0.0485, + "step": 610 + }, + { + "epoch": 0.11552615642614245, + "grad_norm": 0.5027480721473694, + "learning_rate": 0.00019229717977388498, + "loss": 0.0493, + "step": 620 + }, + { + "epoch": 0.11738948152978991, + "grad_norm": 0.43033790588378906, + "learning_rate": 0.0001921729407379799, + "loss": 0.0498, + "step": 630 + }, + { + "epoch": 0.11925280663343737, + "grad_norm": 0.42897236347198486, + "learning_rate": 0.0001920487017020748, + "loss": 0.0542, + "step": 640 + }, + { + "epoch": 0.12111613173708483, + "grad_norm": 0.46961647272109985, + "learning_rate": 0.00019192446266616973, + "loss": 0.0532, + "step": 650 + }, + { + "epoch": 0.12297945684073229, + "grad_norm": 0.3467908799648285, + "learning_rate": 0.00019180022363026464, + "loss": 0.0702, + "step": 660 + }, + { + "epoch": 0.12484278194437974, + "grad_norm": 0.3707217574119568, + "learning_rate": 0.00019167598459435956, + "loss": 0.0496, + "step": 670 + }, + { + "epoch": 0.1267061070480272, + "grad_norm": 0.28889259696006775, + "learning_rate": 0.00019155174555845447, + "loss": 0.0507, + "step": 680 + }, + { + "epoch": 0.12856943215167466, + "grad_norm": 0.3841610550880432, + "learning_rate": 0.0001914275065225494, + "loss": 0.0518, + "step": 690 + }, + { + "epoch": 0.13043275725532213, + "grad_norm": 1.2005867958068848, + "learning_rate": 0.0001913032674866443, + "loss": 0.051, + "step": 700 + }, + { + "epoch": 0.13229608235896959, + "grad_norm": 0.3136664628982544, + "learning_rate": 0.00019117902845073922, + "loss": 0.0512, + "step": 710 + }, + { + "epoch": 0.13415940746261704, + "grad_norm": 0.2575332224369049, + "learning_rate": 0.00019105478941483414, + "loss": 0.0471, + "step": 720 + }, + { + "epoch": 0.1360227325662645, + "grad_norm": 0.44971975684165955, + "learning_rate": 0.00019093055037892905, + "loss": 0.0542, + "step": 730 + }, + { + "epoch": 0.13788605766991197, + "grad_norm": 0.40713998675346375, + "learning_rate": 0.00019080631134302397, + "loss": 0.0456, + "step": 740 + }, + { + "epoch": 0.13974938277355942, + "grad_norm": 0.4077875018119812, + "learning_rate": 0.00019068207230711888, + "loss": 0.0532, + "step": 750 + }, + { + "epoch": 0.14161270787720687, + "grad_norm": 0.4080505073070526, + "learning_rate": 0.00019055783327121383, + "loss": 0.0598, + "step": 760 + }, + { + "epoch": 0.14347603298085435, + "grad_norm": 0.30817773938179016, + "learning_rate": 0.00019043359423530874, + "loss": 0.0451, + "step": 770 + }, + { + "epoch": 0.1453393580845018, + "grad_norm": 0.27631935477256775, + "learning_rate": 0.00019030935519940366, + "loss": 0.0469, + "step": 780 + }, + { + "epoch": 0.14720268318814925, + "grad_norm": 0.8231372237205505, + "learning_rate": 0.0001901851161634986, + "loss": 0.0465, + "step": 790 + }, + { + "epoch": 0.1490660082917967, + "grad_norm": 0.3672221302986145, + "learning_rate": 0.00019006087712759352, + "loss": 0.0489, + "step": 800 + }, + { + "epoch": 0.15092933339544418, + "grad_norm": 0.32903629541397095, + "learning_rate": 0.00018993663809168843, + "loss": 0.0382, + "step": 810 + }, + { + "epoch": 0.15279265849909163, + "grad_norm": 0.291094034910202, + "learning_rate": 0.00018981239905578335, + "loss": 0.0478, + "step": 820 + }, + { + "epoch": 0.15465598360273908, + "grad_norm": 0.45158231258392334, + "learning_rate": 0.00018968816001987827, + "loss": 0.0443, + "step": 830 + }, + { + "epoch": 0.15651930870638656, + "grad_norm": 0.3328520953655243, + "learning_rate": 0.00018956392098397318, + "loss": 0.0369, + "step": 840 + }, + { + "epoch": 0.158382633810034, + "grad_norm": 0.3631761372089386, + "learning_rate": 0.0001894396819480681, + "loss": 0.0606, + "step": 850 + }, + { + "epoch": 0.16024595891368146, + "grad_norm": 0.37086474895477295, + "learning_rate": 0.000189315442912163, + "loss": 0.046, + "step": 860 + }, + { + "epoch": 0.16210928401732894, + "grad_norm": 0.43115702271461487, + "learning_rate": 0.00018919120387625793, + "loss": 0.0401, + "step": 870 + }, + { + "epoch": 0.1639726091209764, + "grad_norm": 0.44107526540756226, + "learning_rate": 0.00018906696484035284, + "loss": 0.0441, + "step": 880 + }, + { + "epoch": 0.16583593422462384, + "grad_norm": 0.3664778172969818, + "learning_rate": 0.00018894272580444776, + "loss": 0.0498, + "step": 890 + }, + { + "epoch": 0.1676992593282713, + "grad_norm": 0.24749009311199188, + "learning_rate": 0.00018881848676854268, + "loss": 0.0408, + "step": 900 + }, + { + "epoch": 0.16956258443191877, + "grad_norm": 0.41872236132621765, + "learning_rate": 0.0001886942477326376, + "loss": 0.0568, + "step": 910 + }, + { + "epoch": 0.17142590953556622, + "grad_norm": 0.25826430320739746, + "learning_rate": 0.0001885700086967325, + "loss": 0.0477, + "step": 920 + }, + { + "epoch": 0.17328923463921367, + "grad_norm": 0.29239359498023987, + "learning_rate": 0.00018844576966082742, + "loss": 0.0472, + "step": 930 + }, + { + "epoch": 0.17515255974286115, + "grad_norm": 0.8567617535591125, + "learning_rate": 0.00018832153062492234, + "loss": 0.0396, + "step": 940 + }, + { + "epoch": 0.1770158848465086, + "grad_norm": 0.47980305552482605, + "learning_rate": 0.00018819729158901728, + "loss": 0.0404, + "step": 950 + }, + { + "epoch": 0.17887920995015605, + "grad_norm": 0.2695956528186798, + "learning_rate": 0.0001880730525531122, + "loss": 0.0385, + "step": 960 + }, + { + "epoch": 0.1807425350538035, + "grad_norm": 0.3030606508255005, + "learning_rate": 0.00018794881351720711, + "loss": 0.0433, + "step": 970 + }, + { + "epoch": 0.18260586015745098, + "grad_norm": 0.5657457709312439, + "learning_rate": 0.00018782457448130203, + "loss": 0.0492, + "step": 980 + }, + { + "epoch": 0.18446918526109843, + "grad_norm": 0.4033714532852173, + "learning_rate": 0.00018770033544539695, + "loss": 0.0439, + "step": 990 + }, + { + "epoch": 0.18633251036474588, + "grad_norm": 0.21745193004608154, + "learning_rate": 0.00018757609640949186, + "loss": 0.0353, + "step": 1000 + }, + { + "epoch": 0.18819583546839336, + "grad_norm": 0.3541043698787689, + "learning_rate": 0.0001874518573735868, + "loss": 0.0387, + "step": 1010 + }, + { + "epoch": 0.1900591605720408, + "grad_norm": 0.38949277997016907, + "learning_rate": 0.00018732761833768172, + "loss": 0.0477, + "step": 1020 + }, + { + "epoch": 0.19192248567568826, + "grad_norm": 0.2696048617362976, + "learning_rate": 0.00018720337930177664, + "loss": 0.0467, + "step": 1030 + }, + { + "epoch": 0.19378581077933574, + "grad_norm": 0.40931448340415955, + "learning_rate": 0.00018707914026587155, + "loss": 0.0387, + "step": 1040 + }, + { + "epoch": 0.1956491358829832, + "grad_norm": 1.0407683849334717, + "learning_rate": 0.00018695490122996647, + "loss": 0.0418, + "step": 1050 + }, + { + "epoch": 0.19751246098663064, + "grad_norm": 0.8484430313110352, + "learning_rate": 0.00018683066219406138, + "loss": 0.0379, + "step": 1060 + }, + { + "epoch": 0.1993757860902781, + "grad_norm": 0.2715752422809601, + "learning_rate": 0.0001867064231581563, + "loss": 0.0472, + "step": 1070 + }, + { + "epoch": 0.20123911119392557, + "grad_norm": 0.3399510383605957, + "learning_rate": 0.00018658218412225122, + "loss": 0.0412, + "step": 1080 + }, + { + "epoch": 0.20310243629757302, + "grad_norm": 0.36955976486206055, + "learning_rate": 0.00018645794508634613, + "loss": 0.0414, + "step": 1090 + }, + { + "epoch": 0.20496576140122047, + "grad_norm": 0.43797340989112854, + "learning_rate": 0.00018633370605044105, + "loss": 0.0439, + "step": 1100 + }, + { + "epoch": 0.20682908650486795, + "grad_norm": 0.2698400020599365, + "learning_rate": 0.00018620946701453596, + "loss": 0.0426, + "step": 1110 + }, + { + "epoch": 0.2086924116085154, + "grad_norm": 0.39951518177986145, + "learning_rate": 0.0001860852279786309, + "loss": 0.0436, + "step": 1120 + }, + { + "epoch": 0.21055573671216285, + "grad_norm": 0.24244333803653717, + "learning_rate": 0.00018596098894272582, + "loss": 0.0424, + "step": 1130 + }, + { + "epoch": 0.2124190618158103, + "grad_norm": 1.0004985332489014, + "learning_rate": 0.00018583674990682074, + "loss": 0.0586, + "step": 1140 + }, + { + "epoch": 0.21428238691945778, + "grad_norm": 0.40292367339134216, + "learning_rate": 0.00018571251087091565, + "loss": 0.0416, + "step": 1150 + }, + { + "epoch": 0.21614571202310523, + "grad_norm": 2.8957316875457764, + "learning_rate": 0.00018558827183501057, + "loss": 0.0486, + "step": 1160 + }, + { + "epoch": 0.21800903712675268, + "grad_norm": 0.3073020875453949, + "learning_rate": 0.00018546403279910548, + "loss": 0.048, + "step": 1170 + }, + { + "epoch": 0.21987236223040016, + "grad_norm": 0.3981713354587555, + "learning_rate": 0.0001853397937632004, + "loss": 0.0406, + "step": 1180 + }, + { + "epoch": 0.2217356873340476, + "grad_norm": 0.3430505394935608, + "learning_rate": 0.00018521555472729532, + "loss": 0.0342, + "step": 1190 + }, + { + "epoch": 0.22359901243769506, + "grad_norm": 0.2942293584346771, + "learning_rate": 0.00018509131569139023, + "loss": 0.0373, + "step": 1200 + }, + { + "epoch": 0.22546233754134254, + "grad_norm": 0.2513025403022766, + "learning_rate": 0.00018496707665548515, + "loss": 0.0437, + "step": 1210 + }, + { + "epoch": 0.22732566264499, + "grad_norm": 0.5296605825424194, + "learning_rate": 0.0001848428376195801, + "loss": 0.0431, + "step": 1220 + }, + { + "epoch": 0.22918898774863744, + "grad_norm": 0.28046369552612305, + "learning_rate": 0.000184718598583675, + "loss": 0.0393, + "step": 1230 + }, + { + "epoch": 0.2310523128522849, + "grad_norm": 0.4907744526863098, + "learning_rate": 0.00018459435954776992, + "loss": 0.043, + "step": 1240 + }, + { + "epoch": 0.23291563795593237, + "grad_norm": 0.829265296459198, + "learning_rate": 0.00018447012051186484, + "loss": 0.0333, + "step": 1250 + }, + { + "epoch": 0.23477896305957982, + "grad_norm": 0.21454624831676483, + "learning_rate": 0.00018434588147595975, + "loss": 0.0303, + "step": 1260 + }, + { + "epoch": 0.23664228816322727, + "grad_norm": 0.2321559488773346, + "learning_rate": 0.00018422164244005467, + "loss": 0.0525, + "step": 1270 + }, + { + "epoch": 0.23850561326687475, + "grad_norm": 0.30932319164276123, + "learning_rate": 0.00018409740340414959, + "loss": 0.0295, + "step": 1280 + }, + { + "epoch": 0.2403689383705222, + "grad_norm": 0.2902001738548279, + "learning_rate": 0.00018397316436824453, + "loss": 0.0408, + "step": 1290 + }, + { + "epoch": 0.24223226347416965, + "grad_norm": 0.3144378066062927, + "learning_rate": 0.00018384892533233944, + "loss": 0.0321, + "step": 1300 + }, + { + "epoch": 0.2440955885778171, + "grad_norm": 0.20050252974033356, + "learning_rate": 0.00018372468629643436, + "loss": 0.0317, + "step": 1310 + }, + { + "epoch": 0.24595891368146458, + "grad_norm": 0.6081618666648865, + "learning_rate": 0.00018360044726052928, + "loss": 0.0334, + "step": 1320 + }, + { + "epoch": 0.24782223878511203, + "grad_norm": 0.2507849931716919, + "learning_rate": 0.0001834762082246242, + "loss": 0.0486, + "step": 1330 + }, + { + "epoch": 0.24968556388875948, + "grad_norm": 0.25980520248413086, + "learning_rate": 0.0001833519691887191, + "loss": 0.0454, + "step": 1340 + }, + { + "epoch": 0.25154888899240696, + "grad_norm": 0.2981957197189331, + "learning_rate": 0.00018322773015281402, + "loss": 0.0347, + "step": 1350 + }, + { + "epoch": 0.2534122140960544, + "grad_norm": 0.21605946123600006, + "learning_rate": 0.00018310349111690894, + "loss": 0.0347, + "step": 1360 + }, + { + "epoch": 0.25527553919970186, + "grad_norm": 0.16565227508544922, + "learning_rate": 0.00018297925208100385, + "loss": 0.0348, + "step": 1370 + }, + { + "epoch": 0.2571388643033493, + "grad_norm": 0.23056954145431519, + "learning_rate": 0.00018285501304509877, + "loss": 0.0313, + "step": 1380 + }, + { + "epoch": 0.25900218940699676, + "grad_norm": 0.1978144496679306, + "learning_rate": 0.0001827307740091937, + "loss": 0.0324, + "step": 1390 + }, + { + "epoch": 0.26086551451064427, + "grad_norm": 0.4201340675354004, + "learning_rate": 0.0001826065349732886, + "loss": 0.038, + "step": 1400 + }, + { + "epoch": 0.2627288396142917, + "grad_norm": 0.2694622874259949, + "learning_rate": 0.00018248229593738352, + "loss": 0.0617, + "step": 1410 + }, + { + "epoch": 0.26459216471793917, + "grad_norm": 0.2527810037136078, + "learning_rate": 0.00018235805690147843, + "loss": 0.0278, + "step": 1420 + }, + { + "epoch": 0.2664554898215866, + "grad_norm": 0.1482096016407013, + "learning_rate": 0.00018223381786557338, + "loss": 0.035, + "step": 1430 + }, + { + "epoch": 0.2683188149252341, + "grad_norm": 0.36221590638160706, + "learning_rate": 0.0001821095788296683, + "loss": 0.0306, + "step": 1440 + }, + { + "epoch": 0.2701821400288815, + "grad_norm": 1.1332191228866577, + "learning_rate": 0.0001819853397937632, + "loss": 0.0383, + "step": 1450 + }, + { + "epoch": 0.272045465132529, + "grad_norm": 0.16091710329055786, + "learning_rate": 0.00018186110075785815, + "loss": 0.0553, + "step": 1460 + }, + { + "epoch": 0.2739087902361765, + "grad_norm": 0.2549150288105011, + "learning_rate": 0.00018173686172195307, + "loss": 0.0335, + "step": 1470 + }, + { + "epoch": 0.27577211533982393, + "grad_norm": 0.6201621890068054, + "learning_rate": 0.00018161262268604798, + "loss": 0.0336, + "step": 1480 + }, + { + "epoch": 0.2776354404434714, + "grad_norm": 0.24882057309150696, + "learning_rate": 0.0001814883836501429, + "loss": 0.0407, + "step": 1490 + }, + { + "epoch": 0.27949876554711883, + "grad_norm": 0.2643592059612274, + "learning_rate": 0.00018136414461423781, + "loss": 0.0336, + "step": 1500 + }, + { + "epoch": 0.2813620906507663, + "grad_norm": 0.2674497067928314, + "learning_rate": 0.00018123990557833273, + "loss": 0.036, + "step": 1510 + }, + { + "epoch": 0.28322541575441373, + "grad_norm": 0.1852717250585556, + "learning_rate": 0.00018111566654242765, + "loss": 0.0306, + "step": 1520 + }, + { + "epoch": 0.2850887408580612, + "grad_norm": 0.16907139122486115, + "learning_rate": 0.00018099142750652256, + "loss": 0.0309, + "step": 1530 + }, + { + "epoch": 0.2869520659617087, + "grad_norm": 0.2268272340297699, + "learning_rate": 0.00018086718847061748, + "loss": 0.0338, + "step": 1540 + }, + { + "epoch": 0.28881539106535614, + "grad_norm": 0.1555815041065216, + "learning_rate": 0.0001807429494347124, + "loss": 0.0341, + "step": 1550 + }, + { + "epoch": 0.2906787161690036, + "grad_norm": 0.23342694342136383, + "learning_rate": 0.0001806187103988073, + "loss": 0.0301, + "step": 1560 + }, + { + "epoch": 0.29254204127265104, + "grad_norm": 0.2339681088924408, + "learning_rate": 0.00018049447136290223, + "loss": 0.0278, + "step": 1570 + }, + { + "epoch": 0.2944053663762985, + "grad_norm": 0.2596907317638397, + "learning_rate": 0.00018037023232699714, + "loss": 0.0316, + "step": 1580 + }, + { + "epoch": 0.29626869147994594, + "grad_norm": 0.1681392937898636, + "learning_rate": 0.00018024599329109206, + "loss": 0.0384, + "step": 1590 + }, + { + "epoch": 0.2981320165835934, + "grad_norm": 0.3269426226615906, + "learning_rate": 0.00018012175425518697, + "loss": 0.03, + "step": 1600 + }, + { + "epoch": 0.2999953416872409, + "grad_norm": 0.37414050102233887, + "learning_rate": 0.0001799975152192819, + "loss": 0.0377, + "step": 1610 + }, + { + "epoch": 0.30185866679088835, + "grad_norm": 0.237156942486763, + "learning_rate": 0.0001798732761833768, + "loss": 0.0361, + "step": 1620 + }, + { + "epoch": 0.3037219918945358, + "grad_norm": 0.1954265832901001, + "learning_rate": 0.00017974903714747175, + "loss": 0.0458, + "step": 1630 + }, + { + "epoch": 0.30558531699818325, + "grad_norm": 0.4432564079761505, + "learning_rate": 0.00017962479811156666, + "loss": 0.0329, + "step": 1640 + }, + { + "epoch": 0.3074486421018307, + "grad_norm": 0.20280279219150543, + "learning_rate": 0.00017950055907566158, + "loss": 0.0346, + "step": 1650 + }, + { + "epoch": 0.30931196720547816, + "grad_norm": 0.18664799630641937, + "learning_rate": 0.0001793763200397565, + "loss": 0.0325, + "step": 1660 + }, + { + "epoch": 0.31117529230912566, + "grad_norm": 0.4096941351890564, + "learning_rate": 0.00017925208100385144, + "loss": 0.0326, + "step": 1670 + }, + { + "epoch": 0.3130386174127731, + "grad_norm": 0.14809350669384003, + "learning_rate": 0.00017912784196794635, + "loss": 0.0305, + "step": 1680 + }, + { + "epoch": 0.31490194251642056, + "grad_norm": 0.1550980508327484, + "learning_rate": 0.00017900360293204127, + "loss": 0.0419, + "step": 1690 + }, + { + "epoch": 0.316765267620068, + "grad_norm": 0.2249947041273117, + "learning_rate": 0.00017887936389613619, + "loss": 0.0329, + "step": 1700 + }, + { + "epoch": 0.31862859272371546, + "grad_norm": 0.28853726387023926, + "learning_rate": 0.0001787551248602311, + "loss": 0.0395, + "step": 1710 + }, + { + "epoch": 0.3204919178273629, + "grad_norm": 0.2440110743045807, + "learning_rate": 0.00017863088582432602, + "loss": 0.0374, + "step": 1720 + }, + { + "epoch": 0.32235524293101037, + "grad_norm": 0.2219010591506958, + "learning_rate": 0.00017850664678842093, + "loss": 0.0325, + "step": 1730 + }, + { + "epoch": 0.3242185680346579, + "grad_norm": 1.0239918231964111, + "learning_rate": 0.00017838240775251585, + "loss": 0.0354, + "step": 1740 + }, + { + "epoch": 0.3260818931383053, + "grad_norm": 0.27604353427886963, + "learning_rate": 0.00017825816871661076, + "loss": 0.0291, + "step": 1750 + }, + { + "epoch": 0.3279452182419528, + "grad_norm": 0.25242915749549866, + "learning_rate": 0.00017813392968070568, + "loss": 0.0417, + "step": 1760 + }, + { + "epoch": 0.3298085433456002, + "grad_norm": 0.1866777390241623, + "learning_rate": 0.0001780096906448006, + "loss": 0.0315, + "step": 1770 + }, + { + "epoch": 0.3316718684492477, + "grad_norm": 0.26006487011909485, + "learning_rate": 0.0001778854516088955, + "loss": 0.0281, + "step": 1780 + }, + { + "epoch": 0.3335351935528951, + "grad_norm": 0.2924884557723999, + "learning_rate": 0.00017776121257299043, + "loss": 0.0371, + "step": 1790 + }, + { + "epoch": 0.3353985186565426, + "grad_norm": 0.1723160594701767, + "learning_rate": 0.00017763697353708537, + "loss": 0.0344, + "step": 1800 + }, + { + "epoch": 0.3372618437601901, + "grad_norm": 0.3166184425354004, + "learning_rate": 0.00017751273450118029, + "loss": 0.0309, + "step": 1810 + }, + { + "epoch": 0.33912516886383753, + "grad_norm": 0.17607541382312775, + "learning_rate": 0.0001773884954652752, + "loss": 0.0406, + "step": 1820 + }, + { + "epoch": 0.340988493967485, + "grad_norm": 0.36102867126464844, + "learning_rate": 0.00017726425642937012, + "loss": 0.0399, + "step": 1830 + }, + { + "epoch": 0.34285181907113244, + "grad_norm": 0.23942948877811432, + "learning_rate": 0.00017714001739346503, + "loss": 0.0387, + "step": 1840 + }, + { + "epoch": 0.3447151441747799, + "grad_norm": 0.33895015716552734, + "learning_rate": 0.00017701577835755995, + "loss": 0.0347, + "step": 1850 + }, + { + "epoch": 0.34657846927842734, + "grad_norm": 0.21242383122444153, + "learning_rate": 0.00017689153932165487, + "loss": 0.0288, + "step": 1860 + }, + { + "epoch": 0.3484417943820748, + "grad_norm": 0.264288067817688, + "learning_rate": 0.00017676730028574978, + "loss": 0.0379, + "step": 1870 + }, + { + "epoch": 0.3503051194857223, + "grad_norm": 0.17818303406238556, + "learning_rate": 0.0001766430612498447, + "loss": 0.03, + "step": 1880 + }, + { + "epoch": 0.35216844458936974, + "grad_norm": 0.2825514078140259, + "learning_rate": 0.00017651882221393964, + "loss": 0.0275, + "step": 1890 + }, + { + "epoch": 0.3540317696930172, + "grad_norm": 0.36010608077049255, + "learning_rate": 0.00017639458317803456, + "loss": 0.038, + "step": 1900 + }, + { + "epoch": 0.35589509479666465, + "grad_norm": 0.22792565822601318, + "learning_rate": 0.00017627034414212947, + "loss": 0.0351, + "step": 1910 + }, + { + "epoch": 0.3577584199003121, + "grad_norm": 0.2936442196369171, + "learning_rate": 0.0001761461051062244, + "loss": 0.0421, + "step": 1920 + }, + { + "epoch": 0.35962174500395955, + "grad_norm": 0.2216373234987259, + "learning_rate": 0.0001760218660703193, + "loss": 0.027, + "step": 1930 + }, + { + "epoch": 0.361485070107607, + "grad_norm": 2.177262783050537, + "learning_rate": 0.00017589762703441422, + "loss": 0.031, + "step": 1940 + }, + { + "epoch": 0.3633483952112545, + "grad_norm": 0.14397388696670532, + "learning_rate": 0.00017577338799850913, + "loss": 0.0373, + "step": 1950 + }, + { + "epoch": 0.36521172031490196, + "grad_norm": 0.11747460812330246, + "learning_rate": 0.00017564914896260405, + "loss": 0.0284, + "step": 1960 + }, + { + "epoch": 0.3670750454185494, + "grad_norm": 0.23211225867271423, + "learning_rate": 0.000175524909926699, + "loss": 0.0307, + "step": 1970 + }, + { + "epoch": 0.36893837052219686, + "grad_norm": 0.6417592763900757, + "learning_rate": 0.0001754006708907939, + "loss": 0.0291, + "step": 1980 + }, + { + "epoch": 0.3708016956258443, + "grad_norm": 0.22125494480133057, + "learning_rate": 0.00017527643185488882, + "loss": 0.0313, + "step": 1990 + }, + { + "epoch": 0.37266502072949176, + "grad_norm": 0.22006738185882568, + "learning_rate": 0.00017515219281898374, + "loss": 0.0343, + "step": 2000 + }, + { + "epoch": 0.37452834583313926, + "grad_norm": 0.4610983729362488, + "learning_rate": 0.00017502795378307866, + "loss": 0.0375, + "step": 2010 + }, + { + "epoch": 0.3763916709367867, + "grad_norm": 0.29008451104164124, + "learning_rate": 0.00017490371474717357, + "loss": 0.0347, + "step": 2020 + }, + { + "epoch": 0.37825499604043417, + "grad_norm": 0.20057004690170288, + "learning_rate": 0.0001747794757112685, + "loss": 0.0305, + "step": 2030 + }, + { + "epoch": 0.3801183211440816, + "grad_norm": 0.22764652967453003, + "learning_rate": 0.0001746552366753634, + "loss": 0.0317, + "step": 2040 + }, + { + "epoch": 0.38198164624772907, + "grad_norm": 0.19526733458042145, + "learning_rate": 0.00017453099763945832, + "loss": 0.0394, + "step": 2050 + }, + { + "epoch": 0.3838449713513765, + "grad_norm": 0.2324579507112503, + "learning_rate": 0.00017440675860355324, + "loss": 0.0248, + "step": 2060 + }, + { + "epoch": 0.38570829645502397, + "grad_norm": 0.21112307906150818, + "learning_rate": 0.00017428251956764815, + "loss": 0.0323, + "step": 2070 + }, + { + "epoch": 0.3875716215586715, + "grad_norm": 0.3266056478023529, + "learning_rate": 0.00017415828053174307, + "loss": 0.0297, + "step": 2080 + }, + { + "epoch": 0.3894349466623189, + "grad_norm": 0.13637982308864594, + "learning_rate": 0.00017403404149583798, + "loss": 0.0262, + "step": 2090 + }, + { + "epoch": 0.3912982717659664, + "grad_norm": 0.16478992998600006, + "learning_rate": 0.00017390980245993293, + "loss": 0.0313, + "step": 2100 + }, + { + "epoch": 0.3931615968696138, + "grad_norm": 0.16533003747463226, + "learning_rate": 0.00017378556342402784, + "loss": 0.0291, + "step": 2110 + }, + { + "epoch": 0.3950249219732613, + "grad_norm": 0.2503752112388611, + "learning_rate": 0.00017366132438812276, + "loss": 0.0293, + "step": 2120 + }, + { + "epoch": 0.39688824707690873, + "grad_norm": 0.36518189311027527, + "learning_rate": 0.00017353708535221767, + "loss": 0.0377, + "step": 2130 + }, + { + "epoch": 0.3987515721805562, + "grad_norm": 0.1784435659646988, + "learning_rate": 0.0001734128463163126, + "loss": 0.0217, + "step": 2140 + }, + { + "epoch": 0.4006148972842037, + "grad_norm": 0.2254277467727661, + "learning_rate": 0.00017328860728040753, + "loss": 0.0334, + "step": 2150 + }, + { + "epoch": 0.40247822238785114, + "grad_norm": 0.23453503847122192, + "learning_rate": 0.00017316436824450245, + "loss": 0.0342, + "step": 2160 + }, + { + "epoch": 0.4043415474914986, + "grad_norm": 0.17916861176490784, + "learning_rate": 0.00017304012920859736, + "loss": 0.0366, + "step": 2170 + }, + { + "epoch": 0.40620487259514604, + "grad_norm": 0.20418667793273926, + "learning_rate": 0.00017291589017269228, + "loss": 0.028, + "step": 2180 + }, + { + "epoch": 0.4080681976987935, + "grad_norm": 0.32335731387138367, + "learning_rate": 0.0001727916511367872, + "loss": 0.0336, + "step": 2190 + }, + { + "epoch": 0.40993152280244094, + "grad_norm": 0.19899824261665344, + "learning_rate": 0.0001726674121008821, + "loss": 0.0398, + "step": 2200 + }, + { + "epoch": 0.4117948479060884, + "grad_norm": 0.22357973456382751, + "learning_rate": 0.00017254317306497703, + "loss": 0.0315, + "step": 2210 + }, + { + "epoch": 0.4136581730097359, + "grad_norm": 0.12390164285898209, + "learning_rate": 0.00017241893402907194, + "loss": 0.0265, + "step": 2220 + }, + { + "epoch": 0.41552149811338335, + "grad_norm": 0.2476750761270523, + "learning_rate": 0.00017229469499316686, + "loss": 0.0405, + "step": 2230 + }, + { + "epoch": 0.4173848232170308, + "grad_norm": 0.24649333953857422, + "learning_rate": 0.00017217045595726177, + "loss": 0.0327, + "step": 2240 + }, + { + "epoch": 0.41924814832067825, + "grad_norm": 0.30495890974998474, + "learning_rate": 0.0001720462169213567, + "loss": 0.0319, + "step": 2250 + }, + { + "epoch": 0.4211114734243257, + "grad_norm": 0.2112802118062973, + "learning_rate": 0.0001719219778854516, + "loss": 0.0294, + "step": 2260 + }, + { + "epoch": 0.42297479852797315, + "grad_norm": 0.25762444734573364, + "learning_rate": 0.00017179773884954652, + "loss": 0.0291, + "step": 2270 + }, + { + "epoch": 0.4248381236316206, + "grad_norm": 0.3094237744808197, + "learning_rate": 0.00017167349981364144, + "loss": 0.0391, + "step": 2280 + }, + { + "epoch": 0.4267014487352681, + "grad_norm": 0.17324984073638916, + "learning_rate": 0.00017154926077773635, + "loss": 0.0256, + "step": 2290 + }, + { + "epoch": 0.42856477383891556, + "grad_norm": 0.12840083241462708, + "learning_rate": 0.00017142502174183127, + "loss": 0.04, + "step": 2300 + }, + { + "epoch": 0.430428098942563, + "grad_norm": 0.23374256491661072, + "learning_rate": 0.0001713007827059262, + "loss": 0.0277, + "step": 2310 + }, + { + "epoch": 0.43229142404621046, + "grad_norm": 0.27365607023239136, + "learning_rate": 0.00017117654367002113, + "loss": 0.0365, + "step": 2320 + }, + { + "epoch": 0.4341547491498579, + "grad_norm": 0.12370350956916809, + "learning_rate": 0.00017105230463411604, + "loss": 0.0449, + "step": 2330 + }, + { + "epoch": 0.43601807425350536, + "grad_norm": 0.3122682571411133, + "learning_rate": 0.000170928065598211, + "loss": 0.0263, + "step": 2340 + }, + { + "epoch": 0.4378813993571528, + "grad_norm": 0.20058439671993256, + "learning_rate": 0.0001708038265623059, + "loss": 0.0296, + "step": 2350 + }, + { + "epoch": 0.4397447244608003, + "grad_norm": 0.16738611459732056, + "learning_rate": 0.00017067958752640082, + "loss": 0.0376, + "step": 2360 + }, + { + "epoch": 0.44160804956444777, + "grad_norm": 0.11145570129156113, + "learning_rate": 0.00017055534849049573, + "loss": 0.0227, + "step": 2370 + }, + { + "epoch": 0.4434713746680952, + "grad_norm": 0.2827621102333069, + "learning_rate": 0.00017043110945459065, + "loss": 0.0281, + "step": 2380 + }, + { + "epoch": 0.44533469977174267, + "grad_norm": 0.13350103795528412, + "learning_rate": 0.00017030687041868557, + "loss": 0.038, + "step": 2390 + }, + { + "epoch": 0.4471980248753901, + "grad_norm": 0.10742458701133728, + "learning_rate": 0.00017018263138278048, + "loss": 0.0295, + "step": 2400 + }, + { + "epoch": 0.44906134997903757, + "grad_norm": 0.21985341608524323, + "learning_rate": 0.0001700583923468754, + "loss": 0.0301, + "step": 2410 + }, + { + "epoch": 0.4509246750826851, + "grad_norm": 0.22676043212413788, + "learning_rate": 0.0001699341533109703, + "loss": 0.0323, + "step": 2420 + }, + { + "epoch": 0.45278800018633253, + "grad_norm": 0.39583566784858704, + "learning_rate": 0.00016980991427506523, + "loss": 0.0283, + "step": 2430 + }, + { + "epoch": 0.45465132528998, + "grad_norm": 0.33214402198791504, + "learning_rate": 0.00016968567523916014, + "loss": 0.0328, + "step": 2440 + }, + { + "epoch": 0.45651465039362743, + "grad_norm": 0.19369973242282867, + "learning_rate": 0.00016956143620325506, + "loss": 0.0251, + "step": 2450 + }, + { + "epoch": 0.4583779754972749, + "grad_norm": 0.1785402148962021, + "learning_rate": 0.00016943719716734998, + "loss": 0.0338, + "step": 2460 + }, + { + "epoch": 0.46024130060092233, + "grad_norm": 0.30956077575683594, + "learning_rate": 0.0001693129581314449, + "loss": 0.0251, + "step": 2470 + }, + { + "epoch": 0.4621046257045698, + "grad_norm": 0.17932891845703125, + "learning_rate": 0.0001691887190955398, + "loss": 0.0291, + "step": 2480 + }, + { + "epoch": 0.4639679508082173, + "grad_norm": 0.584335446357727, + "learning_rate": 0.00016906448005963475, + "loss": 0.0223, + "step": 2490 + }, + { + "epoch": 0.46583127591186474, + "grad_norm": 0.2823520004749298, + "learning_rate": 0.00016894024102372967, + "loss": 0.0289, + "step": 2500 + }, + { + "epoch": 0.4676946010155122, + "grad_norm": 0.12712427973747253, + "learning_rate": 0.00016881600198782458, + "loss": 0.0321, + "step": 2510 + }, + { + "epoch": 0.46955792611915964, + "grad_norm": 0.18848009407520294, + "learning_rate": 0.0001686917629519195, + "loss": 0.0183, + "step": 2520 + }, + { + "epoch": 0.4714212512228071, + "grad_norm": 0.5352602601051331, + "learning_rate": 0.00016856752391601441, + "loss": 0.0417, + "step": 2530 + }, + { + "epoch": 0.47328457632645454, + "grad_norm": 0.1589186191558838, + "learning_rate": 0.00016844328488010933, + "loss": 0.0258, + "step": 2540 + }, + { + "epoch": 0.475147901430102, + "grad_norm": 0.4766451120376587, + "learning_rate": 0.00016831904584420427, + "loss": 0.0224, + "step": 2550 + }, + { + "epoch": 0.4770112265337495, + "grad_norm": 0.11691979318857193, + "learning_rate": 0.0001681948068082992, + "loss": 0.0264, + "step": 2560 + }, + { + "epoch": 0.47887455163739695, + "grad_norm": 0.19135166704654694, + "learning_rate": 0.0001680705677723941, + "loss": 0.0311, + "step": 2570 + }, + { + "epoch": 0.4807378767410444, + "grad_norm": 0.1519075334072113, + "learning_rate": 0.00016794632873648902, + "loss": 0.0316, + "step": 2580 + }, + { + "epoch": 0.48260120184469185, + "grad_norm": 0.211242213845253, + "learning_rate": 0.00016782208970058394, + "loss": 0.0267, + "step": 2590 + }, + { + "epoch": 0.4844645269483393, + "grad_norm": 0.1120869442820549, + "learning_rate": 0.00016769785066467885, + "loss": 0.0275, + "step": 2600 + }, + { + "epoch": 0.48632785205198675, + "grad_norm": 0.12016676366329193, + "learning_rate": 0.00016757361162877377, + "loss": 0.0269, + "step": 2610 + }, + { + "epoch": 0.4881911771556342, + "grad_norm": 0.17459681630134583, + "learning_rate": 0.00016744937259286868, + "loss": 0.0275, + "step": 2620 + }, + { + "epoch": 0.4900545022592817, + "grad_norm": 0.4547879993915558, + "learning_rate": 0.0001673251335569636, + "loss": 0.0445, + "step": 2630 + }, + { + "epoch": 0.49191782736292916, + "grad_norm": 0.20323753356933594, + "learning_rate": 0.00016720089452105851, + "loss": 0.0274, + "step": 2640 + }, + { + "epoch": 0.4937811524665766, + "grad_norm": 0.14843535423278809, + "learning_rate": 0.00016707665548515343, + "loss": 0.0261, + "step": 2650 + }, + { + "epoch": 0.49564447757022406, + "grad_norm": 0.3513728678226471, + "learning_rate": 0.00016695241644924837, + "loss": 0.0292, + "step": 2660 + }, + { + "epoch": 0.4975078026738715, + "grad_norm": 0.10806536674499512, + "learning_rate": 0.0001668281774133433, + "loss": 0.0263, + "step": 2670 + }, + { + "epoch": 0.49937112777751896, + "grad_norm": 0.2897528409957886, + "learning_rate": 0.0001667039383774382, + "loss": 0.0265, + "step": 2680 + }, + { + "epoch": 0.5012344528811664, + "grad_norm": 0.26667869091033936, + "learning_rate": 0.00016657969934153312, + "loss": 0.0319, + "step": 2690 + }, + { + "epoch": 0.5030977779848139, + "grad_norm": 0.21708862483501434, + "learning_rate": 0.00016645546030562804, + "loss": 0.0254, + "step": 2700 + }, + { + "epoch": 0.5049611030884613, + "grad_norm": 0.1377273052930832, + "learning_rate": 0.00016633122126972295, + "loss": 0.021, + "step": 2710 + }, + { + "epoch": 0.5068244281921088, + "grad_norm": 0.18345075845718384, + "learning_rate": 0.00016620698223381787, + "loss": 0.0329, + "step": 2720 + }, + { + "epoch": 0.5086877532957563, + "grad_norm": 0.1460709273815155, + "learning_rate": 0.00016608274319791278, + "loss": 0.0263, + "step": 2730 + }, + { + "epoch": 0.5105510783994037, + "grad_norm": 0.32654863595962524, + "learning_rate": 0.0001659585041620077, + "loss": 0.0226, + "step": 2740 + }, + { + "epoch": 0.5124144035030512, + "grad_norm": 0.166800856590271, + "learning_rate": 0.00016583426512610262, + "loss": 0.0235, + "step": 2750 + }, + { + "epoch": 0.5142777286066986, + "grad_norm": 0.15170325338840485, + "learning_rate": 0.00016571002609019756, + "loss": 0.0267, + "step": 2760 + }, + { + "epoch": 0.5161410537103461, + "grad_norm": 0.2023041844367981, + "learning_rate": 0.00016558578705429247, + "loss": 0.0218, + "step": 2770 + }, + { + "epoch": 0.5180043788139935, + "grad_norm": 0.32125502824783325, + "learning_rate": 0.0001654615480183874, + "loss": 0.0295, + "step": 2780 + }, + { + "epoch": 0.519867703917641, + "grad_norm": 0.21208274364471436, + "learning_rate": 0.0001653373089824823, + "loss": 0.0242, + "step": 2790 + }, + { + "epoch": 0.5217310290212885, + "grad_norm": 0.22862495481967926, + "learning_rate": 0.00016521306994657722, + "loss": 0.038, + "step": 2800 + }, + { + "epoch": 0.5235943541249359, + "grad_norm": 0.1913980096578598, + "learning_rate": 0.00016508883091067214, + "loss": 0.0308, + "step": 2810 + }, + { + "epoch": 0.5254576792285834, + "grad_norm": 0.14556388556957245, + "learning_rate": 0.00016496459187476705, + "loss": 0.0329, + "step": 2820 + }, + { + "epoch": 0.5273210043322308, + "grad_norm": 0.19217199087142944, + "learning_rate": 0.000164840352838862, + "loss": 0.0253, + "step": 2830 + }, + { + "epoch": 0.5291843294358783, + "grad_norm": 0.11398417502641678, + "learning_rate": 0.0001647161138029569, + "loss": 0.0205, + "step": 2840 + }, + { + "epoch": 0.5310476545395257, + "grad_norm": 0.16399814188480377, + "learning_rate": 0.00016459187476705183, + "loss": 0.0222, + "step": 2850 + }, + { + "epoch": 0.5329109796431732, + "grad_norm": 0.4391205906867981, + "learning_rate": 0.00016446763573114674, + "loss": 0.0288, + "step": 2860 + }, + { + "epoch": 0.5347743047468208, + "grad_norm": 0.1345457136631012, + "learning_rate": 0.00016434339669524166, + "loss": 0.0236, + "step": 2870 + }, + { + "epoch": 0.5366376298504681, + "grad_norm": 0.18585018813610077, + "learning_rate": 0.00016421915765933658, + "loss": 0.0262, + "step": 2880 + }, + { + "epoch": 0.5385009549541157, + "grad_norm": 0.26273420453071594, + "learning_rate": 0.0001640949186234315, + "loss": 0.0278, + "step": 2890 + }, + { + "epoch": 0.540364280057763, + "grad_norm": 0.21691930294036865, + "learning_rate": 0.0001639706795875264, + "loss": 0.0246, + "step": 2900 + }, + { + "epoch": 0.5422276051614106, + "grad_norm": 0.21845707297325134, + "learning_rate": 0.00016384644055162132, + "loss": 0.0361, + "step": 2910 + }, + { + "epoch": 0.544090930265058, + "grad_norm": 0.16350382566452026, + "learning_rate": 0.00016372220151571624, + "loss": 0.0252, + "step": 2920 + }, + { + "epoch": 0.5459542553687055, + "grad_norm": 0.16099347174167633, + "learning_rate": 0.00016359796247981115, + "loss": 0.0219, + "step": 2930 + }, + { + "epoch": 0.547817580472353, + "grad_norm": 0.16874344646930695, + "learning_rate": 0.00016347372344390607, + "loss": 0.0204, + "step": 2940 + }, + { + "epoch": 0.5496809055760004, + "grad_norm": 0.45683175325393677, + "learning_rate": 0.00016334948440800099, + "loss": 0.0257, + "step": 2950 + }, + { + "epoch": 0.5515442306796479, + "grad_norm": 0.3147335648536682, + "learning_rate": 0.0001632252453720959, + "loss": 0.0238, + "step": 2960 + }, + { + "epoch": 0.5534075557832953, + "grad_norm": 0.14535823464393616, + "learning_rate": 0.00016310100633619082, + "loss": 0.0317, + "step": 2970 + }, + { + "epoch": 0.5552708808869428, + "grad_norm": 0.17752012610435486, + "learning_rate": 0.00016297676730028576, + "loss": 0.0332, + "step": 2980 + }, + { + "epoch": 0.5571342059905902, + "grad_norm": 0.1567634791135788, + "learning_rate": 0.00016285252826438068, + "loss": 0.026, + "step": 2990 + }, + { + "epoch": 0.5589975310942377, + "grad_norm": 0.12023455649614334, + "learning_rate": 0.0001627282892284756, + "loss": 0.0244, + "step": 3000 + }, + { + "epoch": 0.5608608561978852, + "grad_norm": 0.20631061494350433, + "learning_rate": 0.00016260405019257054, + "loss": 0.0289, + "step": 3010 + }, + { + "epoch": 0.5627241813015326, + "grad_norm": 0.15248391032218933, + "learning_rate": 0.00016247981115666545, + "loss": 0.0245, + "step": 3020 + }, + { + "epoch": 0.5645875064051801, + "grad_norm": 0.1587265431880951, + "learning_rate": 0.00016235557212076037, + "loss": 0.0375, + "step": 3030 + }, + { + "epoch": 0.5664508315088275, + "grad_norm": 0.12269464135169983, + "learning_rate": 0.00016223133308485528, + "loss": 0.0212, + "step": 3040 + }, + { + "epoch": 0.568314156612475, + "grad_norm": 0.1672634780406952, + "learning_rate": 0.0001621070940489502, + "loss": 0.0285, + "step": 3050 + }, + { + "epoch": 0.5701774817161224, + "grad_norm": 0.1095886081457138, + "learning_rate": 0.00016198285501304511, + "loss": 0.0245, + "step": 3060 + }, + { + "epoch": 0.5720408068197699, + "grad_norm": 0.19466912746429443, + "learning_rate": 0.00016185861597714003, + "loss": 0.03, + "step": 3070 + }, + { + "epoch": 0.5739041319234174, + "grad_norm": 0.13806433975696564, + "learning_rate": 0.00016173437694123495, + "loss": 0.0307, + "step": 3080 + }, + { + "epoch": 0.5757674570270648, + "grad_norm": 0.08115004748106003, + "learning_rate": 0.00016161013790532986, + "loss": 0.0239, + "step": 3090 + }, + { + "epoch": 0.5776307821307123, + "grad_norm": 0.13298851251602173, + "learning_rate": 0.00016148589886942478, + "loss": 0.0298, + "step": 3100 + }, + { + "epoch": 0.5794941072343597, + "grad_norm": 0.1280878335237503, + "learning_rate": 0.0001613616598335197, + "loss": 0.0222, + "step": 3110 + }, + { + "epoch": 0.5813574323380072, + "grad_norm": 0.18590818345546722, + "learning_rate": 0.0001612374207976146, + "loss": 0.0247, + "step": 3120 + }, + { + "epoch": 0.5832207574416546, + "grad_norm": 0.1540619432926178, + "learning_rate": 0.00016111318176170953, + "loss": 0.028, + "step": 3130 + }, + { + "epoch": 0.5850840825453021, + "grad_norm": 0.18428578972816467, + "learning_rate": 0.00016098894272580444, + "loss": 0.0297, + "step": 3140 + }, + { + "epoch": 0.5869474076489496, + "grad_norm": 0.155143141746521, + "learning_rate": 0.00016086470368989936, + "loss": 0.0252, + "step": 3150 + }, + { + "epoch": 0.588810732752597, + "grad_norm": 0.10080347210168839, + "learning_rate": 0.00016074046465399427, + "loss": 0.032, + "step": 3160 + }, + { + "epoch": 0.5906740578562445, + "grad_norm": 0.12629786133766174, + "learning_rate": 0.00016061622561808922, + "loss": 0.0233, + "step": 3170 + }, + { + "epoch": 0.5925373829598919, + "grad_norm": 0.11422615498304367, + "learning_rate": 0.00016049198658218413, + "loss": 0.0226, + "step": 3180 + }, + { + "epoch": 0.5944007080635394, + "grad_norm": 0.34021082520484924, + "learning_rate": 0.00016036774754627905, + "loss": 0.0282, + "step": 3190 + }, + { + "epoch": 0.5962640331671868, + "grad_norm": 0.1320790946483612, + "learning_rate": 0.00016024350851037396, + "loss": 0.0285, + "step": 3200 + }, + { + "epoch": 0.5981273582708343, + "grad_norm": 0.20607562363147736, + "learning_rate": 0.00016011926947446888, + "loss": 0.0373, + "step": 3210 + }, + { + "epoch": 0.5999906833744818, + "grad_norm": 0.17851243913173676, + "learning_rate": 0.00015999503043856382, + "loss": 0.0227, + "step": 3220 + }, + { + "epoch": 0.6018540084781292, + "grad_norm": 0.118630051612854, + "learning_rate": 0.00015987079140265874, + "loss": 0.0261, + "step": 3230 + }, + { + "epoch": 0.6037173335817767, + "grad_norm": 0.1931089609861374, + "learning_rate": 0.00015974655236675365, + "loss": 0.0242, + "step": 3240 + }, + { + "epoch": 0.6055806586854241, + "grad_norm": 0.1010395735502243, + "learning_rate": 0.00015962231333084857, + "loss": 0.0472, + "step": 3250 + }, + { + "epoch": 0.6074439837890716, + "grad_norm": 0.3368585705757141, + "learning_rate": 0.00015949807429494348, + "loss": 0.0236, + "step": 3260 + }, + { + "epoch": 0.609307308892719, + "grad_norm": 0.23170095682144165, + "learning_rate": 0.0001593738352590384, + "loss": 0.0188, + "step": 3270 + }, + { + "epoch": 0.6111706339963665, + "grad_norm": 0.20712672173976898, + "learning_rate": 0.00015924959622313332, + "loss": 0.0342, + "step": 3280 + }, + { + "epoch": 0.613033959100014, + "grad_norm": 0.2359304428100586, + "learning_rate": 0.00015912535718722823, + "loss": 0.0226, + "step": 3290 + }, + { + "epoch": 0.6148972842036614, + "grad_norm": 0.19130489230155945, + "learning_rate": 0.00015900111815132315, + "loss": 0.0277, + "step": 3300 + }, + { + "epoch": 0.6167606093073089, + "grad_norm": 0.10351112484931946, + "learning_rate": 0.00015887687911541806, + "loss": 0.0246, + "step": 3310 + }, + { + "epoch": 0.6186239344109563, + "grad_norm": 0.15079782903194427, + "learning_rate": 0.00015875264007951298, + "loss": 0.0247, + "step": 3320 + }, + { + "epoch": 0.6204872595146038, + "grad_norm": 0.13707318902015686, + "learning_rate": 0.0001586284010436079, + "loss": 0.0246, + "step": 3330 + }, + { + "epoch": 0.6223505846182513, + "grad_norm": 0.3334473669528961, + "learning_rate": 0.00015850416200770284, + "loss": 0.026, + "step": 3340 + }, + { + "epoch": 0.6242139097218987, + "grad_norm": 0.14594854414463043, + "learning_rate": 0.00015837992297179775, + "loss": 0.0231, + "step": 3350 + }, + { + "epoch": 0.6260772348255462, + "grad_norm": 0.17928946018218994, + "learning_rate": 0.00015825568393589267, + "loss": 0.0243, + "step": 3360 + }, + { + "epoch": 0.6279405599291936, + "grad_norm": 0.18053315579891205, + "learning_rate": 0.00015813144489998759, + "loss": 0.0184, + "step": 3370 + }, + { + "epoch": 0.6298038850328411, + "grad_norm": 0.1368819922208786, + "learning_rate": 0.0001580072058640825, + "loss": 0.0229, + "step": 3380 + }, + { + "epoch": 0.6316672101364885, + "grad_norm": 0.1753019094467163, + "learning_rate": 0.00015788296682817742, + "loss": 0.0244, + "step": 3390 + }, + { + "epoch": 0.633530535240136, + "grad_norm": 0.189280703663826, + "learning_rate": 0.00015775872779227233, + "loss": 0.023, + "step": 3400 + }, + { + "epoch": 0.6353938603437835, + "grad_norm": 0.10799846053123474, + "learning_rate": 0.00015763448875636725, + "loss": 0.0232, + "step": 3410 + }, + { + "epoch": 0.6372571854474309, + "grad_norm": 0.14953063428401947, + "learning_rate": 0.00015751024972046216, + "loss": 0.0242, + "step": 3420 + }, + { + "epoch": 0.6391205105510784, + "grad_norm": 0.46299856901168823, + "learning_rate": 0.0001573860106845571, + "loss": 0.0346, + "step": 3430 + }, + { + "epoch": 0.6409838356547258, + "grad_norm": 0.21256397664546967, + "learning_rate": 0.00015726177164865202, + "loss": 0.0284, + "step": 3440 + }, + { + "epoch": 0.6428471607583733, + "grad_norm": 0.21798713505268097, + "learning_rate": 0.00015713753261274694, + "loss": 0.0299, + "step": 3450 + }, + { + "epoch": 0.6447104858620207, + "grad_norm": 0.16905297338962555, + "learning_rate": 0.00015701329357684186, + "loss": 0.0206, + "step": 3460 + }, + { + "epoch": 0.6465738109656682, + "grad_norm": 0.11575949192047119, + "learning_rate": 0.00015688905454093677, + "loss": 0.0247, + "step": 3470 + }, + { + "epoch": 0.6484371360693157, + "grad_norm": 0.11626104265451431, + "learning_rate": 0.0001567648155050317, + "loss": 0.0233, + "step": 3480 + }, + { + "epoch": 0.6503004611729631, + "grad_norm": 0.214884415268898, + "learning_rate": 0.0001566405764691266, + "loss": 0.0177, + "step": 3490 + }, + { + "epoch": 0.6521637862766106, + "grad_norm": 0.14941707253456116, + "learning_rate": 0.00015651633743322152, + "loss": 0.0218, + "step": 3500 + }, + { + "epoch": 0.654027111380258, + "grad_norm": 0.1662212312221527, + "learning_rate": 0.00015639209839731643, + "loss": 0.0262, + "step": 3510 + }, + { + "epoch": 0.6558904364839055, + "grad_norm": 0.1752852499485016, + "learning_rate": 0.00015626785936141138, + "loss": 0.024, + "step": 3520 + }, + { + "epoch": 0.6577537615875529, + "grad_norm": 0.15072235465049744, + "learning_rate": 0.0001561436203255063, + "loss": 0.0234, + "step": 3530 + }, + { + "epoch": 0.6596170866912004, + "grad_norm": 0.18537260591983795, + "learning_rate": 0.0001560193812896012, + "loss": 0.0268, + "step": 3540 + }, + { + "epoch": 0.661480411794848, + "grad_norm": 0.10309719294309616, + "learning_rate": 0.00015589514225369612, + "loss": 0.029, + "step": 3550 + }, + { + "epoch": 0.6633437368984954, + "grad_norm": 0.14345987141132355, + "learning_rate": 0.00015577090321779104, + "loss": 0.0192, + "step": 3560 + }, + { + "epoch": 0.6652070620021429, + "grad_norm": 0.12496069073677063, + "learning_rate": 0.00015564666418188596, + "loss": 0.0173, + "step": 3570 + }, + { + "epoch": 0.6670703871057903, + "grad_norm": 0.15995663404464722, + "learning_rate": 0.00015552242514598087, + "loss": 0.02, + "step": 3580 + }, + { + "epoch": 0.6689337122094378, + "grad_norm": 0.1138169914484024, + "learning_rate": 0.0001553981861100758, + "loss": 0.0156, + "step": 3590 + }, + { + "epoch": 0.6707970373130852, + "grad_norm": 0.2230171263217926, + "learning_rate": 0.0001552739470741707, + "loss": 0.0297, + "step": 3600 + }, + { + "epoch": 0.6726603624167327, + "grad_norm": 0.08599012345075607, + "learning_rate": 0.00015514970803826562, + "loss": 0.0292, + "step": 3610 + }, + { + "epoch": 0.6745236875203802, + "grad_norm": 0.1836289018392563, + "learning_rate": 0.00015502546900236054, + "loss": 0.0229, + "step": 3620 + }, + { + "epoch": 0.6763870126240276, + "grad_norm": 0.20711427927017212, + "learning_rate": 0.00015490122996645545, + "loss": 0.0206, + "step": 3630 + }, + { + "epoch": 0.6782503377276751, + "grad_norm": 0.23631249368190765, + "learning_rate": 0.0001547769909305504, + "loss": 0.0188, + "step": 3640 + }, + { + "epoch": 0.6801136628313225, + "grad_norm": 0.28623124957084656, + "learning_rate": 0.0001546527518946453, + "loss": 0.0279, + "step": 3650 + }, + { + "epoch": 0.68197698793497, + "grad_norm": 0.16205276548862457, + "learning_rate": 0.00015452851285874023, + "loss": 0.0205, + "step": 3660 + }, + { + "epoch": 0.6838403130386174, + "grad_norm": 0.3658526539802551, + "learning_rate": 0.00015440427382283514, + "loss": 0.0345, + "step": 3670 + }, + { + "epoch": 0.6857036381422649, + "grad_norm": 0.19878637790679932, + "learning_rate": 0.00015428003478693006, + "loss": 0.0188, + "step": 3680 + }, + { + "epoch": 0.6875669632459124, + "grad_norm": 0.148567795753479, + "learning_rate": 0.000154155795751025, + "loss": 0.018, + "step": 3690 + }, + { + "epoch": 0.6894302883495598, + "grad_norm": 0.0717538595199585, + "learning_rate": 0.00015403155671511992, + "loss": 0.0187, + "step": 3700 + }, + { + "epoch": 0.6912936134532073, + "grad_norm": 0.18270643055438995, + "learning_rate": 0.00015390731767921483, + "loss": 0.0323, + "step": 3710 + }, + { + "epoch": 0.6931569385568547, + "grad_norm": 0.10219421982765198, + "learning_rate": 0.00015378307864330975, + "loss": 0.0207, + "step": 3720 + }, + { + "epoch": 0.6950202636605022, + "grad_norm": 0.15821602940559387, + "learning_rate": 0.00015365883960740466, + "loss": 0.0324, + "step": 3730 + }, + { + "epoch": 0.6968835887641496, + "grad_norm": 0.23627109825611115, + "learning_rate": 0.00015353460057149958, + "loss": 0.0237, + "step": 3740 + }, + { + "epoch": 0.6987469138677971, + "grad_norm": 0.1554834395647049, + "learning_rate": 0.0001534103615355945, + "loss": 0.0254, + "step": 3750 + }, + { + "epoch": 0.7006102389714446, + "grad_norm": 0.13305582106113434, + "learning_rate": 0.0001532861224996894, + "loss": 0.021, + "step": 3760 + }, + { + "epoch": 0.702473564075092, + "grad_norm": 0.11506448686122894, + "learning_rate": 0.00015316188346378433, + "loss": 0.0342, + "step": 3770 + }, + { + "epoch": 0.7043368891787395, + "grad_norm": 0.33708521723747253, + "learning_rate": 0.00015303764442787924, + "loss": 0.0231, + "step": 3780 + }, + { + "epoch": 0.7062002142823869, + "grad_norm": 0.15879079699516296, + "learning_rate": 0.00015291340539197416, + "loss": 0.0318, + "step": 3790 + }, + { + "epoch": 0.7080635393860344, + "grad_norm": 0.13867144286632538, + "learning_rate": 0.00015278916635606907, + "loss": 0.0282, + "step": 3800 + }, + { + "epoch": 0.7099268644896818, + "grad_norm": 0.1752207726240158, + "learning_rate": 0.000152664927320164, + "loss": 0.0259, + "step": 3810 + }, + { + "epoch": 0.7117901895933293, + "grad_norm": 0.1792452335357666, + "learning_rate": 0.0001525406882842589, + "loss": 0.0285, + "step": 3820 + }, + { + "epoch": 0.7136535146969768, + "grad_norm": 0.14697203040122986, + "learning_rate": 0.00015241644924835382, + "loss": 0.0277, + "step": 3830 + }, + { + "epoch": 0.7155168398006242, + "grad_norm": 0.1709863245487213, + "learning_rate": 0.00015229221021244874, + "loss": 0.0226, + "step": 3840 + }, + { + "epoch": 0.7173801649042717, + "grad_norm": 0.2217591106891632, + "learning_rate": 0.00015216797117654365, + "loss": 0.0214, + "step": 3850 + }, + { + "epoch": 0.7192434900079191, + "grad_norm": 0.13732624053955078, + "learning_rate": 0.0001520437321406386, + "loss": 0.0227, + "step": 3860 + }, + { + "epoch": 0.7211068151115666, + "grad_norm": 0.1472344845533371, + "learning_rate": 0.0001519194931047335, + "loss": 0.0196, + "step": 3870 + }, + { + "epoch": 0.722970140215214, + "grad_norm": 0.26150020956993103, + "learning_rate": 0.00015179525406882843, + "loss": 0.0278, + "step": 3880 + }, + { + "epoch": 0.7248334653188615, + "grad_norm": 0.12479466199874878, + "learning_rate": 0.00015167101503292337, + "loss": 0.0277, + "step": 3890 + }, + { + "epoch": 0.726696790422509, + "grad_norm": 0.09242439270019531, + "learning_rate": 0.00015154677599701829, + "loss": 0.0209, + "step": 3900 + }, + { + "epoch": 0.7285601155261564, + "grad_norm": 0.1810086965560913, + "learning_rate": 0.0001514225369611132, + "loss": 0.0228, + "step": 3910 + }, + { + "epoch": 0.7304234406298039, + "grad_norm": 0.10720740258693695, + "learning_rate": 0.00015129829792520812, + "loss": 0.0179, + "step": 3920 + }, + { + "epoch": 0.7322867657334513, + "grad_norm": 0.12426480650901794, + "learning_rate": 0.00015117405888930303, + "loss": 0.0203, + "step": 3930 + }, + { + "epoch": 0.7341500908370988, + "grad_norm": 0.11750097572803497, + "learning_rate": 0.00015104981985339795, + "loss": 0.0211, + "step": 3940 + }, + { + "epoch": 0.7360134159407462, + "grad_norm": 0.12620018422603607, + "learning_rate": 0.00015092558081749287, + "loss": 0.0186, + "step": 3950 + }, + { + "epoch": 0.7378767410443937, + "grad_norm": 0.13408900797367096, + "learning_rate": 0.00015080134178158778, + "loss": 0.0271, + "step": 3960 + }, + { + "epoch": 0.7397400661480412, + "grad_norm": 0.12148467451334, + "learning_rate": 0.0001506771027456827, + "loss": 0.0214, + "step": 3970 + }, + { + "epoch": 0.7416033912516886, + "grad_norm": 0.13083823025226593, + "learning_rate": 0.0001505528637097776, + "loss": 0.0186, + "step": 3980 + }, + { + "epoch": 0.7434667163553361, + "grad_norm": 0.10845719277858734, + "learning_rate": 0.00015042862467387253, + "loss": 0.0233, + "step": 3990 + }, + { + "epoch": 0.7453300414589835, + "grad_norm": 0.14539609849452972, + "learning_rate": 0.00015030438563796744, + "loss": 0.0267, + "step": 4000 + }, + { + "epoch": 0.747193366562631, + "grad_norm": 0.1348503679037094, + "learning_rate": 0.00015018014660206236, + "loss": 0.0264, + "step": 4010 + }, + { + "epoch": 0.7490566916662785, + "grad_norm": 0.12877729535102844, + "learning_rate": 0.00015005590756615728, + "loss": 0.0224, + "step": 4020 + }, + { + "epoch": 0.7509200167699259, + "grad_norm": 0.3890345096588135, + "learning_rate": 0.00014993166853025222, + "loss": 0.0212, + "step": 4030 + }, + { + "epoch": 0.7527833418735734, + "grad_norm": 0.21587280929088593, + "learning_rate": 0.00014980742949434713, + "loss": 0.0295, + "step": 4040 + }, + { + "epoch": 0.7546466669772208, + "grad_norm": 0.10658788681030273, + "learning_rate": 0.00014968319045844205, + "loss": 0.0189, + "step": 4050 + }, + { + "epoch": 0.7565099920808683, + "grad_norm": 0.10835613310337067, + "learning_rate": 0.00014955895142253697, + "loss": 0.0233, + "step": 4060 + }, + { + "epoch": 0.7583733171845157, + "grad_norm": 0.22986513376235962, + "learning_rate": 0.00014943471238663188, + "loss": 0.0272, + "step": 4070 + }, + { + "epoch": 0.7602366422881632, + "grad_norm": 0.11246643960475922, + "learning_rate": 0.0001493104733507268, + "loss": 0.021, + "step": 4080 + }, + { + "epoch": 0.7620999673918107, + "grad_norm": 0.16520898044109344, + "learning_rate": 0.00014918623431482171, + "loss": 0.0192, + "step": 4090 + }, + { + "epoch": 0.7639632924954581, + "grad_norm": 0.40314796566963196, + "learning_rate": 0.00014906199527891666, + "loss": 0.0397, + "step": 4100 + }, + { + "epoch": 0.7658266175991056, + "grad_norm": 0.0877520963549614, + "learning_rate": 0.00014893775624301157, + "loss": 0.0253, + "step": 4110 + }, + { + "epoch": 0.767689942702753, + "grad_norm": 0.16271091997623444, + "learning_rate": 0.0001488135172071065, + "loss": 0.022, + "step": 4120 + }, + { + "epoch": 0.7695532678064005, + "grad_norm": 0.21588853001594543, + "learning_rate": 0.0001486892781712014, + "loss": 0.023, + "step": 4130 + }, + { + "epoch": 0.7714165929100479, + "grad_norm": 0.2022632360458374, + "learning_rate": 0.00014856503913529632, + "loss": 0.0188, + "step": 4140 + }, + { + "epoch": 0.7732799180136954, + "grad_norm": 0.11369698494672775, + "learning_rate": 0.00014844080009939124, + "loss": 0.0278, + "step": 4150 + }, + { + "epoch": 0.775143243117343, + "grad_norm": 0.4832044243812561, + "learning_rate": 0.00014831656106348615, + "loss": 0.0239, + "step": 4160 + }, + { + "epoch": 0.7770065682209903, + "grad_norm": 0.1868063062429428, + "learning_rate": 0.00014819232202758107, + "loss": 0.0256, + "step": 4170 + }, + { + "epoch": 0.7788698933246379, + "grad_norm": 0.1666928231716156, + "learning_rate": 0.00014806808299167598, + "loss": 0.0267, + "step": 4180 + }, + { + "epoch": 0.7807332184282852, + "grad_norm": 0.14221766591072083, + "learning_rate": 0.0001479438439557709, + "loss": 0.0174, + "step": 4190 + }, + { + "epoch": 0.7825965435319328, + "grad_norm": 0.11577267944812775, + "learning_rate": 0.00014781960491986584, + "loss": 0.0243, + "step": 4200 + }, + { + "epoch": 0.7844598686355801, + "grad_norm": 0.1612447202205658, + "learning_rate": 0.00014769536588396076, + "loss": 0.0221, + "step": 4210 + }, + { + "epoch": 0.7863231937392277, + "grad_norm": 0.11659658700227737, + "learning_rate": 0.00014757112684805567, + "loss": 0.0433, + "step": 4220 + }, + { + "epoch": 0.7881865188428752, + "grad_norm": 0.31002146005630493, + "learning_rate": 0.0001474468878121506, + "loss": 0.0232, + "step": 4230 + }, + { + "epoch": 0.7900498439465226, + "grad_norm": 0.6392256021499634, + "learning_rate": 0.0001473226487762455, + "loss": 0.0297, + "step": 4240 + }, + { + "epoch": 0.7919131690501701, + "grad_norm": 0.1782752126455307, + "learning_rate": 0.00014719840974034042, + "loss": 0.0183, + "step": 4250 + }, + { + "epoch": 0.7937764941538175, + "grad_norm": 0.1695147305727005, + "learning_rate": 0.00014707417070443534, + "loss": 0.0201, + "step": 4260 + }, + { + "epoch": 0.795639819257465, + "grad_norm": 0.19251610338687897, + "learning_rate": 0.00014694993166853025, + "loss": 0.0222, + "step": 4270 + }, + { + "epoch": 0.7975031443611124, + "grad_norm": 0.12733601033687592, + "learning_rate": 0.00014682569263262517, + "loss": 0.0164, + "step": 4280 + }, + { + "epoch": 0.7993664694647599, + "grad_norm": 0.14594994485378265, + "learning_rate": 0.00014670145359672008, + "loss": 0.0219, + "step": 4290 + }, + { + "epoch": 0.8012297945684074, + "grad_norm": 0.17092078924179077, + "learning_rate": 0.000146577214560815, + "loss": 0.0237, + "step": 4300 + }, + { + "epoch": 0.8030931196720548, + "grad_norm": 0.1365964561700821, + "learning_rate": 0.00014645297552490994, + "loss": 0.0198, + "step": 4310 + }, + { + "epoch": 0.8049564447757023, + "grad_norm": 0.28921860456466675, + "learning_rate": 0.00014632873648900486, + "loss": 0.0353, + "step": 4320 + }, + { + "epoch": 0.8068197698793497, + "grad_norm": 0.17445138096809387, + "learning_rate": 0.00014620449745309977, + "loss": 0.022, + "step": 4330 + }, + { + "epoch": 0.8086830949829972, + "grad_norm": 0.1306416541337967, + "learning_rate": 0.0001460802584171947, + "loss": 0.0174, + "step": 4340 + }, + { + "epoch": 0.8105464200866446, + "grad_norm": 0.19088391959667206, + "learning_rate": 0.0001459560193812896, + "loss": 0.021, + "step": 4350 + }, + { + "epoch": 0.8124097451902921, + "grad_norm": 0.11422597616910934, + "learning_rate": 0.00014583178034538452, + "loss": 0.0194, + "step": 4360 + }, + { + "epoch": 0.8142730702939396, + "grad_norm": 0.18395616114139557, + "learning_rate": 0.00014570754130947946, + "loss": 0.0262, + "step": 4370 + }, + { + "epoch": 0.816136395397587, + "grad_norm": 0.3011746108531952, + "learning_rate": 0.00014558330227357438, + "loss": 0.0255, + "step": 4380 + }, + { + "epoch": 0.8179997205012345, + "grad_norm": 0.1338394582271576, + "learning_rate": 0.0001454590632376693, + "loss": 0.0228, + "step": 4390 + }, + { + "epoch": 0.8198630456048819, + "grad_norm": 0.2081775963306427, + "learning_rate": 0.0001453348242017642, + "loss": 0.0205, + "step": 4400 + }, + { + "epoch": 0.8217263707085294, + "grad_norm": 0.28005295991897583, + "learning_rate": 0.00014521058516585913, + "loss": 0.0212, + "step": 4410 + }, + { + "epoch": 0.8235896958121768, + "grad_norm": 0.12276031076908112, + "learning_rate": 0.00014508634612995404, + "loss": 0.0213, + "step": 4420 + }, + { + "epoch": 0.8254530209158243, + "grad_norm": 0.11224307119846344, + "learning_rate": 0.00014496210709404896, + "loss": 0.0238, + "step": 4430 + }, + { + "epoch": 0.8273163460194718, + "grad_norm": 0.15174053609371185, + "learning_rate": 0.00014483786805814388, + "loss": 0.0193, + "step": 4440 + }, + { + "epoch": 0.8291796711231192, + "grad_norm": 0.13163380324840546, + "learning_rate": 0.0001447136290222388, + "loss": 0.0181, + "step": 4450 + }, + { + "epoch": 0.8310429962267667, + "grad_norm": 0.1232830286026001, + "learning_rate": 0.0001445893899863337, + "loss": 0.0238, + "step": 4460 + }, + { + "epoch": 0.8329063213304141, + "grad_norm": 0.13815902173519135, + "learning_rate": 0.00014446515095042862, + "loss": 0.0181, + "step": 4470 + }, + { + "epoch": 0.8347696464340616, + "grad_norm": 0.17953529953956604, + "learning_rate": 0.00014434091191452354, + "loss": 0.0184, + "step": 4480 + }, + { + "epoch": 0.836632971537709, + "grad_norm": 0.15750688314437866, + "learning_rate": 0.00014421667287861845, + "loss": 0.0288, + "step": 4490 + }, + { + "epoch": 0.8384962966413565, + "grad_norm": 0.14837270975112915, + "learning_rate": 0.00014409243384271337, + "loss": 0.0166, + "step": 4500 + }, + { + "epoch": 0.840359621745004, + "grad_norm": 0.19016404449939728, + "learning_rate": 0.00014396819480680829, + "loss": 0.0248, + "step": 4510 + }, + { + "epoch": 0.8422229468486514, + "grad_norm": 0.10917269438505173, + "learning_rate": 0.00014384395577090323, + "loss": 0.0216, + "step": 4520 + }, + { + "epoch": 0.8440862719522989, + "grad_norm": 0.12285543233156204, + "learning_rate": 0.00014371971673499814, + "loss": 0.0283, + "step": 4530 + }, + { + "epoch": 0.8459495970559463, + "grad_norm": 0.14190028607845306, + "learning_rate": 0.00014359547769909306, + "loss": 0.0187, + "step": 4540 + }, + { + "epoch": 0.8478129221595938, + "grad_norm": 0.1671181321144104, + "learning_rate": 0.00014347123866318798, + "loss": 0.0201, + "step": 4550 + }, + { + "epoch": 0.8496762472632412, + "grad_norm": 0.17794017493724823, + "learning_rate": 0.00014334699962728292, + "loss": 0.0146, + "step": 4560 + }, + { + "epoch": 0.8515395723668887, + "grad_norm": 0.11655906587839127, + "learning_rate": 0.00014322276059137784, + "loss": 0.0216, + "step": 4570 + }, + { + "epoch": 0.8534028974705362, + "grad_norm": 0.10486368089914322, + "learning_rate": 0.00014309852155547275, + "loss": 0.0246, + "step": 4580 + }, + { + "epoch": 0.8552662225741836, + "grad_norm": 0.12611308693885803, + "learning_rate": 0.00014297428251956767, + "loss": 0.0211, + "step": 4590 + }, + { + "epoch": 0.8571295476778311, + "grad_norm": 0.22120191156864166, + "learning_rate": 0.00014285004348366258, + "loss": 0.0198, + "step": 4600 + }, + { + "epoch": 0.8589928727814785, + "grad_norm": 0.21815341711044312, + "learning_rate": 0.0001427258044477575, + "loss": 0.023, + "step": 4610 + }, + { + "epoch": 0.860856197885126, + "grad_norm": 0.11714211106300354, + "learning_rate": 0.00014260156541185241, + "loss": 0.0286, + "step": 4620 + }, + { + "epoch": 0.8627195229887734, + "grad_norm": 0.1074879840016365, + "learning_rate": 0.00014247732637594733, + "loss": 0.0198, + "step": 4630 + }, + { + "epoch": 0.8645828480924209, + "grad_norm": 0.1850721836090088, + "learning_rate": 0.00014235308734004225, + "loss": 0.0244, + "step": 4640 + }, + { + "epoch": 0.8664461731960684, + "grad_norm": 0.18282188475131989, + "learning_rate": 0.00014222884830413716, + "loss": 0.0183, + "step": 4650 + }, + { + "epoch": 0.8683094982997158, + "grad_norm": 0.20297600328922272, + "learning_rate": 0.00014210460926823208, + "loss": 0.0209, + "step": 4660 + }, + { + "epoch": 0.8701728234033633, + "grad_norm": 0.10349154472351074, + "learning_rate": 0.000141980370232327, + "loss": 0.0359, + "step": 4670 + }, + { + "epoch": 0.8720361485070107, + "grad_norm": 0.25054970383644104, + "learning_rate": 0.0001418561311964219, + "loss": 0.0185, + "step": 4680 + }, + { + "epoch": 0.8738994736106582, + "grad_norm": 0.1190393716096878, + "learning_rate": 0.00014173189216051683, + "loss": 0.0193, + "step": 4690 + }, + { + "epoch": 0.8757627987143056, + "grad_norm": 0.24704180657863617, + "learning_rate": 0.00014160765312461174, + "loss": 0.0209, + "step": 4700 + }, + { + "epoch": 0.8776261238179531, + "grad_norm": 0.14505434036254883, + "learning_rate": 0.00014148341408870668, + "loss": 0.0189, + "step": 4710 + }, + { + "epoch": 0.8794894489216006, + "grad_norm": 0.1667511761188507, + "learning_rate": 0.0001413591750528016, + "loss": 0.0198, + "step": 4720 + }, + { + "epoch": 0.881352774025248, + "grad_norm": 0.15274561941623688, + "learning_rate": 0.00014123493601689652, + "loss": 0.0163, + "step": 4730 + }, + { + "epoch": 0.8832160991288955, + "grad_norm": 0.1500704288482666, + "learning_rate": 0.00014111069698099143, + "loss": 0.0153, + "step": 4740 + }, + { + "epoch": 0.8850794242325429, + "grad_norm": 0.4478381276130676, + "learning_rate": 0.00014098645794508635, + "loss": 0.0395, + "step": 4750 + }, + { + "epoch": 0.8869427493361904, + "grad_norm": 0.2906605005264282, + "learning_rate": 0.00014086221890918126, + "loss": 0.0224, + "step": 4760 + }, + { + "epoch": 0.888806074439838, + "grad_norm": 0.1912839412689209, + "learning_rate": 0.0001407379798732762, + "loss": 0.0279, + "step": 4770 + }, + { + "epoch": 0.8906693995434853, + "grad_norm": 0.08297467231750488, + "learning_rate": 0.00014061374083737112, + "loss": 0.0201, + "step": 4780 + }, + { + "epoch": 0.8925327246471328, + "grad_norm": 0.18305924534797668, + "learning_rate": 0.00014048950180146604, + "loss": 0.0206, + "step": 4790 + }, + { + "epoch": 0.8943960497507802, + "grad_norm": 0.1690533459186554, + "learning_rate": 0.00014036526276556095, + "loss": 0.0176, + "step": 4800 + }, + { + "epoch": 0.8962593748544277, + "grad_norm": 0.16443593800067902, + "learning_rate": 0.00014024102372965587, + "loss": 0.0209, + "step": 4810 + }, + { + "epoch": 0.8981226999580751, + "grad_norm": 0.19682125747203827, + "learning_rate": 0.00014011678469375078, + "loss": 0.0193, + "step": 4820 + }, + { + "epoch": 0.8999860250617226, + "grad_norm": 0.15023230016231537, + "learning_rate": 0.0001399925456578457, + "loss": 0.0198, + "step": 4830 + }, + { + "epoch": 0.9018493501653702, + "grad_norm": 0.10205192863941193, + "learning_rate": 0.00013986830662194062, + "loss": 0.0209, + "step": 4840 + }, + { + "epoch": 0.9037126752690176, + "grad_norm": 0.10337179154157639, + "learning_rate": 0.00013974406758603553, + "loss": 0.0172, + "step": 4850 + }, + { + "epoch": 0.9055760003726651, + "grad_norm": 0.18049459159374237, + "learning_rate": 0.00013961982855013045, + "loss": 0.0234, + "step": 4860 + }, + { + "epoch": 0.9074393254763125, + "grad_norm": 0.11622573435306549, + "learning_rate": 0.00013949558951422536, + "loss": 0.0207, + "step": 4870 + }, + { + "epoch": 0.90930265057996, + "grad_norm": 0.22294333577156067, + "learning_rate": 0.00013937135047832028, + "loss": 0.0185, + "step": 4880 + }, + { + "epoch": 0.9111659756836074, + "grad_norm": 0.37016913294792175, + "learning_rate": 0.00013924711144241522, + "loss": 0.0238, + "step": 4890 + }, + { + "epoch": 0.9130293007872549, + "grad_norm": 0.14420120418071747, + "learning_rate": 0.00013912287240651014, + "loss": 0.0168, + "step": 4900 + }, + { + "epoch": 0.9148926258909024, + "grad_norm": 0.13367877900600433, + "learning_rate": 0.00013899863337060505, + "loss": 0.018, + "step": 4910 + }, + { + "epoch": 0.9167559509945498, + "grad_norm": 0.1635744571685791, + "learning_rate": 0.00013887439433469997, + "loss": 0.0155, + "step": 4920 + }, + { + "epoch": 0.9186192760981973, + "grad_norm": 0.14747698605060577, + "learning_rate": 0.00013875015529879489, + "loss": 0.0221, + "step": 4930 + }, + { + "epoch": 0.9204826012018447, + "grad_norm": 0.1708410084247589, + "learning_rate": 0.0001386259162628898, + "loss": 0.0197, + "step": 4940 + }, + { + "epoch": 0.9223459263054922, + "grad_norm": 0.09329384565353394, + "learning_rate": 0.00013850167722698472, + "loss": 0.0176, + "step": 4950 + }, + { + "epoch": 0.9242092514091396, + "grad_norm": 0.15747447311878204, + "learning_rate": 0.00013837743819107963, + "loss": 0.0204, + "step": 4960 + }, + { + "epoch": 0.9260725765127871, + "grad_norm": 0.14883270859718323, + "learning_rate": 0.00013825319915517455, + "loss": 0.019, + "step": 4970 + }, + { + "epoch": 0.9279359016164346, + "grad_norm": 0.13427717983722687, + "learning_rate": 0.0001381289601192695, + "loss": 0.0275, + "step": 4980 + }, + { + "epoch": 0.929799226720082, + "grad_norm": 0.11976408958435059, + "learning_rate": 0.0001380047210833644, + "loss": 0.0182, + "step": 4990 + }, + { + "epoch": 0.9316625518237295, + "grad_norm": 0.12686076760292053, + "learning_rate": 0.00013788048204745932, + "loss": 0.0257, + "step": 5000 + }, + { + "epoch": 0.9335258769273769, + "grad_norm": 0.16047020256519318, + "learning_rate": 0.00013775624301155424, + "loss": 0.0192, + "step": 5010 + }, + { + "epoch": 0.9353892020310244, + "grad_norm": 0.15550382435321808, + "learning_rate": 0.00013763200397564916, + "loss": 0.0217, + "step": 5020 + }, + { + "epoch": 0.9372525271346718, + "grad_norm": 0.11465305089950562, + "learning_rate": 0.00013750776493974407, + "loss": 0.0222, + "step": 5030 + }, + { + "epoch": 0.9391158522383193, + "grad_norm": 0.1337389051914215, + "learning_rate": 0.000137383525903839, + "loss": 0.0186, + "step": 5040 + }, + { + "epoch": 0.9409791773419668, + "grad_norm": 0.19774989783763885, + "learning_rate": 0.0001372592868679339, + "loss": 0.0192, + "step": 5050 + }, + { + "epoch": 0.9428425024456142, + "grad_norm": 0.1269395351409912, + "learning_rate": 0.00013713504783202885, + "loss": 0.0188, + "step": 5060 + }, + { + "epoch": 0.9447058275492617, + "grad_norm": 0.25458306074142456, + "learning_rate": 0.00013701080879612376, + "loss": 0.0192, + "step": 5070 + }, + { + "epoch": 0.9465691526529091, + "grad_norm": 0.17495116591453552, + "learning_rate": 0.00013688656976021868, + "loss": 0.0159, + "step": 5080 + }, + { + "epoch": 0.9484324777565566, + "grad_norm": 0.07973502576351166, + "learning_rate": 0.0001367623307243136, + "loss": 0.0158, + "step": 5090 + }, + { + "epoch": 0.950295802860204, + "grad_norm": 0.13139723241329193, + "learning_rate": 0.0001366380916884085, + "loss": 0.0202, + "step": 5100 + }, + { + "epoch": 0.9521591279638515, + "grad_norm": 0.2671772837638855, + "learning_rate": 0.00013651385265250342, + "loss": 0.0199, + "step": 5110 + }, + { + "epoch": 0.954022453067499, + "grad_norm": 0.13597209751605988, + "learning_rate": 0.00013638961361659834, + "loss": 0.021, + "step": 5120 + }, + { + "epoch": 0.9558857781711464, + "grad_norm": 0.15831440687179565, + "learning_rate": 0.00013626537458069326, + "loss": 0.0236, + "step": 5130 + }, + { + "epoch": 0.9577491032747939, + "grad_norm": 0.08694765716791153, + "learning_rate": 0.00013614113554478817, + "loss": 0.0191, + "step": 5140 + }, + { + "epoch": 0.9596124283784413, + "grad_norm": 0.12419598549604416, + "learning_rate": 0.0001360168965088831, + "loss": 0.0259, + "step": 5150 + }, + { + "epoch": 0.9614757534820888, + "grad_norm": 0.4500528872013092, + "learning_rate": 0.000135892657472978, + "loss": 0.0233, + "step": 5160 + }, + { + "epoch": 0.9633390785857362, + "grad_norm": 0.13462555408477783, + "learning_rate": 0.00013576841843707292, + "loss": 0.0239, + "step": 5170 + }, + { + "epoch": 0.9652024036893837, + "grad_norm": 0.11178270727396011, + "learning_rate": 0.00013564417940116784, + "loss": 0.0204, + "step": 5180 + }, + { + "epoch": 0.9670657287930312, + "grad_norm": 0.09808334708213806, + "learning_rate": 0.00013551994036526278, + "loss": 0.0176, + "step": 5190 + }, + { + "epoch": 0.9689290538966786, + "grad_norm": 0.12407030910253525, + "learning_rate": 0.0001353957013293577, + "loss": 0.0201, + "step": 5200 + }, + { + "epoch": 0.9707923790003261, + "grad_norm": 0.11156706511974335, + "learning_rate": 0.0001352714622934526, + "loss": 0.0176, + "step": 5210 + }, + { + "epoch": 0.9726557041039735, + "grad_norm": 0.16406476497650146, + "learning_rate": 0.00013514722325754753, + "loss": 0.0208, + "step": 5220 + }, + { + "epoch": 0.974519029207621, + "grad_norm": 0.08763796091079712, + "learning_rate": 0.00013502298422164247, + "loss": 0.0236, + "step": 5230 + }, + { + "epoch": 0.9763823543112684, + "grad_norm": 0.10975922644138336, + "learning_rate": 0.00013489874518573738, + "loss": 0.0172, + "step": 5240 + }, + { + "epoch": 0.9782456794149159, + "grad_norm": 0.14900168776512146, + "learning_rate": 0.0001347745061498323, + "loss": 0.0176, + "step": 5250 + }, + { + "epoch": 0.9801090045185634, + "grad_norm": 0.08646497875452042, + "learning_rate": 0.00013465026711392722, + "loss": 0.0255, + "step": 5260 + }, + { + "epoch": 0.9819723296222108, + "grad_norm": 0.12455034255981445, + "learning_rate": 0.00013452602807802213, + "loss": 0.0148, + "step": 5270 + }, + { + "epoch": 0.9838356547258583, + "grad_norm": 0.08824939280748367, + "learning_rate": 0.00013440178904211705, + "loss": 0.0219, + "step": 5280 + }, + { + "epoch": 0.9856989798295057, + "grad_norm": 0.12328789383172989, + "learning_rate": 0.00013427755000621196, + "loss": 0.0182, + "step": 5290 + }, + { + "epoch": 0.9875623049331532, + "grad_norm": 0.07896114885807037, + "learning_rate": 0.00013415331097030688, + "loss": 0.0203, + "step": 5300 + }, + { + "epoch": 0.9894256300368006, + "grad_norm": 1.816659688949585, + "learning_rate": 0.0001340290719344018, + "loss": 0.0186, + "step": 5310 + }, + { + "epoch": 0.9912889551404481, + "grad_norm": 0.1731538623571396, + "learning_rate": 0.0001339048328984967, + "loss": 0.0132, + "step": 5320 + }, + { + "epoch": 0.9931522802440956, + "grad_norm": 0.1737430840730667, + "learning_rate": 0.00013378059386259163, + "loss": 0.0206, + "step": 5330 + }, + { + "epoch": 0.995015605347743, + "grad_norm": 0.10039955377578735, + "learning_rate": 0.00013365635482668654, + "loss": 0.0182, + "step": 5340 + }, + { + "epoch": 0.9968789304513905, + "grad_norm": 0.07095132768154144, + "learning_rate": 0.00013353211579078146, + "loss": 0.0187, + "step": 5350 + }, + { + "epoch": 0.9987422555550379, + "grad_norm": 0.08819052577018738, + "learning_rate": 0.00013340787675487637, + "loss": 0.0162, + "step": 5360 + }, + { + "epoch": 1.0006055806586853, + "grad_norm": 0.10541682690382004, + "learning_rate": 0.0001332836377189713, + "loss": 0.0218, + "step": 5370 + }, + { + "epoch": 1.0024689057623328, + "grad_norm": 0.09925888478755951, + "learning_rate": 0.0001331593986830662, + "loss": 0.017, + "step": 5380 + }, + { + "epoch": 1.0043322308659803, + "grad_norm": 0.130414217710495, + "learning_rate": 0.00013303515964716112, + "loss": 0.0183, + "step": 5390 + }, + { + "epoch": 1.0061955559696278, + "grad_norm": 0.10512223094701767, + "learning_rate": 0.00013291092061125606, + "loss": 0.0177, + "step": 5400 + }, + { + "epoch": 1.0080588810732753, + "grad_norm": 0.09342774003744125, + "learning_rate": 0.00013278668157535098, + "loss": 0.0128, + "step": 5410 + }, + { + "epoch": 1.0099222061769226, + "grad_norm": 0.14016024768352509, + "learning_rate": 0.0001326624425394459, + "loss": 0.0174, + "step": 5420 + }, + { + "epoch": 1.0117855312805701, + "grad_norm": 0.33078333735466003, + "learning_rate": 0.00013253820350354084, + "loss": 0.0141, + "step": 5430 + }, + { + "epoch": 1.0136488563842176, + "grad_norm": 0.14986422657966614, + "learning_rate": 0.00013241396446763575, + "loss": 0.0138, + "step": 5440 + }, + { + "epoch": 1.0155121814878652, + "grad_norm": 0.05468583106994629, + "learning_rate": 0.00013228972543173067, + "loss": 0.0134, + "step": 5450 + }, + { + "epoch": 1.0173755065915127, + "grad_norm": 0.15035457909107208, + "learning_rate": 0.00013216548639582559, + "loss": 0.0138, + "step": 5460 + }, + { + "epoch": 1.01923883169516, + "grad_norm": 0.12102866917848587, + "learning_rate": 0.0001320412473599205, + "loss": 0.0121, + "step": 5470 + }, + { + "epoch": 1.0211021567988074, + "grad_norm": 0.1015734151005745, + "learning_rate": 0.00013191700832401542, + "loss": 0.0156, + "step": 5480 + }, + { + "epoch": 1.022965481902455, + "grad_norm": 0.08906779438257217, + "learning_rate": 0.00013179276928811033, + "loss": 0.0137, + "step": 5490 + }, + { + "epoch": 1.0248288070061025, + "grad_norm": 0.04403156042098999, + "learning_rate": 0.00013166853025220525, + "loss": 0.0115, + "step": 5500 + }, + { + "epoch": 1.0266921321097497, + "grad_norm": 0.1241898313164711, + "learning_rate": 0.00013154429121630017, + "loss": 0.0176, + "step": 5510 + }, + { + "epoch": 1.0285554572133973, + "grad_norm": 0.1671934872865677, + "learning_rate": 0.00013142005218039508, + "loss": 0.0136, + "step": 5520 + }, + { + "epoch": 1.0304187823170448, + "grad_norm": 0.6383609771728516, + "learning_rate": 0.00013129581314449, + "loss": 0.022, + "step": 5530 + }, + { + "epoch": 1.0322821074206923, + "grad_norm": 0.08012861758470535, + "learning_rate": 0.0001311715741085849, + "loss": 0.0141, + "step": 5540 + }, + { + "epoch": 1.0341454325243398, + "grad_norm": 0.11658889800310135, + "learning_rate": 0.00013104733507267983, + "loss": 0.0152, + "step": 5550 + }, + { + "epoch": 1.036008757627987, + "grad_norm": 0.13740360736846924, + "learning_rate": 0.00013092309603677474, + "loss": 0.0198, + "step": 5560 + }, + { + "epoch": 1.0378720827316346, + "grad_norm": 0.8119271993637085, + "learning_rate": 0.0001307988570008697, + "loss": 0.0151, + "step": 5570 + }, + { + "epoch": 1.039735407835282, + "grad_norm": 0.2707085907459259, + "learning_rate": 0.0001306746179649646, + "loss": 0.0163, + "step": 5580 + }, + { + "epoch": 1.0415987329389296, + "grad_norm": 0.10634835809469223, + "learning_rate": 0.00013055037892905952, + "loss": 0.0121, + "step": 5590 + }, + { + "epoch": 1.043462058042577, + "grad_norm": 0.12598338723182678, + "learning_rate": 0.00013042613989315443, + "loss": 0.0166, + "step": 5600 + }, + { + "epoch": 1.0453253831462244, + "grad_norm": 0.13499614596366882, + "learning_rate": 0.00013030190085724935, + "loss": 0.0119, + "step": 5610 + }, + { + "epoch": 1.0471887082498719, + "grad_norm": 0.1680738776922226, + "learning_rate": 0.00013017766182134427, + "loss": 0.024, + "step": 5620 + }, + { + "epoch": 1.0490520333535194, + "grad_norm": 0.1279182881116867, + "learning_rate": 0.00013005342278543918, + "loss": 0.0166, + "step": 5630 + }, + { + "epoch": 1.0509153584571669, + "grad_norm": 0.07233988493680954, + "learning_rate": 0.0001299291837495341, + "loss": 0.0157, + "step": 5640 + }, + { + "epoch": 1.0527786835608142, + "grad_norm": 0.1104668378829956, + "learning_rate": 0.00012980494471362904, + "loss": 0.0156, + "step": 5650 + }, + { + "epoch": 1.0546420086644617, + "grad_norm": 0.1559237837791443, + "learning_rate": 0.00012968070567772396, + "loss": 0.0147, + "step": 5660 + }, + { + "epoch": 1.0565053337681092, + "grad_norm": 0.04395401477813721, + "learning_rate": 0.00012955646664181887, + "loss": 0.0134, + "step": 5670 + }, + { + "epoch": 1.0583686588717567, + "grad_norm": 0.07985333353281021, + "learning_rate": 0.0001294322276059138, + "loss": 0.0146, + "step": 5680 + }, + { + "epoch": 1.0602319839754042, + "grad_norm": 0.06879911571741104, + "learning_rate": 0.0001293079885700087, + "loss": 0.0202, + "step": 5690 + }, + { + "epoch": 1.0620953090790515, + "grad_norm": 0.10023193806409836, + "learning_rate": 0.00012918374953410362, + "loss": 0.0202, + "step": 5700 + }, + { + "epoch": 1.063958634182699, + "grad_norm": 0.07399614155292511, + "learning_rate": 0.00012905951049819854, + "loss": 0.0119, + "step": 5710 + }, + { + "epoch": 1.0658219592863465, + "grad_norm": 0.10685840249061584, + "learning_rate": 0.00012893527146229345, + "loss": 0.0153, + "step": 5720 + }, + { + "epoch": 1.067685284389994, + "grad_norm": 0.12774063646793365, + "learning_rate": 0.00012881103242638837, + "loss": 0.0153, + "step": 5730 + }, + { + "epoch": 1.0695486094936415, + "grad_norm": 0.09937337785959244, + "learning_rate": 0.0001286867933904833, + "loss": 0.0383, + "step": 5740 + }, + { + "epoch": 1.0714119345972888, + "grad_norm": 0.09235163778066635, + "learning_rate": 0.00012856255435457823, + "loss": 0.0173, + "step": 5750 + }, + { + "epoch": 1.0732752597009363, + "grad_norm": 0.14451561868190765, + "learning_rate": 0.00012843831531867314, + "loss": 0.0125, + "step": 5760 + }, + { + "epoch": 1.0751385848045838, + "grad_norm": 0.07140475511550903, + "learning_rate": 0.00012831407628276806, + "loss": 0.0152, + "step": 5770 + }, + { + "epoch": 1.0770019099082313, + "grad_norm": 0.10524556040763855, + "learning_rate": 0.00012818983724686297, + "loss": 0.0167, + "step": 5780 + }, + { + "epoch": 1.0788652350118788, + "grad_norm": 0.1626797616481781, + "learning_rate": 0.0001280655982109579, + "loss": 0.0164, + "step": 5790 + }, + { + "epoch": 1.080728560115526, + "grad_norm": 0.20539982616901398, + "learning_rate": 0.0001279413591750528, + "loss": 0.0145, + "step": 5800 + }, + { + "epoch": 1.0825918852191736, + "grad_norm": 0.11382734030485153, + "learning_rate": 0.00012781712013914772, + "loss": 0.0149, + "step": 5810 + }, + { + "epoch": 1.084455210322821, + "grad_norm": 0.07617650181055069, + "learning_rate": 0.00012769288110324264, + "loss": 0.0141, + "step": 5820 + }, + { + "epoch": 1.0863185354264686, + "grad_norm": 0.14019963145256042, + "learning_rate": 0.00012756864206733755, + "loss": 0.017, + "step": 5830 + }, + { + "epoch": 1.088181860530116, + "grad_norm": 0.097674660384655, + "learning_rate": 0.00012744440303143247, + "loss": 0.019, + "step": 5840 + }, + { + "epoch": 1.0900451856337634, + "grad_norm": 0.13726158440113068, + "learning_rate": 0.00012732016399552738, + "loss": 0.0166, + "step": 5850 + }, + { + "epoch": 1.091908510737411, + "grad_norm": 0.2003697007894516, + "learning_rate": 0.00012719592495962233, + "loss": 0.0154, + "step": 5860 + }, + { + "epoch": 1.0937718358410584, + "grad_norm": 0.08333192020654678, + "learning_rate": 0.00012707168592371724, + "loss": 0.0143, + "step": 5870 + }, + { + "epoch": 1.095635160944706, + "grad_norm": 0.13497716188430786, + "learning_rate": 0.00012694744688781216, + "loss": 0.018, + "step": 5880 + }, + { + "epoch": 1.0974984860483532, + "grad_norm": 0.09423158317804337, + "learning_rate": 0.00012682320785190707, + "loss": 0.0203, + "step": 5890 + }, + { + "epoch": 1.0993618111520007, + "grad_norm": 0.1061846911907196, + "learning_rate": 0.000126698968816002, + "loss": 0.0106, + "step": 5900 + }, + { + "epoch": 1.1012251362556482, + "grad_norm": 0.10075803101062775, + "learning_rate": 0.00012657472978009693, + "loss": 0.0104, + "step": 5910 + }, + { + "epoch": 1.1030884613592957, + "grad_norm": 0.19989649951457977, + "learning_rate": 0.00012645049074419185, + "loss": 0.0205, + "step": 5920 + }, + { + "epoch": 1.104951786462943, + "grad_norm": 0.10290393233299255, + "learning_rate": 0.00012632625170828676, + "loss": 0.0101, + "step": 5930 + }, + { + "epoch": 1.1068151115665905, + "grad_norm": 0.1811441034078598, + "learning_rate": 0.00012620201267238168, + "loss": 0.0169, + "step": 5940 + }, + { + "epoch": 1.108678436670238, + "grad_norm": 0.07941067963838577, + "learning_rate": 0.0001260777736364766, + "loss": 0.0156, + "step": 5950 + }, + { + "epoch": 1.1105417617738855, + "grad_norm": 0.15073469281196594, + "learning_rate": 0.0001259535346005715, + "loss": 0.0219, + "step": 5960 + }, + { + "epoch": 1.112405086877533, + "grad_norm": 0.09446485340595245, + "learning_rate": 0.00012582929556466643, + "loss": 0.0122, + "step": 5970 + }, + { + "epoch": 1.1142684119811803, + "grad_norm": 0.08050578832626343, + "learning_rate": 0.00012570505652876134, + "loss": 0.012, + "step": 5980 + }, + { + "epoch": 1.1161317370848278, + "grad_norm": 0.08295507729053497, + "learning_rate": 0.00012558081749285626, + "loss": 0.0153, + "step": 5990 + }, + { + "epoch": 1.1179950621884753, + "grad_norm": 0.09732520580291748, + "learning_rate": 0.00012545657845695118, + "loss": 0.0203, + "step": 6000 + }, + { + "epoch": 1.1198583872921228, + "grad_norm": 0.12038660049438477, + "learning_rate": 0.0001253323394210461, + "loss": 0.0182, + "step": 6010 + }, + { + "epoch": 1.1217217123957703, + "grad_norm": 0.10337748378515244, + "learning_rate": 0.000125208100385141, + "loss": 0.0118, + "step": 6020 + }, + { + "epoch": 1.1235850374994176, + "grad_norm": 0.15960311889648438, + "learning_rate": 0.00012508386134923592, + "loss": 0.0167, + "step": 6030 + }, + { + "epoch": 1.1254483626030651, + "grad_norm": 0.12002138048410416, + "learning_rate": 0.00012495962231333084, + "loss": 0.0141, + "step": 6040 + }, + { + "epoch": 1.1273116877067126, + "grad_norm": 0.1650262326002121, + "learning_rate": 0.00012483538327742575, + "loss": 0.018, + "step": 6050 + }, + { + "epoch": 1.1291750128103601, + "grad_norm": 0.13276247680187225, + "learning_rate": 0.00012471114424152067, + "loss": 0.0209, + "step": 6060 + }, + { + "epoch": 1.1310383379140077, + "grad_norm": 0.0923129990696907, + "learning_rate": 0.0001245869052056156, + "loss": 0.0173, + "step": 6070 + }, + { + "epoch": 1.132901663017655, + "grad_norm": 0.1202143207192421, + "learning_rate": 0.00012446266616971053, + "loss": 0.0158, + "step": 6080 + }, + { + "epoch": 1.1347649881213024, + "grad_norm": 0.12622275948524475, + "learning_rate": 0.00012433842713380544, + "loss": 0.0145, + "step": 6090 + }, + { + "epoch": 1.13662831322495, + "grad_norm": 0.21728819608688354, + "learning_rate": 0.0001242141880979004, + "loss": 0.0137, + "step": 6100 + }, + { + "epoch": 1.1384916383285975, + "grad_norm": 0.14400696754455566, + "learning_rate": 0.0001240899490619953, + "loss": 0.0147, + "step": 6110 + }, + { + "epoch": 1.140354963432245, + "grad_norm": 0.04675738885998726, + "learning_rate": 0.00012396571002609022, + "loss": 0.0141, + "step": 6120 + }, + { + "epoch": 1.1422182885358922, + "grad_norm": 0.11618246138095856, + "learning_rate": 0.00012384147099018514, + "loss": 0.0124, + "step": 6130 + }, + { + "epoch": 1.1440816136395398, + "grad_norm": 0.08097544312477112, + "learning_rate": 0.00012371723195428005, + "loss": 0.0112, + "step": 6140 + }, + { + "epoch": 1.1459449387431873, + "grad_norm": 0.1423315703868866, + "learning_rate": 0.00012359299291837497, + "loss": 0.0138, + "step": 6150 + }, + { + "epoch": 1.1478082638468348, + "grad_norm": 0.4167300760746002, + "learning_rate": 0.00012346875388246988, + "loss": 0.0171, + "step": 6160 + }, + { + "epoch": 1.149671588950482, + "grad_norm": 0.12244168668985367, + "learning_rate": 0.0001233445148465648, + "loss": 0.0135, + "step": 6170 + }, + { + "epoch": 1.1515349140541296, + "grad_norm": 0.10434919595718384, + "learning_rate": 0.00012322027581065971, + "loss": 0.0128, + "step": 6180 + }, + { + "epoch": 1.153398239157777, + "grad_norm": 0.11119112372398376, + "learning_rate": 0.00012309603677475463, + "loss": 0.0122, + "step": 6190 + }, + { + "epoch": 1.1552615642614246, + "grad_norm": 0.14864900708198547, + "learning_rate": 0.00012297179773884955, + "loss": 0.0119, + "step": 6200 + }, + { + "epoch": 1.1571248893650719, + "grad_norm": 0.9202861189842224, + "learning_rate": 0.00012284755870294446, + "loss": 0.0149, + "step": 6210 + }, + { + "epoch": 1.1589882144687194, + "grad_norm": 0.10615832358598709, + "learning_rate": 0.00012272331966703938, + "loss": 0.0161, + "step": 6220 + }, + { + "epoch": 1.1608515395723669, + "grad_norm": 0.05958676338195801, + "learning_rate": 0.0001225990806311343, + "loss": 0.0137, + "step": 6230 + }, + { + "epoch": 1.1627148646760144, + "grad_norm": 0.10617449879646301, + "learning_rate": 0.0001224748415952292, + "loss": 0.012, + "step": 6240 + }, + { + "epoch": 1.1645781897796619, + "grad_norm": 0.141194686293602, + "learning_rate": 0.00012235060255932412, + "loss": 0.0129, + "step": 6250 + }, + { + "epoch": 1.1664415148833092, + "grad_norm": 0.09088177978992462, + "learning_rate": 0.00012222636352341907, + "loss": 0.0125, + "step": 6260 + }, + { + "epoch": 1.1683048399869567, + "grad_norm": 0.12682999670505524, + "learning_rate": 0.00012210212448751398, + "loss": 0.012, + "step": 6270 + }, + { + "epoch": 1.1701681650906042, + "grad_norm": 0.13394199311733246, + "learning_rate": 0.00012197788545160891, + "loss": 0.0159, + "step": 6280 + }, + { + "epoch": 1.1720314901942517, + "grad_norm": 0.12738922238349915, + "learning_rate": 0.00012185364641570383, + "loss": 0.0155, + "step": 6290 + }, + { + "epoch": 1.1738948152978992, + "grad_norm": 0.11358192563056946, + "learning_rate": 0.00012172940737979874, + "loss": 0.0114, + "step": 6300 + }, + { + "epoch": 1.1757581404015465, + "grad_norm": 0.09697134792804718, + "learning_rate": 0.00012160516834389366, + "loss": 0.0126, + "step": 6310 + }, + { + "epoch": 1.177621465505194, + "grad_norm": 0.12397624552249908, + "learning_rate": 0.00012148092930798858, + "loss": 0.0133, + "step": 6320 + }, + { + "epoch": 1.1794847906088415, + "grad_norm": 0.07363367825746536, + "learning_rate": 0.00012135669027208349, + "loss": 0.0108, + "step": 6330 + }, + { + "epoch": 1.181348115712489, + "grad_norm": 0.10165391117334366, + "learning_rate": 0.00012123245123617841, + "loss": 0.0162, + "step": 6340 + }, + { + "epoch": 1.1832114408161365, + "grad_norm": 0.07939434796571732, + "learning_rate": 0.00012110821220027332, + "loss": 0.011, + "step": 6350 + }, + { + "epoch": 1.1850747659197838, + "grad_norm": 0.11631353199481964, + "learning_rate": 0.00012098397316436825, + "loss": 0.0135, + "step": 6360 + }, + { + "epoch": 1.1869380910234313, + "grad_norm": 0.12175942212343216, + "learning_rate": 0.00012085973412846317, + "loss": 0.012, + "step": 6370 + }, + { + "epoch": 1.1888014161270788, + "grad_norm": 0.06976991146802902, + "learning_rate": 0.00012073549509255808, + "loss": 0.0232, + "step": 6380 + }, + { + "epoch": 1.1906647412307263, + "grad_norm": 0.07860373705625534, + "learning_rate": 0.000120611256056653, + "loss": 0.0183, + "step": 6390 + }, + { + "epoch": 1.1925280663343738, + "grad_norm": 0.10476142168045044, + "learning_rate": 0.00012048701702074792, + "loss": 0.015, + "step": 6400 + }, + { + "epoch": 1.194391391438021, + "grad_norm": 0.15761305391788483, + "learning_rate": 0.00012036277798484283, + "loss": 0.0163, + "step": 6410 + }, + { + "epoch": 1.1962547165416686, + "grad_norm": 0.11684102565050125, + "learning_rate": 0.00012023853894893775, + "loss": 0.0114, + "step": 6420 + }, + { + "epoch": 1.198118041645316, + "grad_norm": 0.07757461071014404, + "learning_rate": 0.00012011429991303269, + "loss": 0.0106, + "step": 6430 + }, + { + "epoch": 1.1999813667489636, + "grad_norm": 0.08250425010919571, + "learning_rate": 0.0001199900608771276, + "loss": 0.0162, + "step": 6440 + }, + { + "epoch": 1.201844691852611, + "grad_norm": 0.07098899036645889, + "learning_rate": 0.00011986582184122252, + "loss": 0.0094, + "step": 6450 + }, + { + "epoch": 1.2037080169562584, + "grad_norm": 0.12341819703578949, + "learning_rate": 0.00011974158280531744, + "loss": 0.014, + "step": 6460 + }, + { + "epoch": 1.205571342059906, + "grad_norm": 0.12068308889865875, + "learning_rate": 0.00011961734376941235, + "loss": 0.0144, + "step": 6470 + }, + { + "epoch": 1.2074346671635534, + "grad_norm": 0.12204370647668839, + "learning_rate": 0.00011949310473350728, + "loss": 0.0139, + "step": 6480 + }, + { + "epoch": 1.209297992267201, + "grad_norm": 0.07349750399589539, + "learning_rate": 0.0001193688656976022, + "loss": 0.0134, + "step": 6490 + }, + { + "epoch": 1.2111613173708482, + "grad_norm": 0.09023798257112503, + "learning_rate": 0.00011924462666169712, + "loss": 0.0133, + "step": 6500 + }, + { + "epoch": 1.2130246424744957, + "grad_norm": 0.11921370029449463, + "learning_rate": 0.00011912038762579203, + "loss": 0.0202, + "step": 6510 + }, + { + "epoch": 1.2148879675781432, + "grad_norm": 0.16381864249706268, + "learning_rate": 0.00011899614858988695, + "loss": 0.013, + "step": 6520 + }, + { + "epoch": 1.2167512926817907, + "grad_norm": 0.18260565400123596, + "learning_rate": 0.00011887190955398186, + "loss": 0.0152, + "step": 6530 + }, + { + "epoch": 1.218614617785438, + "grad_norm": 0.17924544215202332, + "learning_rate": 0.00011874767051807678, + "loss": 0.0156, + "step": 6540 + }, + { + "epoch": 1.2204779428890855, + "grad_norm": 0.0538809597492218, + "learning_rate": 0.0001186234314821717, + "loss": 0.0096, + "step": 6550 + }, + { + "epoch": 1.222341267992733, + "grad_norm": 0.15469981729984283, + "learning_rate": 0.00011849919244626661, + "loss": 0.0122, + "step": 6560 + }, + { + "epoch": 1.2242045930963805, + "grad_norm": 0.10429839044809341, + "learning_rate": 0.00011837495341036154, + "loss": 0.0153, + "step": 6570 + }, + { + "epoch": 1.226067918200028, + "grad_norm": 0.09104898571968079, + "learning_rate": 0.00011825071437445646, + "loss": 0.0136, + "step": 6580 + }, + { + "epoch": 1.2279312433036753, + "grad_norm": 0.11124694347381592, + "learning_rate": 0.00011812647533855137, + "loss": 0.012, + "step": 6590 + }, + { + "epoch": 1.2297945684073228, + "grad_norm": 0.18156588077545166, + "learning_rate": 0.00011800223630264631, + "loss": 0.013, + "step": 6600 + }, + { + "epoch": 1.2316578935109703, + "grad_norm": 0.10969128459692001, + "learning_rate": 0.00011787799726674123, + "loss": 0.0124, + "step": 6610 + }, + { + "epoch": 1.2335212186146178, + "grad_norm": 0.1498788446187973, + "learning_rate": 0.00011775375823083615, + "loss": 0.0131, + "step": 6620 + }, + { + "epoch": 1.2353845437182653, + "grad_norm": 0.08221983164548874, + "learning_rate": 0.00011762951919493106, + "loss": 0.0151, + "step": 6630 + }, + { + "epoch": 1.2372478688219126, + "grad_norm": 0.10865967720746994, + "learning_rate": 0.00011750528015902598, + "loss": 0.0115, + "step": 6640 + }, + { + "epoch": 1.2391111939255601, + "grad_norm": 0.09154286235570908, + "learning_rate": 0.00011738104112312089, + "loss": 0.0118, + "step": 6650 + }, + { + "epoch": 1.2409745190292076, + "grad_norm": 0.2195647954940796, + "learning_rate": 0.00011725680208721581, + "loss": 0.024, + "step": 6660 + }, + { + "epoch": 1.2428378441328551, + "grad_norm": 0.09780694544315338, + "learning_rate": 0.00011713256305131072, + "loss": 0.0121, + "step": 6670 + }, + { + "epoch": 1.2447011692365026, + "grad_norm": 0.12990465760231018, + "learning_rate": 0.00011700832401540564, + "loss": 0.0118, + "step": 6680 + }, + { + "epoch": 1.24656449434015, + "grad_norm": 0.09181374311447144, + "learning_rate": 0.00011688408497950057, + "loss": 0.015, + "step": 6690 + }, + { + "epoch": 1.2484278194437974, + "grad_norm": 0.0914270430803299, + "learning_rate": 0.00011675984594359549, + "loss": 0.0085, + "step": 6700 + }, + { + "epoch": 1.250291144547445, + "grad_norm": 0.09390436112880707, + "learning_rate": 0.0001166356069076904, + "loss": 0.0143, + "step": 6710 + }, + { + "epoch": 1.2521544696510924, + "grad_norm": 0.0553867444396019, + "learning_rate": 0.00011651136787178532, + "loss": 0.0096, + "step": 6720 + }, + { + "epoch": 1.25401779475474, + "grad_norm": 0.08669324219226837, + "learning_rate": 0.00011638712883588023, + "loss": 0.012, + "step": 6730 + }, + { + "epoch": 1.2558811198583872, + "grad_norm": 0.10883240401744843, + "learning_rate": 0.00011626288979997515, + "loss": 0.0156, + "step": 6740 + }, + { + "epoch": 1.2577444449620347, + "grad_norm": 0.39218905568122864, + "learning_rate": 0.00011613865076407006, + "loss": 0.0404, + "step": 6750 + }, + { + "epoch": 1.2596077700656823, + "grad_norm": 0.10930469632148743, + "learning_rate": 0.00011601441172816498, + "loss": 0.014, + "step": 6760 + }, + { + "epoch": 1.2614710951693295, + "grad_norm": 0.19345052540302277, + "learning_rate": 0.00011589017269225992, + "loss": 0.0147, + "step": 6770 + }, + { + "epoch": 1.2633344202729773, + "grad_norm": 0.0582694374024868, + "learning_rate": 0.00011576593365635484, + "loss": 0.0116, + "step": 6780 + }, + { + "epoch": 1.2651977453766245, + "grad_norm": 0.0844726487994194, + "learning_rate": 0.00011564169462044975, + "loss": 0.0152, + "step": 6790 + }, + { + "epoch": 1.267061070480272, + "grad_norm": 0.11363263428211212, + "learning_rate": 0.00011551745558454467, + "loss": 0.0128, + "step": 6800 + }, + { + "epoch": 1.2689243955839196, + "grad_norm": 0.13044427335262299, + "learning_rate": 0.0001153932165486396, + "loss": 0.014, + "step": 6810 + }, + { + "epoch": 1.2707877206875668, + "grad_norm": 0.1002730280160904, + "learning_rate": 0.00011526897751273452, + "loss": 0.0139, + "step": 6820 + }, + { + "epoch": 1.2726510457912144, + "grad_norm": 0.10550817102193832, + "learning_rate": 0.00011514473847682943, + "loss": 0.0125, + "step": 6830 + }, + { + "epoch": 1.2745143708948619, + "grad_norm": 0.11775978654623032, + "learning_rate": 0.00011502049944092435, + "loss": 0.0103, + "step": 6840 + }, + { + "epoch": 1.2763776959985094, + "grad_norm": 0.19106002151966095, + "learning_rate": 0.00011489626040501926, + "loss": 0.0153, + "step": 6850 + }, + { + "epoch": 1.2782410211021569, + "grad_norm": 0.07046420127153397, + "learning_rate": 0.00011477202136911418, + "loss": 0.0204, + "step": 6860 + }, + { + "epoch": 1.2801043462058042, + "grad_norm": 0.10768242925405502, + "learning_rate": 0.0001146477823332091, + "loss": 0.0111, + "step": 6870 + }, + { + "epoch": 1.2819676713094517, + "grad_norm": 0.08488526940345764, + "learning_rate": 0.00011452354329730401, + "loss": 0.0166, + "step": 6880 + }, + { + "epoch": 1.2838309964130992, + "grad_norm": 0.11364570260047913, + "learning_rate": 0.00011439930426139893, + "loss": 0.0127, + "step": 6890 + }, + { + "epoch": 1.2856943215167467, + "grad_norm": 0.09781802445650101, + "learning_rate": 0.00011427506522549386, + "loss": 0.0122, + "step": 6900 + }, + { + "epoch": 1.2875576466203942, + "grad_norm": 0.11134841293096542, + "learning_rate": 0.00011415082618958877, + "loss": 0.0222, + "step": 6910 + }, + { + "epoch": 1.2894209717240415, + "grad_norm": 0.09454433619976044, + "learning_rate": 0.00011402658715368369, + "loss": 0.014, + "step": 6920 + }, + { + "epoch": 1.291284296827689, + "grad_norm": 0.13544605672359467, + "learning_rate": 0.0001139023481177786, + "loss": 0.0172, + "step": 6930 + }, + { + "epoch": 1.2931476219313365, + "grad_norm": 0.06879796087741852, + "learning_rate": 0.00011377810908187355, + "loss": 0.0136, + "step": 6940 + }, + { + "epoch": 1.295010947034984, + "grad_norm": 0.10580771416425705, + "learning_rate": 0.00011365387004596846, + "loss": 0.0208, + "step": 6950 + }, + { + "epoch": 1.2968742721386315, + "grad_norm": 0.1340237557888031, + "learning_rate": 0.00011352963101006338, + "loss": 0.0117, + "step": 6960 + }, + { + "epoch": 1.2987375972422788, + "grad_norm": 0.14873532950878143, + "learning_rate": 0.0001134053919741583, + "loss": 0.0134, + "step": 6970 + }, + { + "epoch": 1.3006009223459263, + "grad_norm": 0.11179360747337341, + "learning_rate": 0.00011328115293825321, + "loss": 0.0142, + "step": 6980 + }, + { + "epoch": 1.3024642474495738, + "grad_norm": 0.10709336400032043, + "learning_rate": 0.00011315691390234813, + "loss": 0.014, + "step": 6990 + }, + { + "epoch": 1.3043275725532213, + "grad_norm": 0.20911122858524323, + "learning_rate": 0.00011303267486644304, + "loss": 0.018, + "step": 7000 + }, + { + "epoch": 1.3061908976568688, + "grad_norm": 0.0970635861158371, + "learning_rate": 0.00011290843583053796, + "loss": 0.0157, + "step": 7010 + }, + { + "epoch": 1.308054222760516, + "grad_norm": 0.13426648080348969, + "learning_rate": 0.00011278419679463287, + "loss": 0.0128, + "step": 7020 + }, + { + "epoch": 1.3099175478641636, + "grad_norm": 0.11847269535064697, + "learning_rate": 0.0001126599577587278, + "loss": 0.0085, + "step": 7030 + }, + { + "epoch": 1.311780872967811, + "grad_norm": 0.07585333287715912, + "learning_rate": 0.00011253571872282272, + "loss": 0.0127, + "step": 7040 + }, + { + "epoch": 1.3136441980714584, + "grad_norm": 0.11813944578170776, + "learning_rate": 0.00011241147968691763, + "loss": 0.0142, + "step": 7050 + }, + { + "epoch": 1.315507523175106, + "grad_norm": 0.11280883848667145, + "learning_rate": 0.00011228724065101255, + "loss": 0.0093, + "step": 7060 + }, + { + "epoch": 1.3173708482787534, + "grad_norm": 0.07703553140163422, + "learning_rate": 0.00011216300161510747, + "loss": 0.014, + "step": 7070 + }, + { + "epoch": 1.319234173382401, + "grad_norm": 0.1067732498049736, + "learning_rate": 0.00011203876257920238, + "loss": 0.012, + "step": 7080 + }, + { + "epoch": 1.3210974984860484, + "grad_norm": 0.12903323769569397, + "learning_rate": 0.0001119145235432973, + "loss": 0.0122, + "step": 7090 + }, + { + "epoch": 1.3229608235896957, + "grad_norm": 0.07112511992454529, + "learning_rate": 0.00011179028450739221, + "loss": 0.0149, + "step": 7100 + }, + { + "epoch": 1.3248241486933432, + "grad_norm": 0.08477038890123367, + "learning_rate": 0.00011166604547148716, + "loss": 0.0133, + "step": 7110 + }, + { + "epoch": 1.3266874737969907, + "grad_norm": 0.07981168478727341, + "learning_rate": 0.00011154180643558207, + "loss": 0.0127, + "step": 7120 + }, + { + "epoch": 1.3285507989006382, + "grad_norm": 0.15720146894454956, + "learning_rate": 0.00011141756739967699, + "loss": 0.0132, + "step": 7130 + }, + { + "epoch": 1.3304141240042857, + "grad_norm": 0.19825328886508942, + "learning_rate": 0.0001112933283637719, + "loss": 0.015, + "step": 7140 + }, + { + "epoch": 1.332277449107933, + "grad_norm": 0.11979183554649353, + "learning_rate": 0.00011116908932786683, + "loss": 0.0146, + "step": 7150 + }, + { + "epoch": 1.3341407742115805, + "grad_norm": 0.1986120492219925, + "learning_rate": 0.00011104485029196175, + "loss": 0.0146, + "step": 7160 + }, + { + "epoch": 1.336004099315228, + "grad_norm": 0.20982642471790314, + "learning_rate": 0.00011092061125605666, + "loss": 0.0122, + "step": 7170 + }, + { + "epoch": 1.3378674244188755, + "grad_norm": 0.12676341831684113, + "learning_rate": 0.00011079637222015158, + "loss": 0.0182, + "step": 7180 + }, + { + "epoch": 1.339730749522523, + "grad_norm": 0.072712741792202, + "learning_rate": 0.0001106721331842465, + "loss": 0.0169, + "step": 7190 + }, + { + "epoch": 1.3415940746261703, + "grad_norm": 0.07570649683475494, + "learning_rate": 0.00011054789414834141, + "loss": 0.0148, + "step": 7200 + }, + { + "epoch": 1.3434573997298178, + "grad_norm": 0.0949859768152237, + "learning_rate": 0.00011042365511243633, + "loss": 0.0126, + "step": 7210 + }, + { + "epoch": 1.3453207248334653, + "grad_norm": 0.0748329609632492, + "learning_rate": 0.00011029941607653124, + "loss": 0.016, + "step": 7220 + }, + { + "epoch": 1.3471840499371128, + "grad_norm": 0.10189643502235413, + "learning_rate": 0.00011017517704062616, + "loss": 0.0129, + "step": 7230 + }, + { + "epoch": 1.3490473750407603, + "grad_norm": 0.11995694786310196, + "learning_rate": 0.00011005093800472109, + "loss": 0.02, + "step": 7240 + }, + { + "epoch": 1.3509107001444076, + "grad_norm": 0.04982425644993782, + "learning_rate": 0.000109926698968816, + "loss": 0.0187, + "step": 7250 + }, + { + "epoch": 1.3527740252480551, + "grad_norm": 0.12379368394613266, + "learning_rate": 0.00010980245993291092, + "loss": 0.013, + "step": 7260 + }, + { + "epoch": 1.3546373503517026, + "grad_norm": 0.15470531582832336, + "learning_rate": 0.00010967822089700584, + "loss": 0.0111, + "step": 7270 + }, + { + "epoch": 1.3565006754553501, + "grad_norm": 0.14728385210037231, + "learning_rate": 0.00010955398186110078, + "loss": 0.0142, + "step": 7280 + }, + { + "epoch": 1.3583640005589976, + "grad_norm": 0.04794132709503174, + "learning_rate": 0.0001094297428251957, + "loss": 0.0132, + "step": 7290 + }, + { + "epoch": 1.360227325662645, + "grad_norm": 0.12964306771755219, + "learning_rate": 0.00010930550378929061, + "loss": 0.0133, + "step": 7300 + }, + { + "epoch": 1.3620906507662924, + "grad_norm": 0.2551344931125641, + "learning_rate": 0.00010918126475338553, + "loss": 0.0222, + "step": 7310 + }, + { + "epoch": 1.36395397586994, + "grad_norm": 0.06458201259374619, + "learning_rate": 0.00010905702571748044, + "loss": 0.0134, + "step": 7320 + }, + { + "epoch": 1.3658173009735874, + "grad_norm": 0.09689020365476608, + "learning_rate": 0.00010893278668157536, + "loss": 0.0097, + "step": 7330 + }, + { + "epoch": 1.367680626077235, + "grad_norm": 0.12692950665950775, + "learning_rate": 0.00010880854764567027, + "loss": 0.017, + "step": 7340 + }, + { + "epoch": 1.3695439511808822, + "grad_norm": 0.18245796859264374, + "learning_rate": 0.00010868430860976519, + "loss": 0.0139, + "step": 7350 + }, + { + "epoch": 1.3714072762845297, + "grad_norm": 0.1438094675540924, + "learning_rate": 0.00010856006957386012, + "loss": 0.0125, + "step": 7360 + }, + { + "epoch": 1.3732706013881772, + "grad_norm": 0.05395403876900673, + "learning_rate": 0.00010843583053795503, + "loss": 0.0151, + "step": 7370 + }, + { + "epoch": 1.3751339264918245, + "grad_norm": 0.06818302720785141, + "learning_rate": 0.00010831159150204995, + "loss": 0.0112, + "step": 7380 + }, + { + "epoch": 1.376997251595472, + "grad_norm": 0.09590564668178558, + "learning_rate": 0.00010818735246614487, + "loss": 0.0126, + "step": 7390 + }, + { + "epoch": 1.3788605766991195, + "grad_norm": 0.09465160220861435, + "learning_rate": 0.00010806311343023978, + "loss": 0.0118, + "step": 7400 + }, + { + "epoch": 1.380723901802767, + "grad_norm": 0.07145224511623383, + "learning_rate": 0.0001079388743943347, + "loss": 0.0105, + "step": 7410 + }, + { + "epoch": 1.3825872269064146, + "grad_norm": 0.12949031591415405, + "learning_rate": 0.00010781463535842961, + "loss": 0.0154, + "step": 7420 + }, + { + "epoch": 1.3844505520100618, + "grad_norm": 0.10481762140989304, + "learning_rate": 0.00010769039632252453, + "loss": 0.0193, + "step": 7430 + }, + { + "epoch": 1.3863138771137093, + "grad_norm": 0.08996088802814484, + "learning_rate": 0.00010756615728661945, + "loss": 0.013, + "step": 7440 + }, + { + "epoch": 1.3881772022173569, + "grad_norm": 0.07961221039295197, + "learning_rate": 0.00010744191825071439, + "loss": 0.0161, + "step": 7450 + }, + { + "epoch": 1.3900405273210044, + "grad_norm": 0.11494274437427521, + "learning_rate": 0.0001073176792148093, + "loss": 0.0117, + "step": 7460 + }, + { + "epoch": 1.3919038524246519, + "grad_norm": 0.16665033996105194, + "learning_rate": 0.00010719344017890422, + "loss": 0.013, + "step": 7470 + }, + { + "epoch": 1.3937671775282992, + "grad_norm": 0.09289297461509705, + "learning_rate": 0.00010706920114299915, + "loss": 0.015, + "step": 7480 + }, + { + "epoch": 1.3956305026319467, + "grad_norm": 0.06463921070098877, + "learning_rate": 0.00010694496210709406, + "loss": 0.0099, + "step": 7490 + }, + { + "epoch": 1.3974938277355942, + "grad_norm": 0.15225523710250854, + "learning_rate": 0.00010682072307118898, + "loss": 0.0124, + "step": 7500 + }, + { + "epoch": 1.3993571528392417, + "grad_norm": 0.07819876074790955, + "learning_rate": 0.0001066964840352839, + "loss": 0.0106, + "step": 7510 + }, + { + "epoch": 1.4012204779428892, + "grad_norm": 0.2872188985347748, + "learning_rate": 0.00010657224499937881, + "loss": 0.0191, + "step": 7520 + }, + { + "epoch": 1.4030838030465365, + "grad_norm": 0.13932381570339203, + "learning_rate": 0.00010644800596347373, + "loss": 0.015, + "step": 7530 + }, + { + "epoch": 1.404947128150184, + "grad_norm": 0.09094579517841339, + "learning_rate": 0.00010632376692756864, + "loss": 0.0141, + "step": 7540 + }, + { + "epoch": 1.4068104532538315, + "grad_norm": 0.07863321900367737, + "learning_rate": 0.00010619952789166356, + "loss": 0.0123, + "step": 7550 + }, + { + "epoch": 1.408673778357479, + "grad_norm": 0.08263008296489716, + "learning_rate": 0.00010607528885575848, + "loss": 0.0128, + "step": 7560 + }, + { + "epoch": 1.4105371034611265, + "grad_norm": 0.055199749767780304, + "learning_rate": 0.0001059510498198534, + "loss": 0.0158, + "step": 7570 + }, + { + "epoch": 1.4124004285647738, + "grad_norm": 0.09670916199684143, + "learning_rate": 0.00010582681078394832, + "loss": 0.0107, + "step": 7580 + }, + { + "epoch": 1.4142637536684213, + "grad_norm": 0.1617942899465561, + "learning_rate": 0.00010570257174804324, + "loss": 0.0173, + "step": 7590 + }, + { + "epoch": 1.4161270787720688, + "grad_norm": 0.099526546895504, + "learning_rate": 0.00010557833271213815, + "loss": 0.0127, + "step": 7600 + }, + { + "epoch": 1.4179904038757163, + "grad_norm": 0.18963083624839783, + "learning_rate": 0.00010545409367623307, + "loss": 0.011, + "step": 7610 + }, + { + "epoch": 1.4198537289793638, + "grad_norm": 0.11049555242061615, + "learning_rate": 0.00010532985464032798, + "loss": 0.0115, + "step": 7620 + }, + { + "epoch": 1.421717054083011, + "grad_norm": 0.0848945677280426, + "learning_rate": 0.00010520561560442293, + "loss": 0.0128, + "step": 7630 + }, + { + "epoch": 1.4235803791866586, + "grad_norm": 0.09841426461935043, + "learning_rate": 0.00010508137656851784, + "loss": 0.0116, + "step": 7640 + }, + { + "epoch": 1.425443704290306, + "grad_norm": 0.11386366933584213, + "learning_rate": 0.00010495713753261276, + "loss": 0.0141, + "step": 7650 + }, + { + "epoch": 1.4273070293939534, + "grad_norm": 0.10864854604005814, + "learning_rate": 0.00010483289849670767, + "loss": 0.0144, + "step": 7660 + }, + { + "epoch": 1.429170354497601, + "grad_norm": 0.09958741068840027, + "learning_rate": 0.00010470865946080259, + "loss": 0.0097, + "step": 7670 + }, + { + "epoch": 1.4310336796012484, + "grad_norm": 0.12464049458503723, + "learning_rate": 0.0001045844204248975, + "loss": 0.0146, + "step": 7680 + }, + { + "epoch": 1.432897004704896, + "grad_norm": 0.10106810927391052, + "learning_rate": 0.00010446018138899244, + "loss": 0.0082, + "step": 7690 + }, + { + "epoch": 1.4347603298085434, + "grad_norm": 0.10265982151031494, + "learning_rate": 0.00010433594235308735, + "loss": 0.0137, + "step": 7700 + }, + { + "epoch": 1.4366236549121907, + "grad_norm": 0.25061050057411194, + "learning_rate": 0.00010421170331718227, + "loss": 0.0153, + "step": 7710 + }, + { + "epoch": 1.4384869800158382, + "grad_norm": 0.15667365491390228, + "learning_rate": 0.00010408746428127718, + "loss": 0.0119, + "step": 7720 + }, + { + "epoch": 1.4403503051194857, + "grad_norm": 0.08727956563234329, + "learning_rate": 0.0001039632252453721, + "loss": 0.0156, + "step": 7730 + }, + { + "epoch": 1.4422136302231332, + "grad_norm": 0.05918029323220253, + "learning_rate": 0.00010383898620946701, + "loss": 0.0137, + "step": 7740 + }, + { + "epoch": 1.4440769553267807, + "grad_norm": 0.07321096211671829, + "learning_rate": 0.00010371474717356193, + "loss": 0.0114, + "step": 7750 + }, + { + "epoch": 1.445940280430428, + "grad_norm": 0.11011398583650589, + "learning_rate": 0.00010359050813765685, + "loss": 0.0125, + "step": 7760 + }, + { + "epoch": 1.4478036055340755, + "grad_norm": 0.10525079816579819, + "learning_rate": 0.00010346626910175176, + "loss": 0.0119, + "step": 7770 + }, + { + "epoch": 1.449666930637723, + "grad_norm": 0.09164225310087204, + "learning_rate": 0.00010334203006584669, + "loss": 0.0114, + "step": 7780 + }, + { + "epoch": 1.4515302557413705, + "grad_norm": 0.33539170026779175, + "learning_rate": 0.0001032177910299416, + "loss": 0.0199, + "step": 7790 + }, + { + "epoch": 1.453393580845018, + "grad_norm": 0.11161132156848907, + "learning_rate": 0.00010309355199403654, + "loss": 0.0142, + "step": 7800 + }, + { + "epoch": 1.4552569059486653, + "grad_norm": 0.06660373508930206, + "learning_rate": 0.00010296931295813145, + "loss": 0.0106, + "step": 7810 + }, + { + "epoch": 1.4571202310523128, + "grad_norm": 0.08710351586341858, + "learning_rate": 0.00010284507392222638, + "loss": 0.0137, + "step": 7820 + }, + { + "epoch": 1.4589835561559603, + "grad_norm": 0.0928850769996643, + "learning_rate": 0.0001027208348863213, + "loss": 0.0123, + "step": 7830 + }, + { + "epoch": 1.4608468812596078, + "grad_norm": 0.05278251692652702, + "learning_rate": 0.00010259659585041621, + "loss": 0.0141, + "step": 7840 + }, + { + "epoch": 1.4627102063632553, + "grad_norm": 0.15453903377056122, + "learning_rate": 0.00010247235681451113, + "loss": 0.0112, + "step": 7850 + }, + { + "epoch": 1.4645735314669026, + "grad_norm": 0.14208067953586578, + "learning_rate": 0.00010234811777860604, + "loss": 0.0151, + "step": 7860 + }, + { + "epoch": 1.4664368565705501, + "grad_norm": 0.12577812373638153, + "learning_rate": 0.00010222387874270096, + "loss": 0.0102, + "step": 7870 + }, + { + "epoch": 1.4683001816741976, + "grad_norm": 0.06806960701942444, + "learning_rate": 0.00010209963970679588, + "loss": 0.0146, + "step": 7880 + }, + { + "epoch": 1.4701635067778451, + "grad_norm": 0.23436611890792847, + "learning_rate": 0.00010197540067089079, + "loss": 0.0141, + "step": 7890 + }, + { + "epoch": 1.4720268318814926, + "grad_norm": 0.12878048419952393, + "learning_rate": 0.00010185116163498571, + "loss": 0.0093, + "step": 7900 + }, + { + "epoch": 1.47389015698514, + "grad_norm": 0.054260533303022385, + "learning_rate": 0.00010172692259908064, + "loss": 0.0124, + "step": 7910 + }, + { + "epoch": 1.4757534820887874, + "grad_norm": 0.09768477827310562, + "learning_rate": 0.00010160268356317555, + "loss": 0.0118, + "step": 7920 + }, + { + "epoch": 1.477616807192435, + "grad_norm": 0.10654323548078537, + "learning_rate": 0.00010147844452727047, + "loss": 0.0117, + "step": 7930 + }, + { + "epoch": 1.4794801322960824, + "grad_norm": 0.10613780468702316, + "learning_rate": 0.00010135420549136538, + "loss": 0.0136, + "step": 7940 + }, + { + "epoch": 1.48134345739973, + "grad_norm": 0.11709728837013245, + "learning_rate": 0.0001012299664554603, + "loss": 0.0113, + "step": 7950 + }, + { + "epoch": 1.4832067825033772, + "grad_norm": 0.05041942372918129, + "learning_rate": 0.00010110572741955522, + "loss": 0.0101, + "step": 7960 + }, + { + "epoch": 1.4850701076070247, + "grad_norm": 0.06611751019954681, + "learning_rate": 0.00010098148838365016, + "loss": 0.0119, + "step": 7970 + }, + { + "epoch": 1.4869334327106722, + "grad_norm": 0.11693181842565536, + "learning_rate": 0.00010085724934774507, + "loss": 0.0139, + "step": 7980 + }, + { + "epoch": 1.4887967578143195, + "grad_norm": 0.1092601865530014, + "learning_rate": 0.00010073301031183999, + "loss": 0.0104, + "step": 7990 + }, + { + "epoch": 1.490660082917967, + "grad_norm": 0.09393884241580963, + "learning_rate": 0.0001006087712759349, + "loss": 0.0109, + "step": 8000 + }, + { + "epoch": 1.4925234080216145, + "grad_norm": 0.12773072719573975, + "learning_rate": 0.00010048453224002982, + "loss": 0.0175, + "step": 8010 + }, + { + "epoch": 1.494386733125262, + "grad_norm": 0.10976378619670868, + "learning_rate": 0.00010036029320412474, + "loss": 0.0125, + "step": 8020 + }, + { + "epoch": 1.4962500582289096, + "grad_norm": 0.09273622930049896, + "learning_rate": 0.00010023605416821967, + "loss": 0.0101, + "step": 8030 + }, + { + "epoch": 1.4981133833325568, + "grad_norm": 0.10113687813282013, + "learning_rate": 0.00010011181513231458, + "loss": 0.01, + "step": 8040 + }, + { + "epoch": 1.4999767084362043, + "grad_norm": 0.09726346284151077, + "learning_rate": 9.99875760964095e-05, + "loss": 0.0131, + "step": 8050 + }, + { + "epoch": 1.5018400335398518, + "grad_norm": 0.09806101024150848, + "learning_rate": 9.986333706050441e-05, + "loss": 0.0123, + "step": 8060 + }, + { + "epoch": 1.5037033586434994, + "grad_norm": 0.0637512356042862, + "learning_rate": 9.973909802459933e-05, + "loss": 0.0102, + "step": 8070 + }, + { + "epoch": 1.5055666837471469, + "grad_norm": 0.13974900543689728, + "learning_rate": 9.961485898869425e-05, + "loss": 0.0107, + "step": 8080 + }, + { + "epoch": 1.5074300088507941, + "grad_norm": 0.10583356767892838, + "learning_rate": 9.949061995278916e-05, + "loss": 0.0093, + "step": 8090 + }, + { + "epoch": 1.5092933339544417, + "grad_norm": 0.08961586654186249, + "learning_rate": 9.936638091688409e-05, + "loss": 0.0103, + "step": 8100 + }, + { + "epoch": 1.5111566590580892, + "grad_norm": 0.12322380393743515, + "learning_rate": 9.924214188097901e-05, + "loss": 0.0126, + "step": 8110 + }, + { + "epoch": 1.5130199841617367, + "grad_norm": 0.09430979937314987, + "learning_rate": 9.911790284507392e-05, + "loss": 0.0165, + "step": 8120 + }, + { + "epoch": 1.5148833092653842, + "grad_norm": 0.08856257051229477, + "learning_rate": 9.899366380916885e-05, + "loss": 0.0106, + "step": 8130 + }, + { + "epoch": 1.5167466343690315, + "grad_norm": 0.26079264283180237, + "learning_rate": 9.886942477326377e-05, + "loss": 0.0123, + "step": 8140 + }, + { + "epoch": 1.518609959472679, + "grad_norm": 0.06865710020065308, + "learning_rate": 9.874518573735868e-05, + "loss": 0.0154, + "step": 8150 + }, + { + "epoch": 1.5204732845763265, + "grad_norm": 0.04991608113050461, + "learning_rate": 9.86209467014536e-05, + "loss": 0.0097, + "step": 8160 + }, + { + "epoch": 1.5223366096799738, + "grad_norm": 0.1879420429468155, + "learning_rate": 9.849670766554852e-05, + "loss": 0.015, + "step": 8170 + }, + { + "epoch": 1.5241999347836215, + "grad_norm": 0.08976439386606216, + "learning_rate": 9.837246862964345e-05, + "loss": 0.0089, + "step": 8180 + }, + { + "epoch": 1.5260632598872688, + "grad_norm": 0.08291597664356232, + "learning_rate": 9.824822959373836e-05, + "loss": 0.0122, + "step": 8190 + }, + { + "epoch": 1.5279265849909163, + "grad_norm": 0.15329314768314362, + "learning_rate": 9.812399055783328e-05, + "loss": 0.0159, + "step": 8200 + }, + { + "epoch": 1.5297899100945638, + "grad_norm": 0.05117841437458992, + "learning_rate": 9.799975152192819e-05, + "loss": 0.008, + "step": 8210 + }, + { + "epoch": 1.531653235198211, + "grad_norm": 0.15377692878246307, + "learning_rate": 9.787551248602311e-05, + "loss": 0.0302, + "step": 8220 + }, + { + "epoch": 1.5335165603018588, + "grad_norm": 0.1379365473985672, + "learning_rate": 9.775127345011802e-05, + "loss": 0.016, + "step": 8230 + }, + { + "epoch": 1.535379885405506, + "grad_norm": 0.1009238213300705, + "learning_rate": 9.762703441421295e-05, + "loss": 0.0113, + "step": 8240 + }, + { + "epoch": 1.5372432105091536, + "grad_norm": 0.05966337025165558, + "learning_rate": 9.750279537830787e-05, + "loss": 0.0108, + "step": 8250 + }, + { + "epoch": 1.539106535612801, + "grad_norm": 0.129754900932312, + "learning_rate": 9.737855634240279e-05, + "loss": 0.0137, + "step": 8260 + }, + { + "epoch": 1.5409698607164484, + "grad_norm": 0.09081212431192398, + "learning_rate": 9.725431730649771e-05, + "loss": 0.0116, + "step": 8270 + }, + { + "epoch": 1.542833185820096, + "grad_norm": 0.10083411633968353, + "learning_rate": 9.713007827059263e-05, + "loss": 0.0122, + "step": 8280 + }, + { + "epoch": 1.5446965109237434, + "grad_norm": 0.10409017652273178, + "learning_rate": 9.700583923468755e-05, + "loss": 0.01, + "step": 8290 + }, + { + "epoch": 1.5465598360273909, + "grad_norm": 0.07942856103181839, + "learning_rate": 9.688160019878246e-05, + "loss": 0.0103, + "step": 8300 + }, + { + "epoch": 1.5484231611310384, + "grad_norm": 0.09349773079156876, + "learning_rate": 9.675736116287738e-05, + "loss": 0.0122, + "step": 8310 + }, + { + "epoch": 1.5502864862346857, + "grad_norm": 0.05473365634679794, + "learning_rate": 9.66331221269723e-05, + "loss": 0.0101, + "step": 8320 + }, + { + "epoch": 1.5521498113383334, + "grad_norm": 0.12536931037902832, + "learning_rate": 9.650888309106721e-05, + "loss": 0.0154, + "step": 8330 + }, + { + "epoch": 1.5540131364419807, + "grad_norm": 0.08710866421461105, + "learning_rate": 9.638464405516214e-05, + "loss": 0.0121, + "step": 8340 + }, + { + "epoch": 1.5558764615456282, + "grad_norm": 0.2000790685415268, + "learning_rate": 9.626040501925705e-05, + "loss": 0.012, + "step": 8350 + }, + { + "epoch": 1.5577397866492757, + "grad_norm": 0.1262202113866806, + "learning_rate": 9.613616598335198e-05, + "loss": 0.0186, + "step": 8360 + }, + { + "epoch": 1.559603111752923, + "grad_norm": 0.102536141872406, + "learning_rate": 9.60119269474469e-05, + "loss": 0.0123, + "step": 8370 + }, + { + "epoch": 1.5614664368565707, + "grad_norm": 0.1080302819609642, + "learning_rate": 9.588768791154182e-05, + "loss": 0.0107, + "step": 8380 + }, + { + "epoch": 1.563329761960218, + "grad_norm": 0.04270603507757187, + "learning_rate": 9.576344887563673e-05, + "loss": 0.0123, + "step": 8390 + }, + { + "epoch": 1.5651930870638655, + "grad_norm": 0.07982005923986435, + "learning_rate": 9.563920983973165e-05, + "loss": 0.0225, + "step": 8400 + }, + { + "epoch": 1.567056412167513, + "grad_norm": 0.1117529571056366, + "learning_rate": 9.551497080382656e-05, + "loss": 0.0171, + "step": 8410 + }, + { + "epoch": 1.5689197372711603, + "grad_norm": 0.06128223240375519, + "learning_rate": 9.539073176792148e-05, + "loss": 0.0122, + "step": 8420 + }, + { + "epoch": 1.5707830623748078, + "grad_norm": 0.10701828449964523, + "learning_rate": 9.52664927320164e-05, + "loss": 0.0106, + "step": 8430 + }, + { + "epoch": 1.5726463874784553, + "grad_norm": 0.09437397867441177, + "learning_rate": 9.514225369611132e-05, + "loss": 0.0132, + "step": 8440 + }, + { + "epoch": 1.5745097125821028, + "grad_norm": 0.07649078965187073, + "learning_rate": 9.501801466020624e-05, + "loss": 0.0149, + "step": 8450 + }, + { + "epoch": 1.5763730376857503, + "grad_norm": 0.12064909189939499, + "learning_rate": 9.489377562430116e-05, + "loss": 0.0155, + "step": 8460 + }, + { + "epoch": 1.5782363627893976, + "grad_norm": 0.10540273040533066, + "learning_rate": 9.476953658839609e-05, + "loss": 0.0152, + "step": 8470 + }, + { + "epoch": 1.5800996878930451, + "grad_norm": 0.09898658096790314, + "learning_rate": 9.4645297552491e-05, + "loss": 0.0161, + "step": 8480 + }, + { + "epoch": 1.5819630129966926, + "grad_norm": 0.08065871894359589, + "learning_rate": 9.452105851658592e-05, + "loss": 0.0091, + "step": 8490 + }, + { + "epoch": 1.58382633810034, + "grad_norm": 0.04499693587422371, + "learning_rate": 9.439681948068083e-05, + "loss": 0.0105, + "step": 8500 + }, + { + "epoch": 1.5856896632039876, + "grad_norm": 0.04815511777997017, + "learning_rate": 9.427258044477575e-05, + "loss": 0.011, + "step": 8510 + }, + { + "epoch": 1.587552988307635, + "grad_norm": 0.11696461588144302, + "learning_rate": 9.414834140887068e-05, + "loss": 0.0113, + "step": 8520 + }, + { + "epoch": 1.5894163134112824, + "grad_norm": 0.06907574832439423, + "learning_rate": 9.40241023729656e-05, + "loss": 0.0149, + "step": 8530 + }, + { + "epoch": 1.59127963851493, + "grad_norm": 0.11363095790147781, + "learning_rate": 9.389986333706051e-05, + "loss": 0.0147, + "step": 8540 + }, + { + "epoch": 1.5931429636185772, + "grad_norm": 0.17413190007209778, + "learning_rate": 9.377562430115543e-05, + "loss": 0.0134, + "step": 8550 + }, + { + "epoch": 1.595006288722225, + "grad_norm": 0.08960302174091339, + "learning_rate": 9.365138526525034e-05, + "loss": 0.0102, + "step": 8560 + }, + { + "epoch": 1.5968696138258722, + "grad_norm": 0.023280244320631027, + "learning_rate": 9.352714622934527e-05, + "loss": 0.011, + "step": 8570 + }, + { + "epoch": 1.5987329389295197, + "grad_norm": 0.07135169953107834, + "learning_rate": 9.340290719344019e-05, + "loss": 0.0095, + "step": 8580 + }, + { + "epoch": 1.6005962640331672, + "grad_norm": 0.05813858285546303, + "learning_rate": 9.32786681575351e-05, + "loss": 0.0098, + "step": 8590 + }, + { + "epoch": 1.6024595891368145, + "grad_norm": 0.08756371587514877, + "learning_rate": 9.315442912163002e-05, + "loss": 0.0173, + "step": 8600 + }, + { + "epoch": 1.6043229142404623, + "grad_norm": 0.12145531177520752, + "learning_rate": 9.303019008572495e-05, + "loss": 0.0127, + "step": 8610 + }, + { + "epoch": 1.6061862393441095, + "grad_norm": 0.11375415325164795, + "learning_rate": 9.290595104981986e-05, + "loss": 0.0113, + "step": 8620 + }, + { + "epoch": 1.608049564447757, + "grad_norm": 0.056099046021699905, + "learning_rate": 9.278171201391478e-05, + "loss": 0.0118, + "step": 8630 + }, + { + "epoch": 1.6099128895514045, + "grad_norm": 0.11111286282539368, + "learning_rate": 9.26574729780097e-05, + "loss": 0.0131, + "step": 8640 + }, + { + "epoch": 1.6117762146550518, + "grad_norm": 0.08085739612579346, + "learning_rate": 9.253323394210461e-05, + "loss": 0.016, + "step": 8650 + }, + { + "epoch": 1.6136395397586996, + "grad_norm": 0.11837384849786758, + "learning_rate": 9.240899490619953e-05, + "loss": 0.0162, + "step": 8660 + }, + { + "epoch": 1.6155028648623468, + "grad_norm": 0.06581897288560867, + "learning_rate": 9.228475587029444e-05, + "loss": 0.0087, + "step": 8670 + }, + { + "epoch": 1.6173661899659943, + "grad_norm": 0.1268416792154312, + "learning_rate": 9.216051683438937e-05, + "loss": 0.0094, + "step": 8680 + }, + { + "epoch": 1.6192295150696419, + "grad_norm": 0.08039700239896774, + "learning_rate": 9.203627779848429e-05, + "loss": 0.0132, + "step": 8690 + }, + { + "epoch": 1.6210928401732891, + "grad_norm": 0.09226547926664352, + "learning_rate": 9.191203876257922e-05, + "loss": 0.0114, + "step": 8700 + }, + { + "epoch": 1.6229561652769366, + "grad_norm": 0.15534359216690063, + "learning_rate": 9.178779972667413e-05, + "loss": 0.0154, + "step": 8710 + }, + { + "epoch": 1.6248194903805842, + "grad_norm": 0.11844916641712189, + "learning_rate": 9.166356069076905e-05, + "loss": 0.0136, + "step": 8720 + }, + { + "epoch": 1.6266828154842317, + "grad_norm": 0.08913256973028183, + "learning_rate": 9.153932165486396e-05, + "loss": 0.0162, + "step": 8730 + }, + { + "epoch": 1.6285461405878792, + "grad_norm": 0.08861401677131653, + "learning_rate": 9.141508261895888e-05, + "loss": 0.0162, + "step": 8740 + }, + { + "epoch": 1.6304094656915264, + "grad_norm": 0.0848320946097374, + "learning_rate": 9.12908435830538e-05, + "loss": 0.0143, + "step": 8750 + }, + { + "epoch": 1.632272790795174, + "grad_norm": 0.12847183644771576, + "learning_rate": 9.116660454714871e-05, + "loss": 0.0271, + "step": 8760 + }, + { + "epoch": 1.6341361158988215, + "grad_norm": 0.09224189072847366, + "learning_rate": 9.104236551124363e-05, + "loss": 0.0099, + "step": 8770 + }, + { + "epoch": 1.6359994410024687, + "grad_norm": 0.09641575813293457, + "learning_rate": 9.091812647533856e-05, + "loss": 0.0098, + "step": 8780 + }, + { + "epoch": 1.6378627661061165, + "grad_norm": 0.07251239567995071, + "learning_rate": 9.079388743943347e-05, + "loss": 0.012, + "step": 8790 + }, + { + "epoch": 1.6397260912097638, + "grad_norm": 0.06500142812728882, + "learning_rate": 9.06696484035284e-05, + "loss": 0.013, + "step": 8800 + }, + { + "epoch": 1.6415894163134113, + "grad_norm": 0.106007419526577, + "learning_rate": 9.054540936762332e-05, + "loss": 0.0072, + "step": 8810 + }, + { + "epoch": 1.6434527414170588, + "grad_norm": 0.09809418767690659, + "learning_rate": 9.042117033171823e-05, + "loss": 0.0111, + "step": 8820 + }, + { + "epoch": 1.645316066520706, + "grad_norm": 0.07123219221830368, + "learning_rate": 9.029693129581315e-05, + "loss": 0.0111, + "step": 8830 + }, + { + "epoch": 1.6471793916243538, + "grad_norm": 0.1504216492176056, + "learning_rate": 9.017269225990806e-05, + "loss": 0.0138, + "step": 8840 + }, + { + "epoch": 1.649042716728001, + "grad_norm": 0.11574763059616089, + "learning_rate": 9.004845322400298e-05, + "loss": 0.0101, + "step": 8850 + }, + { + "epoch": 1.6509060418316486, + "grad_norm": 0.09216564893722534, + "learning_rate": 8.99242141880979e-05, + "loss": 0.0118, + "step": 8860 + }, + { + "epoch": 1.652769366935296, + "grad_norm": 0.050066202878952026, + "learning_rate": 8.979997515219283e-05, + "loss": 0.0115, + "step": 8870 + }, + { + "epoch": 1.6546326920389434, + "grad_norm": 0.07031989097595215, + "learning_rate": 8.967573611628774e-05, + "loss": 0.0126, + "step": 8880 + }, + { + "epoch": 1.656496017142591, + "grad_norm": 0.05028266832232475, + "learning_rate": 8.955149708038266e-05, + "loss": 0.0138, + "step": 8890 + }, + { + "epoch": 1.6583593422462384, + "grad_norm": 0.08418164402246475, + "learning_rate": 8.942725804447757e-05, + "loss": 0.0102, + "step": 8900 + }, + { + "epoch": 1.6602226673498859, + "grad_norm": 0.07395261526107788, + "learning_rate": 8.93030190085725e-05, + "loss": 0.0123, + "step": 8910 + }, + { + "epoch": 1.6620859924535334, + "grad_norm": 0.051781509071588516, + "learning_rate": 8.917877997266742e-05, + "loss": 0.0102, + "step": 8920 + }, + { + "epoch": 1.6639493175571807, + "grad_norm": 0.12200374156236649, + "learning_rate": 8.905454093676233e-05, + "loss": 0.0091, + "step": 8930 + }, + { + "epoch": 1.6658126426608284, + "grad_norm": 0.09385799616575241, + "learning_rate": 8.893030190085725e-05, + "loss": 0.0133, + "step": 8940 + }, + { + "epoch": 1.6676759677644757, + "grad_norm": 0.10892565548419952, + "learning_rate": 8.880606286495218e-05, + "loss": 0.0091, + "step": 8950 + }, + { + "epoch": 1.6695392928681232, + "grad_norm": 0.06951303780078888, + "learning_rate": 8.86818238290471e-05, + "loss": 0.0083, + "step": 8960 + }, + { + "epoch": 1.6714026179717707, + "grad_norm": 0.09501393139362335, + "learning_rate": 8.855758479314201e-05, + "loss": 0.0145, + "step": 8970 + }, + { + "epoch": 1.673265943075418, + "grad_norm": 0.10199866443872452, + "learning_rate": 8.843334575723693e-05, + "loss": 0.0107, + "step": 8980 + }, + { + "epoch": 1.6751292681790655, + "grad_norm": 0.07333105802536011, + "learning_rate": 8.830910672133184e-05, + "loss": 0.0111, + "step": 8990 + }, + { + "epoch": 1.676992593282713, + "grad_norm": 0.07807095348834991, + "learning_rate": 8.818486768542676e-05, + "loss": 0.0119, + "step": 9000 + }, + { + "epoch": 1.6788559183863605, + "grad_norm": 0.06234030798077583, + "learning_rate": 8.806062864952169e-05, + "loss": 0.0127, + "step": 9010 + }, + { + "epoch": 1.680719243490008, + "grad_norm": 0.07110045850276947, + "learning_rate": 8.79363896136166e-05, + "loss": 0.0099, + "step": 9020 + }, + { + "epoch": 1.6825825685936553, + "grad_norm": 0.0808567926287651, + "learning_rate": 8.781215057771152e-05, + "loss": 0.0115, + "step": 9030 + }, + { + "epoch": 1.6844458936973028, + "grad_norm": 0.07250918447971344, + "learning_rate": 8.768791154180645e-05, + "loss": 0.0106, + "step": 9040 + }, + { + "epoch": 1.6863092188009503, + "grad_norm": 0.05763540416955948, + "learning_rate": 8.756367250590136e-05, + "loss": 0.0132, + "step": 9050 + }, + { + "epoch": 1.6881725439045976, + "grad_norm": 0.07025442272424698, + "learning_rate": 8.743943346999628e-05, + "loss": 0.0122, + "step": 9060 + }, + { + "epoch": 1.6900358690082453, + "grad_norm": 0.12990747392177582, + "learning_rate": 8.73151944340912e-05, + "loss": 0.0137, + "step": 9070 + }, + { + "epoch": 1.6918991941118926, + "grad_norm": 0.07888498157262802, + "learning_rate": 8.719095539818611e-05, + "loss": 0.0071, + "step": 9080 + }, + { + "epoch": 1.69376251921554, + "grad_norm": 0.05839758738875389, + "learning_rate": 8.706671636228103e-05, + "loss": 0.0122, + "step": 9090 + }, + { + "epoch": 1.6956258443191876, + "grad_norm": 0.05912087857723236, + "learning_rate": 8.694247732637594e-05, + "loss": 0.0125, + "step": 9100 + }, + { + "epoch": 1.697489169422835, + "grad_norm": 0.13204564154148102, + "learning_rate": 8.681823829047086e-05, + "loss": 0.0104, + "step": 9110 + }, + { + "epoch": 1.6993524945264826, + "grad_norm": 0.12108772248029709, + "learning_rate": 8.669399925456579e-05, + "loss": 0.0143, + "step": 9120 + }, + { + "epoch": 1.70121581963013, + "grad_norm": 0.10718534886837006, + "learning_rate": 8.656976021866072e-05, + "loss": 0.0108, + "step": 9130 + }, + { + "epoch": 1.7030791447337774, + "grad_norm": 0.06931287050247192, + "learning_rate": 8.644552118275563e-05, + "loss": 0.0108, + "step": 9140 + }, + { + "epoch": 1.704942469837425, + "grad_norm": 0.12279055267572403, + "learning_rate": 8.632128214685055e-05, + "loss": 0.0131, + "step": 9150 + }, + { + "epoch": 1.7068057949410722, + "grad_norm": 0.06814446300268173, + "learning_rate": 8.619704311094547e-05, + "loss": 0.0101, + "step": 9160 + }, + { + "epoch": 1.70866912004472, + "grad_norm": 0.043399911373853683, + "learning_rate": 8.607280407504038e-05, + "loss": 0.0077, + "step": 9170 + }, + { + "epoch": 1.7105324451483672, + "grad_norm": 0.10185623914003372, + "learning_rate": 8.59485650391353e-05, + "loss": 0.0154, + "step": 9180 + }, + { + "epoch": 1.7123957702520147, + "grad_norm": 0.0697004571557045, + "learning_rate": 8.582432600323021e-05, + "loss": 0.0089, + "step": 9190 + }, + { + "epoch": 1.7142590953556622, + "grad_norm": 0.08959820121526718, + "learning_rate": 8.570008696732513e-05, + "loss": 0.0125, + "step": 9200 + }, + { + "epoch": 1.7161224204593095, + "grad_norm": 0.07031527906656265, + "learning_rate": 8.557584793142006e-05, + "loss": 0.0068, + "step": 9210 + }, + { + "epoch": 1.7179857455629572, + "grad_norm": 0.14567942917346954, + "learning_rate": 8.545160889551497e-05, + "loss": 0.0149, + "step": 9220 + }, + { + "epoch": 1.7198490706666045, + "grad_norm": 0.05701775848865509, + "learning_rate": 8.532736985960989e-05, + "loss": 0.0128, + "step": 9230 + }, + { + "epoch": 1.721712395770252, + "grad_norm": 0.1378256380558014, + "learning_rate": 8.520313082370482e-05, + "loss": 0.0117, + "step": 9240 + }, + { + "epoch": 1.7235757208738995, + "grad_norm": 0.1644400954246521, + "learning_rate": 8.507889178779973e-05, + "loss": 0.0126, + "step": 9250 + }, + { + "epoch": 1.7254390459775468, + "grad_norm": 0.1157638356089592, + "learning_rate": 8.495465275189465e-05, + "loss": 0.0091, + "step": 9260 + }, + { + "epoch": 1.7273023710811946, + "grad_norm": 0.06827396899461746, + "learning_rate": 8.483041371598957e-05, + "loss": 0.0115, + "step": 9270 + }, + { + "epoch": 1.7291656961848418, + "grad_norm": 0.07707670331001282, + "learning_rate": 8.470617468008448e-05, + "loss": 0.0104, + "step": 9280 + }, + { + "epoch": 1.7310290212884893, + "grad_norm": 0.09398248046636581, + "learning_rate": 8.45819356441794e-05, + "loss": 0.0119, + "step": 9290 + }, + { + "epoch": 1.7328923463921369, + "grad_norm": 0.051631245762109756, + "learning_rate": 8.445769660827433e-05, + "loss": 0.0094, + "step": 9300 + }, + { + "epoch": 1.7347556714957841, + "grad_norm": 0.06436185538768768, + "learning_rate": 8.433345757236924e-05, + "loss": 0.0131, + "step": 9310 + }, + { + "epoch": 1.7366189965994316, + "grad_norm": 0.07613561302423477, + "learning_rate": 8.420921853646416e-05, + "loss": 0.0107, + "step": 9320 + }, + { + "epoch": 1.7384823217030791, + "grad_norm": 0.07821378856897354, + "learning_rate": 8.408497950055907e-05, + "loss": 0.0156, + "step": 9330 + }, + { + "epoch": 1.7403456468067267, + "grad_norm": 0.06976544111967087, + "learning_rate": 8.396074046465399e-05, + "loss": 0.0108, + "step": 9340 + }, + { + "epoch": 1.7422089719103742, + "grad_norm": 0.09211380779743195, + "learning_rate": 8.383650142874892e-05, + "loss": 0.0106, + "step": 9350 + }, + { + "epoch": 1.7440722970140214, + "grad_norm": 0.09161022305488586, + "learning_rate": 8.371226239284384e-05, + "loss": 0.0144, + "step": 9360 + }, + { + "epoch": 1.745935622117669, + "grad_norm": 0.10251525789499283, + "learning_rate": 8.358802335693875e-05, + "loss": 0.0096, + "step": 9370 + }, + { + "epoch": 1.7477989472213165, + "grad_norm": 0.09510977566242218, + "learning_rate": 8.346378432103368e-05, + "loss": 0.013, + "step": 9380 + }, + { + "epoch": 1.7496622723249637, + "grad_norm": 0.13077139854431152, + "learning_rate": 8.33395452851286e-05, + "loss": 0.0106, + "step": 9390 + }, + { + "epoch": 1.7515255974286115, + "grad_norm": 0.09187795966863632, + "learning_rate": 8.321530624922351e-05, + "loss": 0.0089, + "step": 9400 + }, + { + "epoch": 1.7533889225322588, + "grad_norm": 0.13412202894687653, + "learning_rate": 8.309106721331843e-05, + "loss": 0.0107, + "step": 9410 + }, + { + "epoch": 1.7552522476359063, + "grad_norm": 0.07389464974403381, + "learning_rate": 8.296682817741334e-05, + "loss": 0.0109, + "step": 9420 + }, + { + "epoch": 1.7571155727395538, + "grad_norm": 0.10995054244995117, + "learning_rate": 8.284258914150826e-05, + "loss": 0.0124, + "step": 9430 + }, + { + "epoch": 1.758978897843201, + "grad_norm": 0.055603329092264175, + "learning_rate": 8.271835010560318e-05, + "loss": 0.012, + "step": 9440 + }, + { + "epoch": 1.7608422229468488, + "grad_norm": 0.1301773190498352, + "learning_rate": 8.25941110696981e-05, + "loss": 0.0166, + "step": 9450 + }, + { + "epoch": 1.762705548050496, + "grad_norm": 0.1451479196548462, + "learning_rate": 8.246987203379302e-05, + "loss": 0.0164, + "step": 9460 + }, + { + "epoch": 1.7645688731541436, + "grad_norm": 0.05127796530723572, + "learning_rate": 8.234563299788795e-05, + "loss": 0.0094, + "step": 9470 + }, + { + "epoch": 1.766432198257791, + "grad_norm": 0.11545635014772415, + "learning_rate": 8.222139396198287e-05, + "loss": 0.0161, + "step": 9480 + }, + { + "epoch": 1.7682955233614384, + "grad_norm": 0.10210834443569183, + "learning_rate": 8.209715492607778e-05, + "loss": 0.0093, + "step": 9490 + }, + { + "epoch": 1.770158848465086, + "grad_norm": 0.11474502831697464, + "learning_rate": 8.19729158901727e-05, + "loss": 0.0093, + "step": 9500 + }, + { + "epoch": 1.7720221735687334, + "grad_norm": 0.07989396154880524, + "learning_rate": 8.184867685426761e-05, + "loss": 0.011, + "step": 9510 + }, + { + "epoch": 1.7738854986723809, + "grad_norm": 0.0556521899998188, + "learning_rate": 8.172443781836253e-05, + "loss": 0.0126, + "step": 9520 + }, + { + "epoch": 1.7757488237760284, + "grad_norm": 0.1595865786075592, + "learning_rate": 8.160019878245745e-05, + "loss": 0.0107, + "step": 9530 + }, + { + "epoch": 1.7776121488796757, + "grad_norm": 0.11151939630508423, + "learning_rate": 8.147595974655236e-05, + "loss": 0.0114, + "step": 9540 + }, + { + "epoch": 1.7794754739833234, + "grad_norm": 0.1147758886218071, + "learning_rate": 8.135172071064729e-05, + "loss": 0.0097, + "step": 9550 + }, + { + "epoch": 1.7813387990869707, + "grad_norm": 0.09774407744407654, + "learning_rate": 8.12274816747422e-05, + "loss": 0.0114, + "step": 9560 + }, + { + "epoch": 1.7832021241906182, + "grad_norm": 0.08929464966058731, + "learning_rate": 8.110324263883714e-05, + "loss": 0.0103, + "step": 9570 + }, + { + "epoch": 1.7850654492942657, + "grad_norm": 0.05338837951421738, + "learning_rate": 8.097900360293205e-05, + "loss": 0.0073, + "step": 9580 + }, + { + "epoch": 1.786928774397913, + "grad_norm": 0.0910460576415062, + "learning_rate": 8.085476456702697e-05, + "loss": 0.0086, + "step": 9590 + }, + { + "epoch": 1.7887920995015605, + "grad_norm": 0.12020589411258698, + "learning_rate": 8.073052553112188e-05, + "loss": 0.0104, + "step": 9600 + }, + { + "epoch": 1.790655424605208, + "grad_norm": 0.0601915568113327, + "learning_rate": 8.06062864952168e-05, + "loss": 0.0085, + "step": 9610 + }, + { + "epoch": 1.7925187497088555, + "grad_norm": 0.0929914340376854, + "learning_rate": 8.048204745931171e-05, + "loss": 0.011, + "step": 9620 + }, + { + "epoch": 1.794382074812503, + "grad_norm": 0.1210533082485199, + "learning_rate": 8.035780842340663e-05, + "loss": 0.0095, + "step": 9630 + }, + { + "epoch": 1.7962453999161503, + "grad_norm": 0.08702056854963303, + "learning_rate": 8.023356938750156e-05, + "loss": 0.0083, + "step": 9640 + }, + { + "epoch": 1.7981087250197978, + "grad_norm": 0.09298733621835709, + "learning_rate": 8.010933035159648e-05, + "loss": 0.0116, + "step": 9650 + }, + { + "epoch": 1.7999720501234453, + "grad_norm": 0.06856974214315414, + "learning_rate": 7.998509131569139e-05, + "loss": 0.0124, + "step": 9660 + }, + { + "epoch": 1.8018353752270926, + "grad_norm": 0.10700483620166779, + "learning_rate": 7.986085227978631e-05, + "loss": 0.0099, + "step": 9670 + }, + { + "epoch": 1.8036987003307403, + "grad_norm": 0.23971673846244812, + "learning_rate": 7.973661324388124e-05, + "loss": 0.013, + "step": 9680 + }, + { + "epoch": 1.8055620254343876, + "grad_norm": 0.05989585071802139, + "learning_rate": 7.961237420797615e-05, + "loss": 0.0081, + "step": 9690 + }, + { + "epoch": 1.807425350538035, + "grad_norm": 0.06265757232904434, + "learning_rate": 7.948813517207107e-05, + "loss": 0.0086, + "step": 9700 + }, + { + "epoch": 1.8092886756416826, + "grad_norm": 0.09412927180528641, + "learning_rate": 7.936389613616598e-05, + "loss": 0.0123, + "step": 9710 + }, + { + "epoch": 1.81115200074533, + "grad_norm": 0.09182179719209671, + "learning_rate": 7.923965710026091e-05, + "loss": 0.0109, + "step": 9720 + }, + { + "epoch": 1.8130153258489776, + "grad_norm": 0.06258631497621536, + "learning_rate": 7.911541806435583e-05, + "loss": 0.0112, + "step": 9730 + }, + { + "epoch": 1.814878650952625, + "grad_norm": 0.06989934295415878, + "learning_rate": 7.899117902845075e-05, + "loss": 0.0087, + "step": 9740 + }, + { + "epoch": 1.8167419760562724, + "grad_norm": 0.09968297928571701, + "learning_rate": 7.886693999254566e-05, + "loss": 0.007, + "step": 9750 + }, + { + "epoch": 1.81860530115992, + "grad_norm": 0.07616010308265686, + "learning_rate": 7.874270095664058e-05, + "loss": 0.009, + "step": 9760 + }, + { + "epoch": 1.8204686262635672, + "grad_norm": 0.055172596126794815, + "learning_rate": 7.861846192073549e-05, + "loss": 0.0123, + "step": 9770 + }, + { + "epoch": 1.822331951367215, + "grad_norm": 0.09393038600683212, + "learning_rate": 7.849422288483041e-05, + "loss": 0.0096, + "step": 9780 + }, + { + "epoch": 1.8241952764708622, + "grad_norm": 0.04935317859053612, + "learning_rate": 7.836998384892534e-05, + "loss": 0.0097, + "step": 9790 + }, + { + "epoch": 1.8260586015745097, + "grad_norm": 0.08303016424179077, + "learning_rate": 7.824574481302025e-05, + "loss": 0.0092, + "step": 9800 + }, + { + "epoch": 1.8279219266781572, + "grad_norm": 0.07915756106376648, + "learning_rate": 7.812150577711518e-05, + "loss": 0.011, + "step": 9810 + }, + { + "epoch": 1.8297852517818045, + "grad_norm": 0.08060972392559052, + "learning_rate": 7.79972667412101e-05, + "loss": 0.0127, + "step": 9820 + }, + { + "epoch": 1.8316485768854522, + "grad_norm": 0.06542019546031952, + "learning_rate": 7.787302770530501e-05, + "loss": 0.0078, + "step": 9830 + }, + { + "epoch": 1.8335119019890995, + "grad_norm": 0.1199030801653862, + "learning_rate": 7.774878866939993e-05, + "loss": 0.0263, + "step": 9840 + }, + { + "epoch": 1.835375227092747, + "grad_norm": 0.05325644463300705, + "learning_rate": 7.762454963349485e-05, + "loss": 0.0144, + "step": 9850 + }, + { + "epoch": 1.8372385521963945, + "grad_norm": 0.10904191434383392, + "learning_rate": 7.750031059758976e-05, + "loss": 0.0097, + "step": 9860 + }, + { + "epoch": 1.8391018773000418, + "grad_norm": 0.06219085678458214, + "learning_rate": 7.737607156168468e-05, + "loss": 0.0106, + "step": 9870 + }, + { + "epoch": 1.8409652024036895, + "grad_norm": 0.13541138172149658, + "learning_rate": 7.72518325257796e-05, + "loss": 0.0106, + "step": 9880 + }, + { + "epoch": 1.8428285275073368, + "grad_norm": 0.02818934991955757, + "learning_rate": 7.712759348987452e-05, + "loss": 0.0078, + "step": 9890 + }, + { + "epoch": 1.8446918526109843, + "grad_norm": 0.07589732110500336, + "learning_rate": 7.700335445396944e-05, + "loss": 0.0104, + "step": 9900 + }, + { + "epoch": 1.8465551777146318, + "grad_norm": 0.07196231186389923, + "learning_rate": 7.687911541806437e-05, + "loss": 0.01, + "step": 9910 + }, + { + "epoch": 1.8484185028182791, + "grad_norm": 0.09193772822618484, + "learning_rate": 7.675487638215928e-05, + "loss": 0.0093, + "step": 9920 + }, + { + "epoch": 1.8502818279219266, + "grad_norm": 0.1183035671710968, + "learning_rate": 7.66306373462542e-05, + "loss": 0.0129, + "step": 9930 + }, + { + "epoch": 1.8521451530255741, + "grad_norm": 0.19073550403118134, + "learning_rate": 7.650639831034912e-05, + "loss": 0.0133, + "step": 9940 + }, + { + "epoch": 1.8540084781292216, + "grad_norm": 0.09900416433811188, + "learning_rate": 7.638215927444403e-05, + "loss": 0.0202, + "step": 9950 + }, + { + "epoch": 1.8558718032328692, + "grad_norm": 0.1016170009970665, + "learning_rate": 7.625792023853895e-05, + "loss": 0.0104, + "step": 9960 + }, + { + "epoch": 1.8577351283365164, + "grad_norm": 0.06790608912706375, + "learning_rate": 7.613368120263386e-05, + "loss": 0.012, + "step": 9970 + }, + { + "epoch": 1.859598453440164, + "grad_norm": 0.058812689036130905, + "learning_rate": 7.600944216672879e-05, + "loss": 0.0078, + "step": 9980 + }, + { + "epoch": 1.8614617785438115, + "grad_norm": 0.1043725460767746, + "learning_rate": 7.588520313082371e-05, + "loss": 0.0076, + "step": 9990 + }, + { + "epoch": 1.8633251036474587, + "grad_norm": 0.08266306668519974, + "learning_rate": 7.576096409491862e-05, + "loss": 0.0095, + "step": 10000 + }, + { + "epoch": 1.8651884287511065, + "grad_norm": 0.27030545473098755, + "learning_rate": 7.563672505901355e-05, + "loss": 0.0129, + "step": 10010 + }, + { + "epoch": 1.8670517538547537, + "grad_norm": 0.06581460684537888, + "learning_rate": 7.551248602310847e-05, + "loss": 0.0107, + "step": 10020 + }, + { + "epoch": 1.8689150789584013, + "grad_norm": 0.046561527997255325, + "learning_rate": 7.538824698720338e-05, + "loss": 0.0115, + "step": 10030 + }, + { + "epoch": 1.8707784040620488, + "grad_norm": 0.0690365731716156, + "learning_rate": 7.52640079512983e-05, + "loss": 0.0081, + "step": 10040 + }, + { + "epoch": 1.872641729165696, + "grad_norm": 0.08930855244398117, + "learning_rate": 7.513976891539322e-05, + "loss": 0.0094, + "step": 10050 + }, + { + "epoch": 1.8745050542693438, + "grad_norm": 0.056025706231594086, + "learning_rate": 7.501552987948813e-05, + "loss": 0.0095, + "step": 10060 + }, + { + "epoch": 1.876368379372991, + "grad_norm": 0.0407416895031929, + "learning_rate": 7.489129084358306e-05, + "loss": 0.0115, + "step": 10070 + }, + { + "epoch": 1.8782317044766386, + "grad_norm": 0.09971708804368973, + "learning_rate": 7.476705180767798e-05, + "loss": 0.0104, + "step": 10080 + }, + { + "epoch": 1.880095029580286, + "grad_norm": 0.0750962570309639, + "learning_rate": 7.46428127717729e-05, + "loss": 0.0086, + "step": 10090 + }, + { + "epoch": 1.8819583546839334, + "grad_norm": 0.06815943866968155, + "learning_rate": 7.451857373586781e-05, + "loss": 0.009, + "step": 10100 + }, + { + "epoch": 1.883821679787581, + "grad_norm": 0.08812274783849716, + "learning_rate": 7.439433469996272e-05, + "loss": 0.0105, + "step": 10110 + }, + { + "epoch": 1.8856850048912284, + "grad_norm": 0.06751228123903275, + "learning_rate": 7.427009566405765e-05, + "loss": 0.007, + "step": 10120 + }, + { + "epoch": 1.8875483299948759, + "grad_norm": 0.1457456648349762, + "learning_rate": 7.414585662815257e-05, + "loss": 0.0121, + "step": 10130 + }, + { + "epoch": 1.8894116550985234, + "grad_norm": 0.05187823250889778, + "learning_rate": 7.402161759224749e-05, + "loss": 0.0112, + "step": 10140 + }, + { + "epoch": 1.8912749802021707, + "grad_norm": 0.05900781601667404, + "learning_rate": 7.389737855634242e-05, + "loss": 0.0089, + "step": 10150 + }, + { + "epoch": 1.8931383053058184, + "grad_norm": 0.09933795034885406, + "learning_rate": 7.377313952043733e-05, + "loss": 0.0102, + "step": 10160 + }, + { + "epoch": 1.8950016304094657, + "grad_norm": 0.09261105209589005, + "learning_rate": 7.364890048453225e-05, + "loss": 0.0111, + "step": 10170 + }, + { + "epoch": 1.8968649555131132, + "grad_norm": 0.0641617551445961, + "learning_rate": 7.352466144862716e-05, + "loss": 0.0082, + "step": 10180 + }, + { + "epoch": 1.8987282806167607, + "grad_norm": 0.09815941751003265, + "learning_rate": 7.340042241272208e-05, + "loss": 0.0115, + "step": 10190 + }, + { + "epoch": 1.900591605720408, + "grad_norm": 0.12402988225221634, + "learning_rate": 7.3276183376817e-05, + "loss": 0.0092, + "step": 10200 + }, + { + "epoch": 1.9024549308240555, + "grad_norm": 0.07279510796070099, + "learning_rate": 7.315194434091191e-05, + "loss": 0.0119, + "step": 10210 + }, + { + "epoch": 1.904318255927703, + "grad_norm": 0.12178485840559006, + "learning_rate": 7.302770530500683e-05, + "loss": 0.0094, + "step": 10220 + }, + { + "epoch": 1.9061815810313505, + "grad_norm": 0.07118740677833557, + "learning_rate": 7.290346626910176e-05, + "loss": 0.0117, + "step": 10230 + }, + { + "epoch": 1.908044906134998, + "grad_norm": 0.15892274677753448, + "learning_rate": 7.277922723319668e-05, + "loss": 0.0082, + "step": 10240 + }, + { + "epoch": 1.9099082312386453, + "grad_norm": 0.058475472033023834, + "learning_rate": 7.26549881972916e-05, + "loss": 0.0105, + "step": 10250 + }, + { + "epoch": 1.9117715563422928, + "grad_norm": 0.07305944710969925, + "learning_rate": 7.253074916138652e-05, + "loss": 0.0103, + "step": 10260 + }, + { + "epoch": 1.9136348814459403, + "grad_norm": 0.05083195120096207, + "learning_rate": 7.240651012548143e-05, + "loss": 0.0097, + "step": 10270 + }, + { + "epoch": 1.9154982065495876, + "grad_norm": 0.10659267753362656, + "learning_rate": 7.228227108957635e-05, + "loss": 0.0127, + "step": 10280 + }, + { + "epoch": 1.9173615316532353, + "grad_norm": 0.05836571007966995, + "learning_rate": 7.215803205367126e-05, + "loss": 0.0118, + "step": 10290 + }, + { + "epoch": 1.9192248567568826, + "grad_norm": 0.07619835436344147, + "learning_rate": 7.203379301776618e-05, + "loss": 0.0128, + "step": 10300 + }, + { + "epoch": 1.92108818186053, + "grad_norm": 0.0604756698012352, + "learning_rate": 7.19095539818611e-05, + "loss": 0.0094, + "step": 10310 + }, + { + "epoch": 1.9229515069641776, + "grad_norm": 0.08139246702194214, + "learning_rate": 7.178531494595602e-05, + "loss": 0.008, + "step": 10320 + }, + { + "epoch": 1.9248148320678249, + "grad_norm": 0.04249119386076927, + "learning_rate": 7.166107591005094e-05, + "loss": 0.0117, + "step": 10330 + }, + { + "epoch": 1.9266781571714726, + "grad_norm": 0.06616081297397614, + "learning_rate": 7.153683687414586e-05, + "loss": 0.0099, + "step": 10340 + }, + { + "epoch": 1.92854148227512, + "grad_norm": 0.1366560459136963, + "learning_rate": 7.141259783824079e-05, + "loss": 0.0097, + "step": 10350 + }, + { + "epoch": 1.9304048073787674, + "grad_norm": 0.06568538397550583, + "learning_rate": 7.12883588023357e-05, + "loss": 0.0084, + "step": 10360 + }, + { + "epoch": 1.932268132482415, + "grad_norm": 0.06210561469197273, + "learning_rate": 7.116411976643062e-05, + "loss": 0.0112, + "step": 10370 + }, + { + "epoch": 1.9341314575860622, + "grad_norm": 0.0465511716902256, + "learning_rate": 7.103988073052553e-05, + "loss": 0.0127, + "step": 10380 + }, + { + "epoch": 1.93599478268971, + "grad_norm": 0.06106647104024887, + "learning_rate": 7.091564169462045e-05, + "loss": 0.0098, + "step": 10390 + }, + { + "epoch": 1.9378581077933572, + "grad_norm": 0.05775139108300209, + "learning_rate": 7.079140265871536e-05, + "loss": 0.0098, + "step": 10400 + }, + { + "epoch": 1.9397214328970047, + "grad_norm": 0.038371358066797256, + "learning_rate": 7.06671636228103e-05, + "loss": 0.0109, + "step": 10410 + }, + { + "epoch": 1.9415847580006522, + "grad_norm": 0.0670790821313858, + "learning_rate": 7.054292458690521e-05, + "loss": 0.0111, + "step": 10420 + }, + { + "epoch": 1.9434480831042995, + "grad_norm": 0.13203051686286926, + "learning_rate": 7.041868555100013e-05, + "loss": 0.0102, + "step": 10430 + }, + { + "epoch": 1.9453114082079472, + "grad_norm": 0.06752142310142517, + "learning_rate": 7.029444651509504e-05, + "loss": 0.0147, + "step": 10440 + }, + { + "epoch": 1.9471747333115945, + "grad_norm": 0.1533273160457611, + "learning_rate": 7.017020747918997e-05, + "loss": 0.0132, + "step": 10450 + }, + { + "epoch": 1.949038058415242, + "grad_norm": 0.06826525926589966, + "learning_rate": 7.004596844328489e-05, + "loss": 0.0088, + "step": 10460 + }, + { + "epoch": 1.9509013835188895, + "grad_norm": 0.10859847068786621, + "learning_rate": 6.99217294073798e-05, + "loss": 0.0135, + "step": 10470 + }, + { + "epoch": 1.9527647086225368, + "grad_norm": 0.08629245311021805, + "learning_rate": 6.979749037147472e-05, + "loss": 0.01, + "step": 10480 + }, + { + "epoch": 1.9546280337261843, + "grad_norm": 0.08627601712942123, + "learning_rate": 6.967325133556965e-05, + "loss": 0.0083, + "step": 10490 + }, + { + "epoch": 1.9564913588298318, + "grad_norm": 0.09831984341144562, + "learning_rate": 6.954901229966456e-05, + "loss": 0.012, + "step": 10500 + }, + { + "epoch": 1.9564913588298318, + "eval_loss": 0.009778189472854137, + "eval_runtime": 3.848, + "eval_samples_per_second": 51.975, + "eval_steps_per_second": 12.994, + "step": 10500 + }, + { + "epoch": 1.9583546839334793, + "grad_norm": 0.06444848328828812, + "learning_rate": 6.942477326375948e-05, + "loss": 0.0122, + "step": 10510 + }, + { + "epoch": 1.9602180090371268, + "grad_norm": 0.06674704700708389, + "learning_rate": 6.93005342278544e-05, + "loss": 0.0075, + "step": 10520 + }, + { + "epoch": 1.9620813341407741, + "grad_norm": 0.09635625034570694, + "learning_rate": 6.917629519194931e-05, + "loss": 0.0103, + "step": 10530 + }, + { + "epoch": 1.9639446592444216, + "grad_norm": 0.08825049549341202, + "learning_rate": 6.905205615604423e-05, + "loss": 0.0132, + "step": 10540 + }, + { + "epoch": 1.9658079843480691, + "grad_norm": 0.11532425135374069, + "learning_rate": 6.892781712013914e-05, + "loss": 0.0097, + "step": 10550 + }, + { + "epoch": 1.9676713094517166, + "grad_norm": 0.06080237403512001, + "learning_rate": 6.880357808423407e-05, + "loss": 0.0073, + "step": 10560 + }, + { + "epoch": 1.9695346345553641, + "grad_norm": 0.11655959486961365, + "learning_rate": 6.867933904832899e-05, + "loss": 0.0095, + "step": 10570 + }, + { + "epoch": 1.9713979596590114, + "grad_norm": 0.12663961946964264, + "learning_rate": 6.855510001242392e-05, + "loss": 0.0091, + "step": 10580 + }, + { + "epoch": 1.973261284762659, + "grad_norm": 0.07252644747495651, + "learning_rate": 6.843086097651883e-05, + "loss": 0.0121, + "step": 10590 + }, + { + "epoch": 1.9751246098663064, + "grad_norm": 0.06399507820606232, + "learning_rate": 6.830662194061375e-05, + "loss": 0.0079, + "step": 10600 + }, + { + "epoch": 1.9769879349699537, + "grad_norm": 0.13027061522006989, + "learning_rate": 6.818238290470866e-05, + "loss": 0.0116, + "step": 10610 + }, + { + "epoch": 1.9788512600736015, + "grad_norm": 0.09459857642650604, + "learning_rate": 6.805814386880358e-05, + "loss": 0.01, + "step": 10620 + }, + { + "epoch": 1.9807145851772487, + "grad_norm": 0.06579575687646866, + "learning_rate": 6.79339048328985e-05, + "loss": 0.0105, + "step": 10630 + }, + { + "epoch": 1.9825779102808962, + "grad_norm": 0.08901679515838623, + "learning_rate": 6.780966579699341e-05, + "loss": 0.008, + "step": 10640 + }, + { + "epoch": 1.9844412353845438, + "grad_norm": 0.1649041771888733, + "learning_rate": 6.768542676108833e-05, + "loss": 0.0141, + "step": 10650 + }, + { + "epoch": 1.986304560488191, + "grad_norm": 0.052169207483530045, + "learning_rate": 6.756118772518326e-05, + "loss": 0.0104, + "step": 10660 + }, + { + "epoch": 1.9881678855918388, + "grad_norm": 0.06536156684160233, + "learning_rate": 6.743694868927817e-05, + "loss": 0.0121, + "step": 10670 + }, + { + "epoch": 1.990031210695486, + "grad_norm": 0.1085677444934845, + "learning_rate": 6.73127096533731e-05, + "loss": 0.0109, + "step": 10680 + }, + { + "epoch": 1.9918945357991336, + "grad_norm": 0.04441511258482933, + "learning_rate": 6.718847061746802e-05, + "loss": 0.007, + "step": 10690 + }, + { + "epoch": 1.993757860902781, + "grad_norm": 0.06156736612319946, + "learning_rate": 6.706423158156293e-05, + "loss": 0.008, + "step": 10700 + }, + { + "epoch": 1.9956211860064283, + "grad_norm": 0.05205359682440758, + "learning_rate": 6.693999254565785e-05, + "loss": 0.0076, + "step": 10710 + }, + { + "epoch": 1.997484511110076, + "grad_norm": 0.11469718813896179, + "learning_rate": 6.681575350975277e-05, + "loss": 0.0138, + "step": 10720 + }, + { + "epoch": 1.9993478362137234, + "grad_norm": 0.10553640872240067, + "learning_rate": 6.669151447384768e-05, + "loss": 0.0099, + "step": 10730 + }, + { + "epoch": 2.0012111613173706, + "grad_norm": 0.06447623670101166, + "learning_rate": 6.65672754379426e-05, + "loss": 0.007, + "step": 10740 + }, + { + "epoch": 2.0030744864210184, + "grad_norm": 0.102451391518116, + "learning_rate": 6.644303640203753e-05, + "loss": 0.006, + "step": 10750 + }, + { + "epoch": 2.0049378115246657, + "grad_norm": 0.030060315504670143, + "learning_rate": 6.631879736613244e-05, + "loss": 0.0116, + "step": 10760 + }, + { + "epoch": 2.0068011366283134, + "grad_norm": 0.05747315660119057, + "learning_rate": 6.619455833022736e-05, + "loss": 0.0067, + "step": 10770 + }, + { + "epoch": 2.0086644617319607, + "grad_norm": 0.09092244505882263, + "learning_rate": 6.607031929432227e-05, + "loss": 0.0068, + "step": 10780 + }, + { + "epoch": 2.010527786835608, + "grad_norm": 0.05836044251918793, + "learning_rate": 6.59460802584172e-05, + "loss": 0.0073, + "step": 10790 + }, + { + "epoch": 2.0123911119392557, + "grad_norm": 0.05856243893504143, + "learning_rate": 6.582184122251212e-05, + "loss": 0.0052, + "step": 10800 + }, + { + "epoch": 2.014254437042903, + "grad_norm": 0.08944112807512283, + "learning_rate": 6.569760218660703e-05, + "loss": 0.0072, + "step": 10810 + }, + { + "epoch": 2.0161177621465507, + "grad_norm": 0.06631997972726822, + "learning_rate": 6.557336315070195e-05, + "loss": 0.0063, + "step": 10820 + }, + { + "epoch": 2.017981087250198, + "grad_norm": 0.09783359616994858, + "learning_rate": 6.544912411479687e-05, + "loss": 0.006, + "step": 10830 + }, + { + "epoch": 2.0198444123538453, + "grad_norm": 0.1494947224855423, + "learning_rate": 6.53248850788918e-05, + "loss": 0.0071, + "step": 10840 + }, + { + "epoch": 2.021707737457493, + "grad_norm": 0.07631028443574905, + "learning_rate": 6.520064604298671e-05, + "loss": 0.0072, + "step": 10850 + }, + { + "epoch": 2.0235710625611403, + "grad_norm": 0.08964214473962784, + "learning_rate": 6.507640700708163e-05, + "loss": 0.0062, + "step": 10860 + }, + { + "epoch": 2.025434387664788, + "grad_norm": 0.0452706478536129, + "learning_rate": 6.495216797117654e-05, + "loss": 0.0061, + "step": 10870 + }, + { + "epoch": 2.0272977127684353, + "grad_norm": 0.050183720886707306, + "learning_rate": 6.482792893527146e-05, + "loss": 0.0065, + "step": 10880 + }, + { + "epoch": 2.0291610378720826, + "grad_norm": 0.06710134446620941, + "learning_rate": 6.470368989936639e-05, + "loss": 0.0041, + "step": 10890 + }, + { + "epoch": 2.0310243629757303, + "grad_norm": 0.0552317276597023, + "learning_rate": 6.45794508634613e-05, + "loss": 0.0122, + "step": 10900 + }, + { + "epoch": 2.0328876880793776, + "grad_norm": 0.07701552659273148, + "learning_rate": 6.445521182755622e-05, + "loss": 0.0061, + "step": 10910 + }, + { + "epoch": 2.0347510131830253, + "grad_norm": 0.021767565980553627, + "learning_rate": 6.433097279165115e-05, + "loss": 0.0058, + "step": 10920 + }, + { + "epoch": 2.0366143382866726, + "grad_norm": 0.07532000541687012, + "learning_rate": 6.420673375574607e-05, + "loss": 0.0065, + "step": 10930 + }, + { + "epoch": 2.03847766339032, + "grad_norm": 0.06637753546237946, + "learning_rate": 6.408249471984098e-05, + "loss": 0.011, + "step": 10940 + }, + { + "epoch": 2.0403409884939676, + "grad_norm": 0.08492667973041534, + "learning_rate": 6.39582556839359e-05, + "loss": 0.0137, + "step": 10950 + }, + { + "epoch": 2.042204313597615, + "grad_norm": 0.06736136972904205, + "learning_rate": 6.383401664803081e-05, + "loss": 0.007, + "step": 10960 + }, + { + "epoch": 2.0440676387012626, + "grad_norm": 0.14675526320934296, + "learning_rate": 6.370977761212573e-05, + "loss": 0.0077, + "step": 10970 + }, + { + "epoch": 2.04593096380491, + "grad_norm": 0.05233339965343475, + "learning_rate": 6.358553857622064e-05, + "loss": 0.007, + "step": 10980 + }, + { + "epoch": 2.047794288908557, + "grad_norm": 0.05443247780203819, + "learning_rate": 6.346129954031556e-05, + "loss": 0.0073, + "step": 10990 + }, + { + "epoch": 2.049657614012205, + "grad_norm": 0.07367470115423203, + "learning_rate": 6.333706050441049e-05, + "loss": 0.0071, + "step": 11000 + }, + { + "epoch": 2.051520939115852, + "grad_norm": 0.04883608594536781, + "learning_rate": 6.321282146850542e-05, + "loss": 0.0063, + "step": 11010 + }, + { + "epoch": 2.0533842642194995, + "grad_norm": 0.06024446710944176, + "learning_rate": 6.308858243260033e-05, + "loss": 0.0078, + "step": 11020 + }, + { + "epoch": 2.055247589323147, + "grad_norm": 0.11617977172136307, + "learning_rate": 6.296434339669525e-05, + "loss": 0.0065, + "step": 11030 + }, + { + "epoch": 2.0571109144267945, + "grad_norm": 0.07751832902431488, + "learning_rate": 6.284010436079017e-05, + "loss": 0.0064, + "step": 11040 + }, + { + "epoch": 2.0589742395304422, + "grad_norm": 0.07423734664916992, + "learning_rate": 6.271586532488508e-05, + "loss": 0.0053, + "step": 11050 + }, + { + "epoch": 2.0608375646340895, + "grad_norm": 0.11013174802064896, + "learning_rate": 6.259162628898e-05, + "loss": 0.0081, + "step": 11060 + }, + { + "epoch": 2.062700889737737, + "grad_norm": 0.03833114728331566, + "learning_rate": 6.246738725307491e-05, + "loss": 0.0064, + "step": 11070 + }, + { + "epoch": 2.0645642148413845, + "grad_norm": 0.13632163405418396, + "learning_rate": 6.234314821716983e-05, + "loss": 0.0067, + "step": 11080 + }, + { + "epoch": 2.066427539945032, + "grad_norm": 0.05485612899065018, + "learning_rate": 6.221890918126476e-05, + "loss": 0.0069, + "step": 11090 + }, + { + "epoch": 2.0682908650486795, + "grad_norm": 0.1530752331018448, + "learning_rate": 6.209467014535967e-05, + "loss": 0.0052, + "step": 11100 + }, + { + "epoch": 2.070154190152327, + "grad_norm": 0.06478920578956604, + "learning_rate": 6.197043110945459e-05, + "loss": 0.0069, + "step": 11110 + }, + { + "epoch": 2.072017515255974, + "grad_norm": 0.050055887550115585, + "learning_rate": 6.184619207354952e-05, + "loss": 0.0072, + "step": 11120 + }, + { + "epoch": 2.073880840359622, + "grad_norm": 0.06400181353092194, + "learning_rate": 6.172195303764444e-05, + "loss": 0.0111, + "step": 11130 + }, + { + "epoch": 2.075744165463269, + "grad_norm": 0.07093937695026398, + "learning_rate": 6.159771400173935e-05, + "loss": 0.0073, + "step": 11140 + }, + { + "epoch": 2.077607490566917, + "grad_norm": 0.07676272094249725, + "learning_rate": 6.147347496583427e-05, + "loss": 0.0062, + "step": 11150 + }, + { + "epoch": 2.079470815670564, + "grad_norm": 0.06639111787080765, + "learning_rate": 6.134923592992918e-05, + "loss": 0.0098, + "step": 11160 + }, + { + "epoch": 2.0813341407742114, + "grad_norm": 0.14330090582370758, + "learning_rate": 6.12249968940241e-05, + "loss": 0.0083, + "step": 11170 + }, + { + "epoch": 2.083197465877859, + "grad_norm": 0.05557774752378464, + "learning_rate": 6.110075785811903e-05, + "loss": 0.006, + "step": 11180 + }, + { + "epoch": 2.0850607909815064, + "grad_norm": 0.07704410701990128, + "learning_rate": 6.0976518822213944e-05, + "loss": 0.0081, + "step": 11190 + }, + { + "epoch": 2.086924116085154, + "grad_norm": 0.1603652834892273, + "learning_rate": 6.0852279786308867e-05, + "loss": 0.0079, + "step": 11200 + }, + { + "epoch": 2.0887874411888014, + "grad_norm": 0.04601491987705231, + "learning_rate": 6.072804075040378e-05, + "loss": 0.0075, + "step": 11210 + }, + { + "epoch": 2.0906507662924487, + "grad_norm": 0.13114094734191895, + "learning_rate": 6.06038017144987e-05, + "loss": 0.0095, + "step": 11220 + }, + { + "epoch": 2.0925140913960965, + "grad_norm": 0.04922521114349365, + "learning_rate": 6.0479562678593614e-05, + "loss": 0.009, + "step": 11230 + }, + { + "epoch": 2.0943774164997437, + "grad_norm": 0.06616795063018799, + "learning_rate": 6.035532364268853e-05, + "loss": 0.0077, + "step": 11240 + }, + { + "epoch": 2.0962407416033915, + "grad_norm": 0.06306971609592438, + "learning_rate": 6.023108460678345e-05, + "loss": 0.0079, + "step": 11250 + }, + { + "epoch": 2.0981040667070388, + "grad_norm": 0.04845070093870163, + "learning_rate": 6.010684557087837e-05, + "loss": 0.0078, + "step": 11260 + }, + { + "epoch": 2.099967391810686, + "grad_norm": 0.047751642763614655, + "learning_rate": 5.99826065349733e-05, + "loss": 0.0083, + "step": 11270 + }, + { + "epoch": 2.1018307169143338, + "grad_norm": 0.07718124240636826, + "learning_rate": 5.9858367499068213e-05, + "loss": 0.0072, + "step": 11280 + }, + { + "epoch": 2.103694042017981, + "grad_norm": 0.13909615576267242, + "learning_rate": 5.973412846316313e-05, + "loss": 0.0064, + "step": 11290 + }, + { + "epoch": 2.1055573671216283, + "grad_norm": 0.10713133960962296, + "learning_rate": 5.9609889427258045e-05, + "loss": 0.0074, + "step": 11300 + }, + { + "epoch": 2.107420692225276, + "grad_norm": 0.08075924217700958, + "learning_rate": 5.948565039135297e-05, + "loss": 0.0075, + "step": 11310 + }, + { + "epoch": 2.1092840173289233, + "grad_norm": 0.0693148747086525, + "learning_rate": 5.9361411355447883e-05, + "loss": 0.0053, + "step": 11320 + }, + { + "epoch": 2.111147342432571, + "grad_norm": 0.08559976518154144, + "learning_rate": 5.92371723195428e-05, + "loss": 0.0088, + "step": 11330 + }, + { + "epoch": 2.1130106675362184, + "grad_norm": 0.05133243277668953, + "learning_rate": 5.9112933283637715e-05, + "loss": 0.0067, + "step": 11340 + }, + { + "epoch": 2.1148739926398656, + "grad_norm": 0.08110009133815765, + "learning_rate": 5.8988694247732644e-05, + "loss": 0.0061, + "step": 11350 + }, + { + "epoch": 2.1167373177435134, + "grad_norm": 0.07096893340349197, + "learning_rate": 5.886445521182756e-05, + "loss": 0.0063, + "step": 11360 + }, + { + "epoch": 2.1186006428471607, + "grad_norm": 0.06264513731002808, + "learning_rate": 5.874021617592248e-05, + "loss": 0.0057, + "step": 11370 + }, + { + "epoch": 2.1204639679508084, + "grad_norm": 0.11413908004760742, + "learning_rate": 5.86159771400174e-05, + "loss": 0.007, + "step": 11380 + }, + { + "epoch": 2.1223272930544557, + "grad_norm": 0.07597790658473969, + "learning_rate": 5.8491738104112314e-05, + "loss": 0.0068, + "step": 11390 + }, + { + "epoch": 2.124190618158103, + "grad_norm": 0.03327861428260803, + "learning_rate": 5.836749906820723e-05, + "loss": 0.0057, + "step": 11400 + }, + { + "epoch": 2.1260539432617507, + "grad_norm": 0.10454593598842621, + "learning_rate": 5.8243260032302146e-05, + "loss": 0.0105, + "step": 11410 + }, + { + "epoch": 2.127917268365398, + "grad_norm": 0.0536537803709507, + "learning_rate": 5.811902099639707e-05, + "loss": 0.007, + "step": 11420 + }, + { + "epoch": 2.1297805934690457, + "grad_norm": 0.07390698790550232, + "learning_rate": 5.7994781960491984e-05, + "loss": 0.0063, + "step": 11430 + }, + { + "epoch": 2.131643918572693, + "grad_norm": 0.060124851763248444, + "learning_rate": 5.7870542924586914e-05, + "loss": 0.0054, + "step": 11440 + }, + { + "epoch": 2.1335072436763403, + "grad_norm": 0.06745817512273788, + "learning_rate": 5.774630388868183e-05, + "loss": 0.0075, + "step": 11450 + }, + { + "epoch": 2.135370568779988, + "grad_norm": 0.04149980843067169, + "learning_rate": 5.7622064852776745e-05, + "loss": 0.0082, + "step": 11460 + }, + { + "epoch": 2.1372338938836353, + "grad_norm": 0.08865126222372055, + "learning_rate": 5.749782581687166e-05, + "loss": 0.005, + "step": 11470 + }, + { + "epoch": 2.139097218987283, + "grad_norm": 0.0498250387609005, + "learning_rate": 5.7373586780966584e-05, + "loss": 0.0061, + "step": 11480 + }, + { + "epoch": 2.1409605440909303, + "grad_norm": 0.051534079015254974, + "learning_rate": 5.72493477450615e-05, + "loss": 0.0083, + "step": 11490 + }, + { + "epoch": 2.1428238691945776, + "grad_norm": 0.07920075207948685, + "learning_rate": 5.7125108709156415e-05, + "loss": 0.0125, + "step": 11500 + }, + { + "epoch": 2.1446871942982253, + "grad_norm": 0.0594758465886116, + "learning_rate": 5.700086967325133e-05, + "loss": 0.007, + "step": 11510 + }, + { + "epoch": 2.1465505194018726, + "grad_norm": 0.0814061388373375, + "learning_rate": 5.687663063734626e-05, + "loss": 0.0073, + "step": 11520 + }, + { + "epoch": 2.1484138445055203, + "grad_norm": 0.061178576201200485, + "learning_rate": 5.6752391601441176e-05, + "loss": 0.0059, + "step": 11530 + }, + { + "epoch": 2.1502771696091676, + "grad_norm": 0.046416446566581726, + "learning_rate": 5.66281525655361e-05, + "loss": 0.0072, + "step": 11540 + }, + { + "epoch": 2.152140494712815, + "grad_norm": 0.0688127651810646, + "learning_rate": 5.6503913529631015e-05, + "loss": 0.0069, + "step": 11550 + }, + { + "epoch": 2.1540038198164626, + "grad_norm": 0.15987776219844818, + "learning_rate": 5.637967449372593e-05, + "loss": 0.0076, + "step": 11560 + }, + { + "epoch": 2.15586714492011, + "grad_norm": 0.06576576083898544, + "learning_rate": 5.6255435457820846e-05, + "loss": 0.0082, + "step": 11570 + }, + { + "epoch": 2.1577304700237576, + "grad_norm": 0.09326786547899246, + "learning_rate": 5.613119642191577e-05, + "loss": 0.0047, + "step": 11580 + }, + { + "epoch": 2.159593795127405, + "grad_norm": 0.05868074297904968, + "learning_rate": 5.6006957386010685e-05, + "loss": 0.0077, + "step": 11590 + }, + { + "epoch": 2.161457120231052, + "grad_norm": 0.03512386605143547, + "learning_rate": 5.58827183501056e-05, + "loss": 0.0074, + "step": 11600 + }, + { + "epoch": 2.1633204453347, + "grad_norm": 0.09879045188426971, + "learning_rate": 5.575847931420053e-05, + "loss": 0.0074, + "step": 11610 + }, + { + "epoch": 2.165183770438347, + "grad_norm": 0.08072607964277267, + "learning_rate": 5.5634240278295446e-05, + "loss": 0.0068, + "step": 11620 + }, + { + "epoch": 2.1670470955419945, + "grad_norm": 0.06294963508844376, + "learning_rate": 5.551000124239036e-05, + "loss": 0.0054, + "step": 11630 + }, + { + "epoch": 2.168910420645642, + "grad_norm": 0.13824103772640228, + "learning_rate": 5.5385762206485284e-05, + "loss": 0.0074, + "step": 11640 + }, + { + "epoch": 2.1707737457492895, + "grad_norm": 0.09912306070327759, + "learning_rate": 5.52615231705802e-05, + "loss": 0.0094, + "step": 11650 + }, + { + "epoch": 2.1726370708529372, + "grad_norm": 0.1159447655081749, + "learning_rate": 5.5137284134675116e-05, + "loss": 0.0061, + "step": 11660 + }, + { + "epoch": 2.1745003959565845, + "grad_norm": 0.11456414312124252, + "learning_rate": 5.501304509877003e-05, + "loss": 0.0071, + "step": 11670 + }, + { + "epoch": 2.176363721060232, + "grad_norm": 0.06627790629863739, + "learning_rate": 5.488880606286495e-05, + "loss": 0.0057, + "step": 11680 + }, + { + "epoch": 2.1782270461638795, + "grad_norm": 0.09946975111961365, + "learning_rate": 5.476456702695988e-05, + "loss": 0.0094, + "step": 11690 + }, + { + "epoch": 2.180090371267527, + "grad_norm": 0.02680755965411663, + "learning_rate": 5.46403279910548e-05, + "loss": 0.0068, + "step": 11700 + }, + { + "epoch": 2.1819536963711745, + "grad_norm": 0.10289502143859863, + "learning_rate": 5.4516088955149715e-05, + "loss": 0.006, + "step": 11710 + }, + { + "epoch": 2.183817021474822, + "grad_norm": 0.0671938881278038, + "learning_rate": 5.439184991924463e-05, + "loss": 0.0066, + "step": 11720 + }, + { + "epoch": 2.185680346578469, + "grad_norm": 0.059153467416763306, + "learning_rate": 5.426761088333955e-05, + "loss": 0.0062, + "step": 11730 + }, + { + "epoch": 2.187543671682117, + "grad_norm": 0.06136234104633331, + "learning_rate": 5.414337184743446e-05, + "loss": 0.009, + "step": 11740 + }, + { + "epoch": 2.189406996785764, + "grad_norm": 0.08328571915626526, + "learning_rate": 5.4019132811529385e-05, + "loss": 0.0072, + "step": 11750 + }, + { + "epoch": 2.191270321889412, + "grad_norm": 0.08631386607885361, + "learning_rate": 5.38948937756243e-05, + "loss": 0.0064, + "step": 11760 + }, + { + "epoch": 2.193133646993059, + "grad_norm": 0.11763385683298111, + "learning_rate": 5.377065473971922e-05, + "loss": 0.0066, + "step": 11770 + }, + { + "epoch": 2.1949969720967064, + "grad_norm": 0.0526675283908844, + "learning_rate": 5.3646415703814146e-05, + "loss": 0.0042, + "step": 11780 + }, + { + "epoch": 2.196860297200354, + "grad_norm": 0.06366570293903351, + "learning_rate": 5.352217666790906e-05, + "loss": 0.0083, + "step": 11790 + }, + { + "epoch": 2.1987236223040014, + "grad_norm": 0.05300299450755119, + "learning_rate": 5.339793763200398e-05, + "loss": 0.0058, + "step": 11800 + }, + { + "epoch": 2.200586947407649, + "grad_norm": 0.04540044814348221, + "learning_rate": 5.32736985960989e-05, + "loss": 0.0057, + "step": 11810 + }, + { + "epoch": 2.2024502725112964, + "grad_norm": 0.03275251388549805, + "learning_rate": 5.3149459560193816e-05, + "loss": 0.0075, + "step": 11820 + }, + { + "epoch": 2.2043135976149437, + "grad_norm": 0.03645419701933861, + "learning_rate": 5.302522052428873e-05, + "loss": 0.0049, + "step": 11830 + }, + { + "epoch": 2.2061769227185914, + "grad_norm": 0.09000103920698166, + "learning_rate": 5.290098148838365e-05, + "loss": 0.0075, + "step": 11840 + }, + { + "epoch": 2.2080402478222387, + "grad_norm": 0.09125344455242157, + "learning_rate": 5.2776742452478564e-05, + "loss": 0.0068, + "step": 11850 + }, + { + "epoch": 2.209903572925886, + "grad_norm": 0.06729969382286072, + "learning_rate": 5.265250341657349e-05, + "loss": 0.0049, + "step": 11860 + }, + { + "epoch": 2.2117668980295337, + "grad_norm": 0.0643710121512413, + "learning_rate": 5.2528264380668416e-05, + "loss": 0.0069, + "step": 11870 + }, + { + "epoch": 2.213630223133181, + "grad_norm": 0.0587821826338768, + "learning_rate": 5.240402534476333e-05, + "loss": 0.0077, + "step": 11880 + }, + { + "epoch": 2.2154935482368288, + "grad_norm": 0.04202594608068466, + "learning_rate": 5.227978630885825e-05, + "loss": 0.0057, + "step": 11890 + }, + { + "epoch": 2.217356873340476, + "grad_norm": 0.05304161086678505, + "learning_rate": 5.215554727295316e-05, + "loss": 0.0054, + "step": 11900 + }, + { + "epoch": 2.2192201984441233, + "grad_norm": 0.05053735896945, + "learning_rate": 5.203130823704808e-05, + "loss": 0.0067, + "step": 11910 + }, + { + "epoch": 2.221083523547771, + "grad_norm": 0.05211859941482544, + "learning_rate": 5.1907069201143e-05, + "loss": 0.0084, + "step": 11920 + }, + { + "epoch": 2.2229468486514183, + "grad_norm": 0.09272083640098572, + "learning_rate": 5.178283016523792e-05, + "loss": 0.0062, + "step": 11930 + }, + { + "epoch": 2.224810173755066, + "grad_norm": 0.05025614798069, + "learning_rate": 5.165859112933283e-05, + "loss": 0.0075, + "step": 11940 + }, + { + "epoch": 2.2266734988587134, + "grad_norm": 0.11683744937181473, + "learning_rate": 5.153435209342776e-05, + "loss": 0.0072, + "step": 11950 + }, + { + "epoch": 2.2285368239623606, + "grad_norm": 0.051841359585523605, + "learning_rate": 5.141011305752268e-05, + "loss": 0.0101, + "step": 11960 + }, + { + "epoch": 2.2304001490660084, + "grad_norm": 0.04127165675163269, + "learning_rate": 5.1285874021617594e-05, + "loss": 0.006, + "step": 11970 + }, + { + "epoch": 2.2322634741696556, + "grad_norm": 0.07321721315383911, + "learning_rate": 5.1161634985712517e-05, + "loss": 0.0062, + "step": 11980 + }, + { + "epoch": 2.2341267992733034, + "grad_norm": 0.10085262358188629, + "learning_rate": 5.103739594980743e-05, + "loss": 0.0068, + "step": 11990 + }, + { + "epoch": 2.2359901243769507, + "grad_norm": 0.07528786361217499, + "learning_rate": 5.091315691390235e-05, + "loss": 0.0082, + "step": 12000 + }, + { + "epoch": 2.237853449480598, + "grad_norm": 0.08790519088506699, + "learning_rate": 5.0788917877997264e-05, + "loss": 0.0066, + "step": 12010 + }, + { + "epoch": 2.2397167745842457, + "grad_norm": 0.0749954879283905, + "learning_rate": 5.0664678842092187e-05, + "loss": 0.0067, + "step": 12020 + }, + { + "epoch": 2.241580099687893, + "grad_norm": 0.04362763091921806, + "learning_rate": 5.05404398061871e-05, + "loss": 0.0052, + "step": 12030 + }, + { + "epoch": 2.2434434247915407, + "grad_norm": 0.09141118079423904, + "learning_rate": 5.041620077028203e-05, + "loss": 0.0061, + "step": 12040 + }, + { + "epoch": 2.245306749895188, + "grad_norm": 0.05899785831570625, + "learning_rate": 5.029196173437695e-05, + "loss": 0.0077, + "step": 12050 + }, + { + "epoch": 2.2471700749988353, + "grad_norm": 0.06144483759999275, + "learning_rate": 5.016772269847186e-05, + "loss": 0.0058, + "step": 12060 + }, + { + "epoch": 2.249033400102483, + "grad_norm": 0.06769667565822601, + "learning_rate": 5.004348366256678e-05, + "loss": 0.0062, + "step": 12070 + }, + { + "epoch": 2.2508967252061303, + "grad_norm": 0.05238569527864456, + "learning_rate": 4.99192446266617e-05, + "loss": 0.0081, + "step": 12080 + }, + { + "epoch": 2.252760050309778, + "grad_norm": 0.09564723819494247, + "learning_rate": 4.979500559075662e-05, + "loss": 0.0063, + "step": 12090 + }, + { + "epoch": 2.2546233754134253, + "grad_norm": 0.07752995193004608, + "learning_rate": 4.967076655485154e-05, + "loss": 0.0056, + "step": 12100 + }, + { + "epoch": 2.2564867005170726, + "grad_norm": 0.12467604875564575, + "learning_rate": 4.9546527518946456e-05, + "loss": 0.0074, + "step": 12110 + }, + { + "epoch": 2.2583500256207203, + "grad_norm": 0.03957875445485115, + "learning_rate": 4.942228848304137e-05, + "loss": 0.0067, + "step": 12120 + }, + { + "epoch": 2.2602133507243676, + "grad_norm": 0.09976095706224442, + "learning_rate": 4.929804944713629e-05, + "loss": 0.0076, + "step": 12130 + }, + { + "epoch": 2.2620766758280153, + "grad_norm": 0.09455903619527817, + "learning_rate": 4.917381041123122e-05, + "loss": 0.0071, + "step": 12140 + }, + { + "epoch": 2.2639400009316626, + "grad_norm": 0.15039753913879395, + "learning_rate": 4.904957137532613e-05, + "loss": 0.0078, + "step": 12150 + }, + { + "epoch": 2.26580332603531, + "grad_norm": 0.08308923989534378, + "learning_rate": 4.892533233942105e-05, + "loss": 0.0073, + "step": 12160 + }, + { + "epoch": 2.2676666511389576, + "grad_norm": 0.13981153070926666, + "learning_rate": 4.8801093303515964e-05, + "loss": 0.0102, + "step": 12170 + }, + { + "epoch": 2.269529976242605, + "grad_norm": 0.07559193670749664, + "learning_rate": 4.867685426761088e-05, + "loss": 0.0068, + "step": 12180 + }, + { + "epoch": 2.2713933013462526, + "grad_norm": 0.06467331200838089, + "learning_rate": 4.85526152317058e-05, + "loss": 0.0053, + "step": 12190 + }, + { + "epoch": 2.2732566264499, + "grad_norm": 0.04058116301894188, + "learning_rate": 4.8428376195800725e-05, + "loss": 0.007, + "step": 12200 + }, + { + "epoch": 2.275119951553547, + "grad_norm": 0.04690202698111534, + "learning_rate": 4.830413715989564e-05, + "loss": 0.0062, + "step": 12210 + }, + { + "epoch": 2.276983276657195, + "grad_norm": 0.03539729863405228, + "learning_rate": 4.817989812399056e-05, + "loss": 0.0078, + "step": 12220 + }, + { + "epoch": 2.278846601760842, + "grad_norm": 0.0975772812962532, + "learning_rate": 4.805565908808548e-05, + "loss": 0.007, + "step": 12230 + }, + { + "epoch": 2.28070992686449, + "grad_norm": 0.057454656809568405, + "learning_rate": 4.7931420052180395e-05, + "loss": 0.008, + "step": 12240 + }, + { + "epoch": 2.282573251968137, + "grad_norm": 0.0594792366027832, + "learning_rate": 4.780718101627532e-05, + "loss": 0.0066, + "step": 12250 + }, + { + "epoch": 2.2844365770717845, + "grad_norm": 0.07810505479574203, + "learning_rate": 4.7682941980370234e-05, + "loss": 0.0052, + "step": 12260 + }, + { + "epoch": 2.286299902175432, + "grad_norm": 0.1288454383611679, + "learning_rate": 4.7558702944465156e-05, + "loss": 0.0078, + "step": 12270 + }, + { + "epoch": 2.2881632272790795, + "grad_norm": 0.05404103919863701, + "learning_rate": 4.743446390856007e-05, + "loss": 0.0056, + "step": 12280 + }, + { + "epoch": 2.290026552382727, + "grad_norm": 0.09221246093511581, + "learning_rate": 4.731022487265499e-05, + "loss": 0.0034, + "step": 12290 + }, + { + "epoch": 2.2918898774863745, + "grad_norm": 0.06454599648714066, + "learning_rate": 4.718598583674991e-05, + "loss": 0.0066, + "step": 12300 + }, + { + "epoch": 2.293753202590022, + "grad_norm": 0.23072722554206848, + "learning_rate": 4.706174680084483e-05, + "loss": 0.0081, + "step": 12310 + }, + { + "epoch": 2.2956165276936695, + "grad_norm": 0.03680235520005226, + "learning_rate": 4.693750776493975e-05, + "loss": 0.0052, + "step": 12320 + }, + { + "epoch": 2.297479852797317, + "grad_norm": 0.05998710170388222, + "learning_rate": 4.6813268729034665e-05, + "loss": 0.0075, + "step": 12330 + }, + { + "epoch": 2.299343177900964, + "grad_norm": 0.15091590583324432, + "learning_rate": 4.668902969312958e-05, + "loss": 0.0079, + "step": 12340 + }, + { + "epoch": 2.301206503004612, + "grad_norm": 0.10594821721315384, + "learning_rate": 4.6564790657224496e-05, + "loss": 0.007, + "step": 12350 + }, + { + "epoch": 2.303069828108259, + "grad_norm": 0.0922156348824501, + "learning_rate": 4.6440551621319426e-05, + "loss": 0.0054, + "step": 12360 + }, + { + "epoch": 2.304933153211907, + "grad_norm": 0.08536434918642044, + "learning_rate": 4.631631258541434e-05, + "loss": 0.008, + "step": 12370 + }, + { + "epoch": 2.306796478315554, + "grad_norm": 0.05940347537398338, + "learning_rate": 4.619207354950926e-05, + "loss": 0.0062, + "step": 12380 + }, + { + "epoch": 2.3086598034192014, + "grad_norm": 0.0685829296708107, + "learning_rate": 4.606783451360417e-05, + "loss": 0.0046, + "step": 12390 + }, + { + "epoch": 2.310523128522849, + "grad_norm": 0.0558197982609272, + "learning_rate": 4.5943595477699096e-05, + "loss": 0.0073, + "step": 12400 + }, + { + "epoch": 2.3123864536264964, + "grad_norm": 0.069609135389328, + "learning_rate": 4.581935644179401e-05, + "loss": 0.0067, + "step": 12410 + }, + { + "epoch": 2.3142497787301437, + "grad_norm": 0.037157054990530014, + "learning_rate": 4.5695117405888934e-05, + "loss": 0.0111, + "step": 12420 + }, + { + "epoch": 2.3161131038337914, + "grad_norm": 0.055848293006420135, + "learning_rate": 4.557087836998385e-05, + "loss": 0.0059, + "step": 12430 + }, + { + "epoch": 2.3179764289374387, + "grad_norm": 0.09797196090221405, + "learning_rate": 4.544663933407877e-05, + "loss": 0.007, + "step": 12440 + }, + { + "epoch": 2.3198397540410864, + "grad_norm": 0.045269809663295746, + "learning_rate": 4.532240029817369e-05, + "loss": 0.0052, + "step": 12450 + }, + { + "epoch": 2.3217030791447337, + "grad_norm": 0.04363720491528511, + "learning_rate": 4.5198161262268604e-05, + "loss": 0.0077, + "step": 12460 + }, + { + "epoch": 2.323566404248381, + "grad_norm": 0.06502070277929306, + "learning_rate": 4.507392222636353e-05, + "loss": 0.0071, + "step": 12470 + }, + { + "epoch": 2.3254297293520287, + "grad_norm": 0.043661072850227356, + "learning_rate": 4.494968319045844e-05, + "loss": 0.0068, + "step": 12480 + }, + { + "epoch": 2.327293054455676, + "grad_norm": 0.06712755560874939, + "learning_rate": 4.4825444154553365e-05, + "loss": 0.0049, + "step": 12490 + }, + { + "epoch": 2.3291563795593238, + "grad_norm": 0.06757821142673492, + "learning_rate": 4.470120511864828e-05, + "loss": 0.0056, + "step": 12500 + }, + { + "epoch": 2.331019704662971, + "grad_norm": 0.09899108856916428, + "learning_rate": 4.45769660827432e-05, + "loss": 0.0059, + "step": 12510 + }, + { + "epoch": 2.3328830297666183, + "grad_norm": 0.0666062980890274, + "learning_rate": 4.445272704683812e-05, + "loss": 0.0048, + "step": 12520 + }, + { + "epoch": 2.334746354870266, + "grad_norm": 0.0807105004787445, + "learning_rate": 4.432848801093304e-05, + "loss": 0.0085, + "step": 12530 + }, + { + "epoch": 2.3366096799739133, + "grad_norm": 0.0663641020655632, + "learning_rate": 4.420424897502796e-05, + "loss": 0.0053, + "step": 12540 + }, + { + "epoch": 2.338473005077561, + "grad_norm": 0.07503866404294968, + "learning_rate": 4.4080009939122874e-05, + "loss": 0.0049, + "step": 12550 + }, + { + "epoch": 2.3403363301812083, + "grad_norm": 0.03897761553525925, + "learning_rate": 4.395577090321779e-05, + "loss": 0.0055, + "step": 12560 + }, + { + "epoch": 2.3421996552848556, + "grad_norm": 0.11116714030504227, + "learning_rate": 4.383153186731271e-05, + "loss": 0.008, + "step": 12570 + }, + { + "epoch": 2.3440629803885034, + "grad_norm": 0.052072007209062576, + "learning_rate": 4.3707292831407634e-05, + "loss": 0.0079, + "step": 12580 + }, + { + "epoch": 2.3459263054921506, + "grad_norm": 0.06437745690345764, + "learning_rate": 4.358305379550255e-05, + "loss": 0.0065, + "step": 12590 + }, + { + "epoch": 2.3477896305957984, + "grad_norm": 0.028404507786035538, + "learning_rate": 4.3458814759597466e-05, + "loss": 0.0052, + "step": 12600 + }, + { + "epoch": 2.3496529556994457, + "grad_norm": 0.10816732794046402, + "learning_rate": 4.333457572369239e-05, + "loss": 0.0111, + "step": 12610 + }, + { + "epoch": 2.351516280803093, + "grad_norm": 0.11128159612417221, + "learning_rate": 4.3210336687787305e-05, + "loss": 0.0091, + "step": 12620 + }, + { + "epoch": 2.3533796059067407, + "grad_norm": 0.046126995235681534, + "learning_rate": 4.308609765188222e-05, + "loss": 0.007, + "step": 12630 + }, + { + "epoch": 2.355242931010388, + "grad_norm": 0.03195841982960701, + "learning_rate": 4.296185861597714e-05, + "loss": 0.0081, + "step": 12640 + }, + { + "epoch": 2.3571062561140357, + "grad_norm": 0.05619229003787041, + "learning_rate": 4.283761958007206e-05, + "loss": 0.0052, + "step": 12650 + }, + { + "epoch": 2.358969581217683, + "grad_norm": 0.12804652750492096, + "learning_rate": 4.271338054416698e-05, + "loss": 0.0077, + "step": 12660 + }, + { + "epoch": 2.3608329063213302, + "grad_norm": 0.08166605234146118, + "learning_rate": 4.25891415082619e-05, + "loss": 0.0075, + "step": 12670 + }, + { + "epoch": 2.362696231424978, + "grad_norm": 0.06898462772369385, + "learning_rate": 4.246490247235681e-05, + "loss": 0.0055, + "step": 12680 + }, + { + "epoch": 2.3645595565286253, + "grad_norm": 0.12175576388835907, + "learning_rate": 4.2340663436451735e-05, + "loss": 0.0053, + "step": 12690 + }, + { + "epoch": 2.366422881632273, + "grad_norm": 0.06706222146749496, + "learning_rate": 4.221642440054666e-05, + "loss": 0.007, + "step": 12700 + }, + { + "epoch": 2.3682862067359203, + "grad_norm": 0.05738810822367668, + "learning_rate": 4.2092185364641574e-05, + "loss": 0.0054, + "step": 12710 + }, + { + "epoch": 2.3701495318395676, + "grad_norm": 0.0824676901102066, + "learning_rate": 4.196794632873649e-05, + "loss": 0.005, + "step": 12720 + }, + { + "epoch": 2.3720128569432153, + "grad_norm": 0.10526188462972641, + "learning_rate": 4.1843707292831406e-05, + "loss": 0.0071, + "step": 12730 + }, + { + "epoch": 2.3738761820468626, + "grad_norm": 0.0358128622174263, + "learning_rate": 4.171946825692633e-05, + "loss": 0.0118, + "step": 12740 + }, + { + "epoch": 2.3757395071505103, + "grad_norm": 0.07586889714002609, + "learning_rate": 4.159522922102125e-05, + "loss": 0.0062, + "step": 12750 + }, + { + "epoch": 2.3776028322541576, + "grad_norm": 0.04203260689973831, + "learning_rate": 4.1470990185116166e-05, + "loss": 0.0061, + "step": 12760 + }, + { + "epoch": 2.379466157357805, + "grad_norm": 0.061605483293533325, + "learning_rate": 4.134675114921108e-05, + "loss": 0.005, + "step": 12770 + }, + { + "epoch": 2.3813294824614526, + "grad_norm": 0.0388299860060215, + "learning_rate": 4.1222512113306005e-05, + "loss": 0.0057, + "step": 12780 + }, + { + "epoch": 2.3831928075651, + "grad_norm": 0.07655610889196396, + "learning_rate": 4.109827307740092e-05, + "loss": 0.0063, + "step": 12790 + }, + { + "epoch": 2.3850561326687476, + "grad_norm": 0.05981520935893059, + "learning_rate": 4.097403404149584e-05, + "loss": 0.0056, + "step": 12800 + }, + { + "epoch": 2.386919457772395, + "grad_norm": 0.07374788820743561, + "learning_rate": 4.084979500559076e-05, + "loss": 0.0055, + "step": 12810 + }, + { + "epoch": 2.388782782876042, + "grad_norm": 0.07417966425418854, + "learning_rate": 4.0725555969685675e-05, + "loss": 0.0048, + "step": 12820 + }, + { + "epoch": 2.39064610797969, + "grad_norm": 0.13554275035858154, + "learning_rate": 4.06013169337806e-05, + "loss": 0.0063, + "step": 12830 + }, + { + "epoch": 2.392509433083337, + "grad_norm": 0.2652145028114319, + "learning_rate": 4.047707789787551e-05, + "loss": 0.0091, + "step": 12840 + }, + { + "epoch": 2.394372758186985, + "grad_norm": 0.055324655026197433, + "learning_rate": 4.035283886197043e-05, + "loss": 0.0052, + "step": 12850 + }, + { + "epoch": 2.396236083290632, + "grad_norm": 0.07505489140748978, + "learning_rate": 4.022859982606535e-05, + "loss": 0.0063, + "step": 12860 + }, + { + "epoch": 2.3980994083942795, + "grad_norm": 0.08703186362981796, + "learning_rate": 4.0104360790160274e-05, + "loss": 0.004, + "step": 12870 + }, + { + "epoch": 2.399962733497927, + "grad_norm": 0.09088319540023804, + "learning_rate": 3.998012175425519e-05, + "loss": 0.0062, + "step": 12880 + }, + { + "epoch": 2.4018260586015745, + "grad_norm": 0.059139229357242584, + "learning_rate": 3.9855882718350106e-05, + "loss": 0.0077, + "step": 12890 + }, + { + "epoch": 2.403689383705222, + "grad_norm": 0.04634464904665947, + "learning_rate": 3.973164368244502e-05, + "loss": 0.0051, + "step": 12900 + }, + { + "epoch": 2.4055527088088695, + "grad_norm": 0.09583749622106552, + "learning_rate": 3.9607404646539944e-05, + "loss": 0.0213, + "step": 12910 + }, + { + "epoch": 2.407416033912517, + "grad_norm": 0.09496133774518967, + "learning_rate": 3.948316561063487e-05, + "loss": 0.0065, + "step": 12920 + }, + { + "epoch": 2.4092793590161645, + "grad_norm": 0.04073603078722954, + "learning_rate": 3.935892657472978e-05, + "loss": 0.006, + "step": 12930 + }, + { + "epoch": 2.411142684119812, + "grad_norm": 0.047502193599939346, + "learning_rate": 3.92346875388247e-05, + "loss": 0.0066, + "step": 12940 + }, + { + "epoch": 2.413006009223459, + "grad_norm": 0.09126313030719757, + "learning_rate": 3.9110448502919614e-05, + "loss": 0.007, + "step": 12950 + }, + { + "epoch": 2.414869334327107, + "grad_norm": 0.05916162207722664, + "learning_rate": 3.898620946701454e-05, + "loss": 0.0057, + "step": 12960 + }, + { + "epoch": 2.416732659430754, + "grad_norm": 0.285349577665329, + "learning_rate": 3.886197043110946e-05, + "loss": 0.0065, + "step": 12970 + }, + { + "epoch": 2.418595984534402, + "grad_norm": 0.03480862081050873, + "learning_rate": 3.8737731395204375e-05, + "loss": 0.0045, + "step": 12980 + }, + { + "epoch": 2.420459309638049, + "grad_norm": 0.07367054373025894, + "learning_rate": 3.861349235929929e-05, + "loss": 0.006, + "step": 12990 + }, + { + "epoch": 2.4223226347416964, + "grad_norm": 0.17037363350391388, + "learning_rate": 3.8489253323394214e-05, + "loss": 0.0052, + "step": 13000 + }, + { + "epoch": 2.424185959845344, + "grad_norm": 0.10525022447109222, + "learning_rate": 3.836501428748913e-05, + "loss": 0.0058, + "step": 13010 + }, + { + "epoch": 2.4260492849489914, + "grad_norm": 0.08784069120883942, + "learning_rate": 3.824077525158405e-05, + "loss": 0.0066, + "step": 13020 + }, + { + "epoch": 2.4279126100526387, + "grad_norm": 0.05001769959926605, + "learning_rate": 3.811653621567897e-05, + "loss": 0.0067, + "step": 13030 + }, + { + "epoch": 2.4297759351562864, + "grad_norm": 0.043823111802339554, + "learning_rate": 3.799229717977389e-05, + "loss": 0.0083, + "step": 13040 + }, + { + "epoch": 2.4316392602599337, + "grad_norm": 0.05830995738506317, + "learning_rate": 3.7868058143868806e-05, + "loss": 0.0066, + "step": 13050 + }, + { + "epoch": 2.4335025853635814, + "grad_norm": 0.0677856057882309, + "learning_rate": 3.774381910796372e-05, + "loss": 0.0062, + "step": 13060 + }, + { + "epoch": 2.4353659104672287, + "grad_norm": 0.057640671730041504, + "learning_rate": 3.761958007205864e-05, + "loss": 0.0057, + "step": 13070 + }, + { + "epoch": 2.437229235570876, + "grad_norm": 0.06286245584487915, + "learning_rate": 3.749534103615357e-05, + "loss": 0.0077, + "step": 13080 + }, + { + "epoch": 2.4390925606745237, + "grad_norm": 0.06981087476015091, + "learning_rate": 3.737110200024848e-05, + "loss": 0.0076, + "step": 13090 + }, + { + "epoch": 2.440955885778171, + "grad_norm": 0.0729713961482048, + "learning_rate": 3.72468629643434e-05, + "loss": 0.005, + "step": 13100 + }, + { + "epoch": 2.4428192108818187, + "grad_norm": 0.028910622000694275, + "learning_rate": 3.7122623928438315e-05, + "loss": 0.0037, + "step": 13110 + }, + { + "epoch": 2.444682535985466, + "grad_norm": 0.07171040773391724, + "learning_rate": 3.699838489253323e-05, + "loss": 0.0055, + "step": 13120 + }, + { + "epoch": 2.4465458610891133, + "grad_norm": 0.057173121720552444, + "learning_rate": 3.687414585662815e-05, + "loss": 0.0057, + "step": 13130 + }, + { + "epoch": 2.448409186192761, + "grad_norm": 0.07052276283502579, + "learning_rate": 3.6749906820723076e-05, + "loss": 0.0048, + "step": 13140 + }, + { + "epoch": 2.4502725112964083, + "grad_norm": 0.09433647245168686, + "learning_rate": 3.662566778481799e-05, + "loss": 0.005, + "step": 13150 + }, + { + "epoch": 2.452135836400056, + "grad_norm": 0.05447697639465332, + "learning_rate": 3.650142874891291e-05, + "loss": 0.0059, + "step": 13160 + }, + { + "epoch": 2.4539991615037033, + "grad_norm": 0.06254272907972336, + "learning_rate": 3.637718971300783e-05, + "loss": 0.0054, + "step": 13170 + }, + { + "epoch": 2.4558624866073506, + "grad_norm": 0.030393915250897408, + "learning_rate": 3.6252950677102746e-05, + "loss": 0.0189, + "step": 13180 + }, + { + "epoch": 2.4577258117109984, + "grad_norm": 0.15565629303455353, + "learning_rate": 3.612871164119767e-05, + "loss": 0.0055, + "step": 13190 + }, + { + "epoch": 2.4595891368146456, + "grad_norm": 0.06977463513612747, + "learning_rate": 3.6004472605292584e-05, + "loss": 0.0062, + "step": 13200 + }, + { + "epoch": 2.4614524619182934, + "grad_norm": 0.025816500186920166, + "learning_rate": 3.588023356938751e-05, + "loss": 0.0049, + "step": 13210 + }, + { + "epoch": 2.4633157870219407, + "grad_norm": 0.05781162530183792, + "learning_rate": 3.575599453348242e-05, + "loss": 0.0071, + "step": 13220 + }, + { + "epoch": 2.465179112125588, + "grad_norm": 0.048669327050447464, + "learning_rate": 3.563175549757734e-05, + "loss": 0.0041, + "step": 13230 + }, + { + "epoch": 2.4670424372292357, + "grad_norm": 0.03702136129140854, + "learning_rate": 3.550751646167226e-05, + "loss": 0.005, + "step": 13240 + }, + { + "epoch": 2.468905762332883, + "grad_norm": 0.08766678720712662, + "learning_rate": 3.538327742576718e-05, + "loss": 0.0067, + "step": 13250 + }, + { + "epoch": 2.4707690874365307, + "grad_norm": 0.09898655116558075, + "learning_rate": 3.52590383898621e-05, + "loss": 0.0066, + "step": 13260 + }, + { + "epoch": 2.472632412540178, + "grad_norm": 0.11770444363355637, + "learning_rate": 3.5134799353957015e-05, + "loss": 0.0052, + "step": 13270 + }, + { + "epoch": 2.4744957376438252, + "grad_norm": 0.05603978410363197, + "learning_rate": 3.501056031805193e-05, + "loss": 0.0085, + "step": 13280 + }, + { + "epoch": 2.476359062747473, + "grad_norm": 0.12456575036048889, + "learning_rate": 3.488632128214685e-05, + "loss": 0.0056, + "step": 13290 + }, + { + "epoch": 2.4782223878511203, + "grad_norm": 0.047835420817136765, + "learning_rate": 3.4762082246241776e-05, + "loss": 0.0058, + "step": 13300 + }, + { + "epoch": 2.480085712954768, + "grad_norm": 0.06229006126523018, + "learning_rate": 3.463784321033669e-05, + "loss": 0.0062, + "step": 13310 + }, + { + "epoch": 2.4819490380584153, + "grad_norm": 0.030443726107478142, + "learning_rate": 3.451360417443161e-05, + "loss": 0.0052, + "step": 13320 + }, + { + "epoch": 2.4838123631620626, + "grad_norm": 0.09114624559879303, + "learning_rate": 3.4389365138526523e-05, + "loss": 0.0066, + "step": 13330 + }, + { + "epoch": 2.4856756882657103, + "grad_norm": 0.07564748078584671, + "learning_rate": 3.4265126102621446e-05, + "loss": 0.0068, + "step": 13340 + }, + { + "epoch": 2.4875390133693576, + "grad_norm": 0.07381059229373932, + "learning_rate": 3.414088706671636e-05, + "loss": 0.0079, + "step": 13350 + }, + { + "epoch": 2.4894023384730053, + "grad_norm": 0.03531397879123688, + "learning_rate": 3.4016648030811284e-05, + "loss": 0.0056, + "step": 13360 + }, + { + "epoch": 2.4912656635766526, + "grad_norm": 0.055100537836551666, + "learning_rate": 3.38924089949062e-05, + "loss": 0.0082, + "step": 13370 + }, + { + "epoch": 2.4931289886803, + "grad_norm": 0.07642512023448944, + "learning_rate": 3.376816995900112e-05, + "loss": 0.0048, + "step": 13380 + }, + { + "epoch": 2.4949923137839476, + "grad_norm": 0.09925362467765808, + "learning_rate": 3.364393092309604e-05, + "loss": 0.007, + "step": 13390 + }, + { + "epoch": 2.496855638887595, + "grad_norm": 0.059747230261564255, + "learning_rate": 3.3519691887190954e-05, + "loss": 0.0051, + "step": 13400 + }, + { + "epoch": 2.4987189639912426, + "grad_norm": 0.10106447339057922, + "learning_rate": 3.339545285128588e-05, + "loss": 0.0071, + "step": 13410 + }, + { + "epoch": 2.50058228909489, + "grad_norm": 0.07489795237779617, + "learning_rate": 3.327121381538079e-05, + "loss": 0.0074, + "step": 13420 + }, + { + "epoch": 2.502445614198537, + "grad_norm": 0.054214879870414734, + "learning_rate": 3.3146974779475715e-05, + "loss": 0.006, + "step": 13430 + }, + { + "epoch": 2.504308939302185, + "grad_norm": 0.05765419453382492, + "learning_rate": 3.302273574357063e-05, + "loss": 0.0053, + "step": 13440 + }, + { + "epoch": 2.506172264405832, + "grad_norm": 0.038531653583049774, + "learning_rate": 3.289849670766555e-05, + "loss": 0.0067, + "step": 13450 + }, + { + "epoch": 2.50803558950948, + "grad_norm": 0.08210862427949905, + "learning_rate": 3.277425767176047e-05, + "loss": 0.0055, + "step": 13460 + }, + { + "epoch": 2.509898914613127, + "grad_norm": 0.03968588635325432, + "learning_rate": 3.265001863585539e-05, + "loss": 0.0044, + "step": 13470 + }, + { + "epoch": 2.5117622397167745, + "grad_norm": 0.0714801549911499, + "learning_rate": 3.252577959995031e-05, + "loss": 0.0073, + "step": 13480 + }, + { + "epoch": 2.513625564820422, + "grad_norm": 0.031204110011458397, + "learning_rate": 3.2401540564045224e-05, + "loss": 0.0045, + "step": 13490 + }, + { + "epoch": 2.5154888899240695, + "grad_norm": 0.06460673362016678, + "learning_rate": 3.227730152814014e-05, + "loss": 0.008, + "step": 13500 + }, + { + "epoch": 2.517352215027717, + "grad_norm": 0.05626816302537918, + "learning_rate": 3.215306249223506e-05, + "loss": 0.0068, + "step": 13510 + }, + { + "epoch": 2.5192155401313645, + "grad_norm": 0.0728488489985466, + "learning_rate": 3.2028823456329985e-05, + "loss": 0.005, + "step": 13520 + }, + { + "epoch": 2.521078865235012, + "grad_norm": 0.075608991086483, + "learning_rate": 3.19045844204249e-05, + "loss": 0.0048, + "step": 13530 + }, + { + "epoch": 2.522942190338659, + "grad_norm": 0.0798393115401268, + "learning_rate": 3.1780345384519816e-05, + "loss": 0.0056, + "step": 13540 + }, + { + "epoch": 2.524805515442307, + "grad_norm": 0.07284027338027954, + "learning_rate": 3.165610634861473e-05, + "loss": 0.0054, + "step": 13550 + }, + { + "epoch": 2.5266688405459545, + "grad_norm": 0.07089147716760635, + "learning_rate": 3.1531867312709655e-05, + "loss": 0.0067, + "step": 13560 + }, + { + "epoch": 2.528532165649602, + "grad_norm": 0.11620229482650757, + "learning_rate": 3.140762827680457e-05, + "loss": 0.0062, + "step": 13570 + }, + { + "epoch": 2.530395490753249, + "grad_norm": 0.029851248487830162, + "learning_rate": 3.128338924089949e-05, + "loss": 0.007, + "step": 13580 + }, + { + "epoch": 2.5322588158568964, + "grad_norm": 0.0742005780339241, + "learning_rate": 3.115915020499441e-05, + "loss": 0.006, + "step": 13590 + }, + { + "epoch": 2.534122140960544, + "grad_norm": 0.06323488801717758, + "learning_rate": 3.103491116908933e-05, + "loss": 0.007, + "step": 13600 + }, + { + "epoch": 2.5359854660641914, + "grad_norm": 0.03448924049735069, + "learning_rate": 3.091067213318425e-05, + "loss": 0.0072, + "step": 13610 + }, + { + "epoch": 2.537848791167839, + "grad_norm": 0.04961373656988144, + "learning_rate": 3.078643309727916e-05, + "loss": 0.0062, + "step": 13620 + }, + { + "epoch": 2.5397121162714864, + "grad_norm": 0.049638889729976654, + "learning_rate": 3.0662194061374086e-05, + "loss": 0.0047, + "step": 13630 + }, + { + "epoch": 2.5415754413751337, + "grad_norm": 0.0409548357129097, + "learning_rate": 3.053795502546901e-05, + "loss": 0.0061, + "step": 13640 + }, + { + "epoch": 2.5434387664787814, + "grad_norm": 0.07288318872451782, + "learning_rate": 3.0413715989563924e-05, + "loss": 0.006, + "step": 13650 + }, + { + "epoch": 2.5453020915824287, + "grad_norm": 0.08695163577795029, + "learning_rate": 3.028947695365884e-05, + "loss": 0.0059, + "step": 13660 + }, + { + "epoch": 2.5471654166860764, + "grad_norm": 0.06203850731253624, + "learning_rate": 3.016523791775376e-05, + "loss": 0.0066, + "step": 13670 + }, + { + "epoch": 2.5490287417897237, + "grad_norm": 0.0684581995010376, + "learning_rate": 3.0040998881848682e-05, + "loss": 0.0061, + "step": 13680 + }, + { + "epoch": 2.550892066893371, + "grad_norm": 0.0407898835837841, + "learning_rate": 2.9916759845943598e-05, + "loss": 0.0042, + "step": 13690 + }, + { + "epoch": 2.5527553919970187, + "grad_norm": 0.07625031471252441, + "learning_rate": 2.9792520810038517e-05, + "loss": 0.0059, + "step": 13700 + }, + { + "epoch": 2.554618717100666, + "grad_norm": 0.07639207690954208, + "learning_rate": 2.9668281774133433e-05, + "loss": 0.0065, + "step": 13710 + }, + { + "epoch": 2.5564820422043137, + "grad_norm": 0.09934278577566147, + "learning_rate": 2.9544042738228352e-05, + "loss": 0.0041, + "step": 13720 + }, + { + "epoch": 2.558345367307961, + "grad_norm": 0.03425678610801697, + "learning_rate": 2.9419803702323274e-05, + "loss": 0.0069, + "step": 13730 + }, + { + "epoch": 2.5602086924116083, + "grad_norm": 0.14032401144504547, + "learning_rate": 2.929556466641819e-05, + "loss": 0.0088, + "step": 13740 + }, + { + "epoch": 2.562072017515256, + "grad_norm": 0.04956042021512985, + "learning_rate": 2.917132563051311e-05, + "loss": 0.0054, + "step": 13750 + }, + { + "epoch": 2.5639353426189033, + "grad_norm": 0.013684896752238274, + "learning_rate": 2.9047086594608025e-05, + "loss": 0.0092, + "step": 13760 + }, + { + "epoch": 2.565798667722551, + "grad_norm": 0.06913986057043076, + "learning_rate": 2.8922847558702948e-05, + "loss": 0.005, + "step": 13770 + }, + { + "epoch": 2.5676619928261983, + "grad_norm": 0.06730835139751434, + "learning_rate": 2.8798608522797864e-05, + "loss": 0.0094, + "step": 13780 + }, + { + "epoch": 2.5695253179298456, + "grad_norm": 0.045726943761110306, + "learning_rate": 2.8674369486892783e-05, + "loss": 0.0043, + "step": 13790 + }, + { + "epoch": 2.5713886430334933, + "grad_norm": 0.07426123321056366, + "learning_rate": 2.85501304509877e-05, + "loss": 0.01, + "step": 13800 + }, + { + "epoch": 2.5732519681371406, + "grad_norm": 0.044499099254608154, + "learning_rate": 2.842589141508262e-05, + "loss": 0.0054, + "step": 13810 + }, + { + "epoch": 2.5751152932407884, + "grad_norm": 0.09332112967967987, + "learning_rate": 2.830165237917754e-05, + "loss": 0.0042, + "step": 13820 + }, + { + "epoch": 2.5769786183444356, + "grad_norm": 0.017959794029593468, + "learning_rate": 2.8177413343272456e-05, + "loss": 0.0052, + "step": 13830 + }, + { + "epoch": 2.578841943448083, + "grad_norm": 0.043127838522195816, + "learning_rate": 2.8053174307367375e-05, + "loss": 0.0056, + "step": 13840 + }, + { + "epoch": 2.5807052685517307, + "grad_norm": 0.07607263326644897, + "learning_rate": 2.792893527146229e-05, + "loss": 0.0041, + "step": 13850 + }, + { + "epoch": 2.582568593655378, + "grad_norm": 0.0485481321811676, + "learning_rate": 2.7804696235557214e-05, + "loss": 0.0077, + "step": 13860 + }, + { + "epoch": 2.5844319187590257, + "grad_norm": 0.06554541736841202, + "learning_rate": 2.7680457199652133e-05, + "loss": 0.0057, + "step": 13870 + }, + { + "epoch": 2.586295243862673, + "grad_norm": 0.04134681820869446, + "learning_rate": 2.755621816374705e-05, + "loss": 0.0045, + "step": 13880 + }, + { + "epoch": 2.5881585689663202, + "grad_norm": 0.052181366831064224, + "learning_rate": 2.7431979127841968e-05, + "loss": 0.0042, + "step": 13890 + }, + { + "epoch": 2.590021894069968, + "grad_norm": 0.06621751934289932, + "learning_rate": 2.730774009193689e-05, + "loss": 0.0067, + "step": 13900 + }, + { + "epoch": 2.5918852191736153, + "grad_norm": 0.056223683059215546, + "learning_rate": 2.7183501056031806e-05, + "loss": 0.0061, + "step": 13910 + }, + { + "epoch": 2.593748544277263, + "grad_norm": 0.0572345070540905, + "learning_rate": 2.7059262020126726e-05, + "loss": 0.0064, + "step": 13920 + }, + { + "epoch": 2.5956118693809103, + "grad_norm": 0.04334909841418266, + "learning_rate": 2.693502298422164e-05, + "loss": 0.0039, + "step": 13930 + }, + { + "epoch": 2.5974751944845575, + "grad_norm": 0.08207881450653076, + "learning_rate": 2.6810783948316564e-05, + "loss": 0.0056, + "step": 13940 + }, + { + "epoch": 2.5993385195882053, + "grad_norm": 0.5891373157501221, + "learning_rate": 2.6686544912411483e-05, + "loss": 0.007, + "step": 13950 + }, + { + "epoch": 2.6012018446918526, + "grad_norm": 0.041573237627744675, + "learning_rate": 2.65623058765064e-05, + "loss": 0.004, + "step": 13960 + }, + { + "epoch": 2.6030651697955003, + "grad_norm": 0.071868397295475, + "learning_rate": 2.6438066840601318e-05, + "loss": 0.0051, + "step": 13970 + }, + { + "epoch": 2.6049284948991476, + "grad_norm": 0.07241298258304596, + "learning_rate": 2.631382780469624e-05, + "loss": 0.0039, + "step": 13980 + }, + { + "epoch": 2.606791820002795, + "grad_norm": 0.0609959252178669, + "learning_rate": 2.6189588768791157e-05, + "loss": 0.0066, + "step": 13990 + }, + { + "epoch": 2.6086551451064426, + "grad_norm": 0.051202163100242615, + "learning_rate": 2.6065349732886072e-05, + "loss": 0.0064, + "step": 14000 + }, + { + "epoch": 2.61051847021009, + "grad_norm": 0.06758036464452744, + "learning_rate": 2.594111069698099e-05, + "loss": 0.0052, + "step": 14010 + }, + { + "epoch": 2.6123817953137376, + "grad_norm": 0.01979978010058403, + "learning_rate": 2.5816871661075907e-05, + "loss": 0.0047, + "step": 14020 + }, + { + "epoch": 2.614245120417385, + "grad_norm": 0.05341746285557747, + "learning_rate": 2.569263262517083e-05, + "loss": 0.0056, + "step": 14030 + }, + { + "epoch": 2.616108445521032, + "grad_norm": 0.05054103955626488, + "learning_rate": 2.556839358926575e-05, + "loss": 0.0057, + "step": 14040 + }, + { + "epoch": 2.61797177062468, + "grad_norm": 0.03094291128218174, + "learning_rate": 2.5444154553360665e-05, + "loss": 0.0041, + "step": 14050 + }, + { + "epoch": 2.619835095728327, + "grad_norm": 0.060007113963365555, + "learning_rate": 2.5319915517455584e-05, + "loss": 0.0062, + "step": 14060 + }, + { + "epoch": 2.621698420831975, + "grad_norm": 0.058343224227428436, + "learning_rate": 2.5195676481550507e-05, + "loss": 0.0069, + "step": 14070 + }, + { + "epoch": 2.623561745935622, + "grad_norm": 0.03955060616135597, + "learning_rate": 2.5071437445645423e-05, + "loss": 0.0053, + "step": 14080 + }, + { + "epoch": 2.6254250710392695, + "grad_norm": 0.08612386882305145, + "learning_rate": 2.4947198409740342e-05, + "loss": 0.0075, + "step": 14090 + }, + { + "epoch": 2.6272883961429168, + "grad_norm": 0.07171555608510971, + "learning_rate": 2.482295937383526e-05, + "loss": 0.0051, + "step": 14100 + }, + { + "epoch": 2.6291517212465645, + "grad_norm": 0.06530676782131195, + "learning_rate": 2.469872033793018e-05, + "loss": 0.0052, + "step": 14110 + }, + { + "epoch": 2.631015046350212, + "grad_norm": 0.08867678046226501, + "learning_rate": 2.45744813020251e-05, + "loss": 0.0088, + "step": 14120 + }, + { + "epoch": 2.6328783714538595, + "grad_norm": 0.05115736648440361, + "learning_rate": 2.4450242266120015e-05, + "loss": 0.0036, + "step": 14130 + }, + { + "epoch": 2.634741696557507, + "grad_norm": 0.046469271183013916, + "learning_rate": 2.4326003230214934e-05, + "loss": 0.007, + "step": 14140 + }, + { + "epoch": 2.636605021661154, + "grad_norm": 0.045745860785245895, + "learning_rate": 2.4201764194309854e-05, + "loss": 0.0055, + "step": 14150 + }, + { + "epoch": 2.638468346764802, + "grad_norm": 0.08737187087535858, + "learning_rate": 2.407752515840477e-05, + "loss": 0.0049, + "step": 14160 + }, + { + "epoch": 2.640331671868449, + "grad_norm": 0.0638500526547432, + "learning_rate": 2.3953286122499692e-05, + "loss": 0.0034, + "step": 14170 + }, + { + "epoch": 2.642194996972097, + "grad_norm": 0.04362380877137184, + "learning_rate": 2.3829047086594608e-05, + "loss": 0.004, + "step": 14180 + }, + { + "epoch": 2.644058322075744, + "grad_norm": 0.056783124804496765, + "learning_rate": 2.3704808050689527e-05, + "loss": 0.0071, + "step": 14190 + }, + { + "epoch": 2.6459216471793914, + "grad_norm": 0.050139475613832474, + "learning_rate": 2.3580569014784446e-05, + "loss": 0.0053, + "step": 14200 + }, + { + "epoch": 2.647784972283039, + "grad_norm": 0.04129103943705559, + "learning_rate": 2.3456329978879365e-05, + "loss": 0.0079, + "step": 14210 + }, + { + "epoch": 2.6496482973866864, + "grad_norm": 0.07041865587234497, + "learning_rate": 2.3332090942974285e-05, + "loss": 0.0054, + "step": 14220 + }, + { + "epoch": 2.651511622490334, + "grad_norm": 0.08971253037452698, + "learning_rate": 2.3207851907069204e-05, + "loss": 0.0053, + "step": 14230 + }, + { + "epoch": 2.6533749475939814, + "grad_norm": 0.060063548386096954, + "learning_rate": 2.308361287116412e-05, + "loss": 0.0042, + "step": 14240 + }, + { + "epoch": 2.6552382726976287, + "grad_norm": 0.02573762647807598, + "learning_rate": 2.295937383525904e-05, + "loss": 0.0041, + "step": 14250 + }, + { + "epoch": 2.6571015978012764, + "grad_norm": 0.06436144560575485, + "learning_rate": 2.2835134799353958e-05, + "loss": 0.0051, + "step": 14260 + }, + { + "epoch": 2.6589649229049237, + "grad_norm": 0.04029161483049393, + "learning_rate": 2.2710895763448877e-05, + "loss": 0.0054, + "step": 14270 + }, + { + "epoch": 2.6608282480085714, + "grad_norm": 0.12254491448402405, + "learning_rate": 2.2586656727543796e-05, + "loss": 0.0078, + "step": 14280 + }, + { + "epoch": 2.6626915731122187, + "grad_norm": 0.0719374418258667, + "learning_rate": 2.2462417691638716e-05, + "loss": 0.0096, + "step": 14290 + }, + { + "epoch": 2.664554898215866, + "grad_norm": 0.055200230330228806, + "learning_rate": 2.233817865573363e-05, + "loss": 0.0062, + "step": 14300 + }, + { + "epoch": 2.6664182233195137, + "grad_norm": 0.07559479773044586, + "learning_rate": 2.221393961982855e-05, + "loss": 0.0064, + "step": 14310 + }, + { + "epoch": 2.668281548423161, + "grad_norm": 0.08524436503648758, + "learning_rate": 2.208970058392347e-05, + "loss": 0.0043, + "step": 14320 + }, + { + "epoch": 2.6701448735268087, + "grad_norm": 0.045584194362163544, + "learning_rate": 2.196546154801839e-05, + "loss": 0.0077, + "step": 14330 + }, + { + "epoch": 2.672008198630456, + "grad_norm": 0.15084679424762726, + "learning_rate": 2.1841222512113308e-05, + "loss": 0.0062, + "step": 14340 + }, + { + "epoch": 2.6738715237341033, + "grad_norm": 0.08916497230529785, + "learning_rate": 2.1716983476208224e-05, + "loss": 0.0053, + "step": 14350 + }, + { + "epoch": 2.675734848837751, + "grad_norm": 0.047935601323843, + "learning_rate": 2.1592744440303143e-05, + "loss": 0.0049, + "step": 14360 + }, + { + "epoch": 2.6775981739413983, + "grad_norm": 0.05849865823984146, + "learning_rate": 2.1468505404398062e-05, + "loss": 0.0061, + "step": 14370 + }, + { + "epoch": 2.679461499045046, + "grad_norm": 0.03962693363428116, + "learning_rate": 2.134426636849298e-05, + "loss": 0.0039, + "step": 14380 + }, + { + "epoch": 2.6813248241486933, + "grad_norm": 0.05341466888785362, + "learning_rate": 2.12200273325879e-05, + "loss": 0.0053, + "step": 14390 + }, + { + "epoch": 2.6831881492523406, + "grad_norm": 0.04300933703780174, + "learning_rate": 2.109578829668282e-05, + "loss": 0.0043, + "step": 14400 + }, + { + "epoch": 2.6850514743559883, + "grad_norm": 0.0594099722802639, + "learning_rate": 2.0971549260777736e-05, + "loss": 0.0087, + "step": 14410 + }, + { + "epoch": 2.6869147994596356, + "grad_norm": 0.07009299099445343, + "learning_rate": 2.084731022487266e-05, + "loss": 0.0054, + "step": 14420 + }, + { + "epoch": 2.6887781245632834, + "grad_norm": 0.0624958761036396, + "learning_rate": 2.0723071188967574e-05, + "loss": 0.0048, + "step": 14430 + }, + { + "epoch": 2.6906414496669306, + "grad_norm": 0.0850406363606453, + "learning_rate": 2.0598832153062493e-05, + "loss": 0.0066, + "step": 14440 + }, + { + "epoch": 2.692504774770578, + "grad_norm": 0.07980000972747803, + "learning_rate": 2.0474593117157413e-05, + "loss": 0.0055, + "step": 14450 + }, + { + "epoch": 2.6943680998742257, + "grad_norm": 0.04403868690133095, + "learning_rate": 2.035035408125233e-05, + "loss": 0.0037, + "step": 14460 + }, + { + "epoch": 2.696231424977873, + "grad_norm": 0.06792305409908295, + "learning_rate": 2.0226115045347248e-05, + "loss": 0.0048, + "step": 14470 + }, + { + "epoch": 2.6980947500815207, + "grad_norm": 0.07273188978433609, + "learning_rate": 2.0101876009442167e-05, + "loss": 0.0044, + "step": 14480 + }, + { + "epoch": 2.699958075185168, + "grad_norm": 0.06579861044883728, + "learning_rate": 1.9977636973537086e-05, + "loss": 0.0061, + "step": 14490 + }, + { + "epoch": 2.7018214002888152, + "grad_norm": 0.036138229072093964, + "learning_rate": 1.9853397937632005e-05, + "loss": 0.0052, + "step": 14500 + }, + { + "epoch": 2.703684725392463, + "grad_norm": 0.07591810822486877, + "learning_rate": 1.9729158901726924e-05, + "loss": 0.0068, + "step": 14510 + }, + { + "epoch": 2.7055480504961102, + "grad_norm": 0.11958233267068863, + "learning_rate": 1.960491986582184e-05, + "loss": 0.0049, + "step": 14520 + }, + { + "epoch": 2.707411375599758, + "grad_norm": 0.05322478339076042, + "learning_rate": 1.9480680829916763e-05, + "loss": 0.0042, + "step": 14530 + }, + { + "epoch": 2.7092747007034053, + "grad_norm": 0.060096897184848785, + "learning_rate": 1.935644179401168e-05, + "loss": 0.0043, + "step": 14540 + }, + { + "epoch": 2.7111380258070525, + "grad_norm": 0.0774926245212555, + "learning_rate": 1.9232202758106598e-05, + "loss": 0.0058, + "step": 14550 + }, + { + "epoch": 2.7130013509107003, + "grad_norm": 0.08724630624055862, + "learning_rate": 1.9107963722201517e-05, + "loss": 0.0047, + "step": 14560 + }, + { + "epoch": 2.7148646760143476, + "grad_norm": 0.02089923806488514, + "learning_rate": 1.8983724686296436e-05, + "loss": 0.0055, + "step": 14570 + }, + { + "epoch": 2.7167280011179953, + "grad_norm": 0.04341026023030281, + "learning_rate": 1.8859485650391352e-05, + "loss": 0.0068, + "step": 14580 + }, + { + "epoch": 2.7185913262216426, + "grad_norm": 0.05886482074856758, + "learning_rate": 1.8735246614486275e-05, + "loss": 0.0041, + "step": 14590 + }, + { + "epoch": 2.72045465132529, + "grad_norm": 0.030764520168304443, + "learning_rate": 1.861100757858119e-05, + "loss": 0.0045, + "step": 14600 + }, + { + "epoch": 2.7223179764289376, + "grad_norm": 0.08881295472383499, + "learning_rate": 1.848676854267611e-05, + "loss": 0.004, + "step": 14610 + }, + { + "epoch": 2.724181301532585, + "grad_norm": 0.03445763513445854, + "learning_rate": 1.836252950677103e-05, + "loss": 0.0044, + "step": 14620 + }, + { + "epoch": 2.7260446266362326, + "grad_norm": 0.09047674387693405, + "learning_rate": 1.8238290470865945e-05, + "loss": 0.0049, + "step": 14630 + }, + { + "epoch": 2.72790795173988, + "grad_norm": 0.04898625984787941, + "learning_rate": 1.8114051434960867e-05, + "loss": 0.0056, + "step": 14640 + }, + { + "epoch": 2.729771276843527, + "grad_norm": 0.13687947392463684, + "learning_rate": 1.7989812399055783e-05, + "loss": 0.0048, + "step": 14650 + }, + { + "epoch": 2.731634601947175, + "grad_norm": 0.10672298073768616, + "learning_rate": 1.7865573363150702e-05, + "loss": 0.0096, + "step": 14660 + }, + { + "epoch": 2.733497927050822, + "grad_norm": 0.07122719287872314, + "learning_rate": 1.774133432724562e-05, + "loss": 0.005, + "step": 14670 + }, + { + "epoch": 2.73536125215447, + "grad_norm": 0.11941694468259811, + "learning_rate": 1.761709529134054e-05, + "loss": 0.0066, + "step": 14680 + }, + { + "epoch": 2.737224577258117, + "grad_norm": 0.14367525279521942, + "learning_rate": 1.749285625543546e-05, + "loss": 0.0057, + "step": 14690 + }, + { + "epoch": 2.7390879023617645, + "grad_norm": 0.09323915839195251, + "learning_rate": 1.736861721953038e-05, + "loss": 0.0053, + "step": 14700 + }, + { + "epoch": 2.7409512274654118, + "grad_norm": 0.08907133340835571, + "learning_rate": 1.7244378183625295e-05, + "loss": 0.0043, + "step": 14710 + }, + { + "epoch": 2.7428145525690595, + "grad_norm": 0.05132559314370155, + "learning_rate": 1.7120139147720214e-05, + "loss": 0.0042, + "step": 14720 + }, + { + "epoch": 2.744677877672707, + "grad_norm": 0.08204059302806854, + "learning_rate": 1.6995900111815133e-05, + "loss": 0.0074, + "step": 14730 + }, + { + "epoch": 2.7465412027763545, + "grad_norm": 0.08662567287683487, + "learning_rate": 1.6871661075910052e-05, + "loss": 0.0056, + "step": 14740 + }, + { + "epoch": 2.748404527880002, + "grad_norm": 0.08066625148057938, + "learning_rate": 1.674742204000497e-05, + "loss": 0.0063, + "step": 14750 + }, + { + "epoch": 2.750267852983649, + "grad_norm": 0.08454219996929169, + "learning_rate": 1.6623183004099887e-05, + "loss": 0.0057, + "step": 14760 + }, + { + "epoch": 2.752131178087297, + "grad_norm": 0.07747121155261993, + "learning_rate": 1.6498943968194807e-05, + "loss": 0.0039, + "step": 14770 + }, + { + "epoch": 2.753994503190944, + "grad_norm": 0.0393957644701004, + "learning_rate": 1.6374704932289726e-05, + "loss": 0.0052, + "step": 14780 + }, + { + "epoch": 2.755857828294592, + "grad_norm": 0.03469419479370117, + "learning_rate": 1.6250465896384645e-05, + "loss": 0.0052, + "step": 14790 + }, + { + "epoch": 2.757721153398239, + "grad_norm": 0.07738560438156128, + "learning_rate": 1.6126226860479564e-05, + "loss": 0.0046, + "step": 14800 + }, + { + "epoch": 2.7595844785018864, + "grad_norm": 0.06221096217632294, + "learning_rate": 1.6001987824574483e-05, + "loss": 0.0038, + "step": 14810 + }, + { + "epoch": 2.761447803605534, + "grad_norm": 0.027473099529743195, + "learning_rate": 1.58777487886694e-05, + "loss": 0.0047, + "step": 14820 + }, + { + "epoch": 2.7633111287091814, + "grad_norm": 0.05012982711195946, + "learning_rate": 1.575350975276432e-05, + "loss": 0.0046, + "step": 14830 + }, + { + "epoch": 2.765174453812829, + "grad_norm": 0.07022465020418167, + "learning_rate": 1.5629270716859238e-05, + "loss": 0.006, + "step": 14840 + }, + { + "epoch": 2.7670377789164764, + "grad_norm": 0.03442366048693657, + "learning_rate": 1.5505031680954157e-05, + "loss": 0.0034, + "step": 14850 + }, + { + "epoch": 2.7689011040201237, + "grad_norm": 0.04125374183058739, + "learning_rate": 1.5380792645049076e-05, + "loss": 0.0058, + "step": 14860 + }, + { + "epoch": 2.7707644291237714, + "grad_norm": 0.043936800211668015, + "learning_rate": 1.5256553609143995e-05, + "loss": 0.0035, + "step": 14870 + }, + { + "epoch": 2.7726277542274187, + "grad_norm": 0.03655455261468887, + "learning_rate": 1.5132314573238913e-05, + "loss": 0.0043, + "step": 14880 + }, + { + "epoch": 2.7744910793310664, + "grad_norm": 0.05578184127807617, + "learning_rate": 1.5008075537333832e-05, + "loss": 0.0038, + "step": 14890 + }, + { + "epoch": 2.7763544044347137, + "grad_norm": 0.03317585960030556, + "learning_rate": 1.488383650142875e-05, + "loss": 0.0051, + "step": 14900 + }, + { + "epoch": 2.778217729538361, + "grad_norm": 0.09801258891820908, + "learning_rate": 1.475959746552367e-05, + "loss": 0.0089, + "step": 14910 + }, + { + "epoch": 2.7800810546420087, + "grad_norm": 0.034611500799655914, + "learning_rate": 1.4635358429618588e-05, + "loss": 0.0046, + "step": 14920 + }, + { + "epoch": 2.781944379745656, + "grad_norm": 0.05122116208076477, + "learning_rate": 1.4511119393713504e-05, + "loss": 0.0064, + "step": 14930 + }, + { + "epoch": 2.7838077048493037, + "grad_norm": 0.08390213549137115, + "learning_rate": 1.4386880357808424e-05, + "loss": 0.0055, + "step": 14940 + }, + { + "epoch": 2.785671029952951, + "grad_norm": 0.071256123483181, + "learning_rate": 1.4262641321903342e-05, + "loss": 0.0057, + "step": 14950 + }, + { + "epoch": 2.7875343550565983, + "grad_norm": 0.061553046107292175, + "learning_rate": 1.4138402285998261e-05, + "loss": 0.0075, + "step": 14960 + }, + { + "epoch": 2.789397680160246, + "grad_norm": 0.06200006976723671, + "learning_rate": 1.4014163250093179e-05, + "loss": 0.0047, + "step": 14970 + }, + { + "epoch": 2.7912610052638933, + "grad_norm": 0.04005870595574379, + "learning_rate": 1.38899242141881e-05, + "loss": 0.0052, + "step": 14980 + }, + { + "epoch": 2.793124330367541, + "grad_norm": 0.04067656025290489, + "learning_rate": 1.3765685178283017e-05, + "loss": 0.0049, + "step": 14990 + }, + { + "epoch": 2.7949876554711883, + "grad_norm": 0.10033728182315826, + "learning_rate": 1.3641446142377936e-05, + "loss": 0.0068, + "step": 15000 + }, + { + "epoch": 2.7968509805748356, + "grad_norm": 0.049095358699560165, + "learning_rate": 1.3517207106472854e-05, + "loss": 0.0044, + "step": 15010 + }, + { + "epoch": 2.7987143056784833, + "grad_norm": 0.03246445581316948, + "learning_rate": 1.3392968070567775e-05, + "loss": 0.004, + "step": 15020 + }, + { + "epoch": 2.8005776307821306, + "grad_norm": 0.0769830197095871, + "learning_rate": 1.3268729034662692e-05, + "loss": 0.006, + "step": 15030 + }, + { + "epoch": 2.8024409558857784, + "grad_norm": 0.08352731913328171, + "learning_rate": 1.3144489998757611e-05, + "loss": 0.007, + "step": 15040 + }, + { + "epoch": 2.8043042809894256, + "grad_norm": 0.04997175931930542, + "learning_rate": 1.3020250962852529e-05, + "loss": 0.0038, + "step": 15050 + }, + { + "epoch": 2.806167606093073, + "grad_norm": 0.08544106036424637, + "learning_rate": 1.289601192694745e-05, + "loss": 0.0067, + "step": 15060 + }, + { + "epoch": 2.8080309311967206, + "grad_norm": 0.05177277326583862, + "learning_rate": 1.2771772891042365e-05, + "loss": 0.0038, + "step": 15070 + }, + { + "epoch": 2.809894256300368, + "grad_norm": 0.06597385555505753, + "learning_rate": 1.2647533855137283e-05, + "loss": 0.0073, + "step": 15080 + }, + { + "epoch": 2.8117575814040157, + "grad_norm": 0.047354232519865036, + "learning_rate": 1.2523294819232204e-05, + "loss": 0.0035, + "step": 15090 + }, + { + "epoch": 2.813620906507663, + "grad_norm": 0.04057525470852852, + "learning_rate": 1.2399055783327123e-05, + "loss": 0.0047, + "step": 15100 + }, + { + "epoch": 2.8154842316113102, + "grad_norm": 0.037249766290187836, + "learning_rate": 1.227481674742204e-05, + "loss": 0.0033, + "step": 15110 + }, + { + "epoch": 2.817347556714958, + "grad_norm": 0.11450658738613129, + "learning_rate": 1.215057771151696e-05, + "loss": 0.0098, + "step": 15120 + }, + { + "epoch": 2.8192108818186052, + "grad_norm": 0.03987234830856323, + "learning_rate": 1.2026338675611879e-05, + "loss": 0.0051, + "step": 15130 + }, + { + "epoch": 2.821074206922253, + "grad_norm": 0.027388526126742363, + "learning_rate": 1.1902099639706796e-05, + "loss": 0.0055, + "step": 15140 + }, + { + "epoch": 2.8229375320259003, + "grad_norm": 0.04660959541797638, + "learning_rate": 1.1777860603801714e-05, + "loss": 0.0042, + "step": 15150 + }, + { + "epoch": 2.8248008571295475, + "grad_norm": 0.0806809738278389, + "learning_rate": 1.1653621567896633e-05, + "loss": 0.0061, + "step": 15160 + }, + { + "epoch": 2.8266641822331953, + "grad_norm": 0.04064582288265228, + "learning_rate": 1.1529382531991552e-05, + "loss": 0.0063, + "step": 15170 + }, + { + "epoch": 2.8285275073368426, + "grad_norm": 0.09423641860485077, + "learning_rate": 1.140514349608647e-05, + "loss": 0.0043, + "step": 15180 + }, + { + "epoch": 2.8303908324404903, + "grad_norm": 0.03595752641558647, + "learning_rate": 1.1280904460181389e-05, + "loss": 0.0046, + "step": 15190 + }, + { + "epoch": 2.8322541575441376, + "grad_norm": 0.05157044902443886, + "learning_rate": 1.1156665424276308e-05, + "loss": 0.0049, + "step": 15200 + }, + { + "epoch": 2.834117482647785, + "grad_norm": 0.025802219286561012, + "learning_rate": 1.1032426388371227e-05, + "loss": 0.004, + "step": 15210 + }, + { + "epoch": 2.8359808077514326, + "grad_norm": 0.04284001141786575, + "learning_rate": 1.0908187352466145e-05, + "loss": 0.0049, + "step": 15220 + }, + { + "epoch": 2.83784413285508, + "grad_norm": 0.06535849720239639, + "learning_rate": 1.0783948316561064e-05, + "loss": 0.0053, + "step": 15230 + }, + { + "epoch": 2.8397074579587276, + "grad_norm": 0.034743234515190125, + "learning_rate": 1.0659709280655983e-05, + "loss": 0.0037, + "step": 15240 + }, + { + "epoch": 2.841570783062375, + "grad_norm": 0.04481494799256325, + "learning_rate": 1.0535470244750901e-05, + "loss": 0.0091, + "step": 15250 + }, + { + "epoch": 2.843434108166022, + "grad_norm": 0.027818024158477783, + "learning_rate": 1.041123120884582e-05, + "loss": 0.0056, + "step": 15260 + }, + { + "epoch": 2.84529743326967, + "grad_norm": 0.07148770242929459, + "learning_rate": 1.028699217294074e-05, + "loss": 0.0042, + "step": 15270 + }, + { + "epoch": 2.847160758373317, + "grad_norm": 0.07908165454864502, + "learning_rate": 1.0162753137035658e-05, + "loss": 0.0052, + "step": 15280 + }, + { + "epoch": 2.849024083476965, + "grad_norm": 0.0394151397049427, + "learning_rate": 1.0038514101130576e-05, + "loss": 0.005, + "step": 15290 + }, + { + "epoch": 2.850887408580612, + "grad_norm": 0.07566548883914948, + "learning_rate": 9.914275065225493e-06, + "loss": 0.0066, + "step": 15300 + }, + { + "epoch": 2.8527507336842595, + "grad_norm": 0.043212130665779114, + "learning_rate": 9.790036029320413e-06, + "loss": 0.006, + "step": 15310 + }, + { + "epoch": 2.8546140587879067, + "grad_norm": 0.042301010340452194, + "learning_rate": 9.665796993415332e-06, + "loss": 0.0042, + "step": 15320 + }, + { + "epoch": 2.8564773838915545, + "grad_norm": 0.0396822988986969, + "learning_rate": 9.54155795751025e-06, + "loss": 0.0041, + "step": 15330 + }, + { + "epoch": 2.858340708995202, + "grad_norm": 0.1604224443435669, + "learning_rate": 9.417318921605169e-06, + "loss": 0.0049, + "step": 15340 + }, + { + "epoch": 2.8602040340988495, + "grad_norm": 0.27743810415267944, + "learning_rate": 9.293079885700088e-06, + "loss": 0.0034, + "step": 15350 + }, + { + "epoch": 2.8620673592024968, + "grad_norm": 0.03571480140089989, + "learning_rate": 9.168840849795005e-06, + "loss": 0.0041, + "step": 15360 + }, + { + "epoch": 2.863930684306144, + "grad_norm": 0.057948265224695206, + "learning_rate": 9.044601813889924e-06, + "loss": 0.0063, + "step": 15370 + }, + { + "epoch": 2.865794009409792, + "grad_norm": 0.07889413088560104, + "learning_rate": 8.920362777984844e-06, + "loss": 0.0053, + "step": 15380 + }, + { + "epoch": 2.867657334513439, + "grad_norm": 0.06906258314847946, + "learning_rate": 8.796123742079763e-06, + "loss": 0.0066, + "step": 15390 + }, + { + "epoch": 2.869520659617087, + "grad_norm": 0.11181332916021347, + "learning_rate": 8.67188470617468e-06, + "loss": 0.0039, + "step": 15400 + }, + { + "epoch": 2.871383984720734, + "grad_norm": 0.049036044627428055, + "learning_rate": 8.5476456702696e-06, + "loss": 0.0044, + "step": 15410 + }, + { + "epoch": 2.8732473098243814, + "grad_norm": 0.07270175218582153, + "learning_rate": 8.423406634364519e-06, + "loss": 0.0043, + "step": 15420 + }, + { + "epoch": 2.875110634928029, + "grad_norm": 0.10137112438678741, + "learning_rate": 8.299167598459436e-06, + "loss": 0.0052, + "step": 15430 + }, + { + "epoch": 2.8769739600316764, + "grad_norm": 0.08990975469350815, + "learning_rate": 8.174928562554355e-06, + "loss": 0.0056, + "step": 15440 + }, + { + "epoch": 2.878837285135324, + "grad_norm": 0.022517943754792213, + "learning_rate": 8.050689526649273e-06, + "loss": 0.004, + "step": 15450 + }, + { + "epoch": 2.8807006102389714, + "grad_norm": 0.020834220573306084, + "learning_rate": 7.926450490744192e-06, + "loss": 0.0055, + "step": 15460 + }, + { + "epoch": 2.8825639353426187, + "grad_norm": 0.02067444659769535, + "learning_rate": 7.80221145483911e-06, + "loss": 0.005, + "step": 15470 + }, + { + "epoch": 2.8844272604462664, + "grad_norm": 0.04585389420390129, + "learning_rate": 7.677972418934029e-06, + "loss": 0.0044, + "step": 15480 + }, + { + "epoch": 2.8862905855499137, + "grad_norm": 0.09295323491096497, + "learning_rate": 7.553733383028948e-06, + "loss": 0.0066, + "step": 15490 + }, + { + "epoch": 2.8881539106535614, + "grad_norm": 0.04389965534210205, + "learning_rate": 7.429494347123866e-06, + "loss": 0.0047, + "step": 15500 + }, + { + "epoch": 2.8900172357572087, + "grad_norm": 0.08827071636915207, + "learning_rate": 7.305255311218786e-06, + "loss": 0.0091, + "step": 15510 + }, + { + "epoch": 2.891880560860856, + "grad_norm": 0.06935074180364609, + "learning_rate": 7.181016275313704e-06, + "loss": 0.0058, + "step": 15520 + }, + { + "epoch": 2.8937438859645037, + "grad_norm": 0.07258989661931992, + "learning_rate": 7.056777239408622e-06, + "loss": 0.0082, + "step": 15530 + }, + { + "epoch": 2.895607211068151, + "grad_norm": 0.07443951815366745, + "learning_rate": 6.9325382035035415e-06, + "loss": 0.0064, + "step": 15540 + }, + { + "epoch": 2.8974705361717987, + "grad_norm": 0.045825716108083725, + "learning_rate": 6.80829916759846e-06, + "loss": 0.0065, + "step": 15550 + }, + { + "epoch": 2.899333861275446, + "grad_norm": 0.062278542667627335, + "learning_rate": 6.684060131693379e-06, + "loss": 0.0052, + "step": 15560 + }, + { + "epoch": 2.9011971863790933, + "grad_norm": 0.07567639648914337, + "learning_rate": 6.559821095788297e-06, + "loss": 0.0045, + "step": 15570 + }, + { + "epoch": 2.903060511482741, + "grad_norm": 0.04492925852537155, + "learning_rate": 6.435582059883216e-06, + "loss": 0.0043, + "step": 15580 + }, + { + "epoch": 2.9049238365863883, + "grad_norm": 0.10229244828224182, + "learning_rate": 6.311343023978135e-06, + "loss": 0.0069, + "step": 15590 + }, + { + "epoch": 2.906787161690036, + "grad_norm": 0.01675078086555004, + "learning_rate": 6.187103988073053e-06, + "loss": 0.0045, + "step": 15600 + }, + { + "epoch": 2.9086504867936833, + "grad_norm": 0.03617624193429947, + "learning_rate": 6.062864952167972e-06, + "loss": 0.0041, + "step": 15610 + }, + { + "epoch": 2.9105138118973306, + "grad_norm": 0.06336949020624161, + "learning_rate": 5.93862591626289e-06, + "loss": 0.0053, + "step": 15620 + }, + { + "epoch": 2.9123771370009783, + "grad_norm": 0.09426115453243256, + "learning_rate": 5.814386880357809e-06, + "loss": 0.0061, + "step": 15630 + }, + { + "epoch": 2.9142404621046256, + "grad_norm": 0.06645146012306213, + "learning_rate": 5.690147844452727e-06, + "loss": 0.0049, + "step": 15640 + }, + { + "epoch": 2.9161037872082733, + "grad_norm": 0.06695625931024551, + "learning_rate": 5.565908808547646e-06, + "loss": 0.0036, + "step": 15650 + }, + { + "epoch": 2.9179671123119206, + "grad_norm": 0.03353876248002052, + "learning_rate": 5.441669772642564e-06, + "loss": 0.0044, + "step": 15660 + }, + { + "epoch": 2.919830437415568, + "grad_norm": 0.05638827010989189, + "learning_rate": 5.3174307367374834e-06, + "loss": 0.005, + "step": 15670 + }, + { + "epoch": 2.9216937625192156, + "grad_norm": 0.03586374223232269, + "learning_rate": 5.193191700832402e-06, + "loss": 0.0043, + "step": 15680 + }, + { + "epoch": 2.923557087622863, + "grad_norm": 0.034597247838974, + "learning_rate": 5.068952664927321e-06, + "loss": 0.0061, + "step": 15690 + }, + { + "epoch": 2.9254204127265107, + "grad_norm": 0.03682232275605202, + "learning_rate": 4.944713629022239e-06, + "loss": 0.0032, + "step": 15700 + }, + { + "epoch": 2.927283737830158, + "grad_norm": 0.05273206904530525, + "learning_rate": 4.820474593117158e-06, + "loss": 0.0044, + "step": 15710 + }, + { + "epoch": 2.9291470629338052, + "grad_norm": 0.043077465146780014, + "learning_rate": 4.696235557212076e-06, + "loss": 0.0044, + "step": 15720 + }, + { + "epoch": 2.931010388037453, + "grad_norm": 0.05343864858150482, + "learning_rate": 4.571996521306994e-06, + "loss": 0.0091, + "step": 15730 + }, + { + "epoch": 2.9328737131411002, + "grad_norm": 0.06297837197780609, + "learning_rate": 4.447757485401914e-06, + "loss": 0.0048, + "step": 15740 + }, + { + "epoch": 2.934737038244748, + "grad_norm": 0.055205777287483215, + "learning_rate": 4.323518449496832e-06, + "loss": 0.0049, + "step": 15750 + }, + { + "epoch": 2.9366003633483952, + "grad_norm": 0.05605635046958923, + "learning_rate": 4.199279413591751e-06, + "loss": 0.0044, + "step": 15760 + }, + { + "epoch": 2.9384636884520425, + "grad_norm": 0.06049589440226555, + "learning_rate": 4.0750403776866695e-06, + "loss": 0.0036, + "step": 15770 + }, + { + "epoch": 2.9403270135556903, + "grad_norm": 0.04346240684390068, + "learning_rate": 3.950801341781589e-06, + "loss": 0.0037, + "step": 15780 + }, + { + "epoch": 2.9421903386593375, + "grad_norm": 0.036925263702869415, + "learning_rate": 3.826562305876506e-06, + "loss": 0.0067, + "step": 15790 + }, + { + "epoch": 2.9440536637629853, + "grad_norm": 0.08391120284795761, + "learning_rate": 3.702323269971425e-06, + "loss": 0.006, + "step": 15800 + }, + { + "epoch": 2.9459169888666326, + "grad_norm": 0.06644539535045624, + "learning_rate": 3.5780842340663437e-06, + "loss": 0.0049, + "step": 15810 + }, + { + "epoch": 2.94778031397028, + "grad_norm": 0.14044621586799622, + "learning_rate": 3.4538451981612625e-06, + "loss": 0.0078, + "step": 15820 + }, + { + "epoch": 2.9496436390739276, + "grad_norm": 0.04912354797124863, + "learning_rate": 3.3296061622561813e-06, + "loss": 0.005, + "step": 15830 + }, + { + "epoch": 2.951506964177575, + "grad_norm": 0.030744953081011772, + "learning_rate": 3.2053671263510996e-06, + "loss": 0.0047, + "step": 15840 + }, + { + "epoch": 2.9533702892812226, + "grad_norm": 0.05932426080107689, + "learning_rate": 3.081128090446018e-06, + "loss": 0.007, + "step": 15850 + }, + { + "epoch": 2.95523361438487, + "grad_norm": 0.09052354097366333, + "learning_rate": 2.9568890545409367e-06, + "loss": 0.0049, + "step": 15860 + }, + { + "epoch": 2.957096939488517, + "grad_norm": 0.03159482032060623, + "learning_rate": 2.8326500186358555e-06, + "loss": 0.0042, + "step": 15870 + }, + { + "epoch": 2.958960264592165, + "grad_norm": 0.06023601070046425, + "learning_rate": 2.7084109827307743e-06, + "loss": 0.0044, + "step": 15880 + }, + { + "epoch": 2.960823589695812, + "grad_norm": 0.02894243411719799, + "learning_rate": 2.5841719468256926e-06, + "loss": 0.0048, + "step": 15890 + }, + { + "epoch": 2.96268691479946, + "grad_norm": 0.06545311957597733, + "learning_rate": 2.4599329109206114e-06, + "loss": 0.0041, + "step": 15900 + }, + { + "epoch": 2.964550239903107, + "grad_norm": 0.07171537727117538, + "learning_rate": 2.33569387501553e-06, + "loss": 0.0051, + "step": 15910 + }, + { + "epoch": 2.9664135650067545, + "grad_norm": 0.04423753172159195, + "learning_rate": 2.2114548391104485e-06, + "loss": 0.0047, + "step": 15920 + }, + { + "epoch": 2.9682768901104017, + "grad_norm": 0.022418806329369545, + "learning_rate": 2.0872158032053673e-06, + "loss": 0.0064, + "step": 15930 + }, + { + "epoch": 2.9701402152140495, + "grad_norm": 0.06295423209667206, + "learning_rate": 1.9629767673002857e-06, + "loss": 0.0046, + "step": 15940 + }, + { + "epoch": 2.972003540317697, + "grad_norm": 0.03230300545692444, + "learning_rate": 1.8387377313952046e-06, + "loss": 0.0038, + "step": 15950 + }, + { + "epoch": 2.9738668654213445, + "grad_norm": 0.07564916461706161, + "learning_rate": 1.714498695490123e-06, + "loss": 0.0065, + "step": 15960 + }, + { + "epoch": 2.9757301905249918, + "grad_norm": 0.037815775722265244, + "learning_rate": 1.5902596595850416e-06, + "loss": 0.005, + "step": 15970 + }, + { + "epoch": 2.977593515628639, + "grad_norm": 0.0978199765086174, + "learning_rate": 1.4660206236799603e-06, + "loss": 0.0036, + "step": 15980 + }, + { + "epoch": 2.979456840732287, + "grad_norm": 0.0537540465593338, + "learning_rate": 1.341781587774879e-06, + "loss": 0.0056, + "step": 15990 + }, + { + "epoch": 2.981320165835934, + "grad_norm": 0.05983177572488785, + "learning_rate": 1.2175425518697977e-06, + "loss": 0.0048, + "step": 16000 + }, + { + "epoch": 2.983183490939582, + "grad_norm": 0.05021652951836586, + "learning_rate": 1.0933035159647162e-06, + "loss": 0.0054, + "step": 16010 + }, + { + "epoch": 2.985046816043229, + "grad_norm": 0.05442306399345398, + "learning_rate": 9.690644800596348e-07, + "loss": 0.0054, + "step": 16020 + }, + { + "epoch": 2.9869101411468764, + "grad_norm": 0.023789361119270325, + "learning_rate": 8.448254441545534e-07, + "loss": 0.0045, + "step": 16030 + }, + { + "epoch": 2.988773466250524, + "grad_norm": 0.02481863461434841, + "learning_rate": 7.20586408249472e-07, + "loss": 0.0044, + "step": 16040 + }, + { + "epoch": 2.9906367913541714, + "grad_norm": 0.03807642310857773, + "learning_rate": 5.963473723443906e-07, + "loss": 0.0046, + "step": 16050 + }, + { + "epoch": 2.992500116457819, + "grad_norm": 0.17297309637069702, + "learning_rate": 4.7210833643930925e-07, + "loss": 0.0045, + "step": 16060 + }, + { + "epoch": 2.9943634415614664, + "grad_norm": 0.08204018324613571, + "learning_rate": 3.4786930053422787e-07, + "loss": 0.0055, + "step": 16070 + }, + { + "epoch": 2.9962267666651137, + "grad_norm": 0.07848156243562698, + "learning_rate": 2.2363026462914648e-07, + "loss": 0.004, + "step": 16080 + }, + { + "epoch": 2.9980900917687614, + "grad_norm": 0.1832299679517746, + "learning_rate": 9.93912287240651e-08, + "loss": 0.0069, + "step": 16090 + }, + { + "epoch": 2.9995807518516795, + "step": 16098, + "total_flos": 1.2695660559534653e+18, + "train_loss": 0.020097828387903093, + "train_runtime": 24374.3923, + "train_samples_per_second": 10.569, + "train_steps_per_second": 0.66 + } + ], + "logging_steps": 10, + "max_steps": 16098, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 10500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2695660559534653e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}