{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.998743529175252, "eval_steps": 500, "global_step": 4968, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016082826556767352, "grad_norm": 93.80207061767578, "learning_rate": 2.666666666666667e-06, "loss": 0.8299, "step": 10 }, { "epoch": 0.032165653113534705, "grad_norm": 50.166954040527344, "learning_rate": 5.333333333333334e-06, "loss": 0.7131, "step": 20 }, { "epoch": 0.048248479670302054, "grad_norm": 37.23706817626953, "learning_rate": 8.000000000000001e-06, "loss": 0.5976, "step": 30 }, { "epoch": 0.06433130622706941, "grad_norm": 37.21980285644531, "learning_rate": 1.0666666666666667e-05, "loss": 0.5263, "step": 40 }, { "epoch": 0.08041413278383676, "grad_norm": 29.091915130615234, "learning_rate": 1.3333333333333333e-05, "loss": 0.4731, "step": 50 }, { "epoch": 0.09649695934060411, "grad_norm": 32.472801208496094, "learning_rate": 1.6000000000000003e-05, "loss": 0.4357, "step": 60 }, { "epoch": 0.11257978589737146, "grad_norm": 29.79865264892578, "learning_rate": 1.866666666666667e-05, "loss": 0.3916, "step": 70 }, { "epoch": 0.12866261245413882, "grad_norm": 28.13816261291504, "learning_rate": 2.1333333333333335e-05, "loss": 0.3721, "step": 80 }, { "epoch": 0.14474543901090617, "grad_norm": 30.40574073791504, "learning_rate": 2.4e-05, "loss": 0.3382, "step": 90 }, { "epoch": 0.16082826556767352, "grad_norm": 30.368940353393555, "learning_rate": 2.6666666666666667e-05, "loss": 0.3207, "step": 100 }, { "epoch": 0.17691109212444087, "grad_norm": 31.629531860351562, "learning_rate": 2.9333333333333333e-05, "loss": 0.305, "step": 110 }, { "epoch": 0.19299391868120822, "grad_norm": 29.47364044189453, "learning_rate": 3.2000000000000005e-05, "loss": 0.2812, "step": 120 }, { "epoch": 0.20907674523797556, "grad_norm": 30.890962600708008, "learning_rate": 3.466666666666667e-05, "loss": 0.2665, "step": 130 }, { "epoch": 0.2251595717947429, "grad_norm": 31.893320083618164, "learning_rate": 3.733333333333334e-05, "loss": 0.2505, "step": 140 }, { "epoch": 0.2412423983515103, "grad_norm": 29.82271957397461, "learning_rate": 4e-05, "loss": 0.2404, "step": 150 }, { "epoch": 0.25732522490827764, "grad_norm": 31.970462799072266, "learning_rate": 3.9999574828039864e-05, "loss": 0.2188, "step": 160 }, { "epoch": 0.273408051465045, "grad_norm": 25.94739532470703, "learning_rate": 3.999829933023657e-05, "loss": 0.2156, "step": 170 }, { "epoch": 0.28949087802181234, "grad_norm": 32.104461669921875, "learning_rate": 3.9996173560820705e-05, "loss": 0.2064, "step": 180 }, { "epoch": 1.018596079745351, "grad_norm": 29.600008010864258, "learning_rate": 3.999319761017403e-05, "loss": 0.2122, "step": 190 }, { "epoch": 1.0722063997319484, "grad_norm": 28.94344139099121, "learning_rate": 3.998937160482562e-05, "loss": 0.1835, "step": 200 }, { "epoch": 1.1258167197185458, "grad_norm": 27.56523323059082, "learning_rate": 3.998469570744648e-05, "loss": 0.1815, "step": 210 }, { "epoch": 1.1794270397051432, "grad_norm": 28.684629440307617, "learning_rate": 3.997917011684268e-05, "loss": 0.1717, "step": 220 }, { "epoch": 1.2330373596917408, "grad_norm": 27.716796875, "learning_rate": 3.9972795067946826e-05, "loss": 0.1615, "step": 230 }, { "epoch": 1.2866476796783382, "grad_norm": 25.33133316040039, "learning_rate": 3.996557083180813e-05, "loss": 0.1585, "step": 240 }, { "epoch": 1.3402579996649355, "grad_norm": 28.910871505737305, "learning_rate": 3.9957497715580844e-05, "loss": 0.1488, "step": 250 }, { "epoch": 1.393868319651533, "grad_norm": 26.948163986206055, "learning_rate": 3.994857606251124e-05, "loss": 0.1448, "step": 260 }, { "epoch": 1.4474786396381303, "grad_norm": 26.12610626220703, "learning_rate": 3.993880625192298e-05, "loss": 0.1411, "step": 270 }, { "epoch": 1.5010889596247279, "grad_norm": 26.017358779907227, "learning_rate": 3.9928188699201035e-05, "loss": 0.1375, "step": 280 }, { "epoch": 1.554699279611325, "grad_norm": 26.367809295654297, "learning_rate": 3.991672385577396e-05, "loss": 0.1272, "step": 290 }, { "epoch": 1.6083095995979226, "grad_norm": 24.2690372467041, "learning_rate": 3.9904412209094755e-05, "loss": 0.1288, "step": 300 }, { "epoch": 1.66191991958452, "grad_norm": 29.307832717895508, "learning_rate": 3.9891254282620115e-05, "loss": 0.1321, "step": 310 }, { "epoch": 1.7155302395711174, "grad_norm": 27.43846321105957, "learning_rate": 3.9877250635788184e-05, "loss": 0.1161, "step": 320 }, { "epoch": 1.769140559557715, "grad_norm": 23.861331939697266, "learning_rate": 3.9862401863994744e-05, "loss": 0.1169, "step": 330 }, { "epoch": 1.8227508795443121, "grad_norm": 23.86732292175293, "learning_rate": 3.9846708598567956e-05, "loss": 0.1123, "step": 340 }, { "epoch": 1.8763611995309097, "grad_norm": 28.334510803222656, "learning_rate": 3.983017150674145e-05, "loss": 0.1042, "step": 350 }, { "epoch": 1.9299715195175071, "grad_norm": 26.570316314697266, "learning_rate": 3.9812791291626e-05, "loss": 0.1069, "step": 360 }, { "epoch": 1.9835818395041045, "grad_norm": 23.24406623840332, "learning_rate": 3.979456869217962e-05, "loss": 0.1074, "step": 370 }, { "epoch": 2.0316112458969604, "grad_norm": 23.662790298461914, "learning_rate": 3.977550448317615e-05, "loss": 0.1278, "step": 380 }, { "epoch": 2.077279862994149, "grad_norm": 20.232288360595703, "learning_rate": 3.97555994751723e-05, "loss": 0.1282, "step": 390 }, { "epoch": 2.1229484800913374, "grad_norm": 24.64867401123047, "learning_rate": 3.973485451447318e-05, "loss": 0.1146, "step": 400 }, { "epoch": 2.168617097188526, "grad_norm": 25.983293533325195, "learning_rate": 3.9713270483096374e-05, "loss": 0.1123, "step": 410 }, { "epoch": 2.2142857142857144, "grad_norm": 23.095712661743164, "learning_rate": 3.969084829873436e-05, "loss": 0.103, "step": 420 }, { "epoch": 2.259954331382903, "grad_norm": 23.112424850463867, "learning_rate": 3.966758891471555e-05, "loss": 0.1065, "step": 430 }, { "epoch": 2.3056229484800914, "grad_norm": 23.07062339782715, "learning_rate": 3.964349331996373e-05, "loss": 0.1021, "step": 440 }, { "epoch": 2.35129156557728, "grad_norm": 25.471771240234375, "learning_rate": 3.961856253895603e-05, "loss": 0.1064, "step": 450 }, { "epoch": 2.3969601826744684, "grad_norm": 24.417654037475586, "learning_rate": 3.959279763167935e-05, "loss": 0.0956, "step": 460 }, { "epoch": 2.442628799771657, "grad_norm": 24.067520141601562, "learning_rate": 3.9566199693585304e-05, "loss": 0.1113, "step": 470 }, { "epoch": 2.4882974168688454, "grad_norm": 23.70163345336914, "learning_rate": 3.953876985554364e-05, "loss": 0.0911, "step": 480 }, { "epoch": 2.533966033966034, "grad_norm": 23.784481048583984, "learning_rate": 3.951050928379415e-05, "loss": 0.0888, "step": 490 }, { "epoch": 2.5796346510632224, "grad_norm": 23.621828079223633, "learning_rate": 3.948141917989712e-05, "loss": 0.0904, "step": 500 }, { "epoch": 2.625303268160411, "grad_norm": 19.741653442382812, "learning_rate": 3.945150078068219e-05, "loss": 0.0879, "step": 510 }, { "epoch": 2.6709718852575994, "grad_norm": 22.743593215942383, "learning_rate": 3.9420755358195804e-05, "loss": 0.0851, "step": 520 }, { "epoch": 2.716640502354788, "grad_norm": 18.910329818725586, "learning_rate": 3.938918421964711e-05, "loss": 0.0801, "step": 530 }, { "epoch": 2.7623091194519764, "grad_norm": 22.360628128051758, "learning_rate": 3.9356788707352406e-05, "loss": 0.078, "step": 540 }, { "epoch": 2.807977736549165, "grad_norm": 24.22591209411621, "learning_rate": 3.932357019867803e-05, "loss": 0.0822, "step": 550 }, { "epoch": 2.8536463536463534, "grad_norm": 24.46196746826172, "learning_rate": 3.928953010598183e-05, "loss": 0.0695, "step": 560 }, { "epoch": 2.899314970743542, "grad_norm": 24.530746459960938, "learning_rate": 3.925466987655309e-05, "loss": 0.082, "step": 570 }, { "epoch": 2.9449835878407304, "grad_norm": 23.36806297302246, "learning_rate": 3.921899099255104e-05, "loss": 0.0751, "step": 580 }, { "epoch": 2.9906522049379194, "grad_norm": 19.65323257446289, "learning_rate": 3.918249497094176e-05, "loss": 0.07, "step": 590 }, { "epoch": 3.02390615940413, "grad_norm": 17.223087310791016, "learning_rate": 3.9145183363433777e-05, "loss": 0.0662, "step": 600 }, { "epoch": 3.05245082734936, "grad_norm": 19.922897338867188, "learning_rate": 3.9107057756411995e-05, "loss": 0.0695, "step": 610 }, { "epoch": 3.08099549529459, "grad_norm": 22.456689834594727, "learning_rate": 3.906811977087035e-05, "loss": 0.0574, "step": 620 }, { "epoch": 3.10954016323982, "grad_norm": 18.3155460357666, "learning_rate": 3.902837106234278e-05, "loss": 0.0638, "step": 630 }, { "epoch": 3.1380848311850498, "grad_norm": 19.499990463256836, "learning_rate": 3.8987813320832935e-05, "loss": 0.0663, "step": 640 }, { "epoch": 3.1666294991302797, "grad_norm": 18.689781188964844, "learning_rate": 3.894644827074225e-05, "loss": 0.0583, "step": 650 }, { "epoch": 3.1951741670755096, "grad_norm": 19.73504066467285, "learning_rate": 3.890427767079667e-05, "loss": 0.062, "step": 660 }, { "epoch": 3.2237188350207395, "grad_norm": 19.44004249572754, "learning_rate": 3.886130331397186e-05, "loss": 0.0577, "step": 670 }, { "epoch": 3.2522635029659694, "grad_norm": 19.139127731323242, "learning_rate": 3.881752702741697e-05, "loss": 0.0618, "step": 680 }, { "epoch": 3.2808081709111994, "grad_norm": 21.88005828857422, "learning_rate": 3.877295067237697e-05, "loss": 0.059, "step": 690 }, { "epoch": 3.3093528388564293, "grad_norm": 24.21089744567871, "learning_rate": 3.872757614411346e-05, "loss": 0.0593, "step": 700 }, { "epoch": 3.337897506801659, "grad_norm": 20.264284133911133, "learning_rate": 3.868140537182417e-05, "loss": 0.054, "step": 710 }, { "epoch": 3.366442174746889, "grad_norm": 21.731857299804688, "learning_rate": 3.863444031856088e-05, "loss": 0.062, "step": 720 }, { "epoch": 3.394986842692119, "grad_norm": 21.47838592529297, "learning_rate": 3.8586682981145956e-05, "loss": 0.0552, "step": 730 }, { "epoch": 3.423531510637349, "grad_norm": 18.726280212402344, "learning_rate": 3.853813539008746e-05, "loss": 0.0532, "step": 740 }, { "epoch": 3.452076178582579, "grad_norm": 19.791046142578125, "learning_rate": 3.848879960949287e-05, "loss": 0.0558, "step": 750 }, { "epoch": 3.480620846527809, "grad_norm": 18.885759353637695, "learning_rate": 3.8438677736981215e-05, "loss": 0.0553, "step": 760 }, { "epoch": 3.5091655144730387, "grad_norm": 16.527170181274414, "learning_rate": 3.838777190359397e-05, "loss": 0.0476, "step": 770 }, { "epoch": 3.5377101824182686, "grad_norm": 16.75018310546875, "learning_rate": 3.8336084273704457e-05, "loss": 0.0532, "step": 780 }, { "epoch": 3.5662548503634985, "grad_norm": 18.81423568725586, "learning_rate": 3.828361704492575e-05, "loss": 0.0499, "step": 790 }, { "epoch": 3.5947995183087285, "grad_norm": 19.174463272094727, "learning_rate": 3.823037244801729e-05, "loss": 0.0494, "step": 800 }, { "epoch": 3.6233441862539584, "grad_norm": 17.0285701751709, "learning_rate": 3.817635274679006e-05, "loss": 0.0461, "step": 810 }, { "epoch": 3.6518888541991883, "grad_norm": 17.395580291748047, "learning_rate": 3.812156023801028e-05, "loss": 0.0496, "step": 820 }, { "epoch": 3.680433522144418, "grad_norm": 18.277786254882812, "learning_rate": 3.8065997251301776e-05, "loss": 0.0477, "step": 830 }, { "epoch": 3.708978190089648, "grad_norm": 17.72475242614746, "learning_rate": 3.8009666149046957e-05, "loss": 0.0457, "step": 840 }, { "epoch": 3.737522858034878, "grad_norm": 20.809040069580078, "learning_rate": 3.7952569326286336e-05, "loss": 0.0471, "step": 850 }, { "epoch": 3.766067525980108, "grad_norm": 17.868568420410156, "learning_rate": 3.7894709210616714e-05, "loss": 0.0456, "step": 860 }, { "epoch": 3.794612193925338, "grad_norm": 15.575334548950195, "learning_rate": 3.7836088262087975e-05, "loss": 0.044, "step": 870 }, { "epoch": 3.823156861870568, "grad_norm": 17.568668365478516, "learning_rate": 3.7776708973098476e-05, "loss": 0.0446, "step": 880 }, { "epoch": 3.8517015298157977, "grad_norm": 17.17595672607422, "learning_rate": 3.771657386828908e-05, "loss": 0.0496, "step": 890 }, { "epoch": 3.8802461977610276, "grad_norm": 24.375370025634766, "learning_rate": 3.765568550443583e-05, "loss": 0.0424, "step": 900 }, { "epoch": 3.9087908657062576, "grad_norm": 16.25655174255371, "learning_rate": 3.7594046470341246e-05, "loss": 0.046, "step": 910 }, { "epoch": 3.9373355336514875, "grad_norm": 18.85159683227539, "learning_rate": 3.7531659386724195e-05, "loss": 0.0435, "step": 920 }, { "epoch": 3.9658802015967174, "grad_norm": 19.97796058654785, "learning_rate": 3.746852690610855e-05, "loss": 0.0431, "step": 930 }, { "epoch": 3.9944248695419473, "grad_norm": 15.388335227966309, "learning_rate": 3.7404651712710365e-05, "loss": 0.0389, "step": 940 }, { "epoch": 4.019183642211671, "grad_norm": 20.02805519104004, "learning_rate": 3.734003652232376e-05, "loss": 0.039, "step": 950 }, { "epoch": 4.043689692142748, "grad_norm": 15.568504333496094, "learning_rate": 3.727468408220544e-05, "loss": 0.0375, "step": 960 }, { "epoch": 4.068195742073825, "grad_norm": 15.18822956085205, "learning_rate": 3.720859717095792e-05, "loss": 0.0365, "step": 970 }, { "epoch": 4.092701792004902, "grad_norm": 14.499895095825195, "learning_rate": 3.714177859841136e-05, "loss": 0.038, "step": 980 }, { "epoch": 4.117207841935978, "grad_norm": 18.488901138305664, "learning_rate": 3.707423120550411e-05, "loss": 0.0406, "step": 990 }, { "epoch": 4.141713891867055, "grad_norm": 16.12656593322754, "learning_rate": 3.7005957864161905e-05, "loss": 0.0354, "step": 1000 }, { "epoch": 4.166219941798132, "grad_norm": 18.07503318786621, "learning_rate": 3.693696147717579e-05, "loss": 0.0373, "step": 1010 }, { "epoch": 4.190725991729209, "grad_norm": 17.39132308959961, "learning_rate": 3.686724497807867e-05, "loss": 0.0345, "step": 1020 }, { "epoch": 4.215232041660285, "grad_norm": 15.007177352905273, "learning_rate": 3.67968113310206e-05, "loss": 0.0325, "step": 1030 }, { "epoch": 4.239738091591361, "grad_norm": 15.444381713867188, "learning_rate": 3.6725663530642755e-05, "loss": 0.0327, "step": 1040 }, { "epoch": 4.264244141522438, "grad_norm": 14.16204833984375, "learning_rate": 3.6653804601950126e-05, "loss": 0.0338, "step": 1050 }, { "epoch": 4.288750191453515, "grad_norm": 16.405170440673828, "learning_rate": 3.6581237600182856e-05, "loss": 0.0342, "step": 1060 }, { "epoch": 4.313256241384591, "grad_norm": 19.641298294067383, "learning_rate": 3.650796561068639e-05, "loss": 0.0394, "step": 1070 }, { "epoch": 4.337762291315668, "grad_norm": 14.00063705444336, "learning_rate": 3.6433991748780255e-05, "loss": 0.0336, "step": 1080 }, { "epoch": 4.362268341246745, "grad_norm": 13.914216995239258, "learning_rate": 3.635931915962565e-05, "loss": 0.0326, "step": 1090 }, { "epoch": 4.386774391177822, "grad_norm": 15.238022804260254, "learning_rate": 3.628395101809169e-05, "loss": 0.0312, "step": 1100 }, { "epoch": 4.411280441108898, "grad_norm": 15.279886245727539, "learning_rate": 3.62078905286204e-05, "loss": 0.0313, "step": 1110 }, { "epoch": 4.435786491039975, "grad_norm": 15.173819541931152, "learning_rate": 3.613114092509054e-05, "loss": 0.0315, "step": 1120 }, { "epoch": 4.460292540971052, "grad_norm": 15.986420631408691, "learning_rate": 3.6053705470680044e-05, "loss": 0.0333, "step": 1130 }, { "epoch": 4.484798590902129, "grad_norm": 18.724811553955078, "learning_rate": 3.59755874577273e-05, "loss": 0.0322, "step": 1140 }, { "epoch": 4.509304640833205, "grad_norm": 14.428422927856445, "learning_rate": 3.589679020759118e-05, "loss": 0.0278, "step": 1150 }, { "epoch": 4.533810690764282, "grad_norm": 14.249613761901855, "learning_rate": 3.5817317070509814e-05, "loss": 0.0323, "step": 1160 }, { "epoch": 4.558316740695359, "grad_norm": 13.707551002502441, "learning_rate": 3.573717142545814e-05, "loss": 0.0299, "step": 1170 }, { "epoch": 4.582822790626436, "grad_norm": 18.068727493286133, "learning_rate": 3.565635668000427e-05, "loss": 0.0319, "step": 1180 }, { "epoch": 4.607328840557512, "grad_norm": 15.44510269165039, "learning_rate": 3.557487627016458e-05, "loss": 0.0308, "step": 1190 }, { "epoch": 4.631834890488589, "grad_norm": 15.211899757385254, "learning_rate": 3.5492733660257605e-05, "loss": 0.029, "step": 1200 }, { "epoch": 4.656340940419666, "grad_norm": 18.195812225341797, "learning_rate": 3.5409932342756824e-05, "loss": 0.029, "step": 1210 }, { "epoch": 4.680846990350743, "grad_norm": 15.29293155670166, "learning_rate": 3.532647583814205e-05, "loss": 0.0275, "step": 1220 }, { "epoch": 4.705353040281819, "grad_norm": 13.911247253417969, "learning_rate": 3.524236769474987e-05, "loss": 0.0259, "step": 1230 }, { "epoch": 4.729859090212896, "grad_norm": 15.558411598205566, "learning_rate": 3.51576114886227e-05, "loss": 0.0287, "step": 1240 }, { "epoch": 4.754365140143973, "grad_norm": 16.093111038208008, "learning_rate": 3.507221082335676e-05, "loss": 0.0293, "step": 1250 }, { "epoch": 4.77887119007505, "grad_norm": 13.53354549407959, "learning_rate": 3.498616932994888e-05, "loss": 0.0278, "step": 1260 }, { "epoch": 4.803377240006126, "grad_norm": 22.743614196777344, "learning_rate": 3.489949066664211e-05, "loss": 0.034, "step": 1270 }, { "epoch": 4.827883289937203, "grad_norm": 14.596455574035645, "learning_rate": 3.481217851877015e-05, "loss": 0.0292, "step": 1280 }, { "epoch": 4.85238933986828, "grad_norm": 17.450109481811523, "learning_rate": 3.4724236598600725e-05, "loss": 0.0301, "step": 1290 }, { "epoch": 4.8768953897993566, "grad_norm": 15.233014106750488, "learning_rate": 3.4635668645177674e-05, "loss": 0.0292, "step": 1300 }, { "epoch": 4.901401439730433, "grad_norm": 15.098063468933105, "learning_rate": 3.454647842416204e-05, "loss": 0.0276, "step": 1310 }, { "epoch": 4.92590748966151, "grad_norm": 16.780668258666992, "learning_rate": 3.4456669727671944e-05, "loss": 0.027, "step": 1320 }, { "epoch": 4.950413539592587, "grad_norm": 16.340227127075195, "learning_rate": 3.436624637412132e-05, "loss": 0.0309, "step": 1330 }, { "epoch": 4.9749195895236635, "grad_norm": 12.311773300170898, "learning_rate": 3.427521220805763e-05, "loss": 0.0257, "step": 1340 }, { "epoch": 4.99942563945474, "grad_norm": 15.815475463867188, "learning_rate": 3.4183571099998355e-05, "loss": 0.0261, "step": 1350 }, { "epoch": 5.02021921165498, "grad_norm": 12.780634880065918, "learning_rate": 3.409132694626643e-05, "loss": 0.0281, "step": 1360 }, { "epoch": 5.042004220845531, "grad_norm": 14.720085144042969, "learning_rate": 3.3998483668824645e-05, "loss": 0.0236, "step": 1370 }, { "epoch": 5.063789230036082, "grad_norm": 16.020496368408203, "learning_rate": 3.390504521510882e-05, "loss": 0.0241, "step": 1380 }, { "epoch": 5.0855742392266325, "grad_norm": 13.678121566772461, "learning_rate": 3.381101555785999e-05, "loss": 0.0232, "step": 1390 }, { "epoch": 5.107359248417183, "grad_norm": 13.695241928100586, "learning_rate": 3.371639869495554e-05, "loss": 0.0237, "step": 1400 }, { "epoch": 5.1291442576077335, "grad_norm": 11.553495407104492, "learning_rate": 3.362119864923918e-05, "loss": 0.0237, "step": 1410 }, { "epoch": 5.1509292667982844, "grad_norm": 12.397970199584961, "learning_rate": 3.35254194683499e-05, "loss": 0.0236, "step": 1420 }, { "epoch": 5.172714275988835, "grad_norm": 14.61323356628418, "learning_rate": 3.342906522454992e-05, "loss": 0.0239, "step": 1430 }, { "epoch": 5.194499285179386, "grad_norm": 11.521512031555176, "learning_rate": 3.333214001455149e-05, "loss": 0.0191, "step": 1440 }, { "epoch": 5.216284294369936, "grad_norm": 15.835281372070312, "learning_rate": 3.323464795934279e-05, "loss": 0.0253, "step": 1450 }, { "epoch": 5.238069303560487, "grad_norm": 14.1622953414917, "learning_rate": 3.313659320401263e-05, "loss": 0.0243, "step": 1460 }, { "epoch": 5.259854312751038, "grad_norm": 12.993020057678223, "learning_rate": 3.303797991757425e-05, "loss": 0.0211, "step": 1470 }, { "epoch": 5.281639321941589, "grad_norm": 13.310782432556152, "learning_rate": 3.29388122927881e-05, "loss": 0.0278, "step": 1480 }, { "epoch": 5.30342433113214, "grad_norm": 17.85926628112793, "learning_rate": 3.2839094545983505e-05, "loss": 0.0212, "step": 1490 }, { "epoch": 5.32520934032269, "grad_norm": 12.155655860900879, "learning_rate": 3.273883091687946e-05, "loss": 0.0224, "step": 1500 }, { "epoch": 5.346994349513241, "grad_norm": 10.895421981811523, "learning_rate": 3.2638025668404334e-05, "loss": 0.0241, "step": 1510 }, { "epoch": 5.368779358703792, "grad_norm": 12.233269691467285, "learning_rate": 3.2536683086514634e-05, "loss": 0.0206, "step": 1520 }, { "epoch": 5.390564367894343, "grad_norm": 12.179084777832031, "learning_rate": 3.243480748001278e-05, "loss": 0.0241, "step": 1530 }, { "epoch": 5.412349377084894, "grad_norm": 13.7705078125, "learning_rate": 3.2332403180363906e-05, "loss": 0.0253, "step": 1540 }, { "epoch": 5.434134386275444, "grad_norm": 10.06460952758789, "learning_rate": 3.222947454151169e-05, "loss": 0.0249, "step": 1550 }, { "epoch": 5.455919395465995, "grad_norm": 16.193252563476562, "learning_rate": 3.212602593969325e-05, "loss": 0.0245, "step": 1560 }, { "epoch": 5.477704404656546, "grad_norm": 11.988511085510254, "learning_rate": 3.202206177325306e-05, "loss": 0.0238, "step": 1570 }, { "epoch": 5.499489413847097, "grad_norm": 11.607162475585938, "learning_rate": 3.191758646245596e-05, "loss": 0.0226, "step": 1580 }, { "epoch": 5.521274423037648, "grad_norm": 12.626535415649414, "learning_rate": 3.181260444929923e-05, "loss": 0.0204, "step": 1590 }, { "epoch": 5.543059432228198, "grad_norm": 12.591373443603516, "learning_rate": 3.1707120197323686e-05, "loss": 0.0207, "step": 1600 }, { "epoch": 5.564844441418749, "grad_norm": 12.233884811401367, "learning_rate": 3.1601138191423966e-05, "loss": 0.0223, "step": 1610 }, { "epoch": 5.5866294506092995, "grad_norm": 13.553182601928711, "learning_rate": 3.149466293765778e-05, "loss": 0.021, "step": 1620 }, { "epoch": 5.60841445979985, "grad_norm": 13.30004596710205, "learning_rate": 3.138769896305434e-05, "loss": 0.0188, "step": 1630 }, { "epoch": 5.630199468990401, "grad_norm": 13.62940502166748, "learning_rate": 3.128025081542196e-05, "loss": 0.0176, "step": 1640 }, { "epoch": 5.651984478180951, "grad_norm": 12.465331077575684, "learning_rate": 3.117232306315456e-05, "loss": 0.0195, "step": 1650 }, { "epoch": 5.673769487371502, "grad_norm": 12.430222511291504, "learning_rate": 3.106392029503757e-05, "loss": 0.0216, "step": 1660 }, { "epoch": 5.695554496562053, "grad_norm": 12.926973342895508, "learning_rate": 3.09550471200527e-05, "loss": 0.0192, "step": 1670 }, { "epoch": 5.717339505752604, "grad_norm": 13.914267539978027, "learning_rate": 3.08457081671821e-05, "loss": 0.021, "step": 1680 }, { "epoch": 5.739124514943155, "grad_norm": 13.50471019744873, "learning_rate": 3.073590808521144e-05, "loss": 0.0218, "step": 1690 }, { "epoch": 5.760909524133705, "grad_norm": 10.271846771240234, "learning_rate": 3.062565154253233e-05, "loss": 0.0202, "step": 1700 }, { "epoch": 5.782694533324256, "grad_norm": 14.907472610473633, "learning_rate": 3.0514943226943816e-05, "loss": 0.0236, "step": 1710 }, { "epoch": 5.804479542514807, "grad_norm": 13.36329174041748, "learning_rate": 3.040378784545304e-05, "loss": 0.021, "step": 1720 }, { "epoch": 5.826264551705358, "grad_norm": 11.128992080688477, "learning_rate": 3.0292190124075162e-05, "loss": 0.0176, "step": 1730 }, { "epoch": 5.848049560895909, "grad_norm": 11.310523986816406, "learning_rate": 3.018015480763236e-05, "loss": 0.0207, "step": 1740 }, { "epoch": 5.869834570086459, "grad_norm": 11.527318000793457, "learning_rate": 3.006768665955215e-05, "loss": 0.0187, "step": 1750 }, { "epoch": 5.89161957927701, "grad_norm": 11.697135925292969, "learning_rate": 2.9954790461664834e-05, "loss": 0.0202, "step": 1760 }, { "epoch": 5.913404588467561, "grad_norm": 10.337966918945312, "learning_rate": 2.984147101400018e-05, "loss": 0.0168, "step": 1770 }, { "epoch": 5.935189597658112, "grad_norm": 10.729581832885742, "learning_rate": 2.9727733134583358e-05, "loss": 0.021, "step": 1780 }, { "epoch": 5.956974606848663, "grad_norm": 11.193035125732422, "learning_rate": 2.961358165923008e-05, "loss": 0.0203, "step": 1790 }, { "epoch": 5.978759616039213, "grad_norm": 11.645442962646484, "learning_rate": 2.9499021441341012e-05, "loss": 0.0182, "step": 1800 }, { "epoch": 6.000544625229764, "grad_norm": 11.237954139709473, "learning_rate": 2.938405735169537e-05, "loss": 0.0184, "step": 1810 }, { "epoch": 6.019290314590042, "grad_norm": 10.546935081481934, "learning_rate": 2.9268694278243903e-05, "loss": 0.0179, "step": 1820 }, { "epoch": 6.039107010156057, "grad_norm": 9.956415176391602, "learning_rate": 2.915293712590102e-05, "loss": 0.0196, "step": 1830 }, { "epoch": 6.058923705722071, "grad_norm": 9.116511344909668, "learning_rate": 2.9036790816336252e-05, "loss": 0.0199, "step": 1840 }, { "epoch": 6.078740401288085, "grad_norm": 16.642379760742188, "learning_rate": 2.892026028776501e-05, "loss": 0.0173, "step": 1850 }, { "epoch": 6.098557096854099, "grad_norm": 11.179176330566406, "learning_rate": 2.8803350494738615e-05, "loss": 0.019, "step": 1860 }, { "epoch": 6.118373792420114, "grad_norm": 13.457623481750488, "learning_rate": 2.8686066407933656e-05, "loss": 0.0164, "step": 1870 }, { "epoch": 6.138190487986129, "grad_norm": 11.937878608703613, "learning_rate": 2.8568413013940642e-05, "loss": 0.019, "step": 1880 }, { "epoch": 6.158007183552143, "grad_norm": 14.586573600769043, "learning_rate": 2.845039531505199e-05, "loss": 0.0187, "step": 1890 }, { "epoch": 6.177823879118157, "grad_norm": 10.834576606750488, "learning_rate": 2.833201832904933e-05, "loss": 0.0205, "step": 1900 }, { "epoch": 6.197640574684171, "grad_norm": 10.595796585083008, "learning_rate": 2.8213287088990184e-05, "loss": 0.0194, "step": 1910 }, { "epoch": 6.217457270250185, "grad_norm": 12.627229690551758, "learning_rate": 2.8094206642993955e-05, "loss": 0.0145, "step": 1920 }, { "epoch": 3.1077549379303413, "grad_norm": 10.363633155822754, "learning_rate": 2.7974782054027308e-05, "loss": 0.0179, "step": 1930 }, { "epoch": 3.1238377644871087, "grad_norm": 14.35067081451416, "learning_rate": 2.7855018399688908e-05, "loss": 0.0184, "step": 1940 }, { "epoch": 3.139920591043876, "grad_norm": 10.560155868530273, "learning_rate": 2.773492077199351e-05, "loss": 0.0173, "step": 1950 }, { "epoch": 3.1560034176006435, "grad_norm": 10.620994567871094, "learning_rate": 2.76144942771555e-05, "loss": 0.0155, "step": 1960 }, { "epoch": 3.1720862441574105, "grad_norm": 9.053291320800781, "learning_rate": 2.749374403537177e-05, "loss": 0.0145, "step": 1970 }, { "epoch": 3.188169070714178, "grad_norm": 12.468178749084473, "learning_rate": 2.7372675180603994e-05, "loss": 0.0183, "step": 1980 }, { "epoch": 3.2042518972709453, "grad_norm": 8.465781211853027, "learning_rate": 2.7251292860360424e-05, "loss": 0.0164, "step": 1990 }, { "epoch": 3.2203347238277127, "grad_norm": 10.253599166870117, "learning_rate": 2.712960223547696e-05, "loss": 0.015, "step": 2000 }, { "epoch": 3.23641755038448, "grad_norm": 9.734599113464355, "learning_rate": 2.700760847989775e-05, "loss": 0.0144, "step": 2010 }, { "epoch": 3.2525003769412475, "grad_norm": 12.44884967803955, "learning_rate": 2.6885316780455208e-05, "loss": 0.0129, "step": 2020 }, { "epoch": 3.268583203498015, "grad_norm": 10.425430297851562, "learning_rate": 2.6762732336649492e-05, "loss": 0.0185, "step": 2030 }, { "epoch": 3.2846660300547823, "grad_norm": 10.850104331970215, "learning_rate": 2.6639860360427426e-05, "loss": 0.0143, "step": 2040 }, { "epoch": 3.3007488566115493, "grad_norm": 9.267366409301758, "learning_rate": 2.651670607596092e-05, "loss": 0.0146, "step": 2050 }, { "epoch": 3.3168316831683167, "grad_norm": 9.598543167114258, "learning_rate": 2.6393274719424814e-05, "loss": 0.0157, "step": 2060 }, { "epoch": 3.332914509725084, "grad_norm": 9.140937805175781, "learning_rate": 2.6269571538774294e-05, "loss": 0.0172, "step": 2070 }, { "epoch": 3.3489973362818515, "grad_norm": 10.654680252075195, "learning_rate": 2.6145601793521734e-05, "loss": 0.0162, "step": 2080 }, { "epoch": 3.365080162838619, "grad_norm": 10.139638900756836, "learning_rate": 2.6021370754513096e-05, "loss": 0.0168, "step": 2090 }, { "epoch": 3.3811629893953863, "grad_norm": 9.781733512878418, "learning_rate": 2.589688370370382e-05, "loss": 0.0165, "step": 2100 }, { "epoch": 3.3972458159521537, "grad_norm": 10.93750286102295, "learning_rate": 2.5772145933934235e-05, "loss": 0.0145, "step": 2110 }, { "epoch": 3.413328642508921, "grad_norm": 11.465789794921875, "learning_rate": 2.5647162748704562e-05, "loss": 0.0135, "step": 2120 }, { "epoch": 3.4294114690656885, "grad_norm": 9.151410102844238, "learning_rate": 2.5521939461949384e-05, "loss": 0.0163, "step": 2130 }, { "epoch": 3.4454942956224555, "grad_norm": 8.722734451293945, "learning_rate": 2.5396481397811715e-05, "loss": 0.0171, "step": 2140 }, { "epoch": 3.461577122179223, "grad_norm": 9.99445629119873, "learning_rate": 2.5270793890416677e-05, "loss": 0.0146, "step": 2150 }, { "epoch": 3.4776599487359903, "grad_norm": 11.865700721740723, "learning_rate": 2.5144882283644644e-05, "loss": 0.0172, "step": 2160 }, { "epoch": 3.4937427752927577, "grad_norm": 14.123621940612793, "learning_rate": 2.50187519309041e-05, "loss": 0.0146, "step": 2170 }, { "epoch": 3.509825601849525, "grad_norm": 10.353002548217773, "learning_rate": 2.4892408194903963e-05, "loss": 0.0155, "step": 2180 }, { "epoch": 3.5259084284062925, "grad_norm": 10.808701515197754, "learning_rate": 2.4765856447425614e-05, "loss": 0.0133, "step": 2190 }, { "epoch": 3.54199125496306, "grad_norm": 8.521575927734375, "learning_rate": 2.4639102069094522e-05, "loss": 0.0125, "step": 2200 }, { "epoch": 3.558074081519827, "grad_norm": 7.443869113922119, "learning_rate": 2.4512150449151433e-05, "loss": 0.0143, "step": 2210 }, { "epoch": 3.5741569080765947, "grad_norm": 10.696161270141602, "learning_rate": 2.438500698522325e-05, "loss": 0.0176, "step": 2220 }, { "epoch": 3.5902397346333617, "grad_norm": 19.051715850830078, "learning_rate": 2.4257677083093553e-05, "loss": 0.0167, "step": 2230 }, { "epoch": 3.606322561190129, "grad_norm": 13.287577629089355, "learning_rate": 2.413016615647275e-05, "loss": 0.0173, "step": 2240 }, { "epoch": 3.6224053877468965, "grad_norm": 10.227944374084473, "learning_rate": 2.4002479626767903e-05, "loss": 0.0153, "step": 2250 }, { "epoch": 3.638488214303664, "grad_norm": 10.98816204071045, "learning_rate": 2.3874622922852225e-05, "loss": 0.0136, "step": 2260 }, { "epoch": 3.6545710408604313, "grad_norm": 9.841387748718262, "learning_rate": 2.3746601480834258e-05, "loss": 0.0164, "step": 2270 }, { "epoch": 3.6706538674171987, "grad_norm": 9.20659351348877, "learning_rate": 2.361842074382674e-05, "loss": 0.0133, "step": 2280 }, { "epoch": 3.686736693973966, "grad_norm": 8.02350902557373, "learning_rate": 2.3490086161715197e-05, "loss": 0.0113, "step": 2290 }, { "epoch": 3.702819520530733, "grad_norm": 7.091673851013184, "learning_rate": 2.336160319092621e-05, "loss": 0.0127, "step": 2300 }, { "epoch": 3.718902347087501, "grad_norm": 9.675036430358887, "learning_rate": 2.3232977294195437e-05, "loss": 0.0195, "step": 2310 }, { "epoch": 3.734985173644268, "grad_norm": 8.969931602478027, "learning_rate": 2.3104213940335338e-05, "loss": 0.0118, "step": 2320 }, { "epoch": 3.7510680002010353, "grad_norm": 11.032634735107422, "learning_rate": 2.2975318604002667e-05, "loss": 0.0148, "step": 2330 }, { "epoch": 3.7671508267578027, "grad_norm": 8.483484268188477, "learning_rate": 2.2846296765465708e-05, "loss": 0.013, "step": 2340 }, { "epoch": 3.78323365331457, "grad_norm": 12.28922176361084, "learning_rate": 2.271715391037126e-05, "loss": 0.0123, "step": 2350 }, { "epoch": 3.7993164798713375, "grad_norm": 11.338895797729492, "learning_rate": 2.2587895529511396e-05, "loss": 0.0134, "step": 2360 }, { "epoch": 3.815399306428105, "grad_norm": 12.580510139465332, "learning_rate": 2.245852711859004e-05, "loss": 0.0132, "step": 2370 }, { "epoch": 3.8314821329848723, "grad_norm": 9.075506210327148, "learning_rate": 2.232905417798929e-05, "loss": 0.0148, "step": 2380 }, { "epoch": 3.8475649595416392, "grad_norm": 8.931917190551758, "learning_rate": 2.2199482212535522e-05, "loss": 0.0128, "step": 2390 }, { "epoch": 3.8636477860984066, "grad_norm": 12.407690048217773, "learning_rate": 2.206981673126539e-05, "loss": 0.0168, "step": 2400 }, { "epoch": 3.879730612655174, "grad_norm": 10.473590850830078, "learning_rate": 2.1940063247191582e-05, "loss": 0.0128, "step": 2410 }, { "epoch": 4.002587467656654, "grad_norm": 7.4311203956604, "learning_rate": 2.181022727706842e-05, "loss": 0.0122, "step": 2420 }, { "epoch": 4.020587242659467, "grad_norm": 7.310738563537598, "learning_rate": 2.168031434115729e-05, "loss": 0.0067, "step": 2430 }, { "epoch": 4.038587017662279, "grad_norm": 13.81872272491455, "learning_rate": 2.1550329962991946e-05, "loss": 0.008, "step": 2440 }, { "epoch": 4.056586792665092, "grad_norm": 7.483142852783203, "learning_rate": 2.142027966914368e-05, "loss": 0.0072, "step": 2450 }, { "epoch": 4.074586567667904, "grad_norm": 7.784074783325195, "learning_rate": 2.1290168988986332e-05, "loss": 0.0073, "step": 2460 }, { "epoch": 4.0925863426707165, "grad_norm": 7.706643581390381, "learning_rate": 2.116000345446118e-05, "loss": 0.0074, "step": 2470 }, { "epoch": 4.110586117673529, "grad_norm": 8.789175987243652, "learning_rate": 2.1029788599841784e-05, "loss": 0.0077, "step": 2480 }, { "epoch": 4.128585892676342, "grad_norm": 6.578799724578857, "learning_rate": 2.0899529961498633e-05, "loss": 0.0074, "step": 2490 }, { "epoch": 4.146585667679154, "grad_norm": 6.066508769989014, "learning_rate": 2.076923307766379e-05, "loss": 0.0071, "step": 2500 }, { "epoch": 4.164585442681966, "grad_norm": 8.597545623779297, "learning_rate": 2.0638903488195406e-05, "loss": 0.0074, "step": 2510 }, { "epoch": 4.182585217684779, "grad_norm": 9.465729713439941, "learning_rate": 2.050854673434217e-05, "loss": 0.0077, "step": 2520 }, { "epoch": 4.200584992687592, "grad_norm": 9.023458480834961, "learning_rate": 2.037816835850776e-05, "loss": 0.0076, "step": 2530 }, { "epoch": 4.218584767690404, "grad_norm": 7.603120803833008, "learning_rate": 2.024777390401512e-05, "loss": 0.0076, "step": 2540 }, { "epoch": 4.236584542693216, "grad_norm": 7.11006498336792, "learning_rate": 2.0117368914870838e-05, "loss": 0.0079, "step": 2550 }, { "epoch": 4.254584317696029, "grad_norm": 9.57967758178711, "learning_rate": 1.9986958935529393e-05, "loss": 0.0082, "step": 2560 }, { "epoch": 4.272584092698842, "grad_norm": 10.809629440307617, "learning_rate": 1.9856549510657447e-05, "loss": 0.0086, "step": 2570 }, { "epoch": 4.2905838677016535, "grad_norm": 9.166589736938477, "learning_rate": 1.9726146184898066e-05, "loss": 0.0075, "step": 2580 }, { "epoch": 4.308583642704466, "grad_norm": 7.303510665893555, "learning_rate": 1.959575450263503e-05, "loss": 0.0076, "step": 2590 }, { "epoch": 4.326583417707279, "grad_norm": 6.71479606628418, "learning_rate": 1.9465380007757043e-05, "loss": 0.0076, "step": 2600 }, { "epoch": 4.3445831927100915, "grad_norm": 6.198269367218018, "learning_rate": 1.933502824342205e-05, "loss": 0.0071, "step": 2610 }, { "epoch": 4.362582967712903, "grad_norm": 6.006600856781006, "learning_rate": 1.9204704751821586e-05, "loss": 0.0072, "step": 2620 }, { "epoch": 4.380582742715716, "grad_norm": 6.971927165985107, "learning_rate": 1.907441507394507e-05, "loss": 0.0076, "step": 2630 }, { "epoch": 4.398582517718529, "grad_norm": 8.39477825164795, "learning_rate": 1.894416474934429e-05, "loss": 0.0075, "step": 2640 }, { "epoch": 4.416582292721341, "grad_norm": 7.6670355796813965, "learning_rate": 1.8813959315897815e-05, "loss": 0.0083, "step": 2650 }, { "epoch": 4.434582067724153, "grad_norm": 7.985522747039795, "learning_rate": 1.8683804309575587e-05, "loss": 0.0075, "step": 2660 }, { "epoch": 4.452581842726966, "grad_norm": 7.552544593811035, "learning_rate": 1.855370526420352e-05, "loss": 0.0073, "step": 2670 }, { "epoch": 4.470581617729779, "grad_norm": 7.256811618804932, "learning_rate": 1.842366771122823e-05, "loss": 0.0066, "step": 2680 }, { "epoch": 4.488581392732591, "grad_norm": 7.66050386428833, "learning_rate": 1.829369717948185e-05, "loss": 0.0078, "step": 2690 }, { "epoch": 4.506581167735403, "grad_norm": 6.683782577514648, "learning_rate": 1.8163799194946938e-05, "loss": 0.0079, "step": 2700 }, { "epoch": 4.524580942738216, "grad_norm": 6.800795078277588, "learning_rate": 1.8033979280521584e-05, "loss": 0.0069, "step": 2710 }, { "epoch": 4.5425807177410285, "grad_norm": 8.025465965270996, "learning_rate": 1.790424295578453e-05, "loss": 0.0069, "step": 2720 }, { "epoch": 4.56058049274384, "grad_norm": 10.645038604736328, "learning_rate": 1.777459573676051e-05, "loss": 0.0076, "step": 2730 }, { "epoch": 4.578580267746653, "grad_norm": 9.160017013549805, "learning_rate": 1.764504313568577e-05, "loss": 0.0068, "step": 2740 }, { "epoch": 4.596580042749466, "grad_norm": 8.349514961242676, "learning_rate": 1.7515590660773633e-05, "loss": 0.0076, "step": 2750 }, { "epoch": 4.614579817752278, "grad_norm": 5.219119071960449, "learning_rate": 1.7386243815980354e-05, "loss": 0.0073, "step": 2760 }, { "epoch": 4.632579592755091, "grad_norm": 7.130075931549072, "learning_rate": 1.7257008100771072e-05, "loss": 0.007, "step": 2770 }, { "epoch": 4.650579367757903, "grad_norm": 6.3263115882873535, "learning_rate": 1.7127889009886036e-05, "loss": 0.0067, "step": 2780 }, { "epoch": 4.6685791427607155, "grad_norm": 6.6792778968811035, "learning_rate": 1.699889203310695e-05, "loss": 0.0075, "step": 2790 }, { "epoch": 4.686578917763528, "grad_norm": 6.040131092071533, "learning_rate": 1.6870022655023544e-05, "loss": 0.0072, "step": 2800 }, { "epoch": 4.70457869276634, "grad_norm": 6.8368306159973145, "learning_rate": 1.674128635480044e-05, "loss": 0.0071, "step": 2810 }, { "epoch": 4.722578467769153, "grad_norm": 8.718803405761719, "learning_rate": 1.6612688605944133e-05, "loss": 0.0074, "step": 2820 }, { "epoch": 4.740578242771965, "grad_norm": 8.861642837524414, "learning_rate": 1.6484234876070335e-05, "loss": 0.0063, "step": 2830 }, { "epoch": 4.758578017774778, "grad_norm": 6.4469475746154785, "learning_rate": 1.6355930626671447e-05, "loss": 0.007, "step": 2840 }, { "epoch": 4.776577792777591, "grad_norm": 9.30246639251709, "learning_rate": 1.6227781312884388e-05, "loss": 0.0073, "step": 2850 }, { "epoch": 4.794577567780403, "grad_norm": 8.216259002685547, "learning_rate": 1.6099792383258664e-05, "loss": 0.0071, "step": 2860 }, { "epoch": 4.812577342783215, "grad_norm": 10.296393394470215, "learning_rate": 1.5971969279524668e-05, "loss": 0.0075, "step": 2870 }, { "epoch": 5.01673208014873, "grad_norm": 6.276814937591553, "learning_rate": 1.584431743636237e-05, "loss": 0.0059, "step": 2880 }, { "epoch": 5.038766095159402, "grad_norm": 6.1525068283081055, "learning_rate": 1.5716842281170205e-05, "loss": 0.0059, "step": 2890 }, { "epoch": 5.060800110170075, "grad_norm": 5.246829032897949, "learning_rate": 1.558954923383432e-05, "loss": 0.0057, "step": 2900 }, { "epoch": 5.082834125180748, "grad_norm": 8.341444969177246, "learning_rate": 1.5462443706498178e-05, "loss": 0.0061, "step": 2910 }, { "epoch": 5.104868140191421, "grad_norm": 8.144927024841309, "learning_rate": 1.533553110333239e-05, "loss": 0.0058, "step": 2920 }, { "epoch": 5.126902155202093, "grad_norm": 4.849141597747803, "learning_rate": 1.5208816820304973e-05, "loss": 0.0055, "step": 2930 }, { "epoch": 5.148936170212766, "grad_norm": 5.376830577850342, "learning_rate": 1.5082306244951956e-05, "loss": 0.0052, "step": 2940 }, { "epoch": 5.170970185223439, "grad_norm": 5.531591892242432, "learning_rate": 1.495600475614825e-05, "loss": 0.0059, "step": 2950 }, { "epoch": 5.193004200234111, "grad_norm": 4.80387020111084, "learning_rate": 1.4829917723879029e-05, "loss": 0.0056, "step": 2960 }, { "epoch": 5.2150382152447845, "grad_norm": 6.294495582580566, "learning_rate": 1.4704050509011345e-05, "loss": 0.0056, "step": 2970 }, { "epoch": 5.237072230255457, "grad_norm": 5.95064640045166, "learning_rate": 1.4578408463066246e-05, "loss": 0.0058, "step": 2980 }, { "epoch": 5.259106245266129, "grad_norm": 6.749286651611328, "learning_rate": 1.4452996927991236e-05, "loss": 0.006, "step": 2990 }, { "epoch": 5.2811402602768025, "grad_norm": 6.431356906890869, "learning_rate": 1.4327821235933126e-05, "loss": 0.0062, "step": 3000 }, { "epoch": 5.303174275287475, "grad_norm": 5.065823554992676, "learning_rate": 1.4202886709011357e-05, "loss": 0.005, "step": 3010 }, { "epoch": 5.325208290298148, "grad_norm": 4.399689674377441, "learning_rate": 1.4078198659091686e-05, "loss": 0.006, "step": 3020 }, { "epoch": 5.3472423053088205, "grad_norm": 4.889394283294678, "learning_rate": 1.3953762387560392e-05, "loss": 0.0054, "step": 3030 }, { "epoch": 5.369276320319493, "grad_norm": 6.588573455810547, "learning_rate": 1.3829583185098802e-05, "loss": 0.0056, "step": 3040 }, { "epoch": 5.391310335330166, "grad_norm": 4.880826473236084, "learning_rate": 1.3705666331458424e-05, "loss": 0.0052, "step": 3050 }, { "epoch": 5.4133443503408385, "grad_norm": 5.972387313842773, "learning_rate": 1.3582017095236413e-05, "loss": 0.0052, "step": 3060 }, { "epoch": 5.435378365351512, "grad_norm": 5.3322224617004395, "learning_rate": 1.345864073365157e-05, "loss": 0.0054, "step": 3070 }, { "epoch": 5.457412380362184, "grad_norm": 4.680153846740723, "learning_rate": 1.3335542492320856e-05, "loss": 0.0059, "step": 3080 }, { "epoch": 5.4794463953728565, "grad_norm": 5.0644636154174805, "learning_rate": 1.3212727605036319e-05, "loss": 0.0055, "step": 3090 }, { "epoch": 5.50148041038353, "grad_norm": 6.729560375213623, "learning_rate": 1.3090201293542597e-05, "loss": 0.0061, "step": 3100 }, { "epoch": 5.523514425394202, "grad_norm": 6.099545001983643, "learning_rate": 1.2967968767314898e-05, "loss": 0.0063, "step": 3110 }, { "epoch": 5.545548440404875, "grad_norm": 4.657865524291992, "learning_rate": 1.284603522333749e-05, "loss": 0.0052, "step": 3120 }, { "epoch": 5.567582455415548, "grad_norm": 5.916351795196533, "learning_rate": 1.2724405845882775e-05, "loss": 0.0056, "step": 3130 }, { "epoch": 5.58961647042622, "grad_norm": 6.424000263214111, "learning_rate": 1.2603085806290824e-05, "loss": 0.0065, "step": 3140 }, { "epoch": 5.611650485436893, "grad_norm": 7.819843769073486, "learning_rate": 1.2482080262749538e-05, "loss": 0.0057, "step": 3150 }, { "epoch": 5.633684500447566, "grad_norm": 6.704712867736816, "learning_rate": 1.2361394360075348e-05, "loss": 0.0052, "step": 3160 }, { "epoch": 5.655718515458239, "grad_norm": 5.237440586090088, "learning_rate": 1.224103322949442e-05, "loss": 0.0052, "step": 3170 }, { "epoch": 5.677752530468911, "grad_norm": 6.460971355438232, "learning_rate": 1.2121001988424541e-05, "loss": 0.0057, "step": 3180 }, { "epoch": 5.699786545479584, "grad_norm": 5.491466522216797, "learning_rate": 1.2001305740257505e-05, "loss": 0.0051, "step": 3190 }, { "epoch": 5.721820560490257, "grad_norm": 5.925656318664551, "learning_rate": 1.188194957414217e-05, "loss": 0.0054, "step": 3200 }, { "epoch": 5.743854575500929, "grad_norm": 6.257553577423096, "learning_rate": 1.176293856476804e-05, "loss": 0.0053, "step": 3210 }, { "epoch": 5.765888590511603, "grad_norm": 4.400306224822998, "learning_rate": 1.1644277772149531e-05, "loss": 0.0051, "step": 3220 }, { "epoch": 5.787922605522275, "grad_norm": 6.251142978668213, "learning_rate": 1.1525972241410827e-05, "loss": 0.0052, "step": 3230 }, { "epoch": 5.809956620532947, "grad_norm": 4.689172267913818, "learning_rate": 1.1408027002571359e-05, "loss": 0.0057, "step": 3240 }, { "epoch": 5.831990635543621, "grad_norm": 6.150318145751953, "learning_rate": 1.1290447070331958e-05, "loss": 0.0053, "step": 3250 }, { "epoch": 5.854024650554293, "grad_norm": 5.218411445617676, "learning_rate": 1.1173237443861678e-05, "loss": 0.0057, "step": 3260 }, { "epoch": 5.876058665564966, "grad_norm": 4.724580764770508, "learning_rate": 1.1056403106585156e-05, "loss": 0.005, "step": 3270 }, { "epoch": 5.898092680575639, "grad_norm": 3.99684476852417, "learning_rate": 1.093994902597082e-05, "loss": 0.0053, "step": 3280 }, { "epoch": 5.920126695586311, "grad_norm": 5.558387756347656, "learning_rate": 1.0823880153319642e-05, "loss": 0.0051, "step": 3290 }, { "epoch": 5.942160710596984, "grad_norm": 5.6572957038879395, "learning_rate": 1.0708201423554634e-05, "loss": 0.0055, "step": 3300 }, { "epoch": 5.964194725607657, "grad_norm": 3.7635843753814697, "learning_rate": 1.059291775501102e-05, "loss": 0.0056, "step": 3310 }, { "epoch": 5.98622874061833, "grad_norm": 5.005063056945801, "learning_rate": 1.0478034049227137e-05, "loss": 0.0054, "step": 3320 }, { "epoch": 6.008149523300085, "grad_norm": 12.51547908782959, "learning_rate": 1.036355519073602e-05, "loss": 0.0065, "step": 3330 }, { "epoch": 6.028287341493345, "grad_norm": 3.451265335083008, "learning_rate": 1.0249486046857735e-05, "loss": 0.0051, "step": 3340 }, { "epoch": 6.048425159686605, "grad_norm": 7.321857929229736, "learning_rate": 1.0135831467492432e-05, "loss": 0.0059, "step": 3350 }, { "epoch": 6.068562977879865, "grad_norm": 4.42759895324707, "learning_rate": 1.0022596284914138e-05, "loss": 0.006, "step": 3360 }, { "epoch": 6.088700796073126, "grad_norm": 3.1307969093322754, "learning_rate": 9.90978531356531e-06, "loss": 0.005, "step": 3370 }, { "epoch": 6.108838614266386, "grad_norm": 5.168570518493652, "learning_rate": 9.797403349852126e-06, "loss": 0.0044, "step": 3380 }, { "epoch": 6.128976432459646, "grad_norm": 6.52720832824707, "learning_rate": 9.685455171940567e-06, "loss": 0.005, "step": 3390 }, { "epoch": 6.1491142506529055, "grad_norm": 4.172718048095703, "learning_rate": 9.573945539553258e-06, "loss": 0.0044, "step": 3400 }, { "epoch": 6.169252068846166, "grad_norm": 8.326397895812988, "learning_rate": 9.462879193767092e-06, "loss": 0.0053, "step": 3410 }, { "epoch": 6.189389887039426, "grad_norm": 4.124663352966309, "learning_rate": 9.352260856811667e-06, "loss": 0.0058, "step": 3420 }, { "epoch": 6.209527705232686, "grad_norm": 4.169667720794678, "learning_rate": 9.2420952318685e-06, "loss": 0.0049, "step": 3430 }, { "epoch": 6.229665523425946, "grad_norm": 5.089596271514893, "learning_rate": 9.132387002871057e-06, "loss": 0.0044, "step": 3440 }, { "epoch": 6.249803341619207, "grad_norm": 6.561634540557861, "learning_rate": 9.023140834305621e-06, "loss": 0.0051, "step": 3450 }, { "epoch": 6.269941159812467, "grad_norm": 3.9571590423583984, "learning_rate": 8.914361371012939e-06, "loss": 0.0045, "step": 3460 }, { "epoch": 6.2900789780057265, "grad_norm": 5.203815460205078, "learning_rate": 8.806053237990788e-06, "loss": 0.0065, "step": 3470 }, { "epoch": 6.310216796198986, "grad_norm": 4.430067539215088, "learning_rate": 8.698221040197288e-06, "loss": 0.0047, "step": 3480 }, { "epoch": 6.330354614392247, "grad_norm": 6.157893180847168, "learning_rate": 8.590869362355128e-06, "loss": 0.0063, "step": 3490 }, { "epoch": 6.350492432585507, "grad_norm": 4.458155155181885, "learning_rate": 8.484002768756643e-06, "loss": 0.0048, "step": 3500 }, { "epoch": 6.370630250778767, "grad_norm": 3.2868919372558594, "learning_rate": 8.37762580306972e-06, "loss": 0.0042, "step": 3510 }, { "epoch": 6.390768068972028, "grad_norm": 4.93739652633667, "learning_rate": 8.271742988144688e-06, "loss": 0.0051, "step": 3520 }, { "epoch": 6.4109058871652875, "grad_norm": 3.718449115753174, "learning_rate": 8.166358825821923e-06, "loss": 0.0048, "step": 3530 }, { "epoch": 6.431043705358547, "grad_norm": 3.594763994216919, "learning_rate": 8.061477796740511e-06, "loss": 0.0054, "step": 3540 }, { "epoch": 6.451181523551807, "grad_norm": 6.66500997543335, "learning_rate": 7.957104360147746e-06, "loss": 0.0046, "step": 3550 }, { "epoch": 6.471319341745068, "grad_norm": 3.4088094234466553, "learning_rate": 7.853242953709467e-06, "loss": 0.006, "step": 3560 }, { "epoch": 6.491457159938328, "grad_norm": 3.0382471084594727, "learning_rate": 7.74989799332146e-06, "loss": 0.0051, "step": 3570 }, { "epoch": 6.511594978131588, "grad_norm": 3.609813928604126, "learning_rate": 7.64707387292166e-06, "loss": 0.005, "step": 3580 }, { "epoch": 6.531732796324848, "grad_norm": 4.544133186340332, "learning_rate": 7.544774964303341e-06, "loss": 0.005, "step": 3590 }, { "epoch": 6.5518706145181085, "grad_norm": 3.8849527835845947, "learning_rate": 7.443005616929277e-06, "loss": 0.0045, "step": 3600 }, { "epoch": 6.572008432711368, "grad_norm": 4.574479579925537, "learning_rate": 7.341770157746737e-06, "loss": 0.0047, "step": 3610 }, { "epoch": 6.592146250904628, "grad_norm": 3.9820139408111572, "learning_rate": 7.241072891003589e-06, "loss": 0.005, "step": 3620 }, { "epoch": 6.612284069097889, "grad_norm": 3.3841769695281982, "learning_rate": 7.1409180980652596e-06, "loss": 0.0039, "step": 3630 }, { "epoch": 6.632421887291149, "grad_norm": 3.9114112854003906, "learning_rate": 7.041310037232712e-06, "loss": 0.0047, "step": 3640 }, { "epoch": 6.652559705484409, "grad_norm": 10.5076904296875, "learning_rate": 6.942252943561396e-06, "loss": 0.0051, "step": 3650 }, { "epoch": 6.672697523677669, "grad_norm": 3.7937240600585938, "learning_rate": 6.843751028681178e-06, "loss": 0.0041, "step": 3660 }, { "epoch": 6.692835341870929, "grad_norm": 5.625157356262207, "learning_rate": 6.74580848061728e-06, "loss": 0.0044, "step": 3670 }, { "epoch": 6.712973160064189, "grad_norm": 3.2087152004241943, "learning_rate": 6.648429463612218e-06, "loss": 0.0066, "step": 3680 }, { "epoch": 6.733110978257449, "grad_norm": 3.721176862716675, "learning_rate": 6.551618117948746e-06, "loss": 0.0044, "step": 3690 }, { "epoch": 6.753248796450709, "grad_norm": 3.429137945175171, "learning_rate": 6.4553785597738195e-06, "loss": 0.0048, "step": 3700 }, { "epoch": 6.77338661464397, "grad_norm": 3.475482225418091, "learning_rate": 6.359714880923602e-06, "loss": 0.006, "step": 3710 }, { "epoch": 6.79352443283723, "grad_norm": 4.675537586212158, "learning_rate": 6.2646311487494785e-06, "loss": 0.0044, "step": 3720 }, { "epoch": 6.81366225103049, "grad_norm": 6.543276786804199, "learning_rate": 6.170131405945125e-06, "loss": 0.0049, "step": 3730 }, { "epoch": 6.83380006922375, "grad_norm": 3.8348066806793213, "learning_rate": 6.0762196703746324e-06, "loss": 0.0049, "step": 3740 }, { "epoch": 6.85393788741701, "grad_norm": 3.8558757305145264, "learning_rate": 5.982899934901667e-06, "loss": 0.0042, "step": 3750 }, { "epoch": 6.87407570561027, "grad_norm": 3.2190654277801514, "learning_rate": 5.8901761672197165e-06, "loss": 0.0039, "step": 3760 }, { "epoch": 6.89421352380353, "grad_norm": 3.8839619159698486, "learning_rate": 5.798052309683384e-06, "loss": 0.005, "step": 3770 }, { "epoch": 6.914351341996791, "grad_norm": 3.031508684158325, "learning_rate": 5.706532279140782e-06, "loss": 0.0048, "step": 3780 }, { "epoch": 6.934489160190051, "grad_norm": 4.985992431640625, "learning_rate": 5.61561996676699e-06, "loss": 0.0059, "step": 3790 }, { "epoch": 6.954626978383311, "grad_norm": 4.688712120056152, "learning_rate": 5.5253192378985966e-06, "loss": 0.0043, "step": 3800 }, { "epoch": 6.9747647965765704, "grad_norm": 5.569628715515137, "learning_rate": 5.43563393186941e-06, "loss": 0.0043, "step": 3810 }, { "epoch": 6.994902614769831, "grad_norm": 2.9404146671295166, "learning_rate": 5.346567861847168e-06, "loss": 0.0045, "step": 3820 }, { "epoch": 7.01372683596431, "grad_norm": 2.525343656539917, "learning_rate": 5.258124814671403e-06, "loss": 0.007, "step": 3830 }, { "epoch": 7.0320292839167235, "grad_norm": 6.035243988037109, "learning_rate": 5.1703085506925225e-06, "loss": 0.0087, "step": 3840 }, { "epoch": 7.050331731869138, "grad_norm": 3.9128825664520264, "learning_rate": 5.083122803611802e-06, "loss": 0.0065, "step": 3850 }, { "epoch": 7.068634179821551, "grad_norm": 5.710306644439697, "learning_rate": 4.996571280322762e-06, "loss": 0.0116, "step": 3860 }, { "epoch": 7.086936627773965, "grad_norm": 9.083086967468262, "learning_rate": 4.910657660753482e-06, "loss": 0.0094, "step": 3870 }, { "epoch": 7.105239075726378, "grad_norm": 4.0207672119140625, "learning_rate": 4.825385597710148e-06, "loss": 0.0085, "step": 3880 }, { "epoch": 7.123541523678792, "grad_norm": 3.8497507572174072, "learning_rate": 4.740758716721803e-06, "loss": 0.0083, "step": 3890 }, { "epoch": 7.141843971631205, "grad_norm": 5.6043009757995605, "learning_rate": 4.6567806158861164e-06, "loss": 0.0054, "step": 3900 }, { "epoch": 7.1601464195836195, "grad_norm": 3.8586933612823486, "learning_rate": 4.573454865716465e-06, "loss": 0.0068, "step": 3910 }, { "epoch": 7.178448867536033, "grad_norm": 4.219987392425537, "learning_rate": 4.490785008990113e-06, "loss": 0.0084, "step": 3920 }, { "epoch": 7.196751315488447, "grad_norm": 4.6731109619140625, "learning_rate": 4.408774560597544e-06, "loss": 0.0068, "step": 3930 }, { "epoch": 7.21505376344086, "grad_norm": 2.894176483154297, "learning_rate": 4.32742700739309e-06, "loss": 0.007, "step": 3940 }, { "epoch": 7.233356211393274, "grad_norm": 3.3003957271575928, "learning_rate": 4.246745808046599e-06, "loss": 0.0078, "step": 3950 }, { "epoch": 7.251658659345687, "grad_norm": 3.965242862701416, "learning_rate": 4.166734392896438e-06, "loss": 0.0054, "step": 3960 }, { "epoch": 7.269961107298101, "grad_norm": 3.6206185817718506, "learning_rate": 4.087396163803645e-06, "loss": 0.0066, "step": 3970 }, { "epoch": 7.288263555250515, "grad_norm": 3.3707611560821533, "learning_rate": 4.008734494007241e-06, "loss": 0.0084, "step": 3980 }, { "epoch": 7.306566003202929, "grad_norm": 3.408390522003174, "learning_rate": 3.9307527279808665e-06, "loss": 0.0045, "step": 3990 }, { "epoch": 7.324868451155342, "grad_norm": 3.1554362773895264, "learning_rate": 3.85345418129055e-06, "loss": 0.0084, "step": 4000 }, { "epoch": 7.343170899107756, "grad_norm": 3.7730562686920166, "learning_rate": 3.776842140453756e-06, "loss": 0.0056, "step": 4010 }, { "epoch": 7.361473347060169, "grad_norm": 2.509883165359497, "learning_rate": 3.700919862799639e-06, "loss": 0.0077, "step": 4020 }, { "epoch": 7.379775795012583, "grad_norm": 4.287370681762695, "learning_rate": 3.6256905763305605e-06, "loss": 0.0067, "step": 4030 }, { "epoch": 7.3980782429649965, "grad_norm": 6.043769359588623, "learning_rate": 3.5511574795848415e-06, "loss": 0.0051, "step": 4040 }, { "epoch": 7.416380690917411, "grad_norm": 11.113882064819336, "learning_rate": 3.4773237415007644e-06, "loss": 0.0077, "step": 4050 }, { "epoch": 7.434683138869824, "grad_norm": 4.403820037841797, "learning_rate": 3.4041925012818423e-06, "loss": 0.0061, "step": 4060 }, { "epoch": 7.452985586822238, "grad_norm": 3.961599826812744, "learning_rate": 3.3317668682633532e-06, "loss": 0.0081, "step": 4070 }, { "epoch": 7.471288034774651, "grad_norm": 2.8031773567199707, "learning_rate": 3.2600499217801307e-06, "loss": 0.0083, "step": 4080 }, { "epoch": 7.489590482727065, "grad_norm": 2.444967269897461, "learning_rate": 3.189044711035645e-06, "loss": 0.0082, "step": 4090 }, { "epoch": 7.507892930679478, "grad_norm": 2.957968235015869, "learning_rate": 3.1187542549723625e-06, "loss": 0.0083, "step": 4100 }, { "epoch": 7.526195378631892, "grad_norm": 4.196249961853027, "learning_rate": 3.0491815421433825e-06, "loss": 0.0053, "step": 4110 }, { "epoch": 7.544497826584306, "grad_norm": 4.068223476409912, "learning_rate": 2.980329530585362e-06, "loss": 0.0048, "step": 4120 }, { "epoch": 7.56280027453672, "grad_norm": 10.506719589233398, "learning_rate": 2.912201147692786e-06, "loss": 0.0053, "step": 4130 }, { "epoch": 7.581102722489133, "grad_norm": 3.4478495121002197, "learning_rate": 2.8447992900934583e-06, "loss": 0.0064, "step": 4140 }, { "epoch": 7.599405170441546, "grad_norm": 3.022067070007324, "learning_rate": 2.778126823525373e-06, "loss": 0.0045, "step": 4150 }, { "epoch": 7.61770761839396, "grad_norm": 2.931964874267578, "learning_rate": 2.712186582714862e-06, "loss": 0.0074, "step": 4160 }, { "epoch": 7.6360100663463735, "grad_norm": 5.001142978668213, "learning_rate": 2.6469813712560544e-06, "loss": 0.005, "step": 4170 }, { "epoch": 7.654312514298788, "grad_norm": 4.039334297180176, "learning_rate": 2.5825139614917238e-06, "loss": 0.0054, "step": 4180 }, { "epoch": 7.672614962251201, "grad_norm": 2.894651412963867, "learning_rate": 2.518787094395363e-06, "loss": 0.0051, "step": 4190 }, { "epoch": 7.690917410203615, "grad_norm": 3.445218801498413, "learning_rate": 2.455803479454664e-06, "loss": 0.0077, "step": 4200 }, { "epoch": 7.709219858156028, "grad_norm": 5.986388683319092, "learning_rate": 2.3935657945563427e-06, "loss": 0.0051, "step": 4210 }, { "epoch": 7.727522306108442, "grad_norm": 4.5863237380981445, "learning_rate": 2.332076685872231e-06, "loss": 0.0062, "step": 4220 }, { "epoch": 7.745824754060855, "grad_norm": 3.8240745067596436, "learning_rate": 2.2713387677468267e-06, "loss": 0.0066, "step": 4230 }, { "epoch": 7.7641272020132694, "grad_norm": 3.147395372390747, "learning_rate": 2.2113546225861037e-06, "loss": 0.0067, "step": 4240 }, { "epoch": 7.782429649965683, "grad_norm": 4.106767177581787, "learning_rate": 2.1521268007477047e-06, "loss": 0.008, "step": 4250 }, { "epoch": 7.800732097918097, "grad_norm": 3.5174560546875, "learning_rate": 2.0936578204325575e-06, "loss": 0.008, "step": 4260 }, { "epoch": 7.81903454587051, "grad_norm": 3.5646681785583496, "learning_rate": 2.035950167577747e-06, "loss": 0.0062, "step": 4270 }, { "epoch": 7.837336993822924, "grad_norm": 3.414524555206299, "learning_rate": 1.9790062957508626e-06, "loss": 0.0074, "step": 4280 }, { "epoch": 7.855639441775337, "grad_norm": 4.365599632263184, "learning_rate": 1.9228286260456673e-06, "loss": 0.0102, "step": 4290 }, { "epoch": 7.873941889727751, "grad_norm": 6.737311840057373, "learning_rate": 1.8674195469791524e-06, "loss": 0.006, "step": 4300 }, { "epoch": 7.8922443376801645, "grad_norm": 8.64922046661377, "learning_rate": 1.8127814143900012e-06, "loss": 0.0061, "step": 4310 }, { "epoch": 7.910546785632579, "grad_norm": 3.7279410362243652, "learning_rate": 1.7589165513383988e-06, "loss": 0.0062, "step": 4320 }, { "epoch": 7.928849233584992, "grad_norm": 5.265659332275391, "learning_rate": 1.7058272480072879e-06, "loss": 0.0063, "step": 4330 }, { "epoch": 7.947151681537406, "grad_norm": 3.1222264766693115, "learning_rate": 1.6535157616049867e-06, "loss": 0.0058, "step": 4340 }, { "epoch": 7.965454129489819, "grad_norm": 3.9116389751434326, "learning_rate": 1.601984316269214e-06, "loss": 0.0066, "step": 4350 }, { "epoch": 7.983756577442233, "grad_norm": 8.46506118774414, "learning_rate": 1.5512351029725325e-06, "loss": 0.0052, "step": 4360 }, { "epoch": 7.036990501080565, "grad_norm": 3.518129825592041, "learning_rate": 1.5012702794291901e-06, "loss": 0.0049, "step": 4370 }, { "epoch": 7.053073327637332, "grad_norm": 5.544999122619629, "learning_rate": 1.4520919700033864e-06, "loss": 0.0054, "step": 4380 }, { "epoch": 7.0691561541941, "grad_norm": 2.621429681777954, "learning_rate": 1.4037022656189425e-06, "loss": 0.0071, "step": 4390 }, { "epoch": 7.085238980750867, "grad_norm": 6.043819427490234, "learning_rate": 1.356103223670402e-06, "loss": 0.0073, "step": 4400 }, { "epoch": 7.1013218073076345, "grad_norm": 3.584305763244629, "learning_rate": 1.3092968679355634e-06, "loss": 0.0045, "step": 4410 }, { "epoch": 7.1174046338644015, "grad_norm": 6.561378479003906, "learning_rate": 1.2632851884894293e-06, "loss": 0.0091, "step": 4420 }, { "epoch": 7.133487460421169, "grad_norm": 15.719043731689453, "learning_rate": 1.2180701416195894e-06, "loss": 0.0155, "step": 4430 }, { "epoch": 7.149570286977936, "grad_norm": 7.752620697021484, "learning_rate": 1.1736536497430584e-06, "loss": 0.0098, "step": 4440 }, { "epoch": 7.165653113534704, "grad_norm": 7.116891384124756, "learning_rate": 1.1300376013245272e-06, "loss": 0.0107, "step": 4450 }, { "epoch": 7.181735940091471, "grad_norm": 4.907498836517334, "learning_rate": 1.0872238507960753e-06, "loss": 0.0087, "step": 4460 }, { "epoch": 7.197818766648238, "grad_norm": 3.2054624557495117, "learning_rate": 1.0452142184783232e-06, "loss": 0.0091, "step": 4470 }, { "epoch": 7.213901593205006, "grad_norm": 4.177403926849365, "learning_rate": 1.0040104905030467e-06, "loss": 0.0064, "step": 4480 }, { "epoch": 7.229984419761773, "grad_norm": 5.655606269836426, "learning_rate": 9.63614418737222e-07, "loss": 0.0107, "step": 4490 }, { "epoch": 7.246067246318541, "grad_norm": 2.246121883392334, "learning_rate": 9.240277207085557e-07, "loss": 0.008, "step": 4500 }, { "epoch": 7.262150072875308, "grad_norm": 5.548453330993652, "learning_rate": 8.852520795324349e-07, "loss": 0.0074, "step": 4510 }, { "epoch": 7.2782328994320755, "grad_norm": 12.830928802490234, "learning_rate": 8.472891438404108e-07, "loss": 0.0123, "step": 4520 }, { "epoch": 7.2943157259888425, "grad_norm": 3.613041400909424, "learning_rate": 8.101405277100549e-07, "loss": 0.0118, "step": 4530 }, { "epoch": 7.31039855254561, "grad_norm": 3.5919158458709717, "learning_rate": 7.738078105963565e-07, "loss": 0.0058, "step": 4540 }, { "epoch": 7.326481379102377, "grad_norm": 6.3182196617126465, "learning_rate": 7.3829253726458e-07, "loss": 0.0106, "step": 4550 }, { "epoch": 7.342564205659144, "grad_norm": 4.684800624847412, "learning_rate": 7.035962177245536e-07, "loss": 0.0065, "step": 4560 }, { "epoch": 7.358647032215912, "grad_norm": 2.9966583251953125, "learning_rate": 6.697203271665054e-07, "loss": 0.0081, "step": 4570 }, { "epoch": 7.374729858772679, "grad_norm": 3.141700506210327, "learning_rate": 6.366663058983102e-07, "loss": 0.009, "step": 4580 }, { "epoch": 7.390812685329447, "grad_norm": 4.013637542724609, "learning_rate": 6.044355592842644e-07, "loss": 0.0087, "step": 4590 }, { "epoch": 7.406895511886214, "grad_norm": 3.2047641277313232, "learning_rate": 5.730294576853501e-07, "loss": 0.007, "step": 4600 }, { "epoch": 7.422978338442982, "grad_norm": 6.322583198547363, "learning_rate": 5.424493364009364e-07, "loss": 0.0066, "step": 4610 }, { "epoch": 7.439061164999749, "grad_norm": 7.329522132873535, "learning_rate": 5.126964956120351e-07, "loss": 0.0095, "step": 4620 }, { "epoch": 7.4551439915565165, "grad_norm": 3.080005168914795, "learning_rate": 4.837722003260136e-07, "loss": 0.0091, "step": 4630 }, { "epoch": 7.4712268181132835, "grad_norm": 4.727446556091309, "learning_rate": 4.5567768032280136e-07, "loss": 0.0077, "step": 4640 }, { "epoch": 7.4873096446700504, "grad_norm": 2.2113332748413086, "learning_rate": 4.2841413010261456e-07, "loss": 0.0066, "step": 4650 }, { "epoch": 7.503392471226818, "grad_norm": 11.121126174926758, "learning_rate": 4.01982708835158e-07, "loss": 0.0066, "step": 4660 }, { "epoch": 7.519475297783585, "grad_norm": 4.20743465423584, "learning_rate": 3.7638454031035276e-07, "loss": 0.0111, "step": 4670 }, { "epoch": 7.535558124340353, "grad_norm": 6.002978324890137, "learning_rate": 3.5162071289055245e-07, "loss": 0.0066, "step": 4680 }, { "epoch": 7.55164095089712, "grad_norm": 3.0463833808898926, "learning_rate": 3.276922794642534e-07, "loss": 0.0072, "step": 4690 }, { "epoch": 7.567723777453887, "grad_norm": 6.407285213470459, "learning_rate": 3.046002574013551e-07, "loss": 0.0072, "step": 4700 }, { "epoch": 7.583806604010655, "grad_norm": 8.836779594421387, "learning_rate": 2.8234562850988356e-07, "loss": 0.0079, "step": 4710 }, { "epoch": 7.599889430567423, "grad_norm": 5.871425628662109, "learning_rate": 2.609293389942602e-07, "loss": 0.0077, "step": 4720 }, { "epoch": 7.61597225712419, "grad_norm": 6.142898082733154, "learning_rate": 2.403522994150609e-07, "loss": 0.0071, "step": 4730 }, { "epoch": 7.632055083680957, "grad_norm": 3.5649702548980713, "learning_rate": 2.2061538465031117e-07, "loss": 0.0071, "step": 4740 }, { "epoch": 7.6481379102377245, "grad_norm": 7.242725372314453, "learning_rate": 2.017194338582873e-07, "loss": 0.0105, "step": 4750 }, { "epoch": 7.6642207367944915, "grad_norm": 12.46696662902832, "learning_rate": 1.8366525044183126e-07, "loss": 0.0095, "step": 4760 }, { "epoch": 7.680303563351259, "grad_norm": 5.7213640213012695, "learning_rate": 1.6645360201420046e-07, "loss": 0.0095, "step": 4770 }, { "epoch": 7.696386389908026, "grad_norm": 3.3008534908294678, "learning_rate": 1.5008522036642048e-07, "loss": 0.0078, "step": 4780 }, { "epoch": 7.712469216464793, "grad_norm": 4.01139497756958, "learning_rate": 1.3456080143618767e-07, "loss": 0.0097, "step": 4790 }, { "epoch": 7.728552043021561, "grad_norm": 4.135232448577881, "learning_rate": 1.198810052782595e-07, "loss": 0.0091, "step": 4800 }, { "epoch": 7.744634869578328, "grad_norm": 6.216122150421143, "learning_rate": 1.060464560364105e-07, "loss": 0.0071, "step": 4810 }, { "epoch": 7.760717696135096, "grad_norm": 3.039104461669922, "learning_rate": 9.305774191687988e-08, "loss": 0.009, "step": 4820 }, { "epoch": 7.776800522691863, "grad_norm": 2.9482333660125732, "learning_rate": 8.091541516337398e-08, "loss": 0.0104, "step": 4830 }, { "epoch": 7.792883349248631, "grad_norm": 8.094056129455566, "learning_rate": 6.961999203357605e-08, "loss": 0.008, "step": 4840 }, { "epoch": 7.808966175805398, "grad_norm": 2.1247482299804688, "learning_rate": 5.917195277721055e-08, "loss": 0.0046, "step": 4850 }, { "epoch": 7.8250490023621655, "grad_norm": 7.984273433685303, "learning_rate": 4.957174161560607e-08, "loss": 0.0109, "step": 4860 }, { "epoch": 7.8411318289189325, "grad_norm": 5.030553817749023, "learning_rate": 4.0819766722826057e-08, "loss": 0.0062, "step": 4870 }, { "epoch": 7.857214655475699, "grad_norm": 2.809941291809082, "learning_rate": 3.291640020829823e-08, "loss": 0.0081, "step": 4880 }, { "epoch": 7.873297482032467, "grad_norm": 6.414947032928467, "learning_rate": 2.5861978101009433e-08, "loss": 0.0075, "step": 4890 }, { "epoch": 7.889380308589234, "grad_norm": 5.527952671051025, "learning_rate": 1.9656800335206004e-08, "loss": 0.0058, "step": 4900 }, { "epoch": 7.905463135146002, "grad_norm": 6.868896007537842, "learning_rate": 1.4301130737646163e-08, "loss": 0.0067, "step": 4910 }, { "epoch": 7.921545961702769, "grad_norm": 11.053786277770996, "learning_rate": 9.795197016384538e-09, "loss": 0.0102, "step": 4920 }, { "epoch": 7.937628788259537, "grad_norm": 5.266638278961182, "learning_rate": 6.1391907510888195e-09, "loss": 0.0095, "step": 4930 }, { "epoch": 7.953711614816304, "grad_norm": 3.4717257022857666, "learning_rate": 3.3332673848951448e-09, "loss": 0.0065, "step": 4940 }, { "epoch": 7.969794441373072, "grad_norm": 4.068334579467773, "learning_rate": 1.3775462177956222e-09, "loss": 0.0083, "step": 4950 }, { "epoch": 7.985877267929839, "grad_norm": 3.321983814239502, "learning_rate": 2.721104015712683e-10, "loss": 0.0062, "step": 4960 } ], "logging_steps": 10, "max_steps": 4968, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.935941540711301e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }