{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.998743529175252,
"eval_steps": 500,
"global_step": 4968,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016082826556767352,
"grad_norm": 93.80207061767578,
"learning_rate": 2.666666666666667e-06,
"loss": 0.8299,
"step": 10
},
{
"epoch": 0.032165653113534705,
"grad_norm": 50.166954040527344,
"learning_rate": 5.333333333333334e-06,
"loss": 0.7131,
"step": 20
},
{
"epoch": 0.048248479670302054,
"grad_norm": 37.23706817626953,
"learning_rate": 8.000000000000001e-06,
"loss": 0.5976,
"step": 30
},
{
"epoch": 0.06433130622706941,
"grad_norm": 37.21980285644531,
"learning_rate": 1.0666666666666667e-05,
"loss": 0.5263,
"step": 40
},
{
"epoch": 0.08041413278383676,
"grad_norm": 29.091915130615234,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.4731,
"step": 50
},
{
"epoch": 0.09649695934060411,
"grad_norm": 32.472801208496094,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.4357,
"step": 60
},
{
"epoch": 0.11257978589737146,
"grad_norm": 29.79865264892578,
"learning_rate": 1.866666666666667e-05,
"loss": 0.3916,
"step": 70
},
{
"epoch": 0.12866261245413882,
"grad_norm": 28.13816261291504,
"learning_rate": 2.1333333333333335e-05,
"loss": 0.3721,
"step": 80
},
{
"epoch": 0.14474543901090617,
"grad_norm": 30.40574073791504,
"learning_rate": 2.4e-05,
"loss": 0.3382,
"step": 90
},
{
"epoch": 0.16082826556767352,
"grad_norm": 30.368940353393555,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.3207,
"step": 100
},
{
"epoch": 0.17691109212444087,
"grad_norm": 31.629531860351562,
"learning_rate": 2.9333333333333333e-05,
"loss": 0.305,
"step": 110
},
{
"epoch": 0.19299391868120822,
"grad_norm": 29.47364044189453,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.2812,
"step": 120
},
{
"epoch": 0.20907674523797556,
"grad_norm": 30.890962600708008,
"learning_rate": 3.466666666666667e-05,
"loss": 0.2665,
"step": 130
},
{
"epoch": 0.2251595717947429,
"grad_norm": 31.893320083618164,
"learning_rate": 3.733333333333334e-05,
"loss": 0.2505,
"step": 140
},
{
"epoch": 0.2412423983515103,
"grad_norm": 29.82271957397461,
"learning_rate": 4e-05,
"loss": 0.2404,
"step": 150
},
{
"epoch": 0.25732522490827764,
"grad_norm": 31.970462799072266,
"learning_rate": 3.9999574828039864e-05,
"loss": 0.2188,
"step": 160
},
{
"epoch": 0.273408051465045,
"grad_norm": 25.94739532470703,
"learning_rate": 3.999829933023657e-05,
"loss": 0.2156,
"step": 170
},
{
"epoch": 0.28949087802181234,
"grad_norm": 32.104461669921875,
"learning_rate": 3.9996173560820705e-05,
"loss": 0.2064,
"step": 180
},
{
"epoch": 1.018596079745351,
"grad_norm": 29.600008010864258,
"learning_rate": 3.999319761017403e-05,
"loss": 0.2122,
"step": 190
},
{
"epoch": 1.0722063997319484,
"grad_norm": 28.94344139099121,
"learning_rate": 3.998937160482562e-05,
"loss": 0.1835,
"step": 200
},
{
"epoch": 1.1258167197185458,
"grad_norm": 27.56523323059082,
"learning_rate": 3.998469570744648e-05,
"loss": 0.1815,
"step": 210
},
{
"epoch": 1.1794270397051432,
"grad_norm": 28.684629440307617,
"learning_rate": 3.997917011684268e-05,
"loss": 0.1717,
"step": 220
},
{
"epoch": 1.2330373596917408,
"grad_norm": 27.716796875,
"learning_rate": 3.9972795067946826e-05,
"loss": 0.1615,
"step": 230
},
{
"epoch": 1.2866476796783382,
"grad_norm": 25.33133316040039,
"learning_rate": 3.996557083180813e-05,
"loss": 0.1585,
"step": 240
},
{
"epoch": 1.3402579996649355,
"grad_norm": 28.910871505737305,
"learning_rate": 3.9957497715580844e-05,
"loss": 0.1488,
"step": 250
},
{
"epoch": 1.393868319651533,
"grad_norm": 26.948163986206055,
"learning_rate": 3.994857606251124e-05,
"loss": 0.1448,
"step": 260
},
{
"epoch": 1.4474786396381303,
"grad_norm": 26.12610626220703,
"learning_rate": 3.993880625192298e-05,
"loss": 0.1411,
"step": 270
},
{
"epoch": 1.5010889596247279,
"grad_norm": 26.017358779907227,
"learning_rate": 3.9928188699201035e-05,
"loss": 0.1375,
"step": 280
},
{
"epoch": 1.554699279611325,
"grad_norm": 26.367809295654297,
"learning_rate": 3.991672385577396e-05,
"loss": 0.1272,
"step": 290
},
{
"epoch": 1.6083095995979226,
"grad_norm": 24.2690372467041,
"learning_rate": 3.9904412209094755e-05,
"loss": 0.1288,
"step": 300
},
{
"epoch": 1.66191991958452,
"grad_norm": 29.307832717895508,
"learning_rate": 3.9891254282620115e-05,
"loss": 0.1321,
"step": 310
},
{
"epoch": 1.7155302395711174,
"grad_norm": 27.43846321105957,
"learning_rate": 3.9877250635788184e-05,
"loss": 0.1161,
"step": 320
},
{
"epoch": 1.769140559557715,
"grad_norm": 23.861331939697266,
"learning_rate": 3.9862401863994744e-05,
"loss": 0.1169,
"step": 330
},
{
"epoch": 1.8227508795443121,
"grad_norm": 23.86732292175293,
"learning_rate": 3.9846708598567956e-05,
"loss": 0.1123,
"step": 340
},
{
"epoch": 1.8763611995309097,
"grad_norm": 28.334510803222656,
"learning_rate": 3.983017150674145e-05,
"loss": 0.1042,
"step": 350
},
{
"epoch": 1.9299715195175071,
"grad_norm": 26.570316314697266,
"learning_rate": 3.9812791291626e-05,
"loss": 0.1069,
"step": 360
},
{
"epoch": 1.9835818395041045,
"grad_norm": 23.24406623840332,
"learning_rate": 3.979456869217962e-05,
"loss": 0.1074,
"step": 370
},
{
"epoch": 2.0316112458969604,
"grad_norm": 23.662790298461914,
"learning_rate": 3.977550448317615e-05,
"loss": 0.1278,
"step": 380
},
{
"epoch": 2.077279862994149,
"grad_norm": 20.232288360595703,
"learning_rate": 3.97555994751723e-05,
"loss": 0.1282,
"step": 390
},
{
"epoch": 2.1229484800913374,
"grad_norm": 24.64867401123047,
"learning_rate": 3.973485451447318e-05,
"loss": 0.1146,
"step": 400
},
{
"epoch": 2.168617097188526,
"grad_norm": 25.983293533325195,
"learning_rate": 3.9713270483096374e-05,
"loss": 0.1123,
"step": 410
},
{
"epoch": 2.2142857142857144,
"grad_norm": 23.095712661743164,
"learning_rate": 3.969084829873436e-05,
"loss": 0.103,
"step": 420
},
{
"epoch": 2.259954331382903,
"grad_norm": 23.112424850463867,
"learning_rate": 3.966758891471555e-05,
"loss": 0.1065,
"step": 430
},
{
"epoch": 2.3056229484800914,
"grad_norm": 23.07062339782715,
"learning_rate": 3.964349331996373e-05,
"loss": 0.1021,
"step": 440
},
{
"epoch": 2.35129156557728,
"grad_norm": 25.471771240234375,
"learning_rate": 3.961856253895603e-05,
"loss": 0.1064,
"step": 450
},
{
"epoch": 2.3969601826744684,
"grad_norm": 24.417654037475586,
"learning_rate": 3.959279763167935e-05,
"loss": 0.0956,
"step": 460
},
{
"epoch": 2.442628799771657,
"grad_norm": 24.067520141601562,
"learning_rate": 3.9566199693585304e-05,
"loss": 0.1113,
"step": 470
},
{
"epoch": 2.4882974168688454,
"grad_norm": 23.70163345336914,
"learning_rate": 3.953876985554364e-05,
"loss": 0.0911,
"step": 480
},
{
"epoch": 2.533966033966034,
"grad_norm": 23.784481048583984,
"learning_rate": 3.951050928379415e-05,
"loss": 0.0888,
"step": 490
},
{
"epoch": 2.5796346510632224,
"grad_norm": 23.621828079223633,
"learning_rate": 3.948141917989712e-05,
"loss": 0.0904,
"step": 500
},
{
"epoch": 2.625303268160411,
"grad_norm": 19.741653442382812,
"learning_rate": 3.945150078068219e-05,
"loss": 0.0879,
"step": 510
},
{
"epoch": 2.6709718852575994,
"grad_norm": 22.743593215942383,
"learning_rate": 3.9420755358195804e-05,
"loss": 0.0851,
"step": 520
},
{
"epoch": 2.716640502354788,
"grad_norm": 18.910329818725586,
"learning_rate": 3.938918421964711e-05,
"loss": 0.0801,
"step": 530
},
{
"epoch": 2.7623091194519764,
"grad_norm": 22.360628128051758,
"learning_rate": 3.9356788707352406e-05,
"loss": 0.078,
"step": 540
},
{
"epoch": 2.807977736549165,
"grad_norm": 24.22591209411621,
"learning_rate": 3.932357019867803e-05,
"loss": 0.0822,
"step": 550
},
{
"epoch": 2.8536463536463534,
"grad_norm": 24.46196746826172,
"learning_rate": 3.928953010598183e-05,
"loss": 0.0695,
"step": 560
},
{
"epoch": 2.899314970743542,
"grad_norm": 24.530746459960938,
"learning_rate": 3.925466987655309e-05,
"loss": 0.082,
"step": 570
},
{
"epoch": 2.9449835878407304,
"grad_norm": 23.36806297302246,
"learning_rate": 3.921899099255104e-05,
"loss": 0.0751,
"step": 580
},
{
"epoch": 2.9906522049379194,
"grad_norm": 19.65323257446289,
"learning_rate": 3.918249497094176e-05,
"loss": 0.07,
"step": 590
},
{
"epoch": 3.02390615940413,
"grad_norm": 17.223087310791016,
"learning_rate": 3.9145183363433777e-05,
"loss": 0.0662,
"step": 600
},
{
"epoch": 3.05245082734936,
"grad_norm": 19.922897338867188,
"learning_rate": 3.9107057756411995e-05,
"loss": 0.0695,
"step": 610
},
{
"epoch": 3.08099549529459,
"grad_norm": 22.456689834594727,
"learning_rate": 3.906811977087035e-05,
"loss": 0.0574,
"step": 620
},
{
"epoch": 3.10954016323982,
"grad_norm": 18.3155460357666,
"learning_rate": 3.902837106234278e-05,
"loss": 0.0638,
"step": 630
},
{
"epoch": 3.1380848311850498,
"grad_norm": 19.499990463256836,
"learning_rate": 3.8987813320832935e-05,
"loss": 0.0663,
"step": 640
},
{
"epoch": 3.1666294991302797,
"grad_norm": 18.689781188964844,
"learning_rate": 3.894644827074225e-05,
"loss": 0.0583,
"step": 650
},
{
"epoch": 3.1951741670755096,
"grad_norm": 19.73504066467285,
"learning_rate": 3.890427767079667e-05,
"loss": 0.062,
"step": 660
},
{
"epoch": 3.2237188350207395,
"grad_norm": 19.44004249572754,
"learning_rate": 3.886130331397186e-05,
"loss": 0.0577,
"step": 670
},
{
"epoch": 3.2522635029659694,
"grad_norm": 19.139127731323242,
"learning_rate": 3.881752702741697e-05,
"loss": 0.0618,
"step": 680
},
{
"epoch": 3.2808081709111994,
"grad_norm": 21.88005828857422,
"learning_rate": 3.877295067237697e-05,
"loss": 0.059,
"step": 690
},
{
"epoch": 3.3093528388564293,
"grad_norm": 24.21089744567871,
"learning_rate": 3.872757614411346e-05,
"loss": 0.0593,
"step": 700
},
{
"epoch": 3.337897506801659,
"grad_norm": 20.264284133911133,
"learning_rate": 3.868140537182417e-05,
"loss": 0.054,
"step": 710
},
{
"epoch": 3.366442174746889,
"grad_norm": 21.731857299804688,
"learning_rate": 3.863444031856088e-05,
"loss": 0.062,
"step": 720
},
{
"epoch": 3.394986842692119,
"grad_norm": 21.47838592529297,
"learning_rate": 3.8586682981145956e-05,
"loss": 0.0552,
"step": 730
},
{
"epoch": 3.423531510637349,
"grad_norm": 18.726280212402344,
"learning_rate": 3.853813539008746e-05,
"loss": 0.0532,
"step": 740
},
{
"epoch": 3.452076178582579,
"grad_norm": 19.791046142578125,
"learning_rate": 3.848879960949287e-05,
"loss": 0.0558,
"step": 750
},
{
"epoch": 3.480620846527809,
"grad_norm": 18.885759353637695,
"learning_rate": 3.8438677736981215e-05,
"loss": 0.0553,
"step": 760
},
{
"epoch": 3.5091655144730387,
"grad_norm": 16.527170181274414,
"learning_rate": 3.838777190359397e-05,
"loss": 0.0476,
"step": 770
},
{
"epoch": 3.5377101824182686,
"grad_norm": 16.75018310546875,
"learning_rate": 3.8336084273704457e-05,
"loss": 0.0532,
"step": 780
},
{
"epoch": 3.5662548503634985,
"grad_norm": 18.81423568725586,
"learning_rate": 3.828361704492575e-05,
"loss": 0.0499,
"step": 790
},
{
"epoch": 3.5947995183087285,
"grad_norm": 19.174463272094727,
"learning_rate": 3.823037244801729e-05,
"loss": 0.0494,
"step": 800
},
{
"epoch": 3.6233441862539584,
"grad_norm": 17.0285701751709,
"learning_rate": 3.817635274679006e-05,
"loss": 0.0461,
"step": 810
},
{
"epoch": 3.6518888541991883,
"grad_norm": 17.395580291748047,
"learning_rate": 3.812156023801028e-05,
"loss": 0.0496,
"step": 820
},
{
"epoch": 3.680433522144418,
"grad_norm": 18.277786254882812,
"learning_rate": 3.8065997251301776e-05,
"loss": 0.0477,
"step": 830
},
{
"epoch": 3.708978190089648,
"grad_norm": 17.72475242614746,
"learning_rate": 3.8009666149046957e-05,
"loss": 0.0457,
"step": 840
},
{
"epoch": 3.737522858034878,
"grad_norm": 20.809040069580078,
"learning_rate": 3.7952569326286336e-05,
"loss": 0.0471,
"step": 850
},
{
"epoch": 3.766067525980108,
"grad_norm": 17.868568420410156,
"learning_rate": 3.7894709210616714e-05,
"loss": 0.0456,
"step": 860
},
{
"epoch": 3.794612193925338,
"grad_norm": 15.575334548950195,
"learning_rate": 3.7836088262087975e-05,
"loss": 0.044,
"step": 870
},
{
"epoch": 3.823156861870568,
"grad_norm": 17.568668365478516,
"learning_rate": 3.7776708973098476e-05,
"loss": 0.0446,
"step": 880
},
{
"epoch": 3.8517015298157977,
"grad_norm": 17.17595672607422,
"learning_rate": 3.771657386828908e-05,
"loss": 0.0496,
"step": 890
},
{
"epoch": 3.8802461977610276,
"grad_norm": 24.375370025634766,
"learning_rate": 3.765568550443583e-05,
"loss": 0.0424,
"step": 900
},
{
"epoch": 3.9087908657062576,
"grad_norm": 16.25655174255371,
"learning_rate": 3.7594046470341246e-05,
"loss": 0.046,
"step": 910
},
{
"epoch": 3.9373355336514875,
"grad_norm": 18.85159683227539,
"learning_rate": 3.7531659386724195e-05,
"loss": 0.0435,
"step": 920
},
{
"epoch": 3.9658802015967174,
"grad_norm": 19.97796058654785,
"learning_rate": 3.746852690610855e-05,
"loss": 0.0431,
"step": 930
},
{
"epoch": 3.9944248695419473,
"grad_norm": 15.388335227966309,
"learning_rate": 3.7404651712710365e-05,
"loss": 0.0389,
"step": 940
},
{
"epoch": 4.019183642211671,
"grad_norm": 20.02805519104004,
"learning_rate": 3.734003652232376e-05,
"loss": 0.039,
"step": 950
},
{
"epoch": 4.043689692142748,
"grad_norm": 15.568504333496094,
"learning_rate": 3.727468408220544e-05,
"loss": 0.0375,
"step": 960
},
{
"epoch": 4.068195742073825,
"grad_norm": 15.18822956085205,
"learning_rate": 3.720859717095792e-05,
"loss": 0.0365,
"step": 970
},
{
"epoch": 4.092701792004902,
"grad_norm": 14.499895095825195,
"learning_rate": 3.714177859841136e-05,
"loss": 0.038,
"step": 980
},
{
"epoch": 4.117207841935978,
"grad_norm": 18.488901138305664,
"learning_rate": 3.707423120550411e-05,
"loss": 0.0406,
"step": 990
},
{
"epoch": 4.141713891867055,
"grad_norm": 16.12656593322754,
"learning_rate": 3.7005957864161905e-05,
"loss": 0.0354,
"step": 1000
},
{
"epoch": 4.166219941798132,
"grad_norm": 18.07503318786621,
"learning_rate": 3.693696147717579e-05,
"loss": 0.0373,
"step": 1010
},
{
"epoch": 4.190725991729209,
"grad_norm": 17.39132308959961,
"learning_rate": 3.686724497807867e-05,
"loss": 0.0345,
"step": 1020
},
{
"epoch": 4.215232041660285,
"grad_norm": 15.007177352905273,
"learning_rate": 3.67968113310206e-05,
"loss": 0.0325,
"step": 1030
},
{
"epoch": 4.239738091591361,
"grad_norm": 15.444381713867188,
"learning_rate": 3.6725663530642755e-05,
"loss": 0.0327,
"step": 1040
},
{
"epoch": 4.264244141522438,
"grad_norm": 14.16204833984375,
"learning_rate": 3.6653804601950126e-05,
"loss": 0.0338,
"step": 1050
},
{
"epoch": 4.288750191453515,
"grad_norm": 16.405170440673828,
"learning_rate": 3.6581237600182856e-05,
"loss": 0.0342,
"step": 1060
},
{
"epoch": 4.313256241384591,
"grad_norm": 19.641298294067383,
"learning_rate": 3.650796561068639e-05,
"loss": 0.0394,
"step": 1070
},
{
"epoch": 4.337762291315668,
"grad_norm": 14.00063705444336,
"learning_rate": 3.6433991748780255e-05,
"loss": 0.0336,
"step": 1080
},
{
"epoch": 4.362268341246745,
"grad_norm": 13.914216995239258,
"learning_rate": 3.635931915962565e-05,
"loss": 0.0326,
"step": 1090
},
{
"epoch": 4.386774391177822,
"grad_norm": 15.238022804260254,
"learning_rate": 3.628395101809169e-05,
"loss": 0.0312,
"step": 1100
},
{
"epoch": 4.411280441108898,
"grad_norm": 15.279886245727539,
"learning_rate": 3.62078905286204e-05,
"loss": 0.0313,
"step": 1110
},
{
"epoch": 4.435786491039975,
"grad_norm": 15.173819541931152,
"learning_rate": 3.613114092509054e-05,
"loss": 0.0315,
"step": 1120
},
{
"epoch": 4.460292540971052,
"grad_norm": 15.986420631408691,
"learning_rate": 3.6053705470680044e-05,
"loss": 0.0333,
"step": 1130
},
{
"epoch": 4.484798590902129,
"grad_norm": 18.724811553955078,
"learning_rate": 3.59755874577273e-05,
"loss": 0.0322,
"step": 1140
},
{
"epoch": 4.509304640833205,
"grad_norm": 14.428422927856445,
"learning_rate": 3.589679020759118e-05,
"loss": 0.0278,
"step": 1150
},
{
"epoch": 4.533810690764282,
"grad_norm": 14.249613761901855,
"learning_rate": 3.5817317070509814e-05,
"loss": 0.0323,
"step": 1160
},
{
"epoch": 4.558316740695359,
"grad_norm": 13.707551002502441,
"learning_rate": 3.573717142545814e-05,
"loss": 0.0299,
"step": 1170
},
{
"epoch": 4.582822790626436,
"grad_norm": 18.068727493286133,
"learning_rate": 3.565635668000427e-05,
"loss": 0.0319,
"step": 1180
},
{
"epoch": 4.607328840557512,
"grad_norm": 15.44510269165039,
"learning_rate": 3.557487627016458e-05,
"loss": 0.0308,
"step": 1190
},
{
"epoch": 4.631834890488589,
"grad_norm": 15.211899757385254,
"learning_rate": 3.5492733660257605e-05,
"loss": 0.029,
"step": 1200
},
{
"epoch": 4.656340940419666,
"grad_norm": 18.195812225341797,
"learning_rate": 3.5409932342756824e-05,
"loss": 0.029,
"step": 1210
},
{
"epoch": 4.680846990350743,
"grad_norm": 15.29293155670166,
"learning_rate": 3.532647583814205e-05,
"loss": 0.0275,
"step": 1220
},
{
"epoch": 4.705353040281819,
"grad_norm": 13.911247253417969,
"learning_rate": 3.524236769474987e-05,
"loss": 0.0259,
"step": 1230
},
{
"epoch": 4.729859090212896,
"grad_norm": 15.558411598205566,
"learning_rate": 3.51576114886227e-05,
"loss": 0.0287,
"step": 1240
},
{
"epoch": 4.754365140143973,
"grad_norm": 16.093111038208008,
"learning_rate": 3.507221082335676e-05,
"loss": 0.0293,
"step": 1250
},
{
"epoch": 4.77887119007505,
"grad_norm": 13.53354549407959,
"learning_rate": 3.498616932994888e-05,
"loss": 0.0278,
"step": 1260
},
{
"epoch": 4.803377240006126,
"grad_norm": 22.743614196777344,
"learning_rate": 3.489949066664211e-05,
"loss": 0.034,
"step": 1270
},
{
"epoch": 4.827883289937203,
"grad_norm": 14.596455574035645,
"learning_rate": 3.481217851877015e-05,
"loss": 0.0292,
"step": 1280
},
{
"epoch": 4.85238933986828,
"grad_norm": 17.450109481811523,
"learning_rate": 3.4724236598600725e-05,
"loss": 0.0301,
"step": 1290
},
{
"epoch": 4.8768953897993566,
"grad_norm": 15.233014106750488,
"learning_rate": 3.4635668645177674e-05,
"loss": 0.0292,
"step": 1300
},
{
"epoch": 4.901401439730433,
"grad_norm": 15.098063468933105,
"learning_rate": 3.454647842416204e-05,
"loss": 0.0276,
"step": 1310
},
{
"epoch": 4.92590748966151,
"grad_norm": 16.780668258666992,
"learning_rate": 3.4456669727671944e-05,
"loss": 0.027,
"step": 1320
},
{
"epoch": 4.950413539592587,
"grad_norm": 16.340227127075195,
"learning_rate": 3.436624637412132e-05,
"loss": 0.0309,
"step": 1330
},
{
"epoch": 4.9749195895236635,
"grad_norm": 12.311773300170898,
"learning_rate": 3.427521220805763e-05,
"loss": 0.0257,
"step": 1340
},
{
"epoch": 4.99942563945474,
"grad_norm": 15.815475463867188,
"learning_rate": 3.4183571099998355e-05,
"loss": 0.0261,
"step": 1350
},
{
"epoch": 5.02021921165498,
"grad_norm": 12.780634880065918,
"learning_rate": 3.409132694626643e-05,
"loss": 0.0281,
"step": 1360
},
{
"epoch": 5.042004220845531,
"grad_norm": 14.720085144042969,
"learning_rate": 3.3998483668824645e-05,
"loss": 0.0236,
"step": 1370
},
{
"epoch": 5.063789230036082,
"grad_norm": 16.020496368408203,
"learning_rate": 3.390504521510882e-05,
"loss": 0.0241,
"step": 1380
},
{
"epoch": 5.0855742392266325,
"grad_norm": 13.678121566772461,
"learning_rate": 3.381101555785999e-05,
"loss": 0.0232,
"step": 1390
},
{
"epoch": 5.107359248417183,
"grad_norm": 13.695241928100586,
"learning_rate": 3.371639869495554e-05,
"loss": 0.0237,
"step": 1400
},
{
"epoch": 5.1291442576077335,
"grad_norm": 11.553495407104492,
"learning_rate": 3.362119864923918e-05,
"loss": 0.0237,
"step": 1410
},
{
"epoch": 5.1509292667982844,
"grad_norm": 12.397970199584961,
"learning_rate": 3.35254194683499e-05,
"loss": 0.0236,
"step": 1420
},
{
"epoch": 5.172714275988835,
"grad_norm": 14.61323356628418,
"learning_rate": 3.342906522454992e-05,
"loss": 0.0239,
"step": 1430
},
{
"epoch": 5.194499285179386,
"grad_norm": 11.521512031555176,
"learning_rate": 3.333214001455149e-05,
"loss": 0.0191,
"step": 1440
},
{
"epoch": 5.216284294369936,
"grad_norm": 15.835281372070312,
"learning_rate": 3.323464795934279e-05,
"loss": 0.0253,
"step": 1450
},
{
"epoch": 5.238069303560487,
"grad_norm": 14.1622953414917,
"learning_rate": 3.313659320401263e-05,
"loss": 0.0243,
"step": 1460
},
{
"epoch": 5.259854312751038,
"grad_norm": 12.993020057678223,
"learning_rate": 3.303797991757425e-05,
"loss": 0.0211,
"step": 1470
},
{
"epoch": 5.281639321941589,
"grad_norm": 13.310782432556152,
"learning_rate": 3.29388122927881e-05,
"loss": 0.0278,
"step": 1480
},
{
"epoch": 5.30342433113214,
"grad_norm": 17.85926628112793,
"learning_rate": 3.2839094545983505e-05,
"loss": 0.0212,
"step": 1490
},
{
"epoch": 5.32520934032269,
"grad_norm": 12.155655860900879,
"learning_rate": 3.273883091687946e-05,
"loss": 0.0224,
"step": 1500
},
{
"epoch": 5.346994349513241,
"grad_norm": 10.895421981811523,
"learning_rate": 3.2638025668404334e-05,
"loss": 0.0241,
"step": 1510
},
{
"epoch": 5.368779358703792,
"grad_norm": 12.233269691467285,
"learning_rate": 3.2536683086514634e-05,
"loss": 0.0206,
"step": 1520
},
{
"epoch": 5.390564367894343,
"grad_norm": 12.179084777832031,
"learning_rate": 3.243480748001278e-05,
"loss": 0.0241,
"step": 1530
},
{
"epoch": 5.412349377084894,
"grad_norm": 13.7705078125,
"learning_rate": 3.2332403180363906e-05,
"loss": 0.0253,
"step": 1540
},
{
"epoch": 5.434134386275444,
"grad_norm": 10.06460952758789,
"learning_rate": 3.222947454151169e-05,
"loss": 0.0249,
"step": 1550
},
{
"epoch": 5.455919395465995,
"grad_norm": 16.193252563476562,
"learning_rate": 3.212602593969325e-05,
"loss": 0.0245,
"step": 1560
},
{
"epoch": 5.477704404656546,
"grad_norm": 11.988511085510254,
"learning_rate": 3.202206177325306e-05,
"loss": 0.0238,
"step": 1570
},
{
"epoch": 5.499489413847097,
"grad_norm": 11.607162475585938,
"learning_rate": 3.191758646245596e-05,
"loss": 0.0226,
"step": 1580
},
{
"epoch": 5.521274423037648,
"grad_norm": 12.626535415649414,
"learning_rate": 3.181260444929923e-05,
"loss": 0.0204,
"step": 1590
},
{
"epoch": 5.543059432228198,
"grad_norm": 12.591373443603516,
"learning_rate": 3.1707120197323686e-05,
"loss": 0.0207,
"step": 1600
},
{
"epoch": 5.564844441418749,
"grad_norm": 12.233884811401367,
"learning_rate": 3.1601138191423966e-05,
"loss": 0.0223,
"step": 1610
},
{
"epoch": 5.5866294506092995,
"grad_norm": 13.553182601928711,
"learning_rate": 3.149466293765778e-05,
"loss": 0.021,
"step": 1620
},
{
"epoch": 5.60841445979985,
"grad_norm": 13.30004596710205,
"learning_rate": 3.138769896305434e-05,
"loss": 0.0188,
"step": 1630
},
{
"epoch": 5.630199468990401,
"grad_norm": 13.62940502166748,
"learning_rate": 3.128025081542196e-05,
"loss": 0.0176,
"step": 1640
},
{
"epoch": 5.651984478180951,
"grad_norm": 12.465331077575684,
"learning_rate": 3.117232306315456e-05,
"loss": 0.0195,
"step": 1650
},
{
"epoch": 5.673769487371502,
"grad_norm": 12.430222511291504,
"learning_rate": 3.106392029503757e-05,
"loss": 0.0216,
"step": 1660
},
{
"epoch": 5.695554496562053,
"grad_norm": 12.926973342895508,
"learning_rate": 3.09550471200527e-05,
"loss": 0.0192,
"step": 1670
},
{
"epoch": 5.717339505752604,
"grad_norm": 13.914267539978027,
"learning_rate": 3.08457081671821e-05,
"loss": 0.021,
"step": 1680
},
{
"epoch": 5.739124514943155,
"grad_norm": 13.50471019744873,
"learning_rate": 3.073590808521144e-05,
"loss": 0.0218,
"step": 1690
},
{
"epoch": 5.760909524133705,
"grad_norm": 10.271846771240234,
"learning_rate": 3.062565154253233e-05,
"loss": 0.0202,
"step": 1700
},
{
"epoch": 5.782694533324256,
"grad_norm": 14.907472610473633,
"learning_rate": 3.0514943226943816e-05,
"loss": 0.0236,
"step": 1710
},
{
"epoch": 5.804479542514807,
"grad_norm": 13.36329174041748,
"learning_rate": 3.040378784545304e-05,
"loss": 0.021,
"step": 1720
},
{
"epoch": 5.826264551705358,
"grad_norm": 11.128992080688477,
"learning_rate": 3.0292190124075162e-05,
"loss": 0.0176,
"step": 1730
},
{
"epoch": 5.848049560895909,
"grad_norm": 11.310523986816406,
"learning_rate": 3.018015480763236e-05,
"loss": 0.0207,
"step": 1740
},
{
"epoch": 5.869834570086459,
"grad_norm": 11.527318000793457,
"learning_rate": 3.006768665955215e-05,
"loss": 0.0187,
"step": 1750
},
{
"epoch": 5.89161957927701,
"grad_norm": 11.697135925292969,
"learning_rate": 2.9954790461664834e-05,
"loss": 0.0202,
"step": 1760
},
{
"epoch": 5.913404588467561,
"grad_norm": 10.337966918945312,
"learning_rate": 2.984147101400018e-05,
"loss": 0.0168,
"step": 1770
},
{
"epoch": 5.935189597658112,
"grad_norm": 10.729581832885742,
"learning_rate": 2.9727733134583358e-05,
"loss": 0.021,
"step": 1780
},
{
"epoch": 5.956974606848663,
"grad_norm": 11.193035125732422,
"learning_rate": 2.961358165923008e-05,
"loss": 0.0203,
"step": 1790
},
{
"epoch": 5.978759616039213,
"grad_norm": 11.645442962646484,
"learning_rate": 2.9499021441341012e-05,
"loss": 0.0182,
"step": 1800
},
{
"epoch": 6.000544625229764,
"grad_norm": 11.237954139709473,
"learning_rate": 2.938405735169537e-05,
"loss": 0.0184,
"step": 1810
},
{
"epoch": 6.019290314590042,
"grad_norm": 10.546935081481934,
"learning_rate": 2.9268694278243903e-05,
"loss": 0.0179,
"step": 1820
},
{
"epoch": 6.039107010156057,
"grad_norm": 9.956415176391602,
"learning_rate": 2.915293712590102e-05,
"loss": 0.0196,
"step": 1830
},
{
"epoch": 6.058923705722071,
"grad_norm": 9.116511344909668,
"learning_rate": 2.9036790816336252e-05,
"loss": 0.0199,
"step": 1840
},
{
"epoch": 6.078740401288085,
"grad_norm": 16.642379760742188,
"learning_rate": 2.892026028776501e-05,
"loss": 0.0173,
"step": 1850
},
{
"epoch": 6.098557096854099,
"grad_norm": 11.179176330566406,
"learning_rate": 2.8803350494738615e-05,
"loss": 0.019,
"step": 1860
},
{
"epoch": 6.118373792420114,
"grad_norm": 13.457623481750488,
"learning_rate": 2.8686066407933656e-05,
"loss": 0.0164,
"step": 1870
},
{
"epoch": 6.138190487986129,
"grad_norm": 11.937878608703613,
"learning_rate": 2.8568413013940642e-05,
"loss": 0.019,
"step": 1880
},
{
"epoch": 6.158007183552143,
"grad_norm": 14.586573600769043,
"learning_rate": 2.845039531505199e-05,
"loss": 0.0187,
"step": 1890
},
{
"epoch": 6.177823879118157,
"grad_norm": 10.834576606750488,
"learning_rate": 2.833201832904933e-05,
"loss": 0.0205,
"step": 1900
},
{
"epoch": 6.197640574684171,
"grad_norm": 10.595796585083008,
"learning_rate": 2.8213287088990184e-05,
"loss": 0.0194,
"step": 1910
},
{
"epoch": 6.217457270250185,
"grad_norm": 12.627229690551758,
"learning_rate": 2.8094206642993955e-05,
"loss": 0.0145,
"step": 1920
},
{
"epoch": 3.1077549379303413,
"grad_norm": 10.363633155822754,
"learning_rate": 2.7974782054027308e-05,
"loss": 0.0179,
"step": 1930
},
{
"epoch": 3.1238377644871087,
"grad_norm": 14.35067081451416,
"learning_rate": 2.7855018399688908e-05,
"loss": 0.0184,
"step": 1940
},
{
"epoch": 3.139920591043876,
"grad_norm": 10.560155868530273,
"learning_rate": 2.773492077199351e-05,
"loss": 0.0173,
"step": 1950
},
{
"epoch": 3.1560034176006435,
"grad_norm": 10.620994567871094,
"learning_rate": 2.76144942771555e-05,
"loss": 0.0155,
"step": 1960
},
{
"epoch": 3.1720862441574105,
"grad_norm": 9.053291320800781,
"learning_rate": 2.749374403537177e-05,
"loss": 0.0145,
"step": 1970
},
{
"epoch": 3.188169070714178,
"grad_norm": 12.468178749084473,
"learning_rate": 2.7372675180603994e-05,
"loss": 0.0183,
"step": 1980
},
{
"epoch": 3.2042518972709453,
"grad_norm": 8.465781211853027,
"learning_rate": 2.7251292860360424e-05,
"loss": 0.0164,
"step": 1990
},
{
"epoch": 3.2203347238277127,
"grad_norm": 10.253599166870117,
"learning_rate": 2.712960223547696e-05,
"loss": 0.015,
"step": 2000
},
{
"epoch": 3.23641755038448,
"grad_norm": 9.734599113464355,
"learning_rate": 2.700760847989775e-05,
"loss": 0.0144,
"step": 2010
},
{
"epoch": 3.2525003769412475,
"grad_norm": 12.44884967803955,
"learning_rate": 2.6885316780455208e-05,
"loss": 0.0129,
"step": 2020
},
{
"epoch": 3.268583203498015,
"grad_norm": 10.425430297851562,
"learning_rate": 2.6762732336649492e-05,
"loss": 0.0185,
"step": 2030
},
{
"epoch": 3.2846660300547823,
"grad_norm": 10.850104331970215,
"learning_rate": 2.6639860360427426e-05,
"loss": 0.0143,
"step": 2040
},
{
"epoch": 3.3007488566115493,
"grad_norm": 9.267366409301758,
"learning_rate": 2.651670607596092e-05,
"loss": 0.0146,
"step": 2050
},
{
"epoch": 3.3168316831683167,
"grad_norm": 9.598543167114258,
"learning_rate": 2.6393274719424814e-05,
"loss": 0.0157,
"step": 2060
},
{
"epoch": 3.332914509725084,
"grad_norm": 9.140937805175781,
"learning_rate": 2.6269571538774294e-05,
"loss": 0.0172,
"step": 2070
},
{
"epoch": 3.3489973362818515,
"grad_norm": 10.654680252075195,
"learning_rate": 2.6145601793521734e-05,
"loss": 0.0162,
"step": 2080
},
{
"epoch": 3.365080162838619,
"grad_norm": 10.139638900756836,
"learning_rate": 2.6021370754513096e-05,
"loss": 0.0168,
"step": 2090
},
{
"epoch": 3.3811629893953863,
"grad_norm": 9.781733512878418,
"learning_rate": 2.589688370370382e-05,
"loss": 0.0165,
"step": 2100
},
{
"epoch": 3.3972458159521537,
"grad_norm": 10.93750286102295,
"learning_rate": 2.5772145933934235e-05,
"loss": 0.0145,
"step": 2110
},
{
"epoch": 3.413328642508921,
"grad_norm": 11.465789794921875,
"learning_rate": 2.5647162748704562e-05,
"loss": 0.0135,
"step": 2120
},
{
"epoch": 3.4294114690656885,
"grad_norm": 9.151410102844238,
"learning_rate": 2.5521939461949384e-05,
"loss": 0.0163,
"step": 2130
},
{
"epoch": 3.4454942956224555,
"grad_norm": 8.722734451293945,
"learning_rate": 2.5396481397811715e-05,
"loss": 0.0171,
"step": 2140
},
{
"epoch": 3.461577122179223,
"grad_norm": 9.99445629119873,
"learning_rate": 2.5270793890416677e-05,
"loss": 0.0146,
"step": 2150
},
{
"epoch": 3.4776599487359903,
"grad_norm": 11.865700721740723,
"learning_rate": 2.5144882283644644e-05,
"loss": 0.0172,
"step": 2160
},
{
"epoch": 3.4937427752927577,
"grad_norm": 14.123621940612793,
"learning_rate": 2.50187519309041e-05,
"loss": 0.0146,
"step": 2170
},
{
"epoch": 3.509825601849525,
"grad_norm": 10.353002548217773,
"learning_rate": 2.4892408194903963e-05,
"loss": 0.0155,
"step": 2180
},
{
"epoch": 3.5259084284062925,
"grad_norm": 10.808701515197754,
"learning_rate": 2.4765856447425614e-05,
"loss": 0.0133,
"step": 2190
},
{
"epoch": 3.54199125496306,
"grad_norm": 8.521575927734375,
"learning_rate": 2.4639102069094522e-05,
"loss": 0.0125,
"step": 2200
},
{
"epoch": 3.558074081519827,
"grad_norm": 7.443869113922119,
"learning_rate": 2.4512150449151433e-05,
"loss": 0.0143,
"step": 2210
},
{
"epoch": 3.5741569080765947,
"grad_norm": 10.696161270141602,
"learning_rate": 2.438500698522325e-05,
"loss": 0.0176,
"step": 2220
},
{
"epoch": 3.5902397346333617,
"grad_norm": 19.051715850830078,
"learning_rate": 2.4257677083093553e-05,
"loss": 0.0167,
"step": 2230
},
{
"epoch": 3.606322561190129,
"grad_norm": 13.287577629089355,
"learning_rate": 2.413016615647275e-05,
"loss": 0.0173,
"step": 2240
},
{
"epoch": 3.6224053877468965,
"grad_norm": 10.227944374084473,
"learning_rate": 2.4002479626767903e-05,
"loss": 0.0153,
"step": 2250
},
{
"epoch": 3.638488214303664,
"grad_norm": 10.98816204071045,
"learning_rate": 2.3874622922852225e-05,
"loss": 0.0136,
"step": 2260
},
{
"epoch": 3.6545710408604313,
"grad_norm": 9.841387748718262,
"learning_rate": 2.3746601480834258e-05,
"loss": 0.0164,
"step": 2270
},
{
"epoch": 3.6706538674171987,
"grad_norm": 9.20659351348877,
"learning_rate": 2.361842074382674e-05,
"loss": 0.0133,
"step": 2280
},
{
"epoch": 3.686736693973966,
"grad_norm": 8.02350902557373,
"learning_rate": 2.3490086161715197e-05,
"loss": 0.0113,
"step": 2290
},
{
"epoch": 3.702819520530733,
"grad_norm": 7.091673851013184,
"learning_rate": 2.336160319092621e-05,
"loss": 0.0127,
"step": 2300
},
{
"epoch": 3.718902347087501,
"grad_norm": 9.675036430358887,
"learning_rate": 2.3232977294195437e-05,
"loss": 0.0195,
"step": 2310
},
{
"epoch": 3.734985173644268,
"grad_norm": 8.969931602478027,
"learning_rate": 2.3104213940335338e-05,
"loss": 0.0118,
"step": 2320
},
{
"epoch": 3.7510680002010353,
"grad_norm": 11.032634735107422,
"learning_rate": 2.2975318604002667e-05,
"loss": 0.0148,
"step": 2330
},
{
"epoch": 3.7671508267578027,
"grad_norm": 8.483484268188477,
"learning_rate": 2.2846296765465708e-05,
"loss": 0.013,
"step": 2340
},
{
"epoch": 3.78323365331457,
"grad_norm": 12.28922176361084,
"learning_rate": 2.271715391037126e-05,
"loss": 0.0123,
"step": 2350
},
{
"epoch": 3.7993164798713375,
"grad_norm": 11.338895797729492,
"learning_rate": 2.2587895529511396e-05,
"loss": 0.0134,
"step": 2360
},
{
"epoch": 3.815399306428105,
"grad_norm": 12.580510139465332,
"learning_rate": 2.245852711859004e-05,
"loss": 0.0132,
"step": 2370
},
{
"epoch": 3.8314821329848723,
"grad_norm": 9.075506210327148,
"learning_rate": 2.232905417798929e-05,
"loss": 0.0148,
"step": 2380
},
{
"epoch": 3.8475649595416392,
"grad_norm": 8.931917190551758,
"learning_rate": 2.2199482212535522e-05,
"loss": 0.0128,
"step": 2390
},
{
"epoch": 3.8636477860984066,
"grad_norm": 12.407690048217773,
"learning_rate": 2.206981673126539e-05,
"loss": 0.0168,
"step": 2400
},
{
"epoch": 3.879730612655174,
"grad_norm": 10.473590850830078,
"learning_rate": 2.1940063247191582e-05,
"loss": 0.0128,
"step": 2410
},
{
"epoch": 4.002587467656654,
"grad_norm": 7.4311203956604,
"learning_rate": 2.181022727706842e-05,
"loss": 0.0122,
"step": 2420
},
{
"epoch": 4.020587242659467,
"grad_norm": 7.310738563537598,
"learning_rate": 2.168031434115729e-05,
"loss": 0.0067,
"step": 2430
},
{
"epoch": 4.038587017662279,
"grad_norm": 13.81872272491455,
"learning_rate": 2.1550329962991946e-05,
"loss": 0.008,
"step": 2440
},
{
"epoch": 4.056586792665092,
"grad_norm": 7.483142852783203,
"learning_rate": 2.142027966914368e-05,
"loss": 0.0072,
"step": 2450
},
{
"epoch": 4.074586567667904,
"grad_norm": 7.784074783325195,
"learning_rate": 2.1290168988986332e-05,
"loss": 0.0073,
"step": 2460
},
{
"epoch": 4.0925863426707165,
"grad_norm": 7.706643581390381,
"learning_rate": 2.116000345446118e-05,
"loss": 0.0074,
"step": 2470
},
{
"epoch": 4.110586117673529,
"grad_norm": 8.789175987243652,
"learning_rate": 2.1029788599841784e-05,
"loss": 0.0077,
"step": 2480
},
{
"epoch": 4.128585892676342,
"grad_norm": 6.578799724578857,
"learning_rate": 2.0899529961498633e-05,
"loss": 0.0074,
"step": 2490
},
{
"epoch": 4.146585667679154,
"grad_norm": 6.066508769989014,
"learning_rate": 2.076923307766379e-05,
"loss": 0.0071,
"step": 2500
},
{
"epoch": 4.164585442681966,
"grad_norm": 8.597545623779297,
"learning_rate": 2.0638903488195406e-05,
"loss": 0.0074,
"step": 2510
},
{
"epoch": 4.182585217684779,
"grad_norm": 9.465729713439941,
"learning_rate": 2.050854673434217e-05,
"loss": 0.0077,
"step": 2520
},
{
"epoch": 4.200584992687592,
"grad_norm": 9.023458480834961,
"learning_rate": 2.037816835850776e-05,
"loss": 0.0076,
"step": 2530
},
{
"epoch": 4.218584767690404,
"grad_norm": 7.603120803833008,
"learning_rate": 2.024777390401512e-05,
"loss": 0.0076,
"step": 2540
},
{
"epoch": 4.236584542693216,
"grad_norm": 7.11006498336792,
"learning_rate": 2.0117368914870838e-05,
"loss": 0.0079,
"step": 2550
},
{
"epoch": 4.254584317696029,
"grad_norm": 9.57967758178711,
"learning_rate": 1.9986958935529393e-05,
"loss": 0.0082,
"step": 2560
},
{
"epoch": 4.272584092698842,
"grad_norm": 10.809629440307617,
"learning_rate": 1.9856549510657447e-05,
"loss": 0.0086,
"step": 2570
},
{
"epoch": 4.2905838677016535,
"grad_norm": 9.166589736938477,
"learning_rate": 1.9726146184898066e-05,
"loss": 0.0075,
"step": 2580
},
{
"epoch": 4.308583642704466,
"grad_norm": 7.303510665893555,
"learning_rate": 1.959575450263503e-05,
"loss": 0.0076,
"step": 2590
},
{
"epoch": 4.326583417707279,
"grad_norm": 6.71479606628418,
"learning_rate": 1.9465380007757043e-05,
"loss": 0.0076,
"step": 2600
},
{
"epoch": 4.3445831927100915,
"grad_norm": 6.198269367218018,
"learning_rate": 1.933502824342205e-05,
"loss": 0.0071,
"step": 2610
},
{
"epoch": 4.362582967712903,
"grad_norm": 6.006600856781006,
"learning_rate": 1.9204704751821586e-05,
"loss": 0.0072,
"step": 2620
},
{
"epoch": 4.380582742715716,
"grad_norm": 6.971927165985107,
"learning_rate": 1.907441507394507e-05,
"loss": 0.0076,
"step": 2630
},
{
"epoch": 4.398582517718529,
"grad_norm": 8.39477825164795,
"learning_rate": 1.894416474934429e-05,
"loss": 0.0075,
"step": 2640
},
{
"epoch": 4.416582292721341,
"grad_norm": 7.6670355796813965,
"learning_rate": 1.8813959315897815e-05,
"loss": 0.0083,
"step": 2650
},
{
"epoch": 4.434582067724153,
"grad_norm": 7.985522747039795,
"learning_rate": 1.8683804309575587e-05,
"loss": 0.0075,
"step": 2660
},
{
"epoch": 4.452581842726966,
"grad_norm": 7.552544593811035,
"learning_rate": 1.855370526420352e-05,
"loss": 0.0073,
"step": 2670
},
{
"epoch": 4.470581617729779,
"grad_norm": 7.256811618804932,
"learning_rate": 1.842366771122823e-05,
"loss": 0.0066,
"step": 2680
},
{
"epoch": 4.488581392732591,
"grad_norm": 7.66050386428833,
"learning_rate": 1.829369717948185e-05,
"loss": 0.0078,
"step": 2690
},
{
"epoch": 4.506581167735403,
"grad_norm": 6.683782577514648,
"learning_rate": 1.8163799194946938e-05,
"loss": 0.0079,
"step": 2700
},
{
"epoch": 4.524580942738216,
"grad_norm": 6.800795078277588,
"learning_rate": 1.8033979280521584e-05,
"loss": 0.0069,
"step": 2710
},
{
"epoch": 4.5425807177410285,
"grad_norm": 8.025465965270996,
"learning_rate": 1.790424295578453e-05,
"loss": 0.0069,
"step": 2720
},
{
"epoch": 4.56058049274384,
"grad_norm": 10.645038604736328,
"learning_rate": 1.777459573676051e-05,
"loss": 0.0076,
"step": 2730
},
{
"epoch": 4.578580267746653,
"grad_norm": 9.160017013549805,
"learning_rate": 1.764504313568577e-05,
"loss": 0.0068,
"step": 2740
},
{
"epoch": 4.596580042749466,
"grad_norm": 8.349514961242676,
"learning_rate": 1.7515590660773633e-05,
"loss": 0.0076,
"step": 2750
},
{
"epoch": 4.614579817752278,
"grad_norm": 5.219119071960449,
"learning_rate": 1.7386243815980354e-05,
"loss": 0.0073,
"step": 2760
},
{
"epoch": 4.632579592755091,
"grad_norm": 7.130075931549072,
"learning_rate": 1.7257008100771072e-05,
"loss": 0.007,
"step": 2770
},
{
"epoch": 4.650579367757903,
"grad_norm": 6.3263115882873535,
"learning_rate": 1.7127889009886036e-05,
"loss": 0.0067,
"step": 2780
},
{
"epoch": 4.6685791427607155,
"grad_norm": 6.6792778968811035,
"learning_rate": 1.699889203310695e-05,
"loss": 0.0075,
"step": 2790
},
{
"epoch": 4.686578917763528,
"grad_norm": 6.040131092071533,
"learning_rate": 1.6870022655023544e-05,
"loss": 0.0072,
"step": 2800
},
{
"epoch": 4.70457869276634,
"grad_norm": 6.8368306159973145,
"learning_rate": 1.674128635480044e-05,
"loss": 0.0071,
"step": 2810
},
{
"epoch": 4.722578467769153,
"grad_norm": 8.718803405761719,
"learning_rate": 1.6612688605944133e-05,
"loss": 0.0074,
"step": 2820
},
{
"epoch": 4.740578242771965,
"grad_norm": 8.861642837524414,
"learning_rate": 1.6484234876070335e-05,
"loss": 0.0063,
"step": 2830
},
{
"epoch": 4.758578017774778,
"grad_norm": 6.4469475746154785,
"learning_rate": 1.6355930626671447e-05,
"loss": 0.007,
"step": 2840
},
{
"epoch": 4.776577792777591,
"grad_norm": 9.30246639251709,
"learning_rate": 1.6227781312884388e-05,
"loss": 0.0073,
"step": 2850
},
{
"epoch": 4.794577567780403,
"grad_norm": 8.216259002685547,
"learning_rate": 1.6099792383258664e-05,
"loss": 0.0071,
"step": 2860
},
{
"epoch": 4.812577342783215,
"grad_norm": 10.296393394470215,
"learning_rate": 1.5971969279524668e-05,
"loss": 0.0075,
"step": 2870
},
{
"epoch": 5.01673208014873,
"grad_norm": 6.276814937591553,
"learning_rate": 1.584431743636237e-05,
"loss": 0.0059,
"step": 2880
},
{
"epoch": 5.038766095159402,
"grad_norm": 6.1525068283081055,
"learning_rate": 1.5716842281170205e-05,
"loss": 0.0059,
"step": 2890
},
{
"epoch": 5.060800110170075,
"grad_norm": 5.246829032897949,
"learning_rate": 1.558954923383432e-05,
"loss": 0.0057,
"step": 2900
},
{
"epoch": 5.082834125180748,
"grad_norm": 8.341444969177246,
"learning_rate": 1.5462443706498178e-05,
"loss": 0.0061,
"step": 2910
},
{
"epoch": 5.104868140191421,
"grad_norm": 8.144927024841309,
"learning_rate": 1.533553110333239e-05,
"loss": 0.0058,
"step": 2920
},
{
"epoch": 5.126902155202093,
"grad_norm": 4.849141597747803,
"learning_rate": 1.5208816820304973e-05,
"loss": 0.0055,
"step": 2930
},
{
"epoch": 5.148936170212766,
"grad_norm": 5.376830577850342,
"learning_rate": 1.5082306244951956e-05,
"loss": 0.0052,
"step": 2940
},
{
"epoch": 5.170970185223439,
"grad_norm": 5.531591892242432,
"learning_rate": 1.495600475614825e-05,
"loss": 0.0059,
"step": 2950
},
{
"epoch": 5.193004200234111,
"grad_norm": 4.80387020111084,
"learning_rate": 1.4829917723879029e-05,
"loss": 0.0056,
"step": 2960
},
{
"epoch": 5.2150382152447845,
"grad_norm": 6.294495582580566,
"learning_rate": 1.4704050509011345e-05,
"loss": 0.0056,
"step": 2970
},
{
"epoch": 5.237072230255457,
"grad_norm": 5.95064640045166,
"learning_rate": 1.4578408463066246e-05,
"loss": 0.0058,
"step": 2980
},
{
"epoch": 5.259106245266129,
"grad_norm": 6.749286651611328,
"learning_rate": 1.4452996927991236e-05,
"loss": 0.006,
"step": 2990
},
{
"epoch": 5.2811402602768025,
"grad_norm": 6.431356906890869,
"learning_rate": 1.4327821235933126e-05,
"loss": 0.0062,
"step": 3000
},
{
"epoch": 5.303174275287475,
"grad_norm": 5.065823554992676,
"learning_rate": 1.4202886709011357e-05,
"loss": 0.005,
"step": 3010
},
{
"epoch": 5.325208290298148,
"grad_norm": 4.399689674377441,
"learning_rate": 1.4078198659091686e-05,
"loss": 0.006,
"step": 3020
},
{
"epoch": 5.3472423053088205,
"grad_norm": 4.889394283294678,
"learning_rate": 1.3953762387560392e-05,
"loss": 0.0054,
"step": 3030
},
{
"epoch": 5.369276320319493,
"grad_norm": 6.588573455810547,
"learning_rate": 1.3829583185098802e-05,
"loss": 0.0056,
"step": 3040
},
{
"epoch": 5.391310335330166,
"grad_norm": 4.880826473236084,
"learning_rate": 1.3705666331458424e-05,
"loss": 0.0052,
"step": 3050
},
{
"epoch": 5.4133443503408385,
"grad_norm": 5.972387313842773,
"learning_rate": 1.3582017095236413e-05,
"loss": 0.0052,
"step": 3060
},
{
"epoch": 5.435378365351512,
"grad_norm": 5.3322224617004395,
"learning_rate": 1.345864073365157e-05,
"loss": 0.0054,
"step": 3070
},
{
"epoch": 5.457412380362184,
"grad_norm": 4.680153846740723,
"learning_rate": 1.3335542492320856e-05,
"loss": 0.0059,
"step": 3080
},
{
"epoch": 5.4794463953728565,
"grad_norm": 5.0644636154174805,
"learning_rate": 1.3212727605036319e-05,
"loss": 0.0055,
"step": 3090
},
{
"epoch": 5.50148041038353,
"grad_norm": 6.729560375213623,
"learning_rate": 1.3090201293542597e-05,
"loss": 0.0061,
"step": 3100
},
{
"epoch": 5.523514425394202,
"grad_norm": 6.099545001983643,
"learning_rate": 1.2967968767314898e-05,
"loss": 0.0063,
"step": 3110
},
{
"epoch": 5.545548440404875,
"grad_norm": 4.657865524291992,
"learning_rate": 1.284603522333749e-05,
"loss": 0.0052,
"step": 3120
},
{
"epoch": 5.567582455415548,
"grad_norm": 5.916351795196533,
"learning_rate": 1.2724405845882775e-05,
"loss": 0.0056,
"step": 3130
},
{
"epoch": 5.58961647042622,
"grad_norm": 6.424000263214111,
"learning_rate": 1.2603085806290824e-05,
"loss": 0.0065,
"step": 3140
},
{
"epoch": 5.611650485436893,
"grad_norm": 7.819843769073486,
"learning_rate": 1.2482080262749538e-05,
"loss": 0.0057,
"step": 3150
},
{
"epoch": 5.633684500447566,
"grad_norm": 6.704712867736816,
"learning_rate": 1.2361394360075348e-05,
"loss": 0.0052,
"step": 3160
},
{
"epoch": 5.655718515458239,
"grad_norm": 5.237440586090088,
"learning_rate": 1.224103322949442e-05,
"loss": 0.0052,
"step": 3170
},
{
"epoch": 5.677752530468911,
"grad_norm": 6.460971355438232,
"learning_rate": 1.2121001988424541e-05,
"loss": 0.0057,
"step": 3180
},
{
"epoch": 5.699786545479584,
"grad_norm": 5.491466522216797,
"learning_rate": 1.2001305740257505e-05,
"loss": 0.0051,
"step": 3190
},
{
"epoch": 5.721820560490257,
"grad_norm": 5.925656318664551,
"learning_rate": 1.188194957414217e-05,
"loss": 0.0054,
"step": 3200
},
{
"epoch": 5.743854575500929,
"grad_norm": 6.257553577423096,
"learning_rate": 1.176293856476804e-05,
"loss": 0.0053,
"step": 3210
},
{
"epoch": 5.765888590511603,
"grad_norm": 4.400306224822998,
"learning_rate": 1.1644277772149531e-05,
"loss": 0.0051,
"step": 3220
},
{
"epoch": 5.787922605522275,
"grad_norm": 6.251142978668213,
"learning_rate": 1.1525972241410827e-05,
"loss": 0.0052,
"step": 3230
},
{
"epoch": 5.809956620532947,
"grad_norm": 4.689172267913818,
"learning_rate": 1.1408027002571359e-05,
"loss": 0.0057,
"step": 3240
},
{
"epoch": 5.831990635543621,
"grad_norm": 6.150318145751953,
"learning_rate": 1.1290447070331958e-05,
"loss": 0.0053,
"step": 3250
},
{
"epoch": 5.854024650554293,
"grad_norm": 5.218411445617676,
"learning_rate": 1.1173237443861678e-05,
"loss": 0.0057,
"step": 3260
},
{
"epoch": 5.876058665564966,
"grad_norm": 4.724580764770508,
"learning_rate": 1.1056403106585156e-05,
"loss": 0.005,
"step": 3270
},
{
"epoch": 5.898092680575639,
"grad_norm": 3.99684476852417,
"learning_rate": 1.093994902597082e-05,
"loss": 0.0053,
"step": 3280
},
{
"epoch": 5.920126695586311,
"grad_norm": 5.558387756347656,
"learning_rate": 1.0823880153319642e-05,
"loss": 0.0051,
"step": 3290
},
{
"epoch": 5.942160710596984,
"grad_norm": 5.6572957038879395,
"learning_rate": 1.0708201423554634e-05,
"loss": 0.0055,
"step": 3300
},
{
"epoch": 5.964194725607657,
"grad_norm": 3.7635843753814697,
"learning_rate": 1.059291775501102e-05,
"loss": 0.0056,
"step": 3310
},
{
"epoch": 5.98622874061833,
"grad_norm": 5.005063056945801,
"learning_rate": 1.0478034049227137e-05,
"loss": 0.0054,
"step": 3320
},
{
"epoch": 6.008149523300085,
"grad_norm": 12.51547908782959,
"learning_rate": 1.036355519073602e-05,
"loss": 0.0065,
"step": 3330
},
{
"epoch": 6.028287341493345,
"grad_norm": 3.451265335083008,
"learning_rate": 1.0249486046857735e-05,
"loss": 0.0051,
"step": 3340
},
{
"epoch": 6.048425159686605,
"grad_norm": 7.321857929229736,
"learning_rate": 1.0135831467492432e-05,
"loss": 0.0059,
"step": 3350
},
{
"epoch": 6.068562977879865,
"grad_norm": 4.42759895324707,
"learning_rate": 1.0022596284914138e-05,
"loss": 0.006,
"step": 3360
},
{
"epoch": 6.088700796073126,
"grad_norm": 3.1307969093322754,
"learning_rate": 9.90978531356531e-06,
"loss": 0.005,
"step": 3370
},
{
"epoch": 6.108838614266386,
"grad_norm": 5.168570518493652,
"learning_rate": 9.797403349852126e-06,
"loss": 0.0044,
"step": 3380
},
{
"epoch": 6.128976432459646,
"grad_norm": 6.52720832824707,
"learning_rate": 9.685455171940567e-06,
"loss": 0.005,
"step": 3390
},
{
"epoch": 6.1491142506529055,
"grad_norm": 4.172718048095703,
"learning_rate": 9.573945539553258e-06,
"loss": 0.0044,
"step": 3400
},
{
"epoch": 6.169252068846166,
"grad_norm": 8.326397895812988,
"learning_rate": 9.462879193767092e-06,
"loss": 0.0053,
"step": 3410
},
{
"epoch": 6.189389887039426,
"grad_norm": 4.124663352966309,
"learning_rate": 9.352260856811667e-06,
"loss": 0.0058,
"step": 3420
},
{
"epoch": 6.209527705232686,
"grad_norm": 4.169667720794678,
"learning_rate": 9.2420952318685e-06,
"loss": 0.0049,
"step": 3430
},
{
"epoch": 6.229665523425946,
"grad_norm": 5.089596271514893,
"learning_rate": 9.132387002871057e-06,
"loss": 0.0044,
"step": 3440
},
{
"epoch": 6.249803341619207,
"grad_norm": 6.561634540557861,
"learning_rate": 9.023140834305621e-06,
"loss": 0.0051,
"step": 3450
},
{
"epoch": 6.269941159812467,
"grad_norm": 3.9571590423583984,
"learning_rate": 8.914361371012939e-06,
"loss": 0.0045,
"step": 3460
},
{
"epoch": 6.2900789780057265,
"grad_norm": 5.203815460205078,
"learning_rate": 8.806053237990788e-06,
"loss": 0.0065,
"step": 3470
},
{
"epoch": 6.310216796198986,
"grad_norm": 4.430067539215088,
"learning_rate": 8.698221040197288e-06,
"loss": 0.0047,
"step": 3480
},
{
"epoch": 6.330354614392247,
"grad_norm": 6.157893180847168,
"learning_rate": 8.590869362355128e-06,
"loss": 0.0063,
"step": 3490
},
{
"epoch": 6.350492432585507,
"grad_norm": 4.458155155181885,
"learning_rate": 8.484002768756643e-06,
"loss": 0.0048,
"step": 3500
},
{
"epoch": 6.370630250778767,
"grad_norm": 3.2868919372558594,
"learning_rate": 8.37762580306972e-06,
"loss": 0.0042,
"step": 3510
},
{
"epoch": 6.390768068972028,
"grad_norm": 4.93739652633667,
"learning_rate": 8.271742988144688e-06,
"loss": 0.0051,
"step": 3520
},
{
"epoch": 6.4109058871652875,
"grad_norm": 3.718449115753174,
"learning_rate": 8.166358825821923e-06,
"loss": 0.0048,
"step": 3530
},
{
"epoch": 6.431043705358547,
"grad_norm": 3.594763994216919,
"learning_rate": 8.061477796740511e-06,
"loss": 0.0054,
"step": 3540
},
{
"epoch": 6.451181523551807,
"grad_norm": 6.66500997543335,
"learning_rate": 7.957104360147746e-06,
"loss": 0.0046,
"step": 3550
},
{
"epoch": 6.471319341745068,
"grad_norm": 3.4088094234466553,
"learning_rate": 7.853242953709467e-06,
"loss": 0.006,
"step": 3560
},
{
"epoch": 6.491457159938328,
"grad_norm": 3.0382471084594727,
"learning_rate": 7.74989799332146e-06,
"loss": 0.0051,
"step": 3570
},
{
"epoch": 6.511594978131588,
"grad_norm": 3.609813928604126,
"learning_rate": 7.64707387292166e-06,
"loss": 0.005,
"step": 3580
},
{
"epoch": 6.531732796324848,
"grad_norm": 4.544133186340332,
"learning_rate": 7.544774964303341e-06,
"loss": 0.005,
"step": 3590
},
{
"epoch": 6.5518706145181085,
"grad_norm": 3.8849527835845947,
"learning_rate": 7.443005616929277e-06,
"loss": 0.0045,
"step": 3600
},
{
"epoch": 6.572008432711368,
"grad_norm": 4.574479579925537,
"learning_rate": 7.341770157746737e-06,
"loss": 0.0047,
"step": 3610
},
{
"epoch": 6.592146250904628,
"grad_norm": 3.9820139408111572,
"learning_rate": 7.241072891003589e-06,
"loss": 0.005,
"step": 3620
},
{
"epoch": 6.612284069097889,
"grad_norm": 3.3841769695281982,
"learning_rate": 7.1409180980652596e-06,
"loss": 0.0039,
"step": 3630
},
{
"epoch": 6.632421887291149,
"grad_norm": 3.9114112854003906,
"learning_rate": 7.041310037232712e-06,
"loss": 0.0047,
"step": 3640
},
{
"epoch": 6.652559705484409,
"grad_norm": 10.5076904296875,
"learning_rate": 6.942252943561396e-06,
"loss": 0.0051,
"step": 3650
},
{
"epoch": 6.672697523677669,
"grad_norm": 3.7937240600585938,
"learning_rate": 6.843751028681178e-06,
"loss": 0.0041,
"step": 3660
},
{
"epoch": 6.692835341870929,
"grad_norm": 5.625157356262207,
"learning_rate": 6.74580848061728e-06,
"loss": 0.0044,
"step": 3670
},
{
"epoch": 6.712973160064189,
"grad_norm": 3.2087152004241943,
"learning_rate": 6.648429463612218e-06,
"loss": 0.0066,
"step": 3680
},
{
"epoch": 6.733110978257449,
"grad_norm": 3.721176862716675,
"learning_rate": 6.551618117948746e-06,
"loss": 0.0044,
"step": 3690
},
{
"epoch": 6.753248796450709,
"grad_norm": 3.429137945175171,
"learning_rate": 6.4553785597738195e-06,
"loss": 0.0048,
"step": 3700
},
{
"epoch": 6.77338661464397,
"grad_norm": 3.475482225418091,
"learning_rate": 6.359714880923602e-06,
"loss": 0.006,
"step": 3710
},
{
"epoch": 6.79352443283723,
"grad_norm": 4.675537586212158,
"learning_rate": 6.2646311487494785e-06,
"loss": 0.0044,
"step": 3720
},
{
"epoch": 6.81366225103049,
"grad_norm": 6.543276786804199,
"learning_rate": 6.170131405945125e-06,
"loss": 0.0049,
"step": 3730
},
{
"epoch": 6.83380006922375,
"grad_norm": 3.8348066806793213,
"learning_rate": 6.0762196703746324e-06,
"loss": 0.0049,
"step": 3740
},
{
"epoch": 6.85393788741701,
"grad_norm": 3.8558757305145264,
"learning_rate": 5.982899934901667e-06,
"loss": 0.0042,
"step": 3750
},
{
"epoch": 6.87407570561027,
"grad_norm": 3.2190654277801514,
"learning_rate": 5.8901761672197165e-06,
"loss": 0.0039,
"step": 3760
},
{
"epoch": 6.89421352380353,
"grad_norm": 3.8839619159698486,
"learning_rate": 5.798052309683384e-06,
"loss": 0.005,
"step": 3770
},
{
"epoch": 6.914351341996791,
"grad_norm": 3.031508684158325,
"learning_rate": 5.706532279140782e-06,
"loss": 0.0048,
"step": 3780
},
{
"epoch": 6.934489160190051,
"grad_norm": 4.985992431640625,
"learning_rate": 5.61561996676699e-06,
"loss": 0.0059,
"step": 3790
},
{
"epoch": 6.954626978383311,
"grad_norm": 4.688712120056152,
"learning_rate": 5.5253192378985966e-06,
"loss": 0.0043,
"step": 3800
},
{
"epoch": 6.9747647965765704,
"grad_norm": 5.569628715515137,
"learning_rate": 5.43563393186941e-06,
"loss": 0.0043,
"step": 3810
},
{
"epoch": 6.994902614769831,
"grad_norm": 2.9404146671295166,
"learning_rate": 5.346567861847168e-06,
"loss": 0.0045,
"step": 3820
},
{
"epoch": 7.01372683596431,
"grad_norm": 2.525343656539917,
"learning_rate": 5.258124814671403e-06,
"loss": 0.007,
"step": 3830
},
{
"epoch": 7.0320292839167235,
"grad_norm": 6.035243988037109,
"learning_rate": 5.1703085506925225e-06,
"loss": 0.0087,
"step": 3840
},
{
"epoch": 7.050331731869138,
"grad_norm": 3.9128825664520264,
"learning_rate": 5.083122803611802e-06,
"loss": 0.0065,
"step": 3850
},
{
"epoch": 7.068634179821551,
"grad_norm": 5.710306644439697,
"learning_rate": 4.996571280322762e-06,
"loss": 0.0116,
"step": 3860
},
{
"epoch": 7.086936627773965,
"grad_norm": 9.083086967468262,
"learning_rate": 4.910657660753482e-06,
"loss": 0.0094,
"step": 3870
},
{
"epoch": 7.105239075726378,
"grad_norm": 4.0207672119140625,
"learning_rate": 4.825385597710148e-06,
"loss": 0.0085,
"step": 3880
},
{
"epoch": 7.123541523678792,
"grad_norm": 3.8497507572174072,
"learning_rate": 4.740758716721803e-06,
"loss": 0.0083,
"step": 3890
},
{
"epoch": 7.141843971631205,
"grad_norm": 5.6043009757995605,
"learning_rate": 4.6567806158861164e-06,
"loss": 0.0054,
"step": 3900
},
{
"epoch": 7.1601464195836195,
"grad_norm": 3.8586933612823486,
"learning_rate": 4.573454865716465e-06,
"loss": 0.0068,
"step": 3910
},
{
"epoch": 7.178448867536033,
"grad_norm": 4.219987392425537,
"learning_rate": 4.490785008990113e-06,
"loss": 0.0084,
"step": 3920
},
{
"epoch": 7.196751315488447,
"grad_norm": 4.6731109619140625,
"learning_rate": 4.408774560597544e-06,
"loss": 0.0068,
"step": 3930
},
{
"epoch": 7.21505376344086,
"grad_norm": 2.894176483154297,
"learning_rate": 4.32742700739309e-06,
"loss": 0.007,
"step": 3940
},
{
"epoch": 7.233356211393274,
"grad_norm": 3.3003957271575928,
"learning_rate": 4.246745808046599e-06,
"loss": 0.0078,
"step": 3950
},
{
"epoch": 7.251658659345687,
"grad_norm": 3.965242862701416,
"learning_rate": 4.166734392896438e-06,
"loss": 0.0054,
"step": 3960
},
{
"epoch": 7.269961107298101,
"grad_norm": 3.6206185817718506,
"learning_rate": 4.087396163803645e-06,
"loss": 0.0066,
"step": 3970
},
{
"epoch": 7.288263555250515,
"grad_norm": 3.3707611560821533,
"learning_rate": 4.008734494007241e-06,
"loss": 0.0084,
"step": 3980
},
{
"epoch": 7.306566003202929,
"grad_norm": 3.408390522003174,
"learning_rate": 3.9307527279808665e-06,
"loss": 0.0045,
"step": 3990
},
{
"epoch": 7.324868451155342,
"grad_norm": 3.1554362773895264,
"learning_rate": 3.85345418129055e-06,
"loss": 0.0084,
"step": 4000
},
{
"epoch": 7.343170899107756,
"grad_norm": 3.7730562686920166,
"learning_rate": 3.776842140453756e-06,
"loss": 0.0056,
"step": 4010
},
{
"epoch": 7.361473347060169,
"grad_norm": 2.509883165359497,
"learning_rate": 3.700919862799639e-06,
"loss": 0.0077,
"step": 4020
},
{
"epoch": 7.379775795012583,
"grad_norm": 4.287370681762695,
"learning_rate": 3.6256905763305605e-06,
"loss": 0.0067,
"step": 4030
},
{
"epoch": 7.3980782429649965,
"grad_norm": 6.043769359588623,
"learning_rate": 3.5511574795848415e-06,
"loss": 0.0051,
"step": 4040
},
{
"epoch": 7.416380690917411,
"grad_norm": 11.113882064819336,
"learning_rate": 3.4773237415007644e-06,
"loss": 0.0077,
"step": 4050
},
{
"epoch": 7.434683138869824,
"grad_norm": 4.403820037841797,
"learning_rate": 3.4041925012818423e-06,
"loss": 0.0061,
"step": 4060
},
{
"epoch": 7.452985586822238,
"grad_norm": 3.961599826812744,
"learning_rate": 3.3317668682633532e-06,
"loss": 0.0081,
"step": 4070
},
{
"epoch": 7.471288034774651,
"grad_norm": 2.8031773567199707,
"learning_rate": 3.2600499217801307e-06,
"loss": 0.0083,
"step": 4080
},
{
"epoch": 7.489590482727065,
"grad_norm": 2.444967269897461,
"learning_rate": 3.189044711035645e-06,
"loss": 0.0082,
"step": 4090
},
{
"epoch": 7.507892930679478,
"grad_norm": 2.957968235015869,
"learning_rate": 3.1187542549723625e-06,
"loss": 0.0083,
"step": 4100
},
{
"epoch": 7.526195378631892,
"grad_norm": 4.196249961853027,
"learning_rate": 3.0491815421433825e-06,
"loss": 0.0053,
"step": 4110
},
{
"epoch": 7.544497826584306,
"grad_norm": 4.068223476409912,
"learning_rate": 2.980329530585362e-06,
"loss": 0.0048,
"step": 4120
},
{
"epoch": 7.56280027453672,
"grad_norm": 10.506719589233398,
"learning_rate": 2.912201147692786e-06,
"loss": 0.0053,
"step": 4130
},
{
"epoch": 7.581102722489133,
"grad_norm": 3.4478495121002197,
"learning_rate": 2.8447992900934583e-06,
"loss": 0.0064,
"step": 4140
},
{
"epoch": 7.599405170441546,
"grad_norm": 3.022067070007324,
"learning_rate": 2.778126823525373e-06,
"loss": 0.0045,
"step": 4150
},
{
"epoch": 7.61770761839396,
"grad_norm": 2.931964874267578,
"learning_rate": 2.712186582714862e-06,
"loss": 0.0074,
"step": 4160
},
{
"epoch": 7.6360100663463735,
"grad_norm": 5.001142978668213,
"learning_rate": 2.6469813712560544e-06,
"loss": 0.005,
"step": 4170
},
{
"epoch": 7.654312514298788,
"grad_norm": 4.039334297180176,
"learning_rate": 2.5825139614917238e-06,
"loss": 0.0054,
"step": 4180
},
{
"epoch": 7.672614962251201,
"grad_norm": 2.894651412963867,
"learning_rate": 2.518787094395363e-06,
"loss": 0.0051,
"step": 4190
},
{
"epoch": 7.690917410203615,
"grad_norm": 3.445218801498413,
"learning_rate": 2.455803479454664e-06,
"loss": 0.0077,
"step": 4200
},
{
"epoch": 7.709219858156028,
"grad_norm": 5.986388683319092,
"learning_rate": 2.3935657945563427e-06,
"loss": 0.0051,
"step": 4210
},
{
"epoch": 7.727522306108442,
"grad_norm": 4.5863237380981445,
"learning_rate": 2.332076685872231e-06,
"loss": 0.0062,
"step": 4220
},
{
"epoch": 7.745824754060855,
"grad_norm": 3.8240745067596436,
"learning_rate": 2.2713387677468267e-06,
"loss": 0.0066,
"step": 4230
},
{
"epoch": 7.7641272020132694,
"grad_norm": 3.147395372390747,
"learning_rate": 2.2113546225861037e-06,
"loss": 0.0067,
"step": 4240
},
{
"epoch": 7.782429649965683,
"grad_norm": 4.106767177581787,
"learning_rate": 2.1521268007477047e-06,
"loss": 0.008,
"step": 4250
},
{
"epoch": 7.800732097918097,
"grad_norm": 3.5174560546875,
"learning_rate": 2.0936578204325575e-06,
"loss": 0.008,
"step": 4260
},
{
"epoch": 7.81903454587051,
"grad_norm": 3.5646681785583496,
"learning_rate": 2.035950167577747e-06,
"loss": 0.0062,
"step": 4270
},
{
"epoch": 7.837336993822924,
"grad_norm": 3.414524555206299,
"learning_rate": 1.9790062957508626e-06,
"loss": 0.0074,
"step": 4280
},
{
"epoch": 7.855639441775337,
"grad_norm": 4.365599632263184,
"learning_rate": 1.9228286260456673e-06,
"loss": 0.0102,
"step": 4290
},
{
"epoch": 7.873941889727751,
"grad_norm": 6.737311840057373,
"learning_rate": 1.8674195469791524e-06,
"loss": 0.006,
"step": 4300
},
{
"epoch": 7.8922443376801645,
"grad_norm": 8.64922046661377,
"learning_rate": 1.8127814143900012e-06,
"loss": 0.0061,
"step": 4310
},
{
"epoch": 7.910546785632579,
"grad_norm": 3.7279410362243652,
"learning_rate": 1.7589165513383988e-06,
"loss": 0.0062,
"step": 4320
},
{
"epoch": 7.928849233584992,
"grad_norm": 5.265659332275391,
"learning_rate": 1.7058272480072879e-06,
"loss": 0.0063,
"step": 4330
},
{
"epoch": 7.947151681537406,
"grad_norm": 3.1222264766693115,
"learning_rate": 1.6535157616049867e-06,
"loss": 0.0058,
"step": 4340
},
{
"epoch": 7.965454129489819,
"grad_norm": 3.9116389751434326,
"learning_rate": 1.601984316269214e-06,
"loss": 0.0066,
"step": 4350
},
{
"epoch": 7.983756577442233,
"grad_norm": 8.46506118774414,
"learning_rate": 1.5512351029725325e-06,
"loss": 0.0052,
"step": 4360
},
{
"epoch": 7.036990501080565,
"grad_norm": 3.518129825592041,
"learning_rate": 1.5012702794291901e-06,
"loss": 0.0049,
"step": 4370
},
{
"epoch": 7.053073327637332,
"grad_norm": 5.544999122619629,
"learning_rate": 1.4520919700033864e-06,
"loss": 0.0054,
"step": 4380
},
{
"epoch": 7.0691561541941,
"grad_norm": 2.621429681777954,
"learning_rate": 1.4037022656189425e-06,
"loss": 0.0071,
"step": 4390
},
{
"epoch": 7.085238980750867,
"grad_norm": 6.043819427490234,
"learning_rate": 1.356103223670402e-06,
"loss": 0.0073,
"step": 4400
},
{
"epoch": 7.1013218073076345,
"grad_norm": 3.584305763244629,
"learning_rate": 1.3092968679355634e-06,
"loss": 0.0045,
"step": 4410
},
{
"epoch": 7.1174046338644015,
"grad_norm": 6.561378479003906,
"learning_rate": 1.2632851884894293e-06,
"loss": 0.0091,
"step": 4420
},
{
"epoch": 7.133487460421169,
"grad_norm": 15.719043731689453,
"learning_rate": 1.2180701416195894e-06,
"loss": 0.0155,
"step": 4430
},
{
"epoch": 7.149570286977936,
"grad_norm": 7.752620697021484,
"learning_rate": 1.1736536497430584e-06,
"loss": 0.0098,
"step": 4440
},
{
"epoch": 7.165653113534704,
"grad_norm": 7.116891384124756,
"learning_rate": 1.1300376013245272e-06,
"loss": 0.0107,
"step": 4450
},
{
"epoch": 7.181735940091471,
"grad_norm": 4.907498836517334,
"learning_rate": 1.0872238507960753e-06,
"loss": 0.0087,
"step": 4460
},
{
"epoch": 7.197818766648238,
"grad_norm": 3.2054624557495117,
"learning_rate": 1.0452142184783232e-06,
"loss": 0.0091,
"step": 4470
},
{
"epoch": 7.213901593205006,
"grad_norm": 4.177403926849365,
"learning_rate": 1.0040104905030467e-06,
"loss": 0.0064,
"step": 4480
},
{
"epoch": 7.229984419761773,
"grad_norm": 5.655606269836426,
"learning_rate": 9.63614418737222e-07,
"loss": 0.0107,
"step": 4490
},
{
"epoch": 7.246067246318541,
"grad_norm": 2.246121883392334,
"learning_rate": 9.240277207085557e-07,
"loss": 0.008,
"step": 4500
},
{
"epoch": 7.262150072875308,
"grad_norm": 5.548453330993652,
"learning_rate": 8.852520795324349e-07,
"loss": 0.0074,
"step": 4510
},
{
"epoch": 7.2782328994320755,
"grad_norm": 12.830928802490234,
"learning_rate": 8.472891438404108e-07,
"loss": 0.0123,
"step": 4520
},
{
"epoch": 7.2943157259888425,
"grad_norm": 3.613041400909424,
"learning_rate": 8.101405277100549e-07,
"loss": 0.0118,
"step": 4530
},
{
"epoch": 7.31039855254561,
"grad_norm": 3.5919158458709717,
"learning_rate": 7.738078105963565e-07,
"loss": 0.0058,
"step": 4540
},
{
"epoch": 7.326481379102377,
"grad_norm": 6.3182196617126465,
"learning_rate": 7.3829253726458e-07,
"loss": 0.0106,
"step": 4550
},
{
"epoch": 7.342564205659144,
"grad_norm": 4.684800624847412,
"learning_rate": 7.035962177245536e-07,
"loss": 0.0065,
"step": 4560
},
{
"epoch": 7.358647032215912,
"grad_norm": 2.9966583251953125,
"learning_rate": 6.697203271665054e-07,
"loss": 0.0081,
"step": 4570
},
{
"epoch": 7.374729858772679,
"grad_norm": 3.141700506210327,
"learning_rate": 6.366663058983102e-07,
"loss": 0.009,
"step": 4580
},
{
"epoch": 7.390812685329447,
"grad_norm": 4.013637542724609,
"learning_rate": 6.044355592842644e-07,
"loss": 0.0087,
"step": 4590
},
{
"epoch": 7.406895511886214,
"grad_norm": 3.2047641277313232,
"learning_rate": 5.730294576853501e-07,
"loss": 0.007,
"step": 4600
},
{
"epoch": 7.422978338442982,
"grad_norm": 6.322583198547363,
"learning_rate": 5.424493364009364e-07,
"loss": 0.0066,
"step": 4610
},
{
"epoch": 7.439061164999749,
"grad_norm": 7.329522132873535,
"learning_rate": 5.126964956120351e-07,
"loss": 0.0095,
"step": 4620
},
{
"epoch": 7.4551439915565165,
"grad_norm": 3.080005168914795,
"learning_rate": 4.837722003260136e-07,
"loss": 0.0091,
"step": 4630
},
{
"epoch": 7.4712268181132835,
"grad_norm": 4.727446556091309,
"learning_rate": 4.5567768032280136e-07,
"loss": 0.0077,
"step": 4640
},
{
"epoch": 7.4873096446700504,
"grad_norm": 2.2113332748413086,
"learning_rate": 4.2841413010261456e-07,
"loss": 0.0066,
"step": 4650
},
{
"epoch": 7.503392471226818,
"grad_norm": 11.121126174926758,
"learning_rate": 4.01982708835158e-07,
"loss": 0.0066,
"step": 4660
},
{
"epoch": 7.519475297783585,
"grad_norm": 4.20743465423584,
"learning_rate": 3.7638454031035276e-07,
"loss": 0.0111,
"step": 4670
},
{
"epoch": 7.535558124340353,
"grad_norm": 6.002978324890137,
"learning_rate": 3.5162071289055245e-07,
"loss": 0.0066,
"step": 4680
},
{
"epoch": 7.55164095089712,
"grad_norm": 3.0463833808898926,
"learning_rate": 3.276922794642534e-07,
"loss": 0.0072,
"step": 4690
},
{
"epoch": 7.567723777453887,
"grad_norm": 6.407285213470459,
"learning_rate": 3.046002574013551e-07,
"loss": 0.0072,
"step": 4700
},
{
"epoch": 7.583806604010655,
"grad_norm": 8.836779594421387,
"learning_rate": 2.8234562850988356e-07,
"loss": 0.0079,
"step": 4710
},
{
"epoch": 7.599889430567423,
"grad_norm": 5.871425628662109,
"learning_rate": 2.609293389942602e-07,
"loss": 0.0077,
"step": 4720
},
{
"epoch": 7.61597225712419,
"grad_norm": 6.142898082733154,
"learning_rate": 2.403522994150609e-07,
"loss": 0.0071,
"step": 4730
},
{
"epoch": 7.632055083680957,
"grad_norm": 3.5649702548980713,
"learning_rate": 2.2061538465031117e-07,
"loss": 0.0071,
"step": 4740
},
{
"epoch": 7.6481379102377245,
"grad_norm": 7.242725372314453,
"learning_rate": 2.017194338582873e-07,
"loss": 0.0105,
"step": 4750
},
{
"epoch": 7.6642207367944915,
"grad_norm": 12.46696662902832,
"learning_rate": 1.8366525044183126e-07,
"loss": 0.0095,
"step": 4760
},
{
"epoch": 7.680303563351259,
"grad_norm": 5.7213640213012695,
"learning_rate": 1.6645360201420046e-07,
"loss": 0.0095,
"step": 4770
},
{
"epoch": 7.696386389908026,
"grad_norm": 3.3008534908294678,
"learning_rate": 1.5008522036642048e-07,
"loss": 0.0078,
"step": 4780
},
{
"epoch": 7.712469216464793,
"grad_norm": 4.01139497756958,
"learning_rate": 1.3456080143618767e-07,
"loss": 0.0097,
"step": 4790
},
{
"epoch": 7.728552043021561,
"grad_norm": 4.135232448577881,
"learning_rate": 1.198810052782595e-07,
"loss": 0.0091,
"step": 4800
},
{
"epoch": 7.744634869578328,
"grad_norm": 6.216122150421143,
"learning_rate": 1.060464560364105e-07,
"loss": 0.0071,
"step": 4810
},
{
"epoch": 7.760717696135096,
"grad_norm": 3.039104461669922,
"learning_rate": 9.305774191687988e-08,
"loss": 0.009,
"step": 4820
},
{
"epoch": 7.776800522691863,
"grad_norm": 2.9482333660125732,
"learning_rate": 8.091541516337398e-08,
"loss": 0.0104,
"step": 4830
},
{
"epoch": 7.792883349248631,
"grad_norm": 8.094056129455566,
"learning_rate": 6.961999203357605e-08,
"loss": 0.008,
"step": 4840
},
{
"epoch": 7.808966175805398,
"grad_norm": 2.1247482299804688,
"learning_rate": 5.917195277721055e-08,
"loss": 0.0046,
"step": 4850
},
{
"epoch": 7.8250490023621655,
"grad_norm": 7.984273433685303,
"learning_rate": 4.957174161560607e-08,
"loss": 0.0109,
"step": 4860
},
{
"epoch": 7.8411318289189325,
"grad_norm": 5.030553817749023,
"learning_rate": 4.0819766722826057e-08,
"loss": 0.0062,
"step": 4870
},
{
"epoch": 7.857214655475699,
"grad_norm": 2.809941291809082,
"learning_rate": 3.291640020829823e-08,
"loss": 0.0081,
"step": 4880
},
{
"epoch": 7.873297482032467,
"grad_norm": 6.414947032928467,
"learning_rate": 2.5861978101009433e-08,
"loss": 0.0075,
"step": 4890
},
{
"epoch": 7.889380308589234,
"grad_norm": 5.527952671051025,
"learning_rate": 1.9656800335206004e-08,
"loss": 0.0058,
"step": 4900
},
{
"epoch": 7.905463135146002,
"grad_norm": 6.868896007537842,
"learning_rate": 1.4301130737646163e-08,
"loss": 0.0067,
"step": 4910
},
{
"epoch": 7.921545961702769,
"grad_norm": 11.053786277770996,
"learning_rate": 9.795197016384538e-09,
"loss": 0.0102,
"step": 4920
},
{
"epoch": 7.937628788259537,
"grad_norm": 5.266638278961182,
"learning_rate": 6.1391907510888195e-09,
"loss": 0.0095,
"step": 4930
},
{
"epoch": 7.953711614816304,
"grad_norm": 3.4717257022857666,
"learning_rate": 3.3332673848951448e-09,
"loss": 0.0065,
"step": 4940
},
{
"epoch": 7.969794441373072,
"grad_norm": 4.068334579467773,
"learning_rate": 1.3775462177956222e-09,
"loss": 0.0083,
"step": 4950
},
{
"epoch": 7.985877267929839,
"grad_norm": 3.321983814239502,
"learning_rate": 2.721104015712683e-10,
"loss": 0.0062,
"step": 4960
}
],
"logging_steps": 10,
"max_steps": 4968,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.935941540711301e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}