phishing-email-disilBERT / trainer_state.json
Upload folder using huggingface_hub
f6123f4 verified
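
A minimal sketch for inspecting this trainer state with Python's json module, assuming the file has been downloaded locally as trainer_state.json; the filename and the summary printed below are illustrative and not part of the training code. It reads the fields that appear in the file itself (best_metric, best_model_checkpoint, log_history, eval_accuracy, eval_loss) and prints the per-epoch evaluation results.

# Sketch only: assumes trainer_state.json has been downloaded next to this script.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Top-level summary fields written by the Hugging Face Trainer.
print("best eval loss:", state["best_metric"])
print("best checkpoint:", state["best_model_checkpoint"])

# Per-epoch evaluation entries are the log_history items carrying eval_* keys;
# the remaining items are per-step training logs (loss, grad_norm, learning_rate).
for entry in state["log_history"]:
    if "eval_loss" in entry:
        print(f"epoch {entry['epoch']:.0f}: "
              f"accuracy={entry['eval_accuracy']:.4f}, loss={entry['eval_loss']:.4f}")
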
{
"best_metric": 0.026551904156804085,
"best_model_checkpoint": "./phishing-email-detection/checkpoint-825",
"epoch": 3.0,
"eval_steps": 1,
"global_step": 825,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0036363636363636364,
"grad_norm": 0.5588774085044861,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.6866,
"step": 1
},
{
"epoch": 0.007272727272727273,
"grad_norm": 1.193735957145691,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.7133,
"step": 2
},
{
"epoch": 0.01090909090909091,
"grad_norm": 0.7555140256881714,
"learning_rate": 3e-06,
"loss": 0.6972,
"step": 3
},
{
"epoch": 0.014545454545454545,
"grad_norm": 0.7989323735237122,
"learning_rate": 4.000000000000001e-06,
"loss": 0.6927,
"step": 4
},
{
"epoch": 0.01818181818181818,
"grad_norm": 0.608884334564209,
"learning_rate": 5e-06,
"loss": 0.6873,
"step": 5
},
{
"epoch": 0.02181818181818182,
"grad_norm": 0.5146162509918213,
"learning_rate": 6e-06,
"loss": 0.6783,
"step": 6
},
{
"epoch": 0.025454545454545455,
"grad_norm": 1.2107092142105103,
"learning_rate": 7.000000000000001e-06,
"loss": 0.6978,
"step": 7
},
{
"epoch": 0.02909090909090909,
"grad_norm": 0.7423388957977295,
"learning_rate": 8.000000000000001e-06,
"loss": 0.6951,
"step": 8
},
{
"epoch": 0.03272727272727273,
"grad_norm": 1.1845803260803223,
"learning_rate": 9e-06,
"loss": 0.6952,
"step": 9
},
{
"epoch": 0.03636363636363636,
"grad_norm": 0.6241262555122375,
"learning_rate": 1e-05,
"loss": 0.6854,
"step": 10
},
{
"epoch": 0.04,
"grad_norm": 1.0969253778457642,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.6649,
"step": 11
},
{
"epoch": 0.04363636363636364,
"grad_norm": 0.7080792188644409,
"learning_rate": 1.2e-05,
"loss": 0.6827,
"step": 12
},
{
"epoch": 0.04727272727272727,
"grad_norm": 0.6348336935043335,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.6681,
"step": 13
},
{
"epoch": 0.05090909090909091,
"grad_norm": 0.714078426361084,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.6719,
"step": 14
},
{
"epoch": 0.05454545454545454,
"grad_norm": 0.6718598008155823,
"learning_rate": 1.5e-05,
"loss": 0.6702,
"step": 15
},
{
"epoch": 0.05818181818181818,
"grad_norm": 1.0352996587753296,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.6689,
"step": 16
},
{
"epoch": 0.06181818181818182,
"grad_norm": 0.8572512269020081,
"learning_rate": 1.7000000000000003e-05,
"loss": 0.6459,
"step": 17
},
{
"epoch": 0.06545454545454546,
"grad_norm": 1.2123569250106812,
"learning_rate": 1.8e-05,
"loss": 0.6487,
"step": 18
},
{
"epoch": 0.06909090909090909,
"grad_norm": 2.0234522819519043,
"learning_rate": 1.9e-05,
"loss": 0.6572,
"step": 19
},
{
"epoch": 0.07272727272727272,
"grad_norm": 2.4744138717651367,
"learning_rate": 2e-05,
"loss": 0.586,
"step": 20
},
{
"epoch": 0.07636363636363637,
"grad_norm": 1.7895574569702148,
"learning_rate": 2.1e-05,
"loss": 0.6114,
"step": 21
},
{
"epoch": 0.08,
"grad_norm": 2.3886725902557373,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.6199,
"step": 22
},
{
"epoch": 0.08363636363636363,
"grad_norm": 2.5716464519500732,
"learning_rate": 2.3000000000000003e-05,
"loss": 0.5951,
"step": 23
},
{
"epoch": 0.08727272727272728,
"grad_norm": 1.6023527383804321,
"learning_rate": 2.4e-05,
"loss": 0.5584,
"step": 24
},
{
"epoch": 0.09090909090909091,
"grad_norm": 1.8739789724349976,
"learning_rate": 2.5e-05,
"loss": 0.5711,
"step": 25
},
{
"epoch": 0.09454545454545454,
"grad_norm": 1.912356972694397,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.5251,
"step": 26
},
{
"epoch": 0.09818181818181818,
"grad_norm": 2.616036891937256,
"learning_rate": 2.7000000000000002e-05,
"loss": 0.4568,
"step": 27
},
{
"epoch": 0.10181818181818182,
"grad_norm": 2.2210693359375,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.4547,
"step": 28
},
{
"epoch": 0.10545454545454545,
"grad_norm": 4.0840325355529785,
"learning_rate": 2.9e-05,
"loss": 0.409,
"step": 29
},
{
"epoch": 0.10909090909090909,
"grad_norm": 3.1913392543792725,
"learning_rate": 3e-05,
"loss": 0.3759,
"step": 30
},
{
"epoch": 0.11272727272727273,
"grad_norm": 2.67592191696167,
"learning_rate": 3.1e-05,
"loss": 0.3727,
"step": 31
},
{
"epoch": 0.11636363636363636,
"grad_norm": 3.8523147106170654,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.3395,
"step": 32
},
{
"epoch": 0.12,
"grad_norm": 3.1377878189086914,
"learning_rate": 3.3e-05,
"loss": 0.2602,
"step": 33
},
{
"epoch": 0.12363636363636364,
"grad_norm": 4.252383232116699,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.2641,
"step": 34
},
{
"epoch": 0.12727272727272726,
"grad_norm": 2.4112355709075928,
"learning_rate": 3.5e-05,
"loss": 0.2815,
"step": 35
},
{
"epoch": 0.13090909090909092,
"grad_norm": 2.1956121921539307,
"learning_rate": 3.6e-05,
"loss": 0.212,
"step": 36
},
{
"epoch": 0.13454545454545455,
"grad_norm": 1.6787267923355103,
"learning_rate": 3.7e-05,
"loss": 0.2461,
"step": 37
},
{
"epoch": 0.13818181818181818,
"grad_norm": 5.1338300704956055,
"learning_rate": 3.8e-05,
"loss": 0.2723,
"step": 38
},
{
"epoch": 0.14181818181818182,
"grad_norm": 1.7655150890350342,
"learning_rate": 3.9000000000000006e-05,
"loss": 0.2078,
"step": 39
},
{
"epoch": 0.14545454545454545,
"grad_norm": 2.906001329421997,
"learning_rate": 4e-05,
"loss": 0.3349,
"step": 40
},
{
"epoch": 0.14909090909090908,
"grad_norm": 4.4803900718688965,
"learning_rate": 4.1e-05,
"loss": 0.1256,
"step": 41
},
{
"epoch": 0.15272727272727274,
"grad_norm": 2.0880069732666016,
"learning_rate": 4.2e-05,
"loss": 0.1547,
"step": 42
},
{
"epoch": 0.15636363636363637,
"grad_norm": 3.3797402381896973,
"learning_rate": 4.3e-05,
"loss": 0.137,
"step": 43
},
{
"epoch": 0.16,
"grad_norm": 3.1822092533111572,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.1881,
"step": 44
},
{
"epoch": 0.16363636363636364,
"grad_norm": 1.7508845329284668,
"learning_rate": 4.5e-05,
"loss": 0.0657,
"step": 45
},
{
"epoch": 0.16727272727272727,
"grad_norm": 1.9360814094543457,
"learning_rate": 4.600000000000001e-05,
"loss": 0.0578,
"step": 46
},
{
"epoch": 0.1709090909090909,
"grad_norm": 5.421896934509277,
"learning_rate": 4.7e-05,
"loss": 0.2809,
"step": 47
},
{
"epoch": 0.17454545454545456,
"grad_norm": 3.034318685531616,
"learning_rate": 4.8e-05,
"loss": 0.1716,
"step": 48
},
{
"epoch": 0.1781818181818182,
"grad_norm": 8.016839027404785,
"learning_rate": 4.9e-05,
"loss": 0.261,
"step": 49
},
{
"epoch": 0.18181818181818182,
"grad_norm": 2.5083861351013184,
"learning_rate": 5e-05,
"loss": 0.0848,
"step": 50
},
{
"epoch": 0.18545454545454546,
"grad_norm": 7.909552097320557,
"learning_rate": 4.9935483870967744e-05,
"loss": 0.0823,
"step": 51
},
{
"epoch": 0.1890909090909091,
"grad_norm": 8.652934074401855,
"learning_rate": 4.9870967741935485e-05,
"loss": 0.1812,
"step": 52
},
{
"epoch": 0.19272727272727272,
"grad_norm": 2.450180768966675,
"learning_rate": 4.9806451612903226e-05,
"loss": 0.065,
"step": 53
},
{
"epoch": 0.19636363636363635,
"grad_norm": 3.1136999130249023,
"learning_rate": 4.9741935483870974e-05,
"loss": 0.1683,
"step": 54
},
{
"epoch": 0.2,
"grad_norm": 6.53245735168457,
"learning_rate": 4.967741935483871e-05,
"loss": 0.1243,
"step": 55
},
{
"epoch": 0.20363636363636364,
"grad_norm": 3.6356089115142822,
"learning_rate": 4.961290322580646e-05,
"loss": 0.1899,
"step": 56
},
{
"epoch": 0.20727272727272728,
"grad_norm": 13.28806209564209,
"learning_rate": 4.95483870967742e-05,
"loss": 0.416,
"step": 57
},
{
"epoch": 0.2109090909090909,
"grad_norm": 6.773140907287598,
"learning_rate": 4.948387096774193e-05,
"loss": 0.1387,
"step": 58
},
{
"epoch": 0.21454545454545454,
"grad_norm": 4.9038848876953125,
"learning_rate": 4.941935483870968e-05,
"loss": 0.1795,
"step": 59
},
{
"epoch": 0.21818181818181817,
"grad_norm": 1.2570604085922241,
"learning_rate": 4.935483870967742e-05,
"loss": 0.0462,
"step": 60
},
{
"epoch": 0.22181818181818183,
"grad_norm": 4.595444202423096,
"learning_rate": 4.929032258064516e-05,
"loss": 0.2883,
"step": 61
},
{
"epoch": 0.22545454545454546,
"grad_norm": 5.1306891441345215,
"learning_rate": 4.9225806451612904e-05,
"loss": 0.1155,
"step": 62
},
{
"epoch": 0.2290909090909091,
"grad_norm": 3.7912144660949707,
"learning_rate": 4.916129032258065e-05,
"loss": 0.254,
"step": 63
},
{
"epoch": 0.23272727272727273,
"grad_norm": 5.713457107543945,
"learning_rate": 4.9096774193548387e-05,
"loss": 0.1327,
"step": 64
},
{
"epoch": 0.23636363636363636,
"grad_norm": 4.170778274536133,
"learning_rate": 4.903225806451613e-05,
"loss": 0.1039,
"step": 65
},
{
"epoch": 0.24,
"grad_norm": 1.75148344039917,
"learning_rate": 4.8967741935483876e-05,
"loss": 0.0852,
"step": 66
},
{
"epoch": 0.24363636363636362,
"grad_norm": 9.794595718383789,
"learning_rate": 4.890322580645161e-05,
"loss": 0.2108,
"step": 67
},
{
"epoch": 0.24727272727272728,
"grad_norm": 9.14588737487793,
"learning_rate": 4.883870967741936e-05,
"loss": 0.1652,
"step": 68
},
{
"epoch": 0.2509090909090909,
"grad_norm": 0.7871703505516052,
"learning_rate": 4.87741935483871e-05,
"loss": 0.0286,
"step": 69
},
{
"epoch": 0.2545454545454545,
"grad_norm": 3.963533401489258,
"learning_rate": 4.870967741935484e-05,
"loss": 0.1447,
"step": 70
},
{
"epoch": 0.2581818181818182,
"grad_norm": 1.9403924942016602,
"learning_rate": 4.864516129032258e-05,
"loss": 0.0346,
"step": 71
},
{
"epoch": 0.26181818181818184,
"grad_norm": 4.882287979125977,
"learning_rate": 4.858064516129032e-05,
"loss": 0.0858,
"step": 72
},
{
"epoch": 0.26545454545454544,
"grad_norm": 3.18686842918396,
"learning_rate": 4.8516129032258065e-05,
"loss": 0.1201,
"step": 73
},
{
"epoch": 0.2690909090909091,
"grad_norm": 1.2996779680252075,
"learning_rate": 4.8451612903225806e-05,
"loss": 0.0285,
"step": 74
},
{
"epoch": 0.2727272727272727,
"grad_norm": 4.498321056365967,
"learning_rate": 4.8387096774193554e-05,
"loss": 0.1272,
"step": 75
},
{
"epoch": 0.27636363636363637,
"grad_norm": 7.456954479217529,
"learning_rate": 4.8322580645161295e-05,
"loss": 0.1553,
"step": 76
},
{
"epoch": 0.28,
"grad_norm": 1.462753415107727,
"learning_rate": 4.8258064516129036e-05,
"loss": 0.0688,
"step": 77
},
{
"epoch": 0.28363636363636363,
"grad_norm": 2.150094985961914,
"learning_rate": 4.819354838709678e-05,
"loss": 0.0558,
"step": 78
},
{
"epoch": 0.2872727272727273,
"grad_norm": 4.6224284172058105,
"learning_rate": 4.812903225806452e-05,
"loss": 0.1185,
"step": 79
},
{
"epoch": 0.2909090909090909,
"grad_norm": 3.0150296688079834,
"learning_rate": 4.806451612903226e-05,
"loss": 0.2397,
"step": 80
},
{
"epoch": 0.29454545454545455,
"grad_norm": 13.630721092224121,
"learning_rate": 4.8e-05,
"loss": 0.2093,
"step": 81
},
{
"epoch": 0.29818181818181816,
"grad_norm": 8.345911026000977,
"learning_rate": 4.793548387096774e-05,
"loss": 0.1799,
"step": 82
},
{
"epoch": 0.3018181818181818,
"grad_norm": 4.615355491638184,
"learning_rate": 4.7870967741935484e-05,
"loss": 0.0951,
"step": 83
},
{
"epoch": 0.3054545454545455,
"grad_norm": 6.051864147186279,
"learning_rate": 4.780645161290323e-05,
"loss": 0.2423,
"step": 84
},
{
"epoch": 0.3090909090909091,
"grad_norm": 3.912353277206421,
"learning_rate": 4.774193548387097e-05,
"loss": 0.0923,
"step": 85
},
{
"epoch": 0.31272727272727274,
"grad_norm": 5.80419397354126,
"learning_rate": 4.767741935483871e-05,
"loss": 0.1406,
"step": 86
},
{
"epoch": 0.31636363636363635,
"grad_norm": 4.3275227546691895,
"learning_rate": 4.7612903225806455e-05,
"loss": 0.2217,
"step": 87
},
{
"epoch": 0.32,
"grad_norm": 3.613757610321045,
"learning_rate": 4.75483870967742e-05,
"loss": 0.1888,
"step": 88
},
{
"epoch": 0.3236363636363636,
"grad_norm": 0.619616687297821,
"learning_rate": 4.748387096774194e-05,
"loss": 0.0193,
"step": 89
},
{
"epoch": 0.32727272727272727,
"grad_norm": 3.2283294200897217,
"learning_rate": 4.741935483870968e-05,
"loss": 0.1036,
"step": 90
},
{
"epoch": 0.33090909090909093,
"grad_norm": 2.8031208515167236,
"learning_rate": 4.735483870967742e-05,
"loss": 0.1384,
"step": 91
},
{
"epoch": 0.33454545454545453,
"grad_norm": 2.338831901550293,
"learning_rate": 4.729032258064516e-05,
"loss": 0.0763,
"step": 92
},
{
"epoch": 0.3381818181818182,
"grad_norm": 1.7797901630401611,
"learning_rate": 4.72258064516129e-05,
"loss": 0.1343,
"step": 93
},
{
"epoch": 0.3418181818181818,
"grad_norm": 2.9699721336364746,
"learning_rate": 4.716129032258065e-05,
"loss": 0.124,
"step": 94
},
{
"epoch": 0.34545454545454546,
"grad_norm": 1.207624077796936,
"learning_rate": 4.7096774193548385e-05,
"loss": 0.0384,
"step": 95
},
{
"epoch": 0.3490909090909091,
"grad_norm": 1.7600481510162354,
"learning_rate": 4.7032258064516133e-05,
"loss": 0.1058,
"step": 96
},
{
"epoch": 0.3527272727272727,
"grad_norm": 1.9546951055526733,
"learning_rate": 4.6967741935483875e-05,
"loss": 0.0867,
"step": 97
},
{
"epoch": 0.3563636363636364,
"grad_norm": 1.6174895763397217,
"learning_rate": 4.6903225806451616e-05,
"loss": 0.0249,
"step": 98
},
{
"epoch": 0.36,
"grad_norm": 2.494776725769043,
"learning_rate": 4.683870967741936e-05,
"loss": 0.0813,
"step": 99
},
{
"epoch": 0.36363636363636365,
"grad_norm": 1.799558162689209,
"learning_rate": 4.67741935483871e-05,
"loss": 0.0698,
"step": 100
},
{
"epoch": 0.36727272727272725,
"grad_norm": 1.8396685123443604,
"learning_rate": 4.670967741935484e-05,
"loss": 0.1093,
"step": 101
},
{
"epoch": 0.3709090909090909,
"grad_norm": 1.1487964391708374,
"learning_rate": 4.664516129032258e-05,
"loss": 0.0199,
"step": 102
},
{
"epoch": 0.37454545454545457,
"grad_norm": 3.693464756011963,
"learning_rate": 4.658064516129033e-05,
"loss": 0.0468,
"step": 103
},
{
"epoch": 0.3781818181818182,
"grad_norm": 0.4796558916568756,
"learning_rate": 4.651612903225806e-05,
"loss": 0.0142,
"step": 104
},
{
"epoch": 0.38181818181818183,
"grad_norm": 1.3589314222335815,
"learning_rate": 4.645161290322581e-05,
"loss": 0.0197,
"step": 105
},
{
"epoch": 0.38545454545454544,
"grad_norm": 5.373131275177002,
"learning_rate": 4.638709677419355e-05,
"loss": 0.1297,
"step": 106
},
{
"epoch": 0.3890909090909091,
"grad_norm": 5.07847261428833,
"learning_rate": 4.632258064516129e-05,
"loss": 0.1152,
"step": 107
},
{
"epoch": 0.3927272727272727,
"grad_norm": 2.5810861587524414,
"learning_rate": 4.6258064516129035e-05,
"loss": 0.0499,
"step": 108
},
{
"epoch": 0.39636363636363636,
"grad_norm": 3.111640691757202,
"learning_rate": 4.6193548387096776e-05,
"loss": 0.0488,
"step": 109
},
{
"epoch": 0.4,
"grad_norm": 2.8592653274536133,
"learning_rate": 4.612903225806452e-05,
"loss": 0.0648,
"step": 110
},
{
"epoch": 0.4036363636363636,
"grad_norm": 4.070368766784668,
"learning_rate": 4.606451612903226e-05,
"loss": 0.1844,
"step": 111
},
{
"epoch": 0.4072727272727273,
"grad_norm": 2.12369441986084,
"learning_rate": 4.600000000000001e-05,
"loss": 0.0645,
"step": 112
},
{
"epoch": 0.4109090909090909,
"grad_norm": 2.3667800426483154,
"learning_rate": 4.593548387096774e-05,
"loss": 0.0471,
"step": 113
},
{
"epoch": 0.41454545454545455,
"grad_norm": 4.2587056159973145,
"learning_rate": 4.587096774193548e-05,
"loss": 0.062,
"step": 114
},
{
"epoch": 0.41818181818181815,
"grad_norm": 4.729971885681152,
"learning_rate": 4.580645161290323e-05,
"loss": 0.1277,
"step": 115
},
{
"epoch": 0.4218181818181818,
"grad_norm": 2.8815808296203613,
"learning_rate": 4.5741935483870965e-05,
"loss": 0.1424,
"step": 116
},
{
"epoch": 0.4254545454545455,
"grad_norm": 3.5163843631744385,
"learning_rate": 4.567741935483871e-05,
"loss": 0.1753,
"step": 117
},
{
"epoch": 0.4290909090909091,
"grad_norm": 0.9345911145210266,
"learning_rate": 4.5612903225806454e-05,
"loss": 0.0173,
"step": 118
},
{
"epoch": 0.43272727272727274,
"grad_norm": 2.4038431644439697,
"learning_rate": 4.5548387096774196e-05,
"loss": 0.1339,
"step": 119
},
{
"epoch": 0.43636363636363634,
"grad_norm": 1.086334228515625,
"learning_rate": 4.548387096774194e-05,
"loss": 0.06,
"step": 120
},
{
"epoch": 0.44,
"grad_norm": 3.3562028408050537,
"learning_rate": 4.5419354838709685e-05,
"loss": 0.2079,
"step": 121
},
{
"epoch": 0.44363636363636366,
"grad_norm": 3.6790003776550293,
"learning_rate": 4.535483870967742e-05,
"loss": 0.1622,
"step": 122
},
{
"epoch": 0.44727272727272727,
"grad_norm": 4.95260763168335,
"learning_rate": 4.529032258064516e-05,
"loss": 0.069,
"step": 123
},
{
"epoch": 0.4509090909090909,
"grad_norm": 1.6130903959274292,
"learning_rate": 4.522580645161291e-05,
"loss": 0.1197,
"step": 124
},
{
"epoch": 0.45454545454545453,
"grad_norm": 3.9144365787506104,
"learning_rate": 4.516129032258064e-05,
"loss": 0.063,
"step": 125
},
{
"epoch": 0.4581818181818182,
"grad_norm": 7.754761219024658,
"learning_rate": 4.509677419354839e-05,
"loss": 0.358,
"step": 126
},
{
"epoch": 0.4618181818181818,
"grad_norm": 4.277048110961914,
"learning_rate": 4.503225806451613e-05,
"loss": 0.1137,
"step": 127
},
{
"epoch": 0.46545454545454545,
"grad_norm": 4.798757553100586,
"learning_rate": 4.4967741935483873e-05,
"loss": 0.1365,
"step": 128
},
{
"epoch": 0.4690909090909091,
"grad_norm": 2.407785177230835,
"learning_rate": 4.4903225806451615e-05,
"loss": 0.0339,
"step": 129
},
{
"epoch": 0.4727272727272727,
"grad_norm": 2.8577773571014404,
"learning_rate": 4.4838709677419356e-05,
"loss": 0.0634,
"step": 130
},
{
"epoch": 0.4763636363636364,
"grad_norm": 2.8833367824554443,
"learning_rate": 4.47741935483871e-05,
"loss": 0.1274,
"step": 131
},
{
"epoch": 0.48,
"grad_norm": 3.3655707836151123,
"learning_rate": 4.470967741935484e-05,
"loss": 0.0845,
"step": 132
},
{
"epoch": 0.48363636363636364,
"grad_norm": 0.4524034559726715,
"learning_rate": 4.4645161290322586e-05,
"loss": 0.0201,
"step": 133
},
{
"epoch": 0.48727272727272725,
"grad_norm": 2.0502264499664307,
"learning_rate": 4.458064516129032e-05,
"loss": 0.0241,
"step": 134
},
{
"epoch": 0.4909090909090909,
"grad_norm": 3.171466588973999,
"learning_rate": 4.451612903225807e-05,
"loss": 0.0988,
"step": 135
},
{
"epoch": 0.49454545454545457,
"grad_norm": 3.592583179473877,
"learning_rate": 4.445161290322581e-05,
"loss": 0.0352,
"step": 136
},
{
"epoch": 0.49818181818181817,
"grad_norm": 2.234022617340088,
"learning_rate": 4.438709677419355e-05,
"loss": 0.1045,
"step": 137
},
{
"epoch": 0.5018181818181818,
"grad_norm": 1.0845825672149658,
"learning_rate": 4.432258064516129e-05,
"loss": 0.1009,
"step": 138
},
{
"epoch": 0.5054545454545455,
"grad_norm": 2.9205362796783447,
"learning_rate": 4.4258064516129034e-05,
"loss": 0.0515,
"step": 139
},
{
"epoch": 0.509090909090909,
"grad_norm": 3.975783109664917,
"learning_rate": 4.4193548387096775e-05,
"loss": 0.1285,
"step": 140
},
{
"epoch": 0.5127272727272727,
"grad_norm": 5.859277248382568,
"learning_rate": 4.4129032258064516e-05,
"loss": 0.0997,
"step": 141
},
{
"epoch": 0.5163636363636364,
"grad_norm": 4.712912559509277,
"learning_rate": 4.4064516129032264e-05,
"loss": 0.1876,
"step": 142
},
{
"epoch": 0.52,
"grad_norm": 3.739158868789673,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.0487,
"step": 143
},
{
"epoch": 0.5236363636363637,
"grad_norm": 0.8333390951156616,
"learning_rate": 4.393548387096774e-05,
"loss": 0.0169,
"step": 144
},
{
"epoch": 0.5272727272727272,
"grad_norm": 1.7556527853012085,
"learning_rate": 4.387096774193549e-05,
"loss": 0.0418,
"step": 145
},
{
"epoch": 0.5309090909090909,
"grad_norm": 0.7867715358734131,
"learning_rate": 4.380645161290323e-05,
"loss": 0.0097,
"step": 146
},
{
"epoch": 0.5345454545454545,
"grad_norm": 1.7784496545791626,
"learning_rate": 4.374193548387097e-05,
"loss": 0.0308,
"step": 147
},
{
"epoch": 0.5381818181818182,
"grad_norm": 3.104606866836548,
"learning_rate": 4.367741935483871e-05,
"loss": 0.0512,
"step": 148
},
{
"epoch": 0.5418181818181819,
"grad_norm": 1.6434437036514282,
"learning_rate": 4.361290322580645e-05,
"loss": 0.086,
"step": 149
},
{
"epoch": 0.5454545454545454,
"grad_norm": 3.0264408588409424,
"learning_rate": 4.3548387096774194e-05,
"loss": 0.0293,
"step": 150
},
{
"epoch": 0.5490909090909091,
"grad_norm": 0.4782671332359314,
"learning_rate": 4.3483870967741936e-05,
"loss": 0.0107,
"step": 151
},
{
"epoch": 0.5527272727272727,
"grad_norm": 3.8326480388641357,
"learning_rate": 4.3419354838709684e-05,
"loss": 0.1158,
"step": 152
},
{
"epoch": 0.5563636363636364,
"grad_norm": 2.803429126739502,
"learning_rate": 4.335483870967742e-05,
"loss": 0.0907,
"step": 153
},
{
"epoch": 0.56,
"grad_norm": 6.267507076263428,
"learning_rate": 4.3290322580645166e-05,
"loss": 0.0712,
"step": 154
},
{
"epoch": 0.5636363636363636,
"grad_norm": 2.4244847297668457,
"learning_rate": 4.322580645161291e-05,
"loss": 0.0567,
"step": 155
},
{
"epoch": 0.5672727272727273,
"grad_norm": 1.6119318008422852,
"learning_rate": 4.316129032258065e-05,
"loss": 0.166,
"step": 156
},
{
"epoch": 0.5709090909090909,
"grad_norm": 2.521883964538574,
"learning_rate": 4.309677419354839e-05,
"loss": 0.0308,
"step": 157
},
{
"epoch": 0.5745454545454546,
"grad_norm": 2.1113216876983643,
"learning_rate": 4.303225806451613e-05,
"loss": 0.0302,
"step": 158
},
{
"epoch": 0.5781818181818181,
"grad_norm": 1.94487726688385,
"learning_rate": 4.296774193548387e-05,
"loss": 0.0199,
"step": 159
},
{
"epoch": 0.5818181818181818,
"grad_norm": 3.142296552658081,
"learning_rate": 4.2903225806451614e-05,
"loss": 0.0744,
"step": 160
},
{
"epoch": 0.5854545454545454,
"grad_norm": 1.6849907636642456,
"learning_rate": 4.283870967741936e-05,
"loss": 0.0321,
"step": 161
},
{
"epoch": 0.5890909090909091,
"grad_norm": 5.332469463348389,
"learning_rate": 4.2774193548387096e-05,
"loss": 0.1344,
"step": 162
},
{
"epoch": 0.5927272727272728,
"grad_norm": 0.8968711495399475,
"learning_rate": 4.2709677419354844e-05,
"loss": 0.073,
"step": 163
},
{
"epoch": 0.5963636363636363,
"grad_norm": 5.70121955871582,
"learning_rate": 4.2645161290322585e-05,
"loss": 0.1092,
"step": 164
},
{
"epoch": 0.6,
"grad_norm": 1.3494465351104736,
"learning_rate": 4.258064516129032e-05,
"loss": 0.0153,
"step": 165
},
{
"epoch": 0.6036363636363636,
"grad_norm": 1.2071605920791626,
"learning_rate": 4.251612903225807e-05,
"loss": 0.0636,
"step": 166
},
{
"epoch": 0.6072727272727273,
"grad_norm": 2.2756154537200928,
"learning_rate": 4.245161290322581e-05,
"loss": 0.0523,
"step": 167
},
{
"epoch": 0.610909090909091,
"grad_norm": 2.9887783527374268,
"learning_rate": 4.238709677419355e-05,
"loss": 0.0991,
"step": 168
},
{
"epoch": 0.6145454545454545,
"grad_norm": 3.471949577331543,
"learning_rate": 4.232258064516129e-05,
"loss": 0.1146,
"step": 169
},
{
"epoch": 0.6181818181818182,
"grad_norm": 0.9178719520568848,
"learning_rate": 4.225806451612904e-05,
"loss": 0.0283,
"step": 170
},
{
"epoch": 0.6218181818181818,
"grad_norm": 1.7535454034805298,
"learning_rate": 4.2193548387096774e-05,
"loss": 0.1037,
"step": 171
},
{
"epoch": 0.6254545454545455,
"grad_norm": 0.523720383644104,
"learning_rate": 4.2129032258064515e-05,
"loss": 0.0188,
"step": 172
},
{
"epoch": 0.6290909090909091,
"grad_norm": 2.872236728668213,
"learning_rate": 4.206451612903226e-05,
"loss": 0.137,
"step": 173
},
{
"epoch": 0.6327272727272727,
"grad_norm": 1.1402463912963867,
"learning_rate": 4.2e-05,
"loss": 0.0469,
"step": 174
},
{
"epoch": 0.6363636363636364,
"grad_norm": 1.3822078704833984,
"learning_rate": 4.1935483870967746e-05,
"loss": 0.0299,
"step": 175
},
{
"epoch": 0.64,
"grad_norm": 1.6050485372543335,
"learning_rate": 4.187096774193549e-05,
"loss": 0.0313,
"step": 176
},
{
"epoch": 0.6436363636363637,
"grad_norm": 1.4865597486495972,
"learning_rate": 4.180645161290323e-05,
"loss": 0.0199,
"step": 177
},
{
"epoch": 0.6472727272727272,
"grad_norm": 1.0985281467437744,
"learning_rate": 4.174193548387097e-05,
"loss": 0.0161,
"step": 178
},
{
"epoch": 0.6509090909090909,
"grad_norm": 2.7019283771514893,
"learning_rate": 4.167741935483871e-05,
"loss": 0.1127,
"step": 179
},
{
"epoch": 0.6545454545454545,
"grad_norm": 0.20963142812252045,
"learning_rate": 4.161290322580645e-05,
"loss": 0.0069,
"step": 180
},
{
"epoch": 0.6581818181818182,
"grad_norm": 2.687619209289551,
"learning_rate": 4.154838709677419e-05,
"loss": 0.0189,
"step": 181
},
{
"epoch": 0.6618181818181819,
"grad_norm": 0.5330418348312378,
"learning_rate": 4.148387096774194e-05,
"loss": 0.0878,
"step": 182
},
{
"epoch": 0.6654545454545454,
"grad_norm": 6.277083873748779,
"learning_rate": 4.1419354838709676e-05,
"loss": 0.2121,
"step": 183
},
{
"epoch": 0.6690909090909091,
"grad_norm": 4.049676895141602,
"learning_rate": 4.1354838709677424e-05,
"loss": 0.069,
"step": 184
},
{
"epoch": 0.6727272727272727,
"grad_norm": 2.069470167160034,
"learning_rate": 4.1290322580645165e-05,
"loss": 0.0485,
"step": 185
},
{
"epoch": 0.6763636363636364,
"grad_norm": 5.907510280609131,
"learning_rate": 4.1225806451612906e-05,
"loss": 0.0886,
"step": 186
},
{
"epoch": 0.68,
"grad_norm": 2.1604321002960205,
"learning_rate": 4.116129032258065e-05,
"loss": 0.0116,
"step": 187
},
{
"epoch": 0.6836363636363636,
"grad_norm": 2.6261606216430664,
"learning_rate": 4.109677419354839e-05,
"loss": 0.0438,
"step": 188
},
{
"epoch": 0.6872727272727273,
"grad_norm": 2.597719669342041,
"learning_rate": 4.103225806451613e-05,
"loss": 0.1081,
"step": 189
},
{
"epoch": 0.6909090909090909,
"grad_norm": 0.45269355177879333,
"learning_rate": 4.096774193548387e-05,
"loss": 0.0057,
"step": 190
},
{
"epoch": 0.6945454545454546,
"grad_norm": 3.5486068725585938,
"learning_rate": 4.090322580645162e-05,
"loss": 0.0569,
"step": 191
},
{
"epoch": 0.6981818181818182,
"grad_norm": 2.485374927520752,
"learning_rate": 4.0838709677419354e-05,
"loss": 0.1183,
"step": 192
},
{
"epoch": 0.7018181818181818,
"grad_norm": 4.209328651428223,
"learning_rate": 4.0774193548387095e-05,
"loss": 0.1006,
"step": 193
},
{
"epoch": 0.7054545454545454,
"grad_norm": 1.0859043598175049,
"learning_rate": 4.070967741935484e-05,
"loss": 0.0117,
"step": 194
},
{
"epoch": 0.7090909090909091,
"grad_norm": 1.0647627115249634,
"learning_rate": 4.0645161290322584e-05,
"loss": 0.0618,
"step": 195
},
{
"epoch": 0.7127272727272728,
"grad_norm": 1.131689429283142,
"learning_rate": 4.0580645161290325e-05,
"loss": 0.0439,
"step": 196
},
{
"epoch": 0.7163636363636363,
"grad_norm": 3.031101942062378,
"learning_rate": 4.0516129032258067e-05,
"loss": 0.1021,
"step": 197
},
{
"epoch": 0.72,
"grad_norm": 0.11612915247678757,
"learning_rate": 4.045161290322581e-05,
"loss": 0.0048,
"step": 198
},
{
"epoch": 0.7236363636363636,
"grad_norm": 1.4697656631469727,
"learning_rate": 4.038709677419355e-05,
"loss": 0.0296,
"step": 199
},
{
"epoch": 0.7272727272727273,
"grad_norm": 2.3507072925567627,
"learning_rate": 4.032258064516129e-05,
"loss": 0.0975,
"step": 200
},
{
"epoch": 0.730909090909091,
"grad_norm": 3.2952606678009033,
"learning_rate": 4.025806451612903e-05,
"loss": 0.0741,
"step": 201
},
{
"epoch": 0.7345454545454545,
"grad_norm": 1.3989083766937256,
"learning_rate": 4.019354838709677e-05,
"loss": 0.0603,
"step": 202
},
{
"epoch": 0.7381818181818182,
"grad_norm": 0.7519178986549377,
"learning_rate": 4.012903225806452e-05,
"loss": 0.0195,
"step": 203
},
{
"epoch": 0.7418181818181818,
"grad_norm": 0.9893004298210144,
"learning_rate": 4.006451612903226e-05,
"loss": 0.0316,
"step": 204
},
{
"epoch": 0.7454545454545455,
"grad_norm": 2.2764501571655273,
"learning_rate": 4e-05,
"loss": 0.0347,
"step": 205
},
{
"epoch": 0.7490909090909091,
"grad_norm": 2.6084823608398438,
"learning_rate": 3.9935483870967745e-05,
"loss": 0.1071,
"step": 206
},
{
"epoch": 0.7527272727272727,
"grad_norm": 1.7547663450241089,
"learning_rate": 3.9870967741935486e-05,
"loss": 0.0181,
"step": 207
},
{
"epoch": 0.7563636363636363,
"grad_norm": 0.3479810953140259,
"learning_rate": 3.980645161290323e-05,
"loss": 0.0101,
"step": 208
},
{
"epoch": 0.76,
"grad_norm": 1.399048924446106,
"learning_rate": 3.974193548387097e-05,
"loss": 0.0244,
"step": 209
},
{
"epoch": 0.7636363636363637,
"grad_norm": 3.616453170776367,
"learning_rate": 3.9677419354838716e-05,
"loss": 0.0863,
"step": 210
},
{
"epoch": 0.7672727272727272,
"grad_norm": 2.61297345161438,
"learning_rate": 3.961290322580645e-05,
"loss": 0.0447,
"step": 211
},
{
"epoch": 0.7709090909090909,
"grad_norm": 1.899042010307312,
"learning_rate": 3.95483870967742e-05,
"loss": 0.0313,
"step": 212
},
{
"epoch": 0.7745454545454545,
"grad_norm": 0.9543875455856323,
"learning_rate": 3.948387096774194e-05,
"loss": 0.0243,
"step": 213
},
{
"epoch": 0.7781818181818182,
"grad_norm": 1.5872313976287842,
"learning_rate": 3.9419354838709674e-05,
"loss": 0.0122,
"step": 214
},
{
"epoch": 0.7818181818181819,
"grad_norm": 1.8551832437515259,
"learning_rate": 3.935483870967742e-05,
"loss": 0.0535,
"step": 215
},
{
"epoch": 0.7854545454545454,
"grad_norm": 3.93450927734375,
"learning_rate": 3.9290322580645164e-05,
"loss": 0.1321,
"step": 216
},
{
"epoch": 0.7890909090909091,
"grad_norm": 4.761463165283203,
"learning_rate": 3.9225806451612905e-05,
"loss": 0.142,
"step": 217
},
{
"epoch": 0.7927272727272727,
"grad_norm": 1.0365723371505737,
"learning_rate": 3.9161290322580646e-05,
"loss": 0.0197,
"step": 218
},
{
"epoch": 0.7963636363636364,
"grad_norm": 2.633030414581299,
"learning_rate": 3.9096774193548394e-05,
"loss": 0.0375,
"step": 219
},
{
"epoch": 0.8,
"grad_norm": 1.4770023822784424,
"learning_rate": 3.903225806451613e-05,
"loss": 0.0251,
"step": 220
},
{
"epoch": 0.8036363636363636,
"grad_norm": 2.1788926124572754,
"learning_rate": 3.896774193548387e-05,
"loss": 0.0146,
"step": 221
},
{
"epoch": 0.8072727272727273,
"grad_norm": 1.1818387508392334,
"learning_rate": 3.890322580645162e-05,
"loss": 0.0307,
"step": 222
},
{
"epoch": 0.8109090909090909,
"grad_norm": 1.7780365943908691,
"learning_rate": 3.883870967741935e-05,
"loss": 0.0436,
"step": 223
},
{
"epoch": 0.8145454545454546,
"grad_norm": 0.3603725731372833,
"learning_rate": 3.87741935483871e-05,
"loss": 0.0086,
"step": 224
},
{
"epoch": 0.8181818181818182,
"grad_norm": 2.094778060913086,
"learning_rate": 3.870967741935484e-05,
"loss": 0.0535,
"step": 225
},
{
"epoch": 0.8218181818181818,
"grad_norm": 1.7837364673614502,
"learning_rate": 3.864516129032258e-05,
"loss": 0.0324,
"step": 226
},
{
"epoch": 0.8254545454545454,
"grad_norm": 1.0126112699508667,
"learning_rate": 3.8580645161290324e-05,
"loss": 0.0195,
"step": 227
},
{
"epoch": 0.8290909090909091,
"grad_norm": 2.095411777496338,
"learning_rate": 3.8516129032258065e-05,
"loss": 0.0858,
"step": 228
},
{
"epoch": 0.8327272727272728,
"grad_norm": 2.12591290473938,
"learning_rate": 3.845161290322581e-05,
"loss": 0.0745,
"step": 229
},
{
"epoch": 0.8363636363636363,
"grad_norm": 4.18102502822876,
"learning_rate": 3.838709677419355e-05,
"loss": 0.0876,
"step": 230
},
{
"epoch": 0.84,
"grad_norm": 2.6359128952026367,
"learning_rate": 3.8322580645161296e-05,
"loss": 0.0261,
"step": 231
},
{
"epoch": 0.8436363636363636,
"grad_norm": 2.9271247386932373,
"learning_rate": 3.825806451612903e-05,
"loss": 0.0432,
"step": 232
},
{
"epoch": 0.8472727272727273,
"grad_norm": 4.78306245803833,
"learning_rate": 3.819354838709678e-05,
"loss": 0.1117,
"step": 233
},
{
"epoch": 0.850909090909091,
"grad_norm": 0.5705394148826599,
"learning_rate": 3.812903225806452e-05,
"loss": 0.0054,
"step": 234
},
{
"epoch": 0.8545454545454545,
"grad_norm": 2.0522520542144775,
"learning_rate": 3.8064516129032254e-05,
"loss": 0.0245,
"step": 235
},
{
"epoch": 0.8581818181818182,
"grad_norm": 2.7212295532226562,
"learning_rate": 3.8e-05,
"loss": 0.0252,
"step": 236
},
{
"epoch": 0.8618181818181818,
"grad_norm": 0.7977845072746277,
"learning_rate": 3.793548387096774e-05,
"loss": 0.0127,
"step": 237
},
{
"epoch": 0.8654545454545455,
"grad_norm": 3.720811605453491,
"learning_rate": 3.7870967741935485e-05,
"loss": 0.0465,
"step": 238
},
{
"epoch": 0.8690909090909091,
"grad_norm": 2.362733840942383,
"learning_rate": 3.7806451612903226e-05,
"loss": 0.0266,
"step": 239
},
{
"epoch": 0.8727272727272727,
"grad_norm": 2.5481631755828857,
"learning_rate": 3.7741935483870974e-05,
"loss": 0.0224,
"step": 240
},
{
"epoch": 0.8763636363636363,
"grad_norm": 2.1781375408172607,
"learning_rate": 3.767741935483871e-05,
"loss": 0.0228,
"step": 241
},
{
"epoch": 0.88,
"grad_norm": 1.9818776845932007,
"learning_rate": 3.761290322580645e-05,
"loss": 0.0231,
"step": 242
},
{
"epoch": 0.8836363636363637,
"grad_norm": 0.20954543352127075,
"learning_rate": 3.75483870967742e-05,
"loss": 0.0031,
"step": 243
},
{
"epoch": 0.8872727272727273,
"grad_norm": 2.769566535949707,
"learning_rate": 3.748387096774193e-05,
"loss": 0.039,
"step": 244
},
{
"epoch": 0.8909090909090909,
"grad_norm": 2.511801242828369,
"learning_rate": 3.741935483870968e-05,
"loss": 0.0154,
"step": 245
},
{
"epoch": 0.8945454545454545,
"grad_norm": 0.3851822018623352,
"learning_rate": 3.735483870967742e-05,
"loss": 0.0047,
"step": 246
},
{
"epoch": 0.8981818181818182,
"grad_norm": 1.6585170030593872,
"learning_rate": 3.729032258064516e-05,
"loss": 0.0882,
"step": 247
},
{
"epoch": 0.9018181818181819,
"grad_norm": 1.9124457836151123,
"learning_rate": 3.7225806451612904e-05,
"loss": 0.0138,
"step": 248
},
{
"epoch": 0.9054545454545454,
"grad_norm": 5.743299961090088,
"learning_rate": 3.716129032258065e-05,
"loss": 0.0494,
"step": 249
},
{
"epoch": 0.9090909090909091,
"grad_norm": 3.5978081226348877,
"learning_rate": 3.7096774193548386e-05,
"loss": 0.0182,
"step": 250
},
{
"epoch": 0.9127272727272727,
"grad_norm": 0.024638742208480835,
"learning_rate": 3.703225806451613e-05,
"loss": 0.0012,
"step": 251
},
{
"epoch": 0.9163636363636364,
"grad_norm": 0.04621901735663414,
"learning_rate": 3.6967741935483876e-05,
"loss": 0.0014,
"step": 252
},
{
"epoch": 0.92,
"grad_norm": 0.06507572531700134,
"learning_rate": 3.690322580645162e-05,
"loss": 0.0018,
"step": 253
},
{
"epoch": 0.9236363636363636,
"grad_norm": 0.683228075504303,
"learning_rate": 3.683870967741936e-05,
"loss": 0.0064,
"step": 254
},
{
"epoch": 0.9272727272727272,
"grad_norm": 9.123976707458496,
"learning_rate": 3.67741935483871e-05,
"loss": 0.087,
"step": 255
},
{
"epoch": 0.9309090909090909,
"grad_norm": 0.9182856678962708,
"learning_rate": 3.670967741935484e-05,
"loss": 0.0891,
"step": 256
},
{
"epoch": 0.9345454545454546,
"grad_norm": 4.344281196594238,
"learning_rate": 3.664516129032258e-05,
"loss": 0.0301,
"step": 257
},
{
"epoch": 0.9381818181818182,
"grad_norm": 0.22626306116580963,
"learning_rate": 3.658064516129032e-05,
"loss": 0.0027,
"step": 258
},
{
"epoch": 0.9418181818181818,
"grad_norm": 2.9443519115448,
"learning_rate": 3.6516129032258064e-05,
"loss": 0.0826,
"step": 259
},
{
"epoch": 0.9454545454545454,
"grad_norm": 0.35616394877433777,
"learning_rate": 3.6451612903225805e-05,
"loss": 0.003,
"step": 260
},
{
"epoch": 0.9490909090909091,
"grad_norm": 5.846389293670654,
"learning_rate": 3.6387096774193553e-05,
"loss": 0.1148,
"step": 261
},
{
"epoch": 0.9527272727272728,
"grad_norm": 0.351965069770813,
"learning_rate": 3.6322580645161295e-05,
"loss": 0.0054,
"step": 262
},
{
"epoch": 0.9563636363636364,
"grad_norm": 1.9264423847198486,
"learning_rate": 3.6258064516129036e-05,
"loss": 0.0241,
"step": 263
},
{
"epoch": 0.96,
"grad_norm": 7.070173263549805,
"learning_rate": 3.619354838709678e-05,
"loss": 0.1209,
"step": 264
},
{
"epoch": 0.9636363636363636,
"grad_norm": 4.125431537628174,
"learning_rate": 3.612903225806452e-05,
"loss": 0.0451,
"step": 265
},
{
"epoch": 0.9672727272727273,
"grad_norm": 0.08269821852445602,
"learning_rate": 3.606451612903226e-05,
"loss": 0.0024,
"step": 266
},
{
"epoch": 0.9709090909090909,
"grad_norm": 2.2969746589660645,
"learning_rate": 3.6e-05,
"loss": 0.03,
"step": 267
},
{
"epoch": 0.9745454545454545,
"grad_norm": 3.2518858909606934,
"learning_rate": 3.593548387096774e-05,
"loss": 0.0242,
"step": 268
},
{
"epoch": 0.9781818181818182,
"grad_norm": 0.2708548903465271,
"learning_rate": 3.5870967741935483e-05,
"loss": 0.0041,
"step": 269
},
{
"epoch": 0.9818181818181818,
"grad_norm": 3.211353063583374,
"learning_rate": 3.580645161290323e-05,
"loss": 0.0195,
"step": 270
},
{
"epoch": 0.9854545454545455,
"grad_norm": 2.5400710105895996,
"learning_rate": 3.574193548387097e-05,
"loss": 0.023,
"step": 271
},
{
"epoch": 0.9890909090909091,
"grad_norm": 4.467423915863037,
"learning_rate": 3.567741935483871e-05,
"loss": 0.1522,
"step": 272
},
{
"epoch": 0.9927272727272727,
"grad_norm": 0.29972556233406067,
"learning_rate": 3.5612903225806455e-05,
"loss": 0.0033,
"step": 273
},
{
"epoch": 0.9963636363636363,
"grad_norm": 2.243041753768921,
"learning_rate": 3.5548387096774196e-05,
"loss": 0.0159,
"step": 274
},
{
"epoch": 1.0,
"grad_norm": 0.03446698188781738,
"learning_rate": 3.548387096774194e-05,
"loss": 0.0015,
"step": 275
},
{
"epoch": 1.0,
"eval_accuracy": 0.9890710382513661,
"eval_loss": 0.032855160534381866,
"eval_runtime": 12.8051,
"eval_samples_per_second": 342.989,
"eval_steps_per_second": 5.388,
"step": 275
},
{
"epoch": 1.0036363636363637,
"grad_norm": 0.1759863644838333,
"learning_rate": 3.541935483870968e-05,
"loss": 0.0025,
"step": 276
},
{
"epoch": 1.0072727272727273,
"grad_norm": 0.30346181988716125,
"learning_rate": 3.535483870967743e-05,
"loss": 0.0035,
"step": 277
},
{
"epoch": 1.010909090909091,
"grad_norm": 0.5249446630477905,
"learning_rate": 3.529032258064516e-05,
"loss": 0.0071,
"step": 278
},
{
"epoch": 1.0145454545454546,
"grad_norm": 2.6401705741882324,
"learning_rate": 3.52258064516129e-05,
"loss": 0.0947,
"step": 279
},
{
"epoch": 1.018181818181818,
"grad_norm": 1.9211256504058838,
"learning_rate": 3.516129032258065e-05,
"loss": 0.0223,
"step": 280
},
{
"epoch": 1.0218181818181817,
"grad_norm": 0.13240855932235718,
"learning_rate": 3.5096774193548385e-05,
"loss": 0.0024,
"step": 281
},
{
"epoch": 1.0254545454545454,
"grad_norm": 1.043957233428955,
"learning_rate": 3.503225806451613e-05,
"loss": 0.0082,
"step": 282
},
{
"epoch": 1.029090909090909,
"grad_norm": 1.6595826148986816,
"learning_rate": 3.4967741935483874e-05,
"loss": 0.0711,
"step": 283
},
{
"epoch": 1.0327272727272727,
"grad_norm": 0.50986248254776,
"learning_rate": 3.4903225806451616e-05,
"loss": 0.0043,
"step": 284
},
{
"epoch": 1.0363636363636364,
"grad_norm": 2.219081401824951,
"learning_rate": 3.483870967741936e-05,
"loss": 0.0776,
"step": 285
},
{
"epoch": 1.04,
"grad_norm": 0.7167057991027832,
"learning_rate": 3.47741935483871e-05,
"loss": 0.005,
"step": 286
},
{
"epoch": 1.0436363636363637,
"grad_norm": 0.13360421359539032,
"learning_rate": 3.470967741935484e-05,
"loss": 0.0023,
"step": 287
},
{
"epoch": 1.0472727272727274,
"grad_norm": 1.3663884401321411,
"learning_rate": 3.464516129032258e-05,
"loss": 0.0111,
"step": 288
},
{
"epoch": 1.050909090909091,
"grad_norm": 0.01989702135324478,
"learning_rate": 3.458064516129033e-05,
"loss": 0.001,
"step": 289
},
{
"epoch": 1.0545454545454545,
"grad_norm": 1.4087766408920288,
"learning_rate": 3.451612903225806e-05,
"loss": 0.014,
"step": 290
},
{
"epoch": 1.0581818181818181,
"grad_norm": 0.47182703018188477,
"learning_rate": 3.445161290322581e-05,
"loss": 0.0047,
"step": 291
},
{
"epoch": 1.0618181818181818,
"grad_norm": 0.7205504775047302,
"learning_rate": 3.438709677419355e-05,
"loss": 0.0045,
"step": 292
},
{
"epoch": 1.0654545454545454,
"grad_norm": 0.07702212780714035,
"learning_rate": 3.432258064516129e-05,
"loss": 0.002,
"step": 293
},
{
"epoch": 1.069090909090909,
"grad_norm": 0.44914644956588745,
"learning_rate": 3.4258064516129035e-05,
"loss": 0.0044,
"step": 294
},
{
"epoch": 1.0727272727272728,
"grad_norm": 0.26627811789512634,
"learning_rate": 3.4193548387096776e-05,
"loss": 0.0035,
"step": 295
},
{
"epoch": 1.0763636363636364,
"grad_norm": 1.8016518354415894,
"learning_rate": 3.412903225806452e-05,
"loss": 0.012,
"step": 296
},
{
"epoch": 1.08,
"grad_norm": 2.806971311569214,
"learning_rate": 3.406451612903226e-05,
"loss": 0.051,
"step": 297
},
{
"epoch": 1.0836363636363637,
"grad_norm": 0.03818153962492943,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.0014,
"step": 298
},
{
"epoch": 1.0872727272727274,
"grad_norm": 0.04983190819621086,
"learning_rate": 3.393548387096774e-05,
"loss": 0.0016,
"step": 299
},
{
"epoch": 1.0909090909090908,
"grad_norm": 0.05590420216321945,
"learning_rate": 3.387096774193548e-05,
"loss": 0.0013,
"step": 300
},
{
"epoch": 1.0945454545454545,
"grad_norm": 0.3262059986591339,
"learning_rate": 3.380645161290323e-05,
"loss": 0.0024,
"step": 301
},
{
"epoch": 1.0981818181818181,
"grad_norm": 0.10000384598970413,
"learning_rate": 3.3741935483870965e-05,
"loss": 0.0016,
"step": 302
},
{
"epoch": 1.1018181818181818,
"grad_norm": 1.12041437625885,
"learning_rate": 3.367741935483871e-05,
"loss": 0.0116,
"step": 303
},
{
"epoch": 1.1054545454545455,
"grad_norm": 3.5186989307403564,
"learning_rate": 3.3612903225806454e-05,
"loss": 0.0688,
"step": 304
},
{
"epoch": 1.1090909090909091,
"grad_norm": 0.599452018737793,
"learning_rate": 3.3548387096774195e-05,
"loss": 0.004,
"step": 305
},
{
"epoch": 1.1127272727272728,
"grad_norm": 4.202869415283203,
"learning_rate": 3.3483870967741936e-05,
"loss": 0.0139,
"step": 306
},
{
"epoch": 1.1163636363636364,
"grad_norm": 0.8411999344825745,
"learning_rate": 3.341935483870968e-05,
"loss": 0.0053,
"step": 307
},
{
"epoch": 1.12,
"grad_norm": 0.222166046500206,
"learning_rate": 3.335483870967742e-05,
"loss": 0.0016,
"step": 308
},
{
"epoch": 1.1236363636363635,
"grad_norm": 0.7113834023475647,
"learning_rate": 3.329032258064516e-05,
"loss": 0.0049,
"step": 309
},
{
"epoch": 1.1272727272727272,
"grad_norm": 0.022323666140437126,
"learning_rate": 3.322580645161291e-05,
"loss": 0.0008,
"step": 310
},
{
"epoch": 1.1309090909090909,
"grad_norm": 0.7203249335289001,
"learning_rate": 3.316129032258064e-05,
"loss": 0.0047,
"step": 311
},
{
"epoch": 1.1345454545454545,
"grad_norm": 0.20508606731891632,
"learning_rate": 3.309677419354839e-05,
"loss": 0.0016,
"step": 312
},
{
"epoch": 1.1381818181818182,
"grad_norm": 0.1757669299840927,
"learning_rate": 3.303225806451613e-05,
"loss": 0.0012,
"step": 313
},
{
"epoch": 1.1418181818181818,
"grad_norm": 0.6888485550880432,
"learning_rate": 3.296774193548387e-05,
"loss": 0.1091,
"step": 314
},
{
"epoch": 1.1454545454545455,
"grad_norm": 3.370866298675537,
"learning_rate": 3.2903225806451614e-05,
"loss": 0.0273,
"step": 315
},
{
"epoch": 1.1490909090909092,
"grad_norm": 2.7193377017974854,
"learning_rate": 3.2838709677419356e-05,
"loss": 0.1505,
"step": 316
},
{
"epoch": 1.1527272727272728,
"grad_norm": 0.01697608083486557,
"learning_rate": 3.27741935483871e-05,
"loss": 0.0007,
"step": 317
},
{
"epoch": 1.1563636363636363,
"grad_norm": 1.244520902633667,
"learning_rate": 3.270967741935484e-05,
"loss": 0.0076,
"step": 318
},
{
"epoch": 1.16,
"grad_norm": 0.18265090882778168,
"learning_rate": 3.2645161290322586e-05,
"loss": 0.0018,
"step": 319
},
{
"epoch": 1.1636363636363636,
"grad_norm": 0.03304216265678406,
"learning_rate": 3.258064516129033e-05,
"loss": 0.0008,
"step": 320
},
{
"epoch": 1.1672727272727272,
"grad_norm": 1.4162017107009888,
"learning_rate": 3.251612903225806e-05,
"loss": 0.0071,
"step": 321
},
{
"epoch": 1.170909090909091,
"grad_norm": 0.0734863430261612,
"learning_rate": 3.245161290322581e-05,
"loss": 0.0012,
"step": 322
},
{
"epoch": 1.1745454545454546,
"grad_norm": 4.381263256072998,
"learning_rate": 3.238709677419355e-05,
"loss": 0.0188,
"step": 323
},
{
"epoch": 1.1781818181818182,
"grad_norm": 2.71781325340271,
"learning_rate": 3.232258064516129e-05,
"loss": 0.0278,
"step": 324
},
{
"epoch": 1.1818181818181819,
"grad_norm": 0.1261809766292572,
"learning_rate": 3.2258064516129034e-05,
"loss": 0.0014,
"step": 325
},
{
"epoch": 1.1854545454545455,
"grad_norm": 2.4839842319488525,
"learning_rate": 3.2193548387096775e-05,
"loss": 0.072,
"step": 326
},
{
"epoch": 1.189090909090909,
"grad_norm": 0.1383955031633377,
"learning_rate": 3.2129032258064516e-05,
"loss": 0.0016,
"step": 327
},
{
"epoch": 1.1927272727272726,
"grad_norm": 3.2481300830841064,
"learning_rate": 3.206451612903226e-05,
"loss": 0.0301,
"step": 328
},
{
"epoch": 1.1963636363636363,
"grad_norm": 4.403379440307617,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.0323,
"step": 329
},
{
"epoch": 1.2,
"grad_norm": 0.015226184390485287,
"learning_rate": 3.193548387096774e-05,
"loss": 0.0007,
"step": 330
},
{
"epoch": 1.2036363636363636,
"grad_norm": 0.37973034381866455,
"learning_rate": 3.187096774193549e-05,
"loss": 0.0028,
"step": 331
},
{
"epoch": 1.2072727272727273,
"grad_norm": 6.127589225769043,
"learning_rate": 3.180645161290323e-05,
"loss": 0.0691,
"step": 332
},
{
"epoch": 1.210909090909091,
"grad_norm": 0.8358224034309387,
"learning_rate": 3.174193548387097e-05,
"loss": 0.1013,
"step": 333
},
{
"epoch": 1.2145454545454546,
"grad_norm": 0.036338452249765396,
"learning_rate": 3.167741935483871e-05,
"loss": 0.0011,
"step": 334
},
{
"epoch": 1.2181818181818183,
"grad_norm": 0.02295631729066372,
"learning_rate": 3.161290322580645e-05,
"loss": 0.0009,
"step": 335
},
{
"epoch": 1.221818181818182,
"grad_norm": 0.05319954827427864,
"learning_rate": 3.1548387096774194e-05,
"loss": 0.0012,
"step": 336
},
{
"epoch": 1.2254545454545456,
"grad_norm": 6.075497150421143,
"learning_rate": 3.1483870967741935e-05,
"loss": 0.1689,
"step": 337
},
{
"epoch": 1.229090909090909,
"grad_norm": 0.02964276447892189,
"learning_rate": 3.141935483870968e-05,
"loss": 0.0012,
"step": 338
},
{
"epoch": 1.2327272727272727,
"grad_norm": 3.0539228916168213,
"learning_rate": 3.135483870967742e-05,
"loss": 0.0269,
"step": 339
},
{
"epoch": 1.2363636363636363,
"grad_norm": 0.04519687220454216,
"learning_rate": 3.1290322580645166e-05,
"loss": 0.0017,
"step": 340
},
{
"epoch": 1.24,
"grad_norm": 0.10507076978683472,
"learning_rate": 3.122580645161291e-05,
"loss": 0.0027,
"step": 341
},
{
"epoch": 1.2436363636363637,
"grad_norm": 3.865663766860962,
"learning_rate": 3.116129032258064e-05,
"loss": 0.0435,
"step": 342
},
{
"epoch": 1.2472727272727273,
"grad_norm": 1.0297927856445312,
"learning_rate": 3.109677419354839e-05,
"loss": 0.0093,
"step": 343
},
{
"epoch": 1.250909090909091,
"grad_norm": 1.1631922721862793,
"learning_rate": 3.103225806451613e-05,
"loss": 0.0073,
"step": 344
},
{
"epoch": 1.2545454545454544,
"grad_norm": 0.11844321340322495,
"learning_rate": 3.096774193548387e-05,
"loss": 0.0022,
"step": 345
},
{
"epoch": 1.2581818181818183,
"grad_norm": 0.6237489581108093,
"learning_rate": 3.090322580645161e-05,
"loss": 0.004,
"step": 346
},
{
"epoch": 1.2618181818181817,
"grad_norm": 0.883538544178009,
"learning_rate": 3.083870967741936e-05,
"loss": 0.0056,
"step": 347
},
{
"epoch": 1.2654545454545454,
"grad_norm": 0.5708529949188232,
"learning_rate": 3.0774193548387096e-05,
"loss": 0.0051,
"step": 348
},
{
"epoch": 1.269090909090909,
"grad_norm": 0.3859453797340393,
"learning_rate": 3.070967741935484e-05,
"loss": 0.0026,
"step": 349
},
{
"epoch": 1.2727272727272727,
"grad_norm": 0.07389602810144424,
"learning_rate": 3.0645161290322585e-05,
"loss": 0.0018,
"step": 350
},
{
"epoch": 1.2763636363636364,
"grad_norm": 5.179856300354004,
"learning_rate": 3.058064516129032e-05,
"loss": 0.1931,
"step": 351
},
{
"epoch": 1.28,
"grad_norm": 1.7609326839447021,
"learning_rate": 3.0516129032258067e-05,
"loss": 0.0627,
"step": 352
},
{
"epoch": 1.2836363636363637,
"grad_norm": 2.443671464920044,
"learning_rate": 3.0451612903225805e-05,
"loss": 0.0098,
"step": 353
},
{
"epoch": 1.2872727272727273,
"grad_norm": 0.024096990004181862,
"learning_rate": 3.0387096774193553e-05,
"loss": 0.001,
"step": 354
},
{
"epoch": 1.290909090909091,
"grad_norm": 0.03544619679450989,
"learning_rate": 3.032258064516129e-05,
"loss": 0.0013,
"step": 355
},
{
"epoch": 1.2945454545454544,
"grad_norm": 0.8081026673316956,
"learning_rate": 3.0258064516129032e-05,
"loss": 0.0049,
"step": 356
},
{
"epoch": 1.298181818181818,
"grad_norm": 0.13682828843593597,
"learning_rate": 3.0193548387096777e-05,
"loss": 0.0025,
"step": 357
},
{
"epoch": 1.3018181818181818,
"grad_norm": 0.04892565682530403,
"learning_rate": 3.0129032258064515e-05,
"loss": 0.0012,
"step": 358
},
{
"epoch": 1.3054545454545454,
"grad_norm": 0.04417359083890915,
"learning_rate": 3.006451612903226e-05,
"loss": 0.0009,
"step": 359
},
{
"epoch": 1.309090909090909,
"grad_norm": 1.1065845489501953,
"learning_rate": 3e-05,
"loss": 0.0099,
"step": 360
},
{
"epoch": 1.3127272727272727,
"grad_norm": 1.6691566705703735,
"learning_rate": 2.9935483870967745e-05,
"loss": 0.0704,
"step": 361
},
{
"epoch": 1.3163636363636364,
"grad_norm": 0.744577944278717,
"learning_rate": 2.9870967741935487e-05,
"loss": 0.0546,
"step": 362
},
{
"epoch": 1.32,
"grad_norm": 1.7217384576797485,
"learning_rate": 2.9806451612903224e-05,
"loss": 0.11,
"step": 363
},
{
"epoch": 1.3236363636363637,
"grad_norm": 0.28095120191574097,
"learning_rate": 2.974193548387097e-05,
"loss": 0.0035,
"step": 364
},
{
"epoch": 1.3272727272727272,
"grad_norm": 0.25743359327316284,
"learning_rate": 2.967741935483871e-05,
"loss": 0.0026,
"step": 365
},
{
"epoch": 1.330909090909091,
"grad_norm": 0.04120393097400665,
"learning_rate": 2.9612903225806455e-05,
"loss": 0.0012,
"step": 366
},
{
"epoch": 1.3345454545454545,
"grad_norm": 0.08902228623628616,
"learning_rate": 2.9548387096774193e-05,
"loss": 0.0026,
"step": 367
},
{
"epoch": 1.3381818181818181,
"grad_norm": 2.2871615886688232,
"learning_rate": 2.9483870967741937e-05,
"loss": 0.0267,
"step": 368
},
{
"epoch": 1.3418181818181818,
"grad_norm": 0.9016657471656799,
"learning_rate": 2.941935483870968e-05,
"loss": 0.127,
"step": 369
},
{
"epoch": 1.3454545454545455,
"grad_norm": 3.976222276687622,
"learning_rate": 2.9354838709677417e-05,
"loss": 0.0278,
"step": 370
},
{
"epoch": 1.3490909090909091,
"grad_norm": 0.11579588800668716,
"learning_rate": 2.9290322580645165e-05,
"loss": 0.0027,
"step": 371
},
{
"epoch": 1.3527272727272728,
"grad_norm": 2.413121223449707,
"learning_rate": 2.9225806451612902e-05,
"loss": 0.0107,
"step": 372
},
{
"epoch": 1.3563636363636364,
"grad_norm": 1.4316357374191284,
"learning_rate": 2.9161290322580647e-05,
"loss": 0.0093,
"step": 373
},
{
"epoch": 1.3599999999999999,
"grad_norm": 0.041745781898498535,
"learning_rate": 2.909677419354839e-05,
"loss": 0.0015,
"step": 374
},
{
"epoch": 1.3636363636363638,
"grad_norm": 0.59097820520401,
"learning_rate": 2.9032258064516133e-05,
"loss": 0.0034,
"step": 375
},
{
"epoch": 1.3672727272727272,
"grad_norm": 0.06494897603988647,
"learning_rate": 2.896774193548387e-05,
"loss": 0.0015,
"step": 376
},
{
"epoch": 1.3709090909090909,
"grad_norm": 3.1881821155548096,
"learning_rate": 2.8903225806451615e-05,
"loss": 0.0568,
"step": 377
},
{
"epoch": 1.3745454545454545,
"grad_norm": 0.9627525210380554,
"learning_rate": 2.8838709677419357e-05,
"loss": 0.0067,
"step": 378
},
{
"epoch": 1.3781818181818182,
"grad_norm": 2.6984145641326904,
"learning_rate": 2.8774193548387095e-05,
"loss": 0.0655,
"step": 379
},
{
"epoch": 1.3818181818181818,
"grad_norm": 1.9964426755905151,
"learning_rate": 2.8709677419354843e-05,
"loss": 0.0546,
"step": 380
},
{
"epoch": 1.3854545454545455,
"grad_norm": 0.019922640174627304,
"learning_rate": 2.864516129032258e-05,
"loss": 0.001,
"step": 381
},
{
"epoch": 1.3890909090909092,
"grad_norm": 3.0499842166900635,
"learning_rate": 2.8580645161290325e-05,
"loss": 0.0726,
"step": 382
},
{
"epoch": 1.3927272727272726,
"grad_norm": 0.20027759671211243,
"learning_rate": 2.8516129032258066e-05,
"loss": 0.003,
"step": 383
},
{
"epoch": 1.3963636363636365,
"grad_norm": 0.549941897392273,
"learning_rate": 2.845161290322581e-05,
"loss": 0.1096,
"step": 384
},
{
"epoch": 1.4,
"grad_norm": 5.2639946937561035,
"learning_rate": 2.838709677419355e-05,
"loss": 0.0766,
"step": 385
},
{
"epoch": 1.4036363636363636,
"grad_norm": 0.5863090753555298,
"learning_rate": 2.832258064516129e-05,
"loss": 0.0076,
"step": 386
},
{
"epoch": 1.4072727272727272,
"grad_norm": 0.020899731665849686,
"learning_rate": 2.8258064516129035e-05,
"loss": 0.001,
"step": 387
},
{
"epoch": 1.410909090909091,
"grad_norm": 0.2003995031118393,
"learning_rate": 2.8193548387096776e-05,
"loss": 0.0031,
"step": 388
},
{
"epoch": 1.4145454545454546,
"grad_norm": 2.7366487979888916,
"learning_rate": 2.812903225806452e-05,
"loss": 0.052,
"step": 389
},
{
"epoch": 1.4181818181818182,
"grad_norm": 0.03661293536424637,
"learning_rate": 2.806451612903226e-05,
"loss": 0.0014,
"step": 390
},
{
"epoch": 1.4218181818181819,
"grad_norm": 1.4454936981201172,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.0122,
"step": 391
},
{
"epoch": 1.4254545454545455,
"grad_norm": 0.03340213745832443,
"learning_rate": 2.7935483870967744e-05,
"loss": 0.0015,
"step": 392
},
{
"epoch": 1.4290909090909092,
"grad_norm": 0.654367983341217,
"learning_rate": 2.7870967741935482e-05,
"loss": 0.0069,
"step": 393
},
{
"epoch": 1.4327272727272726,
"grad_norm": 0.08472099900245667,
"learning_rate": 2.7806451612903227e-05,
"loss": 0.0018,
"step": 394
},
{
"epoch": 1.4363636363636363,
"grad_norm": 3.4286415576934814,
"learning_rate": 2.7741935483870968e-05,
"loss": 0.0662,
"step": 395
},
{
"epoch": 1.44,
"grad_norm": 1.312857747077942,
"learning_rate": 2.7677419354838713e-05,
"loss": 0.014,
"step": 396
},
{
"epoch": 1.4436363636363636,
"grad_norm": 0.035416845232248306,
"learning_rate": 2.7612903225806454e-05,
"loss": 0.0013,
"step": 397
},
{
"epoch": 1.4472727272727273,
"grad_norm": 4.134556770324707,
"learning_rate": 2.75483870967742e-05,
"loss": 0.0354,
"step": 398
},
{
"epoch": 1.450909090909091,
"grad_norm": 0.43342649936676025,
"learning_rate": 2.7483870967741936e-05,
"loss": 0.0053,
"step": 399
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.270112007856369,
"learning_rate": 2.7419354838709678e-05,
"loss": 0.0029,
"step": 400
},
{
"epoch": 1.4581818181818182,
"grad_norm": 0.039803147315979004,
"learning_rate": 2.7354838709677422e-05,
"loss": 0.0017,
"step": 401
},
{
"epoch": 1.461818181818182,
"grad_norm": 2.7924137115478516,
"learning_rate": 2.729032258064516e-05,
"loss": 0.0366,
"step": 402
},
{
"epoch": 1.4654545454545453,
"grad_norm": 0.29749271273612976,
"learning_rate": 2.7225806451612905e-05,
"loss": 0.0026,
"step": 403
},
{
"epoch": 1.4690909090909092,
"grad_norm": 1.2988048791885376,
"learning_rate": 2.7161290322580646e-05,
"loss": 0.0083,
"step": 404
},
{
"epoch": 1.4727272727272727,
"grad_norm": 0.07334749400615692,
"learning_rate": 2.709677419354839e-05,
"loss": 0.0019,
"step": 405
},
{
"epoch": 1.4763636363636363,
"grad_norm": 0.04504287615418434,
"learning_rate": 2.7032258064516132e-05,
"loss": 0.0018,
"step": 406
},
{
"epoch": 1.48,
"grad_norm": 0.08913391828536987,
"learning_rate": 2.696774193548387e-05,
"loss": 0.0018,
"step": 407
},
{
"epoch": 1.4836363636363636,
"grad_norm": 0.20501653850078583,
"learning_rate": 2.6903225806451614e-05,
"loss": 0.0022,
"step": 408
},
{
"epoch": 1.4872727272727273,
"grad_norm": 1.9350942373275757,
"learning_rate": 2.6838709677419355e-05,
"loss": 0.0097,
"step": 409
},
{
"epoch": 1.490909090909091,
"grad_norm": 4.018691062927246,
"learning_rate": 2.67741935483871e-05,
"loss": 0.0256,
"step": 410
},
{
"epoch": 1.4945454545454546,
"grad_norm": 0.02390647679567337,
"learning_rate": 2.6709677419354838e-05,
"loss": 0.001,
"step": 411
},
{
"epoch": 1.498181818181818,
"grad_norm": 2.684476375579834,
"learning_rate": 2.6645161290322586e-05,
"loss": 0.0146,
"step": 412
},
{
"epoch": 1.501818181818182,
"grad_norm": 0.021060334518551826,
"learning_rate": 2.6580645161290324e-05,
"loss": 0.0009,
"step": 413
},
{
"epoch": 1.5054545454545454,
"grad_norm": 0.4146246910095215,
"learning_rate": 2.6516129032258065e-05,
"loss": 0.0025,
"step": 414
},
{
"epoch": 1.509090909090909,
"grad_norm": 3.564082384109497,
"learning_rate": 2.645161290322581e-05,
"loss": 0.0188,
"step": 415
},
{
"epoch": 1.5127272727272727,
"grad_norm": 0.027183400467038155,
"learning_rate": 2.6387096774193548e-05,
"loss": 0.0012,
"step": 416
},
{
"epoch": 1.5163636363636364,
"grad_norm": 0.4653225541114807,
"learning_rate": 2.6322580645161292e-05,
"loss": 0.0055,
"step": 417
},
{
"epoch": 1.52,
"grad_norm": 0.2512191832065582,
"learning_rate": 2.6258064516129033e-05,
"loss": 0.0015,
"step": 418
},
{
"epoch": 1.5236363636363637,
"grad_norm": 2.2450575828552246,
"learning_rate": 2.6193548387096778e-05,
"loss": 0.0209,
"step": 419
},
{
"epoch": 1.5272727272727273,
"grad_norm": 2.3167543411254883,
"learning_rate": 2.6129032258064516e-05,
"loss": 0.0544,
"step": 420
},
{
"epoch": 1.5309090909090908,
"grad_norm": 0.03268599510192871,
"learning_rate": 2.6064516129032257e-05,
"loss": 0.0011,
"step": 421
},
{
"epoch": 1.5345454545454547,
"grad_norm": 0.028883187100291252,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.001,
"step": 422
},
{
"epoch": 1.538181818181818,
"grad_norm": 0.1544177383184433,
"learning_rate": 2.5935483870967743e-05,
"loss": 0.0022,
"step": 423
},
{
"epoch": 1.541818181818182,
"grad_norm": 2.9123668670654297,
"learning_rate": 2.5870967741935488e-05,
"loss": 0.0187,
"step": 424
},
{
"epoch": 1.5454545454545454,
"grad_norm": 0.15737684071063995,
"learning_rate": 2.5806451612903226e-05,
"loss": 0.0021,
"step": 425
},
{
"epoch": 1.549090909090909,
"grad_norm": 0.023125503212213516,
"learning_rate": 2.574193548387097e-05,
"loss": 0.0009,
"step": 426
},
{
"epoch": 1.5527272727272727,
"grad_norm": 5.130437850952148,
"learning_rate": 2.567741935483871e-05,
"loss": 0.1732,
"step": 427
},
{
"epoch": 1.5563636363636364,
"grad_norm": 0.017136206850409508,
"learning_rate": 2.561290322580645e-05,
"loss": 0.0008,
"step": 428
},
{
"epoch": 1.56,
"grad_norm": 2.5280985832214355,
"learning_rate": 2.5548387096774197e-05,
"loss": 0.0143,
"step": 429
},
{
"epoch": 1.5636363636363635,
"grad_norm": 0.2261432558298111,
"learning_rate": 2.5483870967741935e-05,
"loss": 0.0029,
"step": 430
},
{
"epoch": 1.5672727272727274,
"grad_norm": 0.018230870366096497,
"learning_rate": 2.541935483870968e-05,
"loss": 0.0008,
"step": 431
},
{
"epoch": 1.5709090909090908,
"grad_norm": 0.12864048779010773,
"learning_rate": 2.535483870967742e-05,
"loss": 0.0016,
"step": 432
},
{
"epoch": 1.5745454545454547,
"grad_norm": 0.03463654965162277,
"learning_rate": 2.5290322580645166e-05,
"loss": 0.0009,
"step": 433
},
{
"epoch": 1.5781818181818181,
"grad_norm": 0.04040815308690071,
"learning_rate": 2.5225806451612903e-05,
"loss": 0.0009,
"step": 434
},
{
"epoch": 1.5818181818181818,
"grad_norm": 6.010333061218262,
"learning_rate": 2.5161290322580645e-05,
"loss": 0.0476,
"step": 435
},
{
"epoch": 1.5854545454545454,
"grad_norm": 2.538048505783081,
"learning_rate": 2.509677419354839e-05,
"loss": 0.0706,
"step": 436
},
{
"epoch": 1.589090909090909,
"grad_norm": 0.9795745015144348,
"learning_rate": 2.5032258064516127e-05,
"loss": 0.0043,
"step": 437
},
{
"epoch": 1.5927272727272728,
"grad_norm": 0.07065194100141525,
"learning_rate": 2.4967741935483872e-05,
"loss": 0.0011,
"step": 438
},
{
"epoch": 1.5963636363636362,
"grad_norm": 2.8576443195343018,
"learning_rate": 2.4903225806451613e-05,
"loss": 0.0471,
"step": 439
},
{
"epoch": 1.6,
"grad_norm": 2.197402238845825,
"learning_rate": 2.4838709677419354e-05,
"loss": 0.0809,
"step": 440
},
{
"epoch": 1.6036363636363635,
"grad_norm": 0.3648858368396759,
"learning_rate": 2.47741935483871e-05,
"loss": 0.0038,
"step": 441
},
{
"epoch": 1.6072727272727274,
"grad_norm": 0.1489875763654709,
"learning_rate": 2.470967741935484e-05,
"loss": 0.0017,
"step": 442
},
{
"epoch": 1.6109090909090908,
"grad_norm": 0.07092121988534927,
"learning_rate": 2.464516129032258e-05,
"loss": 0.0014,
"step": 443
},
{
"epoch": 1.6145454545454545,
"grad_norm": 2.7964096069335938,
"learning_rate": 2.4580645161290326e-05,
"loss": 0.0509,
"step": 444
},
{
"epoch": 1.6181818181818182,
"grad_norm": 1.7272242307662964,
"learning_rate": 2.4516129032258064e-05,
"loss": 0.0067,
"step": 445
},
{
"epoch": 1.6218181818181818,
"grad_norm": 0.06453117728233337,
"learning_rate": 2.4451612903225805e-05,
"loss": 0.0016,
"step": 446
},
{
"epoch": 1.6254545454545455,
"grad_norm": 3.195802927017212,
"learning_rate": 2.438709677419355e-05,
"loss": 0.0547,
"step": 447
},
{
"epoch": 1.6290909090909091,
"grad_norm": 0.01947391778230667,
"learning_rate": 2.432258064516129e-05,
"loss": 0.0008,
"step": 448
},
{
"epoch": 1.6327272727272728,
"grad_norm": 0.5001751780509949,
"learning_rate": 2.4258064516129032e-05,
"loss": 0.004,
"step": 449
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.4999580383300781,
"learning_rate": 2.4193548387096777e-05,
"loss": 0.0057,
"step": 450
},
{
"epoch": 1.6400000000000001,
"grad_norm": 1.5922938585281372,
"learning_rate": 2.4129032258064518e-05,
"loss": 0.0116,
"step": 451
},
{
"epoch": 1.6436363636363636,
"grad_norm": 2.461899518966675,
"learning_rate": 2.406451612903226e-05,
"loss": 0.018,
"step": 452
},
{
"epoch": 1.6472727272727272,
"grad_norm": 0.07266916334629059,
"learning_rate": 2.4e-05,
"loss": 0.0014,
"step": 453
},
{
"epoch": 1.6509090909090909,
"grad_norm": 1.8282543420791626,
"learning_rate": 2.3935483870967742e-05,
"loss": 0.0841,
"step": 454
},
{
"epoch": 1.6545454545454545,
"grad_norm": 0.06280002743005753,
"learning_rate": 2.3870967741935486e-05,
"loss": 0.0013,
"step": 455
},
{
"epoch": 1.6581818181818182,
"grad_norm": 0.06250961124897003,
"learning_rate": 2.3806451612903228e-05,
"loss": 0.0016,
"step": 456
},
{
"epoch": 1.6618181818181819,
"grad_norm": 0.35135146975517273,
"learning_rate": 2.374193548387097e-05,
"loss": 0.0253,
"step": 457
},
{
"epoch": 1.6654545454545455,
"grad_norm": 0.0423726923763752,
"learning_rate": 2.367741935483871e-05,
"loss": 0.0009,
"step": 458
},
{
"epoch": 1.669090909090909,
"grad_norm": 1.339455246925354,
"learning_rate": 2.361290322580645e-05,
"loss": 0.0794,
"step": 459
},
{
"epoch": 1.6727272727272728,
"grad_norm": 0.1556975394487381,
"learning_rate": 2.3548387096774193e-05,
"loss": 0.0018,
"step": 460
},
{
"epoch": 1.6763636363636363,
"grad_norm": 1.5904016494750977,
"learning_rate": 2.3483870967741937e-05,
"loss": 0.0036,
"step": 461
},
{
"epoch": 1.6800000000000002,
"grad_norm": 0.6247179508209229,
"learning_rate": 2.341935483870968e-05,
"loss": 0.0037,
"step": 462
},
{
"epoch": 1.6836363636363636,
"grad_norm": 1.879459023475647,
"learning_rate": 2.335483870967742e-05,
"loss": 0.0099,
"step": 463
},
{
"epoch": 1.6872727272727273,
"grad_norm": 0.07403961569070816,
"learning_rate": 2.3290322580645164e-05,
"loss": 0.0016,
"step": 464
},
{
"epoch": 1.690909090909091,
"grad_norm": 0.05183988809585571,
"learning_rate": 2.3225806451612906e-05,
"loss": 0.0012,
"step": 465
},
{
"epoch": 1.6945454545454546,
"grad_norm": 5.193166255950928,
"learning_rate": 2.3161290322580644e-05,
"loss": 0.0709,
"step": 466
},
{
"epoch": 1.6981818181818182,
"grad_norm": 1.2823094129562378,
"learning_rate": 2.3096774193548388e-05,
"loss": 0.0195,
"step": 467
},
{
"epoch": 1.7018181818181817,
"grad_norm": 2.308457851409912,
"learning_rate": 2.303225806451613e-05,
"loss": 0.0082,
"step": 468
},
{
"epoch": 1.7054545454545456,
"grad_norm": 0.39891281723976135,
"learning_rate": 2.296774193548387e-05,
"loss": 0.0036,
"step": 469
},
{
"epoch": 1.709090909090909,
"grad_norm": 0.03392359986901283,
"learning_rate": 2.2903225806451615e-05,
"loss": 0.0012,
"step": 470
},
{
"epoch": 1.7127272727272729,
"grad_norm": 0.01698954403400421,
"learning_rate": 2.2838709677419357e-05,
"loss": 0.0007,
"step": 471
},
{
"epoch": 1.7163636363636363,
"grad_norm": 0.059636473655700684,
"learning_rate": 2.2774193548387098e-05,
"loss": 0.0016,
"step": 472
},
{
"epoch": 1.72,
"grad_norm": 0.07056300342082977,
"learning_rate": 2.2709677419354842e-05,
"loss": 0.0017,
"step": 473
},
{
"epoch": 1.7236363636363636,
"grad_norm": 0.1932046115398407,
"learning_rate": 2.264516129032258e-05,
"loss": 0.0029,
"step": 474
},
{
"epoch": 1.7272727272727273,
"grad_norm": 2.354381561279297,
"learning_rate": 2.258064516129032e-05,
"loss": 0.0141,
"step": 475
},
{
"epoch": 1.730909090909091,
"grad_norm": 0.17981848120689392,
"learning_rate": 2.2516129032258066e-05,
"loss": 0.002,
"step": 476
},
{
"epoch": 1.7345454545454544,
"grad_norm": 0.03439001739025116,
"learning_rate": 2.2451612903225807e-05,
"loss": 0.0008,
"step": 477
},
{
"epoch": 1.7381818181818183,
"grad_norm": 1.2802492380142212,
"learning_rate": 2.238709677419355e-05,
"loss": 0.0143,
"step": 478
},
{
"epoch": 1.7418181818181817,
"grad_norm": 0.36729562282562256,
"learning_rate": 2.2322580645161293e-05,
"loss": 0.0085,
"step": 479
},
{
"epoch": 1.7454545454545456,
"grad_norm": 0.03446509316563606,
"learning_rate": 2.2258064516129034e-05,
"loss": 0.0009,
"step": 480
},
{
"epoch": 1.749090909090909,
"grad_norm": 0.05642567202448845,
"learning_rate": 2.2193548387096776e-05,
"loss": 0.0014,
"step": 481
},
{
"epoch": 1.7527272727272727,
"grad_norm": 0.12370045483112335,
"learning_rate": 2.2129032258064517e-05,
"loss": 0.0016,
"step": 482
},
{
"epoch": 1.7563636363636363,
"grad_norm": 0.038627080619335175,
"learning_rate": 2.2064516129032258e-05,
"loss": 0.0011,
"step": 483
},
{
"epoch": 1.76,
"grad_norm": 0.037693917751312256,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.0007,
"step": 484
},
{
"epoch": 1.7636363636363637,
"grad_norm": 1.4172790050506592,
"learning_rate": 2.1935483870967744e-05,
"loss": 0.0865,
"step": 485
},
{
"epoch": 1.767272727272727,
"grad_norm": 2.643702268600464,
"learning_rate": 2.1870967741935485e-05,
"loss": 0.0205,
"step": 486
},
{
"epoch": 1.770909090909091,
"grad_norm": 3.630894899368286,
"learning_rate": 2.1806451612903227e-05,
"loss": 0.0227,
"step": 487
},
{
"epoch": 1.7745454545454544,
"grad_norm": 0.11117129772901535,
"learning_rate": 2.1741935483870968e-05,
"loss": 0.0014,
"step": 488
},
{
"epoch": 1.7781818181818183,
"grad_norm": 0.041525308042764664,
"learning_rate": 2.167741935483871e-05,
"loss": 0.001,
"step": 489
},
{
"epoch": 1.7818181818181817,
"grad_norm": 0.8061334490776062,
"learning_rate": 2.1612903225806454e-05,
"loss": 0.1222,
"step": 490
},
{
"epoch": 1.7854545454545454,
"grad_norm": 0.03137822821736336,
"learning_rate": 2.1548387096774195e-05,
"loss": 0.0008,
"step": 491
},
{
"epoch": 1.789090909090909,
"grad_norm": 0.44713956117630005,
"learning_rate": 2.1483870967741936e-05,
"loss": 0.0023,
"step": 492
},
{
"epoch": 1.7927272727272727,
"grad_norm": 0.43540823459625244,
"learning_rate": 2.141935483870968e-05,
"loss": 0.0035,
"step": 493
},
{
"epoch": 1.7963636363636364,
"grad_norm": 0.1677086055278778,
"learning_rate": 2.1354838709677422e-05,
"loss": 0.002,
"step": 494
},
{
"epoch": 1.8,
"grad_norm": 0.30730533599853516,
"learning_rate": 2.129032258064516e-05,
"loss": 0.0044,
"step": 495
},
{
"epoch": 1.8036363636363637,
"grad_norm": 0.14470332860946655,
"learning_rate": 2.1225806451612904e-05,
"loss": 0.0018,
"step": 496
},
{
"epoch": 1.8072727272727271,
"grad_norm": 0.05039800703525543,
"learning_rate": 2.1161290322580646e-05,
"loss": 0.0011,
"step": 497
},
{
"epoch": 1.810909090909091,
"grad_norm": 0.15073451399803162,
"learning_rate": 2.1096774193548387e-05,
"loss": 0.0018,
"step": 498
},
{
"epoch": 1.8145454545454545,
"grad_norm": 2.081968307495117,
"learning_rate": 2.103225806451613e-05,
"loss": 0.0765,
"step": 499
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.030230529606342316,
"learning_rate": 2.0967741935483873e-05,
"loss": 0.001,
"step": 500
},
{
"epoch": 1.8218181818181818,
"grad_norm": 0.2655714750289917,
"learning_rate": 2.0903225806451614e-05,
"loss": 0.0021,
"step": 501
},
{
"epoch": 1.8254545454545454,
"grad_norm": 0.15943261981010437,
"learning_rate": 2.0838709677419355e-05,
"loss": 0.0015,
"step": 502
},
{
"epoch": 1.829090909090909,
"grad_norm": 4.7471489906311035,
"learning_rate": 2.0774193548387097e-05,
"loss": 0.1086,
"step": 503
},
{
"epoch": 1.8327272727272728,
"grad_norm": 0.1162559986114502,
"learning_rate": 2.0709677419354838e-05,
"loss": 0.0016,
"step": 504
},
{
"epoch": 1.8363636363636364,
"grad_norm": 0.0627504363656044,
"learning_rate": 2.0645161290322582e-05,
"loss": 0.0013,
"step": 505
},
{
"epoch": 1.8399999999999999,
"grad_norm": 0.5027517080307007,
"learning_rate": 2.0580645161290324e-05,
"loss": 0.0035,
"step": 506
},
{
"epoch": 1.8436363636363637,
"grad_norm": 0.02140502817928791,
"learning_rate": 2.0516129032258065e-05,
"loss": 0.0008,
"step": 507
},
{
"epoch": 1.8472727272727272,
"grad_norm": 0.07203751057386398,
"learning_rate": 2.045161290322581e-05,
"loss": 0.0018,
"step": 508
},
{
"epoch": 1.850909090909091,
"grad_norm": 1.8638368844985962,
"learning_rate": 2.0387096774193547e-05,
"loss": 0.0594,
"step": 509
},
{
"epoch": 1.8545454545454545,
"grad_norm": 0.12549816071987152,
"learning_rate": 2.0322580645161292e-05,
"loss": 0.0019,
"step": 510
},
{
"epoch": 1.8581818181818182,
"grad_norm": 0.5917963981628418,
"learning_rate": 2.0258064516129033e-05,
"loss": 0.0044,
"step": 511
},
{
"epoch": 1.8618181818181818,
"grad_norm": 0.013607682660222054,
"learning_rate": 2.0193548387096775e-05,
"loss": 0.0007,
"step": 512
},
{
"epoch": 1.8654545454545455,
"grad_norm": 0.01925772987306118,
"learning_rate": 2.0129032258064516e-05,
"loss": 0.0007,
"step": 513
},
{
"epoch": 1.8690909090909091,
"grad_norm": 0.023444948717951775,
"learning_rate": 2.006451612903226e-05,
"loss": 0.0008,
"step": 514
},
{
"epoch": 1.8727272727272726,
"grad_norm": 0.19968028366565704,
"learning_rate": 2e-05,
"loss": 0.0018,
"step": 515
},
{
"epoch": 1.8763636363636365,
"grad_norm": 1.308447003364563,
"learning_rate": 1.9935483870967743e-05,
"loss": 0.0883,
"step": 516
},
{
"epoch": 1.88,
"grad_norm": 0.20825034379959106,
"learning_rate": 1.9870967741935484e-05,
"loss": 0.0022,
"step": 517
},
{
"epoch": 1.8836363636363638,
"grad_norm": 0.06526435911655426,
"learning_rate": 1.9806451612903225e-05,
"loss": 0.0014,
"step": 518
},
{
"epoch": 1.8872727272727272,
"grad_norm": 0.1210133358836174,
"learning_rate": 1.974193548387097e-05,
"loss": 0.0009,
"step": 519
},
{
"epoch": 1.8909090909090909,
"grad_norm": 1.0739092826843262,
"learning_rate": 1.967741935483871e-05,
"loss": 0.1127,
"step": 520
},
{
"epoch": 1.8945454545454545,
"grad_norm": 0.018074801191687584,
"learning_rate": 1.9612903225806452e-05,
"loss": 0.0008,
"step": 521
},
{
"epoch": 1.8981818181818182,
"grad_norm": 2.0916597843170166,
"learning_rate": 1.9548387096774197e-05,
"loss": 0.0115,
"step": 522
},
{
"epoch": 1.9018181818181819,
"grad_norm": 0.015548643656075,
"learning_rate": 1.9483870967741935e-05,
"loss": 0.0006,
"step": 523
},
{
"epoch": 1.9054545454545453,
"grad_norm": 0.10145322978496552,
"learning_rate": 1.9419354838709676e-05,
"loss": 0.002,
"step": 524
},
{
"epoch": 1.9090909090909092,
"grad_norm": 0.24982163310050964,
"learning_rate": 1.935483870967742e-05,
"loss": 0.0024,
"step": 525
},
{
"epoch": 1.9127272727272726,
"grad_norm": 0.14598214626312256,
"learning_rate": 1.9290322580645162e-05,
"loss": 0.0017,
"step": 526
},
{
"epoch": 1.9163636363636365,
"grad_norm": 4.5397629737854,
"learning_rate": 1.9225806451612903e-05,
"loss": 0.1227,
"step": 527
},
{
"epoch": 1.92,
"grad_norm": 0.5093303322792053,
"learning_rate": 1.9161290322580648e-05,
"loss": 0.0323,
"step": 528
},
{
"epoch": 1.9236363636363636,
"grad_norm": 0.14823508262634277,
"learning_rate": 1.909677419354839e-05,
"loss": 0.0017,
"step": 529
},
{
"epoch": 1.9272727272727272,
"grad_norm": 0.6760469675064087,
"learning_rate": 1.9032258064516127e-05,
"loss": 0.0054,
"step": 530
},
{
"epoch": 1.930909090909091,
"grad_norm": 0.05310118570923805,
"learning_rate": 1.896774193548387e-05,
"loss": 0.0012,
"step": 531
},
{
"epoch": 1.9345454545454546,
"grad_norm": 0.2781686782836914,
"learning_rate": 1.8903225806451613e-05,
"loss": 0.0026,
"step": 532
},
{
"epoch": 1.9381818181818182,
"grad_norm": 0.0464974045753479,
"learning_rate": 1.8838709677419354e-05,
"loss": 0.0015,
"step": 533
},
{
"epoch": 1.9418181818181819,
"grad_norm": 0.5551739931106567,
"learning_rate": 1.87741935483871e-05,
"loss": 0.0065,
"step": 534
},
{
"epoch": 1.9454545454545453,
"grad_norm": 0.08245756477117538,
"learning_rate": 1.870967741935484e-05,
"loss": 0.0015,
"step": 535
},
{
"epoch": 1.9490909090909092,
"grad_norm": 1.5298570394515991,
"learning_rate": 1.864516129032258e-05,
"loss": 0.0814,
"step": 536
},
{
"epoch": 1.9527272727272726,
"grad_norm": 0.028485940769314766,
"learning_rate": 1.8580645161290326e-05,
"loss": 0.0009,
"step": 537
},
{
"epoch": 1.9563636363636365,
"grad_norm": 0.12668201327323914,
"learning_rate": 1.8516129032258064e-05,
"loss": 0.0015,
"step": 538
},
{
"epoch": 1.96,
"grad_norm": 0.11611904203891754,
"learning_rate": 1.845161290322581e-05,
"loss": 0.0023,
"step": 539
},
{
"epoch": 1.9636363636363636,
"grad_norm": 1.0167148113250732,
"learning_rate": 1.838709677419355e-05,
"loss": 0.0055,
"step": 540
},
{
"epoch": 1.9672727272727273,
"grad_norm": 4.344989776611328,
"learning_rate": 1.832258064516129e-05,
"loss": 0.0237,
"step": 541
},
{
"epoch": 1.970909090909091,
"grad_norm": 1.655159831047058,
"learning_rate": 1.8258064516129032e-05,
"loss": 0.01,
"step": 542
},
{
"epoch": 1.9745454545454546,
"grad_norm": 0.03663047030568123,
"learning_rate": 1.8193548387096777e-05,
"loss": 0.0012,
"step": 543
},
{
"epoch": 1.978181818181818,
"grad_norm": 0.4332762062549591,
"learning_rate": 1.8129032258064518e-05,
"loss": 0.0039,
"step": 544
},
{
"epoch": 1.981818181818182,
"grad_norm": 3.9883310794830322,
"learning_rate": 1.806451612903226e-05,
"loss": 0.0309,
"step": 545
},
{
"epoch": 1.9854545454545454,
"grad_norm": 2.0200157165527344,
"learning_rate": 1.8e-05,
"loss": 0.0127,
"step": 546
},
{
"epoch": 1.9890909090909092,
"grad_norm": 1.3924773931503296,
"learning_rate": 1.7935483870967742e-05,
"loss": 0.0069,
"step": 547
},
{
"epoch": 1.9927272727272727,
"grad_norm": 0.2380281239748001,
"learning_rate": 1.7870967741935486e-05,
"loss": 0.0196,
"step": 548
},
{
"epoch": 1.9963636363636363,
"grad_norm": 0.9676334261894226,
"learning_rate": 1.7806451612903228e-05,
"loss": 0.1096,
"step": 549
},
{
"epoch": 2.0,
"grad_norm": 0.012336778454482555,
"learning_rate": 1.774193548387097e-05,
"loss": 0.0006,
"step": 550
},
{
"epoch": 2.0,
"eval_accuracy": 0.9918032786885246,
"eval_loss": 0.03281828388571739,
"eval_runtime": 12.743,
"eval_samples_per_second": 344.661,
"eval_steps_per_second": 5.415,
"step": 550
},
{
"epoch": 2.0036363636363634,
"grad_norm": 0.019314678385853767,
"learning_rate": 1.7677419354838713e-05,
"loss": 0.0008,
"step": 551
},
{
"epoch": 2.0072727272727273,
"grad_norm": 0.11856160312891006,
"learning_rate": 1.761290322580645e-05,
"loss": 0.0016,
"step": 552
},
{
"epoch": 2.0109090909090908,
"grad_norm": 0.019284116104245186,
"learning_rate": 1.7548387096774193e-05,
"loss": 0.0008,
"step": 553
},
{
"epoch": 2.0145454545454546,
"grad_norm": 0.701896607875824,
"learning_rate": 1.7483870967741937e-05,
"loss": 0.0056,
"step": 554
},
{
"epoch": 2.018181818181818,
"grad_norm": 0.1820172518491745,
"learning_rate": 1.741935483870968e-05,
"loss": 0.0016,
"step": 555
},
{
"epoch": 2.021818181818182,
"grad_norm": 0.014085445553064346,
"learning_rate": 1.735483870967742e-05,
"loss": 0.0007,
"step": 556
},
{
"epoch": 2.0254545454545454,
"grad_norm": 0.01478899922221899,
"learning_rate": 1.7290322580645164e-05,
"loss": 0.0007,
"step": 557
},
{
"epoch": 2.0290909090909093,
"grad_norm": 0.01645870879292488,
"learning_rate": 1.7225806451612906e-05,
"loss": 0.0007,
"step": 558
},
{
"epoch": 2.0327272727272727,
"grad_norm": 0.09879927337169647,
"learning_rate": 1.7161290322580643e-05,
"loss": 0.0013,
"step": 559
},
{
"epoch": 2.036363636363636,
"grad_norm": 0.016561178490519524,
"learning_rate": 1.7096774193548388e-05,
"loss": 0.0008,
"step": 560
},
{
"epoch": 2.04,
"grad_norm": 0.030180329456925392,
"learning_rate": 1.703225806451613e-05,
"loss": 0.0011,
"step": 561
},
{
"epoch": 2.0436363636363635,
"grad_norm": 0.023942433297634125,
"learning_rate": 1.696774193548387e-05,
"loss": 0.0009,
"step": 562
},
{
"epoch": 2.0472727272727274,
"grad_norm": 0.5132169127464294,
"learning_rate": 1.6903225806451615e-05,
"loss": 0.0216,
"step": 563
},
{
"epoch": 2.050909090909091,
"grad_norm": 0.01695244014263153,
"learning_rate": 1.6838709677419356e-05,
"loss": 0.0008,
"step": 564
},
{
"epoch": 2.0545454545454547,
"grad_norm": 0.019326528534293175,
"learning_rate": 1.6774193548387098e-05,
"loss": 0.0008,
"step": 565
},
{
"epoch": 2.058181818181818,
"grad_norm": 0.018215378746390343,
"learning_rate": 1.670967741935484e-05,
"loss": 0.0007,
"step": 566
},
{
"epoch": 2.061818181818182,
"grad_norm": 0.02021806314587593,
"learning_rate": 1.664516129032258e-05,
"loss": 0.0008,
"step": 567
},
{
"epoch": 2.0654545454545454,
"grad_norm": 0.030195925384759903,
"learning_rate": 1.658064516129032e-05,
"loss": 0.0009,
"step": 568
},
{
"epoch": 2.0690909090909093,
"grad_norm": 0.2582416236400604,
"learning_rate": 1.6516129032258066e-05,
"loss": 0.0014,
"step": 569
},
{
"epoch": 2.0727272727272728,
"grad_norm": 0.052597131580114365,
"learning_rate": 1.6451612903225807e-05,
"loss": 0.0013,
"step": 570
},
{
"epoch": 2.076363636363636,
"grad_norm": 4.027952194213867,
"learning_rate": 1.638709677419355e-05,
"loss": 0.0367,
"step": 571
},
{
"epoch": 2.08,
"grad_norm": 3.2471768856048584,
"learning_rate": 1.6322580645161293e-05,
"loss": 0.0131,
"step": 572
},
{
"epoch": 2.0836363636363635,
"grad_norm": 0.013683440163731575,
"learning_rate": 1.625806451612903e-05,
"loss": 0.0006,
"step": 573
},
{
"epoch": 2.0872727272727274,
"grad_norm": 0.9055846929550171,
"learning_rate": 1.6193548387096776e-05,
"loss": 0.0045,
"step": 574
},
{
"epoch": 2.090909090909091,
"grad_norm": 0.06875227391719818,
"learning_rate": 1.6129032258064517e-05,
"loss": 0.0011,
"step": 575
},
{
"epoch": 2.0945454545454547,
"grad_norm": 0.08661270886659622,
"learning_rate": 1.6064516129032258e-05,
"loss": 0.0018,
"step": 576
},
{
"epoch": 2.098181818181818,
"grad_norm": 0.3580359220504761,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.0208,
"step": 577
},
{
"epoch": 2.101818181818182,
"grad_norm": 3.5332114696502686,
"learning_rate": 1.5935483870967744e-05,
"loss": 0.0246,
"step": 578
},
{
"epoch": 2.1054545454545455,
"grad_norm": 0.23546698689460754,
"learning_rate": 1.5870967741935485e-05,
"loss": 0.0058,
"step": 579
},
{
"epoch": 2.109090909090909,
"grad_norm": 0.0922674611210823,
"learning_rate": 1.5806451612903226e-05,
"loss": 0.001,
"step": 580
},
{
"epoch": 2.112727272727273,
"grad_norm": 0.03179372474551201,
"learning_rate": 1.5741935483870968e-05,
"loss": 0.0009,
"step": 581
},
{
"epoch": 2.1163636363636362,
"grad_norm": 0.08128567039966583,
"learning_rate": 1.567741935483871e-05,
"loss": 0.0021,
"step": 582
},
{
"epoch": 2.12,
"grad_norm": 0.03141499683260918,
"learning_rate": 1.5612903225806454e-05,
"loss": 0.0009,
"step": 583
},
{
"epoch": 2.1236363636363635,
"grad_norm": 0.056340087205171585,
"learning_rate": 1.5548387096774195e-05,
"loss": 0.0012,
"step": 584
},
{
"epoch": 2.1272727272727274,
"grad_norm": 0.011782072484493256,
"learning_rate": 1.5483870967741936e-05,
"loss": 0.0005,
"step": 585
},
{
"epoch": 2.130909090909091,
"grad_norm": 0.017823919653892517,
"learning_rate": 1.541935483870968e-05,
"loss": 0.0007,
"step": 586
},
{
"epoch": 2.1345454545454547,
"grad_norm": 0.19493459165096283,
"learning_rate": 1.535483870967742e-05,
"loss": 0.0015,
"step": 587
},
{
"epoch": 2.138181818181818,
"grad_norm": 3.2469050884246826,
"learning_rate": 1.529032258064516e-05,
"loss": 0.0172,
"step": 588
},
{
"epoch": 2.1418181818181816,
"grad_norm": 0.05841919407248497,
"learning_rate": 1.5225806451612903e-05,
"loss": 0.0008,
"step": 589
},
{
"epoch": 2.1454545454545455,
"grad_norm": 1.5849275588989258,
"learning_rate": 1.5161290322580646e-05,
"loss": 0.0064,
"step": 590
},
{
"epoch": 2.149090909090909,
"grad_norm": 0.0897936001420021,
"learning_rate": 1.5096774193548389e-05,
"loss": 0.0012,
"step": 591
},
{
"epoch": 2.152727272727273,
"grad_norm": 0.030406808480620384,
"learning_rate": 1.503225806451613e-05,
"loss": 0.0008,
"step": 592
},
{
"epoch": 2.1563636363636363,
"grad_norm": 0.42377015948295593,
"learning_rate": 1.4967741935483873e-05,
"loss": 0.0014,
"step": 593
},
{
"epoch": 2.16,
"grad_norm": 0.037775181233882904,
"learning_rate": 1.4903225806451612e-05,
"loss": 0.0009,
"step": 594
},
{
"epoch": 2.1636363636363636,
"grad_norm": 0.01757960021495819,
"learning_rate": 1.4838709677419355e-05,
"loss": 0.0007,
"step": 595
},
{
"epoch": 2.1672727272727275,
"grad_norm": 0.13892705738544464,
"learning_rate": 1.4774193548387096e-05,
"loss": 0.0013,
"step": 596
},
{
"epoch": 2.170909090909091,
"grad_norm": 0.24957135319709778,
"learning_rate": 1.470967741935484e-05,
"loss": 0.0021,
"step": 597
},
{
"epoch": 2.174545454545455,
"grad_norm": 0.042269039899110794,
"learning_rate": 1.4645161290322582e-05,
"loss": 0.0007,
"step": 598
},
{
"epoch": 2.178181818181818,
"grad_norm": 0.027613814920186996,
"learning_rate": 1.4580645161290324e-05,
"loss": 0.0006,
"step": 599
},
{
"epoch": 2.1818181818181817,
"grad_norm": 0.09081171452999115,
"learning_rate": 1.4516129032258066e-05,
"loss": 0.0007,
"step": 600
},
{
"epoch": 2.1854545454545455,
"grad_norm": 0.014807288534939289,
"learning_rate": 1.4451612903225808e-05,
"loss": 0.0006,
"step": 601
},
{
"epoch": 2.189090909090909,
"grad_norm": 0.016062721610069275,
"learning_rate": 1.4387096774193547e-05,
"loss": 0.0006,
"step": 602
},
{
"epoch": 2.192727272727273,
"grad_norm": 0.011297466233372688,
"learning_rate": 1.432258064516129e-05,
"loss": 0.0005,
"step": 603
},
{
"epoch": 2.1963636363636363,
"grad_norm": 0.018607553094625473,
"learning_rate": 1.4258064516129033e-05,
"loss": 0.0006,
"step": 604
},
{
"epoch": 2.2,
"grad_norm": 1.5661729574203491,
"learning_rate": 1.4193548387096774e-05,
"loss": 0.0571,
"step": 605
},
{
"epoch": 2.2036363636363636,
"grad_norm": 0.018998922780156136,
"learning_rate": 1.4129032258064517e-05,
"loss": 0.0007,
"step": 606
},
{
"epoch": 2.207272727272727,
"grad_norm": 0.00895577110350132,
"learning_rate": 1.406451612903226e-05,
"loss": 0.0004,
"step": 607
},
{
"epoch": 2.210909090909091,
"grad_norm": 0.023930538445711136,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.0006,
"step": 608
},
{
"epoch": 2.2145454545454544,
"grad_norm": 0.31831488013267517,
"learning_rate": 1.3935483870967741e-05,
"loss": 0.002,
"step": 609
},
{
"epoch": 2.2181818181818183,
"grad_norm": 0.46921107172966003,
"learning_rate": 1.3870967741935484e-05,
"loss": 0.0026,
"step": 610
},
{
"epoch": 2.2218181818181817,
"grad_norm": 1.8380354642868042,
"learning_rate": 1.3806451612903227e-05,
"loss": 0.0341,
"step": 611
},
{
"epoch": 2.2254545454545456,
"grad_norm": 0.016989752650260925,
"learning_rate": 1.3741935483870968e-05,
"loss": 0.0006,
"step": 612
},
{
"epoch": 2.229090909090909,
"grad_norm": 0.04719064012169838,
"learning_rate": 1.3677419354838711e-05,
"loss": 0.0009,
"step": 613
},
{
"epoch": 2.232727272727273,
"grad_norm": 0.45522865653038025,
"learning_rate": 1.3612903225806452e-05,
"loss": 0.004,
"step": 614
},
{
"epoch": 2.2363636363636363,
"grad_norm": 4.254357814788818,
"learning_rate": 1.3548387096774195e-05,
"loss": 0.0321,
"step": 615
},
{
"epoch": 2.24,
"grad_norm": 0.42286837100982666,
"learning_rate": 1.3483870967741935e-05,
"loss": 0.0112,
"step": 616
},
{
"epoch": 2.2436363636363637,
"grad_norm": 0.011312934570014477,
"learning_rate": 1.3419354838709678e-05,
"loss": 0.0005,
"step": 617
},
{
"epoch": 2.247272727272727,
"grad_norm": 0.056383103132247925,
"learning_rate": 1.3354838709677419e-05,
"loss": 0.0007,
"step": 618
},
{
"epoch": 2.250909090909091,
"grad_norm": 0.008479413576424122,
"learning_rate": 1.3290322580645162e-05,
"loss": 0.0004,
"step": 619
},
{
"epoch": 2.2545454545454544,
"grad_norm": 0.2737773358821869,
"learning_rate": 1.3225806451612905e-05,
"loss": 0.017,
"step": 620
},
{
"epoch": 2.2581818181818183,
"grad_norm": 0.021578149870038033,
"learning_rate": 1.3161290322580646e-05,
"loss": 0.0005,
"step": 621
},
{
"epoch": 2.2618181818181817,
"grad_norm": 0.017386844381690025,
"learning_rate": 1.3096774193548389e-05,
"loss": 0.0005,
"step": 622
},
{
"epoch": 2.2654545454545456,
"grad_norm": 0.010488376021385193,
"learning_rate": 1.3032258064516129e-05,
"loss": 0.0005,
"step": 623
},
{
"epoch": 2.269090909090909,
"grad_norm": 0.2101997584104538,
"learning_rate": 1.2967741935483872e-05,
"loss": 0.0067,
"step": 624
},
{
"epoch": 2.2727272727272725,
"grad_norm": 0.01739770732820034,
"learning_rate": 1.2903225806451613e-05,
"loss": 0.0006,
"step": 625
},
{
"epoch": 2.2763636363636364,
"grad_norm": 4.312204837799072,
"learning_rate": 1.2838709677419356e-05,
"loss": 0.0568,
"step": 626
},
{
"epoch": 2.2800000000000002,
"grad_norm": 0.03970419242978096,
"learning_rate": 1.2774193548387099e-05,
"loss": 0.0008,
"step": 627
},
{
"epoch": 2.2836363636363637,
"grad_norm": 0.06886734068393707,
"learning_rate": 1.270967741935484e-05,
"loss": 0.001,
"step": 628
},
{
"epoch": 2.287272727272727,
"grad_norm": 0.020005859434604645,
"learning_rate": 1.2645161290322583e-05,
"loss": 0.0006,
"step": 629
},
{
"epoch": 2.290909090909091,
"grad_norm": 0.042734235525131226,
"learning_rate": 1.2580645161290322e-05,
"loss": 0.0006,
"step": 630
},
{
"epoch": 2.2945454545454544,
"grad_norm": 0.1663779318332672,
"learning_rate": 1.2516129032258064e-05,
"loss": 0.0013,
"step": 631
},
{
"epoch": 2.2981818181818183,
"grad_norm": 0.8976339101791382,
"learning_rate": 1.2451612903225807e-05,
"loss": 0.0025,
"step": 632
},
{
"epoch": 2.3018181818181818,
"grad_norm": 0.013320104219019413,
"learning_rate": 1.238709677419355e-05,
"loss": 0.0005,
"step": 633
},
{
"epoch": 2.3054545454545456,
"grad_norm": 0.20691031217575073,
"learning_rate": 1.232258064516129e-05,
"loss": 0.001,
"step": 634
},
{
"epoch": 2.309090909090909,
"grad_norm": 0.05960312858223915,
"learning_rate": 1.2258064516129032e-05,
"loss": 0.0008,
"step": 635
},
{
"epoch": 2.3127272727272725,
"grad_norm": 3.190467596054077,
"learning_rate": 1.2193548387096775e-05,
"loss": 0.0132,
"step": 636
},
{
"epoch": 2.3163636363636364,
"grad_norm": 0.9548889994621277,
"learning_rate": 1.2129032258064516e-05,
"loss": 0.0046,
"step": 637
},
{
"epoch": 2.32,
"grad_norm": 0.037591926753520966,
"learning_rate": 1.2064516129032259e-05,
"loss": 0.0006,
"step": 638
},
{
"epoch": 2.3236363636363637,
"grad_norm": 0.025826606899499893,
"learning_rate": 1.2e-05,
"loss": 0.0006,
"step": 639
},
{
"epoch": 2.327272727272727,
"grad_norm": 0.13162964582443237,
"learning_rate": 1.1935483870967743e-05,
"loss": 0.0013,
"step": 640
},
{
"epoch": 2.330909090909091,
"grad_norm": 0.009305083192884922,
"learning_rate": 1.1870967741935484e-05,
"loss": 0.0004,
"step": 641
},
{
"epoch": 2.3345454545454545,
"grad_norm": 0.00890254881232977,
"learning_rate": 1.1806451612903226e-05,
"loss": 0.0004,
"step": 642
},
{
"epoch": 2.3381818181818184,
"grad_norm": 0.00882215891033411,
"learning_rate": 1.1741935483870969e-05,
"loss": 0.0004,
"step": 643
},
{
"epoch": 2.341818181818182,
"grad_norm": 0.11077472567558289,
"learning_rate": 1.167741935483871e-05,
"loss": 0.0008,
"step": 644
},
{
"epoch": 2.3454545454545457,
"grad_norm": 0.016244694590568542,
"learning_rate": 1.1612903225806453e-05,
"loss": 0.0005,
"step": 645
},
{
"epoch": 2.349090909090909,
"grad_norm": 0.038961056619882584,
"learning_rate": 1.1548387096774194e-05,
"loss": 0.0007,
"step": 646
},
{
"epoch": 2.3527272727272726,
"grad_norm": 0.008070679381489754,
"learning_rate": 1.1483870967741935e-05,
"loss": 0.0004,
"step": 647
},
{
"epoch": 2.3563636363636364,
"grad_norm": 0.03419802337884903,
"learning_rate": 1.1419354838709678e-05,
"loss": 0.0007,
"step": 648
},
{
"epoch": 2.36,
"grad_norm": 0.1610228568315506,
"learning_rate": 1.1354838709677421e-05,
"loss": 0.0018,
"step": 649
},
{
"epoch": 2.3636363636363638,
"grad_norm": 0.21933913230895996,
"learning_rate": 1.129032258064516e-05,
"loss": 0.0012,
"step": 650
},
{
"epoch": 2.367272727272727,
"grad_norm": 0.024096714332699776,
"learning_rate": 1.1225806451612904e-05,
"loss": 0.0005,
"step": 651
},
{
"epoch": 2.370909090909091,
"grad_norm": 0.013976830057799816,
"learning_rate": 1.1161290322580647e-05,
"loss": 0.0005,
"step": 652
},
{
"epoch": 2.3745454545454545,
"grad_norm": 0.009037399664521217,
"learning_rate": 1.1096774193548388e-05,
"loss": 0.0004,
"step": 653
},
{
"epoch": 2.378181818181818,
"grad_norm": 4.028433799743652,
"learning_rate": 1.1032258064516129e-05,
"loss": 0.0074,
"step": 654
},
{
"epoch": 2.381818181818182,
"grad_norm": 0.9609191417694092,
"learning_rate": 1.0967741935483872e-05,
"loss": 0.0054,
"step": 655
},
{
"epoch": 2.3854545454545453,
"grad_norm": 1.4599242210388184,
"learning_rate": 1.0903225806451613e-05,
"loss": 0.0105,
"step": 656
},
{
"epoch": 2.389090909090909,
"grad_norm": 1.0803523063659668,
"learning_rate": 1.0838709677419355e-05,
"loss": 0.0037,
"step": 657
},
{
"epoch": 2.3927272727272726,
"grad_norm": 0.006454968359321356,
"learning_rate": 1.0774193548387097e-05,
"loss": 0.0003,
"step": 658
},
{
"epoch": 2.3963636363636365,
"grad_norm": 0.043990444391965866,
"learning_rate": 1.070967741935484e-05,
"loss": 0.0008,
"step": 659
},
{
"epoch": 2.4,
"grad_norm": 0.04728386178612709,
"learning_rate": 1.064516129032258e-05,
"loss": 0.0006,
"step": 660
},
{
"epoch": 2.403636363636364,
"grad_norm": 0.012434919364750385,
"learning_rate": 1.0580645161290323e-05,
"loss": 0.0005,
"step": 661
},
{
"epoch": 2.4072727272727272,
"grad_norm": 0.010447041131556034,
"learning_rate": 1.0516129032258066e-05,
"loss": 0.0003,
"step": 662
},
{
"epoch": 2.410909090909091,
"grad_norm": 0.007828759960830212,
"learning_rate": 1.0451612903225807e-05,
"loss": 0.0004,
"step": 663
},
{
"epoch": 2.4145454545454546,
"grad_norm": 0.00887393206357956,
"learning_rate": 1.0387096774193548e-05,
"loss": 0.0004,
"step": 664
},
{
"epoch": 2.418181818181818,
"grad_norm": 0.8037237524986267,
"learning_rate": 1.0322580645161291e-05,
"loss": 0.0983,
"step": 665
},
{
"epoch": 2.421818181818182,
"grad_norm": 0.014007828198373318,
"learning_rate": 1.0258064516129032e-05,
"loss": 0.0004,
"step": 666
},
{
"epoch": 2.4254545454545453,
"grad_norm": 0.010563932359218597,
"learning_rate": 1.0193548387096774e-05,
"loss": 0.0004,
"step": 667
},
{
"epoch": 2.429090909090909,
"grad_norm": 0.009581638500094414,
"learning_rate": 1.0129032258064517e-05,
"loss": 0.0004,
"step": 668
},
{
"epoch": 2.4327272727272726,
"grad_norm": 0.014039217494428158,
"learning_rate": 1.0064516129032258e-05,
"loss": 0.0004,
"step": 669
},
{
"epoch": 2.4363636363636365,
"grad_norm": 0.035384513437747955,
"learning_rate": 1e-05,
"loss": 0.0005,
"step": 670
},
{
"epoch": 2.44,
"grad_norm": 0.011463082395493984,
"learning_rate": 9.935483870967742e-06,
"loss": 0.0004,
"step": 671
},
{
"epoch": 2.443636363636364,
"grad_norm": 0.008181700482964516,
"learning_rate": 9.870967741935485e-06,
"loss": 0.0004,
"step": 672
},
{
"epoch": 2.4472727272727273,
"grad_norm": 0.7373052835464478,
"learning_rate": 9.806451612903226e-06,
"loss": 0.006,
"step": 673
},
{
"epoch": 2.450909090909091,
"grad_norm": 0.018753718584775925,
"learning_rate": 9.741935483870967e-06,
"loss": 0.0004,
"step": 674
},
{
"epoch": 2.4545454545454546,
"grad_norm": 0.009037282317876816,
"learning_rate": 9.67741935483871e-06,
"loss": 0.0004,
"step": 675
},
{
"epoch": 2.458181818181818,
"grad_norm": 0.016004854813218117,
"learning_rate": 9.612903225806452e-06,
"loss": 0.0005,
"step": 676
},
{
"epoch": 2.461818181818182,
"grad_norm": 0.037706032395362854,
"learning_rate": 9.548387096774195e-06,
"loss": 0.0006,
"step": 677
},
{
"epoch": 2.4654545454545453,
"grad_norm": 0.05756361410021782,
"learning_rate": 9.483870967741936e-06,
"loss": 0.0008,
"step": 678
},
{
"epoch": 2.4690909090909092,
"grad_norm": 0.0136951869353652,
"learning_rate": 9.419354838709677e-06,
"loss": 0.0005,
"step": 679
},
{
"epoch": 2.4727272727272727,
"grad_norm": 0.011940555647015572,
"learning_rate": 9.35483870967742e-06,
"loss": 0.0006,
"step": 680
},
{
"epoch": 2.4763636363636365,
"grad_norm": 0.25020283460617065,
"learning_rate": 9.290322580645163e-06,
"loss": 0.0088,
"step": 681
},
{
"epoch": 2.48,
"grad_norm": 0.008918453007936478,
"learning_rate": 9.225806451612904e-06,
"loss": 0.0004,
"step": 682
},
{
"epoch": 2.4836363636363634,
"grad_norm": 0.0643925741314888,
"learning_rate": 9.161290322580645e-06,
"loss": 0.0008,
"step": 683
},
{
"epoch": 2.4872727272727273,
"grad_norm": 0.08033094555139542,
"learning_rate": 9.096774193548388e-06,
"loss": 0.0007,
"step": 684
},
{
"epoch": 2.4909090909090907,
"grad_norm": 0.009339767508208752,
"learning_rate": 9.03225806451613e-06,
"loss": 0.0004,
"step": 685
},
{
"epoch": 2.4945454545454546,
"grad_norm": 0.012149178422987461,
"learning_rate": 8.967741935483871e-06,
"loss": 0.0005,
"step": 686
},
{
"epoch": 2.498181818181818,
"grad_norm": 0.028004921972751617,
"learning_rate": 8.903225806451614e-06,
"loss": 0.0004,
"step": 687
},
{
"epoch": 2.501818181818182,
"grad_norm": 0.013502350077033043,
"learning_rate": 8.838709677419357e-06,
"loss": 0.0004,
"step": 688
},
{
"epoch": 2.5054545454545454,
"grad_norm": 0.008973742835223675,
"learning_rate": 8.774193548387096e-06,
"loss": 0.0004,
"step": 689
},
{
"epoch": 2.509090909090909,
"grad_norm": 0.017967596650123596,
"learning_rate": 8.70967741935484e-06,
"loss": 0.0005,
"step": 690
},
{
"epoch": 2.5127272727272727,
"grad_norm": 0.009786793030798435,
"learning_rate": 8.645161290322582e-06,
"loss": 0.0004,
"step": 691
},
{
"epoch": 2.5163636363636366,
"grad_norm": 0.03004172444343567,
"learning_rate": 8.580645161290322e-06,
"loss": 0.0007,
"step": 692
},
{
"epoch": 2.52,
"grad_norm": 0.014010576531291008,
"learning_rate": 8.516129032258065e-06,
"loss": 0.0005,
"step": 693
},
{
"epoch": 2.5236363636363635,
"grad_norm": 0.10186956822872162,
"learning_rate": 8.451612903225808e-06,
"loss": 0.0007,
"step": 694
},
{
"epoch": 2.5272727272727273,
"grad_norm": 0.007655604742467403,
"learning_rate": 8.387096774193549e-06,
"loss": 0.0003,
"step": 695
},
{
"epoch": 2.5309090909090908,
"grad_norm": 0.026972953230142593,
"learning_rate": 8.32258064516129e-06,
"loss": 0.0006,
"step": 696
},
{
"epoch": 2.5345454545454547,
"grad_norm": 0.00847043376415968,
"learning_rate": 8.258064516129033e-06,
"loss": 0.0004,
"step": 697
},
{
"epoch": 2.538181818181818,
"grad_norm": 3.0761332511901855,
"learning_rate": 8.193548387096774e-06,
"loss": 0.045,
"step": 698
},
{
"epoch": 2.541818181818182,
"grad_norm": 0.010722169652581215,
"learning_rate": 8.129032258064515e-06,
"loss": 0.0004,
"step": 699
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.024674193933606148,
"learning_rate": 8.064516129032258e-06,
"loss": 0.0004,
"step": 700
},
{
"epoch": 2.549090909090909,
"grad_norm": 0.010132328607141972,
"learning_rate": 8.000000000000001e-06,
"loss": 0.0004,
"step": 701
},
{
"epoch": 2.5527272727272727,
"grad_norm": 0.007771102711558342,
"learning_rate": 7.935483870967743e-06,
"loss": 0.0003,
"step": 702
},
{
"epoch": 2.5563636363636366,
"grad_norm": 0.009345349855720997,
"learning_rate": 7.870967741935484e-06,
"loss": 0.0004,
"step": 703
},
{
"epoch": 2.56,
"grad_norm": 0.061426129192113876,
"learning_rate": 7.806451612903227e-06,
"loss": 0.0012,
"step": 704
},
{
"epoch": 2.5636363636363635,
"grad_norm": 0.006356612779200077,
"learning_rate": 7.741935483870968e-06,
"loss": 0.0003,
"step": 705
},
{
"epoch": 2.5672727272727274,
"grad_norm": 0.5145014524459839,
"learning_rate": 7.67741935483871e-06,
"loss": 0.0159,
"step": 706
},
{
"epoch": 2.570909090909091,
"grad_norm": 0.01627645082771778,
"learning_rate": 7.612903225806451e-06,
"loss": 0.0005,
"step": 707
},
{
"epoch": 2.5745454545454547,
"grad_norm": 0.009603966027498245,
"learning_rate": 7.548387096774194e-06,
"loss": 0.0005,
"step": 708
},
{
"epoch": 2.578181818181818,
"grad_norm": 0.008407434448599815,
"learning_rate": 7.483870967741936e-06,
"loss": 0.0004,
"step": 709
},
{
"epoch": 2.581818181818182,
"grad_norm": 0.007048910949379206,
"learning_rate": 7.419354838709678e-06,
"loss": 0.0003,
"step": 710
},
{
"epoch": 2.5854545454545454,
"grad_norm": 0.007168797310441732,
"learning_rate": 7.35483870967742e-06,
"loss": 0.0003,
"step": 711
},
{
"epoch": 2.589090909090909,
"grad_norm": 0.017820533365011215,
"learning_rate": 7.290322580645162e-06,
"loss": 0.0006,
"step": 712
},
{
"epoch": 2.5927272727272728,
"grad_norm": 1.8030993938446045,
"learning_rate": 7.225806451612904e-06,
"loss": 0.1001,
"step": 713
},
{
"epoch": 2.596363636363636,
"grad_norm": 0.006081653293222189,
"learning_rate": 7.161290322580645e-06,
"loss": 0.0003,
"step": 714
},
{
"epoch": 2.6,
"grad_norm": 0.013014406897127628,
"learning_rate": 7.096774193548387e-06,
"loss": 0.0005,
"step": 715
},
{
"epoch": 2.6036363636363635,
"grad_norm": 0.5529889464378357,
"learning_rate": 7.03225806451613e-06,
"loss": 0.0032,
"step": 716
},
{
"epoch": 2.6072727272727274,
"grad_norm": 0.010706817731261253,
"learning_rate": 6.9677419354838705e-06,
"loss": 0.0004,
"step": 717
},
{
"epoch": 2.610909090909091,
"grad_norm": 0.009283789433538914,
"learning_rate": 6.9032258064516135e-06,
"loss": 0.0004,
"step": 718
},
{
"epoch": 2.6145454545454543,
"grad_norm": 0.006831625942140818,
"learning_rate": 6.8387096774193555e-06,
"loss": 0.0003,
"step": 719
},
{
"epoch": 2.618181818181818,
"grad_norm": 0.010544957593083382,
"learning_rate": 6.774193548387098e-06,
"loss": 0.0004,
"step": 720
},
{
"epoch": 2.621818181818182,
"grad_norm": 0.573939323425293,
"learning_rate": 6.709677419354839e-06,
"loss": 0.1136,
"step": 721
},
{
"epoch": 2.6254545454545455,
"grad_norm": 0.007392039522528648,
"learning_rate": 6.645161290322581e-06,
"loss": 0.0003,
"step": 722
},
{
"epoch": 2.629090909090909,
"grad_norm": 0.03405938670039177,
"learning_rate": 6.580645161290323e-06,
"loss": 0.0007,
"step": 723
},
{
"epoch": 2.632727272727273,
"grad_norm": 0.018447572365403175,
"learning_rate": 6.516129032258064e-06,
"loss": 0.0005,
"step": 724
},
{
"epoch": 2.6363636363636362,
"grad_norm": 0.00638270378112793,
"learning_rate": 6.451612903225806e-06,
"loss": 0.0003,
"step": 725
},
{
"epoch": 2.64,
"grad_norm": 0.017991170287132263,
"learning_rate": 6.387096774193549e-06,
"loss": 0.0004,
"step": 726
},
{
"epoch": 2.6436363636363636,
"grad_norm": 0.00989875290542841,
"learning_rate": 6.322580645161291e-06,
"loss": 0.0004,
"step": 727
},
{
"epoch": 2.6472727272727274,
"grad_norm": 3.8099822998046875,
"learning_rate": 6.258064516129032e-06,
"loss": 0.0512,
"step": 728
},
{
"epoch": 2.650909090909091,
"grad_norm": 0.012733093462884426,
"learning_rate": 6.193548387096775e-06,
"loss": 0.0005,
"step": 729
},
{
"epoch": 2.6545454545454543,
"grad_norm": 0.04748505726456642,
"learning_rate": 6.129032258064516e-06,
"loss": 0.0007,
"step": 730
},
{
"epoch": 2.658181818181818,
"grad_norm": 4.789937973022461,
"learning_rate": 6.064516129032258e-06,
"loss": 0.048,
"step": 731
},
{
"epoch": 2.661818181818182,
"grad_norm": 0.03021158277988434,
"learning_rate": 6e-06,
"loss": 0.0006,
"step": 732
},
{
"epoch": 2.6654545454545455,
"grad_norm": 0.017927074804902077,
"learning_rate": 5.935483870967742e-06,
"loss": 0.0005,
"step": 733
},
{
"epoch": 2.669090909090909,
"grad_norm": 0.01334038283675909,
"learning_rate": 5.870967741935484e-06,
"loss": 0.0005,
"step": 734
},
{
"epoch": 2.672727272727273,
"grad_norm": 5.3513970375061035,
"learning_rate": 5.806451612903226e-06,
"loss": 0.0182,
"step": 735
},
{
"epoch": 2.6763636363636363,
"grad_norm": 0.012456363067030907,
"learning_rate": 5.741935483870968e-06,
"loss": 0.0004,
"step": 736
},
{
"epoch": 2.68,
"grad_norm": 0.024449974298477173,
"learning_rate": 5.677419354838711e-06,
"loss": 0.0006,
"step": 737
},
{
"epoch": 2.6836363636363636,
"grad_norm": 0.015060571022331715,
"learning_rate": 5.612903225806452e-06,
"loss": 0.0005,
"step": 738
},
{
"epoch": 2.6872727272727275,
"grad_norm": 4.723052024841309,
"learning_rate": 5.548387096774194e-06,
"loss": 0.0212,
"step": 739
},
{
"epoch": 2.690909090909091,
"grad_norm": 0.020275188609957695,
"learning_rate": 5.483870967741936e-06,
"loss": 0.0005,
"step": 740
},
{
"epoch": 2.6945454545454544,
"grad_norm": 0.00891982950270176,
"learning_rate": 5.419354838709677e-06,
"loss": 0.0004,
"step": 741
},
{
"epoch": 2.6981818181818182,
"grad_norm": 0.006951956544071436,
"learning_rate": 5.35483870967742e-06,
"loss": 0.0003,
"step": 742
},
{
"epoch": 2.7018181818181817,
"grad_norm": 1.1897835731506348,
"learning_rate": 5.2903225806451614e-06,
"loss": 0.1006,
"step": 743
},
{
"epoch": 2.7054545454545456,
"grad_norm": 0.0193193256855011,
"learning_rate": 5.2258064516129035e-06,
"loss": 0.0005,
"step": 744
},
{
"epoch": 2.709090909090909,
"grad_norm": 0.006243540905416012,
"learning_rate": 5.161290322580646e-06,
"loss": 0.0003,
"step": 745
},
{
"epoch": 2.712727272727273,
"grad_norm": 0.011104391887784004,
"learning_rate": 5.096774193548387e-06,
"loss": 0.0005,
"step": 746
},
{
"epoch": 2.7163636363636363,
"grad_norm": 0.015710929408669472,
"learning_rate": 5.032258064516129e-06,
"loss": 0.0005,
"step": 747
},
{
"epoch": 2.7199999999999998,
"grad_norm": 0.008195963688194752,
"learning_rate": 4.967741935483871e-06,
"loss": 0.0004,
"step": 748
},
{
"epoch": 2.7236363636363636,
"grad_norm": 0.5103004574775696,
"learning_rate": 4.903225806451613e-06,
"loss": 0.0028,
"step": 749
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.009401354938745499,
"learning_rate": 4.838709677419355e-06,
"loss": 0.0004,
"step": 750
},
{
"epoch": 2.730909090909091,
"grad_norm": 0.010534017346799374,
"learning_rate": 4.774193548387097e-06,
"loss": 0.0004,
"step": 751
},
{
"epoch": 2.7345454545454544,
"grad_norm": 0.02028539776802063,
"learning_rate": 4.7096774193548385e-06,
"loss": 0.0005,
"step": 752
},
{
"epoch": 2.7381818181818183,
"grad_norm": 0.10612978786230087,
"learning_rate": 4.6451612903225815e-06,
"loss": 0.0013,
"step": 753
},
{
"epoch": 2.7418181818181817,
"grad_norm": 0.008999668061733246,
"learning_rate": 4.580645161290323e-06,
"loss": 0.0004,
"step": 754
},
{
"epoch": 2.7454545454545456,
"grad_norm": 0.020546872168779373,
"learning_rate": 4.516129032258065e-06,
"loss": 0.0006,
"step": 755
},
{
"epoch": 2.749090909090909,
"grad_norm": 0.04722006618976593,
"learning_rate": 4.451612903225807e-06,
"loss": 0.0006,
"step": 756
},
{
"epoch": 2.752727272727273,
"grad_norm": 0.6383763551712036,
"learning_rate": 4.387096774193548e-06,
"loss": 0.0017,
"step": 757
},
{
"epoch": 2.7563636363636363,
"grad_norm": 0.013335658237338066,
"learning_rate": 4.322580645161291e-06,
"loss": 0.0005,
"step": 758
},
{
"epoch": 2.76,
"grad_norm": 0.01042084489017725,
"learning_rate": 4.258064516129032e-06,
"loss": 0.0004,
"step": 759
},
{
"epoch": 2.7636363636363637,
"grad_norm": 0.0967690572142601,
"learning_rate": 4.193548387096774e-06,
"loss": 0.0011,
"step": 760
},
{
"epoch": 2.767272727272727,
"grad_norm": 0.02381141297519207,
"learning_rate": 4.1290322580645165e-06,
"loss": 0.0007,
"step": 761
},
{
"epoch": 2.770909090909091,
"grad_norm": 0.02535305730998516,
"learning_rate": 4.064516129032258e-06,
"loss": 0.0006,
"step": 762
},
{
"epoch": 2.7745454545454544,
"grad_norm": 0.0302995927631855,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0007,
"step": 763
},
{
"epoch": 2.7781818181818183,
"grad_norm": 0.01775578036904335,
"learning_rate": 3.935483870967742e-06,
"loss": 0.0005,
"step": 764
},
{
"epoch": 2.7818181818181817,
"grad_norm": 0.011038933880627155,
"learning_rate": 3.870967741935484e-06,
"loss": 0.0004,
"step": 765
},
{
"epoch": 2.785454545454545,
"grad_norm": 0.010119972750544548,
"learning_rate": 3.8064516129032257e-06,
"loss": 0.0004,
"step": 766
},
{
"epoch": 2.789090909090909,
"grad_norm": 0.015269882045686245,
"learning_rate": 3.741935483870968e-06,
"loss": 0.0005,
"step": 767
},
{
"epoch": 2.792727272727273,
"grad_norm": 0.048511989414691925,
"learning_rate": 3.67741935483871e-06,
"loss": 0.0009,
"step": 768
},
{
"epoch": 2.7963636363636364,
"grad_norm": 0.08957032114267349,
"learning_rate": 3.612903225806452e-06,
"loss": 0.0009,
"step": 769
},
{
"epoch": 2.8,
"grad_norm": 0.012502241879701614,
"learning_rate": 3.5483870967741936e-06,
"loss": 0.0005,
"step": 770
},
{
"epoch": 2.8036363636363637,
"grad_norm": 0.017656538635492325,
"learning_rate": 3.4838709677419353e-06,
"loss": 0.0004,
"step": 771
},
{
"epoch": 2.807272727272727,
"grad_norm": 0.03845641762018204,
"learning_rate": 3.4193548387096778e-06,
"loss": 0.001,
"step": 772
},
{
"epoch": 2.810909090909091,
"grad_norm": 0.6619153618812561,
"learning_rate": 3.3548387096774194e-06,
"loss": 0.0026,
"step": 773
},
{
"epoch": 2.8145454545454545,
"grad_norm": 0.012663335539400578,
"learning_rate": 3.2903225806451615e-06,
"loss": 0.0005,
"step": 774
},
{
"epoch": 2.8181818181818183,
"grad_norm": 0.2195906788110733,
"learning_rate": 3.225806451612903e-06,
"loss": 0.0085,
"step": 775
},
{
"epoch": 2.821818181818182,
"grad_norm": 0.02970001846551895,
"learning_rate": 3.1612903225806457e-06,
"loss": 0.0009,
"step": 776
},
{
"epoch": 2.825454545454545,
"grad_norm": 0.025815211236476898,
"learning_rate": 3.0967741935483874e-06,
"loss": 0.0007,
"step": 777
},
{
"epoch": 2.829090909090909,
"grad_norm": 0.026814987882971764,
"learning_rate": 3.032258064516129e-06,
"loss": 0.0006,
"step": 778
},
{
"epoch": 2.832727272727273,
"grad_norm": 0.7879754900932312,
"learning_rate": 2.967741935483871e-06,
"loss": 0.1117,
"step": 779
},
{
"epoch": 2.8363636363636364,
"grad_norm": 0.0365438349545002,
"learning_rate": 2.903225806451613e-06,
"loss": 0.0009,
"step": 780
},
{
"epoch": 2.84,
"grad_norm": 0.1561412513256073,
"learning_rate": 2.8387096774193553e-06,
"loss": 0.0012,
"step": 781
},
{
"epoch": 2.8436363636363637,
"grad_norm": 0.010296767577528954,
"learning_rate": 2.774193548387097e-06,
"loss": 0.0004,
"step": 782
},
{
"epoch": 2.847272727272727,
"grad_norm": 0.16209469735622406,
"learning_rate": 2.7096774193548386e-06,
"loss": 0.0013,
"step": 783
},
{
"epoch": 2.850909090909091,
"grad_norm": 0.014227217994630337,
"learning_rate": 2.6451612903225807e-06,
"loss": 0.0004,
"step": 784
},
{
"epoch": 2.8545454545454545,
"grad_norm": 0.06868361681699753,
"learning_rate": 2.580645161290323e-06,
"loss": 0.0006,
"step": 785
},
{
"epoch": 2.8581818181818184,
"grad_norm": 0.02630774676799774,
"learning_rate": 2.5161290322580645e-06,
"loss": 0.0006,
"step": 786
},
{
"epoch": 2.861818181818182,
"grad_norm": 0.007779018487781286,
"learning_rate": 2.4516129032258066e-06,
"loss": 0.0004,
"step": 787
},
{
"epoch": 2.8654545454545453,
"grad_norm": 0.03792530298233032,
"learning_rate": 2.3870967741935486e-06,
"loss": 0.0006,
"step": 788
},
{
"epoch": 2.869090909090909,
"grad_norm": 0.009642829187214375,
"learning_rate": 2.3225806451612907e-06,
"loss": 0.0004,
"step": 789
},
{
"epoch": 2.8727272727272726,
"grad_norm": 0.014530934393405914,
"learning_rate": 2.2580645161290324e-06,
"loss": 0.0005,
"step": 790
},
{
"epoch": 2.8763636363636365,
"grad_norm": 0.09846967458724976,
"learning_rate": 2.193548387096774e-06,
"loss": 0.001,
"step": 791
},
{
"epoch": 2.88,
"grad_norm": 0.02130993641912937,
"learning_rate": 2.129032258064516e-06,
"loss": 0.0006,
"step": 792
},
{
"epoch": 2.8836363636363638,
"grad_norm": 0.6667435765266418,
"learning_rate": 2.0645161290322582e-06,
"loss": 0.0023,
"step": 793
},
{
"epoch": 2.887272727272727,
"grad_norm": 0.10888272523880005,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0009,
"step": 794
},
{
"epoch": 2.8909090909090907,
"grad_norm": 0.048606500029563904,
"learning_rate": 1.935483870967742e-06,
"loss": 0.0005,
"step": 795
},
{
"epoch": 2.8945454545454545,
"grad_norm": 0.23399078845977783,
"learning_rate": 1.870967741935484e-06,
"loss": 0.0027,
"step": 796
},
{
"epoch": 2.8981818181818184,
"grad_norm": 0.01046321727335453,
"learning_rate": 1.806451612903226e-06,
"loss": 0.0004,
"step": 797
},
{
"epoch": 2.901818181818182,
"grad_norm": 0.015533102676272392,
"learning_rate": 1.7419354838709676e-06,
"loss": 0.0006,
"step": 798
},
{
"epoch": 2.9054545454545453,
"grad_norm": 0.016338596120476723,
"learning_rate": 1.6774193548387097e-06,
"loss": 0.0006,
"step": 799
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.00850651878863573,
"learning_rate": 1.6129032258064516e-06,
"loss": 0.0004,
"step": 800
},
{
"epoch": 2.9127272727272726,
"grad_norm": 0.7118433713912964,
"learning_rate": 1.5483870967741937e-06,
"loss": 0.0027,
"step": 801
},
{
"epoch": 2.9163636363636365,
"grad_norm": 1.7679587602615356,
"learning_rate": 1.4838709677419356e-06,
"loss": 0.0496,
"step": 802
},
{
"epoch": 2.92,
"grad_norm": 0.010516179725527763,
"learning_rate": 1.4193548387096776e-06,
"loss": 0.0004,
"step": 803
},
{
"epoch": 2.923636363636364,
"grad_norm": 0.0124241653829813,
"learning_rate": 1.3548387096774193e-06,
"loss": 0.0004,
"step": 804
},
{
"epoch": 2.9272727272727272,
"grad_norm": 0.00775744765996933,
"learning_rate": 1.2903225806451614e-06,
"loss": 0.0003,
"step": 805
},
{
"epoch": 2.9309090909090907,
"grad_norm": 1.7472180128097534,
"learning_rate": 1.2258064516129033e-06,
"loss": 0.0587,
"step": 806
},
{
"epoch": 2.9345454545454546,
"grad_norm": 0.014171603135764599,
"learning_rate": 1.1612903225806454e-06,
"loss": 0.0005,
"step": 807
},
{
"epoch": 2.9381818181818184,
"grad_norm": 0.011718512512743473,
"learning_rate": 1.096774193548387e-06,
"loss": 0.0004,
"step": 808
},
{
"epoch": 2.941818181818182,
"grad_norm": 0.011687002144753933,
"learning_rate": 1.0322580645161291e-06,
"loss": 0.0004,
"step": 809
},
{
"epoch": 2.9454545454545453,
"grad_norm": 0.010272631421685219,
"learning_rate": 9.67741935483871e-07,
"loss": 0.0004,
"step": 810
},
{
"epoch": 2.949090909090909,
"grad_norm": 0.4066472351551056,
"learning_rate": 9.03225806451613e-07,
"loss": 0.0023,
"step": 811
},
{
"epoch": 2.9527272727272726,
"grad_norm": 1.5127947330474854,
"learning_rate": 8.387096774193549e-07,
"loss": 0.0037,
"step": 812
},
{
"epoch": 2.9563636363636365,
"grad_norm": 0.017161400988698006,
"learning_rate": 7.741935483870968e-07,
"loss": 0.0005,
"step": 813
},
{
"epoch": 2.96,
"grad_norm": 0.009147647768259048,
"learning_rate": 7.096774193548388e-07,
"loss": 0.0004,
"step": 814
},
{
"epoch": 2.963636363636364,
"grad_norm": 0.010304290801286697,
"learning_rate": 6.451612903225807e-07,
"loss": 0.0005,
"step": 815
},
{
"epoch": 2.9672727272727273,
"grad_norm": 0.027671974152326584,
"learning_rate": 5.806451612903227e-07,
"loss": 0.0008,
"step": 816
},
{
"epoch": 2.9709090909090907,
"grad_norm": 3.4109323024749756,
"learning_rate": 5.161290322580646e-07,
"loss": 0.0096,
"step": 817
},
{
"epoch": 2.9745454545454546,
"grad_norm": 0.1351071149110794,
"learning_rate": 4.516129032258065e-07,
"loss": 0.0012,
"step": 818
},
{
"epoch": 2.978181818181818,
"grad_norm": 0.01651870645582676,
"learning_rate": 3.870967741935484e-07,
"loss": 0.0005,
"step": 819
},
{
"epoch": 2.981818181818182,
"grad_norm": 0.012661241926252842,
"learning_rate": 3.2258064516129035e-07,
"loss": 0.0004,
"step": 820
},
{
"epoch": 2.9854545454545454,
"grad_norm": 0.007508403621613979,
"learning_rate": 2.580645161290323e-07,
"loss": 0.0003,
"step": 821
},
{
"epoch": 2.9890909090909092,
"grad_norm": 0.2275754064321518,
"learning_rate": 1.935483870967742e-07,
"loss": 0.001,
"step": 822
},
{
"epoch": 2.9927272727272727,
"grad_norm": 0.012626181356608868,
"learning_rate": 1.2903225806451614e-07,
"loss": 0.0005,
"step": 823
},
{
"epoch": 2.996363636363636,
"grad_norm": 0.016164276748895645,
"learning_rate": 6.451612903225807e-08,
"loss": 0.0006,
"step": 824
},
{
"epoch": 3.0,
"grad_norm": 0.007542683742940426,
"learning_rate": 0.0,
"loss": 0.0003,
"step": 825
},
{
"epoch": 3.0,
"eval_accuracy": 0.9938524590163934,
"eval_loss": 0.026551904156804085,
"eval_runtime": 12.7394,
"eval_samples_per_second": 344.756,
"eval_steps_per_second": 5.416,
"step": 825
}
],
"logging_steps": 1,
"max_steps": 825,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6981561778765824.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}