{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9983750406239844,
"eval_steps": 128,
"global_step": 512,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.5149741172790527,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.8709,
"step": 1
},
{
"epoch": 0.0,
"eval_loss": 1.8383064270019531,
"eval_runtime": 707.8127,
"eval_samples_per_second": 7.169,
"eval_steps_per_second": 1.793,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 0.48140937089920044,
"learning_rate": 4.000000000000001e-06,
"loss": 1.7751,
"step": 2
},
{
"epoch": 0.01,
"grad_norm": 0.4886001944541931,
"learning_rate": 6e-06,
"loss": 1.795,
"step": 3
},
{
"epoch": 0.01,
"grad_norm": 0.46349120140075684,
"learning_rate": 8.000000000000001e-06,
"loss": 1.7569,
"step": 4
},
{
"epoch": 0.01,
"grad_norm": 0.5320057272911072,
"learning_rate": 1e-05,
"loss": 1.9278,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 0.48083460330963135,
"learning_rate": 1.2e-05,
"loss": 1.778,
"step": 6
},
{
"epoch": 0.01,
"grad_norm": 0.503804624080658,
"learning_rate": 1.4e-05,
"loss": 1.8358,
"step": 7
},
{
"epoch": 0.02,
"grad_norm": 0.5177507400512695,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.8655,
"step": 8
},
{
"epoch": 0.02,
"grad_norm": 0.5006410479545593,
"learning_rate": 1.8e-05,
"loss": 1.8087,
"step": 9
},
{
"epoch": 0.02,
"grad_norm": 0.500285804271698,
"learning_rate": 2e-05,
"loss": 1.8254,
"step": 10
},
{
"epoch": 0.02,
"grad_norm": 0.4819566607475281,
"learning_rate": 1.9999804178263253e-05,
"loss": 1.7627,
"step": 11
},
{
"epoch": 0.02,
"grad_norm": 0.4860954284667969,
"learning_rate": 1.999921672072223e-05,
"loss": 1.7034,
"step": 12
},
{
"epoch": 0.03,
"grad_norm": 0.5111412405967712,
"learning_rate": 1.9998237650384324e-05,
"loss": 1.7203,
"step": 13
},
{
"epoch": 0.03,
"grad_norm": 0.500988245010376,
"learning_rate": 1.9996867005594193e-05,
"loss": 1.6721,
"step": 14
},
{
"epoch": 0.03,
"grad_norm": 0.4903103709220886,
"learning_rate": 1.999510484003224e-05,
"loss": 1.6167,
"step": 15
},
{
"epoch": 0.03,
"grad_norm": 0.4756762683391571,
"learning_rate": 1.999295122271253e-05,
"loss": 1.57,
"step": 16
},
{
"epoch": 0.03,
"grad_norm": 0.4689522385597229,
"learning_rate": 1.999040623798008e-05,
"loss": 1.5461,
"step": 17
},
{
"epoch": 0.04,
"grad_norm": 0.5094612836837769,
"learning_rate": 1.9987469985507553e-05,
"loss": 1.5526,
"step": 18
},
{
"epoch": 0.04,
"grad_norm": 0.49769631028175354,
"learning_rate": 1.9984142580291368e-05,
"loss": 1.5115,
"step": 19
},
{
"epoch": 0.04,
"grad_norm": 0.46388670802116394,
"learning_rate": 1.9980424152647174e-05,
"loss": 1.467,
"step": 20
},
{
"epoch": 0.04,
"grad_norm": 0.4357146918773651,
"learning_rate": 1.9976314848204762e-05,
"loss": 1.3887,
"step": 21
},
{
"epoch": 0.04,
"grad_norm": 0.440377414226532,
"learning_rate": 1.997181482790236e-05,
"loss": 1.3845,
"step": 22
},
{
"epoch": 0.04,
"grad_norm": 0.4116402566432953,
"learning_rate": 1.9966924267980326e-05,
"loss": 1.4091,
"step": 23
},
{
"epoch": 0.05,
"grad_norm": 0.3181552588939667,
"learning_rate": 1.996164335997425e-05,
"loss": 1.3324,
"step": 24
},
{
"epoch": 0.05,
"grad_norm": 0.2932267189025879,
"learning_rate": 1.995597231070744e-05,
"loss": 1.315,
"step": 25
},
{
"epoch": 0.05,
"grad_norm": 0.328800231218338,
"learning_rate": 1.994991134228285e-05,
"loss": 1.3334,
"step": 26
},
{
"epoch": 0.05,
"grad_norm": 0.32027724385261536,
"learning_rate": 1.9943460692074345e-05,
"loss": 1.3161,
"step": 27
},
{
"epoch": 0.05,
"grad_norm": 0.3247709274291992,
"learning_rate": 1.993662061271743e-05,
"loss": 1.2601,
"step": 28
},
{
"epoch": 0.06,
"grad_norm": 0.33424896001815796,
"learning_rate": 1.9929391372099352e-05,
"loss": 1.2807,
"step": 29
},
{
"epoch": 0.06,
"grad_norm": 0.28847330808639526,
"learning_rate": 1.9921773253348604e-05,
"loss": 1.2427,
"step": 30
},
{
"epoch": 0.06,
"grad_norm": 0.2601753771305084,
"learning_rate": 1.991376655482383e-05,
"loss": 1.2602,
"step": 31
},
{
"epoch": 0.06,
"grad_norm": 0.25505828857421875,
"learning_rate": 1.9905371590102157e-05,
"loss": 1.2539,
"step": 32
},
{
"epoch": 0.06,
"grad_norm": 0.25789541006088257,
"learning_rate": 1.989658868796689e-05,
"loss": 1.2796,
"step": 33
},
{
"epoch": 0.07,
"grad_norm": 0.1963696926832199,
"learning_rate": 1.988741819239467e-05,
"loss": 1.2533,
"step": 34
},
{
"epoch": 0.07,
"grad_norm": 0.1652669906616211,
"learning_rate": 1.9877860462541964e-05,
"loss": 1.27,
"step": 35
},
{
"epoch": 0.07,
"grad_norm": 0.15272551774978638,
"learning_rate": 1.986791587273103e-05,
"loss": 1.2092,
"step": 36
},
{
"epoch": 0.07,
"grad_norm": 0.14809414744377136,
"learning_rate": 1.985758481243523e-05,
"loss": 1.2028,
"step": 37
},
{
"epoch": 0.07,
"grad_norm": 0.14091093838214874,
"learning_rate": 1.98468676862638e-05,
"loss": 1.1737,
"step": 38
},
{
"epoch": 0.08,
"grad_norm": 0.13234961032867432,
"learning_rate": 1.9835764913945998e-05,
"loss": 1.2242,
"step": 39
},
{
"epoch": 0.08,
"grad_norm": 0.12562313675880432,
"learning_rate": 1.982427693031465e-05,
"loss": 1.1846,
"step": 40
},
{
"epoch": 0.08,
"grad_norm": 0.12460777163505554,
"learning_rate": 1.981240418528914e-05,
"loss": 1.1954,
"step": 41
},
{
"epoch": 0.08,
"grad_norm": 0.1261477917432785,
"learning_rate": 1.9800147143857774e-05,
"loss": 1.1944,
"step": 42
},
{
"epoch": 0.08,
"grad_norm": 0.12070100754499435,
"learning_rate": 1.9787506286059584e-05,
"loss": 1.1814,
"step": 43
},
{
"epoch": 0.09,
"grad_norm": 0.1318473368883133,
"learning_rate": 1.9774482106965512e-05,
"loss": 1.2289,
"step": 44
},
{
"epoch": 0.09,
"grad_norm": 0.11869361251592636,
"learning_rate": 1.9761075116659037e-05,
"loss": 1.1507,
"step": 45
},
{
"epoch": 0.09,
"grad_norm": 0.11668427288532257,
"learning_rate": 1.974728584021618e-05,
"loss": 1.1693,
"step": 46
},
{
"epoch": 0.09,
"grad_norm": 0.12271205335855484,
"learning_rate": 1.9733114817684957e-05,
"loss": 1.219,
"step": 47
},
{
"epoch": 0.09,
"grad_norm": 0.12055838108062744,
"learning_rate": 1.9718562604064213e-05,
"loss": 1.2424,
"step": 48
},
{
"epoch": 0.1,
"grad_norm": 0.1191168949007988,
"learning_rate": 1.97036297692819e-05,
"loss": 1.2206,
"step": 49
},
{
"epoch": 0.1,
"grad_norm": 0.11361384391784668,
"learning_rate": 1.9688316898172744e-05,
"loss": 1.1927,
"step": 50
},
{
"epoch": 0.1,
"grad_norm": 0.109556645154953,
"learning_rate": 1.967262459045535e-05,
"loss": 1.2013,
"step": 51
},
{
"epoch": 0.1,
"grad_norm": 0.11278169602155685,
"learning_rate": 1.9656553460708707e-05,
"loss": 1.2379,
"step": 52
},
{
"epoch": 0.1,
"grad_norm": 0.11011548340320587,
"learning_rate": 1.9640104138348124e-05,
"loss": 1.1808,
"step": 53
},
{
"epoch": 0.11,
"grad_norm": 0.09818632155656815,
"learning_rate": 1.9623277267600574e-05,
"loss": 1.1731,
"step": 54
},
{
"epoch": 0.11,
"grad_norm": 0.1045491099357605,
"learning_rate": 1.9606073507479466e-05,
"loss": 1.1729,
"step": 55
},
{
"epoch": 0.11,
"grad_norm": 0.0985143780708313,
"learning_rate": 1.9588493531758843e-05,
"loss": 1.165,
"step": 56
},
{
"epoch": 0.11,
"grad_norm": 0.09513280540704727,
"learning_rate": 1.9570538028946974e-05,
"loss": 1.1765,
"step": 57
},
{
"epoch": 0.11,
"grad_norm": 0.09834066778421402,
"learning_rate": 1.9552207702259412e-05,
"loss": 1.1411,
"step": 58
},
{
"epoch": 0.12,
"grad_norm": 0.09748240560293198,
"learning_rate": 1.9533503269591438e-05,
"loss": 1.1995,
"step": 59
},
{
"epoch": 0.12,
"grad_norm": 0.09501401335000992,
"learning_rate": 1.9514425463489946e-05,
"loss": 1.1414,
"step": 60
},
{
"epoch": 0.12,
"grad_norm": 0.09078366309404373,
"learning_rate": 1.9494975031124768e-05,
"loss": 1.1132,
"step": 61
},
{
"epoch": 0.12,
"grad_norm": 0.09064218401908875,
"learning_rate": 1.947515273425939e-05,
"loss": 1.1498,
"step": 62
},
{
"epoch": 0.12,
"grad_norm": 0.09029112011194229,
"learning_rate": 1.945495934922113e-05,
"loss": 1.158,
"step": 63
},
{
"epoch": 0.12,
"grad_norm": 0.09335145354270935,
"learning_rate": 1.9434395666870735e-05,
"loss": 1.181,
"step": 64
},
{
"epoch": 0.13,
"grad_norm": 0.08959628641605377,
"learning_rate": 1.9413462492571403e-05,
"loss": 1.1353,
"step": 65
},
{
"epoch": 0.13,
"grad_norm": 0.09235028922557831,
"learning_rate": 1.9392160646157242e-05,
"loss": 1.1566,
"step": 66
},
{
"epoch": 0.13,
"grad_norm": 0.08852320164442062,
"learning_rate": 1.937049096190117e-05,
"loss": 1.1015,
"step": 67
},
{
"epoch": 0.13,
"grad_norm": 0.09060905128717422,
"learning_rate": 1.934845428848222e-05,
"loss": 1.1312,
"step": 68
},
{
"epoch": 0.13,
"grad_norm": 0.09065355360507965,
"learning_rate": 1.9326051488952334e-05,
"loss": 1.1456,
"step": 69
},
{
"epoch": 0.14,
"grad_norm": 0.09140690416097641,
"learning_rate": 1.9303283440702524e-05,
"loss": 1.1661,
"step": 70
},
{
"epoch": 0.14,
"grad_norm": 0.08641023188829422,
"learning_rate": 1.9280151035428544e-05,
"loss": 1.1153,
"step": 71
},
{
"epoch": 0.14,
"grad_norm": 0.08729224652051926,
"learning_rate": 1.9256655179095954e-05,
"loss": 1.1956,
"step": 72
},
{
"epoch": 0.14,
"grad_norm": 0.08514908701181412,
"learning_rate": 1.9232796791904627e-05,
"loss": 1.0969,
"step": 73
},
{
"epoch": 0.14,
"grad_norm": 0.08789129555225372,
"learning_rate": 1.9208576808252725e-05,
"loss": 1.1669,
"step": 74
},
{
"epoch": 0.15,
"grad_norm": 0.0829731896519661,
"learning_rate": 1.918399617670011e-05,
"loss": 1.101,
"step": 75
},
{
"epoch": 0.15,
"grad_norm": 0.08415351063013077,
"learning_rate": 1.9159055859931163e-05,
"loss": 1.122,
"step": 76
},
{
"epoch": 0.15,
"grad_norm": 0.07933653146028519,
"learning_rate": 1.9133756834717118e-05,
"loss": 1.1175,
"step": 77
},
{
"epoch": 0.15,
"grad_norm": 0.0849999189376831,
"learning_rate": 1.9108100091877787e-05,
"loss": 1.1577,
"step": 78
},
{
"epoch": 0.15,
"grad_norm": 0.0835108831524849,
"learning_rate": 1.9082086636242757e-05,
"loss": 1.1253,
"step": 79
},
{
"epoch": 0.16,
"grad_norm": 0.07834841310977936,
"learning_rate": 1.905571748661204e-05,
"loss": 1.0963,
"step": 80
},
{
"epoch": 0.16,
"grad_norm": 0.07953493297100067,
"learning_rate": 1.902899367571617e-05,
"loss": 1.1102,
"step": 81
},
{
"epoch": 0.16,
"grad_norm": 0.07989759743213654,
"learning_rate": 1.9001916250175764e-05,
"loss": 1.1576,
"step": 82
},
{
"epoch": 0.16,
"grad_norm": 0.07849448174238205,
"learning_rate": 1.8974486270460518e-05,
"loss": 1.0963,
"step": 83
},
{
"epoch": 0.16,
"grad_norm": 0.07805287837982178,
"learning_rate": 1.894670481084769e-05,
"loss": 1.1364,
"step": 84
},
{
"epoch": 0.17,
"grad_norm": 0.07698098570108414,
"learning_rate": 1.8918572959380005e-05,
"loss": 1.1407,
"step": 85
},
{
"epoch": 0.17,
"grad_norm": 0.0766262486577034,
"learning_rate": 1.8890091817823073e-05,
"loss": 1.1225,
"step": 86
},
{
"epoch": 0.17,
"grad_norm": 0.0798678770661354,
"learning_rate": 1.8861262501622213e-05,
"loss": 1.137,
"step": 87
},
{
"epoch": 0.17,
"grad_norm": 0.07717825472354889,
"learning_rate": 1.8832086139858777e-05,
"loss": 1.1311,
"step": 88
},
{
"epoch": 0.17,
"grad_norm": 0.07542562484741211,
"learning_rate": 1.880256387520593e-05,
"loss": 1.1066,
"step": 89
},
{
"epoch": 0.18,
"grad_norm": 0.07316063344478607,
"learning_rate": 1.8772696863883905e-05,
"loss": 1.0976,
"step": 90
},
{
"epoch": 0.18,
"grad_norm": 0.0738874301314354,
"learning_rate": 1.8742486275614706e-05,
"loss": 1.0901,
"step": 91
},
{
"epoch": 0.18,
"grad_norm": 0.07698226720094681,
"learning_rate": 1.8711933293576303e-05,
"loss": 1.1224,
"step": 92
},
{
"epoch": 0.18,
"grad_norm": 0.07452582567930222,
"learning_rate": 1.8681039114356298e-05,
"loss": 1.1399,
"step": 93
},
{
"epoch": 0.18,
"grad_norm": 0.07452700287103653,
"learning_rate": 1.8649804947905057e-05,
"loss": 1.1639,
"step": 94
},
{
"epoch": 0.19,
"grad_norm": 0.07358838617801666,
"learning_rate": 1.861823201748833e-05,
"loss": 1.1139,
"step": 95
},
{
"epoch": 0.19,
"grad_norm": 0.07469804584980011,
"learning_rate": 1.8586321559639316e-05,
"loss": 1.1103,
"step": 96
},
{
"epoch": 0.19,
"grad_norm": 0.07484911382198334,
"learning_rate": 1.8554074824110285e-05,
"loss": 1.1231,
"step": 97
},
{
"epoch": 0.19,
"grad_norm": 0.07320189476013184,
"learning_rate": 1.8521493073823583e-05,
"loss": 1.1405,
"step": 98
},
{
"epoch": 0.19,
"grad_norm": 0.07219311594963074,
"learning_rate": 1.8488577584822197e-05,
"loss": 1.1084,
"step": 99
},
{
"epoch": 0.19,
"grad_norm": 0.07267658412456512,
"learning_rate": 1.8455329646219767e-05,
"loss": 1.109,
"step": 100
},
{
"epoch": 0.2,
"grad_norm": 0.07124843448400497,
"learning_rate": 1.8421750560150112e-05,
"loss": 1.0997,
"step": 101
},
{
"epoch": 0.2,
"grad_norm": 0.06921572983264923,
"learning_rate": 1.8387841641716226e-05,
"loss": 1.1095,
"step": 102
},
{
"epoch": 0.2,
"grad_norm": 0.07149618864059448,
"learning_rate": 1.835360421893876e-05,
"loss": 1.1078,
"step": 103
},
{
"epoch": 0.2,
"grad_norm": 0.07851895689964294,
"learning_rate": 1.8319039632704042e-05,
"loss": 1.1195,
"step": 104
},
{
"epoch": 0.2,
"grad_norm": 0.07615454494953156,
"learning_rate": 1.8284149236711527e-05,
"loss": 1.0754,
"step": 105
},
{
"epoch": 0.21,
"grad_norm": 0.07054944336414337,
"learning_rate": 1.8248934397420802e-05,
"loss": 1.0943,
"step": 106
},
{
"epoch": 0.21,
"grad_norm": 0.07253159582614899,
"learning_rate": 1.821339649399807e-05,
"loss": 1.1263,
"step": 107
},
{
"epoch": 0.21,
"grad_norm": 0.0729857012629509,
"learning_rate": 1.817753691826212e-05,
"loss": 1.0977,
"step": 108
},
{
"epoch": 0.21,
"grad_norm": 0.07234011590480804,
"learning_rate": 1.8141357074629838e-05,
"loss": 1.1334,
"step": 109
},
{
"epoch": 0.21,
"grad_norm": 0.07030120491981506,
"learning_rate": 1.8104858380061178e-05,
"loss": 1.0767,
"step": 110
},
{
"epoch": 0.22,
"grad_norm": 0.07036615908145905,
"learning_rate": 1.80680422640037e-05,
"loss": 1.0796,
"step": 111
},
{
"epoch": 0.22,
"grad_norm": 0.0742933601140976,
"learning_rate": 1.8030910168336558e-05,
"loss": 1.0671,
"step": 112
},
{
"epoch": 0.22,
"grad_norm": 0.07065165787935257,
"learning_rate": 1.7993463547314044e-05,
"loss": 1.1594,
"step": 113
},
{
"epoch": 0.22,
"grad_norm": 0.07182008028030396,
"learning_rate": 1.7955703867508634e-05,
"loss": 1.0936,
"step": 114
},
{
"epoch": 0.22,
"grad_norm": 0.06882106512784958,
"learning_rate": 1.791763260775354e-05,
"loss": 1.1017,
"step": 115
},
{
"epoch": 0.23,
"grad_norm": 0.07001936435699463,
"learning_rate": 1.7879251259084803e-05,
"loss": 1.1267,
"step": 116
},
{
"epoch": 0.23,
"grad_norm": 0.06916490197181702,
"learning_rate": 1.78405613246829e-05,
"loss": 1.0787,
"step": 117
},
{
"epoch": 0.23,
"grad_norm": 0.07149837166070938,
"learning_rate": 1.7801564319813854e-05,
"loss": 1.1302,
"step": 118
},
{
"epoch": 0.23,
"grad_norm": 0.06783504039049149,
"learning_rate": 1.776226177176991e-05,
"loss": 1.1159,
"step": 119
},
{
"epoch": 0.23,
"grad_norm": 0.07285293936729431,
"learning_rate": 1.7722655219809718e-05,
"loss": 1.0758,
"step": 120
},
{
"epoch": 0.24,
"grad_norm": 0.07273004204034805,
"learning_rate": 1.768274621509803e-05,
"loss": 1.1019,
"step": 121
},
{
"epoch": 0.24,
"grad_norm": 0.07392899692058563,
"learning_rate": 1.7642536320644964e-05,
"loss": 1.1111,
"step": 122
},
{
"epoch": 0.24,
"grad_norm": 0.0693732351064682,
"learning_rate": 1.7602027111244807e-05,
"loss": 1.1109,
"step": 123
},
{
"epoch": 0.24,
"grad_norm": 0.0721542090177536,
"learning_rate": 1.7561220173414297e-05,
"loss": 1.1246,
"step": 124
},
{
"epoch": 0.24,
"grad_norm": 0.07002190500497818,
"learning_rate": 1.7520117105330524e-05,
"loss": 1.073,
"step": 125
},
{
"epoch": 0.25,
"grad_norm": 0.0697953850030899,
"learning_rate": 1.7478719516768324e-05,
"loss": 1.0913,
"step": 126
},
{
"epoch": 0.25,
"grad_norm": 0.07040461152791977,
"learning_rate": 1.7437029029037233e-05,
"loss": 1.1445,
"step": 127
},
{
"epoch": 0.25,
"grad_norm": 0.07231634110212326,
"learning_rate": 1.7395047274917994e-05,
"loss": 1.1106,
"step": 128
},
{
"epoch": 0.25,
"eval_loss": 1.0988876819610596,
"eval_runtime": 708.4228,
"eval_samples_per_second": 7.162,
"eval_steps_per_second": 1.791,
"step": 128
},
{
"epoch": 0.25,
"grad_norm": 0.0713375061750412,
"learning_rate": 1.7352775898598615e-05,
"loss": 1.0982,
"step": 129
},
{
"epoch": 0.25,
"grad_norm": 0.06747942417860031,
"learning_rate": 1.731021655560995e-05,
"loss": 1.1017,
"step": 130
},
{
"epoch": 0.26,
"grad_norm": 0.071540467441082,
"learning_rate": 1.72673709127609e-05,
"loss": 1.0859,
"step": 131
},
{
"epoch": 0.26,
"grad_norm": 0.06861750036478043,
"learning_rate": 1.7224240648073097e-05,
"loss": 1.0728,
"step": 132
},
{
"epoch": 0.26,
"grad_norm": 0.06919445842504501,
"learning_rate": 1.718082745071521e-05,
"loss": 1.1218,
"step": 133
},
{
"epoch": 0.26,
"grad_norm": 0.07422851771116257,
"learning_rate": 1.7137133020936783e-05,
"loss": 1.0881,
"step": 134
},
{
"epoch": 0.26,
"grad_norm": 0.07452652603387833,
"learning_rate": 1.7093159070001637e-05,
"loss": 1.1073,
"step": 135
},
{
"epoch": 0.27,
"grad_norm": 0.07337850332260132,
"learning_rate": 1.7048907320120867e-05,
"loss": 1.1065,
"step": 136
},
{
"epoch": 0.27,
"grad_norm": 0.07020066678524017,
"learning_rate": 1.700437950438537e-05,
"loss": 1.0742,
"step": 137
},
{
"epoch": 0.27,
"grad_norm": 0.07053718715906143,
"learning_rate": 1.695957736669799e-05,
"loss": 1.0627,
"step": 138
},
{
"epoch": 0.27,
"grad_norm": 0.07288292795419693,
"learning_rate": 1.6914502661705216e-05,
"loss": 1.0842,
"step": 139
},
{
"epoch": 0.27,
"grad_norm": 0.07197044044733047,
"learning_rate": 1.6869157154728437e-05,
"loss": 1.065,
"step": 140
},
{
"epoch": 0.27,
"grad_norm": 0.07109569013118744,
"learning_rate": 1.6823542621694852e-05,
"loss": 1.0996,
"step": 141
},
{
"epoch": 0.28,
"grad_norm": 0.07084467262029648,
"learning_rate": 1.677766084906787e-05,
"loss": 1.0862,
"step": 142
},
{
"epoch": 0.28,
"grad_norm": 0.07195379585027695,
"learning_rate": 1.6731513633777173e-05,
"loss": 1.1184,
"step": 143
},
{
"epoch": 0.28,
"grad_norm": 0.07326792180538177,
"learning_rate": 1.668510278314833e-05,
"loss": 1.0867,
"step": 144
},
{
"epoch": 0.28,
"grad_norm": 0.07582233846187592,
"learning_rate": 1.6638430114832015e-05,
"loss": 1.0721,
"step": 145
},
{
"epoch": 0.28,
"grad_norm": 0.07204006612300873,
"learning_rate": 1.6591497456732827e-05,
"loss": 1.0565,
"step": 146
},
{
"epoch": 0.29,
"grad_norm": 0.07225130498409271,
"learning_rate": 1.6544306646937683e-05,
"loss": 1.1036,
"step": 147
},
{
"epoch": 0.29,
"grad_norm": 0.07662148773670197,
"learning_rate": 1.649685953364385e-05,
"loss": 1.0289,
"step": 148
},
{
"epoch": 0.29,
"grad_norm": 0.07611638307571411,
"learning_rate": 1.644915797508656e-05,
"loss": 1.1068,
"step": 149
},
{
"epoch": 0.29,
"grad_norm": 0.07609565556049347,
"learning_rate": 1.6401203839466212e-05,
"loss": 1.0816,
"step": 150
},
{
"epoch": 0.29,
"grad_norm": 0.0737641304731369,
"learning_rate": 1.6352999004875242e-05,
"loss": 1.1016,
"step": 151
},
{
"epoch": 0.3,
"grad_norm": 0.07359515875577927,
"learning_rate": 1.630454535922452e-05,
"loss": 1.0787,
"step": 152
},
{
"epoch": 0.3,
"grad_norm": 0.07506351917982101,
"learning_rate": 1.6255844800169472e-05,
"loss": 1.0789,
"step": 153
},
{
"epoch": 0.3,
"grad_norm": 0.07777760922908783,
"learning_rate": 1.62068992350357e-05,
"loss": 1.096,
"step": 154
},
{
"epoch": 0.3,
"grad_norm": 0.07574637979269028,
"learning_rate": 1.6157710580744322e-05,
"loss": 1.1007,
"step": 155
},
{
"epoch": 0.3,
"grad_norm": 0.07857154309749603,
"learning_rate": 1.610828076373687e-05,
"loss": 1.0735,
"step": 156
},
{
"epoch": 0.31,
"grad_norm": 0.07402702420949936,
"learning_rate": 1.605861171989988e-05,
"loss": 1.1003,
"step": 157
},
{
"epoch": 0.31,
"grad_norm": 0.07439373433589935,
"learning_rate": 1.6008705394489032e-05,
"loss": 1.0662,
"step": 158
},
{
"epoch": 0.31,
"grad_norm": 0.07392847537994385,
"learning_rate": 1.5958563742052987e-05,
"loss": 1.0487,
"step": 159
},
{
"epoch": 0.31,
"grad_norm": 0.07773245126008987,
"learning_rate": 1.5908188726356843e-05,
"loss": 1.1107,
"step": 160
},
{
"epoch": 0.31,
"grad_norm": 0.07752656936645508,
"learning_rate": 1.5857582320305207e-05,
"loss": 1.0426,
"step": 161
},
{
"epoch": 0.32,
"grad_norm": 0.07541097700595856,
"learning_rate": 1.5806746505864947e-05,
"loss": 1.081,
"step": 162
},
{
"epoch": 0.32,
"grad_norm": 0.07938623428344727,
"learning_rate": 1.5755683273987554e-05,
"loss": 1.0969,
"step": 163
},
{
"epoch": 0.32,
"grad_norm": 0.07379717379808426,
"learning_rate": 1.5704394624531184e-05,
"loss": 1.0763,
"step": 164
},
{
"epoch": 0.32,
"grad_norm": 0.07850446552038193,
"learning_rate": 1.5652882566182316e-05,
"loss": 1.1029,
"step": 165
},
{
"epoch": 0.32,
"grad_norm": 0.07627106457948685,
"learning_rate": 1.5601149116377095e-05,
"loss": 1.0611,
"step": 166
},
{
"epoch": 0.33,
"grad_norm": 0.07577154785394669,
"learning_rate": 1.554919630122232e-05,
"loss": 1.0973,
"step": 167
},
{
"epoch": 0.33,
"grad_norm": 0.07844171673059464,
"learning_rate": 1.5497026155416087e-05,
"loss": 1.1006,
"step": 168
},
{
"epoch": 0.33,
"grad_norm": 0.08061926811933517,
"learning_rate": 1.5444640722168114e-05,
"loss": 1.0879,
"step": 169
},
{
"epoch": 0.33,
"grad_norm": 0.07918211817741394,
"learning_rate": 1.53920420531197e-05,
"loss": 1.0602,
"step": 170
},
{
"epoch": 0.33,
"grad_norm": 0.08213488012552261,
"learning_rate": 1.5339232208263394e-05,
"loss": 1.0798,
"step": 171
},
{
"epoch": 0.34,
"grad_norm": 0.07898285239934921,
"learning_rate": 1.5286213255862295e-05,
"loss": 1.0969,
"step": 172
},
{
"epoch": 0.34,
"grad_norm": 0.08233582973480225,
"learning_rate": 1.5232987272369076e-05,
"loss": 1.0699,
"step": 173
},
{
"epoch": 0.34,
"grad_norm": 0.08074311912059784,
"learning_rate": 1.5179556342344643e-05,
"loss": 1.0851,
"step": 174
},
{
"epoch": 0.34,
"grad_norm": 0.08196305483579636,
"learning_rate": 1.51259225583765e-05,
"loss": 1.076,
"step": 175
},
{
"epoch": 0.34,
"grad_norm": 0.08637065440416336,
"learning_rate": 1.5072088020996791e-05,
"loss": 1.0989,
"step": 176
},
{
"epoch": 0.35,
"grad_norm": 0.08313170820474625,
"learning_rate": 1.5018054838600033e-05,
"loss": 1.09,
"step": 177
},
{
"epoch": 0.35,
"grad_norm": 0.08245568722486496,
"learning_rate": 1.496382512736056e-05,
"loss": 1.0572,
"step": 178
},
{
"epoch": 0.35,
"grad_norm": 0.08442118763923645,
"learning_rate": 1.490940101114961e-05,
"loss": 1.0669,
"step": 179
},
{
"epoch": 0.35,
"grad_norm": 0.08224523812532425,
"learning_rate": 1.4854784621452176e-05,
"loss": 1.0842,
"step": 180
},
{
"epoch": 0.35,
"grad_norm": 0.08642537891864777,
"learning_rate": 1.479997809728352e-05,
"loss": 1.123,
"step": 181
},
{
"epoch": 0.35,
"grad_norm": 0.08723440766334534,
"learning_rate": 1.4744983585105388e-05,
"loss": 1.0649,
"step": 182
},
{
"epoch": 0.36,
"grad_norm": 0.08666212856769562,
"learning_rate": 1.4689803238741955e-05,
"loss": 1.0938,
"step": 183
},
{
"epoch": 0.36,
"grad_norm": 0.09213647246360779,
"learning_rate": 1.463443921929548e-05,
"loss": 1.0903,
"step": 184
},
{
"epoch": 0.36,
"grad_norm": 0.08998877555131912,
"learning_rate": 1.4578893695061644e-05,
"loss": 1.0778,
"step": 185
},
{
"epoch": 0.36,
"grad_norm": 0.09158129245042801,
"learning_rate": 1.4523168841444657e-05,
"loss": 1.0932,
"step": 186
},
{
"epoch": 0.36,
"grad_norm": 0.09460633993148804,
"learning_rate": 1.4467266840872041e-05,
"loss": 1.0691,
"step": 187
},
{
"epoch": 0.37,
"grad_norm": 0.09502755105495453,
"learning_rate": 1.441118988270916e-05,
"loss": 1.0684,
"step": 188
},
{
"epoch": 0.37,
"grad_norm": 0.09307122975587845,
"learning_rate": 1.4354940163173486e-05,
"loss": 1.0776,
"step": 189
},
{
"epoch": 0.37,
"grad_norm": 0.09580650180578232,
"learning_rate": 1.4298519885248574e-05,
"loss": 1.0882,
"step": 190
},
{
"epoch": 0.37,
"grad_norm": 0.09251459687948227,
"learning_rate": 1.4241931258597781e-05,
"loss": 1.077,
"step": 191
},
{
"epoch": 0.37,
"grad_norm": 0.09432998299598694,
"learning_rate": 1.4185176499477742e-05,
"loss": 1.0012,
"step": 192
},
{
"epoch": 0.38,
"grad_norm": 0.09586652368307114,
"learning_rate": 1.4128257830651554e-05,
"loss": 1.0334,
"step": 193
},
{
"epoch": 0.38,
"grad_norm": 0.09538242220878601,
"learning_rate": 1.407117748130174e-05,
"loss": 1.0731,
"step": 194
},
{
"epoch": 0.38,
"grad_norm": 0.09691152721643448,
"learning_rate": 1.401393768694292e-05,
"loss": 1.0412,
"step": 195
},
{
"epoch": 0.38,
"grad_norm": 0.09779084473848343,
"learning_rate": 1.3956540689334286e-05,
"loss": 1.0602,
"step": 196
},
{
"epoch": 0.38,
"grad_norm": 0.0998532623052597,
"learning_rate": 1.3898988736391792e-05,
"loss": 1.0261,
"step": 197
},
{
"epoch": 0.39,
"grad_norm": 0.10739872604608536,
"learning_rate": 1.384128408210011e-05,
"loss": 1.0502,
"step": 198
},
{
"epoch": 0.39,
"grad_norm": 0.11806387454271317,
"learning_rate": 1.3783428986424366e-05,
"loss": 1.1188,
"step": 199
},
{
"epoch": 0.39,
"grad_norm": 0.10208501666784286,
"learning_rate": 1.3725425715221625e-05,
"loss": 1.0465,
"step": 200
},
{
"epoch": 0.39,
"grad_norm": 0.1044783741235733,
"learning_rate": 1.3667276540152143e-05,
"loss": 1.0561,
"step": 201
},
{
"epoch": 0.39,
"grad_norm": 0.1070132926106453,
"learning_rate": 1.3608983738590414e-05,
"loss": 1.0429,
"step": 202
},
{
"epoch": 0.4,
"grad_norm": 0.11181865632534027,
"learning_rate": 1.3550549593535965e-05,
"loss": 1.0564,
"step": 203
},
{
"epoch": 0.4,
"grad_norm": 0.11098324507474899,
"learning_rate": 1.3491976393523952e-05,
"loss": 1.0632,
"step": 204
},
{
"epoch": 0.4,
"grad_norm": 0.10281454026699066,
"learning_rate": 1.343326643253552e-05,
"loss": 1.0637,
"step": 205
},
{
"epoch": 0.4,
"grad_norm": 0.10408665239810944,
"learning_rate": 1.3374422009907984e-05,
"loss": 1.0701,
"step": 206
},
{
"epoch": 0.4,
"grad_norm": 0.10533872246742249,
"learning_rate": 1.3315445430244744e-05,
"loss": 1.0654,
"step": 207
},
{
"epoch": 0.41,
"grad_norm": 0.10545054078102112,
"learning_rate": 1.3256339003325054e-05,
"loss": 1.0518,
"step": 208
},
{
"epoch": 0.41,
"grad_norm": 0.09894714504480362,
"learning_rate": 1.3197105044013544e-05,
"loss": 1.0671,
"step": 209
},
{
"epoch": 0.41,
"grad_norm": 0.08720172196626663,
"learning_rate": 1.3137745872169578e-05,
"loss": 1.0127,
"step": 210
},
{
"epoch": 0.41,
"grad_norm": 0.08827454596757889,
"learning_rate": 1.3078263812556377e-05,
"loss": 1.0154,
"step": 211
},
{
"epoch": 0.41,
"grad_norm": 0.0914626345038414,
"learning_rate": 1.3018661194749986e-05,
"loss": 1.0201,
"step": 212
},
{
"epoch": 0.42,
"grad_norm": 0.08843535929918289,
"learning_rate": 1.295894035304803e-05,
"loss": 1.0516,
"step": 213
},
{
"epoch": 0.42,
"grad_norm": 0.08639541268348694,
"learning_rate": 1.28991036263783e-05,
"loss": 1.0165,
"step": 214
},
{
"epoch": 0.42,
"grad_norm": 0.07750130444765091,
"learning_rate": 1.2839153358207142e-05,
"loss": 1.0223,
"step": 215
},
{
"epoch": 0.42,
"grad_norm": 0.0824190005660057,
"learning_rate": 1.2779091896447682e-05,
"loss": 1.0337,
"step": 216
},
{
"epoch": 0.42,
"grad_norm": 0.08451572805643082,
"learning_rate": 1.2718921593367874e-05,
"loss": 1.0542,
"step": 217
},
{
"epoch": 0.43,
"grad_norm": 0.0857366994023323,
"learning_rate": 1.2658644805498361e-05,
"loss": 1.0759,
"step": 218
},
{
"epoch": 0.43,
"grad_norm": 0.07681415975093842,
"learning_rate": 1.2598263893540207e-05,
"loss": 1.0506,
"step": 219
},
{
"epoch": 0.43,
"grad_norm": 0.07856535911560059,
"learning_rate": 1.2537781222272423e-05,
"loss": 1.0974,
"step": 220
},
{
"epoch": 0.43,
"grad_norm": 0.08015410602092743,
"learning_rate": 1.2477199160459345e-05,
"loss": 1.0604,
"step": 221
},
{
"epoch": 0.43,
"grad_norm": 0.08314133435487747,
"learning_rate": 1.2416520080757892e-05,
"loss": 1.0889,
"step": 222
},
{
"epoch": 0.43,
"grad_norm": 0.08028203994035721,
"learning_rate": 1.2355746359624621e-05,
"loss": 1.0281,
"step": 223
},
{
"epoch": 0.44,
"grad_norm": 0.0775797963142395,
"learning_rate": 1.2294880377222649e-05,
"loss": 1.078,
"step": 224
},
{
"epoch": 0.44,
"grad_norm": 0.08315123617649078,
"learning_rate": 1.2233924517328456e-05,
"loss": 1.0356,
"step": 225
},
{
"epoch": 0.44,
"grad_norm": 0.0795183852314949,
"learning_rate": 1.2172881167238515e-05,
"loss": 1.0332,
"step": 226
},
{
"epoch": 0.44,
"grad_norm": 0.0779062882065773,
"learning_rate": 1.2111752717675788e-05,
"loss": 0.9954,
"step": 227
},
{
"epoch": 0.44,
"grad_norm": 0.07758854329586029,
"learning_rate": 1.205054156269611e-05,
"loss": 1.0242,
"step": 228
},
{
"epoch": 0.45,
"grad_norm": 0.07713694125413895,
"learning_rate": 1.1989250099594412e-05,
"loss": 1.0686,
"step": 229
},
{
"epoch": 0.45,
"grad_norm": 0.07772821933031082,
"learning_rate": 1.192788072881085e-05,
"loss": 1.0338,
"step": 230
},
{
"epoch": 0.45,
"grad_norm": 0.08006665855646133,
"learning_rate": 1.1866435853836773e-05,
"loss": 1.0946,
"step": 231
},
{
"epoch": 0.45,
"grad_norm": 0.0821637436747551,
"learning_rate": 1.1804917881120608e-05,
"loss": 1.0525,
"step": 232
},
{
"epoch": 0.45,
"grad_norm": 0.07892850786447525,
"learning_rate": 1.1743329219973609e-05,
"loss": 1.0127,
"step": 233
},
{
"epoch": 0.46,
"grad_norm": 0.07800798863172531,
"learning_rate": 1.1681672282475495e-05,
"loss": 1.0254,
"step": 234
},
{
"epoch": 0.46,
"grad_norm": 0.07875402271747589,
"learning_rate": 1.161994948337998e-05,
"loss": 1.0319,
"step": 235
},
{
"epoch": 0.46,
"grad_norm": 0.08178096264600754,
"learning_rate": 1.1558163240020209e-05,
"loss": 1.0541,
"step": 236
},
{
"epoch": 0.46,
"grad_norm": 0.08126726001501083,
"learning_rate": 1.1496315972214076e-05,
"loss": 1.0681,
"step": 237
},
{
"epoch": 0.46,
"grad_norm": 0.08104463666677475,
"learning_rate": 1.1434410102169462e-05,
"loss": 0.9767,
"step": 238
},
{
"epoch": 0.47,
"grad_norm": 0.0746295303106308,
"learning_rate": 1.1372448054389364e-05,
"loss": 1.0586,
"step": 239
},
{
"epoch": 0.47,
"grad_norm": 0.08171354979276657,
"learning_rate": 1.1310432255576944e-05,
"loss": 1.0655,
"step": 240
},
{
"epoch": 0.47,
"grad_norm": 0.08069796115159988,
"learning_rate": 1.1248365134540489e-05,
"loss": 1.079,
"step": 241
},
{
"epoch": 0.47,
"grad_norm": 0.07922904193401337,
"learning_rate": 1.1186249122098282e-05,
"loss": 1.0371,
"step": 242
},
{
"epoch": 0.47,
"grad_norm": 0.07877922058105469,
"learning_rate": 1.1124086650983415e-05,
"loss": 1.0236,
"step": 243
},
{
"epoch": 0.48,
"grad_norm": 0.07606945931911469,
"learning_rate": 1.1061880155748497e-05,
"loss": 1.0255,
"step": 244
},
{
"epoch": 0.48,
"grad_norm": 0.08225277811288834,
"learning_rate": 1.0999632072670314e-05,
"loss": 1.0571,
"step": 245
},
{
"epoch": 0.48,
"grad_norm": 0.07907744497060776,
"learning_rate": 1.0937344839654416e-05,
"loss": 1.0745,
"step": 246
},
{
"epoch": 0.48,
"grad_norm": 0.07885382324457169,
"learning_rate": 1.087502089613963e-05,
"loss": 0.9899,
"step": 247
},
{
"epoch": 0.48,
"grad_norm": 0.08236192911863327,
"learning_rate": 1.0812662683002528e-05,
"loss": 1.046,
"step": 248
},
{
"epoch": 0.49,
"grad_norm": 0.08153583109378815,
"learning_rate": 1.075027264246183e-05,
"loss": 1.0769,
"step": 249
},
{
"epoch": 0.49,
"grad_norm": 0.0847182348370552,
"learning_rate": 1.068785321798276e-05,
"loss": 1.0695,
"step": 250
},
{
"epoch": 0.49,
"grad_norm": 0.07414229959249496,
"learning_rate": 1.062540685418133e-05,
"loss": 1.0555,
"step": 251
},
{
"epoch": 0.49,
"grad_norm": 0.07932449132204056,
"learning_rate": 1.0562935996728629e-05,
"loss": 1.0644,
"step": 252
},
{
"epoch": 0.49,
"grad_norm": 0.08247576653957367,
"learning_rate": 1.0500443092255017e-05,
"loss": 1.064,
"step": 253
},
{
"epoch": 0.5,
"grad_norm": 0.07860003411769867,
"learning_rate": 1.043793058825431e-05,
"loss": 1.0579,
"step": 254
},
{
"epoch": 0.5,
"grad_norm": 0.08330255001783371,
"learning_rate": 1.0375400932987932e-05,
"loss": 1.0218,
"step": 255
},
{
"epoch": 0.5,
"grad_norm": 0.08150562644004822,
"learning_rate": 1.0312856575389016e-05,
"loss": 1.0379,
"step": 256
},
{
"epoch": 0.5,
"eval_loss": 1.0509783029556274,
"eval_runtime": 708.357,
"eval_samples_per_second": 7.163,
"eval_steps_per_second": 1.791,
"step": 256
},
{
"epoch": 0.5,
"grad_norm": 0.08040876686573029,
"learning_rate": 1.025029996496651e-05,
"loss": 1.03,
"step": 257
},
{
"epoch": 0.5,
"grad_norm": 0.07957625389099121,
"learning_rate": 1.0187733551709236e-05,
"loss": 1.0235,
"step": 258
},
{
"epoch": 0.51,
"grad_norm": 0.07723429054021835,
"learning_rate": 1.0125159785989933e-05,
"loss": 1.0633,
"step": 259
},
{
"epoch": 0.51,
"grad_norm": 0.079456627368927,
"learning_rate": 1.00625811184693e-05,
"loss": 1.0403,
"step": 260
},
{
"epoch": 0.51,
"grad_norm": 0.07875042408704758,
"learning_rate": 1e-05,
"loss": 1.0393,
"step": 261
},
{
"epoch": 0.51,
"grad_norm": 0.0797952190041542,
"learning_rate": 9.937418881530704e-06,
"loss": 1.0456,
"step": 262
},
{
"epoch": 0.51,
"grad_norm": 0.08356386423110962,
"learning_rate": 9.874840214010069e-06,
"loss": 1.0967,
"step": 263
},
{
"epoch": 0.51,
"grad_norm": 0.0836138129234314,
"learning_rate": 9.812266448290767e-06,
"loss": 1.065,
"step": 264
},
{
"epoch": 0.52,
"grad_norm": 0.07885371893644333,
"learning_rate": 9.749700035033492e-06,
"loss": 1.0664,
"step": 265
},
{
"epoch": 0.52,
"grad_norm": 0.07813183963298798,
"learning_rate": 9.687143424610986e-06,
"loss": 1.0277,
"step": 266
},
{
"epoch": 0.52,
"grad_norm": 0.07894445955753326,
"learning_rate": 9.624599067012073e-06,
"loss": 1.0646,
"step": 267
},
{
"epoch": 0.52,
"grad_norm": 0.08423804491758347,
"learning_rate": 9.562069411745692e-06,
"loss": 1.0592,
"step": 268
},
{
"epoch": 0.52,
"grad_norm": 0.08294244855642319,
"learning_rate": 9.499556907744985e-06,
"loss": 1.11,
"step": 269
},
{
"epoch": 0.53,
"grad_norm": 0.07947829365730286,
"learning_rate": 9.437064003271373e-06,
"loss": 1.0872,
"step": 270
},
{
"epoch": 0.53,
"grad_norm": 0.07858666032552719,
"learning_rate": 9.374593145818673e-06,
"loss": 1.0711,
"step": 271
},
{
"epoch": 0.53,
"grad_norm": 0.07969223707914352,
"learning_rate": 9.312146782017244e-06,
"loss": 1.0515,
"step": 272
},
{
"epoch": 0.53,
"grad_norm": 0.08079613745212555,
"learning_rate": 9.249727357538171e-06,
"loss": 1.0755,
"step": 273
},
{
"epoch": 0.53,
"grad_norm": 0.07963809370994568,
"learning_rate": 9.187337316997475e-06,
"loss": 1.0371,
"step": 274
},
{
"epoch": 0.54,
"grad_norm": 0.07834453135728836,
"learning_rate": 9.124979103860374e-06,
"loss": 1.0512,
"step": 275
},
{
"epoch": 0.54,
"grad_norm": 0.07958018779754639,
"learning_rate": 9.062655160345587e-06,
"loss": 1.0395,
"step": 276
},
{
"epoch": 0.54,
"grad_norm": 0.08063136041164398,
"learning_rate": 9.000367927329691e-06,
"loss": 1.0417,
"step": 277
},
{
"epoch": 0.54,
"grad_norm": 0.07932613044977188,
"learning_rate": 8.938119844251507e-06,
"loss": 1.0223,
"step": 278
},
{
"epoch": 0.54,
"grad_norm": 0.08033180236816406,
"learning_rate": 8.87591334901659e-06,
"loss": 1.0549,
"step": 279
},
{
"epoch": 0.55,
"grad_norm": 0.07867812365293503,
"learning_rate": 8.813750877901723e-06,
"loss": 1.0559,
"step": 280
},
{
"epoch": 0.55,
"grad_norm": 0.07839823514223099,
"learning_rate": 8.751634865459518e-06,
"loss": 1.0506,
"step": 281
},
{
"epoch": 0.55,
"grad_norm": 0.08088871091604233,
"learning_rate": 8.68956774442306e-06,
"loss": 1.0387,
"step": 282
},
{
"epoch": 0.55,
"grad_norm": 0.07918152213096619,
"learning_rate": 8.627551945610641e-06,
"loss": 1.0213,
"step": 283
},
{
"epoch": 0.55,
"grad_norm": 0.08179888874292374,
"learning_rate": 8.565589897830543e-06,
"loss": 1.0328,
"step": 284
},
{
"epoch": 0.56,
"grad_norm": 0.07968099415302277,
"learning_rate": 8.503684027785929e-06,
"loss": 1.0358,
"step": 285
},
{
"epoch": 0.56,
"grad_norm": 0.07866982370615005,
"learning_rate": 8.441836759979796e-06,
"loss": 1.0278,
"step": 286
},
{
"epoch": 0.56,
"grad_norm": 0.08549748361110687,
"learning_rate": 8.380050516620026e-06,
"loss": 1.0062,
"step": 287
},
{
"epoch": 0.56,
"grad_norm": 0.07923188805580139,
"learning_rate": 8.31832771752451e-06,
"loss": 1.0566,
"step": 288
},
{
"epoch": 0.56,
"grad_norm": 0.07845155894756317,
"learning_rate": 8.256670780026393e-06,
"loss": 1.0425,
"step": 289
},
{
"epoch": 0.57,
"grad_norm": 0.08341726660728455,
"learning_rate": 8.195082118879397e-06,
"loss": 1.0269,
"step": 290
},
{
"epoch": 0.57,
"grad_norm": 0.09000853449106216,
"learning_rate": 8.133564146163232e-06,
"loss": 1.0602,
"step": 291
},
{
"epoch": 0.57,
"grad_norm": 0.07995503395795822,
"learning_rate": 8.072119271189155e-06,
"loss": 1.0283,
"step": 292
},
{
"epoch": 0.57,
"grad_norm": 0.08204926550388336,
"learning_rate": 8.01074990040559e-06,
"loss": 1.0397,
"step": 293
},
{
"epoch": 0.57,
"grad_norm": 0.08035868406295776,
"learning_rate": 7.949458437303892e-06,
"loss": 1.0455,
"step": 294
},
{
"epoch": 0.58,
"grad_norm": 0.08018406480550766,
"learning_rate": 7.888247282324212e-06,
"loss": 1.0796,
"step": 295
},
{
"epoch": 0.58,
"grad_norm": 0.07843944430351257,
"learning_rate": 7.827118832761487e-06,
"loss": 1.0541,
"step": 296
},
{
"epoch": 0.58,
"grad_norm": 0.07853015512228012,
"learning_rate": 7.766075482671544e-06,
"loss": 1.045,
"step": 297
},
{
"epoch": 0.58,
"grad_norm": 0.08274030685424805,
"learning_rate": 7.705119622777351e-06,
"loss": 1.0465,
"step": 298
},
{
"epoch": 0.58,
"grad_norm": 0.08550436049699783,
"learning_rate": 7.644253640375382e-06,
"loss": 0.9985,
"step": 299
},
{
"epoch": 0.58,
"grad_norm": 0.08202614635229111,
"learning_rate": 7.583479919242108e-06,
"loss": 1.039,
"step": 300
},
{
"epoch": 0.59,
"grad_norm": 0.08024229854345322,
"learning_rate": 7.522800839540656e-06,
"loss": 1.0158,
"step": 301
},
{
"epoch": 0.59,
"grad_norm": 0.08370032161474228,
"learning_rate": 7.462218777727581e-06,
"loss": 1.0143,
"step": 302
},
{
"epoch": 0.59,
"grad_norm": 0.0836290642619133,
"learning_rate": 7.4017361064597925e-06,
"loss": 1.0427,
"step": 303
},
{
"epoch": 0.59,
"grad_norm": 0.08153602480888367,
"learning_rate": 7.341355194501638e-06,
"loss": 1.0573,
"step": 304
},
{
"epoch": 0.59,
"grad_norm": 0.08701439201831818,
"learning_rate": 7.281078406632127e-06,
"loss": 1.0185,
"step": 305
},
{
"epoch": 0.6,
"grad_norm": 0.08054394274950027,
"learning_rate": 7.220908103552319e-06,
"loss": 1.0349,
"step": 306
},
{
"epoch": 0.6,
"grad_norm": 0.0855594277381897,
"learning_rate": 7.160846641792858e-06,
"loss": 1.0594,
"step": 307
},
{
"epoch": 0.6,
"grad_norm": 0.08319972455501556,
"learning_rate": 7.1008963736217e-06,
"loss": 1.0537,
"step": 308
},
{
"epoch": 0.6,
"grad_norm": 0.08150538802146912,
"learning_rate": 7.041059646951971e-06,
"loss": 1.025,
"step": 309
},
{
"epoch": 0.6,
"grad_norm": 0.07885453850030899,
"learning_rate": 6.981338805250015e-06,
"loss": 1.013,
"step": 310
},
{
"epoch": 0.61,
"grad_norm": 0.0796799287199974,
"learning_rate": 6.921736187443624e-06,
"loss": 1.0002,
"step": 311
},
{
"epoch": 0.61,
"grad_norm": 0.08254199475049973,
"learning_rate": 6.862254127830426e-06,
"loss": 1.076,
"step": 312
},
{
"epoch": 0.61,
"grad_norm": 0.08591365814208984,
"learning_rate": 6.802894955986459e-06,
"loss": 1.0464,
"step": 313
},
{
"epoch": 0.61,
"grad_norm": 0.08346603810787201,
"learning_rate": 6.74366099667495e-06,
"loss": 1.0497,
"step": 314
},
{
"epoch": 0.61,
"grad_norm": 0.0861155241727829,
"learning_rate": 6.684554569755258e-06,
"loss": 1.0272,
"step": 315
},
{
"epoch": 0.62,
"grad_norm": 0.08690168708562851,
"learning_rate": 6.625577990092019e-06,
"loss": 1.0239,
"step": 316
},
{
"epoch": 0.62,
"grad_norm": 0.08035359531641006,
"learning_rate": 6.56673356746448e-06,
"loss": 1.0224,
"step": 317
},
{
"epoch": 0.62,
"grad_norm": 0.08103054761886597,
"learning_rate": 6.508023606476052e-06,
"loss": 1.0484,
"step": 318
},
{
"epoch": 0.62,
"grad_norm": 0.08610061556100845,
"learning_rate": 6.44945040646404e-06,
"loss": 1.0345,
"step": 319
},
{
"epoch": 0.62,
"grad_norm": 0.07808943837881088,
"learning_rate": 6.39101626140959e-06,
"loss": 1.0312,
"step": 320
},
{
"epoch": 0.63,
"grad_norm": 0.08029762655496597,
"learning_rate": 6.3327234598478605e-06,
"loss": 1.0705,
"step": 321
},
{
"epoch": 0.63,
"grad_norm": 0.08351495862007141,
"learning_rate": 6.274574284778379e-06,
"loss": 1.0243,
"step": 322
},
{
"epoch": 0.63,
"grad_norm": 0.07923708111047745,
"learning_rate": 6.2165710135756365e-06,
"loss": 1.0663,
"step": 323
},
{
"epoch": 0.63,
"grad_norm": 0.0885549783706665,
"learning_rate": 6.158715917899892e-06,
"loss": 1.0464,
"step": 324
},
{
"epoch": 0.63,
"grad_norm": 0.08241602033376694,
"learning_rate": 6.10101126360821e-06,
"loss": 1.0619,
"step": 325
},
{
"epoch": 0.64,
"grad_norm": 0.08293651044368744,
"learning_rate": 6.043459310665716e-06,
"loss": 1.0406,
"step": 326
},
{
"epoch": 0.64,
"grad_norm": 0.08230554312467575,
"learning_rate": 5.986062313057084e-06,
"loss": 1.0268,
"step": 327
},
{
"epoch": 0.64,
"grad_norm": 0.08260442316532135,
"learning_rate": 5.928822518698263e-06,
"loss": 1.0341,
"step": 328
},
{
"epoch": 0.64,
"grad_norm": 0.08099180459976196,
"learning_rate": 5.871742169348447e-06,
"loss": 1.0127,
"step": 329
},
{
"epoch": 0.64,
"grad_norm": 0.08358913660049438,
"learning_rate": 5.81482350052226e-06,
"loss": 1.0125,
"step": 330
},
{
"epoch": 0.65,
"grad_norm": 0.07926523685455322,
"learning_rate": 5.758068741402223e-06,
"loss": 1.0362,
"step": 331
},
{
"epoch": 0.65,
"grad_norm": 0.0837349146604538,
"learning_rate": 5.701480114751432e-06,
"loss": 1.068,
"step": 332
},
{
"epoch": 0.65,
"grad_norm": 0.08267077058553696,
"learning_rate": 5.645059836826518e-06,
"loss": 1.025,
"step": 333
},
{
"epoch": 0.65,
"grad_norm": 0.07937329262495041,
"learning_rate": 5.588810117290843e-06,
"loss": 1.073,
"step": 334
},
{
"epoch": 0.65,
"grad_norm": 0.08118374645709991,
"learning_rate": 5.532733159127963e-06,
"loss": 1.0477,
"step": 335
},
{
"epoch": 0.66,
"grad_norm": 0.08420372009277344,
"learning_rate": 5.476831158555345e-06,
"loss": 1.0384,
"step": 336
},
{
"epoch": 0.66,
"grad_norm": 0.08201659470796585,
"learning_rate": 5.421106304938356e-06,
"loss": 1.0402,
"step": 337
},
{
"epoch": 0.66,
"grad_norm": 0.08588004112243652,
"learning_rate": 5.365560780704524e-06,
"loss": 1.0363,
"step": 338
},
{
"epoch": 0.66,
"grad_norm": 0.08382384479045868,
"learning_rate": 5.310196761258048e-06,
"loss": 1.0392,
"step": 339
},
{
"epoch": 0.66,
"grad_norm": 0.08232959359884262,
"learning_rate": 5.255016414894616e-06,
"loss": 1.0476,
"step": 340
},
{
"epoch": 0.66,
"grad_norm": 0.08402816951274872,
"learning_rate": 5.200021902716483e-06,
"loss": 1.0668,
"step": 341
},
{
"epoch": 0.67,
"grad_norm": 0.0849897488951683,
"learning_rate": 5.145215378547825e-06,
"loss": 1.0068,
"step": 342
},
{
"epoch": 0.67,
"grad_norm": 0.08432206511497498,
"learning_rate": 5.0905989888503924e-06,
"loss": 1.0815,
"step": 343
},
{
"epoch": 0.67,
"grad_norm": 0.0807095468044281,
"learning_rate": 5.0361748726394435e-06,
"loss": 1.058,
"step": 344
},
{
"epoch": 0.67,
"grad_norm": 0.0844447985291481,
"learning_rate": 4.981945161399969e-06,
"loss": 1.0134,
"step": 345
},
{
"epoch": 0.67,
"grad_norm": 0.08169819414615631,
"learning_rate": 4.927911979003214e-06,
"loss": 1.0486,
"step": 346
},
{
"epoch": 0.68,
"grad_norm": 0.08409695327281952,
"learning_rate": 4.874077441623504e-06,
"loss": 1.0711,
"step": 347
},
{
"epoch": 0.68,
"grad_norm": 0.08292841166257858,
"learning_rate": 4.82044365765536e-06,
"loss": 1.0311,
"step": 348
},
{
"epoch": 0.68,
"grad_norm": 0.084018275141716,
"learning_rate": 4.767012727630927e-06,
"loss": 1.0254,
"step": 349
},
{
"epoch": 0.68,
"grad_norm": 0.08370067924261093,
"learning_rate": 4.71378674413771e-06,
"loss": 1.0309,
"step": 350
},
{
"epoch": 0.68,
"grad_norm": 0.08273698389530182,
"learning_rate": 4.6607677917366155e-06,
"loss": 1.0068,
"step": 351
},
{
"epoch": 0.69,
"grad_norm": 0.0803639367222786,
"learning_rate": 4.607957946880305e-06,
"loss": 1.0414,
"step": 352
},
{
"epoch": 0.69,
"grad_norm": 0.08122535794973373,
"learning_rate": 4.55535927783189e-06,
"loss": 1.0474,
"step": 353
},
{
"epoch": 0.69,
"grad_norm": 0.07980665564537048,
"learning_rate": 4.502973844583914e-06,
"loss": 1.0281,
"step": 354
},
{
"epoch": 0.69,
"grad_norm": 0.08257856965065002,
"learning_rate": 4.450803698777684e-06,
"loss": 0.9825,
"step": 355
},
{
"epoch": 0.69,
"grad_norm": 0.0826905369758606,
"learning_rate": 4.398850883622905e-06,
"loss": 1.0168,
"step": 356
},
{
"epoch": 0.7,
"grad_norm": 0.0833047404885292,
"learning_rate": 4.347117433817687e-06,
"loss": 1.0481,
"step": 357
},
{
"epoch": 0.7,
"grad_norm": 0.0809812918305397,
"learning_rate": 4.295605375468818e-06,
"loss": 1.017,
"step": 358
},
{
"epoch": 0.7,
"grad_norm": 0.0812465101480484,
"learning_rate": 4.244316726012446e-06,
"loss": 1.0376,
"step": 359
},
{
"epoch": 0.7,
"grad_norm": 0.07954216003417969,
"learning_rate": 4.1932534941350545e-06,
"loss": 1.0491,
"step": 360
},
{
"epoch": 0.7,
"grad_norm": 0.08341410756111145,
"learning_rate": 4.142417679694794e-06,
"loss": 1.0556,
"step": 361
},
{
"epoch": 0.71,
"grad_norm": 0.08242359757423401,
"learning_rate": 4.091811273643157e-06,
"loss": 1.0573,
"step": 362
},
{
"epoch": 0.71,
"grad_norm": 0.08539704233407974,
"learning_rate": 4.041436257947015e-06,
"loss": 1.0458,
"step": 363
},
{
"epoch": 0.71,
"grad_norm": 0.08542854338884354,
"learning_rate": 3.991294605510969e-06,
"loss": 1.052,
"step": 364
},
{
"epoch": 0.71,
"grad_norm": 0.07968278974294662,
"learning_rate": 3.94138828010012e-06,
"loss": 1.0441,
"step": 365
},
{
"epoch": 0.71,
"grad_norm": 0.08242907375097275,
"learning_rate": 3.8917192362631285e-06,
"loss": 1.0321,
"step": 366
},
{
"epoch": 0.72,
"grad_norm": 0.08331603556871414,
"learning_rate": 3.842289419255681e-06,
"loss": 1.0296,
"step": 367
},
{
"epoch": 0.72,
"grad_norm": 0.08717033267021179,
"learning_rate": 3.793100764964299e-06,
"loss": 1.0797,
"step": 368
},
{
"epoch": 0.72,
"grad_norm": 0.08025430142879486,
"learning_rate": 3.744155199830526e-06,
"loss": 1.0322,
"step": 369
},
{
"epoch": 0.72,
"grad_norm": 0.0844465121626854,
"learning_rate": 3.69545464077548e-06,
"loss": 1.0334,
"step": 370
},
{
"epoch": 0.72,
"grad_norm": 0.08173581212759018,
"learning_rate": 3.647000995124763e-06,
"loss": 1.062,
"step": 371
},
{
"epoch": 0.73,
"grad_norm": 0.08682363480329514,
"learning_rate": 3.5987961605337894e-06,
"loss": 1.0115,
"step": 372
},
{
"epoch": 0.73,
"grad_norm": 0.09901048988103867,
"learning_rate": 3.5508420249134432e-06,
"loss": 1.0422,
"step": 373
},
{
"epoch": 0.73,
"grad_norm": 0.0846165269613266,
"learning_rate": 3.5031404663561507e-06,
"loss": 1.0187,
"step": 374
},
{
"epoch": 0.73,
"grad_norm": 0.08063055574893951,
"learning_rate": 3.4556933530623193e-06,
"loss": 1.0233,
"step": 375
},
{
"epoch": 0.73,
"grad_norm": 0.08191438019275665,
"learning_rate": 3.4085025432671746e-06,
"loss": 1.0758,
"step": 376
},
{
"epoch": 0.74,
"grad_norm": 0.08018861711025238,
"learning_rate": 3.3615698851679866e-06,
"loss": 1.0546,
"step": 377
},
{
"epoch": 0.74,
"grad_norm": 0.08329375833272934,
"learning_rate": 3.3148972168516737e-06,
"loss": 1.0403,
"step": 378
},
{
"epoch": 0.74,
"grad_norm": 0.08185313642024994,
"learning_rate": 3.2684863662228307e-06,
"loss": 1.0272,
"step": 379
},
{
"epoch": 0.74,
"grad_norm": 0.08924821019172668,
"learning_rate": 3.2223391509321335e-06,
"loss": 1.0447,
"step": 380
},
{
"epoch": 0.74,
"grad_norm": 0.08294879645109177,
"learning_rate": 3.176457378305151e-06,
"loss": 1.004,
"step": 381
},
{
"epoch": 0.74,
"grad_norm": 0.08492495864629745,
"learning_rate": 3.1308428452715643e-06,
"loss": 1.0479,
"step": 382
},
{
"epoch": 0.75,
"grad_norm": 0.08171503990888596,
"learning_rate": 3.0854973382947884e-06,
"loss": 0.9821,
"step": 383
},
{
"epoch": 0.75,
"grad_norm": 0.08578088134527206,
"learning_rate": 3.0404226333020117e-06,
"loss": 1.0402,
"step": 384
},
{
"epoch": 0.75,
"eval_loss": 1.0385833978652954,
"eval_runtime": 708.3771,
"eval_samples_per_second": 7.163,
"eval_steps_per_second": 1.791,
"step": 384
},
{
"epoch": 0.75,
"grad_norm": 0.0851321890950203,
"learning_rate": 2.995620495614633e-06,
"loss": 1.0501,
"step": 385
},
{
"epoch": 0.75,
"grad_norm": 0.08435554802417755,
"learning_rate": 2.951092679879136e-06,
"loss": 1.0222,
"step": 386
},
{
"epoch": 0.75,
"grad_norm": 0.08391427248716354,
"learning_rate": 2.9068409299983634e-06,
"loss": 1.0561,
"step": 387
},
{
"epoch": 0.76,
"grad_norm": 0.08458521217107773,
"learning_rate": 2.862866979063219e-06,
"loss": 1.0279,
"step": 388
},
{
"epoch": 0.76,
"grad_norm": 0.08274800330400467,
"learning_rate": 2.8191725492847923e-06,
"loss": 1.0497,
"step": 389
},
{
"epoch": 0.76,
"grad_norm": 0.0817330926656723,
"learning_rate": 2.7757593519269088e-06,
"loss": 1.0518,
"step": 390
},
{
"epoch": 0.76,
"grad_norm": 0.08396860957145691,
"learning_rate": 2.732629087239106e-06,
"loss": 1.0205,
"step": 391
},
{
"epoch": 0.76,
"grad_norm": 0.0885666087269783,
"learning_rate": 2.689783444390053e-06,
"loss": 1.0322,
"step": 392
},
{
"epoch": 0.77,
"grad_norm": 0.0838237777352333,
"learning_rate": 2.647224101401389e-06,
"loss": 1.0457,
"step": 393
},
{
"epoch": 0.77,
"grad_norm": 0.08539842069149017,
"learning_rate": 2.604952725082005e-06,
"loss": 1.0324,
"step": 394
},
{
"epoch": 0.77,
"grad_norm": 0.08621538430452347,
"learning_rate": 2.562970970962768e-06,
"loss": 1.0304,
"step": 395
},
{
"epoch": 0.77,
"grad_norm": 0.08441948890686035,
"learning_rate": 2.5212804832316783e-06,
"loss": 1.0347,
"step": 396
},
{
"epoch": 0.77,
"grad_norm": 0.08076242357492447,
"learning_rate": 2.479882894669481e-06,
"loss": 1.0287,
"step": 397
},
{
"epoch": 0.78,
"grad_norm": 0.08226581662893295,
"learning_rate": 2.4387798265857078e-06,
"loss": 1.0543,
"step": 398
},
{
"epoch": 0.78,
"grad_norm": 0.08637404441833496,
"learning_rate": 2.397972888755197e-06,
"loss": 1.0628,
"step": 399
},
{
"epoch": 0.78,
"grad_norm": 0.086219921708107,
"learning_rate": 2.3574636793550376e-06,
"loss": 1.0006,
"step": 400
},
{
"epoch": 0.78,
"grad_norm": 0.08456084877252579,
"learning_rate": 2.317253784901976e-06,
"loss": 0.9901,
"step": 401
},
{
"epoch": 0.78,
"grad_norm": 0.08231404423713684,
"learning_rate": 2.277344780190286e-06,
"loss": 1.0143,
"step": 402
},
{
"epoch": 0.79,
"grad_norm": 0.0869838073849678,
"learning_rate": 2.237738228230091e-06,
"loss": 1.0385,
"step": 403
},
{
"epoch": 0.79,
"grad_norm": 0.0852610394358635,
"learning_rate": 2.1984356801861506e-06,
"loss": 1.0513,
"step": 404
},
{
"epoch": 0.79,
"grad_norm": 0.08569534868001938,
"learning_rate": 2.1594386753171035e-06,
"loss": 1.0364,
"step": 405
},
{
"epoch": 0.79,
"grad_norm": 0.08554627001285553,
"learning_rate": 2.1207487409151984e-06,
"loss": 1.0468,
"step": 406
},
{
"epoch": 0.79,
"grad_norm": 0.08350294083356857,
"learning_rate": 2.0823673922464625e-06,
"loss": 1.0184,
"step": 407
},
{
"epoch": 0.8,
"grad_norm": 0.08463279157876968,
"learning_rate": 2.044296132491369e-06,
"loss": 1.0686,
"step": 408
},
{
"epoch": 0.8,
"grad_norm": 0.08454636484384537,
"learning_rate": 2.0065364526859576e-06,
"loss": 1.043,
"step": 409
},
{
"epoch": 0.8,
"grad_norm": 0.08581449091434479,
"learning_rate": 1.969089831663443e-06,
"loss": 1.0257,
"step": 410
},
{
"epoch": 0.8,
"grad_norm": 0.08445479720830917,
"learning_rate": 1.931957735996304e-06,
"loss": 1.0024,
"step": 411
},
{
"epoch": 0.8,
"grad_norm": 0.08243846148252487,
"learning_rate": 1.895141619938825e-06,
"loss": 1.0211,
"step": 412
},
{
"epoch": 0.81,
"grad_norm": 0.0819249302148819,
"learning_rate": 1.8586429253701676e-06,
"loss": 1.0421,
"step": 413
},
{
"epoch": 0.81,
"grad_norm": 0.0827661082148552,
"learning_rate": 1.822463081737883e-06,
"loss": 1.0343,
"step": 414
},
{
"epoch": 0.81,
"grad_norm": 0.0872969999909401,
"learning_rate": 1.7866035060019338e-06,
"loss": 1.0225,
"step": 415
},
{
"epoch": 0.81,
"grad_norm": 0.08620977401733398,
"learning_rate": 1.7510656025792005e-06,
"loss": 1.0102,
"step": 416
},
{
"epoch": 0.81,
"grad_norm": 0.08344931900501251,
"learning_rate": 1.7158507632884801e-06,
"loss": 1.0812,
"step": 417
},
{
"epoch": 0.82,
"grad_norm": 0.08682362735271454,
"learning_rate": 1.6809603672959618e-06,
"loss": 1.091,
"step": 418
},
{
"epoch": 0.82,
"grad_norm": 0.08464641869068146,
"learning_rate": 1.6463957810612408e-06,
"loss": 1.042,
"step": 419
},
{
"epoch": 0.82,
"grad_norm": 0.08458367735147476,
"learning_rate": 1.6121583582837773e-06,
"loss": 1.0451,
"step": 420
},
{
"epoch": 0.82,
"grad_norm": 0.08101867139339447,
"learning_rate": 1.5782494398498882e-06,
"loss": 0.9999,
"step": 421
},
{
"epoch": 0.82,
"grad_norm": 0.08441881835460663,
"learning_rate": 1.5446703537802344e-06,
"loss": 1.094,
"step": 422
},
{
"epoch": 0.82,
"grad_norm": 0.08229435980319977,
"learning_rate": 1.5114224151778068e-06,
"loss": 1.0322,
"step": 423
},
{
"epoch": 0.83,
"grad_norm": 0.0848761573433876,
"learning_rate": 1.4785069261764184e-06,
"loss": 1.051,
"step": 424
},
{
"epoch": 0.83,
"grad_norm": 0.08010521531105042,
"learning_rate": 1.4459251758897153e-06,
"loss": 1.0484,
"step": 425
},
{
"epoch": 0.83,
"grad_norm": 0.08490080386400223,
"learning_rate": 1.413678440360684e-06,
"loss": 1.0518,
"step": 426
},
{
"epoch": 0.83,
"grad_norm": 0.08818584680557251,
"learning_rate": 1.3817679825116748e-06,
"loss": 1.0199,
"step": 427
},
{
"epoch": 0.83,
"grad_norm": 0.08122722804546356,
"learning_rate": 1.3501950520949436e-06,
"loss": 1.0309,
"step": 428
},
{
"epoch": 0.84,
"grad_norm": 0.08668461441993713,
"learning_rate": 1.3189608856437053e-06,
"loss": 1.0309,
"step": 429
},
{
"epoch": 0.84,
"grad_norm": 0.08044726401567459,
"learning_rate": 1.2880667064237006e-06,
"loss": 1.025,
"step": 430
},
{
"epoch": 0.84,
"grad_norm": 0.08157233893871307,
"learning_rate": 1.2575137243852965e-06,
"loss": 1.0674,
"step": 431
},
{
"epoch": 0.84,
"grad_norm": 0.0853031650185585,
"learning_rate": 1.2273031361160958e-06,
"loss": 1.0461,
"step": 432
},
{
"epoch": 0.84,
"grad_norm": 0.08920850604772568,
"learning_rate": 1.1974361247940702e-06,
"loss": 1.0559,
"step": 433
},
{
"epoch": 0.85,
"grad_norm": 0.08126967400312424,
"learning_rate": 1.1679138601412253e-06,
"loss": 1.0144,
"step": 434
},
{
"epoch": 0.85,
"grad_norm": 0.08786173164844513,
"learning_rate": 1.1387374983777888e-06,
"loss": 1.0611,
"step": 435
},
{
"epoch": 0.85,
"grad_norm": 0.08111479878425598,
"learning_rate": 1.1099081821769297e-06,
"loss": 1.0329,
"step": 436
},
{
"epoch": 0.85,
"grad_norm": 0.08781804144382477,
"learning_rate": 1.0814270406199967e-06,
"loss": 1.0505,
"step": 437
},
{
"epoch": 0.85,
"grad_norm": 0.08612725883722305,
"learning_rate": 1.0532951891523124e-06,
"loss": 1.0203,
"step": 438
},
{
"epoch": 0.86,
"grad_norm": 0.08277811855077744,
"learning_rate": 1.0255137295394813e-06,
"loss": 1.033,
"step": 439
},
{
"epoch": 0.86,
"grad_norm": 0.0849086195230484,
"learning_rate": 9.980837498242357e-07,
"loss": 1.0127,
"step": 440
},
{
"epoch": 0.86,
"grad_norm": 0.08069565892219543,
"learning_rate": 9.710063242838286e-07,
"loss": 1.0368,
"step": 441
},
{
"epoch": 0.86,
"grad_norm": 0.08577218651771545,
"learning_rate": 9.442825133879608e-07,
"loss": 1.03,
"step": 442
},
{
"epoch": 0.86,
"grad_norm": 0.08074500411748886,
"learning_rate": 9.179133637572457e-07,
"loss": 1.0344,
"step": 443
},
{
"epoch": 0.87,
"grad_norm": 0.08309192210435867,
"learning_rate": 8.918999081222157e-07,
"loss": 1.0128,
"step": 444
},
{
"epoch": 0.87,
"grad_norm": 0.08286837488412857,
"learning_rate": 8.66243165282884e-07,
"loss": 1.0241,
"step": 445
},
{
"epoch": 0.87,
"grad_norm": 0.0842629224061966,
"learning_rate": 8.409441400688401e-07,
"loss": 1.0243,
"step": 446
},
{
"epoch": 0.87,
"grad_norm": 0.07928766310214996,
"learning_rate": 8.160038232998935e-07,
"loss": 1.0177,
"step": 447
},
{
"epoch": 0.87,
"grad_norm": 0.08503233641386032,
"learning_rate": 7.914231917472748e-07,
"loss": 0.999,
"step": 448
},
{
"epoch": 0.88,
"grad_norm": 0.08141094446182251,
"learning_rate": 7.672032080953751e-07,
"loss": 1.0414,
"step": 449
},
{
"epoch": 0.88,
"grad_norm": 0.08452077209949493,
"learning_rate": 7.433448209040495e-07,
"loss": 1.0363,
"step": 450
},
{
"epoch": 0.88,
"grad_norm": 0.08524196594953537,
"learning_rate": 7.198489645714579e-07,
"loss": 1.0299,
"step": 451
},
{
"epoch": 0.88,
"grad_norm": 0.08539626747369766,
"learning_rate": 6.96716559297479e-07,
"loss": 1.0349,
"step": 452
},
{
"epoch": 0.88,
"grad_norm": 0.08527474850416183,
"learning_rate": 6.739485110476707e-07,
"loss": 1.0227,
"step": 453
},
{
"epoch": 0.89,
"grad_norm": 0.08794491738080978,
"learning_rate": 6.515457115177804e-07,
"loss": 1.0193,
"step": 454
},
{
"epoch": 0.89,
"grad_norm": 0.08151602745056152,
"learning_rate": 6.295090380988323e-07,
"loss": 1.0322,
"step": 455
},
{
"epoch": 0.89,
"grad_norm": 0.08208628743886948,
"learning_rate": 6.078393538427574e-07,
"loss": 1.0048,
"step": 456
},
{
"epoch": 0.89,
"grad_norm": 0.08371996879577637,
"learning_rate": 5.865375074286006e-07,
"loss": 1.0268,
"step": 457
},
{
"epoch": 0.89,
"grad_norm": 0.08067172765731812,
"learning_rate": 5.656043331292682e-07,
"loss": 1.0198,
"step": 458
},
{
"epoch": 0.9,
"grad_norm": 0.08308243006467819,
"learning_rate": 5.45040650778873e-07,
"loss": 1.0508,
"step": 459
},
{
"epoch": 0.9,
"grad_norm": 0.08296984434127808,
"learning_rate": 5.248472657406123e-07,
"loss": 1.038,
"step": 460
},
{
"epoch": 0.9,
"grad_norm": 0.08200722932815552,
"learning_rate": 5.050249688752329e-07,
"loss": 1.0326,
"step": 461
},
{
"epoch": 0.9,
"grad_norm": 0.08149310201406479,
"learning_rate": 4.855745365100539e-07,
"loss": 1.0299,
"step": 462
},
{
"epoch": 0.9,
"grad_norm": 0.0821991041302681,
"learning_rate": 4.664967304085655e-07,
"loss": 1.0455,
"step": 463
},
{
"epoch": 0.9,
"grad_norm": 0.08251185715198517,
"learning_rate": 4.477922977405913e-07,
"loss": 1.0434,
"step": 464
},
{
"epoch": 0.91,
"grad_norm": 0.08696113526821136,
"learning_rate": 4.29461971053029e-07,
"loss": 1.0123,
"step": 465
},
{
"epoch": 0.91,
"grad_norm": 0.08152768760919571,
"learning_rate": 4.115064682411607e-07,
"loss": 0.9981,
"step": 466
},
{
"epoch": 0.91,
"grad_norm": 0.08121586591005325,
"learning_rate": 3.939264925205355e-07,
"loss": 1.0169,
"step": 467
},
{
"epoch": 0.91,
"grad_norm": 0.08272110670804977,
"learning_rate": 3.7672273239942936e-07,
"loss": 1.0872,
"step": 468
},
{
"epoch": 0.91,
"grad_norm": 0.08001048117876053,
"learning_rate": 3.5989586165187884e-07,
"loss": 1.0012,
"step": 469
},
{
"epoch": 0.92,
"grad_norm": 0.08298242092132568,
"learning_rate": 3.4344653929129554e-07,
"loss": 1.0136,
"step": 470
},
{
"epoch": 0.92,
"grad_norm": 0.0829276293516159,
"learning_rate": 3.2737540954465244e-07,
"loss": 1.0279,
"step": 471
},
{
"epoch": 0.92,
"grad_norm": 0.0854070633649826,
"learning_rate": 3.1168310182725814e-07,
"loss": 1.0709,
"step": 472
},
{
"epoch": 0.92,
"grad_norm": 0.08476186543703079,
"learning_rate": 2.9637023071810155e-07,
"loss": 1.0499,
"step": 473
},
{
"epoch": 0.92,
"grad_norm": 0.08046039938926697,
"learning_rate": 2.8143739593578854e-07,
"loss": 1.0322,
"step": 474
},
{
"epoch": 0.93,
"grad_norm": 0.08418085426092148,
"learning_rate": 2.6688518231504535e-07,
"loss": 1.0202,
"step": 475
},
{
"epoch": 0.93,
"grad_norm": 0.08194521069526672,
"learning_rate": 2.527141597838212e-07,
"loss": 1.0059,
"step": 476
},
{
"epoch": 0.93,
"grad_norm": 0.08417163044214249,
"learning_rate": 2.389248833409663e-07,
"loss": 1.0563,
"step": 477
},
{
"epoch": 0.93,
"grad_norm": 0.08419531583786011,
"learning_rate": 2.2551789303449034e-07,
"loss": 1.0099,
"step": 478
},
{
"epoch": 0.93,
"grad_norm": 0.08371371775865555,
"learning_rate": 2.124937139404204e-07,
"loss": 1.0209,
"step": 479
},
{
"epoch": 0.94,
"grad_norm": 0.08230051398277283,
"learning_rate": 1.998528561422297e-07,
"loss": 1.0551,
"step": 480
},
{
"epoch": 0.94,
"grad_norm": 0.08176656812429428,
"learning_rate": 1.8759581471086363e-07,
"loss": 1.0296,
"step": 481
},
{
"epoch": 0.94,
"grad_norm": 0.08562692999839783,
"learning_rate": 1.757230696853518e-07,
"loss": 1.0499,
"step": 482
},
{
"epoch": 0.94,
"grad_norm": 0.08376387506723404,
"learning_rate": 1.6423508605400318e-07,
"loss": 1.04,
"step": 483
},
{
"epoch": 0.94,
"grad_norm": 0.08506414294242859,
"learning_rate": 1.5313231373619953e-07,
"loss": 1.0253,
"step": 484
},
{
"epoch": 0.95,
"grad_norm": 0.08413960039615631,
"learning_rate": 1.424151875647717e-07,
"loss": 1.021,
"step": 485
},
{
"epoch": 0.95,
"grad_norm": 0.08077908307313919,
"learning_rate": 1.3208412726897324e-07,
"loss": 1.0217,
"step": 486
},
{
"epoch": 0.95,
"grad_norm": 0.08394316583871841,
"learning_rate": 1.2213953745803587e-07,
"loss": 0.9818,
"step": 487
},
{
"epoch": 0.95,
"grad_norm": 0.08442597091197968,
"learning_rate": 1.1258180760533089e-07,
"loss": 1.0122,
"step": 488
},
{
"epoch": 0.95,
"grad_norm": 0.08355535566806793,
"learning_rate": 1.0341131203311039e-07,
"loss": 1.0422,
"step": 489
},
{
"epoch": 0.96,
"grad_norm": 0.08156078308820724,
"learning_rate": 9.462840989784671e-08,
"loss": 1.0425,
"step": 490
},
{
"epoch": 0.96,
"grad_norm": 0.08547282218933105,
"learning_rate": 8.62334451761715e-08,
"loss": 1.0479,
"step": 491
},
{
"epoch": 0.96,
"grad_norm": 0.08260347694158554,
"learning_rate": 7.822674665139751e-08,
"loss": 1.0144,
"step": 492
},
{
"epoch": 0.96,
"grad_norm": 0.08177413791418076,
"learning_rate": 7.060862790064793e-08,
"loss": 1.0365,
"step": 493
},
{
"epoch": 0.96,
"grad_norm": 0.08669156581163406,
"learning_rate": 6.337938728257054e-08,
"loss": 1.0332,
"step": 494
},
{
"epoch": 0.97,
"grad_norm": 0.08690541237592697,
"learning_rate": 5.653930792565821e-08,
"loss": 1.0179,
"step": 495
},
{
"epoch": 0.97,
"grad_norm": 0.085110604763031,
"learning_rate": 5.008865771715221e-08,
"loss": 1.0572,
"step": 496
},
{
"epoch": 0.97,
"grad_norm": 0.08295170217752457,
"learning_rate": 4.4027689292560626e-08,
"loss": 1.0266,
"step": 497
},
{
"epoch": 0.97,
"grad_norm": 0.07970304042100906,
"learning_rate": 3.83566400257529e-08,
"loss": 1.0388,
"step": 498
},
{
"epoch": 0.97,
"grad_norm": 0.08482901751995087,
"learning_rate": 3.3075732019675065e-08,
"loss": 1.0551,
"step": 499
},
{
"epoch": 0.97,
"grad_norm": 0.0838318020105362,
"learning_rate": 2.8185172097641156e-08,
"loss": 1.075,
"step": 500
},
{
"epoch": 0.98,
"grad_norm": 0.08117860555648804,
"learning_rate": 2.368515179523967e-08,
"loss": 1.0418,
"step": 501
},
{
"epoch": 0.98,
"grad_norm": 0.08278267085552216,
"learning_rate": 1.957584735282847e-08,
"loss": 1.0176,
"step": 502
},
{
"epoch": 0.98,
"grad_norm": 0.08319726586341858,
"learning_rate": 1.5857419708633636e-08,
"loss": 1.0229,
"step": 503
},
{
"epoch": 0.98,
"grad_norm": 0.08518896996974945,
"learning_rate": 1.253001449244673e-08,
"loss": 1.0313,
"step": 504
},
{
"epoch": 0.98,
"grad_norm": 0.08542507141828537,
"learning_rate": 9.593762019922681e-09,
"loss": 1.0703,
"step": 505
},
{
"epoch": 0.99,
"grad_norm": 0.08333377540111542,
"learning_rate": 7.048777287472774e-09,
"loss": 1.0058,
"step": 506
},
{
"epoch": 0.99,
"grad_norm": 0.09070394933223724,
"learning_rate": 4.895159967762686e-09,
"loss": 1.0059,
"step": 507
},
{
"epoch": 0.99,
"grad_norm": 0.0817839726805687,
"learning_rate": 3.132994405808942e-09,
"loss": 1.0301,
"step": 508
},
{
"epoch": 0.99,
"grad_norm": 0.08134960383176804,
"learning_rate": 1.7623496156771169e-09,
"loss": 1.0419,
"step": 509
},
{
"epoch": 0.99,
"grad_norm": 0.08472370356321335,
"learning_rate": 7.832792777739962e-10,
"loss": 1.0253,
"step": 510
},
{
"epoch": 1.0,
"grad_norm": 0.08445168286561966,
"learning_rate": 1.958217367514781e-10,
"loss": 1.0475,
"step": 511
},
{
"epoch": 1.0,
"grad_norm": 0.08768552541732788,
"learning_rate": 0.0,
"loss": 1.0703,
"step": 512
},
{
"epoch": 1.0,
"eval_loss": 1.0364834070205688,
"eval_runtime": 707.9527,
"eval_samples_per_second": 7.167,
"eval_steps_per_second": 1.792,
"step": 512
}
],
"logging_steps": 1,
"max_steps": 512,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 256,
"total_flos": 4.525540736237568e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}