kanishka committed
Commit b0faeae · verified · 1 Parent(s): 9b95c22

End of training

Files changed (5)
  1. README.md +14 -2
  2. all_results.json +16 -0
  3. eval_results.json +10 -0
  4. train_results.json +9 -0
  5. trainer_state.json +2392 -0
README.md CHANGED
@@ -2,11 +2,23 @@
 library_name: transformers
 tags:
 - generated_from_trainer
+datasets:
+- kanishka/babylm2-clean-spacy
 metrics:
 - accuracy
 model-index:
 - name: opt-babylm2-clean-spacy-32k_seed-42_3e-4
-  results: []
+  results:
+  - task:
+      name: Causal Language Modeling
+      type: text-generation
+    dataset:
+      name: kanishka/babylm2-clean-spacy
+      type: kanishka/babylm2-clean-spacy
+    metrics:
+    - name: Accuracy
+      type: accuracy
+      value: 0.4234014597448438
 ---
 
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -14,7 +26,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # opt-babylm2-clean-spacy-32k_seed-42_3e-4
 
-This model was trained from scratch on an unknown dataset.
+This model was trained from scratch on the kanishka/babylm2-clean-spacy dataset.
 It achieves the following results on the evaluation set:
 - Loss: 3.0190
 - Accuracy: 0.4234
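The Accuracy value in the card is the Trainer's token-level next-token accuracy: the fraction of positions where the argmax of the model's logits matches the label at the next position. A minimal sketch of that metric, using hypothetical toy arrays rather than the actual evaluation code:

```python
def next_token_accuracy(logits, labels):
    """Token-level accuracy for a causal LM.

    logits: list of per-position score lists (seq_len x vocab).
    labels: list of token ids (seq_len).
    Predictions at position i are compared to the label at i+1,
    mirroring the shift applied when evaluating causal LMs.
    """
    # argmax over the vocab dimension, dropping the last position
    preds = [max(range(len(row)), key=row.__getitem__) for row in logits[:-1]]
    targets = labels[1:]  # drop the first token (it has no preceding context)
    correct = sum(p == t for p, t in zip(preds, targets))
    return correct / len(targets)

# Toy example (hypothetical values, not from the real eval set)
logits = [[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]]
labels = [1, 1, 0]
# preds over the first two positions: [1, 0]; shifted targets: [1, 0]
print(next_token_accuracy(logits, labels))  # -> 1.0
```

A reported accuracy of 0.4234 therefore means roughly 42% of next tokens in the eval set were predicted exactly.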
all_results.json ADDED
@@ -0,0 +1,16 @@
+{
+  "epoch": 20.0,
+  "eval_accuracy": 0.4234014597448438,
+  "eval_loss": 3.0190186500549316,
+  "eval_runtime": 111.5107,
+  "eval_samples": 52440,
+  "eval_samples_per_second": 470.269,
+  "eval_steps_per_second": 7.354,
+  "perplexity": 20.47119242004321,
+  "total_flos": 1.29957250203648e+18,
+  "train_loss": 2.7038125842232534,
+  "train_runtime": 44110.0674,
+  "train_samples": 497364,
+  "train_samples_per_second": 225.51,
+  "train_steps_per_second": 7.047
+}
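The derived numbers in all_results.json are internally consistent: perplexity is exp(eval_loss), and the throughput figures are samples (or steps) divided by runtime. A quick sanity check, with the values copied from the file above:

```python
import math

# Perplexity is the exponential of the evaluation cross-entropy loss.
eval_loss = 3.0190186500549316
perplexity = math.exp(eval_loss)  # matches "perplexity": 20.47119242004321

# Training throughput: samples per epoch times epochs, over total runtime.
train_samples = 497364
epochs = 20.0
train_runtime = 44110.0674
train_sps = train_samples * epochs / train_runtime  # ~225.51 samples/second

# Evaluation throughput: eval samples over eval runtime.
eval_samples = 52440
eval_runtime = 111.5107
eval_sps = eval_samples / eval_runtime  # ~470.27 samples/second
```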
eval_results.json ADDED
@@ -0,0 +1,10 @@
+{
+  "epoch": 20.0,
+  "eval_accuracy": 0.4234014597448438,
+  "eval_loss": 3.0190186500549316,
+  "eval_runtime": 111.5107,
+  "eval_samples": 52440,
+  "eval_samples_per_second": 470.269,
+  "eval_steps_per_second": 7.354,
+  "perplexity": 20.47119242004321
+}
train_results.json ADDED
@@ -0,0 +1,9 @@
+{
+  "epoch": 20.0,
+  "total_flos": 1.29957250203648e+18,
+  "train_loss": 2.7038125842232534,
+  "train_runtime": 44110.0674,
+  "train_samples": 497364,
+  "train_samples_per_second": 225.51,
+  "train_steps_per_second": 7.047
+}
trainer_state.json ADDED
@@ -0,0 +1,2392 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 20.0,
+  "eval_steps": 500,
+  "global_step": 310860,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.06433764395547835,
+      "grad_norm": 0.9469536542892456,
+      "learning_rate": 9.375e-06,
+      "loss": 7.0597,
+      "step": 1000
+    },
+    {
+      "epoch": 0.1286752879109567,
+      "grad_norm": 0.9877486824989319,
+      "learning_rate": 1.875e-05,
+      "loss": 4.8574,
+      "step": 2000
+    },
+    {
+      "epoch": 0.19301293186643506,
+      "grad_norm": 1.1811100244522095,
+      "learning_rate": 2.8125e-05,
+      "loss": 4.5464,
+      "step": 3000
+    },
+    {
+      "epoch": 0.2573505758219134,
+      "grad_norm": 1.155553936958313,
+      "learning_rate": 3.75e-05,
+      "loss": 4.3086,
+      "step": 4000
+    },
+    {
+      "epoch": 0.32168821977739176,
+      "grad_norm": 1.0062898397445679,
+      "learning_rate": 4.6874999999999994e-05,
+      "loss": 4.1307,
+      "step": 5000
+    },
+    {
+      "epoch": 0.3860258637328701,
+      "grad_norm": 0.9749443531036377,
+      "learning_rate": 5.625e-05,
+      "loss": 3.986,
+      "step": 6000
+    },
+    {
+      "epoch": 0.45036350768834843,
+      "grad_norm": 0.9870838522911072,
+      "learning_rate": 6.5625e-05,
+      "loss": 3.8708,
+      "step": 7000
+    },
+    {
+      "epoch": 0.5147011516438268,
+      "grad_norm": 1.0740858316421509,
+      "learning_rate": 7.5e-05,
+      "loss": 3.781,
+      "step": 8000
+    },
+    {
+      "epoch": 0.5790387955993052,
+      "grad_norm": 0.969571053981781,
+      "learning_rate": 8.437499999999999e-05,
+      "loss": 3.6942,
+      "step": 9000
+    },
+    {
+      "epoch": 0.6433764395547835,
+      "grad_norm": 0.923062801361084,
+      "learning_rate": 9.374999999999999e-05,
+      "loss": 3.6225,
+      "step": 10000
+    },
+    {
+      "epoch": 0.7077140835102619,
+      "grad_norm": 0.87486732006073,
+      "learning_rate": 0.00010312499999999999,
+      "loss": 3.5667,
+      "step": 11000
+    },
+    {
+      "epoch": 0.7720517274657402,
+      "grad_norm": 0.8343172073364258,
+      "learning_rate": 0.000112490625,
+      "loss": 3.5107,
+      "step": 12000
+    },
+    {
+      "epoch": 0.8363893714212186,
+      "grad_norm": 0.8089198470115662,
+      "learning_rate": 0.000121865625,
+      "loss": 3.4681,
+      "step": 13000
+    },
+    {
+      "epoch": 0.9007270153766969,
+      "grad_norm": 0.8141182661056519,
+      "learning_rate": 0.00013123125,
+      "loss": 3.4337,
+      "step": 14000
+    },
+    {
+      "epoch": 0.9650646593321752,
+      "grad_norm": 0.7596079707145691,
+      "learning_rate": 0.00014060625,
+      "loss": 3.3944,
+      "step": 15000
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.37339323372369543,
+      "eval_loss": 3.4211647510528564,
+      "eval_runtime": 112.5567,
+      "eval_samples_per_second": 465.898,
+      "eval_steps_per_second": 7.285,
+      "step": 15543
+    },
+    {
+      "epoch": 1.0294023032876536,
+      "grad_norm": 0.7583508491516113,
+      "learning_rate": 0.000149971875,
+      "loss": 3.345,
+      "step": 16000
+    },
+    {
+      "epoch": 1.093739947243132,
+      "grad_norm": 0.7395954728126526,
+      "learning_rate": 0.00015933749999999996,
+      "loss": 3.3182,
+      "step": 17000
+    },
+    {
+      "epoch": 1.1580775911986103,
+      "grad_norm": 0.7119142413139343,
+      "learning_rate": 0.00016871249999999996,
+      "loss": 3.304,
+      "step": 18000
+    },
+    {
+      "epoch": 1.2224152351540887,
+      "grad_norm": 0.7133814692497253,
+      "learning_rate": 0.00017808749999999999,
+      "loss": 3.2808,
+      "step": 19000
+    },
+    {
+      "epoch": 1.286752879109567,
+      "grad_norm": 0.6662284731864929,
+      "learning_rate": 0.00018745312499999998,
+      "loss": 3.2624,
+      "step": 20000
+    },
+    {
+      "epoch": 1.3510905230650454,
+      "grad_norm": 0.6821054816246033,
+      "learning_rate": 0.00019682812499999998,
+      "loss": 3.2468,
+      "step": 21000
+    },
+    {
+      "epoch": 1.4154281670205238,
+      "grad_norm": 0.6423399448394775,
+      "learning_rate": 0.00020619374999999998,
+      "loss": 3.2323,
+      "step": 22000
+    },
+    {
+      "epoch": 1.4797658109760021,
+      "grad_norm": 0.6489351987838745,
+      "learning_rate": 0.00021556874999999998,
+      "loss": 3.218,
+      "step": 23000
+    },
+    {
+      "epoch": 1.5441034549314803,
+      "grad_norm": 0.6388360261917114,
+      "learning_rate": 0.00022493437499999998,
+      "loss": 3.2063,
+      "step": 24000
+    },
+    {
+      "epoch": 1.6084410988869586,
+      "grad_norm": 0.6035541296005249,
+      "learning_rate": 0.00023430937499999997,
+      "loss": 3.1971,
+      "step": 25000
+    },
+    {
+      "epoch": 1.672778742842437,
+      "grad_norm": 0.5949345231056213,
+      "learning_rate": 0.00024367499999999997,
+      "loss": 3.1683,
+      "step": 26000
+    },
+    {
+      "epoch": 1.7371163867979154,
+      "grad_norm": 0.5953760147094727,
+      "learning_rate": 0.00025305,
+      "loss": 3.1728,
+      "step": 27000
+    },
+    {
+      "epoch": 1.8014540307533937,
+      "grad_norm": 0.5276063680648804,
+      "learning_rate": 0.000262415625,
+      "loss": 3.1607,
+      "step": 28000
+    },
+    {
+      "epoch": 1.865791674708872,
+      "grad_norm": 0.5257272124290466,
+      "learning_rate": 0.000271790625,
+      "loss": 3.1472,
+      "step": 29000
+    },
+    {
+      "epoch": 1.9301293186643504,
+      "grad_norm": 0.49043259024620056,
+      "learning_rate": 0.000281165625,
+      "loss": 3.1367,
+      "step": 30000
+    },
+    {
+      "epoch": 1.9944669626198288,
+      "grad_norm": 0.5030378699302673,
+      "learning_rate": 0.000290521875,
+      "loss": 3.1245,
+      "step": 31000
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.3939620256950988,
+      "eval_loss": 3.2037432193756104,
+      "eval_runtime": 113.0293,
+      "eval_samples_per_second": 463.951,
+      "eval_steps_per_second": 7.255,
+      "step": 31086
+    },
+    {
+      "epoch": 2.058804606575307,
+      "grad_norm": 0.5003546476364136,
+      "learning_rate": 0.000299896875,
+      "loss": 3.0828,
+      "step": 32000
+    },
+    {
+      "epoch": 2.1231422505307855,
+      "grad_norm": 0.48286330699920654,
+      "learning_rate": 0.00029893602524564295,
+      "loss": 3.08,
+      "step": 33000
+    },
+    {
+      "epoch": 2.187479894486264,
+      "grad_norm": 0.4852472245693207,
+      "learning_rate": 0.0002978602165961414,
+      "loss": 3.0633,
+      "step": 34000
+    },
+    {
+      "epoch": 2.2518175384417423,
+      "grad_norm": 0.4629572927951813,
+      "learning_rate": 0.00029678548375528934,
+      "loss": 3.063,
+      "step": 35000
+    },
+    {
+      "epoch": 2.3161551823972206,
+      "grad_norm": 0.4571368992328644,
+      "learning_rate": 0.0002957096751057878,
+      "loss": 3.0453,
+      "step": 36000
+    },
+    {
+      "epoch": 2.380492826352699,
+      "grad_norm": 0.44331055879592896,
+      "learning_rate": 0.0002946349422649358,
+      "loss": 3.0408,
+      "step": 37000
+    },
+    {
+      "epoch": 2.4448304703081774,
+      "grad_norm": 0.4230923354625702,
+      "learning_rate": 0.00029355913361543424,
+      "loss": 3.0359,
+      "step": 38000
+    },
+    {
+      "epoch": 2.5091681142636557,
+      "grad_norm": 0.4260108768939972,
+      "learning_rate": 0.0002924833249659327,
+      "loss": 3.0316,
+      "step": 39000
+    },
+    {
+      "epoch": 2.573505758219134,
+      "grad_norm": 0.41887935996055603,
+      "learning_rate": 0.0002914085921250807,
+      "loss": 3.0299,
+      "step": 40000
+    },
+    {
+      "epoch": 2.6378434021746124,
+      "grad_norm": 0.41068920493125916,
+      "learning_rate": 0.00029033278347557914,
+      "loss": 3.0138,
+      "step": 41000
+    },
+    {
+      "epoch": 2.702181046130091,
+      "grad_norm": 0.39430394768714905,
+      "learning_rate": 0.0002892591264433766,
+      "loss": 3.0038,
+      "step": 42000
+    },
+    {
+      "epoch": 2.766518690085569,
+      "grad_norm": 0.4100017547607422,
+      "learning_rate": 0.00028818331779387505,
+      "loss": 3.0088,
+      "step": 43000
+    },
+    {
+      "epoch": 2.8308563340410475,
+      "grad_norm": 0.4101816415786743,
+      "learning_rate": 0.0002871075091443735,
+      "loss": 2.9937,
+      "step": 44000
+    },
+    {
+      "epoch": 2.895193977996526,
+      "grad_norm": 0.38294607400894165,
+      "learning_rate": 0.000286031700494872,
+      "loss": 2.9898,
+      "step": 45000
+    },
+    {
+      "epoch": 2.9595316219520043,
+      "grad_norm": 0.37260037660598755,
+      "learning_rate": 0.00028495589184537043,
+      "loss": 2.9807,
+      "step": 46000
+    },
+    {
+      "epoch": 3.0,
+      "eval_accuracy": 0.40728960081362825,
+      "eval_loss": 3.079435110092163,
+      "eval_runtime": 112.8117,
+      "eval_samples_per_second": 464.845,
+      "eval_steps_per_second": 7.269,
+      "step": 46629
+    },
+    {
+      "epoch": 3.0238692659074826,
+      "grad_norm": 0.40783312916755676,
+      "learning_rate": 0.00028388115900451836,
+      "loss": 2.9534,
+      "step": 47000
+    },
+    {
+      "epoch": 3.088206909862961,
+      "grad_norm": 0.38361623883247375,
+      "learning_rate": 0.0002828053503550168,
+      "loss": 2.9122,
+      "step": 48000
+    },
+    {
+      "epoch": 3.1525445538184393,
+      "grad_norm": 0.3766690790653229,
+      "learning_rate": 0.0002817306175141648,
+      "loss": 2.9119,
+      "step": 49000
+    },
+    {
+      "epoch": 3.2168821977739177,
+      "grad_norm": 0.38536399602890015,
+      "learning_rate": 0.00028065480886466326,
+      "loss": 2.9135,
+      "step": 50000
+    },
+    {
+      "epoch": 3.2812198417293956,
+      "grad_norm": 0.38374361395835876,
+      "learning_rate": 0.0002795800760238112,
+      "loss": 2.9094,
+      "step": 51000
+    },
+    {
+      "epoch": 3.345557485684874,
+      "grad_norm": 0.3872029185295105,
+      "learning_rate": 0.00027850426737430965,
+      "loss": 2.9069,
+      "step": 52000
+    },
+    {
+      "epoch": 3.4098951296403524,
+      "grad_norm": 0.37401601672172546,
+      "learning_rate": 0.0002774284587248081,
+      "loss": 2.9092,
+      "step": 53000
+    },
+    {
+      "epoch": 3.4742327735958307,
+      "grad_norm": 0.34951257705688477,
+      "learning_rate": 0.0002763537258839561,
+      "loss": 2.9124,
+      "step": 54000
+    },
+    {
+      "epoch": 3.538570417551309,
+      "grad_norm": 0.36252182722091675,
+      "learning_rate": 0.000275278993043104,
+      "loss": 2.9086,
+      "step": 55000
+    },
+    {
+      "epoch": 3.6029080615067874,
+      "grad_norm": 0.3610841631889343,
+      "learning_rate": 0.0002742031843936025,
+      "loss": 2.9044,
+      "step": 56000
+    },
+    {
+      "epoch": 3.667245705462266,
+      "grad_norm": 0.356315016746521,
+      "learning_rate": 0.00027312737574410094,
+      "loss": 2.8984,
+      "step": 57000
+    },
+    {
+      "epoch": 3.731583349417744,
+      "grad_norm": 0.3501368761062622,
+      "learning_rate": 0.0002720515670945994,
+      "loss": 2.8991,
+      "step": 58000
+    },
+    {
+      "epoch": 3.7959209933732225,
+      "grad_norm": 0.3654986619949341,
+      "learning_rate": 0.00027097575844509786,
+      "loss": 2.8946,
+      "step": 59000
+    },
+    {
+      "epoch": 3.860258637328701,
+      "grad_norm": 0.34233444929122925,
+      "learning_rate": 0.00026990102560424585,
+      "loss": 2.8981,
+      "step": 60000
+    },
+    {
+      "epoch": 3.9245962812841793,
+      "grad_norm": 0.35118335485458374,
+      "learning_rate": 0.0002688252169547443,
+      "loss": 2.8932,
+      "step": 61000
+    },
+    {
+      "epoch": 3.9889339252396576,
+      "grad_norm": 0.3542274236679077,
+      "learning_rate": 0.00026775048411389223,
+      "loss": 2.8872,
+      "step": 62000
+    },
+    {
+      "epoch": 4.0,
+      "eval_accuracy": 0.4139775803532702,
+      "eval_loss": 3.0204551219940186,
+      "eval_runtime": 113.294,
+      "eval_samples_per_second": 462.866,
+      "eval_steps_per_second": 7.238,
+      "step": 62172
+    },
+    {
+      "epoch": 4.053271569195136,
+      "grad_norm": 0.359250545501709,
+      "learning_rate": 0.0002666746754643907,
+      "loss": 2.8242,
+      "step": 63000
+    },
+    {
+      "epoch": 4.117609213150614,
+      "grad_norm": 0.34917619824409485,
+      "learning_rate": 0.00026559886681488915,
+      "loss": 2.8144,
+      "step": 64000
+    },
+    {
+      "epoch": 4.181946857106093,
+      "grad_norm": 0.351457417011261,
+      "learning_rate": 0.00026452413397403714,
+      "loss": 2.8132,
+      "step": 65000
+    },
+    {
+      "epoch": 4.246284501061571,
+      "grad_norm": 0.35231146216392517,
+      "learning_rate": 0.0002634483253245356,
+      "loss": 2.8203,
+      "step": 66000
+    },
+    {
+      "epoch": 4.310622145017049,
+      "grad_norm": 0.354030579328537,
+      "learning_rate": 0.0002623735924836836,
+      "loss": 2.8273,
+      "step": 67000
+    },
+    {
+      "epoch": 4.374959788972528,
+      "grad_norm": 0.3434860408306122,
+      "learning_rate": 0.00026129778383418204,
+      "loss": 2.8221,
+      "step": 68000
+    },
+    {
+      "epoch": 4.439297432928006,
+      "grad_norm": 0.35598379373550415,
+      "learning_rate": 0.0002602219751846805,
+      "loss": 2.8283,
+      "step": 69000
+    },
+    {
+      "epoch": 4.5036350768834845,
+      "grad_norm": 0.350340873003006,
+      "learning_rate": 0.00025914616653517896,
+      "loss": 2.8242,
+      "step": 70000
+    },
+    {
+      "epoch": 4.567972720838963,
+      "grad_norm": 0.34078752994537354,
+      "learning_rate": 0.0002580714336943269,
+      "loss": 2.8309,
+      "step": 71000
+    },
+    {
+      "epoch": 4.632310364794441,
+      "grad_norm": 0.3571733832359314,
+      "learning_rate": 0.00025699670085347487,
+      "loss": 2.8248,
+      "step": 72000
+    },
+    {
+      "epoch": 4.69664800874992,
+      "grad_norm": 0.35940021276474,
+      "learning_rate": 0.00025592089220397333,
+      "loss": 2.8334,
+      "step": 73000
+    },
+    {
+      "epoch": 4.760985652705398,
+      "grad_norm": 0.3354775607585907,
+      "learning_rate": 0.0002548450835544718,
+      "loss": 2.8263,
+      "step": 74000
+    },
+    {
+      "epoch": 4.825323296660876,
+      "grad_norm": 0.330805242061615,
+      "learning_rate": 0.0002537703507136197,
+      "loss": 2.8296,
+      "step": 75000
+    },
+    {
+      "epoch": 4.889660940616355,
+      "grad_norm": 0.32566189765930176,
+      "learning_rate": 0.0002526945420641182,
+      "loss": 2.8208,
+      "step": 76000
+    },
+    {
+      "epoch": 4.953998584571833,
+      "grad_norm": 0.32299116253852844,
+      "learning_rate": 0.00025161980922326616,
+      "loss": 2.8286,
+      "step": 77000
+    },
+    {
+      "epoch": 5.0,
+      "eval_accuracy": 0.417990981289541,
+      "eval_loss": 2.988518238067627,
+      "eval_runtime": 113.0062,
+      "eval_samples_per_second": 464.045,
+      "eval_steps_per_second": 7.256,
+      "step": 77715
+    },
+    {
+      "epoch": 5.018336228527311,
+      "grad_norm": 0.332711786031723,
+      "learning_rate": 0.00025054400057376457,
+      "loss": 2.797,
+      "step": 78000
+    },
+    {
+      "epoch": 5.08267387248279,
+      "grad_norm": 0.3597155809402466,
+      "learning_rate": 0.000249468191924263,
+      "loss": 2.7461,
+      "step": 79000
+    },
+    {
+      "epoch": 5.147011516438268,
+      "grad_norm": 0.3411096930503845,
+      "learning_rate": 0.000248393459083411,
+      "loss": 2.7493,
+      "step": 80000
+    },
+    {
+      "epoch": 5.2113491603937465,
+      "grad_norm": 0.35248109698295593,
+      "learning_rate": 0.00024731765043390947,
+      "loss": 2.7584,
+      "step": 81000
+    },
+    {
+      "epoch": 5.275686804349225,
+      "grad_norm": 0.3520190417766571,
+      "learning_rate": 0.00024624184178440793,
+      "loss": 2.755,
+      "step": 82000
+    },
+    {
+      "epoch": 5.340024448304703,
+      "grad_norm": 0.34867680072784424,
+      "learning_rate": 0.00024516710894355586,
+      "loss": 2.7649,
+      "step": 83000
+    },
+    {
+      "epoch": 5.404362092260182,
+      "grad_norm": 0.3400154709815979,
+      "learning_rate": 0.00024409130029405434,
+      "loss": 2.7586,
+      "step": 84000
+    },
+    {
+      "epoch": 5.46869973621566,
+      "grad_norm": 0.3640024662017822,
+      "learning_rate": 0.0002430154916445528,
+      "loss": 2.7606,
+      "step": 85000
+    },
+    {
+      "epoch": 5.533037380171138,
+      "grad_norm": 0.3456322252750397,
+      "learning_rate": 0.00024193968299505126,
+      "loss": 2.767,
+      "step": 86000
+    },
+    {
+      "epoch": 5.597375024126617,
+      "grad_norm": 0.3284786343574524,
+      "learning_rate": 0.00024086495015419922,
+      "loss": 2.7687,
+      "step": 87000
+    },
+    {
+      "epoch": 5.661712668082095,
+      "grad_norm": 0.3351786732673645,
+      "learning_rate": 0.00023978914150469768,
+      "loss": 2.7705,
+      "step": 88000
+    },
+    {
+      "epoch": 5.726050312037573,
+      "grad_norm": 0.3189627528190613,
+      "learning_rate": 0.00023871440866384563,
+      "loss": 2.7743,
+      "step": 89000
+    },
+    {
+      "epoch": 5.790387955993052,
+      "grad_norm": 0.3447468876838684,
+      "learning_rate": 0.0002376386000143441,
+      "loss": 2.7712,
+      "step": 90000
+    },
+    {
+      "epoch": 5.85472559994853,
+      "grad_norm": 0.3212040364742279,
+      "learning_rate": 0.00023656386717349205,
+      "loss": 2.7741,
+      "step": 91000
+    },
+    {
+      "epoch": 5.9190632439040085,
+      "grad_norm": 0.3384701609611511,
+      "learning_rate": 0.0002354880585239905,
+      "loss": 2.7747,
+      "step": 92000
+    },
+    {
+      "epoch": 5.983400887859487,
+      "grad_norm": 0.33266380429267883,
+      "learning_rate": 0.00023441224987448894,
+      "loss": 2.779,
+      "step": 93000
+    },
+    {
+      "epoch": 6.0,
+      "eval_accuracy": 0.4206485843765424,
+      "eval_loss": 2.969926595687866,
+      "eval_runtime": 113.3066,
+      "eval_samples_per_second": 462.815,
+      "eval_steps_per_second": 7.237,
+      "step": 93258
+    },
+    {
+      "epoch": 6.047738531814965,
+      "grad_norm": 0.35645657777786255,
+      "learning_rate": 0.0002333364412249874,
+      "loss": 2.7059,
+      "step": 94000
+    },
+    {
+      "epoch": 6.112076175770444,
+      "grad_norm": 0.35733386874198914,
+      "learning_rate": 0.0002322617083841354,
+      "loss": 2.6966,
+      "step": 95000
+    },
+    {
+      "epoch": 6.176413819725922,
+      "grad_norm": 0.3435540199279785,
+      "learning_rate": 0.00023118589973463387,
+      "loss": 2.6986,
+      "step": 96000
+    },
+    {
+      "epoch": 6.2407514636814,
+      "grad_norm": 0.3479596972465515,
+      "learning_rate": 0.0002301100910851323,
+      "loss": 2.7068,
+      "step": 97000
+    },
+    {
+      "epoch": 6.305089107636879,
+      "grad_norm": 0.3150424659252167,
+      "learning_rate": 0.00022903428243563077,
+      "loss": 2.7074,
+      "step": 98000
+    },
+    {
+      "epoch": 6.369426751592357,
+      "grad_norm": 0.34055858850479126,
+      "learning_rate": 0.00022795954959477872,
+      "loss": 2.7065,
+      "step": 99000
+    },
+    {
+      "epoch": 6.433764395547835,
+      "grad_norm": 0.3491341769695282,
+      "learning_rate": 0.0002268848167539267,
+      "loss": 2.7156,
+      "step": 100000
+    },
+    {
+      "epoch": 6.498102039503314,
+      "grad_norm": 0.3347100019454956,
+      "learning_rate": 0.00022580900810442514,
+      "loss": 2.714,
+      "step": 101000
+    },
+    {
+      "epoch": 6.562439683458791,
+      "grad_norm": 0.35210439562797546,
+      "learning_rate": 0.00022473427526357312,
+      "loss": 2.7194,
+      "step": 102000
+    },
+    {
+      "epoch": 6.6267773274142705,
+      "grad_norm": 0.3326897919178009,
+      "learning_rate": 0.00022365846661407155,
+      "loss": 2.727,
+      "step": 103000
+    },
+    {
+      "epoch": 6.691114971369748,
+      "grad_norm": 0.3269229531288147,
+      "learning_rate": 0.00022258265796457,
+      "loss": 2.7203,
+      "step": 104000
+    },
+    {
+      "epoch": 6.755452615325227,
+      "grad_norm": 0.34183254837989807,
+      "learning_rate": 0.00022150684931506847,
+      "loss": 2.7328,
+      "step": 105000
+    },
+    {
+      "epoch": 6.819790259280705,
+      "grad_norm": 0.33449244499206543,
+      "learning_rate": 0.00022043211647421643,
+      "loss": 2.7291,
+      "step": 106000
+    },
+    {
+      "epoch": 6.884127903236184,
+      "grad_norm": 0.33734798431396484,
+      "learning_rate": 0.0002193563078247149,
+      "loss": 2.7277,
+      "step": 107000
+    },
+    {
+      "epoch": 6.948465547191661,
+      "grad_norm": 0.34088870882987976,
+      "learning_rate": 0.00021828157498386284,
+      "loss": 2.7316,
+      "step": 108000
+    },
+    {
+      "epoch": 7.0,
+      "eval_accuracy": 0.42224443247932275,
+      "eval_loss": 2.958820104598999,
+      "eval_runtime": 113.0743,
+      "eval_samples_per_second": 463.766,
+      "eval_steps_per_second": 7.252,
+      "step": 108801
+    },
+    {
+      "epoch": 7.01280319114714,
+      "grad_norm": 0.3517482876777649,
+      "learning_rate": 0.0002172057663343613,
+      "loss": 2.7116,
+      "step": 109000
+    },
+    {
+      "epoch": 7.077140835102618,
+      "grad_norm": 0.3411085903644562,
+      "learning_rate": 0.00021613103349350926,
+      "loss": 2.6469,
+      "step": 110000
+    },
+    {
+      "epoch": 7.1414784790580965,
+      "grad_norm": 0.3486618399620056,
+      "learning_rate": 0.00021505522484400772,
+      "loss": 2.6546,
+      "step": 111000
+    },
+    {
+      "epoch": 7.205816123013575,
+      "grad_norm": 0.35618531703948975,
+      "learning_rate": 0.00021397941619450618,
+      "loss": 2.6603,
+      "step": 112000
+    },
+    {
+      "epoch": 7.270153766969053,
+      "grad_norm": 0.34740447998046875,
+      "learning_rate": 0.00021290468335365413,
+      "loss": 2.6632,
+      "step": 113000
+    },
+    {
+      "epoch": 7.334491410924532,
+      "grad_norm": 0.339108407497406,
+      "learning_rate": 0.0002118288747041526,
+      "loss": 2.6682,
+      "step": 114000
+    },
+    {
+      "epoch": 7.39882905488001,
+      "grad_norm": 0.36686399579048157,
+      "learning_rate": 0.00021075306605465105,
+      "loss": 2.6718,
+      "step": 115000
+    },
+    {
+      "epoch": 7.463166698835488,
+      "grad_norm": 0.3336213529109955,
+      "learning_rate": 0.000209678333213799,
+      "loss": 2.6806,
+      "step": 116000
+    },
+    {
+      "epoch": 7.527504342790967,
+      "grad_norm": 0.34256553649902344,
+      "learning_rate": 0.00020860252456429747,
+      "loss": 2.6772,
+      "step": 117000
+    },
+    {
+      "epoch": 7.591841986746445,
+      "grad_norm": 0.3527204096317291,
+      "learning_rate": 0.00020752671591479593,
+      "loss": 2.6786,
+      "step": 118000
+    },
+    {
+      "epoch": 7.656179630701923,
+      "grad_norm": 0.34285178780555725,
+      "learning_rate": 0.0002064509072652944,
+      "loss": 2.6816,
+      "step": 119000
+    },
+    {
+      "epoch": 7.720517274657402,
+      "grad_norm": 0.3418208658695221,
+      "learning_rate": 0.00020537617442444234,
+      "loss": 2.6893,
+      "step": 120000
+    },
+    {
+      "epoch": 7.78485491861288,
+      "grad_norm": 0.34486138820648193,
+      "learning_rate": 0.0002043003657749408,
+      "loss": 2.6847,
+      "step": 121000
+    },
+    {
+      "epoch": 7.8491925625683585,
+      "grad_norm": 0.348530650138855,
+      "learning_rate": 0.00020322563293408876,
+      "loss": 2.6826,
+      "step": 122000
+    },
+    {
+      "epoch": 7.913530206523837,
+      "grad_norm": 0.33808425068855286,
+      "learning_rate": 0.00020215090009323674,
+      "loss": 2.6905,
+      "step": 123000
+    },
+    {
+      "epoch": 7.977867850479315,
+      "grad_norm": 0.3486366868019104,
+      "learning_rate": 0.0002010750914437352,
+      "loss": 2.6909,
+      "step": 124000
+    },
+    {
+      "epoch": 8.0,
+      "eval_accuracy": 0.4232837528604119,
+      "eval_loss": 2.9554243087768555,
+      "eval_runtime": 111.9904,
+      "eval_samples_per_second": 468.254,
+      "eval_steps_per_second": 7.322,
+      "step": 124344
+    },
+    {
+      "epoch": 8.042205494434794,
+      "grad_norm": 0.35380104184150696,
+      "learning_rate": 0.00020000035860288316,
+      "loss": 2.6303,
+      "step": 125000
+    },
+    {
+      "epoch": 8.106543138390272,
+      "grad_norm": 0.3654320240020752,
+      "learning_rate": 0.00019892454995338162,
+      "loss": 2.6128,
+      "step": 126000
+    },
+    {
+      "epoch": 8.170880782345751,
+      "grad_norm": 0.3670574724674225,
+      "learning_rate": 0.00019784874130388008,
+      "loss": 2.617,
+      "step": 127000
+    },
+    {
+      "epoch": 8.235218426301229,
+      "grad_norm": 0.38059455156326294,
+      "learning_rate": 0.00019677400846302803,
+      "loss": 2.6274,
+      "step": 128000
+    },
+    {
+      "epoch": 8.299556070256708,
+      "grad_norm": 0.3698261082172394,
+      "learning_rate": 0.00019569927562217599,
+      "loss": 2.6309,
+      "step": 129000
+    },
+    {
+      "epoch": 8.363893714212185,
+      "grad_norm": 0.3583601117134094,
+      "learning_rate": 0.00019462346697267445,
+      "loss": 2.6312,
+      "step": 130000
+    },
+    {
+      "epoch": 8.428231358167665,
+      "grad_norm": 0.3602234721183777,
+      "learning_rate": 0.0001935476583231729,
+      "loss": 2.6368,
+      "step": 131000
+    },
+    {
+      "epoch": 8.492569002123142,
+      "grad_norm": 0.3441711664199829,
+      "learning_rate": 0.00019247184967367137,
+      "loss": 2.6372,
+      "step": 132000
+    },
+    {
+      "epoch": 8.556906646078621,
+      "grad_norm": 0.3533187508583069,
+      "learning_rate": 0.00019139604102416983,
+      "loss": 2.6443,
+      "step": 133000
+    },
+    {
+      "epoch": 8.621244290034099,
+      "grad_norm": 0.3579193651676178,
+      "learning_rate": 0.00019032130818331778,
+      "loss": 2.6481,
+      "step": 134000
+    },
+    {
+      "epoch": 8.685581933989578,
+      "grad_norm": 0.3524502217769623,
1024
+ "learning_rate": 0.00018924549953381624,
1025
+ "loss": 2.6509,
1026
+ "step": 135000
1027
+ },
1028
+ {
1029
+ "epoch": 8.749919577945056,
1030
+ "grad_norm": 0.36159747838974,
1031
+ "learning_rate": 0.0001881707666929642,
1032
+ "loss": 2.6456,
1033
+ "step": 136000
1034
+ },
1035
+ {
1036
+ "epoch": 8.814257221900533,
1037
+ "grad_norm": 0.34249147772789,
1038
+ "learning_rate": 0.00018709495804346266,
1039
+ "loss": 2.6538,
1040
+ "step": 137000
1041
+ },
1042
+ {
1043
+ "epoch": 8.878594865856012,
1044
+ "grad_norm": 0.34867429733276367,
1045
+ "learning_rate": 0.0001860202252026106,
1046
+ "loss": 2.6558,
1047
+ "step": 138000
1048
+ },
1049
+ {
1050
+ "epoch": 8.942932509811492,
1051
+ "grad_norm": 0.3351230025291443,
1052
+ "learning_rate": 0.00018494441655310907,
1053
+ "loss": 2.6504,
1054
+ "step": 139000
1055
+ },
1056
+ {
1057
+ "epoch": 9.0,
1058
+ "eval_accuracy": 0.4238085730096768,
1059
+ "eval_loss": 2.9544410705566406,
1060
+ "eval_runtime": 112.4181,
1061
+ "eval_samples_per_second": 466.473,
1062
+ "eval_steps_per_second": 7.294,
1063
+ "step": 139887
1064
+ },
1065
+ {
1066
+ "epoch": 9.007270153766969,
1067
+ "grad_norm": 0.36276528239250183,
1068
+ "learning_rate": 0.00018386968371225703,
1069
+ "loss": 2.6469,
1070
+ "step": 140000
1071
+ },
1072
+ {
1073
+ "epoch": 9.071607797722447,
1074
+ "grad_norm": 0.36368831992149353,
1075
+ "learning_rate": 0.0001827938750627555,
1076
+ "loss": 2.5666,
1077
+ "step": 141000
1078
+ },
1079
+ {
1080
+ "epoch": 9.135945441677926,
1081
+ "grad_norm": 0.36417004466056824,
1082
+ "learning_rate": 0.00018171806641325395,
1083
+ "loss": 2.5832,
1084
+ "step": 142000
1085
+ },
1086
+ {
1087
+ "epoch": 9.200283085633403,
1088
+ "grad_norm": 0.3550620973110199,
1089
+ "learning_rate": 0.0001806422577637524,
1090
+ "loss": 2.5888,
1091
+ "step": 143000
1092
+ },
1093
+ {
1094
+ "epoch": 9.264620729588882,
1095
+ "grad_norm": 0.3513035178184509,
1096
+ "learning_rate": 0.00017956644911425084,
1097
+ "loss": 2.5872,
1098
+ "step": 144000
1099
+ },
1100
+ {
1101
+ "epoch": 9.32895837354436,
1102
+ "grad_norm": 0.3576969802379608,
1103
+ "learning_rate": 0.00017849279208204832,
1104
+ "loss": 2.599,
1105
+ "step": 145000
1106
+ },
1107
+ {
1108
+ "epoch": 9.39329601749984,
1109
+ "grad_norm": 0.3496710956096649,
1110
+ "learning_rate": 0.00017741698343254678,
1111
+ "loss": 2.6042,
1112
+ "step": 146000
1113
+ },
1114
+ {
1115
+ "epoch": 9.457633661455317,
1116
+ "grad_norm": 0.3502206802368164,
1117
+ "learning_rate": 0.00017634225059169476,
1118
+ "loss": 2.6069,
1119
+ "step": 147000
1120
+ },
1121
+ {
1122
+ "epoch": 9.521971305410796,
1123
+ "grad_norm": 0.3516786992549896,
1124
+ "learning_rate": 0.00017526644194219322,
1125
+ "loss": 2.606,
1126
+ "step": 148000
1127
+ },
1128
+ {
1129
+ "epoch": 9.586308949366273,
1130
+ "grad_norm": 0.3671824336051941,
1131
+ "learning_rate": 0.00017419063329269168,
1132
+ "loss": 2.6151,
1133
+ "step": 149000
1134
+ },
1135
+ {
1136
+ "epoch": 9.650646593321753,
1137
+ "grad_norm": 0.36615684628486633,
1138
+ "learning_rate": 0.00017311590045183964,
1139
+ "loss": 2.6174,
1140
+ "step": 150000
1141
+ },
1142
+ {
1143
+ "epoch": 9.71498423727723,
1144
+ "grad_norm": 0.369759202003479,
1145
+ "learning_rate": 0.0001720400918023381,
1146
+ "loss": 2.6162,
1147
+ "step": 151000
1148
+ },
1149
+ {
1150
+ "epoch": 9.77932188123271,
1151
+ "grad_norm": 0.3495037257671356,
1152
+ "learning_rate": 0.00017096428315283656,
1153
+ "loss": 2.6186,
1154
+ "step": 152000
1155
+ },
1156
+ {
1157
+ "epoch": 9.843659525188187,
1158
+ "grad_norm": 0.3635868728160858,
1159
+ "learning_rate": 0.0001698895503119845,
1160
+ "loss": 2.616,
1161
+ "step": 153000
1162
+ },
1163
+ {
1164
+ "epoch": 9.907997169143666,
1165
+ "grad_norm": 0.352250337600708,
1166
+ "learning_rate": 0.00016881374166248297,
1167
+ "loss": 2.626,
1168
+ "step": 154000
1169
+ },
1170
+ {
1171
+ "epoch": 9.972334813099144,
1172
+ "grad_norm": 0.3688776195049286,
1173
+ "learning_rate": 0.00016773900882163093,
1174
+ "loss": 2.6246,
1175
+ "step": 155000
1176
+ },
1177
+ {
1178
+ "epoch": 10.0,
1179
+ "eval_accuracy": 0.424411016885778,
1180
+ "eval_loss": 2.9523308277130127,
1181
+ "eval_runtime": 112.0636,
1182
+ "eval_samples_per_second": 467.948,
1183
+ "eval_steps_per_second": 7.317,
1184
+ "step": 155430
1185
+ },
1186
+ {
1187
+ "epoch": 10.036672457054623,
1188
+ "grad_norm": 0.3961314558982849,
1189
+ "learning_rate": 0.0001666632001721294,
1190
+ "loss": 2.5827,
1191
+ "step": 156000
1192
+ },
1193
+ {
1194
+ "epoch": 10.1010101010101,
1195
+ "grad_norm": 0.3705954849720001,
1196
+ "learning_rate": 0.00016558739152262782,
1197
+ "loss": 2.5413,
1198
+ "step": 157000
1199
+ },
1200
+ {
1201
+ "epoch": 10.16534774496558,
1202
+ "grad_norm": 0.37091416120529175,
1203
+ "learning_rate": 0.00016451158287312628,
1204
+ "loss": 2.5502,
1205
+ "step": 158000
1206
+ },
1207
+ {
1208
+ "epoch": 10.229685388921057,
1209
+ "grad_norm": 0.38428565859794617,
1210
+ "learning_rate": 0.00016343685003227424,
1211
+ "loss": 2.5592,
1212
+ "step": 159000
1213
+ },
1214
+ {
1215
+ "epoch": 10.294023032876536,
1216
+ "grad_norm": 0.3688577115535736,
1217
+ "learning_rate": 0.0001623610413827727,
1218
+ "loss": 2.5673,
1219
+ "step": 160000
1220
+ },
1221
+ {
1222
+ "epoch": 10.358360676832014,
1223
+ "grad_norm": 0.38183775544166565,
1224
+ "learning_rate": 0.00016128630854192065,
1225
+ "loss": 2.5697,
1226
+ "step": 161000
1227
+ },
1228
+ {
1229
+ "epoch": 10.422698320787493,
1230
+ "grad_norm": 0.37677517533302307,
1231
+ "learning_rate": 0.0001602104998924191,
1232
+ "loss": 2.5713,
1233
+ "step": 162000
1234
+ },
1235
+ {
1236
+ "epoch": 10.48703596474297,
1237
+ "grad_norm": 0.3694332540035248,
1238
+ "learning_rate": 0.00015913576705156707,
1239
+ "loss": 2.5751,
1240
+ "step": 163000
1241
+ },
1242
+ {
1243
+ "epoch": 10.55137360869845,
1244
+ "grad_norm": 0.3814958333969116,
1245
+ "learning_rate": 0.00015806103421071502,
1246
+ "loss": 2.5792,
1247
+ "step": 164000
1248
+ },
1249
+ {
1250
+ "epoch": 10.615711252653927,
1251
+ "grad_norm": 0.38280004262924194,
1252
+ "learning_rate": 0.00015698522556121348,
1253
+ "loss": 2.5782,
1254
+ "step": 165000
1255
+ },
1256
+ {
1257
+ "epoch": 10.680048896609406,
1258
+ "grad_norm": 0.3659280240535736,
1259
+ "learning_rate": 0.00015590941691171194,
1260
+ "loss": 2.5862,
1261
+ "step": 166000
1262
+ },
1263
+ {
1264
+ "epoch": 10.744386540564884,
1265
+ "grad_norm": 0.34562841057777405,
1266
+ "learning_rate": 0.0001548336082622104,
1267
+ "loss": 2.5869,
1268
+ "step": 167000
1269
+ },
1270
+ {
1271
+ "epoch": 10.808724184520363,
1272
+ "grad_norm": 0.3570345938205719,
1273
+ "learning_rate": 0.00015375887542135836,
1274
+ "loss": 2.59,
1275
+ "step": 168000
1276
+ },
1277
+ {
1278
+ "epoch": 10.87306182847584,
1279
+ "grad_norm": 0.360215961933136,
1280
+ "learning_rate": 0.00015268306677185682,
1281
+ "loss": 2.5979,
1282
+ "step": 169000
1283
+ },
1284
+ {
1285
+ "epoch": 10.93739947243132,
1286
+ "grad_norm": 0.370670884847641,
1287
+ "learning_rate": 0.00015160725812235528,
1288
+ "loss": 2.5988,
1289
+ "step": 170000
1290
+ },
1291
+ {
1292
+ "epoch": 11.0,
1293
+ "eval_accuracy": 0.4248191770987571,
1294
+ "eval_loss": 2.9567785263061523,
1295
+ "eval_runtime": 112.9375,
1296
+ "eval_samples_per_second": 464.328,
1297
+ "eval_steps_per_second": 7.261,
1298
+ "step": 170973
1299
+ },
1300
+ {
1301
+ "epoch": 11.001737116386797,
1302
+ "grad_norm": 0.38218948245048523,
1303
+ "learning_rate": 0.00015053252528150323,
1304
+ "loss": 2.5933,
1305
+ "step": 171000
1306
+ },
1307
+ {
1308
+ "epoch": 11.066074760342277,
1309
+ "grad_norm": 0.396331787109375,
1310
+ "learning_rate": 0.00014945671663200172,
1311
+ "loss": 2.5023,
1312
+ "step": 172000
1313
+ },
1314
+ {
1315
+ "epoch": 11.130412404297754,
1316
+ "grad_norm": 0.3751789927482605,
1317
+ "learning_rate": 0.00014838090798250018,
1318
+ "loss": 2.5227,
1319
+ "step": 173000
1320
+ },
1321
+ {
1322
+ "epoch": 11.194750048253233,
1323
+ "grad_norm": 0.37265828251838684,
1324
+ "learning_rate": 0.00014730509933299864,
1325
+ "loss": 2.5299,
1326
+ "step": 174000
1327
+ },
1328
+ {
1329
+ "epoch": 11.25908769220871,
1330
+ "grad_norm": 0.37080228328704834,
1331
+ "learning_rate": 0.0001462303664921466,
1332
+ "loss": 2.5333,
1333
+ "step": 175000
1334
+ },
1335
+ {
1336
+ "epoch": 11.32342533616419,
1337
+ "grad_norm": 0.3808966875076294,
1338
+ "learning_rate": 0.00014515563365129455,
1339
+ "loss": 2.5376,
1340
+ "step": 176000
1341
+ },
1342
+ {
1343
+ "epoch": 11.387762980119668,
1344
+ "grad_norm": 0.38901346921920776,
1345
+ "learning_rate": 0.000144079825001793,
1346
+ "loss": 2.5422,
1347
+ "step": 177000
1348
+ },
1349
+ {
1350
+ "epoch": 11.452100624075147,
1351
+ "grad_norm": 0.380100816488266,
1352
+ "learning_rate": 0.00014300401635229144,
1353
+ "loss": 2.5533,
1354
+ "step": 178000
1355
+ },
1356
+ {
1357
+ "epoch": 11.516438268030624,
1358
+ "grad_norm": 0.39306920766830444,
1359
+ "learning_rate": 0.0001419282077027899,
1360
+ "loss": 2.5507,
1361
+ "step": 179000
1362
+ },
1363
+ {
1364
+ "epoch": 11.580775911986104,
1365
+ "grad_norm": 0.3917422890663147,
1366
+ "learning_rate": 0.00014085239905328836,
1367
+ "loss": 2.5579,
1368
+ "step": 180000
1369
+ },
1370
+ {
1371
+ "epoch": 11.645113555941581,
1372
+ "grad_norm": 0.38742849230766296,
1373
+ "learning_rate": 0.00013977766621243632,
1374
+ "loss": 2.5531,
1375
+ "step": 181000
1376
+ },
1377
+ {
1378
+ "epoch": 11.70945119989706,
1379
+ "grad_norm": 0.3767852187156677,
1380
+ "learning_rate": 0.00013870185756293478,
1381
+ "loss": 2.5633,
1382
+ "step": 182000
1383
+ },
1384
+ {
1385
+ "epoch": 11.773788843852538,
1386
+ "grad_norm": 0.39576900005340576,
1387
+ "learning_rate": 0.00013762604891343324,
1388
+ "loss": 2.5648,
1389
+ "step": 183000
1390
+ },
1391
+ {
1392
+ "epoch": 11.838126487808017,
1393
+ "grad_norm": 0.37659791111946106,
1394
+ "learning_rate": 0.00013655131607258122,
1395
+ "loss": 2.5631,
1396
+ "step": 184000
1397
+ },
1398
+ {
1399
+ "epoch": 11.902464131763494,
1400
+ "grad_norm": 0.38377416133880615,
1401
+ "learning_rate": 0.00013547658323172918,
1402
+ "loss": 2.5631,
1403
+ "step": 185000
1404
+ },
1405
+ {
1406
+ "epoch": 11.966801775718974,
1407
+ "grad_norm": 0.37857234477996826,
1408
+ "learning_rate": 0.00013440077458222764,
1409
+ "loss": 2.5639,
1410
+ "step": 186000
1411
+ },
1412
+ {
1413
+ "epoch": 12.0,
1414
+ "eval_accuracy": 0.4247610714766456,
1415
+ "eval_loss": 2.9595353603363037,
1416
+ "eval_runtime": 112.0415,
1417
+ "eval_samples_per_second": 468.041,
1418
+ "eval_steps_per_second": 7.319,
1419
+ "step": 186516
1420
+ },
1421
+ {
1422
+ "epoch": 12.031139419674451,
1423
+ "grad_norm": 0.4024442136287689,
1424
+ "learning_rate": 0.0001333249659327261,
1425
+ "loss": 2.5273,
1426
+ "step": 187000
1427
+ },
1428
+ {
1429
+ "epoch": 12.09547706362993,
1430
+ "grad_norm": 0.4137458801269531,
1431
+ "learning_rate": 0.00013225023309187405,
1432
+ "loss": 2.4933,
1433
+ "step": 188000
1434
+ },
1435
+ {
1436
+ "epoch": 12.159814707585408,
1437
+ "grad_norm": 0.409184992313385,
1438
+ "learning_rate": 0.0001311744244423725,
1439
+ "loss": 2.4967,
1440
+ "step": 189000
1441
+ },
1442
+ {
1443
+ "epoch": 12.224152351540887,
1444
+ "grad_norm": 0.41316309571266174,
1445
+ "learning_rate": 0.00013009861579287097,
1446
+ "loss": 2.5063,
1447
+ "step": 190000
1448
+ },
1449
+ {
1450
+ "epoch": 12.288489995496365,
1451
+ "grad_norm": 0.3909110724925995,
1452
+ "learning_rate": 0.00012902280714336943,
1453
+ "loss": 2.5153,
1454
+ "step": 191000
1455
+ },
1456
+ {
1457
+ "epoch": 12.352827639451844,
1458
+ "grad_norm": 0.39046111702919006,
1459
+ "learning_rate": 0.0001279469984938679,
1460
+ "loss": 2.5115,
1461
+ "step": 192000
1462
+ },
1463
+ {
1464
+ "epoch": 12.417165283407321,
1465
+ "grad_norm": 0.40070855617523193,
1466
+ "learning_rate": 0.00012687226565301585,
1467
+ "loss": 2.5157,
1468
+ "step": 193000
1469
+ },
1470
+ {
1471
+ "epoch": 12.4815029273628,
1472
+ "grad_norm": 0.3970703184604645,
1473
+ "learning_rate": 0.00012579645700351428,
1474
+ "loss": 2.5198,
1475
+ "step": 194000
1476
+ },
1477
+ {
1478
+ "epoch": 12.545840571318278,
1479
+ "grad_norm": 0.40202242136001587,
1480
+ "learning_rate": 0.00012472064835401274,
1481
+ "loss": 2.526,
1482
+ "step": 195000
1483
+ },
1484
+ {
1485
+ "epoch": 12.610178215273757,
1486
+ "grad_norm": 0.3841732144355774,
1487
+ "learning_rate": 0.0001236459155131607,
1488
+ "loss": 2.5295,
1489
+ "step": 196000
1490
+ },
1491
+ {
1492
+ "epoch": 12.674515859229235,
1493
+ "grad_norm": 0.40759024024009705,
1494
+ "learning_rate": 0.00012257010686365916,
1495
+ "loss": 2.5307,
1496
+ "step": 197000
1497
+ },
1498
+ {
1499
+ "epoch": 12.738853503184714,
1500
+ "grad_norm": 0.3963831663131714,
1501
+ "learning_rate": 0.00012149429821415763,
1502
+ "loss": 2.534,
1503
+ "step": 198000
1504
+ },
1505
+ {
1506
+ "epoch": 12.803191147140192,
1507
+ "grad_norm": 0.37255486845970154,
1508
+ "learning_rate": 0.0001204195653733056,
1509
+ "loss": 2.5354,
1510
+ "step": 199000
1511
+ },
1512
+ {
1513
+ "epoch": 12.86752879109567,
1514
+ "grad_norm": 0.397368460893631,
1515
+ "learning_rate": 0.00011934375672380406,
1516
+ "loss": 2.5352,
1517
+ "step": 200000
1518
+ },
1519
+ {
1520
+ "epoch": 12.931866435051148,
1521
+ "grad_norm": 0.379574716091156,
1522
+ "learning_rate": 0.00011826902388295201,
1523
+ "loss": 2.5397,
1524
+ "step": 201000
1525
+ },
1526
+ {
1527
+ "epoch": 12.996204079006628,
1528
+ "grad_norm": 0.3803842067718506,
1529
+ "learning_rate": 0.00011719321523345048,
1530
+ "loss": 2.5361,
1531
+ "step": 202000
1532
+ },
1533
+ {
1534
+ "epoch": 13.0,
1535
+ "eval_accuracy": 0.42475613586395655,
1536
+ "eval_loss": 2.9698119163513184,
1537
+ "eval_runtime": 112.2708,
1538
+ "eval_samples_per_second": 467.085,
1539
+ "eval_steps_per_second": 7.304,
1540
+ "step": 202059
1541
+ },
1542
+ {
1543
+ "epoch": 13.060541722962105,
1544
+ "grad_norm": 0.40816885232925415,
1545
+ "learning_rate": 0.00011611740658394894,
1546
+ "loss": 2.4669,
1547
+ "step": 203000
1548
+ },
1549
+ {
1550
+ "epoch": 13.124879366917584,
1551
+ "grad_norm": 0.42818671464920044,
1552
+ "learning_rate": 0.00011504159793444738,
1553
+ "loss": 2.467,
1554
+ "step": 204000
1555
+ },
1556
+ {
1557
+ "epoch": 13.189217010873062,
1558
+ "grad_norm": 0.40255987644195557,
1559
+ "learning_rate": 0.00011396686509359535,
1560
+ "loss": 2.4753,
1561
+ "step": 205000
1562
+ },
1563
+ {
1564
+ "epoch": 13.253554654828541,
1565
+ "grad_norm": 0.4254453778266907,
1566
+ "learning_rate": 0.0001128921322527433,
1567
+ "loss": 2.4808,
1568
+ "step": 206000
1569
+ },
1570
+ {
1571
+ "epoch": 13.317892298784018,
1572
+ "grad_norm": 0.4060657322406769,
1573
+ "learning_rate": 0.00011181632360324175,
1574
+ "loss": 2.4932,
1575
+ "step": 207000
1576
+ },
1577
+ {
1578
+ "epoch": 13.382229942739498,
1579
+ "grad_norm": 0.4138365387916565,
1580
+ "learning_rate": 0.00011074051495374021,
1581
+ "loss": 2.4922,
1582
+ "step": 208000
1583
+ },
1584
+ {
1585
+ "epoch": 13.446567586694975,
1586
+ "grad_norm": 0.4098254442214966,
1587
+ "learning_rate": 0.00010966578211288817,
1588
+ "loss": 2.4948,
1589
+ "step": 209000
1590
+ },
1591
+ {
1592
+ "epoch": 13.510905230650454,
1593
+ "grad_norm": 0.4242159128189087,
1594
+ "learning_rate": 0.00010858997346338663,
1595
+ "loss": 2.5012,
1596
+ "step": 210000
1597
+ },
1598
+ {
1599
+ "epoch": 13.575242874605932,
1600
+ "grad_norm": 0.42177829146385193,
1601
+ "learning_rate": 0.00010751416481388509,
1602
+ "loss": 2.4998,
1603
+ "step": 211000
1604
+ },
1605
+ {
1606
+ "epoch": 13.63958051856141,
1607
+ "grad_norm": 0.4196189045906067,
1608
+ "learning_rate": 0.00010643943197303304,
1609
+ "loss": 2.5048,
1610
+ "step": 212000
1611
+ },
1612
+ {
1613
+ "epoch": 13.703918162516889,
1614
+ "grad_norm": 0.3965640366077423,
1615
+ "learning_rate": 0.0001053636233235315,
1616
+ "loss": 2.5092,
1617
+ "step": 213000
1618
+ },
1619
+ {
1620
+ "epoch": 13.768255806472368,
1621
+ "grad_norm": 0.39778339862823486,
1622
+ "learning_rate": 0.00010428781467402996,
1623
+ "loss": 2.5121,
1624
+ "step": 214000
1625
+ },
1626
+ {
1627
+ "epoch": 13.832593450427845,
1628
+ "grad_norm": 0.40292391180992126,
1629
+ "learning_rate": 0.00010321308183317793,
1630
+ "loss": 2.5119,
1631
+ "step": 215000
1632
+ },
1633
+ {
1634
+ "epoch": 13.896931094383323,
1635
+ "grad_norm": 0.41673198342323303,
1636
+ "learning_rate": 0.00010213727318367639,
1637
+ "loss": 2.5112,
1638
+ "step": 216000
1639
+ },
1640
+ {
1641
+ "epoch": 13.961268738338802,
1642
+ "grad_norm": 0.40400612354278564,
1643
+ "learning_rate": 0.00010106254034282435,
1644
+ "loss": 2.5098,
1645
+ "step": 217000
1646
+ },
1647
+ {
1648
+ "epoch": 14.0,
1649
+ "eval_accuracy": 0.424743796832234,
1650
+ "eval_loss": 2.9747180938720703,
1651
+ "eval_runtime": 112.0802,
1652
+ "eval_samples_per_second": 467.879,
1653
+ "eval_steps_per_second": 7.316,
1654
+ "step": 217602
1655
+ },
1656
+ {
1657
+ "epoch": 14.02560638229428,
1658
+ "grad_norm": 0.40745100378990173,
1659
+ "learning_rate": 9.998673169332281e-05,
1660
+ "loss": 2.4894,
1661
+ "step": 218000
1662
+ },
1663
+ {
1664
+ "epoch": 14.089944026249759,
1665
+ "grad_norm": 0.42399463057518005,
1666
+ "learning_rate": 9.891092304382127e-05,
1667
+ "loss": 2.449,
1668
+ "step": 219000
1669
+ },
1670
+ {
1671
+ "epoch": 14.154281670205236,
1672
+ "grad_norm": 0.4149724841117859,
1673
+ "learning_rate": 9.783511439431973e-05,
1674
+ "loss": 2.4534,
1675
+ "step": 220000
1676
+ },
1677
+ {
1678
+ "epoch": 14.218619314160716,
1679
+ "grad_norm": 0.40756285190582275,
1680
+ "learning_rate": 9.676145736211718e-05,
1681
+ "loss": 2.4576,
1682
+ "step": 221000
1683
+ },
1684
+ {
1685
+ "epoch": 14.282956958116193,
1686
+ "grad_norm": 0.4224795997142792,
1687
+ "learning_rate": 9.568564871261564e-05,
1688
+ "loss": 2.4584,
1689
+ "step": 222000
1690
+ },
1691
+ {
1692
+ "epoch": 14.347294602071672,
1693
+ "grad_norm": 0.41213053464889526,
1694
+ "learning_rate": 9.461091587176359e-05,
1695
+ "loss": 2.4707,
1696
+ "step": 223000
1697
+ },
1698
+ {
1699
+ "epoch": 14.41163224602715,
1700
+ "grad_norm": 0.4161031246185303,
1701
+ "learning_rate": 9.353510722226205e-05,
1702
+ "loss": 2.4701,
1703
+ "step": 224000
1704
+ },
1705
+ {
1706
+ "epoch": 14.475969889982629,
1707
+ "grad_norm": 0.42417025566101074,
1708
+ "learning_rate": 9.245929857276051e-05,
1709
+ "loss": 2.4706,
1710
+ "step": 225000
1711
+ },
1712
+ {
1713
+ "epoch": 14.540307533938106,
1714
+ "grad_norm": 0.4227360785007477,
1715
+ "learning_rate": 9.138348992325897e-05,
1716
+ "loss": 2.4678,
1717
+ "step": 226000
1718
+ },
1719
+ {
1720
+ "epoch": 14.604645177893586,
1721
+ "grad_norm": 0.3956305682659149,
1722
+ "learning_rate": 9.030768127375742e-05,
1723
+ "loss": 2.4816,
1724
+ "step": 227000
1725
+ },
1726
+ {
1727
+ "epoch": 14.668982821849063,
1728
+ "grad_norm": 0.42013561725616455,
1729
+ "learning_rate": 8.92329484329054e-05,
1730
+ "loss": 2.4791,
1731
+ "step": 228000
1732
+ },
1733
+ {
1734
+ "epoch": 14.733320465804542,
1735
+ "grad_norm": 0.41232335567474365,
1736
+ "learning_rate": 8.815713978340386e-05,
1737
+ "loss": 2.4861,
1738
+ "step": 229000
1739
+ },
1740
+ {
1741
+ "epoch": 14.79765810976002,
1742
+ "grad_norm": 0.398253858089447,
1743
+ "learning_rate": 8.708240694255182e-05,
1744
+ "loss": 2.4857,
1745
+ "step": 230000
1746
+ },
1747
+ {
1748
+ "epoch": 14.8619957537155,
1749
+ "grad_norm": 0.41056615114212036,
1750
+ "learning_rate": 8.600659829305028e-05,
1751
+ "loss": 2.4826,
1752
+ "step": 231000
1753
+ },
1754
+ {
1755
+ "epoch": 14.926333397670977,
1756
+ "grad_norm": 0.4065124988555908,
1757
+ "learning_rate": 8.493186545219823e-05,
1758
+ "loss": 2.4791,
1759
+ "step": 232000
1760
+ },
1761
+ {
1762
+ "epoch": 14.990671041626456,
1763
+ "grad_norm": 0.42194780707359314,
1764
+ "learning_rate": 8.385605680269669e-05,
1765
+ "loss": 2.4899,
1766
+ "step": 233000
1767
+ },
1768
+ {
1769
+ "epoch": 15.0,
1770
+ "eval_accuracy": 0.4246625087868862,
1771
+ "eval_loss": 2.9792003631591797,
1772
+ "eval_runtime": 112.3403,
1773
+ "eval_samples_per_second": 466.796,
1774
+ "eval_steps_per_second": 7.299,
1775
+ "step": 233145
1776
+ },
1777
+ {
1778
+ "epoch": 15.055008685581933,
1779
+ "grad_norm": 0.444181889295578,
1780
+ "learning_rate": 8.278024815319514e-05,
1781
+ "loss": 2.4309,
1782
+ "step": 234000
1783
+ },
1784
+ {
1785
+ "epoch": 15.119346329537413,
1786
+ "grad_norm": 0.4177301526069641,
1787
+ "learning_rate": 8.17044395036936e-05,
1788
+ "loss": 2.4254,
1789
+ "step": 235000
1790
+ },
1791
+ {
1792
+ "epoch": 15.18368397349289,
1793
+ "grad_norm": 0.43864157795906067,
1794
+ "learning_rate": 8.062970666284155e-05,
1795
+ "loss": 2.432,
1796
+ "step": 236000
1797
+ },
1798
+ {
1799
+ "epoch": 15.24802161744837,
1800
+ "grad_norm": 0.43071264028549194,
1801
+ "learning_rate": 7.955497382198951e-05,
1802
+ "loss": 2.4372,
1803
+ "step": 237000
1804
+ },
1805
+ {
1806
+ "epoch": 15.312359261403847,
1807
+ "grad_norm": 0.44551989436149597,
1808
+ "learning_rate": 7.847916517248797e-05,
1809
+ "loss": 2.4441,
1810
+ "step": 238000
1811
+ },
1812
+ {
1813
+ "epoch": 15.376696905359326,
1814
+ "grad_norm": 0.42598387598991394,
1815
+ "learning_rate": 7.740335652298643e-05,
1816
+ "loss": 2.4448,
1817
+ "step": 239000
1818
+ },
1819
+ {
1820
+ "epoch": 15.441034549314804,
1821
+ "grad_norm": 0.4412069618701935,
1822
+ "learning_rate": 7.632754787348489e-05,
1823
+ "loss": 2.4481,
1824
+ "step": 240000
1825
+ },
1826
+ {
1827
+ "epoch": 15.505372193270283,
1828
+ "grad_norm": 0.4257245361804962,
1829
+ "learning_rate": 7.525173922398335e-05,
1830
+ "loss": 2.4496,
1831
+ "step": 241000
1832
+ },
1833
+ {
1834
+ "epoch": 15.56970983722576,
1835
+ "grad_norm": 0.4463740885257721,
1836
+ "learning_rate": 7.417593057448181e-05,
1837
+ "loss": 2.4583,
1838
+ "step": 242000
1839
+ },
1840
+ {
1841
+ "epoch": 15.63404748118124,
1842
+ "grad_norm": 0.40843266248703003,
1843
+ "learning_rate": 7.310119773362977e-05,
1844
+ "loss": 2.4549,
1845
+ "step": 243000
1846
+ },
1847
+ {
1848
+ "epoch": 15.698385125136717,
1849
+ "grad_norm": 0.43823161721229553,
1850
+ "learning_rate": 7.202538908412823e-05,
1851
+ "loss": 2.4565,
1852
+ "step": 244000
1853
+ },
1854
+ {
1855
+ "epoch": 15.762722769092196,
1856
+ "grad_norm": 0.4224304258823395,
1857
+ "learning_rate": 7.09506562432762e-05,
1858
+ "loss": 2.4664,
1859
+ "step": 245000
1860
+ },
1861
+ {
1862
+ "epoch": 15.827060413047674,
1863
+ "grad_norm": 0.42779698967933655,
1864
+ "learning_rate": 6.987484759377464e-05,
1865
+ "loss": 2.4607,
1866
+ "step": 246000
1867
+ },
1868
+ {
1869
+ "epoch": 15.891398057003153,
1870
+ "grad_norm": 0.41904374957084656,
1871
+ "learning_rate": 6.880011475292261e-05,
1872
+ "loss": 2.463,
1873
+ "step": 247000
1874
+ },
1875
+ {
1876
+ "epoch": 15.95573570095863,
1877
+ "grad_norm": 0.4636126458644867,
1878
+ "learning_rate": 6.772430610342107e-05,
1879
+ "loss": 2.4626,
1880
+ "step": 248000
1881
+ },
1882
+ {
1883
+ "epoch": 16.0,
1884
+ "eval_accuracy": 0.4244173733566653,
1885
+ "eval_loss": 2.9882283210754395,
1886
+ "eval_runtime": 112.1119,
1887
+ "eval_samples_per_second": 467.747,
1888
+ "eval_steps_per_second": 7.314,
1889
+ "step": 248688
1890
+ },
1891
+ {
1892
+ "epoch": 16.02007334491411,
1893
+ "grad_norm": 0.44689562916755676,
1894
+ "learning_rate": 6.664849745391953e-05,
1895
+ "loss": 2.4432,
1896
+ "step": 249000
1897
+ },
1898
+ {
1899
+ "epoch": 16.08441098886959,
1900
+ "grad_norm": 0.45889049768447876,
1901
+ "learning_rate": 6.557376461306749e-05,
1902
+ "loss": 2.4048,
1903
+ "step": 250000
1904
+ },
1905
+ {
1906
+ "epoch": 16.148748632825065,
1907
+ "grad_norm": 0.4538269639015198,
1908
+ "learning_rate": 6.449795596356593e-05,
1909
+ "loss": 2.4123,
1910
+ "step": 251000
1911
+ },
1912
+ {
1913
+ "epoch": 16.213086276780544,
1914
+ "grad_norm": 0.44775742292404175,
1915
+ "learning_rate": 6.342214731406439e-05,
1916
+ "loss": 2.4137,
1917
+ "step": 252000
1918
+ },
1919
+ {
1920
+ "epoch": 16.277423920736023,
1921
+ "grad_norm": 0.4506843090057373,
1922
+ "learning_rate": 6.234741447321236e-05,
1923
+ "loss": 2.4152,
1924
+ "step": 253000
1925
+ },
1926
+ {
1927
+ "epoch": 16.341761564691502,
1928
+ "grad_norm": 0.4564642310142517,
1929
+ "learning_rate": 6.127160582371082e-05,
1930
+ "loss": 2.4236,
1931
+ "step": 254000
1932
+ },
1933
+ {
1934
+ "epoch": 16.406099208646978,
1935
+ "grad_norm": 0.4492376744747162,
1936
+ "learning_rate": 6.0195797174209275e-05,
1937
+ "loss": 2.4222,
1938
+ "step": 255000
1939
+ },
1940
+ {
1941
+ "epoch": 16.470436852602457,
1942
+ "grad_norm": 0.44002753496170044,
1943
+ "learning_rate": 5.9119988524707736e-05,
1944
+ "loss": 2.4277,
1945
+ "step": 256000
1946
+ },
1947
+ {
1948
+ "epoch": 16.534774496557937,
1949
+ "grad_norm": 0.437580406665802,
1950
+ "learning_rate": 5.8044179875206196e-05,
1951
+ "loss": 2.4303,
1952
+ "step": 257000
1953
+ },
1954
+ {
1955
+ "epoch": 16.599112140513416,
1956
+ "grad_norm": 0.42502424120903015,
1957
+ "learning_rate": 5.697052284300365e-05,
1958
+ "loss": 2.4359,
1959
+ "step": 258000
1960
+ },
1961
+ {
1962
+ "epoch": 16.66344978446889,
1963
+ "grad_norm": 0.44441190361976624,
1964
+ "learning_rate": 5.5894714193502106e-05,
1965
+ "loss": 2.4306,
1966
+ "step": 259000
1967
+ },
1968
+ {
1969
+ "epoch": 16.72778742842437,
1970
+ "grad_norm": 0.4539526700973511,
1971
+ "learning_rate": 5.4818905544000566e-05,
1972
+ "loss": 2.4342,
1973
+ "step": 260000
1974
+ },
1975
+ {
1976
+ "epoch": 16.79212507237985,
1977
+ "grad_norm": 0.4554595947265625,
1978
+ "learning_rate": 5.374417270314853e-05,
1979
+ "loss": 2.4388,
1980
+ "step": 261000
1981
+ },
1982
+ {
1983
+ "epoch": 16.85646271633533,
1984
+ "grad_norm": 0.4573330283164978,
1985
+ "learning_rate": 5.266836405364699e-05,
1986
+ "loss": 2.441,
1987
+ "step": 262000
1988
+ },
1989
+ {
1990
+ "epoch": 16.920800360290805,
1991
+ "grad_norm": 0.449770450592041,
1992
+ "learning_rate": 5.159255540414545e-05,
1993
+ "loss": 2.4411,
1994
+ "step": 263000
1995
+ },
1996
+ {
1997
+ "epoch": 16.985138004246284,
1998
+ "grad_norm": 0.48139625787734985,
1999
+ "learning_rate": 5.05178225632934e-05,
2000
+ "loss": 2.4399,
2001
+ "step": 264000
2002
+ },
2003
+ {
2004
+ "epoch": 17.0,
2005
+ "eval_accuracy": 0.4242649676193895,
2006
+ "eval_loss": 2.9961202144622803,
2007
+ "eval_runtime": 112.3106,
2008
+ "eval_samples_per_second": 466.919,
2009
+ "eval_steps_per_second": 7.301,
2010
+ "step": 264231
2011
+ },
2012
+ {
2013
+ "epoch": 17.049475648201764,
2014
+ "grad_norm": 0.4543195366859436,
2015
+ "learning_rate": 4.9442013913791863e-05,
2016
+ "loss": 2.4013,
2017
+ "step": 265000
2018
+ },
2019
+ {
2020
+ "epoch": 17.113813292157243,
2021
+ "grad_norm": 0.4699794054031372,
2022
+ "learning_rate": 4.836620526429032e-05,
2023
+ "loss": 2.3928,
2024
+ "step": 266000
2025
+ },
2026
+ {
2027
+ "epoch": 17.17815093611272,
2028
+ "grad_norm": 0.4636929929256439,
2029
+ "learning_rate": 4.7291472423438285e-05,
2030
+ "loss": 2.3989,
2031
+ "step": 267000
2032
+ },
2033
+ {
2034
+ "epoch": 17.242488580068198,
2035
+ "grad_norm": 0.4614698886871338,
2036
+ "learning_rate": 4.6215663773936746e-05,
2037
+ "loss": 2.4004,
2038
+ "step": 268000
2039
+ },
2040
+ {
2041
+ "epoch": 17.306826224023677,
2042
+ "grad_norm": 0.46002906560897827,
2043
+ "learning_rate": 4.513985512443519e-05,
2044
+ "loss": 2.3982,
2045
+ "step": 269000
2046
+ },
2047
+ {
2048
+ "epoch": 17.371163867979156,
2049
+ "grad_norm": 0.42619064450263977,
2050
+ "learning_rate": 4.4065122283583154e-05,
2051
+ "loss": 2.4053,
2052
+ "step": 270000
2053
+ },
2054
+ {
2055
+ "epoch": 17.435501511934632,
2056
+ "grad_norm": 0.45975300669670105,
2057
+ "learning_rate": 4.2989313634081614e-05,
2058
+ "loss": 2.4041,
2059
+ "step": 271000
2060
+ },
2061
+ {
2062
+ "epoch": 17.49983915589011,
2063
+ "grad_norm": 0.4545740485191345,
2064
+ "learning_rate": 4.1913504984580075e-05,
2065
+ "loss": 2.406,
2066
+ "step": 272000
2067
+ },
2068
+ {
2069
+ "epoch": 17.56417679984559,
2070
+ "grad_norm": 0.458011269569397,
2071
+ "learning_rate": 4.083769633507853e-05,
2072
+ "loss": 2.4168,
2073
+ "step": 273000
2074
+ },
2075
+ {
2076
+ "epoch": 17.62851444380107,
2077
+ "grad_norm": 0.4604107439517975,
2078
+ "learning_rate": 3.976296349422649e-05,
2079
+ "loss": 2.411,
2080
+ "step": 274000
2081
+ },
2082
+ {
2083
+ "epoch": 17.692852087756545,
2084
+ "grad_norm": 0.4420773684978485,
2085
+ "learning_rate": 3.8687154844724944e-05,
2086
+ "loss": 2.4144,
2087
+ "step": 275000
2088
+ },
2089
+ {
2090
+ "epoch": 17.757189731712025,
2091
+ "grad_norm": 0.45774900913238525,
2092
+ "learning_rate": 3.7611346195223404e-05,
2093
+ "loss": 2.412,
2094
+ "step": 276000
2095
+ },
2096
+ {
2097
+ "epoch": 17.821527375667504,
2098
+ "grad_norm": 0.4509606659412384,
2099
+ "learning_rate": 3.6536613354371366e-05,
2100
+ "loss": 2.4086,
2101
+ "step": 277000
2102
+ },
2103
+ {
2104
+ "epoch": 17.885865019622983,
2105
+ "grad_norm": 0.4442935883998871,
2106
+ "learning_rate": 3.5460804704869826e-05,
2107
+ "loss": 2.4134,
2108
+ "step": 278000
2109
+ },
2110
+ {
2111
+ "epoch": 17.95020266357846,
2112
+ "grad_norm": 0.42292436957359314,
2113
+ "learning_rate": 3.438607186401778e-05,
2114
+ "loss": 2.4186,
2115
+ "step": 279000
2116
+ },
2117
+ {
2118
+ "epoch": 18.0,
2119
+ "eval_accuracy": 0.42388941236296196,
2120
+ "eval_loss": 3.0051016807556152,
2121
+ "eval_runtime": 112.3705,
2122
+ "eval_samples_per_second": 466.67,
2123
+ "eval_steps_per_second": 7.297,
2124
+ "step": 279774
2125
+ },
2126
+ {
+ "epoch": 18.014540307533938,
+ "grad_norm": 0.48824623227119446,
+ "learning_rate": 3.331026321451624e-05,
+ "loss": 2.4055,
+ "step": 280000
+ },
+ {
+ "epoch": 18.078877951489417,
+ "grad_norm": 0.46934977173805237,
+ "learning_rate": 3.22355303736642e-05,
+ "loss": 2.3736,
+ "step": 281000
+ },
+ {
+ "epoch": 18.143215595444893,
+ "grad_norm": 0.5045217275619507,
+ "learning_rate": 3.115972172416266e-05,
+ "loss": 2.382,
+ "step": 282000
+ },
+ {
+ "epoch": 18.207553239400372,
+ "grad_norm": 0.46461954712867737,
+ "learning_rate": 3.008391307466112e-05,
+ "loss": 2.3806,
+ "step": 283000
+ },
+ {
+ "epoch": 18.27189088335585,
+ "grad_norm": 0.4565331041812897,
+ "learning_rate": 2.9009180233809078e-05,
+ "loss": 2.3813,
+ "step": 284000
+ },
+ {
+ "epoch": 18.33622852731133,
+ "grad_norm": 0.4561784863471985,
+ "learning_rate": 2.793337158430754e-05,
+ "loss": 2.3863,
+ "step": 285000
+ },
+ {
+ "epoch": 18.400566171266806,
+ "grad_norm": 0.4438989758491516,
+ "learning_rate": 2.6858638743455493e-05,
+ "loss": 2.3845,
+ "step": 286000
+ },
+ {
+ "epoch": 18.464903815222286,
+ "grad_norm": 0.461086630821228,
+ "learning_rate": 2.578283009395395e-05,
+ "loss": 2.3833,
+ "step": 287000
+ },
+ {
+ "epoch": 18.529241459177765,
+ "grad_norm": 0.4639764726161957,
+ "learning_rate": 2.470702144445241e-05,
+ "loss": 2.3918,
+ "step": 288000
+ },
+ {
+ "epoch": 18.593579103133244,
+ "grad_norm": 0.4645422697067261,
+ "learning_rate": 2.3631212794950868e-05,
+ "loss": 2.3953,
+ "step": 289000
+ },
+ {
+ "epoch": 18.65791674708872,
+ "grad_norm": 0.47392553091049194,
+ "learning_rate": 2.2555404145449328e-05,
+ "loss": 2.3829,
+ "step": 290000
+ },
+ {
+ "epoch": 18.7222543910442,
+ "grad_norm": 0.4530762732028961,
+ "learning_rate": 2.148174711324679e-05,
+ "loss": 2.3904,
+ "step": 291000
+ },
+ {
+ "epoch": 18.78659203499968,
+ "grad_norm": 0.47473639249801636,
+ "learning_rate": 2.0405938463745248e-05,
+ "loss": 2.3966,
+ "step": 292000
+ },
+ {
+ "epoch": 18.850929678955158,
+ "grad_norm": 0.43500351905822754,
+ "learning_rate": 1.9330129814243705e-05,
+ "loss": 2.396,
+ "step": 293000
+ },
+ {
+ "epoch": 18.915267322910633,
+ "grad_norm": 0.45157596468925476,
+ "learning_rate": 1.8254321164742165e-05,
+ "loss": 2.3959,
+ "step": 294000
+ },
+ {
+ "epoch": 18.979604966866113,
+ "grad_norm": 0.4546051621437073,
+ "learning_rate": 1.7180664132539624e-05,
+ "loss": 2.3869,
+ "step": 295000
+ },
+ {
+ "epoch": 19.0,
+ "eval_accuracy": 0.42373925008599933,
+ "eval_loss": 3.011887311935425,
+ "eval_runtime": 112.2632,
+ "eval_samples_per_second": 467.116,
+ "eval_steps_per_second": 7.304,
+ "step": 295317
+ },
+ {
+ "epoch": 19.043942610821592,
+ "grad_norm": 0.46901893615722656,
+ "learning_rate": 1.610485548303808e-05,
+ "loss": 2.3726,
+ "step": 296000
+ },
+ {
+ "epoch": 19.10828025477707,
+ "grad_norm": 0.43862438201904297,
+ "learning_rate": 1.502904683353654e-05,
+ "loss": 2.3688,
+ "step": 297000
+ },
+ {
+ "epoch": 19.172617898732547,
+ "grad_norm": 0.4580424427986145,
+ "learning_rate": 1.3954313992684501e-05,
+ "loss": 2.3682,
+ "step": 298000
+ },
+ {
+ "epoch": 19.236955542688026,
+ "grad_norm": 0.47557470202445984,
+ "learning_rate": 1.2878505343182957e-05,
+ "loss": 2.3687,
+ "step": 299000
+ },
+ {
+ "epoch": 19.301293186643505,
+ "grad_norm": 0.48615992069244385,
+ "learning_rate": 1.1802696693681415e-05,
+ "loss": 2.3636,
+ "step": 300000
+ },
+ {
+ "epoch": 19.365630830598985,
+ "grad_norm": 0.5019800662994385,
+ "learning_rate": 1.0726888044179874e-05,
+ "loss": 2.3668,
+ "step": 301000
+ },
+ {
+ "epoch": 19.42996847455446,
+ "grad_norm": 0.4481401741504669,
+ "learning_rate": 9.652155203327834e-06,
+ "loss": 2.3721,
+ "step": 302000
+ },
+ {
+ "epoch": 19.49430611850994,
+ "grad_norm": 0.4632056653499603,
+ "learning_rate": 8.577422362475793e-06,
+ "loss": 2.372,
+ "step": 303000
+ },
+ {
+ "epoch": 19.55864376246542,
+ "grad_norm": 0.4590476453304291,
+ "learning_rate": 7.5016137129742514e-06,
+ "loss": 2.3725,
+ "step": 304000
+ },
+ {
+ "epoch": 19.622981406420898,
+ "grad_norm": 0.4774569272994995,
+ "learning_rate": 6.42580506347271e-06,
+ "loss": 2.37,
+ "step": 305000
+ },
+ {
+ "epoch": 19.687319050376374,
+ "grad_norm": 0.47048863768577576,
+ "learning_rate": 5.351072222620669e-06,
+ "loss": 2.371,
+ "step": 306000
+ },
+ {
+ "epoch": 19.751656694331853,
+ "grad_norm": 0.4567144215106964,
+ "learning_rate": 4.275263573119128e-06,
+ "loss": 2.3706,
+ "step": 307000
+ },
+ {
+ "epoch": 19.815994338287332,
+ "grad_norm": 0.4492277503013611,
+ "learning_rate": 3.200530732267087e-06,
+ "loss": 2.3714,
+ "step": 308000
+ },
+ {
+ "epoch": 19.88033198224281,
+ "grad_norm": 0.44562822580337524,
+ "learning_rate": 2.1247220827655454e-06,
+ "loss": 2.3711,
+ "step": 309000
+ },
+ {
+ "epoch": 19.944669626198287,
+ "grad_norm": 0.4758046269416809,
+ "learning_rate": 1.0499892419135049e-06,
+ "loss": 2.3686,
+ "step": 310000
+ },
+ {
+ "epoch": 20.0,
+ "eval_accuracy": 0.4234014597448438,
+ "eval_loss": 3.0190186500549316,
+ "eval_runtime": 112.3566,
+ "eval_samples_per_second": 466.728,
+ "eval_steps_per_second": 7.298,
+ "step": 310860
+ },
+ {
+ "epoch": 20.0,
+ "step": 310860,
+ "total_flos": 1.29957250203648e+18,
+ "train_loss": 2.7038125842232534,
+ "train_runtime": 44110.0674,
+ "train_samples_per_second": 225.51,
+ "train_steps_per_second": 7.047
+ }
+ ],
+ "logging_steps": 1000,
+ "max_steps": 310860,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 20,
+ "save_steps": 5000,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.29957250203648e+18,
+ "train_batch_size": 32,
+ "trial_name": null,
+ "trial_params": null
+ }