End of training

Browse files

Files changed (6) hide show

README.md +14 -2
all_results.json +16 -0
eval_results.json +10 -0
runs/Jul18_08-56-31_phyl-ling-p01.la.utexas.edu/events.out.tfevents.1721393909.phyl-ling-p01.la.utexas.edu.3189178.1 +3 -0
train_results.json +9 -0
trainer_state.json +2819 -0

README.md CHANGED Viewed

@@ -1,11 +1,23 @@
 ---
 tags:
 - generated_from_trainer
 metrics:
 - accuracy
 model-index:
 - name: smolm-autoreg-bpe-counterfactual_babylm_aann_high_variability_numeral-seed_1024-1e-3
-  results: []
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -13,7 +25,7 @@ should probably proofread and complete it, then remove this comment. -->
 # smolm-autoreg-bpe-counterfactual_babylm_aann_high_variability_numeral-seed_1024-1e-3
-This model was trained from scratch on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 3.4236
 - Accuracy: 0.4102

 ---
 tags:
 - generated_from_trainer
+datasets:
+- kanishka/counterfactual_babylm_aann_high_variability_numeral
 metrics:
 - accuracy
 model-index:
 - name: smolm-autoreg-bpe-counterfactual_babylm_aann_high_variability_numeral-seed_1024-1e-3
+  results:
+  - task:
+      name: Causal Language Modeling
+      type: text-generation
+    dataset:
+      name: kanishka/counterfactual_babylm_aann_high_variability_numeral
+      type: kanishka/counterfactual_babylm_aann_high_variability_numeral
+    metrics:
+    - name: Accuracy
+      type: accuracy
+      value: 0.41021489963935376
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 # smolm-autoreg-bpe-counterfactual_babylm_aann_high_variability_numeral-seed_1024-1e-3
+This model was trained from scratch on the kanishka/counterfactual_babylm_aann_high_variability_numeral dataset.
 It achieves the following results on the evaluation set:
 - Loss: 3.4236
 - Accuracy: 0.4102

all_results.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+    "epoch": 20.0,
+    "eval_accuracy": 0.41021489963935376,
+    "eval_loss": 3.423628807067871,
+    "eval_runtime": 154.4177,
+    "eval_samples": 57917,
+    "eval_samples_per_second": 375.067,
+    "eval_steps_per_second": 5.861,
+    "perplexity": 30.68054704672833,
+    "total_flos": 1.5670047538944e+18,
+    "train_loss": 3.0279207580395733,
+    "train_runtime": 82395.3872,
+    "train_samples": 595065,
+    "train_samples_per_second": 144.441,
+    "train_steps_per_second": 4.514
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "epoch": 20.0,
+    "eval_accuracy": 0.41021489963935376,
+    "eval_loss": 3.423628807067871,
+    "eval_runtime": 154.4177,
+    "eval_samples": 57917,
+    "eval_samples_per_second": 375.067,
+    "eval_steps_per_second": 5.861,
+    "perplexity": 30.68054704672833
+}

runs/Jul18_08-56-31_phyl-ling-p01.la.utexas.edu/events.out.tfevents.1721393909.phyl-ling-p01.la.utexas.edu.3189178.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:43f215dc6313bef0b581d68c4ada5a3d107c5caf75f6c5e9046f672945ca797a
+size 417

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 20.0,
+    "total_flos": 1.5670047538944e+18,
+    "train_loss": 3.0279207580395733,
+    "train_runtime": 82395.3872,
+    "train_samples": 595065,
+    "train_samples_per_second": 144.441,
+    "train_steps_per_second": 4.514
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2819 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 20.0,
+  "eval_steps": 500,
+  "global_step": 371920,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.05377500537750054,
+      "grad_norm": 0.8324036598205566,
+      "learning_rate": 3.125e-05,
+      "loss": 6.2314,
+      "step": 1000
+    },
+    {
+      "epoch": 0.10755001075500108,
+      "grad_norm": 0.9016917943954468,
+      "learning_rate": 6.25e-05,
+      "loss": 5.01,
+      "step": 2000
+    },
+    {
+      "epoch": 0.1613250161325016,
+      "grad_norm": 0.8393586874008179,
+      "learning_rate": 9.375e-05,
+      "loss": 4.6792,
+      "step": 3000
+    },
+    {
+      "epoch": 0.21510002151000215,
+      "grad_norm": 0.8001790642738342,
+      "learning_rate": 0.000125,
+      "loss": 4.4675,
+      "step": 4000
+    },
+    {
+      "epoch": 0.2688750268875027,
+      "grad_norm": 0.7500863671302795,
+      "learning_rate": 0.00015625,
+      "loss": 4.3004,
+      "step": 5000
+    },
+    {
+      "epoch": 0.3226500322650032,
+      "grad_norm": 0.6959784626960754,
+      "learning_rate": 0.0001875,
+      "loss": 4.1762,
+      "step": 6000
+    },
+    {
+      "epoch": 0.3764250376425038,
+      "grad_norm": 0.7082997560501099,
+      "learning_rate": 0.00021875,
+      "loss": 4.0795,
+      "step": 7000
+    },
+    {
+      "epoch": 0.4302000430200043,
+      "grad_norm": 0.7400528788566589,
+      "learning_rate": 0.00025,
+      "loss": 3.9794,
+      "step": 8000
+    },
+    {
+      "epoch": 0.4839750483975048,
+      "grad_norm": 0.6886024475097656,
+      "learning_rate": 0.00028121875,
+      "loss": 3.9062,
+      "step": 9000
+    },
+    {
+      "epoch": 0.5377500537750054,
+      "grad_norm": 0.6196364760398865,
+      "learning_rate": 0.0003124375,
+      "loss": 3.8427,
+      "step": 10000
+    },
+    {
+      "epoch": 0.5915250591525059,
+      "grad_norm": 0.5815768241882324,
+      "learning_rate": 0.00034368749999999997,
+      "loss": 3.7992,
+      "step": 11000
+    },
+    {
+      "epoch": 0.6453000645300064,
+      "grad_norm": 0.5629006624221802,
+      "learning_rate": 0.0003749375,
+      "loss": 3.7502,
+      "step": 12000
+    },
+    {
+      "epoch": 0.699075069907507,
+      "grad_norm": 0.5031692981719971,
+      "learning_rate": 0.00040615625,
+      "loss": 3.7233,
+      "step": 13000
+    },
+    {
+      "epoch": 0.7528500752850076,
+      "grad_norm": 0.4921340048313141,
+      "learning_rate": 0.00043737500000000005,
+      "loss": 3.6917,
+      "step": 14000
+    },
+    {
+      "epoch": 0.806625080662508,
+      "grad_norm": 0.45878851413726807,
+      "learning_rate": 0.000468625,
+      "loss": 3.6641,
+      "step": 15000
+    },
+    {
+      "epoch": 0.8604000860400086,
+      "grad_norm": 0.4047335684299469,
+      "learning_rate": 0.000499875,
+      "loss": 3.6404,
+      "step": 16000
+    },
+    {
+      "epoch": 0.9141750914175092,
+      "grad_norm": 0.4339119493961334,
+      "learning_rate": 0.000531125,
+      "loss": 3.6129,
+      "step": 17000
+    },
+    {
+      "epoch": 0.9679500967950097,
+      "grad_norm": 0.3588213324546814,
+      "learning_rate": 0.00056234375,
+      "loss": 3.5932,
+      "step": 18000
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.3588065680197524,
+      "eval_loss": 3.770080804824829,
+      "eval_runtime": 152.9859,
+      "eval_samples_per_second": 378.577,
+      "eval_steps_per_second": 5.916,
+      "step": 18596
+    },
+    {
+      "epoch": 1.0217251021725102,
+      "grad_norm": 0.3476862907409668,
+      "learning_rate": 0.00059359375,
+      "loss": 3.5726,
+      "step": 19000
+    },
+    {
+      "epoch": 1.0755001075500108,
+      "grad_norm": 0.3370579481124878,
+      "learning_rate": 0.0006248437500000001,
+      "loss": 3.5453,
+      "step": 20000
+    },
+    {
+      "epoch": 1.1292751129275114,
+      "grad_norm": 0.3253530263900757,
+      "learning_rate": 0.00065609375,
+      "loss": 3.5364,
+      "step": 21000
+    },
+    {
+      "epoch": 1.1830501183050117,
+      "grad_norm": 0.3063829839229584,
+      "learning_rate": 0.00068728125,
+      "loss": 3.5214,
+      "step": 22000
+    },
+    {
+      "epoch": 1.2368251236825123,
+      "grad_norm": 0.28737059235572815,
+      "learning_rate": 0.00071853125,
+      "loss": 3.5158,
+      "step": 23000
+    },
+    {
+      "epoch": 1.2906001290600129,
+      "grad_norm": 0.29937857389450073,
+      "learning_rate": 0.00074978125,
+      "loss": 3.5014,
+      "step": 24000
+    },
+    {
+      "epoch": 1.3443751344375134,
+      "grad_norm": 0.2835935056209564,
+      "learning_rate": 0.0007810312499999999,
+      "loss": 3.4946,
+      "step": 25000
+    },
+    {
+      "epoch": 1.398150139815014,
+      "grad_norm": 0.2764816880226135,
+      "learning_rate": 0.00081225,
+      "loss": 3.4832,
+      "step": 26000
+    },
+    {
+      "epoch": 1.4519251451925146,
+      "grad_norm": 0.2620868384838104,
+      "learning_rate": 0.0008435000000000001,
+      "loss": 3.4761,
+      "step": 27000
+    },
+    {
+      "epoch": 1.5057001505700152,
+      "grad_norm": 0.2731957733631134,
+      "learning_rate": 0.00087471875,
+      "loss": 3.4653,
+      "step": 28000
+    },
+    {
+      "epoch": 1.5594751559475157,
+      "grad_norm": 0.26957619190216064,
+      "learning_rate": 0.00090596875,
+      "loss": 3.4552,
+      "step": 29000
+    },
+    {
+      "epoch": 1.613250161325016,
+      "grad_norm": 0.24591492116451263,
+      "learning_rate": 0.00093721875,
+      "loss": 3.4474,
+      "step": 30000
+    },
+    {
+      "epoch": 1.6670251667025167,
+      "grad_norm": 0.23927152156829834,
+      "learning_rate": 0.00096846875,
+      "loss": 3.4443,
+      "step": 31000
+    },
+    {
+      "epoch": 1.7208001720800172,
+      "grad_norm": 0.2176426500082016,
+      "learning_rate": 0.0009996875,
+      "loss": 3.4401,
+      "step": 32000
+    },
+    {
+      "epoch": 1.7745751774575176,
+      "grad_norm": 0.20793931186199188,
+      "learning_rate": 0.0009970875500117675,
+      "loss": 3.4261,
+      "step": 33000
+    },
+    {
+      "epoch": 1.8283501828350182,
+      "grad_norm": 0.2189057469367981,
+      "learning_rate": 0.0009941486232054601,
+      "loss": 3.419,
+      "step": 34000
+    },
+    {
+      "epoch": 1.8821251882125187,
+      "grad_norm": 0.2241194099187851,
+      "learning_rate": 0.0009912096963991528,
+      "loss": 3.4088,
+      "step": 35000
+    },
+    {
+      "epoch": 1.9359001935900193,
+      "grad_norm": 0.23365530371665955,
+      "learning_rate": 0.0009882678277241704,
+      "loss": 3.3934,
+      "step": 36000
+    },
+    {
+      "epoch": 1.9896751989675199,
+      "grad_norm": 0.2019016444683075,
+      "learning_rate": 0.000985328900917863,
+      "loss": 3.3833,
+      "step": 37000
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.38185108449506,
+      "eval_loss": 3.5596837997436523,
+      "eval_runtime": 154.0793,
+      "eval_samples_per_second": 375.891,
+      "eval_steps_per_second": 5.874,
+      "step": 37192
+    },
+    {
+      "epoch": 2.0434502043450204,
+      "grad_norm": 0.20308424532413483,
+      "learning_rate": 0.0009823870322428808,
+      "loss": 3.3404,
+      "step": 38000
+    },
+    {
+      "epoch": 2.097225209722521,
+      "grad_norm": 0.21531735360622406,
+      "learning_rate": 0.0009794451635678984,
+      "loss": 3.3266,
+      "step": 39000
+    },
+    {
+      "epoch": 2.1510002151000216,
+      "grad_norm": 0.26003631949424744,
+      "learning_rate": 0.000976503294892916,
+      "loss": 3.3242,
+      "step": 40000
+    },
+    {
+      "epoch": 2.204775220477522,
+      "grad_norm": 0.24143873155117035,
+      "learning_rate": 0.0009735643680866086,
+      "loss": 3.3166,
+      "step": 41000
+    },
+    {
+      "epoch": 2.2585502258550227,
+      "grad_norm": 0.19083160161972046,
+      "learning_rate": 0.0009706224994116263,
+      "loss": 3.3187,
+      "step": 42000
+    },
+    {
+      "epoch": 2.3123252312325233,
+      "grad_norm": 0.22694003582000732,
+      "learning_rate": 0.000967680630736644,
+      "loss": 3.3112,
+      "step": 43000
+    },
+    {
+      "epoch": 2.3661002366100234,
+      "grad_norm": 0.21774055063724518,
+      "learning_rate": 0.0009647417039303365,
+      "loss": 3.3075,
+      "step": 44000
+    },
+    {
+      "epoch": 2.419875241987524,
+      "grad_norm": 0.2047697901725769,
+      "learning_rate": 0.0009617998352553542,
+      "loss": 3.2992,
+      "step": 45000
+    },
+    {
+      "epoch": 2.4736502473650246,
+      "grad_norm": 0.21876117587089539,
+      "learning_rate": 0.0009588579665803719,
+      "loss": 3.2983,
+      "step": 46000
+    },
+    {
+      "epoch": 2.527425252742525,
+      "grad_norm": 0.21647591888904572,
+      "learning_rate": 0.0009559190397740644,
+      "loss": 3.2876,
+      "step": 47000
+    },
+    {
+      "epoch": 2.5812002581200257,
+      "grad_norm": 0.20933736860752106,
+      "learning_rate": 0.0009529771710990821,
+      "loss": 3.2814,
+      "step": 48000
+    },
+    {
+      "epoch": 2.6349752634975263,
+      "grad_norm": 0.1911548376083374,
+      "learning_rate": 0.0009500382442927748,
+      "loss": 3.2797,
+      "step": 49000
+    },
+    {
+      "epoch": 2.688750268875027,
+      "grad_norm": 0.22081832587718964,
+      "learning_rate": 0.0009470963756177925,
+      "loss": 3.2783,
+      "step": 50000
+    },
+    {
+      "epoch": 2.7425252742525275,
+      "grad_norm": 0.21164289116859436,
+      "learning_rate": 0.0009441545069428101,
+      "loss": 3.2752,
+      "step": 51000
+    },
+    {
+      "epoch": 2.796300279630028,
+      "grad_norm": 0.21225039660930634,
+      "learning_rate": 0.0009412126382678278,
+      "loss": 3.2681,
+      "step": 52000
+    },
+    {
+      "epoch": 2.8500752850075286,
+      "grad_norm": 0.1924898326396942,
+      "learning_rate": 0.0009382707695928455,
+      "loss": 3.2629,
+      "step": 53000
+    },
+    {
+      "epoch": 2.903850290385029,
+      "grad_norm": 0.19862565398216248,
+      "learning_rate": 0.000935331842786538,
+      "loss": 3.2634,
+      "step": 54000
+    },
+    {
+      "epoch": 2.9576252957625293,
+      "grad_norm": 0.19020138680934906,
+      "learning_rate": 0.0009323899741115557,
+      "loss": 3.2597,
+      "step": 55000
+    },
+    {
+      "epoch": 3.0,
+      "eval_accuracy": 0.39273080241152825,
+      "eval_loss": 3.4648334980010986,
+      "eval_runtime": 154.5571,
+      "eval_samples_per_second": 374.729,
+      "eval_steps_per_second": 5.855,
+      "step": 55788
+    },
+    {
+      "epoch": 3.0114003011400303,
+      "grad_norm": 0.19856859743595123,
+      "learning_rate": 0.0009294481054365734,
+      "loss": 3.239,
+      "step": 56000
+    },
+    {
+      "epoch": 3.0651753065175305,
+      "grad_norm": 0.22371411323547363,
+      "learning_rate": 0.0009265091786302659,
+      "loss": 3.1886,
+      "step": 57000
+    },
+    {
+      "epoch": 3.118950311895031,
+      "grad_norm": 0.21350081264972687,
+      "learning_rate": 0.0009235673099552836,
+      "loss": 3.1941,
+      "step": 58000
+    },
+    {
+      "epoch": 3.1727253172725316,
+      "grad_norm": 0.219674214720726,
+      "learning_rate": 0.0009206254412803013,
+      "loss": 3.1942,
+      "step": 59000
+    },
+    {
+      "epoch": 3.226500322650032,
+      "grad_norm": 0.19072189927101135,
+      "learning_rate": 0.0009176865144739939,
+      "loss": 3.1973,
+      "step": 60000
+    },
+    {
+      "epoch": 3.2802753280275327,
+      "grad_norm": 0.205557718873024,
+      "learning_rate": 0.0009147475876676865,
+      "loss": 3.1932,
+      "step": 61000
+    },
+    {
+      "epoch": 3.3340503334050333,
+      "grad_norm": 0.2098790556192398,
+      "learning_rate": 0.0009118057189927041,
+      "loss": 3.1935,
+      "step": 62000
+    },
+    {
+      "epoch": 3.387825338782534,
+      "grad_norm": 0.196111798286438,
+      "learning_rate": 0.0009088638503177218,
+      "loss": 3.1954,
+      "step": 63000
+    },
+    {
+      "epoch": 3.4416003441600345,
+      "grad_norm": 0.19440898299217224,
+      "learning_rate": 0.0009059219816427395,
+      "loss": 3.1924,
+      "step": 64000
+    },
+    {
+      "epoch": 3.495375349537535,
+      "grad_norm": 0.21081770956516266,
+      "learning_rate": 0.0009029801129677572,
+      "loss": 3.1952,
+      "step": 65000
+    },
+    {
+      "epoch": 3.5491503549150356,
+      "grad_norm": 0.21867215633392334,
+      "learning_rate": 0.0009000411861614498,
+      "loss": 3.195,
+      "step": 66000
+    },
+    {
+      "epoch": 3.602925360292536,
+      "grad_norm": 0.22000326216220856,
+      "learning_rate": 0.0008970993174864674,
+      "loss": 3.1911,
+      "step": 67000
+    },
+    {
+      "epoch": 3.6567003656700363,
+      "grad_norm": 0.1891467422246933,
+      "learning_rate": 0.0008941574488114851,
+      "loss": 3.1934,
+      "step": 68000
+    },
+    {
+      "epoch": 3.7104753710475373,
+      "grad_norm": 0.18787287175655365,
+      "learning_rate": 0.0008912185220051777,
+      "loss": 3.191,
+      "step": 69000
+    },
+    {
+      "epoch": 3.7642503764250375,
+      "grad_norm": 0.23694172501564026,
+      "learning_rate": 0.0008882766533301954,
+      "loss": 3.1831,
+      "step": 70000
+    },
+    {
+      "epoch": 3.818025381802538,
+      "grad_norm": 0.19812917709350586,
+      "learning_rate": 0.000885334784655213,
+      "loss": 3.1815,
+      "step": 71000
+    },
+    {
+      "epoch": 3.8718003871800386,
+      "grad_norm": 0.2005423903465271,
+      "learning_rate": 0.0008823958578489056,
+      "loss": 3.1801,
+      "step": 72000
+    },
+    {
+      "epoch": 3.925575392557539,
+      "grad_norm": 0.21525584161281586,
+      "learning_rate": 0.0008794539891739233,
+      "loss": 3.1795,
+      "step": 73000
+    },
+    {
+      "epoch": 3.9793503979350398,
+      "grad_norm": 0.19802774488925934,
+      "learning_rate": 0.0008765150623676159,
+      "loss": 3.1741,
+      "step": 74000
+    },
+    {
+      "epoch": 4.0,
+      "eval_accuracy": 0.3976694409529698,
+      "eval_loss": 3.419067859649658,
+      "eval_runtime": 155.1328,
+      "eval_samples_per_second": 373.338,
+      "eval_steps_per_second": 5.834,
+      "step": 74384
+    },
+    {
+      "epoch": 4.033125403312541,
+      "grad_norm": 0.2355957329273224,
+      "learning_rate": 0.0008735731936926335,
+      "loss": 3.1356,
+      "step": 75000
+    },
+    {
+      "epoch": 4.086900408690041,
+      "grad_norm": 0.20892471075057983,
+      "learning_rate": 0.0008706313250176512,
+      "loss": 3.1124,
+      "step": 76000
+    },
+    {
+      "epoch": 4.140675414067541,
+      "grad_norm": 0.24330681562423706,
+      "learning_rate": 0.0008676923982113439,
+      "loss": 3.1228,
+      "step": 77000
+    },
+    {
+      "epoch": 4.194450419445042,
+      "grad_norm": 0.30532753467559814,
+      "learning_rate": 0.0008647505295363614,
+      "loss": 3.1191,
+      "step": 78000
+    },
+    {
+      "epoch": 4.248225424822542,
+      "grad_norm": 0.2023121416568756,
+      "learning_rate": 0.0008618116027300541,
+      "loss": 3.1185,
+      "step": 79000
+    },
+    {
+      "epoch": 4.302000430200043,
+      "grad_norm": 0.20023038983345032,
+      "learning_rate": 0.0008588697340550719,
+      "loss": 3.1248,
+      "step": 80000
+    },
+    {
+      "epoch": 4.355775435577543,
+      "grad_norm": 0.20664258301258087,
+      "learning_rate": 0.0008559278653800895,
+      "loss": 3.1305,
+      "step": 81000
+    },
+    {
+      "epoch": 4.409550440955044,
+      "grad_norm": 0.21807469427585602,
+      "learning_rate": 0.0008529889385737821,
+      "loss": 3.1265,
+      "step": 82000
+    },
+    {
+      "epoch": 4.4633254463325445,
+      "grad_norm": 0.20922619104385376,
+      "learning_rate": 0.0008500470698987998,
+      "loss": 3.1287,
+      "step": 83000
+    },
+    {
+      "epoch": 4.5171004517100455,
+      "grad_norm": 0.22318531572818756,
+      "learning_rate": 0.0008471052012238174,
+      "loss": 3.1265,
+      "step": 84000
+    },
+    {
+      "epoch": 4.570875457087546,
+      "grad_norm": 0.20071184635162354,
+      "learning_rate": 0.000844163332548835,
+      "loss": 3.1244,
+      "step": 85000
+    },
+    {
+      "epoch": 4.624650462465047,
+      "grad_norm": 0.23887498676776886,
+      "learning_rate": 0.0008412244057425277,
+      "loss": 3.1309,
+      "step": 86000
+    },
+    {
+      "epoch": 4.678425467842547,
+      "grad_norm": 0.21280068159103394,
+      "learning_rate": 0.0008382825370675454,
+      "loss": 3.1261,
+      "step": 87000
+    },
+    {
+      "epoch": 4.732200473220047,
+      "grad_norm": 0.20855990052223206,
+      "learning_rate": 0.0008353406683925629,
+      "loss": 3.1227,
+      "step": 88000
+    },
+    {
+      "epoch": 4.785975478597548,
+      "grad_norm": 0.23701632022857666,
+      "learning_rate": 0.0008324017415862556,
+      "loss": 3.1274,
+      "step": 89000
+    },
+    {
+      "epoch": 4.839750483975048,
+      "grad_norm": 0.22062337398529053,
+      "learning_rate": 0.0008294598729112733,
+      "loss": 3.1259,
+      "step": 90000
+    },
+    {
+      "epoch": 4.893525489352549,
+      "grad_norm": 0.21007812023162842,
+      "learning_rate": 0.0008265209461049658,
+      "loss": 3.1241,
+      "step": 91000
+    },
+    {
+      "epoch": 4.947300494730049,
+      "grad_norm": 0.3277081847190857,
+      "learning_rate": 0.0008235790774299835,
+      "loss": 3.1213,
+      "step": 92000
+    },
+    {
+      "epoch": 5.0,
+      "eval_accuracy": 0.4008789177643117,
+      "eval_loss": 3.396653652191162,
+      "eval_runtime": 155.4385,
+      "eval_samples_per_second": 372.604,
+      "eval_steps_per_second": 5.822,
+      "step": 92980
+    },
+    {
+      "epoch": 5.00107550010755,
+      "grad_norm": 0.20310255885124207,
+      "learning_rate": 0.0008206401506236762,
+      "loss": 3.1191,
+      "step": 93000
+    },
+    {
+      "epoch": 5.05485050548505,
+      "grad_norm": 0.20080606639385223,
+      "learning_rate": 0.0008176982819486937,
+      "loss": 3.0521,
+      "step": 94000
+    },
+    {
+      "epoch": 5.108625510862551,
+      "grad_norm": 0.21395424008369446,
+      "learning_rate": 0.0008147593551423864,
+      "loss": 3.0656,
+      "step": 95000
+    },
+    {
+      "epoch": 5.1624005162400515,
+      "grad_norm": 0.22563432157039642,
+      "learning_rate": 0.0008118174864674042,
+      "loss": 3.0641,
+      "step": 96000
+    },
+    {
+      "epoch": 5.2161755216175525,
+      "grad_norm": 0.21297597885131836,
+      "learning_rate": 0.0008088785596610967,
+      "loss": 3.07,
+      "step": 97000
+    },
+    {
+      "epoch": 5.269950526995053,
+      "grad_norm": 0.20302899181842804,
+      "learning_rate": 0.0008059366909861144,
+      "loss": 3.0717,
+      "step": 98000
+    },
+    {
+      "epoch": 5.323725532372554,
+      "grad_norm": 0.2152853012084961,
+      "learning_rate": 0.0008029948223111321,
+      "loss": 3.0759,
+      "step": 99000
+    },
+    {
+      "epoch": 5.377500537750054,
+      "grad_norm": 0.2148328423500061,
+      "learning_rate": 0.0008000529536361497,
+      "loss": 3.0728,
+      "step": 100000
+    },
+    {
+      "epoch": 5.431275543127555,
+      "grad_norm": 0.20232610404491425,
+      "learning_rate": 0.0007971140268298423,
+      "loss": 3.0798,
+      "step": 101000
+    },
+    {
+      "epoch": 5.485050548505055,
+      "grad_norm": 0.22732730209827423,
+      "learning_rate": 0.000794175100023535,
+      "loss": 3.0756,
+      "step": 102000
+    },
+    {
+      "epoch": 5.538825553882555,
+      "grad_norm": 0.2203952670097351,
+      "learning_rate": 0.0007912332313485526,
+      "loss": 3.0776,
+      "step": 103000
+    },
+    {
+      "epoch": 5.592600559260056,
+      "grad_norm": 0.21848390996456146,
+      "learning_rate": 0.0007882943045422453,
+      "loss": 3.0763,
+      "step": 104000
+    },
+    {
+      "epoch": 5.646375564637556,
+      "grad_norm": 0.22204072773456573,
+      "learning_rate": 0.0007853524358672629,
+      "loss": 3.0797,
+      "step": 105000
+    },
+    {
+      "epoch": 5.700150570015057,
+      "grad_norm": 0.20933043956756592,
+      "learning_rate": 0.0007824135090609555,
+      "loss": 3.0763,
+      "step": 106000
+    },
+    {
+      "epoch": 5.753925575392557,
+      "grad_norm": 0.19925065338611603,
+      "learning_rate": 0.0007794716403859732,
+      "loss": 3.0802,
+      "step": 107000
+    },
+    {
+      "epoch": 5.807700580770058,
+      "grad_norm": 0.20748205482959747,
+      "learning_rate": 0.0007765297717109908,
+      "loss": 3.081,
+      "step": 108000
+    },
+    {
+      "epoch": 5.8614755861475585,
+      "grad_norm": 0.2089342474937439,
+      "learning_rate": 0.0007735908449046834,
+      "loss": 3.0787,
+      "step": 109000
+    },
+    {
+      "epoch": 5.9152505915250595,
+      "grad_norm": 0.20147345960140228,
+      "learning_rate": 0.0007706489762297011,
+      "loss": 3.0829,
+      "step": 110000
+    },
+    {
+      "epoch": 5.96902559690256,
+      "grad_norm": 0.2211214154958725,
+      "learning_rate": 0.0007677071075547188,
+      "loss": 3.0783,
+      "step": 111000
+    },
+    {
+      "epoch": 6.0,
+      "eval_accuracy": 0.405020628943781,
+      "eval_loss": 3.3772811889648438,
+      "eval_runtime": 156.2536,
+      "eval_samples_per_second": 370.66,
+      "eval_steps_per_second": 5.792,
+      "step": 111576
+    },
+    {
+      "epoch": 6.022800602280061,
+      "grad_norm": 0.21148885786533356,
+      "learning_rate": 0.0007647681807484113,
+      "loss": 3.0451,
+      "step": 112000
+    },
+    {
+      "epoch": 6.076575607657561,
+      "grad_norm": 0.2195775806903839,
+      "learning_rate": 0.000761826312073429,
+      "loss": 3.0147,
+      "step": 113000
+    },
+    {
+      "epoch": 6.130350613035061,
+      "grad_norm": 0.20522399246692657,
+      "learning_rate": 0.0007588873852671217,
+      "loss": 3.0194,
+      "step": 114000
+    },
+    {
+      "epoch": 6.184125618412562,
+      "grad_norm": 0.20723003149032593,
+      "learning_rate": 0.0007559455165921393,
+      "loss": 3.026,
+      "step": 115000
+    },
+    {
+      "epoch": 6.237900623790062,
+      "grad_norm": 0.23514005541801453,
+      "learning_rate": 0.000753003647917157,
+      "loss": 3.0231,
+      "step": 116000
+    },
+    {
+      "epoch": 6.291675629167563,
+      "grad_norm": 0.20580914616584778,
+      "learning_rate": 0.0007500647211108497,
+      "loss": 3.0321,
+      "step": 117000
+    },
+    {
+      "epoch": 6.345450634545063,
+      "grad_norm": 0.2240120768547058,
+      "learning_rate": 0.0007471228524358674,
+      "loss": 3.0332,
+      "step": 118000
+    },
+    {
+      "epoch": 6.399225639922564,
+      "grad_norm": 0.23184897005558014,
+      "learning_rate": 0.0007441839256295599,
+      "loss": 3.0369,
+      "step": 119000
+    },
+    {
+      "epoch": 6.453000645300064,
+      "grad_norm": 0.22646069526672363,
+      "learning_rate": 0.0007412449988232526,
+      "loss": 3.0357,
+      "step": 120000
+    },
+    {
+      "epoch": 6.506775650677565,
+      "grad_norm": 0.21927151083946228,
+      "learning_rate": 0.0007383031301482702,
+      "loss": 3.0398,
+      "step": 121000
+    },
+    {
+      "epoch": 6.5605506560550655,
+      "grad_norm": 0.24726586043834686,
+      "learning_rate": 0.0007353612614732878,
+      "loss": 3.0373,
+      "step": 122000
+    },
+    {
+      "epoch": 6.6143256614325665,
+      "grad_norm": 0.21686062216758728,
+      "learning_rate": 0.0007324193927983055,
+      "loss": 3.0399,
+      "step": 123000
+    },
+    {
+      "epoch": 6.668100666810067,
+      "grad_norm": 0.21142247319221497,
+      "learning_rate": 0.0007294804659919982,
+      "loss": 3.0446,
+      "step": 124000
+    },
+    {
+      "epoch": 6.721875672187567,
+      "grad_norm": 0.21460475027561188,
+      "learning_rate": 0.0007265385973170157,
+      "loss": 3.0403,
+      "step": 125000
+    },
+    {
+      "epoch": 6.775650677565068,
+      "grad_norm": 0.22398121654987335,
+      "learning_rate": 0.0007235967286420334,
+      "loss": 3.0426,
+      "step": 126000
+    },
+    {
+      "epoch": 6.829425682942568,
+      "grad_norm": 0.23123160004615784,
+      "learning_rate": 0.0007206548599670511,
+      "loss": 3.0443,
+      "step": 127000
+    },
+    {
+      "epoch": 6.883200688320069,
+      "grad_norm": 0.21254226565361023,
+      "learning_rate": 0.0007177159331607437,
+      "loss": 3.0424,
+      "step": 128000
+    },
+    {
+      "epoch": 6.936975693697569,
+      "grad_norm": 0.21302445232868195,
+      "learning_rate": 0.0007147740644857613,
+      "loss": 3.0472,
+      "step": 129000
+    },
+    {
+      "epoch": 6.99075069907507,
+      "grad_norm": 0.2217877358198166,
+      "learning_rate": 0.0007118321958107791,
+      "loss": 3.0456,
+      "step": 130000
+    },
+    {
+      "epoch": 7.0,
+      "eval_accuracy": 0.4055193299898036,
+      "eval_loss": 3.3825550079345703,
+      "eval_runtime": 155.2208,
+      "eval_samples_per_second": 373.126,
+      "eval_steps_per_second": 5.83,
+      "step": 130172
+    },
+    {
+      "epoch": 7.04452570445257,
+      "grad_norm": 0.23532521724700928,
+      "learning_rate": 0.0007088903271357967,
+      "loss": 2.9839,
+      "step": 131000
+    },
+    {
+      "epoch": 7.098300709830071,
+      "grad_norm": 0.22214515507221222,
+      "learning_rate": 0.0007059514003294893,
+      "loss": 2.9765,
+      "step": 132000
+    },
+    {
+      "epoch": 7.152075715207571,
+      "grad_norm": 0.2383100390434265,
+      "learning_rate": 0.0007030154153918568,
+      "loss": 2.9843,
+      "step": 133000
+    },
+    {
+      "epoch": 7.205850720585072,
+      "grad_norm": 0.22472046315670013,
+      "learning_rate": 0.0007000735467168746,
+      "loss": 2.9925,
+      "step": 134000
+    },
+    {
+      "epoch": 7.2596257259625725,
+      "grad_norm": 0.26296138763427734,
+      "learning_rate": 0.0006971316780418923,
+      "loss": 2.997,
+      "step": 135000
+    },
+    {
+      "epoch": 7.3134007313400735,
+      "grad_norm": 0.2494724839925766,
+      "learning_rate": 0.0006941898093669099,
+      "loss": 2.997,
+      "step": 136000
+    },
+    {
+      "epoch": 7.367175736717574,
+      "grad_norm": 0.22137367725372314,
+      "learning_rate": 0.0006912508825606025,
+      "loss": 2.9973,
+      "step": 137000
+    },
+    {
+      "epoch": 7.420950742095075,
+      "grad_norm": 0.22704289853572845,
+      "learning_rate": 0.0006883090138856202,
+      "loss": 3.0066,
+      "step": 138000
+    },
+    {
+      "epoch": 7.474725747472575,
+      "grad_norm": 0.2145918905735016,
+      "learning_rate": 0.0006853700870793128,
+      "loss": 3.0054,
+      "step": 139000
+    },
+    {
+      "epoch": 7.528500752850075,
+      "grad_norm": 0.21607990562915802,
+      "learning_rate": 0.0006824282184043304,
+      "loss": 3.0018,
+      "step": 140000
+    },
+    {
+      "epoch": 7.582275758227576,
+      "grad_norm": 0.2057826817035675,
+      "learning_rate": 0.0006794863497293481,
+      "loss": 3.0101,
+      "step": 141000
+    },
+    {
+      "epoch": 7.636050763605076,
+      "grad_norm": 0.23032937943935394,
+      "learning_rate": 0.0006765474229230408,
+      "loss": 3.0099,
+      "step": 142000
+    },
+    {
+      "epoch": 7.689825768982577,
+      "grad_norm": 0.22495923936367035,
+      "learning_rate": 0.0006736055542480583,
+      "loss": 3.008,
+      "step": 143000
+    },
+    {
+      "epoch": 7.743600774360077,
+      "grad_norm": 0.2345353364944458,
+      "learning_rate": 0.000670666627441751,
+      "loss": 3.0099,
+      "step": 144000
+    },
+    {
+      "epoch": 7.797375779737578,
+      "grad_norm": 0.23005186021327972,
+      "learning_rate": 0.0006677277006354437,
+      "loss": 3.0125,
+      "step": 145000
+    },
+    {
+      "epoch": 7.851150785115078,
+      "grad_norm": 0.29431313276290894,
+      "learning_rate": 0.0006647858319604612,
+      "loss": 3.0107,
+      "step": 146000
+    },
+    {
+      "epoch": 7.904925790492579,
+      "grad_norm": 0.2245541363954544,
+      "learning_rate": 0.0006618439632854789,
+      "loss": 3.0133,
+      "step": 147000
+    },
+    {
+      "epoch": 7.9587007958700795,
+      "grad_norm": 0.22786079347133636,
+      "learning_rate": 0.0006589020946104966,
+      "loss": 3.0126,
+      "step": 148000
+    },
+    {
+      "epoch": 8.0,
+      "eval_accuracy": 0.40771172002548395,
+      "eval_loss": 3.3547439575195312,
+      "eval_runtime": 154.7201,
+      "eval_samples_per_second": 374.334,
+      "eval_steps_per_second": 5.849,
+      "step": 148768
+    },
+    {
+      "epoch": 8.01247580124758,
+      "grad_norm": 0.2346237152814865,
+      "learning_rate": 0.0006559602259355142,
+      "loss": 2.9965,
+      "step": 149000
+    },
+    {
+      "epoch": 8.066250806625082,
+      "grad_norm": 0.23346728086471558,
+      "learning_rate": 0.0006530212991292069,
+      "loss": 2.9479,
+      "step": 150000
+    },
+    {
+      "epoch": 8.12002581200258,
+      "grad_norm": 0.24378512799739838,
+      "learning_rate": 0.0006500794304542246,
+      "loss": 2.9513,
+      "step": 151000
+    },
+    {
+      "epoch": 8.173800817380082,
+      "grad_norm": 0.24439002573490143,
+      "learning_rate": 0.0006471405036479172,
+      "loss": 2.9601,
+      "step": 152000
+    },
+    {
+      "epoch": 8.227575822757583,
+      "grad_norm": 0.27081623673439026,
+      "learning_rate": 0.0006441986349729348,
+      "loss": 2.9631,
+      "step": 153000
+    },
+    {
+      "epoch": 8.281350828135082,
+      "grad_norm": 0.2521245777606964,
+      "learning_rate": 0.0006412626500353025,
+      "loss": 2.966,
+      "step": 154000
+    },
+    {
+      "epoch": 8.335125833512583,
+      "grad_norm": 0.21975190937519073,
+      "learning_rate": 0.0006383207813603201,
+      "loss": 2.9678,
+      "step": 155000
+    },
+    {
+      "epoch": 8.388900838890084,
+      "grad_norm": 0.2267887145280838,
+      "learning_rate": 0.0006353789126853378,
+      "loss": 2.9696,
+      "step": 156000
+    },
+    {
+      "epoch": 8.442675844267585,
+      "grad_norm": 0.218279168009758,
+      "learning_rate": 0.0006324370440103554,
+      "loss": 2.9713,
+      "step": 157000
+    },
+    {
+      "epoch": 8.496450849645084,
+      "grad_norm": 0.23300865292549133,
+      "learning_rate": 0.0006294951753353731,
+      "loss": 2.9772,
+      "step": 158000
+    },
+    {
+      "epoch": 8.550225855022585,
+      "grad_norm": 0.21749693155288696,
+      "learning_rate": 0.0006265562485290657,
+      "loss": 2.9773,
+      "step": 159000
+    },
+    {
+      "epoch": 8.604000860400086,
+      "grad_norm": 0.26928380131721497,
+      "learning_rate": 0.0006236143798540833,
+      "loss": 2.978,
+      "step": 160000
+    },
+    {
+      "epoch": 8.657775865777587,
+      "grad_norm": 0.22122180461883545,
+      "learning_rate": 0.000620672511179101,
+      "loss": 2.9794,
+      "step": 161000
+    },
+    {
+      "epoch": 8.711550871155087,
+      "grad_norm": 0.22700442373752594,
+      "learning_rate": 0.0006177306425041186,
+      "loss": 2.9824,
+      "step": 162000
+    },
+    {
+      "epoch": 8.765325876532588,
+      "grad_norm": 0.2541004419326782,
+      "learning_rate": 0.0006147917156978112,
+      "loss": 2.9841,
+      "step": 163000
+    },
+    {
+      "epoch": 8.819100881910089,
+      "grad_norm": 0.2551893889904022,
+      "learning_rate": 0.0006118498470228289,
+      "loss": 2.9837,
+      "step": 164000
+    },
+    {
+      "epoch": 8.872875887287588,
+      "grad_norm": 0.25604966282844543,
+      "learning_rate": 0.0006089079783478466,
+      "loss": 2.984,
+      "step": 165000
+    },
+    {
+      "epoch": 8.926650892665089,
+      "grad_norm": 0.24571265280246735,
+      "learning_rate": 0.0006059719934102142,
+      "loss": 2.9849,
+      "step": 166000
+    },
+    {
+      "epoch": 8.98042589804259,
+      "grad_norm": 0.24128392338752747,
+      "learning_rate": 0.0006030301247352318,
+      "loss": 2.9843,
+      "step": 167000
+    },
+    {
+      "epoch": 9.0,
+      "eval_accuracy": 0.4083424360998555,
+      "eval_loss": 3.3613698482513428,
+      "eval_runtime": 155.0364,
+      "eval_samples_per_second": 373.57,
+      "eval_steps_per_second": 5.837,
+      "step": 167364
+    },
+    {
+      "epoch": 9.034200903420091,
+      "grad_norm": 0.24964158236980438,
+      "learning_rate": 0.0006000882560602495,
+      "loss": 2.9432,
+      "step": 168000
+    },
+    {
+      "epoch": 9.08797590879759,
+      "grad_norm": 0.2405262142419815,
+      "learning_rate": 0.0005971463873852672,
+      "loss": 2.922,
+      "step": 169000
+    },
+    {
+      "epoch": 9.141750914175091,
+      "grad_norm": 0.22288870811462402,
+      "learning_rate": 0.0005942045187102848,
+      "loss": 2.9269,
+      "step": 170000
+    },
+    {
+      "epoch": 9.195525919552592,
+      "grad_norm": 0.3041359484195709,
+      "learning_rate": 0.0005912626500353024,
+      "loss": 2.9342,
+      "step": 171000
+    },
+    {
+      "epoch": 9.249300924930093,
+      "grad_norm": 0.22550632059574127,
+      "learning_rate": 0.0005883237232289951,
+      "loss": 2.9392,
+      "step": 172000
+    },
+    {
+      "epoch": 9.303075930307593,
+      "grad_norm": 0.23584921658039093,
+      "learning_rate": 0.0005853818545540127,
+      "loss": 2.9422,
+      "step": 173000
+    },
+    {
+      "epoch": 9.356850935685094,
+      "grad_norm": 0.2634640634059906,
+      "learning_rate": 0.0005824458696163803,
+      "loss": 2.945,
+      "step": 174000
+    },
+    {
+      "epoch": 9.410625941062595,
+      "grad_norm": 0.2354883849620819,
+      "learning_rate": 0.000579504000941398,
+      "loss": 2.9485,
+      "step": 175000
+    },
+    {
+      "epoch": 9.464400946440094,
+      "grad_norm": 0.26491352915763855,
+      "learning_rate": 0.0005765621322664157,
+      "loss": 2.9475,
+      "step": 176000
+    },
+    {
+      "epoch": 9.518175951817595,
+      "grad_norm": 0.2462054342031479,
+      "learning_rate": 0.0005736202635914332,
+      "loss": 2.9482,
+      "step": 177000
+    },
+    {
+      "epoch": 9.571950957195096,
+      "grad_norm": 0.2643495202064514,
+      "learning_rate": 0.0005706783949164509,
+      "loss": 2.9502,
+      "step": 178000
+    },
+    {
+      "epoch": 9.625725962572597,
+      "grad_norm": 0.24029669165611267,
+      "learning_rate": 0.0005677365262414686,
+      "loss": 2.9518,
+      "step": 179000
+    },
+    {
+      "epoch": 9.679500967950096,
+      "grad_norm": 0.2550260126590729,
+      "learning_rate": 0.0005648005413038361,
+      "loss": 2.9571,
+      "step": 180000
+    },
+    {
+      "epoch": 9.733275973327597,
+      "grad_norm": 0.23675589263439178,
+      "learning_rate": 0.0005618586726288538,
+      "loss": 2.9543,
+      "step": 181000
+    },
+    {
+      "epoch": 9.787050978705098,
+      "grad_norm": 0.2279191017150879,
+      "learning_rate": 0.0005589197458225465,
+      "loss": 2.9593,
+      "step": 182000
+    },
+    {
+      "epoch": 9.8408259840826,
+      "grad_norm": 0.27392587065696716,
+      "learning_rate": 0.0005559778771475641,
+      "loss": 2.9561,
+      "step": 183000
+    },
+    {
+      "epoch": 9.894600989460098,
+      "grad_norm": 0.2388741672039032,
+      "learning_rate": 0.0005530360084725818,
+      "loss": 2.9565,
+      "step": 184000
+    },
+    {
+      "epoch": 9.9483759948376,
+      "grad_norm": 0.2503463327884674,
+      "learning_rate": 0.0005500970816662745,
+      "loss": 2.9592,
+      "step": 185000
+    },
+    {
+      "epoch": 10.0,
+      "eval_accuracy": 0.40854156716551776,
+      "eval_loss": 3.377901077270508,
+      "eval_runtime": 154.6842,
+      "eval_samples_per_second": 374.421,
+      "eval_steps_per_second": 5.851,
+      "step": 185960
+    },
+    {
+      "epoch": 10.0021510002151,
+      "grad_norm": 0.2553974688053131,
+      "learning_rate": 0.0005471552129912922,
+      "loss": 2.9593,
+      "step": 186000
+    },
+    {
+      "epoch": 10.055926005592601,
+      "grad_norm": 0.25415316224098206,
+      "learning_rate": 0.0005442133443163097,
+      "loss": 2.8931,
+      "step": 187000
+    },
+    {
+      "epoch": 10.1097010109701,
+      "grad_norm": 0.2477007806301117,
+      "learning_rate": 0.0005412744175100024,
+      "loss": 2.8999,
+      "step": 188000
+    },
+    {
+      "epoch": 10.163476016347602,
+      "grad_norm": 0.23852670192718506,
+      "learning_rate": 0.0005383325488350201,
+      "loss": 2.9028,
+      "step": 189000
+    },
+    {
+      "epoch": 10.217251021725103,
+      "grad_norm": 0.2484176605939865,
+      "learning_rate": 0.0005353906801600376,
+      "loss": 2.9117,
+      "step": 190000
+    },
+    {
+      "epoch": 10.271026027102602,
+      "grad_norm": 0.27494022250175476,
+      "learning_rate": 0.0005324517533537303,
+      "loss": 2.9133,
+      "step": 191000
+    },
+    {
+      "epoch": 10.324801032480103,
+      "grad_norm": 0.2577020823955536,
+      "learning_rate": 0.000529509884678748,
+      "loss": 2.9214,
+      "step": 192000
+    },
+    {
+      "epoch": 10.378576037857604,
+      "grad_norm": 0.2292626053094864,
+      "learning_rate": 0.0005265680160037656,
+      "loss": 2.9208,
+      "step": 193000
+    },
+    {
+      "epoch": 10.432351043235105,
+      "grad_norm": 0.267873615026474,
+      "learning_rate": 0.0005236290891974582,
+      "loss": 2.9213,
+      "step": 194000
+    },
+    {
+      "epoch": 10.486126048612604,
+      "grad_norm": 0.24749253690242767,
+      "learning_rate": 0.0005206872205224759,
+      "loss": 2.9239,
+      "step": 195000
+    },
+    {
+      "epoch": 10.539901053990105,
+      "grad_norm": 0.26779085397720337,
+      "learning_rate": 0.0005177453518474935,
+      "loss": 2.929,
+      "step": 196000
+    },
+    {
+      "epoch": 10.593676059367606,
+      "grad_norm": 0.2552465796470642,
+      "learning_rate": 0.0005148064250411861,
+      "loss": 2.9309,
+      "step": 197000
+    },
+    {
+      "epoch": 10.647451064745107,
+      "grad_norm": 0.2551726996898651,
+      "learning_rate": 0.0005118645563662038,
+      "loss": 2.9287,
+      "step": 198000
+    },
+    {
+      "epoch": 10.701226070122607,
+      "grad_norm": 0.24481531977653503,
+      "learning_rate": 0.0005089256295598964,
+      "loss": 2.9326,
+      "step": 199000
+    },
+    {
+      "epoch": 10.755001075500108,
+      "grad_norm": 0.2609283924102783,
+      "learning_rate": 0.0005059837608849142,
+      "loss": 2.9332,
+      "step": 200000
+    },
+    {
+      "epoch": 10.808776080877609,
+      "grad_norm": 0.22893798351287842,
+      "learning_rate": 0.0005030418922099318,
+      "loss": 2.9361,
+      "step": 201000
+    },
+    {
+      "epoch": 10.86255108625511,
+      "grad_norm": 0.24516218900680542,
+      "learning_rate": 0.0005001029654036244,
+      "loss": 2.9358,
+      "step": 202000
+    },
+    {
+      "epoch": 10.916326091632609,
+      "grad_norm": 0.23790410161018372,
+      "learning_rate": 0.0004971610967286421,
+      "loss": 2.9344,
+      "step": 203000
+    },
+    {
+      "epoch": 10.97010109701011,
+      "grad_norm": 0.2549736797809601,
+      "learning_rate": 0.0004942221699223347,
+      "loss": 2.9367,
+      "step": 204000
+    },
+    {
+      "epoch": 11.0,
+      "eval_accuracy": 0.40993629082379995,
+      "eval_loss": 3.3604304790496826,
+      "eval_runtime": 154.4315,
+      "eval_samples_per_second": 375.034,
+      "eval_steps_per_second": 5.86,
+      "step": 204556
+    },
+    {
+      "epoch": 11.02387610238761,
+      "grad_norm": 0.2493738979101181,
+      "learning_rate": 0.0004912803012473523,
+      "loss": 2.9057,
+      "step": 205000
+    },
+    {
+      "epoch": 11.07765110776511,
+      "grad_norm": 0.245314821600914,
+      "learning_rate": 0.00048833843257237,
+      "loss": 2.8775,
+      "step": 206000
+    },
+    {
+      "epoch": 11.131426113142611,
+      "grad_norm": 0.24536246061325073,
+      "learning_rate": 0.0004853995057660626,
+      "loss": 2.8783,
+      "step": 207000
+    },
+    {
+      "epoch": 11.185201118520112,
+      "grad_norm": 0.26116108894348145,
+      "learning_rate": 0.00048245763709108024,
+      "loss": 2.8851,
+      "step": 208000
+    },
+    {
+      "epoch": 11.238976123897613,
+      "grad_norm": 0.24136824905872345,
+      "learning_rate": 0.0004795187102847729,
+      "loss": 2.8918,
+      "step": 209000
+    },
+    {
+      "epoch": 11.292751129275112,
+      "grad_norm": 0.275006502866745,
+      "learning_rate": 0.0004765768416097905,
+      "loss": 2.8951,
+      "step": 210000
+    },
+    {
+      "epoch": 11.346526134652613,
+      "grad_norm": 0.27772292494773865,
+      "learning_rate": 0.0004736349729348082,
+      "loss": 2.8973,
+      "step": 211000
+    },
+    {
+      "epoch": 11.400301140030114,
+      "grad_norm": 0.2865879237651825,
+      "learning_rate": 0.0004706960461285008,
+      "loss": 2.8977,
+      "step": 212000
+    },
+    {
+      "epoch": 11.454076145407615,
+      "grad_norm": 0.24032117426395416,
+      "learning_rate": 0.00046775711932219347,
+      "loss": 2.9002,
+      "step": 213000
+    },
+    {
+      "epoch": 11.507851150785115,
+      "grad_norm": 0.24863946437835693,
+      "learning_rate": 0.00046481525064721114,
+      "loss": 2.9052,
+      "step": 214000
+    },
+    {
+      "epoch": 11.561626156162616,
+      "grad_norm": 0.25115910172462463,
+      "learning_rate": 0.00046187338197222876,
+      "loss": 2.9081,
+      "step": 215000
+    },
+    {
+      "epoch": 11.615401161540117,
+      "grad_norm": 0.25204330682754517,
+      "learning_rate": 0.00045893151329724644,
+      "loss": 2.906,
+      "step": 216000
+    },
+    {
+      "epoch": 11.669176166917616,
+      "grad_norm": 0.25553619861602783,
+      "learning_rate": 0.00045599258649093905,
+      "loss": 2.9074,
+      "step": 217000
+    },
+    {
+      "epoch": 11.722951172295117,
+      "grad_norm": 0.24004510045051575,
+      "learning_rate": 0.00045305071781595667,
+      "loss": 2.9105,
+      "step": 218000
+    },
+    {
+      "epoch": 11.776726177672618,
+      "grad_norm": 0.28220391273498535,
+      "learning_rate": 0.00045010884914097434,
+      "loss": 2.9075,
+      "step": 219000
+    },
+    {
+      "epoch": 11.830501183050119,
+      "grad_norm": 0.2582526206970215,
+      "learning_rate": 0.00044716992233466695,
+      "loss": 2.9129,
+      "step": 220000
+    },
+    {
+      "epoch": 11.884276188427618,
+      "grad_norm": 0.27006328105926514,
+      "learning_rate": 0.0004442309955283596,
+      "loss": 2.9146,
+      "step": 221000
+    },
+    {
+      "epoch": 11.93805119380512,
+      "grad_norm": 0.3025253415107727,
+      "learning_rate": 0.0004412891268533773,
+      "loss": 2.9161,
+      "step": 222000
+    },
+    {
+      "epoch": 11.99182619918262,
+      "grad_norm": 0.2534899115562439,
+      "learning_rate": 0.00043834725817839496,
+      "loss": 2.9145,
+      "step": 223000
+    },
+    {
+      "epoch": 12.0,
+      "eval_accuracy": 0.40973608482660917,
+      "eval_loss": 3.3759312629699707,
+      "eval_runtime": 154.6971,
+      "eval_samples_per_second": 374.39,
+      "eval_steps_per_second": 5.85,
+      "step": 223152
+    },
+    {
+      "epoch": 12.045601204560121,
+      "grad_norm": 0.27566835284233093,
+      "learning_rate": 0.0004354053895034126,
+      "loss": 2.8626,
+      "step": 224000
+    },
+    {
+      "epoch": 12.09937620993762,
+      "grad_norm": 0.243771031498909,
+      "learning_rate": 0.0004324664626971052,
+      "loss": 2.8583,
+      "step": 225000
+    },
+    {
+      "epoch": 12.153151215315122,
+      "grad_norm": 0.26204991340637207,
+      "learning_rate": 0.00042952459402212286,
+      "loss": 2.8654,
+      "step": 226000
+    },
+    {
+      "epoch": 12.206926220692623,
+      "grad_norm": 0.2582913637161255,
+      "learning_rate": 0.0004265827253471405,
+      "loss": 2.8673,
+      "step": 227000
+    },
+    {
+      "epoch": 12.260701226070122,
+      "grad_norm": 0.27284786105155945,
+      "learning_rate": 0.00042364674040950813,
+      "loss": 2.8735,
+      "step": 228000
+    },
+    {
+      "epoch": 12.314476231447623,
+      "grad_norm": 0.25654593110084534,
+      "learning_rate": 0.00042070487173452575,
+      "loss": 2.8751,
+      "step": 229000
+    },
+    {
+      "epoch": 12.368251236825124,
+      "grad_norm": 0.23449215292930603,
+      "learning_rate": 0.0004177630030595435,
+      "loss": 2.8769,
+      "step": 230000
+    },
+    {
+      "epoch": 12.422026242202625,
+      "grad_norm": 0.24525216221809387,
+      "learning_rate": 0.00041482407625323604,
+      "loss": 2.8763,
+      "step": 231000
+    },
+    {
+      "epoch": 12.475801247580124,
+      "grad_norm": 0.26022928953170776,
+      "learning_rate": 0.0004118822075782537,
+      "loss": 2.8811,
+      "step": 232000
+    },
+    {
+      "epoch": 12.529576252957625,
+      "grad_norm": 0.24519123136997223,
+      "learning_rate": 0.0004089403389032714,
+      "loss": 2.8844,
+      "step": 233000
+    },
+    {
+      "epoch": 12.583351258335126,
+      "grad_norm": 0.3118698298931122,
+      "learning_rate": 0.000405998470228289,
+      "loss": 2.8837,
+      "step": 234000
+    },
+    {
+      "epoch": 12.637126263712627,
+      "grad_norm": 0.28287139534950256,
+      "learning_rate": 0.0004030566015533066,
+      "loss": 2.8843,
+      "step": 235000
+    },
+    {
+      "epoch": 12.690901269090126,
+      "grad_norm": 0.24490605294704437,
+      "learning_rate": 0.0004001176747469993,
+      "loss": 2.8868,
+      "step": 236000
+    },
+    {
+      "epoch": 12.744676274467627,
+      "grad_norm": 0.2810141444206238,
+      "learning_rate": 0.00039717580607201696,
+      "loss": 2.8905,
+      "step": 237000
+    },
+    {
+      "epoch": 12.798451279845128,
+      "grad_norm": 0.2660733759403229,
+      "learning_rate": 0.0003942368792657096,
+      "loss": 2.8917,
+      "step": 238000
+    },
+    {
+      "epoch": 12.852226285222628,
+      "grad_norm": 0.2440771758556366,
+      "learning_rate": 0.00039129501059072724,
+      "loss": 2.888,
+      "step": 239000
+    },
+    {
+      "epoch": 12.906001290600129,
+      "grad_norm": 0.25520211458206177,
+      "learning_rate": 0.0003883560837844199,
+      "loss": 2.8938,
+      "step": 240000
+    },
+    {
+      "epoch": 12.95977629597763,
+      "grad_norm": 0.2661096751689911,
+      "learning_rate": 0.0003854142151094375,
+      "loss": 2.8924,
+      "step": 241000
+    },
+    {
+      "epoch": 13.0,
+      "eval_accuracy": 0.4095556978794759,
+      "eval_loss": 3.3856160640716553,
+      "eval_runtime": 154.4788,
+      "eval_samples_per_second": 374.919,
+      "eval_steps_per_second": 5.858,
+      "step": 241748
+    },
+    {
+      "epoch": 13.01355130135513,
+      "grad_norm": 0.29137110710144043,
+      "learning_rate": 0.00038247234643445514,
+      "loss": 2.8796,
+      "step": 242000
+    },
+    {
+      "epoch": 13.06732630673263,
+      "grad_norm": 0.26854678988456726,
+      "learning_rate": 0.0003795304777594728,
+      "loss": 2.837,
+      "step": 243000
+    },
+    {
+      "epoch": 13.121101312110131,
+      "grad_norm": 0.2614552974700928,
+      "learning_rate": 0.00037659155095316543,
+      "loss": 2.8409,
+      "step": 244000
+    },
+    {
+      "epoch": 13.174876317487632,
+      "grad_norm": 0.3031397759914398,
+      "learning_rate": 0.00037364968227818316,
+      "loss": 2.8458,
+      "step": 245000
+    },
+    {
+      "epoch": 13.228651322865133,
+      "grad_norm": 0.2507542073726654,
+      "learning_rate": 0.0003707107554718757,
+      "loss": 2.8537,
+      "step": 246000
+    },
+    {
+      "epoch": 13.282426328242632,
+      "grad_norm": 0.2524189054965973,
+      "learning_rate": 0.0003677688867968934,
+      "loss": 2.852,
+      "step": 247000
+    },
+    {
+      "epoch": 13.336201333620133,
+      "grad_norm": 0.27850231528282166,
+      "learning_rate": 0.00036482701812191106,
+      "loss": 2.8574,
+      "step": 248000
+    },
+    {
+      "epoch": 13.389976338997634,
+      "grad_norm": 0.2408652901649475,
+      "learning_rate": 0.00036188809131560367,
+      "loss": 2.8563,
+      "step": 249000
+    },
+    {
+      "epoch": 13.443751344375134,
+      "grad_norm": 0.28609830141067505,
+      "learning_rate": 0.00035894622264062134,
+      "loss": 2.8627,
+      "step": 250000
+    },
+    {
+      "epoch": 13.497526349752635,
+      "grad_norm": 0.2690850496292114,
+      "learning_rate": 0.00035600435396563896,
+      "loss": 2.8619,
+      "step": 251000
+    },
+    {
+      "epoch": 13.551301355130136,
+      "grad_norm": 0.2862522304058075,
+      "learning_rate": 0.0003530654271593316,
+      "loss": 2.8646,
+      "step": 252000
+    },
+    {
+      "epoch": 13.605076360507637,
+      "grad_norm": 0.2629512548446655,
+      "learning_rate": 0.0003501235584843493,
+      "loss": 2.865,
+      "step": 253000
+    },
+    {
+      "epoch": 13.658851365885136,
+      "grad_norm": 0.2542857825756073,
+      "learning_rate": 0.00034718463167804185,
+      "loss": 2.865,
+      "step": 254000
+    },
+    {
+      "epoch": 13.712626371262637,
+      "grad_norm": 0.258798211812973,
+      "learning_rate": 0.0003442427630030596,
+      "loss": 2.8686,
+      "step": 255000
+    },
+    {
+      "epoch": 13.766401376640138,
+      "grad_norm": 0.2492339164018631,
+      "learning_rate": 0.0003413008943280772,
+      "loss": 2.8689,
+      "step": 256000
+    },
+    {
+      "epoch": 13.820176382017639,
+      "grad_norm": 0.29779887199401855,
+      "learning_rate": 0.0003383590256530948,
+      "loss": 2.8691,
+      "step": 257000
+    },
+    {
+      "epoch": 13.873951387395138,
+      "grad_norm": 0.2670515179634094,
+      "learning_rate": 0.0003354200988467875,
+      "loss": 2.8711,
+      "step": 258000
+    },
+    {
+      "epoch": 13.92772639277264,
+      "grad_norm": 0.26957201957702637,
+      "learning_rate": 0.0003324782301718051,
+      "loss": 2.8711,
+      "step": 259000
+    },
+    {
+      "epoch": 13.98150139815014,
+      "grad_norm": 0.2824675142765045,
+      "learning_rate": 0.00032953636149682283,
+      "loss": 2.8757,
+      "step": 260000
+    },
+    {
+      "epoch": 14.0,
+      "eval_accuracy": 0.410457699798363,
+      "eval_loss": 3.384411573410034,
+      "eval_runtime": 154.9443,
+      "eval_samples_per_second": 373.792,
+      "eval_steps_per_second": 5.841,
+      "step": 260344
+    },
+    {
+      "epoch": 14.035276403527641,
+      "grad_norm": 0.29713505506515503,
+      "learning_rate": 0.00032659449282184045,
+      "loss": 2.8354,
+      "step": 261000
+    },
+    {
+      "epoch": 14.08905140890514,
+      "grad_norm": 0.27730679512023926,
+      "learning_rate": 0.00032365850788420805,
+      "loss": 2.8198,
+      "step": 262000
+    },
+    {
+      "epoch": 14.142826414282641,
+      "grad_norm": 0.29409125447273254,
+      "learning_rate": 0.0003207166392092257,
+      "loss": 2.824,
+      "step": 263000
+    },
+    {
+      "epoch": 14.196601419660142,
+      "grad_norm": 0.280676931142807,
+      "learning_rate": 0.00031777477053424334,
+      "loss": 2.8301,
+      "step": 264000
+    },
+    {
+      "epoch": 14.250376425037642,
+      "grad_norm": 0.3202875554561615,
+      "learning_rate": 0.000314835843727936,
+      "loss": 2.8271,
+      "step": 265000
+    },
+    {
+      "epoch": 14.304151430415143,
+      "grad_norm": 0.2673070728778839,
+      "learning_rate": 0.0003118939750529536,
+      "loss": 2.8376,
+      "step": 266000
+    },
+    {
+      "epoch": 14.357926435792644,
+      "grad_norm": 0.2868790030479431,
+      "learning_rate": 0.0003089550482466463,
+      "loss": 2.8416,
+      "step": 267000
+    },
+    {
+      "epoch": 14.411701441170145,
+      "grad_norm": 0.2689562737941742,
+      "learning_rate": 0.0003060131795716639,
+      "loss": 2.8408,
+      "step": 268000
+    },
+    {
+      "epoch": 14.465476446547644,
+      "grad_norm": 0.2907434403896332,
+      "learning_rate": 0.0003030713108966815,
+      "loss": 2.8429,
+      "step": 269000
+    },
+    {
+      "epoch": 14.519251451925145,
+      "grad_norm": 0.26338914036750793,
+      "learning_rate": 0.00030012944222169925,
+      "loss": 2.8465,
+      "step": 270000
+    },
+    {
+      "epoch": 14.573026457302646,
+      "grad_norm": 0.2925278842449188,
+      "learning_rate": 0.00029719051541539186,
+      "loss": 2.8444,
+      "step": 271000
+    },
+    {
+      "epoch": 14.626801462680147,
+      "grad_norm": 0.2883965075016022,
+      "learning_rate": 0.00029424864674040954,
+      "loss": 2.8466,
+      "step": 272000
+    },
+    {
+      "epoch": 14.680576468057646,
+      "grad_norm": 0.28422123193740845,
+      "learning_rate": 0.00029130971993410215,
+      "loss": 2.8458,
+      "step": 273000
+    },
+    {
+      "epoch": 14.734351473435147,
+      "grad_norm": 0.28057223558425903,
+      "learning_rate": 0.0002883678512591198,
+      "loss": 2.8512,
+      "step": 274000
+    },
+    {
+      "epoch": 14.788126478812648,
+      "grad_norm": 0.2739641070365906,
+      "learning_rate": 0.00028542892445281243,
+      "loss": 2.8508,
+      "step": 275000
+    },
+    {
+      "epoch": 14.84190148419015,
+      "grad_norm": 0.26283150911331177,
+      "learning_rate": 0.00028248705577783005,
+      "loss": 2.8491,
+      "step": 276000
+    },
+    {
+      "epoch": 14.895676489567649,
+      "grad_norm": 0.25209805369377136,
+      "learning_rate": 0.0002795451871028477,
+      "loss": 2.8528,
+      "step": 277000
+    },
+    {
+      "epoch": 14.94945149494515,
+      "grad_norm": 0.29996606707572937,
+      "learning_rate": 0.0002766033184278654,
+      "loss": 2.8545,
+      "step": 278000
+    },
+    {
+      "epoch": 15.0,
+      "eval_accuracy": 0.4106898178253074,
+      "eval_loss": 3.383195400238037,
+      "eval_runtime": 154.9853,
+      "eval_samples_per_second": 373.693,
+      "eval_steps_per_second": 5.839,
+      "step": 278940
+    },
+    {
+      "epoch": 15.00322650032265,
+      "grad_norm": 0.30987992882728577,
+      "learning_rate": 0.00027366439162155806,
+      "loss": 2.8502,
+      "step": 279000
+    },
+    {
+      "epoch": 15.05700150570015,
+      "grad_norm": 0.2791779041290283,
+      "learning_rate": 0.0002707225229465757,
+      "loss": 2.8054,
+      "step": 280000
+    },
+    {
+      "epoch": 15.11077651107765,
+      "grad_norm": 0.29352006316185,
+      "learning_rate": 0.00026778359614026834,
+      "loss": 2.8087,
+      "step": 281000
+    },
+    {
+      "epoch": 15.164551516455152,
+      "grad_norm": 0.2739843428134918,
+      "learning_rate": 0.00026484466933396095,
+      "loss": 2.8134,
+      "step": 282000
+    },
+    {
+      "epoch": 15.218326521832653,
+      "grad_norm": 0.29397326707839966,
+      "learning_rate": 0.00026190280065897857,
+      "loss": 2.814,
+      "step": 283000
+    },
+    {
+      "epoch": 15.272101527210152,
+      "grad_norm": 0.2891228199005127,
+      "learning_rate": 0.00025896387385267123,
+      "loss": 2.8154,
+      "step": 284000
+    },
+    {
+      "epoch": 15.325876532587653,
+      "grad_norm": 0.2697419226169586,
+      "learning_rate": 0.00025602200517768885,
+      "loss": 2.815,
+      "step": 285000
+    },
+    {
+      "epoch": 15.379651537965154,
+      "grad_norm": 0.27169767022132874,
+      "learning_rate": 0.00025308013650270653,
+      "loss": 2.8179,
+      "step": 286000
+    },
+    {
+      "epoch": 15.433426543342655,
+      "grad_norm": 0.28420206904411316,
+      "learning_rate": 0.0002501382678277242,
+      "loss": 2.8242,
+      "step": 287000
+    },
+    {
+      "epoch": 15.487201548720154,
+      "grad_norm": 0.29990944266319275,
+      "learning_rate": 0.0002471993410214168,
+      "loss": 2.8284,
+      "step": 288000
+    },
+    {
+      "epoch": 15.540976554097655,
+      "grad_norm": 0.29358208179473877,
+      "learning_rate": 0.0002442574723464345,
+      "loss": 2.8242,
+      "step": 289000
+    },
+    {
+      "epoch": 15.594751559475156,
+      "grad_norm": 0.2709376811981201,
+      "learning_rate": 0.0002413156036714521,
+      "loss": 2.8323,
+      "step": 290000
+    },
+    {
+      "epoch": 15.648526564852656,
+      "grad_norm": 0.27639514207839966,
+      "learning_rate": 0.00023837961873381973,
+      "loss": 2.8295,
+      "step": 291000
+    },
+    {
+      "epoch": 15.702301570230157,
+      "grad_norm": 0.27499568462371826,
+      "learning_rate": 0.00023543775005883738,
+      "loss": 2.8273,
+      "step": 292000
+    },
+    {
+      "epoch": 15.756076575607658,
+      "grad_norm": 0.2614821493625641,
+      "learning_rate": 0.00023249588138385502,
+      "loss": 2.8302,
+      "step": 293000
+    },
+    {
+      "epoch": 15.809851580985159,
+      "grad_norm": 0.30566710233688354,
+      "learning_rate": 0.00022955695457754766,
+      "loss": 2.8315,
+      "step": 294000
+    },
+    {
+      "epoch": 15.863626586362658,
+      "grad_norm": 0.28071853518486023,
+      "learning_rate": 0.00022661508590256533,
+      "loss": 2.8282,
+      "step": 295000
+    },
+    {
+      "epoch": 15.917401591740159,
+      "grad_norm": 0.26871344447135925,
+      "learning_rate": 0.00022367321722758295,
+      "loss": 2.8318,
+      "step": 296000
+    },
+    {
+      "epoch": 15.97117659711766,
+      "grad_norm": 0.27507010102272034,
+      "learning_rate": 0.0002207313485526006,
+      "loss": 2.8339,
+      "step": 297000
+    },
+    {
+      "epoch": 16.0,
+      "eval_accuracy": 0.4098433764298017,
+      "eval_loss": 3.4079394340515137,
+      "eval_runtime": 155.0447,
+      "eval_samples_per_second": 373.55,
+      "eval_steps_per_second": 5.837,
+      "step": 297536
+    },
+    {
+      "epoch": 16.02495160249516,
+      "grad_norm": 0.2846646010875702,
+      "learning_rate": 0.00021778947987761827,
+      "loss": 2.8126,
+      "step": 298000
+    },
+    {
+      "epoch": 16.07872660787266,
+      "grad_norm": 0.30357855558395386,
+      "learning_rate": 0.0002148505530713109,
+      "loss": 2.7901,
+      "step": 299000
+    },
+    {
+      "epoch": 16.132501613250163,
+      "grad_norm": 0.27245578169822693,
+      "learning_rate": 0.00021191162626500354,
+      "loss": 2.791,
+      "step": 300000
+    },
+    {
+      "epoch": 16.186276618627662,
+      "grad_norm": 0.27511173486709595,
+      "learning_rate": 0.0002089697575900212,
+      "loss": 2.7991,
+      "step": 301000
+    },
+    {
+      "epoch": 16.24005162400516,
+      "grad_norm": 0.2818892300128937,
+      "learning_rate": 0.00020602788891503884,
+      "loss": 2.8017,
+      "step": 302000
+    },
+    {
+      "epoch": 16.293826629382664,
+      "grad_norm": 0.29610157012939453,
+      "learning_rate": 0.00020308896210873147,
+      "loss": 2.7992,
+      "step": 303000
+    },
+    {
+      "epoch": 16.347601634760164,
+      "grad_norm": 0.28958022594451904,
+      "learning_rate": 0.00020014709343374912,
+      "loss": 2.8032,
+      "step": 304000
+    },
+    {
+      "epoch": 16.401376640137663,
+      "grad_norm": 0.32356253266334534,
+      "learning_rate": 0.00019720522475876677,
+      "loss": 2.8018,
+      "step": 305000
+    },
+    {
+      "epoch": 16.455151645515166,
+      "grad_norm": 0.2847062051296234,
+      "learning_rate": 0.0001942633560837844,
+      "loss": 2.8077,
+      "step": 306000
+    },
+    {
+      "epoch": 16.508926650892665,
+      "grad_norm": 0.2739544212818146,
+      "learning_rate": 0.00019132442927747705,
+      "loss": 2.81,
+      "step": 307000
+    },
+    {
+      "epoch": 16.562701656270164,
+      "grad_norm": 0.28697946667671204,
+      "learning_rate": 0.0001883825606024947,
+      "loss": 2.807,
+      "step": 308000
+    },
+    {
+      "epoch": 16.616476661647667,
+      "grad_norm": 0.29771292209625244,
+      "learning_rate": 0.00018544363379618733,
+      "loss": 2.8096,
+      "step": 309000
+    },
+    {
+      "epoch": 16.670251667025166,
+      "grad_norm": 0.29724204540252686,
+      "learning_rate": 0.000182501765121205,
+      "loss": 2.8081,
+      "step": 310000
+    },
+    {
+      "epoch": 16.72402667240267,
+      "grad_norm": 0.2964895963668823,
+      "learning_rate": 0.00017955989644622265,
+      "loss": 2.8133,
+      "step": 311000
+    },
+    {
+      "epoch": 16.777801677780168,
+      "grad_norm": 0.28841668367385864,
+      "learning_rate": 0.00017662391150859025,
+      "loss": 2.8086,
+      "step": 312000
+    },
+    {
+      "epoch": 16.831576683157667,
+      "grad_norm": 0.2766316831111908,
+      "learning_rate": 0.00017368204283360793,
+      "loss": 2.8152,
+      "step": 313000
+    },
+    {
+      "epoch": 16.88535168853517,
+      "grad_norm": 0.32704171538352966,
+      "learning_rate": 0.00017074017415862554,
+      "loss": 2.8162,
+      "step": 314000
+    },
+    {
+      "epoch": 16.93912669391267,
+      "grad_norm": 0.3360968828201294,
+      "learning_rate": 0.00016779830548364322,
+      "loss": 2.8133,
+      "step": 315000
+    },
+    {
+      "epoch": 16.99290169929017,
+      "grad_norm": 0.31055524945259094,
+      "learning_rate": 0.00016485643680866087,
+      "loss": 2.8157,
+      "step": 316000
+    },
+    {
+      "epoch": 17.0,
+      "eval_accuracy": 0.4103879636154489,
+      "eval_loss": 3.3883779048919678,
+      "eval_runtime": 154.6539,
+      "eval_samples_per_second": 374.494,
+      "eval_steps_per_second": 5.852,
+      "step": 316132
+    },
+    {
+      "epoch": 17.04667670466767,
+      "grad_norm": 0.30388155579566956,
+      "learning_rate": 0.0001619175100023535,
+      "loss": 2.7807,
+      "step": 317000
+    },
+    {
+      "epoch": 17.10045171004517,
+      "grad_norm": 0.2881651818752289,
+      "learning_rate": 0.00015897564132737118,
+      "loss": 2.7798,
+      "step": 318000
+    },
+    {
+      "epoch": 17.15422671542267,
+      "grad_norm": 0.2695824205875397,
+      "learning_rate": 0.0001560337726523888,
+      "loss": 2.7817,
+      "step": 319000
+    },
+    {
+      "epoch": 17.208001720800173,
+      "grad_norm": 0.30308765172958374,
+      "learning_rate": 0.00015309484584608143,
+      "loss": 2.7808,
+      "step": 320000
+    },
+    {
+      "epoch": 17.261776726177672,
+      "grad_norm": 0.29839852452278137,
+      "learning_rate": 0.00015015297717109908,
+      "loss": 2.7878,
+      "step": 321000
+    },
+    {
+      "epoch": 17.315551731555175,
+      "grad_norm": 0.2739504873752594,
+      "learning_rate": 0.00014721110849611675,
+      "loss": 2.7849,
+      "step": 322000
+    },
+    {
+      "epoch": 17.369326736932674,
+      "grad_norm": 0.33289453387260437,
+      "learning_rate": 0.00014427512355848435,
+      "loss": 2.7878,
+      "step": 323000
+    },
+    {
+      "epoch": 17.423101742310173,
+      "grad_norm": 0.30872535705566406,
+      "learning_rate": 0.00014133325488350202,
+      "loss": 2.7862,
+      "step": 324000
+    },
+    {
+      "epoch": 17.476876747687676,
+      "grad_norm": 0.30065229535102844,
+      "learning_rate": 0.00013839138620851964,
+      "loss": 2.7903,
+      "step": 325000
+    },
+    {
+      "epoch": 17.530651753065175,
+      "grad_norm": 0.294783353805542,
+      "learning_rate": 0.0001354495175335373,
+      "loss": 2.7901,
+      "step": 326000
+    },
+    {
+      "epoch": 17.584426758442675,
+      "grad_norm": 0.3017074167728424,
+      "learning_rate": 0.00013251059072722993,
+      "loss": 2.789,
+      "step": 327000
+    },
+    {
+      "epoch": 17.638201763820177,
+      "grad_norm": 0.28931453824043274,
+      "learning_rate": 0.0001295687220522476,
+      "loss": 2.7962,
+      "step": 328000
+    },
+    {
+      "epoch": 17.691976769197677,
+      "grad_norm": 0.30777105689048767,
+      "learning_rate": 0.00012662685337726525,
+      "loss": 2.7905,
+      "step": 329000
+    },
+    {
+      "epoch": 17.745751774575176,
+      "grad_norm": 0.30437570810317993,
+      "learning_rate": 0.00012368792657095788,
+      "loss": 2.7922,
+      "step": 330000
+    },
+    {
+      "epoch": 17.79952677995268,
+      "grad_norm": 0.279985249042511,
+      "learning_rate": 0.00012074605789597553,
+      "loss": 2.7958,
+      "step": 331000
+    },
+    {
+      "epoch": 17.853301785330178,
+      "grad_norm": 0.2660251557826996,
+      "learning_rate": 0.00011780418922099318,
+      "loss": 2.7954,
+      "step": 332000
+    },
+    {
+      "epoch": 17.90707679070768,
+      "grad_norm": 0.2697618901729584,
+      "learning_rate": 0.00011486526241468581,
+      "loss": 2.7969,
+      "step": 333000
+    },
+    {
+      "epoch": 17.96085179608518,
+      "grad_norm": 0.2890709340572357,
+      "learning_rate": 0.00011192633560837845,
+      "loss": 2.7966,
+      "step": 334000
+    },
+    {
+      "epoch": 18.0,
+      "eval_accuracy": 0.4105063404500295,
+      "eval_loss": 3.4080612659454346,
+      "eval_runtime": 154.9923,
+      "eval_samples_per_second": 373.677,
+      "eval_steps_per_second": 5.839,
+      "step": 334728
+    },
+    {
+      "epoch": 18.01462680146268,
+      "grad_norm": 0.2767917215824127,
+      "learning_rate": 0.0001089844669333961,
+      "loss": 2.7867,
+      "step": 335000
+    },
+    {
+      "epoch": 18.068401806840182,
+      "grad_norm": 0.29336050152778625,
+      "learning_rate": 0.00010604259825841375,
+      "loss": 2.7639,
+      "step": 336000
+    },
+    {
+      "epoch": 18.12217681221768,
+      "grad_norm": 0.3294292688369751,
+      "learning_rate": 0.0001031007295834314,
+      "loss": 2.7658,
+      "step": 337000
+    },
+    {
+      "epoch": 18.17595181759518,
+      "grad_norm": 0.30230215191841125,
+      "learning_rate": 0.00010015886090844905,
+      "loss": 2.766,
+      "step": 338000
+    },
+    {
+      "epoch": 18.229726822972683,
+      "grad_norm": 0.28469741344451904,
+      "learning_rate": 9.721993410214168e-05,
+      "loss": 2.7682,
+      "step": 339000
+    },
+    {
+      "epoch": 18.283501828350182,
+      "grad_norm": 0.2829948365688324,
+      "learning_rate": 9.427806542715933e-05,
+      "loss": 2.7747,
+      "step": 340000
+    },
+    {
+      "epoch": 18.33727683372768,
+      "grad_norm": 0.27874138951301575,
+      "learning_rate": 9.133913862085197e-05,
+      "loss": 2.7719,
+      "step": 341000
+    },
+    {
+      "epoch": 18.391051839105184,
+      "grad_norm": 0.3442845642566681,
+      "learning_rate": 8.839726994586963e-05,
+      "loss": 2.7753,
+      "step": 342000
+    },
+    {
+      "epoch": 18.444826844482684,
+      "grad_norm": 0.36807698011398315,
+      "learning_rate": 8.545540127088727e-05,
+      "loss": 2.7734,
+      "step": 343000
+    },
+    {
+      "epoch": 18.498601849860187,
+      "grad_norm": 0.3008968234062195,
+      "learning_rate": 8.25164744645799e-05,
+      "loss": 2.7742,
+      "step": 344000
+    },
+    {
+      "epoch": 18.552376855237686,
+      "grad_norm": 0.3300212621688843,
+      "learning_rate": 7.957460578959754e-05,
+      "loss": 2.7729,
+      "step": 345000
+    },
+    {
+      "epoch": 18.606151860615185,
+      "grad_norm": 0.2936519682407379,
+      "learning_rate": 7.66327371146152e-05,
+      "loss": 2.7756,
+      "step": 346000
+    },
+    {
+      "epoch": 18.659926865992688,
+      "grad_norm": 0.2997318208217621,
+      "learning_rate": 7.369381030830784e-05,
+      "loss": 2.7787,
+      "step": 347000
+    },
+    {
+      "epoch": 18.713701871370187,
+      "grad_norm": 0.27832144498825073,
+      "learning_rate": 7.075194163332549e-05,
+      "loss": 2.7736,
+      "step": 348000
+    },
+    {
+      "epoch": 18.767476876747686,
+      "grad_norm": 0.28568559885025024,
+      "learning_rate": 6.781301482701812e-05,
+      "loss": 2.7752,
+      "step": 349000
+    },
+    {
+      "epoch": 18.82125188212519,
+      "grad_norm": 0.2887098789215088,
+      "learning_rate": 6.487408802071074e-05,
+      "loss": 2.7764,
+      "step": 350000
+    },
+    {
+      "epoch": 18.87502688750269,
+      "grad_norm": 0.31327852606773376,
+      "learning_rate": 6.19322193457284e-05,
+      "loss": 2.7776,
+      "step": 351000
+    },
+    {
+      "epoch": 18.928801892880188,
+      "grad_norm": 0.2947293519973755,
+      "learning_rate": 5.899035067074606e-05,
+      "loss": 2.7778,
+      "step": 352000
+    },
+    {
+      "epoch": 18.98257689825769,
+      "grad_norm": 0.27630892395973206,
+      "learning_rate": 5.604848199576371e-05,
+      "loss": 2.7807,
+      "step": 353000
+    },
+    {
+      "epoch": 19.0,
+      "eval_accuracy": 0.41043096087659053,
+      "eval_loss": 3.4180543422698975,
+      "eval_runtime": 155.0182,
+      "eval_samples_per_second": 373.614,
+      "eval_steps_per_second": 5.838,
+      "step": 353324
+    },
+    {
+      "epoch": 19.03635190363519,
+      "grad_norm": 0.29157590866088867,
+      "learning_rate": 5.310955518945634e-05,
+      "loss": 2.7661,
+      "step": 354000
+    },
+    {
+      "epoch": 19.090126909012692,
+      "grad_norm": 0.26800793409347534,
+      "learning_rate": 5.0167686514473995e-05,
+      "loss": 2.7558,
+      "step": 355000
+    },
+    {
+      "epoch": 19.14390191439019,
+      "grad_norm": 0.31518709659576416,
+      "learning_rate": 4.722581783949165e-05,
+      "loss": 2.7579,
+      "step": 356000
+    },
+    {
+      "epoch": 19.19767691976769,
+      "grad_norm": 0.2998282313346863,
+      "learning_rate": 4.428689103318428e-05,
+      "loss": 2.7574,
+      "step": 357000
+    },
+    {
+      "epoch": 19.251451925145194,
+      "grad_norm": 0.3034313917160034,
+      "learning_rate": 4.134502235820193e-05,
+      "loss": 2.7606,
+      "step": 358000
+    },
+    {
+      "epoch": 19.305226930522693,
+      "grad_norm": 0.2875937819480896,
+      "learning_rate": 3.8403153683219584e-05,
+      "loss": 2.7586,
+      "step": 359000
+    },
+    {
+      "epoch": 19.359001935900192,
+      "grad_norm": 0.30761197209358215,
+      "learning_rate": 3.546422687691222e-05,
+      "loss": 2.7577,
+      "step": 360000
+    },
+    {
+      "epoch": 19.412776941277695,
+      "grad_norm": 0.29801440238952637,
+      "learning_rate": 3.252530007060485e-05,
+      "loss": 2.7557,
+      "step": 361000
+    },
+    {
+      "epoch": 19.466551946655194,
+      "grad_norm": 0.2939643859863281,
+      "learning_rate": 2.95834313956225e-05,
+      "loss": 2.7571,
+      "step": 362000
+    },
+    {
+      "epoch": 19.520326952032697,
+      "grad_norm": 0.29968592524528503,
+      "learning_rate": 2.6641562720640153e-05,
+      "loss": 2.761,
+      "step": 363000
+    },
+    {
+      "epoch": 19.574101957410196,
+      "grad_norm": 0.28651562333106995,
+      "learning_rate": 2.36996940456578e-05,
+      "loss": 2.7579,
+      "step": 364000
+    },
+    {
+      "epoch": 19.627876962787695,
+      "grad_norm": 0.30474939942359924,
+      "learning_rate": 2.0760767239350436e-05,
+      "loss": 2.7598,
+      "step": 365000
+    },
+    {
+      "epoch": 19.6816519681652,
+      "grad_norm": 0.2941524088382721,
+      "learning_rate": 1.7818898564368086e-05,
+      "loss": 2.7644,
+      "step": 366000
+    },
+    {
+      "epoch": 19.735426973542697,
+      "grad_norm": 0.2570919990539551,
+      "learning_rate": 1.487997175806072e-05,
+      "loss": 2.7549,
+      "step": 367000
+    },
+    {
+      "epoch": 19.789201978920197,
+      "grad_norm": 0.31410789489746094,
+      "learning_rate": 1.193810308307837e-05,
+      "loss": 2.7624,
+      "step": 368000
+    },
+    {
+      "epoch": 19.8429769842977,
+      "grad_norm": 0.29923874139785767,
+      "learning_rate": 8.999176276771005e-06,
+      "loss": 2.7621,
+      "step": 369000
+    },
+    {
+      "epoch": 19.8967519896752,
+      "grad_norm": 0.29635271430015564,
+      "learning_rate": 6.057307601788656e-06,
+      "loss": 2.7601,
+      "step": 370000
+    },
+    {
+      "epoch": 19.950526995052698,
+      "grad_norm": 0.29568880796432495,
+      "learning_rate": 3.1154389268063074e-06,
+      "loss": 2.7595,
+      "step": 371000
+    },
+    {
+      "epoch": 20.0,
+      "eval_accuracy": 0.41021489963935376,
+      "eval_loss": 3.423628807067871,
+      "eval_runtime": 154.8423,
+      "eval_samples_per_second": 374.039,
+      "eval_steps_per_second": 5.845,
+      "step": 371920
+    },
+    {
+      "epoch": 20.0,
+      "step": 371920,
+      "total_flos": 1.5670047538944e+18,
+      "train_loss": 3.0279207580395733,
+      "train_runtime": 82395.3872,
+      "train_samples_per_second": 144.441,
+      "train_steps_per_second": 4.514
+    }
+  ],
+  "logging_steps": 1000,
+  "max_steps": 371920,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 5000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.5670047538944e+18,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}