Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

adapter_model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +1053 -3

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dc0bc753d549873778279b4352a2fef1178f8b18e419b345700de66966f94f38
 size 3705288

 version https://git-lfs.github.com/spec/v1
+oid sha256:afbb9fafdaa3ca57948b5758a6f7f91aff0d8bb88753df2cf01d7363ea7ed873
 size 3705288

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:399e81a7072d05f6bd91a40835348ac36d6e7a76eea6f1c41f12f661c7013db2
 size 2213690

 version https://git-lfs.github.com/spec/v1
+oid sha256:4ef832f29c75c0a4f145b612ac65d501410d3c9e4ad8c11582372091e773bf49
 size 2213690

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c282cd48f6ae31c73c753236ccc00d4589da3b2940ecbca74e49b6784205ce15
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:7fca791d6b819bad1dbc285bf8bd7345964c2aeaf16b7d702b5c0c5380f2a057
 size 14244

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4b8618bf2b16b696607893880c2e64585cb7c07b1c54a6d87b831038f8cd2066
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:6b04bc150d39b084b09acd98d05c3563c323fc35277059ff5584f9f2d3fde608
 size 1064

trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 29.286150091519218,
   "eval_steps": 500,
-  "global_step": 6000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2107,6 +2107,1056 @@
       "learning_rate": 0.00014187192118226603,
       "loss": 0.444,
       "step": 6000
     }
   ],
   "logging_steps": 20,
@@ -2126,7 +3176,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.1953936764223488e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 43.92922513727883,
   "eval_steps": 500,
+  "global_step": 9000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "learning_rate": 0.00014187192118226603,
       "loss": 0.444,
       "step": 6000
+    },
+    {
+      "epoch": 29.383770591824284,
+      "grad_norm": 4.88236141204834,
+      "learning_rate": 0.00014167487684729063,
+      "loss": 0.4639,
+      "step": 6020
+    },
+    {
+      "epoch": 29.481391092129346,
+      "grad_norm": 3.7870137691497803,
+      "learning_rate": 0.00014147783251231528,
+      "loss": 0.4873,
+      "step": 6040
+    },
+    {
+      "epoch": 29.579011592434412,
+      "grad_norm": 3.091411590576172,
+      "learning_rate": 0.0001412807881773399,
+      "loss": 0.4834,
+      "step": 6060
+    },
+    {
+      "epoch": 29.676632092739474,
+      "grad_norm": 2.7498538494110107,
+      "learning_rate": 0.00014108374384236454,
+      "loss": 0.4846,
+      "step": 6080
+    },
+    {
+      "epoch": 29.77425259304454,
+      "grad_norm": 3.2043850421905518,
+      "learning_rate": 0.00014088669950738917,
+      "loss": 0.4983,
+      "step": 6100
+    },
+    {
+      "epoch": 29.871873093349603,
+      "grad_norm": 3.270357847213745,
+      "learning_rate": 0.0001406896551724138,
+      "loss": 0.4803,
+      "step": 6120
+    },
+    {
+      "epoch": 29.96949359365467,
+      "grad_norm": 3.031405210494995,
+      "learning_rate": 0.00014049261083743842,
+      "loss": 0.5287,
+      "step": 6140
+    },
+    {
+      "epoch": 30.06711409395973,
+      "grad_norm": 3.390765905380249,
+      "learning_rate": 0.00014029556650246307,
+      "loss": 0.4619,
+      "step": 6160
+    },
+    {
+      "epoch": 30.164734594264797,
+      "grad_norm": 3.2783963680267334,
+      "learning_rate": 0.0001400985221674877,
+      "loss": 0.4328,
+      "step": 6180
+    },
+    {
+      "epoch": 30.26235509456986,
+      "grad_norm": 3.6925759315490723,
+      "learning_rate": 0.00013990147783251233,
+      "loss": 0.487,
+      "step": 6200
+    },
+    {
+      "epoch": 30.359975594874925,
+      "grad_norm": 3.0115065574645996,
+      "learning_rate": 0.00013970443349753696,
+      "loss": 0.467,
+      "step": 6220
+    },
+    {
+      "epoch": 30.457596095179987,
+      "grad_norm": 4.561310291290283,
+      "learning_rate": 0.00013950738916256158,
+      "loss": 0.4801,
+      "step": 6240
+    },
+    {
+      "epoch": 30.555216595485053,
+      "grad_norm": 3.2879674434661865,
+      "learning_rate": 0.0001393103448275862,
+      "loss": 0.4638,
+      "step": 6260
+    },
+    {
+      "epoch": 30.652837095790115,
+      "grad_norm": 2.793945789337158,
+      "learning_rate": 0.00013911330049261084,
+      "loss": 0.463,
+      "step": 6280
+    },
+    {
+      "epoch": 30.75045759609518,
+      "grad_norm": 3.615793466567993,
+      "learning_rate": 0.0001389162561576355,
+      "loss": 0.4907,
+      "step": 6300
+    },
+    {
+      "epoch": 30.848078096400243,
+      "grad_norm": 3.160133123397827,
+      "learning_rate": 0.0001387192118226601,
+      "loss": 0.477,
+      "step": 6320
+    },
+    {
+      "epoch": 30.94569859670531,
+      "grad_norm": 3.62670636177063,
+      "learning_rate": 0.00013852216748768475,
+      "loss": 0.4945,
+      "step": 6340
+    },
+    {
+      "epoch": 31.04331909701037,
+      "grad_norm": 3.346158981323242,
+      "learning_rate": 0.00013832512315270935,
+      "loss": 0.4543,
+      "step": 6360
+    },
+    {
+      "epoch": 31.140939597315437,
+      "grad_norm": 2.8707423210144043,
+      "learning_rate": 0.000138128078817734,
+      "loss": 0.4352,
+      "step": 6380
+    },
+    {
+      "epoch": 31.2385600976205,
+      "grad_norm": 2.5617620944976807,
+      "learning_rate": 0.00013793103448275863,
+      "loss": 0.4611,
+      "step": 6400
+    },
+    {
+      "epoch": 31.336180597925566,
+      "grad_norm": 3.2273828983306885,
+      "learning_rate": 0.00013773399014778325,
+      "loss": 0.4593,
+      "step": 6420
+    },
+    {
+      "epoch": 31.433801098230628,
+      "grad_norm": 3.502797842025757,
+      "learning_rate": 0.00013753694581280788,
+      "loss": 0.4717,
+      "step": 6440
+    },
+    {
+      "epoch": 31.531421598535694,
+      "grad_norm": 3.9278218746185303,
+      "learning_rate": 0.0001373399014778325,
+      "loss": 0.4813,
+      "step": 6460
+    },
+    {
+      "epoch": 31.629042098840756,
+      "grad_norm": 3.013709545135498,
+      "learning_rate": 0.00013714285714285716,
+      "loss": 0.4305,
+      "step": 6480
+    },
+    {
+      "epoch": 31.726662599145822,
+      "grad_norm": 2.661198377609253,
+      "learning_rate": 0.0001369458128078818,
+      "loss": 0.4495,
+      "step": 6500
+    },
+    {
+      "epoch": 31.824283099450884,
+      "grad_norm": 2.6343297958374023,
+      "learning_rate": 0.00013674876847290642,
+      "loss": 0.4809,
+      "step": 6520
+    },
+    {
+      "epoch": 31.92190359975595,
+      "grad_norm": 6.334170818328857,
+      "learning_rate": 0.00013655172413793104,
+      "loss": 0.4576,
+      "step": 6540
+    },
+    {
+      "epoch": 32.01952410006101,
+      "grad_norm": 3.728727102279663,
+      "learning_rate": 0.00013635467980295567,
+      "loss": 0.5034,
+      "step": 6560
+    },
+    {
+      "epoch": 32.117144600366075,
+      "grad_norm": 2.0572702884674072,
+      "learning_rate": 0.0001361576354679803,
+      "loss": 0.4161,
+      "step": 6580
+    },
+    {
+      "epoch": 32.214765100671144,
+      "grad_norm": 2.7006356716156006,
+      "learning_rate": 0.00013596059113300492,
+      "loss": 0.4357,
+      "step": 6600
+    },
+    {
+      "epoch": 32.31238560097621,
+      "grad_norm": 3.526782989501953,
+      "learning_rate": 0.00013576354679802955,
+      "loss": 0.4367,
+      "step": 6620
+    },
+    {
+      "epoch": 32.41000610128127,
+      "grad_norm": 3.240647792816162,
+      "learning_rate": 0.0001355665024630542,
+      "loss": 0.4416,
+      "step": 6640
+    },
+    {
+      "epoch": 32.50762660158633,
+      "grad_norm": 2.965851306915283,
+      "learning_rate": 0.0001353694581280788,
+      "loss": 0.4649,
+      "step": 6660
+    },
+    {
+      "epoch": 32.6052471018914,
+      "grad_norm": 3.028812885284424,
+      "learning_rate": 0.00013517241379310346,
+      "loss": 0.4381,
+      "step": 6680
+    },
+    {
+      "epoch": 32.70286760219646,
+      "grad_norm": 4.041370391845703,
+      "learning_rate": 0.0001349753694581281,
+      "loss": 0.4671,
+      "step": 6700
+    },
+    {
+      "epoch": 32.800488102501525,
+      "grad_norm": 5.677656650543213,
+      "learning_rate": 0.00013477832512315271,
+      "loss": 0.4718,
+      "step": 6720
+    },
+    {
+      "epoch": 32.89810860280659,
+      "grad_norm": 3.1538727283477783,
+      "learning_rate": 0.00013458128078817737,
+      "loss": 0.4705,
+      "step": 6740
+    },
+    {
+      "epoch": 32.99572910311166,
+      "grad_norm": 3.8186867237091064,
+      "learning_rate": 0.00013438423645320197,
+      "loss": 0.4724,
+      "step": 6760
+    },
+    {
+      "epoch": 33.09334960341672,
+      "grad_norm": 2.8248584270477295,
+      "learning_rate": 0.00013418719211822662,
+      "loss": 0.4399,
+      "step": 6780
+    },
+    {
+      "epoch": 33.19097010372178,
+      "grad_norm": 2.2694895267486572,
+      "learning_rate": 0.00013399014778325122,
+      "loss": 0.4147,
+      "step": 6800
+    },
+    {
+      "epoch": 33.288590604026844,
+      "grad_norm": 3.305610418319702,
+      "learning_rate": 0.00013379310344827588,
+      "loss": 0.4028,
+      "step": 6820
+    },
+    {
+      "epoch": 33.38621110433191,
+      "grad_norm": 3.610136032104492,
+      "learning_rate": 0.0001335960591133005,
+      "loss": 0.4319,
+      "step": 6840
+    },
+    {
+      "epoch": 33.483831604636975,
+      "grad_norm": 3.4783689975738525,
+      "learning_rate": 0.00013339901477832513,
+      "loss": 0.4361,
+      "step": 6860
+    },
+    {
+      "epoch": 33.58145210494204,
+      "grad_norm": 3.0984203815460205,
+      "learning_rate": 0.00013320197044334976,
+      "loss": 0.4488,
+      "step": 6880
+    },
+    {
+      "epoch": 33.6790726052471,
+      "grad_norm": 3.1558122634887695,
+      "learning_rate": 0.00013300492610837438,
+      "loss": 0.4262,
+      "step": 6900
+    },
+    {
+      "epoch": 33.77669310555217,
+      "grad_norm": 4.813379764556885,
+      "learning_rate": 0.000132807881773399,
+      "loss": 0.452,
+      "step": 6920
+    },
+    {
+      "epoch": 33.87431360585723,
+      "grad_norm": 3.047551393508911,
+      "learning_rate": 0.00013261083743842364,
+      "loss": 0.4517,
+      "step": 6940
+    },
+    {
+      "epoch": 33.971934106162294,
+      "grad_norm": 3.0880701541900635,
+      "learning_rate": 0.0001324137931034483,
+      "loss": 0.5147,
+      "step": 6960
+    },
+    {
+      "epoch": 34.06955460646736,
+      "grad_norm": 2.824169874191284,
+      "learning_rate": 0.00013221674876847292,
+      "loss": 0.4017,
+      "step": 6980
+    },
+    {
+      "epoch": 34.16717510677242,
+      "grad_norm": 3.1136012077331543,
+      "learning_rate": 0.00013201970443349755,
+      "loss": 0.4291,
+      "step": 7000
+    },
+    {
+      "epoch": 34.26479560707749,
+      "grad_norm": 4.246958255767822,
+      "learning_rate": 0.00013182266009852217,
+      "loss": 0.4318,
+      "step": 7020
+    },
+    {
+      "epoch": 34.36241610738255,
+      "grad_norm": 2.4655661582946777,
+      "learning_rate": 0.0001316256157635468,
+      "loss": 0.4283,
+      "step": 7040
+    },
+    {
+      "epoch": 34.46003660768761,
+      "grad_norm": 4.322596549987793,
+      "learning_rate": 0.00013142857142857143,
+      "loss": 0.4323,
+      "step": 7060
+    },
+    {
+      "epoch": 34.557657107992675,
+      "grad_norm": 4.425800800323486,
+      "learning_rate": 0.00013123152709359608,
+      "loss": 0.4376,
+      "step": 7080
+    },
+    {
+      "epoch": 34.655277608297745,
+      "grad_norm": 3.796889305114746,
+      "learning_rate": 0.00013103448275862068,
+      "loss": 0.4276,
+      "step": 7100
+    },
+    {
+      "epoch": 34.75289810860281,
+      "grad_norm": 3.9222586154937744,
+      "learning_rate": 0.00013083743842364534,
+      "loss": 0.4658,
+      "step": 7120
+    },
+    {
+      "epoch": 34.85051860890787,
+      "grad_norm": 4.5007548332214355,
+      "learning_rate": 0.00013064039408866994,
+      "loss": 0.4293,
+      "step": 7140
+    },
+    {
+      "epoch": 34.94813910921293,
+      "grad_norm": 3.0858423709869385,
+      "learning_rate": 0.0001304433497536946,
+      "loss": 0.4214,
+      "step": 7160
+    },
+    {
+      "epoch": 35.045759609518,
+      "grad_norm": 3.586949586868286,
+      "learning_rate": 0.00013024630541871922,
+      "loss": 0.4199,
+      "step": 7180
+    },
+    {
+      "epoch": 35.14338010982306,
+      "grad_norm": 2.916937828063965,
+      "learning_rate": 0.00013004926108374385,
+      "loss": 0.4071,
+      "step": 7200
+    },
+    {
+      "epoch": 35.241000610128125,
+      "grad_norm": 3.1324169635772705,
+      "learning_rate": 0.00012985221674876847,
+      "loss": 0.4151,
+      "step": 7220
+    },
+    {
+      "epoch": 35.33862111043319,
+      "grad_norm": 2.8730344772338867,
+      "learning_rate": 0.0001296551724137931,
+      "loss": 0.3984,
+      "step": 7240
+    },
+    {
+      "epoch": 35.43624161073826,
+      "grad_norm": 3.0865273475646973,
+      "learning_rate": 0.00012945812807881775,
+      "loss": 0.4273,
+      "step": 7260
+    },
+    {
+      "epoch": 35.53386211104332,
+      "grad_norm": 4.397771835327148,
+      "learning_rate": 0.00012926108374384238,
+      "loss": 0.4232,
+      "step": 7280
+    },
+    {
+      "epoch": 35.63148261134838,
+      "grad_norm": 2.4203243255615234,
+      "learning_rate": 0.000129064039408867,
+      "loss": 0.4035,
+      "step": 7300
+    },
+    {
+      "epoch": 35.729103111653444,
+      "grad_norm": 2.94404673576355,
+      "learning_rate": 0.00012886699507389164,
+      "loss": 0.4332,
+      "step": 7320
+    },
+    {
+      "epoch": 35.82672361195851,
+      "grad_norm": 3.4141249656677246,
+      "learning_rate": 0.00012866995073891626,
+      "loss": 0.4484,
+      "step": 7340
+    },
+    {
+      "epoch": 35.924344112263576,
+      "grad_norm": 2.8227927684783936,
+      "learning_rate": 0.0001284729064039409,
+      "loss": 0.4509,
+      "step": 7360
+    },
+    {
+      "epoch": 36.02196461256864,
+      "grad_norm": 2.768937110900879,
+      "learning_rate": 0.00012827586206896552,
+      "loss": 0.4391,
+      "step": 7380
+    },
+    {
+      "epoch": 36.1195851128737,
+      "grad_norm": 4.155871391296387,
+      "learning_rate": 0.00012807881773399014,
+      "loss": 0.3954,
+      "step": 7400
+    },
+    {
+      "epoch": 36.21720561317877,
+      "grad_norm": 2.484731912612915,
+      "learning_rate": 0.0001278817733990148,
+      "loss": 0.4363,
+      "step": 7420
+    },
+    {
+      "epoch": 36.31482611348383,
+      "grad_norm": 2.7758595943450928,
+      "learning_rate": 0.0001276847290640394,
+      "loss": 0.4058,
+      "step": 7440
+    },
+    {
+      "epoch": 36.412446613788894,
+      "grad_norm": 3.9609923362731934,
+      "learning_rate": 0.00012748768472906405,
+      "loss": 0.3845,
+      "step": 7460
+    },
+    {
+      "epoch": 36.51006711409396,
+      "grad_norm": 3.963120222091675,
+      "learning_rate": 0.00012729064039408868,
+      "loss": 0.4301,
+      "step": 7480
+    },
+    {
+      "epoch": 36.607687614399026,
+      "grad_norm": 2.77718448638916,
+      "learning_rate": 0.0001270935960591133,
+      "loss": 0.4034,
+      "step": 7500
+    },
+    {
+      "epoch": 36.70530811470409,
+      "grad_norm": 3.6000113487243652,
+      "learning_rate": 0.00012689655172413793,
+      "loss": 0.4087,
+      "step": 7520
+    },
+    {
+      "epoch": 36.80292861500915,
+      "grad_norm": 3.4430975914001465,
+      "learning_rate": 0.00012669950738916256,
+      "loss": 0.4109,
+      "step": 7540
+    },
+    {
+      "epoch": 36.90054911531421,
+      "grad_norm": 3.3932645320892334,
+      "learning_rate": 0.00012650246305418721,
+      "loss": 0.4394,
+      "step": 7560
+    },
+    {
+      "epoch": 36.99816961561928,
+      "grad_norm": 4.054554462432861,
+      "learning_rate": 0.00012630541871921181,
+      "loss": 0.4203,
+      "step": 7580
+    },
+    {
+      "epoch": 37.095790115924345,
+      "grad_norm": 2.8766210079193115,
+      "learning_rate": 0.00012610837438423647,
+      "loss": 0.3861,
+      "step": 7600
+    },
+    {
+      "epoch": 37.19341061622941,
+      "grad_norm": 4.115131855010986,
+      "learning_rate": 0.0001259113300492611,
+      "loss": 0.4236,
+      "step": 7620
+    },
+    {
+      "epoch": 37.29103111653447,
+      "grad_norm": 2.776914358139038,
+      "learning_rate": 0.00012571428571428572,
+      "loss": 0.4244,
+      "step": 7640
+    },
+    {
+      "epoch": 37.38865161683954,
+      "grad_norm": 3.8428800106048584,
+      "learning_rate": 0.00012551724137931035,
+      "loss": 0.4028,
+      "step": 7660
+    },
+    {
+      "epoch": 37.4862721171446,
+      "grad_norm": 3.028683662414551,
+      "learning_rate": 0.00012532019704433498,
+      "loss": 0.4127,
+      "step": 7680
+    },
+    {
+      "epoch": 37.58389261744966,
+      "grad_norm": 2.678617477416992,
+      "learning_rate": 0.0001251231527093596,
+      "loss": 0.4251,
+      "step": 7700
+    },
+    {
+      "epoch": 37.681513117754726,
+      "grad_norm": 3.496917247772217,
+      "learning_rate": 0.00012492610837438423,
+      "loss": 0.404,
+      "step": 7720
+    },
+    {
+      "epoch": 37.779133618059795,
+      "grad_norm": 4.018653869628906,
+      "learning_rate": 0.00012472906403940889,
+      "loss": 0.4028,
+      "step": 7740
+    },
+    {
+      "epoch": 37.87675411836486,
+      "grad_norm": 3.317580223083496,
+      "learning_rate": 0.0001245320197044335,
+      "loss": 0.4032,
+      "step": 7760
+    },
+    {
+      "epoch": 37.97437461866992,
+      "grad_norm": 3.7693002223968506,
+      "learning_rate": 0.00012433497536945814,
+      "loss": 0.3935,
+      "step": 7780
+    },
+    {
+      "epoch": 38.07199511897498,
+      "grad_norm": 2.809558629989624,
+      "learning_rate": 0.00012413793103448277,
+      "loss": 0.4113,
+      "step": 7800
+    },
+    {
+      "epoch": 38.16961561928005,
+      "grad_norm": 3.2092092037200928,
+      "learning_rate": 0.0001239408866995074,
+      "loss": 0.4019,
+      "step": 7820
+    },
+    {
+      "epoch": 38.267236119585114,
+      "grad_norm": 3.3514404296875,
+      "learning_rate": 0.00012374384236453202,
+      "loss": 0.4013,
+      "step": 7840
+    },
+    {
+      "epoch": 38.364856619890176,
+      "grad_norm": 3.9514451026916504,
+      "learning_rate": 0.00012354679802955667,
+      "loss": 0.3889,
+      "step": 7860
+    },
+    {
+      "epoch": 38.46247712019524,
+      "grad_norm": 2.7896828651428223,
+      "learning_rate": 0.00012334975369458127,
+      "loss": 0.377,
+      "step": 7880
+    },
+    {
+      "epoch": 38.56009762050031,
+      "grad_norm": 3.522840738296509,
+      "learning_rate": 0.00012315270935960593,
+      "loss": 0.4158,
+      "step": 7900
+    },
+    {
+      "epoch": 38.65771812080537,
+      "grad_norm": 3.422250270843506,
+      "learning_rate": 0.00012295566502463053,
+      "loss": 0.3837,
+      "step": 7920
+    },
+    {
+      "epoch": 38.75533862111043,
+      "grad_norm": 3.0469913482666016,
+      "learning_rate": 0.00012275862068965518,
+      "loss": 0.4036,
+      "step": 7940
+    },
+    {
+      "epoch": 38.852959121415495,
+      "grad_norm": 2.904141664505005,
+      "learning_rate": 0.0001225615763546798,
+      "loss": 0.3928,
+      "step": 7960
+    },
+    {
+      "epoch": 38.950579621720564,
+      "grad_norm": 3.7538552284240723,
+      "learning_rate": 0.00012236453201970444,
+      "loss": 0.4092,
+      "step": 7980
+    },
+    {
+      "epoch": 39.04820012202563,
+      "grad_norm": 3.562114715576172,
+      "learning_rate": 0.00012216748768472906,
+      "loss": 0.3982,
+      "step": 8000
+    },
+    {
+      "epoch": 39.14582062233069,
+      "grad_norm": 2.4931962490081787,
+      "learning_rate": 0.00012197044334975369,
+      "loss": 0.3547,
+      "step": 8020
+    },
+    {
+      "epoch": 39.24344112263575,
+      "grad_norm": 2.461050271987915,
+      "learning_rate": 0.00012177339901477833,
+      "loss": 0.3762,
+      "step": 8040
+    },
+    {
+      "epoch": 39.34106162294082,
+      "grad_norm": 3.1320595741271973,
+      "learning_rate": 0.00012157635467980295,
+      "loss": 0.3907,
+      "step": 8060
+    },
+    {
+      "epoch": 39.43868212324588,
+      "grad_norm": 3.044754981994629,
+      "learning_rate": 0.00012137931034482759,
+      "loss": 0.4068,
+      "step": 8080
+    },
+    {
+      "epoch": 39.536302623550945,
+      "grad_norm": 2.9243273735046387,
+      "learning_rate": 0.00012118226600985223,
+      "loss": 0.3903,
+      "step": 8100
+    },
+    {
+      "epoch": 39.63392312385601,
+      "grad_norm": 4.234837055206299,
+      "learning_rate": 0.00012098522167487685,
+      "loss": 0.3841,
+      "step": 8120
+    },
+    {
+      "epoch": 39.73154362416108,
+      "grad_norm": 3.993495464324951,
+      "learning_rate": 0.00012078817733990148,
+      "loss": 0.4082,
+      "step": 8140
+    },
+    {
+      "epoch": 39.82916412446614,
+      "grad_norm": 3.8363142013549805,
+      "learning_rate": 0.00012059113300492611,
+      "loss": 0.3939,
+      "step": 8160
+    },
+    {
+      "epoch": 39.9267846247712,
+      "grad_norm": 4.398952007293701,
+      "learning_rate": 0.00012039408866995075,
+      "loss": 0.4145,
+      "step": 8180
+    },
+    {
+      "epoch": 40.024405125076264,
+      "grad_norm": 2.7002291679382324,
+      "learning_rate": 0.00012019704433497539,
+      "loss": 0.386,
+      "step": 8200
+    },
+    {
+      "epoch": 40.12202562538133,
+      "grad_norm": 3.1867945194244385,
+      "learning_rate": 0.00012,
+      "loss": 0.3924,
+      "step": 8220
+    },
+    {
+      "epoch": 40.219646125686396,
+      "grad_norm": 2.9179584980010986,
+      "learning_rate": 0.00011980295566502464,
+      "loss": 0.3741,
+      "step": 8240
+    },
+    {
+      "epoch": 40.31726662599146,
+      "grad_norm": 5.108730316162109,
+      "learning_rate": 0.00011960591133004926,
+      "loss": 0.371,
+      "step": 8260
+    },
+    {
+      "epoch": 40.41488712629652,
+      "grad_norm": 3.4418270587921143,
+      "learning_rate": 0.0001194088669950739,
+      "loss": 0.3845,
+      "step": 8280
+    },
+    {
+      "epoch": 40.51250762660159,
+      "grad_norm": 3.245562791824341,
+      "learning_rate": 0.00011921182266009854,
+      "loss": 0.375,
+      "step": 8300
+    },
+    {
+      "epoch": 40.61012812690665,
+      "grad_norm": 2.6644446849823,
+      "learning_rate": 0.00011901477832512315,
+      "loss": 0.3839,
+      "step": 8320
+    },
+    {
+      "epoch": 40.707748627211714,
+      "grad_norm": 4.975727558135986,
+      "learning_rate": 0.00011881773399014779,
+      "loss": 0.3889,
+      "step": 8340
+    },
+    {
+      "epoch": 40.80536912751678,
+      "grad_norm": 3.6427066326141357,
+      "learning_rate": 0.0001186206896551724,
+      "loss": 0.393,
+      "step": 8360
+    },
+    {
+      "epoch": 40.902989627821846,
+      "grad_norm": 3.7799060344696045,
+      "learning_rate": 0.00011842364532019705,
+      "loss": 0.3894,
+      "step": 8380
+    },
+    {
+      "epoch": 41.00061012812691,
+      "grad_norm": 4.170138835906982,
+      "learning_rate": 0.00011822660098522169,
+      "loss": 0.3965,
+      "step": 8400
+    },
+    {
+      "epoch": 41.09823062843197,
+      "grad_norm": 2.660006523132324,
+      "learning_rate": 0.00011802955665024631,
+      "loss": 0.3412,
+      "step": 8420
+    },
+    {
+      "epoch": 41.19585112873703,
+      "grad_norm": 3.9118030071258545,
+      "learning_rate": 0.00011783251231527096,
+      "loss": 0.3608,
+      "step": 8440
+    },
+    {
+      "epoch": 41.2934716290421,
+      "grad_norm": 4.68622350692749,
+      "learning_rate": 0.00011763546798029557,
+      "loss": 0.3742,
+      "step": 8460
+    },
+    {
+      "epoch": 41.391092129347165,
+      "grad_norm": 2.5423784255981445,
+      "learning_rate": 0.00011743842364532021,
+      "loss": 0.3901,
+      "step": 8480
+    },
+    {
+      "epoch": 41.48871262965223,
+      "grad_norm": 3.6446280479431152,
+      "learning_rate": 0.00011724137931034482,
+      "loss": 0.3518,
+      "step": 8500
+    },
+    {
+      "epoch": 41.58633312995729,
+      "grad_norm": 2.6701178550720215,
+      "learning_rate": 0.00011704433497536946,
+      "loss": 0.3809,
+      "step": 8520
+    },
+    {
+      "epoch": 41.68395363026236,
+      "grad_norm": 3.226100206375122,
+      "learning_rate": 0.0001168472906403941,
+      "loss": 0.3834,
+      "step": 8540
+    },
+    {
+      "epoch": 41.78157413056742,
+      "grad_norm": 3.4181952476501465,
+      "learning_rate": 0.00011665024630541872,
+      "loss": 0.4098,
+      "step": 8560
+    },
+    {
+      "epoch": 41.87919463087248,
+      "grad_norm": 2.9190330505371094,
+      "learning_rate": 0.00011645320197044336,
+      "loss": 0.3838,
+      "step": 8580
+    },
+    {
+      "epoch": 41.976815131177545,
+      "grad_norm": 4.082178115844727,
+      "learning_rate": 0.00011625615763546797,
+      "loss": 0.4109,
+      "step": 8600
+    },
+    {
+      "epoch": 42.074435631482615,
+      "grad_norm": 2.899162530899048,
+      "learning_rate": 0.00011605911330049261,
+      "loss": 0.3624,
+      "step": 8620
+    },
+    {
+      "epoch": 42.17205613178768,
+      "grad_norm": 2.4065990447998047,
+      "learning_rate": 0.00011586206896551725,
+      "loss": 0.3573,
+      "step": 8640
+    },
+    {
+      "epoch": 42.26967663209274,
+      "grad_norm": 2.818037509918213,
+      "learning_rate": 0.00011566502463054188,
+      "loss": 0.3699,
+      "step": 8660
+    },
+    {
+      "epoch": 42.3672971323978,
+      "grad_norm": 2.8875226974487305,
+      "learning_rate": 0.00011546798029556651,
+      "loss": 0.3489,
+      "step": 8680
+    },
+    {
+      "epoch": 42.464917632702864,
+      "grad_norm": 3.0840396881103516,
+      "learning_rate": 0.00011527093596059113,
+      "loss": 0.3733,
+      "step": 8700
+    },
+    {
+      "epoch": 42.56253813300793,
+      "grad_norm": 2.6554925441741943,
+      "learning_rate": 0.00011507389162561578,
+      "loss": 0.3541,
+      "step": 8720
+    },
+    {
+      "epoch": 42.660158633312996,
+      "grad_norm": 2.766045331954956,
+      "learning_rate": 0.00011487684729064042,
+      "loss": 0.3682,
+      "step": 8740
+    },
+    {
+      "epoch": 42.75777913361806,
+      "grad_norm": 3.0672762393951416,
+      "learning_rate": 0.00011467980295566503,
+      "loss": 0.3943,
+      "step": 8760
+    },
+    {
+      "epoch": 42.85539963392312,
+      "grad_norm": 2.898484468460083,
+      "learning_rate": 0.00011448275862068967,
+      "loss": 0.3702,
+      "step": 8780
+    },
+    {
+      "epoch": 42.95302013422819,
+      "grad_norm": 2.7023797035217285,
+      "learning_rate": 0.00011428571428571428,
+      "loss": 0.388,
+      "step": 8800
+    },
+    {
+      "epoch": 43.05064063453325,
+      "grad_norm": 2.4088499546051025,
+      "learning_rate": 0.00011408866995073892,
+      "loss": 0.3615,
+      "step": 8820
+    },
+    {
+      "epoch": 43.148261134838314,
+      "grad_norm": 2.3739655017852783,
+      "learning_rate": 0.00011389162561576354,
+      "loss": 0.3703,
+      "step": 8840
+    },
+    {
+      "epoch": 43.24588163514338,
+      "grad_norm": 3.2558271884918213,
+      "learning_rate": 0.00011369458128078818,
+      "loss": 0.3478,
+      "step": 8860
+    },
+    {
+      "epoch": 43.343502135448446,
+      "grad_norm": 2.931380271911621,
+      "learning_rate": 0.00011349753694581282,
+      "loss": 0.3553,
+      "step": 8880
+    },
+    {
+      "epoch": 43.44112263575351,
+      "grad_norm": 2.5165908336639404,
+      "learning_rate": 0.00011330049261083743,
+      "loss": 0.3495,
+      "step": 8900
+    },
+    {
+      "epoch": 43.53874313605857,
+      "grad_norm": 3.5619068145751953,
+      "learning_rate": 0.00011310344827586207,
+      "loss": 0.3692,
+      "step": 8920
+    },
+    {
+      "epoch": 43.63636363636363,
+      "grad_norm": 2.39534068107605,
+      "learning_rate": 0.0001129064039408867,
+      "loss": 0.3674,
+      "step": 8940
+    },
+    {
+      "epoch": 43.7339841366687,
+      "grad_norm": 3.495316505432129,
+      "learning_rate": 0.00011270935960591134,
+      "loss": 0.367,
+      "step": 8960
+    },
+    {
+      "epoch": 43.831604636973765,
+      "grad_norm": 2.8195016384124756,
+      "learning_rate": 0.00011251231527093598,
+      "loss": 0.411,
+      "step": 8980
+    },
+    {
+      "epoch": 43.92922513727883,
+      "grad_norm": 3.446014165878296,
+      "learning_rate": 0.0001123152709359606,
+      "loss": 0.3774,
+      "step": 9000
     }
   ],
   "logging_steps": 20,
       "attributes": {}
     }
   },
+  "total_flos": 1.79330460401664e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null