{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3705, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008097165991902834, "grad_norm": 8.212680311060101, "learning_rate": 4.0000000000000003e-07, "loss": 1.8584, "step": 1 }, { "epoch": 0.0016194331983805667, "grad_norm": 8.289202083488915, "learning_rate": 8.000000000000001e-07, "loss": 1.8781, "step": 2 }, { "epoch": 0.0024291497975708503, "grad_norm": 8.121793660526253, "learning_rate": 1.2000000000000002e-06, "loss": 1.8632, "step": 3 }, { "epoch": 0.0032388663967611335, "grad_norm": 8.282365329091148, "learning_rate": 1.6000000000000001e-06, "loss": 1.8428, "step": 4 }, { "epoch": 0.004048582995951417, "grad_norm": 6.205624862594086, "learning_rate": 2.0000000000000003e-06, "loss": 1.8148, "step": 5 }, { "epoch": 0.004858299595141701, "grad_norm": 3.874015809560423, "learning_rate": 2.4000000000000003e-06, "loss": 1.8443, "step": 6 }, { "epoch": 0.005668016194331984, "grad_norm": 3.295685239888731, "learning_rate": 2.8000000000000003e-06, "loss": 1.7624, "step": 7 }, { "epoch": 0.006477732793522267, "grad_norm": 3.35229547435271, "learning_rate": 3.2000000000000003e-06, "loss": 1.7892, "step": 8 }, { "epoch": 0.0072874493927125505, "grad_norm": 3.6152531340095924, "learning_rate": 3.6000000000000003e-06, "loss": 1.7692, "step": 9 }, { "epoch": 0.008097165991902834, "grad_norm": 2.839020140388393, "learning_rate": 4.000000000000001e-06, "loss": 1.7546, "step": 10 }, { "epoch": 0.008906882591093117, "grad_norm": 3.280470763841352, "learning_rate": 4.4e-06, "loss": 1.781, "step": 11 }, { "epoch": 0.009716599190283401, "grad_norm": 2.919143442971804, "learning_rate": 4.800000000000001e-06, "loss": 1.7255, "step": 12 }, { "epoch": 0.010526315789473684, "grad_norm": 2.1780641385791957, "learning_rate": 5.2e-06, "loss": 1.7069, "step": 13 }, { "epoch": 0.011336032388663968, "grad_norm": 2.0864543101995356, "learning_rate": 5.600000000000001e-06, "loss": 1.7801, "step": 14 }, { "epoch": 0.012145748987854251, "grad_norm": 2.433989036218932, "learning_rate": 6e-06, "loss": 1.6886, "step": 15 }, { "epoch": 0.012955465587044534, "grad_norm": 2.108153096043187, "learning_rate": 6.4000000000000006e-06, "loss": 1.7845, "step": 16 }, { "epoch": 0.013765182186234818, "grad_norm": 2.147481330273472, "learning_rate": 6.800000000000001e-06, "loss": 1.7888, "step": 17 }, { "epoch": 0.014574898785425101, "grad_norm": 2.0230155835526658, "learning_rate": 7.2000000000000005e-06, "loss": 1.7484, "step": 18 }, { "epoch": 0.015384615384615385, "grad_norm": 1.8351338768344623, "learning_rate": 7.600000000000001e-06, "loss": 1.6815, "step": 19 }, { "epoch": 0.016194331983805668, "grad_norm": 1.8385443541885478, "learning_rate": 8.000000000000001e-06, "loss": 1.7259, "step": 20 }, { "epoch": 0.01700404858299595, "grad_norm": 1.9450404722821018, "learning_rate": 8.400000000000001e-06, "loss": 1.7513, "step": 21 }, { "epoch": 0.017813765182186234, "grad_norm": 1.7329165869002578, "learning_rate": 8.8e-06, "loss": 1.6934, "step": 22 }, { "epoch": 0.01862348178137652, "grad_norm": 1.8043465898806423, "learning_rate": 9.200000000000002e-06, "loss": 1.7411, "step": 23 }, { "epoch": 0.019433198380566803, "grad_norm": 1.6575139155258072, "learning_rate": 9.600000000000001e-06, "loss": 1.7141, "step": 24 }, { "epoch": 0.020242914979757085, "grad_norm": 1.6149477634364793, "learning_rate": 1e-05, "loss": 1.6359, "step": 25 }, { "epoch": 0.021052631578947368, "grad_norm": 1.720501765932264, "learning_rate": 1.04e-05, "loss": 1.6983, "step": 26 }, { "epoch": 0.02186234817813765, "grad_norm": 1.669970332575193, "learning_rate": 1.0800000000000002e-05, "loss": 1.7, "step": 27 }, { "epoch": 0.022672064777327937, "grad_norm": 1.588644479806073, "learning_rate": 1.1200000000000001e-05, "loss": 1.6018, "step": 28 }, { "epoch": 0.02348178137651822, "grad_norm": 1.6915129269465283, "learning_rate": 1.16e-05, "loss": 1.6732, "step": 29 }, { "epoch": 0.024291497975708502, "grad_norm": 1.67842726423677, "learning_rate": 1.2e-05, "loss": 1.7291, "step": 30 }, { "epoch": 0.025101214574898785, "grad_norm": 1.662602613195812, "learning_rate": 1.2400000000000002e-05, "loss": 1.714, "step": 31 }, { "epoch": 0.025910931174089068, "grad_norm": 1.5692125530449308, "learning_rate": 1.2800000000000001e-05, "loss": 1.7681, "step": 32 }, { "epoch": 0.026720647773279354, "grad_norm": 1.5593466931369984, "learning_rate": 1.3200000000000002e-05, "loss": 1.6625, "step": 33 }, { "epoch": 0.027530364372469637, "grad_norm": 1.5314296376489476, "learning_rate": 1.3600000000000002e-05, "loss": 1.6711, "step": 34 }, { "epoch": 0.02834008097165992, "grad_norm": 1.7793373457448158, "learning_rate": 1.4e-05, "loss": 1.7056, "step": 35 }, { "epoch": 0.029149797570850202, "grad_norm": 1.5178537061767556, "learning_rate": 1.4400000000000001e-05, "loss": 1.7037, "step": 36 }, { "epoch": 0.029959514170040485, "grad_norm": 1.5459895717695264, "learning_rate": 1.48e-05, "loss": 1.7683, "step": 37 }, { "epoch": 0.03076923076923077, "grad_norm": 1.5735552822755368, "learning_rate": 1.5200000000000002e-05, "loss": 1.6872, "step": 38 }, { "epoch": 0.031578947368421054, "grad_norm": 1.4358615071215153, "learning_rate": 1.5600000000000003e-05, "loss": 1.7402, "step": 39 }, { "epoch": 0.032388663967611336, "grad_norm": 1.712555492023088, "learning_rate": 1.6000000000000003e-05, "loss": 1.7413, "step": 40 }, { "epoch": 0.03319838056680162, "grad_norm": 1.527931327004627, "learning_rate": 1.64e-05, "loss": 1.7361, "step": 41 }, { "epoch": 0.0340080971659919, "grad_norm": 1.5654078241724048, "learning_rate": 1.6800000000000002e-05, "loss": 1.7174, "step": 42 }, { "epoch": 0.034817813765182185, "grad_norm": 1.5967020304138295, "learning_rate": 1.72e-05, "loss": 1.6863, "step": 43 }, { "epoch": 0.03562753036437247, "grad_norm": 1.5449242343523315, "learning_rate": 1.76e-05, "loss": 1.7186, "step": 44 }, { "epoch": 0.03643724696356275, "grad_norm": 1.5664894285095186, "learning_rate": 1.8e-05, "loss": 1.677, "step": 45 }, { "epoch": 0.03724696356275304, "grad_norm": 1.6465253425137338, "learning_rate": 1.8400000000000003e-05, "loss": 1.767, "step": 46 }, { "epoch": 0.03805668016194332, "grad_norm": 1.6352491164142435, "learning_rate": 1.88e-05, "loss": 1.5998, "step": 47 }, { "epoch": 0.038866396761133605, "grad_norm": 1.6363309288994599, "learning_rate": 1.9200000000000003e-05, "loss": 1.6266, "step": 48 }, { "epoch": 0.03967611336032389, "grad_norm": 1.8173439281602377, "learning_rate": 1.9600000000000002e-05, "loss": 1.6592, "step": 49 }, { "epoch": 0.04048582995951417, "grad_norm": 1.5272192265553266, "learning_rate": 2e-05, "loss": 1.738, "step": 50 }, { "epoch": 0.04129554655870445, "grad_norm": 1.8649531326109419, "learning_rate": 1.9999996306016426e-05, "loss": 1.7247, "step": 51 }, { "epoch": 0.042105263157894736, "grad_norm": 1.7074872326048571, "learning_rate": 1.9999985224068418e-05, "loss": 1.7084, "step": 52 }, { "epoch": 0.04291497975708502, "grad_norm": 1.4977194700952365, "learning_rate": 1.9999966754164176e-05, "loss": 1.7055, "step": 53 }, { "epoch": 0.0437246963562753, "grad_norm": 1.6230626841795457, "learning_rate": 1.9999940896317337e-05, "loss": 1.5649, "step": 54 }, { "epoch": 0.044534412955465584, "grad_norm": 1.6916647362719663, "learning_rate": 1.9999907650547006e-05, "loss": 1.7438, "step": 55 }, { "epoch": 0.045344129554655874, "grad_norm": 1.4840562908731014, "learning_rate": 1.999986701687775e-05, "loss": 1.6202, "step": 56 }, { "epoch": 0.046153846153846156, "grad_norm": 2.1064626020990596, "learning_rate": 1.9999818995339587e-05, "loss": 1.6874, "step": 57 }, { "epoch": 0.04696356275303644, "grad_norm": 1.514821876037186, "learning_rate": 1.999976358596799e-05, "loss": 1.6145, "step": 58 }, { "epoch": 0.04777327935222672, "grad_norm": 1.5837774362892976, "learning_rate": 1.9999700788803902e-05, "loss": 1.7424, "step": 59 }, { "epoch": 0.048582995951417005, "grad_norm": 1.5015921648476362, "learning_rate": 1.999963060389371e-05, "loss": 1.7074, "step": 60 }, { "epoch": 0.04939271255060729, "grad_norm": 1.5081987493072453, "learning_rate": 1.9999553031289277e-05, "loss": 1.7671, "step": 61 }, { "epoch": 0.05020242914979757, "grad_norm": 1.449828681342794, "learning_rate": 1.9999468071047904e-05, "loss": 1.7608, "step": 62 }, { "epoch": 0.05101214574898785, "grad_norm": 1.601994782437138, "learning_rate": 1.9999375723232362e-05, "loss": 1.7219, "step": 63 }, { "epoch": 0.051821862348178135, "grad_norm": 1.5204297943669711, "learning_rate": 1.999927598791088e-05, "loss": 1.756, "step": 64 }, { "epoch": 0.05263157894736842, "grad_norm": 1.4755593142051295, "learning_rate": 1.9999168865157137e-05, "loss": 1.6837, "step": 65 }, { "epoch": 0.05344129554655871, "grad_norm": 1.5345109179851053, "learning_rate": 1.999905435505028e-05, "loss": 1.7347, "step": 66 }, { "epoch": 0.05425101214574899, "grad_norm": 1.6001626392593242, "learning_rate": 1.9998932457674904e-05, "loss": 1.654, "step": 67 }, { "epoch": 0.05506072874493927, "grad_norm": 1.4701409508033054, "learning_rate": 1.999880317312107e-05, "loss": 1.7161, "step": 68 }, { "epoch": 0.055870445344129556, "grad_norm": 1.3692751510613166, "learning_rate": 1.999866650148429e-05, "loss": 1.7001, "step": 69 }, { "epoch": 0.05668016194331984, "grad_norm": 1.48694331218701, "learning_rate": 1.999852244286554e-05, "loss": 1.812, "step": 70 }, { "epoch": 0.05748987854251012, "grad_norm": 1.3924742554067722, "learning_rate": 1.999837099737125e-05, "loss": 1.6882, "step": 71 }, { "epoch": 0.058299595141700404, "grad_norm": 1.4558556277711194, "learning_rate": 1.9998212165113305e-05, "loss": 1.7295, "step": 72 }, { "epoch": 0.05910931174089069, "grad_norm": 1.4086032761044172, "learning_rate": 1.999804594620905e-05, "loss": 1.6551, "step": 73 }, { "epoch": 0.05991902834008097, "grad_norm": 1.4939708802308547, "learning_rate": 1.999787234078129e-05, "loss": 1.7266, "step": 74 }, { "epoch": 0.06072874493927125, "grad_norm": 1.4235299645544777, "learning_rate": 1.9997691348958278e-05, "loss": 1.7154, "step": 75 }, { "epoch": 0.06153846153846154, "grad_norm": 1.644992651490282, "learning_rate": 1.9997502970873736e-05, "loss": 1.7414, "step": 76 }, { "epoch": 0.062348178137651825, "grad_norm": 1.4731685460504425, "learning_rate": 1.9997307206666835e-05, "loss": 1.6953, "step": 77 }, { "epoch": 0.06315789473684211, "grad_norm": 1.4684965944812223, "learning_rate": 1.9997104056482206e-05, "loss": 1.7127, "step": 78 }, { "epoch": 0.06396761133603239, "grad_norm": 1.5057059153504087, "learning_rate": 1.9996893520469934e-05, "loss": 1.7393, "step": 79 }, { "epoch": 0.06477732793522267, "grad_norm": 1.3158005918665274, "learning_rate": 1.999667559878556e-05, "loss": 1.6802, "step": 80 }, { "epoch": 0.06558704453441296, "grad_norm": 1.405150913782594, "learning_rate": 1.9996450291590093e-05, "loss": 1.7208, "step": 81 }, { "epoch": 0.06639676113360324, "grad_norm": 1.540226649005545, "learning_rate": 1.9996217599049978e-05, "loss": 1.7939, "step": 82 }, { "epoch": 0.06720647773279352, "grad_norm": 1.4885304031726005, "learning_rate": 1.9995977521337134e-05, "loss": 1.6682, "step": 83 }, { "epoch": 0.0680161943319838, "grad_norm": 1.3901851985498637, "learning_rate": 1.9995730058628928e-05, "loss": 1.667, "step": 84 }, { "epoch": 0.06882591093117409, "grad_norm": 1.4725073250334886, "learning_rate": 1.9995475211108183e-05, "loss": 1.7547, "step": 85 }, { "epoch": 0.06963562753036437, "grad_norm": 1.452556987551732, "learning_rate": 1.9995212978963185e-05, "loss": 1.7608, "step": 86 }, { "epoch": 0.07044534412955465, "grad_norm": 1.4996449206080487, "learning_rate": 1.9994943362387666e-05, "loss": 1.6664, "step": 87 }, { "epoch": 0.07125506072874493, "grad_norm": 1.4199683810583805, "learning_rate": 1.9994666361580815e-05, "loss": 1.6367, "step": 88 }, { "epoch": 0.07206477732793522, "grad_norm": 1.571126361723556, "learning_rate": 1.999438197674729e-05, "loss": 1.7765, "step": 89 }, { "epoch": 0.0728744939271255, "grad_norm": 1.3926335473604161, "learning_rate": 1.9994090208097176e-05, "loss": 1.722, "step": 90 }, { "epoch": 0.07368421052631578, "grad_norm": 1.4552870672529465, "learning_rate": 1.9993791055846048e-05, "loss": 1.6651, "step": 91 }, { "epoch": 0.07449392712550608, "grad_norm": 1.379931558535098, "learning_rate": 1.999348452021491e-05, "loss": 1.7096, "step": 92 }, { "epoch": 0.07530364372469636, "grad_norm": 1.4198581520574205, "learning_rate": 1.9993170601430233e-05, "loss": 1.6601, "step": 93 }, { "epoch": 0.07611336032388664, "grad_norm": 1.5078914507853327, "learning_rate": 1.9992849299723933e-05, "loss": 1.8005, "step": 94 }, { "epoch": 0.07692307692307693, "grad_norm": 1.5076216489476741, "learning_rate": 1.9992520615333393e-05, "loss": 1.682, "step": 95 }, { "epoch": 0.07773279352226721, "grad_norm": 1.3937658114601108, "learning_rate": 1.9992184548501444e-05, "loss": 1.6415, "step": 96 }, { "epoch": 0.07854251012145749, "grad_norm": 1.5041710747062258, "learning_rate": 1.9991841099476365e-05, "loss": 1.6654, "step": 97 }, { "epoch": 0.07935222672064778, "grad_norm": 1.4006737880954128, "learning_rate": 1.9991490268511903e-05, "loss": 1.6976, "step": 98 }, { "epoch": 0.08016194331983806, "grad_norm": 1.5631879172620895, "learning_rate": 1.9991132055867244e-05, "loss": 1.669, "step": 99 }, { "epoch": 0.08097165991902834, "grad_norm": 1.394392254430093, "learning_rate": 1.9990766461807037e-05, "loss": 1.6968, "step": 100 }, { "epoch": 0.08178137651821862, "grad_norm": 1.472525619978714, "learning_rate": 1.9990393486601385e-05, "loss": 1.6773, "step": 101 }, { "epoch": 0.0825910931174089, "grad_norm": 1.378206403293524, "learning_rate": 1.9990013130525835e-05, "loss": 1.6821, "step": 102 }, { "epoch": 0.08340080971659919, "grad_norm": 1.328026656261711, "learning_rate": 1.9989625393861397e-05, "loss": 1.6103, "step": 103 }, { "epoch": 0.08421052631578947, "grad_norm": 1.4064722782717616, "learning_rate": 1.9989230276894525e-05, "loss": 1.7324, "step": 104 }, { "epoch": 0.08502024291497975, "grad_norm": 1.4813376774614593, "learning_rate": 1.9988827779917138e-05, "loss": 1.7272, "step": 105 }, { "epoch": 0.08582995951417004, "grad_norm": 1.3687982499187121, "learning_rate": 1.998841790322659e-05, "loss": 1.673, "step": 106 }, { "epoch": 0.08663967611336032, "grad_norm": 1.422876362819873, "learning_rate": 1.9988000647125703e-05, "loss": 1.7597, "step": 107 }, { "epoch": 0.0874493927125506, "grad_norm": 1.308219863689626, "learning_rate": 1.9987576011922743e-05, "loss": 1.7697, "step": 108 }, { "epoch": 0.08825910931174089, "grad_norm": 1.4183268142705985, "learning_rate": 1.9987143997931428e-05, "loss": 1.6809, "step": 109 }, { "epoch": 0.08906882591093117, "grad_norm": 1.4357441361156031, "learning_rate": 1.9986704605470932e-05, "loss": 1.6358, "step": 110 }, { "epoch": 0.08987854251012145, "grad_norm": 1.3529581082840894, "learning_rate": 1.998625783486587e-05, "loss": 1.6681, "step": 111 }, { "epoch": 0.09068825910931175, "grad_norm": 1.408818420235949, "learning_rate": 1.998580368644632e-05, "loss": 1.6938, "step": 112 }, { "epoch": 0.09149797570850203, "grad_norm": 1.4451180541868947, "learning_rate": 1.998534216054781e-05, "loss": 1.7739, "step": 113 }, { "epoch": 0.09230769230769231, "grad_norm": 1.2941303668841717, "learning_rate": 1.9984873257511296e-05, "loss": 1.6704, "step": 114 }, { "epoch": 0.0931174089068826, "grad_norm": 1.440123130144802, "learning_rate": 1.9984396977683223e-05, "loss": 1.7456, "step": 115 }, { "epoch": 0.09392712550607288, "grad_norm": 1.3845008076241843, "learning_rate": 1.998391332141545e-05, "loss": 1.6968, "step": 116 }, { "epoch": 0.09473684210526316, "grad_norm": 1.5220029955302263, "learning_rate": 1.998342228906531e-05, "loss": 1.8585, "step": 117 }, { "epoch": 0.09554655870445344, "grad_norm": 1.350510831118027, "learning_rate": 1.998292388099557e-05, "loss": 1.7009, "step": 118 }, { "epoch": 0.09635627530364373, "grad_norm": 1.410947289102555, "learning_rate": 1.9982418097574458e-05, "loss": 1.7082, "step": 119 }, { "epoch": 0.09716599190283401, "grad_norm": 1.4674312785560688, "learning_rate": 1.998190493917564e-05, "loss": 1.6933, "step": 120 }, { "epoch": 0.09797570850202429, "grad_norm": 1.3586528211235367, "learning_rate": 1.9981384406178235e-05, "loss": 1.697, "step": 121 }, { "epoch": 0.09878542510121457, "grad_norm": 1.323874170111494, "learning_rate": 1.998085649896682e-05, "loss": 1.6996, "step": 122 }, { "epoch": 0.09959514170040486, "grad_norm": 1.3857202766997219, "learning_rate": 1.99803212179314e-05, "loss": 1.6158, "step": 123 }, { "epoch": 0.10040485829959514, "grad_norm": 1.3135040066156267, "learning_rate": 1.9979778563467446e-05, "loss": 1.6917, "step": 124 }, { "epoch": 0.10121457489878542, "grad_norm": 1.450855282133572, "learning_rate": 1.9979228535975866e-05, "loss": 1.6773, "step": 125 }, { "epoch": 0.1020242914979757, "grad_norm": 1.347188642125951, "learning_rate": 1.997867113586302e-05, "loss": 1.7144, "step": 126 }, { "epoch": 0.10283400809716599, "grad_norm": 1.345981107132026, "learning_rate": 1.997810636354071e-05, "loss": 1.6041, "step": 127 }, { "epoch": 0.10364372469635627, "grad_norm": 1.2972173884632567, "learning_rate": 1.9977534219426195e-05, "loss": 1.6855, "step": 128 }, { "epoch": 0.10445344129554655, "grad_norm": 1.275527287436048, "learning_rate": 1.997695470394217e-05, "loss": 1.696, "step": 129 }, { "epoch": 0.10526315789473684, "grad_norm": 1.3215678390799637, "learning_rate": 1.9976367817516773e-05, "loss": 1.6935, "step": 130 }, { "epoch": 0.10607287449392712, "grad_norm": 1.3828255635499427, "learning_rate": 1.99757735605836e-05, "loss": 1.7256, "step": 131 }, { "epoch": 0.10688259109311742, "grad_norm": 1.448631635568024, "learning_rate": 1.997517193358169e-05, "loss": 1.6388, "step": 132 }, { "epoch": 0.1076923076923077, "grad_norm": 1.3338631760173656, "learning_rate": 1.9974562936955513e-05, "loss": 1.6965, "step": 133 }, { "epoch": 0.10850202429149798, "grad_norm": 1.3424092875489613, "learning_rate": 1.9973946571155e-05, "loss": 1.709, "step": 134 }, { "epoch": 0.10931174089068826, "grad_norm": 1.3898696103299275, "learning_rate": 1.9973322836635517e-05, "loss": 1.6822, "step": 135 }, { "epoch": 0.11012145748987855, "grad_norm": 1.4706810809601643, "learning_rate": 1.997269173385788e-05, "loss": 1.7058, "step": 136 }, { "epoch": 0.11093117408906883, "grad_norm": 1.4026119282799607, "learning_rate": 1.9972053263288346e-05, "loss": 1.6531, "step": 137 }, { "epoch": 0.11174089068825911, "grad_norm": 1.3288506201522425, "learning_rate": 1.9971407425398614e-05, "loss": 1.6532, "step": 138 }, { "epoch": 0.1125506072874494, "grad_norm": 1.5239912432928768, "learning_rate": 1.9970754220665824e-05, "loss": 1.7604, "step": 139 }, { "epoch": 0.11336032388663968, "grad_norm": 1.444106448065426, "learning_rate": 1.9970093649572567e-05, "loss": 1.7596, "step": 140 }, { "epoch": 0.11417004048582996, "grad_norm": 1.3200905343039002, "learning_rate": 1.9969425712606864e-05, "loss": 1.6867, "step": 141 }, { "epoch": 0.11497975708502024, "grad_norm": 1.3681666300591042, "learning_rate": 1.996875041026219e-05, "loss": 1.6352, "step": 142 }, { "epoch": 0.11578947368421053, "grad_norm": 1.4975233179603091, "learning_rate": 1.9968067743037453e-05, "loss": 1.679, "step": 143 }, { "epoch": 0.11659919028340081, "grad_norm": 1.3172078248631487, "learning_rate": 1.9967377711437008e-05, "loss": 1.7421, "step": 144 }, { "epoch": 0.11740890688259109, "grad_norm": 1.260492754026676, "learning_rate": 1.9966680315970647e-05, "loss": 1.5965, "step": 145 }, { "epoch": 0.11821862348178137, "grad_norm": 1.3786263323616315, "learning_rate": 1.9965975557153604e-05, "loss": 1.7367, "step": 146 }, { "epoch": 0.11902834008097166, "grad_norm": 1.3104589610244388, "learning_rate": 1.996526343550655e-05, "loss": 1.6468, "step": 147 }, { "epoch": 0.11983805668016194, "grad_norm": 1.3212338618017334, "learning_rate": 1.99645439515556e-05, "loss": 1.6932, "step": 148 }, { "epoch": 0.12064777327935222, "grad_norm": 1.2823071957240442, "learning_rate": 1.9963817105832305e-05, "loss": 1.6809, "step": 149 }, { "epoch": 0.1214574898785425, "grad_norm": 1.2331269340221573, "learning_rate": 1.996308289887366e-05, "loss": 1.7086, "step": 150 }, { "epoch": 0.12226720647773279, "grad_norm": 1.3183668422073427, "learning_rate": 1.9962341331222092e-05, "loss": 1.6732, "step": 151 }, { "epoch": 0.12307692307692308, "grad_norm": 1.3211093232564248, "learning_rate": 1.996159240342547e-05, "loss": 1.6752, "step": 152 }, { "epoch": 0.12388663967611337, "grad_norm": 1.3807756953246901, "learning_rate": 1.9960836116037095e-05, "loss": 1.6819, "step": 153 }, { "epoch": 0.12469635627530365, "grad_norm": 1.229226565824642, "learning_rate": 1.9960072469615716e-05, "loss": 1.7211, "step": 154 }, { "epoch": 0.12550607287449392, "grad_norm": 1.37757201668452, "learning_rate": 1.9959301464725507e-05, "loss": 1.7252, "step": 155 }, { "epoch": 0.12631578947368421, "grad_norm": 1.3625152047582556, "learning_rate": 1.9958523101936083e-05, "loss": 1.7572, "step": 156 }, { "epoch": 0.12712550607287448, "grad_norm": 1.3700626216344285, "learning_rate": 1.9957737381822505e-05, "loss": 1.7384, "step": 157 }, { "epoch": 0.12793522267206478, "grad_norm": 1.3265891040200295, "learning_rate": 1.9956944304965257e-05, "loss": 1.7058, "step": 158 }, { "epoch": 0.12874493927125505, "grad_norm": 1.477875528364369, "learning_rate": 1.9956143871950252e-05, "loss": 1.6846, "step": 159 }, { "epoch": 0.12955465587044535, "grad_norm": 1.3079566287214603, "learning_rate": 1.995533608336886e-05, "loss": 1.7252, "step": 160 }, { "epoch": 0.13036437246963561, "grad_norm": 1.422258383650131, "learning_rate": 1.9954520939817863e-05, "loss": 1.7018, "step": 161 }, { "epoch": 0.1311740890688259, "grad_norm": 1.4264781196637615, "learning_rate": 1.9953698441899494e-05, "loss": 1.6585, "step": 162 }, { "epoch": 0.1319838056680162, "grad_norm": 1.2963346637584106, "learning_rate": 1.9952868590221403e-05, "loss": 1.6369, "step": 163 }, { "epoch": 0.13279352226720648, "grad_norm": 1.5083900065401323, "learning_rate": 1.9952031385396694e-05, "loss": 1.7224, "step": 164 }, { "epoch": 0.13360323886639677, "grad_norm": 1.5179214854876595, "learning_rate": 1.995118682804388e-05, "loss": 1.7588, "step": 165 }, { "epoch": 0.13441295546558704, "grad_norm": 1.3017296176425739, "learning_rate": 1.995033491878692e-05, "loss": 1.7246, "step": 166 }, { "epoch": 0.13522267206477734, "grad_norm": 1.505962687706292, "learning_rate": 1.9949475658255207e-05, "loss": 1.7567, "step": 167 }, { "epoch": 0.1360323886639676, "grad_norm": 1.2555297272542156, "learning_rate": 1.994860904708355e-05, "loss": 1.7062, "step": 168 }, { "epoch": 0.1368421052631579, "grad_norm": 1.485635141654141, "learning_rate": 1.994773508591221e-05, "loss": 1.6812, "step": 169 }, { "epoch": 0.13765182186234817, "grad_norm": 1.2908226955038942, "learning_rate": 1.9946853775386857e-05, "loss": 1.6867, "step": 170 }, { "epoch": 0.13846153846153847, "grad_norm": 1.32792940285529, "learning_rate": 1.9945965116158605e-05, "loss": 1.6529, "step": 171 }, { "epoch": 0.13927125506072874, "grad_norm": 1.2871011365969764, "learning_rate": 1.9945069108883993e-05, "loss": 1.6857, "step": 172 }, { "epoch": 0.14008097165991903, "grad_norm": 1.3266369892431742, "learning_rate": 1.994416575422499e-05, "loss": 1.7121, "step": 173 }, { "epoch": 0.1408906882591093, "grad_norm": 1.3444583702433448, "learning_rate": 1.9943255052848984e-05, "loss": 1.6784, "step": 174 }, { "epoch": 0.1417004048582996, "grad_norm": 1.2393516324343374, "learning_rate": 1.9942337005428805e-05, "loss": 1.6686, "step": 175 }, { "epoch": 0.14251012145748987, "grad_norm": 1.3248524170639142, "learning_rate": 1.99414116126427e-05, "loss": 1.6597, "step": 176 }, { "epoch": 0.14331983805668017, "grad_norm": 1.1663228528592244, "learning_rate": 1.9940478875174346e-05, "loss": 1.5819, "step": 177 }, { "epoch": 0.14412955465587043, "grad_norm": 1.3787198785492285, "learning_rate": 1.9939538793712852e-05, "loss": 1.6482, "step": 178 }, { "epoch": 0.14493927125506073, "grad_norm": 1.350980964072927, "learning_rate": 1.993859136895274e-05, "loss": 1.6578, "step": 179 }, { "epoch": 0.145748987854251, "grad_norm": 1.4172264638957366, "learning_rate": 1.9937636601593965e-05, "loss": 1.7364, "step": 180 }, { "epoch": 0.1465587044534413, "grad_norm": 1.2164996262071472, "learning_rate": 1.9936674492341913e-05, "loss": 1.6566, "step": 181 }, { "epoch": 0.14736842105263157, "grad_norm": 1.163342097978053, "learning_rate": 1.9935705041907375e-05, "loss": 1.6742, "step": 182 }, { "epoch": 0.14817813765182186, "grad_norm": 1.3251945549485633, "learning_rate": 1.9934728251006593e-05, "loss": 1.6875, "step": 183 }, { "epoch": 0.14898785425101216, "grad_norm": 1.2641308251973702, "learning_rate": 1.9933744120361202e-05, "loss": 1.7407, "step": 184 }, { "epoch": 0.14979757085020243, "grad_norm": 1.3486708746298524, "learning_rate": 1.9932752650698285e-05, "loss": 1.6667, "step": 185 }, { "epoch": 0.15060728744939272, "grad_norm": 1.2926218776022609, "learning_rate": 1.993175384275033e-05, "loss": 1.6742, "step": 186 }, { "epoch": 0.151417004048583, "grad_norm": 1.367440454792052, "learning_rate": 1.9930747697255263e-05, "loss": 1.6944, "step": 187 }, { "epoch": 0.1522267206477733, "grad_norm": 1.3527036774330383, "learning_rate": 1.992973421495641e-05, "loss": 1.626, "step": 188 }, { "epoch": 0.15303643724696356, "grad_norm": 1.287788530805172, "learning_rate": 1.992871339660253e-05, "loss": 1.6921, "step": 189 }, { "epoch": 0.15384615384615385, "grad_norm": 1.4084861938787652, "learning_rate": 1.9927685242947804e-05, "loss": 1.7108, "step": 190 }, { "epoch": 0.15465587044534412, "grad_norm": 1.279148085366124, "learning_rate": 1.9926649754751825e-05, "loss": 1.6675, "step": 191 }, { "epoch": 0.15546558704453442, "grad_norm": 1.2664930272396757, "learning_rate": 1.9925606932779615e-05, "loss": 1.7115, "step": 192 }, { "epoch": 0.1562753036437247, "grad_norm": 1.321724868931457, "learning_rate": 1.99245567778016e-05, "loss": 1.6596, "step": 193 }, { "epoch": 0.15708502024291499, "grad_norm": 1.2948713460179608, "learning_rate": 1.9923499290593637e-05, "loss": 1.6664, "step": 194 }, { "epoch": 0.15789473684210525, "grad_norm": 1.3818189742460931, "learning_rate": 1.9922434471936987e-05, "loss": 1.7305, "step": 195 }, { "epoch": 0.15870445344129555, "grad_norm": 1.2479572194235047, "learning_rate": 1.9921362322618337e-05, "loss": 1.7553, "step": 196 }, { "epoch": 0.15951417004048582, "grad_norm": 1.2898516448229447, "learning_rate": 1.9920282843429795e-05, "loss": 1.65, "step": 197 }, { "epoch": 0.16032388663967612, "grad_norm": 1.344427118443871, "learning_rate": 1.9919196035168865e-05, "loss": 1.6366, "step": 198 }, { "epoch": 0.16113360323886639, "grad_norm": 1.2531134681362286, "learning_rate": 1.9918101898638488e-05, "loss": 1.6608, "step": 199 }, { "epoch": 0.16194331983805668, "grad_norm": 1.5273799001568773, "learning_rate": 1.9917000434647e-05, "loss": 1.6969, "step": 200 }, { "epoch": 0.16275303643724695, "grad_norm": 1.4281468350234128, "learning_rate": 1.9915891644008164e-05, "loss": 1.6933, "step": 201 }, { "epoch": 0.16356275303643725, "grad_norm": 1.2479553523055322, "learning_rate": 1.991477552754115e-05, "loss": 1.633, "step": 202 }, { "epoch": 0.16437246963562754, "grad_norm": 1.3889081568820743, "learning_rate": 1.9913652086070535e-05, "loss": 1.6847, "step": 203 }, { "epoch": 0.1651821862348178, "grad_norm": 1.3301717360886383, "learning_rate": 1.9912521320426327e-05, "loss": 1.6712, "step": 204 }, { "epoch": 0.1659919028340081, "grad_norm": 1.2916336401520587, "learning_rate": 1.991138323144392e-05, "loss": 1.6672, "step": 205 }, { "epoch": 0.16680161943319838, "grad_norm": 1.2694035888961783, "learning_rate": 1.9910237819964135e-05, "loss": 1.6759, "step": 206 }, { "epoch": 0.16761133603238867, "grad_norm": 1.2658690812785272, "learning_rate": 1.9909085086833198e-05, "loss": 1.6743, "step": 207 }, { "epoch": 0.16842105263157894, "grad_norm": 1.2672804576639893, "learning_rate": 1.9907925032902745e-05, "loss": 1.664, "step": 208 }, { "epoch": 0.16923076923076924, "grad_norm": 1.1845691219627643, "learning_rate": 1.9906757659029817e-05, "loss": 1.6352, "step": 209 }, { "epoch": 0.1700404858299595, "grad_norm": 1.260533932342834, "learning_rate": 1.990558296607687e-05, "loss": 1.6717, "step": 210 }, { "epoch": 0.1708502024291498, "grad_norm": 1.1724140852250677, "learning_rate": 1.9904400954911763e-05, "loss": 1.5936, "step": 211 }, { "epoch": 0.17165991902834007, "grad_norm": 1.3376574942903026, "learning_rate": 1.990321162640776e-05, "loss": 1.6479, "step": 212 }, { "epoch": 0.17246963562753037, "grad_norm": 1.3302629754630635, "learning_rate": 1.9902014981443532e-05, "loss": 1.6688, "step": 213 }, { "epoch": 0.17327935222672064, "grad_norm": 1.2504083776545458, "learning_rate": 1.9900811020903158e-05, "loss": 1.6221, "step": 214 }, { "epoch": 0.17408906882591094, "grad_norm": 1.279410572569658, "learning_rate": 1.9899599745676123e-05, "loss": 1.7135, "step": 215 }, { "epoch": 0.1748987854251012, "grad_norm": 1.3819762266443132, "learning_rate": 1.989838115665731e-05, "loss": 1.6995, "step": 216 }, { "epoch": 0.1757085020242915, "grad_norm": 1.260595817978411, "learning_rate": 1.9897155254747006e-05, "loss": 1.6651, "step": 217 }, { "epoch": 0.17651821862348177, "grad_norm": 1.3746335152221298, "learning_rate": 1.989592204085091e-05, "loss": 1.732, "step": 218 }, { "epoch": 0.17732793522267207, "grad_norm": 1.2946738994826934, "learning_rate": 1.9894681515880106e-05, "loss": 1.6498, "step": 219 }, { "epoch": 0.17813765182186234, "grad_norm": 1.3214716293210784, "learning_rate": 1.9893433680751105e-05, "loss": 1.6552, "step": 220 }, { "epoch": 0.17894736842105263, "grad_norm": 1.267390260422078, "learning_rate": 1.9892178536385788e-05, "loss": 1.6601, "step": 221 }, { "epoch": 0.1797570850202429, "grad_norm": 1.2661450912214733, "learning_rate": 1.9890916083711463e-05, "loss": 1.6884, "step": 222 }, { "epoch": 0.1805668016194332, "grad_norm": 1.2915140670465655, "learning_rate": 1.9889646323660816e-05, "loss": 1.7442, "step": 223 }, { "epoch": 0.1813765182186235, "grad_norm": 1.2082943407898998, "learning_rate": 1.9888369257171952e-05, "loss": 1.6767, "step": 224 }, { "epoch": 0.18218623481781376, "grad_norm": 1.3731016770046895, "learning_rate": 1.9887084885188354e-05, "loss": 1.7334, "step": 225 }, { "epoch": 0.18299595141700406, "grad_norm": 1.2555298583209036, "learning_rate": 1.988579320865892e-05, "loss": 1.7203, "step": 226 }, { "epoch": 0.18380566801619433, "grad_norm": 1.263140649035035, "learning_rate": 1.988449422853793e-05, "loss": 1.6895, "step": 227 }, { "epoch": 0.18461538461538463, "grad_norm": 1.2749775919052824, "learning_rate": 1.9883187945785067e-05, "loss": 1.6948, "step": 228 }, { "epoch": 0.1854251012145749, "grad_norm": 1.2429895536427575, "learning_rate": 1.9881874361365413e-05, "loss": 1.6361, "step": 229 }, { "epoch": 0.1862348178137652, "grad_norm": 1.2416951269903427, "learning_rate": 1.9880553476249437e-05, "loss": 1.6758, "step": 230 }, { "epoch": 0.18704453441295546, "grad_norm": 1.239555283382809, "learning_rate": 1.9879225291413e-05, "loss": 1.6989, "step": 231 }, { "epoch": 0.18785425101214576, "grad_norm": 1.2919283930709724, "learning_rate": 1.9877889807837373e-05, "loss": 1.6731, "step": 232 }, { "epoch": 0.18866396761133603, "grad_norm": 1.2233652648633166, "learning_rate": 1.9876547026509194e-05, "loss": 1.6967, "step": 233 }, { "epoch": 0.18947368421052632, "grad_norm": 1.2745651498433337, "learning_rate": 1.987519694842051e-05, "loss": 1.7336, "step": 234 }, { "epoch": 0.1902834008097166, "grad_norm": 1.2732698824293152, "learning_rate": 1.9873839574568756e-05, "loss": 1.6757, "step": 235 }, { "epoch": 0.1910931174089069, "grad_norm": 1.3370656954094273, "learning_rate": 1.9872474905956752e-05, "loss": 1.7574, "step": 236 }, { "epoch": 0.19190283400809716, "grad_norm": 1.337679917073954, "learning_rate": 1.9871102943592717e-05, "loss": 1.6757, "step": 237 }, { "epoch": 0.19271255060728745, "grad_norm": 1.3246803706512758, "learning_rate": 1.9869723688490247e-05, "loss": 1.7, "step": 238 }, { "epoch": 0.19352226720647772, "grad_norm": 1.2000006625597532, "learning_rate": 1.9868337141668333e-05, "loss": 1.6744, "step": 239 }, { "epoch": 0.19433198380566802, "grad_norm": 1.3328073640320248, "learning_rate": 1.9866943304151346e-05, "loss": 1.6301, "step": 240 }, { "epoch": 0.1951417004048583, "grad_norm": 1.2595984845371397, "learning_rate": 1.9865542176969055e-05, "loss": 1.7314, "step": 241 }, { "epoch": 0.19595141700404858, "grad_norm": 1.2697481850011105, "learning_rate": 1.986413376115661e-05, "loss": 1.7403, "step": 242 }, { "epoch": 0.19676113360323888, "grad_norm": 1.184815618587701, "learning_rate": 1.9862718057754536e-05, "loss": 1.6913, "step": 243 }, { "epoch": 0.19757085020242915, "grad_norm": 1.2575508256727617, "learning_rate": 1.9861295067808754e-05, "loss": 1.6972, "step": 244 }, { "epoch": 0.19838056680161945, "grad_norm": 1.2037684360733698, "learning_rate": 1.9859864792370565e-05, "loss": 1.6808, "step": 245 }, { "epoch": 0.19919028340080971, "grad_norm": 1.1984658119885334, "learning_rate": 1.985842723249665e-05, "loss": 1.6486, "step": 246 }, { "epoch": 0.2, "grad_norm": 1.3302394542035112, "learning_rate": 1.985698238924908e-05, "loss": 1.6491, "step": 247 }, { "epoch": 0.20080971659919028, "grad_norm": 1.2819039031405692, "learning_rate": 1.9855530263695287e-05, "loss": 1.7145, "step": 248 }, { "epoch": 0.20161943319838058, "grad_norm": 1.2645396518124346, "learning_rate": 1.9854070856908113e-05, "loss": 1.6766, "step": 249 }, { "epoch": 0.20242914979757085, "grad_norm": 1.3551588648492328, "learning_rate": 1.985260416996575e-05, "loss": 1.7532, "step": 250 }, { "epoch": 0.20323886639676114, "grad_norm": 1.1872824323555884, "learning_rate": 1.9851130203951787e-05, "loss": 1.722, "step": 251 }, { "epoch": 0.2040485829959514, "grad_norm": 1.3233391136396748, "learning_rate": 1.9849648959955187e-05, "loss": 1.6066, "step": 252 }, { "epoch": 0.2048582995951417, "grad_norm": 1.3232293456672852, "learning_rate": 1.9848160439070284e-05, "loss": 1.6556, "step": 253 }, { "epoch": 0.20566801619433198, "grad_norm": 1.3252440210019765, "learning_rate": 1.9846664642396793e-05, "loss": 1.6771, "step": 254 }, { "epoch": 0.20647773279352227, "grad_norm": 1.2091779306727122, "learning_rate": 1.9845161571039805e-05, "loss": 1.7038, "step": 255 }, { "epoch": 0.20728744939271254, "grad_norm": 1.2887939358404155, "learning_rate": 1.9843651226109784e-05, "loss": 1.6948, "step": 256 }, { "epoch": 0.20809716599190284, "grad_norm": 1.2583822543738294, "learning_rate": 1.984213360872257e-05, "loss": 1.6888, "step": 257 }, { "epoch": 0.2089068825910931, "grad_norm": 1.3279683247483347, "learning_rate": 1.9840608719999367e-05, "loss": 1.7059, "step": 258 }, { "epoch": 0.2097165991902834, "grad_norm": 1.270081125900351, "learning_rate": 1.9839076561066766e-05, "loss": 1.7175, "step": 259 }, { "epoch": 0.21052631578947367, "grad_norm": 1.254273959928416, "learning_rate": 1.983753713305672e-05, "loss": 1.6763, "step": 260 }, { "epoch": 0.21133603238866397, "grad_norm": 1.3521388079612242, "learning_rate": 1.9835990437106542e-05, "loss": 1.6948, "step": 261 }, { "epoch": 0.21214574898785424, "grad_norm": 1.267536006413824, "learning_rate": 1.983443647435894e-05, "loss": 1.675, "step": 262 }, { "epoch": 0.21295546558704453, "grad_norm": 1.3388930511150345, "learning_rate": 1.9832875245961972e-05, "loss": 1.7291, "step": 263 }, { "epoch": 0.21376518218623483, "grad_norm": 1.2427093269086842, "learning_rate": 1.9831306753069066e-05, "loss": 1.6094, "step": 264 }, { "epoch": 0.2145748987854251, "grad_norm": 1.349674351751306, "learning_rate": 1.982973099683902e-05, "loss": 1.6964, "step": 265 }, { "epoch": 0.2153846153846154, "grad_norm": 1.365432576876562, "learning_rate": 1.9828147978436e-05, "loss": 1.7176, "step": 266 }, { "epoch": 0.21619433198380567, "grad_norm": 1.2307745643880328, "learning_rate": 1.982655769902953e-05, "loss": 1.6596, "step": 267 }, { "epoch": 0.21700404858299596, "grad_norm": 1.2552754011065252, "learning_rate": 1.9824960159794512e-05, "loss": 1.7251, "step": 268 }, { "epoch": 0.21781376518218623, "grad_norm": 1.2821452869506051, "learning_rate": 1.9823355361911192e-05, "loss": 1.6452, "step": 269 }, { "epoch": 0.21862348178137653, "grad_norm": 1.2062731542559535, "learning_rate": 1.98217433065652e-05, "loss": 1.6814, "step": 270 }, { "epoch": 0.2194331983805668, "grad_norm": 1.2262168648135, "learning_rate": 1.9820123994947505e-05, "loss": 1.6816, "step": 271 }, { "epoch": 0.2202429149797571, "grad_norm": 1.2938841697813679, "learning_rate": 1.981849742825446e-05, "loss": 1.7483, "step": 272 }, { "epoch": 0.22105263157894736, "grad_norm": 1.2645687786911326, "learning_rate": 1.981686360768776e-05, "loss": 1.6805, "step": 273 }, { "epoch": 0.22186234817813766, "grad_norm": 1.222316415160104, "learning_rate": 1.9815222534454472e-05, "loss": 1.7249, "step": 274 }, { "epoch": 0.22267206477732793, "grad_norm": 1.2204096535680333, "learning_rate": 1.9813574209767013e-05, "loss": 1.6393, "step": 275 }, { "epoch": 0.22348178137651822, "grad_norm": 1.2272417519081043, "learning_rate": 1.981191863484316e-05, "loss": 1.665, "step": 276 }, { "epoch": 0.2242914979757085, "grad_norm": 1.2688048841631627, "learning_rate": 1.9810255810906046e-05, "loss": 1.6725, "step": 277 }, { "epoch": 0.2251012145748988, "grad_norm": 1.1858272375472187, "learning_rate": 1.9808585739184156e-05, "loss": 1.6861, "step": 278 }, { "epoch": 0.22591093117408906, "grad_norm": 1.2436292029105176, "learning_rate": 1.980690842091134e-05, "loss": 1.679, "step": 279 }, { "epoch": 0.22672064777327935, "grad_norm": 1.2348131442739951, "learning_rate": 1.9805223857326794e-05, "loss": 1.6451, "step": 280 }, { "epoch": 0.22753036437246962, "grad_norm": 1.195784828107809, "learning_rate": 1.9803532049675062e-05, "loss": 1.637, "step": 281 }, { "epoch": 0.22834008097165992, "grad_norm": 1.2391790592489964, "learning_rate": 1.9801832999206057e-05, "loss": 1.7539, "step": 282 }, { "epoch": 0.2291497975708502, "grad_norm": 1.1984543549350761, "learning_rate": 1.980012670717502e-05, "loss": 1.6307, "step": 283 }, { "epoch": 0.22995951417004049, "grad_norm": 1.2082020794861243, "learning_rate": 1.9798413174842565e-05, "loss": 1.7061, "step": 284 }, { "epoch": 0.23076923076923078, "grad_norm": 1.17049178703938, "learning_rate": 1.9796692403474632e-05, "loss": 1.6614, "step": 285 }, { "epoch": 0.23157894736842105, "grad_norm": 1.1776506077280637, "learning_rate": 1.9794964394342532e-05, "loss": 1.7012, "step": 286 }, { "epoch": 0.23238866396761135, "grad_norm": 1.250032564625677, "learning_rate": 1.9793229148722907e-05, "loss": 1.7472, "step": 287 }, { "epoch": 0.23319838056680162, "grad_norm": 1.1923743701902663, "learning_rate": 1.979148666789775e-05, "loss": 1.6169, "step": 288 }, { "epoch": 0.2340080971659919, "grad_norm": 1.2445641617846566, "learning_rate": 1.9789736953154405e-05, "loss": 1.6697, "step": 289 }, { "epoch": 0.23481781376518218, "grad_norm": 1.2338781631842157, "learning_rate": 1.9787980005785553e-05, "loss": 1.6691, "step": 290 }, { "epoch": 0.23562753036437248, "grad_norm": 1.263470721107049, "learning_rate": 1.9786215827089216e-05, "loss": 1.6716, "step": 291 }, { "epoch": 0.23643724696356275, "grad_norm": 1.2561973670317472, "learning_rate": 1.978444441836877e-05, "loss": 1.6759, "step": 292 }, { "epoch": 0.23724696356275304, "grad_norm": 1.2208658433077548, "learning_rate": 1.9782665780932926e-05, "loss": 1.6881, "step": 293 }, { "epoch": 0.2380566801619433, "grad_norm": 1.2129567864078932, "learning_rate": 1.9780879916095733e-05, "loss": 1.6417, "step": 294 }, { "epoch": 0.2388663967611336, "grad_norm": 1.260630886604336, "learning_rate": 1.977908682517658e-05, "loss": 1.6602, "step": 295 }, { "epoch": 0.23967611336032388, "grad_norm": 1.2506139069671103, "learning_rate": 1.97772865095002e-05, "loss": 1.7444, "step": 296 }, { "epoch": 0.24048582995951417, "grad_norm": 1.360572780404123, "learning_rate": 1.9775478970396663e-05, "loss": 1.6975, "step": 297 }, { "epoch": 0.24129554655870444, "grad_norm": 1.2420340903090243, "learning_rate": 1.9773664209201368e-05, "loss": 1.6733, "step": 298 }, { "epoch": 0.24210526315789474, "grad_norm": 1.2691192435537564, "learning_rate": 1.977184222725505e-05, "loss": 1.6496, "step": 299 }, { "epoch": 0.242914979757085, "grad_norm": 1.251420231930437, "learning_rate": 1.9770013025903797e-05, "loss": 1.6639, "step": 300 }, { "epoch": 0.2437246963562753, "grad_norm": 1.3007823671114107, "learning_rate": 1.9768176606499005e-05, "loss": 1.7034, "step": 301 }, { "epoch": 0.24453441295546557, "grad_norm": 1.2809858772390412, "learning_rate": 1.976633297039742e-05, "loss": 1.6903, "step": 302 }, { "epoch": 0.24534412955465587, "grad_norm": 1.2280345470671303, "learning_rate": 1.976448211896111e-05, "loss": 1.6723, "step": 303 }, { "epoch": 0.24615384615384617, "grad_norm": 1.1497220827533912, "learning_rate": 1.9762624053557485e-05, "loss": 1.6747, "step": 304 }, { "epoch": 0.24696356275303644, "grad_norm": 1.318495032765947, "learning_rate": 1.9760758775559275e-05, "loss": 1.6121, "step": 305 }, { "epoch": 0.24777327935222673, "grad_norm": 1.2399534094992168, "learning_rate": 1.9758886286344536e-05, "loss": 1.7047, "step": 306 }, { "epoch": 0.248582995951417, "grad_norm": 1.2398070884711676, "learning_rate": 1.9757006587296664e-05, "loss": 1.6521, "step": 307 }, { "epoch": 0.2493927125506073, "grad_norm": 1.148807377709599, "learning_rate": 1.975511967980437e-05, "loss": 1.7039, "step": 308 }, { "epoch": 0.25020242914979757, "grad_norm": 1.213998992883127, "learning_rate": 1.9753225565261695e-05, "loss": 1.6879, "step": 309 }, { "epoch": 0.25101214574898784, "grad_norm": 1.2397569747054522, "learning_rate": 1.9751324245068008e-05, "loss": 1.7233, "step": 310 }, { "epoch": 0.25182186234817816, "grad_norm": 1.1848797839629908, "learning_rate": 1.9749415720627993e-05, "loss": 1.6697, "step": 311 }, { "epoch": 0.25263157894736843, "grad_norm": 1.334789053117042, "learning_rate": 1.974749999335167e-05, "loss": 1.6449, "step": 312 }, { "epoch": 0.2534412955465587, "grad_norm": 1.1684958236354195, "learning_rate": 1.974557706465436e-05, "loss": 1.6163, "step": 313 }, { "epoch": 0.25425101214574897, "grad_norm": 1.1517446837404286, "learning_rate": 1.9743646935956727e-05, "loss": 1.6145, "step": 314 }, { "epoch": 0.2550607287449393, "grad_norm": 1.3651988245433082, "learning_rate": 1.974170960868474e-05, "loss": 1.7181, "step": 315 }, { "epoch": 0.25587044534412956, "grad_norm": 1.2278060656467518, "learning_rate": 1.973976508426969e-05, "loss": 1.7003, "step": 316 }, { "epoch": 0.25668016194331983, "grad_norm": 1.298927878086763, "learning_rate": 1.9737813364148187e-05, "loss": 1.7066, "step": 317 }, { "epoch": 0.2574898785425101, "grad_norm": 1.280731725281388, "learning_rate": 1.973585444976215e-05, "loss": 1.6962, "step": 318 }, { "epoch": 0.2582995951417004, "grad_norm": 1.3481441118932556, "learning_rate": 1.973388834255882e-05, "loss": 1.6744, "step": 319 }, { "epoch": 0.2591093117408907, "grad_norm": 1.2693166429730327, "learning_rate": 1.973191504399076e-05, "loss": 1.6762, "step": 320 }, { "epoch": 0.25991902834008096, "grad_norm": 1.2340493742688543, "learning_rate": 1.9729934555515823e-05, "loss": 1.6857, "step": 321 }, { "epoch": 0.26072874493927123, "grad_norm": 1.2452283567023132, "learning_rate": 1.9727946878597193e-05, "loss": 1.6444, "step": 322 }, { "epoch": 0.26153846153846155, "grad_norm": 1.3029700463193483, "learning_rate": 1.9725952014703366e-05, "loss": 1.7188, "step": 323 }, { "epoch": 0.2623481781376518, "grad_norm": 1.179155915390884, "learning_rate": 1.9723949965308132e-05, "loss": 1.6801, "step": 324 }, { "epoch": 0.2631578947368421, "grad_norm": 1.286301672885254, "learning_rate": 1.97219407318906e-05, "loss": 1.6617, "step": 325 }, { "epoch": 0.2639676113360324, "grad_norm": 1.2990996064383904, "learning_rate": 1.9719924315935185e-05, "loss": 1.7269, "step": 326 }, { "epoch": 0.2647773279352227, "grad_norm": 1.2224528081088977, "learning_rate": 1.971790071893161e-05, "loss": 1.7372, "step": 327 }, { "epoch": 0.26558704453441295, "grad_norm": 1.2367757457178075, "learning_rate": 1.9715869942374902e-05, "loss": 1.6821, "step": 328 }, { "epoch": 0.2663967611336032, "grad_norm": 1.2373862157826907, "learning_rate": 1.9713831987765394e-05, "loss": 1.7366, "step": 329 }, { "epoch": 0.26720647773279355, "grad_norm": 1.2623515990334666, "learning_rate": 1.9711786856608714e-05, "loss": 1.6406, "step": 330 }, { "epoch": 0.2680161943319838, "grad_norm": 1.1587501394940165, "learning_rate": 1.9709734550415804e-05, "loss": 1.6572, "step": 331 }, { "epoch": 0.2688259109311741, "grad_norm": 1.3564850748615598, "learning_rate": 1.97076750707029e-05, "loss": 1.6981, "step": 332 }, { "epoch": 0.26963562753036435, "grad_norm": 1.1910339514470332, "learning_rate": 1.9705608418991534e-05, "loss": 1.712, "step": 333 }, { "epoch": 0.2704453441295547, "grad_norm": 1.2174211006655304, "learning_rate": 1.9703534596808547e-05, "loss": 1.6783, "step": 334 }, { "epoch": 0.27125506072874495, "grad_norm": 1.1830375775538684, "learning_rate": 1.970145360568607e-05, "loss": 1.7121, "step": 335 }, { "epoch": 0.2720647773279352, "grad_norm": 1.1619637628427306, "learning_rate": 1.9699365447161535e-05, "loss": 1.6468, "step": 336 }, { "epoch": 0.2728744939271255, "grad_norm": 1.2028809053995226, "learning_rate": 1.969727012277766e-05, "loss": 1.6848, "step": 337 }, { "epoch": 0.2736842105263158, "grad_norm": 1.1621842682222943, "learning_rate": 1.969516763408247e-05, "loss": 1.644, "step": 338 }, { "epoch": 0.2744939271255061, "grad_norm": 1.2484226953452637, "learning_rate": 1.9693057982629277e-05, "loss": 1.6179, "step": 339 }, { "epoch": 0.27530364372469635, "grad_norm": 1.3119636180148664, "learning_rate": 1.969094116997668e-05, "loss": 1.7032, "step": 340 }, { "epoch": 0.2761133603238866, "grad_norm": 1.2508340881483084, "learning_rate": 1.9688817197688576e-05, "loss": 1.6472, "step": 341 }, { "epoch": 0.27692307692307694, "grad_norm": 1.202434258153614, "learning_rate": 1.968668606733415e-05, "loss": 1.6996, "step": 342 }, { "epoch": 0.2777327935222672, "grad_norm": 1.1862376977274527, "learning_rate": 1.9684547780487873e-05, "loss": 1.6536, "step": 343 }, { "epoch": 0.2785425101214575, "grad_norm": 1.3042945008375981, "learning_rate": 1.9682402338729504e-05, "loss": 1.714, "step": 344 }, { "epoch": 0.2793522267206478, "grad_norm": 1.2321323526734465, "learning_rate": 1.968024974364408e-05, "loss": 1.6788, "step": 345 }, { "epoch": 0.28016194331983807, "grad_norm": 1.176640860678081, "learning_rate": 1.967808999682195e-05, "loss": 1.6697, "step": 346 }, { "epoch": 0.28097165991902834, "grad_norm": 1.2519779435098668, "learning_rate": 1.9675923099858712e-05, "loss": 1.663, "step": 347 }, { "epoch": 0.2817813765182186, "grad_norm": 1.2594751793446184, "learning_rate": 1.9673749054355268e-05, "loss": 1.7454, "step": 348 }, { "epoch": 0.28259109311740893, "grad_norm": 1.1910600819228896, "learning_rate": 1.9671567861917796e-05, "loss": 1.6278, "step": 349 }, { "epoch": 0.2834008097165992, "grad_norm": 1.1545397374773405, "learning_rate": 1.9669379524157755e-05, "loss": 1.6888, "step": 350 }, { "epoch": 0.28421052631578947, "grad_norm": 1.1774954626925083, "learning_rate": 1.9667184042691877e-05, "loss": 1.62, "step": 351 }, { "epoch": 0.28502024291497974, "grad_norm": 1.1669061183444, "learning_rate": 1.966498141914218e-05, "loss": 1.6305, "step": 352 }, { "epoch": 0.28582995951417006, "grad_norm": 1.2826242009462285, "learning_rate": 1.9662771655135954e-05, "loss": 1.7025, "step": 353 }, { "epoch": 0.28663967611336033, "grad_norm": 1.215510453464448, "learning_rate": 1.9660554752305763e-05, "loss": 1.7183, "step": 354 }, { "epoch": 0.2874493927125506, "grad_norm": 1.1500016494292455, "learning_rate": 1.9658330712289456e-05, "loss": 1.6474, "step": 355 }, { "epoch": 0.28825910931174087, "grad_norm": 1.1771753941978587, "learning_rate": 1.965609953673014e-05, "loss": 1.6303, "step": 356 }, { "epoch": 0.2890688259109312, "grad_norm": 1.1888955521350377, "learning_rate": 1.9653861227276197e-05, "loss": 1.7237, "step": 357 }, { "epoch": 0.28987854251012146, "grad_norm": 1.3602337615622975, "learning_rate": 1.9651615785581287e-05, "loss": 1.6749, "step": 358 }, { "epoch": 0.29068825910931173, "grad_norm": 1.1994666759895225, "learning_rate": 1.9649363213304337e-05, "loss": 1.6335, "step": 359 }, { "epoch": 0.291497975708502, "grad_norm": 1.1952494705657468, "learning_rate": 1.9647103512109535e-05, "loss": 1.6583, "step": 360 }, { "epoch": 0.2923076923076923, "grad_norm": 1.2493452825647193, "learning_rate": 1.9644836683666347e-05, "loss": 1.7336, "step": 361 }, { "epoch": 0.2931174089068826, "grad_norm": 1.2043236715266186, "learning_rate": 1.9642562729649492e-05, "loss": 1.6457, "step": 362 }, { "epoch": 0.29392712550607286, "grad_norm": 1.3258481801373758, "learning_rate": 1.964028165173896e-05, "loss": 1.7147, "step": 363 }, { "epoch": 0.29473684210526313, "grad_norm": 1.3308234537433237, "learning_rate": 1.963799345162001e-05, "loss": 1.7239, "step": 364 }, { "epoch": 0.29554655870445345, "grad_norm": 1.2105650981201128, "learning_rate": 1.9635698130983153e-05, "loss": 1.69, "step": 365 }, { "epoch": 0.2963562753036437, "grad_norm": 1.4724018359938562, "learning_rate": 1.9633395691524163e-05, "loss": 1.7904, "step": 366 }, { "epoch": 0.297165991902834, "grad_norm": 1.167011876559427, "learning_rate": 1.9631086134944076e-05, "loss": 1.6719, "step": 367 }, { "epoch": 0.2979757085020243, "grad_norm": 1.2115645319681754, "learning_rate": 1.9628769462949187e-05, "loss": 1.7076, "step": 368 }, { "epoch": 0.2987854251012146, "grad_norm": 1.2930285867259568, "learning_rate": 1.9626445677251043e-05, "loss": 1.7555, "step": 369 }, { "epoch": 0.29959514170040485, "grad_norm": 1.1530085275414237, "learning_rate": 1.962411477956645e-05, "loss": 1.6965, "step": 370 }, { "epoch": 0.3004048582995951, "grad_norm": 1.4097103876759387, "learning_rate": 1.9621776771617464e-05, "loss": 1.6768, "step": 371 }, { "epoch": 0.30121457489878545, "grad_norm": 1.168938484091032, "learning_rate": 1.9619431655131404e-05, "loss": 1.6209, "step": 372 }, { "epoch": 0.3020242914979757, "grad_norm": 1.2057005406635852, "learning_rate": 1.961707943184083e-05, "loss": 1.6893, "step": 373 }, { "epoch": 0.302834008097166, "grad_norm": 1.2784392150421353, "learning_rate": 1.9614720103483562e-05, "loss": 1.7162, "step": 374 }, { "epoch": 0.30364372469635625, "grad_norm": 1.2120518452791202, "learning_rate": 1.9612353671802658e-05, "loss": 1.5952, "step": 375 }, { "epoch": 0.3044534412955466, "grad_norm": 1.2873149200801106, "learning_rate": 1.960998013854643e-05, "loss": 1.7035, "step": 376 }, { "epoch": 0.30526315789473685, "grad_norm": 1.3006849812067949, "learning_rate": 1.960759950546844e-05, "loss": 1.6379, "step": 377 }, { "epoch": 0.3060728744939271, "grad_norm": 1.2990265392717135, "learning_rate": 1.960521177432749e-05, "loss": 1.7101, "step": 378 }, { "epoch": 0.3068825910931174, "grad_norm": 1.3082477762819935, "learning_rate": 1.9602816946887634e-05, "loss": 1.691, "step": 379 }, { "epoch": 0.3076923076923077, "grad_norm": 1.2057202532696198, "learning_rate": 1.960041502491815e-05, "loss": 1.7229, "step": 380 }, { "epoch": 0.308502024291498, "grad_norm": 1.155390745921477, "learning_rate": 1.959800601019358e-05, "loss": 1.649, "step": 381 }, { "epoch": 0.30931174089068825, "grad_norm": 1.2547425138455335, "learning_rate": 1.9595589904493696e-05, "loss": 1.7094, "step": 382 }, { "epoch": 0.3101214574898785, "grad_norm": 1.2268479082620285, "learning_rate": 1.9593166709603503e-05, "loss": 1.6612, "step": 383 }, { "epoch": 0.31093117408906884, "grad_norm": 1.247022945456135, "learning_rate": 1.9590736427313255e-05, "loss": 1.7201, "step": 384 }, { "epoch": 0.3117408906882591, "grad_norm": 1.1780738130918533, "learning_rate": 1.9588299059418434e-05, "loss": 1.707, "step": 385 }, { "epoch": 0.3125506072874494, "grad_norm": 1.3110540287499943, "learning_rate": 1.958585460771976e-05, "loss": 1.5952, "step": 386 }, { "epoch": 0.3133603238866397, "grad_norm": 1.2356287734723737, "learning_rate": 1.9583403074023183e-05, "loss": 1.7514, "step": 387 }, { "epoch": 0.31417004048582997, "grad_norm": 1.251297643939624, "learning_rate": 1.9580944460139896e-05, "loss": 1.6577, "step": 388 }, { "epoch": 0.31497975708502024, "grad_norm": 1.283479017126295, "learning_rate": 1.9578478767886303e-05, "loss": 1.6909, "step": 389 }, { "epoch": 0.3157894736842105, "grad_norm": 1.1892005294340031, "learning_rate": 1.957600599908406e-05, "loss": 1.6956, "step": 390 }, { "epoch": 0.31659919028340083, "grad_norm": 1.299357039138039, "learning_rate": 1.957352615556004e-05, "loss": 1.6994, "step": 391 }, { "epoch": 0.3174089068825911, "grad_norm": 1.2192402309654093, "learning_rate": 1.9571039239146332e-05, "loss": 1.7085, "step": 392 }, { "epoch": 0.31821862348178137, "grad_norm": 1.2089295961333866, "learning_rate": 1.9568545251680272e-05, "loss": 1.7312, "step": 393 }, { "epoch": 0.31902834008097164, "grad_norm": 1.2002238729382657, "learning_rate": 1.956604419500441e-05, "loss": 1.6919, "step": 394 }, { "epoch": 0.31983805668016196, "grad_norm": 1.1372502072400479, "learning_rate": 1.9563536070966513e-05, "loss": 1.6842, "step": 395 }, { "epoch": 0.32064777327935223, "grad_norm": 1.1848402112810035, "learning_rate": 1.956102088141958e-05, "loss": 1.6854, "step": 396 }, { "epoch": 0.3214574898785425, "grad_norm": 1.212172944607681, "learning_rate": 1.9558498628221816e-05, "loss": 1.6693, "step": 397 }, { "epoch": 0.32226720647773277, "grad_norm": 1.2323473874443758, "learning_rate": 1.9555969313236666e-05, "loss": 1.6916, "step": 398 }, { "epoch": 0.3230769230769231, "grad_norm": 1.197869807167727, "learning_rate": 1.955343293833277e-05, "loss": 1.6777, "step": 399 }, { "epoch": 0.32388663967611336, "grad_norm": 1.2182554044770237, "learning_rate": 1.9550889505383996e-05, "loss": 1.7348, "step": 400 }, { "epoch": 0.32469635627530363, "grad_norm": 1.2204060971165689, "learning_rate": 1.954833901626943e-05, "loss": 1.6656, "step": 401 }, { "epoch": 0.3255060728744939, "grad_norm": 1.2003350489961402, "learning_rate": 1.9545781472873354e-05, "loss": 1.6162, "step": 402 }, { "epoch": 0.3263157894736842, "grad_norm": 1.3712370071917266, "learning_rate": 1.954321687708528e-05, "loss": 1.6689, "step": 403 }, { "epoch": 0.3271255060728745, "grad_norm": 1.2066042919530333, "learning_rate": 1.954064523079992e-05, "loss": 1.6092, "step": 404 }, { "epoch": 0.32793522267206476, "grad_norm": 1.2696649824839026, "learning_rate": 1.9538066535917196e-05, "loss": 1.6442, "step": 405 }, { "epoch": 0.3287449392712551, "grad_norm": 1.470140395099795, "learning_rate": 1.9535480794342248e-05, "loss": 1.6058, "step": 406 }, { "epoch": 0.32955465587044536, "grad_norm": 1.3119894938694066, "learning_rate": 1.9532888007985408e-05, "loss": 1.7058, "step": 407 }, { "epoch": 0.3303643724696356, "grad_norm": 1.4968426504458228, "learning_rate": 1.9530288178762213e-05, "loss": 1.6929, "step": 408 }, { "epoch": 0.3311740890688259, "grad_norm": 1.2601976626437872, "learning_rate": 1.9527681308593412e-05, "loss": 1.6336, "step": 409 }, { "epoch": 0.3319838056680162, "grad_norm": 1.3452338661981322, "learning_rate": 1.952506739940496e-05, "loss": 1.7445, "step": 410 }, { "epoch": 0.3327935222672065, "grad_norm": 1.2717619212491589, "learning_rate": 1.9522446453127994e-05, "loss": 1.6666, "step": 411 }, { "epoch": 0.33360323886639676, "grad_norm": 1.1984329741126858, "learning_rate": 1.951981847169886e-05, "loss": 1.7448, "step": 412 }, { "epoch": 0.334412955465587, "grad_norm": 1.1961449074373034, "learning_rate": 1.951718345705911e-05, "loss": 1.667, "step": 413 }, { "epoch": 0.33522267206477735, "grad_norm": 1.330214100403057, "learning_rate": 1.9514541411155478e-05, "loss": 1.7108, "step": 414 }, { "epoch": 0.3360323886639676, "grad_norm": 1.183842085273524, "learning_rate": 1.9511892335939904e-05, "loss": 1.621, "step": 415 }, { "epoch": 0.3368421052631579, "grad_norm": 1.2664210311376265, "learning_rate": 1.950923623336951e-05, "loss": 1.6512, "step": 416 }, { "epoch": 0.33765182186234816, "grad_norm": 1.1261494079421026, "learning_rate": 1.9506573105406623e-05, "loss": 1.6123, "step": 417 }, { "epoch": 0.3384615384615385, "grad_norm": 1.27913282791293, "learning_rate": 1.9503902954018748e-05, "loss": 1.7083, "step": 418 }, { "epoch": 0.33927125506072875, "grad_norm": 1.1019506289059786, "learning_rate": 1.9501225781178586e-05, "loss": 1.6501, "step": 419 }, { "epoch": 0.340080971659919, "grad_norm": 1.1163063251074246, "learning_rate": 1.9498541588864022e-05, "loss": 1.626, "step": 420 }, { "epoch": 0.3408906882591093, "grad_norm": 1.1208221900644928, "learning_rate": 1.9495850379058127e-05, "loss": 1.6486, "step": 421 }, { "epoch": 0.3417004048582996, "grad_norm": 1.1252985647383595, "learning_rate": 1.9493152153749162e-05, "loss": 1.6098, "step": 422 }, { "epoch": 0.3425101214574899, "grad_norm": 1.1658129970131546, "learning_rate": 1.9490446914930564e-05, "loss": 1.6067, "step": 423 }, { "epoch": 0.34331983805668015, "grad_norm": 1.2465190601078746, "learning_rate": 1.9487734664600956e-05, "loss": 1.7689, "step": 424 }, { "epoch": 0.3441295546558704, "grad_norm": 1.2032865897994165, "learning_rate": 1.948501540476414e-05, "loss": 1.6348, "step": 425 }, { "epoch": 0.34493927125506074, "grad_norm": 1.3211565552357492, "learning_rate": 1.9482289137429098e-05, "loss": 1.6349, "step": 426 }, { "epoch": 0.345748987854251, "grad_norm": 1.1307592944834632, "learning_rate": 1.9479555864609985e-05, "loss": 1.593, "step": 427 }, { "epoch": 0.3465587044534413, "grad_norm": 1.2787848323635016, "learning_rate": 1.947681558832613e-05, "loss": 1.7022, "step": 428 }, { "epoch": 0.3473684210526316, "grad_norm": 1.2091547875890887, "learning_rate": 1.9474068310602048e-05, "loss": 1.6566, "step": 429 }, { "epoch": 0.3481781376518219, "grad_norm": 1.2483509469560066, "learning_rate": 1.9471314033467413e-05, "loss": 1.6993, "step": 430 }, { "epoch": 0.34898785425101214, "grad_norm": 1.2498538878012477, "learning_rate": 1.9468552758957076e-05, "loss": 1.7252, "step": 431 }, { "epoch": 0.3497975708502024, "grad_norm": 1.1989025930704638, "learning_rate": 1.9465784489111063e-05, "loss": 1.6999, "step": 432 }, { "epoch": 0.35060728744939273, "grad_norm": 1.1762119281728465, "learning_rate": 1.9463009225974558e-05, "loss": 1.6501, "step": 433 }, { "epoch": 0.351417004048583, "grad_norm": 1.2950834978750903, "learning_rate": 1.9460226971597916e-05, "loss": 1.7399, "step": 434 }, { "epoch": 0.3522267206477733, "grad_norm": 1.2455256194553699, "learning_rate": 1.945743772803666e-05, "loss": 1.6708, "step": 435 }, { "epoch": 0.35303643724696354, "grad_norm": 1.2202828687384126, "learning_rate": 1.945464149735147e-05, "loss": 1.6896, "step": 436 }, { "epoch": 0.35384615384615387, "grad_norm": 1.158420520477845, "learning_rate": 1.94518382816082e-05, "loss": 1.6619, "step": 437 }, { "epoch": 0.35465587044534413, "grad_norm": 1.1779822358008685, "learning_rate": 1.9449028082877843e-05, "loss": 1.6571, "step": 438 }, { "epoch": 0.3554655870445344, "grad_norm": 1.1534682512913468, "learning_rate": 1.944621090323658e-05, "loss": 1.5662, "step": 439 }, { "epoch": 0.3562753036437247, "grad_norm": 1.231944980721245, "learning_rate": 1.9443386744765726e-05, "loss": 1.6072, "step": 440 }, { "epoch": 0.357085020242915, "grad_norm": 1.2309431429104132, "learning_rate": 1.9440555609551758e-05, "loss": 1.6478, "step": 441 }, { "epoch": 0.35789473684210527, "grad_norm": 1.1256347239923377, "learning_rate": 1.9437717499686313e-05, "loss": 1.5872, "step": 442 }, { "epoch": 0.35870445344129553, "grad_norm": 1.177945395683949, "learning_rate": 1.9434872417266176e-05, "loss": 1.6847, "step": 443 }, { "epoch": 0.3595141700404858, "grad_norm": 1.1817750781529393, "learning_rate": 1.943202036439329e-05, "loss": 1.6555, "step": 444 }, { "epoch": 0.3603238866396761, "grad_norm": 1.155486180808622, "learning_rate": 1.942916134317473e-05, "loss": 1.703, "step": 445 }, { "epoch": 0.3611336032388664, "grad_norm": 1.2249672033301868, "learning_rate": 1.9426295355722745e-05, "loss": 1.6686, "step": 446 }, { "epoch": 0.36194331983805667, "grad_norm": 1.1939431449591948, "learning_rate": 1.9423422404154708e-05, "loss": 1.6403, "step": 447 }, { "epoch": 0.362753036437247, "grad_norm": 1.283040329656735, "learning_rate": 1.942054249059315e-05, "loss": 1.7033, "step": 448 }, { "epoch": 0.36356275303643726, "grad_norm": 1.2302822837989298, "learning_rate": 1.941765561716574e-05, "loss": 1.6309, "step": 449 }, { "epoch": 0.3643724696356275, "grad_norm": 1.258830672551768, "learning_rate": 1.9414761786005293e-05, "loss": 1.6807, "step": 450 }, { "epoch": 0.3651821862348178, "grad_norm": 1.2346970982772958, "learning_rate": 1.9411860999249762e-05, "loss": 1.6063, "step": 451 }, { "epoch": 0.3659919028340081, "grad_norm": 1.323015686111149, "learning_rate": 1.9408953259042236e-05, "loss": 1.6917, "step": 452 }, { "epoch": 0.3668016194331984, "grad_norm": 1.1863762011370744, "learning_rate": 1.9406038567530944e-05, "loss": 1.6365, "step": 453 }, { "epoch": 0.36761133603238866, "grad_norm": 1.2001660652965502, "learning_rate": 1.9403116926869253e-05, "loss": 1.6383, "step": 454 }, { "epoch": 0.3684210526315789, "grad_norm": 1.2089353111723722, "learning_rate": 1.9400188339215657e-05, "loss": 1.6778, "step": 455 }, { "epoch": 0.36923076923076925, "grad_norm": 1.2546231222228346, "learning_rate": 1.9397252806733793e-05, "loss": 1.6405, "step": 456 }, { "epoch": 0.3700404858299595, "grad_norm": 1.3359592739488402, "learning_rate": 1.939431033159242e-05, "loss": 1.643, "step": 457 }, { "epoch": 0.3708502024291498, "grad_norm": 1.1715630102315027, "learning_rate": 1.9391360915965426e-05, "loss": 1.5935, "step": 458 }, { "epoch": 0.37165991902834006, "grad_norm": 1.1385520627989953, "learning_rate": 1.9388404562031836e-05, "loss": 1.6979, "step": 459 }, { "epoch": 0.3724696356275304, "grad_norm": 1.2399884358282447, "learning_rate": 1.9385441271975786e-05, "loss": 1.6434, "step": 460 }, { "epoch": 0.37327935222672065, "grad_norm": 1.163385165068389, "learning_rate": 1.9382471047986555e-05, "loss": 1.6838, "step": 461 }, { "epoch": 0.3740890688259109, "grad_norm": 1.1225008795816183, "learning_rate": 1.9379493892258527e-05, "loss": 1.6541, "step": 462 }, { "epoch": 0.3748987854251012, "grad_norm": 1.2531286384618345, "learning_rate": 1.937650980699122e-05, "loss": 1.654, "step": 463 }, { "epoch": 0.3757085020242915, "grad_norm": 1.1876213186476707, "learning_rate": 1.9373518794389263e-05, "loss": 1.6901, "step": 464 }, { "epoch": 0.3765182186234818, "grad_norm": 1.112836379139156, "learning_rate": 1.9370520856662406e-05, "loss": 1.6399, "step": 465 }, { "epoch": 0.37732793522267205, "grad_norm": 1.3185471861887414, "learning_rate": 1.9367515996025516e-05, "loss": 1.6746, "step": 466 }, { "epoch": 0.3781376518218624, "grad_norm": 1.3149218591633627, "learning_rate": 1.9364504214698578e-05, "loss": 1.6721, "step": 467 }, { "epoch": 0.37894736842105264, "grad_norm": 1.1367079723459552, "learning_rate": 1.936148551490668e-05, "loss": 1.7142, "step": 468 }, { "epoch": 0.3797570850202429, "grad_norm": 1.1910905835785592, "learning_rate": 1.935845989888003e-05, "loss": 1.6307, "step": 469 }, { "epoch": 0.3805668016194332, "grad_norm": 1.1936772698797782, "learning_rate": 1.9355427368853946e-05, "loss": 1.6976, "step": 470 }, { "epoch": 0.3813765182186235, "grad_norm": 1.2030550020345074, "learning_rate": 1.935238792706885e-05, "loss": 1.7098, "step": 471 }, { "epoch": 0.3821862348178138, "grad_norm": 1.171000490443613, "learning_rate": 1.934934157577027e-05, "loss": 1.7119, "step": 472 }, { "epoch": 0.38299595141700404, "grad_norm": 1.164404989871964, "learning_rate": 1.934628831720884e-05, "loss": 1.6364, "step": 473 }, { "epoch": 0.3838056680161943, "grad_norm": 1.1421132414106066, "learning_rate": 1.9343228153640296e-05, "loss": 1.6652, "step": 474 }, { "epoch": 0.38461538461538464, "grad_norm": 1.1666775642165048, "learning_rate": 1.9340161087325483e-05, "loss": 1.6711, "step": 475 }, { "epoch": 0.3854251012145749, "grad_norm": 1.1725292925597457, "learning_rate": 1.9337087120530335e-05, "loss": 1.5852, "step": 476 }, { "epoch": 0.3862348178137652, "grad_norm": 1.2217452550533756, "learning_rate": 1.9334006255525884e-05, "loss": 1.6768, "step": 477 }, { "epoch": 0.38704453441295544, "grad_norm": 1.1442586319126382, "learning_rate": 1.9330918494588275e-05, "loss": 1.6791, "step": 478 }, { "epoch": 0.38785425101214577, "grad_norm": 1.0946129383309182, "learning_rate": 1.9327823839998726e-05, "loss": 1.7151, "step": 479 }, { "epoch": 0.38866396761133604, "grad_norm": 1.2204161246543435, "learning_rate": 1.932472229404356e-05, "loss": 1.6237, "step": 480 }, { "epoch": 0.3894736842105263, "grad_norm": 1.1880149763428396, "learning_rate": 1.932161385901419e-05, "loss": 1.6761, "step": 481 }, { "epoch": 0.3902834008097166, "grad_norm": 1.2592668241512581, "learning_rate": 1.931849853720712e-05, "loss": 1.5898, "step": 482 }, { "epoch": 0.3910931174089069, "grad_norm": 1.209479528339513, "learning_rate": 1.931537633092393e-05, "loss": 1.6279, "step": 483 }, { "epoch": 0.39190283400809717, "grad_norm": 1.2513199284132253, "learning_rate": 1.9312247242471306e-05, "loss": 1.6343, "step": 484 }, { "epoch": 0.39271255060728744, "grad_norm": 1.1989743043638685, "learning_rate": 1.9309111274161005e-05, "loss": 1.6881, "step": 485 }, { "epoch": 0.39352226720647776, "grad_norm": 1.2273758702155306, "learning_rate": 1.930596842830987e-05, "loss": 1.6703, "step": 486 }, { "epoch": 0.39433198380566803, "grad_norm": 1.1911609997920505, "learning_rate": 1.9302818707239822e-05, "loss": 1.6359, "step": 487 }, { "epoch": 0.3951417004048583, "grad_norm": 1.1555821594774291, "learning_rate": 1.9299662113277867e-05, "loss": 1.7005, "step": 488 }, { "epoch": 0.39595141700404857, "grad_norm": 1.161714222746895, "learning_rate": 1.929649864875609e-05, "loss": 1.6259, "step": 489 }, { "epoch": 0.3967611336032389, "grad_norm": 1.2024264839115575, "learning_rate": 1.9293328316011645e-05, "loss": 1.6454, "step": 490 }, { "epoch": 0.39757085020242916, "grad_norm": 1.182376794250563, "learning_rate": 1.929015111738676e-05, "loss": 1.6839, "step": 491 }, { "epoch": 0.39838056680161943, "grad_norm": 1.2463525022449193, "learning_rate": 1.9286967055228744e-05, "loss": 1.7041, "step": 492 }, { "epoch": 0.3991902834008097, "grad_norm": 1.1765792609257826, "learning_rate": 1.928377613188997e-05, "loss": 1.6963, "step": 493 }, { "epoch": 0.4, "grad_norm": 1.1464191471865726, "learning_rate": 1.9280578349727882e-05, "loss": 1.7587, "step": 494 }, { "epoch": 0.4008097165991903, "grad_norm": 1.2250198503430287, "learning_rate": 1.927737371110499e-05, "loss": 1.6011, "step": 495 }, { "epoch": 0.40161943319838056, "grad_norm": 1.2273800649829825, "learning_rate": 1.927416221838887e-05, "loss": 1.7219, "step": 496 }, { "epoch": 0.40242914979757083, "grad_norm": 1.1823664342882607, "learning_rate": 1.9270943873952162e-05, "loss": 1.6612, "step": 497 }, { "epoch": 0.40323886639676115, "grad_norm": 1.2129258567045582, "learning_rate": 1.9267718680172574e-05, "loss": 1.6969, "step": 498 }, { "epoch": 0.4040485829959514, "grad_norm": 1.142011991460994, "learning_rate": 1.926448663943286e-05, "loss": 1.6331, "step": 499 }, { "epoch": 0.4048582995951417, "grad_norm": 1.1900750233989383, "learning_rate": 1.9261247754120846e-05, "loss": 1.5953, "step": 500 }, { "epoch": 0.40566801619433196, "grad_norm": 1.1876038257951016, "learning_rate": 1.925800202662941e-05, "loss": 1.635, "step": 501 }, { "epoch": 0.4064777327935223, "grad_norm": 1.1483754178488699, "learning_rate": 1.9254749459356482e-05, "loss": 1.672, "step": 502 }, { "epoch": 0.40728744939271255, "grad_norm": 1.2261880502908553, "learning_rate": 1.9251490054705053e-05, "loss": 1.7095, "step": 503 }, { "epoch": 0.4080971659919028, "grad_norm": 1.2012813998498333, "learning_rate": 1.9248223815083155e-05, "loss": 1.6417, "step": 504 }, { "epoch": 0.4089068825910931, "grad_norm": 1.1128246203440006, "learning_rate": 1.924495074290388e-05, "loss": 1.6963, "step": 505 }, { "epoch": 0.4097165991902834, "grad_norm": 1.1860636204765815, "learning_rate": 1.9241670840585357e-05, "loss": 1.6741, "step": 506 }, { "epoch": 0.4105263157894737, "grad_norm": 1.3472485519941122, "learning_rate": 1.923838411055077e-05, "loss": 1.6403, "step": 507 }, { "epoch": 0.41133603238866395, "grad_norm": 1.243377675921519, "learning_rate": 1.923509055522835e-05, "loss": 1.6858, "step": 508 }, { "epoch": 0.4121457489878543, "grad_norm": 1.1780699401889794, "learning_rate": 1.9231790177051354e-05, "loss": 1.7021, "step": 509 }, { "epoch": 0.41295546558704455, "grad_norm": 1.1235409565713237, "learning_rate": 1.92284829784581e-05, "loss": 1.8083, "step": 510 }, { "epoch": 0.4137651821862348, "grad_norm": 1.2751087877042944, "learning_rate": 1.922516896189193e-05, "loss": 1.6194, "step": 511 }, { "epoch": 0.4145748987854251, "grad_norm": 1.1876545871499165, "learning_rate": 1.922184812980123e-05, "loss": 1.6526, "step": 512 }, { "epoch": 0.4153846153846154, "grad_norm": 1.1881650289467287, "learning_rate": 1.921852048463942e-05, "loss": 1.6574, "step": 513 }, { "epoch": 0.4161943319838057, "grad_norm": 1.2137821843512242, "learning_rate": 1.9215186028864955e-05, "loss": 1.631, "step": 514 }, { "epoch": 0.41700404858299595, "grad_norm": 1.2205249821982338, "learning_rate": 1.9211844764941318e-05, "loss": 1.6862, "step": 515 }, { "epoch": 0.4178137651821862, "grad_norm": 1.229223049157285, "learning_rate": 1.920849669533702e-05, "loss": 1.7166, "step": 516 }, { "epoch": 0.41862348178137654, "grad_norm": 1.1248482042561723, "learning_rate": 1.920514182252561e-05, "loss": 1.7045, "step": 517 }, { "epoch": 0.4194331983805668, "grad_norm": 1.1551125743150399, "learning_rate": 1.9201780148985657e-05, "loss": 1.6242, "step": 518 }, { "epoch": 0.4202429149797571, "grad_norm": 1.1669084355179586, "learning_rate": 1.9198411677200753e-05, "loss": 1.708, "step": 519 }, { "epoch": 0.42105263157894735, "grad_norm": 1.1195972354108037, "learning_rate": 1.919503640965951e-05, "loss": 1.6543, "step": 520 }, { "epoch": 0.42186234817813767, "grad_norm": 1.1430422701282208, "learning_rate": 1.919165434885557e-05, "loss": 1.6929, "step": 521 }, { "epoch": 0.42267206477732794, "grad_norm": 1.248012426300318, "learning_rate": 1.9188265497287587e-05, "loss": 1.6886, "step": 522 }, { "epoch": 0.4234817813765182, "grad_norm": 1.2892887919197453, "learning_rate": 1.9184869857459233e-05, "loss": 1.6493, "step": 523 }, { "epoch": 0.4242914979757085, "grad_norm": 1.1122675853550497, "learning_rate": 1.918146743187919e-05, "loss": 1.6726, "step": 524 }, { "epoch": 0.4251012145748988, "grad_norm": 1.2687690810982857, "learning_rate": 1.917805822306117e-05, "loss": 1.6835, "step": 525 }, { "epoch": 0.42591093117408907, "grad_norm": 1.248733843761085, "learning_rate": 1.9174642233523876e-05, "loss": 1.6022, "step": 526 }, { "epoch": 0.42672064777327934, "grad_norm": 1.1610405823160372, "learning_rate": 1.9171219465791037e-05, "loss": 1.6776, "step": 527 }, { "epoch": 0.42753036437246966, "grad_norm": 1.1844102471692486, "learning_rate": 1.9167789922391374e-05, "loss": 1.7224, "step": 528 }, { "epoch": 0.42834008097165993, "grad_norm": 1.1045406051602051, "learning_rate": 1.916435360585863e-05, "loss": 1.6201, "step": 529 }, { "epoch": 0.4291497975708502, "grad_norm": 1.1718884314754825, "learning_rate": 1.916091051873154e-05, "loss": 1.6654, "step": 530 }, { "epoch": 0.42995951417004047, "grad_norm": 1.2149881721900548, "learning_rate": 1.915746066355385e-05, "loss": 1.6386, "step": 531 }, { "epoch": 0.4307692307692308, "grad_norm": 1.122281282890927, "learning_rate": 1.9154004042874295e-05, "loss": 1.6699, "step": 532 }, { "epoch": 0.43157894736842106, "grad_norm": 1.1414789158751815, "learning_rate": 1.915054065924662e-05, "loss": 1.7295, "step": 533 }, { "epoch": 0.43238866396761133, "grad_norm": 1.1773843495524834, "learning_rate": 1.914707051522956e-05, "loss": 1.6519, "step": 534 }, { "epoch": 0.4331983805668016, "grad_norm": 1.1370478468226877, "learning_rate": 1.9143593613386845e-05, "loss": 1.663, "step": 535 }, { "epoch": 0.4340080971659919, "grad_norm": 1.2049613179791026, "learning_rate": 1.9140109956287202e-05, "loss": 1.6453, "step": 536 }, { "epoch": 0.4348178137651822, "grad_norm": 1.1634868725319234, "learning_rate": 1.9136619546504344e-05, "loss": 1.626, "step": 537 }, { "epoch": 0.43562753036437246, "grad_norm": 1.151511606446221, "learning_rate": 1.9133122386616972e-05, "loss": 1.676, "step": 538 }, { "epoch": 0.43643724696356273, "grad_norm": 1.1352020906409594, "learning_rate": 1.9129618479208775e-05, "loss": 1.665, "step": 539 }, { "epoch": 0.43724696356275305, "grad_norm": 1.0886967891950499, "learning_rate": 1.9126107826868436e-05, "loss": 1.6408, "step": 540 }, { "epoch": 0.4380566801619433, "grad_norm": 1.209588887486826, "learning_rate": 1.91225904321896e-05, "loss": 1.7039, "step": 541 }, { "epoch": 0.4388663967611336, "grad_norm": 1.140155316990926, "learning_rate": 1.9119066297770924e-05, "loss": 1.6966, "step": 542 }, { "epoch": 0.43967611336032386, "grad_norm": 1.1285673550031636, "learning_rate": 1.9115535426216018e-05, "loss": 1.6644, "step": 543 }, { "epoch": 0.4404858299595142, "grad_norm": 1.179973190293816, "learning_rate": 1.9111997820133472e-05, "loss": 1.7061, "step": 544 }, { "epoch": 0.44129554655870445, "grad_norm": 1.308360305063932, "learning_rate": 1.9108453482136866e-05, "loss": 1.7163, "step": 545 }, { "epoch": 0.4421052631578947, "grad_norm": 1.1712581078253976, "learning_rate": 1.9104902414844746e-05, "loss": 1.644, "step": 546 }, { "epoch": 0.44291497975708505, "grad_norm": 1.1720636280682097, "learning_rate": 1.9101344620880625e-05, "loss": 1.7228, "step": 547 }, { "epoch": 0.4437246963562753, "grad_norm": 1.2371895291825066, "learning_rate": 1.909778010287299e-05, "loss": 1.6121, "step": 548 }, { "epoch": 0.4445344129554656, "grad_norm": 1.0466963526468558, "learning_rate": 1.9094208863455296e-05, "loss": 1.6056, "step": 549 }, { "epoch": 0.44534412955465585, "grad_norm": 1.1539410237456056, "learning_rate": 1.9090630905265963e-05, "loss": 1.7385, "step": 550 }, { "epoch": 0.4461538461538462, "grad_norm": 1.1690769087937798, "learning_rate": 1.9087046230948373e-05, "loss": 1.7135, "step": 551 }, { "epoch": 0.44696356275303645, "grad_norm": 1.2143571533897817, "learning_rate": 1.9083454843150875e-05, "loss": 1.6558, "step": 552 }, { "epoch": 0.4477732793522267, "grad_norm": 1.1034768392638734, "learning_rate": 1.9079856744526775e-05, "loss": 1.643, "step": 553 }, { "epoch": 0.448582995951417, "grad_norm": 1.150409314573767, "learning_rate": 1.9076251937734328e-05, "loss": 1.6522, "step": 554 }, { "epoch": 0.4493927125506073, "grad_norm": 1.2891397853585957, "learning_rate": 1.9072640425436762e-05, "loss": 1.6858, "step": 555 }, { "epoch": 0.4502024291497976, "grad_norm": 1.2158215396752632, "learning_rate": 1.906902221030225e-05, "loss": 1.6065, "step": 556 }, { "epoch": 0.45101214574898785, "grad_norm": 1.1200407349127612, "learning_rate": 1.9065397295003917e-05, "loss": 1.651, "step": 557 }, { "epoch": 0.4518218623481781, "grad_norm": 1.1631363825604073, "learning_rate": 1.9061765682219833e-05, "loss": 1.6214, "step": 558 }, { "epoch": 0.45263157894736844, "grad_norm": 1.1276692541556212, "learning_rate": 1.9058127374633027e-05, "loss": 1.6493, "step": 559 }, { "epoch": 0.4534412955465587, "grad_norm": 1.1960342249167806, "learning_rate": 1.905448237493147e-05, "loss": 1.6425, "step": 560 }, { "epoch": 0.454251012145749, "grad_norm": 1.1128837104785292, "learning_rate": 1.905083068580807e-05, "loss": 1.6333, "step": 561 }, { "epoch": 0.45506072874493925, "grad_norm": 1.2256171630437762, "learning_rate": 1.9047172309960685e-05, "loss": 1.6754, "step": 562 }, { "epoch": 0.45587044534412957, "grad_norm": 1.2397790580338823, "learning_rate": 1.9043507250092113e-05, "loss": 1.6666, "step": 563 }, { "epoch": 0.45668016194331984, "grad_norm": 1.1082657174113733, "learning_rate": 1.9039835508910086e-05, "loss": 1.6955, "step": 564 }, { "epoch": 0.4574898785425101, "grad_norm": 1.1906286541004671, "learning_rate": 1.9036157089127278e-05, "loss": 1.6531, "step": 565 }, { "epoch": 0.4582995951417004, "grad_norm": 1.1424432978234285, "learning_rate": 1.903247199346129e-05, "loss": 1.7165, "step": 566 }, { "epoch": 0.4591093117408907, "grad_norm": 1.209971661625088, "learning_rate": 1.902878022463466e-05, "loss": 1.7129, "step": 567 }, { "epoch": 0.45991902834008097, "grad_norm": 1.1858172748323061, "learning_rate": 1.9025081785374854e-05, "loss": 1.6542, "step": 568 }, { "epoch": 0.46072874493927124, "grad_norm": 1.133164376536185, "learning_rate": 1.9021376678414266e-05, "loss": 1.6183, "step": 569 }, { "epoch": 0.46153846153846156, "grad_norm": 1.0890524812398936, "learning_rate": 1.901766490649022e-05, "loss": 1.6258, "step": 570 }, { "epoch": 0.46234817813765183, "grad_norm": 1.2014659731558368, "learning_rate": 1.901394647234496e-05, "loss": 1.7053, "step": 571 }, { "epoch": 0.4631578947368421, "grad_norm": 1.1482794624137653, "learning_rate": 1.901022137872565e-05, "loss": 1.6733, "step": 572 }, { "epoch": 0.46396761133603237, "grad_norm": 1.1400965356949173, "learning_rate": 1.9006489628384374e-05, "loss": 1.6498, "step": 573 }, { "epoch": 0.4647773279352227, "grad_norm": 1.1411852291997495, "learning_rate": 1.9002751224078148e-05, "loss": 1.6479, "step": 574 }, { "epoch": 0.46558704453441296, "grad_norm": 1.1952888204227503, "learning_rate": 1.8999006168568883e-05, "loss": 1.6397, "step": 575 }, { "epoch": 0.46639676113360323, "grad_norm": 1.1088918517492075, "learning_rate": 1.899525446462342e-05, "loss": 1.6055, "step": 576 }, { "epoch": 0.4672064777327935, "grad_norm": 1.1036176426671134, "learning_rate": 1.89914961150135e-05, "loss": 1.5637, "step": 577 }, { "epoch": 0.4680161943319838, "grad_norm": 1.2620277434980276, "learning_rate": 1.8987731122515783e-05, "loss": 1.7563, "step": 578 }, { "epoch": 0.4688259109311741, "grad_norm": 1.2114730386349555, "learning_rate": 1.8983959489911833e-05, "loss": 1.6029, "step": 579 }, { "epoch": 0.46963562753036436, "grad_norm": 1.1429824486613362, "learning_rate": 1.8980181219988117e-05, "loss": 1.6325, "step": 580 }, { "epoch": 0.47044534412955463, "grad_norm": 1.2235645463897764, "learning_rate": 1.897639631553601e-05, "loss": 1.6529, "step": 581 }, { "epoch": 0.47125506072874496, "grad_norm": 1.2114311834584224, "learning_rate": 1.897260477935179e-05, "loss": 1.7054, "step": 582 }, { "epoch": 0.4720647773279352, "grad_norm": 1.166387438730835, "learning_rate": 1.8968806614236625e-05, "loss": 1.5569, "step": 583 }, { "epoch": 0.4728744939271255, "grad_norm": 1.195523192072768, "learning_rate": 1.8965001822996597e-05, "loss": 1.6743, "step": 584 }, { "epoch": 0.47368421052631576, "grad_norm": 1.1792702417522374, "learning_rate": 1.8961190408442662e-05, "loss": 1.5886, "step": 585 }, { "epoch": 0.4744939271255061, "grad_norm": 1.2088013728568938, "learning_rate": 1.8957372373390686e-05, "loss": 1.7337, "step": 586 }, { "epoch": 0.47530364372469636, "grad_norm": 1.1827861831578297, "learning_rate": 1.895354772066142e-05, "loss": 1.6138, "step": 587 }, { "epoch": 0.4761133603238866, "grad_norm": 1.2447777229069645, "learning_rate": 1.8949716453080508e-05, "loss": 1.6402, "step": 588 }, { "epoch": 0.47692307692307695, "grad_norm": 1.2650084378850481, "learning_rate": 1.894587857347847e-05, "loss": 1.7021, "step": 589 }, { "epoch": 0.4777327935222672, "grad_norm": 1.1035611492696404, "learning_rate": 1.8942034084690727e-05, "loss": 1.666, "step": 590 }, { "epoch": 0.4785425101214575, "grad_norm": 1.2449301610510557, "learning_rate": 1.893818298955757e-05, "loss": 1.6586, "step": 591 }, { "epoch": 0.47935222672064776, "grad_norm": 1.233085689754001, "learning_rate": 1.8934325290924177e-05, "loss": 1.6828, "step": 592 }, { "epoch": 0.4801619433198381, "grad_norm": 1.1810873725015931, "learning_rate": 1.8930460991640606e-05, "loss": 1.6581, "step": 593 }, { "epoch": 0.48097165991902835, "grad_norm": 1.2288793304782808, "learning_rate": 1.8926590094561784e-05, "loss": 1.691, "step": 594 }, { "epoch": 0.4817813765182186, "grad_norm": 1.2201380059232416, "learning_rate": 1.8922712602547516e-05, "loss": 1.6666, "step": 595 }, { "epoch": 0.4825910931174089, "grad_norm": 1.145832667110585, "learning_rate": 1.891882851846249e-05, "loss": 1.678, "step": 596 }, { "epoch": 0.4834008097165992, "grad_norm": 1.1830711402174343, "learning_rate": 1.891493784517624e-05, "loss": 1.6358, "step": 597 }, { "epoch": 0.4842105263157895, "grad_norm": 1.2028619373981595, "learning_rate": 1.8911040585563196e-05, "loss": 1.7163, "step": 598 }, { "epoch": 0.48502024291497975, "grad_norm": 1.1632991822145227, "learning_rate": 1.8907136742502633e-05, "loss": 1.7096, "step": 599 }, { "epoch": 0.48582995951417, "grad_norm": 1.177024703746382, "learning_rate": 1.89032263188787e-05, "loss": 1.6444, "step": 600 }, { "epoch": 0.48663967611336034, "grad_norm": 1.2097396232078657, "learning_rate": 1.8899309317580403e-05, "loss": 1.7032, "step": 601 }, { "epoch": 0.4874493927125506, "grad_norm": 1.0312500629058605, "learning_rate": 1.8895385741501608e-05, "loss": 1.6393, "step": 602 }, { "epoch": 0.4882591093117409, "grad_norm": 1.1817251113637202, "learning_rate": 1.889145559354105e-05, "loss": 1.6357, "step": 603 }, { "epoch": 0.48906882591093115, "grad_norm": 1.192720416754525, "learning_rate": 1.88875188766023e-05, "loss": 1.6244, "step": 604 }, { "epoch": 0.4898785425101215, "grad_norm": 1.1432510606242128, "learning_rate": 1.8883575593593793e-05, "loss": 1.6741, "step": 605 }, { "epoch": 0.49068825910931174, "grad_norm": 1.1569463862557519, "learning_rate": 1.8879625747428815e-05, "loss": 1.6761, "step": 606 }, { "epoch": 0.491497975708502, "grad_norm": 1.187603448727291, "learning_rate": 1.8875669341025498e-05, "loss": 1.728, "step": 607 }, { "epoch": 0.49230769230769234, "grad_norm": 1.2130166999284056, "learning_rate": 1.8871706377306826e-05, "loss": 1.6865, "step": 608 }, { "epoch": 0.4931174089068826, "grad_norm": 1.1670591496027172, "learning_rate": 1.886773685920062e-05, "loss": 1.6744, "step": 609 }, { "epoch": 0.4939271255060729, "grad_norm": 1.2163771157473018, "learning_rate": 1.8863760789639548e-05, "loss": 1.7203, "step": 610 }, { "epoch": 0.49473684210526314, "grad_norm": 1.229371456590416, "learning_rate": 1.8859778171561118e-05, "loss": 1.7059, "step": 611 }, { "epoch": 0.49554655870445347, "grad_norm": 1.192832189327257, "learning_rate": 1.8855789007907672e-05, "loss": 1.6862, "step": 612 }, { "epoch": 0.49635627530364373, "grad_norm": 1.2220933781088503, "learning_rate": 1.885179330162639e-05, "loss": 1.6475, "step": 613 }, { "epoch": 0.497165991902834, "grad_norm": 1.1263582806182404, "learning_rate": 1.8847791055669297e-05, "loss": 1.5831, "step": 614 }, { "epoch": 0.4979757085020243, "grad_norm": 1.2054405676608149, "learning_rate": 1.8843782272993225e-05, "loss": 1.6485, "step": 615 }, { "epoch": 0.4987854251012146, "grad_norm": 1.1513573612013865, "learning_rate": 1.883976695655986e-05, "loss": 1.6391, "step": 616 }, { "epoch": 0.49959514170040487, "grad_norm": 1.1215561830277645, "learning_rate": 1.88357451093357e-05, "loss": 1.689, "step": 617 }, { "epoch": 0.5004048582995951, "grad_norm": 1.2642509822675319, "learning_rate": 1.8831716734292074e-05, "loss": 1.6564, "step": 618 }, { "epoch": 0.5012145748987854, "grad_norm": 1.1935335114428585, "learning_rate": 1.882768183440513e-05, "loss": 1.6118, "step": 619 }, { "epoch": 0.5020242914979757, "grad_norm": 1.2850491601729415, "learning_rate": 1.8823640412655844e-05, "loss": 1.652, "step": 620 }, { "epoch": 0.5028340080971659, "grad_norm": 1.202299368491898, "learning_rate": 1.881959247203e-05, "loss": 1.6723, "step": 621 }, { "epoch": 0.5036437246963563, "grad_norm": 1.0583336420886984, "learning_rate": 1.8815538015518203e-05, "loss": 1.6403, "step": 622 }, { "epoch": 0.5044534412955466, "grad_norm": 1.24127140876251, "learning_rate": 1.8811477046115877e-05, "loss": 1.692, "step": 623 }, { "epoch": 0.5052631578947369, "grad_norm": 1.174302573947688, "learning_rate": 1.880740956682325e-05, "loss": 1.7306, "step": 624 }, { "epoch": 0.5060728744939271, "grad_norm": 1.1958655509103453, "learning_rate": 1.880333558064536e-05, "loss": 1.7106, "step": 625 }, { "epoch": 0.5068825910931174, "grad_norm": 1.230409287496346, "learning_rate": 1.8799255090592056e-05, "loss": 1.6266, "step": 626 }, { "epoch": 0.5076923076923077, "grad_norm": 1.1988542561396345, "learning_rate": 1.8795168099677992e-05, "loss": 1.6516, "step": 627 }, { "epoch": 0.5085020242914979, "grad_norm": 1.1887473097347059, "learning_rate": 1.8791074610922624e-05, "loss": 1.7295, "step": 628 }, { "epoch": 0.5093117408906883, "grad_norm": 1.3303290476414642, "learning_rate": 1.8786974627350206e-05, "loss": 1.6802, "step": 629 }, { "epoch": 0.5101214574898786, "grad_norm": 1.1121862887308984, "learning_rate": 1.878286815198979e-05, "loss": 1.6759, "step": 630 }, { "epoch": 0.5109311740890689, "grad_norm": 1.1527683977442384, "learning_rate": 1.8778755187875236e-05, "loss": 1.675, "step": 631 }, { "epoch": 0.5117408906882591, "grad_norm": 1.1283416836021556, "learning_rate": 1.877463573804518e-05, "loss": 1.7042, "step": 632 }, { "epoch": 0.5125506072874494, "grad_norm": 1.111511552571126, "learning_rate": 1.877050980554306e-05, "loss": 1.6402, "step": 633 }, { "epoch": 0.5133603238866397, "grad_norm": 1.1686794783887113, "learning_rate": 1.8766377393417104e-05, "loss": 1.6266, "step": 634 }, { "epoch": 0.5141700404858299, "grad_norm": 1.0774322060832857, "learning_rate": 1.876223850472032e-05, "loss": 1.6109, "step": 635 }, { "epoch": 0.5149797570850202, "grad_norm": 1.1204612114124226, "learning_rate": 1.875809314251051e-05, "loss": 1.5896, "step": 636 }, { "epoch": 0.5157894736842106, "grad_norm": 1.1411930639576617, "learning_rate": 1.8753941309850248e-05, "loss": 1.6531, "step": 637 }, { "epoch": 0.5165991902834008, "grad_norm": 1.246386803807688, "learning_rate": 1.8749783009806898e-05, "loss": 1.7025, "step": 638 }, { "epoch": 0.5174089068825911, "grad_norm": 1.176836096422221, "learning_rate": 1.8745618245452596e-05, "loss": 1.6579, "step": 639 }, { "epoch": 0.5182186234817814, "grad_norm": 1.10956833415483, "learning_rate": 1.8741447019864263e-05, "loss": 1.6245, "step": 640 }, { "epoch": 0.5190283400809717, "grad_norm": 1.1188976219487057, "learning_rate": 1.873726933612358e-05, "loss": 1.6296, "step": 641 }, { "epoch": 0.5198380566801619, "grad_norm": 1.2300857210684373, "learning_rate": 1.873308519731701e-05, "loss": 1.6823, "step": 642 }, { "epoch": 0.5206477732793522, "grad_norm": 1.1437764748313324, "learning_rate": 1.872889460653578e-05, "loss": 1.6442, "step": 643 }, { "epoch": 0.5214574898785425, "grad_norm": 1.1135134095150818, "learning_rate": 1.872469756687588e-05, "loss": 1.5917, "step": 644 }, { "epoch": 0.5222672064777328, "grad_norm": 1.1837475043251595, "learning_rate": 1.872049408143808e-05, "loss": 1.7025, "step": 645 }, { "epoch": 0.5230769230769231, "grad_norm": 1.1575973920612108, "learning_rate": 1.8716284153327887e-05, "loss": 1.6407, "step": 646 }, { "epoch": 0.5238866396761134, "grad_norm": 1.1699024098280943, "learning_rate": 1.871206778565559e-05, "loss": 1.6838, "step": 647 }, { "epoch": 0.5246963562753036, "grad_norm": 1.1006692187902312, "learning_rate": 1.870784498153623e-05, "loss": 1.6798, "step": 648 }, { "epoch": 0.5255060728744939, "grad_norm": 1.1232448009893263, "learning_rate": 1.87036157440896e-05, "loss": 1.6862, "step": 649 }, { "epoch": 0.5263157894736842, "grad_norm": 1.1938575522143917, "learning_rate": 1.8699380076440242e-05, "loss": 1.7397, "step": 650 }, { "epoch": 0.5271255060728745, "grad_norm": 1.1943224317319683, "learning_rate": 1.8695137981717452e-05, "loss": 1.6201, "step": 651 }, { "epoch": 0.5279352226720648, "grad_norm": 1.1217940510846476, "learning_rate": 1.8690889463055285e-05, "loss": 1.637, "step": 652 }, { "epoch": 0.5287449392712551, "grad_norm": 1.1039437922379434, "learning_rate": 1.8686634523592523e-05, "loss": 1.6459, "step": 653 }, { "epoch": 0.5295546558704454, "grad_norm": 1.1566442258336185, "learning_rate": 1.868237316647271e-05, "loss": 1.669, "step": 654 }, { "epoch": 0.5303643724696356, "grad_norm": 1.2402960513081014, "learning_rate": 1.8678105394844114e-05, "loss": 1.6826, "step": 655 }, { "epoch": 0.5311740890688259, "grad_norm": 1.1427219527507249, "learning_rate": 1.8673831211859758e-05, "loss": 1.6865, "step": 656 }, { "epoch": 0.5319838056680162, "grad_norm": 1.1397055186220963, "learning_rate": 1.866955062067739e-05, "loss": 1.6982, "step": 657 }, { "epoch": 0.5327935222672064, "grad_norm": 1.1723541895025018, "learning_rate": 1.8665263624459497e-05, "loss": 1.6186, "step": 658 }, { "epoch": 0.5336032388663967, "grad_norm": 1.0732319910985701, "learning_rate": 1.86609702263733e-05, "loss": 1.7125, "step": 659 }, { "epoch": 0.5344129554655871, "grad_norm": 1.102482299703252, "learning_rate": 1.8656670429590745e-05, "loss": 1.7209, "step": 660 }, { "epoch": 0.5352226720647774, "grad_norm": 1.1656759203730942, "learning_rate": 1.8652364237288507e-05, "loss": 1.6379, "step": 661 }, { "epoch": 0.5360323886639676, "grad_norm": 1.1796882306648648, "learning_rate": 1.864805165264799e-05, "loss": 1.6733, "step": 662 }, { "epoch": 0.5368421052631579, "grad_norm": 1.0979399574684228, "learning_rate": 1.8643732678855314e-05, "loss": 1.6439, "step": 663 }, { "epoch": 0.5376518218623482, "grad_norm": 1.1656259629485042, "learning_rate": 1.8639407319101325e-05, "loss": 1.7003, "step": 664 }, { "epoch": 0.5384615384615384, "grad_norm": 1.109949494450682, "learning_rate": 1.8635075576581587e-05, "loss": 1.6913, "step": 665 }, { "epoch": 0.5392712550607287, "grad_norm": 1.1384719929801121, "learning_rate": 1.8630737454496374e-05, "loss": 1.6767, "step": 666 }, { "epoch": 0.540080971659919, "grad_norm": 1.1146510327537758, "learning_rate": 1.8626392956050675e-05, "loss": 1.6193, "step": 667 }, { "epoch": 0.5408906882591094, "grad_norm": 1.1533020616397756, "learning_rate": 1.862204208445419e-05, "loss": 1.6373, "step": 668 }, { "epoch": 0.5417004048582996, "grad_norm": 1.1409514813984398, "learning_rate": 1.8617684842921337e-05, "loss": 1.5814, "step": 669 }, { "epoch": 0.5425101214574899, "grad_norm": 1.1892690565319142, "learning_rate": 1.861332123467122e-05, "loss": 1.71, "step": 670 }, { "epoch": 0.5433198380566802, "grad_norm": 1.0770444731078834, "learning_rate": 1.8608951262927667e-05, "loss": 1.6821, "step": 671 }, { "epoch": 0.5441295546558704, "grad_norm": 1.2201808284178386, "learning_rate": 1.8604574930919198e-05, "loss": 1.608, "step": 672 }, { "epoch": 0.5449392712550607, "grad_norm": 1.1156767804046464, "learning_rate": 1.860019224187903e-05, "loss": 1.6091, "step": 673 }, { "epoch": 0.545748987854251, "grad_norm": 1.1312661674531492, "learning_rate": 1.8595803199045083e-05, "loss": 1.6964, "step": 674 }, { "epoch": 0.5465587044534413, "grad_norm": 1.2293348497150374, "learning_rate": 1.859140780565996e-05, "loss": 1.6872, "step": 675 }, { "epoch": 0.5473684210526316, "grad_norm": 1.062664670461368, "learning_rate": 1.858700606497097e-05, "loss": 1.6528, "step": 676 }, { "epoch": 0.5481781376518219, "grad_norm": 1.1400817362492008, "learning_rate": 1.85825979802301e-05, "loss": 1.6562, "step": 677 }, { "epoch": 0.5489878542510122, "grad_norm": 1.0787086213793413, "learning_rate": 1.8578183554694035e-05, "loss": 1.7133, "step": 678 }, { "epoch": 0.5497975708502024, "grad_norm": 1.0952800726386747, "learning_rate": 1.8573762791624132e-05, "loss": 1.6316, "step": 679 }, { "epoch": 0.5506072874493927, "grad_norm": 1.140497018650844, "learning_rate": 1.856933569428644e-05, "loss": 1.5434, "step": 680 }, { "epoch": 0.551417004048583, "grad_norm": 1.155661350966275, "learning_rate": 1.856490226595168e-05, "loss": 1.7302, "step": 681 }, { "epoch": 0.5522267206477732, "grad_norm": 1.162518892367102, "learning_rate": 1.856046250989526e-05, "loss": 1.7335, "step": 682 }, { "epoch": 0.5530364372469636, "grad_norm": 1.1661792099551183, "learning_rate": 1.8556016429397248e-05, "loss": 1.6408, "step": 683 }, { "epoch": 0.5538461538461539, "grad_norm": 1.129401750350817, "learning_rate": 1.8551564027742404e-05, "loss": 1.6326, "step": 684 }, { "epoch": 0.5546558704453441, "grad_norm": 1.2694837443603195, "learning_rate": 1.8547105308220142e-05, "loss": 1.6501, "step": 685 }, { "epoch": 0.5554655870445344, "grad_norm": 1.0791347009798566, "learning_rate": 1.854264027412455e-05, "loss": 1.6494, "step": 686 }, { "epoch": 0.5562753036437247, "grad_norm": 1.2420182578602168, "learning_rate": 1.853816892875438e-05, "loss": 1.6413, "step": 687 }, { "epoch": 0.557085020242915, "grad_norm": 1.2039522453979503, "learning_rate": 1.853369127541305e-05, "loss": 1.639, "step": 688 }, { "epoch": 0.5578947368421052, "grad_norm": 1.136925790043882, "learning_rate": 1.8529207317408634e-05, "loss": 1.6843, "step": 689 }, { "epoch": 0.5587044534412956, "grad_norm": 1.2928638734791758, "learning_rate": 1.852471705805387e-05, "loss": 1.5799, "step": 690 }, { "epoch": 0.5595141700404859, "grad_norm": 1.0426663613551523, "learning_rate": 1.8520220500666133e-05, "loss": 1.6005, "step": 691 }, { "epoch": 0.5603238866396761, "grad_norm": 1.3130919141243858, "learning_rate": 1.8515717648567476e-05, "loss": 1.7198, "step": 692 }, { "epoch": 0.5611336032388664, "grad_norm": 1.0955024598419019, "learning_rate": 1.8511208505084593e-05, "loss": 1.6197, "step": 693 }, { "epoch": 0.5619433198380567, "grad_norm": 1.0883596134382716, "learning_rate": 1.850669307354882e-05, "loss": 1.6603, "step": 694 }, { "epoch": 0.562753036437247, "grad_norm": 1.218741988618394, "learning_rate": 1.8502171357296144e-05, "loss": 1.6684, "step": 695 }, { "epoch": 0.5635627530364372, "grad_norm": 1.1391537110795147, "learning_rate": 1.8497643359667193e-05, "loss": 1.6579, "step": 696 }, { "epoch": 0.5643724696356275, "grad_norm": 1.1387711931236417, "learning_rate": 1.8493109084007236e-05, "loss": 1.6465, "step": 697 }, { "epoch": 0.5651821862348179, "grad_norm": 1.1254009149677715, "learning_rate": 1.8488568533666183e-05, "loss": 1.6042, "step": 698 }, { "epoch": 0.5659919028340081, "grad_norm": 1.1257268411329382, "learning_rate": 1.848402171199858e-05, "loss": 1.6514, "step": 699 }, { "epoch": 0.5668016194331984, "grad_norm": 1.0927876479165208, "learning_rate": 1.84794686223636e-05, "loss": 1.6822, "step": 700 }, { "epoch": 0.5676113360323887, "grad_norm": 1.090171472139705, "learning_rate": 1.8474909268125053e-05, "loss": 1.7033, "step": 701 }, { "epoch": 0.5684210526315789, "grad_norm": 1.088693023569449, "learning_rate": 1.8470343652651374e-05, "loss": 1.615, "step": 702 }, { "epoch": 0.5692307692307692, "grad_norm": 1.3432139320476364, "learning_rate": 1.846577177931562e-05, "loss": 1.6546, "step": 703 }, { "epoch": 0.5700404858299595, "grad_norm": 1.0705730809837788, "learning_rate": 1.8461193651495482e-05, "loss": 1.6723, "step": 704 }, { "epoch": 0.5708502024291497, "grad_norm": 1.121041236995467, "learning_rate": 1.8456609272573268e-05, "loss": 1.5757, "step": 705 }, { "epoch": 0.5716599190283401, "grad_norm": 1.2097795973645553, "learning_rate": 1.8452018645935895e-05, "loss": 1.6559, "step": 706 }, { "epoch": 0.5724696356275304, "grad_norm": 1.1312684155340222, "learning_rate": 1.844742177497491e-05, "loss": 1.6322, "step": 707 }, { "epoch": 0.5732793522267207, "grad_norm": 1.126110922221143, "learning_rate": 1.8442818663086456e-05, "loss": 1.6482, "step": 708 }, { "epoch": 0.5740890688259109, "grad_norm": 1.1010830873261865, "learning_rate": 1.8438209313671307e-05, "loss": 1.6829, "step": 709 }, { "epoch": 0.5748987854251012, "grad_norm": 1.2743855652131832, "learning_rate": 1.8433593730134835e-05, "loss": 1.669, "step": 710 }, { "epoch": 0.5757085020242915, "grad_norm": 1.1448636929742997, "learning_rate": 1.842897191588701e-05, "loss": 1.692, "step": 711 }, { "epoch": 0.5765182186234817, "grad_norm": 1.1106904762427132, "learning_rate": 1.842434387434242e-05, "loss": 1.6477, "step": 712 }, { "epoch": 0.5773279352226721, "grad_norm": 1.2682672769652985, "learning_rate": 1.8419709608920243e-05, "loss": 1.6194, "step": 713 }, { "epoch": 0.5781376518218624, "grad_norm": 1.2002428737470912, "learning_rate": 1.8415069123044263e-05, "loss": 1.6244, "step": 714 }, { "epoch": 0.5789473684210527, "grad_norm": 1.105036420479548, "learning_rate": 1.841042242014285e-05, "loss": 1.6496, "step": 715 }, { "epoch": 0.5797570850202429, "grad_norm": 1.1794855208133117, "learning_rate": 1.840576950364898e-05, "loss": 1.6911, "step": 716 }, { "epoch": 0.5805668016194332, "grad_norm": 1.1055378048289626, "learning_rate": 1.8401110377000206e-05, "loss": 1.6052, "step": 717 }, { "epoch": 0.5813765182186235, "grad_norm": 1.2287534777350115, "learning_rate": 1.839644504363868e-05, "loss": 1.6905, "step": 718 }, { "epoch": 0.5821862348178137, "grad_norm": 1.1569748526557413, "learning_rate": 1.839177350701113e-05, "loss": 1.6255, "step": 719 }, { "epoch": 0.582995951417004, "grad_norm": 1.1818168990284688, "learning_rate": 1.838709577056888e-05, "loss": 1.5795, "step": 720 }, { "epoch": 0.5838056680161944, "grad_norm": 1.1555576007146946, "learning_rate": 1.838241183776782e-05, "loss": 1.6674, "step": 721 }, { "epoch": 0.5846153846153846, "grad_norm": 1.1401390602412884, "learning_rate": 1.8377721712068424e-05, "loss": 1.675, "step": 722 }, { "epoch": 0.5854251012145749, "grad_norm": 1.162723139199304, "learning_rate": 1.8373025396935743e-05, "loss": 1.5534, "step": 723 }, { "epoch": 0.5862348178137652, "grad_norm": 1.1799321705036754, "learning_rate": 1.8368322895839397e-05, "loss": 1.639, "step": 724 }, { "epoch": 0.5870445344129555, "grad_norm": 1.2240975875375573, "learning_rate": 1.8363614212253585e-05, "loss": 1.597, "step": 725 }, { "epoch": 0.5878542510121457, "grad_norm": 1.1698625227265869, "learning_rate": 1.8358899349657063e-05, "loss": 1.7323, "step": 726 }, { "epoch": 0.588663967611336, "grad_norm": 1.1339957776380405, "learning_rate": 1.8354178311533152e-05, "loss": 1.6183, "step": 727 }, { "epoch": 0.5894736842105263, "grad_norm": 1.2392896642339328, "learning_rate": 1.8349451101369742e-05, "loss": 1.6272, "step": 728 }, { "epoch": 0.5902834008097166, "grad_norm": 1.1522226881000637, "learning_rate": 1.8344717722659285e-05, "loss": 1.6365, "step": 729 }, { "epoch": 0.5910931174089069, "grad_norm": 1.2425239844288822, "learning_rate": 1.833997817889878e-05, "loss": 1.5836, "step": 730 }, { "epoch": 0.5919028340080972, "grad_norm": 1.1120981246598145, "learning_rate": 1.833523247358979e-05, "loss": 1.6734, "step": 731 }, { "epoch": 0.5927125506072874, "grad_norm": 1.1298710482715226, "learning_rate": 1.8330480610238424e-05, "loss": 1.607, "step": 732 }, { "epoch": 0.5935222672064777, "grad_norm": 1.1377574517661373, "learning_rate": 1.8325722592355344e-05, "loss": 1.6454, "step": 733 }, { "epoch": 0.594331983805668, "grad_norm": 1.1538583104420033, "learning_rate": 1.8320958423455756e-05, "loss": 1.6921, "step": 734 }, { "epoch": 0.5951417004048583, "grad_norm": 1.0793087009627134, "learning_rate": 1.8316188107059418e-05, "loss": 1.6726, "step": 735 }, { "epoch": 0.5959514170040486, "grad_norm": 1.1201444715810196, "learning_rate": 1.8311411646690616e-05, "loss": 1.6588, "step": 736 }, { "epoch": 0.5967611336032389, "grad_norm": 1.096654674640351, "learning_rate": 1.8306629045878192e-05, "loss": 1.6684, "step": 737 }, { "epoch": 0.5975708502024292, "grad_norm": 1.1079216672953291, "learning_rate": 1.8301840308155507e-05, "loss": 1.6618, "step": 738 }, { "epoch": 0.5983805668016194, "grad_norm": 1.0879802564948844, "learning_rate": 1.8297045437060474e-05, "loss": 1.7383, "step": 739 }, { "epoch": 0.5991902834008097, "grad_norm": 1.1424198121843356, "learning_rate": 1.8292244436135517e-05, "loss": 1.6709, "step": 740 }, { "epoch": 0.6, "grad_norm": 1.1430964238759362, "learning_rate": 1.828743730892761e-05, "loss": 1.64, "step": 741 }, { "epoch": 0.6008097165991902, "grad_norm": 1.0895066483885778, "learning_rate": 1.8282624058988237e-05, "loss": 1.6584, "step": 742 }, { "epoch": 0.6016194331983805, "grad_norm": 1.1291234703082627, "learning_rate": 1.827780468987341e-05, "loss": 1.6452, "step": 743 }, { "epoch": 0.6024291497975709, "grad_norm": 1.115557218075291, "learning_rate": 1.8272979205143674e-05, "loss": 1.6076, "step": 744 }, { "epoch": 0.6032388663967612, "grad_norm": 1.1171405856483771, "learning_rate": 1.8268147608364068e-05, "loss": 1.6046, "step": 745 }, { "epoch": 0.6040485829959514, "grad_norm": 1.0965261272427589, "learning_rate": 1.8263309903104163e-05, "loss": 1.6013, "step": 746 }, { "epoch": 0.6048582995951417, "grad_norm": 1.2185450067830268, "learning_rate": 1.8258466092938042e-05, "loss": 1.6519, "step": 747 }, { "epoch": 0.605668016194332, "grad_norm": 1.1702812963364408, "learning_rate": 1.82536161814443e-05, "loss": 1.6953, "step": 748 }, { "epoch": 0.6064777327935222, "grad_norm": 1.0686726410287588, "learning_rate": 1.8248760172206024e-05, "loss": 1.6262, "step": 749 }, { "epoch": 0.6072874493927125, "grad_norm": 1.136164025439449, "learning_rate": 1.8243898068810833e-05, "loss": 1.6661, "step": 750 }, { "epoch": 0.6080971659919029, "grad_norm": 1.1954715351994403, "learning_rate": 1.8239029874850823e-05, "loss": 1.674, "step": 751 }, { "epoch": 0.6089068825910932, "grad_norm": 1.1184558352613179, "learning_rate": 1.82341555939226e-05, "loss": 1.721, "step": 752 }, { "epoch": 0.6097165991902834, "grad_norm": 1.129119070175351, "learning_rate": 1.822927522962727e-05, "loss": 1.6457, "step": 753 }, { "epoch": 0.6105263157894737, "grad_norm": 1.1410321623438338, "learning_rate": 1.822438878557043e-05, "loss": 1.6529, "step": 754 }, { "epoch": 0.611336032388664, "grad_norm": 1.1589440461468792, "learning_rate": 1.8219496265362164e-05, "loss": 1.7046, "step": 755 }, { "epoch": 0.6121457489878542, "grad_norm": 1.115213970991795, "learning_rate": 1.8214597672617054e-05, "loss": 1.7141, "step": 756 }, { "epoch": 0.6129554655870445, "grad_norm": 1.1739020879997546, "learning_rate": 1.8209693010954166e-05, "loss": 1.6513, "step": 757 }, { "epoch": 0.6137651821862348, "grad_norm": 1.1420116503362077, "learning_rate": 1.820478228399704e-05, "loss": 1.6475, "step": 758 }, { "epoch": 0.6145748987854251, "grad_norm": 1.2176203227373104, "learning_rate": 1.819986549537372e-05, "loss": 1.6648, "step": 759 }, { "epoch": 0.6153846153846154, "grad_norm": 1.1052645276051432, "learning_rate": 1.81949426487167e-05, "loss": 1.6193, "step": 760 }, { "epoch": 0.6161943319838057, "grad_norm": 1.099724825808027, "learning_rate": 1.819001374766296e-05, "loss": 1.5659, "step": 761 }, { "epoch": 0.617004048582996, "grad_norm": 1.255382813893447, "learning_rate": 1.818507879585397e-05, "loss": 1.6868, "step": 762 }, { "epoch": 0.6178137651821862, "grad_norm": 1.1337123092988888, "learning_rate": 1.8180137796935648e-05, "loss": 1.7117, "step": 763 }, { "epoch": 0.6186234817813765, "grad_norm": 1.0916001764698793, "learning_rate": 1.8175190754558384e-05, "loss": 1.6079, "step": 764 }, { "epoch": 0.6194331983805668, "grad_norm": 1.1462547249605433, "learning_rate": 1.8170237672377046e-05, "loss": 1.6266, "step": 765 }, { "epoch": 0.620242914979757, "grad_norm": 1.184448821537451, "learning_rate": 1.8165278554050946e-05, "loss": 1.7218, "step": 766 }, { "epoch": 0.6210526315789474, "grad_norm": 1.1988569585117117, "learning_rate": 1.8160313403243874e-05, "loss": 1.7002, "step": 767 }, { "epoch": 0.6218623481781377, "grad_norm": 1.1270467979508891, "learning_rate": 1.8155342223624054e-05, "loss": 1.6241, "step": 768 }, { "epoch": 0.622672064777328, "grad_norm": 1.0627366879538067, "learning_rate": 1.8150365018864192e-05, "loss": 1.63, "step": 769 }, { "epoch": 0.6234817813765182, "grad_norm": 1.1499690148470678, "learning_rate": 1.814538179264142e-05, "loss": 1.6664, "step": 770 }, { "epoch": 0.6242914979757085, "grad_norm": 1.1381510719594987, "learning_rate": 1.8140392548637333e-05, "loss": 1.6597, "step": 771 }, { "epoch": 0.6251012145748988, "grad_norm": 1.1404467641041274, "learning_rate": 1.8135397290537967e-05, "loss": 1.6967, "step": 772 }, { "epoch": 0.625910931174089, "grad_norm": 1.1388300535406857, "learning_rate": 1.81303960220338e-05, "loss": 1.607, "step": 773 }, { "epoch": 0.6267206477732794, "grad_norm": 1.1302382932024566, "learning_rate": 1.812538874681976e-05, "loss": 1.6584, "step": 774 }, { "epoch": 0.6275303643724697, "grad_norm": 1.1014453045454282, "learning_rate": 1.81203754685952e-05, "loss": 1.6767, "step": 775 }, { "epoch": 0.6283400809716599, "grad_norm": 1.1824063773454823, "learning_rate": 1.8115356191063913e-05, "loss": 1.6373, "step": 776 }, { "epoch": 0.6291497975708502, "grad_norm": 1.072773503139197, "learning_rate": 1.811033091793413e-05, "loss": 1.6094, "step": 777 }, { "epoch": 0.6299595141700405, "grad_norm": 1.1293001784836003, "learning_rate": 1.8105299652918496e-05, "loss": 1.6591, "step": 778 }, { "epoch": 0.6307692307692307, "grad_norm": 1.2465499878402888, "learning_rate": 1.8100262399734102e-05, "loss": 1.7228, "step": 779 }, { "epoch": 0.631578947368421, "grad_norm": 1.0649028197252695, "learning_rate": 1.8095219162102453e-05, "loss": 1.6456, "step": 780 }, { "epoch": 0.6323886639676113, "grad_norm": 1.2144051445060644, "learning_rate": 1.8090169943749477e-05, "loss": 1.6515, "step": 781 }, { "epoch": 0.6331983805668017, "grad_norm": 1.0813125393674639, "learning_rate": 1.8085114748405514e-05, "loss": 1.6634, "step": 782 }, { "epoch": 0.6340080971659919, "grad_norm": 1.0907352727369097, "learning_rate": 1.8080053579805333e-05, "loss": 1.647, "step": 783 }, { "epoch": 0.6348178137651822, "grad_norm": 1.1795860026440994, "learning_rate": 1.8074986441688102e-05, "loss": 1.6494, "step": 784 }, { "epoch": 0.6356275303643725, "grad_norm": 1.0999290606147423, "learning_rate": 1.8069913337797414e-05, "loss": 1.6061, "step": 785 }, { "epoch": 0.6364372469635627, "grad_norm": 1.1271587687692461, "learning_rate": 1.8064834271881252e-05, "loss": 1.6512, "step": 786 }, { "epoch": 0.637246963562753, "grad_norm": 1.2475268374395798, "learning_rate": 1.805974924769202e-05, "loss": 1.5999, "step": 787 }, { "epoch": 0.6380566801619433, "grad_norm": 1.1666492616855777, "learning_rate": 1.8054658268986517e-05, "loss": 1.6134, "step": 788 }, { "epoch": 0.6388663967611335, "grad_norm": 1.190647434820481, "learning_rate": 1.8049561339525938e-05, "loss": 1.6325, "step": 789 }, { "epoch": 0.6396761133603239, "grad_norm": 1.1567789620189808, "learning_rate": 1.804445846307588e-05, "loss": 1.6126, "step": 790 }, { "epoch": 0.6404858299595142, "grad_norm": 1.1205352247753744, "learning_rate": 1.803934964340633e-05, "loss": 1.6759, "step": 791 }, { "epoch": 0.6412955465587045, "grad_norm": 1.1623098699690382, "learning_rate": 1.803423488429167e-05, "loss": 1.6557, "step": 792 }, { "epoch": 0.6421052631578947, "grad_norm": 1.0452603483361436, "learning_rate": 1.8029114189510664e-05, "loss": 1.637, "step": 793 }, { "epoch": 0.642914979757085, "grad_norm": 1.1457994199931247, "learning_rate": 1.8023987562846468e-05, "loss": 1.7003, "step": 794 }, { "epoch": 0.6437246963562753, "grad_norm": 1.1078125949520747, "learning_rate": 1.801885500808661e-05, "loss": 1.6411, "step": 795 }, { "epoch": 0.6445344129554655, "grad_norm": 1.063287107792573, "learning_rate": 1.8013716529023013e-05, "loss": 1.5944, "step": 796 }, { "epoch": 0.6453441295546559, "grad_norm": 1.1463230389504844, "learning_rate": 1.8008572129451963e-05, "loss": 1.6437, "step": 797 }, { "epoch": 0.6461538461538462, "grad_norm": 1.0745077662710292, "learning_rate": 1.800342181317413e-05, "loss": 1.6141, "step": 798 }, { "epoch": 0.6469635627530365, "grad_norm": 1.1207965270353415, "learning_rate": 1.7998265583994544e-05, "loss": 1.6043, "step": 799 }, { "epoch": 0.6477732793522267, "grad_norm": 1.0890890467994665, "learning_rate": 1.7993103445722615e-05, "loss": 1.6425, "step": 800 }, { "epoch": 0.648582995951417, "grad_norm": 1.13852357372144, "learning_rate": 1.7987935402172114e-05, "loss": 1.6762, "step": 801 }, { "epoch": 0.6493927125506073, "grad_norm": 1.1427359144052136, "learning_rate": 1.7982761457161175e-05, "loss": 1.6973, "step": 802 }, { "epoch": 0.6502024291497975, "grad_norm": 1.0845396194326615, "learning_rate": 1.7977581614512286e-05, "loss": 1.6781, "step": 803 }, { "epoch": 0.6510121457489878, "grad_norm": 1.1491199289525131, "learning_rate": 1.7972395878052304e-05, "loss": 1.6786, "step": 804 }, { "epoch": 0.6518218623481782, "grad_norm": 1.0901262157214715, "learning_rate": 1.7967204251612432e-05, "loss": 1.6037, "step": 805 }, { "epoch": 0.6526315789473685, "grad_norm": 1.1102939312877738, "learning_rate": 1.796200673902823e-05, "loss": 1.6314, "step": 806 }, { "epoch": 0.6534412955465587, "grad_norm": 1.1409639624890162, "learning_rate": 1.7956803344139592e-05, "loss": 1.7136, "step": 807 }, { "epoch": 0.654251012145749, "grad_norm": 1.0991234850498368, "learning_rate": 1.795159407079078e-05, "loss": 1.6653, "step": 808 }, { "epoch": 0.6550607287449393, "grad_norm": 1.1172599612597327, "learning_rate": 1.7946378922830386e-05, "loss": 1.625, "step": 809 }, { "epoch": 0.6558704453441295, "grad_norm": 1.0552333200507726, "learning_rate": 1.7941157904111346e-05, "loss": 1.7037, "step": 810 }, { "epoch": 0.6566801619433198, "grad_norm": 1.080338568915127, "learning_rate": 1.7935931018490923e-05, "loss": 1.6834, "step": 811 }, { "epoch": 0.6574898785425102, "grad_norm": 1.086653104326212, "learning_rate": 1.7930698269830733e-05, "loss": 1.6364, "step": 812 }, { "epoch": 0.6582995951417004, "grad_norm": 1.0712198372881643, "learning_rate": 1.7925459661996707e-05, "loss": 1.5958, "step": 813 }, { "epoch": 0.6591093117408907, "grad_norm": 1.1256703101949748, "learning_rate": 1.7920215198859114e-05, "loss": 1.6405, "step": 814 }, { "epoch": 0.659919028340081, "grad_norm": 1.1116563555135301, "learning_rate": 1.7914964884292543e-05, "loss": 1.6689, "step": 815 }, { "epoch": 0.6607287449392713, "grad_norm": 1.12107713293399, "learning_rate": 1.7909708722175914e-05, "loss": 1.6928, "step": 816 }, { "epoch": 0.6615384615384615, "grad_norm": 1.1619550044989124, "learning_rate": 1.7904446716392457e-05, "loss": 1.6034, "step": 817 }, { "epoch": 0.6623481781376518, "grad_norm": 1.1005819448367389, "learning_rate": 1.789917887082973e-05, "loss": 1.6888, "step": 818 }, { "epoch": 0.6631578947368421, "grad_norm": 1.1827182789963149, "learning_rate": 1.7893905189379594e-05, "loss": 1.6213, "step": 819 }, { "epoch": 0.6639676113360324, "grad_norm": 1.1422059406503742, "learning_rate": 1.7888625675938237e-05, "loss": 1.6266, "step": 820 }, { "epoch": 0.6647773279352227, "grad_norm": 1.0470507697677474, "learning_rate": 1.7883340334406136e-05, "loss": 1.6561, "step": 821 }, { "epoch": 0.665587044534413, "grad_norm": 1.1216106844368927, "learning_rate": 1.7878049168688087e-05, "loss": 1.6705, "step": 822 }, { "epoch": 0.6663967611336032, "grad_norm": 1.2138659468039459, "learning_rate": 1.787275218269319e-05, "loss": 1.6336, "step": 823 }, { "epoch": 0.6672064777327935, "grad_norm": 1.0986476320949485, "learning_rate": 1.7867449380334834e-05, "loss": 1.7078, "step": 824 }, { "epoch": 0.6680161943319838, "grad_norm": 1.288830051080927, "learning_rate": 1.7862140765530718e-05, "loss": 1.6254, "step": 825 }, { "epoch": 0.668825910931174, "grad_norm": 1.1447438193383035, "learning_rate": 1.7856826342202828e-05, "loss": 1.6201, "step": 826 }, { "epoch": 0.6696356275303643, "grad_norm": 1.1649226403537143, "learning_rate": 1.785150611427744e-05, "loss": 1.6864, "step": 827 }, { "epoch": 0.6704453441295547, "grad_norm": 1.118218283333164, "learning_rate": 1.7846180085685122e-05, "loss": 1.6007, "step": 828 }, { "epoch": 0.671255060728745, "grad_norm": 1.1014855871694855, "learning_rate": 1.7840848260360728e-05, "loss": 1.6499, "step": 829 }, { "epoch": 0.6720647773279352, "grad_norm": 1.2218105006616033, "learning_rate": 1.783551064224339e-05, "loss": 1.6338, "step": 830 }, { "epoch": 0.6728744939271255, "grad_norm": 1.1476489793760862, "learning_rate": 1.7830167235276524e-05, "loss": 1.6073, "step": 831 }, { "epoch": 0.6736842105263158, "grad_norm": 1.1653286803987006, "learning_rate": 1.7824818043407828e-05, "loss": 1.5989, "step": 832 }, { "epoch": 0.674493927125506, "grad_norm": 1.1159344507824178, "learning_rate": 1.7819463070589256e-05, "loss": 1.6211, "step": 833 }, { "epoch": 0.6753036437246963, "grad_norm": 1.0909665741148316, "learning_rate": 1.781410232077705e-05, "loss": 1.6052, "step": 834 }, { "epoch": 0.6761133603238867, "grad_norm": 1.1405758039320455, "learning_rate": 1.7808735797931715e-05, "loss": 1.7158, "step": 835 }, { "epoch": 0.676923076923077, "grad_norm": 1.1662611008186603, "learning_rate": 1.780336350601802e-05, "loss": 1.714, "step": 836 }, { "epoch": 0.6777327935222672, "grad_norm": 1.1318426366131789, "learning_rate": 1.7797985449004996e-05, "loss": 1.6362, "step": 837 }, { "epoch": 0.6785425101214575, "grad_norm": 1.1122001699040855, "learning_rate": 1.7792601630865937e-05, "loss": 1.6337, "step": 838 }, { "epoch": 0.6793522267206478, "grad_norm": 1.1309770551503653, "learning_rate": 1.7787212055578383e-05, "loss": 1.5989, "step": 839 }, { "epoch": 0.680161943319838, "grad_norm": 1.1673264771022611, "learning_rate": 1.7781816727124138e-05, "loss": 1.6649, "step": 840 }, { "epoch": 0.6809716599190283, "grad_norm": 1.1003607212760034, "learning_rate": 1.7776415649489257e-05, "loss": 1.6183, "step": 841 }, { "epoch": 0.6817813765182186, "grad_norm": 1.1092869043197529, "learning_rate": 1.7771008826664036e-05, "loss": 1.6678, "step": 842 }, { "epoch": 0.682591093117409, "grad_norm": 1.2211528749408376, "learning_rate": 1.7765596262643013e-05, "loss": 1.6584, "step": 843 }, { "epoch": 0.6834008097165992, "grad_norm": 1.1758598116755494, "learning_rate": 1.776017796142498e-05, "loss": 1.6415, "step": 844 }, { "epoch": 0.6842105263157895, "grad_norm": 1.1767326941123255, "learning_rate": 1.7754753927012955e-05, "loss": 1.6265, "step": 845 }, { "epoch": 0.6850202429149798, "grad_norm": 1.1747867351240824, "learning_rate": 1.77493241634142e-05, "loss": 1.625, "step": 846 }, { "epoch": 0.68582995951417, "grad_norm": 1.1345683268627573, "learning_rate": 1.7743888674640203e-05, "loss": 1.6418, "step": 847 }, { "epoch": 0.6866396761133603, "grad_norm": 1.1283603891809075, "learning_rate": 1.773844746470669e-05, "loss": 1.6412, "step": 848 }, { "epoch": 0.6874493927125506, "grad_norm": 1.1378377091313905, "learning_rate": 1.7733000537633605e-05, "loss": 1.6298, "step": 849 }, { "epoch": 0.6882591093117408, "grad_norm": 1.1253702667997967, "learning_rate": 1.7727547897445117e-05, "loss": 1.6472, "step": 850 }, { "epoch": 0.6890688259109312, "grad_norm": 1.1529285701924186, "learning_rate": 1.772208954816963e-05, "loss": 1.6063, "step": 851 }, { "epoch": 0.6898785425101215, "grad_norm": 1.2088268908844393, "learning_rate": 1.771662549383974e-05, "loss": 1.6893, "step": 852 }, { "epoch": 0.6906882591093118, "grad_norm": 1.1008137117384047, "learning_rate": 1.7711155738492286e-05, "loss": 1.7038, "step": 853 }, { "epoch": 0.691497975708502, "grad_norm": 1.1203578405697336, "learning_rate": 1.7705680286168297e-05, "loss": 1.691, "step": 854 }, { "epoch": 0.6923076923076923, "grad_norm": 1.0904874884732818, "learning_rate": 1.770019914091302e-05, "loss": 1.66, "step": 855 }, { "epoch": 0.6931174089068826, "grad_norm": 1.2342232334641767, "learning_rate": 1.769471230677591e-05, "loss": 1.6029, "step": 856 }, { "epoch": 0.6939271255060728, "grad_norm": 1.1892221297007457, "learning_rate": 1.7689219787810618e-05, "loss": 1.6115, "step": 857 }, { "epoch": 0.6947368421052632, "grad_norm": 1.1436275460030911, "learning_rate": 1.7683721588075005e-05, "loss": 1.6779, "step": 858 }, { "epoch": 0.6955465587044535, "grad_norm": 1.0531746133706077, "learning_rate": 1.7678217711631115e-05, "loss": 1.6254, "step": 859 }, { "epoch": 0.6963562753036437, "grad_norm": 1.204345691882841, "learning_rate": 1.76727081625452e-05, "loss": 1.7005, "step": 860 }, { "epoch": 0.697165991902834, "grad_norm": 1.1113015688625698, "learning_rate": 1.7667192944887696e-05, "loss": 1.6242, "step": 861 }, { "epoch": 0.6979757085020243, "grad_norm": 1.0642180179681062, "learning_rate": 1.7661672062733226e-05, "loss": 1.6057, "step": 862 }, { "epoch": 0.6987854251012146, "grad_norm": 1.1074568311327786, "learning_rate": 1.76561455201606e-05, "loss": 1.6569, "step": 863 }, { "epoch": 0.6995951417004048, "grad_norm": 1.1475467339069338, "learning_rate": 1.765061332125281e-05, "loss": 1.6668, "step": 864 }, { "epoch": 0.7004048582995951, "grad_norm": 1.0613765631942633, "learning_rate": 1.7645075470097024e-05, "loss": 1.6471, "step": 865 }, { "epoch": 0.7012145748987855, "grad_norm": 1.1262125184431948, "learning_rate": 1.7639531970784594e-05, "loss": 1.632, "step": 866 }, { "epoch": 0.7020242914979757, "grad_norm": 1.053405645515529, "learning_rate": 1.763398282741103e-05, "loss": 1.6628, "step": 867 }, { "epoch": 0.702834008097166, "grad_norm": 1.053485822528232, "learning_rate": 1.762842804407603e-05, "loss": 1.6268, "step": 868 }, { "epoch": 0.7036437246963563, "grad_norm": 1.1067274819740494, "learning_rate": 1.7622867624883446e-05, "loss": 1.6268, "step": 869 }, { "epoch": 0.7044534412955465, "grad_norm": 1.1365300210950073, "learning_rate": 1.7617301573941296e-05, "loss": 1.6751, "step": 870 }, { "epoch": 0.7052631578947368, "grad_norm": 1.0944652038336933, "learning_rate": 1.7611729895361763e-05, "loss": 1.7041, "step": 871 }, { "epoch": 0.7060728744939271, "grad_norm": 1.0536399181541465, "learning_rate": 1.760615259326118e-05, "loss": 1.6283, "step": 872 }, { "epoch": 0.7068825910931175, "grad_norm": 1.0489723553006804, "learning_rate": 1.760056967176005e-05, "loss": 1.7136, "step": 873 }, { "epoch": 0.7076923076923077, "grad_norm": 1.0719751555919763, "learning_rate": 1.7594981134983003e-05, "loss": 1.6001, "step": 874 }, { "epoch": 0.708502024291498, "grad_norm": 1.1499881882739111, "learning_rate": 1.758938698705884e-05, "loss": 1.677, "step": 875 }, { "epoch": 0.7093117408906883, "grad_norm": 1.101761750936134, "learning_rate": 1.75837872321205e-05, "loss": 1.5747, "step": 876 }, { "epoch": 0.7101214574898785, "grad_norm": 1.077980130931622, "learning_rate": 1.757818187430506e-05, "loss": 1.6827, "step": 877 }, { "epoch": 0.7109311740890688, "grad_norm": 1.0897713278469439, "learning_rate": 1.757257091775374e-05, "loss": 1.659, "step": 878 }, { "epoch": 0.7117408906882591, "grad_norm": 1.4838670869876838, "learning_rate": 1.7566954366611896e-05, "loss": 1.6481, "step": 879 }, { "epoch": 0.7125506072874493, "grad_norm": 1.1522319490945014, "learning_rate": 1.7561332225029022e-05, "loss": 1.6426, "step": 880 }, { "epoch": 0.7133603238866397, "grad_norm": 1.1601026341967655, "learning_rate": 1.7555704497158734e-05, "loss": 1.6299, "step": 881 }, { "epoch": 0.71417004048583, "grad_norm": 1.0844245975035776, "learning_rate": 1.755007118715878e-05, "loss": 1.6366, "step": 882 }, { "epoch": 0.7149797570850203, "grad_norm": 1.106288732347795, "learning_rate": 1.754443229919103e-05, "loss": 1.6275, "step": 883 }, { "epoch": 0.7157894736842105, "grad_norm": 1.0464400021747562, "learning_rate": 1.7538787837421475e-05, "loss": 1.5273, "step": 884 }, { "epoch": 0.7165991902834008, "grad_norm": 1.1156141109394004, "learning_rate": 1.7533137806020226e-05, "loss": 1.6415, "step": 885 }, { "epoch": 0.7174089068825911, "grad_norm": 1.1617480668291316, "learning_rate": 1.752748220916151e-05, "loss": 1.6285, "step": 886 }, { "epoch": 0.7182186234817813, "grad_norm": 1.0663413051063326, "learning_rate": 1.752182105102366e-05, "loss": 1.6715, "step": 887 }, { "epoch": 0.7190283400809716, "grad_norm": 1.1351571282717112, "learning_rate": 1.7516154335789123e-05, "loss": 1.676, "step": 888 }, { "epoch": 0.719838056680162, "grad_norm": 1.0745374695294985, "learning_rate": 1.751048206764445e-05, "loss": 1.6808, "step": 889 }, { "epoch": 0.7206477732793523, "grad_norm": 1.1821923099732665, "learning_rate": 1.7504804250780292e-05, "loss": 1.6717, "step": 890 }, { "epoch": 0.7214574898785425, "grad_norm": 1.1136129753607058, "learning_rate": 1.7499120889391403e-05, "loss": 1.6181, "step": 891 }, { "epoch": 0.7222672064777328, "grad_norm": 1.114536036094605, "learning_rate": 1.7493431987676628e-05, "loss": 1.6641, "step": 892 }, { "epoch": 0.7230769230769231, "grad_norm": 1.2106559019950824, "learning_rate": 1.7487737549838915e-05, "loss": 1.673, "step": 893 }, { "epoch": 0.7238866396761133, "grad_norm": 1.1049671120265863, "learning_rate": 1.748203758008529e-05, "loss": 1.6829, "step": 894 }, { "epoch": 0.7246963562753036, "grad_norm": 1.0330699329285005, "learning_rate": 1.747633208262688e-05, "loss": 1.5981, "step": 895 }, { "epoch": 0.725506072874494, "grad_norm": 1.122795995838303, "learning_rate": 1.747062106167888e-05, "loss": 1.6874, "step": 896 }, { "epoch": 0.7263157894736842, "grad_norm": 1.0411327718132521, "learning_rate": 1.7464904521460574e-05, "loss": 1.6655, "step": 897 }, { "epoch": 0.7271255060728745, "grad_norm": 1.094203881932362, "learning_rate": 1.7459182466195328e-05, "loss": 1.6611, "step": 898 }, { "epoch": 0.7279352226720648, "grad_norm": 1.1197343663542865, "learning_rate": 1.7453454900110575e-05, "loss": 1.6484, "step": 899 }, { "epoch": 0.728744939271255, "grad_norm": 1.0891684716999215, "learning_rate": 1.744772182743782e-05, "loss": 1.6823, "step": 900 }, { "epoch": 0.7295546558704453, "grad_norm": 1.0117179095825084, "learning_rate": 1.744198325241264e-05, "loss": 1.5874, "step": 901 }, { "epoch": 0.7303643724696356, "grad_norm": 1.11669367321381, "learning_rate": 1.743623917927468e-05, "loss": 1.6327, "step": 902 }, { "epoch": 0.7311740890688259, "grad_norm": 1.0477262743830713, "learning_rate": 1.7430489612267634e-05, "loss": 1.5939, "step": 903 }, { "epoch": 0.7319838056680162, "grad_norm": 1.1755335589263856, "learning_rate": 1.742473455563927e-05, "loss": 1.6671, "step": 904 }, { "epoch": 0.7327935222672065, "grad_norm": 1.1271564518170447, "learning_rate": 1.74189740136414e-05, "loss": 1.6179, "step": 905 }, { "epoch": 0.7336032388663968, "grad_norm": 1.1024780353509462, "learning_rate": 1.7413207990529897e-05, "loss": 1.6315, "step": 906 }, { "epoch": 0.734412955465587, "grad_norm": 1.057413980088522, "learning_rate": 1.7407436490564675e-05, "loss": 1.6019, "step": 907 }, { "epoch": 0.7352226720647773, "grad_norm": 1.0903114614208873, "learning_rate": 1.740165951800971e-05, "loss": 1.6435, "step": 908 }, { "epoch": 0.7360323886639676, "grad_norm": 1.1465351066882732, "learning_rate": 1.7395877077132996e-05, "loss": 1.6659, "step": 909 }, { "epoch": 0.7368421052631579, "grad_norm": 1.0879106203040465, "learning_rate": 1.7390089172206594e-05, "loss": 1.6035, "step": 910 }, { "epoch": 0.7376518218623481, "grad_norm": 1.0251909256160818, "learning_rate": 1.738429580750658e-05, "loss": 1.6751, "step": 911 }, { "epoch": 0.7384615384615385, "grad_norm": 1.101454173734882, "learning_rate": 1.7378496987313078e-05, "loss": 1.6924, "step": 912 }, { "epoch": 0.7392712550607288, "grad_norm": 1.0632875331068412, "learning_rate": 1.7372692715910236e-05, "loss": 1.7052, "step": 913 }, { "epoch": 0.740080971659919, "grad_norm": 1.2019849547754717, "learning_rate": 1.736688299758623e-05, "loss": 1.6453, "step": 914 }, { "epoch": 0.7408906882591093, "grad_norm": 1.0424618834849857, "learning_rate": 1.736106783663326e-05, "loss": 1.6095, "step": 915 }, { "epoch": 0.7417004048582996, "grad_norm": 1.119522754130525, "learning_rate": 1.735524723734755e-05, "loss": 1.5714, "step": 916 }, { "epoch": 0.7425101214574898, "grad_norm": 1.1126763401822644, "learning_rate": 1.7349421204029343e-05, "loss": 1.5884, "step": 917 }, { "epoch": 0.7433198380566801, "grad_norm": 1.1436206847704529, "learning_rate": 1.734358974098288e-05, "loss": 1.6716, "step": 918 }, { "epoch": 0.7441295546558705, "grad_norm": 1.1033761896656453, "learning_rate": 1.7337752852516443e-05, "loss": 1.6642, "step": 919 }, { "epoch": 0.7449392712550608, "grad_norm": 1.0771058808303946, "learning_rate": 1.7331910542942298e-05, "loss": 1.6655, "step": 920 }, { "epoch": 0.745748987854251, "grad_norm": 1.1540799878521129, "learning_rate": 1.732606281657672e-05, "loss": 1.6533, "step": 921 }, { "epoch": 0.7465587044534413, "grad_norm": 1.1632383389035768, "learning_rate": 1.732020967774e-05, "loss": 1.6251, "step": 922 }, { "epoch": 0.7473684210526316, "grad_norm": 1.0554497626152348, "learning_rate": 1.7314351130756412e-05, "loss": 1.6913, "step": 923 }, { "epoch": 0.7481781376518218, "grad_norm": 1.0558743109746518, "learning_rate": 1.7308487179954233e-05, "loss": 1.6377, "step": 924 }, { "epoch": 0.7489878542510121, "grad_norm": 1.0485799147334454, "learning_rate": 1.7302617829665725e-05, "loss": 1.6837, "step": 925 }, { "epoch": 0.7497975708502024, "grad_norm": 1.1396931481843333, "learning_rate": 1.729674308422715e-05, "loss": 1.6475, "step": 926 }, { "epoch": 0.7506072874493928, "grad_norm": 1.089831914564661, "learning_rate": 1.7290862947978753e-05, "loss": 1.6612, "step": 927 }, { "epoch": 0.751417004048583, "grad_norm": 1.214076933636336, "learning_rate": 1.7284977425264755e-05, "loss": 1.6558, "step": 928 }, { "epoch": 0.7522267206477733, "grad_norm": 1.0860938684718011, "learning_rate": 1.727908652043336e-05, "loss": 1.6885, "step": 929 }, { "epoch": 0.7530364372469636, "grad_norm": 1.101065862505211, "learning_rate": 1.7273190237836757e-05, "loss": 1.6578, "step": 930 }, { "epoch": 0.7538461538461538, "grad_norm": 1.1342711520778153, "learning_rate": 1.726728858183109e-05, "loss": 1.5637, "step": 931 }, { "epoch": 0.7546558704453441, "grad_norm": 1.0939363758915137, "learning_rate": 1.726138155677649e-05, "loss": 1.6303, "step": 932 }, { "epoch": 0.7554655870445344, "grad_norm": 1.0310679045269706, "learning_rate": 1.725546916703704e-05, "loss": 1.5678, "step": 933 }, { "epoch": 0.7562753036437248, "grad_norm": 1.1728811689558527, "learning_rate": 1.7249551416980806e-05, "loss": 1.6611, "step": 934 }, { "epoch": 0.757085020242915, "grad_norm": 1.1888012056779975, "learning_rate": 1.7243628310979793e-05, "loss": 1.6931, "step": 935 }, { "epoch": 0.7578947368421053, "grad_norm": 1.1056546162441991, "learning_rate": 1.7237699853409974e-05, "loss": 1.6788, "step": 936 }, { "epoch": 0.7587044534412956, "grad_norm": 1.1020429695381275, "learning_rate": 1.7231766048651272e-05, "loss": 1.6557, "step": 937 }, { "epoch": 0.7595141700404858, "grad_norm": 1.174830307710626, "learning_rate": 1.722582690108757e-05, "loss": 1.5964, "step": 938 }, { "epoch": 0.7603238866396761, "grad_norm": 1.1111495814988965, "learning_rate": 1.7219882415106685e-05, "loss": 1.6547, "step": 939 }, { "epoch": 0.7611336032388664, "grad_norm": 1.1511962350490403, "learning_rate": 1.7213932595100384e-05, "loss": 1.6447, "step": 940 }, { "epoch": 0.7619433198380566, "grad_norm": 1.1124431654124347, "learning_rate": 1.7207977445464374e-05, "loss": 1.6756, "step": 941 }, { "epoch": 0.762753036437247, "grad_norm": 1.094632192703889, "learning_rate": 1.7202016970598303e-05, "loss": 1.6216, "step": 942 }, { "epoch": 0.7635627530364373, "grad_norm": 1.0848286346166132, "learning_rate": 1.7196051174905746e-05, "loss": 1.6162, "step": 943 }, { "epoch": 0.7643724696356275, "grad_norm": 1.117835402570124, "learning_rate": 1.719008006279422e-05, "loss": 1.6988, "step": 944 }, { "epoch": 0.7651821862348178, "grad_norm": 1.0695545232404842, "learning_rate": 1.7184103638675157e-05, "loss": 1.6417, "step": 945 }, { "epoch": 0.7659919028340081, "grad_norm": 1.2187648019898212, "learning_rate": 1.7178121906963925e-05, "loss": 1.5581, "step": 946 }, { "epoch": 0.7668016194331984, "grad_norm": 1.1428081923285947, "learning_rate": 1.71721348720798e-05, "loss": 1.5958, "step": 947 }, { "epoch": 0.7676113360323886, "grad_norm": 1.1258009592099063, "learning_rate": 1.7166142538445986e-05, "loss": 1.7054, "step": 948 }, { "epoch": 0.7684210526315789, "grad_norm": 1.122110087454184, "learning_rate": 1.7160144910489602e-05, "loss": 1.6479, "step": 949 }, { "epoch": 0.7692307692307693, "grad_norm": 1.2197448941368936, "learning_rate": 1.715414199264168e-05, "loss": 1.6362, "step": 950 }, { "epoch": 0.7700404858299595, "grad_norm": 1.081986455490488, "learning_rate": 1.7148133789337145e-05, "loss": 1.6376, "step": 951 }, { "epoch": 0.7708502024291498, "grad_norm": 1.0866654156187887, "learning_rate": 1.7142120305014848e-05, "loss": 1.6275, "step": 952 }, { "epoch": 0.7716599190283401, "grad_norm": 1.1078330177065072, "learning_rate": 1.7136101544117526e-05, "loss": 1.62, "step": 953 }, { "epoch": 0.7724696356275303, "grad_norm": 1.0817145396361252, "learning_rate": 1.7130077511091817e-05, "loss": 1.6552, "step": 954 }, { "epoch": 0.7732793522267206, "grad_norm": 1.105540137190186, "learning_rate": 1.7124048210388268e-05, "loss": 1.6459, "step": 955 }, { "epoch": 0.7740890688259109, "grad_norm": 1.0847839201754785, "learning_rate": 1.7118013646461295e-05, "loss": 1.6759, "step": 956 }, { "epoch": 0.7748987854251013, "grad_norm": 1.084427169214924, "learning_rate": 1.711197382376922e-05, "loss": 1.6833, "step": 957 }, { "epoch": 0.7757085020242915, "grad_norm": 1.131195586447811, "learning_rate": 1.710592874677424e-05, "loss": 1.6091, "step": 958 }, { "epoch": 0.7765182186234818, "grad_norm": 1.0521712965264904, "learning_rate": 1.7099878419942444e-05, "loss": 1.6319, "step": 959 }, { "epoch": 0.7773279352226721, "grad_norm": 1.0638585192349859, "learning_rate": 1.709382284774379e-05, "loss": 1.6039, "step": 960 }, { "epoch": 0.7781376518218623, "grad_norm": 1.1423083079533596, "learning_rate": 1.7087762034652113e-05, "loss": 1.672, "step": 961 }, { "epoch": 0.7789473684210526, "grad_norm": 1.1028125449584352, "learning_rate": 1.7081695985145124e-05, "loss": 1.6914, "step": 962 }, { "epoch": 0.7797570850202429, "grad_norm": 1.1268022996799139, "learning_rate": 1.7075624703704404e-05, "loss": 1.7132, "step": 963 }, { "epoch": 0.7805668016194331, "grad_norm": 1.1209367289547245, "learning_rate": 1.7069548194815387e-05, "loss": 1.6357, "step": 964 }, { "epoch": 0.7813765182186235, "grad_norm": 1.1887708228854563, "learning_rate": 1.706346646296739e-05, "loss": 1.5665, "step": 965 }, { "epoch": 0.7821862348178138, "grad_norm": 1.118443433275976, "learning_rate": 1.7057379512653565e-05, "loss": 1.6763, "step": 966 }, { "epoch": 0.7829959514170041, "grad_norm": 1.0454329186401545, "learning_rate": 1.7051287348370934e-05, "loss": 1.5866, "step": 967 }, { "epoch": 0.7838056680161943, "grad_norm": 1.076636884210294, "learning_rate": 1.704518997462037e-05, "loss": 1.6171, "step": 968 }, { "epoch": 0.7846153846153846, "grad_norm": 1.02837143519001, "learning_rate": 1.7039087395906593e-05, "loss": 1.6411, "step": 969 }, { "epoch": 0.7854251012145749, "grad_norm": 1.0924312535153355, "learning_rate": 1.7032979616738167e-05, "loss": 1.6169, "step": 970 }, { "epoch": 0.7862348178137651, "grad_norm": 1.0621344949493872, "learning_rate": 1.7026866641627503e-05, "loss": 1.6543, "step": 971 }, { "epoch": 0.7870445344129555, "grad_norm": 1.0840593933228118, "learning_rate": 1.7020748475090835e-05, "loss": 1.6604, "step": 972 }, { "epoch": 0.7878542510121458, "grad_norm": 1.0324309043206548, "learning_rate": 1.701462512164826e-05, "loss": 1.6547, "step": 973 }, { "epoch": 0.7886639676113361, "grad_norm": 1.1347770119446707, "learning_rate": 1.700849658582368e-05, "loss": 1.6716, "step": 974 }, { "epoch": 0.7894736842105263, "grad_norm": 1.1772883414899646, "learning_rate": 1.7002362872144843e-05, "loss": 1.6968, "step": 975 }, { "epoch": 0.7902834008097166, "grad_norm": 1.1298734658918796, "learning_rate": 1.6996223985143314e-05, "loss": 1.7227, "step": 976 }, { "epoch": 0.7910931174089069, "grad_norm": 1.0893915856547574, "learning_rate": 1.6990079929354485e-05, "loss": 1.6051, "step": 977 }, { "epoch": 0.7919028340080971, "grad_norm": 1.0564375360450058, "learning_rate": 1.698393070931756e-05, "loss": 1.6565, "step": 978 }, { "epoch": 0.7927125506072874, "grad_norm": 1.0465087498034826, "learning_rate": 1.6977776329575566e-05, "loss": 1.6737, "step": 979 }, { "epoch": 0.7935222672064778, "grad_norm": 1.0938693523776055, "learning_rate": 1.697161679467534e-05, "loss": 1.6149, "step": 980 }, { "epoch": 0.794331983805668, "grad_norm": 1.1328659849792375, "learning_rate": 1.696545210916752e-05, "loss": 1.6203, "step": 981 }, { "epoch": 0.7951417004048583, "grad_norm": 1.1231511367270903, "learning_rate": 1.695928227760656e-05, "loss": 1.7056, "step": 982 }, { "epoch": 0.7959514170040486, "grad_norm": 1.1090374560264693, "learning_rate": 1.6953107304550714e-05, "loss": 1.5986, "step": 983 }, { "epoch": 0.7967611336032389, "grad_norm": 1.0770482845738483, "learning_rate": 1.694692719456202e-05, "loss": 1.6314, "step": 984 }, { "epoch": 0.7975708502024291, "grad_norm": 1.0762090702083147, "learning_rate": 1.6940741952206342e-05, "loss": 1.6346, "step": 985 }, { "epoch": 0.7983805668016194, "grad_norm": 1.0669244133657225, "learning_rate": 1.69345515820533e-05, "loss": 1.6763, "step": 986 }, { "epoch": 0.7991902834008097, "grad_norm": 1.062879048024205, "learning_rate": 1.6928356088676325e-05, "loss": 1.7014, "step": 987 }, { "epoch": 0.8, "grad_norm": 1.0947855744497017, "learning_rate": 1.6922155476652625e-05, "loss": 1.631, "step": 988 }, { "epoch": 0.8008097165991903, "grad_norm": 1.034452979238415, "learning_rate": 1.6915949750563202e-05, "loss": 1.6141, "step": 989 }, { "epoch": 0.8016194331983806, "grad_norm": 1.0485411735651209, "learning_rate": 1.6909738914992812e-05, "loss": 1.6518, "step": 990 }, { "epoch": 0.8024291497975709, "grad_norm": 1.055581466504577, "learning_rate": 1.6903522974530005e-05, "loss": 1.613, "step": 991 }, { "epoch": 0.8032388663967611, "grad_norm": 1.0560876471593497, "learning_rate": 1.6897301933767103e-05, "loss": 1.7046, "step": 992 }, { "epoch": 0.8040485829959514, "grad_norm": 1.1003223798755561, "learning_rate": 1.6891075797300184e-05, "loss": 1.6625, "step": 993 }, { "epoch": 0.8048582995951417, "grad_norm": 1.1021700834191077, "learning_rate": 1.6884844569729097e-05, "loss": 1.6678, "step": 994 }, { "epoch": 0.805668016194332, "grad_norm": 1.1166959526421196, "learning_rate": 1.6878608255657457e-05, "loss": 1.6381, "step": 995 }, { "epoch": 0.8064777327935223, "grad_norm": 1.065512317839742, "learning_rate": 1.687236685969263e-05, "loss": 1.6296, "step": 996 }, { "epoch": 0.8072874493927126, "grad_norm": 1.1156855620907848, "learning_rate": 1.6866120386445737e-05, "loss": 1.6858, "step": 997 }, { "epoch": 0.8080971659919028, "grad_norm": 1.0953706109586931, "learning_rate": 1.6859868840531654e-05, "loss": 1.7635, "step": 998 }, { "epoch": 0.8089068825910931, "grad_norm": 1.0233507628776497, "learning_rate": 1.6853612226569005e-05, "loss": 1.6315, "step": 999 }, { "epoch": 0.8097165991902834, "grad_norm": 1.1751028012727203, "learning_rate": 1.6847350549180148e-05, "loss": 1.6798, "step": 1000 }, { "epoch": 0.8105263157894737, "grad_norm": 1.128106775328797, "learning_rate": 1.68410838129912e-05, "loss": 1.667, "step": 1001 }, { "epoch": 0.8113360323886639, "grad_norm": 1.0812082492824788, "learning_rate": 1.6834812022632e-05, "loss": 1.6104, "step": 1002 }, { "epoch": 0.8121457489878543, "grad_norm": 1.1490248467998732, "learning_rate": 1.682853518273612e-05, "loss": 1.6542, "step": 1003 }, { "epoch": 0.8129554655870446, "grad_norm": 1.0715868026423407, "learning_rate": 1.6822253297940876e-05, "loss": 1.6435, "step": 1004 }, { "epoch": 0.8137651821862348, "grad_norm": 1.117058675593047, "learning_rate": 1.6815966372887305e-05, "loss": 1.6082, "step": 1005 }, { "epoch": 0.8145748987854251, "grad_norm": 1.1438973084718087, "learning_rate": 1.6809674412220166e-05, "loss": 1.6409, "step": 1006 }, { "epoch": 0.8153846153846154, "grad_norm": 1.1167195347301986, "learning_rate": 1.6803377420587935e-05, "loss": 1.6139, "step": 1007 }, { "epoch": 0.8161943319838056, "grad_norm": 1.050529006455886, "learning_rate": 1.679707540264281e-05, "loss": 1.6202, "step": 1008 }, { "epoch": 0.8170040485829959, "grad_norm": 1.0456426778952734, "learning_rate": 1.6790768363040704e-05, "loss": 1.6511, "step": 1009 }, { "epoch": 0.8178137651821862, "grad_norm": 1.0744209810818393, "learning_rate": 1.6784456306441234e-05, "loss": 1.6343, "step": 1010 }, { "epoch": 0.8186234817813766, "grad_norm": 1.0454475493984707, "learning_rate": 1.6778139237507728e-05, "loss": 1.6411, "step": 1011 }, { "epoch": 0.8194331983805668, "grad_norm": 1.1450988763255063, "learning_rate": 1.6771817160907214e-05, "loss": 1.6547, "step": 1012 }, { "epoch": 0.8202429149797571, "grad_norm": 1.0820294221210558, "learning_rate": 1.6765490081310426e-05, "loss": 1.7049, "step": 1013 }, { "epoch": 0.8210526315789474, "grad_norm": 1.0795056916204937, "learning_rate": 1.6759158003391783e-05, "loss": 1.6503, "step": 1014 }, { "epoch": 0.8218623481781376, "grad_norm": 1.0754101028602392, "learning_rate": 1.675282093182941e-05, "loss": 1.6015, "step": 1015 }, { "epoch": 0.8226720647773279, "grad_norm": 1.0902544138244956, "learning_rate": 1.674647887130511e-05, "loss": 1.6625, "step": 1016 }, { "epoch": 0.8234817813765182, "grad_norm": 1.0728952819859472, "learning_rate": 1.674013182650438e-05, "loss": 1.6318, "step": 1017 }, { "epoch": 0.8242914979757086, "grad_norm": 1.087191133019499, "learning_rate": 1.673377980211639e-05, "loss": 1.607, "step": 1018 }, { "epoch": 0.8251012145748988, "grad_norm": 1.092471584016294, "learning_rate": 1.6727422802834e-05, "loss": 1.6151, "step": 1019 }, { "epoch": 0.8259109311740891, "grad_norm": 1.0942633386482077, "learning_rate": 1.672106083335374e-05, "loss": 1.6538, "step": 1020 }, { "epoch": 0.8267206477732794, "grad_norm": 1.0611257052505665, "learning_rate": 1.671469389837581e-05, "loss": 1.6813, "step": 1021 }, { "epoch": 0.8275303643724696, "grad_norm": 1.1243190686891869, "learning_rate": 1.6708322002604085e-05, "loss": 1.6294, "step": 1022 }, { "epoch": 0.8283400809716599, "grad_norm": 1.1202657006614518, "learning_rate": 1.6701945150746094e-05, "loss": 1.6243, "step": 1023 }, { "epoch": 0.8291497975708502, "grad_norm": 1.1727083505566944, "learning_rate": 1.6695563347513036e-05, "loss": 1.631, "step": 1024 }, { "epoch": 0.8299595141700404, "grad_norm": 1.0959511539538467, "learning_rate": 1.6689176597619773e-05, "loss": 1.7506, "step": 1025 }, { "epoch": 0.8307692307692308, "grad_norm": 1.1456195238430023, "learning_rate": 1.6682784905784808e-05, "loss": 1.6222, "step": 1026 }, { "epoch": 0.8315789473684211, "grad_norm": 1.1142846493104133, "learning_rate": 1.6676388276730305e-05, "loss": 1.5693, "step": 1027 }, { "epoch": 0.8323886639676114, "grad_norm": 1.054064976471123, "learning_rate": 1.6669986715182064e-05, "loss": 1.677, "step": 1028 }, { "epoch": 0.8331983805668016, "grad_norm": 1.026287560156129, "learning_rate": 1.6663580225869554e-05, "loss": 1.711, "step": 1029 }, { "epoch": 0.8340080971659919, "grad_norm": 1.1059699325557255, "learning_rate": 1.6657168813525855e-05, "loss": 1.6314, "step": 1030 }, { "epoch": 0.8348178137651822, "grad_norm": 1.0663812567827384, "learning_rate": 1.6650752482887698e-05, "loss": 1.6292, "step": 1031 }, { "epoch": 0.8356275303643724, "grad_norm": 1.1308684576404386, "learning_rate": 1.6644331238695454e-05, "loss": 1.6018, "step": 1032 }, { "epoch": 0.8364372469635628, "grad_norm": 1.0658872985205445, "learning_rate": 1.6637905085693113e-05, "loss": 1.603, "step": 1033 }, { "epoch": 0.8372469635627531, "grad_norm": 1.0560438364461333, "learning_rate": 1.6631474028628298e-05, "loss": 1.6701, "step": 1034 }, { "epoch": 0.8380566801619433, "grad_norm": 1.097675248377845, "learning_rate": 1.662503807225225e-05, "loss": 1.6673, "step": 1035 }, { "epoch": 0.8388663967611336, "grad_norm": 1.1463142445281125, "learning_rate": 1.6618597221319835e-05, "loss": 1.6401, "step": 1036 }, { "epoch": 0.8396761133603239, "grad_norm": 1.0991588788894604, "learning_rate": 1.6612151480589526e-05, "loss": 1.6822, "step": 1037 }, { "epoch": 0.8404858299595142, "grad_norm": 1.1432757857765516, "learning_rate": 1.6605700854823427e-05, "loss": 1.6552, "step": 1038 }, { "epoch": 0.8412955465587044, "grad_norm": 1.0996521956343472, "learning_rate": 1.659924534878723e-05, "loss": 1.6155, "step": 1039 }, { "epoch": 0.8421052631578947, "grad_norm": 1.2092130985611158, "learning_rate": 1.659278496725024e-05, "loss": 1.6697, "step": 1040 }, { "epoch": 0.8429149797570851, "grad_norm": 1.1859643866916718, "learning_rate": 1.6586319714985372e-05, "loss": 1.6514, "step": 1041 }, { "epoch": 0.8437246963562753, "grad_norm": 1.096493975856786, "learning_rate": 1.6579849596769132e-05, "loss": 1.6892, "step": 1042 }, { "epoch": 0.8445344129554656, "grad_norm": 1.041583638770386, "learning_rate": 1.6573374617381622e-05, "loss": 1.5834, "step": 1043 }, { "epoch": 0.8453441295546559, "grad_norm": 1.256429469555766, "learning_rate": 1.656689478160653e-05, "loss": 1.5953, "step": 1044 }, { "epoch": 0.8461538461538461, "grad_norm": 1.1156384269735415, "learning_rate": 1.6560410094231144e-05, "loss": 1.608, "step": 1045 }, { "epoch": 0.8469635627530364, "grad_norm": 1.2175373188796819, "learning_rate": 1.655392056004633e-05, "loss": 1.7043, "step": 1046 }, { "epoch": 0.8477732793522267, "grad_norm": 1.1837512398752925, "learning_rate": 1.6547426183846527e-05, "loss": 1.6364, "step": 1047 }, { "epoch": 0.848582995951417, "grad_norm": 1.0731112756132954, "learning_rate": 1.6540926970429768e-05, "loss": 1.6826, "step": 1048 }, { "epoch": 0.8493927125506073, "grad_norm": 1.2861445088088037, "learning_rate": 1.6534422924597647e-05, "loss": 1.671, "step": 1049 }, { "epoch": 0.8502024291497976, "grad_norm": 1.2667796024188396, "learning_rate": 1.6527914051155328e-05, "loss": 1.6239, "step": 1050 }, { "epoch": 0.8510121457489879, "grad_norm": 1.0621089700252984, "learning_rate": 1.652140035491155e-05, "loss": 1.6278, "step": 1051 }, { "epoch": 0.8518218623481781, "grad_norm": 1.1959034368355665, "learning_rate": 1.6514881840678606e-05, "loss": 1.6383, "step": 1052 }, { "epoch": 0.8526315789473684, "grad_norm": 1.1679490573250864, "learning_rate": 1.650835851327236e-05, "loss": 1.6685, "step": 1053 }, { "epoch": 0.8534412955465587, "grad_norm": 1.0724625429268337, "learning_rate": 1.6501830377512214e-05, "loss": 1.6101, "step": 1054 }, { "epoch": 0.854251012145749, "grad_norm": 1.0992880410878538, "learning_rate": 1.6495297438221145e-05, "loss": 1.6482, "step": 1055 }, { "epoch": 0.8550607287449393, "grad_norm": 1.2082881920964228, "learning_rate": 1.6488759700225663e-05, "loss": 1.6947, "step": 1056 }, { "epoch": 0.8558704453441296, "grad_norm": 1.064085800343604, "learning_rate": 1.6482217168355824e-05, "loss": 1.7083, "step": 1057 }, { "epoch": 0.8566801619433199, "grad_norm": 1.2300421861334183, "learning_rate": 1.647566984744523e-05, "loss": 1.6707, "step": 1058 }, { "epoch": 0.8574898785425101, "grad_norm": 1.168895681590581, "learning_rate": 1.6469117742331023e-05, "loss": 1.5947, "step": 1059 }, { "epoch": 0.8582995951417004, "grad_norm": 1.0860055758849936, "learning_rate": 1.6462560857853876e-05, "loss": 1.6144, "step": 1060 }, { "epoch": 0.8591093117408907, "grad_norm": 1.2059795818651675, "learning_rate": 1.645599919885799e-05, "loss": 1.7368, "step": 1061 }, { "epoch": 0.8599190283400809, "grad_norm": 1.1705528700907784, "learning_rate": 1.6449432770191104e-05, "loss": 1.7067, "step": 1062 }, { "epoch": 0.8607287449392712, "grad_norm": 1.0305975110654955, "learning_rate": 1.6442861576704467e-05, "loss": 1.6808, "step": 1063 }, { "epoch": 0.8615384615384616, "grad_norm": 1.0672600681647995, "learning_rate": 1.6436285623252863e-05, "loss": 1.6516, "step": 1064 }, { "epoch": 0.8623481781376519, "grad_norm": 1.0921123850603622, "learning_rate": 1.6429704914694573e-05, "loss": 1.6578, "step": 1065 }, { "epoch": 0.8631578947368421, "grad_norm": 1.1420424627061119, "learning_rate": 1.6423119455891412e-05, "loss": 1.6166, "step": 1066 }, { "epoch": 0.8639676113360324, "grad_norm": 1.1974027318652682, "learning_rate": 1.6416529251708695e-05, "loss": 1.5715, "step": 1067 }, { "epoch": 0.8647773279352227, "grad_norm": 1.0470830216699876, "learning_rate": 1.6409934307015237e-05, "loss": 1.6083, "step": 1068 }, { "epoch": 0.8655870445344129, "grad_norm": 1.070487613342487, "learning_rate": 1.6403334626683373e-05, "loss": 1.596, "step": 1069 }, { "epoch": 0.8663967611336032, "grad_norm": 1.1529368703348062, "learning_rate": 1.6396730215588913e-05, "loss": 1.7128, "step": 1070 }, { "epoch": 0.8672064777327935, "grad_norm": 1.1573095803039568, "learning_rate": 1.639012107861118e-05, "loss": 1.5791, "step": 1071 }, { "epoch": 0.8680161943319838, "grad_norm": 1.0795488410207628, "learning_rate": 1.6383507220632983e-05, "loss": 1.6544, "step": 1072 }, { "epoch": 0.8688259109311741, "grad_norm": 1.037145472528182, "learning_rate": 1.6376888646540617e-05, "loss": 1.6401, "step": 1073 }, { "epoch": 0.8696356275303644, "grad_norm": 1.060717046272381, "learning_rate": 1.6370265361223864e-05, "loss": 1.6801, "step": 1074 }, { "epoch": 0.8704453441295547, "grad_norm": 1.0964370572935223, "learning_rate": 1.6363637369575984e-05, "loss": 1.6636, "step": 1075 }, { "epoch": 0.8712550607287449, "grad_norm": 1.0310388617035904, "learning_rate": 1.6357004676493716e-05, "loss": 1.5856, "step": 1076 }, { "epoch": 0.8720647773279352, "grad_norm": 1.0168478194178103, "learning_rate": 1.635036728687727e-05, "loss": 1.5626, "step": 1077 }, { "epoch": 0.8728744939271255, "grad_norm": 1.1227525434781638, "learning_rate": 1.6343725205630335e-05, "loss": 1.6922, "step": 1078 }, { "epoch": 0.8736842105263158, "grad_norm": 1.0084316594907183, "learning_rate": 1.633707843766005e-05, "loss": 1.6777, "step": 1079 }, { "epoch": 0.8744939271255061, "grad_norm": 1.0733928985684549, "learning_rate": 1.633042698787703e-05, "loss": 1.6883, "step": 1080 }, { "epoch": 0.8753036437246964, "grad_norm": 1.0420921585152885, "learning_rate": 1.632377086119534e-05, "loss": 1.5657, "step": 1081 }, { "epoch": 0.8761133603238866, "grad_norm": 1.0479710213227003, "learning_rate": 1.631711006253251e-05, "loss": 1.7318, "step": 1082 }, { "epoch": 0.8769230769230769, "grad_norm": 1.0675821904258518, "learning_rate": 1.6310444596809514e-05, "loss": 1.6281, "step": 1083 }, { "epoch": 0.8777327935222672, "grad_norm": 1.0092848547062299, "learning_rate": 1.6303774468950776e-05, "loss": 1.5721, "step": 1084 }, { "epoch": 0.8785425101214575, "grad_norm": 1.1037917612800165, "learning_rate": 1.6297099683884163e-05, "loss": 1.588, "step": 1085 }, { "epoch": 0.8793522267206477, "grad_norm": 1.081509721545105, "learning_rate": 1.629042024654099e-05, "loss": 1.6159, "step": 1086 }, { "epoch": 0.8801619433198381, "grad_norm": 1.0836597364116385, "learning_rate": 1.6283736161855995e-05, "loss": 1.6192, "step": 1087 }, { "epoch": 0.8809716599190284, "grad_norm": 1.0587543748723958, "learning_rate": 1.6277047434767364e-05, "loss": 1.5687, "step": 1088 }, { "epoch": 0.8817813765182186, "grad_norm": 1.0747135995302248, "learning_rate": 1.6270354070216704e-05, "loss": 1.6513, "step": 1089 }, { "epoch": 0.8825910931174089, "grad_norm": 1.0869005500984592, "learning_rate": 1.626365607314905e-05, "loss": 1.6559, "step": 1090 }, { "epoch": 0.8834008097165992, "grad_norm": 1.0595343275908542, "learning_rate": 1.625695344851286e-05, "loss": 1.6598, "step": 1091 }, { "epoch": 0.8842105263157894, "grad_norm": 1.1107620614268396, "learning_rate": 1.6250246201260017e-05, "loss": 1.6897, "step": 1092 }, { "epoch": 0.8850202429149797, "grad_norm": 1.1907108746856276, "learning_rate": 1.624353433634581e-05, "loss": 1.6426, "step": 1093 }, { "epoch": 0.8858299595141701, "grad_norm": 1.046609129170575, "learning_rate": 1.6236817858728937e-05, "loss": 1.5677, "step": 1094 }, { "epoch": 0.8866396761133604, "grad_norm": 1.0744260551582498, "learning_rate": 1.6230096773371514e-05, "loss": 1.6268, "step": 1095 }, { "epoch": 0.8874493927125506, "grad_norm": 1.1340503009391705, "learning_rate": 1.622337108523906e-05, "loss": 1.6032, "step": 1096 }, { "epoch": 0.8882591093117409, "grad_norm": 1.07216583644219, "learning_rate": 1.621664079930049e-05, "loss": 1.66, "step": 1097 }, { "epoch": 0.8890688259109312, "grad_norm": 1.137011474293611, "learning_rate": 1.620990592052811e-05, "loss": 1.6506, "step": 1098 }, { "epoch": 0.8898785425101214, "grad_norm": 1.0925701713040545, "learning_rate": 1.620316645389764e-05, "loss": 1.6441, "step": 1099 }, { "epoch": 0.8906882591093117, "grad_norm": 0.9809230441134872, "learning_rate": 1.619642240438816e-05, "loss": 1.6116, "step": 1100 }, { "epoch": 0.891497975708502, "grad_norm": 1.0227226563848153, "learning_rate": 1.618967377698216e-05, "loss": 1.6869, "step": 1101 }, { "epoch": 0.8923076923076924, "grad_norm": 1.0936896377565115, "learning_rate": 1.6182920576665508e-05, "loss": 1.6799, "step": 1102 }, { "epoch": 0.8931174089068826, "grad_norm": 1.064709684635027, "learning_rate": 1.6176162808427437e-05, "loss": 1.6547, "step": 1103 }, { "epoch": 0.8939271255060729, "grad_norm": 1.089323924603956, "learning_rate": 1.6169400477260566e-05, "loss": 1.6514, "step": 1104 }, { "epoch": 0.8947368421052632, "grad_norm": 1.051251367154035, "learning_rate": 1.616263358816089e-05, "loss": 1.5567, "step": 1105 }, { "epoch": 0.8955465587044534, "grad_norm": 1.0955130855844533, "learning_rate": 1.6155862146127757e-05, "loss": 1.6892, "step": 1106 }, { "epoch": 0.8963562753036437, "grad_norm": 1.2427921629069014, "learning_rate": 1.6149086156163893e-05, "loss": 1.6672, "step": 1107 }, { "epoch": 0.897165991902834, "grad_norm": 1.0531149381114697, "learning_rate": 1.6142305623275367e-05, "loss": 1.6756, "step": 1108 }, { "epoch": 0.8979757085020242, "grad_norm": 1.1185947127125788, "learning_rate": 1.6135520552471625e-05, "loss": 1.6263, "step": 1109 }, { "epoch": 0.8987854251012146, "grad_norm": 1.0114886200304354, "learning_rate": 1.612873094876545e-05, "loss": 1.6887, "step": 1110 }, { "epoch": 0.8995951417004049, "grad_norm": 1.1380754110775286, "learning_rate": 1.612193681717298e-05, "loss": 1.6664, "step": 1111 }, { "epoch": 0.9004048582995952, "grad_norm": 1.0844337360811653, "learning_rate": 1.611513816271369e-05, "loss": 1.6717, "step": 1112 }, { "epoch": 0.9012145748987854, "grad_norm": 1.0684169391575367, "learning_rate": 1.6108334990410413e-05, "loss": 1.6056, "step": 1113 }, { "epoch": 0.9020242914979757, "grad_norm": 1.0828647321774048, "learning_rate": 1.610152730528931e-05, "loss": 1.6916, "step": 1114 }, { "epoch": 0.902834008097166, "grad_norm": 1.0539257127891122, "learning_rate": 1.6094715112379874e-05, "loss": 1.6259, "step": 1115 }, { "epoch": 0.9036437246963562, "grad_norm": 1.0495197445873474, "learning_rate": 1.6087898416714928e-05, "loss": 1.6449, "step": 1116 }, { "epoch": 0.9044534412955466, "grad_norm": 1.037832176782658, "learning_rate": 1.608107722333063e-05, "loss": 1.5811, "step": 1117 }, { "epoch": 0.9052631578947369, "grad_norm": 1.1204020070782086, "learning_rate": 1.607425153726645e-05, "loss": 1.6671, "step": 1118 }, { "epoch": 0.9060728744939271, "grad_norm": 1.0947786620111954, "learning_rate": 1.6067421363565185e-05, "loss": 1.6438, "step": 1119 }, { "epoch": 0.9068825910931174, "grad_norm": 1.1703591478489133, "learning_rate": 1.6060586707272943e-05, "loss": 1.5142, "step": 1120 }, { "epoch": 0.9076923076923077, "grad_norm": 1.1101060613426852, "learning_rate": 1.6053747573439147e-05, "loss": 1.6466, "step": 1121 }, { "epoch": 0.908502024291498, "grad_norm": 0.9844797162954405, "learning_rate": 1.6046903967116532e-05, "loss": 1.5892, "step": 1122 }, { "epoch": 0.9093117408906882, "grad_norm": 1.2066011149235545, "learning_rate": 1.604005589336112e-05, "loss": 1.6379, "step": 1123 }, { "epoch": 0.9101214574898785, "grad_norm": 1.12551790929863, "learning_rate": 1.6033203357232255e-05, "loss": 1.5908, "step": 1124 }, { "epoch": 0.9109311740890689, "grad_norm": 1.0539443046535852, "learning_rate": 1.6026346363792565e-05, "loss": 1.6407, "step": 1125 }, { "epoch": 0.9117408906882591, "grad_norm": 1.2718959974915909, "learning_rate": 1.6019484918107977e-05, "loss": 1.6657, "step": 1126 }, { "epoch": 0.9125506072874494, "grad_norm": 1.1172864236142332, "learning_rate": 1.60126190252477e-05, "loss": 1.5977, "step": 1127 }, { "epoch": 0.9133603238866397, "grad_norm": 1.029629174783446, "learning_rate": 1.600574869028423e-05, "loss": 1.5827, "step": 1128 }, { "epoch": 0.91417004048583, "grad_norm": 1.1276448324458075, "learning_rate": 1.599887391829336e-05, "loss": 1.5556, "step": 1129 }, { "epoch": 0.9149797570850202, "grad_norm": 1.234475202377491, "learning_rate": 1.599199471435414e-05, "loss": 1.6962, "step": 1130 }, { "epoch": 0.9157894736842105, "grad_norm": 1.0658020028121022, "learning_rate": 1.5985111083548905e-05, "loss": 1.6202, "step": 1131 }, { "epoch": 0.9165991902834008, "grad_norm": 1.1895352614746753, "learning_rate": 1.5978223030963257e-05, "loss": 1.5915, "step": 1132 }, { "epoch": 0.9174089068825911, "grad_norm": 1.1294963341176811, "learning_rate": 1.5971330561686073e-05, "loss": 1.6618, "step": 1133 }, { "epoch": 0.9182186234817814, "grad_norm": 1.0383726295620532, "learning_rate": 1.596443368080948e-05, "loss": 1.7049, "step": 1134 }, { "epoch": 0.9190283400809717, "grad_norm": 1.0765024500512697, "learning_rate": 1.5957532393428872e-05, "loss": 1.6578, "step": 1135 }, { "epoch": 0.9198380566801619, "grad_norm": 1.0435257860758407, "learning_rate": 1.5950626704642898e-05, "loss": 1.5491, "step": 1136 }, { "epoch": 0.9206477732793522, "grad_norm": 1.040007314475756, "learning_rate": 1.594371661955346e-05, "loss": 1.5721, "step": 1137 }, { "epoch": 0.9214574898785425, "grad_norm": 1.1078111485151643, "learning_rate": 1.5936802143265708e-05, "loss": 1.6908, "step": 1138 }, { "epoch": 0.9222672064777327, "grad_norm": 1.1320470811777017, "learning_rate": 1.592988328088803e-05, "loss": 1.6745, "step": 1139 }, { "epoch": 0.9230769230769231, "grad_norm": 1.016464328891757, "learning_rate": 1.5922960037532057e-05, "loss": 1.5978, "step": 1140 }, { "epoch": 0.9238866396761134, "grad_norm": 1.0843626078556605, "learning_rate": 1.5916032418312665e-05, "loss": 1.5708, "step": 1141 }, { "epoch": 0.9246963562753037, "grad_norm": 1.1378816169740498, "learning_rate": 1.5909100428347953e-05, "loss": 1.7212, "step": 1142 }, { "epoch": 0.9255060728744939, "grad_norm": 1.0606143686664404, "learning_rate": 1.590216407275925e-05, "loss": 1.5639, "step": 1143 }, { "epoch": 0.9263157894736842, "grad_norm": 1.3367149432598577, "learning_rate": 1.5895223356671116e-05, "loss": 1.6906, "step": 1144 }, { "epoch": 0.9271255060728745, "grad_norm": 1.0416656067812, "learning_rate": 1.588827828521133e-05, "loss": 1.5926, "step": 1145 }, { "epoch": 0.9279352226720647, "grad_norm": 1.0644446557633547, "learning_rate": 1.5881328863510885e-05, "loss": 1.6572, "step": 1146 }, { "epoch": 0.928744939271255, "grad_norm": 1.0307821440286953, "learning_rate": 1.5874375096703993e-05, "loss": 1.6722, "step": 1147 }, { "epoch": 0.9295546558704454, "grad_norm": 1.0777429781962335, "learning_rate": 1.5867416989928077e-05, "loss": 1.6206, "step": 1148 }, { "epoch": 0.9303643724696357, "grad_norm": 1.0968718914039448, "learning_rate": 1.5860454548323755e-05, "loss": 1.6033, "step": 1149 }, { "epoch": 0.9311740890688259, "grad_norm": 1.094113134525346, "learning_rate": 1.585348777703486e-05, "loss": 1.6282, "step": 1150 }, { "epoch": 0.9319838056680162, "grad_norm": 1.110942326892508, "learning_rate": 1.5846516681208425e-05, "loss": 1.6293, "step": 1151 }, { "epoch": 0.9327935222672065, "grad_norm": 1.1089357151037313, "learning_rate": 1.5839541265994663e-05, "loss": 1.7625, "step": 1152 }, { "epoch": 0.9336032388663967, "grad_norm": 1.0813117854976675, "learning_rate": 1.5832561536546998e-05, "loss": 1.6626, "step": 1153 }, { "epoch": 0.934412955465587, "grad_norm": 1.0395260288355859, "learning_rate": 1.5825577498022027e-05, "loss": 1.6422, "step": 1154 }, { "epoch": 0.9352226720647774, "grad_norm": 1.0434934564308764, "learning_rate": 1.581858915557953e-05, "loss": 1.6427, "step": 1155 }, { "epoch": 0.9360323886639677, "grad_norm": 1.1131287757276462, "learning_rate": 1.5811596514382474e-05, "loss": 1.6684, "step": 1156 }, { "epoch": 0.9368421052631579, "grad_norm": 1.0535686300577007, "learning_rate": 1.5804599579597007e-05, "loss": 1.5772, "step": 1157 }, { "epoch": 0.9376518218623482, "grad_norm": 1.0265557890806118, "learning_rate": 1.5797598356392433e-05, "loss": 1.6554, "step": 1158 }, { "epoch": 0.9384615384615385, "grad_norm": 1.040297515122633, "learning_rate": 1.5790592849941234e-05, "loss": 1.6032, "step": 1159 }, { "epoch": 0.9392712550607287, "grad_norm": 1.0584403749937723, "learning_rate": 1.5783583065419054e-05, "loss": 1.6032, "step": 1160 }, { "epoch": 0.940080971659919, "grad_norm": 1.1762205201783962, "learning_rate": 1.5776569008004705e-05, "loss": 1.6195, "step": 1161 }, { "epoch": 0.9408906882591093, "grad_norm": 1.116940246849939, "learning_rate": 1.5769550682880143e-05, "loss": 1.6004, "step": 1162 }, { "epoch": 0.9417004048582996, "grad_norm": 1.0651667577950013, "learning_rate": 1.5762528095230488e-05, "loss": 1.584, "step": 1163 }, { "epoch": 0.9425101214574899, "grad_norm": 1.1032179981000514, "learning_rate": 1.5755501250244e-05, "loss": 1.6358, "step": 1164 }, { "epoch": 0.9433198380566802, "grad_norm": 1.1087189553464278, "learning_rate": 1.5748470153112093e-05, "loss": 1.5852, "step": 1165 }, { "epoch": 0.9441295546558705, "grad_norm": 1.0912761006902574, "learning_rate": 1.574143480902932e-05, "loss": 1.6718, "step": 1166 }, { "epoch": 0.9449392712550607, "grad_norm": 1.0720080749650522, "learning_rate": 1.5734395223193367e-05, "loss": 1.637, "step": 1167 }, { "epoch": 0.945748987854251, "grad_norm": 1.1016199611874298, "learning_rate": 1.5727351400805054e-05, "loss": 1.7111, "step": 1168 }, { "epoch": 0.9465587044534413, "grad_norm": 1.0389321277980663, "learning_rate": 1.572030334706834e-05, "loss": 1.6109, "step": 1169 }, { "epoch": 0.9473684210526315, "grad_norm": 1.1153836189672202, "learning_rate": 1.57132510671903e-05, "loss": 1.6806, "step": 1170 }, { "epoch": 0.9481781376518219, "grad_norm": 1.0852648007480221, "learning_rate": 1.5706194566381136e-05, "loss": 1.5954, "step": 1171 }, { "epoch": 0.9489878542510122, "grad_norm": 1.1024355588691714, "learning_rate": 1.5699133849854164e-05, "loss": 1.7109, "step": 1172 }, { "epoch": 0.9497975708502024, "grad_norm": 1.1103223087215968, "learning_rate": 1.5692068922825826e-05, "loss": 1.6344, "step": 1173 }, { "epoch": 0.9506072874493927, "grad_norm": 1.0273649828440345, "learning_rate": 1.5684999790515664e-05, "loss": 1.6134, "step": 1174 }, { "epoch": 0.951417004048583, "grad_norm": 1.025710171640773, "learning_rate": 1.5677926458146327e-05, "loss": 1.6368, "step": 1175 }, { "epoch": 0.9522267206477733, "grad_norm": 1.080001734059546, "learning_rate": 1.567084893094357e-05, "loss": 1.6528, "step": 1176 }, { "epoch": 0.9530364372469635, "grad_norm": 1.0606923317026444, "learning_rate": 1.566376721413625e-05, "loss": 1.5999, "step": 1177 }, { "epoch": 0.9538461538461539, "grad_norm": 1.0103172442950479, "learning_rate": 1.5656681312956316e-05, "loss": 1.5915, "step": 1178 }, { "epoch": 0.9546558704453442, "grad_norm": 1.1057352054726133, "learning_rate": 1.5649591232638804e-05, "loss": 1.6497, "step": 1179 }, { "epoch": 0.9554655870445344, "grad_norm": 1.1047227327452962, "learning_rate": 1.5642496978421842e-05, "loss": 1.6919, "step": 1180 }, { "epoch": 0.9562753036437247, "grad_norm": 1.0283713285578344, "learning_rate": 1.563539855554665e-05, "loss": 1.6248, "step": 1181 }, { "epoch": 0.957085020242915, "grad_norm": 1.0214669851609182, "learning_rate": 1.5628295969257515e-05, "loss": 1.623, "step": 1182 }, { "epoch": 0.9578947368421052, "grad_norm": 1.0696244337426537, "learning_rate": 1.5621189224801797e-05, "loss": 1.6771, "step": 1183 }, { "epoch": 0.9587044534412955, "grad_norm": 1.0663459977571883, "learning_rate": 1.5614078327429947e-05, "loss": 1.7245, "step": 1184 }, { "epoch": 0.9595141700404858, "grad_norm": 1.100787082491128, "learning_rate": 1.560696328239547e-05, "loss": 1.6435, "step": 1185 }, { "epoch": 0.9603238866396762, "grad_norm": 1.0861397245205235, "learning_rate": 1.559984409495493e-05, "loss": 1.5699, "step": 1186 }, { "epoch": 0.9611336032388664, "grad_norm": 1.0927150205723406, "learning_rate": 1.5592720770367967e-05, "loss": 1.6555, "step": 1187 }, { "epoch": 0.9619433198380567, "grad_norm": 1.0739743395413222, "learning_rate": 1.5585593313897267e-05, "loss": 1.5801, "step": 1188 }, { "epoch": 0.962753036437247, "grad_norm": 1.058740021769692, "learning_rate": 1.5578461730808575e-05, "loss": 1.6698, "step": 1189 }, { "epoch": 0.9635627530364372, "grad_norm": 1.0525423850505513, "learning_rate": 1.5571326026370676e-05, "loss": 1.5322, "step": 1190 }, { "epoch": 0.9643724696356275, "grad_norm": 1.075841311496856, "learning_rate": 1.5564186205855407e-05, "loss": 1.5897, "step": 1191 }, { "epoch": 0.9651821862348178, "grad_norm": 1.1185012350730061, "learning_rate": 1.5557042274537644e-05, "loss": 1.632, "step": 1192 }, { "epoch": 0.965991902834008, "grad_norm": 1.0818043247678821, "learning_rate": 1.5549894237695302e-05, "loss": 1.6112, "step": 1193 }, { "epoch": 0.9668016194331984, "grad_norm": 1.0762571974215163, "learning_rate": 1.5542742100609324e-05, "loss": 1.5975, "step": 1194 }, { "epoch": 0.9676113360323887, "grad_norm": 1.1294759095742624, "learning_rate": 1.5535585868563688e-05, "loss": 1.628, "step": 1195 }, { "epoch": 0.968421052631579, "grad_norm": 1.0550884938111755, "learning_rate": 1.552842554684539e-05, "loss": 1.6819, "step": 1196 }, { "epoch": 0.9692307692307692, "grad_norm": 1.1137617349520605, "learning_rate": 1.5521261140744458e-05, "loss": 1.5694, "step": 1197 }, { "epoch": 0.9700404858299595, "grad_norm": 1.122314201391633, "learning_rate": 1.551409265555393e-05, "loss": 1.7246, "step": 1198 }, { "epoch": 0.9708502024291498, "grad_norm": 1.0252937720177295, "learning_rate": 1.5506920096569857e-05, "loss": 1.5807, "step": 1199 }, { "epoch": 0.97165991902834, "grad_norm": 1.066566322393118, "learning_rate": 1.5499743469091303e-05, "loss": 1.5806, "step": 1200 }, { "epoch": 0.9724696356275304, "grad_norm": 1.1862181447108153, "learning_rate": 1.5492562778420342e-05, "loss": 1.6677, "step": 1201 }, { "epoch": 0.9732793522267207, "grad_norm": 1.1600571833816153, "learning_rate": 1.5485378029862034e-05, "loss": 1.6181, "step": 1202 }, { "epoch": 0.974089068825911, "grad_norm": 1.1209706305697864, "learning_rate": 1.547818922872446e-05, "loss": 1.595, "step": 1203 }, { "epoch": 0.9748987854251012, "grad_norm": 1.2007659516216234, "learning_rate": 1.547099638031867e-05, "loss": 1.5757, "step": 1204 }, { "epoch": 0.9757085020242915, "grad_norm": 1.087592209286536, "learning_rate": 1.5463799489958727e-05, "loss": 1.6574, "step": 1205 }, { "epoch": 0.9765182186234818, "grad_norm": 1.0599754950787568, "learning_rate": 1.5456598562961666e-05, "loss": 1.6501, "step": 1206 }, { "epoch": 0.977327935222672, "grad_norm": 1.2383281394995351, "learning_rate": 1.544939360464751e-05, "loss": 1.6212, "step": 1207 }, { "epoch": 0.9781376518218623, "grad_norm": 1.1245330174866524, "learning_rate": 1.5442184620339252e-05, "loss": 1.6466, "step": 1208 }, { "epoch": 0.9789473684210527, "grad_norm": 0.9799617502733805, "learning_rate": 1.5434971615362875e-05, "loss": 1.6056, "step": 1209 }, { "epoch": 0.979757085020243, "grad_norm": 1.142502619460788, "learning_rate": 1.542775459504732e-05, "loss": 1.6629, "step": 1210 }, { "epoch": 0.9805668016194332, "grad_norm": 1.0942507084406352, "learning_rate": 1.5420533564724495e-05, "loss": 1.6764, "step": 1211 }, { "epoch": 0.9813765182186235, "grad_norm": 1.161749663929768, "learning_rate": 1.5413308529729274e-05, "loss": 1.6983, "step": 1212 }, { "epoch": 0.9821862348178138, "grad_norm": 1.0441007142843908, "learning_rate": 1.5406079495399495e-05, "loss": 1.5531, "step": 1213 }, { "epoch": 0.982995951417004, "grad_norm": 1.0129014720851601, "learning_rate": 1.5398846467075937e-05, "loss": 1.5144, "step": 1214 }, { "epoch": 0.9838056680161943, "grad_norm": 1.0916694795654285, "learning_rate": 1.5391609450102346e-05, "loss": 1.5713, "step": 1215 }, { "epoch": 0.9846153846153847, "grad_norm": 1.0119872612879202, "learning_rate": 1.5384368449825395e-05, "loss": 1.6537, "step": 1216 }, { "epoch": 0.9854251012145749, "grad_norm": 1.089804356576127, "learning_rate": 1.5377123471594723e-05, "loss": 1.6905, "step": 1217 }, { "epoch": 0.9862348178137652, "grad_norm": 1.018296330143969, "learning_rate": 1.536987452076289e-05, "loss": 1.5738, "step": 1218 }, { "epoch": 0.9870445344129555, "grad_norm": 1.1463126951348903, "learning_rate": 1.5362621602685394e-05, "loss": 1.5971, "step": 1219 }, { "epoch": 0.9878542510121457, "grad_norm": 1.0831047769241162, "learning_rate": 1.5355364722720674e-05, "loss": 1.6642, "step": 1220 }, { "epoch": 0.988663967611336, "grad_norm": 1.0229539605576596, "learning_rate": 1.5348103886230086e-05, "loss": 1.6169, "step": 1221 }, { "epoch": 0.9894736842105263, "grad_norm": 1.0706268272715531, "learning_rate": 1.5340839098577912e-05, "loss": 1.6369, "step": 1222 }, { "epoch": 0.9902834008097166, "grad_norm": 1.076282820889626, "learning_rate": 1.5333570365131353e-05, "loss": 1.6125, "step": 1223 }, { "epoch": 0.9910931174089069, "grad_norm": 1.0521363851747962, "learning_rate": 1.5326297691260526e-05, "loss": 1.6126, "step": 1224 }, { "epoch": 0.9919028340080972, "grad_norm": 1.028881455867217, "learning_rate": 1.5319021082338458e-05, "loss": 1.6541, "step": 1225 }, { "epoch": 0.9927125506072875, "grad_norm": 1.04183651822074, "learning_rate": 1.5311740543741088e-05, "loss": 1.6238, "step": 1226 }, { "epoch": 0.9935222672064777, "grad_norm": 1.0858918280353071, "learning_rate": 1.5304456080847247e-05, "loss": 1.6627, "step": 1227 }, { "epoch": 0.994331983805668, "grad_norm": 1.0035165826353316, "learning_rate": 1.5297167699038673e-05, "loss": 1.6204, "step": 1228 }, { "epoch": 0.9951417004048583, "grad_norm": 1.0816188271259373, "learning_rate": 1.5289875403700005e-05, "loss": 1.6276, "step": 1229 }, { "epoch": 0.9959514170040485, "grad_norm": 1.0599193319828009, "learning_rate": 1.5282579200218762e-05, "loss": 1.6398, "step": 1230 }, { "epoch": 0.9967611336032388, "grad_norm": 1.0919178042931876, "learning_rate": 1.5275279093985355e-05, "loss": 1.5611, "step": 1231 }, { "epoch": 0.9975708502024292, "grad_norm": 1.1107755668554948, "learning_rate": 1.5267975090393078e-05, "loss": 1.715, "step": 1232 }, { "epoch": 0.9983805668016195, "grad_norm": 0.9966294564153475, "learning_rate": 1.526066719483811e-05, "loss": 1.6605, "step": 1233 }, { "epoch": 0.9991902834008097, "grad_norm": 1.1569636814062747, "learning_rate": 1.5253355412719498e-05, "loss": 1.6844, "step": 1234 }, { "epoch": 1.0, "grad_norm": 1.0838838624298, "learning_rate": 1.5246039749439159e-05, "loss": 1.6023, "step": 1235 }, { "epoch": 1.0008097165991903, "grad_norm": 1.3033229459063758, "learning_rate": 1.5238720210401881e-05, "loss": 1.4524, "step": 1236 }, { "epoch": 1.0016194331983805, "grad_norm": 1.1817666802596205, "learning_rate": 1.5231396801015321e-05, "loss": 1.4273, "step": 1237 }, { "epoch": 1.0024291497975708, "grad_norm": 1.1157969184839365, "learning_rate": 1.5224069526689981e-05, "loss": 1.4376, "step": 1238 }, { "epoch": 1.003238866396761, "grad_norm": 1.1746908857400815, "learning_rate": 1.5216738392839241e-05, "loss": 1.3419, "step": 1239 }, { "epoch": 1.0040485829959513, "grad_norm": 1.5854462006834873, "learning_rate": 1.5209403404879305e-05, "loss": 1.3791, "step": 1240 }, { "epoch": 1.0048582995951416, "grad_norm": 1.4607487661303684, "learning_rate": 1.5202064568229242e-05, "loss": 1.3514, "step": 1241 }, { "epoch": 1.0056680161943319, "grad_norm": 1.4180363035835093, "learning_rate": 1.5194721888310966e-05, "loss": 1.4033, "step": 1242 }, { "epoch": 1.0064777327935224, "grad_norm": 1.3518288634394915, "learning_rate": 1.5187375370549218e-05, "loss": 1.3413, "step": 1243 }, { "epoch": 1.0072874493927126, "grad_norm": 1.3776243915536928, "learning_rate": 1.5180025020371585e-05, "loss": 1.4354, "step": 1244 }, { "epoch": 1.008097165991903, "grad_norm": 1.263117229706304, "learning_rate": 1.5172670843208477e-05, "loss": 1.3634, "step": 1245 }, { "epoch": 1.0089068825910932, "grad_norm": 1.2727019839645528, "learning_rate": 1.5165312844493146e-05, "loss": 1.3325, "step": 1246 }, { "epoch": 1.0097165991902834, "grad_norm": 1.2423022959844323, "learning_rate": 1.5157951029661644e-05, "loss": 1.3249, "step": 1247 }, { "epoch": 1.0105263157894737, "grad_norm": 1.3569865807346784, "learning_rate": 1.5150585404152864e-05, "loss": 1.388, "step": 1248 }, { "epoch": 1.011336032388664, "grad_norm": 1.3839955804790385, "learning_rate": 1.5143215973408505e-05, "loss": 1.3716, "step": 1249 }, { "epoch": 1.0121457489878543, "grad_norm": 1.281605620473294, "learning_rate": 1.5135842742873077e-05, "loss": 1.3419, "step": 1250 }, { "epoch": 1.0129554655870445, "grad_norm": 1.240875523498585, "learning_rate": 1.5128465717993898e-05, "loss": 1.4467, "step": 1251 }, { "epoch": 1.0137651821862348, "grad_norm": 1.2013839184847355, "learning_rate": 1.5121084904221093e-05, "loss": 1.3489, "step": 1252 }, { "epoch": 1.014574898785425, "grad_norm": 1.224004173475795, "learning_rate": 1.5113700307007575e-05, "loss": 1.3784, "step": 1253 }, { "epoch": 1.0153846153846153, "grad_norm": 1.2009133574758046, "learning_rate": 1.510631193180907e-05, "loss": 1.3149, "step": 1254 }, { "epoch": 1.0161943319838056, "grad_norm": 1.2303064249057094, "learning_rate": 1.5098919784084083e-05, "loss": 1.3512, "step": 1255 }, { "epoch": 1.0170040485829959, "grad_norm": 1.2885247341623098, "learning_rate": 1.50915238692939e-05, "loss": 1.3995, "step": 1256 }, { "epoch": 1.0178137651821861, "grad_norm": 1.2509468790226062, "learning_rate": 1.5084124192902612e-05, "loss": 1.3568, "step": 1257 }, { "epoch": 1.0186234817813766, "grad_norm": 1.2329823773251085, "learning_rate": 1.5076720760377064e-05, "loss": 1.3742, "step": 1258 }, { "epoch": 1.019433198380567, "grad_norm": 1.2514982887751502, "learning_rate": 1.5069313577186892e-05, "loss": 1.4078, "step": 1259 }, { "epoch": 1.0202429149797572, "grad_norm": 1.1878670599291559, "learning_rate": 1.5061902648804503e-05, "loss": 1.3712, "step": 1260 }, { "epoch": 1.0210526315789474, "grad_norm": 1.1702105583771456, "learning_rate": 1.5054487980705059e-05, "loss": 1.3456, "step": 1261 }, { "epoch": 1.0218623481781377, "grad_norm": 1.2390980605393822, "learning_rate": 1.5047069578366497e-05, "loss": 1.3626, "step": 1262 }, { "epoch": 1.022672064777328, "grad_norm": 1.1903294920255179, "learning_rate": 1.5039647447269508e-05, "loss": 1.3562, "step": 1263 }, { "epoch": 1.0234817813765182, "grad_norm": 1.210946578195798, "learning_rate": 1.5032221592897536e-05, "loss": 1.3543, "step": 1264 }, { "epoch": 1.0242914979757085, "grad_norm": 1.1617580200681779, "learning_rate": 1.502479202073678e-05, "loss": 1.3388, "step": 1265 }, { "epoch": 1.0251012145748988, "grad_norm": 1.2440798916020672, "learning_rate": 1.5017358736276183e-05, "loss": 1.378, "step": 1266 }, { "epoch": 1.025910931174089, "grad_norm": 1.2628930798261047, "learning_rate": 1.500992174500743e-05, "loss": 1.3923, "step": 1267 }, { "epoch": 1.0267206477732793, "grad_norm": 1.2359583176315028, "learning_rate": 1.5002481052424945e-05, "loss": 1.3872, "step": 1268 }, { "epoch": 1.0275303643724696, "grad_norm": 1.1963174169460538, "learning_rate": 1.499503666402589e-05, "loss": 1.3951, "step": 1269 }, { "epoch": 1.0283400809716599, "grad_norm": 1.124590149714284, "learning_rate": 1.4987588585310154e-05, "loss": 1.361, "step": 1270 }, { "epoch": 1.0291497975708501, "grad_norm": 1.1617067643230052, "learning_rate": 1.4980136821780348e-05, "loss": 1.3614, "step": 1271 }, { "epoch": 1.0299595141700404, "grad_norm": 1.6170111738264419, "learning_rate": 1.497268137894182e-05, "loss": 1.4219, "step": 1272 }, { "epoch": 1.0307692307692307, "grad_norm": 1.2275764161254092, "learning_rate": 1.4965222262302621e-05, "loss": 1.3835, "step": 1273 }, { "epoch": 1.0315789473684212, "grad_norm": 1.1996462946756752, "learning_rate": 1.4957759477373519e-05, "loss": 1.2693, "step": 1274 }, { "epoch": 1.0323886639676114, "grad_norm": 1.2125778356953747, "learning_rate": 1.4950293029668004e-05, "loss": 1.3383, "step": 1275 }, { "epoch": 1.0331983805668017, "grad_norm": 1.2012198190789183, "learning_rate": 1.4942822924702252e-05, "loss": 1.374, "step": 1276 }, { "epoch": 1.034008097165992, "grad_norm": 1.2130852863124042, "learning_rate": 1.4935349167995161e-05, "loss": 1.4402, "step": 1277 }, { "epoch": 1.0348178137651822, "grad_norm": 1.1740546729540866, "learning_rate": 1.4927871765068314e-05, "loss": 1.3831, "step": 1278 }, { "epoch": 1.0356275303643725, "grad_norm": 1.1917159960823642, "learning_rate": 1.4920390721445993e-05, "loss": 1.3729, "step": 1279 }, { "epoch": 1.0364372469635628, "grad_norm": 1.1733824111242548, "learning_rate": 1.4912906042655164e-05, "loss": 1.3949, "step": 1280 }, { "epoch": 1.037246963562753, "grad_norm": 1.1752871025521956, "learning_rate": 1.4905417734225488e-05, "loss": 1.3553, "step": 1281 }, { "epoch": 1.0380566801619433, "grad_norm": 1.1715814660303838, "learning_rate": 1.4897925801689304e-05, "loss": 1.3838, "step": 1282 }, { "epoch": 1.0388663967611336, "grad_norm": 1.2415703860859453, "learning_rate": 1.4890430250581622e-05, "loss": 1.3841, "step": 1283 }, { "epoch": 1.0396761133603238, "grad_norm": 1.1825484273450233, "learning_rate": 1.4882931086440133e-05, "loss": 1.3573, "step": 1284 }, { "epoch": 1.040485829959514, "grad_norm": 1.1504373958278675, "learning_rate": 1.4875428314805195e-05, "loss": 1.3256, "step": 1285 }, { "epoch": 1.0412955465587044, "grad_norm": 1.1436853666808242, "learning_rate": 1.4867921941219834e-05, "loss": 1.3932, "step": 1286 }, { "epoch": 1.0421052631578946, "grad_norm": 1.162386050137052, "learning_rate": 1.4860411971229728e-05, "loss": 1.3845, "step": 1287 }, { "epoch": 1.042914979757085, "grad_norm": 1.2092585781453715, "learning_rate": 1.485289841038322e-05, "loss": 1.3369, "step": 1288 }, { "epoch": 1.0437246963562754, "grad_norm": 1.145037292414635, "learning_rate": 1.484538126423131e-05, "loss": 1.4054, "step": 1289 }, { "epoch": 1.0445344129554657, "grad_norm": 1.1686385666583774, "learning_rate": 1.483786053832763e-05, "loss": 1.3353, "step": 1290 }, { "epoch": 1.045344129554656, "grad_norm": 1.2275739238178558, "learning_rate": 1.483033623822848e-05, "loss": 1.4264, "step": 1291 }, { "epoch": 1.0461538461538462, "grad_norm": 1.1623896668490261, "learning_rate": 1.4822808369492778e-05, "loss": 1.4345, "step": 1292 }, { "epoch": 1.0469635627530365, "grad_norm": 1.2296387845097625, "learning_rate": 1.4815276937682094e-05, "loss": 1.3207, "step": 1293 }, { "epoch": 1.0477732793522267, "grad_norm": 1.176201622341318, "learning_rate": 1.4807741948360625e-05, "loss": 1.3585, "step": 1294 }, { "epoch": 1.048582995951417, "grad_norm": 1.1610794617830476, "learning_rate": 1.4800203407095194e-05, "loss": 1.3479, "step": 1295 }, { "epoch": 1.0493927125506073, "grad_norm": 1.1776599630056936, "learning_rate": 1.4792661319455252e-05, "loss": 1.4027, "step": 1296 }, { "epoch": 1.0502024291497976, "grad_norm": 1.2338228585850806, "learning_rate": 1.4785115691012866e-05, "loss": 1.3332, "step": 1297 }, { "epoch": 1.0510121457489878, "grad_norm": 1.1880604320942654, "learning_rate": 1.4777566527342729e-05, "loss": 1.3304, "step": 1298 }, { "epoch": 1.051821862348178, "grad_norm": 1.1817231459155564, "learning_rate": 1.4770013834022128e-05, "loss": 1.3714, "step": 1299 }, { "epoch": 1.0526315789473684, "grad_norm": 1.185176582407899, "learning_rate": 1.4762457616630972e-05, "loss": 1.2879, "step": 1300 }, { "epoch": 1.0534412955465586, "grad_norm": 1.1828727222185689, "learning_rate": 1.4754897880751776e-05, "loss": 1.2923, "step": 1301 }, { "epoch": 1.054251012145749, "grad_norm": 1.1744325557146087, "learning_rate": 1.474733463196964e-05, "loss": 1.3548, "step": 1302 }, { "epoch": 1.0550607287449392, "grad_norm": 1.160850787157178, "learning_rate": 1.4739767875872271e-05, "loss": 1.4059, "step": 1303 }, { "epoch": 1.0558704453441297, "grad_norm": 1.176977157825984, "learning_rate": 1.473219761804996e-05, "loss": 1.3351, "step": 1304 }, { "epoch": 1.05668016194332, "grad_norm": 1.2503348243091958, "learning_rate": 1.4724623864095595e-05, "loss": 1.4432, "step": 1305 }, { "epoch": 1.0574898785425102, "grad_norm": 1.1872676938264057, "learning_rate": 1.4717046619604636e-05, "loss": 1.4101, "step": 1306 }, { "epoch": 1.0582995951417005, "grad_norm": 1.168594441423703, "learning_rate": 1.4709465890175125e-05, "loss": 1.3947, "step": 1307 }, { "epoch": 1.0591093117408907, "grad_norm": 1.1705898387765592, "learning_rate": 1.4701881681407684e-05, "loss": 1.3358, "step": 1308 }, { "epoch": 1.059919028340081, "grad_norm": 1.1603679171263597, "learning_rate": 1.46942939989055e-05, "loss": 1.3784, "step": 1309 }, { "epoch": 1.0607287449392713, "grad_norm": 1.164400476925873, "learning_rate": 1.4686702848274328e-05, "loss": 1.3856, "step": 1310 }, { "epoch": 1.0615384615384615, "grad_norm": 1.1554651486479175, "learning_rate": 1.4679108235122482e-05, "loss": 1.3397, "step": 1311 }, { "epoch": 1.0623481781376518, "grad_norm": 1.2155869311427083, "learning_rate": 1.467151016506084e-05, "loss": 1.3729, "step": 1312 }, { "epoch": 1.063157894736842, "grad_norm": 1.2133615435457628, "learning_rate": 1.4663908643702836e-05, "loss": 1.3122, "step": 1313 }, { "epoch": 1.0639676113360323, "grad_norm": 1.1634564916763344, "learning_rate": 1.465630367666444e-05, "loss": 1.3354, "step": 1314 }, { "epoch": 1.0647773279352226, "grad_norm": 1.2001955918988352, "learning_rate": 1.4648695269564182e-05, "loss": 1.4082, "step": 1315 }, { "epoch": 1.0655870445344129, "grad_norm": 1.1379663684306365, "learning_rate": 1.4641083428023124e-05, "loss": 1.3493, "step": 1316 }, { "epoch": 1.0663967611336032, "grad_norm": 1.1718455198343034, "learning_rate": 1.4633468157664879e-05, "loss": 1.3505, "step": 1317 }, { "epoch": 1.0672064777327934, "grad_norm": 1.1646759648158063, "learning_rate": 1.4625849464115571e-05, "loss": 1.3558, "step": 1318 }, { "epoch": 1.068016194331984, "grad_norm": 1.162310000496439, "learning_rate": 1.4618227353003878e-05, "loss": 1.3958, "step": 1319 }, { "epoch": 1.0688259109311742, "grad_norm": 1.1488822317358849, "learning_rate": 1.461060182996098e-05, "loss": 1.4, "step": 1320 }, { "epoch": 1.0696356275303645, "grad_norm": 1.2033656269010369, "learning_rate": 1.4602972900620596e-05, "loss": 1.3656, "step": 1321 }, { "epoch": 1.0704453441295547, "grad_norm": 1.2654857663774528, "learning_rate": 1.459534057061895e-05, "loss": 1.314, "step": 1322 }, { "epoch": 1.071255060728745, "grad_norm": 1.1409457494007944, "learning_rate": 1.4587704845594784e-05, "loss": 1.3914, "step": 1323 }, { "epoch": 1.0720647773279353, "grad_norm": 1.1625130696761021, "learning_rate": 1.4580065731189344e-05, "loss": 1.3809, "step": 1324 }, { "epoch": 1.0728744939271255, "grad_norm": 1.1754811815518604, "learning_rate": 1.4572423233046386e-05, "loss": 1.357, "step": 1325 }, { "epoch": 1.0736842105263158, "grad_norm": 1.274907802092002, "learning_rate": 1.456477735681216e-05, "loss": 1.3543, "step": 1326 }, { "epoch": 1.074493927125506, "grad_norm": 1.194810175856374, "learning_rate": 1.455712810813542e-05, "loss": 1.3274, "step": 1327 }, { "epoch": 1.0753036437246963, "grad_norm": 1.1841749394103487, "learning_rate": 1.4549475492667395e-05, "loss": 1.4214, "step": 1328 }, { "epoch": 1.0761133603238866, "grad_norm": 1.1616739871602735, "learning_rate": 1.4541819516061824e-05, "loss": 1.3143, "step": 1329 }, { "epoch": 1.0769230769230769, "grad_norm": 1.235456801171255, "learning_rate": 1.4534160183974908e-05, "loss": 1.3862, "step": 1330 }, { "epoch": 1.0777327935222671, "grad_norm": 1.185526004922521, "learning_rate": 1.4526497502065343e-05, "loss": 1.3827, "step": 1331 }, { "epoch": 1.0785425101214574, "grad_norm": 1.181146915990783, "learning_rate": 1.4518831475994287e-05, "loss": 1.3566, "step": 1332 }, { "epoch": 1.0793522267206477, "grad_norm": 1.2073400109649837, "learning_rate": 1.4511162111425377e-05, "loss": 1.3772, "step": 1333 }, { "epoch": 1.0801619433198382, "grad_norm": 1.2061215113680948, "learning_rate": 1.450348941402472e-05, "loss": 1.4011, "step": 1334 }, { "epoch": 1.0809716599190284, "grad_norm": 1.2058666314849154, "learning_rate": 1.4495813389460875e-05, "loss": 1.3617, "step": 1335 }, { "epoch": 1.0817813765182187, "grad_norm": 1.1903692062674975, "learning_rate": 1.448813404340486e-05, "loss": 1.2916, "step": 1336 }, { "epoch": 1.082591093117409, "grad_norm": 1.2340367866655837, "learning_rate": 1.4480451381530159e-05, "loss": 1.2935, "step": 1337 }, { "epoch": 1.0834008097165992, "grad_norm": 1.2659821523943215, "learning_rate": 1.447276540951269e-05, "loss": 1.362, "step": 1338 }, { "epoch": 1.0842105263157895, "grad_norm": 1.2228755854040374, "learning_rate": 1.4465076133030828e-05, "loss": 1.3781, "step": 1339 }, { "epoch": 1.0850202429149798, "grad_norm": 1.1570938076905548, "learning_rate": 1.4457383557765385e-05, "loss": 1.3808, "step": 1340 }, { "epoch": 1.08582995951417, "grad_norm": 1.1637617032486594, "learning_rate": 1.4449687689399607e-05, "loss": 1.3515, "step": 1341 }, { "epoch": 1.0866396761133603, "grad_norm": 1.1678076483156412, "learning_rate": 1.4441988533619182e-05, "loss": 1.3391, "step": 1342 }, { "epoch": 1.0874493927125506, "grad_norm": 1.2160441877476182, "learning_rate": 1.4434286096112215e-05, "loss": 1.3475, "step": 1343 }, { "epoch": 1.0882591093117409, "grad_norm": 1.235249585783021, "learning_rate": 1.4426580382569241e-05, "loss": 1.2836, "step": 1344 }, { "epoch": 1.0890688259109311, "grad_norm": 1.2382294650559509, "learning_rate": 1.4418871398683227e-05, "loss": 1.392, "step": 1345 }, { "epoch": 1.0898785425101214, "grad_norm": 1.173232332418364, "learning_rate": 1.4411159150149532e-05, "loss": 1.3071, "step": 1346 }, { "epoch": 1.0906882591093117, "grad_norm": 1.1510674680418143, "learning_rate": 1.4403443642665946e-05, "loss": 1.3228, "step": 1347 }, { "epoch": 1.091497975708502, "grad_norm": 1.1961923726867185, "learning_rate": 1.439572488193266e-05, "loss": 1.3194, "step": 1348 }, { "epoch": 1.0923076923076924, "grad_norm": 1.1902807444112073, "learning_rate": 1.438800287365227e-05, "loss": 1.3624, "step": 1349 }, { "epoch": 1.0931174089068827, "grad_norm": 1.1834064174607142, "learning_rate": 1.4380277623529766e-05, "loss": 1.3173, "step": 1350 }, { "epoch": 1.093927125506073, "grad_norm": 1.2136583995550188, "learning_rate": 1.437254913727254e-05, "loss": 1.3443, "step": 1351 }, { "epoch": 1.0947368421052632, "grad_norm": 1.154981299062566, "learning_rate": 1.4364817420590373e-05, "loss": 1.3872, "step": 1352 }, { "epoch": 1.0955465587044535, "grad_norm": 1.2102553779995304, "learning_rate": 1.4357082479195435e-05, "loss": 1.2916, "step": 1353 }, { "epoch": 1.0963562753036438, "grad_norm": 1.188609622826024, "learning_rate": 1.434934431880227e-05, "loss": 1.3302, "step": 1354 }, { "epoch": 1.097165991902834, "grad_norm": 1.200714765859928, "learning_rate": 1.4341602945127806e-05, "loss": 1.2963, "step": 1355 }, { "epoch": 1.0979757085020243, "grad_norm": 1.1787138315863135, "learning_rate": 1.4333858363891346e-05, "loss": 1.3517, "step": 1356 }, { "epoch": 1.0987854251012146, "grad_norm": 1.2170954780680936, "learning_rate": 1.4326110580814563e-05, "loss": 1.3765, "step": 1357 }, { "epoch": 1.0995951417004048, "grad_norm": 1.250816449134447, "learning_rate": 1.431835960162149e-05, "loss": 1.3175, "step": 1358 }, { "epoch": 1.1004048582995951, "grad_norm": 1.177624560111963, "learning_rate": 1.4310605432038527e-05, "loss": 1.3935, "step": 1359 }, { "epoch": 1.1012145748987854, "grad_norm": 1.2197836972826388, "learning_rate": 1.4302848077794427e-05, "loss": 1.4803, "step": 1360 }, { "epoch": 1.1020242914979756, "grad_norm": 1.2127800632997665, "learning_rate": 1.42950875446203e-05, "loss": 1.3772, "step": 1361 }, { "epoch": 1.102834008097166, "grad_norm": 1.23399617147964, "learning_rate": 1.4287323838249603e-05, "loss": 1.3533, "step": 1362 }, { "epoch": 1.1036437246963562, "grad_norm": 1.2240074250465314, "learning_rate": 1.4279556964418135e-05, "loss": 1.3569, "step": 1363 }, { "epoch": 1.1044534412955465, "grad_norm": 1.2564824376072379, "learning_rate": 1.4271786928864037e-05, "loss": 1.4501, "step": 1364 }, { "epoch": 1.1052631578947367, "grad_norm": 1.1991759525151162, "learning_rate": 1.426401373732779e-05, "loss": 1.3907, "step": 1365 }, { "epoch": 1.1060728744939272, "grad_norm": 1.2378861157976855, "learning_rate": 1.4256237395552195e-05, "loss": 1.3469, "step": 1366 }, { "epoch": 1.1068825910931175, "grad_norm": 1.2292213801115794, "learning_rate": 1.4248457909282391e-05, "loss": 1.3883, "step": 1367 }, { "epoch": 1.1076923076923078, "grad_norm": 1.1914894912948184, "learning_rate": 1.4240675284265838e-05, "loss": 1.3029, "step": 1368 }, { "epoch": 1.108502024291498, "grad_norm": 1.180343522184691, "learning_rate": 1.4232889526252316e-05, "loss": 1.3656, "step": 1369 }, { "epoch": 1.1093117408906883, "grad_norm": 1.1905529250584874, "learning_rate": 1.422510064099391e-05, "loss": 1.3206, "step": 1370 }, { "epoch": 1.1101214574898786, "grad_norm": 1.211461020000152, "learning_rate": 1.421730863424503e-05, "loss": 1.2979, "step": 1371 }, { "epoch": 1.1109311740890688, "grad_norm": 1.2001565370308576, "learning_rate": 1.4209513511762381e-05, "loss": 1.3776, "step": 1372 }, { "epoch": 1.111740890688259, "grad_norm": 1.2955199788028005, "learning_rate": 1.420171527930498e-05, "loss": 1.3443, "step": 1373 }, { "epoch": 1.1125506072874494, "grad_norm": 1.1599118475957895, "learning_rate": 1.4193913942634122e-05, "loss": 1.3644, "step": 1374 }, { "epoch": 1.1133603238866396, "grad_norm": 1.1585810167212327, "learning_rate": 1.4186109507513425e-05, "loss": 1.3979, "step": 1375 }, { "epoch": 1.11417004048583, "grad_norm": 1.2720040494303266, "learning_rate": 1.417830197970877e-05, "loss": 1.3597, "step": 1376 }, { "epoch": 1.1149797570850202, "grad_norm": 1.2098574998597365, "learning_rate": 1.4170491364988336e-05, "loss": 1.4478, "step": 1377 }, { "epoch": 1.1157894736842104, "grad_norm": 1.2313276250589487, "learning_rate": 1.416267766912258e-05, "loss": 1.3725, "step": 1378 }, { "epoch": 1.1165991902834007, "grad_norm": 1.1949141861834391, "learning_rate": 1.4154860897884234e-05, "loss": 1.394, "step": 1379 }, { "epoch": 1.117408906882591, "grad_norm": 1.1798433594200126, "learning_rate": 1.4147041057048303e-05, "loss": 1.3759, "step": 1380 }, { "epoch": 1.1182186234817815, "grad_norm": 1.2126045567567152, "learning_rate": 1.4139218152392058e-05, "loss": 1.4342, "step": 1381 }, { "epoch": 1.1190283400809717, "grad_norm": 1.1831352962334098, "learning_rate": 1.4131392189695037e-05, "loss": 1.3377, "step": 1382 }, { "epoch": 1.119838056680162, "grad_norm": 1.1965883843750686, "learning_rate": 1.4123563174739036e-05, "loss": 1.3369, "step": 1383 }, { "epoch": 1.1206477732793523, "grad_norm": 1.1612564728192138, "learning_rate": 1.4115731113308106e-05, "loss": 1.3275, "step": 1384 }, { "epoch": 1.1214574898785425, "grad_norm": 1.2213439759905513, "learning_rate": 1.4107896011188546e-05, "loss": 1.3599, "step": 1385 }, { "epoch": 1.1222672064777328, "grad_norm": 1.2321562126222285, "learning_rate": 1.4100057874168906e-05, "loss": 1.4032, "step": 1386 }, { "epoch": 1.123076923076923, "grad_norm": 1.2058709080617371, "learning_rate": 1.4092216708039974e-05, "loss": 1.3196, "step": 1387 }, { "epoch": 1.1238866396761134, "grad_norm": 1.1731302542020694, "learning_rate": 1.4084372518594777e-05, "loss": 1.372, "step": 1388 }, { "epoch": 1.1246963562753036, "grad_norm": 1.2473568748894555, "learning_rate": 1.4076525311628581e-05, "loss": 1.3703, "step": 1389 }, { "epoch": 1.125506072874494, "grad_norm": 1.1445398846114365, "learning_rate": 1.4068675092938872e-05, "loss": 1.3289, "step": 1390 }, { "epoch": 1.1263157894736842, "grad_norm": 1.178653116534255, "learning_rate": 1.406082186832537e-05, "loss": 1.3859, "step": 1391 }, { "epoch": 1.1271255060728744, "grad_norm": 1.2540075296621336, "learning_rate": 1.4052965643590006e-05, "loss": 1.3663, "step": 1392 }, { "epoch": 1.1279352226720647, "grad_norm": 1.278208376958459, "learning_rate": 1.4045106424536938e-05, "loss": 1.4342, "step": 1393 }, { "epoch": 1.128744939271255, "grad_norm": 1.2422771469402274, "learning_rate": 1.403724421697253e-05, "loss": 1.3634, "step": 1394 }, { "epoch": 1.1295546558704452, "grad_norm": 1.220480716235926, "learning_rate": 1.4029379026705352e-05, "loss": 1.3913, "step": 1395 }, { "epoch": 1.1303643724696357, "grad_norm": 1.2008169986282988, "learning_rate": 1.4021510859546184e-05, "loss": 1.3316, "step": 1396 }, { "epoch": 1.131174089068826, "grad_norm": 1.206940001904269, "learning_rate": 1.4013639721308004e-05, "loss": 1.3366, "step": 1397 }, { "epoch": 1.1319838056680163, "grad_norm": 1.1864838007376735, "learning_rate": 1.4005765617805977e-05, "loss": 1.3345, "step": 1398 }, { "epoch": 1.1327935222672065, "grad_norm": 1.1806069165225561, "learning_rate": 1.3997888554857468e-05, "loss": 1.3515, "step": 1399 }, { "epoch": 1.1336032388663968, "grad_norm": 1.244689296436616, "learning_rate": 1.3990008538282027e-05, "loss": 1.4887, "step": 1400 }, { "epoch": 1.134412955465587, "grad_norm": 1.2092159259655217, "learning_rate": 1.3982125573901384e-05, "loss": 1.3932, "step": 1401 }, { "epoch": 1.1352226720647773, "grad_norm": 1.201131078854986, "learning_rate": 1.3974239667539445e-05, "loss": 1.3621, "step": 1402 }, { "epoch": 1.1360323886639676, "grad_norm": 1.2951035372189361, "learning_rate": 1.396635082502229e-05, "loss": 1.3733, "step": 1403 }, { "epoch": 1.1368421052631579, "grad_norm": 1.2124240532342052, "learning_rate": 1.3958459052178175e-05, "loss": 1.3841, "step": 1404 }, { "epoch": 1.1376518218623481, "grad_norm": 1.2078175074517925, "learning_rate": 1.3950564354837512e-05, "loss": 1.3062, "step": 1405 }, { "epoch": 1.1384615384615384, "grad_norm": 1.2235438631380933, "learning_rate": 1.3942666738832879e-05, "loss": 1.3788, "step": 1406 }, { "epoch": 1.1392712550607287, "grad_norm": 1.2535687484192757, "learning_rate": 1.3934766209999012e-05, "loss": 1.32, "step": 1407 }, { "epoch": 1.140080971659919, "grad_norm": 1.1792494735598216, "learning_rate": 1.3926862774172789e-05, "loss": 1.3501, "step": 1408 }, { "epoch": 1.1408906882591092, "grad_norm": 1.2550576055364107, "learning_rate": 1.391895643719325e-05, "loss": 1.3685, "step": 1409 }, { "epoch": 1.1417004048582995, "grad_norm": 1.187775539751402, "learning_rate": 1.391104720490156e-05, "loss": 1.3756, "step": 1410 }, { "epoch": 1.14251012145749, "grad_norm": 1.1716789792191453, "learning_rate": 1.3903135083141046e-05, "loss": 1.3657, "step": 1411 }, { "epoch": 1.1433198380566802, "grad_norm": 1.1878226281741322, "learning_rate": 1.389522007775715e-05, "loss": 1.3513, "step": 1412 }, { "epoch": 1.1441295546558705, "grad_norm": 1.172121127796374, "learning_rate": 1.3887302194597455e-05, "loss": 1.3747, "step": 1413 }, { "epoch": 1.1449392712550608, "grad_norm": 1.1977041128005224, "learning_rate": 1.3879381439511664e-05, "loss": 1.3213, "step": 1414 }, { "epoch": 1.145748987854251, "grad_norm": 1.1951668935892397, "learning_rate": 1.387145781835161e-05, "loss": 1.3693, "step": 1415 }, { "epoch": 1.1465587044534413, "grad_norm": 1.2425273987483105, "learning_rate": 1.3863531336971231e-05, "loss": 1.3388, "step": 1416 }, { "epoch": 1.1473684210526316, "grad_norm": 1.2738201413338597, "learning_rate": 1.3855602001226596e-05, "loss": 1.4744, "step": 1417 }, { "epoch": 1.1481781376518219, "grad_norm": 1.2067856613703325, "learning_rate": 1.384766981697586e-05, "loss": 1.3698, "step": 1418 }, { "epoch": 1.1489878542510121, "grad_norm": 1.2309122591312711, "learning_rate": 1.3839734790079304e-05, "loss": 1.4008, "step": 1419 }, { "epoch": 1.1497975708502024, "grad_norm": 1.1924268380366414, "learning_rate": 1.3831796926399295e-05, "loss": 1.4003, "step": 1420 }, { "epoch": 1.1506072874493927, "grad_norm": 1.252471702177511, "learning_rate": 1.3823856231800301e-05, "loss": 1.3868, "step": 1421 }, { "epoch": 1.151417004048583, "grad_norm": 1.209467941700512, "learning_rate": 1.3815912712148885e-05, "loss": 1.3534, "step": 1422 }, { "epoch": 1.1522267206477732, "grad_norm": 1.1971438481581644, "learning_rate": 1.3807966373313689e-05, "loss": 1.355, "step": 1423 }, { "epoch": 1.1530364372469635, "grad_norm": 1.2066657341311795, "learning_rate": 1.380001722116544e-05, "loss": 1.3531, "step": 1424 }, { "epoch": 1.1538461538461537, "grad_norm": 1.1713117606005494, "learning_rate": 1.3792065261576953e-05, "loss": 1.3873, "step": 1425 }, { "epoch": 1.1546558704453442, "grad_norm": 1.2248765090877878, "learning_rate": 1.3784110500423104e-05, "loss": 1.3402, "step": 1426 }, { "epoch": 1.1554655870445345, "grad_norm": 1.2378827680661408, "learning_rate": 1.3776152943580846e-05, "loss": 1.2761, "step": 1427 }, { "epoch": 1.1562753036437248, "grad_norm": 1.1711500146006906, "learning_rate": 1.3768192596929195e-05, "loss": 1.377, "step": 1428 }, { "epoch": 1.157085020242915, "grad_norm": 1.2549196705935586, "learning_rate": 1.376022946634923e-05, "loss": 1.4122, "step": 1429 }, { "epoch": 1.1578947368421053, "grad_norm": 1.221604782084902, "learning_rate": 1.3752263557724088e-05, "loss": 1.4329, "step": 1430 }, { "epoch": 1.1587044534412956, "grad_norm": 1.244326599131787, "learning_rate": 1.3744294876938953e-05, "loss": 1.4161, "step": 1431 }, { "epoch": 1.1595141700404858, "grad_norm": 1.1776858381109803, "learning_rate": 1.3736323429881056e-05, "loss": 1.411, "step": 1432 }, { "epoch": 1.1603238866396761, "grad_norm": 1.2341697465268895, "learning_rate": 1.3728349222439682e-05, "loss": 1.3794, "step": 1433 }, { "epoch": 1.1611336032388664, "grad_norm": 1.2025715980532212, "learning_rate": 1.3720372260506152e-05, "loss": 1.4168, "step": 1434 }, { "epoch": 1.1619433198380567, "grad_norm": 1.2316761185815908, "learning_rate": 1.3712392549973814e-05, "loss": 1.4173, "step": 1435 }, { "epoch": 1.162753036437247, "grad_norm": 1.1692638075647215, "learning_rate": 1.370441009673805e-05, "loss": 1.3268, "step": 1436 }, { "epoch": 1.1635627530364372, "grad_norm": 1.2081335987676685, "learning_rate": 1.3696424906696275e-05, "loss": 1.3603, "step": 1437 }, { "epoch": 1.1643724696356275, "grad_norm": 1.2286436380651753, "learning_rate": 1.3688436985747922e-05, "loss": 1.398, "step": 1438 }, { "epoch": 1.1651821862348177, "grad_norm": 1.2708424368976516, "learning_rate": 1.3680446339794436e-05, "loss": 1.36, "step": 1439 }, { "epoch": 1.165991902834008, "grad_norm": 1.2883548798199778, "learning_rate": 1.3672452974739278e-05, "loss": 1.4225, "step": 1440 }, { "epoch": 1.1668016194331985, "grad_norm": 1.2173917634376092, "learning_rate": 1.366445689648793e-05, "loss": 1.315, "step": 1441 }, { "epoch": 1.1676113360323888, "grad_norm": 1.242957617527675, "learning_rate": 1.3656458110947864e-05, "loss": 1.3515, "step": 1442 }, { "epoch": 1.168421052631579, "grad_norm": 1.2488134934449235, "learning_rate": 1.364845662402855e-05, "loss": 1.3045, "step": 1443 }, { "epoch": 1.1692307692307693, "grad_norm": 1.2204335700413085, "learning_rate": 1.3640452441641466e-05, "loss": 1.396, "step": 1444 }, { "epoch": 1.1700404858299596, "grad_norm": 1.28192971889865, "learning_rate": 1.3632445569700078e-05, "loss": 1.3832, "step": 1445 }, { "epoch": 1.1708502024291498, "grad_norm": 1.244268004826121, "learning_rate": 1.362443601411983e-05, "loss": 1.3672, "step": 1446 }, { "epoch": 1.17165991902834, "grad_norm": 1.155976566029991, "learning_rate": 1.361642378081816e-05, "loss": 1.2883, "step": 1447 }, { "epoch": 1.1724696356275304, "grad_norm": 1.2611118512015467, "learning_rate": 1.3608408875714478e-05, "loss": 1.3181, "step": 1448 }, { "epoch": 1.1732793522267206, "grad_norm": 1.1853603357638687, "learning_rate": 1.3600391304730174e-05, "loss": 1.3431, "step": 1449 }, { "epoch": 1.174089068825911, "grad_norm": 1.1956700418684363, "learning_rate": 1.3592371073788595e-05, "loss": 1.4223, "step": 1450 }, { "epoch": 1.1748987854251012, "grad_norm": 1.2850373149613217, "learning_rate": 1.3584348188815066e-05, "loss": 1.4498, "step": 1451 }, { "epoch": 1.1757085020242914, "grad_norm": 1.193949378852388, "learning_rate": 1.357632265573687e-05, "loss": 1.3437, "step": 1452 }, { "epoch": 1.1765182186234817, "grad_norm": 1.1874525953583595, "learning_rate": 1.356829448048324e-05, "loss": 1.3978, "step": 1453 }, { "epoch": 1.177327935222672, "grad_norm": 1.194001231503012, "learning_rate": 1.3560263668985366e-05, "loss": 1.2924, "step": 1454 }, { "epoch": 1.1781376518218623, "grad_norm": 1.1791770992975843, "learning_rate": 1.355223022717639e-05, "loss": 1.3411, "step": 1455 }, { "epoch": 1.1789473684210527, "grad_norm": 1.2243559430427213, "learning_rate": 1.3544194160991388e-05, "loss": 1.4058, "step": 1456 }, { "epoch": 1.1797570850202428, "grad_norm": 1.1894939791550536, "learning_rate": 1.353615547636738e-05, "loss": 1.3892, "step": 1457 }, { "epoch": 1.1805668016194333, "grad_norm": 1.2776607461433331, "learning_rate": 1.3528114179243322e-05, "loss": 1.3352, "step": 1458 }, { "epoch": 1.1813765182186236, "grad_norm": 1.201017466707103, "learning_rate": 1.3520070275560093e-05, "loss": 1.3605, "step": 1459 }, { "epoch": 1.1821862348178138, "grad_norm": 1.2221052297543349, "learning_rate": 1.3512023771260507e-05, "loss": 1.3831, "step": 1460 }, { "epoch": 1.182995951417004, "grad_norm": 1.2535433317816922, "learning_rate": 1.3503974672289295e-05, "loss": 1.4064, "step": 1461 }, { "epoch": 1.1838056680161944, "grad_norm": 1.25001419317799, "learning_rate": 1.3495922984593101e-05, "loss": 1.354, "step": 1462 }, { "epoch": 1.1846153846153846, "grad_norm": 1.2089559561561227, "learning_rate": 1.3487868714120494e-05, "loss": 1.3245, "step": 1463 }, { "epoch": 1.185425101214575, "grad_norm": 1.1679267338188017, "learning_rate": 1.347981186682193e-05, "loss": 1.4321, "step": 1464 }, { "epoch": 1.1862348178137652, "grad_norm": 1.2411376067872044, "learning_rate": 1.347175244864979e-05, "loss": 1.4119, "step": 1465 }, { "epoch": 1.1870445344129554, "grad_norm": 1.2595186312439597, "learning_rate": 1.3463690465558346e-05, "loss": 1.3369, "step": 1466 }, { "epoch": 1.1878542510121457, "grad_norm": 1.1909117279605281, "learning_rate": 1.3455625923503762e-05, "loss": 1.3863, "step": 1467 }, { "epoch": 1.188663967611336, "grad_norm": 1.2054369811230554, "learning_rate": 1.344755882844409e-05, "loss": 1.3767, "step": 1468 }, { "epoch": 1.1894736842105262, "grad_norm": 1.2323426762141974, "learning_rate": 1.3439489186339283e-05, "loss": 1.377, "step": 1469 }, { "epoch": 1.1902834008097165, "grad_norm": 1.2454039577692348, "learning_rate": 1.3431417003151162e-05, "loss": 1.3446, "step": 1470 }, { "epoch": 1.191093117408907, "grad_norm": 1.2587301942793938, "learning_rate": 1.3423342284843428e-05, "loss": 1.3762, "step": 1471 }, { "epoch": 1.191902834008097, "grad_norm": 1.2722119186788876, "learning_rate": 1.3415265037381657e-05, "loss": 1.3654, "step": 1472 }, { "epoch": 1.1927125506072875, "grad_norm": 1.2805355921406205, "learning_rate": 1.3407185266733294e-05, "loss": 1.3376, "step": 1473 }, { "epoch": 1.1935222672064778, "grad_norm": 1.2081785962123999, "learning_rate": 1.3399102978867648e-05, "loss": 1.4041, "step": 1474 }, { "epoch": 1.194331983805668, "grad_norm": 1.1925409611242321, "learning_rate": 1.3391018179755886e-05, "loss": 1.3684, "step": 1475 }, { "epoch": 1.1951417004048583, "grad_norm": 1.227526446454291, "learning_rate": 1.3382930875371028e-05, "loss": 1.4175, "step": 1476 }, { "epoch": 1.1959514170040486, "grad_norm": 1.1817699551420382, "learning_rate": 1.3374841071687949e-05, "loss": 1.3924, "step": 1477 }, { "epoch": 1.1967611336032389, "grad_norm": 1.1756054088231052, "learning_rate": 1.3366748774683376e-05, "loss": 1.357, "step": 1478 }, { "epoch": 1.1975708502024291, "grad_norm": 1.222659746027629, "learning_rate": 1.3358653990335863e-05, "loss": 1.3636, "step": 1479 }, { "epoch": 1.1983805668016194, "grad_norm": 1.2400190458109341, "learning_rate": 1.3350556724625809e-05, "loss": 1.3603, "step": 1480 }, { "epoch": 1.1991902834008097, "grad_norm": 1.2088314629672083, "learning_rate": 1.3342456983535457e-05, "loss": 1.416, "step": 1481 }, { "epoch": 1.2, "grad_norm": 1.2142465353436538, "learning_rate": 1.3334354773048863e-05, "loss": 1.3035, "step": 1482 }, { "epoch": 1.2008097165991902, "grad_norm": 1.1986889230221165, "learning_rate": 1.3326250099151911e-05, "loss": 1.3957, "step": 1483 }, { "epoch": 1.2016194331983805, "grad_norm": 1.2413091007676673, "learning_rate": 1.331814296783231e-05, "loss": 1.3504, "step": 1484 }, { "epoch": 1.2024291497975708, "grad_norm": 1.2256638183999424, "learning_rate": 1.3310033385079589e-05, "loss": 1.3678, "step": 1485 }, { "epoch": 1.2032388663967613, "grad_norm": 1.2400693672871794, "learning_rate": 1.330192135688507e-05, "loss": 1.4066, "step": 1486 }, { "epoch": 1.2040485829959513, "grad_norm": 1.2362345111970168, "learning_rate": 1.3293806889241898e-05, "loss": 1.3888, "step": 1487 }, { "epoch": 1.2048582995951418, "grad_norm": 1.2587009847578705, "learning_rate": 1.3285689988145011e-05, "loss": 1.4359, "step": 1488 }, { "epoch": 1.205668016194332, "grad_norm": 1.1914663304562276, "learning_rate": 1.3277570659591159e-05, "loss": 1.3391, "step": 1489 }, { "epoch": 1.2064777327935223, "grad_norm": 1.1834896782723838, "learning_rate": 1.3269448909578866e-05, "loss": 1.4029, "step": 1490 }, { "epoch": 1.2072874493927126, "grad_norm": 1.1813018458720312, "learning_rate": 1.3261324744108454e-05, "loss": 1.415, "step": 1491 }, { "epoch": 1.2080971659919029, "grad_norm": 1.1818375702199966, "learning_rate": 1.3253198169182033e-05, "loss": 1.3473, "step": 1492 }, { "epoch": 1.2089068825910931, "grad_norm": 1.194541380193379, "learning_rate": 1.3245069190803495e-05, "loss": 1.3753, "step": 1493 }, { "epoch": 1.2097165991902834, "grad_norm": 1.2134975986163172, "learning_rate": 1.3236937814978493e-05, "loss": 1.3514, "step": 1494 }, { "epoch": 1.2105263157894737, "grad_norm": 1.1847950487403296, "learning_rate": 1.3228804047714462e-05, "loss": 1.3309, "step": 1495 }, { "epoch": 1.211336032388664, "grad_norm": 1.1484211925319983, "learning_rate": 1.322066789502061e-05, "loss": 1.3631, "step": 1496 }, { "epoch": 1.2121457489878542, "grad_norm": 1.2388971531409982, "learning_rate": 1.3212529362907894e-05, "loss": 1.3734, "step": 1497 }, { "epoch": 1.2129554655870445, "grad_norm": 1.247111008797553, "learning_rate": 1.3204388457389032e-05, "loss": 1.3013, "step": 1498 }, { "epoch": 1.2137651821862347, "grad_norm": 1.2555192712705792, "learning_rate": 1.319624518447851e-05, "loss": 1.3507, "step": 1499 }, { "epoch": 1.214574898785425, "grad_norm": 1.3054090163651753, "learning_rate": 1.3188099550192537e-05, "loss": 1.3748, "step": 1500 }, { "epoch": 1.2153846153846155, "grad_norm": 1.250883089302353, "learning_rate": 1.317995156054909e-05, "loss": 1.3597, "step": 1501 }, { "epoch": 1.2161943319838056, "grad_norm": 1.2376049269488396, "learning_rate": 1.3171801221567872e-05, "loss": 1.3149, "step": 1502 }, { "epoch": 1.217004048582996, "grad_norm": 1.1862482520157116, "learning_rate": 1.3163648539270333e-05, "loss": 1.4034, "step": 1503 }, { "epoch": 1.2178137651821863, "grad_norm": 1.26463939374948, "learning_rate": 1.315549351967964e-05, "loss": 1.3365, "step": 1504 }, { "epoch": 1.2186234817813766, "grad_norm": 1.2437748606370247, "learning_rate": 1.31473361688207e-05, "loss": 1.4052, "step": 1505 }, { "epoch": 1.2194331983805669, "grad_norm": 1.2684663340272135, "learning_rate": 1.3139176492720137e-05, "loss": 1.3346, "step": 1506 }, { "epoch": 1.2202429149797571, "grad_norm": 1.2311472588728176, "learning_rate": 1.3131014497406288e-05, "loss": 1.3399, "step": 1507 }, { "epoch": 1.2210526315789474, "grad_norm": 1.262810193750128, "learning_rate": 1.3122850188909216e-05, "loss": 1.3896, "step": 1508 }, { "epoch": 1.2218623481781377, "grad_norm": 1.227527399226647, "learning_rate": 1.3114683573260677e-05, "loss": 1.408, "step": 1509 }, { "epoch": 1.222672064777328, "grad_norm": 1.2375397432956372, "learning_rate": 1.3106514656494147e-05, "loss": 1.3952, "step": 1510 }, { "epoch": 1.2234817813765182, "grad_norm": 1.1550935938629225, "learning_rate": 1.3098343444644793e-05, "loss": 1.3888, "step": 1511 }, { "epoch": 1.2242914979757085, "grad_norm": 1.1942656359224868, "learning_rate": 1.3090169943749475e-05, "loss": 1.3832, "step": 1512 }, { "epoch": 1.2251012145748987, "grad_norm": 1.1727975184000596, "learning_rate": 1.3081994159846753e-05, "loss": 1.3812, "step": 1513 }, { "epoch": 1.225910931174089, "grad_norm": 1.197178362301058, "learning_rate": 1.3073816098976871e-05, "loss": 1.3731, "step": 1514 }, { "epoch": 1.2267206477732793, "grad_norm": 1.2142805284091027, "learning_rate": 1.3065635767181748e-05, "loss": 1.4057, "step": 1515 }, { "epoch": 1.2275303643724695, "grad_norm": 1.1711215174781089, "learning_rate": 1.3057453170504988e-05, "loss": 1.4099, "step": 1516 }, { "epoch": 1.2283400809716598, "grad_norm": 1.183153242000738, "learning_rate": 1.304926831499187e-05, "loss": 1.374, "step": 1517 }, { "epoch": 1.2291497975708503, "grad_norm": 1.2202202744850468, "learning_rate": 1.3041081206689335e-05, "loss": 1.3325, "step": 1518 }, { "epoch": 1.2299595141700406, "grad_norm": 1.1858098990909935, "learning_rate": 1.3032891851645994e-05, "loss": 1.3086, "step": 1519 }, { "epoch": 1.2307692307692308, "grad_norm": 1.230959944898959, "learning_rate": 1.302470025591211e-05, "loss": 1.3373, "step": 1520 }, { "epoch": 1.231578947368421, "grad_norm": 1.1451597949607148, "learning_rate": 1.3016506425539615e-05, "loss": 1.3721, "step": 1521 }, { "epoch": 1.2323886639676114, "grad_norm": 1.3441292904286972, "learning_rate": 1.3008310366582081e-05, "loss": 1.4799, "step": 1522 }, { "epoch": 1.2331983805668016, "grad_norm": 1.2341770886622436, "learning_rate": 1.300011208509473e-05, "loss": 1.3638, "step": 1523 }, { "epoch": 1.234008097165992, "grad_norm": 1.1915183190289644, "learning_rate": 1.2991911587134416e-05, "loss": 1.4123, "step": 1524 }, { "epoch": 1.2348178137651822, "grad_norm": 1.2162901503483015, "learning_rate": 1.2983708878759655e-05, "loss": 1.4574, "step": 1525 }, { "epoch": 1.2356275303643725, "grad_norm": 1.2621342134046012, "learning_rate": 1.2975503966030574e-05, "loss": 1.3648, "step": 1526 }, { "epoch": 1.2364372469635627, "grad_norm": 1.1896912894693648, "learning_rate": 1.2967296855008932e-05, "loss": 1.3118, "step": 1527 }, { "epoch": 1.237246963562753, "grad_norm": 1.2023777230870145, "learning_rate": 1.2959087551758121e-05, "loss": 1.4103, "step": 1528 }, { "epoch": 1.2380566801619433, "grad_norm": 1.228550678791859, "learning_rate": 1.2950876062343147e-05, "loss": 1.3102, "step": 1529 }, { "epoch": 1.2388663967611335, "grad_norm": 1.2650925867678326, "learning_rate": 1.2942662392830632e-05, "loss": 1.388, "step": 1530 }, { "epoch": 1.2396761133603238, "grad_norm": 1.1719459344716467, "learning_rate": 1.2934446549288801e-05, "loss": 1.3515, "step": 1531 }, { "epoch": 1.240485829959514, "grad_norm": 1.2442167711303942, "learning_rate": 1.2926228537787498e-05, "loss": 1.3685, "step": 1532 }, { "epoch": 1.2412955465587046, "grad_norm": 1.192558403731443, "learning_rate": 1.2918008364398164e-05, "loss": 1.3777, "step": 1533 }, { "epoch": 1.2421052631578948, "grad_norm": 1.187107614370798, "learning_rate": 1.2909786035193836e-05, "loss": 1.444, "step": 1534 }, { "epoch": 1.242914979757085, "grad_norm": 1.2441337176267466, "learning_rate": 1.290156155624914e-05, "loss": 1.3631, "step": 1535 }, { "epoch": 1.2437246963562754, "grad_norm": 1.174168892326793, "learning_rate": 1.2893334933640296e-05, "loss": 1.3321, "step": 1536 }, { "epoch": 1.2445344129554656, "grad_norm": 1.208004365868136, "learning_rate": 1.2885106173445108e-05, "loss": 1.363, "step": 1537 }, { "epoch": 1.245344129554656, "grad_norm": 1.1950930591538556, "learning_rate": 1.287687528174295e-05, "loss": 1.3179, "step": 1538 }, { "epoch": 1.2461538461538462, "grad_norm": 1.1790688919955996, "learning_rate": 1.2868642264614787e-05, "loss": 1.4061, "step": 1539 }, { "epoch": 1.2469635627530364, "grad_norm": 1.1987689806967812, "learning_rate": 1.286040712814314e-05, "loss": 1.3569, "step": 1540 }, { "epoch": 1.2477732793522267, "grad_norm": 1.1817370751262748, "learning_rate": 1.2852169878412102e-05, "loss": 1.3369, "step": 1541 }, { "epoch": 1.248582995951417, "grad_norm": 1.2282449878735011, "learning_rate": 1.2843930521507324e-05, "loss": 1.3283, "step": 1542 }, { "epoch": 1.2493927125506072, "grad_norm": 1.26597563474877, "learning_rate": 1.2835689063516019e-05, "loss": 1.4247, "step": 1543 }, { "epoch": 1.2502024291497975, "grad_norm": 1.2159532514071452, "learning_rate": 1.2827445510526945e-05, "loss": 1.3501, "step": 1544 }, { "epoch": 1.2510121457489878, "grad_norm": 1.2015354671386715, "learning_rate": 1.2819199868630419e-05, "loss": 1.4067, "step": 1545 }, { "epoch": 1.2518218623481783, "grad_norm": 1.1488073343742868, "learning_rate": 1.2810952143918284e-05, "loss": 1.3881, "step": 1546 }, { "epoch": 1.2526315789473683, "grad_norm": 1.1469365945891747, "learning_rate": 1.2802702342483941e-05, "loss": 1.2904, "step": 1547 }, { "epoch": 1.2534412955465588, "grad_norm": 1.2210656858397426, "learning_rate": 1.279445047042231e-05, "loss": 1.3727, "step": 1548 }, { "epoch": 1.2542510121457489, "grad_norm": 1.199892214891525, "learning_rate": 1.278619653382985e-05, "loss": 1.4002, "step": 1549 }, { "epoch": 1.2550607287449393, "grad_norm": 1.233459919301486, "learning_rate": 1.2777940538804545e-05, "loss": 1.4107, "step": 1550 }, { "epoch": 1.2558704453441296, "grad_norm": 1.147765149659166, "learning_rate": 1.2769682491445892e-05, "loss": 1.3451, "step": 1551 }, { "epoch": 1.2566801619433199, "grad_norm": 1.1348340061961697, "learning_rate": 1.276142239785491e-05, "loss": 1.2995, "step": 1552 }, { "epoch": 1.2574898785425102, "grad_norm": 1.2149990877087464, "learning_rate": 1.275316026413413e-05, "loss": 1.2791, "step": 1553 }, { "epoch": 1.2582995951417004, "grad_norm": 1.1996055435289459, "learning_rate": 1.274489609638759e-05, "loss": 1.3854, "step": 1554 }, { "epoch": 1.2591093117408907, "grad_norm": 1.2048053640102194, "learning_rate": 1.2736629900720832e-05, "loss": 1.3213, "step": 1555 }, { "epoch": 1.259919028340081, "grad_norm": 1.1862492357917775, "learning_rate": 1.2728361683240889e-05, "loss": 1.4001, "step": 1556 }, { "epoch": 1.2607287449392712, "grad_norm": 1.246276511809421, "learning_rate": 1.2720091450056293e-05, "loss": 1.3218, "step": 1557 }, { "epoch": 1.2615384615384615, "grad_norm": 1.223220081058916, "learning_rate": 1.2711819207277071e-05, "loss": 1.356, "step": 1558 }, { "epoch": 1.2623481781376518, "grad_norm": 1.2502562370531392, "learning_rate": 1.2703544961014727e-05, "loss": 1.3222, "step": 1559 }, { "epoch": 1.263157894736842, "grad_norm": 1.224324008996058, "learning_rate": 1.2695268717382242e-05, "loss": 1.3478, "step": 1560 }, { "epoch": 1.2639676113360325, "grad_norm": 1.2105633319160907, "learning_rate": 1.268699048249408e-05, "loss": 1.4391, "step": 1561 }, { "epoch": 1.2647773279352226, "grad_norm": 1.2837901936738587, "learning_rate": 1.2678710262466178e-05, "loss": 1.4009, "step": 1562 }, { "epoch": 1.265587044534413, "grad_norm": 1.2548245764229349, "learning_rate": 1.2670428063415932e-05, "loss": 1.4045, "step": 1563 }, { "epoch": 1.266396761133603, "grad_norm": 1.1561209573769804, "learning_rate": 1.26621438914622e-05, "loss": 1.3793, "step": 1564 }, { "epoch": 1.2672064777327936, "grad_norm": 1.1691572154518026, "learning_rate": 1.2653857752725305e-05, "loss": 1.3663, "step": 1565 }, { "epoch": 1.2680161943319839, "grad_norm": 1.2306512954942797, "learning_rate": 1.2645569653327024e-05, "loss": 1.3454, "step": 1566 }, { "epoch": 1.2688259109311741, "grad_norm": 1.1802175610787244, "learning_rate": 1.2637279599390569e-05, "loss": 1.369, "step": 1567 }, { "epoch": 1.2696356275303644, "grad_norm": 1.1959566114951496, "learning_rate": 1.2628987597040605e-05, "loss": 1.3326, "step": 1568 }, { "epoch": 1.2704453441295547, "grad_norm": 1.2258150541786852, "learning_rate": 1.2620693652403241e-05, "loss": 1.4098, "step": 1569 }, { "epoch": 1.271255060728745, "grad_norm": 1.2413950001716263, "learning_rate": 1.2612397771606015e-05, "loss": 1.328, "step": 1570 }, { "epoch": 1.2720647773279352, "grad_norm": 1.1898669800873198, "learning_rate": 1.2604099960777896e-05, "loss": 1.3843, "step": 1571 }, { "epoch": 1.2728744939271255, "grad_norm": 1.2359955799975981, "learning_rate": 1.2595800226049277e-05, "loss": 1.4085, "step": 1572 }, { "epoch": 1.2736842105263158, "grad_norm": 1.2178561670112904, "learning_rate": 1.258749857355198e-05, "loss": 1.2919, "step": 1573 }, { "epoch": 1.274493927125506, "grad_norm": 1.2042766779705247, "learning_rate": 1.2579195009419234e-05, "loss": 1.3142, "step": 1574 }, { "epoch": 1.2753036437246963, "grad_norm": 1.2110686014257106, "learning_rate": 1.2570889539785683e-05, "loss": 1.3382, "step": 1575 }, { "epoch": 1.2761133603238866, "grad_norm": 1.2052344706519602, "learning_rate": 1.2562582170787385e-05, "loss": 1.4298, "step": 1576 }, { "epoch": 1.2769230769230768, "grad_norm": 1.243022267671595, "learning_rate": 1.2554272908561798e-05, "loss": 1.3593, "step": 1577 }, { "epoch": 1.2777327935222673, "grad_norm": 1.2033161377378727, "learning_rate": 1.2545961759247775e-05, "loss": 1.3927, "step": 1578 }, { "epoch": 1.2785425101214574, "grad_norm": 1.2330632054756852, "learning_rate": 1.2537648728985565e-05, "loss": 1.4024, "step": 1579 }, { "epoch": 1.2793522267206479, "grad_norm": 1.201478870274986, "learning_rate": 1.2529333823916807e-05, "loss": 1.4012, "step": 1580 }, { "epoch": 1.2801619433198381, "grad_norm": 1.1664964260293886, "learning_rate": 1.2521017050184531e-05, "loss": 1.3771, "step": 1581 }, { "epoch": 1.2809716599190284, "grad_norm": 1.180036690598496, "learning_rate": 1.251269841393314e-05, "loss": 1.3307, "step": 1582 }, { "epoch": 1.2817813765182187, "grad_norm": 1.2026900475737514, "learning_rate": 1.2504377921308408e-05, "loss": 1.3842, "step": 1583 }, { "epoch": 1.282591093117409, "grad_norm": 1.2152832958337092, "learning_rate": 1.2496055578457496e-05, "loss": 1.323, "step": 1584 }, { "epoch": 1.2834008097165992, "grad_norm": 1.1654873586192795, "learning_rate": 1.2487731391528919e-05, "loss": 1.3791, "step": 1585 }, { "epoch": 1.2842105263157895, "grad_norm": 1.2189615995410232, "learning_rate": 1.2479405366672562e-05, "loss": 1.4297, "step": 1586 }, { "epoch": 1.2850202429149797, "grad_norm": 1.2231303739404338, "learning_rate": 1.2471077510039665e-05, "loss": 1.4837, "step": 1587 }, { "epoch": 1.28582995951417, "grad_norm": 1.1937220120755236, "learning_rate": 1.2462747827782818e-05, "loss": 1.3918, "step": 1588 }, { "epoch": 1.2866396761133603, "grad_norm": 1.263374882520897, "learning_rate": 1.2454416326055964e-05, "loss": 1.4037, "step": 1589 }, { "epoch": 1.2874493927125505, "grad_norm": 1.2138622497968994, "learning_rate": 1.2446083011014389e-05, "loss": 1.3822, "step": 1590 }, { "epoch": 1.2882591093117408, "grad_norm": 1.1753589986143762, "learning_rate": 1.2437747888814722e-05, "loss": 1.3099, "step": 1591 }, { "epoch": 1.289068825910931, "grad_norm": 1.195846367618967, "learning_rate": 1.242941096561492e-05, "loss": 1.3619, "step": 1592 }, { "epoch": 1.2898785425101216, "grad_norm": 1.1675847607196994, "learning_rate": 1.2421072247574277e-05, "loss": 1.3881, "step": 1593 }, { "epoch": 1.2906882591093116, "grad_norm": 1.2109119056603415, "learning_rate": 1.2412731740853405e-05, "loss": 1.3412, "step": 1594 }, { "epoch": 1.291497975708502, "grad_norm": 1.1465692161369532, "learning_rate": 1.2404389451614253e-05, "loss": 1.3789, "step": 1595 }, { "epoch": 1.2923076923076924, "grad_norm": 1.1968403107276433, "learning_rate": 1.2396045386020066e-05, "loss": 1.3803, "step": 1596 }, { "epoch": 1.2931174089068826, "grad_norm": 1.3220159374383795, "learning_rate": 1.2387699550235419e-05, "loss": 1.3292, "step": 1597 }, { "epoch": 1.293927125506073, "grad_norm": 1.1851391120160597, "learning_rate": 1.2379351950426188e-05, "loss": 1.388, "step": 1598 }, { "epoch": 1.2947368421052632, "grad_norm": 1.216804570437914, "learning_rate": 1.2371002592759553e-05, "loss": 1.3748, "step": 1599 }, { "epoch": 1.2955465587044535, "grad_norm": 1.3015331306057583, "learning_rate": 1.2362651483403985e-05, "loss": 1.3344, "step": 1600 }, { "epoch": 1.2963562753036437, "grad_norm": 1.1871910644240773, "learning_rate": 1.2354298628529263e-05, "loss": 1.3712, "step": 1601 }, { "epoch": 1.297165991902834, "grad_norm": 1.2506226744247222, "learning_rate": 1.2345944034306447e-05, "loss": 1.3352, "step": 1602 }, { "epoch": 1.2979757085020243, "grad_norm": 1.222415145371624, "learning_rate": 1.2337587706907885e-05, "loss": 1.4298, "step": 1603 }, { "epoch": 1.2987854251012145, "grad_norm": 1.2046074904547404, "learning_rate": 1.2329229652507199e-05, "loss": 1.3016, "step": 1604 }, { "epoch": 1.2995951417004048, "grad_norm": 1.1789481943544442, "learning_rate": 1.2320869877279297e-05, "loss": 1.4301, "step": 1605 }, { "epoch": 1.300404858299595, "grad_norm": 1.2312716488423778, "learning_rate": 1.2312508387400356e-05, "loss": 1.3756, "step": 1606 }, { "epoch": 1.3012145748987853, "grad_norm": 1.1876391454771218, "learning_rate": 1.230414518904781e-05, "loss": 1.347, "step": 1607 }, { "epoch": 1.3020242914979758, "grad_norm": 1.1818075707743736, "learning_rate": 1.2295780288400365e-05, "loss": 1.2704, "step": 1608 }, { "epoch": 1.3028340080971659, "grad_norm": 1.230394929800309, "learning_rate": 1.2287413691637986e-05, "loss": 1.4056, "step": 1609 }, { "epoch": 1.3036437246963564, "grad_norm": 1.2093149673925736, "learning_rate": 1.2279045404941883e-05, "loss": 1.355, "step": 1610 }, { "epoch": 1.3044534412955466, "grad_norm": 1.2368989128597225, "learning_rate": 1.2270675434494523e-05, "loss": 1.3272, "step": 1611 }, { "epoch": 1.305263157894737, "grad_norm": 1.2457418382739227, "learning_rate": 1.2262303786479603e-05, "loss": 1.3158, "step": 1612 }, { "epoch": 1.3060728744939272, "grad_norm": 1.193000113626177, "learning_rate": 1.2253930467082082e-05, "loss": 1.3006, "step": 1613 }, { "epoch": 1.3068825910931174, "grad_norm": 1.177523460512185, "learning_rate": 1.2245555482488134e-05, "loss": 1.4537, "step": 1614 }, { "epoch": 1.3076923076923077, "grad_norm": 1.20594416334784, "learning_rate": 1.2237178838885168e-05, "loss": 1.3691, "step": 1615 }, { "epoch": 1.308502024291498, "grad_norm": 1.2486014445170293, "learning_rate": 1.2228800542461828e-05, "loss": 1.333, "step": 1616 }, { "epoch": 1.3093117408906882, "grad_norm": 1.183684994706002, "learning_rate": 1.2220420599407965e-05, "loss": 1.3391, "step": 1617 }, { "epoch": 1.3101214574898785, "grad_norm": 1.2014854800059243, "learning_rate": 1.2212039015914656e-05, "loss": 1.3952, "step": 1618 }, { "epoch": 1.3109311740890688, "grad_norm": 1.2525560938912015, "learning_rate": 1.2203655798174188e-05, "loss": 1.3096, "step": 1619 }, { "epoch": 1.311740890688259, "grad_norm": 1.1545428943491032, "learning_rate": 1.2195270952380052e-05, "loss": 1.3919, "step": 1620 }, { "epoch": 1.3125506072874493, "grad_norm": 1.1981415320424176, "learning_rate": 1.2186884484726948e-05, "loss": 1.3424, "step": 1621 }, { "epoch": 1.3133603238866396, "grad_norm": 1.2551393649132647, "learning_rate": 1.2178496401410772e-05, "loss": 1.3471, "step": 1622 }, { "epoch": 1.31417004048583, "grad_norm": 1.2509875632018495, "learning_rate": 1.2170106708628604e-05, "loss": 1.3454, "step": 1623 }, { "epoch": 1.3149797570850201, "grad_norm": 1.2300196893904036, "learning_rate": 1.2161715412578729e-05, "loss": 1.3805, "step": 1624 }, { "epoch": 1.3157894736842106, "grad_norm": 1.1907433595568389, "learning_rate": 1.215332251946061e-05, "loss": 1.3729, "step": 1625 }, { "epoch": 1.3165991902834009, "grad_norm": 1.16306796314768, "learning_rate": 1.2144928035474886e-05, "loss": 1.314, "step": 1626 }, { "epoch": 1.3174089068825912, "grad_norm": 1.1442311854910465, "learning_rate": 1.213653196682337e-05, "loss": 1.3982, "step": 1627 }, { "epoch": 1.3182186234817814, "grad_norm": 1.2255785520913092, "learning_rate": 1.2128134319709057e-05, "loss": 1.4047, "step": 1628 }, { "epoch": 1.3190283400809717, "grad_norm": 1.144767078265005, "learning_rate": 1.21197351003361e-05, "loss": 1.3996, "step": 1629 }, { "epoch": 1.319838056680162, "grad_norm": 1.2322890116383336, "learning_rate": 1.2111334314909811e-05, "loss": 1.3791, "step": 1630 }, { "epoch": 1.3206477732793522, "grad_norm": 1.2107662604876777, "learning_rate": 1.2102931969636664e-05, "loss": 1.3275, "step": 1631 }, { "epoch": 1.3214574898785425, "grad_norm": 1.2271628627713616, "learning_rate": 1.2094528070724286e-05, "loss": 1.3068, "step": 1632 }, { "epoch": 1.3222672064777328, "grad_norm": 1.240148364284859, "learning_rate": 1.2086122624381446e-05, "loss": 1.3663, "step": 1633 }, { "epoch": 1.323076923076923, "grad_norm": 1.2055440099566974, "learning_rate": 1.2077715636818066e-05, "loss": 1.4334, "step": 1634 }, { "epoch": 1.3238866396761133, "grad_norm": 1.230867923850948, "learning_rate": 1.2069307114245197e-05, "loss": 1.3325, "step": 1635 }, { "epoch": 1.3246963562753036, "grad_norm": 1.1621119132494113, "learning_rate": 1.2060897062875027e-05, "loss": 1.3705, "step": 1636 }, { "epoch": 1.3255060728744938, "grad_norm": 1.2483246064419058, "learning_rate": 1.2052485488920877e-05, "loss": 1.3473, "step": 1637 }, { "epoch": 1.3263157894736843, "grad_norm": 1.2207617901689776, "learning_rate": 1.2044072398597188e-05, "loss": 1.3736, "step": 1638 }, { "epoch": 1.3271255060728744, "grad_norm": 1.181566130734583, "learning_rate": 1.2035657798119527e-05, "loss": 1.3351, "step": 1639 }, { "epoch": 1.3279352226720649, "grad_norm": 1.2130403891894281, "learning_rate": 1.2027241693704567e-05, "loss": 1.4527, "step": 1640 }, { "epoch": 1.3287449392712551, "grad_norm": 1.2379457536405214, "learning_rate": 1.2018824091570103e-05, "loss": 1.4002, "step": 1641 }, { "epoch": 1.3295546558704454, "grad_norm": 1.2287910940298021, "learning_rate": 1.2010404997935032e-05, "loss": 1.4004, "step": 1642 }, { "epoch": 1.3303643724696357, "grad_norm": 1.1412833841977206, "learning_rate": 1.2001984419019353e-05, "loss": 1.3346, "step": 1643 }, { "epoch": 1.331174089068826, "grad_norm": 1.15781602550099, "learning_rate": 1.1993562361044157e-05, "loss": 1.4022, "step": 1644 }, { "epoch": 1.3319838056680162, "grad_norm": 1.1570840461641474, "learning_rate": 1.1985138830231638e-05, "loss": 1.3471, "step": 1645 }, { "epoch": 1.3327935222672065, "grad_norm": 1.1957634256697691, "learning_rate": 1.1976713832805071e-05, "loss": 1.3155, "step": 1646 }, { "epoch": 1.3336032388663968, "grad_norm": 1.1870263239595638, "learning_rate": 1.1968287374988819e-05, "loss": 1.3684, "step": 1647 }, { "epoch": 1.334412955465587, "grad_norm": 1.2264800114026502, "learning_rate": 1.1959859463008316e-05, "loss": 1.3983, "step": 1648 }, { "epoch": 1.3352226720647773, "grad_norm": 1.2388487173488427, "learning_rate": 1.1951430103090079e-05, "loss": 1.3473, "step": 1649 }, { "epoch": 1.3360323886639676, "grad_norm": 1.2542679619160093, "learning_rate": 1.1942999301461694e-05, "loss": 1.2761, "step": 1650 }, { "epoch": 1.3368421052631578, "grad_norm": 1.1812219242563955, "learning_rate": 1.1934567064351802e-05, "loss": 1.3625, "step": 1651 }, { "epoch": 1.337651821862348, "grad_norm": 1.2130955096327123, "learning_rate": 1.192613339799012e-05, "loss": 1.3876, "step": 1652 }, { "epoch": 1.3384615384615386, "grad_norm": 1.261762862681965, "learning_rate": 1.1917698308607409e-05, "loss": 1.3848, "step": 1653 }, { "epoch": 1.3392712550607286, "grad_norm": 1.187112353472305, "learning_rate": 1.1909261802435485e-05, "loss": 1.3847, "step": 1654 }, { "epoch": 1.3400809716599191, "grad_norm": 1.1853951778926974, "learning_rate": 1.1900823885707216e-05, "loss": 1.3522, "step": 1655 }, { "epoch": 1.3408906882591092, "grad_norm": 1.2146857185059894, "learning_rate": 1.1892384564656499e-05, "loss": 1.3787, "step": 1656 }, { "epoch": 1.3417004048582997, "grad_norm": 1.3043886121989412, "learning_rate": 1.1883943845518282e-05, "loss": 1.3538, "step": 1657 }, { "epoch": 1.34251012145749, "grad_norm": 1.184170924694241, "learning_rate": 1.187550173452854e-05, "loss": 1.3652, "step": 1658 }, { "epoch": 1.3433198380566802, "grad_norm": 1.2197338560870996, "learning_rate": 1.1867058237924276e-05, "loss": 1.3972, "step": 1659 }, { "epoch": 1.3441295546558705, "grad_norm": 1.158381117646045, "learning_rate": 1.1858613361943518e-05, "loss": 1.3444, "step": 1660 }, { "epoch": 1.3449392712550607, "grad_norm": 1.1488130103833536, "learning_rate": 1.1850167112825316e-05, "loss": 1.3224, "step": 1661 }, { "epoch": 1.345748987854251, "grad_norm": 1.189395323195293, "learning_rate": 1.1841719496809725e-05, "loss": 1.3928, "step": 1662 }, { "epoch": 1.3465587044534413, "grad_norm": 1.2004459789438373, "learning_rate": 1.1833270520137819e-05, "loss": 1.3517, "step": 1663 }, { "epoch": 1.3473684210526315, "grad_norm": 1.204495169303041, "learning_rate": 1.182482018905167e-05, "loss": 1.4155, "step": 1664 }, { "epoch": 1.3481781376518218, "grad_norm": 1.2633604245127454, "learning_rate": 1.1816368509794365e-05, "loss": 1.4195, "step": 1665 }, { "epoch": 1.348987854251012, "grad_norm": 1.180591301251816, "learning_rate": 1.1807915488609968e-05, "loss": 1.3682, "step": 1666 }, { "epoch": 1.3497975708502024, "grad_norm": 1.1375804044964366, "learning_rate": 1.1799461131743548e-05, "loss": 1.3662, "step": 1667 }, { "epoch": 1.3506072874493928, "grad_norm": 1.1650602887537418, "learning_rate": 1.179100544544115e-05, "loss": 1.3532, "step": 1668 }, { "epoch": 1.351417004048583, "grad_norm": 1.1656597543760445, "learning_rate": 1.1782548435949814e-05, "loss": 1.3439, "step": 1669 }, { "epoch": 1.3522267206477734, "grad_norm": 1.1984145456570525, "learning_rate": 1.177409010951755e-05, "loss": 1.3584, "step": 1670 }, { "epoch": 1.3530364372469634, "grad_norm": 1.2125180037933434, "learning_rate": 1.1765630472393338e-05, "loss": 1.3553, "step": 1671 }, { "epoch": 1.353846153846154, "grad_norm": 1.278487148224879, "learning_rate": 1.1757169530827129e-05, "loss": 1.3817, "step": 1672 }, { "epoch": 1.3546558704453442, "grad_norm": 1.2618389577279896, "learning_rate": 1.1748707291069846e-05, "loss": 1.4483, "step": 1673 }, { "epoch": 1.3554655870445345, "grad_norm": 1.2332866269192326, "learning_rate": 1.1740243759373358e-05, "loss": 1.3525, "step": 1674 }, { "epoch": 1.3562753036437247, "grad_norm": 1.2162494031574367, "learning_rate": 1.1731778941990497e-05, "loss": 1.3197, "step": 1675 }, { "epoch": 1.357085020242915, "grad_norm": 1.182031395601063, "learning_rate": 1.1723312845175041e-05, "loss": 1.3705, "step": 1676 }, { "epoch": 1.3578947368421053, "grad_norm": 1.188685842481188, "learning_rate": 1.1714845475181716e-05, "loss": 1.3931, "step": 1677 }, { "epoch": 1.3587044534412955, "grad_norm": 1.216181329532205, "learning_rate": 1.1706376838266185e-05, "loss": 1.3821, "step": 1678 }, { "epoch": 1.3595141700404858, "grad_norm": 1.2693089283894199, "learning_rate": 1.169790694068505e-05, "loss": 1.3803, "step": 1679 }, { "epoch": 1.360323886639676, "grad_norm": 1.214964553877776, "learning_rate": 1.1689435788695844e-05, "loss": 1.3528, "step": 1680 }, { "epoch": 1.3611336032388663, "grad_norm": 1.2317976939891357, "learning_rate": 1.1680963388557028e-05, "loss": 1.4125, "step": 1681 }, { "epoch": 1.3619433198380566, "grad_norm": 1.2844113480291337, "learning_rate": 1.1672489746527979e-05, "loss": 1.3941, "step": 1682 }, { "epoch": 1.362753036437247, "grad_norm": 1.2279453781778626, "learning_rate": 1.1664014868869e-05, "loss": 1.3184, "step": 1683 }, { "epoch": 1.3635627530364371, "grad_norm": 1.2537576929676622, "learning_rate": 1.16555387618413e-05, "loss": 1.3569, "step": 1684 }, { "epoch": 1.3643724696356276, "grad_norm": 1.2833642997336299, "learning_rate": 1.1647061431707e-05, "loss": 1.4056, "step": 1685 }, { "epoch": 1.3651821862348177, "grad_norm": 1.235092352736005, "learning_rate": 1.1638582884729127e-05, "loss": 1.4097, "step": 1686 }, { "epoch": 1.3659919028340082, "grad_norm": 1.187634661590241, "learning_rate": 1.16301031271716e-05, "loss": 1.4244, "step": 1687 }, { "epoch": 1.3668016194331984, "grad_norm": 1.19737827161909, "learning_rate": 1.1621622165299233e-05, "loss": 1.3609, "step": 1688 }, { "epoch": 1.3676113360323887, "grad_norm": 1.2414142941245716, "learning_rate": 1.161314000537774e-05, "loss": 1.4491, "step": 1689 }, { "epoch": 1.368421052631579, "grad_norm": 1.231861969240117, "learning_rate": 1.1604656653673707e-05, "loss": 1.3283, "step": 1690 }, { "epoch": 1.3692307692307693, "grad_norm": 1.1830853058793631, "learning_rate": 1.1596172116454609e-05, "loss": 1.3827, "step": 1691 }, { "epoch": 1.3700404858299595, "grad_norm": 1.2656452551880795, "learning_rate": 1.1587686399988793e-05, "loss": 1.419, "step": 1692 }, { "epoch": 1.3708502024291498, "grad_norm": 1.2452981014868072, "learning_rate": 1.157919951054548e-05, "loss": 1.3345, "step": 1693 }, { "epoch": 1.37165991902834, "grad_norm": 1.2262829471723262, "learning_rate": 1.1570711454394759e-05, "loss": 1.3604, "step": 1694 }, { "epoch": 1.3724696356275303, "grad_norm": 1.1921863334032212, "learning_rate": 1.156222223780757e-05, "loss": 1.3358, "step": 1695 }, { "epoch": 1.3732793522267206, "grad_norm": 1.2818863761177053, "learning_rate": 1.1553731867055724e-05, "loss": 1.3675, "step": 1696 }, { "epoch": 1.3740890688259109, "grad_norm": 1.2743486168767448, "learning_rate": 1.1545240348411877e-05, "loss": 1.4062, "step": 1697 }, { "epoch": 1.3748987854251011, "grad_norm": 1.2406034474747183, "learning_rate": 1.1536747688149537e-05, "loss": 1.4102, "step": 1698 }, { "epoch": 1.3757085020242914, "grad_norm": 1.2104446295299616, "learning_rate": 1.1528253892543053e-05, "loss": 1.44, "step": 1699 }, { "epoch": 1.376518218623482, "grad_norm": 1.176123338179055, "learning_rate": 1.1519758967867608e-05, "loss": 1.4084, "step": 1700 }, { "epoch": 1.377327935222672, "grad_norm": 1.2508166417186997, "learning_rate": 1.1511262920399233e-05, "loss": 1.3585, "step": 1701 }, { "epoch": 1.3781376518218624, "grad_norm": 1.1964180815673577, "learning_rate": 1.1502765756414776e-05, "loss": 1.3996, "step": 1702 }, { "epoch": 1.3789473684210527, "grad_norm": 1.2382693764014767, "learning_rate": 1.1494267482191912e-05, "loss": 1.2927, "step": 1703 }, { "epoch": 1.379757085020243, "grad_norm": 1.346059359155808, "learning_rate": 1.1485768104009141e-05, "loss": 1.3396, "step": 1704 }, { "epoch": 1.3805668016194332, "grad_norm": 1.3195598291408253, "learning_rate": 1.1477267628145777e-05, "loss": 1.3325, "step": 1705 }, { "epoch": 1.3813765182186235, "grad_norm": 1.278714888738768, "learning_rate": 1.146876606088194e-05, "loss": 1.3388, "step": 1706 }, { "epoch": 1.3821862348178138, "grad_norm": 1.2060319063156253, "learning_rate": 1.1460263408498557e-05, "loss": 1.337, "step": 1707 }, { "epoch": 1.382995951417004, "grad_norm": 1.3011345426618277, "learning_rate": 1.1451759677277367e-05, "loss": 1.4089, "step": 1708 }, { "epoch": 1.3838056680161943, "grad_norm": 1.238105150141394, "learning_rate": 1.1443254873500897e-05, "loss": 1.3463, "step": 1709 }, { "epoch": 1.3846153846153846, "grad_norm": 1.2043600689977974, "learning_rate": 1.1434749003452467e-05, "loss": 1.4289, "step": 1710 }, { "epoch": 1.3854251012145749, "grad_norm": 1.252328542955408, "learning_rate": 1.1426242073416183e-05, "loss": 1.3714, "step": 1711 }, { "epoch": 1.3862348178137651, "grad_norm": 1.302041753695186, "learning_rate": 1.1417734089676939e-05, "loss": 1.363, "step": 1712 }, { "epoch": 1.3870445344129554, "grad_norm": 1.2218471138487301, "learning_rate": 1.140922505852041e-05, "loss": 1.3402, "step": 1713 }, { "epoch": 1.3878542510121457, "grad_norm": 1.2397585007328973, "learning_rate": 1.1400714986233035e-05, "loss": 1.3859, "step": 1714 }, { "epoch": 1.3886639676113361, "grad_norm": 1.2723411222710095, "learning_rate": 1.1392203879102027e-05, "loss": 1.4312, "step": 1715 }, { "epoch": 1.3894736842105262, "grad_norm": 1.2616308378503096, "learning_rate": 1.1383691743415364e-05, "loss": 1.3441, "step": 1716 }, { "epoch": 1.3902834008097167, "grad_norm": 1.2096630244554334, "learning_rate": 1.1375178585461788e-05, "loss": 1.3332, "step": 1717 }, { "epoch": 1.391093117408907, "grad_norm": 1.159693781497365, "learning_rate": 1.136666441153079e-05, "loss": 1.3252, "step": 1718 }, { "epoch": 1.3919028340080972, "grad_norm": 1.1854318858942796, "learning_rate": 1.1358149227912613e-05, "loss": 1.3676, "step": 1719 }, { "epoch": 1.3927125506072875, "grad_norm": 1.2233170433120137, "learning_rate": 1.1349633040898246e-05, "loss": 1.399, "step": 1720 }, { "epoch": 1.3935222672064778, "grad_norm": 1.2879065439442032, "learning_rate": 1.1341115856779423e-05, "loss": 1.4255, "step": 1721 }, { "epoch": 1.394331983805668, "grad_norm": 1.2707729718124967, "learning_rate": 1.133259768184861e-05, "loss": 1.3265, "step": 1722 }, { "epoch": 1.3951417004048583, "grad_norm": 1.236173964659415, "learning_rate": 1.1324078522399005e-05, "loss": 1.4351, "step": 1723 }, { "epoch": 1.3959514170040486, "grad_norm": 1.2153770369914687, "learning_rate": 1.1315558384724537e-05, "loss": 1.4008, "step": 1724 }, { "epoch": 1.3967611336032388, "grad_norm": 1.1899505676690036, "learning_rate": 1.1307037275119854e-05, "loss": 1.3839, "step": 1725 }, { "epoch": 1.397570850202429, "grad_norm": 1.1928149560420538, "learning_rate": 1.1298515199880327e-05, "loss": 1.3546, "step": 1726 }, { "epoch": 1.3983805668016194, "grad_norm": 1.2238669730900604, "learning_rate": 1.1289992165302036e-05, "loss": 1.2984, "step": 1727 }, { "epoch": 1.3991902834008096, "grad_norm": 1.1887889650994987, "learning_rate": 1.1281468177681767e-05, "loss": 1.3361, "step": 1728 }, { "epoch": 1.4, "grad_norm": 1.2214260231171459, "learning_rate": 1.1272943243317017e-05, "loss": 1.3368, "step": 1729 }, { "epoch": 1.4008097165991904, "grad_norm": 1.206685962166214, "learning_rate": 1.1264417368505981e-05, "loss": 1.377, "step": 1730 }, { "epoch": 1.4016194331983804, "grad_norm": 1.2113952130760868, "learning_rate": 1.1255890559547549e-05, "loss": 1.3545, "step": 1731 }, { "epoch": 1.402429149797571, "grad_norm": 1.1657293085603149, "learning_rate": 1.1247362822741292e-05, "loss": 1.3407, "step": 1732 }, { "epoch": 1.4032388663967612, "grad_norm": 1.1487734412817017, "learning_rate": 1.123883416438748e-05, "loss": 1.2937, "step": 1733 }, { "epoch": 1.4040485829959515, "grad_norm": 1.1513182139430822, "learning_rate": 1.1230304590787059e-05, "loss": 1.4015, "step": 1734 }, { "epoch": 1.4048582995951417, "grad_norm": 1.2294748713620087, "learning_rate": 1.1221774108241646e-05, "loss": 1.4282, "step": 1735 }, { "epoch": 1.405668016194332, "grad_norm": 1.2580084797725928, "learning_rate": 1.121324272305353e-05, "loss": 1.4106, "step": 1736 }, { "epoch": 1.4064777327935223, "grad_norm": 1.2246093554948188, "learning_rate": 1.1204710441525677e-05, "loss": 1.3874, "step": 1737 }, { "epoch": 1.4072874493927126, "grad_norm": 1.2289401792154757, "learning_rate": 1.119617726996171e-05, "loss": 1.4159, "step": 1738 }, { "epoch": 1.4080971659919028, "grad_norm": 1.1896362142853578, "learning_rate": 1.1187643214665905e-05, "loss": 1.3545, "step": 1739 }, { "epoch": 1.408906882591093, "grad_norm": 1.1629116832244053, "learning_rate": 1.117910828194319e-05, "loss": 1.3602, "step": 1740 }, { "epoch": 1.4097165991902834, "grad_norm": 1.2609850112703145, "learning_rate": 1.117057247809915e-05, "loss": 1.4164, "step": 1741 }, { "epoch": 1.4105263157894736, "grad_norm": 1.220342239587353, "learning_rate": 1.1162035809440005e-05, "loss": 1.3738, "step": 1742 }, { "epoch": 1.411336032388664, "grad_norm": 1.168299823561213, "learning_rate": 1.1153498282272626e-05, "loss": 1.3956, "step": 1743 }, { "epoch": 1.4121457489878542, "grad_norm": 1.1755175754663574, "learning_rate": 1.11449599029045e-05, "loss": 1.3535, "step": 1744 }, { "epoch": 1.4129554655870447, "grad_norm": 1.2143297219911169, "learning_rate": 1.1136420677643763e-05, "loss": 1.3979, "step": 1745 }, { "epoch": 1.4137651821862347, "grad_norm": 1.1767586889180381, "learning_rate": 1.1127880612799158e-05, "loss": 1.4074, "step": 1746 }, { "epoch": 1.4145748987854252, "grad_norm": 1.1817264441831037, "learning_rate": 1.1119339714680062e-05, "loss": 1.3971, "step": 1747 }, { "epoch": 1.4153846153846155, "grad_norm": 1.1851169547567846, "learning_rate": 1.111079798959646e-05, "loss": 1.3668, "step": 1748 }, { "epoch": 1.4161943319838057, "grad_norm": 1.2004222820473625, "learning_rate": 1.1102255443858953e-05, "loss": 1.3868, "step": 1749 }, { "epoch": 1.417004048582996, "grad_norm": 1.1884889082407253, "learning_rate": 1.1093712083778748e-05, "loss": 1.3551, "step": 1750 }, { "epoch": 1.4178137651821863, "grad_norm": 1.1839821452630235, "learning_rate": 1.108516791566764e-05, "loss": 1.3438, "step": 1751 }, { "epoch": 1.4186234817813765, "grad_norm": 1.2051936032476567, "learning_rate": 1.1076622945838045e-05, "loss": 1.3842, "step": 1752 }, { "epoch": 1.4194331983805668, "grad_norm": 1.2344360426305427, "learning_rate": 1.1068077180602953e-05, "loss": 1.3924, "step": 1753 }, { "epoch": 1.420242914979757, "grad_norm": 1.2188954123830977, "learning_rate": 1.1059530626275948e-05, "loss": 1.4137, "step": 1754 }, { "epoch": 1.4210526315789473, "grad_norm": 1.2359084462091137, "learning_rate": 1.1050983289171195e-05, "loss": 1.3869, "step": 1755 }, { "epoch": 1.4218623481781376, "grad_norm": 1.2213164143326687, "learning_rate": 1.1042435175603439e-05, "loss": 1.3499, "step": 1756 }, { "epoch": 1.4226720647773279, "grad_norm": 1.2425021636630276, "learning_rate": 1.1033886291888004e-05, "loss": 1.3418, "step": 1757 }, { "epoch": 1.4234817813765182, "grad_norm": 1.188641014563366, "learning_rate": 1.102533664434077e-05, "loss": 1.3177, "step": 1758 }, { "epoch": 1.4242914979757084, "grad_norm": 1.1958395164768627, "learning_rate": 1.1016786239278188e-05, "loss": 1.3896, "step": 1759 }, { "epoch": 1.425101214574899, "grad_norm": 1.1862097808375378, "learning_rate": 1.1008235083017272e-05, "loss": 1.3035, "step": 1760 }, { "epoch": 1.425910931174089, "grad_norm": 1.2172514085079498, "learning_rate": 1.0999683181875591e-05, "loss": 1.3385, "step": 1761 }, { "epoch": 1.4267206477732794, "grad_norm": 1.263795641893233, "learning_rate": 1.0991130542171255e-05, "loss": 1.3504, "step": 1762 }, { "epoch": 1.4275303643724697, "grad_norm": 1.218807375034323, "learning_rate": 1.0982577170222934e-05, "loss": 1.3728, "step": 1763 }, { "epoch": 1.42834008097166, "grad_norm": 1.2785005424913352, "learning_rate": 1.0974023072349824e-05, "loss": 1.3292, "step": 1764 }, { "epoch": 1.4291497975708503, "grad_norm": 1.1955787893390333, "learning_rate": 1.096546825487167e-05, "loss": 1.4012, "step": 1765 }, { "epoch": 1.4299595141700405, "grad_norm": 1.2472396886736037, "learning_rate": 1.0956912724108737e-05, "loss": 1.3221, "step": 1766 }, { "epoch": 1.4307692307692308, "grad_norm": 1.2548879430356208, "learning_rate": 1.0948356486381829e-05, "loss": 1.3776, "step": 1767 }, { "epoch": 1.431578947368421, "grad_norm": 1.242576432624912, "learning_rate": 1.0939799548012262e-05, "loss": 1.4025, "step": 1768 }, { "epoch": 1.4323886639676113, "grad_norm": 1.2574563622805863, "learning_rate": 1.0931241915321877e-05, "loss": 1.412, "step": 1769 }, { "epoch": 1.4331983805668016, "grad_norm": 1.1995040638978254, "learning_rate": 1.092268359463302e-05, "loss": 1.3246, "step": 1770 }, { "epoch": 1.4340080971659919, "grad_norm": 1.2739773809734911, "learning_rate": 1.0914124592268557e-05, "loss": 1.3698, "step": 1771 }, { "epoch": 1.4348178137651821, "grad_norm": 1.2199023602628671, "learning_rate": 1.0905564914551847e-05, "loss": 1.434, "step": 1772 }, { "epoch": 1.4356275303643724, "grad_norm": 1.2464548777989193, "learning_rate": 1.0897004567806754e-05, "loss": 1.4201, "step": 1773 }, { "epoch": 1.4364372469635627, "grad_norm": 1.214101570752936, "learning_rate": 1.088844355835763e-05, "loss": 1.3015, "step": 1774 }, { "epoch": 1.4372469635627532, "grad_norm": 1.163110661848742, "learning_rate": 1.0879881892529325e-05, "loss": 1.3638, "step": 1775 }, { "epoch": 1.4380566801619432, "grad_norm": 1.1936498971792766, "learning_rate": 1.0871319576647166e-05, "loss": 1.3742, "step": 1776 }, { "epoch": 1.4388663967611337, "grad_norm": 1.2451768719667269, "learning_rate": 1.0862756617036965e-05, "loss": 1.4094, "step": 1777 }, { "epoch": 1.4396761133603238, "grad_norm": 1.1478943380908433, "learning_rate": 1.085419302002501e-05, "loss": 1.3577, "step": 1778 }, { "epoch": 1.4404858299595142, "grad_norm": 1.2145161220203882, "learning_rate": 1.0845628791938058e-05, "loss": 1.3611, "step": 1779 }, { "epoch": 1.4412955465587045, "grad_norm": 1.23124963332477, "learning_rate": 1.0837063939103332e-05, "loss": 1.3967, "step": 1780 }, { "epoch": 1.4421052631578948, "grad_norm": 1.1833240982834867, "learning_rate": 1.0828498467848515e-05, "loss": 1.381, "step": 1781 }, { "epoch": 1.442914979757085, "grad_norm": 1.2492813677148087, "learning_rate": 1.0819932384501755e-05, "loss": 1.3991, "step": 1782 }, { "epoch": 1.4437246963562753, "grad_norm": 1.340019879016165, "learning_rate": 1.081136569539164e-05, "loss": 1.3585, "step": 1783 }, { "epoch": 1.4445344129554656, "grad_norm": 1.2206768859147779, "learning_rate": 1.0802798406847213e-05, "loss": 1.4541, "step": 1784 }, { "epoch": 1.4453441295546559, "grad_norm": 1.1832342780679208, "learning_rate": 1.0794230525197959e-05, "loss": 1.3752, "step": 1785 }, { "epoch": 1.4461538461538461, "grad_norm": 1.196053859850085, "learning_rate": 1.0785662056773805e-05, "loss": 1.327, "step": 1786 }, { "epoch": 1.4469635627530364, "grad_norm": 1.211397922712786, "learning_rate": 1.0777093007905102e-05, "loss": 1.4449, "step": 1787 }, { "epoch": 1.4477732793522267, "grad_norm": 1.2836918819568495, "learning_rate": 1.0768523384922635e-05, "loss": 1.342, "step": 1788 }, { "epoch": 1.448582995951417, "grad_norm": 1.2272591940165982, "learning_rate": 1.0759953194157617e-05, "loss": 1.4064, "step": 1789 }, { "epoch": 1.4493927125506074, "grad_norm": 1.2062079148051859, "learning_rate": 1.0751382441941677e-05, "loss": 1.3203, "step": 1790 }, { "epoch": 1.4502024291497975, "grad_norm": 1.116389048563294, "learning_rate": 1.0742811134606856e-05, "loss": 1.3733, "step": 1791 }, { "epoch": 1.451012145748988, "grad_norm": 1.25258756424204, "learning_rate": 1.0734239278485608e-05, "loss": 1.3817, "step": 1792 }, { "epoch": 1.451821862348178, "grad_norm": 1.208392946627262, "learning_rate": 1.0725666879910792e-05, "loss": 1.3108, "step": 1793 }, { "epoch": 1.4526315789473685, "grad_norm": 1.1948986892653393, "learning_rate": 1.071709394521567e-05, "loss": 1.4184, "step": 1794 }, { "epoch": 1.4534412955465588, "grad_norm": 1.1408273455202025, "learning_rate": 1.0708520480733895e-05, "loss": 1.4017, "step": 1795 }, { "epoch": 1.454251012145749, "grad_norm": 1.1858291893292587, "learning_rate": 1.0699946492799515e-05, "loss": 1.3898, "step": 1796 }, { "epoch": 1.4550607287449393, "grad_norm": 1.1442203734732623, "learning_rate": 1.0691371987746968e-05, "loss": 1.3862, "step": 1797 }, { "epoch": 1.4558704453441296, "grad_norm": 1.1796945509151606, "learning_rate": 1.0682796971911067e-05, "loss": 1.3721, "step": 1798 }, { "epoch": 1.4566801619433198, "grad_norm": 1.2518379890147606, "learning_rate": 1.0674221451627003e-05, "loss": 1.4382, "step": 1799 }, { "epoch": 1.45748987854251, "grad_norm": 1.1544556302617863, "learning_rate": 1.0665645433230345e-05, "loss": 1.3804, "step": 1800 }, { "epoch": 1.4582995951417004, "grad_norm": 1.2433303275893108, "learning_rate": 1.065706892305703e-05, "loss": 1.4092, "step": 1801 }, { "epoch": 1.4591093117408906, "grad_norm": 1.277806150716417, "learning_rate": 1.0648491927443352e-05, "loss": 1.3671, "step": 1802 }, { "epoch": 1.459919028340081, "grad_norm": 1.2764604355710378, "learning_rate": 1.0639914452725966e-05, "loss": 1.3823, "step": 1803 }, { "epoch": 1.4607287449392712, "grad_norm": 1.1879055573384147, "learning_rate": 1.0631336505241885e-05, "loss": 1.3828, "step": 1804 }, { "epoch": 1.4615384615384617, "grad_norm": 1.2161612489980715, "learning_rate": 1.0622758091328469e-05, "loss": 1.3319, "step": 1805 }, { "epoch": 1.4623481781376517, "grad_norm": 1.2261075654968276, "learning_rate": 1.0614179217323418e-05, "loss": 1.3419, "step": 1806 }, { "epoch": 1.4631578947368422, "grad_norm": 1.2117166223283236, "learning_rate": 1.0605599889564782e-05, "loss": 1.3463, "step": 1807 }, { "epoch": 1.4639676113360323, "grad_norm": 1.2031717859192184, "learning_rate": 1.0597020114390932e-05, "loss": 1.3635, "step": 1808 }, { "epoch": 1.4647773279352228, "grad_norm": 1.2031388198613926, "learning_rate": 1.0588439898140586e-05, "loss": 1.3451, "step": 1809 }, { "epoch": 1.465587044534413, "grad_norm": 1.3474438239487712, "learning_rate": 1.0579859247152774e-05, "loss": 1.4206, "step": 1810 }, { "epoch": 1.4663967611336033, "grad_norm": 1.2444786342836838, "learning_rate": 1.0571278167766857e-05, "loss": 1.3636, "step": 1811 }, { "epoch": 1.4672064777327936, "grad_norm": 1.219013237010411, "learning_rate": 1.0562696666322502e-05, "loss": 1.3193, "step": 1812 }, { "epoch": 1.4680161943319838, "grad_norm": 1.1690703137014555, "learning_rate": 1.05541147491597e-05, "loss": 1.315, "step": 1813 }, { "epoch": 1.468825910931174, "grad_norm": 1.2063145766276413, "learning_rate": 1.0545532422618742e-05, "loss": 1.3397, "step": 1814 }, { "epoch": 1.4696356275303644, "grad_norm": 1.2003457207424921, "learning_rate": 1.0536949693040224e-05, "loss": 1.3279, "step": 1815 }, { "epoch": 1.4704453441295546, "grad_norm": 1.2020614644434036, "learning_rate": 1.0528366566765032e-05, "loss": 1.408, "step": 1816 }, { "epoch": 1.471255060728745, "grad_norm": 1.281360311469663, "learning_rate": 1.0519783050134358e-05, "loss": 1.3833, "step": 1817 }, { "epoch": 1.4720647773279352, "grad_norm": 1.2169999342183757, "learning_rate": 1.0511199149489673e-05, "loss": 1.3173, "step": 1818 }, { "epoch": 1.4728744939271254, "grad_norm": 1.1845396830671333, "learning_rate": 1.0502614871172736e-05, "loss": 1.3344, "step": 1819 }, { "epoch": 1.4736842105263157, "grad_norm": 1.2867321451496658, "learning_rate": 1.0494030221525582e-05, "loss": 1.4286, "step": 1820 }, { "epoch": 1.474493927125506, "grad_norm": 1.2064628750455095, "learning_rate": 1.0485445206890522e-05, "loss": 1.3356, "step": 1821 }, { "epoch": 1.4753036437246965, "grad_norm": 1.228960560736752, "learning_rate": 1.0476859833610142e-05, "loss": 1.4522, "step": 1822 }, { "epoch": 1.4761133603238865, "grad_norm": 1.2058195619871348, "learning_rate": 1.046827410802728e-05, "loss": 1.3681, "step": 1823 }, { "epoch": 1.476923076923077, "grad_norm": 1.1852192696221284, "learning_rate": 1.0459688036485044e-05, "loss": 1.3427, "step": 1824 }, { "epoch": 1.4777327935222673, "grad_norm": 1.20736083607171, "learning_rate": 1.0451101625326798e-05, "loss": 1.4147, "step": 1825 }, { "epoch": 1.4785425101214575, "grad_norm": 1.2516509317308389, "learning_rate": 1.0442514880896156e-05, "loss": 1.3319, "step": 1826 }, { "epoch": 1.4793522267206478, "grad_norm": 1.2212025089204959, "learning_rate": 1.043392780953697e-05, "loss": 1.34, "step": 1827 }, { "epoch": 1.480161943319838, "grad_norm": 1.241429196352147, "learning_rate": 1.0425340417593341e-05, "loss": 1.3743, "step": 1828 }, { "epoch": 1.4809716599190283, "grad_norm": 1.18843326826917, "learning_rate": 1.0416752711409612e-05, "loss": 1.3653, "step": 1829 }, { "epoch": 1.4817813765182186, "grad_norm": 1.218215122004799, "learning_rate": 1.0408164697330348e-05, "loss": 1.3632, "step": 1830 }, { "epoch": 1.4825910931174089, "grad_norm": 1.203207018787504, "learning_rate": 1.0399576381700346e-05, "loss": 1.3731, "step": 1831 }, { "epoch": 1.4834008097165992, "grad_norm": 1.2359647155470024, "learning_rate": 1.0390987770864623e-05, "loss": 1.363, "step": 1832 }, { "epoch": 1.4842105263157894, "grad_norm": 1.211501248267098, "learning_rate": 1.0382398871168421e-05, "loss": 1.3222, "step": 1833 }, { "epoch": 1.4850202429149797, "grad_norm": 1.1936705020590501, "learning_rate": 1.0373809688957192e-05, "loss": 1.3877, "step": 1834 }, { "epoch": 1.48582995951417, "grad_norm": 1.1937889310278138, "learning_rate": 1.0365220230576592e-05, "loss": 1.4284, "step": 1835 }, { "epoch": 1.4866396761133602, "grad_norm": 1.1808597278513449, "learning_rate": 1.035663050237248e-05, "loss": 1.3622, "step": 1836 }, { "epoch": 1.4874493927125507, "grad_norm": 1.1900999195170403, "learning_rate": 1.0348040510690929e-05, "loss": 1.4129, "step": 1837 }, { "epoch": 1.4882591093117408, "grad_norm": 1.2368390047722708, "learning_rate": 1.033945026187819e-05, "loss": 1.3577, "step": 1838 }, { "epoch": 1.4890688259109313, "grad_norm": 1.250086709756194, "learning_rate": 1.0330859762280712e-05, "loss": 1.3997, "step": 1839 }, { "epoch": 1.4898785425101215, "grad_norm": 1.2472030369660945, "learning_rate": 1.0322269018245128e-05, "loss": 1.3501, "step": 1840 }, { "epoch": 1.4906882591093118, "grad_norm": 1.2452205777512158, "learning_rate": 1.0313678036118253e-05, "loss": 1.3399, "step": 1841 }, { "epoch": 1.491497975708502, "grad_norm": 1.2034361473861883, "learning_rate": 1.0305086822247077e-05, "loss": 1.3746, "step": 1842 }, { "epoch": 1.4923076923076923, "grad_norm": 1.2952780810330986, "learning_rate": 1.0296495382978756e-05, "loss": 1.3704, "step": 1843 }, { "epoch": 1.4931174089068826, "grad_norm": 1.2614416983159662, "learning_rate": 1.0287903724660617e-05, "loss": 1.3932, "step": 1844 }, { "epoch": 1.4939271255060729, "grad_norm": 1.207803764074404, "learning_rate": 1.0279311853640157e-05, "loss": 1.3694, "step": 1845 }, { "epoch": 1.4947368421052631, "grad_norm": 1.181956708022318, "learning_rate": 1.0270719776265017e-05, "loss": 1.3917, "step": 1846 }, { "epoch": 1.4955465587044534, "grad_norm": 1.1444875887339165, "learning_rate": 1.0262127498882992e-05, "loss": 1.3259, "step": 1847 }, { "epoch": 1.4963562753036437, "grad_norm": 1.2022791631767018, "learning_rate": 1.0253535027842032e-05, "loss": 1.3427, "step": 1848 }, { "epoch": 1.497165991902834, "grad_norm": 1.2070972768865362, "learning_rate": 1.024494236949023e-05, "loss": 1.3986, "step": 1849 }, { "epoch": 1.4979757085020242, "grad_norm": 1.2147753961877357, "learning_rate": 1.0236349530175807e-05, "loss": 1.3675, "step": 1850 }, { "epoch": 1.4987854251012145, "grad_norm": 1.1591467128751487, "learning_rate": 1.0227756516247127e-05, "loss": 1.3841, "step": 1851 }, { "epoch": 1.499595141700405, "grad_norm": 1.2069756938729441, "learning_rate": 1.0219163334052682e-05, "loss": 1.3365, "step": 1852 }, { "epoch": 1.500404858299595, "grad_norm": 1.2440005880081049, "learning_rate": 1.0210569989941085e-05, "loss": 1.357, "step": 1853 }, { "epoch": 1.5012145748987855, "grad_norm": 1.2000119522275203, "learning_rate": 1.020197649026107e-05, "loss": 1.3938, "step": 1854 }, { "epoch": 1.5020242914979756, "grad_norm": 1.187811020715873, "learning_rate": 1.019338284136149e-05, "loss": 1.3288, "step": 1855 }, { "epoch": 1.502834008097166, "grad_norm": 1.1684805914260927, "learning_rate": 1.01847890495913e-05, "loss": 1.3791, "step": 1856 }, { "epoch": 1.5036437246963563, "grad_norm": 1.2362993183012971, "learning_rate": 1.0176195121299567e-05, "loss": 1.394, "step": 1857 }, { "epoch": 1.5044534412955466, "grad_norm": 1.2284232654390486, "learning_rate": 1.0167601062835459e-05, "loss": 1.3728, "step": 1858 }, { "epoch": 1.5052631578947369, "grad_norm": 1.2440453874825037, "learning_rate": 1.0159006880548237e-05, "loss": 1.3519, "step": 1859 }, { "epoch": 1.5060728744939271, "grad_norm": 1.14460238709053, "learning_rate": 1.015041258078725e-05, "loss": 1.365, "step": 1860 }, { "epoch": 1.5068825910931174, "grad_norm": 1.1391435364818712, "learning_rate": 1.0141818169901945e-05, "loss": 1.3191, "step": 1861 }, { "epoch": 1.5076923076923077, "grad_norm": 1.1847183299604456, "learning_rate": 1.013322365424184e-05, "loss": 1.4322, "step": 1862 }, { "epoch": 1.508502024291498, "grad_norm": 1.2242479719103367, "learning_rate": 1.012462904015654e-05, "loss": 1.3728, "step": 1863 }, { "epoch": 1.5093117408906882, "grad_norm": 1.2141368241612376, "learning_rate": 1.011603433399571e-05, "loss": 1.4276, "step": 1864 }, { "epoch": 1.5101214574898787, "grad_norm": 1.197406422654741, "learning_rate": 1.0107439542109097e-05, "loss": 1.387, "step": 1865 }, { "epoch": 1.5109311740890687, "grad_norm": 1.141149626390884, "learning_rate": 1.0098844670846504e-05, "loss": 1.3297, "step": 1866 }, { "epoch": 1.5117408906882592, "grad_norm": 1.2001491455611746, "learning_rate": 1.0090249726557795e-05, "loss": 1.2897, "step": 1867 }, { "epoch": 1.5125506072874493, "grad_norm": 1.1667785806675015, "learning_rate": 1.0081654715592881e-05, "loss": 1.3459, "step": 1868 }, { "epoch": 1.5133603238866398, "grad_norm": 1.1839167724233615, "learning_rate": 1.007305964430173e-05, "loss": 1.3219, "step": 1869 }, { "epoch": 1.5141700404858298, "grad_norm": 1.2144434763318714, "learning_rate": 1.0064464519034358e-05, "loss": 1.3816, "step": 1870 }, { "epoch": 1.5149797570850203, "grad_norm": 1.2135857961398115, "learning_rate": 1.005586934614081e-05, "loss": 1.4097, "step": 1871 }, { "epoch": 1.5157894736842106, "grad_norm": 1.2095532800796946, "learning_rate": 1.004727413197117e-05, "loss": 1.332, "step": 1872 }, { "epoch": 1.5165991902834008, "grad_norm": 1.2181508610257055, "learning_rate": 1.0038678882875557e-05, "loss": 1.3892, "step": 1873 }, { "epoch": 1.5174089068825911, "grad_norm": 1.2026433653521167, "learning_rate": 1.0030083605204115e-05, "loss": 1.4083, "step": 1874 }, { "epoch": 1.5182186234817814, "grad_norm": 1.1773517841295498, "learning_rate": 1.0021488305307003e-05, "loss": 1.3584, "step": 1875 }, { "epoch": 1.5190283400809717, "grad_norm": 1.231433388400886, "learning_rate": 1.00128929895344e-05, "loss": 1.374, "step": 1876 }, { "epoch": 1.519838056680162, "grad_norm": 1.2332502916196273, "learning_rate": 1.0004297664236502e-05, "loss": 1.4351, "step": 1877 }, { "epoch": 1.5206477732793522, "grad_norm": 1.213292815409757, "learning_rate": 9.9957023357635e-06, "loss": 1.2996, "step": 1878 }, { "epoch": 1.5214574898785425, "grad_norm": 1.2346861234405477, "learning_rate": 9.9871070104656e-06, "loss": 1.3471, "step": 1879 }, { "epoch": 1.522267206477733, "grad_norm": 1.254827265085309, "learning_rate": 9.978511694692999e-06, "loss": 1.3533, "step": 1880 }, { "epoch": 1.523076923076923, "grad_norm": 1.1917367114901438, "learning_rate": 9.969916394795888e-06, "loss": 1.3383, "step": 1881 }, { "epoch": 1.5238866396761135, "grad_norm": 1.180916076508306, "learning_rate": 9.961321117124444e-06, "loss": 1.3722, "step": 1882 }, { "epoch": 1.5246963562753035, "grad_norm": 1.2010789539898061, "learning_rate": 9.952725868028831e-06, "loss": 1.342, "step": 1883 }, { "epoch": 1.525506072874494, "grad_norm": 1.2029836828056097, "learning_rate": 9.944130653859195e-06, "loss": 1.3714, "step": 1884 }, { "epoch": 1.526315789473684, "grad_norm": 1.230235446023344, "learning_rate": 9.935535480965647e-06, "loss": 1.419, "step": 1885 }, { "epoch": 1.5271255060728746, "grad_norm": 1.2304103134503401, "learning_rate": 9.92694035569827e-06, "loss": 1.3459, "step": 1886 }, { "epoch": 1.5279352226720648, "grad_norm": 1.2003919942780323, "learning_rate": 9.918345284407122e-06, "loss": 1.4122, "step": 1887 }, { "epoch": 1.528744939271255, "grad_norm": 1.2639140447939796, "learning_rate": 9.909750273442208e-06, "loss": 1.3368, "step": 1888 }, { "epoch": 1.5295546558704454, "grad_norm": 1.2269119630065992, "learning_rate": 9.901155329153498e-06, "loss": 1.3554, "step": 1889 }, { "epoch": 1.5303643724696356, "grad_norm": 1.2657559146957076, "learning_rate": 9.892560457890907e-06, "loss": 1.4207, "step": 1890 }, { "epoch": 1.531174089068826, "grad_norm": 1.1993508806620081, "learning_rate": 9.883965666004293e-06, "loss": 1.3638, "step": 1891 }, { "epoch": 1.5319838056680162, "grad_norm": 1.219522089298675, "learning_rate": 9.875370959843465e-06, "loss": 1.3768, "step": 1892 }, { "epoch": 1.5327935222672064, "grad_norm": 1.2178201278785572, "learning_rate": 9.866776345758166e-06, "loss": 1.408, "step": 1893 }, { "epoch": 1.5336032388663967, "grad_norm": 1.1451898153111888, "learning_rate": 9.858181830098058e-06, "loss": 1.3561, "step": 1894 }, { "epoch": 1.5344129554655872, "grad_norm": 1.2469841070465375, "learning_rate": 9.849587419212751e-06, "loss": 1.4224, "step": 1895 }, { "epoch": 1.5352226720647772, "grad_norm": 1.1845691507885716, "learning_rate": 9.840993119451768e-06, "loss": 1.4053, "step": 1896 }, { "epoch": 1.5360323886639677, "grad_norm": 1.1330718380072349, "learning_rate": 9.832398937164545e-06, "loss": 1.3581, "step": 1897 }, { "epoch": 1.5368421052631578, "grad_norm": 1.1691776979372561, "learning_rate": 9.823804878700434e-06, "loss": 1.3208, "step": 1898 }, { "epoch": 1.5376518218623483, "grad_norm": 1.1868100114985074, "learning_rate": 9.815210950408703e-06, "loss": 1.351, "step": 1899 }, { "epoch": 1.5384615384615383, "grad_norm": 1.1980676569924105, "learning_rate": 9.806617158638515e-06, "loss": 1.3409, "step": 1900 }, { "epoch": 1.5392712550607288, "grad_norm": 1.2286369763296952, "learning_rate": 9.798023509738932e-06, "loss": 1.3736, "step": 1901 }, { "epoch": 1.5400809716599189, "grad_norm": 1.230286054787407, "learning_rate": 9.789430010058918e-06, "loss": 1.3594, "step": 1902 }, { "epoch": 1.5408906882591094, "grad_norm": 1.1833188959041652, "learning_rate": 9.78083666594732e-06, "loss": 1.3609, "step": 1903 }, { "epoch": 1.5417004048582996, "grad_norm": 1.225456208787112, "learning_rate": 9.772243483752876e-06, "loss": 1.3461, "step": 1904 }, { "epoch": 1.54251012145749, "grad_norm": 1.1594495050829507, "learning_rate": 9.763650469824198e-06, "loss": 1.3852, "step": 1905 }, { "epoch": 1.5433198380566802, "grad_norm": 1.2470244297864663, "learning_rate": 9.755057630509774e-06, "loss": 1.3146, "step": 1906 }, { "epoch": 1.5441295546558704, "grad_norm": 1.2036173468340592, "learning_rate": 9.746464972157971e-06, "loss": 1.3576, "step": 1907 }, { "epoch": 1.5449392712550607, "grad_norm": 1.235082778436893, "learning_rate": 9.737872501117013e-06, "loss": 1.3347, "step": 1908 }, { "epoch": 1.545748987854251, "grad_norm": 1.247100018489477, "learning_rate": 9.729280223734988e-06, "loss": 1.4114, "step": 1909 }, { "epoch": 1.5465587044534415, "grad_norm": 1.1457263494567647, "learning_rate": 9.720688146359843e-06, "loss": 1.386, "step": 1910 }, { "epoch": 1.5473684210526315, "grad_norm": 1.1729296197232764, "learning_rate": 9.712096275339381e-06, "loss": 1.3698, "step": 1911 }, { "epoch": 1.548178137651822, "grad_norm": 1.1426028010449, "learning_rate": 9.703504617021247e-06, "loss": 1.286, "step": 1912 }, { "epoch": 1.548987854251012, "grad_norm": 1.2004758472296264, "learning_rate": 9.694913177752927e-06, "loss": 1.4253, "step": 1913 }, { "epoch": 1.5497975708502025, "grad_norm": 1.2117453405073084, "learning_rate": 9.68632196388175e-06, "loss": 1.2984, "step": 1914 }, { "epoch": 1.5506072874493926, "grad_norm": 1.2033634956141823, "learning_rate": 9.677730981754875e-06, "loss": 1.2999, "step": 1915 }, { "epoch": 1.551417004048583, "grad_norm": 1.2564126564820501, "learning_rate": 9.669140237719292e-06, "loss": 1.3905, "step": 1916 }, { "epoch": 1.5522267206477731, "grad_norm": 1.2062084382081446, "learning_rate": 9.660549738121814e-06, "loss": 1.3715, "step": 1917 }, { "epoch": 1.5530364372469636, "grad_norm": 1.2829351534612063, "learning_rate": 9.651959489309073e-06, "loss": 1.3494, "step": 1918 }, { "epoch": 1.5538461538461539, "grad_norm": 1.2383174742670793, "learning_rate": 9.643369497627521e-06, "loss": 1.4225, "step": 1919 }, { "epoch": 1.5546558704453441, "grad_norm": 1.222803913479318, "learning_rate": 9.634779769423412e-06, "loss": 1.3943, "step": 1920 }, { "epoch": 1.5554655870445344, "grad_norm": 1.193502143064031, "learning_rate": 9.62619031104281e-06, "loss": 1.3762, "step": 1921 }, { "epoch": 1.5562753036437247, "grad_norm": 1.1907724889410738, "learning_rate": 9.61760112883158e-06, "loss": 1.3944, "step": 1922 }, { "epoch": 1.557085020242915, "grad_norm": 1.198345852396561, "learning_rate": 9.609012229135379e-06, "loss": 1.3854, "step": 1923 }, { "epoch": 1.5578947368421052, "grad_norm": 1.2182523045236915, "learning_rate": 9.600423618299659e-06, "loss": 1.367, "step": 1924 }, { "epoch": 1.5587044534412957, "grad_norm": 1.1805065924488671, "learning_rate": 9.591835302669657e-06, "loss": 1.3546, "step": 1925 }, { "epoch": 1.5595141700404858, "grad_norm": 1.2249464401299464, "learning_rate": 9.58324728859039e-06, "loss": 1.3609, "step": 1926 }, { "epoch": 1.5603238866396762, "grad_norm": 1.2243122967938407, "learning_rate": 9.57465958240666e-06, "loss": 1.3999, "step": 1927 }, { "epoch": 1.5611336032388663, "grad_norm": 1.2274491912534922, "learning_rate": 9.566072190463032e-06, "loss": 1.306, "step": 1928 }, { "epoch": 1.5619433198380568, "grad_norm": 1.2488797733856527, "learning_rate": 9.557485119103849e-06, "loss": 1.3739, "step": 1929 }, { "epoch": 1.5627530364372468, "grad_norm": 1.2239459163300153, "learning_rate": 9.548898374673205e-06, "loss": 1.3517, "step": 1930 }, { "epoch": 1.5635627530364373, "grad_norm": 1.2178846415184048, "learning_rate": 9.540311963514957e-06, "loss": 1.3731, "step": 1931 }, { "epoch": 1.5643724696356274, "grad_norm": 1.265558847290855, "learning_rate": 9.531725891972725e-06, "loss": 1.4236, "step": 1932 }, { "epoch": 1.5651821862348179, "grad_norm": 1.1681836474926013, "learning_rate": 9.523140166389864e-06, "loss": 1.4025, "step": 1933 }, { "epoch": 1.5659919028340081, "grad_norm": 1.2040234728629644, "learning_rate": 9.514554793109477e-06, "loss": 1.4339, "step": 1934 }, { "epoch": 1.5668016194331984, "grad_norm": 1.265710944730449, "learning_rate": 9.505969778474418e-06, "loss": 1.3561, "step": 1935 }, { "epoch": 1.5676113360323887, "grad_norm": 1.1846379371827758, "learning_rate": 9.497385128827266e-06, "loss": 1.3652, "step": 1936 }, { "epoch": 1.568421052631579, "grad_norm": 1.174180232851037, "learning_rate": 9.48880085051033e-06, "loss": 1.2807, "step": 1937 }, { "epoch": 1.5692307692307692, "grad_norm": 1.242422752714555, "learning_rate": 9.480216949865644e-06, "loss": 1.375, "step": 1938 }, { "epoch": 1.5700404858299595, "grad_norm": 1.239409495392598, "learning_rate": 9.471633433234972e-06, "loss": 1.3435, "step": 1939 }, { "epoch": 1.5708502024291497, "grad_norm": 1.2564868657700816, "learning_rate": 9.463050306959782e-06, "loss": 1.3503, "step": 1940 }, { "epoch": 1.57165991902834, "grad_norm": 1.2398521277680479, "learning_rate": 9.454467577381263e-06, "loss": 1.3266, "step": 1941 }, { "epoch": 1.5724696356275305, "grad_norm": 1.2135937983968192, "learning_rate": 9.445885250840301e-06, "loss": 1.3269, "step": 1942 }, { "epoch": 1.5732793522267206, "grad_norm": 1.1895520607669423, "learning_rate": 9.4373033336775e-06, "loss": 1.3121, "step": 1943 }, { "epoch": 1.574089068825911, "grad_norm": 1.1824970891733284, "learning_rate": 9.428721832233148e-06, "loss": 1.3569, "step": 1944 }, { "epoch": 1.574898785425101, "grad_norm": 1.2023256894687824, "learning_rate": 9.42014075284723e-06, "loss": 1.4145, "step": 1945 }, { "epoch": 1.5757085020242916, "grad_norm": 1.18100975780351, "learning_rate": 9.411560101859417e-06, "loss": 1.3616, "step": 1946 }, { "epoch": 1.5765182186234816, "grad_norm": 1.2357755167800077, "learning_rate": 9.402979885609071e-06, "loss": 1.3371, "step": 1947 }, { "epoch": 1.5773279352226721, "grad_norm": 1.2279972155267704, "learning_rate": 9.394400110435225e-06, "loss": 1.3994, "step": 1948 }, { "epoch": 1.5781376518218624, "grad_norm": 1.1914709750868986, "learning_rate": 9.385820782676584e-06, "loss": 1.3488, "step": 1949 }, { "epoch": 1.5789473684210527, "grad_norm": 1.1994067475194277, "learning_rate": 9.377241908671533e-06, "loss": 1.3631, "step": 1950 }, { "epoch": 1.579757085020243, "grad_norm": 1.1718520198213915, "learning_rate": 9.368663494758115e-06, "loss": 1.3229, "step": 1951 }, { "epoch": 1.5805668016194332, "grad_norm": 1.2111814984055784, "learning_rate": 9.360085547274036e-06, "loss": 1.3721, "step": 1952 }, { "epoch": 1.5813765182186235, "grad_norm": 1.2166854804513723, "learning_rate": 9.351508072556651e-06, "loss": 1.3382, "step": 1953 }, { "epoch": 1.5821862348178137, "grad_norm": 1.2328123213604463, "learning_rate": 9.342931076942973e-06, "loss": 1.4074, "step": 1954 }, { "epoch": 1.582995951417004, "grad_norm": 1.2394612442811164, "learning_rate": 9.334354566769658e-06, "loss": 1.3018, "step": 1955 }, { "epoch": 1.5838056680161943, "grad_norm": 1.2112567483732097, "learning_rate": 9.325778548373e-06, "loss": 1.2831, "step": 1956 }, { "epoch": 1.5846153846153848, "grad_norm": 1.1733648170105058, "learning_rate": 9.317203028088938e-06, "loss": 1.3761, "step": 1957 }, { "epoch": 1.5854251012145748, "grad_norm": 1.2203734468272454, "learning_rate": 9.308628012253032e-06, "loss": 1.3634, "step": 1958 }, { "epoch": 1.5862348178137653, "grad_norm": 1.189447117782888, "learning_rate": 9.300053507200487e-06, "loss": 1.4065, "step": 1959 }, { "epoch": 1.5870445344129553, "grad_norm": 1.2206492902377073, "learning_rate": 9.291479519266108e-06, "loss": 1.3927, "step": 1960 }, { "epoch": 1.5878542510121458, "grad_norm": 1.1783321735346974, "learning_rate": 9.282906054784333e-06, "loss": 1.3714, "step": 1961 }, { "epoch": 1.5886639676113359, "grad_norm": 1.165535947374695, "learning_rate": 9.274333120089211e-06, "loss": 1.3888, "step": 1962 }, { "epoch": 1.5894736842105264, "grad_norm": 1.2497678418850922, "learning_rate": 9.265760721514397e-06, "loss": 1.3666, "step": 1963 }, { "epoch": 1.5902834008097166, "grad_norm": 1.2311086453584528, "learning_rate": 9.257188865393148e-06, "loss": 1.411, "step": 1964 }, { "epoch": 1.591093117408907, "grad_norm": 1.1853289463831915, "learning_rate": 9.248617558058328e-06, "loss": 1.3884, "step": 1965 }, { "epoch": 1.5919028340080972, "grad_norm": 1.2149617440803817, "learning_rate": 9.240046805842383e-06, "loss": 1.3175, "step": 1966 }, { "epoch": 1.5927125506072874, "grad_norm": 1.2303091959416708, "learning_rate": 9.231476615077366e-06, "loss": 1.3443, "step": 1967 }, { "epoch": 1.5935222672064777, "grad_norm": 1.2403372778652388, "learning_rate": 9.2229069920949e-06, "loss": 1.3279, "step": 1968 }, { "epoch": 1.594331983805668, "grad_norm": 1.2729706893834039, "learning_rate": 9.214337943226199e-06, "loss": 1.4091, "step": 1969 }, { "epoch": 1.5951417004048583, "grad_norm": 1.1994129479779139, "learning_rate": 9.205769474802045e-06, "loss": 1.3433, "step": 1970 }, { "epoch": 1.5959514170040485, "grad_norm": 1.2304449200153609, "learning_rate": 9.19720159315279e-06, "loss": 1.2943, "step": 1971 }, { "epoch": 1.596761133603239, "grad_norm": 1.2186847444701223, "learning_rate": 9.188634304608366e-06, "loss": 1.3831, "step": 1972 }, { "epoch": 1.597570850202429, "grad_norm": 1.2840247109912253, "learning_rate": 9.180067615498251e-06, "loss": 1.3342, "step": 1973 }, { "epoch": 1.5983805668016196, "grad_norm": 1.183188023738025, "learning_rate": 9.171501532151486e-06, "loss": 1.3873, "step": 1974 }, { "epoch": 1.5991902834008096, "grad_norm": 1.1776750182121347, "learning_rate": 9.162936060896672e-06, "loss": 1.3365, "step": 1975 }, { "epoch": 1.6, "grad_norm": 1.328668412553802, "learning_rate": 9.154371208061943e-06, "loss": 1.3847, "step": 1976 }, { "epoch": 1.6008097165991901, "grad_norm": 1.2168461558396328, "learning_rate": 9.145806979974991e-06, "loss": 1.3555, "step": 1977 }, { "epoch": 1.6016194331983806, "grad_norm": 1.1622150592681906, "learning_rate": 9.137243382963039e-06, "loss": 1.3405, "step": 1978 }, { "epoch": 1.602429149797571, "grad_norm": 1.2436189606158277, "learning_rate": 9.128680423352839e-06, "loss": 1.3591, "step": 1979 }, { "epoch": 1.6032388663967612, "grad_norm": 1.2610631910478973, "learning_rate": 9.12011810747068e-06, "loss": 1.4099, "step": 1980 }, { "epoch": 1.6040485829959514, "grad_norm": 1.2026899264299413, "learning_rate": 9.111556441642375e-06, "loss": 1.3482, "step": 1981 }, { "epoch": 1.6048582995951417, "grad_norm": 1.2364975494939607, "learning_rate": 9.10299543219325e-06, "loss": 1.3467, "step": 1982 }, { "epoch": 1.605668016194332, "grad_norm": 1.2106651427399964, "learning_rate": 9.094435085448153e-06, "loss": 1.3102, "step": 1983 }, { "epoch": 1.6064777327935222, "grad_norm": 1.1715140130298451, "learning_rate": 9.085875407731444e-06, "loss": 1.3365, "step": 1984 }, { "epoch": 1.6072874493927125, "grad_norm": 1.1983281783955513, "learning_rate": 9.07731640536698e-06, "loss": 1.4136, "step": 1985 }, { "epoch": 1.6080971659919028, "grad_norm": 1.1502136649316144, "learning_rate": 9.068758084678126e-06, "loss": 1.3614, "step": 1986 }, { "epoch": 1.6089068825910933, "grad_norm": 1.213144325311362, "learning_rate": 9.060200451987741e-06, "loss": 1.4013, "step": 1987 }, { "epoch": 1.6097165991902833, "grad_norm": 1.2047368862136312, "learning_rate": 9.051643513618176e-06, "loss": 1.3126, "step": 1988 }, { "epoch": 1.6105263157894738, "grad_norm": 1.2612831728284535, "learning_rate": 9.043087275891266e-06, "loss": 1.3652, "step": 1989 }, { "epoch": 1.6113360323886639, "grad_norm": 1.1804234187245097, "learning_rate": 9.034531745128334e-06, "loss": 1.3749, "step": 1990 }, { "epoch": 1.6121457489878543, "grad_norm": 1.1934129239753541, "learning_rate": 9.025976927650176e-06, "loss": 1.3223, "step": 1991 }, { "epoch": 1.6129554655870444, "grad_norm": 1.2290844683292121, "learning_rate": 9.017422829777068e-06, "loss": 1.3693, "step": 1992 }, { "epoch": 1.6137651821862349, "grad_norm": 1.234955363945963, "learning_rate": 9.008869457828748e-06, "loss": 1.3732, "step": 1993 }, { "epoch": 1.6145748987854251, "grad_norm": 1.2343436283938842, "learning_rate": 9.000316818124412e-06, "loss": 1.3267, "step": 1994 }, { "epoch": 1.6153846153846154, "grad_norm": 1.1943281327624646, "learning_rate": 8.991764916982731e-06, "loss": 1.2832, "step": 1995 }, { "epoch": 1.6161943319838057, "grad_norm": 1.241814934842723, "learning_rate": 8.98321376072182e-06, "loss": 1.4049, "step": 1996 }, { "epoch": 1.617004048582996, "grad_norm": 1.2281313938106257, "learning_rate": 8.974663355659237e-06, "loss": 1.3636, "step": 1997 }, { "epoch": 1.6178137651821862, "grad_norm": 1.2025757332697389, "learning_rate": 8.966113708111998e-06, "loss": 1.2603, "step": 1998 }, { "epoch": 1.6186234817813765, "grad_norm": 1.209319926631317, "learning_rate": 8.957564824396561e-06, "loss": 1.3905, "step": 1999 }, { "epoch": 1.6194331983805668, "grad_norm": 1.2048829491205542, "learning_rate": 8.949016710828808e-06, "loss": 1.3835, "step": 2000 }, { "epoch": 1.620242914979757, "grad_norm": 1.1837603866283808, "learning_rate": 8.940469373724054e-06, "loss": 1.3579, "step": 2001 }, { "epoch": 1.6210526315789475, "grad_norm": 7.115639597248133, "learning_rate": 8.93192281939705e-06, "loss": 1.328, "step": 2002 }, { "epoch": 1.6218623481781376, "grad_norm": 1.225986024046339, "learning_rate": 8.923377054161959e-06, "loss": 1.3977, "step": 2003 }, { "epoch": 1.622672064777328, "grad_norm": 1.173818300527572, "learning_rate": 8.914832084332363e-06, "loss": 1.4135, "step": 2004 }, { "epoch": 1.623481781376518, "grad_norm": 1.2141976317329137, "learning_rate": 8.906287916221259e-06, "loss": 1.3668, "step": 2005 }, { "epoch": 1.6242914979757086, "grad_norm": 1.1459187530159702, "learning_rate": 8.897744556141047e-06, "loss": 1.3697, "step": 2006 }, { "epoch": 1.6251012145748986, "grad_norm": 1.1655094826381835, "learning_rate": 8.88920201040354e-06, "loss": 1.3506, "step": 2007 }, { "epoch": 1.6259109311740891, "grad_norm": 1.2237829104408862, "learning_rate": 8.880660285319941e-06, "loss": 1.4369, "step": 2008 }, { "epoch": 1.6267206477732794, "grad_norm": 1.2400129820732735, "learning_rate": 8.872119387200844e-06, "loss": 1.3921, "step": 2009 }, { "epoch": 1.6275303643724697, "grad_norm": 1.2312502428364493, "learning_rate": 8.863579322356242e-06, "loss": 1.2692, "step": 2010 }, { "epoch": 1.62834008097166, "grad_norm": 1.2294849309798999, "learning_rate": 8.855040097095504e-06, "loss": 1.3047, "step": 2011 }, { "epoch": 1.6291497975708502, "grad_norm": 1.224426204175276, "learning_rate": 8.846501717727378e-06, "loss": 1.4028, "step": 2012 }, { "epoch": 1.6299595141700405, "grad_norm": 1.2489783782355426, "learning_rate": 8.837964190559998e-06, "loss": 1.384, "step": 2013 }, { "epoch": 1.6307692307692307, "grad_norm": 1.2161261861703765, "learning_rate": 8.829427521900852e-06, "loss": 1.4003, "step": 2014 }, { "epoch": 1.631578947368421, "grad_norm": 1.1628457277298792, "learning_rate": 8.820891718056815e-06, "loss": 1.3563, "step": 2015 }, { "epoch": 1.6323886639676113, "grad_norm": 1.1999497209012282, "learning_rate": 8.8123567853341e-06, "loss": 1.3742, "step": 2016 }, { "epoch": 1.6331983805668018, "grad_norm": 1.1700076641674673, "learning_rate": 8.803822730038292e-06, "loss": 1.3734, "step": 2017 }, { "epoch": 1.6340080971659918, "grad_norm": 1.184177246603208, "learning_rate": 8.795289558474325e-06, "loss": 1.3199, "step": 2018 }, { "epoch": 1.6348178137651823, "grad_norm": 1.207218490736053, "learning_rate": 8.786757276946473e-06, "loss": 1.4515, "step": 2019 }, { "epoch": 1.6356275303643724, "grad_norm": 1.1821552126386663, "learning_rate": 8.77822589175836e-06, "loss": 1.383, "step": 2020 }, { "epoch": 1.6364372469635629, "grad_norm": 1.2274446623556885, "learning_rate": 8.769695409212946e-06, "loss": 1.3733, "step": 2021 }, { "epoch": 1.637246963562753, "grad_norm": 1.2480667270847892, "learning_rate": 8.76116583561252e-06, "loss": 1.4254, "step": 2022 }, { "epoch": 1.6380566801619434, "grad_norm": 1.2019082482183845, "learning_rate": 8.752637177258708e-06, "loss": 1.3923, "step": 2023 }, { "epoch": 1.6388663967611334, "grad_norm": 1.1636761907843682, "learning_rate": 8.744109440452455e-06, "loss": 1.3485, "step": 2024 }, { "epoch": 1.639676113360324, "grad_norm": 1.2715956411428166, "learning_rate": 8.73558263149402e-06, "loss": 1.2978, "step": 2025 }, { "epoch": 1.6404858299595142, "grad_norm": 1.2120052316388328, "learning_rate": 8.727056756682985e-06, "loss": 1.4335, "step": 2026 }, { "epoch": 1.6412955465587045, "grad_norm": 1.2616855428891294, "learning_rate": 8.718531822318236e-06, "loss": 1.3146, "step": 2027 }, { "epoch": 1.6421052631578947, "grad_norm": 1.2332469667194608, "learning_rate": 8.71000783469797e-06, "loss": 1.4207, "step": 2028 }, { "epoch": 1.642914979757085, "grad_norm": 1.184361570240323, "learning_rate": 8.701484800119678e-06, "loss": 1.4211, "step": 2029 }, { "epoch": 1.6437246963562753, "grad_norm": 1.1901404032712164, "learning_rate": 8.692962724880148e-06, "loss": 1.2767, "step": 2030 }, { "epoch": 1.6445344129554655, "grad_norm": 1.1932059161816782, "learning_rate": 8.684441615275465e-06, "loss": 1.3932, "step": 2031 }, { "epoch": 1.645344129554656, "grad_norm": 1.1891787691549471, "learning_rate": 8.675921477600996e-06, "loss": 1.3344, "step": 2032 }, { "epoch": 1.646153846153846, "grad_norm": 1.2379204694615744, "learning_rate": 8.667402318151394e-06, "loss": 1.3101, "step": 2033 }, { "epoch": 1.6469635627530366, "grad_norm": 1.1672160458877656, "learning_rate": 8.65888414322058e-06, "loss": 1.3827, "step": 2034 }, { "epoch": 1.6477732793522266, "grad_norm": 1.2157475649535694, "learning_rate": 8.650366959101757e-06, "loss": 1.3832, "step": 2035 }, { "epoch": 1.648582995951417, "grad_norm": 1.1812552928529416, "learning_rate": 8.641850772087392e-06, "loss": 1.3611, "step": 2036 }, { "epoch": 1.6493927125506072, "grad_norm": 1.2769305561908675, "learning_rate": 8.633335588469215e-06, "loss": 1.3163, "step": 2037 }, { "epoch": 1.6502024291497976, "grad_norm": 1.1910880418512533, "learning_rate": 8.62482141453821e-06, "loss": 1.3636, "step": 2038 }, { "epoch": 1.6510121457489877, "grad_norm": 1.2422148934633142, "learning_rate": 8.616308256584636e-06, "loss": 1.3399, "step": 2039 }, { "epoch": 1.6518218623481782, "grad_norm": 1.2039779168904876, "learning_rate": 8.607796120897978e-06, "loss": 1.3927, "step": 2040 }, { "epoch": 1.6526315789473685, "grad_norm": 1.2033223841568303, "learning_rate": 8.599285013766969e-06, "loss": 1.3786, "step": 2041 }, { "epoch": 1.6534412955465587, "grad_norm": 1.2064528480180141, "learning_rate": 8.590774941479594e-06, "loss": 1.3453, "step": 2042 }, { "epoch": 1.654251012145749, "grad_norm": 1.2144333343052642, "learning_rate": 8.582265910323063e-06, "loss": 1.3132, "step": 2043 }, { "epoch": 1.6550607287449393, "grad_norm": 1.2356297509544136, "learning_rate": 8.57375792658382e-06, "loss": 1.384, "step": 2044 }, { "epoch": 1.6558704453441295, "grad_norm": 1.2113533133382017, "learning_rate": 8.565250996547538e-06, "loss": 1.4371, "step": 2045 }, { "epoch": 1.6566801619433198, "grad_norm": 1.19928327777792, "learning_rate": 8.556745126499104e-06, "loss": 1.3767, "step": 2046 }, { "epoch": 1.6574898785425103, "grad_norm": 1.2044105767224726, "learning_rate": 8.548240322722634e-06, "loss": 1.3739, "step": 2047 }, { "epoch": 1.6582995951417003, "grad_norm": 1.2310691515165295, "learning_rate": 8.539736591501444e-06, "loss": 1.4491, "step": 2048 }, { "epoch": 1.6591093117408908, "grad_norm": 1.1954706292971082, "learning_rate": 8.531233939118064e-06, "loss": 1.3675, "step": 2049 }, { "epoch": 1.6599190283400809, "grad_norm": 1.1942959686441643, "learning_rate": 8.522732371854228e-06, "loss": 1.5045, "step": 2050 }, { "epoch": 1.6607287449392714, "grad_norm": 1.1935221793122746, "learning_rate": 8.514231895990862e-06, "loss": 1.3651, "step": 2051 }, { "epoch": 1.6615384615384614, "grad_norm": 1.1971712548588236, "learning_rate": 8.50573251780809e-06, "loss": 1.3726, "step": 2052 }, { "epoch": 1.662348178137652, "grad_norm": 1.189506111043996, "learning_rate": 8.497234243585229e-06, "loss": 1.4117, "step": 2053 }, { "epoch": 1.663157894736842, "grad_norm": 1.1995587258339253, "learning_rate": 8.488737079600767e-06, "loss": 1.3298, "step": 2054 }, { "epoch": 1.6639676113360324, "grad_norm": 1.1893431869171311, "learning_rate": 8.480241032132394e-06, "loss": 1.368, "step": 2055 }, { "epoch": 1.6647773279352227, "grad_norm": 1.2451970649626896, "learning_rate": 8.47174610745695e-06, "loss": 1.3723, "step": 2056 }, { "epoch": 1.665587044534413, "grad_norm": 1.19193248474202, "learning_rate": 8.463252311850466e-06, "loss": 1.3371, "step": 2057 }, { "epoch": 1.6663967611336032, "grad_norm": 1.2609189929314095, "learning_rate": 8.454759651588127e-06, "loss": 1.4365, "step": 2058 }, { "epoch": 1.6672064777327935, "grad_norm": 1.214748523646701, "learning_rate": 8.446268132944279e-06, "loss": 1.3793, "step": 2059 }, { "epoch": 1.6680161943319838, "grad_norm": 1.2916028366953496, "learning_rate": 8.437777762192434e-06, "loss": 1.4646, "step": 2060 }, { "epoch": 1.668825910931174, "grad_norm": 1.217799194319695, "learning_rate": 8.429288545605248e-06, "loss": 1.374, "step": 2061 }, { "epoch": 1.6696356275303643, "grad_norm": 1.1681254207450982, "learning_rate": 8.42080048945452e-06, "loss": 1.3769, "step": 2062 }, { "epoch": 1.6704453441295546, "grad_norm": 1.2364124010512334, "learning_rate": 8.412313600011209e-06, "loss": 1.3553, "step": 2063 }, { "epoch": 1.671255060728745, "grad_norm": 1.237450931928774, "learning_rate": 8.403827883545393e-06, "loss": 1.4733, "step": 2064 }, { "epoch": 1.6720647773279351, "grad_norm": 1.1760825158488026, "learning_rate": 8.395343346326295e-06, "loss": 1.3914, "step": 2065 }, { "epoch": 1.6728744939271256, "grad_norm": 1.2383834233283117, "learning_rate": 8.386859994622266e-06, "loss": 1.3251, "step": 2066 }, { "epoch": 1.6736842105263157, "grad_norm": 1.2192262734981085, "learning_rate": 8.378377834700769e-06, "loss": 1.3688, "step": 2067 }, { "epoch": 1.6744939271255062, "grad_norm": 1.1753730133070337, "learning_rate": 8.369896872828406e-06, "loss": 1.376, "step": 2068 }, { "epoch": 1.6753036437246962, "grad_norm": 1.1758350186139863, "learning_rate": 8.361417115270878e-06, "loss": 1.3457, "step": 2069 }, { "epoch": 1.6761133603238867, "grad_norm": 1.2192998551029626, "learning_rate": 8.352938568293e-06, "loss": 1.3951, "step": 2070 }, { "epoch": 1.676923076923077, "grad_norm": 1.2527451825200207, "learning_rate": 8.3444612381587e-06, "loss": 1.3647, "step": 2071 }, { "epoch": 1.6777327935222672, "grad_norm": 1.2447953070318596, "learning_rate": 8.335985131131002e-06, "loss": 1.4072, "step": 2072 }, { "epoch": 1.6785425101214575, "grad_norm": 1.2245743293643092, "learning_rate": 8.327510253472023e-06, "loss": 1.3841, "step": 2073 }, { "epoch": 1.6793522267206478, "grad_norm": 1.210563123337093, "learning_rate": 8.319036611442974e-06, "loss": 1.3674, "step": 2074 }, { "epoch": 1.680161943319838, "grad_norm": 1.215284281361853, "learning_rate": 8.310564211304159e-06, "loss": 1.3919, "step": 2075 }, { "epoch": 1.6809716599190283, "grad_norm": 1.1606361526638542, "learning_rate": 8.302093059314955e-06, "loss": 1.4128, "step": 2076 }, { "epoch": 1.6817813765182186, "grad_norm": 1.2280121763912009, "learning_rate": 8.293623161733819e-06, "loss": 1.3144, "step": 2077 }, { "epoch": 1.6825910931174088, "grad_norm": 1.1672110474120607, "learning_rate": 8.285154524818288e-06, "loss": 1.3403, "step": 2078 }, { "epoch": 1.6834008097165993, "grad_norm": 1.1959606638614337, "learning_rate": 8.27668715482496e-06, "loss": 1.4199, "step": 2079 }, { "epoch": 1.6842105263157894, "grad_norm": 1.178527090126989, "learning_rate": 8.268221058009506e-06, "loss": 1.331, "step": 2080 }, { "epoch": 1.6850202429149799, "grad_norm": 1.212039723837128, "learning_rate": 8.259756240626646e-06, "loss": 1.3688, "step": 2081 }, { "epoch": 1.68582995951417, "grad_norm": 1.2117727419701412, "learning_rate": 8.251292708930156e-06, "loss": 1.4021, "step": 2082 }, { "epoch": 1.6866396761133604, "grad_norm": 1.1328543998014728, "learning_rate": 8.242830469172873e-06, "loss": 1.2837, "step": 2083 }, { "epoch": 1.6874493927125505, "grad_norm": 1.174797996844683, "learning_rate": 8.234369527606667e-06, "loss": 1.3522, "step": 2084 }, { "epoch": 1.688259109311741, "grad_norm": 1.187195276450799, "learning_rate": 8.225909890482456e-06, "loss": 1.3685, "step": 2085 }, { "epoch": 1.6890688259109312, "grad_norm": 1.2247680029203445, "learning_rate": 8.217451564050185e-06, "loss": 1.3567, "step": 2086 }, { "epoch": 1.6898785425101215, "grad_norm": 1.1886609297985533, "learning_rate": 8.20899455455885e-06, "loss": 1.336, "step": 2087 }, { "epoch": 1.6906882591093118, "grad_norm": 1.209073662488914, "learning_rate": 8.200538868256455e-06, "loss": 1.3241, "step": 2088 }, { "epoch": 1.691497975708502, "grad_norm": 1.2521452085782394, "learning_rate": 8.192084511390033e-06, "loss": 1.3865, "step": 2089 }, { "epoch": 1.6923076923076923, "grad_norm": 1.1899332902396975, "learning_rate": 8.183631490205636e-06, "loss": 1.3589, "step": 2090 }, { "epoch": 1.6931174089068826, "grad_norm": 1.1695363453646483, "learning_rate": 8.17517981094833e-06, "loss": 1.3299, "step": 2091 }, { "epoch": 1.6939271255060728, "grad_norm": 1.1873531255692886, "learning_rate": 8.166729479862185e-06, "loss": 1.41, "step": 2092 }, { "epoch": 1.694736842105263, "grad_norm": 1.177164040688767, "learning_rate": 8.15828050319028e-06, "loss": 1.376, "step": 2093 }, { "epoch": 1.6955465587044536, "grad_norm": 1.2131772752466448, "learning_rate": 8.149832887174686e-06, "loss": 1.3092, "step": 2094 }, { "epoch": 1.6963562753036436, "grad_norm": 1.1863052171248314, "learning_rate": 8.141386638056482e-06, "loss": 1.3064, "step": 2095 }, { "epoch": 1.6971659919028341, "grad_norm": 1.2087525026441543, "learning_rate": 8.132941762075726e-06, "loss": 1.3384, "step": 2096 }, { "epoch": 1.6979757085020242, "grad_norm": 1.1909825383271502, "learning_rate": 8.124498265471462e-06, "loss": 1.3282, "step": 2097 }, { "epoch": 1.6987854251012147, "grad_norm": 1.2073382417531069, "learning_rate": 8.116056154481721e-06, "loss": 1.4204, "step": 2098 }, { "epoch": 1.6995951417004047, "grad_norm": 1.2336753723660032, "learning_rate": 8.107615435343506e-06, "loss": 1.4096, "step": 2099 }, { "epoch": 1.7004048582995952, "grad_norm": 1.2244965473022886, "learning_rate": 8.09917611429279e-06, "loss": 1.3345, "step": 2100 }, { "epoch": 1.7012145748987855, "grad_norm": 1.2365934665855882, "learning_rate": 8.090738197564519e-06, "loss": 1.3549, "step": 2101 }, { "epoch": 1.7020242914979757, "grad_norm": 1.2403759805495438, "learning_rate": 8.082301691392593e-06, "loss": 1.4056, "step": 2102 }, { "epoch": 1.702834008097166, "grad_norm": 1.23974335524669, "learning_rate": 8.073866602009883e-06, "loss": 1.314, "step": 2103 }, { "epoch": 1.7036437246963563, "grad_norm": 1.2444897063130003, "learning_rate": 8.0654329356482e-06, "loss": 1.3807, "step": 2104 }, { "epoch": 1.7044534412955465, "grad_norm": 1.1949535882055378, "learning_rate": 8.057000698538311e-06, "loss": 1.3949, "step": 2105 }, { "epoch": 1.7052631578947368, "grad_norm": 1.2104242593278358, "learning_rate": 8.048569896909925e-06, "loss": 1.2957, "step": 2106 }, { "epoch": 1.706072874493927, "grad_norm": 1.242882880007763, "learning_rate": 8.040140536991688e-06, "loss": 1.3838, "step": 2107 }, { "epoch": 1.7068825910931174, "grad_norm": 1.2414139288923256, "learning_rate": 8.031712625011186e-06, "loss": 1.3637, "step": 2108 }, { "epoch": 1.7076923076923078, "grad_norm": 1.1934131947311941, "learning_rate": 8.023286167194934e-06, "loss": 1.345, "step": 2109 }, { "epoch": 1.708502024291498, "grad_norm": 1.2339934690903747, "learning_rate": 8.014861169768362e-06, "loss": 1.43, "step": 2110 }, { "epoch": 1.7093117408906884, "grad_norm": 1.172849536919364, "learning_rate": 8.006437638955846e-06, "loss": 1.4032, "step": 2111 }, { "epoch": 1.7101214574898784, "grad_norm": 1.1971353964773175, "learning_rate": 7.99801558098065e-06, "loss": 1.3609, "step": 2112 }, { "epoch": 1.710931174089069, "grad_norm": 1.210793522412267, "learning_rate": 7.98959500206497e-06, "loss": 1.3799, "step": 2113 }, { "epoch": 1.711740890688259, "grad_norm": 1.1992770532754535, "learning_rate": 7.9811759084299e-06, "loss": 1.3861, "step": 2114 }, { "epoch": 1.7125506072874495, "grad_norm": 1.250496757926102, "learning_rate": 7.972758306295436e-06, "loss": 1.3585, "step": 2115 }, { "epoch": 1.7133603238866397, "grad_norm": 1.2143039563042084, "learning_rate": 7.964342201880478e-06, "loss": 1.3448, "step": 2116 }, { "epoch": 1.71417004048583, "grad_norm": 1.2503268692965592, "learning_rate": 7.955927601402817e-06, "loss": 1.342, "step": 2117 }, { "epoch": 1.7149797570850203, "grad_norm": 1.272542261752361, "learning_rate": 7.947514511079126e-06, "loss": 1.4221, "step": 2118 }, { "epoch": 1.7157894736842105, "grad_norm": 1.2723130848911237, "learning_rate": 7.939102937124975e-06, "loss": 1.4166, "step": 2119 }, { "epoch": 1.7165991902834008, "grad_norm": 1.2332492013002891, "learning_rate": 7.930692885754806e-06, "loss": 1.3435, "step": 2120 }, { "epoch": 1.717408906882591, "grad_norm": 1.2106787991881476, "learning_rate": 7.922284363181937e-06, "loss": 1.3128, "step": 2121 }, { "epoch": 1.7182186234817813, "grad_norm": 1.2133263111046222, "learning_rate": 7.913877375618555e-06, "loss": 1.3606, "step": 2122 }, { "epoch": 1.7190283400809716, "grad_norm": 1.2393512093973866, "learning_rate": 7.90547192927572e-06, "loss": 1.3813, "step": 2123 }, { "epoch": 1.719838056680162, "grad_norm": 1.18596615707014, "learning_rate": 7.897068030363341e-06, "loss": 1.3718, "step": 2124 }, { "epoch": 1.7206477732793521, "grad_norm": 1.1774499718505693, "learning_rate": 7.888665685090194e-06, "loss": 1.3439, "step": 2125 }, { "epoch": 1.7214574898785426, "grad_norm": 1.196454598660986, "learning_rate": 7.880264899663901e-06, "loss": 1.3454, "step": 2126 }, { "epoch": 1.7222672064777327, "grad_norm": 1.247233467944127, "learning_rate": 7.871865680290943e-06, "loss": 1.3977, "step": 2127 }, { "epoch": 1.7230769230769232, "grad_norm": 1.189652568647767, "learning_rate": 7.863468033176632e-06, "loss": 1.3581, "step": 2128 }, { "epoch": 1.7238866396761132, "grad_norm": 1.1938170625000661, "learning_rate": 7.855071964525115e-06, "loss": 1.3333, "step": 2129 }, { "epoch": 1.7246963562753037, "grad_norm": 1.1673683171310625, "learning_rate": 7.846677480539392e-06, "loss": 1.3601, "step": 2130 }, { "epoch": 1.725506072874494, "grad_norm": 1.1775636728244776, "learning_rate": 7.838284587421273e-06, "loss": 1.3889, "step": 2131 }, { "epoch": 1.7263157894736842, "grad_norm": 1.2347561246730585, "learning_rate": 7.829893291371399e-06, "loss": 1.3683, "step": 2132 }, { "epoch": 1.7271255060728745, "grad_norm": 1.1960868396656386, "learning_rate": 7.821503598589234e-06, "loss": 1.364, "step": 2133 }, { "epoch": 1.7279352226720648, "grad_norm": 1.1576414391616703, "learning_rate": 7.813115515273052e-06, "loss": 1.3586, "step": 2134 }, { "epoch": 1.728744939271255, "grad_norm": 1.2728426715326342, "learning_rate": 7.80472904761995e-06, "loss": 1.3417, "step": 2135 }, { "epoch": 1.7295546558704453, "grad_norm": 1.196854159805804, "learning_rate": 7.796344201825816e-06, "loss": 1.3683, "step": 2136 }, { "epoch": 1.7303643724696356, "grad_norm": 1.2227814698116768, "learning_rate": 7.787960984085346e-06, "loss": 1.3132, "step": 2137 }, { "epoch": 1.7311740890688259, "grad_norm": 1.1904404210188404, "learning_rate": 7.779579400592039e-06, "loss": 1.2808, "step": 2138 }, { "epoch": 1.7319838056680164, "grad_norm": 1.2071293931299591, "learning_rate": 7.771199457538177e-06, "loss": 1.3415, "step": 2139 }, { "epoch": 1.7327935222672064, "grad_norm": 1.2589835991554157, "learning_rate": 7.762821161114834e-06, "loss": 1.3671, "step": 2140 }, { "epoch": 1.733603238866397, "grad_norm": 1.2502098184841257, "learning_rate": 7.754444517511869e-06, "loss": 1.3771, "step": 2141 }, { "epoch": 1.734412955465587, "grad_norm": 1.194748364690635, "learning_rate": 7.746069532917918e-06, "loss": 1.3148, "step": 2142 }, { "epoch": 1.7352226720647774, "grad_norm": 1.2615364812188445, "learning_rate": 7.737696213520397e-06, "loss": 1.3587, "step": 2143 }, { "epoch": 1.7360323886639675, "grad_norm": 1.1791684259343356, "learning_rate": 7.72932456550548e-06, "loss": 1.335, "step": 2144 }, { "epoch": 1.736842105263158, "grad_norm": 1.2804937427129839, "learning_rate": 7.720954595058118e-06, "loss": 1.3563, "step": 2145 }, { "epoch": 1.737651821862348, "grad_norm": 1.2678895788470363, "learning_rate": 7.712586308362017e-06, "loss": 1.3297, "step": 2146 }, { "epoch": 1.7384615384615385, "grad_norm": 1.1864809688777394, "learning_rate": 7.704219711599637e-06, "loss": 1.3111, "step": 2147 }, { "epoch": 1.7392712550607288, "grad_norm": 1.1913833583807754, "learning_rate": 7.695854810952194e-06, "loss": 1.3823, "step": 2148 }, { "epoch": 1.740080971659919, "grad_norm": 1.2454501834374865, "learning_rate": 7.687491612599651e-06, "loss": 1.3384, "step": 2149 }, { "epoch": 1.7408906882591093, "grad_norm": 1.2311094048424676, "learning_rate": 7.679130122720704e-06, "loss": 1.4009, "step": 2150 }, { "epoch": 1.7417004048582996, "grad_norm": 1.2146362073312729, "learning_rate": 7.670770347492804e-06, "loss": 1.3641, "step": 2151 }, { "epoch": 1.7425101214574898, "grad_norm": 1.214516287695011, "learning_rate": 7.662412293092118e-06, "loss": 1.3468, "step": 2152 }, { "epoch": 1.7433198380566801, "grad_norm": 1.1764628400623898, "learning_rate": 7.654055965693556e-06, "loss": 1.3273, "step": 2153 }, { "epoch": 1.7441295546558706, "grad_norm": 1.1411724141436934, "learning_rate": 7.64570137147074e-06, "loss": 1.3301, "step": 2154 }, { "epoch": 1.7449392712550607, "grad_norm": 1.2122093274665457, "learning_rate": 7.637348516596016e-06, "loss": 1.3525, "step": 2155 }, { "epoch": 1.7457489878542511, "grad_norm": 1.2399842029700836, "learning_rate": 7.628997407240453e-06, "loss": 1.3807, "step": 2156 }, { "epoch": 1.7465587044534412, "grad_norm": 1.256148734557012, "learning_rate": 7.620648049573815e-06, "loss": 1.3519, "step": 2157 }, { "epoch": 1.7473684210526317, "grad_norm": 1.2286990308441597, "learning_rate": 7.61230044976458e-06, "loss": 1.3506, "step": 2158 }, { "epoch": 1.7481781376518217, "grad_norm": 1.2260430970223977, "learning_rate": 7.603954613979933e-06, "loss": 1.2832, "step": 2159 }, { "epoch": 1.7489878542510122, "grad_norm": 1.1988962090230735, "learning_rate": 7.59561054838575e-06, "loss": 1.307, "step": 2160 }, { "epoch": 1.7497975708502023, "grad_norm": 1.1876276282662488, "learning_rate": 7.587268259146596e-06, "loss": 1.3742, "step": 2161 }, { "epoch": 1.7506072874493928, "grad_norm": 1.327874554832877, "learning_rate": 7.578927752425727e-06, "loss": 1.3885, "step": 2162 }, { "epoch": 1.751417004048583, "grad_norm": 1.2617574575652093, "learning_rate": 7.570589034385083e-06, "loss": 1.4301, "step": 2163 }, { "epoch": 1.7522267206477733, "grad_norm": 1.1822211358693704, "learning_rate": 7.562252111185282e-06, "loss": 1.3268, "step": 2164 }, { "epoch": 1.7530364372469636, "grad_norm": 1.2264998801603384, "learning_rate": 7.5539169889856135e-06, "loss": 1.3632, "step": 2165 }, { "epoch": 1.7538461538461538, "grad_norm": 1.2308240591640838, "learning_rate": 7.545583673944038e-06, "loss": 1.3703, "step": 2166 }, { "epoch": 1.754655870445344, "grad_norm": 1.2034026004405005, "learning_rate": 7.537252172217185e-06, "loss": 1.4068, "step": 2167 }, { "epoch": 1.7554655870445344, "grad_norm": 1.2716169869380658, "learning_rate": 7.528922489960339e-06, "loss": 1.3546, "step": 2168 }, { "epoch": 1.7562753036437249, "grad_norm": 1.1963463371260916, "learning_rate": 7.52059463332744e-06, "loss": 1.3827, "step": 2169 }, { "epoch": 1.757085020242915, "grad_norm": 1.2214699387119217, "learning_rate": 7.512268608471083e-06, "loss": 1.3597, "step": 2170 }, { "epoch": 1.7578947368421054, "grad_norm": 1.1980661545941786, "learning_rate": 7.503944421542508e-06, "loss": 1.4244, "step": 2171 }, { "epoch": 1.7587044534412954, "grad_norm": 1.2346433817261153, "learning_rate": 7.495622078691597e-06, "loss": 1.3887, "step": 2172 }, { "epoch": 1.759514170040486, "grad_norm": 1.2316791370226217, "learning_rate": 7.487301586066866e-06, "loss": 1.4128, "step": 2173 }, { "epoch": 1.760323886639676, "grad_norm": 1.247303111408137, "learning_rate": 7.47898294981547e-06, "loss": 1.3464, "step": 2174 }, { "epoch": 1.7611336032388665, "grad_norm": 1.1858468665595168, "learning_rate": 7.470666176083193e-06, "loss": 1.3297, "step": 2175 }, { "epoch": 1.7619433198380565, "grad_norm": 1.17298641444367, "learning_rate": 7.462351271014438e-06, "loss": 1.3335, "step": 2176 }, { "epoch": 1.762753036437247, "grad_norm": 1.2071311078649347, "learning_rate": 7.454038240752228e-06, "loss": 1.3957, "step": 2177 }, { "epoch": 1.7635627530364373, "grad_norm": 1.2383361119848715, "learning_rate": 7.4457270914382056e-06, "loss": 1.3592, "step": 2178 }, { "epoch": 1.7643724696356275, "grad_norm": 1.2321882683113468, "learning_rate": 7.437417829212618e-06, "loss": 1.3527, "step": 2179 }, { "epoch": 1.7651821862348178, "grad_norm": 1.1470511569497224, "learning_rate": 7.42911046021432e-06, "loss": 1.3536, "step": 2180 }, { "epoch": 1.765991902834008, "grad_norm": 1.1704186219839523, "learning_rate": 7.420804990580772e-06, "loss": 1.3738, "step": 2181 }, { "epoch": 1.7668016194331984, "grad_norm": 1.155250015703002, "learning_rate": 7.4125014264480225e-06, "loss": 1.3488, "step": 2182 }, { "epoch": 1.7676113360323886, "grad_norm": 1.2361131817316744, "learning_rate": 7.404199773950724e-06, "loss": 1.3283, "step": 2183 }, { "epoch": 1.768421052631579, "grad_norm": 1.1572149457442449, "learning_rate": 7.395900039222108e-06, "loss": 1.4089, "step": 2184 }, { "epoch": 1.7692307692307692, "grad_norm": 1.2024459906835356, "learning_rate": 7.387602228393987e-06, "loss": 1.3428, "step": 2185 }, { "epoch": 1.7700404858299597, "grad_norm": 1.2444788254439085, "learning_rate": 7.379306347596762e-06, "loss": 1.3889, "step": 2186 }, { "epoch": 1.7708502024291497, "grad_norm": 1.2199415758978813, "learning_rate": 7.371012402959399e-06, "loss": 1.3438, "step": 2187 }, { "epoch": 1.7716599190283402, "grad_norm": 1.147217425822318, "learning_rate": 7.362720400609437e-06, "loss": 1.4056, "step": 2188 }, { "epoch": 1.7724696356275302, "grad_norm": 1.2527673146015181, "learning_rate": 7.354430346672983e-06, "loss": 1.3813, "step": 2189 }, { "epoch": 1.7732793522267207, "grad_norm": 1.261309500485428, "learning_rate": 7.346142247274695e-06, "loss": 1.4284, "step": 2190 }, { "epoch": 1.7740890688259108, "grad_norm": 1.2320074330261714, "learning_rate": 7.337856108537802e-06, "loss": 1.4159, "step": 2191 }, { "epoch": 1.7748987854251013, "grad_norm": 1.247234002362416, "learning_rate": 7.329571936584072e-06, "loss": 1.4277, "step": 2192 }, { "epoch": 1.7757085020242915, "grad_norm": 1.2396500425068102, "learning_rate": 7.321289737533826e-06, "loss": 1.2784, "step": 2193 }, { "epoch": 1.7765182186234818, "grad_norm": 1.221512450585911, "learning_rate": 7.313009517505923e-06, "loss": 1.4277, "step": 2194 }, { "epoch": 1.777327935222672, "grad_norm": 1.2207042254565934, "learning_rate": 7.304731282617762e-06, "loss": 1.3541, "step": 2195 }, { "epoch": 1.7781376518218623, "grad_norm": 1.179448361791702, "learning_rate": 7.29645503898528e-06, "loss": 1.3417, "step": 2196 }, { "epoch": 1.7789473684210526, "grad_norm": 1.210105556711578, "learning_rate": 7.288180792722934e-06, "loss": 1.3518, "step": 2197 }, { "epoch": 1.7797570850202429, "grad_norm": 1.2517556094115736, "learning_rate": 7.279908549943708e-06, "loss": 1.3949, "step": 2198 }, { "epoch": 1.7805668016194331, "grad_norm": 1.213422808224917, "learning_rate": 7.271638316759116e-06, "loss": 1.2904, "step": 2199 }, { "epoch": 1.7813765182186234, "grad_norm": 1.21215761739376, "learning_rate": 7.263370099279173e-06, "loss": 1.282, "step": 2200 }, { "epoch": 1.782186234817814, "grad_norm": 1.2357528688986366, "learning_rate": 7.255103903612413e-06, "loss": 1.3415, "step": 2201 }, { "epoch": 1.782995951417004, "grad_norm": 1.2102242154201936, "learning_rate": 7.246839735865874e-06, "loss": 1.3856, "step": 2202 }, { "epoch": 1.7838056680161944, "grad_norm": 1.2066789803427276, "learning_rate": 7.238577602145094e-06, "loss": 1.3408, "step": 2203 }, { "epoch": 1.7846153846153845, "grad_norm": 1.2002015685758425, "learning_rate": 7.230317508554113e-06, "loss": 1.3005, "step": 2204 }, { "epoch": 1.785425101214575, "grad_norm": 1.2371780286395402, "learning_rate": 7.2220594611954606e-06, "loss": 1.3464, "step": 2205 }, { "epoch": 1.786234817813765, "grad_norm": 1.2122162279977085, "learning_rate": 7.21380346617015e-06, "loss": 1.4097, "step": 2206 }, { "epoch": 1.7870445344129555, "grad_norm": 1.1890616495957214, "learning_rate": 7.20554952957769e-06, "loss": 1.3381, "step": 2207 }, { "epoch": 1.7878542510121458, "grad_norm": 1.2240916448009678, "learning_rate": 7.197297657516062e-06, "loss": 1.2961, "step": 2208 }, { "epoch": 1.788663967611336, "grad_norm": 1.2291351852794627, "learning_rate": 7.189047856081719e-06, "loss": 1.3057, "step": 2209 }, { "epoch": 1.7894736842105263, "grad_norm": 1.2303308564438198, "learning_rate": 7.1808001313695855e-06, "loss": 1.434, "step": 2210 }, { "epoch": 1.7902834008097166, "grad_norm": 1.2068008557972785, "learning_rate": 7.172554489473057e-06, "loss": 1.371, "step": 2211 }, { "epoch": 1.7910931174089069, "grad_norm": 1.236399538012193, "learning_rate": 7.164310936483986e-06, "loss": 1.3062, "step": 2212 }, { "epoch": 1.7919028340080971, "grad_norm": 1.1998903560290575, "learning_rate": 7.156069478492679e-06, "loss": 1.3007, "step": 2213 }, { "epoch": 1.7927125506072874, "grad_norm": 1.2525924525988954, "learning_rate": 7.1478301215878975e-06, "loss": 1.4231, "step": 2214 }, { "epoch": 1.7935222672064777, "grad_norm": 1.2758950545332266, "learning_rate": 7.1395928718568605e-06, "loss": 1.359, "step": 2215 }, { "epoch": 1.7943319838056682, "grad_norm": 1.2001272107207346, "learning_rate": 7.131357735385213e-06, "loss": 1.3625, "step": 2216 }, { "epoch": 1.7951417004048582, "grad_norm": 1.2200265505607486, "learning_rate": 7.123124718257052e-06, "loss": 1.3997, "step": 2217 }, { "epoch": 1.7959514170040487, "grad_norm": 1.2159603840435709, "learning_rate": 7.114893826554896e-06, "loss": 1.3696, "step": 2218 }, { "epoch": 1.7967611336032387, "grad_norm": 1.2133557545824332, "learning_rate": 7.106665066359708e-06, "loss": 1.4013, "step": 2219 }, { "epoch": 1.7975708502024292, "grad_norm": 1.2428462413099133, "learning_rate": 7.098438443750865e-06, "loss": 1.4124, "step": 2220 }, { "epoch": 1.7983805668016193, "grad_norm": 1.1962533108457247, "learning_rate": 7.0902139648061676e-06, "loss": 1.3797, "step": 2221 }, { "epoch": 1.7991902834008098, "grad_norm": 1.1949448871594797, "learning_rate": 7.081991635601835e-06, "loss": 1.3777, "step": 2222 }, { "epoch": 1.8, "grad_norm": 1.1739839756203938, "learning_rate": 7.073771462212502e-06, "loss": 1.3431, "step": 2223 }, { "epoch": 1.8008097165991903, "grad_norm": 1.2246364614759249, "learning_rate": 7.065553450711202e-06, "loss": 1.3858, "step": 2224 }, { "epoch": 1.8016194331983806, "grad_norm": 1.2541194418394814, "learning_rate": 7.057337607169373e-06, "loss": 1.4208, "step": 2225 }, { "epoch": 1.8024291497975709, "grad_norm": 1.1625873389342016, "learning_rate": 7.049123937656855e-06, "loss": 1.2865, "step": 2226 }, { "epoch": 1.8032388663967611, "grad_norm": 1.1789523515967617, "learning_rate": 7.040912448241881e-06, "loss": 1.3901, "step": 2227 }, { "epoch": 1.8040485829959514, "grad_norm": 1.1706741638509202, "learning_rate": 7.032703144991071e-06, "loss": 1.3657, "step": 2228 }, { "epoch": 1.8048582995951417, "grad_norm": 1.1921078298881833, "learning_rate": 7.024496033969432e-06, "loss": 1.3611, "step": 2229 }, { "epoch": 1.805668016194332, "grad_norm": 1.17097174113593, "learning_rate": 7.016291121240346e-06, "loss": 1.3633, "step": 2230 }, { "epoch": 1.8064777327935224, "grad_norm": 1.1852035894468138, "learning_rate": 7.0080884128655844e-06, "loss": 1.3807, "step": 2231 }, { "epoch": 1.8072874493927125, "grad_norm": 1.2725267927716697, "learning_rate": 6.999887914905275e-06, "loss": 1.4361, "step": 2232 }, { "epoch": 1.808097165991903, "grad_norm": 1.220459422861511, "learning_rate": 6.991689633417922e-06, "loss": 1.3571, "step": 2233 }, { "epoch": 1.808906882591093, "grad_norm": 1.1788740570003207, "learning_rate": 6.983493574460387e-06, "loss": 1.3274, "step": 2234 }, { "epoch": 1.8097165991902835, "grad_norm": 1.3416750880261363, "learning_rate": 6.975299744087891e-06, "loss": 1.3581, "step": 2235 }, { "epoch": 1.8105263157894735, "grad_norm": 1.225592957552821, "learning_rate": 6.967108148354012e-06, "loss": 1.4076, "step": 2236 }, { "epoch": 1.811336032388664, "grad_norm": 1.2485859196199363, "learning_rate": 6.958918793310669e-06, "loss": 1.434, "step": 2237 }, { "epoch": 1.8121457489878543, "grad_norm": 1.179048945702912, "learning_rate": 6.950731685008132e-06, "loss": 1.3373, "step": 2238 }, { "epoch": 1.8129554655870446, "grad_norm": 1.208305356954193, "learning_rate": 6.942546829495014e-06, "loss": 1.3677, "step": 2239 }, { "epoch": 1.8137651821862348, "grad_norm": 1.179294023466567, "learning_rate": 6.934364232818254e-06, "loss": 1.3397, "step": 2240 }, { "epoch": 1.814574898785425, "grad_norm": 1.238375865893255, "learning_rate": 6.926183901023134e-06, "loss": 1.3259, "step": 2241 }, { "epoch": 1.8153846153846154, "grad_norm": 1.2004335795147096, "learning_rate": 6.91800584015325e-06, "loss": 1.3741, "step": 2242 }, { "epoch": 1.8161943319838056, "grad_norm": 1.2101607717425258, "learning_rate": 6.909830056250527e-06, "loss": 1.3455, "step": 2243 }, { "epoch": 1.817004048582996, "grad_norm": 1.203030571859603, "learning_rate": 6.901656555355212e-06, "loss": 1.3705, "step": 2244 }, { "epoch": 1.8178137651821862, "grad_norm": 1.276448418257517, "learning_rate": 6.8934853435058566e-06, "loss": 1.3609, "step": 2245 }, { "epoch": 1.8186234817813767, "grad_norm": 1.1499994230561823, "learning_rate": 6.8853164267393234e-06, "loss": 1.3902, "step": 2246 }, { "epoch": 1.8194331983805667, "grad_norm": 1.1943909447731087, "learning_rate": 6.877149811090785e-06, "loss": 1.2936, "step": 2247 }, { "epoch": 1.8202429149797572, "grad_norm": 1.2276088320117058, "learning_rate": 6.8689855025937124e-06, "loss": 1.3459, "step": 2248 }, { "epoch": 1.8210526315789473, "grad_norm": 1.2131982314479368, "learning_rate": 6.860823507279868e-06, "loss": 1.3071, "step": 2249 }, { "epoch": 1.8218623481781377, "grad_norm": 1.2662064967702196, "learning_rate": 6.852663831179303e-06, "loss": 1.3959, "step": 2250 }, { "epoch": 1.8226720647773278, "grad_norm": 1.28182916895796, "learning_rate": 6.844506480320363e-06, "loss": 1.3637, "step": 2251 }, { "epoch": 1.8234817813765183, "grad_norm": 1.1875870250303597, "learning_rate": 6.836351460729673e-06, "loss": 1.354, "step": 2252 }, { "epoch": 1.8242914979757086, "grad_norm": 1.2135144065873424, "learning_rate": 6.828198778432131e-06, "loss": 1.3731, "step": 2253 }, { "epoch": 1.8251012145748988, "grad_norm": 1.190208517610891, "learning_rate": 6.820048439450913e-06, "loss": 1.3976, "step": 2254 }, { "epoch": 1.825910931174089, "grad_norm": 1.1613715093942938, "learning_rate": 6.811900449807465e-06, "loss": 1.3521, "step": 2255 }, { "epoch": 1.8267206477732794, "grad_norm": 1.1600854701348962, "learning_rate": 6.803754815521495e-06, "loss": 1.3905, "step": 2256 }, { "epoch": 1.8275303643724696, "grad_norm": 1.1683331040020393, "learning_rate": 6.7956115426109695e-06, "loss": 1.3801, "step": 2257 }, { "epoch": 1.82834008097166, "grad_norm": 1.1228286175250644, "learning_rate": 6.78747063709211e-06, "loss": 1.3634, "step": 2258 }, { "epoch": 1.8291497975708502, "grad_norm": 1.1651839094767948, "learning_rate": 6.779332104979394e-06, "loss": 1.3268, "step": 2259 }, { "epoch": 1.8299595141700404, "grad_norm": 1.1999740716361973, "learning_rate": 6.771195952285541e-06, "loss": 1.3391, "step": 2260 }, { "epoch": 1.830769230769231, "grad_norm": 1.1698993424325916, "learning_rate": 6.763062185021511e-06, "loss": 1.3434, "step": 2261 }, { "epoch": 1.831578947368421, "grad_norm": 1.197859926690675, "learning_rate": 6.754930809196507e-06, "loss": 1.3116, "step": 2262 }, { "epoch": 1.8323886639676115, "grad_norm": 1.2075936874086974, "learning_rate": 6.746801830817966e-06, "loss": 1.3516, "step": 2263 }, { "epoch": 1.8331983805668015, "grad_norm": 1.2280800864883632, "learning_rate": 6.738675255891548e-06, "loss": 1.3995, "step": 2264 }, { "epoch": 1.834008097165992, "grad_norm": 1.186966643343012, "learning_rate": 6.730551090421137e-06, "loss": 1.3301, "step": 2265 }, { "epoch": 1.834817813765182, "grad_norm": 1.1752482043139705, "learning_rate": 6.7224293404088445e-06, "loss": 1.288, "step": 2266 }, { "epoch": 1.8356275303643725, "grad_norm": 1.1737943908874262, "learning_rate": 6.714310011854989e-06, "loss": 1.379, "step": 2267 }, { "epoch": 1.8364372469635628, "grad_norm": 1.1881877312022122, "learning_rate": 6.7061931107581055e-06, "loss": 1.3928, "step": 2268 }, { "epoch": 1.837246963562753, "grad_norm": 1.2419072015459927, "learning_rate": 6.698078643114935e-06, "loss": 1.3845, "step": 2269 }, { "epoch": 1.8380566801619433, "grad_norm": 1.1961721466530393, "learning_rate": 6.689966614920414e-06, "loss": 1.3745, "step": 2270 }, { "epoch": 1.8388663967611336, "grad_norm": 1.151446689287607, "learning_rate": 6.681857032167689e-06, "loss": 1.3118, "step": 2271 }, { "epoch": 1.8396761133603239, "grad_norm": 1.196438525357998, "learning_rate": 6.673749900848092e-06, "loss": 1.3183, "step": 2272 }, { "epoch": 1.8404858299595142, "grad_norm": 1.2018986018410842, "learning_rate": 6.665645226951141e-06, "loss": 1.4007, "step": 2273 }, { "epoch": 1.8412955465587044, "grad_norm": 1.175635606943612, "learning_rate": 6.657543016464546e-06, "loss": 1.39, "step": 2274 }, { "epoch": 1.8421052631578947, "grad_norm": 1.2177369319411393, "learning_rate": 6.6494432753741935e-06, "loss": 1.3989, "step": 2275 }, { "epoch": 1.8429149797570852, "grad_norm": 1.15435997929181, "learning_rate": 6.641346009664142e-06, "loss": 1.3644, "step": 2276 }, { "epoch": 1.8437246963562752, "grad_norm": 1.1812983300442905, "learning_rate": 6.63325122531663e-06, "loss": 1.3, "step": 2277 }, { "epoch": 1.8445344129554657, "grad_norm": 1.1786903009596423, "learning_rate": 6.62515892831205e-06, "loss": 1.3513, "step": 2278 }, { "epoch": 1.8453441295546558, "grad_norm": 1.1533542073236585, "learning_rate": 6.6170691246289744e-06, "loss": 1.3916, "step": 2279 }, { "epoch": 1.8461538461538463, "grad_norm": 1.122771919065958, "learning_rate": 6.608981820244116e-06, "loss": 1.2917, "step": 2280 }, { "epoch": 1.8469635627530363, "grad_norm": 1.1565154319571267, "learning_rate": 6.600897021132353e-06, "loss": 1.3448, "step": 2281 }, { "epoch": 1.8477732793522268, "grad_norm": 1.1994419287429507, "learning_rate": 6.592814733266708e-06, "loss": 1.3973, "step": 2282 }, { "epoch": 1.8485829959514168, "grad_norm": 1.2145511142569878, "learning_rate": 6.5847349626183444e-06, "loss": 1.2976, "step": 2283 }, { "epoch": 1.8493927125506073, "grad_norm": 1.2470922439448142, "learning_rate": 6.576657715156576e-06, "loss": 1.4211, "step": 2284 }, { "epoch": 1.8502024291497976, "grad_norm": 1.1849987198146783, "learning_rate": 6.568582996848844e-06, "loss": 1.4217, "step": 2285 }, { "epoch": 1.8510121457489879, "grad_norm": 1.2246339182732113, "learning_rate": 6.560510813660719e-06, "loss": 1.347, "step": 2286 }, { "epoch": 1.8518218623481781, "grad_norm": 1.2511212401796088, "learning_rate": 6.5524411715559125e-06, "loss": 1.4248, "step": 2287 }, { "epoch": 1.8526315789473684, "grad_norm": 1.2304999920366795, "learning_rate": 6.544374076496243e-06, "loss": 1.3445, "step": 2288 }, { "epoch": 1.8534412955465587, "grad_norm": 1.2349915243981289, "learning_rate": 6.536309534441658e-06, "loss": 1.3398, "step": 2289 }, { "epoch": 1.854251012145749, "grad_norm": 1.1884282705027234, "learning_rate": 6.528247551350213e-06, "loss": 1.2971, "step": 2290 }, { "epoch": 1.8550607287449394, "grad_norm": 1.181801214740644, "learning_rate": 6.5201881331780725e-06, "loss": 1.3792, "step": 2291 }, { "epoch": 1.8558704453441295, "grad_norm": 1.2135112397667827, "learning_rate": 6.512131285879513e-06, "loss": 1.3898, "step": 2292 }, { "epoch": 1.85668016194332, "grad_norm": 1.2180427276548653, "learning_rate": 6.504077015406902e-06, "loss": 1.3809, "step": 2293 }, { "epoch": 1.85748987854251, "grad_norm": 1.1561446887739786, "learning_rate": 6.496025327710707e-06, "loss": 1.3113, "step": 2294 }, { "epoch": 1.8582995951417005, "grad_norm": 1.2247735536559143, "learning_rate": 6.487976228739493e-06, "loss": 1.3531, "step": 2295 }, { "epoch": 1.8591093117408906, "grad_norm": 1.2152586819122253, "learning_rate": 6.4799297244399085e-06, "loss": 1.3964, "step": 2296 }, { "epoch": 1.859919028340081, "grad_norm": 1.2454451424496096, "learning_rate": 6.471885820756683e-06, "loss": 1.3904, "step": 2297 }, { "epoch": 1.860728744939271, "grad_norm": 1.1655322922036064, "learning_rate": 6.463844523632622e-06, "loss": 1.3364, "step": 2298 }, { "epoch": 1.8615384615384616, "grad_norm": 1.2996377865771735, "learning_rate": 6.455805839008615e-06, "loss": 1.4075, "step": 2299 }, { "epoch": 1.8623481781376519, "grad_norm": 1.1964728670717037, "learning_rate": 6.4477697728236146e-06, "loss": 1.3517, "step": 2300 }, { "epoch": 1.8631578947368421, "grad_norm": 1.16799096604399, "learning_rate": 6.439736331014637e-06, "loss": 1.3809, "step": 2301 }, { "epoch": 1.8639676113360324, "grad_norm": 1.2252233979220486, "learning_rate": 6.431705519516763e-06, "loss": 1.3675, "step": 2302 }, { "epoch": 1.8647773279352227, "grad_norm": 1.2104500576735182, "learning_rate": 6.4236773442631325e-06, "loss": 1.3805, "step": 2303 }, { "epoch": 1.865587044534413, "grad_norm": 1.3193178213753143, "learning_rate": 6.415651811184935e-06, "loss": 1.3997, "step": 2304 }, { "epoch": 1.8663967611336032, "grad_norm": 1.2018055746906207, "learning_rate": 6.407628926211409e-06, "loss": 1.3964, "step": 2305 }, { "epoch": 1.8672064777327935, "grad_norm": 1.174760468858027, "learning_rate": 6.39960869526983e-06, "loss": 1.3464, "step": 2306 }, { "epoch": 1.8680161943319837, "grad_norm": 1.1810773457338066, "learning_rate": 6.391591124285524e-06, "loss": 1.2858, "step": 2307 }, { "epoch": 1.8688259109311742, "grad_norm": 1.2262009022694145, "learning_rate": 6.383576219181844e-06, "loss": 1.3559, "step": 2308 }, { "epoch": 1.8696356275303643, "grad_norm": 1.2773594939431656, "learning_rate": 6.375563985880174e-06, "loss": 1.3705, "step": 2309 }, { "epoch": 1.8704453441295548, "grad_norm": 1.2208108751520061, "learning_rate": 6.367554430299924e-06, "loss": 1.3333, "step": 2310 }, { "epoch": 1.8712550607287448, "grad_norm": 1.4160184181629654, "learning_rate": 6.3595475583585344e-06, "loss": 1.3816, "step": 2311 }, { "epoch": 1.8720647773279353, "grad_norm": 1.2094735305859114, "learning_rate": 6.351543375971453e-06, "loss": 1.3275, "step": 2312 }, { "epoch": 1.8728744939271254, "grad_norm": 1.2855800361668372, "learning_rate": 6.34354188905214e-06, "loss": 1.4212, "step": 2313 }, { "epoch": 1.8736842105263158, "grad_norm": 1.1929901078467973, "learning_rate": 6.335543103512072e-06, "loss": 1.368, "step": 2314 }, { "epoch": 1.874493927125506, "grad_norm": 1.1975185842016003, "learning_rate": 6.327547025260723e-06, "loss": 1.3372, "step": 2315 }, { "epoch": 1.8753036437246964, "grad_norm": 1.2685945017090825, "learning_rate": 6.319553660205569e-06, "loss": 1.3461, "step": 2316 }, { "epoch": 1.8761133603238866, "grad_norm": 1.1962632720706978, "learning_rate": 6.3115630142520835e-06, "loss": 1.404, "step": 2317 }, { "epoch": 1.876923076923077, "grad_norm": 1.198552614864501, "learning_rate": 6.303575093303725e-06, "loss": 1.4111, "step": 2318 }, { "epoch": 1.8777327935222672, "grad_norm": 1.1891573003084546, "learning_rate": 6.2955899032619515e-06, "loss": 1.3048, "step": 2319 }, { "epoch": 1.8785425101214575, "grad_norm": 1.1495642468155007, "learning_rate": 6.287607450026189e-06, "loss": 1.4159, "step": 2320 }, { "epoch": 1.8793522267206477, "grad_norm": 1.1707115053399164, "learning_rate": 6.27962773949385e-06, "loss": 1.4223, "step": 2321 }, { "epoch": 1.880161943319838, "grad_norm": 1.2148777682926812, "learning_rate": 6.271650777560318e-06, "loss": 1.3416, "step": 2322 }, { "epoch": 1.8809716599190285, "grad_norm": 1.222151610467891, "learning_rate": 6.263676570118948e-06, "loss": 1.3497, "step": 2323 }, { "epoch": 1.8817813765182185, "grad_norm": 1.1961853962501905, "learning_rate": 6.2557051230610534e-06, "loss": 1.3338, "step": 2324 }, { "epoch": 1.882591093117409, "grad_norm": 1.1926637799351172, "learning_rate": 6.247736442275918e-06, "loss": 1.4051, "step": 2325 }, { "epoch": 1.883400809716599, "grad_norm": 1.1918108744163147, "learning_rate": 6.239770533650771e-06, "loss": 1.398, "step": 2326 }, { "epoch": 1.8842105263157896, "grad_norm": 1.234775751606103, "learning_rate": 6.231807403070806e-06, "loss": 1.3671, "step": 2327 }, { "epoch": 1.8850202429149796, "grad_norm": 1.2099918714991094, "learning_rate": 6.223847056419154e-06, "loss": 1.3623, "step": 2328 }, { "epoch": 1.88582995951417, "grad_norm": 1.2291790207100255, "learning_rate": 6.215889499576898e-06, "loss": 1.3359, "step": 2329 }, { "epoch": 1.8866396761133604, "grad_norm": 1.2504678829644937, "learning_rate": 6.2079347384230505e-06, "loss": 1.3474, "step": 2330 }, { "epoch": 1.8874493927125506, "grad_norm": 1.1571650257183987, "learning_rate": 6.199982778834561e-06, "loss": 1.3704, "step": 2331 }, { "epoch": 1.888259109311741, "grad_norm": 1.192980883967285, "learning_rate": 6.192033626686316e-06, "loss": 1.3893, "step": 2332 }, { "epoch": 1.8890688259109312, "grad_norm": 1.2365704933508046, "learning_rate": 6.1840872878511215e-06, "loss": 1.3739, "step": 2333 }, { "epoch": 1.8898785425101214, "grad_norm": 1.2430427056052022, "learning_rate": 6.1761437681997e-06, "loss": 1.3773, "step": 2334 }, { "epoch": 1.8906882591093117, "grad_norm": 1.2140264228246762, "learning_rate": 6.168203073600706e-06, "loss": 1.3265, "step": 2335 }, { "epoch": 1.891497975708502, "grad_norm": 1.1949388259542468, "learning_rate": 6.160265209920698e-06, "loss": 1.4013, "step": 2336 }, { "epoch": 1.8923076923076922, "grad_norm": 1.2825641807955082, "learning_rate": 6.152330183024142e-06, "loss": 1.3582, "step": 2337 }, { "epoch": 1.8931174089068827, "grad_norm": 1.1900913534174808, "learning_rate": 6.1443979987734086e-06, "loss": 1.3772, "step": 2338 }, { "epoch": 1.8939271255060728, "grad_norm": 1.2005697884400928, "learning_rate": 6.1364686630287694e-06, "loss": 1.3625, "step": 2339 }, { "epoch": 1.8947368421052633, "grad_norm": 1.2630270064813116, "learning_rate": 6.128542181648395e-06, "loss": 1.3342, "step": 2340 }, { "epoch": 1.8955465587044533, "grad_norm": 1.2002800976397625, "learning_rate": 6.120618560488341e-06, "loss": 1.3267, "step": 2341 }, { "epoch": 1.8963562753036438, "grad_norm": 1.215442464121004, "learning_rate": 6.112697805402548e-06, "loss": 1.3688, "step": 2342 }, { "epoch": 1.8971659919028339, "grad_norm": 1.2288864318659574, "learning_rate": 6.104779922242851e-06, "loss": 1.3636, "step": 2343 }, { "epoch": 1.8979757085020244, "grad_norm": 1.194317008155087, "learning_rate": 6.096864916858957e-06, "loss": 1.32, "step": 2344 }, { "epoch": 1.8987854251012146, "grad_norm": 1.2124723409709166, "learning_rate": 6.088952795098442e-06, "loss": 1.3127, "step": 2345 }, { "epoch": 1.8995951417004049, "grad_norm": 1.188500019539976, "learning_rate": 6.081043562806754e-06, "loss": 1.388, "step": 2346 }, { "epoch": 1.9004048582995952, "grad_norm": 1.1552719513756389, "learning_rate": 6.073137225827213e-06, "loss": 1.3228, "step": 2347 }, { "epoch": 1.9012145748987854, "grad_norm": 1.2237471760776777, "learning_rate": 6.065233790000993e-06, "loss": 1.3489, "step": 2348 }, { "epoch": 1.9020242914979757, "grad_norm": 1.1792800865204056, "learning_rate": 6.057333261167122e-06, "loss": 1.3479, "step": 2349 }, { "epoch": 1.902834008097166, "grad_norm": 1.199559285857203, "learning_rate": 6.049435645162487e-06, "loss": 1.3519, "step": 2350 }, { "epoch": 1.9036437246963562, "grad_norm": 1.181957404617024, "learning_rate": 6.041540947821827e-06, "loss": 1.3371, "step": 2351 }, { "epoch": 1.9044534412955465, "grad_norm": 1.2505276548541542, "learning_rate": 6.0336491749777115e-06, "loss": 1.4061, "step": 2352 }, { "epoch": 1.905263157894737, "grad_norm": 1.195960955129131, "learning_rate": 6.025760332460558e-06, "loss": 1.3586, "step": 2353 }, { "epoch": 1.906072874493927, "grad_norm": 1.1707714015115926, "learning_rate": 6.01787442609862e-06, "loss": 1.3412, "step": 2354 }, { "epoch": 1.9068825910931175, "grad_norm": 1.140910772316072, "learning_rate": 6.009991461717977e-06, "loss": 1.3314, "step": 2355 }, { "epoch": 1.9076923076923076, "grad_norm": 1.23349460378788, "learning_rate": 6.002111445142533e-06, "loss": 1.3255, "step": 2356 }, { "epoch": 1.908502024291498, "grad_norm": 1.2462129342380746, "learning_rate": 5.994234382194026e-06, "loss": 1.3989, "step": 2357 }, { "epoch": 1.9093117408906881, "grad_norm": 1.2688670795842787, "learning_rate": 5.986360278691998e-06, "loss": 1.3718, "step": 2358 }, { "epoch": 1.9101214574898786, "grad_norm": 1.2095702791808156, "learning_rate": 5.978489140453817e-06, "loss": 1.3534, "step": 2359 }, { "epoch": 1.9109311740890689, "grad_norm": 1.1982546045786988, "learning_rate": 5.9706209732946495e-06, "loss": 1.3671, "step": 2360 }, { "epoch": 1.9117408906882591, "grad_norm": 1.1822667381374636, "learning_rate": 5.962755783027473e-06, "loss": 1.2912, "step": 2361 }, { "epoch": 1.9125506072874494, "grad_norm": 1.242784856625613, "learning_rate": 5.954893575463064e-06, "loss": 1.369, "step": 2362 }, { "epoch": 1.9133603238866397, "grad_norm": 1.2344835123108033, "learning_rate": 5.9470343564099975e-06, "loss": 1.3867, "step": 2363 }, { "epoch": 1.91417004048583, "grad_norm": 1.2420930503800902, "learning_rate": 5.939178131674633e-06, "loss": 1.3344, "step": 2364 }, { "epoch": 1.9149797570850202, "grad_norm": 1.2002007408254076, "learning_rate": 5.931324907061131e-06, "loss": 1.4089, "step": 2365 }, { "epoch": 1.9157894736842105, "grad_norm": 1.1431280050371697, "learning_rate": 5.92347468837142e-06, "loss": 1.3774, "step": 2366 }, { "epoch": 1.9165991902834008, "grad_norm": 1.2176788699652055, "learning_rate": 5.915627481405224e-06, "loss": 1.3993, "step": 2367 }, { "epoch": 1.9174089068825912, "grad_norm": 1.1570899200524465, "learning_rate": 5.907783291960027e-06, "loss": 1.2658, "step": 2368 }, { "epoch": 1.9182186234817813, "grad_norm": 1.2498879079115792, "learning_rate": 5.899942125831097e-06, "loss": 1.286, "step": 2369 }, { "epoch": 1.9190283400809718, "grad_norm": 1.2146345067146647, "learning_rate": 5.892103988811457e-06, "loss": 1.4007, "step": 2370 }, { "epoch": 1.9198380566801618, "grad_norm": 1.2400136674029751, "learning_rate": 5.884268886691898e-06, "loss": 1.3853, "step": 2371 }, { "epoch": 1.9206477732793523, "grad_norm": 1.2266043645344005, "learning_rate": 5.876436825260967e-06, "loss": 1.3606, "step": 2372 }, { "epoch": 1.9214574898785424, "grad_norm": 1.1901114955712122, "learning_rate": 5.868607810304967e-06, "loss": 1.4107, "step": 2373 }, { "epoch": 1.9222672064777329, "grad_norm": 1.2389492781589038, "learning_rate": 5.860781847607943e-06, "loss": 1.3883, "step": 2374 }, { "epoch": 1.9230769230769231, "grad_norm": 1.223439005294772, "learning_rate": 5.852958942951701e-06, "loss": 1.3636, "step": 2375 }, { "epoch": 1.9238866396761134, "grad_norm": 1.1896039830081746, "learning_rate": 5.845139102115769e-06, "loss": 1.3259, "step": 2376 }, { "epoch": 1.9246963562753037, "grad_norm": 1.2049235269890495, "learning_rate": 5.837322330877421e-06, "loss": 1.3567, "step": 2377 }, { "epoch": 1.925506072874494, "grad_norm": 1.2157467237161108, "learning_rate": 5.829508635011667e-06, "loss": 1.3588, "step": 2378 }, { "epoch": 1.9263157894736842, "grad_norm": 1.248188375178168, "learning_rate": 5.821698020291234e-06, "loss": 1.3572, "step": 2379 }, { "epoch": 1.9271255060728745, "grad_norm": 1.212346730632553, "learning_rate": 5.8138904924865766e-06, "loss": 1.3043, "step": 2380 }, { "epoch": 1.9279352226720647, "grad_norm": 1.1824421069183566, "learning_rate": 5.806086057365878e-06, "loss": 1.3943, "step": 2381 }, { "epoch": 1.928744939271255, "grad_norm": 1.1763429489984647, "learning_rate": 5.798284720695022e-06, "loss": 1.365, "step": 2382 }, { "epoch": 1.9295546558704455, "grad_norm": 1.204932841462832, "learning_rate": 5.790486488237619e-06, "loss": 1.3377, "step": 2383 }, { "epoch": 1.9303643724696355, "grad_norm": 1.2045578783510957, "learning_rate": 5.782691365754971e-06, "loss": 1.4084, "step": 2384 }, { "epoch": 1.931174089068826, "grad_norm": 1.2608711906516323, "learning_rate": 5.774899359006092e-06, "loss": 1.2661, "step": 2385 }, { "epoch": 1.931983805668016, "grad_norm": 1.1990033949590717, "learning_rate": 5.76711047374769e-06, "loss": 1.3528, "step": 2386 }, { "epoch": 1.9327935222672066, "grad_norm": 1.172948011576386, "learning_rate": 5.759324715734166e-06, "loss": 1.4189, "step": 2387 }, { "epoch": 1.9336032388663966, "grad_norm": 1.241926186312199, "learning_rate": 5.7515420907176105e-06, "loss": 1.4028, "step": 2388 }, { "epoch": 1.9344129554655871, "grad_norm": 1.1964066986025503, "learning_rate": 5.743762604447809e-06, "loss": 1.3317, "step": 2389 }, { "epoch": 1.9352226720647774, "grad_norm": 1.1945494693033618, "learning_rate": 5.735986262672211e-06, "loss": 1.341, "step": 2390 }, { "epoch": 1.9360323886639677, "grad_norm": 1.1755268375572105, "learning_rate": 5.728213071135962e-06, "loss": 1.3911, "step": 2391 }, { "epoch": 1.936842105263158, "grad_norm": 1.1804944891484381, "learning_rate": 5.720443035581867e-06, "loss": 1.3697, "step": 2392 }, { "epoch": 1.9376518218623482, "grad_norm": 1.2046317934373794, "learning_rate": 5.712676161750399e-06, "loss": 1.4039, "step": 2393 }, { "epoch": 1.9384615384615385, "grad_norm": 1.1637358644282172, "learning_rate": 5.704912455379703e-06, "loss": 1.363, "step": 2394 }, { "epoch": 1.9392712550607287, "grad_norm": 1.1974299223091525, "learning_rate": 5.697151922205575e-06, "loss": 1.3971, "step": 2395 }, { "epoch": 1.940080971659919, "grad_norm": 1.157981186836544, "learning_rate": 5.689394567961477e-06, "loss": 1.2967, "step": 2396 }, { "epoch": 1.9408906882591093, "grad_norm": 1.1670968402428008, "learning_rate": 5.681640398378514e-06, "loss": 1.3849, "step": 2397 }, { "epoch": 1.9417004048582998, "grad_norm": 1.1292919479938222, "learning_rate": 5.673889419185439e-06, "loss": 1.4253, "step": 2398 }, { "epoch": 1.9425101214574898, "grad_norm": 1.213702805085859, "learning_rate": 5.666141636108655e-06, "loss": 1.3198, "step": 2399 }, { "epoch": 1.9433198380566803, "grad_norm": 1.157847450719407, "learning_rate": 5.658397054872197e-06, "loss": 1.3175, "step": 2400 }, { "epoch": 1.9441295546558703, "grad_norm": 1.2106036490901373, "learning_rate": 5.650655681197734e-06, "loss": 1.4105, "step": 2401 }, { "epoch": 1.9449392712550608, "grad_norm": 1.219262613527522, "learning_rate": 5.642917520804569e-06, "loss": 1.3505, "step": 2402 }, { "epoch": 1.9457489878542509, "grad_norm": 1.1742978362533214, "learning_rate": 5.635182579409626e-06, "loss": 1.299, "step": 2403 }, { "epoch": 1.9465587044534414, "grad_norm": 1.2293963021779448, "learning_rate": 5.627450862727461e-06, "loss": 1.3391, "step": 2404 }, { "epoch": 1.9473684210526314, "grad_norm": 1.2034045616590177, "learning_rate": 5.619722376470238e-06, "loss": 1.3877, "step": 2405 }, { "epoch": 1.948178137651822, "grad_norm": 1.2272183799034553, "learning_rate": 5.611997126347732e-06, "loss": 1.3988, "step": 2406 }, { "epoch": 1.9489878542510122, "grad_norm": 1.2695110730414088, "learning_rate": 5.604275118067341e-06, "loss": 1.3653, "step": 2407 }, { "epoch": 1.9497975708502024, "grad_norm": 1.248909293580144, "learning_rate": 5.596556357334056e-06, "loss": 1.3562, "step": 2408 }, { "epoch": 1.9506072874493927, "grad_norm": 1.2131088122427793, "learning_rate": 5.588840849850472e-06, "loss": 1.3928, "step": 2409 }, { "epoch": 1.951417004048583, "grad_norm": 1.1900481644176077, "learning_rate": 5.581128601316774e-06, "loss": 1.3777, "step": 2410 }, { "epoch": 1.9522267206477733, "grad_norm": 1.2099498350109, "learning_rate": 5.573419617430758e-06, "loss": 1.3499, "step": 2411 }, { "epoch": 1.9530364372469635, "grad_norm": 1.217148269565451, "learning_rate": 5.565713903887788e-06, "loss": 1.2877, "step": 2412 }, { "epoch": 1.953846153846154, "grad_norm": 1.2490893741382743, "learning_rate": 5.558011466380824e-06, "loss": 1.3768, "step": 2413 }, { "epoch": 1.954655870445344, "grad_norm": 1.240461136852775, "learning_rate": 5.550312310600394e-06, "loss": 1.4229, "step": 2414 }, { "epoch": 1.9554655870445345, "grad_norm": 1.2295726427309241, "learning_rate": 5.542616442234618e-06, "loss": 1.3347, "step": 2415 }, { "epoch": 1.9562753036437246, "grad_norm": 1.2008815138181963, "learning_rate": 5.534923866969175e-06, "loss": 1.4176, "step": 2416 }, { "epoch": 1.957085020242915, "grad_norm": 1.1676560157055105, "learning_rate": 5.527234590487314e-06, "loss": 1.3314, "step": 2417 }, { "epoch": 1.9578947368421051, "grad_norm": 1.1701886277830924, "learning_rate": 5.5195486184698435e-06, "loss": 1.363, "step": 2418 }, { "epoch": 1.9587044534412956, "grad_norm": 1.1778230913774466, "learning_rate": 5.511865956595142e-06, "loss": 1.3683, "step": 2419 }, { "epoch": 1.9595141700404857, "grad_norm": 1.173338300441953, "learning_rate": 5.504186610539131e-06, "loss": 1.3393, "step": 2420 }, { "epoch": 1.9603238866396762, "grad_norm": 1.1767463019339799, "learning_rate": 5.496510585975285e-06, "loss": 1.3235, "step": 2421 }, { "epoch": 1.9611336032388664, "grad_norm": 1.2644216036741416, "learning_rate": 5.488837888574623e-06, "loss": 1.3373, "step": 2422 }, { "epoch": 1.9619433198380567, "grad_norm": 1.2103895583574469, "learning_rate": 5.4811685240057165e-06, "loss": 1.3983, "step": 2423 }, { "epoch": 1.962753036437247, "grad_norm": 1.1846356504436384, "learning_rate": 5.473502497934663e-06, "loss": 1.3294, "step": 2424 }, { "epoch": 1.9635627530364372, "grad_norm": 1.2235189988456545, "learning_rate": 5.465839816025093e-06, "loss": 1.3276, "step": 2425 }, { "epoch": 1.9643724696356275, "grad_norm": 1.2314816730763305, "learning_rate": 5.458180483938179e-06, "loss": 1.4498, "step": 2426 }, { "epoch": 1.9651821862348178, "grad_norm": 1.212596357115965, "learning_rate": 5.450524507332606e-06, "loss": 1.3656, "step": 2427 }, { "epoch": 1.965991902834008, "grad_norm": 1.1639800229825885, "learning_rate": 5.442871891864585e-06, "loss": 1.3776, "step": 2428 }, { "epoch": 1.9668016194331983, "grad_norm": 1.1367690723379495, "learning_rate": 5.435222643187843e-06, "loss": 1.3242, "step": 2429 }, { "epoch": 1.9676113360323888, "grad_norm": 1.2165886607110268, "learning_rate": 5.427576766953615e-06, "loss": 1.3591, "step": 2430 }, { "epoch": 1.9684210526315788, "grad_norm": 1.1790081897002724, "learning_rate": 5.419934268810659e-06, "loss": 1.3848, "step": 2431 }, { "epoch": 1.9692307692307693, "grad_norm": 1.2315181532137929, "learning_rate": 5.412295154405217e-06, "loss": 1.3431, "step": 2432 }, { "epoch": 1.9700404858299594, "grad_norm": 1.2728808844442372, "learning_rate": 5.4046594293810515e-06, "loss": 1.3639, "step": 2433 }, { "epoch": 1.9708502024291499, "grad_norm": 1.232337828546579, "learning_rate": 5.397027099379406e-06, "loss": 1.3697, "step": 2434 }, { "epoch": 1.97165991902834, "grad_norm": 1.243179350091489, "learning_rate": 5.3893981700390215e-06, "loss": 1.3252, "step": 2435 }, { "epoch": 1.9724696356275304, "grad_norm": 1.2371857762137026, "learning_rate": 5.381772646996128e-06, "loss": 1.3827, "step": 2436 }, { "epoch": 1.9732793522267207, "grad_norm": 1.288958547364644, "learning_rate": 5.374150535884433e-06, "loss": 1.3265, "step": 2437 }, { "epoch": 1.974089068825911, "grad_norm": 1.2475048960238355, "learning_rate": 5.3665318423351255e-06, "loss": 1.2822, "step": 2438 }, { "epoch": 1.9748987854251012, "grad_norm": 1.1986060107719656, "learning_rate": 5.358916571976878e-06, "loss": 1.3558, "step": 2439 }, { "epoch": 1.9757085020242915, "grad_norm": 1.1809389390768783, "learning_rate": 5.35130473043582e-06, "loss": 1.3428, "step": 2440 }, { "epoch": 1.9765182186234818, "grad_norm": 1.1854316658221693, "learning_rate": 5.343696323335564e-06, "loss": 1.4093, "step": 2441 }, { "epoch": 1.977327935222672, "grad_norm": 1.211156343447039, "learning_rate": 5.336091356297168e-06, "loss": 1.2688, "step": 2442 }, { "epoch": 1.9781376518218623, "grad_norm": 1.2978073337950151, "learning_rate": 5.328489834939162e-06, "loss": 1.3924, "step": 2443 }, { "epoch": 1.9789473684210526, "grad_norm": 1.1739628299787876, "learning_rate": 5.320891764877522e-06, "loss": 1.371, "step": 2444 }, { "epoch": 1.979757085020243, "grad_norm": 1.198921849736746, "learning_rate": 5.313297151725679e-06, "loss": 1.3149, "step": 2445 }, { "epoch": 1.980566801619433, "grad_norm": 1.2012656881257282, "learning_rate": 5.305706001094504e-06, "loss": 1.3979, "step": 2446 }, { "epoch": 1.9813765182186236, "grad_norm": 1.1827392393082905, "learning_rate": 5.298118318592316e-06, "loss": 1.3565, "step": 2447 }, { "epoch": 1.9821862348178136, "grad_norm": 1.2628168108725613, "learning_rate": 5.290534109824875e-06, "loss": 1.3705, "step": 2448 }, { "epoch": 1.9829959514170041, "grad_norm": 1.248756806447712, "learning_rate": 5.282953380395366e-06, "loss": 1.2367, "step": 2449 }, { "epoch": 1.9838056680161942, "grad_norm": 1.2171206230007487, "learning_rate": 5.275376135904408e-06, "loss": 1.3403, "step": 2450 }, { "epoch": 1.9846153846153847, "grad_norm": 1.289234216018497, "learning_rate": 5.267802381950042e-06, "loss": 1.3819, "step": 2451 }, { "epoch": 1.985425101214575, "grad_norm": 1.2241063424118177, "learning_rate": 5.260232124127734e-06, "loss": 1.3171, "step": 2452 }, { "epoch": 1.9862348178137652, "grad_norm": 1.2256066411153002, "learning_rate": 5.252665368030362e-06, "loss": 1.4032, "step": 2453 }, { "epoch": 1.9870445344129555, "grad_norm": 1.2295354974621036, "learning_rate": 5.245102119248227e-06, "loss": 1.3562, "step": 2454 }, { "epoch": 1.9878542510121457, "grad_norm": 1.207357620526114, "learning_rate": 5.2375423833690255e-06, "loss": 1.3962, "step": 2455 }, { "epoch": 1.988663967611336, "grad_norm": 1.2123767685021807, "learning_rate": 5.229986165977874e-06, "loss": 1.3524, "step": 2456 }, { "epoch": 1.9894736842105263, "grad_norm": 1.1985170410291037, "learning_rate": 5.222433472657276e-06, "loss": 1.342, "step": 2457 }, { "epoch": 1.9902834008097166, "grad_norm": 1.2549278916904512, "learning_rate": 5.214884308987136e-06, "loss": 1.2894, "step": 2458 }, { "epoch": 1.9910931174089068, "grad_norm": 1.2721385349454946, "learning_rate": 5.207338680544754e-06, "loss": 1.3614, "step": 2459 }, { "epoch": 1.9919028340080973, "grad_norm": 1.1730908013315933, "learning_rate": 5.1997965929048125e-06, "loss": 1.3449, "step": 2460 }, { "epoch": 1.9927125506072874, "grad_norm": 1.237418271403445, "learning_rate": 5.192258051639378e-06, "loss": 1.3128, "step": 2461 }, { "epoch": 1.9935222672064778, "grad_norm": 1.195751862323348, "learning_rate": 5.184723062317905e-06, "loss": 1.3297, "step": 2462 }, { "epoch": 1.994331983805668, "grad_norm": 1.1842709296523235, "learning_rate": 5.177191630507221e-06, "loss": 1.3214, "step": 2463 }, { "epoch": 1.9951417004048584, "grad_norm": 1.1803392844860812, "learning_rate": 5.169663761771522e-06, "loss": 1.415, "step": 2464 }, { "epoch": 1.9959514170040484, "grad_norm": 1.1998108340136762, "learning_rate": 5.1621394616723705e-06, "loss": 1.3449, "step": 2465 }, { "epoch": 1.996761133603239, "grad_norm": 1.2193208223683405, "learning_rate": 5.154618735768695e-06, "loss": 1.2948, "step": 2466 }, { "epoch": 1.9975708502024292, "grad_norm": 1.1939406883857941, "learning_rate": 5.147101589616783e-06, "loss": 1.3627, "step": 2467 }, { "epoch": 1.9983805668016195, "grad_norm": 1.209031327432166, "learning_rate": 5.139588028770275e-06, "loss": 1.3994, "step": 2468 }, { "epoch": 1.9991902834008097, "grad_norm": 1.169763485845793, "learning_rate": 5.13207805878017e-06, "loss": 1.3228, "step": 2469 }, { "epoch": 2.0, "grad_norm": 1.532603626298016, "learning_rate": 5.124571685194804e-06, "loss": 1.2018, "step": 2470 }, { "epoch": 2.0008097165991905, "grad_norm": 2.854383023719633, "learning_rate": 5.1170689135598675e-06, "loss": 1.0829, "step": 2471 }, { "epoch": 2.0016194331983805, "grad_norm": 3.050414009521738, "learning_rate": 5.10956974941838e-06, "loss": 1.0063, "step": 2472 }, { "epoch": 2.002429149797571, "grad_norm": 2.4422306774820672, "learning_rate": 5.102074198310701e-06, "loss": 1.0396, "step": 2473 }, { "epoch": 2.003238866396761, "grad_norm": 2.0451498041031044, "learning_rate": 5.094582265774515e-06, "loss": 0.9888, "step": 2474 }, { "epoch": 2.0040485829959516, "grad_norm": 4.046585838573169, "learning_rate": 5.087093957344841e-06, "loss": 1.0822, "step": 2475 }, { "epoch": 2.0048582995951416, "grad_norm": 4.926775146816468, "learning_rate": 5.079609278554011e-06, "loss": 1.0585, "step": 2476 }, { "epoch": 2.005668016194332, "grad_norm": 3.3365273107950393, "learning_rate": 5.07212823493169e-06, "loss": 0.9847, "step": 2477 }, { "epoch": 2.006477732793522, "grad_norm": 2.24516980725426, "learning_rate": 5.064650832004839e-06, "loss": 1.0298, "step": 2478 }, { "epoch": 2.0072874493927126, "grad_norm": 1.7417896088574096, "learning_rate": 5.057177075297748e-06, "loss": 0.9663, "step": 2479 }, { "epoch": 2.0080971659919027, "grad_norm": 2.031730299689621, "learning_rate": 5.049706970332e-06, "loss": 0.9755, "step": 2480 }, { "epoch": 2.008906882591093, "grad_norm": 1.9976555875126931, "learning_rate": 5.0422405226264825e-06, "loss": 0.9815, "step": 2481 }, { "epoch": 2.0097165991902832, "grad_norm": 1.9153896673542294, "learning_rate": 5.034777737697384e-06, "loss": 0.9825, "step": 2482 }, { "epoch": 2.0105263157894737, "grad_norm": 1.6395893752150272, "learning_rate": 5.027318621058182e-06, "loss": 1.0408, "step": 2483 }, { "epoch": 2.0113360323886638, "grad_norm": 1.6423936187066637, "learning_rate": 5.019863178219653e-06, "loss": 0.9731, "step": 2484 }, { "epoch": 2.0121457489878543, "grad_norm": 1.7371817317654348, "learning_rate": 5.0124114146898505e-06, "loss": 0.9819, "step": 2485 }, { "epoch": 2.0129554655870447, "grad_norm": 1.7846901106310733, "learning_rate": 5.004963335974112e-06, "loss": 1.0394, "step": 2486 }, { "epoch": 2.013765182186235, "grad_norm": 1.7793293478267915, "learning_rate": 4.997518947575058e-06, "loss": 1.0592, "step": 2487 }, { "epoch": 2.0145748987854253, "grad_norm": 1.6437092415729435, "learning_rate": 4.990078254992574e-06, "loss": 0.999, "step": 2488 }, { "epoch": 2.0153846153846153, "grad_norm": 1.6534586866603884, "learning_rate": 4.982641263723822e-06, "loss": 1.0597, "step": 2489 }, { "epoch": 2.016194331983806, "grad_norm": 1.5683644605257934, "learning_rate": 4.9752079792632244e-06, "loss": 1.0008, "step": 2490 }, { "epoch": 2.017004048582996, "grad_norm": 1.6067812383281972, "learning_rate": 4.967778407102466e-06, "loss": 0.9898, "step": 2491 }, { "epoch": 2.0178137651821864, "grad_norm": 1.6310900165582578, "learning_rate": 4.960352552730495e-06, "loss": 0.978, "step": 2492 }, { "epoch": 2.0186234817813764, "grad_norm": 1.6529602670760182, "learning_rate": 4.952930421633506e-06, "loss": 0.9264, "step": 2493 }, { "epoch": 2.019433198380567, "grad_norm": 1.7217510219463308, "learning_rate": 4.945512019294941e-06, "loss": 0.9976, "step": 2494 }, { "epoch": 2.020242914979757, "grad_norm": 1.7026003567072832, "learning_rate": 4.938097351195499e-06, "loss": 0.9611, "step": 2495 }, { "epoch": 2.0210526315789474, "grad_norm": 1.6872762681564812, "learning_rate": 4.9306864228131094e-06, "loss": 0.9497, "step": 2496 }, { "epoch": 2.0218623481781375, "grad_norm": 1.6295705608523692, "learning_rate": 4.92327923962294e-06, "loss": 1.054, "step": 2497 }, { "epoch": 2.022672064777328, "grad_norm": 1.6156871771493067, "learning_rate": 4.91587580709739e-06, "loss": 1.0382, "step": 2498 }, { "epoch": 2.023481781376518, "grad_norm": 1.667645461991409, "learning_rate": 4.9084761307061e-06, "loss": 1.0005, "step": 2499 }, { "epoch": 2.0242914979757085, "grad_norm": 1.6226089763668954, "learning_rate": 4.9010802159159224e-06, "loss": 0.9241, "step": 2500 }, { "epoch": 2.025101214574899, "grad_norm": 1.6532518189213758, "learning_rate": 4.893688068190933e-06, "loss": 1.0058, "step": 2501 }, { "epoch": 2.025910931174089, "grad_norm": 1.5598422496487256, "learning_rate": 4.886299692992425e-06, "loss": 0.9634, "step": 2502 }, { "epoch": 2.0267206477732795, "grad_norm": 1.6760718021088303, "learning_rate": 4.878915095778911e-06, "loss": 0.9279, "step": 2503 }, { "epoch": 2.0275303643724696, "grad_norm": 1.6857184074433873, "learning_rate": 4.871534282006105e-06, "loss": 1.033, "step": 2504 }, { "epoch": 2.02834008097166, "grad_norm": 1.6164129806079546, "learning_rate": 4.864157257126928e-06, "loss": 1.0426, "step": 2505 }, { "epoch": 2.02914979757085, "grad_norm": 1.6723447210528488, "learning_rate": 4.856784026591497e-06, "loss": 0.9716, "step": 2506 }, { "epoch": 2.0299595141700406, "grad_norm": 1.574208483958483, "learning_rate": 4.849414595847138e-06, "loss": 1.0813, "step": 2507 }, { "epoch": 2.0307692307692307, "grad_norm": 1.5924598069963187, "learning_rate": 4.84204897033836e-06, "loss": 1.0232, "step": 2508 }, { "epoch": 2.031578947368421, "grad_norm": 1.6310804523408304, "learning_rate": 4.834687155506861e-06, "loss": 0.9985, "step": 2509 }, { "epoch": 2.032388663967611, "grad_norm": 1.6173868253735737, "learning_rate": 4.8273291567915225e-06, "loss": 0.9707, "step": 2510 }, { "epoch": 2.0331983805668017, "grad_norm": 1.665833483770265, "learning_rate": 4.8199749796284175e-06, "loss": 0.9886, "step": 2511 }, { "epoch": 2.0340080971659917, "grad_norm": 1.5330111731342393, "learning_rate": 4.812624629450785e-06, "loss": 1.0049, "step": 2512 }, { "epoch": 2.0348178137651822, "grad_norm": 1.6868658667526049, "learning_rate": 4.805278111689035e-06, "loss": 0.9885, "step": 2513 }, { "epoch": 2.0356275303643723, "grad_norm": 1.7897259905366785, "learning_rate": 4.797935431770758e-06, "loss": 0.9826, "step": 2514 }, { "epoch": 2.0364372469635628, "grad_norm": 1.7488839728709102, "learning_rate": 4.790596595120699e-06, "loss": 1.0691, "step": 2515 }, { "epoch": 2.0372469635627533, "grad_norm": 1.6698774320169687, "learning_rate": 4.783261607160764e-06, "loss": 1.0223, "step": 2516 }, { "epoch": 2.0380566801619433, "grad_norm": 1.6944023574692242, "learning_rate": 4.775930473310021e-06, "loss": 0.9541, "step": 2517 }, { "epoch": 2.038866396761134, "grad_norm": 1.6030259296495724, "learning_rate": 4.768603198984683e-06, "loss": 0.9359, "step": 2518 }, { "epoch": 2.039676113360324, "grad_norm": 1.7509997817466008, "learning_rate": 4.761279789598122e-06, "loss": 0.9543, "step": 2519 }, { "epoch": 2.0404858299595143, "grad_norm": 1.6053230945872223, "learning_rate": 4.753960250560843e-06, "loss": 1.0359, "step": 2520 }, { "epoch": 2.0412955465587044, "grad_norm": 1.6470550257288128, "learning_rate": 4.746644587280505e-06, "loss": 1.0407, "step": 2521 }, { "epoch": 2.042105263157895, "grad_norm": 1.5960268921634935, "learning_rate": 4.739332805161892e-06, "loss": 1.0435, "step": 2522 }, { "epoch": 2.042914979757085, "grad_norm": 1.6787492821611882, "learning_rate": 4.732024909606923e-06, "loss": 0.918, "step": 2523 }, { "epoch": 2.0437246963562754, "grad_norm": 1.686860082454776, "learning_rate": 4.7247209060146495e-06, "loss": 1.0009, "step": 2524 }, { "epoch": 2.0445344129554655, "grad_norm": 1.8404325405304682, "learning_rate": 4.7174207997812436e-06, "loss": 0.9994, "step": 2525 }, { "epoch": 2.045344129554656, "grad_norm": 1.6172176781453194, "learning_rate": 4.710124596299998e-06, "loss": 1.0434, "step": 2526 }, { "epoch": 2.046153846153846, "grad_norm": 1.603579157288264, "learning_rate": 4.70283230096133e-06, "loss": 0.9923, "step": 2527 }, { "epoch": 2.0469635627530365, "grad_norm": 1.6474458591059657, "learning_rate": 4.6955439191527556e-06, "loss": 0.9799, "step": 2528 }, { "epoch": 2.0477732793522265, "grad_norm": 1.6208921660937512, "learning_rate": 4.688259456258916e-06, "loss": 1.0971, "step": 2529 }, { "epoch": 2.048582995951417, "grad_norm": 1.6056850691607139, "learning_rate": 4.680978917661544e-06, "loss": 0.9683, "step": 2530 }, { "epoch": 2.049392712550607, "grad_norm": 1.6643700134128743, "learning_rate": 4.673702308739478e-06, "loss": 1.0623, "step": 2531 }, { "epoch": 2.0502024291497976, "grad_norm": 1.6630243276082293, "learning_rate": 4.666429634868651e-06, "loss": 0.9393, "step": 2532 }, { "epoch": 2.051012145748988, "grad_norm": 1.6267769934323475, "learning_rate": 4.659160901422094e-06, "loss": 1.0042, "step": 2533 }, { "epoch": 2.051821862348178, "grad_norm": 1.798215794043288, "learning_rate": 4.651896113769917e-06, "loss": 1.0247, "step": 2534 }, { "epoch": 2.0526315789473686, "grad_norm": 1.5817141672309059, "learning_rate": 4.6446352772793256e-06, "loss": 1.045, "step": 2535 }, { "epoch": 2.0534412955465586, "grad_norm": 1.589804333120599, "learning_rate": 4.637378397314607e-06, "loss": 1.0288, "step": 2536 }, { "epoch": 2.054251012145749, "grad_norm": 1.6164730177481579, "learning_rate": 4.630125479237114e-06, "loss": 0.9591, "step": 2537 }, { "epoch": 2.055060728744939, "grad_norm": 1.5869372349513844, "learning_rate": 4.622876528405281e-06, "loss": 1.0036, "step": 2538 }, { "epoch": 2.0558704453441297, "grad_norm": 1.6778929953471509, "learning_rate": 4.615631550174609e-06, "loss": 1.0513, "step": 2539 }, { "epoch": 2.0566801619433197, "grad_norm": 1.5623260111430552, "learning_rate": 4.608390549897661e-06, "loss": 1.0336, "step": 2540 }, { "epoch": 2.05748987854251, "grad_norm": 1.6803766333569634, "learning_rate": 4.601153532924064e-06, "loss": 0.9903, "step": 2541 }, { "epoch": 2.0582995951417002, "grad_norm": 1.7893026888075112, "learning_rate": 4.593920504600508e-06, "loss": 0.9702, "step": 2542 }, { "epoch": 2.0591093117408907, "grad_norm": 1.7193621454208152, "learning_rate": 4.586691470270725e-06, "loss": 1.0157, "step": 2543 }, { "epoch": 2.059919028340081, "grad_norm": 1.660520678676988, "learning_rate": 4.579466435275506e-06, "loss": 0.9825, "step": 2544 }, { "epoch": 2.0607287449392713, "grad_norm": 1.654111044168718, "learning_rate": 4.5722454049526825e-06, "loss": 0.9855, "step": 2545 }, { "epoch": 2.0615384615384613, "grad_norm": 1.6536593350319584, "learning_rate": 4.565028384637127e-06, "loss": 0.9827, "step": 2546 }, { "epoch": 2.062348178137652, "grad_norm": 1.7206634958914158, "learning_rate": 4.557815379660749e-06, "loss": 0.9644, "step": 2547 }, { "epoch": 2.0631578947368423, "grad_norm": 1.6167239693996955, "learning_rate": 4.550606395352496e-06, "loss": 1.0163, "step": 2548 }, { "epoch": 2.0639676113360323, "grad_norm": 1.6089251662105508, "learning_rate": 4.543401437038335e-06, "loss": 0.9805, "step": 2549 }, { "epoch": 2.064777327935223, "grad_norm": 1.6783012771976722, "learning_rate": 4.536200510041271e-06, "loss": 1.0095, "step": 2550 }, { "epoch": 2.065587044534413, "grad_norm": 1.7505966956320587, "learning_rate": 4.5290036196813294e-06, "loss": 0.9746, "step": 2551 }, { "epoch": 2.0663967611336034, "grad_norm": 1.8266001199253628, "learning_rate": 4.521810771275543e-06, "loss": 1.0251, "step": 2552 }, { "epoch": 2.0672064777327934, "grad_norm": 1.6164027260208844, "learning_rate": 4.514621970137967e-06, "loss": 1.0155, "step": 2553 }, { "epoch": 2.068016194331984, "grad_norm": 1.7050380636821716, "learning_rate": 4.507437221579662e-06, "loss": 0.9616, "step": 2554 }, { "epoch": 2.068825910931174, "grad_norm": 1.7946850395276819, "learning_rate": 4.5002565309087e-06, "loss": 0.9502, "step": 2555 }, { "epoch": 2.0696356275303645, "grad_norm": 1.585579416581573, "learning_rate": 4.493079903430144e-06, "loss": 0.998, "step": 2556 }, { "epoch": 2.0704453441295545, "grad_norm": 1.6347584013839378, "learning_rate": 4.485907344446073e-06, "loss": 1.006, "step": 2557 }, { "epoch": 2.071255060728745, "grad_norm": 1.7338467691904824, "learning_rate": 4.478738859255542e-06, "loss": 1.0128, "step": 2558 }, { "epoch": 2.072064777327935, "grad_norm": 1.8039959794063494, "learning_rate": 4.4715744531546115e-06, "loss": 0.9555, "step": 2559 }, { "epoch": 2.0728744939271255, "grad_norm": 1.7310213842710942, "learning_rate": 4.4644141314363165e-06, "loss": 1.0433, "step": 2560 }, { "epoch": 2.0736842105263156, "grad_norm": 1.5892088009452698, "learning_rate": 4.45725789939068e-06, "loss": 0.9779, "step": 2561 }, { "epoch": 2.074493927125506, "grad_norm": 1.5328100810210468, "learning_rate": 4.450105762304703e-06, "loss": 1.057, "step": 2562 }, { "epoch": 2.0753036437246966, "grad_norm": 1.5920760677864, "learning_rate": 4.44295772546236e-06, "loss": 1.0849, "step": 2563 }, { "epoch": 2.0761133603238866, "grad_norm": 1.660029789485431, "learning_rate": 4.435813794144596e-06, "loss": 1.0217, "step": 2564 }, { "epoch": 2.076923076923077, "grad_norm": 1.665382546622789, "learning_rate": 4.4286739736293285e-06, "loss": 0.9562, "step": 2565 }, { "epoch": 2.077732793522267, "grad_norm": 1.6552078961930563, "learning_rate": 4.421538269191427e-06, "loss": 1.031, "step": 2566 }, { "epoch": 2.0785425101214576, "grad_norm": 1.6606164514071944, "learning_rate": 4.414406686102734e-06, "loss": 0.9781, "step": 2567 }, { "epoch": 2.0793522267206477, "grad_norm": 1.6124589826824705, "learning_rate": 4.407279229632034e-06, "loss": 1.0374, "step": 2568 }, { "epoch": 2.080161943319838, "grad_norm": 1.7193015635747628, "learning_rate": 4.400155905045073e-06, "loss": 1.0336, "step": 2569 }, { "epoch": 2.080971659919028, "grad_norm": 1.7315044687603365, "learning_rate": 4.393036717604536e-06, "loss": 0.9621, "step": 2570 }, { "epoch": 2.0817813765182187, "grad_norm": 1.7098596418635459, "learning_rate": 4.385921672570054e-06, "loss": 0.9662, "step": 2571 }, { "epoch": 2.0825910931174088, "grad_norm": 1.607788160578827, "learning_rate": 4.378810775198203e-06, "loss": 1.0082, "step": 2572 }, { "epoch": 2.0834008097165992, "grad_norm": 1.653291357704816, "learning_rate": 4.371704030742491e-06, "loss": 1.0356, "step": 2573 }, { "epoch": 2.0842105263157893, "grad_norm": 1.6881096442374148, "learning_rate": 4.36460144445335e-06, "loss": 1.0342, "step": 2574 }, { "epoch": 2.08502024291498, "grad_norm": 1.7125542227995731, "learning_rate": 4.357503021578158e-06, "loss": 0.9266, "step": 2575 }, { "epoch": 2.08582995951417, "grad_norm": 1.670269245177905, "learning_rate": 4.3504087673612e-06, "loss": 1.001, "step": 2576 }, { "epoch": 2.0866396761133603, "grad_norm": 1.5784965452737714, "learning_rate": 4.343318687043691e-06, "loss": 1.0012, "step": 2577 }, { "epoch": 2.087449392712551, "grad_norm": 1.6681995748372629, "learning_rate": 4.336232785863756e-06, "loss": 1.0095, "step": 2578 }, { "epoch": 2.088259109311741, "grad_norm": 1.748694757073851, "learning_rate": 4.329151069056432e-06, "loss": 0.9591, "step": 2579 }, { "epoch": 2.0890688259109313, "grad_norm": 1.7719387348638524, "learning_rate": 4.322073541853677e-06, "loss": 0.9696, "step": 2580 }, { "epoch": 2.0898785425101214, "grad_norm": 1.64114377111076, "learning_rate": 4.3150002094843415e-06, "loss": 0.9863, "step": 2581 }, { "epoch": 2.090688259109312, "grad_norm": 1.64738596574169, "learning_rate": 4.307931077174175e-06, "loss": 1.0393, "step": 2582 }, { "epoch": 2.091497975708502, "grad_norm": 1.691860211239468, "learning_rate": 4.300866150145837e-06, "loss": 0.9525, "step": 2583 }, { "epoch": 2.0923076923076924, "grad_norm": 1.576591959382383, "learning_rate": 4.293805433618869e-06, "loss": 1.0705, "step": 2584 }, { "epoch": 2.0931174089068825, "grad_norm": 1.5738372098947238, "learning_rate": 4.286748932809707e-06, "loss": 1.0264, "step": 2585 }, { "epoch": 2.093927125506073, "grad_norm": 1.6490322231201804, "learning_rate": 4.279696652931663e-06, "loss": 0.9917, "step": 2586 }, { "epoch": 2.094736842105263, "grad_norm": 1.8188564774664804, "learning_rate": 4.272648599194948e-06, "loss": 0.9347, "step": 2587 }, { "epoch": 2.0955465587044535, "grad_norm": 1.8581675482172395, "learning_rate": 4.265604776806638e-06, "loss": 0.9164, "step": 2588 }, { "epoch": 2.0963562753036435, "grad_norm": 1.8577584612627356, "learning_rate": 4.258565190970684e-06, "loss": 1.0061, "step": 2589 }, { "epoch": 2.097165991902834, "grad_norm": 1.743492463005408, "learning_rate": 4.2515298468879064e-06, "loss": 1.0067, "step": 2590 }, { "epoch": 2.097975708502024, "grad_norm": 1.646875858354436, "learning_rate": 4.244498749756e-06, "loss": 1.0357, "step": 2591 }, { "epoch": 2.0987854251012146, "grad_norm": 1.7035172799966736, "learning_rate": 4.237471904769514e-06, "loss": 1.0574, "step": 2592 }, { "epoch": 2.099595141700405, "grad_norm": 1.729516687109549, "learning_rate": 4.2304493171198605e-06, "loss": 1.0129, "step": 2593 }, { "epoch": 2.100404858299595, "grad_norm": 1.8017575479884655, "learning_rate": 4.223430991995296e-06, "loss": 1.0019, "step": 2594 }, { "epoch": 2.1012145748987856, "grad_norm": 1.6594942547323506, "learning_rate": 4.216416934580947e-06, "loss": 0.9952, "step": 2595 }, { "epoch": 2.1020242914979756, "grad_norm": 1.7239899018632698, "learning_rate": 4.2094071500587695e-06, "loss": 0.9851, "step": 2596 }, { "epoch": 2.102834008097166, "grad_norm": 1.703904052950475, "learning_rate": 4.202401643607572e-06, "loss": 1.0284, "step": 2597 }, { "epoch": 2.103643724696356, "grad_norm": 1.8593640278653734, "learning_rate": 4.1954004204029945e-06, "loss": 0.9578, "step": 2598 }, { "epoch": 2.1044534412955467, "grad_norm": 1.856774903804991, "learning_rate": 4.188403485617526e-06, "loss": 1.0438, "step": 2599 }, { "epoch": 2.1052631578947367, "grad_norm": 1.78449027364225, "learning_rate": 4.181410844420473e-06, "loss": 0.9355, "step": 2600 }, { "epoch": 2.106072874493927, "grad_norm": 1.6445451971076204, "learning_rate": 4.174422501977976e-06, "loss": 1.0028, "step": 2601 }, { "epoch": 2.1068825910931173, "grad_norm": 1.6462257848218957, "learning_rate": 4.167438463453003e-06, "loss": 0.9803, "step": 2602 }, { "epoch": 2.1076923076923078, "grad_norm": 1.6907368674533254, "learning_rate": 4.160458734005337e-06, "loss": 1.0294, "step": 2603 }, { "epoch": 2.108502024291498, "grad_norm": 1.6636161911260379, "learning_rate": 4.153483318791579e-06, "loss": 0.9813, "step": 2604 }, { "epoch": 2.1093117408906883, "grad_norm": 1.7183152187153377, "learning_rate": 4.146512222965144e-06, "loss": 0.9823, "step": 2605 }, { "epoch": 2.1101214574898783, "grad_norm": 1.6814396856451228, "learning_rate": 4.139545451676248e-06, "loss": 1.032, "step": 2606 }, { "epoch": 2.110931174089069, "grad_norm": 1.7645866557280991, "learning_rate": 4.1325830100719275e-06, "loss": 1.0356, "step": 2607 }, { "epoch": 2.1117408906882593, "grad_norm": 1.6961679753232464, "learning_rate": 4.125624903296009e-06, "loss": 1.0811, "step": 2608 }, { "epoch": 2.1125506072874494, "grad_norm": 1.6525910699956303, "learning_rate": 4.118671136489115e-06, "loss": 0.9451, "step": 2609 }, { "epoch": 2.11336032388664, "grad_norm": 1.6360702685596986, "learning_rate": 4.111721714788671e-06, "loss": 0.9375, "step": 2610 }, { "epoch": 2.11417004048583, "grad_norm": 1.6179640067775025, "learning_rate": 4.104776643328886e-06, "loss": 1.0224, "step": 2611 }, { "epoch": 2.1149797570850204, "grad_norm": 1.6525238056683458, "learning_rate": 4.097835927240753e-06, "loss": 1.0351, "step": 2612 }, { "epoch": 2.1157894736842104, "grad_norm": 1.585412724307839, "learning_rate": 4.090899571652053e-06, "loss": 0.9931, "step": 2613 }, { "epoch": 2.116599190283401, "grad_norm": 1.646625538543497, "learning_rate": 4.083967581687338e-06, "loss": 0.9385, "step": 2614 }, { "epoch": 2.117408906882591, "grad_norm": 1.6984647506858865, "learning_rate": 4.077039962467946e-06, "loss": 0.9629, "step": 2615 }, { "epoch": 2.1182186234817815, "grad_norm": 1.6719455263043292, "learning_rate": 4.070116719111973e-06, "loss": 1.0239, "step": 2616 }, { "epoch": 2.1190283400809715, "grad_norm": 1.7112777230654765, "learning_rate": 4.063197856734295e-06, "loss": 0.979, "step": 2617 }, { "epoch": 2.119838056680162, "grad_norm": 1.6987541653351368, "learning_rate": 4.056283380446542e-06, "loss": 1.0153, "step": 2618 }, { "epoch": 2.120647773279352, "grad_norm": 1.6871431309146279, "learning_rate": 4.049373295357105e-06, "loss": 1.0376, "step": 2619 }, { "epoch": 2.1214574898785425, "grad_norm": 1.6903927451402816, "learning_rate": 4.042467606571134e-06, "loss": 0.9581, "step": 2620 }, { "epoch": 2.1222672064777326, "grad_norm": 1.6637711833229312, "learning_rate": 4.0355663191905285e-06, "loss": 1.0201, "step": 2621 }, { "epoch": 2.123076923076923, "grad_norm": 1.7111155440518553, "learning_rate": 4.028669438313933e-06, "loss": 0.9114, "step": 2622 }, { "epoch": 2.1238866396761136, "grad_norm": 1.7277429546609078, "learning_rate": 4.0217769690367426e-06, "loss": 0.9072, "step": 2623 }, { "epoch": 2.1246963562753036, "grad_norm": 1.6772060985996178, "learning_rate": 4.014888916451097e-06, "loss": 0.9913, "step": 2624 }, { "epoch": 2.125506072874494, "grad_norm": 1.6078641618211225, "learning_rate": 4.008005285645863e-06, "loss": 1.069, "step": 2625 }, { "epoch": 2.126315789473684, "grad_norm": 1.620520443252991, "learning_rate": 4.001126081706643e-06, "loss": 0.9733, "step": 2626 }, { "epoch": 2.1271255060728746, "grad_norm": 1.6078988933775202, "learning_rate": 3.994251309715772e-06, "loss": 1.0605, "step": 2627 }, { "epoch": 2.1279352226720647, "grad_norm": 1.6029687010632365, "learning_rate": 3.9873809747523075e-06, "loss": 0.9842, "step": 2628 }, { "epoch": 2.128744939271255, "grad_norm": 1.705951335796002, "learning_rate": 3.98051508189203e-06, "loss": 1.0009, "step": 2629 }, { "epoch": 2.1295546558704452, "grad_norm": 1.6635774008101378, "learning_rate": 3.973653636207437e-06, "loss": 0.8982, "step": 2630 }, { "epoch": 2.1303643724696357, "grad_norm": 1.7082989524349448, "learning_rate": 3.966796642767745e-06, "loss": 1.0141, "step": 2631 }, { "epoch": 2.1311740890688258, "grad_norm": 1.6653197593597078, "learning_rate": 3.959944106638881e-06, "loss": 0.9809, "step": 2632 }, { "epoch": 2.1319838056680163, "grad_norm": 1.6678594956056483, "learning_rate": 3.953096032883473e-06, "loss": 0.98, "step": 2633 }, { "epoch": 2.1327935222672063, "grad_norm": 1.7005284089411195, "learning_rate": 3.946252426560855e-06, "loss": 0.9979, "step": 2634 }, { "epoch": 2.133603238866397, "grad_norm": 1.6151071121961287, "learning_rate": 3.939413292727061e-06, "loss": 0.9968, "step": 2635 }, { "epoch": 2.134412955465587, "grad_norm": 1.6806724677037892, "learning_rate": 3.932578636434822e-06, "loss": 0.9318, "step": 2636 }, { "epoch": 2.1352226720647773, "grad_norm": 1.7311866714728976, "learning_rate": 3.9257484627335545e-06, "loss": 0.965, "step": 2637 }, { "epoch": 2.136032388663968, "grad_norm": 1.7711121211961232, "learning_rate": 3.9189227766693715e-06, "loss": 0.9824, "step": 2638 }, { "epoch": 2.136842105263158, "grad_norm": 1.684667995414289, "learning_rate": 3.912101583285072e-06, "loss": 0.915, "step": 2639 }, { "epoch": 2.1376518218623484, "grad_norm": 1.7191802163538186, "learning_rate": 3.9052848876201285e-06, "loss": 0.9592, "step": 2640 }, { "epoch": 2.1384615384615384, "grad_norm": 1.6407834559216017, "learning_rate": 3.898472694710692e-06, "loss": 0.9821, "step": 2641 }, { "epoch": 2.139271255060729, "grad_norm": 1.6128402655438978, "learning_rate": 3.891665009589588e-06, "loss": 0.9976, "step": 2642 }, { "epoch": 2.140080971659919, "grad_norm": 1.7113632799708938, "learning_rate": 3.884861837286314e-06, "loss": 0.9884, "step": 2643 }, { "epoch": 2.1408906882591094, "grad_norm": 1.6447541227519287, "learning_rate": 3.878063182827025e-06, "loss": 0.9051, "step": 2644 }, { "epoch": 2.1417004048582995, "grad_norm": 1.700018398274763, "learning_rate": 3.8712690512345555e-06, "loss": 1.0079, "step": 2645 }, { "epoch": 2.14251012145749, "grad_norm": 1.7278282489587906, "learning_rate": 3.8644794475283754e-06, "loss": 0.9488, "step": 2646 }, { "epoch": 2.14331983805668, "grad_norm": 1.7030788584383703, "learning_rate": 3.857694376724634e-06, "loss": 0.9766, "step": 2647 }, { "epoch": 2.1441295546558705, "grad_norm": 1.6975892598366673, "learning_rate": 3.850913843836111e-06, "loss": 0.9645, "step": 2648 }, { "epoch": 2.1449392712550606, "grad_norm": 1.6362040151463373, "learning_rate": 3.844137853872245e-06, "loss": 0.9809, "step": 2649 }, { "epoch": 2.145748987854251, "grad_norm": 1.6598578036651748, "learning_rate": 3.837366411839114e-06, "loss": 0.9822, "step": 2650 }, { "epoch": 2.146558704453441, "grad_norm": 1.6357704564069009, "learning_rate": 3.830599522739437e-06, "loss": 1.0175, "step": 2651 }, { "epoch": 2.1473684210526316, "grad_norm": 1.6232730145470107, "learning_rate": 3.823837191572567e-06, "loss": 0.9417, "step": 2652 }, { "epoch": 2.148178137651822, "grad_norm": 1.6210092712146753, "learning_rate": 3.817079423334497e-06, "loss": 0.9656, "step": 2653 }, { "epoch": 2.148987854251012, "grad_norm": 1.688514119577297, "learning_rate": 3.8103262230178395e-06, "loss": 1.0027, "step": 2654 }, { "epoch": 2.1497975708502026, "grad_norm": 1.6972006744474526, "learning_rate": 3.8035775956118416e-06, "loss": 1.0177, "step": 2655 }, { "epoch": 2.1506072874493927, "grad_norm": 1.6862311012034847, "learning_rate": 3.7968335461023654e-06, "loss": 0.9403, "step": 2656 }, { "epoch": 2.151417004048583, "grad_norm": 1.6576918628517623, "learning_rate": 3.790094079471891e-06, "loss": 0.9654, "step": 2657 }, { "epoch": 2.152226720647773, "grad_norm": 1.705286165779682, "learning_rate": 3.7833592006995144e-06, "loss": 1.0502, "step": 2658 }, { "epoch": 2.1530364372469637, "grad_norm": 1.6153316475940291, "learning_rate": 3.77662891476094e-06, "loss": 0.9806, "step": 2659 }, { "epoch": 2.1538461538461537, "grad_norm": 1.6880698498431286, "learning_rate": 3.7699032266284863e-06, "loss": 1.0253, "step": 2660 }, { "epoch": 2.1546558704453442, "grad_norm": 1.7777836789038124, "learning_rate": 3.7631821412710668e-06, "loss": 0.9982, "step": 2661 }, { "epoch": 2.1554655870445343, "grad_norm": 1.6474905913845634, "learning_rate": 3.7564656636541928e-06, "loss": 0.9736, "step": 2662 }, { "epoch": 2.1562753036437248, "grad_norm": 1.64817525670547, "learning_rate": 3.7497537987399836e-06, "loss": 0.9996, "step": 2663 }, { "epoch": 2.157085020242915, "grad_norm": 1.6328494859671836, "learning_rate": 3.7430465514871405e-06, "loss": 0.9529, "step": 2664 }, { "epoch": 2.1578947368421053, "grad_norm": 1.5460907995988389, "learning_rate": 3.736343926850954e-06, "loss": 0.9776, "step": 2665 }, { "epoch": 2.1587044534412954, "grad_norm": 1.7999028275226707, "learning_rate": 3.729645929783302e-06, "loss": 0.9444, "step": 2666 }, { "epoch": 2.159514170040486, "grad_norm": 1.6674054885247187, "learning_rate": 3.7229525652326392e-06, "loss": 0.9698, "step": 2667 }, { "epoch": 2.1603238866396763, "grad_norm": 1.7459838166181108, "learning_rate": 3.7162638381440077e-06, "loss": 0.9791, "step": 2668 }, { "epoch": 2.1611336032388664, "grad_norm": 1.7171429916272887, "learning_rate": 3.709579753459015e-06, "loss": 1.0177, "step": 2669 }, { "epoch": 2.161943319838057, "grad_norm": 1.729852115207468, "learning_rate": 3.702900316115836e-06, "loss": 1.0042, "step": 2670 }, { "epoch": 2.162753036437247, "grad_norm": 1.7165892661582856, "learning_rate": 3.6962255310492256e-06, "loss": 0.9719, "step": 2671 }, { "epoch": 2.1635627530364374, "grad_norm": 1.628850570144987, "learning_rate": 3.689555403190488e-06, "loss": 1.0131, "step": 2672 }, { "epoch": 2.1643724696356275, "grad_norm": 1.6296844076079475, "learning_rate": 3.6828899374674933e-06, "loss": 0.9733, "step": 2673 }, { "epoch": 2.165182186234818, "grad_norm": 1.623822465699174, "learning_rate": 3.67622913880466e-06, "loss": 1.0049, "step": 2674 }, { "epoch": 2.165991902834008, "grad_norm": 1.6705667827629207, "learning_rate": 3.6695730121229734e-06, "loss": 1.0427, "step": 2675 }, { "epoch": 2.1668016194331985, "grad_norm": 1.6901112337582092, "learning_rate": 3.6629215623399526e-06, "loss": 0.9016, "step": 2676 }, { "epoch": 2.1676113360323885, "grad_norm": 1.635822547002648, "learning_rate": 3.6562747943696696e-06, "loss": 1.0312, "step": 2677 }, { "epoch": 2.168421052631579, "grad_norm": 1.773469367884702, "learning_rate": 3.6496327131227284e-06, "loss": 0.9799, "step": 2678 }, { "epoch": 2.169230769230769, "grad_norm": 1.703119798535184, "learning_rate": 3.6429953235062853e-06, "loss": 0.9642, "step": 2679 }, { "epoch": 2.1700404858299596, "grad_norm": 1.5950037755310869, "learning_rate": 3.6363626304240185e-06, "loss": 0.9954, "step": 2680 }, { "epoch": 2.1708502024291496, "grad_norm": 1.6651131112700766, "learning_rate": 3.629734638776139e-06, "loss": 0.9598, "step": 2681 }, { "epoch": 2.17165991902834, "grad_norm": 1.674961128025693, "learning_rate": 3.6231113534593833e-06, "loss": 0.9485, "step": 2682 }, { "epoch": 2.1724696356275306, "grad_norm": 1.6927204922354684, "learning_rate": 3.616492779367018e-06, "loss": 1.0114, "step": 2683 }, { "epoch": 2.1732793522267206, "grad_norm": 1.6141482144444916, "learning_rate": 3.609878921388822e-06, "loss": 1.0204, "step": 2684 }, { "epoch": 2.174089068825911, "grad_norm": 1.5938306093069654, "learning_rate": 3.6032697844110896e-06, "loss": 1.0281, "step": 2685 }, { "epoch": 2.174898785425101, "grad_norm": 1.674185209940073, "learning_rate": 3.596665373316629e-06, "loss": 1.0156, "step": 2686 }, { "epoch": 2.1757085020242917, "grad_norm": 1.652753408219103, "learning_rate": 3.590065692984762e-06, "loss": 0.9686, "step": 2687 }, { "epoch": 2.1765182186234817, "grad_norm": 1.6554790863907958, "learning_rate": 3.583470748291309e-06, "loss": 0.961, "step": 2688 }, { "epoch": 2.177327935222672, "grad_norm": 1.7595295916415807, "learning_rate": 3.5768805441085885e-06, "loss": 1.0052, "step": 2689 }, { "epoch": 2.1781376518218623, "grad_norm": 1.7129389096310346, "learning_rate": 3.5702950853054284e-06, "loss": 1.0033, "step": 2690 }, { "epoch": 2.1789473684210527, "grad_norm": 1.6958273168912117, "learning_rate": 3.5637143767471427e-06, "loss": 1.0284, "step": 2691 }, { "epoch": 2.179757085020243, "grad_norm": 1.6851614860716266, "learning_rate": 3.5571384232955365e-06, "loss": 0.971, "step": 2692 }, { "epoch": 2.1805668016194333, "grad_norm": 1.6326875705295927, "learning_rate": 3.550567229808901e-06, "loss": 1.002, "step": 2693 }, { "epoch": 2.1813765182186233, "grad_norm": 1.6508788198860427, "learning_rate": 3.5440008011420103e-06, "loss": 1.0164, "step": 2694 }, { "epoch": 2.182186234817814, "grad_norm": 1.6307505301931884, "learning_rate": 3.5374391421461273e-06, "loss": 0.9336, "step": 2695 }, { "epoch": 2.182995951417004, "grad_norm": 1.6240785364108545, "learning_rate": 3.5308822576689805e-06, "loss": 1.0049, "step": 2696 }, { "epoch": 2.1838056680161944, "grad_norm": 1.6543399441514246, "learning_rate": 3.5243301525547714e-06, "loss": 1.0455, "step": 2697 }, { "epoch": 2.184615384615385, "grad_norm": 1.650315122489001, "learning_rate": 3.5177828316441797e-06, "loss": 0.9574, "step": 2698 }, { "epoch": 2.185425101214575, "grad_norm": 1.6675806777098334, "learning_rate": 3.511240299774341e-06, "loss": 1.005, "step": 2699 }, { "epoch": 2.1862348178137654, "grad_norm": 1.6634779623745304, "learning_rate": 3.5047025617788578e-06, "loss": 1.0074, "step": 2700 }, { "epoch": 2.1870445344129554, "grad_norm": 1.6256164088565501, "learning_rate": 3.4981696224877893e-06, "loss": 0.9875, "step": 2701 }, { "epoch": 2.187854251012146, "grad_norm": 1.7291877052522024, "learning_rate": 3.491641486727645e-06, "loss": 0.9844, "step": 2702 }, { "epoch": 2.188663967611336, "grad_norm": 1.7129557045779524, "learning_rate": 3.4851181593213967e-06, "loss": 0.9625, "step": 2703 }, { "epoch": 2.1894736842105265, "grad_norm": 1.7620888900482394, "learning_rate": 3.478599645088453e-06, "loss": 0.9872, "step": 2704 }, { "epoch": 2.1902834008097165, "grad_norm": 1.6649502085884391, "learning_rate": 3.4720859488446744e-06, "loss": 0.9264, "step": 2705 }, { "epoch": 2.191093117408907, "grad_norm": 1.6084607566937745, "learning_rate": 3.4655770754023574e-06, "loss": 0.9531, "step": 2706 }, { "epoch": 2.191902834008097, "grad_norm": 1.6974794965679043, "learning_rate": 3.4590730295702356e-06, "loss": 0.9283, "step": 2707 }, { "epoch": 2.1927125506072875, "grad_norm": 1.687339468024081, "learning_rate": 3.452573816153476e-06, "loss": 0.9698, "step": 2708 }, { "epoch": 2.1935222672064776, "grad_norm": 1.7543324832553966, "learning_rate": 3.446079439953677e-06, "loss": 0.9411, "step": 2709 }, { "epoch": 2.194331983805668, "grad_norm": 1.711648121979403, "learning_rate": 3.4395899057688575e-06, "loss": 0.9817, "step": 2710 }, { "epoch": 2.195141700404858, "grad_norm": 1.664633436704711, "learning_rate": 3.4331052183934687e-06, "loss": 0.9834, "step": 2711 }, { "epoch": 2.1959514170040486, "grad_norm": 1.655215067217286, "learning_rate": 3.4266253826183805e-06, "loss": 0.9665, "step": 2712 }, { "epoch": 2.196761133603239, "grad_norm": 1.7006734272766477, "learning_rate": 3.4201504032308695e-06, "loss": 0.9777, "step": 2713 }, { "epoch": 2.197570850202429, "grad_norm": 1.714785930728774, "learning_rate": 3.41368028501463e-06, "loss": 0.9656, "step": 2714 }, { "epoch": 2.1983805668016196, "grad_norm": 1.6909113867716612, "learning_rate": 3.407215032749763e-06, "loss": 1.0659, "step": 2715 }, { "epoch": 2.1991902834008097, "grad_norm": 1.68682986907009, "learning_rate": 3.4007546512127764e-06, "loss": 0.9103, "step": 2716 }, { "epoch": 2.2, "grad_norm": 1.6137573322333, "learning_rate": 3.3942991451765793e-06, "loss": 0.9957, "step": 2717 }, { "epoch": 2.2008097165991902, "grad_norm": 1.6129056026950326, "learning_rate": 3.387848519410475e-06, "loss": 0.9935, "step": 2718 }, { "epoch": 2.2016194331983807, "grad_norm": 1.694139432275297, "learning_rate": 3.3814027786801675e-06, "loss": 0.9418, "step": 2719 }, { "epoch": 2.2024291497975708, "grad_norm": 1.7634838421878205, "learning_rate": 3.374961927747751e-06, "loss": 0.9193, "step": 2720 }, { "epoch": 2.2032388663967613, "grad_norm": 1.6980171565314421, "learning_rate": 3.3685259713717034e-06, "loss": 0.9727, "step": 2721 }, { "epoch": 2.2040485829959513, "grad_norm": 1.722930414761989, "learning_rate": 3.362094914306888e-06, "loss": 0.93, "step": 2722 }, { "epoch": 2.204858299595142, "grad_norm": 1.6124571834590236, "learning_rate": 3.355668761304548e-06, "loss": 1.1032, "step": 2723 }, { "epoch": 2.205668016194332, "grad_norm": 1.6394333882696688, "learning_rate": 3.349247517112305e-06, "loss": 1.0452, "step": 2724 }, { "epoch": 2.2064777327935223, "grad_norm": 1.6345795057358297, "learning_rate": 3.342831186474149e-06, "loss": 0.9504, "step": 2725 }, { "epoch": 2.2072874493927124, "grad_norm": 1.6475621511846046, "learning_rate": 3.336419774130447e-06, "loss": 1.0154, "step": 2726 }, { "epoch": 2.208097165991903, "grad_norm": 1.6238349459249526, "learning_rate": 3.3300132848179346e-06, "loss": 0.9662, "step": 2727 }, { "epoch": 2.208906882591093, "grad_norm": 1.6615756750809165, "learning_rate": 3.3236117232696984e-06, "loss": 1.0466, "step": 2728 }, { "epoch": 2.2097165991902834, "grad_norm": 1.7120719200934522, "learning_rate": 3.3172150942151947e-06, "loss": 0.9345, "step": 2729 }, { "epoch": 2.2105263157894735, "grad_norm": 1.6804653865746528, "learning_rate": 3.31082340238023e-06, "loss": 0.9928, "step": 2730 }, { "epoch": 2.211336032388664, "grad_norm": 1.7147429622035564, "learning_rate": 3.3044366524869652e-06, "loss": 0.9555, "step": 2731 }, { "epoch": 2.2121457489878544, "grad_norm": 1.664455141304305, "learning_rate": 3.2980548492539064e-06, "loss": 1.0162, "step": 2732 }, { "epoch": 2.2129554655870445, "grad_norm": 1.7280945010756663, "learning_rate": 3.291677997395918e-06, "loss": 0.9628, "step": 2733 }, { "epoch": 2.213765182186235, "grad_norm": 1.673275550114484, "learning_rate": 3.2853061016241884e-06, "loss": 1.0242, "step": 2734 }, { "epoch": 2.214574898785425, "grad_norm": 1.7272385557412295, "learning_rate": 3.27893916664626e-06, "loss": 0.9813, "step": 2735 }, { "epoch": 2.2153846153846155, "grad_norm": 1.6681828389749171, "learning_rate": 3.2725771971660002e-06, "loss": 0.9479, "step": 2736 }, { "epoch": 2.2161943319838056, "grad_norm": 1.6481760708593136, "learning_rate": 3.266220197883613e-06, "loss": 0.9863, "step": 2737 }, { "epoch": 2.217004048582996, "grad_norm": 1.6193831329668813, "learning_rate": 3.259868173495626e-06, "loss": 0.972, "step": 2738 }, { "epoch": 2.217813765182186, "grad_norm": 1.6439379984114413, "learning_rate": 3.2535211286948955e-06, "loss": 0.9832, "step": 2739 }, { "epoch": 2.2186234817813766, "grad_norm": 1.6572224667024693, "learning_rate": 3.2471790681705928e-06, "loss": 1.036, "step": 2740 }, { "epoch": 2.2194331983805666, "grad_norm": 1.6609689760873279, "learning_rate": 3.2408419966082195e-06, "loss": 1.0051, "step": 2741 }, { "epoch": 2.220242914979757, "grad_norm": 1.6875131566811306, "learning_rate": 3.2345099186895758e-06, "loss": 0.9738, "step": 2742 }, { "epoch": 2.221052631578947, "grad_norm": 1.7178275452285305, "learning_rate": 3.2281828390927873e-06, "loss": 1.0007, "step": 2743 }, { "epoch": 2.2218623481781377, "grad_norm": 1.7172578500910411, "learning_rate": 3.221860762492275e-06, "loss": 1.0334, "step": 2744 }, { "epoch": 2.2226720647773277, "grad_norm": 1.6450445517301915, "learning_rate": 3.215543693558769e-06, "loss": 1.0215, "step": 2745 }, { "epoch": 2.223481781376518, "grad_norm": 1.6443211869924201, "learning_rate": 3.2092316369593e-06, "loss": 0.9702, "step": 2746 }, { "epoch": 2.2242914979757087, "grad_norm": 1.6072557369507485, "learning_rate": 3.20292459735719e-06, "loss": 0.9568, "step": 2747 }, { "epoch": 2.2251012145748987, "grad_norm": 1.5890263149144106, "learning_rate": 3.1966225794120666e-06, "loss": 0.9662, "step": 2748 }, { "epoch": 2.2259109311740892, "grad_norm": 1.6626050407972168, "learning_rate": 3.1903255877798365e-06, "loss": 0.9184, "step": 2749 }, { "epoch": 2.2267206477732793, "grad_norm": 1.6429805630498902, "learning_rate": 3.1840336271126935e-06, "loss": 0.9815, "step": 2750 }, { "epoch": 2.2275303643724698, "grad_norm": 1.7143845132000823, "learning_rate": 3.1777467020591236e-06, "loss": 1.0032, "step": 2751 }, { "epoch": 2.22834008097166, "grad_norm": 1.7160771696928598, "learning_rate": 3.1714648172638827e-06, "loss": 0.9347, "step": 2752 }, { "epoch": 2.2291497975708503, "grad_norm": 1.737109438949758, "learning_rate": 3.165187977368007e-06, "loss": 0.9485, "step": 2753 }, { "epoch": 2.2299595141700403, "grad_norm": 1.6534753583589024, "learning_rate": 3.158916187008806e-06, "loss": 0.9403, "step": 2754 }, { "epoch": 2.230769230769231, "grad_norm": 1.6881735304925691, "learning_rate": 3.152649450819852e-06, "loss": 0.989, "step": 2755 }, { "epoch": 2.231578947368421, "grad_norm": 1.683248270203457, "learning_rate": 3.146387773431e-06, "loss": 1.0457, "step": 2756 }, { "epoch": 2.2323886639676114, "grad_norm": 1.716360447509341, "learning_rate": 3.1401311594683494e-06, "loss": 1.011, "step": 2757 }, { "epoch": 2.2331983805668014, "grad_norm": 1.726117425194692, "learning_rate": 3.1338796135542647e-06, "loss": 0.9894, "step": 2758 }, { "epoch": 2.234008097165992, "grad_norm": 1.722136573207579, "learning_rate": 3.1276331403073733e-06, "loss": 0.9064, "step": 2759 }, { "epoch": 2.234817813765182, "grad_norm": 1.6672196996268878, "learning_rate": 3.1213917443425456e-06, "loss": 0.9917, "step": 2760 }, { "epoch": 2.2356275303643725, "grad_norm": 1.7657193112711613, "learning_rate": 3.1151554302709063e-06, "loss": 1.0076, "step": 2761 }, { "epoch": 2.236437246963563, "grad_norm": 1.6902194564599813, "learning_rate": 3.108924202699819e-06, "loss": 1.0375, "step": 2762 }, { "epoch": 2.237246963562753, "grad_norm": 1.7990661956680112, "learning_rate": 3.1026980662328997e-06, "loss": 0.9798, "step": 2763 }, { "epoch": 2.2380566801619435, "grad_norm": 1.6588109292539122, "learning_rate": 3.096477025469996e-06, "loss": 0.9786, "step": 2764 }, { "epoch": 2.2388663967611335, "grad_norm": 1.7473866019174604, "learning_rate": 3.0902610850071922e-06, "loss": 1.001, "step": 2765 }, { "epoch": 2.239676113360324, "grad_norm": 1.5963776881440452, "learning_rate": 3.084050249436802e-06, "loss": 0.9453, "step": 2766 }, { "epoch": 2.240485829959514, "grad_norm": 1.6511854722362378, "learning_rate": 3.077844523347374e-06, "loss": 0.9677, "step": 2767 }, { "epoch": 2.2412955465587046, "grad_norm": 1.8031304104053283, "learning_rate": 3.0716439113236785e-06, "loss": 0.9372, "step": 2768 }, { "epoch": 2.2421052631578946, "grad_norm": 1.6716997949617207, "learning_rate": 3.0654484179467047e-06, "loss": 0.9719, "step": 2769 }, { "epoch": 2.242914979757085, "grad_norm": 1.623759924284454, "learning_rate": 3.0592580477936606e-06, "loss": 0.9986, "step": 2770 }, { "epoch": 2.243724696356275, "grad_norm": 1.750132587533433, "learning_rate": 3.0530728054379787e-06, "loss": 0.9968, "step": 2771 }, { "epoch": 2.2445344129554656, "grad_norm": 1.6683052360621997, "learning_rate": 3.0468926954492907e-06, "loss": 0.9192, "step": 2772 }, { "epoch": 2.2453441295546557, "grad_norm": 1.7517498796441024, "learning_rate": 3.0407177223934426e-06, "loss": 0.9623, "step": 2773 }, { "epoch": 2.246153846153846, "grad_norm": 1.7300413517444366, "learning_rate": 3.034547890832481e-06, "loss": 1.0161, "step": 2774 }, { "epoch": 2.246963562753036, "grad_norm": 1.6389215305609568, "learning_rate": 3.0283832053246644e-06, "loss": 0.9079, "step": 2775 }, { "epoch": 2.2477732793522267, "grad_norm": 1.6760927536723689, "learning_rate": 3.022223670424437e-06, "loss": 0.9628, "step": 2776 }, { "epoch": 2.248582995951417, "grad_norm": 1.6857597576034187, "learning_rate": 3.016069290682441e-06, "loss": 0.9542, "step": 2777 }, { "epoch": 2.2493927125506072, "grad_norm": 1.6715075659799083, "learning_rate": 3.009920070645518e-06, "loss": 1.048, "step": 2778 }, { "epoch": 2.2502024291497977, "grad_norm": 1.6998697843304018, "learning_rate": 3.0037760148566874e-06, "loss": 0.9779, "step": 2779 }, { "epoch": 2.251012145748988, "grad_norm": 1.7822427598135133, "learning_rate": 2.99763712785516e-06, "loss": 0.8956, "step": 2780 }, { "epoch": 2.2518218623481783, "grad_norm": 1.6755771799162749, "learning_rate": 2.9915034141763234e-06, "loss": 0.9552, "step": 2781 }, { "epoch": 2.2526315789473683, "grad_norm": 1.7432840821094, "learning_rate": 2.9853748783517435e-06, "loss": 1.0165, "step": 2782 }, { "epoch": 2.253441295546559, "grad_norm": 1.7142235267700365, "learning_rate": 2.9792515249091657e-06, "loss": 1.0134, "step": 2783 }, { "epoch": 2.254251012145749, "grad_norm": 1.7197547849869146, "learning_rate": 2.973133358372504e-06, "loss": 1.0114, "step": 2784 }, { "epoch": 2.2550607287449393, "grad_norm": 1.6535591134631202, "learning_rate": 2.967020383261834e-06, "loss": 1.0338, "step": 2785 }, { "epoch": 2.2558704453441294, "grad_norm": 1.5835662400274373, "learning_rate": 2.960912604093409e-06, "loss": 1.0032, "step": 2786 }, { "epoch": 2.25668016194332, "grad_norm": 1.5736860921183824, "learning_rate": 2.954810025379633e-06, "loss": 1.0048, "step": 2787 }, { "epoch": 2.25748987854251, "grad_norm": 1.7772332085899527, "learning_rate": 2.948712651629071e-06, "loss": 0.9409, "step": 2788 }, { "epoch": 2.2582995951417004, "grad_norm": 1.8153302390569306, "learning_rate": 2.9426204873464414e-06, "loss": 0.9485, "step": 2789 }, { "epoch": 2.2591093117408905, "grad_norm": 1.6435844735478726, "learning_rate": 2.9365335370326143e-06, "loss": 0.9743, "step": 2790 }, { "epoch": 2.259919028340081, "grad_norm": 1.7393090327566456, "learning_rate": 2.9304518051846143e-06, "loss": 0.9736, "step": 2791 }, { "epoch": 2.2607287449392715, "grad_norm": 1.6924799124970187, "learning_rate": 2.924375296295597e-06, "loss": 0.9779, "step": 2792 }, { "epoch": 2.2615384615384615, "grad_norm": 1.6747668468696226, "learning_rate": 2.9183040148548757e-06, "loss": 0.9792, "step": 2793 }, { "epoch": 2.262348178137652, "grad_norm": 1.8608821257094024, "learning_rate": 2.9122379653478894e-06, "loss": 0.9717, "step": 2794 }, { "epoch": 2.263157894736842, "grad_norm": 1.6857679680681223, "learning_rate": 2.9061771522562143e-06, "loss": 0.9856, "step": 2795 }, { "epoch": 2.2639676113360325, "grad_norm": 1.5974903308608277, "learning_rate": 2.90012158005756e-06, "loss": 0.9913, "step": 2796 }, { "epoch": 2.2647773279352226, "grad_norm": 1.699823981139351, "learning_rate": 2.8940712532257633e-06, "loss": 0.9562, "step": 2797 }, { "epoch": 2.265587044534413, "grad_norm": 1.6553947780834652, "learning_rate": 2.8880261762307837e-06, "loss": 0.9666, "step": 2798 }, { "epoch": 2.266396761133603, "grad_norm": 1.6975659495235669, "learning_rate": 2.8819863535387083e-06, "loss": 0.983, "step": 2799 }, { "epoch": 2.2672064777327936, "grad_norm": 1.6417408606225983, "learning_rate": 2.875951789611734e-06, "loss": 0.9544, "step": 2800 }, { "epoch": 2.2680161943319836, "grad_norm": 1.6970530026128983, "learning_rate": 2.8699224889081825e-06, "loss": 1.0598, "step": 2801 }, { "epoch": 2.268825910931174, "grad_norm": 1.6632904185152642, "learning_rate": 2.8638984558824777e-06, "loss": 1.0024, "step": 2802 }, { "epoch": 2.269635627530364, "grad_norm": 1.7530525316894194, "learning_rate": 2.857879694985156e-06, "loss": 1.0393, "step": 2803 }, { "epoch": 2.2704453441295547, "grad_norm": 1.6495544362471581, "learning_rate": 2.851866210662858e-06, "loss": 0.9628, "step": 2804 }, { "epoch": 2.2712550607287447, "grad_norm": 1.636310387000526, "learning_rate": 2.8458580073583262e-06, "loss": 0.9938, "step": 2805 }, { "epoch": 2.272064777327935, "grad_norm": 1.6690369023149698, "learning_rate": 2.839855089510398e-06, "loss": 0.9759, "step": 2806 }, { "epoch": 2.2728744939271257, "grad_norm": 1.7396582090782353, "learning_rate": 2.8338574615540136e-06, "loss": 1.0541, "step": 2807 }, { "epoch": 2.2736842105263158, "grad_norm": 1.6449963646693437, "learning_rate": 2.827865127920203e-06, "loss": 0.9446, "step": 2808 }, { "epoch": 2.2744939271255062, "grad_norm": 1.71078655363176, "learning_rate": 2.821878093036079e-06, "loss": 0.9266, "step": 2809 }, { "epoch": 2.2753036437246963, "grad_norm": 1.722249474010595, "learning_rate": 2.8158963613248437e-06, "loss": 0.9671, "step": 2810 }, { "epoch": 2.276113360323887, "grad_norm": 1.6729015082048833, "learning_rate": 2.8099199372057818e-06, "loss": 0.995, "step": 2811 }, { "epoch": 2.276923076923077, "grad_norm": 1.7356714743398183, "learning_rate": 2.803948825094255e-06, "loss": 1.0026, "step": 2812 }, { "epoch": 2.2777327935222673, "grad_norm": 1.7087100915274263, "learning_rate": 2.7979830294016985e-06, "loss": 0.9608, "step": 2813 }, { "epoch": 2.2785425101214574, "grad_norm": 1.6622618230635267, "learning_rate": 2.792022554535625e-06, "loss": 1.0405, "step": 2814 }, { "epoch": 2.279352226720648, "grad_norm": 1.6282728230738879, "learning_rate": 2.7860674048996174e-06, "loss": 1.0188, "step": 2815 }, { "epoch": 2.280161943319838, "grad_norm": 1.6465016952207205, "learning_rate": 2.780117584893317e-06, "loss": 1.0251, "step": 2816 }, { "epoch": 2.2809716599190284, "grad_norm": 1.6700201012055247, "learning_rate": 2.774173098912433e-06, "loss": 0.9897, "step": 2817 }, { "epoch": 2.2817813765182184, "grad_norm": 1.654997034971534, "learning_rate": 2.76823395134873e-06, "loss": 0.9931, "step": 2818 }, { "epoch": 2.282591093117409, "grad_norm": 1.6463795592129717, "learning_rate": 2.7623001465900323e-06, "loss": 0.9646, "step": 2819 }, { "epoch": 2.283400809716599, "grad_norm": 1.689096129045378, "learning_rate": 2.756371689020214e-06, "loss": 1.037, "step": 2820 }, { "epoch": 2.2842105263157895, "grad_norm": 1.739455812749162, "learning_rate": 2.7504485830191985e-06, "loss": 1.0553, "step": 2821 }, { "epoch": 2.28502024291498, "grad_norm": 1.737347509141222, "learning_rate": 2.7445308329629593e-06, "loss": 0.9492, "step": 2822 }, { "epoch": 2.28582995951417, "grad_norm": 1.7541768615419884, "learning_rate": 2.738618443223513e-06, "loss": 0.954, "step": 2823 }, { "epoch": 2.2866396761133605, "grad_norm": 1.680772296035219, "learning_rate": 2.7327114181689117e-06, "loss": 1.0036, "step": 2824 }, { "epoch": 2.2874493927125505, "grad_norm": 1.7198767387432525, "learning_rate": 2.7268097621632473e-06, "loss": 1.0243, "step": 2825 }, { "epoch": 2.288259109311741, "grad_norm": 1.7541843187045827, "learning_rate": 2.7209134795666404e-06, "loss": 0.9636, "step": 2826 }, { "epoch": 2.289068825910931, "grad_norm": 1.7969143994647754, "learning_rate": 2.715022574735249e-06, "loss": 0.956, "step": 2827 }, { "epoch": 2.2898785425101216, "grad_norm": 1.7235098932288484, "learning_rate": 2.709137052021248e-06, "loss": 0.9696, "step": 2828 }, { "epoch": 2.2906882591093116, "grad_norm": 1.7147800208396093, "learning_rate": 2.7032569157728503e-06, "loss": 0.9945, "step": 2829 }, { "epoch": 2.291497975708502, "grad_norm": 1.6634199908382443, "learning_rate": 2.697382170334275e-06, "loss": 0.9715, "step": 2830 }, { "epoch": 2.292307692307692, "grad_norm": 1.7025316188789756, "learning_rate": 2.6915128200457706e-06, "loss": 0.9584, "step": 2831 }, { "epoch": 2.2931174089068826, "grad_norm": 1.7135149326400867, "learning_rate": 2.68564886924359e-06, "loss": 0.9675, "step": 2832 }, { "epoch": 2.2939271255060727, "grad_norm": 1.697368241080622, "learning_rate": 2.679790322260002e-06, "loss": 1.0273, "step": 2833 }, { "epoch": 2.294736842105263, "grad_norm": 1.6304843444402075, "learning_rate": 2.673937183423282e-06, "loss": 0.998, "step": 2834 }, { "epoch": 2.2955465587044532, "grad_norm": 1.6737945308046214, "learning_rate": 2.6680894570577042e-06, "loss": 0.9654, "step": 2835 }, { "epoch": 2.2963562753036437, "grad_norm": 1.6851080909667857, "learning_rate": 2.6622471474835585e-06, "loss": 0.9605, "step": 2836 }, { "epoch": 2.297165991902834, "grad_norm": 1.7289635381651745, "learning_rate": 2.6564102590171204e-06, "loss": 1.0554, "step": 2837 }, { "epoch": 2.2979757085020243, "grad_norm": 1.6276718372129562, "learning_rate": 2.6505787959706607e-06, "loss": 0.9341, "step": 2838 }, { "epoch": 2.2987854251012148, "grad_norm": 1.775958697906154, "learning_rate": 2.6447527626524504e-06, "loss": 0.929, "step": 2839 }, { "epoch": 2.299595141700405, "grad_norm": 1.6271115167068193, "learning_rate": 2.638932163366742e-06, "loss": 1.0366, "step": 2840 }, { "epoch": 2.3004048582995953, "grad_norm": 1.7404302828297658, "learning_rate": 2.633117002413774e-06, "loss": 1.0187, "step": 2841 }, { "epoch": 2.3012145748987853, "grad_norm": 1.7172201475593398, "learning_rate": 2.6273072840897685e-06, "loss": 0.9908, "step": 2842 }, { "epoch": 2.302024291497976, "grad_norm": 1.7540383203717043, "learning_rate": 2.6215030126869235e-06, "loss": 0.9614, "step": 2843 }, { "epoch": 2.302834008097166, "grad_norm": 1.7302429828447257, "learning_rate": 2.6157041924934223e-06, "loss": 0.9773, "step": 2844 }, { "epoch": 2.3036437246963564, "grad_norm": 1.621473177336414, "learning_rate": 2.6099108277934105e-06, "loss": 0.9527, "step": 2845 }, { "epoch": 2.3044534412955464, "grad_norm": 1.6861502912702049, "learning_rate": 2.604122922867004e-06, "loss": 1.0025, "step": 2846 }, { "epoch": 2.305263157894737, "grad_norm": 1.6470296585152888, "learning_rate": 2.5983404819902937e-06, "loss": 1.0029, "step": 2847 }, { "epoch": 2.306072874493927, "grad_norm": 1.732391716550845, "learning_rate": 2.592563509435325e-06, "loss": 0.9638, "step": 2848 }, { "epoch": 2.3068825910931174, "grad_norm": 1.6830485680896312, "learning_rate": 2.586792009470107e-06, "loss": 1.038, "step": 2849 }, { "epoch": 2.3076923076923075, "grad_norm": 1.7578887787386592, "learning_rate": 2.581025986358602e-06, "loss": 0.9445, "step": 2850 }, { "epoch": 2.308502024291498, "grad_norm": 1.7346747000095792, "learning_rate": 2.575265444360733e-06, "loss": 0.9888, "step": 2851 }, { "epoch": 2.3093117408906885, "grad_norm": 1.7156101092464429, "learning_rate": 2.5695103877323678e-06, "loss": 1.0043, "step": 2852 }, { "epoch": 2.3101214574898785, "grad_norm": 1.6144048959910942, "learning_rate": 2.563760820725325e-06, "loss": 1.0411, "step": 2853 }, { "epoch": 2.310931174089069, "grad_norm": 1.6919970110753482, "learning_rate": 2.5580167475873595e-06, "loss": 0.9792, "step": 2854 }, { "epoch": 2.311740890688259, "grad_norm": 1.763114954790134, "learning_rate": 2.5522781725621814e-06, "loss": 0.9677, "step": 2855 }, { "epoch": 2.3125506072874495, "grad_norm": 1.7245366432759541, "learning_rate": 2.5465450998894294e-06, "loss": 0.9699, "step": 2856 }, { "epoch": 2.3133603238866396, "grad_norm": 1.7077901989875472, "learning_rate": 2.540817533804676e-06, "loss": 1.0007, "step": 2857 }, { "epoch": 2.31417004048583, "grad_norm": 1.6847209459419308, "learning_rate": 2.535095478539428e-06, "loss": 1.0254, "step": 2858 }, { "epoch": 2.31497975708502, "grad_norm": 1.706175440021964, "learning_rate": 2.529378938321124e-06, "loss": 0.9536, "step": 2859 }, { "epoch": 2.3157894736842106, "grad_norm": 1.7691390032285985, "learning_rate": 2.523667917373125e-06, "loss": 1.0103, "step": 2860 }, { "epoch": 2.3165991902834007, "grad_norm": 1.8077440425603108, "learning_rate": 2.517962419914712e-06, "loss": 0.9207, "step": 2861 }, { "epoch": 2.317408906882591, "grad_norm": 1.6663947281505385, "learning_rate": 2.512262450161087e-06, "loss": 0.9743, "step": 2862 }, { "epoch": 2.318218623481781, "grad_norm": 1.6909579843290505, "learning_rate": 2.5065680123233737e-06, "loss": 0.9382, "step": 2863 }, { "epoch": 2.3190283400809717, "grad_norm": 1.6847267233784076, "learning_rate": 2.5008791106086015e-06, "loss": 0.9839, "step": 2864 }, { "epoch": 2.3198380566801617, "grad_norm": 1.7283759037557056, "learning_rate": 2.4951957492197097e-06, "loss": 0.9233, "step": 2865 }, { "epoch": 2.3206477732793522, "grad_norm": 1.6108950115636136, "learning_rate": 2.4895179323555517e-06, "loss": 0.9222, "step": 2866 }, { "epoch": 2.3214574898785427, "grad_norm": 1.759239040012402, "learning_rate": 2.483845664210879e-06, "loss": 1.0203, "step": 2867 }, { "epoch": 2.3222672064777328, "grad_norm": 1.6569900658909889, "learning_rate": 2.478178948976342e-06, "loss": 1.0225, "step": 2868 }, { "epoch": 2.3230769230769233, "grad_norm": 1.6165160420682663, "learning_rate": 2.4725177908384936e-06, "loss": 0.9963, "step": 2869 }, { "epoch": 2.3238866396761133, "grad_norm": 1.6910680194151018, "learning_rate": 2.4668621939797745e-06, "loss": 0.9845, "step": 2870 }, { "epoch": 2.324696356275304, "grad_norm": 1.7564925975810206, "learning_rate": 2.461212162578527e-06, "loss": 1.0053, "step": 2871 }, { "epoch": 2.325506072874494, "grad_norm": 1.6736697804350984, "learning_rate": 2.455567700808974e-06, "loss": 1.0172, "step": 2872 }, { "epoch": 2.3263157894736843, "grad_norm": 1.6235333292980543, "learning_rate": 2.4499288128412214e-06, "loss": 1.0632, "step": 2873 }, { "epoch": 2.3271255060728744, "grad_norm": 1.7363325081342043, "learning_rate": 2.4442955028412672e-06, "loss": 0.9349, "step": 2874 }, { "epoch": 2.327935222672065, "grad_norm": 1.6683416186196407, "learning_rate": 2.438667774970981e-06, "loss": 0.9533, "step": 2875 }, { "epoch": 2.328744939271255, "grad_norm": 1.7103163002011812, "learning_rate": 2.433045633388106e-06, "loss": 0.9585, "step": 2876 }, { "epoch": 2.3295546558704454, "grad_norm": 1.6372160886662246, "learning_rate": 2.4274290822462656e-06, "loss": 0.9704, "step": 2877 }, { "epoch": 2.3303643724696355, "grad_norm": 1.7503426443559542, "learning_rate": 2.4218181256949434e-06, "loss": 1.0724, "step": 2878 }, { "epoch": 2.331174089068826, "grad_norm": 1.6224949379470495, "learning_rate": 2.4162127678795045e-06, "loss": 0.9353, "step": 2879 }, { "epoch": 2.331983805668016, "grad_norm": 1.7179059296387311, "learning_rate": 2.4106130129411608e-06, "loss": 0.9951, "step": 2880 }, { "epoch": 2.3327935222672065, "grad_norm": 1.6820449772558062, "learning_rate": 2.405018865016999e-06, "loss": 1.0007, "step": 2881 }, { "epoch": 2.333603238866397, "grad_norm": 1.640951194648763, "learning_rate": 2.3994303282399544e-06, "loss": 0.9456, "step": 2882 }, { "epoch": 2.334412955465587, "grad_norm": 1.6725791781090846, "learning_rate": 2.3938474067388208e-06, "loss": 0.9674, "step": 2883 }, { "epoch": 2.3352226720647775, "grad_norm": 1.8442889599518857, "learning_rate": 2.388270104638242e-06, "loss": 0.9674, "step": 2884 }, { "epoch": 2.3360323886639676, "grad_norm": 1.676925947550545, "learning_rate": 2.3826984260587084e-06, "loss": 0.9523, "step": 2885 }, { "epoch": 2.336842105263158, "grad_norm": 1.717243161461183, "learning_rate": 2.3771323751165563e-06, "loss": 0.9958, "step": 2886 }, { "epoch": 2.337651821862348, "grad_norm": 1.7024062048223407, "learning_rate": 2.3715719559239727e-06, "loss": 1.0246, "step": 2887 }, { "epoch": 2.3384615384615386, "grad_norm": 1.6526087755299945, "learning_rate": 2.3660171725889703e-06, "loss": 0.9659, "step": 2888 }, { "epoch": 2.3392712550607286, "grad_norm": 1.6804957380379042, "learning_rate": 2.360468029215409e-06, "loss": 1.0254, "step": 2889 }, { "epoch": 2.340080971659919, "grad_norm": 1.6575687386773823, "learning_rate": 2.354924529902978e-06, "loss": 1.0599, "step": 2890 }, { "epoch": 2.340890688259109, "grad_norm": 1.6411725270506965, "learning_rate": 2.349386678747194e-06, "loss": 1.0325, "step": 2891 }, { "epoch": 2.3417004048582997, "grad_norm": 1.6593350291881466, "learning_rate": 2.343854479839405e-06, "loss": 0.9796, "step": 2892 }, { "epoch": 2.3425101214574897, "grad_norm": 1.6411951141419265, "learning_rate": 2.3383279372667787e-06, "loss": 0.9762, "step": 2893 }, { "epoch": 2.34331983805668, "grad_norm": 1.6675022296883175, "learning_rate": 2.332807055112306e-06, "loss": 0.9739, "step": 2894 }, { "epoch": 2.3441295546558703, "grad_norm": 1.6845312790330145, "learning_rate": 2.327291837454799e-06, "loss": 0.9776, "step": 2895 }, { "epoch": 2.3449392712550607, "grad_norm": 1.6631177985983452, "learning_rate": 2.3217822883688855e-06, "loss": 0.9856, "step": 2896 }, { "epoch": 2.3457489878542512, "grad_norm": 1.7002279127037514, "learning_rate": 2.316278411924998e-06, "loss": 1.004, "step": 2897 }, { "epoch": 2.3465587044534413, "grad_norm": 1.7044705523610206, "learning_rate": 2.310780212189384e-06, "loss": 0.9071, "step": 2898 }, { "epoch": 2.3473684210526318, "grad_norm": 1.7588763360318254, "learning_rate": 2.3052876932240943e-06, "loss": 0.939, "step": 2899 }, { "epoch": 2.348178137651822, "grad_norm": 1.6865532061555057, "learning_rate": 2.2998008590869838e-06, "loss": 0.9706, "step": 2900 }, { "epoch": 2.3489878542510123, "grad_norm": 1.8099761327981618, "learning_rate": 2.294319713831705e-06, "loss": 0.9509, "step": 2901 }, { "epoch": 2.3497975708502024, "grad_norm": 1.655965409511294, "learning_rate": 2.2888442615077145e-06, "loss": 0.9411, "step": 2902 }, { "epoch": 2.350607287449393, "grad_norm": 1.661395531775376, "learning_rate": 2.2833745061602587e-06, "loss": 1.011, "step": 2903 }, { "epoch": 2.351417004048583, "grad_norm": 1.6324577056900818, "learning_rate": 2.277910451830373e-06, "loss": 1.0316, "step": 2904 }, { "epoch": 2.3522267206477734, "grad_norm": 1.6836043670245235, "learning_rate": 2.2724521025548828e-06, "loss": 1.0146, "step": 2905 }, { "epoch": 2.3530364372469634, "grad_norm": 1.7435287774310333, "learning_rate": 2.2669994623664006e-06, "loss": 0.9744, "step": 2906 }, { "epoch": 2.353846153846154, "grad_norm": 1.7490120988400564, "learning_rate": 2.2615525352933156e-06, "loss": 0.9675, "step": 2907 }, { "epoch": 2.354655870445344, "grad_norm": 1.6560540854955874, "learning_rate": 2.256111325359801e-06, "loss": 0.9834, "step": 2908 }, { "epoch": 2.3554655870445345, "grad_norm": 1.6937685129731164, "learning_rate": 2.250675836585803e-06, "loss": 1.0068, "step": 2909 }, { "epoch": 2.3562753036437245, "grad_norm": 1.6622436941899015, "learning_rate": 2.245246072987045e-06, "loss": 0.9918, "step": 2910 }, { "epoch": 2.357085020242915, "grad_norm": 1.7854484056765068, "learning_rate": 2.2398220385750213e-06, "loss": 1.0529, "step": 2911 }, { "epoch": 2.3578947368421055, "grad_norm": 1.6881598706045204, "learning_rate": 2.234403737356987e-06, "loss": 0.9479, "step": 2912 }, { "epoch": 2.3587044534412955, "grad_norm": 1.7582794873521799, "learning_rate": 2.228991173335967e-06, "loss": 1.0117, "step": 2913 }, { "epoch": 2.3595141700404856, "grad_norm": 1.6302584416633792, "learning_rate": 2.2235843505107447e-06, "loss": 0.9808, "step": 2914 }, { "epoch": 2.360323886639676, "grad_norm": 1.741990356965201, "learning_rate": 2.2181832728758635e-06, "loss": 0.9415, "step": 2915 }, { "epoch": 2.3611336032388666, "grad_norm": 1.7084803772752033, "learning_rate": 2.2127879444216184e-06, "loss": 1.0223, "step": 2916 }, { "epoch": 2.3619433198380566, "grad_norm": 1.680568814995532, "learning_rate": 2.2073983691340673e-06, "loss": 0.9924, "step": 2917 }, { "epoch": 2.362753036437247, "grad_norm": 1.6884873524658965, "learning_rate": 2.202014550995003e-06, "loss": 0.946, "step": 2918 }, { "epoch": 2.363562753036437, "grad_norm": 1.6973802116265153, "learning_rate": 2.1966364939819797e-06, "loss": 0.9663, "step": 2919 }, { "epoch": 2.3643724696356276, "grad_norm": 1.6489385262541483, "learning_rate": 2.191264202068286e-06, "loss": 0.985, "step": 2920 }, { "epoch": 2.3651821862348177, "grad_norm": 1.6792446355838644, "learning_rate": 2.185897679222951e-06, "loss": 0.9868, "step": 2921 }, { "epoch": 2.365991902834008, "grad_norm": 1.6804030489521116, "learning_rate": 2.180536929410748e-06, "loss": 1.0269, "step": 2922 }, { "epoch": 2.3668016194331982, "grad_norm": 1.6589791003331682, "learning_rate": 2.1751819565921774e-06, "loss": 0.9795, "step": 2923 }, { "epoch": 2.3676113360323887, "grad_norm": 1.6922330016889355, "learning_rate": 2.169832764723475e-06, "loss": 0.9302, "step": 2924 }, { "epoch": 2.3684210526315788, "grad_norm": 1.617132099924979, "learning_rate": 2.1644893577566118e-06, "loss": 0.9114, "step": 2925 }, { "epoch": 2.3692307692307693, "grad_norm": 1.683358875672709, "learning_rate": 2.1591517396392735e-06, "loss": 1.0196, "step": 2926 }, { "epoch": 2.3700404858299597, "grad_norm": 1.662270804025337, "learning_rate": 2.15381991431488e-06, "loss": 0.9346, "step": 2927 }, { "epoch": 2.37085020242915, "grad_norm": 1.5560767626917962, "learning_rate": 2.1484938857225636e-06, "loss": 1.0323, "step": 2928 }, { "epoch": 2.37165991902834, "grad_norm": 1.6168560582415574, "learning_rate": 2.1431736577971763e-06, "loss": 1.0005, "step": 2929 }, { "epoch": 2.3724696356275303, "grad_norm": 1.6890995844263526, "learning_rate": 2.137859234469286e-06, "loss": 1.0447, "step": 2930 }, { "epoch": 2.373279352226721, "grad_norm": 1.6394854020625869, "learning_rate": 2.132550619665168e-06, "loss": 0.9983, "step": 2931 }, { "epoch": 2.374089068825911, "grad_norm": 1.6603872171093927, "learning_rate": 2.1272478173068147e-06, "loss": 0.9744, "step": 2932 }, { "epoch": 2.3748987854251014, "grad_norm": 1.6252945957266938, "learning_rate": 2.1219508313119163e-06, "loss": 0.947, "step": 2933 }, { "epoch": 2.3757085020242914, "grad_norm": 1.7572254760843087, "learning_rate": 2.1166596655938676e-06, "loss": 0.9481, "step": 2934 }, { "epoch": 2.376518218623482, "grad_norm": 1.680859166288277, "learning_rate": 2.1113743240617668e-06, "loss": 0.9892, "step": 2935 }, { "epoch": 2.377327935222672, "grad_norm": 1.7086071225889468, "learning_rate": 2.1060948106204072e-06, "loss": 1.0586, "step": 2936 }, { "epoch": 2.3781376518218624, "grad_norm": 1.6757697718883062, "learning_rate": 2.100821129170274e-06, "loss": 0.9855, "step": 2937 }, { "epoch": 2.3789473684210525, "grad_norm": 1.6316446681345318, "learning_rate": 2.0955532836075445e-06, "loss": 1.0192, "step": 2938 }, { "epoch": 2.379757085020243, "grad_norm": 1.6928291171366985, "learning_rate": 2.090291277824089e-06, "loss": 0.974, "step": 2939 }, { "epoch": 2.380566801619433, "grad_norm": 1.6550872597895547, "learning_rate": 2.08503511570746e-06, "loss": 1.069, "step": 2940 }, { "epoch": 2.3813765182186235, "grad_norm": 1.6475664661450247, "learning_rate": 2.0797848011408906e-06, "loss": 0.9001, "step": 2941 }, { "epoch": 2.382186234817814, "grad_norm": 1.6992711495500126, "learning_rate": 2.0745403380032947e-06, "loss": 0.9789, "step": 2942 }, { "epoch": 2.382995951417004, "grad_norm": 1.7621872944691108, "learning_rate": 2.0693017301692698e-06, "loss": 1.0104, "step": 2943 }, { "epoch": 2.383805668016194, "grad_norm": 1.6841266297477848, "learning_rate": 2.0640689815090777e-06, "loss": 0.9808, "step": 2944 }, { "epoch": 2.3846153846153846, "grad_norm": 1.7074153273587775, "learning_rate": 2.058842095888658e-06, "loss": 0.9962, "step": 2945 }, { "epoch": 2.385425101214575, "grad_norm": 1.6367366387306832, "learning_rate": 2.053621077169613e-06, "loss": 0.9874, "step": 2946 }, { "epoch": 2.386234817813765, "grad_norm": 1.6204049507354517, "learning_rate": 2.0484059292092196e-06, "loss": 1.047, "step": 2947 }, { "epoch": 2.3870445344129556, "grad_norm": 1.6516699596226871, "learning_rate": 2.0431966558604097e-06, "loss": 1.0036, "step": 2948 }, { "epoch": 2.3878542510121457, "grad_norm": 1.630232092481421, "learning_rate": 2.0379932609717767e-06, "loss": 1.016, "step": 2949 }, { "epoch": 2.388663967611336, "grad_norm": 1.6118486578131621, "learning_rate": 2.0327957483875693e-06, "loss": 1.0195, "step": 2950 }, { "epoch": 2.389473684210526, "grad_norm": 1.679620212591038, "learning_rate": 2.0276041219476985e-06, "loss": 1.0092, "step": 2951 }, { "epoch": 2.3902834008097167, "grad_norm": 1.6192037143814413, "learning_rate": 2.0224183854877165e-06, "loss": 1.0289, "step": 2952 }, { "epoch": 2.3910931174089067, "grad_norm": 1.6913936221204569, "learning_rate": 2.0172385428388288e-06, "loss": 0.9022, "step": 2953 }, { "epoch": 2.3919028340080972, "grad_norm": 1.712872458206929, "learning_rate": 2.0120645978278887e-06, "loss": 0.9932, "step": 2954 }, { "epoch": 2.3927125506072873, "grad_norm": 1.6358216619621548, "learning_rate": 2.006896554277388e-06, "loss": 0.9879, "step": 2955 }, { "epoch": 2.3935222672064778, "grad_norm": 1.7962624248609893, "learning_rate": 2.0017344160054597e-06, "loss": 0.9944, "step": 2956 }, { "epoch": 2.3943319838056683, "grad_norm": 1.7097327349314781, "learning_rate": 1.996578186825876e-06, "loss": 0.9837, "step": 2957 }, { "epoch": 2.3951417004048583, "grad_norm": 1.7984710742922152, "learning_rate": 1.991427870548038e-06, "loss": 1.0043, "step": 2958 }, { "epoch": 2.3959514170040483, "grad_norm": 1.7633179302331299, "learning_rate": 1.9862834709769897e-06, "loss": 0.9943, "step": 2959 }, { "epoch": 2.396761133603239, "grad_norm": 1.6081821375494785, "learning_rate": 1.981144991913392e-06, "loss": 1.0207, "step": 2960 }, { "epoch": 2.3975708502024293, "grad_norm": 1.6574008938350164, "learning_rate": 1.976012437153535e-06, "loss": 1.0171, "step": 2961 }, { "epoch": 2.3983805668016194, "grad_norm": 1.6854338035743857, "learning_rate": 1.970885810489337e-06, "loss": 0.9726, "step": 2962 }, { "epoch": 2.39919028340081, "grad_norm": 1.6012012823118715, "learning_rate": 1.9657651157083322e-06, "loss": 0.9743, "step": 2963 }, { "epoch": 2.4, "grad_norm": 3.279317777475881, "learning_rate": 1.960650356593672e-06, "loss": 1.0349, "step": 2964 }, { "epoch": 2.4008097165991904, "grad_norm": 1.6559506921615998, "learning_rate": 1.9555415369241228e-06, "loss": 1.0237, "step": 2965 }, { "epoch": 2.4016194331983804, "grad_norm": 1.696563890993013, "learning_rate": 1.9504386604740632e-06, "loss": 1.034, "step": 2966 }, { "epoch": 2.402429149797571, "grad_norm": 1.6229320190285514, "learning_rate": 1.9453417310134857e-06, "loss": 0.9817, "step": 2967 }, { "epoch": 2.403238866396761, "grad_norm": 1.717120363215466, "learning_rate": 1.9402507523079794e-06, "loss": 0.9337, "step": 2968 }, { "epoch": 2.4040485829959515, "grad_norm": 1.8037959085157829, "learning_rate": 1.9351657281187484e-06, "loss": 0.9729, "step": 2969 }, { "epoch": 2.4048582995951415, "grad_norm": 1.6684409182222344, "learning_rate": 1.930086662202589e-06, "loss": 1.0042, "step": 2970 }, { "epoch": 2.405668016194332, "grad_norm": 1.698807530710582, "learning_rate": 1.9250135583119e-06, "loss": 0.9531, "step": 2971 }, { "epoch": 2.4064777327935225, "grad_norm": 1.6713412841686315, "learning_rate": 1.9199464201946717e-06, "loss": 0.9905, "step": 2972 }, { "epoch": 2.4072874493927126, "grad_norm": 1.6587820441376417, "learning_rate": 1.9148852515944893e-06, "loss": 0.9788, "step": 2973 }, { "epoch": 2.4080971659919026, "grad_norm": 1.6459864544000329, "learning_rate": 1.9098300562505266e-06, "loss": 0.9819, "step": 2974 }, { "epoch": 2.408906882591093, "grad_norm": 1.6811137602745856, "learning_rate": 1.9047808378975485e-06, "loss": 1.0037, "step": 2975 }, { "epoch": 2.4097165991902836, "grad_norm": 1.6452093088766784, "learning_rate": 1.8997376002658974e-06, "loss": 0.9404, "step": 2976 }, { "epoch": 2.4105263157894736, "grad_norm": 1.7009168710382707, "learning_rate": 1.894700347081505e-06, "loss": 0.9519, "step": 2977 }, { "epoch": 2.411336032388664, "grad_norm": 1.6634076517052085, "learning_rate": 1.8896690820658758e-06, "loss": 1.0487, "step": 2978 }, { "epoch": 2.412145748987854, "grad_norm": 1.6737820388111349, "learning_rate": 1.8846438089360896e-06, "loss": 1.0075, "step": 2979 }, { "epoch": 2.4129554655870447, "grad_norm": 1.7249507058482862, "learning_rate": 1.8796245314048046e-06, "loss": 1.0098, "step": 2980 }, { "epoch": 2.4137651821862347, "grad_norm": 1.7251783475658493, "learning_rate": 1.874611253180244e-06, "loss": 0.9775, "step": 2981 }, { "epoch": 2.414574898785425, "grad_norm": 1.6136959740020433, "learning_rate": 1.8696039779662012e-06, "loss": 0.9962, "step": 2982 }, { "epoch": 2.4153846153846152, "grad_norm": 1.6425732401790396, "learning_rate": 1.8646027094620345e-06, "loss": 0.9816, "step": 2983 }, { "epoch": 2.4161943319838057, "grad_norm": 1.8229989217932228, "learning_rate": 1.8596074513626694e-06, "loss": 0.9151, "step": 2984 }, { "epoch": 2.417004048582996, "grad_norm": 1.7061703390123246, "learning_rate": 1.8546182073585828e-06, "loss": 1.0206, "step": 2985 }, { "epoch": 2.4178137651821863, "grad_norm": 1.6590284413749856, "learning_rate": 1.8496349811358116e-06, "loss": 1.0051, "step": 2986 }, { "epoch": 2.4186234817813768, "grad_norm": 1.7189358586751335, "learning_rate": 1.8446577763759478e-06, "loss": 0.8961, "step": 2987 }, { "epoch": 2.419433198380567, "grad_norm": 1.7082228107534516, "learning_rate": 1.8396865967561317e-06, "loss": 1.0398, "step": 2988 }, { "epoch": 2.420242914979757, "grad_norm": 1.6578774454136742, "learning_rate": 1.8347214459490548e-06, "loss": 0.9466, "step": 2989 }, { "epoch": 2.4210526315789473, "grad_norm": 1.7186094993741328, "learning_rate": 1.829762327622958e-06, "loss": 1.0007, "step": 2990 }, { "epoch": 2.421862348178138, "grad_norm": 1.7016828071558703, "learning_rate": 1.8248092454416166e-06, "loss": 1.0301, "step": 2991 }, { "epoch": 2.422672064777328, "grad_norm": 1.7081136775369437, "learning_rate": 1.8198622030643564e-06, "loss": 0.9879, "step": 2992 }, { "epoch": 2.4234817813765184, "grad_norm": 1.7205639728799287, "learning_rate": 1.814921204146033e-06, "loss": 0.9609, "step": 2993 }, { "epoch": 2.4242914979757084, "grad_norm": 1.6612183377575551, "learning_rate": 1.8099862523370415e-06, "loss": 0.9652, "step": 2994 }, { "epoch": 2.425101214574899, "grad_norm": 1.674361206817772, "learning_rate": 1.805057351283307e-06, "loss": 0.9703, "step": 2995 }, { "epoch": 2.425910931174089, "grad_norm": 1.5856102512600692, "learning_rate": 1.8001345046262864e-06, "loss": 0.9733, "step": 2996 }, { "epoch": 2.4267206477732794, "grad_norm": 1.63966324613164, "learning_rate": 1.7952177160029594e-06, "loss": 1.0133, "step": 2997 }, { "epoch": 2.4275303643724695, "grad_norm": 1.6634576125787222, "learning_rate": 1.7903069890458347e-06, "loss": 0.9702, "step": 2998 }, { "epoch": 2.42834008097166, "grad_norm": 1.7282172413441768, "learning_rate": 1.7854023273829467e-06, "loss": 0.8979, "step": 2999 }, { "epoch": 2.42914979757085, "grad_norm": 1.6699414779639086, "learning_rate": 1.7805037346378384e-06, "loss": 0.9921, "step": 3000 }, { "epoch": 2.4299595141700405, "grad_norm": 1.6478445275579376, "learning_rate": 1.7756112144295745e-06, "loss": 1.0098, "step": 3001 }, { "epoch": 2.430769230769231, "grad_norm": 1.7177993329143486, "learning_rate": 1.7707247703727325e-06, "loss": 0.9902, "step": 3002 }, { "epoch": 2.431578947368421, "grad_norm": 1.6717118318062545, "learning_rate": 1.7658444060774028e-06, "loss": 0.9317, "step": 3003 }, { "epoch": 2.432388663967611, "grad_norm": 1.656893954893965, "learning_rate": 1.7609701251491796e-06, "loss": 1.0093, "step": 3004 }, { "epoch": 2.4331983805668016, "grad_norm": 1.672458797771242, "learning_rate": 1.756101931189169e-06, "loss": 0.994, "step": 3005 }, { "epoch": 2.434008097165992, "grad_norm": 1.7979249229953589, "learning_rate": 1.7512398277939735e-06, "loss": 0.9501, "step": 3006 }, { "epoch": 2.434817813765182, "grad_norm": 1.7675347987379522, "learning_rate": 1.7463838185557024e-06, "loss": 0.9573, "step": 3007 }, { "epoch": 2.4356275303643726, "grad_norm": 1.6829178005541199, "learning_rate": 1.7415339070619586e-06, "loss": 0.9926, "step": 3008 }, { "epoch": 2.4364372469635627, "grad_norm": 1.7579588916573732, "learning_rate": 1.73669009689584e-06, "loss": 0.9126, "step": 3009 }, { "epoch": 2.437246963562753, "grad_norm": 1.739729410392981, "learning_rate": 1.7318523916359376e-06, "loss": 0.9609, "step": 3010 }, { "epoch": 2.438056680161943, "grad_norm": 1.6681245792489996, "learning_rate": 1.7270207948563323e-06, "loss": 1.0533, "step": 3011 }, { "epoch": 2.4388663967611337, "grad_norm": 1.696281337597073, "learning_rate": 1.7221953101265888e-06, "loss": 0.9251, "step": 3012 }, { "epoch": 2.4396761133603238, "grad_norm": 1.6189610965490386, "learning_rate": 1.7173759410117663e-06, "loss": 0.9526, "step": 3013 }, { "epoch": 2.4404858299595142, "grad_norm": 1.724367632228813, "learning_rate": 1.7125626910723915e-06, "loss": 0.9634, "step": 3014 }, { "epoch": 2.4412955465587043, "grad_norm": 1.6773880195931092, "learning_rate": 1.7077555638644838e-06, "loss": 0.9881, "step": 3015 }, { "epoch": 2.442105263157895, "grad_norm": 1.7197370339372542, "learning_rate": 1.7029545629395306e-06, "loss": 0.9735, "step": 3016 }, { "epoch": 2.4429149797570853, "grad_norm": 1.7700548528489848, "learning_rate": 1.6981596918444953e-06, "loss": 1.0035, "step": 3017 }, { "epoch": 2.4437246963562753, "grad_norm": 1.7118799526231616, "learning_rate": 1.6933709541218125e-06, "loss": 1.0057, "step": 3018 }, { "epoch": 2.4445344129554654, "grad_norm": 1.6696288935408505, "learning_rate": 1.6885883533093839e-06, "loss": 1.0043, "step": 3019 }, { "epoch": 2.445344129554656, "grad_norm": 1.6491620840254673, "learning_rate": 1.6838118929405856e-06, "loss": 0.9407, "step": 3020 }, { "epoch": 2.4461538461538463, "grad_norm": 1.659428974002118, "learning_rate": 1.6790415765442458e-06, "loss": 1.0047, "step": 3021 }, { "epoch": 2.4469635627530364, "grad_norm": 1.6354681687186046, "learning_rate": 1.6742774076446578e-06, "loss": 0.986, "step": 3022 }, { "epoch": 2.447773279352227, "grad_norm": 1.6775886642207607, "learning_rate": 1.6695193897615781e-06, "loss": 0.9518, "step": 3023 }, { "epoch": 2.448582995951417, "grad_norm": 1.6648950365114745, "learning_rate": 1.6647675264102126e-06, "loss": 0.9382, "step": 3024 }, { "epoch": 2.4493927125506074, "grad_norm": 1.646166538722141, "learning_rate": 1.660021821101222e-06, "loss": 0.998, "step": 3025 }, { "epoch": 2.4502024291497975, "grad_norm": 1.6268027966670777, "learning_rate": 1.6552822773407163e-06, "loss": 1.0009, "step": 3026 }, { "epoch": 2.451012145748988, "grad_norm": 1.6955695087440212, "learning_rate": 1.6505488986302586e-06, "loss": 1.0315, "step": 3027 }, { "epoch": 2.451821862348178, "grad_norm": 1.6851857353319628, "learning_rate": 1.645821688466851e-06, "loss": 0.9873, "step": 3028 }, { "epoch": 2.4526315789473685, "grad_norm": 1.6305706879900275, "learning_rate": 1.6411006503429428e-06, "loss": 0.912, "step": 3029 }, { "epoch": 2.4534412955465585, "grad_norm": 1.6588245540634141, "learning_rate": 1.6363857877464161e-06, "loss": 1.0123, "step": 3030 }, { "epoch": 2.454251012145749, "grad_norm": 1.6911002514254574, "learning_rate": 1.6316771041606027e-06, "loss": 0.9738, "step": 3031 }, { "epoch": 2.455060728744939, "grad_norm": 1.7042305673840112, "learning_rate": 1.6269746030642607e-06, "loss": 0.9485, "step": 3032 }, { "epoch": 2.4558704453441296, "grad_norm": 1.620560078412615, "learning_rate": 1.6222782879315802e-06, "loss": 0.9963, "step": 3033 }, { "epoch": 2.4566801619433196, "grad_norm": 1.6956591304380106, "learning_rate": 1.6175881622321832e-06, "loss": 0.9385, "step": 3034 }, { "epoch": 2.45748987854251, "grad_norm": 1.710112971818022, "learning_rate": 1.6129042294311227e-06, "loss": 0.9377, "step": 3035 }, { "epoch": 2.4582995951417006, "grad_norm": 1.7103868534898998, "learning_rate": 1.6082264929888702e-06, "loss": 0.997, "step": 3036 }, { "epoch": 2.4591093117408906, "grad_norm": 1.5708740432232011, "learning_rate": 1.603554956361324e-06, "loss": 0.9684, "step": 3037 }, { "epoch": 2.459919028340081, "grad_norm": 1.7023113258556464, "learning_rate": 1.5988896229997952e-06, "loss": 0.9485, "step": 3038 }, { "epoch": 2.460728744939271, "grad_norm": 1.709049003849265, "learning_rate": 1.5942304963510236e-06, "loss": 0.9504, "step": 3039 }, { "epoch": 2.4615384615384617, "grad_norm": 1.7137018057495825, "learning_rate": 1.5895775798571523e-06, "loss": 0.962, "step": 3040 }, { "epoch": 2.4623481781376517, "grad_norm": 1.6578218057308711, "learning_rate": 1.5849308769557393e-06, "loss": 0.9717, "step": 3041 }, { "epoch": 2.463157894736842, "grad_norm": 1.7381084856427103, "learning_rate": 1.5802903910797584e-06, "loss": 0.9704, "step": 3042 }, { "epoch": 2.4639676113360323, "grad_norm": 1.7855052345593487, "learning_rate": 1.575656125657583e-06, "loss": 0.9614, "step": 3043 }, { "epoch": 2.4647773279352228, "grad_norm": 1.6487843421840669, "learning_rate": 1.5710280841129932e-06, "loss": 1.0039, "step": 3044 }, { "epoch": 2.465587044534413, "grad_norm": 1.6882419309228567, "learning_rate": 1.5664062698651706e-06, "loss": 0.9755, "step": 3045 }, { "epoch": 2.4663967611336033, "grad_norm": 1.6447631511348773, "learning_rate": 1.5617906863286936e-06, "loss": 1.026, "step": 3046 }, { "epoch": 2.4672064777327933, "grad_norm": 1.7529100807012996, "learning_rate": 1.5571813369135457e-06, "loss": 0.9185, "step": 3047 }, { "epoch": 2.468016194331984, "grad_norm": 1.7744701731060566, "learning_rate": 1.5525782250250953e-06, "loss": 0.9455, "step": 3048 }, { "epoch": 2.468825910931174, "grad_norm": 1.7939853067544942, "learning_rate": 1.5479813540641064e-06, "loss": 1.0429, "step": 3049 }, { "epoch": 2.4696356275303644, "grad_norm": 1.8049689554151258, "learning_rate": 1.5433907274267357e-06, "loss": 0.9599, "step": 3050 }, { "epoch": 2.470445344129555, "grad_norm": 1.7257816506826475, "learning_rate": 1.538806348504519e-06, "loss": 1.0007, "step": 3051 }, { "epoch": 2.471255060728745, "grad_norm": 1.6882725120061068, "learning_rate": 1.534228220684384e-06, "loss": 0.9825, "step": 3052 }, { "epoch": 2.4720647773279354, "grad_norm": 1.5881294637631733, "learning_rate": 1.5296563473486325e-06, "loss": 0.9745, "step": 3053 }, { "epoch": 2.4728744939271254, "grad_norm": 1.730530663215343, "learning_rate": 1.525090731874951e-06, "loss": 0.9413, "step": 3054 }, { "epoch": 2.473684210526316, "grad_norm": 1.669751163835487, "learning_rate": 1.5205313776364028e-06, "loss": 0.9574, "step": 3055 }, { "epoch": 2.474493927125506, "grad_norm": 1.6752509320040634, "learning_rate": 1.5159782880014207e-06, "loss": 1.0109, "step": 3056 }, { "epoch": 2.4753036437246965, "grad_norm": 1.6347253102331993, "learning_rate": 1.511431466333817e-06, "loss": 0.9337, "step": 3057 }, { "epoch": 2.4761133603238865, "grad_norm": 1.6962317193949117, "learning_rate": 1.506890915992766e-06, "loss": 1.0413, "step": 3058 }, { "epoch": 2.476923076923077, "grad_norm": 1.6430785203789686, "learning_rate": 1.5023566403328105e-06, "loss": 0.9546, "step": 3059 }, { "epoch": 2.477732793522267, "grad_norm": 1.6664785121693346, "learning_rate": 1.4978286427038602e-06, "loss": 0.9703, "step": 3060 }, { "epoch": 2.4785425101214575, "grad_norm": 1.614002773301002, "learning_rate": 1.4933069264511834e-06, "loss": 1.029, "step": 3061 }, { "epoch": 2.4793522267206476, "grad_norm": 1.6894913124402384, "learning_rate": 1.488791494915408e-06, "loss": 0.8944, "step": 3062 }, { "epoch": 2.480161943319838, "grad_norm": 1.5775586891733708, "learning_rate": 1.4842823514325244e-06, "loss": 0.9362, "step": 3063 }, { "epoch": 2.480971659919028, "grad_norm": 1.645201425602457, "learning_rate": 1.4797794993338676e-06, "loss": 0.9915, "step": 3064 }, { "epoch": 2.4817813765182186, "grad_norm": 1.6765332328829647, "learning_rate": 1.4752829419461357e-06, "loss": 1.0843, "step": 3065 }, { "epoch": 2.482591093117409, "grad_norm": 1.718267617032678, "learning_rate": 1.4707926825913676e-06, "loss": 0.9818, "step": 3066 }, { "epoch": 2.483400809716599, "grad_norm": 1.6812421248362865, "learning_rate": 1.4663087245869512e-06, "loss": 0.9961, "step": 3067 }, { "epoch": 2.4842105263157896, "grad_norm": 1.6802970912002102, "learning_rate": 1.4618310712456218e-06, "loss": 0.9768, "step": 3068 }, { "epoch": 2.4850202429149797, "grad_norm": 1.7391114674842005, "learning_rate": 1.4573597258754546e-06, "loss": 0.9987, "step": 3069 }, { "epoch": 2.48582995951417, "grad_norm": 1.694339682778352, "learning_rate": 1.4528946917798603e-06, "loss": 0.9906, "step": 3070 }, { "epoch": 2.4866396761133602, "grad_norm": 1.7363838335149249, "learning_rate": 1.448435972257597e-06, "loss": 0.9866, "step": 3071 }, { "epoch": 2.4874493927125507, "grad_norm": 1.6870306102440977, "learning_rate": 1.4439835706027526e-06, "loss": 0.9262, "step": 3072 }, { "epoch": 2.4882591093117408, "grad_norm": 1.6692336890689885, "learning_rate": 1.4395374901047443e-06, "loss": 0.9166, "step": 3073 }, { "epoch": 2.4890688259109313, "grad_norm": 1.6886739622081277, "learning_rate": 1.4350977340483218e-06, "loss": 1.0644, "step": 3074 }, { "epoch": 2.4898785425101213, "grad_norm": 1.6364093438608744, "learning_rate": 1.4306643057135638e-06, "loss": 1.0401, "step": 3075 }, { "epoch": 2.490688259109312, "grad_norm": 1.6954035597211397, "learning_rate": 1.4262372083758714e-06, "loss": 0.981, "step": 3076 }, { "epoch": 2.491497975708502, "grad_norm": 1.6508448957202095, "learning_rate": 1.4218164453059669e-06, "loss": 1.0243, "step": 3077 }, { "epoch": 2.4923076923076923, "grad_norm": 1.7298928655955168, "learning_rate": 1.4174020197699e-06, "loss": 0.9714, "step": 3078 }, { "epoch": 2.4931174089068824, "grad_norm": 1.668595707670543, "learning_rate": 1.4129939350290312e-06, "loss": 1.01, "step": 3079 }, { "epoch": 2.493927125506073, "grad_norm": 1.7117470225490021, "learning_rate": 1.4085921943400416e-06, "loss": 0.961, "step": 3080 }, { "epoch": 2.4947368421052634, "grad_norm": 1.6420783440308226, "learning_rate": 1.404196800954921e-06, "loss": 0.954, "step": 3081 }, { "epoch": 2.4955465587044534, "grad_norm": 1.6312366446037767, "learning_rate": 1.3998077581209712e-06, "loss": 0.9967, "step": 3082 }, { "epoch": 2.496356275303644, "grad_norm": 1.7256120345016341, "learning_rate": 1.3954250690808036e-06, "loss": 0.9527, "step": 3083 }, { "epoch": 2.497165991902834, "grad_norm": 1.6410665457677764, "learning_rate": 1.3910487370723347e-06, "loss": 1.0166, "step": 3084 }, { "epoch": 2.4979757085020244, "grad_norm": 1.683999810079675, "learning_rate": 1.3866787653287804e-06, "loss": 0.8987, "step": 3085 }, { "epoch": 2.4987854251012145, "grad_norm": 1.7049093242978388, "learning_rate": 1.3823151570786653e-06, "loss": 0.9651, "step": 3086 }, { "epoch": 2.499595141700405, "grad_norm": 1.7012673477415141, "learning_rate": 1.3779579155458089e-06, "loss": 0.9147, "step": 3087 }, { "epoch": 2.500404858299595, "grad_norm": 1.7058323956163846, "learning_rate": 1.3736070439493277e-06, "loss": 0.9477, "step": 3088 }, { "epoch": 2.5012145748987855, "grad_norm": 1.7742651035823065, "learning_rate": 1.369262545503629e-06, "loss": 0.9419, "step": 3089 }, { "epoch": 2.5020242914979756, "grad_norm": 1.5760497645036358, "learning_rate": 1.3649244234184157e-06, "loss": 0.9739, "step": 3090 }, { "epoch": 2.502834008097166, "grad_norm": 1.637791167494247, "learning_rate": 1.3605926808986758e-06, "loss": 0.9618, "step": 3091 }, { "epoch": 2.5036437246963565, "grad_norm": 1.6641954657060896, "learning_rate": 1.3562673211446863e-06, "loss": 0.992, "step": 3092 }, { "epoch": 2.5044534412955466, "grad_norm": 1.687575151546589, "learning_rate": 1.3519483473520124e-06, "loss": 0.946, "step": 3093 }, { "epoch": 2.5052631578947366, "grad_norm": 1.6264510448768228, "learning_rate": 1.3476357627114945e-06, "loss": 1.0184, "step": 3094 }, { "epoch": 2.506072874493927, "grad_norm": 1.630753363552384, "learning_rate": 1.3433295704092586e-06, "loss": 0.9742, "step": 3095 }, { "epoch": 2.5068825910931176, "grad_norm": 1.7544663565446108, "learning_rate": 1.3390297736267033e-06, "loss": 0.9339, "step": 3096 }, { "epoch": 2.5076923076923077, "grad_norm": 1.6250326610562258, "learning_rate": 1.3347363755405064e-06, "loss": 0.9355, "step": 3097 }, { "epoch": 2.5085020242914977, "grad_norm": 1.7027349777509002, "learning_rate": 1.3304493793226135e-06, "loss": 0.9723, "step": 3098 }, { "epoch": 2.509311740890688, "grad_norm": 1.8050244837715836, "learning_rate": 1.3261687881402464e-06, "loss": 0.99, "step": 3099 }, { "epoch": 2.5101214574898787, "grad_norm": 1.7168877357885066, "learning_rate": 1.3218946051558867e-06, "loss": 1.0173, "step": 3100 }, { "epoch": 2.5109311740890687, "grad_norm": 1.7435696786750583, "learning_rate": 1.3176268335272934e-06, "loss": 1.0018, "step": 3101 }, { "epoch": 2.5117408906882592, "grad_norm": 1.5938621762485412, "learning_rate": 1.3133654764074765e-06, "loss": 1.111, "step": 3102 }, { "epoch": 2.5125506072874493, "grad_norm": 1.6757330684573215, "learning_rate": 1.3091105369447166e-06, "loss": 0.9999, "step": 3103 }, { "epoch": 2.5133603238866398, "grad_norm": 1.7130683949058882, "learning_rate": 1.3048620182825478e-06, "loss": 1.0611, "step": 3104 }, { "epoch": 2.51417004048583, "grad_norm": 1.7070694868968392, "learning_rate": 1.3006199235597628e-06, "loss": 0.9069, "step": 3105 }, { "epoch": 2.5149797570850203, "grad_norm": 1.6521586885628545, "learning_rate": 1.2963842559104045e-06, "loss": 1.0559, "step": 3106 }, { "epoch": 2.515789473684211, "grad_norm": 1.6872800067468905, "learning_rate": 1.29215501846377e-06, "loss": 1.0301, "step": 3107 }, { "epoch": 2.516599190283401, "grad_norm": 1.6209272379948023, "learning_rate": 1.2879322143444095e-06, "loss": 0.9728, "step": 3108 }, { "epoch": 2.517408906882591, "grad_norm": 1.6880406144910838, "learning_rate": 1.2837158466721155e-06, "loss": 0.982, "step": 3109 }, { "epoch": 2.5182186234817814, "grad_norm": 1.7156607604203853, "learning_rate": 1.279505918561923e-06, "loss": 1.0017, "step": 3110 }, { "epoch": 2.519028340080972, "grad_norm": 1.6981687300306323, "learning_rate": 1.2753024331241193e-06, "loss": 0.9594, "step": 3111 }, { "epoch": 2.519838056680162, "grad_norm": 1.7067120833158638, "learning_rate": 1.2711053934642225e-06, "loss": 0.9487, "step": 3112 }, { "epoch": 2.520647773279352, "grad_norm": 1.689789662984842, "learning_rate": 1.2669148026829915e-06, "loss": 1.0139, "step": 3113 }, { "epoch": 2.5214574898785425, "grad_norm": 1.6470500284421414, "learning_rate": 1.2627306638764213e-06, "loss": 1.0077, "step": 3114 }, { "epoch": 2.522267206477733, "grad_norm": 1.5826496661345488, "learning_rate": 1.2585529801357377e-06, "loss": 1.0494, "step": 3115 }, { "epoch": 2.523076923076923, "grad_norm": 1.742074564535538, "learning_rate": 1.2543817545474036e-06, "loss": 1.0169, "step": 3116 }, { "epoch": 2.5238866396761135, "grad_norm": 1.7660057671815605, "learning_rate": 1.250216990193105e-06, "loss": 0.9641, "step": 3117 }, { "epoch": 2.5246963562753035, "grad_norm": 1.652494097329047, "learning_rate": 1.246058690149755e-06, "loss": 0.9172, "step": 3118 }, { "epoch": 2.525506072874494, "grad_norm": 1.6461968449457183, "learning_rate": 1.2419068574894943e-06, "loss": 1.0005, "step": 3119 }, { "epoch": 2.526315789473684, "grad_norm": 1.6624721041701427, "learning_rate": 1.2377614952796825e-06, "loss": 0.9542, "step": 3120 }, { "epoch": 2.5271255060728746, "grad_norm": 1.7097282118454058, "learning_rate": 1.2336226065828993e-06, "loss": 0.9636, "step": 3121 }, { "epoch": 2.527935222672065, "grad_norm": 1.7315637749359838, "learning_rate": 1.2294901944569394e-06, "loss": 0.9522, "step": 3122 }, { "epoch": 2.528744939271255, "grad_norm": 1.68876777571607, "learning_rate": 1.22536426195482e-06, "loss": 0.9978, "step": 3123 }, { "epoch": 2.529554655870445, "grad_norm": 1.7501789467762574, "learning_rate": 1.2212448121247643e-06, "loss": 0.9859, "step": 3124 }, { "epoch": 2.5303643724696356, "grad_norm": 1.6915559058052043, "learning_rate": 1.217131848010209e-06, "loss": 0.9187, "step": 3125 }, { "epoch": 2.531174089068826, "grad_norm": 1.7489759930129678, "learning_rate": 1.2130253726497954e-06, "loss": 0.9737, "step": 3126 }, { "epoch": 2.531983805668016, "grad_norm": 1.6670553247236088, "learning_rate": 1.2089253890773789e-06, "loss": 0.8828, "step": 3127 }, { "epoch": 2.532793522267206, "grad_norm": 1.6651695693818536, "learning_rate": 1.204831900322011e-06, "loss": 0.9528, "step": 3128 }, { "epoch": 2.5336032388663967, "grad_norm": 1.6250021225860003, "learning_rate": 1.200744909407946e-06, "loss": 0.9981, "step": 3129 }, { "epoch": 2.534412955465587, "grad_norm": 1.7704071649167492, "learning_rate": 1.196664419354644e-06, "loss": 0.9586, "step": 3130 }, { "epoch": 2.5352226720647772, "grad_norm": 1.732509606852866, "learning_rate": 1.1925904331767545e-06, "loss": 0.9765, "step": 3131 }, { "epoch": 2.5360323886639677, "grad_norm": 1.7830253671280303, "learning_rate": 1.1885229538841259e-06, "loss": 0.9934, "step": 3132 }, { "epoch": 2.536842105263158, "grad_norm": 1.7228909788826947, "learning_rate": 1.1844619844817995e-06, "loss": 0.9926, "step": 3133 }, { "epoch": 2.5376518218623483, "grad_norm": 1.7301843770006853, "learning_rate": 1.1804075279700023e-06, "loss": 0.9108, "step": 3134 }, { "epoch": 2.5384615384615383, "grad_norm": 1.7297103310761568, "learning_rate": 1.176359587344158e-06, "loss": 0.9898, "step": 3135 }, { "epoch": 2.539271255060729, "grad_norm": 1.6692620409074408, "learning_rate": 1.17231816559487e-06, "loss": 0.9766, "step": 3136 }, { "epoch": 2.540080971659919, "grad_norm": 1.7138975481366907, "learning_rate": 1.168283265707927e-06, "loss": 1.0342, "step": 3137 }, { "epoch": 2.5408906882591094, "grad_norm": 1.8684377760837239, "learning_rate": 1.1642548906643003e-06, "loss": 0.917, "step": 3138 }, { "epoch": 2.5417004048582994, "grad_norm": 1.6682784606850616, "learning_rate": 1.160233043440141e-06, "loss": 0.9585, "step": 3139 }, { "epoch": 2.54251012145749, "grad_norm": 1.646760129502982, "learning_rate": 1.1562177270067766e-06, "loss": 0.9977, "step": 3140 }, { "epoch": 2.5433198380566804, "grad_norm": 1.7761301320650948, "learning_rate": 1.1522089443307083e-06, "loss": 1.0222, "step": 3141 }, { "epoch": 2.5441295546558704, "grad_norm": 1.713346287093232, "learning_rate": 1.1482066983736095e-06, "loss": 1.0214, "step": 3142 }, { "epoch": 2.5449392712550605, "grad_norm": 1.739109874170232, "learning_rate": 1.1442109920923317e-06, "loss": 0.9595, "step": 3143 }, { "epoch": 2.545748987854251, "grad_norm": 1.797935688363354, "learning_rate": 1.1402218284388845e-06, "loss": 0.9891, "step": 3144 }, { "epoch": 2.5465587044534415, "grad_norm": 1.7311448213719867, "learning_rate": 1.1362392103604536e-06, "loss": 0.9481, "step": 3145 }, { "epoch": 2.5473684210526315, "grad_norm": 1.6344396564565618, "learning_rate": 1.132263140799381e-06, "loss": 0.9162, "step": 3146 }, { "epoch": 2.548178137651822, "grad_norm": 1.7616052113712068, "learning_rate": 1.1282936226931762e-06, "loss": 0.947, "step": 3147 }, { "epoch": 2.548987854251012, "grad_norm": 1.671278873181062, "learning_rate": 1.124330658974503e-06, "loss": 0.9724, "step": 3148 }, { "epoch": 2.5497975708502025, "grad_norm": 1.624337775639022, "learning_rate": 1.120374252571188e-06, "loss": 0.9633, "step": 3149 }, { "epoch": 2.5506072874493926, "grad_norm": 1.7342053818211673, "learning_rate": 1.1164244064062101e-06, "loss": 0.9351, "step": 3150 }, { "epoch": 2.551417004048583, "grad_norm": 1.7210916677441777, "learning_rate": 1.112481123397704e-06, "loss": 0.936, "step": 3151 }, { "epoch": 2.552226720647773, "grad_norm": 1.7346383956750289, "learning_rate": 1.1085444064589523e-06, "loss": 0.9564, "step": 3152 }, { "epoch": 2.5530364372469636, "grad_norm": 1.7122232488976015, "learning_rate": 1.104614258498392e-06, "loss": 1.0299, "step": 3153 }, { "epoch": 2.5538461538461537, "grad_norm": 1.6894970983661606, "learning_rate": 1.1006906824196006e-06, "loss": 1.052, "step": 3154 }, { "epoch": 2.554655870445344, "grad_norm": 1.7405016755106155, "learning_rate": 1.0967736811213048e-06, "loss": 0.9761, "step": 3155 }, { "epoch": 2.5554655870445346, "grad_norm": 1.6635809223324118, "learning_rate": 1.0928632574973718e-06, "loss": 0.9735, "step": 3156 }, { "epoch": 2.5562753036437247, "grad_norm": 1.5865674939483103, "learning_rate": 1.0889594144368088e-06, "loss": 0.9703, "step": 3157 }, { "epoch": 2.5570850202429147, "grad_norm": 1.7886384841861689, "learning_rate": 1.0850621548237604e-06, "loss": 0.953, "step": 3158 }, { "epoch": 2.557894736842105, "grad_norm": 1.65444440912227, "learning_rate": 1.081171481537513e-06, "loss": 1.0416, "step": 3159 }, { "epoch": 2.5587044534412957, "grad_norm": 1.6010494418210732, "learning_rate": 1.0772873974524833e-06, "loss": 0.978, "step": 3160 }, { "epoch": 2.5595141700404858, "grad_norm": 1.753931513386066, "learning_rate": 1.0734099054382186e-06, "loss": 1.0272, "step": 3161 }, { "epoch": 2.5603238866396762, "grad_norm": 1.7002625406870047, "learning_rate": 1.069539008359397e-06, "loss": 0.9895, "step": 3162 }, { "epoch": 2.5611336032388663, "grad_norm": 1.696885279537435, "learning_rate": 1.0656747090758246e-06, "loss": 1.042, "step": 3163 }, { "epoch": 2.561943319838057, "grad_norm": 1.6725123732842846, "learning_rate": 1.061817010442433e-06, "loss": 0.9873, "step": 3164 }, { "epoch": 2.562753036437247, "grad_norm": 1.744422716922843, "learning_rate": 1.0579659153092759e-06, "loss": 0.9915, "step": 3165 }, { "epoch": 2.5635627530364373, "grad_norm": 1.6982287591882945, "learning_rate": 1.0541214265215328e-06, "loss": 0.993, "step": 3166 }, { "epoch": 2.5643724696356274, "grad_norm": 1.6580163858315626, "learning_rate": 1.0502835469194961e-06, "loss": 1.0379, "step": 3167 }, { "epoch": 2.565182186234818, "grad_norm": 1.6503684640252827, "learning_rate": 1.0464522793385822e-06, "loss": 0.9683, "step": 3168 }, { "epoch": 2.565991902834008, "grad_norm": 1.6600515587726554, "learning_rate": 1.0426276266093172e-06, "loss": 1.0482, "step": 3169 }, { "epoch": 2.5668016194331984, "grad_norm": 1.7091294278437665, "learning_rate": 1.0388095915573427e-06, "loss": 1.0417, "step": 3170 }, { "epoch": 2.567611336032389, "grad_norm": 1.6555794090881855, "learning_rate": 1.034998177003409e-06, "loss": 0.9302, "step": 3171 }, { "epoch": 2.568421052631579, "grad_norm": 1.7961476476441376, "learning_rate": 1.0311933857633772e-06, "loss": 0.967, "step": 3172 }, { "epoch": 2.569230769230769, "grad_norm": 1.6778244264366644, "learning_rate": 1.027395220648213e-06, "loss": 1.0396, "step": 3173 }, { "epoch": 2.5700404858299595, "grad_norm": 1.6859023669000333, "learning_rate": 1.0236036844639897e-06, "loss": 0.9793, "step": 3174 }, { "epoch": 2.57085020242915, "grad_norm": 1.7334612254526203, "learning_rate": 1.0198187800118842e-06, "loss": 0.9165, "step": 3175 }, { "epoch": 2.57165991902834, "grad_norm": 1.726216684645843, "learning_rate": 1.0160405100881699e-06, "loss": 1.0163, "step": 3176 }, { "epoch": 2.5724696356275305, "grad_norm": 1.6800195699709957, "learning_rate": 1.0122688774842194e-06, "loss": 0.9615, "step": 3177 }, { "epoch": 2.5732793522267206, "grad_norm": 1.7496662315340423, "learning_rate": 1.0085038849865025e-06, "loss": 0.9433, "step": 3178 }, { "epoch": 2.574089068825911, "grad_norm": 1.669757711256767, "learning_rate": 1.0047455353765845e-06, "loss": 0.998, "step": 3179 }, { "epoch": 2.574898785425101, "grad_norm": 1.7649644449020268, "learning_rate": 1.0009938314311186e-06, "loss": 0.8943, "step": 3180 }, { "epoch": 2.5757085020242916, "grad_norm": 1.665726033058726, "learning_rate": 9.972487759218551e-07, "loss": 0.9636, "step": 3181 }, { "epoch": 2.5765182186234816, "grad_norm": 1.6714613239053895, "learning_rate": 9.935103716156258e-07, "loss": 0.9908, "step": 3182 }, { "epoch": 2.577327935222672, "grad_norm": 1.780013775907417, "learning_rate": 9.897786212743543e-07, "loss": 0.9168, "step": 3183 }, { "epoch": 2.578137651821862, "grad_norm": 1.6637866101217857, "learning_rate": 9.860535276550443e-07, "loss": 0.9549, "step": 3184 }, { "epoch": 2.5789473684210527, "grad_norm": 1.6430785762195612, "learning_rate": 9.82335093509782e-07, "loss": 0.9702, "step": 3185 }, { "epoch": 2.579757085020243, "grad_norm": 1.7132270223349593, "learning_rate": 9.786233215857354e-07, "loss": 0.9692, "step": 3186 }, { "epoch": 2.580566801619433, "grad_norm": 1.7060451332011317, "learning_rate": 9.74918214625149e-07, "loss": 0.9941, "step": 3187 }, { "epoch": 2.5813765182186232, "grad_norm": 1.6254697719748301, "learning_rate": 9.712197753653418e-07, "loss": 1.0413, "step": 3188 }, { "epoch": 2.5821862348178137, "grad_norm": 1.6890779172726746, "learning_rate": 9.675280065387117e-07, "loss": 0.9546, "step": 3189 }, { "epoch": 2.582995951417004, "grad_norm": 1.6665698649055485, "learning_rate": 9.638429108727232e-07, "loss": 0.9145, "step": 3190 }, { "epoch": 2.5838056680161943, "grad_norm": 1.7455050077057428, "learning_rate": 9.601644910899144e-07, "loss": 0.9475, "step": 3191 }, { "epoch": 2.5846153846153848, "grad_norm": 1.707769647747537, "learning_rate": 9.56492749907889e-07, "loss": 0.9925, "step": 3192 }, { "epoch": 2.585425101214575, "grad_norm": 1.7626431911020435, "learning_rate": 9.528276900393185e-07, "loss": 0.9197, "step": 3193 }, { "epoch": 2.5862348178137653, "grad_norm": 1.6723942662699407, "learning_rate": 9.491693141919345e-07, "loss": 0.9899, "step": 3194 }, { "epoch": 2.5870445344129553, "grad_norm": 1.66701720323838, "learning_rate": 9.455176250685338e-07, "loss": 1.0288, "step": 3195 }, { "epoch": 2.587854251012146, "grad_norm": 1.6539257270825336, "learning_rate": 9.418726253669741e-07, "loss": 1.0232, "step": 3196 }, { "epoch": 2.588663967611336, "grad_norm": 1.5465095186573317, "learning_rate": 9.38234317780169e-07, "loss": 0.9658, "step": 3197 }, { "epoch": 2.5894736842105264, "grad_norm": 1.7029852892332802, "learning_rate": 9.346027049960849e-07, "loss": 0.9513, "step": 3198 }, { "epoch": 2.5902834008097164, "grad_norm": 1.831209877056402, "learning_rate": 9.309777896977501e-07, "loss": 0.9345, "step": 3199 }, { "epoch": 2.591093117408907, "grad_norm": 1.6872660132013921, "learning_rate": 9.27359574563238e-07, "loss": 0.9709, "step": 3200 }, { "epoch": 2.5919028340080974, "grad_norm": 1.6893163114944878, "learning_rate": 9.237480622656736e-07, "loss": 0.9725, "step": 3201 }, { "epoch": 2.5927125506072874, "grad_norm": 1.6058163315797132, "learning_rate": 9.201432554732304e-07, "loss": 1.0278, "step": 3202 }, { "epoch": 2.5935222672064775, "grad_norm": 1.692466394304998, "learning_rate": 9.165451568491257e-07, "loss": 1.0333, "step": 3203 }, { "epoch": 2.594331983805668, "grad_norm": 1.640592263798185, "learning_rate": 9.129537690516277e-07, "loss": 0.9587, "step": 3204 }, { "epoch": 2.5951417004048585, "grad_norm": 1.6407337256128884, "learning_rate": 9.093690947340406e-07, "loss": 0.937, "step": 3205 }, { "epoch": 2.5959514170040485, "grad_norm": 1.738587051999838, "learning_rate": 9.057911365447058e-07, "loss": 0.9966, "step": 3206 }, { "epoch": 2.596761133603239, "grad_norm": 1.6250090533624972, "learning_rate": 9.022198971270124e-07, "loss": 0.9348, "step": 3207 }, { "epoch": 2.597570850202429, "grad_norm": 1.643745528654048, "learning_rate": 8.986553791193775e-07, "loss": 1.0099, "step": 3208 }, { "epoch": 2.5983805668016196, "grad_norm": 1.6883666457612738, "learning_rate": 8.950975851552568e-07, "loss": 0.9812, "step": 3209 }, { "epoch": 2.5991902834008096, "grad_norm": 1.7178288521933882, "learning_rate": 8.915465178631344e-07, "loss": 0.9968, "step": 3210 }, { "epoch": 2.6, "grad_norm": 1.6723879917548437, "learning_rate": 8.880021798665295e-07, "loss": 0.9755, "step": 3211 }, { "epoch": 2.60080971659919, "grad_norm": 1.705852934667388, "learning_rate": 8.844645737839874e-07, "loss": 0.9901, "step": 3212 }, { "epoch": 2.6016194331983806, "grad_norm": 1.7322533252061503, "learning_rate": 8.809337022290787e-07, "loss": 0.938, "step": 3213 }, { "epoch": 2.6024291497975707, "grad_norm": 1.6855599969482573, "learning_rate": 8.774095678103978e-07, "loss": 0.9581, "step": 3214 }, { "epoch": 2.603238866396761, "grad_norm": 1.6274201556331367, "learning_rate": 8.738921731315686e-07, "loss": 1.0798, "step": 3215 }, { "epoch": 2.6040485829959517, "grad_norm": 1.61403911744669, "learning_rate": 8.70381520791227e-07, "loss": 1.0021, "step": 3216 }, { "epoch": 2.6048582995951417, "grad_norm": 1.678255519779955, "learning_rate": 8.668776133830315e-07, "loss": 0.9717, "step": 3217 }, { "epoch": 2.6056680161943317, "grad_norm": 1.627162580752701, "learning_rate": 8.633804534956591e-07, "loss": 0.8836, "step": 3218 }, { "epoch": 2.6064777327935222, "grad_norm": 1.6220265843378787, "learning_rate": 8.598900437127999e-07, "loss": 0.9926, "step": 3219 }, { "epoch": 2.6072874493927127, "grad_norm": 1.637954735309685, "learning_rate": 8.564063866131567e-07, "loss": 1.0443, "step": 3220 }, { "epoch": 2.6080971659919028, "grad_norm": 1.6447001385429936, "learning_rate": 8.529294847704428e-07, "loss": 0.8968, "step": 3221 }, { "epoch": 2.6089068825910933, "grad_norm": 1.732595510365215, "learning_rate": 8.494593407533814e-07, "loss": 0.9386, "step": 3222 }, { "epoch": 2.6097165991902833, "grad_norm": 1.6660686105327704, "learning_rate": 8.459959571257071e-07, "loss": 1.0091, "step": 3223 }, { "epoch": 2.610526315789474, "grad_norm": 1.7013721994565274, "learning_rate": 8.425393364461542e-07, "loss": 0.9747, "step": 3224 }, { "epoch": 2.611336032388664, "grad_norm": 1.7819007792713526, "learning_rate": 8.390894812684602e-07, "loss": 0.9579, "step": 3225 }, { "epoch": 2.6121457489878543, "grad_norm": 1.5960953357473817, "learning_rate": 8.356463941413717e-07, "loss": 1.0558, "step": 3226 }, { "epoch": 2.6129554655870444, "grad_norm": 1.7717830181251943, "learning_rate": 8.322100776086272e-07, "loss": 1.0088, "step": 3227 }, { "epoch": 2.613765182186235, "grad_norm": 1.7439967991496321, "learning_rate": 8.287805342089672e-07, "loss": 0.9053, "step": 3228 }, { "epoch": 2.614574898785425, "grad_norm": 1.670184838895179, "learning_rate": 8.253577664761259e-07, "loss": 1.0389, "step": 3229 }, { "epoch": 2.6153846153846154, "grad_norm": 1.7399555221858654, "learning_rate": 8.219417769388316e-07, "loss": 0.9939, "step": 3230 }, { "epoch": 2.616194331983806, "grad_norm": 1.6749694700524644, "learning_rate": 8.1853256812081e-07, "loss": 1.0025, "step": 3231 }, { "epoch": 2.617004048582996, "grad_norm": 1.7379585443686947, "learning_rate": 8.151301425407699e-07, "loss": 0.9861, "step": 3232 }, { "epoch": 2.617813765182186, "grad_norm": 1.6065227038635674, "learning_rate": 8.117345027124146e-07, "loss": 1.0168, "step": 3233 }, { "epoch": 2.6186234817813765, "grad_norm": 1.7012911860488888, "learning_rate": 8.083456511444309e-07, "loss": 1.0043, "step": 3234 }, { "epoch": 2.619433198380567, "grad_norm": 1.716691520394183, "learning_rate": 8.049635903404907e-07, "loss": 0.9689, "step": 3235 }, { "epoch": 2.620242914979757, "grad_norm": 1.7278598183805447, "learning_rate": 8.015883227992505e-07, "loss": 1.027, "step": 3236 }, { "epoch": 2.6210526315789475, "grad_norm": 1.7367934858252325, "learning_rate": 7.982198510143457e-07, "loss": 0.9283, "step": 3237 }, { "epoch": 2.6218623481781376, "grad_norm": 1.6429152171351453, "learning_rate": 7.948581774743902e-07, "loss": 0.9303, "step": 3238 }, { "epoch": 2.622672064777328, "grad_norm": 1.7790796815176042, "learning_rate": 7.915033046629817e-07, "loss": 0.9561, "step": 3239 }, { "epoch": 2.623481781376518, "grad_norm": 1.686258952709826, "learning_rate": 7.881552350586863e-07, "loss": 0.8947, "step": 3240 }, { "epoch": 2.6242914979757086, "grad_norm": 1.758970152066431, "learning_rate": 7.848139711350489e-07, "loss": 1.048, "step": 3241 }, { "epoch": 2.6251012145748986, "grad_norm": 1.6233206462939287, "learning_rate": 7.814795153605825e-07, "loss": 1.0282, "step": 3242 }, { "epoch": 2.625910931174089, "grad_norm": 1.5853805998689159, "learning_rate": 7.781518701987734e-07, "loss": 1.0069, "step": 3243 }, { "epoch": 2.626720647773279, "grad_norm": 1.849575708030945, "learning_rate": 7.748310381080749e-07, "loss": 0.904, "step": 3244 }, { "epoch": 2.6275303643724697, "grad_norm": 1.655980362780037, "learning_rate": 7.715170215419043e-07, "loss": 0.988, "step": 3245 }, { "epoch": 2.62834008097166, "grad_norm": 1.6899812614311005, "learning_rate": 7.682098229486478e-07, "loss": 0.9808, "step": 3246 }, { "epoch": 2.62914979757085, "grad_norm": 1.6541647003198807, "learning_rate": 7.649094447716532e-07, "loss": 0.961, "step": 3247 }, { "epoch": 2.6299595141700403, "grad_norm": 1.6996182835373292, "learning_rate": 7.616158894492298e-07, "loss": 0.9375, "step": 3248 }, { "epoch": 2.6307692307692307, "grad_norm": 1.644339997790146, "learning_rate": 7.583291594146458e-07, "loss": 1.0002, "step": 3249 }, { "epoch": 2.6315789473684212, "grad_norm": 1.7094644080217443, "learning_rate": 7.550492570961243e-07, "loss": 0.9905, "step": 3250 }, { "epoch": 2.6323886639676113, "grad_norm": 1.724822229310986, "learning_rate": 7.517761849168481e-07, "loss": 0.9991, "step": 3251 }, { "epoch": 2.6331983805668018, "grad_norm": 1.6081123719870767, "learning_rate": 7.485099452949507e-07, "loss": 0.9177, "step": 3252 }, { "epoch": 2.634008097165992, "grad_norm": 1.7226834341746038, "learning_rate": 7.452505406435184e-07, "loss": 0.9244, "step": 3253 }, { "epoch": 2.6348178137651823, "grad_norm": 1.6628814186493102, "learning_rate": 7.419979733705929e-07, "loss": 1.0378, "step": 3254 }, { "epoch": 2.6356275303643724, "grad_norm": 1.738063003473798, "learning_rate": 7.387522458791552e-07, "loss": 0.9352, "step": 3255 }, { "epoch": 2.636437246963563, "grad_norm": 1.680315939042461, "learning_rate": 7.355133605671417e-07, "loss": 1.0976, "step": 3256 }, { "epoch": 2.637246963562753, "grad_norm": 1.665843160940704, "learning_rate": 7.322813198274303e-07, "loss": 0.9783, "step": 3257 }, { "epoch": 2.6380566801619434, "grad_norm": 1.702934773134636, "learning_rate": 7.290561260478401e-07, "loss": 1.012, "step": 3258 }, { "epoch": 2.6388663967611334, "grad_norm": 1.6427499028473984, "learning_rate": 7.258377816111339e-07, "loss": 1.0654, "step": 3259 }, { "epoch": 2.639676113360324, "grad_norm": 1.681106134788699, "learning_rate": 7.226262888950153e-07, "loss": 0.9816, "step": 3260 }, { "epoch": 2.6404858299595144, "grad_norm": 1.6644509739713758, "learning_rate": 7.194216502721219e-07, "loss": 1.0321, "step": 3261 }, { "epoch": 2.6412955465587045, "grad_norm": 1.6556000089922558, "learning_rate": 7.16223868110032e-07, "loss": 0.9842, "step": 3262 }, { "epoch": 2.6421052631578945, "grad_norm": 1.6874491415426638, "learning_rate": 7.130329447712581e-07, "loss": 0.9057, "step": 3263 }, { "epoch": 2.642914979757085, "grad_norm": 1.7091112739486392, "learning_rate": 7.098488826132422e-07, "loss": 1.008, "step": 3264 }, { "epoch": 2.6437246963562755, "grad_norm": 1.6178389429525835, "learning_rate": 7.066716839883592e-07, "loss": 0.9924, "step": 3265 }, { "epoch": 2.6445344129554655, "grad_norm": 1.6485423266523553, "learning_rate": 7.035013512439126e-07, "loss": 1.054, "step": 3266 }, { "epoch": 2.645344129554656, "grad_norm": 1.661813731422273, "learning_rate": 7.003378867221344e-07, "loss": 0.9126, "step": 3267 }, { "epoch": 2.646153846153846, "grad_norm": 1.6864743875444674, "learning_rate": 6.971812927601806e-07, "loss": 0.9689, "step": 3268 }, { "epoch": 2.6469635627530366, "grad_norm": 1.7764344486102688, "learning_rate": 6.940315716901347e-07, "loss": 0.9694, "step": 3269 }, { "epoch": 2.6477732793522266, "grad_norm": 1.7426314403267598, "learning_rate": 6.908887258389974e-07, "loss": 1.0287, "step": 3270 }, { "epoch": 2.648582995951417, "grad_norm": 1.6967622835020153, "learning_rate": 6.877527575286958e-07, "loss": 0.8432, "step": 3271 }, { "epoch": 2.649392712550607, "grad_norm": 1.659074402231809, "learning_rate": 6.846236690760721e-07, "loss": 0.931, "step": 3272 }, { "epoch": 2.6502024291497976, "grad_norm": 1.6674943656562866, "learning_rate": 6.815014627928862e-07, "loss": 0.9656, "step": 3273 }, { "epoch": 2.6510121457489877, "grad_norm": 1.710449217756733, "learning_rate": 6.783861409858128e-07, "loss": 0.9369, "step": 3274 }, { "epoch": 2.651821862348178, "grad_norm": 1.7486026332376843, "learning_rate": 6.752777059564431e-07, "loss": 0.9058, "step": 3275 }, { "epoch": 2.6526315789473687, "grad_norm": 1.7290326137066063, "learning_rate": 6.721761600012766e-07, "loss": 0.9418, "step": 3276 }, { "epoch": 2.6534412955465587, "grad_norm": 1.6974512106097555, "learning_rate": 6.690815054117283e-07, "loss": 0.9965, "step": 3277 }, { "epoch": 2.6542510121457488, "grad_norm": 1.570906783647935, "learning_rate": 6.659937444741149e-07, "loss": 0.9696, "step": 3278 }, { "epoch": 2.6550607287449393, "grad_norm": 1.680749541696145, "learning_rate": 6.629128794696694e-07, "loss": 0.8977, "step": 3279 }, { "epoch": 2.6558704453441297, "grad_norm": 1.6928204758699041, "learning_rate": 6.598389126745209e-07, "loss": 1.0138, "step": 3280 }, { "epoch": 2.65668016194332, "grad_norm": 1.5916031288683266, "learning_rate": 6.567718463597061e-07, "loss": 1.0033, "step": 3281 }, { "epoch": 2.6574898785425103, "grad_norm": 1.7010292729133476, "learning_rate": 6.537116827911649e-07, "loss": 0.9784, "step": 3282 }, { "epoch": 2.6582995951417003, "grad_norm": 1.7137596107033524, "learning_rate": 6.506584242297332e-07, "loss": 0.9155, "step": 3283 }, { "epoch": 2.659109311740891, "grad_norm": 1.744228981789455, "learning_rate": 6.476120729311531e-07, "loss": 0.9438, "step": 3284 }, { "epoch": 2.659919028340081, "grad_norm": 1.6938026901844712, "learning_rate": 6.445726311460553e-07, "loss": 1.0437, "step": 3285 }, { "epoch": 2.6607287449392714, "grad_norm": 1.7064562907469736, "learning_rate": 6.415401011199707e-07, "loss": 0.9533, "step": 3286 }, { "epoch": 2.6615384615384614, "grad_norm": 1.653652412179681, "learning_rate": 6.385144850933222e-07, "loss": 0.9629, "step": 3287 }, { "epoch": 2.662348178137652, "grad_norm": 1.7289904638695122, "learning_rate": 6.354957853014254e-07, "loss": 1.017, "step": 3288 }, { "epoch": 2.663157894736842, "grad_norm": 1.7152833008239363, "learning_rate": 6.324840039744862e-07, "loss": 0.9085, "step": 3289 }, { "epoch": 2.6639676113360324, "grad_norm": 1.7289503260777432, "learning_rate": 6.29479143337598e-07, "loss": 0.8867, "step": 3290 }, { "epoch": 2.664777327935223, "grad_norm": 1.6648250831416591, "learning_rate": 6.264812056107406e-07, "loss": 0.9589, "step": 3291 }, { "epoch": 2.665587044534413, "grad_norm": 1.7548308959687973, "learning_rate": 6.234901930087822e-07, "loss": 0.9672, "step": 3292 }, { "epoch": 2.666396761133603, "grad_norm": 1.6680110599487112, "learning_rate": 6.205061077414743e-07, "loss": 0.9872, "step": 3293 }, { "epoch": 2.6672064777327935, "grad_norm": 1.7292774474924442, "learning_rate": 6.175289520134464e-07, "loss": 0.9107, "step": 3294 }, { "epoch": 2.668016194331984, "grad_norm": 1.7431371411748449, "learning_rate": 6.145587280242138e-07, "loss": 1.009, "step": 3295 }, { "epoch": 2.668825910931174, "grad_norm": 1.787321743542622, "learning_rate": 6.115954379681666e-07, "loss": 0.9275, "step": 3296 }, { "epoch": 2.669635627530364, "grad_norm": 1.6827076549424453, "learning_rate": 6.086390840345758e-07, "loss": 0.9042, "step": 3297 }, { "epoch": 2.6704453441295546, "grad_norm": 1.6938249525960054, "learning_rate": 6.05689668407582e-07, "loss": 1.0097, "step": 3298 }, { "epoch": 2.671255060728745, "grad_norm": 1.745931288902666, "learning_rate": 6.027471932662087e-07, "loss": 1.028, "step": 3299 }, { "epoch": 2.672064777327935, "grad_norm": 1.6780881861216097, "learning_rate": 5.99811660784344e-07, "loss": 1.0207, "step": 3300 }, { "epoch": 2.6728744939271256, "grad_norm": 1.666413076144872, "learning_rate": 5.968830731307507e-07, "loss": 0.9445, "step": 3301 }, { "epoch": 2.6736842105263157, "grad_norm": 1.6645120361386307, "learning_rate": 5.93961432469058e-07, "loss": 0.9988, "step": 3302 }, { "epoch": 2.674493927125506, "grad_norm": 1.6392363775063152, "learning_rate": 5.910467409577669e-07, "loss": 0.8894, "step": 3303 }, { "epoch": 2.675303643724696, "grad_norm": 1.6878734747415562, "learning_rate": 5.881390007502397e-07, "loss": 0.9665, "step": 3304 }, { "epoch": 2.6761133603238867, "grad_norm": 1.7768958238131116, "learning_rate": 5.852382139947077e-07, "loss": 0.9143, "step": 3305 }, { "epoch": 2.676923076923077, "grad_norm": 1.5994762220145167, "learning_rate": 5.82344382834259e-07, "loss": 1.0549, "step": 3306 }, { "epoch": 2.6777327935222672, "grad_norm": 1.7374717323879252, "learning_rate": 5.7945750940685e-07, "loss": 1.0182, "step": 3307 }, { "epoch": 2.6785425101214573, "grad_norm": 1.650625300217036, "learning_rate": 5.765775958452935e-07, "loss": 1.0116, "step": 3308 }, { "epoch": 2.6793522267206478, "grad_norm": 1.7121361852833241, "learning_rate": 5.737046442772576e-07, "loss": 1.0502, "step": 3309 }, { "epoch": 2.6801619433198383, "grad_norm": 1.6502806678346016, "learning_rate": 5.708386568252688e-07, "loss": 0.9503, "step": 3310 }, { "epoch": 2.6809716599190283, "grad_norm": 1.686124777598768, "learning_rate": 5.679796356067135e-07, "loss": 0.9896, "step": 3311 }, { "epoch": 2.6817813765182184, "grad_norm": 1.6024788294961974, "learning_rate": 5.651275827338242e-07, "loss": 1.0105, "step": 3312 }, { "epoch": 2.682591093117409, "grad_norm": 1.7206476831624593, "learning_rate": 5.622825003136878e-07, "loss": 0.977, "step": 3313 }, { "epoch": 2.6834008097165993, "grad_norm": 1.6455934468913418, "learning_rate": 5.594443904482439e-07, "loss": 0.9792, "step": 3314 }, { "epoch": 2.6842105263157894, "grad_norm": 1.7190031808580448, "learning_rate": 5.566132552342784e-07, "loss": 1.0892, "step": 3315 }, { "epoch": 2.68502024291498, "grad_norm": 1.6275442282090031, "learning_rate": 5.53789096763423e-07, "loss": 0.9998, "step": 3316 }, { "epoch": 2.68582995951417, "grad_norm": 1.6723953815880728, "learning_rate": 5.509719171221583e-07, "loss": 0.9851, "step": 3317 }, { "epoch": 2.6866396761133604, "grad_norm": 1.5877924488779105, "learning_rate": 5.481617183918053e-07, "loss": 0.9935, "step": 3318 }, { "epoch": 2.6874493927125505, "grad_norm": 1.725319551891715, "learning_rate": 5.45358502648532e-07, "loss": 0.9045, "step": 3319 }, { "epoch": 2.688259109311741, "grad_norm": 1.6281167669786418, "learning_rate": 5.425622719633428e-07, "loss": 0.997, "step": 3320 }, { "epoch": 2.6890688259109314, "grad_norm": 1.6635114803247797, "learning_rate": 5.397730284020863e-07, "loss": 1.04, "step": 3321 }, { "epoch": 2.6898785425101215, "grad_norm": 1.6162289487244115, "learning_rate": 5.369907740254454e-07, "loss": 0.9811, "step": 3322 }, { "epoch": 2.6906882591093115, "grad_norm": 1.7327330282080413, "learning_rate": 5.342155108889391e-07, "loss": 0.9866, "step": 3323 }, { "epoch": 2.691497975708502, "grad_norm": 1.673161064229558, "learning_rate": 5.31447241042925e-07, "loss": 0.9979, "step": 3324 }, { "epoch": 2.6923076923076925, "grad_norm": 1.5832573352945025, "learning_rate": 5.286859665325905e-07, "loss": 0.9559, "step": 3325 }, { "epoch": 2.6931174089068826, "grad_norm": 1.6724549773177813, "learning_rate": 5.259316893979549e-07, "loss": 0.9106, "step": 3326 }, { "epoch": 2.6939271255060726, "grad_norm": 1.6314795157115118, "learning_rate": 5.231844116738716e-07, "loss": 0.972, "step": 3327 }, { "epoch": 2.694736842105263, "grad_norm": 1.6811218193411817, "learning_rate": 5.204441353900169e-07, "loss": 0.9769, "step": 3328 }, { "epoch": 2.6955465587044536, "grad_norm": 1.728655218184744, "learning_rate": 5.177108625709026e-07, "loss": 1.0373, "step": 3329 }, { "epoch": 2.6963562753036436, "grad_norm": 1.652166979844383, "learning_rate": 5.149845952358589e-07, "loss": 1.0337, "step": 3330 }, { "epoch": 2.697165991902834, "grad_norm": 1.668823613835129, "learning_rate": 5.122653353990437e-07, "loss": 0.9601, "step": 3331 }, { "epoch": 2.697975708502024, "grad_norm": 1.619860141890713, "learning_rate": 5.095530850694375e-07, "loss": 1.0214, "step": 3332 }, { "epoch": 2.6987854251012147, "grad_norm": 1.7472110705893544, "learning_rate": 5.068478462508409e-07, "loss": 0.9791, "step": 3333 }, { "epoch": 2.6995951417004047, "grad_norm": 1.6679499678127792, "learning_rate": 5.04149620941875e-07, "loss": 0.9824, "step": 3334 }, { "epoch": 2.700404858299595, "grad_norm": 1.6663417046865165, "learning_rate": 5.014584111359811e-07, "loss": 0.9539, "step": 3335 }, { "epoch": 2.7012145748987857, "grad_norm": 1.672587987397243, "learning_rate": 4.987742188214162e-07, "loss": 0.9183, "step": 3336 }, { "epoch": 2.7020242914979757, "grad_norm": 1.7188596938545961, "learning_rate": 4.960970459812542e-07, "loss": 0.9772, "step": 3337 }, { "epoch": 2.702834008097166, "grad_norm": 1.6817958789804741, "learning_rate": 4.934268945933784e-07, "loss": 0.9823, "step": 3338 }, { "epoch": 2.7036437246963563, "grad_norm": 1.6819764284995349, "learning_rate": 4.907637666304898e-07, "loss": 0.9582, "step": 3339 }, { "epoch": 2.7044534412955468, "grad_norm": 1.6643127350965405, "learning_rate": 4.881076640600979e-07, "loss": 0.9677, "step": 3340 }, { "epoch": 2.705263157894737, "grad_norm": 1.777174635148692, "learning_rate": 4.854585888445218e-07, "loss": 0.9946, "step": 3341 }, { "epoch": 2.706072874493927, "grad_norm": 1.6692254557052726, "learning_rate": 4.828165429408926e-07, "loss": 1.0203, "step": 3342 }, { "epoch": 2.7068825910931174, "grad_norm": 1.7053185479226727, "learning_rate": 4.801815283011413e-07, "loss": 0.9442, "step": 3343 }, { "epoch": 2.707692307692308, "grad_norm": 1.700154883033202, "learning_rate": 4.775535468720105e-07, "loss": 0.9464, "step": 3344 }, { "epoch": 2.708502024291498, "grad_norm": 1.7405575127592974, "learning_rate": 4.7493260059504497e-07, "loss": 0.9651, "step": 3345 }, { "epoch": 2.7093117408906884, "grad_norm": 1.6499300506429417, "learning_rate": 4.7231869140658804e-07, "loss": 0.9186, "step": 3346 }, { "epoch": 2.7101214574898784, "grad_norm": 1.684334696306486, "learning_rate": 4.6971182123779045e-07, "loss": 0.8985, "step": 3347 }, { "epoch": 2.710931174089069, "grad_norm": 1.6675095321437483, "learning_rate": 4.6711199201459833e-07, "loss": 1.0096, "step": 3348 }, { "epoch": 2.711740890688259, "grad_norm": 1.7918664382749507, "learning_rate": 4.645192056577541e-07, "loss": 1.0911, "step": 3349 }, { "epoch": 2.7125506072874495, "grad_norm": 1.7239165785172468, "learning_rate": 4.6193346408280216e-07, "loss": 1.0048, "step": 3350 }, { "epoch": 2.71336032388664, "grad_norm": 1.6745541046010592, "learning_rate": 4.5935476920008213e-07, "loss": 0.9741, "step": 3351 }, { "epoch": 2.71417004048583, "grad_norm": 1.770541331198052, "learning_rate": 4.5678312291472347e-07, "loss": 0.9229, "step": 3352 }, { "epoch": 2.71497975708502, "grad_norm": 1.7454452776343088, "learning_rate": 4.542185271266486e-07, "loss": 0.9203, "step": 3353 }, { "epoch": 2.7157894736842105, "grad_norm": 1.6546940699373445, "learning_rate": 4.516609837305741e-07, "loss": 0.9511, "step": 3354 }, { "epoch": 2.716599190283401, "grad_norm": 1.673530449759611, "learning_rate": 4.491104946160052e-07, "loss": 0.9926, "step": 3355 }, { "epoch": 2.717408906882591, "grad_norm": 1.6784803821715035, "learning_rate": 4.465670616672313e-07, "loss": 0.951, "step": 3356 }, { "epoch": 2.718218623481781, "grad_norm": 1.7286011969705144, "learning_rate": 4.440306867633359e-07, "loss": 0.9863, "step": 3357 }, { "epoch": 2.7190283400809716, "grad_norm": 1.759405070641066, "learning_rate": 4.4150137177818243e-07, "loss": 1.0233, "step": 3358 }, { "epoch": 2.719838056680162, "grad_norm": 1.7032342660835975, "learning_rate": 4.389791185804237e-07, "loss": 0.9514, "step": 3359 }, { "epoch": 2.720647773279352, "grad_norm": 1.7760045827311535, "learning_rate": 4.3646392903348823e-07, "loss": 0.9509, "step": 3360 }, { "epoch": 2.7214574898785426, "grad_norm": 1.703842102243129, "learning_rate": 4.3395580499559276e-07, "loss": 0.9985, "step": 3361 }, { "epoch": 2.7222672064777327, "grad_norm": 1.7058090054545807, "learning_rate": 4.3145474831972845e-07, "loss": 0.994, "step": 3362 }, { "epoch": 2.723076923076923, "grad_norm": 1.704324426436363, "learning_rate": 4.2896076085367056e-07, "loss": 0.9599, "step": 3363 }, { "epoch": 2.723886639676113, "grad_norm": 1.6980076718179375, "learning_rate": 4.264738444399652e-07, "loss": 0.9828, "step": 3364 }, { "epoch": 2.7246963562753037, "grad_norm": 1.6087858125711858, "learning_rate": 4.2399400091594154e-07, "loss": 1.0022, "step": 3365 }, { "epoch": 2.725506072874494, "grad_norm": 1.6883922800088609, "learning_rate": 4.2152123211369633e-07, "loss": 0.9971, "step": 3366 }, { "epoch": 2.7263157894736842, "grad_norm": 1.7249610106754352, "learning_rate": 4.1905553986010707e-07, "loss": 0.9756, "step": 3367 }, { "epoch": 2.7271255060728743, "grad_norm": 1.650676985334564, "learning_rate": 4.165969259768177e-07, "loss": 0.9695, "step": 3368 }, { "epoch": 2.727935222672065, "grad_norm": 1.7286467505133185, "learning_rate": 4.1414539228024297e-07, "loss": 1.0002, "step": 3369 }, { "epoch": 2.7287449392712553, "grad_norm": 1.7055124440922338, "learning_rate": 4.117009405815686e-07, "loss": 0.9834, "step": 3370 }, { "epoch": 2.7295546558704453, "grad_norm": 1.7056058182983371, "learning_rate": 4.0926357268674667e-07, "loss": 0.9676, "step": 3371 }, { "epoch": 2.7303643724696354, "grad_norm": 1.7942169407381743, "learning_rate": 4.068332903964978e-07, "loss": 1.0179, "step": 3372 }, { "epoch": 2.731174089068826, "grad_norm": 1.7569046659689378, "learning_rate": 4.0441009550630683e-07, "loss": 0.9497, "step": 3373 }, { "epoch": 2.7319838056680164, "grad_norm": 1.7915957106720466, "learning_rate": 4.0199398980641955e-07, "loss": 1.0083, "step": 3374 }, { "epoch": 2.7327935222672064, "grad_norm": 1.6604774269182196, "learning_rate": 3.9958497508185036e-07, "loss": 0.9262, "step": 3375 }, { "epoch": 2.733603238866397, "grad_norm": 1.7981493121093652, "learning_rate": 3.9718305311236996e-07, "loss": 0.9811, "step": 3376 }, { "epoch": 2.734412955465587, "grad_norm": 1.7140059177794198, "learning_rate": 3.9478822567251e-07, "loss": 1.02, "step": 3377 }, { "epoch": 2.7352226720647774, "grad_norm": 1.7037447623006154, "learning_rate": 3.924004945315618e-07, "loss": 1.0029, "step": 3378 }, { "epoch": 2.7360323886639675, "grad_norm": 1.6598577339137044, "learning_rate": 3.900198614535711e-07, "loss": 0.9981, "step": 3379 }, { "epoch": 2.736842105263158, "grad_norm": 1.643004118226858, "learning_rate": 3.8764632819734526e-07, "loss": 0.9708, "step": 3380 }, { "epoch": 2.737651821862348, "grad_norm": 1.6338597557404355, "learning_rate": 3.852798965164406e-07, "loss": 1.0315, "step": 3381 }, { "epoch": 2.7384615384615385, "grad_norm": 1.6815159903047672, "learning_rate": 3.8292056815916965e-07, "loss": 0.9838, "step": 3382 }, { "epoch": 2.7392712550607285, "grad_norm": 1.6880628778391817, "learning_rate": 3.805683448685971e-07, "loss": 0.9882, "step": 3383 }, { "epoch": 2.740080971659919, "grad_norm": 1.7337464917612249, "learning_rate": 3.782232283825371e-07, "loss": 0.9449, "step": 3384 }, { "epoch": 2.7408906882591095, "grad_norm": 1.618990796172125, "learning_rate": 3.758852204335539e-07, "loss": 0.9886, "step": 3385 }, { "epoch": 2.7417004048582996, "grad_norm": 1.7016005187057022, "learning_rate": 3.735543227489591e-07, "loss": 1.032, "step": 3386 }, { "epoch": 2.7425101214574896, "grad_norm": 1.6521289220479534, "learning_rate": 3.712305370508151e-07, "loss": 0.9023, "step": 3387 }, { "epoch": 2.74331983805668, "grad_norm": 1.6818920788449179, "learning_rate": 3.6891386505592543e-07, "loss": 0.9534, "step": 3388 }, { "epoch": 2.7441295546558706, "grad_norm": 1.6371230429182253, "learning_rate": 3.6660430847583973e-07, "loss": 1.0219, "step": 3389 }, { "epoch": 2.7449392712550607, "grad_norm": 1.6513777881762906, "learning_rate": 3.643018690168487e-07, "loss": 1.0237, "step": 3390 }, { "epoch": 2.745748987854251, "grad_norm": 1.702844558572882, "learning_rate": 3.620065483799917e-07, "loss": 1.0022, "step": 3391 }, { "epoch": 2.746558704453441, "grad_norm": 1.6684488841014413, "learning_rate": 3.5971834826104114e-07, "loss": 1.0313, "step": 3392 }, { "epoch": 2.7473684210526317, "grad_norm": 1.678173521295049, "learning_rate": 3.5743727035051245e-07, "loss": 0.9862, "step": 3393 }, { "epoch": 2.7481781376518217, "grad_norm": 1.6963824088714325, "learning_rate": 3.551633163336565e-07, "loss": 0.9448, "step": 3394 }, { "epoch": 2.748987854251012, "grad_norm": 1.650614217397481, "learning_rate": 3.5289648789046616e-07, "loss": 0.9745, "step": 3395 }, { "epoch": 2.7497975708502023, "grad_norm": 1.6241513485984185, "learning_rate": 3.5063678669566616e-07, "loss": 0.9782, "step": 3396 }, { "epoch": 2.7506072874493928, "grad_norm": 1.7366532492678954, "learning_rate": 3.4838421441871553e-07, "loss": 0.9545, "step": 3397 }, { "epoch": 2.751417004048583, "grad_norm": 1.643649947081736, "learning_rate": 3.4613877272380526e-07, "loss": 0.9422, "step": 3398 }, { "epoch": 2.7522267206477733, "grad_norm": 1.7397550528137697, "learning_rate": 3.4390046326986506e-07, "loss": 0.9788, "step": 3399 }, { "epoch": 2.753036437246964, "grad_norm": 1.7012047353401265, "learning_rate": 3.4166928771054653e-07, "loss": 1.0243, "step": 3400 }, { "epoch": 2.753846153846154, "grad_norm": 1.6224734532349014, "learning_rate": 3.394452476942367e-07, "loss": 1.0144, "step": 3401 }, { "epoch": 2.754655870445344, "grad_norm": 1.6480456374209134, "learning_rate": 3.37228344864049e-07, "loss": 0.9859, "step": 3402 }, { "epoch": 2.7554655870445344, "grad_norm": 1.6460293515314257, "learning_rate": 3.350185808578232e-07, "loss": 1.0034, "step": 3403 }, { "epoch": 2.756275303643725, "grad_norm": 1.757720445735907, "learning_rate": 3.328159573081258e-07, "loss": 1.0231, "step": 3404 }, { "epoch": 2.757085020242915, "grad_norm": 1.7467262631474252, "learning_rate": 3.3062047584224934e-07, "loss": 1.0036, "step": 3405 }, { "epoch": 2.7578947368421054, "grad_norm": 1.665170966625211, "learning_rate": 3.284321380822053e-07, "loss": 1.0402, "step": 3406 }, { "epoch": 2.7587044534412954, "grad_norm": 1.6956185677978453, "learning_rate": 3.262509456447327e-07, "loss": 0.998, "step": 3407 }, { "epoch": 2.759514170040486, "grad_norm": 1.7519136605866905, "learning_rate": 3.240769001412891e-07, "loss": 0.9528, "step": 3408 }, { "epoch": 2.760323886639676, "grad_norm": 1.7155229720938194, "learning_rate": 3.21910003178052e-07, "loss": 0.9239, "step": 3409 }, { "epoch": 2.7611336032388665, "grad_norm": 1.6753569936900223, "learning_rate": 3.197502563559185e-07, "loss": 1.0247, "step": 3410 }, { "epoch": 2.7619433198380565, "grad_norm": 1.6291004521416133, "learning_rate": 3.1759766127050116e-07, "loss": 1.0405, "step": 3411 }, { "epoch": 2.762753036437247, "grad_norm": 1.7193595883451167, "learning_rate": 3.1545221951213125e-07, "loss": 0.9007, "step": 3412 }, { "epoch": 2.763562753036437, "grad_norm": 1.7346002461970236, "learning_rate": 3.13313932665853e-07, "loss": 0.922, "step": 3413 }, { "epoch": 2.7643724696356275, "grad_norm": 1.6724531692395965, "learning_rate": 3.1118280231142496e-07, "loss": 0.9998, "step": 3414 }, { "epoch": 2.765182186234818, "grad_norm": 1.7916657747046403, "learning_rate": 3.0905883002332213e-07, "loss": 0.9282, "step": 3415 }, { "epoch": 2.765991902834008, "grad_norm": 1.6206762833247699, "learning_rate": 3.069420173707249e-07, "loss": 1.0229, "step": 3416 }, { "epoch": 2.766801619433198, "grad_norm": 1.7079187810682614, "learning_rate": 3.048323659175301e-07, "loss": 0.9421, "step": 3417 }, { "epoch": 2.7676113360323886, "grad_norm": 1.6489425364151793, "learning_rate": 3.027298772223419e-07, "loss": 1.0534, "step": 3418 }, { "epoch": 2.768421052631579, "grad_norm": 1.6128333988404466, "learning_rate": 3.006345528384691e-07, "loss": 0.9396, "step": 3419 }, { "epoch": 2.769230769230769, "grad_norm": 1.6925085208629478, "learning_rate": 2.985463943139322e-07, "loss": 0.9452, "step": 3420 }, { "epoch": 2.7700404858299597, "grad_norm": 1.728805527613981, "learning_rate": 2.96465403191456e-07, "loss": 0.9629, "step": 3421 }, { "epoch": 2.7708502024291497, "grad_norm": 1.6026531892520557, "learning_rate": 2.943915810084685e-07, "loss": 0.9992, "step": 3422 }, { "epoch": 2.77165991902834, "grad_norm": 1.7066029483605916, "learning_rate": 2.923249292971042e-07, "loss": 1.0369, "step": 3423 }, { "epoch": 2.7724696356275302, "grad_norm": 1.6385263272251447, "learning_rate": 2.9026544958419833e-07, "loss": 1.0205, "step": 3424 }, { "epoch": 2.7732793522267207, "grad_norm": 1.6324065649477524, "learning_rate": 2.882131433912883e-07, "loss": 0.9805, "step": 3425 }, { "epoch": 2.7740890688259108, "grad_norm": 1.6647087033829902, "learning_rate": 2.8616801223461e-07, "loss": 1.0149, "step": 3426 }, { "epoch": 2.7748987854251013, "grad_norm": 1.7242997046116755, "learning_rate": 2.841300576250994e-07, "loss": 0.9302, "step": 3427 }, { "epoch": 2.7757085020242913, "grad_norm": 1.6351854391159, "learning_rate": 2.8209928106839204e-07, "loss": 0.9689, "step": 3428 }, { "epoch": 2.776518218623482, "grad_norm": 1.6547619822957484, "learning_rate": 2.800756840648178e-07, "loss": 1.041, "step": 3429 }, { "epoch": 2.7773279352226723, "grad_norm": 1.6907136500019693, "learning_rate": 2.7805926810940297e-07, "loss": 1.0214, "step": 3430 }, { "epoch": 2.7781376518218623, "grad_norm": 1.6858591321540695, "learning_rate": 2.7605003469187044e-07, "loss": 1.0106, "step": 3431 }, { "epoch": 2.7789473684210524, "grad_norm": 1.761735692393002, "learning_rate": 2.74047985296636e-07, "loss": 1.0111, "step": 3432 }, { "epoch": 2.779757085020243, "grad_norm": 1.6840626486171097, "learning_rate": 2.720531214028055e-07, "loss": 0.9443, "step": 3433 }, { "epoch": 2.7805668016194334, "grad_norm": 1.7212890453015592, "learning_rate": 2.700654444841777e-07, "loss": 0.9862, "step": 3434 }, { "epoch": 2.7813765182186234, "grad_norm": 1.6376530049169284, "learning_rate": 2.6808495600924355e-07, "loss": 0.9878, "step": 3435 }, { "epoch": 2.782186234817814, "grad_norm": 1.6855350369784343, "learning_rate": 2.661116574411793e-07, "loss": 0.9569, "step": 3436 }, { "epoch": 2.782995951417004, "grad_norm": 1.774069519379109, "learning_rate": 2.6414555023785204e-07, "loss": 0.9306, "step": 3437 }, { "epoch": 2.7838056680161944, "grad_norm": 1.592262691990277, "learning_rate": 2.6218663585181547e-07, "loss": 1.0135, "step": 3438 }, { "epoch": 2.7846153846153845, "grad_norm": 1.7023463434984625, "learning_rate": 2.602349157303108e-07, "loss": 0.9985, "step": 3439 }, { "epoch": 2.785425101214575, "grad_norm": 1.7075079740626764, "learning_rate": 2.582903913152612e-07, "loss": 0.9859, "step": 3440 }, { "epoch": 2.786234817813765, "grad_norm": 1.6394777254757238, "learning_rate": 2.563530640432732e-07, "loss": 0.9818, "step": 3441 }, { "epoch": 2.7870445344129555, "grad_norm": 1.6722333217908798, "learning_rate": 2.5442293534564067e-07, "loss": 0.9843, "step": 3442 }, { "epoch": 2.7878542510121456, "grad_norm": 1.64564423365469, "learning_rate": 2.525000066483352e-07, "loss": 0.9853, "step": 3443 }, { "epoch": 2.788663967611336, "grad_norm": 1.5885870007756162, "learning_rate": 2.5058427937200816e-07, "loss": 0.9864, "step": 3444 }, { "epoch": 2.7894736842105265, "grad_norm": 1.6567088157067262, "learning_rate": 2.4867575493199515e-07, "loss": 0.9909, "step": 3445 }, { "epoch": 2.7902834008097166, "grad_norm": 1.7241493918176931, "learning_rate": 2.467744347383072e-07, "loss": 0.9471, "step": 3446 }, { "epoch": 2.7910931174089066, "grad_norm": 1.5953145152003532, "learning_rate": 2.44880320195634e-07, "loss": 1.0425, "step": 3447 }, { "epoch": 2.791902834008097, "grad_norm": 1.704422723526784, "learning_rate": 2.4299341270333955e-07, "loss": 0.9508, "step": 3448 }, { "epoch": 2.7927125506072876, "grad_norm": 1.6615202773394242, "learning_rate": 2.4111371365546643e-07, "loss": 0.9808, "step": 3449 }, { "epoch": 2.7935222672064777, "grad_norm": 1.6622420527909882, "learning_rate": 2.392412244407294e-07, "loss": 0.9952, "step": 3450 }, { "epoch": 2.794331983805668, "grad_norm": 1.7028019856102616, "learning_rate": 2.373759464425174e-07, "loss": 1.0009, "step": 3451 }, { "epoch": 2.795141700404858, "grad_norm": 1.6318324035656953, "learning_rate": 2.3551788103889027e-07, "loss": 0.9645, "step": 3452 }, { "epoch": 2.7959514170040487, "grad_norm": 1.6683299909476288, "learning_rate": 2.3366702960258336e-07, "loss": 1.0226, "step": 3453 }, { "epoch": 2.7967611336032387, "grad_norm": 1.7972866276194772, "learning_rate": 2.3182339350099724e-07, "loss": 0.9662, "step": 3454 }, { "epoch": 2.7975708502024292, "grad_norm": 1.7250223932824822, "learning_rate": 2.2998697409620573e-07, "loss": 1.0532, "step": 3455 }, { "epoch": 2.7983805668016193, "grad_norm": 1.6934825177542452, "learning_rate": 2.2815777274495022e-07, "loss": 1.0051, "step": 3456 }, { "epoch": 2.7991902834008098, "grad_norm": 1.7033740545061948, "learning_rate": 2.2633579079863632e-07, "loss": 0.997, "step": 3457 }, { "epoch": 2.8, "grad_norm": 1.7292274929656826, "learning_rate": 2.2452102960334064e-07, "loss": 0.9835, "step": 3458 }, { "epoch": 2.8008097165991903, "grad_norm": 1.6514262457771633, "learning_rate": 2.2271349049979962e-07, "loss": 0.9233, "step": 3459 }, { "epoch": 2.801619433198381, "grad_norm": 1.7415649309506156, "learning_rate": 2.2091317482342056e-07, "loss": 0.9879, "step": 3460 }, { "epoch": 2.802429149797571, "grad_norm": 1.6947139567148872, "learning_rate": 2.1912008390426953e-07, "loss": 0.9346, "step": 3461 }, { "epoch": 2.803238866396761, "grad_norm": 1.760629650452238, "learning_rate": 2.1733421906707464e-07, "loss": 1.0005, "step": 3462 }, { "epoch": 2.8040485829959514, "grad_norm": 1.6060526892387195, "learning_rate": 2.1555558163122935e-07, "loss": 1.075, "step": 3463 }, { "epoch": 2.804858299595142, "grad_norm": 1.6926722698944767, "learning_rate": 2.137841729107848e-07, "loss": 0.8934, "step": 3464 }, { "epoch": 2.805668016194332, "grad_norm": 1.6918582251550736, "learning_rate": 2.1201999421445074e-07, "loss": 0.9724, "step": 3465 }, { "epoch": 2.8064777327935224, "grad_norm": 1.6163610863078754, "learning_rate": 2.1026304684559685e-07, "loss": 1.0206, "step": 3466 }, { "epoch": 2.8072874493927125, "grad_norm": 1.6797477934507816, "learning_rate": 2.0851333210225032e-07, "loss": 0.9478, "step": 3467 }, { "epoch": 2.808097165991903, "grad_norm": 1.6836631767653443, "learning_rate": 2.0677085127709495e-07, "loss": 1.0166, "step": 3468 }, { "epoch": 2.808906882591093, "grad_norm": 1.7134147411086924, "learning_rate": 2.0503560565747092e-07, "loss": 0.9374, "step": 3469 }, { "epoch": 2.8097165991902835, "grad_norm": 1.7189098388876702, "learning_rate": 2.0330759652536835e-07, "loss": 0.9788, "step": 3470 }, { "epoch": 2.8105263157894735, "grad_norm": 1.6505141468009865, "learning_rate": 2.0158682515743933e-07, "loss": 0.9734, "step": 3471 }, { "epoch": 2.811336032388664, "grad_norm": 1.6506249870105763, "learning_rate": 1.9987329282498024e-07, "loss": 0.9544, "step": 3472 }, { "epoch": 2.812145748987854, "grad_norm": 1.7316946412284435, "learning_rate": 1.9816700079394625e-07, "loss": 1.0232, "step": 3473 }, { "epoch": 2.8129554655870446, "grad_norm": 1.6533763703707396, "learning_rate": 1.964679503249367e-07, "loss": 0.9491, "step": 3474 }, { "epoch": 2.813765182186235, "grad_norm": 1.76162742728525, "learning_rate": 1.9477614267320867e-07, "loss": 1.0347, "step": 3475 }, { "epoch": 2.814574898785425, "grad_norm": 1.6828209980534072, "learning_rate": 1.9309157908866116e-07, "loss": 0.9581, "step": 3476 }, { "epoch": 2.815384615384615, "grad_norm": 1.674090593565143, "learning_rate": 1.9141426081584537e-07, "loss": 0.9604, "step": 3477 }, { "epoch": 2.8161943319838056, "grad_norm": 1.725348808312188, "learning_rate": 1.8974418909395774e-07, "loss": 1.0223, "step": 3478 }, { "epoch": 2.817004048582996, "grad_norm": 1.6949407572974873, "learning_rate": 1.880813651568425e-07, "loss": 1.0619, "step": 3479 }, { "epoch": 2.817813765182186, "grad_norm": 1.662538589931312, "learning_rate": 1.8642579023298913e-07, "loss": 1.0112, "step": 3480 }, { "epoch": 2.8186234817813767, "grad_norm": 1.7612232629543667, "learning_rate": 1.8477746554552922e-07, "loss": 0.9445, "step": 3481 }, { "epoch": 2.8194331983805667, "grad_norm": 1.6705151929988862, "learning_rate": 1.831363923122409e-07, "loss": 0.9485, "step": 3482 }, { "epoch": 2.820242914979757, "grad_norm": 1.6573033708453293, "learning_rate": 1.815025717455432e-07, "loss": 0.9873, "step": 3483 }, { "epoch": 2.8210526315789473, "grad_norm": 1.6767470480305524, "learning_rate": 1.7987600505249726e-07, "loss": 0.9749, "step": 3484 }, { "epoch": 2.8218623481781377, "grad_norm": 1.7183296390160356, "learning_rate": 1.7825669343480624e-07, "loss": 0.9317, "step": 3485 }, { "epoch": 2.822672064777328, "grad_norm": 1.6844074820085622, "learning_rate": 1.7664463808880983e-07, "loss": 0.9173, "step": 3486 }, { "epoch": 2.8234817813765183, "grad_norm": 1.6126455021944093, "learning_rate": 1.7503984020549203e-07, "loss": 0.9177, "step": 3487 }, { "epoch": 2.8242914979757083, "grad_norm": 1.674650005216152, "learning_rate": 1.7344230097047111e-07, "loss": 0.9383, "step": 3488 }, { "epoch": 2.825101214574899, "grad_norm": 1.7018972026288255, "learning_rate": 1.7185202156400294e-07, "loss": 0.9821, "step": 3489 }, { "epoch": 2.8259109311740893, "grad_norm": 1.6593676997323308, "learning_rate": 1.7026900316098217e-07, "loss": 0.9953, "step": 3490 }, { "epoch": 2.8267206477732794, "grad_norm": 1.6763973087070592, "learning_rate": 1.6869324693093768e-07, "loss": 0.97, "step": 3491 }, { "epoch": 2.8275303643724694, "grad_norm": 1.6756347235552076, "learning_rate": 1.6712475403803164e-07, "loss": 1.0831, "step": 3492 }, { "epoch": 2.82834008097166, "grad_norm": 1.6961902060855358, "learning_rate": 1.655635256410615e-07, "loss": 1.0187, "step": 3493 }, { "epoch": 2.8291497975708504, "grad_norm": 1.7106926602042642, "learning_rate": 1.6400956289345903e-07, "loss": 0.9455, "step": 3494 }, { "epoch": 2.8299595141700404, "grad_norm": 1.6339795546096871, "learning_rate": 1.6246286694328594e-07, "loss": 1.0447, "step": 3495 }, { "epoch": 2.830769230769231, "grad_norm": 1.6545843031250616, "learning_rate": 1.6092343893323593e-07, "loss": 0.9805, "step": 3496 }, { "epoch": 2.831578947368421, "grad_norm": 1.6809638570842909, "learning_rate": 1.5939128000063364e-07, "loss": 0.9301, "step": 3497 }, { "epoch": 2.8323886639676115, "grad_norm": 1.699507326131043, "learning_rate": 1.5786639127743363e-07, "loss": 0.9757, "step": 3498 }, { "epoch": 2.8331983805668015, "grad_norm": 1.7076118973222212, "learning_rate": 1.5634877389021695e-07, "loss": 0.9538, "step": 3499 }, { "epoch": 2.834008097165992, "grad_norm": 1.6599897623636726, "learning_rate": 1.5483842896019675e-07, "loss": 1.0356, "step": 3500 }, { "epoch": 2.834817813765182, "grad_norm": 1.71624063763336, "learning_rate": 1.5333535760320929e-07, "loss": 0.96, "step": 3501 }, { "epoch": 2.8356275303643725, "grad_norm": 1.7240409170070614, "learning_rate": 1.518395609297185e-07, "loss": 0.9491, "step": 3502 }, { "epoch": 2.8364372469635626, "grad_norm": 1.6299987076331925, "learning_rate": 1.5035104004481604e-07, "loss": 1.0197, "step": 3503 }, { "epoch": 2.837246963562753, "grad_norm": 1.5841940758881194, "learning_rate": 1.4886979604821328e-07, "loss": 0.9758, "step": 3504 }, { "epoch": 2.8380566801619436, "grad_norm": 1.6462756907053675, "learning_rate": 1.473958300342504e-07, "loss": 0.975, "step": 3505 }, { "epoch": 2.8388663967611336, "grad_norm": 1.6923662140676001, "learning_rate": 1.4592914309188965e-07, "loss": 0.9598, "step": 3506 }, { "epoch": 2.8396761133603237, "grad_norm": 1.71758711743743, "learning_rate": 1.4446973630471207e-07, "loss": 0.9187, "step": 3507 }, { "epoch": 2.840485829959514, "grad_norm": 1.7314496796265455, "learning_rate": 1.4301761075092402e-07, "loss": 0.9947, "step": 3508 }, { "epoch": 2.8412955465587046, "grad_norm": 1.6658482977303863, "learning_rate": 1.415727675033507e-07, "loss": 0.9994, "step": 3509 }, { "epoch": 2.8421052631578947, "grad_norm": 1.652120289375282, "learning_rate": 1.401352076294371e-07, "loss": 1.0019, "step": 3510 }, { "epoch": 2.842914979757085, "grad_norm": 1.7489773597175606, "learning_rate": 1.3870493219124814e-07, "loss": 1.0033, "step": 3511 }, { "epoch": 2.8437246963562752, "grad_norm": 1.703873872469804, "learning_rate": 1.3728194224546742e-07, "loss": 0.9716, "step": 3512 }, { "epoch": 2.8445344129554657, "grad_norm": 1.7380413078833634, "learning_rate": 1.35866238843394e-07, "loss": 0.9694, "step": 3513 }, { "epoch": 2.8453441295546558, "grad_norm": 1.7289000733110216, "learning_rate": 1.3445782303094568e-07, "loss": 0.9484, "step": 3514 }, { "epoch": 2.8461538461538463, "grad_norm": 1.748270696926121, "learning_rate": 1.3305669584865565e-07, "loss": 0.9799, "step": 3515 }, { "epoch": 2.8469635627530363, "grad_norm": 1.6413747236212781, "learning_rate": 1.3166285833167147e-07, "loss": 1.0151, "step": 3516 }, { "epoch": 2.847773279352227, "grad_norm": 1.6667240650346242, "learning_rate": 1.3027631150975606e-07, "loss": 0.9758, "step": 3517 }, { "epoch": 2.848582995951417, "grad_norm": 1.6872730298067318, "learning_rate": 1.2889705640728445e-07, "loss": 0.9228, "step": 3518 }, { "epoch": 2.8493927125506073, "grad_norm": 1.6295638968820234, "learning_rate": 1.275250940432471e-07, "loss": 0.979, "step": 3519 }, { "epoch": 2.850202429149798, "grad_norm": 1.6588363373893749, "learning_rate": 1.261604254312454e-07, "loss": 1.0192, "step": 3520 }, { "epoch": 2.851012145748988, "grad_norm": 1.6683421475461793, "learning_rate": 1.248030515794907e-07, "loss": 0.9543, "step": 3521 }, { "epoch": 2.851821862348178, "grad_norm": 1.6524719881626193, "learning_rate": 1.2345297349080852e-07, "loss": 0.9376, "step": 3522 }, { "epoch": 2.8526315789473684, "grad_norm": 1.698816182274864, "learning_rate": 1.221101921626311e-07, "loss": 0.969, "step": 3523 }, { "epoch": 2.853441295546559, "grad_norm": 1.6990536494185664, "learning_rate": 1.2077470858699925e-07, "loss": 0.9081, "step": 3524 }, { "epoch": 2.854251012145749, "grad_norm": 1.6689902626693056, "learning_rate": 1.1944652375056597e-07, "loss": 0.9413, "step": 3525 }, { "epoch": 2.8550607287449394, "grad_norm": 1.7541568188456584, "learning_rate": 1.1812563863458859e-07, "loss": 0.9368, "step": 3526 }, { "epoch": 2.8558704453441295, "grad_norm": 1.6900489394163216, "learning_rate": 1.1681205421493425e-07, "loss": 1.0227, "step": 3527 }, { "epoch": 2.85668016194332, "grad_norm": 1.6593250577589962, "learning_rate": 1.1550577146207331e-07, "loss": 0.9822, "step": 3528 }, { "epoch": 2.85748987854251, "grad_norm": 1.706412070441469, "learning_rate": 1.1420679134108382e-07, "loss": 0.9934, "step": 3529 }, { "epoch": 2.8582995951417005, "grad_norm": 1.6579862881360155, "learning_rate": 1.1291511481164807e-07, "loss": 0.9466, "step": 3530 }, { "epoch": 2.8591093117408906, "grad_norm": 1.7170922289715347, "learning_rate": 1.1163074282805165e-07, "loss": 0.9103, "step": 3531 }, { "epoch": 2.859919028340081, "grad_norm": 1.7114518472316398, "learning_rate": 1.1035367633918436e-07, "loss": 0.9184, "step": 3532 }, { "epoch": 2.860728744939271, "grad_norm": 1.731323195336544, "learning_rate": 1.0908391628854042e-07, "loss": 0.9779, "step": 3533 }, { "epoch": 2.8615384615384616, "grad_norm": 1.7111499506843564, "learning_rate": 1.0782146361421275e-07, "loss": 1.0177, "step": 3534 }, { "epoch": 2.862348178137652, "grad_norm": 1.6337915289731522, "learning_rate": 1.0656631924889749e-07, "loss": 0.974, "step": 3535 }, { "epoch": 2.863157894736842, "grad_norm": 1.6468027663173628, "learning_rate": 1.0531848411989287e-07, "loss": 1.0301, "step": 3536 }, { "epoch": 2.863967611336032, "grad_norm": 1.704350995042486, "learning_rate": 1.0407795914909258e-07, "loss": 1.0112, "step": 3537 }, { "epoch": 2.8647773279352227, "grad_norm": 1.6680112193827918, "learning_rate": 1.0284474525299459e-07, "loss": 1.0021, "step": 3538 }, { "epoch": 2.865587044534413, "grad_norm": 1.630227489137973, "learning_rate": 1.0161884334269234e-07, "loss": 0.9871, "step": 3539 }, { "epoch": 2.866396761133603, "grad_norm": 1.7269095533127516, "learning_rate": 1.0040025432387801e-07, "loss": 0.9789, "step": 3540 }, { "epoch": 2.8672064777327932, "grad_norm": 1.645077368110772, "learning_rate": 9.918897909684144e-08, "loss": 0.9977, "step": 3541 }, { "epoch": 2.8680161943319837, "grad_norm": 1.6531908723812594, "learning_rate": 9.798501855646792e-08, "loss": 1.0315, "step": 3542 }, { "epoch": 2.8688259109311742, "grad_norm": 1.6773021644682622, "learning_rate": 9.678837359224148e-08, "loss": 0.9943, "step": 3543 }, { "epoch": 2.8696356275303643, "grad_norm": 1.6110461386436772, "learning_rate": 9.559904508823825e-08, "loss": 0.9716, "step": 3544 }, { "epoch": 2.8704453441295548, "grad_norm": 1.547369809807511, "learning_rate": 9.441703392313095e-08, "loss": 0.9595, "step": 3545 }, { "epoch": 2.871255060728745, "grad_norm": 1.6684471743402367, "learning_rate": 9.324234097018436e-08, "loss": 0.9031, "step": 3546 }, { "epoch": 2.8720647773279353, "grad_norm": 1.6770696827512974, "learning_rate": 9.20749670972576e-08, "loss": 0.9372, "step": 3547 }, { "epoch": 2.8728744939271254, "grad_norm": 1.7263050739430634, "learning_rate": 9.091491316680411e-08, "loss": 0.997, "step": 3548 }, { "epoch": 2.873684210526316, "grad_norm": 1.6963472723142492, "learning_rate": 8.976218003586722e-08, "loss": 1.0112, "step": 3549 }, { "epoch": 2.8744939271255063, "grad_norm": 1.7236666081560383, "learning_rate": 8.861676855608237e-08, "loss": 0.9723, "step": 3550 }, { "epoch": 2.8753036437246964, "grad_norm": 1.6959061896936787, "learning_rate": 8.747867957367595e-08, "loss": 0.9236, "step": 3551 }, { "epoch": 2.8761133603238864, "grad_norm": 1.7019359454703777, "learning_rate": 8.634791392946429e-08, "loss": 0.9874, "step": 3552 }, { "epoch": 2.876923076923077, "grad_norm": 1.6598850043781408, "learning_rate": 8.522447245885356e-08, "loss": 0.9448, "step": 3553 }, { "epoch": 2.8777327935222674, "grad_norm": 1.657178469902909, "learning_rate": 8.410835599183875e-08, "loss": 1.0182, "step": 3554 }, { "epoch": 2.8785425101214575, "grad_norm": 1.7487204603764595, "learning_rate": 8.299956535300135e-08, "loss": 0.9883, "step": 3555 }, { "epoch": 2.8793522267206475, "grad_norm": 1.7130380837811467, "learning_rate": 8.189810136151388e-08, "loss": 0.9205, "step": 3556 }, { "epoch": 2.880161943319838, "grad_norm": 1.7416692432911214, "learning_rate": 8.08039648311354e-08, "loss": 1.0081, "step": 3557 }, { "epoch": 2.8809716599190285, "grad_norm": 1.7940488637799301, "learning_rate": 7.971715657020706e-08, "loss": 1.0193, "step": 3558 }, { "epoch": 2.8817813765182185, "grad_norm": 1.7643531171720135, "learning_rate": 7.863767738166217e-08, "loss": 0.9777, "step": 3559 }, { "epoch": 2.882591093117409, "grad_norm": 1.7549243652435764, "learning_rate": 7.756552806301498e-08, "loss": 0.9255, "step": 3560 }, { "epoch": 2.883400809716599, "grad_norm": 1.6037364745875238, "learning_rate": 7.650070940636634e-08, "loss": 1.0126, "step": 3561 }, { "epoch": 2.8842105263157896, "grad_norm": 1.6938156063789074, "learning_rate": 7.54432221984014e-08, "loss": 1.001, "step": 3562 }, { "epoch": 2.8850202429149796, "grad_norm": 1.640394353127388, "learning_rate": 7.43930672203863e-08, "loss": 1.0106, "step": 3563 }, { "epoch": 2.88582995951417, "grad_norm": 1.682827146515058, "learning_rate": 7.335024524817492e-08, "loss": 0.9672, "step": 3564 }, { "epoch": 2.8866396761133606, "grad_norm": 1.64220479848791, "learning_rate": 7.23147570521987e-08, "loss": 1.0053, "step": 3565 }, { "epoch": 2.8874493927125506, "grad_norm": 1.6800144926826894, "learning_rate": 7.128660339747239e-08, "loss": 1.0534, "step": 3566 }, { "epoch": 2.8882591093117407, "grad_norm": 1.6706178385744694, "learning_rate": 7.026578504359394e-08, "loss": 0.9858, "step": 3567 }, { "epoch": 2.889068825910931, "grad_norm": 1.6725170356062997, "learning_rate": 6.925230274474003e-08, "loss": 0.9447, "step": 3568 }, { "epoch": 2.8898785425101217, "grad_norm": 1.7484788389494295, "learning_rate": 6.824615724966843e-08, "loss": 0.982, "step": 3569 }, { "epoch": 2.8906882591093117, "grad_norm": 1.7412794412751307, "learning_rate": 6.724734930171561e-08, "loss": 0.9185, "step": 3570 }, { "epoch": 2.8914979757085018, "grad_norm": 1.7112532122718904, "learning_rate": 6.625587963879909e-08, "loss": 0.884, "step": 3571 }, { "epoch": 2.8923076923076922, "grad_norm": 1.669951486754946, "learning_rate": 6.527174899341071e-08, "loss": 1.0064, "step": 3572 }, { "epoch": 2.8931174089068827, "grad_norm": 1.780604450503182, "learning_rate": 6.429495809262554e-08, "loss": 0.9273, "step": 3573 }, { "epoch": 2.893927125506073, "grad_norm": 1.6302873462180967, "learning_rate": 6.332550765809075e-08, "loss": 0.9506, "step": 3574 }, { "epoch": 2.8947368421052633, "grad_norm": 1.7710784892224958, "learning_rate": 6.236339840603677e-08, "loss": 0.9671, "step": 3575 }, { "epoch": 2.8955465587044533, "grad_norm": 1.7228628615096542, "learning_rate": 6.140863104726391e-08, "loss": 0.9461, "step": 3576 }, { "epoch": 2.896356275303644, "grad_norm": 1.7105420499553026, "learning_rate": 6.046120628715124e-08, "loss": 0.9828, "step": 3577 }, { "epoch": 2.897165991902834, "grad_norm": 1.6456141765558494, "learning_rate": 5.952112482565442e-08, "loss": 0.9371, "step": 3578 }, { "epoch": 2.8979757085020244, "grad_norm": 1.629221588967243, "learning_rate": 5.858838735730232e-08, "loss": 0.9767, "step": 3579 }, { "epoch": 2.898785425101215, "grad_norm": 1.7148433089723767, "learning_rate": 5.766299457119817e-08, "loss": 0.9423, "step": 3580 }, { "epoch": 2.899595141700405, "grad_norm": 1.6770711797146378, "learning_rate": 5.674494715101841e-08, "loss": 0.9575, "step": 3581 }, { "epoch": 2.900404858299595, "grad_norm": 1.7228669637843592, "learning_rate": 5.583424577501273e-08, "loss": 0.982, "step": 3582 }, { "epoch": 2.9012145748987854, "grad_norm": 1.6410483294411622, "learning_rate": 5.4930891116007355e-08, "loss": 0.9662, "step": 3583 }, { "epoch": 2.902024291497976, "grad_norm": 1.695947041652728, "learning_rate": 5.40348838413951e-08, "loss": 0.9336, "step": 3584 }, { "epoch": 2.902834008097166, "grad_norm": 1.7092686814192066, "learning_rate": 5.3146224613144225e-08, "loss": 1.0737, "step": 3585 }, { "epoch": 2.903643724696356, "grad_norm": 1.7080949259326936, "learning_rate": 5.2264914087792885e-08, "loss": 0.9843, "step": 3586 }, { "epoch": 2.9044534412955465, "grad_norm": 1.6572515892039745, "learning_rate": 5.139095291645024e-08, "loss": 1.0179, "step": 3587 }, { "epoch": 2.905263157894737, "grad_norm": 1.6690887807850592, "learning_rate": 5.052434174479759e-08, "loss": 1.0066, "step": 3588 }, { "epoch": 2.906072874493927, "grad_norm": 1.6532884012275588, "learning_rate": 4.966508121308167e-08, "loss": 0.9931, "step": 3589 }, { "epoch": 2.9068825910931175, "grad_norm": 1.7133322607154833, "learning_rate": 4.8813171956123565e-08, "loss": 0.9854, "step": 3590 }, { "epoch": 2.9076923076923076, "grad_norm": 1.6567745485844279, "learning_rate": 4.796861460330982e-08, "loss": 1.0568, "step": 3591 }, { "epoch": 2.908502024291498, "grad_norm": 1.7343588764303524, "learning_rate": 4.713140977859687e-08, "loss": 0.9529, "step": 3592 }, { "epoch": 2.909311740890688, "grad_norm": 1.6208499139806953, "learning_rate": 4.630155810050885e-08, "loss": 1.0186, "step": 3593 }, { "epoch": 2.9101214574898786, "grad_norm": 1.6493435535547711, "learning_rate": 4.547906018213866e-08, "loss": 1.0102, "step": 3594 }, { "epoch": 2.910931174089069, "grad_norm": 1.6692153459998045, "learning_rate": 4.4663916631143554e-08, "loss": 0.9606, "step": 3595 }, { "epoch": 2.911740890688259, "grad_norm": 1.6721684961284737, "learning_rate": 4.3856128049749594e-08, "loss": 0.9796, "step": 3596 }, { "epoch": 2.912550607287449, "grad_norm": 1.7315280769395167, "learning_rate": 4.3055695034747155e-08, "loss": 0.9748, "step": 3597 }, { "epoch": 2.9133603238866397, "grad_norm": 1.6378741930140095, "learning_rate": 4.226261817749544e-08, "loss": 0.9593, "step": 3598 }, { "epoch": 2.91417004048583, "grad_norm": 1.7322062631852746, "learning_rate": 4.147689806391575e-08, "loss": 1.0119, "step": 3599 }, { "epoch": 2.91497975708502, "grad_norm": 1.7138380672008324, "learning_rate": 4.069853527449596e-08, "loss": 0.988, "step": 3600 }, { "epoch": 2.9157894736842103, "grad_norm": 1.7209134121305623, "learning_rate": 3.9927530384288314e-08, "loss": 0.9688, "step": 3601 }, { "epoch": 2.9165991902834008, "grad_norm": 1.7412002078631348, "learning_rate": 3.916388396290716e-08, "loss": 0.9903, "step": 3602 }, { "epoch": 2.9174089068825912, "grad_norm": 1.7072796682597313, "learning_rate": 3.840759657453452e-08, "loss": 0.9591, "step": 3603 }, { "epoch": 2.9182186234817813, "grad_norm": 1.6648056164804599, "learning_rate": 3.7658668777910135e-08, "loss": 0.9767, "step": 3604 }, { "epoch": 2.919028340080972, "grad_norm": 1.7237893699815425, "learning_rate": 3.691710112634139e-08, "loss": 1.0162, "step": 3605 }, { "epoch": 2.919838056680162, "grad_norm": 1.7961044154386043, "learning_rate": 3.61828941676956e-08, "loss": 1.0186, "step": 3606 }, { "epoch": 2.9206477732793523, "grad_norm": 1.6497363776121279, "learning_rate": 3.54560484444022e-08, "loss": 0.909, "step": 3607 }, { "epoch": 2.9214574898785424, "grad_norm": 1.747468367170849, "learning_rate": 3.473656449345275e-08, "loss": 0.89, "step": 3608 }, { "epoch": 2.922267206477733, "grad_norm": 1.762439292957918, "learning_rate": 3.402444284639872e-08, "loss": 0.9111, "step": 3609 }, { "epoch": 2.9230769230769234, "grad_norm": 1.7149546863394658, "learning_rate": 3.3319684029354815e-08, "loss": 0.9661, "step": 3610 }, { "epoch": 2.9238866396761134, "grad_norm": 1.6630865806186383, "learning_rate": 3.262228856299343e-08, "loss": 0.9029, "step": 3611 }, { "epoch": 2.9246963562753034, "grad_norm": 1.7363529145465648, "learning_rate": 3.193225696254798e-08, "loss": 0.9848, "step": 3612 }, { "epoch": 2.925506072874494, "grad_norm": 1.6797985549185608, "learning_rate": 3.124958973781178e-08, "loss": 1.0399, "step": 3613 }, { "epoch": 2.9263157894736844, "grad_norm": 1.6469873014958698, "learning_rate": 3.057428739313695e-08, "loss": 0.9589, "step": 3614 }, { "epoch": 2.9271255060728745, "grad_norm": 1.7250247951219637, "learning_rate": 2.9906350427435505e-08, "loss": 0.937, "step": 3615 }, { "epoch": 2.9279352226720645, "grad_norm": 1.6796115243111611, "learning_rate": 2.924577933417716e-08, "loss": 0.9864, "step": 3616 }, { "epoch": 2.928744939271255, "grad_norm": 1.6550805457651048, "learning_rate": 2.85925746013882e-08, "loss": 1.0157, "step": 3617 }, { "epoch": 2.9295546558704455, "grad_norm": 1.7491759234899906, "learning_rate": 2.7946736711654822e-08, "loss": 1.004, "step": 3618 }, { "epoch": 2.9303643724696355, "grad_norm": 1.64099300892061, "learning_rate": 2.7308266142119788e-08, "loss": 1.0059, "step": 3619 }, { "epoch": 2.931174089068826, "grad_norm": 1.6299619725247847, "learning_rate": 2.667716336448356e-08, "loss": 0.9822, "step": 3620 }, { "epoch": 2.931983805668016, "grad_norm": 1.6619338478776728, "learning_rate": 2.605342884500206e-08, "loss": 0.9879, "step": 3621 }, { "epoch": 2.9327935222672066, "grad_norm": 1.6869289651046775, "learning_rate": 2.5437063044488895e-08, "loss": 0.9443, "step": 3622 }, { "epoch": 2.9336032388663966, "grad_norm": 1.650516004800676, "learning_rate": 2.4828066418314256e-08, "loss": 0.9792, "step": 3623 }, { "epoch": 2.934412955465587, "grad_norm": 1.714896614796574, "learning_rate": 2.422643941640046e-08, "loss": 0.9162, "step": 3624 }, { "epoch": 2.9352226720647776, "grad_norm": 1.614786417424853, "learning_rate": 2.3632182483228628e-08, "loss": 0.9934, "step": 3625 }, { "epoch": 2.9360323886639677, "grad_norm": 1.7199405245576622, "learning_rate": 2.3045296057834232e-08, "loss": 0.9467, "step": 3626 }, { "epoch": 2.9368421052631577, "grad_norm": 1.689626777761019, "learning_rate": 2.2465780573807105e-08, "loss": 0.9896, "step": 3627 }, { "epoch": 2.937651821862348, "grad_norm": 1.618965258362145, "learning_rate": 2.1893636459289213e-08, "loss": 0.9987, "step": 3628 }, { "epoch": 2.9384615384615387, "grad_norm": 1.6953456161595155, "learning_rate": 2.132886413698243e-08, "loss": 0.9242, "step": 3629 }, { "epoch": 2.9392712550607287, "grad_norm": 1.640775504856468, "learning_rate": 2.077146402413521e-08, "loss": 0.9815, "step": 3630 }, { "epoch": 2.9400809716599188, "grad_norm": 1.630716994540923, "learning_rate": 2.0221436532555928e-08, "loss": 0.9879, "step": 3631 }, { "epoch": 2.9408906882591093, "grad_norm": 1.6746857067688603, "learning_rate": 1.9678782068600633e-08, "loss": 0.9805, "step": 3632 }, { "epoch": 2.9417004048582998, "grad_norm": 1.692709178383313, "learning_rate": 1.9143501033181965e-08, "loss": 0.9926, "step": 3633 }, { "epoch": 2.94251012145749, "grad_norm": 1.722482470681197, "learning_rate": 1.8615593821763587e-08, "loss": 1.0066, "step": 3634 }, { "epoch": 2.9433198380566803, "grad_norm": 1.6955658283586104, "learning_rate": 1.80950608243613e-08, "loss": 1.0168, "step": 3635 }, { "epoch": 2.9441295546558703, "grad_norm": 1.6593146288029987, "learning_rate": 1.758190242554303e-08, "loss": 0.9471, "step": 3636 }, { "epoch": 2.944939271255061, "grad_norm": 1.6437181186269008, "learning_rate": 1.7076119004429958e-08, "loss": 1.0134, "step": 3637 }, { "epoch": 2.945748987854251, "grad_norm": 1.7191306396154686, "learning_rate": 1.657771093469096e-08, "loss": 0.8814, "step": 3638 }, { "epoch": 2.9465587044534414, "grad_norm": 1.6761533860830595, "learning_rate": 1.6086678584550374e-08, "loss": 0.9793, "step": 3639 }, { "epoch": 2.9473684210526314, "grad_norm": 1.704443447654609, "learning_rate": 1.5603022316780235e-08, "loss": 0.9372, "step": 3640 }, { "epoch": 2.948178137651822, "grad_norm": 1.7083386239114013, "learning_rate": 1.5126742488703604e-08, "loss": 0.9712, "step": 3641 }, { "epoch": 2.948987854251012, "grad_norm": 1.6835922657350433, "learning_rate": 1.4657839452195677e-08, "loss": 0.9932, "step": 3642 }, { "epoch": 2.9497975708502024, "grad_norm": 1.7376719503586473, "learning_rate": 1.4196313553680453e-08, "loss": 0.8903, "step": 3643 }, { "epoch": 2.950607287449393, "grad_norm": 1.7489663276342433, "learning_rate": 1.3742165134130736e-08, "loss": 0.9544, "step": 3644 }, { "epoch": 2.951417004048583, "grad_norm": 1.6975365462225342, "learning_rate": 1.329539452907036e-08, "loss": 0.9856, "step": 3645 }, { "epoch": 2.952226720647773, "grad_norm": 1.7039651291858493, "learning_rate": 1.285600206857196e-08, "loss": 0.9986, "step": 3646 }, { "epoch": 2.9530364372469635, "grad_norm": 1.6540752577120743, "learning_rate": 1.2423988077258087e-08, "loss": 1.0064, "step": 3647 }, { "epoch": 2.953846153846154, "grad_norm": 1.563653194661081, "learning_rate": 1.1999352874297876e-08, "loss": 1.0462, "step": 3648 }, { "epoch": 2.954655870445344, "grad_norm": 1.6611119980537208, "learning_rate": 1.1582096773410379e-08, "loss": 1.0157, "step": 3649 }, { "epoch": 2.9554655870445345, "grad_norm": 1.6209215418673872, "learning_rate": 1.117222008286456e-08, "loss": 1.0152, "step": 3650 }, { "epoch": 2.9562753036437246, "grad_norm": 1.651092000152933, "learning_rate": 1.0769723105474861e-08, "loss": 0.9698, "step": 3651 }, { "epoch": 2.957085020242915, "grad_norm": 1.6427078147132284, "learning_rate": 1.0374606138605636e-08, "loss": 0.9562, "step": 3652 }, { "epoch": 2.957894736842105, "grad_norm": 1.6736772750275095, "learning_rate": 9.986869474166716e-09, "loss": 0.9024, "step": 3653 }, { "epoch": 2.9587044534412956, "grad_norm": 1.744834463851239, "learning_rate": 9.606513398617846e-09, "loss": 0.8867, "step": 3654 }, { "epoch": 2.9595141700404857, "grad_norm": 1.58327630324171, "learning_rate": 9.233538192963132e-09, "loss": 1.0089, "step": 3655 }, { "epoch": 2.960323886639676, "grad_norm": 1.6438964342520213, "learning_rate": 8.867944132757711e-09, "loss": 0.9828, "step": 3656 }, { "epoch": 2.961133603238866, "grad_norm": 1.7224600116058253, "learning_rate": 8.50973148809997e-09, "loss": 0.9382, "step": 3657 }, { "epoch": 2.9619433198380567, "grad_norm": 1.6459642766819444, "learning_rate": 8.158900523635993e-09, "loss": 0.9835, "step": 3658 }, { "epoch": 2.962753036437247, "grad_norm": 1.6777112954645979, "learning_rate": 7.815451498559557e-09, "loss": 0.9928, "step": 3659 }, { "epoch": 2.9635627530364372, "grad_norm": 1.7128316731395987, "learning_rate": 7.479384666608802e-09, "loss": 0.9999, "step": 3660 }, { "epoch": 2.9643724696356273, "grad_norm": 1.7331941430919588, "learning_rate": 7.150700276068457e-09, "loss": 0.9244, "step": 3661 }, { "epoch": 2.9651821862348178, "grad_norm": 1.6425820505456277, "learning_rate": 6.82939856977094e-09, "loss": 1.0678, "step": 3662 }, { "epoch": 2.9659919028340083, "grad_norm": 1.6923771797910705, "learning_rate": 6.515479785091927e-09, "loss": 0.9698, "step": 3663 }, { "epoch": 2.9668016194331983, "grad_norm": 1.6765814114028854, "learning_rate": 6.208944153953678e-09, "loss": 0.9733, "step": 3664 }, { "epoch": 2.967611336032389, "grad_norm": 1.631450848919302, "learning_rate": 5.909791902823925e-09, "loss": 0.9317, "step": 3665 }, { "epoch": 2.968421052631579, "grad_norm": 1.6990001884177788, "learning_rate": 5.618023252714766e-09, "loss": 0.9603, "step": 3666 }, { "epoch": 2.9692307692307693, "grad_norm": 1.6821450995668512, "learning_rate": 5.333638419184883e-09, "loss": 0.9634, "step": 3667 }, { "epoch": 2.9700404858299594, "grad_norm": 1.6746580159744173, "learning_rate": 5.056637612336212e-09, "loss": 0.8863, "step": 3668 }, { "epoch": 2.97085020242915, "grad_norm": 1.6902742756024816, "learning_rate": 4.787021036816164e-09, "loss": 0.9718, "step": 3669 }, { "epoch": 2.97165991902834, "grad_norm": 1.7087829075461096, "learning_rate": 4.524788891816512e-09, "loss": 0.9482, "step": 3670 }, { "epoch": 2.9724696356275304, "grad_norm": 1.7218111890670291, "learning_rate": 4.269941371073394e-09, "loss": 0.9897, "step": 3671 }, { "epoch": 2.9732793522267205, "grad_norm": 1.6710781319892993, "learning_rate": 4.022478662867313e-09, "loss": 0.9778, "step": 3672 }, { "epoch": 2.974089068825911, "grad_norm": 1.6227351235688, "learning_rate": 3.782400950023135e-09, "loss": 0.9521, "step": 3673 }, { "epoch": 2.9748987854251014, "grad_norm": 1.6542823286693975, "learning_rate": 3.5497084099100907e-09, "loss": 0.9691, "step": 3674 }, { "epoch": 2.9757085020242915, "grad_norm": 1.6462032546016088, "learning_rate": 3.3244012144395545e-09, "loss": 0.9743, "step": 3675 }, { "epoch": 2.9765182186234815, "grad_norm": 1.6774459355985822, "learning_rate": 3.1064795300683735e-09, "loss": 1.0424, "step": 3676 }, { "epoch": 2.977327935222672, "grad_norm": 1.6990599030124527, "learning_rate": 2.8959435177955407e-09, "loss": 0.9464, "step": 3677 }, { "epoch": 2.9781376518218625, "grad_norm": 1.6902993214670898, "learning_rate": 2.692793333165522e-09, "loss": 1.0075, "step": 3678 }, { "epoch": 2.9789473684210526, "grad_norm": 1.623677495815907, "learning_rate": 2.4970291262649272e-09, "loss": 0.9633, "step": 3679 }, { "epoch": 2.979757085020243, "grad_norm": 1.7265662360747522, "learning_rate": 2.3086510417225093e-09, "loss": 1.0392, "step": 3680 }, { "epoch": 2.980566801619433, "grad_norm": 1.6751044099068313, "learning_rate": 2.1276592187124966e-09, "loss": 0.9776, "step": 3681 }, { "epoch": 2.9813765182186236, "grad_norm": 1.6716882137403286, "learning_rate": 1.95405379095126e-09, "loss": 0.9679, "step": 3682 }, { "epoch": 2.9821862348178136, "grad_norm": 1.7059793709856448, "learning_rate": 1.7878348866962047e-09, "loss": 0.9525, "step": 3683 }, { "epoch": 2.982995951417004, "grad_norm": 1.6552987765137663, "learning_rate": 1.6290026287513194e-09, "loss": 1.0131, "step": 3684 }, { "epoch": 2.983805668016194, "grad_norm": 1.6707647453986292, "learning_rate": 1.4775571344605167e-09, "loss": 0.9977, "step": 3685 }, { "epoch": 2.9846153846153847, "grad_norm": 1.6942525860442892, "learning_rate": 1.3334985157109625e-09, "loss": 0.9585, "step": 3686 }, { "epoch": 2.9854251012145747, "grad_norm": 1.6490557158214814, "learning_rate": 1.1968268789330773e-09, "loss": 0.9482, "step": 3687 }, { "epoch": 2.986234817813765, "grad_norm": 1.6249913718833546, "learning_rate": 1.0675423250994244e-09, "loss": 0.9683, "step": 3688 }, { "epoch": 2.9870445344129557, "grad_norm": 1.6778588925327733, "learning_rate": 9.456449497247112e-10, "loss": 0.9532, "step": 3689 }, { "epoch": 2.9878542510121457, "grad_norm": 1.6962491724057556, "learning_rate": 8.311348428657884e-10, "loss": 0.9811, "step": 3690 }, { "epoch": 2.988663967611336, "grad_norm": 1.6716685263614628, "learning_rate": 7.240120891238711e-10, "loss": 0.9267, "step": 3691 }, { "epoch": 2.9894736842105263, "grad_norm": 1.7528045485583463, "learning_rate": 6.242767676400974e-10, "loss": 0.9343, "step": 3692 }, { "epoch": 2.9902834008097168, "grad_norm": 1.7066732550694934, "learning_rate": 5.31928952098859e-10, "loss": 0.956, "step": 3693 }, { "epoch": 2.991093117408907, "grad_norm": 1.6307704468251862, "learning_rate": 4.469687107255816e-10, "loss": 0.984, "step": 3694 }, { "epoch": 2.9919028340080973, "grad_norm": 1.7099555082281752, "learning_rate": 3.6939610628894396e-10, "loss": 0.9458, "step": 3695 }, { "epoch": 2.9927125506072874, "grad_norm": 1.6724491743800372, "learning_rate": 2.9921119609976903e-10, "loss": 0.958, "step": 3696 }, { "epoch": 2.993522267206478, "grad_norm": 1.7205791609019359, "learning_rate": 2.364140320110231e-10, "loss": 0.8848, "step": 3697 }, { "epoch": 2.994331983805668, "grad_norm": 1.7073529607261773, "learning_rate": 1.8100466041559573e-10, "loss": 0.9302, "step": 3698 }, { "epoch": 2.9951417004048584, "grad_norm": 1.6422918196631109, "learning_rate": 1.3298312225074051e-10, "loss": 0.9791, "step": 3699 }, { "epoch": 2.9959514170040484, "grad_norm": 1.7056433896621659, "learning_rate": 9.234945299363418e-11, "loss": 0.9875, "step": 3700 }, { "epoch": 2.996761133603239, "grad_norm": 1.6180923945102257, "learning_rate": 5.910368266470734e-11, "loss": 0.9499, "step": 3701 }, { "epoch": 2.997570850202429, "grad_norm": 1.6943866994325132, "learning_rate": 3.324583582653418e-11, "loss": 1.0206, "step": 3702 }, { "epoch": 2.9983805668016195, "grad_norm": 1.5972662423702781, "learning_rate": 1.477593158272228e-11, "loss": 1.0339, "step": 3703 }, { "epoch": 2.99919028340081, "grad_norm": 1.6264801708756609, "learning_rate": 3.693983577912619e-12, "loss": 0.9317, "step": 3704 }, { "epoch": 3.0, "grad_norm": 1.6339826687405365, "learning_rate": 0.0, "loss": 0.9082, "step": 3705 } ], "logging_steps": 1, "max_steps": 3705, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 618, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.96481039548416e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }