diff --git "a/checkpoint-3596/trainer_state.json" "b/checkpoint-3596/trainer_state.json" deleted file mode 100644--- "a/checkpoint-3596/trainer_state.json" +++ /dev/null @@ -1,25201 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.9834538375973305, - "eval_steps": 500, - "global_step": 3596, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0, - "grad_norm": 0.02996581234037876, - "learning_rate": 2e-05, - "loss": 1.2729, - "step": 1 - }, - { - "epoch": 0.0, - "grad_norm": 0.029612887650728226, - "learning_rate": 4e-05, - "loss": 1.208, - "step": 2 - }, - { - "epoch": 0.0, - "grad_norm": 0.027333809062838554, - "learning_rate": 6e-05, - "loss": 1.0525, - "step": 3 - }, - { - "epoch": 0.0, - "grad_norm": 0.02706489898264408, - "learning_rate": 8e-05, - "loss": 1.021, - "step": 4 - }, - { - "epoch": 0.0, - "grad_norm": 0.033770810812711716, - "learning_rate": 0.0001, - "loss": 1.223, - "step": 5 - }, - { - "epoch": 0.0, - "grad_norm": 0.03023155778646469, - "learning_rate": 0.00012, - "loss": 1.0973, - "step": 6 - }, - { - "epoch": 0.0, - "grad_norm": 0.034239206463098526, - "learning_rate": 0.00014, - "loss": 1.1578, - "step": 7 - }, - { - "epoch": 0.0, - "grad_norm": 0.037418968975543976, - "learning_rate": 0.00016, - "loss": 1.0797, - "step": 8 - }, - { - "epoch": 0.01, - "grad_norm": 0.03651932254433632, - "learning_rate": 0.00018, - "loss": 1.0795, - "step": 9 - }, - { - "epoch": 0.01, - "grad_norm": 0.03634285926818848, - "learning_rate": 0.0002, - "loss": 0.9999, - "step": 10 - }, - { - "epoch": 0.01, - "grad_norm": 0.03892536088824272, - "learning_rate": 0.00019999996162493256, - "loss": 1.0103, - "step": 11 - }, - { - "epoch": 0.01, - "grad_norm": 0.0294953566044569, - "learning_rate": 0.00019999984649975977, - "loss": 1.0729, - "step": 12 - }, - { - "epoch": 0.01, - "grad_norm": 0.04408486932516098, - "learning_rate": 0.00019999965462456993, - "loss": 1.0439, - "step": 13 - }, - { - "epoch": 0.01, - "grad_norm": 0.03400202468037605, - "learning_rate": 0.00019999938599951034, - "loss": 0.8412, - "step": 14 - }, - { - "epoch": 0.01, - "grad_norm": 0.034877073019742966, - "learning_rate": 0.00019999904062478714, - "loss": 0.9438, - "step": 15 - }, - { - "epoch": 0.01, - "grad_norm": 0.0340421162545681, - "learning_rate": 0.0001999986185006654, - "loss": 1.015, - "step": 16 - }, - { - "epoch": 0.01, - "grad_norm": 0.03420884534716606, - "learning_rate": 0.00019999811962746915, - "loss": 0.9792, - "step": 17 - }, - { - "epoch": 0.01, - "grad_norm": 0.03177829086780548, - "learning_rate": 0.00019999754400558124, - "loss": 0.8856, - "step": 18 - }, - { - "epoch": 0.01, - "grad_norm": 0.03189516067504883, - "learning_rate": 0.00019999689163544346, - "loss": 0.8841, - "step": 19 - }, - { - "epoch": 0.01, - "grad_norm": 0.025729015469551086, - "learning_rate": 0.00019999616251755651, - "loss": 0.9524, - "step": 20 - }, - { - "epoch": 0.01, - "grad_norm": 0.02780766598880291, - "learning_rate": 0.00019999535665248002, - "loss": 0.8998, - "step": 21 - }, - { - "epoch": 0.01, - "grad_norm": 0.023537974804639816, - "learning_rate": 0.00019999447404083244, - "loss": 0.9479, - "step": 22 - }, - { - "epoch": 0.01, - "grad_norm": 0.023690495640039444, - "learning_rate": 0.0001999935146832912, - "loss": 0.9138, - "step": 23 - }, - { - "epoch": 0.01, - "grad_norm": 0.023732604458928108, - "learning_rate": 0.00019999247858059257, - "loss": 1.036, - "step": 24 - }, - { - "epoch": 0.01, - 
"grad_norm": 0.02902216836810112, - "learning_rate": 0.00019999136573353184, - "loss": 0.9619, - "step": 25 - }, - { - "epoch": 0.01, - "grad_norm": 0.024132205173373222, - "learning_rate": 0.0001999901761429631, - "loss": 0.9338, - "step": 26 - }, - { - "epoch": 0.02, - "grad_norm": 0.02214036136865616, - "learning_rate": 0.0001999889098097993, - "loss": 0.8229, - "step": 27 - }, - { - "epoch": 0.02, - "grad_norm": 0.023357391357421875, - "learning_rate": 0.0001999875667350124, - "loss": 0.8983, - "step": 28 - }, - { - "epoch": 0.02, - "grad_norm": 0.022214027121663094, - "learning_rate": 0.00019998614691963323, - "loss": 0.7894, - "step": 29 - }, - { - "epoch": 0.02, - "grad_norm": 0.021391017362475395, - "learning_rate": 0.00019998465036475148, - "loss": 0.9419, - "step": 30 - }, - { - "epoch": 0.02, - "grad_norm": 0.02627565711736679, - "learning_rate": 0.0001999830770715157, - "loss": 0.9285, - "step": 31 - }, - { - "epoch": 0.02, - "grad_norm": 0.022590599954128265, - "learning_rate": 0.0001999814270411335, - "loss": 0.9401, - "step": 32 - }, - { - "epoch": 0.02, - "grad_norm": 0.024948442354798317, - "learning_rate": 0.00019997970027487122, - "loss": 0.9069, - "step": 33 - }, - { - "epoch": 0.02, - "grad_norm": 0.02103595621883869, - "learning_rate": 0.00019997789677405416, - "loss": 0.9892, - "step": 34 - }, - { - "epoch": 0.02, - "grad_norm": 0.02005797065794468, - "learning_rate": 0.0001999760165400665, - "loss": 0.9577, - "step": 35 - }, - { - "epoch": 0.02, - "grad_norm": 0.021683409810066223, - "learning_rate": 0.00019997405957435135, - "loss": 0.9099, - "step": 36 - }, - { - "epoch": 0.02, - "grad_norm": 0.020573126152157784, - "learning_rate": 0.00019997202587841066, - "loss": 0.8264, - "step": 37 - }, - { - "epoch": 0.02, - "grad_norm": 0.023178325966000557, - "learning_rate": 0.00019996991545380533, - "loss": 0.8658, - "step": 38 - }, - { - "epoch": 0.02, - "grad_norm": 0.01991512067615986, - "learning_rate": 0.00019996772830215505, - "loss": 0.9239, - "step": 39 - }, - { - "epoch": 0.02, - "grad_norm": 0.02087959088385105, - "learning_rate": 0.00019996546442513854, - "loss": 0.8532, - "step": 40 - }, - { - "epoch": 0.02, - "grad_norm": 0.01916460692882538, - "learning_rate": 0.00019996312382449327, - "loss": 1.0405, - "step": 41 - }, - { - "epoch": 0.02, - "grad_norm": 0.023638034239411354, - "learning_rate": 0.00019996070650201567, - "loss": 0.9984, - "step": 42 - }, - { - "epoch": 0.02, - "grad_norm": 0.023328714072704315, - "learning_rate": 0.000199958212459561, - "loss": 0.8696, - "step": 43 - }, - { - "epoch": 0.02, - "grad_norm": 0.021662859246134758, - "learning_rate": 0.00019995564169904354, - "loss": 0.8519, - "step": 44 - }, - { - "epoch": 0.03, - "grad_norm": 0.024608176201581955, - "learning_rate": 0.0001999529942224363, - "loss": 0.8428, - "step": 45 - }, - { - "epoch": 0.03, - "grad_norm": 0.020750664174556732, - "learning_rate": 0.00019995027003177118, - "loss": 0.8653, - "step": 46 - }, - { - "epoch": 0.03, - "grad_norm": 0.021747557446360588, - "learning_rate": 0.00019994746912913906, - "loss": 0.8973, - "step": 47 - }, - { - "epoch": 0.03, - "grad_norm": 0.02252582646906376, - "learning_rate": 0.00019994459151668957, - "loss": 0.8664, - "step": 48 - }, - { - "epoch": 0.03, - "grad_norm": 0.0229500625282526, - "learning_rate": 0.00019994163719663137, - "loss": 0.8613, - "step": 49 - }, - { - "epoch": 0.03, - "grad_norm": 0.021505488082766533, - "learning_rate": 0.00019993860617123184, - "loss": 0.812, - "step": 50 - }, - { - "epoch": 0.03, - "grad_norm": 
0.020829355344176292, - "learning_rate": 0.0001999354984428173, - "loss": 0.9139, - "step": 51 - }, - { - "epoch": 0.03, - "grad_norm": 0.024170633405447006, - "learning_rate": 0.000199932314013773, - "loss": 0.937, - "step": 52 - }, - { - "epoch": 0.03, - "grad_norm": 0.021097104996442795, - "learning_rate": 0.0001999290528865429, - "loss": 0.9093, - "step": 53 - }, - { - "epoch": 0.03, - "grad_norm": 0.023191392421722412, - "learning_rate": 0.00019992571506363, - "loss": 0.9728, - "step": 54 - }, - { - "epoch": 0.03, - "grad_norm": 0.023909488692879677, - "learning_rate": 0.000199922300547596, - "loss": 0.8279, - "step": 55 - }, - { - "epoch": 0.03, - "grad_norm": 0.024314479902386665, - "learning_rate": 0.0001999188093410616, - "loss": 0.8118, - "step": 56 - }, - { - "epoch": 0.03, - "grad_norm": 0.022049203515052795, - "learning_rate": 0.00019991524144670635, - "loss": 0.8637, - "step": 57 - }, - { - "epoch": 0.03, - "grad_norm": 0.030246440321207047, - "learning_rate": 0.0001999115968672685, - "loss": 0.8515, - "step": 58 - }, - { - "epoch": 0.03, - "grad_norm": 0.021488850936293602, - "learning_rate": 0.00019990787560554538, - "loss": 0.844, - "step": 59 - }, - { - "epoch": 0.03, - "grad_norm": 0.022307060658931732, - "learning_rate": 0.00019990407766439297, - "loss": 0.7771, - "step": 60 - }, - { - "epoch": 0.03, - "grad_norm": 0.027180947363376617, - "learning_rate": 0.00019990020304672628, - "loss": 0.9951, - "step": 61 - }, - { - "epoch": 0.03, - "grad_norm": 0.02594638243317604, - "learning_rate": 0.000199896251755519, - "loss": 0.8643, - "step": 62 - }, - { - "epoch": 0.04, - "grad_norm": 0.025260182097554207, - "learning_rate": 0.00019989222379380384, - "loss": 0.9858, - "step": 63 - }, - { - "epoch": 0.04, - "grad_norm": 0.029186906293034554, - "learning_rate": 0.0001998881191646722, - "loss": 0.7514, - "step": 64 - }, - { - "epoch": 0.04, - "grad_norm": 0.02333957888185978, - "learning_rate": 0.00019988393787127441, - "loss": 0.7914, - "step": 65 - }, - { - "epoch": 0.04, - "grad_norm": 0.0235314778983593, - "learning_rate": 0.00019987967991681965, - "loss": 0.8198, - "step": 66 - }, - { - "epoch": 0.04, - "grad_norm": 0.023193303495645523, - "learning_rate": 0.00019987534530457583, - "loss": 0.8976, - "step": 67 - }, - { - "epoch": 0.04, - "grad_norm": 0.026709623634815216, - "learning_rate": 0.00019987093403786986, - "loss": 0.8833, - "step": 68 - }, - { - "epoch": 0.04, - "grad_norm": 0.024577545002102852, - "learning_rate": 0.00019986644612008728, - "loss": 0.8259, - "step": 69 - }, - { - "epoch": 0.04, - "grad_norm": 0.027963118627667427, - "learning_rate": 0.0001998618815546727, - "loss": 0.8767, - "step": 70 - }, - { - "epoch": 0.04, - "grad_norm": 0.02565196342766285, - "learning_rate": 0.00019985724034512936, - "loss": 0.7295, - "step": 71 - }, - { - "epoch": 0.04, - "grad_norm": 0.026185255497694016, - "learning_rate": 0.0001998525224950194, - "loss": 0.876, - "step": 72 - }, - { - "epoch": 0.04, - "grad_norm": 0.023345299065113068, - "learning_rate": 0.00019984772800796377, - "loss": 0.997, - "step": 73 - }, - { - "epoch": 0.04, - "grad_norm": 0.024012910202145576, - "learning_rate": 0.00019984285688764226, - "loss": 0.9337, - "step": 74 - }, - { - "epoch": 0.04, - "grad_norm": 0.02843514457345009, - "learning_rate": 0.00019983790913779347, - "loss": 0.8949, - "step": 75 - }, - { - "epoch": 0.04, - "grad_norm": 0.022462787106633186, - "learning_rate": 0.0001998328847622148, - "loss": 0.9018, - "step": 76 - }, - { - "epoch": 0.04, - "grad_norm": 
0.024008071050047874, - "learning_rate": 0.00019982778376476245, - "loss": 0.8376, - "step": 77 - }, - { - "epoch": 0.04, - "grad_norm": 0.023886198177933693, - "learning_rate": 0.00019982260614935144, - "loss": 0.8118, - "step": 78 - }, - { - "epoch": 0.04, - "grad_norm": 0.025386439636349678, - "learning_rate": 0.00019981735191995563, - "loss": 0.9161, - "step": 79 - }, - { - "epoch": 0.04, - "grad_norm": 0.02613622136414051, - "learning_rate": 0.0001998120210806076, - "loss": 0.9008, - "step": 80 - }, - { - "epoch": 0.05, - "grad_norm": 0.02593972161412239, - "learning_rate": 0.00019980661363539883, - "loss": 0.9567, - "step": 81 - }, - { - "epoch": 0.05, - "grad_norm": 0.02825133316218853, - "learning_rate": 0.0001998011295884795, - "loss": 0.8386, - "step": 82 - }, - { - "epoch": 0.05, - "grad_norm": 0.025737276300787926, - "learning_rate": 0.00019979556894405862, - "loss": 0.7957, - "step": 83 - }, - { - "epoch": 0.05, - "grad_norm": 0.023754647001624107, - "learning_rate": 0.00019978993170640403, - "loss": 0.8167, - "step": 84 - }, - { - "epoch": 0.05, - "grad_norm": 0.02431335486471653, - "learning_rate": 0.00019978421787984228, - "loss": 0.7568, - "step": 85 - }, - { - "epoch": 0.05, - "grad_norm": 0.024608569219708443, - "learning_rate": 0.00019977842746875878, - "loss": 0.7992, - "step": 86 - }, - { - "epoch": 0.05, - "grad_norm": 0.024010393768548965, - "learning_rate": 0.00019977256047759765, - "loss": 0.9094, - "step": 87 - }, - { - "epoch": 0.05, - "grad_norm": 0.023806700482964516, - "learning_rate": 0.00019976661691086182, - "loss": 0.8677, - "step": 88 - }, - { - "epoch": 0.05, - "grad_norm": 0.02781900390982628, - "learning_rate": 0.00019976059677311297, - "loss": 0.7736, - "step": 89 - }, - { - "epoch": 0.05, - "grad_norm": 0.027234690263867378, - "learning_rate": 0.0001997545000689716, - "loss": 0.789, - "step": 90 - }, - { - "epoch": 0.05, - "grad_norm": 0.028865503147244453, - "learning_rate": 0.00019974832680311688, - "loss": 0.9041, - "step": 91 - }, - { - "epoch": 0.05, - "grad_norm": 0.05750074237585068, - "learning_rate": 0.00019974207698028685, - "loss": 0.8886, - "step": 92 - }, - { - "epoch": 0.05, - "grad_norm": 0.025579992681741714, - "learning_rate": 0.00019973575060527825, - "loss": 0.835, - "step": 93 - }, - { - "epoch": 0.05, - "grad_norm": 0.02479216828942299, - "learning_rate": 0.00019972934768294659, - "loss": 0.765, - "step": 94 - }, - { - "epoch": 0.05, - "grad_norm": 0.024221569299697876, - "learning_rate": 0.0001997228682182061, - "loss": 0.7935, - "step": 95 - }, - { - "epoch": 0.05, - "grad_norm": 0.025168055668473244, - "learning_rate": 0.00019971631221602976, - "loss": 0.7801, - "step": 96 - }, - { - "epoch": 0.05, - "grad_norm": 0.027467774227261543, - "learning_rate": 0.00019970967968144937, - "loss": 0.8287, - "step": 97 - }, - { - "epoch": 0.05, - "grad_norm": 0.023378336802124977, - "learning_rate": 0.00019970297061955533, - "loss": 0.7486, - "step": 98 - }, - { - "epoch": 0.06, - "grad_norm": 0.025852292776107788, - "learning_rate": 0.00019969618503549693, - "loss": 0.8939, - "step": 99 - }, - { - "epoch": 0.06, - "grad_norm": 0.02506246417760849, - "learning_rate": 0.00019968932293448207, - "loss": 0.8112, - "step": 100 - }, - { - "epoch": 0.06, - "grad_norm": 0.026856260374188423, - "learning_rate": 0.00019968238432177744, - "loss": 0.8782, - "step": 101 - }, - { - "epoch": 0.06, - "grad_norm": 0.028643622994422913, - "learning_rate": 0.0001996753692027084, - "loss": 0.7884, - "step": 102 - }, - { - "epoch": 0.06, - "grad_norm": 
0.029276320710778236, - "learning_rate": 0.00019966827758265913, - "loss": 0.8926, - "step": 103 - }, - { - "epoch": 0.06, - "grad_norm": 0.027838734909892082, - "learning_rate": 0.00019966110946707244, - "loss": 0.87, - "step": 104 - }, - { - "epoch": 0.06, - "grad_norm": 0.026706190779805183, - "learning_rate": 0.0001996538648614498, - "loss": 0.9119, - "step": 105 - }, - { - "epoch": 0.06, - "grad_norm": 0.024687886238098145, - "learning_rate": 0.00019964654377135153, - "loss": 0.7647, - "step": 106 - }, - { - "epoch": 0.06, - "grad_norm": 0.02482466958463192, - "learning_rate": 0.00019963914620239656, - "loss": 0.894, - "step": 107 - }, - { - "epoch": 0.06, - "grad_norm": 0.024776702746748924, - "learning_rate": 0.0001996316721602625, - "loss": 0.7515, - "step": 108 - }, - { - "epoch": 0.06, - "grad_norm": 0.02663383074104786, - "learning_rate": 0.00019962412165068573, - "loss": 0.8499, - "step": 109 - }, - { - "epoch": 0.06, - "grad_norm": 0.027590934187173843, - "learning_rate": 0.00019961649467946125, - "loss": 0.7446, - "step": 110 - }, - { - "epoch": 0.06, - "grad_norm": 0.025346562266349792, - "learning_rate": 0.0001996087912524428, - "loss": 0.8308, - "step": 111 - }, - { - "epoch": 0.06, - "grad_norm": 0.02552192285656929, - "learning_rate": 0.0001996010113755427, - "loss": 0.8552, - "step": 112 - }, - { - "epoch": 0.06, - "grad_norm": 0.024720516055822372, - "learning_rate": 0.0001995931550547321, - "loss": 0.8648, - "step": 113 - }, - { - "epoch": 0.06, - "grad_norm": 0.02904409170150757, - "learning_rate": 0.0001995852222960407, - "loss": 0.8826, - "step": 114 - }, - { - "epoch": 0.06, - "grad_norm": 0.027031444013118744, - "learning_rate": 0.00019957721310555693, - "loss": 0.8748, - "step": 115 - }, - { - "epoch": 0.06, - "grad_norm": 0.025683356449007988, - "learning_rate": 0.0001995691274894278, - "loss": 0.8441, - "step": 116 - }, - { - "epoch": 0.07, - "grad_norm": 0.03195862099528313, - "learning_rate": 0.00019956096545385905, - "loss": 0.7829, - "step": 117 - }, - { - "epoch": 0.07, - "grad_norm": 0.025551646947860718, - "learning_rate": 0.00019955272700511507, - "loss": 0.7934, - "step": 118 - }, - { - "epoch": 0.07, - "grad_norm": 0.028811989352107048, - "learning_rate": 0.0001995444121495189, - "loss": 0.8001, - "step": 119 - }, - { - "epoch": 0.07, - "grad_norm": 0.030996061861515045, - "learning_rate": 0.00019953602089345217, - "loss": 0.7128, - "step": 120 - }, - { - "epoch": 0.07, - "grad_norm": 0.026600060984492302, - "learning_rate": 0.00019952755324335514, - "loss": 0.7414, - "step": 121 - }, - { - "epoch": 0.07, - "grad_norm": 0.025535378605127335, - "learning_rate": 0.00019951900920572684, - "loss": 0.8381, - "step": 122 - }, - { - "epoch": 0.07, - "grad_norm": 0.035039834678173065, - "learning_rate": 0.00019951038878712475, - "loss": 0.9607, - "step": 123 - }, - { - "epoch": 0.07, - "grad_norm": 0.03230242058634758, - "learning_rate": 0.00019950169199416513, - "loss": 0.876, - "step": 124 - }, - { - "epoch": 0.07, - "grad_norm": 0.031107056885957718, - "learning_rate": 0.0001994929188335227, - "loss": 0.8444, - "step": 125 - }, - { - "epoch": 0.07, - "grad_norm": 0.031076829880475998, - "learning_rate": 0.0001994840693119309, - "loss": 0.8577, - "step": 126 - }, - { - "epoch": 0.07, - "grad_norm": 0.028398986905813217, - "learning_rate": 0.0001994751434361818, - "loss": 0.7828, - "step": 127 - }, - { - "epoch": 0.07, - "grad_norm": 0.027821028605103493, - "learning_rate": 0.00019946614121312594, - "loss": 0.8243, - "step": 128 - }, - { - "epoch": 0.07, - 
"grad_norm": 0.030113285407423973, - "learning_rate": 0.0001994570626496726, - "loss": 0.7923, - "step": 129 - }, - { - "epoch": 0.07, - "grad_norm": 0.028168270364403725, - "learning_rate": 0.00019944790775278955, - "loss": 0.722, - "step": 130 - }, - { - "epoch": 0.07, - "grad_norm": 0.030299954116344452, - "learning_rate": 0.0001994386765295032, - "loss": 0.8462, - "step": 131 - }, - { - "epoch": 0.07, - "grad_norm": 0.029045602306723595, - "learning_rate": 0.00019942936898689854, - "loss": 0.7916, - "step": 132 - }, - { - "epoch": 0.07, - "grad_norm": 0.02803182788193226, - "learning_rate": 0.0001994199851321191, - "loss": 0.7949, - "step": 133 - }, - { - "epoch": 0.07, - "grad_norm": 0.029968315735459328, - "learning_rate": 0.00019941052497236703, - "loss": 0.9672, - "step": 134 - }, - { - "epoch": 0.08, - "grad_norm": 0.027893947437405586, - "learning_rate": 0.00019940098851490298, - "loss": 0.8284, - "step": 135 - }, - { - "epoch": 0.08, - "grad_norm": 0.030356327071785927, - "learning_rate": 0.0001993913757670462, - "loss": 0.8001, - "step": 136 - }, - { - "epoch": 0.08, - "grad_norm": 0.02701139822602272, - "learning_rate": 0.00019938168673617454, - "loss": 0.8137, - "step": 137 - }, - { - "epoch": 0.08, - "grad_norm": 0.032546404749155045, - "learning_rate": 0.00019937192142972427, - "loss": 0.7451, - "step": 138 - }, - { - "epoch": 0.08, - "grad_norm": 0.02752624824643135, - "learning_rate": 0.0001993620798551903, - "loss": 0.9893, - "step": 139 - }, - { - "epoch": 0.08, - "grad_norm": 0.02512490376830101, - "learning_rate": 0.0001993521620201261, - "loss": 0.9108, - "step": 140 - }, - { - "epoch": 0.08, - "grad_norm": 0.029267175123095512, - "learning_rate": 0.00019934216793214354, - "loss": 0.8274, - "step": 141 - }, - { - "epoch": 0.08, - "grad_norm": 0.02983001247048378, - "learning_rate": 0.00019933209759891317, - "loss": 0.8394, - "step": 142 - }, - { - "epoch": 0.08, - "grad_norm": 0.031017007306218147, - "learning_rate": 0.0001993219510281639, - "loss": 0.8455, - "step": 143 - }, - { - "epoch": 0.08, - "grad_norm": 0.02880048379302025, - "learning_rate": 0.00019931172822768335, - "loss": 0.8257, - "step": 144 - }, - { - "epoch": 0.08, - "grad_norm": 0.03203625977039337, - "learning_rate": 0.00019930142920531743, - "loss": 0.8693, - "step": 145 - }, - { - "epoch": 0.08, - "grad_norm": 0.027382776141166687, - "learning_rate": 0.0001992910539689707, - "loss": 0.8478, - "step": 146 - }, - { - "epoch": 0.08, - "grad_norm": 0.03054049052298069, - "learning_rate": 0.00019928060252660617, - "loss": 0.9593, - "step": 147 - }, - { - "epoch": 0.08, - "grad_norm": 0.026648158207535744, - "learning_rate": 0.00019927007488624535, - "loss": 0.8915, - "step": 148 - }, - { - "epoch": 0.08, - "grad_norm": 0.03315131738781929, - "learning_rate": 0.00019925947105596816, - "loss": 0.9008, - "step": 149 - }, - { - "epoch": 0.08, - "grad_norm": 0.027361949905753136, - "learning_rate": 0.00019924879104391309, - "loss": 0.8705, - "step": 150 - }, - { - "epoch": 0.08, - "grad_norm": 0.029892416670918465, - "learning_rate": 0.0001992380348582771, - "loss": 0.9083, - "step": 151 - }, - { - "epoch": 0.08, - "grad_norm": 0.027648137882351875, - "learning_rate": 0.00019922720250731553, - "loss": 0.7381, - "step": 152 - }, - { - "epoch": 0.09, - "grad_norm": 0.02656717598438263, - "learning_rate": 0.00019921629399934223, - "loss": 0.9495, - "step": 153 - }, - { - "epoch": 0.09, - "grad_norm": 0.027288291603326797, - "learning_rate": 0.0001992053093427295, - "loss": 0.8505, - "step": 154 - }, - { - 
"epoch": 0.09, - "grad_norm": 0.032024212181568146, - "learning_rate": 0.0001991942485459081, - "loss": 0.8288, - "step": 155 - }, - { - "epoch": 0.09, - "grad_norm": 0.027418840676546097, - "learning_rate": 0.00019918311161736717, - "loss": 0.8137, - "step": 156 - }, - { - "epoch": 0.09, - "grad_norm": 0.03174857795238495, - "learning_rate": 0.0001991718985656543, - "loss": 0.8754, - "step": 157 - }, - { - "epoch": 0.09, - "grad_norm": 0.02887050248682499, - "learning_rate": 0.0001991606093993756, - "loss": 0.8497, - "step": 158 - }, - { - "epoch": 0.09, - "grad_norm": 0.03064458630979061, - "learning_rate": 0.00019914924412719546, - "loss": 0.9331, - "step": 159 - }, - { - "epoch": 0.09, - "grad_norm": 0.02989426627755165, - "learning_rate": 0.00019913780275783674, - "loss": 0.9075, - "step": 160 - }, - { - "epoch": 0.09, - "grad_norm": 0.03449084982275963, - "learning_rate": 0.00019912628530008075, - "loss": 0.8531, - "step": 161 - }, - { - "epoch": 0.09, - "grad_norm": 0.03213748335838318, - "learning_rate": 0.00019911469176276712, - "loss": 0.7411, - "step": 162 - }, - { - "epoch": 0.09, - "grad_norm": 0.031524017453193665, - "learning_rate": 0.0001991030221547939, - "loss": 0.8003, - "step": 163 - }, - { - "epoch": 0.09, - "grad_norm": 0.028880394995212555, - "learning_rate": 0.00019909127648511755, - "loss": 0.86, - "step": 164 - }, - { - "epoch": 0.09, - "grad_norm": 0.0295130405575037, - "learning_rate": 0.0001990794547627529, - "loss": 0.802, - "step": 165 - }, - { - "epoch": 0.09, - "grad_norm": 0.032397400587797165, - "learning_rate": 0.00019906755699677312, - "loss": 0.8574, - "step": 166 - }, - { - "epoch": 0.09, - "grad_norm": 0.030780531466007233, - "learning_rate": 0.00019905558319630978, - "loss": 0.8748, - "step": 167 - }, - { - "epoch": 0.09, - "grad_norm": 0.026546768844127655, - "learning_rate": 0.0001990435333705527, - "loss": 0.8099, - "step": 168 - }, - { - "epoch": 0.09, - "grad_norm": 0.03083784319460392, - "learning_rate": 0.00019903140752875026, - "loss": 0.7867, - "step": 169 - }, - { - "epoch": 0.09, - "grad_norm": 0.02955833077430725, - "learning_rate": 0.000199019205680209, - "loss": 0.8806, - "step": 170 - }, - { - "epoch": 0.1, - "grad_norm": 0.029685625806450844, - "learning_rate": 0.00019900692783429385, - "loss": 0.9009, - "step": 171 - }, - { - "epoch": 0.1, - "grad_norm": 0.03194108605384827, - "learning_rate": 0.00019899457400042808, - "loss": 0.7654, - "step": 172 - }, - { - "epoch": 0.1, - "grad_norm": 0.032116640359163284, - "learning_rate": 0.0001989821441880933, - "loss": 1.0055, - "step": 173 - }, - { - "epoch": 0.1, - "grad_norm": 0.03061555325984955, - "learning_rate": 0.00019896963840682937, - "loss": 0.782, - "step": 174 - }, - { - "epoch": 0.1, - "grad_norm": 0.030177190899848938, - "learning_rate": 0.0001989570566662345, - "loss": 0.7403, - "step": 175 - }, - { - "epoch": 0.1, - "grad_norm": 0.031266387552022934, - "learning_rate": 0.00019894439897596522, - "loss": 0.8553, - "step": 176 - }, - { - "epoch": 0.1, - "grad_norm": 0.032219842076301575, - "learning_rate": 0.0001989316653457363, - "loss": 0.7479, - "step": 177 - }, - { - "epoch": 0.1, - "grad_norm": 0.029450630769133568, - "learning_rate": 0.0001989188557853208, - "loss": 0.7373, - "step": 178 - }, - { - "epoch": 0.1, - "grad_norm": 0.03027605637907982, - "learning_rate": 0.00019890597030455015, - "loss": 0.7844, - "step": 179 - }, - { - "epoch": 0.1, - "grad_norm": 0.03511941432952881, - "learning_rate": 0.00019889300891331392, - "loss": 0.8679, - "step": 180 - }, - { - 
"epoch": 0.1, - "grad_norm": 0.031290799379348755, - "learning_rate": 0.00019887997162155996, - "loss": 0.7283, - "step": 181 - }, - { - "epoch": 0.1, - "grad_norm": 0.02689247764647007, - "learning_rate": 0.0001988668584392945, - "loss": 0.7628, - "step": 182 - }, - { - "epoch": 0.1, - "grad_norm": 0.030273688957095146, - "learning_rate": 0.0001988536693765818, - "loss": 0.7985, - "step": 183 - }, - { - "epoch": 0.1, - "grad_norm": 0.030390895903110504, - "learning_rate": 0.00019884040444354461, - "loss": 0.8086, - "step": 184 - }, - { - "epoch": 0.1, - "grad_norm": 0.029167182743549347, - "learning_rate": 0.00019882706365036373, - "loss": 0.7081, - "step": 185 - }, - { - "epoch": 0.1, - "grad_norm": 0.03029320389032364, - "learning_rate": 0.00019881364700727823, - "loss": 0.6738, - "step": 186 - }, - { - "epoch": 0.1, - "grad_norm": 0.028749890625476837, - "learning_rate": 0.00019880015452458544, - "loss": 0.812, - "step": 187 - }, - { - "epoch": 0.1, - "grad_norm": 0.030206618830561638, - "learning_rate": 0.0001987865862126408, - "loss": 0.8358, - "step": 188 - }, - { - "epoch": 0.11, - "grad_norm": 0.030770782381296158, - "learning_rate": 0.00019877294208185803, - "loss": 0.7326, - "step": 189 - }, - { - "epoch": 0.11, - "grad_norm": 0.03188487887382507, - "learning_rate": 0.00019875922214270903, - "loss": 0.817, - "step": 190 - }, - { - "epoch": 0.11, - "grad_norm": 0.031357571482658386, - "learning_rate": 0.00019874542640572384, - "loss": 0.7981, - "step": 191 - }, - { - "epoch": 0.11, - "grad_norm": 0.06510663032531738, - "learning_rate": 0.00019873155488149078, - "loss": 0.8416, - "step": 192 - }, - { - "epoch": 0.11, - "grad_norm": 0.030021656304597855, - "learning_rate": 0.0001987176075806562, - "loss": 0.7412, - "step": 193 - }, - { - "epoch": 0.11, - "grad_norm": 0.033732689917087555, - "learning_rate": 0.00019870358451392467, - "loss": 1.0041, - "step": 194 - }, - { - "epoch": 0.11, - "grad_norm": 0.02999632991850376, - "learning_rate": 0.00019868948569205897, - "loss": 0.7975, - "step": 195 - }, - { - "epoch": 0.11, - "grad_norm": 0.02894209697842598, - "learning_rate": 0.0001986753111258799, - "loss": 0.7465, - "step": 196 - }, - { - "epoch": 0.11, - "grad_norm": 0.033549994230270386, - "learning_rate": 0.0001986610608262665, - "loss": 0.8114, - "step": 197 - }, - { - "epoch": 0.11, - "grad_norm": 0.03032383695244789, - "learning_rate": 0.00019864673480415589, - "loss": 0.7155, - "step": 198 - }, - { - "epoch": 0.11, - "grad_norm": 0.03509935364127159, - "learning_rate": 0.00019863233307054325, - "loss": 0.8527, - "step": 199 - }, - { - "epoch": 0.11, - "grad_norm": 0.03238251060247421, - "learning_rate": 0.00019861785563648202, - "loss": 0.7574, - "step": 200 - }, - { - "epoch": 0.11, - "grad_norm": 0.03257328271865845, - "learning_rate": 0.00019860330251308362, - "loss": 0.9374, - "step": 201 - }, - { - "epoch": 0.11, - "grad_norm": 0.03191230073571205, - "learning_rate": 0.00019858867371151754, - "loss": 0.8127, - "step": 202 - }, - { - "epoch": 0.11, - "grad_norm": 0.030960606411099434, - "learning_rate": 0.0001985739692430115, - "loss": 0.7888, - "step": 203 - }, - { - "epoch": 0.11, - "grad_norm": 0.03744737431406975, - "learning_rate": 0.0001985591891188511, - "loss": 0.8152, - "step": 204 - }, - { - "epoch": 0.11, - "grad_norm": 0.03747240826487541, - "learning_rate": 0.0001985443333503802, - "loss": 0.7457, - "step": 205 - }, - { - "epoch": 0.11, - "grad_norm": 0.030524997040629387, - "learning_rate": 0.00019852940194900053, - "loss": 0.8093, - "step": 206 - }, - { 
- "epoch": 0.12, - "grad_norm": 0.032599907368421555, - "learning_rate": 0.00019851439492617204, - "loss": 0.7419, - "step": 207 - }, - { - "epoch": 0.12, - "grad_norm": 0.0342191606760025, - "learning_rate": 0.00019849931229341258, - "loss": 0.7183, - "step": 208 - }, - { - "epoch": 0.12, - "grad_norm": 0.03359980136156082, - "learning_rate": 0.00019848415406229812, - "loss": 0.8392, - "step": 209 - }, - { - "epoch": 0.12, - "grad_norm": 0.03139995038509369, - "learning_rate": 0.00019846892024446265, - "loss": 0.7657, - "step": 210 - }, - { - "epoch": 0.12, - "grad_norm": 0.029806002974510193, - "learning_rate": 0.00019845361085159806, - "loss": 0.6918, - "step": 211 - }, - { - "epoch": 0.12, - "grad_norm": 0.03195658698678017, - "learning_rate": 0.00019843822589545441, - "loss": 0.8361, - "step": 212 - }, - { - "epoch": 0.12, - "grad_norm": 0.03134671226143837, - "learning_rate": 0.00019842276538783966, - "loss": 0.9247, - "step": 213 - }, - { - "epoch": 0.12, - "grad_norm": 0.03182549774646759, - "learning_rate": 0.00019840722934061974, - "loss": 0.8478, - "step": 214 - }, - { - "epoch": 0.12, - "grad_norm": 0.035036999732255936, - "learning_rate": 0.00019839161776571864, - "loss": 0.7699, - "step": 215 - }, - { - "epoch": 0.12, - "grad_norm": 0.030924562364816666, - "learning_rate": 0.00019837593067511823, - "loss": 0.8543, - "step": 216 - }, - { - "epoch": 0.12, - "grad_norm": 0.03198299929499626, - "learning_rate": 0.00019836016808085836, - "loss": 0.761, - "step": 217 - }, - { - "epoch": 0.12, - "grad_norm": 0.03266490250825882, - "learning_rate": 0.00019834432999503684, - "loss": 0.7841, - "step": 218 - }, - { - "epoch": 0.12, - "grad_norm": 0.03606751561164856, - "learning_rate": 0.00019832841642980945, - "loss": 0.8398, - "step": 219 - }, - { - "epoch": 0.12, - "grad_norm": 0.03128010407090187, - "learning_rate": 0.00019831242739738985, - "loss": 0.7795, - "step": 220 - }, - { - "epoch": 0.12, - "grad_norm": 0.031938567757606506, - "learning_rate": 0.00019829636291004968, - "loss": 0.7383, - "step": 221 - }, - { - "epoch": 0.12, - "grad_norm": 0.03146064281463623, - "learning_rate": 0.00019828022298011842, - "loss": 0.8463, - "step": 222 - }, - { - "epoch": 0.12, - "grad_norm": 0.03626205027103424, - "learning_rate": 0.00019826400761998353, - "loss": 0.9204, - "step": 223 - }, - { - "epoch": 0.12, - "grad_norm": 0.03110690787434578, - "learning_rate": 0.00019824771684209025, - "loss": 0.7676, - "step": 224 - }, - { - "epoch": 0.13, - "grad_norm": 0.0522528775036335, - "learning_rate": 0.00019823135065894185, - "loss": 0.8221, - "step": 225 - }, - { - "epoch": 0.13, - "grad_norm": 0.031512025743722916, - "learning_rate": 0.00019821490908309932, - "loss": 0.745, - "step": 226 - }, - { - "epoch": 0.13, - "grad_norm": 0.03326869383454323, - "learning_rate": 0.00019819839212718168, - "loss": 0.8885, - "step": 227 - }, - { - "epoch": 0.13, - "grad_norm": 0.03730284795165062, - "learning_rate": 0.00019818179980386563, - "loss": 0.7942, - "step": 228 - }, - { - "epoch": 0.13, - "grad_norm": 0.03505445271730423, - "learning_rate": 0.00019816513212588585, - "loss": 0.7352, - "step": 229 - }, - { - "epoch": 0.13, - "grad_norm": 0.03741452097892761, - "learning_rate": 0.00019814838910603481, - "loss": 0.9211, - "step": 230 - }, - { - "epoch": 0.13, - "grad_norm": 0.03283902630209923, - "learning_rate": 0.00019813157075716277, - "loss": 0.9182, - "step": 231 - }, - { - "epoch": 0.13, - "grad_norm": 0.0364568717777729, - "learning_rate": 0.0001981146770921779, - "loss": 0.9211, - "step": 232 - 
}, - { - "epoch": 0.13, - "grad_norm": 0.03377554938197136, - "learning_rate": 0.000198097708124046, - "loss": 0.8209, - "step": 233 - }, - { - "epoch": 0.13, - "grad_norm": 0.030919229611754417, - "learning_rate": 0.00019808066386579085, - "loss": 0.7564, - "step": 234 - }, - { - "epoch": 0.13, - "grad_norm": 0.032216716557741165, - "learning_rate": 0.00019806354433049393, - "loss": 0.8913, - "step": 235 - }, - { - "epoch": 0.13, - "grad_norm": 0.038493942469358444, - "learning_rate": 0.00019804634953129454, - "loss": 0.701, - "step": 236 - }, - { - "epoch": 0.13, - "grad_norm": 0.047857046127319336, - "learning_rate": 0.00019802907948138963, - "loss": 1.0259, - "step": 237 - }, - { - "epoch": 0.13, - "grad_norm": 0.0336415059864521, - "learning_rate": 0.00019801173419403405, - "loss": 0.9172, - "step": 238 - }, - { - "epoch": 0.13, - "grad_norm": 0.030060969293117523, - "learning_rate": 0.00019799431368254034, - "loss": 0.8397, - "step": 239 - }, - { - "epoch": 0.13, - "grad_norm": 0.043135106563568115, - "learning_rate": 0.0001979768179602787, - "loss": 0.9176, - "step": 240 - }, - { - "epoch": 0.13, - "grad_norm": 0.03419271484017372, - "learning_rate": 0.00019795924704067721, - "loss": 0.9116, - "step": 241 - }, - { - "epoch": 0.13, - "grad_norm": 0.03154682740569115, - "learning_rate": 0.0001979416009372215, - "loss": 0.7786, - "step": 242 - }, - { - "epoch": 0.14, - "grad_norm": 0.03246486559510231, - "learning_rate": 0.000197923879663455, - "loss": 0.7744, - "step": 243 - }, - { - "epoch": 0.14, - "grad_norm": 0.03503431752324104, - "learning_rate": 0.0001979060832329788, - "loss": 0.7365, - "step": 244 - }, - { - "epoch": 0.14, - "grad_norm": 0.03330378606915474, - "learning_rate": 0.00019788821165945172, - "loss": 0.6759, - "step": 245 - }, - { - "epoch": 0.14, - "grad_norm": 0.030394921079277992, - "learning_rate": 0.00019787026495659022, - "loss": 0.7804, - "step": 246 - }, - { - "epoch": 0.14, - "grad_norm": 0.032463330775499344, - "learning_rate": 0.00019785224313816836, - "loss": 0.8431, - "step": 247 - }, - { - "epoch": 0.14, - "grad_norm": 0.032576996833086014, - "learning_rate": 0.00019783414621801798, - "loss": 0.787, - "step": 248 - }, - { - "epoch": 0.14, - "grad_norm": 0.03479291498661041, - "learning_rate": 0.00019781597421002843, - "loss": 0.8343, - "step": 249 - }, - { - "epoch": 0.14, - "grad_norm": 0.03418460860848427, - "learning_rate": 0.00019779772712814677, - "loss": 0.8228, - "step": 250 - }, - { - "epoch": 0.14, - "grad_norm": 0.03314866125583649, - "learning_rate": 0.0001977794049863777, - "loss": 0.8703, - "step": 251 - }, - { - "epoch": 0.14, - "grad_norm": 0.031179893761873245, - "learning_rate": 0.00019776100779878345, - "loss": 0.6981, - "step": 252 - }, - { - "epoch": 0.14, - "grad_norm": 0.03774740546941757, - "learning_rate": 0.00019774253557948386, - "loss": 0.5688, - "step": 253 - }, - { - "epoch": 0.14, - "grad_norm": 0.033188916742801666, - "learning_rate": 0.00019772398834265643, - "loss": 0.7994, - "step": 254 - }, - { - "epoch": 0.14, - "grad_norm": 0.03498241677880287, - "learning_rate": 0.00019770536610253618, - "loss": 0.7059, - "step": 255 - }, - { - "epoch": 0.14, - "grad_norm": 0.035760391503572464, - "learning_rate": 0.00019768666887341567, - "loss": 0.8468, - "step": 256 - }, - { - "epoch": 0.14, - "grad_norm": 0.0330347940325737, - "learning_rate": 0.0001976678966696451, - "loss": 0.7864, - "step": 257 - }, - { - "epoch": 0.14, - "grad_norm": 0.036673109978437424, - "learning_rate": 0.00019764904950563214, - "loss": 0.867, - "step": 
258 - }, - { - "epoch": 0.14, - "grad_norm": 0.03514648601412773, - "learning_rate": 0.00019763012739584205, - "loss": 0.9021, - "step": 259 - }, - { - "epoch": 0.14, - "grad_norm": 0.03706090897321701, - "learning_rate": 0.00019761113035479747, - "loss": 0.7132, - "step": 260 - }, - { - "epoch": 0.15, - "grad_norm": 0.033022210001945496, - "learning_rate": 0.00019759205839707877, - "loss": 0.8326, - "step": 261 - }, - { - "epoch": 0.15, - "grad_norm": 0.03423427790403366, - "learning_rate": 0.00019757291153732362, - "loss": 0.9028, - "step": 262 - }, - { - "epoch": 0.15, - "grad_norm": 0.03209824860095978, - "learning_rate": 0.00019755368979022732, - "loss": 0.7991, - "step": 263 - }, - { - "epoch": 0.15, - "grad_norm": 0.03312317654490471, - "learning_rate": 0.00019753439317054256, - "loss": 0.7846, - "step": 264 - }, - { - "epoch": 0.15, - "grad_norm": 0.03310060128569603, - "learning_rate": 0.00019751502169307954, - "loss": 0.7369, - "step": 265 - }, - { - "epoch": 0.15, - "grad_norm": 0.03352689370512962, - "learning_rate": 0.00019749557537270584, - "loss": 0.8118, - "step": 266 - }, - { - "epoch": 0.15, - "grad_norm": 0.037001077085733414, - "learning_rate": 0.00019747605422434662, - "loss": 0.7978, - "step": 267 - }, - { - "epoch": 0.15, - "grad_norm": 0.03153040632605553, - "learning_rate": 0.00019745645826298432, - "loss": 0.6908, - "step": 268 - }, - { - "epoch": 0.15, - "grad_norm": 0.029728813096880913, - "learning_rate": 0.00019743678750365888, - "loss": 0.7491, - "step": 269 - }, - { - "epoch": 0.15, - "grad_norm": 0.04103512689471245, - "learning_rate": 0.00019741704196146766, - "loss": 0.8505, - "step": 270 - }, - { - "epoch": 0.15, - "grad_norm": 0.036623742431402206, - "learning_rate": 0.00019739722165156538, - "loss": 0.8811, - "step": 271 - }, - { - "epoch": 0.15, - "grad_norm": 0.031018711626529694, - "learning_rate": 0.00019737732658916412, - "loss": 0.7914, - "step": 272 - }, - { - "epoch": 0.15, - "grad_norm": 0.0346776582300663, - "learning_rate": 0.00019735735678953344, - "loss": 0.8786, - "step": 273 - }, - { - "epoch": 0.15, - "grad_norm": 0.035631489008665085, - "learning_rate": 0.00019733731226800015, - "loss": 0.7063, - "step": 274 - }, - { - "epoch": 0.15, - "grad_norm": 0.034868933260440826, - "learning_rate": 0.0001973171930399484, - "loss": 0.7211, - "step": 275 - }, - { - "epoch": 0.15, - "grad_norm": 0.030991340056061745, - "learning_rate": 0.00019729699912081979, - "loss": 0.7908, - "step": 276 - }, - { - "epoch": 0.15, - "grad_norm": 0.034880708903074265, - "learning_rate": 0.00019727673052611315, - "loss": 0.6827, - "step": 277 - }, - { - "epoch": 0.15, - "grad_norm": 0.03172425180673599, - "learning_rate": 0.00019725638727138468, - "loss": 0.7884, - "step": 278 - }, - { - "epoch": 0.16, - "grad_norm": 0.035629045218229294, - "learning_rate": 0.00019723596937224781, - "loss": 0.8101, - "step": 279 - }, - { - "epoch": 0.16, - "grad_norm": 0.03561673313379288, - "learning_rate": 0.00019721547684437333, - "loss": 0.8164, - "step": 280 - }, - { - "epoch": 0.16, - "grad_norm": 0.0325719490647316, - "learning_rate": 0.0001971949097034893, - "loss": 0.7356, - "step": 281 - }, - { - "epoch": 0.16, - "grad_norm": 0.04635859653353691, - "learning_rate": 0.00019717426796538102, - "loss": 0.9928, - "step": 282 - }, - { - "epoch": 0.16, - "grad_norm": 0.0328851044178009, - "learning_rate": 0.00019715355164589106, - "loss": 0.7742, - "step": 283 - }, - { - "epoch": 0.16, - "grad_norm": 0.032772939652204514, - "learning_rate": 0.00019713276076091917, - "loss": 
0.7487, - "step": 284 - }, - { - "epoch": 0.16, - "grad_norm": 0.03649919852614403, - "learning_rate": 0.00019711189532642243, - "loss": 0.7322, - "step": 285 - }, - { - "epoch": 0.16, - "grad_norm": 0.03704332187771797, - "learning_rate": 0.0001970909553584151, - "loss": 0.7951, - "step": 286 - }, - { - "epoch": 0.16, - "grad_norm": 0.03600141033530235, - "learning_rate": 0.00019706994087296859, - "loss": 0.8053, - "step": 287 - }, - { - "epoch": 0.16, - "grad_norm": 0.03415602818131447, - "learning_rate": 0.0001970488518862116, - "loss": 0.8079, - "step": 288 - }, - { - "epoch": 0.16, - "grad_norm": 0.032455701380968094, - "learning_rate": 0.0001970276884143299, - "loss": 0.8268, - "step": 289 - }, - { - "epoch": 0.16, - "grad_norm": 0.03594019636511803, - "learning_rate": 0.00019700645047356652, - "loss": 0.7773, - "step": 290 - }, - { - "epoch": 0.16, - "grad_norm": 0.03449614718556404, - "learning_rate": 0.0001969851380802216, - "loss": 0.8421, - "step": 291 - }, - { - "epoch": 0.16, - "grad_norm": 0.035057246685028076, - "learning_rate": 0.0001969637512506524, - "loss": 0.7891, - "step": 292 - }, - { - "epoch": 0.16, - "grad_norm": 0.03100276179611683, - "learning_rate": 0.00019694229000127337, - "loss": 0.7188, - "step": 293 - }, - { - "epoch": 0.16, - "grad_norm": 0.035093460232019424, - "learning_rate": 0.00019692075434855606, - "loss": 0.6879, - "step": 294 - }, - { - "epoch": 0.16, - "grad_norm": 0.03476954251527786, - "learning_rate": 0.0001968991443090291, - "loss": 0.8428, - "step": 295 - }, - { - "epoch": 0.16, - "grad_norm": 0.03413707762956619, - "learning_rate": 0.00019687745989927823, - "loss": 0.8469, - "step": 296 - }, - { - "epoch": 0.17, - "grad_norm": 0.03546985611319542, - "learning_rate": 0.00019685570113594624, - "loss": 0.8675, - "step": 297 - }, - { - "epoch": 0.17, - "grad_norm": 0.036925192922353745, - "learning_rate": 0.00019683386803573302, - "loss": 0.7033, - "step": 298 - }, - { - "epoch": 0.17, - "grad_norm": 0.03350600227713585, - "learning_rate": 0.00019681196061539552, - "loss": 0.7405, - "step": 299 - }, - { - "epoch": 0.17, - "grad_norm": 0.03388972207903862, - "learning_rate": 0.00019678997889174772, - "loss": 0.7672, - "step": 300 - }, - { - "epoch": 0.17, - "grad_norm": 0.03225654736161232, - "learning_rate": 0.00019676792288166056, - "loss": 0.8161, - "step": 301 - }, - { - "epoch": 0.17, - "grad_norm": 0.035303112119436264, - "learning_rate": 0.00019674579260206215, - "loss": 0.8378, - "step": 302 - }, - { - "epoch": 0.17, - "grad_norm": 0.03430440276861191, - "learning_rate": 0.00019672358806993744, - "loss": 0.8755, - "step": 303 - }, - { - "epoch": 0.17, - "grad_norm": 0.03425530344247818, - "learning_rate": 0.00019670130930232849, - "loss": 0.8579, - "step": 304 - }, - { - "epoch": 0.17, - "grad_norm": 0.03408996760845184, - "learning_rate": 0.00019667895631633427, - "loss": 0.7241, - "step": 305 - }, - { - "epoch": 0.17, - "grad_norm": 0.034297868609428406, - "learning_rate": 0.00019665652912911067, - "loss": 0.7586, - "step": 306 - }, - { - "epoch": 0.17, - "grad_norm": 0.03555282577872276, - "learning_rate": 0.00019663402775787066, - "loss": 0.7913, - "step": 307 - }, - { - "epoch": 0.17, - "grad_norm": 0.046342890709638596, - "learning_rate": 0.000196611452219884, - "loss": 0.756, - "step": 308 - }, - { - "epoch": 0.17, - "grad_norm": 0.035010941326618195, - "learning_rate": 0.00019658880253247752, - "loss": 0.7984, - "step": 309 - }, - { - "epoch": 0.17, - "grad_norm": 0.034292541444301605, - "learning_rate": 0.00019656607871303486, - 
"loss": 0.7529, - "step": 310 - }, - { - "epoch": 0.17, - "grad_norm": 0.034864772111177444, - "learning_rate": 0.00019654328077899655, - "loss": 0.7659, - "step": 311 - }, - { - "epoch": 0.17, - "grad_norm": 0.03819848969578743, - "learning_rate": 0.0001965204087478601, - "loss": 0.8601, - "step": 312 - }, - { - "epoch": 0.17, - "grad_norm": 0.03443516790866852, - "learning_rate": 0.00019649746263717974, - "loss": 0.755, - "step": 313 - }, - { - "epoch": 0.17, - "grad_norm": 0.039128776639699936, - "learning_rate": 0.00019647444246456672, - "loss": 0.8283, - "step": 314 - }, - { - "epoch": 0.18, - "grad_norm": 0.03328540921211243, - "learning_rate": 0.000196451348247689, - "loss": 0.8621, - "step": 315 - }, - { - "epoch": 0.18, - "grad_norm": 0.03752969205379486, - "learning_rate": 0.00019642818000427145, - "loss": 0.6716, - "step": 316 - }, - { - "epoch": 0.18, - "grad_norm": 0.033460091799497604, - "learning_rate": 0.00019640493775209574, - "loss": 0.644, - "step": 317 - }, - { - "epoch": 0.18, - "grad_norm": 0.03952079266309738, - "learning_rate": 0.00019638162150900027, - "loss": 0.8444, - "step": 318 - }, - { - "epoch": 0.18, - "grad_norm": 0.03375870734453201, - "learning_rate": 0.00019635823129288038, - "loss": 0.7582, - "step": 319 - }, - { - "epoch": 0.18, - "grad_norm": 0.0385657362639904, - "learning_rate": 0.00019633476712168803, - "loss": 0.6819, - "step": 320 - }, - { - "epoch": 0.18, - "grad_norm": 0.03305663540959358, - "learning_rate": 0.000196311229013432, - "loss": 0.769, - "step": 321 - }, - { - "epoch": 0.18, - "grad_norm": 0.033982716500759125, - "learning_rate": 0.00019628761698617782, - "loss": 0.8634, - "step": 322 - }, - { - "epoch": 0.18, - "grad_norm": 0.035749975591897964, - "learning_rate": 0.00019626393105804776, - "loss": 0.7829, - "step": 323 - }, - { - "epoch": 0.18, - "grad_norm": 0.03398462384939194, - "learning_rate": 0.00019624017124722086, - "loss": 0.803, - "step": 324 - }, - { - "epoch": 0.18, - "grad_norm": 0.03670307993888855, - "learning_rate": 0.0001962163375719327, - "loss": 0.9054, - "step": 325 - }, - { - "epoch": 0.18, - "grad_norm": 0.034984368830919266, - "learning_rate": 0.00019619243005047576, - "loss": 0.7222, - "step": 326 - }, - { - "epoch": 0.18, - "grad_norm": 0.03537537157535553, - "learning_rate": 0.00019616844870119904, - "loss": 0.8563, - "step": 327 - }, - { - "epoch": 0.18, - "grad_norm": 0.03322187438607216, - "learning_rate": 0.00019614439354250824, - "loss": 0.8273, - "step": 328 - }, - { - "epoch": 0.18, - "grad_norm": 0.03316264599561691, - "learning_rate": 0.00019612026459286578, - "loss": 0.6768, - "step": 329 - }, - { - "epoch": 0.18, - "grad_norm": 0.03384651243686676, - "learning_rate": 0.00019609606187079065, - "loss": 0.7345, - "step": 330 - }, - { - "epoch": 0.18, - "grad_norm": 0.03392638638615608, - "learning_rate": 0.0001960717853948584, - "loss": 0.7833, - "step": 331 - }, - { - "epoch": 0.18, - "grad_norm": 0.03634137287735939, - "learning_rate": 0.00019604743518370133, - "loss": 0.8709, - "step": 332 - }, - { - "epoch": 0.19, - "grad_norm": 0.03778916969895363, - "learning_rate": 0.00019602301125600828, - "loss": 0.8933, - "step": 333 - }, - { - "epoch": 0.19, - "grad_norm": 0.04155290871858597, - "learning_rate": 0.0001959985136305246, - "loss": 0.9487, - "step": 334 - }, - { - "epoch": 0.19, - "grad_norm": 0.03602902963757515, - "learning_rate": 0.00019597394232605223, - "loss": 0.8372, - "step": 335 - }, - { - "epoch": 0.19, - "grad_norm": 0.033098701387643814, - "learning_rate": 0.00019594929736144976, 
- "loss": 0.8942, - "step": 336 - }, - { - "epoch": 0.19, - "grad_norm": 0.04318083077669144, - "learning_rate": 0.00019592457875563214, - "loss": 0.7353, - "step": 337 - }, - { - "epoch": 0.19, - "grad_norm": 0.035677891224622726, - "learning_rate": 0.000195899786527571, - "loss": 0.7229, - "step": 338 - }, - { - "epoch": 0.19, - "grad_norm": 0.03783939778804779, - "learning_rate": 0.0001958749206962944, - "loss": 0.7931, - "step": 339 - }, - { - "epoch": 0.19, - "grad_norm": 0.03574792295694351, - "learning_rate": 0.00019584998128088684, - "loss": 0.8139, - "step": 340 - }, - { - "epoch": 0.19, - "grad_norm": 0.042883213609457016, - "learning_rate": 0.00019582496830048947, - "loss": 0.8044, - "step": 341 - }, - { - "epoch": 0.19, - "grad_norm": 0.040097806602716446, - "learning_rate": 0.00019579988177429968, - "loss": 0.7571, - "step": 342 - }, - { - "epoch": 0.19, - "grad_norm": 0.033648643642663956, - "learning_rate": 0.00019577472172157144, - "loss": 0.771, - "step": 343 - }, - { - "epoch": 0.19, - "grad_norm": 0.04123647138476372, - "learning_rate": 0.00019574948816161513, - "loss": 0.8319, - "step": 344 - }, - { - "epoch": 0.19, - "grad_norm": 0.036461152136325836, - "learning_rate": 0.00019572418111379758, - "loss": 0.8421, - "step": 345 - }, - { - "epoch": 0.19, - "grad_norm": 0.034517571330070496, - "learning_rate": 0.00019569880059754194, - "loss": 0.9179, - "step": 346 - }, - { - "epoch": 0.19, - "grad_norm": 0.03678009659051895, - "learning_rate": 0.00019567334663232776, - "loss": 0.7756, - "step": 347 - }, - { - "epoch": 0.19, - "grad_norm": 0.038212236016988754, - "learning_rate": 0.00019564781923769108, - "loss": 0.7422, - "step": 348 - }, - { - "epoch": 0.19, - "grad_norm": 0.039440006017684937, - "learning_rate": 0.00019562221843322415, - "loss": 0.8191, - "step": 349 - }, - { - "epoch": 0.19, - "grad_norm": 0.0428134948015213, - "learning_rate": 0.0001955965442385756, - "loss": 0.8473, - "step": 350 - }, - { - "epoch": 0.2, - "grad_norm": 0.03534878045320511, - "learning_rate": 0.0001955707966734505, - "loss": 0.7492, - "step": 351 - }, - { - "epoch": 0.2, - "grad_norm": 0.03833247721195221, - "learning_rate": 0.00019554497575761004, - "loss": 0.7775, - "step": 352 - }, - { - "epoch": 0.2, - "grad_norm": 0.03430182486772537, - "learning_rate": 0.00019551908151087187, - "loss": 0.8519, - "step": 353 - }, - { - "epoch": 0.2, - "grad_norm": 0.03104781173169613, - "learning_rate": 0.00019549311395310984, - "loss": 0.7292, - "step": 354 - }, - { - "epoch": 0.2, - "grad_norm": 0.033952996134757996, - "learning_rate": 0.00019546707310425407, - "loss": 0.7592, - "step": 355 - }, - { - "epoch": 0.2, - "grad_norm": 0.03525162115693092, - "learning_rate": 0.00019544095898429097, - "loss": 0.7484, - "step": 356 - }, - { - "epoch": 0.2, - "grad_norm": 0.035235751420259476, - "learning_rate": 0.00019541477161326318, - "loss": 0.7206, - "step": 357 - }, - { - "epoch": 0.2, - "grad_norm": 0.038712091743946075, - "learning_rate": 0.00019538851101126947, - "loss": 0.8494, - "step": 358 - }, - { - "epoch": 0.2, - "grad_norm": 0.034723225980997086, - "learning_rate": 0.00019536217719846497, - "loss": 0.784, - "step": 359 - }, - { - "epoch": 0.2, - "grad_norm": 0.03219115734100342, - "learning_rate": 0.0001953357701950609, - "loss": 0.7373, - "step": 360 - }, - { - "epoch": 0.2, - "grad_norm": 0.037317853420972824, - "learning_rate": 0.00019530929002132462, - "loss": 0.7768, - "step": 361 - }, - { - "epoch": 0.2, - "grad_norm": 0.033880557864904404, - "learning_rate": 0.00019528273669757972, 
- "loss": 0.8359, - "step": 362 - }, - { - "epoch": 0.2, - "grad_norm": 0.037638477981090546, - "learning_rate": 0.00019525611024420595, - "loss": 0.7786, - "step": 363 - }, - { - "epoch": 0.2, - "grad_norm": 0.03666766360402107, - "learning_rate": 0.0001952294106816391, - "loss": 0.7666, - "step": 364 - }, - { - "epoch": 0.2, - "grad_norm": 0.03592545911669731, - "learning_rate": 0.00019520263803037116, - "loss": 0.8049, - "step": 365 - }, - { - "epoch": 0.2, - "grad_norm": 0.03477128967642784, - "learning_rate": 0.00019517579231095018, - "loss": 0.8243, - "step": 366 - }, - { - "epoch": 0.2, - "grad_norm": 0.03351607918739319, - "learning_rate": 0.00019514887354398023, - "loss": 0.6382, - "step": 367 - }, - { - "epoch": 0.2, - "grad_norm": 0.03531970828771591, - "learning_rate": 0.00019512188175012157, - "loss": 0.7363, - "step": 368 - }, - { - "epoch": 0.21, - "grad_norm": 0.03541957587003708, - "learning_rate": 0.00019509481695009035, - "loss": 0.7136, - "step": 369 - }, - { - "epoch": 0.21, - "grad_norm": 0.03538355231285095, - "learning_rate": 0.00019506767916465894, - "loss": 0.8702, - "step": 370 - }, - { - "epoch": 0.21, - "grad_norm": 0.036403317004442215, - "learning_rate": 0.00019504046841465556, - "loss": 0.773, - "step": 371 - }, - { - "epoch": 0.21, - "grad_norm": 0.050964437425136566, - "learning_rate": 0.0001950131847209645, - "loss": 0.65, - "step": 372 - }, - { - "epoch": 0.21, - "grad_norm": 0.033455755561590195, - "learning_rate": 0.0001949858281045261, - "loss": 0.8465, - "step": 373 - }, - { - "epoch": 0.21, - "grad_norm": 0.038496553897857666, - "learning_rate": 0.00019495839858633648, - "loss": 0.9064, - "step": 374 - }, - { - "epoch": 0.21, - "grad_norm": 0.046744123101234436, - "learning_rate": 0.00019493089618744796, - "loss": 0.7656, - "step": 375 - }, - { - "epoch": 0.21, - "grad_norm": 0.03650742024183273, - "learning_rate": 0.00019490332092896858, - "loss": 0.7547, - "step": 376 - }, - { - "epoch": 0.21, - "grad_norm": 0.03537287563085556, - "learning_rate": 0.00019487567283206242, - "loss": 0.7773, - "step": 377 - }, - { - "epoch": 0.21, - "grad_norm": 0.03856958448886871, - "learning_rate": 0.00019484795191794944, - "loss": 0.8212, - "step": 378 - }, - { - "epoch": 0.21, - "grad_norm": 0.03926542028784752, - "learning_rate": 0.0001948201582079055, - "loss": 0.7704, - "step": 379 - }, - { - "epoch": 0.21, - "grad_norm": 0.03413626179099083, - "learning_rate": 0.00019479229172326222, - "loss": 0.8243, - "step": 380 - }, - { - "epoch": 0.21, - "grad_norm": 0.036304738372564316, - "learning_rate": 0.00019476435248540726, - "loss": 0.7403, - "step": 381 - }, - { - "epoch": 0.21, - "grad_norm": 0.035184238106012344, - "learning_rate": 0.00019473634051578396, - "loss": 0.7825, - "step": 382 - }, - { - "epoch": 0.21, - "grad_norm": 0.0506867952644825, - "learning_rate": 0.00019470825583589157, - "loss": 0.951, - "step": 383 - }, - { - "epoch": 0.21, - "grad_norm": 0.034371040761470795, - "learning_rate": 0.00019468009846728513, - "loss": 0.6246, - "step": 384 - }, - { - "epoch": 0.21, - "grad_norm": 0.036034394055604935, - "learning_rate": 0.00019465186843157546, - "loss": 0.8194, - "step": 385 - }, - { - "epoch": 0.21, - "grad_norm": 0.03933184966444969, - "learning_rate": 0.00019462356575042914, - "loss": 0.6906, - "step": 386 - }, - { - "epoch": 0.22, - "grad_norm": 0.03522591292858124, - "learning_rate": 0.00019459519044556846, - "loss": 0.6198, - "step": 387 - }, - { - "epoch": 0.22, - "grad_norm": 0.033830877393484116, - "learning_rate": 
0.00019456674253877162, - "loss": 0.7374, - "step": 388 - }, - { - "epoch": 0.22, - "grad_norm": 0.03900926932692528, - "learning_rate": 0.00019453822205187232, - "loss": 0.7763, - "step": 389 - }, - { - "epoch": 0.22, - "grad_norm": 0.035312309861183167, - "learning_rate": 0.00019450962900676016, - "loss": 0.6672, - "step": 390 - }, - { - "epoch": 0.22, - "grad_norm": 0.034244999289512634, - "learning_rate": 0.0001944809634253803, - "loss": 0.6852, - "step": 391 - }, - { - "epoch": 0.22, - "grad_norm": 0.03672676905989647, - "learning_rate": 0.00019445222532973356, - "loss": 0.7416, - "step": 392 - }, - { - "epoch": 0.22, - "grad_norm": 0.03532637655735016, - "learning_rate": 0.00019442341474187658, - "loss": 0.7647, - "step": 393 - }, - { - "epoch": 0.22, - "grad_norm": 0.03439967334270477, - "learning_rate": 0.00019439453168392144, - "loss": 0.7458, - "step": 394 - }, - { - "epoch": 0.22, - "grad_norm": 0.03463077172636986, - "learning_rate": 0.00019436557617803595, - "loss": 0.8125, - "step": 395 - }, - { - "epoch": 0.22, - "grad_norm": 0.03627005219459534, - "learning_rate": 0.00019433654824644347, - "loss": 0.6992, - "step": 396 - }, - { - "epoch": 0.22, - "grad_norm": 0.03494995832443237, - "learning_rate": 0.00019430744791142305, - "loss": 0.7078, - "step": 397 - }, - { - "epoch": 0.22, - "grad_norm": 0.03355133906006813, - "learning_rate": 0.00019427827519530918, - "loss": 0.6952, - "step": 398 - }, - { - "epoch": 0.22, - "grad_norm": 0.03713174909353256, - "learning_rate": 0.00019424903012049195, - "loss": 0.7782, - "step": 399 - }, - { - "epoch": 0.22, - "grad_norm": 0.03741822391748428, - "learning_rate": 0.00019421971270941701, - "loss": 0.7951, - "step": 400 - }, - { - "epoch": 0.22, - "grad_norm": 0.03539593145251274, - "learning_rate": 0.00019419032298458554, - "loss": 0.6305, - "step": 401 - }, - { - "epoch": 0.22, - "grad_norm": 0.03932331129908562, - "learning_rate": 0.00019416086096855418, - "loss": 0.6978, - "step": 402 - }, - { - "epoch": 0.22, - "grad_norm": 0.03865070268511772, - "learning_rate": 0.00019413132668393503, - "loss": 0.7058, - "step": 403 - }, - { - "epoch": 0.22, - "grad_norm": 0.036367420107126236, - "learning_rate": 0.00019410172015339576, - "loss": 0.7991, - "step": 404 - }, - { - "epoch": 0.23, - "grad_norm": 0.04757410287857056, - "learning_rate": 0.00019407204139965936, - "loss": 0.7277, - "step": 405 - }, - { - "epoch": 0.23, - "grad_norm": 0.04415794834494591, - "learning_rate": 0.00019404229044550433, - "loss": 0.7263, - "step": 406 - }, - { - "epoch": 0.23, - "grad_norm": 0.030960189178586006, - "learning_rate": 0.0001940124673137646, - "loss": 0.7514, - "step": 407 - }, - { - "epoch": 0.23, - "grad_norm": 0.0355895459651947, - "learning_rate": 0.0001939825720273294, - "loss": 0.9352, - "step": 408 - }, - { - "epoch": 0.23, - "grad_norm": 0.034076202660799026, - "learning_rate": 0.00019395260460914346, - "loss": 0.7354, - "step": 409 - }, - { - "epoch": 0.23, - "grad_norm": 0.03768068179488182, - "learning_rate": 0.00019392256508220682, - "loss": 0.797, - "step": 410 - }, - { - "epoch": 0.23, - "grad_norm": 0.03416946902871132, - "learning_rate": 0.00019389245346957478, - "loss": 0.8355, - "step": 411 - }, - { - "epoch": 0.23, - "grad_norm": 0.036176249384880066, - "learning_rate": 0.00019386226979435813, - "loss": 0.775, - "step": 412 - }, - { - "epoch": 0.23, - "grad_norm": 0.036431025713682175, - "learning_rate": 0.00019383201407972286, - "loss": 0.6291, - "step": 413 - }, - { - "epoch": 0.23, - "grad_norm": 0.03727782517671585, - 
"learning_rate": 0.00019380168634889025, - "loss": 0.8582, - "step": 414 - }, - { - "epoch": 0.23, - "grad_norm": 0.03564276918768883, - "learning_rate": 0.00019377128662513687, - "loss": 0.7397, - "step": 415 - }, - { - "epoch": 0.23, - "grad_norm": 0.037083700299263, - "learning_rate": 0.00019374081493179457, - "loss": 0.7173, - "step": 416 - }, - { - "epoch": 0.23, - "grad_norm": 0.040047336369752884, - "learning_rate": 0.00019371027129225042, - "loss": 0.6917, - "step": 417 - }, - { - "epoch": 0.23, - "grad_norm": 0.038617394864559174, - "learning_rate": 0.00019367965572994667, - "loss": 0.6494, - "step": 418 - }, - { - "epoch": 0.23, - "grad_norm": 0.03738728538155556, - "learning_rate": 0.00019364896826838083, - "loss": 0.8023, - "step": 419 - }, - { - "epoch": 0.23, - "grad_norm": 0.03948422148823738, - "learning_rate": 0.00019361820893110557, - "loss": 0.8126, - "step": 420 - }, - { - "epoch": 0.23, - "grad_norm": 0.035054076462984085, - "learning_rate": 0.0001935873777417287, - "loss": 0.7377, - "step": 421 - }, - { - "epoch": 0.23, - "grad_norm": 0.04039939120411873, - "learning_rate": 0.00019355647472391328, - "loss": 0.7804, - "step": 422 - }, - { - "epoch": 0.24, - "grad_norm": 0.034298498183488846, - "learning_rate": 0.00019352549990137732, - "loss": 0.8092, - "step": 423 - }, - { - "epoch": 0.24, - "grad_norm": 0.035063210874795914, - "learning_rate": 0.00019349445329789404, - "loss": 0.7162, - "step": 424 - }, - { - "epoch": 0.24, - "grad_norm": 0.03656880930066109, - "learning_rate": 0.00019346333493729182, - "loss": 0.7648, - "step": 425 - }, - { - "epoch": 0.24, - "grad_norm": 0.03690245375037193, - "learning_rate": 0.000193432144843454, - "loss": 0.7194, - "step": 426 - }, - { - "epoch": 0.24, - "grad_norm": 0.04461353272199631, - "learning_rate": 0.00019340088304031905, - "loss": 0.6901, - "step": 427 - }, - { - "epoch": 0.24, - "grad_norm": 0.03801785781979561, - "learning_rate": 0.0001933695495518804, - "loss": 0.7416, - "step": 428 - }, - { - "epoch": 0.24, - "grad_norm": 0.03857959061861038, - "learning_rate": 0.00019333814440218656, - "loss": 0.7467, - "step": 429 - }, - { - "epoch": 0.24, - "grad_norm": 0.03680335357785225, - "learning_rate": 0.00019330666761534104, - "loss": 0.7313, - "step": 430 - }, - { - "epoch": 0.24, - "grad_norm": 0.03958728164434433, - "learning_rate": 0.00019327511921550232, - "loss": 0.7054, - "step": 431 - }, - { - "epoch": 0.24, - "grad_norm": 0.03355373814702034, - "learning_rate": 0.0001932434992268838, - "loss": 0.7193, - "step": 432 - }, - { - "epoch": 0.24, - "grad_norm": 0.035141825675964355, - "learning_rate": 0.00019321180767375393, - "loss": 0.7841, - "step": 433 - }, - { - "epoch": 0.24, - "grad_norm": 0.037220679223537445, - "learning_rate": 0.00019318004458043596, - "loss": 0.7195, - "step": 434 - }, - { - "epoch": 0.24, - "grad_norm": 0.040347468107938766, - "learning_rate": 0.00019314820997130814, - "loss": 0.7725, - "step": 435 - }, - { - "epoch": 0.24, - "grad_norm": 0.03546297922730446, - "learning_rate": 0.00019311630387080356, - "loss": 0.7774, - "step": 436 - }, - { - "epoch": 0.24, - "grad_norm": 0.03774712607264519, - "learning_rate": 0.0001930843263034102, - "loss": 0.842, - "step": 437 - }, - { - "epoch": 0.24, - "grad_norm": 0.0372396744787693, - "learning_rate": 0.0001930522772936709, - "loss": 0.8061, - "step": 438 - }, - { - "epoch": 0.24, - "grad_norm": 0.03598784655332565, - "learning_rate": 0.00019302015686618326, - "loss": 0.8289, - "step": 439 - }, - { - "epoch": 0.24, - "grad_norm": 
0.038949351757764816, - "learning_rate": 0.00019298796504559982, - "loss": 0.9173, - "step": 440 - }, - { - "epoch": 0.25, - "grad_norm": 0.03886549547314644, - "learning_rate": 0.00019295570185662783, - "loss": 0.7436, - "step": 441 - }, - { - "epoch": 0.25, - "grad_norm": 0.03816141560673714, - "learning_rate": 0.0001929233673240293, - "loss": 0.7178, - "step": 442 - }, - { - "epoch": 0.25, - "grad_norm": 0.03811522200703621, - "learning_rate": 0.00019289096147262106, - "loss": 0.7304, - "step": 443 - }, - { - "epoch": 0.25, - "grad_norm": 0.03597824648022652, - "learning_rate": 0.00019285848432727465, - "loss": 0.9174, - "step": 444 - }, - { - "epoch": 0.25, - "grad_norm": 0.036536138504743576, - "learning_rate": 0.0001928259359129163, - "loss": 0.6808, - "step": 445 - }, - { - "epoch": 0.25, - "grad_norm": 0.03995755687355995, - "learning_rate": 0.00019279331625452696, - "loss": 0.8131, - "step": 446 - }, - { - "epoch": 0.25, - "grad_norm": 0.03879891335964203, - "learning_rate": 0.00019276062537714226, - "loss": 0.7044, - "step": 447 - }, - { - "epoch": 0.25, - "grad_norm": 0.04052872955799103, - "learning_rate": 0.00019272786330585252, - "loss": 0.7703, - "step": 448 - }, - { - "epoch": 0.25, - "grad_norm": 0.047087304294109344, - "learning_rate": 0.00019269503006580266, - "loss": 0.764, - "step": 449 - }, - { - "epoch": 0.25, - "grad_norm": 0.035790085792541504, - "learning_rate": 0.0001926621256821922, - "loss": 0.72, - "step": 450 - }, - { - "epoch": 0.25, - "grad_norm": 0.03301222249865532, - "learning_rate": 0.00019262915018027538, - "loss": 0.8394, - "step": 451 - }, - { - "epoch": 0.25, - "grad_norm": 0.0383736714720726, - "learning_rate": 0.00019259610358536087, - "loss": 0.8389, - "step": 452 - }, - { - "epoch": 0.25, - "grad_norm": 0.0351652093231678, - "learning_rate": 0.00019256298592281202, - "loss": 0.7535, - "step": 453 - }, - { - "epoch": 0.25, - "grad_norm": 0.03904608264565468, - "learning_rate": 0.0001925297972180466, - "loss": 0.7091, - "step": 454 - }, - { - "epoch": 0.25, - "grad_norm": 0.035092540085315704, - "learning_rate": 0.00019249653749653711, - "loss": 0.6403, - "step": 455 - }, - { - "epoch": 0.25, - "grad_norm": 0.03878278285264969, - "learning_rate": 0.00019246320678381035, - "loss": 0.761, - "step": 456 - }, - { - "epoch": 0.25, - "grad_norm": 0.055852390825748444, - "learning_rate": 0.0001924298051054477, - "loss": 1.0766, - "step": 457 - }, - { - "epoch": 0.25, - "grad_norm": 0.038434069603681564, - "learning_rate": 0.000192396332487085, - "loss": 0.7273, - "step": 458 - }, - { - "epoch": 0.26, - "grad_norm": 0.04319579154253006, - "learning_rate": 0.0001923627889544125, - "loss": 0.7543, - "step": 459 - }, - { - "epoch": 0.26, - "grad_norm": 0.03686597943305969, - "learning_rate": 0.00019232917453317495, - "loss": 0.7785, - "step": 460 - }, - { - "epoch": 0.26, - "grad_norm": 0.03705907613039017, - "learning_rate": 0.00019229548924917146, - "loss": 0.8554, - "step": 461 - }, - { - "epoch": 0.26, - "grad_norm": 0.043490804731845856, - "learning_rate": 0.00019226173312825553, - "loss": 0.7574, - "step": 462 - }, - { - "epoch": 0.26, - "grad_norm": 0.0411020964384079, - "learning_rate": 0.00019222790619633499, - "loss": 0.7173, - "step": 463 - }, - { - "epoch": 0.26, - "grad_norm": 0.04355505481362343, - "learning_rate": 0.0001921940084793721, - "loss": 0.7586, - "step": 464 - }, - { - "epoch": 0.26, - "grad_norm": 0.03533725440502167, - "learning_rate": 0.0001921600400033834, - "loss": 0.7315, - "step": 465 - }, - { - "epoch": 0.26, - "grad_norm": 
0.037127457559108734, - "learning_rate": 0.0001921260007944397, - "loss": 0.6851, - "step": 466 - }, - { - "epoch": 0.26, - "grad_norm": 0.04237948730587959, - "learning_rate": 0.00019209189087866614, - "loss": 0.7746, - "step": 467 - }, - { - "epoch": 0.26, - "grad_norm": 0.03694011643528938, - "learning_rate": 0.0001920577102822422, - "loss": 0.8222, - "step": 468 - }, - { - "epoch": 0.26, - "grad_norm": 0.03798561915755272, - "learning_rate": 0.00019202345903140147, - "loss": 0.7533, - "step": 469 - }, - { - "epoch": 0.26, - "grad_norm": 0.03925656899809837, - "learning_rate": 0.00019198913715243182, - "loss": 0.7317, - "step": 470 - }, - { - "epoch": 0.26, - "grad_norm": 0.03404255583882332, - "learning_rate": 0.00019195474467167544, - "loss": 0.7474, - "step": 471 - }, - { - "epoch": 0.26, - "grad_norm": 0.03907306119799614, - "learning_rate": 0.00019192028161552847, - "loss": 0.8662, - "step": 472 - }, - { - "epoch": 0.26, - "grad_norm": 0.040038101375103, - "learning_rate": 0.0001918857480104414, - "loss": 0.811, - "step": 473 - }, - { - "epoch": 0.26, - "grad_norm": 0.04068785905838013, - "learning_rate": 0.00019185114388291886, - "loss": 0.7831, - "step": 474 - }, - { - "epoch": 0.26, - "grad_norm": 0.03962669521570206, - "learning_rate": 0.00019181646925951954, - "loss": 0.7659, - "step": 475 - }, - { - "epoch": 0.26, - "grad_norm": 0.036200929433107376, - "learning_rate": 0.00019178172416685628, - "loss": 0.8251, - "step": 476 - }, - { - "epoch": 0.27, - "grad_norm": 0.03592352196574211, - "learning_rate": 0.00019174690863159593, - "loss": 0.7206, - "step": 477 - }, - { - "epoch": 0.27, - "grad_norm": 0.03576627001166344, - "learning_rate": 0.0001917120226804595, - "loss": 0.8105, - "step": 478 - }, - { - "epoch": 0.27, - "grad_norm": 0.034843627363443375, - "learning_rate": 0.00019167706634022199, - "loss": 0.8635, - "step": 479 - }, - { - "epoch": 0.27, - "grad_norm": 0.040766939520835876, - "learning_rate": 0.00019164203963771244, - "loss": 0.6993, - "step": 480 - }, - { - "epoch": 0.27, - "grad_norm": 0.04198118671774864, - "learning_rate": 0.00019160694259981388, - "loss": 0.8341, - "step": 481 - }, - { - "epoch": 0.27, - "grad_norm": 0.03662072494626045, - "learning_rate": 0.0001915717752534634, - "loss": 0.794, - "step": 482 - }, - { - "epoch": 0.27, - "grad_norm": 0.0427091047167778, - "learning_rate": 0.0001915365376256519, - "loss": 0.7304, - "step": 483 - }, - { - "epoch": 0.27, - "grad_norm": 0.03634573519229889, - "learning_rate": 0.00019150122974342435, - "loss": 0.7365, - "step": 484 - }, - { - "epoch": 0.27, - "grad_norm": 0.03949028626084328, - "learning_rate": 0.0001914658516338796, - "loss": 0.8231, - "step": 485 - }, - { - "epoch": 0.27, - "grad_norm": 0.04077625274658203, - "learning_rate": 0.00019143040332417037, - "loss": 0.66, - "step": 486 - }, - { - "epoch": 0.27, - "grad_norm": 0.03831211104989052, - "learning_rate": 0.0001913948848415033, - "loss": 0.8029, - "step": 487 - }, - { - "epoch": 0.27, - "grad_norm": 0.042038679122924805, - "learning_rate": 0.0001913592962131389, - "loss": 0.807, - "step": 488 - }, - { - "epoch": 0.27, - "grad_norm": 0.040266357362270355, - "learning_rate": 0.00019132363746639147, - "loss": 0.6887, - "step": 489 - }, - { - "epoch": 0.27, - "grad_norm": 0.0406985804438591, - "learning_rate": 0.0001912879086286291, - "loss": 0.7928, - "step": 490 - }, - { - "epoch": 0.27, - "grad_norm": 0.03616916015744209, - "learning_rate": 0.00019125210972727378, - "loss": 0.697, - "step": 491 - }, - { - "epoch": 0.27, - "grad_norm": 
0.0417737253010273, - "learning_rate": 0.00019121624078980123, - "loss": 0.8405, - "step": 492 - }, - { - "epoch": 0.27, - "grad_norm": 0.04053768515586853, - "learning_rate": 0.00019118030184374086, - "loss": 0.7304, - "step": 493 - }, - { - "epoch": 0.27, - "grad_norm": 0.037591926753520966, - "learning_rate": 0.00019114429291667583, - "loss": 0.797, - "step": 494 - }, - { - "epoch": 0.28, - "grad_norm": 0.03530837222933769, - "learning_rate": 0.00019110821403624316, - "loss": 0.759, - "step": 495 - }, - { - "epoch": 0.28, - "grad_norm": 0.035193149000406265, - "learning_rate": 0.0001910720652301333, - "loss": 0.7179, - "step": 496 - }, - { - "epoch": 0.28, - "grad_norm": 0.03709745407104492, - "learning_rate": 0.0001910358465260906, - "loss": 0.8888, - "step": 497 - }, - { - "epoch": 0.28, - "grad_norm": 0.036165978759527206, - "learning_rate": 0.00019099955795191296, - "loss": 0.8409, - "step": 498 - }, - { - "epoch": 0.28, - "grad_norm": 0.03585299476981163, - "learning_rate": 0.00019096319953545185, - "loss": 0.6962, - "step": 499 - }, - { - "epoch": 0.28, - "grad_norm": 0.03879237174987793, - "learning_rate": 0.00019092677130461245, - "loss": 0.7037, - "step": 500 - }, - { - "epoch": 0.28, - "grad_norm": 0.040262021124362946, - "learning_rate": 0.00019089027328735348, - "loss": 0.8989, - "step": 501 - }, - { - "epoch": 0.28, - "grad_norm": 0.09765855222940445, - "learning_rate": 0.0001908537055116872, - "loss": 0.8365, - "step": 502 - }, - { - "epoch": 0.28, - "grad_norm": 0.04247608780860901, - "learning_rate": 0.00019081706800567943, - "loss": 0.8854, - "step": 503 - }, - { - "epoch": 0.28, - "grad_norm": 0.04151815176010132, - "learning_rate": 0.00019078036079744947, - "loss": 0.7774, - "step": 504 - }, - { - "epoch": 0.28, - "grad_norm": 0.04607046768069267, - "learning_rate": 0.00019074358391517023, - "loss": 0.7117, - "step": 505 - }, - { - "epoch": 0.28, - "grad_norm": 0.047151170670986176, - "learning_rate": 0.00019070673738706798, - "loss": 0.8029, - "step": 506 - }, - { - "epoch": 0.28, - "grad_norm": 0.0413680225610733, - "learning_rate": 0.00019066982124142244, - "loss": 0.6586, - "step": 507 - }, - { - "epoch": 0.28, - "grad_norm": 0.039332833141088486, - "learning_rate": 0.00019063283550656689, - "loss": 0.6051, - "step": 508 - }, - { - "epoch": 0.28, - "grad_norm": 0.03690175712108612, - "learning_rate": 0.0001905957802108878, - "loss": 0.6932, - "step": 509 - }, - { - "epoch": 0.28, - "grad_norm": 0.04148999601602554, - "learning_rate": 0.0001905586553828253, - "loss": 0.7663, - "step": 510 - }, - { - "epoch": 0.28, - "grad_norm": 0.03818729892373085, - "learning_rate": 0.00019052146105087267, - "loss": 0.7886, - "step": 511 - }, - { - "epoch": 0.28, - "grad_norm": 0.03793442249298096, - "learning_rate": 0.0001904841972435766, - "loss": 0.7099, - "step": 512 - }, - { - "epoch": 0.29, - "grad_norm": 0.04220060631632805, - "learning_rate": 0.00019044686398953715, - "loss": 0.7333, - "step": 513 - }, - { - "epoch": 0.29, - "grad_norm": 0.03924279659986496, - "learning_rate": 0.00019040946131740764, - "loss": 0.7619, - "step": 514 - }, - { - "epoch": 0.29, - "grad_norm": 0.03894231095910072, - "learning_rate": 0.00019037198925589465, - "loss": 0.8214, - "step": 515 - }, - { - "epoch": 0.29, - "grad_norm": 0.038151923567056656, - "learning_rate": 0.00019033444783375804, - "loss": 0.8511, - "step": 516 - }, - { - "epoch": 0.29, - "grad_norm": 0.04167793318629265, - "learning_rate": 0.0001902968370798109, - "loss": 0.7582, - "step": 517 - }, - { - "epoch": 0.29, - 
"grad_norm": 0.039893049746751785, - "learning_rate": 0.00019025915702291956, - "loss": 0.8991, - "step": 518 - }, - { - "epoch": 0.29, - "grad_norm": 0.03804799169301987, - "learning_rate": 0.00019022140769200352, - "loss": 0.7959, - "step": 519 - }, - { - "epoch": 0.29, - "grad_norm": 0.03987705335021019, - "learning_rate": 0.00019018358911603538, - "loss": 0.7415, - "step": 520 - }, - { - "epoch": 0.29, - "grad_norm": 0.036575764417648315, - "learning_rate": 0.000190145701324041, - "loss": 0.6772, - "step": 521 - }, - { - "epoch": 0.29, - "grad_norm": 0.03801872953772545, - "learning_rate": 0.00019010774434509933, - "loss": 0.7397, - "step": 522 - }, - { - "epoch": 0.29, - "grad_norm": 0.03547314926981926, - "learning_rate": 0.0001900697182083423, - "loss": 0.6802, - "step": 523 - }, - { - "epoch": 0.29, - "grad_norm": 0.03981153294444084, - "learning_rate": 0.00019003162294295515, - "loss": 0.8714, - "step": 524 - }, - { - "epoch": 0.29, - "grad_norm": 0.035997990518808365, - "learning_rate": 0.000189993458578176, - "loss": 0.6773, - "step": 525 - }, - { - "epoch": 0.29, - "grad_norm": 0.04118787497282028, - "learning_rate": 0.00018995522514329602, - "loss": 0.7247, - "step": 526 - }, - { - "epoch": 0.29, - "grad_norm": 0.04163433611392975, - "learning_rate": 0.00018991692266765947, - "loss": 0.7977, - "step": 527 - }, - { - "epoch": 0.29, - "grad_norm": 0.03958054631948471, - "learning_rate": 0.0001898785511806635, - "loss": 0.7665, - "step": 528 - }, - { - "epoch": 0.29, - "grad_norm": 0.040298108011484146, - "learning_rate": 0.0001898401107117583, - "loss": 0.8497, - "step": 529 - }, - { - "epoch": 0.29, - "grad_norm": 0.039216410368680954, - "learning_rate": 0.00018980160129044698, - "loss": 0.8379, - "step": 530 - }, - { - "epoch": 0.3, - "grad_norm": 0.037412021309137344, - "learning_rate": 0.0001897630229462856, - "loss": 0.8241, - "step": 531 - }, - { - "epoch": 0.3, - "grad_norm": 0.042515527456998825, - "learning_rate": 0.0001897243757088831, - "loss": 0.7151, - "step": 532 - }, - { - "epoch": 0.3, - "grad_norm": 0.038337383419275284, - "learning_rate": 0.0001896856596079012, - "loss": 0.7465, - "step": 533 - }, - { - "epoch": 0.3, - "grad_norm": 0.03905492648482323, - "learning_rate": 0.00018964687467305465, - "loss": 0.6749, - "step": 534 - }, - { - "epoch": 0.3, - "grad_norm": 0.038022320717573166, - "learning_rate": 0.0001896080209341109, - "loss": 0.6692, - "step": 535 - }, - { - "epoch": 0.3, - "grad_norm": 0.0380224771797657, - "learning_rate": 0.00018956909842089023, - "loss": 0.866, - "step": 536 - }, - { - "epoch": 0.3, - "grad_norm": 0.03461451828479767, - "learning_rate": 0.00018953010716326577, - "loss": 0.7494, - "step": 537 - }, - { - "epoch": 0.3, - "grad_norm": 0.04129749909043312, - "learning_rate": 0.00018949104719116332, - "loss": 0.783, - "step": 538 - }, - { - "epoch": 0.3, - "grad_norm": 0.0383567251265049, - "learning_rate": 0.00018945191853456152, - "loss": 0.7299, - "step": 539 - }, - { - "epoch": 0.3, - "grad_norm": 0.038509126752614975, - "learning_rate": 0.0001894127212234916, - "loss": 0.8074, - "step": 540 - }, - { - "epoch": 0.3, - "grad_norm": 0.0398080088198185, - "learning_rate": 0.00018937345528803755, - "loss": 0.7684, - "step": 541 - }, - { - "epoch": 0.3, - "grad_norm": 0.04023388773202896, - "learning_rate": 0.0001893341207583361, - "loss": 0.8234, - "step": 542 - }, - { - "epoch": 0.3, - "grad_norm": 0.0396123044192791, - "learning_rate": 0.00018929471766457648, - "loss": 0.8573, - "step": 543 - }, - { - "epoch": 0.3, - "grad_norm": 
0.03965805470943451, - "learning_rate": 0.00018925524603700063, - "loss": 0.8654, - "step": 544 - }, - { - "epoch": 0.3, - "grad_norm": 0.03725765272974968, - "learning_rate": 0.00018921570590590316, - "loss": 0.7674, - "step": 545 - }, - { - "epoch": 0.3, - "grad_norm": 0.042921282351017, - "learning_rate": 0.00018917609730163105, - "loss": 0.7293, - "step": 546 - }, - { - "epoch": 0.3, - "grad_norm": 0.04281310364603996, - "learning_rate": 0.00018913642025458405, - "loss": 0.8283, - "step": 547 - }, - { - "epoch": 0.3, - "grad_norm": 0.03923151642084122, - "learning_rate": 0.00018909667479521426, - "loss": 0.8305, - "step": 548 - }, - { - "epoch": 0.31, - "grad_norm": 0.035802267491817474, - "learning_rate": 0.00018905686095402647, - "loss": 0.6796, - "step": 549 - }, - { - "epoch": 0.31, - "grad_norm": 0.04895975813269615, - "learning_rate": 0.00018901697876157777, - "loss": 0.8961, - "step": 550 - }, - { - "epoch": 0.31, - "grad_norm": 0.039671383798122406, - "learning_rate": 0.00018897702824847786, - "loss": 0.7407, - "step": 551 - }, - { - "epoch": 0.31, - "grad_norm": 0.038863249123096466, - "learning_rate": 0.00018893700944538883, - "loss": 0.7412, - "step": 552 - }, - { - "epoch": 0.31, - "grad_norm": 0.03964029625058174, - "learning_rate": 0.00018889692238302508, - "loss": 0.7407, - "step": 553 - }, - { - "epoch": 0.31, - "grad_norm": 0.03963325545191765, - "learning_rate": 0.00018885676709215355, - "loss": 0.671, - "step": 554 - }, - { - "epoch": 0.31, - "grad_norm": 0.04223211482167244, - "learning_rate": 0.00018881654360359344, - "loss": 0.9177, - "step": 555 - }, - { - "epoch": 0.31, - "grad_norm": 0.04407535493373871, - "learning_rate": 0.00018877625194821637, - "loss": 0.8018, - "step": 556 - }, - { - "epoch": 0.31, - "grad_norm": 0.045421354472637177, - "learning_rate": 0.00018873589215694623, - "loss": 0.7992, - "step": 557 - }, - { - "epoch": 0.31, - "grad_norm": 0.03759824484586716, - "learning_rate": 0.00018869546426075919, - "loss": 0.6757, - "step": 558 - }, - { - "epoch": 0.31, - "grad_norm": 0.03787359595298767, - "learning_rate": 0.00018865496829068373, - "loss": 0.7533, - "step": 559 - }, - { - "epoch": 0.31, - "grad_norm": 0.04168887808918953, - "learning_rate": 0.0001886144042778006, - "loss": 0.8529, - "step": 560 - }, - { - "epoch": 0.31, - "grad_norm": 0.03604380786418915, - "learning_rate": 0.00018857377225324265, - "loss": 0.6959, - "step": 561 - }, - { - "epoch": 0.31, - "grad_norm": 0.039811234921216965, - "learning_rate": 0.00018853307224819506, - "loss": 0.6923, - "step": 562 - }, - { - "epoch": 0.31, - "grad_norm": 0.039177920669317245, - "learning_rate": 0.00018849230429389516, - "loss": 0.8089, - "step": 563 - }, - { - "epoch": 0.31, - "grad_norm": 0.03779724985361099, - "learning_rate": 0.0001884514684216324, - "loss": 0.6745, - "step": 564 - }, - { - "epoch": 0.31, - "grad_norm": 0.03678756207227707, - "learning_rate": 0.00018841056466274836, - "loss": 0.6761, - "step": 565 - }, - { - "epoch": 0.31, - "grad_norm": 0.03539562225341797, - "learning_rate": 0.0001883695930486367, - "loss": 0.7111, - "step": 566 - }, - { - "epoch": 0.32, - "grad_norm": 0.03760172799229622, - "learning_rate": 0.00018832855361074322, - "loss": 0.8614, - "step": 567 - }, - { - "epoch": 0.32, - "grad_norm": 0.03915516659617424, - "learning_rate": 0.00018828744638056574, - "loss": 0.8167, - "step": 568 - }, - { - "epoch": 0.32, - "grad_norm": 0.039351411163806915, - "learning_rate": 0.00018824627138965414, - "loss": 0.7736, - "step": 569 - }, - { - "epoch": 0.32, - 
"grad_norm": 0.03440462425351143, - "learning_rate": 0.00018820502866961022, - "loss": 0.6989, - "step": 570 - }, - { - "epoch": 0.32, - "grad_norm": 0.04510714113712311, - "learning_rate": 0.00018816371825208789, - "loss": 0.7669, - "step": 571 - }, - { - "epoch": 0.32, - "grad_norm": 0.038387760519981384, - "learning_rate": 0.0001881223401687929, - "loss": 0.7742, - "step": 572 - }, - { - "epoch": 0.32, - "grad_norm": 0.04283255338668823, - "learning_rate": 0.00018808089445148302, - "loss": 0.8729, - "step": 573 - }, - { - "epoch": 0.32, - "grad_norm": 0.03683461621403694, - "learning_rate": 0.00018803938113196787, - "loss": 0.7528, - "step": 574 - }, - { - "epoch": 0.32, - "grad_norm": 0.03634823486208916, - "learning_rate": 0.00018799780024210898, - "loss": 0.7812, - "step": 575 - }, - { - "epoch": 0.32, - "grad_norm": 0.04318126663565636, - "learning_rate": 0.00018795615181381976, - "loss": 0.8744, - "step": 576 - }, - { - "epoch": 0.32, - "grad_norm": 0.04043075069785118, - "learning_rate": 0.00018791443587906542, - "loss": 0.719, - "step": 577 - }, - { - "epoch": 0.32, - "grad_norm": 0.03615717217326164, - "learning_rate": 0.00018787265246986298, - "loss": 0.6402, - "step": 578 - }, - { - "epoch": 0.32, - "grad_norm": 0.0358918234705925, - "learning_rate": 0.0001878308016182813, - "loss": 0.7614, - "step": 579 - }, - { - "epoch": 0.32, - "grad_norm": 0.03841692954301834, - "learning_rate": 0.0001877888833564409, - "loss": 0.6891, - "step": 580 - }, - { - "epoch": 0.32, - "grad_norm": 0.04361288622021675, - "learning_rate": 0.00018774689771651422, - "loss": 0.866, - "step": 581 - }, - { - "epoch": 0.32, - "grad_norm": 0.04058700054883957, - "learning_rate": 0.0001877048447307252, - "loss": 0.7412, - "step": 582 - }, - { - "epoch": 0.32, - "grad_norm": 0.04075286164879799, - "learning_rate": 0.00018766272443134955, - "loss": 0.8213, - "step": 583 - }, - { - "epoch": 0.32, - "grad_norm": 0.03992673009634018, - "learning_rate": 0.00018762053685071473, - "loss": 0.7053, - "step": 584 - }, - { - "epoch": 0.33, - "grad_norm": 0.03777340427041054, - "learning_rate": 0.00018757828202119973, - "loss": 0.656, - "step": 585 - }, - { - "epoch": 0.33, - "grad_norm": 0.04056774452328682, - "learning_rate": 0.00018753595997523514, - "loss": 0.8583, - "step": 586 - }, - { - "epoch": 0.33, - "grad_norm": 0.03643214702606201, - "learning_rate": 0.00018749357074530326, - "loss": 0.7368, - "step": 587 - }, - { - "epoch": 0.33, - "grad_norm": 0.040921106934547424, - "learning_rate": 0.00018745111436393788, - "loss": 0.7502, - "step": 588 - }, - { - "epoch": 0.33, - "grad_norm": 0.037460386753082275, - "learning_rate": 0.00018740859086372429, - "loss": 0.7211, - "step": 589 - }, - { - "epoch": 0.33, - "grad_norm": 0.04132199287414551, - "learning_rate": 0.00018736600027729932, - "loss": 0.8679, - "step": 590 - }, - { - "epoch": 0.33, - "grad_norm": 0.038056958466768265, - "learning_rate": 0.00018732334263735136, - "loss": 0.7041, - "step": 591 - }, - { - "epoch": 0.33, - "grad_norm": 0.036732178181409836, - "learning_rate": 0.0001872806179766202, - "loss": 0.8356, - "step": 592 - }, - { - "epoch": 0.33, - "grad_norm": 0.04578113928437233, - "learning_rate": 0.00018723782632789701, - "loss": 0.7287, - "step": 593 - }, - { - "epoch": 0.33, - "grad_norm": 0.04152052104473114, - "learning_rate": 0.0001871949677240245, - "loss": 0.7445, - "step": 594 - }, - { - "epoch": 0.33, - "grad_norm": 0.05432068929076195, - "learning_rate": 0.00018715204219789668, - "loss": 0.6701, - "step": 595 - }, - { - "epoch": 0.33, 
- "grad_norm": 0.040836673229932785, - "learning_rate": 0.00018710904978245894, - "loss": 0.7228, - "step": 596 - }, - { - "epoch": 0.33, - "grad_norm": 0.04091890901327133, - "learning_rate": 0.00018706599051070808, - "loss": 0.7615, - "step": 597 - }, - { - "epoch": 0.33, - "grad_norm": 0.039113231003284454, - "learning_rate": 0.00018702286441569206, - "loss": 0.8016, - "step": 598 - }, - { - "epoch": 0.33, - "grad_norm": 0.03947889804840088, - "learning_rate": 0.00018697967153051028, - "loss": 0.8327, - "step": 599 - }, - { - "epoch": 0.33, - "grad_norm": 0.03761962801218033, - "learning_rate": 0.0001869364118883133, - "loss": 0.7843, - "step": 600 - }, - { - "epoch": 0.33, - "grad_norm": 0.035844236612319946, - "learning_rate": 0.00018689308552230296, - "loss": 0.7678, - "step": 601 - }, - { - "epoch": 0.33, - "grad_norm": 0.04012085869908333, - "learning_rate": 0.00018684969246573235, - "loss": 0.8195, - "step": 602 - }, - { - "epoch": 0.34, - "grad_norm": 0.04106171056628227, - "learning_rate": 0.00018680623275190564, - "loss": 0.8068, - "step": 603 - }, - { - "epoch": 0.34, - "grad_norm": 0.03810250759124756, - "learning_rate": 0.00018676270641417822, - "loss": 0.6382, - "step": 604 - }, - { - "epoch": 0.34, - "grad_norm": 0.04322788119316101, - "learning_rate": 0.00018671911348595667, - "loss": 0.7498, - "step": 605 - }, - { - "epoch": 0.34, - "grad_norm": 0.04026930034160614, - "learning_rate": 0.00018667545400069858, - "loss": 0.7506, - "step": 606 - }, - { - "epoch": 0.34, - "grad_norm": 0.040973201394081116, - "learning_rate": 0.00018663172799191264, - "loss": 0.7408, - "step": 607 - }, - { - "epoch": 0.34, - "grad_norm": 0.03879684954881668, - "learning_rate": 0.00018658793549315868, - "loss": 0.805, - "step": 608 - }, - { - "epoch": 0.34, - "grad_norm": 0.038516003638505936, - "learning_rate": 0.00018654407653804746, - "loss": 0.6352, - "step": 609 - }, - { - "epoch": 0.34, - "grad_norm": 0.041547078639268875, - "learning_rate": 0.0001865001511602408, - "loss": 0.7504, - "step": 610 - }, - { - "epoch": 0.34, - "grad_norm": 0.03565856069326401, - "learning_rate": 0.0001864561593934515, - "loss": 0.6952, - "step": 611 - }, - { - "epoch": 0.34, - "grad_norm": 0.043429479002952576, - "learning_rate": 0.00018641210127144327, - "loss": 0.7584, - "step": 612 - }, - { - "epoch": 0.34, - "grad_norm": 0.043735940009355545, - "learning_rate": 0.00018636797682803082, - "loss": 0.6309, - "step": 613 - }, - { - "epoch": 0.34, - "grad_norm": 0.03792436793446541, - "learning_rate": 0.00018632378609707967, - "loss": 0.6673, - "step": 614 - }, - { - "epoch": 0.34, - "grad_norm": 0.03878699988126755, - "learning_rate": 0.0001862795291125063, - "loss": 0.6326, - "step": 615 - }, - { - "epoch": 0.34, - "grad_norm": 0.04062122479081154, - "learning_rate": 0.00018623520590827799, - "loss": 0.6807, - "step": 616 - }, - { - "epoch": 0.34, - "grad_norm": 0.046398624777793884, - "learning_rate": 0.00018619081651841286, - "loss": 0.6643, - "step": 617 - }, - { - "epoch": 0.34, - "grad_norm": 0.04108712822198868, - "learning_rate": 0.00018614636097697985, - "loss": 0.7254, - "step": 618 - }, - { - "epoch": 0.34, - "grad_norm": 0.0392933115363121, - "learning_rate": 0.00018610183931809862, - "loss": 0.7912, - "step": 619 - }, - { - "epoch": 0.34, - "grad_norm": 0.03968581184744835, - "learning_rate": 0.00018605725157593958, - "loss": 0.7038, - "step": 620 - }, - { - "epoch": 0.35, - "grad_norm": 0.03825043514370918, - "learning_rate": 0.00018601259778472395, - "loss": 0.6573, - "step": 621 - }, - { - 
"epoch": 0.35, - "grad_norm": 0.04096253588795662, - "learning_rate": 0.00018596787797872354, - "loss": 0.7289, - "step": 622 - }, - { - "epoch": 0.35, - "grad_norm": 0.03817110136151314, - "learning_rate": 0.00018592309219226083, - "loss": 0.7687, - "step": 623 - }, - { - "epoch": 0.35, - "grad_norm": 0.0371723398566246, - "learning_rate": 0.00018587824045970903, - "loss": 0.631, - "step": 624 - }, - { - "epoch": 0.35, - "grad_norm": 0.04173294082283974, - "learning_rate": 0.0001858333228154919, - "loss": 0.7503, - "step": 625 - }, - { - "epoch": 0.35, - "grad_norm": 0.03969898074865341, - "learning_rate": 0.0001857883392940837, - "loss": 0.7999, - "step": 626 - }, - { - "epoch": 0.35, - "grad_norm": 0.034285057336091995, - "learning_rate": 0.00018574328993000946, - "loss": 0.742, - "step": 627 - }, - { - "epoch": 0.35, - "grad_norm": 0.03918811306357384, - "learning_rate": 0.0001856981747578446, - "loss": 0.647, - "step": 628 - }, - { - "epoch": 0.35, - "grad_norm": 0.039601150900125504, - "learning_rate": 0.00018565299381221505, - "loss": 0.6368, - "step": 629 - }, - { - "epoch": 0.35, - "grad_norm": 0.04142673686146736, - "learning_rate": 0.0001856077471277972, - "loss": 0.8081, - "step": 630 - }, - { - "epoch": 0.35, - "grad_norm": 0.04136984422802925, - "learning_rate": 0.00018556243473931801, - "loss": 0.7583, - "step": 631 - }, - { - "epoch": 0.35, - "grad_norm": 0.046918828040361404, - "learning_rate": 0.0001855170566815548, - "loss": 0.8415, - "step": 632 - }, - { - "epoch": 0.35, - "grad_norm": 0.03764290362596512, - "learning_rate": 0.0001854716129893353, - "loss": 0.6575, - "step": 633 - }, - { - "epoch": 0.35, - "grad_norm": 0.04044364020228386, - "learning_rate": 0.00018542610369753755, - "loss": 0.6595, - "step": 634 - }, - { - "epoch": 0.35, - "grad_norm": 0.04079195857048035, - "learning_rate": 0.00018538052884109005, - "loss": 0.6936, - "step": 635 - }, - { - "epoch": 0.35, - "grad_norm": 0.04137120023369789, - "learning_rate": 0.00018533488845497148, - "loss": 0.7419, - "step": 636 - }, - { - "epoch": 0.35, - "grad_norm": 0.0409332811832428, - "learning_rate": 0.000185289182574211, - "loss": 0.7343, - "step": 637 - }, - { - "epoch": 0.35, - "grad_norm": 0.03855755180120468, - "learning_rate": 0.0001852434112338879, - "loss": 0.7341, - "step": 638 - }, - { - "epoch": 0.36, - "grad_norm": 0.04691479727625847, - "learning_rate": 0.0001851975744691317, - "loss": 0.6813, - "step": 639 - }, - { - "epoch": 0.36, - "grad_norm": 0.04192007705569267, - "learning_rate": 0.00018515167231512224, - "loss": 0.8011, - "step": 640 - }, - { - "epoch": 0.36, - "grad_norm": 0.04521464928984642, - "learning_rate": 0.00018510570480708942, - "loss": 0.7823, - "step": 641 - }, - { - "epoch": 0.36, - "grad_norm": 0.035000029951334, - "learning_rate": 0.0001850596719803134, - "loss": 0.5621, - "step": 642 - }, - { - "epoch": 0.36, - "grad_norm": 0.04110797494649887, - "learning_rate": 0.00018501357387012447, - "loss": 0.6851, - "step": 643 - }, - { - "epoch": 0.36, - "grad_norm": 0.04273466393351555, - "learning_rate": 0.00018496741051190297, - "loss": 0.7967, - "step": 644 - }, - { - "epoch": 0.36, - "grad_norm": 0.04418378323316574, - "learning_rate": 0.0001849211819410793, - "loss": 0.7684, - "step": 645 - }, - { - "epoch": 0.36, - "grad_norm": 0.03610142320394516, - "learning_rate": 0.00018487488819313402, - "loss": 0.6515, - "step": 646 - }, - { - "epoch": 0.36, - "grad_norm": 0.041490521281957626, - "learning_rate": 0.0001848285293035976, - "loss": 0.8379, - "step": 647 - }, - { - "epoch": 
0.36, - "grad_norm": 0.041976895183324814, - "learning_rate": 0.0001847821053080505, - "loss": 0.7772, - "step": 648 - }, - { - "epoch": 0.36, - "grad_norm": 0.041484154760837555, - "learning_rate": 0.0001847356162421233, - "loss": 0.7797, - "step": 649 - }, - { - "epoch": 0.36, - "grad_norm": 0.04266678914427757, - "learning_rate": 0.00018468906214149638, - "loss": 0.681, - "step": 650 - }, - { - "epoch": 0.36, - "grad_norm": 0.041691817343235016, - "learning_rate": 0.00018464244304190004, - "loss": 0.8475, - "step": 651 - }, - { - "epoch": 0.36, - "grad_norm": 0.039543136954307556, - "learning_rate": 0.00018459575897911455, - "loss": 0.5914, - "step": 652 - }, - { - "epoch": 0.36, - "grad_norm": 0.035682205110788345, - "learning_rate": 0.00018454900998896996, - "loss": 0.6921, - "step": 653 - }, - { - "epoch": 0.36, - "grad_norm": 0.04010608792304993, - "learning_rate": 0.0001845021961073462, - "loss": 0.7686, - "step": 654 - }, - { - "epoch": 0.36, - "grad_norm": 0.04509714990854263, - "learning_rate": 0.00018445531737017298, - "loss": 0.7965, - "step": 655 - }, - { - "epoch": 0.36, - "grad_norm": 0.04406216740608215, - "learning_rate": 0.0001844083738134298, - "loss": 0.8243, - "step": 656 - }, - { - "epoch": 0.37, - "grad_norm": 0.03819778561592102, - "learning_rate": 0.00018436136547314593, - "loss": 0.6683, - "step": 657 - }, - { - "epoch": 0.37, - "grad_norm": 0.04046725109219551, - "learning_rate": 0.00018431429238540027, - "loss": 0.8624, - "step": 658 - }, - { - "epoch": 0.37, - "grad_norm": 0.04344424977898598, - "learning_rate": 0.00018426715458632153, - "loss": 0.7828, - "step": 659 - }, - { - "epoch": 0.37, - "grad_norm": 0.040390148758888245, - "learning_rate": 0.00018421995211208802, - "loss": 0.7757, - "step": 660 - }, - { - "epoch": 0.37, - "grad_norm": 0.03947126492857933, - "learning_rate": 0.00018417268499892767, - "loss": 0.6989, - "step": 661 - }, - { - "epoch": 0.37, - "grad_norm": 0.04135662689805031, - "learning_rate": 0.00018412535328311814, - "loss": 0.847, - "step": 662 - }, - { - "epoch": 0.37, - "grad_norm": 0.049522459506988525, - "learning_rate": 0.00018407795700098648, - "loss": 0.7971, - "step": 663 - }, - { - "epoch": 0.37, - "grad_norm": 0.040498074144124985, - "learning_rate": 0.0001840304961889095, - "loss": 0.7314, - "step": 664 - }, - { - "epoch": 0.37, - "grad_norm": 0.05009998753666878, - "learning_rate": 0.00018398297088331332, - "loss": 0.7435, - "step": 665 - }, - { - "epoch": 0.37, - "grad_norm": 0.03603378310799599, - "learning_rate": 0.0001839353811206738, - "loss": 0.7178, - "step": 666 - }, - { - "epoch": 0.37, - "grad_norm": 0.037989791482686996, - "learning_rate": 0.00018388772693751602, - "loss": 0.8039, - "step": 667 - }, - { - "epoch": 0.37, - "grad_norm": 0.03886855021119118, - "learning_rate": 0.00018384000837041477, - "loss": 0.7001, - "step": 668 - }, - { - "epoch": 0.37, - "grad_norm": 0.04233938828110695, - "learning_rate": 0.00018379222545599402, - "loss": 0.7929, - "step": 669 - }, - { - "epoch": 0.37, - "grad_norm": 0.04034002870321274, - "learning_rate": 0.00018374437823092724, - "loss": 0.6706, - "step": 670 - }, - { - "epoch": 0.37, - "grad_norm": 0.04359966889023781, - "learning_rate": 0.00018369646673193724, - "loss": 0.825, - "step": 671 - }, - { - "epoch": 0.37, - "grad_norm": 0.04203416034579277, - "learning_rate": 0.0001836484909957962, - "loss": 0.8887, - "step": 672 - }, - { - "epoch": 0.37, - "grad_norm": 0.04705547168850899, - "learning_rate": 0.0001836004510593255, - "loss": 0.7003, - "step": 673 - }, - { - 
"epoch": 0.37, - "grad_norm": 0.040125973522663116, - "learning_rate": 0.00018355234695939586, - "loss": 0.7106, - "step": 674 - }, - { - "epoch": 0.38, - "grad_norm": 0.040442850440740585, - "learning_rate": 0.0001835041787329273, - "loss": 0.6736, - "step": 675 - }, - { - "epoch": 0.38, - "grad_norm": 0.04361211508512497, - "learning_rate": 0.00018345594641688893, - "loss": 0.7427, - "step": 676 - }, - { - "epoch": 0.38, - "grad_norm": 0.039316337555646896, - "learning_rate": 0.0001834076500482992, - "loss": 0.7529, - "step": 677 - }, - { - "epoch": 0.38, - "grad_norm": 0.041209638118743896, - "learning_rate": 0.00018335928966422557, - "loss": 0.7345, - "step": 678 - }, - { - "epoch": 0.38, - "grad_norm": 0.04208279401063919, - "learning_rate": 0.0001833108653017847, - "loss": 0.7544, - "step": 679 - }, - { - "epoch": 0.38, - "grad_norm": 0.042629461735486984, - "learning_rate": 0.0001832623769981424, - "loss": 0.6676, - "step": 680 - }, - { - "epoch": 0.38, - "grad_norm": 0.038116682320833206, - "learning_rate": 0.00018321382479051347, - "loss": 0.705, - "step": 681 - }, - { - "epoch": 0.38, - "grad_norm": 0.03879912942647934, - "learning_rate": 0.0001831652087161618, - "loss": 0.7405, - "step": 682 - }, - { - "epoch": 0.38, - "grad_norm": 0.04380761831998825, - "learning_rate": 0.00018311652881240032, - "loss": 0.6791, - "step": 683 - }, - { - "epoch": 0.38, - "grad_norm": 0.03818592429161072, - "learning_rate": 0.00018306778511659085, - "loss": 0.6545, - "step": 684 - }, - { - "epoch": 0.38, - "grad_norm": 0.0418897345662117, - "learning_rate": 0.00018301897766614435, - "loss": 0.815, - "step": 685 - }, - { - "epoch": 0.38, - "grad_norm": 0.04104520007967949, - "learning_rate": 0.00018297010649852052, - "loss": 0.6821, - "step": 686 - }, - { - "epoch": 0.38, - "grad_norm": 0.04211374744772911, - "learning_rate": 0.0001829211716512281, - "loss": 0.778, - "step": 687 - }, - { - "epoch": 0.38, - "grad_norm": 0.045832112431526184, - "learning_rate": 0.00018287217316182458, - "loss": 0.8436, - "step": 688 - }, - { - "epoch": 0.38, - "grad_norm": 0.042089615017175674, - "learning_rate": 0.00018282311106791645, - "loss": 0.6899, - "step": 689 - }, - { - "epoch": 0.38, - "grad_norm": 0.039306432008743286, - "learning_rate": 0.0001827739854071589, - "loss": 0.6268, - "step": 690 - }, - { - "epoch": 0.38, - "grad_norm": 0.04297361150383949, - "learning_rate": 0.00018272479621725589, - "loss": 0.7461, - "step": 691 - }, - { - "epoch": 0.38, - "grad_norm": 0.04032345116138458, - "learning_rate": 0.00018267554353596025, - "loss": 0.797, - "step": 692 - }, - { - "epoch": 0.39, - "grad_norm": 0.04208484664559364, - "learning_rate": 0.0001826262274010735, - "loss": 0.8043, - "step": 693 - }, - { - "epoch": 0.39, - "grad_norm": 0.039476584643125534, - "learning_rate": 0.00018257684785044577, - "loss": 0.6953, - "step": 694 - }, - { - "epoch": 0.39, - "grad_norm": 0.040461137890815735, - "learning_rate": 0.00018252740492197598, - "loss": 0.754, - "step": 695 - }, - { - "epoch": 0.39, - "grad_norm": 0.04078752547502518, - "learning_rate": 0.00018247789865361166, - "loss": 0.6886, - "step": 696 - }, - { - "epoch": 0.39, - "grad_norm": 0.04459698870778084, - "learning_rate": 0.00018242832908334886, - "loss": 0.7355, - "step": 697 - }, - { - "epoch": 0.39, - "grad_norm": 0.039952464401721954, - "learning_rate": 0.00018237869624923236, - "loss": 0.7305, - "step": 698 - }, - { - "epoch": 0.39, - "grad_norm": 0.037101615220308304, - "learning_rate": 0.0001823290001893554, - "loss": 0.7426, - "step": 699 - }, 
- { - "epoch": 0.39, - "grad_norm": 0.050815049558877945, - "learning_rate": 0.0001822792409418598, - "loss": 0.6909, - "step": 700 - }, - { - "epoch": 0.39, - "grad_norm": 0.04514652490615845, - "learning_rate": 0.0001822294185449358, - "loss": 0.8794, - "step": 701 - }, - { - "epoch": 0.39, - "grad_norm": 0.04079204425215721, - "learning_rate": 0.00018217953303682223, - "loss": 0.6969, - "step": 702 - }, - { - "epoch": 0.39, - "grad_norm": 0.03969675302505493, - "learning_rate": 0.0001821295844558062, - "loss": 0.6827, - "step": 703 - }, - { - "epoch": 0.39, - "grad_norm": 0.04383305460214615, - "learning_rate": 0.0001820795728402234, - "loss": 0.8434, - "step": 704 - }, - { - "epoch": 0.39, - "grad_norm": 0.039204008877277374, - "learning_rate": 0.00018202949822845773, - "loss": 0.6717, - "step": 705 - }, - { - "epoch": 0.39, - "grad_norm": 0.03731399402022362, - "learning_rate": 0.00018197936065894157, - "loss": 0.5858, - "step": 706 - }, - { - "epoch": 0.39, - "grad_norm": 0.04733480513095856, - "learning_rate": 0.00018192916017015557, - "loss": 0.7383, - "step": 707 - }, - { - "epoch": 0.39, - "grad_norm": 0.04360615089535713, - "learning_rate": 0.00018187889680062864, - "loss": 0.716, - "step": 708 - }, - { - "epoch": 0.39, - "grad_norm": 0.040373023599386215, - "learning_rate": 0.00018182857058893803, - "loss": 0.6859, - "step": 709 - }, - { - "epoch": 0.39, - "grad_norm": 0.041170086711645126, - "learning_rate": 0.00018177818157370915, - "loss": 0.6335, - "step": 710 - }, - { - "epoch": 0.4, - "grad_norm": 0.039953090250492096, - "learning_rate": 0.00018172772979361567, - "loss": 0.6232, - "step": 711 - }, - { - "epoch": 0.4, - "grad_norm": 0.04438021779060364, - "learning_rate": 0.0001816772152873793, - "loss": 0.7427, - "step": 712 - }, - { - "epoch": 0.4, - "grad_norm": 0.037649158388376236, - "learning_rate": 0.00018162663809377012, - "loss": 0.6732, - "step": 713 - }, - { - "epoch": 0.4, - "grad_norm": 0.04502052068710327, - "learning_rate": 0.0001815759982516061, - "loss": 0.8635, - "step": 714 - }, - { - "epoch": 0.4, - "grad_norm": 0.04553660750389099, - "learning_rate": 0.00018152529579975343, - "loss": 0.7883, - "step": 715 - }, - { - "epoch": 0.4, - "grad_norm": 0.038996774703264236, - "learning_rate": 0.00018147453077712634, - "loss": 0.7818, - "step": 716 - }, - { - "epoch": 0.4, - "grad_norm": 0.03996030241250992, - "learning_rate": 0.000181423703222687, - "loss": 0.7777, - "step": 717 - }, - { - "epoch": 0.4, - "grad_norm": 0.04063892737030983, - "learning_rate": 0.0001813728131754456, - "loss": 0.7201, - "step": 718 - }, - { - "epoch": 0.4, - "grad_norm": 0.04045387730002403, - "learning_rate": 0.00018132186067446043, - "loss": 0.8018, - "step": 719 - }, - { - "epoch": 0.4, - "grad_norm": 0.042122699320316315, - "learning_rate": 0.0001812708457588375, - "loss": 0.7921, - "step": 720 - }, - { - "epoch": 0.4, - "grad_norm": 0.04201977327466011, - "learning_rate": 0.00018121976846773084, - "loss": 0.7049, - "step": 721 - }, - { - "epoch": 0.4, - "grad_norm": 0.03838672116398811, - "learning_rate": 0.00018116862884034243, - "loss": 0.6836, - "step": 722 - }, - { - "epoch": 0.4, - "grad_norm": 0.044829823076725006, - "learning_rate": 0.0001811174269159219, - "loss": 0.765, - "step": 723 - }, - { - "epoch": 0.4, - "grad_norm": 0.04038670286536217, - "learning_rate": 0.00018106616273376683, - "loss": 0.7528, - "step": 724 - }, - { - "epoch": 0.4, - "grad_norm": 0.041412707418203354, - "learning_rate": 0.00018101483633322255, - "loss": 0.8253, - "step": 725 - }, - { - 
"epoch": 0.4, - "grad_norm": 0.04251568764448166, - "learning_rate": 0.00018096344775368214, - "loss": 0.7483, - "step": 726 - }, - { - "epoch": 0.4, - "grad_norm": 0.043436773121356964, - "learning_rate": 0.0001809119970345864, - "loss": 0.7989, - "step": 727 - }, - { - "epoch": 0.4, - "grad_norm": 0.04096989333629608, - "learning_rate": 0.00018086048421542383, - "loss": 0.7947, - "step": 728 - }, - { - "epoch": 0.41, - "grad_norm": 0.04080723971128464, - "learning_rate": 0.0001808089093357306, - "loss": 0.7878, - "step": 729 - }, - { - "epoch": 0.41, - "grad_norm": 0.043036088347435, - "learning_rate": 0.0001807572724350905, - "loss": 0.7725, - "step": 730 - }, - { - "epoch": 0.41, - "grad_norm": 0.040440741926431656, - "learning_rate": 0.0001807055735531349, - "loss": 0.688, - "step": 731 - }, - { - "epoch": 0.41, - "grad_norm": 0.044196490198373795, - "learning_rate": 0.00018065381272954278, - "loss": 0.8786, - "step": 732 - }, - { - "epoch": 0.41, - "grad_norm": 0.042028769850730896, - "learning_rate": 0.00018060199000404062, - "loss": 0.8395, - "step": 733 - }, - { - "epoch": 0.41, - "grad_norm": 0.04810195043683052, - "learning_rate": 0.00018055010541640244, - "loss": 0.7793, - "step": 734 - }, - { - "epoch": 0.41, - "grad_norm": 0.03826632723212242, - "learning_rate": 0.0001804981590064498, - "loss": 0.6893, - "step": 735 - }, - { - "epoch": 0.41, - "grad_norm": 0.04325885698199272, - "learning_rate": 0.00018044615081405153, - "loss": 0.8006, - "step": 736 - }, - { - "epoch": 0.41, - "grad_norm": 0.039932068437337875, - "learning_rate": 0.00018039408087912402, - "loss": 0.6664, - "step": 737 - }, - { - "epoch": 0.41, - "grad_norm": 0.04278779402375221, - "learning_rate": 0.00018034194924163103, - "loss": 0.7397, - "step": 738 - }, - { - "epoch": 0.41, - "grad_norm": 0.03901227191090584, - "learning_rate": 0.0001802897559415837, - "loss": 0.5662, - "step": 739 - }, - { - "epoch": 0.41, - "grad_norm": 0.043960485607385635, - "learning_rate": 0.0001802375010190404, - "loss": 0.7809, - "step": 740 - }, - { - "epoch": 0.41, - "grad_norm": 0.04186437651515007, - "learning_rate": 0.0001801851845141069, - "loss": 0.6976, - "step": 741 - }, - { - "epoch": 0.41, - "grad_norm": 0.041978247463703156, - "learning_rate": 0.00018013280646693612, - "loss": 0.7146, - "step": 742 - }, - { - "epoch": 0.41, - "grad_norm": 0.04281838610768318, - "learning_rate": 0.00018008036691772835, - "loss": 0.7627, - "step": 743 - }, - { - "epoch": 0.41, - "grad_norm": 0.0473211370408535, - "learning_rate": 0.00018002786590673098, - "loss": 0.8042, - "step": 744 - }, - { - "epoch": 0.41, - "grad_norm": 0.04652298986911774, - "learning_rate": 0.0001799753034742386, - "loss": 0.731, - "step": 745 - }, - { - "epoch": 0.41, - "grad_norm": 0.039468780159950256, - "learning_rate": 0.000179922679660593, - "loss": 0.7135, - "step": 746 - }, - { - "epoch": 0.42, - "grad_norm": 0.04733370244503021, - "learning_rate": 0.00017986999450618295, - "loss": 0.8023, - "step": 747 - }, - { - "epoch": 0.42, - "grad_norm": 0.04231337085366249, - "learning_rate": 0.00017981724805144443, - "loss": 0.7528, - "step": 748 - }, - { - "epoch": 0.42, - "grad_norm": 0.04062468931078911, - "learning_rate": 0.00017976444033686043, - "loss": 0.7331, - "step": 749 - }, - { - "epoch": 0.42, - "grad_norm": 0.04496033489704132, - "learning_rate": 0.0001797115714029609, - "loss": 0.6968, - "step": 750 - }, - { - "epoch": 0.42, - "grad_norm": 0.04302246496081352, - "learning_rate": 0.00017965864129032284, - "loss": 0.8159, - "step": 751 - }, - { - 
"epoch": 0.42, - "grad_norm": 0.04262755811214447, - "learning_rate": 0.00017960565003957018, - "loss": 0.7148, - "step": 752 - }, - { - "epoch": 0.42, - "grad_norm": 0.040617603808641434, - "learning_rate": 0.00017955259769137375, - "loss": 0.7302, - "step": 753 - }, - { - "epoch": 0.42, - "grad_norm": 0.04194014146924019, - "learning_rate": 0.00017949948428645134, - "loss": 0.6785, - "step": 754 - }, - { - "epoch": 0.42, - "grad_norm": 0.04514269158244133, - "learning_rate": 0.00017944630986556752, - "loss": 0.748, - "step": 755 - }, - { - "epoch": 0.42, - "grad_norm": 0.042905841022729874, - "learning_rate": 0.0001793930744695338, - "loss": 0.7357, - "step": 756 - }, - { - "epoch": 0.42, - "grad_norm": 0.04359228163957596, - "learning_rate": 0.00017933977813920833, - "loss": 0.7426, - "step": 757 - }, - { - "epoch": 0.42, - "grad_norm": 0.040619876235723495, - "learning_rate": 0.00017928642091549613, - "loss": 0.7278, - "step": 758 - }, - { - "epoch": 0.42, - "grad_norm": 0.039674315601587296, - "learning_rate": 0.000179233002839349, - "loss": 0.7459, - "step": 759 - }, - { - "epoch": 0.42, - "grad_norm": 0.039900798350572586, - "learning_rate": 0.00017917952395176537, - "loss": 0.6963, - "step": 760 - }, - { - "epoch": 0.42, - "grad_norm": 0.039163414388895035, - "learning_rate": 0.0001791259842937903, - "loss": 0.652, - "step": 761 - }, - { - "epoch": 0.42, - "grad_norm": 0.04467619210481644, - "learning_rate": 0.0001790723839065156, - "loss": 0.585, - "step": 762 - }, - { - "epoch": 0.42, - "grad_norm": 0.03953075036406517, - "learning_rate": 0.00017901872283107966, - "loss": 0.7537, - "step": 763 - }, - { - "epoch": 0.42, - "grad_norm": 0.0408446304500103, - "learning_rate": 0.0001789650011086674, - "loss": 0.7833, - "step": 764 - }, - { - "epoch": 0.43, - "grad_norm": 0.05530129000544548, - "learning_rate": 0.00017891121878051025, - "loss": 0.8796, - "step": 765 - }, - { - "epoch": 0.43, - "grad_norm": 0.03987415134906769, - "learning_rate": 0.00017885737588788634, - "loss": 0.6762, - "step": 766 - }, - { - "epoch": 0.43, - "grad_norm": 0.042246054857969284, - "learning_rate": 0.00017880347247212005, - "loss": 0.7387, - "step": 767 - }, - { - "epoch": 0.43, - "grad_norm": 0.04334084317088127, - "learning_rate": 0.00017874950857458242, - "loss": 0.7526, - "step": 768 - }, - { - "epoch": 0.43, - "grad_norm": 0.04815899208188057, - "learning_rate": 0.00017869548423669077, - "loss": 0.8278, - "step": 769 - }, - { - "epoch": 0.43, - "grad_norm": 0.05985521152615547, - "learning_rate": 0.00017864139949990885, - "loss": 0.6726, - "step": 770 - }, - { - "epoch": 0.43, - "grad_norm": 0.038774825632572174, - "learning_rate": 0.00017858725440574676, - "loss": 0.6549, - "step": 771 - }, - { - "epoch": 0.43, - "grad_norm": 0.09850037842988968, - "learning_rate": 0.00017853304899576093, - "loss": 1.1263, - "step": 772 - }, - { - "epoch": 0.43, - "grad_norm": 0.0418662466108799, - "learning_rate": 0.0001784787833115541, - "loss": 0.6866, - "step": 773 - }, - { - "epoch": 0.43, - "grad_norm": 0.04924052208662033, - "learning_rate": 0.00017842445739477532, - "loss": 0.7591, - "step": 774 - }, - { - "epoch": 0.43, - "grad_norm": 0.04036710783839226, - "learning_rate": 0.0001783700712871197, - "loss": 0.7177, - "step": 775 - }, - { - "epoch": 0.43, - "grad_norm": 0.04530615732073784, - "learning_rate": 0.00017831562503032867, - "loss": 0.8464, - "step": 776 - }, - { - "epoch": 0.43, - "grad_norm": 0.04415806382894516, - "learning_rate": 0.00017826111866618986, - "loss": 0.6993, - "step": 777 - }, - { 
- "epoch": 0.43, - "grad_norm": 0.04042017087340355, - "learning_rate": 0.00017820655223653692, - "loss": 0.6305, - "step": 778 - }, - { - "epoch": 0.43, - "grad_norm": 0.04573072865605354, - "learning_rate": 0.0001781519257832497, - "loss": 0.8541, - "step": 779 - }, - { - "epoch": 0.43, - "grad_norm": 0.04222429171204567, - "learning_rate": 0.00017809723934825405, - "loss": 0.6579, - "step": 780 - }, - { - "epoch": 0.43, - "grad_norm": 0.04141012579202652, - "learning_rate": 0.0001780424929735219, - "loss": 0.7556, - "step": 781 - }, - { - "epoch": 0.43, - "grad_norm": 0.049307554960250854, - "learning_rate": 0.00017798768670107114, - "loss": 0.6392, - "step": 782 - }, - { - "epoch": 0.44, - "grad_norm": 0.0450025275349617, - "learning_rate": 0.0001779328205729657, - "loss": 0.7718, - "step": 783 - }, - { - "epoch": 0.44, - "grad_norm": 0.043787937611341476, - "learning_rate": 0.00017787789463131538, - "loss": 0.7086, - "step": 784 - }, - { - "epoch": 0.44, - "grad_norm": 0.03968477621674538, - "learning_rate": 0.0001778229089182759, - "loss": 0.6744, - "step": 785 - }, - { - "epoch": 0.44, - "grad_norm": 0.044153615832328796, - "learning_rate": 0.00017776786347604892, - "loss": 0.7169, - "step": 786 - }, - { - "epoch": 0.44, - "grad_norm": 0.04364906996488571, - "learning_rate": 0.00017771275834688183, - "loss": 0.7977, - "step": 787 - }, - { - "epoch": 0.44, - "grad_norm": 0.04209036007523537, - "learning_rate": 0.00017765759357306796, - "loss": 0.6939, - "step": 788 - }, - { - "epoch": 0.44, - "grad_norm": 0.04396096616983414, - "learning_rate": 0.00017760236919694627, - "loss": 0.7431, - "step": 789 - }, - { - "epoch": 0.44, - "grad_norm": 0.039003144949674606, - "learning_rate": 0.00017754708526090157, - "loss": 0.6491, - "step": 790 - }, - { - "epoch": 0.44, - "grad_norm": 0.04044824466109276, - "learning_rate": 0.00017749174180736442, - "loss": 0.6875, - "step": 791 - }, - { - "epoch": 0.44, - "grad_norm": 0.04009506106376648, - "learning_rate": 0.0001774363388788109, - "loss": 0.6727, - "step": 792 - }, - { - "epoch": 0.44, - "grad_norm": 0.044321801513433456, - "learning_rate": 0.00017738087651776287, - "loss": 0.6705, - "step": 793 - }, - { - "epoch": 0.44, - "grad_norm": 0.047294605523347855, - "learning_rate": 0.00017732535476678777, - "loss": 0.8299, - "step": 794 - }, - { - "epoch": 0.44, - "grad_norm": 0.04393976926803589, - "learning_rate": 0.00017726977366849865, - "loss": 0.8299, - "step": 795 - }, - { - "epoch": 0.44, - "grad_norm": 0.038830842822790146, - "learning_rate": 0.000177214133265554, - "loss": 0.768, - "step": 796 - }, - { - "epoch": 0.44, - "grad_norm": 0.04319118335843086, - "learning_rate": 0.00017715843360065797, - "loss": 0.83, - "step": 797 - }, - { - "epoch": 0.44, - "grad_norm": 0.04209451377391815, - "learning_rate": 0.00017710267471656016, - "loss": 0.6847, - "step": 798 - }, - { - "epoch": 0.44, - "grad_norm": 0.04092387482523918, - "learning_rate": 0.00017704685665605546, - "loss": 0.7254, - "step": 799 - }, - { - "epoch": 0.44, - "grad_norm": 0.041155993938446045, - "learning_rate": 0.00017699097946198444, - "loss": 0.6125, - "step": 800 - }, - { - "epoch": 0.45, - "grad_norm": 0.03681296482682228, - "learning_rate": 0.00017693504317723284, - "loss": 0.6512, - "step": 801 - }, - { - "epoch": 0.45, - "grad_norm": 0.04595888406038284, - "learning_rate": 0.00017687904784473188, - "loss": 0.7719, - "step": 802 - }, - { - "epoch": 0.45, - "grad_norm": 0.04086902737617493, - "learning_rate": 0.00017682299350745803, - "loss": 0.6339, - "step": 803 - }, 
- { - "epoch": 0.45, - "grad_norm": 0.04581453278660774, - "learning_rate": 0.00017676688020843307, - "loss": 0.6704, - "step": 804 - }, - { - "epoch": 0.45, - "grad_norm": 0.041530586779117584, - "learning_rate": 0.00017671070799072402, - "loss": 0.79, - "step": 805 - }, - { - "epoch": 0.45, - "grad_norm": 0.04776669666171074, - "learning_rate": 0.0001766544768974432, - "loss": 0.782, - "step": 806 - }, - { - "epoch": 0.45, - "grad_norm": 0.041267361491918564, - "learning_rate": 0.00017659818697174796, - "loss": 0.6647, - "step": 807 - }, - { - "epoch": 0.45, - "grad_norm": 0.04318636655807495, - "learning_rate": 0.00017654183825684092, - "loss": 0.7395, - "step": 808 - }, - { - "epoch": 0.45, - "grad_norm": 0.04291536659002304, - "learning_rate": 0.00017648543079596982, - "loss": 0.7857, - "step": 809 - }, - { - "epoch": 0.45, - "grad_norm": 0.04914848133921623, - "learning_rate": 0.00017642896463242747, - "loss": 0.8949, - "step": 810 - }, - { - "epoch": 0.45, - "grad_norm": 0.042548660188913345, - "learning_rate": 0.00017637243980955168, - "loss": 0.6978, - "step": 811 - }, - { - "epoch": 0.45, - "grad_norm": 0.044454239308834076, - "learning_rate": 0.00017631585637072536, - "loss": 0.7645, - "step": 812 - }, - { - "epoch": 0.45, - "grad_norm": 0.044261105358600616, - "learning_rate": 0.00017625921435937637, - "loss": 0.704, - "step": 813 - }, - { - "epoch": 0.45, - "grad_norm": 0.03984127193689346, - "learning_rate": 0.00017620251381897752, - "loss": 0.6803, - "step": 814 - }, - { - "epoch": 0.45, - "grad_norm": 0.04928235337138176, - "learning_rate": 0.00017614575479304662, - "loss": 0.6964, - "step": 815 - }, - { - "epoch": 0.45, - "grad_norm": 0.03964222967624664, - "learning_rate": 0.00017608893732514616, - "loss": 0.7508, - "step": 816 - }, - { - "epoch": 0.45, - "grad_norm": 0.04107435792684555, - "learning_rate": 0.00017603206145888373, - "loss": 0.6977, - "step": 817 - }, - { - "epoch": 0.45, - "grad_norm": 0.03979944810271263, - "learning_rate": 0.00017597512723791162, - "loss": 0.6823, - "step": 818 - }, - { - "epoch": 0.46, - "grad_norm": 0.04214751720428467, - "learning_rate": 0.00017591813470592692, - "loss": 0.756, - "step": 819 - }, - { - "epoch": 0.46, - "grad_norm": 0.04152442887425423, - "learning_rate": 0.00017586108390667144, - "loss": 0.6977, - "step": 820 - }, - { - "epoch": 0.46, - "grad_norm": 0.04409961402416229, - "learning_rate": 0.00017580397488393176, - "loss": 0.7103, - "step": 821 - }, - { - "epoch": 0.46, - "grad_norm": 0.04341358318924904, - "learning_rate": 0.00017574680768153917, - "loss": 0.7553, - "step": 822 - }, - { - "epoch": 0.46, - "grad_norm": 0.040573328733444214, - "learning_rate": 0.00017568958234336952, - "loss": 0.6273, - "step": 823 - }, - { - "epoch": 0.46, - "grad_norm": 0.04173550009727478, - "learning_rate": 0.00017563229891334338, - "loss": 0.6156, - "step": 824 - }, - { - "epoch": 0.46, - "grad_norm": 0.043724823743104935, - "learning_rate": 0.00017557495743542585, - "loss": 0.681, - "step": 825 - }, - { - "epoch": 0.46, - "grad_norm": 0.04628165438771248, - "learning_rate": 0.00017551755795362656, - "loss": 0.707, - "step": 826 - }, - { - "epoch": 0.46, - "grad_norm": 0.0463298037648201, - "learning_rate": 0.00017546010051199971, - "loss": 0.7532, - "step": 827 - }, - { - "epoch": 0.46, - "grad_norm": 0.04120299220085144, - "learning_rate": 0.00017540258515464396, - "loss": 0.6755, - "step": 828 - }, - { - "epoch": 0.46, - "grad_norm": 0.04657519608736038, - "learning_rate": 0.00017534501192570246, - "loss": 0.794, - "step": 829 
- }, - { - "epoch": 0.46, - "grad_norm": 0.04502912238240242, - "learning_rate": 0.0001752873808693627, - "loss": 0.7433, - "step": 830 - }, - { - "epoch": 0.46, - "grad_norm": 0.05056634172797203, - "learning_rate": 0.00017522969202985657, - "loss": 0.8462, - "step": 831 - }, - { - "epoch": 0.46, - "grad_norm": 0.04288472607731819, - "learning_rate": 0.00017517194545146037, - "loss": 0.6889, - "step": 832 - }, - { - "epoch": 0.46, - "grad_norm": 0.044761136174201965, - "learning_rate": 0.0001751141411784947, - "loss": 0.6657, - "step": 833 - }, - { - "epoch": 0.46, - "grad_norm": 0.04813416302204132, - "learning_rate": 0.00017505627925532442, - "loss": 0.7993, - "step": 834 - }, - { - "epoch": 0.46, - "grad_norm": 0.04681932181119919, - "learning_rate": 0.00017499835972635856, - "loss": 0.8888, - "step": 835 - }, - { - "epoch": 0.46, - "grad_norm": 0.04293239116668701, - "learning_rate": 0.0001749403826360505, - "loss": 0.7787, - "step": 836 - }, - { - "epoch": 0.47, - "grad_norm": 0.04279159754514694, - "learning_rate": 0.0001748823480288977, - "loss": 0.6838, - "step": 837 - }, - { - "epoch": 0.47, - "grad_norm": 0.04303467273712158, - "learning_rate": 0.00017482425594944184, - "loss": 0.7993, - "step": 838 - }, - { - "epoch": 0.47, - "grad_norm": 0.044777967035770416, - "learning_rate": 0.0001747661064422686, - "loss": 0.8355, - "step": 839 - }, - { - "epoch": 0.47, - "grad_norm": 0.04495497792959213, - "learning_rate": 0.00017470789955200788, - "loss": 0.8078, - "step": 840 - }, - { - "epoch": 0.47, - "grad_norm": 0.05037945136427879, - "learning_rate": 0.00017464963532333352, - "loss": 0.6933, - "step": 841 - }, - { - "epoch": 0.47, - "grad_norm": 0.04452716186642647, - "learning_rate": 0.0001745913138009634, - "loss": 0.7993, - "step": 842 - }, - { - "epoch": 0.47, - "grad_norm": 0.042573876678943634, - "learning_rate": 0.0001745329350296593, - "loss": 0.6218, - "step": 843 - }, - { - "epoch": 0.47, - "grad_norm": 0.04359611123800278, - "learning_rate": 0.00017447449905422713, - "loss": 0.7138, - "step": 844 - }, - { - "epoch": 0.47, - "grad_norm": 0.03893902152776718, - "learning_rate": 0.00017441600591951647, - "loss": 0.5625, - "step": 845 - }, - { - "epoch": 0.47, - "grad_norm": 0.04326329007744789, - "learning_rate": 0.00017435745567042095, - "loss": 0.808, - "step": 846 - }, - { - "epoch": 0.47, - "grad_norm": 0.04221475496888161, - "learning_rate": 0.0001742988483518779, - "loss": 0.6852, - "step": 847 - }, - { - "epoch": 0.47, - "grad_norm": 0.04148104414343834, - "learning_rate": 0.0001742401840088686, - "loss": 0.7226, - "step": 848 - }, - { - "epoch": 0.47, - "grad_norm": 0.043071821331977844, - "learning_rate": 0.00017418146268641794, - "loss": 0.7685, - "step": 849 - }, - { - "epoch": 0.47, - "grad_norm": 0.04110527038574219, - "learning_rate": 0.00017412268442959465, - "loss": 0.6696, - "step": 850 - }, - { - "epoch": 0.47, - "grad_norm": 0.04123936966061592, - "learning_rate": 0.00017406384928351113, - "loss": 0.6247, - "step": 851 - }, - { - "epoch": 0.47, - "grad_norm": 0.04614235460758209, - "learning_rate": 0.00017400495729332337, - "loss": 0.7239, - "step": 852 - }, - { - "epoch": 0.47, - "grad_norm": 0.04393448308110237, - "learning_rate": 0.00017394600850423114, - "loss": 0.6983, - "step": 853 - }, - { - "epoch": 0.47, - "grad_norm": 0.04823063313961029, - "learning_rate": 0.00017388700296147765, - "loss": 0.8802, - "step": 854 - }, - { - "epoch": 0.48, - "grad_norm": 0.04590068385004997, - "learning_rate": 0.00017382794071034975, - "loss": 0.764, - "step": 855 
- }, - { - "epoch": 0.48, - "grad_norm": 0.043450817465782166, - "learning_rate": 0.00017376882179617783, - "loss": 0.6518, - "step": 856 - }, - { - "epoch": 0.48, - "grad_norm": 0.0406193882226944, - "learning_rate": 0.00017370964626433567, - "loss": 0.6845, - "step": 857 - }, - { - "epoch": 0.48, - "grad_norm": 0.0440056286752224, - "learning_rate": 0.00017365041416024065, - "loss": 0.7249, - "step": 858 - }, - { - "epoch": 0.48, - "grad_norm": 0.044533830136060715, - "learning_rate": 0.00017359112552935347, - "loss": 0.7045, - "step": 859 - }, - { - "epoch": 0.48, - "grad_norm": 0.04595007747411728, - "learning_rate": 0.00017353178041717814, - "loss": 0.7224, - "step": 860 - }, - { - "epoch": 0.48, - "grad_norm": 0.04445382580161095, - "learning_rate": 0.00017347237886926225, - "loss": 0.7391, - "step": 861 - }, - { - "epoch": 0.48, - "grad_norm": 0.044705115258693695, - "learning_rate": 0.0001734129209311965, - "loss": 0.8049, - "step": 862 - }, - { - "epoch": 0.48, - "grad_norm": 0.03983869031071663, - "learning_rate": 0.00017335340664861493, - "loss": 0.6245, - "step": 863 - }, - { - "epoch": 0.48, - "grad_norm": 0.03903631865978241, - "learning_rate": 0.00017329383606719481, - "loss": 0.6075, - "step": 864 - }, - { - "epoch": 0.48, - "grad_norm": 0.0470118448138237, - "learning_rate": 0.00017323420923265673, - "loss": 0.7978, - "step": 865 - }, - { - "epoch": 0.48, - "grad_norm": 0.046046625822782516, - "learning_rate": 0.00017317452619076428, - "loss": 0.7536, - "step": 866 - }, - { - "epoch": 0.48, - "grad_norm": 0.044642165303230286, - "learning_rate": 0.00017311478698732433, - "loss": 0.6868, - "step": 867 - }, - { - "epoch": 0.48, - "grad_norm": 0.043679915368556976, - "learning_rate": 0.0001730549916681868, - "loss": 0.6715, - "step": 868 - }, - { - "epoch": 0.48, - "grad_norm": 0.04374048486351967, - "learning_rate": 0.0001729951402792446, - "loss": 0.7951, - "step": 869 - }, - { - "epoch": 0.48, - "grad_norm": 0.04199199378490448, - "learning_rate": 0.00017293523286643386, - "loss": 0.6943, - "step": 870 - }, - { - "epoch": 0.48, - "grad_norm": 0.045180678367614746, - "learning_rate": 0.00017287526947573354, - "loss": 0.7037, - "step": 871 - }, - { - "epoch": 0.48, - "grad_norm": 0.04861563816666603, - "learning_rate": 0.0001728152501531656, - "loss": 0.808, - "step": 872 - }, - { - "epoch": 0.49, - "grad_norm": 0.06491879373788834, - "learning_rate": 0.000172755174944795, - "loss": 0.7232, - "step": 873 - }, - { - "epoch": 0.49, - "grad_norm": 0.056284647434949875, - "learning_rate": 0.0001726950438967295, - "loss": 0.7938, - "step": 874 - }, - { - "epoch": 0.49, - "grad_norm": 0.04053090140223503, - "learning_rate": 0.00017263485705511984, - "loss": 0.67, - "step": 875 - }, - { - "epoch": 0.49, - "grad_norm": 0.04802017658948898, - "learning_rate": 0.00017257461446615942, - "loss": 0.8086, - "step": 876 - }, - { - "epoch": 0.49, - "grad_norm": 0.055473774671554565, - "learning_rate": 0.00017251431617608452, - "loss": 0.6176, - "step": 877 - }, - { - "epoch": 0.49, - "grad_norm": 0.04549332708120346, - "learning_rate": 0.0001724539622311742, - "loss": 0.6832, - "step": 878 - }, - { - "epoch": 0.49, - "grad_norm": 0.040823038667440414, - "learning_rate": 0.00017239355267775018, - "loss": 0.6617, - "step": 879 - }, - { - "epoch": 0.49, - "grad_norm": 0.045911453664302826, - "learning_rate": 0.00017233308756217682, - "loss": 0.7098, - "step": 880 - }, - { - "epoch": 0.49, - "grad_norm": 0.044026948511600494, - "learning_rate": 0.00017227256693086123, - "loss": 0.7281, - 
"step": 881 - }, - { - "epoch": 0.49, - "grad_norm": 0.04080135002732277, - "learning_rate": 0.00017221199083025307, - "loss": 0.6427, - "step": 882 - }, - { - "epoch": 0.49, - "grad_norm": 0.04171103984117508, - "learning_rate": 0.0001721513593068446, - "loss": 0.77, - "step": 883 - }, - { - "epoch": 0.49, - "grad_norm": 0.04672902449965477, - "learning_rate": 0.00017209067240717057, - "loss": 0.6622, - "step": 884 - }, - { - "epoch": 0.49, - "grad_norm": 0.047331150621175766, - "learning_rate": 0.00017202993017780823, - "loss": 0.6539, - "step": 885 - }, - { - "epoch": 0.49, - "grad_norm": 0.041491780430078506, - "learning_rate": 0.0001719691326653774, - "loss": 0.693, - "step": 886 - }, - { - "epoch": 0.49, - "grad_norm": 0.04602917283773422, - "learning_rate": 0.00017190827991654014, - "loss": 0.6925, - "step": 887 - }, - { - "epoch": 0.49, - "grad_norm": 0.04347170516848564, - "learning_rate": 0.00017184737197800115, - "loss": 0.6765, - "step": 888 - }, - { - "epoch": 0.49, - "grad_norm": 0.04298747330904007, - "learning_rate": 0.00017178640889650723, - "loss": 0.6898, - "step": 889 - }, - { - "epoch": 0.49, - "grad_norm": 0.04869624972343445, - "learning_rate": 0.0001717253907188477, - "loss": 0.8587, - "step": 890 - }, - { - "epoch": 0.5, - "grad_norm": 0.04220546409487724, - "learning_rate": 0.0001716643174918541, - "loss": 0.6139, - "step": 891 - }, - { - "epoch": 0.5, - "grad_norm": 0.04683152586221695, - "learning_rate": 0.00017160318926240015, - "loss": 0.7845, - "step": 892 - }, - { - "epoch": 0.5, - "grad_norm": 0.04245654493570328, - "learning_rate": 0.0001715420060774019, - "loss": 0.6194, - "step": 893 - }, - { - "epoch": 0.5, - "grad_norm": 0.051294971257448196, - "learning_rate": 0.00017148076798381755, - "loss": 0.7762, - "step": 894 - }, - { - "epoch": 0.5, - "grad_norm": 0.05587690323591232, - "learning_rate": 0.00017141947502864738, - "loss": 0.7732, - "step": 895 - }, - { - "epoch": 0.5, - "grad_norm": 0.04731575399637222, - "learning_rate": 0.00017135812725893381, - "loss": 0.7229, - "step": 896 - }, - { - "epoch": 0.5, - "grad_norm": 0.04307183623313904, - "learning_rate": 0.00017129672472176134, - "loss": 0.6533, - "step": 897 - }, - { - "epoch": 0.5, - "grad_norm": 0.044035330414772034, - "learning_rate": 0.00017123526746425652, - "loss": 0.7754, - "step": 898 - }, - { - "epoch": 0.5, - "grad_norm": 0.04161735251545906, - "learning_rate": 0.00017117375553358786, - "loss": 0.6989, - "step": 899 - }, - { - "epoch": 0.5, - "grad_norm": 0.0438537523150444, - "learning_rate": 0.00017111218897696587, - "loss": 0.742, - "step": 900 - }, - { - "epoch": 0.5, - "grad_norm": 0.04731033742427826, - "learning_rate": 0.00017105056784164294, - "loss": 0.6689, - "step": 901 - }, - { - "epoch": 0.5, - "grad_norm": 0.04555543512105942, - "learning_rate": 0.00017098889217491338, - "loss": 0.7309, - "step": 902 - }, - { - "epoch": 0.5, - "grad_norm": 0.041630081832408905, - "learning_rate": 0.00017092716202411336, - "loss": 0.6845, - "step": 903 - }, - { - "epoch": 0.5, - "grad_norm": 0.045560967177152634, - "learning_rate": 0.00017086537743662086, - "loss": 0.6913, - "step": 904 - }, - { - "epoch": 0.5, - "grad_norm": 0.03997902199625969, - "learning_rate": 0.00017080353845985559, - "loss": 0.6016, - "step": 905 - }, - { - "epoch": 0.5, - "grad_norm": 0.04500821977853775, - "learning_rate": 0.0001707416451412791, - "loss": 0.7355, - "step": 906 - }, - { - "epoch": 0.5, - "grad_norm": 0.04442552849650383, - "learning_rate": 0.00017067969752839458, - "loss": 0.6525, - "step": 907 - 
}, - { - "epoch": 0.51, - "grad_norm": 0.04527292773127556, - "learning_rate": 0.0001706176956687469, - "loss": 0.6991, - "step": 908 - }, - { - "epoch": 0.51, - "grad_norm": 0.04470342397689819, - "learning_rate": 0.00017055563960992256, - "loss": 0.7352, - "step": 909 - }, - { - "epoch": 0.51, - "grad_norm": 0.04706728830933571, - "learning_rate": 0.00017049352939954967, - "loss": 0.7081, - "step": 910 - }, - { - "epoch": 0.51, - "grad_norm": 0.049026861786842346, - "learning_rate": 0.00017043136508529793, - "loss": 0.833, - "step": 911 - }, - { - "epoch": 0.51, - "grad_norm": 0.04529300704598427, - "learning_rate": 0.00017036914671487852, - "loss": 0.6624, - "step": 912 - }, - { - "epoch": 0.51, - "grad_norm": 0.05243083834648132, - "learning_rate": 0.0001703068743360441, - "loss": 0.9235, - "step": 913 - }, - { - "epoch": 0.51, - "grad_norm": 0.050981760025024414, - "learning_rate": 0.00017024454799658884, - "loss": 0.866, - "step": 914 - }, - { - "epoch": 0.51, - "grad_norm": 0.045431166887283325, - "learning_rate": 0.00017018216774434828, - "loss": 0.7997, - "step": 915 - }, - { - "epoch": 0.51, - "grad_norm": 0.049921710044145584, - "learning_rate": 0.00017011973362719932, - "loss": 0.6015, - "step": 916 - }, - { - "epoch": 0.51, - "grad_norm": 0.04454491659998894, - "learning_rate": 0.00017005724569306026, - "loss": 0.6789, - "step": 917 - }, - { - "epoch": 0.51, - "grad_norm": 0.04739166796207428, - "learning_rate": 0.0001699947039898907, - "loss": 0.7727, - "step": 918 - }, - { - "epoch": 0.51, - "grad_norm": 0.04357178509235382, - "learning_rate": 0.0001699321085656914, - "loss": 0.71, - "step": 919 - }, - { - "epoch": 0.51, - "grad_norm": 0.04351083189249039, - "learning_rate": 0.00016986945946850446, - "loss": 0.7437, - "step": 920 - }, - { - "epoch": 0.51, - "grad_norm": 0.04492335394024849, - "learning_rate": 0.00016980675674641322, - "loss": 0.7705, - "step": 921 - }, - { - "epoch": 0.51, - "grad_norm": 0.04446355253458023, - "learning_rate": 0.000169744000447542, - "loss": 0.6758, - "step": 922 - }, - { - "epoch": 0.51, - "grad_norm": 0.04474789649248123, - "learning_rate": 0.00016968119062005642, - "loss": 0.665, - "step": 923 - }, - { - "epoch": 0.51, - "grad_norm": 0.03871838003396988, - "learning_rate": 0.00016961832731216307, - "loss": 0.644, - "step": 924 - }, - { - "epoch": 0.51, - "grad_norm": 0.04366254433989525, - "learning_rate": 0.00016955541057210965, - "loss": 0.8131, - "step": 925 - }, - { - "epoch": 0.52, - "grad_norm": 0.04740358516573906, - "learning_rate": 0.0001694924404481848, - "loss": 0.6964, - "step": 926 - }, - { - "epoch": 0.52, - "grad_norm": 0.04698627069592476, - "learning_rate": 0.00016942941698871818, - "loss": 0.6736, - "step": 927 - }, - { - "epoch": 0.52, - "grad_norm": 0.0437115915119648, - "learning_rate": 0.00016936634024208047, - "loss": 0.7068, - "step": 928 - }, - { - "epoch": 0.52, - "grad_norm": 0.04518444836139679, - "learning_rate": 0.00016930321025668306, - "loss": 0.7357, - "step": 929 - }, - { - "epoch": 0.52, - "grad_norm": 0.0465158186852932, - "learning_rate": 0.00016924002708097833, - "loss": 0.7876, - "step": 930 - }, - { - "epoch": 0.52, - "grad_norm": 0.04457319527864456, - "learning_rate": 0.00016917679076345943, - "loss": 0.7335, - "step": 931 - }, - { - "epoch": 0.52, - "grad_norm": 0.05104387551546097, - "learning_rate": 0.00016911350135266035, - "loss": 0.7055, - "step": 932 - }, - { - "epoch": 0.52, - "grad_norm": 0.04391132667660713, - "learning_rate": 0.0001690501588971558, - "loss": 0.7218, - "step": 933 - }, - 
{ - "epoch": 0.52, - "grad_norm": 0.042982205748558044, - "learning_rate": 0.00016898676344556118, - "loss": 0.7202, - "step": 934 - }, - { - "epoch": 0.52, - "grad_norm": 0.042211346328258514, - "learning_rate": 0.00016892331504653259, - "loss": 0.708, - "step": 935 - }, - { - "epoch": 0.52, - "grad_norm": 0.043890226632356644, - "learning_rate": 0.00016885981374876677, - "loss": 0.7524, - "step": 936 - }, - { - "epoch": 0.52, - "grad_norm": 0.044012345373630524, - "learning_rate": 0.00016879625960100104, - "loss": 0.7103, - "step": 937 - }, - { - "epoch": 0.52, - "grad_norm": 0.0452919527888298, - "learning_rate": 0.0001687326526520133, - "loss": 0.7012, - "step": 938 - }, - { - "epoch": 0.52, - "grad_norm": 0.04428261145949364, - "learning_rate": 0.00016866899295062197, - "loss": 0.6329, - "step": 939 - }, - { - "epoch": 0.52, - "grad_norm": 0.043744850903749466, - "learning_rate": 0.00016860528054568597, - "loss": 0.7682, - "step": 940 - }, - { - "epoch": 0.52, - "grad_norm": 0.0442328006029129, - "learning_rate": 0.00016854151548610462, - "loss": 0.6931, - "step": 941 - }, - { - "epoch": 0.52, - "grad_norm": 0.044094718992710114, - "learning_rate": 0.00016847769782081772, - "loss": 0.7495, - "step": 942 - }, - { - "epoch": 0.52, - "grad_norm": 0.041264671832323074, - "learning_rate": 0.00016841382759880542, - "loss": 0.6035, - "step": 943 - }, - { - "epoch": 0.53, - "grad_norm": 0.03848943114280701, - "learning_rate": 0.00016834990486908817, - "loss": 0.6, - "step": 944 - }, - { - "epoch": 0.53, - "grad_norm": 0.0429924838244915, - "learning_rate": 0.00016828592968072678, - "loss": 0.7149, - "step": 945 - }, - { - "epoch": 0.53, - "grad_norm": 0.037183403968811035, - "learning_rate": 0.00016822190208282226, - "loss": 0.6253, - "step": 946 - }, - { - "epoch": 0.53, - "grad_norm": 0.04193463176488876, - "learning_rate": 0.00016815782212451592, - "loss": 0.7332, - "step": 947 - }, - { - "epoch": 0.53, - "grad_norm": 0.04574836045503616, - "learning_rate": 0.00016809368985498918, - "loss": 0.676, - "step": 948 - }, - { - "epoch": 0.53, - "grad_norm": 0.044316504150629044, - "learning_rate": 0.0001680295053234637, - "loss": 0.6696, - "step": 949 - }, - { - "epoch": 0.53, - "grad_norm": 0.04649119824171066, - "learning_rate": 0.00016796526857920112, - "loss": 0.6853, - "step": 950 - }, - { - "epoch": 0.53, - "grad_norm": 0.041028618812561035, - "learning_rate": 0.00016790097967150325, - "loss": 0.5797, - "step": 951 - }, - { - "epoch": 0.53, - "grad_norm": 0.045178357511758804, - "learning_rate": 0.00016783663864971193, - "loss": 0.733, - "step": 952 - }, - { - "epoch": 0.53, - "grad_norm": 0.044117413461208344, - "learning_rate": 0.00016777224556320896, - "loss": 0.6571, - "step": 953 - }, - { - "epoch": 0.53, - "grad_norm": 0.04647046700119972, - "learning_rate": 0.00016770780046141616, - "loss": 0.7674, - "step": 954 - }, - { - "epoch": 0.53, - "grad_norm": 0.04585673660039902, - "learning_rate": 0.0001676433033937952, - "loss": 0.7153, - "step": 955 - }, - { - "epoch": 0.53, - "grad_norm": 0.04684153199195862, - "learning_rate": 0.00016757875440984768, - "loss": 0.712, - "step": 956 - }, - { - "epoch": 0.53, - "grad_norm": 0.05014558881521225, - "learning_rate": 0.000167514153559115, - "loss": 0.7872, - "step": 957 - }, - { - "epoch": 0.53, - "grad_norm": 0.04870294779539108, - "learning_rate": 0.00016744950089117846, - "loss": 0.7779, - "step": 958 - }, - { - "epoch": 0.53, - "grad_norm": 0.047236453741788864, - "learning_rate": 0.00016738479645565902, - "loss": 0.6254, - "step": 959 - 
}, - { - "epoch": 0.53, - "grad_norm": 0.04619355872273445, - "learning_rate": 0.00016732004030221745, - "loss": 0.7061, - "step": 960 - }, - { - "epoch": 0.53, - "grad_norm": 0.04691663756966591, - "learning_rate": 0.00016725523248055415, - "loss": 0.7212, - "step": 961 - }, - { - "epoch": 0.54, - "grad_norm": 0.046586498618125916, - "learning_rate": 0.00016719037304040922, - "loss": 0.7438, - "step": 962 - }, - { - "epoch": 0.54, - "grad_norm": 0.047182347625494, - "learning_rate": 0.00016712546203156243, - "loss": 0.7781, - "step": 963 - }, - { - "epoch": 0.54, - "grad_norm": 0.04682426154613495, - "learning_rate": 0.000167060499503833, - "loss": 0.7651, - "step": 964 - }, - { - "epoch": 0.54, - "grad_norm": 0.04150580242276192, - "learning_rate": 0.00016699548550707974, - "loss": 0.6261, - "step": 965 - }, - { - "epoch": 0.54, - "grad_norm": 0.047207970172166824, - "learning_rate": 0.00016693042009120106, - "loss": 0.7482, - "step": 966 - }, - { - "epoch": 0.54, - "grad_norm": 0.03972497582435608, - "learning_rate": 0.0001668653033061347, - "loss": 0.6568, - "step": 967 - }, - { - "epoch": 0.54, - "grad_norm": 0.05301308631896973, - "learning_rate": 0.00016680013520185786, - "loss": 0.8632, - "step": 968 - }, - { - "epoch": 0.54, - "grad_norm": 0.044552650302648544, - "learning_rate": 0.0001667349158283872, - "loss": 0.7905, - "step": 969 - }, - { - "epoch": 0.54, - "grad_norm": 0.0502367801964283, - "learning_rate": 0.0001666696452357787, - "loss": 0.694, - "step": 970 - }, - { - "epoch": 0.54, - "grad_norm": 0.047164347022771835, - "learning_rate": 0.00016660432347412752, - "loss": 0.6664, - "step": 971 - }, - { - "epoch": 0.54, - "grad_norm": 0.04584898054599762, - "learning_rate": 0.00016653895059356828, - "loss": 0.6614, - "step": 972 - }, - { - "epoch": 0.54, - "grad_norm": 0.04450858384370804, - "learning_rate": 0.00016647352664427473, - "loss": 0.7293, - "step": 973 - }, - { - "epoch": 0.54, - "grad_norm": 0.045772675424814224, - "learning_rate": 0.00016640805167645985, - "loss": 0.6825, - "step": 974 - }, - { - "epoch": 0.54, - "grad_norm": 0.04510961472988129, - "learning_rate": 0.00016634252574037575, - "loss": 0.628, - "step": 975 - }, - { - "epoch": 0.54, - "grad_norm": 0.0424661748111248, - "learning_rate": 0.00016627694888631377, - "loss": 0.7242, - "step": 976 - }, - { - "epoch": 0.54, - "grad_norm": 0.04516896605491638, - "learning_rate": 0.00016621132116460404, - "loss": 0.728, - "step": 977 - }, - { - "epoch": 0.54, - "grad_norm": 0.04178481549024582, - "learning_rate": 0.00016614564262561608, - "loss": 0.6749, - "step": 978 - }, - { - "epoch": 0.54, - "grad_norm": 0.04447667673230171, - "learning_rate": 0.00016607991331975822, - "loss": 0.7377, - "step": 979 - }, - { - "epoch": 0.55, - "grad_norm": 0.04504280164837837, - "learning_rate": 0.0001660141332974778, - "loss": 0.6817, - "step": 980 - }, - { - "epoch": 0.55, - "grad_norm": 0.0446137972176075, - "learning_rate": 0.00016594830260926102, - "loss": 0.7593, - "step": 981 - }, - { - "epoch": 0.55, - "grad_norm": 0.04577165096998215, - "learning_rate": 0.0001658824213056331, - "loss": 0.6594, - "step": 982 - }, - { - "epoch": 0.55, - "grad_norm": 0.04543627053499222, - "learning_rate": 0.00016581648943715796, - "loss": 0.7227, - "step": 983 - }, - { - "epoch": 0.55, - "grad_norm": 0.051089901477098465, - "learning_rate": 0.0001657505070544384, - "loss": 0.6931, - "step": 984 - }, - { - "epoch": 0.55, - "grad_norm": 0.04433685168623924, - "learning_rate": 0.00016568447420811612, - "loss": 0.7223, - "step": 985 - }, 
- { - "epoch": 0.55, - "grad_norm": 0.04630895331501961, - "learning_rate": 0.00016561839094887123, - "loss": 0.8788, - "step": 986 - }, - { - "epoch": 0.55, - "grad_norm": 0.04735618084669113, - "learning_rate": 0.00016555225732742281, - "loss": 0.6527, - "step": 987 - }, - { - "epoch": 0.55, - "grad_norm": 0.04732111841440201, - "learning_rate": 0.00016548607339452853, - "loss": 0.7427, - "step": 988 - }, - { - "epoch": 0.55, - "grad_norm": 0.04409315809607506, - "learning_rate": 0.0001654198392009846, - "loss": 0.7473, - "step": 989 - }, - { - "epoch": 0.55, - "grad_norm": 0.044037654995918274, - "learning_rate": 0.00016535355479762586, - "loss": 0.735, - "step": 990 - }, - { - "epoch": 0.55, - "grad_norm": 0.043528955429792404, - "learning_rate": 0.00016528722023532574, - "loss": 0.6415, - "step": 991 - }, - { - "epoch": 0.55, - "grad_norm": 0.04991381987929344, - "learning_rate": 0.00016522083556499596, - "loss": 0.7989, - "step": 992 - }, - { - "epoch": 0.55, - "grad_norm": 0.044087547808885574, - "learning_rate": 0.000165154400837587, - "loss": 0.7178, - "step": 993 - }, - { - "epoch": 0.55, - "grad_norm": 0.04327988997101784, - "learning_rate": 0.00016508791610408754, - "loss": 0.6599, - "step": 994 - }, - { - "epoch": 0.55, - "grad_norm": 0.04584032669663429, - "learning_rate": 0.0001650213814155247, - "loss": 0.6482, - "step": 995 - }, - { - "epoch": 0.55, - "grad_norm": 0.05077075958251953, - "learning_rate": 0.00016495479682296395, - "loss": 0.727, - "step": 996 - }, - { - "epoch": 0.55, - "grad_norm": 0.04770886152982712, - "learning_rate": 0.00016488816237750906, - "loss": 0.6737, - "step": 997 - }, - { - "epoch": 0.56, - "grad_norm": 0.04274902865290642, - "learning_rate": 0.00016482147813030203, - "loss": 0.6277, - "step": 998 - }, - { - "epoch": 0.56, - "grad_norm": 0.048387110233306885, - "learning_rate": 0.00016475474413252315, - "loss": 0.7836, - "step": 999 - }, - { - "epoch": 0.56, - "grad_norm": 0.04784883186221123, - "learning_rate": 0.0001646879604353908, - "loss": 0.8291, - "step": 1000 - }, - { - "epoch": 0.56, - "grad_norm": 0.047025181353092194, - "learning_rate": 0.00016462112709016163, - "loss": 0.7344, - "step": 1001 - }, - { - "epoch": 0.56, - "grad_norm": 0.042724139988422394, - "learning_rate": 0.00016455424414813026, - "loss": 0.607, - "step": 1002 - }, - { - "epoch": 0.56, - "grad_norm": 0.04707411676645279, - "learning_rate": 0.00016448731166062946, - "loss": 0.7879, - "step": 1003 - }, - { - "epoch": 0.56, - "grad_norm": 0.048031605780124664, - "learning_rate": 0.00016442032967903, - "loss": 0.7665, - "step": 1004 - }, - { - "epoch": 0.56, - "grad_norm": 0.0522460862994194, - "learning_rate": 0.00016435329825474065, - "loss": 0.8697, - "step": 1005 - }, - { - "epoch": 0.56, - "grad_norm": 0.043403007090091705, - "learning_rate": 0.00016428621743920814, - "loss": 0.6733, - "step": 1006 - }, - { - "epoch": 0.56, - "grad_norm": 0.04464132711291313, - "learning_rate": 0.00016421908728391703, - "loss": 0.6495, - "step": 1007 - }, - { - "epoch": 0.56, - "grad_norm": 0.04746426269412041, - "learning_rate": 0.00016415190784038983, - "loss": 0.6364, - "step": 1008 - }, - { - "epoch": 0.56, - "grad_norm": 0.04561330005526543, - "learning_rate": 0.00016408467916018688, - "loss": 0.7492, - "step": 1009 - }, - { - "epoch": 0.56, - "grad_norm": 0.046681590378284454, - "learning_rate": 0.00016401740129490624, - "loss": 0.6807, - "step": 1010 - }, - { - "epoch": 0.56, - "grad_norm": 0.04807223007082939, - "learning_rate": 0.00016395007429618382, - "loss": 0.7732, - 
"step": 1011 - }, - { - "epoch": 0.56, - "grad_norm": 0.04678153246641159, - "learning_rate": 0.00016388269821569312, - "loss": 0.753, - "step": 1012 - }, - { - "epoch": 0.56, - "grad_norm": 0.0424916073679924, - "learning_rate": 0.0001638152731051454, - "loss": 0.6378, - "step": 1013 - }, - { - "epoch": 0.56, - "grad_norm": 0.04331868886947632, - "learning_rate": 0.00016374779901628952, - "loss": 0.766, - "step": 1014 - }, - { - "epoch": 0.56, - "grad_norm": 0.043369825929403305, - "learning_rate": 0.00016368027600091194, - "loss": 0.6492, - "step": 1015 - }, - { - "epoch": 0.57, - "grad_norm": 0.04750450327992439, - "learning_rate": 0.00016361270411083666, - "loss": 0.7226, - "step": 1016 - }, - { - "epoch": 0.57, - "grad_norm": 0.04926105588674545, - "learning_rate": 0.0001635450833979252, - "loss": 0.7402, - "step": 1017 - }, - { - "epoch": 0.57, - "grad_norm": 0.04888901859521866, - "learning_rate": 0.00016347741391407655, - "loss": 0.7061, - "step": 1018 - }, - { - "epoch": 0.57, - "grad_norm": 0.046233415603637695, - "learning_rate": 0.00016340969571122712, - "loss": 0.7367, - "step": 1019 - }, - { - "epoch": 0.57, - "grad_norm": 0.0504348948597908, - "learning_rate": 0.00016334192884135074, - "loss": 0.7785, - "step": 1020 - }, - { - "epoch": 0.57, - "grad_norm": 0.04318249598145485, - "learning_rate": 0.00016327411335645853, - "loss": 0.6495, - "step": 1021 - }, - { - "epoch": 0.57, - "grad_norm": 0.04340077564120293, - "learning_rate": 0.00016320624930859904, - "loss": 0.6685, - "step": 1022 - }, - { - "epoch": 0.57, - "grad_norm": 0.04627622663974762, - "learning_rate": 0.00016313833674985796, - "loss": 0.6857, - "step": 1023 - }, - { - "epoch": 0.57, - "grad_norm": 0.04838955029845238, - "learning_rate": 0.0001630703757323583, - "loss": 0.7336, - "step": 1024 - }, - { - "epoch": 0.57, - "grad_norm": 0.04140018671751022, - "learning_rate": 0.00016300236630826024, - "loss": 0.6872, - "step": 1025 - }, - { - "epoch": 0.57, - "grad_norm": 0.04307800903916359, - "learning_rate": 0.000162934308529761, - "loss": 0.689, - "step": 1026 - }, - { - "epoch": 0.57, - "grad_norm": 0.046013642102479935, - "learning_rate": 0.00016286620244909518, - "loss": 0.676, - "step": 1027 - }, - { - "epoch": 0.57, - "grad_norm": 0.045836132019758224, - "learning_rate": 0.0001627980481185342, - "loss": 0.7321, - "step": 1028 - }, - { - "epoch": 0.57, - "grad_norm": 0.046595752239227295, - "learning_rate": 0.00016272984559038655, - "loss": 0.7788, - "step": 1029 - }, - { - "epoch": 0.57, - "grad_norm": 0.041627172380685806, - "learning_rate": 0.0001626615949169979, - "loss": 0.5976, - "step": 1030 - }, - { - "epoch": 0.57, - "grad_norm": 0.056944046169519424, - "learning_rate": 0.00016259329615075062, - "loss": 0.937, - "step": 1031 - }, - { - "epoch": 0.57, - "grad_norm": 0.0483165942132473, - "learning_rate": 0.00016252494934406415, - "loss": 0.6425, - "step": 1032 - }, - { - "epoch": 0.57, - "grad_norm": 0.05059583857655525, - "learning_rate": 0.00016245655454939474, - "loss": 0.7802, - "step": 1033 - }, - { - "epoch": 0.58, - "grad_norm": 0.04494575038552284, - "learning_rate": 0.0001623881118192355, - "loss": 0.709, - "step": 1034 - }, - { - "epoch": 0.58, - "grad_norm": 0.04515180364251137, - "learning_rate": 0.00016231962120611635, - "loss": 0.7219, - "step": 1035 - }, - { - "epoch": 0.58, - "grad_norm": 0.051081229001283646, - "learning_rate": 0.00016225108276260385, - "loss": 0.7955, - "step": 1036 - }, - { - "epoch": 0.58, - "grad_norm": 0.04739641770720482, - "learning_rate": 
0.00016218249654130138, - "loss": 0.7017, - "step": 1037 - }, - { - "epoch": 0.58, - "grad_norm": 0.048389632254838943, - "learning_rate": 0.000162113862594849, - "loss": 0.7864, - "step": 1038 - }, - { - "epoch": 0.58, - "grad_norm": 0.04880169779062271, - "learning_rate": 0.0001620451809759233, - "loss": 0.7688, - "step": 1039 - }, - { - "epoch": 0.58, - "grad_norm": 0.044919900596141815, - "learning_rate": 0.00016197645173723758, - "loss": 0.754, - "step": 1040 - }, - { - "epoch": 0.58, - "grad_norm": 0.04826303571462631, - "learning_rate": 0.00016190767493154154, - "loss": 0.7931, - "step": 1041 - }, - { - "epoch": 0.58, - "grad_norm": 0.042201928794384, - "learning_rate": 0.0001618388506116215, - "loss": 0.603, - "step": 1042 - }, - { - "epoch": 0.58, - "grad_norm": 0.051390066742897034, - "learning_rate": 0.00016176997883030026, - "loss": 0.7412, - "step": 1043 - }, - { - "epoch": 0.58, - "grad_norm": 0.045629408210515976, - "learning_rate": 0.00016170105964043695, - "loss": 0.7082, - "step": 1044 - }, - { - "epoch": 0.58, - "grad_norm": 0.04842764884233475, - "learning_rate": 0.0001616320930949272, - "loss": 0.7796, - "step": 1045 - }, - { - "epoch": 0.58, - "grad_norm": 0.04266642406582832, - "learning_rate": 0.00016156307924670288, - "loss": 0.6521, - "step": 1046 - }, - { - "epoch": 0.58, - "grad_norm": 0.04461183398962021, - "learning_rate": 0.00016149401814873223, - "loss": 0.7277, - "step": 1047 - }, - { - "epoch": 0.58, - "grad_norm": 0.044011376798152924, - "learning_rate": 0.0001614249098540197, - "loss": 0.7393, - "step": 1048 - }, - { - "epoch": 0.58, - "grad_norm": 0.04548850655555725, - "learning_rate": 0.0001613557544156061, - "loss": 0.6724, - "step": 1049 - }, - { - "epoch": 0.58, - "grad_norm": 0.04711510241031647, - "learning_rate": 0.00016128655188656817, - "loss": 0.75, - "step": 1050 - }, - { - "epoch": 0.58, - "grad_norm": 0.04443375766277313, - "learning_rate": 0.00016121730232001905, - "loss": 0.692, - "step": 1051 - }, - { - "epoch": 0.59, - "grad_norm": 0.047497253865003586, - "learning_rate": 0.00016114800576910788, - "loss": 0.6364, - "step": 1052 - }, - { - "epoch": 0.59, - "grad_norm": 0.04716882482171059, - "learning_rate": 0.00016107866228701981, - "loss": 0.8157, - "step": 1053 - }, - { - "epoch": 0.59, - "grad_norm": 0.049585748463869095, - "learning_rate": 0.0001610092719269761, - "loss": 0.7222, - "step": 1054 - }, - { - "epoch": 0.59, - "grad_norm": 0.04663019999861717, - "learning_rate": 0.0001609398347422339, - "loss": 0.674, - "step": 1055 - }, - { - "epoch": 0.59, - "grad_norm": 0.04515184462070465, - "learning_rate": 0.00016087035078608637, - "loss": 0.641, - "step": 1056 - }, - { - "epoch": 0.59, - "grad_norm": 0.04762982949614525, - "learning_rate": 0.0001608008201118625, - "loss": 0.6312, - "step": 1057 - }, - { - "epoch": 0.59, - "grad_norm": 0.04451216757297516, - "learning_rate": 0.00016073124277292728, - "loss": 0.7158, - "step": 1058 - }, - { - "epoch": 0.59, - "grad_norm": 0.0480036623775959, - "learning_rate": 0.0001606616188226813, - "loss": 0.7029, - "step": 1059 - }, - { - "epoch": 0.59, - "grad_norm": 0.04552299529314041, - "learning_rate": 0.00016059194831456107, - "loss": 0.7185, - "step": 1060 - }, - { - "epoch": 0.59, - "grad_norm": 0.04811175912618637, - "learning_rate": 0.0001605222313020388, - "loss": 0.5746, - "step": 1061 - }, - { - "epoch": 0.59, - "grad_norm": 0.045087020844221115, - "learning_rate": 0.0001604524678386224, - "loss": 0.7196, - "step": 1062 - }, - { - "epoch": 0.59, - "grad_norm": 0.05363542214035988, 
- "learning_rate": 0.00016038265797785542, - "loss": 0.7144, - "step": 1063 - }, - { - "epoch": 0.59, - "grad_norm": 0.05288343131542206, - "learning_rate": 0.00016031280177331706, - "loss": 0.7173, - "step": 1064 - }, - { - "epoch": 0.59, - "grad_norm": 0.046650927513837814, - "learning_rate": 0.00016024289927862195, - "loss": 0.789, - "step": 1065 - }, - { - "epoch": 0.59, - "grad_norm": 0.048903193324804306, - "learning_rate": 0.00016017295054742046, - "loss": 0.7106, - "step": 1066 - }, - { - "epoch": 0.59, - "grad_norm": 0.04834052920341492, - "learning_rate": 0.00016010295563339825, - "loss": 0.6996, - "step": 1067 - }, - { - "epoch": 0.59, - "grad_norm": 0.05408628657460213, - "learning_rate": 0.00016003291459027653, - "loss": 0.8056, - "step": 1068 - }, - { - "epoch": 0.59, - "grad_norm": 0.04541575536131859, - "learning_rate": 0.00015996282747181197, - "loss": 0.6443, - "step": 1069 - }, - { - "epoch": 0.6, - "grad_norm": 0.04552549123764038, - "learning_rate": 0.00015989269433179645, - "loss": 0.69, - "step": 1070 - }, - { - "epoch": 0.6, - "grad_norm": 0.0493139810860157, - "learning_rate": 0.00015982251522405727, - "loss": 0.7327, - "step": 1071 - }, - { - "epoch": 0.6, - "grad_norm": 0.049078866839408875, - "learning_rate": 0.000159752290202457, - "loss": 0.8271, - "step": 1072 - }, - { - "epoch": 0.6, - "grad_norm": 0.0488913394510746, - "learning_rate": 0.0001596820193208934, - "loss": 0.7298, - "step": 1073 - }, - { - "epoch": 0.6, - "grad_norm": 0.04880421981215477, - "learning_rate": 0.00015961170263329948, - "loss": 0.6863, - "step": 1074 - }, - { - "epoch": 0.6, - "grad_norm": 0.04644745960831642, - "learning_rate": 0.00015954134019364346, - "loss": 0.6754, - "step": 1075 - }, - { - "epoch": 0.6, - "grad_norm": 0.05141911655664444, - "learning_rate": 0.00015947093205592855, - "loss": 0.7986, - "step": 1076 - }, - { - "epoch": 0.6, - "grad_norm": 0.052950434386730194, - "learning_rate": 0.00015940047827419303, - "loss": 0.8126, - "step": 1077 - }, - { - "epoch": 0.6, - "grad_norm": 0.04494583606719971, - "learning_rate": 0.0001593299789025104, - "loss": 0.724, - "step": 1078 - }, - { - "epoch": 0.6, - "grad_norm": 0.051292311400175095, - "learning_rate": 0.00015925943399498898, - "loss": 0.7201, - "step": 1079 - }, - { - "epoch": 0.6, - "grad_norm": 0.049635812640190125, - "learning_rate": 0.000159188843605772, - "loss": 0.6293, - "step": 1080 - }, - { - "epoch": 0.6, - "grad_norm": 0.08753520250320435, - "learning_rate": 0.00015911820778903777, - "loss": 0.7986, - "step": 1081 - }, - { - "epoch": 0.6, - "grad_norm": 0.05015747994184494, - "learning_rate": 0.00015904752659899935, - "loss": 0.7774, - "step": 1082 - }, - { - "epoch": 0.6, - "grad_norm": 0.05062435567378998, - "learning_rate": 0.00015897680008990467, - "loss": 0.7423, - "step": 1083 - }, - { - "epoch": 0.6, - "grad_norm": 0.04625916853547096, - "learning_rate": 0.00015890602831603634, - "loss": 0.7179, - "step": 1084 - }, - { - "epoch": 0.6, - "grad_norm": 0.06423801183700562, - "learning_rate": 0.00015883521133171186, - "loss": 0.6282, - "step": 1085 - }, - { - "epoch": 0.6, - "grad_norm": 0.048834312707185745, - "learning_rate": 0.00015876434919128335, - "loss": 0.7035, - "step": 1086 - }, - { - "epoch": 0.6, - "grad_norm": 0.049968525767326355, - "learning_rate": 0.0001586934419491376, - "loss": 0.777, - "step": 1087 - }, - { - "epoch": 0.61, - "grad_norm": 0.04475943744182587, - "learning_rate": 0.00015862248965969604, - "loss": 0.6788, - "step": 1088 - }, - { - "epoch": 0.61, - "grad_norm": 
0.046110913157463074, - "learning_rate": 0.0001585514923774146, - "loss": 0.7237, - "step": 1089 - }, - { - "epoch": 0.61, - "grad_norm": 0.047238875180482864, - "learning_rate": 0.0001584804501567838, - "loss": 0.7412, - "step": 1090 - }, - { - "epoch": 0.61, - "grad_norm": 0.04489819332957268, - "learning_rate": 0.00015840936305232869, - "loss": 0.6327, - "step": 1091 - }, - { - "epoch": 0.61, - "grad_norm": 0.04712117463350296, - "learning_rate": 0.00015833823111860863, - "loss": 0.7175, - "step": 1092 - }, - { - "epoch": 0.61, - "grad_norm": 0.05276874080300331, - "learning_rate": 0.0001582670544102175, - "loss": 0.626, - "step": 1093 - }, - { - "epoch": 0.61, - "grad_norm": 0.047401316463947296, - "learning_rate": 0.0001581958329817836, - "loss": 0.7426, - "step": 1094 - }, - { - "epoch": 0.61, - "grad_norm": 0.049675095826387405, - "learning_rate": 0.0001581245668879694, - "loss": 0.7332, - "step": 1095 - }, - { - "epoch": 0.61, - "grad_norm": 0.048146385699510574, - "learning_rate": 0.00015805325618347172, - "loss": 0.7209, - "step": 1096 - }, - { - "epoch": 0.61, - "grad_norm": 0.04951384291052818, - "learning_rate": 0.00015798190092302164, - "loss": 0.6746, - "step": 1097 - }, - { - "epoch": 0.61, - "grad_norm": 0.0463559590280056, - "learning_rate": 0.0001579105011613844, - "loss": 0.6808, - "step": 1098 - }, - { - "epoch": 0.61, - "grad_norm": 0.04673503711819649, - "learning_rate": 0.00015783905695335946, - "loss": 0.6946, - "step": 1099 - }, - { - "epoch": 0.61, - "grad_norm": 0.04709373414516449, - "learning_rate": 0.0001577675683537803, - "loss": 0.691, - "step": 1100 - }, - { - "epoch": 0.61, - "grad_norm": 0.05507972836494446, - "learning_rate": 0.00015769603541751455, - "loss": 0.7342, - "step": 1101 - }, - { - "epoch": 0.61, - "grad_norm": 0.043717097491025925, - "learning_rate": 0.00015762445819946384, - "loss": 0.6183, - "step": 1102 - }, - { - "epoch": 0.61, - "grad_norm": 0.04856884479522705, - "learning_rate": 0.0001575528367545637, - "loss": 0.7489, - "step": 1103 - }, - { - "epoch": 0.61, - "grad_norm": 0.04887085407972336, - "learning_rate": 0.0001574811711377838, - "loss": 0.7202, - "step": 1104 - }, - { - "epoch": 0.61, - "grad_norm": 0.05098757520318031, - "learning_rate": 0.00015740946140412753, - "loss": 0.7171, - "step": 1105 - }, - { - "epoch": 0.62, - "grad_norm": 0.05076475441455841, - "learning_rate": 0.0001573377076086322, - "loss": 0.6581, - "step": 1106 - }, - { - "epoch": 0.62, - "grad_norm": 0.05398886650800705, - "learning_rate": 0.00015726590980636896, - "loss": 0.8532, - "step": 1107 - }, - { - "epoch": 0.62, - "grad_norm": 0.05237511545419693, - "learning_rate": 0.00015719406805244276, - "loss": 0.6418, - "step": 1108 - }, - { - "epoch": 0.62, - "grad_norm": 0.05127400532364845, - "learning_rate": 0.00015712218240199218, - "loss": 0.75, - "step": 1109 - }, - { - "epoch": 0.62, - "grad_norm": 0.051004305481910706, - "learning_rate": 0.0001570502529101896, - "loss": 0.6545, - "step": 1110 - }, - { - "epoch": 0.62, - "grad_norm": 0.05179214105010033, - "learning_rate": 0.000156978279632241, - "loss": 0.6854, - "step": 1111 - }, - { - "epoch": 0.62, - "grad_norm": 0.04017964377999306, - "learning_rate": 0.0001569062626233859, - "loss": 0.5733, - "step": 1112 - }, - { - "epoch": 0.62, - "grad_norm": 0.05255338177084923, - "learning_rate": 0.00015683420193889753, - "loss": 0.9239, - "step": 1113 - }, - { - "epoch": 0.62, - "grad_norm": 0.04868584871292114, - "learning_rate": 0.00015676209763408253, - "loss": 0.7038, - "step": 1114 - }, - { - "epoch": 
0.62, - "grad_norm": 0.04898803308606148, - "learning_rate": 0.0001566899497642811, - "loss": 0.6677, - "step": 1115 - }, - { - "epoch": 0.62, - "grad_norm": 0.044663190841674805, - "learning_rate": 0.00015661775838486674, - "loss": 0.741, - "step": 1116 - }, - { - "epoch": 0.62, - "grad_norm": 0.048298414796590805, - "learning_rate": 0.00015654552355124648, - "loss": 0.7292, - "step": 1117 - }, - { - "epoch": 0.62, - "grad_norm": 0.04956801235675812, - "learning_rate": 0.00015647324531886065, - "loss": 0.7475, - "step": 1118 - }, - { - "epoch": 0.62, - "grad_norm": 0.051773227751255035, - "learning_rate": 0.0001564009237431829, - "loss": 0.751, - "step": 1119 - }, - { - "epoch": 0.62, - "grad_norm": 0.04530036076903343, - "learning_rate": 0.00015632855887972008, - "loss": 0.7123, - "step": 1120 - }, - { - "epoch": 0.62, - "grad_norm": 0.05154275894165039, - "learning_rate": 0.00015625615078401244, - "loss": 0.8531, - "step": 1121 - }, - { - "epoch": 0.62, - "grad_norm": 0.04365009814500809, - "learning_rate": 0.00015618369951163317, - "loss": 0.6508, - "step": 1122 - }, - { - "epoch": 0.62, - "grad_norm": 0.04828101769089699, - "learning_rate": 0.00015611120511818877, - "loss": 0.6855, - "step": 1123 - }, - { - "epoch": 0.63, - "grad_norm": 0.04952272027730942, - "learning_rate": 0.00015603866765931875, - "loss": 0.6806, - "step": 1124 - }, - { - "epoch": 0.63, - "grad_norm": 0.0446164608001709, - "learning_rate": 0.00015596608719069578, - "loss": 0.6364, - "step": 1125 - }, - { - "epoch": 0.63, - "grad_norm": 0.05198860913515091, - "learning_rate": 0.00015589346376802544, - "loss": 0.6415, - "step": 1126 - }, - { - "epoch": 0.63, - "grad_norm": 0.04873516410589218, - "learning_rate": 0.00015582079744704626, - "loss": 0.6668, - "step": 1127 - }, - { - "epoch": 0.63, - "grad_norm": 0.04862043261528015, - "learning_rate": 0.00015574808828352978, - "loss": 0.6277, - "step": 1128 - }, - { - "epoch": 0.63, - "grad_norm": 0.050693485885858536, - "learning_rate": 0.00015567533633328035, - "loss": 0.6917, - "step": 1129 - }, - { - "epoch": 0.63, - "grad_norm": 0.048907842487096786, - "learning_rate": 0.00015560254165213522, - "loss": 0.7091, - "step": 1130 - }, - { - "epoch": 0.63, - "grad_norm": 0.04532686248421669, - "learning_rate": 0.00015552970429596438, - "loss": 0.622, - "step": 1131 - }, - { - "epoch": 0.63, - "grad_norm": 0.043171364814043045, - "learning_rate": 0.00015545682432067067, - "loss": 0.6719, - "step": 1132 - }, - { - "epoch": 0.63, - "grad_norm": 0.05676614120602608, - "learning_rate": 0.00015538390178218946, - "loss": 0.8366, - "step": 1133 - }, - { - "epoch": 0.63, - "grad_norm": 0.045390404760837555, - "learning_rate": 0.00015531093673648897, - "loss": 0.6162, - "step": 1134 - }, - { - "epoch": 0.63, - "grad_norm": 0.05167704075574875, - "learning_rate": 0.00015523792923957, - "loss": 0.7946, - "step": 1135 - }, - { - "epoch": 0.63, - "grad_norm": 0.04950125515460968, - "learning_rate": 0.00015516487934746578, - "loss": 0.6541, - "step": 1136 - }, - { - "epoch": 0.63, - "grad_norm": 0.0524483397603035, - "learning_rate": 0.00015509178711624233, - "loss": 0.7793, - "step": 1137 - }, - { - "epoch": 0.63, - "grad_norm": 0.046446293592453, - "learning_rate": 0.00015501865260199795, - "loss": 0.6489, - "step": 1138 - }, - { - "epoch": 0.63, - "grad_norm": 0.0487704761326313, - "learning_rate": 0.00015494547586086352, - "loss": 0.6154, - "step": 1139 - }, - { - "epoch": 0.63, - "grad_norm": 0.04823431745171547, - "learning_rate": 0.00015487225694900224, - "loss": 0.6899, - 
"step": 1140 - }, - { - "epoch": 0.63, - "grad_norm": 0.04303416609764099, - "learning_rate": 0.00015479899592260983, - "loss": 0.5816, - "step": 1141 - }, - { - "epoch": 0.64, - "grad_norm": 0.048107780516147614, - "learning_rate": 0.0001547256928379141, - "loss": 0.6998, - "step": 1142 - }, - { - "epoch": 0.64, - "grad_norm": 0.04937562346458435, - "learning_rate": 0.0001546523477511754, - "loss": 0.7552, - "step": 1143 - }, - { - "epoch": 0.64, - "grad_norm": 0.05084756389260292, - "learning_rate": 0.00015457896071868602, - "loss": 0.6517, - "step": 1144 - }, - { - "epoch": 0.64, - "grad_norm": 0.04428539052605629, - "learning_rate": 0.00015450553179677075, - "loss": 0.6143, - "step": 1145 - }, - { - "epoch": 0.64, - "grad_norm": 0.050209447741508484, - "learning_rate": 0.0001544320610417863, - "loss": 0.6805, - "step": 1146 - }, - { - "epoch": 0.64, - "grad_norm": 0.04966219514608383, - "learning_rate": 0.0001543585485101216, - "loss": 0.7239, - "step": 1147 - }, - { - "epoch": 0.64, - "grad_norm": 0.0482448972761631, - "learning_rate": 0.00015428499425819764, - "loss": 0.7492, - "step": 1148 - }, - { - "epoch": 0.64, - "grad_norm": 0.05310830846428871, - "learning_rate": 0.00015421139834246737, - "loss": 0.7678, - "step": 1149 - }, - { - "epoch": 0.64, - "grad_norm": 0.04781487584114075, - "learning_rate": 0.0001541377608194158, - "loss": 0.7321, - "step": 1150 - }, - { - "epoch": 0.64, - "grad_norm": 0.050008464604616165, - "learning_rate": 0.00015406408174555976, - "loss": 0.7224, - "step": 1151 - }, - { - "epoch": 0.64, - "grad_norm": 0.047798726707696915, - "learning_rate": 0.00015399036117744812, - "loss": 0.6562, - "step": 1152 - }, - { - "epoch": 0.64, - "grad_norm": 0.04253503307700157, - "learning_rate": 0.00015391659917166143, - "loss": 0.613, - "step": 1153 - }, - { - "epoch": 0.64, - "grad_norm": 0.04759254679083824, - "learning_rate": 0.00015384279578481221, - "loss": 0.6502, - "step": 1154 - }, - { - "epoch": 0.64, - "grad_norm": 0.044068843126297, - "learning_rate": 0.00015376895107354464, - "loss": 0.6639, - "step": 1155 - }, - { - "epoch": 0.64, - "grad_norm": 0.04848353564739227, - "learning_rate": 0.00015369506509453458, - "loss": 0.6952, - "step": 1156 - }, - { - "epoch": 0.64, - "grad_norm": 0.046723756939172745, - "learning_rate": 0.00015362113790448967, - "loss": 0.5937, - "step": 1157 - }, - { - "epoch": 0.64, - "grad_norm": 0.050520990043878555, - "learning_rate": 0.0001535471695601491, - "loss": 0.7107, - "step": 1158 - }, - { - "epoch": 0.64, - "grad_norm": 0.04594268649816513, - "learning_rate": 0.00015347316011828373, - "loss": 0.6756, - "step": 1159 - }, - { - "epoch": 0.65, - "grad_norm": 0.05000408738851547, - "learning_rate": 0.00015339910963569584, - "loss": 0.6676, - "step": 1160 - }, - { - "epoch": 0.65, - "grad_norm": 0.05357789620757103, - "learning_rate": 0.00015332501816921928, - "loss": 0.7249, - "step": 1161 - }, - { - "epoch": 0.65, - "grad_norm": 0.051022421568632126, - "learning_rate": 0.00015325088577571939, - "loss": 0.7145, - "step": 1162 - }, - { - "epoch": 0.65, - "grad_norm": 0.04610150307416916, - "learning_rate": 0.00015317671251209285, - "loss": 0.6579, - "step": 1163 - }, - { - "epoch": 0.65, - "grad_norm": 0.04873840510845184, - "learning_rate": 0.00015310249843526776, - "loss": 0.7044, - "step": 1164 - }, - { - "epoch": 0.65, - "grad_norm": 0.046522945165634155, - "learning_rate": 0.00015302824360220353, - "loss": 0.6772, - "step": 1165 - }, - { - "epoch": 0.65, - "grad_norm": 0.046785563230514526, - "learning_rate": 
0.0001529539480698908, - "loss": 0.6593, - "step": 1166 - }, - { - "epoch": 0.65, - "grad_norm": 0.04677551984786987, - "learning_rate": 0.00015287961189535155, - "loss": 0.7643, - "step": 1167 - }, - { - "epoch": 0.65, - "grad_norm": 0.057435374706983566, - "learning_rate": 0.00015280523513563885, - "loss": 0.7578, - "step": 1168 - }, - { - "epoch": 0.65, - "grad_norm": 0.048298779875040054, - "learning_rate": 0.000152730817847837, - "loss": 0.7108, - "step": 1169 - }, - { - "epoch": 0.65, - "grad_norm": 0.04588484391570091, - "learning_rate": 0.00015265636008906133, - "loss": 0.6706, - "step": 1170 - }, - { - "epoch": 0.65, - "grad_norm": 0.05275345966219902, - "learning_rate": 0.00015258186191645829, - "loss": 0.7065, - "step": 1171 - }, - { - "epoch": 0.65, - "grad_norm": 0.045389268547296524, - "learning_rate": 0.00015250732338720533, - "loss": 0.6161, - "step": 1172 - }, - { - "epoch": 0.65, - "grad_norm": 0.051542263478040695, - "learning_rate": 0.00015243274455851085, - "loss": 0.6771, - "step": 1173 - }, - { - "epoch": 0.65, - "grad_norm": 0.04666198417544365, - "learning_rate": 0.00015235812548761426, - "loss": 0.6068, - "step": 1174 - }, - { - "epoch": 0.65, - "grad_norm": 0.04389272257685661, - "learning_rate": 0.00015228346623178573, - "loss": 0.6682, - "step": 1175 - }, - { - "epoch": 0.65, - "grad_norm": 0.044190727174282074, - "learning_rate": 0.00015220876684832638, - "loss": 0.6145, - "step": 1176 - }, - { - "epoch": 0.65, - "grad_norm": 0.049365028738975525, - "learning_rate": 0.0001521340273945681, - "loss": 0.7827, - "step": 1177 - }, - { - "epoch": 0.66, - "grad_norm": 0.04868901148438454, - "learning_rate": 0.00015205924792787345, - "loss": 0.6776, - "step": 1178 - }, - { - "epoch": 0.66, - "grad_norm": 0.04864593222737312, - "learning_rate": 0.00015198442850563584, - "loss": 0.6541, - "step": 1179 - }, - { - "epoch": 0.66, - "grad_norm": 0.046926818788051605, - "learning_rate": 0.00015190956918527924, - "loss": 0.6965, - "step": 1180 - }, - { - "epoch": 0.66, - "grad_norm": 0.044857464730739594, - "learning_rate": 0.0001518346700242583, - "loss": 0.7031, - "step": 1181 - }, - { - "epoch": 0.66, - "grad_norm": 0.05430439114570618, - "learning_rate": 0.0001517597310800582, - "loss": 0.7288, - "step": 1182 - }, - { - "epoch": 0.66, - "grad_norm": 0.05577121302485466, - "learning_rate": 0.00015168475241019474, - "loss": 0.8773, - "step": 1183 - }, - { - "epoch": 0.66, - "grad_norm": 0.046056222170591354, - "learning_rate": 0.0001516097340722141, - "loss": 0.6732, - "step": 1184 - }, - { - "epoch": 0.66, - "grad_norm": 0.05171097069978714, - "learning_rate": 0.00015153467612369297, - "loss": 0.7971, - "step": 1185 - }, - { - "epoch": 0.66, - "grad_norm": 0.04264886677265167, - "learning_rate": 0.00015145957862223845, - "loss": 0.6446, - "step": 1186 - }, - { - "epoch": 0.66, - "grad_norm": 0.04378258436918259, - "learning_rate": 0.0001513844416254879, - "loss": 0.6927, - "step": 1187 - }, - { - "epoch": 0.66, - "grad_norm": 0.04480038583278656, - "learning_rate": 0.00015130926519110914, - "loss": 0.6879, - "step": 1188 - }, - { - "epoch": 0.66, - "grad_norm": 0.04745401442050934, - "learning_rate": 0.00015123404937680016, - "loss": 0.6483, - "step": 1189 - }, - { - "epoch": 0.66, - "grad_norm": 0.050725579261779785, - "learning_rate": 0.0001511587942402892, - "loss": 0.668, - "step": 1190 - }, - { - "epoch": 0.66, - "grad_norm": 0.046093638986349106, - "learning_rate": 0.00015108349983933465, - "loss": 0.6211, - "step": 1191 - }, - { - "epoch": 0.66, - "grad_norm": 
0.0595737025141716, - "learning_rate": 0.00015100816623172514, - "loss": 0.7197, - "step": 1192 - }, - { - "epoch": 0.66, - "grad_norm": 0.055872704833745956, - "learning_rate": 0.0001509327934752792, - "loss": 0.7615, - "step": 1193 - }, - { - "epoch": 0.66, - "grad_norm": 0.05438286438584328, - "learning_rate": 0.00015085738162784565, - "loss": 0.7135, - "step": 1194 - }, - { - "epoch": 0.66, - "grad_norm": 0.048681970685720444, - "learning_rate": 0.0001507819307473031, - "loss": 0.7368, - "step": 1195 - }, - { - "epoch": 0.67, - "grad_norm": 0.0487043671309948, - "learning_rate": 0.00015070644089156026, - "loss": 0.6612, - "step": 1196 - }, - { - "epoch": 0.67, - "grad_norm": 0.0529305525124073, - "learning_rate": 0.0001506309121185556, - "loss": 0.6877, - "step": 1197 - }, - { - "epoch": 0.67, - "grad_norm": 0.05354293808341026, - "learning_rate": 0.00015055534448625766, - "loss": 0.7075, - "step": 1198 - }, - { - "epoch": 0.67, - "grad_norm": 0.04844878613948822, - "learning_rate": 0.00015047973805266466, - "loss": 0.693, - "step": 1199 - }, - { - "epoch": 0.67, - "grad_norm": 0.05151020362973213, - "learning_rate": 0.0001504040928758046, - "loss": 0.6552, - "step": 1200 - }, - { - "epoch": 0.67, - "grad_norm": 0.055405277758836746, - "learning_rate": 0.00015032840901373532, - "loss": 0.7242, - "step": 1201 - }, - { - "epoch": 0.67, - "grad_norm": 0.052574001252651215, - "learning_rate": 0.00015025268652454421, - "loss": 0.7383, - "step": 1202 - }, - { - "epoch": 0.67, - "grad_norm": 0.05063820630311966, - "learning_rate": 0.0001501769254663485, - "loss": 0.6245, - "step": 1203 - }, - { - "epoch": 0.67, - "grad_norm": 0.04701600968837738, - "learning_rate": 0.00015010112589729482, - "loss": 0.6344, - "step": 1204 - }, - { - "epoch": 0.67, - "grad_norm": 0.04830322787165642, - "learning_rate": 0.00015002528787555944, - "loss": 0.6309, - "step": 1205 - }, - { - "epoch": 0.67, - "grad_norm": 0.04776900261640549, - "learning_rate": 0.00014994941145934814, - "loss": 0.7883, - "step": 1206 - }, - { - "epoch": 0.67, - "grad_norm": 0.048602622002363205, - "learning_rate": 0.00014987349670689623, - "loss": 0.7106, - "step": 1207 - }, - { - "epoch": 0.67, - "grad_norm": 0.05431029573082924, - "learning_rate": 0.00014979754367646834, - "loss": 0.7456, - "step": 1208 - }, - { - "epoch": 0.67, - "grad_norm": 0.04521273076534271, - "learning_rate": 0.00014972155242635852, - "loss": 0.5661, - "step": 1209 - }, - { - "epoch": 0.67, - "grad_norm": 0.04839641973376274, - "learning_rate": 0.0001496455230148902, - "loss": 0.6362, - "step": 1210 - }, - { - "epoch": 0.67, - "grad_norm": 0.05457577481865883, - "learning_rate": 0.00014956945550041595, - "loss": 0.792, - "step": 1211 - }, - { - "epoch": 0.67, - "grad_norm": 0.051022760570049286, - "learning_rate": 0.0001494933499413178, - "loss": 0.7257, - "step": 1212 - }, - { - "epoch": 0.67, - "grad_norm": 0.05146949738264084, - "learning_rate": 0.00014941720639600686, - "loss": 0.7664, - "step": 1213 - }, - { - "epoch": 0.68, - "grad_norm": 0.04937746375799179, - "learning_rate": 0.00014934102492292336, - "loss": 0.7638, - "step": 1214 - }, - { - "epoch": 0.68, - "grad_norm": 0.05288033187389374, - "learning_rate": 0.0001492648055805367, - "loss": 0.7008, - "step": 1215 - }, - { - "epoch": 0.68, - "grad_norm": 0.04645581543445587, - "learning_rate": 0.00014918854842734533, - "loss": 0.659, - "step": 1216 - }, - { - "epoch": 0.68, - "grad_norm": 0.049302008002996445, - "learning_rate": 0.0001491122535218767, - "loss": 0.6522, - "step": 1217 - }, - { - 
"epoch": 0.68, - "grad_norm": 0.047872643917798996, - "learning_rate": 0.00014903592092268728, - "loss": 0.7293, - "step": 1218 - }, - { - "epoch": 0.68, - "grad_norm": 0.046734388917684555, - "learning_rate": 0.00014895955068836242, - "loss": 0.5885, - "step": 1219 - }, - { - "epoch": 0.68, - "grad_norm": 0.05098286643624306, - "learning_rate": 0.0001488831428775164, - "loss": 0.6703, - "step": 1220 - }, - { - "epoch": 0.68, - "grad_norm": 0.05204523354768753, - "learning_rate": 0.00014880669754879227, - "loss": 0.7022, - "step": 1221 - }, - { - "epoch": 0.68, - "grad_norm": 0.049951422959566116, - "learning_rate": 0.00014873021476086203, - "loss": 0.6648, - "step": 1222 - }, - { - "epoch": 0.68, - "grad_norm": 0.04843631759285927, - "learning_rate": 0.00014865369457242621, - "loss": 0.6847, - "step": 1223 - }, - { - "epoch": 0.68, - "grad_norm": 0.05291084572672844, - "learning_rate": 0.00014857713704221422, - "loss": 0.8186, - "step": 1224 - }, - { - "epoch": 0.68, - "grad_norm": 0.05095936357975006, - "learning_rate": 0.00014850054222898402, - "loss": 0.705, - "step": 1225 - }, - { - "epoch": 0.68, - "grad_norm": 0.05311046168208122, - "learning_rate": 0.00014842391019152226, - "loss": 0.6515, - "step": 1226 - }, - { - "epoch": 0.68, - "grad_norm": 0.05315253511071205, - "learning_rate": 0.00014834724098864417, - "loss": 0.6898, - "step": 1227 - }, - { - "epoch": 0.68, - "grad_norm": 0.048906102776527405, - "learning_rate": 0.0001482705346791934, - "loss": 0.7955, - "step": 1228 - }, - { - "epoch": 0.68, - "grad_norm": 0.0493389368057251, - "learning_rate": 0.00014819379132204218, - "loss": 0.7394, - "step": 1229 - }, - { - "epoch": 0.68, - "grad_norm": 0.04702428728342056, - "learning_rate": 0.00014811701097609112, - "loss": 0.5547, - "step": 1230 - }, - { - "epoch": 0.68, - "grad_norm": 0.04860011860728264, - "learning_rate": 0.00014804019370026926, - "loss": 0.7455, - "step": 1231 - }, - { - "epoch": 0.69, - "grad_norm": 0.05252429470419884, - "learning_rate": 0.00014796333955353395, - "loss": 0.7699, - "step": 1232 - }, - { - "epoch": 0.69, - "grad_norm": 0.04439985752105713, - "learning_rate": 0.0001478864485948709, - "loss": 0.6318, - "step": 1233 - }, - { - "epoch": 0.69, - "grad_norm": 0.051365215331315994, - "learning_rate": 0.00014780952088329396, - "loss": 0.748, - "step": 1234 - }, - { - "epoch": 0.69, - "grad_norm": 0.050039131194353104, - "learning_rate": 0.00014773255647784525, - "loss": 0.6782, - "step": 1235 - }, - { - "epoch": 0.69, - "grad_norm": 0.04989631101489067, - "learning_rate": 0.0001476555554375951, - "loss": 0.6101, - "step": 1236 - }, - { - "epoch": 0.69, - "grad_norm": 0.05142849683761597, - "learning_rate": 0.0001475785178216419, - "loss": 0.6938, - "step": 1237 - }, - { - "epoch": 0.69, - "grad_norm": 0.04728643596172333, - "learning_rate": 0.00014750144368911209, - "loss": 0.6917, - "step": 1238 - }, - { - "epoch": 0.69, - "grad_norm": 0.0472208671271801, - "learning_rate": 0.0001474243330991602, - "loss": 0.6225, - "step": 1239 - }, - { - "epoch": 0.69, - "grad_norm": 0.05022706091403961, - "learning_rate": 0.00014734718611096877, - "loss": 0.6543, - "step": 1240 - }, - { - "epoch": 0.69, - "grad_norm": 0.05018246918916702, - "learning_rate": 0.00014727000278374808, - "loss": 0.7208, - "step": 1241 - }, - { - "epoch": 0.69, - "grad_norm": 0.04962945356965065, - "learning_rate": 0.00014719278317673655, - "loss": 0.6386, - "step": 1242 - }, - { - "epoch": 0.69, - "grad_norm": 0.04697633534669876, - "learning_rate": 0.0001471155273492003, - "loss": 
0.6764, - "step": 1243 - }, - { - "epoch": 0.69, - "grad_norm": 0.05113474652171135, - "learning_rate": 0.00014703823536043323, - "loss": 0.7663, - "step": 1244 - }, - { - "epoch": 0.69, - "grad_norm": 0.05091071128845215, - "learning_rate": 0.00014696090726975714, - "loss": 0.7107, - "step": 1245 - }, - { - "epoch": 0.69, - "grad_norm": 0.05539362505078316, - "learning_rate": 0.0001468835431365214, - "loss": 0.6792, - "step": 1246 - }, - { - "epoch": 0.69, - "grad_norm": 0.05495639145374298, - "learning_rate": 0.0001468061430201031, - "loss": 0.7616, - "step": 1247 - }, - { - "epoch": 0.69, - "grad_norm": 0.05079134553670883, - "learning_rate": 0.00014672870697990685, - "loss": 0.6765, - "step": 1248 - }, - { - "epoch": 0.69, - "grad_norm": 0.048591259866952896, - "learning_rate": 0.00014665123507536505, - "loss": 0.6775, - "step": 1249 - }, - { - "epoch": 0.7, - "grad_norm": 0.04820895567536354, - "learning_rate": 0.00014657372736593739, - "loss": 0.6892, - "step": 1250 - }, - { - "epoch": 0.7, - "grad_norm": 0.0469762422144413, - "learning_rate": 0.00014649618391111117, - "loss": 0.6423, - "step": 1251 - }, - { - "epoch": 0.7, - "grad_norm": 0.048589132726192474, - "learning_rate": 0.00014641860477040112, - "loss": 0.6854, - "step": 1252 - }, - { - "epoch": 0.7, - "grad_norm": 0.05065524950623512, - "learning_rate": 0.0001463409900033493, - "loss": 0.6663, - "step": 1253 - }, - { - "epoch": 0.7, - "grad_norm": 0.049449969083070755, - "learning_rate": 0.0001462633396695252, - "loss": 0.6662, - "step": 1254 - }, - { - "epoch": 0.7, - "grad_norm": 0.052549589425325394, - "learning_rate": 0.00014618565382852548, - "loss": 0.7664, - "step": 1255 - }, - { - "epoch": 0.7, - "grad_norm": 0.05117465555667877, - "learning_rate": 0.00014610793253997423, - "loss": 0.7495, - "step": 1256 - }, - { - "epoch": 0.7, - "grad_norm": 0.054655950516462326, - "learning_rate": 0.0001460301758635225, - "loss": 0.6196, - "step": 1257 - }, - { - "epoch": 0.7, - "grad_norm": 0.049078892916440964, - "learning_rate": 0.0001459523838588488, - "loss": 0.5712, - "step": 1258 - }, - { - "epoch": 0.7, - "grad_norm": 0.05013487488031387, - "learning_rate": 0.00014587455658565847, - "loss": 0.6546, - "step": 1259 - }, - { - "epoch": 0.7, - "grad_norm": 0.05131218209862709, - "learning_rate": 0.00014579669410368413, - "loss": 0.6762, - "step": 1260 - }, - { - "epoch": 0.7, - "grad_norm": 0.05478007718920708, - "learning_rate": 0.00014571879647268528, - "loss": 0.6899, - "step": 1261 - }, - { - "epoch": 0.7, - "grad_norm": 0.04590092971920967, - "learning_rate": 0.00014564086375244855, - "loss": 0.6682, - "step": 1262 - }, - { - "epoch": 0.7, - "grad_norm": 0.055836863815784454, - "learning_rate": 0.00014556289600278733, - "loss": 0.8131, - "step": 1263 - }, - { - "epoch": 0.7, - "grad_norm": 0.05465158447623253, - "learning_rate": 0.00014548489328354195, - "loss": 0.7749, - "step": 1264 - }, - { - "epoch": 0.7, - "grad_norm": 0.04707031324505806, - "learning_rate": 0.00014540685565457968, - "loss": 0.5979, - "step": 1265 - }, - { - "epoch": 0.7, - "grad_norm": 0.048351485282182693, - "learning_rate": 0.00014532878317579444, - "loss": 0.7093, - "step": 1266 - }, - { - "epoch": 0.7, - "grad_norm": 0.05322815105319023, - "learning_rate": 0.000145250675907107, - "loss": 0.7281, - "step": 1267 - }, - { - "epoch": 0.71, - "grad_norm": 0.04762796312570572, - "learning_rate": 0.0001451725339084648, - "loss": 0.6659, - "step": 1268 - }, - { - "epoch": 0.71, - "grad_norm": 0.047569163143634796, - "learning_rate": 
0.0001450943572398419, - "loss": 0.6894, - "step": 1269 - }, - { - "epoch": 0.71, - "grad_norm": 0.04456879943609238, - "learning_rate": 0.00014501614596123898, - "loss": 0.6958, - "step": 1270 - }, - { - "epoch": 0.71, - "grad_norm": 0.050228893756866455, - "learning_rate": 0.00014493790013268338, - "loss": 0.7011, - "step": 1271 - }, - { - "epoch": 0.71, - "grad_norm": 0.056763775646686554, - "learning_rate": 0.0001448596198142288, - "loss": 0.8265, - "step": 1272 - }, - { - "epoch": 0.71, - "grad_norm": 0.04859040677547455, - "learning_rate": 0.00014478130506595556, - "loss": 0.7295, - "step": 1273 - }, - { - "epoch": 0.71, - "grad_norm": 0.054411161690950394, - "learning_rate": 0.0001447029559479703, - "loss": 0.7045, - "step": 1274 - }, - { - "epoch": 0.71, - "grad_norm": 0.047824714332818985, - "learning_rate": 0.00014462457252040607, - "loss": 0.7665, - "step": 1275 - }, - { - "epoch": 0.71, - "grad_norm": 0.04883917048573494, - "learning_rate": 0.00014454615484342222, - "loss": 0.7283, - "step": 1276 - }, - { - "epoch": 0.71, - "grad_norm": 0.04813641682267189, - "learning_rate": 0.00014446770297720448, - "loss": 0.688, - "step": 1277 - }, - { - "epoch": 0.71, - "grad_norm": 0.049928583204746246, - "learning_rate": 0.00014438921698196476, - "loss": 0.7347, - "step": 1278 - }, - { - "epoch": 0.71, - "grad_norm": 0.05147488787770271, - "learning_rate": 0.00014431069691794114, - "loss": 0.7492, - "step": 1279 - }, - { - "epoch": 0.71, - "grad_norm": 0.04984632879495621, - "learning_rate": 0.0001442321428453979, - "loss": 0.744, - "step": 1280 - }, - { - "epoch": 0.71, - "grad_norm": 0.05544523149728775, - "learning_rate": 0.00014415355482462534, - "loss": 0.7598, - "step": 1281 - }, - { - "epoch": 0.71, - "grad_norm": 0.05009017884731293, - "learning_rate": 0.00014407493291593995, - "loss": 0.6616, - "step": 1282 - }, - { - "epoch": 0.71, - "grad_norm": 0.050635501742362976, - "learning_rate": 0.00014399627717968412, - "loss": 0.6241, - "step": 1283 - }, - { - "epoch": 0.71, - "grad_norm": 0.054587721824645996, - "learning_rate": 0.0001439175876762262, - "loss": 0.6985, - "step": 1284 - }, - { - "epoch": 0.71, - "grad_norm": 0.05211169272661209, - "learning_rate": 0.00014383886446596057, - "loss": 0.6875, - "step": 1285 - }, - { - "epoch": 0.72, - "grad_norm": 0.05252880975604057, - "learning_rate": 0.00014376010760930728, - "loss": 0.6947, - "step": 1286 - }, - { - "epoch": 0.72, - "grad_norm": 0.04912309721112251, - "learning_rate": 0.00014368131716671246, - "loss": 0.6376, - "step": 1287 - }, - { - "epoch": 0.72, - "grad_norm": 0.05693266540765762, - "learning_rate": 0.00014360249319864775, - "loss": 0.7267, - "step": 1288 - }, - { - "epoch": 0.72, - "grad_norm": 0.05367673560976982, - "learning_rate": 0.00014352363576561073, - "loss": 0.714, - "step": 1289 - }, - { - "epoch": 0.72, - "grad_norm": 0.04969675466418266, - "learning_rate": 0.00014344474492812461, - "loss": 0.6023, - "step": 1290 - }, - { - "epoch": 0.72, - "grad_norm": 0.04907987639307976, - "learning_rate": 0.00014336582074673813, - "loss": 0.7009, - "step": 1291 - }, - { - "epoch": 0.72, - "grad_norm": 0.04774461314082146, - "learning_rate": 0.00014328686328202582, - "loss": 0.6393, - "step": 1292 - }, - { - "epoch": 0.72, - "grad_norm": 0.047509051859378815, - "learning_rate": 0.00014320787259458753, - "loss": 0.6444, - "step": 1293 - }, - { - "epoch": 0.72, - "grad_norm": 0.057917118072509766, - "learning_rate": 0.00014312884874504876, - "loss": 0.8577, - "step": 1294 - }, - { - "epoch": 0.72, - "grad_norm": 
0.04852381348609924, - "learning_rate": 0.0001430497917940604, - "loss": 0.6916, - "step": 1295 - }, - { - "epoch": 0.72, - "grad_norm": 0.04885304719209671, - "learning_rate": 0.00014297070180229883, - "loss": 0.7174, - "step": 1296 - }, - { - "epoch": 0.72, - "grad_norm": 0.052427150309085846, - "learning_rate": 0.00014289157883046568, - "loss": 0.8375, - "step": 1297 - }, - { - "epoch": 0.72, - "grad_norm": 0.04951222613453865, - "learning_rate": 0.0001428124229392879, - "loss": 0.7188, - "step": 1298 - }, - { - "epoch": 0.72, - "grad_norm": 0.049055252224206924, - "learning_rate": 0.0001427332341895178, - "loss": 0.6977, - "step": 1299 - }, - { - "epoch": 0.72, - "grad_norm": 0.047418877482414246, - "learning_rate": 0.00014265401264193286, - "loss": 0.5577, - "step": 1300 - }, - { - "epoch": 0.72, - "grad_norm": 0.051448144018650055, - "learning_rate": 0.00014257475835733572, - "loss": 0.6462, - "step": 1301 - }, - { - "epoch": 0.72, - "grad_norm": 0.053073249757289886, - "learning_rate": 0.00014249547139655409, - "loss": 0.6129, - "step": 1302 - }, - { - "epoch": 0.72, - "grad_norm": 0.05240590125322342, - "learning_rate": 0.00014241615182044088, - "loss": 0.7354, - "step": 1303 - }, - { - "epoch": 0.73, - "grad_norm": 0.04916330799460411, - "learning_rate": 0.00014233679968987393, - "loss": 0.6193, - "step": 1304 - }, - { - "epoch": 0.73, - "grad_norm": 0.046741846948862076, - "learning_rate": 0.00014225741506575617, - "loss": 0.5996, - "step": 1305 - }, - { - "epoch": 0.73, - "grad_norm": 0.050647541880607605, - "learning_rate": 0.00014217799800901533, - "loss": 0.6619, - "step": 1306 - }, - { - "epoch": 0.73, - "grad_norm": 0.052585843950510025, - "learning_rate": 0.00014209854858060415, - "loss": 0.6119, - "step": 1307 - }, - { - "epoch": 0.73, - "grad_norm": 0.05018146336078644, - "learning_rate": 0.0001420190668415002, - "loss": 0.6795, - "step": 1308 - }, - { - "epoch": 0.73, - "grad_norm": 0.052429914474487305, - "learning_rate": 0.00014193955285270575, - "loss": 0.6865, - "step": 1309 - }, - { - "epoch": 0.73, - "grad_norm": 0.051422689110040665, - "learning_rate": 0.00014186000667524795, - "loss": 0.7339, - "step": 1310 - }, - { - "epoch": 0.73, - "grad_norm": 0.05736821889877319, - "learning_rate": 0.00014178042837017856, - "loss": 0.7109, - "step": 1311 - }, - { - "epoch": 0.73, - "grad_norm": 0.04986963048577309, - "learning_rate": 0.0001417008179985741, - "loss": 0.6964, - "step": 1312 - }, - { - "epoch": 0.73, - "grad_norm": 0.05136452242732048, - "learning_rate": 0.00014162117562153558, - "loss": 0.5924, - "step": 1313 - }, - { - "epoch": 0.73, - "grad_norm": 0.05401051789522171, - "learning_rate": 0.00014154150130018866, - "loss": 0.756, - "step": 1314 - }, - { - "epoch": 0.73, - "grad_norm": 0.05107508972287178, - "learning_rate": 0.00014146179509568344, - "loss": 0.6913, - "step": 1315 - }, - { - "epoch": 0.73, - "grad_norm": 0.048386212438344955, - "learning_rate": 0.0001413820570691946, - "loss": 0.6138, - "step": 1316 - }, - { - "epoch": 0.73, - "grad_norm": 0.056987110525369644, - "learning_rate": 0.00014130228728192118, - "loss": 0.6351, - "step": 1317 - }, - { - "epoch": 0.73, - "grad_norm": 0.04835304245352745, - "learning_rate": 0.00014122248579508657, - "loss": 0.6808, - "step": 1318 - }, - { - "epoch": 0.73, - "grad_norm": 0.0571272112429142, - "learning_rate": 0.00014114265266993846, - "loss": 0.6881, - "step": 1319 - }, - { - "epoch": 0.73, - "grad_norm": 0.058113086968660355, - "learning_rate": 0.00014106278796774903, - "loss": 0.7849, - "step": 1320 - 
}, - { - "epoch": 0.73, - "grad_norm": 0.05638326704502106, - "learning_rate": 0.00014098289174981443, - "loss": 0.6941, - "step": 1321 - }, - { - "epoch": 0.74, - "grad_norm": 0.05447527393698692, - "learning_rate": 0.00014090296407745517, - "loss": 0.7279, - "step": 1322 - }, - { - "epoch": 0.74, - "grad_norm": 0.05510277301073074, - "learning_rate": 0.0001408230050120158, - "loss": 0.7933, - "step": 1323 - }, - { - "epoch": 0.74, - "grad_norm": 0.059467945247888565, - "learning_rate": 0.00014074301461486504, - "loss": 0.7845, - "step": 1324 - }, - { - "epoch": 0.74, - "grad_norm": 0.060325901955366135, - "learning_rate": 0.00014066299294739567, - "loss": 0.7254, - "step": 1325 - }, - { - "epoch": 0.74, - "grad_norm": 0.05181081220507622, - "learning_rate": 0.00014058294007102432, - "loss": 0.6383, - "step": 1326 - }, - { - "epoch": 0.74, - "grad_norm": 0.050908952951431274, - "learning_rate": 0.00014050285604719177, - "loss": 0.7441, - "step": 1327 - }, - { - "epoch": 0.74, - "grad_norm": 0.049533553421497345, - "learning_rate": 0.0001404227409373626, - "loss": 0.6214, - "step": 1328 - }, - { - "epoch": 0.74, - "grad_norm": 0.05213542655110359, - "learning_rate": 0.00014034259480302523, - "loss": 0.6518, - "step": 1329 - }, - { - "epoch": 0.74, - "grad_norm": 0.05297151952981949, - "learning_rate": 0.00014026241770569197, - "loss": 0.7337, - "step": 1330 - }, - { - "epoch": 0.74, - "grad_norm": 0.04704327508807182, - "learning_rate": 0.00014018220970689883, - "loss": 0.619, - "step": 1331 - }, - { - "epoch": 0.74, - "grad_norm": 0.054115235805511475, - "learning_rate": 0.00014010197086820552, - "loss": 0.7122, - "step": 1332 - }, - { - "epoch": 0.74, - "grad_norm": 0.050988852977752686, - "learning_rate": 0.00014002170125119553, - "loss": 0.7545, - "step": 1333 - }, - { - "epoch": 0.74, - "grad_norm": 0.049325957894325256, - "learning_rate": 0.00013994140091747587, - "loss": 0.6834, - "step": 1334 - }, - { - "epoch": 0.74, - "grad_norm": 0.06573436409235, - "learning_rate": 0.00013986106992867713, - "loss": 0.7652, - "step": 1335 - }, - { - "epoch": 0.74, - "grad_norm": 0.050615664571523666, - "learning_rate": 0.00013978070834645348, - "loss": 0.648, - "step": 1336 - }, - { - "epoch": 0.74, - "grad_norm": 0.050100456923246384, - "learning_rate": 0.0001397003162324825, - "loss": 0.6443, - "step": 1337 - }, - { - "epoch": 0.74, - "grad_norm": 0.04777170345187187, - "learning_rate": 0.00013961989364846532, - "loss": 0.6743, - "step": 1338 - }, - { - "epoch": 0.74, - "grad_norm": 0.05937036871910095, - "learning_rate": 0.00013953944065612633, - "loss": 0.7196, - "step": 1339 - }, - { - "epoch": 0.75, - "grad_norm": 0.05465259775519371, - "learning_rate": 0.0001394589573172133, - "loss": 0.7038, - "step": 1340 - }, - { - "epoch": 0.75, - "grad_norm": 0.04839110001921654, - "learning_rate": 0.00013937844369349734, - "loss": 0.7035, - "step": 1341 - }, - { - "epoch": 0.75, - "grad_norm": 0.05537423491477966, - "learning_rate": 0.00013929789984677278, - "loss": 0.7386, - "step": 1342 - }, - { - "epoch": 0.75, - "grad_norm": 0.051877472549676895, - "learning_rate": 0.00013921732583885705, - "loss": 0.6811, - "step": 1343 - }, - { - "epoch": 0.75, - "grad_norm": 0.052817557007074356, - "learning_rate": 0.00013913672173159088, - "loss": 0.6852, - "step": 1344 - }, - { - "epoch": 0.75, - "grad_norm": 0.0512346476316452, - "learning_rate": 0.000139056087586838, - "loss": 0.6145, - "step": 1345 - }, - { - "epoch": 0.75, - "grad_norm": 0.05505014955997467, - "learning_rate": 0.00013897542346648524, - 
"loss": 0.8609, - "step": 1346 - }, - { - "epoch": 0.75, - "grad_norm": 0.0500144399702549, - "learning_rate": 0.00013889472943244243, - "loss": 0.6782, - "step": 1347 - }, - { - "epoch": 0.75, - "grad_norm": 0.0514250211417675, - "learning_rate": 0.0001388140055466423, - "loss": 0.7126, - "step": 1348 - }, - { - "epoch": 0.75, - "grad_norm": 0.049233101308345795, - "learning_rate": 0.00013873325187104056, - "loss": 0.6516, - "step": 1349 - }, - { - "epoch": 0.75, - "grad_norm": 0.04899629205465317, - "learning_rate": 0.00013865246846761581, - "loss": 0.617, - "step": 1350 - }, - { - "epoch": 0.75, - "grad_norm": 0.05683201178908348, - "learning_rate": 0.0001385716553983694, - "loss": 0.7033, - "step": 1351 - }, - { - "epoch": 0.75, - "grad_norm": 0.05018038675189018, - "learning_rate": 0.00013849081272532544, - "loss": 0.6809, - "step": 1352 - }, - { - "epoch": 0.75, - "grad_norm": 0.05704934149980545, - "learning_rate": 0.00013840994051053085, - "loss": 0.8067, - "step": 1353 - }, - { - "epoch": 0.75, - "grad_norm": 0.050851497799158096, - "learning_rate": 0.00013832903881605508, - "loss": 0.6693, - "step": 1354 - }, - { - "epoch": 0.75, - "grad_norm": 0.051365941762924194, - "learning_rate": 0.00013824810770399036, - "loss": 0.6739, - "step": 1355 - }, - { - "epoch": 0.75, - "grad_norm": 0.058710698038339615, - "learning_rate": 0.0001381671472364514, - "loss": 0.6647, - "step": 1356 - }, - { - "epoch": 0.75, - "grad_norm": 0.05513182282447815, - "learning_rate": 0.00013808615747557549, - "loss": 0.6949, - "step": 1357 - }, - { - "epoch": 0.76, - "grad_norm": 0.05616655945777893, - "learning_rate": 0.0001380051384835223, - "loss": 0.7774, - "step": 1358 - }, - { - "epoch": 0.76, - "grad_norm": 0.05610072240233421, - "learning_rate": 0.0001379240903224741, - "loss": 0.7263, - "step": 1359 - }, - { - "epoch": 0.76, - "grad_norm": 0.05135612562298775, - "learning_rate": 0.00013784301305463549, - "loss": 0.6889, - "step": 1360 - }, - { - "epoch": 0.76, - "grad_norm": 0.05268790200352669, - "learning_rate": 0.00013776190674223327, - "loss": 0.7119, - "step": 1361 - }, - { - "epoch": 0.76, - "grad_norm": 0.04713091999292374, - "learning_rate": 0.00013768077144751673, - "loss": 0.6209, - "step": 1362 - }, - { - "epoch": 0.76, - "grad_norm": 0.049119122326374054, - "learning_rate": 0.00013759960723275732, - "loss": 0.673, - "step": 1363 - }, - { - "epoch": 0.76, - "grad_norm": 0.05238724872469902, - "learning_rate": 0.00013751841416024865, - "loss": 0.7073, - "step": 1364 - }, - { - "epoch": 0.76, - "grad_norm": 0.04365584999322891, - "learning_rate": 0.0001374371922923065, - "loss": 0.5371, - "step": 1365 - }, - { - "epoch": 0.76, - "grad_norm": 0.05708995461463928, - "learning_rate": 0.0001373559416912688, - "loss": 0.6894, - "step": 1366 - }, - { - "epoch": 0.76, - "grad_norm": 0.04669976979494095, - "learning_rate": 0.00013727466241949545, - "loss": 0.7083, - "step": 1367 - }, - { - "epoch": 0.76, - "grad_norm": 0.05093741416931152, - "learning_rate": 0.00013719335453936846, - "loss": 0.6603, - "step": 1368 - }, - { - "epoch": 0.76, - "grad_norm": 0.04979169741272926, - "learning_rate": 0.0001371120181132917, - "loss": 0.6269, - "step": 1369 - }, - { - "epoch": 0.76, - "grad_norm": 0.05657560005784035, - "learning_rate": 0.000137030653203691, - "loss": 0.8572, - "step": 1370 - }, - { - "epoch": 0.76, - "grad_norm": 0.05758770555257797, - "learning_rate": 0.00013694925987301404, - "loss": 0.7561, - "step": 1371 - }, - { - "epoch": 0.76, - "grad_norm": 0.048717837780714035, - "learning_rate": 
0.00013686783818373028, - "loss": 0.7044, - "step": 1372 - }, - { - "epoch": 0.76, - "grad_norm": 0.049861840903759, - "learning_rate": 0.00013678638819833103, - "loss": 0.6199, - "step": 1373 - }, - { - "epoch": 0.76, - "grad_norm": 0.050144508481025696, - "learning_rate": 0.00013670490997932922, - "loss": 0.6802, - "step": 1374 - }, - { - "epoch": 0.76, - "grad_norm": 0.05109990015625954, - "learning_rate": 0.0001366234035892595, - "loss": 0.6872, - "step": 1375 - }, - { - "epoch": 0.77, - "grad_norm": 0.05122920870780945, - "learning_rate": 0.00013654186909067817, - "loss": 0.6246, - "step": 1376 - }, - { - "epoch": 0.77, - "grad_norm": 0.05392398312687874, - "learning_rate": 0.00013646030654616302, - "loss": 0.6679, - "step": 1377 - }, - { - "epoch": 0.77, - "grad_norm": 0.051943257451057434, - "learning_rate": 0.0001363787160183134, - "loss": 0.6608, - "step": 1378 - }, - { - "epoch": 0.77, - "grad_norm": 0.05808079242706299, - "learning_rate": 0.00013629709756975023, - "loss": 0.6256, - "step": 1379 - }, - { - "epoch": 0.77, - "grad_norm": 0.05690327659249306, - "learning_rate": 0.0001362154512631157, - "loss": 0.6313, - "step": 1380 - }, - { - "epoch": 0.77, - "grad_norm": 0.0537363737821579, - "learning_rate": 0.0001361337771610735, - "loss": 0.6827, - "step": 1381 - }, - { - "epoch": 0.77, - "grad_norm": 0.05171459913253784, - "learning_rate": 0.00013605207532630864, - "loss": 0.6683, - "step": 1382 - }, - { - "epoch": 0.77, - "grad_norm": 0.051463544368743896, - "learning_rate": 0.00013597034582152733, - "loss": 0.6894, - "step": 1383 - }, - { - "epoch": 0.77, - "grad_norm": 0.049201156944036484, - "learning_rate": 0.0001358885887094571, - "loss": 0.6486, - "step": 1384 - }, - { - "epoch": 0.77, - "grad_norm": 0.052608225494623184, - "learning_rate": 0.00013580680405284664, - "loss": 0.7302, - "step": 1385 - }, - { - "epoch": 0.77, - "grad_norm": 0.05633961036801338, - "learning_rate": 0.00013572499191446578, - "loss": 0.7057, - "step": 1386 - }, - { - "epoch": 0.77, - "grad_norm": 0.05480135232210159, - "learning_rate": 0.00013564315235710546, - "loss": 0.6837, - "step": 1387 - }, - { - "epoch": 0.77, - "grad_norm": 0.04823005944490433, - "learning_rate": 0.00013556128544357763, - "loss": 0.6007, - "step": 1388 - }, - { - "epoch": 0.77, - "grad_norm": 0.05478844791650772, - "learning_rate": 0.0001354793912367153, - "loss": 0.6724, - "step": 1389 - }, - { - "epoch": 0.77, - "grad_norm": 0.04951076954603195, - "learning_rate": 0.00013539746979937233, - "loss": 0.6648, - "step": 1390 - }, - { - "epoch": 0.77, - "grad_norm": 0.08351165801286697, - "learning_rate": 0.00013531552119442356, - "loss": 0.7277, - "step": 1391 - }, - { - "epoch": 0.77, - "grad_norm": 0.050557348877191544, - "learning_rate": 0.00013523354548476468, - "loss": 0.7675, - "step": 1392 - }, - { - "epoch": 0.77, - "grad_norm": 0.04991862177848816, - "learning_rate": 0.0001351515427333121, - "loss": 0.666, - "step": 1393 - }, - { - "epoch": 0.78, - "grad_norm": 0.04630931839346886, - "learning_rate": 0.0001350695130030031, - "loss": 0.6061, - "step": 1394 - }, - { - "epoch": 0.78, - "grad_norm": 0.05627691000699997, - "learning_rate": 0.00013498745635679557, - "loss": 0.7015, - "step": 1395 - }, - { - "epoch": 0.78, - "grad_norm": 0.05329517647624016, - "learning_rate": 0.00013490537285766808, - "loss": 0.6464, - "step": 1396 - }, - { - "epoch": 0.78, - "grad_norm": 0.059979844838380814, - "learning_rate": 0.00013482326256861988, - "loss": 0.6884, - "step": 1397 - }, - { - "epoch": 0.78, - "grad_norm": 
0.04928671196103096, - "learning_rate": 0.0001347411255526707, - "loss": 0.6286, - "step": 1398 - }, - { - "epoch": 0.78, - "grad_norm": 0.05504937097430229, - "learning_rate": 0.00013465896187286083, - "loss": 0.7726, - "step": 1399 - }, - { - "epoch": 0.78, - "grad_norm": 0.04796033352613449, - "learning_rate": 0.00013457677159225097, - "loss": 0.5906, - "step": 1400 - }, - { - "epoch": 0.78, - "grad_norm": 0.05083645507693291, - "learning_rate": 0.0001344945547739223, - "loss": 0.6439, - "step": 1401 - }, - { - "epoch": 0.78, - "grad_norm": 0.05347789451479912, - "learning_rate": 0.0001344123114809763, - "loss": 0.6212, - "step": 1402 - }, - { - "epoch": 0.78, - "grad_norm": 0.04871118441224098, - "learning_rate": 0.00013433004177653486, - "loss": 0.6443, - "step": 1403 - }, - { - "epoch": 0.78, - "grad_norm": 0.05615173652768135, - "learning_rate": 0.00013424774572374005, - "loss": 0.7706, - "step": 1404 - }, - { - "epoch": 0.78, - "grad_norm": 0.04990394786000252, - "learning_rate": 0.00013416542338575424, - "loss": 0.5935, - "step": 1405 - }, - { - "epoch": 0.78, - "grad_norm": 0.05090782046318054, - "learning_rate": 0.0001340830748257599, - "loss": 0.6519, - "step": 1406 - }, - { - "epoch": 0.78, - "grad_norm": 0.05769623816013336, - "learning_rate": 0.00013400070010695966, - "loss": 0.6945, - "step": 1407 - }, - { - "epoch": 0.78, - "grad_norm": 0.05073278769850731, - "learning_rate": 0.00013391829929257625, - "loss": 0.6583, - "step": 1408 - }, - { - "epoch": 0.78, - "grad_norm": 0.05385829880833626, - "learning_rate": 0.0001338358724458524, - "loss": 0.6754, - "step": 1409 - }, - { - "epoch": 0.78, - "grad_norm": 0.06437458097934723, - "learning_rate": 0.0001337534196300508, - "loss": 0.7668, - "step": 1410 - }, - { - "epoch": 0.78, - "grad_norm": 0.0532778799533844, - "learning_rate": 0.0001336709409084542, - "loss": 0.6855, - "step": 1411 - }, - { - "epoch": 0.79, - "grad_norm": 0.06863218545913696, - "learning_rate": 0.00013358843634436496, - "loss": 0.7613, - "step": 1412 - }, - { - "epoch": 0.79, - "grad_norm": 0.05395227298140526, - "learning_rate": 0.00013350590600110556, - "loss": 0.6725, - "step": 1413 - }, - { - "epoch": 0.79, - "grad_norm": 0.0562172457575798, - "learning_rate": 0.00013342334994201815, - "loss": 0.7959, - "step": 1414 - }, - { - "epoch": 0.79, - "grad_norm": 0.052872899919748306, - "learning_rate": 0.00013334076823046456, - "loss": 0.666, - "step": 1415 - }, - { - "epoch": 0.79, - "grad_norm": 0.05251738429069519, - "learning_rate": 0.0001332581609298264, - "loss": 0.686, - "step": 1416 - }, - { - "epoch": 0.79, - "grad_norm": 0.05102743208408356, - "learning_rate": 0.00013317552810350488, - "loss": 0.5487, - "step": 1417 - }, - { - "epoch": 0.79, - "grad_norm": 0.056528471410274506, - "learning_rate": 0.00013309286981492085, - "loss": 0.6795, - "step": 1418 - }, - { - "epoch": 0.79, - "grad_norm": 0.05416551232337952, - "learning_rate": 0.00013301018612751458, - "loss": 0.6314, - "step": 1419 - }, - { - "epoch": 0.79, - "grad_norm": 0.052541207522153854, - "learning_rate": 0.00013292747710474593, - "loss": 0.6579, - "step": 1420 - }, - { - "epoch": 0.79, - "grad_norm": 0.05116291716694832, - "learning_rate": 0.0001328447428100942, - "loss": 0.6836, - "step": 1421 - }, - { - "epoch": 0.79, - "grad_norm": 0.05055323615670204, - "learning_rate": 0.0001327619833070581, - "loss": 0.6109, - "step": 1422 - }, - { - "epoch": 0.79, - "grad_norm": 0.06023389473557472, - "learning_rate": 0.00013267919865915564, - "loss": 0.7022, - "step": 1423 - }, - { - 
"epoch": 0.79, - "grad_norm": 0.050478942692279816, - "learning_rate": 0.00013259638892992412, - "loss": 0.5096, - "step": 1424 - }, - { - "epoch": 0.79, - "grad_norm": 0.050790175795555115, - "learning_rate": 0.00013251355418292019, - "loss": 0.7475, - "step": 1425 - }, - { - "epoch": 0.79, - "grad_norm": 0.05030534043908119, - "learning_rate": 0.00013243069448171953, - "loss": 0.6937, - "step": 1426 - }, - { - "epoch": 0.79, - "grad_norm": 0.04698512703180313, - "learning_rate": 0.00013234780988991712, - "loss": 0.5889, - "step": 1427 - }, - { - "epoch": 0.79, - "grad_norm": 0.05166896432638168, - "learning_rate": 0.00013226490047112702, - "loss": 0.7394, - "step": 1428 - }, - { - "epoch": 0.79, - "grad_norm": 0.05660312995314598, - "learning_rate": 0.00013218196628898233, - "loss": 0.6686, - "step": 1429 - }, - { - "epoch": 0.8, - "grad_norm": 0.04474378004670143, - "learning_rate": 0.00013209900740713507, - "loss": 0.6026, - "step": 1430 - }, - { - "epoch": 0.8, - "grad_norm": 0.06732626259326935, - "learning_rate": 0.00013201602388925637, - "loss": 0.8321, - "step": 1431 - }, - { - "epoch": 0.8, - "grad_norm": 0.05723815783858299, - "learning_rate": 0.00013193301579903616, - "loss": 0.6508, - "step": 1432 - }, - { - "epoch": 0.8, - "grad_norm": 0.05704987049102783, - "learning_rate": 0.00013184998320018326, - "loss": 0.7649, - "step": 1433 - }, - { - "epoch": 0.8, - "grad_norm": 0.05435892567038536, - "learning_rate": 0.00013176692615642533, - "loss": 0.7658, - "step": 1434 - }, - { - "epoch": 0.8, - "grad_norm": 0.04823015630245209, - "learning_rate": 0.0001316838447315087, - "loss": 0.6798, - "step": 1435 - }, - { - "epoch": 0.8, - "grad_norm": 0.05567566677927971, - "learning_rate": 0.00013160073898919853, - "loss": 0.6289, - "step": 1436 - }, - { - "epoch": 0.8, - "grad_norm": 0.059023939073085785, - "learning_rate": 0.0001315176089932786, - "loss": 0.7173, - "step": 1437 - }, - { - "epoch": 0.8, - "grad_norm": 0.052320726215839386, - "learning_rate": 0.00013143445480755123, - "loss": 0.6257, - "step": 1438 - }, - { - "epoch": 0.8, - "grad_norm": 0.05305778235197067, - "learning_rate": 0.00013135127649583744, - "loss": 0.6709, - "step": 1439 - }, - { - "epoch": 0.8, - "grad_norm": 0.054861389100551605, - "learning_rate": 0.00013126807412197665, - "loss": 0.6239, - "step": 1440 - }, - { - "epoch": 0.8, - "grad_norm": 0.04839559271931648, - "learning_rate": 0.0001311848477498268, - "loss": 0.584, - "step": 1441 - }, - { - "epoch": 0.8, - "grad_norm": 0.04709853231906891, - "learning_rate": 0.00013110159744326427, - "loss": 0.5997, - "step": 1442 - }, - { - "epoch": 0.8, - "grad_norm": 0.05461667478084564, - "learning_rate": 0.00013101832326618376, - "loss": 0.6488, - "step": 1443 - }, - { - "epoch": 0.8, - "grad_norm": 0.05399639531970024, - "learning_rate": 0.0001309350252824983, - "loss": 0.6913, - "step": 1444 - }, - { - "epoch": 0.8, - "grad_norm": 0.05999092012643814, - "learning_rate": 0.00013085170355613926, - "loss": 0.6265, - "step": 1445 - }, - { - "epoch": 0.8, - "grad_norm": 0.05753492936491966, - "learning_rate": 0.0001307683581510561, - "loss": 0.7015, - "step": 1446 - }, - { - "epoch": 0.8, - "grad_norm": 0.054452694952487946, - "learning_rate": 0.00013068498913121657, - "loss": 0.6592, - "step": 1447 - }, - { - "epoch": 0.81, - "grad_norm": 0.05475104600191116, - "learning_rate": 0.00013060159656060654, - "loss": 0.6994, - "step": 1448 - }, - { - "epoch": 0.81, - "grad_norm": 0.05358335003256798, - "learning_rate": 0.00013051818050322986, - "loss": 0.6967, - "step": 
1449 - }, - { - "epoch": 0.81, - "grad_norm": 0.05165765434503555, - "learning_rate": 0.0001304347410231085, - "loss": 0.681, - "step": 1450 - }, - { - "epoch": 0.81, - "grad_norm": 0.05551374331116676, - "learning_rate": 0.0001303512781842824, - "loss": 0.565, - "step": 1451 - }, - { - "epoch": 0.81, - "grad_norm": 0.05121142789721489, - "learning_rate": 0.00013026779205080932, - "loss": 0.6739, - "step": 1452 - }, - { - "epoch": 0.81, - "grad_norm": 0.05909129977226257, - "learning_rate": 0.00013018428268676504, - "loss": 0.8198, - "step": 1453 - }, - { - "epoch": 0.81, - "grad_norm": 0.048282720148563385, - "learning_rate": 0.0001301007501562431, - "loss": 0.636, - "step": 1454 - }, - { - "epoch": 0.81, - "grad_norm": 0.05285617336630821, - "learning_rate": 0.00013001719452335485, - "loss": 0.623, - "step": 1455 - }, - { - "epoch": 0.81, - "grad_norm": 0.05915940925478935, - "learning_rate": 0.00012993361585222928, - "loss": 0.7059, - "step": 1456 - }, - { - "epoch": 0.81, - "grad_norm": 0.0531185120344162, - "learning_rate": 0.00012985001420701318, - "loss": 0.6381, - "step": 1457 - }, - { - "epoch": 0.81, - "grad_norm": 0.05316080525517464, - "learning_rate": 0.00012976638965187095, - "loss": 0.6969, - "step": 1458 - }, - { - "epoch": 0.81, - "grad_norm": 0.052480001002550125, - "learning_rate": 0.00012968274225098452, - "loss": 0.6696, - "step": 1459 - }, - { - "epoch": 0.81, - "grad_norm": 0.05531112849712372, - "learning_rate": 0.00012959907206855343, - "loss": 0.7064, - "step": 1460 - }, - { - "epoch": 0.81, - "grad_norm": 0.055633753538131714, - "learning_rate": 0.00012951537916879458, - "loss": 0.6559, - "step": 1461 - }, - { - "epoch": 0.81, - "grad_norm": 0.052477676421403885, - "learning_rate": 0.00012943166361594242, - "loss": 0.6901, - "step": 1462 - }, - { - "epoch": 0.81, - "grad_norm": 0.05346304923295975, - "learning_rate": 0.00012934792547424873, - "loss": 0.6428, - "step": 1463 - }, - { - "epoch": 0.81, - "grad_norm": 0.05244500935077667, - "learning_rate": 0.0001292641648079827, - "loss": 0.6615, - "step": 1464 - }, - { - "epoch": 0.81, - "grad_norm": 0.05374164506793022, - "learning_rate": 0.00012918038168143066, - "loss": 0.711, - "step": 1465 - }, - { - "epoch": 0.82, - "grad_norm": 0.05638580396771431, - "learning_rate": 0.00012909657615889638, - "loss": 0.731, - "step": 1466 - }, - { - "epoch": 0.82, - "grad_norm": 0.05788502097129822, - "learning_rate": 0.00012901274830470064, - "loss": 0.7525, - "step": 1467 - }, - { - "epoch": 0.82, - "grad_norm": 0.05340276658535004, - "learning_rate": 0.0001289288981831815, - "loss": 0.6505, - "step": 1468 - }, - { - "epoch": 0.82, - "grad_norm": 0.05570358410477638, - "learning_rate": 0.00012884502585869395, - "loss": 0.6989, - "step": 1469 - }, - { - "epoch": 0.82, - "grad_norm": 0.05580015480518341, - "learning_rate": 0.00012876113139561018, - "loss": 0.7683, - "step": 1470 - }, - { - "epoch": 0.82, - "grad_norm": 0.05318528413772583, - "learning_rate": 0.0001286772148583193, - "loss": 0.6406, - "step": 1471 - }, - { - "epoch": 0.82, - "grad_norm": 0.05146130174398422, - "learning_rate": 0.0001285932763112273, - "loss": 0.65, - "step": 1472 - }, - { - "epoch": 0.82, - "grad_norm": 0.05019865557551384, - "learning_rate": 0.00012850931581875723, - "loss": 0.6883, - "step": 1473 - }, - { - "epoch": 0.82, - "grad_norm": 0.06139828637242317, - "learning_rate": 0.00012842533344534877, - "loss": 0.6625, - "step": 1474 - }, - { - "epoch": 0.82, - "grad_norm": 0.055851712822914124, - "learning_rate": 0.0001283413292554586, - 
"loss": 0.6335, - "step": 1475 - }, - { - "epoch": 0.82, - "grad_norm": 0.05272622033953667, - "learning_rate": 0.00012825730331355995, - "loss": 0.6381, - "step": 1476 - }, - { - "epoch": 0.82, - "grad_norm": 0.053820542991161346, - "learning_rate": 0.00012817325568414297, - "loss": 0.6784, - "step": 1477 - }, - { - "epoch": 0.82, - "grad_norm": 0.05233492702245712, - "learning_rate": 0.00012808918643171424, - "loss": 0.6492, - "step": 1478 - }, - { - "epoch": 0.82, - "grad_norm": 0.05471348017454147, - "learning_rate": 0.00012800509562079705, - "loss": 0.6332, - "step": 1479 - }, - { - "epoch": 0.82, - "grad_norm": 0.051646675914525986, - "learning_rate": 0.0001279209833159312, - "loss": 0.6563, - "step": 1480 - }, - { - "epoch": 0.82, - "grad_norm": 0.05445919558405876, - "learning_rate": 0.00012783684958167304, - "loss": 0.7054, - "step": 1481 - }, - { - "epoch": 0.82, - "grad_norm": 0.053671520203351974, - "learning_rate": 0.00012775269448259526, - "loss": 0.6493, - "step": 1482 - }, - { - "epoch": 0.82, - "grad_norm": 0.05467415601015091, - "learning_rate": 0.00012766851808328707, - "loss": 0.6223, - "step": 1483 - }, - { - "epoch": 0.83, - "grad_norm": 0.05039871484041214, - "learning_rate": 0.00012758432044835392, - "loss": 0.5863, - "step": 1484 - }, - { - "epoch": 0.83, - "grad_norm": 0.048955757170915604, - "learning_rate": 0.00012750010164241764, - "loss": 0.5957, - "step": 1485 - }, - { - "epoch": 0.83, - "grad_norm": 0.06016591191291809, - "learning_rate": 0.00012741586173011625, - "loss": 0.7174, - "step": 1486 - }, - { - "epoch": 0.83, - "grad_norm": 0.05054686218500137, - "learning_rate": 0.00012733160077610403, - "loss": 0.5833, - "step": 1487 - }, - { - "epoch": 0.83, - "grad_norm": 0.06371674686670303, - "learning_rate": 0.00012724731884505134, - "loss": 0.6756, - "step": 1488 - }, - { - "epoch": 0.83, - "grad_norm": 0.04819337651133537, - "learning_rate": 0.0001271630160016447, - "loss": 0.6621, - "step": 1489 - }, - { - "epoch": 0.83, - "grad_norm": 0.053725238889455795, - "learning_rate": 0.00012707869231058666, - "loss": 0.6667, - "step": 1490 - }, - { - "epoch": 0.83, - "grad_norm": 0.04935624450445175, - "learning_rate": 0.00012699434783659577, - "loss": 0.6873, - "step": 1491 - }, - { - "epoch": 0.83, - "grad_norm": 0.05342969298362732, - "learning_rate": 0.00012690998264440652, - "loss": 0.7205, - "step": 1492 - }, - { - "epoch": 0.83, - "grad_norm": 0.04896765202283859, - "learning_rate": 0.0001268255967987693, - "loss": 0.5972, - "step": 1493 - }, - { - "epoch": 0.83, - "grad_norm": 0.05099526047706604, - "learning_rate": 0.00012674119036445037, - "loss": 0.6677, - "step": 1494 - }, - { - "epoch": 0.83, - "grad_norm": 0.0643346831202507, - "learning_rate": 0.0001266567634062317, - "loss": 0.7039, - "step": 1495 - }, - { - "epoch": 0.83, - "grad_norm": 0.06624721735715866, - "learning_rate": 0.00012657231598891126, - "loss": 0.7797, - "step": 1496 - }, - { - "epoch": 0.83, - "grad_norm": 0.05854468047618866, - "learning_rate": 0.00012648784817730242, - "loss": 0.7198, - "step": 1497 - }, - { - "epoch": 0.83, - "grad_norm": 0.05921991169452667, - "learning_rate": 0.00012640336003623444, - "loss": 0.7499, - "step": 1498 - }, - { - "epoch": 0.83, - "grad_norm": 0.055420514196157455, - "learning_rate": 0.000126318851630552, - "loss": 0.5993, - "step": 1499 - }, - { - "epoch": 0.83, - "grad_norm": 0.06067932769656181, - "learning_rate": 0.00012623432302511544, - "loss": 0.8006, - "step": 1500 - }, - { - "epoch": 0.83, - "grad_norm": 0.05395696684718132, - 
"learning_rate": 0.0001261497742848006, - "loss": 0.654, - "step": 1501 - }, - { - "epoch": 0.84, - "grad_norm": 0.05506988614797592, - "learning_rate": 0.0001260652054744987, - "loss": 0.6884, - "step": 1502 - }, - { - "epoch": 0.84, - "grad_norm": 0.0567973367869854, - "learning_rate": 0.0001259806166591165, - "loss": 0.685, - "step": 1503 - }, - { - "epoch": 0.84, - "grad_norm": 0.05710934102535248, - "learning_rate": 0.00012589600790357592, - "loss": 0.6786, - "step": 1504 - }, - { - "epoch": 0.84, - "grad_norm": 0.0530974306166172, - "learning_rate": 0.00012581137927281439, - "loss": 0.6175, - "step": 1505 - }, - { - "epoch": 0.84, - "grad_norm": 0.05763082951307297, - "learning_rate": 0.0001257267308317845, - "loss": 0.5977, - "step": 1506 - }, - { - "epoch": 0.84, - "grad_norm": 0.056460849940776825, - "learning_rate": 0.00012564206264545396, - "loss": 0.706, - "step": 1507 - }, - { - "epoch": 0.84, - "grad_norm": 0.05578916147351265, - "learning_rate": 0.00012555737477880577, - "loss": 0.736, - "step": 1508 - }, - { - "epoch": 0.84, - "grad_norm": 0.05610078200697899, - "learning_rate": 0.00012547266729683797, - "loss": 0.6614, - "step": 1509 - }, - { - "epoch": 0.84, - "grad_norm": 0.05859774723649025, - "learning_rate": 0.00012538794026456366, - "loss": 0.7192, - "step": 1510 - }, - { - "epoch": 0.84, - "grad_norm": 0.05389096215367317, - "learning_rate": 0.00012530319374701098, - "loss": 0.6748, - "step": 1511 - }, - { - "epoch": 0.84, - "grad_norm": 0.05462174117565155, - "learning_rate": 0.000125218427809223, - "loss": 0.6765, - "step": 1512 - }, - { - "epoch": 0.84, - "grad_norm": 0.047790851444005966, - "learning_rate": 0.00012513364251625766, - "loss": 0.6126, - "step": 1513 - }, - { - "epoch": 0.84, - "grad_norm": 0.05302601680159569, - "learning_rate": 0.0001250488379331878, - "loss": 0.6352, - "step": 1514 - }, - { - "epoch": 0.84, - "grad_norm": 0.05396215245127678, - "learning_rate": 0.00012496401412510102, - "loss": 0.6203, - "step": 1515 - }, - { - "epoch": 0.84, - "grad_norm": 0.051495350897312164, - "learning_rate": 0.00012487917115709975, - "loss": 0.7411, - "step": 1516 - }, - { - "epoch": 0.84, - "grad_norm": 0.05668788030743599, - "learning_rate": 0.00012479430909430108, - "loss": 0.7574, - "step": 1517 - }, - { - "epoch": 0.84, - "grad_norm": 0.056466199457645416, - "learning_rate": 0.00012470942800183675, - "loss": 0.6395, - "step": 1518 - }, - { - "epoch": 0.84, - "grad_norm": 0.061602648347616196, - "learning_rate": 0.0001246245279448531, - "loss": 0.7101, - "step": 1519 - }, - { - "epoch": 0.85, - "grad_norm": 0.05588141828775406, - "learning_rate": 0.00012453960898851108, - "loss": 0.7231, - "step": 1520 - }, - { - "epoch": 0.85, - "grad_norm": 0.05210031941533089, - "learning_rate": 0.00012445467119798605, - "loss": 0.7094, - "step": 1521 - }, - { - "epoch": 0.85, - "grad_norm": 0.05762970820069313, - "learning_rate": 0.00012436971463846788, - "loss": 0.6623, - "step": 1522 - }, - { - "epoch": 0.85, - "grad_norm": 0.05559748038649559, - "learning_rate": 0.0001242847393751609, - "loss": 0.6677, - "step": 1523 - }, - { - "epoch": 0.85, - "grad_norm": 0.05742710828781128, - "learning_rate": 0.00012419974547328366, - "loss": 0.7276, - "step": 1524 - }, - { - "epoch": 0.85, - "grad_norm": 0.053973183035850525, - "learning_rate": 0.00012411473299806918, - "loss": 0.7039, - "step": 1525 - }, - { - "epoch": 0.85, - "grad_norm": 0.056114938110113144, - "learning_rate": 0.00012402970201476457, - "loss": 0.6718, - "step": 1526 - }, - { - "epoch": 0.85, - 
"grad_norm": 0.06136467307806015, - "learning_rate": 0.0001239446525886313, - "loss": 0.7666, - "step": 1527 - }, - { - "epoch": 0.85, - "grad_norm": 0.057720523327589035, - "learning_rate": 0.00012385958478494487, - "loss": 0.6303, - "step": 1528 - }, - { - "epoch": 0.85, - "grad_norm": 0.054988570511341095, - "learning_rate": 0.00012377449866899493, - "loss": 0.6074, - "step": 1529 - }, - { - "epoch": 0.85, - "grad_norm": 0.05281984433531761, - "learning_rate": 0.00012368939430608522, - "loss": 0.6453, - "step": 1530 - }, - { - "epoch": 0.85, - "grad_norm": 0.06022608280181885, - "learning_rate": 0.00012360427176153342, - "loss": 0.7284, - "step": 1531 - }, - { - "epoch": 0.85, - "grad_norm": 0.05347907170653343, - "learning_rate": 0.00012351913110067122, - "loss": 0.5784, - "step": 1532 - }, - { - "epoch": 0.85, - "grad_norm": 0.0521782748401165, - "learning_rate": 0.0001234339723888442, - "loss": 0.6836, - "step": 1533 - }, - { - "epoch": 0.85, - "grad_norm": 0.05518525093793869, - "learning_rate": 0.00012334879569141172, - "loss": 0.6714, - "step": 1534 - }, - { - "epoch": 0.85, - "grad_norm": 0.0590486079454422, - "learning_rate": 0.00012326360107374712, - "loss": 0.7145, - "step": 1535 - }, - { - "epoch": 0.85, - "grad_norm": 0.05373707413673401, - "learning_rate": 0.0001231783886012373, - "loss": 0.6897, - "step": 1536 - }, - { - "epoch": 0.85, - "grad_norm": 0.05092402547597885, - "learning_rate": 0.00012309315833928302, - "loss": 0.6245, - "step": 1537 - }, - { - "epoch": 0.86, - "grad_norm": 0.05266068875789642, - "learning_rate": 0.00012300791035329853, - "loss": 0.5606, - "step": 1538 - }, - { - "epoch": 0.86, - "grad_norm": 0.05256986245512962, - "learning_rate": 0.00012292264470871182, - "loss": 0.6055, - "step": 1539 - }, - { - "epoch": 0.86, - "grad_norm": 0.05765622481703758, - "learning_rate": 0.0001228373614709644, - "loss": 0.7045, - "step": 1540 - }, - { - "epoch": 0.86, - "grad_norm": 0.05402863398194313, - "learning_rate": 0.00012275206070551126, - "loss": 0.7477, - "step": 1541 - }, - { - "epoch": 0.86, - "grad_norm": 0.049794167280197144, - "learning_rate": 0.00012266674247782085, - "loss": 0.6824, - "step": 1542 - }, - { - "epoch": 0.86, - "grad_norm": 0.052845150232315063, - "learning_rate": 0.000122581406853375, - "loss": 0.6614, - "step": 1543 - }, - { - "epoch": 0.86, - "grad_norm": 0.05592498928308487, - "learning_rate": 0.00012249605389766895, - "loss": 0.6355, - "step": 1544 - }, - { - "epoch": 0.86, - "grad_norm": 0.05580741912126541, - "learning_rate": 0.0001224106836762112, - "loss": 0.557, - "step": 1545 - }, - { - "epoch": 0.86, - "grad_norm": 0.05675602704286575, - "learning_rate": 0.00012232529625452352, - "loss": 0.6461, - "step": 1546 - }, - { - "epoch": 0.86, - "grad_norm": 0.05478064343333244, - "learning_rate": 0.00012223989169814087, - "loss": 0.6754, - "step": 1547 - }, - { - "epoch": 0.86, - "grad_norm": 0.05122774466872215, - "learning_rate": 0.00012215447007261134, - "loss": 0.6126, - "step": 1548 - }, - { - "epoch": 0.86, - "grad_norm": 0.05637194588780403, - "learning_rate": 0.00012206903144349615, - "loss": 0.599, - "step": 1549 - }, - { - "epoch": 0.86, - "grad_norm": 0.06174594536423683, - "learning_rate": 0.00012198357587636957, - "loss": 0.7692, - "step": 1550 - }, - { - "epoch": 0.86, - "grad_norm": 0.0552188940346241, - "learning_rate": 0.00012189810343681889, - "loss": 0.5888, - "step": 1551 - }, - { - "epoch": 0.86, - "grad_norm": 0.06255478411912918, - "learning_rate": 0.00012181261419044428, - "loss": 0.6878, - "step": 1552 - 
}, - { - "epoch": 0.86, - "grad_norm": 0.049522750079631805, - "learning_rate": 0.00012172710820285885, - "loss": 0.645, - "step": 1553 - }, - { - "epoch": 0.86, - "grad_norm": 0.05213455855846405, - "learning_rate": 0.00012164158553968856, - "loss": 0.5865, - "step": 1554 - }, - { - "epoch": 0.86, - "grad_norm": 0.05377736687660217, - "learning_rate": 0.00012155604626657222, - "loss": 0.6835, - "step": 1555 - }, - { - "epoch": 0.87, - "grad_norm": 0.06013650447130203, - "learning_rate": 0.0001214704904491613, - "loss": 0.6399, - "step": 1556 - }, - { - "epoch": 0.87, - "grad_norm": 0.057262714952230453, - "learning_rate": 0.00012138491815312001, - "loss": 0.68, - "step": 1557 - }, - { - "epoch": 0.87, - "grad_norm": 0.05029933899641037, - "learning_rate": 0.00012129932944412518, - "loss": 0.6265, - "step": 1558 - }, - { - "epoch": 0.87, - "grad_norm": 0.05597386881709099, - "learning_rate": 0.00012121372438786631, - "loss": 0.7077, - "step": 1559 - }, - { - "epoch": 0.87, - "grad_norm": 0.05315025895833969, - "learning_rate": 0.00012112810305004535, - "loss": 0.6948, - "step": 1560 - }, - { - "epoch": 0.87, - "grad_norm": 0.056836340576410294, - "learning_rate": 0.00012104246549637683, - "loss": 0.6079, - "step": 1561 - }, - { - "epoch": 0.87, - "grad_norm": 0.05192887783050537, - "learning_rate": 0.00012095681179258765, - "loss": 0.6264, - "step": 1562 - }, - { - "epoch": 0.87, - "grad_norm": 0.05339023098349571, - "learning_rate": 0.00012087114200441714, - "loss": 0.6584, - "step": 1563 - }, - { - "epoch": 0.87, - "grad_norm": 0.054986581206321716, - "learning_rate": 0.00012078545619761703, - "loss": 0.5941, - "step": 1564 - }, - { - "epoch": 0.87, - "grad_norm": 0.05665560066699982, - "learning_rate": 0.00012069975443795126, - "loss": 0.6785, - "step": 1565 - }, - { - "epoch": 0.87, - "grad_norm": 0.055501628667116165, - "learning_rate": 0.00012061403679119603, - "loss": 0.7018, - "step": 1566 - }, - { - "epoch": 0.87, - "grad_norm": 0.058468155562877655, - "learning_rate": 0.00012052830332313978, - "loss": 0.76, - "step": 1567 - }, - { - "epoch": 0.87, - "grad_norm": 0.055983930826187134, - "learning_rate": 0.00012044255409958305, - "loss": 0.7121, - "step": 1568 - }, - { - "epoch": 0.87, - "grad_norm": 0.056311774998903275, - "learning_rate": 0.00012035678918633848, - "loss": 0.6893, - "step": 1569 - }, - { - "epoch": 0.87, - "grad_norm": 0.05754856392741203, - "learning_rate": 0.00012027100864923076, - "loss": 0.7448, - "step": 1570 - }, - { - "epoch": 0.87, - "grad_norm": 0.05376417189836502, - "learning_rate": 0.00012018521255409656, - "loss": 0.667, - "step": 1571 - }, - { - "epoch": 0.87, - "grad_norm": 0.0655338391661644, - "learning_rate": 0.00012009940096678452, - "loss": 0.7666, - "step": 1572 - }, - { - "epoch": 0.87, - "grad_norm": 0.055434565991163254, - "learning_rate": 0.00012001357395315511, - "loss": 0.5904, - "step": 1573 - }, - { - "epoch": 0.88, - "grad_norm": 0.053381387144327164, - "learning_rate": 0.00011992773157908073, - "loss": 0.6542, - "step": 1574 - }, - { - "epoch": 0.88, - "grad_norm": 0.05259239673614502, - "learning_rate": 0.00011984187391044548, - "loss": 0.6848, - "step": 1575 - }, - { - "epoch": 0.88, - "grad_norm": 0.05607330799102783, - "learning_rate": 0.00011975600101314525, - "loss": 0.7138, - "step": 1576 - }, - { - "epoch": 0.88, - "grad_norm": 0.051935825496912, - "learning_rate": 0.00011967011295308761, - "loss": 0.6423, - "step": 1577 - }, - { - "epoch": 0.88, - "grad_norm": 0.058591440320014954, - "learning_rate": 0.00011958420979619176, 
- "loss": 0.6529, - "step": 1578 - }, - { - "epoch": 0.88, - "grad_norm": 0.06154406815767288, - "learning_rate": 0.00011949829160838844, - "loss": 0.7658, - "step": 1579 - }, - { - "epoch": 0.88, - "grad_norm": 0.05416911095380783, - "learning_rate": 0.00011941235845562006, - "loss": 0.6634, - "step": 1580 - }, - { - "epoch": 0.88, - "grad_norm": 0.056073009967803955, - "learning_rate": 0.00011932641040384038, - "loss": 0.575, - "step": 1581 - }, - { - "epoch": 0.88, - "grad_norm": 0.06217704340815544, - "learning_rate": 0.00011924044751901466, - "loss": 0.7539, - "step": 1582 - }, - { - "epoch": 0.88, - "grad_norm": 0.05384403467178345, - "learning_rate": 0.00011915446986711953, - "loss": 0.566, - "step": 1583 - }, - { - "epoch": 0.88, - "grad_norm": 0.05363078787922859, - "learning_rate": 0.00011906847751414291, - "loss": 0.6722, - "step": 1584 - }, - { - "epoch": 0.88, - "grad_norm": 0.05821472778916359, - "learning_rate": 0.00011898247052608414, - "loss": 0.685, - "step": 1585 - }, - { - "epoch": 0.88, - "grad_norm": 0.06892435252666473, - "learning_rate": 0.00011889644896895362, - "loss": 0.6482, - "step": 1586 - }, - { - "epoch": 0.88, - "grad_norm": 0.06398369371891022, - "learning_rate": 0.00011881041290877303, - "loss": 0.675, - "step": 1587 - }, - { - "epoch": 0.88, - "grad_norm": 0.05386562645435333, - "learning_rate": 0.00011872436241157518, - "loss": 0.6116, - "step": 1588 - }, - { - "epoch": 0.88, - "grad_norm": 0.05238373950123787, - "learning_rate": 0.00011863829754340395, - "loss": 0.628, - "step": 1589 - }, - { - "epoch": 0.88, - "grad_norm": 0.05979369580745697, - "learning_rate": 0.00011855221837031418, - "loss": 0.723, - "step": 1590 - }, - { - "epoch": 0.88, - "grad_norm": 0.06569157540798187, - "learning_rate": 0.00011846612495837182, - "loss": 0.7057, - "step": 1591 - }, - { - "epoch": 0.89, - "grad_norm": 0.057322900742292404, - "learning_rate": 0.00011838001737365365, - "loss": 0.6529, - "step": 1592 - }, - { - "epoch": 0.89, - "grad_norm": 0.06029359623789787, - "learning_rate": 0.00011829389568224734, - "loss": 0.7157, - "step": 1593 - }, - { - "epoch": 0.89, - "grad_norm": 0.05852045491337776, - "learning_rate": 0.00011820775995025147, - "loss": 0.6289, - "step": 1594 - }, - { - "epoch": 0.89, - "grad_norm": 0.05572306364774704, - "learning_rate": 0.00011812161024377526, - "loss": 0.6662, - "step": 1595 - }, - { - "epoch": 0.89, - "grad_norm": 0.05891454592347145, - "learning_rate": 0.00011803544662893875, - "loss": 0.662, - "step": 1596 - }, - { - "epoch": 0.89, - "grad_norm": 0.05507583171129227, - "learning_rate": 0.00011794926917187264, - "loss": 0.572, - "step": 1597 - }, - { - "epoch": 0.89, - "grad_norm": 0.05523572862148285, - "learning_rate": 0.00011786307793871824, - "loss": 0.6808, - "step": 1598 - }, - { - "epoch": 0.89, - "grad_norm": 0.06197218969464302, - "learning_rate": 0.00011777687299562743, - "loss": 0.6696, - "step": 1599 - }, - { - "epoch": 0.89, - "grad_norm": 0.05204472318291664, - "learning_rate": 0.00011769065440876263, - "loss": 0.6031, - "step": 1600 - }, - { - "epoch": 0.89, - "grad_norm": 0.05904833972454071, - "learning_rate": 0.0001176044222442967, - "loss": 0.6351, - "step": 1601 - }, - { - "epoch": 0.89, - "grad_norm": 0.06034965440630913, - "learning_rate": 0.00011751817656841298, - "loss": 0.7582, - "step": 1602 - }, - { - "epoch": 0.89, - "grad_norm": 0.05305292084813118, - "learning_rate": 0.0001174319174473051, - "loss": 0.6891, - "step": 1603 - }, - { - "epoch": 0.89, - "grad_norm": 0.052683088928461075, - 
"learning_rate": 0.00011734564494717711, - "loss": 0.6354, - "step": 1604 - }, - { - "epoch": 0.89, - "grad_norm": 0.059987373650074005, - "learning_rate": 0.0001172593591342432, - "loss": 0.6069, - "step": 1605 - }, - { - "epoch": 0.89, - "grad_norm": 0.05370206758379936, - "learning_rate": 0.0001171730600747279, - "loss": 0.6376, - "step": 1606 - }, - { - "epoch": 0.89, - "grad_norm": 0.05009806901216507, - "learning_rate": 0.00011708674783486583, - "loss": 0.537, - "step": 1607 - }, - { - "epoch": 0.89, - "grad_norm": 0.05144254118204117, - "learning_rate": 0.00011700042248090176, - "loss": 0.6138, - "step": 1608 - }, - { - "epoch": 0.89, - "grad_norm": 0.05199851468205452, - "learning_rate": 0.0001169140840790905, - "loss": 0.6005, - "step": 1609 - }, - { - "epoch": 0.9, - "grad_norm": 0.06040224805474281, - "learning_rate": 0.00011682773269569693, - "loss": 0.6571, - "step": 1610 - }, - { - "epoch": 0.9, - "grad_norm": 0.05754357576370239, - "learning_rate": 0.00011674136839699581, - "loss": 0.7045, - "step": 1611 - }, - { - "epoch": 0.9, - "grad_norm": 0.05448243021965027, - "learning_rate": 0.00011665499124927184, - "loss": 0.642, - "step": 1612 - }, - { - "epoch": 0.9, - "grad_norm": 0.0632404014468193, - "learning_rate": 0.00011656860131881966, - "loss": 0.7868, - "step": 1613 - }, - { - "epoch": 0.9, - "grad_norm": 0.054406315088272095, - "learning_rate": 0.00011648219867194362, - "loss": 0.6224, - "step": 1614 - }, - { - "epoch": 0.9, - "grad_norm": 0.059626247733831406, - "learning_rate": 0.00011639578337495787, - "loss": 0.6865, - "step": 1615 - }, - { - "epoch": 0.9, - "grad_norm": 0.05667116865515709, - "learning_rate": 0.00011630935549418627, - "loss": 0.5988, - "step": 1616 - }, - { - "epoch": 0.9, - "grad_norm": 0.0557304285466671, - "learning_rate": 0.00011622291509596234, - "loss": 0.6422, - "step": 1617 - }, - { - "epoch": 0.9, - "grad_norm": 0.0590471550822258, - "learning_rate": 0.00011613646224662921, - "loss": 0.777, - "step": 1618 - }, - { - "epoch": 0.9, - "grad_norm": 0.06502439081668854, - "learning_rate": 0.00011604999701253953, - "loss": 0.7052, - "step": 1619 - }, - { - "epoch": 0.9, - "grad_norm": 0.05545181408524513, - "learning_rate": 0.00011596351946005552, - "loss": 0.6561, - "step": 1620 - }, - { - "epoch": 0.9, - "grad_norm": 0.05329308658838272, - "learning_rate": 0.00011587702965554878, - "loss": 0.5068, - "step": 1621 - }, - { - "epoch": 0.9, - "grad_norm": 0.05559957027435303, - "learning_rate": 0.00011579052766540039, - "loss": 0.6479, - "step": 1622 - }, - { - "epoch": 0.9, - "grad_norm": 0.061965446919202805, - "learning_rate": 0.00011570401355600071, - "loss": 0.7318, - "step": 1623 - }, - { - "epoch": 0.9, - "grad_norm": 0.061798419803380966, - "learning_rate": 0.00011561748739374945, - "loss": 0.7131, - "step": 1624 - }, - { - "epoch": 0.9, - "grad_norm": 0.05673898011445999, - "learning_rate": 0.00011553094924505557, - "loss": 0.7431, - "step": 1625 - }, - { - "epoch": 0.9, - "grad_norm": 0.055540382862091064, - "learning_rate": 0.00011544439917633718, - "loss": 0.6852, - "step": 1626 - }, - { - "epoch": 0.9, - "grad_norm": 0.05674376338720322, - "learning_rate": 0.00011535783725402163, - "loss": 0.7056, - "step": 1627 - }, - { - "epoch": 0.91, - "grad_norm": 0.057587672024965286, - "learning_rate": 0.00011527126354454525, - "loss": 0.6101, - "step": 1628 - }, - { - "epoch": 0.91, - "grad_norm": 0.06215091049671173, - "learning_rate": 0.00011518467811435352, - "loss": 0.742, - "step": 1629 - }, - { - "epoch": 0.91, - "grad_norm": 
0.054263584315776825, - "learning_rate": 0.00011509808102990085, - "loss": 0.6079, - "step": 1630 - }, - { - "epoch": 0.91, - "grad_norm": 0.05362584441900253, - "learning_rate": 0.00011501147235765063, - "loss": 0.5511, - "step": 1631 - }, - { - "epoch": 0.91, - "grad_norm": 0.05936874449253082, - "learning_rate": 0.00011492485216407513, - "loss": 0.6716, - "step": 1632 - }, - { - "epoch": 0.91, - "grad_norm": 0.05744662880897522, - "learning_rate": 0.00011483822051565549, - "loss": 0.6429, - "step": 1633 - }, - { - "epoch": 0.91, - "grad_norm": 0.06137154996395111, - "learning_rate": 0.0001147515774788816, - "loss": 0.6882, - "step": 1634 - }, - { - "epoch": 0.91, - "grad_norm": 0.06001037731766701, - "learning_rate": 0.0001146649231202521, - "loss": 0.655, - "step": 1635 - }, - { - "epoch": 0.91, - "grad_norm": 0.054453328251838684, - "learning_rate": 0.0001145782575062743, - "loss": 0.6446, - "step": 1636 - }, - { - "epoch": 0.91, - "grad_norm": 0.056741863489151, - "learning_rate": 0.00011449158070346424, - "loss": 0.6569, - "step": 1637 - }, - { - "epoch": 0.91, - "grad_norm": 0.05473573878407478, - "learning_rate": 0.00011440489277834645, - "loss": 0.6475, - "step": 1638 - }, - { - "epoch": 0.91, - "grad_norm": 0.060271989554166794, - "learning_rate": 0.00011431819379745401, - "loss": 0.7187, - "step": 1639 - }, - { - "epoch": 0.91, - "grad_norm": 0.05573005974292755, - "learning_rate": 0.00011423148382732853, - "loss": 0.639, - "step": 1640 - }, - { - "epoch": 0.91, - "grad_norm": 0.05600711703300476, - "learning_rate": 0.00011414476293452001, - "loss": 0.6168, - "step": 1641 - }, - { - "epoch": 0.91, - "grad_norm": 0.053048767149448395, - "learning_rate": 0.0001140580311855869, - "loss": 0.5581, - "step": 1642 - }, - { - "epoch": 0.91, - "grad_norm": 0.05543733760714531, - "learning_rate": 0.00011397128864709586, - "loss": 0.6627, - "step": 1643 - }, - { - "epoch": 0.91, - "grad_norm": 0.05584166571497917, - "learning_rate": 0.00011388453538562195, - "loss": 0.6437, - "step": 1644 - }, - { - "epoch": 0.91, - "grad_norm": 0.06031232327222824, - "learning_rate": 0.00011379777146774844, - "loss": 0.5947, - "step": 1645 - }, - { - "epoch": 0.92, - "grad_norm": 0.055425215512514114, - "learning_rate": 0.0001137109969600667, - "loss": 0.6921, - "step": 1646 - }, - { - "epoch": 0.92, - "grad_norm": 0.05608774349093437, - "learning_rate": 0.00011362421192917631, - "loss": 0.7744, - "step": 1647 - }, - { - "epoch": 0.92, - "grad_norm": 0.06419660151004791, - "learning_rate": 0.00011353741644168487, - "loss": 0.7103, - "step": 1648 - }, - { - "epoch": 0.92, - "grad_norm": 0.060391318053007126, - "learning_rate": 0.0001134506105642081, - "loss": 0.7168, - "step": 1649 - }, - { - "epoch": 0.92, - "grad_norm": 0.05262453481554985, - "learning_rate": 0.00011336379436336955, - "loss": 0.5733, - "step": 1650 - }, - { - "epoch": 0.92, - "grad_norm": 0.0592212975025177, - "learning_rate": 0.00011327696790580083, - "loss": 0.6959, - "step": 1651 - }, - { - "epoch": 0.92, - "grad_norm": 0.061455368995666504, - "learning_rate": 0.00011319013125814131, - "loss": 0.661, - "step": 1652 - }, - { - "epoch": 0.92, - "grad_norm": 0.05837690457701683, - "learning_rate": 0.00011310328448703829, - "loss": 0.6521, - "step": 1653 - }, - { - "epoch": 0.92, - "grad_norm": 0.055113084614276886, - "learning_rate": 0.00011301642765914673, - "loss": 0.7173, - "step": 1654 - }, - { - "epoch": 0.92, - "grad_norm": 0.06535536795854568, - "learning_rate": 0.00011292956084112943, - "loss": 0.6889, - "step": 1655 - }, - { - 
"epoch": 0.92, - "grad_norm": 0.056155428290367126, - "learning_rate": 0.00011284268409965673, - "loss": 0.6287, - "step": 1656 - }, - { - "epoch": 0.92, - "grad_norm": 0.05229957774281502, - "learning_rate": 0.00011275579750140666, - "loss": 0.617, - "step": 1657 - }, - { - "epoch": 0.92, - "grad_norm": 0.05848075821995735, - "learning_rate": 0.00011266890111306484, - "loss": 0.6121, - "step": 1658 - }, - { - "epoch": 0.92, - "grad_norm": 0.06387878954410553, - "learning_rate": 0.00011258199500132429, - "loss": 0.6307, - "step": 1659 - }, - { - "epoch": 0.92, - "grad_norm": 0.05076488479971886, - "learning_rate": 0.00011249507923288562, - "loss": 0.6157, - "step": 1660 - }, - { - "epoch": 0.92, - "grad_norm": 0.053368885070085526, - "learning_rate": 0.0001124081538744568, - "loss": 0.675, - "step": 1661 - }, - { - "epoch": 0.92, - "grad_norm": 0.06017875671386719, - "learning_rate": 0.00011232121899275314, - "loss": 0.6557, - "step": 1662 - }, - { - "epoch": 0.92, - "grad_norm": 0.06376302242279053, - "learning_rate": 0.00011223427465449729, - "loss": 0.6831, - "step": 1663 - }, - { - "epoch": 0.93, - "grad_norm": 0.06142215430736542, - "learning_rate": 0.00011214732092641916, - "loss": 0.6506, - "step": 1664 - }, - { - "epoch": 0.93, - "grad_norm": 0.0578768253326416, - "learning_rate": 0.00011206035787525585, - "loss": 0.7647, - "step": 1665 - }, - { - "epoch": 0.93, - "grad_norm": 0.05308634787797928, - "learning_rate": 0.00011197338556775156, - "loss": 0.6295, - "step": 1666 - }, - { - "epoch": 0.93, - "grad_norm": 0.050338249653577805, - "learning_rate": 0.00011188640407065776, - "loss": 0.5454, - "step": 1667 - }, - { - "epoch": 0.93, - "grad_norm": 0.05560674890875816, - "learning_rate": 0.00011179941345073278, - "loss": 0.7058, - "step": 1668 - }, - { - "epoch": 0.93, - "grad_norm": 0.054467298090457916, - "learning_rate": 0.00011171241377474207, - "loss": 0.5801, - "step": 1669 - }, - { - "epoch": 0.93, - "grad_norm": 0.05152300372719765, - "learning_rate": 0.00011162540510945799, - "loss": 0.5644, - "step": 1670 - }, - { - "epoch": 0.93, - "grad_norm": 0.05688504874706268, - "learning_rate": 0.0001115383875216598, - "loss": 0.6589, - "step": 1671 - }, - { - "epoch": 0.93, - "grad_norm": 0.05523272603750229, - "learning_rate": 0.00011145136107813363, - "loss": 0.6098, - "step": 1672 - }, - { - "epoch": 0.93, - "grad_norm": 0.058240048587322235, - "learning_rate": 0.0001113643258456724, - "loss": 0.7046, - "step": 1673 - }, - { - "epoch": 0.93, - "grad_norm": 0.054752644151449203, - "learning_rate": 0.00011127728189107576, - "loss": 0.7045, - "step": 1674 - }, - { - "epoch": 0.93, - "grad_norm": 0.0511183924973011, - "learning_rate": 0.00011119022928115007, - "loss": 0.6525, - "step": 1675 - }, - { - "epoch": 0.93, - "grad_norm": 0.05391978099942207, - "learning_rate": 0.00011110316808270831, - "loss": 0.6324, - "step": 1676 - }, - { - "epoch": 0.93, - "grad_norm": 0.05610180273652077, - "learning_rate": 0.00011101609836257008, - "loss": 0.6382, - "step": 1677 - }, - { - "epoch": 0.93, - "grad_norm": 0.05987925082445145, - "learning_rate": 0.00011092902018756151, - "loss": 0.7006, - "step": 1678 - }, - { - "epoch": 0.93, - "grad_norm": 0.06315992027521133, - "learning_rate": 0.0001108419336245152, - "loss": 0.7349, - "step": 1679 - }, - { - "epoch": 0.93, - "grad_norm": 0.05764647200703621, - "learning_rate": 0.0001107548387402702, - "loss": 0.6374, - "step": 1680 - }, - { - "epoch": 0.93, - "grad_norm": 0.05375726893544197, - "learning_rate": 0.00011066773560167196, - "loss": 
0.6368, - "step": 1681 - }, - { - "epoch": 0.94, - "grad_norm": 0.05238979682326317, - "learning_rate": 0.00011058062427557229, - "loss": 0.5693, - "step": 1682 - }, - { - "epoch": 0.94, - "grad_norm": 0.055107586085796356, - "learning_rate": 0.00011049350482882919, - "loss": 0.6024, - "step": 1683 - }, - { - "epoch": 0.94, - "grad_norm": 0.0552542470395565, - "learning_rate": 0.00011040637732830701, - "loss": 0.5507, - "step": 1684 - }, - { - "epoch": 0.94, - "grad_norm": 0.06206995248794556, - "learning_rate": 0.00011031924184087618, - "loss": 0.7039, - "step": 1685 - }, - { - "epoch": 0.94, - "grad_norm": 0.05662161484360695, - "learning_rate": 0.00011023209843341333, - "loss": 0.6329, - "step": 1686 - }, - { - "epoch": 0.94, - "grad_norm": 0.060033902525901794, - "learning_rate": 0.00011014494717280115, - "loss": 0.6859, - "step": 1687 - }, - { - "epoch": 0.94, - "grad_norm": 0.05562319979071617, - "learning_rate": 0.00011005778812592832, - "loss": 0.6442, - "step": 1688 - }, - { - "epoch": 0.94, - "grad_norm": 0.05592164769768715, - "learning_rate": 0.00010997062135968956, - "loss": 0.6388, - "step": 1689 - }, - { - "epoch": 0.94, - "grad_norm": 0.05650092288851738, - "learning_rate": 0.00010988344694098545, - "loss": 0.6502, - "step": 1690 - }, - { - "epoch": 0.94, - "grad_norm": 0.06203974410891533, - "learning_rate": 0.00010979626493672245, - "loss": 0.7274, - "step": 1691 - }, - { - "epoch": 0.94, - "grad_norm": 0.052279599010944366, - "learning_rate": 0.00010970907541381295, - "loss": 0.5828, - "step": 1692 - }, - { - "epoch": 0.94, - "grad_norm": 0.06009140610694885, - "learning_rate": 0.00010962187843917497, - "loss": 0.6759, - "step": 1693 - }, - { - "epoch": 0.94, - "grad_norm": 0.05527809262275696, - "learning_rate": 0.0001095346740797323, - "loss": 0.6657, - "step": 1694 - }, - { - "epoch": 0.94, - "grad_norm": 0.0608220137655735, - "learning_rate": 0.00010944746240241444, - "loss": 0.6928, - "step": 1695 - }, - { - "epoch": 0.94, - "grad_norm": 0.05583418905735016, - "learning_rate": 0.00010936024347415643, - "loss": 0.5839, - "step": 1696 - }, - { - "epoch": 0.94, - "grad_norm": 0.05640941858291626, - "learning_rate": 0.00010927301736189893, - "loss": 0.661, - "step": 1697 - }, - { - "epoch": 0.94, - "grad_norm": 0.0547964982688427, - "learning_rate": 0.0001091857841325881, - "loss": 0.562, - "step": 1698 - }, - { - "epoch": 0.94, - "grad_norm": 0.05608777329325676, - "learning_rate": 0.00010909854385317557, - "loss": 0.5974, - "step": 1699 - }, - { - "epoch": 0.95, - "grad_norm": 0.05370324105024338, - "learning_rate": 0.00010901129659061837, - "loss": 0.6899, - "step": 1700 - }, - { - "epoch": 0.95, - "grad_norm": 0.06166364252567291, - "learning_rate": 0.00010892404241187886, - "loss": 0.6538, - "step": 1701 - }, - { - "epoch": 0.95, - "grad_norm": 0.0633421465754509, - "learning_rate": 0.00010883678138392477, - "loss": 0.785, - "step": 1702 - }, - { - "epoch": 0.95, - "grad_norm": 0.06182889640331268, - "learning_rate": 0.00010874951357372906, - "loss": 0.6403, - "step": 1703 - }, - { - "epoch": 0.95, - "grad_norm": 0.06533250212669373, - "learning_rate": 0.0001086622390482699, - "loss": 0.7551, - "step": 1704 - }, - { - "epoch": 0.95, - "grad_norm": 0.055042069405317307, - "learning_rate": 0.00010857495787453058, - "loss": 0.6083, - "step": 1705 - }, - { - "epoch": 0.95, - "grad_norm": 0.06447125971317291, - "learning_rate": 0.00010848767011949952, - "loss": 0.615, - "step": 1706 - }, - { - "epoch": 0.95, - "grad_norm": 0.056245360523462296, - "learning_rate": 
0.00010840037585017022, - "loss": 0.6705, - "step": 1707 - }, - { - "epoch": 0.95, - "grad_norm": 0.05670240893959999, - "learning_rate": 0.00010831307513354112, - "loss": 0.6491, - "step": 1708 - }, - { - "epoch": 0.95, - "grad_norm": 0.05644702538847923, - "learning_rate": 0.00010822576803661564, - "loss": 0.6462, - "step": 1709 - }, - { - "epoch": 0.95, - "grad_norm": 0.05394309014081955, - "learning_rate": 0.00010813845462640206, - "loss": 0.6643, - "step": 1710 - }, - { - "epoch": 0.95, - "grad_norm": 0.057082321494817734, - "learning_rate": 0.00010805113496991364, - "loss": 0.7565, - "step": 1711 - }, - { - "epoch": 0.95, - "grad_norm": 0.06196596845984459, - "learning_rate": 0.00010796380913416823, - "loss": 0.6655, - "step": 1712 - }, - { - "epoch": 0.95, - "grad_norm": 0.06284771114587784, - "learning_rate": 0.0001078764771861886, - "loss": 0.647, - "step": 1713 - }, - { - "epoch": 0.95, - "grad_norm": 0.06070135906338692, - "learning_rate": 0.0001077891391930021, - "loss": 0.6143, - "step": 1714 - }, - { - "epoch": 0.95, - "grad_norm": 0.058664754033088684, - "learning_rate": 0.00010770179522164079, - "loss": 0.6618, - "step": 1715 - }, - { - "epoch": 0.95, - "grad_norm": 0.0531286895275116, - "learning_rate": 0.00010761444533914125, - "loss": 0.6304, - "step": 1716 - }, - { - "epoch": 0.95, - "grad_norm": 0.053345970809459686, - "learning_rate": 0.0001075270896125446, - "loss": 0.6148, - "step": 1717 - }, - { - "epoch": 0.96, - "grad_norm": 0.05614785850048065, - "learning_rate": 0.00010743972810889655, - "loss": 0.5945, - "step": 1718 - }, - { - "epoch": 0.96, - "grad_norm": 0.05940824747085571, - "learning_rate": 0.00010735236089524716, - "loss": 0.6681, - "step": 1719 - }, - { - "epoch": 0.96, - "grad_norm": 0.0561952069401741, - "learning_rate": 0.00010726498803865088, - "loss": 0.6925, - "step": 1720 - }, - { - "epoch": 0.96, - "grad_norm": 0.053867727518081665, - "learning_rate": 0.00010717760960616643, - "loss": 0.5798, - "step": 1721 - }, - { - "epoch": 0.96, - "grad_norm": 0.06327812373638153, - "learning_rate": 0.00010709022566485698, - "loss": 0.7174, - "step": 1722 - }, - { - "epoch": 0.96, - "grad_norm": 0.06408220529556274, - "learning_rate": 0.00010700283628178975, - "loss": 0.7552, - "step": 1723 - }, - { - "epoch": 0.96, - "grad_norm": 0.06066514551639557, - "learning_rate": 0.00010691544152403623, - "loss": 0.72, - "step": 1724 - }, - { - "epoch": 0.96, - "grad_norm": 0.06779921799898148, - "learning_rate": 0.00010682804145867204, - "loss": 0.7299, - "step": 1725 - }, - { - "epoch": 0.96, - "grad_norm": 0.060735609382390976, - "learning_rate": 0.0001067406361527768, - "loss": 0.6431, - "step": 1726 - }, - { - "epoch": 0.96, - "grad_norm": 0.05965111032128334, - "learning_rate": 0.00010665322567343423, - "loss": 0.6426, - "step": 1727 - }, - { - "epoch": 0.96, - "grad_norm": 0.05794420465826988, - "learning_rate": 0.00010656581008773198, - "loss": 0.6485, - "step": 1728 - }, - { - "epoch": 0.96, - "grad_norm": 0.06031234934926033, - "learning_rate": 0.00010647838946276165, - "loss": 0.6548, - "step": 1729 - }, - { - "epoch": 0.96, - "grad_norm": 0.056517090648412704, - "learning_rate": 0.00010639096386561864, - "loss": 0.6871, - "step": 1730 - }, - { - "epoch": 0.96, - "grad_norm": 0.06209394708275795, - "learning_rate": 0.00010630353336340226, - "loss": 0.6645, - "step": 1731 - }, - { - "epoch": 0.96, - "grad_norm": 0.059892479330301285, - "learning_rate": 0.00010621609802321555, - "loss": 0.6261, - "step": 1732 - }, - { - "epoch": 0.96, - "grad_norm": 
0.06408750265836716, - "learning_rate": 0.0001061286579121652, - "loss": 0.7269, - "step": 1733 - }, - { - "epoch": 0.96, - "grad_norm": 0.057373423129320145, - "learning_rate": 0.00010604121309736164, - "loss": 0.6173, - "step": 1734 - }, - { - "epoch": 0.96, - "grad_norm": 0.055811841040849686, - "learning_rate": 0.00010595376364591889, - "loss": 0.6444, - "step": 1735 - }, - { - "epoch": 0.97, - "grad_norm": 0.05903814360499382, - "learning_rate": 0.00010586630962495452, - "loss": 0.7005, - "step": 1736 - }, - { - "epoch": 0.97, - "grad_norm": 0.06342557072639465, - "learning_rate": 0.00010577885110158958, - "loss": 0.6226, - "step": 1737 - }, - { - "epoch": 0.97, - "grad_norm": 0.06033981963992119, - "learning_rate": 0.00010569138814294864, - "loss": 0.6606, - "step": 1738 - }, - { - "epoch": 0.97, - "grad_norm": 0.0658353865146637, - "learning_rate": 0.00010560392081615962, - "loss": 0.6379, - "step": 1739 - }, - { - "epoch": 0.97, - "grad_norm": 0.05433519929647446, - "learning_rate": 0.00010551644918835381, - "loss": 0.5832, - "step": 1740 - }, - { - "epoch": 0.97, - "grad_norm": 0.058659233152866364, - "learning_rate": 0.00010542897332666581, - "loss": 0.6252, - "step": 1741 - }, - { - "epoch": 0.97, - "grad_norm": 0.06791430711746216, - "learning_rate": 0.00010534149329823349, - "loss": 0.7116, - "step": 1742 - }, - { - "epoch": 0.97, - "grad_norm": 0.06629349291324615, - "learning_rate": 0.00010525400917019784, - "loss": 0.9018, - "step": 1743 - }, - { - "epoch": 0.97, - "grad_norm": 0.05723525211215019, - "learning_rate": 0.00010516652100970308, - "loss": 0.6521, - "step": 1744 - }, - { - "epoch": 0.97, - "grad_norm": 0.0574793666601181, - "learning_rate": 0.00010507902888389647, - "loss": 0.65, - "step": 1745 - }, - { - "epoch": 0.97, - "grad_norm": 0.05418836697936058, - "learning_rate": 0.00010499153285992833, - "loss": 0.5964, - "step": 1746 - }, - { - "epoch": 0.97, - "grad_norm": 0.05051202327013016, - "learning_rate": 0.00010490403300495201, - "loss": 0.6072, - "step": 1747 - }, - { - "epoch": 0.97, - "grad_norm": 0.05381901189684868, - "learning_rate": 0.00010481652938612374, - "loss": 0.5846, - "step": 1748 - }, - { - "epoch": 0.97, - "grad_norm": 0.055063627660274506, - "learning_rate": 0.00010472902207060265, - "loss": 0.5894, - "step": 1749 - }, - { - "epoch": 0.97, - "grad_norm": 0.061404235661029816, - "learning_rate": 0.00010464151112555077, - "loss": 0.6717, - "step": 1750 - }, - { - "epoch": 0.97, - "grad_norm": 0.05025755986571312, - "learning_rate": 0.00010455399661813283, - "loss": 0.5984, - "step": 1751 - }, - { - "epoch": 0.97, - "grad_norm": 0.0560632087290287, - "learning_rate": 0.00010446647861551633, - "loss": 0.5581, - "step": 1752 - }, - { - "epoch": 0.97, - "grad_norm": 0.06142037361860275, - "learning_rate": 0.0001043789571848715, - "loss": 0.5679, - "step": 1753 - }, - { - "epoch": 0.98, - "grad_norm": 0.056344058364629745, - "learning_rate": 0.00010429143239337112, - "loss": 0.6612, - "step": 1754 - }, - { - "epoch": 0.98, - "grad_norm": 0.06066104397177696, - "learning_rate": 0.00010420390430819058, - "loss": 0.7477, - "step": 1755 - }, - { - "epoch": 0.98, - "grad_norm": 0.05510355904698372, - "learning_rate": 0.00010411637299650783, - "loss": 0.6163, - "step": 1756 - }, - { - "epoch": 0.98, - "grad_norm": 0.057294245809316635, - "learning_rate": 0.00010402883852550325, - "loss": 0.6468, - "step": 1757 - }, - { - "epoch": 0.98, - "grad_norm": 0.056174419820308685, - "learning_rate": 0.00010394130096235966, - "loss": 0.6213, - "step": 1758 - }, - 
{ - "epoch": 0.98, - "grad_norm": 0.06194084510207176, - "learning_rate": 0.00010385376037426226, - "loss": 0.7077, - "step": 1759 - }, - { - "epoch": 0.98, - "grad_norm": 0.06177836284041405, - "learning_rate": 0.00010376621682839857, - "loss": 0.6536, - "step": 1760 - }, - { - "epoch": 0.98, - "grad_norm": 0.059714220464229584, - "learning_rate": 0.00010367867039195842, - "loss": 0.5912, - "step": 1761 - }, - { - "epoch": 0.98, - "grad_norm": 0.052113138139247894, - "learning_rate": 0.00010359112113213376, - "loss": 0.5401, - "step": 1762 - }, - { - "epoch": 0.98, - "grad_norm": 0.06121697649359703, - "learning_rate": 0.0001035035691161188, - "loss": 0.6862, - "step": 1763 - }, - { - "epoch": 0.98, - "grad_norm": 0.05976003035902977, - "learning_rate": 0.00010341601441110983, - "loss": 0.58, - "step": 1764 - }, - { - "epoch": 0.98, - "grad_norm": 0.05741668865084648, - "learning_rate": 0.00010332845708430519, - "loss": 0.5233, - "step": 1765 - }, - { - "epoch": 0.98, - "grad_norm": 0.06236950680613518, - "learning_rate": 0.00010324089720290521, - "loss": 0.7257, - "step": 1766 - }, - { - "epoch": 0.98, - "grad_norm": 0.06206139549612999, - "learning_rate": 0.00010315333483411232, - "loss": 0.6985, - "step": 1767 - }, - { - "epoch": 0.98, - "grad_norm": 0.05406549200415611, - "learning_rate": 0.00010306577004513065, - "loss": 0.5677, - "step": 1768 - }, - { - "epoch": 0.98, - "grad_norm": 0.06077892333269119, - "learning_rate": 0.0001029782029031663, - "loss": 0.6745, - "step": 1769 - }, - { - "epoch": 0.98, - "grad_norm": 0.06338007748126984, - "learning_rate": 0.00010289063347542726, - "loss": 0.6612, - "step": 1770 - }, - { - "epoch": 0.98, - "grad_norm": 0.060978084802627563, - "learning_rate": 0.00010280306182912313, - "loss": 0.7625, - "step": 1771 - }, - { - "epoch": 0.99, - "grad_norm": 0.0552746020257473, - "learning_rate": 0.00010271548803146526, - "loss": 0.5827, - "step": 1772 - }, - { - "epoch": 0.99, - "grad_norm": 0.05428915098309517, - "learning_rate": 0.00010262791214966668, - "loss": 0.6786, - "step": 1773 - }, - { - "epoch": 0.99, - "grad_norm": 0.057721514254808426, - "learning_rate": 0.00010254033425094197, - "loss": 0.5914, - "step": 1774 - }, - { - "epoch": 0.99, - "grad_norm": 0.05720565468072891, - "learning_rate": 0.00010245275440250728, - "loss": 0.6013, - "step": 1775 - }, - { - "epoch": 0.99, - "grad_norm": 0.05914613604545593, - "learning_rate": 0.00010236517267158028, - "loss": 0.6439, - "step": 1776 - }, - { - "epoch": 0.99, - "grad_norm": 0.057543400675058365, - "learning_rate": 0.00010227758912538008, - "loss": 0.7292, - "step": 1777 - }, - { - "epoch": 0.99, - "grad_norm": 0.056838035583496094, - "learning_rate": 0.00010219000383112713, - "loss": 0.6506, - "step": 1778 - }, - { - "epoch": 0.99, - "grad_norm": 0.05806044489145279, - "learning_rate": 0.00010210241685604331, - "loss": 0.6114, - "step": 1779 - }, - { - "epoch": 0.99, - "grad_norm": 0.0533684603869915, - "learning_rate": 0.00010201482826735172, - "loss": 0.5946, - "step": 1780 - }, - { - "epoch": 0.99, - "grad_norm": 0.05922561138868332, - "learning_rate": 0.00010192723813227672, - "loss": 0.6626, - "step": 1781 - }, - { - "epoch": 0.99, - "grad_norm": 0.055819686502218246, - "learning_rate": 0.00010183964651804382, - "loss": 0.6422, - "step": 1782 - }, - { - "epoch": 0.99, - "grad_norm": 0.050924383103847504, - "learning_rate": 0.00010175205349187977, - "loss": 0.5344, - "step": 1783 - }, - { - "epoch": 0.99, - "grad_norm": 0.06296874582767487, - "learning_rate": 0.00010166445912101231, - 
"loss": 0.6451, - "step": 1784 - }, - { - "epoch": 0.99, - "grad_norm": 0.05765317752957344, - "learning_rate": 0.00010157686347267021, - "loss": 0.6403, - "step": 1785 - }, - { - "epoch": 0.99, - "grad_norm": 0.060042284429073334, - "learning_rate": 0.00010148926661408327, - "loss": 0.6919, - "step": 1786 - }, - { - "epoch": 0.99, - "grad_norm": 0.06098796799778938, - "learning_rate": 0.0001014016686124822, - "loss": 0.6371, - "step": 1787 - }, - { - "epoch": 0.99, - "grad_norm": 0.05685941502451897, - "learning_rate": 0.00010131406953509857, - "loss": 0.6399, - "step": 1788 - }, - { - "epoch": 0.99, - "grad_norm": 0.06483474373817444, - "learning_rate": 0.00010122646944916483, - "loss": 0.6915, - "step": 1789 - }, - { - "epoch": 1.0, - "grad_norm": 0.06274085491895676, - "learning_rate": 0.00010113886842191408, - "loss": 0.6212, - "step": 1790 - }, - { - "epoch": 1.0, - "grad_norm": 0.06120248883962631, - "learning_rate": 0.00010105126652058032, - "loss": 0.6859, - "step": 1791 - }, - { - "epoch": 1.0, - "grad_norm": 0.06124896928668022, - "learning_rate": 0.00010096366381239808, - "loss": 0.5996, - "step": 1792 - }, - { - "epoch": 1.0, - "grad_norm": 0.05330495536327362, - "learning_rate": 0.00010087606036460257, - "loss": 0.7189, - "step": 1793 - }, - { - "epoch": 1.0, - "grad_norm": 0.06091773882508278, - "learning_rate": 0.00010078845624442954, - "loss": 0.6692, - "step": 1794 - }, - { - "epoch": 1.0, - "grad_norm": 0.057791825383901596, - "learning_rate": 0.0001007008515191153, - "loss": 0.6403, - "step": 1795 - }, - { - "epoch": 1.0, - "grad_norm": 0.06233106181025505, - "learning_rate": 0.00010061324625589657, - "loss": 0.6166, - "step": 1796 - }, - { - "epoch": 1.0, - "grad_norm": 0.06440749019384384, - "learning_rate": 0.0001005256405220105, - "loss": 0.7032, - "step": 1797 - }, - { - "epoch": 1.0, - "grad_norm": 0.060089919716119766, - "learning_rate": 0.00010043803438469461, - "loss": 0.6484, - "step": 1798 - }, - { - "epoch": 1.0, - "grad_norm": 0.05943863466382027, - "learning_rate": 0.00010035042791118674, - "loss": 0.6422, - "step": 1799 - }, - { - "epoch": 1.0, - "grad_norm": 0.07573998719453812, - "learning_rate": 0.00010026282116872499, - "loss": 0.6176, - "step": 1800 - }, - { - "epoch": 1.0, - "grad_norm": 0.061618976294994354, - "learning_rate": 0.00010017521422454763, - "loss": 0.7475, - "step": 1801 - }, - { - "epoch": 1.0, - "grad_norm": 0.05466758459806442, - "learning_rate": 0.00010008760714589311, - "loss": 0.6769, - "step": 1802 - }, - { - "epoch": 1.0, - "grad_norm": 0.05463040620088577, - "learning_rate": 0.0001, - "loss": 0.6142, - "step": 1803 - }, - { - "epoch": 1.0, - "grad_norm": 0.05533997714519501, - "learning_rate": 9.991239285410691e-05, - "loss": 0.6714, - "step": 1804 - }, - { - "epoch": 1.0, - "grad_norm": 0.0693361908197403, - "learning_rate": 9.98247857754524e-05, - "loss": 0.6588, - "step": 1805 - }, - { - "epoch": 1.0, - "grad_norm": 0.061774842441082, - "learning_rate": 9.973717883127504e-05, - "loss": 0.7012, - "step": 1806 - }, - { - "epoch": 1.01, - "grad_norm": 0.057546455413103104, - "learning_rate": 9.964957208881329e-05, - "loss": 0.6242, - "step": 1807 - }, - { - "epoch": 1.01, - "grad_norm": 0.059542592614889145, - "learning_rate": 9.956196561530541e-05, - "loss": 0.5466, - "step": 1808 - }, - { - "epoch": 1.01, - "grad_norm": 0.07098076492547989, - "learning_rate": 9.947435947798953e-05, - "loss": 0.7131, - "step": 1809 - }, - { - "epoch": 1.01, - "grad_norm": 0.05665259063243866, - "learning_rate": 9.938675374410347e-05, - "loss": 
0.6429, - "step": 1810 - }, - { - "epoch": 1.01, - "grad_norm": 0.06300745159387589, - "learning_rate": 9.929914848088473e-05, - "loss": 0.6545, - "step": 1811 - }, - { - "epoch": 1.01, - "grad_norm": 0.05833979323506355, - "learning_rate": 9.921154375557047e-05, - "loss": 0.5777, - "step": 1812 - }, - { - "epoch": 1.01, - "grad_norm": 0.06335113197565079, - "learning_rate": 9.912393963539745e-05, - "loss": 0.7244, - "step": 1813 - }, - { - "epoch": 1.01, - "grad_norm": 0.06383019685745239, - "learning_rate": 9.903633618760195e-05, - "loss": 0.6863, - "step": 1814 - }, - { - "epoch": 1.01, - "grad_norm": 0.0583442859351635, - "learning_rate": 9.894873347941971e-05, - "loss": 0.6324, - "step": 1815 - }, - { - "epoch": 1.01, - "grad_norm": 0.06413304060697556, - "learning_rate": 9.886113157808594e-05, - "loss": 0.6864, - "step": 1816 - }, - { - "epoch": 1.01, - "grad_norm": 0.05635803937911987, - "learning_rate": 9.87735305508352e-05, - "loss": 0.6943, - "step": 1817 - }, - { - "epoch": 1.01, - "grad_norm": 0.061192139983177185, - "learning_rate": 9.868593046490144e-05, - "loss": 0.6384, - "step": 1818 - }, - { - "epoch": 1.01, - "grad_norm": 0.060894276946783066, - "learning_rate": 9.859833138751783e-05, - "loss": 0.6649, - "step": 1819 - }, - { - "epoch": 1.01, - "grad_norm": 0.066097192466259, - "learning_rate": 9.851073338591675e-05, - "loss": 0.7619, - "step": 1820 - }, - { - "epoch": 1.01, - "grad_norm": 0.06046619266271591, - "learning_rate": 9.842313652732982e-05, - "loss": 0.7299, - "step": 1821 - }, - { - "epoch": 1.01, - "grad_norm": 0.06305580586194992, - "learning_rate": 9.833554087898773e-05, - "loss": 0.6843, - "step": 1822 - }, - { - "epoch": 1.01, - "grad_norm": 0.05506165698170662, - "learning_rate": 9.824794650812026e-05, - "loss": 0.5665, - "step": 1823 - }, - { - "epoch": 1.01, - "grad_norm": 0.06139134615659714, - "learning_rate": 9.81603534819562e-05, - "loss": 0.7178, - "step": 1824 - }, - { - "epoch": 1.02, - "grad_norm": 0.05725986137986183, - "learning_rate": 9.807276186772333e-05, - "loss": 0.6013, - "step": 1825 - }, - { - "epoch": 1.02, - "grad_norm": 0.060024797916412354, - "learning_rate": 9.798517173264833e-05, - "loss": 0.6464, - "step": 1826 - }, - { - "epoch": 1.02, - "grad_norm": 0.06363005191087723, - "learning_rate": 9.789758314395672e-05, - "loss": 0.6539, - "step": 1827 - }, - { - "epoch": 1.02, - "eval_loss": 0.6588459610939026, - "eval_runtime": 196.7902, - "eval_samples_per_second": 28.391, - "eval_steps_per_second": 14.198, - "step": 1827 - }, - { - "epoch": 1.0, - "grad_norm": 0.11711059510707855, - "learning_rate": 9.780999616887288e-05, - "loss": 0.7841, - "step": 1828 - }, - { - "epoch": 1.0, - "grad_norm": 0.06279384344816208, - "learning_rate": 9.772241087461997e-05, - "loss": 0.6586, - "step": 1829 - }, - { - "epoch": 1.0, - "grad_norm": 0.06254998594522476, - "learning_rate": 9.763482732841976e-05, - "loss": 0.6021, - "step": 1830 - }, - { - "epoch": 1.0, - "grad_norm": 0.06098167970776558, - "learning_rate": 9.754724559749276e-05, - "loss": 0.6118, - "step": 1831 - }, - { - "epoch": 1.0, - "grad_norm": 0.06855977326631546, - "learning_rate": 9.74596657490581e-05, - "loss": 0.6888, - "step": 1832 - }, - { - "epoch": 1.0, - "grad_norm": 0.051197465509176254, - "learning_rate": 9.737208785033337e-05, - "loss": 0.5669, - "step": 1833 - }, - { - "epoch": 1.0, - "grad_norm": 0.06223751977086067, - "learning_rate": 9.728451196853477e-05, - "loss": 0.6522, - "step": 1834 - }, - { - "epoch": 1.0, - "grad_norm": 0.05435439944267273, - 
"learning_rate": 9.71969381708769e-05, - "loss": 0.534, - "step": 1835 - }, - { - "epoch": 1.0, - "grad_norm": 0.05871767923235893, - "learning_rate": 9.710936652457276e-05, - "loss": 0.5852, - "step": 1836 - }, - { - "epoch": 1.01, - "grad_norm": 0.06656838208436966, - "learning_rate": 9.70217970968337e-05, - "loss": 0.6139, - "step": 1837 - }, - { - "epoch": 1.01, - "grad_norm": 0.06376973539590836, - "learning_rate": 9.693422995486939e-05, - "loss": 0.608, - "step": 1838 - }, - { - "epoch": 1.01, - "grad_norm": 0.058163829147815704, - "learning_rate": 9.684666516588772e-05, - "loss": 0.528, - "step": 1839 - }, - { - "epoch": 1.01, - "grad_norm": 0.062393542379140854, - "learning_rate": 9.675910279709477e-05, - "loss": 0.6297, - "step": 1840 - }, - { - "epoch": 1.01, - "grad_norm": 0.06609448045492172, - "learning_rate": 9.667154291569482e-05, - "loss": 0.6068, - "step": 1841 - }, - { - "epoch": 1.01, - "grad_norm": 0.06638655811548233, - "learning_rate": 9.658398558889018e-05, - "loss": 0.6549, - "step": 1842 - }, - { - "epoch": 1.01, - "grad_norm": 0.06767024099826813, - "learning_rate": 9.649643088388119e-05, - "loss": 0.5385, - "step": 1843 - }, - { - "epoch": 1.01, - "grad_norm": 0.06396374851465225, - "learning_rate": 9.640887886786624e-05, - "loss": 0.7188, - "step": 1844 - }, - { - "epoch": 1.01, - "grad_norm": 0.06499580293893814, - "learning_rate": 9.632132960804159e-05, - "loss": 0.6873, - "step": 1845 - }, - { - "epoch": 1.01, - "grad_norm": 0.06490854173898697, - "learning_rate": 9.623378317160142e-05, - "loss": 0.6862, - "step": 1846 - }, - { - "epoch": 1.01, - "grad_norm": 0.053400713950395584, - "learning_rate": 9.614623962573776e-05, - "loss": 0.4934, - "step": 1847 - }, - { - "epoch": 1.01, - "grad_norm": 0.060843680053949356, - "learning_rate": 9.605869903764036e-05, - "loss": 0.5619, - "step": 1848 - }, - { - "epoch": 1.01, - "grad_norm": 0.05479586124420166, - "learning_rate": 9.597116147449676e-05, - "loss": 0.5344, - "step": 1849 - }, - { - "epoch": 1.01, - "grad_norm": 0.06369245052337646, - "learning_rate": 9.588362700349218e-05, - "loss": 0.5114, - "step": 1850 - }, - { - "epoch": 1.01, - "grad_norm": 0.05581709370017052, - "learning_rate": 9.579609569180942e-05, - "loss": 0.5555, - "step": 1851 - }, - { - "epoch": 1.01, - "grad_norm": 0.059186048805713654, - "learning_rate": 9.570856760662889e-05, - "loss": 0.6179, - "step": 1852 - }, - { - "epoch": 1.01, - "grad_norm": 0.06566520035266876, - "learning_rate": 9.562104281512852e-05, - "loss": 0.6656, - "step": 1853 - }, - { - "epoch": 1.01, - "grad_norm": 0.06835220754146576, - "learning_rate": 9.553352138448366e-05, - "loss": 0.7285, - "step": 1854 - }, - { - "epoch": 1.02, - "grad_norm": 0.06603655219078064, - "learning_rate": 9.54460033818672e-05, - "loss": 0.6859, - "step": 1855 - }, - { - "epoch": 1.02, - "grad_norm": 0.0615200474858284, - "learning_rate": 9.535848887444925e-05, - "loss": 0.6289, - "step": 1856 - }, - { - "epoch": 1.02, - "grad_norm": 0.06494131684303284, - "learning_rate": 9.527097792939737e-05, - "loss": 0.6473, - "step": 1857 - }, - { - "epoch": 1.02, - "grad_norm": 0.06510718911886215, - "learning_rate": 9.518347061387628e-05, - "loss": 0.6555, - "step": 1858 - }, - { - "epoch": 1.02, - "grad_norm": 0.06615065038204193, - "learning_rate": 9.509596699504801e-05, - "loss": 0.5815, - "step": 1859 - }, - { - "epoch": 1.02, - "grad_norm": 0.06684267520904541, - "learning_rate": 9.500846714007168e-05, - "loss": 0.6201, - "step": 1860 - }, - { - "epoch": 1.02, - "grad_norm": 
0.06057841703295708, - "learning_rate": 9.492097111610357e-05, - "loss": 0.6083, - "step": 1861 - }, - { - "epoch": 1.02, - "grad_norm": 0.06085360795259476, - "learning_rate": 9.483347899029695e-05, - "loss": 0.5041, - "step": 1862 - }, - { - "epoch": 1.02, - "grad_norm": 0.06703361123800278, - "learning_rate": 9.474599082980217e-05, - "loss": 0.6671, - "step": 1863 - }, - { - "epoch": 1.02, - "grad_norm": 0.0637744590640068, - "learning_rate": 9.465850670176654e-05, - "loss": 0.6451, - "step": 1864 - }, - { - "epoch": 1.02, - "grad_norm": 0.06312189251184464, - "learning_rate": 9.45710266733342e-05, - "loss": 0.6395, - "step": 1865 - }, - { - "epoch": 1.02, - "grad_norm": 0.0770537480711937, - "learning_rate": 9.448355081164621e-05, - "loss": 0.6448, - "step": 1866 - }, - { - "epoch": 1.02, - "grad_norm": 0.061500776559114456, - "learning_rate": 9.439607918384039e-05, - "loss": 0.6372, - "step": 1867 - }, - { - "epoch": 1.02, - "grad_norm": 0.059113096445798874, - "learning_rate": 9.430861185705137e-05, - "loss": 0.5822, - "step": 1868 - }, - { - "epoch": 1.02, - "grad_norm": 0.0576748363673687, - "learning_rate": 9.422114889841044e-05, - "loss": 0.5552, - "step": 1869 - }, - { - "epoch": 1.02, - "grad_norm": 0.06234674155712128, - "learning_rate": 9.413369037504552e-05, - "loss": 0.6586, - "step": 1870 - }, - { - "epoch": 1.02, - "grad_norm": 0.06132230535149574, - "learning_rate": 9.404623635408112e-05, - "loss": 0.6556, - "step": 1871 - }, - { - "epoch": 1.02, - "grad_norm": 0.07447368651628494, - "learning_rate": 9.395878690263837e-05, - "loss": 0.6087, - "step": 1872 - }, - { - "epoch": 1.03, - "grad_norm": 0.05781631916761398, - "learning_rate": 9.387134208783482e-05, - "loss": 0.5581, - "step": 1873 - }, - { - "epoch": 1.03, - "grad_norm": 0.06479276716709137, - "learning_rate": 9.378390197678449e-05, - "loss": 0.6272, - "step": 1874 - }, - { - "epoch": 1.03, - "grad_norm": 0.05904553085565567, - "learning_rate": 9.369646663659775e-05, - "loss": 0.534, - "step": 1875 - }, - { - "epoch": 1.03, - "grad_norm": 0.0636134073138237, - "learning_rate": 9.360903613438138e-05, - "loss": 0.6396, - "step": 1876 - }, - { - "epoch": 1.03, - "grad_norm": 0.06313899904489517, - "learning_rate": 9.352161053723838e-05, - "loss": 0.6119, - "step": 1877 - }, - { - "epoch": 1.03, - "grad_norm": 0.0698072761297226, - "learning_rate": 9.343418991226803e-05, - "loss": 0.6818, - "step": 1878 - }, - { - "epoch": 1.03, - "grad_norm": 0.05524126812815666, - "learning_rate": 9.33467743265658e-05, - "loss": 0.506, - "step": 1879 - }, - { - "epoch": 1.03, - "grad_norm": 0.06468210369348526, - "learning_rate": 9.325936384722321e-05, - "loss": 0.6361, - "step": 1880 - }, - { - "epoch": 1.03, - "grad_norm": 0.0695754885673523, - "learning_rate": 9.317195854132798e-05, - "loss": 0.6385, - "step": 1881 - }, - { - "epoch": 1.03, - "grad_norm": 0.06824535876512527, - "learning_rate": 9.308455847596378e-05, - "loss": 0.6619, - "step": 1882 - }, - { - "epoch": 1.03, - "grad_norm": 0.06896868348121643, - "learning_rate": 9.299716371821027e-05, - "loss": 0.6911, - "step": 1883 - }, - { - "epoch": 1.03, - "grad_norm": 0.05827944353222847, - "learning_rate": 9.290977433514306e-05, - "loss": 0.5489, - "step": 1884 - }, - { - "epoch": 1.03, - "grad_norm": 0.0637725293636322, - "learning_rate": 9.282239039383358e-05, - "loss": 0.7276, - "step": 1885 - }, - { - "epoch": 1.03, - "grad_norm": 0.05886625126004219, - "learning_rate": 9.273501196134915e-05, - "loss": 0.569, - "step": 1886 - }, - { - "epoch": 1.03, - "grad_norm": 
0.07639192044734955, - "learning_rate": 9.264763910475285e-05, - "loss": 0.7335, - "step": 1887 - }, - { - "epoch": 1.03, - "grad_norm": 0.05936943367123604, - "learning_rate": 9.256027189110346e-05, - "loss": 0.5261, - "step": 1888 - }, - { - "epoch": 1.03, - "grad_norm": 0.06819011270999908, - "learning_rate": 9.247291038745542e-05, - "loss": 0.5912, - "step": 1889 - }, - { - "epoch": 1.03, - "grad_norm": 0.05976708605885506, - "learning_rate": 9.23855546608588e-05, - "loss": 0.5962, - "step": 1890 - }, - { - "epoch": 1.04, - "grad_norm": 0.06304595619440079, - "learning_rate": 9.229820477835927e-05, - "loss": 0.6114, - "step": 1891 - }, - { - "epoch": 1.04, - "grad_norm": 0.06259685754776001, - "learning_rate": 9.221086080699793e-05, - "loss": 0.5958, - "step": 1892 - }, - { - "epoch": 1.04, - "grad_norm": 0.06620876491069794, - "learning_rate": 9.212352281381143e-05, - "loss": 0.6885, - "step": 1893 - }, - { - "epoch": 1.04, - "grad_norm": 0.058716122061014175, - "learning_rate": 9.203619086583179e-05, - "loss": 0.5823, - "step": 1894 - }, - { - "epoch": 1.04, - "grad_norm": 0.0609891451895237, - "learning_rate": 9.19488650300864e-05, - "loss": 0.5641, - "step": 1895 - }, - { - "epoch": 1.04, - "grad_norm": 0.06413526087999344, - "learning_rate": 9.186154537359796e-05, - "loss": 0.5867, - "step": 1896 - }, - { - "epoch": 1.04, - "grad_norm": 0.0639336034655571, - "learning_rate": 9.177423196338442e-05, - "loss": 0.6663, - "step": 1897 - }, - { - "epoch": 1.04, - "grad_norm": 0.06244528293609619, - "learning_rate": 9.168692486645893e-05, - "loss": 0.5846, - "step": 1898 - }, - { - "epoch": 1.04, - "grad_norm": 0.05837123841047287, - "learning_rate": 9.159962414982983e-05, - "loss": 0.5787, - "step": 1899 - }, - { - "epoch": 1.04, - "grad_norm": 0.0634046420454979, - "learning_rate": 9.151232988050051e-05, - "loss": 0.5736, - "step": 1900 - }, - { - "epoch": 1.04, - "grad_norm": 0.05949348211288452, - "learning_rate": 9.142504212546947e-05, - "loss": 0.5429, - "step": 1901 - }, - { - "epoch": 1.04, - "grad_norm": 0.05976530909538269, - "learning_rate": 9.133776095173015e-05, - "loss": 0.5851, - "step": 1902 - }, - { - "epoch": 1.04, - "grad_norm": 0.0690242350101471, - "learning_rate": 9.125048642627098e-05, - "loss": 0.6296, - "step": 1903 - }, - { - "epoch": 1.04, - "grad_norm": 0.06474726647138596, - "learning_rate": 9.116321861607524e-05, - "loss": 0.6606, - "step": 1904 - }, - { - "epoch": 1.04, - "grad_norm": 0.0658322274684906, - "learning_rate": 9.107595758812117e-05, - "loss": 0.5752, - "step": 1905 - }, - { - "epoch": 1.04, - "grad_norm": 0.059737034142017365, - "learning_rate": 9.098870340938168e-05, - "loss": 0.589, - "step": 1906 - }, - { - "epoch": 1.04, - "grad_norm": 0.05496500805020332, - "learning_rate": 9.090145614682444e-05, - "loss": 0.583, - "step": 1907 - }, - { - "epoch": 1.04, - "grad_norm": 0.0656789019703865, - "learning_rate": 9.081421586741189e-05, - "loss": 0.6171, - "step": 1908 - }, - { - "epoch": 1.05, - "grad_norm": 0.06451639533042908, - "learning_rate": 9.072698263810107e-05, - "loss": 0.6105, - "step": 1909 - }, - { - "epoch": 1.05, - "grad_norm": 0.061076030135154724, - "learning_rate": 9.063975652584358e-05, - "loss": 0.5201, - "step": 1910 - }, - { - "epoch": 1.05, - "grad_norm": 0.06417499482631683, - "learning_rate": 9.055253759758557e-05, - "loss": 0.567, - "step": 1911 - }, - { - "epoch": 1.05, - "grad_norm": 0.06392619013786316, - "learning_rate": 9.04653259202677e-05, - "loss": 0.6125, - "step": 1912 - }, - { - "epoch": 1.05, - "grad_norm": 
0.06343825906515121, - "learning_rate": 9.037812156082504e-05, - "loss": 0.6414, - "step": 1913 - }, - { - "epoch": 1.05, - "grad_norm": 0.06423365324735641, - "learning_rate": 9.029092458618705e-05, - "loss": 0.5424, - "step": 1914 - }, - { - "epoch": 1.05, - "grad_norm": 0.065882109105587, - "learning_rate": 9.020373506327754e-05, - "loss": 0.5651, - "step": 1915 - }, - { - "epoch": 1.05, - "grad_norm": 0.07035372406244278, - "learning_rate": 9.011655305901458e-05, - "loss": 0.7719, - "step": 1916 - }, - { - "epoch": 1.05, - "grad_norm": 0.0645388588309288, - "learning_rate": 9.002937864031047e-05, - "loss": 0.5106, - "step": 1917 - }, - { - "epoch": 1.05, - "grad_norm": 0.06212789565324783, - "learning_rate": 8.994221187407167e-05, - "loss": 0.5512, - "step": 1918 - }, - { - "epoch": 1.05, - "grad_norm": 0.06453739106655121, - "learning_rate": 8.985505282719885e-05, - "loss": 0.5985, - "step": 1919 - }, - { - "epoch": 1.05, - "grad_norm": 0.06423369795084, - "learning_rate": 8.976790156658665e-05, - "loss": 0.6009, - "step": 1920 - }, - { - "epoch": 1.05, - "grad_norm": 0.06537653505802155, - "learning_rate": 8.968075815912381e-05, - "loss": 0.5746, - "step": 1921 - }, - { - "epoch": 1.05, - "grad_norm": 0.06721478700637817, - "learning_rate": 8.9593622671693e-05, - "loss": 0.5686, - "step": 1922 - }, - { - "epoch": 1.05, - "grad_norm": 0.0701824277639389, - "learning_rate": 8.950649517117081e-05, - "loss": 0.7034, - "step": 1923 - }, - { - "epoch": 1.05, - "grad_norm": 0.058217499405145645, - "learning_rate": 8.941937572442773e-05, - "loss": 0.5781, - "step": 1924 - }, - { - "epoch": 1.05, - "grad_norm": 0.0669047012925148, - "learning_rate": 8.933226439832805e-05, - "loss": 0.6353, - "step": 1925 - }, - { - "epoch": 1.05, - "grad_norm": 0.06103359907865524, - "learning_rate": 8.924516125972984e-05, - "loss": 0.5691, - "step": 1926 - }, - { - "epoch": 1.06, - "grad_norm": 0.06625822931528091, - "learning_rate": 8.915806637548482e-05, - "loss": 0.6668, - "step": 1927 - }, - { - "epoch": 1.06, - "grad_norm": 0.06848111748695374, - "learning_rate": 8.907097981243851e-05, - "loss": 0.6594, - "step": 1928 - }, - { - "epoch": 1.06, - "grad_norm": 0.06156426668167114, - "learning_rate": 8.898390163742993e-05, - "loss": 0.6222, - "step": 1929 - }, - { - "epoch": 1.06, - "grad_norm": 0.06666103005409241, - "learning_rate": 8.88968319172917e-05, - "loss": 0.6358, - "step": 1930 - }, - { - "epoch": 1.06, - "grad_norm": 0.0592927522957325, - "learning_rate": 8.880977071884994e-05, - "loss": 0.5957, - "step": 1931 - }, - { - "epoch": 1.06, - "grad_norm": 0.07227488607168198, - "learning_rate": 8.872271810892425e-05, - "loss": 0.5982, - "step": 1932 - }, - { - "epoch": 1.06, - "grad_norm": 0.06739647686481476, - "learning_rate": 8.863567415432763e-05, - "loss": 0.5339, - "step": 1933 - }, - { - "epoch": 1.06, - "grad_norm": 0.06235653534531593, - "learning_rate": 8.85486389218664e-05, - "loss": 0.636, - "step": 1934 - }, - { - "epoch": 1.06, - "grad_norm": 0.06076808646321297, - "learning_rate": 8.846161247834024e-05, - "loss": 0.6074, - "step": 1935 - }, - { - "epoch": 1.06, - "grad_norm": 0.06882358342409134, - "learning_rate": 8.837459489054204e-05, - "loss": 0.5976, - "step": 1936 - }, - { - "epoch": 1.06, - "grad_norm": 0.058474499732255936, - "learning_rate": 8.828758622525797e-05, - "loss": 0.614, - "step": 1937 - }, - { - "epoch": 1.06, - "grad_norm": 0.06190056726336479, - "learning_rate": 8.820058654926725e-05, - "loss": 0.5597, - "step": 1938 - }, - { - "epoch": 1.06, - "grad_norm": 
0.05922474339604378, - "learning_rate": 8.811359592934228e-05, - "loss": 0.5483, - "step": 1939 - }, - { - "epoch": 1.06, - "grad_norm": 0.06522875279188156, - "learning_rate": 8.802661443224845e-05, - "loss": 0.7029, - "step": 1940 - }, - { - "epoch": 1.06, - "grad_norm": 0.06892760097980499, - "learning_rate": 8.793964212474419e-05, - "loss": 0.6374, - "step": 1941 - }, - { - "epoch": 1.06, - "grad_norm": 0.06366990506649017, - "learning_rate": 8.785267907358085e-05, - "loss": 0.6161, - "step": 1942 - }, - { - "epoch": 1.06, - "grad_norm": 0.06452450156211853, - "learning_rate": 8.776572534550272e-05, - "loss": 0.5453, - "step": 1943 - }, - { - "epoch": 1.06, - "grad_norm": 0.0678330734372139, - "learning_rate": 8.76787810072469e-05, - "loss": 0.6144, - "step": 1944 - }, - { - "epoch": 1.07, - "grad_norm": 0.06669484078884125, - "learning_rate": 8.759184612554321e-05, - "loss": 0.5738, - "step": 1945 - }, - { - "epoch": 1.07, - "grad_norm": 0.06842266023159027, - "learning_rate": 8.750492076711439e-05, - "loss": 0.6512, - "step": 1946 - }, - { - "epoch": 1.07, - "grad_norm": 0.06860072910785675, - "learning_rate": 8.741800499867573e-05, - "loss": 0.6068, - "step": 1947 - }, - { - "epoch": 1.07, - "grad_norm": 0.06875675916671753, - "learning_rate": 8.73310988869352e-05, - "loss": 0.5997, - "step": 1948 - }, - { - "epoch": 1.07, - "grad_norm": 0.06354532390832901, - "learning_rate": 8.724420249859335e-05, - "loss": 0.5348, - "step": 1949 - }, - { - "epoch": 1.07, - "grad_norm": 0.06942533701658249, - "learning_rate": 8.71573159003433e-05, - "loss": 0.6694, - "step": 1950 - }, - { - "epoch": 1.07, - "grad_norm": 0.06216884031891823, - "learning_rate": 8.70704391588706e-05, - "loss": 0.5807, - "step": 1951 - }, - { - "epoch": 1.07, - "grad_norm": 0.06261352449655533, - "learning_rate": 8.698357234085328e-05, - "loss": 0.5809, - "step": 1952 - }, - { - "epoch": 1.07, - "grad_norm": 0.06839074939489365, - "learning_rate": 8.689671551296175e-05, - "loss": 0.5921, - "step": 1953 - }, - { - "epoch": 1.07, - "grad_norm": 0.06489168107509613, - "learning_rate": 8.680986874185873e-05, - "loss": 0.5952, - "step": 1954 - }, - { - "epoch": 1.07, - "grad_norm": 0.05821004509925842, - "learning_rate": 8.67230320941992e-05, - "loss": 0.5973, - "step": 1955 - }, - { - "epoch": 1.07, - "grad_norm": 0.059854816645383835, - "learning_rate": 8.663620563663046e-05, - "loss": 0.5361, - "step": 1956 - }, - { - "epoch": 1.07, - "grad_norm": 0.05962301418185234, - "learning_rate": 8.654938943579194e-05, - "loss": 0.5073, - "step": 1957 - }, - { - "epoch": 1.07, - "grad_norm": 0.06811553239822388, - "learning_rate": 8.646258355831514e-05, - "loss": 0.58, - "step": 1958 - }, - { - "epoch": 1.07, - "grad_norm": 0.06248083338141441, - "learning_rate": 8.637578807082373e-05, - "loss": 0.6003, - "step": 1959 - }, - { - "epoch": 1.07, - "grad_norm": 0.06940522789955139, - "learning_rate": 8.628900303993335e-05, - "loss": 0.7196, - "step": 1960 - }, - { - "epoch": 1.07, - "grad_norm": 0.07490598410367966, - "learning_rate": 8.620222853225161e-05, - "loss": 0.591, - "step": 1961 - }, - { - "epoch": 1.07, - "grad_norm": 0.06339261680841446, - "learning_rate": 8.611546461437808e-05, - "loss": 0.5554, - "step": 1962 - }, - { - "epoch": 1.08, - "grad_norm": 0.06448446959257126, - "learning_rate": 8.602871135290418e-05, - "loss": 0.6441, - "step": 1963 - }, - { - "epoch": 1.08, - "grad_norm": 0.06204627826809883, - "learning_rate": 8.594196881441314e-05, - "loss": 0.6758, - "step": 1964 - }, - { - "epoch": 1.08, - "grad_norm": 
0.06171583756804466, - "learning_rate": 8.585523706548001e-05, - "loss": 0.6001, - "step": 1965 - }, - { - "epoch": 1.08, - "grad_norm": 0.07351014763116837, - "learning_rate": 8.57685161726715e-05, - "loss": 0.6279, - "step": 1966 - }, - { - "epoch": 1.08, - "grad_norm": 0.07400185614824295, - "learning_rate": 8.568180620254603e-05, - "loss": 0.7228, - "step": 1967 - }, - { - "epoch": 1.08, - "grad_norm": 0.07116332650184631, - "learning_rate": 8.55951072216536e-05, - "loss": 0.63, - "step": 1968 - }, - { - "epoch": 1.08, - "grad_norm": 0.06596235930919647, - "learning_rate": 8.550841929653579e-05, - "loss": 0.5578, - "step": 1969 - }, - { - "epoch": 1.08, - "grad_norm": 0.06503760814666748, - "learning_rate": 8.542174249372573e-05, - "loss": 0.5068, - "step": 1970 - }, - { - "epoch": 1.08, - "grad_norm": 0.06710238754749298, - "learning_rate": 8.533507687974795e-05, - "loss": 0.6624, - "step": 1971 - }, - { - "epoch": 1.08, - "grad_norm": 0.0639219582080841, - "learning_rate": 8.524842252111844e-05, - "loss": 0.6277, - "step": 1972 - }, - { - "epoch": 1.08, - "grad_norm": 0.06799468398094177, - "learning_rate": 8.516177948434452e-05, - "loss": 0.5988, - "step": 1973 - }, - { - "epoch": 1.08, - "grad_norm": 0.06464463472366333, - "learning_rate": 8.507514783592488e-05, - "loss": 0.4894, - "step": 1974 - }, - { - "epoch": 1.08, - "grad_norm": 0.0756271556019783, - "learning_rate": 8.498852764234939e-05, - "loss": 0.6645, - "step": 1975 - }, - { - "epoch": 1.08, - "grad_norm": 0.06705022603273392, - "learning_rate": 8.490191897009916e-05, - "loss": 0.543, - "step": 1976 - }, - { - "epoch": 1.08, - "grad_norm": 0.06154650077223778, - "learning_rate": 8.481532188564648e-05, - "loss": 0.5, - "step": 1977 - }, - { - "epoch": 1.08, - "grad_norm": 0.06594829261302948, - "learning_rate": 8.472873645545474e-05, - "loss": 0.5656, - "step": 1978 - }, - { - "epoch": 1.08, - "grad_norm": 0.07296450436115265, - "learning_rate": 8.464216274597838e-05, - "loss": 0.7295, - "step": 1979 - }, - { - "epoch": 1.08, - "grad_norm": 0.06797216832637787, - "learning_rate": 8.45556008236628e-05, - "loss": 0.5848, - "step": 1980 - }, - { - "epoch": 1.09, - "grad_norm": 0.06360683590173721, - "learning_rate": 8.446905075494443e-05, - "loss": 0.6403, - "step": 1981 - }, - { - "epoch": 1.09, - "grad_norm": 0.06966099143028259, - "learning_rate": 8.438251260625056e-05, - "loss": 0.6367, - "step": 1982 - }, - { - "epoch": 1.09, - "grad_norm": 0.06762167811393738, - "learning_rate": 8.42959864439993e-05, - "loss": 0.6355, - "step": 1983 - }, - { - "epoch": 1.09, - "grad_norm": 0.07219623774290085, - "learning_rate": 8.420947233459962e-05, - "loss": 0.6437, - "step": 1984 - }, - { - "epoch": 1.09, - "grad_norm": 0.0665636733174324, - "learning_rate": 8.412297034445123e-05, - "loss": 0.6083, - "step": 1985 - }, - { - "epoch": 1.09, - "grad_norm": 0.06295057386159897, - "learning_rate": 8.403648053994447e-05, - "loss": 0.5336, - "step": 1986 - }, - { - "epoch": 1.09, - "grad_norm": 0.05705847963690758, - "learning_rate": 8.395000298746045e-05, - "loss": 0.5142, - "step": 1987 - }, - { - "epoch": 1.09, - "grad_norm": 0.0647684782743454, - "learning_rate": 8.38635377533708e-05, - "loss": 0.5619, - "step": 1988 - }, - { - "epoch": 1.09, - "grad_norm": 0.07010988146066666, - "learning_rate": 8.377708490403767e-05, - "loss": 0.642, - "step": 1989 - }, - { - "epoch": 1.09, - "grad_norm": 0.06674306839704514, - "learning_rate": 8.369064450581373e-05, - "loss": 0.6161, - "step": 1990 - }, - { - "epoch": 1.09, - "grad_norm": 
0.0655483826994896, - "learning_rate": 8.360421662504214e-05, - "loss": 0.6301, - "step": 1991 - }, - { - "epoch": 1.09, - "grad_norm": 0.07575639337301254, - "learning_rate": 8.35178013280564e-05, - "loss": 0.5834, - "step": 1992 - }, - { - "epoch": 1.09, - "grad_norm": 0.07528828829526901, - "learning_rate": 8.343139868118036e-05, - "loss": 0.7007, - "step": 1993 - }, - { - "epoch": 1.09, - "grad_norm": 0.06885155290365219, - "learning_rate": 8.334500875072818e-05, - "loss": 0.5922, - "step": 1994 - }, - { - "epoch": 1.09, - "grad_norm": 0.0729137510061264, - "learning_rate": 8.325863160300423e-05, - "loss": 0.5965, - "step": 1995 - }, - { - "epoch": 1.09, - "grad_norm": 0.06822966039180756, - "learning_rate": 8.317226730430309e-05, - "loss": 0.5858, - "step": 1996 - }, - { - "epoch": 1.09, - "grad_norm": 0.06264076381921768, - "learning_rate": 8.30859159209095e-05, - "loss": 0.5204, - "step": 1997 - }, - { - "epoch": 1.09, - "grad_norm": 0.0732724666595459, - "learning_rate": 8.299957751909825e-05, - "loss": 0.6712, - "step": 1998 - }, - { - "epoch": 1.1, - "grad_norm": 0.07096925377845764, - "learning_rate": 8.291325216513419e-05, - "loss": 0.6521, - "step": 1999 - }, - { - "epoch": 1.1, - "grad_norm": 0.06218922510743141, - "learning_rate": 8.282693992527213e-05, - "loss": 0.574, - "step": 2000 - }, - { - "epoch": 1.1, - "grad_norm": 0.07510274648666382, - "learning_rate": 8.274064086575681e-05, - "loss": 0.6368, - "step": 2001 - }, - { - "epoch": 1.1, - "grad_norm": 0.06338323652744293, - "learning_rate": 8.265435505282293e-05, - "loss": 0.5627, - "step": 2002 - }, - { - "epoch": 1.1, - "grad_norm": 0.06242498382925987, - "learning_rate": 8.256808255269492e-05, - "loss": 0.6282, - "step": 2003 - }, - { - "epoch": 1.1, - "grad_norm": 0.07350806891918182, - "learning_rate": 8.248182343158706e-05, - "loss": 0.627, - "step": 2004 - }, - { - "epoch": 1.1, - "grad_norm": 0.07304222136735916, - "learning_rate": 8.23955777557033e-05, - "loss": 0.668, - "step": 2005 - }, - { - "epoch": 1.1, - "grad_norm": 0.06777157634496689, - "learning_rate": 8.230934559123739e-05, - "loss": 0.5669, - "step": 2006 - }, - { - "epoch": 1.1, - "grad_norm": 0.06221771612763405, - "learning_rate": 8.22231270043726e-05, - "loss": 0.5628, - "step": 2007 - }, - { - "epoch": 1.1, - "grad_norm": 0.06386017054319382, - "learning_rate": 8.213692206128178e-05, - "loss": 0.576, - "step": 2008 - }, - { - "epoch": 1.1, - "grad_norm": 0.06762038916349411, - "learning_rate": 8.205073082812737e-05, - "loss": 0.536, - "step": 2009 - }, - { - "epoch": 1.1, - "grad_norm": 0.06616533547639847, - "learning_rate": 8.196455337106127e-05, - "loss": 0.5657, - "step": 2010 - }, - { - "epoch": 1.1, - "grad_norm": 0.06463898718357086, - "learning_rate": 8.187838975622478e-05, - "loss": 0.5349, - "step": 2011 - }, - { - "epoch": 1.1, - "grad_norm": 0.06426985561847687, - "learning_rate": 8.179224004974857e-05, - "loss": 0.5844, - "step": 2012 - }, - { - "epoch": 1.1, - "grad_norm": 0.07058148086071014, - "learning_rate": 8.170610431775267e-05, - "loss": 0.6695, - "step": 2013 - }, - { - "epoch": 1.1, - "grad_norm": 0.07236622273921967, - "learning_rate": 8.161998262634637e-05, - "loss": 0.6522, - "step": 2014 - }, - { - "epoch": 1.1, - "grad_norm": 0.06184220314025879, - "learning_rate": 8.15338750416282e-05, - "loss": 0.5503, - "step": 2015 - }, - { - "epoch": 1.1, - "grad_norm": 0.07017958909273148, - "learning_rate": 8.144778162968584e-05, - "loss": 0.6168, - "step": 2016 - }, - { - "epoch": 1.11, - "grad_norm": 0.0607057586312294, - 
"learning_rate": 8.136170245659609e-05, - "loss": 0.5648, - "step": 2017 - }, - { - "epoch": 1.11, - "grad_norm": 0.06027553975582123, - "learning_rate": 8.127563758842483e-05, - "loss": 0.5837, - "step": 2018 - }, - { - "epoch": 1.11, - "grad_norm": 0.06839162111282349, - "learning_rate": 8.118958709122698e-05, - "loss": 0.7024, - "step": 2019 - }, - { - "epoch": 1.11, - "grad_norm": 0.0698401927947998, - "learning_rate": 8.110355103104641e-05, - "loss": 0.629, - "step": 2020 - }, - { - "epoch": 1.11, - "grad_norm": 0.06260319799184799, - "learning_rate": 8.101752947391588e-05, - "loss": 0.5965, - "step": 2021 - }, - { - "epoch": 1.11, - "grad_norm": 0.06395679712295532, - "learning_rate": 8.09315224858571e-05, - "loss": 0.526, - "step": 2022 - }, - { - "epoch": 1.11, - "grad_norm": 0.06215297058224678, - "learning_rate": 8.084553013288048e-05, - "loss": 0.5134, - "step": 2023 - }, - { - "epoch": 1.11, - "grad_norm": 0.06511794030666351, - "learning_rate": 8.075955248098536e-05, - "loss": 0.5383, - "step": 2024 - }, - { - "epoch": 1.11, - "grad_norm": 0.06743456423282623, - "learning_rate": 8.067358959615963e-05, - "loss": 0.6318, - "step": 2025 - }, - { - "epoch": 1.11, - "grad_norm": 0.07108740508556366, - "learning_rate": 8.058764154437996e-05, - "loss": 0.6597, - "step": 2026 - }, - { - "epoch": 1.11, - "grad_norm": 0.06761852651834488, - "learning_rate": 8.050170839161157e-05, - "loss": 0.6172, - "step": 2027 - }, - { - "epoch": 1.11, - "grad_norm": 0.06814343482255936, - "learning_rate": 8.04157902038083e-05, - "loss": 0.5837, - "step": 2028 - }, - { - "epoch": 1.11, - "grad_norm": 0.06850943714380264, - "learning_rate": 8.032988704691243e-05, - "loss": 0.5799, - "step": 2029 - }, - { - "epoch": 1.11, - "grad_norm": 0.06714814901351929, - "learning_rate": 8.02439989868548e-05, - "loss": 0.5786, - "step": 2030 - }, - { - "epoch": 1.11, - "grad_norm": 0.06138110160827637, - "learning_rate": 8.015812608955457e-05, - "loss": 0.5518, - "step": 2031 - }, - { - "epoch": 1.11, - "grad_norm": 0.06873062998056412, - "learning_rate": 8.007226842091931e-05, - "loss": 0.5336, - "step": 2032 - }, - { - "epoch": 1.11, - "grad_norm": 0.06783683598041534, - "learning_rate": 7.998642604684491e-05, - "loss": 0.61, - "step": 2033 - }, - { - "epoch": 1.11, - "grad_norm": 0.0697760209441185, - "learning_rate": 7.990059903321553e-05, - "loss": 0.603, - "step": 2034 - }, - { - "epoch": 1.12, - "grad_norm": 0.062311042100191116, - "learning_rate": 7.981478744590348e-05, - "loss": 0.5093, - "step": 2035 - }, - { - "epoch": 1.12, - "grad_norm": 0.07325945049524307, - "learning_rate": 7.972899135076929e-05, - "loss": 0.6595, - "step": 2036 - }, - { - "epoch": 1.12, - "grad_norm": 0.06448288261890411, - "learning_rate": 7.964321081366157e-05, - "loss": 0.5403, - "step": 2037 - }, - { - "epoch": 1.12, - "grad_norm": 0.06438078731298447, - "learning_rate": 7.955744590041701e-05, - "loss": 0.57, - "step": 2038 - }, - { - "epoch": 1.12, - "grad_norm": 0.06576252728700638, - "learning_rate": 7.947169667686027e-05, - "loss": 0.6023, - "step": 2039 - }, - { - "epoch": 1.12, - "grad_norm": 0.07431968301534653, - "learning_rate": 7.938596320880402e-05, - "loss": 0.6057, - "step": 2040 - }, - { - "epoch": 1.12, - "grad_norm": 0.07968160510063171, - "learning_rate": 7.93002455620488e-05, - "loss": 0.6573, - "step": 2041 - }, - { - "epoch": 1.12, - "grad_norm": 0.06240157037973404, - "learning_rate": 7.9214543802383e-05, - "loss": 0.5845, - "step": 2042 - }, - { - "epoch": 1.12, - "grad_norm": 0.06589362770318985, - 
"learning_rate": 7.912885799558288e-05, - "loss": 0.5057, - "step": 2043 - }, - { - "epoch": 1.12, - "grad_norm": 0.06928303092718124, - "learning_rate": 7.904318820741239e-05, - "loss": 0.5991, - "step": 2044 - }, - { - "epoch": 1.12, - "grad_norm": 0.06346700340509415, - "learning_rate": 7.89575345036232e-05, - "loss": 0.5788, - "step": 2045 - }, - { - "epoch": 1.12, - "grad_norm": 0.06340914219617844, - "learning_rate": 7.887189694995466e-05, - "loss": 0.544, - "step": 2046 - }, - { - "epoch": 1.12, - "grad_norm": 0.06660696119070053, - "learning_rate": 7.878627561213369e-05, - "loss": 0.6379, - "step": 2047 - }, - { - "epoch": 1.12, - "grad_norm": 0.0683973878622055, - "learning_rate": 7.870067055587481e-05, - "loss": 0.5923, - "step": 2048 - }, - { - "epoch": 1.12, - "grad_norm": 0.06100190058350563, - "learning_rate": 7.861508184688e-05, - "loss": 0.6059, - "step": 2049 - }, - { - "epoch": 1.12, - "grad_norm": 0.059445932507514954, - "learning_rate": 7.85295095508387e-05, - "loss": 0.5476, - "step": 2050 - }, - { - "epoch": 1.12, - "grad_norm": 0.07229001820087433, - "learning_rate": 7.844395373342779e-05, - "loss": 0.6259, - "step": 2051 - }, - { - "epoch": 1.12, - "grad_norm": 0.06739497929811478, - "learning_rate": 7.835841446031143e-05, - "loss": 0.5855, - "step": 2052 - }, - { - "epoch": 1.13, - "grad_norm": 0.06135958433151245, - "learning_rate": 7.827289179714118e-05, - "loss": 0.5265, - "step": 2053 - }, - { - "epoch": 1.13, - "grad_norm": 0.07160507887601852, - "learning_rate": 7.818738580955576e-05, - "loss": 0.6323, - "step": 2054 - }, - { - "epoch": 1.13, - "grad_norm": 0.060510244220495224, - "learning_rate": 7.810189656318112e-05, - "loss": 0.4678, - "step": 2055 - }, - { - "epoch": 1.13, - "grad_norm": 0.06444878876209259, - "learning_rate": 7.801642412363041e-05, - "loss": 0.6195, - "step": 2056 - }, - { - "epoch": 1.13, - "grad_norm": 0.06845416873693466, - "learning_rate": 7.793096855650385e-05, - "loss": 0.6249, - "step": 2057 - }, - { - "epoch": 1.13, - "grad_norm": 0.06821136176586151, - "learning_rate": 7.784552992738867e-05, - "loss": 0.6733, - "step": 2058 - }, - { - "epoch": 1.13, - "grad_norm": 0.07368767261505127, - "learning_rate": 7.776010830185914e-05, - "loss": 0.5954, - "step": 2059 - }, - { - "epoch": 1.13, - "grad_norm": 0.06696832925081253, - "learning_rate": 7.767470374547647e-05, - "loss": 0.661, - "step": 2060 - }, - { - "epoch": 1.13, - "grad_norm": 0.07673228532075882, - "learning_rate": 7.75893163237888e-05, - "loss": 0.6957, - "step": 2061 - }, - { - "epoch": 1.13, - "grad_norm": 0.06611277163028717, - "learning_rate": 7.750394610233106e-05, - "loss": 0.5931, - "step": 2062 - }, - { - "epoch": 1.13, - "grad_norm": 0.06206559017300606, - "learning_rate": 7.741859314662502e-05, - "loss": 0.5773, - "step": 2063 - }, - { - "epoch": 1.13, - "grad_norm": 0.06988830119371414, - "learning_rate": 7.733325752217917e-05, - "loss": 0.6255, - "step": 2064 - }, - { - "epoch": 1.13, - "grad_norm": 0.07555034756660461, - "learning_rate": 7.724793929448875e-05, - "loss": 0.6309, - "step": 2065 - }, - { - "epoch": 1.13, - "grad_norm": 0.06582862883806229, - "learning_rate": 7.716263852903561e-05, - "loss": 0.5482, - "step": 2066 - }, - { - "epoch": 1.13, - "grad_norm": 0.06785890460014343, - "learning_rate": 7.707735529128819e-05, - "loss": 0.562, - "step": 2067 - }, - { - "epoch": 1.13, - "grad_norm": 0.062424518167972565, - "learning_rate": 7.699208964670149e-05, - "loss": 0.5801, - "step": 2068 - }, - { - "epoch": 1.13, - "grad_norm": 0.07258402556180954, - 
"learning_rate": 7.690684166071702e-05, - "loss": 0.6196, - "step": 2069 - }, - { - "epoch": 1.13, - "grad_norm": 0.06637036055326462, - "learning_rate": 7.68216113987627e-05, - "loss": 0.5658, - "step": 2070 - }, - { - "epoch": 1.14, - "grad_norm": 0.06919743120670319, - "learning_rate": 7.673639892625289e-05, - "loss": 0.5619, - "step": 2071 - }, - { - "epoch": 1.14, - "grad_norm": 0.06857898831367493, - "learning_rate": 7.665120430858829e-05, - "loss": 0.7035, - "step": 2072 - }, - { - "epoch": 1.14, - "grad_norm": 0.0656585618853569, - "learning_rate": 7.656602761115583e-05, - "loss": 0.6331, - "step": 2073 - }, - { - "epoch": 1.14, - "grad_norm": 0.0701858177781105, - "learning_rate": 7.648086889932879e-05, - "loss": 0.6627, - "step": 2074 - }, - { - "epoch": 1.14, - "grad_norm": 0.06948113441467285, - "learning_rate": 7.63957282384666e-05, - "loss": 0.6194, - "step": 2075 - }, - { - "epoch": 1.14, - "grad_norm": 0.07097858935594559, - "learning_rate": 7.631060569391482e-05, - "loss": 0.6377, - "step": 2076 - }, - { - "epoch": 1.14, - "grad_norm": 0.0637185126543045, - "learning_rate": 7.62255013310051e-05, - "loss": 0.5771, - "step": 2077 - }, - { - "epoch": 1.14, - "grad_norm": 0.0651545375585556, - "learning_rate": 7.614041521505517e-05, - "loss": 0.6193, - "step": 2078 - }, - { - "epoch": 1.14, - "grad_norm": 0.07133258879184723, - "learning_rate": 7.605534741136873e-05, - "loss": 0.5815, - "step": 2079 - }, - { - "epoch": 1.14, - "grad_norm": 0.0651950016617775, - "learning_rate": 7.597029798523545e-05, - "loss": 0.5649, - "step": 2080 - }, - { - "epoch": 1.14, - "grad_norm": 0.06561518460512161, - "learning_rate": 7.588526700193086e-05, - "loss": 0.5826, - "step": 2081 - }, - { - "epoch": 1.14, - "grad_norm": 0.06808426976203918, - "learning_rate": 7.580025452671637e-05, - "loss": 0.6098, - "step": 2082 - }, - { - "epoch": 1.14, - "grad_norm": 0.0729101300239563, - "learning_rate": 7.571526062483912e-05, - "loss": 0.6483, - "step": 2083 - }, - { - "epoch": 1.14, - "grad_norm": 0.07908526808023453, - "learning_rate": 7.563028536153213e-05, - "loss": 0.6949, - "step": 2084 - }, - { - "epoch": 1.14, - "grad_norm": 0.06607040762901306, - "learning_rate": 7.554532880201399e-05, - "loss": 0.5709, - "step": 2085 - }, - { - "epoch": 1.14, - "grad_norm": 0.06513672322034836, - "learning_rate": 7.546039101148895e-05, - "loss": 0.5823, - "step": 2086 - }, - { - "epoch": 1.14, - "grad_norm": 0.0633048489689827, - "learning_rate": 7.53754720551469e-05, - "loss": 0.554, - "step": 2087 - }, - { - "epoch": 1.14, - "grad_norm": 0.07791243493556976, - "learning_rate": 7.529057199816326e-05, - "loss": 0.6958, - "step": 2088 - }, - { - "epoch": 1.15, - "grad_norm": 0.07528932392597198, - "learning_rate": 7.520569090569893e-05, - "loss": 0.643, - "step": 2089 - }, - { - "epoch": 1.15, - "grad_norm": 0.06492157280445099, - "learning_rate": 7.512082884290026e-05, - "loss": 0.575, - "step": 2090 - }, - { - "epoch": 1.15, - "grad_norm": 0.07389669865369797, - "learning_rate": 7.5035985874899e-05, - "loss": 0.6689, - "step": 2091 - }, - { - "epoch": 1.15, - "grad_norm": 0.0760781541466713, - "learning_rate": 7.495116206681224e-05, - "loss": 0.6634, - "step": 2092 - }, - { - "epoch": 1.15, - "grad_norm": 0.06599968671798706, - "learning_rate": 7.486635748374237e-05, - "loss": 0.5395, - "step": 2093 - }, - { - "epoch": 1.15, - "grad_norm": 0.07587437331676483, - "learning_rate": 7.478157219077703e-05, - "loss": 0.6457, - "step": 2094 - }, - { - "epoch": 1.15, - "grad_norm": 0.0665985643863678, - 
"learning_rate": 7.469680625298903e-05, - "loss": 0.5487, - "step": 2095 - }, - { - "epoch": 1.15, - "grad_norm": 0.06659691780805588, - "learning_rate": 7.461205973543636e-05, - "loss": 0.4887, - "step": 2096 - }, - { - "epoch": 1.15, - "grad_norm": 0.06467236578464508, - "learning_rate": 7.452733270316209e-05, - "loss": 0.5376, - "step": 2097 - }, - { - "epoch": 1.15, - "grad_norm": 0.07357700169086456, - "learning_rate": 7.444262522119428e-05, - "loss": 0.6996, - "step": 2098 - }, - { - "epoch": 1.15, - "grad_norm": 0.06766042858362198, - "learning_rate": 7.435793735454611e-05, - "loss": 0.5223, - "step": 2099 - }, - { - "epoch": 1.15, - "grad_norm": 0.07042094320058823, - "learning_rate": 7.427326916821557e-05, - "loss": 0.5802, - "step": 2100 - }, - { - "epoch": 1.15, - "grad_norm": 0.0585549958050251, - "learning_rate": 7.418862072718562e-05, - "loss": 0.5306, - "step": 2101 - }, - { - "epoch": 1.15, - "grad_norm": 0.06087832525372505, - "learning_rate": 7.410399209642409e-05, - "loss": 0.5706, - "step": 2102 - }, - { - "epoch": 1.15, - "grad_norm": 0.06485297530889511, - "learning_rate": 7.401938334088356e-05, - "loss": 0.5307, - "step": 2103 - }, - { - "epoch": 1.15, - "grad_norm": 0.07619531452655792, - "learning_rate": 7.393479452550133e-05, - "loss": 0.5977, - "step": 2104 - }, - { - "epoch": 1.15, - "grad_norm": 0.06514798104763031, - "learning_rate": 7.385022571519947e-05, - "loss": 0.5945, - "step": 2105 - }, - { - "epoch": 1.15, - "grad_norm": 0.06858575344085693, - "learning_rate": 7.376567697488461e-05, - "loss": 0.6253, - "step": 2106 - }, - { - "epoch": 1.16, - "grad_norm": 0.06849953532218933, - "learning_rate": 7.368114836944805e-05, - "loss": 0.5536, - "step": 2107 - }, - { - "epoch": 1.16, - "grad_norm": 0.0654776319861412, - "learning_rate": 7.35966399637656e-05, - "loss": 0.5465, - "step": 2108 - }, - { - "epoch": 1.16, - "grad_norm": 0.06670403480529785, - "learning_rate": 7.35121518226976e-05, - "loss": 0.5944, - "step": 2109 - }, - { - "epoch": 1.16, - "grad_norm": 0.06398824602365494, - "learning_rate": 7.342768401108876e-05, - "loss": 0.5662, - "step": 2110 - }, - { - "epoch": 1.16, - "grad_norm": 0.06844029575586319, - "learning_rate": 7.334323659376829e-05, - "loss": 0.5791, - "step": 2111 - }, - { - "epoch": 1.16, - "grad_norm": 0.06193400174379349, - "learning_rate": 7.325880963554969e-05, - "loss": 0.4949, - "step": 2112 - }, - { - "epoch": 1.16, - "grad_norm": 0.061342112720012665, - "learning_rate": 7.317440320123075e-05, - "loss": 0.5043, - "step": 2113 - }, - { - "epoch": 1.16, - "grad_norm": 0.06053828075528145, - "learning_rate": 7.309001735559349e-05, - "loss": 0.5127, - "step": 2114 - }, - { - "epoch": 1.16, - "grad_norm": 0.07439390569925308, - "learning_rate": 7.300565216340422e-05, - "loss": 0.5662, - "step": 2115 - }, - { - "epoch": 1.16, - "grad_norm": 0.07639770209789276, - "learning_rate": 7.292130768941332e-05, - "loss": 0.5874, - "step": 2116 - }, - { - "epoch": 1.16, - "grad_norm": 0.06395827233791351, - "learning_rate": 7.283698399835528e-05, - "loss": 0.5762, - "step": 2117 - }, - { - "epoch": 1.16, - "grad_norm": 0.07043325901031494, - "learning_rate": 7.275268115494866e-05, - "loss": 0.6473, - "step": 2118 - }, - { - "epoch": 1.16, - "grad_norm": 0.07612812519073486, - "learning_rate": 7.266839922389598e-05, - "loss": 0.6374, - "step": 2119 - }, - { - "epoch": 1.16, - "grad_norm": 0.06510752439498901, - "learning_rate": 7.258413826988376e-05, - "loss": 0.5649, - "step": 2120 - }, - { - "epoch": 1.16, - "grad_norm": 0.071219302713871, 
- "learning_rate": 7.249989835758239e-05, - "loss": 0.6823, - "step": 2121 - }, - { - "epoch": 1.16, - "grad_norm": 0.07150042057037354, - "learning_rate": 7.24156795516461e-05, - "loss": 0.6678, - "step": 2122 - }, - { - "epoch": 1.16, - "grad_norm": 0.07282208651304245, - "learning_rate": 7.233148191671293e-05, - "loss": 0.6312, - "step": 2123 - }, - { - "epoch": 1.16, - "grad_norm": 0.06840363144874573, - "learning_rate": 7.224730551740472e-05, - "loss": 0.5773, - "step": 2124 - }, - { - "epoch": 1.17, - "grad_norm": 0.07560744881629944, - "learning_rate": 7.216315041832696e-05, - "loss": 0.5611, - "step": 2125 - }, - { - "epoch": 1.17, - "grad_norm": 0.0657825618982315, - "learning_rate": 7.207901668406878e-05, - "loss": 0.5698, - "step": 2126 - }, - { - "epoch": 1.17, - "grad_norm": 0.07382172346115112, - "learning_rate": 7.199490437920294e-05, - "loss": 0.6428, - "step": 2127 - }, - { - "epoch": 1.17, - "grad_norm": 0.07186535745859146, - "learning_rate": 7.191081356828575e-05, - "loss": 0.6556, - "step": 2128 - }, - { - "epoch": 1.17, - "grad_norm": 0.0643572062253952, - "learning_rate": 7.182674431585704e-05, - "loss": 0.5022, - "step": 2129 - }, - { - "epoch": 1.17, - "grad_norm": 0.06504008919000626, - "learning_rate": 7.174269668644004e-05, - "loss": 0.5544, - "step": 2130 - }, - { - "epoch": 1.17, - "grad_norm": 0.06553123146295547, - "learning_rate": 7.165867074454145e-05, - "loss": 0.5749, - "step": 2131 - }, - { - "epoch": 1.17, - "grad_norm": 0.07009954005479813, - "learning_rate": 7.157466655465125e-05, - "loss": 0.5924, - "step": 2132 - }, - { - "epoch": 1.17, - "grad_norm": 0.058126140385866165, - "learning_rate": 7.149068418124281e-05, - "loss": 0.505, - "step": 2133 - }, - { - "epoch": 1.17, - "grad_norm": 0.06979527324438095, - "learning_rate": 7.14067236887727e-05, - "loss": 0.5521, - "step": 2134 - }, - { - "epoch": 1.17, - "grad_norm": 0.07132134586572647, - "learning_rate": 7.132278514168073e-05, - "loss": 0.6489, - "step": 2135 - }, - { - "epoch": 1.17, - "grad_norm": 0.07470317929983139, - "learning_rate": 7.123886860438984e-05, - "loss": 0.6272, - "step": 2136 - }, - { - "epoch": 1.17, - "grad_norm": 0.0786786749958992, - "learning_rate": 7.115497414130606e-05, - "loss": 0.6775, - "step": 2137 - }, - { - "epoch": 1.17, - "grad_norm": 0.07232896238565445, - "learning_rate": 7.107110181681852e-05, - "loss": 0.6154, - "step": 2138 - }, - { - "epoch": 1.17, - "grad_norm": 0.06931086629629135, - "learning_rate": 7.098725169529936e-05, - "loss": 0.6516, - "step": 2139 - }, - { - "epoch": 1.17, - "grad_norm": 0.06935492902994156, - "learning_rate": 7.090342384110365e-05, - "loss": 0.6855, - "step": 2140 - }, - { - "epoch": 1.17, - "grad_norm": 0.08043521642684937, - "learning_rate": 7.081961831856936e-05, - "loss": 0.6556, - "step": 2141 - }, - { - "epoch": 1.17, - "grad_norm": 0.06161235272884369, - "learning_rate": 7.073583519201734e-05, - "loss": 0.5758, - "step": 2142 - }, - { - "epoch": 1.18, - "grad_norm": 0.09204084426164627, - "learning_rate": 7.06520745257513e-05, - "loss": 0.6648, - "step": 2143 - }, - { - "epoch": 1.18, - "grad_norm": 0.06960479170084, - "learning_rate": 7.056833638405762e-05, - "loss": 0.6028, - "step": 2144 - }, - { - "epoch": 1.18, - "grad_norm": 0.07587411999702454, - "learning_rate": 7.048462083120547e-05, - "loss": 0.702, - "step": 2145 - }, - { - "epoch": 1.18, - "grad_norm": 0.07557187974452972, - "learning_rate": 7.04009279314466e-05, - "loss": 0.6113, - "step": 2146 - }, - { - "epoch": 1.18, - "grad_norm": 0.06679632514715195, - 
"learning_rate": 7.031725774901547e-05, - "loss": 0.6169, - "step": 2147 - }, - { - "epoch": 1.18, - "grad_norm": 0.07555227726697922, - "learning_rate": 7.023361034812906e-05, - "loss": 0.6971, - "step": 2148 - }, - { - "epoch": 1.18, - "grad_norm": 0.06371418386697769, - "learning_rate": 7.014998579298683e-05, - "loss": 0.4516, - "step": 2149 - }, - { - "epoch": 1.18, - "grad_norm": 0.06644103676080704, - "learning_rate": 7.006638414777075e-05, - "loss": 0.5879, - "step": 2150 - }, - { - "epoch": 1.18, - "grad_norm": 0.07008033245801926, - "learning_rate": 6.99828054766452e-05, - "loss": 0.5955, - "step": 2151 - }, - { - "epoch": 1.18, - "grad_norm": 0.07703608274459839, - "learning_rate": 6.989924984375692e-05, - "loss": 0.6454, - "step": 2152 - }, - { - "epoch": 1.18, - "grad_norm": 0.07317256182432175, - "learning_rate": 6.981571731323497e-05, - "loss": 0.673, - "step": 2153 - }, - { - "epoch": 1.18, - "grad_norm": 0.06549296528100967, - "learning_rate": 6.97322079491907e-05, - "loss": 0.5843, - "step": 2154 - }, - { - "epoch": 1.18, - "grad_norm": 0.06609703600406647, - "learning_rate": 6.964872181571764e-05, - "loss": 0.5097, - "step": 2155 - }, - { - "epoch": 1.18, - "grad_norm": 0.06859977543354034, - "learning_rate": 6.956525897689152e-05, - "loss": 0.5882, - "step": 2156 - }, - { - "epoch": 1.18, - "grad_norm": 0.07044606655836105, - "learning_rate": 6.948181949677015e-05, - "loss": 0.6284, - "step": 2157 - }, - { - "epoch": 1.18, - "grad_norm": 0.06426123529672623, - "learning_rate": 6.93984034393935e-05, - "loss": 0.5967, - "step": 2158 - }, - { - "epoch": 1.18, - "grad_norm": 0.06794054806232452, - "learning_rate": 6.931501086878345e-05, - "loss": 0.6094, - "step": 2159 - }, - { - "epoch": 1.18, - "grad_norm": 0.06806287169456482, - "learning_rate": 6.923164184894391e-05, - "loss": 0.6135, - "step": 2160 - }, - { - "epoch": 1.19, - "grad_norm": 0.07279844582080841, - "learning_rate": 6.914829644386077e-05, - "loss": 0.6009, - "step": 2161 - }, - { - "epoch": 1.19, - "grad_norm": 0.07700279355049133, - "learning_rate": 6.906497471750171e-05, - "loss": 0.6697, - "step": 2162 - }, - { - "epoch": 1.19, - "grad_norm": 0.0661427304148674, - "learning_rate": 6.898167673381627e-05, - "loss": 0.5603, - "step": 2163 - }, - { - "epoch": 1.19, - "grad_norm": 0.08361556380987167, - "learning_rate": 6.889840255673577e-05, - "loss": 0.7131, - "step": 2164 - }, - { - "epoch": 1.19, - "grad_norm": 0.07225959002971649, - "learning_rate": 6.881515225017323e-05, - "loss": 0.6105, - "step": 2165 - }, - { - "epoch": 1.19, - "grad_norm": 0.06336675584316254, - "learning_rate": 6.87319258780234e-05, - "loss": 0.5211, - "step": 2166 - }, - { - "epoch": 1.19, - "grad_norm": 0.06372500211000443, - "learning_rate": 6.86487235041626e-05, - "loss": 0.5295, - "step": 2167 - }, - { - "epoch": 1.19, - "grad_norm": 0.07312193512916565, - "learning_rate": 6.85655451924488e-05, - "loss": 0.625, - "step": 2168 - }, - { - "epoch": 1.19, - "grad_norm": 0.06828183680772781, - "learning_rate": 6.848239100672145e-05, - "loss": 0.6262, - "step": 2169 - }, - { - "epoch": 1.19, - "grad_norm": 0.06808409094810486, - "learning_rate": 6.839926101080148e-05, - "loss": 0.6716, - "step": 2170 - }, - { - "epoch": 1.19, - "grad_norm": 0.06251493841409683, - "learning_rate": 6.831615526849133e-05, - "loss": 0.5642, - "step": 2171 - }, - { - "epoch": 1.19, - "grad_norm": 0.06784352660179138, - "learning_rate": 6.823307384357471e-05, - "loss": 0.5771, - "step": 2172 - }, - { - "epoch": 1.19, - "grad_norm": 0.07426799833774567, - 
"learning_rate": 6.815001679981678e-05, - "loss": 0.5582, - "step": 2173 - }, - { - "epoch": 1.19, - "grad_norm": 0.06320761144161224, - "learning_rate": 6.806698420096387e-05, - "loss": 0.6032, - "step": 2174 - }, - { - "epoch": 1.19, - "grad_norm": 0.06084798276424408, - "learning_rate": 6.798397611074367e-05, - "loss": 0.5677, - "step": 2175 - }, - { - "epoch": 1.19, - "grad_norm": 0.07241813093423843, - "learning_rate": 6.790099259286497e-05, - "loss": 0.6071, - "step": 2176 - }, - { - "epoch": 1.19, - "grad_norm": 0.0648108497262001, - "learning_rate": 6.781803371101774e-05, - "loss": 0.5046, - "step": 2177 - }, - { - "epoch": 1.19, - "grad_norm": 0.0848846361041069, - "learning_rate": 6.773509952887302e-05, - "loss": 0.6551, - "step": 2178 - }, - { - "epoch": 1.2, - "grad_norm": 0.06519858539104462, - "learning_rate": 6.76521901100829e-05, - "loss": 0.5732, - "step": 2179 - }, - { - "epoch": 1.2, - "grad_norm": 0.06119070202112198, - "learning_rate": 6.756930551828052e-05, - "loss": 0.5153, - "step": 2180 - }, - { - "epoch": 1.2, - "grad_norm": 0.0737023800611496, - "learning_rate": 6.748644581707988e-05, - "loss": 0.6208, - "step": 2181 - }, - { - "epoch": 1.2, - "grad_norm": 0.07560352236032486, - "learning_rate": 6.74036110700759e-05, - "loss": 0.6223, - "step": 2182 - }, - { - "epoch": 1.2, - "grad_norm": 0.07263612747192383, - "learning_rate": 6.732080134084435e-05, - "loss": 0.6347, - "step": 2183 - }, - { - "epoch": 1.2, - "grad_norm": 0.06753487884998322, - "learning_rate": 6.723801669294189e-05, - "loss": 0.5513, - "step": 2184 - }, - { - "epoch": 1.2, - "grad_norm": 0.07577301561832428, - "learning_rate": 6.715525718990578e-05, - "loss": 0.7817, - "step": 2185 - }, - { - "epoch": 1.2, - "grad_norm": 0.06963416188955307, - "learning_rate": 6.707252289525407e-05, - "loss": 0.5903, - "step": 2186 - }, - { - "epoch": 1.2, - "grad_norm": 0.07357131689786911, - "learning_rate": 6.698981387248544e-05, - "loss": 0.6524, - "step": 2187 - }, - { - "epoch": 1.2, - "grad_norm": 0.0715678483247757, - "learning_rate": 6.690713018507918e-05, - "loss": 0.6235, - "step": 2188 - }, - { - "epoch": 1.2, - "grad_norm": 0.07252723723649979, - "learning_rate": 6.68244718964951e-05, - "loss": 0.537, - "step": 2189 - }, - { - "epoch": 1.2, - "grad_norm": 0.06699419021606445, - "learning_rate": 6.67418390701736e-05, - "loss": 0.6012, - "step": 2190 - }, - { - "epoch": 1.2, - "grad_norm": 0.07284466177225113, - "learning_rate": 6.665923176953546e-05, - "loss": 0.6235, - "step": 2191 - }, - { - "epoch": 1.2, - "grad_norm": 0.06402984261512756, - "learning_rate": 6.657665005798186e-05, - "loss": 0.5386, - "step": 2192 - }, - { - "epoch": 1.2, - "grad_norm": 0.07105006277561188, - "learning_rate": 6.649409399889443e-05, - "loss": 0.5789, - "step": 2193 - }, - { - "epoch": 1.2, - "grad_norm": 0.07561016082763672, - "learning_rate": 6.641156365563504e-05, - "loss": 0.5687, - "step": 2194 - }, - { - "epoch": 1.2, - "grad_norm": 0.06907906383275986, - "learning_rate": 6.632905909154583e-05, - "loss": 0.6156, - "step": 2195 - }, - { - "epoch": 1.2, - "grad_norm": 0.06653522700071335, - "learning_rate": 6.624658036994918e-05, - "loss": 0.6062, - "step": 2196 - }, - { - "epoch": 1.21, - "grad_norm": 0.0678929015994072, - "learning_rate": 6.616412755414761e-05, - "loss": 0.6009, - "step": 2197 - }, - { - "epoch": 1.21, - "grad_norm": 0.062168996781110764, - "learning_rate": 6.608170070742377e-05, - "loss": 0.576, - "step": 2198 - }, - { - "epoch": 1.21, - "grad_norm": 0.06343018263578415, - "learning_rate": 
6.599929989304035e-05, - "loss": 0.5534, - "step": 2199 - }, - { - "epoch": 1.21, - "grad_norm": 0.06593502312898636, - "learning_rate": 6.591692517424013e-05, - "loss": 0.5599, - "step": 2200 - }, - { - "epoch": 1.21, - "grad_norm": 0.0697464793920517, - "learning_rate": 6.583457661424576e-05, - "loss": 0.6572, - "step": 2201 - }, - { - "epoch": 1.21, - "grad_norm": 0.06897808611392975, - "learning_rate": 6.575225427625995e-05, - "loss": 0.6817, - "step": 2202 - }, - { - "epoch": 1.21, - "grad_norm": 0.07382930815219879, - "learning_rate": 6.566995822346516e-05, - "loss": 0.62, - "step": 2203 - }, - { - "epoch": 1.21, - "grad_norm": 0.06902208924293518, - "learning_rate": 6.55876885190237e-05, - "loss": 0.578, - "step": 2204 - }, - { - "epoch": 1.21, - "grad_norm": 0.06347808241844177, - "learning_rate": 6.550544522607773e-05, - "loss": 0.5382, - "step": 2205 - }, - { - "epoch": 1.21, - "grad_norm": 0.07611478865146637, - "learning_rate": 6.542322840774905e-05, - "loss": 0.657, - "step": 2206 - }, - { - "epoch": 1.21, - "grad_norm": 0.06822540611028671, - "learning_rate": 6.534103812713919e-05, - "loss": 0.5694, - "step": 2207 - }, - { - "epoch": 1.21, - "grad_norm": 0.08152306824922562, - "learning_rate": 6.525887444732932e-05, - "loss": 0.5875, - "step": 2208 - }, - { - "epoch": 1.21, - "grad_norm": 0.06699424982070923, - "learning_rate": 6.517673743138015e-05, - "loss": 0.55, - "step": 2209 - }, - { - "epoch": 1.21, - "grad_norm": 0.06906050443649292, - "learning_rate": 6.509462714233195e-05, - "loss": 0.5825, - "step": 2210 - }, - { - "epoch": 1.21, - "grad_norm": 0.06700971722602844, - "learning_rate": 6.501254364320446e-05, - "loss": 0.579, - "step": 2211 - }, - { - "epoch": 1.21, - "grad_norm": 0.07808460295200348, - "learning_rate": 6.493048699699693e-05, - "loss": 0.6068, - "step": 2212 - }, - { - "epoch": 1.21, - "grad_norm": 0.0653437152504921, - "learning_rate": 6.48484572666879e-05, - "loss": 0.512, - "step": 2213 - }, - { - "epoch": 1.21, - "grad_norm": 0.06242841109633446, - "learning_rate": 6.476645451523535e-05, - "loss": 0.4806, - "step": 2214 - }, - { - "epoch": 1.22, - "grad_norm": 0.06796973198652267, - "learning_rate": 6.468447880557644e-05, - "loss": 0.6215, - "step": 2215 - }, - { - "epoch": 1.22, - "grad_norm": 0.07036803662776947, - "learning_rate": 6.460253020062768e-05, - "loss": 0.6199, - "step": 2216 - }, - { - "epoch": 1.22, - "grad_norm": 0.06817260384559631, - "learning_rate": 6.452060876328474e-05, - "loss": 0.5752, - "step": 2217 - }, - { - "epoch": 1.22, - "grad_norm": 0.07045266032218933, - "learning_rate": 6.443871455642238e-05, - "loss": 0.583, - "step": 2218 - }, - { - "epoch": 1.22, - "grad_norm": 0.06571897864341736, - "learning_rate": 6.435684764289457e-05, - "loss": 0.5529, - "step": 2219 - }, - { - "epoch": 1.22, - "grad_norm": 0.06711366027593613, - "learning_rate": 6.427500808553424e-05, - "loss": 0.574, - "step": 2220 - }, - { - "epoch": 1.22, - "grad_norm": 0.07405182719230652, - "learning_rate": 6.419319594715339e-05, - "loss": 0.5532, - "step": 2221 - }, - { - "epoch": 1.22, - "grad_norm": 0.07331159710884094, - "learning_rate": 6.411141129054292e-05, - "loss": 0.5487, - "step": 2222 - }, - { - "epoch": 1.22, - "grad_norm": 0.07121601700782776, - "learning_rate": 6.402965417847268e-05, - "loss": 0.5828, - "step": 2223 - }, - { - "epoch": 1.22, - "grad_norm": 0.0719437301158905, - "learning_rate": 6.394792467369138e-05, - "loss": 0.6311, - "step": 2224 - }, - { - "epoch": 1.22, - "grad_norm": 0.06829216331243515, - "learning_rate": 
6.38662228389265e-05, - "loss": 0.6195, - "step": 2225 - }, - { - "epoch": 1.22, - "grad_norm": 0.06692846864461899, - "learning_rate": 6.378454873688431e-05, - "loss": 0.5517, - "step": 2226 - }, - { - "epoch": 1.22, - "grad_norm": 0.07758577913045883, - "learning_rate": 6.370290243024978e-05, - "loss": 0.6157, - "step": 2227 - }, - { - "epoch": 1.22, - "grad_norm": 0.07376914471387863, - "learning_rate": 6.36212839816866e-05, - "loss": 0.6016, - "step": 2228 - }, - { - "epoch": 1.22, - "grad_norm": 0.06754311174154282, - "learning_rate": 6.353969345383701e-05, - "loss": 0.6271, - "step": 2229 - }, - { - "epoch": 1.22, - "grad_norm": 0.07988160848617554, - "learning_rate": 6.345813090932186e-05, - "loss": 0.5742, - "step": 2230 - }, - { - "epoch": 1.22, - "grad_norm": 0.06557673960924149, - "learning_rate": 6.337659641074052e-05, - "loss": 0.5015, - "step": 2231 - }, - { - "epoch": 1.22, - "grad_norm": 0.0661676675081253, - "learning_rate": 6.32950900206708e-05, - "loss": 0.5408, - "step": 2232 - }, - { - "epoch": 1.23, - "grad_norm": 0.06480776518583298, - "learning_rate": 6.3213611801669e-05, - "loss": 0.4707, - "step": 2233 - }, - { - "epoch": 1.23, - "grad_norm": 0.0753173679113388, - "learning_rate": 6.313216181626974e-05, - "loss": 0.611, - "step": 2234 - }, - { - "epoch": 1.23, - "grad_norm": 0.0757727399468422, - "learning_rate": 6.3050740126986e-05, - "loss": 0.5423, - "step": 2235 - }, - { - "epoch": 1.23, - "grad_norm": 0.07315197587013245, - "learning_rate": 6.296934679630904e-05, - "loss": 0.5457, - "step": 2236 - }, - { - "epoch": 1.23, - "grad_norm": 0.08855360001325607, - "learning_rate": 6.288798188670833e-05, - "loss": 0.7, - "step": 2237 - }, - { - "epoch": 1.23, - "grad_norm": 0.06280341744422913, - "learning_rate": 6.280664546063157e-05, - "loss": 0.4838, - "step": 2238 - }, - { - "epoch": 1.23, - "grad_norm": 0.06725641340017319, - "learning_rate": 6.272533758050457e-05, - "loss": 0.5553, - "step": 2239 - }, - { - "epoch": 1.23, - "grad_norm": 0.07229411602020264, - "learning_rate": 6.264405830873125e-05, - "loss": 0.6519, - "step": 2240 - }, - { - "epoch": 1.23, - "grad_norm": 0.06996262073516846, - "learning_rate": 6.256280770769354e-05, - "loss": 0.6112, - "step": 2241 - }, - { - "epoch": 1.23, - "grad_norm": 0.0687815472483635, - "learning_rate": 6.248158583975141e-05, - "loss": 0.5465, - "step": 2242 - }, - { - "epoch": 1.23, - "grad_norm": 0.07132161408662796, - "learning_rate": 6.240039276724272e-05, - "loss": 0.6383, - "step": 2243 - }, - { - "epoch": 1.23, - "grad_norm": 0.07156707346439362, - "learning_rate": 6.231922855248329e-05, - "loss": 0.5916, - "step": 2244 - }, - { - "epoch": 1.23, - "grad_norm": 0.06640041619539261, - "learning_rate": 6.223809325776677e-05, - "loss": 0.5626, - "step": 2245 - }, - { - "epoch": 1.23, - "grad_norm": 0.06485997885465622, - "learning_rate": 6.215698694536456e-05, - "loss": 0.4876, - "step": 2246 - }, - { - "epoch": 1.23, - "grad_norm": 0.07079490274190903, - "learning_rate": 6.20759096775259e-05, - "loss": 0.6112, - "step": 2247 - }, - { - "epoch": 1.23, - "grad_norm": 0.06052457168698311, - "learning_rate": 6.199486151647773e-05, - "loss": 0.6048, - "step": 2248 - }, - { - "epoch": 1.23, - "grad_norm": 0.06677879393100739, - "learning_rate": 6.191384252442458e-05, - "loss": 0.6207, - "step": 2249 - }, - { - "epoch": 1.23, - "grad_norm": 0.06265201419591904, - "learning_rate": 6.183285276354865e-05, - "loss": 0.5206, - "step": 2250 - }, - { - "epoch": 1.24, - "grad_norm": 0.07499994337558746, - "learning_rate": 
6.175189229600969e-05, - "loss": 0.6053, - "step": 2251 - }, - { - "epoch": 1.24, - "grad_norm": 0.07424361258745193, - "learning_rate": 6.16709611839449e-05, - "loss": 0.6065, - "step": 2252 - }, - { - "epoch": 1.24, - "grad_norm": 0.07379718124866486, - "learning_rate": 6.159005948946916e-05, - "loss": 0.5934, - "step": 2253 - }, - { - "epoch": 1.24, - "grad_norm": 0.07687895745038986, - "learning_rate": 6.150918727467455e-05, - "loss": 0.6532, - "step": 2254 - }, - { - "epoch": 1.24, - "grad_norm": 0.07283301651477814, - "learning_rate": 6.14283446016306e-05, - "loss": 0.5334, - "step": 2255 - }, - { - "epoch": 1.24, - "grad_norm": 0.0724155604839325, - "learning_rate": 6.134753153238418e-05, - "loss": 0.5745, - "step": 2256 - }, - { - "epoch": 1.24, - "grad_norm": 0.07714327424764633, - "learning_rate": 6.126674812895944e-05, - "loss": 0.6511, - "step": 2257 - }, - { - "epoch": 1.24, - "grad_norm": 0.0738411694765091, - "learning_rate": 6.118599445335773e-05, - "loss": 0.5277, - "step": 2258 - }, - { - "epoch": 1.24, - "grad_norm": 0.0703483298420906, - "learning_rate": 6.110527056755762e-05, - "loss": 0.5723, - "step": 2259 - }, - { - "epoch": 1.24, - "grad_norm": 0.070271797478199, - "learning_rate": 6.102457653351479e-05, - "loss": 0.6019, - "step": 2260 - }, - { - "epoch": 1.24, - "grad_norm": 0.08007260411977768, - "learning_rate": 6.0943912413161994e-05, - "loss": 0.4893, - "step": 2261 - }, - { - "epoch": 1.24, - "grad_norm": 0.06550659239292145, - "learning_rate": 6.086327826840912e-05, - "loss": 0.5876, - "step": 2262 - }, - { - "epoch": 1.24, - "grad_norm": 0.08582957834005356, - "learning_rate": 6.078267416114295e-05, - "loss": 0.7156, - "step": 2263 - }, - { - "epoch": 1.24, - "grad_norm": 0.07212957739830017, - "learning_rate": 6.070210015322724e-05, - "loss": 0.5369, - "step": 2264 - }, - { - "epoch": 1.24, - "grad_norm": 0.0679006278514862, - "learning_rate": 6.062155630650265e-05, - "loss": 0.5725, - "step": 2265 - }, - { - "epoch": 1.24, - "grad_norm": 0.08907914161682129, - "learning_rate": 6.054104268278669e-05, - "loss": 0.6409, - "step": 2266 - }, - { - "epoch": 1.24, - "grad_norm": 0.07050459831953049, - "learning_rate": 6.046055934387368e-05, - "loss": 0.5286, - "step": 2267 - }, - { - "epoch": 1.24, - "grad_norm": 0.07432929426431656, - "learning_rate": 6.038010635153469e-05, - "loss": 0.6366, - "step": 2268 - }, - { - "epoch": 1.25, - "grad_norm": 0.07176590710878372, - "learning_rate": 6.02996837675175e-05, - "loss": 0.6173, - "step": 2269 - }, - { - "epoch": 1.25, - "grad_norm": 0.07120334357023239, - "learning_rate": 6.0219291653546536e-05, - "loss": 0.5726, - "step": 2270 - }, - { - "epoch": 1.25, - "grad_norm": 0.07870449125766754, - "learning_rate": 6.0138930071322874e-05, - "loss": 0.5968, - "step": 2271 - }, - { - "epoch": 1.25, - "grad_norm": 0.06898409128189087, - "learning_rate": 6.005859908252415e-05, - "loss": 0.6193, - "step": 2272 - }, - { - "epoch": 1.25, - "grad_norm": 0.060963038355112076, - "learning_rate": 5.997829874880447e-05, - "loss": 0.5096, - "step": 2273 - }, - { - "epoch": 1.25, - "grad_norm": 0.0753115639090538, - "learning_rate": 5.9898029131794485e-05, - "loss": 0.5989, - "step": 2274 - }, - { - "epoch": 1.25, - "grad_norm": 0.06675612926483154, - "learning_rate": 5.9817790293101204e-05, - "loss": 0.5225, - "step": 2275 - }, - { - "epoch": 1.25, - "grad_norm": 0.07885386049747467, - "learning_rate": 5.973758229430806e-05, - "loss": 0.5717, - "step": 2276 - }, - { - "epoch": 1.25, - "grad_norm": 0.06998579949140549, - 
"learning_rate": 5.965740519697479e-05, - "loss": 0.5932, - "step": 2277 - }, - { - "epoch": 1.25, - "grad_norm": 0.07550673931837082, - "learning_rate": 5.957725906263743e-05, - "loss": 0.7, - "step": 2278 - }, - { - "epoch": 1.25, - "grad_norm": 0.06955604255199432, - "learning_rate": 5.9497143952808234e-05, - "loss": 0.5823, - "step": 2279 - }, - { - "epoch": 1.25, - "grad_norm": 0.07412168383598328, - "learning_rate": 5.9417059928975686e-05, - "loss": 0.5776, - "step": 2280 - }, - { - "epoch": 1.25, - "grad_norm": 0.07235770672559738, - "learning_rate": 5.933700705260437e-05, - "loss": 0.5945, - "step": 2281 - }, - { - "epoch": 1.25, - "grad_norm": 0.08059383928775787, - "learning_rate": 5.9256985385134955e-05, - "loss": 0.6109, - "step": 2282 - }, - { - "epoch": 1.25, - "grad_norm": 0.06218232214450836, - "learning_rate": 5.917699498798421e-05, - "loss": 0.5188, - "step": 2283 - }, - { - "epoch": 1.25, - "grad_norm": 0.07667962461709976, - "learning_rate": 5.909703592254485e-05, - "loss": 0.6085, - "step": 2284 - }, - { - "epoch": 1.25, - "grad_norm": 0.07095900923013687, - "learning_rate": 5.9017108250185584e-05, - "loss": 0.5872, - "step": 2285 - }, - { - "epoch": 1.25, - "grad_norm": 0.07091958820819855, - "learning_rate": 5.8937212032251e-05, - "loss": 0.6289, - "step": 2286 - }, - { - "epoch": 1.26, - "grad_norm": 0.06871485710144043, - "learning_rate": 5.885734733006154e-05, - "loss": 0.5561, - "step": 2287 - }, - { - "epoch": 1.26, - "grad_norm": 0.06925121694803238, - "learning_rate": 5.877751420491346e-05, - "loss": 0.6291, - "step": 2288 - }, - { - "epoch": 1.26, - "grad_norm": 0.07990697771310806, - "learning_rate": 5.869771271807885e-05, - "loss": 0.636, - "step": 2289 - }, - { - "epoch": 1.26, - "grad_norm": 0.07092578709125519, - "learning_rate": 5.86179429308054e-05, - "loss": 0.615, - "step": 2290 - }, - { - "epoch": 1.26, - "grad_norm": 0.07381773740053177, - "learning_rate": 5.8538204904316565e-05, - "loss": 0.6482, - "step": 2291 - }, - { - "epoch": 1.26, - "grad_norm": 0.0735030248761177, - "learning_rate": 5.845849869981137e-05, - "loss": 0.5469, - "step": 2292 - }, - { - "epoch": 1.26, - "grad_norm": 0.07174662500619888, - "learning_rate": 5.8378824378464434e-05, - "loss": 0.6268, - "step": 2293 - }, - { - "epoch": 1.26, - "grad_norm": 0.06630636006593704, - "learning_rate": 5.82991820014259e-05, - "loss": 0.6448, - "step": 2294 - }, - { - "epoch": 1.26, - "grad_norm": 0.07417547702789307, - "learning_rate": 5.821957162982143e-05, - "loss": 0.6635, - "step": 2295 - }, - { - "epoch": 1.26, - "grad_norm": 0.07115903496742249, - "learning_rate": 5.813999332475206e-05, - "loss": 0.525, - "step": 2296 - }, - { - "epoch": 1.26, - "grad_norm": 0.07035580277442932, - "learning_rate": 5.8060447147294285e-05, - "loss": 0.6121, - "step": 2297 - }, - { - "epoch": 1.26, - "grad_norm": 0.06738772243261337, - "learning_rate": 5.798093315849984e-05, - "loss": 0.5151, - "step": 2298 - }, - { - "epoch": 1.26, - "grad_norm": 0.06736040860414505, - "learning_rate": 5.790145141939588e-05, - "loss": 0.5185, - "step": 2299 - }, - { - "epoch": 1.26, - "grad_norm": 0.07977905124425888, - "learning_rate": 5.7822001990984685e-05, - "loss": 0.6448, - "step": 2300 - }, - { - "epoch": 1.26, - "grad_norm": 0.06976289302110672, - "learning_rate": 5.7742584934243894e-05, - "loss": 0.597, - "step": 2301 - }, - { - "epoch": 1.26, - "grad_norm": 0.07432825863361359, - "learning_rate": 5.7663200310126084e-05, - "loss": 0.5596, - "step": 2302 - }, - { - "epoch": 1.26, - "grad_norm": 
0.06685048341751099, - "learning_rate": 5.7583848179559154e-05, - "loss": 0.617, - "step": 2303 - }, - { - "epoch": 1.26, - "grad_norm": 0.07461295276880264, - "learning_rate": 5.750452860344595e-05, - "loss": 0.5618, - "step": 2304 - }, - { - "epoch": 1.27, - "grad_norm": 0.0821809321641922, - "learning_rate": 5.742524164266432e-05, - "loss": 0.746, - "step": 2305 - }, - { - "epoch": 1.27, - "grad_norm": 0.07233615964651108, - "learning_rate": 5.734598735806718e-05, - "loss": 0.6114, - "step": 2306 - }, - { - "epoch": 1.27, - "grad_norm": 0.07483091950416565, - "learning_rate": 5.7266765810482205e-05, - "loss": 0.5878, - "step": 2307 - }, - { - "epoch": 1.27, - "grad_norm": 0.06762590259313583, - "learning_rate": 5.718757706071214e-05, - "loss": 0.5829, - "step": 2308 - }, - { - "epoch": 1.27, - "grad_norm": 0.0718020498752594, - "learning_rate": 5.710842116953438e-05, - "loss": 0.5813, - "step": 2309 - }, - { - "epoch": 1.27, - "grad_norm": 0.06753186881542206, - "learning_rate": 5.702929819770123e-05, - "loss": 0.5491, - "step": 2310 - }, - { - "epoch": 1.27, - "grad_norm": 0.07231669872999191, - "learning_rate": 5.6950208205939626e-05, - "loss": 0.6531, - "step": 2311 - }, - { - "epoch": 1.27, - "grad_norm": 0.07114548236131668, - "learning_rate": 5.687115125495127e-05, - "loss": 0.6119, - "step": 2312 - }, - { - "epoch": 1.27, - "grad_norm": 0.08065725862979889, - "learning_rate": 5.679212740541253e-05, - "loss": 0.6962, - "step": 2313 - }, - { - "epoch": 1.27, - "grad_norm": 0.067630834877491, - "learning_rate": 5.6713136717974226e-05, - "loss": 0.5528, - "step": 2314 - }, - { - "epoch": 1.27, - "grad_norm": 0.0677843689918518, - "learning_rate": 5.6634179253261885e-05, - "loss": 0.6157, - "step": 2315 - }, - { - "epoch": 1.27, - "grad_norm": 0.07071098685264587, - "learning_rate": 5.6555255071875415e-05, - "loss": 0.5523, - "step": 2316 - }, - { - "epoch": 1.27, - "grad_norm": 0.07934273034334183, - "learning_rate": 5.647636423438929e-05, - "loss": 0.5646, - "step": 2317 - }, - { - "epoch": 1.27, - "grad_norm": 0.08034515380859375, - "learning_rate": 5.639750680135227e-05, - "loss": 0.6589, - "step": 2318 - }, - { - "epoch": 1.27, - "grad_norm": 0.07613985240459442, - "learning_rate": 5.6318682833287626e-05, - "loss": 0.5973, - "step": 2319 - }, - { - "epoch": 1.27, - "grad_norm": 0.07653201371431351, - "learning_rate": 5.623989239069275e-05, - "loss": 0.6002, - "step": 2320 - }, - { - "epoch": 1.27, - "grad_norm": 0.0726369246840477, - "learning_rate": 5.6161135534039476e-05, - "loss": 0.665, - "step": 2321 - }, - { - "epoch": 1.27, - "grad_norm": 0.06761615723371506, - "learning_rate": 5.60824123237738e-05, - "loss": 0.5183, - "step": 2322 - }, - { - "epoch": 1.28, - "grad_norm": 0.0639297217130661, - "learning_rate": 5.6003722820315916e-05, - "loss": 0.5161, - "step": 2323 - }, - { - "epoch": 1.28, - "grad_norm": 0.06320559233427048, - "learning_rate": 5.592506708406006e-05, - "loss": 0.5844, - "step": 2324 - }, - { - "epoch": 1.28, - "grad_norm": 0.07683044672012329, - "learning_rate": 5.584644517537464e-05, - "loss": 0.6143, - "step": 2325 - }, - { - "epoch": 1.28, - "grad_norm": 0.06600732356309891, - "learning_rate": 5.5767857154602135e-05, - "loss": 0.5548, - "step": 2326 - }, - { - "epoch": 1.28, - "grad_norm": 0.07150949537754059, - "learning_rate": 5.568930308205886e-05, - "loss": 0.562, - "step": 2327 - }, - { - "epoch": 1.28, - "grad_norm": 0.06904012709856033, - "learning_rate": 5.561078301803526e-05, - "loss": 0.5105, - "step": 2328 - }, - { - "epoch": 1.28, - 
"grad_norm": 0.0722079873085022, - "learning_rate": 5.553229702279551e-05, - "loss": 0.6219, - "step": 2329 - }, - { - "epoch": 1.28, - "grad_norm": 0.07431759685277939, - "learning_rate": 5.545384515657779e-05, - "loss": 0.6837, - "step": 2330 - }, - { - "epoch": 1.28, - "grad_norm": 0.07866154611110687, - "learning_rate": 5.537542747959394e-05, - "loss": 0.5907, - "step": 2331 - }, - { - "epoch": 1.28, - "grad_norm": 0.07409841567277908, - "learning_rate": 5.529704405202972e-05, - "loss": 0.5776, - "step": 2332 - }, - { - "epoch": 1.28, - "grad_norm": 0.07749959826469421, - "learning_rate": 5.521869493404444e-05, - "loss": 0.5411, - "step": 2333 - }, - { - "epoch": 1.28, - "grad_norm": 0.07008834928274155, - "learning_rate": 5.5140380185771154e-05, - "loss": 0.5368, - "step": 2334 - }, - { - "epoch": 1.28, - "grad_norm": 0.06928420811891556, - "learning_rate": 5.506209986731662e-05, - "loss": 0.5241, - "step": 2335 - }, - { - "epoch": 1.28, - "grad_norm": 0.07040610909461975, - "learning_rate": 5.4983854038760995e-05, - "loss": 0.6249, - "step": 2336 - }, - { - "epoch": 1.28, - "grad_norm": 0.06945261359214783, - "learning_rate": 5.4905642760158124e-05, - "loss": 0.6035, - "step": 2337 - }, - { - "epoch": 1.28, - "grad_norm": 0.06809067726135254, - "learning_rate": 5.4827466091535215e-05, - "loss": 0.6002, - "step": 2338 - }, - { - "epoch": 1.28, - "grad_norm": 0.07596398890018463, - "learning_rate": 5.474932409289302e-05, - "loss": 0.5523, - "step": 2339 - }, - { - "epoch": 1.28, - "grad_norm": 0.07113833725452423, - "learning_rate": 5.4671216824205575e-05, - "loss": 0.5829, - "step": 2340 - }, - { - "epoch": 1.29, - "grad_norm": 0.07487820088863373, - "learning_rate": 5.4593144345420366e-05, - "loss": 0.659, - "step": 2341 - }, - { - "epoch": 1.29, - "grad_norm": 0.07420715689659119, - "learning_rate": 5.451510671645807e-05, - "loss": 0.54, - "step": 2342 - }, - { - "epoch": 1.29, - "grad_norm": 0.07427990436553955, - "learning_rate": 5.443710399721269e-05, - "loss": 0.6448, - "step": 2343 - }, - { - "epoch": 1.29, - "grad_norm": 0.06675884127616882, - "learning_rate": 5.435913624755148e-05, - "loss": 0.4935, - "step": 2344 - }, - { - "epoch": 1.29, - "grad_norm": 0.07301631569862366, - "learning_rate": 5.4281203527314696e-05, - "loss": 0.6389, - "step": 2345 - }, - { - "epoch": 1.29, - "grad_norm": 0.06832318007946014, - "learning_rate": 5.42033058963159e-05, - "loss": 0.5719, - "step": 2346 - }, - { - "epoch": 1.29, - "grad_norm": 0.08214893192052841, - "learning_rate": 5.4125443414341534e-05, - "loss": 0.6581, - "step": 2347 - }, - { - "epoch": 1.29, - "grad_norm": 0.07999354600906372, - "learning_rate": 5.4047616141151255e-05, - "loss": 0.6648, - "step": 2348 - }, - { - "epoch": 1.29, - "grad_norm": 0.06501716375350952, - "learning_rate": 5.39698241364775e-05, - "loss": 0.5674, - "step": 2349 - }, - { - "epoch": 1.29, - "grad_norm": 0.06726286560297012, - "learning_rate": 5.389206746002584e-05, - "loss": 0.5106, - "step": 2350 - }, - { - "epoch": 1.29, - "grad_norm": 0.08208262920379639, - "learning_rate": 5.381434617147452e-05, - "loss": 0.5974, - "step": 2351 - }, - { - "epoch": 1.29, - "grad_norm": 0.07486006617546082, - "learning_rate": 5.37366603304748e-05, - "loss": 0.6769, - "step": 2352 - }, - { - "epoch": 1.29, - "grad_norm": 0.075009286403656, - "learning_rate": 5.36590099966507e-05, - "loss": 0.6224, - "step": 2353 - }, - { - "epoch": 1.29, - "grad_norm": 0.06720884144306183, - "learning_rate": 5.3581395229598887e-05, - "loss": 0.5017, - "step": 2354 - }, - { - "epoch": 
1.29, - "grad_norm": 0.0757581815123558, - "learning_rate": 5.350381608888885e-05, - "loss": 0.6254, - "step": 2355 - }, - { - "epoch": 1.29, - "grad_norm": 0.06871607899665833, - "learning_rate": 5.3426272634062624e-05, - "loss": 0.5213, - "step": 2356 - }, - { - "epoch": 1.29, - "grad_norm": 0.07220682501792908, - "learning_rate": 5.3348764924634983e-05, - "loss": 0.6204, - "step": 2357 - }, - { - "epoch": 1.29, - "grad_norm": 0.09039363265037537, - "learning_rate": 5.3271293020093146e-05, - "loss": 0.6442, - "step": 2358 - }, - { - "epoch": 1.3, - "grad_norm": 0.07300581783056259, - "learning_rate": 5.319385697989696e-05, - "loss": 0.6176, - "step": 2359 - }, - { - "epoch": 1.3, - "grad_norm": 0.060775429010391235, - "learning_rate": 5.311645686347861e-05, - "loss": 0.482, - "step": 2360 - }, - { - "epoch": 1.3, - "grad_norm": 0.06812864542007446, - "learning_rate": 5.3039092730242875e-05, - "loss": 0.5296, - "step": 2361 - }, - { - "epoch": 1.3, - "grad_norm": 0.07054027915000916, - "learning_rate": 5.296176463956677e-05, - "loss": 0.5841, - "step": 2362 - }, - { - "epoch": 1.3, - "grad_norm": 0.07183098793029785, - "learning_rate": 5.288447265079972e-05, - "loss": 0.5124, - "step": 2363 - }, - { - "epoch": 1.3, - "grad_norm": 0.09338720142841339, - "learning_rate": 5.2807216823263484e-05, - "loss": 0.5999, - "step": 2364 - }, - { - "epoch": 1.3, - "grad_norm": 0.07272778451442719, - "learning_rate": 5.2729997216251926e-05, - "loss": 0.6407, - "step": 2365 - }, - { - "epoch": 1.3, - "grad_norm": 0.06850509345531464, - "learning_rate": 5.2652813889031294e-05, - "loss": 0.5894, - "step": 2366 - }, - { - "epoch": 1.3, - "grad_norm": 0.0760633572936058, - "learning_rate": 5.257566690083979e-05, - "loss": 0.6144, - "step": 2367 - }, - { - "epoch": 1.3, - "grad_norm": 0.07636122405529022, - "learning_rate": 5.249855631088794e-05, - "loss": 0.5995, - "step": 2368 - }, - { - "epoch": 1.3, - "grad_norm": 0.06955023854970932, - "learning_rate": 5.2421482178358125e-05, - "loss": 0.6299, - "step": 2369 - }, - { - "epoch": 1.3, - "grad_norm": 0.07433290779590607, - "learning_rate": 5.234444456240495e-05, - "loss": 0.6417, - "step": 2370 - }, - { - "epoch": 1.3, - "grad_norm": 0.06652615964412689, - "learning_rate": 5.226744352215478e-05, - "loss": 0.5181, - "step": 2371 - }, - { - "epoch": 1.3, - "grad_norm": 0.06942848861217499, - "learning_rate": 5.2190479116706073e-05, - "loss": 0.5557, - "step": 2372 - }, - { - "epoch": 1.3, - "grad_norm": 0.06768972426652908, - "learning_rate": 5.2113551405129145e-05, - "loss": 0.5716, - "step": 2373 - }, - { - "epoch": 1.3, - "grad_norm": 0.06542663276195526, - "learning_rate": 5.2036660446466045e-05, - "loss": 0.5762, - "step": 2374 - }, - { - "epoch": 1.3, - "grad_norm": 0.07299809157848358, - "learning_rate": 5.1959806299730774e-05, - "loss": 0.6196, - "step": 2375 - }, - { - "epoch": 1.3, - "grad_norm": 0.0744137391448021, - "learning_rate": 5.1882989023908915e-05, - "loss": 0.6276, - "step": 2376 - }, - { - "epoch": 1.31, - "grad_norm": 0.07640177011489868, - "learning_rate": 5.180620867795788e-05, - "loss": 0.539, - "step": 2377 - }, - { - "epoch": 1.31, - "grad_norm": 0.0750659853219986, - "learning_rate": 5.1729465320806645e-05, - "loss": 0.5995, - "step": 2378 - }, - { - "epoch": 1.31, - "grad_norm": 0.08161938935518265, - "learning_rate": 5.16527590113559e-05, - "loss": 0.6323, - "step": 2379 - }, - { - "epoch": 1.31, - "grad_norm": 0.07055678963661194, - "learning_rate": 5.157608980847777e-05, - "loss": 0.5966, - "step": 2380 - }, - { - "epoch": 
1.31, - "grad_norm": 0.07690481096506119, - "learning_rate": 5.1499457771016e-05, - "loss": 0.7723, - "step": 2381 - }, - { - "epoch": 1.31, - "grad_norm": 0.07799238711595535, - "learning_rate": 5.1422862957785834e-05, - "loss": 0.6716, - "step": 2382 - }, - { - "epoch": 1.31, - "grad_norm": 0.07273693382740021, - "learning_rate": 5.1346305427573816e-05, - "loss": 0.5985, - "step": 2383 - }, - { - "epoch": 1.31, - "grad_norm": 0.07442136853933334, - "learning_rate": 5.1269785239138015e-05, - "loss": 0.5927, - "step": 2384 - }, - { - "epoch": 1.31, - "grad_norm": 0.06980384886264801, - "learning_rate": 5.1193302451207724e-05, - "loss": 0.5611, - "step": 2385 - }, - { - "epoch": 1.31, - "grad_norm": 0.0675831288099289, - "learning_rate": 5.111685712248364e-05, - "loss": 0.5339, - "step": 2386 - }, - { - "epoch": 1.31, - "grad_norm": 0.06704030185937881, - "learning_rate": 5.1040449311637605e-05, - "loss": 0.6157, - "step": 2387 - }, - { - "epoch": 1.31, - "grad_norm": 0.07446156442165375, - "learning_rate": 5.0964079077312774e-05, - "loss": 0.546, - "step": 2388 - }, - { - "epoch": 1.31, - "grad_norm": 0.07804377377033234, - "learning_rate": 5.0887746478123336e-05, - "loss": 0.6954, - "step": 2389 - }, - { - "epoch": 1.31, - "grad_norm": 0.07373851537704468, - "learning_rate": 5.08114515726547e-05, - "loss": 0.5971, - "step": 2390 - }, - { - "epoch": 1.31, - "grad_norm": 0.07430601865053177, - "learning_rate": 5.0735194419463304e-05, - "loss": 0.609, - "step": 2391 - }, - { - "epoch": 1.31, - "grad_norm": 0.08051323890686035, - "learning_rate": 5.0658975077076667e-05, - "loss": 0.5799, - "step": 2392 - }, - { - "epoch": 1.31, - "grad_norm": 0.07049138844013214, - "learning_rate": 5.058279360399314e-05, - "loss": 0.6887, - "step": 2393 - }, - { - "epoch": 1.31, - "grad_norm": 0.07234661281108856, - "learning_rate": 5.050665005868216e-05, - "loss": 0.551, - "step": 2394 - }, - { - "epoch": 1.32, - "grad_norm": 0.06585332751274109, - "learning_rate": 5.043054449958404e-05, - "loss": 0.516, - "step": 2395 - }, - { - "epoch": 1.32, - "grad_norm": 0.07594798505306244, - "learning_rate": 5.035447698510982e-05, - "loss": 0.6143, - "step": 2396 - }, - { - "epoch": 1.32, - "grad_norm": 0.07606874406337738, - "learning_rate": 5.0278447573641495e-05, - "loss": 0.598, - "step": 2397 - }, - { - "epoch": 1.32, - "grad_norm": 0.07295016199350357, - "learning_rate": 5.0202456323531667e-05, - "loss": 0.633, - "step": 2398 - }, - { - "epoch": 1.32, - "grad_norm": 0.0775969848036766, - "learning_rate": 5.012650329310379e-05, - "loss": 0.6548, - "step": 2399 - }, - { - "epoch": 1.32, - "grad_norm": 0.07147696614265442, - "learning_rate": 5.005058854065185e-05, - "loss": 0.5813, - "step": 2400 - }, - { - "epoch": 1.32, - "grad_norm": 0.07123620808124542, - "learning_rate": 4.997471212444059e-05, - "loss": 0.6711, - "step": 2401 - }, - { - "epoch": 1.32, - "grad_norm": 0.07753394544124603, - "learning_rate": 4.9898874102705204e-05, - "loss": 0.5952, - "step": 2402 - }, - { - "epoch": 1.32, - "grad_norm": 0.07657833397388458, - "learning_rate": 4.9823074533651495e-05, - "loss": 0.5465, - "step": 2403 - }, - { - "epoch": 1.32, - "grad_norm": 0.0730203315615654, - "learning_rate": 4.974731347545578e-05, - "loss": 0.5301, - "step": 2404 - }, - { - "epoch": 1.32, - "grad_norm": 0.08199814707040787, - "learning_rate": 4.967159098626469e-05, - "loss": 0.6718, - "step": 2405 - }, - { - "epoch": 1.32, - "grad_norm": 0.07243099808692932, - "learning_rate": 4.959590712419543e-05, - "loss": 0.62, - "step": 2406 - }, - { - 
"epoch": 1.32, - "grad_norm": 0.06932693719863892, - "learning_rate": 4.9520261947335364e-05, - "loss": 0.5187, - "step": 2407 - }, - { - "epoch": 1.32, - "grad_norm": 0.06874432414770126, - "learning_rate": 4.944465551374238e-05, - "loss": 0.5736, - "step": 2408 - }, - { - "epoch": 1.32, - "grad_norm": 0.06884592771530151, - "learning_rate": 4.936908788144441e-05, - "loss": 0.5958, - "step": 2409 - }, - { - "epoch": 1.32, - "grad_norm": 0.07854578644037247, - "learning_rate": 4.92935591084398e-05, - "loss": 0.6262, - "step": 2410 - }, - { - "epoch": 1.32, - "grad_norm": 0.08149847388267517, - "learning_rate": 4.921806925269691e-05, - "loss": 0.6254, - "step": 2411 - }, - { - "epoch": 1.32, - "grad_norm": 0.07334032654762268, - "learning_rate": 4.914261837215435e-05, - "loss": 0.5486, - "step": 2412 - }, - { - "epoch": 1.33, - "grad_norm": 0.0973329022526741, - "learning_rate": 4.90672065247208e-05, - "loss": 0.7698, - "step": 2413 - }, - { - "epoch": 1.33, - "grad_norm": 0.06986068189144135, - "learning_rate": 4.899183376827487e-05, - "loss": 0.5833, - "step": 2414 - }, - { - "epoch": 1.33, - "grad_norm": 0.07007471472024918, - "learning_rate": 4.891650016066536e-05, - "loss": 0.6134, - "step": 2415 - }, - { - "epoch": 1.33, - "grad_norm": 0.07884490489959717, - "learning_rate": 4.884120575971082e-05, - "loss": 0.5585, - "step": 2416 - }, - { - "epoch": 1.33, - "grad_norm": 0.07099177688360214, - "learning_rate": 4.876595062319987e-05, - "loss": 0.668, - "step": 2417 - }, - { - "epoch": 1.33, - "grad_norm": 0.08233867585659027, - "learning_rate": 4.869073480889087e-05, - "loss": 0.5372, - "step": 2418 - }, - { - "epoch": 1.33, - "grad_norm": 0.06042160093784332, - "learning_rate": 4.861555837451213e-05, - "loss": 0.5287, - "step": 2419 - }, - { - "epoch": 1.33, - "grad_norm": 0.07258826494216919, - "learning_rate": 4.854042137776158e-05, - "loss": 0.5865, - "step": 2420 - }, - { - "epoch": 1.33, - "grad_norm": 0.08246668428182602, - "learning_rate": 4.8465323876307024e-05, - "loss": 0.5417, - "step": 2421 - }, - { - "epoch": 1.33, - "grad_norm": 0.07026161253452301, - "learning_rate": 4.8390265927785905e-05, - "loss": 0.5653, - "step": 2422 - }, - { - "epoch": 1.33, - "grad_norm": 0.07703935354948044, - "learning_rate": 4.831524758980526e-05, - "loss": 0.6307, - "step": 2423 - }, - { - "epoch": 1.33, - "grad_norm": 0.07275483757257462, - "learning_rate": 4.82402689199418e-05, - "loss": 0.5763, - "step": 2424 - }, - { - "epoch": 1.33, - "grad_norm": 0.07985451072454453, - "learning_rate": 4.8165329975741715e-05, - "loss": 0.5512, - "step": 2425 - }, - { - "epoch": 1.33, - "grad_norm": 0.06975404918193817, - "learning_rate": 4.80904308147208e-05, - "loss": 0.5738, - "step": 2426 - }, - { - "epoch": 1.33, - "grad_norm": 0.08306935429573059, - "learning_rate": 4.801557149436419e-05, - "loss": 0.6353, - "step": 2427 - }, - { - "epoch": 1.33, - "grad_norm": 0.06815838813781738, - "learning_rate": 4.794075207212659e-05, - "loss": 0.4735, - "step": 2428 - }, - { - "epoch": 1.33, - "grad_norm": 0.07782094925642014, - "learning_rate": 4.786597260543194e-05, - "loss": 0.6385, - "step": 2429 - }, - { - "epoch": 1.33, - "grad_norm": 0.07282759994268417, - "learning_rate": 4.779123315167362e-05, - "loss": 0.4963, - "step": 2430 - }, - { - "epoch": 1.34, - "grad_norm": 0.08053456246852875, - "learning_rate": 4.771653376821429e-05, - "loss": 0.6452, - "step": 2431 - }, - { - "epoch": 1.34, - "grad_norm": 0.07558408379554749, - "learning_rate": 4.7641874512385745e-05, - "loss": 0.6229, - "step": 2432 - }, 
- { - "epoch": 1.34, - "grad_norm": 0.07459430396556854, - "learning_rate": 4.7567255441489155e-05, - "loss": 0.5877, - "step": 2433 - }, - { - "epoch": 1.34, - "grad_norm": 0.07865982502698898, - "learning_rate": 4.749267661279469e-05, - "loss": 0.5827, - "step": 2434 - }, - { - "epoch": 1.34, - "grad_norm": 0.0782080590724945, - "learning_rate": 4.741813808354175e-05, - "loss": 0.5833, - "step": 2435 - }, - { - "epoch": 1.34, - "grad_norm": 0.06451378017663956, - "learning_rate": 4.7343639910938695e-05, - "loss": 0.5107, - "step": 2436 - }, - { - "epoch": 1.34, - "grad_norm": 0.06435713171958923, - "learning_rate": 4.726918215216305e-05, - "loss": 0.4768, - "step": 2437 - }, - { - "epoch": 1.34, - "grad_norm": 0.06687245517969131, - "learning_rate": 4.7194764864361174e-05, - "loss": 0.533, - "step": 2438 - }, - { - "epoch": 1.34, - "grad_norm": 0.0725407525897026, - "learning_rate": 4.712038810464847e-05, - "loss": 0.5419, - "step": 2439 - }, - { - "epoch": 1.34, - "grad_norm": 0.06240203604102135, - "learning_rate": 4.704605193010922e-05, - "loss": 0.56, - "step": 2440 - }, - { - "epoch": 1.34, - "grad_norm": 0.08678898215293884, - "learning_rate": 4.6971756397796504e-05, - "loss": 0.6051, - "step": 2441 - }, - { - "epoch": 1.34, - "grad_norm": 0.07972444593906403, - "learning_rate": 4.689750156473228e-05, - "loss": 0.6196, - "step": 2442 - }, - { - "epoch": 1.34, - "grad_norm": 0.08531578630208969, - "learning_rate": 4.6823287487907173e-05, - "loss": 0.6209, - "step": 2443 - }, - { - "epoch": 1.34, - "grad_norm": 0.06963545829057693, - "learning_rate": 4.6749114224280664e-05, - "loss": 0.6311, - "step": 2444 - }, - { - "epoch": 1.34, - "grad_norm": 0.08195082098245621, - "learning_rate": 4.667498183078076e-05, - "loss": 0.7213, - "step": 2445 - }, - { - "epoch": 1.34, - "grad_norm": 0.07406240701675415, - "learning_rate": 4.660089036430424e-05, - "loss": 0.6393, - "step": 2446 - }, - { - "epoch": 1.34, - "grad_norm": 0.08004584908485413, - "learning_rate": 4.652683988171632e-05, - "loss": 0.5596, - "step": 2447 - }, - { - "epoch": 1.34, - "grad_norm": 0.0669763907790184, - "learning_rate": 4.645283043985095e-05, - "loss": 0.5218, - "step": 2448 - }, - { - "epoch": 1.35, - "grad_norm": 0.08354940265417099, - "learning_rate": 4.637886209551038e-05, - "loss": 0.583, - "step": 2449 - }, - { - "epoch": 1.35, - "grad_norm": 0.06934605538845062, - "learning_rate": 4.6304934905465445e-05, - "loss": 0.528, - "step": 2450 - }, - { - "epoch": 1.35, - "grad_norm": 0.06799960881471634, - "learning_rate": 4.6231048926455415e-05, - "loss": 0.573, - "step": 2451 - }, - { - "epoch": 1.35, - "grad_norm": 0.0804983377456665, - "learning_rate": 4.61572042151878e-05, - "loss": 0.6451, - "step": 2452 - }, - { - "epoch": 1.35, - "grad_norm": 0.07285530120134354, - "learning_rate": 4.60834008283386e-05, - "loss": 0.5765, - "step": 2453 - }, - { - "epoch": 1.35, - "grad_norm": 0.07094667106866837, - "learning_rate": 4.600963882255192e-05, - "loss": 0.5697, - "step": 2454 - }, - { - "epoch": 1.35, - "grad_norm": 0.0664157122373581, - "learning_rate": 4.593591825444028e-05, - "loss": 0.6048, - "step": 2455 - }, - { - "epoch": 1.35, - "grad_norm": 0.06729783862829208, - "learning_rate": 4.586223918058424e-05, - "loss": 0.594, - "step": 2456 - }, - { - "epoch": 1.35, - "grad_norm": 0.07497063279151917, - "learning_rate": 4.578860165753268e-05, - "loss": 0.6829, - "step": 2457 - }, - { - "epoch": 1.35, - "grad_norm": 0.08435964584350586, - "learning_rate": 4.57150057418024e-05, - "loss": 0.6255, - "step": 2458 - }, 
- { - "epoch": 1.35, - "grad_norm": 0.07012388855218887, - "learning_rate": 4.5641451489878414e-05, - "loss": 0.5059, - "step": 2459 - }, - { - "epoch": 1.35, - "grad_norm": 0.06958837062120438, - "learning_rate": 4.5567938958213704e-05, - "loss": 0.5567, - "step": 2460 - }, - { - "epoch": 1.35, - "grad_norm": 0.07721869647502899, - "learning_rate": 4.549446820322929e-05, - "loss": 0.591, - "step": 2461 - }, - { - "epoch": 1.35, - "grad_norm": 0.06777055561542511, - "learning_rate": 4.5421039281313985e-05, - "loss": 0.506, - "step": 2462 - }, - { - "epoch": 1.35, - "grad_norm": 0.07227466255426407, - "learning_rate": 4.5347652248824624e-05, - "loss": 0.6017, - "step": 2463 - }, - { - "epoch": 1.35, - "grad_norm": 0.07450714707374573, - "learning_rate": 4.5274307162085894e-05, - "loss": 0.5657, - "step": 2464 - }, - { - "epoch": 1.35, - "grad_norm": 0.07071810960769653, - "learning_rate": 4.520100407739016e-05, - "loss": 0.5606, - "step": 2465 - }, - { - "epoch": 1.35, - "grad_norm": 0.07373818010091782, - "learning_rate": 4.512774305099775e-05, - "loss": 0.5771, - "step": 2466 - }, - { - "epoch": 1.36, - "grad_norm": 0.0895610824227333, - "learning_rate": 4.505452413913649e-05, - "loss": 0.7296, - "step": 2467 - }, - { - "epoch": 1.36, - "grad_norm": 0.0758524090051651, - "learning_rate": 4.498134739800207e-05, - "loss": 0.587, - "step": 2468 - }, - { - "epoch": 1.36, - "grad_norm": 0.08151973783969879, - "learning_rate": 4.490821288375769e-05, - "loss": 0.5925, - "step": 2469 - }, - { - "epoch": 1.36, - "grad_norm": 0.07796274870634079, - "learning_rate": 4.4835120652534235e-05, - "loss": 0.6123, - "step": 2470 - }, - { - "epoch": 1.36, - "grad_norm": 0.07752220332622528, - "learning_rate": 4.476207076043003e-05, - "loss": 0.619, - "step": 2471 - }, - { - "epoch": 1.36, - "grad_norm": 0.06401935964822769, - "learning_rate": 4.4689063263511e-05, - "loss": 0.5283, - "step": 2472 - }, - { - "epoch": 1.36, - "grad_norm": 0.07612209767103195, - "learning_rate": 4.461609821781054e-05, - "loss": 0.5274, - "step": 2473 - }, - { - "epoch": 1.36, - "grad_norm": 0.07507737725973129, - "learning_rate": 4.4543175679329344e-05, - "loss": 0.6352, - "step": 2474 - }, - { - "epoch": 1.36, - "grad_norm": 0.0714292898774147, - "learning_rate": 4.447029570403561e-05, - "loss": 0.5376, - "step": 2475 - }, - { - "epoch": 1.36, - "grad_norm": 0.08474813401699066, - "learning_rate": 4.4397458347864785e-05, - "loss": 0.658, - "step": 2476 - }, - { - "epoch": 1.36, - "grad_norm": 0.08169824630022049, - "learning_rate": 4.432466366671968e-05, - "loss": 0.6571, - "step": 2477 - }, - { - "epoch": 1.36, - "grad_norm": 0.0662381574511528, - "learning_rate": 4.425191171647025e-05, - "loss": 0.6264, - "step": 2478 - }, - { - "epoch": 1.36, - "grad_norm": 0.07380590587854385, - "learning_rate": 4.417920255295379e-05, - "loss": 0.5699, - "step": 2479 - }, - { - "epoch": 1.36, - "grad_norm": 0.08449859172105789, - "learning_rate": 4.4106536231974596e-05, - "loss": 0.5647, - "step": 2480 - }, - { - "epoch": 1.36, - "grad_norm": 0.07241953909397125, - "learning_rate": 4.4033912809304214e-05, - "loss": 0.6136, - "step": 2481 - }, - { - "epoch": 1.36, - "grad_norm": 0.07251517474651337, - "learning_rate": 4.396133234068126e-05, - "loss": 0.5484, - "step": 2482 - }, - { - "epoch": 1.36, - "grad_norm": 0.08115177601575851, - "learning_rate": 4.388879488181125e-05, - "loss": 0.6351, - "step": 2483 - }, - { - "epoch": 1.36, - "grad_norm": 0.07590151578187943, - "learning_rate": 4.381630048836687e-05, - "loss": 0.5785, - "step": 
2484 - }, - { - "epoch": 1.37, - "grad_norm": 0.07244231551885605, - "learning_rate": 4.3743849215987595e-05, - "loss": 0.6151, - "step": 2485 - }, - { - "epoch": 1.37, - "grad_norm": 0.07568778842687607, - "learning_rate": 4.3671441120279934e-05, - "loss": 0.5097, - "step": 2486 - }, - { - "epoch": 1.37, - "grad_norm": 0.07233958691358566, - "learning_rate": 4.3599076256817125e-05, - "loss": 0.5486, - "step": 2487 - }, - { - "epoch": 1.37, - "grad_norm": 0.08074472099542618, - "learning_rate": 4.3526754681139395e-05, - "loss": 0.626, - "step": 2488 - }, - { - "epoch": 1.37, - "grad_norm": 0.06744801998138428, - "learning_rate": 4.3454476448753546e-05, - "loss": 0.5095, - "step": 2489 - }, - { - "epoch": 1.37, - "grad_norm": 0.07199655473232269, - "learning_rate": 4.338224161513327e-05, - "loss": 0.5503, - "step": 2490 - }, - { - "epoch": 1.37, - "grad_norm": 0.08650276809930801, - "learning_rate": 4.331005023571895e-05, - "loss": 0.6253, - "step": 2491 - }, - { - "epoch": 1.37, - "grad_norm": 0.08837499469518661, - "learning_rate": 4.323790236591746e-05, - "loss": 0.5481, - "step": 2492 - }, - { - "epoch": 1.37, - "grad_norm": 0.0681748315691948, - "learning_rate": 4.316579806110249e-05, - "loss": 0.5562, - "step": 2493 - }, - { - "epoch": 1.37, - "grad_norm": 0.06990314275026321, - "learning_rate": 4.309373737661411e-05, - "loss": 0.5825, - "step": 2494 - }, - { - "epoch": 1.37, - "grad_norm": 0.07371961325407028, - "learning_rate": 4.3021720367759056e-05, - "loss": 0.5226, - "step": 2495 - }, - { - "epoch": 1.37, - "grad_norm": 0.07199500501155853, - "learning_rate": 4.294974708981041e-05, - "loss": 0.5349, - "step": 2496 - }, - { - "epoch": 1.37, - "grad_norm": 0.08209903538227081, - "learning_rate": 4.287781759800784e-05, - "loss": 0.6039, - "step": 2497 - }, - { - "epoch": 1.37, - "grad_norm": 0.07699441909790039, - "learning_rate": 4.280593194755727e-05, - "loss": 0.6163, - "step": 2498 - }, - { - "epoch": 1.37, - "grad_norm": 0.06460884213447571, - "learning_rate": 4.273409019363103e-05, - "loss": 0.5296, - "step": 2499 - }, - { - "epoch": 1.37, - "grad_norm": 0.07461009919643402, - "learning_rate": 4.266229239136783e-05, - "loss": 0.5694, - "step": 2500 - }, - { - "epoch": 1.37, - "grad_norm": 0.07356124371290207, - "learning_rate": 4.2590538595872495e-05, - "loss": 0.5508, - "step": 2501 - }, - { - "epoch": 1.38, - "grad_norm": 0.07879485189914703, - "learning_rate": 4.251882886221623e-05, - "loss": 0.6915, - "step": 2502 - }, - { - "epoch": 1.38, - "grad_norm": 0.07226796448230743, - "learning_rate": 4.2447163245436295e-05, - "loss": 0.5294, - "step": 2503 - }, - { - "epoch": 1.38, - "grad_norm": 0.07331380248069763, - "learning_rate": 4.237554180053621e-05, - "loss": 0.6423, - "step": 2504 - }, - { - "epoch": 1.38, - "grad_norm": 0.07386188954114914, - "learning_rate": 4.230396458248546e-05, - "loss": 0.627, - "step": 2505 - }, - { - "epoch": 1.38, - "grad_norm": 0.06912330538034439, - "learning_rate": 4.223243164621973e-05, - "loss": 0.5624, - "step": 2506 - }, - { - "epoch": 1.38, - "grad_norm": 0.06901656836271286, - "learning_rate": 4.216094304664056e-05, - "loss": 0.4814, - "step": 2507 - }, - { - "epoch": 1.38, - "grad_norm": 0.06887569278478622, - "learning_rate": 4.208949883861559e-05, - "loss": 0.5264, - "step": 2508 - }, - { - "epoch": 1.38, - "grad_norm": 0.07022027671337128, - "learning_rate": 4.20180990769784e-05, - "loss": 0.5159, - "step": 2509 - }, - { - "epoch": 1.38, - "grad_norm": 0.06848796457052231, - "learning_rate": 4.194674381652831e-05, - "loss": 
0.4915, - "step": 2510 - }, - { - "epoch": 1.38, - "grad_norm": 0.07609882205724716, - "learning_rate": 4.187543311203066e-05, - "loss": 0.5331, - "step": 2511 - }, - { - "epoch": 1.38, - "grad_norm": 0.06382356584072113, - "learning_rate": 4.180416701821643e-05, - "loss": 0.5464, - "step": 2512 - }, - { - "epoch": 1.38, - "grad_norm": 0.0681033730506897, - "learning_rate": 4.173294558978253e-05, - "loss": 0.5528, - "step": 2513 - }, - { - "epoch": 1.38, - "grad_norm": 0.07078997790813446, - "learning_rate": 4.1661768881391416e-05, - "loss": 0.5908, - "step": 2514 - }, - { - "epoch": 1.38, - "grad_norm": 0.06293249875307083, - "learning_rate": 4.159063694767138e-05, - "loss": 0.5, - "step": 2515 - }, - { - "epoch": 1.38, - "grad_norm": 0.08230625092983246, - "learning_rate": 4.151954984321622e-05, - "loss": 0.6461, - "step": 2516 - }, - { - "epoch": 1.38, - "grad_norm": 0.06888657808303833, - "learning_rate": 4.1448507622585405e-05, - "loss": 0.5592, - "step": 2517 - }, - { - "epoch": 1.38, - "grad_norm": 0.08879472315311432, - "learning_rate": 4.137751034030399e-05, - "loss": 0.6627, - "step": 2518 - }, - { - "epoch": 1.38, - "grad_norm": 0.07641344517469406, - "learning_rate": 4.1306558050862384e-05, - "loss": 0.5746, - "step": 2519 - }, - { - "epoch": 1.39, - "grad_norm": 0.08045165240764618, - "learning_rate": 4.1235650808716665e-05, - "loss": 0.6647, - "step": 2520 - }, - { - "epoch": 1.39, - "grad_norm": 0.08117100596427917, - "learning_rate": 4.1164788668288155e-05, - "loss": 0.5934, - "step": 2521 - }, - { - "epoch": 1.39, - "grad_norm": 0.07026588171720505, - "learning_rate": 4.1093971683963706e-05, - "loss": 0.5756, - "step": 2522 - }, - { - "epoch": 1.39, - "grad_norm": 0.07604368031024933, - "learning_rate": 4.102319991009539e-05, - "loss": 0.5721, - "step": 2523 - }, - { - "epoch": 1.39, - "grad_norm": 0.07874717563390732, - "learning_rate": 4.095247340100069e-05, - "loss": 0.5319, - "step": 2524 - }, - { - "epoch": 1.39, - "grad_norm": 0.07756995409727097, - "learning_rate": 4.088179221096225e-05, - "loss": 0.5374, - "step": 2525 - }, - { - "epoch": 1.39, - "grad_norm": 0.0727904736995697, - "learning_rate": 4.0811156394228046e-05, - "loss": 0.5539, - "step": 2526 - }, - { - "epoch": 1.39, - "grad_norm": 0.07348831743001938, - "learning_rate": 4.074056600501107e-05, - "loss": 0.5754, - "step": 2527 - }, - { - "epoch": 1.39, - "grad_norm": 0.08963430672883987, - "learning_rate": 4.067002109748961e-05, - "loss": 0.701, - "step": 2528 - }, - { - "epoch": 1.39, - "grad_norm": 0.07955507934093475, - "learning_rate": 4.059952172580694e-05, - "loss": 0.63, - "step": 2529 - }, - { - "epoch": 1.39, - "grad_norm": 0.06813513487577438, - "learning_rate": 4.0529067944071455e-05, - "loss": 0.5808, - "step": 2530 - }, - { - "epoch": 1.39, - "grad_norm": 0.07727110385894775, - "learning_rate": 4.045865980635655e-05, - "loss": 0.6367, - "step": 2531 - }, - { - "epoch": 1.39, - "grad_norm": 0.0674891322851181, - "learning_rate": 4.038829736670049e-05, - "loss": 0.5446, - "step": 2532 - }, - { - "epoch": 1.39, - "grad_norm": 0.07553569972515106, - "learning_rate": 4.0317980679106613e-05, - "loss": 0.594, - "step": 2533 - }, - { - "epoch": 1.39, - "grad_norm": 0.0750473290681839, - "learning_rate": 4.024770979754301e-05, - "loss": 0.5466, - "step": 2534 - }, - { - "epoch": 1.39, - "grad_norm": 0.08206024020910263, - "learning_rate": 4.0177484775942755e-05, - "loss": 0.6497, - "step": 2535 - }, - { - "epoch": 1.39, - "grad_norm": 0.06640418618917465, - "learning_rate": 4.010730566820355e-05, - 
"loss": 0.5366, - "step": 2536 - }, - { - "epoch": 1.39, - "grad_norm": 0.07963666319847107, - "learning_rate": 4.003717252818805e-05, - "loss": 0.6514, - "step": 2537 - }, - { - "epoch": 1.4, - "grad_norm": 0.07602626830339432, - "learning_rate": 3.996708540972345e-05, - "loss": 0.5406, - "step": 2538 - }, - { - "epoch": 1.4, - "grad_norm": 0.08422016352415085, - "learning_rate": 3.989704436660178e-05, - "loss": 0.6306, - "step": 2539 - }, - { - "epoch": 1.4, - "grad_norm": 0.080567367374897, - "learning_rate": 3.982704945257957e-05, - "loss": 0.5657, - "step": 2540 - }, - { - "epoch": 1.4, - "grad_norm": 0.07916505634784698, - "learning_rate": 3.975710072137805e-05, - "loss": 0.6178, - "step": 2541 - }, - { - "epoch": 1.4, - "grad_norm": 0.08165088295936584, - "learning_rate": 3.968719822668299e-05, - "loss": 0.683, - "step": 2542 - }, - { - "epoch": 1.4, - "grad_norm": 0.07973385602235794, - "learning_rate": 3.961734202214458e-05, - "loss": 0.6307, - "step": 2543 - }, - { - "epoch": 1.4, - "grad_norm": 0.08140292018651962, - "learning_rate": 3.954753216137762e-05, - "loss": 0.5833, - "step": 2544 - }, - { - "epoch": 1.4, - "grad_norm": 0.06821579486131668, - "learning_rate": 3.94777686979612e-05, - "loss": 0.4946, - "step": 2545 - }, - { - "epoch": 1.4, - "grad_norm": 0.07947936654090881, - "learning_rate": 3.9408051685438975e-05, - "loss": 0.6955, - "step": 2546 - }, - { - "epoch": 1.4, - "grad_norm": 0.0787663385272026, - "learning_rate": 3.933838117731873e-05, - "loss": 0.5568, - "step": 2547 - }, - { - "epoch": 1.4, - "grad_norm": 0.0801193043589592, - "learning_rate": 3.9268757227072763e-05, - "loss": 0.5793, - "step": 2548 - }, - { - "epoch": 1.4, - "grad_norm": 0.07403931766748428, - "learning_rate": 3.919917988813748e-05, - "loss": 0.5898, - "step": 2549 - }, - { - "epoch": 1.4, - "grad_norm": 0.0729985162615776, - "learning_rate": 3.912964921391363e-05, - "loss": 0.5412, - "step": 2550 - }, - { - "epoch": 1.4, - "grad_norm": 0.07448332756757736, - "learning_rate": 3.906016525776611e-05, - "loss": 0.5064, - "step": 2551 - }, - { - "epoch": 1.4, - "grad_norm": 0.07552726566791534, - "learning_rate": 3.8990728073023906e-05, - "loss": 0.5618, - "step": 2552 - }, - { - "epoch": 1.4, - "grad_norm": 0.07500220835208893, - "learning_rate": 3.89213377129802e-05, - "loss": 0.6134, - "step": 2553 - }, - { - "epoch": 1.4, - "grad_norm": 0.07273518294095993, - "learning_rate": 3.885199423089212e-05, - "loss": 0.5164, - "step": 2554 - }, - { - "epoch": 1.4, - "grad_norm": 0.0652259886264801, - "learning_rate": 3.878269767998096e-05, - "loss": 0.541, - "step": 2555 - }, - { - "epoch": 1.41, - "grad_norm": 0.07713053375482559, - "learning_rate": 3.871344811343184e-05, - "loss": 0.6068, - "step": 2556 - }, - { - "epoch": 1.41, - "grad_norm": 0.08411411195993423, - "learning_rate": 3.8644245584393965e-05, - "loss": 0.6864, - "step": 2557 - }, - { - "epoch": 1.41, - "grad_norm": 0.06960765272378922, - "learning_rate": 3.857509014598031e-05, - "loss": 0.4659, - "step": 2558 - }, - { - "epoch": 1.41, - "grad_norm": 0.07402585446834564, - "learning_rate": 3.850598185126778e-05, - "loss": 0.53, - "step": 2559 - }, - { - "epoch": 1.41, - "grad_norm": 0.08061912655830383, - "learning_rate": 3.843692075329714e-05, - "loss": 0.5711, - "step": 2560 - }, - { - "epoch": 1.41, - "grad_norm": 0.07501520961523056, - "learning_rate": 3.8367906905072816e-05, - "loss": 0.5578, - "step": 2561 - }, - { - "epoch": 1.41, - "grad_norm": 0.09120304882526398, - "learning_rate": 3.829894035956306e-05, - "loss": 0.6549, - 
"step": 2562 - }, - { - "epoch": 1.41, - "grad_norm": 0.08057157695293427, - "learning_rate": 3.823002116969976e-05, - "loss": 0.4961, - "step": 2563 - }, - { - "epoch": 1.41, - "grad_norm": 0.07594940811395645, - "learning_rate": 3.816114938837853e-05, - "loss": 0.5762, - "step": 2564 - }, - { - "epoch": 1.41, - "grad_norm": 0.07835210114717484, - "learning_rate": 3.8092325068458486e-05, - "loss": 0.6856, - "step": 2565 - }, - { - "epoch": 1.41, - "grad_norm": 0.07431218773126602, - "learning_rate": 3.802354826276248e-05, - "loss": 0.6213, - "step": 2566 - }, - { - "epoch": 1.41, - "grad_norm": 0.06993934512138367, - "learning_rate": 3.795481902407672e-05, - "loss": 0.5887, - "step": 2567 - }, - { - "epoch": 1.41, - "grad_norm": 0.07355695962905884, - "learning_rate": 3.7886137405151e-05, - "loss": 0.5146, - "step": 2568 - }, - { - "epoch": 1.41, - "grad_norm": 0.08074449002742767, - "learning_rate": 3.7817503458698634e-05, - "loss": 0.6928, - "step": 2569 - }, - { - "epoch": 1.41, - "grad_norm": 0.08331965655088425, - "learning_rate": 3.774891723739616e-05, - "loss": 0.6044, - "step": 2570 - }, - { - "epoch": 1.41, - "grad_norm": 0.07481932640075684, - "learning_rate": 3.7680378793883696e-05, - "loss": 0.5267, - "step": 2571 - }, - { - "epoch": 1.41, - "grad_norm": 0.08036099374294281, - "learning_rate": 3.76118881807645e-05, - "loss": 0.6552, - "step": 2572 - }, - { - "epoch": 1.41, - "grad_norm": 0.0766538679599762, - "learning_rate": 3.7543445450605285e-05, - "loss": 0.5748, - "step": 2573 - }, - { - "epoch": 1.42, - "grad_norm": 0.08161585032939911, - "learning_rate": 3.747505065593586e-05, - "loss": 0.5854, - "step": 2574 - }, - { - "epoch": 1.42, - "grad_norm": 0.070757195353508, - "learning_rate": 3.740670384924941e-05, - "loss": 0.4891, - "step": 2575 - }, - { - "epoch": 1.42, - "grad_norm": 0.07526080310344696, - "learning_rate": 3.733840508300213e-05, - "loss": 0.6242, - "step": 2576 - }, - { - "epoch": 1.42, - "grad_norm": 0.07664346694946289, - "learning_rate": 3.727015440961343e-05, - "loss": 0.6276, - "step": 2577 - }, - { - "epoch": 1.42, - "grad_norm": 0.07076764851808548, - "learning_rate": 3.7201951881465846e-05, - "loss": 0.6141, - "step": 2578 - }, - { - "epoch": 1.42, - "grad_norm": 0.08236127346754074, - "learning_rate": 3.7133797550904834e-05, - "loss": 0.484, - "step": 2579 - }, - { - "epoch": 1.42, - "grad_norm": 0.061127692461013794, - "learning_rate": 3.7065691470239016e-05, - "loss": 0.4091, - "step": 2580 - }, - { - "epoch": 1.42, - "grad_norm": 0.07331135123968124, - "learning_rate": 3.699763369173982e-05, - "loss": 0.5704, - "step": 2581 - }, - { - "epoch": 1.42, - "grad_norm": 0.0723845437169075, - "learning_rate": 3.692962426764175e-05, - "loss": 0.6224, - "step": 2582 - }, - { - "epoch": 1.42, - "grad_norm": 0.07726027071475983, - "learning_rate": 3.686166325014206e-05, - "loss": 0.4605, - "step": 2583 - }, - { - "epoch": 1.42, - "grad_norm": 0.08010595291852951, - "learning_rate": 3.6793750691400994e-05, - "loss": 0.57, - "step": 2584 - }, - { - "epoch": 1.42, - "grad_norm": 0.08348097652196884, - "learning_rate": 3.672588664354148e-05, - "loss": 0.6409, - "step": 2585 - }, - { - "epoch": 1.42, - "grad_norm": 0.08310791850090027, - "learning_rate": 3.665807115864928e-05, - "loss": 0.7339, - "step": 2586 - }, - { - "epoch": 1.42, - "grad_norm": 0.07286600768566132, - "learning_rate": 3.659030428877292e-05, - "loss": 0.5772, - "step": 2587 - }, - { - "epoch": 1.42, - "grad_norm": 0.0777403935790062, - "learning_rate": 3.652258608592347e-05, - "loss": 
0.6053, - "step": 2588 - }, - { - "epoch": 1.42, - "grad_norm": 0.07888954877853394, - "learning_rate": 3.645491660207484e-05, - "loss": 0.6204, - "step": 2589 - }, - { - "epoch": 1.42, - "grad_norm": 0.07620932906866074, - "learning_rate": 3.6387295889163366e-05, - "loss": 0.6508, - "step": 2590 - }, - { - "epoch": 1.42, - "grad_norm": 0.08210571855306625, - "learning_rate": 3.631972399908811e-05, - "loss": 0.623, - "step": 2591 - }, - { - "epoch": 1.43, - "grad_norm": 0.06440088897943497, - "learning_rate": 3.6252200983710514e-05, - "loss": 0.4804, - "step": 2592 - }, - { - "epoch": 1.43, - "grad_norm": 0.07377233356237411, - "learning_rate": 3.6184726894854656e-05, - "loss": 0.5823, - "step": 2593 - }, - { - "epoch": 1.43, - "grad_norm": 0.08177551627159119, - "learning_rate": 3.611730178430692e-05, - "loss": 0.6104, - "step": 2594 - }, - { - "epoch": 1.43, - "grad_norm": 0.0679471418261528, - "learning_rate": 3.6049925703816214e-05, - "loss": 0.5188, - "step": 2595 - }, - { - "epoch": 1.43, - "grad_norm": 0.06674007326364517, - "learning_rate": 3.5982598705093784e-05, - "loss": 0.654, - "step": 2596 - }, - { - "epoch": 1.43, - "grad_norm": 0.07756850123405457, - "learning_rate": 3.591532083981315e-05, - "loss": 0.6562, - "step": 2597 - }, - { - "epoch": 1.43, - "grad_norm": 0.07346350699663162, - "learning_rate": 3.584809215961017e-05, - "loss": 0.5255, - "step": 2598 - }, - { - "epoch": 1.43, - "grad_norm": 0.06543095409870148, - "learning_rate": 3.578091271608297e-05, - "loss": 0.5301, - "step": 2599 - }, - { - "epoch": 1.43, - "grad_norm": 0.07410463690757751, - "learning_rate": 3.5713782560791886e-05, - "loss": 0.6076, - "step": 2600 - }, - { - "epoch": 1.43, - "grad_norm": 0.07580257207155228, - "learning_rate": 3.564670174525934e-05, - "loss": 0.5918, - "step": 2601 - }, - { - "epoch": 1.43, - "grad_norm": 0.07186981290578842, - "learning_rate": 3.557967032097e-05, - "loss": 0.5289, - "step": 2602 - }, - { - "epoch": 1.43, - "grad_norm": 0.08301349729299545, - "learning_rate": 3.551268833937054e-05, - "loss": 0.5828, - "step": 2603 - }, - { - "epoch": 1.43, - "grad_norm": 0.06555919349193573, - "learning_rate": 3.544575585186976e-05, - "loss": 0.5391, - "step": 2604 - }, - { - "epoch": 1.43, - "grad_norm": 0.0664808601140976, - "learning_rate": 3.537887290983838e-05, - "loss": 0.5634, - "step": 2605 - }, - { - "epoch": 1.43, - "grad_norm": 0.06681336462497711, - "learning_rate": 3.53120395646092e-05, - "loss": 0.5548, - "step": 2606 - }, - { - "epoch": 1.43, - "grad_norm": 0.07828280329704285, - "learning_rate": 3.5245255867476856e-05, - "loss": 0.6027, - "step": 2607 - }, - { - "epoch": 1.43, - "grad_norm": 0.07380471378564835, - "learning_rate": 3.5178521869697956e-05, - "loss": 0.5826, - "step": 2608 - }, - { - "epoch": 1.43, - "grad_norm": 0.08058056235313416, - "learning_rate": 3.511183762249095e-05, - "loss": 0.6185, - "step": 2609 - }, - { - "epoch": 1.44, - "grad_norm": 0.06906460970640182, - "learning_rate": 3.5045203177036035e-05, - "loss": 0.5639, - "step": 2610 - }, - { - "epoch": 1.44, - "grad_norm": 0.07293884456157684, - "learning_rate": 3.497861858447531e-05, - "loss": 0.5673, - "step": 2611 - }, - { - "epoch": 1.44, - "grad_norm": 0.07099581509828568, - "learning_rate": 3.491208389591245e-05, - "loss": 0.4846, - "step": 2612 - }, - { - "epoch": 1.44, - "grad_norm": 0.07652019709348679, - "learning_rate": 3.484559916241301e-05, - "loss": 0.6081, - "step": 2613 - }, - { - "epoch": 1.44, - "grad_norm": 0.06962558627128601, - "learning_rate": 3.477916443500403e-05, - 
"loss": 0.5261, - "step": 2614 - }, - { - "epoch": 1.44, - "grad_norm": 0.06621481478214264, - "learning_rate": 3.471277976467432e-05, - "loss": 0.5502, - "step": 2615 - }, - { - "epoch": 1.44, - "grad_norm": 0.06833726167678833, - "learning_rate": 3.4646445202374146e-05, - "loss": 0.5267, - "step": 2616 - }, - { - "epoch": 1.44, - "grad_norm": 0.07530024647712708, - "learning_rate": 3.458016079901544e-05, - "loss": 0.5676, - "step": 2617 - }, - { - "epoch": 1.44, - "grad_norm": 0.07977502793073654, - "learning_rate": 3.45139266054715e-05, - "loss": 0.6458, - "step": 2618 - }, - { - "epoch": 1.44, - "grad_norm": 0.07798527181148529, - "learning_rate": 3.444774267257719e-05, - "loss": 0.5013, - "step": 2619 - }, - { - "epoch": 1.44, - "grad_norm": 0.06748402863740921, - "learning_rate": 3.438160905112881e-05, - "loss": 0.5423, - "step": 2620 - }, - { - "epoch": 1.44, - "grad_norm": 0.07368659228086472, - "learning_rate": 3.4315525791883915e-05, - "loss": 0.6145, - "step": 2621 - }, - { - "epoch": 1.44, - "grad_norm": 0.07269573956727982, - "learning_rate": 3.424949294556159e-05, - "loss": 0.5452, - "step": 2622 - }, - { - "epoch": 1.44, - "grad_norm": 0.07823427021503448, - "learning_rate": 3.418351056284206e-05, - "loss": 0.6056, - "step": 2623 - }, - { - "epoch": 1.44, - "grad_norm": 0.08030471205711365, - "learning_rate": 3.411757869436695e-05, - "loss": 0.597, - "step": 2624 - }, - { - "epoch": 1.44, - "grad_norm": 0.08266133069992065, - "learning_rate": 3.405169739073899e-05, - "loss": 0.6015, - "step": 2625 - }, - { - "epoch": 1.44, - "grad_norm": 0.07893429696559906, - "learning_rate": 3.398586670252225e-05, - "loss": 0.5495, - "step": 2626 - }, - { - "epoch": 1.44, - "grad_norm": 0.07488974928855896, - "learning_rate": 3.3920086680241795e-05, - "loss": 0.6026, - "step": 2627 - }, - { - "epoch": 1.45, - "grad_norm": 0.0660768523812294, - "learning_rate": 3.38543573743839e-05, - "loss": 0.4573, - "step": 2628 - }, - { - "epoch": 1.45, - "grad_norm": 0.0831848755478859, - "learning_rate": 3.378867883539597e-05, - "loss": 0.6324, - "step": 2629 - }, - { - "epoch": 1.45, - "grad_norm": 0.07275530695915222, - "learning_rate": 3.3723051113686263e-05, - "loss": 0.5845, - "step": 2630 - }, - { - "epoch": 1.45, - "grad_norm": 0.08007675409317017, - "learning_rate": 3.365747425962424e-05, - "loss": 0.6177, - "step": 2631 - }, - { - "epoch": 1.45, - "grad_norm": 0.0778718814253807, - "learning_rate": 3.359194832354014e-05, - "loss": 0.5392, - "step": 2632 - }, - { - "epoch": 1.45, - "grad_norm": 0.07712651044130325, - "learning_rate": 3.3526473355725294e-05, - "loss": 0.6194, - "step": 2633 - }, - { - "epoch": 1.45, - "grad_norm": 0.071379154920578, - "learning_rate": 3.346104940643174e-05, - "loss": 0.6409, - "step": 2634 - }, - { - "epoch": 1.45, - "grad_norm": 0.06798027455806732, - "learning_rate": 3.339567652587252e-05, - "loss": 0.601, - "step": 2635 - }, - { - "epoch": 1.45, - "grad_norm": 0.07837600260972977, - "learning_rate": 3.333035476422134e-05, - "loss": 0.6524, - "step": 2636 - }, - { - "epoch": 1.45, - "grad_norm": 0.0715259313583374, - "learning_rate": 3.326508417161278e-05, - "loss": 0.5168, - "step": 2637 - }, - { - "epoch": 1.45, - "grad_norm": 0.07512487471103668, - "learning_rate": 3.3199864798142146e-05, - "loss": 0.578, - "step": 2638 - }, - { - "epoch": 1.45, - "grad_norm": 0.07843317836523056, - "learning_rate": 3.3134696693865316e-05, - "loss": 0.583, - "step": 2639 - }, - { - "epoch": 1.45, - "grad_norm": 0.0695166289806366, - "learning_rate": 3.3069579908798964e-05, 
- "loss": 0.5844, - "step": 2640 - }, - { - "epoch": 1.45, - "grad_norm": 0.08100777119398117, - "learning_rate": 3.3004514492920257e-05, - "loss": 0.6408, - "step": 2641 - }, - { - "epoch": 1.45, - "grad_norm": 0.06774459779262543, - "learning_rate": 3.2939500496167044e-05, - "loss": 0.5583, - "step": 2642 - }, - { - "epoch": 1.45, - "grad_norm": 0.08072764426469803, - "learning_rate": 3.287453796843759e-05, - "loss": 0.6122, - "step": 2643 - }, - { - "epoch": 1.45, - "grad_norm": 0.08036383241415024, - "learning_rate": 3.280962695959079e-05, - "loss": 0.5712, - "step": 2644 - }, - { - "epoch": 1.45, - "grad_norm": 0.06367004662752151, - "learning_rate": 3.274476751944587e-05, - "loss": 0.524, - "step": 2645 - }, - { - "epoch": 1.46, - "grad_norm": 0.07899057865142822, - "learning_rate": 3.267995969778257e-05, - "loss": 0.5748, - "step": 2646 - }, - { - "epoch": 1.46, - "grad_norm": 0.07749677449464798, - "learning_rate": 3.2615203544341e-05, - "loss": 0.5404, - "step": 2647 - }, - { - "epoch": 1.46, - "grad_norm": 0.0736050009727478, - "learning_rate": 3.255049910882155e-05, - "loss": 0.5814, - "step": 2648 - }, - { - "epoch": 1.46, - "grad_norm": 0.07629131525754929, - "learning_rate": 3.248584644088501e-05, - "loss": 0.6311, - "step": 2649 - }, - { - "epoch": 1.46, - "grad_norm": 0.07851318269968033, - "learning_rate": 3.242124559015234e-05, - "loss": 0.6012, - "step": 2650 - }, - { - "epoch": 1.46, - "grad_norm": 0.06685943901538849, - "learning_rate": 3.235669660620483e-05, - "loss": 0.4975, - "step": 2651 - }, - { - "epoch": 1.46, - "grad_norm": 0.08383215963840485, - "learning_rate": 3.229219953858386e-05, - "loss": 0.6611, - "step": 2652 - }, - { - "epoch": 1.46, - "grad_norm": 0.08177929371595383, - "learning_rate": 3.222775443679107e-05, - "loss": 0.4994, - "step": 2653 - }, - { - "epoch": 1.46, - "grad_norm": 0.07951292395591736, - "learning_rate": 3.21633613502881e-05, - "loss": 0.6144, - "step": 2654 - }, - { - "epoch": 1.46, - "grad_norm": 0.07102277129888535, - "learning_rate": 3.2099020328496765e-05, - "loss": 0.5143, - "step": 2655 - }, - { - "epoch": 1.46, - "grad_norm": 0.08011867851018906, - "learning_rate": 3.2034731420798926e-05, - "loss": 0.6795, - "step": 2656 - }, - { - "epoch": 1.46, - "grad_norm": 0.0753115639090538, - "learning_rate": 3.197049467653633e-05, - "loss": 0.634, - "step": 2657 - }, - { - "epoch": 1.46, - "grad_norm": 0.06923001259565353, - "learning_rate": 3.1906310145010845e-05, - "loss": 0.4819, - "step": 2658 - }, - { - "epoch": 1.46, - "grad_norm": 0.07761702686548233, - "learning_rate": 3.1842177875484095e-05, - "loss": 0.5315, - "step": 2659 - }, - { - "epoch": 1.46, - "grad_norm": 0.07101301848888397, - "learning_rate": 3.177809791717778e-05, - "loss": 0.4987, - "step": 2660 - }, - { - "epoch": 1.46, - "grad_norm": 0.08525367081165314, - "learning_rate": 3.171407031927325e-05, - "loss": 0.5819, - "step": 2661 - }, - { - "epoch": 1.46, - "grad_norm": 0.07306365668773651, - "learning_rate": 3.165009513091187e-05, - "loss": 0.516, - "step": 2662 - }, - { - "epoch": 1.46, - "grad_norm": 0.06797949224710464, - "learning_rate": 3.158617240119461e-05, - "loss": 0.5117, - "step": 2663 - }, - { - "epoch": 1.47, - "grad_norm": 0.07655341923236847, - "learning_rate": 3.152230217918229e-05, - "loss": 0.5793, - "step": 2664 - }, - { - "epoch": 1.47, - "grad_norm": 0.07946762442588806, - "learning_rate": 3.145848451389542e-05, - "loss": 0.6222, - "step": 2665 - }, - { - "epoch": 1.47, - "grad_norm": 0.08384986221790314, - "learning_rate": 
3.139471945431406e-05, - "loss": 0.6602, - "step": 2666 - }, - { - "epoch": 1.47, - "grad_norm": 0.07310443371534348, - "learning_rate": 3.133100704937804e-05, - "loss": 0.5332, - "step": 2667 - }, - { - "epoch": 1.47, - "grad_norm": 0.07994786649942398, - "learning_rate": 3.126734734798669e-05, - "loss": 0.596, - "step": 2668 - }, - { - "epoch": 1.47, - "grad_norm": 0.073966383934021, - "learning_rate": 3.120374039899897e-05, - "loss": 0.6057, - "step": 2669 - }, - { - "epoch": 1.47, - "grad_norm": 0.08127514272928238, - "learning_rate": 3.114018625123323e-05, - "loss": 0.5887, - "step": 2670 - }, - { - "epoch": 1.47, - "grad_norm": 0.08686904609203339, - "learning_rate": 3.107668495346743e-05, - "loss": 0.6865, - "step": 2671 - }, - { - "epoch": 1.47, - "grad_norm": 0.07904399186372757, - "learning_rate": 3.101323655443882e-05, - "loss": 0.5394, - "step": 2672 - }, - { - "epoch": 1.47, - "grad_norm": 0.08745576441287994, - "learning_rate": 3.0949841102844225e-05, - "loss": 0.5725, - "step": 2673 - }, - { - "epoch": 1.47, - "grad_norm": 0.06875587999820709, - "learning_rate": 3.088649864733965e-05, - "loss": 0.5781, - "step": 2674 - }, - { - "epoch": 1.47, - "grad_norm": 0.07002407312393188, - "learning_rate": 3.0823209236540596e-05, - "loss": 0.4903, - "step": 2675 - }, - { - "epoch": 1.47, - "grad_norm": 0.06887030601501465, - "learning_rate": 3.0759972919021695e-05, - "loss": 0.5208, - "step": 2676 - }, - { - "epoch": 1.47, - "grad_norm": 0.07510718703269958, - "learning_rate": 3.0696789743316945e-05, - "loss": 0.5329, - "step": 2677 - }, - { - "epoch": 1.47, - "grad_norm": 0.07978718727827072, - "learning_rate": 3.0633659757919544e-05, - "loss": 0.608, - "step": 2678 - }, - { - "epoch": 1.47, - "grad_norm": 0.08159972727298737, - "learning_rate": 3.057058301128178e-05, - "loss": 0.6215, - "step": 2679 - }, - { - "epoch": 1.47, - "grad_norm": 0.06593898683786392, - "learning_rate": 3.0507559551815223e-05, - "loss": 0.4854, - "step": 2680 - }, - { - "epoch": 1.47, - "grad_norm": 0.0734141394495964, - "learning_rate": 3.044458942789037e-05, - "loss": 0.5405, - "step": 2681 - }, - { - "epoch": 1.48, - "grad_norm": 0.07597970217466354, - "learning_rate": 3.0381672687836948e-05, - "loss": 0.5526, - "step": 2682 - }, - { - "epoch": 1.48, - "grad_norm": 0.08808930218219757, - "learning_rate": 3.031880937994359e-05, - "loss": 0.596, - "step": 2683 - }, - { - "epoch": 1.48, - "grad_norm": 0.07469739764928818, - "learning_rate": 3.0255999552458026e-05, - "loss": 0.5118, - "step": 2684 - }, - { - "epoch": 1.48, - "grad_norm": 0.07475581020116806, - "learning_rate": 3.0193243253586813e-05, - "loss": 0.6382, - "step": 2685 - }, - { - "epoch": 1.48, - "grad_norm": 0.07039845734834671, - "learning_rate": 3.0130540531495534e-05, - "loss": 0.5234, - "step": 2686 - }, - { - "epoch": 1.48, - "grad_norm": 0.07057277858257294, - "learning_rate": 3.0067891434308638e-05, - "loss": 0.4961, - "step": 2687 - }, - { - "epoch": 1.48, - "grad_norm": 0.06783699244260788, - "learning_rate": 3.000529601010934e-05, - "loss": 0.4642, - "step": 2688 - }, - { - "epoch": 1.48, - "grad_norm": 0.08259709924459457, - "learning_rate": 2.9942754306939758e-05, - "loss": 0.6044, - "step": 2689 - }, - { - "epoch": 1.48, - "grad_norm": 0.06971608847379684, - "learning_rate": 2.988026637280069e-05, - "loss": 0.5563, - "step": 2690 - }, - { - "epoch": 1.48, - "grad_norm": 0.07627476751804352, - "learning_rate": 2.9817832255651757e-05, - "loss": 0.5481, - "step": 2691 - }, - { - "epoch": 1.48, - "grad_norm": 0.07380031794309616, - 
"learning_rate": 2.9755452003411166e-05, - "loss": 0.5532, - "step": 2692 - }, - { - "epoch": 1.48, - "grad_norm": 0.08517714589834213, - "learning_rate": 2.9693125663955924e-05, - "loss": 0.6814, - "step": 2693 - }, - { - "epoch": 1.48, - "grad_norm": 0.07390910387039185, - "learning_rate": 2.9630853285121508e-05, - "loss": 0.593, - "step": 2694 - }, - { - "epoch": 1.48, - "grad_norm": 0.080739825963974, - "learning_rate": 2.9568634914702077e-05, - "loss": 0.5196, - "step": 2695 - }, - { - "epoch": 1.48, - "grad_norm": 0.0803290456533432, - "learning_rate": 2.950647060045034e-05, - "loss": 0.6102, - "step": 2696 - }, - { - "epoch": 1.48, - "grad_norm": 0.07901672273874283, - "learning_rate": 2.9444360390077452e-05, - "loss": 0.6337, - "step": 2697 - }, - { - "epoch": 1.48, - "grad_norm": 0.07600972801446915, - "learning_rate": 2.938230433125314e-05, - "loss": 0.6069, - "step": 2698 - }, - { - "epoch": 1.48, - "grad_norm": 0.08403575420379639, - "learning_rate": 2.932030247160543e-05, - "loss": 0.695, - "step": 2699 - }, - { - "epoch": 1.49, - "grad_norm": 0.07824236154556274, - "learning_rate": 2.9258354858720926e-05, - "loss": 0.6478, - "step": 2700 - }, - { - "epoch": 1.49, - "grad_norm": 0.07876936346292496, - "learning_rate": 2.9196461540144414e-05, - "loss": 0.6292, - "step": 2701 - }, - { - "epoch": 1.49, - "grad_norm": 0.07499687373638153, - "learning_rate": 2.9134622563379188e-05, - "loss": 0.5043, - "step": 2702 - }, - { - "epoch": 1.49, - "grad_norm": 0.07794086635112762, - "learning_rate": 2.9072837975886657e-05, - "loss": 0.5335, - "step": 2703 - }, - { - "epoch": 1.49, - "grad_norm": 0.0772542953491211, - "learning_rate": 2.901110782508665e-05, - "loss": 0.4854, - "step": 2704 - }, - { - "epoch": 1.49, - "grad_norm": 0.07389385998249054, - "learning_rate": 2.894943215835708e-05, - "loss": 0.6506, - "step": 2705 - }, - { - "epoch": 1.49, - "grad_norm": 0.08659908920526505, - "learning_rate": 2.8887811023034138e-05, - "loss": 0.6311, - "step": 2706 - }, - { - "epoch": 1.49, - "grad_norm": 0.07332868874073029, - "learning_rate": 2.8826244466412156e-05, - "loss": 0.5668, - "step": 2707 - }, - { - "epoch": 1.49, - "grad_norm": 0.07572486996650696, - "learning_rate": 2.8764732535743487e-05, - "loss": 0.5522, - "step": 2708 - }, - { - "epoch": 1.49, - "grad_norm": 0.07586922496557236, - "learning_rate": 2.8703275278238683e-05, - "loss": 0.5782, - "step": 2709 - }, - { - "epoch": 1.49, - "grad_norm": 0.07996062189340591, - "learning_rate": 2.8641872741066213e-05, - "loss": 0.6577, - "step": 2710 - }, - { - "epoch": 1.49, - "grad_norm": 0.07385245710611343, - "learning_rate": 2.858052497135265e-05, - "loss": 0.5567, - "step": 2711 - }, - { - "epoch": 1.49, - "grad_norm": 0.07866589725017548, - "learning_rate": 2.8519232016182463e-05, - "loss": 0.6275, - "step": 2712 - }, - { - "epoch": 1.49, - "grad_norm": 0.07945741713047028, - "learning_rate": 2.8457993922598103e-05, - "loss": 0.546, - "step": 2713 - }, - { - "epoch": 1.49, - "grad_norm": 0.07978387176990509, - "learning_rate": 2.8396810737599854e-05, - "loss": 0.5909, - "step": 2714 - }, - { - "epoch": 1.49, - "grad_norm": 0.08178815245628357, - "learning_rate": 2.8335682508145922e-05, - "loss": 0.5666, - "step": 2715 - }, - { - "epoch": 1.49, - "grad_norm": 0.0759677067399025, - "learning_rate": 2.827460928115232e-05, - "loss": 0.6056, - "step": 2716 - }, - { - "epoch": 1.49, - "grad_norm": 0.0705474466085434, - "learning_rate": 2.821359110349279e-05, - "loss": 0.5868, - "step": 2717 - }, - { - "epoch": 1.5, - "grad_norm": 
0.07681458443403244, - "learning_rate": 2.8152628021998905e-05, - "loss": 0.5932, - "step": 2718 - }, - { - "epoch": 1.5, - "grad_norm": 0.07275570183992386, - "learning_rate": 2.809172008345986e-05, - "loss": 0.5254, - "step": 2719 - }, - { - "epoch": 1.5, - "grad_norm": 0.07109987735748291, - "learning_rate": 2.8030867334622655e-05, - "loss": 0.5581, - "step": 2720 - }, - { - "epoch": 1.5, - "grad_norm": 0.07160382717847824, - "learning_rate": 2.797006982219178e-05, - "loss": 0.5916, - "step": 2721 - }, - { - "epoch": 1.5, - "grad_norm": 0.08834366500377655, - "learning_rate": 2.790932759282947e-05, - "loss": 0.6798, - "step": 2722 - }, - { - "epoch": 1.5, - "grad_norm": 0.07338585704565048, - "learning_rate": 2.7848640693155415e-05, - "loss": 0.5371, - "step": 2723 - }, - { - "epoch": 1.5, - "grad_norm": 0.0876719132065773, - "learning_rate": 2.778800916974692e-05, - "loss": 0.6038, - "step": 2724 - }, - { - "epoch": 1.5, - "grad_norm": 0.06971060484647751, - "learning_rate": 2.7727433069138785e-05, - "loss": 0.5224, - "step": 2725 - }, - { - "epoch": 1.5, - "grad_norm": 0.08299615979194641, - "learning_rate": 2.76669124378232e-05, - "loss": 0.5817, - "step": 2726 - }, - { - "epoch": 1.5, - "grad_norm": 0.07271869480609894, - "learning_rate": 2.7606447322249872e-05, - "loss": 0.5423, - "step": 2727 - }, - { - "epoch": 1.5, - "grad_norm": 0.08478042483329773, - "learning_rate": 2.7546037768825827e-05, - "loss": 0.594, - "step": 2728 - }, - { - "epoch": 1.5, - "grad_norm": 0.08502127230167389, - "learning_rate": 2.7485683823915507e-05, - "loss": 0.6399, - "step": 2729 - }, - { - "epoch": 1.5, - "grad_norm": 0.0918736606836319, - "learning_rate": 2.742538553384061e-05, - "loss": 0.5878, - "step": 2730 - }, - { - "epoch": 1.5, - "grad_norm": 0.07426369935274124, - "learning_rate": 2.7365142944880206e-05, - "loss": 0.5661, - "step": 2731 - }, - { - "epoch": 1.5, - "grad_norm": 0.07882078737020493, - "learning_rate": 2.7304956103270508e-05, - "loss": 0.5248, - "step": 2732 - }, - { - "epoch": 1.5, - "grad_norm": 0.06941094249486923, - "learning_rate": 2.7244825055205015e-05, - "loss": 0.5908, - "step": 2733 - }, - { - "epoch": 1.5, - "grad_norm": 0.07542286813259125, - "learning_rate": 2.71847498468344e-05, - "loss": 0.6083, - "step": 2734 - }, - { - "epoch": 1.5, - "grad_norm": 0.09603607654571533, - "learning_rate": 2.7124730524266496e-05, - "loss": 0.6575, - "step": 2735 - }, - { - "epoch": 1.51, - "grad_norm": 0.0671052411198616, - "learning_rate": 2.706476713356615e-05, - "loss": 0.4713, - "step": 2736 - }, - { - "epoch": 1.51, - "grad_norm": 0.08022913336753845, - "learning_rate": 2.7004859720755372e-05, - "loss": 0.6117, - "step": 2737 - }, - { - "epoch": 1.51, - "grad_norm": 0.08039288222789764, - "learning_rate": 2.6945008331813226e-05, - "loss": 0.5949, - "step": 2738 - }, - { - "epoch": 1.51, - "grad_norm": 0.07160980254411697, - "learning_rate": 2.688521301267565e-05, - "loss": 0.6303, - "step": 2739 - }, - { - "epoch": 1.51, - "grad_norm": 0.07343259453773499, - "learning_rate": 2.6825473809235713e-05, - "loss": 0.5956, - "step": 2740 - }, - { - "epoch": 1.51, - "grad_norm": 0.07557319104671478, - "learning_rate": 2.6765790767343267e-05, - "loss": 0.5702, - "step": 2741 - }, - { - "epoch": 1.51, - "grad_norm": 0.07988648861646652, - "learning_rate": 2.6706163932805195e-05, - "loss": 0.5878, - "step": 2742 - }, - { - "epoch": 1.51, - "grad_norm": 0.0786256268620491, - "learning_rate": 2.6646593351385097e-05, - "loss": 0.5026, - "step": 2743 - }, - { - "epoch": 1.51, - "grad_norm": 
0.08495064079761505, - "learning_rate": 2.6587079068803545e-05, - "loss": 0.635, - "step": 2744 - }, - { - "epoch": 1.51, - "grad_norm": 0.08269646018743515, - "learning_rate": 2.6527621130737768e-05, - "loss": 0.6067, - "step": 2745 - }, - { - "epoch": 1.51, - "grad_norm": 0.08189453929662704, - "learning_rate": 2.646821958282184e-05, - "loss": 0.5704, - "step": 2746 - }, - { - "epoch": 1.51, - "grad_norm": 0.06907159090042114, - "learning_rate": 2.6408874470646572e-05, - "loss": 0.5026, - "step": 2747 - }, - { - "epoch": 1.51, - "grad_norm": 0.07225730270147324, - "learning_rate": 2.6349585839759348e-05, - "loss": 0.5752, - "step": 2748 - }, - { - "epoch": 1.51, - "grad_norm": 0.07018494606018066, - "learning_rate": 2.629035373566433e-05, - "loss": 0.5393, - "step": 2749 - }, - { - "epoch": 1.51, - "grad_norm": 0.08471867442131042, - "learning_rate": 2.6231178203822182e-05, - "loss": 0.5768, - "step": 2750 - }, - { - "epoch": 1.51, - "grad_norm": 0.07804112136363983, - "learning_rate": 2.6172059289650263e-05, - "loss": 0.6254, - "step": 2751 - }, - { - "epoch": 1.51, - "grad_norm": 0.06657396256923676, - "learning_rate": 2.6112997038522368e-05, - "loss": 0.523, - "step": 2752 - }, - { - "epoch": 1.51, - "grad_norm": 0.075431227684021, - "learning_rate": 2.6053991495768903e-05, - "loss": 0.5552, - "step": 2753 - }, - { - "epoch": 1.52, - "grad_norm": 0.07600487768650055, - "learning_rate": 2.5995042706676643e-05, - "loss": 0.5392, - "step": 2754 - }, - { - "epoch": 1.52, - "grad_norm": 0.07607197016477585, - "learning_rate": 2.5936150716488894e-05, - "loss": 0.5937, - "step": 2755 - }, - { - "epoch": 1.52, - "grad_norm": 0.07737214118242264, - "learning_rate": 2.5877315570405368e-05, - "loss": 0.5638, - "step": 2756 - }, - { - "epoch": 1.52, - "grad_norm": 0.08522753417491913, - "learning_rate": 2.5818537313582058e-05, - "loss": 0.5747, - "step": 2757 - }, - { - "epoch": 1.52, - "grad_norm": 0.07582385092973709, - "learning_rate": 2.575981599113142e-05, - "loss": 0.5801, - "step": 2758 - }, - { - "epoch": 1.52, - "grad_norm": 0.09301973879337311, - "learning_rate": 2.570115164812209e-05, - "loss": 0.597, - "step": 2759 - }, - { - "epoch": 1.52, - "grad_norm": 0.07498759031295776, - "learning_rate": 2.5642544329579088e-05, - "loss": 0.5826, - "step": 2760 - }, - { - "epoch": 1.52, - "grad_norm": 0.07844389975070953, - "learning_rate": 2.558399408048354e-05, - "loss": 0.5778, - "step": 2761 - }, - { - "epoch": 1.52, - "grad_norm": 0.07821419835090637, - "learning_rate": 2.5525500945772918e-05, - "loss": 0.6254, - "step": 2762 - }, - { - "epoch": 1.52, - "grad_norm": 0.07561523467302322, - "learning_rate": 2.5467064970340704e-05, - "loss": 0.508, - "step": 2763 - }, - { - "epoch": 1.52, - "grad_norm": 0.0730084627866745, - "learning_rate": 2.5408686199036623e-05, - "loss": 0.5472, - "step": 2764 - }, - { - "epoch": 1.52, - "grad_norm": 0.07727015018463135, - "learning_rate": 2.5350364676666505e-05, - "loss": 0.6141, - "step": 2765 - }, - { - "epoch": 1.52, - "grad_norm": 0.06693361699581146, - "learning_rate": 2.529210044799213e-05, - "loss": 0.546, - "step": 2766 - }, - { - "epoch": 1.52, - "grad_norm": 0.07514087855815887, - "learning_rate": 2.5233893557731412e-05, - "loss": 0.4996, - "step": 2767 - }, - { - "epoch": 1.52, - "grad_norm": 0.08636999875307083, - "learning_rate": 2.517574405055819e-05, - "loss": 0.6738, - "step": 2768 - }, - { - "epoch": 1.52, - "grad_norm": 0.09145054966211319, - "learning_rate": 2.511765197110233e-05, - "loss": 0.5968, - "step": 2769 - }, - { - "epoch": 
1.52, - "grad_norm": 0.07830934226512909, - "learning_rate": 2.5059617363949518e-05, - "loss": 0.5855, - "step": 2770 - }, - { - "epoch": 1.52, - "grad_norm": 0.06930552423000336, - "learning_rate": 2.500164027364147e-05, - "loss": 0.552, - "step": 2771 - }, - { - "epoch": 1.53, - "grad_norm": 0.08295781910419464, - "learning_rate": 2.4943720744675603e-05, - "loss": 0.6518, - "step": 2772 - }, - { - "epoch": 1.53, - "grad_norm": 0.07150809466838837, - "learning_rate": 2.4885858821505272e-05, - "loss": 0.5272, - "step": 2773 - }, - { - "epoch": 1.53, - "grad_norm": 0.07201548665761948, - "learning_rate": 2.4828054548539615e-05, - "loss": 0.5088, - "step": 2774 - }, - { - "epoch": 1.53, - "grad_norm": 0.07628802210092545, - "learning_rate": 2.4770307970143424e-05, - "loss": 0.5727, - "step": 2775 - }, - { - "epoch": 1.53, - "grad_norm": 0.07237250357866287, - "learning_rate": 2.471261913063734e-05, - "loss": 0.48, - "step": 2776 - }, - { - "epoch": 1.53, - "grad_norm": 0.08393524587154388, - "learning_rate": 2.4654988074297557e-05, - "loss": 0.5324, - "step": 2777 - }, - { - "epoch": 1.53, - "grad_norm": 0.0864768996834755, - "learning_rate": 2.459741484535606e-05, - "loss": 0.634, - "step": 2778 - }, - { - "epoch": 1.53, - "grad_norm": 0.07512786239385605, - "learning_rate": 2.4539899488000305e-05, - "loss": 0.5888, - "step": 2779 - }, - { - "epoch": 1.53, - "grad_norm": 0.07718805223703384, - "learning_rate": 2.4482442046373478e-05, - "loss": 0.6082, - "step": 2780 - }, - { - "epoch": 1.53, - "grad_norm": 0.08047004789113998, - "learning_rate": 2.4425042564574184e-05, - "loss": 0.5864, - "step": 2781 - }, - { - "epoch": 1.53, - "grad_norm": 0.07809320092201233, - "learning_rate": 2.4367701086656624e-05, - "loss": 0.6021, - "step": 2782 - }, - { - "epoch": 1.53, - "grad_norm": 0.07543138414621353, - "learning_rate": 2.431041765663049e-05, - "loss": 0.5307, - "step": 2783 - }, - { - "epoch": 1.53, - "grad_norm": 0.07181257754564285, - "learning_rate": 2.425319231846085e-05, - "loss": 0.5957, - "step": 2784 - }, - { - "epoch": 1.53, - "grad_norm": 0.0678059458732605, - "learning_rate": 2.4196025116068256e-05, - "loss": 0.513, - "step": 2785 - }, - { - "epoch": 1.53, - "grad_norm": 0.08409433811903, - "learning_rate": 2.4138916093328578e-05, - "loss": 0.6004, - "step": 2786 - }, - { - "epoch": 1.53, - "grad_norm": 0.0672868862748146, - "learning_rate": 2.4081865294073124e-05, - "loss": 0.4609, - "step": 2787 - }, - { - "epoch": 1.53, - "grad_norm": 0.07355806231498718, - "learning_rate": 2.402487276208839e-05, - "loss": 0.6233, - "step": 2788 - }, - { - "epoch": 1.53, - "grad_norm": 0.06832050532102585, - "learning_rate": 2.3967938541116297e-05, - "loss": 0.5685, - "step": 2789 - }, - { - "epoch": 1.54, - "grad_norm": 0.07053340971469879, - "learning_rate": 2.3911062674853858e-05, - "loss": 0.5607, - "step": 2790 - }, - { - "epoch": 1.54, - "grad_norm": 0.08480045199394226, - "learning_rate": 2.3854245206953452e-05, - "loss": 0.5808, - "step": 2791 - }, - { - "epoch": 1.54, - "grad_norm": 0.073263980448246, - "learning_rate": 2.37974861810225e-05, - "loss": 0.5948, - "step": 2792 - }, - { - "epoch": 1.54, - "grad_norm": 0.08538652211427689, - "learning_rate": 2.3740785640623643e-05, - "loss": 0.5938, - "step": 2793 - }, - { - "epoch": 1.54, - "grad_norm": 0.07441440224647522, - "learning_rate": 2.368414362927468e-05, - "loss": 0.6418, - "step": 2794 - }, - { - "epoch": 1.54, - "grad_norm": 0.07826292514801025, - "learning_rate": 2.362756019044835e-05, - "loss": 0.6989, - "step": 2795 - }, - { 
- "epoch": 1.54, - "grad_norm": 0.07607203722000122, - "learning_rate": 2.357103536757258e-05, - "loss": 0.5775, - "step": 2796 - }, - { - "epoch": 1.54, - "grad_norm": 0.0835345908999443, - "learning_rate": 2.35145692040302e-05, - "loss": 0.4798, - "step": 2797 - }, - { - "epoch": 1.54, - "grad_norm": 0.06911890208721161, - "learning_rate": 2.3458161743159124e-05, - "loss": 0.5362, - "step": 2798 - }, - { - "epoch": 1.54, - "grad_norm": 0.07400982826948166, - "learning_rate": 2.3401813028252085e-05, - "loss": 0.569, - "step": 2799 - }, - { - "epoch": 1.54, - "grad_norm": 0.08338724821805954, - "learning_rate": 2.3345523102556867e-05, - "loss": 0.5607, - "step": 2800 - }, - { - "epoch": 1.54, - "grad_norm": 0.07647686451673508, - "learning_rate": 2.3289292009276e-05, - "loss": 0.668, - "step": 2801 - }, - { - "epoch": 1.54, - "grad_norm": 0.08820470422506332, - "learning_rate": 2.3233119791566948e-05, - "loss": 0.641, - "step": 2802 - }, - { - "epoch": 1.54, - "grad_norm": 0.0804409384727478, - "learning_rate": 2.3177006492541976e-05, - "loss": 0.6515, - "step": 2803 - }, - { - "epoch": 1.54, - "grad_norm": 0.060682669281959534, - "learning_rate": 2.312095215526814e-05, - "loss": 0.3945, - "step": 2804 - }, - { - "epoch": 1.54, - "grad_norm": 0.09143857657909393, - "learning_rate": 2.3064956822767157e-05, - "loss": 0.6366, - "step": 2805 - }, - { - "epoch": 1.54, - "grad_norm": 0.07404692471027374, - "learning_rate": 2.300902053801556e-05, - "loss": 0.5778, - "step": 2806 - }, - { - "epoch": 1.54, - "grad_norm": 0.07459226250648499, - "learning_rate": 2.2953143343944528e-05, - "loss": 0.5378, - "step": 2807 - }, - { - "epoch": 1.55, - "grad_norm": 0.08721205592155457, - "learning_rate": 2.2897325283439864e-05, - "loss": 0.6642, - "step": 2808 - }, - { - "epoch": 1.55, - "grad_norm": 0.07177091389894485, - "learning_rate": 2.284156639934203e-05, - "loss": 0.5285, - "step": 2809 - }, - { - "epoch": 1.55, - "grad_norm": 0.08235837519168854, - "learning_rate": 2.2785866734445994e-05, - "loss": 0.6906, - "step": 2810 - }, - { - "epoch": 1.55, - "grad_norm": 0.0843455046415329, - "learning_rate": 2.2730226331501393e-05, - "loss": 0.6085, - "step": 2811 - }, - { - "epoch": 1.55, - "grad_norm": 0.07604355365037918, - "learning_rate": 2.2674645233212234e-05, - "loss": 0.5811, - "step": 2812 - }, - { - "epoch": 1.55, - "grad_norm": 0.08346116542816162, - "learning_rate": 2.261912348223717e-05, - "loss": 0.586, - "step": 2813 - }, - { - "epoch": 1.55, - "grad_norm": 0.08567336946725845, - "learning_rate": 2.256366112118913e-05, - "loss": 0.6379, - "step": 2814 - }, - { - "epoch": 1.55, - "grad_norm": 0.0706150084733963, - "learning_rate": 2.2508258192635612e-05, - "loss": 0.4955, - "step": 2815 - }, - { - "epoch": 1.55, - "grad_norm": 0.07677091658115387, - "learning_rate": 2.245291473909844e-05, - "loss": 0.5504, - "step": 2816 - }, - { - "epoch": 1.55, - "grad_norm": 0.0727921649813652, - "learning_rate": 2.239763080305375e-05, - "loss": 0.6073, - "step": 2817 - }, - { - "epoch": 1.55, - "grad_norm": 0.0712139904499054, - "learning_rate": 2.2342406426932084e-05, - "loss": 0.5859, - "step": 2818 - }, - { - "epoch": 1.55, - "grad_norm": 0.07416416704654694, - "learning_rate": 2.2287241653118172e-05, - "loss": 0.4962, - "step": 2819 - }, - { - "epoch": 1.55, - "grad_norm": 0.07774526625871658, - "learning_rate": 2.2232136523951107e-05, - "loss": 0.5622, - "step": 2820 - }, - { - "epoch": 1.55, - "grad_norm": 0.06952360272407532, - "learning_rate": 2.21770910817241e-05, - "loss": 0.4522, - "step": 2821 
- }, - { - "epoch": 1.55, - "grad_norm": 0.07877884060144424, - "learning_rate": 2.2122105368684643e-05, - "loss": 0.6359, - "step": 2822 - }, - { - "epoch": 1.55, - "grad_norm": 0.08205553144216537, - "learning_rate": 2.2067179427034314e-05, - "loss": 0.5587, - "step": 2823 - }, - { - "epoch": 1.55, - "grad_norm": 0.07097803801298141, - "learning_rate": 2.2012313298928855e-05, - "loss": 0.5439, - "step": 2824 - }, - { - "epoch": 1.55, - "grad_norm": 0.08037848025560379, - "learning_rate": 2.1957507026478118e-05, - "loss": 0.6708, - "step": 2825 - }, - { - "epoch": 1.56, - "grad_norm": 0.07214182615280151, - "learning_rate": 2.1902760651745958e-05, - "loss": 0.5717, - "step": 2826 - }, - { - "epoch": 1.56, - "grad_norm": 0.08701501786708832, - "learning_rate": 2.1848074216750324e-05, - "loss": 0.5975, - "step": 2827 - }, - { - "epoch": 1.56, - "grad_norm": 0.07122653722763062, - "learning_rate": 2.1793447763463093e-05, - "loss": 0.5434, - "step": 2828 - }, - { - "epoch": 1.56, - "grad_norm": 0.08395606279373169, - "learning_rate": 2.173888133381018e-05, - "loss": 0.6117, - "step": 2829 - }, - { - "epoch": 1.56, - "grad_norm": 0.08226127922534943, - "learning_rate": 2.168437496967134e-05, - "loss": 0.5687, - "step": 2830 - }, - { - "epoch": 1.56, - "grad_norm": 0.07219874113798141, - "learning_rate": 2.1629928712880344e-05, - "loss": 0.5374, - "step": 2831 - }, - { - "epoch": 1.56, - "grad_norm": 0.0728921890258789, - "learning_rate": 2.1575542605224707e-05, - "loss": 0.5228, - "step": 2832 - }, - { - "epoch": 1.56, - "grad_norm": 0.08040345460176468, - "learning_rate": 2.152121668844588e-05, - "loss": 0.5647, - "step": 2833 - }, - { - "epoch": 1.56, - "grad_norm": 0.08091174066066742, - "learning_rate": 2.1466951004239077e-05, - "loss": 0.4601, - "step": 2834 - }, - { - "epoch": 1.56, - "grad_norm": 0.06813590973615646, - "learning_rate": 2.141274559425326e-05, - "loss": 0.4936, - "step": 2835 - }, - { - "epoch": 1.56, - "grad_norm": 0.07925499230623245, - "learning_rate": 2.1358600500091185e-05, - "loss": 0.529, - "step": 2836 - }, - { - "epoch": 1.56, - "grad_norm": 0.08047482371330261, - "learning_rate": 2.1304515763309253e-05, - "loss": 0.4992, - "step": 2837 - }, - { - "epoch": 1.56, - "grad_norm": 0.08449546247720718, - "learning_rate": 2.1250491425417607e-05, - "loss": 0.6248, - "step": 2838 - }, - { - "epoch": 1.56, - "grad_norm": 0.08615768700838089, - "learning_rate": 2.1196527527879952e-05, - "loss": 0.5692, - "step": 2839 - }, - { - "epoch": 1.56, - "grad_norm": 0.0787179172039032, - "learning_rate": 2.1142624112113707e-05, - "loss": 0.5379, - "step": 2840 - }, - { - "epoch": 1.56, - "grad_norm": 0.07938480377197266, - "learning_rate": 2.1088781219489762e-05, - "loss": 0.5346, - "step": 2841 - }, - { - "epoch": 1.56, - "grad_norm": 0.07051555812358856, - "learning_rate": 2.1034998891332637e-05, - "loss": 0.4809, - "step": 2842 - }, - { - "epoch": 1.56, - "grad_norm": 0.08885900676250458, - "learning_rate": 2.0981277168920364e-05, - "loss": 0.6303, - "step": 2843 - }, - { - "epoch": 1.57, - "grad_norm": 0.0760311633348465, - "learning_rate": 2.0927616093484394e-05, - "loss": 0.5412, - "step": 2844 - }, - { - "epoch": 1.57, - "grad_norm": 0.0825723186135292, - "learning_rate": 2.0874015706209716e-05, - "loss": 0.6254, - "step": 2845 - }, - { - "epoch": 1.57, - "grad_norm": 0.08421153575181961, - "learning_rate": 2.082047604823465e-05, - "loss": 0.6521, - "step": 2846 - }, - { - "epoch": 1.57, - "grad_norm": 0.07435724139213562, - "learning_rate": 2.0766997160651015e-05, - "loss": 
0.6036, - "step": 2847 - }, - { - "epoch": 1.57, - "grad_norm": 0.0800776332616806, - "learning_rate": 2.0713579084503876e-05, - "loss": 0.5592, - "step": 2848 - }, - { - "epoch": 1.57, - "grad_norm": 0.07602731883525848, - "learning_rate": 2.0660221860791717e-05, - "loss": 0.5496, - "step": 2849 - }, - { - "epoch": 1.57, - "grad_norm": 0.08413247764110565, - "learning_rate": 2.0606925530466248e-05, - "loss": 0.5143, - "step": 2850 - }, - { - "epoch": 1.57, - "grad_norm": 0.07869786769151688, - "learning_rate": 2.055369013443248e-05, - "loss": 0.5678, - "step": 2851 - }, - { - "epoch": 1.57, - "grad_norm": 0.07760642468929291, - "learning_rate": 2.0500515713548685e-05, - "loss": 0.4704, - "step": 2852 - }, - { - "epoch": 1.57, - "grad_norm": 0.07946940511465073, - "learning_rate": 2.0447402308626262e-05, - "loss": 0.5505, - "step": 2853 - }, - { - "epoch": 1.57, - "grad_norm": 0.08587105572223663, - "learning_rate": 2.039434996042986e-05, - "loss": 0.6598, - "step": 2854 - }, - { - "epoch": 1.57, - "grad_norm": 0.06604398787021637, - "learning_rate": 2.0341358709677173e-05, - "loss": 0.4961, - "step": 2855 - }, - { - "epoch": 1.57, - "grad_norm": 0.06659547239542007, - "learning_rate": 2.0288428597039122e-05, - "loss": 0.5459, - "step": 2856 - }, - { - "epoch": 1.57, - "grad_norm": 0.07560455799102783, - "learning_rate": 2.023555966313958e-05, - "loss": 0.5326, - "step": 2857 - }, - { - "epoch": 1.57, - "grad_norm": 0.07178471237421036, - "learning_rate": 2.0182751948555577e-05, - "loss": 0.5295, - "step": 2858 - }, - { - "epoch": 1.57, - "grad_norm": 0.0831179991364479, - "learning_rate": 2.013000549381706e-05, - "loss": 0.5768, - "step": 2859 - }, - { - "epoch": 1.57, - "grad_norm": 0.08885984122753143, - "learning_rate": 2.0077320339407023e-05, - "loss": 0.637, - "step": 2860 - }, - { - "epoch": 1.57, - "grad_norm": 0.0871300920844078, - "learning_rate": 2.002469652576141e-05, - "loss": 0.6254, - "step": 2861 - }, - { - "epoch": 1.58, - "grad_norm": 0.07685824483633041, - "learning_rate": 1.9972134093269035e-05, - "loss": 0.5274, - "step": 2862 - }, - { - "epoch": 1.58, - "grad_norm": 0.08027210086584091, - "learning_rate": 1.9919633082271682e-05, - "loss": 0.5917, - "step": 2863 - }, - { - "epoch": 1.58, - "grad_norm": 0.0697130337357521, - "learning_rate": 1.9867193533063898e-05, - "loss": 0.5018, - "step": 2864 - }, - { - "epoch": 1.58, - "grad_norm": 0.08178181946277618, - "learning_rate": 1.9814815485893145e-05, - "loss": 0.5617, - "step": 2865 - }, - { - "epoch": 1.58, - "grad_norm": 0.06824837625026703, - "learning_rate": 1.976249898095962e-05, - "loss": 0.5842, - "step": 2866 - }, - { - "epoch": 1.58, - "grad_norm": 0.06705306470394135, - "learning_rate": 1.971024405841634e-05, - "loss": 0.4984, - "step": 2867 - }, - { - "epoch": 1.58, - "grad_norm": 0.07768554240465164, - "learning_rate": 1.9658050758368975e-05, - "loss": 0.5888, - "step": 2868 - }, - { - "epoch": 1.58, - "grad_norm": 0.08375062048435211, - "learning_rate": 1.9605919120876016e-05, - "loss": 0.5891, - "step": 2869 - }, - { - "epoch": 1.58, - "grad_norm": 0.07355199009180069, - "learning_rate": 1.9553849185948512e-05, - "loss": 0.533, - "step": 2870 - }, - { - "epoch": 1.58, - "grad_norm": 0.08470550924539566, - "learning_rate": 1.9501840993550236e-05, - "loss": 0.6016, - "step": 2871 - }, - { - "epoch": 1.58, - "grad_norm": 0.07025481760501862, - "learning_rate": 1.9449894583597537e-05, - "loss": 0.5826, - "step": 2872 - }, - { - "epoch": 1.58, - "grad_norm": 0.07526448369026184, - "learning_rate": 
1.9398009995959365e-05, - "loss": 0.5368, - "step": 2873 - }, - { - "epoch": 1.58, - "grad_norm": 0.07720872014760971, - "learning_rate": 1.934618727045724e-05, - "loss": 0.654, - "step": 2874 - }, - { - "epoch": 1.58, - "grad_norm": 0.07709453254938126, - "learning_rate": 1.92944264468651e-05, - "loss": 0.5871, - "step": 2875 - }, - { - "epoch": 1.58, - "grad_norm": 0.08008085936307907, - "learning_rate": 1.9242727564909524e-05, - "loss": 0.579, - "step": 2876 - }, - { - "epoch": 1.58, - "grad_norm": 0.08657196909189224, - "learning_rate": 1.9191090664269396e-05, - "loss": 0.6261, - "step": 2877 - }, - { - "epoch": 1.58, - "grad_norm": 0.0801343247294426, - "learning_rate": 1.913951578457619e-05, - "loss": 0.6157, - "step": 2878 - }, - { - "epoch": 1.58, - "grad_norm": 0.07138290256261826, - "learning_rate": 1.908800296541361e-05, - "loss": 0.4559, - "step": 2879 - }, - { - "epoch": 1.59, - "grad_norm": 0.07896512746810913, - "learning_rate": 1.9036552246317895e-05, - "loss": 0.5572, - "step": 2880 - }, - { - "epoch": 1.59, - "grad_norm": 0.08345163613557816, - "learning_rate": 1.8985163666777473e-05, - "loss": 0.626, - "step": 2881 - }, - { - "epoch": 1.59, - "grad_norm": 0.07745735347270966, - "learning_rate": 1.8933837266233212e-05, - "loss": 0.5568, - "step": 2882 - }, - { - "epoch": 1.59, - "grad_norm": 0.07590015232563019, - "learning_rate": 1.8882573084078124e-05, - "loss": 0.4918, - "step": 2883 - }, - { - "epoch": 1.59, - "grad_norm": 0.07770450413227081, - "learning_rate": 1.8831371159657584e-05, - "loss": 0.5597, - "step": 2884 - }, - { - "epoch": 1.59, - "grad_norm": 0.08126380294561386, - "learning_rate": 1.8780231532269153e-05, - "loss": 0.6236, - "step": 2885 - }, - { - "epoch": 1.59, - "grad_norm": 0.07325980067253113, - "learning_rate": 1.8729154241162505e-05, - "loss": 0.5563, - "step": 2886 - }, - { - "epoch": 1.59, - "grad_norm": 0.07390178740024567, - "learning_rate": 1.86781393255396e-05, - "loss": 0.5878, - "step": 2887 - }, - { - "epoch": 1.59, - "grad_norm": 0.09135127812623978, - "learning_rate": 1.862718682455439e-05, - "loss": 0.6212, - "step": 2888 - }, - { - "epoch": 1.59, - "grad_norm": 0.08383052796125412, - "learning_rate": 1.8576296777313028e-05, - "loss": 0.591, - "step": 2889 - }, - { - "epoch": 1.59, - "grad_norm": 0.07796967774629593, - "learning_rate": 1.852546922287367e-05, - "loss": 0.531, - "step": 2890 - }, - { - "epoch": 1.59, - "grad_norm": 0.08362308144569397, - "learning_rate": 1.8474704200246573e-05, - "loss": 0.6157, - "step": 2891 - }, - { - "epoch": 1.59, - "grad_norm": 0.07821770757436752, - "learning_rate": 1.8424001748393905e-05, - "loss": 0.6096, - "step": 2892 - }, - { - "epoch": 1.59, - "grad_norm": 0.07985088974237442, - "learning_rate": 1.837336190622989e-05, - "loss": 0.6307, - "step": 2893 - }, - { - "epoch": 1.59, - "grad_norm": 0.0782153308391571, - "learning_rate": 1.832278471262071e-05, - "loss": 0.5395, - "step": 2894 - }, - { - "epoch": 1.59, - "grad_norm": 0.07065875083208084, - "learning_rate": 1.8272270206384368e-05, - "loss": 0.4953, - "step": 2895 - }, - { - "epoch": 1.59, - "grad_norm": 0.07387331128120422, - "learning_rate": 1.822181842629087e-05, - "loss": 0.5472, - "step": 2896 - }, - { - "epoch": 1.59, - "grad_norm": 0.08914386481046677, - "learning_rate": 1.817142941106198e-05, - "loss": 0.6485, - "step": 2897 - }, - { - "epoch": 1.6, - "grad_norm": 0.08163358271121979, - "learning_rate": 1.812110319937138e-05, - "loss": 0.5835, - "step": 2898 - }, - { - "epoch": 1.6, - "grad_norm": 0.08932827413082123, - 
"learning_rate": 1.8070839829844453e-05, - "loss": 0.6668, - "step": 2899 - }, - { - "epoch": 1.6, - "grad_norm": 0.08409236371517181, - "learning_rate": 1.8020639341058465e-05, - "loss": 0.6596, - "step": 2900 - }, - { - "epoch": 1.6, - "grad_norm": 0.07313519716262817, - "learning_rate": 1.797050177154229e-05, - "loss": 0.5545, - "step": 2901 - }, - { - "epoch": 1.6, - "grad_norm": 0.0781264528632164, - "learning_rate": 1.792042715977662e-05, - "loss": 0.5564, - "step": 2902 - }, - { - "epoch": 1.6, - "grad_norm": 0.07732515037059784, - "learning_rate": 1.787041554419381e-05, - "loss": 0.4992, - "step": 2903 - }, - { - "epoch": 1.6, - "grad_norm": 0.07877063006162643, - "learning_rate": 1.7820466963177783e-05, - "loss": 0.5458, - "step": 2904 - }, - { - "epoch": 1.6, - "grad_norm": 0.07867057621479034, - "learning_rate": 1.7770581455064206e-05, - "loss": 0.5732, - "step": 2905 - }, - { - "epoch": 1.6, - "grad_norm": 0.07103104889392853, - "learning_rate": 1.7720759058140212e-05, - "loss": 0.5122, - "step": 2906 - }, - { - "epoch": 1.6, - "grad_norm": 0.07974914461374283, - "learning_rate": 1.7670999810644616e-05, - "loss": 0.5454, - "step": 2907 - }, - { - "epoch": 1.6, - "grad_norm": 0.0706530511379242, - "learning_rate": 1.7621303750767648e-05, - "loss": 0.4803, - "step": 2908 - }, - { - "epoch": 1.6, - "grad_norm": 0.07859217375516891, - "learning_rate": 1.7571670916651162e-05, - "loss": 0.5667, - "step": 2909 - }, - { - "epoch": 1.6, - "grad_norm": 0.08591758459806442, - "learning_rate": 1.7522101346388376e-05, - "loss": 0.6084, - "step": 2910 - }, - { - "epoch": 1.6, - "grad_norm": 0.08627095073461533, - "learning_rate": 1.7472595078024012e-05, - "loss": 0.6266, - "step": 2911 - }, - { - "epoch": 1.6, - "grad_norm": 0.07653766870498657, - "learning_rate": 1.7423152149554233e-05, - "loss": 0.5526, - "step": 2912 - }, - { - "epoch": 1.6, - "grad_norm": 0.08037185668945312, - "learning_rate": 1.7373772598926507e-05, - "loss": 0.5548, - "step": 2913 - }, - { - "epoch": 1.6, - "grad_norm": 0.07113780826330185, - "learning_rate": 1.7324456464039752e-05, - "loss": 0.5352, - "step": 2914 - }, - { - "epoch": 1.6, - "grad_norm": 0.07585661113262177, - "learning_rate": 1.727520378274412e-05, - "loss": 0.5925, - "step": 2915 - }, - { - "epoch": 1.61, - "grad_norm": 0.07407297194004059, - "learning_rate": 1.7226014592841144e-05, - "loss": 0.5098, - "step": 2916 - }, - { - "epoch": 1.61, - "grad_norm": 0.07840467989444733, - "learning_rate": 1.7176888932083568e-05, - "loss": 0.5716, - "step": 2917 - }, - { - "epoch": 1.61, - "grad_norm": 0.08341764658689499, - "learning_rate": 1.7127826838175442e-05, - "loss": 0.6606, - "step": 2918 - }, - { - "epoch": 1.61, - "grad_norm": 0.08113296329975128, - "learning_rate": 1.7078828348771935e-05, - "loss": 0.621, - "step": 2919 - }, - { - "epoch": 1.61, - "grad_norm": 0.07104890048503876, - "learning_rate": 1.702989350147948e-05, - "loss": 0.485, - "step": 2920 - }, - { - "epoch": 1.61, - "grad_norm": 0.07283183932304382, - "learning_rate": 1.6981022333855667e-05, - "loss": 0.551, - "step": 2921 - }, - { - "epoch": 1.61, - "grad_norm": 0.07866816967725754, - "learning_rate": 1.6932214883409135e-05, - "loss": 0.6187, - "step": 2922 - }, - { - "epoch": 1.61, - "grad_norm": 0.07849008589982986, - "learning_rate": 1.688347118759972e-05, - "loss": 0.6274, - "step": 2923 - }, - { - "epoch": 1.61, - "grad_norm": 0.07671231031417847, - "learning_rate": 1.6834791283838215e-05, - "loss": 0.5543, - "step": 2924 - }, - { - "epoch": 1.61, - "grad_norm": 
0.07396913319826126, - "learning_rate": 1.6786175209486566e-05, - "loss": 0.5548, - "step": 2925 - }, - { - "epoch": 1.61, - "grad_norm": 0.07642372697591782, - "learning_rate": 1.6737623001857618e-05, - "loss": 0.687, - "step": 2926 - }, - { - "epoch": 1.61, - "grad_norm": 0.06577810645103455, - "learning_rate": 1.6689134698215325e-05, - "loss": 0.4493, - "step": 2927 - }, - { - "epoch": 1.61, - "grad_norm": 0.08086547255516052, - "learning_rate": 1.6640710335774457e-05, - "loss": 0.5445, - "step": 2928 - }, - { - "epoch": 1.61, - "grad_norm": 0.07448156177997589, - "learning_rate": 1.6592349951700813e-05, - "loss": 0.5462, - "step": 2929 - }, - { - "epoch": 1.61, - "grad_norm": 0.07236510515213013, - "learning_rate": 1.6544053583111075e-05, - "loss": 0.5746, - "step": 2930 - }, - { - "epoch": 1.61, - "grad_norm": 0.08082794398069382, - "learning_rate": 1.649582126707272e-05, - "loss": 0.5823, - "step": 2931 - }, - { - "epoch": 1.61, - "grad_norm": 0.07060989737510681, - "learning_rate": 1.644765304060416e-05, - "loss": 0.5668, - "step": 2932 - }, - { - "epoch": 1.61, - "grad_norm": 0.09212745726108551, - "learning_rate": 1.6399548940674536e-05, - "loss": 0.6745, - "step": 2933 - }, - { - "epoch": 1.62, - "grad_norm": 0.07585174590349197, - "learning_rate": 1.6351509004203857e-05, - "loss": 0.5716, - "step": 2934 - }, - { - "epoch": 1.62, - "grad_norm": 0.07427546381950378, - "learning_rate": 1.6303533268062778e-05, - "loss": 0.5363, - "step": 2935 - }, - { - "epoch": 1.62, - "grad_norm": 0.0801323726773262, - "learning_rate": 1.6255621769072805e-05, - "loss": 0.621, - "step": 2936 - }, - { - "epoch": 1.62, - "grad_norm": 0.07136417180299759, - "learning_rate": 1.6207774544006016e-05, - "loss": 0.5717, - "step": 2937 - }, - { - "epoch": 1.62, - "grad_norm": 0.08079267293214798, - "learning_rate": 1.6159991629585248e-05, - "loss": 0.5589, - "step": 2938 - }, - { - "epoch": 1.62, - "grad_norm": 0.07775907218456268, - "learning_rate": 1.6112273062483985e-05, - "loss": 0.509, - "step": 2939 - }, - { - "epoch": 1.62, - "grad_norm": 0.07815612852573395, - "learning_rate": 1.6064618879326242e-05, - "loss": 0.6139, - "step": 2940 - }, - { - "epoch": 1.62, - "grad_norm": 0.07643534988164902, - "learning_rate": 1.6017029116686688e-05, - "loss": 0.703, - "step": 2941 - }, - { - "epoch": 1.62, - "grad_norm": 0.07673238962888718, - "learning_rate": 1.5969503811090525e-05, - "loss": 0.5453, - "step": 2942 - }, - { - "epoch": 1.62, - "grad_norm": 0.07304929941892624, - "learning_rate": 1.592204299901353e-05, - "loss": 0.5945, - "step": 2943 - }, - { - "epoch": 1.62, - "grad_norm": 0.06565451622009277, - "learning_rate": 1.587464671688187e-05, - "loss": 0.5119, - "step": 2944 - }, - { - "epoch": 1.62, - "grad_norm": 0.06776011735200882, - "learning_rate": 1.5827315001072318e-05, - "loss": 0.4697, - "step": 2945 - }, - { - "epoch": 1.62, - "grad_norm": 0.07718344032764435, - "learning_rate": 1.5780047887911987e-05, - "loss": 0.5887, - "step": 2946 - }, - { - "epoch": 1.62, - "grad_norm": 0.08132103085517883, - "learning_rate": 1.5732845413678477e-05, - "loss": 0.6108, - "step": 2947 - }, - { - "epoch": 1.62, - "grad_norm": 0.07473303377628326, - "learning_rate": 1.568570761459972e-05, - "loss": 0.5892, - "step": 2948 - }, - { - "epoch": 1.62, - "grad_norm": 0.08634821325540543, - "learning_rate": 1.563863452685409e-05, - "loss": 0.5048, - "step": 2949 - }, - { - "epoch": 1.62, - "grad_norm": 0.08100888878107071, - "learning_rate": 1.5591626186570183e-05, - "loss": 0.5892, - "step": 2950 - }, - { - 
"epoch": 1.62, - "grad_norm": 0.07569902390241623, - "learning_rate": 1.5544682629827002e-05, - "loss": 0.5609, - "step": 2951 - }, - { - "epoch": 1.63, - "grad_norm": 0.08422853797674179, - "learning_rate": 1.5497803892653807e-05, - "loss": 0.5317, - "step": 2952 - }, - { - "epoch": 1.63, - "grad_norm": 0.0842873826622963, - "learning_rate": 1.5450990011030044e-05, - "loss": 0.5676, - "step": 2953 - }, - { - "epoch": 1.63, - "grad_norm": 0.08060505241155624, - "learning_rate": 1.540424102088548e-05, - "loss": 0.5739, - "step": 2954 - }, - { - "epoch": 1.63, - "grad_norm": 0.0765761137008667, - "learning_rate": 1.5357556958099973e-05, - "loss": 0.5434, - "step": 2955 - }, - { - "epoch": 1.63, - "grad_norm": 0.07481849938631058, - "learning_rate": 1.531093785850366e-05, - "loss": 0.5044, - "step": 2956 - }, - { - "epoch": 1.63, - "grad_norm": 0.08106032758951187, - "learning_rate": 1.526438375787671e-05, - "loss": 0.6152, - "step": 2957 - }, - { - "epoch": 1.63, - "grad_norm": 0.08591877669095993, - "learning_rate": 1.521789469194952e-05, - "loss": 0.5657, - "step": 2958 - }, - { - "epoch": 1.63, - "grad_norm": 0.07527689635753632, - "learning_rate": 1.5171470696402446e-05, - "loss": 0.6218, - "step": 2959 - }, - { - "epoch": 1.63, - "grad_norm": 0.07529833912849426, - "learning_rate": 1.512511180686601e-05, - "loss": 0.589, - "step": 2960 - }, - { - "epoch": 1.63, - "grad_norm": 0.07544808834791183, - "learning_rate": 1.5078818058920696e-05, - "loss": 0.5448, - "step": 2961 - }, - { - "epoch": 1.63, - "grad_norm": 0.07565164566040039, - "learning_rate": 1.5032589488097038e-05, - "loss": 0.5945, - "step": 2962 - }, - { - "epoch": 1.63, - "grad_norm": 0.0887230783700943, - "learning_rate": 1.498642612987553e-05, - "loss": 0.6773, - "step": 2963 - }, - { - "epoch": 1.63, - "grad_norm": 0.0782867819070816, - "learning_rate": 1.4940328019686589e-05, - "loss": 0.5615, - "step": 2964 - }, - { - "epoch": 1.63, - "grad_norm": 0.07043831050395966, - "learning_rate": 1.48942951929106e-05, - "loss": 0.5101, - "step": 2965 - }, - { - "epoch": 1.63, - "grad_norm": 0.06438145786523819, - "learning_rate": 1.4848327684877794e-05, - "loss": 0.4303, - "step": 2966 - }, - { - "epoch": 1.63, - "grad_norm": 0.06960533559322357, - "learning_rate": 1.4802425530868324e-05, - "loss": 0.4657, - "step": 2967 - }, - { - "epoch": 1.63, - "grad_norm": 0.07519534230232239, - "learning_rate": 1.4756588766112133e-05, - "loss": 0.5149, - "step": 2968 - }, - { - "epoch": 1.63, - "grad_norm": 0.07465105503797531, - "learning_rate": 1.4710817425789014e-05, - "loss": 0.5502, - "step": 2969 - }, - { - "epoch": 1.64, - "grad_norm": 0.08811614662408829, - "learning_rate": 1.466511154502852e-05, - "loss": 0.6154, - "step": 2970 - }, - { - "epoch": 1.64, - "grad_norm": 0.0730571523308754, - "learning_rate": 1.461947115890997e-05, - "loss": 0.545, - "step": 2971 - }, - { - "epoch": 1.64, - "grad_norm": 0.07383215427398682, - "learning_rate": 1.4573896302462464e-05, - "loss": 0.5388, - "step": 2972 - }, - { - "epoch": 1.64, - "grad_norm": 0.07530464231967926, - "learning_rate": 1.4528387010664701e-05, - "loss": 0.5037, - "step": 2973 - }, - { - "epoch": 1.64, - "grad_norm": 0.06940647214651108, - "learning_rate": 1.4482943318445197e-05, - "loss": 0.6025, - "step": 2974 - }, - { - "epoch": 1.64, - "grad_norm": 0.07640254497528076, - "learning_rate": 1.4437565260681974e-05, - "loss": 0.5243, - "step": 2975 - }, - { - "epoch": 1.64, - "grad_norm": 0.07426881045103073, - "learning_rate": 1.4392252872202816e-05, - "loss": 0.5138, - "step": 
2976 - }, - { - "epoch": 1.64, - "grad_norm": 0.08832768350839615, - "learning_rate": 1.4347006187784995e-05, - "loss": 0.6679, - "step": 2977 - }, - { - "epoch": 1.64, - "grad_norm": 0.08714038133621216, - "learning_rate": 1.4301825242155441e-05, - "loss": 0.5764, - "step": 2978 - }, - { - "epoch": 1.64, - "grad_norm": 0.07089292258024216, - "learning_rate": 1.4256710069990542e-05, - "loss": 0.62, - "step": 2979 - }, - { - "epoch": 1.64, - "grad_norm": 0.07467392086982727, - "learning_rate": 1.4211660705916285e-05, - "loss": 0.5115, - "step": 2980 - }, - { - "epoch": 1.64, - "grad_norm": 0.08002919703722, - "learning_rate": 1.4166677184508136e-05, - "loss": 0.6898, - "step": 2981 - }, - { - "epoch": 1.64, - "grad_norm": 0.07483605295419693, - "learning_rate": 1.4121759540290969e-05, - "loss": 0.5999, - "step": 2982 - }, - { - "epoch": 1.64, - "grad_norm": 0.08835967630147934, - "learning_rate": 1.4076907807739182e-05, - "loss": 0.5828, - "step": 2983 - }, - { - "epoch": 1.64, - "grad_norm": 0.07726197689771652, - "learning_rate": 1.4032122021276484e-05, - "loss": 0.6079, - "step": 2984 - }, - { - "epoch": 1.64, - "grad_norm": 0.07789423316717148, - "learning_rate": 1.398740221527608e-05, - "loss": 0.5703, - "step": 2985 - }, - { - "epoch": 1.64, - "grad_norm": 0.09219390153884888, - "learning_rate": 1.3942748424060425e-05, - "loss": 0.807, - "step": 2986 - }, - { - "epoch": 1.64, - "grad_norm": 0.07657124847173691, - "learning_rate": 1.3898160681901429e-05, - "loss": 0.5378, - "step": 2987 - }, - { - "epoch": 1.65, - "grad_norm": 0.07728318125009537, - "learning_rate": 1.3853639023020181e-05, - "loss": 0.6365, - "step": 2988 - }, - { - "epoch": 1.65, - "grad_norm": 0.08017481863498688, - "learning_rate": 1.3809183481587152e-05, - "loss": 0.5895, - "step": 2989 - }, - { - "epoch": 1.65, - "grad_norm": 0.09209847450256348, - "learning_rate": 1.3764794091722033e-05, - "loss": 0.6123, - "step": 2990 - }, - { - "epoch": 1.65, - "grad_norm": 0.07465118914842606, - "learning_rate": 1.3720470887493719e-05, - "loss": 0.5438, - "step": 2991 - }, - { - "epoch": 1.65, - "grad_norm": 0.08009976893663406, - "learning_rate": 1.3676213902920354e-05, - "loss": 0.5507, - "step": 2992 - }, - { - "epoch": 1.65, - "grad_norm": 0.07504164427518845, - "learning_rate": 1.36320231719692e-05, - "loss": 0.5589, - "step": 2993 - }, - { - "epoch": 1.65, - "grad_norm": 0.0721011683344841, - "learning_rate": 1.3587898728556747e-05, - "loss": 0.4541, - "step": 2994 - }, - { - "epoch": 1.65, - "grad_norm": 0.07778453081846237, - "learning_rate": 1.354384060654852e-05, - "loss": 0.5965, - "step": 2995 - }, - { - "epoch": 1.65, - "grad_norm": 0.08092589676380157, - "learning_rate": 1.3499848839759222e-05, - "loss": 0.5814, - "step": 2996 - }, - { - "epoch": 1.65, - "grad_norm": 0.078467458486557, - "learning_rate": 1.3455923461952558e-05, - "loss": 0.5764, - "step": 2997 - }, - { - "epoch": 1.65, - "grad_norm": 0.07226195186376572, - "learning_rate": 1.3412064506841327e-05, - "loss": 0.5042, - "step": 2998 - }, - { - "epoch": 1.65, - "grad_norm": 0.07678015530109406, - "learning_rate": 1.336827200808738e-05, - "loss": 0.5554, - "step": 2999 - }, - { - "epoch": 1.65, - "grad_norm": 0.07457426935434341, - "learning_rate": 1.3324545999301452e-05, - "loss": 0.6258, - "step": 3000 - }, - { - "epoch": 1.65, - "grad_norm": 0.0747811570763588, - "learning_rate": 1.3280886514043367e-05, - "loss": 0.5506, - "step": 3001 - }, - { - "epoch": 1.65, - "grad_norm": 0.07037195563316345, - "learning_rate": 1.3237293585821786e-05, - "loss": 
0.4983, - "step": 3002 - }, - { - "epoch": 1.65, - "grad_norm": 0.06816434115171432, - "learning_rate": 1.3193767248094402e-05, - "loss": 0.4848, - "step": 3003 - }, - { - "epoch": 1.65, - "grad_norm": 0.07628194242715836, - "learning_rate": 1.315030753426768e-05, - "loss": 0.5544, - "step": 3004 - }, - { - "epoch": 1.65, - "grad_norm": 0.08764028549194336, - "learning_rate": 1.3106914477697063e-05, - "loss": 0.5445, - "step": 3005 - }, - { - "epoch": 1.66, - "grad_norm": 0.08041933923959732, - "learning_rate": 1.306358811168673e-05, - "loss": 0.5387, - "step": 3006 - }, - { - "epoch": 1.66, - "grad_norm": 0.07397464662790298, - "learning_rate": 1.302032846948975e-05, - "loss": 0.4767, - "step": 3007 - }, - { - "epoch": 1.66, - "grad_norm": 0.06808964163064957, - "learning_rate": 1.2977135584307964e-05, - "loss": 0.5131, - "step": 3008 - }, - { - "epoch": 1.66, - "grad_norm": 0.07995598763227463, - "learning_rate": 1.2934009489291955e-05, - "loss": 0.605, - "step": 3009 - }, - { - "epoch": 1.66, - "grad_norm": 0.0879918783903122, - "learning_rate": 1.2890950217541053e-05, - "loss": 0.5728, - "step": 3010 - }, - { - "epoch": 1.66, - "grad_norm": 0.07948015630245209, - "learning_rate": 1.2847957802103317e-05, - "loss": 0.5891, - "step": 3011 - }, - { - "epoch": 1.66, - "grad_norm": 0.07937068492174149, - "learning_rate": 1.2805032275975514e-05, - "loss": 0.5653, - "step": 3012 - }, - { - "epoch": 1.66, - "grad_norm": 0.0827297791838646, - "learning_rate": 1.2762173672102996e-05, - "loss": 0.549, - "step": 3013 - }, - { - "epoch": 1.66, - "grad_norm": 0.08054407685995102, - "learning_rate": 1.2719382023379834e-05, - "loss": 0.553, - "step": 3014 - }, - { - "epoch": 1.66, - "grad_norm": 0.06850113719701767, - "learning_rate": 1.2676657362648636e-05, - "loss": 0.5766, - "step": 3015 - }, - { - "epoch": 1.66, - "grad_norm": 0.08243125677108765, - "learning_rate": 1.2633999722700684e-05, - "loss": 0.6332, - "step": 3016 - }, - { - "epoch": 1.66, - "grad_norm": 0.08675704896450043, - "learning_rate": 1.2591409136275723e-05, - "loss": 0.6286, - "step": 3017 - }, - { - "epoch": 1.66, - "grad_norm": 0.07562907040119171, - "learning_rate": 1.2548885636062146e-05, - "loss": 0.5745, - "step": 3018 - }, - { - "epoch": 1.66, - "grad_norm": 0.07691862434148788, - "learning_rate": 1.250642925469674e-05, - "loss": 0.7418, - "step": 3019 - }, - { - "epoch": 1.66, - "grad_norm": 0.08036176860332489, - "learning_rate": 1.2464040024764855e-05, - "loss": 0.5572, - "step": 3020 - }, - { - "epoch": 1.66, - "grad_norm": 0.06832167506217957, - "learning_rate": 1.2421717978800306e-05, - "loss": 0.56, - "step": 3021 - }, - { - "epoch": 1.66, - "grad_norm": 0.07241854816675186, - "learning_rate": 1.2379463149285286e-05, - "loss": 0.5392, - "step": 3022 - }, - { - "epoch": 1.66, - "grad_norm": 0.07803373783826828, - "learning_rate": 1.2337275568650464e-05, - "loss": 0.5652, - "step": 3023 - }, - { - "epoch": 1.67, - "grad_norm": 0.07604583352804184, - "learning_rate": 1.2295155269274827e-05, - "loss": 0.5596, - "step": 3024 - }, - { - "epoch": 1.67, - "grad_norm": 0.07595550268888474, - "learning_rate": 1.2253102283485807e-05, - "loss": 0.6225, - "step": 3025 - }, - { - "epoch": 1.67, - "grad_norm": 0.08902128040790558, - "learning_rate": 1.2211116643559083e-05, - "loss": 0.712, - "step": 3026 - }, - { - "epoch": 1.67, - "grad_norm": 0.07837363332509995, - "learning_rate": 1.2169198381718726e-05, - "loss": 0.5764, - "step": 3027 - }, - { - "epoch": 1.67, - "grad_norm": 0.0752459168434143, - "learning_rate": 
1.2127347530137034e-05, - "loss": 0.5559, - "step": 3028 - }, - { - "epoch": 1.67, - "grad_norm": 0.08141879737377167, - "learning_rate": 1.20855641209346e-05, - "loss": 0.5588, - "step": 3029 - }, - { - "epoch": 1.67, - "grad_norm": 0.08007825911045074, - "learning_rate": 1.2043848186180262e-05, - "loss": 0.5518, - "step": 3030 - }, - { - "epoch": 1.67, - "grad_norm": 0.07503150403499603, - "learning_rate": 1.2002199757891031e-05, - "loss": 0.5589, - "step": 3031 - }, - { - "epoch": 1.67, - "grad_norm": 0.08165138959884644, - "learning_rate": 1.1960618868032159e-05, - "loss": 0.6055, - "step": 3032 - }, - { - "epoch": 1.67, - "grad_norm": 0.07894934713840485, - "learning_rate": 1.1919105548516996e-05, - "loss": 0.5359, - "step": 3033 - }, - { - "epoch": 1.67, - "grad_norm": 0.07462869584560394, - "learning_rate": 1.1877659831207122e-05, - "loss": 0.5296, - "step": 3034 - }, - { - "epoch": 1.67, - "grad_norm": 0.08617985993623734, - "learning_rate": 1.1836281747912125e-05, - "loss": 0.5807, - "step": 3035 - }, - { - "epoch": 1.67, - "grad_norm": 0.07207958400249481, - "learning_rate": 1.1794971330389792e-05, - "loss": 0.4785, - "step": 3036 - }, - { - "epoch": 1.67, - "grad_norm": 0.08790726959705353, - "learning_rate": 1.1753728610345883e-05, - "loss": 0.6675, - "step": 3037 - }, - { - "epoch": 1.67, - "grad_norm": 0.08325272798538208, - "learning_rate": 1.1712553619434252e-05, - "loss": 0.6261, - "step": 3038 - }, - { - "epoch": 1.67, - "grad_norm": 0.07597482204437256, - "learning_rate": 1.167144638925679e-05, - "loss": 0.5318, - "step": 3039 - }, - { - "epoch": 1.67, - "grad_norm": 0.08048231899738312, - "learning_rate": 1.1630406951363314e-05, - "loss": 0.5865, - "step": 3040 - }, - { - "epoch": 1.67, - "grad_norm": 0.08627559244632721, - "learning_rate": 1.1589435337251676e-05, - "loss": 0.6283, - "step": 3041 - }, - { - "epoch": 1.68, - "grad_norm": 0.07512358576059341, - "learning_rate": 1.1548531578367604e-05, - "loss": 0.5793, - "step": 3042 - }, - { - "epoch": 1.68, - "grad_norm": 0.08154304325580597, - "learning_rate": 1.1507695706104849e-05, - "loss": 0.5347, - "step": 3043 - }, - { - "epoch": 1.68, - "grad_norm": 0.0808219239115715, - "learning_rate": 1.1466927751804935e-05, - "loss": 0.6504, - "step": 3044 - }, - { - "epoch": 1.68, - "grad_norm": 0.07881506532430649, - "learning_rate": 1.1426227746757378e-05, - "loss": 0.5969, - "step": 3045 - }, - { - "epoch": 1.68, - "grad_norm": 0.0822736844420433, - "learning_rate": 1.1385595722199438e-05, - "loss": 0.5548, - "step": 3046 - }, - { - "epoch": 1.68, - "grad_norm": 0.07585962116718292, - "learning_rate": 1.134503170931629e-05, - "loss": 0.5943, - "step": 3047 - }, - { - "epoch": 1.68, - "grad_norm": 0.08921745419502258, - "learning_rate": 1.130453573924083e-05, - "loss": 0.6943, - "step": 3048 - }, - { - "epoch": 1.68, - "grad_norm": 0.08134070783853531, - "learning_rate": 1.1264107843053783e-05, - "loss": 0.5871, - "step": 3049 - }, - { - "epoch": 1.68, - "grad_norm": 0.09125948697328568, - "learning_rate": 1.1223748051783644e-05, - "loss": 0.5844, - "step": 3050 - }, - { - "epoch": 1.68, - "grad_norm": 0.07791607826948166, - "learning_rate": 1.1183456396406556e-05, - "loss": 0.6624, - "step": 3051 - }, - { - "epoch": 1.68, - "grad_norm": 0.07797951251268387, - "learning_rate": 1.1143232907846478e-05, - "loss": 0.5632, - "step": 3052 - }, - { - "epoch": 1.68, - "grad_norm": 0.07615441083908081, - "learning_rate": 1.1103077616974932e-05, - "loss": 0.5388, - "step": 3053 - }, - { - "epoch": 1.68, - "grad_norm": 
0.08791297674179077, - "learning_rate": 1.1062990554611207e-05, - "loss": 0.5462, - "step": 3054 - }, - { - "epoch": 1.68, - "grad_norm": 0.0754203200340271, - "learning_rate": 1.1022971751522127e-05, - "loss": 0.5189, - "step": 3055 - }, - { - "epoch": 1.68, - "grad_norm": 0.07922713458538055, - "learning_rate": 1.0983021238422242e-05, - "loss": 0.6262, - "step": 3056 - }, - { - "epoch": 1.68, - "grad_norm": 0.08247672766447067, - "learning_rate": 1.0943139045973549e-05, - "loss": 0.6003, - "step": 3057 - }, - { - "epoch": 1.68, - "grad_norm": 0.08441609889268875, - "learning_rate": 1.0903325204785742e-05, - "loss": 0.624, - "step": 3058 - }, - { - "epoch": 1.68, - "grad_norm": 0.0722791850566864, - "learning_rate": 1.0863579745415997e-05, - "loss": 0.5629, - "step": 3059 - }, - { - "epoch": 1.69, - "grad_norm": 0.08199836313724518, - "learning_rate": 1.082390269836896e-05, - "loss": 0.5925, - "step": 3060 - }, - { - "epoch": 1.69, - "grad_norm": 0.06521271169185638, - "learning_rate": 1.0784294094096881e-05, - "loss": 0.5251, - "step": 3061 - }, - { - "epoch": 1.69, - "grad_norm": 0.07767980545759201, - "learning_rate": 1.0744753962999355e-05, - "loss": 0.5666, - "step": 3062 - }, - { - "epoch": 1.69, - "grad_norm": 0.07202478498220444, - "learning_rate": 1.0705282335423539e-05, - "loss": 0.4518, - "step": 3063 - }, - { - "epoch": 1.69, - "grad_norm": 0.07885854691267014, - "learning_rate": 1.0665879241663922e-05, - "loss": 0.5496, - "step": 3064 - }, - { - "epoch": 1.69, - "grad_norm": 0.07686977833509445, - "learning_rate": 1.062654471196246e-05, - "loss": 0.4803, - "step": 3065 - }, - { - "epoch": 1.69, - "grad_norm": 0.08224175125360489, - "learning_rate": 1.0587278776508424e-05, - "loss": 0.5551, - "step": 3066 - }, - { - "epoch": 1.69, - "grad_norm": 0.08237634599208832, - "learning_rate": 1.0548081465438497e-05, - "loss": 0.595, - "step": 3067 - }, - { - "epoch": 1.69, - "grad_norm": 0.07160717993974686, - "learning_rate": 1.050895280883668e-05, - "loss": 0.5952, - "step": 3068 - }, - { - "epoch": 1.69, - "grad_norm": 0.08529774099588394, - "learning_rate": 1.046989283673424e-05, - "loss": 0.5683, - "step": 3069 - }, - { - "epoch": 1.69, - "grad_norm": 0.0765925943851471, - "learning_rate": 1.0430901579109787e-05, - "loss": 0.5274, - "step": 3070 - }, - { - "epoch": 1.69, - "grad_norm": 0.07462650537490845, - "learning_rate": 1.0391979065889135e-05, - "loss": 0.5388, - "step": 3071 - }, - { - "epoch": 1.69, - "grad_norm": 0.07412373274564743, - "learning_rate": 1.035312532694539e-05, - "loss": 0.5452, - "step": 3072 - }, - { - "epoch": 1.69, - "grad_norm": 0.08629931509494781, - "learning_rate": 1.031434039209883e-05, - "loss": 0.6178, - "step": 3073 - }, - { - "epoch": 1.69, - "grad_norm": 0.08814838528633118, - "learning_rate": 1.0275624291116958e-05, - "loss": 0.5785, - "step": 3074 - }, - { - "epoch": 1.69, - "grad_norm": 0.07783772051334381, - "learning_rate": 1.0236977053714414e-05, - "loss": 0.5858, - "step": 3075 - }, - { - "epoch": 1.69, - "grad_norm": 0.07740164548158646, - "learning_rate": 1.0198398709553025e-05, - "loss": 0.5905, - "step": 3076 - }, - { - "epoch": 1.69, - "grad_norm": 0.07513068616390228, - "learning_rate": 1.0159889288241731e-05, - "loss": 0.5925, - "step": 3077 - }, - { - "epoch": 1.7, - "grad_norm": 0.07727821171283722, - "learning_rate": 1.0121448819336532e-05, - "loss": 0.4903, - "step": 3078 - }, - { - "epoch": 1.7, - "grad_norm": 0.07482651621103287, - "learning_rate": 1.0083077332340562e-05, - "loss": 0.6038, - "step": 3079 - }, - { - "epoch": 
1.7, - "grad_norm": 0.08040370792150497, - "learning_rate": 1.0044774856703976e-05, - "loss": 0.5698, - "step": 3080 - }, - { - "epoch": 1.7, - "grad_norm": 0.07068987935781479, - "learning_rate": 1.0006541421824012e-05, - "loss": 0.5326, - "step": 3081 - }, - { - "epoch": 1.7, - "grad_norm": 0.08372973650693893, - "learning_rate": 9.968377057044831e-06, - "loss": 0.6151, - "step": 3082 - }, - { - "epoch": 1.7, - "grad_norm": 0.07655537128448486, - "learning_rate": 9.93028179165768e-06, - "loss": 0.5554, - "step": 3083 - }, - { - "epoch": 1.7, - "grad_norm": 0.07460279017686844, - "learning_rate": 9.892255654900695e-06, - "loss": 0.5039, - "step": 3084 - }, - { - "epoch": 1.7, - "grad_norm": 0.07027482986450195, - "learning_rate": 9.854298675959007e-06, - "loss": 0.548, - "step": 3085 - }, - { - "epoch": 1.7, - "grad_norm": 0.09165271371603012, - "learning_rate": 9.816410883964623e-06, - "loss": 0.575, - "step": 3086 - }, - { - "epoch": 1.7, - "grad_norm": 0.08436719328165054, - "learning_rate": 9.778592307996504e-06, - "loss": 0.6398, - "step": 3087 - }, - { - "epoch": 1.7, - "grad_norm": 0.07201489061117172, - "learning_rate": 9.740842977080433e-06, - "loss": 0.5213, - "step": 3088 - }, - { - "epoch": 1.7, - "grad_norm": 0.0690862387418747, - "learning_rate": 9.703162920189079e-06, - "loss": 0.4581, - "step": 3089 - }, - { - "epoch": 1.7, - "grad_norm": 0.0757426768541336, - "learning_rate": 9.665552166241964e-06, - "loss": 0.5442, - "step": 3090 - }, - { - "epoch": 1.7, - "grad_norm": 0.07909826934337616, - "learning_rate": 9.628010744105353e-06, - "loss": 0.5935, - "step": 3091 - }, - { - "epoch": 1.7, - "grad_norm": 0.07645463198423386, - "learning_rate": 9.590538682592376e-06, - "loss": 0.5586, - "step": 3092 - }, - { - "epoch": 1.7, - "grad_norm": 0.07282956689596176, - "learning_rate": 9.55313601046285e-06, - "loss": 0.5714, - "step": 3093 - }, - { - "epoch": 1.7, - "grad_norm": 0.06841188669204712, - "learning_rate": 9.515802756423409e-06, - "loss": 0.5364, - "step": 3094 - }, - { - "epoch": 1.7, - "grad_norm": 0.08608322590589523, - "learning_rate": 9.478538949127346e-06, - "loss": 0.5332, - "step": 3095 - }, - { - "epoch": 1.71, - "grad_norm": 0.08561312407255173, - "learning_rate": 9.441344617174718e-06, - "loss": 0.6552, - "step": 3096 - }, - { - "epoch": 1.71, - "grad_norm": 0.07228785008192062, - "learning_rate": 9.40421978911219e-06, - "loss": 0.5415, - "step": 3097 - }, - { - "epoch": 1.71, - "grad_norm": 0.07194254547357559, - "learning_rate": 9.367164493433133e-06, - "loss": 0.5126, - "step": 3098 - }, - { - "epoch": 1.71, - "grad_norm": 0.07450117915868759, - "learning_rate": 9.330178758577568e-06, - "loss": 0.5843, - "step": 3099 - }, - { - "epoch": 1.71, - "grad_norm": 0.07577648013830185, - "learning_rate": 9.29326261293203e-06, - "loss": 0.5851, - "step": 3100 - }, - { - "epoch": 1.71, - "grad_norm": 0.07993178069591522, - "learning_rate": 9.256416084829778e-06, - "loss": 0.527, - "step": 3101 - }, - { - "epoch": 1.71, - "grad_norm": 0.07520022988319397, - "learning_rate": 9.219639202550523e-06, - "loss": 0.5325, - "step": 3102 - }, - { - "epoch": 1.71, - "grad_norm": 0.08247778564691544, - "learning_rate": 9.18293199432061e-06, - "loss": 0.533, - "step": 3103 - }, - { - "epoch": 1.71, - "grad_norm": 0.07813543826341629, - "learning_rate": 9.146294488312823e-06, - "loss": 0.5314, - "step": 3104 - }, - { - "epoch": 1.71, - "grad_norm": 0.075026735663414, - "learning_rate": 9.109726712646548e-06, - "loss": 0.5245, - "step": 3105 - }, - { - "epoch": 1.71, - 
"grad_norm": 0.07687216252088547, - "learning_rate": 9.07322869538756e-06, - "loss": 0.5349, - "step": 3106 - }, - { - "epoch": 1.71, - "grad_norm": 0.07420308142900467, - "learning_rate": 9.036800464548157e-06, - "loss": 0.5751, - "step": 3107 - }, - { - "epoch": 1.71, - "grad_norm": 0.07741032540798187, - "learning_rate": 9.000442048087076e-06, - "loss": 0.5099, - "step": 3108 - }, - { - "epoch": 1.71, - "grad_norm": 0.08252329379320145, - "learning_rate": 8.964153473909397e-06, - "loss": 0.5713, - "step": 3109 - }, - { - "epoch": 1.71, - "grad_norm": 0.07514884322881699, - "learning_rate": 8.927934769866719e-06, - "loss": 0.5836, - "step": 3110 - }, - { - "epoch": 1.71, - "grad_norm": 0.07817213237285614, - "learning_rate": 8.891785963756872e-06, - "loss": 0.66, - "step": 3111 - }, - { - "epoch": 1.71, - "grad_norm": 0.0779666155576706, - "learning_rate": 8.855707083324183e-06, - "loss": 0.6034, - "step": 3112 - }, - { - "epoch": 1.71, - "grad_norm": 0.07593736797571182, - "learning_rate": 8.819698156259182e-06, - "loss": 0.5346, - "step": 3113 - }, - { - "epoch": 1.72, - "grad_norm": 0.08052858710289001, - "learning_rate": 8.783759210198805e-06, - "loss": 0.5325, - "step": 3114 - }, - { - "epoch": 1.72, - "grad_norm": 0.08734001964330673, - "learning_rate": 8.747890272726222e-06, - "loss": 0.6829, - "step": 3115 - }, - { - "epoch": 1.72, - "grad_norm": 0.08773903548717499, - "learning_rate": 8.712091371370912e-06, - "loss": 0.5986, - "step": 3116 - }, - { - "epoch": 1.72, - "grad_norm": 0.07741262018680573, - "learning_rate": 8.676362533608573e-06, - "loss": 0.474, - "step": 3117 - }, - { - "epoch": 1.72, - "grad_norm": 0.08860704302787781, - "learning_rate": 8.640703786861116e-06, - "loss": 0.5966, - "step": 3118 - }, - { - "epoch": 1.72, - "grad_norm": 0.0868702232837677, - "learning_rate": 8.605115158496713e-06, - "loss": 0.566, - "step": 3119 - }, - { - "epoch": 1.72, - "grad_norm": 0.07514653354883194, - "learning_rate": 8.56959667582965e-06, - "loss": 0.5436, - "step": 3120 - }, - { - "epoch": 1.72, - "grad_norm": 0.06962697952985764, - "learning_rate": 8.534148366120432e-06, - "loss": 0.4152, - "step": 3121 - }, - { - "epoch": 1.72, - "grad_norm": 0.0840853601694107, - "learning_rate": 8.498770256575662e-06, - "loss": 0.642, - "step": 3122 - }, - { - "epoch": 1.72, - "grad_norm": 0.07120905071496964, - "learning_rate": 8.46346237434813e-06, - "loss": 0.5534, - "step": 3123 - }, - { - "epoch": 1.72, - "grad_norm": 0.0809093713760376, - "learning_rate": 8.428224746536627e-06, - "loss": 0.6058, - "step": 3124 - }, - { - "epoch": 1.72, - "grad_norm": 0.07151578366756439, - "learning_rate": 8.39305740018611e-06, - "loss": 0.5478, - "step": 3125 - }, - { - "epoch": 1.72, - "grad_norm": 0.07958785444498062, - "learning_rate": 8.357960362287587e-06, - "loss": 0.5557, - "step": 3126 - }, - { - "epoch": 1.72, - "grad_norm": 0.08158773183822632, - "learning_rate": 8.322933659778032e-06, - "loss": 0.5649, - "step": 3127 - }, - { - "epoch": 1.72, - "grad_norm": 0.08477183431386948, - "learning_rate": 8.287977319540541e-06, - "loss": 0.5418, - "step": 3128 - }, - { - "epoch": 1.72, - "grad_norm": 0.08409066498279572, - "learning_rate": 8.253091368404098e-06, - "loss": 0.5837, - "step": 3129 - }, - { - "epoch": 1.72, - "grad_norm": 0.07809256762266159, - "learning_rate": 8.218275833143752e-06, - "loss": 0.5635, - "step": 3130 - }, - { - "epoch": 1.72, - "grad_norm": 0.07307461649179459, - "learning_rate": 8.18353074048046e-06, - "loss": 0.583, - "step": 3131 - }, - { - "epoch": 1.73, - 
"grad_norm": 0.09218110144138336, - "learning_rate": 8.14885611708115e-06, - "loss": 0.5929, - "step": 3132 - }, - { - "epoch": 1.73, - "grad_norm": 0.07997137308120728, - "learning_rate": 8.114251989558596e-06, - "loss": 0.5021, - "step": 3133 - }, - { - "epoch": 1.73, - "grad_norm": 0.07108749449253082, - "learning_rate": 8.079718384471557e-06, - "loss": 0.5638, - "step": 3134 - }, - { - "epoch": 1.73, - "grad_norm": 0.08197928965091705, - "learning_rate": 8.045255328324596e-06, - "loss": 0.58, - "step": 3135 - }, - { - "epoch": 1.73, - "grad_norm": 0.08596963435411453, - "learning_rate": 8.010862847568168e-06, - "loss": 0.5852, - "step": 3136 - }, - { - "epoch": 1.73, - "grad_norm": 0.07822061330080032, - "learning_rate": 7.976540968598555e-06, - "loss": 0.5436, - "step": 3137 - }, - { - "epoch": 1.73, - "grad_norm": 0.07026013731956482, - "learning_rate": 7.942289717757812e-06, - "loss": 0.5479, - "step": 3138 - }, - { - "epoch": 1.73, - "grad_norm": 0.07042836397886276, - "learning_rate": 7.908109121333873e-06, - "loss": 0.4859, - "step": 3139 - }, - { - "epoch": 1.73, - "grad_norm": 0.073862724006176, - "learning_rate": 7.873999205560334e-06, - "loss": 0.6089, - "step": 3140 - }, - { - "epoch": 1.73, - "grad_norm": 0.07902369648218155, - "learning_rate": 7.839959996616652e-06, - "loss": 0.6239, - "step": 3141 - }, - { - "epoch": 1.73, - "grad_norm": 0.07941651344299316, - "learning_rate": 7.80599152062792e-06, - "loss": 0.5479, - "step": 3142 - }, - { - "epoch": 1.73, - "grad_norm": 0.07300753891468048, - "learning_rate": 7.772093803665037e-06, - "loss": 0.5183, - "step": 3143 - }, - { - "epoch": 1.73, - "grad_norm": 0.08596652001142502, - "learning_rate": 7.738266871744494e-06, - "loss": 0.6281, - "step": 3144 - }, - { - "epoch": 1.73, - "grad_norm": 0.07829903066158295, - "learning_rate": 7.704510750828542e-06, - "loss": 0.5724, - "step": 3145 - }, - { - "epoch": 1.73, - "grad_norm": 0.08107545226812363, - "learning_rate": 7.67082546682506e-06, - "loss": 0.5444, - "step": 3146 - }, - { - "epoch": 1.73, - "grad_norm": 0.07941924035549164, - "learning_rate": 7.637211045587512e-06, - "loss": 0.6107, - "step": 3147 - }, - { - "epoch": 1.73, - "grad_norm": 0.08565372228622437, - "learning_rate": 7.603667512915025e-06, - "loss": 0.5424, - "step": 3148 - }, - { - "epoch": 1.73, - "grad_norm": 0.061931777745485306, - "learning_rate": 7.570194894552307e-06, - "loss": 0.4901, - "step": 3149 - }, - { - "epoch": 1.74, - "grad_norm": 0.07151300460100174, - "learning_rate": 7.536793216189675e-06, - "loss": 0.5057, - "step": 3150 - }, - { - "epoch": 1.74, - "grad_norm": 0.22302797436714172, - "learning_rate": 7.5034625034628995e-06, - "loss": 0.6756, - "step": 3151 - }, - { - "epoch": 1.74, - "grad_norm": 0.0780290961265564, - "learning_rate": 7.470202781953395e-06, - "loss": 0.675, - "step": 3152 - }, - { - "epoch": 1.74, - "grad_norm": 0.07317093014717102, - "learning_rate": 7.437014077188009e-06, - "loss": 0.586, - "step": 3153 - }, - { - "epoch": 1.74, - "grad_norm": 0.08226374536752701, - "learning_rate": 7.403896414639144e-06, - "loss": 0.5778, - "step": 3154 - }, - { - "epoch": 1.74, - "grad_norm": 0.07802766561508179, - "learning_rate": 7.370849819724634e-06, - "loss": 0.6207, - "step": 3155 - }, - { - "epoch": 1.74, - "grad_norm": 0.07580267637968063, - "learning_rate": 7.337874317807802e-06, - "loss": 0.5755, - "step": 3156 - }, - { - "epoch": 1.74, - "grad_norm": 0.08161929994821548, - "learning_rate": 7.304969934197359e-06, - "loss": 0.5729, - "step": 3157 - }, - { - "epoch": 1.74, - 
"grad_norm": 0.07730533182621002, - "learning_rate": 7.27213669414748e-06, - "loss": 0.5366, - "step": 3158 - }, - { - "epoch": 1.74, - "grad_norm": 0.07133332639932632, - "learning_rate": 7.239374622857742e-06, - "loss": 0.6159, - "step": 3159 - }, - { - "epoch": 1.74, - "grad_norm": 0.08000887930393219, - "learning_rate": 7.206683745473053e-06, - "loss": 0.5803, - "step": 3160 - }, - { - "epoch": 1.74, - "grad_norm": 0.07099801301956177, - "learning_rate": 7.17406408708372e-06, - "loss": 0.5313, - "step": 3161 - }, - { - "epoch": 1.74, - "grad_norm": 0.07111170887947083, - "learning_rate": 7.141515672725363e-06, - "loss": 0.4635, - "step": 3162 - }, - { - "epoch": 1.74, - "grad_norm": 0.0841594710946083, - "learning_rate": 7.109038527378942e-06, - "loss": 0.5765, - "step": 3163 - }, - { - "epoch": 1.74, - "grad_norm": 0.07840998470783234, - "learning_rate": 7.076632675970707e-06, - "loss": 0.5434, - "step": 3164 - }, - { - "epoch": 1.74, - "grad_norm": 0.07809460163116455, - "learning_rate": 7.044298143372197e-06, - "loss": 0.5752, - "step": 3165 - }, - { - "epoch": 1.74, - "grad_norm": 0.0699603483080864, - "learning_rate": 7.012034954400193e-06, - "loss": 0.4794, - "step": 3166 - }, - { - "epoch": 1.74, - "grad_norm": 0.07665000855922699, - "learning_rate": 6.979843133816743e-06, - "loss": 0.5516, - "step": 3167 - }, - { - "epoch": 1.75, - "grad_norm": 0.07922467589378357, - "learning_rate": 6.9477227063291405e-06, - "loss": 0.6229, - "step": 3168 - }, - { - "epoch": 1.75, - "grad_norm": 0.07732383906841278, - "learning_rate": 6.9156736965898085e-06, - "loss": 0.6062, - "step": 3169 - }, - { - "epoch": 1.75, - "grad_norm": 0.0833093523979187, - "learning_rate": 6.88369612919646e-06, - "loss": 0.6237, - "step": 3170 - }, - { - "epoch": 1.75, - "grad_norm": 0.07883663475513458, - "learning_rate": 6.8517900286918735e-06, - "loss": 0.5874, - "step": 3171 - }, - { - "epoch": 1.75, - "grad_norm": 0.08125128597021103, - "learning_rate": 6.81995541956405e-06, - "loss": 0.6341, - "step": 3172 - }, - { - "epoch": 1.75, - "grad_norm": 0.07269998639822006, - "learning_rate": 6.788192326246079e-06, - "loss": 0.5798, - "step": 3173 - }, - { - "epoch": 1.75, - "grad_norm": 0.07897648215293884, - "learning_rate": 6.756500773116203e-06, - "loss": 0.6475, - "step": 3174 - }, - { - "epoch": 1.75, - "grad_norm": 0.08471954613924026, - "learning_rate": 6.724880784497689e-06, - "loss": 0.5701, - "step": 3175 - }, - { - "epoch": 1.75, - "grad_norm": 0.07630597054958344, - "learning_rate": 6.6933323846589566e-06, - "loss": 0.5155, - "step": 3176 - }, - { - "epoch": 1.75, - "grad_norm": 0.07588117569684982, - "learning_rate": 6.661855597813449e-06, - "loss": 0.6294, - "step": 3177 - }, - { - "epoch": 1.75, - "grad_norm": 0.0732317864894867, - "learning_rate": 6.630450448119618e-06, - "loss": 0.5349, - "step": 3178 - }, - { - "epoch": 1.75, - "grad_norm": 0.07867201417684555, - "learning_rate": 6.599116959680973e-06, - "loss": 0.5625, - "step": 3179 - }, - { - "epoch": 1.75, - "grad_norm": 0.07942135632038116, - "learning_rate": 6.567855156545999e-06, - "loss": 0.5349, - "step": 3180 - }, - { - "epoch": 1.75, - "grad_norm": 0.07216234505176544, - "learning_rate": 6.536665062708192e-06, - "loss": 0.4608, - "step": 3181 - }, - { - "epoch": 1.75, - "grad_norm": 0.07037379592657089, - "learning_rate": 6.505546702105958e-06, - "loss": 0.5011, - "step": 3182 - }, - { - "epoch": 1.75, - "grad_norm": 0.08123207092285156, - "learning_rate": 6.4745000986227155e-06, - "loss": 0.636, - "step": 3183 - }, - { - "epoch": 
1.75, - "grad_norm": 0.07815258204936981, - "learning_rate": 6.443525276086748e-06, - "loss": 0.5994, - "step": 3184 - }, - { - "epoch": 1.75, - "grad_norm": 0.07790284603834152, - "learning_rate": 6.412622258271284e-06, - "loss": 0.5094, - "step": 3185 - }, - { - "epoch": 1.76, - "grad_norm": 0.08153413236141205, - "learning_rate": 6.381791068894438e-06, - "loss": 0.54, - "step": 3186 - }, - { - "epoch": 1.76, - "grad_norm": 0.07703083008527756, - "learning_rate": 6.3510317316191725e-06, - "loss": 0.5901, - "step": 3187 - }, - { - "epoch": 1.76, - "grad_norm": 0.08073649555444717, - "learning_rate": 6.320344270053357e-06, - "loss": 0.542, - "step": 3188 - }, - { - "epoch": 1.76, - "grad_norm": 0.07454603165388107, - "learning_rate": 6.289728707749609e-06, - "loss": 0.5095, - "step": 3189 - }, - { - "epoch": 1.76, - "grad_norm": 0.07908552139997482, - "learning_rate": 6.2591850682054535e-06, - "loss": 0.5629, - "step": 3190 - }, - { - "epoch": 1.76, - "grad_norm": 0.07613983005285263, - "learning_rate": 6.228713374863137e-06, - "loss": 0.6605, - "step": 3191 - }, - { - "epoch": 1.76, - "grad_norm": 0.08534809201955795, - "learning_rate": 6.198313651109777e-06, - "loss": 0.5604, - "step": 3192 - }, - { - "epoch": 1.76, - "grad_norm": 0.07495297491550446, - "learning_rate": 6.167985920277153e-06, - "loss": 0.5866, - "step": 3193 - }, - { - "epoch": 1.76, - "grad_norm": 0.07653788477182388, - "learning_rate": 6.137730205641856e-06, - "loss": 0.6514, - "step": 3194 - }, - { - "epoch": 1.76, - "grad_norm": 0.08158232271671295, - "learning_rate": 6.107546530425212e-06, - "loss": 0.6196, - "step": 3195 - }, - { - "epoch": 1.76, - "grad_norm": 0.07958748936653137, - "learning_rate": 6.0774349177932014e-06, - "loss": 0.5411, - "step": 3196 - }, - { - "epoch": 1.76, - "grad_norm": 0.07684215903282166, - "learning_rate": 6.047395390856547e-06, - "loss": 0.5609, - "step": 3197 - }, - { - "epoch": 1.76, - "grad_norm": 0.07646127045154572, - "learning_rate": 6.017427972670608e-06, - "loss": 0.5212, - "step": 3198 - }, - { - "epoch": 1.76, - "grad_norm": 0.07235292345285416, - "learning_rate": 5.987532686235431e-06, - "loss": 0.5087, - "step": 3199 - }, - { - "epoch": 1.76, - "grad_norm": 0.07906308770179749, - "learning_rate": 5.957709554495683e-06, - "loss": 0.4798, - "step": 3200 - }, - { - "epoch": 1.76, - "grad_norm": 0.07810641825199127, - "learning_rate": 5.927958600340666e-06, - "loss": 0.5738, - "step": 3201 - }, - { - "epoch": 1.76, - "grad_norm": 0.08186232298612595, - "learning_rate": 5.89827984660426e-06, - "loss": 0.5817, - "step": 3202 - }, - { - "epoch": 1.76, - "grad_norm": 0.0750940591096878, - "learning_rate": 5.868673316064965e-06, - "loss": 0.5311, - "step": 3203 - }, - { - "epoch": 1.77, - "grad_norm": 0.07697701454162598, - "learning_rate": 5.8391390314458395e-06, - "loss": 0.5718, - "step": 3204 - }, - { - "epoch": 1.77, - "grad_norm": 0.07784340530633926, - "learning_rate": 5.809677015414461e-06, - "loss": 0.48, - "step": 3205 - }, - { - "epoch": 1.77, - "grad_norm": 0.07141119986772537, - "learning_rate": 5.780287290582997e-06, - "loss": 0.5174, - "step": 3206 - }, - { - "epoch": 1.77, - "grad_norm": 0.08472003042697906, - "learning_rate": 5.7509698795080726e-06, - "loss": 0.6321, - "step": 3207 - }, - { - "epoch": 1.77, - "grad_norm": 0.08304649591445923, - "learning_rate": 5.721724804690853e-06, - "loss": 0.5654, - "step": 3208 - }, - { - "epoch": 1.77, - "grad_norm": 0.06623322516679764, - "learning_rate": 5.6925520885769745e-06, - "loss": 0.5074, - "step": 3209 - }, - { - 
"epoch": 1.77, - "grad_norm": 0.07251337170600891, - "learning_rate": 5.663451753556537e-06, - "loss": 0.5376, - "step": 3210 - }, - { - "epoch": 1.77, - "grad_norm": 0.08150351792573929, - "learning_rate": 5.634423821964074e-06, - "loss": 0.5113, - "step": 3211 - }, - { - "epoch": 1.77, - "grad_norm": 0.07414893060922623, - "learning_rate": 5.605468316078588e-06, - "loss": 0.6034, - "step": 3212 - }, - { - "epoch": 1.77, - "grad_norm": 0.07834034413099289, - "learning_rate": 5.5765852581234455e-06, - "loss": 0.5592, - "step": 3213 - }, - { - "epoch": 1.77, - "grad_norm": 0.09270931035280228, - "learning_rate": 5.547774670266426e-06, - "loss": 0.5344, - "step": 3214 - }, - { - "epoch": 1.77, - "grad_norm": 0.07382823526859283, - "learning_rate": 5.519036574619729e-06, - "loss": 0.5711, - "step": 3215 - }, - { - "epoch": 1.77, - "grad_norm": 0.07459236681461334, - "learning_rate": 5.490370993239846e-06, - "loss": 0.5897, - "step": 3216 - }, - { - "epoch": 1.77, - "grad_norm": 0.09094478189945221, - "learning_rate": 5.4617779481276665e-06, - "loss": 0.5864, - "step": 3217 - }, - { - "epoch": 1.77, - "grad_norm": 0.07464843988418579, - "learning_rate": 5.43325746122838e-06, - "loss": 0.5389, - "step": 3218 - }, - { - "epoch": 1.77, - "grad_norm": 0.08413833379745483, - "learning_rate": 5.404809554431534e-06, - "loss": 0.5677, - "step": 3219 - }, - { - "epoch": 1.77, - "grad_norm": 0.07826337218284607, - "learning_rate": 5.376434249570883e-06, - "loss": 0.5155, - "step": 3220 - }, - { - "epoch": 1.77, - "grad_norm": 0.07135774940252304, - "learning_rate": 5.348131568424563e-06, - "loss": 0.5348, - "step": 3221 - }, - { - "epoch": 1.78, - "grad_norm": 0.07859191298484802, - "learning_rate": 5.319901532714877e-06, - "loss": 0.4778, - "step": 3222 - }, - { - "epoch": 1.78, - "grad_norm": 0.07505518943071365, - "learning_rate": 5.2917441641084365e-06, - "loss": 0.6227, - "step": 3223 - }, - { - "epoch": 1.78, - "grad_norm": 0.09244594722986221, - "learning_rate": 5.2636594842160545e-06, - "loss": 0.6427, - "step": 3224 - }, - { - "epoch": 1.78, - "grad_norm": 0.08588739484548569, - "learning_rate": 5.235647514592779e-06, - "loss": 0.646, - "step": 3225 - }, - { - "epoch": 1.78, - "grad_norm": 0.08701588958501816, - "learning_rate": 5.207708276737789e-06, - "loss": 0.5213, - "step": 3226 - }, - { - "epoch": 1.78, - "grad_norm": 0.07898075133562088, - "learning_rate": 5.1798417920945305e-06, - "loss": 0.547, - "step": 3227 - }, - { - "epoch": 1.78, - "grad_norm": 0.0855628252029419, - "learning_rate": 5.152048082050564e-06, - "loss": 0.5389, - "step": 3228 - }, - { - "epoch": 1.78, - "grad_norm": 0.07486437261104584, - "learning_rate": 5.124327167937571e-06, - "loss": 0.5612, - "step": 3229 - }, - { - "epoch": 1.78, - "grad_norm": 0.0741388276219368, - "learning_rate": 5.096679071031429e-06, - "loss": 0.4953, - "step": 3230 - }, - { - "epoch": 1.78, - "grad_norm": 0.08409564197063446, - "learning_rate": 5.069103812552045e-06, - "loss": 0.5383, - "step": 3231 - }, - { - "epoch": 1.78, - "grad_norm": 0.07952644675970078, - "learning_rate": 5.041601413663511e-06, - "loss": 0.4607, - "step": 3232 - }, - { - "epoch": 1.78, - "grad_norm": 0.07196789979934692, - "learning_rate": 5.014171895473929e-06, - "loss": 0.5096, - "step": 3233 - }, - { - "epoch": 1.78, - "grad_norm": 0.06814949959516525, - "learning_rate": 4.986815279035506e-06, - "loss": 0.4978, - "step": 3234 - }, - { - "epoch": 1.78, - "grad_norm": 0.06847819685935974, - "learning_rate": 4.959531585344457e-06, - "loss": 0.528, - "step": 3235 - }, 
- { - "epoch": 1.78, - "grad_norm": 0.08616593480110168, - "learning_rate": 4.932320835341075e-06, - "loss": 0.5765, - "step": 3236 - }, - { - "epoch": 1.78, - "grad_norm": 0.0824628472328186, - "learning_rate": 4.9051830499096565e-06, - "loss": 0.5438, - "step": 3237 - }, - { - "epoch": 1.78, - "grad_norm": 0.07347706705331802, - "learning_rate": 4.878118249878461e-06, - "loss": 0.5409, - "step": 3238 - }, - { - "epoch": 1.78, - "grad_norm": 0.08699614554643631, - "learning_rate": 4.851126456019784e-06, - "loss": 0.6104, - "step": 3239 - }, - { - "epoch": 1.79, - "grad_norm": 0.08858591318130493, - "learning_rate": 4.8242076890498315e-06, - "loss": 0.6004, - "step": 3240 - }, - { - "epoch": 1.79, - "grad_norm": 0.08170289546251297, - "learning_rate": 4.7973619696288445e-06, - "loss": 0.6143, - "step": 3241 - }, - { - "epoch": 1.79, - "grad_norm": 0.08804669231176376, - "learning_rate": 4.770589318360896e-06, - "loss": 0.6162, - "step": 3242 - }, - { - "epoch": 1.79, - "grad_norm": 0.07593541592359543, - "learning_rate": 4.743889755794062e-06, - "loss": 0.5595, - "step": 3243 - }, - { - "epoch": 1.79, - "grad_norm": 0.08401387929916382, - "learning_rate": 4.717263302420283e-06, - "loss": 0.5798, - "step": 3244 - }, - { - "epoch": 1.79, - "grad_norm": 0.0785154476761818, - "learning_rate": 4.690709978675401e-06, - "loss": 0.5527, - "step": 3245 - }, - { - "epoch": 1.79, - "grad_norm": 0.07624941319227219, - "learning_rate": 4.664229804939135e-06, - "loss": 0.5307, - "step": 3246 - }, - { - "epoch": 1.79, - "grad_norm": 0.08080951869487762, - "learning_rate": 4.63782280153503e-06, - "loss": 0.5225, - "step": 3247 - }, - { - "epoch": 1.79, - "grad_norm": 0.07695840299129486, - "learning_rate": 4.611488988730528e-06, - "loss": 0.5664, - "step": 3248 - }, - { - "epoch": 1.79, - "grad_norm": 0.07765883207321167, - "learning_rate": 4.58522838673684e-06, - "loss": 0.4966, - "step": 3249 - }, - { - "epoch": 1.79, - "grad_norm": 0.08841906487941742, - "learning_rate": 4.559041015709042e-06, - "loss": 0.6194, - "step": 3250 - }, - { - "epoch": 1.79, - "grad_norm": 0.0755036398768425, - "learning_rate": 4.532926895745937e-06, - "loss": 0.5943, - "step": 3251 - }, - { - "epoch": 1.79, - "grad_norm": 0.07947423309087753, - "learning_rate": 4.5068860468901905e-06, - "loss": 0.5331, - "step": 3252 - }, - { - "epoch": 1.79, - "grad_norm": 0.08045419305562973, - "learning_rate": 4.4809184891281495e-06, - "loss": 0.6534, - "step": 3253 - }, - { - "epoch": 1.79, - "grad_norm": 0.07684405148029327, - "learning_rate": 4.455024242389972e-06, - "loss": 0.502, - "step": 3254 - }, - { - "epoch": 1.79, - "grad_norm": 0.07834628224372864, - "learning_rate": 4.429203326549525e-06, - "loss": 0.5875, - "step": 3255 - }, - { - "epoch": 1.79, - "grad_norm": 0.08634544163942337, - "learning_rate": 4.403455761424391e-06, - "loss": 0.5447, - "step": 3256 - }, - { - "epoch": 1.79, - "grad_norm": 0.08276887238025665, - "learning_rate": 4.377781566775874e-06, - "loss": 0.6446, - "step": 3257 - }, - { - "epoch": 1.8, - "grad_norm": 0.07752779871225357, - "learning_rate": 4.352180762308933e-06, - "loss": 0.6086, - "step": 3258 - }, - { - "epoch": 1.8, - "grad_norm": 0.08547976613044739, - "learning_rate": 4.32665336767224e-06, - "loss": 0.5989, - "step": 3259 - }, - { - "epoch": 1.8, - "grad_norm": 0.08702553063631058, - "learning_rate": 4.3011994024580785e-06, - "loss": 0.6573, - "step": 3260 - }, - { - "epoch": 1.8, - "grad_norm": 0.08295542746782303, - "learning_rate": 4.2758188862024425e-06, - "loss": 0.538, - "step": 3261 - 
}, - { - "epoch": 1.8, - "grad_norm": 0.07643800973892212, - "learning_rate": 4.250511838384863e-06, - "loss": 0.5697, - "step": 3262 - }, - { - "epoch": 1.8, - "grad_norm": 0.07573480159044266, - "learning_rate": 4.225278278428568e-06, - "loss": 0.5794, - "step": 3263 - }, - { - "epoch": 1.8, - "grad_norm": 0.07962439954280853, - "learning_rate": 4.200118225700345e-06, - "loss": 0.6272, - "step": 3264 - }, - { - "epoch": 1.8, - "grad_norm": 0.07286319136619568, - "learning_rate": 4.1750316995105455e-06, - "loss": 0.5384, - "step": 3265 - }, - { - "epoch": 1.8, - "grad_norm": 0.07239627093076706, - "learning_rate": 4.1500187191131466e-06, - "loss": 0.5269, - "step": 3266 - }, - { - "epoch": 1.8, - "grad_norm": 0.07041292637586594, - "learning_rate": 4.1250793037056145e-06, - "loss": 0.5506, - "step": 3267 - }, - { - "epoch": 1.8, - "grad_norm": 0.0758143737912178, - "learning_rate": 4.100213472429015e-06, - "loss": 0.5682, - "step": 3268 - }, - { - "epoch": 1.8, - "grad_norm": 0.08220957219600677, - "learning_rate": 4.075421244367861e-06, - "loss": 0.6196, - "step": 3269 - }, - { - "epoch": 1.8, - "grad_norm": 0.07229150831699371, - "learning_rate": 4.050702638550275e-06, - "loss": 0.5414, - "step": 3270 - }, - { - "epoch": 1.8, - "grad_norm": 0.08285313099622726, - "learning_rate": 4.026057673947769e-06, - "loss": 0.5856, - "step": 3271 - }, - { - "epoch": 1.8, - "grad_norm": 0.08436094224452972, - "learning_rate": 4.001486369475416e-06, - "loss": 0.6143, - "step": 3272 - }, - { - "epoch": 1.8, - "grad_norm": 0.08027667552232742, - "learning_rate": 3.976988743991739e-06, - "loss": 0.5629, - "step": 3273 - }, - { - "epoch": 1.8, - "grad_norm": 0.07529149204492569, - "learning_rate": 3.952564816298665e-06, - "loss": 0.503, - "step": 3274 - }, - { - "epoch": 1.8, - "grad_norm": 0.0745483860373497, - "learning_rate": 3.9282146051416115e-06, - "loss": 0.5342, - "step": 3275 - }, - { - "epoch": 1.81, - "grad_norm": 0.0774783343076706, - "learning_rate": 3.903938129209395e-06, - "loss": 0.5474, - "step": 3276 - }, - { - "epoch": 1.81, - "grad_norm": 0.08215305954217911, - "learning_rate": 3.879735407134244e-06, - "loss": 0.5585, - "step": 3277 - }, - { - "epoch": 1.81, - "grad_norm": 0.08368073403835297, - "learning_rate": 3.855606457491767e-06, - "loss": 0.6391, - "step": 3278 - }, - { - "epoch": 1.81, - "grad_norm": 0.07771727442741394, - "learning_rate": 3.831551298800995e-06, - "loss": 0.5553, - "step": 3279 - }, - { - "epoch": 1.81, - "grad_norm": 0.08193734288215637, - "learning_rate": 3.807569949524259e-06, - "loss": 0.5824, - "step": 3280 - }, - { - "epoch": 1.81, - "grad_norm": 0.0646643117070198, - "learning_rate": 3.7836624280672916e-06, - "loss": 0.5098, - "step": 3281 - }, - { - "epoch": 1.81, - "grad_norm": 0.0763091966509819, - "learning_rate": 3.75982875277916e-06, - "loss": 0.5041, - "step": 3282 - }, - { - "epoch": 1.81, - "grad_norm": 0.08347593247890472, - "learning_rate": 3.736068941952231e-06, - "loss": 0.6356, - "step": 3283 - }, - { - "epoch": 1.81, - "grad_norm": 0.07270161807537079, - "learning_rate": 3.712383013822196e-06, - "loss": 0.5294, - "step": 3284 - }, - { - "epoch": 1.81, - "grad_norm": 0.07627815008163452, - "learning_rate": 3.6887709865680353e-06, - "loss": 0.4897, - "step": 3285 - }, - { - "epoch": 1.81, - "grad_norm": 0.07596608996391296, - "learning_rate": 3.665232878311997e-06, - "loss": 0.5376, - "step": 3286 - }, - { - "epoch": 1.81, - "grad_norm": 0.09310761094093323, - "learning_rate": 3.6417687071196304e-06, - "loss": 0.6365, - "step": 3287 - }, - { 
- "epoch": 1.81, - "grad_norm": 0.062461577355861664, - "learning_rate": 3.6183784909997187e-06, - "loss": 0.475, - "step": 3288 - }, - { - "epoch": 1.81, - "grad_norm": 0.07654763758182526, - "learning_rate": 3.5950622479042683e-06, - "loss": 0.4855, - "step": 3289 - }, - { - "epoch": 1.81, - "grad_norm": 0.07452768087387085, - "learning_rate": 3.571819995728554e-06, - "loss": 0.5316, - "step": 3290 - }, - { - "epoch": 1.81, - "grad_norm": 0.07540355622768402, - "learning_rate": 3.5486517523110054e-06, - "loss": 0.5403, - "step": 3291 - }, - { - "epoch": 1.81, - "grad_norm": 0.09252862632274628, - "learning_rate": 3.5255575354332993e-06, - "loss": 0.6733, - "step": 3292 - }, - { - "epoch": 1.81, - "grad_norm": 0.08389545232057571, - "learning_rate": 3.5025373628202685e-06, - "loss": 0.5625, - "step": 3293 - }, - { - "epoch": 1.82, - "grad_norm": 0.0911962017416954, - "learning_rate": 3.4795912521399245e-06, - "loss": 0.6122, - "step": 3294 - }, - { - "epoch": 1.82, - "grad_norm": 0.07891615480184555, - "learning_rate": 3.456719221003457e-06, - "loss": 0.6536, - "step": 3295 - }, - { - "epoch": 1.82, - "grad_norm": 0.07303906232118607, - "learning_rate": 3.4339212869651582e-06, - "loss": 0.5028, - "step": 3296 - }, - { - "epoch": 1.82, - "grad_norm": 0.09300211817026138, - "learning_rate": 3.4111974675224976e-06, - "loss": 0.6058, - "step": 3297 - }, - { - "epoch": 1.82, - "grad_norm": 0.07399295270442963, - "learning_rate": 3.388547780116003e-06, - "loss": 0.5205, - "step": 3298 - }, - { - "epoch": 1.82, - "grad_norm": 0.07781734317541122, - "learning_rate": 3.3659722421293783e-06, - "loss": 0.5615, - "step": 3299 - }, - { - "epoch": 1.82, - "grad_norm": 0.07778247445821762, - "learning_rate": 3.3434708708893425e-06, - "loss": 0.6124, - "step": 3300 - }, - { - "epoch": 1.82, - "grad_norm": 0.080428346991539, - "learning_rate": 3.32104368366577e-06, - "loss": 0.5574, - "step": 3301 - }, - { - "epoch": 1.82, - "grad_norm": 0.08679382503032684, - "learning_rate": 3.2986906976715136e-06, - "loss": 0.5437, - "step": 3302 - }, - { - "epoch": 1.82, - "grad_norm": 0.07860066741704941, - "learning_rate": 3.276411930062551e-06, - "loss": 0.5408, - "step": 3303 - }, - { - "epoch": 1.82, - "grad_norm": 0.0861663743853569, - "learning_rate": 3.2542073979378473e-06, - "loss": 0.6513, - "step": 3304 - }, - { - "epoch": 1.82, - "grad_norm": 0.07389487326145172, - "learning_rate": 3.232077118339416e-06, - "loss": 0.5688, - "step": 3305 - }, - { - "epoch": 1.82, - "grad_norm": 0.09026335924863815, - "learning_rate": 3.2100211082523037e-06, - "loss": 0.664, - "step": 3306 - }, - { - "epoch": 1.82, - "grad_norm": 0.0709616020321846, - "learning_rate": 3.18803938460448e-06, - "loss": 0.4946, - "step": 3307 - }, - { - "epoch": 1.82, - "grad_norm": 0.07391082495450974, - "learning_rate": 3.166131964266983e-06, - "loss": 0.5226, - "step": 3308 - }, - { - "epoch": 1.82, - "grad_norm": 0.0852513313293457, - "learning_rate": 3.144298864053774e-06, - "loss": 0.6249, - "step": 3309 - }, - { - "epoch": 1.82, - "grad_norm": 0.0745469480752945, - "learning_rate": 3.1225401007217936e-06, - "loss": 0.5212, - "step": 3310 - }, - { - "epoch": 1.82, - "grad_norm": 0.0812850072979927, - "learning_rate": 3.1008556909709054e-06, - "loss": 0.676, - "step": 3311 - }, - { - "epoch": 1.83, - "grad_norm": 0.06535523384809494, - "learning_rate": 3.0792456514439518e-06, - "loss": 0.4669, - "step": 3312 - }, - { - "epoch": 1.83, - "grad_norm": 0.07964853197336197, - "learning_rate": 3.0577099987266323e-06, - "loss": 0.5554, - "step": 
3313 - }, - { - "epoch": 1.83, - "grad_norm": 0.08415551483631134, - "learning_rate": 3.0362487493476142e-06, - "loss": 0.5914, - "step": 3314 - }, - { - "epoch": 1.83, - "grad_norm": 0.08151557296514511, - "learning_rate": 3.014861919778433e-06, - "loss": 0.6269, - "step": 3315 - }, - { - "epoch": 1.83, - "grad_norm": 0.08608482778072357, - "learning_rate": 2.9935495264334925e-06, - "loss": 0.5265, - "step": 3316 - }, - { - "epoch": 1.83, - "grad_norm": 0.08237703889608383, - "learning_rate": 2.9723115856701177e-06, - "loss": 0.5901, - "step": 3317 - }, - { - "epoch": 1.83, - "grad_norm": 0.07674803584814072, - "learning_rate": 2.9511481137884157e-06, - "loss": 0.5535, - "step": 3318 - }, - { - "epoch": 1.83, - "grad_norm": 0.0770275890827179, - "learning_rate": 2.930059127031415e-06, - "loss": 0.5352, - "step": 3319 - }, - { - "epoch": 1.83, - "grad_norm": 0.08275414258241653, - "learning_rate": 2.9090446415849017e-06, - "loss": 0.5957, - "step": 3320 - }, - { - "epoch": 1.83, - "grad_norm": 0.07877998799085617, - "learning_rate": 2.8881046735775742e-06, - "loss": 0.5577, - "step": 3321 - }, - { - "epoch": 1.83, - "grad_norm": 0.08630228042602539, - "learning_rate": 2.8672392390808323e-06, - "loss": 0.5732, - "step": 3322 - }, - { - "epoch": 1.83, - "grad_norm": 0.07677796483039856, - "learning_rate": 2.846448354108955e-06, - "loss": 0.5038, - "step": 3323 - }, - { - "epoch": 1.83, - "grad_norm": 0.07356994599103928, - "learning_rate": 2.8257320346189776e-06, - "loss": 0.588, - "step": 3324 - }, - { - "epoch": 1.83, - "grad_norm": 0.07670172303915024, - "learning_rate": 2.8050902965106927e-06, - "loss": 0.5243, - "step": 3325 - }, - { - "epoch": 1.83, - "grad_norm": 0.07875744998455048, - "learning_rate": 2.7845231556266617e-06, - "loss": 0.5418, - "step": 3326 - }, - { - "epoch": 1.83, - "grad_norm": 0.07886647433042526, - "learning_rate": 2.764030627752201e-06, - "loss": 0.6001, - "step": 3327 - }, - { - "epoch": 1.83, - "grad_norm": 0.07417932897806168, - "learning_rate": 2.7436127286153524e-06, - "loss": 0.5922, - "step": 3328 - }, - { - "epoch": 1.83, - "grad_norm": 0.08960950374603271, - "learning_rate": 2.723269473886858e-06, - "loss": 0.721, - "step": 3329 - }, - { - "epoch": 1.84, - "grad_norm": 0.089479461312294, - "learning_rate": 2.7030008791802284e-06, - "loss": 0.5886, - "step": 3330 - }, - { - "epoch": 1.84, - "grad_norm": 0.0802077129483223, - "learning_rate": 2.6828069600516205e-06, - "loss": 0.5378, - "step": 3331 - }, - { - "epoch": 1.84, - "grad_norm": 0.06853347271680832, - "learning_rate": 2.66268773199988e-06, - "loss": 0.478, - "step": 3332 - }, - { - "epoch": 1.84, - "grad_norm": 0.0751049593091011, - "learning_rate": 2.6426432104665777e-06, - "loss": 0.5244, - "step": 3333 - }, - { - "epoch": 1.84, - "grad_norm": 0.07996946573257446, - "learning_rate": 2.6226734108358743e-06, - "loss": 0.5685, - "step": 3334 - }, - { - "epoch": 1.84, - "grad_norm": 0.06483065336942673, - "learning_rate": 2.6027783484346424e-06, - "loss": 0.4836, - "step": 3335 - }, - { - "epoch": 1.84, - "grad_norm": 0.08791980147361755, - "learning_rate": 2.582958038532357e-06, - "loss": 0.6577, - "step": 3336 - }, - { - "epoch": 1.84, - "grad_norm": 0.09538288414478302, - "learning_rate": 2.5632124963411386e-06, - "loss": 0.6429, - "step": 3337 - }, - { - "epoch": 1.84, - "grad_norm": 0.08513102680444717, - "learning_rate": 2.543541737015709e-06, - "loss": 0.5725, - "step": 3338 - }, - { - "epoch": 1.84, - "grad_norm": 0.08426357060670853, - "learning_rate": 2.5239457756534133e-06, - "loss": 
0.5865, - "step": 3339 - }, - { - "epoch": 1.84, - "grad_norm": 0.086363285779953, - "learning_rate": 2.5044246272941663e-06, - "loss": 0.5069, - "step": 3340 - }, - { - "epoch": 1.84, - "grad_norm": 0.09307163953781128, - "learning_rate": 2.4849783069204824e-06, - "loss": 0.6304, - "step": 3341 - }, - { - "epoch": 1.84, - "grad_norm": 0.07800079882144928, - "learning_rate": 2.465606829457445e-06, - "loss": 0.6352, - "step": 3342 - }, - { - "epoch": 1.84, - "grad_norm": 0.07679854333400726, - "learning_rate": 2.446310209772684e-06, - "loss": 0.5483, - "step": 3343 - }, - { - "epoch": 1.84, - "grad_norm": 0.07857970893383026, - "learning_rate": 2.4270884626763858e-06, - "loss": 0.57, - "step": 3344 - }, - { - "epoch": 1.84, - "grad_norm": 0.07641496509313583, - "learning_rate": 2.407941602921249e-06, - "loss": 0.5747, - "step": 3345 - }, - { - "epoch": 1.84, - "grad_norm": 0.07856767624616623, - "learning_rate": 2.3888696452025404e-06, - "loss": 0.5282, - "step": 3346 - }, - { - "epoch": 1.84, - "grad_norm": 0.07555583864450455, - "learning_rate": 2.3698726041579853e-06, - "loss": 0.5985, - "step": 3347 - }, - { - "epoch": 1.85, - "grad_norm": 0.07063550502061844, - "learning_rate": 2.350950494367865e-06, - "loss": 0.5288, - "step": 3348 - }, - { - "epoch": 1.85, - "grad_norm": 0.07329586893320084, - "learning_rate": 2.3321033303548955e-06, - "loss": 0.5528, - "step": 3349 - }, - { - "epoch": 1.85, - "grad_norm": 0.09025662392377853, - "learning_rate": 2.313331126584328e-06, - "loss": 0.6068, - "step": 3350 - }, - { - "epoch": 1.85, - "grad_norm": 0.09756608307361603, - "learning_rate": 2.2946338974638493e-06, - "loss": 0.6122, - "step": 3351 - }, - { - "epoch": 1.85, - "grad_norm": 0.07663445174694061, - "learning_rate": 2.2760116573435906e-06, - "loss": 0.5624, - "step": 3352 - }, - { - "epoch": 1.85, - "grad_norm": 0.0778941959142685, - "learning_rate": 2.257464420516164e-06, - "loss": 0.5596, - "step": 3353 - }, - { - "epoch": 1.85, - "grad_norm": 0.07972193509340286, - "learning_rate": 2.2389922012165944e-06, - "loss": 0.4985, - "step": 3354 - }, - { - "epoch": 1.85, - "grad_norm": 0.08464938402175903, - "learning_rate": 2.2205950136223176e-06, - "loss": 0.5566, - "step": 3355 - }, - { - "epoch": 1.85, - "grad_norm": 0.07752576470375061, - "learning_rate": 2.202272871853228e-06, - "loss": 0.5314, - "step": 3356 - }, - { - "epoch": 1.85, - "grad_norm": 0.08926267921924591, - "learning_rate": 2.1840257899715887e-06, - "loss": 0.6661, - "step": 3357 - }, - { - "epoch": 1.85, - "grad_norm": 0.08092442899942398, - "learning_rate": 2.1658537819820414e-06, - "loss": 0.6168, - "step": 3358 - }, - { - "epoch": 1.85, - "grad_norm": 0.08687537908554077, - "learning_rate": 2.1477568618316404e-06, - "loss": 0.6626, - "step": 3359 - }, - { - "epoch": 1.85, - "grad_norm": 0.06949488818645477, - "learning_rate": 2.1297350434097974e-06, - "loss": 0.5432, - "step": 3360 - }, - { - "epoch": 1.85, - "grad_norm": 0.09435413777828217, - "learning_rate": 2.1117883405482708e-06, - "loss": 0.6576, - "step": 3361 - }, - { - "epoch": 1.85, - "grad_norm": 0.08354892581701279, - "learning_rate": 2.0939167670211978e-06, - "loss": 0.6777, - "step": 3362 - }, - { - "epoch": 1.85, - "grad_norm": 0.0721585676074028, - "learning_rate": 2.0761203365450064e-06, - "loss": 0.4559, - "step": 3363 - }, - { - "epoch": 1.85, - "grad_norm": 0.07381545752286911, - "learning_rate": 2.0583990627785154e-06, - "loss": 0.6137, - "step": 3364 - }, - { - "epoch": 1.85, - "grad_norm": 0.08608017861843109, - "learning_rate": 
2.0407529593228116e-06, - "loss": 0.5615, - "step": 3365 - }, - { - "epoch": 1.86, - "grad_norm": 0.07255735248327255, - "learning_rate": 2.023182039721294e-06, - "loss": 0.5131, - "step": 3366 - }, - { - "epoch": 1.86, - "grad_norm": 0.0802227184176445, - "learning_rate": 2.0056863174596762e-06, - "loss": 0.6041, - "step": 3367 - }, - { - "epoch": 1.86, - "grad_norm": 0.08381889015436172, - "learning_rate": 1.9882658059659496e-06, - "loss": 0.5783, - "step": 3368 - }, - { - "epoch": 1.86, - "grad_norm": 0.08564453572034836, - "learning_rate": 1.970920518610375e-06, - "loss": 0.5746, - "step": 3369 - }, - { - "epoch": 1.86, - "grad_norm": 0.07094965130090714, - "learning_rate": 1.9536504687054924e-06, - "loss": 0.5272, - "step": 3370 - }, - { - "epoch": 1.86, - "grad_norm": 0.07661442458629608, - "learning_rate": 1.9364556695060764e-06, - "loss": 0.5754, - "step": 3371 - }, - { - "epoch": 1.86, - "grad_norm": 0.07895109057426453, - "learning_rate": 1.91933613420916e-06, - "loss": 0.6123, - "step": 3372 - }, - { - "epoch": 1.86, - "grad_norm": 0.07868780195713043, - "learning_rate": 1.9022918759540209e-06, - "loss": 0.5591, - "step": 3373 - }, - { - "epoch": 1.86, - "grad_norm": 0.0766960084438324, - "learning_rate": 1.88532290782214e-06, - "loss": 0.5008, - "step": 3374 - }, - { - "epoch": 1.86, - "grad_norm": 0.08017481863498688, - "learning_rate": 1.868429242837233e-06, - "loss": 0.5747, - "step": 3375 - }, - { - "epoch": 1.86, - "grad_norm": 0.08352438360452652, - "learning_rate": 1.8516108939651945e-06, - "loss": 0.6316, - "step": 3376 - }, - { - "epoch": 1.86, - "grad_norm": 0.08620418608188629, - "learning_rate": 1.8348678741141546e-06, - "loss": 0.6034, - "step": 3377 - }, - { - "epoch": 1.86, - "grad_norm": 0.0783904418349266, - "learning_rate": 1.8182001961343787e-06, - "loss": 0.5761, - "step": 3378 - }, - { - "epoch": 1.86, - "grad_norm": 0.08407583832740784, - "learning_rate": 1.801607872818356e-06, - "loss": 0.6141, - "step": 3379 - }, - { - "epoch": 1.86, - "grad_norm": 0.08542238920927048, - "learning_rate": 1.785090916900678e-06, - "loss": 0.6253, - "step": 3380 - }, - { - "epoch": 1.86, - "grad_norm": 0.07493787258863449, - "learning_rate": 1.7686493410581707e-06, - "loss": 0.5326, - "step": 3381 - }, - { - "epoch": 1.86, - "grad_norm": 0.07105356454849243, - "learning_rate": 1.752283157909762e-06, - "loss": 0.463, - "step": 3382 - }, - { - "epoch": 1.86, - "grad_norm": 0.09019310772418976, - "learning_rate": 1.7359923800164935e-06, - "loss": 0.589, - "step": 3383 - }, - { - "epoch": 1.87, - "grad_norm": 0.08535916358232498, - "learning_rate": 1.7197770198815743e-06, - "loss": 0.6479, - "step": 3384 - }, - { - "epoch": 1.87, - "grad_norm": 0.08877049386501312, - "learning_rate": 1.7036370899503162e-06, - "loss": 0.5914, - "step": 3385 - }, - { - "epoch": 1.87, - "grad_norm": 0.07365193217992783, - "learning_rate": 1.6875726026101435e-06, - "loss": 0.562, - "step": 3386 - }, - { - "epoch": 1.87, - "grad_norm": 0.08467639982700348, - "learning_rate": 1.6715835701905603e-06, - "loss": 0.5883, - "step": 3387 - }, - { - "epoch": 1.87, - "grad_norm": 0.08218105882406235, - "learning_rate": 1.6556700049631835e-06, - "loss": 0.6002, - "step": 3388 - }, - { - "epoch": 1.87, - "grad_norm": 0.07743723690509796, - "learning_rate": 1.6398319191416656e-06, - "loss": 0.5423, - "step": 3389 - }, - { - "epoch": 1.87, - "grad_norm": 0.07386884838342667, - "learning_rate": 1.624069324881805e-06, - "loss": 0.5223, - "step": 3390 - }, - { - "epoch": 1.87, - "grad_norm": 0.07357634603977203, - 
"learning_rate": 1.6083822342813692e-06, - "loss": 0.478, - "step": 3391 - }, - { - "epoch": 1.87, - "grad_norm": 0.07314004749059677, - "learning_rate": 1.5927706593802494e-06, - "loss": 0.5009, - "step": 3392 - }, - { - "epoch": 1.87, - "grad_norm": 0.06364717334508896, - "learning_rate": 1.5772346121603498e-06, - "loss": 0.4972, - "step": 3393 - }, - { - "epoch": 1.87, - "grad_norm": 0.08853484690189362, - "learning_rate": 1.561774104545588e-06, - "loss": 0.6418, - "step": 3394 - }, - { - "epoch": 1.87, - "grad_norm": 0.07860343158245087, - "learning_rate": 1.54638914840195e-06, - "loss": 0.5295, - "step": 3395 - }, - { - "epoch": 1.87, - "grad_norm": 0.06886250525712967, - "learning_rate": 1.531079755537379e-06, - "loss": 0.4841, - "step": 3396 - }, - { - "epoch": 1.87, - "grad_norm": 0.07765613496303558, - "learning_rate": 1.5158459377018873e-06, - "loss": 0.5708, - "step": 3397 - }, - { - "epoch": 1.87, - "grad_norm": 0.084499292075634, - "learning_rate": 1.5006877065874336e-06, - "loss": 0.6062, - "step": 3398 - }, - { - "epoch": 1.87, - "grad_norm": 0.0805201455950737, - "learning_rate": 1.4856050738279892e-06, - "loss": 0.5724, - "step": 3399 - }, - { - "epoch": 1.87, - "grad_norm": 0.08347020298242569, - "learning_rate": 1.4705980509994833e-06, - "loss": 0.6772, - "step": 3400 - }, - { - "epoch": 1.88, - "grad_norm": 0.08040057867765427, - "learning_rate": 1.4556666496198245e-06, - "loss": 0.599, - "step": 3401 - }, - { - "epoch": 1.88, - "grad_norm": 0.0760309025645256, - "learning_rate": 1.4408108811489018e-06, - "loss": 0.489, - "step": 3402 - }, - { - "epoch": 1.88, - "grad_norm": 0.070058174431324, - "learning_rate": 1.4260307569885168e-06, - "loss": 0.5206, - "step": 3403 - }, - { - "epoch": 1.88, - "grad_norm": 0.07758305966854095, - "learning_rate": 1.411326288482462e-06, - "loss": 0.5079, - "step": 3404 - }, - { - "epoch": 1.88, - "grad_norm": 0.08807816356420517, - "learning_rate": 1.3966974869163985e-06, - "loss": 0.557, - "step": 3405 - }, - { - "epoch": 1.88, - "grad_norm": 0.08464004099369049, - "learning_rate": 1.3821443635179898e-06, - "loss": 0.5922, - "step": 3406 - }, - { - "epoch": 1.88, - "grad_norm": 0.06907836347818375, - "learning_rate": 1.3676669294567456e-06, - "loss": 0.5663, - "step": 3407 - }, - { - "epoch": 1.88, - "grad_norm": 0.07782316207885742, - "learning_rate": 1.3532651958441444e-06, - "loss": 0.5586, - "step": 3408 - }, - { - "epoch": 1.88, - "grad_norm": 0.07036988437175751, - "learning_rate": 1.3389391737335112e-06, - "loss": 0.5858, - "step": 3409 - }, - { - "epoch": 1.88, - "grad_norm": 0.08822328597307205, - "learning_rate": 1.3246888741201058e-06, - "loss": 0.6201, - "step": 3410 - }, - { - "epoch": 1.88, - "grad_norm": 0.07636477798223495, - "learning_rate": 1.3105143079410465e-06, - "loss": 0.545, - "step": 3411 - }, - { - "epoch": 1.88, - "grad_norm": 0.08318522572517395, - "learning_rate": 1.2964154860753197e-06, - "loss": 0.5728, - "step": 3412 - }, - { - "epoch": 1.88, - "grad_norm": 0.07751517742872238, - "learning_rate": 1.2823924193438142e-06, - "loss": 0.6117, - "step": 3413 - }, - { - "epoch": 1.88, - "grad_norm": 0.07766176015138626, - "learning_rate": 1.2684451185092317e-06, - "loss": 0.5847, - "step": 3414 - }, - { - "epoch": 1.88, - "grad_norm": 0.07549408078193665, - "learning_rate": 1.2545735942761539e-06, - "loss": 0.5441, - "step": 3415 - }, - { - "epoch": 1.88, - "grad_norm": 0.08800029009580612, - "learning_rate": 1.2407778572909868e-06, - "loss": 0.663, - "step": 3416 - }, - { - "epoch": 1.88, - "grad_norm": 
0.06972342729568481, - "learning_rate": 1.227057918141994e-06, - "loss": 0.5834, - "step": 3417 - }, - { - "epoch": 1.88, - "grad_norm": 0.07410095632076263, - "learning_rate": 1.2134137873592299e-06, - "loss": 0.6273, - "step": 3418 - }, - { - "epoch": 1.89, - "grad_norm": 0.08256086707115173, - "learning_rate": 1.199845475414585e-06, - "loss": 0.5854, - "step": 3419 - }, - { - "epoch": 1.89, - "grad_norm": 0.08870168775320053, - "learning_rate": 1.1863529927217732e-06, - "loss": 0.7093, - "step": 3420 - }, - { - "epoch": 1.89, - "grad_norm": 0.08100131154060364, - "learning_rate": 1.1729363496362777e-06, - "loss": 0.6304, - "step": 3421 - }, - { - "epoch": 1.89, - "grad_norm": 0.07548246532678604, - "learning_rate": 1.159595556455395e-06, - "loss": 0.4772, - "step": 3422 - }, - { - "epoch": 1.89, - "grad_norm": 0.09052509814500809, - "learning_rate": 1.1463306234181903e-06, - "loss": 0.5927, - "step": 3423 - }, - { - "epoch": 1.89, - "grad_norm": 0.07959606498479843, - "learning_rate": 1.1331415607055306e-06, - "loss": 0.5433, - "step": 3424 - }, - { - "epoch": 1.89, - "grad_norm": 0.08933278918266296, - "learning_rate": 1.1200283784400411e-06, - "loss": 0.5826, - "step": 3425 - }, - { - "epoch": 1.89, - "grad_norm": 0.07218793034553528, - "learning_rate": 1.1069910866861155e-06, - "loss": 0.4826, - "step": 3426 - }, - { - "epoch": 1.89, - "grad_norm": 0.0752687007188797, - "learning_rate": 1.0940296954498607e-06, - "loss": 0.5799, - "step": 3427 - }, - { - "epoch": 1.89, - "grad_norm": 0.08115094900131226, - "learning_rate": 1.0811442146791972e-06, - "loss": 0.5199, - "step": 3428 - }, - { - "epoch": 1.89, - "grad_norm": 0.08585885167121887, - "learning_rate": 1.068334654263714e-06, - "loss": 0.603, - "step": 3429 - }, - { - "epoch": 1.89, - "grad_norm": 0.07445257157087326, - "learning_rate": 1.0556010240348025e-06, - "loss": 0.555, - "step": 3430 - }, - { - "epoch": 1.89, - "grad_norm": 0.08283374458551407, - "learning_rate": 1.0429433337655115e-06, - "loss": 0.5513, - "step": 3431 - }, - { - "epoch": 1.89, - "grad_norm": 0.0685088112950325, - "learning_rate": 1.030361593170648e-06, - "loss": 0.4646, - "step": 3432 - }, - { - "epoch": 1.89, - "grad_norm": 0.08084689825773239, - "learning_rate": 1.0178558119067315e-06, - "loss": 0.5691, - "step": 3433 - }, - { - "epoch": 1.89, - "grad_norm": 0.07466350495815277, - "learning_rate": 1.0054259995719295e-06, - "loss": 0.5365, - "step": 3434 - }, - { - "epoch": 1.89, - "grad_norm": 0.0814463198184967, - "learning_rate": 9.930721657061659e-07, - "loss": 0.5889, - "step": 3435 - }, - { - "epoch": 1.89, - "grad_norm": 0.07925049960613251, - "learning_rate": 9.80794319791012e-07, - "loss": 0.5265, - "step": 3436 - }, - { - "epoch": 1.9, - "grad_norm": 0.07562460750341415, - "learning_rate": 9.68592471249752e-07, - "loss": 0.6428, - "step": 3437 - }, - { - "epoch": 1.9, - "grad_norm": 0.07313285022974014, - "learning_rate": 9.564666294472946e-07, - "loss": 0.5749, - "step": 3438 - }, - { - "epoch": 1.9, - "grad_norm": 0.0887812003493309, - "learning_rate": 9.444168036902513e-07, - "loss": 0.6197, - "step": 3439 - }, - { - "epoch": 1.9, - "grad_norm": 0.07397332042455673, - "learning_rate": 9.324430032268794e-07, - "loss": 0.5552, - "step": 3440 - }, - { - "epoch": 1.9, - "grad_norm": 0.07077188044786453, - "learning_rate": 9.205452372470835e-07, - "loss": 0.4652, - "step": 3441 - }, - { - "epoch": 1.9, - "grad_norm": 0.07836198806762695, - "learning_rate": 9.087235148824368e-07, - "loss": 0.5961, - "step": 3442 - }, - { - "epoch": 1.9, - 
"grad_norm": 0.07381219416856766, - "learning_rate": 8.969778452060928e-07, - "loss": 0.5567, - "step": 3443 - }, - { - "epoch": 1.9, - "grad_norm": 0.07355515658855438, - "learning_rate": 8.853082372328847e-07, - "loss": 0.5673, - "step": 3444 - }, - { - "epoch": 1.9, - "grad_norm": 0.0868539810180664, - "learning_rate": 8.737146999192592e-07, - "loss": 0.5933, - "step": 3445 - }, - { - "epoch": 1.9, - "grad_norm": 0.08018617331981659, - "learning_rate": 8.621972421632651e-07, - "loss": 0.5437, - "step": 3446 - }, - { - "epoch": 1.9, - "grad_norm": 0.08229728788137436, - "learning_rate": 8.507558728045539e-07, - "loss": 0.4987, - "step": 3447 - }, - { - "epoch": 1.9, - "grad_norm": 0.0776493176817894, - "learning_rate": 8.393906006244234e-07, - "loss": 0.5068, - "step": 3448 - }, - { - "epoch": 1.9, - "grad_norm": 0.08558415621519089, - "learning_rate": 8.281014343457072e-07, - "loss": 0.5341, - "step": 3449 - }, - { - "epoch": 1.9, - "grad_norm": 0.08440893143415451, - "learning_rate": 8.168883826328633e-07, - "loss": 0.6416, - "step": 3450 - }, - { - "epoch": 1.9, - "grad_norm": 0.08656568825244904, - "learning_rate": 8.057514540919297e-07, - "loss": 0.5967, - "step": 3451 - }, - { - "epoch": 1.9, - "grad_norm": 0.08011357486248016, - "learning_rate": 7.946906572705137e-07, - "loss": 0.663, - "step": 3452 - }, - { - "epoch": 1.9, - "grad_norm": 0.07203607261180878, - "learning_rate": 7.837060006577801e-07, - "loss": 0.5436, - "step": 3453 - }, - { - "epoch": 1.9, - "grad_norm": 0.07224507629871368, - "learning_rate": 7.727974926844849e-07, - "loss": 0.5395, - "step": 3454 - }, - { - "epoch": 1.91, - "grad_norm": 0.07532103359699249, - "learning_rate": 7.61965141722909e-07, - "loss": 0.6166, - "step": 3455 - }, - { - "epoch": 1.91, - "grad_norm": 0.078483946621418, - "learning_rate": 7.512089560869018e-07, - "loss": 0.5581, - "step": 3456 - }, - { - "epoch": 1.91, - "grad_norm": 0.07536689192056656, - "learning_rate": 7.405289440318486e-07, - "loss": 0.526, - "step": 3457 - }, - { - "epoch": 1.91, - "grad_norm": 0.08381485939025879, - "learning_rate": 7.299251137546814e-07, - "loss": 0.561, - "step": 3458 - }, - { - "epoch": 1.91, - "grad_norm": 0.09058713912963867, - "learning_rate": 7.193974733938346e-07, - "loss": 0.6357, - "step": 3459 - }, - { - "epoch": 1.91, - "grad_norm": 0.07965592294931412, - "learning_rate": 7.089460310293117e-07, - "loss": 0.5686, - "step": 3460 - }, - { - "epoch": 1.91, - "grad_norm": 0.07699765264987946, - "learning_rate": 6.985707946825848e-07, - "loss": 0.5332, - "step": 3461 - }, - { - "epoch": 1.91, - "grad_norm": 0.07802991569042206, - "learning_rate": 6.882717723166731e-07, - "loss": 0.6359, - "step": 3462 - }, - { - "epoch": 1.91, - "grad_norm": 0.0804867297410965, - "learning_rate": 6.780489718360983e-07, - "loss": 0.6162, - "step": 3463 - }, - { - "epoch": 1.91, - "grad_norm": 0.0790751576423645, - "learning_rate": 6.679024010868618e-07, - "loss": 0.5907, - "step": 3464 - }, - { - "epoch": 1.91, - "grad_norm": 0.07376383990049362, - "learning_rate": 6.578320678564676e-07, - "loss": 0.6181, - "step": 3465 - }, - { - "epoch": 1.91, - "grad_norm": 0.07800883054733276, - "learning_rate": 6.47837979873922e-07, - "loss": 0.5485, - "step": 3466 - }, - { - "epoch": 1.91, - "grad_norm": 0.08249462395906448, - "learning_rate": 6.379201448097005e-07, - "loss": 0.6131, - "step": 3467 - }, - { - "epoch": 1.91, - "grad_norm": 0.08678142726421356, - "learning_rate": 6.280785702757475e-07, - "loss": 0.5681, - "step": 3468 - }, - { - "epoch": 1.91, - "grad_norm": 
0.07358631491661072, - "learning_rate": 6.183132638254763e-07, - "loss": 0.4937, - "step": 3469 - }, - { - "epoch": 1.91, - "grad_norm": 0.08811584860086441, - "learning_rate": 6.086242329537917e-07, - "loss": 0.6157, - "step": 3470 - }, - { - "epoch": 1.91, - "grad_norm": 0.07266715168952942, - "learning_rate": 5.99011485097023e-07, - "loss": 0.5419, - "step": 3471 - }, - { - "epoch": 1.91, - "grad_norm": 0.0865073874592781, - "learning_rate": 5.894750276329796e-07, - "loss": 0.6344, - "step": 3472 - }, - { - "epoch": 1.92, - "grad_norm": 0.08059767633676529, - "learning_rate": 5.800148678808958e-07, - "loss": 0.5713, - "step": 3473 - }, - { - "epoch": 1.92, - "grad_norm": 0.07227861881256104, - "learning_rate": 5.706310131014636e-07, - "loss": 0.5453, - "step": 3474 - }, - { - "epoch": 1.92, - "grad_norm": 0.0802195817232132, - "learning_rate": 5.613234704967996e-07, - "loss": 0.5387, - "step": 3475 - }, - { - "epoch": 1.92, - "grad_norm": 0.0846519023180008, - "learning_rate": 5.520922472104561e-07, - "loss": 0.6752, - "step": 3476 - }, - { - "epoch": 1.92, - "grad_norm": 0.08230707049369812, - "learning_rate": 5.429373503274215e-07, - "loss": 0.571, - "step": 3477 - }, - { - "epoch": 1.92, - "grad_norm": 0.07529239356517792, - "learning_rate": 5.338587868740641e-07, - "loss": 0.5601, - "step": 3478 - }, - { - "epoch": 1.92, - "grad_norm": 0.08313122391700745, - "learning_rate": 5.248565638182101e-07, - "loss": 0.5917, - "step": 3479 - }, - { - "epoch": 1.92, - "grad_norm": 0.08534807711839676, - "learning_rate": 5.159306880690884e-07, - "loss": 0.5682, - "step": 3480 - }, - { - "epoch": 1.92, - "grad_norm": 0.08856728672981262, - "learning_rate": 5.070811664773079e-07, - "loss": 0.635, - "step": 3481 - }, - { - "epoch": 1.92, - "grad_norm": 0.0904325619339943, - "learning_rate": 4.98308005834891e-07, - "loss": 0.5966, - "step": 3482 - }, - { - "epoch": 1.92, - "grad_norm": 0.09154371917247772, - "learning_rate": 4.896112128752406e-07, - "loss": 0.5869, - "step": 3483 - }, - { - "epoch": 1.92, - "grad_norm": 0.0799337774515152, - "learning_rate": 4.809907942731729e-07, - "loss": 0.5264, - "step": 3484 - }, - { - "epoch": 1.92, - "grad_norm": 0.09307362139225006, - "learning_rate": 4.72446756644862e-07, - "loss": 0.6198, - "step": 3485 - }, - { - "epoch": 1.92, - "grad_norm": 0.08572401106357574, - "learning_rate": 4.639791065478738e-07, - "loss": 0.5863, - "step": 3486 - }, - { - "epoch": 1.92, - "grad_norm": 0.08228857815265656, - "learning_rate": 4.555878504811317e-07, - "loss": 0.5623, - "step": 3487 - }, - { - "epoch": 1.92, - "grad_norm": 0.08371774852275848, - "learning_rate": 4.472729948849397e-07, - "loss": 0.5971, - "step": 3488 - }, - { - "epoch": 1.92, - "grad_norm": 0.09152055531740189, - "learning_rate": 4.390345461409706e-07, - "loss": 0.594, - "step": 3489 - }, - { - "epoch": 1.92, - "grad_norm": 0.07677818089723587, - "learning_rate": 4.308725105722333e-07, - "loss": 0.551, - "step": 3490 - }, - { - "epoch": 1.93, - "grad_norm": 0.06396432965993881, - "learning_rate": 4.227868944431057e-07, - "loss": 0.4516, - "step": 3491 - }, - { - "epoch": 1.93, - "grad_norm": 0.07630179822444916, - "learning_rate": 4.147777039593015e-07, - "loss": 0.5061, - "step": 3492 - }, - { - "epoch": 1.93, - "grad_norm": 0.07213296741247177, - "learning_rate": 4.068449452679035e-07, - "loss": 0.4783, - "step": 3493 - }, - { - "epoch": 1.93, - "grad_norm": 0.07577905803918839, - "learning_rate": 3.9898862445729714e-07, - "loss": 0.5182, - "step": 3494 - }, - { - "epoch": 1.93, - "grad_norm": 
0.07999127358198166, - "learning_rate": 3.912087475572257e-07, - "loss": 0.5724, - "step": 3495 - }, - { - "epoch": 1.93, - "grad_norm": 0.10361068695783615, - "learning_rate": 3.835053205387573e-07, - "loss": 0.6712, - "step": 3496 - }, - { - "epoch": 1.93, - "grad_norm": 0.08972208946943283, - "learning_rate": 3.758783493142737e-07, - "loss": 0.563, - "step": 3497 - }, - { - "epoch": 1.93, - "grad_norm": 0.07700824737548828, - "learning_rate": 3.683278397374923e-07, - "loss": 0.5676, - "step": 3498 - }, - { - "epoch": 1.93, - "grad_norm": 0.07342316210269928, - "learning_rate": 3.608537976034554e-07, - "loss": 0.5639, - "step": 3499 - }, - { - "epoch": 1.93, - "grad_norm": 0.08734098076820374, - "learning_rate": 3.534562286484633e-07, - "loss": 0.6276, - "step": 3500 - }, - { - "epoch": 1.93, - "grad_norm": 0.07715500891208649, - "learning_rate": 3.4613513855018543e-07, - "loss": 0.6094, - "step": 3501 - }, - { - "epoch": 1.93, - "grad_norm": 0.08446547389030457, - "learning_rate": 3.3889053292757155e-07, - "loss": 0.5558, - "step": 3502 - }, - { - "epoch": 1.93, - "grad_norm": 0.07040911167860031, - "learning_rate": 3.3172241734085176e-07, - "loss": 0.5575, - "step": 3503 - }, - { - "epoch": 1.93, - "grad_norm": 0.07131405174732208, - "learning_rate": 3.2463079729158076e-07, - "loss": 0.4731, - "step": 3504 - }, - { - "epoch": 1.93, - "grad_norm": 0.07866710424423218, - "learning_rate": 3.1761567822257146e-07, - "loss": 0.6311, - "step": 3505 - }, - { - "epoch": 1.93, - "grad_norm": 0.07367514818906784, - "learning_rate": 3.106770655179392e-07, - "loss": 0.5383, - "step": 3506 - }, - { - "epoch": 1.93, - "grad_norm": 0.08176425844430923, - "learning_rate": 3.0381496450309075e-07, - "loss": 0.5508, - "step": 3507 - }, - { - "epoch": 1.93, - "grad_norm": 0.07364267110824585, - "learning_rate": 2.9702938044468e-07, - "loss": 0.4938, - "step": 3508 - }, - { - "epoch": 1.94, - "grad_norm": 0.0771787241101265, - "learning_rate": 2.903203185506631e-07, - "loss": 0.5832, - "step": 3509 - }, - { - "epoch": 1.94, - "grad_norm": 0.07928218692541122, - "learning_rate": 2.836877839702545e-07, - "loss": 0.6251, - "step": 3510 - }, - { - "epoch": 1.94, - "grad_norm": 0.08749215304851532, - "learning_rate": 2.7713178179392673e-07, - "loss": 0.5893, - "step": 3511 - }, - { - "epoch": 1.94, - "grad_norm": 0.08024594187736511, - "learning_rate": 2.706523170534325e-07, - "loss": 0.5203, - "step": 3512 - }, - { - "epoch": 1.94, - "grad_norm": 0.08996916562318802, - "learning_rate": 2.6424939472176056e-07, - "loss": 0.6149, - "step": 3513 - }, - { - "epoch": 1.94, - "grad_norm": 0.08545570820569992, - "learning_rate": 2.579230197131577e-07, - "loss": 0.5568, - "step": 3514 - }, - { - "epoch": 1.94, - "grad_norm": 0.07621780782938004, - "learning_rate": 2.5167319688312876e-07, - "loss": 0.4913, - "step": 3515 - }, - { - "epoch": 1.94, - "grad_norm": 0.08227244019508362, - "learning_rate": 2.454999310284256e-07, - "loss": 0.5735, - "step": 3516 - }, - { - "epoch": 1.94, - "grad_norm": 0.08408751338720322, - "learning_rate": 2.394032268870472e-07, - "loss": 0.6155, - "step": 3517 - }, - { - "epoch": 1.94, - "grad_norm": 0.07970957458019257, - "learning_rate": 2.3338308913819496e-07, - "loss": 0.5079, - "step": 3518 - }, - { - "epoch": 1.94, - "grad_norm": 0.06704852730035782, - "learning_rate": 2.2743952240236176e-07, - "loss": 0.4832, - "step": 3519 - }, - { - "epoch": 1.94, - "grad_norm": 0.07459887862205505, - "learning_rate": 2.2157253124122092e-07, - "loss": 0.5476, - "step": 3520 - }, - { - "epoch": 1.94, 
- "grad_norm": 0.07807392627000809, - "learning_rate": 2.157821201577148e-07, - "loss": 0.6195, - "step": 3521 - }, - { - "epoch": 1.94, - "grad_norm": 0.07530328631401062, - "learning_rate": 2.1006829359597747e-07, - "loss": 0.5083, - "step": 3522 - }, - { - "epoch": 1.94, - "grad_norm": 0.0794893279671669, - "learning_rate": 2.0443105594137868e-07, - "loss": 0.5903, - "step": 3523 - }, - { - "epoch": 1.94, - "grad_norm": 0.06803248077630997, - "learning_rate": 1.988704115205242e-07, - "loss": 0.4399, - "step": 3524 - }, - { - "epoch": 1.94, - "grad_norm": 0.07905720919370651, - "learning_rate": 1.9338636460118908e-07, - "loss": 0.5813, - "step": 3525 - }, - { - "epoch": 1.94, - "grad_norm": 0.0881994217634201, - "learning_rate": 1.8797891939240643e-07, - "loss": 0.5493, - "step": 3526 - }, - { - "epoch": 1.95, - "grad_norm": 0.07461023330688477, - "learning_rate": 1.8264808004438972e-07, - "loss": 0.4342, - "step": 3527 - }, - { - "epoch": 1.95, - "grad_norm": 0.07800307869911194, - "learning_rate": 1.7739385064856618e-07, - "loss": 0.5938, - "step": 3528 - }, - { - "epoch": 1.95, - "grad_norm": 0.07744286209344864, - "learning_rate": 1.7221623523756557e-07, - "loss": 0.5503, - "step": 3529 - }, - { - "epoch": 1.95, - "grad_norm": 0.07589315623044968, - "learning_rate": 1.6711523778520921e-07, - "loss": 0.5541, - "step": 3530 - }, - { - "epoch": 1.95, - "grad_norm": 0.07493191957473755, - "learning_rate": 1.62090862206532e-07, - "loss": 0.5086, - "step": 3531 - }, - { - "epoch": 1.95, - "grad_norm": 0.08878099918365479, - "learning_rate": 1.5714311235773825e-07, - "loss": 0.6694, - "step": 3532 - }, - { - "epoch": 1.95, - "grad_norm": 0.07499578595161438, - "learning_rate": 1.5227199203624586e-07, - "loss": 0.5938, - "step": 3533 - }, - { - "epoch": 1.95, - "grad_norm": 0.07987038791179657, - "learning_rate": 1.4747750498061986e-07, - "loss": 0.541, - "step": 3534 - }, - { - "epoch": 1.95, - "grad_norm": 0.08032140135765076, - "learning_rate": 1.4275965487066112e-07, - "loss": 0.5959, - "step": 3535 - }, - { - "epoch": 1.95, - "grad_norm": 0.06550218909978867, - "learning_rate": 1.3811844532731766e-07, - "loss": 0.5164, - "step": 3536 - }, - { - "epoch": 1.95, - "grad_norm": 0.07750283181667328, - "learning_rate": 1.3355387991271783e-07, - "loss": 0.5726, - "step": 3537 - }, - { - "epoch": 1.95, - "grad_norm": 0.0810721144080162, - "learning_rate": 1.290659621301704e-07, - "loss": 0.6531, - "step": 3538 - }, - { - "epoch": 1.95, - "grad_norm": 0.07887770980596542, - "learning_rate": 1.2465469542417563e-07, - "loss": 0.5742, - "step": 3539 - }, - { - "epoch": 1.95, - "grad_norm": 0.08414478600025177, - "learning_rate": 1.203200831803808e-07, - "loss": 0.5464, - "step": 3540 - }, - { - "epoch": 1.95, - "grad_norm": 0.0833955630660057, - "learning_rate": 1.1606212872559141e-07, - "loss": 0.639, - "step": 3541 - }, - { - "epoch": 1.95, - "grad_norm": 0.08545012772083282, - "learning_rate": 1.1188083532780447e-07, - "loss": 0.5234, - "step": 3542 - }, - { - "epoch": 1.95, - "grad_norm": 0.07801700383424759, - "learning_rate": 1.0777620619616402e-07, - "loss": 0.5809, - "step": 3543 - }, - { - "epoch": 1.95, - "grad_norm": 0.08191072195768356, - "learning_rate": 1.0374824448099451e-07, - "loss": 0.5946, - "step": 3544 - }, - { - "epoch": 1.96, - "grad_norm": 0.08125972747802734, - "learning_rate": 9.979695327373417e-08, - "loss": 0.5225, - "step": 3545 - }, - { - "epoch": 1.96, - "grad_norm": 0.08104407042264938, - "learning_rate": 9.59223356070349e-08, - "loss": 0.576, - "step": 3546 - }, - { 
- "epoch": 1.96, - "grad_norm": 0.0694657638669014, - "learning_rate": 9.212439445464017e-08, - "loss": 0.5447, - "step": 3547 - }, - { - "epoch": 1.96, - "grad_norm": 0.08874676376581192, - "learning_rate": 8.840313273149603e-08, - "loss": 0.5643, - "step": 3548 - }, - { - "epoch": 1.96, - "grad_norm": 0.08475618064403534, - "learning_rate": 8.475855329367343e-08, - "loss": 0.5496, - "step": 3549 - }, - { - "epoch": 1.96, - "grad_norm": 0.08129516243934631, - "learning_rate": 8.119065893839039e-08, - "loss": 0.6609, - "step": 3550 - }, - { - "epoch": 1.96, - "grad_norm": 0.08017243444919586, - "learning_rate": 7.76994524040009e-08, - "loss": 0.5847, - "step": 3551 - }, - { - "epoch": 1.96, - "grad_norm": 0.07957109808921814, - "learning_rate": 7.428493637002821e-08, - "loss": 0.6086, - "step": 3552 - }, - { - "epoch": 1.96, - "grad_norm": 0.0790736973285675, - "learning_rate": 7.094711345710936e-08, - "loss": 0.6083, - "step": 3553 - }, - { - "epoch": 1.96, - "grad_norm": 0.07656752318143845, - "learning_rate": 6.768598622701738e-08, - "loss": 0.5596, - "step": 3554 - }, - { - "epoch": 1.96, - "grad_norm": 0.07584652304649353, - "learning_rate": 6.450155718268347e-08, - "loss": 0.5284, - "step": 3555 - }, - { - "epoch": 1.96, - "grad_norm": 0.08076095581054688, - "learning_rate": 6.139382876816368e-08, - "loss": 0.5233, - "step": 3556 - }, - { - "epoch": 1.96, - "grad_norm": 0.07191152125597, - "learning_rate": 5.8362803368638974e-08, - "loss": 0.5918, - "step": 3557 - }, - { - "epoch": 1.96, - "grad_norm": 0.08384308964014053, - "learning_rate": 5.5408483310426253e-08, - "loss": 0.5641, - "step": 3558 - }, - { - "epoch": 1.96, - "grad_norm": 0.08836299180984497, - "learning_rate": 5.2530870860956204e-08, - "loss": 0.6588, - "step": 3559 - }, - { - "epoch": 1.96, - "grad_norm": 0.08138474076986313, - "learning_rate": 4.97299682288288e-08, - "loss": 0.6033, - "step": 3560 - }, - { - "epoch": 1.96, - "grad_norm": 0.07241957634687424, - "learning_rate": 4.7005777563724483e-08, - "loss": 0.5177, - "step": 3561 - }, - { - "epoch": 1.96, - "grad_norm": 0.0745629146695137, - "learning_rate": 4.4358300956459654e-08, - "loss": 0.5968, - "step": 3562 - }, - { - "epoch": 1.97, - "grad_norm": 0.07611432671546936, - "learning_rate": 4.178754043898669e-08, - "loss": 0.5377, - "step": 3563 - }, - { - "epoch": 1.97, - "grad_norm": 0.07590966671705246, - "learning_rate": 3.929349798434956e-08, - "loss": 0.4752, - "step": 3564 - }, - { - "epoch": 1.97, - "grad_norm": 0.07255428284406662, - "learning_rate": 3.6876175506750376e-08, - "loss": 0.4553, - "step": 3565 - }, - { - "epoch": 1.97, - "grad_norm": 0.08332665264606476, - "learning_rate": 3.453557486148284e-08, - "loss": 0.5895, - "step": 3566 - }, - { - "epoch": 1.97, - "grad_norm": 0.07977409660816193, - "learning_rate": 3.2271697844954426e-08, - "loss": 0.5473, - "step": 3567 - }, - { - "epoch": 1.97, - "grad_norm": 0.08140242099761963, - "learning_rate": 3.008454619469747e-08, - "loss": 0.6037, - "step": 3568 - }, - { - "epoch": 1.97, - "grad_norm": 0.08464977890253067, - "learning_rate": 2.797412158934698e-08, - "loss": 0.5727, - "step": 3569 - }, - { - "epoch": 1.97, - "grad_norm": 0.0898507609963417, - "learning_rate": 2.5940425648662836e-08, - "loss": 0.6351, - "step": 3570 - }, - { - "epoch": 1.97, - "grad_norm": 0.07864800095558167, - "learning_rate": 2.398345993350759e-08, - "loss": 0.6619, - "step": 3571 - }, - { - "epoch": 1.97, - "grad_norm": 0.08140621334314346, - "learning_rate": 2.2103225945857563e-08, - "loss": 0.5289, - "step": 3572 - 
}, - { - "epoch": 1.97, - "grad_norm": 0.07230167835950851, - "learning_rate": 2.029972512880285e-08, - "loss": 0.4946, - "step": 3573 - }, - { - "epoch": 1.97, - "grad_norm": 0.08280620723962784, - "learning_rate": 1.8572958866514e-08, - "loss": 0.4832, - "step": 3574 - }, - { - "epoch": 1.97, - "grad_norm": 0.08428557217121124, - "learning_rate": 1.6922928484297552e-08, - "loss": 0.6934, - "step": 3575 - }, - { - "epoch": 1.97, - "grad_norm": 0.07942302525043488, - "learning_rate": 1.5349635248551596e-08, - "loss": 0.5662, - "step": 3576 - }, - { - "epoch": 1.97, - "grad_norm": 0.08093717694282532, - "learning_rate": 1.3853080366788008e-08, - "loss": 0.5828, - "step": 3577 - }, - { - "epoch": 1.97, - "grad_norm": 0.08332273364067078, - "learning_rate": 1.2433264987599113e-08, - "loss": 0.5815, - "step": 3578 - }, - { - "epoch": 1.97, - "grad_norm": 0.0855347216129303, - "learning_rate": 1.1090190200713225e-08, - "loss": 0.6216, - "step": 3579 - }, - { - "epoch": 1.97, - "grad_norm": 0.08472579717636108, - "learning_rate": 9.823857036928008e-09, - "loss": 0.563, - "step": 3580 - }, - { - "epoch": 1.98, - "grad_norm": 0.08015488088130951, - "learning_rate": 8.634266468154905e-09, - "loss": 0.562, - "step": 3581 - }, - { - "epoch": 1.98, - "grad_norm": 0.06776360422372818, - "learning_rate": 7.521419407419128e-09, - "loss": 0.4865, - "step": 3582 - }, - { - "epoch": 1.98, - "grad_norm": 0.08664952963590622, - "learning_rate": 6.48531670882635e-09, - "loss": 0.5859, - "step": 3583 - }, - { - "epoch": 1.98, - "grad_norm": 0.07996014505624771, - "learning_rate": 5.5259591675849155e-09, - "loss": 0.5584, - "step": 3584 - }, - { - "epoch": 1.98, - "grad_norm": 0.08663001656532288, - "learning_rate": 4.643347520005836e-09, - "loss": 0.6266, - "step": 3585 - }, - { - "epoch": 1.98, - "grad_norm": 0.08204808831214905, - "learning_rate": 3.8374824434916915e-09, - "loss": 0.5886, - "step": 3586 - }, - { - "epoch": 1.98, - "grad_norm": 0.08297493308782578, - "learning_rate": 3.1083645565477272e-09, - "loss": 0.5517, - "step": 3587 - }, - { - "epoch": 1.98, - "grad_norm": 0.07561254501342773, - "learning_rate": 2.4559944187707572e-09, - "loss": 0.528, - "step": 3588 - }, - { - "epoch": 1.98, - "grad_norm": 0.08274608105421066, - "learning_rate": 1.880372530860264e-09, - "loss": 0.5366, - "step": 3589 - }, - { - "epoch": 1.98, - "grad_norm": 0.07005135715007782, - "learning_rate": 1.3814993345961924e-09, - "loss": 0.5466, - "step": 3590 - }, - { - "epoch": 1.98, - "grad_norm": 0.07817287743091583, - "learning_rate": 9.593752128833622e-10, - "loss": 0.4973, - "step": 3591 - }, - { - "epoch": 1.98, - "grad_norm": 0.0750674456357956, - "learning_rate": 6.140004896737495e-10, - "loss": 0.5421, - "step": 3592 - }, - { - "epoch": 1.98, - "grad_norm": 0.077664814889431, - "learning_rate": 3.4537543006640804e-10, - "loss": 0.5494, - "step": 3593 - }, - { - "epoch": 1.98, - "grad_norm": 0.08603768795728683, - "learning_rate": 1.5350024022975362e-10, - "loss": 0.5667, - "step": 3594 - }, - { - "epoch": 1.98, - "grad_norm": 0.08145732432603836, - "learning_rate": 3.837506742376817e-11, - "loss": 0.5577, - "step": 3595 - }, - { - "epoch": 1.98, - "grad_norm": 0.09160507470369339, - "learning_rate": 0.0, - "loss": 0.5693, - "step": 3596 - } - ], - "logging_steps": 1, - "max_steps": 3596, - "num_input_tokens_seen": 0, - "num_train_epochs": 2, - "save_steps": 1798, - "total_flos": 6.436865157585961e+17, - "train_batch_size": 2, - "trial_name": null, - "trial_params": null -}